]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journal-file.c
journal-file: delete some unnecessary braces
[thirdparty/systemd.git] / src / journal / journal-file.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
cec736d2 2
cec736d2 3#include <errno.h>
cec736d2 4#include <fcntl.h>
11689d2a 5#include <linux/fs.h>
ac2e41f5 6#include <pthread.h>
07630cea
LP
7#include <stddef.h>
8#include <sys/mman.h>
9#include <sys/statvfs.h>
10#include <sys/uio.h>
11#include <unistd.h>
fb0951b0 12
a03d4359
ZJS
13#include "sd-event.h"
14
b5efdb8a 15#include "alloc-util.h"
f27a3864 16#include "btrfs-util.h"
c8b3094d 17#include "chattr-util.h"
07630cea 18#include "compress.h"
3ffd4af2 19#include "fd-util.h"
aa892669 20#include "format-util.h"
11b29a96 21#include "fs-util.h"
0284adc6 22#include "journal-authenticate.h"
cec736d2
LP
23#include "journal-def.h"
24#include "journal-file.h"
25#include "lookup3.h"
0a970718 26#include "memory-util.h"
5d1ce257 27#include "path-util.h"
3df3e884 28#include "random-util.h"
b58c888f 29#include "set.h"
760877e9 30#include "sort-util.h"
3cc44114 31#include "stat-util.h"
07630cea 32#include "string-util.h"
4761fd0f 33#include "strv.h"
89a5a90c 34#include "xattr-util.h"
cec736d2 35
4a92baf3
LP
36#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
37#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
cec736d2 38
57850536
AG
39#define DEFAULT_COMPRESS_THRESHOLD (512ULL)
40#define MIN_COMPRESS_THRESHOLD (8ULL)
807e17f0 41
babfc091 42/* This is the minimum journal file size */
6aae0b1a 43#define JOURNAL_FILE_SIZE_MIN (512 * 1024ULL) /* 512 KiB */
babfc091
LP
44
45/* These are the lower and upper bounds if we deduce the max_use value
46 * from the file system size */
6aae0b1a
ZJS
47#define MAX_USE_LOWER (1 * 1024 * 1024ULL) /* 1 MiB */
48#define MAX_USE_UPPER (4 * 1024 * 1024 * 1024ULL) /* 4 GiB */
babfc091 49
6aae0b1a
ZJS
50/* Those are the lower and upper bounds for the minimal use limit,
51 * i.e. how much we'll use even if keep_free suggests otherwise. */
52#define MIN_USE_LOW (1 * 1024 * 1024ULL) /* 1 MiB */
53#define MIN_USE_HIGH (16 * 1024 * 1024ULL) /* 16 MiB */
8580d1f7 54
babfc091 55/* This is the upper bound if we deduce max_size from max_use */
6aae0b1a 56#define MAX_SIZE_UPPER (128 * 1024 * 1024ULL) /* 128 MiB */
babfc091
LP
57
58/* This is the upper bound if we deduce the keep_free value from the
59 * file system size */
6aae0b1a 60#define KEEP_FREE_UPPER (4 * 1024 * 1024 * 1024ULL) /* 4 GiB */
babfc091
LP
61
62/* This is the keep_free value when we can't determine the system
63 * size */
6aae0b1a 64#define DEFAULT_KEEP_FREE (1024 * 1024ULL) /* 1 MB */
babfc091 65
8580d1f7 66/* This is the default maximum number of journal files to keep around. */
6aae0b1a 67#define DEFAULT_N_MAX_FILES 100
8580d1f7 68
dca6219e
LP
69/* n_data was the first entry we added after the initial file format design */
70#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
cec736d2 71
a4bcff5b
LP
72/* How many entries to keep in the entry array chain cache at max */
73#define CHAIN_CACHE_MAX 20
74
a676e665 75/* How much to increase the journal file size at once each time we allocate something new. */
6aae0b1a 76#define FILE_SIZE_INCREASE (8 * 1024 * 1024ULL) /* 8MB */
a676e665 77
2678031a
LP
78/* Reread fstat() of the file for detecting deletions at least this often */
79#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
80
fa6ac760
LP
81/* The mmap context to use for the header; we pick one above the last defined object type */
82#define CONTEXT_HEADER _OBJECT_TYPE_MAX
83
51804460
ZJS
84#ifdef __clang__
85# pragma GCC diagnostic ignored "-Waddress-of-packed-member"
86#endif
87
ac2e41f5
VC
88/* This may be called from a separate thread to prevent blocking the caller for the duration of fsync().
89 * As a result we use atomic operations on f->offline_state for inter-thread communications with
90 * journal_file_set_offline() and journal_file_set_online(). */
91static void journal_file_set_offline_internal(JournalFile *f) {
26687bf8 92 assert(f);
ac2e41f5
VC
93 assert(f->fd >= 0);
94 assert(f->header);
95
96 for (;;) {
97 switch (f->offline_state) {
98 case OFFLINE_CANCEL:
99 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_DONE))
100 continue;
101 return;
102
103 case OFFLINE_AGAIN_FROM_SYNCING:
104 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_SYNCING))
105 continue;
106 break;
107
108 case OFFLINE_AGAIN_FROM_OFFLINING:
109 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_SYNCING))
110 continue;
111 break;
112
113 case OFFLINE_SYNCING:
114 (void) fsync(f->fd);
26687bf8 115
ac2e41f5
VC
116 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_OFFLINING))
117 continue;
26687bf8 118
8eb85171 119 f->header->state = f->archive ? STATE_ARCHIVED : STATE_OFFLINE;
ac2e41f5
VC
120 (void) fsync(f->fd);
121 break;
122
123 case OFFLINE_OFFLINING:
124 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_DONE))
125 continue;
4831981d 126 _fallthrough_;
ac2e41f5
VC
127 case OFFLINE_DONE:
128 return;
129
130 case OFFLINE_JOINED:
131 log_debug("OFFLINE_JOINED unexpected offline state for journal_file_set_offline_internal()");
132 return;
133 }
134 }
135}
136
/* pthread entry point of the offlining thread; arg is the JournalFile to take offline.
 * Always returns NULL. */
static void * journal_file_set_offline_thread(void *arg) {
        /* Give the thread a recognizable name (best effort). */
        (void) pthread_setname_np(pthread_self(), "journal-offline");

        journal_file_set_offline_internal(arg);
        return NULL;
}
146
147static int journal_file_set_offline_thread_join(JournalFile *f) {
148 int r;
149
150 assert(f);
151
152 if (f->offline_state == OFFLINE_JOINED)
153 return 0;
154
155 r = pthread_join(f->offline_thread, NULL);
156 if (r)
157 return -r;
158
159 f->offline_state = OFFLINE_JOINED;
26687bf8 160
be7cdd8e 161 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
162 return -EIO;
163
ac2e41f5
VC
164 return 0;
165}
26687bf8 166
ac2e41f5
VC
167/* Trigger a restart if the offline thread is mid-flight in a restartable state. */
168static bool journal_file_set_offline_try_restart(JournalFile *f) {
169 for (;;) {
170 switch (f->offline_state) {
171 case OFFLINE_AGAIN_FROM_SYNCING:
172 case OFFLINE_AGAIN_FROM_OFFLINING:
173 return true;
174
175 case OFFLINE_CANCEL:
176 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_AGAIN_FROM_SYNCING))
177 continue;
178 return true;
179
180 case OFFLINE_SYNCING:
181 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_AGAIN_FROM_SYNCING))
182 continue;
183 return true;
184
185 case OFFLINE_OFFLINING:
186 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_AGAIN_FROM_OFFLINING))
187 continue;
188 return true;
26687bf8
OS
189
190 default:
ac2e41f5
VC
191 return false;
192 }
26687bf8
OS
193 }
194}
195
ac2e41f5
VC
196/* Sets a journal offline.
197 *
198 * If wait is false then an offline is dispatched in a separate thread for a
199 * subsequent journal_file_set_offline() or journal_file_set_online() of the
200 * same journal to synchronize with.
201 *
202 * If wait is true, then either an existing offline thread will be restarted
203 * and joined, or if none exists the offline is simply performed in this
204 * context without involving another thread.
205 */
206int journal_file_set_offline(JournalFile *f, bool wait) {
207 bool restarted;
208 int r;
209
26687bf8
OS
210 assert(f);
211
212 if (!f->writable)
213 return -EPERM;
214
846e5418 215 if (f->fd < 0 || !f->header)
26687bf8
OS
216 return -EINVAL;
217
b8f99e27
VC
218 /* An offlining journal is implicitly online and may modify f->header->state,
219 * we must also join any potentially lingering offline thread when not online. */
220 if (!journal_file_is_offlining(f) && f->header->state != STATE_ONLINE)
221 return journal_file_set_offline_thread_join(f);
26687bf8 222
ac2e41f5
VC
223 /* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */
224 restarted = journal_file_set_offline_try_restart(f);
225 if ((restarted && wait) || !restarted) {
226 r = journal_file_set_offline_thread_join(f);
227 if (r < 0)
228 return r;
229 }
26687bf8 230
ac2e41f5
VC
231 if (restarted)
232 return 0;
233
234 /* Initiate a new offline. */
235 f->offline_state = OFFLINE_SYNCING;
fa6ac760 236
ac2e41f5
VC
237 if (wait) /* Without using a thread if waiting. */
238 journal_file_set_offline_internal(f);
239 else {
5e9f01e8
LP
240 sigset_t ss, saved_ss;
241 int k;
242
cd2a429e 243 assert_se(sigfillset(&ss) >= 0);
08f9e80b
CM
244 /* Don't block SIGBUS since the offlining thread accesses a memory mapped file.
245 * Asynchronous SIGBUS signals can safely be handled by either thread. */
246 assert_se(sigdelset(&ss, SIGBUS) >= 0);
5e9f01e8
LP
247
248 r = pthread_sigmask(SIG_BLOCK, &ss, &saved_ss);
249 if (r > 0)
250 return -r;
251
ac2e41f5 252 r = pthread_create(&f->offline_thread, NULL, journal_file_set_offline_thread, f);
5e9f01e8
LP
253
254 k = pthread_sigmask(SIG_SETMASK, &saved_ss, NULL);
ec9ffa2c
VC
255 if (r > 0) {
256 f->offline_state = OFFLINE_JOINED;
ac2e41f5 257 return -r;
ec9ffa2c 258 }
5e9f01e8
LP
259 if (k > 0)
260 return -k;
ac2e41f5
VC
261 }
262
263 return 0;
264}
265
266static int journal_file_set_online(JournalFile *f) {
83bf6b67 267 bool wait = true;
ac2e41f5
VC
268
269 assert(f);
270
271 if (!f->writable)
272 return -EPERM;
273
846e5418 274 if (f->fd < 0 || !f->header)
ac2e41f5
VC
275 return -EINVAL;
276
83bf6b67 277 while (wait) {
ac2e41f5
VC
278 switch (f->offline_state) {
279 case OFFLINE_JOINED:
280 /* No offline thread, no need to wait. */
83bf6b67 281 wait = false;
ac2e41f5
VC
282 break;
283
284 case OFFLINE_SYNCING:
285 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_CANCEL))
286 continue;
287 /* Canceled syncing prior to offlining, no need to wait. */
83bf6b67 288 wait = false;
ac2e41f5
VC
289 break;
290
291 case OFFLINE_AGAIN_FROM_SYNCING:
292 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_CANCEL))
293 continue;
294 /* Canceled restart from syncing, no need to wait. */
83bf6b67 295 wait = false;
ac2e41f5
VC
296 break;
297
298 case OFFLINE_AGAIN_FROM_OFFLINING:
299 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_CANCEL))
300 continue;
301 /* Canceled restart from offlining, must wait for offlining to complete however. */
4831981d 302 _fallthrough_;
ac2e41f5
VC
303 default: {
304 int r;
305
306 r = journal_file_set_offline_thread_join(f);
307 if (r < 0)
308 return r;
309
83bf6b67 310 wait = false;
ac2e41f5
VC
311 break;
312 }
313 }
314 }
26687bf8 315
be7cdd8e 316 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
317 return -EIO;
318
ac2e41f5
VC
319 switch (f->header->state) {
320 case STATE_ONLINE:
321 return 0;
26687bf8 322
ac2e41f5
VC
323 case STATE_OFFLINE:
324 f->header->state = STATE_ONLINE;
325 (void) fsync(f->fd);
326 return 0;
327
328 default:
329 return -EINVAL;
330 }
26687bf8
OS
331}
332
b58c888f
VC
333bool journal_file_is_offlining(JournalFile *f) {
334 assert(f);
335
336 __sync_synchronize();
337
3742095b 338 if (IN_SET(f->offline_state, OFFLINE_DONE, OFFLINE_JOINED))
b58c888f
VC
339 return false;
340
341 return true;
342}
343
804ae586 344JournalFile* journal_file_close(JournalFile *f) {
c377a6f3
YW
345 if (!f)
346 return NULL;
cec736d2 347
349cc4a5 348#if HAVE_GCRYPT
b0af6f41 349 /* Write the final tag */
43cd8794
FB
350 if (f->seal && f->writable) {
351 int r;
352
353 r = journal_file_append_tag(f);
354 if (r < 0)
355 log_error_errno(r, "Failed to append tag when closing journal: %m");
356 }
feb12d3e 357#endif
b0af6f41 358
7a24f3bf 359 if (f->post_change_timer) {
b6cdfbe5
ZJS
360 if (sd_event_source_get_enabled(f->post_change_timer, NULL) > 0)
361 journal_file_post_change(f);
7a24f3bf 362
1d3fe304 363 sd_event_source_disable_unref(f->post_change_timer);
7a24f3bf
VC
364 }
365
ac2e41f5 366 journal_file_set_offline(f, true);
cec736d2 367
be7cdd8e
VC
368 if (f->mmap && f->cache_fd)
369 mmap_cache_free_fd(f->mmap, f->cache_fd);
cec736d2 370
11689d2a
LP
371 if (f->fd >= 0 && f->defrag_on_close) {
372
373 /* Be friendly to btrfs: turn COW back on again now,
374 * and defragment the file. We won't write to the file
375 * ever again, hence remove all fragmentation, and
376 * reenable all the good bits COW usually provides
377 * (such as data checksumming). */
378
db9a4254 379 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL, NULL);
11689d2a
LP
380 (void) btrfs_defrag_fd(f->fd);
381 }
f27a3864 382
5d1ce257
LP
383 if (f->close_fd)
384 safe_close(f->fd);
cec736d2 385 free(f->path);
807e17f0 386
f649045c 387 mmap_cache_unref(f->mmap);
16e9f408 388
4743015d 389 ordered_hashmap_free_free(f->chain_cache);
a4bcff5b 390
349cc4a5 391#if HAVE_XZ || HAVE_LZ4
807e17f0
LP
392 free(f->compress_buffer);
393#endif
394
349cc4a5 395#if HAVE_GCRYPT
baed47c3
LP
396 if (f->fss_file)
397 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
dc4ebc07 398 else
b7c9ae91
LP
399 free(f->fsprg_state);
400
401 free(f->fsprg_seed);
7560fffc
LP
402
403 if (f->hmac)
404 gcry_md_close(f->hmac);
405#endif
406
6b430fdb 407 return mfree(f);
cec736d2
LP
408}
409
0ac38b70 410static int journal_file_init_header(JournalFile *f, JournalFile *template) {
d89c8fdf 411 Header h = {};
cec736d2
LP
412 ssize_t k;
413 int r;
414
415 assert(f);
416
7560fffc 417 memcpy(h.signature, HEADER_SIGNATURE, 8);
23b0b2b2 418 h.header_size = htole64(ALIGN64(sizeof(h)));
cec736d2 419
d89c8fdf
ZJS
420 h.incompatible_flags |= htole32(
421 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
422 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
7560fffc 423
d89c8fdf
ZJS
424 h.compatible_flags = htole32(
425 f->seal * HEADER_COMPATIBLE_SEALED);
7560fffc 426
cec736d2
LP
427 r = sd_id128_randomize(&h.file_id);
428 if (r < 0)
429 return r;
430
0ac38b70
LP
431 if (template) {
432 h.seqnum_id = template->header->seqnum_id;
beec0085 433 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
0ac38b70
LP
434 } else
435 h.seqnum_id = h.file_id;
cec736d2
LP
436
437 k = pwrite(f->fd, &h, sizeof(h), 0);
438 if (k < 0)
439 return -errno;
440
441 if (k != sizeof(h))
442 return -EIO;
443
444 return 0;
445}
446
447static int journal_file_refresh_header(JournalFile *f) {
de190aef 448 sd_id128_t boot_id;
fa6ac760 449 int r;
cec736d2
LP
450
451 assert(f);
c88cc6af 452 assert(f->header);
cec736d2
LP
453
454 r = sd_id128_get_machine(&f->header->machine_id);
fd4885df
ZJS
455 if (IN_SET(r, -ENOENT, -ENOMEDIUM))
456 /* We don't have a machine-id, let's continue without */
457 zero(f->header->machine_id);
458 else if (r < 0)
cec736d2
LP
459 return r;
460
de190aef 461 r = sd_id128_get_boot(&boot_id);
cec736d2
LP
462 if (r < 0)
463 return r;
464
de190aef
LP
465 f->header->boot_id = boot_id;
466
fa6ac760 467 r = journal_file_set_online(f);
b788cc23 468
7560fffc 469 /* Sync the online state to disk */
fb426037 470 (void) fsync(f->fd);
b788cc23 471
a0fe2a2d
LP
472 /* We likely just created a new file, also sync the directory this file is located in. */
473 (void) fsync_directory_of_file(f->fd);
474
fa6ac760 475 return r;
cec736d2
LP
476}
477
4214009f
ZJS
478static bool warn_wrong_flags(const JournalFile *f, bool compatible) {
479 const uint32_t any = compatible ? HEADER_COMPATIBLE_ANY : HEADER_INCOMPATIBLE_ANY,
480 supported = compatible ? HEADER_COMPATIBLE_SUPPORTED : HEADER_INCOMPATIBLE_SUPPORTED;
481 const char *type = compatible ? "compatible" : "incompatible";
d89c8fdf
ZJS
482 uint32_t flags;
483
4214009f
ZJS
484 flags = le32toh(compatible ? f->header->compatible_flags : f->header->incompatible_flags);
485
486 if (flags & ~supported) {
487 if (flags & ~any)
4761fd0f 488 log_debug("Journal file %s has unknown %s flags 0x%"PRIx32,
4214009f
ZJS
489 f->path, type, flags & ~any);
490 flags = (flags & any) & ~supported;
4761fd0f
ZJS
491 if (flags) {
492 const char* strv[3];
493 unsigned n = 0;
494 _cleanup_free_ char *t = NULL;
495
496 if (compatible && (flags & HEADER_COMPATIBLE_SEALED))
497 strv[n++] = "sealed";
498 if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_XZ))
499 strv[n++] = "xz-compressed";
500 if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_LZ4))
501 strv[n++] = "lz4-compressed";
502 strv[n] = NULL;
503 assert(n < ELEMENTSOF(strv));
504
505 t = strv_join((char**) strv, ", ");
506 log_debug("Journal file %s uses %s %s %s disabled at compilation time.",
507 f->path, type, n > 1 ? "flags" : "flag", strnull(t));
508 }
4214009f
ZJS
509 return true;
510 }
511
512 return false;
513}
514
515static int journal_file_verify_header(JournalFile *f) {
6f94e420
TS
516 uint64_t arena_size, header_size;
517
cec736d2 518 assert(f);
c88cc6af 519 assert(f->header);
cec736d2 520
7560fffc 521 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
cec736d2
LP
522 return -EBADMSG;
523
4214009f
ZJS
524 /* In both read and write mode we refuse to open files with incompatible
525 * flags we don't know. */
526 if (warn_wrong_flags(f, false))
cec736d2
LP
527 return -EPROTONOSUPPORT;
528
4214009f
ZJS
529 /* When open for writing we refuse to open files with compatible flags, too. */
530 if (f->writable && warn_wrong_flags(f, true))
d89c8fdf 531 return -EPROTONOSUPPORT;
7560fffc 532
db11ac1a
LP
533 if (f->header->state >= _STATE_MAX)
534 return -EBADMSG;
535
6f94e420
TS
536 header_size = le64toh(f->header->header_size);
537
dca6219e 538 /* The first addition was n_data, so check that we are at least this large */
6f94e420 539 if (header_size < HEADER_SIZE_MIN)
23b0b2b2
LP
540 return -EBADMSG;
541
8088cbd3 542 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
beec0085
LP
543 return -EBADMSG;
544
6f94e420
TS
545 arena_size = le64toh(f->header->arena_size);
546
547 if (UINT64_MAX - header_size < arena_size || header_size + arena_size > (uint64_t) f->last_stat.st_size)
db11ac1a
LP
548 return -ENODATA;
549
6f94e420 550 if (le64toh(f->header->tail_object_offset) > header_size + arena_size)
db11ac1a
LP
551 return -ENODATA;
552
7762e02b
LP
553 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
554 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
555 !VALID64(le64toh(f->header->tail_object_offset)) ||
556 !VALID64(le64toh(f->header->entry_array_offset)))
557 return -ENODATA;
558
cec736d2 559 if (f->writable) {
cec736d2 560 sd_id128_t machine_id;
ae739cc1 561 uint8_t state;
cec736d2
LP
562 int r;
563
564 r = sd_id128_get_machine(&machine_id);
565 if (r < 0)
566 return r;
567
568 if (!sd_id128_equal(machine_id, f->header->machine_id))
569 return -EHOSTDOWN;
570
de190aef 571 state = f->header->state;
cec736d2 572
b288cdeb
ZJS
573 if (state == STATE_ARCHIVED)
574 return -ESHUTDOWN; /* Already archived */
baaa35ad
ZJS
575 else if (state == STATE_ONLINE)
576 return log_debug_errno(SYNTHETIC_ERRNO(EBUSY),
577 "Journal file %s is already online. Assuming unclean closing.",
578 f->path);
579 else if (state != STATE_OFFLINE)
580 return log_debug_errno(SYNTHETIC_ERRNO(EBUSY),
581 "Journal file %s has unknown state %i.",
582 f->path, state);
ae739cc1 583
5b3cc0c8
YN
584 if (f->header->field_hash_table_size == 0 || f->header->data_hash_table_size == 0)
585 return -EBADMSG;
586
ae739cc1
LP
587 /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't
588 * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks
589 * bisection. */
baaa35ad
ZJS
590 if (le64toh(f->header->tail_entry_realtime) > now(CLOCK_REALTIME))
591 return log_debug_errno(SYNTHETIC_ERRNO(ETXTBSY),
592 "Journal file %s is from the future, refusing to append new data to it that'd be older.",
593 f->path);
cec736d2
LP
594 }
595
d89c8fdf
ZJS
596 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
597 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
c586dbf1 598
f1889c91 599 f->seal = JOURNAL_HEADER_SEALED(f->header);
7560fffc 600
cec736d2
LP
601 return 0;
602}
603
2678031a 604static int journal_file_fstat(JournalFile *f) {
3cc44114
LP
605 int r;
606
2678031a
LP
607 assert(f);
608 assert(f->fd >= 0);
609
610 if (fstat(f->fd, &f->last_stat) < 0)
611 return -errno;
612
613 f->last_stat_usec = now(CLOCK_MONOTONIC);
614
8d6a4d33 615 /* Refuse dealing with with files that aren't regular */
3cc44114
LP
616 r = stat_verify_regular(&f->last_stat);
617 if (r < 0)
618 return r;
8d6a4d33 619
2678031a
LP
620 /* Refuse appending to files that are already deleted */
621 if (f->last_stat.st_nlink <= 0)
622 return -EIDRM;
623
624 return 0;
625}
626
cec736d2 627static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
eda4b58b 628 uint64_t old_size, new_size;
fec2aa2f 629 int r;
cec736d2
LP
630
631 assert(f);
c88cc6af 632 assert(f->header);
cec736d2 633
cec736d2 634 /* We assume that this file is not sparse, and we know that
38ac38b2 635 * for sure, since we always call posix_fallocate()
cec736d2
LP
636 * ourselves */
637
be7cdd8e 638 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
639 return -EIO;
640
cec736d2 641 old_size =
23b0b2b2 642 le64toh(f->header->header_size) +
cec736d2
LP
643 le64toh(f->header->arena_size);
644
bc85bfee 645 new_size = PAGE_ALIGN(offset + size);
23b0b2b2
LP
646 if (new_size < le64toh(f->header->header_size))
647 new_size = le64toh(f->header->header_size);
bc85bfee 648
2678031a
LP
649 if (new_size <= old_size) {
650
651 /* We already pre-allocated enough space, but before
652 * we write to it, let's check with fstat() if the
653 * file got deleted, in order make sure we don't throw
654 * away the data immediately. Don't check fstat() for
655 * all writes though, but only once ever 10s. */
656
657 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
658 return 0;
659
660 return journal_file_fstat(f);
661 }
662
663 /* Allocate more space. */
cec736d2 664
a676e665 665 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
bc85bfee 666 return -E2BIG;
cec736d2 667
a676e665 668 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
cec736d2
LP
669 struct statvfs svfs;
670
671 if (fstatvfs(f->fd, &svfs) >= 0) {
672 uint64_t available;
673
070052ab 674 available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
cec736d2
LP
675
676 if (new_size - old_size > available)
677 return -E2BIG;
678 }
679 }
680
eda4b58b 681 /* Increase by larger blocks at once */
be6b0c21 682 new_size = DIV_ROUND_UP(new_size, FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
eda4b58b
LP
683 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
684 new_size = f->metrics.max_size;
685
bc85bfee
LP
686 /* Note that the glibc fallocate() fallback is very
687 inefficient, hence we try to minimize the allocation area
688 as we can. */
fec2aa2f
GV
689 r = posix_fallocate(f->fd, old_size, new_size - old_size);
690 if (r != 0)
691 return -r;
cec736d2 692
23b0b2b2 693 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
cec736d2 694
2678031a 695 return journal_file_fstat(f);
cec736d2
LP
696}
697
78519831 698static unsigned type_to_context(ObjectType type) {
d3d3208f 699 /* One context for each type, plus one catch-all for the rest */
69adae51 700 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
fa6ac760 701 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
d05089d8 702 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
d3d3208f
MS
703}
704
b439282e 705static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret, size_t *ret_size) {
2678031a
LP
706 int r;
707
cec736d2 708 assert(f);
cec736d2
LP
709 assert(ret);
710
7762e02b
LP
711 if (size <= 0)
712 return -EINVAL;
713
2a59ea54 714 /* Avoid SIGBUS on invalid accesses */
4bbdcdb3
LP
715 if (offset + size > (uint64_t) f->last_stat.st_size) {
716 /* Hmm, out of range? Let's refresh the fstat() data
717 * first, before we trust that check. */
718
2678031a
LP
719 r = journal_file_fstat(f);
720 if (r < 0)
721 return r;
722
723 if (offset + size > (uint64_t) f->last_stat.st_size)
4bbdcdb3
LP
724 return -EADDRNOTAVAIL;
725 }
726
b439282e 727 return mmap_cache_get(f->mmap, f->cache_fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret, ret_size);
cec736d2
LP
728}
729
16e9f408
LP
730static uint64_t minimum_header_size(Object *o) {
731
b8e891e6 732 static const uint64_t table[] = {
16e9f408
LP
733 [OBJECT_DATA] = sizeof(DataObject),
734 [OBJECT_FIELD] = sizeof(FieldObject),
735 [OBJECT_ENTRY] = sizeof(EntryObject),
736 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
737 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
738 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
739 [OBJECT_TAG] = sizeof(TagObject),
740 };
741
742 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
743 return sizeof(ObjectHeader);
744
745 return table[o->object.type];
746}
747
24754f36
TR
748/* Lightweight object checks. We want this to be fast, so that we won't
749 * slowdown every journal_file_move_to_object() call too much. */
750static int journal_file_check_object(JournalFile *f, uint64_t offset, Object *o) {
751 assert(f);
752 assert(o);
753
754 switch (o->object.type) {
755
a602d93e 756 case OBJECT_DATA:
baaa35ad
ZJS
757 if ((le64toh(o->data.entry_offset) == 0) ^ (le64toh(o->data.n_entries) == 0))
758 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
759 "Bad n_entries: %" PRIu64 ": %" PRIu64,
760 le64toh(o->data.n_entries),
761 offset);
762
763 if (le64toh(o->object.size) - offsetof(DataObject, payload) <= 0)
764 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
765 "Bad object size (<= %zu): %" PRIu64 ": %" PRIu64,
766 offsetof(DataObject, payload),
767 le64toh(o->object.size),
768 offset);
24754f36 769
10e8445b
TR
770 if (!VALID64(le64toh(o->data.next_hash_offset)) ||
771 !VALID64(le64toh(o->data.next_field_offset)) ||
772 !VALID64(le64toh(o->data.entry_offset)) ||
baaa35ad
ZJS
773 !VALID64(le64toh(o->data.entry_array_offset)))
774 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
775 "Invalid offset, next_hash_offset=" OFSfmt ", next_field_offset=" OFSfmt ", entry_offset=" OFSfmt ", entry_array_offset=" OFSfmt ": %" PRIu64,
776 le64toh(o->data.next_hash_offset),
777 le64toh(o->data.next_field_offset),
778 le64toh(o->data.entry_offset),
779 le64toh(o->data.entry_array_offset),
780 offset);
24754f36
TR
781
782 break;
24754f36
TR
783
784 case OBJECT_FIELD:
baaa35ad
ZJS
785 if (le64toh(o->object.size) - offsetof(FieldObject, payload) <= 0)
786 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
787 "Bad field size (<= %zu): %" PRIu64 ": %" PRIu64,
788 offsetof(FieldObject, payload),
789 le64toh(o->object.size),
790 offset);
24754f36 791
10e8445b 792 if (!VALID64(le64toh(o->field.next_hash_offset)) ||
baaa35ad
ZJS
793 !VALID64(le64toh(o->field.head_data_offset)))
794 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
795 "Invalid offset, next_hash_offset=" OFSfmt ", head_data_offset=" OFSfmt ": %" PRIu64,
796 le64toh(o->field.next_hash_offset),
797 le64toh(o->field.head_data_offset),
798 offset);
24754f36
TR
799 break;
800
801 case OBJECT_ENTRY:
baaa35ad
ZJS
802 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) % sizeof(EntryItem) != 0)
803 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
804 "Bad entry size (<= %zu): %" PRIu64 ": %" PRIu64,
805 offsetof(EntryObject, items),
806 le64toh(o->object.size),
807 offset);
808
809 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem) <= 0)
810 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
811 "Invalid number items in entry: %" PRIu64 ": %" PRIu64,
812 (le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem),
813 offset);
814
815 if (le64toh(o->entry.seqnum) <= 0)
816 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
817 "Invalid entry seqnum: %" PRIx64 ": %" PRIu64,
818 le64toh(o->entry.seqnum),
819 offset);
820
821 if (!VALID_REALTIME(le64toh(o->entry.realtime)))
822 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
823 "Invalid entry realtime timestamp: %" PRIu64 ": %" PRIu64,
824 le64toh(o->entry.realtime),
825 offset);
826
827 if (!VALID_MONOTONIC(le64toh(o->entry.monotonic)))
828 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
829 "Invalid entry monotonic timestamp: %" PRIu64 ": %" PRIu64,
830 le64toh(o->entry.monotonic),
831 offset);
24754f36
TR
832
833 break;
834
835 case OBJECT_DATA_HASH_TABLE:
836 case OBJECT_FIELD_HASH_TABLE:
837 if ((le64toh(o->object.size) - offsetof(HashTableObject, items)) % sizeof(HashItem) != 0 ||
baaa35ad
ZJS
838 (le64toh(o->object.size) - offsetof(HashTableObject, items)) / sizeof(HashItem) <= 0)
839 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
840 "Invalid %s hash table size: %" PRIu64 ": %" PRIu64,
841 o->object.type == OBJECT_DATA_HASH_TABLE ? "data" : "field",
842 le64toh(o->object.size),
843 offset);
24754f36
TR
844
845 break;
846
847 case OBJECT_ENTRY_ARRAY:
848 if ((le64toh(o->object.size) - offsetof(EntryArrayObject, items)) % sizeof(le64_t) != 0 ||
baaa35ad
ZJS
849 (le64toh(o->object.size) - offsetof(EntryArrayObject, items)) / sizeof(le64_t) <= 0)
850 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
851 "Invalid object entry array size: %" PRIu64 ": %" PRIu64,
852 le64toh(o->object.size),
853 offset);
854
855 if (!VALID64(le64toh(o->entry_array.next_entry_array_offset)))
856 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
857 "Invalid object entry array next_entry_array_offset: " OFSfmt ": %" PRIu64,
858 le64toh(o->entry_array.next_entry_array_offset),
859 offset);
24754f36
TR
860
861 break;
862
863 case OBJECT_TAG:
baaa35ad
ZJS
864 if (le64toh(o->object.size) != sizeof(TagObject))
865 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
866 "Invalid object tag size: %" PRIu64 ": %" PRIu64,
867 le64toh(o->object.size),
868 offset);
24754f36 869
baaa35ad
ZJS
870 if (!VALID_EPOCH(le64toh(o->tag.epoch)))
871 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
872 "Invalid object tag epoch: %" PRIu64 ": %" PRIu64,
873 le64toh(o->tag.epoch), offset);
24754f36
TR
874
875 break;
876 }
877
878 return 0;
879}
880
78519831 881int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
cec736d2
LP
882 int r;
883 void *t;
b439282e 884 size_t tsize;
cec736d2
LP
885 Object *o;
886 uint64_t s;
887
888 assert(f);
889 assert(ret);
890
db11ac1a 891 /* Objects may only be located at multiple of 64 bit */
baaa35ad
ZJS
892 if (!VALID64(offset))
893 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
894 "Attempt to move to object at non-64bit boundary: %" PRIu64,
895 offset);
db11ac1a 896
50809d7a 897 /* Object may not be located in the file header */
baaa35ad
ZJS
898 if (offset < le64toh(f->header->header_size))
899 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
900 "Attempt to move to object located in file header: %" PRIu64,
901 offset);
50809d7a 902
b439282e 903 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t, &tsize);
cec736d2
LP
904 if (r < 0)
905 return r;
906
907 o = (Object*) t;
908 s = le64toh(o->object.size);
909
baaa35ad
ZJS
910 if (s == 0)
911 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
912 "Attempt to move to uninitialized object: %" PRIu64,
913 offset);
914 if (s < sizeof(ObjectHeader))
915 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
916 "Attempt to move to overly short object: %" PRIu64,
917 offset);
918
919 if (o->object.type <= OBJECT_UNUSED)
920 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
921 "Attempt to move to object with invalid type: %" PRIu64,
922 offset);
923
924 if (s < minimum_header_size(o))
925 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
926 "Attempt to move to truncated object: %" PRIu64,
927 offset);
928
929 if (type > OBJECT_UNUSED && o->object.type != type)
930 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
931 "Attempt to move to object of unexpected type: %" PRIu64,
932 offset);
cec736d2 933
b439282e
VC
934 if (s > tsize) {
935 r = journal_file_move_to(f, type, false, offset, s, &t, NULL);
cec736d2
LP
936 if (r < 0)
937 return r;
938
939 o = (Object*) t;
940 }
941
24754f36
TR
942 r = journal_file_check_object(f, offset, o);
943 if (r < 0)
944 return r;
945
cec736d2
LP
946 *ret = o;
947 return 0;
948}
949
d98cc1f2 950static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
cec736d2
LP
951 uint64_t r;
952
953 assert(f);
c88cc6af 954 assert(f->header);
cec736d2 955
beec0085 956 r = le64toh(f->header->tail_entry_seqnum) + 1;
c2373f84
LP
957
958 if (seqnum) {
de190aef 959 /* If an external seqnum counter was passed, we update
c2373f84
LP
960 * both the local and the external one, and set it to
961 * the maximum of both */
962
963 if (*seqnum + 1 > r)
964 r = *seqnum + 1;
965
966 *seqnum = r;
967 }
968
beec0085 969 f->header->tail_entry_seqnum = htole64(r);
cec736d2 970
beec0085
LP
971 if (f->header->head_entry_seqnum == 0)
972 f->header->head_entry_seqnum = htole64(r);
de190aef 973
cec736d2
LP
974 return r;
975}
976
78519831 977int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
cec736d2
LP
978 int r;
979 uint64_t p;
980 Object *tail, *o;
981 void *t;
982
983 assert(f);
c88cc6af 984 assert(f->header);
d05089d8 985 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
cec736d2
LP
986 assert(size >= sizeof(ObjectHeader));
987 assert(offset);
988 assert(ret);
989
26687bf8
OS
990 r = journal_file_set_online(f);
991 if (r < 0)
992 return r;
993
cec736d2 994 p = le64toh(f->header->tail_object_offset);
cec736d2 995 if (p == 0)
23b0b2b2 996 p = le64toh(f->header->header_size);
cec736d2 997 else {
d05089d8 998 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
cec736d2
LP
999 if (r < 0)
1000 return r;
1001
1002 p += ALIGN64(le64toh(tail->object.size));
1003 }
1004
1005 r = journal_file_allocate(f, p, size);
1006 if (r < 0)
1007 return r;
1008
b439282e 1009 r = journal_file_move_to(f, type, false, p, size, &t, NULL);
cec736d2
LP
1010 if (r < 0)
1011 return r;
1012
1013 o = (Object*) t;
1014
1015 zero(o->object);
de190aef 1016 o->object.type = type;
cec736d2
LP
1017 o->object.size = htole64(size);
1018
1019 f->header->tail_object_offset = htole64(p);
cec736d2
LP
1020 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
1021
1022 *ret = o;
1023 *offset = p;
1024
1025 return 0;
1026}
1027
de190aef 1028static int journal_file_setup_data_hash_table(JournalFile *f) {
cec736d2
LP
1029 uint64_t s, p;
1030 Object *o;
1031 int r;
1032
1033 assert(f);
c88cc6af 1034 assert(f->header);
cec736d2 1035
070052ab
LP
1036 /* We estimate that we need 1 hash table entry per 768 bytes
1037 of journal file and we want to make sure we never get
1038 beyond 75% fill level. Calculate the hash table size for
1039 the maximum file size based on these metrics. */
4a92baf3 1040
dfabe643 1041 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
4a92baf3
LP
1042 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
1043 s = DEFAULT_DATA_HASH_TABLE_SIZE;
1044
507f22bd 1045 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
4a92baf3 1046
de190aef
LP
1047 r = journal_file_append_object(f,
1048 OBJECT_DATA_HASH_TABLE,
1049 offsetof(Object, hash_table.items) + s,
1050 &o, &p);
cec736d2
LP
1051 if (r < 0)
1052 return r;
1053
29804cc1 1054 memzero(o->hash_table.items, s);
cec736d2 1055
de190aef
LP
1056 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1057 f->header->data_hash_table_size = htole64(s);
cec736d2
LP
1058
1059 return 0;
1060}
1061
de190aef 1062static int journal_file_setup_field_hash_table(JournalFile *f) {
cec736d2
LP
1063 uint64_t s, p;
1064 Object *o;
1065 int r;
1066
1067 assert(f);
c88cc6af 1068 assert(f->header);
cec736d2 1069
3c1668da
LP
1070 /* We use a fixed size hash table for the fields as this
1071 * number should grow very slowly only */
1072
de190aef
LP
1073 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
1074 r = journal_file_append_object(f,
1075 OBJECT_FIELD_HASH_TABLE,
1076 offsetof(Object, hash_table.items) + s,
1077 &o, &p);
cec736d2
LP
1078 if (r < 0)
1079 return r;
1080
29804cc1 1081 memzero(o->hash_table.items, s);
cec736d2 1082
de190aef
LP
1083 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1084 f->header->field_hash_table_size = htole64(s);
cec736d2
LP
1085
1086 return 0;
1087}
1088
dade37d4 1089int journal_file_map_data_hash_table(JournalFile *f) {
cec736d2
LP
1090 uint64_t s, p;
1091 void *t;
1092 int r;
1093
1094 assert(f);
c88cc6af 1095 assert(f->header);
cec736d2 1096
dade37d4
LP
1097 if (f->data_hash_table)
1098 return 0;
1099
de190aef
LP
1100 p = le64toh(f->header->data_hash_table_offset);
1101 s = le64toh(f->header->data_hash_table_size);
cec736d2 1102
de190aef 1103 r = journal_file_move_to(f,
16e9f408 1104 OBJECT_DATA_HASH_TABLE,
fcde2389 1105 true,
de190aef 1106 p, s,
b42549ad 1107 &t, NULL);
cec736d2
LP
1108 if (r < 0)
1109 return r;
1110
de190aef 1111 f->data_hash_table = t;
cec736d2
LP
1112 return 0;
1113}
1114
dade37d4 1115int journal_file_map_field_hash_table(JournalFile *f) {
cec736d2
LP
1116 uint64_t s, p;
1117 void *t;
1118 int r;
1119
1120 assert(f);
c88cc6af 1121 assert(f->header);
cec736d2 1122
dade37d4
LP
1123 if (f->field_hash_table)
1124 return 0;
1125
de190aef
LP
1126 p = le64toh(f->header->field_hash_table_offset);
1127 s = le64toh(f->header->field_hash_table_size);
cec736d2 1128
de190aef 1129 r = journal_file_move_to(f,
16e9f408 1130 OBJECT_FIELD_HASH_TABLE,
fcde2389 1131 true,
de190aef 1132 p, s,
b42549ad 1133 &t, NULL);
cec736d2
LP
1134 if (r < 0)
1135 return r;
1136
de190aef 1137 f->field_hash_table = t;
cec736d2
LP
1138 return 0;
1139}
1140
3c1668da
LP
1141static int journal_file_link_field(
1142 JournalFile *f,
1143 Object *o,
1144 uint64_t offset,
1145 uint64_t hash) {
1146
805d1486 1147 uint64_t p, h, m;
3c1668da
LP
1148 int r;
1149
1150 assert(f);
c88cc6af 1151 assert(f->header);
90d222c1 1152 assert(f->field_hash_table);
3c1668da
LP
1153 assert(o);
1154 assert(offset > 0);
1155
1156 if (o->object.type != OBJECT_FIELD)
1157 return -EINVAL;
1158
805d1486
LP
1159 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
1160 if (m <= 0)
1161 return -EBADMSG;
3c1668da 1162
805d1486 1163 /* This might alter the window we are looking at */
3c1668da
LP
1164 o->field.next_hash_offset = o->field.head_data_offset = 0;
1165
805d1486 1166 h = hash % m;
3c1668da
LP
1167 p = le64toh(f->field_hash_table[h].tail_hash_offset);
1168 if (p == 0)
1169 f->field_hash_table[h].head_hash_offset = htole64(offset);
1170 else {
1171 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1172 if (r < 0)
1173 return r;
1174
1175 o->field.next_hash_offset = htole64(offset);
1176 }
1177
1178 f->field_hash_table[h].tail_hash_offset = htole64(offset);
1179
1180 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
1181 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
1182
1183 return 0;
1184}
1185
1186static int journal_file_link_data(
1187 JournalFile *f,
1188 Object *o,
1189 uint64_t offset,
1190 uint64_t hash) {
1191
805d1486 1192 uint64_t p, h, m;
cec736d2
LP
1193 int r;
1194
1195 assert(f);
c88cc6af 1196 assert(f->header);
90d222c1 1197 assert(f->data_hash_table);
cec736d2
LP
1198 assert(o);
1199 assert(offset > 0);
b588975f
LP
1200
1201 if (o->object.type != OBJECT_DATA)
1202 return -EINVAL;
cec736d2 1203
805d1486
LP
1204 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
1205 if (m <= 0)
1206 return -EBADMSG;
48496df6 1207
805d1486 1208 /* This might alter the window we are looking at */
de190aef
LP
1209 o->data.next_hash_offset = o->data.next_field_offset = 0;
1210 o->data.entry_offset = o->data.entry_array_offset = 0;
1211 o->data.n_entries = 0;
cec736d2 1212
805d1486 1213 h = hash % m;
8db4213e 1214 p = le64toh(f->data_hash_table[h].tail_hash_offset);
3c1668da 1215 if (p == 0)
cec736d2 1216 /* Only entry in the hash table is easy */
de190aef 1217 f->data_hash_table[h].head_hash_offset = htole64(offset);
3c1668da 1218 else {
48496df6
LP
1219 /* Move back to the previous data object, to patch in
1220 * pointer */
cec736d2 1221
de190aef 1222 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1223 if (r < 0)
1224 return r;
1225
de190aef 1226 o->data.next_hash_offset = htole64(offset);
cec736d2
LP
1227 }
1228
de190aef 1229 f->data_hash_table[h].tail_hash_offset = htole64(offset);
cec736d2 1230
dca6219e
LP
1231 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
1232 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
1233
cec736d2
LP
1234 return 0;
1235}
1236
3c1668da
LP
1237int journal_file_find_field_object_with_hash(
1238 JournalFile *f,
1239 const void *field, uint64_t size, uint64_t hash,
1240 Object **ret, uint64_t *offset) {
1241
805d1486 1242 uint64_t p, osize, h, m;
3c1668da
LP
1243 int r;
1244
1245 assert(f);
c88cc6af 1246 assert(f->header);
3c1668da
LP
1247 assert(field && size > 0);
1248
dade37d4
LP
1249 /* If the field hash table is empty, we can't find anything */
1250 if (le64toh(f->header->field_hash_table_size) <= 0)
1251 return 0;
1252
1253 /* Map the field hash table, if it isn't mapped yet. */
1254 r = journal_file_map_field_hash_table(f);
1255 if (r < 0)
1256 return r;
1257
3c1668da
LP
1258 osize = offsetof(Object, field.payload) + size;
1259
805d1486 1260 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
805d1486 1261 if (m <= 0)
3c1668da
LP
1262 return -EBADMSG;
1263
805d1486 1264 h = hash % m;
3c1668da
LP
1265 p = le64toh(f->field_hash_table[h].head_hash_offset);
1266
1267 while (p > 0) {
1268 Object *o;
1269
1270 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1271 if (r < 0)
1272 return r;
1273
1274 if (le64toh(o->field.hash) == hash &&
1275 le64toh(o->object.size) == osize &&
1276 memcmp(o->field.payload, field, size) == 0) {
1277
1278 if (ret)
1279 *ret = o;
1280 if (offset)
1281 *offset = p;
1282
1283 return 1;
1284 }
1285
1286 p = le64toh(o->field.next_hash_offset);
1287 }
1288
1289 return 0;
1290}
1291
1292int journal_file_find_field_object(
1293 JournalFile *f,
1294 const void *field, uint64_t size,
1295 Object **ret, uint64_t *offset) {
1296
1297 uint64_t hash;
1298
1299 assert(f);
1300 assert(field && size > 0);
1301
1302 hash = hash64(field, size);
1303
1304 return journal_file_find_field_object_with_hash(f,
1305 field, size, hash,
1306 ret, offset);
1307}
1308
de190aef
LP
1309int journal_file_find_data_object_with_hash(
1310 JournalFile *f,
1311 const void *data, uint64_t size, uint64_t hash,
1312 Object **ret, uint64_t *offset) {
48496df6 1313
805d1486 1314 uint64_t p, osize, h, m;
cec736d2
LP
1315 int r;
1316
1317 assert(f);
c88cc6af 1318 assert(f->header);
cec736d2
LP
1319 assert(data || size == 0);
1320
dade37d4
LP
1321 /* If there's no data hash table, then there's no entry. */
1322 if (le64toh(f->header->data_hash_table_size) <= 0)
1323 return 0;
1324
1325 /* Map the data hash table, if it isn't mapped yet. */
1326 r = journal_file_map_data_hash_table(f);
1327 if (r < 0)
1328 return r;
1329
cec736d2
LP
1330 osize = offsetof(Object, data.payload) + size;
1331
805d1486
LP
1332 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
1333 if (m <= 0)
bc85bfee
LP
1334 return -EBADMSG;
1335
805d1486 1336 h = hash % m;
de190aef 1337 p = le64toh(f->data_hash_table[h].head_hash_offset);
cec736d2 1338
de190aef
LP
1339 while (p > 0) {
1340 Object *o;
cec736d2 1341
de190aef 1342 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1343 if (r < 0)
1344 return r;
1345
807e17f0 1346 if (le64toh(o->data.hash) != hash)
85a131e8 1347 goto next;
807e17f0 1348
d89c8fdf 1349 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
349cc4a5 1350#if HAVE_XZ || HAVE_LZ4
fa1c4b51 1351 uint64_t l;
a7f7d1bd 1352 size_t rsize = 0;
cec736d2 1353
807e17f0
LP
1354 l = le64toh(o->object.size);
1355 if (l <= offsetof(Object, data.payload))
cec736d2
LP
1356 return -EBADMSG;
1357
807e17f0
LP
1358 l -= offsetof(Object, data.payload);
1359
d89c8fdf
ZJS
1360 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
1361 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
1362 if (r < 0)
1363 return r;
807e17f0 1364
b785c858 1365 if (rsize == size &&
807e17f0
LP
1366 memcmp(f->compress_buffer, data, size) == 0) {
1367
1368 if (ret)
1369 *ret = o;
1370
1371 if (offset)
1372 *offset = p;
1373
1374 return 1;
1375 }
3b1a55e1
ZJS
1376#else
1377 return -EPROTONOSUPPORT;
1378#endif
807e17f0
LP
1379 } else if (le64toh(o->object.size) == osize &&
1380 memcmp(o->data.payload, data, size) == 0) {
1381
cec736d2
LP
1382 if (ret)
1383 *ret = o;
1384
1385 if (offset)
1386 *offset = p;
1387
de190aef 1388 return 1;
cec736d2
LP
1389 }
1390
85a131e8 1391 next:
cec736d2
LP
1392 p = le64toh(o->data.next_hash_offset);
1393 }
1394
de190aef
LP
1395 return 0;
1396}
1397
1398int journal_file_find_data_object(
1399 JournalFile *f,
1400 const void *data, uint64_t size,
1401 Object **ret, uint64_t *offset) {
1402
1403 uint64_t hash;
1404
1405 assert(f);
1406 assert(data || size == 0);
1407
1408 hash = hash64(data, size);
1409
1410 return journal_file_find_data_object_with_hash(f,
1411 data, size, hash,
1412 ret, offset);
1413}
1414
3c1668da
LP
1415static int journal_file_append_field(
1416 JournalFile *f,
1417 const void *field, uint64_t size,
1418 Object **ret, uint64_t *offset) {
1419
1420 uint64_t hash, p;
1421 uint64_t osize;
1422 Object *o;
1423 int r;
1424
1425 assert(f);
1426 assert(field && size > 0);
1427
1428 hash = hash64(field, size);
1429
1430 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
1431 if (r < 0)
1432 return r;
1433 else if (r > 0) {
1434
1435 if (ret)
1436 *ret = o;
1437
1438 if (offset)
1439 *offset = p;
1440
1441 return 0;
1442 }
1443
1444 osize = offsetof(Object, field.payload) + size;
1445 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
8c92d4bb
LP
1446 if (r < 0)
1447 return r;
3c1668da
LP
1448
1449 o->field.hash = htole64(hash);
1450 memcpy(o->field.payload, field, size);
1451
1452 r = journal_file_link_field(f, o, p, hash);
1453 if (r < 0)
1454 return r;
1455
1456 /* The linking might have altered the window, so let's
1457 * refresh our pointer */
1458 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1459 if (r < 0)
1460 return r;
1461
349cc4a5 1462#if HAVE_GCRYPT
3c1668da
LP
1463 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1464 if (r < 0)
1465 return r;
1466#endif
1467
1468 if (ret)
1469 *ret = o;
1470
1471 if (offset)
1472 *offset = p;
1473
1474 return 0;
1475}
1476
48496df6
LP
1477static int journal_file_append_data(
1478 JournalFile *f,
1479 const void *data, uint64_t size,
1480 Object **ret, uint64_t *offset) {
1481
de190aef
LP
1482 uint64_t hash, p;
1483 uint64_t osize;
1484 Object *o;
d89c8fdf 1485 int r, compression = 0;
3c1668da 1486 const void *eq;
de190aef
LP
1487
1488 assert(f);
1489 assert(data || size == 0);
1490
1491 hash = hash64(data, size);
1492
1493 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1494 if (r < 0)
1495 return r;
0240c603 1496 if (r > 0) {
de190aef
LP
1497
1498 if (ret)
1499 *ret = o;
1500
1501 if (offset)
1502 *offset = p;
1503
1504 return 0;
1505 }
1506
1507 osize = offsetof(Object, data.payload) + size;
1508 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
cec736d2
LP
1509 if (r < 0)
1510 return r;
1511
cec736d2 1512 o->data.hash = htole64(hash);
807e17f0 1513
349cc4a5 1514#if HAVE_XZ || HAVE_LZ4
57850536 1515 if (JOURNAL_FILE_COMPRESS(f) && size >= f->compress_threshold_bytes) {
a7f7d1bd 1516 size_t rsize = 0;
807e17f0 1517
5d6f46b6 1518 compression = compress_blob(data, size, o->data.payload, size - 1, &rsize);
807e17f0 1519
d1afbcd2 1520 if (compression >= 0) {
807e17f0 1521 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
d89c8fdf 1522 o->object.flags |= compression;
807e17f0 1523
fa1c4b51 1524 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
d89c8fdf 1525 size, rsize, object_compressed_to_string(compression));
d1afbcd2
LP
1526 } else
1527 /* Compression didn't work, we don't really care why, let's continue without compression */
1528 compression = 0;
807e17f0
LP
1529 }
1530#endif
1531
75f32f04
ZJS
1532 if (compression == 0)
1533 memcpy_safe(o->data.payload, data, size);
cec736d2 1534
de190aef 1535 r = journal_file_link_data(f, o, p, hash);
cec736d2
LP
1536 if (r < 0)
1537 return r;
1538
349cc4a5 1539#if HAVE_GCRYPT
33685a5a
FB
1540 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1541 if (r < 0)
1542 return r;
1543#endif
1544
48496df6
LP
1545 /* The linking might have altered the window, so let's
1546 * refresh our pointer */
1547 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1548 if (r < 0)
1549 return r;
1550
08c6f819
SL
1551 if (!data)
1552 eq = NULL;
1553 else
1554 eq = memchr(data, '=', size);
3c1668da 1555 if (eq && eq > data) {
748db592 1556 Object *fo = NULL;
3c1668da 1557 uint64_t fp;
3c1668da
LP
1558
1559 /* Create field object ... */
1560 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1561 if (r < 0)
1562 return r;
1563
1564 /* ... and link it in. */
1565 o->data.next_field_offset = fo->field.head_data_offset;
1566 fo->field.head_data_offset = le64toh(p);
1567 }
1568
cec736d2
LP
1569 if (ret)
1570 *ret = o;
1571
1572 if (offset)
de190aef 1573 *offset = p;
cec736d2
LP
1574
1575 return 0;
1576}
1577
1578uint64_t journal_file_entry_n_items(Object *o) {
1579 assert(o);
b588975f
LP
1580
1581 if (o->object.type != OBJECT_ENTRY)
1582 return 0;
cec736d2
LP
1583
1584 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1585}
1586
0284adc6 1587uint64_t journal_file_entry_array_n_items(Object *o) {
de190aef 1588 assert(o);
b588975f
LP
1589
1590 if (o->object.type != OBJECT_ENTRY_ARRAY)
1591 return 0;
de190aef
LP
1592
1593 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1594}
1595
fb9a24b6
LP
1596uint64_t journal_file_hash_table_n_items(Object *o) {
1597 assert(o);
b588975f 1598
ec2ce0c5 1599 if (!IN_SET(o->object.type, OBJECT_DATA_HASH_TABLE, OBJECT_FIELD_HASH_TABLE))
b588975f 1600 return 0;
fb9a24b6
LP
1601
1602 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1603}
1604
de190aef 1605static int link_entry_into_array(JournalFile *f,
4fd052ae
FC
1606 le64_t *first,
1607 le64_t *idx,
de190aef 1608 uint64_t p) {
cec736d2 1609 int r;
de190aef
LP
1610 uint64_t n = 0, ap = 0, q, i, a, hidx;
1611 Object *o;
1612
cec736d2 1613 assert(f);
c88cc6af 1614 assert(f->header);
de190aef
LP
1615 assert(first);
1616 assert(idx);
1617 assert(p > 0);
cec736d2 1618
de190aef
LP
1619 a = le64toh(*first);
1620 i = hidx = le64toh(*idx);
1621 while (a > 0) {
1622
1623 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1624 if (r < 0)
1625 return r;
cec736d2 1626
de190aef
LP
1627 n = journal_file_entry_array_n_items(o);
1628 if (i < n) {
1629 o->entry_array.items[i] = htole64(p);
1630 *idx = htole64(hidx + 1);
1631 return 0;
1632 }
cec736d2 1633
de190aef
LP
1634 i -= n;
1635 ap = a;
1636 a = le64toh(o->entry_array.next_entry_array_offset);
1637 }
1638
1639 if (hidx > n)
1640 n = (hidx+1) * 2;
1641 else
1642 n = n * 2;
1643
1644 if (n < 4)
1645 n = 4;
1646
1647 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1648 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1649 &o, &q);
cec736d2
LP
1650 if (r < 0)
1651 return r;
1652
349cc4a5 1653#if HAVE_GCRYPT
5996c7c2 1654 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
b0af6f41
LP
1655 if (r < 0)
1656 return r;
feb12d3e 1657#endif
b0af6f41 1658
de190aef 1659 o->entry_array.items[i] = htole64(p);
cec736d2 1660
de190aef 1661 if (ap == 0)
7be3aa17 1662 *first = htole64(q);
cec736d2 1663 else {
de190aef 1664 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
cec736d2
LP
1665 if (r < 0)
1666 return r;
1667
de190aef
LP
1668 o->entry_array.next_entry_array_offset = htole64(q);
1669 }
cec736d2 1670
2dee23eb
LP
1671 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1672 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1673
de190aef
LP
1674 *idx = htole64(hidx + 1);
1675
1676 return 0;
1677}
cec736d2 1678
de190aef 1679static int link_entry_into_array_plus_one(JournalFile *f,
4fd052ae
FC
1680 le64_t *extra,
1681 le64_t *first,
1682 le64_t *idx,
de190aef
LP
1683 uint64_t p) {
1684
1685 int r;
1686
1687 assert(f);
1688 assert(extra);
1689 assert(first);
1690 assert(idx);
1691 assert(p > 0);
1692
1693 if (*idx == 0)
1694 *extra = htole64(p);
1695 else {
4fd052ae 1696 le64_t i;
de190aef 1697
7be3aa17 1698 i = htole64(le64toh(*idx) - 1);
de190aef
LP
1699 r = link_entry_into_array(f, first, &i, p);
1700 if (r < 0)
1701 return r;
cec736d2
LP
1702 }
1703
de190aef
LP
1704 *idx = htole64(le64toh(*idx) + 1);
1705 return 0;
1706}
1707
1708static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1709 uint64_t p;
1710 int r;
1711 assert(f);
1712 assert(o);
1713 assert(offset > 0);
1714
1715 p = le64toh(o->entry.items[i].object_offset);
1716 if (p == 0)
1717 return -EINVAL;
1718
1719 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1720 if (r < 0)
1721 return r;
1722
de190aef
LP
1723 return link_entry_into_array_plus_one(f,
1724 &o->data.entry_offset,
1725 &o->data.entry_array_offset,
1726 &o->data.n_entries,
1727 offset);
cec736d2
LP
1728}
1729
1730static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
de190aef 1731 uint64_t n, i;
cec736d2
LP
1732 int r;
1733
1734 assert(f);
c88cc6af 1735 assert(f->header);
cec736d2
LP
1736 assert(o);
1737 assert(offset > 0);
b588975f
LP
1738
1739 if (o->object.type != OBJECT_ENTRY)
1740 return -EINVAL;
cec736d2 1741
b788cc23
LP
1742 __sync_synchronize();
1743
cec736d2 1744 /* Link up the entry itself */
de190aef
LP
1745 r = link_entry_into_array(f,
1746 &f->header->entry_array_offset,
1747 &f->header->n_entries,
1748 offset);
1749 if (r < 0)
1750 return r;
cec736d2 1751
507f22bd 1752 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
cec736d2 1753
de190aef 1754 if (f->header->head_entry_realtime == 0)
0ac38b70 1755 f->header->head_entry_realtime = o->entry.realtime;
cec736d2 1756
0ac38b70 1757 f->header->tail_entry_realtime = o->entry.realtime;
de190aef
LP
1758 f->header->tail_entry_monotonic = o->entry.monotonic;
1759
cec736d2
LP
1760 /* Link up the items */
1761 n = journal_file_entry_n_items(o);
1762 for (i = 0; i < n; i++) {
1763 r = journal_file_link_entry_item(f, o, offset, i);
1764 if (r < 0)
1765 return r;
1766 }
1767
cec736d2
LP
1768 return 0;
1769}
1770
1771static int journal_file_append_entry_internal(
1772 JournalFile *f,
1773 const dual_timestamp *ts,
d180c349 1774 const sd_id128_t *boot_id,
cec736d2
LP
1775 uint64_t xor_hash,
1776 const EntryItem items[], unsigned n_items,
de190aef 1777 uint64_t *seqnum,
cec736d2
LP
1778 Object **ret, uint64_t *offset) {
1779 uint64_t np;
1780 uint64_t osize;
1781 Object *o;
1782 int r;
1783
1784 assert(f);
c88cc6af 1785 assert(f->header);
cec736d2 1786 assert(items || n_items == 0);
de190aef 1787 assert(ts);
cec736d2
LP
1788
1789 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1790
de190aef 1791 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
cec736d2
LP
1792 if (r < 0)
1793 return r;
1794
d98cc1f2 1795 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
75f32f04 1796 memcpy_safe(o->entry.items, items, n_items * sizeof(EntryItem));
de190aef
LP
1797 o->entry.realtime = htole64(ts->realtime);
1798 o->entry.monotonic = htole64(ts->monotonic);
cec736d2 1799 o->entry.xor_hash = htole64(xor_hash);
924426a7
CM
1800 if (boot_id)
1801 f->header->boot_id = *boot_id;
1802 o->entry.boot_id = f->header->boot_id;
cec736d2 1803
349cc4a5 1804#if HAVE_GCRYPT
5996c7c2 1805 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
b0af6f41
LP
1806 if (r < 0)
1807 return r;
feb12d3e 1808#endif
b0af6f41 1809
cec736d2
LP
1810 r = journal_file_link_entry(f, o, np);
1811 if (r < 0)
1812 return r;
1813
1814 if (ret)
1815 *ret = o;
1816
1817 if (offset)
1818 *offset = np;
1819
1820 return 0;
1821}
1822
cf244689 1823void journal_file_post_change(JournalFile *f) {
50f20cfd
LP
1824 assert(f);
1825
c5236850
DT
1826 if (f->fd < 0)
1827 return;
1828
50f20cfd
LP
1829 /* inotify() does not receive IN_MODIFY events from file
1830 * accesses done via mmap(). After each access we hence
1831 * trigger IN_MODIFY by truncating the journal file to its
1832 * current size which triggers IN_MODIFY. */
1833
bc85bfee
LP
1834 __sync_synchronize();
1835
50f20cfd 1836 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
e167d7fd 1837 log_debug_errno(errno, "Failed to truncate file to its own size: %m");
50f20cfd
LP
1838}
1839
7a24f3bf
VC
1840static int post_change_thunk(sd_event_source *timer, uint64_t usec, void *userdata) {
1841 assert(userdata);
1842
1843 journal_file_post_change(userdata);
1844
1845 return 1;
1846}
1847
1848static void schedule_post_change(JournalFile *f) {
7a24f3bf 1849 uint64_t now;
b6cdfbe5 1850 int r;
7a24f3bf
VC
1851
1852 assert(f);
1853 assert(f->post_change_timer);
1854
b6cdfbe5 1855 r = sd_event_source_get_enabled(f->post_change_timer, NULL);
7a24f3bf 1856 if (r < 0) {
e167d7fd
LP
1857 log_debug_errno(r, "Failed to get ftruncate timer state: %m");
1858 goto fail;
7a24f3bf 1859 }
b6cdfbe5 1860 if (r > 0)
7a24f3bf
VC
1861 return;
1862
ca5d90d4 1863 r = sd_event_now(sd_event_source_get_event(f->post_change_timer), CLOCK_MONOTONIC, &now);
7a24f3bf 1864 if (r < 0) {
e167d7fd
LP
1865 log_debug_errno(r, "Failed to get clock's now for scheduling ftruncate: %m");
1866 goto fail;
7a24f3bf
VC
1867 }
1868
ca5d90d4 1869 r = sd_event_source_set_time(f->post_change_timer, now + f->post_change_timer_period);
7a24f3bf 1870 if (r < 0) {
e167d7fd
LP
1871 log_debug_errno(r, "Failed to set time for scheduling ftruncate: %m");
1872 goto fail;
7a24f3bf
VC
1873 }
1874
ca5d90d4 1875 r = sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_ONESHOT);
7a24f3bf 1876 if (r < 0) {
e167d7fd
LP
1877 log_debug_errno(r, "Failed to enable scheduled ftruncate: %m");
1878 goto fail;
7a24f3bf 1879 }
e167d7fd
LP
1880
1881 return;
1882
1883fail:
1884 /* On failure, let's simply post the change immediately. */
1885 journal_file_post_change(f);
7a24f3bf
VC
1886}
1887
1888/* Enable coalesced change posting in a timer on the provided sd_event instance */
1889int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t) {
1890 _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL;
1891 int r;
1892
1893 assert(f);
1894 assert_return(!f->post_change_timer, -EINVAL);
1895 assert(e);
1896 assert(t);
1897
1898 r = sd_event_add_time(e, &timer, CLOCK_MONOTONIC, 0, 0, post_change_thunk, f);
1899 if (r < 0)
1900 return r;
1901
1902 r = sd_event_source_set_enabled(timer, SD_EVENT_OFF);
1903 if (r < 0)
1904 return r;
1905
1cc6c93a 1906 f->post_change_timer = TAKE_PTR(timer);
7a24f3bf
VC
1907 f->post_change_timer_period = t;
1908
1909 return r;
1910}
1911
93bab288
YW
1912static int entry_item_cmp(const EntryItem *a, const EntryItem *b) {
1913 return CMP(le64toh(a->object_offset), le64toh(b->object_offset));
1f2da9ec
LP
1914}
1915
d180c349
ZJS
1916int journal_file_append_entry(
1917 JournalFile *f,
1918 const dual_timestamp *ts,
1919 const sd_id128_t *boot_id,
1920 const struct iovec iovec[], unsigned n_iovec,
1921 uint64_t *seqnum,
1922 Object **ret, uint64_t *offset) {
1923
cec736d2
LP
1924 unsigned i;
1925 EntryItem *items;
1926 int r;
1927 uint64_t xor_hash = 0;
de190aef 1928 struct dual_timestamp _ts;
cec736d2
LP
1929
1930 assert(f);
c88cc6af 1931 assert(f->header);
cec736d2
LP
1932 assert(iovec || n_iovec == 0);
1933
c6273953 1934 if (ts) {
baaa35ad
ZJS
1935 if (!VALID_REALTIME(ts->realtime))
1936 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
1937 "Invalid realtime timestamp %" PRIu64 ", refusing entry.",
1938 ts->realtime);
1939 if (!VALID_MONOTONIC(ts->monotonic))
1940 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
1941 "Invalid monotomic timestamp %" PRIu64 ", refusing entry.",
1942 ts->monotonic);
c6273953 1943 } else {
de190aef
LP
1944 dual_timestamp_get(&_ts);
1945 ts = &_ts;
1946 }
1947
349cc4a5 1948#if HAVE_GCRYPT
7560fffc
LP
1949 r = journal_file_maybe_append_tag(f, ts->realtime);
1950 if (r < 0)
1951 return r;
feb12d3e 1952#endif
7560fffc 1953
64825d3c 1954 /* alloca() can't take 0, hence let's allocate at least one */
cf409d15 1955 items = newa(EntryItem, MAX(1u, n_iovec));
cec736d2
LP
1956
1957 for (i = 0; i < n_iovec; i++) {
1958 uint64_t p;
1959 Object *o;
1960
1961 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1962 if (r < 0)
cf244689 1963 return r;
cec736d2
LP
1964
1965 xor_hash ^= le64toh(o->data.hash);
1966 items[i].object_offset = htole64(p);
de7b95cd 1967 items[i].hash = o->data.hash;
cec736d2
LP
1968 }
1969
1f2da9ec
LP
1970 /* Order by the position on disk, in order to improve seek
1971 * times for rotating media. */
93bab288 1972 typesafe_qsort(items, n_iovec, entry_item_cmp);
1f2da9ec 1973
d180c349 1974 r = journal_file_append_entry_internal(f, ts, boot_id, xor_hash, items, n_iovec, seqnum, ret, offset);
cec736d2 1975
fa6ac760
LP
1976 /* If the memory mapping triggered a SIGBUS then we return an
1977 * IO error and ignore the error code passed down to us, since
1978 * it is very likely just an effect of a nullified replacement
1979 * mapping page */
1980
be7cdd8e 1981 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
1982 r = -EIO;
1983
7a24f3bf
VC
1984 if (f->post_change_timer)
1985 schedule_post_change(f);
1986 else
1987 journal_file_post_change(f);
50f20cfd 1988
cec736d2
LP
1989 return r;
1990}
1991
/* A cached position within a chain of entry arrays. Items are stored in an ordered hashmap keyed
 * by the 'first' member (see chain_cache_put()). */
typedef struct ChainCacheItem {
        uint64_t first; /* the array at the beginning of the chain */
        uint64_t array; /* the cached array */
        uint64_t begin; /* the first item in the cached array */
        uint64_t total; /* the total number of items in all arrays before this one in the chain */
        uint64_t last_index; /* the last index we looked at, to optimize locality when bisecting */
} ChainCacheItem;
1999
2000static void chain_cache_put(
4743015d 2001 OrderedHashmap *h,
a4bcff5b
LP
2002 ChainCacheItem *ci,
2003 uint64_t first,
2004 uint64_t array,
2005 uint64_t begin,
f268980d
LP
2006 uint64_t total,
2007 uint64_t last_index) {
a4bcff5b
LP
2008
2009 if (!ci) {
34741aa3
LP
2010 /* If the chain item to cache for this chain is the
2011 * first one it's not worth caching anything */
2012 if (array == first)
2013 return;
2014
29433089 2015 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
4743015d 2016 ci = ordered_hashmap_steal_first(h);
29433089
LP
2017 assert(ci);
2018 } else {
a4bcff5b
LP
2019 ci = new(ChainCacheItem, 1);
2020 if (!ci)
2021 return;
2022 }
2023
2024 ci->first = first;
2025
4743015d 2026 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
a4bcff5b
LP
2027 free(ci);
2028 return;
2029 }
2030 } else
2031 assert(ci->first == first);
2032
2033 ci->array = array;
2034 ci->begin = begin;
2035 ci->total = total;
f268980d 2036 ci->last_index = last_index;
a4bcff5b
LP
2037}
2038
f268980d
LP
2039static int generic_array_get(
2040 JournalFile *f,
2041 uint64_t first,
2042 uint64_t i,
2043 Object **ret, uint64_t *offset) {
de190aef 2044
cec736d2 2045 Object *o;
a4bcff5b 2046 uint64_t p = 0, a, t = 0;
cec736d2 2047 int r;
a4bcff5b 2048 ChainCacheItem *ci;
cec736d2
LP
2049
2050 assert(f);
2051
de190aef 2052 a = first;
a4bcff5b
LP
2053
2054 /* Try the chain cache first */
4743015d 2055 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
2056 if (ci && i > ci->total) {
2057 a = ci->array;
2058 i -= ci->total;
2059 t = ci->total;
2060 }
2061
de190aef 2062 while (a > 0) {
a4bcff5b 2063 uint64_t k;
cec736d2 2064
de190aef
LP
2065 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
2066 if (r < 0)
2067 return r;
cec736d2 2068
a4bcff5b
LP
2069 k = journal_file_entry_array_n_items(o);
2070 if (i < k) {
de190aef 2071 p = le64toh(o->entry_array.items[i]);
a4bcff5b 2072 goto found;
cec736d2
LP
2073 }
2074
a4bcff5b
LP
2075 i -= k;
2076 t += k;
de190aef
LP
2077 a = le64toh(o->entry_array.next_entry_array_offset);
2078 }
2079
a4bcff5b
LP
2080 return 0;
2081
2082found:
2083 /* Let's cache this item for the next invocation */
af13a6b0 2084 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
de190aef
LP
2085
2086 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2087 if (r < 0)
2088 return r;
2089
2090 if (ret)
2091 *ret = o;
2092
2093 if (offset)
2094 *offset = p;
2095
2096 return 1;
2097}
2098
f268980d
LP
2099static int generic_array_get_plus_one(
2100 JournalFile *f,
2101 uint64_t extra,
2102 uint64_t first,
2103 uint64_t i,
2104 Object **ret, uint64_t *offset) {
de190aef
LP
2105
2106 Object *o;
2107
2108 assert(f);
2109
2110 if (i == 0) {
2111 int r;
2112
2113 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
cec736d2
LP
2114 if (r < 0)
2115 return r;
2116
de190aef
LP
2117 if (ret)
2118 *ret = o;
cec736d2 2119
de190aef
LP
2120 if (offset)
2121 *offset = extra;
cec736d2 2122
de190aef 2123 return 1;
cec736d2
LP
2124 }
2125
de190aef
LP
2126 return generic_array_get(f, first, i-1, ret, offset);
2127}
cec736d2 2128
de190aef
LP
/* Results returned by the test_object callbacks that drive the bisection helpers. */
enum {
        TEST_FOUND = 0, /* the tested entry is equal to the needle */
        TEST_LEFT  = 1, /* the tested entry is smaller than the needle */
        TEST_RIGHT = 2, /* the tested entry is larger than the needle */
};
cec736d2 2134
f268980d
LP
2135static int generic_array_bisect(
2136 JournalFile *f,
2137 uint64_t first,
2138 uint64_t n,
2139 uint64_t needle,
2140 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2141 direction_t direction,
2142 Object **ret,
2143 uint64_t *offset,
2144 uint64_t *idx) {
2145
2146 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
de190aef
LP
2147 bool subtract_one = false;
2148 Object *o, *array = NULL;
2149 int r;
a4bcff5b 2150 ChainCacheItem *ci;
cec736d2 2151
de190aef
LP
2152 assert(f);
2153 assert(test_object);
cec736d2 2154
a4bcff5b 2155 /* Start with the first array in the chain */
de190aef 2156 a = first;
a4bcff5b 2157
4743015d 2158 ci = ordered_hashmap_get(f->chain_cache, &first);
96d4d024 2159 if (ci && n > ci->total && ci->begin != 0) {
a4bcff5b
LP
2160 /* Ah, we have iterated this bisection array chain
2161 * previously! Let's see if we can skip ahead in the
2162 * chain, as far as the last time. But we can't jump
2163 * backwards in the chain, so let's check that
2164 * first. */
2165
2166 r = test_object(f, ci->begin, needle);
2167 if (r < 0)
2168 return r;
2169
2170 if (r == TEST_LEFT) {
f268980d 2171 /* OK, what we are looking for is right of the
a4bcff5b
LP
2172 * begin of this EntryArray, so let's jump
2173 * straight to previously cached array in the
2174 * chain */
2175
2176 a = ci->array;
2177 n -= ci->total;
2178 t = ci->total;
f268980d 2179 last_index = ci->last_index;
a4bcff5b
LP
2180 }
2181 }
2182
de190aef
LP
2183 while (a > 0) {
2184 uint64_t left, right, k, lp;
2185
2186 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
cec736d2
LP
2187 if (r < 0)
2188 return r;
2189
de190aef
LP
2190 k = journal_file_entry_array_n_items(array);
2191 right = MIN(k, n);
2192 if (right <= 0)
2193 return 0;
cec736d2 2194
de190aef
LP
2195 i = right - 1;
2196 lp = p = le64toh(array->entry_array.items[i]);
2197 if (p <= 0)
bee6a291
LP
2198 r = -EBADMSG;
2199 else
2200 r = test_object(f, p, needle);
2201 if (r == -EBADMSG) {
2202 log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (1)");
2203 n = i;
2204 continue;
2205 }
de190aef
LP
2206 if (r < 0)
2207 return r;
cec736d2 2208
de190aef
LP
2209 if (r == TEST_FOUND)
2210 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2211
2212 if (r == TEST_RIGHT) {
2213 left = 0;
2214 right -= 1;
f268980d
LP
2215
2216 if (last_index != (uint64_t) -1) {
2217 assert(last_index <= right);
2218
2219 /* If we cached the last index we
2220 * looked at, let's try to not to jump
2221 * too wildly around and see if we can
2222 * limit the range to look at early to
2223 * the immediate neighbors of the last
2224 * index we looked at. */
2225
2226 if (last_index > 0) {
2227 uint64_t x = last_index - 1;
2228
2229 p = le64toh(array->entry_array.items[x]);
2230 if (p <= 0)
2231 return -EBADMSG;
2232
2233 r = test_object(f, p, needle);
2234 if (r < 0)
2235 return r;
2236
2237 if (r == TEST_FOUND)
2238 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2239
2240 if (r == TEST_RIGHT)
2241 right = x;
2242 else
2243 left = x + 1;
2244 }
2245
2246 if (last_index < right) {
2247 uint64_t y = last_index + 1;
2248
2249 p = le64toh(array->entry_array.items[y]);
2250 if (p <= 0)
2251 return -EBADMSG;
2252
2253 r = test_object(f, p, needle);
2254 if (r < 0)
2255 return r;
2256
2257 if (r == TEST_FOUND)
2258 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2259
2260 if (r == TEST_RIGHT)
2261 right = y;
2262 else
2263 left = y + 1;
2264 }
f268980d
LP
2265 }
2266
de190aef
LP
2267 for (;;) {
2268 if (left == right) {
2269 if (direction == DIRECTION_UP)
2270 subtract_one = true;
2271
2272 i = left;
2273 goto found;
2274 }
2275
2276 assert(left < right);
de190aef 2277 i = (left + right) / 2;
f268980d 2278
de190aef
LP
2279 p = le64toh(array->entry_array.items[i]);
2280 if (p <= 0)
bee6a291
LP
2281 r = -EBADMSG;
2282 else
2283 r = test_object(f, p, needle);
2284 if (r == -EBADMSG) {
2285 log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (2)");
2286 right = n = i;
2287 continue;
2288 }
de190aef
LP
2289 if (r < 0)
2290 return r;
cec736d2 2291
de190aef
LP
2292 if (r == TEST_FOUND)
2293 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2294
2295 if (r == TEST_RIGHT)
2296 right = i;
2297 else
2298 left = i + 1;
2299 }
2300 }
2301
2173cbf8 2302 if (k >= n) {
cbdca852
LP
2303 if (direction == DIRECTION_UP) {
2304 i = n;
2305 subtract_one = true;
2306 goto found;
2307 }
2308
cec736d2 2309 return 0;
cbdca852 2310 }
cec736d2 2311
de190aef
LP
2312 last_p = lp;
2313
2314 n -= k;
2315 t += k;
f268980d 2316 last_index = (uint64_t) -1;
de190aef 2317 a = le64toh(array->entry_array.next_entry_array_offset);
cec736d2
LP
2318 }
2319
2320 return 0;
de190aef
LP
2321
2322found:
2323 if (subtract_one && t == 0 && i == 0)
2324 return 0;
2325
a4bcff5b 2326 /* Let's cache this item for the next invocation */
af13a6b0 2327 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
a4bcff5b 2328
de190aef
LP
2329 if (subtract_one && i == 0)
2330 p = last_p;
2331 else if (subtract_one)
2332 p = le64toh(array->entry_array.items[i-1]);
2333 else
2334 p = le64toh(array->entry_array.items[i]);
2335
2336 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2337 if (r < 0)
2338 return r;
2339
2340 if (ret)
2341 *ret = o;
2342
2343 if (offset)
2344 *offset = p;
2345
2346 if (idx)
cbdca852 2347 *idx = t + i + (subtract_one ? -1 : 0);
de190aef
LP
2348
2349 return 1;
cec736d2
LP
2350}
2351
f268980d
LP
2352static int generic_array_bisect_plus_one(
2353 JournalFile *f,
2354 uint64_t extra,
2355 uint64_t first,
2356 uint64_t n,
2357 uint64_t needle,
2358 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2359 direction_t direction,
2360 Object **ret,
2361 uint64_t *offset,
2362 uint64_t *idx) {
de190aef 2363
cec736d2 2364 int r;
cbdca852
LP
2365 bool step_back = false;
2366 Object *o;
cec736d2
LP
2367
2368 assert(f);
de190aef 2369 assert(test_object);
cec736d2 2370
de190aef
LP
2371 if (n <= 0)
2372 return 0;
cec736d2 2373
de190aef
LP
2374 /* This bisects the array in object 'first', but first checks
2375 * an extra */
de190aef
LP
2376 r = test_object(f, extra, needle);
2377 if (r < 0)
2378 return r;
a536e261
LP
2379
2380 if (r == TEST_FOUND)
2381 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2382
cbdca852
LP
2383 /* if we are looking with DIRECTION_UP then we need to first
2384 see if in the actual array there is a matching entry, and
2385 return the last one of that. But if there isn't any we need
2386 to return this one. Hence remember this, and return it
2387 below. */
2388 if (r == TEST_LEFT)
2389 step_back = direction == DIRECTION_UP;
de190aef 2390
cbdca852
LP
2391 if (r == TEST_RIGHT) {
2392 if (direction == DIRECTION_DOWN)
2393 goto found;
2394 else
2395 return 0;
a536e261 2396 }
cec736d2 2397
de190aef
LP
2398 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
2399
cbdca852
LP
2400 if (r == 0 && step_back)
2401 goto found;
2402
ecf68b1d 2403 if (r > 0 && idx)
313cefa1 2404 (*idx)++;
de190aef
LP
2405
2406 return r;
cbdca852
LP
2407
2408found:
2409 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
2410 if (r < 0)
2411 return r;
2412
2413 if (ret)
2414 *ret = o;
2415
2416 if (offset)
2417 *offset = extra;
2418
2419 if (idx)
2420 *idx = 0;
2421
2422 return 1;
2423}
2424
44a6b1b6 2425_pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
cbdca852
LP
2426 assert(f);
2427 assert(p > 0);
2428
2429 if (p == needle)
2430 return TEST_FOUND;
2431 else if (p < needle)
2432 return TEST_LEFT;
2433 else
2434 return TEST_RIGHT;
2435}
2436
de190aef
LP
2437static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
2438 Object *o;
2439 int r;
2440
2441 assert(f);
2442 assert(p > 0);
2443
2444 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
cec736d2
LP
2445 if (r < 0)
2446 return r;
2447
de190aef
LP
2448 if (le64toh(o->entry.seqnum) == needle)
2449 return TEST_FOUND;
2450 else if (le64toh(o->entry.seqnum) < needle)
2451 return TEST_LEFT;
2452 else
2453 return TEST_RIGHT;
2454}
cec736d2 2455
de190aef
LP
2456int journal_file_move_to_entry_by_seqnum(
2457 JournalFile *f,
2458 uint64_t seqnum,
2459 direction_t direction,
2460 Object **ret,
2461 uint64_t *offset) {
c88cc6af
VC
2462 assert(f);
2463 assert(f->header);
de190aef
LP
2464
2465 return generic_array_bisect(f,
2466 le64toh(f->header->entry_array_offset),
2467 le64toh(f->header->n_entries),
2468 seqnum,
2469 test_object_seqnum,
2470 direction,
2471 ret, offset, NULL);
2472}
cec736d2 2473
de190aef
LP
2474static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
2475 Object *o;
2476 int r;
2477
2478 assert(f);
2479 assert(p > 0);
2480
2481 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2482 if (r < 0)
2483 return r;
2484
2485 if (le64toh(o->entry.realtime) == needle)
2486 return TEST_FOUND;
2487 else if (le64toh(o->entry.realtime) < needle)
2488 return TEST_LEFT;
2489 else
2490 return TEST_RIGHT;
cec736d2
LP
2491}
2492
de190aef
LP
2493int journal_file_move_to_entry_by_realtime(
2494 JournalFile *f,
2495 uint64_t realtime,
2496 direction_t direction,
2497 Object **ret,
2498 uint64_t *offset) {
c88cc6af
VC
2499 assert(f);
2500 assert(f->header);
de190aef
LP
2501
2502 return generic_array_bisect(f,
2503 le64toh(f->header->entry_array_offset),
2504 le64toh(f->header->n_entries),
2505 realtime,
2506 test_object_realtime,
2507 direction,
2508 ret, offset, NULL);
2509}
2510
2511static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
2512 Object *o;
2513 int r;
2514
2515 assert(f);
2516 assert(p > 0);
2517
2518 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2519 if (r < 0)
2520 return r;
2521
2522 if (le64toh(o->entry.monotonic) == needle)
2523 return TEST_FOUND;
2524 else if (le64toh(o->entry.monotonic) < needle)
2525 return TEST_LEFT;
2526 else
2527 return TEST_RIGHT;
2528}
2529
2a560338 2530static int find_data_object_by_boot_id(
47838ab3
ZJS
2531 JournalFile *f,
2532 sd_id128_t boot_id,
2533 Object **o,
2534 uint64_t *b) {
2a560338 2535
fbd0b64f 2536 char t[STRLEN("_BOOT_ID=") + 32 + 1] = "_BOOT_ID=";
47838ab3
ZJS
2537
2538 sd_id128_to_string(boot_id, t + 9);
2539 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2540}
2541
de190aef
LP
2542int journal_file_move_to_entry_by_monotonic(
2543 JournalFile *f,
2544 sd_id128_t boot_id,
2545 uint64_t monotonic,
2546 direction_t direction,
2547 Object **ret,
2548 uint64_t *offset) {
2549
de190aef
LP
2550 Object *o;
2551 int r;
2552
cbdca852 2553 assert(f);
de190aef 2554
47838ab3 2555 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
de190aef
LP
2556 if (r < 0)
2557 return r;
cbdca852 2558 if (r == 0)
de190aef
LP
2559 return -ENOENT;
2560
2561 return generic_array_bisect_plus_one(f,
2562 le64toh(o->data.entry_offset),
2563 le64toh(o->data.entry_array_offset),
2564 le64toh(o->data.n_entries),
2565 monotonic,
2566 test_object_monotonic,
2567 direction,
2568 ret, offset, NULL);
2569}
2570
1fc605b0 2571void journal_file_reset_location(JournalFile *f) {
6573ef05 2572 f->location_type = LOCATION_HEAD;
1fc605b0 2573 f->current_offset = 0;
6573ef05
MS
2574 f->current_seqnum = 0;
2575 f->current_realtime = 0;
2576 f->current_monotonic = 0;
2577 zero(f->current_boot_id);
2578 f->current_xor_hash = 0;
2579}
2580
950c07d4 2581void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
6573ef05
MS
2582 f->location_type = LOCATION_SEEK;
2583 f->current_offset = offset;
2584 f->current_seqnum = le64toh(o->entry.seqnum);
2585 f->current_realtime = le64toh(o->entry.realtime);
2586 f->current_monotonic = le64toh(o->entry.monotonic);
2587 f->current_boot_id = o->entry.boot_id;
2588 f->current_xor_hash = le64toh(o->entry.xor_hash);
1fc605b0
MS
2589}
2590
d8ae66d7 2591int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
90c88092
YW
2592 int r;
2593
d8ae66d7 2594 assert(af);
c88cc6af 2595 assert(af->header);
d8ae66d7 2596 assert(bf);
c88cc6af 2597 assert(bf->header);
d8ae66d7
MS
2598 assert(af->location_type == LOCATION_SEEK);
2599 assert(bf->location_type == LOCATION_SEEK);
2600
2601 /* If contents and timestamps match, these entries are
2602 * identical, even if the seqnum does not match */
2603 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2604 af->current_monotonic == bf->current_monotonic &&
2605 af->current_realtime == bf->current_realtime &&
2606 af->current_xor_hash == bf->current_xor_hash)
2607 return 0;
2608
2609 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2610
2611 /* If this is from the same seqnum source, compare
2612 * seqnums */
90c88092
YW
2613 r = CMP(af->current_seqnum, bf->current_seqnum);
2614 if (r != 0)
2615 return r;
d8ae66d7
MS
2616
2617 /* Wow! This is weird, different data but the same
2618 * seqnums? Something is borked, but let's make the
2619 * best of it and compare by time. */
2620 }
2621
2622 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2623
2624 /* If the boot id matches, compare monotonic time */
90c88092
YW
2625 r = CMP(af->current_monotonic, bf->current_monotonic);
2626 if (r != 0)
2627 return r;
d8ae66d7
MS
2628 }
2629
2630 /* Otherwise, compare UTC time */
90c88092
YW
2631 r = CMP(af->current_realtime, bf->current_realtime);
2632 if (r != 0)
2633 return r;
d8ae66d7
MS
2634
2635 /* Finally, compare by contents */
6dd91b36 2636 return CMP(af->current_xor_hash, bf->current_xor_hash);
d8ae66d7
MS
2637}
2638
aa598ba5
LP
2639static int bump_array_index(uint64_t *i, direction_t direction, uint64_t n) {
2640
2641 /* Increase or decrease the specified index, in the right direction. */
2642
2643 if (direction == DIRECTION_DOWN) {
2644 if (*i >= n - 1)
2645 return 0;
2646
2647 (*i) ++;
2648 } else {
2649 if (*i <= 0)
2650 return 0;
2651
2652 (*i) --;
2653 }
2654
2655 return 1;
2656}
2657
b6da4ed0
LP
2658static bool check_properly_ordered(uint64_t new_offset, uint64_t old_offset, direction_t direction) {
2659
2660 /* Consider it an error if any of the two offsets is uninitialized */
2661 if (old_offset == 0 || new_offset == 0)
2662 return false;
2663
2664 /* If we go down, the new offset must be larger than the old one. */
2665 return direction == DIRECTION_DOWN ?
2666 new_offset > old_offset :
2667 new_offset < old_offset;
2668}
2669
de190aef
LP
2670int journal_file_next_entry(
2671 JournalFile *f,
f534928a 2672 uint64_t p,
de190aef
LP
2673 direction_t direction,
2674 Object **ret, uint64_t *offset) {
2675
fb099c8d 2676 uint64_t i, n, ofs;
cec736d2
LP
2677 int r;
2678
2679 assert(f);
c88cc6af 2680 assert(f->header);
de190aef
LP
2681
2682 n = le64toh(f->header->n_entries);
2683 if (n <= 0)
2684 return 0;
cec736d2 2685
f534928a 2686 if (p == 0)
de190aef 2687 i = direction == DIRECTION_DOWN ? 0 : n - 1;
cec736d2 2688 else {
de190aef
LP
2689 r = generic_array_bisect(f,
2690 le64toh(f->header->entry_array_offset),
2691 le64toh(f->header->n_entries),
2692 p,
2693 test_object_offset,
2694 DIRECTION_DOWN,
2695 NULL, NULL,
2696 &i);
2697 if (r <= 0)
2698 return r;
2699
aa598ba5
LP
2700 r = bump_array_index(&i, direction, n);
2701 if (r <= 0)
2702 return r;
cec736d2
LP
2703 }
2704
de190aef 2705 /* And jump to it */
989793d3
LP
2706 for (;;) {
2707 r = generic_array_get(f,
2708 le64toh(f->header->entry_array_offset),
2709 i,
2710 ret, &ofs);
2711 if (r > 0)
2712 break;
2713 if (r != -EBADMSG)
2714 return r;
2715
2716 /* OK, so this entry is borked. Most likely some entry didn't get synced to disk properly, let's see if
2717 * the next one might work for us instead. */
2718 log_debug_errno(r, "Entry item %" PRIu64 " is bad, skipping over it.", i);
2719
2720 r = bump_array_index(&i, direction, n);
2721 if (r <= 0)
2722 return r;
caeab8f6 2723 }
fb099c8d 2724
b6da4ed0 2725 /* Ensure our array is properly ordered. */
baaa35ad
ZJS
2726 if (p > 0 && !check_properly_ordered(ofs, p, direction))
2727 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
2728 "%s: entry array not properly ordered at entry %" PRIu64,
2729 f->path, i);
fb099c8d
ZJS
2730
2731 if (offset)
2732 *offset = ofs;
2733
2734 return 1;
de190aef 2735}
cec736d2 2736
de190aef
LP
2737int journal_file_next_entry_for_data(
2738 JournalFile *f,
2739 Object *o, uint64_t p,
2740 uint64_t data_offset,
2741 direction_t direction,
2742 Object **ret, uint64_t *offset) {
2743
ded5034e 2744 uint64_t i, n, ofs;
de190aef 2745 Object *d;
989793d3 2746 int r;
cec736d2
LP
2747
2748 assert(f);
de190aef 2749 assert(p > 0 || !o);
cec736d2 2750
de190aef 2751 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
466ccd92 2752 if (r < 0)
de190aef 2753 return r;
cec736d2 2754
de190aef
LP
2755 n = le64toh(d->data.n_entries);
2756 if (n <= 0)
2757 return n;
cec736d2 2758
de190aef
LP
2759 if (!o)
2760 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2761 else {
2762 if (o->object.type != OBJECT_ENTRY)
2763 return -EINVAL;
cec736d2 2764
de190aef
LP
2765 r = generic_array_bisect_plus_one(f,
2766 le64toh(d->data.entry_offset),
2767 le64toh(d->data.entry_array_offset),
2768 le64toh(d->data.n_entries),
2769 p,
2770 test_object_offset,
2771 DIRECTION_DOWN,
2772 NULL, NULL,
2773 &i);
2774
2775 if (r <= 0)
cec736d2
LP
2776 return r;
2777
aa598ba5
LP
2778 r = bump_array_index(&i, direction, n);
2779 if (r <= 0)
2780 return r;
de190aef 2781 }
cec736d2 2782
989793d3
LP
2783 for (;;) {
2784 r = generic_array_get_plus_one(f,
2785 le64toh(d->data.entry_offset),
2786 le64toh(d->data.entry_array_offset),
2787 i,
2788 ret, &ofs);
2789 if (r > 0)
2790 break;
2791 if (r != -EBADMSG)
2792 return r;
2793
2794 log_debug_errno(r, "Data entry item %" PRIu64 " is bad, skipping over it.", i);
2795
2796 r = bump_array_index(&i, direction, n);
2797 if (r <= 0)
2798 return r;
2799 }
ded5034e
LP
2800
2801 /* Ensure our array is properly ordered. */
baaa35ad
ZJS
2802 if (p > 0 && check_properly_ordered(ofs, p, direction))
2803 return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG),
2804 "%s data entry array not properly ordered at entry %" PRIu64,
2805 f->path, i);
ded5034e
LP
2806
2807 if (offset)
2808 *offset = ofs;
2809
2810 return 1;
de190aef 2811}
cec736d2 2812
cbdca852
LP
2813int journal_file_move_to_entry_by_offset_for_data(
2814 JournalFile *f,
2815 uint64_t data_offset,
2816 uint64_t p,
2817 direction_t direction,
2818 Object **ret, uint64_t *offset) {
2819
2820 int r;
2821 Object *d;
2822
2823 assert(f);
2824
2825 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2826 if (r < 0)
2827 return r;
2828
2829 return generic_array_bisect_plus_one(f,
2830 le64toh(d->data.entry_offset),
2831 le64toh(d->data.entry_array_offset),
2832 le64toh(d->data.n_entries),
2833 p,
2834 test_object_offset,
2835 direction,
2836 ret, offset, NULL);
2837}
2838
2839int journal_file_move_to_entry_by_monotonic_for_data(
2840 JournalFile *f,
2841 uint64_t data_offset,
2842 sd_id128_t boot_id,
2843 uint64_t monotonic,
2844 direction_t direction,
2845 Object **ret, uint64_t *offset) {
2846
cbdca852
LP
2847 Object *o, *d;
2848 int r;
2849 uint64_t b, z;
2850
2851 assert(f);
2852
2853 /* First, seek by time */
47838ab3 2854 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
cbdca852
LP
2855 if (r < 0)
2856 return r;
2857 if (r == 0)
2858 return -ENOENT;
2859
2860 r = generic_array_bisect_plus_one(f,
2861 le64toh(o->data.entry_offset),
2862 le64toh(o->data.entry_array_offset),
2863 le64toh(o->data.n_entries),
2864 monotonic,
2865 test_object_monotonic,
2866 direction,
2867 NULL, &z, NULL);
2868 if (r <= 0)
2869 return r;
2870
2871 /* And now, continue seeking until we find an entry that
2872 * exists in both bisection arrays */
2873
2874 for (;;) {
2875 Object *qo;
2876 uint64_t p, q;
2877
2878 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2879 if (r < 0)
2880 return r;
2881
2882 r = generic_array_bisect_plus_one(f,
2883 le64toh(d->data.entry_offset),
2884 le64toh(d->data.entry_array_offset),
2885 le64toh(d->data.n_entries),
2886 z,
2887 test_object_offset,
2888 direction,
2889 NULL, &p, NULL);
2890 if (r <= 0)
2891 return r;
2892
2893 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2894 if (r < 0)
2895 return r;
2896
2897 r = generic_array_bisect_plus_one(f,
2898 le64toh(o->data.entry_offset),
2899 le64toh(o->data.entry_array_offset),
2900 le64toh(o->data.n_entries),
2901 p,
2902 test_object_offset,
2903 direction,
2904 &qo, &q, NULL);
2905
2906 if (r <= 0)
2907 return r;
2908
2909 if (p == q) {
2910 if (ret)
2911 *ret = qo;
2912 if (offset)
2913 *offset = q;
2914
2915 return 1;
2916 }
2917
2918 z = q;
2919 }
cbdca852
LP
2920}
2921
de190aef
LP
2922int journal_file_move_to_entry_by_seqnum_for_data(
2923 JournalFile *f,
2924 uint64_t data_offset,
2925 uint64_t seqnum,
2926 direction_t direction,
2927 Object **ret, uint64_t *offset) {
cec736d2 2928
de190aef
LP
2929 Object *d;
2930 int r;
cec736d2 2931
91a31dde
LP
2932 assert(f);
2933
de190aef 2934 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2935 if (r < 0)
de190aef 2936 return r;
cec736d2 2937
de190aef
LP
2938 return generic_array_bisect_plus_one(f,
2939 le64toh(d->data.entry_offset),
2940 le64toh(d->data.entry_array_offset),
2941 le64toh(d->data.n_entries),
2942 seqnum,
2943 test_object_seqnum,
2944 direction,
2945 ret, offset, NULL);
2946}
cec736d2 2947
de190aef
LP
2948int journal_file_move_to_entry_by_realtime_for_data(
2949 JournalFile *f,
2950 uint64_t data_offset,
2951 uint64_t realtime,
2952 direction_t direction,
2953 Object **ret, uint64_t *offset) {
2954
2955 Object *d;
2956 int r;
2957
91a31dde
LP
2958 assert(f);
2959
de190aef 2960 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2961 if (r < 0)
de190aef
LP
2962 return r;
2963
2964 return generic_array_bisect_plus_one(f,
2965 le64toh(d->data.entry_offset),
2966 le64toh(d->data.entry_array_offset),
2967 le64toh(d->data.n_entries),
2968 realtime,
2969 test_object_realtime,
2970 direction,
2971 ret, offset, NULL);
cec736d2
LP
2972}
2973
0284adc6 2974void journal_file_dump(JournalFile *f) {
7560fffc 2975 Object *o;
7560fffc 2976 int r;
0284adc6 2977 uint64_t p;
7560fffc
LP
2978
2979 assert(f);
c88cc6af 2980 assert(f->header);
7560fffc 2981
0284adc6 2982 journal_file_print_header(f);
7560fffc 2983
0284adc6
LP
2984 p = le64toh(f->header->header_size);
2985 while (p != 0) {
d05089d8 2986 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
0284adc6
LP
2987 if (r < 0)
2988 goto fail;
7560fffc 2989
0284adc6 2990 switch (o->object.type) {
d98cc1f2 2991
0284adc6
LP
2992 case OBJECT_UNUSED:
2993 printf("Type: OBJECT_UNUSED\n");
2994 break;
d98cc1f2 2995
0284adc6
LP
2996 case OBJECT_DATA:
2997 printf("Type: OBJECT_DATA\n");
2998 break;
7560fffc 2999
3c1668da
LP
3000 case OBJECT_FIELD:
3001 printf("Type: OBJECT_FIELD\n");
3002 break;
3003
0284adc6 3004 case OBJECT_ENTRY:
507f22bd
ZJS
3005 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
3006 le64toh(o->entry.seqnum),
3007 le64toh(o->entry.monotonic),
3008 le64toh(o->entry.realtime));
0284adc6 3009 break;
7560fffc 3010
0284adc6
LP
3011 case OBJECT_FIELD_HASH_TABLE:
3012 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
3013 break;
7560fffc 3014
0284adc6
LP
3015 case OBJECT_DATA_HASH_TABLE:
3016 printf("Type: OBJECT_DATA_HASH_TABLE\n");
3017 break;
7560fffc 3018
0284adc6
LP
3019 case OBJECT_ENTRY_ARRAY:
3020 printf("Type: OBJECT_ENTRY_ARRAY\n");
3021 break;
7560fffc 3022
0284adc6 3023 case OBJECT_TAG:
507f22bd
ZJS
3024 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
3025 le64toh(o->tag.seqnum),
3026 le64toh(o->tag.epoch));
0284adc6 3027 break;
3c1668da
LP
3028
3029 default:
8facc349 3030 printf("Type: unknown (%i)\n", o->object.type);
3c1668da 3031 break;
0284adc6 3032 }
7560fffc 3033
d89c8fdf
ZJS
3034 if (o->object.flags & OBJECT_COMPRESSION_MASK)
3035 printf("Flags: %s\n",
3036 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
7560fffc 3037
0284adc6
LP
3038 if (p == le64toh(f->header->tail_object_offset))
3039 p = 0;
3040 else
3041 p = p + ALIGN64(le64toh(o->object.size));
3042 }
7560fffc 3043
0284adc6
LP
3044 return;
3045fail:
3046 log_error("File corrupt");
7560fffc
LP
3047}
3048
718fe4b1
ZJS
3049static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
3050 const char *x;
3051
3052 x = format_timestamp(buf, l, t);
3053 if (x)
3054 return x;
3055 return " --- ";
3056}
3057
0284adc6 3058void journal_file_print_header(JournalFile *f) {
2765b7bb 3059 char a[33], b[33], c[33], d[33];
ed375beb 3060 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
a1a03e30
LP
3061 struct stat st;
3062 char bytes[FORMAT_BYTES_MAX];
7560fffc
LP
3063
3064 assert(f);
c88cc6af 3065 assert(f->header);
7560fffc 3066
2c54acb1 3067 printf("File path: %s\n"
0284adc6
LP
3068 "File ID: %s\n"
3069 "Machine ID: %s\n"
3070 "Boot ID: %s\n"
2c54acb1 3071 "Sequential number ID: %s\n"
0284adc6 3072 "State: %s\n"
2c54acb1
TN
3073 "Compatible flags:%s%s\n"
3074 "Incompatible flags:%s%s%s\n"
507f22bd
ZJS
3075 "Header size: %"PRIu64"\n"
3076 "Arena size: %"PRIu64"\n"
2c54acb1
TN
3077 "Data hash table size: %"PRIu64"\n"
3078 "Field hash table size: %"PRIu64"\n"
3079 "Rotate suggested: %s\n"
3080 "Head sequential number: %"PRIu64" (%"PRIx64")\n"
3081 "Tail sequential number: %"PRIu64" (%"PRIx64")\n"
3082 "Head realtime timestamp: %s (%"PRIx64")\n"
3083 "Tail realtime timestamp: %s (%"PRIx64")\n"
3084 "Tail monotonic timestamp: %s (%"PRIx64")\n"
507f22bd 3085 "Objects: %"PRIu64"\n"
2c54acb1 3086 "Entry objects: %"PRIu64"\n",
0284adc6
LP
3087 f->path,
3088 sd_id128_to_string(f->header->file_id, a),
3089 sd_id128_to_string(f->header->machine_id, b),
3090 sd_id128_to_string(f->header->boot_id, c),
2765b7bb 3091 sd_id128_to_string(f->header->seqnum_id, d),
3223f44f
LP
3092 f->header->state == STATE_OFFLINE ? "OFFLINE" :
3093 f->header->state == STATE_ONLINE ? "ONLINE" :
3094 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
8088cbd3 3095 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
d89c8fdf
ZJS
3096 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
3097 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
3098 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
3099 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
507f22bd
ZJS
3100 le64toh(f->header->header_size),
3101 le64toh(f->header->arena_size),
3102 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3103 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
fb0951b0 3104 yes_no(journal_file_rotate_suggested(f, 0)),
0808b92f
LP
3105 le64toh(f->header->head_entry_seqnum), le64toh(f->header->head_entry_seqnum),
3106 le64toh(f->header->tail_entry_seqnum), le64toh(f->header->tail_entry_seqnum),
3107 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)), le64toh(f->header->head_entry_realtime),
3108 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)), le64toh(f->header->tail_entry_realtime),
3109 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC), le64toh(f->header->tail_entry_monotonic),
507f22bd
ZJS
3110 le64toh(f->header->n_objects),
3111 le64toh(f->header->n_entries));
7560fffc 3112
0284adc6 3113 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
2c54acb1
TN
3114 printf("Data objects: %"PRIu64"\n"
3115 "Data hash table fill: %.1f%%\n",
507f22bd 3116 le64toh(f->header->n_data),
0284adc6 3117 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
7560fffc 3118
0284adc6 3119 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
2c54acb1
TN
3120 printf("Field objects: %"PRIu64"\n"
3121 "Field hash table fill: %.1f%%\n",
507f22bd 3122 le64toh(f->header->n_fields),
0284adc6 3123 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3223f44f
LP
3124
3125 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
2c54acb1 3126 printf("Tag objects: %"PRIu64"\n",
507f22bd 3127 le64toh(f->header->n_tags));
3223f44f 3128 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
2c54acb1 3129 printf("Entry array objects: %"PRIu64"\n",
507f22bd 3130 le64toh(f->header->n_entry_arrays));
a1a03e30
LP
3131
3132 if (fstat(f->fd, &st) >= 0)
59f448cf 3133 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
7560fffc
LP
3134}
3135
fc68c929
LP
3136static int journal_file_warn_btrfs(JournalFile *f) {
3137 unsigned attrs;
3138 int r;
3139
3140 assert(f);
3141
3142 /* Before we write anything, check if the COW logic is turned
3143 * off on btrfs. Given our write pattern that is quite
3144 * unfriendly to COW file systems this should greatly improve
3145 * performance on COW file systems, such as btrfs, at the
3146 * expense of data integrity features (which shouldn't be too
3147 * bad, given that we do our own checksumming). */
3148
3149 r = btrfs_is_filesystem(f->fd);
3150 if (r < 0)
3151 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
3152 if (!r)
3153 return 0;
3154
3155 r = read_attr_fd(f->fd, &attrs);
3156 if (r < 0)
3157 return log_warning_errno(r, "Failed to read file attributes: %m");
3158
3159 if (attrs & FS_NOCOW_FL) {
3160 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
3161 return 0;
3162 }
3163
3164 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
3165 "This is likely to slow down journal access substantially, please consider turning "
3166 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
3167
3168 return 1;
3169}
3170
0284adc6 3171int journal_file_open(
5d1ce257 3172 int fd,
0284adc6
LP
3173 const char *fname,
3174 int flags,
3175 mode_t mode,
3176 bool compress,
57850536 3177 uint64_t compress_threshold_bytes,
baed47c3 3178 bool seal,
0284adc6
LP
3179 JournalMetrics *metrics,
3180 MMapCache *mmap_cache,
b58c888f 3181 Set *deferred_closes,
0284adc6
LP
3182 JournalFile *template,
3183 JournalFile **ret) {
7560fffc 3184
fa6ac760 3185 bool newly_created = false;
0284adc6 3186 JournalFile *f;
fa6ac760 3187 void *h;
0284adc6 3188 int r;
7560fffc 3189
0559d3a5 3190 assert(ret);
5d1ce257 3191 assert(fd >= 0 || fname);
7560fffc 3192
ec2ce0c5 3193 if (!IN_SET((flags & O_ACCMODE), O_RDONLY, O_RDWR))
0284adc6 3194 return -EINVAL;
7560fffc 3195
6eda13d3
LP
3196 if (fname && (flags & O_CREAT) && !endswith(fname, ".journal"))
3197 return -EINVAL;
7560fffc 3198
971b52c4 3199 f = new(JournalFile, 1);
0284adc6
LP
3200 if (!f)
3201 return -ENOMEM;
7560fffc 3202
971b52c4
LP
3203 *f = (JournalFile) {
3204 .fd = fd,
3205 .mode = mode,
3206
3207 .flags = flags,
3208 .prot = prot_from_flags(flags),
3209 .writable = (flags & O_ACCMODE) != O_RDONLY,
7560fffc 3210
349cc4a5 3211#if HAVE_LZ4
971b52c4 3212 .compress_lz4 = compress,
349cc4a5 3213#elif HAVE_XZ
971b52c4 3214 .compress_xz = compress,
48b61739 3215#endif
971b52c4
LP
3216 .compress_threshold_bytes = compress_threshold_bytes == (uint64_t) -1 ?
3217 DEFAULT_COMPRESS_THRESHOLD :
3218 MAX(MIN_COMPRESS_THRESHOLD, compress_threshold_bytes),
349cc4a5 3219#if HAVE_GCRYPT
971b52c4 3220 .seal = seal,
49a32d43 3221#endif
971b52c4 3222 };
7560fffc 3223
170a434c
ZJS
3224 if (DEBUG_LOGGING) {
3225 static int last_seal = -1, last_compress = -1;
3226 static uint64_t last_bytes = UINT64_MAX;
3227 char bytes[FORMAT_BYTES_MAX];
3228
3229 if (last_seal != f->seal ||
3230 last_compress != JOURNAL_FILE_COMPRESS(f) ||
3231 last_bytes != f->compress_threshold_bytes) {
3232
3233 log_debug("Journal effective settings seal=%s compress=%s compress_threshold_bytes=%s",
3234 yes_no(f->seal), yes_no(JOURNAL_FILE_COMPRESS(f)),
3235 format_bytes(bytes, sizeof bytes, f->compress_threshold_bytes));
3236 last_seal = f->seal;
3237 last_compress = JOURNAL_FILE_COMPRESS(f);
3238 last_bytes = f->compress_threshold_bytes;
3239 }
3240 }
57850536 3241
0284adc6
LP
3242 if (mmap_cache)
3243 f->mmap = mmap_cache_ref(mmap_cache);
3244 else {
84168d80 3245 f->mmap = mmap_cache_new();
0284adc6
LP
3246 if (!f->mmap) {
3247 r = -ENOMEM;
3248 goto fail;
3249 }
3250 }
7560fffc 3251
7645c77b 3252 if (fname) {
5d1ce257 3253 f->path = strdup(fname);
7645c77b
ZJS
3254 if (!f->path) {
3255 r = -ENOMEM;
3256 goto fail;
3257 }
3258 } else {
817b1c5b
LP
3259 assert(fd >= 0);
3260
7645c77b
ZJS
3261 /* If we don't know the path, fill in something explanatory and vaguely useful */
3262 if (asprintf(&f->path, "/proc/self/%i", fd) < 0) {
3263 r = -ENOMEM;
3264 goto fail;
3265 }
0284adc6 3266 }
7560fffc 3267
4743015d 3268 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
a4bcff5b
LP
3269 if (!f->chain_cache) {
3270 r = -ENOMEM;
3271 goto fail;
3272 }
3273
0284adc6 3274 if (f->fd < 0) {
817b1c5b
LP
3275 /* We pass O_NONBLOCK here, so that in case somebody pointed us to some character device node or FIFO
3276 * or so, we likely fail quickly than block for long. For regular files O_NONBLOCK has no effect, hence
3277 * it doesn't hurt in that case. */
3278
3279 f->fd = open(f->path, f->flags|O_CLOEXEC|O_NONBLOCK, f->mode);
5d1ce257
LP
3280 if (f->fd < 0) {
3281 r = -errno;
3282 goto fail;
3283 }
3284
3285 /* fds we opened here by us should also be closed by us. */
3286 f->close_fd = true;
817b1c5b
LP
3287
3288 r = fd_nonblock(f->fd, false);
3289 if (r < 0)
3290 goto fail;
7560fffc 3291 }
7560fffc 3292
be7cdd8e
VC
3293 f->cache_fd = mmap_cache_add_fd(f->mmap, f->fd);
3294 if (!f->cache_fd) {
3295 r = -ENOMEM;
3296 goto fail;
3297 }
3298
2678031a
LP
3299 r = journal_file_fstat(f);
3300 if (r < 0)
0284adc6 3301 goto fail;
7560fffc 3302
0284adc6 3303 if (f->last_stat.st_size == 0 && f->writable) {
11689d2a 3304
fc68c929 3305 (void) journal_file_warn_btrfs(f);
11689d2a 3306
4c2e1b39
LP
3307 /* Let's attach the creation time to the journal file, so that the vacuuming code knows the age of this
3308 * file even if the file might end up corrupted one day... Ideally we'd just use the creation time many
3309 * file systems maintain for each file, but the API to query this is very new, hence let's emulate this
3310 * via extended attributes. If extended attributes are not supported we'll just skip this, and rely
3311 * solely on mtime/atime/ctime of the file. */
3312 (void) fd_setcrtime(f->fd, 0);
7560fffc 3313
349cc4a5 3314#if HAVE_GCRYPT
0284adc6 3315 /* Try to load the FSPRG state, and if we can't, then
baed47c3 3316 * just don't do sealing */
49a32d43
LP
3317 if (f->seal) {
3318 r = journal_file_fss_load(f);
3319 if (r < 0)
3320 f->seal = false;
3321 }
feb12d3e 3322#endif
7560fffc 3323
0284adc6
LP
3324 r = journal_file_init_header(f, template);
3325 if (r < 0)
3326 goto fail;
7560fffc 3327
2678031a
LP
3328 r = journal_file_fstat(f);
3329 if (r < 0)
0284adc6 3330 goto fail;
fb0951b0
LP
3331
3332 newly_created = true;
0284adc6 3333 }
7560fffc 3334
0284adc6 3335 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
cfb571f3 3336 r = -ENODATA;
0284adc6
LP
3337 goto fail;
3338 }
7560fffc 3339
b42549ad 3340 r = mmap_cache_get(f->mmap, f->cache_fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h, NULL);
5087825e
LP
3341 if (r == -EINVAL) {
3342 /* Some file systems (jffs2 or p9fs) don't support mmap() properly (or only read-only
3343 * mmap()), and return EINVAL in that case. Let's propagate that as a more recognizable error
3344 * code. */
3345 r = -EAFNOSUPPORT;
3346 goto fail;
3347 }
977eaa1e 3348 if (r < 0)
0284adc6 3349 goto fail;
7560fffc 3350
fa6ac760
LP
3351 f->header = h;
3352
0284adc6 3353 if (!newly_created) {
f9168190 3354 set_clear_with_destructor(deferred_closes, journal_file_close);
b58c888f 3355
0284adc6
LP
3356 r = journal_file_verify_header(f);
3357 if (r < 0)
3358 goto fail;
3359 }
7560fffc 3360
349cc4a5 3361#if HAVE_GCRYPT
0284adc6 3362 if (!newly_created && f->writable) {
baed47c3 3363 r = journal_file_fss_load(f);
0284adc6
LP
3364 if (r < 0)
3365 goto fail;
3366 }
feb12d3e 3367#endif
cec736d2
LP
3368
3369 if (f->writable) {
4a92baf3
LP
3370 if (metrics) {
3371 journal_default_metrics(metrics, f->fd);
3372 f->metrics = *metrics;
3373 } else if (template)
3374 f->metrics = template->metrics;
3375
cec736d2
LP
3376 r = journal_file_refresh_header(f);
3377 if (r < 0)
3378 goto fail;
3379 }
3380
349cc4a5 3381#if HAVE_GCRYPT
baed47c3 3382 r = journal_file_hmac_setup(f);
14d10188
LP
3383 if (r < 0)
3384 goto fail;
feb12d3e 3385#endif
14d10188 3386
cec736d2 3387 if (newly_created) {
de190aef 3388 r = journal_file_setup_field_hash_table(f);
cec736d2
LP
3389 if (r < 0)
3390 goto fail;
3391
de190aef 3392 r = journal_file_setup_data_hash_table(f);
cec736d2
LP
3393 if (r < 0)
3394 goto fail;
7560fffc 3395
349cc4a5 3396#if HAVE_GCRYPT
7560fffc
LP
3397 r = journal_file_append_first_tag(f);
3398 if (r < 0)
3399 goto fail;
feb12d3e 3400#endif
cec736d2
LP
3401 }
3402
be7cdd8e 3403 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd)) {
fa6ac760
LP
3404 r = -EIO;
3405 goto fail;
3406 }
3407
7a24f3bf 3408 if (template && template->post_change_timer) {
e167d7fd
LP
3409 r = journal_file_enable_post_change_timer(
3410 f,
3411 sd_event_source_get_event(template->post_change_timer),
3412 template->post_change_timer_period);
7a24f3bf 3413
7a24f3bf
VC
3414 if (r < 0)
3415 goto fail;
3416 }
3417
f8e2f4d6 3418 /* The file is opened now successfully, thus we take possession of any passed in fd. */
5d1ce257
LP
3419 f->close_fd = true;
3420
0559d3a5 3421 *ret = f;
cec736d2
LP
3422 return 0;
3423
3424fail:
be7cdd8e 3425 if (f->cache_fd && mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
3426 r = -EIO;
3427
69a3a6fd 3428 (void) journal_file_close(f);
cec736d2
LP
3429
3430 return r;
3431}
0ac38b70 3432
7a4d21ad 3433int journal_file_archive(JournalFile *f) {
57535f47 3434 _cleanup_free_ char *p = NULL;
0ac38b70
LP
3435
3436 assert(f);
0ac38b70 3437
7a4d21ad 3438 if (!f->writable)
0ac38b70
LP
3439 return -EINVAL;
3440
5d1ce257 3441 /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse
13e785f7 3442 * rotation, since we don't know the actual path, and couldn't rename the file hence. */
7a4d21ad 3443 if (path_startswith(f->path, "/proc/self/fd"))
5d1ce257
LP
3444 return -EINVAL;
3445
7a4d21ad 3446 if (!endswith(f->path, ".journal"))
0ac38b70
LP
3447 return -EINVAL;
3448
7a4d21ad
LP
3449 if (asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
3450 (int) strlen(f->path) - 8, f->path,
3451 SD_ID128_FORMAT_VAL(f->header->seqnum_id),
3452 le64toh(f->header->head_entry_seqnum),
3453 le64toh(f->header->head_entry_realtime)) < 0)
0ac38b70
LP
3454 return -ENOMEM;
3455
7a4d21ad
LP
3456 /* Try to rename the file to the archived version. If the file already was deleted, we'll get ENOENT, let's
3457 * ignore that case. */
3458 if (rename(f->path, p) < 0 && errno != ENOENT)
0ac38b70
LP
3459 return -errno;
3460
1fcefd88 3461 /* Sync the rename to disk */
7a4d21ad
LP
3462 (void) fsync_directory_of_file(f->fd);
3463
3464 /* Set as archive so offlining commits w/state=STATE_ARCHIVED. Previously we would set old_file->header->state
3465 * to STATE_ARCHIVED directly here, but journal_file_set_offline() short-circuits when state != STATE_ONLINE,
3466 * which would result in the rotated journal never getting fsync() called before closing. Now we simply queue
3467 * the archive state by setting an archive bit, leaving the state as STATE_ONLINE so proper offlining
3468 * occurs. */
3469 f->archive = true;
3470
3471 /* Currently, btrfs is not very good with out write patterns and fragments heavily. Let's defrag our journal
3472 * files when we archive them */
3473 f->defrag_on_close = true;
3474
3475 return 0;
3476}
3477
3478JournalFile* journal_initiate_close(
3479 JournalFile *f,
3480 Set *deferred_closes) {
3481
3482 int r;
3483
3484 assert(f);
3485
3486 if (deferred_closes) {
0ac38b70 3487
7a4d21ad
LP
3488 r = set_put(deferred_closes, f);
3489 if (r < 0)
3490 log_debug_errno(r, "Failed to add file to deferred close set, closing immediately.");
3491 else {
3492 (void) journal_file_set_offline(f, false);
3493 return NULL;
3494 }
3495 }
3496
3497 return journal_file_close(f);
3498}
3499
3500int journal_file_rotate(
3501 JournalFile **f,
3502 bool compress,
3503 uint64_t compress_threshold_bytes,
3504 bool seal,
3505 Set *deferred_closes) {
3506
3507 JournalFile *new_file = NULL;
3508 int r;
3509
3510 assert(f);
3511 assert(*f);
3512
3513 r = journal_file_archive(*f);
3514 if (r < 0)
3515 return r;
3516
3517 r = journal_file_open(
3518 -1,
3519 (*f)->path,
3520 (*f)->flags,
3521 (*f)->mode,
3522 compress,
3523 compress_threshold_bytes,
3524 seal,
3525 NULL, /* metrics */
3526 (*f)->mmap,
3527 deferred_closes,
3528 *f, /* template */
3529 &new_file);
3530
3531 journal_initiate_close(*f, deferred_closes);
0ac38b70 3532 *f = new_file;
7a4d21ad 3533
0ac38b70
LP
3534 return r;
3535}
3536
68127658
LP
3537int journal_file_dispose(int dir_fd, const char *fname) {
3538 _cleanup_free_ char *p = NULL;
3539 _cleanup_close_ int fd = -1;
3540
3541 assert(fname);
3542
3543 /* Renames a journal file to *.journal~, i.e. to mark it as corruped or otherwise uncleanly shutdown. Note that
3544 * this is done without looking into the file or changing any of its contents. The idea is that this is called
3545 * whenever something is suspicious and we want to move the file away and make clear that it is not accessed
3546 * for writing anymore. */
3547
3548 if (!endswith(fname, ".journal"))
3549 return -EINVAL;
3550
3551 if (asprintf(&p, "%.*s@%016" PRIx64 "-%016" PRIx64 ".journal~",
3552 (int) strlen(fname) - 8, fname,
3553 now(CLOCK_REALTIME),
3554 random_u64()) < 0)
3555 return -ENOMEM;
3556
3557 if (renameat(dir_fd, fname, dir_fd, p) < 0)
3558 return -errno;
3559
3560 /* btrfs doesn't cope well with our write pattern and fragments heavily. Let's defrag all files we rotate */
3561 fd = openat(dir_fd, p, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW);
3562 if (fd < 0)
3563 log_debug_errno(errno, "Failed to open file for defragmentation/FS_NOCOW_FL, ignoring: %m");
3564 else {
3565 (void) chattr_fd(fd, 0, FS_NOCOW_FL, NULL);
3566 (void) btrfs_defrag_fd(fd);
3567 }
3568
3569 return 0;
3570}
3571
9447a7f1
LP
3572int journal_file_open_reliably(
3573 const char *fname,
3574 int flags,
3575 mode_t mode,
7560fffc 3576 bool compress,
57850536 3577 uint64_t compress_threshold_bytes,
baed47c3 3578 bool seal,
4a92baf3 3579 JournalMetrics *metrics,
27370278 3580 MMapCache *mmap_cache,
b58c888f 3581 Set *deferred_closes,
9447a7f1
LP
3582 JournalFile *template,
3583 JournalFile **ret) {
3584
68127658 3585 int r;
9447a7f1 3586
57850536
AG
3587 r = journal_file_open(-1, fname, flags, mode, compress, compress_threshold_bytes, seal, metrics, mmap_cache,
3588 deferred_closes, template, ret);
288359db 3589 if (!IN_SET(r,
b288cdeb
ZJS
3590 -EBADMSG, /* Corrupted */
3591 -ENODATA, /* Truncated */
3592 -EHOSTDOWN, /* Other machine */
3593 -EPROTONOSUPPORT, /* Incompatible feature */
3594 -EBUSY, /* Unclean shutdown */
3595 -ESHUTDOWN, /* Already archived */
288359db 3596 -EIO, /* IO error, including SIGBUS on mmap */
ae739cc1
LP
3597 -EIDRM, /* File has been deleted */
3598 -ETXTBSY)) /* File is from the future */
9447a7f1
LP
3599 return r;
3600
3601 if ((flags & O_ACCMODE) == O_RDONLY)
3602 return r;
3603
3604 if (!(flags & O_CREAT))
3605 return r;
3606
7560fffc
LP
3607 if (!endswith(fname, ".journal"))
3608 return r;
3609
5c70eab4 3610 /* The file is corrupted. Rotate it away and try it again (but only once) */
65089b82 3611 log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
9447a7f1 3612
68127658
LP
3613 r = journal_file_dispose(AT_FDCWD, fname);
3614 if (r < 0)
3615 return r;
3616
57850536
AG
3617 return journal_file_open(-1, fname, flags, mode, compress, compress_threshold_bytes, seal, metrics, mmap_cache,
3618 deferred_closes, template, ret);
9447a7f1
LP
3619}
3620
5a271b08 3621int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p) {
cf244689
LP
3622 uint64_t i, n;
3623 uint64_t q, xor_hash = 0;
3624 int r;
3625 EntryItem *items;
3626 dual_timestamp ts;
d180c349 3627 const sd_id128_t *boot_id;
cf244689
LP
3628
3629 assert(from);
3630 assert(to);
3631 assert(o);
3632 assert(p);
3633
3634 if (!to->writable)
3635 return -EPERM;
3636
3637 ts.monotonic = le64toh(o->entry.monotonic);
3638 ts.realtime = le64toh(o->entry.realtime);
d180c349 3639 boot_id = &o->entry.boot_id;
cf244689 3640
cf244689 3641 n = journal_file_entry_n_items(o);
4faa7004 3642 /* alloca() can't take 0, hence let's allocate at least one */
cf409d15 3643 items = newa(EntryItem, MAX(1u, n));
cf244689
LP
3644
3645 for (i = 0; i < n; i++) {
4fd052ae
FC
3646 uint64_t l, h;
3647 le64_t le_hash;
cf244689
LP
3648 size_t t;
3649 void *data;
3650 Object *u;
3651
3652 q = le64toh(o->entry.items[i].object_offset);
3653 le_hash = o->entry.items[i].hash;
3654
3655 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
3656 if (r < 0)
3657 return r;
3658
3659 if (le_hash != o->data.hash)
3660 return -EBADMSG;
3661
3662 l = le64toh(o->object.size) - offsetof(Object, data.payload);
3663 t = (size_t) l;
3664
3665 /* We hit the limit on 32bit machines */
3666 if ((uint64_t) t != l)
3667 return -E2BIG;
3668
d89c8fdf 3669 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
349cc4a5 3670#if HAVE_XZ || HAVE_LZ4
a7f7d1bd 3671 size_t rsize = 0;
cf244689 3672
d89c8fdf
ZJS
3673 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
3674 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
3675 if (r < 0)
3676 return r;
cf244689
LP
3677
3678 data = from->compress_buffer;
3679 l = rsize;
3b1a55e1
ZJS
3680#else
3681 return -EPROTONOSUPPORT;
3682#endif
cf244689
LP
3683 } else
3684 data = o->data.payload;
3685
3686 r = journal_file_append_data(to, data, l, &u, &h);
3687 if (r < 0)
3688 return r;
3689
3690 xor_hash ^= le64toh(u->data.hash);
3691 items[i].object_offset = htole64(h);
3692 items[i].hash = u->data.hash;
3693
3694 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
3695 if (r < 0)
3696 return r;
3697 }
3698
d180c349
ZJS
3699 r = journal_file_append_entry_internal(to, &ts, boot_id, xor_hash, items, n,
3700 NULL, NULL, NULL);
fa6ac760 3701
be7cdd8e 3702 if (mmap_cache_got_sigbus(to->mmap, to->cache_fd))
fa6ac760
LP
3703 return -EIO;
3704
3705 return r;
cf244689 3706}
babfc091 3707
8580d1f7
LP
3708void journal_reset_metrics(JournalMetrics *m) {
3709 assert(m);
3710
3711 /* Set everything to "pick automatic values". */
3712
3713 *m = (JournalMetrics) {
3714 .min_use = (uint64_t) -1,
3715 .max_use = (uint64_t) -1,
3716 .min_size = (uint64_t) -1,
3717 .max_size = (uint64_t) -1,
3718 .keep_free = (uint64_t) -1,
3719 .n_max_files = (uint64_t) -1,
3720 };
3721}
3722
babfc091 3723void journal_default_metrics(JournalMetrics *m, int fd) {
8580d1f7 3724 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
babfc091 3725 struct statvfs ss;
6aae0b1a 3726 uint64_t fs_size = 0;
babfc091
LP
3727
3728 assert(m);
3729 assert(fd >= 0);
3730
3731 if (fstatvfs(fd, &ss) >= 0)
3732 fs_size = ss.f_frsize * ss.f_blocks;
6aae0b1a 3733 else
8fc58f1a 3734 log_debug_errno(errno, "Failed to determine disk size: %m");
babfc091
LP
3735
3736 if (m->max_use == (uint64_t) -1) {
3737
6aae0b1a
ZJS
3738 if (fs_size > 0)
3739 m->max_use = CLAMP(PAGE_ALIGN(fs_size / 10), /* 10% of file system size */
3740 MAX_USE_LOWER, MAX_USE_UPPER);
3741 else
3742 m->max_use = MAX_USE_LOWER;
babfc091
LP
3743 } else {
3744 m->max_use = PAGE_ALIGN(m->max_use);
3745
8580d1f7 3746 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
babfc091
LP
3747 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3748 }
3749
6aae0b1a
ZJS
3750 if (m->min_use == (uint64_t) -1) {
3751 if (fs_size > 0)
3752 m->min_use = CLAMP(PAGE_ALIGN(fs_size / 50), /* 2% of file system size */
3753 MIN_USE_LOW, MIN_USE_HIGH);
3754 else
3755 m->min_use = MIN_USE_LOW;
3756 }
8580d1f7
LP
3757
3758 if (m->min_use > m->max_use)
3759 m->min_use = m->max_use;
3760
6aae0b1a
ZJS
3761 if (m->max_size == (uint64_t) -1)
3762 m->max_size = MIN(PAGE_ALIGN(m->max_use / 8), /* 8 chunks */
3763 MAX_SIZE_UPPER);
3764 else
babfc091
LP
3765 m->max_size = PAGE_ALIGN(m->max_size);
3766
8580d1f7
LP
3767 if (m->max_size != 0) {
3768 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3769 m->max_size = JOURNAL_FILE_SIZE_MIN;
babfc091 3770
8580d1f7
LP
3771 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3772 m->max_use = m->max_size*2;
3773 }
babfc091
LP
3774
3775 if (m->min_size == (uint64_t) -1)
3776 m->min_size = JOURNAL_FILE_SIZE_MIN;
6aae0b1a
ZJS
3777 else
3778 m->min_size = CLAMP(PAGE_ALIGN(m->min_size),
3779 JOURNAL_FILE_SIZE_MIN,
3780 m->max_size ?: UINT64_MAX);
babfc091
LP
3781
3782 if (m->keep_free == (uint64_t) -1) {
6aae0b1a
ZJS
3783 if (fs_size > 0)
3784 m->keep_free = MIN(PAGE_ALIGN(fs_size / 20), /* 5% of file system size */
3785 KEEP_FREE_UPPER);
3786 else
babfc091
LP
3787 m->keep_free = DEFAULT_KEEP_FREE;
3788 }
3789
8580d1f7
LP
3790 if (m->n_max_files == (uint64_t) -1)
3791 m->n_max_files = DEFAULT_N_MAX_FILES;
3792
3793 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3794 format_bytes(a, sizeof(a), m->min_use),
3795 format_bytes(b, sizeof(b), m->max_use),
3796 format_bytes(c, sizeof(c), m->max_size),
3797 format_bytes(d, sizeof(d), m->min_size),
3798 format_bytes(e, sizeof(e), m->keep_free),
3799 m->n_max_files);
babfc091 3800}
08984293
LP
3801
3802int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
08984293 3803 assert(f);
c88cc6af 3804 assert(f->header);
08984293
LP
3805 assert(from || to);
3806
3807 if (from) {
162566a4
LP
3808 if (f->header->head_entry_realtime == 0)
3809 return -ENOENT;
08984293 3810
162566a4 3811 *from = le64toh(f->header->head_entry_realtime);
08984293
LP
3812 }
3813
3814 if (to) {
162566a4
LP
3815 if (f->header->tail_entry_realtime == 0)
3816 return -ENOENT;
08984293 3817
162566a4 3818 *to = le64toh(f->header->tail_entry_realtime);
08984293
LP
3819 }
3820
3821 return 1;
3822}
3823
3824int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
08984293
LP
3825 Object *o;
3826 uint64_t p;
3827 int r;
3828
3829 assert(f);
3830 assert(from || to);
3831
47838ab3 3832 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
08984293
LP
3833 if (r <= 0)
3834 return r;
3835
3836 if (le64toh(o->data.n_entries) <= 0)
3837 return 0;
3838
3839 if (from) {
3840 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3841 if (r < 0)
3842 return r;
3843
3844 *from = le64toh(o->entry.monotonic);
3845 }
3846
3847 if (to) {
3848 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3849 if (r < 0)
3850 return r;
3851
3852 r = generic_array_get_plus_one(f,
3853 le64toh(o->data.entry_offset),
3854 le64toh(o->data.entry_array_offset),
3855 le64toh(o->data.n_entries)-1,
3856 &o, NULL);
3857 if (r <= 0)
3858 return r;
3859
3860 *to = le64toh(o->entry.monotonic);
3861 }
3862
3863 return 1;
3864}
dca6219e 3865
fb0951b0 3866bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
dca6219e 3867 assert(f);
c88cc6af 3868 assert(f->header);
dca6219e
LP
3869
3870 /* If we gained new header fields we gained new features,
3871 * hence suggest a rotation */
361f9cbc
LP
3872 if (le64toh(f->header->header_size) < sizeof(Header)) {
3873 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
dca6219e 3874 return true;
361f9cbc 3875 }
dca6219e
LP
3876
3877 /* Let's check if the hash tables grew over a certain fill
3878 * level (75%, borrowing this value from Java's hash table
3879 * implementation), and if so suggest a rotation. To calculate
3880 * the fill level we need the n_data field, which only exists
3881 * in newer versions. */
3882
3883 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
361f9cbc 3884 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3885 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
361f9cbc
LP
3886 f->path,
3887 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3888 le64toh(f->header->n_data),
3889 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3890 (unsigned long long) f->last_stat.st_size,
3891 f->last_stat.st_size / le64toh(f->header->n_data));
dca6219e 3892 return true;
361f9cbc 3893 }
dca6219e
LP
3894
3895 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
361f9cbc 3896 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3897 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
361f9cbc
LP
3898 f->path,
3899 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3900 le64toh(f->header->n_fields),
3901 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
dca6219e 3902 return true;
361f9cbc 3903 }
dca6219e 3904
0598fd4a
LP
3905 /* Are the data objects properly indexed by field objects? */
3906 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3907 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3908 le64toh(f->header->n_data) > 0 &&
3909 le64toh(f->header->n_fields) == 0)
3910 return true;
3911
fb0951b0
LP
3912 if (max_file_usec > 0) {
3913 usec_t t, h;
3914
3915 h = le64toh(f->header->head_entry_realtime);
3916 t = now(CLOCK_REALTIME);
3917
3918 if (h > 0 && t > h + max_file_usec)
3919 return true;
3920 }
3921
dca6219e
LP
3922 return false;
3923}