]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journal-file.c
journal-file: refuse opening non-regular journal files
[thirdparty/systemd.git] / src / journal / journal-file.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
cec736d2
LP
2/***
3 This file is part of systemd.
4
5 Copyright 2011 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
cec736d2
LP
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 15 Lesser General Public License for more details.
cec736d2 16
5430f7f2 17 You should have received a copy of the GNU Lesser General Public License
cec736d2
LP
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19***/
20
cec736d2 21#include <errno.h>
cec736d2 22#include <fcntl.h>
11689d2a 23#include <linux/fs.h>
ac2e41f5 24#include <pthread.h>
07630cea
LP
25#include <stddef.h>
26#include <sys/mman.h>
27#include <sys/statvfs.h>
28#include <sys/uio.h>
29#include <unistd.h>
fb0951b0 30
b5efdb8a 31#include "alloc-util.h"
f27a3864 32#include "btrfs-util.h"
c8b3094d 33#include "chattr-util.h"
07630cea 34#include "compress.h"
3ffd4af2 35#include "fd-util.h"
0284adc6 36#include "journal-authenticate.h"
cec736d2
LP
37#include "journal-def.h"
38#include "journal-file.h"
39#include "lookup3.h"
6bedfcbb 40#include "parse-util.h"
5d1ce257 41#include "path-util.h"
3df3e884 42#include "random-util.h"
7a24f3bf 43#include "sd-event.h"
b58c888f 44#include "set.h"
07630cea 45#include "string-util.h"
4761fd0f 46#include "strv.h"
89a5a90c 47#include "xattr-util.h"
cec736d2 48
4a92baf3
LP
49#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
50#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
cec736d2 51
be19b7df 52#define COMPRESSION_SIZE_THRESHOLD (512ULL)
807e17f0 53
babfc091 54/* This is the minimum journal file size */
16098e93 55#define JOURNAL_FILE_SIZE_MIN (512ULL*1024ULL) /* 512 KiB */
babfc091
LP
56
57/* These are the lower and upper bounds if we deduce the max_use value
58 * from the file system size */
59#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
60#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
61
8580d1f7
LP
62/* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
63#define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */
64
babfc091 65/* This is the upper bound if we deduce max_size from max_use */
71100051 66#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
babfc091
LP
67
68/* This is the upper bound if we deduce the keep_free value from the
69 * file system size */
70#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
71
72/* This is the keep_free value when we can't determine the system
73 * size */
74#define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
75
8580d1f7
LP
76/* This is the default maximum number of journal files to keep around. */
77#define DEFAULT_N_MAX_FILES (100)
78
dca6219e
LP
79/* n_data was the first entry we added after the initial file format design */
80#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
cec736d2 81
a4bcff5b
LP
82/* How many entries to keep in the entry array chain cache at max */
83#define CHAIN_CACHE_MAX 20
84
a676e665
LP
85/* How much to increase the journal file size at once each time we allocate something new. */
86#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
87
2678031a
LP
88/* Reread fstat() of the file for detecting deletions at least this often */
89#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
90
fa6ac760
LP
91/* The mmap context to use for the header: we pick the one right above the last defined object type */
92#define CONTEXT_HEADER _OBJECT_TYPE_MAX
93
51804460
ZJS
94#ifdef __clang__
95# pragma GCC diagnostic ignored "-Waddress-of-packed-member"
96#endif
97
ac2e41f5
VC
98/* This may be called from a separate thread to prevent blocking the caller for the duration of fsync().
99 * As a result we use atomic operations on f->offline_state for inter-thread communications with
100 * journal_file_set_offline() and journal_file_set_online(). */
101static void journal_file_set_offline_internal(JournalFile *f) {
26687bf8 102 assert(f);
ac2e41f5
VC
103 assert(f->fd >= 0);
104 assert(f->header);
105
106 for (;;) {
107 switch (f->offline_state) {
108 case OFFLINE_CANCEL:
109 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_DONE))
110 continue;
111 return;
112
113 case OFFLINE_AGAIN_FROM_SYNCING:
114 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_SYNCING))
115 continue;
116 break;
117
118 case OFFLINE_AGAIN_FROM_OFFLINING:
119 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_SYNCING))
120 continue;
121 break;
122
123 case OFFLINE_SYNCING:
124 (void) fsync(f->fd);
26687bf8 125
ac2e41f5
VC
126 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_OFFLINING))
127 continue;
26687bf8 128
8eb85171 129 f->header->state = f->archive ? STATE_ARCHIVED : STATE_OFFLINE;
ac2e41f5
VC
130 (void) fsync(f->fd);
131 break;
132
133 case OFFLINE_OFFLINING:
134 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_DONE))
135 continue;
4831981d 136 _fallthrough_;
ac2e41f5
VC
137 case OFFLINE_DONE:
138 return;
139
140 case OFFLINE_JOINED:
141 log_debug("OFFLINE_JOINED unexpected offline state for journal_file_set_offline_internal()");
142 return;
143 }
144 }
145}
146
147static void * journal_file_set_offline_thread(void *arg) {
148 JournalFile *f = arg;
149
fa7ff4cf
LP
150 (void) pthread_setname_np(pthread_self(), "journal-offline");
151
ac2e41f5
VC
152 journal_file_set_offline_internal(f);
153
154 return NULL;
155}
156
157static int journal_file_set_offline_thread_join(JournalFile *f) {
158 int r;
159
160 assert(f);
161
162 if (f->offline_state == OFFLINE_JOINED)
163 return 0;
164
165 r = pthread_join(f->offline_thread, NULL);
166 if (r)
167 return -r;
168
169 f->offline_state = OFFLINE_JOINED;
26687bf8 170
be7cdd8e 171 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
172 return -EIO;
173
ac2e41f5
VC
174 return 0;
175}
26687bf8 176
ac2e41f5
VC
177/* Trigger a restart if the offline thread is mid-flight in a restartable state. */
178static bool journal_file_set_offline_try_restart(JournalFile *f) {
179 for (;;) {
180 switch (f->offline_state) {
181 case OFFLINE_AGAIN_FROM_SYNCING:
182 case OFFLINE_AGAIN_FROM_OFFLINING:
183 return true;
184
185 case OFFLINE_CANCEL:
186 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_AGAIN_FROM_SYNCING))
187 continue;
188 return true;
189
190 case OFFLINE_SYNCING:
191 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_AGAIN_FROM_SYNCING))
192 continue;
193 return true;
194
195 case OFFLINE_OFFLINING:
196 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_AGAIN_FROM_OFFLINING))
197 continue;
198 return true;
26687bf8
OS
199
200 default:
ac2e41f5
VC
201 return false;
202 }
26687bf8
OS
203 }
204}
205
ac2e41f5
VC
206/* Sets a journal offline.
207 *
208 * If wait is false then an offline is dispatched in a separate thread for a
209 * subsequent journal_file_set_offline() or journal_file_set_online() of the
210 * same journal to synchronize with.
211 *
212 * If wait is true, then either an existing offline thread will be restarted
213 * and joined, or if none exists the offline is simply performed in this
214 * context without involving another thread.
215 */
216int journal_file_set_offline(JournalFile *f, bool wait) {
217 bool restarted;
218 int r;
219
26687bf8
OS
220 assert(f);
221
222 if (!f->writable)
223 return -EPERM;
224
225 if (!(f->fd >= 0 && f->header))
226 return -EINVAL;
227
b8f99e27
VC
228 /* An offlining journal is implicitly online and may modify f->header->state,
229 * we must also join any potentially lingering offline thread when not online. */
230 if (!journal_file_is_offlining(f) && f->header->state != STATE_ONLINE)
231 return journal_file_set_offline_thread_join(f);
26687bf8 232
ac2e41f5
VC
233 /* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */
234 restarted = journal_file_set_offline_try_restart(f);
235 if ((restarted && wait) || !restarted) {
236 r = journal_file_set_offline_thread_join(f);
237 if (r < 0)
238 return r;
239 }
26687bf8 240
ac2e41f5
VC
241 if (restarted)
242 return 0;
243
244 /* Initiate a new offline. */
245 f->offline_state = OFFLINE_SYNCING;
fa6ac760 246
ac2e41f5
VC
247 if (wait) /* Without using a thread if waiting. */
248 journal_file_set_offline_internal(f);
249 else {
5e9f01e8
LP
250 sigset_t ss, saved_ss;
251 int k;
252
253 if (sigfillset(&ss) < 0)
254 return -errno;
255
256 r = pthread_sigmask(SIG_BLOCK, &ss, &saved_ss);
257 if (r > 0)
258 return -r;
259
ac2e41f5 260 r = pthread_create(&f->offline_thread, NULL, journal_file_set_offline_thread, f);
5e9f01e8
LP
261
262 k = pthread_sigmask(SIG_SETMASK, &saved_ss, NULL);
ec9ffa2c
VC
263 if (r > 0) {
264 f->offline_state = OFFLINE_JOINED;
ac2e41f5 265 return -r;
ec9ffa2c 266 }
5e9f01e8
LP
267 if (k > 0)
268 return -k;
ac2e41f5
VC
269 }
270
271 return 0;
272}
273
274static int journal_file_set_online(JournalFile *f) {
275 bool joined = false;
276
277 assert(f);
278
279 if (!f->writable)
280 return -EPERM;
281
282 if (!(f->fd >= 0 && f->header))
283 return -EINVAL;
284
285 while (!joined) {
286 switch (f->offline_state) {
287 case OFFLINE_JOINED:
288 /* No offline thread, no need to wait. */
289 joined = true;
290 break;
291
292 case OFFLINE_SYNCING:
293 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_CANCEL))
294 continue;
295 /* Canceled syncing prior to offlining, no need to wait. */
296 break;
297
298 case OFFLINE_AGAIN_FROM_SYNCING:
299 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_CANCEL))
300 continue;
301 /* Canceled restart from syncing, no need to wait. */
302 break;
303
304 case OFFLINE_AGAIN_FROM_OFFLINING:
305 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_CANCEL))
306 continue;
307 /* Canceled restart from offlining, must wait for offlining to complete however. */
4831981d 308 _fallthrough_;
ac2e41f5
VC
309 default: {
310 int r;
311
312 r = journal_file_set_offline_thread_join(f);
313 if (r < 0)
314 return r;
315
316 joined = true;
317 break;
318 }
319 }
320 }
26687bf8 321
be7cdd8e 322 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
323 return -EIO;
324
ac2e41f5
VC
325 switch (f->header->state) {
326 case STATE_ONLINE:
327 return 0;
26687bf8 328
ac2e41f5
VC
329 case STATE_OFFLINE:
330 f->header->state = STATE_ONLINE;
331 (void) fsync(f->fd);
332 return 0;
333
334 default:
335 return -EINVAL;
336 }
26687bf8
OS
337}
338
b58c888f
VC
339bool journal_file_is_offlining(JournalFile *f) {
340 assert(f);
341
342 __sync_synchronize();
343
3742095b 344 if (IN_SET(f->offline_state, OFFLINE_DONE, OFFLINE_JOINED))
b58c888f
VC
345 return false;
346
347 return true;
348}
349
804ae586 350JournalFile* journal_file_close(JournalFile *f) {
de190aef 351 assert(f);
cec736d2 352
349cc4a5 353#if HAVE_GCRYPT
b0af6f41 354 /* Write the final tag */
43cd8794
FB
355 if (f->seal && f->writable) {
356 int r;
357
358 r = journal_file_append_tag(f);
359 if (r < 0)
360 log_error_errno(r, "Failed to append tag when closing journal: %m");
361 }
feb12d3e 362#endif
b0af6f41 363
7a24f3bf
VC
364 if (f->post_change_timer) {
365 int enabled;
366
367 if (sd_event_source_get_enabled(f->post_change_timer, &enabled) >= 0)
368 if (enabled == SD_EVENT_ONESHOT)
369 journal_file_post_change(f);
370
e167d7fd 371 (void) sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_OFF);
7a24f3bf
VC
372 sd_event_source_unref(f->post_change_timer);
373 }
374
ac2e41f5 375 journal_file_set_offline(f, true);
cec736d2 376
be7cdd8e
VC
377 if (f->mmap && f->cache_fd)
378 mmap_cache_free_fd(f->mmap, f->cache_fd);
cec736d2 379
11689d2a
LP
380 if (f->fd >= 0 && f->defrag_on_close) {
381
382 /* Be friendly to btrfs: turn COW back on again now,
383 * and defragment the file. We won't write to the file
384 * ever again, hence remove all fragmentation, and
385 * reenable all the good bits COW usually provides
386 * (such as data checksumming). */
387
1ed8f8c1 388 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
11689d2a
LP
389 (void) btrfs_defrag_fd(f->fd);
390 }
f27a3864 391
5d1ce257
LP
392 if (f->close_fd)
393 safe_close(f->fd);
cec736d2 394 free(f->path);
807e17f0 395
f649045c 396 mmap_cache_unref(f->mmap);
16e9f408 397
4743015d 398 ordered_hashmap_free_free(f->chain_cache);
a4bcff5b 399
349cc4a5 400#if HAVE_XZ || HAVE_LZ4
807e17f0
LP
401 free(f->compress_buffer);
402#endif
403
349cc4a5 404#if HAVE_GCRYPT
baed47c3
LP
405 if (f->fss_file)
406 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
dc4ebc07 407 else
b7c9ae91
LP
408 free(f->fsprg_state);
409
410 free(f->fsprg_seed);
7560fffc
LP
411
412 if (f->hmac)
413 gcry_md_close(f->hmac);
414#endif
415
6b430fdb 416 return mfree(f);
cec736d2
LP
417}
418
0ac38b70 419static int journal_file_init_header(JournalFile *f, JournalFile *template) {
d89c8fdf 420 Header h = {};
cec736d2
LP
421 ssize_t k;
422 int r;
423
424 assert(f);
425
7560fffc 426 memcpy(h.signature, HEADER_SIGNATURE, 8);
23b0b2b2 427 h.header_size = htole64(ALIGN64(sizeof(h)));
cec736d2 428
d89c8fdf
ZJS
429 h.incompatible_flags |= htole32(
430 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
431 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
7560fffc 432
d89c8fdf
ZJS
433 h.compatible_flags = htole32(
434 f->seal * HEADER_COMPATIBLE_SEALED);
7560fffc 435
cec736d2
LP
436 r = sd_id128_randomize(&h.file_id);
437 if (r < 0)
438 return r;
439
0ac38b70
LP
440 if (template) {
441 h.seqnum_id = template->header->seqnum_id;
beec0085 442 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
0ac38b70
LP
443 } else
444 h.seqnum_id = h.file_id;
cec736d2
LP
445
446 k = pwrite(f->fd, &h, sizeof(h), 0);
447 if (k < 0)
448 return -errno;
449
450 if (k != sizeof(h))
451 return -EIO;
452
453 return 0;
454}
455
a0fe2a2d
LP
456static int fsync_directory_of_file(int fd) {
457 _cleanup_free_ char *path = NULL, *dn = NULL;
458 _cleanup_close_ int dfd = -1;
459 struct stat st;
460 int r;
461
462 if (fstat(fd, &st) < 0)
463 return -errno;
464
465 if (!S_ISREG(st.st_mode))
466 return -EBADFD;
467
468 r = fd_get_path(fd, &path);
469 if (r < 0)
470 return r;
471
472 if (!path_is_absolute(path))
473 return -EINVAL;
474
475 dn = dirname_malloc(path);
476 if (!dn)
477 return -ENOMEM;
478
479 dfd = open(dn, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
480 if (dfd < 0)
481 return -errno;
482
483 if (fsync(dfd) < 0)
484 return -errno;
485
486 return 0;
487}
488
cec736d2 489static int journal_file_refresh_header(JournalFile *f) {
de190aef 490 sd_id128_t boot_id;
fa6ac760 491 int r;
cec736d2
LP
492
493 assert(f);
c88cc6af 494 assert(f->header);
cec736d2
LP
495
496 r = sd_id128_get_machine(&f->header->machine_id);
497 if (r < 0)
498 return r;
499
de190aef 500 r = sd_id128_get_boot(&boot_id);
cec736d2
LP
501 if (r < 0)
502 return r;
503
de190aef
LP
504 if (sd_id128_equal(boot_id, f->header->boot_id))
505 f->tail_entry_monotonic_valid = true;
506
507 f->header->boot_id = boot_id;
508
fa6ac760 509 r = journal_file_set_online(f);
b788cc23 510
7560fffc 511 /* Sync the online state to disk */
fb426037 512 (void) fsync(f->fd);
b788cc23 513
a0fe2a2d
LP
514 /* We likely just created a new file, also sync the directory this file is located in. */
515 (void) fsync_directory_of_file(f->fd);
516
fa6ac760 517 return r;
cec736d2
LP
518}
519
4214009f
ZJS
520static bool warn_wrong_flags(const JournalFile *f, bool compatible) {
521 const uint32_t any = compatible ? HEADER_COMPATIBLE_ANY : HEADER_INCOMPATIBLE_ANY,
522 supported = compatible ? HEADER_COMPATIBLE_SUPPORTED : HEADER_INCOMPATIBLE_SUPPORTED;
523 const char *type = compatible ? "compatible" : "incompatible";
d89c8fdf
ZJS
524 uint32_t flags;
525
4214009f
ZJS
526 flags = le32toh(compatible ? f->header->compatible_flags : f->header->incompatible_flags);
527
528 if (flags & ~supported) {
529 if (flags & ~any)
4761fd0f 530 log_debug("Journal file %s has unknown %s flags 0x%"PRIx32,
4214009f
ZJS
531 f->path, type, flags & ~any);
532 flags = (flags & any) & ~supported;
4761fd0f
ZJS
533 if (flags) {
534 const char* strv[3];
535 unsigned n = 0;
536 _cleanup_free_ char *t = NULL;
537
538 if (compatible && (flags & HEADER_COMPATIBLE_SEALED))
539 strv[n++] = "sealed";
540 if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_XZ))
541 strv[n++] = "xz-compressed";
542 if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_LZ4))
543 strv[n++] = "lz4-compressed";
544 strv[n] = NULL;
545 assert(n < ELEMENTSOF(strv));
546
547 t = strv_join((char**) strv, ", ");
548 log_debug("Journal file %s uses %s %s %s disabled at compilation time.",
549 f->path, type, n > 1 ? "flags" : "flag", strnull(t));
550 }
4214009f
ZJS
551 return true;
552 }
553
554 return false;
555}
556
557static int journal_file_verify_header(JournalFile *f) {
6f94e420
TS
558 uint64_t arena_size, header_size;
559
cec736d2 560 assert(f);
c88cc6af 561 assert(f->header);
cec736d2 562
7560fffc 563 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
cec736d2
LP
564 return -EBADMSG;
565
4214009f
ZJS
566 /* In both read and write mode we refuse to open files with incompatible
567 * flags we don't know. */
568 if (warn_wrong_flags(f, false))
cec736d2
LP
569 return -EPROTONOSUPPORT;
570
4214009f
ZJS
571 /* When open for writing we refuse to open files with compatible flags, too. */
572 if (f->writable && warn_wrong_flags(f, true))
d89c8fdf 573 return -EPROTONOSUPPORT;
7560fffc 574
db11ac1a
LP
575 if (f->header->state >= _STATE_MAX)
576 return -EBADMSG;
577
6f94e420
TS
578 header_size = le64toh(f->header->header_size);
579
dca6219e 580 /* The first addition was n_data, so check that we are at least this large */
6f94e420 581 if (header_size < HEADER_SIZE_MIN)
23b0b2b2
LP
582 return -EBADMSG;
583
8088cbd3 584 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
beec0085
LP
585 return -EBADMSG;
586
6f94e420
TS
587 arena_size = le64toh(f->header->arena_size);
588
589 if (UINT64_MAX - header_size < arena_size || header_size + arena_size > (uint64_t) f->last_stat.st_size)
db11ac1a
LP
590 return -ENODATA;
591
6f94e420 592 if (le64toh(f->header->tail_object_offset) > header_size + arena_size)
db11ac1a
LP
593 return -ENODATA;
594
7762e02b
LP
595 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
596 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
597 !VALID64(le64toh(f->header->tail_object_offset)) ||
598 !VALID64(le64toh(f->header->entry_array_offset)))
599 return -ENODATA;
600
cec736d2 601 if (f->writable) {
cec736d2 602 sd_id128_t machine_id;
ae739cc1 603 uint8_t state;
cec736d2
LP
604 int r;
605
606 r = sd_id128_get_machine(&machine_id);
607 if (r < 0)
608 return r;
609
610 if (!sd_id128_equal(machine_id, f->header->machine_id))
611 return -EHOSTDOWN;
612
de190aef 613 state = f->header->state;
cec736d2 614
b288cdeb
ZJS
615 if (state == STATE_ARCHIVED)
616 return -ESHUTDOWN; /* Already archived */
617 else if (state == STATE_ONLINE) {
71fa6f00
LP
618 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
619 return -EBUSY;
b288cdeb 620 } else if (state != STATE_OFFLINE) {
8facc349 621 log_debug("Journal file %s has unknown state %i.", f->path, state);
71fa6f00
LP
622 return -EBUSY;
623 }
ae739cc1 624
5b3cc0c8
YN
625 if (f->header->field_hash_table_size == 0 || f->header->data_hash_table_size == 0)
626 return -EBADMSG;
627
ae739cc1
LP
628 /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't
629 * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks
630 * bisection. */
631 if (le64toh(f->header->tail_entry_realtime) > now(CLOCK_REALTIME)) {
632 log_debug("Journal file %s is from the future, refusing to append new data to it that'd be older.", f->path);
633 return -ETXTBSY;
634 }
cec736d2
LP
635 }
636
d89c8fdf
ZJS
637 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
638 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
c586dbf1 639
f1889c91 640 f->seal = JOURNAL_HEADER_SEALED(f->header);
7560fffc 641
cec736d2
LP
642 return 0;
643}
644
2678031a
LP
645static int journal_file_fstat(JournalFile *f) {
646 assert(f);
647 assert(f->fd >= 0);
648
649 if (fstat(f->fd, &f->last_stat) < 0)
650 return -errno;
651
652 f->last_stat_usec = now(CLOCK_MONOTONIC);
653
8d6a4d33
LP
654 /* Refuse dealing with with files that aren't regular */
655 if (S_ISDIR(f->last_stat.st_mode))
656 return -EISDIR;
657 if (!S_ISREG(f->last_stat.st_mode))
658 return -EBADFD;
659
2678031a
LP
660 /* Refuse appending to files that are already deleted */
661 if (f->last_stat.st_nlink <= 0)
662 return -EIDRM;
663
664 return 0;
665}
666
cec736d2 667static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
eda4b58b 668 uint64_t old_size, new_size;
fec2aa2f 669 int r;
cec736d2
LP
670
671 assert(f);
c88cc6af 672 assert(f->header);
cec736d2 673
cec736d2 674 /* We assume that this file is not sparse, and we know that
38ac38b2 675 * for sure, since we always call posix_fallocate()
cec736d2
LP
676 * ourselves */
677
be7cdd8e 678 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
679 return -EIO;
680
cec736d2 681 old_size =
23b0b2b2 682 le64toh(f->header->header_size) +
cec736d2
LP
683 le64toh(f->header->arena_size);
684
bc85bfee 685 new_size = PAGE_ALIGN(offset + size);
23b0b2b2
LP
686 if (new_size < le64toh(f->header->header_size))
687 new_size = le64toh(f->header->header_size);
bc85bfee 688
2678031a
LP
689 if (new_size <= old_size) {
690
691 /* We already pre-allocated enough space, but before
692 * we write to it, let's check with fstat() if the
693 * file got deleted, in order make sure we don't throw
694 * away the data immediately. Don't check fstat() for
695 * all writes though, but only once ever 10s. */
696
697 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
698 return 0;
699
700 return journal_file_fstat(f);
701 }
702
703 /* Allocate more space. */
cec736d2 704
a676e665 705 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
bc85bfee 706 return -E2BIG;
cec736d2 707
a676e665 708 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
cec736d2
LP
709 struct statvfs svfs;
710
711 if (fstatvfs(f->fd, &svfs) >= 0) {
712 uint64_t available;
713
070052ab 714 available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
cec736d2
LP
715
716 if (new_size - old_size > available)
717 return -E2BIG;
718 }
719 }
720
eda4b58b
LP
721 /* Increase by larger blocks at once */
722 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
723 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
724 new_size = f->metrics.max_size;
725
bc85bfee
LP
726 /* Note that the glibc fallocate() fallback is very
727 inefficient, hence we try to minimize the allocation area
728 as we can. */
fec2aa2f
GV
729 r = posix_fallocate(f->fd, old_size, new_size - old_size);
730 if (r != 0)
731 return -r;
cec736d2 732
23b0b2b2 733 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
cec736d2 734
2678031a 735 return journal_file_fstat(f);
cec736d2
LP
736}
737
78519831 738static unsigned type_to_context(ObjectType type) {
d3d3208f 739 /* One context for each type, plus one catch-all for the rest */
69adae51 740 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
fa6ac760 741 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
d05089d8 742 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
d3d3208f
MS
743}
744
b439282e 745static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret, size_t *ret_size) {
2678031a
LP
746 int r;
747
cec736d2 748 assert(f);
cec736d2
LP
749 assert(ret);
750
7762e02b
LP
751 if (size <= 0)
752 return -EINVAL;
753
2a59ea54 754 /* Avoid SIGBUS on invalid accesses */
4bbdcdb3
LP
755 if (offset + size > (uint64_t) f->last_stat.st_size) {
756 /* Hmm, out of range? Let's refresh the fstat() data
757 * first, before we trust that check. */
758
2678031a
LP
759 r = journal_file_fstat(f);
760 if (r < 0)
761 return r;
762
763 if (offset + size > (uint64_t) f->last_stat.st_size)
4bbdcdb3
LP
764 return -EADDRNOTAVAIL;
765 }
766
b439282e 767 return mmap_cache_get(f->mmap, f->cache_fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret, ret_size);
cec736d2
LP
768}
769
16e9f408
LP
770static uint64_t minimum_header_size(Object *o) {
771
b8e891e6 772 static const uint64_t table[] = {
16e9f408
LP
773 [OBJECT_DATA] = sizeof(DataObject),
774 [OBJECT_FIELD] = sizeof(FieldObject),
775 [OBJECT_ENTRY] = sizeof(EntryObject),
776 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
777 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
778 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
779 [OBJECT_TAG] = sizeof(TagObject),
780 };
781
782 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
783 return sizeof(ObjectHeader);
784
785 return table[o->object.type];
786}
787
24754f36
TR
788/* Lightweight object checks. We want this to be fast, so that we won't
789 * slowdown every journal_file_move_to_object() call too much. */
790static int journal_file_check_object(JournalFile *f, uint64_t offset, Object *o) {
791 assert(f);
792 assert(o);
793
794 switch (o->object.type) {
795
796 case OBJECT_DATA: {
797 if ((le64toh(o->data.entry_offset) == 0) ^ (le64toh(o->data.n_entries) == 0)) {
798 log_debug("Bad n_entries: %"PRIu64": %"PRIu64,
10e8445b 799 le64toh(o->data.n_entries), offset);
24754f36
TR
800 return -EBADMSG;
801 }
802
803 if (le64toh(o->object.size) - offsetof(DataObject, payload) <= 0) {
804 log_debug("Bad object size (<= %zu): %"PRIu64": %"PRIu64,
805 offsetof(DataObject, payload),
806 le64toh(o->object.size),
807 offset);
808 return -EBADMSG;
809 }
810
10e8445b
TR
811 if (!VALID64(le64toh(o->data.next_hash_offset)) ||
812 !VALID64(le64toh(o->data.next_field_offset)) ||
813 !VALID64(le64toh(o->data.entry_offset)) ||
814 !VALID64(le64toh(o->data.entry_array_offset))) {
24754f36
TR
815 log_debug("Invalid offset, next_hash_offset="OFSfmt", next_field_offset="OFSfmt
816 ", entry_offset="OFSfmt", entry_array_offset="OFSfmt": %"PRIu64,
10e8445b
TR
817 le64toh(o->data.next_hash_offset),
818 le64toh(o->data.next_field_offset),
819 le64toh(o->data.entry_offset),
820 le64toh(o->data.entry_array_offset),
24754f36
TR
821 offset);
822 return -EBADMSG;
823 }
824
825 break;
826 }
827
828 case OBJECT_FIELD:
829 if (le64toh(o->object.size) - offsetof(FieldObject, payload) <= 0) {
830 log_debug(
831 "Bad field size (<= %zu): %"PRIu64": %"PRIu64,
832 offsetof(FieldObject, payload),
833 le64toh(o->object.size),
834 offset);
835 return -EBADMSG;
836 }
837
10e8445b
TR
838 if (!VALID64(le64toh(o->field.next_hash_offset)) ||
839 !VALID64(le64toh(o->field.head_data_offset))) {
24754f36
TR
840 log_debug(
841 "Invalid offset, next_hash_offset="OFSfmt
842 ", head_data_offset="OFSfmt": %"PRIu64,
10e8445b
TR
843 le64toh(o->field.next_hash_offset),
844 le64toh(o->field.head_data_offset),
24754f36
TR
845 offset);
846 return -EBADMSG;
847 }
848 break;
849
850 case OBJECT_ENTRY:
851 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) % sizeof(EntryItem) != 0) {
852 log_debug(
853 "Bad entry size (<= %zu): %"PRIu64": %"PRIu64,
854 offsetof(EntryObject, items),
855 le64toh(o->object.size),
856 offset);
857 return -EBADMSG;
858 }
859
860 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem) <= 0) {
861 log_debug(
862 "Invalid number items in entry: %"PRIu64": %"PRIu64,
863 (le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem),
864 offset);
865 return -EBADMSG;
866 }
867
868 if (le64toh(o->entry.seqnum) <= 0) {
869 log_debug(
870 "Invalid entry seqnum: %"PRIx64": %"PRIu64,
871 le64toh(o->entry.seqnum),
872 offset);
873 return -EBADMSG;
874 }
875
876 if (!VALID_REALTIME(le64toh(o->entry.realtime))) {
877 log_debug(
878 "Invalid entry realtime timestamp: %"PRIu64": %"PRIu64,
879 le64toh(o->entry.realtime),
880 offset);
881 return -EBADMSG;
882 }
883
884 if (!VALID_MONOTONIC(le64toh(o->entry.monotonic))) {
885 log_debug(
886 "Invalid entry monotonic timestamp: %"PRIu64": %"PRIu64,
887 le64toh(o->entry.monotonic),
888 offset);
889 return -EBADMSG;
890 }
891
892 break;
893
894 case OBJECT_DATA_HASH_TABLE:
895 case OBJECT_FIELD_HASH_TABLE:
896 if ((le64toh(o->object.size) - offsetof(HashTableObject, items)) % sizeof(HashItem) != 0 ||
897 (le64toh(o->object.size) - offsetof(HashTableObject, items)) / sizeof(HashItem) <= 0) {
898 log_debug(
899 "Invalid %s hash table size: %"PRIu64": %"PRIu64,
900 o->object.type == OBJECT_DATA_HASH_TABLE ? "data" : "field",
901 le64toh(o->object.size),
902 offset);
903 return -EBADMSG;
904 }
905
906 break;
907
908 case OBJECT_ENTRY_ARRAY:
909 if ((le64toh(o->object.size) - offsetof(EntryArrayObject, items)) % sizeof(le64_t) != 0 ||
910 (le64toh(o->object.size) - offsetof(EntryArrayObject, items)) / sizeof(le64_t) <= 0) {
911 log_debug(
912 "Invalid object entry array size: %"PRIu64": %"PRIu64,
913 le64toh(o->object.size),
914 offset);
915 return -EBADMSG;
916 }
917
10e8445b 918 if (!VALID64(le64toh(o->entry_array.next_entry_array_offset))) {
24754f36
TR
919 log_debug(
920 "Invalid object entry array next_entry_array_offset: "OFSfmt": %"PRIu64,
10e8445b 921 le64toh(o->entry_array.next_entry_array_offset),
24754f36
TR
922 offset);
923 return -EBADMSG;
924 }
925
926 break;
927
928 case OBJECT_TAG:
929 if (le64toh(o->object.size) != sizeof(TagObject)) {
930 log_debug(
931 "Invalid object tag size: %"PRIu64": %"PRIu64,
932 le64toh(o->object.size),
933 offset);
934 return -EBADMSG;
935 }
936
10e8445b 937 if (!VALID_EPOCH(le64toh(o->tag.epoch))) {
24754f36
TR
938 log_debug(
939 "Invalid object tag epoch: %"PRIu64": %"PRIu64,
10e8445b 940 le64toh(o->tag.epoch),
24754f36
TR
941 offset);
942 return -EBADMSG;
943 }
944
945 break;
946 }
947
948 return 0;
949}
950
78519831 951int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
cec736d2
LP
952 int r;
953 void *t;
b439282e 954 size_t tsize;
cec736d2
LP
955 Object *o;
956 uint64_t s;
957
958 assert(f);
959 assert(ret);
960
db11ac1a 961 /* Objects may only be located at multiple of 64 bit */
202fd896
LP
962 if (!VALID64(offset)) {
963 log_debug("Attempt to move to object at non-64bit boundary: %" PRIu64, offset);
bd30fdf2 964 return -EBADMSG;
202fd896 965 }
db11ac1a 966
50809d7a 967 /* Object may not be located in the file header */
202fd896
LP
968 if (offset < le64toh(f->header->header_size)) {
969 log_debug("Attempt to move to object located in file header: %" PRIu64, offset);
50809d7a 970 return -EBADMSG;
202fd896 971 }
50809d7a 972
b439282e 973 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t, &tsize);
cec736d2
LP
974 if (r < 0)
975 return r;
976
977 o = (Object*) t;
978 s = le64toh(o->object.size);
979
1c69f096
LP
980 if (s == 0) {
981 log_debug("Attempt to move to uninitialized object: %" PRIu64, offset);
982 return -EBADMSG;
983 }
202fd896
LP
984 if (s < sizeof(ObjectHeader)) {
985 log_debug("Attempt to move to overly short object: %" PRIu64, offset);
cec736d2 986 return -EBADMSG;
202fd896 987 }
cec736d2 988
202fd896
LP
989 if (o->object.type <= OBJECT_UNUSED) {
990 log_debug("Attempt to move to object with invalid type: %" PRIu64, offset);
16e9f408 991 return -EBADMSG;
202fd896 992 }
16e9f408 993
202fd896
LP
994 if (s < minimum_header_size(o)) {
995 log_debug("Attempt to move to truncated object: %" PRIu64, offset);
16e9f408 996 return -EBADMSG;
202fd896 997 }
16e9f408 998
202fd896
LP
999 if (type > OBJECT_UNUSED && o->object.type != type) {
1000 log_debug("Attempt to move to object of unexpected type: %" PRIu64, offset);
cec736d2 1001 return -EBADMSG;
202fd896 1002 }
cec736d2 1003
b439282e
VC
1004 if (s > tsize) {
1005 r = journal_file_move_to(f, type, false, offset, s, &t, NULL);
cec736d2
LP
1006 if (r < 0)
1007 return r;
1008
1009 o = (Object*) t;
1010 }
1011
24754f36
TR
1012 r = journal_file_check_object(f, offset, o);
1013 if (r < 0)
1014 return r;
1015
cec736d2
LP
1016 *ret = o;
1017 return 0;
1018}
1019
d98cc1f2 1020static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
cec736d2
LP
1021 uint64_t r;
1022
1023 assert(f);
c88cc6af 1024 assert(f->header);
cec736d2 1025
beec0085 1026 r = le64toh(f->header->tail_entry_seqnum) + 1;
c2373f84
LP
1027
1028 if (seqnum) {
de190aef 1029 /* If an external seqnum counter was passed, we update
c2373f84
LP
1030 * both the local and the external one, and set it to
1031 * the maximum of both */
1032
1033 if (*seqnum + 1 > r)
1034 r = *seqnum + 1;
1035
1036 *seqnum = r;
1037 }
1038
beec0085 1039 f->header->tail_entry_seqnum = htole64(r);
cec736d2 1040
beec0085
LP
1041 if (f->header->head_entry_seqnum == 0)
1042 f->header->head_entry_seqnum = htole64(r);
de190aef 1043
cec736d2
LP
1044 return r;
1045}
1046
78519831 1047int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
cec736d2
LP
1048 int r;
1049 uint64_t p;
1050 Object *tail, *o;
1051 void *t;
1052
1053 assert(f);
c88cc6af 1054 assert(f->header);
d05089d8 1055 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
cec736d2
LP
1056 assert(size >= sizeof(ObjectHeader));
1057 assert(offset);
1058 assert(ret);
1059
26687bf8
OS
1060 r = journal_file_set_online(f);
1061 if (r < 0)
1062 return r;
1063
cec736d2 1064 p = le64toh(f->header->tail_object_offset);
cec736d2 1065 if (p == 0)
23b0b2b2 1066 p = le64toh(f->header->header_size);
cec736d2 1067 else {
d05089d8 1068 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
cec736d2
LP
1069 if (r < 0)
1070 return r;
1071
1072 p += ALIGN64(le64toh(tail->object.size));
1073 }
1074
1075 r = journal_file_allocate(f, p, size);
1076 if (r < 0)
1077 return r;
1078
b439282e 1079 r = journal_file_move_to(f, type, false, p, size, &t, NULL);
cec736d2
LP
1080 if (r < 0)
1081 return r;
1082
1083 o = (Object*) t;
1084
1085 zero(o->object);
de190aef 1086 o->object.type = type;
cec736d2
LP
1087 o->object.size = htole64(size);
1088
1089 f->header->tail_object_offset = htole64(p);
cec736d2
LP
1090 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
1091
1092 *ret = o;
1093 *offset = p;
1094
1095 return 0;
1096}
1097
de190aef 1098static int journal_file_setup_data_hash_table(JournalFile *f) {
cec736d2
LP
1099 uint64_t s, p;
1100 Object *o;
1101 int r;
1102
1103 assert(f);
c88cc6af 1104 assert(f->header);
cec736d2 1105
070052ab
LP
1106 /* We estimate that we need 1 hash table entry per 768 bytes
1107 of journal file and we want to make sure we never get
1108 beyond 75% fill level. Calculate the hash table size for
1109 the maximum file size based on these metrics. */
4a92baf3 1110
dfabe643 1111 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
4a92baf3
LP
1112 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
1113 s = DEFAULT_DATA_HASH_TABLE_SIZE;
1114
507f22bd 1115 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
4a92baf3 1116
de190aef
LP
1117 r = journal_file_append_object(f,
1118 OBJECT_DATA_HASH_TABLE,
1119 offsetof(Object, hash_table.items) + s,
1120 &o, &p);
cec736d2
LP
1121 if (r < 0)
1122 return r;
1123
29804cc1 1124 memzero(o->hash_table.items, s);
cec736d2 1125
de190aef
LP
1126 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1127 f->header->data_hash_table_size = htole64(s);
cec736d2
LP
1128
1129 return 0;
1130}
1131
de190aef 1132static int journal_file_setup_field_hash_table(JournalFile *f) {
cec736d2
LP
1133 uint64_t s, p;
1134 Object *o;
1135 int r;
1136
1137 assert(f);
c88cc6af 1138 assert(f->header);
cec736d2 1139
3c1668da
LP
1140 /* We use a fixed size hash table for the fields as this
1141 * number should grow very slowly only */
1142
de190aef
LP
1143 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
1144 r = journal_file_append_object(f,
1145 OBJECT_FIELD_HASH_TABLE,
1146 offsetof(Object, hash_table.items) + s,
1147 &o, &p);
cec736d2
LP
1148 if (r < 0)
1149 return r;
1150
29804cc1 1151 memzero(o->hash_table.items, s);
cec736d2 1152
de190aef
LP
1153 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1154 f->header->field_hash_table_size = htole64(s);
cec736d2
LP
1155
1156 return 0;
1157}
1158
dade37d4 1159int journal_file_map_data_hash_table(JournalFile *f) {
cec736d2
LP
1160 uint64_t s, p;
1161 void *t;
1162 int r;
1163
1164 assert(f);
c88cc6af 1165 assert(f->header);
cec736d2 1166
dade37d4
LP
1167 if (f->data_hash_table)
1168 return 0;
1169
de190aef
LP
1170 p = le64toh(f->header->data_hash_table_offset);
1171 s = le64toh(f->header->data_hash_table_size);
cec736d2 1172
de190aef 1173 r = journal_file_move_to(f,
16e9f408 1174 OBJECT_DATA_HASH_TABLE,
fcde2389 1175 true,
de190aef 1176 p, s,
b42549ad 1177 &t, NULL);
cec736d2
LP
1178 if (r < 0)
1179 return r;
1180
de190aef 1181 f->data_hash_table = t;
cec736d2
LP
1182 return 0;
1183}
1184
dade37d4 1185int journal_file_map_field_hash_table(JournalFile *f) {
cec736d2
LP
1186 uint64_t s, p;
1187 void *t;
1188 int r;
1189
1190 assert(f);
c88cc6af 1191 assert(f->header);
cec736d2 1192
dade37d4
LP
1193 if (f->field_hash_table)
1194 return 0;
1195
de190aef
LP
1196 p = le64toh(f->header->field_hash_table_offset);
1197 s = le64toh(f->header->field_hash_table_size);
cec736d2 1198
de190aef 1199 r = journal_file_move_to(f,
16e9f408 1200 OBJECT_FIELD_HASH_TABLE,
fcde2389 1201 true,
de190aef 1202 p, s,
b42549ad 1203 &t, NULL);
cec736d2
LP
1204 if (r < 0)
1205 return r;
1206
de190aef 1207 f->field_hash_table = t;
cec736d2
LP
1208 return 0;
1209}
1210
3c1668da
LP
1211static int journal_file_link_field(
1212 JournalFile *f,
1213 Object *o,
1214 uint64_t offset,
1215 uint64_t hash) {
1216
805d1486 1217 uint64_t p, h, m;
3c1668da
LP
1218 int r;
1219
1220 assert(f);
c88cc6af 1221 assert(f->header);
90d222c1 1222 assert(f->field_hash_table);
3c1668da
LP
1223 assert(o);
1224 assert(offset > 0);
1225
1226 if (o->object.type != OBJECT_FIELD)
1227 return -EINVAL;
1228
805d1486
LP
1229 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
1230 if (m <= 0)
1231 return -EBADMSG;
3c1668da 1232
805d1486 1233 /* This might alter the window we are looking at */
3c1668da
LP
1234 o->field.next_hash_offset = o->field.head_data_offset = 0;
1235
805d1486 1236 h = hash % m;
3c1668da
LP
1237 p = le64toh(f->field_hash_table[h].tail_hash_offset);
1238 if (p == 0)
1239 f->field_hash_table[h].head_hash_offset = htole64(offset);
1240 else {
1241 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1242 if (r < 0)
1243 return r;
1244
1245 o->field.next_hash_offset = htole64(offset);
1246 }
1247
1248 f->field_hash_table[h].tail_hash_offset = htole64(offset);
1249
1250 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
1251 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
1252
1253 return 0;
1254}
1255
1256static int journal_file_link_data(
1257 JournalFile *f,
1258 Object *o,
1259 uint64_t offset,
1260 uint64_t hash) {
1261
805d1486 1262 uint64_t p, h, m;
cec736d2
LP
1263 int r;
1264
1265 assert(f);
c88cc6af 1266 assert(f->header);
90d222c1 1267 assert(f->data_hash_table);
cec736d2
LP
1268 assert(o);
1269 assert(offset > 0);
b588975f
LP
1270
1271 if (o->object.type != OBJECT_DATA)
1272 return -EINVAL;
cec736d2 1273
805d1486
LP
1274 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
1275 if (m <= 0)
1276 return -EBADMSG;
48496df6 1277
805d1486 1278 /* This might alter the window we are looking at */
de190aef
LP
1279 o->data.next_hash_offset = o->data.next_field_offset = 0;
1280 o->data.entry_offset = o->data.entry_array_offset = 0;
1281 o->data.n_entries = 0;
cec736d2 1282
805d1486 1283 h = hash % m;
8db4213e 1284 p = le64toh(f->data_hash_table[h].tail_hash_offset);
3c1668da 1285 if (p == 0)
cec736d2 1286 /* Only entry in the hash table is easy */
de190aef 1287 f->data_hash_table[h].head_hash_offset = htole64(offset);
3c1668da 1288 else {
48496df6
LP
1289 /* Move back to the previous data object, to patch in
1290 * pointer */
cec736d2 1291
de190aef 1292 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1293 if (r < 0)
1294 return r;
1295
de190aef 1296 o->data.next_hash_offset = htole64(offset);
cec736d2
LP
1297 }
1298
de190aef 1299 f->data_hash_table[h].tail_hash_offset = htole64(offset);
cec736d2 1300
dca6219e
LP
1301 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
1302 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
1303
cec736d2
LP
1304 return 0;
1305}
1306
3c1668da
LP
1307int journal_file_find_field_object_with_hash(
1308 JournalFile *f,
1309 const void *field, uint64_t size, uint64_t hash,
1310 Object **ret, uint64_t *offset) {
1311
805d1486 1312 uint64_t p, osize, h, m;
3c1668da
LP
1313 int r;
1314
1315 assert(f);
c88cc6af 1316 assert(f->header);
3c1668da
LP
1317 assert(field && size > 0);
1318
dade37d4
LP
1319 /* If the field hash table is empty, we can't find anything */
1320 if (le64toh(f->header->field_hash_table_size) <= 0)
1321 return 0;
1322
1323 /* Map the field hash table, if it isn't mapped yet. */
1324 r = journal_file_map_field_hash_table(f);
1325 if (r < 0)
1326 return r;
1327
3c1668da
LP
1328 osize = offsetof(Object, field.payload) + size;
1329
805d1486 1330 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
805d1486 1331 if (m <= 0)
3c1668da
LP
1332 return -EBADMSG;
1333
805d1486 1334 h = hash % m;
3c1668da
LP
1335 p = le64toh(f->field_hash_table[h].head_hash_offset);
1336
1337 while (p > 0) {
1338 Object *o;
1339
1340 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1341 if (r < 0)
1342 return r;
1343
1344 if (le64toh(o->field.hash) == hash &&
1345 le64toh(o->object.size) == osize &&
1346 memcmp(o->field.payload, field, size) == 0) {
1347
1348 if (ret)
1349 *ret = o;
1350 if (offset)
1351 *offset = p;
1352
1353 return 1;
1354 }
1355
1356 p = le64toh(o->field.next_hash_offset);
1357 }
1358
1359 return 0;
1360}
1361
1362int journal_file_find_field_object(
1363 JournalFile *f,
1364 const void *field, uint64_t size,
1365 Object **ret, uint64_t *offset) {
1366
1367 uint64_t hash;
1368
1369 assert(f);
1370 assert(field && size > 0);
1371
1372 hash = hash64(field, size);
1373
1374 return journal_file_find_field_object_with_hash(f,
1375 field, size, hash,
1376 ret, offset);
1377}
1378
de190aef
LP
1379int journal_file_find_data_object_with_hash(
1380 JournalFile *f,
1381 const void *data, uint64_t size, uint64_t hash,
1382 Object **ret, uint64_t *offset) {
48496df6 1383
805d1486 1384 uint64_t p, osize, h, m;
cec736d2
LP
1385 int r;
1386
1387 assert(f);
c88cc6af 1388 assert(f->header);
cec736d2
LP
1389 assert(data || size == 0);
1390
dade37d4
LP
1391 /* If there's no data hash table, then there's no entry. */
1392 if (le64toh(f->header->data_hash_table_size) <= 0)
1393 return 0;
1394
1395 /* Map the data hash table, if it isn't mapped yet. */
1396 r = journal_file_map_data_hash_table(f);
1397 if (r < 0)
1398 return r;
1399
cec736d2
LP
1400 osize = offsetof(Object, data.payload) + size;
1401
805d1486
LP
1402 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
1403 if (m <= 0)
bc85bfee
LP
1404 return -EBADMSG;
1405
805d1486 1406 h = hash % m;
de190aef 1407 p = le64toh(f->data_hash_table[h].head_hash_offset);
cec736d2 1408
de190aef
LP
1409 while (p > 0) {
1410 Object *o;
cec736d2 1411
de190aef 1412 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1413 if (r < 0)
1414 return r;
1415
807e17f0 1416 if (le64toh(o->data.hash) != hash)
85a131e8 1417 goto next;
807e17f0 1418
d89c8fdf 1419 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
349cc4a5 1420#if HAVE_XZ || HAVE_LZ4
fa1c4b51 1421 uint64_t l;
a7f7d1bd 1422 size_t rsize = 0;
cec736d2 1423
807e17f0
LP
1424 l = le64toh(o->object.size);
1425 if (l <= offsetof(Object, data.payload))
cec736d2
LP
1426 return -EBADMSG;
1427
807e17f0
LP
1428 l -= offsetof(Object, data.payload);
1429
d89c8fdf
ZJS
1430 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
1431 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
1432 if (r < 0)
1433 return r;
807e17f0 1434
b785c858 1435 if (rsize == size &&
807e17f0
LP
1436 memcmp(f->compress_buffer, data, size) == 0) {
1437
1438 if (ret)
1439 *ret = o;
1440
1441 if (offset)
1442 *offset = p;
1443
1444 return 1;
1445 }
3b1a55e1
ZJS
1446#else
1447 return -EPROTONOSUPPORT;
1448#endif
807e17f0
LP
1449 } else if (le64toh(o->object.size) == osize &&
1450 memcmp(o->data.payload, data, size) == 0) {
1451
cec736d2
LP
1452 if (ret)
1453 *ret = o;
1454
1455 if (offset)
1456 *offset = p;
1457
de190aef 1458 return 1;
cec736d2
LP
1459 }
1460
85a131e8 1461 next:
cec736d2
LP
1462 p = le64toh(o->data.next_hash_offset);
1463 }
1464
de190aef
LP
1465 return 0;
1466}
1467
1468int journal_file_find_data_object(
1469 JournalFile *f,
1470 const void *data, uint64_t size,
1471 Object **ret, uint64_t *offset) {
1472
1473 uint64_t hash;
1474
1475 assert(f);
1476 assert(data || size == 0);
1477
1478 hash = hash64(data, size);
1479
1480 return journal_file_find_data_object_with_hash(f,
1481 data, size, hash,
1482 ret, offset);
1483}
1484
3c1668da
LP
1485static int journal_file_append_field(
1486 JournalFile *f,
1487 const void *field, uint64_t size,
1488 Object **ret, uint64_t *offset) {
1489
1490 uint64_t hash, p;
1491 uint64_t osize;
1492 Object *o;
1493 int r;
1494
1495 assert(f);
1496 assert(field && size > 0);
1497
1498 hash = hash64(field, size);
1499
1500 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
1501 if (r < 0)
1502 return r;
1503 else if (r > 0) {
1504
1505 if (ret)
1506 *ret = o;
1507
1508 if (offset)
1509 *offset = p;
1510
1511 return 0;
1512 }
1513
1514 osize = offsetof(Object, field.payload) + size;
1515 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
8c92d4bb
LP
1516 if (r < 0)
1517 return r;
3c1668da
LP
1518
1519 o->field.hash = htole64(hash);
1520 memcpy(o->field.payload, field, size);
1521
1522 r = journal_file_link_field(f, o, p, hash);
1523 if (r < 0)
1524 return r;
1525
1526 /* The linking might have altered the window, so let's
1527 * refresh our pointer */
1528 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1529 if (r < 0)
1530 return r;
1531
349cc4a5 1532#if HAVE_GCRYPT
3c1668da
LP
1533 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1534 if (r < 0)
1535 return r;
1536#endif
1537
1538 if (ret)
1539 *ret = o;
1540
1541 if (offset)
1542 *offset = p;
1543
1544 return 0;
1545}
1546
48496df6
LP
1547static int journal_file_append_data(
1548 JournalFile *f,
1549 const void *data, uint64_t size,
1550 Object **ret, uint64_t *offset) {
1551
de190aef
LP
1552 uint64_t hash, p;
1553 uint64_t osize;
1554 Object *o;
d89c8fdf 1555 int r, compression = 0;
3c1668da 1556 const void *eq;
de190aef
LP
1557
1558 assert(f);
1559 assert(data || size == 0);
1560
1561 hash = hash64(data, size);
1562
1563 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1564 if (r < 0)
1565 return r;
0240c603 1566 if (r > 0) {
de190aef
LP
1567
1568 if (ret)
1569 *ret = o;
1570
1571 if (offset)
1572 *offset = p;
1573
1574 return 0;
1575 }
1576
1577 osize = offsetof(Object, data.payload) + size;
1578 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
cec736d2
LP
1579 if (r < 0)
1580 return r;
1581
cec736d2 1582 o->data.hash = htole64(hash);
807e17f0 1583
349cc4a5 1584#if HAVE_XZ || HAVE_LZ4
d1afbcd2 1585 if (JOURNAL_FILE_COMPRESS(f) && size >= COMPRESSION_SIZE_THRESHOLD) {
a7f7d1bd 1586 size_t rsize = 0;
807e17f0 1587
5d6f46b6 1588 compression = compress_blob(data, size, o->data.payload, size - 1, &rsize);
807e17f0 1589
d1afbcd2 1590 if (compression >= 0) {
807e17f0 1591 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
d89c8fdf 1592 o->object.flags |= compression;
807e17f0 1593
fa1c4b51 1594 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
d89c8fdf 1595 size, rsize, object_compressed_to_string(compression));
d1afbcd2
LP
1596 } else
1597 /* Compression didn't work, we don't really care why, let's continue without compression */
1598 compression = 0;
807e17f0
LP
1599 }
1600#endif
1601
75f32f04
ZJS
1602 if (compression == 0)
1603 memcpy_safe(o->data.payload, data, size);
cec736d2 1604
de190aef 1605 r = journal_file_link_data(f, o, p, hash);
cec736d2
LP
1606 if (r < 0)
1607 return r;
1608
349cc4a5 1609#if HAVE_GCRYPT
33685a5a
FB
1610 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1611 if (r < 0)
1612 return r;
1613#endif
1614
48496df6
LP
1615 /* The linking might have altered the window, so let's
1616 * refresh our pointer */
1617 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1618 if (r < 0)
1619 return r;
1620
08c6f819
SL
1621 if (!data)
1622 eq = NULL;
1623 else
1624 eq = memchr(data, '=', size);
3c1668da 1625 if (eq && eq > data) {
748db592 1626 Object *fo = NULL;
3c1668da 1627 uint64_t fp;
3c1668da
LP
1628
1629 /* Create field object ... */
1630 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1631 if (r < 0)
1632 return r;
1633
1634 /* ... and link it in. */
1635 o->data.next_field_offset = fo->field.head_data_offset;
1636 fo->field.head_data_offset = le64toh(p);
1637 }
1638
cec736d2
LP
1639 if (ret)
1640 *ret = o;
1641
1642 if (offset)
de190aef 1643 *offset = p;
cec736d2
LP
1644
1645 return 0;
1646}
1647
1648uint64_t journal_file_entry_n_items(Object *o) {
1649 assert(o);
b588975f
LP
1650
1651 if (o->object.type != OBJECT_ENTRY)
1652 return 0;
cec736d2
LP
1653
1654 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1655}
1656
0284adc6 1657uint64_t journal_file_entry_array_n_items(Object *o) {
de190aef 1658 assert(o);
b588975f
LP
1659
1660 if (o->object.type != OBJECT_ENTRY_ARRAY)
1661 return 0;
de190aef
LP
1662
1663 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1664}
1665
fb9a24b6
LP
1666uint64_t journal_file_hash_table_n_items(Object *o) {
1667 assert(o);
b588975f 1668
ec2ce0c5 1669 if (!IN_SET(o->object.type, OBJECT_DATA_HASH_TABLE, OBJECT_FIELD_HASH_TABLE))
b588975f 1670 return 0;
fb9a24b6
LP
1671
1672 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1673}
1674
de190aef 1675static int link_entry_into_array(JournalFile *f,
4fd052ae
FC
1676 le64_t *first,
1677 le64_t *idx,
de190aef 1678 uint64_t p) {
cec736d2 1679 int r;
de190aef
LP
1680 uint64_t n = 0, ap = 0, q, i, a, hidx;
1681 Object *o;
1682
cec736d2 1683 assert(f);
c88cc6af 1684 assert(f->header);
de190aef
LP
1685 assert(first);
1686 assert(idx);
1687 assert(p > 0);
cec736d2 1688
de190aef
LP
1689 a = le64toh(*first);
1690 i = hidx = le64toh(*idx);
1691 while (a > 0) {
1692
1693 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1694 if (r < 0)
1695 return r;
cec736d2 1696
de190aef
LP
1697 n = journal_file_entry_array_n_items(o);
1698 if (i < n) {
1699 o->entry_array.items[i] = htole64(p);
1700 *idx = htole64(hidx + 1);
1701 return 0;
1702 }
cec736d2 1703
de190aef
LP
1704 i -= n;
1705 ap = a;
1706 a = le64toh(o->entry_array.next_entry_array_offset);
1707 }
1708
1709 if (hidx > n)
1710 n = (hidx+1) * 2;
1711 else
1712 n = n * 2;
1713
1714 if (n < 4)
1715 n = 4;
1716
1717 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1718 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1719 &o, &q);
cec736d2
LP
1720 if (r < 0)
1721 return r;
1722
349cc4a5 1723#if HAVE_GCRYPT
5996c7c2 1724 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
b0af6f41
LP
1725 if (r < 0)
1726 return r;
feb12d3e 1727#endif
b0af6f41 1728
de190aef 1729 o->entry_array.items[i] = htole64(p);
cec736d2 1730
de190aef 1731 if (ap == 0)
7be3aa17 1732 *first = htole64(q);
cec736d2 1733 else {
de190aef 1734 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
cec736d2
LP
1735 if (r < 0)
1736 return r;
1737
de190aef
LP
1738 o->entry_array.next_entry_array_offset = htole64(q);
1739 }
cec736d2 1740
2dee23eb
LP
1741 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1742 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1743
de190aef
LP
1744 *idx = htole64(hidx + 1);
1745
1746 return 0;
1747}
cec736d2 1748
de190aef 1749static int link_entry_into_array_plus_one(JournalFile *f,
4fd052ae
FC
1750 le64_t *extra,
1751 le64_t *first,
1752 le64_t *idx,
de190aef
LP
1753 uint64_t p) {
1754
1755 int r;
1756
1757 assert(f);
1758 assert(extra);
1759 assert(first);
1760 assert(idx);
1761 assert(p > 0);
1762
1763 if (*idx == 0)
1764 *extra = htole64(p);
1765 else {
4fd052ae 1766 le64_t i;
de190aef 1767
7be3aa17 1768 i = htole64(le64toh(*idx) - 1);
de190aef
LP
1769 r = link_entry_into_array(f, first, &i, p);
1770 if (r < 0)
1771 return r;
cec736d2
LP
1772 }
1773
de190aef
LP
1774 *idx = htole64(le64toh(*idx) + 1);
1775 return 0;
1776}
1777
1778static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1779 uint64_t p;
1780 int r;
1781 assert(f);
1782 assert(o);
1783 assert(offset > 0);
1784
1785 p = le64toh(o->entry.items[i].object_offset);
1786 if (p == 0)
1787 return -EINVAL;
1788
1789 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1790 if (r < 0)
1791 return r;
1792
de190aef
LP
1793 return link_entry_into_array_plus_one(f,
1794 &o->data.entry_offset,
1795 &o->data.entry_array_offset,
1796 &o->data.n_entries,
1797 offset);
cec736d2
LP
1798}
1799
1800static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
de190aef 1801 uint64_t n, i;
cec736d2
LP
1802 int r;
1803
1804 assert(f);
c88cc6af 1805 assert(f->header);
cec736d2
LP
1806 assert(o);
1807 assert(offset > 0);
b588975f
LP
1808
1809 if (o->object.type != OBJECT_ENTRY)
1810 return -EINVAL;
cec736d2 1811
b788cc23
LP
1812 __sync_synchronize();
1813
cec736d2 1814 /* Link up the entry itself */
de190aef
LP
1815 r = link_entry_into_array(f,
1816 &f->header->entry_array_offset,
1817 &f->header->n_entries,
1818 offset);
1819 if (r < 0)
1820 return r;
cec736d2 1821
507f22bd 1822 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
cec736d2 1823
de190aef 1824 if (f->header->head_entry_realtime == 0)
0ac38b70 1825 f->header->head_entry_realtime = o->entry.realtime;
cec736d2 1826
0ac38b70 1827 f->header->tail_entry_realtime = o->entry.realtime;
de190aef
LP
1828 f->header->tail_entry_monotonic = o->entry.monotonic;
1829
1830 f->tail_entry_monotonic_valid = true;
cec736d2
LP
1831
1832 /* Link up the items */
1833 n = journal_file_entry_n_items(o);
1834 for (i = 0; i < n; i++) {
1835 r = journal_file_link_entry_item(f, o, offset, i);
1836 if (r < 0)
1837 return r;
1838 }
1839
cec736d2
LP
1840 return 0;
1841}
1842
1843static int journal_file_append_entry_internal(
1844 JournalFile *f,
1845 const dual_timestamp *ts,
1846 uint64_t xor_hash,
1847 const EntryItem items[], unsigned n_items,
de190aef 1848 uint64_t *seqnum,
cec736d2
LP
1849 Object **ret, uint64_t *offset) {
1850 uint64_t np;
1851 uint64_t osize;
1852 Object *o;
1853 int r;
1854
1855 assert(f);
c88cc6af 1856 assert(f->header);
cec736d2 1857 assert(items || n_items == 0);
de190aef 1858 assert(ts);
cec736d2
LP
1859
1860 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1861
de190aef 1862 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
cec736d2
LP
1863 if (r < 0)
1864 return r;
1865
d98cc1f2 1866 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
75f32f04 1867 memcpy_safe(o->entry.items, items, n_items * sizeof(EntryItem));
de190aef
LP
1868 o->entry.realtime = htole64(ts->realtime);
1869 o->entry.monotonic = htole64(ts->monotonic);
cec736d2
LP
1870 o->entry.xor_hash = htole64(xor_hash);
1871 o->entry.boot_id = f->header->boot_id;
1872
349cc4a5 1873#if HAVE_GCRYPT
5996c7c2 1874 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
b0af6f41
LP
1875 if (r < 0)
1876 return r;
feb12d3e 1877#endif
b0af6f41 1878
cec736d2
LP
1879 r = journal_file_link_entry(f, o, np);
1880 if (r < 0)
1881 return r;
1882
1883 if (ret)
1884 *ret = o;
1885
1886 if (offset)
1887 *offset = np;
1888
1889 return 0;
1890}
1891
cf244689 1892void journal_file_post_change(JournalFile *f) {
50f20cfd
LP
1893 assert(f);
1894
1895 /* inotify() does not receive IN_MODIFY events from file
1896 * accesses done via mmap(). After each access we hence
1897 * trigger IN_MODIFY by truncating the journal file to its
1898 * current size which triggers IN_MODIFY. */
1899
bc85bfee
LP
1900 __sync_synchronize();
1901
50f20cfd 1902 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
e167d7fd 1903 log_debug_errno(errno, "Failed to truncate file to its own size: %m");
50f20cfd
LP
1904}
1905
7a24f3bf
VC
1906static int post_change_thunk(sd_event_source *timer, uint64_t usec, void *userdata) {
1907 assert(userdata);
1908
1909 journal_file_post_change(userdata);
1910
1911 return 1;
1912}
1913
1914static void schedule_post_change(JournalFile *f) {
1915 sd_event_source *timer;
1916 int enabled, r;
1917 uint64_t now;
1918
1919 assert(f);
1920 assert(f->post_change_timer);
1921
1922 timer = f->post_change_timer;
1923
1924 r = sd_event_source_get_enabled(timer, &enabled);
1925 if (r < 0) {
e167d7fd
LP
1926 log_debug_errno(r, "Failed to get ftruncate timer state: %m");
1927 goto fail;
7a24f3bf
VC
1928 }
1929
1930 if (enabled == SD_EVENT_ONESHOT)
1931 return;
1932
1933 r = sd_event_now(sd_event_source_get_event(timer), CLOCK_MONOTONIC, &now);
1934 if (r < 0) {
e167d7fd
LP
1935 log_debug_errno(r, "Failed to get clock's now for scheduling ftruncate: %m");
1936 goto fail;
7a24f3bf
VC
1937 }
1938
1939 r = sd_event_source_set_time(timer, now+f->post_change_timer_period);
1940 if (r < 0) {
e167d7fd
LP
1941 log_debug_errno(r, "Failed to set time for scheduling ftruncate: %m");
1942 goto fail;
7a24f3bf
VC
1943 }
1944
1945 r = sd_event_source_set_enabled(timer, SD_EVENT_ONESHOT);
1946 if (r < 0) {
e167d7fd
LP
1947 log_debug_errno(r, "Failed to enable scheduled ftruncate: %m");
1948 goto fail;
7a24f3bf 1949 }
e167d7fd
LP
1950
1951 return;
1952
1953fail:
1954 /* On failure, let's simply post the change immediately. */
1955 journal_file_post_change(f);
7a24f3bf
VC
1956}
1957
1958/* Enable coalesced change posting in a timer on the provided sd_event instance */
1959int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t) {
1960 _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL;
1961 int r;
1962
1963 assert(f);
1964 assert_return(!f->post_change_timer, -EINVAL);
1965 assert(e);
1966 assert(t);
1967
1968 r = sd_event_add_time(e, &timer, CLOCK_MONOTONIC, 0, 0, post_change_thunk, f);
1969 if (r < 0)
1970 return r;
1971
1972 r = sd_event_source_set_enabled(timer, SD_EVENT_OFF);
1973 if (r < 0)
1974 return r;
1975
1976 f->post_change_timer = timer;
1977 timer = NULL;
1978 f->post_change_timer_period = t;
1979
1980 return r;
1981}
1982
1f2da9ec
LP
1983static int entry_item_cmp(const void *_a, const void *_b) {
1984 const EntryItem *a = _a, *b = _b;
1985
1986 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1987 return -1;
1988 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1989 return 1;
1990 return 0;
1991}
1992
de190aef 1993int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
cec736d2
LP
1994 unsigned i;
1995 EntryItem *items;
1996 int r;
1997 uint64_t xor_hash = 0;
de190aef 1998 struct dual_timestamp _ts;
cec736d2
LP
1999
2000 assert(f);
c88cc6af 2001 assert(f->header);
cec736d2
LP
2002 assert(iovec || n_iovec == 0);
2003
de190aef
LP
2004 if (!ts) {
2005 dual_timestamp_get(&_ts);
2006 ts = &_ts;
2007 }
2008
349cc4a5 2009#if HAVE_GCRYPT
7560fffc
LP
2010 r = journal_file_maybe_append_tag(f, ts->realtime);
2011 if (r < 0)
2012 return r;
feb12d3e 2013#endif
7560fffc 2014
64825d3c 2015 /* alloca() can't take 0, hence let's allocate at least one */
9607d947 2016 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
cec736d2
LP
2017
2018 for (i = 0; i < n_iovec; i++) {
2019 uint64_t p;
2020 Object *o;
2021
2022 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
2023 if (r < 0)
cf244689 2024 return r;
cec736d2
LP
2025
2026 xor_hash ^= le64toh(o->data.hash);
2027 items[i].object_offset = htole64(p);
de7b95cd 2028 items[i].hash = o->data.hash;
cec736d2
LP
2029 }
2030
1f2da9ec
LP
2031 /* Order by the position on disk, in order to improve seek
2032 * times for rotating media. */
7ff7394d 2033 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1f2da9ec 2034
de190aef 2035 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
cec736d2 2036
fa6ac760
LP
2037 /* If the memory mapping triggered a SIGBUS then we return an
2038 * IO error and ignore the error code passed down to us, since
2039 * it is very likely just an effect of a nullified replacement
2040 * mapping page */
2041
be7cdd8e 2042 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
2043 r = -EIO;
2044
7a24f3bf
VC
2045 if (f->post_change_timer)
2046 schedule_post_change(f);
2047 else
2048 journal_file_post_change(f);
50f20cfd 2049
cec736d2
LP
2050 return r;
2051}
2052
/* One cached position inside a chain of entry array objects, stored in a hashmap with 'first' as
 * the key. */
typedef struct ChainCacheItem {
        /* Offset of the array the chain starts with */
        uint64_t first;
        /* Offset of the array this item caches */
        uint64_t array;
        /* First item stored in the cached array */
        uint64_t begin;
        /* Number of items in all arrays of the chain that precede the cached one */
        uint64_t total;
        /* Index looked at last, used to improve locality when bisecting */
        uint64_t last_index;
} ChainCacheItem;
2060
2061static void chain_cache_put(
4743015d 2062 OrderedHashmap *h,
a4bcff5b
LP
2063 ChainCacheItem *ci,
2064 uint64_t first,
2065 uint64_t array,
2066 uint64_t begin,
f268980d
LP
2067 uint64_t total,
2068 uint64_t last_index) {
a4bcff5b
LP
2069
2070 if (!ci) {
34741aa3
LP
2071 /* If the chain item to cache for this chain is the
2072 * first one it's not worth caching anything */
2073 if (array == first)
2074 return;
2075
29433089 2076 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
4743015d 2077 ci = ordered_hashmap_steal_first(h);
29433089
LP
2078 assert(ci);
2079 } else {
a4bcff5b
LP
2080 ci = new(ChainCacheItem, 1);
2081 if (!ci)
2082 return;
2083 }
2084
2085 ci->first = first;
2086
4743015d 2087 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
a4bcff5b
LP
2088 free(ci);
2089 return;
2090 }
2091 } else
2092 assert(ci->first == first);
2093
2094 ci->array = array;
2095 ci->begin = begin;
2096 ci->total = total;
f268980d 2097 ci->last_index = last_index;
a4bcff5b
LP
2098}
2099
f268980d
LP
2100static int generic_array_get(
2101 JournalFile *f,
2102 uint64_t first,
2103 uint64_t i,
2104 Object **ret, uint64_t *offset) {
de190aef 2105
cec736d2 2106 Object *o;
a4bcff5b 2107 uint64_t p = 0, a, t = 0;
cec736d2 2108 int r;
a4bcff5b 2109 ChainCacheItem *ci;
cec736d2
LP
2110
2111 assert(f);
2112
de190aef 2113 a = first;
a4bcff5b
LP
2114
2115 /* Try the chain cache first */
4743015d 2116 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
2117 if (ci && i > ci->total) {
2118 a = ci->array;
2119 i -= ci->total;
2120 t = ci->total;
2121 }
2122
de190aef 2123 while (a > 0) {
a4bcff5b 2124 uint64_t k;
cec736d2 2125
de190aef
LP
2126 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
2127 if (r < 0)
2128 return r;
cec736d2 2129
a4bcff5b
LP
2130 k = journal_file_entry_array_n_items(o);
2131 if (i < k) {
de190aef 2132 p = le64toh(o->entry_array.items[i]);
a4bcff5b 2133 goto found;
cec736d2
LP
2134 }
2135
a4bcff5b
LP
2136 i -= k;
2137 t += k;
de190aef
LP
2138 a = le64toh(o->entry_array.next_entry_array_offset);
2139 }
2140
a4bcff5b
LP
2141 return 0;
2142
2143found:
2144 /* Let's cache this item for the next invocation */
af13a6b0 2145 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
de190aef
LP
2146
2147 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2148 if (r < 0)
2149 return r;
2150
2151 if (ret)
2152 *ret = o;
2153
2154 if (offset)
2155 *offset = p;
2156
2157 return 1;
2158}
2159
f268980d
LP
2160static int generic_array_get_plus_one(
2161 JournalFile *f,
2162 uint64_t extra,
2163 uint64_t first,
2164 uint64_t i,
2165 Object **ret, uint64_t *offset) {
de190aef
LP
2166
2167 Object *o;
2168
2169 assert(f);
2170
2171 if (i == 0) {
2172 int r;
2173
2174 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
cec736d2
LP
2175 if (r < 0)
2176 return r;
2177
de190aef
LP
2178 if (ret)
2179 *ret = o;
cec736d2 2180
de190aef
LP
2181 if (offset)
2182 *offset = extra;
cec736d2 2183
de190aef 2184 return 1;
cec736d2
LP
2185 }
2186
de190aef
LP
2187 return generic_array_get(f, first, i-1, ret, offset);
2188}
cec736d2 2189
de190aef
LP
/* Results returned by the test_object callbacks used for bisection. */
enum {
        TEST_FOUND = 0, /* the tested object matches the needle */
        TEST_LEFT  = 1, /* the tested object sorts before the needle */
        TEST_RIGHT = 2  /* the tested object sorts after the needle */
};
cec736d2 2195
f268980d
LP
2196static int generic_array_bisect(
2197 JournalFile *f,
2198 uint64_t first,
2199 uint64_t n,
2200 uint64_t needle,
2201 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2202 direction_t direction,
2203 Object **ret,
2204 uint64_t *offset,
2205 uint64_t *idx) {
2206
2207 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
de190aef
LP
2208 bool subtract_one = false;
2209 Object *o, *array = NULL;
2210 int r;
a4bcff5b 2211 ChainCacheItem *ci;
cec736d2 2212
de190aef
LP
2213 assert(f);
2214 assert(test_object);
cec736d2 2215
a4bcff5b 2216 /* Start with the first array in the chain */
de190aef 2217 a = first;
a4bcff5b 2218
4743015d 2219 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
2220 if (ci && n > ci->total) {
2221 /* Ah, we have iterated this bisection array chain
2222 * previously! Let's see if we can skip ahead in the
2223 * chain, as far as the last time. But we can't jump
2224 * backwards in the chain, so let's check that
2225 * first. */
2226
2227 r = test_object(f, ci->begin, needle);
2228 if (r < 0)
2229 return r;
2230
2231 if (r == TEST_LEFT) {
f268980d 2232 /* OK, what we are looking for is right of the
a4bcff5b
LP
2233 * begin of this EntryArray, so let's jump
2234 * straight to previously cached array in the
2235 * chain */
2236
2237 a = ci->array;
2238 n -= ci->total;
2239 t = ci->total;
f268980d 2240 last_index = ci->last_index;
a4bcff5b
LP
2241 }
2242 }
2243
de190aef
LP
2244 while (a > 0) {
2245 uint64_t left, right, k, lp;
2246
2247 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
cec736d2
LP
2248 if (r < 0)
2249 return r;
2250
de190aef
LP
2251 k = journal_file_entry_array_n_items(array);
2252 right = MIN(k, n);
2253 if (right <= 0)
2254 return 0;
cec736d2 2255
de190aef
LP
2256 i = right - 1;
2257 lp = p = le64toh(array->entry_array.items[i]);
2258 if (p <= 0)
bee6a291
LP
2259 r = -EBADMSG;
2260 else
2261 r = test_object(f, p, needle);
2262 if (r == -EBADMSG) {
2263 log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (1)");
2264 n = i;
2265 continue;
2266 }
de190aef
LP
2267 if (r < 0)
2268 return r;
cec736d2 2269
de190aef
LP
2270 if (r == TEST_FOUND)
2271 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2272
2273 if (r == TEST_RIGHT) {
2274 left = 0;
2275 right -= 1;
f268980d
LP
2276
2277 if (last_index != (uint64_t) -1) {
2278 assert(last_index <= right);
2279
2280 /* If we cached the last index we
2281 * looked at, let's try to not to jump
2282 * too wildly around and see if we can
2283 * limit the range to look at early to
2284 * the immediate neighbors of the last
2285 * index we looked at. */
2286
2287 if (last_index > 0) {
2288 uint64_t x = last_index - 1;
2289
2290 p = le64toh(array->entry_array.items[x]);
2291 if (p <= 0)
2292 return -EBADMSG;
2293
2294 r = test_object(f, p, needle);
2295 if (r < 0)
2296 return r;
2297
2298 if (r == TEST_FOUND)
2299 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2300
2301 if (r == TEST_RIGHT)
2302 right = x;
2303 else
2304 left = x + 1;
2305 }
2306
2307 if (last_index < right) {
2308 uint64_t y = last_index + 1;
2309
2310 p = le64toh(array->entry_array.items[y]);
2311 if (p <= 0)
2312 return -EBADMSG;
2313
2314 r = test_object(f, p, needle);
2315 if (r < 0)
2316 return r;
2317
2318 if (r == TEST_FOUND)
2319 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2320
2321 if (r == TEST_RIGHT)
2322 right = y;
2323 else
2324 left = y + 1;
2325 }
f268980d
LP
2326 }
2327
de190aef
LP
2328 for (;;) {
2329 if (left == right) {
2330 if (direction == DIRECTION_UP)
2331 subtract_one = true;
2332
2333 i = left;
2334 goto found;
2335 }
2336
2337 assert(left < right);
de190aef 2338 i = (left + right) / 2;
f268980d 2339
de190aef
LP
2340 p = le64toh(array->entry_array.items[i]);
2341 if (p <= 0)
bee6a291
LP
2342 r = -EBADMSG;
2343 else
2344 r = test_object(f, p, needle);
2345 if (r == -EBADMSG) {
2346 log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (2)");
2347 right = n = i;
2348 continue;
2349 }
de190aef
LP
2350 if (r < 0)
2351 return r;
cec736d2 2352
de190aef
LP
2353 if (r == TEST_FOUND)
2354 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2355
2356 if (r == TEST_RIGHT)
2357 right = i;
2358 else
2359 left = i + 1;
2360 }
2361 }
2362
2173cbf8 2363 if (k >= n) {
cbdca852
LP
2364 if (direction == DIRECTION_UP) {
2365 i = n;
2366 subtract_one = true;
2367 goto found;
2368 }
2369
cec736d2 2370 return 0;
cbdca852 2371 }
cec736d2 2372
de190aef
LP
2373 last_p = lp;
2374
2375 n -= k;
2376 t += k;
f268980d 2377 last_index = (uint64_t) -1;
de190aef 2378 a = le64toh(array->entry_array.next_entry_array_offset);
cec736d2
LP
2379 }
2380
2381 return 0;
de190aef
LP
2382
2383found:
2384 if (subtract_one && t == 0 && i == 0)
2385 return 0;
2386
a4bcff5b 2387 /* Let's cache this item for the next invocation */
af13a6b0 2388 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
a4bcff5b 2389
de190aef
LP
2390 if (subtract_one && i == 0)
2391 p = last_p;
2392 else if (subtract_one)
2393 p = le64toh(array->entry_array.items[i-1]);
2394 else
2395 p = le64toh(array->entry_array.items[i]);
2396
2397 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2398 if (r < 0)
2399 return r;
2400
2401 if (ret)
2402 *ret = o;
2403
2404 if (offset)
2405 *offset = p;
2406
2407 if (idx)
cbdca852 2408 *idx = t + i + (subtract_one ? -1 : 0);
de190aef
LP
2409
2410 return 1;
cec736d2
LP
2411}
2412
f268980d
LP
2413static int generic_array_bisect_plus_one(
2414 JournalFile *f,
2415 uint64_t extra,
2416 uint64_t first,
2417 uint64_t n,
2418 uint64_t needle,
2419 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2420 direction_t direction,
2421 Object **ret,
2422 uint64_t *offset,
2423 uint64_t *idx) {
de190aef 2424
cec736d2 2425 int r;
cbdca852
LP
2426 bool step_back = false;
2427 Object *o;
cec736d2
LP
2428
2429 assert(f);
de190aef 2430 assert(test_object);
cec736d2 2431
de190aef
LP
2432 if (n <= 0)
2433 return 0;
cec736d2 2434
de190aef
LP
2435 /* This bisects the array in object 'first', but first checks
2436 * an extra */
de190aef
LP
2437 r = test_object(f, extra, needle);
2438 if (r < 0)
2439 return r;
a536e261
LP
2440
2441 if (r == TEST_FOUND)
2442 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2443
cbdca852
LP
2444 /* if we are looking with DIRECTION_UP then we need to first
2445 see if in the actual array there is a matching entry, and
2446 return the last one of that. But if there isn't any we need
2447 to return this one. Hence remember this, and return it
2448 below. */
2449 if (r == TEST_LEFT)
2450 step_back = direction == DIRECTION_UP;
de190aef 2451
cbdca852
LP
2452 if (r == TEST_RIGHT) {
2453 if (direction == DIRECTION_DOWN)
2454 goto found;
2455 else
2456 return 0;
a536e261 2457 }
cec736d2 2458
de190aef
LP
2459 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
2460
cbdca852
LP
2461 if (r == 0 && step_back)
2462 goto found;
2463
ecf68b1d 2464 if (r > 0 && idx)
313cefa1 2465 (*idx)++;
de190aef
LP
2466
2467 return r;
cbdca852
LP
2468
2469found:
2470 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
2471 if (r < 0)
2472 return r;
2473
2474 if (ret)
2475 *ret = o;
2476
2477 if (offset)
2478 *offset = extra;
2479
2480 if (idx)
2481 *idx = 0;
2482
2483 return 1;
2484}
2485
44a6b1b6 2486_pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
cbdca852
LP
2487 assert(f);
2488 assert(p > 0);
2489
2490 if (p == needle)
2491 return TEST_FOUND;
2492 else if (p < needle)
2493 return TEST_LEFT;
2494 else
2495 return TEST_RIGHT;
2496}
2497
de190aef
LP
2498static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
2499 Object *o;
2500 int r;
2501
2502 assert(f);
2503 assert(p > 0);
2504
2505 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
cec736d2
LP
2506 if (r < 0)
2507 return r;
2508
de190aef
LP
2509 if (le64toh(o->entry.seqnum) == needle)
2510 return TEST_FOUND;
2511 else if (le64toh(o->entry.seqnum) < needle)
2512 return TEST_LEFT;
2513 else
2514 return TEST_RIGHT;
2515}
cec736d2 2516
de190aef
LP
2517int journal_file_move_to_entry_by_seqnum(
2518 JournalFile *f,
2519 uint64_t seqnum,
2520 direction_t direction,
2521 Object **ret,
2522 uint64_t *offset) {
c88cc6af
VC
2523 assert(f);
2524 assert(f->header);
de190aef
LP
2525
2526 return generic_array_bisect(f,
2527 le64toh(f->header->entry_array_offset),
2528 le64toh(f->header->n_entries),
2529 seqnum,
2530 test_object_seqnum,
2531 direction,
2532 ret, offset, NULL);
2533}
cec736d2 2534
de190aef
LP
2535static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
2536 Object *o;
2537 int r;
2538
2539 assert(f);
2540 assert(p > 0);
2541
2542 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2543 if (r < 0)
2544 return r;
2545
2546 if (le64toh(o->entry.realtime) == needle)
2547 return TEST_FOUND;
2548 else if (le64toh(o->entry.realtime) < needle)
2549 return TEST_LEFT;
2550 else
2551 return TEST_RIGHT;
cec736d2
LP
2552}
2553
de190aef
LP
2554int journal_file_move_to_entry_by_realtime(
2555 JournalFile *f,
2556 uint64_t realtime,
2557 direction_t direction,
2558 Object **ret,
2559 uint64_t *offset) {
c88cc6af
VC
2560 assert(f);
2561 assert(f->header);
de190aef
LP
2562
2563 return generic_array_bisect(f,
2564 le64toh(f->header->entry_array_offset),
2565 le64toh(f->header->n_entries),
2566 realtime,
2567 test_object_realtime,
2568 direction,
2569 ret, offset, NULL);
2570}
2571
2572static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
2573 Object *o;
2574 int r;
2575
2576 assert(f);
2577 assert(p > 0);
2578
2579 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2580 if (r < 0)
2581 return r;
2582
2583 if (le64toh(o->entry.monotonic) == needle)
2584 return TEST_FOUND;
2585 else if (le64toh(o->entry.monotonic) < needle)
2586 return TEST_LEFT;
2587 else
2588 return TEST_RIGHT;
2589}
2590
2a560338 2591static int find_data_object_by_boot_id(
47838ab3
ZJS
2592 JournalFile *f,
2593 sd_id128_t boot_id,
2594 Object **o,
2595 uint64_t *b) {
2a560338 2596
fbd0b64f 2597 char t[STRLEN("_BOOT_ID=") + 32 + 1] = "_BOOT_ID=";
47838ab3
ZJS
2598
2599 sd_id128_to_string(boot_id, t + 9);
2600 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2601}
2602
de190aef
LP
2603int journal_file_move_to_entry_by_monotonic(
2604 JournalFile *f,
2605 sd_id128_t boot_id,
2606 uint64_t monotonic,
2607 direction_t direction,
2608 Object **ret,
2609 uint64_t *offset) {
2610
de190aef
LP
2611 Object *o;
2612 int r;
2613
cbdca852 2614 assert(f);
de190aef 2615
47838ab3 2616 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
de190aef
LP
2617 if (r < 0)
2618 return r;
cbdca852 2619 if (r == 0)
de190aef
LP
2620 return -ENOENT;
2621
2622 return generic_array_bisect_plus_one(f,
2623 le64toh(o->data.entry_offset),
2624 le64toh(o->data.entry_array_offset),
2625 le64toh(o->data.n_entries),
2626 monotonic,
2627 test_object_monotonic,
2628 direction,
2629 ret, offset, NULL);
2630}
2631
1fc605b0 2632void journal_file_reset_location(JournalFile *f) {
6573ef05 2633 f->location_type = LOCATION_HEAD;
1fc605b0 2634 f->current_offset = 0;
6573ef05
MS
2635 f->current_seqnum = 0;
2636 f->current_realtime = 0;
2637 f->current_monotonic = 0;
2638 zero(f->current_boot_id);
2639 f->current_xor_hash = 0;
2640}
2641
950c07d4 2642void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
6573ef05
MS
2643 f->location_type = LOCATION_SEEK;
2644 f->current_offset = offset;
2645 f->current_seqnum = le64toh(o->entry.seqnum);
2646 f->current_realtime = le64toh(o->entry.realtime);
2647 f->current_monotonic = le64toh(o->entry.monotonic);
2648 f->current_boot_id = o->entry.boot_id;
2649 f->current_xor_hash = le64toh(o->entry.xor_hash);
1fc605b0
MS
2650}
2651
d8ae66d7
MS
2652int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2653 assert(af);
c88cc6af 2654 assert(af->header);
d8ae66d7 2655 assert(bf);
c88cc6af 2656 assert(bf->header);
d8ae66d7
MS
2657 assert(af->location_type == LOCATION_SEEK);
2658 assert(bf->location_type == LOCATION_SEEK);
2659
2660 /* If contents and timestamps match, these entries are
2661 * identical, even if the seqnum does not match */
2662 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2663 af->current_monotonic == bf->current_monotonic &&
2664 af->current_realtime == bf->current_realtime &&
2665 af->current_xor_hash == bf->current_xor_hash)
2666 return 0;
2667
2668 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2669
2670 /* If this is from the same seqnum source, compare
2671 * seqnums */
2672 if (af->current_seqnum < bf->current_seqnum)
2673 return -1;
2674 if (af->current_seqnum > bf->current_seqnum)
2675 return 1;
2676
2677 /* Wow! This is weird, different data but the same
2678 * seqnums? Something is borked, but let's make the
2679 * best of it and compare by time. */
2680 }
2681
2682 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2683
2684 /* If the boot id matches, compare monotonic time */
2685 if (af->current_monotonic < bf->current_monotonic)
2686 return -1;
2687 if (af->current_monotonic > bf->current_monotonic)
2688 return 1;
2689 }
2690
2691 /* Otherwise, compare UTC time */
2692 if (af->current_realtime < bf->current_realtime)
2693 return -1;
2694 if (af->current_realtime > bf->current_realtime)
2695 return 1;
2696
2697 /* Finally, compare by contents */
2698 if (af->current_xor_hash < bf->current_xor_hash)
2699 return -1;
2700 if (af->current_xor_hash > bf->current_xor_hash)
2701 return 1;
2702
2703 return 0;
2704}
2705
aa598ba5
LP
2706static int bump_array_index(uint64_t *i, direction_t direction, uint64_t n) {
2707
2708 /* Increase or decrease the specified index, in the right direction. */
2709
2710 if (direction == DIRECTION_DOWN) {
2711 if (*i >= n - 1)
2712 return 0;
2713
2714 (*i) ++;
2715 } else {
2716 if (*i <= 0)
2717 return 0;
2718
2719 (*i) --;
2720 }
2721
2722 return 1;
2723}
2724
b6da4ed0
LP
2725static bool check_properly_ordered(uint64_t new_offset, uint64_t old_offset, direction_t direction) {
2726
2727 /* Consider it an error if any of the two offsets is uninitialized */
2728 if (old_offset == 0 || new_offset == 0)
2729 return false;
2730
2731 /* If we go down, the new offset must be larger than the old one. */
2732 return direction == DIRECTION_DOWN ?
2733 new_offset > old_offset :
2734 new_offset < old_offset;
2735}
2736
de190aef
LP
2737int journal_file_next_entry(
2738 JournalFile *f,
f534928a 2739 uint64_t p,
de190aef
LP
2740 direction_t direction,
2741 Object **ret, uint64_t *offset) {
2742
fb099c8d 2743 uint64_t i, n, ofs;
cec736d2
LP
2744 int r;
2745
2746 assert(f);
c88cc6af 2747 assert(f->header);
de190aef
LP
2748
2749 n = le64toh(f->header->n_entries);
2750 if (n <= 0)
2751 return 0;
cec736d2 2752
f534928a 2753 if (p == 0)
de190aef 2754 i = direction == DIRECTION_DOWN ? 0 : n - 1;
cec736d2 2755 else {
de190aef
LP
2756 r = generic_array_bisect(f,
2757 le64toh(f->header->entry_array_offset),
2758 le64toh(f->header->n_entries),
2759 p,
2760 test_object_offset,
2761 DIRECTION_DOWN,
2762 NULL, NULL,
2763 &i);
2764 if (r <= 0)
2765 return r;
2766
aa598ba5
LP
2767 r = bump_array_index(&i, direction, n);
2768 if (r <= 0)
2769 return r;
cec736d2
LP
2770 }
2771
de190aef 2772 /* And jump to it */
989793d3
LP
2773 for (;;) {
2774 r = generic_array_get(f,
2775 le64toh(f->header->entry_array_offset),
2776 i,
2777 ret, &ofs);
2778 if (r > 0)
2779 break;
2780 if (r != -EBADMSG)
2781 return r;
2782
2783 /* OK, so this entry is borked. Most likely some entry didn't get synced to disk properly, let's see if
2784 * the next one might work for us instead. */
2785 log_debug_errno(r, "Entry item %" PRIu64 " is bad, skipping over it.", i);
2786
2787 r = bump_array_index(&i, direction, n);
2788 if (r <= 0)
2789 return r;
caeab8f6 2790 }
fb099c8d 2791
b6da4ed0
LP
2792 /* Ensure our array is properly ordered. */
2793 if (p > 0 && !check_properly_ordered(ofs, p, direction)) {
2794 log_debug("%s: entry array not properly ordered at entry %" PRIu64, f->path, i);
fb099c8d
ZJS
2795 return -EBADMSG;
2796 }
2797
2798 if (offset)
2799 *offset = ofs;
2800
2801 return 1;
de190aef 2802}
cec736d2 2803
de190aef
LP
2804int journal_file_next_entry_for_data(
2805 JournalFile *f,
2806 Object *o, uint64_t p,
2807 uint64_t data_offset,
2808 direction_t direction,
2809 Object **ret, uint64_t *offset) {
2810
ded5034e 2811 uint64_t i, n, ofs;
de190aef 2812 Object *d;
989793d3 2813 int r;
cec736d2
LP
2814
2815 assert(f);
de190aef 2816 assert(p > 0 || !o);
cec736d2 2817
de190aef 2818 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
466ccd92 2819 if (r < 0)
de190aef 2820 return r;
cec736d2 2821
de190aef
LP
2822 n = le64toh(d->data.n_entries);
2823 if (n <= 0)
2824 return n;
cec736d2 2825
de190aef
LP
2826 if (!o)
2827 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2828 else {
2829 if (o->object.type != OBJECT_ENTRY)
2830 return -EINVAL;
cec736d2 2831
de190aef
LP
2832 r = generic_array_bisect_plus_one(f,
2833 le64toh(d->data.entry_offset),
2834 le64toh(d->data.entry_array_offset),
2835 le64toh(d->data.n_entries),
2836 p,
2837 test_object_offset,
2838 DIRECTION_DOWN,
2839 NULL, NULL,
2840 &i);
2841
2842 if (r <= 0)
cec736d2
LP
2843 return r;
2844
aa598ba5
LP
2845 r = bump_array_index(&i, direction, n);
2846 if (r <= 0)
2847 return r;
de190aef 2848 }
cec736d2 2849
989793d3
LP
2850 for (;;) {
2851 r = generic_array_get_plus_one(f,
2852 le64toh(d->data.entry_offset),
2853 le64toh(d->data.entry_array_offset),
2854 i,
2855 ret, &ofs);
2856 if (r > 0)
2857 break;
2858 if (r != -EBADMSG)
2859 return r;
2860
2861 log_debug_errno(r, "Data entry item %" PRIu64 " is bad, skipping over it.", i);
2862
2863 r = bump_array_index(&i, direction, n);
2864 if (r <= 0)
2865 return r;
2866 }
ded5034e
LP
2867
2868 /* Ensure our array is properly ordered. */
2869 if (p > 0 && check_properly_ordered(ofs, p, direction)) {
2870 log_debug("%s data entry array not properly ordered at entry %" PRIu64, f->path, i);
2871 return -EBADMSG;
2872 }
2873
2874 if (offset)
2875 *offset = ofs;
2876
2877 return 1;
de190aef 2878}
cec736d2 2879
cbdca852
LP
2880int journal_file_move_to_entry_by_offset_for_data(
2881 JournalFile *f,
2882 uint64_t data_offset,
2883 uint64_t p,
2884 direction_t direction,
2885 Object **ret, uint64_t *offset) {
2886
2887 int r;
2888 Object *d;
2889
2890 assert(f);
2891
2892 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2893 if (r < 0)
2894 return r;
2895
2896 return generic_array_bisect_plus_one(f,
2897 le64toh(d->data.entry_offset),
2898 le64toh(d->data.entry_array_offset),
2899 le64toh(d->data.n_entries),
2900 p,
2901 test_object_offset,
2902 direction,
2903 ret, offset, NULL);
2904}
2905
2906int journal_file_move_to_entry_by_monotonic_for_data(
2907 JournalFile *f,
2908 uint64_t data_offset,
2909 sd_id128_t boot_id,
2910 uint64_t monotonic,
2911 direction_t direction,
2912 Object **ret, uint64_t *offset) {
2913
cbdca852
LP
2914 Object *o, *d;
2915 int r;
2916 uint64_t b, z;
2917
2918 assert(f);
2919
2920 /* First, seek by time */
47838ab3 2921 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
cbdca852
LP
2922 if (r < 0)
2923 return r;
2924 if (r == 0)
2925 return -ENOENT;
2926
2927 r = generic_array_bisect_plus_one(f,
2928 le64toh(o->data.entry_offset),
2929 le64toh(o->data.entry_array_offset),
2930 le64toh(o->data.n_entries),
2931 monotonic,
2932 test_object_monotonic,
2933 direction,
2934 NULL, &z, NULL);
2935 if (r <= 0)
2936 return r;
2937
2938 /* And now, continue seeking until we find an entry that
2939 * exists in both bisection arrays */
2940
2941 for (;;) {
2942 Object *qo;
2943 uint64_t p, q;
2944
2945 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2946 if (r < 0)
2947 return r;
2948
2949 r = generic_array_bisect_plus_one(f,
2950 le64toh(d->data.entry_offset),
2951 le64toh(d->data.entry_array_offset),
2952 le64toh(d->data.n_entries),
2953 z,
2954 test_object_offset,
2955 direction,
2956 NULL, &p, NULL);
2957 if (r <= 0)
2958 return r;
2959
2960 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2961 if (r < 0)
2962 return r;
2963
2964 r = generic_array_bisect_plus_one(f,
2965 le64toh(o->data.entry_offset),
2966 le64toh(o->data.entry_array_offset),
2967 le64toh(o->data.n_entries),
2968 p,
2969 test_object_offset,
2970 direction,
2971 &qo, &q, NULL);
2972
2973 if (r <= 0)
2974 return r;
2975
2976 if (p == q) {
2977 if (ret)
2978 *ret = qo;
2979 if (offset)
2980 *offset = q;
2981
2982 return 1;
2983 }
2984
2985 z = q;
2986 }
cbdca852
LP
2987}
2988
de190aef
LP
2989int journal_file_move_to_entry_by_seqnum_for_data(
2990 JournalFile *f,
2991 uint64_t data_offset,
2992 uint64_t seqnum,
2993 direction_t direction,
2994 Object **ret, uint64_t *offset) {
cec736d2 2995
de190aef
LP
2996 Object *d;
2997 int r;
cec736d2 2998
91a31dde
LP
2999 assert(f);
3000
de190aef 3001 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 3002 if (r < 0)
de190aef 3003 return r;
cec736d2 3004
de190aef
LP
3005 return generic_array_bisect_plus_one(f,
3006 le64toh(d->data.entry_offset),
3007 le64toh(d->data.entry_array_offset),
3008 le64toh(d->data.n_entries),
3009 seqnum,
3010 test_object_seqnum,
3011 direction,
3012 ret, offset, NULL);
3013}
cec736d2 3014
de190aef
LP
3015int journal_file_move_to_entry_by_realtime_for_data(
3016 JournalFile *f,
3017 uint64_t data_offset,
3018 uint64_t realtime,
3019 direction_t direction,
3020 Object **ret, uint64_t *offset) {
3021
3022 Object *d;
3023 int r;
3024
91a31dde
LP
3025 assert(f);
3026
de190aef 3027 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 3028 if (r < 0)
de190aef
LP
3029 return r;
3030
3031 return generic_array_bisect_plus_one(f,
3032 le64toh(d->data.entry_offset),
3033 le64toh(d->data.entry_array_offset),
3034 le64toh(d->data.n_entries),
3035 realtime,
3036 test_object_realtime,
3037 direction,
3038 ret, offset, NULL);
cec736d2
LP
3039}
3040
0284adc6 3041void journal_file_dump(JournalFile *f) {
7560fffc 3042 Object *o;
7560fffc 3043 int r;
0284adc6 3044 uint64_t p;
7560fffc
LP
3045
3046 assert(f);
c88cc6af 3047 assert(f->header);
7560fffc 3048
0284adc6 3049 journal_file_print_header(f);
7560fffc 3050
0284adc6
LP
3051 p = le64toh(f->header->header_size);
3052 while (p != 0) {
d05089d8 3053 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
0284adc6
LP
3054 if (r < 0)
3055 goto fail;
7560fffc 3056
0284adc6 3057 switch (o->object.type) {
d98cc1f2 3058
0284adc6
LP
3059 case OBJECT_UNUSED:
3060 printf("Type: OBJECT_UNUSED\n");
3061 break;
d98cc1f2 3062
0284adc6
LP
3063 case OBJECT_DATA:
3064 printf("Type: OBJECT_DATA\n");
3065 break;
7560fffc 3066
3c1668da
LP
3067 case OBJECT_FIELD:
3068 printf("Type: OBJECT_FIELD\n");
3069 break;
3070
0284adc6 3071 case OBJECT_ENTRY:
507f22bd
ZJS
3072 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
3073 le64toh(o->entry.seqnum),
3074 le64toh(o->entry.monotonic),
3075 le64toh(o->entry.realtime));
0284adc6 3076 break;
7560fffc 3077
0284adc6
LP
3078 case OBJECT_FIELD_HASH_TABLE:
3079 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
3080 break;
7560fffc 3081
0284adc6
LP
3082 case OBJECT_DATA_HASH_TABLE:
3083 printf("Type: OBJECT_DATA_HASH_TABLE\n");
3084 break;
7560fffc 3085
0284adc6
LP
3086 case OBJECT_ENTRY_ARRAY:
3087 printf("Type: OBJECT_ENTRY_ARRAY\n");
3088 break;
7560fffc 3089
0284adc6 3090 case OBJECT_TAG:
507f22bd
ZJS
3091 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
3092 le64toh(o->tag.seqnum),
3093 le64toh(o->tag.epoch));
0284adc6 3094 break;
3c1668da
LP
3095
3096 default:
8facc349 3097 printf("Type: unknown (%i)\n", o->object.type);
3c1668da 3098 break;
0284adc6 3099 }
7560fffc 3100
d89c8fdf
ZJS
3101 if (o->object.flags & OBJECT_COMPRESSION_MASK)
3102 printf("Flags: %s\n",
3103 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
7560fffc 3104
0284adc6
LP
3105 if (p == le64toh(f->header->tail_object_offset))
3106 p = 0;
3107 else
3108 p = p + ALIGN64(le64toh(o->object.size));
3109 }
7560fffc 3110
0284adc6
LP
3111 return;
3112fail:
3113 log_error("File corrupt");
7560fffc
LP
3114}
3115
718fe4b1
ZJS
3116static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
3117 const char *x;
3118
3119 x = format_timestamp(buf, l, t);
3120 if (x)
3121 return x;
3122 return " --- ";
3123}
3124
0284adc6 3125void journal_file_print_header(JournalFile *f) {
2765b7bb 3126 char a[33], b[33], c[33], d[33];
ed375beb 3127 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
a1a03e30
LP
3128 struct stat st;
3129 char bytes[FORMAT_BYTES_MAX];
7560fffc
LP
3130
3131 assert(f);
c88cc6af 3132 assert(f->header);
7560fffc 3133
0284adc6
LP
3134 printf("File Path: %s\n"
3135 "File ID: %s\n"
3136 "Machine ID: %s\n"
3137 "Boot ID: %s\n"
3138 "Sequential Number ID: %s\n"
3139 "State: %s\n"
3140 "Compatible Flags:%s%s\n"
d89c8fdf 3141 "Incompatible Flags:%s%s%s\n"
507f22bd
ZJS
3142 "Header size: %"PRIu64"\n"
3143 "Arena size: %"PRIu64"\n"
3144 "Data Hash Table Size: %"PRIu64"\n"
3145 "Field Hash Table Size: %"PRIu64"\n"
0284adc6 3146 "Rotate Suggested: %s\n"
0808b92f
LP
3147 "Head Sequential Number: %"PRIu64" (%"PRIx64")\n"
3148 "Tail Sequential Number: %"PRIu64" (%"PRIx64")\n"
3149 "Head Realtime Timestamp: %s (%"PRIx64")\n"
3150 "Tail Realtime Timestamp: %s (%"PRIx64")\n"
3151 "Tail Monotonic Timestamp: %s (%"PRIx64")\n"
507f22bd
ZJS
3152 "Objects: %"PRIu64"\n"
3153 "Entry Objects: %"PRIu64"\n",
0284adc6
LP
3154 f->path,
3155 sd_id128_to_string(f->header->file_id, a),
3156 sd_id128_to_string(f->header->machine_id, b),
3157 sd_id128_to_string(f->header->boot_id, c),
2765b7bb 3158 sd_id128_to_string(f->header->seqnum_id, d),
3223f44f
LP
3159 f->header->state == STATE_OFFLINE ? "OFFLINE" :
3160 f->header->state == STATE_ONLINE ? "ONLINE" :
3161 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
8088cbd3 3162 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
d89c8fdf
ZJS
3163 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
3164 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
3165 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
3166 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
507f22bd
ZJS
3167 le64toh(f->header->header_size),
3168 le64toh(f->header->arena_size),
3169 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3170 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
fb0951b0 3171 yes_no(journal_file_rotate_suggested(f, 0)),
0808b92f
LP
3172 le64toh(f->header->head_entry_seqnum), le64toh(f->header->head_entry_seqnum),
3173 le64toh(f->header->tail_entry_seqnum), le64toh(f->header->tail_entry_seqnum),
3174 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)), le64toh(f->header->head_entry_realtime),
3175 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)), le64toh(f->header->tail_entry_realtime),
3176 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC), le64toh(f->header->tail_entry_monotonic),
507f22bd
ZJS
3177 le64toh(f->header->n_objects),
3178 le64toh(f->header->n_entries));
7560fffc 3179
0284adc6 3180 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
507f22bd 3181 printf("Data Objects: %"PRIu64"\n"
0284adc6 3182 "Data Hash Table Fill: %.1f%%\n",
507f22bd 3183 le64toh(f->header->n_data),
0284adc6 3184 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
7560fffc 3185
0284adc6 3186 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
507f22bd 3187 printf("Field Objects: %"PRIu64"\n"
0284adc6 3188 "Field Hash Table Fill: %.1f%%\n",
507f22bd 3189 le64toh(f->header->n_fields),
0284adc6 3190 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3223f44f
LP
3191
3192 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
507f22bd
ZJS
3193 printf("Tag Objects: %"PRIu64"\n",
3194 le64toh(f->header->n_tags));
3223f44f 3195 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
507f22bd
ZJS
3196 printf("Entry Array Objects: %"PRIu64"\n",
3197 le64toh(f->header->n_entry_arrays));
a1a03e30
LP
3198
3199 if (fstat(f->fd, &st) >= 0)
59f448cf 3200 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
7560fffc
LP
3201}
3202
fc68c929
LP
3203static int journal_file_warn_btrfs(JournalFile *f) {
3204 unsigned attrs;
3205 int r;
3206
3207 assert(f);
3208
3209 /* Before we write anything, check if the COW logic is turned
3210 * off on btrfs. Given our write pattern that is quite
3211 * unfriendly to COW file systems this should greatly improve
3212 * performance on COW file systems, such as btrfs, at the
3213 * expense of data integrity features (which shouldn't be too
3214 * bad, given that we do our own checksumming). */
3215
3216 r = btrfs_is_filesystem(f->fd);
3217 if (r < 0)
3218 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
3219 if (!r)
3220 return 0;
3221
3222 r = read_attr_fd(f->fd, &attrs);
3223 if (r < 0)
3224 return log_warning_errno(r, "Failed to read file attributes: %m");
3225
3226 if (attrs & FS_NOCOW_FL) {
3227 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
3228 return 0;
3229 }
3230
3231 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
3232 "This is likely to slow down journal access substantially, please consider turning "
3233 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
3234
3235 return 1;
3236}
3237
0284adc6 3238int journal_file_open(
5d1ce257 3239 int fd,
0284adc6
LP
3240 const char *fname,
3241 int flags,
3242 mode_t mode,
3243 bool compress,
baed47c3 3244 bool seal,
0284adc6
LP
3245 JournalMetrics *metrics,
3246 MMapCache *mmap_cache,
b58c888f 3247 Set *deferred_closes,
0284adc6
LP
3248 JournalFile *template,
3249 JournalFile **ret) {
7560fffc 3250
fa6ac760 3251 bool newly_created = false;
0284adc6 3252 JournalFile *f;
fa6ac760 3253 void *h;
0284adc6 3254 int r;
7560fffc 3255
0559d3a5 3256 assert(ret);
5d1ce257 3257 assert(fd >= 0 || fname);
7560fffc 3258
ec2ce0c5 3259 if (!IN_SET((flags & O_ACCMODE), O_RDONLY, O_RDWR))
0284adc6 3260 return -EINVAL;
7560fffc 3261
6eda13d3
LP
3262 if (fname && (flags & O_CREAT) && !endswith(fname, ".journal"))
3263 return -EINVAL;
7560fffc 3264
0284adc6
LP
3265 f = new0(JournalFile, 1);
3266 if (!f)
3267 return -ENOMEM;
7560fffc 3268
5d1ce257 3269 f->fd = fd;
0284adc6 3270 f->mode = mode;
7560fffc 3271
0284adc6
LP
3272 f->flags = flags;
3273 f->prot = prot_from_flags(flags);
3274 f->writable = (flags & O_ACCMODE) != O_RDONLY;
349cc4a5 3275#if HAVE_LZ4
d89c8fdf 3276 f->compress_lz4 = compress;
349cc4a5 3277#elif HAVE_XZ
d89c8fdf 3278 f->compress_xz = compress;
48b61739 3279#endif
349cc4a5 3280#if HAVE_GCRYPT
baed47c3 3281 f->seal = seal;
49a32d43 3282#endif
7560fffc 3283
0284adc6
LP
3284 if (mmap_cache)
3285 f->mmap = mmap_cache_ref(mmap_cache);
3286 else {
84168d80 3287 f->mmap = mmap_cache_new();
0284adc6
LP
3288 if (!f->mmap) {
3289 r = -ENOMEM;
3290 goto fail;
3291 }
3292 }
7560fffc 3293
7645c77b 3294 if (fname) {
5d1ce257 3295 f->path = strdup(fname);
7645c77b
ZJS
3296 if (!f->path) {
3297 r = -ENOMEM;
3298 goto fail;
3299 }
3300 } else {
3301 /* If we don't know the path, fill in something explanatory and vaguely useful */
3302 if (asprintf(&f->path, "/proc/self/%i", fd) < 0) {
3303 r = -ENOMEM;
3304 goto fail;
3305 }
0284adc6 3306 }
7560fffc 3307
4743015d 3308 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
a4bcff5b
LP
3309 if (!f->chain_cache) {
3310 r = -ENOMEM;
3311 goto fail;
3312 }
3313
0284adc6 3314 if (f->fd < 0) {
5d1ce257
LP
3315 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
3316 if (f->fd < 0) {
3317 r = -errno;
3318 goto fail;
3319 }
3320
3321 /* fds we opened here by us should also be closed by us. */
3322 f->close_fd = true;
7560fffc 3323 }
7560fffc 3324
be7cdd8e
VC
3325 f->cache_fd = mmap_cache_add_fd(f->mmap, f->fd);
3326 if (!f->cache_fd) {
3327 r = -ENOMEM;
3328 goto fail;
3329 }
3330
2678031a
LP
3331 r = journal_file_fstat(f);
3332 if (r < 0)
0284adc6 3333 goto fail;
7560fffc 3334
0284adc6 3335 if (f->last_stat.st_size == 0 && f->writable) {
11689d2a 3336
fc68c929 3337 (void) journal_file_warn_btrfs(f);
11689d2a 3338
fb0951b0
LP
3339 /* Let's attach the creation time to the journal file,
3340 * so that the vacuuming code knows the age of this
3341 * file even if the file might end up corrupted one
3342 * day... Ideally we'd just use the creation time many
3343 * file systems maintain for each file, but there is
3344 * currently no usable API to query this, hence let's
3345 * emulate this via extended attributes. If extended
3346 * attributes are not supported we'll just skip this,
7517e174 3347 * and rely solely on mtime/atime/ctime of the file. */
fb0951b0 3348
d61b600d 3349 fd_setcrtime(f->fd, 0);
7560fffc 3350
349cc4a5 3351#if HAVE_GCRYPT
0284adc6 3352 /* Try to load the FSPRG state, and if we can't, then
baed47c3 3353 * just don't do sealing */
49a32d43
LP
3354 if (f->seal) {
3355 r = journal_file_fss_load(f);
3356 if (r < 0)
3357 f->seal = false;
3358 }
feb12d3e 3359#endif
7560fffc 3360
0284adc6
LP
3361 r = journal_file_init_header(f, template);
3362 if (r < 0)
3363 goto fail;
7560fffc 3364
2678031a
LP
3365 r = journal_file_fstat(f);
3366 if (r < 0)
0284adc6 3367 goto fail;
fb0951b0
LP
3368
3369 newly_created = true;
0284adc6 3370 }
7560fffc 3371
0284adc6 3372 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
cfb571f3 3373 r = -ENODATA;
0284adc6
LP
3374 goto fail;
3375 }
7560fffc 3376
b42549ad 3377 r = mmap_cache_get(f->mmap, f->cache_fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h, NULL);
977eaa1e 3378 if (r < 0)
0284adc6 3379 goto fail;
7560fffc 3380
fa6ac760
LP
3381 f->header = h;
3382
0284adc6 3383 if (!newly_created) {
f9168190 3384 set_clear_with_destructor(deferred_closes, journal_file_close);
b58c888f 3385
0284adc6
LP
3386 r = journal_file_verify_header(f);
3387 if (r < 0)
3388 goto fail;
3389 }
7560fffc 3390
349cc4a5 3391#if HAVE_GCRYPT
0284adc6 3392 if (!newly_created && f->writable) {
baed47c3 3393 r = journal_file_fss_load(f);
0284adc6
LP
3394 if (r < 0)
3395 goto fail;
3396 }
feb12d3e 3397#endif
cec736d2
LP
3398
3399 if (f->writable) {
4a92baf3
LP
3400 if (metrics) {
3401 journal_default_metrics(metrics, f->fd);
3402 f->metrics = *metrics;
3403 } else if (template)
3404 f->metrics = template->metrics;
3405
cec736d2
LP
3406 r = journal_file_refresh_header(f);
3407 if (r < 0)
3408 goto fail;
3409 }
3410
349cc4a5 3411#if HAVE_GCRYPT
baed47c3 3412 r = journal_file_hmac_setup(f);
14d10188
LP
3413 if (r < 0)
3414 goto fail;
feb12d3e 3415#endif
14d10188 3416
cec736d2 3417 if (newly_created) {
de190aef 3418 r = journal_file_setup_field_hash_table(f);
cec736d2
LP
3419 if (r < 0)
3420 goto fail;
3421
de190aef 3422 r = journal_file_setup_data_hash_table(f);
cec736d2
LP
3423 if (r < 0)
3424 goto fail;
7560fffc 3425
349cc4a5 3426#if HAVE_GCRYPT
7560fffc
LP
3427 r = journal_file_append_first_tag(f);
3428 if (r < 0)
3429 goto fail;
feb12d3e 3430#endif
cec736d2
LP
3431 }
3432
be7cdd8e 3433 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd)) {
fa6ac760
LP
3434 r = -EIO;
3435 goto fail;
3436 }
3437
7a24f3bf 3438 if (template && template->post_change_timer) {
e167d7fd
LP
3439 r = journal_file_enable_post_change_timer(
3440 f,
3441 sd_event_source_get_event(template->post_change_timer),
3442 template->post_change_timer_period);
7a24f3bf 3443
7a24f3bf
VC
3444 if (r < 0)
3445 goto fail;
3446 }
3447
f8e2f4d6 3448 /* The file is opened now successfully, thus we take possession of any passed in fd. */
5d1ce257
LP
3449 f->close_fd = true;
3450
0559d3a5 3451 *ret = f;
cec736d2
LP
3452 return 0;
3453
3454fail:
be7cdd8e 3455 if (f->cache_fd && mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
3456 r = -EIO;
3457
69a3a6fd 3458 (void) journal_file_close(f);
cec736d2
LP
3459
3460 return r;
3461}
0ac38b70 3462
b58c888f 3463int journal_file_rotate(JournalFile **f, bool compress, bool seal, Set *deferred_closes) {
57535f47 3464 _cleanup_free_ char *p = NULL;
0ac38b70
LP
3465 size_t l;
3466 JournalFile *old_file, *new_file = NULL;
3467 int r;
3468
3469 assert(f);
3470 assert(*f);
3471
3472 old_file = *f;
3473
3474 if (!old_file->writable)
3475 return -EINVAL;
3476
5d1ce257 3477 /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse
13e785f7 3478 * rotation, since we don't know the actual path, and couldn't rename the file hence. */
5d1ce257
LP
3479 if (path_startswith(old_file->path, "/proc/self/fd"))
3480 return -EINVAL;
3481
0ac38b70
LP
3482 if (!endswith(old_file->path, ".journal"))
3483 return -EINVAL;
3484
3485 l = strlen(old_file->path);
57535f47
ZJS
3486 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
3487 (int) l - 8, old_file->path,
3488 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
3489 le64toh((*f)->header->head_entry_seqnum),
3490 le64toh((*f)->header->head_entry_realtime));
3491 if (r < 0)
0ac38b70
LP
3492 return -ENOMEM;
3493
2678031a
LP
3494 /* Try to rename the file to the archived version. If the file
3495 * already was deleted, we'll get ENOENT, let's ignore that
3496 * case. */
0ac38b70 3497 r = rename(old_file->path, p);
2678031a 3498 if (r < 0 && errno != ENOENT)
0ac38b70
LP
3499 return -errno;
3500
1fcefd88
LP
3501 /* Sync the rename to disk */
3502 (void) fsync_directory_of_file(old_file->fd);
3503
8eb85171
VC
3504 /* Set as archive so offlining commits w/state=STATE_ARCHIVED.
3505 * Previously we would set old_file->header->state to STATE_ARCHIVED directly here,
3506 * but journal_file_set_offline() short-circuits when state != STATE_ONLINE, which
3507 * would result in the rotated journal never getting fsync() called before closing.
3508 * Now we simply queue the archive state by setting an archive bit, leaving the state
3509 * as STATE_ONLINE so proper offlining occurs. */
3510 old_file->archive = true;
0ac38b70 3511
f27a3864
LP
3512 /* Currently, btrfs is not very good with out write patterns
3513 * and fragments heavily. Let's defrag our journal files when
3514 * we archive them */
3515 old_file->defrag_on_close = true;
3516
5d1ce257 3517 r = journal_file_open(-1, old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, deferred_closes, old_file, &new_file);
b58c888f
VC
3518
3519 if (deferred_closes &&
3520 set_put(deferred_closes, old_file) >= 0)
3521 (void) journal_file_set_offline(old_file, false);
3522 else
3523 (void) journal_file_close(old_file);
0ac38b70
LP
3524
3525 *f = new_file;
3526 return r;
3527}
3528
9447a7f1
LP
3529int journal_file_open_reliably(
3530 const char *fname,
3531 int flags,
3532 mode_t mode,
7560fffc 3533 bool compress,
baed47c3 3534 bool seal,
4a92baf3 3535 JournalMetrics *metrics,
27370278 3536 MMapCache *mmap_cache,
b58c888f 3537 Set *deferred_closes,
9447a7f1
LP
3538 JournalFile *template,
3539 JournalFile **ret) {
3540
3541 int r;
3542 size_t l;
ed375beb 3543 _cleanup_free_ char *p = NULL;
9447a7f1 3544
5d1ce257 3545 r = journal_file_open(-1, fname, flags, mode, compress, seal, metrics, mmap_cache, deferred_closes, template, ret);
288359db 3546 if (!IN_SET(r,
b288cdeb
ZJS
3547 -EBADMSG, /* Corrupted */
3548 -ENODATA, /* Truncated */
3549 -EHOSTDOWN, /* Other machine */
3550 -EPROTONOSUPPORT, /* Incompatible feature */
3551 -EBUSY, /* Unclean shutdown */
3552 -ESHUTDOWN, /* Already archived */
288359db 3553 -EIO, /* IO error, including SIGBUS on mmap */
ae739cc1
LP
3554 -EIDRM, /* File has been deleted */
3555 -ETXTBSY)) /* File is from the future */
9447a7f1
LP
3556 return r;
3557
3558 if ((flags & O_ACCMODE) == O_RDONLY)
3559 return r;
3560
3561 if (!(flags & O_CREAT))
3562 return r;
3563
7560fffc
LP
3564 if (!endswith(fname, ".journal"))
3565 return r;
3566
5c70eab4
LP
3567 /* The file is corrupted. Rotate it away and try it again (but only once) */
3568
9447a7f1 3569 l = strlen(fname);
d587eca5 3570 if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
57535f47 3571 (int) l - 8, fname,
d587eca5 3572 now(CLOCK_REALTIME),
9bf3b535 3573 random_u64()) < 0)
9447a7f1
LP
3574 return -ENOMEM;
3575
65089b82 3576 if (rename(fname, p) < 0)
9447a7f1
LP
3577 return -errno;
3578
f27a3864
LP
3579 /* btrfs doesn't cope well with our write pattern and
3580 * fragments heavily. Let's defrag all files we rotate */
11689d2a 3581
a67d68b8 3582 (void) chattr_path(p, 0, FS_NOCOW_FL);
f27a3864
LP
3583 (void) btrfs_defrag(p);
3584
65089b82 3585 log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
9447a7f1 3586
5d1ce257 3587 return journal_file_open(-1, fname, flags, mode, compress, seal, metrics, mmap_cache, deferred_closes, template, ret);
9447a7f1
LP
3588}
3589
cf244689
LP
3590int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
3591 uint64_t i, n;
3592 uint64_t q, xor_hash = 0;
3593 int r;
3594 EntryItem *items;
3595 dual_timestamp ts;
3596
3597 assert(from);
3598 assert(to);
3599 assert(o);
3600 assert(p);
3601
3602 if (!to->writable)
3603 return -EPERM;
3604
3605 ts.monotonic = le64toh(o->entry.monotonic);
3606 ts.realtime = le64toh(o->entry.realtime);
3607
cf244689 3608 n = journal_file_entry_n_items(o);
4faa7004
TA
3609 /* alloca() can't take 0, hence let's allocate at least one */
3610 items = alloca(sizeof(EntryItem) * MAX(1u, n));
cf244689
LP
3611
3612 for (i = 0; i < n; i++) {
4fd052ae
FC
3613 uint64_t l, h;
3614 le64_t le_hash;
cf244689
LP
3615 size_t t;
3616 void *data;
3617 Object *u;
3618
3619 q = le64toh(o->entry.items[i].object_offset);
3620 le_hash = o->entry.items[i].hash;
3621
3622 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
3623 if (r < 0)
3624 return r;
3625
3626 if (le_hash != o->data.hash)
3627 return -EBADMSG;
3628
3629 l = le64toh(o->object.size) - offsetof(Object, data.payload);
3630 t = (size_t) l;
3631
3632 /* We hit the limit on 32bit machines */
3633 if ((uint64_t) t != l)
3634 return -E2BIG;
3635
d89c8fdf 3636 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
349cc4a5 3637#if HAVE_XZ || HAVE_LZ4
a7f7d1bd 3638 size_t rsize = 0;
cf244689 3639
d89c8fdf
ZJS
3640 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
3641 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
3642 if (r < 0)
3643 return r;
cf244689
LP
3644
3645 data = from->compress_buffer;
3646 l = rsize;
3b1a55e1
ZJS
3647#else
3648 return -EPROTONOSUPPORT;
3649#endif
cf244689
LP
3650 } else
3651 data = o->data.payload;
3652
3653 r = journal_file_append_data(to, data, l, &u, &h);
3654 if (r < 0)
3655 return r;
3656
3657 xor_hash ^= le64toh(u->data.hash);
3658 items[i].object_offset = htole64(h);
3659 items[i].hash = u->data.hash;
3660
3661 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
3662 if (r < 0)
3663 return r;
3664 }
3665
fa6ac760
LP
3666 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
3667
be7cdd8e 3668 if (mmap_cache_got_sigbus(to->mmap, to->cache_fd))
fa6ac760
LP
3669 return -EIO;
3670
3671 return r;
cf244689 3672}
babfc091 3673
8580d1f7
LP
3674void journal_reset_metrics(JournalMetrics *m) {
3675 assert(m);
3676
3677 /* Set everything to "pick automatic values". */
3678
3679 *m = (JournalMetrics) {
3680 .min_use = (uint64_t) -1,
3681 .max_use = (uint64_t) -1,
3682 .min_size = (uint64_t) -1,
3683 .max_size = (uint64_t) -1,
3684 .keep_free = (uint64_t) -1,
3685 .n_max_files = (uint64_t) -1,
3686 };
3687}
3688
babfc091 3689void journal_default_metrics(JournalMetrics *m, int fd) {
8580d1f7 3690 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
babfc091 3691 struct statvfs ss;
8580d1f7 3692 uint64_t fs_size;
babfc091
LP
3693
3694 assert(m);
3695 assert(fd >= 0);
3696
3697 if (fstatvfs(fd, &ss) >= 0)
3698 fs_size = ss.f_frsize * ss.f_blocks;
8580d1f7
LP
3699 else {
3700 log_debug_errno(errno, "Failed to detremine disk size: %m");
3701 fs_size = 0;
3702 }
babfc091
LP
3703
3704 if (m->max_use == (uint64_t) -1) {
3705
3706 if (fs_size > 0) {
3707 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3708
3709 if (m->max_use > DEFAULT_MAX_USE_UPPER)
3710 m->max_use = DEFAULT_MAX_USE_UPPER;
3711
3712 if (m->max_use < DEFAULT_MAX_USE_LOWER)
3713 m->max_use = DEFAULT_MAX_USE_LOWER;
3714 } else
3715 m->max_use = DEFAULT_MAX_USE_LOWER;
3716 } else {
3717 m->max_use = PAGE_ALIGN(m->max_use);
3718
8580d1f7 3719 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
babfc091
LP
3720 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3721 }
3722
8580d1f7
LP
3723 if (m->min_use == (uint64_t) -1)
3724 m->min_use = DEFAULT_MIN_USE;
3725
3726 if (m->min_use > m->max_use)
3727 m->min_use = m->max_use;
3728
babfc091
LP
3729 if (m->max_size == (uint64_t) -1) {
3730 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3731
3732 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3733 m->max_size = DEFAULT_MAX_SIZE_UPPER;
3734 } else
3735 m->max_size = PAGE_ALIGN(m->max_size);
3736
8580d1f7
LP
3737 if (m->max_size != 0) {
3738 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3739 m->max_size = JOURNAL_FILE_SIZE_MIN;
babfc091 3740
8580d1f7
LP
3741 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3742 m->max_use = m->max_size*2;
3743 }
babfc091
LP
3744
3745 if (m->min_size == (uint64_t) -1)
3746 m->min_size = JOURNAL_FILE_SIZE_MIN;
3747 else {
3748 m->min_size = PAGE_ALIGN(m->min_size);
3749
3750 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3751 m->min_size = JOURNAL_FILE_SIZE_MIN;
3752
8580d1f7 3753 if (m->max_size != 0 && m->min_size > m->max_size)
babfc091
LP
3754 m->max_size = m->min_size;
3755 }
3756
3757 if (m->keep_free == (uint64_t) -1) {
3758
3759 if (fs_size > 0) {
8621b110 3760 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
babfc091
LP
3761
3762 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3763 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3764
3765 } else
3766 m->keep_free = DEFAULT_KEEP_FREE;
3767 }
3768
8580d1f7
LP
3769 if (m->n_max_files == (uint64_t) -1)
3770 m->n_max_files = DEFAULT_N_MAX_FILES;
3771
3772 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3773 format_bytes(a, sizeof(a), m->min_use),
3774 format_bytes(b, sizeof(b), m->max_use),
3775 format_bytes(c, sizeof(c), m->max_size),
3776 format_bytes(d, sizeof(d), m->min_size),
3777 format_bytes(e, sizeof(e), m->keep_free),
3778 m->n_max_files);
babfc091 3779}
08984293
LP
3780
3781int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
08984293 3782 assert(f);
c88cc6af 3783 assert(f->header);
08984293
LP
3784 assert(from || to);
3785
3786 if (from) {
162566a4
LP
3787 if (f->header->head_entry_realtime == 0)
3788 return -ENOENT;
08984293 3789
162566a4 3790 *from = le64toh(f->header->head_entry_realtime);
08984293
LP
3791 }
3792
3793 if (to) {
162566a4
LP
3794 if (f->header->tail_entry_realtime == 0)
3795 return -ENOENT;
08984293 3796
162566a4 3797 *to = le64toh(f->header->tail_entry_realtime);
08984293
LP
3798 }
3799
3800 return 1;
3801}
3802
3803int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
08984293
LP
3804 Object *o;
3805 uint64_t p;
3806 int r;
3807
3808 assert(f);
3809 assert(from || to);
3810
47838ab3 3811 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
08984293
LP
3812 if (r <= 0)
3813 return r;
3814
3815 if (le64toh(o->data.n_entries) <= 0)
3816 return 0;
3817
3818 if (from) {
3819 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3820 if (r < 0)
3821 return r;
3822
3823 *from = le64toh(o->entry.monotonic);
3824 }
3825
3826 if (to) {
3827 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3828 if (r < 0)
3829 return r;
3830
3831 r = generic_array_get_plus_one(f,
3832 le64toh(o->data.entry_offset),
3833 le64toh(o->data.entry_array_offset),
3834 le64toh(o->data.n_entries)-1,
3835 &o, NULL);
3836 if (r <= 0)
3837 return r;
3838
3839 *to = le64toh(o->entry.monotonic);
3840 }
3841
3842 return 1;
3843}
dca6219e 3844
fb0951b0 3845bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
dca6219e 3846 assert(f);
c88cc6af 3847 assert(f->header);
dca6219e
LP
3848
3849 /* If we gained new header fields we gained new features,
3850 * hence suggest a rotation */
361f9cbc
LP
3851 if (le64toh(f->header->header_size) < sizeof(Header)) {
3852 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
dca6219e 3853 return true;
361f9cbc 3854 }
dca6219e
LP
3855
3856 /* Let's check if the hash tables grew over a certain fill
3857 * level (75%, borrowing this value from Java's hash table
3858 * implementation), and if so suggest a rotation. To calculate
3859 * the fill level we need the n_data field, which only exists
3860 * in newer versions. */
3861
3862 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
361f9cbc 3863 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3864 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
361f9cbc
LP
3865 f->path,
3866 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3867 le64toh(f->header->n_data),
3868 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3869 (unsigned long long) f->last_stat.st_size,
3870 f->last_stat.st_size / le64toh(f->header->n_data));
dca6219e 3871 return true;
361f9cbc 3872 }
dca6219e
LP
3873
3874 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
361f9cbc 3875 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3876 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
361f9cbc
LP
3877 f->path,
3878 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3879 le64toh(f->header->n_fields),
3880 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
dca6219e 3881 return true;
361f9cbc 3882 }
dca6219e 3883
0598fd4a
LP
3884 /* Are the data objects properly indexed by field objects? */
3885 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3886 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3887 le64toh(f->header->n_data) > 0 &&
3888 le64toh(f->header->n_fields) == 0)
3889 return true;
3890
fb0951b0
LP
3891 if (max_file_usec > 0) {
3892 usec_t t, h;
3893
3894 h = le64toh(f->header->head_entry_realtime);
3895 t = now(CLOCK_REALTIME);
3896
3897 if (h > 0 && t > h + max_file_usec)
3898 return true;
3899 }
3900
dca6219e
LP
3901 return false;
3902}