]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journal-file.c
label: rework label_fix() implementations (#8583)
[thirdparty/systemd.git] / src / journal / journal-file.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
cec736d2
LP
2/***
3 This file is part of systemd.
4
5 Copyright 2011 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
cec736d2
LP
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 15 Lesser General Public License for more details.
cec736d2 16
5430f7f2 17 You should have received a copy of the GNU Lesser General Public License
cec736d2
LP
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19***/
20
cec736d2 21#include <errno.h>
cec736d2 22#include <fcntl.h>
11689d2a 23#include <linux/fs.h>
ac2e41f5 24#include <pthread.h>
07630cea
LP
25#include <stddef.h>
26#include <sys/mman.h>
27#include <sys/statvfs.h>
28#include <sys/uio.h>
29#include <unistd.h>
fb0951b0 30
b5efdb8a 31#include "alloc-util.h"
f27a3864 32#include "btrfs-util.h"
c8b3094d 33#include "chattr-util.h"
07630cea 34#include "compress.h"
3ffd4af2 35#include "fd-util.h"
11b29a96 36#include "fs-util.h"
0284adc6 37#include "journal-authenticate.h"
cec736d2
LP
38#include "journal-def.h"
39#include "journal-file.h"
40#include "lookup3.h"
6bedfcbb 41#include "parse-util.h"
5d1ce257 42#include "path-util.h"
3df3e884 43#include "random-util.h"
7a24f3bf 44#include "sd-event.h"
b58c888f 45#include "set.h"
3cc44114 46#include "stat-util.h"
07630cea 47#include "string-util.h"
4761fd0f 48#include "strv.h"
89a5a90c 49#include "xattr-util.h"
cec736d2 50
4a92baf3
LP
51#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
52#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
cec736d2 53
57850536
AG
54#define DEFAULT_COMPRESS_THRESHOLD (512ULL)
55#define MIN_COMPRESS_THRESHOLD (8ULL)
807e17f0 56
babfc091 57/* This is the minimum journal file size */
16098e93 58#define JOURNAL_FILE_SIZE_MIN (512ULL*1024ULL) /* 512 KiB */
babfc091
LP
59
60/* These are the lower and upper bounds if we deduce the max_use value
61 * from the file system size */
62#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
63#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
64
8580d1f7
LP
65/* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
66#define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */
67
babfc091 68/* This is the upper bound if we deduce max_size from max_use */
71100051 69#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
babfc091
LP
70
71/* This is the upper bound if we deduce the keep_free value from the
72 * file system size */
73#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
74
75/* This is the keep_free value when we can't determine the system
76 * size */
77#define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
78
8580d1f7
LP
79/* This is the default maximum number of journal files to keep around. */
80#define DEFAULT_N_MAX_FILES (100)
81
dca6219e
LP
82/* n_data was the first entry we added after the initial file format design */
83#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
cec736d2 84
a4bcff5b
LP
85/* How many entries to keep in the entry array chain cache at max */
86#define CHAIN_CACHE_MAX 20
87
a676e665
LP
88/* How much to increase the journal file size at once each time we allocate something new. */
89#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
90
2678031a
LP
91/* Reread fstat() of the file for detecting deletions at least this often */
92#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
93
fa6ac760
LP
94/* The mmap context to use for the header, which we pick as one above the last defined type */
95#define CONTEXT_HEADER _OBJECT_TYPE_MAX
96
51804460
ZJS
97#ifdef __clang__
98# pragma GCC diagnostic ignored "-Waddress-of-packed-member"
99#endif
100
ac2e41f5
VC
101/* This may be called from a separate thread to prevent blocking the caller for the duration of fsync().
102 * As a result we use atomic operations on f->offline_state for inter-thread communications with
103 * journal_file_set_offline() and journal_file_set_online(). */
104static void journal_file_set_offline_internal(JournalFile *f) {
26687bf8 105 assert(f);
ac2e41f5
VC
106 assert(f->fd >= 0);
107 assert(f->header);
108
109 for (;;) {
110 switch (f->offline_state) {
111 case OFFLINE_CANCEL:
112 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_DONE))
113 continue;
114 return;
115
116 case OFFLINE_AGAIN_FROM_SYNCING:
117 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_SYNCING))
118 continue;
119 break;
120
121 case OFFLINE_AGAIN_FROM_OFFLINING:
122 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_SYNCING))
123 continue;
124 break;
125
126 case OFFLINE_SYNCING:
127 (void) fsync(f->fd);
26687bf8 128
ac2e41f5
VC
129 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_OFFLINING))
130 continue;
26687bf8 131
8eb85171 132 f->header->state = f->archive ? STATE_ARCHIVED : STATE_OFFLINE;
ac2e41f5
VC
133 (void) fsync(f->fd);
134 break;
135
136 case OFFLINE_OFFLINING:
137 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_DONE))
138 continue;
4831981d 139 _fallthrough_;
ac2e41f5
VC
140 case OFFLINE_DONE:
141 return;
142
143 case OFFLINE_JOINED:
144 log_debug("OFFLINE_JOINED unexpected offline state for journal_file_set_offline_internal()");
145 return;
146 }
147 }
148}
149
150static void * journal_file_set_offline_thread(void *arg) {
151 JournalFile *f = arg;
152
fa7ff4cf
LP
153 (void) pthread_setname_np(pthread_self(), "journal-offline");
154
ac2e41f5
VC
155 journal_file_set_offline_internal(f);
156
157 return NULL;
158}
159
160static int journal_file_set_offline_thread_join(JournalFile *f) {
161 int r;
162
163 assert(f);
164
165 if (f->offline_state == OFFLINE_JOINED)
166 return 0;
167
168 r = pthread_join(f->offline_thread, NULL);
169 if (r)
170 return -r;
171
172 f->offline_state = OFFLINE_JOINED;
26687bf8 173
be7cdd8e 174 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
175 return -EIO;
176
ac2e41f5
VC
177 return 0;
178}
26687bf8 179
ac2e41f5
VC
180/* Trigger a restart if the offline thread is mid-flight in a restartable state. */
181static bool journal_file_set_offline_try_restart(JournalFile *f) {
182 for (;;) {
183 switch (f->offline_state) {
184 case OFFLINE_AGAIN_FROM_SYNCING:
185 case OFFLINE_AGAIN_FROM_OFFLINING:
186 return true;
187
188 case OFFLINE_CANCEL:
189 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_AGAIN_FROM_SYNCING))
190 continue;
191 return true;
192
193 case OFFLINE_SYNCING:
194 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_AGAIN_FROM_SYNCING))
195 continue;
196 return true;
197
198 case OFFLINE_OFFLINING:
199 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_AGAIN_FROM_OFFLINING))
200 continue;
201 return true;
26687bf8
OS
202
203 default:
ac2e41f5
VC
204 return false;
205 }
26687bf8
OS
206 }
207}
208
ac2e41f5
VC
209/* Sets a journal offline.
210 *
211 * If wait is false then an offline is dispatched in a separate thread for a
212 * subsequent journal_file_set_offline() or journal_file_set_online() of the
213 * same journal to synchronize with.
214 *
215 * If wait is true, then either an existing offline thread will be restarted
216 * and joined, or if none exists the offline is simply performed in this
217 * context without involving another thread.
218 */
219int journal_file_set_offline(JournalFile *f, bool wait) {
220 bool restarted;
221 int r;
222
26687bf8
OS
223 assert(f);
224
225 if (!f->writable)
226 return -EPERM;
227
228 if (!(f->fd >= 0 && f->header))
229 return -EINVAL;
230
b8f99e27
VC
231 /* An offlining journal is implicitly online and may modify f->header->state,
232 * we must also join any potentially lingering offline thread when not online. */
233 if (!journal_file_is_offlining(f) && f->header->state != STATE_ONLINE)
234 return journal_file_set_offline_thread_join(f);
26687bf8 235
ac2e41f5
VC
236 /* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */
237 restarted = journal_file_set_offline_try_restart(f);
238 if ((restarted && wait) || !restarted) {
239 r = journal_file_set_offline_thread_join(f);
240 if (r < 0)
241 return r;
242 }
26687bf8 243
ac2e41f5
VC
244 if (restarted)
245 return 0;
246
247 /* Initiate a new offline. */
248 f->offline_state = OFFLINE_SYNCING;
fa6ac760 249
ac2e41f5
VC
250 if (wait) /* Without using a thread if waiting. */
251 journal_file_set_offline_internal(f);
252 else {
5e9f01e8
LP
253 sigset_t ss, saved_ss;
254 int k;
255
256 if (sigfillset(&ss) < 0)
257 return -errno;
258
259 r = pthread_sigmask(SIG_BLOCK, &ss, &saved_ss);
260 if (r > 0)
261 return -r;
262
ac2e41f5 263 r = pthread_create(&f->offline_thread, NULL, journal_file_set_offline_thread, f);
5e9f01e8
LP
264
265 k = pthread_sigmask(SIG_SETMASK, &saved_ss, NULL);
ec9ffa2c
VC
266 if (r > 0) {
267 f->offline_state = OFFLINE_JOINED;
ac2e41f5 268 return -r;
ec9ffa2c 269 }
5e9f01e8
LP
270 if (k > 0)
271 return -k;
ac2e41f5
VC
272 }
273
274 return 0;
275}
276
277static int journal_file_set_online(JournalFile *f) {
278 bool joined = false;
279
280 assert(f);
281
282 if (!f->writable)
283 return -EPERM;
284
285 if (!(f->fd >= 0 && f->header))
286 return -EINVAL;
287
288 while (!joined) {
289 switch (f->offline_state) {
290 case OFFLINE_JOINED:
291 /* No offline thread, no need to wait. */
292 joined = true;
293 break;
294
295 case OFFLINE_SYNCING:
296 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_CANCEL))
297 continue;
298 /* Canceled syncing prior to offlining, no need to wait. */
299 break;
300
301 case OFFLINE_AGAIN_FROM_SYNCING:
302 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_CANCEL))
303 continue;
304 /* Canceled restart from syncing, no need to wait. */
305 break;
306
307 case OFFLINE_AGAIN_FROM_OFFLINING:
308 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_CANCEL))
309 continue;
310 /* Canceled restart from offlining, must wait for offlining to complete however. */
4831981d 311 _fallthrough_;
ac2e41f5
VC
312 default: {
313 int r;
314
315 r = journal_file_set_offline_thread_join(f);
316 if (r < 0)
317 return r;
318
319 joined = true;
320 break;
321 }
322 }
323 }
26687bf8 324
be7cdd8e 325 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
326 return -EIO;
327
ac2e41f5
VC
328 switch (f->header->state) {
329 case STATE_ONLINE:
330 return 0;
26687bf8 331
ac2e41f5
VC
332 case STATE_OFFLINE:
333 f->header->state = STATE_ONLINE;
334 (void) fsync(f->fd);
335 return 0;
336
337 default:
338 return -EINVAL;
339 }
26687bf8
OS
340}
341
b58c888f
VC
342bool journal_file_is_offlining(JournalFile *f) {
343 assert(f);
344
345 __sync_synchronize();
346
3742095b 347 if (IN_SET(f->offline_state, OFFLINE_DONE, OFFLINE_JOINED))
b58c888f
VC
348 return false;
349
350 return true;
351}
352
804ae586 353JournalFile* journal_file_close(JournalFile *f) {
de190aef 354 assert(f);
cec736d2 355
349cc4a5 356#if HAVE_GCRYPT
b0af6f41 357 /* Write the final tag */
43cd8794
FB
358 if (f->seal && f->writable) {
359 int r;
360
361 r = journal_file_append_tag(f);
362 if (r < 0)
363 log_error_errno(r, "Failed to append tag when closing journal: %m");
364 }
feb12d3e 365#endif
b0af6f41 366
7a24f3bf
VC
367 if (f->post_change_timer) {
368 int enabled;
369
370 if (sd_event_source_get_enabled(f->post_change_timer, &enabled) >= 0)
371 if (enabled == SD_EVENT_ONESHOT)
372 journal_file_post_change(f);
373
e167d7fd 374 (void) sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_OFF);
7a24f3bf
VC
375 sd_event_source_unref(f->post_change_timer);
376 }
377
ac2e41f5 378 journal_file_set_offline(f, true);
cec736d2 379
be7cdd8e
VC
380 if (f->mmap && f->cache_fd)
381 mmap_cache_free_fd(f->mmap, f->cache_fd);
cec736d2 382
11689d2a
LP
383 if (f->fd >= 0 && f->defrag_on_close) {
384
385 /* Be friendly to btrfs: turn COW back on again now,
386 * and defragment the file. We won't write to the file
387 * ever again, hence remove all fragmentation, and
388 * reenable all the good bits COW usually provides
389 * (such as data checksumming). */
390
1ed8f8c1 391 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
11689d2a
LP
392 (void) btrfs_defrag_fd(f->fd);
393 }
f27a3864 394
5d1ce257
LP
395 if (f->close_fd)
396 safe_close(f->fd);
cec736d2 397 free(f->path);
807e17f0 398
f649045c 399 mmap_cache_unref(f->mmap);
16e9f408 400
4743015d 401 ordered_hashmap_free_free(f->chain_cache);
a4bcff5b 402
349cc4a5 403#if HAVE_XZ || HAVE_LZ4
807e17f0
LP
404 free(f->compress_buffer);
405#endif
406
349cc4a5 407#if HAVE_GCRYPT
baed47c3
LP
408 if (f->fss_file)
409 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
dc4ebc07 410 else
b7c9ae91
LP
411 free(f->fsprg_state);
412
413 free(f->fsprg_seed);
7560fffc
LP
414
415 if (f->hmac)
416 gcry_md_close(f->hmac);
417#endif
418
6b430fdb 419 return mfree(f);
cec736d2
LP
420}
421
0ac38b70 422static int journal_file_init_header(JournalFile *f, JournalFile *template) {
d89c8fdf 423 Header h = {};
cec736d2
LP
424 ssize_t k;
425 int r;
426
427 assert(f);
428
7560fffc 429 memcpy(h.signature, HEADER_SIGNATURE, 8);
23b0b2b2 430 h.header_size = htole64(ALIGN64(sizeof(h)));
cec736d2 431
d89c8fdf
ZJS
432 h.incompatible_flags |= htole32(
433 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
434 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
7560fffc 435
d89c8fdf
ZJS
436 h.compatible_flags = htole32(
437 f->seal * HEADER_COMPATIBLE_SEALED);
7560fffc 438
cec736d2
LP
439 r = sd_id128_randomize(&h.file_id);
440 if (r < 0)
441 return r;
442
0ac38b70
LP
443 if (template) {
444 h.seqnum_id = template->header->seqnum_id;
beec0085 445 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
0ac38b70
LP
446 } else
447 h.seqnum_id = h.file_id;
cec736d2
LP
448
449 k = pwrite(f->fd, &h, sizeof(h), 0);
450 if (k < 0)
451 return -errno;
452
453 if (k != sizeof(h))
454 return -EIO;
455
456 return 0;
457}
458
459static int journal_file_refresh_header(JournalFile *f) {
de190aef 460 sd_id128_t boot_id;
fa6ac760 461 int r;
cec736d2
LP
462
463 assert(f);
c88cc6af 464 assert(f->header);
cec736d2
LP
465
466 r = sd_id128_get_machine(&f->header->machine_id);
467 if (r < 0)
468 return r;
469
de190aef 470 r = sd_id128_get_boot(&boot_id);
cec736d2
LP
471 if (r < 0)
472 return r;
473
de190aef
LP
474 f->header->boot_id = boot_id;
475
fa6ac760 476 r = journal_file_set_online(f);
b788cc23 477
7560fffc 478 /* Sync the online state to disk */
fb426037 479 (void) fsync(f->fd);
b788cc23 480
a0fe2a2d
LP
481 /* We likely just created a new file, also sync the directory this file is located in. */
482 (void) fsync_directory_of_file(f->fd);
483
fa6ac760 484 return r;
cec736d2
LP
485}
486
4214009f
ZJS
487static bool warn_wrong_flags(const JournalFile *f, bool compatible) {
488 const uint32_t any = compatible ? HEADER_COMPATIBLE_ANY : HEADER_INCOMPATIBLE_ANY,
489 supported = compatible ? HEADER_COMPATIBLE_SUPPORTED : HEADER_INCOMPATIBLE_SUPPORTED;
490 const char *type = compatible ? "compatible" : "incompatible";
d89c8fdf
ZJS
491 uint32_t flags;
492
4214009f
ZJS
493 flags = le32toh(compatible ? f->header->compatible_flags : f->header->incompatible_flags);
494
495 if (flags & ~supported) {
496 if (flags & ~any)
4761fd0f 497 log_debug("Journal file %s has unknown %s flags 0x%"PRIx32,
4214009f
ZJS
498 f->path, type, flags & ~any);
499 flags = (flags & any) & ~supported;
4761fd0f
ZJS
500 if (flags) {
501 const char* strv[3];
502 unsigned n = 0;
503 _cleanup_free_ char *t = NULL;
504
505 if (compatible && (flags & HEADER_COMPATIBLE_SEALED))
506 strv[n++] = "sealed";
507 if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_XZ))
508 strv[n++] = "xz-compressed";
509 if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_LZ4))
510 strv[n++] = "lz4-compressed";
511 strv[n] = NULL;
512 assert(n < ELEMENTSOF(strv));
513
514 t = strv_join((char**) strv, ", ");
515 log_debug("Journal file %s uses %s %s %s disabled at compilation time.",
516 f->path, type, n > 1 ? "flags" : "flag", strnull(t));
517 }
4214009f
ZJS
518 return true;
519 }
520
521 return false;
522}
523
524static int journal_file_verify_header(JournalFile *f) {
6f94e420
TS
525 uint64_t arena_size, header_size;
526
cec736d2 527 assert(f);
c88cc6af 528 assert(f->header);
cec736d2 529
7560fffc 530 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
cec736d2
LP
531 return -EBADMSG;
532
4214009f
ZJS
533 /* In both read and write mode we refuse to open files with incompatible
534 * flags we don't know. */
535 if (warn_wrong_flags(f, false))
cec736d2
LP
536 return -EPROTONOSUPPORT;
537
4214009f
ZJS
538 /* When open for writing we refuse to open files with compatible flags, too. */
539 if (f->writable && warn_wrong_flags(f, true))
d89c8fdf 540 return -EPROTONOSUPPORT;
7560fffc 541
db11ac1a
LP
542 if (f->header->state >= _STATE_MAX)
543 return -EBADMSG;
544
6f94e420
TS
545 header_size = le64toh(f->header->header_size);
546
dca6219e 547 /* The first addition was n_data, so check that we are at least this large */
6f94e420 548 if (header_size < HEADER_SIZE_MIN)
23b0b2b2
LP
549 return -EBADMSG;
550
8088cbd3 551 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
beec0085
LP
552 return -EBADMSG;
553
6f94e420
TS
554 arena_size = le64toh(f->header->arena_size);
555
556 if (UINT64_MAX - header_size < arena_size || header_size + arena_size > (uint64_t) f->last_stat.st_size)
db11ac1a
LP
557 return -ENODATA;
558
6f94e420 559 if (le64toh(f->header->tail_object_offset) > header_size + arena_size)
db11ac1a
LP
560 return -ENODATA;
561
7762e02b
LP
562 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
563 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
564 !VALID64(le64toh(f->header->tail_object_offset)) ||
565 !VALID64(le64toh(f->header->entry_array_offset)))
566 return -ENODATA;
567
cec736d2 568 if (f->writable) {
cec736d2 569 sd_id128_t machine_id;
ae739cc1 570 uint8_t state;
cec736d2
LP
571 int r;
572
573 r = sd_id128_get_machine(&machine_id);
574 if (r < 0)
575 return r;
576
577 if (!sd_id128_equal(machine_id, f->header->machine_id))
578 return -EHOSTDOWN;
579
de190aef 580 state = f->header->state;
cec736d2 581
b288cdeb
ZJS
582 if (state == STATE_ARCHIVED)
583 return -ESHUTDOWN; /* Already archived */
584 else if (state == STATE_ONLINE) {
71fa6f00
LP
585 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
586 return -EBUSY;
b288cdeb 587 } else if (state != STATE_OFFLINE) {
8facc349 588 log_debug("Journal file %s has unknown state %i.", f->path, state);
71fa6f00
LP
589 return -EBUSY;
590 }
ae739cc1 591
5b3cc0c8
YN
592 if (f->header->field_hash_table_size == 0 || f->header->data_hash_table_size == 0)
593 return -EBADMSG;
594
ae739cc1
LP
595 /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't
596 * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks
597 * bisection. */
598 if (le64toh(f->header->tail_entry_realtime) > now(CLOCK_REALTIME)) {
599 log_debug("Journal file %s is from the future, refusing to append new data to it that'd be older.", f->path);
600 return -ETXTBSY;
601 }
cec736d2
LP
602 }
603
d89c8fdf
ZJS
604 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
605 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
c586dbf1 606
f1889c91 607 f->seal = JOURNAL_HEADER_SEALED(f->header);
7560fffc 608
cec736d2
LP
609 return 0;
610}
611
2678031a 612static int journal_file_fstat(JournalFile *f) {
3cc44114
LP
613 int r;
614
2678031a
LP
615 assert(f);
616 assert(f->fd >= 0);
617
618 if (fstat(f->fd, &f->last_stat) < 0)
619 return -errno;
620
621 f->last_stat_usec = now(CLOCK_MONOTONIC);
622
8d6a4d33 623 /* Refuse dealing with with files that aren't regular */
3cc44114
LP
624 r = stat_verify_regular(&f->last_stat);
625 if (r < 0)
626 return r;
8d6a4d33 627
2678031a
LP
628 /* Refuse appending to files that are already deleted */
629 if (f->last_stat.st_nlink <= 0)
630 return -EIDRM;
631
632 return 0;
633}
634
cec736d2 635static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
eda4b58b 636 uint64_t old_size, new_size;
fec2aa2f 637 int r;
cec736d2
LP
638
639 assert(f);
c88cc6af 640 assert(f->header);
cec736d2 641
cec736d2 642 /* We assume that this file is not sparse, and we know that
38ac38b2 643 * for sure, since we always call posix_fallocate()
cec736d2
LP
644 * ourselves */
645
be7cdd8e 646 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
647 return -EIO;
648
cec736d2 649 old_size =
23b0b2b2 650 le64toh(f->header->header_size) +
cec736d2
LP
651 le64toh(f->header->arena_size);
652
bc85bfee 653 new_size = PAGE_ALIGN(offset + size);
23b0b2b2
LP
654 if (new_size < le64toh(f->header->header_size))
655 new_size = le64toh(f->header->header_size);
bc85bfee 656
2678031a
LP
657 if (new_size <= old_size) {
658
659 /* We already pre-allocated enough space, but before
660 * we write to it, let's check with fstat() if the
661 * file got deleted, in order make sure we don't throw
662 * away the data immediately. Don't check fstat() for
663 * all writes though, but only once ever 10s. */
664
665 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
666 return 0;
667
668 return journal_file_fstat(f);
669 }
670
671 /* Allocate more space. */
cec736d2 672
a676e665 673 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
bc85bfee 674 return -E2BIG;
cec736d2 675
a676e665 676 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
cec736d2
LP
677 struct statvfs svfs;
678
679 if (fstatvfs(f->fd, &svfs) >= 0) {
680 uint64_t available;
681
070052ab 682 available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
cec736d2
LP
683
684 if (new_size - old_size > available)
685 return -E2BIG;
686 }
687 }
688
eda4b58b 689 /* Increase by larger blocks at once */
be6b0c21 690 new_size = DIV_ROUND_UP(new_size, FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
eda4b58b
LP
691 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
692 new_size = f->metrics.max_size;
693
bc85bfee
LP
694 /* Note that the glibc fallocate() fallback is very
695 inefficient, hence we try to minimize the allocation area
696 as we can. */
fec2aa2f
GV
697 r = posix_fallocate(f->fd, old_size, new_size - old_size);
698 if (r != 0)
699 return -r;
cec736d2 700
23b0b2b2 701 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
cec736d2 702
2678031a 703 return journal_file_fstat(f);
cec736d2
LP
704}
705
78519831 706static unsigned type_to_context(ObjectType type) {
d3d3208f 707 /* One context for each type, plus one catch-all for the rest */
69adae51 708 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
fa6ac760 709 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
d05089d8 710 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
d3d3208f
MS
711}
712
b439282e 713static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret, size_t *ret_size) {
2678031a
LP
714 int r;
715
cec736d2 716 assert(f);
cec736d2
LP
717 assert(ret);
718
7762e02b
LP
719 if (size <= 0)
720 return -EINVAL;
721
2a59ea54 722 /* Avoid SIGBUS on invalid accesses */
4bbdcdb3
LP
723 if (offset + size > (uint64_t) f->last_stat.st_size) {
724 /* Hmm, out of range? Let's refresh the fstat() data
725 * first, before we trust that check. */
726
2678031a
LP
727 r = journal_file_fstat(f);
728 if (r < 0)
729 return r;
730
731 if (offset + size > (uint64_t) f->last_stat.st_size)
4bbdcdb3
LP
732 return -EADDRNOTAVAIL;
733 }
734
b439282e 735 return mmap_cache_get(f->mmap, f->cache_fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret, ret_size);
cec736d2
LP
736}
737
16e9f408
LP
738static uint64_t minimum_header_size(Object *o) {
739
b8e891e6 740 static const uint64_t table[] = {
16e9f408
LP
741 [OBJECT_DATA] = sizeof(DataObject),
742 [OBJECT_FIELD] = sizeof(FieldObject),
743 [OBJECT_ENTRY] = sizeof(EntryObject),
744 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
745 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
746 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
747 [OBJECT_TAG] = sizeof(TagObject),
748 };
749
750 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
751 return sizeof(ObjectHeader);
752
753 return table[o->object.type];
754}
755
24754f36
TR
756/* Lightweight object checks. We want this to be fast, so that we won't
757 * slowdown every journal_file_move_to_object() call too much. */
758static int journal_file_check_object(JournalFile *f, uint64_t offset, Object *o) {
759 assert(f);
760 assert(o);
761
762 switch (o->object.type) {
763
764 case OBJECT_DATA: {
765 if ((le64toh(o->data.entry_offset) == 0) ^ (le64toh(o->data.n_entries) == 0)) {
766 log_debug("Bad n_entries: %"PRIu64": %"PRIu64,
10e8445b 767 le64toh(o->data.n_entries), offset);
24754f36
TR
768 return -EBADMSG;
769 }
770
771 if (le64toh(o->object.size) - offsetof(DataObject, payload) <= 0) {
772 log_debug("Bad object size (<= %zu): %"PRIu64": %"PRIu64,
773 offsetof(DataObject, payload),
774 le64toh(o->object.size),
775 offset);
776 return -EBADMSG;
777 }
778
10e8445b
TR
779 if (!VALID64(le64toh(o->data.next_hash_offset)) ||
780 !VALID64(le64toh(o->data.next_field_offset)) ||
781 !VALID64(le64toh(o->data.entry_offset)) ||
782 !VALID64(le64toh(o->data.entry_array_offset))) {
24754f36
TR
783 log_debug("Invalid offset, next_hash_offset="OFSfmt", next_field_offset="OFSfmt
784 ", entry_offset="OFSfmt", entry_array_offset="OFSfmt": %"PRIu64,
10e8445b
TR
785 le64toh(o->data.next_hash_offset),
786 le64toh(o->data.next_field_offset),
787 le64toh(o->data.entry_offset),
788 le64toh(o->data.entry_array_offset),
24754f36
TR
789 offset);
790 return -EBADMSG;
791 }
792
793 break;
794 }
795
796 case OBJECT_FIELD:
797 if (le64toh(o->object.size) - offsetof(FieldObject, payload) <= 0) {
798 log_debug(
799 "Bad field size (<= %zu): %"PRIu64": %"PRIu64,
800 offsetof(FieldObject, payload),
801 le64toh(o->object.size),
802 offset);
803 return -EBADMSG;
804 }
805
10e8445b
TR
806 if (!VALID64(le64toh(o->field.next_hash_offset)) ||
807 !VALID64(le64toh(o->field.head_data_offset))) {
24754f36
TR
808 log_debug(
809 "Invalid offset, next_hash_offset="OFSfmt
810 ", head_data_offset="OFSfmt": %"PRIu64,
10e8445b
TR
811 le64toh(o->field.next_hash_offset),
812 le64toh(o->field.head_data_offset),
24754f36
TR
813 offset);
814 return -EBADMSG;
815 }
816 break;
817
818 case OBJECT_ENTRY:
819 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) % sizeof(EntryItem) != 0) {
820 log_debug(
821 "Bad entry size (<= %zu): %"PRIu64": %"PRIu64,
822 offsetof(EntryObject, items),
823 le64toh(o->object.size),
824 offset);
825 return -EBADMSG;
826 }
827
828 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem) <= 0) {
829 log_debug(
830 "Invalid number items in entry: %"PRIu64": %"PRIu64,
831 (le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem),
832 offset);
833 return -EBADMSG;
834 }
835
836 if (le64toh(o->entry.seqnum) <= 0) {
837 log_debug(
838 "Invalid entry seqnum: %"PRIx64": %"PRIu64,
839 le64toh(o->entry.seqnum),
840 offset);
841 return -EBADMSG;
842 }
843
844 if (!VALID_REALTIME(le64toh(o->entry.realtime))) {
845 log_debug(
846 "Invalid entry realtime timestamp: %"PRIu64": %"PRIu64,
847 le64toh(o->entry.realtime),
848 offset);
849 return -EBADMSG;
850 }
851
852 if (!VALID_MONOTONIC(le64toh(o->entry.monotonic))) {
853 log_debug(
854 "Invalid entry monotonic timestamp: %"PRIu64": %"PRIu64,
855 le64toh(o->entry.monotonic),
856 offset);
857 return -EBADMSG;
858 }
859
860 break;
861
862 case OBJECT_DATA_HASH_TABLE:
863 case OBJECT_FIELD_HASH_TABLE:
864 if ((le64toh(o->object.size) - offsetof(HashTableObject, items)) % sizeof(HashItem) != 0 ||
865 (le64toh(o->object.size) - offsetof(HashTableObject, items)) / sizeof(HashItem) <= 0) {
866 log_debug(
867 "Invalid %s hash table size: %"PRIu64": %"PRIu64,
868 o->object.type == OBJECT_DATA_HASH_TABLE ? "data" : "field",
869 le64toh(o->object.size),
870 offset);
871 return -EBADMSG;
872 }
873
874 break;
875
876 case OBJECT_ENTRY_ARRAY:
877 if ((le64toh(o->object.size) - offsetof(EntryArrayObject, items)) % sizeof(le64_t) != 0 ||
878 (le64toh(o->object.size) - offsetof(EntryArrayObject, items)) / sizeof(le64_t) <= 0) {
879 log_debug(
880 "Invalid object entry array size: %"PRIu64": %"PRIu64,
881 le64toh(o->object.size),
882 offset);
883 return -EBADMSG;
884 }
885
10e8445b 886 if (!VALID64(le64toh(o->entry_array.next_entry_array_offset))) {
24754f36
TR
887 log_debug(
888 "Invalid object entry array next_entry_array_offset: "OFSfmt": %"PRIu64,
10e8445b 889 le64toh(o->entry_array.next_entry_array_offset),
24754f36
TR
890 offset);
891 return -EBADMSG;
892 }
893
894 break;
895
896 case OBJECT_TAG:
897 if (le64toh(o->object.size) != sizeof(TagObject)) {
898 log_debug(
899 "Invalid object tag size: %"PRIu64": %"PRIu64,
900 le64toh(o->object.size),
901 offset);
902 return -EBADMSG;
903 }
904
10e8445b 905 if (!VALID_EPOCH(le64toh(o->tag.epoch))) {
24754f36
TR
906 log_debug(
907 "Invalid object tag epoch: %"PRIu64": %"PRIu64,
10e8445b 908 le64toh(o->tag.epoch),
24754f36
TR
909 offset);
910 return -EBADMSG;
911 }
912
913 break;
914 }
915
916 return 0;
917}
918
78519831 919int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
cec736d2
LP
920 int r;
921 void *t;
b439282e 922 size_t tsize;
cec736d2
LP
923 Object *o;
924 uint64_t s;
925
926 assert(f);
927 assert(ret);
928
db11ac1a 929 /* Objects may only be located at multiple of 64 bit */
202fd896
LP
930 if (!VALID64(offset)) {
931 log_debug("Attempt to move to object at non-64bit boundary: %" PRIu64, offset);
bd30fdf2 932 return -EBADMSG;
202fd896 933 }
db11ac1a 934
50809d7a 935 /* Object may not be located in the file header */
202fd896
LP
936 if (offset < le64toh(f->header->header_size)) {
937 log_debug("Attempt to move to object located in file header: %" PRIu64, offset);
50809d7a 938 return -EBADMSG;
202fd896 939 }
50809d7a 940
b439282e 941 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t, &tsize);
cec736d2
LP
942 if (r < 0)
943 return r;
944
945 o = (Object*) t;
946 s = le64toh(o->object.size);
947
1c69f096
LP
948 if (s == 0) {
949 log_debug("Attempt to move to uninitialized object: %" PRIu64, offset);
950 return -EBADMSG;
951 }
202fd896
LP
952 if (s < sizeof(ObjectHeader)) {
953 log_debug("Attempt to move to overly short object: %" PRIu64, offset);
cec736d2 954 return -EBADMSG;
202fd896 955 }
cec736d2 956
202fd896
LP
957 if (o->object.type <= OBJECT_UNUSED) {
958 log_debug("Attempt to move to object with invalid type: %" PRIu64, offset);
16e9f408 959 return -EBADMSG;
202fd896 960 }
16e9f408 961
202fd896
LP
962 if (s < minimum_header_size(o)) {
963 log_debug("Attempt to move to truncated object: %" PRIu64, offset);
16e9f408 964 return -EBADMSG;
202fd896 965 }
16e9f408 966
202fd896
LP
967 if (type > OBJECT_UNUSED && o->object.type != type) {
968 log_debug("Attempt to move to object of unexpected type: %" PRIu64, offset);
cec736d2 969 return -EBADMSG;
202fd896 970 }
cec736d2 971
b439282e
VC
972 if (s > tsize) {
973 r = journal_file_move_to(f, type, false, offset, s, &t, NULL);
cec736d2
LP
974 if (r < 0)
975 return r;
976
977 o = (Object*) t;
978 }
979
24754f36
TR
980 r = journal_file_check_object(f, offset, o);
981 if (r < 0)
982 return r;
983
cec736d2
LP
984 *ret = o;
985 return 0;
986}
987
d98cc1f2 988static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
cec736d2
LP
989 uint64_t r;
990
991 assert(f);
c88cc6af 992 assert(f->header);
cec736d2 993
beec0085 994 r = le64toh(f->header->tail_entry_seqnum) + 1;
c2373f84
LP
995
996 if (seqnum) {
de190aef 997 /* If an external seqnum counter was passed, we update
c2373f84
LP
998 * both the local and the external one, and set it to
999 * the maximum of both */
1000
1001 if (*seqnum + 1 > r)
1002 r = *seqnum + 1;
1003
1004 *seqnum = r;
1005 }
1006
beec0085 1007 f->header->tail_entry_seqnum = htole64(r);
cec736d2 1008
beec0085
LP
1009 if (f->header->head_entry_seqnum == 0)
1010 f->header->head_entry_seqnum = htole64(r);
de190aef 1011
cec736d2
LP
1012 return r;
1013}
1014
78519831 1015int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
cec736d2
LP
1016 int r;
1017 uint64_t p;
1018 Object *tail, *o;
1019 void *t;
1020
1021 assert(f);
c88cc6af 1022 assert(f->header);
d05089d8 1023 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
cec736d2
LP
1024 assert(size >= sizeof(ObjectHeader));
1025 assert(offset);
1026 assert(ret);
1027
26687bf8
OS
1028 r = journal_file_set_online(f);
1029 if (r < 0)
1030 return r;
1031
cec736d2 1032 p = le64toh(f->header->tail_object_offset);
cec736d2 1033 if (p == 0)
23b0b2b2 1034 p = le64toh(f->header->header_size);
cec736d2 1035 else {
d05089d8 1036 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
cec736d2
LP
1037 if (r < 0)
1038 return r;
1039
1040 p += ALIGN64(le64toh(tail->object.size));
1041 }
1042
1043 r = journal_file_allocate(f, p, size);
1044 if (r < 0)
1045 return r;
1046
b439282e 1047 r = journal_file_move_to(f, type, false, p, size, &t, NULL);
cec736d2
LP
1048 if (r < 0)
1049 return r;
1050
1051 o = (Object*) t;
1052
1053 zero(o->object);
de190aef 1054 o->object.type = type;
cec736d2
LP
1055 o->object.size = htole64(size);
1056
1057 f->header->tail_object_offset = htole64(p);
cec736d2
LP
1058 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
1059
1060 *ret = o;
1061 *offset = p;
1062
1063 return 0;
1064}
1065
de190aef 1066static int journal_file_setup_data_hash_table(JournalFile *f) {
cec736d2
LP
1067 uint64_t s, p;
1068 Object *o;
1069 int r;
1070
1071 assert(f);
c88cc6af 1072 assert(f->header);
cec736d2 1073
070052ab
LP
1074 /* We estimate that we need 1 hash table entry per 768 bytes
1075 of journal file and we want to make sure we never get
1076 beyond 75% fill level. Calculate the hash table size for
1077 the maximum file size based on these metrics. */
4a92baf3 1078
dfabe643 1079 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
4a92baf3
LP
1080 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
1081 s = DEFAULT_DATA_HASH_TABLE_SIZE;
1082
507f22bd 1083 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
4a92baf3 1084
de190aef
LP
1085 r = journal_file_append_object(f,
1086 OBJECT_DATA_HASH_TABLE,
1087 offsetof(Object, hash_table.items) + s,
1088 &o, &p);
cec736d2
LP
1089 if (r < 0)
1090 return r;
1091
29804cc1 1092 memzero(o->hash_table.items, s);
cec736d2 1093
de190aef
LP
1094 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1095 f->header->data_hash_table_size = htole64(s);
cec736d2
LP
1096
1097 return 0;
1098}
1099
de190aef 1100static int journal_file_setup_field_hash_table(JournalFile *f) {
cec736d2
LP
1101 uint64_t s, p;
1102 Object *o;
1103 int r;
1104
1105 assert(f);
c88cc6af 1106 assert(f->header);
cec736d2 1107
3c1668da
LP
1108 /* We use a fixed size hash table for the fields as this
1109 * number should grow very slowly only */
1110
de190aef
LP
1111 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
1112 r = journal_file_append_object(f,
1113 OBJECT_FIELD_HASH_TABLE,
1114 offsetof(Object, hash_table.items) + s,
1115 &o, &p);
cec736d2
LP
1116 if (r < 0)
1117 return r;
1118
29804cc1 1119 memzero(o->hash_table.items, s);
cec736d2 1120
de190aef
LP
1121 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1122 f->header->field_hash_table_size = htole64(s);
cec736d2
LP
1123
1124 return 0;
1125}
1126
dade37d4 1127int journal_file_map_data_hash_table(JournalFile *f) {
cec736d2
LP
1128 uint64_t s, p;
1129 void *t;
1130 int r;
1131
1132 assert(f);
c88cc6af 1133 assert(f->header);
cec736d2 1134
dade37d4
LP
1135 if (f->data_hash_table)
1136 return 0;
1137
de190aef
LP
1138 p = le64toh(f->header->data_hash_table_offset);
1139 s = le64toh(f->header->data_hash_table_size);
cec736d2 1140
de190aef 1141 r = journal_file_move_to(f,
16e9f408 1142 OBJECT_DATA_HASH_TABLE,
fcde2389 1143 true,
de190aef 1144 p, s,
b42549ad 1145 &t, NULL);
cec736d2
LP
1146 if (r < 0)
1147 return r;
1148
de190aef 1149 f->data_hash_table = t;
cec736d2
LP
1150 return 0;
1151}
1152
dade37d4 1153int journal_file_map_field_hash_table(JournalFile *f) {
cec736d2
LP
1154 uint64_t s, p;
1155 void *t;
1156 int r;
1157
1158 assert(f);
c88cc6af 1159 assert(f->header);
cec736d2 1160
dade37d4
LP
1161 if (f->field_hash_table)
1162 return 0;
1163
de190aef
LP
1164 p = le64toh(f->header->field_hash_table_offset);
1165 s = le64toh(f->header->field_hash_table_size);
cec736d2 1166
de190aef 1167 r = journal_file_move_to(f,
16e9f408 1168 OBJECT_FIELD_HASH_TABLE,
fcde2389 1169 true,
de190aef 1170 p, s,
b42549ad 1171 &t, NULL);
cec736d2
LP
1172 if (r < 0)
1173 return r;
1174
de190aef 1175 f->field_hash_table = t;
cec736d2
LP
1176 return 0;
1177}
1178
3c1668da
LP
1179static int journal_file_link_field(
1180 JournalFile *f,
1181 Object *o,
1182 uint64_t offset,
1183 uint64_t hash) {
1184
805d1486 1185 uint64_t p, h, m;
3c1668da
LP
1186 int r;
1187
1188 assert(f);
c88cc6af 1189 assert(f->header);
90d222c1 1190 assert(f->field_hash_table);
3c1668da
LP
1191 assert(o);
1192 assert(offset > 0);
1193
1194 if (o->object.type != OBJECT_FIELD)
1195 return -EINVAL;
1196
805d1486
LP
1197 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
1198 if (m <= 0)
1199 return -EBADMSG;
3c1668da 1200
805d1486 1201 /* This might alter the window we are looking at */
3c1668da
LP
1202 o->field.next_hash_offset = o->field.head_data_offset = 0;
1203
805d1486 1204 h = hash % m;
3c1668da
LP
1205 p = le64toh(f->field_hash_table[h].tail_hash_offset);
1206 if (p == 0)
1207 f->field_hash_table[h].head_hash_offset = htole64(offset);
1208 else {
1209 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1210 if (r < 0)
1211 return r;
1212
1213 o->field.next_hash_offset = htole64(offset);
1214 }
1215
1216 f->field_hash_table[h].tail_hash_offset = htole64(offset);
1217
1218 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
1219 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
1220
1221 return 0;
1222}
1223
1224static int journal_file_link_data(
1225 JournalFile *f,
1226 Object *o,
1227 uint64_t offset,
1228 uint64_t hash) {
1229
805d1486 1230 uint64_t p, h, m;
cec736d2
LP
1231 int r;
1232
1233 assert(f);
c88cc6af 1234 assert(f->header);
90d222c1 1235 assert(f->data_hash_table);
cec736d2
LP
1236 assert(o);
1237 assert(offset > 0);
b588975f
LP
1238
1239 if (o->object.type != OBJECT_DATA)
1240 return -EINVAL;
cec736d2 1241
805d1486
LP
1242 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
1243 if (m <= 0)
1244 return -EBADMSG;
48496df6 1245
805d1486 1246 /* This might alter the window we are looking at */
de190aef
LP
1247 o->data.next_hash_offset = o->data.next_field_offset = 0;
1248 o->data.entry_offset = o->data.entry_array_offset = 0;
1249 o->data.n_entries = 0;
cec736d2 1250
805d1486 1251 h = hash % m;
8db4213e 1252 p = le64toh(f->data_hash_table[h].tail_hash_offset);
3c1668da 1253 if (p == 0)
cec736d2 1254 /* Only entry in the hash table is easy */
de190aef 1255 f->data_hash_table[h].head_hash_offset = htole64(offset);
3c1668da 1256 else {
48496df6
LP
1257 /* Move back to the previous data object, to patch in
1258 * pointer */
cec736d2 1259
de190aef 1260 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1261 if (r < 0)
1262 return r;
1263
de190aef 1264 o->data.next_hash_offset = htole64(offset);
cec736d2
LP
1265 }
1266
de190aef 1267 f->data_hash_table[h].tail_hash_offset = htole64(offset);
cec736d2 1268
dca6219e
LP
1269 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
1270 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
1271
cec736d2
LP
1272 return 0;
1273}
1274
3c1668da
LP
1275int journal_file_find_field_object_with_hash(
1276 JournalFile *f,
1277 const void *field, uint64_t size, uint64_t hash,
1278 Object **ret, uint64_t *offset) {
1279
805d1486 1280 uint64_t p, osize, h, m;
3c1668da
LP
1281 int r;
1282
1283 assert(f);
c88cc6af 1284 assert(f->header);
3c1668da
LP
1285 assert(field && size > 0);
1286
dade37d4
LP
1287 /* If the field hash table is empty, we can't find anything */
1288 if (le64toh(f->header->field_hash_table_size) <= 0)
1289 return 0;
1290
1291 /* Map the field hash table, if it isn't mapped yet. */
1292 r = journal_file_map_field_hash_table(f);
1293 if (r < 0)
1294 return r;
1295
3c1668da
LP
1296 osize = offsetof(Object, field.payload) + size;
1297
805d1486 1298 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
805d1486 1299 if (m <= 0)
3c1668da
LP
1300 return -EBADMSG;
1301
805d1486 1302 h = hash % m;
3c1668da
LP
1303 p = le64toh(f->field_hash_table[h].head_hash_offset);
1304
1305 while (p > 0) {
1306 Object *o;
1307
1308 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1309 if (r < 0)
1310 return r;
1311
1312 if (le64toh(o->field.hash) == hash &&
1313 le64toh(o->object.size) == osize &&
1314 memcmp(o->field.payload, field, size) == 0) {
1315
1316 if (ret)
1317 *ret = o;
1318 if (offset)
1319 *offset = p;
1320
1321 return 1;
1322 }
1323
1324 p = le64toh(o->field.next_hash_offset);
1325 }
1326
1327 return 0;
1328}
1329
1330int journal_file_find_field_object(
1331 JournalFile *f,
1332 const void *field, uint64_t size,
1333 Object **ret, uint64_t *offset) {
1334
1335 uint64_t hash;
1336
1337 assert(f);
1338 assert(field && size > 0);
1339
1340 hash = hash64(field, size);
1341
1342 return journal_file_find_field_object_with_hash(f,
1343 field, size, hash,
1344 ret, offset);
1345}
1346
de190aef
LP
1347int journal_file_find_data_object_with_hash(
1348 JournalFile *f,
1349 const void *data, uint64_t size, uint64_t hash,
1350 Object **ret, uint64_t *offset) {
48496df6 1351
805d1486 1352 uint64_t p, osize, h, m;
cec736d2
LP
1353 int r;
1354
1355 assert(f);
c88cc6af 1356 assert(f->header);
cec736d2
LP
1357 assert(data || size == 0);
1358
dade37d4
LP
1359 /* If there's no data hash table, then there's no entry. */
1360 if (le64toh(f->header->data_hash_table_size) <= 0)
1361 return 0;
1362
1363 /* Map the data hash table, if it isn't mapped yet. */
1364 r = journal_file_map_data_hash_table(f);
1365 if (r < 0)
1366 return r;
1367
cec736d2
LP
1368 osize = offsetof(Object, data.payload) + size;
1369
805d1486
LP
1370 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
1371 if (m <= 0)
bc85bfee
LP
1372 return -EBADMSG;
1373
805d1486 1374 h = hash % m;
de190aef 1375 p = le64toh(f->data_hash_table[h].head_hash_offset);
cec736d2 1376
de190aef
LP
1377 while (p > 0) {
1378 Object *o;
cec736d2 1379
de190aef 1380 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1381 if (r < 0)
1382 return r;
1383
807e17f0 1384 if (le64toh(o->data.hash) != hash)
85a131e8 1385 goto next;
807e17f0 1386
d89c8fdf 1387 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
349cc4a5 1388#if HAVE_XZ || HAVE_LZ4
fa1c4b51 1389 uint64_t l;
a7f7d1bd 1390 size_t rsize = 0;
cec736d2 1391
807e17f0
LP
1392 l = le64toh(o->object.size);
1393 if (l <= offsetof(Object, data.payload))
cec736d2
LP
1394 return -EBADMSG;
1395
807e17f0
LP
1396 l -= offsetof(Object, data.payload);
1397
d89c8fdf
ZJS
1398 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
1399 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
1400 if (r < 0)
1401 return r;
807e17f0 1402
b785c858 1403 if (rsize == size &&
807e17f0
LP
1404 memcmp(f->compress_buffer, data, size) == 0) {
1405
1406 if (ret)
1407 *ret = o;
1408
1409 if (offset)
1410 *offset = p;
1411
1412 return 1;
1413 }
3b1a55e1
ZJS
1414#else
1415 return -EPROTONOSUPPORT;
1416#endif
807e17f0
LP
1417 } else if (le64toh(o->object.size) == osize &&
1418 memcmp(o->data.payload, data, size) == 0) {
1419
cec736d2
LP
1420 if (ret)
1421 *ret = o;
1422
1423 if (offset)
1424 *offset = p;
1425
de190aef 1426 return 1;
cec736d2
LP
1427 }
1428
85a131e8 1429 next:
cec736d2
LP
1430 p = le64toh(o->data.next_hash_offset);
1431 }
1432
de190aef
LP
1433 return 0;
1434}
1435
1436int journal_file_find_data_object(
1437 JournalFile *f,
1438 const void *data, uint64_t size,
1439 Object **ret, uint64_t *offset) {
1440
1441 uint64_t hash;
1442
1443 assert(f);
1444 assert(data || size == 0);
1445
1446 hash = hash64(data, size);
1447
1448 return journal_file_find_data_object_with_hash(f,
1449 data, size, hash,
1450 ret, offset);
1451}
1452
3c1668da
LP
1453static int journal_file_append_field(
1454 JournalFile *f,
1455 const void *field, uint64_t size,
1456 Object **ret, uint64_t *offset) {
1457
1458 uint64_t hash, p;
1459 uint64_t osize;
1460 Object *o;
1461 int r;
1462
1463 assert(f);
1464 assert(field && size > 0);
1465
1466 hash = hash64(field, size);
1467
1468 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
1469 if (r < 0)
1470 return r;
1471 else if (r > 0) {
1472
1473 if (ret)
1474 *ret = o;
1475
1476 if (offset)
1477 *offset = p;
1478
1479 return 0;
1480 }
1481
1482 osize = offsetof(Object, field.payload) + size;
1483 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
8c92d4bb
LP
1484 if (r < 0)
1485 return r;
3c1668da
LP
1486
1487 o->field.hash = htole64(hash);
1488 memcpy(o->field.payload, field, size);
1489
1490 r = journal_file_link_field(f, o, p, hash);
1491 if (r < 0)
1492 return r;
1493
1494 /* The linking might have altered the window, so let's
1495 * refresh our pointer */
1496 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1497 if (r < 0)
1498 return r;
1499
349cc4a5 1500#if HAVE_GCRYPT
3c1668da
LP
1501 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1502 if (r < 0)
1503 return r;
1504#endif
1505
1506 if (ret)
1507 *ret = o;
1508
1509 if (offset)
1510 *offset = p;
1511
1512 return 0;
1513}
1514
48496df6
LP
1515static int journal_file_append_data(
1516 JournalFile *f,
1517 const void *data, uint64_t size,
1518 Object **ret, uint64_t *offset) {
1519
de190aef
LP
1520 uint64_t hash, p;
1521 uint64_t osize;
1522 Object *o;
d89c8fdf 1523 int r, compression = 0;
3c1668da 1524 const void *eq;
de190aef
LP
1525
1526 assert(f);
1527 assert(data || size == 0);
1528
1529 hash = hash64(data, size);
1530
1531 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1532 if (r < 0)
1533 return r;
0240c603 1534 if (r > 0) {
de190aef
LP
1535
1536 if (ret)
1537 *ret = o;
1538
1539 if (offset)
1540 *offset = p;
1541
1542 return 0;
1543 }
1544
1545 osize = offsetof(Object, data.payload) + size;
1546 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
cec736d2
LP
1547 if (r < 0)
1548 return r;
1549
cec736d2 1550 o->data.hash = htole64(hash);
807e17f0 1551
349cc4a5 1552#if HAVE_XZ || HAVE_LZ4
57850536 1553 if (JOURNAL_FILE_COMPRESS(f) && size >= f->compress_threshold_bytes) {
a7f7d1bd 1554 size_t rsize = 0;
807e17f0 1555
5d6f46b6 1556 compression = compress_blob(data, size, o->data.payload, size - 1, &rsize);
807e17f0 1557
d1afbcd2 1558 if (compression >= 0) {
807e17f0 1559 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
d89c8fdf 1560 o->object.flags |= compression;
807e17f0 1561
fa1c4b51 1562 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
d89c8fdf 1563 size, rsize, object_compressed_to_string(compression));
d1afbcd2
LP
1564 } else
1565 /* Compression didn't work, we don't really care why, let's continue without compression */
1566 compression = 0;
807e17f0
LP
1567 }
1568#endif
1569
75f32f04
ZJS
1570 if (compression == 0)
1571 memcpy_safe(o->data.payload, data, size);
cec736d2 1572
de190aef 1573 r = journal_file_link_data(f, o, p, hash);
cec736d2
LP
1574 if (r < 0)
1575 return r;
1576
349cc4a5 1577#if HAVE_GCRYPT
33685a5a
FB
1578 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1579 if (r < 0)
1580 return r;
1581#endif
1582
48496df6
LP
1583 /* The linking might have altered the window, so let's
1584 * refresh our pointer */
1585 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1586 if (r < 0)
1587 return r;
1588
08c6f819
SL
1589 if (!data)
1590 eq = NULL;
1591 else
1592 eq = memchr(data, '=', size);
3c1668da 1593 if (eq && eq > data) {
748db592 1594 Object *fo = NULL;
3c1668da 1595 uint64_t fp;
3c1668da
LP
1596
1597 /* Create field object ... */
1598 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1599 if (r < 0)
1600 return r;
1601
1602 /* ... and link it in. */
1603 o->data.next_field_offset = fo->field.head_data_offset;
1604 fo->field.head_data_offset = le64toh(p);
1605 }
1606
cec736d2
LP
1607 if (ret)
1608 *ret = o;
1609
1610 if (offset)
de190aef 1611 *offset = p;
cec736d2
LP
1612
1613 return 0;
1614}
1615
1616uint64_t journal_file_entry_n_items(Object *o) {
1617 assert(o);
b588975f
LP
1618
1619 if (o->object.type != OBJECT_ENTRY)
1620 return 0;
cec736d2
LP
1621
1622 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1623}
1624
0284adc6 1625uint64_t journal_file_entry_array_n_items(Object *o) {
de190aef 1626 assert(o);
b588975f
LP
1627
1628 if (o->object.type != OBJECT_ENTRY_ARRAY)
1629 return 0;
de190aef
LP
1630
1631 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1632}
1633
fb9a24b6
LP
1634uint64_t journal_file_hash_table_n_items(Object *o) {
1635 assert(o);
b588975f 1636
ec2ce0c5 1637 if (!IN_SET(o->object.type, OBJECT_DATA_HASH_TABLE, OBJECT_FIELD_HASH_TABLE))
b588975f 1638 return 0;
fb9a24b6
LP
1639
1640 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1641}
1642
de190aef 1643static int link_entry_into_array(JournalFile *f,
4fd052ae
FC
1644 le64_t *first,
1645 le64_t *idx,
de190aef 1646 uint64_t p) {
cec736d2 1647 int r;
de190aef
LP
1648 uint64_t n = 0, ap = 0, q, i, a, hidx;
1649 Object *o;
1650
cec736d2 1651 assert(f);
c88cc6af 1652 assert(f->header);
de190aef
LP
1653 assert(first);
1654 assert(idx);
1655 assert(p > 0);
cec736d2 1656
de190aef
LP
1657 a = le64toh(*first);
1658 i = hidx = le64toh(*idx);
1659 while (a > 0) {
1660
1661 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1662 if (r < 0)
1663 return r;
cec736d2 1664
de190aef
LP
1665 n = journal_file_entry_array_n_items(o);
1666 if (i < n) {
1667 o->entry_array.items[i] = htole64(p);
1668 *idx = htole64(hidx + 1);
1669 return 0;
1670 }
cec736d2 1671
de190aef
LP
1672 i -= n;
1673 ap = a;
1674 a = le64toh(o->entry_array.next_entry_array_offset);
1675 }
1676
1677 if (hidx > n)
1678 n = (hidx+1) * 2;
1679 else
1680 n = n * 2;
1681
1682 if (n < 4)
1683 n = 4;
1684
1685 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1686 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1687 &o, &q);
cec736d2
LP
1688 if (r < 0)
1689 return r;
1690
349cc4a5 1691#if HAVE_GCRYPT
5996c7c2 1692 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
b0af6f41
LP
1693 if (r < 0)
1694 return r;
feb12d3e 1695#endif
b0af6f41 1696
de190aef 1697 o->entry_array.items[i] = htole64(p);
cec736d2 1698
de190aef 1699 if (ap == 0)
7be3aa17 1700 *first = htole64(q);
cec736d2 1701 else {
de190aef 1702 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
cec736d2
LP
1703 if (r < 0)
1704 return r;
1705
de190aef
LP
1706 o->entry_array.next_entry_array_offset = htole64(q);
1707 }
cec736d2 1708
2dee23eb
LP
1709 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1710 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1711
de190aef
LP
1712 *idx = htole64(hidx + 1);
1713
1714 return 0;
1715}
cec736d2 1716
de190aef 1717static int link_entry_into_array_plus_one(JournalFile *f,
4fd052ae
FC
1718 le64_t *extra,
1719 le64_t *first,
1720 le64_t *idx,
de190aef
LP
1721 uint64_t p) {
1722
1723 int r;
1724
1725 assert(f);
1726 assert(extra);
1727 assert(first);
1728 assert(idx);
1729 assert(p > 0);
1730
1731 if (*idx == 0)
1732 *extra = htole64(p);
1733 else {
4fd052ae 1734 le64_t i;
de190aef 1735
7be3aa17 1736 i = htole64(le64toh(*idx) - 1);
de190aef
LP
1737 r = link_entry_into_array(f, first, &i, p);
1738 if (r < 0)
1739 return r;
cec736d2
LP
1740 }
1741
de190aef
LP
1742 *idx = htole64(le64toh(*idx) + 1);
1743 return 0;
1744}
1745
1746static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1747 uint64_t p;
1748 int r;
1749 assert(f);
1750 assert(o);
1751 assert(offset > 0);
1752
1753 p = le64toh(o->entry.items[i].object_offset);
1754 if (p == 0)
1755 return -EINVAL;
1756
1757 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1758 if (r < 0)
1759 return r;
1760
de190aef
LP
1761 return link_entry_into_array_plus_one(f,
1762 &o->data.entry_offset,
1763 &o->data.entry_array_offset,
1764 &o->data.n_entries,
1765 offset);
cec736d2
LP
1766}
1767
1768static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
de190aef 1769 uint64_t n, i;
cec736d2
LP
1770 int r;
1771
1772 assert(f);
c88cc6af 1773 assert(f->header);
cec736d2
LP
1774 assert(o);
1775 assert(offset > 0);
b588975f
LP
1776
1777 if (o->object.type != OBJECT_ENTRY)
1778 return -EINVAL;
cec736d2 1779
b788cc23
LP
1780 __sync_synchronize();
1781
cec736d2 1782 /* Link up the entry itself */
de190aef
LP
1783 r = link_entry_into_array(f,
1784 &f->header->entry_array_offset,
1785 &f->header->n_entries,
1786 offset);
1787 if (r < 0)
1788 return r;
cec736d2 1789
507f22bd 1790 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
cec736d2 1791
de190aef 1792 if (f->header->head_entry_realtime == 0)
0ac38b70 1793 f->header->head_entry_realtime = o->entry.realtime;
cec736d2 1794
0ac38b70 1795 f->header->tail_entry_realtime = o->entry.realtime;
de190aef
LP
1796 f->header->tail_entry_monotonic = o->entry.monotonic;
1797
cec736d2
LP
1798 /* Link up the items */
1799 n = journal_file_entry_n_items(o);
1800 for (i = 0; i < n; i++) {
1801 r = journal_file_link_entry_item(f, o, offset, i);
1802 if (r < 0)
1803 return r;
1804 }
1805
cec736d2
LP
1806 return 0;
1807}
1808
1809static int journal_file_append_entry_internal(
1810 JournalFile *f,
1811 const dual_timestamp *ts,
1812 uint64_t xor_hash,
1813 const EntryItem items[], unsigned n_items,
de190aef 1814 uint64_t *seqnum,
cec736d2
LP
1815 Object **ret, uint64_t *offset) {
1816 uint64_t np;
1817 uint64_t osize;
1818 Object *o;
1819 int r;
1820
1821 assert(f);
c88cc6af 1822 assert(f->header);
cec736d2 1823 assert(items || n_items == 0);
de190aef 1824 assert(ts);
cec736d2
LP
1825
1826 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1827
de190aef 1828 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
cec736d2
LP
1829 if (r < 0)
1830 return r;
1831
d98cc1f2 1832 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
75f32f04 1833 memcpy_safe(o->entry.items, items, n_items * sizeof(EntryItem));
de190aef
LP
1834 o->entry.realtime = htole64(ts->realtime);
1835 o->entry.monotonic = htole64(ts->monotonic);
cec736d2
LP
1836 o->entry.xor_hash = htole64(xor_hash);
1837 o->entry.boot_id = f->header->boot_id;
1838
349cc4a5 1839#if HAVE_GCRYPT
5996c7c2 1840 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
b0af6f41
LP
1841 if (r < 0)
1842 return r;
feb12d3e 1843#endif
b0af6f41 1844
cec736d2
LP
1845 r = journal_file_link_entry(f, o, np);
1846 if (r < 0)
1847 return r;
1848
1849 if (ret)
1850 *ret = o;
1851
1852 if (offset)
1853 *offset = np;
1854
1855 return 0;
1856}
1857
cf244689 1858void journal_file_post_change(JournalFile *f) {
50f20cfd
LP
1859 assert(f);
1860
1861 /* inotify() does not receive IN_MODIFY events from file
1862 * accesses done via mmap(). After each access we hence
1863 * trigger IN_MODIFY by truncating the journal file to its
1864 * current size which triggers IN_MODIFY. */
1865
bc85bfee
LP
1866 __sync_synchronize();
1867
50f20cfd 1868 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
e167d7fd 1869 log_debug_errno(errno, "Failed to truncate file to its own size: %m");
50f20cfd
LP
1870}
1871
7a24f3bf
VC
1872static int post_change_thunk(sd_event_source *timer, uint64_t usec, void *userdata) {
1873 assert(userdata);
1874
1875 journal_file_post_change(userdata);
1876
1877 return 1;
1878}
1879
1880static void schedule_post_change(JournalFile *f) {
1881 sd_event_source *timer;
1882 int enabled, r;
1883 uint64_t now;
1884
1885 assert(f);
1886 assert(f->post_change_timer);
1887
1888 timer = f->post_change_timer;
1889
1890 r = sd_event_source_get_enabled(timer, &enabled);
1891 if (r < 0) {
e167d7fd
LP
1892 log_debug_errno(r, "Failed to get ftruncate timer state: %m");
1893 goto fail;
7a24f3bf
VC
1894 }
1895
1896 if (enabled == SD_EVENT_ONESHOT)
1897 return;
1898
1899 r = sd_event_now(sd_event_source_get_event(timer), CLOCK_MONOTONIC, &now);
1900 if (r < 0) {
e167d7fd
LP
1901 log_debug_errno(r, "Failed to get clock's now for scheduling ftruncate: %m");
1902 goto fail;
7a24f3bf
VC
1903 }
1904
1905 r = sd_event_source_set_time(timer, now+f->post_change_timer_period);
1906 if (r < 0) {
e167d7fd
LP
1907 log_debug_errno(r, "Failed to set time for scheduling ftruncate: %m");
1908 goto fail;
7a24f3bf
VC
1909 }
1910
1911 r = sd_event_source_set_enabled(timer, SD_EVENT_ONESHOT);
1912 if (r < 0) {
e167d7fd
LP
1913 log_debug_errno(r, "Failed to enable scheduled ftruncate: %m");
1914 goto fail;
7a24f3bf 1915 }
e167d7fd
LP
1916
1917 return;
1918
1919fail:
1920 /* On failure, let's simply post the change immediately. */
1921 journal_file_post_change(f);
7a24f3bf
VC
1922}
1923
1924/* Enable coalesced change posting in a timer on the provided sd_event instance */
1925int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t) {
1926 _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL;
1927 int r;
1928
1929 assert(f);
1930 assert_return(!f->post_change_timer, -EINVAL);
1931 assert(e);
1932 assert(t);
1933
1934 r = sd_event_add_time(e, &timer, CLOCK_MONOTONIC, 0, 0, post_change_thunk, f);
1935 if (r < 0)
1936 return r;
1937
1938 r = sd_event_source_set_enabled(timer, SD_EVENT_OFF);
1939 if (r < 0)
1940 return r;
1941
1942 f->post_change_timer = timer;
1943 timer = NULL;
1944 f->post_change_timer_period = t;
1945
1946 return r;
1947}
1948
1f2da9ec
LP
1949static int entry_item_cmp(const void *_a, const void *_b) {
1950 const EntryItem *a = _a, *b = _b;
1951
1952 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1953 return -1;
1954 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1955 return 1;
1956 return 0;
1957}
1958
de190aef 1959int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
cec736d2
LP
1960 unsigned i;
1961 EntryItem *items;
1962 int r;
1963 uint64_t xor_hash = 0;
de190aef 1964 struct dual_timestamp _ts;
cec736d2
LP
1965
1966 assert(f);
c88cc6af 1967 assert(f->header);
cec736d2
LP
1968 assert(iovec || n_iovec == 0);
1969
de190aef
LP
1970 if (!ts) {
1971 dual_timestamp_get(&_ts);
1972 ts = &_ts;
1973 }
1974
349cc4a5 1975#if HAVE_GCRYPT
7560fffc
LP
1976 r = journal_file_maybe_append_tag(f, ts->realtime);
1977 if (r < 0)
1978 return r;
feb12d3e 1979#endif
7560fffc 1980
64825d3c 1981 /* alloca() can't take 0, hence let's allocate at least one */
9607d947 1982 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
cec736d2
LP
1983
1984 for (i = 0; i < n_iovec; i++) {
1985 uint64_t p;
1986 Object *o;
1987
1988 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1989 if (r < 0)
cf244689 1990 return r;
cec736d2
LP
1991
1992 xor_hash ^= le64toh(o->data.hash);
1993 items[i].object_offset = htole64(p);
de7b95cd 1994 items[i].hash = o->data.hash;
cec736d2
LP
1995 }
1996
1f2da9ec
LP
1997 /* Order by the position on disk, in order to improve seek
1998 * times for rotating media. */
7ff7394d 1999 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1f2da9ec 2000
de190aef 2001 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
cec736d2 2002
fa6ac760
LP
2003 /* If the memory mapping triggered a SIGBUS then we return an
2004 * IO error and ignore the error code passed down to us, since
2005 * it is very likely just an effect of a nullified replacement
2006 * mapping page */
2007
be7cdd8e 2008 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
2009 r = -EIO;
2010
7a24f3bf
VC
2011 if (f->post_change_timer)
2012 schedule_post_change(f);
2013 else
2014 journal_file_post_change(f);
50f20cfd 2015
cec736d2
LP
2016 return r;
2017}
2018
/* One cached position inside a chain of entry array objects. Items are kept
 * in a hashmap keyed by the 'first' member. */
typedef struct ChainCacheItem {
        uint64_t first;      /* offset of the array the chain starts with (hashmap key) */
        uint64_t array;      /* offset of the array this item caches */
        uint64_t begin;      /* first item stored in the cached array */
        uint64_t total;      /* number of items in all arrays of the chain preceding the cached one */
        uint64_t last_index; /* index looked at last, to improve locality when bisecting */
} ChainCacheItem;
2026
2027static void chain_cache_put(
4743015d 2028 OrderedHashmap *h,
a4bcff5b
LP
2029 ChainCacheItem *ci,
2030 uint64_t first,
2031 uint64_t array,
2032 uint64_t begin,
f268980d
LP
2033 uint64_t total,
2034 uint64_t last_index) {
a4bcff5b
LP
2035
2036 if (!ci) {
34741aa3
LP
2037 /* If the chain item to cache for this chain is the
2038 * first one it's not worth caching anything */
2039 if (array == first)
2040 return;
2041
29433089 2042 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
4743015d 2043 ci = ordered_hashmap_steal_first(h);
29433089
LP
2044 assert(ci);
2045 } else {
a4bcff5b
LP
2046 ci = new(ChainCacheItem, 1);
2047 if (!ci)
2048 return;
2049 }
2050
2051 ci->first = first;
2052
4743015d 2053 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
a4bcff5b
LP
2054 free(ci);
2055 return;
2056 }
2057 } else
2058 assert(ci->first == first);
2059
2060 ci->array = array;
2061 ci->begin = begin;
2062 ci->total = total;
f268980d 2063 ci->last_index = last_index;
a4bcff5b
LP
2064}
2065
f268980d
LP
2066static int generic_array_get(
2067 JournalFile *f,
2068 uint64_t first,
2069 uint64_t i,
2070 Object **ret, uint64_t *offset) {
de190aef 2071
cec736d2 2072 Object *o;
a4bcff5b 2073 uint64_t p = 0, a, t = 0;
cec736d2 2074 int r;
a4bcff5b 2075 ChainCacheItem *ci;
cec736d2
LP
2076
2077 assert(f);
2078
de190aef 2079 a = first;
a4bcff5b
LP
2080
2081 /* Try the chain cache first */
4743015d 2082 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
2083 if (ci && i > ci->total) {
2084 a = ci->array;
2085 i -= ci->total;
2086 t = ci->total;
2087 }
2088
de190aef 2089 while (a > 0) {
a4bcff5b 2090 uint64_t k;
cec736d2 2091
de190aef
LP
2092 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
2093 if (r < 0)
2094 return r;
cec736d2 2095
a4bcff5b
LP
2096 k = journal_file_entry_array_n_items(o);
2097 if (i < k) {
de190aef 2098 p = le64toh(o->entry_array.items[i]);
a4bcff5b 2099 goto found;
cec736d2
LP
2100 }
2101
a4bcff5b
LP
2102 i -= k;
2103 t += k;
de190aef
LP
2104 a = le64toh(o->entry_array.next_entry_array_offset);
2105 }
2106
a4bcff5b
LP
2107 return 0;
2108
2109found:
2110 /* Let's cache this item for the next invocation */
af13a6b0 2111 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
de190aef
LP
2112
2113 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2114 if (r < 0)
2115 return r;
2116
2117 if (ret)
2118 *ret = o;
2119
2120 if (offset)
2121 *offset = p;
2122
2123 return 1;
2124}
2125
f268980d
LP
2126static int generic_array_get_plus_one(
2127 JournalFile *f,
2128 uint64_t extra,
2129 uint64_t first,
2130 uint64_t i,
2131 Object **ret, uint64_t *offset) {
de190aef
LP
2132
2133 Object *o;
2134
2135 assert(f);
2136
2137 if (i == 0) {
2138 int r;
2139
2140 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
cec736d2
LP
2141 if (r < 0)
2142 return r;
2143
de190aef
LP
2144 if (ret)
2145 *ret = o;
cec736d2 2146
de190aef
LP
2147 if (offset)
2148 *offset = extra;
cec736d2 2149
de190aef 2150 return 1;
cec736d2
LP
2151 }
2152
de190aef
LP
2153 return generic_array_get(f, first, i-1, ret, offset);
2154}
cec736d2 2155
de190aef
LP
/* Results returned by the test_object callbacks used for bisection */
enum {
        TEST_FOUND = 0, /* the tested entry matches the needle */
        TEST_LEFT  = 1, /* the tested entry sorts before the needle */
        TEST_RIGHT = 2  /* the tested entry sorts after the needle */
};
cec736d2 2161
f268980d
LP
2162static int generic_array_bisect(
2163 JournalFile *f,
2164 uint64_t first,
2165 uint64_t n,
2166 uint64_t needle,
2167 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2168 direction_t direction,
2169 Object **ret,
2170 uint64_t *offset,
2171 uint64_t *idx) {
2172
2173 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
de190aef
LP
2174 bool subtract_one = false;
2175 Object *o, *array = NULL;
2176 int r;
a4bcff5b 2177 ChainCacheItem *ci;
cec736d2 2178
de190aef
LP
2179 assert(f);
2180 assert(test_object);
cec736d2 2181
a4bcff5b 2182 /* Start with the first array in the chain */
de190aef 2183 a = first;
a4bcff5b 2184
4743015d 2185 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
2186 if (ci && n > ci->total) {
2187 /* Ah, we have iterated this bisection array chain
2188 * previously! Let's see if we can skip ahead in the
2189 * chain, as far as the last time. But we can't jump
2190 * backwards in the chain, so let's check that
2191 * first. */
2192
2193 r = test_object(f, ci->begin, needle);
2194 if (r < 0)
2195 return r;
2196
2197 if (r == TEST_LEFT) {
f268980d 2198 /* OK, what we are looking for is right of the
a4bcff5b
LP
2199 * begin of this EntryArray, so let's jump
2200 * straight to previously cached array in the
2201 * chain */
2202
2203 a = ci->array;
2204 n -= ci->total;
2205 t = ci->total;
f268980d 2206 last_index = ci->last_index;
a4bcff5b
LP
2207 }
2208 }
2209
de190aef
LP
2210 while (a > 0) {
2211 uint64_t left, right, k, lp;
2212
2213 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
cec736d2
LP
2214 if (r < 0)
2215 return r;
2216
de190aef
LP
2217 k = journal_file_entry_array_n_items(array);
2218 right = MIN(k, n);
2219 if (right <= 0)
2220 return 0;
cec736d2 2221
de190aef
LP
2222 i = right - 1;
2223 lp = p = le64toh(array->entry_array.items[i]);
2224 if (p <= 0)
bee6a291
LP
2225 r = -EBADMSG;
2226 else
2227 r = test_object(f, p, needle);
2228 if (r == -EBADMSG) {
2229 log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (1)");
2230 n = i;
2231 continue;
2232 }
de190aef
LP
2233 if (r < 0)
2234 return r;
cec736d2 2235
de190aef
LP
2236 if (r == TEST_FOUND)
2237 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2238
2239 if (r == TEST_RIGHT) {
2240 left = 0;
2241 right -= 1;
f268980d
LP
2242
2243 if (last_index != (uint64_t) -1) {
2244 assert(last_index <= right);
2245
2246 /* If we cached the last index we
2247 * looked at, let's try to not to jump
2248 * too wildly around and see if we can
2249 * limit the range to look at early to
2250 * the immediate neighbors of the last
2251 * index we looked at. */
2252
2253 if (last_index > 0) {
2254 uint64_t x = last_index - 1;
2255
2256 p = le64toh(array->entry_array.items[x]);
2257 if (p <= 0)
2258 return -EBADMSG;
2259
2260 r = test_object(f, p, needle);
2261 if (r < 0)
2262 return r;
2263
2264 if (r == TEST_FOUND)
2265 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2266
2267 if (r == TEST_RIGHT)
2268 right = x;
2269 else
2270 left = x + 1;
2271 }
2272
2273 if (last_index < right) {
2274 uint64_t y = last_index + 1;
2275
2276 p = le64toh(array->entry_array.items[y]);
2277 if (p <= 0)
2278 return -EBADMSG;
2279
2280 r = test_object(f, p, needle);
2281 if (r < 0)
2282 return r;
2283
2284 if (r == TEST_FOUND)
2285 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2286
2287 if (r == TEST_RIGHT)
2288 right = y;
2289 else
2290 left = y + 1;
2291 }
f268980d
LP
2292 }
2293
de190aef
LP
2294 for (;;) {
2295 if (left == right) {
2296 if (direction == DIRECTION_UP)
2297 subtract_one = true;
2298
2299 i = left;
2300 goto found;
2301 }
2302
2303 assert(left < right);
de190aef 2304 i = (left + right) / 2;
f268980d 2305
de190aef
LP
2306 p = le64toh(array->entry_array.items[i]);
2307 if (p <= 0)
bee6a291
LP
2308 r = -EBADMSG;
2309 else
2310 r = test_object(f, p, needle);
2311 if (r == -EBADMSG) {
2312 log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (2)");
2313 right = n = i;
2314 continue;
2315 }
de190aef
LP
2316 if (r < 0)
2317 return r;
cec736d2 2318
de190aef
LP
2319 if (r == TEST_FOUND)
2320 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2321
2322 if (r == TEST_RIGHT)
2323 right = i;
2324 else
2325 left = i + 1;
2326 }
2327 }
2328
2173cbf8 2329 if (k >= n) {
cbdca852
LP
2330 if (direction == DIRECTION_UP) {
2331 i = n;
2332 subtract_one = true;
2333 goto found;
2334 }
2335
cec736d2 2336 return 0;
cbdca852 2337 }
cec736d2 2338
de190aef
LP
2339 last_p = lp;
2340
2341 n -= k;
2342 t += k;
f268980d 2343 last_index = (uint64_t) -1;
de190aef 2344 a = le64toh(array->entry_array.next_entry_array_offset);
cec736d2
LP
2345 }
2346
2347 return 0;
de190aef
LP
2348
2349found:
2350 if (subtract_one && t == 0 && i == 0)
2351 return 0;
2352
a4bcff5b 2353 /* Let's cache this item for the next invocation */
af13a6b0 2354 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
a4bcff5b 2355
de190aef
LP
2356 if (subtract_one && i == 0)
2357 p = last_p;
2358 else if (subtract_one)
2359 p = le64toh(array->entry_array.items[i-1]);
2360 else
2361 p = le64toh(array->entry_array.items[i]);
2362
2363 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2364 if (r < 0)
2365 return r;
2366
2367 if (ret)
2368 *ret = o;
2369
2370 if (offset)
2371 *offset = p;
2372
2373 if (idx)
cbdca852 2374 *idx = t + i + (subtract_one ? -1 : 0);
de190aef
LP
2375
2376 return 1;
cec736d2
LP
2377}
2378
f268980d
LP
2379static int generic_array_bisect_plus_one(
2380 JournalFile *f,
2381 uint64_t extra,
2382 uint64_t first,
2383 uint64_t n,
2384 uint64_t needle,
2385 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2386 direction_t direction,
2387 Object **ret,
2388 uint64_t *offset,
2389 uint64_t *idx) {
de190aef 2390
cec736d2 2391 int r;
cbdca852
LP
2392 bool step_back = false;
2393 Object *o;
cec736d2
LP
2394
2395 assert(f);
de190aef 2396 assert(test_object);
cec736d2 2397
de190aef
LP
2398 if (n <= 0)
2399 return 0;
cec736d2 2400
de190aef
LP
2401 /* This bisects the array in object 'first', but first checks
2402 * an extra */
de190aef
LP
2403 r = test_object(f, extra, needle);
2404 if (r < 0)
2405 return r;
a536e261
LP
2406
2407 if (r == TEST_FOUND)
2408 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2409
cbdca852
LP
2410 /* if we are looking with DIRECTION_UP then we need to first
2411 see if in the actual array there is a matching entry, and
2412 return the last one of that. But if there isn't any we need
2413 to return this one. Hence remember this, and return it
2414 below. */
2415 if (r == TEST_LEFT)
2416 step_back = direction == DIRECTION_UP;
de190aef 2417
cbdca852
LP
2418 if (r == TEST_RIGHT) {
2419 if (direction == DIRECTION_DOWN)
2420 goto found;
2421 else
2422 return 0;
a536e261 2423 }
cec736d2 2424
de190aef
LP
2425 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
2426
cbdca852
LP
2427 if (r == 0 && step_back)
2428 goto found;
2429
ecf68b1d 2430 if (r > 0 && idx)
313cefa1 2431 (*idx)++;
de190aef
LP
2432
2433 return r;
cbdca852
LP
2434
2435found:
2436 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
2437 if (r < 0)
2438 return r;
2439
2440 if (ret)
2441 *ret = o;
2442
2443 if (offset)
2444 *offset = extra;
2445
2446 if (idx)
2447 *idx = 0;
2448
2449 return 1;
2450}
2451
44a6b1b6 2452_pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
cbdca852
LP
2453 assert(f);
2454 assert(p > 0);
2455
2456 if (p == needle)
2457 return TEST_FOUND;
2458 else if (p < needle)
2459 return TEST_LEFT;
2460 else
2461 return TEST_RIGHT;
2462}
2463
de190aef
LP
2464static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
2465 Object *o;
2466 int r;
2467
2468 assert(f);
2469 assert(p > 0);
2470
2471 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
cec736d2
LP
2472 if (r < 0)
2473 return r;
2474
de190aef
LP
2475 if (le64toh(o->entry.seqnum) == needle)
2476 return TEST_FOUND;
2477 else if (le64toh(o->entry.seqnum) < needle)
2478 return TEST_LEFT;
2479 else
2480 return TEST_RIGHT;
2481}
cec736d2 2482
de190aef
LP
2483int journal_file_move_to_entry_by_seqnum(
2484 JournalFile *f,
2485 uint64_t seqnum,
2486 direction_t direction,
2487 Object **ret,
2488 uint64_t *offset) {
c88cc6af
VC
2489 assert(f);
2490 assert(f->header);
de190aef
LP
2491
2492 return generic_array_bisect(f,
2493 le64toh(f->header->entry_array_offset),
2494 le64toh(f->header->n_entries),
2495 seqnum,
2496 test_object_seqnum,
2497 direction,
2498 ret, offset, NULL);
2499}
cec736d2 2500
de190aef
LP
2501static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
2502 Object *o;
2503 int r;
2504
2505 assert(f);
2506 assert(p > 0);
2507
2508 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2509 if (r < 0)
2510 return r;
2511
2512 if (le64toh(o->entry.realtime) == needle)
2513 return TEST_FOUND;
2514 else if (le64toh(o->entry.realtime) < needle)
2515 return TEST_LEFT;
2516 else
2517 return TEST_RIGHT;
cec736d2
LP
2518}
2519
de190aef
LP
2520int journal_file_move_to_entry_by_realtime(
2521 JournalFile *f,
2522 uint64_t realtime,
2523 direction_t direction,
2524 Object **ret,
2525 uint64_t *offset) {
c88cc6af
VC
2526 assert(f);
2527 assert(f->header);
de190aef
LP
2528
2529 return generic_array_bisect(f,
2530 le64toh(f->header->entry_array_offset),
2531 le64toh(f->header->n_entries),
2532 realtime,
2533 test_object_realtime,
2534 direction,
2535 ret, offset, NULL);
2536}
2537
2538static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
2539 Object *o;
2540 int r;
2541
2542 assert(f);
2543 assert(p > 0);
2544
2545 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2546 if (r < 0)
2547 return r;
2548
2549 if (le64toh(o->entry.monotonic) == needle)
2550 return TEST_FOUND;
2551 else if (le64toh(o->entry.monotonic) < needle)
2552 return TEST_LEFT;
2553 else
2554 return TEST_RIGHT;
2555}
2556
2a560338 2557static int find_data_object_by_boot_id(
47838ab3
ZJS
2558 JournalFile *f,
2559 sd_id128_t boot_id,
2560 Object **o,
2561 uint64_t *b) {
2a560338 2562
fbd0b64f 2563 char t[STRLEN("_BOOT_ID=") + 32 + 1] = "_BOOT_ID=";
47838ab3
ZJS
2564
2565 sd_id128_to_string(boot_id, t + 9);
2566 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2567}
2568
de190aef
LP
2569int journal_file_move_to_entry_by_monotonic(
2570 JournalFile *f,
2571 sd_id128_t boot_id,
2572 uint64_t monotonic,
2573 direction_t direction,
2574 Object **ret,
2575 uint64_t *offset) {
2576
de190aef
LP
2577 Object *o;
2578 int r;
2579
cbdca852 2580 assert(f);
de190aef 2581
47838ab3 2582 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
de190aef
LP
2583 if (r < 0)
2584 return r;
cbdca852 2585 if (r == 0)
de190aef
LP
2586 return -ENOENT;
2587
2588 return generic_array_bisect_plus_one(f,
2589 le64toh(o->data.entry_offset),
2590 le64toh(o->data.entry_array_offset),
2591 le64toh(o->data.n_entries),
2592 monotonic,
2593 test_object_monotonic,
2594 direction,
2595 ret, offset, NULL);
2596}
2597
1fc605b0 2598void journal_file_reset_location(JournalFile *f) {
6573ef05 2599 f->location_type = LOCATION_HEAD;
1fc605b0 2600 f->current_offset = 0;
6573ef05
MS
2601 f->current_seqnum = 0;
2602 f->current_realtime = 0;
2603 f->current_monotonic = 0;
2604 zero(f->current_boot_id);
2605 f->current_xor_hash = 0;
2606}
2607
950c07d4 2608void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
6573ef05
MS
2609 f->location_type = LOCATION_SEEK;
2610 f->current_offset = offset;
2611 f->current_seqnum = le64toh(o->entry.seqnum);
2612 f->current_realtime = le64toh(o->entry.realtime);
2613 f->current_monotonic = le64toh(o->entry.monotonic);
2614 f->current_boot_id = o->entry.boot_id;
2615 f->current_xor_hash = le64toh(o->entry.xor_hash);
1fc605b0
MS
2616}
2617
d8ae66d7
MS
2618int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2619 assert(af);
c88cc6af 2620 assert(af->header);
d8ae66d7 2621 assert(bf);
c88cc6af 2622 assert(bf->header);
d8ae66d7
MS
2623 assert(af->location_type == LOCATION_SEEK);
2624 assert(bf->location_type == LOCATION_SEEK);
2625
2626 /* If contents and timestamps match, these entries are
2627 * identical, even if the seqnum does not match */
2628 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2629 af->current_monotonic == bf->current_monotonic &&
2630 af->current_realtime == bf->current_realtime &&
2631 af->current_xor_hash == bf->current_xor_hash)
2632 return 0;
2633
2634 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2635
2636 /* If this is from the same seqnum source, compare
2637 * seqnums */
2638 if (af->current_seqnum < bf->current_seqnum)
2639 return -1;
2640 if (af->current_seqnum > bf->current_seqnum)
2641 return 1;
2642
2643 /* Wow! This is weird, different data but the same
2644 * seqnums? Something is borked, but let's make the
2645 * best of it and compare by time. */
2646 }
2647
2648 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2649
2650 /* If the boot id matches, compare monotonic time */
2651 if (af->current_monotonic < bf->current_monotonic)
2652 return -1;
2653 if (af->current_monotonic > bf->current_monotonic)
2654 return 1;
2655 }
2656
2657 /* Otherwise, compare UTC time */
2658 if (af->current_realtime < bf->current_realtime)
2659 return -1;
2660 if (af->current_realtime > bf->current_realtime)
2661 return 1;
2662
2663 /* Finally, compare by contents */
2664 if (af->current_xor_hash < bf->current_xor_hash)
2665 return -1;
2666 if (af->current_xor_hash > bf->current_xor_hash)
2667 return 1;
2668
2669 return 0;
2670}
2671
aa598ba5
LP
2672static int bump_array_index(uint64_t *i, direction_t direction, uint64_t n) {
2673
2674 /* Increase or decrease the specified index, in the right direction. */
2675
2676 if (direction == DIRECTION_DOWN) {
2677 if (*i >= n - 1)
2678 return 0;
2679
2680 (*i) ++;
2681 } else {
2682 if (*i <= 0)
2683 return 0;
2684
2685 (*i) --;
2686 }
2687
2688 return 1;
2689}
2690
b6da4ed0
LP
2691static bool check_properly_ordered(uint64_t new_offset, uint64_t old_offset, direction_t direction) {
2692
2693 /* Consider it an error if any of the two offsets is uninitialized */
2694 if (old_offset == 0 || new_offset == 0)
2695 return false;
2696
2697 /* If we go down, the new offset must be larger than the old one. */
2698 return direction == DIRECTION_DOWN ?
2699 new_offset > old_offset :
2700 new_offset < old_offset;
2701}
2702
de190aef
LP
2703int journal_file_next_entry(
2704 JournalFile *f,
f534928a 2705 uint64_t p,
de190aef
LP
2706 direction_t direction,
2707 Object **ret, uint64_t *offset) {
2708
fb099c8d 2709 uint64_t i, n, ofs;
cec736d2
LP
2710 int r;
2711
2712 assert(f);
c88cc6af 2713 assert(f->header);
de190aef
LP
2714
2715 n = le64toh(f->header->n_entries);
2716 if (n <= 0)
2717 return 0;
cec736d2 2718
f534928a 2719 if (p == 0)
de190aef 2720 i = direction == DIRECTION_DOWN ? 0 : n - 1;
cec736d2 2721 else {
de190aef
LP
2722 r = generic_array_bisect(f,
2723 le64toh(f->header->entry_array_offset),
2724 le64toh(f->header->n_entries),
2725 p,
2726 test_object_offset,
2727 DIRECTION_DOWN,
2728 NULL, NULL,
2729 &i);
2730 if (r <= 0)
2731 return r;
2732
aa598ba5
LP
2733 r = bump_array_index(&i, direction, n);
2734 if (r <= 0)
2735 return r;
cec736d2
LP
2736 }
2737
de190aef 2738 /* And jump to it */
989793d3
LP
2739 for (;;) {
2740 r = generic_array_get(f,
2741 le64toh(f->header->entry_array_offset),
2742 i,
2743 ret, &ofs);
2744 if (r > 0)
2745 break;
2746 if (r != -EBADMSG)
2747 return r;
2748
2749 /* OK, so this entry is borked. Most likely some entry didn't get synced to disk properly, let's see if
2750 * the next one might work for us instead. */
2751 log_debug_errno(r, "Entry item %" PRIu64 " is bad, skipping over it.", i);
2752
2753 r = bump_array_index(&i, direction, n);
2754 if (r <= 0)
2755 return r;
caeab8f6 2756 }
fb099c8d 2757
b6da4ed0
LP
2758 /* Ensure our array is properly ordered. */
2759 if (p > 0 && !check_properly_ordered(ofs, p, direction)) {
2760 log_debug("%s: entry array not properly ordered at entry %" PRIu64, f->path, i);
fb099c8d
ZJS
2761 return -EBADMSG;
2762 }
2763
2764 if (offset)
2765 *offset = ofs;
2766
2767 return 1;
de190aef 2768}
cec736d2 2769
de190aef
LP
2770int journal_file_next_entry_for_data(
2771 JournalFile *f,
2772 Object *o, uint64_t p,
2773 uint64_t data_offset,
2774 direction_t direction,
2775 Object **ret, uint64_t *offset) {
2776
ded5034e 2777 uint64_t i, n, ofs;
de190aef 2778 Object *d;
989793d3 2779 int r;
cec736d2
LP
2780
2781 assert(f);
de190aef 2782 assert(p > 0 || !o);
cec736d2 2783
de190aef 2784 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
466ccd92 2785 if (r < 0)
de190aef 2786 return r;
cec736d2 2787
de190aef
LP
2788 n = le64toh(d->data.n_entries);
2789 if (n <= 0)
2790 return n;
cec736d2 2791
de190aef
LP
2792 if (!o)
2793 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2794 else {
2795 if (o->object.type != OBJECT_ENTRY)
2796 return -EINVAL;
cec736d2 2797
de190aef
LP
2798 r = generic_array_bisect_plus_one(f,
2799 le64toh(d->data.entry_offset),
2800 le64toh(d->data.entry_array_offset),
2801 le64toh(d->data.n_entries),
2802 p,
2803 test_object_offset,
2804 DIRECTION_DOWN,
2805 NULL, NULL,
2806 &i);
2807
2808 if (r <= 0)
cec736d2
LP
2809 return r;
2810
aa598ba5
LP
2811 r = bump_array_index(&i, direction, n);
2812 if (r <= 0)
2813 return r;
de190aef 2814 }
cec736d2 2815
989793d3
LP
2816 for (;;) {
2817 r = generic_array_get_plus_one(f,
2818 le64toh(d->data.entry_offset),
2819 le64toh(d->data.entry_array_offset),
2820 i,
2821 ret, &ofs);
2822 if (r > 0)
2823 break;
2824 if (r != -EBADMSG)
2825 return r;
2826
2827 log_debug_errno(r, "Data entry item %" PRIu64 " is bad, skipping over it.", i);
2828
2829 r = bump_array_index(&i, direction, n);
2830 if (r <= 0)
2831 return r;
2832 }
ded5034e
LP
2833
2834 /* Ensure our array is properly ordered. */
2835 if (p > 0 && check_properly_ordered(ofs, p, direction)) {
2836 log_debug("%s data entry array not properly ordered at entry %" PRIu64, f->path, i);
2837 return -EBADMSG;
2838 }
2839
2840 if (offset)
2841 *offset = ofs;
2842
2843 return 1;
de190aef 2844}
cec736d2 2845
cbdca852
LP
2846int journal_file_move_to_entry_by_offset_for_data(
2847 JournalFile *f,
2848 uint64_t data_offset,
2849 uint64_t p,
2850 direction_t direction,
2851 Object **ret, uint64_t *offset) {
2852
2853 int r;
2854 Object *d;
2855
2856 assert(f);
2857
2858 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2859 if (r < 0)
2860 return r;
2861
2862 return generic_array_bisect_plus_one(f,
2863 le64toh(d->data.entry_offset),
2864 le64toh(d->data.entry_array_offset),
2865 le64toh(d->data.n_entries),
2866 p,
2867 test_object_offset,
2868 direction,
2869 ret, offset, NULL);
2870}
2871
2872int journal_file_move_to_entry_by_monotonic_for_data(
2873 JournalFile *f,
2874 uint64_t data_offset,
2875 sd_id128_t boot_id,
2876 uint64_t monotonic,
2877 direction_t direction,
2878 Object **ret, uint64_t *offset) {
2879
cbdca852
LP
2880 Object *o, *d;
2881 int r;
2882 uint64_t b, z;
2883
2884 assert(f);
2885
2886 /* First, seek by time */
47838ab3 2887 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
cbdca852
LP
2888 if (r < 0)
2889 return r;
2890 if (r == 0)
2891 return -ENOENT;
2892
2893 r = generic_array_bisect_plus_one(f,
2894 le64toh(o->data.entry_offset),
2895 le64toh(o->data.entry_array_offset),
2896 le64toh(o->data.n_entries),
2897 monotonic,
2898 test_object_monotonic,
2899 direction,
2900 NULL, &z, NULL);
2901 if (r <= 0)
2902 return r;
2903
2904 /* And now, continue seeking until we find an entry that
2905 * exists in both bisection arrays */
2906
2907 for (;;) {
2908 Object *qo;
2909 uint64_t p, q;
2910
2911 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2912 if (r < 0)
2913 return r;
2914
2915 r = generic_array_bisect_plus_one(f,
2916 le64toh(d->data.entry_offset),
2917 le64toh(d->data.entry_array_offset),
2918 le64toh(d->data.n_entries),
2919 z,
2920 test_object_offset,
2921 direction,
2922 NULL, &p, NULL);
2923 if (r <= 0)
2924 return r;
2925
2926 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2927 if (r < 0)
2928 return r;
2929
2930 r = generic_array_bisect_plus_one(f,
2931 le64toh(o->data.entry_offset),
2932 le64toh(o->data.entry_array_offset),
2933 le64toh(o->data.n_entries),
2934 p,
2935 test_object_offset,
2936 direction,
2937 &qo, &q, NULL);
2938
2939 if (r <= 0)
2940 return r;
2941
2942 if (p == q) {
2943 if (ret)
2944 *ret = qo;
2945 if (offset)
2946 *offset = q;
2947
2948 return 1;
2949 }
2950
2951 z = q;
2952 }
cbdca852
LP
2953}
2954
de190aef
LP
2955int journal_file_move_to_entry_by_seqnum_for_data(
2956 JournalFile *f,
2957 uint64_t data_offset,
2958 uint64_t seqnum,
2959 direction_t direction,
2960 Object **ret, uint64_t *offset) {
cec736d2 2961
de190aef
LP
2962 Object *d;
2963 int r;
cec736d2 2964
91a31dde
LP
2965 assert(f);
2966
de190aef 2967 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2968 if (r < 0)
de190aef 2969 return r;
cec736d2 2970
de190aef
LP
2971 return generic_array_bisect_plus_one(f,
2972 le64toh(d->data.entry_offset),
2973 le64toh(d->data.entry_array_offset),
2974 le64toh(d->data.n_entries),
2975 seqnum,
2976 test_object_seqnum,
2977 direction,
2978 ret, offset, NULL);
2979}
cec736d2 2980
de190aef
LP
2981int journal_file_move_to_entry_by_realtime_for_data(
2982 JournalFile *f,
2983 uint64_t data_offset,
2984 uint64_t realtime,
2985 direction_t direction,
2986 Object **ret, uint64_t *offset) {
2987
2988 Object *d;
2989 int r;
2990
91a31dde
LP
2991 assert(f);
2992
de190aef 2993 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2994 if (r < 0)
de190aef
LP
2995 return r;
2996
2997 return generic_array_bisect_plus_one(f,
2998 le64toh(d->data.entry_offset),
2999 le64toh(d->data.entry_array_offset),
3000 le64toh(d->data.n_entries),
3001 realtime,
3002 test_object_realtime,
3003 direction,
3004 ret, offset, NULL);
cec736d2
LP
3005}
3006
0284adc6 3007void journal_file_dump(JournalFile *f) {
7560fffc 3008 Object *o;
7560fffc 3009 int r;
0284adc6 3010 uint64_t p;
7560fffc
LP
3011
3012 assert(f);
c88cc6af 3013 assert(f->header);
7560fffc 3014
0284adc6 3015 journal_file_print_header(f);
7560fffc 3016
0284adc6
LP
3017 p = le64toh(f->header->header_size);
3018 while (p != 0) {
d05089d8 3019 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
0284adc6
LP
3020 if (r < 0)
3021 goto fail;
7560fffc 3022
0284adc6 3023 switch (o->object.type) {
d98cc1f2 3024
0284adc6
LP
3025 case OBJECT_UNUSED:
3026 printf("Type: OBJECT_UNUSED\n");
3027 break;
d98cc1f2 3028
0284adc6
LP
3029 case OBJECT_DATA:
3030 printf("Type: OBJECT_DATA\n");
3031 break;
7560fffc 3032
3c1668da
LP
3033 case OBJECT_FIELD:
3034 printf("Type: OBJECT_FIELD\n");
3035 break;
3036
0284adc6 3037 case OBJECT_ENTRY:
507f22bd
ZJS
3038 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
3039 le64toh(o->entry.seqnum),
3040 le64toh(o->entry.monotonic),
3041 le64toh(o->entry.realtime));
0284adc6 3042 break;
7560fffc 3043
0284adc6
LP
3044 case OBJECT_FIELD_HASH_TABLE:
3045 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
3046 break;
7560fffc 3047
0284adc6
LP
3048 case OBJECT_DATA_HASH_TABLE:
3049 printf("Type: OBJECT_DATA_HASH_TABLE\n");
3050 break;
7560fffc 3051
0284adc6
LP
3052 case OBJECT_ENTRY_ARRAY:
3053 printf("Type: OBJECT_ENTRY_ARRAY\n");
3054 break;
7560fffc 3055
0284adc6 3056 case OBJECT_TAG:
507f22bd
ZJS
3057 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
3058 le64toh(o->tag.seqnum),
3059 le64toh(o->tag.epoch));
0284adc6 3060 break;
3c1668da
LP
3061
3062 default:
8facc349 3063 printf("Type: unknown (%i)\n", o->object.type);
3c1668da 3064 break;
0284adc6 3065 }
7560fffc 3066
d89c8fdf
ZJS
3067 if (o->object.flags & OBJECT_COMPRESSION_MASK)
3068 printf("Flags: %s\n",
3069 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
7560fffc 3070
0284adc6
LP
3071 if (p == le64toh(f->header->tail_object_offset))
3072 p = 0;
3073 else
3074 p = p + ALIGN64(le64toh(o->object.size));
3075 }
7560fffc 3076
0284adc6
LP
3077 return;
3078fail:
3079 log_error("File corrupt");
7560fffc
LP
3080}
3081
718fe4b1
ZJS
3082static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
3083 const char *x;
3084
3085 x = format_timestamp(buf, l, t);
3086 if (x)
3087 return x;
3088 return " --- ";
3089}
3090
0284adc6 3091void journal_file_print_header(JournalFile *f) {
2765b7bb 3092 char a[33], b[33], c[33], d[33];
ed375beb 3093 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
a1a03e30
LP
3094 struct stat st;
3095 char bytes[FORMAT_BYTES_MAX];
7560fffc
LP
3096
3097 assert(f);
c88cc6af 3098 assert(f->header);
7560fffc 3099
0284adc6
LP
3100 printf("File Path: %s\n"
3101 "File ID: %s\n"
3102 "Machine ID: %s\n"
3103 "Boot ID: %s\n"
3104 "Sequential Number ID: %s\n"
3105 "State: %s\n"
3106 "Compatible Flags:%s%s\n"
d89c8fdf 3107 "Incompatible Flags:%s%s%s\n"
507f22bd
ZJS
3108 "Header size: %"PRIu64"\n"
3109 "Arena size: %"PRIu64"\n"
3110 "Data Hash Table Size: %"PRIu64"\n"
3111 "Field Hash Table Size: %"PRIu64"\n"
0284adc6 3112 "Rotate Suggested: %s\n"
0808b92f
LP
3113 "Head Sequential Number: %"PRIu64" (%"PRIx64")\n"
3114 "Tail Sequential Number: %"PRIu64" (%"PRIx64")\n"
3115 "Head Realtime Timestamp: %s (%"PRIx64")\n"
3116 "Tail Realtime Timestamp: %s (%"PRIx64")\n"
3117 "Tail Monotonic Timestamp: %s (%"PRIx64")\n"
507f22bd
ZJS
3118 "Objects: %"PRIu64"\n"
3119 "Entry Objects: %"PRIu64"\n",
0284adc6
LP
3120 f->path,
3121 sd_id128_to_string(f->header->file_id, a),
3122 sd_id128_to_string(f->header->machine_id, b),
3123 sd_id128_to_string(f->header->boot_id, c),
2765b7bb 3124 sd_id128_to_string(f->header->seqnum_id, d),
3223f44f
LP
3125 f->header->state == STATE_OFFLINE ? "OFFLINE" :
3126 f->header->state == STATE_ONLINE ? "ONLINE" :
3127 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
8088cbd3 3128 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
d89c8fdf
ZJS
3129 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
3130 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
3131 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
3132 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
507f22bd
ZJS
3133 le64toh(f->header->header_size),
3134 le64toh(f->header->arena_size),
3135 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3136 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
fb0951b0 3137 yes_no(journal_file_rotate_suggested(f, 0)),
0808b92f
LP
3138 le64toh(f->header->head_entry_seqnum), le64toh(f->header->head_entry_seqnum),
3139 le64toh(f->header->tail_entry_seqnum), le64toh(f->header->tail_entry_seqnum),
3140 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)), le64toh(f->header->head_entry_realtime),
3141 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)), le64toh(f->header->tail_entry_realtime),
3142 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC), le64toh(f->header->tail_entry_monotonic),
507f22bd
ZJS
3143 le64toh(f->header->n_objects),
3144 le64toh(f->header->n_entries));
7560fffc 3145
0284adc6 3146 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
507f22bd 3147 printf("Data Objects: %"PRIu64"\n"
0284adc6 3148 "Data Hash Table Fill: %.1f%%\n",
507f22bd 3149 le64toh(f->header->n_data),
0284adc6 3150 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
7560fffc 3151
0284adc6 3152 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
507f22bd 3153 printf("Field Objects: %"PRIu64"\n"
0284adc6 3154 "Field Hash Table Fill: %.1f%%\n",
507f22bd 3155 le64toh(f->header->n_fields),
0284adc6 3156 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3223f44f
LP
3157
3158 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
507f22bd
ZJS
3159 printf("Tag Objects: %"PRIu64"\n",
3160 le64toh(f->header->n_tags));
3223f44f 3161 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
507f22bd
ZJS
3162 printf("Entry Array Objects: %"PRIu64"\n",
3163 le64toh(f->header->n_entry_arrays));
a1a03e30
LP
3164
3165 if (fstat(f->fd, &st) >= 0)
59f448cf 3166 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
7560fffc
LP
3167}
3168
fc68c929
LP
3169static int journal_file_warn_btrfs(JournalFile *f) {
3170 unsigned attrs;
3171 int r;
3172
3173 assert(f);
3174
3175 /* Before we write anything, check if the COW logic is turned
3176 * off on btrfs. Given our write pattern that is quite
3177 * unfriendly to COW file systems this should greatly improve
3178 * performance on COW file systems, such as btrfs, at the
3179 * expense of data integrity features (which shouldn't be too
3180 * bad, given that we do our own checksumming). */
3181
3182 r = btrfs_is_filesystem(f->fd);
3183 if (r < 0)
3184 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
3185 if (!r)
3186 return 0;
3187
3188 r = read_attr_fd(f->fd, &attrs);
3189 if (r < 0)
3190 return log_warning_errno(r, "Failed to read file attributes: %m");
3191
3192 if (attrs & FS_NOCOW_FL) {
3193 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
3194 return 0;
3195 }
3196
3197 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
3198 "This is likely to slow down journal access substantially, please consider turning "
3199 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
3200
3201 return 1;
3202}
3203
0284adc6 3204int journal_file_open(
5d1ce257 3205 int fd,
0284adc6
LP
3206 const char *fname,
3207 int flags,
3208 mode_t mode,
3209 bool compress,
57850536 3210 uint64_t compress_threshold_bytes,
baed47c3 3211 bool seal,
0284adc6
LP
3212 JournalMetrics *metrics,
3213 MMapCache *mmap_cache,
b58c888f 3214 Set *deferred_closes,
0284adc6
LP
3215 JournalFile *template,
3216 JournalFile **ret) {
7560fffc 3217
fa6ac760 3218 bool newly_created = false;
0284adc6 3219 JournalFile *f;
fa6ac760 3220 void *h;
0284adc6 3221 int r;
57850536 3222 char bytes[FORMAT_BYTES_MAX];
7560fffc 3223
0559d3a5 3224 assert(ret);
5d1ce257 3225 assert(fd >= 0 || fname);
7560fffc 3226
ec2ce0c5 3227 if (!IN_SET((flags & O_ACCMODE), O_RDONLY, O_RDWR))
0284adc6 3228 return -EINVAL;
7560fffc 3229
6eda13d3
LP
3230 if (fname && (flags & O_CREAT) && !endswith(fname, ".journal"))
3231 return -EINVAL;
7560fffc 3232
0284adc6
LP
3233 f = new0(JournalFile, 1);
3234 if (!f)
3235 return -ENOMEM;
7560fffc 3236
5d1ce257 3237 f->fd = fd;
0284adc6 3238 f->mode = mode;
7560fffc 3239
0284adc6
LP
3240 f->flags = flags;
3241 f->prot = prot_from_flags(flags);
3242 f->writable = (flags & O_ACCMODE) != O_RDONLY;
349cc4a5 3243#if HAVE_LZ4
d89c8fdf 3244 f->compress_lz4 = compress;
349cc4a5 3245#elif HAVE_XZ
d89c8fdf 3246 f->compress_xz = compress;
48b61739 3247#endif
57850536
AG
3248
3249 if (compress_threshold_bytes == (uint64_t) -1)
3250 f->compress_threshold_bytes = DEFAULT_COMPRESS_THRESHOLD;
3251 else
3252 f->compress_threshold_bytes = MAX(MIN_COMPRESS_THRESHOLD, compress_threshold_bytes);
3253
349cc4a5 3254#if HAVE_GCRYPT
baed47c3 3255 f->seal = seal;
49a32d43 3256#endif
7560fffc 3257
57850536
AG
3258 log_debug("Journal effective settings seal=%s compress=%s compress_threshold_bytes=%s",
3259 yes_no(f->seal), yes_no(JOURNAL_FILE_COMPRESS(f)),
3260 format_bytes(bytes, sizeof(bytes), f->compress_threshold_bytes));
3261
0284adc6
LP
3262 if (mmap_cache)
3263 f->mmap = mmap_cache_ref(mmap_cache);
3264 else {
84168d80 3265 f->mmap = mmap_cache_new();
0284adc6
LP
3266 if (!f->mmap) {
3267 r = -ENOMEM;
3268 goto fail;
3269 }
3270 }
7560fffc 3271
7645c77b 3272 if (fname) {
5d1ce257 3273 f->path = strdup(fname);
7645c77b
ZJS
3274 if (!f->path) {
3275 r = -ENOMEM;
3276 goto fail;
3277 }
3278 } else {
817b1c5b
LP
3279 assert(fd >= 0);
3280
7645c77b
ZJS
3281 /* If we don't know the path, fill in something explanatory and vaguely useful */
3282 if (asprintf(&f->path, "/proc/self/%i", fd) < 0) {
3283 r = -ENOMEM;
3284 goto fail;
3285 }
0284adc6 3286 }
7560fffc 3287
4743015d 3288 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
a4bcff5b
LP
3289 if (!f->chain_cache) {
3290 r = -ENOMEM;
3291 goto fail;
3292 }
3293
0284adc6 3294 if (f->fd < 0) {
817b1c5b
LP
3295 /* We pass O_NONBLOCK here, so that in case somebody pointed us to some character device node or FIFO
3296 * or so, we likely fail quickly than block for long. For regular files O_NONBLOCK has no effect, hence
3297 * it doesn't hurt in that case. */
3298
3299 f->fd = open(f->path, f->flags|O_CLOEXEC|O_NONBLOCK, f->mode);
5d1ce257
LP
3300 if (f->fd < 0) {
3301 r = -errno;
3302 goto fail;
3303 }
3304
3305 /* fds we opened here by us should also be closed by us. */
3306 f->close_fd = true;
817b1c5b
LP
3307
3308 r = fd_nonblock(f->fd, false);
3309 if (r < 0)
3310 goto fail;
7560fffc 3311 }
7560fffc 3312
be7cdd8e
VC
3313 f->cache_fd = mmap_cache_add_fd(f->mmap, f->fd);
3314 if (!f->cache_fd) {
3315 r = -ENOMEM;
3316 goto fail;
3317 }
3318
2678031a
LP
3319 r = journal_file_fstat(f);
3320 if (r < 0)
0284adc6 3321 goto fail;
7560fffc 3322
0284adc6 3323 if (f->last_stat.st_size == 0 && f->writable) {
11689d2a 3324
fc68c929 3325 (void) journal_file_warn_btrfs(f);
11689d2a 3326
4c2e1b39
LP
3327 /* Let's attach the creation time to the journal file, so that the vacuuming code knows the age of this
3328 * file even if the file might end up corrupted one day... Ideally we'd just use the creation time many
3329 * file systems maintain for each file, but the API to query this is very new, hence let's emulate this
3330 * via extended attributes. If extended attributes are not supported we'll just skip this, and rely
3331 * solely on mtime/atime/ctime of the file. */
3332 (void) fd_setcrtime(f->fd, 0);
7560fffc 3333
349cc4a5 3334#if HAVE_GCRYPT
0284adc6 3335 /* Try to load the FSPRG state, and if we can't, then
baed47c3 3336 * just don't do sealing */
49a32d43
LP
3337 if (f->seal) {
3338 r = journal_file_fss_load(f);
3339 if (r < 0)
3340 f->seal = false;
3341 }
feb12d3e 3342#endif
7560fffc 3343
0284adc6
LP
3344 r = journal_file_init_header(f, template);
3345 if (r < 0)
3346 goto fail;
7560fffc 3347
2678031a
LP
3348 r = journal_file_fstat(f);
3349 if (r < 0)
0284adc6 3350 goto fail;
fb0951b0
LP
3351
3352 newly_created = true;
0284adc6 3353 }
7560fffc 3354
0284adc6 3355 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
cfb571f3 3356 r = -ENODATA;
0284adc6
LP
3357 goto fail;
3358 }
7560fffc 3359
b42549ad 3360 r = mmap_cache_get(f->mmap, f->cache_fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h, NULL);
977eaa1e 3361 if (r < 0)
0284adc6 3362 goto fail;
7560fffc 3363
fa6ac760
LP
3364 f->header = h;
3365
0284adc6 3366 if (!newly_created) {
f9168190 3367 set_clear_with_destructor(deferred_closes, journal_file_close);
b58c888f 3368
0284adc6
LP
3369 r = journal_file_verify_header(f);
3370 if (r < 0)
3371 goto fail;
3372 }
7560fffc 3373
349cc4a5 3374#if HAVE_GCRYPT
0284adc6 3375 if (!newly_created && f->writable) {
baed47c3 3376 r = journal_file_fss_load(f);
0284adc6
LP
3377 if (r < 0)
3378 goto fail;
3379 }
feb12d3e 3380#endif
cec736d2
LP
3381
3382 if (f->writable) {
4a92baf3
LP
3383 if (metrics) {
3384 journal_default_metrics(metrics, f->fd);
3385 f->metrics = *metrics;
3386 } else if (template)
3387 f->metrics = template->metrics;
3388
cec736d2
LP
3389 r = journal_file_refresh_header(f);
3390 if (r < 0)
3391 goto fail;
3392 }
3393
349cc4a5 3394#if HAVE_GCRYPT
baed47c3 3395 r = journal_file_hmac_setup(f);
14d10188
LP
3396 if (r < 0)
3397 goto fail;
feb12d3e 3398#endif
14d10188 3399
cec736d2 3400 if (newly_created) {
de190aef 3401 r = journal_file_setup_field_hash_table(f);
cec736d2
LP
3402 if (r < 0)
3403 goto fail;
3404
de190aef 3405 r = journal_file_setup_data_hash_table(f);
cec736d2
LP
3406 if (r < 0)
3407 goto fail;
7560fffc 3408
349cc4a5 3409#if HAVE_GCRYPT
7560fffc
LP
3410 r = journal_file_append_first_tag(f);
3411 if (r < 0)
3412 goto fail;
feb12d3e 3413#endif
cec736d2
LP
3414 }
3415
be7cdd8e 3416 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd)) {
fa6ac760
LP
3417 r = -EIO;
3418 goto fail;
3419 }
3420
7a24f3bf 3421 if (template && template->post_change_timer) {
e167d7fd
LP
3422 r = journal_file_enable_post_change_timer(
3423 f,
3424 sd_event_source_get_event(template->post_change_timer),
3425 template->post_change_timer_period);
7a24f3bf 3426
7a24f3bf
VC
3427 if (r < 0)
3428 goto fail;
3429 }
3430
f8e2f4d6 3431 /* The file is opened now successfully, thus we take possession of any passed in fd. */
5d1ce257
LP
3432 f->close_fd = true;
3433
0559d3a5 3434 *ret = f;
cec736d2
LP
3435 return 0;
3436
3437fail:
be7cdd8e 3438 if (f->cache_fd && mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
3439 r = -EIO;
3440
69a3a6fd 3441 (void) journal_file_close(f);
cec736d2
LP
3442
3443 return r;
3444}
0ac38b70 3445
57850536 3446int journal_file_rotate(JournalFile **f, bool compress, uint64_t compress_threshold_bytes, bool seal, Set *deferred_closes) {
57535f47 3447 _cleanup_free_ char *p = NULL;
0ac38b70
LP
3448 size_t l;
3449 JournalFile *old_file, *new_file = NULL;
3450 int r;
3451
3452 assert(f);
3453 assert(*f);
3454
3455 old_file = *f;
3456
3457 if (!old_file->writable)
3458 return -EINVAL;
3459
5d1ce257 3460 /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse
13e785f7 3461 * rotation, since we don't know the actual path, and couldn't rename the file hence. */
5d1ce257
LP
3462 if (path_startswith(old_file->path, "/proc/self/fd"))
3463 return -EINVAL;
3464
0ac38b70
LP
3465 if (!endswith(old_file->path, ".journal"))
3466 return -EINVAL;
3467
3468 l = strlen(old_file->path);
57535f47
ZJS
3469 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
3470 (int) l - 8, old_file->path,
3471 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
3472 le64toh((*f)->header->head_entry_seqnum),
3473 le64toh((*f)->header->head_entry_realtime));
3474 if (r < 0)
0ac38b70
LP
3475 return -ENOMEM;
3476
2678031a
LP
3477 /* Try to rename the file to the archived version. If the file
3478 * already was deleted, we'll get ENOENT, let's ignore that
3479 * case. */
0ac38b70 3480 r = rename(old_file->path, p);
2678031a 3481 if (r < 0 && errno != ENOENT)
0ac38b70
LP
3482 return -errno;
3483
1fcefd88
LP
3484 /* Sync the rename to disk */
3485 (void) fsync_directory_of_file(old_file->fd);
3486
8eb85171
VC
3487 /* Set as archive so offlining commits w/state=STATE_ARCHIVED.
3488 * Previously we would set old_file->header->state to STATE_ARCHIVED directly here,
3489 * but journal_file_set_offline() short-circuits when state != STATE_ONLINE, which
3490 * would result in the rotated journal never getting fsync() called before closing.
3491 * Now we simply queue the archive state by setting an archive bit, leaving the state
3492 * as STATE_ONLINE so proper offlining occurs. */
3493 old_file->archive = true;
0ac38b70 3494
f27a3864
LP
3495 /* Currently, btrfs is not very good with out write patterns
3496 * and fragments heavily. Let's defrag our journal files when
3497 * we archive them */
3498 old_file->defrag_on_close = true;
3499
57850536
AG
3500 r = journal_file_open(-1, old_file->path, old_file->flags, old_file->mode, compress,
3501 compress_threshold_bytes, seal, NULL, old_file->mmap, deferred_closes,
3502 old_file, &new_file);
b58c888f
VC
3503
3504 if (deferred_closes &&
3505 set_put(deferred_closes, old_file) >= 0)
3506 (void) journal_file_set_offline(old_file, false);
3507 else
3508 (void) journal_file_close(old_file);
0ac38b70
LP
3509
3510 *f = new_file;
3511 return r;
3512}
3513
9447a7f1
LP
3514int journal_file_open_reliably(
3515 const char *fname,
3516 int flags,
3517 mode_t mode,
7560fffc 3518 bool compress,
57850536 3519 uint64_t compress_threshold_bytes,
baed47c3 3520 bool seal,
4a92baf3 3521 JournalMetrics *metrics,
27370278 3522 MMapCache *mmap_cache,
b58c888f 3523 Set *deferred_closes,
9447a7f1
LP
3524 JournalFile *template,
3525 JournalFile **ret) {
3526
3527 int r;
3528 size_t l;
ed375beb 3529 _cleanup_free_ char *p = NULL;
9447a7f1 3530
57850536
AG
3531 r = journal_file_open(-1, fname, flags, mode, compress, compress_threshold_bytes, seal, metrics, mmap_cache,
3532 deferred_closes, template, ret);
288359db 3533 if (!IN_SET(r,
b288cdeb
ZJS
3534 -EBADMSG, /* Corrupted */
3535 -ENODATA, /* Truncated */
3536 -EHOSTDOWN, /* Other machine */
3537 -EPROTONOSUPPORT, /* Incompatible feature */
3538 -EBUSY, /* Unclean shutdown */
3539 -ESHUTDOWN, /* Already archived */
288359db 3540 -EIO, /* IO error, including SIGBUS on mmap */
ae739cc1
LP
3541 -EIDRM, /* File has been deleted */
3542 -ETXTBSY)) /* File is from the future */
9447a7f1
LP
3543 return r;
3544
3545 if ((flags & O_ACCMODE) == O_RDONLY)
3546 return r;
3547
3548 if (!(flags & O_CREAT))
3549 return r;
3550
7560fffc
LP
3551 if (!endswith(fname, ".journal"))
3552 return r;
3553
5c70eab4
LP
3554 /* The file is corrupted. Rotate it away and try it again (but only once) */
3555
9447a7f1 3556 l = strlen(fname);
d587eca5 3557 if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
57535f47 3558 (int) l - 8, fname,
d587eca5 3559 now(CLOCK_REALTIME),
9bf3b535 3560 random_u64()) < 0)
9447a7f1
LP
3561 return -ENOMEM;
3562
65089b82 3563 if (rename(fname, p) < 0)
9447a7f1
LP
3564 return -errno;
3565
f27a3864
LP
3566 /* btrfs doesn't cope well with our write pattern and
3567 * fragments heavily. Let's defrag all files we rotate */
11689d2a 3568
a67d68b8 3569 (void) chattr_path(p, 0, FS_NOCOW_FL);
f27a3864
LP
3570 (void) btrfs_defrag(p);
3571
65089b82 3572 log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
9447a7f1 3573
57850536
AG
3574 return journal_file_open(-1, fname, flags, mode, compress, compress_threshold_bytes, seal, metrics, mmap_cache,
3575 deferred_closes, template, ret);
9447a7f1
LP
3576}
3577
cf244689
LP
3578int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
3579 uint64_t i, n;
3580 uint64_t q, xor_hash = 0;
3581 int r;
3582 EntryItem *items;
3583 dual_timestamp ts;
3584
3585 assert(from);
3586 assert(to);
3587 assert(o);
3588 assert(p);
3589
3590 if (!to->writable)
3591 return -EPERM;
3592
3593 ts.monotonic = le64toh(o->entry.monotonic);
3594 ts.realtime = le64toh(o->entry.realtime);
3595
cf244689 3596 n = journal_file_entry_n_items(o);
4faa7004
TA
3597 /* alloca() can't take 0, hence let's allocate at least one */
3598 items = alloca(sizeof(EntryItem) * MAX(1u, n));
cf244689
LP
3599
3600 for (i = 0; i < n; i++) {
4fd052ae
FC
3601 uint64_t l, h;
3602 le64_t le_hash;
cf244689
LP
3603 size_t t;
3604 void *data;
3605 Object *u;
3606
3607 q = le64toh(o->entry.items[i].object_offset);
3608 le_hash = o->entry.items[i].hash;
3609
3610 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
3611 if (r < 0)
3612 return r;
3613
3614 if (le_hash != o->data.hash)
3615 return -EBADMSG;
3616
3617 l = le64toh(o->object.size) - offsetof(Object, data.payload);
3618 t = (size_t) l;
3619
3620 /* We hit the limit on 32bit machines */
3621 if ((uint64_t) t != l)
3622 return -E2BIG;
3623
d89c8fdf 3624 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
349cc4a5 3625#if HAVE_XZ || HAVE_LZ4
a7f7d1bd 3626 size_t rsize = 0;
cf244689 3627
d89c8fdf
ZJS
3628 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
3629 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
3630 if (r < 0)
3631 return r;
cf244689
LP
3632
3633 data = from->compress_buffer;
3634 l = rsize;
3b1a55e1
ZJS
3635#else
3636 return -EPROTONOSUPPORT;
3637#endif
cf244689
LP
3638 } else
3639 data = o->data.payload;
3640
3641 r = journal_file_append_data(to, data, l, &u, &h);
3642 if (r < 0)
3643 return r;
3644
3645 xor_hash ^= le64toh(u->data.hash);
3646 items[i].object_offset = htole64(h);
3647 items[i].hash = u->data.hash;
3648
3649 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
3650 if (r < 0)
3651 return r;
3652 }
3653
fa6ac760
LP
3654 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
3655
be7cdd8e 3656 if (mmap_cache_got_sigbus(to->mmap, to->cache_fd))
fa6ac760
LP
3657 return -EIO;
3658
3659 return r;
cf244689 3660}
babfc091 3661
8580d1f7
LP
3662void journal_reset_metrics(JournalMetrics *m) {
3663 assert(m);
3664
3665 /* Set everything to "pick automatic values". */
3666
3667 *m = (JournalMetrics) {
3668 .min_use = (uint64_t) -1,
3669 .max_use = (uint64_t) -1,
3670 .min_size = (uint64_t) -1,
3671 .max_size = (uint64_t) -1,
3672 .keep_free = (uint64_t) -1,
3673 .n_max_files = (uint64_t) -1,
3674 };
3675}
3676
babfc091 3677void journal_default_metrics(JournalMetrics *m, int fd) {
8580d1f7 3678 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
babfc091 3679 struct statvfs ss;
8580d1f7 3680 uint64_t fs_size;
babfc091
LP
3681
3682 assert(m);
3683 assert(fd >= 0);
3684
3685 if (fstatvfs(fd, &ss) >= 0)
3686 fs_size = ss.f_frsize * ss.f_blocks;
8580d1f7 3687 else {
8fc58f1a 3688 log_debug_errno(errno, "Failed to determine disk size: %m");
8580d1f7
LP
3689 fs_size = 0;
3690 }
babfc091
LP
3691
3692 if (m->max_use == (uint64_t) -1) {
3693
3694 if (fs_size > 0) {
3695 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3696
3697 if (m->max_use > DEFAULT_MAX_USE_UPPER)
3698 m->max_use = DEFAULT_MAX_USE_UPPER;
3699
3700 if (m->max_use < DEFAULT_MAX_USE_LOWER)
3701 m->max_use = DEFAULT_MAX_USE_LOWER;
3702 } else
3703 m->max_use = DEFAULT_MAX_USE_LOWER;
3704 } else {
3705 m->max_use = PAGE_ALIGN(m->max_use);
3706
8580d1f7 3707 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
babfc091
LP
3708 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3709 }
3710
8580d1f7
LP
3711 if (m->min_use == (uint64_t) -1)
3712 m->min_use = DEFAULT_MIN_USE;
3713
3714 if (m->min_use > m->max_use)
3715 m->min_use = m->max_use;
3716
babfc091
LP
3717 if (m->max_size == (uint64_t) -1) {
3718 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3719
3720 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3721 m->max_size = DEFAULT_MAX_SIZE_UPPER;
3722 } else
3723 m->max_size = PAGE_ALIGN(m->max_size);
3724
8580d1f7
LP
3725 if (m->max_size != 0) {
3726 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3727 m->max_size = JOURNAL_FILE_SIZE_MIN;
babfc091 3728
8580d1f7
LP
3729 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3730 m->max_use = m->max_size*2;
3731 }
babfc091
LP
3732
3733 if (m->min_size == (uint64_t) -1)
3734 m->min_size = JOURNAL_FILE_SIZE_MIN;
3735 else {
3736 m->min_size = PAGE_ALIGN(m->min_size);
3737
3738 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3739 m->min_size = JOURNAL_FILE_SIZE_MIN;
3740
8580d1f7 3741 if (m->max_size != 0 && m->min_size > m->max_size)
babfc091
LP
3742 m->max_size = m->min_size;
3743 }
3744
3745 if (m->keep_free == (uint64_t) -1) {
3746
3747 if (fs_size > 0) {
8621b110 3748 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
babfc091
LP
3749
3750 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3751 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3752
3753 } else
3754 m->keep_free = DEFAULT_KEEP_FREE;
3755 }
3756
8580d1f7
LP
3757 if (m->n_max_files == (uint64_t) -1)
3758 m->n_max_files = DEFAULT_N_MAX_FILES;
3759
3760 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3761 format_bytes(a, sizeof(a), m->min_use),
3762 format_bytes(b, sizeof(b), m->max_use),
3763 format_bytes(c, sizeof(c), m->max_size),
3764 format_bytes(d, sizeof(d), m->min_size),
3765 format_bytes(e, sizeof(e), m->keep_free),
3766 m->n_max_files);
babfc091 3767}
08984293
LP
3768
3769int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
08984293 3770 assert(f);
c88cc6af 3771 assert(f->header);
08984293
LP
3772 assert(from || to);
3773
3774 if (from) {
162566a4
LP
3775 if (f->header->head_entry_realtime == 0)
3776 return -ENOENT;
08984293 3777
162566a4 3778 *from = le64toh(f->header->head_entry_realtime);
08984293
LP
3779 }
3780
3781 if (to) {
162566a4
LP
3782 if (f->header->tail_entry_realtime == 0)
3783 return -ENOENT;
08984293 3784
162566a4 3785 *to = le64toh(f->header->tail_entry_realtime);
08984293
LP
3786 }
3787
3788 return 1;
3789}
3790
3791int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
08984293
LP
3792 Object *o;
3793 uint64_t p;
3794 int r;
3795
3796 assert(f);
3797 assert(from || to);
3798
47838ab3 3799 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
08984293
LP
3800 if (r <= 0)
3801 return r;
3802
3803 if (le64toh(o->data.n_entries) <= 0)
3804 return 0;
3805
3806 if (from) {
3807 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3808 if (r < 0)
3809 return r;
3810
3811 *from = le64toh(o->entry.monotonic);
3812 }
3813
3814 if (to) {
3815 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3816 if (r < 0)
3817 return r;
3818
3819 r = generic_array_get_plus_one(f,
3820 le64toh(o->data.entry_offset),
3821 le64toh(o->data.entry_array_offset),
3822 le64toh(o->data.n_entries)-1,
3823 &o, NULL);
3824 if (r <= 0)
3825 return r;
3826
3827 *to = le64toh(o->entry.monotonic);
3828 }
3829
3830 return 1;
3831}
dca6219e 3832
fb0951b0 3833bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
dca6219e 3834 assert(f);
c88cc6af 3835 assert(f->header);
dca6219e
LP
3836
3837 /* If we gained new header fields we gained new features,
3838 * hence suggest a rotation */
361f9cbc
LP
3839 if (le64toh(f->header->header_size) < sizeof(Header)) {
3840 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
dca6219e 3841 return true;
361f9cbc 3842 }
dca6219e
LP
3843
3844 /* Let's check if the hash tables grew over a certain fill
3845 * level (75%, borrowing this value from Java's hash table
3846 * implementation), and if so suggest a rotation. To calculate
3847 * the fill level we need the n_data field, which only exists
3848 * in newer versions. */
3849
3850 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
361f9cbc 3851 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3852 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
361f9cbc
LP
3853 f->path,
3854 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3855 le64toh(f->header->n_data),
3856 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3857 (unsigned long long) f->last_stat.st_size,
3858 f->last_stat.st_size / le64toh(f->header->n_data));
dca6219e 3859 return true;
361f9cbc 3860 }
dca6219e
LP
3861
3862 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
361f9cbc 3863 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3864 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
361f9cbc
LP
3865 f->path,
3866 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3867 le64toh(f->header->n_fields),
3868 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
dca6219e 3869 return true;
361f9cbc 3870 }
dca6219e 3871
0598fd4a
LP
3872 /* Are the data objects properly indexed by field objects? */
3873 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3874 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3875 le64toh(f->header->n_data) > 0 &&
3876 le64toh(f->header->n_fields) == 0)
3877 return true;
3878
fb0951b0
LP
3879 if (max_file_usec > 0) {
3880 usec_t t, h;
3881
3882 h = le64toh(f->header->head_entry_realtime);
3883 t = now(CLOCK_REALTIME);
3884
3885 if (h > 0 && t > h + max_file_usec)
3886 return true;
3887 }
3888
dca6219e
LP
3889 return false;
3890}