]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journal-file.c
process-util: add new FORK_NEW_MOUNTNS flag to safe_fork()
[thirdparty/systemd.git] / src / journal / journal-file.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
cec736d2
LP
2/***
3 This file is part of systemd.
4
5 Copyright 2011 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
cec736d2
LP
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 15 Lesser General Public License for more details.
cec736d2 16
5430f7f2 17 You should have received a copy of the GNU Lesser General Public License
cec736d2
LP
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19***/
20
cec736d2 21#include <errno.h>
cec736d2 22#include <fcntl.h>
11689d2a 23#include <linux/fs.h>
ac2e41f5 24#include <pthread.h>
07630cea
LP
25#include <stddef.h>
26#include <sys/mman.h>
27#include <sys/statvfs.h>
28#include <sys/uio.h>
29#include <unistd.h>
fb0951b0 30
b5efdb8a 31#include "alloc-util.h"
f27a3864 32#include "btrfs-util.h"
c8b3094d 33#include "chattr-util.h"
07630cea 34#include "compress.h"
3ffd4af2 35#include "fd-util.h"
0284adc6 36#include "journal-authenticate.h"
cec736d2
LP
37#include "journal-def.h"
38#include "journal-file.h"
39#include "lookup3.h"
6bedfcbb 40#include "parse-util.h"
5d1ce257 41#include "path-util.h"
3df3e884 42#include "random-util.h"
7a24f3bf 43#include "sd-event.h"
b58c888f 44#include "set.h"
07630cea 45#include "string-util.h"
4761fd0f 46#include "strv.h"
89a5a90c 47#include "xattr-util.h"
cec736d2 48
4a92baf3
LP
49#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
50#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
cec736d2 51
be19b7df 52#define COMPRESSION_SIZE_THRESHOLD (512ULL)
807e17f0 53
babfc091 54/* This is the minimum journal file size */
16098e93 55#define JOURNAL_FILE_SIZE_MIN (512ULL*1024ULL) /* 512 KiB */
babfc091
LP
56
57/* These are the lower and upper bounds if we deduce the max_use value
58 * from the file system size */
59#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
60#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
61
8580d1f7
LP
62/* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
63#define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */
64
babfc091 65/* This is the upper bound if we deduce max_size from max_use */
71100051 66#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
babfc091
LP
67
68/* This is the upper bound if we deduce the keep_free value from the
69 * file system size */
70#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
71
72/* This is the keep_free value when we can't determine the system
73 * size */
74#define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
75
8580d1f7
LP
76/* This is the default maximum number of journal files to keep around. */
77#define DEFAULT_N_MAX_FILES (100)
78
dca6219e
LP
79/* n_data was the first entry we added after the initial file format design */
80#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
cec736d2 81
a4bcff5b
LP
82/* How many entries to keep in the entry array chain cache at max */
83#define CHAIN_CACHE_MAX 20
84
a676e665
LP
85/* How much to increase the journal file size at once each time we allocate something new. */
86#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
87
2678031a
LP
88/* Reread fstat() of the file for detecting deletions at least this often */
89#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
90
fa6ac760
LP
/* The mmap context to use for the header: we pick the one right above the last defined object type */
92#define CONTEXT_HEADER _OBJECT_TYPE_MAX
93
51804460
ZJS
94#ifdef __clang__
95# pragma GCC diagnostic ignored "-Waddress-of-packed-member"
96#endif
97
ac2e41f5
VC
98/* This may be called from a separate thread to prevent blocking the caller for the duration of fsync().
99 * As a result we use atomic operations on f->offline_state for inter-thread communications with
100 * journal_file_set_offline() and journal_file_set_online(). */
101static void journal_file_set_offline_internal(JournalFile *f) {
26687bf8 102 assert(f);
ac2e41f5
VC
103 assert(f->fd >= 0);
104 assert(f->header);
105
106 for (;;) {
107 switch (f->offline_state) {
108 case OFFLINE_CANCEL:
109 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_DONE))
110 continue;
111 return;
112
113 case OFFLINE_AGAIN_FROM_SYNCING:
114 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_SYNCING))
115 continue;
116 break;
117
118 case OFFLINE_AGAIN_FROM_OFFLINING:
119 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_SYNCING))
120 continue;
121 break;
122
123 case OFFLINE_SYNCING:
124 (void) fsync(f->fd);
26687bf8 125
ac2e41f5
VC
126 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_OFFLINING))
127 continue;
26687bf8 128
8eb85171 129 f->header->state = f->archive ? STATE_ARCHIVED : STATE_OFFLINE;
ac2e41f5
VC
130 (void) fsync(f->fd);
131 break;
132
133 case OFFLINE_OFFLINING:
134 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_DONE))
135 continue;
4831981d 136 _fallthrough_;
ac2e41f5
VC
137 case OFFLINE_DONE:
138 return;
139
140 case OFFLINE_JOINED:
141 log_debug("OFFLINE_JOINED unexpected offline state for journal_file_set_offline_internal()");
142 return;
143 }
144 }
145}
146
147static void * journal_file_set_offline_thread(void *arg) {
148 JournalFile *f = arg;
149
fa7ff4cf
LP
150 (void) pthread_setname_np(pthread_self(), "journal-offline");
151
ac2e41f5
VC
152 journal_file_set_offline_internal(f);
153
154 return NULL;
155}
156
157static int journal_file_set_offline_thread_join(JournalFile *f) {
158 int r;
159
160 assert(f);
161
162 if (f->offline_state == OFFLINE_JOINED)
163 return 0;
164
165 r = pthread_join(f->offline_thread, NULL);
166 if (r)
167 return -r;
168
169 f->offline_state = OFFLINE_JOINED;
26687bf8 170
be7cdd8e 171 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
172 return -EIO;
173
ac2e41f5
VC
174 return 0;
175}
26687bf8 176
ac2e41f5
VC
177/* Trigger a restart if the offline thread is mid-flight in a restartable state. */
178static bool journal_file_set_offline_try_restart(JournalFile *f) {
179 for (;;) {
180 switch (f->offline_state) {
181 case OFFLINE_AGAIN_FROM_SYNCING:
182 case OFFLINE_AGAIN_FROM_OFFLINING:
183 return true;
184
185 case OFFLINE_CANCEL:
186 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_AGAIN_FROM_SYNCING))
187 continue;
188 return true;
189
190 case OFFLINE_SYNCING:
191 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_AGAIN_FROM_SYNCING))
192 continue;
193 return true;
194
195 case OFFLINE_OFFLINING:
196 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_AGAIN_FROM_OFFLINING))
197 continue;
198 return true;
26687bf8
OS
199
200 default:
ac2e41f5
VC
201 return false;
202 }
26687bf8
OS
203 }
204}
205
ac2e41f5
VC
206/* Sets a journal offline.
207 *
208 * If wait is false then an offline is dispatched in a separate thread for a
209 * subsequent journal_file_set_offline() or journal_file_set_online() of the
210 * same journal to synchronize with.
211 *
212 * If wait is true, then either an existing offline thread will be restarted
213 * and joined, or if none exists the offline is simply performed in this
214 * context without involving another thread.
215 */
216int journal_file_set_offline(JournalFile *f, bool wait) {
217 bool restarted;
218 int r;
219
26687bf8
OS
220 assert(f);
221
222 if (!f->writable)
223 return -EPERM;
224
225 if (!(f->fd >= 0 && f->header))
226 return -EINVAL;
227
b8f99e27
VC
228 /* An offlining journal is implicitly online and may modify f->header->state,
229 * we must also join any potentially lingering offline thread when not online. */
230 if (!journal_file_is_offlining(f) && f->header->state != STATE_ONLINE)
231 return journal_file_set_offline_thread_join(f);
26687bf8 232
ac2e41f5
VC
233 /* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */
234 restarted = journal_file_set_offline_try_restart(f);
235 if ((restarted && wait) || !restarted) {
236 r = journal_file_set_offline_thread_join(f);
237 if (r < 0)
238 return r;
239 }
26687bf8 240
ac2e41f5
VC
241 if (restarted)
242 return 0;
243
244 /* Initiate a new offline. */
245 f->offline_state = OFFLINE_SYNCING;
fa6ac760 246
ac2e41f5
VC
247 if (wait) /* Without using a thread if waiting. */
248 journal_file_set_offline_internal(f);
249 else {
250 r = pthread_create(&f->offline_thread, NULL, journal_file_set_offline_thread, f);
ec9ffa2c
VC
251 if (r > 0) {
252 f->offline_state = OFFLINE_JOINED;
ac2e41f5 253 return -r;
ec9ffa2c 254 }
ac2e41f5
VC
255 }
256
257 return 0;
258}
259
260static int journal_file_set_online(JournalFile *f) {
261 bool joined = false;
262
263 assert(f);
264
265 if (!f->writable)
266 return -EPERM;
267
268 if (!(f->fd >= 0 && f->header))
269 return -EINVAL;
270
271 while (!joined) {
272 switch (f->offline_state) {
273 case OFFLINE_JOINED:
274 /* No offline thread, no need to wait. */
275 joined = true;
276 break;
277
278 case OFFLINE_SYNCING:
279 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_CANCEL))
280 continue;
281 /* Canceled syncing prior to offlining, no need to wait. */
282 break;
283
284 case OFFLINE_AGAIN_FROM_SYNCING:
285 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_CANCEL))
286 continue;
287 /* Canceled restart from syncing, no need to wait. */
288 break;
289
290 case OFFLINE_AGAIN_FROM_OFFLINING:
291 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_CANCEL))
292 continue;
293 /* Canceled restart from offlining, must wait for offlining to complete however. */
4831981d 294 _fallthrough_;
ac2e41f5
VC
295 default: {
296 int r;
297
298 r = journal_file_set_offline_thread_join(f);
299 if (r < 0)
300 return r;
301
302 joined = true;
303 break;
304 }
305 }
306 }
26687bf8 307
be7cdd8e 308 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
309 return -EIO;
310
ac2e41f5
VC
311 switch (f->header->state) {
312 case STATE_ONLINE:
313 return 0;
26687bf8 314
ac2e41f5
VC
315 case STATE_OFFLINE:
316 f->header->state = STATE_ONLINE;
317 (void) fsync(f->fd);
318 return 0;
319
320 default:
321 return -EINVAL;
322 }
26687bf8
OS
323}
324
b58c888f
VC
325bool journal_file_is_offlining(JournalFile *f) {
326 assert(f);
327
328 __sync_synchronize();
329
3742095b 330 if (IN_SET(f->offline_state, OFFLINE_DONE, OFFLINE_JOINED))
b58c888f
VC
331 return false;
332
333 return true;
334}
335
804ae586 336JournalFile* journal_file_close(JournalFile *f) {
de190aef 337 assert(f);
cec736d2 338
349cc4a5 339#if HAVE_GCRYPT
b0af6f41 340 /* Write the final tag */
43cd8794
FB
341 if (f->seal && f->writable) {
342 int r;
343
344 r = journal_file_append_tag(f);
345 if (r < 0)
346 log_error_errno(r, "Failed to append tag when closing journal: %m");
347 }
feb12d3e 348#endif
b0af6f41 349
7a24f3bf
VC
350 if (f->post_change_timer) {
351 int enabled;
352
353 if (sd_event_source_get_enabled(f->post_change_timer, &enabled) >= 0)
354 if (enabled == SD_EVENT_ONESHOT)
355 journal_file_post_change(f);
356
e167d7fd 357 (void) sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_OFF);
7a24f3bf
VC
358 sd_event_source_unref(f->post_change_timer);
359 }
360
ac2e41f5 361 journal_file_set_offline(f, true);
cec736d2 362
be7cdd8e
VC
363 if (f->mmap && f->cache_fd)
364 mmap_cache_free_fd(f->mmap, f->cache_fd);
cec736d2 365
11689d2a
LP
366 if (f->fd >= 0 && f->defrag_on_close) {
367
368 /* Be friendly to btrfs: turn COW back on again now,
369 * and defragment the file. We won't write to the file
370 * ever again, hence remove all fragmentation, and
371 * reenable all the good bits COW usually provides
372 * (such as data checksumming). */
373
1ed8f8c1 374 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
11689d2a
LP
375 (void) btrfs_defrag_fd(f->fd);
376 }
f27a3864 377
5d1ce257
LP
378 if (f->close_fd)
379 safe_close(f->fd);
cec736d2 380 free(f->path);
807e17f0 381
f649045c 382 mmap_cache_unref(f->mmap);
16e9f408 383
4743015d 384 ordered_hashmap_free_free(f->chain_cache);
a4bcff5b 385
349cc4a5 386#if HAVE_XZ || HAVE_LZ4
807e17f0
LP
387 free(f->compress_buffer);
388#endif
389
349cc4a5 390#if HAVE_GCRYPT
baed47c3
LP
391 if (f->fss_file)
392 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
dc4ebc07 393 else
b7c9ae91
LP
394 free(f->fsprg_state);
395
396 free(f->fsprg_seed);
7560fffc
LP
397
398 if (f->hmac)
399 gcry_md_close(f->hmac);
400#endif
401
6b430fdb 402 return mfree(f);
cec736d2
LP
403}
404
0ac38b70 405static int journal_file_init_header(JournalFile *f, JournalFile *template) {
d89c8fdf 406 Header h = {};
cec736d2
LP
407 ssize_t k;
408 int r;
409
410 assert(f);
411
7560fffc 412 memcpy(h.signature, HEADER_SIGNATURE, 8);
23b0b2b2 413 h.header_size = htole64(ALIGN64(sizeof(h)));
cec736d2 414
d89c8fdf
ZJS
415 h.incompatible_flags |= htole32(
416 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
417 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
7560fffc 418
d89c8fdf
ZJS
419 h.compatible_flags = htole32(
420 f->seal * HEADER_COMPATIBLE_SEALED);
7560fffc 421
cec736d2
LP
422 r = sd_id128_randomize(&h.file_id);
423 if (r < 0)
424 return r;
425
0ac38b70
LP
426 if (template) {
427 h.seqnum_id = template->header->seqnum_id;
beec0085 428 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
0ac38b70
LP
429 } else
430 h.seqnum_id = h.file_id;
cec736d2
LP
431
432 k = pwrite(f->fd, &h, sizeof(h), 0);
433 if (k < 0)
434 return -errno;
435
436 if (k != sizeof(h))
437 return -EIO;
438
439 return 0;
440}
441
a0fe2a2d
LP
442static int fsync_directory_of_file(int fd) {
443 _cleanup_free_ char *path = NULL, *dn = NULL;
444 _cleanup_close_ int dfd = -1;
445 struct stat st;
446 int r;
447
448 if (fstat(fd, &st) < 0)
449 return -errno;
450
451 if (!S_ISREG(st.st_mode))
452 return -EBADFD;
453
454 r = fd_get_path(fd, &path);
455 if (r < 0)
456 return r;
457
458 if (!path_is_absolute(path))
459 return -EINVAL;
460
461 dn = dirname_malloc(path);
462 if (!dn)
463 return -ENOMEM;
464
465 dfd = open(dn, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
466 if (dfd < 0)
467 return -errno;
468
469 if (fsync(dfd) < 0)
470 return -errno;
471
472 return 0;
473}
474
cec736d2 475static int journal_file_refresh_header(JournalFile *f) {
de190aef 476 sd_id128_t boot_id;
fa6ac760 477 int r;
cec736d2
LP
478
479 assert(f);
c88cc6af 480 assert(f->header);
cec736d2
LP
481
482 r = sd_id128_get_machine(&f->header->machine_id);
483 if (r < 0)
484 return r;
485
de190aef 486 r = sd_id128_get_boot(&boot_id);
cec736d2
LP
487 if (r < 0)
488 return r;
489
de190aef
LP
490 if (sd_id128_equal(boot_id, f->header->boot_id))
491 f->tail_entry_monotonic_valid = true;
492
493 f->header->boot_id = boot_id;
494
fa6ac760 495 r = journal_file_set_online(f);
b788cc23 496
7560fffc 497 /* Sync the online state to disk */
fb426037 498 (void) fsync(f->fd);
b788cc23 499
a0fe2a2d
LP
500 /* We likely just created a new file, also sync the directory this file is located in. */
501 (void) fsync_directory_of_file(f->fd);
502
fa6ac760 503 return r;
cec736d2
LP
504}
505
4214009f
ZJS
506static bool warn_wrong_flags(const JournalFile *f, bool compatible) {
507 const uint32_t any = compatible ? HEADER_COMPATIBLE_ANY : HEADER_INCOMPATIBLE_ANY,
508 supported = compatible ? HEADER_COMPATIBLE_SUPPORTED : HEADER_INCOMPATIBLE_SUPPORTED;
509 const char *type = compatible ? "compatible" : "incompatible";
d89c8fdf
ZJS
510 uint32_t flags;
511
4214009f
ZJS
512 flags = le32toh(compatible ? f->header->compatible_flags : f->header->incompatible_flags);
513
514 if (flags & ~supported) {
515 if (flags & ~any)
4761fd0f 516 log_debug("Journal file %s has unknown %s flags 0x%"PRIx32,
4214009f
ZJS
517 f->path, type, flags & ~any);
518 flags = (flags & any) & ~supported;
4761fd0f
ZJS
519 if (flags) {
520 const char* strv[3];
521 unsigned n = 0;
522 _cleanup_free_ char *t = NULL;
523
524 if (compatible && (flags & HEADER_COMPATIBLE_SEALED))
525 strv[n++] = "sealed";
526 if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_XZ))
527 strv[n++] = "xz-compressed";
528 if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_LZ4))
529 strv[n++] = "lz4-compressed";
530 strv[n] = NULL;
531 assert(n < ELEMENTSOF(strv));
532
533 t = strv_join((char**) strv, ", ");
534 log_debug("Journal file %s uses %s %s %s disabled at compilation time.",
535 f->path, type, n > 1 ? "flags" : "flag", strnull(t));
536 }
4214009f
ZJS
537 return true;
538 }
539
540 return false;
541}
542
543static int journal_file_verify_header(JournalFile *f) {
6f94e420
TS
544 uint64_t arena_size, header_size;
545
cec736d2 546 assert(f);
c88cc6af 547 assert(f->header);
cec736d2 548
7560fffc 549 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
cec736d2
LP
550 return -EBADMSG;
551
4214009f
ZJS
552 /* In both read and write mode we refuse to open files with incompatible
553 * flags we don't know. */
554 if (warn_wrong_flags(f, false))
cec736d2
LP
555 return -EPROTONOSUPPORT;
556
4214009f
ZJS
557 /* When open for writing we refuse to open files with compatible flags, too. */
558 if (f->writable && warn_wrong_flags(f, true))
d89c8fdf 559 return -EPROTONOSUPPORT;
7560fffc 560
db11ac1a
LP
561 if (f->header->state >= _STATE_MAX)
562 return -EBADMSG;
563
6f94e420
TS
564 header_size = le64toh(f->header->header_size);
565
dca6219e 566 /* The first addition was n_data, so check that we are at least this large */
6f94e420 567 if (header_size < HEADER_SIZE_MIN)
23b0b2b2
LP
568 return -EBADMSG;
569
8088cbd3 570 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
beec0085
LP
571 return -EBADMSG;
572
6f94e420
TS
573 arena_size = le64toh(f->header->arena_size);
574
575 if (UINT64_MAX - header_size < arena_size || header_size + arena_size > (uint64_t) f->last_stat.st_size)
db11ac1a
LP
576 return -ENODATA;
577
6f94e420 578 if (le64toh(f->header->tail_object_offset) > header_size + arena_size)
db11ac1a
LP
579 return -ENODATA;
580
7762e02b
LP
581 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
582 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
583 !VALID64(le64toh(f->header->tail_object_offset)) ||
584 !VALID64(le64toh(f->header->entry_array_offset)))
585 return -ENODATA;
586
cec736d2 587 if (f->writable) {
cec736d2 588 sd_id128_t machine_id;
ae739cc1 589 uint8_t state;
cec736d2
LP
590 int r;
591
592 r = sd_id128_get_machine(&machine_id);
593 if (r < 0)
594 return r;
595
596 if (!sd_id128_equal(machine_id, f->header->machine_id))
597 return -EHOSTDOWN;
598
de190aef 599 state = f->header->state;
cec736d2 600
b288cdeb
ZJS
601 if (state == STATE_ARCHIVED)
602 return -ESHUTDOWN; /* Already archived */
603 else if (state == STATE_ONLINE) {
71fa6f00
LP
604 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
605 return -EBUSY;
b288cdeb 606 } else if (state != STATE_OFFLINE) {
8facc349 607 log_debug("Journal file %s has unknown state %i.", f->path, state);
71fa6f00
LP
608 return -EBUSY;
609 }
ae739cc1 610
5b3cc0c8
YN
611 if (f->header->field_hash_table_size == 0 || f->header->data_hash_table_size == 0)
612 return -EBADMSG;
613
ae739cc1
LP
614 /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't
615 * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks
616 * bisection. */
617 if (le64toh(f->header->tail_entry_realtime) > now(CLOCK_REALTIME)) {
618 log_debug("Journal file %s is from the future, refusing to append new data to it that'd be older.", f->path);
619 return -ETXTBSY;
620 }
cec736d2
LP
621 }
622
d89c8fdf
ZJS
623 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
624 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
c586dbf1 625
f1889c91 626 f->seal = JOURNAL_HEADER_SEALED(f->header);
7560fffc 627
cec736d2
LP
628 return 0;
629}
630
2678031a
LP
631static int journal_file_fstat(JournalFile *f) {
632 assert(f);
633 assert(f->fd >= 0);
634
635 if (fstat(f->fd, &f->last_stat) < 0)
636 return -errno;
637
638 f->last_stat_usec = now(CLOCK_MONOTONIC);
639
640 /* Refuse appending to files that are already deleted */
641 if (f->last_stat.st_nlink <= 0)
642 return -EIDRM;
643
644 return 0;
645}
646
cec736d2 647static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
eda4b58b 648 uint64_t old_size, new_size;
fec2aa2f 649 int r;
cec736d2
LP
650
651 assert(f);
c88cc6af 652 assert(f->header);
cec736d2 653
cec736d2 654 /* We assume that this file is not sparse, and we know that
38ac38b2 655 * for sure, since we always call posix_fallocate()
cec736d2
LP
656 * ourselves */
657
be7cdd8e 658 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
659 return -EIO;
660
cec736d2 661 old_size =
23b0b2b2 662 le64toh(f->header->header_size) +
cec736d2
LP
663 le64toh(f->header->arena_size);
664
bc85bfee 665 new_size = PAGE_ALIGN(offset + size);
23b0b2b2
LP
666 if (new_size < le64toh(f->header->header_size))
667 new_size = le64toh(f->header->header_size);
bc85bfee 668
2678031a
LP
669 if (new_size <= old_size) {
670
671 /* We already pre-allocated enough space, but before
672 * we write to it, let's check with fstat() if the
673 * file got deleted, in order make sure we don't throw
674 * away the data immediately. Don't check fstat() for
675 * all writes though, but only once ever 10s. */
676
677 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
678 return 0;
679
680 return journal_file_fstat(f);
681 }
682
683 /* Allocate more space. */
cec736d2 684
a676e665 685 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
bc85bfee 686 return -E2BIG;
cec736d2 687
a676e665 688 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
cec736d2
LP
689 struct statvfs svfs;
690
691 if (fstatvfs(f->fd, &svfs) >= 0) {
692 uint64_t available;
693
070052ab 694 available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
cec736d2
LP
695
696 if (new_size - old_size > available)
697 return -E2BIG;
698 }
699 }
700
eda4b58b
LP
701 /* Increase by larger blocks at once */
702 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
703 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
704 new_size = f->metrics.max_size;
705
bc85bfee
LP
706 /* Note that the glibc fallocate() fallback is very
707 inefficient, hence we try to minimize the allocation area
708 as we can. */
fec2aa2f
GV
709 r = posix_fallocate(f->fd, old_size, new_size - old_size);
710 if (r != 0)
711 return -r;
cec736d2 712
23b0b2b2 713 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
cec736d2 714
2678031a 715 return journal_file_fstat(f);
cec736d2
LP
716}
717
78519831 718static unsigned type_to_context(ObjectType type) {
d3d3208f 719 /* One context for each type, plus one catch-all for the rest */
69adae51 720 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
fa6ac760 721 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
d05089d8 722 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
d3d3208f
MS
723}
724
b439282e 725static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret, size_t *ret_size) {
2678031a
LP
726 int r;
727
cec736d2 728 assert(f);
cec736d2
LP
729 assert(ret);
730
7762e02b
LP
731 if (size <= 0)
732 return -EINVAL;
733
2a59ea54 734 /* Avoid SIGBUS on invalid accesses */
4bbdcdb3
LP
735 if (offset + size > (uint64_t) f->last_stat.st_size) {
736 /* Hmm, out of range? Let's refresh the fstat() data
737 * first, before we trust that check. */
738
2678031a
LP
739 r = journal_file_fstat(f);
740 if (r < 0)
741 return r;
742
743 if (offset + size > (uint64_t) f->last_stat.st_size)
4bbdcdb3
LP
744 return -EADDRNOTAVAIL;
745 }
746
b439282e 747 return mmap_cache_get(f->mmap, f->cache_fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret, ret_size);
cec736d2
LP
748}
749
16e9f408
LP
750static uint64_t minimum_header_size(Object *o) {
751
b8e891e6 752 static const uint64_t table[] = {
16e9f408
LP
753 [OBJECT_DATA] = sizeof(DataObject),
754 [OBJECT_FIELD] = sizeof(FieldObject),
755 [OBJECT_ENTRY] = sizeof(EntryObject),
756 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
757 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
758 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
759 [OBJECT_TAG] = sizeof(TagObject),
760 };
761
762 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
763 return sizeof(ObjectHeader);
764
765 return table[o->object.type];
766}
767
24754f36
TR
768/* Lightweight object checks. We want this to be fast, so that we won't
769 * slowdown every journal_file_move_to_object() call too much. */
770static int journal_file_check_object(JournalFile *f, uint64_t offset, Object *o) {
771 assert(f);
772 assert(o);
773
774 switch (o->object.type) {
775
776 case OBJECT_DATA: {
777 if ((le64toh(o->data.entry_offset) == 0) ^ (le64toh(o->data.n_entries) == 0)) {
778 log_debug("Bad n_entries: %"PRIu64": %"PRIu64,
10e8445b 779 le64toh(o->data.n_entries), offset);
24754f36
TR
780 return -EBADMSG;
781 }
782
783 if (le64toh(o->object.size) - offsetof(DataObject, payload) <= 0) {
784 log_debug("Bad object size (<= %zu): %"PRIu64": %"PRIu64,
785 offsetof(DataObject, payload),
786 le64toh(o->object.size),
787 offset);
788 return -EBADMSG;
789 }
790
10e8445b
TR
791 if (!VALID64(le64toh(o->data.next_hash_offset)) ||
792 !VALID64(le64toh(o->data.next_field_offset)) ||
793 !VALID64(le64toh(o->data.entry_offset)) ||
794 !VALID64(le64toh(o->data.entry_array_offset))) {
24754f36
TR
795 log_debug("Invalid offset, next_hash_offset="OFSfmt", next_field_offset="OFSfmt
796 ", entry_offset="OFSfmt", entry_array_offset="OFSfmt": %"PRIu64,
10e8445b
TR
797 le64toh(o->data.next_hash_offset),
798 le64toh(o->data.next_field_offset),
799 le64toh(o->data.entry_offset),
800 le64toh(o->data.entry_array_offset),
24754f36
TR
801 offset);
802 return -EBADMSG;
803 }
804
805 break;
806 }
807
808 case OBJECT_FIELD:
809 if (le64toh(o->object.size) - offsetof(FieldObject, payload) <= 0) {
810 log_debug(
811 "Bad field size (<= %zu): %"PRIu64": %"PRIu64,
812 offsetof(FieldObject, payload),
813 le64toh(o->object.size),
814 offset);
815 return -EBADMSG;
816 }
817
10e8445b
TR
818 if (!VALID64(le64toh(o->field.next_hash_offset)) ||
819 !VALID64(le64toh(o->field.head_data_offset))) {
24754f36
TR
820 log_debug(
821 "Invalid offset, next_hash_offset="OFSfmt
822 ", head_data_offset="OFSfmt": %"PRIu64,
10e8445b
TR
823 le64toh(o->field.next_hash_offset),
824 le64toh(o->field.head_data_offset),
24754f36
TR
825 offset);
826 return -EBADMSG;
827 }
828 break;
829
830 case OBJECT_ENTRY:
831 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) % sizeof(EntryItem) != 0) {
832 log_debug(
833 "Bad entry size (<= %zu): %"PRIu64": %"PRIu64,
834 offsetof(EntryObject, items),
835 le64toh(o->object.size),
836 offset);
837 return -EBADMSG;
838 }
839
840 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem) <= 0) {
841 log_debug(
842 "Invalid number items in entry: %"PRIu64": %"PRIu64,
843 (le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem),
844 offset);
845 return -EBADMSG;
846 }
847
848 if (le64toh(o->entry.seqnum) <= 0) {
849 log_debug(
850 "Invalid entry seqnum: %"PRIx64": %"PRIu64,
851 le64toh(o->entry.seqnum),
852 offset);
853 return -EBADMSG;
854 }
855
856 if (!VALID_REALTIME(le64toh(o->entry.realtime))) {
857 log_debug(
858 "Invalid entry realtime timestamp: %"PRIu64": %"PRIu64,
859 le64toh(o->entry.realtime),
860 offset);
861 return -EBADMSG;
862 }
863
864 if (!VALID_MONOTONIC(le64toh(o->entry.monotonic))) {
865 log_debug(
866 "Invalid entry monotonic timestamp: %"PRIu64": %"PRIu64,
867 le64toh(o->entry.monotonic),
868 offset);
869 return -EBADMSG;
870 }
871
872 break;
873
874 case OBJECT_DATA_HASH_TABLE:
875 case OBJECT_FIELD_HASH_TABLE:
876 if ((le64toh(o->object.size) - offsetof(HashTableObject, items)) % sizeof(HashItem) != 0 ||
877 (le64toh(o->object.size) - offsetof(HashTableObject, items)) / sizeof(HashItem) <= 0) {
878 log_debug(
879 "Invalid %s hash table size: %"PRIu64": %"PRIu64,
880 o->object.type == OBJECT_DATA_HASH_TABLE ? "data" : "field",
881 le64toh(o->object.size),
882 offset);
883 return -EBADMSG;
884 }
885
886 break;
887
888 case OBJECT_ENTRY_ARRAY:
889 if ((le64toh(o->object.size) - offsetof(EntryArrayObject, items)) % sizeof(le64_t) != 0 ||
890 (le64toh(o->object.size) - offsetof(EntryArrayObject, items)) / sizeof(le64_t) <= 0) {
891 log_debug(
892 "Invalid object entry array size: %"PRIu64": %"PRIu64,
893 le64toh(o->object.size),
894 offset);
895 return -EBADMSG;
896 }
897
10e8445b 898 if (!VALID64(le64toh(o->entry_array.next_entry_array_offset))) {
24754f36
TR
899 log_debug(
900 "Invalid object entry array next_entry_array_offset: "OFSfmt": %"PRIu64,
10e8445b 901 le64toh(o->entry_array.next_entry_array_offset),
24754f36
TR
902 offset);
903 return -EBADMSG;
904 }
905
906 break;
907
908 case OBJECT_TAG:
909 if (le64toh(o->object.size) != sizeof(TagObject)) {
910 log_debug(
911 "Invalid object tag size: %"PRIu64": %"PRIu64,
912 le64toh(o->object.size),
913 offset);
914 return -EBADMSG;
915 }
916
10e8445b 917 if (!VALID_EPOCH(le64toh(o->tag.epoch))) {
24754f36
TR
918 log_debug(
919 "Invalid object tag epoch: %"PRIu64": %"PRIu64,
10e8445b 920 le64toh(o->tag.epoch),
24754f36
TR
921 offset);
922 return -EBADMSG;
923 }
924
925 break;
926 }
927
928 return 0;
929}
930
78519831 931int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
cec736d2
LP
932 int r;
933 void *t;
b439282e 934 size_t tsize;
cec736d2
LP
935 Object *o;
936 uint64_t s;
937
938 assert(f);
939 assert(ret);
940
db11ac1a 941 /* Objects may only be located at multiple of 64 bit */
202fd896
LP
942 if (!VALID64(offset)) {
943 log_debug("Attempt to move to object at non-64bit boundary: %" PRIu64, offset);
bd30fdf2 944 return -EBADMSG;
202fd896 945 }
db11ac1a 946
50809d7a 947 /* Object may not be located in the file header */
202fd896
LP
948 if (offset < le64toh(f->header->header_size)) {
949 log_debug("Attempt to move to object located in file header: %" PRIu64, offset);
50809d7a 950 return -EBADMSG;
202fd896 951 }
50809d7a 952
b439282e 953 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t, &tsize);
cec736d2
LP
954 if (r < 0)
955 return r;
956
957 o = (Object*) t;
958 s = le64toh(o->object.size);
959
1c69f096
LP
960 if (s == 0) {
961 log_debug("Attempt to move to uninitialized object: %" PRIu64, offset);
962 return -EBADMSG;
963 }
202fd896
LP
964 if (s < sizeof(ObjectHeader)) {
965 log_debug("Attempt to move to overly short object: %" PRIu64, offset);
cec736d2 966 return -EBADMSG;
202fd896 967 }
cec736d2 968
202fd896
LP
969 if (o->object.type <= OBJECT_UNUSED) {
970 log_debug("Attempt to move to object with invalid type: %" PRIu64, offset);
16e9f408 971 return -EBADMSG;
202fd896 972 }
16e9f408 973
202fd896
LP
974 if (s < minimum_header_size(o)) {
975 log_debug("Attempt to move to truncated object: %" PRIu64, offset);
16e9f408 976 return -EBADMSG;
202fd896 977 }
16e9f408 978
202fd896
LP
979 if (type > OBJECT_UNUSED && o->object.type != type) {
980 log_debug("Attempt to move to object of unexpected type: %" PRIu64, offset);
cec736d2 981 return -EBADMSG;
202fd896 982 }
cec736d2 983
b439282e
VC
984 if (s > tsize) {
985 r = journal_file_move_to(f, type, false, offset, s, &t, NULL);
cec736d2
LP
986 if (r < 0)
987 return r;
988
989 o = (Object*) t;
990 }
991
24754f36
TR
992 r = journal_file_check_object(f, offset, o);
993 if (r < 0)
994 return r;
995
cec736d2
LP
996 *ret = o;
997 return 0;
998}
999
d98cc1f2 1000static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
cec736d2
LP
1001 uint64_t r;
1002
1003 assert(f);
c88cc6af 1004 assert(f->header);
cec736d2 1005
beec0085 1006 r = le64toh(f->header->tail_entry_seqnum) + 1;
c2373f84
LP
1007
1008 if (seqnum) {
de190aef 1009 /* If an external seqnum counter was passed, we update
c2373f84
LP
1010 * both the local and the external one, and set it to
1011 * the maximum of both */
1012
1013 if (*seqnum + 1 > r)
1014 r = *seqnum + 1;
1015
1016 *seqnum = r;
1017 }
1018
beec0085 1019 f->header->tail_entry_seqnum = htole64(r);
cec736d2 1020
beec0085
LP
1021 if (f->header->head_entry_seqnum == 0)
1022 f->header->head_entry_seqnum = htole64(r);
de190aef 1023
cec736d2
LP
1024 return r;
1025}
1026
78519831 1027int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
cec736d2
LP
1028 int r;
1029 uint64_t p;
1030 Object *tail, *o;
1031 void *t;
1032
1033 assert(f);
c88cc6af 1034 assert(f->header);
d05089d8 1035 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
cec736d2
LP
1036 assert(size >= sizeof(ObjectHeader));
1037 assert(offset);
1038 assert(ret);
1039
26687bf8
OS
1040 r = journal_file_set_online(f);
1041 if (r < 0)
1042 return r;
1043
cec736d2 1044 p = le64toh(f->header->tail_object_offset);
cec736d2 1045 if (p == 0)
23b0b2b2 1046 p = le64toh(f->header->header_size);
cec736d2 1047 else {
d05089d8 1048 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
cec736d2
LP
1049 if (r < 0)
1050 return r;
1051
1052 p += ALIGN64(le64toh(tail->object.size));
1053 }
1054
1055 r = journal_file_allocate(f, p, size);
1056 if (r < 0)
1057 return r;
1058
b439282e 1059 r = journal_file_move_to(f, type, false, p, size, &t, NULL);
cec736d2
LP
1060 if (r < 0)
1061 return r;
1062
1063 o = (Object*) t;
1064
1065 zero(o->object);
de190aef 1066 o->object.type = type;
cec736d2
LP
1067 o->object.size = htole64(size);
1068
1069 f->header->tail_object_offset = htole64(p);
cec736d2
LP
1070 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
1071
1072 *ret = o;
1073 *offset = p;
1074
1075 return 0;
1076}
1077
de190aef 1078static int journal_file_setup_data_hash_table(JournalFile *f) {
cec736d2
LP
1079 uint64_t s, p;
1080 Object *o;
1081 int r;
1082
1083 assert(f);
c88cc6af 1084 assert(f->header);
cec736d2 1085
070052ab
LP
1086 /* We estimate that we need 1 hash table entry per 768 bytes
1087 of journal file and we want to make sure we never get
1088 beyond 75% fill level. Calculate the hash table size for
1089 the maximum file size based on these metrics. */
4a92baf3 1090
dfabe643 1091 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
4a92baf3
LP
1092 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
1093 s = DEFAULT_DATA_HASH_TABLE_SIZE;
1094
507f22bd 1095 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
4a92baf3 1096
de190aef
LP
1097 r = journal_file_append_object(f,
1098 OBJECT_DATA_HASH_TABLE,
1099 offsetof(Object, hash_table.items) + s,
1100 &o, &p);
cec736d2
LP
1101 if (r < 0)
1102 return r;
1103
29804cc1 1104 memzero(o->hash_table.items, s);
cec736d2 1105
de190aef
LP
1106 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1107 f->header->data_hash_table_size = htole64(s);
cec736d2
LP
1108
1109 return 0;
1110}
1111
de190aef 1112static int journal_file_setup_field_hash_table(JournalFile *f) {
cec736d2
LP
1113 uint64_t s, p;
1114 Object *o;
1115 int r;
1116
1117 assert(f);
c88cc6af 1118 assert(f->header);
cec736d2 1119
3c1668da
LP
1120 /* We use a fixed size hash table for the fields as this
1121 * number should grow very slowly only */
1122
de190aef
LP
1123 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
1124 r = journal_file_append_object(f,
1125 OBJECT_FIELD_HASH_TABLE,
1126 offsetof(Object, hash_table.items) + s,
1127 &o, &p);
cec736d2
LP
1128 if (r < 0)
1129 return r;
1130
29804cc1 1131 memzero(o->hash_table.items, s);
cec736d2 1132
de190aef
LP
1133 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1134 f->header->field_hash_table_size = htole64(s);
cec736d2
LP
1135
1136 return 0;
1137}
1138
dade37d4 1139int journal_file_map_data_hash_table(JournalFile *f) {
cec736d2
LP
1140 uint64_t s, p;
1141 void *t;
1142 int r;
1143
1144 assert(f);
c88cc6af 1145 assert(f->header);
cec736d2 1146
dade37d4
LP
1147 if (f->data_hash_table)
1148 return 0;
1149
de190aef
LP
1150 p = le64toh(f->header->data_hash_table_offset);
1151 s = le64toh(f->header->data_hash_table_size);
cec736d2 1152
de190aef 1153 r = journal_file_move_to(f,
16e9f408 1154 OBJECT_DATA_HASH_TABLE,
fcde2389 1155 true,
de190aef 1156 p, s,
b42549ad 1157 &t, NULL);
cec736d2
LP
1158 if (r < 0)
1159 return r;
1160
de190aef 1161 f->data_hash_table = t;
cec736d2
LP
1162 return 0;
1163}
1164
dade37d4 1165int journal_file_map_field_hash_table(JournalFile *f) {
cec736d2
LP
1166 uint64_t s, p;
1167 void *t;
1168 int r;
1169
1170 assert(f);
c88cc6af 1171 assert(f->header);
cec736d2 1172
dade37d4
LP
1173 if (f->field_hash_table)
1174 return 0;
1175
de190aef
LP
1176 p = le64toh(f->header->field_hash_table_offset);
1177 s = le64toh(f->header->field_hash_table_size);
cec736d2 1178
de190aef 1179 r = journal_file_move_to(f,
16e9f408 1180 OBJECT_FIELD_HASH_TABLE,
fcde2389 1181 true,
de190aef 1182 p, s,
b42549ad 1183 &t, NULL);
cec736d2
LP
1184 if (r < 0)
1185 return r;
1186
de190aef 1187 f->field_hash_table = t;
cec736d2
LP
1188 return 0;
1189}
1190
3c1668da
LP
1191static int journal_file_link_field(
1192 JournalFile *f,
1193 Object *o,
1194 uint64_t offset,
1195 uint64_t hash) {
1196
805d1486 1197 uint64_t p, h, m;
3c1668da
LP
1198 int r;
1199
1200 assert(f);
c88cc6af 1201 assert(f->header);
90d222c1 1202 assert(f->field_hash_table);
3c1668da
LP
1203 assert(o);
1204 assert(offset > 0);
1205
1206 if (o->object.type != OBJECT_FIELD)
1207 return -EINVAL;
1208
805d1486
LP
1209 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
1210 if (m <= 0)
1211 return -EBADMSG;
3c1668da 1212
805d1486 1213 /* This might alter the window we are looking at */
3c1668da
LP
1214 o->field.next_hash_offset = o->field.head_data_offset = 0;
1215
805d1486 1216 h = hash % m;
3c1668da
LP
1217 p = le64toh(f->field_hash_table[h].tail_hash_offset);
1218 if (p == 0)
1219 f->field_hash_table[h].head_hash_offset = htole64(offset);
1220 else {
1221 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1222 if (r < 0)
1223 return r;
1224
1225 o->field.next_hash_offset = htole64(offset);
1226 }
1227
1228 f->field_hash_table[h].tail_hash_offset = htole64(offset);
1229
1230 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
1231 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
1232
1233 return 0;
1234}
1235
1236static int journal_file_link_data(
1237 JournalFile *f,
1238 Object *o,
1239 uint64_t offset,
1240 uint64_t hash) {
1241
805d1486 1242 uint64_t p, h, m;
cec736d2
LP
1243 int r;
1244
1245 assert(f);
c88cc6af 1246 assert(f->header);
90d222c1 1247 assert(f->data_hash_table);
cec736d2
LP
1248 assert(o);
1249 assert(offset > 0);
b588975f
LP
1250
1251 if (o->object.type != OBJECT_DATA)
1252 return -EINVAL;
cec736d2 1253
805d1486
LP
1254 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
1255 if (m <= 0)
1256 return -EBADMSG;
48496df6 1257
805d1486 1258 /* This might alter the window we are looking at */
de190aef
LP
1259 o->data.next_hash_offset = o->data.next_field_offset = 0;
1260 o->data.entry_offset = o->data.entry_array_offset = 0;
1261 o->data.n_entries = 0;
cec736d2 1262
805d1486 1263 h = hash % m;
8db4213e 1264 p = le64toh(f->data_hash_table[h].tail_hash_offset);
3c1668da 1265 if (p == 0)
cec736d2 1266 /* Only entry in the hash table is easy */
de190aef 1267 f->data_hash_table[h].head_hash_offset = htole64(offset);
3c1668da 1268 else {
48496df6
LP
1269 /* Move back to the previous data object, to patch in
1270 * pointer */
cec736d2 1271
de190aef 1272 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1273 if (r < 0)
1274 return r;
1275
de190aef 1276 o->data.next_hash_offset = htole64(offset);
cec736d2
LP
1277 }
1278
de190aef 1279 f->data_hash_table[h].tail_hash_offset = htole64(offset);
cec736d2 1280
dca6219e
LP
1281 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
1282 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
1283
cec736d2
LP
1284 return 0;
1285}
1286
3c1668da
LP
1287int journal_file_find_field_object_with_hash(
1288 JournalFile *f,
1289 const void *field, uint64_t size, uint64_t hash,
1290 Object **ret, uint64_t *offset) {
1291
805d1486 1292 uint64_t p, osize, h, m;
3c1668da
LP
1293 int r;
1294
1295 assert(f);
c88cc6af 1296 assert(f->header);
3c1668da
LP
1297 assert(field && size > 0);
1298
dade37d4
LP
1299 /* If the field hash table is empty, we can't find anything */
1300 if (le64toh(f->header->field_hash_table_size) <= 0)
1301 return 0;
1302
1303 /* Map the field hash table, if it isn't mapped yet. */
1304 r = journal_file_map_field_hash_table(f);
1305 if (r < 0)
1306 return r;
1307
3c1668da
LP
1308 osize = offsetof(Object, field.payload) + size;
1309
805d1486 1310 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
805d1486 1311 if (m <= 0)
3c1668da
LP
1312 return -EBADMSG;
1313
805d1486 1314 h = hash % m;
3c1668da
LP
1315 p = le64toh(f->field_hash_table[h].head_hash_offset);
1316
1317 while (p > 0) {
1318 Object *o;
1319
1320 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1321 if (r < 0)
1322 return r;
1323
1324 if (le64toh(o->field.hash) == hash &&
1325 le64toh(o->object.size) == osize &&
1326 memcmp(o->field.payload, field, size) == 0) {
1327
1328 if (ret)
1329 *ret = o;
1330 if (offset)
1331 *offset = p;
1332
1333 return 1;
1334 }
1335
1336 p = le64toh(o->field.next_hash_offset);
1337 }
1338
1339 return 0;
1340}
1341
1342int journal_file_find_field_object(
1343 JournalFile *f,
1344 const void *field, uint64_t size,
1345 Object **ret, uint64_t *offset) {
1346
1347 uint64_t hash;
1348
1349 assert(f);
1350 assert(field && size > 0);
1351
1352 hash = hash64(field, size);
1353
1354 return journal_file_find_field_object_with_hash(f,
1355 field, size, hash,
1356 ret, offset);
1357}
1358
de190aef
LP
1359int journal_file_find_data_object_with_hash(
1360 JournalFile *f,
1361 const void *data, uint64_t size, uint64_t hash,
1362 Object **ret, uint64_t *offset) {
48496df6 1363
805d1486 1364 uint64_t p, osize, h, m;
cec736d2
LP
1365 int r;
1366
1367 assert(f);
c88cc6af 1368 assert(f->header);
cec736d2
LP
1369 assert(data || size == 0);
1370
dade37d4
LP
1371 /* If there's no data hash table, then there's no entry. */
1372 if (le64toh(f->header->data_hash_table_size) <= 0)
1373 return 0;
1374
1375 /* Map the data hash table, if it isn't mapped yet. */
1376 r = journal_file_map_data_hash_table(f);
1377 if (r < 0)
1378 return r;
1379
cec736d2
LP
1380 osize = offsetof(Object, data.payload) + size;
1381
805d1486
LP
1382 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
1383 if (m <= 0)
bc85bfee
LP
1384 return -EBADMSG;
1385
805d1486 1386 h = hash % m;
de190aef 1387 p = le64toh(f->data_hash_table[h].head_hash_offset);
cec736d2 1388
de190aef
LP
1389 while (p > 0) {
1390 Object *o;
cec736d2 1391
de190aef 1392 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1393 if (r < 0)
1394 return r;
1395
807e17f0 1396 if (le64toh(o->data.hash) != hash)
85a131e8 1397 goto next;
807e17f0 1398
d89c8fdf 1399 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
349cc4a5 1400#if HAVE_XZ || HAVE_LZ4
fa1c4b51 1401 uint64_t l;
a7f7d1bd 1402 size_t rsize = 0;
cec736d2 1403
807e17f0
LP
1404 l = le64toh(o->object.size);
1405 if (l <= offsetof(Object, data.payload))
cec736d2
LP
1406 return -EBADMSG;
1407
807e17f0
LP
1408 l -= offsetof(Object, data.payload);
1409
d89c8fdf
ZJS
1410 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
1411 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
1412 if (r < 0)
1413 return r;
807e17f0 1414
b785c858 1415 if (rsize == size &&
807e17f0
LP
1416 memcmp(f->compress_buffer, data, size) == 0) {
1417
1418 if (ret)
1419 *ret = o;
1420
1421 if (offset)
1422 *offset = p;
1423
1424 return 1;
1425 }
3b1a55e1
ZJS
1426#else
1427 return -EPROTONOSUPPORT;
1428#endif
807e17f0
LP
1429 } else if (le64toh(o->object.size) == osize &&
1430 memcmp(o->data.payload, data, size) == 0) {
1431
cec736d2
LP
1432 if (ret)
1433 *ret = o;
1434
1435 if (offset)
1436 *offset = p;
1437
de190aef 1438 return 1;
cec736d2
LP
1439 }
1440
85a131e8 1441 next:
cec736d2
LP
1442 p = le64toh(o->data.next_hash_offset);
1443 }
1444
de190aef
LP
1445 return 0;
1446}
1447
1448int journal_file_find_data_object(
1449 JournalFile *f,
1450 const void *data, uint64_t size,
1451 Object **ret, uint64_t *offset) {
1452
1453 uint64_t hash;
1454
1455 assert(f);
1456 assert(data || size == 0);
1457
1458 hash = hash64(data, size);
1459
1460 return journal_file_find_data_object_with_hash(f,
1461 data, size, hash,
1462 ret, offset);
1463}
1464
3c1668da
LP
1465static int journal_file_append_field(
1466 JournalFile *f,
1467 const void *field, uint64_t size,
1468 Object **ret, uint64_t *offset) {
1469
1470 uint64_t hash, p;
1471 uint64_t osize;
1472 Object *o;
1473 int r;
1474
1475 assert(f);
1476 assert(field && size > 0);
1477
1478 hash = hash64(field, size);
1479
1480 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
1481 if (r < 0)
1482 return r;
1483 else if (r > 0) {
1484
1485 if (ret)
1486 *ret = o;
1487
1488 if (offset)
1489 *offset = p;
1490
1491 return 0;
1492 }
1493
1494 osize = offsetof(Object, field.payload) + size;
1495 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
8c92d4bb
LP
1496 if (r < 0)
1497 return r;
3c1668da
LP
1498
1499 o->field.hash = htole64(hash);
1500 memcpy(o->field.payload, field, size);
1501
1502 r = journal_file_link_field(f, o, p, hash);
1503 if (r < 0)
1504 return r;
1505
1506 /* The linking might have altered the window, so let's
1507 * refresh our pointer */
1508 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1509 if (r < 0)
1510 return r;
1511
349cc4a5 1512#if HAVE_GCRYPT
3c1668da
LP
1513 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1514 if (r < 0)
1515 return r;
1516#endif
1517
1518 if (ret)
1519 *ret = o;
1520
1521 if (offset)
1522 *offset = p;
1523
1524 return 0;
1525}
1526
48496df6
LP
1527static int journal_file_append_data(
1528 JournalFile *f,
1529 const void *data, uint64_t size,
1530 Object **ret, uint64_t *offset) {
1531
de190aef
LP
1532 uint64_t hash, p;
1533 uint64_t osize;
1534 Object *o;
d89c8fdf 1535 int r, compression = 0;
3c1668da 1536 const void *eq;
de190aef
LP
1537
1538 assert(f);
1539 assert(data || size == 0);
1540
1541 hash = hash64(data, size);
1542
1543 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1544 if (r < 0)
1545 return r;
0240c603 1546 if (r > 0) {
de190aef
LP
1547
1548 if (ret)
1549 *ret = o;
1550
1551 if (offset)
1552 *offset = p;
1553
1554 return 0;
1555 }
1556
1557 osize = offsetof(Object, data.payload) + size;
1558 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
cec736d2
LP
1559 if (r < 0)
1560 return r;
1561
cec736d2 1562 o->data.hash = htole64(hash);
807e17f0 1563
349cc4a5 1564#if HAVE_XZ || HAVE_LZ4
d1afbcd2 1565 if (JOURNAL_FILE_COMPRESS(f) && size >= COMPRESSION_SIZE_THRESHOLD) {
a7f7d1bd 1566 size_t rsize = 0;
807e17f0 1567
5d6f46b6 1568 compression = compress_blob(data, size, o->data.payload, size - 1, &rsize);
807e17f0 1569
d1afbcd2 1570 if (compression >= 0) {
807e17f0 1571 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
d89c8fdf 1572 o->object.flags |= compression;
807e17f0 1573
fa1c4b51 1574 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
d89c8fdf 1575 size, rsize, object_compressed_to_string(compression));
d1afbcd2
LP
1576 } else
1577 /* Compression didn't work, we don't really care why, let's continue without compression */
1578 compression = 0;
807e17f0
LP
1579 }
1580#endif
1581
75f32f04
ZJS
1582 if (compression == 0)
1583 memcpy_safe(o->data.payload, data, size);
cec736d2 1584
de190aef 1585 r = journal_file_link_data(f, o, p, hash);
cec736d2
LP
1586 if (r < 0)
1587 return r;
1588
349cc4a5 1589#if HAVE_GCRYPT
33685a5a
FB
1590 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1591 if (r < 0)
1592 return r;
1593#endif
1594
48496df6
LP
1595 /* The linking might have altered the window, so let's
1596 * refresh our pointer */
1597 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1598 if (r < 0)
1599 return r;
1600
08c6f819
SL
1601 if (!data)
1602 eq = NULL;
1603 else
1604 eq = memchr(data, '=', size);
3c1668da 1605 if (eq && eq > data) {
748db592 1606 Object *fo = NULL;
3c1668da 1607 uint64_t fp;
3c1668da
LP
1608
1609 /* Create field object ... */
1610 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1611 if (r < 0)
1612 return r;
1613
1614 /* ... and link it in. */
1615 o->data.next_field_offset = fo->field.head_data_offset;
1616 fo->field.head_data_offset = le64toh(p);
1617 }
1618
cec736d2
LP
1619 if (ret)
1620 *ret = o;
1621
1622 if (offset)
de190aef 1623 *offset = p;
cec736d2
LP
1624
1625 return 0;
1626}
1627
1628uint64_t journal_file_entry_n_items(Object *o) {
1629 assert(o);
b588975f
LP
1630
1631 if (o->object.type != OBJECT_ENTRY)
1632 return 0;
cec736d2
LP
1633
1634 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1635}
1636
0284adc6 1637uint64_t journal_file_entry_array_n_items(Object *o) {
de190aef 1638 assert(o);
b588975f
LP
1639
1640 if (o->object.type != OBJECT_ENTRY_ARRAY)
1641 return 0;
de190aef
LP
1642
1643 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1644}
1645
fb9a24b6
LP
1646uint64_t journal_file_hash_table_n_items(Object *o) {
1647 assert(o);
b588975f 1648
ec2ce0c5 1649 if (!IN_SET(o->object.type, OBJECT_DATA_HASH_TABLE, OBJECT_FIELD_HASH_TABLE))
b588975f 1650 return 0;
fb9a24b6
LP
1651
1652 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1653}
1654
de190aef 1655static int link_entry_into_array(JournalFile *f,
4fd052ae
FC
1656 le64_t *first,
1657 le64_t *idx,
de190aef 1658 uint64_t p) {
cec736d2 1659 int r;
de190aef
LP
1660 uint64_t n = 0, ap = 0, q, i, a, hidx;
1661 Object *o;
1662
cec736d2 1663 assert(f);
c88cc6af 1664 assert(f->header);
de190aef
LP
1665 assert(first);
1666 assert(idx);
1667 assert(p > 0);
cec736d2 1668
de190aef
LP
1669 a = le64toh(*first);
1670 i = hidx = le64toh(*idx);
1671 while (a > 0) {
1672
1673 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1674 if (r < 0)
1675 return r;
cec736d2 1676
de190aef
LP
1677 n = journal_file_entry_array_n_items(o);
1678 if (i < n) {
1679 o->entry_array.items[i] = htole64(p);
1680 *idx = htole64(hidx + 1);
1681 return 0;
1682 }
cec736d2 1683
de190aef
LP
1684 i -= n;
1685 ap = a;
1686 a = le64toh(o->entry_array.next_entry_array_offset);
1687 }
1688
1689 if (hidx > n)
1690 n = (hidx+1) * 2;
1691 else
1692 n = n * 2;
1693
1694 if (n < 4)
1695 n = 4;
1696
1697 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1698 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1699 &o, &q);
cec736d2
LP
1700 if (r < 0)
1701 return r;
1702
349cc4a5 1703#if HAVE_GCRYPT
5996c7c2 1704 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
b0af6f41
LP
1705 if (r < 0)
1706 return r;
feb12d3e 1707#endif
b0af6f41 1708
de190aef 1709 o->entry_array.items[i] = htole64(p);
cec736d2 1710
de190aef 1711 if (ap == 0)
7be3aa17 1712 *first = htole64(q);
cec736d2 1713 else {
de190aef 1714 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
cec736d2
LP
1715 if (r < 0)
1716 return r;
1717
de190aef
LP
1718 o->entry_array.next_entry_array_offset = htole64(q);
1719 }
cec736d2 1720
2dee23eb
LP
1721 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1722 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1723
de190aef
LP
1724 *idx = htole64(hidx + 1);
1725
1726 return 0;
1727}
cec736d2 1728
de190aef 1729static int link_entry_into_array_plus_one(JournalFile *f,
4fd052ae
FC
1730 le64_t *extra,
1731 le64_t *first,
1732 le64_t *idx,
de190aef
LP
1733 uint64_t p) {
1734
1735 int r;
1736
1737 assert(f);
1738 assert(extra);
1739 assert(first);
1740 assert(idx);
1741 assert(p > 0);
1742
1743 if (*idx == 0)
1744 *extra = htole64(p);
1745 else {
4fd052ae 1746 le64_t i;
de190aef 1747
7be3aa17 1748 i = htole64(le64toh(*idx) - 1);
de190aef
LP
1749 r = link_entry_into_array(f, first, &i, p);
1750 if (r < 0)
1751 return r;
cec736d2
LP
1752 }
1753
de190aef
LP
1754 *idx = htole64(le64toh(*idx) + 1);
1755 return 0;
1756}
1757
1758static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1759 uint64_t p;
1760 int r;
1761 assert(f);
1762 assert(o);
1763 assert(offset > 0);
1764
1765 p = le64toh(o->entry.items[i].object_offset);
1766 if (p == 0)
1767 return -EINVAL;
1768
1769 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1770 if (r < 0)
1771 return r;
1772
de190aef
LP
1773 return link_entry_into_array_plus_one(f,
1774 &o->data.entry_offset,
1775 &o->data.entry_array_offset,
1776 &o->data.n_entries,
1777 offset);
cec736d2
LP
1778}
1779
1780static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
de190aef 1781 uint64_t n, i;
cec736d2
LP
1782 int r;
1783
1784 assert(f);
c88cc6af 1785 assert(f->header);
cec736d2
LP
1786 assert(o);
1787 assert(offset > 0);
b588975f
LP
1788
1789 if (o->object.type != OBJECT_ENTRY)
1790 return -EINVAL;
cec736d2 1791
b788cc23
LP
1792 __sync_synchronize();
1793
cec736d2 1794 /* Link up the entry itself */
de190aef
LP
1795 r = link_entry_into_array(f,
1796 &f->header->entry_array_offset,
1797 &f->header->n_entries,
1798 offset);
1799 if (r < 0)
1800 return r;
cec736d2 1801
507f22bd 1802 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
cec736d2 1803
de190aef 1804 if (f->header->head_entry_realtime == 0)
0ac38b70 1805 f->header->head_entry_realtime = o->entry.realtime;
cec736d2 1806
0ac38b70 1807 f->header->tail_entry_realtime = o->entry.realtime;
de190aef
LP
1808 f->header->tail_entry_monotonic = o->entry.monotonic;
1809
1810 f->tail_entry_monotonic_valid = true;
cec736d2
LP
1811
1812 /* Link up the items */
1813 n = journal_file_entry_n_items(o);
1814 for (i = 0; i < n; i++) {
1815 r = journal_file_link_entry_item(f, o, offset, i);
1816 if (r < 0)
1817 return r;
1818 }
1819
cec736d2
LP
1820 return 0;
1821}
1822
1823static int journal_file_append_entry_internal(
1824 JournalFile *f,
1825 const dual_timestamp *ts,
1826 uint64_t xor_hash,
1827 const EntryItem items[], unsigned n_items,
de190aef 1828 uint64_t *seqnum,
cec736d2
LP
1829 Object **ret, uint64_t *offset) {
1830 uint64_t np;
1831 uint64_t osize;
1832 Object *o;
1833 int r;
1834
1835 assert(f);
c88cc6af 1836 assert(f->header);
cec736d2 1837 assert(items || n_items == 0);
de190aef 1838 assert(ts);
cec736d2
LP
1839
1840 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1841
de190aef 1842 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
cec736d2
LP
1843 if (r < 0)
1844 return r;
1845
d98cc1f2 1846 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
75f32f04 1847 memcpy_safe(o->entry.items, items, n_items * sizeof(EntryItem));
de190aef
LP
1848 o->entry.realtime = htole64(ts->realtime);
1849 o->entry.monotonic = htole64(ts->monotonic);
cec736d2
LP
1850 o->entry.xor_hash = htole64(xor_hash);
1851 o->entry.boot_id = f->header->boot_id;
1852
349cc4a5 1853#if HAVE_GCRYPT
5996c7c2 1854 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
b0af6f41
LP
1855 if (r < 0)
1856 return r;
feb12d3e 1857#endif
b0af6f41 1858
cec736d2
LP
1859 r = journal_file_link_entry(f, o, np);
1860 if (r < 0)
1861 return r;
1862
1863 if (ret)
1864 *ret = o;
1865
1866 if (offset)
1867 *offset = np;
1868
1869 return 0;
1870}
1871
cf244689 1872void journal_file_post_change(JournalFile *f) {
50f20cfd
LP
1873 assert(f);
1874
1875 /* inotify() does not receive IN_MODIFY events from file
1876 * accesses done via mmap(). After each access we hence
1877 * trigger IN_MODIFY by truncating the journal file to its
1878 * current size which triggers IN_MODIFY. */
1879
bc85bfee
LP
1880 __sync_synchronize();
1881
50f20cfd 1882 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
e167d7fd 1883 log_debug_errno(errno, "Failed to truncate file to its own size: %m");
50f20cfd
LP
1884}
1885
7a24f3bf
VC
1886static int post_change_thunk(sd_event_source *timer, uint64_t usec, void *userdata) {
1887 assert(userdata);
1888
1889 journal_file_post_change(userdata);
1890
1891 return 1;
1892}
1893
1894static void schedule_post_change(JournalFile *f) {
1895 sd_event_source *timer;
1896 int enabled, r;
1897 uint64_t now;
1898
1899 assert(f);
1900 assert(f->post_change_timer);
1901
1902 timer = f->post_change_timer;
1903
1904 r = sd_event_source_get_enabled(timer, &enabled);
1905 if (r < 0) {
e167d7fd
LP
1906 log_debug_errno(r, "Failed to get ftruncate timer state: %m");
1907 goto fail;
7a24f3bf
VC
1908 }
1909
1910 if (enabled == SD_EVENT_ONESHOT)
1911 return;
1912
1913 r = sd_event_now(sd_event_source_get_event(timer), CLOCK_MONOTONIC, &now);
1914 if (r < 0) {
e167d7fd
LP
1915 log_debug_errno(r, "Failed to get clock's now for scheduling ftruncate: %m");
1916 goto fail;
7a24f3bf
VC
1917 }
1918
1919 r = sd_event_source_set_time(timer, now+f->post_change_timer_period);
1920 if (r < 0) {
e167d7fd
LP
1921 log_debug_errno(r, "Failed to set time for scheduling ftruncate: %m");
1922 goto fail;
7a24f3bf
VC
1923 }
1924
1925 r = sd_event_source_set_enabled(timer, SD_EVENT_ONESHOT);
1926 if (r < 0) {
e167d7fd
LP
1927 log_debug_errno(r, "Failed to enable scheduled ftruncate: %m");
1928 goto fail;
7a24f3bf 1929 }
e167d7fd
LP
1930
1931 return;
1932
1933fail:
1934 /* On failure, let's simply post the change immediately. */
1935 journal_file_post_change(f);
7a24f3bf
VC
1936}
1937
1938/* Enable coalesced change posting in a timer on the provided sd_event instance */
1939int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t) {
1940 _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL;
1941 int r;
1942
1943 assert(f);
1944 assert_return(!f->post_change_timer, -EINVAL);
1945 assert(e);
1946 assert(t);
1947
1948 r = sd_event_add_time(e, &timer, CLOCK_MONOTONIC, 0, 0, post_change_thunk, f);
1949 if (r < 0)
1950 return r;
1951
1952 r = sd_event_source_set_enabled(timer, SD_EVENT_OFF);
1953 if (r < 0)
1954 return r;
1955
1956 f->post_change_timer = timer;
1957 timer = NULL;
1958 f->post_change_timer_period = t;
1959
1960 return r;
1961}
1962
1f2da9ec
LP
1963static int entry_item_cmp(const void *_a, const void *_b) {
1964 const EntryItem *a = _a, *b = _b;
1965
1966 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1967 return -1;
1968 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1969 return 1;
1970 return 0;
1971}
1972
de190aef 1973int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
cec736d2
LP
1974 unsigned i;
1975 EntryItem *items;
1976 int r;
1977 uint64_t xor_hash = 0;
de190aef 1978 struct dual_timestamp _ts;
cec736d2
LP
1979
1980 assert(f);
c88cc6af 1981 assert(f->header);
cec736d2
LP
1982 assert(iovec || n_iovec == 0);
1983
de190aef
LP
1984 if (!ts) {
1985 dual_timestamp_get(&_ts);
1986 ts = &_ts;
1987 }
1988
349cc4a5 1989#if HAVE_GCRYPT
7560fffc
LP
1990 r = journal_file_maybe_append_tag(f, ts->realtime);
1991 if (r < 0)
1992 return r;
feb12d3e 1993#endif
7560fffc 1994
64825d3c 1995 /* alloca() can't take 0, hence let's allocate at least one */
9607d947 1996 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
cec736d2
LP
1997
1998 for (i = 0; i < n_iovec; i++) {
1999 uint64_t p;
2000 Object *o;
2001
2002 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
2003 if (r < 0)
cf244689 2004 return r;
cec736d2
LP
2005
2006 xor_hash ^= le64toh(o->data.hash);
2007 items[i].object_offset = htole64(p);
de7b95cd 2008 items[i].hash = o->data.hash;
cec736d2
LP
2009 }
2010
1f2da9ec
LP
2011 /* Order by the position on disk, in order to improve seek
2012 * times for rotating media. */
7ff7394d 2013 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1f2da9ec 2014
de190aef 2015 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
cec736d2 2016
fa6ac760
LP
2017 /* If the memory mapping triggered a SIGBUS then we return an
2018 * IO error and ignore the error code passed down to us, since
2019 * it is very likely just an effect of a nullified replacement
2020 * mapping page */
2021
be7cdd8e 2022 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
2023 r = -EIO;
2024
7a24f3bf
VC
2025 if (f->post_change_timer)
2026 schedule_post_change(f);
2027 else
2028 journal_file_post_change(f);
50f20cfd 2029
cec736d2
LP
2030 return r;
2031}
2032
/* One cached position within a chain of entry array objects */
typedef struct ChainCacheItem {
        /* The array at the beginning of the chain (also used as hashmap key) */
        uint64_t first;

        /* The cached array */
        uint64_t array;

        /* The first item in the cached array */
        uint64_t begin;

        /* The total number of items in all arrays before this one in the chain */
        uint64_t total;

        /* The last index we looked at, to optimize locality when bisecting */
        uint64_t last_index;
} ChainCacheItem;
2040
2041static void chain_cache_put(
4743015d 2042 OrderedHashmap *h,
a4bcff5b
LP
2043 ChainCacheItem *ci,
2044 uint64_t first,
2045 uint64_t array,
2046 uint64_t begin,
f268980d
LP
2047 uint64_t total,
2048 uint64_t last_index) {
a4bcff5b
LP
2049
2050 if (!ci) {
34741aa3
LP
2051 /* If the chain item to cache for this chain is the
2052 * first one it's not worth caching anything */
2053 if (array == first)
2054 return;
2055
29433089 2056 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
4743015d 2057 ci = ordered_hashmap_steal_first(h);
29433089
LP
2058 assert(ci);
2059 } else {
a4bcff5b
LP
2060 ci = new(ChainCacheItem, 1);
2061 if (!ci)
2062 return;
2063 }
2064
2065 ci->first = first;
2066
4743015d 2067 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
a4bcff5b
LP
2068 free(ci);
2069 return;
2070 }
2071 } else
2072 assert(ci->first == first);
2073
2074 ci->array = array;
2075 ci->begin = begin;
2076 ci->total = total;
f268980d 2077 ci->last_index = last_index;
a4bcff5b
LP
2078}
2079
f268980d
LP
2080static int generic_array_get(
2081 JournalFile *f,
2082 uint64_t first,
2083 uint64_t i,
2084 Object **ret, uint64_t *offset) {
de190aef 2085
cec736d2 2086 Object *o;
a4bcff5b 2087 uint64_t p = 0, a, t = 0;
cec736d2 2088 int r;
a4bcff5b 2089 ChainCacheItem *ci;
cec736d2
LP
2090
2091 assert(f);
2092
de190aef 2093 a = first;
a4bcff5b
LP
2094
2095 /* Try the chain cache first */
4743015d 2096 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
2097 if (ci && i > ci->total) {
2098 a = ci->array;
2099 i -= ci->total;
2100 t = ci->total;
2101 }
2102
de190aef 2103 while (a > 0) {
a4bcff5b 2104 uint64_t k;
cec736d2 2105
de190aef
LP
2106 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
2107 if (r < 0)
2108 return r;
cec736d2 2109
a4bcff5b
LP
2110 k = journal_file_entry_array_n_items(o);
2111 if (i < k) {
de190aef 2112 p = le64toh(o->entry_array.items[i]);
a4bcff5b 2113 goto found;
cec736d2
LP
2114 }
2115
a4bcff5b
LP
2116 i -= k;
2117 t += k;
de190aef
LP
2118 a = le64toh(o->entry_array.next_entry_array_offset);
2119 }
2120
a4bcff5b
LP
2121 return 0;
2122
2123found:
2124 /* Let's cache this item for the next invocation */
af13a6b0 2125 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
de190aef
LP
2126
2127 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2128 if (r < 0)
2129 return r;
2130
2131 if (ret)
2132 *ret = o;
2133
2134 if (offset)
2135 *offset = p;
2136
2137 return 1;
2138}
2139
f268980d
LP
2140static int generic_array_get_plus_one(
2141 JournalFile *f,
2142 uint64_t extra,
2143 uint64_t first,
2144 uint64_t i,
2145 Object **ret, uint64_t *offset) {
de190aef
LP
2146
2147 Object *o;
2148
2149 assert(f);
2150
2151 if (i == 0) {
2152 int r;
2153
2154 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
cec736d2
LP
2155 if (r < 0)
2156 return r;
2157
de190aef
LP
2158 if (ret)
2159 *ret = o;
cec736d2 2160
de190aef
LP
2161 if (offset)
2162 *offset = extra;
cec736d2 2163
de190aef 2164 return 1;
cec736d2
LP
2165 }
2166
de190aef
LP
2167 return generic_array_get(f, first, i-1, ret, offset);
2168}
cec736d2 2169
de190aef
LP
/* Results of the test_object_xyz() comparison callbacks used by the bisection code below */
enum {
        TEST_FOUND = 0, /* the tested entry matches the needle */
        TEST_LEFT  = 1, /* the tested entry sorts before the needle */
        TEST_RIGHT = 2  /* the tested entry sorts after the needle */
};
cec736d2 2175
f268980d
LP
2176static int generic_array_bisect(
2177 JournalFile *f,
2178 uint64_t first,
2179 uint64_t n,
2180 uint64_t needle,
2181 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2182 direction_t direction,
2183 Object **ret,
2184 uint64_t *offset,
2185 uint64_t *idx) {
2186
2187 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
de190aef
LP
2188 bool subtract_one = false;
2189 Object *o, *array = NULL;
2190 int r;
a4bcff5b 2191 ChainCacheItem *ci;
cec736d2 2192
de190aef
LP
2193 assert(f);
2194 assert(test_object);
cec736d2 2195
a4bcff5b 2196 /* Start with the first array in the chain */
de190aef 2197 a = first;
a4bcff5b 2198
4743015d 2199 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
2200 if (ci && n > ci->total) {
2201 /* Ah, we have iterated this bisection array chain
2202 * previously! Let's see if we can skip ahead in the
2203 * chain, as far as the last time. But we can't jump
2204 * backwards in the chain, so let's check that
2205 * first. */
2206
2207 r = test_object(f, ci->begin, needle);
2208 if (r < 0)
2209 return r;
2210
2211 if (r == TEST_LEFT) {
f268980d 2212 /* OK, what we are looking for is right of the
a4bcff5b
LP
2213 * begin of this EntryArray, so let's jump
2214 * straight to previously cached array in the
2215 * chain */
2216
2217 a = ci->array;
2218 n -= ci->total;
2219 t = ci->total;
f268980d 2220 last_index = ci->last_index;
a4bcff5b
LP
2221 }
2222 }
2223
de190aef
LP
2224 while (a > 0) {
2225 uint64_t left, right, k, lp;
2226
2227 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
cec736d2
LP
2228 if (r < 0)
2229 return r;
2230
de190aef
LP
2231 k = journal_file_entry_array_n_items(array);
2232 right = MIN(k, n);
2233 if (right <= 0)
2234 return 0;
cec736d2 2235
de190aef
LP
2236 i = right - 1;
2237 lp = p = le64toh(array->entry_array.items[i]);
2238 if (p <= 0)
bee6a291
LP
2239 r = -EBADMSG;
2240 else
2241 r = test_object(f, p, needle);
2242 if (r == -EBADMSG) {
2243 log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (1)");
2244 n = i;
2245 continue;
2246 }
de190aef
LP
2247 if (r < 0)
2248 return r;
cec736d2 2249
de190aef
LP
2250 if (r == TEST_FOUND)
2251 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2252
2253 if (r == TEST_RIGHT) {
2254 left = 0;
2255 right -= 1;
f268980d
LP
2256
2257 if (last_index != (uint64_t) -1) {
2258 assert(last_index <= right);
2259
2260 /* If we cached the last index we
2261 * looked at, let's try to not to jump
2262 * too wildly around and see if we can
2263 * limit the range to look at early to
2264 * the immediate neighbors of the last
2265 * index we looked at. */
2266
2267 if (last_index > 0) {
2268 uint64_t x = last_index - 1;
2269
2270 p = le64toh(array->entry_array.items[x]);
2271 if (p <= 0)
2272 return -EBADMSG;
2273
2274 r = test_object(f, p, needle);
2275 if (r < 0)
2276 return r;
2277
2278 if (r == TEST_FOUND)
2279 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2280
2281 if (r == TEST_RIGHT)
2282 right = x;
2283 else
2284 left = x + 1;
2285 }
2286
2287 if (last_index < right) {
2288 uint64_t y = last_index + 1;
2289
2290 p = le64toh(array->entry_array.items[y]);
2291 if (p <= 0)
2292 return -EBADMSG;
2293
2294 r = test_object(f, p, needle);
2295 if (r < 0)
2296 return r;
2297
2298 if (r == TEST_FOUND)
2299 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2300
2301 if (r == TEST_RIGHT)
2302 right = y;
2303 else
2304 left = y + 1;
2305 }
f268980d
LP
2306 }
2307
de190aef
LP
2308 for (;;) {
2309 if (left == right) {
2310 if (direction == DIRECTION_UP)
2311 subtract_one = true;
2312
2313 i = left;
2314 goto found;
2315 }
2316
2317 assert(left < right);
de190aef 2318 i = (left + right) / 2;
f268980d 2319
de190aef
LP
2320 p = le64toh(array->entry_array.items[i]);
2321 if (p <= 0)
bee6a291
LP
2322 r = -EBADMSG;
2323 else
2324 r = test_object(f, p, needle);
2325 if (r == -EBADMSG) {
2326 log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (2)");
2327 right = n = i;
2328 continue;
2329 }
de190aef
LP
2330 if (r < 0)
2331 return r;
cec736d2 2332
de190aef
LP
2333 if (r == TEST_FOUND)
2334 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2335
2336 if (r == TEST_RIGHT)
2337 right = i;
2338 else
2339 left = i + 1;
2340 }
2341 }
2342
2173cbf8 2343 if (k >= n) {
cbdca852
LP
2344 if (direction == DIRECTION_UP) {
2345 i = n;
2346 subtract_one = true;
2347 goto found;
2348 }
2349
cec736d2 2350 return 0;
cbdca852 2351 }
cec736d2 2352
de190aef
LP
2353 last_p = lp;
2354
2355 n -= k;
2356 t += k;
f268980d 2357 last_index = (uint64_t) -1;
de190aef 2358 a = le64toh(array->entry_array.next_entry_array_offset);
cec736d2
LP
2359 }
2360
2361 return 0;
de190aef
LP
2362
2363found:
2364 if (subtract_one && t == 0 && i == 0)
2365 return 0;
2366
a4bcff5b 2367 /* Let's cache this item for the next invocation */
af13a6b0 2368 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
a4bcff5b 2369
de190aef
LP
2370 if (subtract_one && i == 0)
2371 p = last_p;
2372 else if (subtract_one)
2373 p = le64toh(array->entry_array.items[i-1]);
2374 else
2375 p = le64toh(array->entry_array.items[i]);
2376
2377 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2378 if (r < 0)
2379 return r;
2380
2381 if (ret)
2382 *ret = o;
2383
2384 if (offset)
2385 *offset = p;
2386
2387 if (idx)
cbdca852 2388 *idx = t + i + (subtract_one ? -1 : 0);
de190aef
LP
2389
2390 return 1;
cec736d2
LP
2391}
2392
f268980d
LP
2393static int generic_array_bisect_plus_one(
2394 JournalFile *f,
2395 uint64_t extra,
2396 uint64_t first,
2397 uint64_t n,
2398 uint64_t needle,
2399 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2400 direction_t direction,
2401 Object **ret,
2402 uint64_t *offset,
2403 uint64_t *idx) {
de190aef 2404
cec736d2 2405 int r;
cbdca852
LP
2406 bool step_back = false;
2407 Object *o;
cec736d2
LP
2408
2409 assert(f);
de190aef 2410 assert(test_object);
cec736d2 2411
de190aef
LP
2412 if (n <= 0)
2413 return 0;
cec736d2 2414
de190aef
LP
2415 /* This bisects the array in object 'first', but first checks
2416 * an extra */
de190aef
LP
2417 r = test_object(f, extra, needle);
2418 if (r < 0)
2419 return r;
a536e261
LP
2420
2421 if (r == TEST_FOUND)
2422 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2423
cbdca852
LP
2424 /* if we are looking with DIRECTION_UP then we need to first
2425 see if in the actual array there is a matching entry, and
2426 return the last one of that. But if there isn't any we need
2427 to return this one. Hence remember this, and return it
2428 below. */
2429 if (r == TEST_LEFT)
2430 step_back = direction == DIRECTION_UP;
de190aef 2431
cbdca852
LP
2432 if (r == TEST_RIGHT) {
2433 if (direction == DIRECTION_DOWN)
2434 goto found;
2435 else
2436 return 0;
a536e261 2437 }
cec736d2 2438
de190aef
LP
2439 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
2440
cbdca852
LP
2441 if (r == 0 && step_back)
2442 goto found;
2443
ecf68b1d 2444 if (r > 0 && idx)
313cefa1 2445 (*idx)++;
de190aef
LP
2446
2447 return r;
cbdca852
LP
2448
2449found:
2450 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
2451 if (r < 0)
2452 return r;
2453
2454 if (ret)
2455 *ret = o;
2456
2457 if (offset)
2458 *offset = extra;
2459
2460 if (idx)
2461 *idx = 0;
2462
2463 return 1;
2464}
2465
44a6b1b6 2466_pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
cbdca852
LP
2467 assert(f);
2468 assert(p > 0);
2469
2470 if (p == needle)
2471 return TEST_FOUND;
2472 else if (p < needle)
2473 return TEST_LEFT;
2474 else
2475 return TEST_RIGHT;
2476}
2477
de190aef
LP
2478static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
2479 Object *o;
2480 int r;
2481
2482 assert(f);
2483 assert(p > 0);
2484
2485 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
cec736d2
LP
2486 if (r < 0)
2487 return r;
2488
de190aef
LP
2489 if (le64toh(o->entry.seqnum) == needle)
2490 return TEST_FOUND;
2491 else if (le64toh(o->entry.seqnum) < needle)
2492 return TEST_LEFT;
2493 else
2494 return TEST_RIGHT;
2495}
cec736d2 2496
de190aef
LP
2497int journal_file_move_to_entry_by_seqnum(
2498 JournalFile *f,
2499 uint64_t seqnum,
2500 direction_t direction,
2501 Object **ret,
2502 uint64_t *offset) {
c88cc6af
VC
2503 assert(f);
2504 assert(f->header);
de190aef
LP
2505
2506 return generic_array_bisect(f,
2507 le64toh(f->header->entry_array_offset),
2508 le64toh(f->header->n_entries),
2509 seqnum,
2510 test_object_seqnum,
2511 direction,
2512 ret, offset, NULL);
2513}
cec736d2 2514
de190aef
LP
2515static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
2516 Object *o;
2517 int r;
2518
2519 assert(f);
2520 assert(p > 0);
2521
2522 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2523 if (r < 0)
2524 return r;
2525
2526 if (le64toh(o->entry.realtime) == needle)
2527 return TEST_FOUND;
2528 else if (le64toh(o->entry.realtime) < needle)
2529 return TEST_LEFT;
2530 else
2531 return TEST_RIGHT;
cec736d2
LP
2532}
2533
de190aef
LP
2534int journal_file_move_to_entry_by_realtime(
2535 JournalFile *f,
2536 uint64_t realtime,
2537 direction_t direction,
2538 Object **ret,
2539 uint64_t *offset) {
c88cc6af
VC
2540 assert(f);
2541 assert(f->header);
de190aef
LP
2542
2543 return generic_array_bisect(f,
2544 le64toh(f->header->entry_array_offset),
2545 le64toh(f->header->n_entries),
2546 realtime,
2547 test_object_realtime,
2548 direction,
2549 ret, offset, NULL);
2550}
2551
2552static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
2553 Object *o;
2554 int r;
2555
2556 assert(f);
2557 assert(p > 0);
2558
2559 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2560 if (r < 0)
2561 return r;
2562
2563 if (le64toh(o->entry.monotonic) == needle)
2564 return TEST_FOUND;
2565 else if (le64toh(o->entry.monotonic) < needle)
2566 return TEST_LEFT;
2567 else
2568 return TEST_RIGHT;
2569}
2570
2a560338 2571static int find_data_object_by_boot_id(
47838ab3
ZJS
2572 JournalFile *f,
2573 sd_id128_t boot_id,
2574 Object **o,
2575 uint64_t *b) {
2a560338 2576
fbd0b64f 2577 char t[STRLEN("_BOOT_ID=") + 32 + 1] = "_BOOT_ID=";
47838ab3
ZJS
2578
2579 sd_id128_to_string(boot_id, t + 9);
2580 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2581}
2582
de190aef
LP
2583int journal_file_move_to_entry_by_monotonic(
2584 JournalFile *f,
2585 sd_id128_t boot_id,
2586 uint64_t monotonic,
2587 direction_t direction,
2588 Object **ret,
2589 uint64_t *offset) {
2590
de190aef
LP
2591 Object *o;
2592 int r;
2593
cbdca852 2594 assert(f);
de190aef 2595
47838ab3 2596 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
de190aef
LP
2597 if (r < 0)
2598 return r;
cbdca852 2599 if (r == 0)
de190aef
LP
2600 return -ENOENT;
2601
2602 return generic_array_bisect_plus_one(f,
2603 le64toh(o->data.entry_offset),
2604 le64toh(o->data.entry_array_offset),
2605 le64toh(o->data.n_entries),
2606 monotonic,
2607 test_object_monotonic,
2608 direction,
2609 ret, offset, NULL);
2610}
2611
1fc605b0 2612void journal_file_reset_location(JournalFile *f) {
6573ef05 2613 f->location_type = LOCATION_HEAD;
1fc605b0 2614 f->current_offset = 0;
6573ef05
MS
2615 f->current_seqnum = 0;
2616 f->current_realtime = 0;
2617 f->current_monotonic = 0;
2618 zero(f->current_boot_id);
2619 f->current_xor_hash = 0;
2620}
2621
950c07d4 2622void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
6573ef05
MS
2623 f->location_type = LOCATION_SEEK;
2624 f->current_offset = offset;
2625 f->current_seqnum = le64toh(o->entry.seqnum);
2626 f->current_realtime = le64toh(o->entry.realtime);
2627 f->current_monotonic = le64toh(o->entry.monotonic);
2628 f->current_boot_id = o->entry.boot_id;
2629 f->current_xor_hash = le64toh(o->entry.xor_hash);
1fc605b0
MS
2630}
2631
d8ae66d7
MS
2632int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2633 assert(af);
c88cc6af 2634 assert(af->header);
d8ae66d7 2635 assert(bf);
c88cc6af 2636 assert(bf->header);
d8ae66d7
MS
2637 assert(af->location_type == LOCATION_SEEK);
2638 assert(bf->location_type == LOCATION_SEEK);
2639
2640 /* If contents and timestamps match, these entries are
2641 * identical, even if the seqnum does not match */
2642 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2643 af->current_monotonic == bf->current_monotonic &&
2644 af->current_realtime == bf->current_realtime &&
2645 af->current_xor_hash == bf->current_xor_hash)
2646 return 0;
2647
2648 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2649
2650 /* If this is from the same seqnum source, compare
2651 * seqnums */
2652 if (af->current_seqnum < bf->current_seqnum)
2653 return -1;
2654 if (af->current_seqnum > bf->current_seqnum)
2655 return 1;
2656
2657 /* Wow! This is weird, different data but the same
2658 * seqnums? Something is borked, but let's make the
2659 * best of it and compare by time. */
2660 }
2661
2662 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2663
2664 /* If the boot id matches, compare monotonic time */
2665 if (af->current_monotonic < bf->current_monotonic)
2666 return -1;
2667 if (af->current_monotonic > bf->current_monotonic)
2668 return 1;
2669 }
2670
2671 /* Otherwise, compare UTC time */
2672 if (af->current_realtime < bf->current_realtime)
2673 return -1;
2674 if (af->current_realtime > bf->current_realtime)
2675 return 1;
2676
2677 /* Finally, compare by contents */
2678 if (af->current_xor_hash < bf->current_xor_hash)
2679 return -1;
2680 if (af->current_xor_hash > bf->current_xor_hash)
2681 return 1;
2682
2683 return 0;
2684}
2685
aa598ba5
LP
2686static int bump_array_index(uint64_t *i, direction_t direction, uint64_t n) {
2687
2688 /* Increase or decrease the specified index, in the right direction. */
2689
2690 if (direction == DIRECTION_DOWN) {
2691 if (*i >= n - 1)
2692 return 0;
2693
2694 (*i) ++;
2695 } else {
2696 if (*i <= 0)
2697 return 0;
2698
2699 (*i) --;
2700 }
2701
2702 return 1;
2703}
2704
b6da4ed0
LP
2705static bool check_properly_ordered(uint64_t new_offset, uint64_t old_offset, direction_t direction) {
2706
2707 /* Consider it an error if any of the two offsets is uninitialized */
2708 if (old_offset == 0 || new_offset == 0)
2709 return false;
2710
2711 /* If we go down, the new offset must be larger than the old one. */
2712 return direction == DIRECTION_DOWN ?
2713 new_offset > old_offset :
2714 new_offset < old_offset;
2715}
2716
de190aef
LP
2717int journal_file_next_entry(
2718 JournalFile *f,
f534928a 2719 uint64_t p,
de190aef
LP
2720 direction_t direction,
2721 Object **ret, uint64_t *offset) {
2722
fb099c8d 2723 uint64_t i, n, ofs;
cec736d2
LP
2724 int r;
2725
2726 assert(f);
c88cc6af 2727 assert(f->header);
de190aef
LP
2728
2729 n = le64toh(f->header->n_entries);
2730 if (n <= 0)
2731 return 0;
cec736d2 2732
f534928a 2733 if (p == 0)
de190aef 2734 i = direction == DIRECTION_DOWN ? 0 : n - 1;
cec736d2 2735 else {
de190aef
LP
2736 r = generic_array_bisect(f,
2737 le64toh(f->header->entry_array_offset),
2738 le64toh(f->header->n_entries),
2739 p,
2740 test_object_offset,
2741 DIRECTION_DOWN,
2742 NULL, NULL,
2743 &i);
2744 if (r <= 0)
2745 return r;
2746
aa598ba5
LP
2747 r = bump_array_index(&i, direction, n);
2748 if (r <= 0)
2749 return r;
cec736d2
LP
2750 }
2751
de190aef 2752 /* And jump to it */
989793d3
LP
2753 for (;;) {
2754 r = generic_array_get(f,
2755 le64toh(f->header->entry_array_offset),
2756 i,
2757 ret, &ofs);
2758 if (r > 0)
2759 break;
2760 if (r != -EBADMSG)
2761 return r;
2762
2763 /* OK, so this entry is borked. Most likely some entry didn't get synced to disk properly, let's see if
2764 * the next one might work for us instead. */
2765 log_debug_errno(r, "Entry item %" PRIu64 " is bad, skipping over it.", i);
2766
2767 r = bump_array_index(&i, direction, n);
2768 if (r <= 0)
2769 return r;
caeab8f6 2770 }
fb099c8d 2771
b6da4ed0
LP
2772 /* Ensure our array is properly ordered. */
2773 if (p > 0 && !check_properly_ordered(ofs, p, direction)) {
2774 log_debug("%s: entry array not properly ordered at entry %" PRIu64, f->path, i);
fb099c8d
ZJS
2775 return -EBADMSG;
2776 }
2777
2778 if (offset)
2779 *offset = ofs;
2780
2781 return 1;
de190aef 2782}
cec736d2 2783
de190aef
LP
2784int journal_file_next_entry_for_data(
2785 JournalFile *f,
2786 Object *o, uint64_t p,
2787 uint64_t data_offset,
2788 direction_t direction,
2789 Object **ret, uint64_t *offset) {
2790
ded5034e 2791 uint64_t i, n, ofs;
de190aef 2792 Object *d;
989793d3 2793 int r;
cec736d2
LP
2794
2795 assert(f);
de190aef 2796 assert(p > 0 || !o);
cec736d2 2797
de190aef 2798 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
466ccd92 2799 if (r < 0)
de190aef 2800 return r;
cec736d2 2801
de190aef
LP
2802 n = le64toh(d->data.n_entries);
2803 if (n <= 0)
2804 return n;
cec736d2 2805
de190aef
LP
2806 if (!o)
2807 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2808 else {
2809 if (o->object.type != OBJECT_ENTRY)
2810 return -EINVAL;
cec736d2 2811
de190aef
LP
2812 r = generic_array_bisect_plus_one(f,
2813 le64toh(d->data.entry_offset),
2814 le64toh(d->data.entry_array_offset),
2815 le64toh(d->data.n_entries),
2816 p,
2817 test_object_offset,
2818 DIRECTION_DOWN,
2819 NULL, NULL,
2820 &i);
2821
2822 if (r <= 0)
cec736d2
LP
2823 return r;
2824
aa598ba5
LP
2825 r = bump_array_index(&i, direction, n);
2826 if (r <= 0)
2827 return r;
de190aef 2828 }
cec736d2 2829
989793d3
LP
2830 for (;;) {
2831 r = generic_array_get_plus_one(f,
2832 le64toh(d->data.entry_offset),
2833 le64toh(d->data.entry_array_offset),
2834 i,
2835 ret, &ofs);
2836 if (r > 0)
2837 break;
2838 if (r != -EBADMSG)
2839 return r;
2840
2841 log_debug_errno(r, "Data entry item %" PRIu64 " is bad, skipping over it.", i);
2842
2843 r = bump_array_index(&i, direction, n);
2844 if (r <= 0)
2845 return r;
2846 }
ded5034e
LP
2847
2848 /* Ensure our array is properly ordered. */
2849 if (p > 0 && check_properly_ordered(ofs, p, direction)) {
2850 log_debug("%s data entry array not properly ordered at entry %" PRIu64, f->path, i);
2851 return -EBADMSG;
2852 }
2853
2854 if (offset)
2855 *offset = ofs;
2856
2857 return 1;
de190aef 2858}
cec736d2 2859
cbdca852
LP
2860int journal_file_move_to_entry_by_offset_for_data(
2861 JournalFile *f,
2862 uint64_t data_offset,
2863 uint64_t p,
2864 direction_t direction,
2865 Object **ret, uint64_t *offset) {
2866
2867 int r;
2868 Object *d;
2869
2870 assert(f);
2871
2872 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2873 if (r < 0)
2874 return r;
2875
2876 return generic_array_bisect_plus_one(f,
2877 le64toh(d->data.entry_offset),
2878 le64toh(d->data.entry_array_offset),
2879 le64toh(d->data.n_entries),
2880 p,
2881 test_object_offset,
2882 direction,
2883 ret, offset, NULL);
2884}
2885
2886int journal_file_move_to_entry_by_monotonic_for_data(
2887 JournalFile *f,
2888 uint64_t data_offset,
2889 sd_id128_t boot_id,
2890 uint64_t monotonic,
2891 direction_t direction,
2892 Object **ret, uint64_t *offset) {
2893
cbdca852
LP
2894 Object *o, *d;
2895 int r;
2896 uint64_t b, z;
2897
2898 assert(f);
2899
2900 /* First, seek by time */
47838ab3 2901 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
cbdca852
LP
2902 if (r < 0)
2903 return r;
2904 if (r == 0)
2905 return -ENOENT;
2906
2907 r = generic_array_bisect_plus_one(f,
2908 le64toh(o->data.entry_offset),
2909 le64toh(o->data.entry_array_offset),
2910 le64toh(o->data.n_entries),
2911 monotonic,
2912 test_object_monotonic,
2913 direction,
2914 NULL, &z, NULL);
2915 if (r <= 0)
2916 return r;
2917
2918 /* And now, continue seeking until we find an entry that
2919 * exists in both bisection arrays */
2920
2921 for (;;) {
2922 Object *qo;
2923 uint64_t p, q;
2924
2925 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2926 if (r < 0)
2927 return r;
2928
2929 r = generic_array_bisect_plus_one(f,
2930 le64toh(d->data.entry_offset),
2931 le64toh(d->data.entry_array_offset),
2932 le64toh(d->data.n_entries),
2933 z,
2934 test_object_offset,
2935 direction,
2936 NULL, &p, NULL);
2937 if (r <= 0)
2938 return r;
2939
2940 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2941 if (r < 0)
2942 return r;
2943
2944 r = generic_array_bisect_plus_one(f,
2945 le64toh(o->data.entry_offset),
2946 le64toh(o->data.entry_array_offset),
2947 le64toh(o->data.n_entries),
2948 p,
2949 test_object_offset,
2950 direction,
2951 &qo, &q, NULL);
2952
2953 if (r <= 0)
2954 return r;
2955
2956 if (p == q) {
2957 if (ret)
2958 *ret = qo;
2959 if (offset)
2960 *offset = q;
2961
2962 return 1;
2963 }
2964
2965 z = q;
2966 }
cbdca852
LP
2967}
2968
de190aef
LP
2969int journal_file_move_to_entry_by_seqnum_for_data(
2970 JournalFile *f,
2971 uint64_t data_offset,
2972 uint64_t seqnum,
2973 direction_t direction,
2974 Object **ret, uint64_t *offset) {
cec736d2 2975
de190aef
LP
2976 Object *d;
2977 int r;
cec736d2 2978
91a31dde
LP
2979 assert(f);
2980
de190aef 2981 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2982 if (r < 0)
de190aef 2983 return r;
cec736d2 2984
de190aef
LP
2985 return generic_array_bisect_plus_one(f,
2986 le64toh(d->data.entry_offset),
2987 le64toh(d->data.entry_array_offset),
2988 le64toh(d->data.n_entries),
2989 seqnum,
2990 test_object_seqnum,
2991 direction,
2992 ret, offset, NULL);
2993}
cec736d2 2994
de190aef
LP
2995int journal_file_move_to_entry_by_realtime_for_data(
2996 JournalFile *f,
2997 uint64_t data_offset,
2998 uint64_t realtime,
2999 direction_t direction,
3000 Object **ret, uint64_t *offset) {
3001
3002 Object *d;
3003 int r;
3004
91a31dde
LP
3005 assert(f);
3006
de190aef 3007 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 3008 if (r < 0)
de190aef
LP
3009 return r;
3010
3011 return generic_array_bisect_plus_one(f,
3012 le64toh(d->data.entry_offset),
3013 le64toh(d->data.entry_array_offset),
3014 le64toh(d->data.n_entries),
3015 realtime,
3016 test_object_realtime,
3017 direction,
3018 ret, offset, NULL);
cec736d2
LP
3019}
3020
0284adc6 3021void journal_file_dump(JournalFile *f) {
7560fffc 3022 Object *o;
7560fffc 3023 int r;
0284adc6 3024 uint64_t p;
7560fffc
LP
3025
3026 assert(f);
c88cc6af 3027 assert(f->header);
7560fffc 3028
0284adc6 3029 journal_file_print_header(f);
7560fffc 3030
0284adc6
LP
3031 p = le64toh(f->header->header_size);
3032 while (p != 0) {
d05089d8 3033 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
0284adc6
LP
3034 if (r < 0)
3035 goto fail;
7560fffc 3036
0284adc6 3037 switch (o->object.type) {
d98cc1f2 3038
0284adc6
LP
3039 case OBJECT_UNUSED:
3040 printf("Type: OBJECT_UNUSED\n");
3041 break;
d98cc1f2 3042
0284adc6
LP
3043 case OBJECT_DATA:
3044 printf("Type: OBJECT_DATA\n");
3045 break;
7560fffc 3046
3c1668da
LP
3047 case OBJECT_FIELD:
3048 printf("Type: OBJECT_FIELD\n");
3049 break;
3050
0284adc6 3051 case OBJECT_ENTRY:
507f22bd
ZJS
3052 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
3053 le64toh(o->entry.seqnum),
3054 le64toh(o->entry.monotonic),
3055 le64toh(o->entry.realtime));
0284adc6 3056 break;
7560fffc 3057
0284adc6
LP
3058 case OBJECT_FIELD_HASH_TABLE:
3059 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
3060 break;
7560fffc 3061
0284adc6
LP
3062 case OBJECT_DATA_HASH_TABLE:
3063 printf("Type: OBJECT_DATA_HASH_TABLE\n");
3064 break;
7560fffc 3065
0284adc6
LP
3066 case OBJECT_ENTRY_ARRAY:
3067 printf("Type: OBJECT_ENTRY_ARRAY\n");
3068 break;
7560fffc 3069
0284adc6 3070 case OBJECT_TAG:
507f22bd
ZJS
3071 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
3072 le64toh(o->tag.seqnum),
3073 le64toh(o->tag.epoch));
0284adc6 3074 break;
3c1668da
LP
3075
3076 default:
8facc349 3077 printf("Type: unknown (%i)\n", o->object.type);
3c1668da 3078 break;
0284adc6 3079 }
7560fffc 3080
d89c8fdf
ZJS
3081 if (o->object.flags & OBJECT_COMPRESSION_MASK)
3082 printf("Flags: %s\n",
3083 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
7560fffc 3084
0284adc6
LP
3085 if (p == le64toh(f->header->tail_object_offset))
3086 p = 0;
3087 else
3088 p = p + ALIGN64(le64toh(o->object.size));
3089 }
7560fffc 3090
0284adc6
LP
3091 return;
3092fail:
3093 log_error("File corrupt");
7560fffc
LP
3094}
3095
718fe4b1
ZJS
3096static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
3097 const char *x;
3098
3099 x = format_timestamp(buf, l, t);
3100 if (x)
3101 return x;
3102 return " --- ";
3103}
3104
0284adc6 3105void journal_file_print_header(JournalFile *f) {
2765b7bb 3106 char a[33], b[33], c[33], d[33];
ed375beb 3107 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
a1a03e30
LP
3108 struct stat st;
3109 char bytes[FORMAT_BYTES_MAX];
7560fffc
LP
3110
3111 assert(f);
c88cc6af 3112 assert(f->header);
7560fffc 3113
0284adc6
LP
3114 printf("File Path: %s\n"
3115 "File ID: %s\n"
3116 "Machine ID: %s\n"
3117 "Boot ID: %s\n"
3118 "Sequential Number ID: %s\n"
3119 "State: %s\n"
3120 "Compatible Flags:%s%s\n"
d89c8fdf 3121 "Incompatible Flags:%s%s%s\n"
507f22bd
ZJS
3122 "Header size: %"PRIu64"\n"
3123 "Arena size: %"PRIu64"\n"
3124 "Data Hash Table Size: %"PRIu64"\n"
3125 "Field Hash Table Size: %"PRIu64"\n"
0284adc6 3126 "Rotate Suggested: %s\n"
0808b92f
LP
3127 "Head Sequential Number: %"PRIu64" (%"PRIx64")\n"
3128 "Tail Sequential Number: %"PRIu64" (%"PRIx64")\n"
3129 "Head Realtime Timestamp: %s (%"PRIx64")\n"
3130 "Tail Realtime Timestamp: %s (%"PRIx64")\n"
3131 "Tail Monotonic Timestamp: %s (%"PRIx64")\n"
507f22bd
ZJS
3132 "Objects: %"PRIu64"\n"
3133 "Entry Objects: %"PRIu64"\n",
0284adc6
LP
3134 f->path,
3135 sd_id128_to_string(f->header->file_id, a),
3136 sd_id128_to_string(f->header->machine_id, b),
3137 sd_id128_to_string(f->header->boot_id, c),
2765b7bb 3138 sd_id128_to_string(f->header->seqnum_id, d),
3223f44f
LP
3139 f->header->state == STATE_OFFLINE ? "OFFLINE" :
3140 f->header->state == STATE_ONLINE ? "ONLINE" :
3141 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
8088cbd3 3142 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
d89c8fdf
ZJS
3143 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
3144 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
3145 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
3146 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
507f22bd
ZJS
3147 le64toh(f->header->header_size),
3148 le64toh(f->header->arena_size),
3149 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3150 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
fb0951b0 3151 yes_no(journal_file_rotate_suggested(f, 0)),
0808b92f
LP
3152 le64toh(f->header->head_entry_seqnum), le64toh(f->header->head_entry_seqnum),
3153 le64toh(f->header->tail_entry_seqnum), le64toh(f->header->tail_entry_seqnum),
3154 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)), le64toh(f->header->head_entry_realtime),
3155 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)), le64toh(f->header->tail_entry_realtime),
3156 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC), le64toh(f->header->tail_entry_monotonic),
507f22bd
ZJS
3157 le64toh(f->header->n_objects),
3158 le64toh(f->header->n_entries));
7560fffc 3159
0284adc6 3160 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
507f22bd 3161 printf("Data Objects: %"PRIu64"\n"
0284adc6 3162 "Data Hash Table Fill: %.1f%%\n",
507f22bd 3163 le64toh(f->header->n_data),
0284adc6 3164 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
7560fffc 3165
0284adc6 3166 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
507f22bd 3167 printf("Field Objects: %"PRIu64"\n"
0284adc6 3168 "Field Hash Table Fill: %.1f%%\n",
507f22bd 3169 le64toh(f->header->n_fields),
0284adc6 3170 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3223f44f
LP
3171
3172 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
507f22bd
ZJS
3173 printf("Tag Objects: %"PRIu64"\n",
3174 le64toh(f->header->n_tags));
3223f44f 3175 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
507f22bd
ZJS
3176 printf("Entry Array Objects: %"PRIu64"\n",
3177 le64toh(f->header->n_entry_arrays));
a1a03e30
LP
3178
3179 if (fstat(f->fd, &st) >= 0)
59f448cf 3180 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
7560fffc
LP
3181}
3182
fc68c929
LP
3183static int journal_file_warn_btrfs(JournalFile *f) {
3184 unsigned attrs;
3185 int r;
3186
3187 assert(f);
3188
3189 /* Before we write anything, check if the COW logic is turned
3190 * off on btrfs. Given our write pattern that is quite
3191 * unfriendly to COW file systems this should greatly improve
3192 * performance on COW file systems, such as btrfs, at the
3193 * expense of data integrity features (which shouldn't be too
3194 * bad, given that we do our own checksumming). */
3195
3196 r = btrfs_is_filesystem(f->fd);
3197 if (r < 0)
3198 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
3199 if (!r)
3200 return 0;
3201
3202 r = read_attr_fd(f->fd, &attrs);
3203 if (r < 0)
3204 return log_warning_errno(r, "Failed to read file attributes: %m");
3205
3206 if (attrs & FS_NOCOW_FL) {
3207 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
3208 return 0;
3209 }
3210
3211 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
3212 "This is likely to slow down journal access substantially, please consider turning "
3213 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
3214
3215 return 1;
3216}
3217
0284adc6 3218int journal_file_open(
5d1ce257 3219 int fd,
0284adc6
LP
3220 const char *fname,
3221 int flags,
3222 mode_t mode,
3223 bool compress,
baed47c3 3224 bool seal,
0284adc6
LP
3225 JournalMetrics *metrics,
3226 MMapCache *mmap_cache,
b58c888f 3227 Set *deferred_closes,
0284adc6
LP
3228 JournalFile *template,
3229 JournalFile **ret) {
7560fffc 3230
fa6ac760 3231 bool newly_created = false;
0284adc6 3232 JournalFile *f;
fa6ac760 3233 void *h;
0284adc6 3234 int r;
7560fffc 3235
0559d3a5 3236 assert(ret);
5d1ce257 3237 assert(fd >= 0 || fname);
7560fffc 3238
ec2ce0c5 3239 if (!IN_SET((flags & O_ACCMODE), O_RDONLY, O_RDWR))
0284adc6 3240 return -EINVAL;
7560fffc 3241
5d1ce257
LP
3242 if (fname) {
3243 if (!endswith(fname, ".journal") &&
3244 !endswith(fname, ".journal~"))
3245 return -EINVAL;
3246 }
7560fffc 3247
0284adc6
LP
3248 f = new0(JournalFile, 1);
3249 if (!f)
3250 return -ENOMEM;
7560fffc 3251
5d1ce257 3252 f->fd = fd;
0284adc6 3253 f->mode = mode;
7560fffc 3254
0284adc6
LP
3255 f->flags = flags;
3256 f->prot = prot_from_flags(flags);
3257 f->writable = (flags & O_ACCMODE) != O_RDONLY;
349cc4a5 3258#if HAVE_LZ4
d89c8fdf 3259 f->compress_lz4 = compress;
349cc4a5 3260#elif HAVE_XZ
d89c8fdf 3261 f->compress_xz = compress;
48b61739 3262#endif
349cc4a5 3263#if HAVE_GCRYPT
baed47c3 3264 f->seal = seal;
49a32d43 3265#endif
7560fffc 3266
0284adc6
LP
3267 if (mmap_cache)
3268 f->mmap = mmap_cache_ref(mmap_cache);
3269 else {
84168d80 3270 f->mmap = mmap_cache_new();
0284adc6
LP
3271 if (!f->mmap) {
3272 r = -ENOMEM;
3273 goto fail;
3274 }
3275 }
7560fffc 3276
7645c77b 3277 if (fname) {
5d1ce257 3278 f->path = strdup(fname);
7645c77b
ZJS
3279 if (!f->path) {
3280 r = -ENOMEM;
3281 goto fail;
3282 }
3283 } else {
3284 /* If we don't know the path, fill in something explanatory and vaguely useful */
3285 if (asprintf(&f->path, "/proc/self/%i", fd) < 0) {
3286 r = -ENOMEM;
3287 goto fail;
3288 }
0284adc6 3289 }
7560fffc 3290
4743015d 3291 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
a4bcff5b
LP
3292 if (!f->chain_cache) {
3293 r = -ENOMEM;
3294 goto fail;
3295 }
3296
0284adc6 3297 if (f->fd < 0) {
5d1ce257
LP
3298 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
3299 if (f->fd < 0) {
3300 r = -errno;
3301 goto fail;
3302 }
3303
3304 /* fds we opened here by us should also be closed by us. */
3305 f->close_fd = true;
7560fffc 3306 }
7560fffc 3307
be7cdd8e
VC
3308 f->cache_fd = mmap_cache_add_fd(f->mmap, f->fd);
3309 if (!f->cache_fd) {
3310 r = -ENOMEM;
3311 goto fail;
3312 }
3313
2678031a
LP
3314 r = journal_file_fstat(f);
3315 if (r < 0)
0284adc6 3316 goto fail;
7560fffc 3317
0284adc6 3318 if (f->last_stat.st_size == 0 && f->writable) {
11689d2a 3319
fc68c929 3320 (void) journal_file_warn_btrfs(f);
11689d2a 3321
fb0951b0
LP
3322 /* Let's attach the creation time to the journal file,
3323 * so that the vacuuming code knows the age of this
3324 * file even if the file might end up corrupted one
3325 * day... Ideally we'd just use the creation time many
3326 * file systems maintain for each file, but there is
3327 * currently no usable API to query this, hence let's
3328 * emulate this via extended attributes. If extended
3329 * attributes are not supported we'll just skip this,
7517e174 3330 * and rely solely on mtime/atime/ctime of the file. */
fb0951b0 3331
d61b600d 3332 fd_setcrtime(f->fd, 0);
7560fffc 3333
349cc4a5 3334#if HAVE_GCRYPT
0284adc6 3335 /* Try to load the FSPRG state, and if we can't, then
baed47c3 3336 * just don't do sealing */
49a32d43
LP
3337 if (f->seal) {
3338 r = journal_file_fss_load(f);
3339 if (r < 0)
3340 f->seal = false;
3341 }
feb12d3e 3342#endif
7560fffc 3343
0284adc6
LP
3344 r = journal_file_init_header(f, template);
3345 if (r < 0)
3346 goto fail;
7560fffc 3347
2678031a
LP
3348 r = journal_file_fstat(f);
3349 if (r < 0)
0284adc6 3350 goto fail;
fb0951b0
LP
3351
3352 newly_created = true;
0284adc6 3353 }
7560fffc 3354
0284adc6 3355 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
cfb571f3 3356 r = -ENODATA;
0284adc6
LP
3357 goto fail;
3358 }
7560fffc 3359
b42549ad 3360 r = mmap_cache_get(f->mmap, f->cache_fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h, NULL);
977eaa1e 3361 if (r < 0)
0284adc6 3362 goto fail;
7560fffc 3363
fa6ac760
LP
3364 f->header = h;
3365
0284adc6 3366 if (!newly_created) {
f9168190 3367 set_clear_with_destructor(deferred_closes, journal_file_close);
b58c888f 3368
0284adc6
LP
3369 r = journal_file_verify_header(f);
3370 if (r < 0)
3371 goto fail;
3372 }
7560fffc 3373
349cc4a5 3374#if HAVE_GCRYPT
0284adc6 3375 if (!newly_created && f->writable) {
baed47c3 3376 r = journal_file_fss_load(f);
0284adc6
LP
3377 if (r < 0)
3378 goto fail;
3379 }
feb12d3e 3380#endif
cec736d2
LP
3381
3382 if (f->writable) {
4a92baf3
LP
3383 if (metrics) {
3384 journal_default_metrics(metrics, f->fd);
3385 f->metrics = *metrics;
3386 } else if (template)
3387 f->metrics = template->metrics;
3388
cec736d2
LP
3389 r = journal_file_refresh_header(f);
3390 if (r < 0)
3391 goto fail;
3392 }
3393
349cc4a5 3394#if HAVE_GCRYPT
baed47c3 3395 r = journal_file_hmac_setup(f);
14d10188
LP
3396 if (r < 0)
3397 goto fail;
feb12d3e 3398#endif
14d10188 3399
cec736d2 3400 if (newly_created) {
de190aef 3401 r = journal_file_setup_field_hash_table(f);
cec736d2
LP
3402 if (r < 0)
3403 goto fail;
3404
de190aef 3405 r = journal_file_setup_data_hash_table(f);
cec736d2
LP
3406 if (r < 0)
3407 goto fail;
7560fffc 3408
349cc4a5 3409#if HAVE_GCRYPT
7560fffc
LP
3410 r = journal_file_append_first_tag(f);
3411 if (r < 0)
3412 goto fail;
feb12d3e 3413#endif
cec736d2
LP
3414 }
3415
be7cdd8e 3416 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd)) {
fa6ac760
LP
3417 r = -EIO;
3418 goto fail;
3419 }
3420
7a24f3bf 3421 if (template && template->post_change_timer) {
e167d7fd
LP
3422 r = journal_file_enable_post_change_timer(
3423 f,
3424 sd_event_source_get_event(template->post_change_timer),
3425 template->post_change_timer_period);
7a24f3bf 3426
7a24f3bf
VC
3427 if (r < 0)
3428 goto fail;
3429 }
3430
f8e2f4d6 3431 /* The file is opened now successfully, thus we take possession of any passed in fd. */
5d1ce257
LP
3432 f->close_fd = true;
3433
0559d3a5 3434 *ret = f;
cec736d2
LP
3435 return 0;
3436
3437fail:
be7cdd8e 3438 if (f->cache_fd && mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
3439 r = -EIO;
3440
69a3a6fd 3441 (void) journal_file_close(f);
cec736d2
LP
3442
3443 return r;
3444}
0ac38b70 3445
b58c888f 3446int journal_file_rotate(JournalFile **f, bool compress, bool seal, Set *deferred_closes) {
57535f47 3447 _cleanup_free_ char *p = NULL;
0ac38b70
LP
3448 size_t l;
3449 JournalFile *old_file, *new_file = NULL;
3450 int r;
3451
3452 assert(f);
3453 assert(*f);
3454
3455 old_file = *f;
3456
3457 if (!old_file->writable)
3458 return -EINVAL;
3459
5d1ce257 3460 /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse
13e785f7 3461 * rotation, since we don't know the actual path, and couldn't rename the file hence. */
5d1ce257
LP
3462 if (path_startswith(old_file->path, "/proc/self/fd"))
3463 return -EINVAL;
3464
0ac38b70
LP
3465 if (!endswith(old_file->path, ".journal"))
3466 return -EINVAL;
3467
3468 l = strlen(old_file->path);
57535f47
ZJS
3469 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
3470 (int) l - 8, old_file->path,
3471 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
3472 le64toh((*f)->header->head_entry_seqnum),
3473 le64toh((*f)->header->head_entry_realtime));
3474 if (r < 0)
0ac38b70
LP
3475 return -ENOMEM;
3476
2678031a
LP
3477 /* Try to rename the file to the archived version. If the file
3478 * already was deleted, we'll get ENOENT, let's ignore that
3479 * case. */
0ac38b70 3480 r = rename(old_file->path, p);
2678031a 3481 if (r < 0 && errno != ENOENT)
0ac38b70
LP
3482 return -errno;
3483
1fcefd88
LP
3484 /* Sync the rename to disk */
3485 (void) fsync_directory_of_file(old_file->fd);
3486
8eb85171
VC
3487 /* Set as archive so offlining commits w/state=STATE_ARCHIVED.
3488 * Previously we would set old_file->header->state to STATE_ARCHIVED directly here,
3489 * but journal_file_set_offline() short-circuits when state != STATE_ONLINE, which
3490 * would result in the rotated journal never getting fsync() called before closing.
3491 * Now we simply queue the archive state by setting an archive bit, leaving the state
3492 * as STATE_ONLINE so proper offlining occurs. */
3493 old_file->archive = true;
0ac38b70 3494
f27a3864
LP
3495 /* Currently, btrfs is not very good with out write patterns
3496 * and fragments heavily. Let's defrag our journal files when
3497 * we archive them */
3498 old_file->defrag_on_close = true;
3499
5d1ce257 3500 r = journal_file_open(-1, old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, deferred_closes, old_file, &new_file);
b58c888f
VC
3501
3502 if (deferred_closes &&
3503 set_put(deferred_closes, old_file) >= 0)
3504 (void) journal_file_set_offline(old_file, false);
3505 else
3506 (void) journal_file_close(old_file);
0ac38b70
LP
3507
3508 *f = new_file;
3509 return r;
3510}
3511
9447a7f1
LP
3512int journal_file_open_reliably(
3513 const char *fname,
3514 int flags,
3515 mode_t mode,
7560fffc 3516 bool compress,
baed47c3 3517 bool seal,
4a92baf3 3518 JournalMetrics *metrics,
27370278 3519 MMapCache *mmap_cache,
b58c888f 3520 Set *deferred_closes,
9447a7f1
LP
3521 JournalFile *template,
3522 JournalFile **ret) {
3523
3524 int r;
3525 size_t l;
ed375beb 3526 _cleanup_free_ char *p = NULL;
9447a7f1 3527
5d1ce257 3528 r = journal_file_open(-1, fname, flags, mode, compress, seal, metrics, mmap_cache, deferred_closes, template, ret);
288359db 3529 if (!IN_SET(r,
b288cdeb
ZJS
3530 -EBADMSG, /* Corrupted */
3531 -ENODATA, /* Truncated */
3532 -EHOSTDOWN, /* Other machine */
3533 -EPROTONOSUPPORT, /* Incompatible feature */
3534 -EBUSY, /* Unclean shutdown */
3535 -ESHUTDOWN, /* Already archived */
288359db 3536 -EIO, /* IO error, including SIGBUS on mmap */
ae739cc1
LP
3537 -EIDRM, /* File has been deleted */
3538 -ETXTBSY)) /* File is from the future */
9447a7f1
LP
3539 return r;
3540
3541 if ((flags & O_ACCMODE) == O_RDONLY)
3542 return r;
3543
3544 if (!(flags & O_CREAT))
3545 return r;
3546
7560fffc
LP
3547 if (!endswith(fname, ".journal"))
3548 return r;
3549
5c70eab4
LP
3550 /* The file is corrupted. Rotate it away and try it again (but only once) */
3551
9447a7f1 3552 l = strlen(fname);
d587eca5 3553 if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
57535f47 3554 (int) l - 8, fname,
d587eca5 3555 now(CLOCK_REALTIME),
9bf3b535 3556 random_u64()) < 0)
9447a7f1
LP
3557 return -ENOMEM;
3558
65089b82 3559 if (rename(fname, p) < 0)
9447a7f1
LP
3560 return -errno;
3561
f27a3864
LP
3562 /* btrfs doesn't cope well with our write pattern and
3563 * fragments heavily. Let's defrag all files we rotate */
11689d2a 3564
a67d68b8 3565 (void) chattr_path(p, 0, FS_NOCOW_FL);
f27a3864
LP
3566 (void) btrfs_defrag(p);
3567
65089b82 3568 log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
9447a7f1 3569
5d1ce257 3570 return journal_file_open(-1, fname, flags, mode, compress, seal, metrics, mmap_cache, deferred_closes, template, ret);
9447a7f1
LP
3571}
3572
cf244689
LP
3573int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
3574 uint64_t i, n;
3575 uint64_t q, xor_hash = 0;
3576 int r;
3577 EntryItem *items;
3578 dual_timestamp ts;
3579
3580 assert(from);
3581 assert(to);
3582 assert(o);
3583 assert(p);
3584
3585 if (!to->writable)
3586 return -EPERM;
3587
3588 ts.monotonic = le64toh(o->entry.monotonic);
3589 ts.realtime = le64toh(o->entry.realtime);
3590
cf244689 3591 n = journal_file_entry_n_items(o);
4faa7004
TA
3592 /* alloca() can't take 0, hence let's allocate at least one */
3593 items = alloca(sizeof(EntryItem) * MAX(1u, n));
cf244689
LP
3594
3595 for (i = 0; i < n; i++) {
4fd052ae
FC
3596 uint64_t l, h;
3597 le64_t le_hash;
cf244689
LP
3598 size_t t;
3599 void *data;
3600 Object *u;
3601
3602 q = le64toh(o->entry.items[i].object_offset);
3603 le_hash = o->entry.items[i].hash;
3604
3605 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
3606 if (r < 0)
3607 return r;
3608
3609 if (le_hash != o->data.hash)
3610 return -EBADMSG;
3611
3612 l = le64toh(o->object.size) - offsetof(Object, data.payload);
3613 t = (size_t) l;
3614
3615 /* We hit the limit on 32bit machines */
3616 if ((uint64_t) t != l)
3617 return -E2BIG;
3618
d89c8fdf 3619 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
349cc4a5 3620#if HAVE_XZ || HAVE_LZ4
a7f7d1bd 3621 size_t rsize = 0;
cf244689 3622
d89c8fdf
ZJS
3623 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
3624 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
3625 if (r < 0)
3626 return r;
cf244689
LP
3627
3628 data = from->compress_buffer;
3629 l = rsize;
3b1a55e1
ZJS
3630#else
3631 return -EPROTONOSUPPORT;
3632#endif
cf244689
LP
3633 } else
3634 data = o->data.payload;
3635
3636 r = journal_file_append_data(to, data, l, &u, &h);
3637 if (r < 0)
3638 return r;
3639
3640 xor_hash ^= le64toh(u->data.hash);
3641 items[i].object_offset = htole64(h);
3642 items[i].hash = u->data.hash;
3643
3644 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
3645 if (r < 0)
3646 return r;
3647 }
3648
fa6ac760
LP
3649 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
3650
be7cdd8e 3651 if (mmap_cache_got_sigbus(to->mmap, to->cache_fd))
fa6ac760
LP
3652 return -EIO;
3653
3654 return r;
cf244689 3655}
babfc091 3656
8580d1f7
LP
3657void journal_reset_metrics(JournalMetrics *m) {
3658 assert(m);
3659
3660 /* Set everything to "pick automatic values". */
3661
3662 *m = (JournalMetrics) {
3663 .min_use = (uint64_t) -1,
3664 .max_use = (uint64_t) -1,
3665 .min_size = (uint64_t) -1,
3666 .max_size = (uint64_t) -1,
3667 .keep_free = (uint64_t) -1,
3668 .n_max_files = (uint64_t) -1,
3669 };
3670}
3671
babfc091 3672void journal_default_metrics(JournalMetrics *m, int fd) {
8580d1f7 3673 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
babfc091 3674 struct statvfs ss;
8580d1f7 3675 uint64_t fs_size;
babfc091
LP
3676
3677 assert(m);
3678 assert(fd >= 0);
3679
3680 if (fstatvfs(fd, &ss) >= 0)
3681 fs_size = ss.f_frsize * ss.f_blocks;
8580d1f7
LP
3682 else {
3683 log_debug_errno(errno, "Failed to detremine disk size: %m");
3684 fs_size = 0;
3685 }
babfc091
LP
3686
3687 if (m->max_use == (uint64_t) -1) {
3688
3689 if (fs_size > 0) {
3690 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3691
3692 if (m->max_use > DEFAULT_MAX_USE_UPPER)
3693 m->max_use = DEFAULT_MAX_USE_UPPER;
3694
3695 if (m->max_use < DEFAULT_MAX_USE_LOWER)
3696 m->max_use = DEFAULT_MAX_USE_LOWER;
3697 } else
3698 m->max_use = DEFAULT_MAX_USE_LOWER;
3699 } else {
3700 m->max_use = PAGE_ALIGN(m->max_use);
3701
8580d1f7 3702 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
babfc091
LP
3703 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3704 }
3705
8580d1f7
LP
3706 if (m->min_use == (uint64_t) -1)
3707 m->min_use = DEFAULT_MIN_USE;
3708
3709 if (m->min_use > m->max_use)
3710 m->min_use = m->max_use;
3711
babfc091
LP
3712 if (m->max_size == (uint64_t) -1) {
3713 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3714
3715 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3716 m->max_size = DEFAULT_MAX_SIZE_UPPER;
3717 } else
3718 m->max_size = PAGE_ALIGN(m->max_size);
3719
8580d1f7
LP
3720 if (m->max_size != 0) {
3721 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3722 m->max_size = JOURNAL_FILE_SIZE_MIN;
babfc091 3723
8580d1f7
LP
3724 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3725 m->max_use = m->max_size*2;
3726 }
babfc091
LP
3727
3728 if (m->min_size == (uint64_t) -1)
3729 m->min_size = JOURNAL_FILE_SIZE_MIN;
3730 else {
3731 m->min_size = PAGE_ALIGN(m->min_size);
3732
3733 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3734 m->min_size = JOURNAL_FILE_SIZE_MIN;
3735
8580d1f7 3736 if (m->max_size != 0 && m->min_size > m->max_size)
babfc091
LP
3737 m->max_size = m->min_size;
3738 }
3739
3740 if (m->keep_free == (uint64_t) -1) {
3741
3742 if (fs_size > 0) {
8621b110 3743 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
babfc091
LP
3744
3745 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3746 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3747
3748 } else
3749 m->keep_free = DEFAULT_KEEP_FREE;
3750 }
3751
8580d1f7
LP
3752 if (m->n_max_files == (uint64_t) -1)
3753 m->n_max_files = DEFAULT_N_MAX_FILES;
3754
3755 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3756 format_bytes(a, sizeof(a), m->min_use),
3757 format_bytes(b, sizeof(b), m->max_use),
3758 format_bytes(c, sizeof(c), m->max_size),
3759 format_bytes(d, sizeof(d), m->min_size),
3760 format_bytes(e, sizeof(e), m->keep_free),
3761 m->n_max_files);
babfc091 3762}
08984293
LP
3763
3764int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
08984293 3765 assert(f);
c88cc6af 3766 assert(f->header);
08984293
LP
3767 assert(from || to);
3768
3769 if (from) {
162566a4
LP
3770 if (f->header->head_entry_realtime == 0)
3771 return -ENOENT;
08984293 3772
162566a4 3773 *from = le64toh(f->header->head_entry_realtime);
08984293
LP
3774 }
3775
3776 if (to) {
162566a4
LP
3777 if (f->header->tail_entry_realtime == 0)
3778 return -ENOENT;
08984293 3779
162566a4 3780 *to = le64toh(f->header->tail_entry_realtime);
08984293
LP
3781 }
3782
3783 return 1;
3784}
3785
3786int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
08984293
LP
3787 Object *o;
3788 uint64_t p;
3789 int r;
3790
3791 assert(f);
3792 assert(from || to);
3793
47838ab3 3794 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
08984293
LP
3795 if (r <= 0)
3796 return r;
3797
3798 if (le64toh(o->data.n_entries) <= 0)
3799 return 0;
3800
3801 if (from) {
3802 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3803 if (r < 0)
3804 return r;
3805
3806 *from = le64toh(o->entry.monotonic);
3807 }
3808
3809 if (to) {
3810 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3811 if (r < 0)
3812 return r;
3813
3814 r = generic_array_get_plus_one(f,
3815 le64toh(o->data.entry_offset),
3816 le64toh(o->data.entry_array_offset),
3817 le64toh(o->data.n_entries)-1,
3818 &o, NULL);
3819 if (r <= 0)
3820 return r;
3821
3822 *to = le64toh(o->entry.monotonic);
3823 }
3824
3825 return 1;
3826}
dca6219e 3827
fb0951b0 3828bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
dca6219e 3829 assert(f);
c88cc6af 3830 assert(f->header);
dca6219e
LP
3831
3832 /* If we gained new header fields we gained new features,
3833 * hence suggest a rotation */
361f9cbc
LP
3834 if (le64toh(f->header->header_size) < sizeof(Header)) {
3835 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
dca6219e 3836 return true;
361f9cbc 3837 }
dca6219e
LP
3838
3839 /* Let's check if the hash tables grew over a certain fill
3840 * level (75%, borrowing this value from Java's hash table
3841 * implementation), and if so suggest a rotation. To calculate
3842 * the fill level we need the n_data field, which only exists
3843 * in newer versions. */
3844
3845 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
361f9cbc 3846 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3847 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
361f9cbc
LP
3848 f->path,
3849 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3850 le64toh(f->header->n_data),
3851 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3852 (unsigned long long) f->last_stat.st_size,
3853 f->last_stat.st_size / le64toh(f->header->n_data));
dca6219e 3854 return true;
361f9cbc 3855 }
dca6219e
LP
3856
3857 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
361f9cbc 3858 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3859 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
361f9cbc
LP
3860 f->path,
3861 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3862 le64toh(f->header->n_fields),
3863 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
dca6219e 3864 return true;
361f9cbc 3865 }
dca6219e 3866
0598fd4a
LP
3867 /* Are the data objects properly indexed by field objects? */
3868 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3869 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3870 le64toh(f->header->n_data) > 0 &&
3871 le64toh(f->header->n_fields) == 0)
3872 return true;
3873
fb0951b0
LP
3874 if (max_file_usec > 0) {
3875 usec_t t, h;
3876
3877 h = le64toh(f->header->head_entry_realtime);
3878 t = now(CLOCK_REALTIME);
3879
3880 if (h > 0 && t > h + max_file_usec)
3881 return true;
3882 }
3883
dca6219e
LP
3884 return false;
3885}