]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journal-file.c
hwdb: Add ACCEL_MOUNT_MATRIX for the Acer Aspire Switch 10 convertible
[thirdparty/systemd.git] / src / journal / journal-file.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
cec736d2
LP
2/***
3 This file is part of systemd.
4
5 Copyright 2011 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
cec736d2
LP
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 15 Lesser General Public License for more details.
cec736d2 16
5430f7f2 17 You should have received a copy of the GNU Lesser General Public License
cec736d2
LP
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19***/
20
cec736d2 21#include <errno.h>
cec736d2 22#include <fcntl.h>
11689d2a 23#include <linux/fs.h>
ac2e41f5 24#include <pthread.h>
07630cea
LP
25#include <stddef.h>
26#include <sys/mman.h>
27#include <sys/statvfs.h>
28#include <sys/uio.h>
29#include <unistd.h>
fb0951b0 30
b5efdb8a 31#include "alloc-util.h"
f27a3864 32#include "btrfs-util.h"
c8b3094d 33#include "chattr-util.h"
07630cea 34#include "compress.h"
3ffd4af2 35#include "fd-util.h"
0284adc6 36#include "journal-authenticate.h"
cec736d2
LP
37#include "journal-def.h"
38#include "journal-file.h"
39#include "lookup3.h"
6bedfcbb 40#include "parse-util.h"
5d1ce257 41#include "path-util.h"
3df3e884 42#include "random-util.h"
7a24f3bf 43#include "sd-event.h"
b58c888f 44#include "set.h"
07630cea 45#include "string-util.h"
4761fd0f 46#include "strv.h"
89a5a90c 47#include "xattr-util.h"
cec736d2 48
4a92baf3
LP
49#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
50#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
cec736d2 51
be19b7df 52#define COMPRESSION_SIZE_THRESHOLD (512ULL)
807e17f0 53
babfc091 54/* This is the minimum journal file size */
16098e93 55#define JOURNAL_FILE_SIZE_MIN (512ULL*1024ULL) /* 512 KiB */
babfc091
LP
56
57/* These are the lower and upper bounds if we deduce the max_use value
58 * from the file system size */
59#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
60#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
61
8580d1f7
LP
62/* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
63#define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */
64
babfc091 65/* This is the upper bound if we deduce max_size from max_use */
71100051 66#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
babfc091
LP
67
68/* This is the upper bound if we deduce the keep_free value from the
69 * file system size */
70#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
71
72/* This is the keep_free value when we can't determine the system
73 * size */
74#define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
75
8580d1f7
LP
76/* This is the default maximum number of journal files to keep around. */
77#define DEFAULT_N_MAX_FILES (100)
78
dca6219e
LP
79/* n_data was the first entry we added after the initial file format design */
80#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
cec736d2 81
a4bcff5b
LP
82/* How many entries to keep in the entry array chain cache at max */
83#define CHAIN_CACHE_MAX 20
84
a676e665
LP
85/* How much to increase the journal file size at once each time we allocate something new. */
86#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
87
2678031a
LP
88/* Reread fstat() of the file for detecting deletions at least this often */
89#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
90
fa6ac760
LP
/* The mmap context to use for the header: we pick the value one above the last defined object type */
92#define CONTEXT_HEADER _OBJECT_TYPE_MAX
93
51804460
ZJS
94#ifdef __clang__
95# pragma GCC diagnostic ignored "-Waddress-of-packed-member"
96#endif
97
ac2e41f5
VC
98/* This may be called from a separate thread to prevent blocking the caller for the duration of fsync().
99 * As a result we use atomic operations on f->offline_state for inter-thread communications with
100 * journal_file_set_offline() and journal_file_set_online(). */
101static void journal_file_set_offline_internal(JournalFile *f) {
26687bf8 102 assert(f);
ac2e41f5
VC
103 assert(f->fd >= 0);
104 assert(f->header);
105
106 for (;;) {
107 switch (f->offline_state) {
108 case OFFLINE_CANCEL:
109 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_DONE))
110 continue;
111 return;
112
113 case OFFLINE_AGAIN_FROM_SYNCING:
114 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_SYNCING))
115 continue;
116 break;
117
118 case OFFLINE_AGAIN_FROM_OFFLINING:
119 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_SYNCING))
120 continue;
121 break;
122
123 case OFFLINE_SYNCING:
124 (void) fsync(f->fd);
26687bf8 125
ac2e41f5
VC
126 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_OFFLINING))
127 continue;
26687bf8 128
8eb85171 129 f->header->state = f->archive ? STATE_ARCHIVED : STATE_OFFLINE;
ac2e41f5
VC
130 (void) fsync(f->fd);
131 break;
132
133 case OFFLINE_OFFLINING:
134 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_DONE))
135 continue;
136 /* fall through */
137
138 case OFFLINE_DONE:
139 return;
140
141 case OFFLINE_JOINED:
142 log_debug("OFFLINE_JOINED unexpected offline state for journal_file_set_offline_internal()");
143 return;
144 }
145 }
146}
147
148static void * journal_file_set_offline_thread(void *arg) {
149 JournalFile *f = arg;
150
151 journal_file_set_offline_internal(f);
152
153 return NULL;
154}
155
156static int journal_file_set_offline_thread_join(JournalFile *f) {
157 int r;
158
159 assert(f);
160
161 if (f->offline_state == OFFLINE_JOINED)
162 return 0;
163
164 r = pthread_join(f->offline_thread, NULL);
165 if (r)
166 return -r;
167
168 f->offline_state = OFFLINE_JOINED;
26687bf8 169
be7cdd8e 170 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
171 return -EIO;
172
ac2e41f5
VC
173 return 0;
174}
26687bf8 175
ac2e41f5
VC
176/* Trigger a restart if the offline thread is mid-flight in a restartable state. */
177static bool journal_file_set_offline_try_restart(JournalFile *f) {
178 for (;;) {
179 switch (f->offline_state) {
180 case OFFLINE_AGAIN_FROM_SYNCING:
181 case OFFLINE_AGAIN_FROM_OFFLINING:
182 return true;
183
184 case OFFLINE_CANCEL:
185 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_AGAIN_FROM_SYNCING))
186 continue;
187 return true;
188
189 case OFFLINE_SYNCING:
190 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_AGAIN_FROM_SYNCING))
191 continue;
192 return true;
193
194 case OFFLINE_OFFLINING:
195 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_AGAIN_FROM_OFFLINING))
196 continue;
197 return true;
26687bf8
OS
198
199 default:
ac2e41f5
VC
200 return false;
201 }
26687bf8
OS
202 }
203}
204
ac2e41f5
VC
205/* Sets a journal offline.
206 *
207 * If wait is false then an offline is dispatched in a separate thread for a
208 * subsequent journal_file_set_offline() or journal_file_set_online() of the
209 * same journal to synchronize with.
210 *
211 * If wait is true, then either an existing offline thread will be restarted
212 * and joined, or if none exists the offline is simply performed in this
213 * context without involving another thread.
214 */
215int journal_file_set_offline(JournalFile *f, bool wait) {
216 bool restarted;
217 int r;
218
26687bf8
OS
219 assert(f);
220
221 if (!f->writable)
222 return -EPERM;
223
224 if (!(f->fd >= 0 && f->header))
225 return -EINVAL;
226
b8f99e27
VC
227 /* An offlining journal is implicitly online and may modify f->header->state,
228 * we must also join any potentially lingering offline thread when not online. */
229 if (!journal_file_is_offlining(f) && f->header->state != STATE_ONLINE)
230 return journal_file_set_offline_thread_join(f);
26687bf8 231
ac2e41f5
VC
232 /* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */
233 restarted = journal_file_set_offline_try_restart(f);
234 if ((restarted && wait) || !restarted) {
235 r = journal_file_set_offline_thread_join(f);
236 if (r < 0)
237 return r;
238 }
26687bf8 239
ac2e41f5
VC
240 if (restarted)
241 return 0;
242
243 /* Initiate a new offline. */
244 f->offline_state = OFFLINE_SYNCING;
fa6ac760 245
ac2e41f5
VC
246 if (wait) /* Without using a thread if waiting. */
247 journal_file_set_offline_internal(f);
248 else {
249 r = pthread_create(&f->offline_thread, NULL, journal_file_set_offline_thread, f);
ec9ffa2c
VC
250 if (r > 0) {
251 f->offline_state = OFFLINE_JOINED;
ac2e41f5 252 return -r;
ec9ffa2c 253 }
ac2e41f5
VC
254 }
255
256 return 0;
257}
258
259static int journal_file_set_online(JournalFile *f) {
260 bool joined = false;
261
262 assert(f);
263
264 if (!f->writable)
265 return -EPERM;
266
267 if (!(f->fd >= 0 && f->header))
268 return -EINVAL;
269
270 while (!joined) {
271 switch (f->offline_state) {
272 case OFFLINE_JOINED:
273 /* No offline thread, no need to wait. */
274 joined = true;
275 break;
276
277 case OFFLINE_SYNCING:
278 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_CANCEL))
279 continue;
280 /* Canceled syncing prior to offlining, no need to wait. */
281 break;
282
283 case OFFLINE_AGAIN_FROM_SYNCING:
284 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_CANCEL))
285 continue;
286 /* Canceled restart from syncing, no need to wait. */
287 break;
288
289 case OFFLINE_AGAIN_FROM_OFFLINING:
290 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_CANCEL))
291 continue;
292 /* Canceled restart from offlining, must wait for offlining to complete however. */
293
ec251fe7 294 /* fall through */
ac2e41f5
VC
295 default: {
296 int r;
297
298 r = journal_file_set_offline_thread_join(f);
299 if (r < 0)
300 return r;
301
302 joined = true;
303 break;
304 }
305 }
306 }
26687bf8 307
be7cdd8e 308 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
309 return -EIO;
310
ac2e41f5
VC
311 switch (f->header->state) {
312 case STATE_ONLINE:
313 return 0;
26687bf8 314
ac2e41f5
VC
315 case STATE_OFFLINE:
316 f->header->state = STATE_ONLINE;
317 (void) fsync(f->fd);
318 return 0;
319
320 default:
321 return -EINVAL;
322 }
26687bf8
OS
323}
324
b58c888f
VC
325bool journal_file_is_offlining(JournalFile *f) {
326 assert(f);
327
328 __sync_synchronize();
329
3742095b 330 if (IN_SET(f->offline_state, OFFLINE_DONE, OFFLINE_JOINED))
b58c888f
VC
331 return false;
332
333 return true;
334}
335
804ae586 336JournalFile* journal_file_close(JournalFile *f) {
de190aef 337 assert(f);
cec736d2 338
349cc4a5 339#if HAVE_GCRYPT
b0af6f41 340 /* Write the final tag */
43cd8794
FB
341 if (f->seal && f->writable) {
342 int r;
343
344 r = journal_file_append_tag(f);
345 if (r < 0)
346 log_error_errno(r, "Failed to append tag when closing journal: %m");
347 }
feb12d3e 348#endif
b0af6f41 349
7a24f3bf
VC
350 if (f->post_change_timer) {
351 int enabled;
352
353 if (sd_event_source_get_enabled(f->post_change_timer, &enabled) >= 0)
354 if (enabled == SD_EVENT_ONESHOT)
355 journal_file_post_change(f);
356
e167d7fd 357 (void) sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_OFF);
7a24f3bf
VC
358 sd_event_source_unref(f->post_change_timer);
359 }
360
ac2e41f5 361 journal_file_set_offline(f, true);
cec736d2 362
be7cdd8e
VC
363 if (f->mmap && f->cache_fd)
364 mmap_cache_free_fd(f->mmap, f->cache_fd);
cec736d2 365
11689d2a
LP
366 if (f->fd >= 0 && f->defrag_on_close) {
367
368 /* Be friendly to btrfs: turn COW back on again now,
369 * and defragment the file. We won't write to the file
370 * ever again, hence remove all fragmentation, and
371 * reenable all the good bits COW usually provides
372 * (such as data checksumming). */
373
1ed8f8c1 374 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
11689d2a
LP
375 (void) btrfs_defrag_fd(f->fd);
376 }
f27a3864 377
5d1ce257
LP
378 if (f->close_fd)
379 safe_close(f->fd);
cec736d2 380 free(f->path);
807e17f0 381
f649045c 382 mmap_cache_unref(f->mmap);
16e9f408 383
4743015d 384 ordered_hashmap_free_free(f->chain_cache);
a4bcff5b 385
349cc4a5 386#if HAVE_XZ || HAVE_LZ4
807e17f0
LP
387 free(f->compress_buffer);
388#endif
389
349cc4a5 390#if HAVE_GCRYPT
baed47c3
LP
391 if (f->fss_file)
392 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
dc4ebc07 393 else
b7c9ae91
LP
394 free(f->fsprg_state);
395
396 free(f->fsprg_seed);
7560fffc
LP
397
398 if (f->hmac)
399 gcry_md_close(f->hmac);
400#endif
401
6b430fdb 402 return mfree(f);
cec736d2
LP
403}
404
b58c888f
VC
405void journal_file_close_set(Set *s) {
406 JournalFile *f;
407
408 assert(s);
409
410 while ((f = set_steal_first(s)))
411 (void) journal_file_close(f);
412}
413
0ac38b70 414static int journal_file_init_header(JournalFile *f, JournalFile *template) {
d89c8fdf 415 Header h = {};
cec736d2
LP
416 ssize_t k;
417 int r;
418
419 assert(f);
420
7560fffc 421 memcpy(h.signature, HEADER_SIGNATURE, 8);
23b0b2b2 422 h.header_size = htole64(ALIGN64(sizeof(h)));
cec736d2 423
d89c8fdf
ZJS
424 h.incompatible_flags |= htole32(
425 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
426 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
7560fffc 427
d89c8fdf
ZJS
428 h.compatible_flags = htole32(
429 f->seal * HEADER_COMPATIBLE_SEALED);
7560fffc 430
cec736d2
LP
431 r = sd_id128_randomize(&h.file_id);
432 if (r < 0)
433 return r;
434
0ac38b70
LP
435 if (template) {
436 h.seqnum_id = template->header->seqnum_id;
beec0085 437 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
0ac38b70
LP
438 } else
439 h.seqnum_id = h.file_id;
cec736d2
LP
440
441 k = pwrite(f->fd, &h, sizeof(h), 0);
442 if (k < 0)
443 return -errno;
444
445 if (k != sizeof(h))
446 return -EIO;
447
448 return 0;
449}
450
a0fe2a2d
LP
451static int fsync_directory_of_file(int fd) {
452 _cleanup_free_ char *path = NULL, *dn = NULL;
453 _cleanup_close_ int dfd = -1;
454 struct stat st;
455 int r;
456
457 if (fstat(fd, &st) < 0)
458 return -errno;
459
460 if (!S_ISREG(st.st_mode))
461 return -EBADFD;
462
463 r = fd_get_path(fd, &path);
464 if (r < 0)
465 return r;
466
467 if (!path_is_absolute(path))
468 return -EINVAL;
469
470 dn = dirname_malloc(path);
471 if (!dn)
472 return -ENOMEM;
473
474 dfd = open(dn, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
475 if (dfd < 0)
476 return -errno;
477
478 if (fsync(dfd) < 0)
479 return -errno;
480
481 return 0;
482}
483
cec736d2 484static int journal_file_refresh_header(JournalFile *f) {
de190aef 485 sd_id128_t boot_id;
fa6ac760 486 int r;
cec736d2
LP
487
488 assert(f);
c88cc6af 489 assert(f->header);
cec736d2
LP
490
491 r = sd_id128_get_machine(&f->header->machine_id);
492 if (r < 0)
493 return r;
494
de190aef 495 r = sd_id128_get_boot(&boot_id);
cec736d2
LP
496 if (r < 0)
497 return r;
498
de190aef
LP
499 if (sd_id128_equal(boot_id, f->header->boot_id))
500 f->tail_entry_monotonic_valid = true;
501
502 f->header->boot_id = boot_id;
503
fa6ac760 504 r = journal_file_set_online(f);
b788cc23 505
7560fffc 506 /* Sync the online state to disk */
fb426037 507 (void) fsync(f->fd);
b788cc23 508
a0fe2a2d
LP
509 /* We likely just created a new file, also sync the directory this file is located in. */
510 (void) fsync_directory_of_file(f->fd);
511
fa6ac760 512 return r;
cec736d2
LP
513}
514
4214009f
ZJS
515static bool warn_wrong_flags(const JournalFile *f, bool compatible) {
516 const uint32_t any = compatible ? HEADER_COMPATIBLE_ANY : HEADER_INCOMPATIBLE_ANY,
517 supported = compatible ? HEADER_COMPATIBLE_SUPPORTED : HEADER_INCOMPATIBLE_SUPPORTED;
518 const char *type = compatible ? "compatible" : "incompatible";
d89c8fdf
ZJS
519 uint32_t flags;
520
4214009f
ZJS
521 flags = le32toh(compatible ? f->header->compatible_flags : f->header->incompatible_flags);
522
523 if (flags & ~supported) {
524 if (flags & ~any)
4761fd0f 525 log_debug("Journal file %s has unknown %s flags 0x%"PRIx32,
4214009f
ZJS
526 f->path, type, flags & ~any);
527 flags = (flags & any) & ~supported;
4761fd0f
ZJS
528 if (flags) {
529 const char* strv[3];
530 unsigned n = 0;
531 _cleanup_free_ char *t = NULL;
532
533 if (compatible && (flags & HEADER_COMPATIBLE_SEALED))
534 strv[n++] = "sealed";
535 if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_XZ))
536 strv[n++] = "xz-compressed";
537 if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_LZ4))
538 strv[n++] = "lz4-compressed";
539 strv[n] = NULL;
540 assert(n < ELEMENTSOF(strv));
541
542 t = strv_join((char**) strv, ", ");
543 log_debug("Journal file %s uses %s %s %s disabled at compilation time.",
544 f->path, type, n > 1 ? "flags" : "flag", strnull(t));
545 }
4214009f
ZJS
546 return true;
547 }
548
549 return false;
550}
551
552static int journal_file_verify_header(JournalFile *f) {
6f94e420
TS
553 uint64_t arena_size, header_size;
554
cec736d2 555 assert(f);
c88cc6af 556 assert(f->header);
cec736d2 557
7560fffc 558 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
cec736d2
LP
559 return -EBADMSG;
560
4214009f
ZJS
561 /* In both read and write mode we refuse to open files with incompatible
562 * flags we don't know. */
563 if (warn_wrong_flags(f, false))
cec736d2
LP
564 return -EPROTONOSUPPORT;
565
4214009f
ZJS
566 /* When open for writing we refuse to open files with compatible flags, too. */
567 if (f->writable && warn_wrong_flags(f, true))
d89c8fdf 568 return -EPROTONOSUPPORT;
7560fffc 569
db11ac1a
LP
570 if (f->header->state >= _STATE_MAX)
571 return -EBADMSG;
572
6f94e420
TS
573 header_size = le64toh(f->header->header_size);
574
dca6219e 575 /* The first addition was n_data, so check that we are at least this large */
6f94e420 576 if (header_size < HEADER_SIZE_MIN)
23b0b2b2
LP
577 return -EBADMSG;
578
8088cbd3 579 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
beec0085
LP
580 return -EBADMSG;
581
6f94e420
TS
582 arena_size = le64toh(f->header->arena_size);
583
584 if (UINT64_MAX - header_size < arena_size || header_size + arena_size > (uint64_t) f->last_stat.st_size)
db11ac1a
LP
585 return -ENODATA;
586
6f94e420 587 if (le64toh(f->header->tail_object_offset) > header_size + arena_size)
db11ac1a
LP
588 return -ENODATA;
589
7762e02b
LP
590 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
591 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
592 !VALID64(le64toh(f->header->tail_object_offset)) ||
593 !VALID64(le64toh(f->header->entry_array_offset)))
594 return -ENODATA;
595
cec736d2 596 if (f->writable) {
cec736d2 597 sd_id128_t machine_id;
ae739cc1 598 uint8_t state;
cec736d2
LP
599 int r;
600
601 r = sd_id128_get_machine(&machine_id);
602 if (r < 0)
603 return r;
604
605 if (!sd_id128_equal(machine_id, f->header->machine_id))
606 return -EHOSTDOWN;
607
de190aef 608 state = f->header->state;
cec736d2 609
b288cdeb
ZJS
610 if (state == STATE_ARCHIVED)
611 return -ESHUTDOWN; /* Already archived */
612 else if (state == STATE_ONLINE) {
71fa6f00
LP
613 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
614 return -EBUSY;
b288cdeb 615 } else if (state != STATE_OFFLINE) {
8facc349 616 log_debug("Journal file %s has unknown state %i.", f->path, state);
71fa6f00
LP
617 return -EBUSY;
618 }
ae739cc1 619
5b3cc0c8
YN
620 if (f->header->field_hash_table_size == 0 || f->header->data_hash_table_size == 0)
621 return -EBADMSG;
622
ae739cc1
LP
623 /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't
624 * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks
625 * bisection. */
626 if (le64toh(f->header->tail_entry_realtime) > now(CLOCK_REALTIME)) {
627 log_debug("Journal file %s is from the future, refusing to append new data to it that'd be older.", f->path);
628 return -ETXTBSY;
629 }
cec736d2
LP
630 }
631
d89c8fdf
ZJS
632 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
633 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
c586dbf1 634
f1889c91 635 f->seal = JOURNAL_HEADER_SEALED(f->header);
7560fffc 636
cec736d2
LP
637 return 0;
638}
639
2678031a
LP
640static int journal_file_fstat(JournalFile *f) {
641 assert(f);
642 assert(f->fd >= 0);
643
644 if (fstat(f->fd, &f->last_stat) < 0)
645 return -errno;
646
647 f->last_stat_usec = now(CLOCK_MONOTONIC);
648
649 /* Refuse appending to files that are already deleted */
650 if (f->last_stat.st_nlink <= 0)
651 return -EIDRM;
652
653 return 0;
654}
655
cec736d2 656static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
eda4b58b 657 uint64_t old_size, new_size;
fec2aa2f 658 int r;
cec736d2
LP
659
660 assert(f);
c88cc6af 661 assert(f->header);
cec736d2 662
cec736d2 663 /* We assume that this file is not sparse, and we know that
38ac38b2 664 * for sure, since we always call posix_fallocate()
cec736d2
LP
665 * ourselves */
666
be7cdd8e 667 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
668 return -EIO;
669
cec736d2 670 old_size =
23b0b2b2 671 le64toh(f->header->header_size) +
cec736d2
LP
672 le64toh(f->header->arena_size);
673
bc85bfee 674 new_size = PAGE_ALIGN(offset + size);
23b0b2b2
LP
675 if (new_size < le64toh(f->header->header_size))
676 new_size = le64toh(f->header->header_size);
bc85bfee 677
2678031a
LP
678 if (new_size <= old_size) {
679
680 /* We already pre-allocated enough space, but before
681 * we write to it, let's check with fstat() if the
682 * file got deleted, in order make sure we don't throw
683 * away the data immediately. Don't check fstat() for
684 * all writes though, but only once ever 10s. */
685
686 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
687 return 0;
688
689 return journal_file_fstat(f);
690 }
691
692 /* Allocate more space. */
cec736d2 693
a676e665 694 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
bc85bfee 695 return -E2BIG;
cec736d2 696
a676e665 697 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
cec736d2
LP
698 struct statvfs svfs;
699
700 if (fstatvfs(f->fd, &svfs) >= 0) {
701 uint64_t available;
702
070052ab 703 available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
cec736d2
LP
704
705 if (new_size - old_size > available)
706 return -E2BIG;
707 }
708 }
709
eda4b58b
LP
710 /* Increase by larger blocks at once */
711 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
712 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
713 new_size = f->metrics.max_size;
714
bc85bfee
LP
715 /* Note that the glibc fallocate() fallback is very
716 inefficient, hence we try to minimize the allocation area
717 as we can. */
fec2aa2f
GV
718 r = posix_fallocate(f->fd, old_size, new_size - old_size);
719 if (r != 0)
720 return -r;
cec736d2 721
23b0b2b2 722 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
cec736d2 723
2678031a 724 return journal_file_fstat(f);
cec736d2
LP
725}
726
78519831 727static unsigned type_to_context(ObjectType type) {
d3d3208f 728 /* One context for each type, plus one catch-all for the rest */
69adae51 729 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
fa6ac760 730 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
d05089d8 731 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
d3d3208f
MS
732}
733
b439282e 734static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret, size_t *ret_size) {
2678031a
LP
735 int r;
736
cec736d2 737 assert(f);
cec736d2
LP
738 assert(ret);
739
7762e02b
LP
740 if (size <= 0)
741 return -EINVAL;
742
2a59ea54 743 /* Avoid SIGBUS on invalid accesses */
4bbdcdb3
LP
744 if (offset + size > (uint64_t) f->last_stat.st_size) {
745 /* Hmm, out of range? Let's refresh the fstat() data
746 * first, before we trust that check. */
747
2678031a
LP
748 r = journal_file_fstat(f);
749 if (r < 0)
750 return r;
751
752 if (offset + size > (uint64_t) f->last_stat.st_size)
4bbdcdb3
LP
753 return -EADDRNOTAVAIL;
754 }
755
b439282e 756 return mmap_cache_get(f->mmap, f->cache_fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret, ret_size);
cec736d2
LP
757}
758
16e9f408
LP
759static uint64_t minimum_header_size(Object *o) {
760
b8e891e6 761 static const uint64_t table[] = {
16e9f408
LP
762 [OBJECT_DATA] = sizeof(DataObject),
763 [OBJECT_FIELD] = sizeof(FieldObject),
764 [OBJECT_ENTRY] = sizeof(EntryObject),
765 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
766 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
767 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
768 [OBJECT_TAG] = sizeof(TagObject),
769 };
770
771 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
772 return sizeof(ObjectHeader);
773
774 return table[o->object.type];
775}
776
24754f36
TR
777/* Lightweight object checks. We want this to be fast, so that we won't
778 * slowdown every journal_file_move_to_object() call too much. */
779static int journal_file_check_object(JournalFile *f, uint64_t offset, Object *o) {
780 assert(f);
781 assert(o);
782
783 switch (o->object.type) {
784
785 case OBJECT_DATA: {
786 if ((le64toh(o->data.entry_offset) == 0) ^ (le64toh(o->data.n_entries) == 0)) {
787 log_debug("Bad n_entries: %"PRIu64": %"PRIu64,
10e8445b 788 le64toh(o->data.n_entries), offset);
24754f36
TR
789 return -EBADMSG;
790 }
791
792 if (le64toh(o->object.size) - offsetof(DataObject, payload) <= 0) {
793 log_debug("Bad object size (<= %zu): %"PRIu64": %"PRIu64,
794 offsetof(DataObject, payload),
795 le64toh(o->object.size),
796 offset);
797 return -EBADMSG;
798 }
799
10e8445b
TR
800 if (!VALID64(le64toh(o->data.next_hash_offset)) ||
801 !VALID64(le64toh(o->data.next_field_offset)) ||
802 !VALID64(le64toh(o->data.entry_offset)) ||
803 !VALID64(le64toh(o->data.entry_array_offset))) {
24754f36
TR
804 log_debug("Invalid offset, next_hash_offset="OFSfmt", next_field_offset="OFSfmt
805 ", entry_offset="OFSfmt", entry_array_offset="OFSfmt": %"PRIu64,
10e8445b
TR
806 le64toh(o->data.next_hash_offset),
807 le64toh(o->data.next_field_offset),
808 le64toh(o->data.entry_offset),
809 le64toh(o->data.entry_array_offset),
24754f36
TR
810 offset);
811 return -EBADMSG;
812 }
813
814 break;
815 }
816
817 case OBJECT_FIELD:
818 if (le64toh(o->object.size) - offsetof(FieldObject, payload) <= 0) {
819 log_debug(
820 "Bad field size (<= %zu): %"PRIu64": %"PRIu64,
821 offsetof(FieldObject, payload),
822 le64toh(o->object.size),
823 offset);
824 return -EBADMSG;
825 }
826
10e8445b
TR
827 if (!VALID64(le64toh(o->field.next_hash_offset)) ||
828 !VALID64(le64toh(o->field.head_data_offset))) {
24754f36
TR
829 log_debug(
830 "Invalid offset, next_hash_offset="OFSfmt
831 ", head_data_offset="OFSfmt": %"PRIu64,
10e8445b
TR
832 le64toh(o->field.next_hash_offset),
833 le64toh(o->field.head_data_offset),
24754f36
TR
834 offset);
835 return -EBADMSG;
836 }
837 break;
838
839 case OBJECT_ENTRY:
840 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) % sizeof(EntryItem) != 0) {
841 log_debug(
842 "Bad entry size (<= %zu): %"PRIu64": %"PRIu64,
843 offsetof(EntryObject, items),
844 le64toh(o->object.size),
845 offset);
846 return -EBADMSG;
847 }
848
849 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem) <= 0) {
850 log_debug(
851 "Invalid number items in entry: %"PRIu64": %"PRIu64,
852 (le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem),
853 offset);
854 return -EBADMSG;
855 }
856
857 if (le64toh(o->entry.seqnum) <= 0) {
858 log_debug(
859 "Invalid entry seqnum: %"PRIx64": %"PRIu64,
860 le64toh(o->entry.seqnum),
861 offset);
862 return -EBADMSG;
863 }
864
865 if (!VALID_REALTIME(le64toh(o->entry.realtime))) {
866 log_debug(
867 "Invalid entry realtime timestamp: %"PRIu64": %"PRIu64,
868 le64toh(o->entry.realtime),
869 offset);
870 return -EBADMSG;
871 }
872
873 if (!VALID_MONOTONIC(le64toh(o->entry.monotonic))) {
874 log_debug(
875 "Invalid entry monotonic timestamp: %"PRIu64": %"PRIu64,
876 le64toh(o->entry.monotonic),
877 offset);
878 return -EBADMSG;
879 }
880
881 break;
882
883 case OBJECT_DATA_HASH_TABLE:
884 case OBJECT_FIELD_HASH_TABLE:
885 if ((le64toh(o->object.size) - offsetof(HashTableObject, items)) % sizeof(HashItem) != 0 ||
886 (le64toh(o->object.size) - offsetof(HashTableObject, items)) / sizeof(HashItem) <= 0) {
887 log_debug(
888 "Invalid %s hash table size: %"PRIu64": %"PRIu64,
889 o->object.type == OBJECT_DATA_HASH_TABLE ? "data" : "field",
890 le64toh(o->object.size),
891 offset);
892 return -EBADMSG;
893 }
894
895 break;
896
897 case OBJECT_ENTRY_ARRAY:
898 if ((le64toh(o->object.size) - offsetof(EntryArrayObject, items)) % sizeof(le64_t) != 0 ||
899 (le64toh(o->object.size) - offsetof(EntryArrayObject, items)) / sizeof(le64_t) <= 0) {
900 log_debug(
901 "Invalid object entry array size: %"PRIu64": %"PRIu64,
902 le64toh(o->object.size),
903 offset);
904 return -EBADMSG;
905 }
906
10e8445b 907 if (!VALID64(le64toh(o->entry_array.next_entry_array_offset))) {
24754f36
TR
908 log_debug(
909 "Invalid object entry array next_entry_array_offset: "OFSfmt": %"PRIu64,
10e8445b 910 le64toh(o->entry_array.next_entry_array_offset),
24754f36
TR
911 offset);
912 return -EBADMSG;
913 }
914
915 break;
916
917 case OBJECT_TAG:
918 if (le64toh(o->object.size) != sizeof(TagObject)) {
919 log_debug(
920 "Invalid object tag size: %"PRIu64": %"PRIu64,
921 le64toh(o->object.size),
922 offset);
923 return -EBADMSG;
924 }
925
10e8445b 926 if (!VALID_EPOCH(le64toh(o->tag.epoch))) {
24754f36
TR
927 log_debug(
928 "Invalid object tag epoch: %"PRIu64": %"PRIu64,
10e8445b 929 le64toh(o->tag.epoch),
24754f36
TR
930 offset);
931 return -EBADMSG;
932 }
933
934 break;
935 }
936
937 return 0;
938}
939
78519831 940int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
cec736d2
LP
941 int r;
942 void *t;
b439282e 943 size_t tsize;
cec736d2
LP
944 Object *o;
945 uint64_t s;
946
947 assert(f);
948 assert(ret);
949
db11ac1a 950 /* Objects may only be located at multiple of 64 bit */
202fd896
LP
951 if (!VALID64(offset)) {
952 log_debug("Attempt to move to object at non-64bit boundary: %" PRIu64, offset);
bd30fdf2 953 return -EBADMSG;
202fd896 954 }
db11ac1a 955
50809d7a 956 /* Object may not be located in the file header */
202fd896
LP
957 if (offset < le64toh(f->header->header_size)) {
958 log_debug("Attempt to move to object located in file header: %" PRIu64, offset);
50809d7a 959 return -EBADMSG;
202fd896 960 }
50809d7a 961
b439282e 962 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t, &tsize);
cec736d2
LP
963 if (r < 0)
964 return r;
965
966 o = (Object*) t;
967 s = le64toh(o->object.size);
968
1c69f096
LP
969 if (s == 0) {
970 log_debug("Attempt to move to uninitialized object: %" PRIu64, offset);
971 return -EBADMSG;
972 }
202fd896
LP
973 if (s < sizeof(ObjectHeader)) {
974 log_debug("Attempt to move to overly short object: %" PRIu64, offset);
cec736d2 975 return -EBADMSG;
202fd896 976 }
cec736d2 977
202fd896
LP
978 if (o->object.type <= OBJECT_UNUSED) {
979 log_debug("Attempt to move to object with invalid type: %" PRIu64, offset);
16e9f408 980 return -EBADMSG;
202fd896 981 }
16e9f408 982
202fd896
LP
983 if (s < minimum_header_size(o)) {
984 log_debug("Attempt to move to truncated object: %" PRIu64, offset);
16e9f408 985 return -EBADMSG;
202fd896 986 }
16e9f408 987
202fd896
LP
988 if (type > OBJECT_UNUSED && o->object.type != type) {
989 log_debug("Attempt to move to object of unexpected type: %" PRIu64, offset);
cec736d2 990 return -EBADMSG;
202fd896 991 }
cec736d2 992
b439282e
VC
993 if (s > tsize) {
994 r = journal_file_move_to(f, type, false, offset, s, &t, NULL);
cec736d2
LP
995 if (r < 0)
996 return r;
997
998 o = (Object*) t;
999 }
1000
24754f36
TR
1001 r = journal_file_check_object(f, offset, o);
1002 if (r < 0)
1003 return r;
1004
cec736d2
LP
1005 *ret = o;
1006 return 0;
1007}
1008
d98cc1f2 1009static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
cec736d2
LP
1010 uint64_t r;
1011
1012 assert(f);
c88cc6af 1013 assert(f->header);
cec736d2 1014
beec0085 1015 r = le64toh(f->header->tail_entry_seqnum) + 1;
c2373f84
LP
1016
1017 if (seqnum) {
de190aef 1018 /* If an external seqnum counter was passed, we update
c2373f84
LP
1019 * both the local and the external one, and set it to
1020 * the maximum of both */
1021
1022 if (*seqnum + 1 > r)
1023 r = *seqnum + 1;
1024
1025 *seqnum = r;
1026 }
1027
beec0085 1028 f->header->tail_entry_seqnum = htole64(r);
cec736d2 1029
beec0085
LP
1030 if (f->header->head_entry_seqnum == 0)
1031 f->header->head_entry_seqnum = htole64(r);
de190aef 1032
cec736d2
LP
1033 return r;
1034}
1035
78519831 1036int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
cec736d2
LP
1037 int r;
1038 uint64_t p;
1039 Object *tail, *o;
1040 void *t;
1041
1042 assert(f);
c88cc6af 1043 assert(f->header);
d05089d8 1044 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
cec736d2
LP
1045 assert(size >= sizeof(ObjectHeader));
1046 assert(offset);
1047 assert(ret);
1048
26687bf8
OS
1049 r = journal_file_set_online(f);
1050 if (r < 0)
1051 return r;
1052
cec736d2 1053 p = le64toh(f->header->tail_object_offset);
cec736d2 1054 if (p == 0)
23b0b2b2 1055 p = le64toh(f->header->header_size);
cec736d2 1056 else {
d05089d8 1057 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
cec736d2
LP
1058 if (r < 0)
1059 return r;
1060
1061 p += ALIGN64(le64toh(tail->object.size));
1062 }
1063
1064 r = journal_file_allocate(f, p, size);
1065 if (r < 0)
1066 return r;
1067
b439282e 1068 r = journal_file_move_to(f, type, false, p, size, &t, NULL);
cec736d2
LP
1069 if (r < 0)
1070 return r;
1071
1072 o = (Object*) t;
1073
1074 zero(o->object);
de190aef 1075 o->object.type = type;
cec736d2
LP
1076 o->object.size = htole64(size);
1077
1078 f->header->tail_object_offset = htole64(p);
cec736d2
LP
1079 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
1080
1081 *ret = o;
1082 *offset = p;
1083
1084 return 0;
1085}
1086
de190aef 1087static int journal_file_setup_data_hash_table(JournalFile *f) {
cec736d2
LP
1088 uint64_t s, p;
1089 Object *o;
1090 int r;
1091
1092 assert(f);
c88cc6af 1093 assert(f->header);
cec736d2 1094
070052ab
LP
1095 /* We estimate that we need 1 hash table entry per 768 bytes
1096 of journal file and we want to make sure we never get
1097 beyond 75% fill level. Calculate the hash table size for
1098 the maximum file size based on these metrics. */
4a92baf3 1099
dfabe643 1100 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
4a92baf3
LP
1101 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
1102 s = DEFAULT_DATA_HASH_TABLE_SIZE;
1103
507f22bd 1104 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
4a92baf3 1105
de190aef
LP
1106 r = journal_file_append_object(f,
1107 OBJECT_DATA_HASH_TABLE,
1108 offsetof(Object, hash_table.items) + s,
1109 &o, &p);
cec736d2
LP
1110 if (r < 0)
1111 return r;
1112
29804cc1 1113 memzero(o->hash_table.items, s);
cec736d2 1114
de190aef
LP
1115 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1116 f->header->data_hash_table_size = htole64(s);
cec736d2
LP
1117
1118 return 0;
1119}
1120
de190aef 1121static int journal_file_setup_field_hash_table(JournalFile *f) {
cec736d2
LP
1122 uint64_t s, p;
1123 Object *o;
1124 int r;
1125
1126 assert(f);
c88cc6af 1127 assert(f->header);
cec736d2 1128
3c1668da
LP
1129 /* We use a fixed size hash table for the fields as this
1130 * number should grow very slowly only */
1131
de190aef
LP
1132 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
1133 r = journal_file_append_object(f,
1134 OBJECT_FIELD_HASH_TABLE,
1135 offsetof(Object, hash_table.items) + s,
1136 &o, &p);
cec736d2
LP
1137 if (r < 0)
1138 return r;
1139
29804cc1 1140 memzero(o->hash_table.items, s);
cec736d2 1141
de190aef
LP
1142 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1143 f->header->field_hash_table_size = htole64(s);
cec736d2
LP
1144
1145 return 0;
1146}
1147
dade37d4 1148int journal_file_map_data_hash_table(JournalFile *f) {
cec736d2
LP
1149 uint64_t s, p;
1150 void *t;
1151 int r;
1152
1153 assert(f);
c88cc6af 1154 assert(f->header);
cec736d2 1155
dade37d4
LP
1156 if (f->data_hash_table)
1157 return 0;
1158
de190aef
LP
1159 p = le64toh(f->header->data_hash_table_offset);
1160 s = le64toh(f->header->data_hash_table_size);
cec736d2 1161
de190aef 1162 r = journal_file_move_to(f,
16e9f408 1163 OBJECT_DATA_HASH_TABLE,
fcde2389 1164 true,
de190aef 1165 p, s,
b42549ad 1166 &t, NULL);
cec736d2
LP
1167 if (r < 0)
1168 return r;
1169
de190aef 1170 f->data_hash_table = t;
cec736d2
LP
1171 return 0;
1172}
1173
dade37d4 1174int journal_file_map_field_hash_table(JournalFile *f) {
cec736d2
LP
1175 uint64_t s, p;
1176 void *t;
1177 int r;
1178
1179 assert(f);
c88cc6af 1180 assert(f->header);
cec736d2 1181
dade37d4
LP
1182 if (f->field_hash_table)
1183 return 0;
1184
de190aef
LP
1185 p = le64toh(f->header->field_hash_table_offset);
1186 s = le64toh(f->header->field_hash_table_size);
cec736d2 1187
de190aef 1188 r = journal_file_move_to(f,
16e9f408 1189 OBJECT_FIELD_HASH_TABLE,
fcde2389 1190 true,
de190aef 1191 p, s,
b42549ad 1192 &t, NULL);
cec736d2
LP
1193 if (r < 0)
1194 return r;
1195
de190aef 1196 f->field_hash_table = t;
cec736d2
LP
1197 return 0;
1198}
1199
3c1668da
LP
1200static int journal_file_link_field(
1201 JournalFile *f,
1202 Object *o,
1203 uint64_t offset,
1204 uint64_t hash) {
1205
805d1486 1206 uint64_t p, h, m;
3c1668da
LP
1207 int r;
1208
1209 assert(f);
c88cc6af 1210 assert(f->header);
90d222c1 1211 assert(f->field_hash_table);
3c1668da
LP
1212 assert(o);
1213 assert(offset > 0);
1214
1215 if (o->object.type != OBJECT_FIELD)
1216 return -EINVAL;
1217
805d1486
LP
1218 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
1219 if (m <= 0)
1220 return -EBADMSG;
3c1668da 1221
805d1486 1222 /* This might alter the window we are looking at */
3c1668da
LP
1223 o->field.next_hash_offset = o->field.head_data_offset = 0;
1224
805d1486 1225 h = hash % m;
3c1668da
LP
1226 p = le64toh(f->field_hash_table[h].tail_hash_offset);
1227 if (p == 0)
1228 f->field_hash_table[h].head_hash_offset = htole64(offset);
1229 else {
1230 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1231 if (r < 0)
1232 return r;
1233
1234 o->field.next_hash_offset = htole64(offset);
1235 }
1236
1237 f->field_hash_table[h].tail_hash_offset = htole64(offset);
1238
1239 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
1240 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
1241
1242 return 0;
1243}
1244
1245static int journal_file_link_data(
1246 JournalFile *f,
1247 Object *o,
1248 uint64_t offset,
1249 uint64_t hash) {
1250
805d1486 1251 uint64_t p, h, m;
cec736d2
LP
1252 int r;
1253
1254 assert(f);
c88cc6af 1255 assert(f->header);
90d222c1 1256 assert(f->data_hash_table);
cec736d2
LP
1257 assert(o);
1258 assert(offset > 0);
b588975f
LP
1259
1260 if (o->object.type != OBJECT_DATA)
1261 return -EINVAL;
cec736d2 1262
805d1486
LP
1263 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
1264 if (m <= 0)
1265 return -EBADMSG;
48496df6 1266
805d1486 1267 /* This might alter the window we are looking at */
de190aef
LP
1268 o->data.next_hash_offset = o->data.next_field_offset = 0;
1269 o->data.entry_offset = o->data.entry_array_offset = 0;
1270 o->data.n_entries = 0;
cec736d2 1271
805d1486 1272 h = hash % m;
8db4213e 1273 p = le64toh(f->data_hash_table[h].tail_hash_offset);
3c1668da 1274 if (p == 0)
cec736d2 1275 /* Only entry in the hash table is easy */
de190aef 1276 f->data_hash_table[h].head_hash_offset = htole64(offset);
3c1668da 1277 else {
48496df6
LP
1278 /* Move back to the previous data object, to patch in
1279 * pointer */
cec736d2 1280
de190aef 1281 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1282 if (r < 0)
1283 return r;
1284
de190aef 1285 o->data.next_hash_offset = htole64(offset);
cec736d2
LP
1286 }
1287
de190aef 1288 f->data_hash_table[h].tail_hash_offset = htole64(offset);
cec736d2 1289
dca6219e
LP
1290 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
1291 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
1292
cec736d2
LP
1293 return 0;
1294}
1295
3c1668da
LP
1296int journal_file_find_field_object_with_hash(
1297 JournalFile *f,
1298 const void *field, uint64_t size, uint64_t hash,
1299 Object **ret, uint64_t *offset) {
1300
805d1486 1301 uint64_t p, osize, h, m;
3c1668da
LP
1302 int r;
1303
1304 assert(f);
c88cc6af 1305 assert(f->header);
3c1668da
LP
1306 assert(field && size > 0);
1307
dade37d4
LP
1308 /* If the field hash table is empty, we can't find anything */
1309 if (le64toh(f->header->field_hash_table_size) <= 0)
1310 return 0;
1311
1312 /* Map the field hash table, if it isn't mapped yet. */
1313 r = journal_file_map_field_hash_table(f);
1314 if (r < 0)
1315 return r;
1316
3c1668da
LP
1317 osize = offsetof(Object, field.payload) + size;
1318
805d1486 1319 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
805d1486 1320 if (m <= 0)
3c1668da
LP
1321 return -EBADMSG;
1322
805d1486 1323 h = hash % m;
3c1668da
LP
1324 p = le64toh(f->field_hash_table[h].head_hash_offset);
1325
1326 while (p > 0) {
1327 Object *o;
1328
1329 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1330 if (r < 0)
1331 return r;
1332
1333 if (le64toh(o->field.hash) == hash &&
1334 le64toh(o->object.size) == osize &&
1335 memcmp(o->field.payload, field, size) == 0) {
1336
1337 if (ret)
1338 *ret = o;
1339 if (offset)
1340 *offset = p;
1341
1342 return 1;
1343 }
1344
1345 p = le64toh(o->field.next_hash_offset);
1346 }
1347
1348 return 0;
1349}
1350
1351int journal_file_find_field_object(
1352 JournalFile *f,
1353 const void *field, uint64_t size,
1354 Object **ret, uint64_t *offset) {
1355
1356 uint64_t hash;
1357
1358 assert(f);
1359 assert(field && size > 0);
1360
1361 hash = hash64(field, size);
1362
1363 return journal_file_find_field_object_with_hash(f,
1364 field, size, hash,
1365 ret, offset);
1366}
1367
de190aef
LP
1368int journal_file_find_data_object_with_hash(
1369 JournalFile *f,
1370 const void *data, uint64_t size, uint64_t hash,
1371 Object **ret, uint64_t *offset) {
48496df6 1372
805d1486 1373 uint64_t p, osize, h, m;
cec736d2
LP
1374 int r;
1375
1376 assert(f);
c88cc6af 1377 assert(f->header);
cec736d2
LP
1378 assert(data || size == 0);
1379
dade37d4
LP
1380 /* If there's no data hash table, then there's no entry. */
1381 if (le64toh(f->header->data_hash_table_size) <= 0)
1382 return 0;
1383
1384 /* Map the data hash table, if it isn't mapped yet. */
1385 r = journal_file_map_data_hash_table(f);
1386 if (r < 0)
1387 return r;
1388
cec736d2
LP
1389 osize = offsetof(Object, data.payload) + size;
1390
805d1486
LP
1391 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
1392 if (m <= 0)
bc85bfee
LP
1393 return -EBADMSG;
1394
805d1486 1395 h = hash % m;
de190aef 1396 p = le64toh(f->data_hash_table[h].head_hash_offset);
cec736d2 1397
de190aef
LP
1398 while (p > 0) {
1399 Object *o;
cec736d2 1400
de190aef 1401 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1402 if (r < 0)
1403 return r;
1404
807e17f0 1405 if (le64toh(o->data.hash) != hash)
85a131e8 1406 goto next;
807e17f0 1407
d89c8fdf 1408 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
349cc4a5 1409#if HAVE_XZ || HAVE_LZ4
fa1c4b51 1410 uint64_t l;
a7f7d1bd 1411 size_t rsize = 0;
cec736d2 1412
807e17f0
LP
1413 l = le64toh(o->object.size);
1414 if (l <= offsetof(Object, data.payload))
cec736d2
LP
1415 return -EBADMSG;
1416
807e17f0
LP
1417 l -= offsetof(Object, data.payload);
1418
d89c8fdf
ZJS
1419 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
1420 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
1421 if (r < 0)
1422 return r;
807e17f0 1423
b785c858 1424 if (rsize == size &&
807e17f0
LP
1425 memcmp(f->compress_buffer, data, size) == 0) {
1426
1427 if (ret)
1428 *ret = o;
1429
1430 if (offset)
1431 *offset = p;
1432
1433 return 1;
1434 }
3b1a55e1
ZJS
1435#else
1436 return -EPROTONOSUPPORT;
1437#endif
807e17f0
LP
1438 } else if (le64toh(o->object.size) == osize &&
1439 memcmp(o->data.payload, data, size) == 0) {
1440
cec736d2
LP
1441 if (ret)
1442 *ret = o;
1443
1444 if (offset)
1445 *offset = p;
1446
de190aef 1447 return 1;
cec736d2
LP
1448 }
1449
85a131e8 1450 next:
cec736d2
LP
1451 p = le64toh(o->data.next_hash_offset);
1452 }
1453
de190aef
LP
1454 return 0;
1455}
1456
1457int journal_file_find_data_object(
1458 JournalFile *f,
1459 const void *data, uint64_t size,
1460 Object **ret, uint64_t *offset) {
1461
1462 uint64_t hash;
1463
1464 assert(f);
1465 assert(data || size == 0);
1466
1467 hash = hash64(data, size);
1468
1469 return journal_file_find_data_object_with_hash(f,
1470 data, size, hash,
1471 ret, offset);
1472}
1473
3c1668da
LP
1474static int journal_file_append_field(
1475 JournalFile *f,
1476 const void *field, uint64_t size,
1477 Object **ret, uint64_t *offset) {
1478
1479 uint64_t hash, p;
1480 uint64_t osize;
1481 Object *o;
1482 int r;
1483
1484 assert(f);
1485 assert(field && size > 0);
1486
1487 hash = hash64(field, size);
1488
1489 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
1490 if (r < 0)
1491 return r;
1492 else if (r > 0) {
1493
1494 if (ret)
1495 *ret = o;
1496
1497 if (offset)
1498 *offset = p;
1499
1500 return 0;
1501 }
1502
1503 osize = offsetof(Object, field.payload) + size;
1504 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
8c92d4bb
LP
1505 if (r < 0)
1506 return r;
3c1668da
LP
1507
1508 o->field.hash = htole64(hash);
1509 memcpy(o->field.payload, field, size);
1510
1511 r = journal_file_link_field(f, o, p, hash);
1512 if (r < 0)
1513 return r;
1514
1515 /* The linking might have altered the window, so let's
1516 * refresh our pointer */
1517 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1518 if (r < 0)
1519 return r;
1520
349cc4a5 1521#if HAVE_GCRYPT
3c1668da
LP
1522 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1523 if (r < 0)
1524 return r;
1525#endif
1526
1527 if (ret)
1528 *ret = o;
1529
1530 if (offset)
1531 *offset = p;
1532
1533 return 0;
1534}
1535
48496df6
LP
1536static int journal_file_append_data(
1537 JournalFile *f,
1538 const void *data, uint64_t size,
1539 Object **ret, uint64_t *offset) {
1540
de190aef
LP
1541 uint64_t hash, p;
1542 uint64_t osize;
1543 Object *o;
d89c8fdf 1544 int r, compression = 0;
3c1668da 1545 const void *eq;
de190aef
LP
1546
1547 assert(f);
1548 assert(data || size == 0);
1549
1550 hash = hash64(data, size);
1551
1552 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1553 if (r < 0)
1554 return r;
0240c603 1555 if (r > 0) {
de190aef
LP
1556
1557 if (ret)
1558 *ret = o;
1559
1560 if (offset)
1561 *offset = p;
1562
1563 return 0;
1564 }
1565
1566 osize = offsetof(Object, data.payload) + size;
1567 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
cec736d2
LP
1568 if (r < 0)
1569 return r;
1570
cec736d2 1571 o->data.hash = htole64(hash);
807e17f0 1572
349cc4a5 1573#if HAVE_XZ || HAVE_LZ4
d1afbcd2 1574 if (JOURNAL_FILE_COMPRESS(f) && size >= COMPRESSION_SIZE_THRESHOLD) {
a7f7d1bd 1575 size_t rsize = 0;
807e17f0 1576
5d6f46b6 1577 compression = compress_blob(data, size, o->data.payload, size - 1, &rsize);
807e17f0 1578
d1afbcd2 1579 if (compression >= 0) {
807e17f0 1580 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
d89c8fdf 1581 o->object.flags |= compression;
807e17f0 1582
fa1c4b51 1583 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
d89c8fdf 1584 size, rsize, object_compressed_to_string(compression));
d1afbcd2
LP
1585 } else
1586 /* Compression didn't work, we don't really care why, let's continue without compression */
1587 compression = 0;
807e17f0
LP
1588 }
1589#endif
1590
75f32f04
ZJS
1591 if (compression == 0)
1592 memcpy_safe(o->data.payload, data, size);
cec736d2 1593
de190aef 1594 r = journal_file_link_data(f, o, p, hash);
cec736d2
LP
1595 if (r < 0)
1596 return r;
1597
349cc4a5 1598#if HAVE_GCRYPT
33685a5a
FB
1599 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1600 if (r < 0)
1601 return r;
1602#endif
1603
48496df6
LP
1604 /* The linking might have altered the window, so let's
1605 * refresh our pointer */
1606 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1607 if (r < 0)
1608 return r;
1609
08c6f819
SL
1610 if (!data)
1611 eq = NULL;
1612 else
1613 eq = memchr(data, '=', size);
3c1668da 1614 if (eq && eq > data) {
748db592 1615 Object *fo = NULL;
3c1668da 1616 uint64_t fp;
3c1668da
LP
1617
1618 /* Create field object ... */
1619 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1620 if (r < 0)
1621 return r;
1622
1623 /* ... and link it in. */
1624 o->data.next_field_offset = fo->field.head_data_offset;
1625 fo->field.head_data_offset = le64toh(p);
1626 }
1627
cec736d2
LP
1628 if (ret)
1629 *ret = o;
1630
1631 if (offset)
de190aef 1632 *offset = p;
cec736d2
LP
1633
1634 return 0;
1635}
1636
1637uint64_t journal_file_entry_n_items(Object *o) {
1638 assert(o);
b588975f
LP
1639
1640 if (o->object.type != OBJECT_ENTRY)
1641 return 0;
cec736d2
LP
1642
1643 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1644}
1645
0284adc6 1646uint64_t journal_file_entry_array_n_items(Object *o) {
de190aef 1647 assert(o);
b588975f
LP
1648
1649 if (o->object.type != OBJECT_ENTRY_ARRAY)
1650 return 0;
de190aef
LP
1651
1652 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1653}
1654
fb9a24b6
LP
1655uint64_t journal_file_hash_table_n_items(Object *o) {
1656 assert(o);
b588975f 1657
ec2ce0c5 1658 if (!IN_SET(o->object.type, OBJECT_DATA_HASH_TABLE, OBJECT_FIELD_HASH_TABLE))
b588975f 1659 return 0;
fb9a24b6
LP
1660
1661 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1662}
1663
de190aef 1664static int link_entry_into_array(JournalFile *f,
4fd052ae
FC
1665 le64_t *first,
1666 le64_t *idx,
de190aef 1667 uint64_t p) {
cec736d2 1668 int r;
de190aef
LP
1669 uint64_t n = 0, ap = 0, q, i, a, hidx;
1670 Object *o;
1671
cec736d2 1672 assert(f);
c88cc6af 1673 assert(f->header);
de190aef
LP
1674 assert(first);
1675 assert(idx);
1676 assert(p > 0);
cec736d2 1677
de190aef
LP
1678 a = le64toh(*first);
1679 i = hidx = le64toh(*idx);
1680 while (a > 0) {
1681
1682 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1683 if (r < 0)
1684 return r;
cec736d2 1685
de190aef
LP
1686 n = journal_file_entry_array_n_items(o);
1687 if (i < n) {
1688 o->entry_array.items[i] = htole64(p);
1689 *idx = htole64(hidx + 1);
1690 return 0;
1691 }
cec736d2 1692
de190aef
LP
1693 i -= n;
1694 ap = a;
1695 a = le64toh(o->entry_array.next_entry_array_offset);
1696 }
1697
1698 if (hidx > n)
1699 n = (hidx+1) * 2;
1700 else
1701 n = n * 2;
1702
1703 if (n < 4)
1704 n = 4;
1705
1706 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1707 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1708 &o, &q);
cec736d2
LP
1709 if (r < 0)
1710 return r;
1711
349cc4a5 1712#if HAVE_GCRYPT
5996c7c2 1713 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
b0af6f41
LP
1714 if (r < 0)
1715 return r;
feb12d3e 1716#endif
b0af6f41 1717
de190aef 1718 o->entry_array.items[i] = htole64(p);
cec736d2 1719
de190aef 1720 if (ap == 0)
7be3aa17 1721 *first = htole64(q);
cec736d2 1722 else {
de190aef 1723 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
cec736d2
LP
1724 if (r < 0)
1725 return r;
1726
de190aef
LP
1727 o->entry_array.next_entry_array_offset = htole64(q);
1728 }
cec736d2 1729
2dee23eb
LP
1730 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1731 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1732
de190aef
LP
1733 *idx = htole64(hidx + 1);
1734
1735 return 0;
1736}
cec736d2 1737
de190aef 1738static int link_entry_into_array_plus_one(JournalFile *f,
4fd052ae
FC
1739 le64_t *extra,
1740 le64_t *first,
1741 le64_t *idx,
de190aef
LP
1742 uint64_t p) {
1743
1744 int r;
1745
1746 assert(f);
1747 assert(extra);
1748 assert(first);
1749 assert(idx);
1750 assert(p > 0);
1751
1752 if (*idx == 0)
1753 *extra = htole64(p);
1754 else {
4fd052ae 1755 le64_t i;
de190aef 1756
7be3aa17 1757 i = htole64(le64toh(*idx) - 1);
de190aef
LP
1758 r = link_entry_into_array(f, first, &i, p);
1759 if (r < 0)
1760 return r;
cec736d2
LP
1761 }
1762
de190aef
LP
1763 *idx = htole64(le64toh(*idx) + 1);
1764 return 0;
1765}
1766
1767static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1768 uint64_t p;
1769 int r;
1770 assert(f);
1771 assert(o);
1772 assert(offset > 0);
1773
1774 p = le64toh(o->entry.items[i].object_offset);
1775 if (p == 0)
1776 return -EINVAL;
1777
1778 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1779 if (r < 0)
1780 return r;
1781
de190aef
LP
1782 return link_entry_into_array_plus_one(f,
1783 &o->data.entry_offset,
1784 &o->data.entry_array_offset,
1785 &o->data.n_entries,
1786 offset);
cec736d2
LP
1787}
1788
1789static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
de190aef 1790 uint64_t n, i;
cec736d2
LP
1791 int r;
1792
1793 assert(f);
c88cc6af 1794 assert(f->header);
cec736d2
LP
1795 assert(o);
1796 assert(offset > 0);
b588975f
LP
1797
1798 if (o->object.type != OBJECT_ENTRY)
1799 return -EINVAL;
cec736d2 1800
b788cc23
LP
1801 __sync_synchronize();
1802
cec736d2 1803 /* Link up the entry itself */
de190aef
LP
1804 r = link_entry_into_array(f,
1805 &f->header->entry_array_offset,
1806 &f->header->n_entries,
1807 offset);
1808 if (r < 0)
1809 return r;
cec736d2 1810
507f22bd 1811 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
cec736d2 1812
de190aef 1813 if (f->header->head_entry_realtime == 0)
0ac38b70 1814 f->header->head_entry_realtime = o->entry.realtime;
cec736d2 1815
0ac38b70 1816 f->header->tail_entry_realtime = o->entry.realtime;
de190aef
LP
1817 f->header->tail_entry_monotonic = o->entry.monotonic;
1818
1819 f->tail_entry_monotonic_valid = true;
cec736d2
LP
1820
1821 /* Link up the items */
1822 n = journal_file_entry_n_items(o);
1823 for (i = 0; i < n; i++) {
1824 r = journal_file_link_entry_item(f, o, offset, i);
1825 if (r < 0)
1826 return r;
1827 }
1828
cec736d2
LP
1829 return 0;
1830}
1831
1832static int journal_file_append_entry_internal(
1833 JournalFile *f,
1834 const dual_timestamp *ts,
1835 uint64_t xor_hash,
1836 const EntryItem items[], unsigned n_items,
de190aef 1837 uint64_t *seqnum,
cec736d2
LP
1838 Object **ret, uint64_t *offset) {
1839 uint64_t np;
1840 uint64_t osize;
1841 Object *o;
1842 int r;
1843
1844 assert(f);
c88cc6af 1845 assert(f->header);
cec736d2 1846 assert(items || n_items == 0);
de190aef 1847 assert(ts);
cec736d2
LP
1848
1849 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1850
de190aef 1851 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
cec736d2
LP
1852 if (r < 0)
1853 return r;
1854
d98cc1f2 1855 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
75f32f04 1856 memcpy_safe(o->entry.items, items, n_items * sizeof(EntryItem));
de190aef
LP
1857 o->entry.realtime = htole64(ts->realtime);
1858 o->entry.monotonic = htole64(ts->monotonic);
cec736d2
LP
1859 o->entry.xor_hash = htole64(xor_hash);
1860 o->entry.boot_id = f->header->boot_id;
1861
349cc4a5 1862#if HAVE_GCRYPT
5996c7c2 1863 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
b0af6f41
LP
1864 if (r < 0)
1865 return r;
feb12d3e 1866#endif
b0af6f41 1867
cec736d2
LP
1868 r = journal_file_link_entry(f, o, np);
1869 if (r < 0)
1870 return r;
1871
1872 if (ret)
1873 *ret = o;
1874
1875 if (offset)
1876 *offset = np;
1877
1878 return 0;
1879}
1880
cf244689 1881void journal_file_post_change(JournalFile *f) {
50f20cfd
LP
1882 assert(f);
1883
1884 /* inotify() does not receive IN_MODIFY events from file
1885 * accesses done via mmap(). After each access we hence
1886 * trigger IN_MODIFY by truncating the journal file to its
1887 * current size which triggers IN_MODIFY. */
1888
bc85bfee
LP
1889 __sync_synchronize();
1890
50f20cfd 1891 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
e167d7fd 1892 log_debug_errno(errno, "Failed to truncate file to its own size: %m");
50f20cfd
LP
1893}
1894
7a24f3bf
VC
1895static int post_change_thunk(sd_event_source *timer, uint64_t usec, void *userdata) {
1896 assert(userdata);
1897
1898 journal_file_post_change(userdata);
1899
1900 return 1;
1901}
1902
1903static void schedule_post_change(JournalFile *f) {
1904 sd_event_source *timer;
1905 int enabled, r;
1906 uint64_t now;
1907
1908 assert(f);
1909 assert(f->post_change_timer);
1910
1911 timer = f->post_change_timer;
1912
1913 r = sd_event_source_get_enabled(timer, &enabled);
1914 if (r < 0) {
e167d7fd
LP
1915 log_debug_errno(r, "Failed to get ftruncate timer state: %m");
1916 goto fail;
7a24f3bf
VC
1917 }
1918
1919 if (enabled == SD_EVENT_ONESHOT)
1920 return;
1921
1922 r = sd_event_now(sd_event_source_get_event(timer), CLOCK_MONOTONIC, &now);
1923 if (r < 0) {
e167d7fd
LP
1924 log_debug_errno(r, "Failed to get clock's now for scheduling ftruncate: %m");
1925 goto fail;
7a24f3bf
VC
1926 }
1927
1928 r = sd_event_source_set_time(timer, now+f->post_change_timer_period);
1929 if (r < 0) {
e167d7fd
LP
1930 log_debug_errno(r, "Failed to set time for scheduling ftruncate: %m");
1931 goto fail;
7a24f3bf
VC
1932 }
1933
1934 r = sd_event_source_set_enabled(timer, SD_EVENT_ONESHOT);
1935 if (r < 0) {
e167d7fd
LP
1936 log_debug_errno(r, "Failed to enable scheduled ftruncate: %m");
1937 goto fail;
7a24f3bf 1938 }
e167d7fd
LP
1939
1940 return;
1941
1942fail:
1943 /* On failure, let's simply post the change immediately. */
1944 journal_file_post_change(f);
7a24f3bf
VC
1945}
1946
1947/* Enable coalesced change posting in a timer on the provided sd_event instance */
1948int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t) {
1949 _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL;
1950 int r;
1951
1952 assert(f);
1953 assert_return(!f->post_change_timer, -EINVAL);
1954 assert(e);
1955 assert(t);
1956
1957 r = sd_event_add_time(e, &timer, CLOCK_MONOTONIC, 0, 0, post_change_thunk, f);
1958 if (r < 0)
1959 return r;
1960
1961 r = sd_event_source_set_enabled(timer, SD_EVENT_OFF);
1962 if (r < 0)
1963 return r;
1964
1965 f->post_change_timer = timer;
1966 timer = NULL;
1967 f->post_change_timer_period = t;
1968
1969 return r;
1970}
1971
1f2da9ec
LP
1972static int entry_item_cmp(const void *_a, const void *_b) {
1973 const EntryItem *a = _a, *b = _b;
1974
1975 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1976 return -1;
1977 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1978 return 1;
1979 return 0;
1980}
1981
de190aef 1982int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
cec736d2
LP
1983 unsigned i;
1984 EntryItem *items;
1985 int r;
1986 uint64_t xor_hash = 0;
de190aef 1987 struct dual_timestamp _ts;
cec736d2
LP
1988
1989 assert(f);
c88cc6af 1990 assert(f->header);
cec736d2
LP
1991 assert(iovec || n_iovec == 0);
1992
de190aef
LP
1993 if (!ts) {
1994 dual_timestamp_get(&_ts);
1995 ts = &_ts;
1996 }
1997
349cc4a5 1998#if HAVE_GCRYPT
7560fffc
LP
1999 r = journal_file_maybe_append_tag(f, ts->realtime);
2000 if (r < 0)
2001 return r;
feb12d3e 2002#endif
7560fffc 2003
64825d3c 2004 /* alloca() can't take 0, hence let's allocate at least one */
9607d947 2005 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
cec736d2
LP
2006
2007 for (i = 0; i < n_iovec; i++) {
2008 uint64_t p;
2009 Object *o;
2010
2011 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
2012 if (r < 0)
cf244689 2013 return r;
cec736d2
LP
2014
2015 xor_hash ^= le64toh(o->data.hash);
2016 items[i].object_offset = htole64(p);
de7b95cd 2017 items[i].hash = o->data.hash;
cec736d2
LP
2018 }
2019
1f2da9ec
LP
2020 /* Order by the position on disk, in order to improve seek
2021 * times for rotating media. */
7ff7394d 2022 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1f2da9ec 2023
de190aef 2024 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
cec736d2 2025
fa6ac760
LP
2026 /* If the memory mapping triggered a SIGBUS then we return an
2027 * IO error and ignore the error code passed down to us, since
2028 * it is very likely just an effect of a nullified replacement
2029 * mapping page */
2030
be7cdd8e 2031 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
2032 r = -EIO;
2033
7a24f3bf
VC
2034 if (f->post_change_timer)
2035 schedule_post_change(f);
2036 else
2037 journal_file_post_change(f);
50f20cfd 2038
cec736d2
LP
2039 return r;
2040}
2041
/* One cached position inside a chain of entry arrays */
typedef struct ChainCacheItem {
        uint64_t first;      /* offset of the array the chain starts with; used as the cache key */
        uint64_t array;      /* offset of the array that is cached */
        uint64_t begin;      /* first item stored in the cached array */
        uint64_t total;      /* number of items in all arrays of the chain preceding the cached one */
        uint64_t last_index; /* index looked at last, to optimize locality when bisecting */
} ChainCacheItem;
2049
2050static void chain_cache_put(
4743015d 2051 OrderedHashmap *h,
a4bcff5b
LP
2052 ChainCacheItem *ci,
2053 uint64_t first,
2054 uint64_t array,
2055 uint64_t begin,
f268980d
LP
2056 uint64_t total,
2057 uint64_t last_index) {
a4bcff5b
LP
2058
2059 if (!ci) {
34741aa3
LP
2060 /* If the chain item to cache for this chain is the
2061 * first one it's not worth caching anything */
2062 if (array == first)
2063 return;
2064
29433089 2065 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
4743015d 2066 ci = ordered_hashmap_steal_first(h);
29433089
LP
2067 assert(ci);
2068 } else {
a4bcff5b
LP
2069 ci = new(ChainCacheItem, 1);
2070 if (!ci)
2071 return;
2072 }
2073
2074 ci->first = first;
2075
4743015d 2076 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
a4bcff5b
LP
2077 free(ci);
2078 return;
2079 }
2080 } else
2081 assert(ci->first == first);
2082
2083 ci->array = array;
2084 ci->begin = begin;
2085 ci->total = total;
f268980d 2086 ci->last_index = last_index;
a4bcff5b
LP
2087}
2088
f268980d
LP
2089static int generic_array_get(
2090 JournalFile *f,
2091 uint64_t first,
2092 uint64_t i,
2093 Object **ret, uint64_t *offset) {
de190aef 2094
cec736d2 2095 Object *o;
a4bcff5b 2096 uint64_t p = 0, a, t = 0;
cec736d2 2097 int r;
a4bcff5b 2098 ChainCacheItem *ci;
cec736d2
LP
2099
2100 assert(f);
2101
de190aef 2102 a = first;
a4bcff5b
LP
2103
2104 /* Try the chain cache first */
4743015d 2105 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
2106 if (ci && i > ci->total) {
2107 a = ci->array;
2108 i -= ci->total;
2109 t = ci->total;
2110 }
2111
de190aef 2112 while (a > 0) {
a4bcff5b 2113 uint64_t k;
cec736d2 2114
de190aef
LP
2115 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
2116 if (r < 0)
2117 return r;
cec736d2 2118
a4bcff5b
LP
2119 k = journal_file_entry_array_n_items(o);
2120 if (i < k) {
de190aef 2121 p = le64toh(o->entry_array.items[i]);
a4bcff5b 2122 goto found;
cec736d2
LP
2123 }
2124
a4bcff5b
LP
2125 i -= k;
2126 t += k;
de190aef
LP
2127 a = le64toh(o->entry_array.next_entry_array_offset);
2128 }
2129
a4bcff5b
LP
2130 return 0;
2131
2132found:
2133 /* Let's cache this item for the next invocation */
af13a6b0 2134 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
de190aef
LP
2135
2136 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2137 if (r < 0)
2138 return r;
2139
2140 if (ret)
2141 *ret = o;
2142
2143 if (offset)
2144 *offset = p;
2145
2146 return 1;
2147}
2148
f268980d
LP
2149static int generic_array_get_plus_one(
2150 JournalFile *f,
2151 uint64_t extra,
2152 uint64_t first,
2153 uint64_t i,
2154 Object **ret, uint64_t *offset) {
de190aef
LP
2155
2156 Object *o;
2157
2158 assert(f);
2159
2160 if (i == 0) {
2161 int r;
2162
2163 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
cec736d2
LP
2164 if (r < 0)
2165 return r;
2166
de190aef
LP
2167 if (ret)
2168 *ret = o;
cec736d2 2169
de190aef
LP
2170 if (offset)
2171 *offset = extra;
cec736d2 2172
de190aef 2173 return 1;
cec736d2
LP
2174 }
2175
de190aef
LP
2176 return generic_array_get(f, first, i-1, ret, offset);
2177}
cec736d2 2178
de190aef
LP
/* Verdicts returned by the test_object_*() callbacks used by the bisection helpers. */
enum {
        TEST_FOUND = 0, /* the tested object matches the needle exactly */
        TEST_LEFT  = 1, /* the tested object sorts before the needle */
        TEST_RIGHT = 2, /* the tested object sorts after the needle */
};
cec736d2 2184
f268980d
LP
2185static int generic_array_bisect(
2186 JournalFile *f,
2187 uint64_t first,
2188 uint64_t n,
2189 uint64_t needle,
2190 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2191 direction_t direction,
2192 Object **ret,
2193 uint64_t *offset,
2194 uint64_t *idx) {
2195
2196 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
de190aef
LP
2197 bool subtract_one = false;
2198 Object *o, *array = NULL;
2199 int r;
a4bcff5b 2200 ChainCacheItem *ci;
cec736d2 2201
de190aef
LP
2202 assert(f);
2203 assert(test_object);
cec736d2 2204
a4bcff5b 2205 /* Start with the first array in the chain */
de190aef 2206 a = first;
a4bcff5b 2207
4743015d 2208 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
2209 if (ci && n > ci->total) {
2210 /* Ah, we have iterated this bisection array chain
2211 * previously! Let's see if we can skip ahead in the
2212 * chain, as far as the last time. But we can't jump
2213 * backwards in the chain, so let's check that
2214 * first. */
2215
2216 r = test_object(f, ci->begin, needle);
2217 if (r < 0)
2218 return r;
2219
2220 if (r == TEST_LEFT) {
f268980d 2221 /* OK, what we are looking for is right of the
a4bcff5b
LP
2222 * begin of this EntryArray, so let's jump
2223 * straight to previously cached array in the
2224 * chain */
2225
2226 a = ci->array;
2227 n -= ci->total;
2228 t = ci->total;
f268980d 2229 last_index = ci->last_index;
a4bcff5b
LP
2230 }
2231 }
2232
de190aef
LP
2233 while (a > 0) {
2234 uint64_t left, right, k, lp;
2235
2236 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
cec736d2
LP
2237 if (r < 0)
2238 return r;
2239
de190aef
LP
2240 k = journal_file_entry_array_n_items(array);
2241 right = MIN(k, n);
2242 if (right <= 0)
2243 return 0;
cec736d2 2244
de190aef
LP
2245 i = right - 1;
2246 lp = p = le64toh(array->entry_array.items[i]);
2247 if (p <= 0)
bee6a291
LP
2248 r = -EBADMSG;
2249 else
2250 r = test_object(f, p, needle);
2251 if (r == -EBADMSG) {
2252 log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (1)");
2253 n = i;
2254 continue;
2255 }
de190aef
LP
2256 if (r < 0)
2257 return r;
cec736d2 2258
de190aef
LP
2259 if (r == TEST_FOUND)
2260 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2261
2262 if (r == TEST_RIGHT) {
2263 left = 0;
2264 right -= 1;
f268980d
LP
2265
2266 if (last_index != (uint64_t) -1) {
2267 assert(last_index <= right);
2268
2269 /* If we cached the last index we
2270 * looked at, let's try to not to jump
2271 * too wildly around and see if we can
2272 * limit the range to look at early to
2273 * the immediate neighbors of the last
2274 * index we looked at. */
2275
2276 if (last_index > 0) {
2277 uint64_t x = last_index - 1;
2278
2279 p = le64toh(array->entry_array.items[x]);
2280 if (p <= 0)
2281 return -EBADMSG;
2282
2283 r = test_object(f, p, needle);
2284 if (r < 0)
2285 return r;
2286
2287 if (r == TEST_FOUND)
2288 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2289
2290 if (r == TEST_RIGHT)
2291 right = x;
2292 else
2293 left = x + 1;
2294 }
2295
2296 if (last_index < right) {
2297 uint64_t y = last_index + 1;
2298
2299 p = le64toh(array->entry_array.items[y]);
2300 if (p <= 0)
2301 return -EBADMSG;
2302
2303 r = test_object(f, p, needle);
2304 if (r < 0)
2305 return r;
2306
2307 if (r == TEST_FOUND)
2308 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2309
2310 if (r == TEST_RIGHT)
2311 right = y;
2312 else
2313 left = y + 1;
2314 }
f268980d
LP
2315 }
2316
de190aef
LP
2317 for (;;) {
2318 if (left == right) {
2319 if (direction == DIRECTION_UP)
2320 subtract_one = true;
2321
2322 i = left;
2323 goto found;
2324 }
2325
2326 assert(left < right);
de190aef 2327 i = (left + right) / 2;
f268980d 2328
de190aef
LP
2329 p = le64toh(array->entry_array.items[i]);
2330 if (p <= 0)
bee6a291
LP
2331 r = -EBADMSG;
2332 else
2333 r = test_object(f, p, needle);
2334 if (r == -EBADMSG) {
2335 log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (2)");
2336 right = n = i;
2337 continue;
2338 }
de190aef
LP
2339 if (r < 0)
2340 return r;
cec736d2 2341
de190aef
LP
2342 if (r == TEST_FOUND)
2343 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2344
2345 if (r == TEST_RIGHT)
2346 right = i;
2347 else
2348 left = i + 1;
2349 }
2350 }
2351
2173cbf8 2352 if (k >= n) {
cbdca852
LP
2353 if (direction == DIRECTION_UP) {
2354 i = n;
2355 subtract_one = true;
2356 goto found;
2357 }
2358
cec736d2 2359 return 0;
cbdca852 2360 }
cec736d2 2361
de190aef
LP
2362 last_p = lp;
2363
2364 n -= k;
2365 t += k;
f268980d 2366 last_index = (uint64_t) -1;
de190aef 2367 a = le64toh(array->entry_array.next_entry_array_offset);
cec736d2
LP
2368 }
2369
2370 return 0;
de190aef
LP
2371
2372found:
2373 if (subtract_one && t == 0 && i == 0)
2374 return 0;
2375
a4bcff5b 2376 /* Let's cache this item for the next invocation */
af13a6b0 2377 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
a4bcff5b 2378
de190aef
LP
2379 if (subtract_one && i == 0)
2380 p = last_p;
2381 else if (subtract_one)
2382 p = le64toh(array->entry_array.items[i-1]);
2383 else
2384 p = le64toh(array->entry_array.items[i]);
2385
2386 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2387 if (r < 0)
2388 return r;
2389
2390 if (ret)
2391 *ret = o;
2392
2393 if (offset)
2394 *offset = p;
2395
2396 if (idx)
cbdca852 2397 *idx = t + i + (subtract_one ? -1 : 0);
de190aef
LP
2398
2399 return 1;
cec736d2
LP
2400}
2401
f268980d
LP
2402static int generic_array_bisect_plus_one(
2403 JournalFile *f,
2404 uint64_t extra,
2405 uint64_t first,
2406 uint64_t n,
2407 uint64_t needle,
2408 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2409 direction_t direction,
2410 Object **ret,
2411 uint64_t *offset,
2412 uint64_t *idx) {
de190aef 2413
cec736d2 2414 int r;
cbdca852
LP
2415 bool step_back = false;
2416 Object *o;
cec736d2
LP
2417
2418 assert(f);
de190aef 2419 assert(test_object);
cec736d2 2420
de190aef
LP
2421 if (n <= 0)
2422 return 0;
cec736d2 2423
de190aef
LP
2424 /* This bisects the array in object 'first', but first checks
2425 * an extra */
de190aef
LP
2426 r = test_object(f, extra, needle);
2427 if (r < 0)
2428 return r;
a536e261
LP
2429
2430 if (r == TEST_FOUND)
2431 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2432
cbdca852
LP
2433 /* if we are looking with DIRECTION_UP then we need to first
2434 see if in the actual array there is a matching entry, and
2435 return the last one of that. But if there isn't any we need
2436 to return this one. Hence remember this, and return it
2437 below. */
2438 if (r == TEST_LEFT)
2439 step_back = direction == DIRECTION_UP;
de190aef 2440
cbdca852
LP
2441 if (r == TEST_RIGHT) {
2442 if (direction == DIRECTION_DOWN)
2443 goto found;
2444 else
2445 return 0;
a536e261 2446 }
cec736d2 2447
de190aef
LP
2448 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
2449
cbdca852
LP
2450 if (r == 0 && step_back)
2451 goto found;
2452
ecf68b1d 2453 if (r > 0 && idx)
313cefa1 2454 (*idx)++;
de190aef
LP
2455
2456 return r;
cbdca852
LP
2457
2458found:
2459 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
2460 if (r < 0)
2461 return r;
2462
2463 if (ret)
2464 *ret = o;
2465
2466 if (offset)
2467 *offset = extra;
2468
2469 if (idx)
2470 *idx = 0;
2471
2472 return 1;
2473}
2474
44a6b1b6 2475_pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
cbdca852
LP
2476 assert(f);
2477 assert(p > 0);
2478
2479 if (p == needle)
2480 return TEST_FOUND;
2481 else if (p < needle)
2482 return TEST_LEFT;
2483 else
2484 return TEST_RIGHT;
2485}
2486
de190aef
LP
2487static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
2488 Object *o;
2489 int r;
2490
2491 assert(f);
2492 assert(p > 0);
2493
2494 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
cec736d2
LP
2495 if (r < 0)
2496 return r;
2497
de190aef
LP
2498 if (le64toh(o->entry.seqnum) == needle)
2499 return TEST_FOUND;
2500 else if (le64toh(o->entry.seqnum) < needle)
2501 return TEST_LEFT;
2502 else
2503 return TEST_RIGHT;
2504}
cec736d2 2505
de190aef
LP
2506int journal_file_move_to_entry_by_seqnum(
2507 JournalFile *f,
2508 uint64_t seqnum,
2509 direction_t direction,
2510 Object **ret,
2511 uint64_t *offset) {
c88cc6af
VC
2512 assert(f);
2513 assert(f->header);
de190aef
LP
2514
2515 return generic_array_bisect(f,
2516 le64toh(f->header->entry_array_offset),
2517 le64toh(f->header->n_entries),
2518 seqnum,
2519 test_object_seqnum,
2520 direction,
2521 ret, offset, NULL);
2522}
cec736d2 2523
de190aef
LP
2524static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
2525 Object *o;
2526 int r;
2527
2528 assert(f);
2529 assert(p > 0);
2530
2531 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2532 if (r < 0)
2533 return r;
2534
2535 if (le64toh(o->entry.realtime) == needle)
2536 return TEST_FOUND;
2537 else if (le64toh(o->entry.realtime) < needle)
2538 return TEST_LEFT;
2539 else
2540 return TEST_RIGHT;
cec736d2
LP
2541}
2542
de190aef
LP
2543int journal_file_move_to_entry_by_realtime(
2544 JournalFile *f,
2545 uint64_t realtime,
2546 direction_t direction,
2547 Object **ret,
2548 uint64_t *offset) {
c88cc6af
VC
2549 assert(f);
2550 assert(f->header);
de190aef
LP
2551
2552 return generic_array_bisect(f,
2553 le64toh(f->header->entry_array_offset),
2554 le64toh(f->header->n_entries),
2555 realtime,
2556 test_object_realtime,
2557 direction,
2558 ret, offset, NULL);
2559}
2560
2561static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
2562 Object *o;
2563 int r;
2564
2565 assert(f);
2566 assert(p > 0);
2567
2568 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2569 if (r < 0)
2570 return r;
2571
2572 if (le64toh(o->entry.monotonic) == needle)
2573 return TEST_FOUND;
2574 else if (le64toh(o->entry.monotonic) < needle)
2575 return TEST_LEFT;
2576 else
2577 return TEST_RIGHT;
2578}
2579
2a560338 2580static int find_data_object_by_boot_id(
47838ab3
ZJS
2581 JournalFile *f,
2582 sd_id128_t boot_id,
2583 Object **o,
2584 uint64_t *b) {
2a560338 2585
47838ab3
ZJS
2586 char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
2587
2588 sd_id128_to_string(boot_id, t + 9);
2589 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2590}
2591
de190aef
LP
2592int journal_file_move_to_entry_by_monotonic(
2593 JournalFile *f,
2594 sd_id128_t boot_id,
2595 uint64_t monotonic,
2596 direction_t direction,
2597 Object **ret,
2598 uint64_t *offset) {
2599
de190aef
LP
2600 Object *o;
2601 int r;
2602
cbdca852 2603 assert(f);
de190aef 2604
47838ab3 2605 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
de190aef
LP
2606 if (r < 0)
2607 return r;
cbdca852 2608 if (r == 0)
de190aef
LP
2609 return -ENOENT;
2610
2611 return generic_array_bisect_plus_one(f,
2612 le64toh(o->data.entry_offset),
2613 le64toh(o->data.entry_array_offset),
2614 le64toh(o->data.n_entries),
2615 monotonic,
2616 test_object_monotonic,
2617 direction,
2618 ret, offset, NULL);
2619}
2620
1fc605b0 2621void journal_file_reset_location(JournalFile *f) {
6573ef05 2622 f->location_type = LOCATION_HEAD;
1fc605b0 2623 f->current_offset = 0;
6573ef05
MS
2624 f->current_seqnum = 0;
2625 f->current_realtime = 0;
2626 f->current_monotonic = 0;
2627 zero(f->current_boot_id);
2628 f->current_xor_hash = 0;
2629}
2630
950c07d4 2631void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
6573ef05
MS
2632 f->location_type = LOCATION_SEEK;
2633 f->current_offset = offset;
2634 f->current_seqnum = le64toh(o->entry.seqnum);
2635 f->current_realtime = le64toh(o->entry.realtime);
2636 f->current_monotonic = le64toh(o->entry.monotonic);
2637 f->current_boot_id = o->entry.boot_id;
2638 f->current_xor_hash = le64toh(o->entry.xor_hash);
1fc605b0
MS
2639}
2640
d8ae66d7
MS
2641int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2642 assert(af);
c88cc6af 2643 assert(af->header);
d8ae66d7 2644 assert(bf);
c88cc6af 2645 assert(bf->header);
d8ae66d7
MS
2646 assert(af->location_type == LOCATION_SEEK);
2647 assert(bf->location_type == LOCATION_SEEK);
2648
2649 /* If contents and timestamps match, these entries are
2650 * identical, even if the seqnum does not match */
2651 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2652 af->current_monotonic == bf->current_monotonic &&
2653 af->current_realtime == bf->current_realtime &&
2654 af->current_xor_hash == bf->current_xor_hash)
2655 return 0;
2656
2657 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2658
2659 /* If this is from the same seqnum source, compare
2660 * seqnums */
2661 if (af->current_seqnum < bf->current_seqnum)
2662 return -1;
2663 if (af->current_seqnum > bf->current_seqnum)
2664 return 1;
2665
2666 /* Wow! This is weird, different data but the same
2667 * seqnums? Something is borked, but let's make the
2668 * best of it and compare by time. */
2669 }
2670
2671 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2672
2673 /* If the boot id matches, compare monotonic time */
2674 if (af->current_monotonic < bf->current_monotonic)
2675 return -1;
2676 if (af->current_monotonic > bf->current_monotonic)
2677 return 1;
2678 }
2679
2680 /* Otherwise, compare UTC time */
2681 if (af->current_realtime < bf->current_realtime)
2682 return -1;
2683 if (af->current_realtime > bf->current_realtime)
2684 return 1;
2685
2686 /* Finally, compare by contents */
2687 if (af->current_xor_hash < bf->current_xor_hash)
2688 return -1;
2689 if (af->current_xor_hash > bf->current_xor_hash)
2690 return 1;
2691
2692 return 0;
2693}
2694
aa598ba5
LP
2695static int bump_array_index(uint64_t *i, direction_t direction, uint64_t n) {
2696
2697 /* Increase or decrease the specified index, in the right direction. */
2698
2699 if (direction == DIRECTION_DOWN) {
2700 if (*i >= n - 1)
2701 return 0;
2702
2703 (*i) ++;
2704 } else {
2705 if (*i <= 0)
2706 return 0;
2707
2708 (*i) --;
2709 }
2710
2711 return 1;
2712}
2713
b6da4ed0
LP
2714static bool check_properly_ordered(uint64_t new_offset, uint64_t old_offset, direction_t direction) {
2715
2716 /* Consider it an error if any of the two offsets is uninitialized */
2717 if (old_offset == 0 || new_offset == 0)
2718 return false;
2719
2720 /* If we go down, the new offset must be larger than the old one. */
2721 return direction == DIRECTION_DOWN ?
2722 new_offset > old_offset :
2723 new_offset < old_offset;
2724}
2725
de190aef
LP
2726int journal_file_next_entry(
2727 JournalFile *f,
f534928a 2728 uint64_t p,
de190aef
LP
2729 direction_t direction,
2730 Object **ret, uint64_t *offset) {
2731
fb099c8d 2732 uint64_t i, n, ofs;
cec736d2
LP
2733 int r;
2734
2735 assert(f);
c88cc6af 2736 assert(f->header);
de190aef
LP
2737
2738 n = le64toh(f->header->n_entries);
2739 if (n <= 0)
2740 return 0;
cec736d2 2741
f534928a 2742 if (p == 0)
de190aef 2743 i = direction == DIRECTION_DOWN ? 0 : n - 1;
cec736d2 2744 else {
de190aef
LP
2745 r = generic_array_bisect(f,
2746 le64toh(f->header->entry_array_offset),
2747 le64toh(f->header->n_entries),
2748 p,
2749 test_object_offset,
2750 DIRECTION_DOWN,
2751 NULL, NULL,
2752 &i);
2753 if (r <= 0)
2754 return r;
2755
aa598ba5
LP
2756 r = bump_array_index(&i, direction, n);
2757 if (r <= 0)
2758 return r;
cec736d2
LP
2759 }
2760
de190aef 2761 /* And jump to it */
989793d3
LP
2762 for (;;) {
2763 r = generic_array_get(f,
2764 le64toh(f->header->entry_array_offset),
2765 i,
2766 ret, &ofs);
2767 if (r > 0)
2768 break;
2769 if (r != -EBADMSG)
2770 return r;
2771
2772 /* OK, so this entry is borked. Most likely some entry didn't get synced to disk properly, let's see if
2773 * the next one might work for us instead. */
2774 log_debug_errno(r, "Entry item %" PRIu64 " is bad, skipping over it.", i);
2775
2776 r = bump_array_index(&i, direction, n);
2777 if (r <= 0)
2778 return r;
caeab8f6 2779 }
fb099c8d 2780
b6da4ed0
LP
2781 /* Ensure our array is properly ordered. */
2782 if (p > 0 && !check_properly_ordered(ofs, p, direction)) {
2783 log_debug("%s: entry array not properly ordered at entry %" PRIu64, f->path, i);
fb099c8d
ZJS
2784 return -EBADMSG;
2785 }
2786
2787 if (offset)
2788 *offset = ofs;
2789
2790 return 1;
de190aef 2791}
cec736d2 2792
de190aef
LP
2793int journal_file_next_entry_for_data(
2794 JournalFile *f,
2795 Object *o, uint64_t p,
2796 uint64_t data_offset,
2797 direction_t direction,
2798 Object **ret, uint64_t *offset) {
2799
ded5034e 2800 uint64_t i, n, ofs;
de190aef 2801 Object *d;
989793d3 2802 int r;
cec736d2
LP
2803
2804 assert(f);
de190aef 2805 assert(p > 0 || !o);
cec736d2 2806
de190aef 2807 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
466ccd92 2808 if (r < 0)
de190aef 2809 return r;
cec736d2 2810
de190aef
LP
2811 n = le64toh(d->data.n_entries);
2812 if (n <= 0)
2813 return n;
cec736d2 2814
de190aef
LP
2815 if (!o)
2816 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2817 else {
2818 if (o->object.type != OBJECT_ENTRY)
2819 return -EINVAL;
cec736d2 2820
de190aef
LP
2821 r = generic_array_bisect_plus_one(f,
2822 le64toh(d->data.entry_offset),
2823 le64toh(d->data.entry_array_offset),
2824 le64toh(d->data.n_entries),
2825 p,
2826 test_object_offset,
2827 DIRECTION_DOWN,
2828 NULL, NULL,
2829 &i);
2830
2831 if (r <= 0)
cec736d2
LP
2832 return r;
2833
aa598ba5
LP
2834 r = bump_array_index(&i, direction, n);
2835 if (r <= 0)
2836 return r;
de190aef 2837 }
cec736d2 2838
989793d3
LP
2839 for (;;) {
2840 r = generic_array_get_plus_one(f,
2841 le64toh(d->data.entry_offset),
2842 le64toh(d->data.entry_array_offset),
2843 i,
2844 ret, &ofs);
2845 if (r > 0)
2846 break;
2847 if (r != -EBADMSG)
2848 return r;
2849
2850 log_debug_errno(r, "Data entry item %" PRIu64 " is bad, skipping over it.", i);
2851
2852 r = bump_array_index(&i, direction, n);
2853 if (r <= 0)
2854 return r;
2855 }
ded5034e
LP
2856
2857 /* Ensure our array is properly ordered. */
2858 if (p > 0 && check_properly_ordered(ofs, p, direction)) {
2859 log_debug("%s data entry array not properly ordered at entry %" PRIu64, f->path, i);
2860 return -EBADMSG;
2861 }
2862
2863 if (offset)
2864 *offset = ofs;
2865
2866 return 1;
de190aef 2867}
cec736d2 2868
cbdca852
LP
2869int journal_file_move_to_entry_by_offset_for_data(
2870 JournalFile *f,
2871 uint64_t data_offset,
2872 uint64_t p,
2873 direction_t direction,
2874 Object **ret, uint64_t *offset) {
2875
2876 int r;
2877 Object *d;
2878
2879 assert(f);
2880
2881 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2882 if (r < 0)
2883 return r;
2884
2885 return generic_array_bisect_plus_one(f,
2886 le64toh(d->data.entry_offset),
2887 le64toh(d->data.entry_array_offset),
2888 le64toh(d->data.n_entries),
2889 p,
2890 test_object_offset,
2891 direction,
2892 ret, offset, NULL);
2893}
2894
2895int journal_file_move_to_entry_by_monotonic_for_data(
2896 JournalFile *f,
2897 uint64_t data_offset,
2898 sd_id128_t boot_id,
2899 uint64_t monotonic,
2900 direction_t direction,
2901 Object **ret, uint64_t *offset) {
2902
cbdca852
LP
2903 Object *o, *d;
2904 int r;
2905 uint64_t b, z;
2906
2907 assert(f);
2908
2909 /* First, seek by time */
47838ab3 2910 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
cbdca852
LP
2911 if (r < 0)
2912 return r;
2913 if (r == 0)
2914 return -ENOENT;
2915
2916 r = generic_array_bisect_plus_one(f,
2917 le64toh(o->data.entry_offset),
2918 le64toh(o->data.entry_array_offset),
2919 le64toh(o->data.n_entries),
2920 monotonic,
2921 test_object_monotonic,
2922 direction,
2923 NULL, &z, NULL);
2924 if (r <= 0)
2925 return r;
2926
2927 /* And now, continue seeking until we find an entry that
2928 * exists in both bisection arrays */
2929
2930 for (;;) {
2931 Object *qo;
2932 uint64_t p, q;
2933
2934 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2935 if (r < 0)
2936 return r;
2937
2938 r = generic_array_bisect_plus_one(f,
2939 le64toh(d->data.entry_offset),
2940 le64toh(d->data.entry_array_offset),
2941 le64toh(d->data.n_entries),
2942 z,
2943 test_object_offset,
2944 direction,
2945 NULL, &p, NULL);
2946 if (r <= 0)
2947 return r;
2948
2949 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2950 if (r < 0)
2951 return r;
2952
2953 r = generic_array_bisect_plus_one(f,
2954 le64toh(o->data.entry_offset),
2955 le64toh(o->data.entry_array_offset),
2956 le64toh(o->data.n_entries),
2957 p,
2958 test_object_offset,
2959 direction,
2960 &qo, &q, NULL);
2961
2962 if (r <= 0)
2963 return r;
2964
2965 if (p == q) {
2966 if (ret)
2967 *ret = qo;
2968 if (offset)
2969 *offset = q;
2970
2971 return 1;
2972 }
2973
2974 z = q;
2975 }
cbdca852
LP
2976}
2977
de190aef
LP
2978int journal_file_move_to_entry_by_seqnum_for_data(
2979 JournalFile *f,
2980 uint64_t data_offset,
2981 uint64_t seqnum,
2982 direction_t direction,
2983 Object **ret, uint64_t *offset) {
cec736d2 2984
de190aef
LP
2985 Object *d;
2986 int r;
cec736d2 2987
91a31dde
LP
2988 assert(f);
2989
de190aef 2990 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2991 if (r < 0)
de190aef 2992 return r;
cec736d2 2993
de190aef
LP
2994 return generic_array_bisect_plus_one(f,
2995 le64toh(d->data.entry_offset),
2996 le64toh(d->data.entry_array_offset),
2997 le64toh(d->data.n_entries),
2998 seqnum,
2999 test_object_seqnum,
3000 direction,
3001 ret, offset, NULL);
3002}
cec736d2 3003
de190aef
LP
3004int journal_file_move_to_entry_by_realtime_for_data(
3005 JournalFile *f,
3006 uint64_t data_offset,
3007 uint64_t realtime,
3008 direction_t direction,
3009 Object **ret, uint64_t *offset) {
3010
3011 Object *d;
3012 int r;
3013
91a31dde
LP
3014 assert(f);
3015
de190aef 3016 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 3017 if (r < 0)
de190aef
LP
3018 return r;
3019
3020 return generic_array_bisect_plus_one(f,
3021 le64toh(d->data.entry_offset),
3022 le64toh(d->data.entry_array_offset),
3023 le64toh(d->data.n_entries),
3024 realtime,
3025 test_object_realtime,
3026 direction,
3027 ret, offset, NULL);
cec736d2
LP
3028}
3029
0284adc6 3030void journal_file_dump(JournalFile *f) {
7560fffc 3031 Object *o;
7560fffc 3032 int r;
0284adc6 3033 uint64_t p;
7560fffc
LP
3034
3035 assert(f);
c88cc6af 3036 assert(f->header);
7560fffc 3037
0284adc6 3038 journal_file_print_header(f);
7560fffc 3039
0284adc6
LP
3040 p = le64toh(f->header->header_size);
3041 while (p != 0) {
d05089d8 3042 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
0284adc6
LP
3043 if (r < 0)
3044 goto fail;
7560fffc 3045
0284adc6 3046 switch (o->object.type) {
d98cc1f2 3047
0284adc6
LP
3048 case OBJECT_UNUSED:
3049 printf("Type: OBJECT_UNUSED\n");
3050 break;
d98cc1f2 3051
0284adc6
LP
3052 case OBJECT_DATA:
3053 printf("Type: OBJECT_DATA\n");
3054 break;
7560fffc 3055
3c1668da
LP
3056 case OBJECT_FIELD:
3057 printf("Type: OBJECT_FIELD\n");
3058 break;
3059
0284adc6 3060 case OBJECT_ENTRY:
507f22bd
ZJS
3061 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
3062 le64toh(o->entry.seqnum),
3063 le64toh(o->entry.monotonic),
3064 le64toh(o->entry.realtime));
0284adc6 3065 break;
7560fffc 3066
0284adc6
LP
3067 case OBJECT_FIELD_HASH_TABLE:
3068 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
3069 break;
7560fffc 3070
0284adc6
LP
3071 case OBJECT_DATA_HASH_TABLE:
3072 printf("Type: OBJECT_DATA_HASH_TABLE\n");
3073 break;
7560fffc 3074
0284adc6
LP
3075 case OBJECT_ENTRY_ARRAY:
3076 printf("Type: OBJECT_ENTRY_ARRAY\n");
3077 break;
7560fffc 3078
0284adc6 3079 case OBJECT_TAG:
507f22bd
ZJS
3080 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
3081 le64toh(o->tag.seqnum),
3082 le64toh(o->tag.epoch));
0284adc6 3083 break;
3c1668da
LP
3084
3085 default:
8facc349 3086 printf("Type: unknown (%i)\n", o->object.type);
3c1668da 3087 break;
0284adc6 3088 }
7560fffc 3089
d89c8fdf
ZJS
3090 if (o->object.flags & OBJECT_COMPRESSION_MASK)
3091 printf("Flags: %s\n",
3092 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
7560fffc 3093
0284adc6
LP
3094 if (p == le64toh(f->header->tail_object_offset))
3095 p = 0;
3096 else
3097 p = p + ALIGN64(le64toh(o->object.size));
3098 }
7560fffc 3099
0284adc6
LP
3100 return;
3101fail:
3102 log_error("File corrupt");
7560fffc
LP
3103}
3104
718fe4b1
ZJS
3105static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
3106 const char *x;
3107
3108 x = format_timestamp(buf, l, t);
3109 if (x)
3110 return x;
3111 return " --- ";
3112}
3113
0284adc6 3114void journal_file_print_header(JournalFile *f) {
2765b7bb 3115 char a[33], b[33], c[33], d[33];
ed375beb 3116 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
a1a03e30
LP
3117 struct stat st;
3118 char bytes[FORMAT_BYTES_MAX];
7560fffc
LP
3119
3120 assert(f);
c88cc6af 3121 assert(f->header);
7560fffc 3122
0284adc6
LP
3123 printf("File Path: %s\n"
3124 "File ID: %s\n"
3125 "Machine ID: %s\n"
3126 "Boot ID: %s\n"
3127 "Sequential Number ID: %s\n"
3128 "State: %s\n"
3129 "Compatible Flags:%s%s\n"
d89c8fdf 3130 "Incompatible Flags:%s%s%s\n"
507f22bd
ZJS
3131 "Header size: %"PRIu64"\n"
3132 "Arena size: %"PRIu64"\n"
3133 "Data Hash Table Size: %"PRIu64"\n"
3134 "Field Hash Table Size: %"PRIu64"\n"
0284adc6 3135 "Rotate Suggested: %s\n"
0808b92f
LP
3136 "Head Sequential Number: %"PRIu64" (%"PRIx64")\n"
3137 "Tail Sequential Number: %"PRIu64" (%"PRIx64")\n"
3138 "Head Realtime Timestamp: %s (%"PRIx64")\n"
3139 "Tail Realtime Timestamp: %s (%"PRIx64")\n"
3140 "Tail Monotonic Timestamp: %s (%"PRIx64")\n"
507f22bd
ZJS
3141 "Objects: %"PRIu64"\n"
3142 "Entry Objects: %"PRIu64"\n",
0284adc6
LP
3143 f->path,
3144 sd_id128_to_string(f->header->file_id, a),
3145 sd_id128_to_string(f->header->machine_id, b),
3146 sd_id128_to_string(f->header->boot_id, c),
2765b7bb 3147 sd_id128_to_string(f->header->seqnum_id, d),
3223f44f
LP
3148 f->header->state == STATE_OFFLINE ? "OFFLINE" :
3149 f->header->state == STATE_ONLINE ? "ONLINE" :
3150 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
8088cbd3 3151 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
d89c8fdf
ZJS
3152 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
3153 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
3154 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
3155 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
507f22bd
ZJS
3156 le64toh(f->header->header_size),
3157 le64toh(f->header->arena_size),
3158 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3159 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
fb0951b0 3160 yes_no(journal_file_rotate_suggested(f, 0)),
0808b92f
LP
3161 le64toh(f->header->head_entry_seqnum), le64toh(f->header->head_entry_seqnum),
3162 le64toh(f->header->tail_entry_seqnum), le64toh(f->header->tail_entry_seqnum),
3163 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)), le64toh(f->header->head_entry_realtime),
3164 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)), le64toh(f->header->tail_entry_realtime),
3165 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC), le64toh(f->header->tail_entry_monotonic),
507f22bd
ZJS
3166 le64toh(f->header->n_objects),
3167 le64toh(f->header->n_entries));
7560fffc 3168
0284adc6 3169 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
507f22bd 3170 printf("Data Objects: %"PRIu64"\n"
0284adc6 3171 "Data Hash Table Fill: %.1f%%\n",
507f22bd 3172 le64toh(f->header->n_data),
0284adc6 3173 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
7560fffc 3174
0284adc6 3175 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
507f22bd 3176 printf("Field Objects: %"PRIu64"\n"
0284adc6 3177 "Field Hash Table Fill: %.1f%%\n",
507f22bd 3178 le64toh(f->header->n_fields),
0284adc6 3179 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3223f44f
LP
3180
3181 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
507f22bd
ZJS
3182 printf("Tag Objects: %"PRIu64"\n",
3183 le64toh(f->header->n_tags));
3223f44f 3184 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
507f22bd
ZJS
3185 printf("Entry Array Objects: %"PRIu64"\n",
3186 le64toh(f->header->n_entry_arrays));
a1a03e30
LP
3187
3188 if (fstat(f->fd, &st) >= 0)
59f448cf 3189 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
7560fffc
LP
3190}
3191
fc68c929
LP
3192static int journal_file_warn_btrfs(JournalFile *f) {
3193 unsigned attrs;
3194 int r;
3195
3196 assert(f);
3197
3198 /* Before we write anything, check if the COW logic is turned
3199 * off on btrfs. Given our write pattern that is quite
3200 * unfriendly to COW file systems this should greatly improve
3201 * performance on COW file systems, such as btrfs, at the
3202 * expense of data integrity features (which shouldn't be too
3203 * bad, given that we do our own checksumming). */
3204
3205 r = btrfs_is_filesystem(f->fd);
3206 if (r < 0)
3207 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
3208 if (!r)
3209 return 0;
3210
3211 r = read_attr_fd(f->fd, &attrs);
3212 if (r < 0)
3213 return log_warning_errno(r, "Failed to read file attributes: %m");
3214
3215 if (attrs & FS_NOCOW_FL) {
3216 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
3217 return 0;
3218 }
3219
3220 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
3221 "This is likely to slow down journal access substantially, please consider turning "
3222 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
3223
3224 return 1;
3225}
3226
0284adc6 3227int journal_file_open(
5d1ce257 3228 int fd,
0284adc6
LP
3229 const char *fname,
3230 int flags,
3231 mode_t mode,
3232 bool compress,
baed47c3 3233 bool seal,
0284adc6
LP
3234 JournalMetrics *metrics,
3235 MMapCache *mmap_cache,
b58c888f 3236 Set *deferred_closes,
0284adc6
LP
3237 JournalFile *template,
3238 JournalFile **ret) {
7560fffc 3239
fa6ac760 3240 bool newly_created = false;
0284adc6 3241 JournalFile *f;
fa6ac760 3242 void *h;
0284adc6 3243 int r;
7560fffc 3244
0559d3a5 3245 assert(ret);
5d1ce257 3246 assert(fd >= 0 || fname);
7560fffc 3247
ec2ce0c5 3248 if (!IN_SET((flags & O_ACCMODE), O_RDONLY, O_RDWR))
0284adc6 3249 return -EINVAL;
7560fffc 3250
5d1ce257
LP
3251 if (fname) {
3252 if (!endswith(fname, ".journal") &&
3253 !endswith(fname, ".journal~"))
3254 return -EINVAL;
3255 }
7560fffc 3256
0284adc6
LP
3257 f = new0(JournalFile, 1);
3258 if (!f)
3259 return -ENOMEM;
7560fffc 3260
5d1ce257 3261 f->fd = fd;
0284adc6 3262 f->mode = mode;
7560fffc 3263
0284adc6
LP
3264 f->flags = flags;
3265 f->prot = prot_from_flags(flags);
3266 f->writable = (flags & O_ACCMODE) != O_RDONLY;
349cc4a5 3267#if HAVE_LZ4
d89c8fdf 3268 f->compress_lz4 = compress;
349cc4a5 3269#elif HAVE_XZ
d89c8fdf 3270 f->compress_xz = compress;
48b61739 3271#endif
349cc4a5 3272#if HAVE_GCRYPT
baed47c3 3273 f->seal = seal;
49a32d43 3274#endif
7560fffc 3275
0284adc6
LP
3276 if (mmap_cache)
3277 f->mmap = mmap_cache_ref(mmap_cache);
3278 else {
84168d80 3279 f->mmap = mmap_cache_new();
0284adc6
LP
3280 if (!f->mmap) {
3281 r = -ENOMEM;
3282 goto fail;
3283 }
3284 }
7560fffc 3285
7645c77b 3286 if (fname) {
5d1ce257 3287 f->path = strdup(fname);
7645c77b
ZJS
3288 if (!f->path) {
3289 r = -ENOMEM;
3290 goto fail;
3291 }
3292 } else {
3293 /* If we don't know the path, fill in something explanatory and vaguely useful */
3294 if (asprintf(&f->path, "/proc/self/%i", fd) < 0) {
3295 r = -ENOMEM;
3296 goto fail;
3297 }
0284adc6 3298 }
7560fffc 3299
4743015d 3300 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
a4bcff5b
LP
3301 if (!f->chain_cache) {
3302 r = -ENOMEM;
3303 goto fail;
3304 }
3305
0284adc6 3306 if (f->fd < 0) {
5d1ce257
LP
3307 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
3308 if (f->fd < 0) {
3309 r = -errno;
3310 goto fail;
3311 }
3312
3313 /* fds we opened here by us should also be closed by us. */
3314 f->close_fd = true;
7560fffc 3315 }
7560fffc 3316
be7cdd8e
VC
3317 f->cache_fd = mmap_cache_add_fd(f->mmap, f->fd);
3318 if (!f->cache_fd) {
3319 r = -ENOMEM;
3320 goto fail;
3321 }
3322
2678031a
LP
3323 r = journal_file_fstat(f);
3324 if (r < 0)
0284adc6 3325 goto fail;
7560fffc 3326
0284adc6 3327 if (f->last_stat.st_size == 0 && f->writable) {
11689d2a 3328
fc68c929 3329 (void) journal_file_warn_btrfs(f);
11689d2a 3330
fb0951b0
LP
3331 /* Let's attach the creation time to the journal file,
3332 * so that the vacuuming code knows the age of this
3333 * file even if the file might end up corrupted one
3334 * day... Ideally we'd just use the creation time many
3335 * file systems maintain for each file, but there is
3336 * currently no usable API to query this, hence let's
3337 * emulate this via extended attributes. If extended
3338 * attributes are not supported we'll just skip this,
7517e174 3339 * and rely solely on mtime/atime/ctime of the file. */
fb0951b0 3340
d61b600d 3341 fd_setcrtime(f->fd, 0);
7560fffc 3342
349cc4a5 3343#if HAVE_GCRYPT
0284adc6 3344 /* Try to load the FSPRG state, and if we can't, then
baed47c3 3345 * just don't do sealing */
49a32d43
LP
3346 if (f->seal) {
3347 r = journal_file_fss_load(f);
3348 if (r < 0)
3349 f->seal = false;
3350 }
feb12d3e 3351#endif
7560fffc 3352
0284adc6
LP
3353 r = journal_file_init_header(f, template);
3354 if (r < 0)
3355 goto fail;
7560fffc 3356
2678031a
LP
3357 r = journal_file_fstat(f);
3358 if (r < 0)
0284adc6 3359 goto fail;
fb0951b0
LP
3360
3361 newly_created = true;
0284adc6 3362 }
7560fffc 3363
0284adc6 3364 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
cfb571f3 3365 r = -ENODATA;
0284adc6
LP
3366 goto fail;
3367 }
7560fffc 3368
b42549ad 3369 r = mmap_cache_get(f->mmap, f->cache_fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h, NULL);
977eaa1e 3370 if (r < 0)
0284adc6 3371 goto fail;
7560fffc 3372
fa6ac760
LP
3373 f->header = h;
3374
0284adc6 3375 if (!newly_created) {
b58c888f
VC
3376 if (deferred_closes)
3377 journal_file_close_set(deferred_closes);
3378
0284adc6
LP
3379 r = journal_file_verify_header(f);
3380 if (r < 0)
3381 goto fail;
3382 }
7560fffc 3383
349cc4a5 3384#if HAVE_GCRYPT
0284adc6 3385 if (!newly_created && f->writable) {
baed47c3 3386 r = journal_file_fss_load(f);
0284adc6
LP
3387 if (r < 0)
3388 goto fail;
3389 }
feb12d3e 3390#endif
cec736d2
LP
3391
3392 if (f->writable) {
4a92baf3
LP
3393 if (metrics) {
3394 journal_default_metrics(metrics, f->fd);
3395 f->metrics = *metrics;
3396 } else if (template)
3397 f->metrics = template->metrics;
3398
cec736d2
LP
3399 r = journal_file_refresh_header(f);
3400 if (r < 0)
3401 goto fail;
3402 }
3403
349cc4a5 3404#if HAVE_GCRYPT
baed47c3 3405 r = journal_file_hmac_setup(f);
14d10188
LP
3406 if (r < 0)
3407 goto fail;
feb12d3e 3408#endif
14d10188 3409
cec736d2 3410 if (newly_created) {
de190aef 3411 r = journal_file_setup_field_hash_table(f);
cec736d2
LP
3412 if (r < 0)
3413 goto fail;
3414
de190aef 3415 r = journal_file_setup_data_hash_table(f);
cec736d2
LP
3416 if (r < 0)
3417 goto fail;
7560fffc 3418
349cc4a5 3419#if HAVE_GCRYPT
7560fffc
LP
3420 r = journal_file_append_first_tag(f);
3421 if (r < 0)
3422 goto fail;
feb12d3e 3423#endif
cec736d2
LP
3424 }
3425
be7cdd8e 3426 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd)) {
fa6ac760
LP
3427 r = -EIO;
3428 goto fail;
3429 }
3430
7a24f3bf 3431 if (template && template->post_change_timer) {
e167d7fd
LP
3432 r = journal_file_enable_post_change_timer(
3433 f,
3434 sd_event_source_get_event(template->post_change_timer),
3435 template->post_change_timer_period);
7a24f3bf 3436
7a24f3bf
VC
3437 if (r < 0)
3438 goto fail;
3439 }
3440
f8e2f4d6 3441 /* The file is opened now successfully, thus we take possession of any passed in fd. */
5d1ce257
LP
3442 f->close_fd = true;
3443
0559d3a5 3444 *ret = f;
cec736d2
LP
3445 return 0;
3446
3447fail:
be7cdd8e 3448 if (f->cache_fd && mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
3449 r = -EIO;
3450
69a3a6fd 3451 (void) journal_file_close(f);
cec736d2
LP
3452
3453 return r;
3454}
0ac38b70 3455
b58c888f 3456int journal_file_rotate(JournalFile **f, bool compress, bool seal, Set *deferred_closes) {
57535f47 3457 _cleanup_free_ char *p = NULL;
0ac38b70
LP
3458 size_t l;
3459 JournalFile *old_file, *new_file = NULL;
3460 int r;
3461
3462 assert(f);
3463 assert(*f);
3464
3465 old_file = *f;
3466
3467 if (!old_file->writable)
3468 return -EINVAL;
3469
5d1ce257 3470 /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse
13e785f7 3471 * rotation, since we don't know the actual path, and couldn't rename the file hence. */
5d1ce257
LP
3472 if (path_startswith(old_file->path, "/proc/self/fd"))
3473 return -EINVAL;
3474
0ac38b70
LP
3475 if (!endswith(old_file->path, ".journal"))
3476 return -EINVAL;
3477
3478 l = strlen(old_file->path);
57535f47
ZJS
3479 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
3480 (int) l - 8, old_file->path,
3481 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
3482 le64toh((*f)->header->head_entry_seqnum),
3483 le64toh((*f)->header->head_entry_realtime));
3484 if (r < 0)
0ac38b70
LP
3485 return -ENOMEM;
3486
2678031a
LP
3487 /* Try to rename the file to the archived version. If the file
3488 * already was deleted, we'll get ENOENT, let's ignore that
3489 * case. */
0ac38b70 3490 r = rename(old_file->path, p);
2678031a 3491 if (r < 0 && errno != ENOENT)
0ac38b70
LP
3492 return -errno;
3493
1fcefd88
LP
3494 /* Sync the rename to disk */
3495 (void) fsync_directory_of_file(old_file->fd);
3496
8eb85171
VC
3497 /* Set as archive so offlining commits w/state=STATE_ARCHIVED.
3498 * Previously we would set old_file->header->state to STATE_ARCHIVED directly here,
3499 * but journal_file_set_offline() short-circuits when state != STATE_ONLINE, which
3500 * would result in the rotated journal never getting fsync() called before closing.
3501 * Now we simply queue the archive state by setting an archive bit, leaving the state
3502 * as STATE_ONLINE so proper offlining occurs. */
3503 old_file->archive = true;
0ac38b70 3504
f27a3864
LP
3505 /* Currently, btrfs is not very good with out write patterns
3506 * and fragments heavily. Let's defrag our journal files when
3507 * we archive them */
3508 old_file->defrag_on_close = true;
3509
5d1ce257 3510 r = journal_file_open(-1, old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, deferred_closes, old_file, &new_file);
b58c888f
VC
3511
3512 if (deferred_closes &&
3513 set_put(deferred_closes, old_file) >= 0)
3514 (void) journal_file_set_offline(old_file, false);
3515 else
3516 (void) journal_file_close(old_file);
0ac38b70
LP
3517
3518 *f = new_file;
3519 return r;
3520}
3521
9447a7f1
LP
3522int journal_file_open_reliably(
3523 const char *fname,
3524 int flags,
3525 mode_t mode,
7560fffc 3526 bool compress,
baed47c3 3527 bool seal,
4a92baf3 3528 JournalMetrics *metrics,
27370278 3529 MMapCache *mmap_cache,
b58c888f 3530 Set *deferred_closes,
9447a7f1
LP
3531 JournalFile *template,
3532 JournalFile **ret) {
3533
3534 int r;
3535 size_t l;
ed375beb 3536 _cleanup_free_ char *p = NULL;
9447a7f1 3537
5d1ce257 3538 r = journal_file_open(-1, fname, flags, mode, compress, seal, metrics, mmap_cache, deferred_closes, template, ret);
288359db 3539 if (!IN_SET(r,
b288cdeb
ZJS
3540 -EBADMSG, /* Corrupted */
3541 -ENODATA, /* Truncated */
3542 -EHOSTDOWN, /* Other machine */
3543 -EPROTONOSUPPORT, /* Incompatible feature */
3544 -EBUSY, /* Unclean shutdown */
3545 -ESHUTDOWN, /* Already archived */
288359db 3546 -EIO, /* IO error, including SIGBUS on mmap */
ae739cc1
LP
3547 -EIDRM, /* File has been deleted */
3548 -ETXTBSY)) /* File is from the future */
9447a7f1
LP
3549 return r;
3550
3551 if ((flags & O_ACCMODE) == O_RDONLY)
3552 return r;
3553
3554 if (!(flags & O_CREAT))
3555 return r;
3556
7560fffc
LP
3557 if (!endswith(fname, ".journal"))
3558 return r;
3559
5c70eab4
LP
3560 /* The file is corrupted. Rotate it away and try it again (but only once) */
3561
9447a7f1 3562 l = strlen(fname);
d587eca5 3563 if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
57535f47 3564 (int) l - 8, fname,
d587eca5 3565 now(CLOCK_REALTIME),
9bf3b535 3566 random_u64()) < 0)
9447a7f1
LP
3567 return -ENOMEM;
3568
65089b82 3569 if (rename(fname, p) < 0)
9447a7f1
LP
3570 return -errno;
3571
f27a3864
LP
3572 /* btrfs doesn't cope well with our write pattern and
3573 * fragments heavily. Let's defrag all files we rotate */
11689d2a 3574
a67d68b8 3575 (void) chattr_path(p, 0, FS_NOCOW_FL);
f27a3864
LP
3576 (void) btrfs_defrag(p);
3577
65089b82 3578 log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
9447a7f1 3579
5d1ce257 3580 return journal_file_open(-1, fname, flags, mode, compress, seal, metrics, mmap_cache, deferred_closes, template, ret);
9447a7f1
LP
3581}
3582
cf244689
LP
3583int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
3584 uint64_t i, n;
3585 uint64_t q, xor_hash = 0;
3586 int r;
3587 EntryItem *items;
3588 dual_timestamp ts;
3589
3590 assert(from);
3591 assert(to);
3592 assert(o);
3593 assert(p);
3594
3595 if (!to->writable)
3596 return -EPERM;
3597
3598 ts.monotonic = le64toh(o->entry.monotonic);
3599 ts.realtime = le64toh(o->entry.realtime);
3600
cf244689 3601 n = journal_file_entry_n_items(o);
4faa7004
TA
3602 /* alloca() can't take 0, hence let's allocate at least one */
3603 items = alloca(sizeof(EntryItem) * MAX(1u, n));
cf244689
LP
3604
3605 for (i = 0; i < n; i++) {
4fd052ae
FC
3606 uint64_t l, h;
3607 le64_t le_hash;
cf244689
LP
3608 size_t t;
3609 void *data;
3610 Object *u;
3611
3612 q = le64toh(o->entry.items[i].object_offset);
3613 le_hash = o->entry.items[i].hash;
3614
3615 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
3616 if (r < 0)
3617 return r;
3618
3619 if (le_hash != o->data.hash)
3620 return -EBADMSG;
3621
3622 l = le64toh(o->object.size) - offsetof(Object, data.payload);
3623 t = (size_t) l;
3624
3625 /* We hit the limit on 32bit machines */
3626 if ((uint64_t) t != l)
3627 return -E2BIG;
3628
d89c8fdf 3629 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
349cc4a5 3630#if HAVE_XZ || HAVE_LZ4
a7f7d1bd 3631 size_t rsize = 0;
cf244689 3632
d89c8fdf
ZJS
3633 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
3634 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
3635 if (r < 0)
3636 return r;
cf244689
LP
3637
3638 data = from->compress_buffer;
3639 l = rsize;
3b1a55e1
ZJS
3640#else
3641 return -EPROTONOSUPPORT;
3642#endif
cf244689
LP
3643 } else
3644 data = o->data.payload;
3645
3646 r = journal_file_append_data(to, data, l, &u, &h);
3647 if (r < 0)
3648 return r;
3649
3650 xor_hash ^= le64toh(u->data.hash);
3651 items[i].object_offset = htole64(h);
3652 items[i].hash = u->data.hash;
3653
3654 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
3655 if (r < 0)
3656 return r;
3657 }
3658
fa6ac760
LP
3659 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
3660
be7cdd8e 3661 if (mmap_cache_got_sigbus(to->mmap, to->cache_fd))
fa6ac760
LP
3662 return -EIO;
3663
3664 return r;
cf244689 3665}
babfc091 3666
8580d1f7
LP
3667void journal_reset_metrics(JournalMetrics *m) {
3668 assert(m);
3669
3670 /* Set everything to "pick automatic values". */
3671
3672 *m = (JournalMetrics) {
3673 .min_use = (uint64_t) -1,
3674 .max_use = (uint64_t) -1,
3675 .min_size = (uint64_t) -1,
3676 .max_size = (uint64_t) -1,
3677 .keep_free = (uint64_t) -1,
3678 .n_max_files = (uint64_t) -1,
3679 };
3680}
3681
babfc091 3682void journal_default_metrics(JournalMetrics *m, int fd) {
8580d1f7 3683 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
babfc091 3684 struct statvfs ss;
8580d1f7 3685 uint64_t fs_size;
babfc091
LP
3686
3687 assert(m);
3688 assert(fd >= 0);
3689
3690 if (fstatvfs(fd, &ss) >= 0)
3691 fs_size = ss.f_frsize * ss.f_blocks;
8580d1f7
LP
3692 else {
3693 log_debug_errno(errno, "Failed to detremine disk size: %m");
3694 fs_size = 0;
3695 }
babfc091
LP
3696
3697 if (m->max_use == (uint64_t) -1) {
3698
3699 if (fs_size > 0) {
3700 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3701
3702 if (m->max_use > DEFAULT_MAX_USE_UPPER)
3703 m->max_use = DEFAULT_MAX_USE_UPPER;
3704
3705 if (m->max_use < DEFAULT_MAX_USE_LOWER)
3706 m->max_use = DEFAULT_MAX_USE_LOWER;
3707 } else
3708 m->max_use = DEFAULT_MAX_USE_LOWER;
3709 } else {
3710 m->max_use = PAGE_ALIGN(m->max_use);
3711
8580d1f7 3712 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
babfc091
LP
3713 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3714 }
3715
8580d1f7
LP
3716 if (m->min_use == (uint64_t) -1)
3717 m->min_use = DEFAULT_MIN_USE;
3718
3719 if (m->min_use > m->max_use)
3720 m->min_use = m->max_use;
3721
babfc091
LP
3722 if (m->max_size == (uint64_t) -1) {
3723 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3724
3725 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3726 m->max_size = DEFAULT_MAX_SIZE_UPPER;
3727 } else
3728 m->max_size = PAGE_ALIGN(m->max_size);
3729
8580d1f7
LP
3730 if (m->max_size != 0) {
3731 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3732 m->max_size = JOURNAL_FILE_SIZE_MIN;
babfc091 3733
8580d1f7
LP
3734 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3735 m->max_use = m->max_size*2;
3736 }
babfc091
LP
3737
3738 if (m->min_size == (uint64_t) -1)
3739 m->min_size = JOURNAL_FILE_SIZE_MIN;
3740 else {
3741 m->min_size = PAGE_ALIGN(m->min_size);
3742
3743 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3744 m->min_size = JOURNAL_FILE_SIZE_MIN;
3745
8580d1f7 3746 if (m->max_size != 0 && m->min_size > m->max_size)
babfc091
LP
3747 m->max_size = m->min_size;
3748 }
3749
3750 if (m->keep_free == (uint64_t) -1) {
3751
3752 if (fs_size > 0) {
8621b110 3753 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
babfc091
LP
3754
3755 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3756 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3757
3758 } else
3759 m->keep_free = DEFAULT_KEEP_FREE;
3760 }
3761
8580d1f7
LP
3762 if (m->n_max_files == (uint64_t) -1)
3763 m->n_max_files = DEFAULT_N_MAX_FILES;
3764
3765 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3766 format_bytes(a, sizeof(a), m->min_use),
3767 format_bytes(b, sizeof(b), m->max_use),
3768 format_bytes(c, sizeof(c), m->max_size),
3769 format_bytes(d, sizeof(d), m->min_size),
3770 format_bytes(e, sizeof(e), m->keep_free),
3771 m->n_max_files);
babfc091 3772}
08984293
LP
3773
3774int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
08984293 3775 assert(f);
c88cc6af 3776 assert(f->header);
08984293
LP
3777 assert(from || to);
3778
3779 if (from) {
162566a4
LP
3780 if (f->header->head_entry_realtime == 0)
3781 return -ENOENT;
08984293 3782
162566a4 3783 *from = le64toh(f->header->head_entry_realtime);
08984293
LP
3784 }
3785
3786 if (to) {
162566a4
LP
3787 if (f->header->tail_entry_realtime == 0)
3788 return -ENOENT;
08984293 3789
162566a4 3790 *to = le64toh(f->header->tail_entry_realtime);
08984293
LP
3791 }
3792
3793 return 1;
3794}
3795
3796int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
08984293
LP
3797 Object *o;
3798 uint64_t p;
3799 int r;
3800
3801 assert(f);
3802 assert(from || to);
3803
47838ab3 3804 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
08984293
LP
3805 if (r <= 0)
3806 return r;
3807
3808 if (le64toh(o->data.n_entries) <= 0)
3809 return 0;
3810
3811 if (from) {
3812 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3813 if (r < 0)
3814 return r;
3815
3816 *from = le64toh(o->entry.monotonic);
3817 }
3818
3819 if (to) {
3820 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3821 if (r < 0)
3822 return r;
3823
3824 r = generic_array_get_plus_one(f,
3825 le64toh(o->data.entry_offset),
3826 le64toh(o->data.entry_array_offset),
3827 le64toh(o->data.n_entries)-1,
3828 &o, NULL);
3829 if (r <= 0)
3830 return r;
3831
3832 *to = le64toh(o->entry.monotonic);
3833 }
3834
3835 return 1;
3836}
dca6219e 3837
fb0951b0 3838bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
dca6219e 3839 assert(f);
c88cc6af 3840 assert(f->header);
dca6219e
LP
3841
3842 /* If we gained new header fields we gained new features,
3843 * hence suggest a rotation */
361f9cbc
LP
3844 if (le64toh(f->header->header_size) < sizeof(Header)) {
3845 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
dca6219e 3846 return true;
361f9cbc 3847 }
dca6219e
LP
3848
3849 /* Let's check if the hash tables grew over a certain fill
3850 * level (75%, borrowing this value from Java's hash table
3851 * implementation), and if so suggest a rotation. To calculate
3852 * the fill level we need the n_data field, which only exists
3853 * in newer versions. */
3854
3855 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
361f9cbc 3856 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3857 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
361f9cbc
LP
3858 f->path,
3859 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3860 le64toh(f->header->n_data),
3861 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3862 (unsigned long long) f->last_stat.st_size,
3863 f->last_stat.st_size / le64toh(f->header->n_data));
dca6219e 3864 return true;
361f9cbc 3865 }
dca6219e
LP
3866
3867 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
361f9cbc 3868 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3869 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
361f9cbc
LP
3870 f->path,
3871 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3872 le64toh(f->header->n_fields),
3873 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
dca6219e 3874 return true;
361f9cbc 3875 }
dca6219e 3876
0598fd4a
LP
3877 /* Are the data objects properly indexed by field objects? */
3878 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3879 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3880 le64toh(f->header->n_data) > 0 &&
3881 le64toh(f->header->n_fields) == 0)
3882 return true;
3883
fb0951b0
LP
3884 if (max_file_usec > 0) {
3885 usec_t t, h;
3886
3887 h = le64toh(f->header->head_entry_realtime);
3888 t = now(CLOCK_REALTIME);
3889
3890 if (h > 0 && t > h + max_file_usec)
3891 return true;
3892 }
3893
dca6219e
LP
3894 return false;
3895}