]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journal-file.c
udev: some very trivial coding style updates
[thirdparty/systemd.git] / src / journal / journal-file.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
cec736d2
LP
2/***
3 This file is part of systemd.
4
5 Copyright 2011 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
cec736d2
LP
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 15 Lesser General Public License for more details.
cec736d2 16
5430f7f2 17 You should have received a copy of the GNU Lesser General Public License
cec736d2
LP
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19***/
20
cec736d2 21#include <errno.h>
cec736d2 22#include <fcntl.h>
11689d2a 23#include <linux/fs.h>
ac2e41f5 24#include <pthread.h>
07630cea
LP
25#include <stddef.h>
26#include <sys/mman.h>
27#include <sys/statvfs.h>
28#include <sys/uio.h>
29#include <unistd.h>
fb0951b0 30
b5efdb8a 31#include "alloc-util.h"
f27a3864 32#include "btrfs-util.h"
c8b3094d 33#include "chattr-util.h"
07630cea 34#include "compress.h"
3ffd4af2 35#include "fd-util.h"
0284adc6 36#include "journal-authenticate.h"
cec736d2
LP
37#include "journal-def.h"
38#include "journal-file.h"
39#include "lookup3.h"
6bedfcbb 40#include "parse-util.h"
5d1ce257 41#include "path-util.h"
3df3e884 42#include "random-util.h"
7a24f3bf 43#include "sd-event.h"
b58c888f 44#include "set.h"
07630cea 45#include "string-util.h"
4761fd0f 46#include "strv.h"
89a5a90c 47#include "xattr-util.h"
cec736d2 48
4a92baf3
LP
49#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
50#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
cec736d2 51
be19b7df 52#define COMPRESSION_SIZE_THRESHOLD (512ULL)
807e17f0 53
babfc091 54/* This is the minimum journal file size */
16098e93 55#define JOURNAL_FILE_SIZE_MIN (512ULL*1024ULL) /* 512 KiB */
babfc091
LP
56
57/* These are the lower and upper bounds if we deduce the max_use value
58 * from the file system size */
59#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
60#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
61
8580d1f7
LP
62/* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
63#define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */
64
babfc091 65/* This is the upper bound if we deduce max_size from max_use */
71100051 66#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
babfc091
LP
67
68/* This is the upper bound if we deduce the keep_free value from the
69 * file system size */
70#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
71
72/* This is the keep_free value when we can't determine the system
73 * size */
74#define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
75
8580d1f7
LP
76/* This is the default maximum number of journal files to keep around. */
77#define DEFAULT_N_MAX_FILES (100)
78
dca6219e
LP
79/* n_data was the first entry we added after the initial file format design */
80#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
cec736d2 81
a4bcff5b
LP
82/* How many entries to keep in the entry array chain cache at max */
83#define CHAIN_CACHE_MAX 20
84
a676e665
LP
85/* How much to increase the journal file size at once each time we allocate something new. */
86#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
87
2678031a
LP
88/* Reread fstat() of the file for detecting deletions at least this often */
89#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
90
fa6ac760
LP
91/* The mmap context to use for the header: we pick the one right above the last defined object type */
92#define CONTEXT_HEADER _OBJECT_TYPE_MAX
93
51804460
ZJS
94#ifdef __clang__
95# pragma GCC diagnostic ignored "-Waddress-of-packed-member"
96#endif
97
ac2e41f5
VC
98/* This may be called from a separate thread to prevent blocking the caller for the duration of fsync().
99 * As a result we use atomic operations on f->offline_state for inter-thread communications with
100 * journal_file_set_offline() and journal_file_set_online(). */
101static void journal_file_set_offline_internal(JournalFile *f) {
26687bf8 102 assert(f);
ac2e41f5
VC
103 assert(f->fd >= 0);
104 assert(f->header);
105
106 for (;;) {
107 switch (f->offline_state) {
108 case OFFLINE_CANCEL:
109 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_DONE))
110 continue;
111 return;
112
113 case OFFLINE_AGAIN_FROM_SYNCING:
114 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_SYNCING))
115 continue;
116 break;
117
118 case OFFLINE_AGAIN_FROM_OFFLINING:
119 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_SYNCING))
120 continue;
121 break;
122
123 case OFFLINE_SYNCING:
124 (void) fsync(f->fd);
26687bf8 125
ac2e41f5
VC
126 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_OFFLINING))
127 continue;
26687bf8 128
8eb85171 129 f->header->state = f->archive ? STATE_ARCHIVED : STATE_OFFLINE;
ac2e41f5
VC
130 (void) fsync(f->fd);
131 break;
132
133 case OFFLINE_OFFLINING:
134 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_DONE))
135 continue;
4831981d 136 _fallthrough_;
ac2e41f5
VC
137 case OFFLINE_DONE:
138 return;
139
140 case OFFLINE_JOINED:
141 log_debug("OFFLINE_JOINED unexpected offline state for journal_file_set_offline_internal()");
142 return;
143 }
144 }
145}
146
147static void * journal_file_set_offline_thread(void *arg) {
148 JournalFile *f = arg;
149
150 journal_file_set_offline_internal(f);
151
152 return NULL;
153}
154
155static int journal_file_set_offline_thread_join(JournalFile *f) {
156 int r;
157
158 assert(f);
159
160 if (f->offline_state == OFFLINE_JOINED)
161 return 0;
162
163 r = pthread_join(f->offline_thread, NULL);
164 if (r)
165 return -r;
166
167 f->offline_state = OFFLINE_JOINED;
26687bf8 168
be7cdd8e 169 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
170 return -EIO;
171
ac2e41f5
VC
172 return 0;
173}
26687bf8 174
ac2e41f5
VC
175/* Trigger a restart if the offline thread is mid-flight in a restartable state. */
176static bool journal_file_set_offline_try_restart(JournalFile *f) {
177 for (;;) {
178 switch (f->offline_state) {
179 case OFFLINE_AGAIN_FROM_SYNCING:
180 case OFFLINE_AGAIN_FROM_OFFLINING:
181 return true;
182
183 case OFFLINE_CANCEL:
184 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_AGAIN_FROM_SYNCING))
185 continue;
186 return true;
187
188 case OFFLINE_SYNCING:
189 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_AGAIN_FROM_SYNCING))
190 continue;
191 return true;
192
193 case OFFLINE_OFFLINING:
194 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_AGAIN_FROM_OFFLINING))
195 continue;
196 return true;
26687bf8
OS
197
198 default:
ac2e41f5
VC
199 return false;
200 }
26687bf8
OS
201 }
202}
203
ac2e41f5
VC
204/* Sets a journal offline.
205 *
206 * If wait is false then an offline is dispatched in a separate thread for a
207 * subsequent journal_file_set_offline() or journal_file_set_online() of the
208 * same journal to synchronize with.
209 *
210 * If wait is true, then either an existing offline thread will be restarted
211 * and joined, or if none exists the offline is simply performed in this
212 * context without involving another thread.
213 */
214int journal_file_set_offline(JournalFile *f, bool wait) {
215 bool restarted;
216 int r;
217
26687bf8
OS
218 assert(f);
219
220 if (!f->writable)
221 return -EPERM;
222
223 if (!(f->fd >= 0 && f->header))
224 return -EINVAL;
225
b8f99e27
VC
226 /* An offlining journal is implicitly online and may modify f->header->state,
227 * we must also join any potentially lingering offline thread when not online. */
228 if (!journal_file_is_offlining(f) && f->header->state != STATE_ONLINE)
229 return journal_file_set_offline_thread_join(f);
26687bf8 230
ac2e41f5
VC
231 /* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */
232 restarted = journal_file_set_offline_try_restart(f);
233 if ((restarted && wait) || !restarted) {
234 r = journal_file_set_offline_thread_join(f);
235 if (r < 0)
236 return r;
237 }
26687bf8 238
ac2e41f5
VC
239 if (restarted)
240 return 0;
241
242 /* Initiate a new offline. */
243 f->offline_state = OFFLINE_SYNCING;
fa6ac760 244
ac2e41f5
VC
245 if (wait) /* Without using a thread if waiting. */
246 journal_file_set_offline_internal(f);
247 else {
248 r = pthread_create(&f->offline_thread, NULL, journal_file_set_offline_thread, f);
ec9ffa2c
VC
249 if (r > 0) {
250 f->offline_state = OFFLINE_JOINED;
ac2e41f5 251 return -r;
ec9ffa2c 252 }
ac2e41f5
VC
253 }
254
255 return 0;
256}
257
258static int journal_file_set_online(JournalFile *f) {
259 bool joined = false;
260
261 assert(f);
262
263 if (!f->writable)
264 return -EPERM;
265
266 if (!(f->fd >= 0 && f->header))
267 return -EINVAL;
268
269 while (!joined) {
270 switch (f->offline_state) {
271 case OFFLINE_JOINED:
272 /* No offline thread, no need to wait. */
273 joined = true;
274 break;
275
276 case OFFLINE_SYNCING:
277 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_CANCEL))
278 continue;
279 /* Canceled syncing prior to offlining, no need to wait. */
280 break;
281
282 case OFFLINE_AGAIN_FROM_SYNCING:
283 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_CANCEL))
284 continue;
285 /* Canceled restart from syncing, no need to wait. */
286 break;
287
288 case OFFLINE_AGAIN_FROM_OFFLINING:
289 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_CANCEL))
290 continue;
291 /* Canceled restart from offlining, must wait for offlining to complete however. */
4831981d 292 _fallthrough_;
ac2e41f5
VC
293 default: {
294 int r;
295
296 r = journal_file_set_offline_thread_join(f);
297 if (r < 0)
298 return r;
299
300 joined = true;
301 break;
302 }
303 }
304 }
26687bf8 305
be7cdd8e 306 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
307 return -EIO;
308
ac2e41f5
VC
309 switch (f->header->state) {
310 case STATE_ONLINE:
311 return 0;
26687bf8 312
ac2e41f5
VC
313 case STATE_OFFLINE:
314 f->header->state = STATE_ONLINE;
315 (void) fsync(f->fd);
316 return 0;
317
318 default:
319 return -EINVAL;
320 }
26687bf8
OS
321}
322
b58c888f
VC
323bool journal_file_is_offlining(JournalFile *f) {
324 assert(f);
325
326 __sync_synchronize();
327
3742095b 328 if (IN_SET(f->offline_state, OFFLINE_DONE, OFFLINE_JOINED))
b58c888f
VC
329 return false;
330
331 return true;
332}
333
804ae586 334JournalFile* journal_file_close(JournalFile *f) {
de190aef 335 assert(f);
cec736d2 336
349cc4a5 337#if HAVE_GCRYPT
b0af6f41 338 /* Write the final tag */
43cd8794
FB
339 if (f->seal && f->writable) {
340 int r;
341
342 r = journal_file_append_tag(f);
343 if (r < 0)
344 log_error_errno(r, "Failed to append tag when closing journal: %m");
345 }
feb12d3e 346#endif
b0af6f41 347
7a24f3bf
VC
348 if (f->post_change_timer) {
349 int enabled;
350
351 if (sd_event_source_get_enabled(f->post_change_timer, &enabled) >= 0)
352 if (enabled == SD_EVENT_ONESHOT)
353 journal_file_post_change(f);
354
e167d7fd 355 (void) sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_OFF);
7a24f3bf
VC
356 sd_event_source_unref(f->post_change_timer);
357 }
358
ac2e41f5 359 journal_file_set_offline(f, true);
cec736d2 360
be7cdd8e
VC
361 if (f->mmap && f->cache_fd)
362 mmap_cache_free_fd(f->mmap, f->cache_fd);
cec736d2 363
11689d2a
LP
364 if (f->fd >= 0 && f->defrag_on_close) {
365
366 /* Be friendly to btrfs: turn COW back on again now,
367 * and defragment the file. We won't write to the file
368 * ever again, hence remove all fragmentation, and
369 * reenable all the good bits COW usually provides
370 * (such as data checksumming). */
371
1ed8f8c1 372 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
11689d2a
LP
373 (void) btrfs_defrag_fd(f->fd);
374 }
f27a3864 375
5d1ce257
LP
376 if (f->close_fd)
377 safe_close(f->fd);
cec736d2 378 free(f->path);
807e17f0 379
f649045c 380 mmap_cache_unref(f->mmap);
16e9f408 381
4743015d 382 ordered_hashmap_free_free(f->chain_cache);
a4bcff5b 383
349cc4a5 384#if HAVE_XZ || HAVE_LZ4
807e17f0
LP
385 free(f->compress_buffer);
386#endif
387
349cc4a5 388#if HAVE_GCRYPT
baed47c3
LP
389 if (f->fss_file)
390 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
dc4ebc07 391 else
b7c9ae91
LP
392 free(f->fsprg_state);
393
394 free(f->fsprg_seed);
7560fffc
LP
395
396 if (f->hmac)
397 gcry_md_close(f->hmac);
398#endif
399
6b430fdb 400 return mfree(f);
cec736d2
LP
401}
402
0ac38b70 403static int journal_file_init_header(JournalFile *f, JournalFile *template) {
d89c8fdf 404 Header h = {};
cec736d2
LP
405 ssize_t k;
406 int r;
407
408 assert(f);
409
7560fffc 410 memcpy(h.signature, HEADER_SIGNATURE, 8);
23b0b2b2 411 h.header_size = htole64(ALIGN64(sizeof(h)));
cec736d2 412
d89c8fdf
ZJS
413 h.incompatible_flags |= htole32(
414 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
415 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
7560fffc 416
d89c8fdf
ZJS
417 h.compatible_flags = htole32(
418 f->seal * HEADER_COMPATIBLE_SEALED);
7560fffc 419
cec736d2
LP
420 r = sd_id128_randomize(&h.file_id);
421 if (r < 0)
422 return r;
423
0ac38b70
LP
424 if (template) {
425 h.seqnum_id = template->header->seqnum_id;
beec0085 426 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
0ac38b70
LP
427 } else
428 h.seqnum_id = h.file_id;
cec736d2
LP
429
430 k = pwrite(f->fd, &h, sizeof(h), 0);
431 if (k < 0)
432 return -errno;
433
434 if (k != sizeof(h))
435 return -EIO;
436
437 return 0;
438}
439
a0fe2a2d
LP
440static int fsync_directory_of_file(int fd) {
441 _cleanup_free_ char *path = NULL, *dn = NULL;
442 _cleanup_close_ int dfd = -1;
443 struct stat st;
444 int r;
445
446 if (fstat(fd, &st) < 0)
447 return -errno;
448
449 if (!S_ISREG(st.st_mode))
450 return -EBADFD;
451
452 r = fd_get_path(fd, &path);
453 if (r < 0)
454 return r;
455
456 if (!path_is_absolute(path))
457 return -EINVAL;
458
459 dn = dirname_malloc(path);
460 if (!dn)
461 return -ENOMEM;
462
463 dfd = open(dn, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
464 if (dfd < 0)
465 return -errno;
466
467 if (fsync(dfd) < 0)
468 return -errno;
469
470 return 0;
471}
472
cec736d2 473static int journal_file_refresh_header(JournalFile *f) {
de190aef 474 sd_id128_t boot_id;
fa6ac760 475 int r;
cec736d2
LP
476
477 assert(f);
c88cc6af 478 assert(f->header);
cec736d2
LP
479
480 r = sd_id128_get_machine(&f->header->machine_id);
481 if (r < 0)
482 return r;
483
de190aef 484 r = sd_id128_get_boot(&boot_id);
cec736d2
LP
485 if (r < 0)
486 return r;
487
de190aef
LP
488 if (sd_id128_equal(boot_id, f->header->boot_id))
489 f->tail_entry_monotonic_valid = true;
490
491 f->header->boot_id = boot_id;
492
fa6ac760 493 r = journal_file_set_online(f);
b788cc23 494
7560fffc 495 /* Sync the online state to disk */
fb426037 496 (void) fsync(f->fd);
b788cc23 497
a0fe2a2d
LP
498 /* We likely just created a new file, also sync the directory this file is located in. */
499 (void) fsync_directory_of_file(f->fd);
500
fa6ac760 501 return r;
cec736d2
LP
502}
503
4214009f
ZJS
504static bool warn_wrong_flags(const JournalFile *f, bool compatible) {
505 const uint32_t any = compatible ? HEADER_COMPATIBLE_ANY : HEADER_INCOMPATIBLE_ANY,
506 supported = compatible ? HEADER_COMPATIBLE_SUPPORTED : HEADER_INCOMPATIBLE_SUPPORTED;
507 const char *type = compatible ? "compatible" : "incompatible";
d89c8fdf
ZJS
508 uint32_t flags;
509
4214009f
ZJS
510 flags = le32toh(compatible ? f->header->compatible_flags : f->header->incompatible_flags);
511
512 if (flags & ~supported) {
513 if (flags & ~any)
4761fd0f 514 log_debug("Journal file %s has unknown %s flags 0x%"PRIx32,
4214009f
ZJS
515 f->path, type, flags & ~any);
516 flags = (flags & any) & ~supported;
4761fd0f
ZJS
517 if (flags) {
518 const char* strv[3];
519 unsigned n = 0;
520 _cleanup_free_ char *t = NULL;
521
522 if (compatible && (flags & HEADER_COMPATIBLE_SEALED))
523 strv[n++] = "sealed";
524 if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_XZ))
525 strv[n++] = "xz-compressed";
526 if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_LZ4))
527 strv[n++] = "lz4-compressed";
528 strv[n] = NULL;
529 assert(n < ELEMENTSOF(strv));
530
531 t = strv_join((char**) strv, ", ");
532 log_debug("Journal file %s uses %s %s %s disabled at compilation time.",
533 f->path, type, n > 1 ? "flags" : "flag", strnull(t));
534 }
4214009f
ZJS
535 return true;
536 }
537
538 return false;
539}
540
541static int journal_file_verify_header(JournalFile *f) {
6f94e420
TS
542 uint64_t arena_size, header_size;
543
cec736d2 544 assert(f);
c88cc6af 545 assert(f->header);
cec736d2 546
7560fffc 547 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
cec736d2
LP
548 return -EBADMSG;
549
4214009f
ZJS
550 /* In both read and write mode we refuse to open files with incompatible
551 * flags we don't know. */
552 if (warn_wrong_flags(f, false))
cec736d2
LP
553 return -EPROTONOSUPPORT;
554
4214009f
ZJS
555 /* When open for writing we refuse to open files with compatible flags, too. */
556 if (f->writable && warn_wrong_flags(f, true))
d89c8fdf 557 return -EPROTONOSUPPORT;
7560fffc 558
db11ac1a
LP
559 if (f->header->state >= _STATE_MAX)
560 return -EBADMSG;
561
6f94e420
TS
562 header_size = le64toh(f->header->header_size);
563
dca6219e 564 /* The first addition was n_data, so check that we are at least this large */
6f94e420 565 if (header_size < HEADER_SIZE_MIN)
23b0b2b2
LP
566 return -EBADMSG;
567
8088cbd3 568 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
beec0085
LP
569 return -EBADMSG;
570
6f94e420
TS
571 arena_size = le64toh(f->header->arena_size);
572
573 if (UINT64_MAX - header_size < arena_size || header_size + arena_size > (uint64_t) f->last_stat.st_size)
db11ac1a
LP
574 return -ENODATA;
575
6f94e420 576 if (le64toh(f->header->tail_object_offset) > header_size + arena_size)
db11ac1a
LP
577 return -ENODATA;
578
7762e02b
LP
579 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
580 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
581 !VALID64(le64toh(f->header->tail_object_offset)) ||
582 !VALID64(le64toh(f->header->entry_array_offset)))
583 return -ENODATA;
584
cec736d2 585 if (f->writable) {
cec736d2 586 sd_id128_t machine_id;
ae739cc1 587 uint8_t state;
cec736d2
LP
588 int r;
589
590 r = sd_id128_get_machine(&machine_id);
591 if (r < 0)
592 return r;
593
594 if (!sd_id128_equal(machine_id, f->header->machine_id))
595 return -EHOSTDOWN;
596
de190aef 597 state = f->header->state;
cec736d2 598
b288cdeb
ZJS
599 if (state == STATE_ARCHIVED)
600 return -ESHUTDOWN; /* Already archived */
601 else if (state == STATE_ONLINE) {
71fa6f00
LP
602 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
603 return -EBUSY;
b288cdeb 604 } else if (state != STATE_OFFLINE) {
8facc349 605 log_debug("Journal file %s has unknown state %i.", f->path, state);
71fa6f00
LP
606 return -EBUSY;
607 }
ae739cc1 608
5b3cc0c8
YN
609 if (f->header->field_hash_table_size == 0 || f->header->data_hash_table_size == 0)
610 return -EBADMSG;
611
ae739cc1
LP
612 /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't
613 * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks
614 * bisection. */
615 if (le64toh(f->header->tail_entry_realtime) > now(CLOCK_REALTIME)) {
616 log_debug("Journal file %s is from the future, refusing to append new data to it that'd be older.", f->path);
617 return -ETXTBSY;
618 }
cec736d2
LP
619 }
620
d89c8fdf
ZJS
621 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
622 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
c586dbf1 623
f1889c91 624 f->seal = JOURNAL_HEADER_SEALED(f->header);
7560fffc 625
cec736d2
LP
626 return 0;
627}
628
2678031a
LP
629static int journal_file_fstat(JournalFile *f) {
630 assert(f);
631 assert(f->fd >= 0);
632
633 if (fstat(f->fd, &f->last_stat) < 0)
634 return -errno;
635
636 f->last_stat_usec = now(CLOCK_MONOTONIC);
637
638 /* Refuse appending to files that are already deleted */
639 if (f->last_stat.st_nlink <= 0)
640 return -EIDRM;
641
642 return 0;
643}
644
cec736d2 645static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
eda4b58b 646 uint64_t old_size, new_size;
fec2aa2f 647 int r;
cec736d2
LP
648
649 assert(f);
c88cc6af 650 assert(f->header);
cec736d2 651
cec736d2 652 /* We assume that this file is not sparse, and we know that
38ac38b2 653 * for sure, since we always call posix_fallocate()
cec736d2
LP
654 * ourselves */
655
be7cdd8e 656 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
657 return -EIO;
658
cec736d2 659 old_size =
23b0b2b2 660 le64toh(f->header->header_size) +
cec736d2
LP
661 le64toh(f->header->arena_size);
662
bc85bfee 663 new_size = PAGE_ALIGN(offset + size);
23b0b2b2
LP
664 if (new_size < le64toh(f->header->header_size))
665 new_size = le64toh(f->header->header_size);
bc85bfee 666
2678031a
LP
667 if (new_size <= old_size) {
668
669 /* We already pre-allocated enough space, but before
670 * we write to it, let's check with fstat() if the
671 * file got deleted, in order make sure we don't throw
672 * away the data immediately. Don't check fstat() for
673 * all writes though, but only once ever 10s. */
674
675 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
676 return 0;
677
678 return journal_file_fstat(f);
679 }
680
681 /* Allocate more space. */
cec736d2 682
a676e665 683 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
bc85bfee 684 return -E2BIG;
cec736d2 685
a676e665 686 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
cec736d2
LP
687 struct statvfs svfs;
688
689 if (fstatvfs(f->fd, &svfs) >= 0) {
690 uint64_t available;
691
070052ab 692 available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
cec736d2
LP
693
694 if (new_size - old_size > available)
695 return -E2BIG;
696 }
697 }
698
eda4b58b
LP
699 /* Increase by larger blocks at once */
700 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
701 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
702 new_size = f->metrics.max_size;
703
bc85bfee
LP
704 /* Note that the glibc fallocate() fallback is very
705 inefficient, hence we try to minimize the allocation area
706 as we can. */
fec2aa2f
GV
707 r = posix_fallocate(f->fd, old_size, new_size - old_size);
708 if (r != 0)
709 return -r;
cec736d2 710
23b0b2b2 711 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
cec736d2 712
2678031a 713 return journal_file_fstat(f);
cec736d2
LP
714}
715
78519831 716static unsigned type_to_context(ObjectType type) {
d3d3208f 717 /* One context for each type, plus one catch-all for the rest */
69adae51 718 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
fa6ac760 719 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
d05089d8 720 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
d3d3208f
MS
721}
722
b439282e 723static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret, size_t *ret_size) {
2678031a
LP
724 int r;
725
cec736d2 726 assert(f);
cec736d2
LP
727 assert(ret);
728
7762e02b
LP
729 if (size <= 0)
730 return -EINVAL;
731
2a59ea54 732 /* Avoid SIGBUS on invalid accesses */
4bbdcdb3
LP
733 if (offset + size > (uint64_t) f->last_stat.st_size) {
734 /* Hmm, out of range? Let's refresh the fstat() data
735 * first, before we trust that check. */
736
2678031a
LP
737 r = journal_file_fstat(f);
738 if (r < 0)
739 return r;
740
741 if (offset + size > (uint64_t) f->last_stat.st_size)
4bbdcdb3
LP
742 return -EADDRNOTAVAIL;
743 }
744
b439282e 745 return mmap_cache_get(f->mmap, f->cache_fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret, ret_size);
cec736d2
LP
746}
747
16e9f408
LP
748static uint64_t minimum_header_size(Object *o) {
749
b8e891e6 750 static const uint64_t table[] = {
16e9f408
LP
751 [OBJECT_DATA] = sizeof(DataObject),
752 [OBJECT_FIELD] = sizeof(FieldObject),
753 [OBJECT_ENTRY] = sizeof(EntryObject),
754 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
755 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
756 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
757 [OBJECT_TAG] = sizeof(TagObject),
758 };
759
760 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
761 return sizeof(ObjectHeader);
762
763 return table[o->object.type];
764}
765
24754f36
TR
766/* Lightweight object checks. We want this to be fast, so that we won't
767 * slowdown every journal_file_move_to_object() call too much. */
768static int journal_file_check_object(JournalFile *f, uint64_t offset, Object *o) {
769 assert(f);
770 assert(o);
771
772 switch (o->object.type) {
773
774 case OBJECT_DATA: {
775 if ((le64toh(o->data.entry_offset) == 0) ^ (le64toh(o->data.n_entries) == 0)) {
776 log_debug("Bad n_entries: %"PRIu64": %"PRIu64,
10e8445b 777 le64toh(o->data.n_entries), offset);
24754f36
TR
778 return -EBADMSG;
779 }
780
781 if (le64toh(o->object.size) - offsetof(DataObject, payload) <= 0) {
782 log_debug("Bad object size (<= %zu): %"PRIu64": %"PRIu64,
783 offsetof(DataObject, payload),
784 le64toh(o->object.size),
785 offset);
786 return -EBADMSG;
787 }
788
10e8445b
TR
789 if (!VALID64(le64toh(o->data.next_hash_offset)) ||
790 !VALID64(le64toh(o->data.next_field_offset)) ||
791 !VALID64(le64toh(o->data.entry_offset)) ||
792 !VALID64(le64toh(o->data.entry_array_offset))) {
24754f36
TR
793 log_debug("Invalid offset, next_hash_offset="OFSfmt", next_field_offset="OFSfmt
794 ", entry_offset="OFSfmt", entry_array_offset="OFSfmt": %"PRIu64,
10e8445b
TR
795 le64toh(o->data.next_hash_offset),
796 le64toh(o->data.next_field_offset),
797 le64toh(o->data.entry_offset),
798 le64toh(o->data.entry_array_offset),
24754f36
TR
799 offset);
800 return -EBADMSG;
801 }
802
803 break;
804 }
805
806 case OBJECT_FIELD:
807 if (le64toh(o->object.size) - offsetof(FieldObject, payload) <= 0) {
808 log_debug(
809 "Bad field size (<= %zu): %"PRIu64": %"PRIu64,
810 offsetof(FieldObject, payload),
811 le64toh(o->object.size),
812 offset);
813 return -EBADMSG;
814 }
815
10e8445b
TR
816 if (!VALID64(le64toh(o->field.next_hash_offset)) ||
817 !VALID64(le64toh(o->field.head_data_offset))) {
24754f36
TR
818 log_debug(
819 "Invalid offset, next_hash_offset="OFSfmt
820 ", head_data_offset="OFSfmt": %"PRIu64,
10e8445b
TR
821 le64toh(o->field.next_hash_offset),
822 le64toh(o->field.head_data_offset),
24754f36
TR
823 offset);
824 return -EBADMSG;
825 }
826 break;
827
828 case OBJECT_ENTRY:
829 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) % sizeof(EntryItem) != 0) {
830 log_debug(
831 "Bad entry size (<= %zu): %"PRIu64": %"PRIu64,
832 offsetof(EntryObject, items),
833 le64toh(o->object.size),
834 offset);
835 return -EBADMSG;
836 }
837
838 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem) <= 0) {
839 log_debug(
840 "Invalid number items in entry: %"PRIu64": %"PRIu64,
841 (le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem),
842 offset);
843 return -EBADMSG;
844 }
845
846 if (le64toh(o->entry.seqnum) <= 0) {
847 log_debug(
848 "Invalid entry seqnum: %"PRIx64": %"PRIu64,
849 le64toh(o->entry.seqnum),
850 offset);
851 return -EBADMSG;
852 }
853
854 if (!VALID_REALTIME(le64toh(o->entry.realtime))) {
855 log_debug(
856 "Invalid entry realtime timestamp: %"PRIu64": %"PRIu64,
857 le64toh(o->entry.realtime),
858 offset);
859 return -EBADMSG;
860 }
861
862 if (!VALID_MONOTONIC(le64toh(o->entry.monotonic))) {
863 log_debug(
864 "Invalid entry monotonic timestamp: %"PRIu64": %"PRIu64,
865 le64toh(o->entry.monotonic),
866 offset);
867 return -EBADMSG;
868 }
869
870 break;
871
872 case OBJECT_DATA_HASH_TABLE:
873 case OBJECT_FIELD_HASH_TABLE:
874 if ((le64toh(o->object.size) - offsetof(HashTableObject, items)) % sizeof(HashItem) != 0 ||
875 (le64toh(o->object.size) - offsetof(HashTableObject, items)) / sizeof(HashItem) <= 0) {
876 log_debug(
877 "Invalid %s hash table size: %"PRIu64": %"PRIu64,
878 o->object.type == OBJECT_DATA_HASH_TABLE ? "data" : "field",
879 le64toh(o->object.size),
880 offset);
881 return -EBADMSG;
882 }
883
884 break;
885
886 case OBJECT_ENTRY_ARRAY:
887 if ((le64toh(o->object.size) - offsetof(EntryArrayObject, items)) % sizeof(le64_t) != 0 ||
888 (le64toh(o->object.size) - offsetof(EntryArrayObject, items)) / sizeof(le64_t) <= 0) {
889 log_debug(
890 "Invalid object entry array size: %"PRIu64": %"PRIu64,
891 le64toh(o->object.size),
892 offset);
893 return -EBADMSG;
894 }
895
10e8445b 896 if (!VALID64(le64toh(o->entry_array.next_entry_array_offset))) {
24754f36
TR
897 log_debug(
898 "Invalid object entry array next_entry_array_offset: "OFSfmt": %"PRIu64,
10e8445b 899 le64toh(o->entry_array.next_entry_array_offset),
24754f36
TR
900 offset);
901 return -EBADMSG;
902 }
903
904 break;
905
906 case OBJECT_TAG:
907 if (le64toh(o->object.size) != sizeof(TagObject)) {
908 log_debug(
909 "Invalid object tag size: %"PRIu64": %"PRIu64,
910 le64toh(o->object.size),
911 offset);
912 return -EBADMSG;
913 }
914
10e8445b 915 if (!VALID_EPOCH(le64toh(o->tag.epoch))) {
24754f36
TR
916 log_debug(
917 "Invalid object tag epoch: %"PRIu64": %"PRIu64,
10e8445b 918 le64toh(o->tag.epoch),
24754f36
TR
919 offset);
920 return -EBADMSG;
921 }
922
923 break;
924 }
925
926 return 0;
927}
928
78519831 929int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
cec736d2
LP
930 int r;
931 void *t;
b439282e 932 size_t tsize;
cec736d2
LP
933 Object *o;
934 uint64_t s;
935
936 assert(f);
937 assert(ret);
938
db11ac1a 939 /* Objects may only be located at multiple of 64 bit */
202fd896
LP
940 if (!VALID64(offset)) {
941 log_debug("Attempt to move to object at non-64bit boundary: %" PRIu64, offset);
bd30fdf2 942 return -EBADMSG;
202fd896 943 }
db11ac1a 944
50809d7a 945 /* Object may not be located in the file header */
202fd896
LP
946 if (offset < le64toh(f->header->header_size)) {
947 log_debug("Attempt to move to object located in file header: %" PRIu64, offset);
50809d7a 948 return -EBADMSG;
202fd896 949 }
50809d7a 950
b439282e 951 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t, &tsize);
cec736d2
LP
952 if (r < 0)
953 return r;
954
955 o = (Object*) t;
956 s = le64toh(o->object.size);
957
1c69f096
LP
958 if (s == 0) {
959 log_debug("Attempt to move to uninitialized object: %" PRIu64, offset);
960 return -EBADMSG;
961 }
202fd896
LP
962 if (s < sizeof(ObjectHeader)) {
963 log_debug("Attempt to move to overly short object: %" PRIu64, offset);
cec736d2 964 return -EBADMSG;
202fd896 965 }
cec736d2 966
202fd896
LP
967 if (o->object.type <= OBJECT_UNUSED) {
968 log_debug("Attempt to move to object with invalid type: %" PRIu64, offset);
16e9f408 969 return -EBADMSG;
202fd896 970 }
16e9f408 971
202fd896
LP
972 if (s < minimum_header_size(o)) {
973 log_debug("Attempt to move to truncated object: %" PRIu64, offset);
16e9f408 974 return -EBADMSG;
202fd896 975 }
16e9f408 976
202fd896
LP
977 if (type > OBJECT_UNUSED && o->object.type != type) {
978 log_debug("Attempt to move to object of unexpected type: %" PRIu64, offset);
cec736d2 979 return -EBADMSG;
202fd896 980 }
cec736d2 981
b439282e
VC
982 if (s > tsize) {
983 r = journal_file_move_to(f, type, false, offset, s, &t, NULL);
cec736d2
LP
984 if (r < 0)
985 return r;
986
987 o = (Object*) t;
988 }
989
24754f36
TR
990 r = journal_file_check_object(f, offset, o);
991 if (r < 0)
992 return r;
993
cec736d2
LP
994 *ret = o;
995 return 0;
996}
997
d98cc1f2 998static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
cec736d2
LP
999 uint64_t r;
1000
1001 assert(f);
c88cc6af 1002 assert(f->header);
cec736d2 1003
beec0085 1004 r = le64toh(f->header->tail_entry_seqnum) + 1;
c2373f84
LP
1005
1006 if (seqnum) {
de190aef 1007 /* If an external seqnum counter was passed, we update
c2373f84
LP
1008 * both the local and the external one, and set it to
1009 * the maximum of both */
1010
1011 if (*seqnum + 1 > r)
1012 r = *seqnum + 1;
1013
1014 *seqnum = r;
1015 }
1016
beec0085 1017 f->header->tail_entry_seqnum = htole64(r);
cec736d2 1018
beec0085
LP
1019 if (f->header->head_entry_seqnum == 0)
1020 f->header->head_entry_seqnum = htole64(r);
de190aef 1021
cec736d2
LP
1022 return r;
1023}
1024
78519831 1025int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
cec736d2
LP
1026 int r;
1027 uint64_t p;
1028 Object *tail, *o;
1029 void *t;
1030
1031 assert(f);
c88cc6af 1032 assert(f->header);
d05089d8 1033 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
cec736d2
LP
1034 assert(size >= sizeof(ObjectHeader));
1035 assert(offset);
1036 assert(ret);
1037
26687bf8
OS
1038 r = journal_file_set_online(f);
1039 if (r < 0)
1040 return r;
1041
cec736d2 1042 p = le64toh(f->header->tail_object_offset);
cec736d2 1043 if (p == 0)
23b0b2b2 1044 p = le64toh(f->header->header_size);
cec736d2 1045 else {
d05089d8 1046 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
cec736d2
LP
1047 if (r < 0)
1048 return r;
1049
1050 p += ALIGN64(le64toh(tail->object.size));
1051 }
1052
1053 r = journal_file_allocate(f, p, size);
1054 if (r < 0)
1055 return r;
1056
b439282e 1057 r = journal_file_move_to(f, type, false, p, size, &t, NULL);
cec736d2
LP
1058 if (r < 0)
1059 return r;
1060
1061 o = (Object*) t;
1062
1063 zero(o->object);
de190aef 1064 o->object.type = type;
cec736d2
LP
1065 o->object.size = htole64(size);
1066
1067 f->header->tail_object_offset = htole64(p);
cec736d2
LP
1068 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
1069
1070 *ret = o;
1071 *offset = p;
1072
1073 return 0;
1074}
1075
de190aef 1076static int journal_file_setup_data_hash_table(JournalFile *f) {
cec736d2
LP
1077 uint64_t s, p;
1078 Object *o;
1079 int r;
1080
1081 assert(f);
c88cc6af 1082 assert(f->header);
cec736d2 1083
070052ab
LP
1084 /* We estimate that we need 1 hash table entry per 768 bytes
1085 of journal file and we want to make sure we never get
1086 beyond 75% fill level. Calculate the hash table size for
1087 the maximum file size based on these metrics. */
4a92baf3 1088
dfabe643 1089 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
4a92baf3
LP
1090 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
1091 s = DEFAULT_DATA_HASH_TABLE_SIZE;
1092
507f22bd 1093 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
4a92baf3 1094
de190aef
LP
1095 r = journal_file_append_object(f,
1096 OBJECT_DATA_HASH_TABLE,
1097 offsetof(Object, hash_table.items) + s,
1098 &o, &p);
cec736d2
LP
1099 if (r < 0)
1100 return r;
1101
29804cc1 1102 memzero(o->hash_table.items, s);
cec736d2 1103
de190aef
LP
1104 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1105 f->header->data_hash_table_size = htole64(s);
cec736d2
LP
1106
1107 return 0;
1108}
1109
de190aef 1110static int journal_file_setup_field_hash_table(JournalFile *f) {
cec736d2
LP
1111 uint64_t s, p;
1112 Object *o;
1113 int r;
1114
1115 assert(f);
c88cc6af 1116 assert(f->header);
cec736d2 1117
3c1668da
LP
1118 /* We use a fixed size hash table for the fields as this
1119 * number should grow very slowly only */
1120
de190aef
LP
1121 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
1122 r = journal_file_append_object(f,
1123 OBJECT_FIELD_HASH_TABLE,
1124 offsetof(Object, hash_table.items) + s,
1125 &o, &p);
cec736d2
LP
1126 if (r < 0)
1127 return r;
1128
29804cc1 1129 memzero(o->hash_table.items, s);
cec736d2 1130
de190aef
LP
1131 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1132 f->header->field_hash_table_size = htole64(s);
cec736d2
LP
1133
1134 return 0;
1135}
1136
dade37d4 1137int journal_file_map_data_hash_table(JournalFile *f) {
cec736d2
LP
1138 uint64_t s, p;
1139 void *t;
1140 int r;
1141
1142 assert(f);
c88cc6af 1143 assert(f->header);
cec736d2 1144
dade37d4
LP
1145 if (f->data_hash_table)
1146 return 0;
1147
de190aef
LP
1148 p = le64toh(f->header->data_hash_table_offset);
1149 s = le64toh(f->header->data_hash_table_size);
cec736d2 1150
de190aef 1151 r = journal_file_move_to(f,
16e9f408 1152 OBJECT_DATA_HASH_TABLE,
fcde2389 1153 true,
de190aef 1154 p, s,
b42549ad 1155 &t, NULL);
cec736d2
LP
1156 if (r < 0)
1157 return r;
1158
de190aef 1159 f->data_hash_table = t;
cec736d2
LP
1160 return 0;
1161}
1162
dade37d4 1163int journal_file_map_field_hash_table(JournalFile *f) {
cec736d2
LP
1164 uint64_t s, p;
1165 void *t;
1166 int r;
1167
1168 assert(f);
c88cc6af 1169 assert(f->header);
cec736d2 1170
dade37d4
LP
1171 if (f->field_hash_table)
1172 return 0;
1173
de190aef
LP
1174 p = le64toh(f->header->field_hash_table_offset);
1175 s = le64toh(f->header->field_hash_table_size);
cec736d2 1176
de190aef 1177 r = journal_file_move_to(f,
16e9f408 1178 OBJECT_FIELD_HASH_TABLE,
fcde2389 1179 true,
de190aef 1180 p, s,
b42549ad 1181 &t, NULL);
cec736d2
LP
1182 if (r < 0)
1183 return r;
1184
de190aef 1185 f->field_hash_table = t;
cec736d2
LP
1186 return 0;
1187}
1188
3c1668da
LP
1189static int journal_file_link_field(
1190 JournalFile *f,
1191 Object *o,
1192 uint64_t offset,
1193 uint64_t hash) {
1194
805d1486 1195 uint64_t p, h, m;
3c1668da
LP
1196 int r;
1197
1198 assert(f);
c88cc6af 1199 assert(f->header);
90d222c1 1200 assert(f->field_hash_table);
3c1668da
LP
1201 assert(o);
1202 assert(offset > 0);
1203
1204 if (o->object.type != OBJECT_FIELD)
1205 return -EINVAL;
1206
805d1486
LP
1207 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
1208 if (m <= 0)
1209 return -EBADMSG;
3c1668da 1210
805d1486 1211 /* This might alter the window we are looking at */
3c1668da
LP
1212 o->field.next_hash_offset = o->field.head_data_offset = 0;
1213
805d1486 1214 h = hash % m;
3c1668da
LP
1215 p = le64toh(f->field_hash_table[h].tail_hash_offset);
1216 if (p == 0)
1217 f->field_hash_table[h].head_hash_offset = htole64(offset);
1218 else {
1219 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1220 if (r < 0)
1221 return r;
1222
1223 o->field.next_hash_offset = htole64(offset);
1224 }
1225
1226 f->field_hash_table[h].tail_hash_offset = htole64(offset);
1227
1228 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
1229 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
1230
1231 return 0;
1232}
1233
1234static int journal_file_link_data(
1235 JournalFile *f,
1236 Object *o,
1237 uint64_t offset,
1238 uint64_t hash) {
1239
805d1486 1240 uint64_t p, h, m;
cec736d2
LP
1241 int r;
1242
1243 assert(f);
c88cc6af 1244 assert(f->header);
90d222c1 1245 assert(f->data_hash_table);
cec736d2
LP
1246 assert(o);
1247 assert(offset > 0);
b588975f
LP
1248
1249 if (o->object.type != OBJECT_DATA)
1250 return -EINVAL;
cec736d2 1251
805d1486
LP
1252 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
1253 if (m <= 0)
1254 return -EBADMSG;
48496df6 1255
805d1486 1256 /* This might alter the window we are looking at */
de190aef
LP
1257 o->data.next_hash_offset = o->data.next_field_offset = 0;
1258 o->data.entry_offset = o->data.entry_array_offset = 0;
1259 o->data.n_entries = 0;
cec736d2 1260
805d1486 1261 h = hash % m;
8db4213e 1262 p = le64toh(f->data_hash_table[h].tail_hash_offset);
3c1668da 1263 if (p == 0)
cec736d2 1264 /* Only entry in the hash table is easy */
de190aef 1265 f->data_hash_table[h].head_hash_offset = htole64(offset);
3c1668da 1266 else {
48496df6
LP
1267 /* Move back to the previous data object, to patch in
1268 * pointer */
cec736d2 1269
de190aef 1270 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1271 if (r < 0)
1272 return r;
1273
de190aef 1274 o->data.next_hash_offset = htole64(offset);
cec736d2
LP
1275 }
1276
de190aef 1277 f->data_hash_table[h].tail_hash_offset = htole64(offset);
cec736d2 1278
dca6219e
LP
1279 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
1280 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
1281
cec736d2
LP
1282 return 0;
1283}
1284
3c1668da
LP
1285int journal_file_find_field_object_with_hash(
1286 JournalFile *f,
1287 const void *field, uint64_t size, uint64_t hash,
1288 Object **ret, uint64_t *offset) {
1289
805d1486 1290 uint64_t p, osize, h, m;
3c1668da
LP
1291 int r;
1292
1293 assert(f);
c88cc6af 1294 assert(f->header);
3c1668da
LP
1295 assert(field && size > 0);
1296
dade37d4
LP
1297 /* If the field hash table is empty, we can't find anything */
1298 if (le64toh(f->header->field_hash_table_size) <= 0)
1299 return 0;
1300
1301 /* Map the field hash table, if it isn't mapped yet. */
1302 r = journal_file_map_field_hash_table(f);
1303 if (r < 0)
1304 return r;
1305
3c1668da
LP
1306 osize = offsetof(Object, field.payload) + size;
1307
805d1486 1308 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
805d1486 1309 if (m <= 0)
3c1668da
LP
1310 return -EBADMSG;
1311
805d1486 1312 h = hash % m;
3c1668da
LP
1313 p = le64toh(f->field_hash_table[h].head_hash_offset);
1314
1315 while (p > 0) {
1316 Object *o;
1317
1318 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1319 if (r < 0)
1320 return r;
1321
1322 if (le64toh(o->field.hash) == hash &&
1323 le64toh(o->object.size) == osize &&
1324 memcmp(o->field.payload, field, size) == 0) {
1325
1326 if (ret)
1327 *ret = o;
1328 if (offset)
1329 *offset = p;
1330
1331 return 1;
1332 }
1333
1334 p = le64toh(o->field.next_hash_offset);
1335 }
1336
1337 return 0;
1338}
1339
1340int journal_file_find_field_object(
1341 JournalFile *f,
1342 const void *field, uint64_t size,
1343 Object **ret, uint64_t *offset) {
1344
1345 uint64_t hash;
1346
1347 assert(f);
1348 assert(field && size > 0);
1349
1350 hash = hash64(field, size);
1351
1352 return journal_file_find_field_object_with_hash(f,
1353 field, size, hash,
1354 ret, offset);
1355}
1356
de190aef
LP
1357int journal_file_find_data_object_with_hash(
1358 JournalFile *f,
1359 const void *data, uint64_t size, uint64_t hash,
1360 Object **ret, uint64_t *offset) {
48496df6 1361
805d1486 1362 uint64_t p, osize, h, m;
cec736d2
LP
1363 int r;
1364
1365 assert(f);
c88cc6af 1366 assert(f->header);
cec736d2
LP
1367 assert(data || size == 0);
1368
dade37d4
LP
1369 /* If there's no data hash table, then there's no entry. */
1370 if (le64toh(f->header->data_hash_table_size) <= 0)
1371 return 0;
1372
1373 /* Map the data hash table, if it isn't mapped yet. */
1374 r = journal_file_map_data_hash_table(f);
1375 if (r < 0)
1376 return r;
1377
cec736d2
LP
1378 osize = offsetof(Object, data.payload) + size;
1379
805d1486
LP
1380 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
1381 if (m <= 0)
bc85bfee
LP
1382 return -EBADMSG;
1383
805d1486 1384 h = hash % m;
de190aef 1385 p = le64toh(f->data_hash_table[h].head_hash_offset);
cec736d2 1386
de190aef
LP
1387 while (p > 0) {
1388 Object *o;
cec736d2 1389
de190aef 1390 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1391 if (r < 0)
1392 return r;
1393
807e17f0 1394 if (le64toh(o->data.hash) != hash)
85a131e8 1395 goto next;
807e17f0 1396
d89c8fdf 1397 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
349cc4a5 1398#if HAVE_XZ || HAVE_LZ4
fa1c4b51 1399 uint64_t l;
a7f7d1bd 1400 size_t rsize = 0;
cec736d2 1401
807e17f0
LP
1402 l = le64toh(o->object.size);
1403 if (l <= offsetof(Object, data.payload))
cec736d2
LP
1404 return -EBADMSG;
1405
807e17f0
LP
1406 l -= offsetof(Object, data.payload);
1407
d89c8fdf
ZJS
1408 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
1409 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
1410 if (r < 0)
1411 return r;
807e17f0 1412
b785c858 1413 if (rsize == size &&
807e17f0
LP
1414 memcmp(f->compress_buffer, data, size) == 0) {
1415
1416 if (ret)
1417 *ret = o;
1418
1419 if (offset)
1420 *offset = p;
1421
1422 return 1;
1423 }
3b1a55e1
ZJS
1424#else
1425 return -EPROTONOSUPPORT;
1426#endif
807e17f0
LP
1427 } else if (le64toh(o->object.size) == osize &&
1428 memcmp(o->data.payload, data, size) == 0) {
1429
cec736d2
LP
1430 if (ret)
1431 *ret = o;
1432
1433 if (offset)
1434 *offset = p;
1435
de190aef 1436 return 1;
cec736d2
LP
1437 }
1438
85a131e8 1439 next:
cec736d2
LP
1440 p = le64toh(o->data.next_hash_offset);
1441 }
1442
de190aef
LP
1443 return 0;
1444}
1445
1446int journal_file_find_data_object(
1447 JournalFile *f,
1448 const void *data, uint64_t size,
1449 Object **ret, uint64_t *offset) {
1450
1451 uint64_t hash;
1452
1453 assert(f);
1454 assert(data || size == 0);
1455
1456 hash = hash64(data, size);
1457
1458 return journal_file_find_data_object_with_hash(f,
1459 data, size, hash,
1460 ret, offset);
1461}
1462
3c1668da
LP
1463static int journal_file_append_field(
1464 JournalFile *f,
1465 const void *field, uint64_t size,
1466 Object **ret, uint64_t *offset) {
1467
1468 uint64_t hash, p;
1469 uint64_t osize;
1470 Object *o;
1471 int r;
1472
1473 assert(f);
1474 assert(field && size > 0);
1475
1476 hash = hash64(field, size);
1477
1478 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
1479 if (r < 0)
1480 return r;
1481 else if (r > 0) {
1482
1483 if (ret)
1484 *ret = o;
1485
1486 if (offset)
1487 *offset = p;
1488
1489 return 0;
1490 }
1491
1492 osize = offsetof(Object, field.payload) + size;
1493 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
8c92d4bb
LP
1494 if (r < 0)
1495 return r;
3c1668da
LP
1496
1497 o->field.hash = htole64(hash);
1498 memcpy(o->field.payload, field, size);
1499
1500 r = journal_file_link_field(f, o, p, hash);
1501 if (r < 0)
1502 return r;
1503
1504 /* The linking might have altered the window, so let's
1505 * refresh our pointer */
1506 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1507 if (r < 0)
1508 return r;
1509
349cc4a5 1510#if HAVE_GCRYPT
3c1668da
LP
1511 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1512 if (r < 0)
1513 return r;
1514#endif
1515
1516 if (ret)
1517 *ret = o;
1518
1519 if (offset)
1520 *offset = p;
1521
1522 return 0;
1523}
1524
48496df6
LP
1525static int journal_file_append_data(
1526 JournalFile *f,
1527 const void *data, uint64_t size,
1528 Object **ret, uint64_t *offset) {
1529
de190aef
LP
1530 uint64_t hash, p;
1531 uint64_t osize;
1532 Object *o;
d89c8fdf 1533 int r, compression = 0;
3c1668da 1534 const void *eq;
de190aef
LP
1535
1536 assert(f);
1537 assert(data || size == 0);
1538
1539 hash = hash64(data, size);
1540
1541 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1542 if (r < 0)
1543 return r;
0240c603 1544 if (r > 0) {
de190aef
LP
1545
1546 if (ret)
1547 *ret = o;
1548
1549 if (offset)
1550 *offset = p;
1551
1552 return 0;
1553 }
1554
1555 osize = offsetof(Object, data.payload) + size;
1556 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
cec736d2
LP
1557 if (r < 0)
1558 return r;
1559
cec736d2 1560 o->data.hash = htole64(hash);
807e17f0 1561
349cc4a5 1562#if HAVE_XZ || HAVE_LZ4
d1afbcd2 1563 if (JOURNAL_FILE_COMPRESS(f) && size >= COMPRESSION_SIZE_THRESHOLD) {
a7f7d1bd 1564 size_t rsize = 0;
807e17f0 1565
5d6f46b6 1566 compression = compress_blob(data, size, o->data.payload, size - 1, &rsize);
807e17f0 1567
d1afbcd2 1568 if (compression >= 0) {
807e17f0 1569 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
d89c8fdf 1570 o->object.flags |= compression;
807e17f0 1571
fa1c4b51 1572 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
d89c8fdf 1573 size, rsize, object_compressed_to_string(compression));
d1afbcd2
LP
1574 } else
1575 /* Compression didn't work, we don't really care why, let's continue without compression */
1576 compression = 0;
807e17f0
LP
1577 }
1578#endif
1579
75f32f04
ZJS
1580 if (compression == 0)
1581 memcpy_safe(o->data.payload, data, size);
cec736d2 1582
de190aef 1583 r = journal_file_link_data(f, o, p, hash);
cec736d2
LP
1584 if (r < 0)
1585 return r;
1586
349cc4a5 1587#if HAVE_GCRYPT
33685a5a
FB
1588 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1589 if (r < 0)
1590 return r;
1591#endif
1592
48496df6
LP
1593 /* The linking might have altered the window, so let's
1594 * refresh our pointer */
1595 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1596 if (r < 0)
1597 return r;
1598
08c6f819
SL
1599 if (!data)
1600 eq = NULL;
1601 else
1602 eq = memchr(data, '=', size);
3c1668da 1603 if (eq && eq > data) {
748db592 1604 Object *fo = NULL;
3c1668da 1605 uint64_t fp;
3c1668da
LP
1606
1607 /* Create field object ... */
1608 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1609 if (r < 0)
1610 return r;
1611
1612 /* ... and link it in. */
1613 o->data.next_field_offset = fo->field.head_data_offset;
1614 fo->field.head_data_offset = le64toh(p);
1615 }
1616
cec736d2
LP
1617 if (ret)
1618 *ret = o;
1619
1620 if (offset)
de190aef 1621 *offset = p;
cec736d2
LP
1622
1623 return 0;
1624}
1625
1626uint64_t journal_file_entry_n_items(Object *o) {
1627 assert(o);
b588975f
LP
1628
1629 if (o->object.type != OBJECT_ENTRY)
1630 return 0;
cec736d2
LP
1631
1632 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1633}
1634
0284adc6 1635uint64_t journal_file_entry_array_n_items(Object *o) {
de190aef 1636 assert(o);
b588975f
LP
1637
1638 if (o->object.type != OBJECT_ENTRY_ARRAY)
1639 return 0;
de190aef
LP
1640
1641 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1642}
1643
fb9a24b6
LP
1644uint64_t journal_file_hash_table_n_items(Object *o) {
1645 assert(o);
b588975f 1646
ec2ce0c5 1647 if (!IN_SET(o->object.type, OBJECT_DATA_HASH_TABLE, OBJECT_FIELD_HASH_TABLE))
b588975f 1648 return 0;
fb9a24b6
LP
1649
1650 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1651}
1652
de190aef 1653static int link_entry_into_array(JournalFile *f,
4fd052ae
FC
1654 le64_t *first,
1655 le64_t *idx,
de190aef 1656 uint64_t p) {
cec736d2 1657 int r;
de190aef
LP
1658 uint64_t n = 0, ap = 0, q, i, a, hidx;
1659 Object *o;
1660
cec736d2 1661 assert(f);
c88cc6af 1662 assert(f->header);
de190aef
LP
1663 assert(first);
1664 assert(idx);
1665 assert(p > 0);
cec736d2 1666
de190aef
LP
1667 a = le64toh(*first);
1668 i = hidx = le64toh(*idx);
1669 while (a > 0) {
1670
1671 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1672 if (r < 0)
1673 return r;
cec736d2 1674
de190aef
LP
1675 n = journal_file_entry_array_n_items(o);
1676 if (i < n) {
1677 o->entry_array.items[i] = htole64(p);
1678 *idx = htole64(hidx + 1);
1679 return 0;
1680 }
cec736d2 1681
de190aef
LP
1682 i -= n;
1683 ap = a;
1684 a = le64toh(o->entry_array.next_entry_array_offset);
1685 }
1686
1687 if (hidx > n)
1688 n = (hidx+1) * 2;
1689 else
1690 n = n * 2;
1691
1692 if (n < 4)
1693 n = 4;
1694
1695 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1696 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1697 &o, &q);
cec736d2
LP
1698 if (r < 0)
1699 return r;
1700
349cc4a5 1701#if HAVE_GCRYPT
5996c7c2 1702 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
b0af6f41
LP
1703 if (r < 0)
1704 return r;
feb12d3e 1705#endif
b0af6f41 1706
de190aef 1707 o->entry_array.items[i] = htole64(p);
cec736d2 1708
de190aef 1709 if (ap == 0)
7be3aa17 1710 *first = htole64(q);
cec736d2 1711 else {
de190aef 1712 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
cec736d2
LP
1713 if (r < 0)
1714 return r;
1715
de190aef
LP
1716 o->entry_array.next_entry_array_offset = htole64(q);
1717 }
cec736d2 1718
2dee23eb
LP
1719 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1720 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1721
de190aef
LP
1722 *idx = htole64(hidx + 1);
1723
1724 return 0;
1725}
cec736d2 1726
de190aef 1727static int link_entry_into_array_plus_one(JournalFile *f,
4fd052ae
FC
1728 le64_t *extra,
1729 le64_t *first,
1730 le64_t *idx,
de190aef
LP
1731 uint64_t p) {
1732
1733 int r;
1734
1735 assert(f);
1736 assert(extra);
1737 assert(first);
1738 assert(idx);
1739 assert(p > 0);
1740
1741 if (*idx == 0)
1742 *extra = htole64(p);
1743 else {
4fd052ae 1744 le64_t i;
de190aef 1745
7be3aa17 1746 i = htole64(le64toh(*idx) - 1);
de190aef
LP
1747 r = link_entry_into_array(f, first, &i, p);
1748 if (r < 0)
1749 return r;
cec736d2
LP
1750 }
1751
de190aef
LP
1752 *idx = htole64(le64toh(*idx) + 1);
1753 return 0;
1754}
1755
1756static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1757 uint64_t p;
1758 int r;
1759 assert(f);
1760 assert(o);
1761 assert(offset > 0);
1762
1763 p = le64toh(o->entry.items[i].object_offset);
1764 if (p == 0)
1765 return -EINVAL;
1766
1767 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1768 if (r < 0)
1769 return r;
1770
de190aef
LP
1771 return link_entry_into_array_plus_one(f,
1772 &o->data.entry_offset,
1773 &o->data.entry_array_offset,
1774 &o->data.n_entries,
1775 offset);
cec736d2
LP
1776}
1777
1778static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
de190aef 1779 uint64_t n, i;
cec736d2
LP
1780 int r;
1781
1782 assert(f);
c88cc6af 1783 assert(f->header);
cec736d2
LP
1784 assert(o);
1785 assert(offset > 0);
b588975f
LP
1786
1787 if (o->object.type != OBJECT_ENTRY)
1788 return -EINVAL;
cec736d2 1789
b788cc23
LP
1790 __sync_synchronize();
1791
cec736d2 1792 /* Link up the entry itself */
de190aef
LP
1793 r = link_entry_into_array(f,
1794 &f->header->entry_array_offset,
1795 &f->header->n_entries,
1796 offset);
1797 if (r < 0)
1798 return r;
cec736d2 1799
507f22bd 1800 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
cec736d2 1801
de190aef 1802 if (f->header->head_entry_realtime == 0)
0ac38b70 1803 f->header->head_entry_realtime = o->entry.realtime;
cec736d2 1804
0ac38b70 1805 f->header->tail_entry_realtime = o->entry.realtime;
de190aef
LP
1806 f->header->tail_entry_monotonic = o->entry.monotonic;
1807
1808 f->tail_entry_monotonic_valid = true;
cec736d2
LP
1809
1810 /* Link up the items */
1811 n = journal_file_entry_n_items(o);
1812 for (i = 0; i < n; i++) {
1813 r = journal_file_link_entry_item(f, o, offset, i);
1814 if (r < 0)
1815 return r;
1816 }
1817
cec736d2
LP
1818 return 0;
1819}
1820
1821static int journal_file_append_entry_internal(
1822 JournalFile *f,
1823 const dual_timestamp *ts,
1824 uint64_t xor_hash,
1825 const EntryItem items[], unsigned n_items,
de190aef 1826 uint64_t *seqnum,
cec736d2
LP
1827 Object **ret, uint64_t *offset) {
1828 uint64_t np;
1829 uint64_t osize;
1830 Object *o;
1831 int r;
1832
1833 assert(f);
c88cc6af 1834 assert(f->header);
cec736d2 1835 assert(items || n_items == 0);
de190aef 1836 assert(ts);
cec736d2
LP
1837
1838 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1839
de190aef 1840 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
cec736d2
LP
1841 if (r < 0)
1842 return r;
1843
d98cc1f2 1844 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
75f32f04 1845 memcpy_safe(o->entry.items, items, n_items * sizeof(EntryItem));
de190aef
LP
1846 o->entry.realtime = htole64(ts->realtime);
1847 o->entry.monotonic = htole64(ts->monotonic);
cec736d2
LP
1848 o->entry.xor_hash = htole64(xor_hash);
1849 o->entry.boot_id = f->header->boot_id;
1850
349cc4a5 1851#if HAVE_GCRYPT
5996c7c2 1852 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
b0af6f41
LP
1853 if (r < 0)
1854 return r;
feb12d3e 1855#endif
b0af6f41 1856
cec736d2
LP
1857 r = journal_file_link_entry(f, o, np);
1858 if (r < 0)
1859 return r;
1860
1861 if (ret)
1862 *ret = o;
1863
1864 if (offset)
1865 *offset = np;
1866
1867 return 0;
1868}
1869
cf244689 1870void journal_file_post_change(JournalFile *f) {
50f20cfd
LP
1871 assert(f);
1872
1873 /* inotify() does not receive IN_MODIFY events from file
1874 * accesses done via mmap(). After each access we hence
1875 * trigger IN_MODIFY by truncating the journal file to its
1876 * current size which triggers IN_MODIFY. */
1877
bc85bfee
LP
1878 __sync_synchronize();
1879
50f20cfd 1880 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
e167d7fd 1881 log_debug_errno(errno, "Failed to truncate file to its own size: %m");
50f20cfd
LP
1882}
1883
7a24f3bf
VC
1884static int post_change_thunk(sd_event_source *timer, uint64_t usec, void *userdata) {
1885 assert(userdata);
1886
1887 journal_file_post_change(userdata);
1888
1889 return 1;
1890}
1891
1892static void schedule_post_change(JournalFile *f) {
1893 sd_event_source *timer;
1894 int enabled, r;
1895 uint64_t now;
1896
1897 assert(f);
1898 assert(f->post_change_timer);
1899
1900 timer = f->post_change_timer;
1901
1902 r = sd_event_source_get_enabled(timer, &enabled);
1903 if (r < 0) {
e167d7fd
LP
1904 log_debug_errno(r, "Failed to get ftruncate timer state: %m");
1905 goto fail;
7a24f3bf
VC
1906 }
1907
1908 if (enabled == SD_EVENT_ONESHOT)
1909 return;
1910
1911 r = sd_event_now(sd_event_source_get_event(timer), CLOCK_MONOTONIC, &now);
1912 if (r < 0) {
e167d7fd
LP
1913 log_debug_errno(r, "Failed to get clock's now for scheduling ftruncate: %m");
1914 goto fail;
7a24f3bf
VC
1915 }
1916
1917 r = sd_event_source_set_time(timer, now+f->post_change_timer_period);
1918 if (r < 0) {
e167d7fd
LP
1919 log_debug_errno(r, "Failed to set time for scheduling ftruncate: %m");
1920 goto fail;
7a24f3bf
VC
1921 }
1922
1923 r = sd_event_source_set_enabled(timer, SD_EVENT_ONESHOT);
1924 if (r < 0) {
e167d7fd
LP
1925 log_debug_errno(r, "Failed to enable scheduled ftruncate: %m");
1926 goto fail;
7a24f3bf 1927 }
e167d7fd
LP
1928
1929 return;
1930
1931fail:
1932 /* On failure, let's simply post the change immediately. */
1933 journal_file_post_change(f);
7a24f3bf
VC
1934}
1935
1936/* Enable coalesced change posting in a timer on the provided sd_event instance */
1937int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t) {
1938 _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL;
1939 int r;
1940
1941 assert(f);
1942 assert_return(!f->post_change_timer, -EINVAL);
1943 assert(e);
1944 assert(t);
1945
1946 r = sd_event_add_time(e, &timer, CLOCK_MONOTONIC, 0, 0, post_change_thunk, f);
1947 if (r < 0)
1948 return r;
1949
1950 r = sd_event_source_set_enabled(timer, SD_EVENT_OFF);
1951 if (r < 0)
1952 return r;
1953
1954 f->post_change_timer = timer;
1955 timer = NULL;
1956 f->post_change_timer_period = t;
1957
1958 return r;
1959}
1960
1f2da9ec
LP
1961static int entry_item_cmp(const void *_a, const void *_b) {
1962 const EntryItem *a = _a, *b = _b;
1963
1964 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1965 return -1;
1966 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1967 return 1;
1968 return 0;
1969}
1970
de190aef 1971int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
cec736d2
LP
1972 unsigned i;
1973 EntryItem *items;
1974 int r;
1975 uint64_t xor_hash = 0;
de190aef 1976 struct dual_timestamp _ts;
cec736d2
LP
1977
1978 assert(f);
c88cc6af 1979 assert(f->header);
cec736d2
LP
1980 assert(iovec || n_iovec == 0);
1981
de190aef
LP
1982 if (!ts) {
1983 dual_timestamp_get(&_ts);
1984 ts = &_ts;
1985 }
1986
349cc4a5 1987#if HAVE_GCRYPT
7560fffc
LP
1988 r = journal_file_maybe_append_tag(f, ts->realtime);
1989 if (r < 0)
1990 return r;
feb12d3e 1991#endif
7560fffc 1992
64825d3c 1993 /* alloca() can't take 0, hence let's allocate at least one */
9607d947 1994 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
cec736d2
LP
1995
1996 for (i = 0; i < n_iovec; i++) {
1997 uint64_t p;
1998 Object *o;
1999
2000 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
2001 if (r < 0)
cf244689 2002 return r;
cec736d2
LP
2003
2004 xor_hash ^= le64toh(o->data.hash);
2005 items[i].object_offset = htole64(p);
de7b95cd 2006 items[i].hash = o->data.hash;
cec736d2
LP
2007 }
2008
1f2da9ec
LP
2009 /* Order by the position on disk, in order to improve seek
2010 * times for rotating media. */
7ff7394d 2011 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1f2da9ec 2012
de190aef 2013 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
cec736d2 2014
fa6ac760
LP
2015 /* If the memory mapping triggered a SIGBUS then we return an
2016 * IO error and ignore the error code passed down to us, since
2017 * it is very likely just an effect of a nullified replacement
2018 * mapping page */
2019
be7cdd8e 2020 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
2021 r = -EIO;
2022
7a24f3bf
VC
2023 if (f->post_change_timer)
2024 schedule_post_change(f);
2025 else
2026 journal_file_post_change(f);
50f20cfd 2027
cec736d2
LP
2028 return r;
2029}
2030
/* Per-chain cache record, stored in a hashmap keyed by the 'first' member. It remembers how far
 * into a chain of entry arrays a previous lookup got. */
typedef struct ChainCacheItem {
        uint64_t first;      /* offset of the array the chain starts with; doubles as the hashmap key */
        uint64_t array;      /* offset of the array that is cached */
        uint64_t begin;      /* first item stored in the cached array */
        uint64_t total;      /* number of items in all arrays of the chain that precede the cached one */
        uint64_t last_index; /* index looked at most recently, to optimize locality when bisecting */
} ChainCacheItem;
2038
2039static void chain_cache_put(
4743015d 2040 OrderedHashmap *h,
a4bcff5b
LP
2041 ChainCacheItem *ci,
2042 uint64_t first,
2043 uint64_t array,
2044 uint64_t begin,
f268980d
LP
2045 uint64_t total,
2046 uint64_t last_index) {
a4bcff5b
LP
2047
2048 if (!ci) {
34741aa3
LP
2049 /* If the chain item to cache for this chain is the
2050 * first one it's not worth caching anything */
2051 if (array == first)
2052 return;
2053
29433089 2054 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
4743015d 2055 ci = ordered_hashmap_steal_first(h);
29433089
LP
2056 assert(ci);
2057 } else {
a4bcff5b
LP
2058 ci = new(ChainCacheItem, 1);
2059 if (!ci)
2060 return;
2061 }
2062
2063 ci->first = first;
2064
4743015d 2065 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
a4bcff5b
LP
2066 free(ci);
2067 return;
2068 }
2069 } else
2070 assert(ci->first == first);
2071
2072 ci->array = array;
2073 ci->begin = begin;
2074 ci->total = total;
f268980d 2075 ci->last_index = last_index;
a4bcff5b
LP
2076}
2077
f268980d
LP
2078static int generic_array_get(
2079 JournalFile *f,
2080 uint64_t first,
2081 uint64_t i,
2082 Object **ret, uint64_t *offset) {
de190aef 2083
cec736d2 2084 Object *o;
a4bcff5b 2085 uint64_t p = 0, a, t = 0;
cec736d2 2086 int r;
a4bcff5b 2087 ChainCacheItem *ci;
cec736d2
LP
2088
2089 assert(f);
2090
de190aef 2091 a = first;
a4bcff5b
LP
2092
2093 /* Try the chain cache first */
4743015d 2094 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
2095 if (ci && i > ci->total) {
2096 a = ci->array;
2097 i -= ci->total;
2098 t = ci->total;
2099 }
2100
de190aef 2101 while (a > 0) {
a4bcff5b 2102 uint64_t k;
cec736d2 2103
de190aef
LP
2104 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
2105 if (r < 0)
2106 return r;
cec736d2 2107
a4bcff5b
LP
2108 k = journal_file_entry_array_n_items(o);
2109 if (i < k) {
de190aef 2110 p = le64toh(o->entry_array.items[i]);
a4bcff5b 2111 goto found;
cec736d2
LP
2112 }
2113
a4bcff5b
LP
2114 i -= k;
2115 t += k;
de190aef
LP
2116 a = le64toh(o->entry_array.next_entry_array_offset);
2117 }
2118
a4bcff5b
LP
2119 return 0;
2120
2121found:
2122 /* Let's cache this item for the next invocation */
af13a6b0 2123 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
de190aef
LP
2124
2125 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2126 if (r < 0)
2127 return r;
2128
2129 if (ret)
2130 *ret = o;
2131
2132 if (offset)
2133 *offset = p;
2134
2135 return 1;
2136}
2137
f268980d
LP
2138static int generic_array_get_plus_one(
2139 JournalFile *f,
2140 uint64_t extra,
2141 uint64_t first,
2142 uint64_t i,
2143 Object **ret, uint64_t *offset) {
de190aef
LP
2144
2145 Object *o;
2146
2147 assert(f);
2148
2149 if (i == 0) {
2150 int r;
2151
2152 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
cec736d2
LP
2153 if (r < 0)
2154 return r;
2155
de190aef
LP
2156 if (ret)
2157 *ret = o;
cec736d2 2158
de190aef
LP
2159 if (offset)
2160 *offset = extra;
cec736d2 2161
de190aef 2162 return 1;
cec736d2
LP
2163 }
2164
de190aef
LP
2165 return generic_array_get(f, first, i-1, ret, offset);
2166}
cec736d2 2167
de190aef
LP
/* Results returned by the test_object() callbacks that the bisection helpers use to compare the object
 * at a given offset against a needle. */
enum {
        TEST_FOUND,     /* the tested object equals the needle */
        TEST_LEFT,      /* the tested object is smaller than the needle */
        TEST_RIGHT      /* the tested object is larger than the needle */
};
cec736d2 2173
f268980d
LP
/* Bisects the chain of entry arrays that begins at offset 'first', considering at most n items in total,
 * and uses test_object() to compare the entry each item points to against 'needle'.
 *
 * A TEST_FOUND result is treated as TEST_RIGHT for DIRECTION_DOWN and as TEST_LEFT for DIRECTION_UP.
 * For DIRECTION_UP the item right before the located position is returned (subtract_one).
 *
 * Returns 1 if an entry was located (filling in *ret, *offset and *idx when non-NULL; *idx is the item's
 * index counted from the start of the chain), 0 if nothing suitable was found, and a negative errno-style
 * value on failure. Items that are zero or test as -EBADMSG while probing cut the search range short
 * rather than failing the call, except for the two neighbor probes around a cached last_index, which
 * return -EBADMSG for a zero item and pass any test_object() failure through. */
static int generic_array_bisect(
                JournalFile *f,
                uint64_t first,
                uint64_t n,
                uint64_t needle,
                int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
                direction_t direction,
                Object **ret,
                uint64_t *offset,
                uint64_t *idx) {

        /* a: offset of the array currently inspected; t: number of items in the arrays already passed;
         * last_p: last probed item of the previous array; last_index: index hint taken from the chain
         * cache, (uint64_t) -1 if there is none. */
        uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
        bool subtract_one = false;
        Object *o, *array = NULL;
        int r;
        ChainCacheItem *ci;

        assert(f);
        assert(test_object);

        /* Start with the first array in the chain */
        a = first;

        ci = ordered_hashmap_get(f->chain_cache, &first);
        if (ci && n > ci->total) {
                /* Ah, we have iterated this bisection array chain
                 * previously! Let's see if we can skip ahead in the
                 * chain, as far as the last time. But we can't jump
                 * backwards in the chain, so let's check that
                 * first. */

                r = test_object(f, ci->begin, needle);
                if (r < 0)
                        return r;

                if (r == TEST_LEFT) {
                        /* OK, what we are looking for is right of the
                         * begin of this EntryArray, so let's jump
                         * straight to previously cached array in the
                         * chain */

                        a = ci->array;
                        n -= ci->total;
                        t = ci->total;
                        last_index = ci->last_index;
                }
        }

        while (a > 0) {
                uint64_t left, right, k, lp;

                r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
                if (r < 0)
                        return r;

                /* Only look at as many items of this array as are still within the n-item budget */
                k = journal_file_entry_array_n_items(array);
                right = MIN(k, n);
                if (right <= 0)
                        return 0;

                /* Probe the last item in range first, to decide whether the needle can be in this
                 * array at all */
                i = right - 1;
                lp = p = le64toh(array->entry_array.items[i]);
                if (p <= 0)
                        r = -EBADMSG;
                else
                        r = test_object(f, p, needle);
                if (r == -EBADMSG) {
                        log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (1)");
                        /* Drop the bad item and everything after it, then retry this array */
                        n = i;
                        continue;
                }
                if (r < 0)
                        return r;

                if (r == TEST_FOUND)
                        r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                if (r == TEST_RIGHT) {
                        /* The position we are looking for is within this array */
                        left = 0;
                        right -= 1;

                        if (last_index != (uint64_t) -1) {
                                assert(last_index <= right);

                                /* If we cached the last index we
                                 * looked at, let's try to not to jump
                                 * too wildly around and see if we can
                                 * limit the range to look at early to
                                 * the immediate neighbors of the last
                                 * index we looked at. */

                                if (last_index > 0) {
                                        uint64_t x = last_index - 1;

                                        p = le64toh(array->entry_array.items[x]);
                                        if (p <= 0)
                                                return -EBADMSG;

                                        r = test_object(f, p, needle);
                                        if (r < 0)
                                                return r;

                                        if (r == TEST_FOUND)
                                                r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                        if (r == TEST_RIGHT)
                                                right = x;
                                        else
                                                left = x + 1;
                                }

                                if (last_index < right) {
                                        uint64_t y = last_index + 1;

                                        p = le64toh(array->entry_array.items[y]);
                                        if (p <= 0)
                                                return -EBADMSG;

                                        r = test_object(f, p, needle);
                                        if (r < 0)
                                                return r;

                                        if (r == TEST_FOUND)
                                                r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                        if (r == TEST_RIGHT)
                                                right = y;
                                        else
                                                left = y + 1;
                                }
                        }

                        /* Plain bisection over [left, right] */
                        for (;;) {
                                if (left == right) {
                                        if (direction == DIRECTION_UP)
                                                subtract_one = true;

                                        i = left;
                                        goto found;
                                }

                                assert(left < right);
                                i = (left + right) / 2;

                                p = le64toh(array->entry_array.items[i]);
                                if (p <= 0)
                                        r = -EBADMSG;
                                else
                                        r = test_object(f, p, needle);
                                if (r == -EBADMSG) {
                                        log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (2)");
                                        /* Treat the bad item as the new end of the range */
                                        right = n = i;
                                        continue;
                                }
                                if (r < 0)
                                        return r;

                                if (r == TEST_FOUND)
                                        r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;

                                if (r == TEST_RIGHT)
                                        right = i;
                                else
                                        left = i + 1;
                        }
                }

                /* Everything in this array tested left of the needle. If the item budget ends within
                 * this array there is nothing further to look at. */
                if (k >= n) {
                        if (direction == DIRECTION_UP) {
                                /* Going up, the last item in range is the one we want */
                                i = n;
                                subtract_one = true;
                                goto found;
                        }

                        return 0;
                }

                /* Remember the last item of this array, in case we need to step back into it from
                 * the first item of the next array */
                last_p = lp;

                n -= k;
                t += k;
                last_index = (uint64_t) -1;
                a = le64toh(array->entry_array.next_entry_array_offset);
        }

        return 0;

found:
        /* Stepping back from the very first item of the chain: there is nothing before it */
        if (subtract_one && t == 0 && i == 0)
                return 0;

        /* Let's cache this item for the next invocation */
        chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);

        if (subtract_one && i == 0)
                p = last_p;
        else if (subtract_one)
                p = le64toh(array->entry_array.items[i-1]);
        else
                p = le64toh(array->entry_array.items[i]);

        r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
        if (r < 0)
                return r;

        if (ret)
                *ret = o;

        if (offset)
                *offset = p;

        if (idx)
                *idx = t + i + (subtract_one ? -1 : 0);

        return 1;
}
2390
f268980d
LP
2391static int generic_array_bisect_plus_one(
2392 JournalFile *f,
2393 uint64_t extra,
2394 uint64_t first,
2395 uint64_t n,
2396 uint64_t needle,
2397 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2398 direction_t direction,
2399 Object **ret,
2400 uint64_t *offset,
2401 uint64_t *idx) {
de190aef 2402
cec736d2 2403 int r;
cbdca852
LP
2404 bool step_back = false;
2405 Object *o;
cec736d2
LP
2406
2407 assert(f);
de190aef 2408 assert(test_object);
cec736d2 2409
de190aef
LP
2410 if (n <= 0)
2411 return 0;
cec736d2 2412
de190aef
LP
2413 /* This bisects the array in object 'first', but first checks
2414 * an extra */
de190aef
LP
2415 r = test_object(f, extra, needle);
2416 if (r < 0)
2417 return r;
a536e261
LP
2418
2419 if (r == TEST_FOUND)
2420 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2421
cbdca852
LP
2422 /* if we are looking with DIRECTION_UP then we need to first
2423 see if in the actual array there is a matching entry, and
2424 return the last one of that. But if there isn't any we need
2425 to return this one. Hence remember this, and return it
2426 below. */
2427 if (r == TEST_LEFT)
2428 step_back = direction == DIRECTION_UP;
de190aef 2429
cbdca852
LP
2430 if (r == TEST_RIGHT) {
2431 if (direction == DIRECTION_DOWN)
2432 goto found;
2433 else
2434 return 0;
a536e261 2435 }
cec736d2 2436
de190aef
LP
2437 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
2438
cbdca852
LP
2439 if (r == 0 && step_back)
2440 goto found;
2441
ecf68b1d 2442 if (r > 0 && idx)
313cefa1 2443 (*idx)++;
de190aef
LP
2444
2445 return r;
cbdca852
LP
2446
2447found:
2448 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
2449 if (r < 0)
2450 return r;
2451
2452 if (ret)
2453 *ret = o;
2454
2455 if (offset)
2456 *offset = extra;
2457
2458 if (idx)
2459 *idx = 0;
2460
2461 return 1;
2462}
2463
44a6b1b6 2464_pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
cbdca852
LP
2465 assert(f);
2466 assert(p > 0);
2467
2468 if (p == needle)
2469 return TEST_FOUND;
2470 else if (p < needle)
2471 return TEST_LEFT;
2472 else
2473 return TEST_RIGHT;
2474}
2475
de190aef
LP
2476static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
2477 Object *o;
2478 int r;
2479
2480 assert(f);
2481 assert(p > 0);
2482
2483 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
cec736d2
LP
2484 if (r < 0)
2485 return r;
2486
de190aef
LP
2487 if (le64toh(o->entry.seqnum) == needle)
2488 return TEST_FOUND;
2489 else if (le64toh(o->entry.seqnum) < needle)
2490 return TEST_LEFT;
2491 else
2492 return TEST_RIGHT;
2493}
cec736d2 2494
de190aef
LP
2495int journal_file_move_to_entry_by_seqnum(
2496 JournalFile *f,
2497 uint64_t seqnum,
2498 direction_t direction,
2499 Object **ret,
2500 uint64_t *offset) {
c88cc6af
VC
2501 assert(f);
2502 assert(f->header);
de190aef
LP
2503
2504 return generic_array_bisect(f,
2505 le64toh(f->header->entry_array_offset),
2506 le64toh(f->header->n_entries),
2507 seqnum,
2508 test_object_seqnum,
2509 direction,
2510 ret, offset, NULL);
2511}
cec736d2 2512
de190aef
LP
2513static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
2514 Object *o;
2515 int r;
2516
2517 assert(f);
2518 assert(p > 0);
2519
2520 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2521 if (r < 0)
2522 return r;
2523
2524 if (le64toh(o->entry.realtime) == needle)
2525 return TEST_FOUND;
2526 else if (le64toh(o->entry.realtime) < needle)
2527 return TEST_LEFT;
2528 else
2529 return TEST_RIGHT;
cec736d2
LP
2530}
2531
de190aef
LP
2532int journal_file_move_to_entry_by_realtime(
2533 JournalFile *f,
2534 uint64_t realtime,
2535 direction_t direction,
2536 Object **ret,
2537 uint64_t *offset) {
c88cc6af
VC
2538 assert(f);
2539 assert(f->header);
de190aef
LP
2540
2541 return generic_array_bisect(f,
2542 le64toh(f->header->entry_array_offset),
2543 le64toh(f->header->n_entries),
2544 realtime,
2545 test_object_realtime,
2546 direction,
2547 ret, offset, NULL);
2548}
2549
2550static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
2551 Object *o;
2552 int r;
2553
2554 assert(f);
2555 assert(p > 0);
2556
2557 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2558 if (r < 0)
2559 return r;
2560
2561 if (le64toh(o->entry.monotonic) == needle)
2562 return TEST_FOUND;
2563 else if (le64toh(o->entry.monotonic) < needle)
2564 return TEST_LEFT;
2565 else
2566 return TEST_RIGHT;
2567}
2568
2a560338 2569static int find_data_object_by_boot_id(
47838ab3
ZJS
2570 JournalFile *f,
2571 sd_id128_t boot_id,
2572 Object **o,
2573 uint64_t *b) {
2a560338 2574
fbd0b64f 2575 char t[STRLEN("_BOOT_ID=") + 32 + 1] = "_BOOT_ID=";
47838ab3
ZJS
2576
2577 sd_id128_to_string(boot_id, t + 9);
2578 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2579}
2580
de190aef
LP
2581int journal_file_move_to_entry_by_monotonic(
2582 JournalFile *f,
2583 sd_id128_t boot_id,
2584 uint64_t monotonic,
2585 direction_t direction,
2586 Object **ret,
2587 uint64_t *offset) {
2588
de190aef
LP
2589 Object *o;
2590 int r;
2591
cbdca852 2592 assert(f);
de190aef 2593
47838ab3 2594 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
de190aef
LP
2595 if (r < 0)
2596 return r;
cbdca852 2597 if (r == 0)
de190aef
LP
2598 return -ENOENT;
2599
2600 return generic_array_bisect_plus_one(f,
2601 le64toh(o->data.entry_offset),
2602 le64toh(o->data.entry_array_offset),
2603 le64toh(o->data.n_entries),
2604 monotonic,
2605 test_object_monotonic,
2606 direction,
2607 ret, offset, NULL);
2608}
2609
1fc605b0 2610void journal_file_reset_location(JournalFile *f) {
6573ef05 2611 f->location_type = LOCATION_HEAD;
1fc605b0 2612 f->current_offset = 0;
6573ef05
MS
2613 f->current_seqnum = 0;
2614 f->current_realtime = 0;
2615 f->current_monotonic = 0;
2616 zero(f->current_boot_id);
2617 f->current_xor_hash = 0;
2618}
2619
950c07d4 2620void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
6573ef05
MS
2621 f->location_type = LOCATION_SEEK;
2622 f->current_offset = offset;
2623 f->current_seqnum = le64toh(o->entry.seqnum);
2624 f->current_realtime = le64toh(o->entry.realtime);
2625 f->current_monotonic = le64toh(o->entry.monotonic);
2626 f->current_boot_id = o->entry.boot_id;
2627 f->current_xor_hash = le64toh(o->entry.xor_hash);
1fc605b0
MS
2628}
2629
d8ae66d7
MS
2630int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2631 assert(af);
c88cc6af 2632 assert(af->header);
d8ae66d7 2633 assert(bf);
c88cc6af 2634 assert(bf->header);
d8ae66d7
MS
2635 assert(af->location_type == LOCATION_SEEK);
2636 assert(bf->location_type == LOCATION_SEEK);
2637
2638 /* If contents and timestamps match, these entries are
2639 * identical, even if the seqnum does not match */
2640 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2641 af->current_monotonic == bf->current_monotonic &&
2642 af->current_realtime == bf->current_realtime &&
2643 af->current_xor_hash == bf->current_xor_hash)
2644 return 0;
2645
2646 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2647
2648 /* If this is from the same seqnum source, compare
2649 * seqnums */
2650 if (af->current_seqnum < bf->current_seqnum)
2651 return -1;
2652 if (af->current_seqnum > bf->current_seqnum)
2653 return 1;
2654
2655 /* Wow! This is weird, different data but the same
2656 * seqnums? Something is borked, but let's make the
2657 * best of it and compare by time. */
2658 }
2659
2660 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2661
2662 /* If the boot id matches, compare monotonic time */
2663 if (af->current_monotonic < bf->current_monotonic)
2664 return -1;
2665 if (af->current_monotonic > bf->current_monotonic)
2666 return 1;
2667 }
2668
2669 /* Otherwise, compare UTC time */
2670 if (af->current_realtime < bf->current_realtime)
2671 return -1;
2672 if (af->current_realtime > bf->current_realtime)
2673 return 1;
2674
2675 /* Finally, compare by contents */
2676 if (af->current_xor_hash < bf->current_xor_hash)
2677 return -1;
2678 if (af->current_xor_hash > bf->current_xor_hash)
2679 return 1;
2680
2681 return 0;
2682}
2683
aa598ba5
LP
2684static int bump_array_index(uint64_t *i, direction_t direction, uint64_t n) {
2685
2686 /* Increase or decrease the specified index, in the right direction. */
2687
2688 if (direction == DIRECTION_DOWN) {
2689 if (*i >= n - 1)
2690 return 0;
2691
2692 (*i) ++;
2693 } else {
2694 if (*i <= 0)
2695 return 0;
2696
2697 (*i) --;
2698 }
2699
2700 return 1;
2701}
2702
b6da4ed0
LP
2703static bool check_properly_ordered(uint64_t new_offset, uint64_t old_offset, direction_t direction) {
2704
2705 /* Consider it an error if any of the two offsets is uninitialized */
2706 if (old_offset == 0 || new_offset == 0)
2707 return false;
2708
2709 /* If we go down, the new offset must be larger than the old one. */
2710 return direction == DIRECTION_DOWN ?
2711 new_offset > old_offset :
2712 new_offset < old_offset;
2713}
2714
de190aef
LP
2715int journal_file_next_entry(
2716 JournalFile *f,
f534928a 2717 uint64_t p,
de190aef
LP
2718 direction_t direction,
2719 Object **ret, uint64_t *offset) {
2720
fb099c8d 2721 uint64_t i, n, ofs;
cec736d2
LP
2722 int r;
2723
2724 assert(f);
c88cc6af 2725 assert(f->header);
de190aef
LP
2726
2727 n = le64toh(f->header->n_entries);
2728 if (n <= 0)
2729 return 0;
cec736d2 2730
f534928a 2731 if (p == 0)
de190aef 2732 i = direction == DIRECTION_DOWN ? 0 : n - 1;
cec736d2 2733 else {
de190aef
LP
2734 r = generic_array_bisect(f,
2735 le64toh(f->header->entry_array_offset),
2736 le64toh(f->header->n_entries),
2737 p,
2738 test_object_offset,
2739 DIRECTION_DOWN,
2740 NULL, NULL,
2741 &i);
2742 if (r <= 0)
2743 return r;
2744
aa598ba5
LP
2745 r = bump_array_index(&i, direction, n);
2746 if (r <= 0)
2747 return r;
cec736d2
LP
2748 }
2749
de190aef 2750 /* And jump to it */
989793d3
LP
2751 for (;;) {
2752 r = generic_array_get(f,
2753 le64toh(f->header->entry_array_offset),
2754 i,
2755 ret, &ofs);
2756 if (r > 0)
2757 break;
2758 if (r != -EBADMSG)
2759 return r;
2760
2761 /* OK, so this entry is borked. Most likely some entry didn't get synced to disk properly, let's see if
2762 * the next one might work for us instead. */
2763 log_debug_errno(r, "Entry item %" PRIu64 " is bad, skipping over it.", i);
2764
2765 r = bump_array_index(&i, direction, n);
2766 if (r <= 0)
2767 return r;
caeab8f6 2768 }
fb099c8d 2769
b6da4ed0
LP
2770 /* Ensure our array is properly ordered. */
2771 if (p > 0 && !check_properly_ordered(ofs, p, direction)) {
2772 log_debug("%s: entry array not properly ordered at entry %" PRIu64, f->path, i);
fb099c8d
ZJS
2773 return -EBADMSG;
2774 }
2775
2776 if (offset)
2777 *offset = ofs;
2778
2779 return 1;
de190aef 2780}
cec736d2 2781
de190aef
LP
2782int journal_file_next_entry_for_data(
2783 JournalFile *f,
2784 Object *o, uint64_t p,
2785 uint64_t data_offset,
2786 direction_t direction,
2787 Object **ret, uint64_t *offset) {
2788
ded5034e 2789 uint64_t i, n, ofs;
de190aef 2790 Object *d;
989793d3 2791 int r;
cec736d2
LP
2792
2793 assert(f);
de190aef 2794 assert(p > 0 || !o);
cec736d2 2795
de190aef 2796 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
466ccd92 2797 if (r < 0)
de190aef 2798 return r;
cec736d2 2799
de190aef
LP
2800 n = le64toh(d->data.n_entries);
2801 if (n <= 0)
2802 return n;
cec736d2 2803
de190aef
LP
2804 if (!o)
2805 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2806 else {
2807 if (o->object.type != OBJECT_ENTRY)
2808 return -EINVAL;
cec736d2 2809
de190aef
LP
2810 r = generic_array_bisect_plus_one(f,
2811 le64toh(d->data.entry_offset),
2812 le64toh(d->data.entry_array_offset),
2813 le64toh(d->data.n_entries),
2814 p,
2815 test_object_offset,
2816 DIRECTION_DOWN,
2817 NULL, NULL,
2818 &i);
2819
2820 if (r <= 0)
cec736d2
LP
2821 return r;
2822
aa598ba5
LP
2823 r = bump_array_index(&i, direction, n);
2824 if (r <= 0)
2825 return r;
de190aef 2826 }
cec736d2 2827
989793d3
LP
2828 for (;;) {
2829 r = generic_array_get_plus_one(f,
2830 le64toh(d->data.entry_offset),
2831 le64toh(d->data.entry_array_offset),
2832 i,
2833 ret, &ofs);
2834 if (r > 0)
2835 break;
2836 if (r != -EBADMSG)
2837 return r;
2838
2839 log_debug_errno(r, "Data entry item %" PRIu64 " is bad, skipping over it.", i);
2840
2841 r = bump_array_index(&i, direction, n);
2842 if (r <= 0)
2843 return r;
2844 }
ded5034e
LP
2845
2846 /* Ensure our array is properly ordered. */
2847 if (p > 0 && check_properly_ordered(ofs, p, direction)) {
2848 log_debug("%s data entry array not properly ordered at entry %" PRIu64, f->path, i);
2849 return -EBADMSG;
2850 }
2851
2852 if (offset)
2853 *offset = ofs;
2854
2855 return 1;
de190aef 2856}
cec736d2 2857
cbdca852
LP
2858int journal_file_move_to_entry_by_offset_for_data(
2859 JournalFile *f,
2860 uint64_t data_offset,
2861 uint64_t p,
2862 direction_t direction,
2863 Object **ret, uint64_t *offset) {
2864
2865 int r;
2866 Object *d;
2867
2868 assert(f);
2869
2870 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2871 if (r < 0)
2872 return r;
2873
2874 return generic_array_bisect_plus_one(f,
2875 le64toh(d->data.entry_offset),
2876 le64toh(d->data.entry_array_offset),
2877 le64toh(d->data.n_entries),
2878 p,
2879 test_object_offset,
2880 direction,
2881 ret, offset, NULL);
2882}
2883
2884int journal_file_move_to_entry_by_monotonic_for_data(
2885 JournalFile *f,
2886 uint64_t data_offset,
2887 sd_id128_t boot_id,
2888 uint64_t monotonic,
2889 direction_t direction,
2890 Object **ret, uint64_t *offset) {
2891
cbdca852
LP
2892 Object *o, *d;
2893 int r;
2894 uint64_t b, z;
2895
2896 assert(f);
2897
2898 /* First, seek by time */
47838ab3 2899 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
cbdca852
LP
2900 if (r < 0)
2901 return r;
2902 if (r == 0)
2903 return -ENOENT;
2904
2905 r = generic_array_bisect_plus_one(f,
2906 le64toh(o->data.entry_offset),
2907 le64toh(o->data.entry_array_offset),
2908 le64toh(o->data.n_entries),
2909 monotonic,
2910 test_object_monotonic,
2911 direction,
2912 NULL, &z, NULL);
2913 if (r <= 0)
2914 return r;
2915
2916 /* And now, continue seeking until we find an entry that
2917 * exists in both bisection arrays */
2918
2919 for (;;) {
2920 Object *qo;
2921 uint64_t p, q;
2922
2923 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2924 if (r < 0)
2925 return r;
2926
2927 r = generic_array_bisect_plus_one(f,
2928 le64toh(d->data.entry_offset),
2929 le64toh(d->data.entry_array_offset),
2930 le64toh(d->data.n_entries),
2931 z,
2932 test_object_offset,
2933 direction,
2934 NULL, &p, NULL);
2935 if (r <= 0)
2936 return r;
2937
2938 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2939 if (r < 0)
2940 return r;
2941
2942 r = generic_array_bisect_plus_one(f,
2943 le64toh(o->data.entry_offset),
2944 le64toh(o->data.entry_array_offset),
2945 le64toh(o->data.n_entries),
2946 p,
2947 test_object_offset,
2948 direction,
2949 &qo, &q, NULL);
2950
2951 if (r <= 0)
2952 return r;
2953
2954 if (p == q) {
2955 if (ret)
2956 *ret = qo;
2957 if (offset)
2958 *offset = q;
2959
2960 return 1;
2961 }
2962
2963 z = q;
2964 }
cbdca852
LP
2965}
2966
de190aef
LP
2967int journal_file_move_to_entry_by_seqnum_for_data(
2968 JournalFile *f,
2969 uint64_t data_offset,
2970 uint64_t seqnum,
2971 direction_t direction,
2972 Object **ret, uint64_t *offset) {
cec736d2 2973
de190aef
LP
2974 Object *d;
2975 int r;
cec736d2 2976
91a31dde
LP
2977 assert(f);
2978
de190aef 2979 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2980 if (r < 0)
de190aef 2981 return r;
cec736d2 2982
de190aef
LP
2983 return generic_array_bisect_plus_one(f,
2984 le64toh(d->data.entry_offset),
2985 le64toh(d->data.entry_array_offset),
2986 le64toh(d->data.n_entries),
2987 seqnum,
2988 test_object_seqnum,
2989 direction,
2990 ret, offset, NULL);
2991}
cec736d2 2992
de190aef
LP
2993int journal_file_move_to_entry_by_realtime_for_data(
2994 JournalFile *f,
2995 uint64_t data_offset,
2996 uint64_t realtime,
2997 direction_t direction,
2998 Object **ret, uint64_t *offset) {
2999
3000 Object *d;
3001 int r;
3002
91a31dde
LP
3003 assert(f);
3004
de190aef 3005 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 3006 if (r < 0)
de190aef
LP
3007 return r;
3008
3009 return generic_array_bisect_plus_one(f,
3010 le64toh(d->data.entry_offset),
3011 le64toh(d->data.entry_array_offset),
3012 le64toh(d->data.n_entries),
3013 realtime,
3014 test_object_realtime,
3015 direction,
3016 ret, offset, NULL);
cec736d2
LP
3017}
3018
0284adc6 3019void journal_file_dump(JournalFile *f) {
7560fffc 3020 Object *o;
7560fffc 3021 int r;
0284adc6 3022 uint64_t p;
7560fffc
LP
3023
3024 assert(f);
c88cc6af 3025 assert(f->header);
7560fffc 3026
0284adc6 3027 journal_file_print_header(f);
7560fffc 3028
0284adc6
LP
3029 p = le64toh(f->header->header_size);
3030 while (p != 0) {
d05089d8 3031 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
0284adc6
LP
3032 if (r < 0)
3033 goto fail;
7560fffc 3034
0284adc6 3035 switch (o->object.type) {
d98cc1f2 3036
0284adc6
LP
3037 case OBJECT_UNUSED:
3038 printf("Type: OBJECT_UNUSED\n");
3039 break;
d98cc1f2 3040
0284adc6
LP
3041 case OBJECT_DATA:
3042 printf("Type: OBJECT_DATA\n");
3043 break;
7560fffc 3044
3c1668da
LP
3045 case OBJECT_FIELD:
3046 printf("Type: OBJECT_FIELD\n");
3047 break;
3048
0284adc6 3049 case OBJECT_ENTRY:
507f22bd
ZJS
3050 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
3051 le64toh(o->entry.seqnum),
3052 le64toh(o->entry.monotonic),
3053 le64toh(o->entry.realtime));
0284adc6 3054 break;
7560fffc 3055
0284adc6
LP
3056 case OBJECT_FIELD_HASH_TABLE:
3057 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
3058 break;
7560fffc 3059
0284adc6
LP
3060 case OBJECT_DATA_HASH_TABLE:
3061 printf("Type: OBJECT_DATA_HASH_TABLE\n");
3062 break;
7560fffc 3063
0284adc6
LP
3064 case OBJECT_ENTRY_ARRAY:
3065 printf("Type: OBJECT_ENTRY_ARRAY\n");
3066 break;
7560fffc 3067
0284adc6 3068 case OBJECT_TAG:
507f22bd
ZJS
3069 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
3070 le64toh(o->tag.seqnum),
3071 le64toh(o->tag.epoch));
0284adc6 3072 break;
3c1668da
LP
3073
3074 default:
8facc349 3075 printf("Type: unknown (%i)\n", o->object.type);
3c1668da 3076 break;
0284adc6 3077 }
7560fffc 3078
d89c8fdf
ZJS
3079 if (o->object.flags & OBJECT_COMPRESSION_MASK)
3080 printf("Flags: %s\n",
3081 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
7560fffc 3082
0284adc6
LP
3083 if (p == le64toh(f->header->tail_object_offset))
3084 p = 0;
3085 else
3086 p = p + ALIGN64(le64toh(o->object.size));
3087 }
7560fffc 3088
0284adc6
LP
3089 return;
3090fail:
3091 log_error("File corrupt");
7560fffc
LP
3092}
3093
718fe4b1
ZJS
3094static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
3095 const char *x;
3096
3097 x = format_timestamp(buf, l, t);
3098 if (x)
3099 return x;
3100 return " --- ";
3101}
3102
/* Prints a human-readable summary of the journal file's header to standard output: path, IDs, state,
 * flags, sizes, sequence numbers and timestamps, and object counts. Counters that not every header
 * contains are printed only when JOURNAL_HEADER_CONTAINS() reports them as present. The disk usage
 * line is printed only if fstat() on the file descriptor succeeds. */
void journal_file_print_header(JournalFile *f) {
        /* Buffers for the four 128bit IDs formatted as strings */
        char a[33], b[33], c[33], d[33];
        /* Buffers for the formatted head/tail realtime timestamps and the tail monotonic timespan */
        char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
        struct stat st;
        char bytes[FORMAT_BYTES_MAX];

        assert(f);
        assert(f->header);

        /* Note: the arguments below must stay in exactly the order of the conversions in the format
         * string. */
        printf("File Path: %s\n"
               "File ID: %s\n"
               "Machine ID: %s\n"
               "Boot ID: %s\n"
               "Sequential Number ID: %s\n"
               "State: %s\n"
               "Compatible Flags:%s%s\n"
               "Incompatible Flags:%s%s%s\n"
               "Header size: %"PRIu64"\n"
               "Arena size: %"PRIu64"\n"
               "Data Hash Table Size: %"PRIu64"\n"
               "Field Hash Table Size: %"PRIu64"\n"
               "Rotate Suggested: %s\n"
               "Head Sequential Number: %"PRIu64" (%"PRIx64")\n"
               "Tail Sequential Number: %"PRIu64" (%"PRIx64")\n"
               "Head Realtime Timestamp: %s (%"PRIx64")\n"
               "Tail Realtime Timestamp: %s (%"PRIx64")\n"
               "Tail Monotonic Timestamp: %s (%"PRIx64")\n"
               "Objects: %"PRIu64"\n"
               "Entry Objects: %"PRIu64"\n",
               f->path,
               sd_id128_to_string(f->header->file_id, a),
               sd_id128_to_string(f->header->machine_id, b),
               sd_id128_to_string(f->header->boot_id, c),
               sd_id128_to_string(f->header->seqnum_id, d),
               f->header->state == STATE_OFFLINE ? "OFFLINE" :
               f->header->state == STATE_ONLINE ? "ONLINE" :
               f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
               JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
               /* " ???" marks flag bits outside of the respective *_ANY mask */
               (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
               JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
               JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
               (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
               le64toh(f->header->header_size),
               le64toh(f->header->arena_size),
               /* The hash table sizes are stored in bytes, show them as a number of items */
               le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
               le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
               yes_no(journal_file_rotate_suggested(f, 0)),
               le64toh(f->header->head_entry_seqnum), le64toh(f->header->head_entry_seqnum),
               le64toh(f->header->tail_entry_seqnum), le64toh(f->header->tail_entry_seqnum),
               format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)), le64toh(f->header->head_entry_realtime),
               format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)), le64toh(f->header->tail_entry_realtime),
               format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC), le64toh(f->header->tail_entry_monotonic),
               le64toh(f->header->n_objects),
               le64toh(f->header->n_entries));

        if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
                printf("Data Objects: %"PRIu64"\n"
                       "Data Hash Table Fill: %.1f%%\n",
                       le64toh(f->header->n_data),
                       100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));

        if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
                printf("Field Objects: %"PRIu64"\n"
                       "Field Hash Table Fill: %.1f%%\n",
                       le64toh(f->header->n_fields),
                       100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));

        if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
                printf("Tag Objects: %"PRIu64"\n",
                       le64toh(f->header->n_tags));
        if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
                printf("Entry Array Objects: %"PRIu64"\n",
                       le64toh(f->header->n_entry_arrays));

        /* Disk usage is computed as st_blocks multiplied by 512 */
        if (fstat(f->fd, &st) >= 0)
                printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
}
3180
fc68c929
LP
3181static int journal_file_warn_btrfs(JournalFile *f) {
3182 unsigned attrs;
3183 int r;
3184
3185 assert(f);
3186
3187 /* Before we write anything, check if the COW logic is turned
3188 * off on btrfs. Given our write pattern that is quite
3189 * unfriendly to COW file systems this should greatly improve
3190 * performance on COW file systems, such as btrfs, at the
3191 * expense of data integrity features (which shouldn't be too
3192 * bad, given that we do our own checksumming). */
3193
3194 r = btrfs_is_filesystem(f->fd);
3195 if (r < 0)
3196 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
3197 if (!r)
3198 return 0;
3199
3200 r = read_attr_fd(f->fd, &attrs);
3201 if (r < 0)
3202 return log_warning_errno(r, "Failed to read file attributes: %m");
3203
3204 if (attrs & FS_NOCOW_FL) {
3205 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
3206 return 0;
3207 }
3208
3209 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
3210 "This is likely to slow down journal access substantially, please consider turning "
3211 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
3212
3213 return 1;
3214}
3215
0284adc6 3216int journal_file_open(
5d1ce257 3217 int fd,
0284adc6
LP
3218 const char *fname,
3219 int flags,
3220 mode_t mode,
3221 bool compress,
baed47c3 3222 bool seal,
0284adc6
LP
3223 JournalMetrics *metrics,
3224 MMapCache *mmap_cache,
b58c888f 3225 Set *deferred_closes,
0284adc6
LP
3226 JournalFile *template,
3227 JournalFile **ret) {
7560fffc 3228
fa6ac760 3229 bool newly_created = false;
0284adc6 3230 JournalFile *f;
fa6ac760 3231 void *h;
0284adc6 3232 int r;
7560fffc 3233
0559d3a5 3234 assert(ret);
5d1ce257 3235 assert(fd >= 0 || fname);
7560fffc 3236
ec2ce0c5 3237 if (!IN_SET((flags & O_ACCMODE), O_RDONLY, O_RDWR))
0284adc6 3238 return -EINVAL;
7560fffc 3239
5d1ce257
LP
3240 if (fname) {
3241 if (!endswith(fname, ".journal") &&
3242 !endswith(fname, ".journal~"))
3243 return -EINVAL;
3244 }
7560fffc 3245
0284adc6
LP
3246 f = new0(JournalFile, 1);
3247 if (!f)
3248 return -ENOMEM;
7560fffc 3249
5d1ce257 3250 f->fd = fd;
0284adc6 3251 f->mode = mode;
7560fffc 3252
0284adc6
LP
3253 f->flags = flags;
3254 f->prot = prot_from_flags(flags);
3255 f->writable = (flags & O_ACCMODE) != O_RDONLY;
349cc4a5 3256#if HAVE_LZ4
d89c8fdf 3257 f->compress_lz4 = compress;
349cc4a5 3258#elif HAVE_XZ
d89c8fdf 3259 f->compress_xz = compress;
48b61739 3260#endif
349cc4a5 3261#if HAVE_GCRYPT
baed47c3 3262 f->seal = seal;
49a32d43 3263#endif
7560fffc 3264
0284adc6
LP
3265 if (mmap_cache)
3266 f->mmap = mmap_cache_ref(mmap_cache);
3267 else {
84168d80 3268 f->mmap = mmap_cache_new();
0284adc6
LP
3269 if (!f->mmap) {
3270 r = -ENOMEM;
3271 goto fail;
3272 }
3273 }
7560fffc 3274
7645c77b 3275 if (fname) {
5d1ce257 3276 f->path = strdup(fname);
7645c77b
ZJS
3277 if (!f->path) {
3278 r = -ENOMEM;
3279 goto fail;
3280 }
3281 } else {
3282 /* If we don't know the path, fill in something explanatory and vaguely useful */
3283 if (asprintf(&f->path, "/proc/self/%i", fd) < 0) {
3284 r = -ENOMEM;
3285 goto fail;
3286 }
0284adc6 3287 }
7560fffc 3288
4743015d 3289 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
a4bcff5b
LP
3290 if (!f->chain_cache) {
3291 r = -ENOMEM;
3292 goto fail;
3293 }
3294
0284adc6 3295 if (f->fd < 0) {
5d1ce257
LP
3296 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
3297 if (f->fd < 0) {
3298 r = -errno;
3299 goto fail;
3300 }
3301
3302 /* fds we opened here by us should also be closed by us. */
3303 f->close_fd = true;
7560fffc 3304 }
7560fffc 3305
be7cdd8e
VC
3306 f->cache_fd = mmap_cache_add_fd(f->mmap, f->fd);
3307 if (!f->cache_fd) {
3308 r = -ENOMEM;
3309 goto fail;
3310 }
3311
2678031a
LP
3312 r = journal_file_fstat(f);
3313 if (r < 0)
0284adc6 3314 goto fail;
7560fffc 3315
0284adc6 3316 if (f->last_stat.st_size == 0 && f->writable) {
11689d2a 3317
fc68c929 3318 (void) journal_file_warn_btrfs(f);
11689d2a 3319
fb0951b0
LP
3320 /* Let's attach the creation time to the journal file,
3321 * so that the vacuuming code knows the age of this
3322 * file even if the file might end up corrupted one
3323 * day... Ideally we'd just use the creation time many
3324 * file systems maintain for each file, but there is
3325 * currently no usable API to query this, hence let's
3326 * emulate this via extended attributes. If extended
3327 * attributes are not supported we'll just skip this,
7517e174 3328 * and rely solely on mtime/atime/ctime of the file. */
fb0951b0 3329
d61b600d 3330 fd_setcrtime(f->fd, 0);
7560fffc 3331
349cc4a5 3332#if HAVE_GCRYPT
0284adc6 3333 /* Try to load the FSPRG state, and if we can't, then
baed47c3 3334 * just don't do sealing */
49a32d43
LP
3335 if (f->seal) {
3336 r = journal_file_fss_load(f);
3337 if (r < 0)
3338 f->seal = false;
3339 }
feb12d3e 3340#endif
7560fffc 3341
0284adc6
LP
3342 r = journal_file_init_header(f, template);
3343 if (r < 0)
3344 goto fail;
7560fffc 3345
2678031a
LP
3346 r = journal_file_fstat(f);
3347 if (r < 0)
0284adc6 3348 goto fail;
fb0951b0
LP
3349
3350 newly_created = true;
0284adc6 3351 }
7560fffc 3352
0284adc6 3353 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
cfb571f3 3354 r = -ENODATA;
0284adc6
LP
3355 goto fail;
3356 }
7560fffc 3357
b42549ad 3358 r = mmap_cache_get(f->mmap, f->cache_fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h, NULL);
977eaa1e 3359 if (r < 0)
0284adc6 3360 goto fail;
7560fffc 3361
fa6ac760
LP
3362 f->header = h;
3363
0284adc6 3364 if (!newly_created) {
f9168190 3365 set_clear_with_destructor(deferred_closes, journal_file_close);
b58c888f 3366
0284adc6
LP
3367 r = journal_file_verify_header(f);
3368 if (r < 0)
3369 goto fail;
3370 }
7560fffc 3371
349cc4a5 3372#if HAVE_GCRYPT
0284adc6 3373 if (!newly_created && f->writable) {
baed47c3 3374 r = journal_file_fss_load(f);
0284adc6
LP
3375 if (r < 0)
3376 goto fail;
3377 }
feb12d3e 3378#endif
cec736d2
LP
3379
3380 if (f->writable) {
4a92baf3
LP
3381 if (metrics) {
3382 journal_default_metrics(metrics, f->fd);
3383 f->metrics = *metrics;
3384 } else if (template)
3385 f->metrics = template->metrics;
3386
cec736d2
LP
3387 r = journal_file_refresh_header(f);
3388 if (r < 0)
3389 goto fail;
3390 }
3391
349cc4a5 3392#if HAVE_GCRYPT
baed47c3 3393 r = journal_file_hmac_setup(f);
14d10188
LP
3394 if (r < 0)
3395 goto fail;
feb12d3e 3396#endif
14d10188 3397
cec736d2 3398 if (newly_created) {
de190aef 3399 r = journal_file_setup_field_hash_table(f);
cec736d2
LP
3400 if (r < 0)
3401 goto fail;
3402
de190aef 3403 r = journal_file_setup_data_hash_table(f);
cec736d2
LP
3404 if (r < 0)
3405 goto fail;
7560fffc 3406
349cc4a5 3407#if HAVE_GCRYPT
7560fffc
LP
3408 r = journal_file_append_first_tag(f);
3409 if (r < 0)
3410 goto fail;
feb12d3e 3411#endif
cec736d2
LP
3412 }
3413
be7cdd8e 3414 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd)) {
fa6ac760
LP
3415 r = -EIO;
3416 goto fail;
3417 }
3418
7a24f3bf 3419 if (template && template->post_change_timer) {
e167d7fd
LP
3420 r = journal_file_enable_post_change_timer(
3421 f,
3422 sd_event_source_get_event(template->post_change_timer),
3423 template->post_change_timer_period);
7a24f3bf 3424
7a24f3bf
VC
3425 if (r < 0)
3426 goto fail;
3427 }
3428
f8e2f4d6 3429 /* The file is opened now successfully, thus we take possession of any passed in fd. */
5d1ce257
LP
3430 f->close_fd = true;
3431
0559d3a5 3432 *ret = f;
cec736d2
LP
3433 return 0;
3434
3435fail:
be7cdd8e 3436 if (f->cache_fd && mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
3437 r = -EIO;
3438
69a3a6fd 3439 (void) journal_file_close(f);
cec736d2
LP
3440
3441 return r;
3442}
0ac38b70 3443
b58c888f 3444int journal_file_rotate(JournalFile **f, bool compress, bool seal, Set *deferred_closes) {
57535f47 3445 _cleanup_free_ char *p = NULL;
0ac38b70
LP
3446 size_t l;
3447 JournalFile *old_file, *new_file = NULL;
3448 int r;
3449
3450 assert(f);
3451 assert(*f);
3452
3453 old_file = *f;
3454
3455 if (!old_file->writable)
3456 return -EINVAL;
3457
5d1ce257 3458 /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse
13e785f7 3459 * rotation, since we don't know the actual path, and couldn't rename the file hence. */
5d1ce257
LP
3460 if (path_startswith(old_file->path, "/proc/self/fd"))
3461 return -EINVAL;
3462
0ac38b70
LP
3463 if (!endswith(old_file->path, ".journal"))
3464 return -EINVAL;
3465
3466 l = strlen(old_file->path);
57535f47
ZJS
3467 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
3468 (int) l - 8, old_file->path,
3469 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
3470 le64toh((*f)->header->head_entry_seqnum),
3471 le64toh((*f)->header->head_entry_realtime));
3472 if (r < 0)
0ac38b70
LP
3473 return -ENOMEM;
3474
2678031a
LP
3475 /* Try to rename the file to the archived version. If the file
3476 * already was deleted, we'll get ENOENT, let's ignore that
3477 * case. */
0ac38b70 3478 r = rename(old_file->path, p);
2678031a 3479 if (r < 0 && errno != ENOENT)
0ac38b70
LP
3480 return -errno;
3481
1fcefd88
LP
3482 /* Sync the rename to disk */
3483 (void) fsync_directory_of_file(old_file->fd);
3484
8eb85171
VC
3485 /* Set as archive so offlining commits w/state=STATE_ARCHIVED.
3486 * Previously we would set old_file->header->state to STATE_ARCHIVED directly here,
3487 * but journal_file_set_offline() short-circuits when state != STATE_ONLINE, which
3488 * would result in the rotated journal never getting fsync() called before closing.
3489 * Now we simply queue the archive state by setting an archive bit, leaving the state
3490 * as STATE_ONLINE so proper offlining occurs. */
3491 old_file->archive = true;
0ac38b70 3492
f27a3864
LP
3493 /* Currently, btrfs is not very good with out write patterns
3494 * and fragments heavily. Let's defrag our journal files when
3495 * we archive them */
3496 old_file->defrag_on_close = true;
3497
5d1ce257 3498 r = journal_file_open(-1, old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, deferred_closes, old_file, &new_file);
b58c888f
VC
3499
3500 if (deferred_closes &&
3501 set_put(deferred_closes, old_file) >= 0)
3502 (void) journal_file_set_offline(old_file, false);
3503 else
3504 (void) journal_file_close(old_file);
0ac38b70
LP
3505
3506 *f = new_file;
3507 return r;
3508}
3509
9447a7f1
LP
3510int journal_file_open_reliably(
3511 const char *fname,
3512 int flags,
3513 mode_t mode,
7560fffc 3514 bool compress,
baed47c3 3515 bool seal,
4a92baf3 3516 JournalMetrics *metrics,
27370278 3517 MMapCache *mmap_cache,
b58c888f 3518 Set *deferred_closes,
9447a7f1
LP
3519 JournalFile *template,
3520 JournalFile **ret) {
3521
3522 int r;
3523 size_t l;
ed375beb 3524 _cleanup_free_ char *p = NULL;
9447a7f1 3525
5d1ce257 3526 r = journal_file_open(-1, fname, flags, mode, compress, seal, metrics, mmap_cache, deferred_closes, template, ret);
288359db 3527 if (!IN_SET(r,
b288cdeb
ZJS
3528 -EBADMSG, /* Corrupted */
3529 -ENODATA, /* Truncated */
3530 -EHOSTDOWN, /* Other machine */
3531 -EPROTONOSUPPORT, /* Incompatible feature */
3532 -EBUSY, /* Unclean shutdown */
3533 -ESHUTDOWN, /* Already archived */
288359db 3534 -EIO, /* IO error, including SIGBUS on mmap */
ae739cc1
LP
3535 -EIDRM, /* File has been deleted */
3536 -ETXTBSY)) /* File is from the future */
9447a7f1
LP
3537 return r;
3538
3539 if ((flags & O_ACCMODE) == O_RDONLY)
3540 return r;
3541
3542 if (!(flags & O_CREAT))
3543 return r;
3544
7560fffc
LP
3545 if (!endswith(fname, ".journal"))
3546 return r;
3547
5c70eab4
LP
3548 /* The file is corrupted. Rotate it away and try it again (but only once) */
3549
9447a7f1 3550 l = strlen(fname);
d587eca5 3551 if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
57535f47 3552 (int) l - 8, fname,
d587eca5 3553 now(CLOCK_REALTIME),
9bf3b535 3554 random_u64()) < 0)
9447a7f1
LP
3555 return -ENOMEM;
3556
65089b82 3557 if (rename(fname, p) < 0)
9447a7f1
LP
3558 return -errno;
3559
f27a3864
LP
3560 /* btrfs doesn't cope well with our write pattern and
3561 * fragments heavily. Let's defrag all files we rotate */
11689d2a 3562
a67d68b8 3563 (void) chattr_path(p, 0, FS_NOCOW_FL);
f27a3864
LP
3564 (void) btrfs_defrag(p);
3565
65089b82 3566 log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
9447a7f1 3567
5d1ce257 3568 return journal_file_open(-1, fname, flags, mode, compress, seal, metrics, mmap_cache, deferred_closes, template, ret);
9447a7f1
LP
3569}
3570
cf244689
LP
3571int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
3572 uint64_t i, n;
3573 uint64_t q, xor_hash = 0;
3574 int r;
3575 EntryItem *items;
3576 dual_timestamp ts;
3577
3578 assert(from);
3579 assert(to);
3580 assert(o);
3581 assert(p);
3582
3583 if (!to->writable)
3584 return -EPERM;
3585
3586 ts.monotonic = le64toh(o->entry.monotonic);
3587 ts.realtime = le64toh(o->entry.realtime);
3588
cf244689 3589 n = journal_file_entry_n_items(o);
4faa7004
TA
3590 /* alloca() can't take 0, hence let's allocate at least one */
3591 items = alloca(sizeof(EntryItem) * MAX(1u, n));
cf244689
LP
3592
3593 for (i = 0; i < n; i++) {
4fd052ae
FC
3594 uint64_t l, h;
3595 le64_t le_hash;
cf244689
LP
3596 size_t t;
3597 void *data;
3598 Object *u;
3599
3600 q = le64toh(o->entry.items[i].object_offset);
3601 le_hash = o->entry.items[i].hash;
3602
3603 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
3604 if (r < 0)
3605 return r;
3606
3607 if (le_hash != o->data.hash)
3608 return -EBADMSG;
3609
3610 l = le64toh(o->object.size) - offsetof(Object, data.payload);
3611 t = (size_t) l;
3612
3613 /* We hit the limit on 32bit machines */
3614 if ((uint64_t) t != l)
3615 return -E2BIG;
3616
d89c8fdf 3617 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
349cc4a5 3618#if HAVE_XZ || HAVE_LZ4
a7f7d1bd 3619 size_t rsize = 0;
cf244689 3620
d89c8fdf
ZJS
3621 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
3622 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
3623 if (r < 0)
3624 return r;
cf244689
LP
3625
3626 data = from->compress_buffer;
3627 l = rsize;
3b1a55e1
ZJS
3628#else
3629 return -EPROTONOSUPPORT;
3630#endif
cf244689
LP
3631 } else
3632 data = o->data.payload;
3633
3634 r = journal_file_append_data(to, data, l, &u, &h);
3635 if (r < 0)
3636 return r;
3637
3638 xor_hash ^= le64toh(u->data.hash);
3639 items[i].object_offset = htole64(h);
3640 items[i].hash = u->data.hash;
3641
3642 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
3643 if (r < 0)
3644 return r;
3645 }
3646
fa6ac760
LP
3647 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
3648
be7cdd8e 3649 if (mmap_cache_got_sigbus(to->mmap, to->cache_fd))
fa6ac760
LP
3650 return -EIO;
3651
3652 return r;
cf244689 3653}
babfc091 3654
8580d1f7
LP
3655void journal_reset_metrics(JournalMetrics *m) {
3656 assert(m);
3657
3658 /* Set everything to "pick automatic values". */
3659
3660 *m = (JournalMetrics) {
3661 .min_use = (uint64_t) -1,
3662 .max_use = (uint64_t) -1,
3663 .min_size = (uint64_t) -1,
3664 .max_size = (uint64_t) -1,
3665 .keep_free = (uint64_t) -1,
3666 .n_max_files = (uint64_t) -1,
3667 };
3668}
3669
babfc091 3670void journal_default_metrics(JournalMetrics *m, int fd) {
8580d1f7 3671 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
babfc091 3672 struct statvfs ss;
8580d1f7 3673 uint64_t fs_size;
babfc091
LP
3674
3675 assert(m);
3676 assert(fd >= 0);
3677
3678 if (fstatvfs(fd, &ss) >= 0)
3679 fs_size = ss.f_frsize * ss.f_blocks;
8580d1f7
LP
3680 else {
3681 log_debug_errno(errno, "Failed to detremine disk size: %m");
3682 fs_size = 0;
3683 }
babfc091
LP
3684
3685 if (m->max_use == (uint64_t) -1) {
3686
3687 if (fs_size > 0) {
3688 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3689
3690 if (m->max_use > DEFAULT_MAX_USE_UPPER)
3691 m->max_use = DEFAULT_MAX_USE_UPPER;
3692
3693 if (m->max_use < DEFAULT_MAX_USE_LOWER)
3694 m->max_use = DEFAULT_MAX_USE_LOWER;
3695 } else
3696 m->max_use = DEFAULT_MAX_USE_LOWER;
3697 } else {
3698 m->max_use = PAGE_ALIGN(m->max_use);
3699
8580d1f7 3700 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
babfc091
LP
3701 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3702 }
3703
8580d1f7
LP
3704 if (m->min_use == (uint64_t) -1)
3705 m->min_use = DEFAULT_MIN_USE;
3706
3707 if (m->min_use > m->max_use)
3708 m->min_use = m->max_use;
3709
babfc091
LP
3710 if (m->max_size == (uint64_t) -1) {
3711 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3712
3713 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3714 m->max_size = DEFAULT_MAX_SIZE_UPPER;
3715 } else
3716 m->max_size = PAGE_ALIGN(m->max_size);
3717
8580d1f7
LP
3718 if (m->max_size != 0) {
3719 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3720 m->max_size = JOURNAL_FILE_SIZE_MIN;
babfc091 3721
8580d1f7
LP
3722 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3723 m->max_use = m->max_size*2;
3724 }
babfc091
LP
3725
3726 if (m->min_size == (uint64_t) -1)
3727 m->min_size = JOURNAL_FILE_SIZE_MIN;
3728 else {
3729 m->min_size = PAGE_ALIGN(m->min_size);
3730
3731 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3732 m->min_size = JOURNAL_FILE_SIZE_MIN;
3733
8580d1f7 3734 if (m->max_size != 0 && m->min_size > m->max_size)
babfc091
LP
3735 m->max_size = m->min_size;
3736 }
3737
3738 if (m->keep_free == (uint64_t) -1) {
3739
3740 if (fs_size > 0) {
8621b110 3741 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
babfc091
LP
3742
3743 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3744 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3745
3746 } else
3747 m->keep_free = DEFAULT_KEEP_FREE;
3748 }
3749
8580d1f7
LP
3750 if (m->n_max_files == (uint64_t) -1)
3751 m->n_max_files = DEFAULT_N_MAX_FILES;
3752
3753 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3754 format_bytes(a, sizeof(a), m->min_use),
3755 format_bytes(b, sizeof(b), m->max_use),
3756 format_bytes(c, sizeof(c), m->max_size),
3757 format_bytes(d, sizeof(d), m->min_size),
3758 format_bytes(e, sizeof(e), m->keep_free),
3759 m->n_max_files);
babfc091 3760}
08984293
LP
3761
3762int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
08984293 3763 assert(f);
c88cc6af 3764 assert(f->header);
08984293
LP
3765 assert(from || to);
3766
3767 if (from) {
162566a4
LP
3768 if (f->header->head_entry_realtime == 0)
3769 return -ENOENT;
08984293 3770
162566a4 3771 *from = le64toh(f->header->head_entry_realtime);
08984293
LP
3772 }
3773
3774 if (to) {
162566a4
LP
3775 if (f->header->tail_entry_realtime == 0)
3776 return -ENOENT;
08984293 3777
162566a4 3778 *to = le64toh(f->header->tail_entry_realtime);
08984293
LP
3779 }
3780
3781 return 1;
3782}
3783
3784int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
08984293
LP
3785 Object *o;
3786 uint64_t p;
3787 int r;
3788
3789 assert(f);
3790 assert(from || to);
3791
47838ab3 3792 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
08984293
LP
3793 if (r <= 0)
3794 return r;
3795
3796 if (le64toh(o->data.n_entries) <= 0)
3797 return 0;
3798
3799 if (from) {
3800 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3801 if (r < 0)
3802 return r;
3803
3804 *from = le64toh(o->entry.monotonic);
3805 }
3806
3807 if (to) {
3808 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3809 if (r < 0)
3810 return r;
3811
3812 r = generic_array_get_plus_one(f,
3813 le64toh(o->data.entry_offset),
3814 le64toh(o->data.entry_array_offset),
3815 le64toh(o->data.n_entries)-1,
3816 &o, NULL);
3817 if (r <= 0)
3818 return r;
3819
3820 *to = le64toh(o->entry.monotonic);
3821 }
3822
3823 return 1;
3824}
dca6219e 3825
fb0951b0 3826bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
dca6219e 3827 assert(f);
c88cc6af 3828 assert(f->header);
dca6219e
LP
3829
3830 /* If we gained new header fields we gained new features,
3831 * hence suggest a rotation */
361f9cbc
LP
3832 if (le64toh(f->header->header_size) < sizeof(Header)) {
3833 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
dca6219e 3834 return true;
361f9cbc 3835 }
dca6219e
LP
3836
3837 /* Let's check if the hash tables grew over a certain fill
3838 * level (75%, borrowing this value from Java's hash table
3839 * implementation), and if so suggest a rotation. To calculate
3840 * the fill level we need the n_data field, which only exists
3841 * in newer versions. */
3842
3843 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
361f9cbc 3844 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3845 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
361f9cbc
LP
3846 f->path,
3847 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3848 le64toh(f->header->n_data),
3849 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3850 (unsigned long long) f->last_stat.st_size,
3851 f->last_stat.st_size / le64toh(f->header->n_data));
dca6219e 3852 return true;
361f9cbc 3853 }
dca6219e
LP
3854
3855 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
361f9cbc 3856 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3857 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
361f9cbc
LP
3858 f->path,
3859 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3860 le64toh(f->header->n_fields),
3861 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
dca6219e 3862 return true;
361f9cbc 3863 }
dca6219e 3864
0598fd4a
LP
3865 /* Are the data objects properly indexed by field objects? */
3866 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3867 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3868 le64toh(f->header->n_data) > 0 &&
3869 le64toh(f->header->n_fields) == 0)
3870 return true;
3871
fb0951b0
LP
3872 if (max_file_usec > 0) {
3873 usec_t t, h;
3874
3875 h = le64toh(f->header->head_entry_realtime);
3876 t = now(CLOCK_REALTIME);
3877
3878 if (h > 0 && t > h + max_file_usec)
3879 return true;
3880 }
3881
dca6219e
LP
3882 return false;
3883}