]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/journal/journal-file.c
basic/fs-util: skip fsync_directory_of_file() if /proc/self/fd/ is not available...
[thirdparty/systemd.git] / src / journal / journal-file.c
CommitLineData
53e1b683 1/* SPDX-License-Identifier: LGPL-2.1+ */
cec736d2
LP
2/***
3 This file is part of systemd.
4
5 Copyright 2011 Lennart Poettering
6
7 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
8 under the terms of the GNU Lesser General Public License as published by
9 the Free Software Foundation; either version 2.1 of the License, or
cec736d2
LP
10 (at your option) any later version.
11
12 systemd is distributed in the hope that it will be useful, but
13 WITHOUT ANY WARRANTY; without even the implied warranty of
14 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 15 Lesser General Public License for more details.
cec736d2 16
5430f7f2 17 You should have received a copy of the GNU Lesser General Public License
cec736d2
LP
18 along with systemd; If not, see <http://www.gnu.org/licenses/>.
19***/
20
cec736d2 21#include <errno.h>
cec736d2 22#include <fcntl.h>
11689d2a 23#include <linux/fs.h>
ac2e41f5 24#include <pthread.h>
07630cea
LP
25#include <stddef.h>
26#include <sys/mman.h>
27#include <sys/statvfs.h>
28#include <sys/uio.h>
29#include <unistd.h>
fb0951b0 30
b5efdb8a 31#include "alloc-util.h"
f27a3864 32#include "btrfs-util.h"
c8b3094d 33#include "chattr-util.h"
07630cea 34#include "compress.h"
3ffd4af2 35#include "fd-util.h"
11b29a96 36#include "fs-util.h"
0284adc6 37#include "journal-authenticate.h"
cec736d2
LP
38#include "journal-def.h"
39#include "journal-file.h"
40#include "lookup3.h"
6bedfcbb 41#include "parse-util.h"
5d1ce257 42#include "path-util.h"
3df3e884 43#include "random-util.h"
7a24f3bf 44#include "sd-event.h"
b58c888f 45#include "set.h"
3cc44114 46#include "stat-util.h"
07630cea 47#include "string-util.h"
4761fd0f 48#include "strv.h"
89a5a90c 49#include "xattr-util.h"
cec736d2 50
4a92baf3
LP
51#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
52#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
cec736d2 53
be19b7df 54#define COMPRESSION_SIZE_THRESHOLD (512ULL)
807e17f0 55
babfc091 56/* This is the minimum journal file size */
16098e93 57#define JOURNAL_FILE_SIZE_MIN (512ULL*1024ULL) /* 512 KiB */
babfc091
LP
58
59/* These are the lower and upper bounds if we deduce the max_use value
60 * from the file system size */
61#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
62#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
63
8580d1f7
LP
64/* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
65#define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */
66
babfc091 67/* This is the upper bound if we deduce max_size from max_use */
71100051 68#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
babfc091
LP
69
70/* This is the upper bound if we deduce the keep_free value from the
71 * file system size */
72#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
73
74/* This is the keep_free value when we can't determine the system
75 * size */
76#define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
77
8580d1f7
LP
78/* This is the default maximum number of journal files to keep around. */
79#define DEFAULT_N_MAX_FILES (100)
80
dca6219e
LP
81/* n_data was the first entry we added after the initial file format design */
82#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
cec736d2 83
a4bcff5b
LP
84/* How many entries to keep in the entry array chain cache at max */
85#define CHAIN_CACHE_MAX 20
86
a676e665
LP
87/* How much to increase the journal file size at once each time we allocate something new. */
88#define FILE_SIZE_INCREASE (8ULL*1024ULL*1024ULL) /* 8MB */
89
2678031a
LP
90/* Reread fstat() of the file for detecting deletions at least this often */
91#define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC)
92
fa6ac760
LP
93/* The mmap context to use for the header: we pick the one right above the last defined object type */
94#define CONTEXT_HEADER _OBJECT_TYPE_MAX
95
51804460
ZJS
96#ifdef __clang__
97# pragma GCC diagnostic ignored "-Waddress-of-packed-member"
98#endif
99
ac2e41f5
VC
100/* This may be called from a separate thread to prevent blocking the caller for the duration of fsync().
101 * As a result we use atomic operations on f->offline_state for inter-thread communications with
102 * journal_file_set_offline() and journal_file_set_online(). */
103static void journal_file_set_offline_internal(JournalFile *f) {
26687bf8 104 assert(f);
ac2e41f5
VC
105 assert(f->fd >= 0);
106 assert(f->header);
107
108 for (;;) {
109 switch (f->offline_state) {
110 case OFFLINE_CANCEL:
111 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_DONE))
112 continue;
113 return;
114
115 case OFFLINE_AGAIN_FROM_SYNCING:
116 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_SYNCING))
117 continue;
118 break;
119
120 case OFFLINE_AGAIN_FROM_OFFLINING:
121 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_SYNCING))
122 continue;
123 break;
124
125 case OFFLINE_SYNCING:
126 (void) fsync(f->fd);
26687bf8 127
ac2e41f5
VC
128 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_OFFLINING))
129 continue;
26687bf8 130
8eb85171 131 f->header->state = f->archive ? STATE_ARCHIVED : STATE_OFFLINE;
ac2e41f5
VC
132 (void) fsync(f->fd);
133 break;
134
135 case OFFLINE_OFFLINING:
136 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_DONE))
137 continue;
4831981d 138 _fallthrough_;
ac2e41f5
VC
139 case OFFLINE_DONE:
140 return;
141
142 case OFFLINE_JOINED:
143 log_debug("OFFLINE_JOINED unexpected offline state for journal_file_set_offline_internal()");
144 return;
145 }
146 }
147}
148
149static void * journal_file_set_offline_thread(void *arg) {
150 JournalFile *f = arg;
151
fa7ff4cf
LP
152 (void) pthread_setname_np(pthread_self(), "journal-offline");
153
ac2e41f5
VC
154 journal_file_set_offline_internal(f);
155
156 return NULL;
157}
158
159static int journal_file_set_offline_thread_join(JournalFile *f) {
160 int r;
161
162 assert(f);
163
164 if (f->offline_state == OFFLINE_JOINED)
165 return 0;
166
167 r = pthread_join(f->offline_thread, NULL);
168 if (r)
169 return -r;
170
171 f->offline_state = OFFLINE_JOINED;
26687bf8 172
be7cdd8e 173 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
174 return -EIO;
175
ac2e41f5
VC
176 return 0;
177}
26687bf8 178
ac2e41f5
VC
179/* Trigger a restart if the offline thread is mid-flight in a restartable state. */
180static bool journal_file_set_offline_try_restart(JournalFile *f) {
181 for (;;) {
182 switch (f->offline_state) {
183 case OFFLINE_AGAIN_FROM_SYNCING:
184 case OFFLINE_AGAIN_FROM_OFFLINING:
185 return true;
186
187 case OFFLINE_CANCEL:
188 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_CANCEL, OFFLINE_AGAIN_FROM_SYNCING))
189 continue;
190 return true;
191
192 case OFFLINE_SYNCING:
193 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_AGAIN_FROM_SYNCING))
194 continue;
195 return true;
196
197 case OFFLINE_OFFLINING:
198 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_OFFLINING, OFFLINE_AGAIN_FROM_OFFLINING))
199 continue;
200 return true;
26687bf8
OS
201
202 default:
ac2e41f5
VC
203 return false;
204 }
26687bf8
OS
205 }
206}
207
ac2e41f5
VC
208/* Sets a journal offline.
209 *
210 * If wait is false then an offline is dispatched in a separate thread for a
211 * subsequent journal_file_set_offline() or journal_file_set_online() of the
212 * same journal to synchronize with.
213 *
214 * If wait is true, then either an existing offline thread will be restarted
215 * and joined, or if none exists the offline is simply performed in this
216 * context without involving another thread.
217 */
218int journal_file_set_offline(JournalFile *f, bool wait) {
219 bool restarted;
220 int r;
221
26687bf8
OS
222 assert(f);
223
224 if (!f->writable)
225 return -EPERM;
226
227 if (!(f->fd >= 0 && f->header))
228 return -EINVAL;
229
b8f99e27
VC
230 /* An offlining journal is implicitly online and may modify f->header->state,
231 * we must also join any potentially lingering offline thread when not online. */
232 if (!journal_file_is_offlining(f) && f->header->state != STATE_ONLINE)
233 return journal_file_set_offline_thread_join(f);
26687bf8 234
ac2e41f5
VC
235 /* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */
236 restarted = journal_file_set_offline_try_restart(f);
237 if ((restarted && wait) || !restarted) {
238 r = journal_file_set_offline_thread_join(f);
239 if (r < 0)
240 return r;
241 }
26687bf8 242
ac2e41f5
VC
243 if (restarted)
244 return 0;
245
246 /* Initiate a new offline. */
247 f->offline_state = OFFLINE_SYNCING;
fa6ac760 248
ac2e41f5
VC
249 if (wait) /* Without using a thread if waiting. */
250 journal_file_set_offline_internal(f);
251 else {
5e9f01e8
LP
252 sigset_t ss, saved_ss;
253 int k;
254
255 if (sigfillset(&ss) < 0)
256 return -errno;
257
258 r = pthread_sigmask(SIG_BLOCK, &ss, &saved_ss);
259 if (r > 0)
260 return -r;
261
ac2e41f5 262 r = pthread_create(&f->offline_thread, NULL, journal_file_set_offline_thread, f);
5e9f01e8
LP
263
264 k = pthread_sigmask(SIG_SETMASK, &saved_ss, NULL);
ec9ffa2c
VC
265 if (r > 0) {
266 f->offline_state = OFFLINE_JOINED;
ac2e41f5 267 return -r;
ec9ffa2c 268 }
5e9f01e8
LP
269 if (k > 0)
270 return -k;
ac2e41f5
VC
271 }
272
273 return 0;
274}
275
276static int journal_file_set_online(JournalFile *f) {
277 bool joined = false;
278
279 assert(f);
280
281 if (!f->writable)
282 return -EPERM;
283
284 if (!(f->fd >= 0 && f->header))
285 return -EINVAL;
286
287 while (!joined) {
288 switch (f->offline_state) {
289 case OFFLINE_JOINED:
290 /* No offline thread, no need to wait. */
291 joined = true;
292 break;
293
294 case OFFLINE_SYNCING:
295 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_CANCEL))
296 continue;
297 /* Canceled syncing prior to offlining, no need to wait. */
298 break;
299
300 case OFFLINE_AGAIN_FROM_SYNCING:
301 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_SYNCING, OFFLINE_CANCEL))
302 continue;
303 /* Canceled restart from syncing, no need to wait. */
304 break;
305
306 case OFFLINE_AGAIN_FROM_OFFLINING:
307 if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_AGAIN_FROM_OFFLINING, OFFLINE_CANCEL))
308 continue;
309 /* Canceled restart from offlining, must wait for offlining to complete however. */
4831981d 310 _fallthrough_;
ac2e41f5
VC
311 default: {
312 int r;
313
314 r = journal_file_set_offline_thread_join(f);
315 if (r < 0)
316 return r;
317
318 joined = true;
319 break;
320 }
321 }
322 }
26687bf8 323
be7cdd8e 324 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
325 return -EIO;
326
ac2e41f5
VC
327 switch (f->header->state) {
328 case STATE_ONLINE:
329 return 0;
26687bf8 330
ac2e41f5
VC
331 case STATE_OFFLINE:
332 f->header->state = STATE_ONLINE;
333 (void) fsync(f->fd);
334 return 0;
335
336 default:
337 return -EINVAL;
338 }
26687bf8
OS
339}
340
b58c888f
VC
341bool journal_file_is_offlining(JournalFile *f) {
342 assert(f);
343
344 __sync_synchronize();
345
3742095b 346 if (IN_SET(f->offline_state, OFFLINE_DONE, OFFLINE_JOINED))
b58c888f
VC
347 return false;
348
349 return true;
350}
351
804ae586 352JournalFile* journal_file_close(JournalFile *f) {
de190aef 353 assert(f);
cec736d2 354
349cc4a5 355#if HAVE_GCRYPT
b0af6f41 356 /* Write the final tag */
43cd8794
FB
357 if (f->seal && f->writable) {
358 int r;
359
360 r = journal_file_append_tag(f);
361 if (r < 0)
362 log_error_errno(r, "Failed to append tag when closing journal: %m");
363 }
feb12d3e 364#endif
b0af6f41 365
7a24f3bf
VC
366 if (f->post_change_timer) {
367 int enabled;
368
369 if (sd_event_source_get_enabled(f->post_change_timer, &enabled) >= 0)
370 if (enabled == SD_EVENT_ONESHOT)
371 journal_file_post_change(f);
372
e167d7fd 373 (void) sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_OFF);
7a24f3bf
VC
374 sd_event_source_unref(f->post_change_timer);
375 }
376
ac2e41f5 377 journal_file_set_offline(f, true);
cec736d2 378
be7cdd8e
VC
379 if (f->mmap && f->cache_fd)
380 mmap_cache_free_fd(f->mmap, f->cache_fd);
cec736d2 381
11689d2a
LP
382 if (f->fd >= 0 && f->defrag_on_close) {
383
384 /* Be friendly to btrfs: turn COW back on again now,
385 * and defragment the file. We won't write to the file
386 * ever again, hence remove all fragmentation, and
387 * reenable all the good bits COW usually provides
388 * (such as data checksumming). */
389
1ed8f8c1 390 (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
11689d2a
LP
391 (void) btrfs_defrag_fd(f->fd);
392 }
f27a3864 393
5d1ce257
LP
394 if (f->close_fd)
395 safe_close(f->fd);
cec736d2 396 free(f->path);
807e17f0 397
f649045c 398 mmap_cache_unref(f->mmap);
16e9f408 399
4743015d 400 ordered_hashmap_free_free(f->chain_cache);
a4bcff5b 401
349cc4a5 402#if HAVE_XZ || HAVE_LZ4
807e17f0
LP
403 free(f->compress_buffer);
404#endif
405
349cc4a5 406#if HAVE_GCRYPT
baed47c3
LP
407 if (f->fss_file)
408 munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
dc4ebc07 409 else
b7c9ae91
LP
410 free(f->fsprg_state);
411
412 free(f->fsprg_seed);
7560fffc
LP
413
414 if (f->hmac)
415 gcry_md_close(f->hmac);
416#endif
417
6b430fdb 418 return mfree(f);
cec736d2
LP
419}
420
0ac38b70 421static int journal_file_init_header(JournalFile *f, JournalFile *template) {
d89c8fdf 422 Header h = {};
cec736d2
LP
423 ssize_t k;
424 int r;
425
426 assert(f);
427
7560fffc 428 memcpy(h.signature, HEADER_SIGNATURE, 8);
23b0b2b2 429 h.header_size = htole64(ALIGN64(sizeof(h)));
cec736d2 430
d89c8fdf
ZJS
431 h.incompatible_flags |= htole32(
432 f->compress_xz * HEADER_INCOMPATIBLE_COMPRESSED_XZ |
433 f->compress_lz4 * HEADER_INCOMPATIBLE_COMPRESSED_LZ4);
7560fffc 434
d89c8fdf
ZJS
435 h.compatible_flags = htole32(
436 f->seal * HEADER_COMPATIBLE_SEALED);
7560fffc 437
cec736d2
LP
438 r = sd_id128_randomize(&h.file_id);
439 if (r < 0)
440 return r;
441
0ac38b70
LP
442 if (template) {
443 h.seqnum_id = template->header->seqnum_id;
beec0085 444 h.tail_entry_seqnum = template->header->tail_entry_seqnum;
0ac38b70
LP
445 } else
446 h.seqnum_id = h.file_id;
cec736d2
LP
447
448 k = pwrite(f->fd, &h, sizeof(h), 0);
449 if (k < 0)
450 return -errno;
451
452 if (k != sizeof(h))
453 return -EIO;
454
455 return 0;
456}
457
458static int journal_file_refresh_header(JournalFile *f) {
de190aef 459 sd_id128_t boot_id;
fa6ac760 460 int r;
cec736d2
LP
461
462 assert(f);
c88cc6af 463 assert(f->header);
cec736d2
LP
464
465 r = sd_id128_get_machine(&f->header->machine_id);
466 if (r < 0)
467 return r;
468
de190aef 469 r = sd_id128_get_boot(&boot_id);
cec736d2
LP
470 if (r < 0)
471 return r;
472
de190aef
LP
473 if (sd_id128_equal(boot_id, f->header->boot_id))
474 f->tail_entry_monotonic_valid = true;
475
476 f->header->boot_id = boot_id;
477
fa6ac760 478 r = journal_file_set_online(f);
b788cc23 479
7560fffc 480 /* Sync the online state to disk */
fb426037 481 (void) fsync(f->fd);
b788cc23 482
a0fe2a2d
LP
483 /* We likely just created a new file, also sync the directory this file is located in. */
484 (void) fsync_directory_of_file(f->fd);
485
fa6ac760 486 return r;
cec736d2
LP
487}
488
4214009f
ZJS
489static bool warn_wrong_flags(const JournalFile *f, bool compatible) {
490 const uint32_t any = compatible ? HEADER_COMPATIBLE_ANY : HEADER_INCOMPATIBLE_ANY,
491 supported = compatible ? HEADER_COMPATIBLE_SUPPORTED : HEADER_INCOMPATIBLE_SUPPORTED;
492 const char *type = compatible ? "compatible" : "incompatible";
d89c8fdf
ZJS
493 uint32_t flags;
494
4214009f
ZJS
495 flags = le32toh(compatible ? f->header->compatible_flags : f->header->incompatible_flags);
496
497 if (flags & ~supported) {
498 if (flags & ~any)
4761fd0f 499 log_debug("Journal file %s has unknown %s flags 0x%"PRIx32,
4214009f
ZJS
500 f->path, type, flags & ~any);
501 flags = (flags & any) & ~supported;
4761fd0f
ZJS
502 if (flags) {
503 const char* strv[3];
504 unsigned n = 0;
505 _cleanup_free_ char *t = NULL;
506
507 if (compatible && (flags & HEADER_COMPATIBLE_SEALED))
508 strv[n++] = "sealed";
509 if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_XZ))
510 strv[n++] = "xz-compressed";
511 if (!compatible && (flags & HEADER_INCOMPATIBLE_COMPRESSED_LZ4))
512 strv[n++] = "lz4-compressed";
513 strv[n] = NULL;
514 assert(n < ELEMENTSOF(strv));
515
516 t = strv_join((char**) strv, ", ");
517 log_debug("Journal file %s uses %s %s %s disabled at compilation time.",
518 f->path, type, n > 1 ? "flags" : "flag", strnull(t));
519 }
4214009f
ZJS
520 return true;
521 }
522
523 return false;
524}
525
526static int journal_file_verify_header(JournalFile *f) {
6f94e420
TS
527 uint64_t arena_size, header_size;
528
cec736d2 529 assert(f);
c88cc6af 530 assert(f->header);
cec736d2 531
7560fffc 532 if (memcmp(f->header->signature, HEADER_SIGNATURE, 8))
cec736d2
LP
533 return -EBADMSG;
534
4214009f
ZJS
535 /* In both read and write mode we refuse to open files with incompatible
536 * flags we don't know. */
537 if (warn_wrong_flags(f, false))
cec736d2
LP
538 return -EPROTONOSUPPORT;
539
4214009f
ZJS
540 /* When open for writing we refuse to open files with compatible flags, too. */
541 if (f->writable && warn_wrong_flags(f, true))
d89c8fdf 542 return -EPROTONOSUPPORT;
7560fffc 543
db11ac1a
LP
544 if (f->header->state >= _STATE_MAX)
545 return -EBADMSG;
546
6f94e420
TS
547 header_size = le64toh(f->header->header_size);
548
dca6219e 549 /* The first addition was n_data, so check that we are at least this large */
6f94e420 550 if (header_size < HEADER_SIZE_MIN)
23b0b2b2
LP
551 return -EBADMSG;
552
8088cbd3 553 if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
beec0085
LP
554 return -EBADMSG;
555
6f94e420
TS
556 arena_size = le64toh(f->header->arena_size);
557
558 if (UINT64_MAX - header_size < arena_size || header_size + arena_size > (uint64_t) f->last_stat.st_size)
db11ac1a
LP
559 return -ENODATA;
560
6f94e420 561 if (le64toh(f->header->tail_object_offset) > header_size + arena_size)
db11ac1a
LP
562 return -ENODATA;
563
7762e02b
LP
564 if (!VALID64(le64toh(f->header->data_hash_table_offset)) ||
565 !VALID64(le64toh(f->header->field_hash_table_offset)) ||
566 !VALID64(le64toh(f->header->tail_object_offset)) ||
567 !VALID64(le64toh(f->header->entry_array_offset)))
568 return -ENODATA;
569
cec736d2 570 if (f->writable) {
cec736d2 571 sd_id128_t machine_id;
ae739cc1 572 uint8_t state;
cec736d2
LP
573 int r;
574
575 r = sd_id128_get_machine(&machine_id);
576 if (r < 0)
577 return r;
578
579 if (!sd_id128_equal(machine_id, f->header->machine_id))
580 return -EHOSTDOWN;
581
de190aef 582 state = f->header->state;
cec736d2 583
b288cdeb
ZJS
584 if (state == STATE_ARCHIVED)
585 return -ESHUTDOWN; /* Already archived */
586 else if (state == STATE_ONLINE) {
71fa6f00
LP
587 log_debug("Journal file %s is already online. Assuming unclean closing.", f->path);
588 return -EBUSY;
b288cdeb 589 } else if (state != STATE_OFFLINE) {
8facc349 590 log_debug("Journal file %s has unknown state %i.", f->path, state);
71fa6f00
LP
591 return -EBUSY;
592 }
ae739cc1 593
5b3cc0c8
YN
594 if (f->header->field_hash_table_size == 0 || f->header->data_hash_table_size == 0)
595 return -EBADMSG;
596
ae739cc1
LP
597 /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't
598 * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks
599 * bisection. */
600 if (le64toh(f->header->tail_entry_realtime) > now(CLOCK_REALTIME)) {
601 log_debug("Journal file %s is from the future, refusing to append new data to it that'd be older.", f->path);
602 return -ETXTBSY;
603 }
cec736d2
LP
604 }
605
d89c8fdf
ZJS
606 f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
607 f->compress_lz4 = JOURNAL_HEADER_COMPRESSED_LZ4(f->header);
c586dbf1 608
f1889c91 609 f->seal = JOURNAL_HEADER_SEALED(f->header);
7560fffc 610
cec736d2
LP
611 return 0;
612}
613
2678031a 614static int journal_file_fstat(JournalFile *f) {
3cc44114
LP
615 int r;
616
2678031a
LP
617 assert(f);
618 assert(f->fd >= 0);
619
620 if (fstat(f->fd, &f->last_stat) < 0)
621 return -errno;
622
623 f->last_stat_usec = now(CLOCK_MONOTONIC);
624
8d6a4d33 625 /* Refuse dealing with with files that aren't regular */
3cc44114
LP
626 r = stat_verify_regular(&f->last_stat);
627 if (r < 0)
628 return r;
8d6a4d33 629
2678031a
LP
630 /* Refuse appending to files that are already deleted */
631 if (f->last_stat.st_nlink <= 0)
632 return -EIDRM;
633
634 return 0;
635}
636
cec736d2 637static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
eda4b58b 638 uint64_t old_size, new_size;
fec2aa2f 639 int r;
cec736d2
LP
640
641 assert(f);
c88cc6af 642 assert(f->header);
cec736d2 643
cec736d2 644 /* We assume that this file is not sparse, and we know that
38ac38b2 645 * for sure, since we always call posix_fallocate()
cec736d2
LP
646 * ourselves */
647
be7cdd8e 648 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
649 return -EIO;
650
cec736d2 651 old_size =
23b0b2b2 652 le64toh(f->header->header_size) +
cec736d2
LP
653 le64toh(f->header->arena_size);
654
bc85bfee 655 new_size = PAGE_ALIGN(offset + size);
23b0b2b2
LP
656 if (new_size < le64toh(f->header->header_size))
657 new_size = le64toh(f->header->header_size);
bc85bfee 658
2678031a
LP
659 if (new_size <= old_size) {
660
661 /* We already pre-allocated enough space, but before
662 * we write to it, let's check with fstat() if the
663 * file got deleted, in order make sure we don't throw
664 * away the data immediately. Don't check fstat() for
665 * all writes though, but only once ever 10s. */
666
667 if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC))
668 return 0;
669
670 return journal_file_fstat(f);
671 }
672
673 /* Allocate more space. */
cec736d2 674
a676e665 675 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
bc85bfee 676 return -E2BIG;
cec736d2 677
a676e665 678 if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) {
cec736d2
LP
679 struct statvfs svfs;
680
681 if (fstatvfs(f->fd, &svfs) >= 0) {
682 uint64_t available;
683
070052ab 684 available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
cec736d2
LP
685
686 if (new_size - old_size > available)
687 return -E2BIG;
688 }
689 }
690
eda4b58b
LP
691 /* Increase by larger blocks at once */
692 new_size = ((new_size+FILE_SIZE_INCREASE-1) / FILE_SIZE_INCREASE) * FILE_SIZE_INCREASE;
693 if (f->metrics.max_size > 0 && new_size > f->metrics.max_size)
694 new_size = f->metrics.max_size;
695
bc85bfee
LP
696 /* Note that the glibc fallocate() fallback is very
697 inefficient, hence we try to minimize the allocation area
698 as we can. */
fec2aa2f
GV
699 r = posix_fallocate(f->fd, old_size, new_size - old_size);
700 if (r != 0)
701 return -r;
cec736d2 702
23b0b2b2 703 f->header->arena_size = htole64(new_size - le64toh(f->header->header_size));
cec736d2 704
2678031a 705 return journal_file_fstat(f);
cec736d2
LP
706}
707
78519831 708static unsigned type_to_context(ObjectType type) {
d3d3208f 709 /* One context for each type, plus one catch-all for the rest */
69adae51 710 assert_cc(_OBJECT_TYPE_MAX <= MMAP_CACHE_MAX_CONTEXTS);
fa6ac760 711 assert_cc(CONTEXT_HEADER < MMAP_CACHE_MAX_CONTEXTS);
d05089d8 712 return type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX ? type : 0;
d3d3208f
MS
713}
714
b439282e 715static int journal_file_move_to(JournalFile *f, ObjectType type, bool keep_always, uint64_t offset, uint64_t size, void **ret, size_t *ret_size) {
2678031a
LP
716 int r;
717
cec736d2 718 assert(f);
cec736d2
LP
719 assert(ret);
720
7762e02b
LP
721 if (size <= 0)
722 return -EINVAL;
723
2a59ea54 724 /* Avoid SIGBUS on invalid accesses */
4bbdcdb3
LP
725 if (offset + size > (uint64_t) f->last_stat.st_size) {
726 /* Hmm, out of range? Let's refresh the fstat() data
727 * first, before we trust that check. */
728
2678031a
LP
729 r = journal_file_fstat(f);
730 if (r < 0)
731 return r;
732
733 if (offset + size > (uint64_t) f->last_stat.st_size)
4bbdcdb3
LP
734 return -EADDRNOTAVAIL;
735 }
736
b439282e 737 return mmap_cache_get(f->mmap, f->cache_fd, f->prot, type_to_context(type), keep_always, offset, size, &f->last_stat, ret, ret_size);
cec736d2
LP
738}
739
16e9f408
LP
740static uint64_t minimum_header_size(Object *o) {
741
b8e891e6 742 static const uint64_t table[] = {
16e9f408
LP
743 [OBJECT_DATA] = sizeof(DataObject),
744 [OBJECT_FIELD] = sizeof(FieldObject),
745 [OBJECT_ENTRY] = sizeof(EntryObject),
746 [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject),
747 [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject),
748 [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject),
749 [OBJECT_TAG] = sizeof(TagObject),
750 };
751
752 if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0)
753 return sizeof(ObjectHeader);
754
755 return table[o->object.type];
756}
757
24754f36
TR
758/* Lightweight object checks. We want this to be fast, so that we won't
759 * slowdown every journal_file_move_to_object() call too much. */
760static int journal_file_check_object(JournalFile *f, uint64_t offset, Object *o) {
761 assert(f);
762 assert(o);
763
764 switch (o->object.type) {
765
766 case OBJECT_DATA: {
767 if ((le64toh(o->data.entry_offset) == 0) ^ (le64toh(o->data.n_entries) == 0)) {
768 log_debug("Bad n_entries: %"PRIu64": %"PRIu64,
10e8445b 769 le64toh(o->data.n_entries), offset);
24754f36
TR
770 return -EBADMSG;
771 }
772
773 if (le64toh(o->object.size) - offsetof(DataObject, payload) <= 0) {
774 log_debug("Bad object size (<= %zu): %"PRIu64": %"PRIu64,
775 offsetof(DataObject, payload),
776 le64toh(o->object.size),
777 offset);
778 return -EBADMSG;
779 }
780
10e8445b
TR
781 if (!VALID64(le64toh(o->data.next_hash_offset)) ||
782 !VALID64(le64toh(o->data.next_field_offset)) ||
783 !VALID64(le64toh(o->data.entry_offset)) ||
784 !VALID64(le64toh(o->data.entry_array_offset))) {
24754f36
TR
785 log_debug("Invalid offset, next_hash_offset="OFSfmt", next_field_offset="OFSfmt
786 ", entry_offset="OFSfmt", entry_array_offset="OFSfmt": %"PRIu64,
10e8445b
TR
787 le64toh(o->data.next_hash_offset),
788 le64toh(o->data.next_field_offset),
789 le64toh(o->data.entry_offset),
790 le64toh(o->data.entry_array_offset),
24754f36
TR
791 offset);
792 return -EBADMSG;
793 }
794
795 break;
796 }
797
798 case OBJECT_FIELD:
799 if (le64toh(o->object.size) - offsetof(FieldObject, payload) <= 0) {
800 log_debug(
801 "Bad field size (<= %zu): %"PRIu64": %"PRIu64,
802 offsetof(FieldObject, payload),
803 le64toh(o->object.size),
804 offset);
805 return -EBADMSG;
806 }
807
10e8445b
TR
808 if (!VALID64(le64toh(o->field.next_hash_offset)) ||
809 !VALID64(le64toh(o->field.head_data_offset))) {
24754f36
TR
810 log_debug(
811 "Invalid offset, next_hash_offset="OFSfmt
812 ", head_data_offset="OFSfmt": %"PRIu64,
10e8445b
TR
813 le64toh(o->field.next_hash_offset),
814 le64toh(o->field.head_data_offset),
24754f36
TR
815 offset);
816 return -EBADMSG;
817 }
818 break;
819
820 case OBJECT_ENTRY:
821 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) % sizeof(EntryItem) != 0) {
822 log_debug(
823 "Bad entry size (<= %zu): %"PRIu64": %"PRIu64,
824 offsetof(EntryObject, items),
825 le64toh(o->object.size),
826 offset);
827 return -EBADMSG;
828 }
829
830 if ((le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem) <= 0) {
831 log_debug(
832 "Invalid number items in entry: %"PRIu64": %"PRIu64,
833 (le64toh(o->object.size) - offsetof(EntryObject, items)) / sizeof(EntryItem),
834 offset);
835 return -EBADMSG;
836 }
837
838 if (le64toh(o->entry.seqnum) <= 0) {
839 log_debug(
840 "Invalid entry seqnum: %"PRIx64": %"PRIu64,
841 le64toh(o->entry.seqnum),
842 offset);
843 return -EBADMSG;
844 }
845
846 if (!VALID_REALTIME(le64toh(o->entry.realtime))) {
847 log_debug(
848 "Invalid entry realtime timestamp: %"PRIu64": %"PRIu64,
849 le64toh(o->entry.realtime),
850 offset);
851 return -EBADMSG;
852 }
853
854 if (!VALID_MONOTONIC(le64toh(o->entry.monotonic))) {
855 log_debug(
856 "Invalid entry monotonic timestamp: %"PRIu64": %"PRIu64,
857 le64toh(o->entry.monotonic),
858 offset);
859 return -EBADMSG;
860 }
861
862 break;
863
864 case OBJECT_DATA_HASH_TABLE:
865 case OBJECT_FIELD_HASH_TABLE:
866 if ((le64toh(o->object.size) - offsetof(HashTableObject, items)) % sizeof(HashItem) != 0 ||
867 (le64toh(o->object.size) - offsetof(HashTableObject, items)) / sizeof(HashItem) <= 0) {
868 log_debug(
869 "Invalid %s hash table size: %"PRIu64": %"PRIu64,
870 o->object.type == OBJECT_DATA_HASH_TABLE ? "data" : "field",
871 le64toh(o->object.size),
872 offset);
873 return -EBADMSG;
874 }
875
876 break;
877
878 case OBJECT_ENTRY_ARRAY:
879 if ((le64toh(o->object.size) - offsetof(EntryArrayObject, items)) % sizeof(le64_t) != 0 ||
880 (le64toh(o->object.size) - offsetof(EntryArrayObject, items)) / sizeof(le64_t) <= 0) {
881 log_debug(
882 "Invalid object entry array size: %"PRIu64": %"PRIu64,
883 le64toh(o->object.size),
884 offset);
885 return -EBADMSG;
886 }
887
10e8445b 888 if (!VALID64(le64toh(o->entry_array.next_entry_array_offset))) {
24754f36
TR
889 log_debug(
890 "Invalid object entry array next_entry_array_offset: "OFSfmt": %"PRIu64,
10e8445b 891 le64toh(o->entry_array.next_entry_array_offset),
24754f36
TR
892 offset);
893 return -EBADMSG;
894 }
895
896 break;
897
898 case OBJECT_TAG:
899 if (le64toh(o->object.size) != sizeof(TagObject)) {
900 log_debug(
901 "Invalid object tag size: %"PRIu64": %"PRIu64,
902 le64toh(o->object.size),
903 offset);
904 return -EBADMSG;
905 }
906
10e8445b 907 if (!VALID_EPOCH(le64toh(o->tag.epoch))) {
24754f36
TR
908 log_debug(
909 "Invalid object tag epoch: %"PRIu64": %"PRIu64,
10e8445b 910 le64toh(o->tag.epoch),
24754f36
TR
911 offset);
912 return -EBADMSG;
913 }
914
915 break;
916 }
917
918 return 0;
919}
920
78519831 921int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) {
cec736d2
LP
922 int r;
923 void *t;
b439282e 924 size_t tsize;
cec736d2
LP
925 Object *o;
926 uint64_t s;
927
928 assert(f);
929 assert(ret);
930
db11ac1a 931 /* Objects may only be located at multiple of 64 bit */
202fd896
LP
932 if (!VALID64(offset)) {
933 log_debug("Attempt to move to object at non-64bit boundary: %" PRIu64, offset);
bd30fdf2 934 return -EBADMSG;
202fd896 935 }
db11ac1a 936
50809d7a 937 /* Object may not be located in the file header */
202fd896
LP
938 if (offset < le64toh(f->header->header_size)) {
939 log_debug("Attempt to move to object located in file header: %" PRIu64, offset);
50809d7a 940 return -EBADMSG;
202fd896 941 }
50809d7a 942
b439282e 943 r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t, &tsize);
cec736d2
LP
944 if (r < 0)
945 return r;
946
947 o = (Object*) t;
948 s = le64toh(o->object.size);
949
1c69f096
LP
950 if (s == 0) {
951 log_debug("Attempt to move to uninitialized object: %" PRIu64, offset);
952 return -EBADMSG;
953 }
202fd896
LP
954 if (s < sizeof(ObjectHeader)) {
955 log_debug("Attempt to move to overly short object: %" PRIu64, offset);
cec736d2 956 return -EBADMSG;
202fd896 957 }
cec736d2 958
202fd896
LP
959 if (o->object.type <= OBJECT_UNUSED) {
960 log_debug("Attempt to move to object with invalid type: %" PRIu64, offset);
16e9f408 961 return -EBADMSG;
202fd896 962 }
16e9f408 963
202fd896
LP
964 if (s < minimum_header_size(o)) {
965 log_debug("Attempt to move to truncated object: %" PRIu64, offset);
16e9f408 966 return -EBADMSG;
202fd896 967 }
16e9f408 968
202fd896
LP
969 if (type > OBJECT_UNUSED && o->object.type != type) {
970 log_debug("Attempt to move to object of unexpected type: %" PRIu64, offset);
cec736d2 971 return -EBADMSG;
202fd896 972 }
cec736d2 973
b439282e
VC
974 if (s > tsize) {
975 r = journal_file_move_to(f, type, false, offset, s, &t, NULL);
cec736d2
LP
976 if (r < 0)
977 return r;
978
979 o = (Object*) t;
980 }
981
24754f36
TR
982 r = journal_file_check_object(f, offset, o);
983 if (r < 0)
984 return r;
985
cec736d2
LP
986 *ret = o;
987 return 0;
988}
989
d98cc1f2 990static uint64_t journal_file_entry_seqnum(JournalFile *f, uint64_t *seqnum) {
cec736d2
LP
991 uint64_t r;
992
993 assert(f);
c88cc6af 994 assert(f->header);
cec736d2 995
beec0085 996 r = le64toh(f->header->tail_entry_seqnum) + 1;
c2373f84
LP
997
998 if (seqnum) {
de190aef 999 /* If an external seqnum counter was passed, we update
c2373f84
LP
1000 * both the local and the external one, and set it to
1001 * the maximum of both */
1002
1003 if (*seqnum + 1 > r)
1004 r = *seqnum + 1;
1005
1006 *seqnum = r;
1007 }
1008
beec0085 1009 f->header->tail_entry_seqnum = htole64(r);
cec736d2 1010
beec0085
LP
1011 if (f->header->head_entry_seqnum == 0)
1012 f->header->head_entry_seqnum = htole64(r);
de190aef 1013
cec736d2
LP
1014 return r;
1015}
1016
78519831 1017int journal_file_append_object(JournalFile *f, ObjectType type, uint64_t size, Object **ret, uint64_t *offset) {
cec736d2
LP
1018 int r;
1019 uint64_t p;
1020 Object *tail, *o;
1021 void *t;
1022
1023 assert(f);
c88cc6af 1024 assert(f->header);
d05089d8 1025 assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX);
cec736d2
LP
1026 assert(size >= sizeof(ObjectHeader));
1027 assert(offset);
1028 assert(ret);
1029
26687bf8
OS
1030 r = journal_file_set_online(f);
1031 if (r < 0)
1032 return r;
1033
cec736d2 1034 p = le64toh(f->header->tail_object_offset);
cec736d2 1035 if (p == 0)
23b0b2b2 1036 p = le64toh(f->header->header_size);
cec736d2 1037 else {
d05089d8 1038 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail);
cec736d2
LP
1039 if (r < 0)
1040 return r;
1041
1042 p += ALIGN64(le64toh(tail->object.size));
1043 }
1044
1045 r = journal_file_allocate(f, p, size);
1046 if (r < 0)
1047 return r;
1048
b439282e 1049 r = journal_file_move_to(f, type, false, p, size, &t, NULL);
cec736d2
LP
1050 if (r < 0)
1051 return r;
1052
1053 o = (Object*) t;
1054
1055 zero(o->object);
de190aef 1056 o->object.type = type;
cec736d2
LP
1057 o->object.size = htole64(size);
1058
1059 f->header->tail_object_offset = htole64(p);
cec736d2
LP
1060 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
1061
1062 *ret = o;
1063 *offset = p;
1064
1065 return 0;
1066}
1067
de190aef 1068static int journal_file_setup_data_hash_table(JournalFile *f) {
cec736d2
LP
1069 uint64_t s, p;
1070 Object *o;
1071 int r;
1072
1073 assert(f);
c88cc6af 1074 assert(f->header);
cec736d2 1075
070052ab
LP
1076 /* We estimate that we need 1 hash table entry per 768 bytes
1077 of journal file and we want to make sure we never get
1078 beyond 75% fill level. Calculate the hash table size for
1079 the maximum file size based on these metrics. */
4a92baf3 1080
dfabe643 1081 s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
4a92baf3
LP
1082 if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
1083 s = DEFAULT_DATA_HASH_TABLE_SIZE;
1084
507f22bd 1085 log_debug("Reserving %"PRIu64" entries in hash table.", s / sizeof(HashItem));
4a92baf3 1086
de190aef
LP
1087 r = journal_file_append_object(f,
1088 OBJECT_DATA_HASH_TABLE,
1089 offsetof(Object, hash_table.items) + s,
1090 &o, &p);
cec736d2
LP
1091 if (r < 0)
1092 return r;
1093
29804cc1 1094 memzero(o->hash_table.items, s);
cec736d2 1095
de190aef
LP
1096 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1097 f->header->data_hash_table_size = htole64(s);
cec736d2
LP
1098
1099 return 0;
1100}
1101
de190aef 1102static int journal_file_setup_field_hash_table(JournalFile *f) {
cec736d2
LP
1103 uint64_t s, p;
1104 Object *o;
1105 int r;
1106
1107 assert(f);
c88cc6af 1108 assert(f->header);
cec736d2 1109
3c1668da
LP
1110 /* We use a fixed size hash table for the fields as this
1111 * number should grow very slowly only */
1112
de190aef
LP
1113 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
1114 r = journal_file_append_object(f,
1115 OBJECT_FIELD_HASH_TABLE,
1116 offsetof(Object, hash_table.items) + s,
1117 &o, &p);
cec736d2
LP
1118 if (r < 0)
1119 return r;
1120
29804cc1 1121 memzero(o->hash_table.items, s);
cec736d2 1122
de190aef
LP
1123 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
1124 f->header->field_hash_table_size = htole64(s);
cec736d2
LP
1125
1126 return 0;
1127}
1128
dade37d4 1129int journal_file_map_data_hash_table(JournalFile *f) {
cec736d2
LP
1130 uint64_t s, p;
1131 void *t;
1132 int r;
1133
1134 assert(f);
c88cc6af 1135 assert(f->header);
cec736d2 1136
dade37d4
LP
1137 if (f->data_hash_table)
1138 return 0;
1139
de190aef
LP
1140 p = le64toh(f->header->data_hash_table_offset);
1141 s = le64toh(f->header->data_hash_table_size);
cec736d2 1142
de190aef 1143 r = journal_file_move_to(f,
16e9f408 1144 OBJECT_DATA_HASH_TABLE,
fcde2389 1145 true,
de190aef 1146 p, s,
b42549ad 1147 &t, NULL);
cec736d2
LP
1148 if (r < 0)
1149 return r;
1150
de190aef 1151 f->data_hash_table = t;
cec736d2
LP
1152 return 0;
1153}
1154
dade37d4 1155int journal_file_map_field_hash_table(JournalFile *f) {
cec736d2
LP
1156 uint64_t s, p;
1157 void *t;
1158 int r;
1159
1160 assert(f);
c88cc6af 1161 assert(f->header);
cec736d2 1162
dade37d4
LP
1163 if (f->field_hash_table)
1164 return 0;
1165
de190aef
LP
1166 p = le64toh(f->header->field_hash_table_offset);
1167 s = le64toh(f->header->field_hash_table_size);
cec736d2 1168
de190aef 1169 r = journal_file_move_to(f,
16e9f408 1170 OBJECT_FIELD_HASH_TABLE,
fcde2389 1171 true,
de190aef 1172 p, s,
b42549ad 1173 &t, NULL);
cec736d2
LP
1174 if (r < 0)
1175 return r;
1176
de190aef 1177 f->field_hash_table = t;
cec736d2
LP
1178 return 0;
1179}
1180
3c1668da
LP
1181static int journal_file_link_field(
1182 JournalFile *f,
1183 Object *o,
1184 uint64_t offset,
1185 uint64_t hash) {
1186
805d1486 1187 uint64_t p, h, m;
3c1668da
LP
1188 int r;
1189
1190 assert(f);
c88cc6af 1191 assert(f->header);
90d222c1 1192 assert(f->field_hash_table);
3c1668da
LP
1193 assert(o);
1194 assert(offset > 0);
1195
1196 if (o->object.type != OBJECT_FIELD)
1197 return -EINVAL;
1198
805d1486
LP
1199 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
1200 if (m <= 0)
1201 return -EBADMSG;
3c1668da 1202
805d1486 1203 /* This might alter the window we are looking at */
3c1668da
LP
1204 o->field.next_hash_offset = o->field.head_data_offset = 0;
1205
805d1486 1206 h = hash % m;
3c1668da
LP
1207 p = le64toh(f->field_hash_table[h].tail_hash_offset);
1208 if (p == 0)
1209 f->field_hash_table[h].head_hash_offset = htole64(offset);
1210 else {
1211 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1212 if (r < 0)
1213 return r;
1214
1215 o->field.next_hash_offset = htole64(offset);
1216 }
1217
1218 f->field_hash_table[h].tail_hash_offset = htole64(offset);
1219
1220 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
1221 f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1);
1222
1223 return 0;
1224}
1225
1226static int journal_file_link_data(
1227 JournalFile *f,
1228 Object *o,
1229 uint64_t offset,
1230 uint64_t hash) {
1231
805d1486 1232 uint64_t p, h, m;
cec736d2
LP
1233 int r;
1234
1235 assert(f);
c88cc6af 1236 assert(f->header);
90d222c1 1237 assert(f->data_hash_table);
cec736d2
LP
1238 assert(o);
1239 assert(offset > 0);
b588975f
LP
1240
1241 if (o->object.type != OBJECT_DATA)
1242 return -EINVAL;
cec736d2 1243
805d1486
LP
1244 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
1245 if (m <= 0)
1246 return -EBADMSG;
48496df6 1247
805d1486 1248 /* This might alter the window we are looking at */
de190aef
LP
1249 o->data.next_hash_offset = o->data.next_field_offset = 0;
1250 o->data.entry_offset = o->data.entry_array_offset = 0;
1251 o->data.n_entries = 0;
cec736d2 1252
805d1486 1253 h = hash % m;
8db4213e 1254 p = le64toh(f->data_hash_table[h].tail_hash_offset);
3c1668da 1255 if (p == 0)
cec736d2 1256 /* Only entry in the hash table is easy */
de190aef 1257 f->data_hash_table[h].head_hash_offset = htole64(offset);
3c1668da 1258 else {
48496df6
LP
1259 /* Move back to the previous data object, to patch in
1260 * pointer */
cec736d2 1261
de190aef 1262 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1263 if (r < 0)
1264 return r;
1265
de190aef 1266 o->data.next_hash_offset = htole64(offset);
cec736d2
LP
1267 }
1268
de190aef 1269 f->data_hash_table[h].tail_hash_offset = htole64(offset);
cec736d2 1270
dca6219e
LP
1271 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
1272 f->header->n_data = htole64(le64toh(f->header->n_data) + 1);
1273
cec736d2
LP
1274 return 0;
1275}
1276
3c1668da
LP
1277int journal_file_find_field_object_with_hash(
1278 JournalFile *f,
1279 const void *field, uint64_t size, uint64_t hash,
1280 Object **ret, uint64_t *offset) {
1281
805d1486 1282 uint64_t p, osize, h, m;
3c1668da
LP
1283 int r;
1284
1285 assert(f);
c88cc6af 1286 assert(f->header);
3c1668da
LP
1287 assert(field && size > 0);
1288
dade37d4
LP
1289 /* If the field hash table is empty, we can't find anything */
1290 if (le64toh(f->header->field_hash_table_size) <= 0)
1291 return 0;
1292
1293 /* Map the field hash table, if it isn't mapped yet. */
1294 r = journal_file_map_field_hash_table(f);
1295 if (r < 0)
1296 return r;
1297
3c1668da
LP
1298 osize = offsetof(Object, field.payload) + size;
1299
805d1486 1300 m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
805d1486 1301 if (m <= 0)
3c1668da
LP
1302 return -EBADMSG;
1303
805d1486 1304 h = hash % m;
3c1668da
LP
1305 p = le64toh(f->field_hash_table[h].head_hash_offset);
1306
1307 while (p > 0) {
1308 Object *o;
1309
1310 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1311 if (r < 0)
1312 return r;
1313
1314 if (le64toh(o->field.hash) == hash &&
1315 le64toh(o->object.size) == osize &&
1316 memcmp(o->field.payload, field, size) == 0) {
1317
1318 if (ret)
1319 *ret = o;
1320 if (offset)
1321 *offset = p;
1322
1323 return 1;
1324 }
1325
1326 p = le64toh(o->field.next_hash_offset);
1327 }
1328
1329 return 0;
1330}
1331
1332int journal_file_find_field_object(
1333 JournalFile *f,
1334 const void *field, uint64_t size,
1335 Object **ret, uint64_t *offset) {
1336
1337 uint64_t hash;
1338
1339 assert(f);
1340 assert(field && size > 0);
1341
1342 hash = hash64(field, size);
1343
1344 return journal_file_find_field_object_with_hash(f,
1345 field, size, hash,
1346 ret, offset);
1347}
1348
de190aef
LP
1349int journal_file_find_data_object_with_hash(
1350 JournalFile *f,
1351 const void *data, uint64_t size, uint64_t hash,
1352 Object **ret, uint64_t *offset) {
48496df6 1353
805d1486 1354 uint64_t p, osize, h, m;
cec736d2
LP
1355 int r;
1356
1357 assert(f);
c88cc6af 1358 assert(f->header);
cec736d2
LP
1359 assert(data || size == 0);
1360
dade37d4
LP
1361 /* If there's no data hash table, then there's no entry. */
1362 if (le64toh(f->header->data_hash_table_size) <= 0)
1363 return 0;
1364
1365 /* Map the data hash table, if it isn't mapped yet. */
1366 r = journal_file_map_data_hash_table(f);
1367 if (r < 0)
1368 return r;
1369
cec736d2
LP
1370 osize = offsetof(Object, data.payload) + size;
1371
805d1486
LP
1372 m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
1373 if (m <= 0)
bc85bfee
LP
1374 return -EBADMSG;
1375
805d1486 1376 h = hash % m;
de190aef 1377 p = le64toh(f->data_hash_table[h].head_hash_offset);
cec736d2 1378
de190aef
LP
1379 while (p > 0) {
1380 Object *o;
cec736d2 1381
de190aef 1382 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1383 if (r < 0)
1384 return r;
1385
807e17f0 1386 if (le64toh(o->data.hash) != hash)
85a131e8 1387 goto next;
807e17f0 1388
d89c8fdf 1389 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
349cc4a5 1390#if HAVE_XZ || HAVE_LZ4
fa1c4b51 1391 uint64_t l;
a7f7d1bd 1392 size_t rsize = 0;
cec736d2 1393
807e17f0
LP
1394 l = le64toh(o->object.size);
1395 if (l <= offsetof(Object, data.payload))
cec736d2
LP
1396 return -EBADMSG;
1397
807e17f0
LP
1398 l -= offsetof(Object, data.payload);
1399
d89c8fdf
ZJS
1400 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
1401 o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize, 0);
1402 if (r < 0)
1403 return r;
807e17f0 1404
b785c858 1405 if (rsize == size &&
807e17f0
LP
1406 memcmp(f->compress_buffer, data, size) == 0) {
1407
1408 if (ret)
1409 *ret = o;
1410
1411 if (offset)
1412 *offset = p;
1413
1414 return 1;
1415 }
3b1a55e1
ZJS
1416#else
1417 return -EPROTONOSUPPORT;
1418#endif
807e17f0
LP
1419 } else if (le64toh(o->object.size) == osize &&
1420 memcmp(o->data.payload, data, size) == 0) {
1421
cec736d2
LP
1422 if (ret)
1423 *ret = o;
1424
1425 if (offset)
1426 *offset = p;
1427
de190aef 1428 return 1;
cec736d2
LP
1429 }
1430
85a131e8 1431 next:
cec736d2
LP
1432 p = le64toh(o->data.next_hash_offset);
1433 }
1434
de190aef
LP
1435 return 0;
1436}
1437
1438int journal_file_find_data_object(
1439 JournalFile *f,
1440 const void *data, uint64_t size,
1441 Object **ret, uint64_t *offset) {
1442
1443 uint64_t hash;
1444
1445 assert(f);
1446 assert(data || size == 0);
1447
1448 hash = hash64(data, size);
1449
1450 return journal_file_find_data_object_with_hash(f,
1451 data, size, hash,
1452 ret, offset);
1453}
1454
3c1668da
LP
1455static int journal_file_append_field(
1456 JournalFile *f,
1457 const void *field, uint64_t size,
1458 Object **ret, uint64_t *offset) {
1459
1460 uint64_t hash, p;
1461 uint64_t osize;
1462 Object *o;
1463 int r;
1464
1465 assert(f);
1466 assert(field && size > 0);
1467
1468 hash = hash64(field, size);
1469
1470 r = journal_file_find_field_object_with_hash(f, field, size, hash, &o, &p);
1471 if (r < 0)
1472 return r;
1473 else if (r > 0) {
1474
1475 if (ret)
1476 *ret = o;
1477
1478 if (offset)
1479 *offset = p;
1480
1481 return 0;
1482 }
1483
1484 osize = offsetof(Object, field.payload) + size;
1485 r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p);
8c92d4bb
LP
1486 if (r < 0)
1487 return r;
3c1668da
LP
1488
1489 o->field.hash = htole64(hash);
1490 memcpy(o->field.payload, field, size);
1491
1492 r = journal_file_link_field(f, o, p, hash);
1493 if (r < 0)
1494 return r;
1495
1496 /* The linking might have altered the window, so let's
1497 * refresh our pointer */
1498 r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o);
1499 if (r < 0)
1500 return r;
1501
349cc4a5 1502#if HAVE_GCRYPT
3c1668da
LP
1503 r = journal_file_hmac_put_object(f, OBJECT_FIELD, o, p);
1504 if (r < 0)
1505 return r;
1506#endif
1507
1508 if (ret)
1509 *ret = o;
1510
1511 if (offset)
1512 *offset = p;
1513
1514 return 0;
1515}
1516
48496df6
LP
1517static int journal_file_append_data(
1518 JournalFile *f,
1519 const void *data, uint64_t size,
1520 Object **ret, uint64_t *offset) {
1521
de190aef
LP
1522 uint64_t hash, p;
1523 uint64_t osize;
1524 Object *o;
d89c8fdf 1525 int r, compression = 0;
3c1668da 1526 const void *eq;
de190aef
LP
1527
1528 assert(f);
1529 assert(data || size == 0);
1530
1531 hash = hash64(data, size);
1532
1533 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
1534 if (r < 0)
1535 return r;
0240c603 1536 if (r > 0) {
de190aef
LP
1537
1538 if (ret)
1539 *ret = o;
1540
1541 if (offset)
1542 *offset = p;
1543
1544 return 0;
1545 }
1546
1547 osize = offsetof(Object, data.payload) + size;
1548 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
cec736d2
LP
1549 if (r < 0)
1550 return r;
1551
cec736d2 1552 o->data.hash = htole64(hash);
807e17f0 1553
349cc4a5 1554#if HAVE_XZ || HAVE_LZ4
d1afbcd2 1555 if (JOURNAL_FILE_COMPRESS(f) && size >= COMPRESSION_SIZE_THRESHOLD) {
a7f7d1bd 1556 size_t rsize = 0;
807e17f0 1557
5d6f46b6 1558 compression = compress_blob(data, size, o->data.payload, size - 1, &rsize);
807e17f0 1559
d1afbcd2 1560 if (compression >= 0) {
807e17f0 1561 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
d89c8fdf 1562 o->object.flags |= compression;
807e17f0 1563
fa1c4b51 1564 log_debug("Compressed data object %"PRIu64" -> %zu using %s",
d89c8fdf 1565 size, rsize, object_compressed_to_string(compression));
d1afbcd2
LP
1566 } else
1567 /* Compression didn't work, we don't really care why, let's continue without compression */
1568 compression = 0;
807e17f0
LP
1569 }
1570#endif
1571
75f32f04
ZJS
1572 if (compression == 0)
1573 memcpy_safe(o->data.payload, data, size);
cec736d2 1574
de190aef 1575 r = journal_file_link_data(f, o, p, hash);
cec736d2
LP
1576 if (r < 0)
1577 return r;
1578
349cc4a5 1579#if HAVE_GCRYPT
33685a5a
FB
1580 r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
1581 if (r < 0)
1582 return r;
1583#endif
1584
48496df6
LP
1585 /* The linking might have altered the window, so let's
1586 * refresh our pointer */
1587 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
1588 if (r < 0)
1589 return r;
1590
08c6f819
SL
1591 if (!data)
1592 eq = NULL;
1593 else
1594 eq = memchr(data, '=', size);
3c1668da 1595 if (eq && eq > data) {
748db592 1596 Object *fo = NULL;
3c1668da 1597 uint64_t fp;
3c1668da
LP
1598
1599 /* Create field object ... */
1600 r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, &fp);
1601 if (r < 0)
1602 return r;
1603
1604 /* ... and link it in. */
1605 o->data.next_field_offset = fo->field.head_data_offset;
1606 fo->field.head_data_offset = le64toh(p);
1607 }
1608
cec736d2
LP
1609 if (ret)
1610 *ret = o;
1611
1612 if (offset)
de190aef 1613 *offset = p;
cec736d2
LP
1614
1615 return 0;
1616}
1617
1618uint64_t journal_file_entry_n_items(Object *o) {
1619 assert(o);
b588975f
LP
1620
1621 if (o->object.type != OBJECT_ENTRY)
1622 return 0;
cec736d2
LP
1623
1624 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
1625}
1626
0284adc6 1627uint64_t journal_file_entry_array_n_items(Object *o) {
de190aef 1628 assert(o);
b588975f
LP
1629
1630 if (o->object.type != OBJECT_ENTRY_ARRAY)
1631 return 0;
de190aef
LP
1632
1633 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
1634}
1635
fb9a24b6
LP
1636uint64_t journal_file_hash_table_n_items(Object *o) {
1637 assert(o);
b588975f 1638
ec2ce0c5 1639 if (!IN_SET(o->object.type, OBJECT_DATA_HASH_TABLE, OBJECT_FIELD_HASH_TABLE))
b588975f 1640 return 0;
fb9a24b6
LP
1641
1642 return (le64toh(o->object.size) - offsetof(Object, hash_table.items)) / sizeof(HashItem);
1643}
1644
de190aef 1645static int link_entry_into_array(JournalFile *f,
4fd052ae
FC
1646 le64_t *first,
1647 le64_t *idx,
de190aef 1648 uint64_t p) {
cec736d2 1649 int r;
de190aef
LP
1650 uint64_t n = 0, ap = 0, q, i, a, hidx;
1651 Object *o;
1652
cec736d2 1653 assert(f);
c88cc6af 1654 assert(f->header);
de190aef
LP
1655 assert(first);
1656 assert(idx);
1657 assert(p > 0);
cec736d2 1658
de190aef
LP
1659 a = le64toh(*first);
1660 i = hidx = le64toh(*idx);
1661 while (a > 0) {
1662
1663 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1664 if (r < 0)
1665 return r;
cec736d2 1666
de190aef
LP
1667 n = journal_file_entry_array_n_items(o);
1668 if (i < n) {
1669 o->entry_array.items[i] = htole64(p);
1670 *idx = htole64(hidx + 1);
1671 return 0;
1672 }
cec736d2 1673
de190aef
LP
1674 i -= n;
1675 ap = a;
1676 a = le64toh(o->entry_array.next_entry_array_offset);
1677 }
1678
1679 if (hidx > n)
1680 n = (hidx+1) * 2;
1681 else
1682 n = n * 2;
1683
1684 if (n < 4)
1685 n = 4;
1686
1687 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
1688 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
1689 &o, &q);
cec736d2
LP
1690 if (r < 0)
1691 return r;
1692
349cc4a5 1693#if HAVE_GCRYPT
5996c7c2 1694 r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q);
b0af6f41
LP
1695 if (r < 0)
1696 return r;
feb12d3e 1697#endif
b0af6f41 1698
de190aef 1699 o->entry_array.items[i] = htole64(p);
cec736d2 1700
de190aef 1701 if (ap == 0)
7be3aa17 1702 *first = htole64(q);
cec736d2 1703 else {
de190aef 1704 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
cec736d2
LP
1705 if (r < 0)
1706 return r;
1707
de190aef
LP
1708 o->entry_array.next_entry_array_offset = htole64(q);
1709 }
cec736d2 1710
2dee23eb
LP
1711 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
1712 f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1);
1713
de190aef
LP
1714 *idx = htole64(hidx + 1);
1715
1716 return 0;
1717}
cec736d2 1718
de190aef 1719static int link_entry_into_array_plus_one(JournalFile *f,
4fd052ae
FC
1720 le64_t *extra,
1721 le64_t *first,
1722 le64_t *idx,
de190aef
LP
1723 uint64_t p) {
1724
1725 int r;
1726
1727 assert(f);
1728 assert(extra);
1729 assert(first);
1730 assert(idx);
1731 assert(p > 0);
1732
1733 if (*idx == 0)
1734 *extra = htole64(p);
1735 else {
4fd052ae 1736 le64_t i;
de190aef 1737
7be3aa17 1738 i = htole64(le64toh(*idx) - 1);
de190aef
LP
1739 r = link_entry_into_array(f, first, &i, p);
1740 if (r < 0)
1741 return r;
cec736d2
LP
1742 }
1743
de190aef
LP
1744 *idx = htole64(le64toh(*idx) + 1);
1745 return 0;
1746}
1747
1748static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
1749 uint64_t p;
1750 int r;
1751 assert(f);
1752 assert(o);
1753 assert(offset > 0);
1754
1755 p = le64toh(o->entry.items[i].object_offset);
1756 if (p == 0)
1757 return -EINVAL;
1758
1759 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
cec736d2
LP
1760 if (r < 0)
1761 return r;
1762
de190aef
LP
1763 return link_entry_into_array_plus_one(f,
1764 &o->data.entry_offset,
1765 &o->data.entry_array_offset,
1766 &o->data.n_entries,
1767 offset);
cec736d2
LP
1768}
1769
1770static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
de190aef 1771 uint64_t n, i;
cec736d2
LP
1772 int r;
1773
1774 assert(f);
c88cc6af 1775 assert(f->header);
cec736d2
LP
1776 assert(o);
1777 assert(offset > 0);
b588975f
LP
1778
1779 if (o->object.type != OBJECT_ENTRY)
1780 return -EINVAL;
cec736d2 1781
b788cc23
LP
1782 __sync_synchronize();
1783
cec736d2 1784 /* Link up the entry itself */
de190aef
LP
1785 r = link_entry_into_array(f,
1786 &f->header->entry_array_offset,
1787 &f->header->n_entries,
1788 offset);
1789 if (r < 0)
1790 return r;
cec736d2 1791
507f22bd 1792 /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */
cec736d2 1793
de190aef 1794 if (f->header->head_entry_realtime == 0)
0ac38b70 1795 f->header->head_entry_realtime = o->entry.realtime;
cec736d2 1796
0ac38b70 1797 f->header->tail_entry_realtime = o->entry.realtime;
de190aef
LP
1798 f->header->tail_entry_monotonic = o->entry.monotonic;
1799
1800 f->tail_entry_monotonic_valid = true;
cec736d2
LP
1801
1802 /* Link up the items */
1803 n = journal_file_entry_n_items(o);
1804 for (i = 0; i < n; i++) {
1805 r = journal_file_link_entry_item(f, o, offset, i);
1806 if (r < 0)
1807 return r;
1808 }
1809
cec736d2
LP
1810 return 0;
1811}
1812
1813static int journal_file_append_entry_internal(
1814 JournalFile *f,
1815 const dual_timestamp *ts,
1816 uint64_t xor_hash,
1817 const EntryItem items[], unsigned n_items,
de190aef 1818 uint64_t *seqnum,
cec736d2
LP
1819 Object **ret, uint64_t *offset) {
1820 uint64_t np;
1821 uint64_t osize;
1822 Object *o;
1823 int r;
1824
1825 assert(f);
c88cc6af 1826 assert(f->header);
cec736d2 1827 assert(items || n_items == 0);
de190aef 1828 assert(ts);
cec736d2
LP
1829
1830 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
1831
de190aef 1832 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
cec736d2
LP
1833 if (r < 0)
1834 return r;
1835
d98cc1f2 1836 o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum));
75f32f04 1837 memcpy_safe(o->entry.items, items, n_items * sizeof(EntryItem));
de190aef
LP
1838 o->entry.realtime = htole64(ts->realtime);
1839 o->entry.monotonic = htole64(ts->monotonic);
cec736d2
LP
1840 o->entry.xor_hash = htole64(xor_hash);
1841 o->entry.boot_id = f->header->boot_id;
1842
349cc4a5 1843#if HAVE_GCRYPT
5996c7c2 1844 r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np);
b0af6f41
LP
1845 if (r < 0)
1846 return r;
feb12d3e 1847#endif
b0af6f41 1848
cec736d2
LP
1849 r = journal_file_link_entry(f, o, np);
1850 if (r < 0)
1851 return r;
1852
1853 if (ret)
1854 *ret = o;
1855
1856 if (offset)
1857 *offset = np;
1858
1859 return 0;
1860}
1861
cf244689 1862void journal_file_post_change(JournalFile *f) {
50f20cfd
LP
1863 assert(f);
1864
1865 /* inotify() does not receive IN_MODIFY events from file
1866 * accesses done via mmap(). After each access we hence
1867 * trigger IN_MODIFY by truncating the journal file to its
1868 * current size which triggers IN_MODIFY. */
1869
bc85bfee
LP
1870 __sync_synchronize();
1871
50f20cfd 1872 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
e167d7fd 1873 log_debug_errno(errno, "Failed to truncate file to its own size: %m");
50f20cfd
LP
1874}
1875
7a24f3bf
VC
1876static int post_change_thunk(sd_event_source *timer, uint64_t usec, void *userdata) {
1877 assert(userdata);
1878
1879 journal_file_post_change(userdata);
1880
1881 return 1;
1882}
1883
1884static void schedule_post_change(JournalFile *f) {
1885 sd_event_source *timer;
1886 int enabled, r;
1887 uint64_t now;
1888
1889 assert(f);
1890 assert(f->post_change_timer);
1891
1892 timer = f->post_change_timer;
1893
1894 r = sd_event_source_get_enabled(timer, &enabled);
1895 if (r < 0) {
e167d7fd
LP
1896 log_debug_errno(r, "Failed to get ftruncate timer state: %m");
1897 goto fail;
7a24f3bf
VC
1898 }
1899
1900 if (enabled == SD_EVENT_ONESHOT)
1901 return;
1902
1903 r = sd_event_now(sd_event_source_get_event(timer), CLOCK_MONOTONIC, &now);
1904 if (r < 0) {
e167d7fd
LP
1905 log_debug_errno(r, "Failed to get clock's now for scheduling ftruncate: %m");
1906 goto fail;
7a24f3bf
VC
1907 }
1908
1909 r = sd_event_source_set_time(timer, now+f->post_change_timer_period);
1910 if (r < 0) {
e167d7fd
LP
1911 log_debug_errno(r, "Failed to set time for scheduling ftruncate: %m");
1912 goto fail;
7a24f3bf
VC
1913 }
1914
1915 r = sd_event_source_set_enabled(timer, SD_EVENT_ONESHOT);
1916 if (r < 0) {
e167d7fd
LP
1917 log_debug_errno(r, "Failed to enable scheduled ftruncate: %m");
1918 goto fail;
7a24f3bf 1919 }
e167d7fd
LP
1920
1921 return;
1922
1923fail:
1924 /* On failure, let's simply post the change immediately. */
1925 journal_file_post_change(f);
7a24f3bf
VC
1926}
1927
1928/* Enable coalesced change posting in a timer on the provided sd_event instance */
1929int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t) {
1930 _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL;
1931 int r;
1932
1933 assert(f);
1934 assert_return(!f->post_change_timer, -EINVAL);
1935 assert(e);
1936 assert(t);
1937
1938 r = sd_event_add_time(e, &timer, CLOCK_MONOTONIC, 0, 0, post_change_thunk, f);
1939 if (r < 0)
1940 return r;
1941
1942 r = sd_event_source_set_enabled(timer, SD_EVENT_OFF);
1943 if (r < 0)
1944 return r;
1945
1946 f->post_change_timer = timer;
1947 timer = NULL;
1948 f->post_change_timer_period = t;
1949
1950 return r;
1951}
1952
1f2da9ec
LP
1953static int entry_item_cmp(const void *_a, const void *_b) {
1954 const EntryItem *a = _a, *b = _b;
1955
1956 if (le64toh(a->object_offset) < le64toh(b->object_offset))
1957 return -1;
1958 if (le64toh(a->object_offset) > le64toh(b->object_offset))
1959 return 1;
1960 return 0;
1961}
1962
de190aef 1963int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
cec736d2
LP
1964 unsigned i;
1965 EntryItem *items;
1966 int r;
1967 uint64_t xor_hash = 0;
de190aef 1968 struct dual_timestamp _ts;
cec736d2
LP
1969
1970 assert(f);
c88cc6af 1971 assert(f->header);
cec736d2
LP
1972 assert(iovec || n_iovec == 0);
1973
de190aef
LP
1974 if (!ts) {
1975 dual_timestamp_get(&_ts);
1976 ts = &_ts;
1977 }
1978
349cc4a5 1979#if HAVE_GCRYPT
7560fffc
LP
1980 r = journal_file_maybe_append_tag(f, ts->realtime);
1981 if (r < 0)
1982 return r;
feb12d3e 1983#endif
7560fffc 1984
64825d3c 1985 /* alloca() can't take 0, hence let's allocate at least one */
9607d947 1986 items = alloca(sizeof(EntryItem) * MAX(1u, n_iovec));
cec736d2
LP
1987
1988 for (i = 0; i < n_iovec; i++) {
1989 uint64_t p;
1990 Object *o;
1991
1992 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1993 if (r < 0)
cf244689 1994 return r;
cec736d2
LP
1995
1996 xor_hash ^= le64toh(o->data.hash);
1997 items[i].object_offset = htole64(p);
de7b95cd 1998 items[i].hash = o->data.hash;
cec736d2
LP
1999 }
2000
1f2da9ec
LP
2001 /* Order by the position on disk, in order to improve seek
2002 * times for rotating media. */
7ff7394d 2003 qsort_safe(items, n_iovec, sizeof(EntryItem), entry_item_cmp);
1f2da9ec 2004
de190aef 2005 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
cec736d2 2006
fa6ac760
LP
2007 /* If the memory mapping triggered a SIGBUS then we return an
2008 * IO error and ignore the error code passed down to us, since
2009 * it is very likely just an effect of a nullified replacement
2010 * mapping page */
2011
be7cdd8e 2012 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
2013 r = -EIO;
2014
7a24f3bf
VC
2015 if (f->post_change_timer)
2016 schedule_post_change(f);
2017 else
2018 journal_file_post_change(f);
50f20cfd 2019
cec736d2
LP
2020 return r;
2021}
2022
/* One cached position inside a chain of entry array objects. */
typedef struct ChainCacheItem {
        /* Offset of the array at the beginning of the chain */
        uint64_t first;

        /* Offset of the cached array */
        uint64_t array;

        /* The first item stored in the cached array */
        uint64_t begin;

        /* Total number of items in all arrays that precede the cached one in the chain */
        uint64_t total;

        /* The last index we looked at, to optimize locality when bisecting */
        uint64_t last_index;
} ChainCacheItem;
2030
2031static void chain_cache_put(
4743015d 2032 OrderedHashmap *h,
a4bcff5b
LP
2033 ChainCacheItem *ci,
2034 uint64_t first,
2035 uint64_t array,
2036 uint64_t begin,
f268980d
LP
2037 uint64_t total,
2038 uint64_t last_index) {
a4bcff5b
LP
2039
2040 if (!ci) {
34741aa3
LP
2041 /* If the chain item to cache for this chain is the
2042 * first one it's not worth caching anything */
2043 if (array == first)
2044 return;
2045
29433089 2046 if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) {
4743015d 2047 ci = ordered_hashmap_steal_first(h);
29433089
LP
2048 assert(ci);
2049 } else {
a4bcff5b
LP
2050 ci = new(ChainCacheItem, 1);
2051 if (!ci)
2052 return;
2053 }
2054
2055 ci->first = first;
2056
4743015d 2057 if (ordered_hashmap_put(h, &ci->first, ci) < 0) {
a4bcff5b
LP
2058 free(ci);
2059 return;
2060 }
2061 } else
2062 assert(ci->first == first);
2063
2064 ci->array = array;
2065 ci->begin = begin;
2066 ci->total = total;
f268980d 2067 ci->last_index = last_index;
a4bcff5b
LP
2068}
2069
f268980d
LP
2070static int generic_array_get(
2071 JournalFile *f,
2072 uint64_t first,
2073 uint64_t i,
2074 Object **ret, uint64_t *offset) {
de190aef 2075
cec736d2 2076 Object *o;
a4bcff5b 2077 uint64_t p = 0, a, t = 0;
cec736d2 2078 int r;
a4bcff5b 2079 ChainCacheItem *ci;
cec736d2
LP
2080
2081 assert(f);
2082
de190aef 2083 a = first;
a4bcff5b
LP
2084
2085 /* Try the chain cache first */
4743015d 2086 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
2087 if (ci && i > ci->total) {
2088 a = ci->array;
2089 i -= ci->total;
2090 t = ci->total;
2091 }
2092
de190aef 2093 while (a > 0) {
a4bcff5b 2094 uint64_t k;
cec736d2 2095
de190aef
LP
2096 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
2097 if (r < 0)
2098 return r;
cec736d2 2099
a4bcff5b
LP
2100 k = journal_file_entry_array_n_items(o);
2101 if (i < k) {
de190aef 2102 p = le64toh(o->entry_array.items[i]);
a4bcff5b 2103 goto found;
cec736d2
LP
2104 }
2105
a4bcff5b
LP
2106 i -= k;
2107 t += k;
de190aef
LP
2108 a = le64toh(o->entry_array.next_entry_array_offset);
2109 }
2110
a4bcff5b
LP
2111 return 0;
2112
2113found:
2114 /* Let's cache this item for the next invocation */
af13a6b0 2115 chain_cache_put(f->chain_cache, ci, first, a, le64toh(o->entry_array.items[0]), t, i);
de190aef
LP
2116
2117 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2118 if (r < 0)
2119 return r;
2120
2121 if (ret)
2122 *ret = o;
2123
2124 if (offset)
2125 *offset = p;
2126
2127 return 1;
2128}
2129
f268980d
LP
2130static int generic_array_get_plus_one(
2131 JournalFile *f,
2132 uint64_t extra,
2133 uint64_t first,
2134 uint64_t i,
2135 Object **ret, uint64_t *offset) {
de190aef
LP
2136
2137 Object *o;
2138
2139 assert(f);
2140
2141 if (i == 0) {
2142 int r;
2143
2144 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
cec736d2
LP
2145 if (r < 0)
2146 return r;
2147
de190aef
LP
2148 if (ret)
2149 *ret = o;
cec736d2 2150
de190aef
LP
2151 if (offset)
2152 *offset = extra;
cec736d2 2153
de190aef 2154 return 1;
cec736d2
LP
2155 }
2156
de190aef
LP
2157 return generic_array_get(f, first, i-1, ret, offset);
2158}
cec736d2 2159
de190aef
LP
/* Results of the test_object() callbacks used by the bisection helpers. */
enum {
        TEST_FOUND,     /* the tested object matches the needle */
        TEST_LEFT,      /* the tested object sorts before the needle */
        TEST_RIGHT      /* the tested object sorts after the needle */
};
cec736d2 2165
f268980d
LP
2166static int generic_array_bisect(
2167 JournalFile *f,
2168 uint64_t first,
2169 uint64_t n,
2170 uint64_t needle,
2171 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2172 direction_t direction,
2173 Object **ret,
2174 uint64_t *offset,
2175 uint64_t *idx) {
2176
2177 uint64_t a, p, t = 0, i = 0, last_p = 0, last_index = (uint64_t) -1;
de190aef
LP
2178 bool subtract_one = false;
2179 Object *o, *array = NULL;
2180 int r;
a4bcff5b 2181 ChainCacheItem *ci;
cec736d2 2182
de190aef
LP
2183 assert(f);
2184 assert(test_object);
cec736d2 2185
a4bcff5b 2186 /* Start with the first array in the chain */
de190aef 2187 a = first;
a4bcff5b 2188
4743015d 2189 ci = ordered_hashmap_get(f->chain_cache, &first);
a4bcff5b
LP
2190 if (ci && n > ci->total) {
2191 /* Ah, we have iterated this bisection array chain
2192 * previously! Let's see if we can skip ahead in the
2193 * chain, as far as the last time. But we can't jump
2194 * backwards in the chain, so let's check that
2195 * first. */
2196
2197 r = test_object(f, ci->begin, needle);
2198 if (r < 0)
2199 return r;
2200
2201 if (r == TEST_LEFT) {
f268980d 2202 /* OK, what we are looking for is right of the
a4bcff5b
LP
2203 * begin of this EntryArray, so let's jump
2204 * straight to previously cached array in the
2205 * chain */
2206
2207 a = ci->array;
2208 n -= ci->total;
2209 t = ci->total;
f268980d 2210 last_index = ci->last_index;
a4bcff5b
LP
2211 }
2212 }
2213
de190aef
LP
2214 while (a > 0) {
2215 uint64_t left, right, k, lp;
2216
2217 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
cec736d2
LP
2218 if (r < 0)
2219 return r;
2220
de190aef
LP
2221 k = journal_file_entry_array_n_items(array);
2222 right = MIN(k, n);
2223 if (right <= 0)
2224 return 0;
cec736d2 2225
de190aef
LP
2226 i = right - 1;
2227 lp = p = le64toh(array->entry_array.items[i]);
2228 if (p <= 0)
bee6a291
LP
2229 r = -EBADMSG;
2230 else
2231 r = test_object(f, p, needle);
2232 if (r == -EBADMSG) {
2233 log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (1)");
2234 n = i;
2235 continue;
2236 }
de190aef
LP
2237 if (r < 0)
2238 return r;
cec736d2 2239
de190aef
LP
2240 if (r == TEST_FOUND)
2241 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2242
2243 if (r == TEST_RIGHT) {
2244 left = 0;
2245 right -= 1;
f268980d
LP
2246
2247 if (last_index != (uint64_t) -1) {
2248 assert(last_index <= right);
2249
2250 /* If we cached the last index we
2251 * looked at, let's try to not to jump
2252 * too wildly around and see if we can
2253 * limit the range to look at early to
2254 * the immediate neighbors of the last
2255 * index we looked at. */
2256
2257 if (last_index > 0) {
2258 uint64_t x = last_index - 1;
2259
2260 p = le64toh(array->entry_array.items[x]);
2261 if (p <= 0)
2262 return -EBADMSG;
2263
2264 r = test_object(f, p, needle);
2265 if (r < 0)
2266 return r;
2267
2268 if (r == TEST_FOUND)
2269 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2270
2271 if (r == TEST_RIGHT)
2272 right = x;
2273 else
2274 left = x + 1;
2275 }
2276
2277 if (last_index < right) {
2278 uint64_t y = last_index + 1;
2279
2280 p = le64toh(array->entry_array.items[y]);
2281 if (p <= 0)
2282 return -EBADMSG;
2283
2284 r = test_object(f, p, needle);
2285 if (r < 0)
2286 return r;
2287
2288 if (r == TEST_FOUND)
2289 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2290
2291 if (r == TEST_RIGHT)
2292 right = y;
2293 else
2294 left = y + 1;
2295 }
f268980d
LP
2296 }
2297
de190aef
LP
2298 for (;;) {
2299 if (left == right) {
2300 if (direction == DIRECTION_UP)
2301 subtract_one = true;
2302
2303 i = left;
2304 goto found;
2305 }
2306
2307 assert(left < right);
de190aef 2308 i = (left + right) / 2;
f268980d 2309
de190aef
LP
2310 p = le64toh(array->entry_array.items[i]);
2311 if (p <= 0)
bee6a291
LP
2312 r = -EBADMSG;
2313 else
2314 r = test_object(f, p, needle);
2315 if (r == -EBADMSG) {
2316 log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (2)");
2317 right = n = i;
2318 continue;
2319 }
de190aef
LP
2320 if (r < 0)
2321 return r;
cec736d2 2322
de190aef
LP
2323 if (r == TEST_FOUND)
2324 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2325
2326 if (r == TEST_RIGHT)
2327 right = i;
2328 else
2329 left = i + 1;
2330 }
2331 }
2332
2173cbf8 2333 if (k >= n) {
cbdca852
LP
2334 if (direction == DIRECTION_UP) {
2335 i = n;
2336 subtract_one = true;
2337 goto found;
2338 }
2339
cec736d2 2340 return 0;
cbdca852 2341 }
cec736d2 2342
de190aef
LP
2343 last_p = lp;
2344
2345 n -= k;
2346 t += k;
f268980d 2347 last_index = (uint64_t) -1;
de190aef 2348 a = le64toh(array->entry_array.next_entry_array_offset);
cec736d2
LP
2349 }
2350
2351 return 0;
de190aef
LP
2352
2353found:
2354 if (subtract_one && t == 0 && i == 0)
2355 return 0;
2356
a4bcff5b 2357 /* Let's cache this item for the next invocation */
af13a6b0 2358 chain_cache_put(f->chain_cache, ci, first, a, le64toh(array->entry_array.items[0]), t, subtract_one ? (i > 0 ? i-1 : (uint64_t) -1) : i);
a4bcff5b 2359
de190aef
LP
2360 if (subtract_one && i == 0)
2361 p = last_p;
2362 else if (subtract_one)
2363 p = le64toh(array->entry_array.items[i-1]);
2364 else
2365 p = le64toh(array->entry_array.items[i]);
2366
2367 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2368 if (r < 0)
2369 return r;
2370
2371 if (ret)
2372 *ret = o;
2373
2374 if (offset)
2375 *offset = p;
2376
2377 if (idx)
cbdca852 2378 *idx = t + i + (subtract_one ? -1 : 0);
de190aef
LP
2379
2380 return 1;
cec736d2
LP
2381}
2382
f268980d
LP
2383static int generic_array_bisect_plus_one(
2384 JournalFile *f,
2385 uint64_t extra,
2386 uint64_t first,
2387 uint64_t n,
2388 uint64_t needle,
2389 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
2390 direction_t direction,
2391 Object **ret,
2392 uint64_t *offset,
2393 uint64_t *idx) {
de190aef 2394
cec736d2 2395 int r;
cbdca852
LP
2396 bool step_back = false;
2397 Object *o;
cec736d2
LP
2398
2399 assert(f);
de190aef 2400 assert(test_object);
cec736d2 2401
de190aef
LP
2402 if (n <= 0)
2403 return 0;
cec736d2 2404
de190aef
LP
2405 /* This bisects the array in object 'first', but first checks
2406 * an extra */
de190aef
LP
2407 r = test_object(f, extra, needle);
2408 if (r < 0)
2409 return r;
a536e261
LP
2410
2411 if (r == TEST_FOUND)
2412 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
2413
cbdca852
LP
2414 /* if we are looking with DIRECTION_UP then we need to first
2415 see if in the actual array there is a matching entry, and
2416 return the last one of that. But if there isn't any we need
2417 to return this one. Hence remember this, and return it
2418 below. */
2419 if (r == TEST_LEFT)
2420 step_back = direction == DIRECTION_UP;
de190aef 2421
cbdca852
LP
2422 if (r == TEST_RIGHT) {
2423 if (direction == DIRECTION_DOWN)
2424 goto found;
2425 else
2426 return 0;
a536e261 2427 }
cec736d2 2428
de190aef
LP
2429 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
2430
cbdca852
LP
2431 if (r == 0 && step_back)
2432 goto found;
2433
ecf68b1d 2434 if (r > 0 && idx)
313cefa1 2435 (*idx)++;
de190aef
LP
2436
2437 return r;
cbdca852
LP
2438
2439found:
2440 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
2441 if (r < 0)
2442 return r;
2443
2444 if (ret)
2445 *ret = o;
2446
2447 if (offset)
2448 *offset = extra;
2449
2450 if (idx)
2451 *idx = 0;
2452
2453 return 1;
2454}
2455
44a6b1b6 2456_pure_ static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
cbdca852
LP
2457 assert(f);
2458 assert(p > 0);
2459
2460 if (p == needle)
2461 return TEST_FOUND;
2462 else if (p < needle)
2463 return TEST_LEFT;
2464 else
2465 return TEST_RIGHT;
2466}
2467
de190aef
LP
2468static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
2469 Object *o;
2470 int r;
2471
2472 assert(f);
2473 assert(p > 0);
2474
2475 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
cec736d2
LP
2476 if (r < 0)
2477 return r;
2478
de190aef
LP
2479 if (le64toh(o->entry.seqnum) == needle)
2480 return TEST_FOUND;
2481 else if (le64toh(o->entry.seqnum) < needle)
2482 return TEST_LEFT;
2483 else
2484 return TEST_RIGHT;
2485}
cec736d2 2486
de190aef
LP
2487int journal_file_move_to_entry_by_seqnum(
2488 JournalFile *f,
2489 uint64_t seqnum,
2490 direction_t direction,
2491 Object **ret,
2492 uint64_t *offset) {
c88cc6af
VC
2493 assert(f);
2494 assert(f->header);
de190aef
LP
2495
2496 return generic_array_bisect(f,
2497 le64toh(f->header->entry_array_offset),
2498 le64toh(f->header->n_entries),
2499 seqnum,
2500 test_object_seqnum,
2501 direction,
2502 ret, offset, NULL);
2503}
cec736d2 2504
de190aef
LP
2505static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
2506 Object *o;
2507 int r;
2508
2509 assert(f);
2510 assert(p > 0);
2511
2512 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2513 if (r < 0)
2514 return r;
2515
2516 if (le64toh(o->entry.realtime) == needle)
2517 return TEST_FOUND;
2518 else if (le64toh(o->entry.realtime) < needle)
2519 return TEST_LEFT;
2520 else
2521 return TEST_RIGHT;
cec736d2
LP
2522}
2523
de190aef
LP
2524int journal_file_move_to_entry_by_realtime(
2525 JournalFile *f,
2526 uint64_t realtime,
2527 direction_t direction,
2528 Object **ret,
2529 uint64_t *offset) {
c88cc6af
VC
2530 assert(f);
2531 assert(f->header);
de190aef
LP
2532
2533 return generic_array_bisect(f,
2534 le64toh(f->header->entry_array_offset),
2535 le64toh(f->header->n_entries),
2536 realtime,
2537 test_object_realtime,
2538 direction,
2539 ret, offset, NULL);
2540}
2541
2542static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
2543 Object *o;
2544 int r;
2545
2546 assert(f);
2547 assert(p > 0);
2548
2549 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
2550 if (r < 0)
2551 return r;
2552
2553 if (le64toh(o->entry.monotonic) == needle)
2554 return TEST_FOUND;
2555 else if (le64toh(o->entry.monotonic) < needle)
2556 return TEST_LEFT;
2557 else
2558 return TEST_RIGHT;
2559}
2560
2a560338 2561static int find_data_object_by_boot_id(
47838ab3
ZJS
2562 JournalFile *f,
2563 sd_id128_t boot_id,
2564 Object **o,
2565 uint64_t *b) {
2a560338 2566
fbd0b64f 2567 char t[STRLEN("_BOOT_ID=") + 32 + 1] = "_BOOT_ID=";
47838ab3
ZJS
2568
2569 sd_id128_to_string(boot_id, t + 9);
2570 return journal_file_find_data_object(f, t, sizeof(t) - 1, o, b);
2571}
2572
de190aef
LP
2573int journal_file_move_to_entry_by_monotonic(
2574 JournalFile *f,
2575 sd_id128_t boot_id,
2576 uint64_t monotonic,
2577 direction_t direction,
2578 Object **ret,
2579 uint64_t *offset) {
2580
de190aef
LP
2581 Object *o;
2582 int r;
2583
cbdca852 2584 assert(f);
de190aef 2585
47838ab3 2586 r = find_data_object_by_boot_id(f, boot_id, &o, NULL);
de190aef
LP
2587 if (r < 0)
2588 return r;
cbdca852 2589 if (r == 0)
de190aef
LP
2590 return -ENOENT;
2591
2592 return generic_array_bisect_plus_one(f,
2593 le64toh(o->data.entry_offset),
2594 le64toh(o->data.entry_array_offset),
2595 le64toh(o->data.n_entries),
2596 monotonic,
2597 test_object_monotonic,
2598 direction,
2599 ret, offset, NULL);
2600}
2601
1fc605b0 2602void journal_file_reset_location(JournalFile *f) {
6573ef05 2603 f->location_type = LOCATION_HEAD;
1fc605b0 2604 f->current_offset = 0;
6573ef05
MS
2605 f->current_seqnum = 0;
2606 f->current_realtime = 0;
2607 f->current_monotonic = 0;
2608 zero(f->current_boot_id);
2609 f->current_xor_hash = 0;
2610}
2611
950c07d4 2612void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
6573ef05
MS
2613 f->location_type = LOCATION_SEEK;
2614 f->current_offset = offset;
2615 f->current_seqnum = le64toh(o->entry.seqnum);
2616 f->current_realtime = le64toh(o->entry.realtime);
2617 f->current_monotonic = le64toh(o->entry.monotonic);
2618 f->current_boot_id = o->entry.boot_id;
2619 f->current_xor_hash = le64toh(o->entry.xor_hash);
1fc605b0
MS
2620}
2621
d8ae66d7
MS
2622int journal_file_compare_locations(JournalFile *af, JournalFile *bf) {
2623 assert(af);
c88cc6af 2624 assert(af->header);
d8ae66d7 2625 assert(bf);
c88cc6af 2626 assert(bf->header);
d8ae66d7
MS
2627 assert(af->location_type == LOCATION_SEEK);
2628 assert(bf->location_type == LOCATION_SEEK);
2629
2630 /* If contents and timestamps match, these entries are
2631 * identical, even if the seqnum does not match */
2632 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id) &&
2633 af->current_monotonic == bf->current_monotonic &&
2634 af->current_realtime == bf->current_realtime &&
2635 af->current_xor_hash == bf->current_xor_hash)
2636 return 0;
2637
2638 if (sd_id128_equal(af->header->seqnum_id, bf->header->seqnum_id)) {
2639
2640 /* If this is from the same seqnum source, compare
2641 * seqnums */
2642 if (af->current_seqnum < bf->current_seqnum)
2643 return -1;
2644 if (af->current_seqnum > bf->current_seqnum)
2645 return 1;
2646
2647 /* Wow! This is weird, different data but the same
2648 * seqnums? Something is borked, but let's make the
2649 * best of it and compare by time. */
2650 }
2651
2652 if (sd_id128_equal(af->current_boot_id, bf->current_boot_id)) {
2653
2654 /* If the boot id matches, compare monotonic time */
2655 if (af->current_monotonic < bf->current_monotonic)
2656 return -1;
2657 if (af->current_monotonic > bf->current_monotonic)
2658 return 1;
2659 }
2660
2661 /* Otherwise, compare UTC time */
2662 if (af->current_realtime < bf->current_realtime)
2663 return -1;
2664 if (af->current_realtime > bf->current_realtime)
2665 return 1;
2666
2667 /* Finally, compare by contents */
2668 if (af->current_xor_hash < bf->current_xor_hash)
2669 return -1;
2670 if (af->current_xor_hash > bf->current_xor_hash)
2671 return 1;
2672
2673 return 0;
2674}
2675
aa598ba5
LP
2676static int bump_array_index(uint64_t *i, direction_t direction, uint64_t n) {
2677
2678 /* Increase or decrease the specified index, in the right direction. */
2679
2680 if (direction == DIRECTION_DOWN) {
2681 if (*i >= n - 1)
2682 return 0;
2683
2684 (*i) ++;
2685 } else {
2686 if (*i <= 0)
2687 return 0;
2688
2689 (*i) --;
2690 }
2691
2692 return 1;
2693}
2694
b6da4ed0
LP
2695static bool check_properly_ordered(uint64_t new_offset, uint64_t old_offset, direction_t direction) {
2696
2697 /* Consider it an error if any of the two offsets is uninitialized */
2698 if (old_offset == 0 || new_offset == 0)
2699 return false;
2700
2701 /* If we go down, the new offset must be larger than the old one. */
2702 return direction == DIRECTION_DOWN ?
2703 new_offset > old_offset :
2704 new_offset < old_offset;
2705}
2706
de190aef
LP
2707int journal_file_next_entry(
2708 JournalFile *f,
f534928a 2709 uint64_t p,
de190aef
LP
2710 direction_t direction,
2711 Object **ret, uint64_t *offset) {
2712
fb099c8d 2713 uint64_t i, n, ofs;
cec736d2
LP
2714 int r;
2715
2716 assert(f);
c88cc6af 2717 assert(f->header);
de190aef
LP
2718
2719 n = le64toh(f->header->n_entries);
2720 if (n <= 0)
2721 return 0;
cec736d2 2722
f534928a 2723 if (p == 0)
de190aef 2724 i = direction == DIRECTION_DOWN ? 0 : n - 1;
cec736d2 2725 else {
de190aef
LP
2726 r = generic_array_bisect(f,
2727 le64toh(f->header->entry_array_offset),
2728 le64toh(f->header->n_entries),
2729 p,
2730 test_object_offset,
2731 DIRECTION_DOWN,
2732 NULL, NULL,
2733 &i);
2734 if (r <= 0)
2735 return r;
2736
aa598ba5
LP
2737 r = bump_array_index(&i, direction, n);
2738 if (r <= 0)
2739 return r;
cec736d2
LP
2740 }
2741
de190aef 2742 /* And jump to it */
989793d3
LP
2743 for (;;) {
2744 r = generic_array_get(f,
2745 le64toh(f->header->entry_array_offset),
2746 i,
2747 ret, &ofs);
2748 if (r > 0)
2749 break;
2750 if (r != -EBADMSG)
2751 return r;
2752
2753 /* OK, so this entry is borked. Most likely some entry didn't get synced to disk properly, let's see if
2754 * the next one might work for us instead. */
2755 log_debug_errno(r, "Entry item %" PRIu64 " is bad, skipping over it.", i);
2756
2757 r = bump_array_index(&i, direction, n);
2758 if (r <= 0)
2759 return r;
caeab8f6 2760 }
fb099c8d 2761
b6da4ed0
LP
2762 /* Ensure our array is properly ordered. */
2763 if (p > 0 && !check_properly_ordered(ofs, p, direction)) {
2764 log_debug("%s: entry array not properly ordered at entry %" PRIu64, f->path, i);
fb099c8d
ZJS
2765 return -EBADMSG;
2766 }
2767
2768 if (offset)
2769 *offset = ofs;
2770
2771 return 1;
de190aef 2772}
cec736d2 2773
de190aef
LP
2774int journal_file_next_entry_for_data(
2775 JournalFile *f,
2776 Object *o, uint64_t p,
2777 uint64_t data_offset,
2778 direction_t direction,
2779 Object **ret, uint64_t *offset) {
2780
ded5034e 2781 uint64_t i, n, ofs;
de190aef 2782 Object *d;
989793d3 2783 int r;
cec736d2
LP
2784
2785 assert(f);
de190aef 2786 assert(p > 0 || !o);
cec736d2 2787
de190aef 2788 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
466ccd92 2789 if (r < 0)
de190aef 2790 return r;
cec736d2 2791
de190aef
LP
2792 n = le64toh(d->data.n_entries);
2793 if (n <= 0)
2794 return n;
cec736d2 2795
de190aef
LP
2796 if (!o)
2797 i = direction == DIRECTION_DOWN ? 0 : n - 1;
2798 else {
2799 if (o->object.type != OBJECT_ENTRY)
2800 return -EINVAL;
cec736d2 2801
de190aef
LP
2802 r = generic_array_bisect_plus_one(f,
2803 le64toh(d->data.entry_offset),
2804 le64toh(d->data.entry_array_offset),
2805 le64toh(d->data.n_entries),
2806 p,
2807 test_object_offset,
2808 DIRECTION_DOWN,
2809 NULL, NULL,
2810 &i);
2811
2812 if (r <= 0)
cec736d2
LP
2813 return r;
2814
aa598ba5
LP
2815 r = bump_array_index(&i, direction, n);
2816 if (r <= 0)
2817 return r;
de190aef 2818 }
cec736d2 2819
989793d3
LP
2820 for (;;) {
2821 r = generic_array_get_plus_one(f,
2822 le64toh(d->data.entry_offset),
2823 le64toh(d->data.entry_array_offset),
2824 i,
2825 ret, &ofs);
2826 if (r > 0)
2827 break;
2828 if (r != -EBADMSG)
2829 return r;
2830
2831 log_debug_errno(r, "Data entry item %" PRIu64 " is bad, skipping over it.", i);
2832
2833 r = bump_array_index(&i, direction, n);
2834 if (r <= 0)
2835 return r;
2836 }
ded5034e
LP
2837
2838 /* Ensure our array is properly ordered. */
2839 if (p > 0 && check_properly_ordered(ofs, p, direction)) {
2840 log_debug("%s data entry array not properly ordered at entry %" PRIu64, f->path, i);
2841 return -EBADMSG;
2842 }
2843
2844 if (offset)
2845 *offset = ofs;
2846
2847 return 1;
de190aef 2848}
cec736d2 2849
cbdca852
LP
2850int journal_file_move_to_entry_by_offset_for_data(
2851 JournalFile *f,
2852 uint64_t data_offset,
2853 uint64_t p,
2854 direction_t direction,
2855 Object **ret, uint64_t *offset) {
2856
2857 int r;
2858 Object *d;
2859
2860 assert(f);
2861
2862 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2863 if (r < 0)
2864 return r;
2865
2866 return generic_array_bisect_plus_one(f,
2867 le64toh(d->data.entry_offset),
2868 le64toh(d->data.entry_array_offset),
2869 le64toh(d->data.n_entries),
2870 p,
2871 test_object_offset,
2872 direction,
2873 ret, offset, NULL);
2874}
2875
2876int journal_file_move_to_entry_by_monotonic_for_data(
2877 JournalFile *f,
2878 uint64_t data_offset,
2879 sd_id128_t boot_id,
2880 uint64_t monotonic,
2881 direction_t direction,
2882 Object **ret, uint64_t *offset) {
2883
cbdca852
LP
2884 Object *o, *d;
2885 int r;
2886 uint64_t b, z;
2887
2888 assert(f);
2889
2890 /* First, seek by time */
47838ab3 2891 r = find_data_object_by_boot_id(f, boot_id, &o, &b);
cbdca852
LP
2892 if (r < 0)
2893 return r;
2894 if (r == 0)
2895 return -ENOENT;
2896
2897 r = generic_array_bisect_plus_one(f,
2898 le64toh(o->data.entry_offset),
2899 le64toh(o->data.entry_array_offset),
2900 le64toh(o->data.n_entries),
2901 monotonic,
2902 test_object_monotonic,
2903 direction,
2904 NULL, &z, NULL);
2905 if (r <= 0)
2906 return r;
2907
2908 /* And now, continue seeking until we find an entry that
2909 * exists in both bisection arrays */
2910
2911 for (;;) {
2912 Object *qo;
2913 uint64_t p, q;
2914
2915 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
2916 if (r < 0)
2917 return r;
2918
2919 r = generic_array_bisect_plus_one(f,
2920 le64toh(d->data.entry_offset),
2921 le64toh(d->data.entry_array_offset),
2922 le64toh(d->data.n_entries),
2923 z,
2924 test_object_offset,
2925 direction,
2926 NULL, &p, NULL);
2927 if (r <= 0)
2928 return r;
2929
2930 r = journal_file_move_to_object(f, OBJECT_DATA, b, &o);
2931 if (r < 0)
2932 return r;
2933
2934 r = generic_array_bisect_plus_one(f,
2935 le64toh(o->data.entry_offset),
2936 le64toh(o->data.entry_array_offset),
2937 le64toh(o->data.n_entries),
2938 p,
2939 test_object_offset,
2940 direction,
2941 &qo, &q, NULL);
2942
2943 if (r <= 0)
2944 return r;
2945
2946 if (p == q) {
2947 if (ret)
2948 *ret = qo;
2949 if (offset)
2950 *offset = q;
2951
2952 return 1;
2953 }
2954
2955 z = q;
2956 }
cbdca852
LP
2957}
2958
de190aef
LP
2959int journal_file_move_to_entry_by_seqnum_for_data(
2960 JournalFile *f,
2961 uint64_t data_offset,
2962 uint64_t seqnum,
2963 direction_t direction,
2964 Object **ret, uint64_t *offset) {
cec736d2 2965
de190aef
LP
2966 Object *d;
2967 int r;
cec736d2 2968
91a31dde
LP
2969 assert(f);
2970
de190aef 2971 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2972 if (r < 0)
de190aef 2973 return r;
cec736d2 2974
de190aef
LP
2975 return generic_array_bisect_plus_one(f,
2976 le64toh(d->data.entry_offset),
2977 le64toh(d->data.entry_array_offset),
2978 le64toh(d->data.n_entries),
2979 seqnum,
2980 test_object_seqnum,
2981 direction,
2982 ret, offset, NULL);
2983}
cec736d2 2984
de190aef
LP
2985int journal_file_move_to_entry_by_realtime_for_data(
2986 JournalFile *f,
2987 uint64_t data_offset,
2988 uint64_t realtime,
2989 direction_t direction,
2990 Object **ret, uint64_t *offset) {
2991
2992 Object *d;
2993 int r;
2994
91a31dde
LP
2995 assert(f);
2996
de190aef 2997 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
91a31dde 2998 if (r < 0)
de190aef
LP
2999 return r;
3000
3001 return generic_array_bisect_plus_one(f,
3002 le64toh(d->data.entry_offset),
3003 le64toh(d->data.entry_array_offset),
3004 le64toh(d->data.n_entries),
3005 realtime,
3006 test_object_realtime,
3007 direction,
3008 ret, offset, NULL);
cec736d2
LP
3009}
3010
0284adc6 3011void journal_file_dump(JournalFile *f) {
7560fffc 3012 Object *o;
7560fffc 3013 int r;
0284adc6 3014 uint64_t p;
7560fffc
LP
3015
3016 assert(f);
c88cc6af 3017 assert(f->header);
7560fffc 3018
0284adc6 3019 journal_file_print_header(f);
7560fffc 3020
0284adc6
LP
3021 p = le64toh(f->header->header_size);
3022 while (p != 0) {
d05089d8 3023 r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o);
0284adc6
LP
3024 if (r < 0)
3025 goto fail;
7560fffc 3026
0284adc6 3027 switch (o->object.type) {
d98cc1f2 3028
0284adc6
LP
3029 case OBJECT_UNUSED:
3030 printf("Type: OBJECT_UNUSED\n");
3031 break;
d98cc1f2 3032
0284adc6
LP
3033 case OBJECT_DATA:
3034 printf("Type: OBJECT_DATA\n");
3035 break;
7560fffc 3036
3c1668da
LP
3037 case OBJECT_FIELD:
3038 printf("Type: OBJECT_FIELD\n");
3039 break;
3040
0284adc6 3041 case OBJECT_ENTRY:
507f22bd
ZJS
3042 printf("Type: OBJECT_ENTRY seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n",
3043 le64toh(o->entry.seqnum),
3044 le64toh(o->entry.monotonic),
3045 le64toh(o->entry.realtime));
0284adc6 3046 break;
7560fffc 3047
0284adc6
LP
3048 case OBJECT_FIELD_HASH_TABLE:
3049 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
3050 break;
7560fffc 3051
0284adc6
LP
3052 case OBJECT_DATA_HASH_TABLE:
3053 printf("Type: OBJECT_DATA_HASH_TABLE\n");
3054 break;
7560fffc 3055
0284adc6
LP
3056 case OBJECT_ENTRY_ARRAY:
3057 printf("Type: OBJECT_ENTRY_ARRAY\n");
3058 break;
7560fffc 3059
0284adc6 3060 case OBJECT_TAG:
507f22bd
ZJS
3061 printf("Type: OBJECT_TAG seqnum=%"PRIu64" epoch=%"PRIu64"\n",
3062 le64toh(o->tag.seqnum),
3063 le64toh(o->tag.epoch));
0284adc6 3064 break;
3c1668da
LP
3065
3066 default:
8facc349 3067 printf("Type: unknown (%i)\n", o->object.type);
3c1668da 3068 break;
0284adc6 3069 }
7560fffc 3070
d89c8fdf
ZJS
3071 if (o->object.flags & OBJECT_COMPRESSION_MASK)
3072 printf("Flags: %s\n",
3073 object_compressed_to_string(o->object.flags & OBJECT_COMPRESSION_MASK));
7560fffc 3074
0284adc6
LP
3075 if (p == le64toh(f->header->tail_object_offset))
3076 p = 0;
3077 else
3078 p = p + ALIGN64(le64toh(o->object.size));
3079 }
7560fffc 3080
0284adc6
LP
3081 return;
3082fail:
3083 log_error("File corrupt");
7560fffc
LP
3084}
3085
718fe4b1
ZJS
3086static const char* format_timestamp_safe(char *buf, size_t l, usec_t t) {
3087 const char *x;
3088
3089 x = format_timestamp(buf, l, t);
3090 if (x)
3091 return x;
3092 return " --- ";
3093}
3094
0284adc6 3095void journal_file_print_header(JournalFile *f) {
2765b7bb 3096 char a[33], b[33], c[33], d[33];
ed375beb 3097 char x[FORMAT_TIMESTAMP_MAX], y[FORMAT_TIMESTAMP_MAX], z[FORMAT_TIMESTAMP_MAX];
a1a03e30
LP
3098 struct stat st;
3099 char bytes[FORMAT_BYTES_MAX];
7560fffc
LP
3100
3101 assert(f);
c88cc6af 3102 assert(f->header);
7560fffc 3103
0284adc6
LP
3104 printf("File Path: %s\n"
3105 "File ID: %s\n"
3106 "Machine ID: %s\n"
3107 "Boot ID: %s\n"
3108 "Sequential Number ID: %s\n"
3109 "State: %s\n"
3110 "Compatible Flags:%s%s\n"
d89c8fdf 3111 "Incompatible Flags:%s%s%s\n"
507f22bd
ZJS
3112 "Header size: %"PRIu64"\n"
3113 "Arena size: %"PRIu64"\n"
3114 "Data Hash Table Size: %"PRIu64"\n"
3115 "Field Hash Table Size: %"PRIu64"\n"
0284adc6 3116 "Rotate Suggested: %s\n"
0808b92f
LP
3117 "Head Sequential Number: %"PRIu64" (%"PRIx64")\n"
3118 "Tail Sequential Number: %"PRIu64" (%"PRIx64")\n"
3119 "Head Realtime Timestamp: %s (%"PRIx64")\n"
3120 "Tail Realtime Timestamp: %s (%"PRIx64")\n"
3121 "Tail Monotonic Timestamp: %s (%"PRIx64")\n"
507f22bd
ZJS
3122 "Objects: %"PRIu64"\n"
3123 "Entry Objects: %"PRIu64"\n",
0284adc6
LP
3124 f->path,
3125 sd_id128_to_string(f->header->file_id, a),
3126 sd_id128_to_string(f->header->machine_id, b),
3127 sd_id128_to_string(f->header->boot_id, c),
2765b7bb 3128 sd_id128_to_string(f->header->seqnum_id, d),
3223f44f
LP
3129 f->header->state == STATE_OFFLINE ? "OFFLINE" :
3130 f->header->state == STATE_ONLINE ? "ONLINE" :
3131 f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN",
8088cbd3 3132 JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "",
d89c8fdf
ZJS
3133 (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "",
3134 JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "",
3135 JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "",
3136 (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "",
507f22bd
ZJS
3137 le64toh(f->header->header_size),
3138 le64toh(f->header->arena_size),
3139 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3140 le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
fb0951b0 3141 yes_no(journal_file_rotate_suggested(f, 0)),
0808b92f
LP
3142 le64toh(f->header->head_entry_seqnum), le64toh(f->header->head_entry_seqnum),
3143 le64toh(f->header->tail_entry_seqnum), le64toh(f->header->tail_entry_seqnum),
3144 format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)), le64toh(f->header->head_entry_realtime),
3145 format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)), le64toh(f->header->tail_entry_realtime),
3146 format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC), le64toh(f->header->tail_entry_monotonic),
507f22bd
ZJS
3147 le64toh(f->header->n_objects),
3148 le64toh(f->header->n_entries));
7560fffc 3149
0284adc6 3150 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
507f22bd 3151 printf("Data Objects: %"PRIu64"\n"
0284adc6 3152 "Data Hash Table Fill: %.1f%%\n",
507f22bd 3153 le64toh(f->header->n_data),
0284adc6 3154 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))));
7560fffc 3155
0284adc6 3156 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
507f22bd 3157 printf("Field Objects: %"PRIu64"\n"
0284adc6 3158 "Field Hash Table Fill: %.1f%%\n",
507f22bd 3159 le64toh(f->header->n_fields),
0284adc6 3160 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))));
3223f44f
LP
3161
3162 if (JOURNAL_HEADER_CONTAINS(f->header, n_tags))
507f22bd
ZJS
3163 printf("Tag Objects: %"PRIu64"\n",
3164 le64toh(f->header->n_tags));
3223f44f 3165 if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays))
507f22bd
ZJS
3166 printf("Entry Array Objects: %"PRIu64"\n",
3167 le64toh(f->header->n_entry_arrays));
a1a03e30
LP
3168
3169 if (fstat(f->fd, &st) >= 0)
59f448cf 3170 printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
7560fffc
LP
3171}
3172
fc68c929
LP
3173static int journal_file_warn_btrfs(JournalFile *f) {
3174 unsigned attrs;
3175 int r;
3176
3177 assert(f);
3178
3179 /* Before we write anything, check if the COW logic is turned
3180 * off on btrfs. Given our write pattern that is quite
3181 * unfriendly to COW file systems this should greatly improve
3182 * performance on COW file systems, such as btrfs, at the
3183 * expense of data integrity features (which shouldn't be too
3184 * bad, given that we do our own checksumming). */
3185
3186 r = btrfs_is_filesystem(f->fd);
3187 if (r < 0)
3188 return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
3189 if (!r)
3190 return 0;
3191
3192 r = read_attr_fd(f->fd, &attrs);
3193 if (r < 0)
3194 return log_warning_errno(r, "Failed to read file attributes: %m");
3195
3196 if (attrs & FS_NOCOW_FL) {
3197 log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
3198 return 0;
3199 }
3200
3201 log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
3202 "This is likely to slow down journal access substantially, please consider turning "
3203 "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
3204
3205 return 1;
3206}
3207
0284adc6 3208int journal_file_open(
5d1ce257 3209 int fd,
0284adc6
LP
3210 const char *fname,
3211 int flags,
3212 mode_t mode,
3213 bool compress,
baed47c3 3214 bool seal,
0284adc6
LP
3215 JournalMetrics *metrics,
3216 MMapCache *mmap_cache,
b58c888f 3217 Set *deferred_closes,
0284adc6
LP
3218 JournalFile *template,
3219 JournalFile **ret) {
7560fffc 3220
fa6ac760 3221 bool newly_created = false;
0284adc6 3222 JournalFile *f;
fa6ac760 3223 void *h;
0284adc6 3224 int r;
7560fffc 3225
0559d3a5 3226 assert(ret);
5d1ce257 3227 assert(fd >= 0 || fname);
7560fffc 3228
ec2ce0c5 3229 if (!IN_SET((flags & O_ACCMODE), O_RDONLY, O_RDWR))
0284adc6 3230 return -EINVAL;
7560fffc 3231
6eda13d3
LP
3232 if (fname && (flags & O_CREAT) && !endswith(fname, ".journal"))
3233 return -EINVAL;
7560fffc 3234
0284adc6
LP
3235 f = new0(JournalFile, 1);
3236 if (!f)
3237 return -ENOMEM;
7560fffc 3238
5d1ce257 3239 f->fd = fd;
0284adc6 3240 f->mode = mode;
7560fffc 3241
0284adc6
LP
3242 f->flags = flags;
3243 f->prot = prot_from_flags(flags);
3244 f->writable = (flags & O_ACCMODE) != O_RDONLY;
349cc4a5 3245#if HAVE_LZ4
d89c8fdf 3246 f->compress_lz4 = compress;
349cc4a5 3247#elif HAVE_XZ
d89c8fdf 3248 f->compress_xz = compress;
48b61739 3249#endif
349cc4a5 3250#if HAVE_GCRYPT
baed47c3 3251 f->seal = seal;
49a32d43 3252#endif
7560fffc 3253
0284adc6
LP
3254 if (mmap_cache)
3255 f->mmap = mmap_cache_ref(mmap_cache);
3256 else {
84168d80 3257 f->mmap = mmap_cache_new();
0284adc6
LP
3258 if (!f->mmap) {
3259 r = -ENOMEM;
3260 goto fail;
3261 }
3262 }
7560fffc 3263
7645c77b 3264 if (fname) {
5d1ce257 3265 f->path = strdup(fname);
7645c77b
ZJS
3266 if (!f->path) {
3267 r = -ENOMEM;
3268 goto fail;
3269 }
3270 } else {
817b1c5b
LP
3271 assert(fd >= 0);
3272
7645c77b
ZJS
3273 /* If we don't know the path, fill in something explanatory and vaguely useful */
3274 if (asprintf(&f->path, "/proc/self/%i", fd) < 0) {
3275 r = -ENOMEM;
3276 goto fail;
3277 }
0284adc6 3278 }
7560fffc 3279
4743015d 3280 f->chain_cache = ordered_hashmap_new(&uint64_hash_ops);
a4bcff5b
LP
3281 if (!f->chain_cache) {
3282 r = -ENOMEM;
3283 goto fail;
3284 }
3285
0284adc6 3286 if (f->fd < 0) {
817b1c5b
LP
3287 /* We pass O_NONBLOCK here, so that in case somebody pointed us to some character device node or FIFO
3288 * or so, we likely fail quickly than block for long. For regular files O_NONBLOCK has no effect, hence
3289 * it doesn't hurt in that case. */
3290
3291 f->fd = open(f->path, f->flags|O_CLOEXEC|O_NONBLOCK, f->mode);
5d1ce257
LP
3292 if (f->fd < 0) {
3293 r = -errno;
3294 goto fail;
3295 }
3296
3297 /* fds we opened here by us should also be closed by us. */
3298 f->close_fd = true;
817b1c5b
LP
3299
3300 r = fd_nonblock(f->fd, false);
3301 if (r < 0)
3302 goto fail;
7560fffc 3303 }
7560fffc 3304
be7cdd8e
VC
3305 f->cache_fd = mmap_cache_add_fd(f->mmap, f->fd);
3306 if (!f->cache_fd) {
3307 r = -ENOMEM;
3308 goto fail;
3309 }
3310
2678031a
LP
3311 r = journal_file_fstat(f);
3312 if (r < 0)
0284adc6 3313 goto fail;
7560fffc 3314
0284adc6 3315 if (f->last_stat.st_size == 0 && f->writable) {
11689d2a 3316
fc68c929 3317 (void) journal_file_warn_btrfs(f);
11689d2a 3318
4c2e1b39
LP
3319 /* Let's attach the creation time to the journal file, so that the vacuuming code knows the age of this
3320 * file even if the file might end up corrupted one day... Ideally we'd just use the creation time many
3321 * file systems maintain for each file, but the API to query this is very new, hence let's emulate this
3322 * via extended attributes. If extended attributes are not supported we'll just skip this, and rely
3323 * solely on mtime/atime/ctime of the file. */
3324 (void) fd_setcrtime(f->fd, 0);
7560fffc 3325
349cc4a5 3326#if HAVE_GCRYPT
0284adc6 3327 /* Try to load the FSPRG state, and if we can't, then
baed47c3 3328 * just don't do sealing */
49a32d43
LP
3329 if (f->seal) {
3330 r = journal_file_fss_load(f);
3331 if (r < 0)
3332 f->seal = false;
3333 }
feb12d3e 3334#endif
7560fffc 3335
0284adc6
LP
3336 r = journal_file_init_header(f, template);
3337 if (r < 0)
3338 goto fail;
7560fffc 3339
2678031a
LP
3340 r = journal_file_fstat(f);
3341 if (r < 0)
0284adc6 3342 goto fail;
fb0951b0
LP
3343
3344 newly_created = true;
0284adc6 3345 }
7560fffc 3346
0284adc6 3347 if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) {
cfb571f3 3348 r = -ENODATA;
0284adc6
LP
3349 goto fail;
3350 }
7560fffc 3351
b42549ad 3352 r = mmap_cache_get(f->mmap, f->cache_fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h, NULL);
977eaa1e 3353 if (r < 0)
0284adc6 3354 goto fail;
7560fffc 3355
fa6ac760
LP
3356 f->header = h;
3357
0284adc6 3358 if (!newly_created) {
f9168190 3359 set_clear_with_destructor(deferred_closes, journal_file_close);
b58c888f 3360
0284adc6
LP
3361 r = journal_file_verify_header(f);
3362 if (r < 0)
3363 goto fail;
3364 }
7560fffc 3365
349cc4a5 3366#if HAVE_GCRYPT
0284adc6 3367 if (!newly_created && f->writable) {
baed47c3 3368 r = journal_file_fss_load(f);
0284adc6
LP
3369 if (r < 0)
3370 goto fail;
3371 }
feb12d3e 3372#endif
cec736d2
LP
3373
3374 if (f->writable) {
4a92baf3
LP
3375 if (metrics) {
3376 journal_default_metrics(metrics, f->fd);
3377 f->metrics = *metrics;
3378 } else if (template)
3379 f->metrics = template->metrics;
3380
cec736d2
LP
3381 r = journal_file_refresh_header(f);
3382 if (r < 0)
3383 goto fail;
3384 }
3385
349cc4a5 3386#if HAVE_GCRYPT
baed47c3 3387 r = journal_file_hmac_setup(f);
14d10188
LP
3388 if (r < 0)
3389 goto fail;
feb12d3e 3390#endif
14d10188 3391
cec736d2 3392 if (newly_created) {
de190aef 3393 r = journal_file_setup_field_hash_table(f);
cec736d2
LP
3394 if (r < 0)
3395 goto fail;
3396
de190aef 3397 r = journal_file_setup_data_hash_table(f);
cec736d2
LP
3398 if (r < 0)
3399 goto fail;
7560fffc 3400
349cc4a5 3401#if HAVE_GCRYPT
7560fffc
LP
3402 r = journal_file_append_first_tag(f);
3403 if (r < 0)
3404 goto fail;
feb12d3e 3405#endif
cec736d2
LP
3406 }
3407
be7cdd8e 3408 if (mmap_cache_got_sigbus(f->mmap, f->cache_fd)) {
fa6ac760
LP
3409 r = -EIO;
3410 goto fail;
3411 }
3412
7a24f3bf 3413 if (template && template->post_change_timer) {
e167d7fd
LP
3414 r = journal_file_enable_post_change_timer(
3415 f,
3416 sd_event_source_get_event(template->post_change_timer),
3417 template->post_change_timer_period);
7a24f3bf 3418
7a24f3bf
VC
3419 if (r < 0)
3420 goto fail;
3421 }
3422
f8e2f4d6 3423 /* The file is opened now successfully, thus we take possession of any passed in fd. */
5d1ce257
LP
3424 f->close_fd = true;
3425
0559d3a5 3426 *ret = f;
cec736d2
LP
3427 return 0;
3428
3429fail:
be7cdd8e 3430 if (f->cache_fd && mmap_cache_got_sigbus(f->mmap, f->cache_fd))
fa6ac760
LP
3431 r = -EIO;
3432
69a3a6fd 3433 (void) journal_file_close(f);
cec736d2
LP
3434
3435 return r;
3436}
0ac38b70 3437
b58c888f 3438int journal_file_rotate(JournalFile **f, bool compress, bool seal, Set *deferred_closes) {
57535f47 3439 _cleanup_free_ char *p = NULL;
0ac38b70
LP
3440 size_t l;
3441 JournalFile *old_file, *new_file = NULL;
3442 int r;
3443
3444 assert(f);
3445 assert(*f);
3446
3447 old_file = *f;
3448
3449 if (!old_file->writable)
3450 return -EINVAL;
3451
5d1ce257 3452 /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse
13e785f7 3453 * rotation, since we don't know the actual path, and couldn't rename the file hence. */
5d1ce257
LP
3454 if (path_startswith(old_file->path, "/proc/self/fd"))
3455 return -EINVAL;
3456
0ac38b70
LP
3457 if (!endswith(old_file->path, ".journal"))
3458 return -EINVAL;
3459
3460 l = strlen(old_file->path);
57535f47
ZJS
3461 r = asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal",
3462 (int) l - 8, old_file->path,
3463 SD_ID128_FORMAT_VAL(old_file->header->seqnum_id),
3464 le64toh((*f)->header->head_entry_seqnum),
3465 le64toh((*f)->header->head_entry_realtime));
3466 if (r < 0)
0ac38b70
LP
3467 return -ENOMEM;
3468
2678031a
LP
3469 /* Try to rename the file to the archived version. If the file
3470 * already was deleted, we'll get ENOENT, let's ignore that
3471 * case. */
0ac38b70 3472 r = rename(old_file->path, p);
2678031a 3473 if (r < 0 && errno != ENOENT)
0ac38b70
LP
3474 return -errno;
3475
1fcefd88
LP
3476 /* Sync the rename to disk */
3477 (void) fsync_directory_of_file(old_file->fd);
3478
8eb85171
VC
3479 /* Set as archive so offlining commits w/state=STATE_ARCHIVED.
3480 * Previously we would set old_file->header->state to STATE_ARCHIVED directly here,
3481 * but journal_file_set_offline() short-circuits when state != STATE_ONLINE, which
3482 * would result in the rotated journal never getting fsync() called before closing.
3483 * Now we simply queue the archive state by setting an archive bit, leaving the state
3484 * as STATE_ONLINE so proper offlining occurs. */
3485 old_file->archive = true;
0ac38b70 3486
f27a3864
LP
3487 /* Currently, btrfs is not very good with out write patterns
3488 * and fragments heavily. Let's defrag our journal files when
3489 * we archive them */
3490 old_file->defrag_on_close = true;
3491
5d1ce257 3492 r = journal_file_open(-1, old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, deferred_closes, old_file, &new_file);
b58c888f
VC
3493
3494 if (deferred_closes &&
3495 set_put(deferred_closes, old_file) >= 0)
3496 (void) journal_file_set_offline(old_file, false);
3497 else
3498 (void) journal_file_close(old_file);
0ac38b70
LP
3499
3500 *f = new_file;
3501 return r;
3502}
3503
9447a7f1
LP
3504int journal_file_open_reliably(
3505 const char *fname,
3506 int flags,
3507 mode_t mode,
7560fffc 3508 bool compress,
baed47c3 3509 bool seal,
4a92baf3 3510 JournalMetrics *metrics,
27370278 3511 MMapCache *mmap_cache,
b58c888f 3512 Set *deferred_closes,
9447a7f1
LP
3513 JournalFile *template,
3514 JournalFile **ret) {
3515
3516 int r;
3517 size_t l;
ed375beb 3518 _cleanup_free_ char *p = NULL;
9447a7f1 3519
5d1ce257 3520 r = journal_file_open(-1, fname, flags, mode, compress, seal, metrics, mmap_cache, deferred_closes, template, ret);
288359db 3521 if (!IN_SET(r,
b288cdeb
ZJS
3522 -EBADMSG, /* Corrupted */
3523 -ENODATA, /* Truncated */
3524 -EHOSTDOWN, /* Other machine */
3525 -EPROTONOSUPPORT, /* Incompatible feature */
3526 -EBUSY, /* Unclean shutdown */
3527 -ESHUTDOWN, /* Already archived */
288359db 3528 -EIO, /* IO error, including SIGBUS on mmap */
ae739cc1
LP
3529 -EIDRM, /* File has been deleted */
3530 -ETXTBSY)) /* File is from the future */
9447a7f1
LP
3531 return r;
3532
3533 if ((flags & O_ACCMODE) == O_RDONLY)
3534 return r;
3535
3536 if (!(flags & O_CREAT))
3537 return r;
3538
7560fffc
LP
3539 if (!endswith(fname, ".journal"))
3540 return r;
3541
5c70eab4
LP
3542 /* The file is corrupted. Rotate it away and try it again (but only once) */
3543
9447a7f1 3544 l = strlen(fname);
d587eca5 3545 if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
57535f47 3546 (int) l - 8, fname,
d587eca5 3547 now(CLOCK_REALTIME),
9bf3b535 3548 random_u64()) < 0)
9447a7f1
LP
3549 return -ENOMEM;
3550
65089b82 3551 if (rename(fname, p) < 0)
9447a7f1
LP
3552 return -errno;
3553
f27a3864
LP
3554 /* btrfs doesn't cope well with our write pattern and
3555 * fragments heavily. Let's defrag all files we rotate */
11689d2a 3556
a67d68b8 3557 (void) chattr_path(p, 0, FS_NOCOW_FL);
f27a3864
LP
3558 (void) btrfs_defrag(p);
3559
65089b82 3560 log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
9447a7f1 3561
5d1ce257 3562 return journal_file_open(-1, fname, flags, mode, compress, seal, metrics, mmap_cache, deferred_closes, template, ret);
9447a7f1
LP
3563}
3564
cf244689
LP
3565int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
3566 uint64_t i, n;
3567 uint64_t q, xor_hash = 0;
3568 int r;
3569 EntryItem *items;
3570 dual_timestamp ts;
3571
3572 assert(from);
3573 assert(to);
3574 assert(o);
3575 assert(p);
3576
3577 if (!to->writable)
3578 return -EPERM;
3579
3580 ts.monotonic = le64toh(o->entry.monotonic);
3581 ts.realtime = le64toh(o->entry.realtime);
3582
cf244689 3583 n = journal_file_entry_n_items(o);
4faa7004
TA
3584 /* alloca() can't take 0, hence let's allocate at least one */
3585 items = alloca(sizeof(EntryItem) * MAX(1u, n));
cf244689
LP
3586
3587 for (i = 0; i < n; i++) {
4fd052ae
FC
3588 uint64_t l, h;
3589 le64_t le_hash;
cf244689
LP
3590 size_t t;
3591 void *data;
3592 Object *u;
3593
3594 q = le64toh(o->entry.items[i].object_offset);
3595 le_hash = o->entry.items[i].hash;
3596
3597 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
3598 if (r < 0)
3599 return r;
3600
3601 if (le_hash != o->data.hash)
3602 return -EBADMSG;
3603
3604 l = le64toh(o->object.size) - offsetof(Object, data.payload);
3605 t = (size_t) l;
3606
3607 /* We hit the limit on 32bit machines */
3608 if ((uint64_t) t != l)
3609 return -E2BIG;
3610
d89c8fdf 3611 if (o->object.flags & OBJECT_COMPRESSION_MASK) {
349cc4a5 3612#if HAVE_XZ || HAVE_LZ4
a7f7d1bd 3613 size_t rsize = 0;
cf244689 3614
d89c8fdf
ZJS
3615 r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
3616 o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
3617 if (r < 0)
3618 return r;
cf244689
LP
3619
3620 data = from->compress_buffer;
3621 l = rsize;
3b1a55e1
ZJS
3622#else
3623 return -EPROTONOSUPPORT;
3624#endif
cf244689
LP
3625 } else
3626 data = o->data.payload;
3627
3628 r = journal_file_append_data(to, data, l, &u, &h);
3629 if (r < 0)
3630 return r;
3631
3632 xor_hash ^= le64toh(u->data.hash);
3633 items[i].object_offset = htole64(h);
3634 items[i].hash = u->data.hash;
3635
3636 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
3637 if (r < 0)
3638 return r;
3639 }
3640
fa6ac760
LP
3641 r = journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
3642
be7cdd8e 3643 if (mmap_cache_got_sigbus(to->mmap, to->cache_fd))
fa6ac760
LP
3644 return -EIO;
3645
3646 return r;
cf244689 3647}
babfc091 3648
8580d1f7
LP
3649void journal_reset_metrics(JournalMetrics *m) {
3650 assert(m);
3651
3652 /* Set everything to "pick automatic values". */
3653
3654 *m = (JournalMetrics) {
3655 .min_use = (uint64_t) -1,
3656 .max_use = (uint64_t) -1,
3657 .min_size = (uint64_t) -1,
3658 .max_size = (uint64_t) -1,
3659 .keep_free = (uint64_t) -1,
3660 .n_max_files = (uint64_t) -1,
3661 };
3662}
3663
babfc091 3664void journal_default_metrics(JournalMetrics *m, int fd) {
8580d1f7 3665 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
babfc091 3666 struct statvfs ss;
8580d1f7 3667 uint64_t fs_size;
babfc091
LP
3668
3669 assert(m);
3670 assert(fd >= 0);
3671
3672 if (fstatvfs(fd, &ss) >= 0)
3673 fs_size = ss.f_frsize * ss.f_blocks;
8580d1f7 3674 else {
8fc58f1a 3675 log_debug_errno(errno, "Failed to determine disk size: %m");
8580d1f7
LP
3676 fs_size = 0;
3677 }
babfc091
LP
3678
3679 if (m->max_use == (uint64_t) -1) {
3680
3681 if (fs_size > 0) {
3682 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
3683
3684 if (m->max_use > DEFAULT_MAX_USE_UPPER)
3685 m->max_use = DEFAULT_MAX_USE_UPPER;
3686
3687 if (m->max_use < DEFAULT_MAX_USE_LOWER)
3688 m->max_use = DEFAULT_MAX_USE_LOWER;
3689 } else
3690 m->max_use = DEFAULT_MAX_USE_LOWER;
3691 } else {
3692 m->max_use = PAGE_ALIGN(m->max_use);
3693
8580d1f7 3694 if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
babfc091
LP
3695 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
3696 }
3697
8580d1f7
LP
3698 if (m->min_use == (uint64_t) -1)
3699 m->min_use = DEFAULT_MIN_USE;
3700
3701 if (m->min_use > m->max_use)
3702 m->min_use = m->max_use;
3703
babfc091
LP
3704 if (m->max_size == (uint64_t) -1) {
3705 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
3706
3707 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
3708 m->max_size = DEFAULT_MAX_SIZE_UPPER;
3709 } else
3710 m->max_size = PAGE_ALIGN(m->max_size);
3711
8580d1f7
LP
3712 if (m->max_size != 0) {
3713 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
3714 m->max_size = JOURNAL_FILE_SIZE_MIN;
babfc091 3715
8580d1f7
LP
3716 if (m->max_use != 0 && m->max_size*2 > m->max_use)
3717 m->max_use = m->max_size*2;
3718 }
babfc091
LP
3719
3720 if (m->min_size == (uint64_t) -1)
3721 m->min_size = JOURNAL_FILE_SIZE_MIN;
3722 else {
3723 m->min_size = PAGE_ALIGN(m->min_size);
3724
3725 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
3726 m->min_size = JOURNAL_FILE_SIZE_MIN;
3727
8580d1f7 3728 if (m->max_size != 0 && m->min_size > m->max_size)
babfc091
LP
3729 m->max_size = m->min_size;
3730 }
3731
3732 if (m->keep_free == (uint64_t) -1) {
3733
3734 if (fs_size > 0) {
8621b110 3735 m->keep_free = PAGE_ALIGN(fs_size * 3 / 20); /* 15% of file system size */
babfc091
LP
3736
3737 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
3738 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
3739
3740 } else
3741 m->keep_free = DEFAULT_KEEP_FREE;
3742 }
3743
8580d1f7
LP
3744 if (m->n_max_files == (uint64_t) -1)
3745 m->n_max_files = DEFAULT_N_MAX_FILES;
3746
3747 log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
3748 format_bytes(a, sizeof(a), m->min_use),
3749 format_bytes(b, sizeof(b), m->max_use),
3750 format_bytes(c, sizeof(c), m->max_size),
3751 format_bytes(d, sizeof(d), m->min_size),
3752 format_bytes(e, sizeof(e), m->keep_free),
3753 m->n_max_files);
babfc091 3754}
08984293
LP
3755
3756int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {
08984293 3757 assert(f);
c88cc6af 3758 assert(f->header);
08984293
LP
3759 assert(from || to);
3760
3761 if (from) {
162566a4
LP
3762 if (f->header->head_entry_realtime == 0)
3763 return -ENOENT;
08984293 3764
162566a4 3765 *from = le64toh(f->header->head_entry_realtime);
08984293
LP
3766 }
3767
3768 if (to) {
162566a4
LP
3769 if (f->header->tail_entry_realtime == 0)
3770 return -ENOENT;
08984293 3771
162566a4 3772 *to = le64toh(f->header->tail_entry_realtime);
08984293
LP
3773 }
3774
3775 return 1;
3776}
3777
3778int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *from, usec_t *to) {
08984293
LP
3779 Object *o;
3780 uint64_t p;
3781 int r;
3782
3783 assert(f);
3784 assert(from || to);
3785
47838ab3 3786 r = find_data_object_by_boot_id(f, boot_id, &o, &p);
08984293
LP
3787 if (r <= 0)
3788 return r;
3789
3790 if (le64toh(o->data.n_entries) <= 0)
3791 return 0;
3792
3793 if (from) {
3794 r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o);
3795 if (r < 0)
3796 return r;
3797
3798 *from = le64toh(o->entry.monotonic);
3799 }
3800
3801 if (to) {
3802 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
3803 if (r < 0)
3804 return r;
3805
3806 r = generic_array_get_plus_one(f,
3807 le64toh(o->data.entry_offset),
3808 le64toh(o->data.entry_array_offset),
3809 le64toh(o->data.n_entries)-1,
3810 &o, NULL);
3811 if (r <= 0)
3812 return r;
3813
3814 *to = le64toh(o->entry.monotonic);
3815 }
3816
3817 return 1;
3818}
dca6219e 3819
fb0951b0 3820bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec) {
dca6219e 3821 assert(f);
c88cc6af 3822 assert(f->header);
dca6219e
LP
3823
3824 /* If we gained new header fields we gained new features,
3825 * hence suggest a rotation */
361f9cbc
LP
3826 if (le64toh(f->header->header_size) < sizeof(Header)) {
3827 log_debug("%s uses an outdated header, suggesting rotation.", f->path);
dca6219e 3828 return true;
361f9cbc 3829 }
dca6219e
LP
3830
3831 /* Let's check if the hash tables grew over a certain fill
3832 * level (75%, borrowing this value from Java's hash table
3833 * implementation), and if so suggest a rotation. To calculate
3834 * the fill level we need the n_data field, which only exists
3835 * in newer versions. */
3836
3837 if (JOURNAL_HEADER_CONTAINS(f->header, n_data))
361f9cbc 3838 if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3839 log_debug("Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %llu file size, %"PRIu64" bytes per hash table item), suggesting rotation.",
361f9cbc
LP
3840 f->path,
3841 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3842 le64toh(f->header->n_data),
3843 le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
3844 (unsigned long long) f->last_stat.st_size,
3845 f->last_stat.st_size / le64toh(f->header->n_data));
dca6219e 3846 return true;
361f9cbc 3847 }
dca6219e
LP
3848
3849 if (JOURNAL_HEADER_CONTAINS(f->header, n_fields))
361f9cbc 3850 if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) {
507f22bd 3851 log_debug("Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.",
361f9cbc
LP
3852 f->path,
3853 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))),
507f22bd
ZJS
3854 le64toh(f->header->n_fields),
3855 le64toh(f->header->field_hash_table_size) / sizeof(HashItem));
dca6219e 3856 return true;
361f9cbc 3857 }
dca6219e 3858
0598fd4a
LP
3859 /* Are the data objects properly indexed by field objects? */
3860 if (JOURNAL_HEADER_CONTAINS(f->header, n_data) &&
3861 JOURNAL_HEADER_CONTAINS(f->header, n_fields) &&
3862 le64toh(f->header->n_data) > 0 &&
3863 le64toh(f->header->n_fields) == 0)
3864 return true;
3865
fb0951b0
LP
3866 if (max_file_usec > 0) {
3867 usec_t t, h;
3868
3869 h = le64toh(f->header->head_entry_realtime);
3870 t = now(CLOCK_REALTIME);
3871
3872 if (h > 0 && t > h + max_file_usec)
3873 return true;
3874 }
3875
dca6219e
LP
3876 return false;
3877}