]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/journal/journal-file.c
relicense to LGPLv2.1 (with exceptions)
[thirdparty/systemd.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "lookup3.h"
33 #include "compress.h"
34
35 #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*16ULL)
36 #define DEFAULT_FIELD_HASH_TABLE_SIZE (2047ULL*16ULL)
37
38 #define DEFAULT_WINDOW_SIZE (8ULL*1024ULL*1024ULL)
39
40 #define COMPRESSION_SIZE_THRESHOLD (512ULL)
41
42 /* This is the minimum journal file size */
43 #define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL) /* 64 KiB */
44
45 /* These are the lower and upper bounds if we deduce the max_use value
46 * from the file system size */
47 #define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
48 #define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
49
50 /* This is the upper bound if we deduce max_size from max_use */
51 #define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
52
53 /* This is the upper bound if we deduce the keep_free value from the
54 * file system size */
55 #define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
56
57 /* This is the keep_free value when we can't determine the system
58 * size */
59 #define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
60
61 static const char signature[] = { 'L', 'P', 'K', 'S', 'H', 'H', 'R', 'H' };
62
63 #define ALIGN64(x) (((x) + 7ULL) & ~7ULL)
64
65 void journal_file_close(JournalFile *f) {
66 int t;
67
68 assert(f);
69
70 if (f->header && f->writable)
71 f->header->state = STATE_OFFLINE;
72
73
74 for (t = 0; t < _WINDOW_MAX; t++)
75 if (f->windows[t].ptr)
76 munmap(f->windows[t].ptr, f->windows[t].size);
77
78 if (f->fd >= 0)
79 close_nointr_nofail(f->fd);
80
81 free(f->path);
82
83 #ifdef HAVE_XZ
84 free(f->compress_buffer);
85 #endif
86
87 free(f);
88 }
89
90 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
91 Header h;
92 ssize_t k;
93 int r;
94
95 assert(f);
96
97 zero(h);
98 memcpy(h.signature, signature, 8);
99 h.arena_offset = htole64(ALIGN64(sizeof(h)));
100
101 r = sd_id128_randomize(&h.file_id);
102 if (r < 0)
103 return r;
104
105 if (template) {
106 h.seqnum_id = template->header->seqnum_id;
107 h.seqnum = template->header->seqnum;
108 } else
109 h.seqnum_id = h.file_id;
110
111 k = pwrite(f->fd, &h, sizeof(h), 0);
112 if (k < 0)
113 return -errno;
114
115 if (k != sizeof(h))
116 return -EIO;
117
118 return 0;
119 }
120
121 static int journal_file_refresh_header(JournalFile *f) {
122 int r;
123 sd_id128_t boot_id;
124
125 assert(f);
126
127 r = sd_id128_get_machine(&f->header->machine_id);
128 if (r < 0)
129 return r;
130
131 r = sd_id128_get_boot(&boot_id);
132 if (r < 0)
133 return r;
134
135 if (sd_id128_equal(boot_id, f->header->boot_id))
136 f->tail_entry_monotonic_valid = true;
137
138 f->header->boot_id = boot_id;
139
140 f->header->state = STATE_ONLINE;
141
142 __sync_synchronize();
143
144 return 0;
145 }
146
147 static int journal_file_verify_header(JournalFile *f) {
148 assert(f);
149
150 if (memcmp(f->header, signature, 8))
151 return -EBADMSG;
152
153 #ifdef HAVE_XZ
154 if ((le64toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
155 return -EPROTONOSUPPORT;
156 #else
157 if (f->header->incompatible_flags != 0)
158 return -EPROTONOSUPPORT;
159 #endif
160
161 if ((uint64_t) f->last_stat.st_size < (le64toh(f->header->arena_offset) + le64toh(f->header->arena_size)))
162 return -ENODATA;
163
164 if (f->writable) {
165 uint8_t state;
166 sd_id128_t machine_id;
167 int r;
168
169 r = sd_id128_get_machine(&machine_id);
170 if (r < 0)
171 return r;
172
173 if (!sd_id128_equal(machine_id, f->header->machine_id))
174 return -EHOSTDOWN;
175
176 state = f->header->state;
177
178 if (state == STATE_ONLINE)
179 log_debug("Journal file %s is already online. Assuming unclean closing. Ignoring.", f->path);
180 else if (state == STATE_ARCHIVED)
181 return -ESHUTDOWN;
182 else if (state != STATE_OFFLINE)
183 log_debug("Journal file %s has unknown state %u. Ignoring.", f->path, state);
184 }
185
186 return 0;
187 }
188
189 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
190 uint64_t old_size, new_size;
191
192 assert(f);
193
194 /* We assume that this file is not sparse, and we know that
195 * for sure, since we always call posix_fallocate()
196 * ourselves */
197
198 old_size =
199 le64toh(f->header->arena_offset) +
200 le64toh(f->header->arena_size);
201
202 new_size = PAGE_ALIGN(offset + size);
203 if (new_size < le64toh(f->header->arena_offset))
204 new_size = le64toh(f->header->arena_offset);
205
206 if (new_size <= old_size)
207 return 0;
208
209 if (f->metrics.max_size > 0 &&
210 new_size > f->metrics.max_size)
211 return -E2BIG;
212
213 if (new_size > f->metrics.min_size &&
214 f->metrics.keep_free > 0) {
215 struct statvfs svfs;
216
217 if (fstatvfs(f->fd, &svfs) >= 0) {
218 uint64_t available;
219
220 available = svfs.f_bfree * svfs.f_bsize;
221
222 if (available >= f->metrics.keep_free)
223 available -= f->metrics.keep_free;
224 else
225 available = 0;
226
227 if (new_size - old_size > available)
228 return -E2BIG;
229 }
230 }
231
232 /* Note that the glibc fallocate() fallback is very
233 inefficient, hence we try to minimize the allocation area
234 as we can. */
235 if (posix_fallocate(f->fd, old_size, new_size - old_size) < 0)
236 return -errno;
237
238 if (fstat(f->fd, &f->last_stat) < 0)
239 return -errno;
240
241 f->header->arena_size = htole64(new_size - le64toh(f->header->arena_offset));
242
243 return 0;
244 }
245
246 static int journal_file_map(
247 JournalFile *f,
248 uint64_t offset,
249 uint64_t size,
250 void **_window,
251 uint64_t *_woffset,
252 uint64_t *_wsize,
253 void **ret) {
254
255 uint64_t woffset, wsize;
256 void *window;
257
258 assert(f);
259 assert(size > 0);
260 assert(ret);
261
262 woffset = offset & ~((uint64_t) page_size() - 1ULL);
263 wsize = size + (offset - woffset);
264 wsize = PAGE_ALIGN(wsize);
265
266 /* Avoid SIGBUS on invalid accesses */
267 if (woffset + wsize > (uint64_t) PAGE_ALIGN(f->last_stat.st_size))
268 return -EADDRNOTAVAIL;
269
270 window = mmap(NULL, wsize, f->prot, MAP_SHARED, f->fd, woffset);
271 if (window == MAP_FAILED)
272 return -errno;
273
274 if (_window)
275 *_window = window;
276
277 if (_woffset)
278 *_woffset = woffset;
279
280 if (_wsize)
281 *_wsize = wsize;
282
283 *ret = (uint8_t*) window + (offset - woffset);
284
285 return 0;
286 }
287
288 static int journal_file_move_to(JournalFile *f, int wt, uint64_t offset, uint64_t size, void **ret) {
289 void *p = NULL;
290 uint64_t delta;
291 int r;
292 Window *w;
293
294 assert(f);
295 assert(ret);
296 assert(wt >= 0);
297 assert(wt < _WINDOW_MAX);
298
299 if (offset + size > (uint64_t) f->last_stat.st_size) {
300 /* Hmm, out of range? Let's refresh the fstat() data
301 * first, before we trust that check. */
302
303 if (fstat(f->fd, &f->last_stat) < 0 ||
304 offset + size > (uint64_t) f->last_stat.st_size)
305 return -EADDRNOTAVAIL;
306 }
307
308 w = f->windows + wt;
309
310 if (_likely_(w->ptr &&
311 w->offset <= offset &&
312 w->offset + w->size >= offset + size)) {
313
314 *ret = (uint8_t*) w->ptr + (offset - w->offset);
315 return 0;
316 }
317
318 if (w->ptr) {
319 if (munmap(w->ptr, w->size) < 0)
320 return -errno;
321
322 w->ptr = NULL;
323 w->size = w->offset = 0;
324 }
325
326 if (size < DEFAULT_WINDOW_SIZE) {
327 /* If the default window size is larger then what was
328 * asked for extend the mapping a bit in the hope to
329 * minimize needed remappings later on. We add half
330 * the window space before and half behind the
331 * requested mapping */
332
333 delta = (DEFAULT_WINDOW_SIZE - size) / 2;
334
335 if (delta > offset)
336 delta = offset;
337
338 offset -= delta;
339 size = DEFAULT_WINDOW_SIZE;
340 } else
341 delta = 0;
342
343 if (offset + size > (uint64_t) f->last_stat.st_size)
344 size = (uint64_t) f->last_stat.st_size - offset;
345
346 if (size <= 0)
347 return -EADDRNOTAVAIL;
348
349 r = journal_file_map(f,
350 offset, size,
351 &w->ptr, &w->offset, &w->size,
352 &p);
353
354 if (r < 0)
355 return r;
356
357 *ret = (uint8_t*) p + delta;
358 return 0;
359 }
360
361 static bool verify_hash(Object *o) {
362 uint64_t h1, h2;
363
364 assert(o);
365
366 if (o->object.type == OBJECT_DATA && !(o->object.flags & OBJECT_COMPRESSED)) {
367 h1 = le64toh(o->data.hash);
368 h2 = hash64(o->data.payload, le64toh(o->object.size) - offsetof(Object, data.payload));
369 } else if (o->object.type == OBJECT_FIELD) {
370 h1 = le64toh(o->field.hash);
371 h2 = hash64(o->field.payload, le64toh(o->object.size) - offsetof(Object, field.payload));
372 } else
373 return true;
374
375 return h1 == h2;
376 }
377
378 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
379 int r;
380 void *t;
381 Object *o;
382 uint64_t s;
383
384 assert(f);
385 assert(ret);
386 assert(type < _OBJECT_TYPE_MAX);
387
388 r = journal_file_move_to(f, type >= 0 ? type : WINDOW_UNKNOWN, offset, sizeof(ObjectHeader), &t);
389 if (r < 0)
390 return r;
391
392 o = (Object*) t;
393 s = le64toh(o->object.size);
394
395 if (s < sizeof(ObjectHeader))
396 return -EBADMSG;
397
398 if (type >= 0 && o->object.type != type)
399 return -EBADMSG;
400
401 if (s > sizeof(ObjectHeader)) {
402 r = journal_file_move_to(f, o->object.type, offset, s, &t);
403 if (r < 0)
404 return r;
405
406 o = (Object*) t;
407 }
408
409 if (!verify_hash(o))
410 return -EBADMSG;
411
412 *ret = o;
413 return 0;
414 }
415
416 static uint64_t journal_file_seqnum(JournalFile *f, uint64_t *seqnum) {
417 uint64_t r;
418
419 assert(f);
420
421 r = le64toh(f->header->seqnum) + 1;
422
423 if (seqnum) {
424 /* If an external seqnum counter was passed, we update
425 * both the local and the external one, and set it to
426 * the maximum of both */
427
428 if (*seqnum + 1 > r)
429 r = *seqnum + 1;
430
431 *seqnum = r;
432 }
433
434 f->header->seqnum = htole64(r);
435
436 if (f->header->first_seqnum == 0)
437 f->header->first_seqnum = htole64(r);
438
439 return r;
440 }
441
442 static int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
443 int r;
444 uint64_t p;
445 Object *tail, *o;
446 void *t;
447
448 assert(f);
449 assert(size >= sizeof(ObjectHeader));
450 assert(offset);
451 assert(ret);
452
453 p = le64toh(f->header->tail_object_offset);
454 if (p == 0)
455 p = le64toh(f->header->arena_offset);
456 else {
457 r = journal_file_move_to_object(f, -1, p, &tail);
458 if (r < 0)
459 return r;
460
461 p += ALIGN64(le64toh(tail->object.size));
462 }
463
464 r = journal_file_allocate(f, p, size);
465 if (r < 0)
466 return r;
467
468 r = journal_file_move_to(f, type, p, size, &t);
469 if (r < 0)
470 return r;
471
472 o = (Object*) t;
473
474 zero(o->object);
475 o->object.type = type;
476 o->object.size = htole64(size);
477
478 f->header->tail_object_offset = htole64(p);
479 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
480
481 *ret = o;
482 *offset = p;
483
484 return 0;
485 }
486
487 static int journal_file_setup_data_hash_table(JournalFile *f) {
488 uint64_t s, p;
489 Object *o;
490 int r;
491
492 assert(f);
493
494 s = DEFAULT_DATA_HASH_TABLE_SIZE;
495 r = journal_file_append_object(f,
496 OBJECT_DATA_HASH_TABLE,
497 offsetof(Object, hash_table.items) + s,
498 &o, &p);
499 if (r < 0)
500 return r;
501
502 memset(o->hash_table.items, 0, s);
503
504 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
505 f->header->data_hash_table_size = htole64(s);
506
507 return 0;
508 }
509
510 static int journal_file_setup_field_hash_table(JournalFile *f) {
511 uint64_t s, p;
512 Object *o;
513 int r;
514
515 assert(f);
516
517 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
518 r = journal_file_append_object(f,
519 OBJECT_FIELD_HASH_TABLE,
520 offsetof(Object, hash_table.items) + s,
521 &o, &p);
522 if (r < 0)
523 return r;
524
525 memset(o->hash_table.items, 0, s);
526
527 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
528 f->header->field_hash_table_size = htole64(s);
529
530 return 0;
531 }
532
533 static int journal_file_map_data_hash_table(JournalFile *f) {
534 uint64_t s, p;
535 void *t;
536 int r;
537
538 assert(f);
539
540 p = le64toh(f->header->data_hash_table_offset);
541 s = le64toh(f->header->data_hash_table_size);
542
543 r = journal_file_move_to(f,
544 WINDOW_DATA_HASH_TABLE,
545 p, s,
546 &t);
547 if (r < 0)
548 return r;
549
550 f->data_hash_table = t;
551 return 0;
552 }
553
554 static int journal_file_map_field_hash_table(JournalFile *f) {
555 uint64_t s, p;
556 void *t;
557 int r;
558
559 assert(f);
560
561 p = le64toh(f->header->field_hash_table_offset);
562 s = le64toh(f->header->field_hash_table_size);
563
564 r = journal_file_move_to(f,
565 WINDOW_FIELD_HASH_TABLE,
566 p, s,
567 &t);
568 if (r < 0)
569 return r;
570
571 f->field_hash_table = t;
572 return 0;
573 }
574
575 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
576 uint64_t p, h;
577 int r;
578
579 assert(f);
580 assert(o);
581 assert(offset > 0);
582 assert(o->object.type == OBJECT_DATA);
583
584 /* This might alter the window we are looking at */
585
586 o->data.next_hash_offset = o->data.next_field_offset = 0;
587 o->data.entry_offset = o->data.entry_array_offset = 0;
588 o->data.n_entries = 0;
589
590 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
591 p = le64toh(f->data_hash_table[h].head_hash_offset);
592 if (p == 0) {
593 /* Only entry in the hash table is easy */
594 f->data_hash_table[h].head_hash_offset = htole64(offset);
595 } else {
596 /* Move back to the previous data object, to patch in
597 * pointer */
598
599 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
600 if (r < 0)
601 return r;
602
603 o->data.next_hash_offset = htole64(offset);
604 }
605
606 f->data_hash_table[h].tail_hash_offset = htole64(offset);
607
608 return 0;
609 }
610
611 int journal_file_find_data_object_with_hash(
612 JournalFile *f,
613 const void *data, uint64_t size, uint64_t hash,
614 Object **ret, uint64_t *offset) {
615
616 uint64_t p, osize, h;
617 int r;
618
619 assert(f);
620 assert(data || size == 0);
621
622 osize = offsetof(Object, data.payload) + size;
623
624 if (f->header->data_hash_table_size == 0)
625 return -EBADMSG;
626
627 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
628 p = le64toh(f->data_hash_table[h].head_hash_offset);
629
630 while (p > 0) {
631 Object *o;
632
633 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
634 if (r < 0)
635 return r;
636
637 if (le64toh(o->data.hash) != hash)
638 goto next;
639
640 if (o->object.flags & OBJECT_COMPRESSED) {
641 #ifdef HAVE_XZ
642 uint64_t l, rsize;
643
644 l = le64toh(o->object.size);
645 if (l <= offsetof(Object, data.payload))
646 return -EBADMSG;
647
648 l -= offsetof(Object, data.payload);
649
650 if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
651 return -EBADMSG;
652
653 if (rsize == size &&
654 memcmp(f->compress_buffer, data, size) == 0) {
655
656 if (ret)
657 *ret = o;
658
659 if (offset)
660 *offset = p;
661
662 return 1;
663 }
664 #else
665 return -EPROTONOSUPPORT;
666 #endif
667
668 } else if (le64toh(o->object.size) == osize &&
669 memcmp(o->data.payload, data, size) == 0) {
670
671 if (ret)
672 *ret = o;
673
674 if (offset)
675 *offset = p;
676
677 return 1;
678 }
679
680 next:
681 p = le64toh(o->data.next_hash_offset);
682 }
683
684 return 0;
685 }
686
687 int journal_file_find_data_object(
688 JournalFile *f,
689 const void *data, uint64_t size,
690 Object **ret, uint64_t *offset) {
691
692 uint64_t hash;
693
694 assert(f);
695 assert(data || size == 0);
696
697 hash = hash64(data, size);
698
699 return journal_file_find_data_object_with_hash(f,
700 data, size, hash,
701 ret, offset);
702 }
703
704 static int journal_file_append_data(
705 JournalFile *f,
706 const void *data, uint64_t size,
707 Object **ret, uint64_t *offset) {
708
709 uint64_t hash, p;
710 uint64_t osize;
711 Object *o;
712 int r;
713 bool compressed = false;
714
715 assert(f);
716 assert(data || size == 0);
717
718 hash = hash64(data, size);
719
720 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
721 if (r < 0)
722 return r;
723 else if (r > 0) {
724
725 if (ret)
726 *ret = o;
727
728 if (offset)
729 *offset = p;
730
731 return 0;
732 }
733
734 osize = offsetof(Object, data.payload) + size;
735 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
736 if (r < 0)
737 return r;
738
739 o->data.hash = htole64(hash);
740
741 #ifdef HAVE_XZ
742 if (f->compress &&
743 size >= COMPRESSION_SIZE_THRESHOLD) {
744 uint64_t rsize;
745
746 compressed = compress_blob(data, size, o->data.payload, &rsize);
747
748 if (compressed) {
749 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
750 o->object.flags |= OBJECT_COMPRESSED;
751
752 f->header->incompatible_flags = htole32(le32toh(f->header->incompatible_flags) | HEADER_INCOMPATIBLE_COMPRESSED);
753
754 log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
755 }
756 }
757 #endif
758
759 if (!compressed)
760 memcpy(o->data.payload, data, size);
761
762 r = journal_file_link_data(f, o, p, hash);
763 if (r < 0)
764 return r;
765
766 /* The linking might have altered the window, so let's
767 * refresh our pointer */
768 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
769 if (r < 0)
770 return r;
771
772 if (ret)
773 *ret = o;
774
775 if (offset)
776 *offset = p;
777
778 return 0;
779 }
780
781 uint64_t journal_file_entry_n_items(Object *o) {
782 assert(o);
783 assert(o->object.type == OBJECT_ENTRY);
784
785 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
786 }
787
788 static uint64_t journal_file_entry_array_n_items(Object *o) {
789 assert(o);
790 assert(o->object.type == OBJECT_ENTRY_ARRAY);
791
792 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
793 }
794
795 static int link_entry_into_array(JournalFile *f,
796 le64_t *first,
797 le64_t *idx,
798 uint64_t p) {
799 int r;
800 uint64_t n = 0, ap = 0, q, i, a, hidx;
801 Object *o;
802
803 assert(f);
804 assert(first);
805 assert(idx);
806 assert(p > 0);
807
808 a = le64toh(*first);
809 i = hidx = le64toh(*idx);
810 while (a > 0) {
811
812 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
813 if (r < 0)
814 return r;
815
816 n = journal_file_entry_array_n_items(o);
817 if (i < n) {
818 o->entry_array.items[i] = htole64(p);
819 *idx = htole64(hidx + 1);
820 return 0;
821 }
822
823 i -= n;
824 ap = a;
825 a = le64toh(o->entry_array.next_entry_array_offset);
826 }
827
828 if (hidx > n)
829 n = (hidx+1) * 2;
830 else
831 n = n * 2;
832
833 if (n < 4)
834 n = 4;
835
836 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
837 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
838 &o, &q);
839 if (r < 0)
840 return r;
841
842 o->entry_array.items[i] = htole64(p);
843
844 if (ap == 0)
845 *first = htole64(q);
846 else {
847 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
848 if (r < 0)
849 return r;
850
851 o->entry_array.next_entry_array_offset = htole64(q);
852 }
853
854 *idx = htole64(hidx + 1);
855
856 return 0;
857 }
858
859 static int link_entry_into_array_plus_one(JournalFile *f,
860 le64_t *extra,
861 le64_t *first,
862 le64_t *idx,
863 uint64_t p) {
864
865 int r;
866
867 assert(f);
868 assert(extra);
869 assert(first);
870 assert(idx);
871 assert(p > 0);
872
873 if (*idx == 0)
874 *extra = htole64(p);
875 else {
876 le64_t i;
877
878 i = htole64(le64toh(*idx) - 1);
879 r = link_entry_into_array(f, first, &i, p);
880 if (r < 0)
881 return r;
882 }
883
884 *idx = htole64(le64toh(*idx) + 1);
885 return 0;
886 }
887
888 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
889 uint64_t p;
890 int r;
891 assert(f);
892 assert(o);
893 assert(offset > 0);
894
895 p = le64toh(o->entry.items[i].object_offset);
896 if (p == 0)
897 return -EINVAL;
898
899 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
900 if (r < 0)
901 return r;
902
903 return link_entry_into_array_plus_one(f,
904 &o->data.entry_offset,
905 &o->data.entry_array_offset,
906 &o->data.n_entries,
907 offset);
908 }
909
910 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
911 uint64_t n, i;
912 int r;
913
914 assert(f);
915 assert(o);
916 assert(offset > 0);
917 assert(o->object.type == OBJECT_ENTRY);
918
919 __sync_synchronize();
920
921 /* Link up the entry itself */
922 r = link_entry_into_array(f,
923 &f->header->entry_array_offset,
924 &f->header->n_entries,
925 offset);
926 if (r < 0)
927 return r;
928
929 /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
930
931 if (f->header->head_entry_realtime == 0)
932 f->header->head_entry_realtime = o->entry.realtime;
933
934 f->header->tail_entry_realtime = o->entry.realtime;
935 f->header->tail_entry_monotonic = o->entry.monotonic;
936
937 f->tail_entry_monotonic_valid = true;
938
939 /* Link up the items */
940 n = journal_file_entry_n_items(o);
941 for (i = 0; i < n; i++) {
942 r = journal_file_link_entry_item(f, o, offset, i);
943 if (r < 0)
944 return r;
945 }
946
947 return 0;
948 }
949
950 static int journal_file_append_entry_internal(
951 JournalFile *f,
952 const dual_timestamp *ts,
953 uint64_t xor_hash,
954 const EntryItem items[], unsigned n_items,
955 uint64_t *seqnum,
956 Object **ret, uint64_t *offset) {
957 uint64_t np;
958 uint64_t osize;
959 Object *o;
960 int r;
961
962 assert(f);
963 assert(items || n_items == 0);
964 assert(ts);
965
966 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
967
968 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
969 if (r < 0)
970 return r;
971
972 o->entry.seqnum = htole64(journal_file_seqnum(f, seqnum));
973 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
974 o->entry.realtime = htole64(ts->realtime);
975 o->entry.monotonic = htole64(ts->monotonic);
976 o->entry.xor_hash = htole64(xor_hash);
977 o->entry.boot_id = f->header->boot_id;
978
979 r = journal_file_link_entry(f, o, np);
980 if (r < 0)
981 return r;
982
983 if (ret)
984 *ret = o;
985
986 if (offset)
987 *offset = np;
988
989 return 0;
990 }
991
992 void journal_file_post_change(JournalFile *f) {
993 assert(f);
994
995 /* inotify() does not receive IN_MODIFY events from file
996 * accesses done via mmap(). After each access we hence
997 * trigger IN_MODIFY by truncating the journal file to its
998 * current size which triggers IN_MODIFY. */
999
1000 __sync_synchronize();
1001
1002 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1003 log_error("Failed to to truncate file to its own size: %m");
1004 }
1005
1006 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1007 unsigned i;
1008 EntryItem *items;
1009 int r;
1010 uint64_t xor_hash = 0;
1011 struct dual_timestamp _ts;
1012
1013 assert(f);
1014 assert(iovec || n_iovec == 0);
1015
1016 if (!f->writable)
1017 return -EPERM;
1018
1019 if (!ts) {
1020 dual_timestamp_get(&_ts);
1021 ts = &_ts;
1022 }
1023
1024 if (f->tail_entry_monotonic_valid &&
1025 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1026 return -EINVAL;
1027
1028 items = alloca(sizeof(EntryItem) * n_iovec);
1029
1030 for (i = 0; i < n_iovec; i++) {
1031 uint64_t p;
1032 Object *o;
1033
1034 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1035 if (r < 0)
1036 return r;
1037
1038 xor_hash ^= le64toh(o->data.hash);
1039 items[i].object_offset = htole64(p);
1040 items[i].hash = o->data.hash;
1041 }
1042
1043 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1044
1045 journal_file_post_change(f);
1046
1047 return r;
1048 }
1049
1050 static int generic_array_get(JournalFile *f,
1051 uint64_t first,
1052 uint64_t i,
1053 Object **ret, uint64_t *offset) {
1054
1055 Object *o;
1056 uint64_t p = 0, a;
1057 int r;
1058
1059 assert(f);
1060
1061 a = first;
1062 while (a > 0) {
1063 uint64_t n;
1064
1065 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1066 if (r < 0)
1067 return r;
1068
1069 n = journal_file_entry_array_n_items(o);
1070 if (i < n) {
1071 p = le64toh(o->entry_array.items[i]);
1072 break;
1073 }
1074
1075 i -= n;
1076 a = le64toh(o->entry_array.next_entry_array_offset);
1077 }
1078
1079 if (a <= 0 || p <= 0)
1080 return 0;
1081
1082 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1083 if (r < 0)
1084 return r;
1085
1086 if (ret)
1087 *ret = o;
1088
1089 if (offset)
1090 *offset = p;
1091
1092 return 1;
1093 }
1094
1095 static int generic_array_get_plus_one(JournalFile *f,
1096 uint64_t extra,
1097 uint64_t first,
1098 uint64_t i,
1099 Object **ret, uint64_t *offset) {
1100
1101 Object *o;
1102
1103 assert(f);
1104
1105 if (i == 0) {
1106 int r;
1107
1108 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1109 if (r < 0)
1110 return r;
1111
1112 if (ret)
1113 *ret = o;
1114
1115 if (offset)
1116 *offset = extra;
1117
1118 return 1;
1119 }
1120
1121 return generic_array_get(f, first, i-1, ret, offset);
1122 }
1123
1124 enum {
1125 TEST_FOUND,
1126 TEST_LEFT,
1127 TEST_RIGHT
1128 };
1129
1130 static int generic_array_bisect(JournalFile *f,
1131 uint64_t first,
1132 uint64_t n,
1133 uint64_t needle,
1134 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1135 direction_t direction,
1136 Object **ret,
1137 uint64_t *offset,
1138 uint64_t *idx) {
1139
1140 uint64_t a, p, t = 0, i = 0, last_p = 0;
1141 bool subtract_one = false;
1142 Object *o, *array = NULL;
1143 int r;
1144
1145 assert(f);
1146 assert(test_object);
1147
1148 a = first;
1149 while (a > 0) {
1150 uint64_t left, right, k, lp;
1151
1152 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1153 if (r < 0)
1154 return r;
1155
1156 k = journal_file_entry_array_n_items(array);
1157 right = MIN(k, n);
1158 if (right <= 0)
1159 return 0;
1160
1161 i = right - 1;
1162 lp = p = le64toh(array->entry_array.items[i]);
1163 if (p <= 0)
1164 return -EBADMSG;
1165
1166 r = test_object(f, p, needle);
1167 if (r < 0)
1168 return r;
1169
1170 if (r == TEST_FOUND)
1171 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1172
1173 if (r == TEST_RIGHT) {
1174 left = 0;
1175 right -= 1;
1176 for (;;) {
1177 if (left == right) {
1178 if (direction == DIRECTION_UP)
1179 subtract_one = true;
1180
1181 i = left;
1182 goto found;
1183 }
1184
1185 assert(left < right);
1186
1187 i = (left + right) / 2;
1188 p = le64toh(array->entry_array.items[i]);
1189 if (p <= 0)
1190 return -EBADMSG;
1191
1192 r = test_object(f, p, needle);
1193 if (r < 0)
1194 return r;
1195
1196 if (r == TEST_FOUND)
1197 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1198
1199 if (r == TEST_RIGHT)
1200 right = i;
1201 else
1202 left = i + 1;
1203 }
1204 }
1205
1206 if (k > n)
1207 return 0;
1208
1209 last_p = lp;
1210
1211 n -= k;
1212 t += k;
1213 a = le64toh(array->entry_array.next_entry_array_offset);
1214 }
1215
1216 return 0;
1217
1218 found:
1219 if (subtract_one && t == 0 && i == 0)
1220 return 0;
1221
1222 if (subtract_one && i == 0)
1223 p = last_p;
1224 else if (subtract_one)
1225 p = le64toh(array->entry_array.items[i-1]);
1226 else
1227 p = le64toh(array->entry_array.items[i]);
1228
1229 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1230 if (r < 0)
1231 return r;
1232
1233 if (ret)
1234 *ret = o;
1235
1236 if (offset)
1237 *offset = p;
1238
1239 if (idx)
1240 *idx = t + i - (subtract_one ? 1 : 0);
1241
1242 return 1;
1243 }
1244
1245 static int generic_array_bisect_plus_one(JournalFile *f,
1246 uint64_t extra,
1247 uint64_t first,
1248 uint64_t n,
1249 uint64_t needle,
1250 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1251 direction_t direction,
1252 Object **ret,
1253 uint64_t *offset,
1254 uint64_t *idx) {
1255
1256 int r;
1257
1258 assert(f);
1259 assert(test_object);
1260
1261 if (n <= 0)
1262 return 0;
1263
1264 /* This bisects the array in object 'first', but first checks
1265 * an extra */
1266 r = test_object(f, extra, needle);
1267 if (r < 0)
1268 return r;
1269 else if (r == TEST_FOUND) {
1270 Object *o;
1271
1272 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1273 if (r < 0)
1274 return r;
1275
1276 if (ret)
1277 *ret = o;
1278
1279 if (offset)
1280 *offset = extra;
1281
1282 if (idx)
1283 *idx = 0;
1284
1285 return 1;
1286 } else if (r == TEST_RIGHT)
1287 return 0;
1288
1289 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1290
1291 if (r > 0)
1292 (*idx) ++;
1293
1294 return r;
1295 }
1296
1297 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1298 Object *o;
1299 int r;
1300
1301 assert(f);
1302 assert(p > 0);
1303
1304 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1305 if (r < 0)
1306 return r;
1307
1308 if (le64toh(o->entry.seqnum) == needle)
1309 return TEST_FOUND;
1310 else if (le64toh(o->entry.seqnum) < needle)
1311 return TEST_LEFT;
1312 else
1313 return TEST_RIGHT;
1314 }
1315
1316 int journal_file_move_to_entry_by_seqnum(
1317 JournalFile *f,
1318 uint64_t seqnum,
1319 direction_t direction,
1320 Object **ret,
1321 uint64_t *offset) {
1322
1323 return generic_array_bisect(f,
1324 le64toh(f->header->entry_array_offset),
1325 le64toh(f->header->n_entries),
1326 seqnum,
1327 test_object_seqnum,
1328 direction,
1329 ret, offset, NULL);
1330 }
1331
1332 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1333 Object *o;
1334 int r;
1335
1336 assert(f);
1337 assert(p > 0);
1338
1339 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1340 if (r < 0)
1341 return r;
1342
1343 if (le64toh(o->entry.realtime) == needle)
1344 return TEST_FOUND;
1345 else if (le64toh(o->entry.realtime) < needle)
1346 return TEST_LEFT;
1347 else
1348 return TEST_RIGHT;
1349 }
1350
1351 int journal_file_move_to_entry_by_realtime(
1352 JournalFile *f,
1353 uint64_t realtime,
1354 direction_t direction,
1355 Object **ret,
1356 uint64_t *offset) {
1357
1358 return generic_array_bisect(f,
1359 le64toh(f->header->entry_array_offset),
1360 le64toh(f->header->n_entries),
1361 realtime,
1362 test_object_realtime,
1363 direction,
1364 ret, offset, NULL);
1365 }
1366
1367 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1368 Object *o;
1369 int r;
1370
1371 assert(f);
1372 assert(p > 0);
1373
1374 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1375 if (r < 0)
1376 return r;
1377
1378 if (le64toh(o->entry.monotonic) == needle)
1379 return TEST_FOUND;
1380 else if (le64toh(o->entry.monotonic) < needle)
1381 return TEST_LEFT;
1382 else
1383 return TEST_RIGHT;
1384 }
1385
1386 int journal_file_move_to_entry_by_monotonic(
1387 JournalFile *f,
1388 sd_id128_t boot_id,
1389 uint64_t monotonic,
1390 direction_t direction,
1391 Object **ret,
1392 uint64_t *offset) {
1393
1394 char t[8+32+1] = "_BOOT_ID=";
1395 Object *o;
1396 int r;
1397
1398 sd_id128_to_string(boot_id, t + 8);
1399
1400 r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1401 if (r < 0)
1402 return r;
1403 else if (r == 0)
1404 return -ENOENT;
1405
1406 return generic_array_bisect_plus_one(f,
1407 le64toh(o->data.entry_offset),
1408 le64toh(o->data.entry_array_offset),
1409 le64toh(o->data.n_entries),
1410 monotonic,
1411 test_object_monotonic,
1412 direction,
1413 ret, offset, NULL);
1414 }
1415
1416 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1417 assert(f);
1418 assert(p > 0);
1419
1420 if (p == needle)
1421 return TEST_FOUND;
1422 else if (p < needle)
1423 return TEST_LEFT;
1424 else
1425 return TEST_RIGHT;
1426 }
1427
1428 int journal_file_next_entry(
1429 JournalFile *f,
1430 Object *o, uint64_t p,
1431 direction_t direction,
1432 Object **ret, uint64_t *offset) {
1433
1434 uint64_t i, n;
1435 int r;
1436
1437 assert(f);
1438 assert(p > 0 || !o);
1439
1440 n = le64toh(f->header->n_entries);
1441 if (n <= 0)
1442 return 0;
1443
1444 if (!o)
1445 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1446 else {
1447 if (o->object.type != OBJECT_ENTRY)
1448 return -EINVAL;
1449
1450 r = generic_array_bisect(f,
1451 le64toh(f->header->entry_array_offset),
1452 le64toh(f->header->n_entries),
1453 p,
1454 test_object_offset,
1455 DIRECTION_DOWN,
1456 NULL, NULL,
1457 &i);
1458 if (r <= 0)
1459 return r;
1460
1461 if (direction == DIRECTION_DOWN) {
1462 if (i >= n - 1)
1463 return 0;
1464
1465 i++;
1466 } else {
1467 if (i <= 0)
1468 return 0;
1469
1470 i--;
1471 }
1472 }
1473
1474 /* And jump to it */
1475 return generic_array_get(f,
1476 le64toh(f->header->entry_array_offset),
1477 i,
1478 ret, offset);
1479 }
1480
1481 int journal_file_skip_entry(
1482 JournalFile *f,
1483 Object *o, uint64_t p,
1484 int64_t skip,
1485 Object **ret, uint64_t *offset) {
1486
1487 uint64_t i, n;
1488 int r;
1489
1490 assert(f);
1491 assert(o);
1492 assert(p > 0);
1493
1494 if (o->object.type != OBJECT_ENTRY)
1495 return -EINVAL;
1496
1497 r = generic_array_bisect(f,
1498 le64toh(f->header->entry_array_offset),
1499 le64toh(f->header->n_entries),
1500 p,
1501 test_object_offset,
1502 DIRECTION_DOWN,
1503 NULL, NULL,
1504 &i);
1505 if (r <= 0)
1506 return r;
1507
1508 /* Calculate new index */
1509 if (skip < 0) {
1510 if ((uint64_t) -skip >= i)
1511 i = 0;
1512 else
1513 i = i - (uint64_t) -skip;
1514 } else
1515 i += (uint64_t) skip;
1516
1517 n = le64toh(f->header->n_entries);
1518 if (n <= 0)
1519 return -EBADMSG;
1520
1521 if (i >= n)
1522 i = n-1;
1523
1524 return generic_array_get(f,
1525 le64toh(f->header->entry_array_offset),
1526 i,
1527 ret, offset);
1528 }
1529
1530 int journal_file_next_entry_for_data(
1531 JournalFile *f,
1532 Object *o, uint64_t p,
1533 uint64_t data_offset,
1534 direction_t direction,
1535 Object **ret, uint64_t *offset) {
1536
1537 uint64_t n, i;
1538 int r;
1539 Object *d;
1540
1541 assert(f);
1542 assert(p > 0 || !o);
1543
1544 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1545 if (r < 0)
1546 return r;
1547
1548 n = le64toh(d->data.n_entries);
1549 if (n <= 0)
1550 return n;
1551
1552 if (!o)
1553 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1554 else {
1555 if (o->object.type != OBJECT_ENTRY)
1556 return -EINVAL;
1557
1558 r = generic_array_bisect_plus_one(f,
1559 le64toh(d->data.entry_offset),
1560 le64toh(d->data.entry_array_offset),
1561 le64toh(d->data.n_entries),
1562 p,
1563 test_object_offset,
1564 DIRECTION_DOWN,
1565 NULL, NULL,
1566 &i);
1567
1568 if (r <= 0)
1569 return r;
1570
1571 if (direction == DIRECTION_DOWN) {
1572 if (i >= n - 1)
1573 return 0;
1574
1575 i++;
1576 } else {
1577 if (i <= 0)
1578 return 0;
1579
1580 i--;
1581 }
1582
1583 }
1584
1585 return generic_array_get_plus_one(f,
1586 le64toh(d->data.entry_offset),
1587 le64toh(d->data.entry_array_offset),
1588 i,
1589 ret, offset);
1590 }
1591
1592 int journal_file_move_to_entry_by_seqnum_for_data(
1593 JournalFile *f,
1594 uint64_t data_offset,
1595 uint64_t seqnum,
1596 direction_t direction,
1597 Object **ret, uint64_t *offset) {
1598
1599 Object *d;
1600 int r;
1601
1602 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1603 if (r <= 0)
1604 return r;
1605
1606 return generic_array_bisect_plus_one(f,
1607 le64toh(d->data.entry_offset),
1608 le64toh(d->data.entry_array_offset),
1609 le64toh(d->data.n_entries),
1610 seqnum,
1611 test_object_seqnum,
1612 direction,
1613 ret, offset, NULL);
1614 }
1615
1616 int journal_file_move_to_entry_by_realtime_for_data(
1617 JournalFile *f,
1618 uint64_t data_offset,
1619 uint64_t realtime,
1620 direction_t direction,
1621 Object **ret, uint64_t *offset) {
1622
1623 Object *d;
1624 int r;
1625
1626 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1627 if (r <= 0)
1628 return r;
1629
1630 return generic_array_bisect_plus_one(f,
1631 le64toh(d->data.entry_offset),
1632 le64toh(d->data.entry_array_offset),
1633 le64toh(d->data.n_entries),
1634 realtime,
1635 test_object_realtime,
1636 direction,
1637 ret, offset, NULL);
1638 }
1639
1640 void journal_file_dump(JournalFile *f) {
1641 char a[33], b[33], c[33];
1642 Object *o;
1643 int r;
1644 uint64_t p;
1645
1646 assert(f);
1647
1648 printf("File Path: %s\n"
1649 "File ID: %s\n"
1650 "Machine ID: %s\n"
1651 "Boot ID: %s\n"
1652 "Arena size: %llu\n"
1653 "Objects: %lu\n"
1654 "Entries: %lu\n",
1655 f->path,
1656 sd_id128_to_string(f->header->file_id, a),
1657 sd_id128_to_string(f->header->machine_id, b),
1658 sd_id128_to_string(f->header->boot_id, c),
1659 (unsigned long long) le64toh(f->header->arena_size),
1660 (unsigned long) le64toh(f->header->n_objects),
1661 (unsigned long) le64toh(f->header->n_entries));
1662
1663 p = le64toh(f->header->arena_offset);
1664 while (p != 0) {
1665 r = journal_file_move_to_object(f, -1, p, &o);
1666 if (r < 0)
1667 goto fail;
1668
1669 switch (o->object.type) {
1670
1671 case OBJECT_UNUSED:
1672 printf("Type: OBJECT_UNUSED\n");
1673 break;
1674
1675 case OBJECT_DATA:
1676 printf("Type: OBJECT_DATA\n");
1677 break;
1678
1679 case OBJECT_ENTRY:
1680 printf("Type: OBJECT_ENTRY %llu %llu %llu\n",
1681 (unsigned long long) le64toh(o->entry.seqnum),
1682 (unsigned long long) le64toh(o->entry.monotonic),
1683 (unsigned long long) le64toh(o->entry.realtime));
1684 break;
1685
1686 case OBJECT_FIELD_HASH_TABLE:
1687 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
1688 break;
1689
1690 case OBJECT_DATA_HASH_TABLE:
1691 printf("Type: OBJECT_DATA_HASH_TABLE\n");
1692 break;
1693
1694 case OBJECT_ENTRY_ARRAY:
1695 printf("Type: OBJECT_ENTRY_ARRAY\n");
1696 break;
1697 }
1698
1699 if (o->object.flags & OBJECT_COMPRESSED)
1700 printf("Flags: COMPRESSED\n");
1701
1702 if (p == le64toh(f->header->tail_object_offset))
1703 p = 0;
1704 else
1705 p = p + ALIGN64(le64toh(o->object.size));
1706 }
1707
1708 return;
1709 fail:
1710 log_error("File corrupt");
1711 }
1712
1713 int journal_file_open(
1714 const char *fname,
1715 int flags,
1716 mode_t mode,
1717 JournalFile *template,
1718 JournalFile **ret) {
1719
1720 JournalFile *f;
1721 int r;
1722 bool newly_created = false;
1723
1724 assert(fname);
1725
1726 if ((flags & O_ACCMODE) != O_RDONLY &&
1727 (flags & O_ACCMODE) != O_RDWR)
1728 return -EINVAL;
1729
1730 if (!endswith(fname, ".journal"))
1731 return -EINVAL;
1732
1733 f = new0(JournalFile, 1);
1734 if (!f)
1735 return -ENOMEM;
1736
1737 f->fd = -1;
1738 f->flags = flags;
1739 f->mode = mode;
1740 f->writable = (flags & O_ACCMODE) != O_RDONLY;
1741 f->prot = prot_from_flags(flags);
1742
1743 if (template) {
1744 f->metrics = template->metrics;
1745 f->compress = template->compress;
1746 }
1747
1748 f->path = strdup(fname);
1749 if (!f->path) {
1750 r = -ENOMEM;
1751 goto fail;
1752 }
1753
1754 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
1755 if (f->fd < 0) {
1756 r = -errno;
1757 goto fail;
1758 }
1759
1760 if (fstat(f->fd, &f->last_stat) < 0) {
1761 r = -errno;
1762 goto fail;
1763 }
1764
1765 if (f->last_stat.st_size == 0 && f->writable) {
1766 newly_created = true;
1767
1768 r = journal_file_init_header(f, template);
1769 if (r < 0)
1770 goto fail;
1771
1772 if (fstat(f->fd, &f->last_stat) < 0) {
1773 r = -errno;
1774 goto fail;
1775 }
1776 }
1777
1778 if (f->last_stat.st_size < (off_t) sizeof(Header)) {
1779 r = -EIO;
1780 goto fail;
1781 }
1782
1783 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
1784 if (f->header == MAP_FAILED) {
1785 f->header = NULL;
1786 r = -errno;
1787 goto fail;
1788 }
1789
1790 if (!newly_created) {
1791 r = journal_file_verify_header(f);
1792 if (r < 0)
1793 goto fail;
1794 }
1795
1796 if (f->writable) {
1797 r = journal_file_refresh_header(f);
1798 if (r < 0)
1799 goto fail;
1800 }
1801
1802 if (newly_created) {
1803
1804 r = journal_file_setup_field_hash_table(f);
1805 if (r < 0)
1806 goto fail;
1807
1808 r = journal_file_setup_data_hash_table(f);
1809 if (r < 0)
1810 goto fail;
1811 }
1812
1813 r = journal_file_map_field_hash_table(f);
1814 if (r < 0)
1815 goto fail;
1816
1817 r = journal_file_map_data_hash_table(f);
1818 if (r < 0)
1819 goto fail;
1820
1821 if (ret)
1822 *ret = f;
1823
1824 return 0;
1825
1826 fail:
1827 journal_file_close(f);
1828
1829 return r;
1830 }
1831
1832 int journal_file_rotate(JournalFile **f) {
1833 char *p;
1834 size_t l;
1835 JournalFile *old_file, *new_file = NULL;
1836 int r;
1837
1838 assert(f);
1839 assert(*f);
1840
1841 old_file = *f;
1842
1843 if (!old_file->writable)
1844 return -EINVAL;
1845
1846 if (!endswith(old_file->path, ".journal"))
1847 return -EINVAL;
1848
1849 l = strlen(old_file->path);
1850
1851 p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
1852 if (!p)
1853 return -ENOMEM;
1854
1855 memcpy(p, old_file->path, l - 8);
1856 p[l-8] = '@';
1857 sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
1858 snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
1859 "-%016llx-%016llx.journal",
1860 (unsigned long long) le64toh((*f)->header->seqnum),
1861 (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
1862
1863 r = rename(old_file->path, p);
1864 free(p);
1865
1866 if (r < 0)
1867 return -errno;
1868
1869 old_file->header->state = STATE_ARCHIVED;
1870
1871 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, old_file, &new_file);
1872 journal_file_close(old_file);
1873
1874 *f = new_file;
1875 return r;
1876 }
1877
1878 int journal_file_open_reliably(
1879 const char *fname,
1880 int flags,
1881 mode_t mode,
1882 JournalFile *template,
1883 JournalFile **ret) {
1884
1885 int r;
1886 size_t l;
1887 char *p;
1888
1889 r = journal_file_open(fname, flags, mode, template, ret);
1890 if (r != -EBADMSG && /* corrupted */
1891 r != -ENODATA && /* truncated */
1892 r != -EHOSTDOWN && /* other machine */
1893 r != -EPROTONOSUPPORT) /* incompatible feature */
1894 return r;
1895
1896 if ((flags & O_ACCMODE) == O_RDONLY)
1897 return r;
1898
1899 if (!(flags & O_CREAT))
1900 return r;
1901
1902 /* The file is corrupted. Rotate it away and try it again (but only once) */
1903
1904 l = strlen(fname);
1905 if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
1906 (int) (l-8), fname,
1907 (unsigned long long) now(CLOCK_REALTIME),
1908 random_ull()) < 0)
1909 return -ENOMEM;
1910
1911 r = rename(fname, p);
1912 free(p);
1913 if (r < 0)
1914 return -errno;
1915
1916 log_warning("File %s corrupted, renaming and replacing.", fname);
1917
1918 return journal_file_open(fname, flags, mode, template, ret);
1919 }
1920
1921 struct vacuum_info {
1922 off_t usage;
1923 char *filename;
1924
1925 uint64_t realtime;
1926 sd_id128_t seqnum_id;
1927 uint64_t seqnum;
1928
1929 bool have_seqnum;
1930 };
1931
1932 static int vacuum_compare(const void *_a, const void *_b) {
1933 const struct vacuum_info *a, *b;
1934
1935 a = _a;
1936 b = _b;
1937
1938 if (a->have_seqnum && b->have_seqnum &&
1939 sd_id128_equal(a->seqnum_id, b->seqnum_id)) {
1940 if (a->seqnum < b->seqnum)
1941 return -1;
1942 else if (a->seqnum > b->seqnum)
1943 return 1;
1944 else
1945 return 0;
1946 }
1947
1948 if (a->realtime < b->realtime)
1949 return -1;
1950 else if (a->realtime > b->realtime)
1951 return 1;
1952 else if (a->have_seqnum && b->have_seqnum)
1953 return memcmp(&a->seqnum_id, &b->seqnum_id, 16);
1954 else
1955 return strcmp(a->filename, b->filename);
1956 }
1957
1958 int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t min_free) {
1959 DIR *d;
1960 int r = 0;
1961 struct vacuum_info *list = NULL;
1962 unsigned n_list = 0, n_allocated = 0, i;
1963 uint64_t sum = 0;
1964
1965 assert(directory);
1966
1967 if (max_use <= 0)
1968 return 0;
1969
1970 d = opendir(directory);
1971 if (!d)
1972 return -errno;
1973
1974 for (;;) {
1975 int k;
1976 struct dirent buf, *de;
1977 size_t q;
1978 struct stat st;
1979 char *p;
1980 unsigned long long seqnum, realtime;
1981 sd_id128_t seqnum_id;
1982 bool have_seqnum;
1983
1984 k = readdir_r(d, &buf, &de);
1985 if (k != 0) {
1986 r = -k;
1987 goto finish;
1988 }
1989
1990 if (!de)
1991 break;
1992
1993 if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0)
1994 continue;
1995
1996 if (!S_ISREG(st.st_mode))
1997 continue;
1998
1999 q = strlen(de->d_name);
2000
2001 if (endswith(de->d_name, ".journal")) {
2002
2003 /* Vacuum archived files */
2004
2005 if (q < 1 + 32 + 1 + 16 + 1 + 16 + 8)
2006 continue;
2007
2008 if (de->d_name[q-8-16-1] != '-' ||
2009 de->d_name[q-8-16-1-16-1] != '-' ||
2010 de->d_name[q-8-16-1-16-1-32-1] != '@')
2011 continue;
2012
2013 p = strdup(de->d_name);
2014 if (!p) {
2015 r = -ENOMEM;
2016 goto finish;
2017 }
2018
2019 de->d_name[q-8-16-1-16-1] = 0;
2020 if (sd_id128_from_string(de->d_name + q-8-16-1-16-1-32, &seqnum_id) < 0) {
2021 free(p);
2022 continue;
2023 }
2024
2025 if (sscanf(de->d_name + q-8-16-1-16, "%16llx-%16llx.journal", &seqnum, &realtime) != 2) {
2026 free(p);
2027 continue;
2028 }
2029
2030 have_seqnum = true;
2031
2032 } else if (endswith(de->d_name, ".journal~")) {
2033 unsigned long long tmp;
2034
2035 /* Vacuum corrupted files */
2036
2037 if (q < 1 + 16 + 1 + 16 + 8 + 1)
2038 continue;
2039
2040 if (de->d_name[q-1-8-16-1] != '-' ||
2041 de->d_name[q-1-8-16-1-16-1] != '@')
2042 continue;
2043
2044 p = strdup(de->d_name);
2045 if (!p) {
2046 r = -ENOMEM;
2047 goto finish;
2048 }
2049
2050 if (sscanf(de->d_name + q-1-8-16-1-16, "%16llx-%16llx.journal~", &realtime, &tmp) != 2) {
2051 free(p);
2052 continue;
2053 }
2054
2055 have_seqnum = false;
2056 } else
2057 continue;
2058
2059 if (n_list >= n_allocated) {
2060 struct vacuum_info *j;
2061
2062 n_allocated = MAX(n_allocated * 2U, 8U);
2063 j = realloc(list, n_allocated * sizeof(struct vacuum_info));
2064 if (!j) {
2065 free(p);
2066 r = -ENOMEM;
2067 goto finish;
2068 }
2069
2070 list = j;
2071 }
2072
2073 list[n_list].filename = p;
2074 list[n_list].usage = 512UL * (uint64_t) st.st_blocks;
2075 list[n_list].seqnum = seqnum;
2076 list[n_list].realtime = realtime;
2077 list[n_list].seqnum_id = seqnum_id;
2078 list[n_list].have_seqnum = have_seqnum;
2079
2080 sum += list[n_list].usage;
2081
2082 n_list ++;
2083 }
2084
2085 qsort(list, n_list, sizeof(struct vacuum_info), vacuum_compare);
2086
2087 for(i = 0; i < n_list; i++) {
2088 struct statvfs ss;
2089
2090 if (fstatvfs(dirfd(d), &ss) < 0) {
2091 r = -errno;
2092 goto finish;
2093 }
2094
2095 if (sum <= max_use &&
2096 (uint64_t) ss.f_bavail * (uint64_t) ss.f_bsize >= min_free)
2097 break;
2098
2099 if (unlinkat(dirfd(d), list[i].filename, 0) >= 0) {
2100 log_info("Deleted archived journal %s/%s.", directory, list[i].filename);
2101 sum -= list[i].usage;
2102 } else if (errno != ENOENT)
2103 log_warning("Failed to delete %s/%s: %m", directory, list[i].filename);
2104 }
2105
2106 finish:
2107 for (i = 0; i < n_list; i++)
2108 free(list[i].filename);
2109
2110 free(list);
2111
2112 if (d)
2113 closedir(d);
2114
2115 return r;
2116 }
2117
2118 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2119 uint64_t i, n;
2120 uint64_t q, xor_hash = 0;
2121 int r;
2122 EntryItem *items;
2123 dual_timestamp ts;
2124
2125 assert(from);
2126 assert(to);
2127 assert(o);
2128 assert(p);
2129
2130 if (!to->writable)
2131 return -EPERM;
2132
2133 ts.monotonic = le64toh(o->entry.monotonic);
2134 ts.realtime = le64toh(o->entry.realtime);
2135
2136 if (to->tail_entry_monotonic_valid &&
2137 ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2138 return -EINVAL;
2139
2140 if (ts.realtime < le64toh(to->header->tail_entry_realtime))
2141 return -EINVAL;
2142
2143 n = journal_file_entry_n_items(o);
2144 items = alloca(sizeof(EntryItem) * n);
2145
2146 for (i = 0; i < n; i++) {
2147 uint64_t l, h;
2148 le64_t le_hash;
2149 size_t t;
2150 void *data;
2151 Object *u;
2152
2153 q = le64toh(o->entry.items[i].object_offset);
2154 le_hash = o->entry.items[i].hash;
2155
2156 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2157 if (r < 0)
2158 return r;
2159
2160 if (le_hash != o->data.hash)
2161 return -EBADMSG;
2162
2163 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2164 t = (size_t) l;
2165
2166 /* We hit the limit on 32bit machines */
2167 if ((uint64_t) t != l)
2168 return -E2BIG;
2169
2170 if (o->object.flags & OBJECT_COMPRESSED) {
2171 #ifdef HAVE_XZ
2172 uint64_t rsize;
2173
2174 if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2175 return -EBADMSG;
2176
2177 data = from->compress_buffer;
2178 l = rsize;
2179 #else
2180 return -EPROTONOSUPPORT;
2181 #endif
2182 } else
2183 data = o->data.payload;
2184
2185 r = journal_file_append_data(to, data, l, &u, &h);
2186 if (r < 0)
2187 return r;
2188
2189 xor_hash ^= le64toh(u->data.hash);
2190 items[i].object_offset = htole64(h);
2191 items[i].hash = u->data.hash;
2192
2193 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2194 if (r < 0)
2195 return r;
2196 }
2197
2198 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2199 }
2200
2201 void journal_default_metrics(JournalMetrics *m, int fd) {
2202 uint64_t fs_size = 0;
2203 struct statvfs ss;
2204 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2205
2206 assert(m);
2207 assert(fd >= 0);
2208
2209 if (fstatvfs(fd, &ss) >= 0)
2210 fs_size = ss.f_frsize * ss.f_blocks;
2211
2212 if (m->max_use == (uint64_t) -1) {
2213
2214 if (fs_size > 0) {
2215 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2216
2217 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2218 m->max_use = DEFAULT_MAX_USE_UPPER;
2219
2220 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2221 m->max_use = DEFAULT_MAX_USE_LOWER;
2222 } else
2223 m->max_use = DEFAULT_MAX_USE_LOWER;
2224 } else {
2225 m->max_use = PAGE_ALIGN(m->max_use);
2226
2227 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2228 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2229 }
2230
2231 if (m->max_size == (uint64_t) -1) {
2232 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2233
2234 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2235 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2236 } else
2237 m->max_size = PAGE_ALIGN(m->max_size);
2238
2239 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2240 m->max_size = JOURNAL_FILE_SIZE_MIN;
2241
2242 if (m->max_size*2 > m->max_use)
2243 m->max_use = m->max_size*2;
2244
2245 if (m->min_size == (uint64_t) -1)
2246 m->min_size = JOURNAL_FILE_SIZE_MIN;
2247 else {
2248 m->min_size = PAGE_ALIGN(m->min_size);
2249
2250 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2251 m->min_size = JOURNAL_FILE_SIZE_MIN;
2252
2253 if (m->min_size > m->max_size)
2254 m->max_size = m->min_size;
2255 }
2256
2257 if (m->keep_free == (uint64_t) -1) {
2258
2259 if (fs_size > 0) {
2260 m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2261
2262 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2263 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2264
2265 } else
2266 m->keep_free = DEFAULT_KEEP_FREE;
2267 }
2268
2269 log_info("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2270 format_bytes(a, sizeof(a), m->max_use),
2271 format_bytes(b, sizeof(b), m->max_size),
2272 format_bytes(c, sizeof(c), m->min_size),
2273 format_bytes(d, sizeof(d), m->keep_free));
2274 }