]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/journal/journal-file.c
Fixed handling of posix_fallocate() returned value
[thirdparty/systemd.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "lookup3.h"
33 #include "compress.h"
34
/* Size in bytes of the item area of the data and field hash table
 * objects that this file sets up */
#define DEFAULT_DATA_HASH_TABLE_SIZE (16ULL*2047ULL)
#define DEFAULT_FIELD_HASH_TABLE_SIZE (16ULL*2047ULL)

/* Requests smaller than this are widened to this size when a window
 * is mapped */
#define DEFAULT_WINDOW_SIZE (8ULL << 20) /* 8 MiB */

/* Payloads shorter than this are never passed to the compressor */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* Minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (64ULL << 10) /* 64 KiB */

/* Lower and upper bounds applied when max_use is deduced from the
 * file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL << 20) /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL << 30) /* 4 GiB */

/* Upper bound applied when max_size is deduced from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL << 20) /* 128 MiB */

/* Upper bound applied when keep_free is deduced from the file system
 * size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL << 30) /* 4 GiB */

/* keep_free value used when the file system size cannot be
 * determined */
#define DEFAULT_KEEP_FREE (1ULL << 20) /* 1 MiB */

/* Magic bytes at the start of the header (deliberately not NUL terminated) */
static const char signature[] = { 'L', 'P', 'K', 'S', 'H', 'H', 'R', 'H' };

/* Rounds x up to the next multiple of 8 */
#define ALIGN64(x) (((x) + 7ULL) & ~7ULL)
64
65 void journal_file_close(JournalFile *f) {
66 int t;
67
68 assert(f);
69
70 if (f->header && f->writable)
71 f->header->state = STATE_OFFLINE;
72
73
74 for (t = 0; t < _WINDOW_MAX; t++)
75 if (f->windows[t].ptr)
76 munmap(f->windows[t].ptr, f->windows[t].size);
77
78 if (f->fd >= 0)
79 close_nointr_nofail(f->fd);
80
81 free(f->path);
82
83 #ifdef HAVE_XZ
84 free(f->compress_buffer);
85 #endif
86
87 free(f);
88 }
89
90 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
91 Header h;
92 ssize_t k;
93 int r;
94
95 assert(f);
96
97 zero(h);
98 memcpy(h.signature, signature, 8);
99 h.arena_offset = htole64(ALIGN64(sizeof(h)));
100
101 r = sd_id128_randomize(&h.file_id);
102 if (r < 0)
103 return r;
104
105 if (template) {
106 h.seqnum_id = template->header->seqnum_id;
107 h.seqnum = template->header->seqnum;
108 } else
109 h.seqnum_id = h.file_id;
110
111 k = pwrite(f->fd, &h, sizeof(h), 0);
112 if (k < 0)
113 return -errno;
114
115 if (k != sizeof(h))
116 return -EIO;
117
118 return 0;
119 }
120
121 static int journal_file_refresh_header(JournalFile *f) {
122 int r;
123 sd_id128_t boot_id;
124
125 assert(f);
126
127 r = sd_id128_get_machine(&f->header->machine_id);
128 if (r < 0)
129 return r;
130
131 r = sd_id128_get_boot(&boot_id);
132 if (r < 0)
133 return r;
134
135 if (sd_id128_equal(boot_id, f->header->boot_id))
136 f->tail_entry_monotonic_valid = true;
137
138 f->header->boot_id = boot_id;
139
140 f->header->state = STATE_ONLINE;
141
142 __sync_synchronize();
143
144 return 0;
145 }
146
147 static int journal_file_verify_header(JournalFile *f) {
148 assert(f);
149
150 if (memcmp(f->header, signature, 8))
151 return -EBADMSG;
152
153 #ifdef HAVE_XZ
154 if ((le64toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
155 return -EPROTONOSUPPORT;
156 #else
157 if (f->header->incompatible_flags != 0)
158 return -EPROTONOSUPPORT;
159 #endif
160
161 if ((uint64_t) f->last_stat.st_size < (le64toh(f->header->arena_offset) + le64toh(f->header->arena_size)))
162 return -ENODATA;
163
164 if (f->writable) {
165 uint8_t state;
166 sd_id128_t machine_id;
167 int r;
168
169 r = sd_id128_get_machine(&machine_id);
170 if (r < 0)
171 return r;
172
173 if (!sd_id128_equal(machine_id, f->header->machine_id))
174 return -EHOSTDOWN;
175
176 state = f->header->state;
177
178 if (state == STATE_ONLINE)
179 log_debug("Journal file %s is already online. Assuming unclean closing. Ignoring.", f->path);
180 else if (state == STATE_ARCHIVED)
181 return -ESHUTDOWN;
182 else if (state != STATE_OFFLINE)
183 log_debug("Journal file %s has unknown state %u. Ignoring.", f->path, state);
184 }
185
186 return 0;
187 }
188
189 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
190 uint64_t old_size, new_size;
191 int r;
192
193 assert(f);
194
195 /* We assume that this file is not sparse, and we know that
196 * for sure, since we always call posix_fallocate()
197 * ourselves */
198
199 old_size =
200 le64toh(f->header->arena_offset) +
201 le64toh(f->header->arena_size);
202
203 new_size = PAGE_ALIGN(offset + size);
204 if (new_size < le64toh(f->header->arena_offset))
205 new_size = le64toh(f->header->arena_offset);
206
207 if (new_size <= old_size)
208 return 0;
209
210 if (f->metrics.max_size > 0 &&
211 new_size > f->metrics.max_size)
212 return -E2BIG;
213
214 if (new_size > f->metrics.min_size &&
215 f->metrics.keep_free > 0) {
216 struct statvfs svfs;
217
218 if (fstatvfs(f->fd, &svfs) >= 0) {
219 uint64_t available;
220
221 available = svfs.f_bfree * svfs.f_bsize;
222
223 if (available >= f->metrics.keep_free)
224 available -= f->metrics.keep_free;
225 else
226 available = 0;
227
228 if (new_size - old_size > available)
229 return -E2BIG;
230 }
231 }
232
233 /* Note that the glibc fallocate() fallback is very
234 inefficient, hence we try to minimize the allocation area
235 as we can. */
236 r = posix_fallocate(f->fd, old_size, new_size - old_size);
237 if (r != 0)
238 return -r;
239
240 if (fstat(f->fd, &f->last_stat) < 0)
241 return -errno;
242
243 f->header->arena_size = htole64(new_size - le64toh(f->header->arena_offset));
244
245 return 0;
246 }
247
248 static int journal_file_map(
249 JournalFile *f,
250 uint64_t offset,
251 uint64_t size,
252 void **_window,
253 uint64_t *_woffset,
254 uint64_t *_wsize,
255 void **ret) {
256
257 uint64_t woffset, wsize;
258 void *window;
259
260 assert(f);
261 assert(size > 0);
262 assert(ret);
263
264 woffset = offset & ~((uint64_t) page_size() - 1ULL);
265 wsize = size + (offset - woffset);
266 wsize = PAGE_ALIGN(wsize);
267
268 /* Avoid SIGBUS on invalid accesses */
269 if (woffset + wsize > (uint64_t) PAGE_ALIGN(f->last_stat.st_size))
270 return -EADDRNOTAVAIL;
271
272 window = mmap(NULL, wsize, f->prot, MAP_SHARED, f->fd, woffset);
273 if (window == MAP_FAILED)
274 return -errno;
275
276 if (_window)
277 *_window = window;
278
279 if (_woffset)
280 *_woffset = woffset;
281
282 if (_wsize)
283 *_wsize = wsize;
284
285 *ret = (uint8_t*) window + (offset - woffset);
286
287 return 0;
288 }
289
290 static int journal_file_move_to(JournalFile *f, int wt, uint64_t offset, uint64_t size, void **ret) {
291 void *p = NULL;
292 uint64_t delta;
293 int r;
294 Window *w;
295
296 assert(f);
297 assert(ret);
298 assert(wt >= 0);
299 assert(wt < _WINDOW_MAX);
300
301 if (offset + size > (uint64_t) f->last_stat.st_size) {
302 /* Hmm, out of range? Let's refresh the fstat() data
303 * first, before we trust that check. */
304
305 if (fstat(f->fd, &f->last_stat) < 0 ||
306 offset + size > (uint64_t) f->last_stat.st_size)
307 return -EADDRNOTAVAIL;
308 }
309
310 w = f->windows + wt;
311
312 if (_likely_(w->ptr &&
313 w->offset <= offset &&
314 w->offset + w->size >= offset + size)) {
315
316 *ret = (uint8_t*) w->ptr + (offset - w->offset);
317 return 0;
318 }
319
320 if (w->ptr) {
321 if (munmap(w->ptr, w->size) < 0)
322 return -errno;
323
324 w->ptr = NULL;
325 w->size = w->offset = 0;
326 }
327
328 if (size < DEFAULT_WINDOW_SIZE) {
329 /* If the default window size is larger then what was
330 * asked for extend the mapping a bit in the hope to
331 * minimize needed remappings later on. We add half
332 * the window space before and half behind the
333 * requested mapping */
334
335 delta = (DEFAULT_WINDOW_SIZE - size) / 2;
336
337 if (delta > offset)
338 delta = offset;
339
340 offset -= delta;
341 size = DEFAULT_WINDOW_SIZE;
342 } else
343 delta = 0;
344
345 if (offset + size > (uint64_t) f->last_stat.st_size)
346 size = (uint64_t) f->last_stat.st_size - offset;
347
348 if (size <= 0)
349 return -EADDRNOTAVAIL;
350
351 r = journal_file_map(f,
352 offset, size,
353 &w->ptr, &w->offset, &w->size,
354 &p);
355
356 if (r < 0)
357 return r;
358
359 *ret = (uint8_t*) p + delta;
360 return 0;
361 }
362
363 static bool verify_hash(Object *o) {
364 uint64_t h1, h2;
365
366 assert(o);
367
368 if (o->object.type == OBJECT_DATA && !(o->object.flags & OBJECT_COMPRESSED)) {
369 h1 = le64toh(o->data.hash);
370 h2 = hash64(o->data.payload, le64toh(o->object.size) - offsetof(Object, data.payload));
371 } else if (o->object.type == OBJECT_FIELD) {
372 h1 = le64toh(o->field.hash);
373 h2 = hash64(o->field.payload, le64toh(o->object.size) - offsetof(Object, field.payload));
374 } else
375 return true;
376
377 return h1 == h2;
378 }
379
380 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
381 int r;
382 void *t;
383 Object *o;
384 uint64_t s;
385
386 assert(f);
387 assert(ret);
388 assert(type < _OBJECT_TYPE_MAX);
389
390 r = journal_file_move_to(f, type >= 0 ? type : WINDOW_UNKNOWN, offset, sizeof(ObjectHeader), &t);
391 if (r < 0)
392 return r;
393
394 o = (Object*) t;
395 s = le64toh(o->object.size);
396
397 if (s < sizeof(ObjectHeader))
398 return -EBADMSG;
399
400 if (type >= 0 && o->object.type != type)
401 return -EBADMSG;
402
403 if (s > sizeof(ObjectHeader)) {
404 r = journal_file_move_to(f, o->object.type, offset, s, &t);
405 if (r < 0)
406 return r;
407
408 o = (Object*) t;
409 }
410
411 if (!verify_hash(o))
412 return -EBADMSG;
413
414 *ret = o;
415 return 0;
416 }
417
418 static uint64_t journal_file_seqnum(JournalFile *f, uint64_t *seqnum) {
419 uint64_t r;
420
421 assert(f);
422
423 r = le64toh(f->header->seqnum) + 1;
424
425 if (seqnum) {
426 /* If an external seqnum counter was passed, we update
427 * both the local and the external one, and set it to
428 * the maximum of both */
429
430 if (*seqnum + 1 > r)
431 r = *seqnum + 1;
432
433 *seqnum = r;
434 }
435
436 f->header->seqnum = htole64(r);
437
438 if (f->header->first_seqnum == 0)
439 f->header->first_seqnum = htole64(r);
440
441 return r;
442 }
443
444 static int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
445 int r;
446 uint64_t p;
447 Object *tail, *o;
448 void *t;
449
450 assert(f);
451 assert(size >= sizeof(ObjectHeader));
452 assert(offset);
453 assert(ret);
454
455 p = le64toh(f->header->tail_object_offset);
456 if (p == 0)
457 p = le64toh(f->header->arena_offset);
458 else {
459 r = journal_file_move_to_object(f, -1, p, &tail);
460 if (r < 0)
461 return r;
462
463 p += ALIGN64(le64toh(tail->object.size));
464 }
465
466 r = journal_file_allocate(f, p, size);
467 if (r < 0)
468 return r;
469
470 r = journal_file_move_to(f, type, p, size, &t);
471 if (r < 0)
472 return r;
473
474 o = (Object*) t;
475
476 zero(o->object);
477 o->object.type = type;
478 o->object.size = htole64(size);
479
480 f->header->tail_object_offset = htole64(p);
481 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
482
483 *ret = o;
484 *offset = p;
485
486 return 0;
487 }
488
489 static int journal_file_setup_data_hash_table(JournalFile *f) {
490 uint64_t s, p;
491 Object *o;
492 int r;
493
494 assert(f);
495
496 s = DEFAULT_DATA_HASH_TABLE_SIZE;
497 r = journal_file_append_object(f,
498 OBJECT_DATA_HASH_TABLE,
499 offsetof(Object, hash_table.items) + s,
500 &o, &p);
501 if (r < 0)
502 return r;
503
504 memset(o->hash_table.items, 0, s);
505
506 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
507 f->header->data_hash_table_size = htole64(s);
508
509 return 0;
510 }
511
512 static int journal_file_setup_field_hash_table(JournalFile *f) {
513 uint64_t s, p;
514 Object *o;
515 int r;
516
517 assert(f);
518
519 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
520 r = journal_file_append_object(f,
521 OBJECT_FIELD_HASH_TABLE,
522 offsetof(Object, hash_table.items) + s,
523 &o, &p);
524 if (r < 0)
525 return r;
526
527 memset(o->hash_table.items, 0, s);
528
529 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
530 f->header->field_hash_table_size = htole64(s);
531
532 return 0;
533 }
534
535 static int journal_file_map_data_hash_table(JournalFile *f) {
536 uint64_t s, p;
537 void *t;
538 int r;
539
540 assert(f);
541
542 p = le64toh(f->header->data_hash_table_offset);
543 s = le64toh(f->header->data_hash_table_size);
544
545 r = journal_file_move_to(f,
546 WINDOW_DATA_HASH_TABLE,
547 p, s,
548 &t);
549 if (r < 0)
550 return r;
551
552 f->data_hash_table = t;
553 return 0;
554 }
555
556 static int journal_file_map_field_hash_table(JournalFile *f) {
557 uint64_t s, p;
558 void *t;
559 int r;
560
561 assert(f);
562
563 p = le64toh(f->header->field_hash_table_offset);
564 s = le64toh(f->header->field_hash_table_size);
565
566 r = journal_file_move_to(f,
567 WINDOW_FIELD_HASH_TABLE,
568 p, s,
569 &t);
570 if (r < 0)
571 return r;
572
573 f->field_hash_table = t;
574 return 0;
575 }
576
577 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
578 uint64_t p, h;
579 int r;
580
581 assert(f);
582 assert(o);
583 assert(offset > 0);
584 assert(o->object.type == OBJECT_DATA);
585
586 /* This might alter the window we are looking at */
587
588 o->data.next_hash_offset = o->data.next_field_offset = 0;
589 o->data.entry_offset = o->data.entry_array_offset = 0;
590 o->data.n_entries = 0;
591
592 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
593 p = le64toh(f->data_hash_table[h].head_hash_offset);
594 if (p == 0) {
595 /* Only entry in the hash table is easy */
596 f->data_hash_table[h].head_hash_offset = htole64(offset);
597 } else {
598 /* Move back to the previous data object, to patch in
599 * pointer */
600
601 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
602 if (r < 0)
603 return r;
604
605 o->data.next_hash_offset = htole64(offset);
606 }
607
608 f->data_hash_table[h].tail_hash_offset = htole64(offset);
609
610 return 0;
611 }
612
613 int journal_file_find_data_object_with_hash(
614 JournalFile *f,
615 const void *data, uint64_t size, uint64_t hash,
616 Object **ret, uint64_t *offset) {
617
618 uint64_t p, osize, h;
619 int r;
620
621 assert(f);
622 assert(data || size == 0);
623
624 osize = offsetof(Object, data.payload) + size;
625
626 if (f->header->data_hash_table_size == 0)
627 return -EBADMSG;
628
629 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
630 p = le64toh(f->data_hash_table[h].head_hash_offset);
631
632 while (p > 0) {
633 Object *o;
634
635 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
636 if (r < 0)
637 return r;
638
639 if (le64toh(o->data.hash) != hash)
640 goto next;
641
642 if (o->object.flags & OBJECT_COMPRESSED) {
643 #ifdef HAVE_XZ
644 uint64_t l, rsize;
645
646 l = le64toh(o->object.size);
647 if (l <= offsetof(Object, data.payload))
648 return -EBADMSG;
649
650 l -= offsetof(Object, data.payload);
651
652 if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
653 return -EBADMSG;
654
655 if (rsize == size &&
656 memcmp(f->compress_buffer, data, size) == 0) {
657
658 if (ret)
659 *ret = o;
660
661 if (offset)
662 *offset = p;
663
664 return 1;
665 }
666 #else
667 return -EPROTONOSUPPORT;
668 #endif
669
670 } else if (le64toh(o->object.size) == osize &&
671 memcmp(o->data.payload, data, size) == 0) {
672
673 if (ret)
674 *ret = o;
675
676 if (offset)
677 *offset = p;
678
679 return 1;
680 }
681
682 next:
683 p = le64toh(o->data.next_hash_offset);
684 }
685
686 return 0;
687 }
688
689 int journal_file_find_data_object(
690 JournalFile *f,
691 const void *data, uint64_t size,
692 Object **ret, uint64_t *offset) {
693
694 uint64_t hash;
695
696 assert(f);
697 assert(data || size == 0);
698
699 hash = hash64(data, size);
700
701 return journal_file_find_data_object_with_hash(f,
702 data, size, hash,
703 ret, offset);
704 }
705
706 static int journal_file_append_data(
707 JournalFile *f,
708 const void *data, uint64_t size,
709 Object **ret, uint64_t *offset) {
710
711 uint64_t hash, p;
712 uint64_t osize;
713 Object *o;
714 int r;
715 bool compressed = false;
716
717 assert(f);
718 assert(data || size == 0);
719
720 hash = hash64(data, size);
721
722 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
723 if (r < 0)
724 return r;
725 else if (r > 0) {
726
727 if (ret)
728 *ret = o;
729
730 if (offset)
731 *offset = p;
732
733 return 0;
734 }
735
736 osize = offsetof(Object, data.payload) + size;
737 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
738 if (r < 0)
739 return r;
740
741 o->data.hash = htole64(hash);
742
743 #ifdef HAVE_XZ
744 if (f->compress &&
745 size >= COMPRESSION_SIZE_THRESHOLD) {
746 uint64_t rsize;
747
748 compressed = compress_blob(data, size, o->data.payload, &rsize);
749
750 if (compressed) {
751 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
752 o->object.flags |= OBJECT_COMPRESSED;
753
754 f->header->incompatible_flags = htole32(le32toh(f->header->incompatible_flags) | HEADER_INCOMPATIBLE_COMPRESSED);
755
756 log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
757 }
758 }
759 #endif
760
761 if (!compressed)
762 memcpy(o->data.payload, data, size);
763
764 r = journal_file_link_data(f, o, p, hash);
765 if (r < 0)
766 return r;
767
768 /* The linking might have altered the window, so let's
769 * refresh our pointer */
770 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
771 if (r < 0)
772 return r;
773
774 if (ret)
775 *ret = o;
776
777 if (offset)
778 *offset = p;
779
780 return 0;
781 }
782
783 uint64_t journal_file_entry_n_items(Object *o) {
784 assert(o);
785 assert(o->object.type == OBJECT_ENTRY);
786
787 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
788 }
789
790 static uint64_t journal_file_entry_array_n_items(Object *o) {
791 assert(o);
792 assert(o->object.type == OBJECT_ENTRY_ARRAY);
793
794 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
795 }
796
797 static int link_entry_into_array(JournalFile *f,
798 le64_t *first,
799 le64_t *idx,
800 uint64_t p) {
801 int r;
802 uint64_t n = 0, ap = 0, q, i, a, hidx;
803 Object *o;
804
805 assert(f);
806 assert(first);
807 assert(idx);
808 assert(p > 0);
809
810 a = le64toh(*first);
811 i = hidx = le64toh(*idx);
812 while (a > 0) {
813
814 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
815 if (r < 0)
816 return r;
817
818 n = journal_file_entry_array_n_items(o);
819 if (i < n) {
820 o->entry_array.items[i] = htole64(p);
821 *idx = htole64(hidx + 1);
822 return 0;
823 }
824
825 i -= n;
826 ap = a;
827 a = le64toh(o->entry_array.next_entry_array_offset);
828 }
829
830 if (hidx > n)
831 n = (hidx+1) * 2;
832 else
833 n = n * 2;
834
835 if (n < 4)
836 n = 4;
837
838 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
839 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
840 &o, &q);
841 if (r < 0)
842 return r;
843
844 o->entry_array.items[i] = htole64(p);
845
846 if (ap == 0)
847 *first = htole64(q);
848 else {
849 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
850 if (r < 0)
851 return r;
852
853 o->entry_array.next_entry_array_offset = htole64(q);
854 }
855
856 *idx = htole64(hidx + 1);
857
858 return 0;
859 }
860
861 static int link_entry_into_array_plus_one(JournalFile *f,
862 le64_t *extra,
863 le64_t *first,
864 le64_t *idx,
865 uint64_t p) {
866
867 int r;
868
869 assert(f);
870 assert(extra);
871 assert(first);
872 assert(idx);
873 assert(p > 0);
874
875 if (*idx == 0)
876 *extra = htole64(p);
877 else {
878 le64_t i;
879
880 i = htole64(le64toh(*idx) - 1);
881 r = link_entry_into_array(f, first, &i, p);
882 if (r < 0)
883 return r;
884 }
885
886 *idx = htole64(le64toh(*idx) + 1);
887 return 0;
888 }
889
890 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
891 uint64_t p;
892 int r;
893 assert(f);
894 assert(o);
895 assert(offset > 0);
896
897 p = le64toh(o->entry.items[i].object_offset);
898 if (p == 0)
899 return -EINVAL;
900
901 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
902 if (r < 0)
903 return r;
904
905 return link_entry_into_array_plus_one(f,
906 &o->data.entry_offset,
907 &o->data.entry_array_offset,
908 &o->data.n_entries,
909 offset);
910 }
911
912 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
913 uint64_t n, i;
914 int r;
915
916 assert(f);
917 assert(o);
918 assert(offset > 0);
919 assert(o->object.type == OBJECT_ENTRY);
920
921 __sync_synchronize();
922
923 /* Link up the entry itself */
924 r = link_entry_into_array(f,
925 &f->header->entry_array_offset,
926 &f->header->n_entries,
927 offset);
928 if (r < 0)
929 return r;
930
931 /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
932
933 if (f->header->head_entry_realtime == 0)
934 f->header->head_entry_realtime = o->entry.realtime;
935
936 f->header->tail_entry_realtime = o->entry.realtime;
937 f->header->tail_entry_monotonic = o->entry.monotonic;
938
939 f->tail_entry_monotonic_valid = true;
940
941 /* Link up the items */
942 n = journal_file_entry_n_items(o);
943 for (i = 0; i < n; i++) {
944 r = journal_file_link_entry_item(f, o, offset, i);
945 if (r < 0)
946 return r;
947 }
948
949 return 0;
950 }
951
952 static int journal_file_append_entry_internal(
953 JournalFile *f,
954 const dual_timestamp *ts,
955 uint64_t xor_hash,
956 const EntryItem items[], unsigned n_items,
957 uint64_t *seqnum,
958 Object **ret, uint64_t *offset) {
959 uint64_t np;
960 uint64_t osize;
961 Object *o;
962 int r;
963
964 assert(f);
965 assert(items || n_items == 0);
966 assert(ts);
967
968 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
969
970 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
971 if (r < 0)
972 return r;
973
974 o->entry.seqnum = htole64(journal_file_seqnum(f, seqnum));
975 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
976 o->entry.realtime = htole64(ts->realtime);
977 o->entry.monotonic = htole64(ts->monotonic);
978 o->entry.xor_hash = htole64(xor_hash);
979 o->entry.boot_id = f->header->boot_id;
980
981 r = journal_file_link_entry(f, o, np);
982 if (r < 0)
983 return r;
984
985 if (ret)
986 *ret = o;
987
988 if (offset)
989 *offset = np;
990
991 return 0;
992 }
993
994 void journal_file_post_change(JournalFile *f) {
995 assert(f);
996
997 /* inotify() does not receive IN_MODIFY events from file
998 * accesses done via mmap(). After each access we hence
999 * trigger IN_MODIFY by truncating the journal file to its
1000 * current size which triggers IN_MODIFY. */
1001
1002 __sync_synchronize();
1003
1004 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1005 log_error("Failed to to truncate file to its own size: %m");
1006 }
1007
1008 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1009 unsigned i;
1010 EntryItem *items;
1011 int r;
1012 uint64_t xor_hash = 0;
1013 struct dual_timestamp _ts;
1014
1015 assert(f);
1016 assert(iovec || n_iovec == 0);
1017
1018 if (!f->writable)
1019 return -EPERM;
1020
1021 if (!ts) {
1022 dual_timestamp_get(&_ts);
1023 ts = &_ts;
1024 }
1025
1026 if (f->tail_entry_monotonic_valid &&
1027 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1028 return -EINVAL;
1029
1030 items = alloca(sizeof(EntryItem) * n_iovec);
1031
1032 for (i = 0; i < n_iovec; i++) {
1033 uint64_t p;
1034 Object *o;
1035
1036 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1037 if (r < 0)
1038 return r;
1039
1040 xor_hash ^= le64toh(o->data.hash);
1041 items[i].object_offset = htole64(p);
1042 items[i].hash = o->data.hash;
1043 }
1044
1045 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1046
1047 journal_file_post_change(f);
1048
1049 return r;
1050 }
1051
1052 static int generic_array_get(JournalFile *f,
1053 uint64_t first,
1054 uint64_t i,
1055 Object **ret, uint64_t *offset) {
1056
1057 Object *o;
1058 uint64_t p = 0, a;
1059 int r;
1060
1061 assert(f);
1062
1063 a = first;
1064 while (a > 0) {
1065 uint64_t n;
1066
1067 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1068 if (r < 0)
1069 return r;
1070
1071 n = journal_file_entry_array_n_items(o);
1072 if (i < n) {
1073 p = le64toh(o->entry_array.items[i]);
1074 break;
1075 }
1076
1077 i -= n;
1078 a = le64toh(o->entry_array.next_entry_array_offset);
1079 }
1080
1081 if (a <= 0 || p <= 0)
1082 return 0;
1083
1084 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1085 if (r < 0)
1086 return r;
1087
1088 if (ret)
1089 *ret = o;
1090
1091 if (offset)
1092 *offset = p;
1093
1094 return 1;
1095 }
1096
1097 static int generic_array_get_plus_one(JournalFile *f,
1098 uint64_t extra,
1099 uint64_t first,
1100 uint64_t i,
1101 Object **ret, uint64_t *offset) {
1102
1103 Object *o;
1104
1105 assert(f);
1106
1107 if (i == 0) {
1108 int r;
1109
1110 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1111 if (r < 0)
1112 return r;
1113
1114 if (ret)
1115 *ret = o;
1116
1117 if (offset)
1118 *offset = extra;
1119
1120 return 1;
1121 }
1122
1123 return generic_array_get(f, first, i-1, ret, offset);
1124 }
1125
/* Results of the test_object callbacks used while bisecting */
enum {
        TEST_FOUND = 0, /* the tested object matches the needle */
        TEST_LEFT  = 1, /* the tested object is smaller than the needle */
        TEST_RIGHT = 2  /* the tested object is larger than the needle */
};
1131
1132 static int generic_array_bisect(JournalFile *f,
1133 uint64_t first,
1134 uint64_t n,
1135 uint64_t needle,
1136 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1137 direction_t direction,
1138 Object **ret,
1139 uint64_t *offset,
1140 uint64_t *idx) {
1141
1142 uint64_t a, p, t = 0, i = 0, last_p = 0;
1143 bool subtract_one = false;
1144 Object *o, *array = NULL;
1145 int r;
1146
1147 assert(f);
1148 assert(test_object);
1149
1150 a = first;
1151 while (a > 0) {
1152 uint64_t left, right, k, lp;
1153
1154 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1155 if (r < 0)
1156 return r;
1157
1158 k = journal_file_entry_array_n_items(array);
1159 right = MIN(k, n);
1160 if (right <= 0)
1161 return 0;
1162
1163 i = right - 1;
1164 lp = p = le64toh(array->entry_array.items[i]);
1165 if (p <= 0)
1166 return -EBADMSG;
1167
1168 r = test_object(f, p, needle);
1169 if (r < 0)
1170 return r;
1171
1172 if (r == TEST_FOUND)
1173 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1174
1175 if (r == TEST_RIGHT) {
1176 left = 0;
1177 right -= 1;
1178 for (;;) {
1179 if (left == right) {
1180 if (direction == DIRECTION_UP)
1181 subtract_one = true;
1182
1183 i = left;
1184 goto found;
1185 }
1186
1187 assert(left < right);
1188
1189 i = (left + right) / 2;
1190 p = le64toh(array->entry_array.items[i]);
1191 if (p <= 0)
1192 return -EBADMSG;
1193
1194 r = test_object(f, p, needle);
1195 if (r < 0)
1196 return r;
1197
1198 if (r == TEST_FOUND)
1199 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1200
1201 if (r == TEST_RIGHT)
1202 right = i;
1203 else
1204 left = i + 1;
1205 }
1206 }
1207
1208 if (k > n)
1209 return 0;
1210
1211 last_p = lp;
1212
1213 n -= k;
1214 t += k;
1215 a = le64toh(array->entry_array.next_entry_array_offset);
1216 }
1217
1218 return 0;
1219
1220 found:
1221 if (subtract_one && t == 0 && i == 0)
1222 return 0;
1223
1224 if (subtract_one && i == 0)
1225 p = last_p;
1226 else if (subtract_one)
1227 p = le64toh(array->entry_array.items[i-1]);
1228 else
1229 p = le64toh(array->entry_array.items[i]);
1230
1231 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1232 if (r < 0)
1233 return r;
1234
1235 if (ret)
1236 *ret = o;
1237
1238 if (offset)
1239 *offset = p;
1240
1241 if (idx)
1242 *idx = t + i - (subtract_one ? 1 : 0);
1243
1244 return 1;
1245 }
1246
1247 static int generic_array_bisect_plus_one(JournalFile *f,
1248 uint64_t extra,
1249 uint64_t first,
1250 uint64_t n,
1251 uint64_t needle,
1252 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1253 direction_t direction,
1254 Object **ret,
1255 uint64_t *offset,
1256 uint64_t *idx) {
1257
1258 int r;
1259
1260 assert(f);
1261 assert(test_object);
1262
1263 if (n <= 0)
1264 return 0;
1265
1266 /* This bisects the array in object 'first', but first checks
1267 * an extra */
1268 r = test_object(f, extra, needle);
1269 if (r < 0)
1270 return r;
1271 else if (r == TEST_FOUND) {
1272 Object *o;
1273
1274 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1275 if (r < 0)
1276 return r;
1277
1278 if (ret)
1279 *ret = o;
1280
1281 if (offset)
1282 *offset = extra;
1283
1284 if (idx)
1285 *idx = 0;
1286
1287 return 1;
1288 } else if (r == TEST_RIGHT)
1289 return 0;
1290
1291 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1292
1293 if (r > 0)
1294 (*idx) ++;
1295
1296 return r;
1297 }
1298
1299 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1300 Object *o;
1301 int r;
1302
1303 assert(f);
1304 assert(p > 0);
1305
1306 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1307 if (r < 0)
1308 return r;
1309
1310 if (le64toh(o->entry.seqnum) == needle)
1311 return TEST_FOUND;
1312 else if (le64toh(o->entry.seqnum) < needle)
1313 return TEST_LEFT;
1314 else
1315 return TEST_RIGHT;
1316 }
1317
1318 int journal_file_move_to_entry_by_seqnum(
1319 JournalFile *f,
1320 uint64_t seqnum,
1321 direction_t direction,
1322 Object **ret,
1323 uint64_t *offset) {
1324
1325 return generic_array_bisect(f,
1326 le64toh(f->header->entry_array_offset),
1327 le64toh(f->header->n_entries),
1328 seqnum,
1329 test_object_seqnum,
1330 direction,
1331 ret, offset, NULL);
1332 }
1333
1334 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1335 Object *o;
1336 int r;
1337
1338 assert(f);
1339 assert(p > 0);
1340
1341 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1342 if (r < 0)
1343 return r;
1344
1345 if (le64toh(o->entry.realtime) == needle)
1346 return TEST_FOUND;
1347 else if (le64toh(o->entry.realtime) < needle)
1348 return TEST_LEFT;
1349 else
1350 return TEST_RIGHT;
1351 }
1352
1353 int journal_file_move_to_entry_by_realtime(
1354 JournalFile *f,
1355 uint64_t realtime,
1356 direction_t direction,
1357 Object **ret,
1358 uint64_t *offset) {
1359
1360 return generic_array_bisect(f,
1361 le64toh(f->header->entry_array_offset),
1362 le64toh(f->header->n_entries),
1363 realtime,
1364 test_object_realtime,
1365 direction,
1366 ret, offset, NULL);
1367 }
1368
1369 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1370 Object *o;
1371 int r;
1372
1373 assert(f);
1374 assert(p > 0);
1375
1376 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1377 if (r < 0)
1378 return r;
1379
1380 if (le64toh(o->entry.monotonic) == needle)
1381 return TEST_FOUND;
1382 else if (le64toh(o->entry.monotonic) < needle)
1383 return TEST_LEFT;
1384 else
1385 return TEST_RIGHT;
1386 }
1387
1388 int journal_file_move_to_entry_by_monotonic(
1389 JournalFile *f,
1390 sd_id128_t boot_id,
1391 uint64_t monotonic,
1392 direction_t direction,
1393 Object **ret,
1394 uint64_t *offset) {
1395
1396 char t[8+32+1] = "_BOOT_ID=";
1397 Object *o;
1398 int r;
1399
1400 sd_id128_to_string(boot_id, t + 8);
1401
1402 r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1403 if (r < 0)
1404 return r;
1405 else if (r == 0)
1406 return -ENOENT;
1407
1408 return generic_array_bisect_plus_one(f,
1409 le64toh(o->data.entry_offset),
1410 le64toh(o->data.entry_array_offset),
1411 le64toh(o->data.n_entries),
1412 monotonic,
1413 test_object_monotonic,
1414 direction,
1415 ret, offset, NULL);
1416 }
1417
1418 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1419 assert(f);
1420 assert(p > 0);
1421
1422 if (p == needle)
1423 return TEST_FOUND;
1424 else if (p < needle)
1425 return TEST_LEFT;
1426 else
1427 return TEST_RIGHT;
1428 }
1429
1430 int journal_file_next_entry(
1431 JournalFile *f,
1432 Object *o, uint64_t p,
1433 direction_t direction,
1434 Object **ret, uint64_t *offset) {
1435
1436 uint64_t i, n;
1437 int r;
1438
1439 assert(f);
1440 assert(p > 0 || !o);
1441
1442 n = le64toh(f->header->n_entries);
1443 if (n <= 0)
1444 return 0;
1445
1446 if (!o)
1447 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1448 else {
1449 if (o->object.type != OBJECT_ENTRY)
1450 return -EINVAL;
1451
1452 r = generic_array_bisect(f,
1453 le64toh(f->header->entry_array_offset),
1454 le64toh(f->header->n_entries),
1455 p,
1456 test_object_offset,
1457 DIRECTION_DOWN,
1458 NULL, NULL,
1459 &i);
1460 if (r <= 0)
1461 return r;
1462
1463 if (direction == DIRECTION_DOWN) {
1464 if (i >= n - 1)
1465 return 0;
1466
1467 i++;
1468 } else {
1469 if (i <= 0)
1470 return 0;
1471
1472 i--;
1473 }
1474 }
1475
1476 /* And jump to it */
1477 return generic_array_get(f,
1478 le64toh(f->header->entry_array_offset),
1479 i,
1480 ret, offset);
1481 }
1482
1483 int journal_file_skip_entry(
1484 JournalFile *f,
1485 Object *o, uint64_t p,
1486 int64_t skip,
1487 Object **ret, uint64_t *offset) {
1488
1489 uint64_t i, n;
1490 int r;
1491
1492 assert(f);
1493 assert(o);
1494 assert(p > 0);
1495
1496 if (o->object.type != OBJECT_ENTRY)
1497 return -EINVAL;
1498
1499 r = generic_array_bisect(f,
1500 le64toh(f->header->entry_array_offset),
1501 le64toh(f->header->n_entries),
1502 p,
1503 test_object_offset,
1504 DIRECTION_DOWN,
1505 NULL, NULL,
1506 &i);
1507 if (r <= 0)
1508 return r;
1509
1510 /* Calculate new index */
1511 if (skip < 0) {
1512 if ((uint64_t) -skip >= i)
1513 i = 0;
1514 else
1515 i = i - (uint64_t) -skip;
1516 } else
1517 i += (uint64_t) skip;
1518
1519 n = le64toh(f->header->n_entries);
1520 if (n <= 0)
1521 return -EBADMSG;
1522
1523 if (i >= n)
1524 i = n-1;
1525
1526 return generic_array_get(f,
1527 le64toh(f->header->entry_array_offset),
1528 i,
1529 ret, offset);
1530 }
1531
1532 int journal_file_next_entry_for_data(
1533 JournalFile *f,
1534 Object *o, uint64_t p,
1535 uint64_t data_offset,
1536 direction_t direction,
1537 Object **ret, uint64_t *offset) {
1538
1539 uint64_t n, i;
1540 int r;
1541 Object *d;
1542
1543 assert(f);
1544 assert(p > 0 || !o);
1545
1546 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1547 if (r < 0)
1548 return r;
1549
1550 n = le64toh(d->data.n_entries);
1551 if (n <= 0)
1552 return n;
1553
1554 if (!o)
1555 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1556 else {
1557 if (o->object.type != OBJECT_ENTRY)
1558 return -EINVAL;
1559
1560 r = generic_array_bisect_plus_one(f,
1561 le64toh(d->data.entry_offset),
1562 le64toh(d->data.entry_array_offset),
1563 le64toh(d->data.n_entries),
1564 p,
1565 test_object_offset,
1566 DIRECTION_DOWN,
1567 NULL, NULL,
1568 &i);
1569
1570 if (r <= 0)
1571 return r;
1572
1573 if (direction == DIRECTION_DOWN) {
1574 if (i >= n - 1)
1575 return 0;
1576
1577 i++;
1578 } else {
1579 if (i <= 0)
1580 return 0;
1581
1582 i--;
1583 }
1584
1585 }
1586
1587 return generic_array_get_plus_one(f,
1588 le64toh(d->data.entry_offset),
1589 le64toh(d->data.entry_array_offset),
1590 i,
1591 ret, offset);
1592 }
1593
1594 int journal_file_move_to_entry_by_seqnum_for_data(
1595 JournalFile *f,
1596 uint64_t data_offset,
1597 uint64_t seqnum,
1598 direction_t direction,
1599 Object **ret, uint64_t *offset) {
1600
1601 Object *d;
1602 int r;
1603
1604 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1605 if (r <= 0)
1606 return r;
1607
1608 return generic_array_bisect_plus_one(f,
1609 le64toh(d->data.entry_offset),
1610 le64toh(d->data.entry_array_offset),
1611 le64toh(d->data.n_entries),
1612 seqnum,
1613 test_object_seqnum,
1614 direction,
1615 ret, offset, NULL);
1616 }
1617
1618 int journal_file_move_to_entry_by_realtime_for_data(
1619 JournalFile *f,
1620 uint64_t data_offset,
1621 uint64_t realtime,
1622 direction_t direction,
1623 Object **ret, uint64_t *offset) {
1624
1625 Object *d;
1626 int r;
1627
1628 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1629 if (r <= 0)
1630 return r;
1631
1632 return generic_array_bisect_plus_one(f,
1633 le64toh(d->data.entry_offset),
1634 le64toh(d->data.entry_array_offset),
1635 le64toh(d->data.n_entries),
1636 realtime,
1637 test_object_realtime,
1638 direction,
1639 ret, offset, NULL);
1640 }
1641
1642 void journal_file_dump(JournalFile *f) {
1643 char a[33], b[33], c[33];
1644 Object *o;
1645 int r;
1646 uint64_t p;
1647
1648 assert(f);
1649
1650 printf("File Path: %s\n"
1651 "File ID: %s\n"
1652 "Machine ID: %s\n"
1653 "Boot ID: %s\n"
1654 "Arena size: %llu\n"
1655 "Objects: %lu\n"
1656 "Entries: %lu\n",
1657 f->path,
1658 sd_id128_to_string(f->header->file_id, a),
1659 sd_id128_to_string(f->header->machine_id, b),
1660 sd_id128_to_string(f->header->boot_id, c),
1661 (unsigned long long) le64toh(f->header->arena_size),
1662 (unsigned long) le64toh(f->header->n_objects),
1663 (unsigned long) le64toh(f->header->n_entries));
1664
1665 p = le64toh(f->header->arena_offset);
1666 while (p != 0) {
1667 r = journal_file_move_to_object(f, -1, p, &o);
1668 if (r < 0)
1669 goto fail;
1670
1671 switch (o->object.type) {
1672
1673 case OBJECT_UNUSED:
1674 printf("Type: OBJECT_UNUSED\n");
1675 break;
1676
1677 case OBJECT_DATA:
1678 printf("Type: OBJECT_DATA\n");
1679 break;
1680
1681 case OBJECT_ENTRY:
1682 printf("Type: OBJECT_ENTRY %llu %llu %llu\n",
1683 (unsigned long long) le64toh(o->entry.seqnum),
1684 (unsigned long long) le64toh(o->entry.monotonic),
1685 (unsigned long long) le64toh(o->entry.realtime));
1686 break;
1687
1688 case OBJECT_FIELD_HASH_TABLE:
1689 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
1690 break;
1691
1692 case OBJECT_DATA_HASH_TABLE:
1693 printf("Type: OBJECT_DATA_HASH_TABLE\n");
1694 break;
1695
1696 case OBJECT_ENTRY_ARRAY:
1697 printf("Type: OBJECT_ENTRY_ARRAY\n");
1698 break;
1699 }
1700
1701 if (o->object.flags & OBJECT_COMPRESSED)
1702 printf("Flags: COMPRESSED\n");
1703
1704 if (p == le64toh(f->header->tail_object_offset))
1705 p = 0;
1706 else
1707 p = p + ALIGN64(le64toh(o->object.size));
1708 }
1709
1710 return;
1711 fail:
1712 log_error("File corrupt");
1713 }
1714
1715 int journal_file_open(
1716 const char *fname,
1717 int flags,
1718 mode_t mode,
1719 JournalFile *template,
1720 JournalFile **ret) {
1721
1722 JournalFile *f;
1723 int r;
1724 bool newly_created = false;
1725
1726 assert(fname);
1727
1728 if ((flags & O_ACCMODE) != O_RDONLY &&
1729 (flags & O_ACCMODE) != O_RDWR)
1730 return -EINVAL;
1731
1732 if (!endswith(fname, ".journal"))
1733 return -EINVAL;
1734
1735 f = new0(JournalFile, 1);
1736 if (!f)
1737 return -ENOMEM;
1738
1739 f->fd = -1;
1740 f->flags = flags;
1741 f->mode = mode;
1742 f->writable = (flags & O_ACCMODE) != O_RDONLY;
1743 f->prot = prot_from_flags(flags);
1744
1745 if (template) {
1746 f->metrics = template->metrics;
1747 f->compress = template->compress;
1748 }
1749
1750 f->path = strdup(fname);
1751 if (!f->path) {
1752 r = -ENOMEM;
1753 goto fail;
1754 }
1755
1756 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
1757 if (f->fd < 0) {
1758 r = -errno;
1759 goto fail;
1760 }
1761
1762 if (fstat(f->fd, &f->last_stat) < 0) {
1763 r = -errno;
1764 goto fail;
1765 }
1766
1767 if (f->last_stat.st_size == 0 && f->writable) {
1768 newly_created = true;
1769
1770 r = journal_file_init_header(f, template);
1771 if (r < 0)
1772 goto fail;
1773
1774 if (fstat(f->fd, &f->last_stat) < 0) {
1775 r = -errno;
1776 goto fail;
1777 }
1778 }
1779
1780 if (f->last_stat.st_size < (off_t) sizeof(Header)) {
1781 r = -EIO;
1782 goto fail;
1783 }
1784
1785 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
1786 if (f->header == MAP_FAILED) {
1787 f->header = NULL;
1788 r = -errno;
1789 goto fail;
1790 }
1791
1792 if (!newly_created) {
1793 r = journal_file_verify_header(f);
1794 if (r < 0)
1795 goto fail;
1796 }
1797
1798 if (f->writable) {
1799 r = journal_file_refresh_header(f);
1800 if (r < 0)
1801 goto fail;
1802 }
1803
1804 if (newly_created) {
1805
1806 r = journal_file_setup_field_hash_table(f);
1807 if (r < 0)
1808 goto fail;
1809
1810 r = journal_file_setup_data_hash_table(f);
1811 if (r < 0)
1812 goto fail;
1813 }
1814
1815 r = journal_file_map_field_hash_table(f);
1816 if (r < 0)
1817 goto fail;
1818
1819 r = journal_file_map_data_hash_table(f);
1820 if (r < 0)
1821 goto fail;
1822
1823 if (ret)
1824 *ret = f;
1825
1826 return 0;
1827
1828 fail:
1829 journal_file_close(f);
1830
1831 return r;
1832 }
1833
1834 int journal_file_rotate(JournalFile **f) {
1835 char *p;
1836 size_t l;
1837 JournalFile *old_file, *new_file = NULL;
1838 int r;
1839
1840 assert(f);
1841 assert(*f);
1842
1843 old_file = *f;
1844
1845 if (!old_file->writable)
1846 return -EINVAL;
1847
1848 if (!endswith(old_file->path, ".journal"))
1849 return -EINVAL;
1850
1851 l = strlen(old_file->path);
1852
1853 p = new(char, l + 1 + 32 + 1 + 16 + 1 + 16 + 1);
1854 if (!p)
1855 return -ENOMEM;
1856
1857 memcpy(p, old_file->path, l - 8);
1858 p[l-8] = '@';
1859 sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
1860 snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
1861 "-%016llx-%016llx.journal",
1862 (unsigned long long) le64toh((*f)->header->seqnum),
1863 (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
1864
1865 r = rename(old_file->path, p);
1866 free(p);
1867
1868 if (r < 0)
1869 return -errno;
1870
1871 old_file->header->state = STATE_ARCHIVED;
1872
1873 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, old_file, &new_file);
1874 journal_file_close(old_file);
1875
1876 *f = new_file;
1877 return r;
1878 }
1879
1880 int journal_file_open_reliably(
1881 const char *fname,
1882 int flags,
1883 mode_t mode,
1884 JournalFile *template,
1885 JournalFile **ret) {
1886
1887 int r;
1888 size_t l;
1889 char *p;
1890
1891 r = journal_file_open(fname, flags, mode, template, ret);
1892 if (r != -EBADMSG && /* corrupted */
1893 r != -ENODATA && /* truncated */
1894 r != -EHOSTDOWN && /* other machine */
1895 r != -EPROTONOSUPPORT) /* incompatible feature */
1896 return r;
1897
1898 if ((flags & O_ACCMODE) == O_RDONLY)
1899 return r;
1900
1901 if (!(flags & O_CREAT))
1902 return r;
1903
1904 /* The file is corrupted. Rotate it away and try it again (but only once) */
1905
1906 l = strlen(fname);
1907 if (asprintf(&p, "%.*s@%016llx-%016llx.journal~",
1908 (int) (l-8), fname,
1909 (unsigned long long) now(CLOCK_REALTIME),
1910 random_ull()) < 0)
1911 return -ENOMEM;
1912
1913 r = rename(fname, p);
1914 free(p);
1915 if (r < 0)
1916 return -errno;
1917
1918 log_warning("File %s corrupted, renaming and replacing.", fname);
1919
1920 return journal_file_open(fname, flags, mode, template, ret);
1921 }
1922
1923 struct vacuum_info {
1924 off_t usage;
1925 char *filename;
1926
1927 uint64_t realtime;
1928 sd_id128_t seqnum_id;
1929 uint64_t seqnum;
1930
1931 bool have_seqnum;
1932 };
1933
1934 static int vacuum_compare(const void *_a, const void *_b) {
1935 const struct vacuum_info *a, *b;
1936
1937 a = _a;
1938 b = _b;
1939
1940 if (a->have_seqnum && b->have_seqnum &&
1941 sd_id128_equal(a->seqnum_id, b->seqnum_id)) {
1942 if (a->seqnum < b->seqnum)
1943 return -1;
1944 else if (a->seqnum > b->seqnum)
1945 return 1;
1946 else
1947 return 0;
1948 }
1949
1950 if (a->realtime < b->realtime)
1951 return -1;
1952 else if (a->realtime > b->realtime)
1953 return 1;
1954 else if (a->have_seqnum && b->have_seqnum)
1955 return memcmp(&a->seqnum_id, &b->seqnum_id, 16);
1956 else
1957 return strcmp(a->filename, b->filename);
1958 }
1959
1960 int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t min_free) {
1961 DIR *d;
1962 int r = 0;
1963 struct vacuum_info *list = NULL;
1964 unsigned n_list = 0, n_allocated = 0, i;
1965 uint64_t sum = 0;
1966
1967 assert(directory);
1968
1969 if (max_use <= 0)
1970 return 0;
1971
1972 d = opendir(directory);
1973 if (!d)
1974 return -errno;
1975
1976 for (;;) {
1977 int k;
1978 struct dirent buf, *de;
1979 size_t q;
1980 struct stat st;
1981 char *p;
1982 unsigned long long seqnum = 0, realtime;
1983 sd_id128_t seqnum_id;
1984 bool have_seqnum;
1985
1986 k = readdir_r(d, &buf, &de);
1987 if (k != 0) {
1988 r = -k;
1989 goto finish;
1990 }
1991
1992 if (!de)
1993 break;
1994
1995 if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0)
1996 continue;
1997
1998 if (!S_ISREG(st.st_mode))
1999 continue;
2000
2001 q = strlen(de->d_name);
2002
2003 if (endswith(de->d_name, ".journal")) {
2004
2005 /* Vacuum archived files */
2006
2007 if (q < 1 + 32 + 1 + 16 + 1 + 16 + 8)
2008 continue;
2009
2010 if (de->d_name[q-8-16-1] != '-' ||
2011 de->d_name[q-8-16-1-16-1] != '-' ||
2012 de->d_name[q-8-16-1-16-1-32-1] != '@')
2013 continue;
2014
2015 p = strdup(de->d_name);
2016 if (!p) {
2017 r = -ENOMEM;
2018 goto finish;
2019 }
2020
2021 de->d_name[q-8-16-1-16-1] = 0;
2022 if (sd_id128_from_string(de->d_name + q-8-16-1-16-1-32, &seqnum_id) < 0) {
2023 free(p);
2024 continue;
2025 }
2026
2027 if (sscanf(de->d_name + q-8-16-1-16, "%16llx-%16llx.journal", &seqnum, &realtime) != 2) {
2028 free(p);
2029 continue;
2030 }
2031
2032 have_seqnum = true;
2033
2034 } else if (endswith(de->d_name, ".journal~")) {
2035 unsigned long long tmp;
2036
2037 /* Vacuum corrupted files */
2038
2039 if (q < 1 + 16 + 1 + 16 + 8 + 1)
2040 continue;
2041
2042 if (de->d_name[q-1-8-16-1] != '-' ||
2043 de->d_name[q-1-8-16-1-16-1] != '@')
2044 continue;
2045
2046 p = strdup(de->d_name);
2047 if (!p) {
2048 r = -ENOMEM;
2049 goto finish;
2050 }
2051
2052 if (sscanf(de->d_name + q-1-8-16-1-16, "%16llx-%16llx.journal~", &realtime, &tmp) != 2) {
2053 free(p);
2054 continue;
2055 }
2056
2057 have_seqnum = false;
2058 } else
2059 continue;
2060
2061 if (n_list >= n_allocated) {
2062 struct vacuum_info *j;
2063
2064 n_allocated = MAX(n_allocated * 2U, 8U);
2065 j = realloc(list, n_allocated * sizeof(struct vacuum_info));
2066 if (!j) {
2067 free(p);
2068 r = -ENOMEM;
2069 goto finish;
2070 }
2071
2072 list = j;
2073 }
2074
2075 list[n_list].filename = p;
2076 list[n_list].usage = 512UL * (uint64_t) st.st_blocks;
2077 list[n_list].seqnum = seqnum;
2078 list[n_list].realtime = realtime;
2079 list[n_list].seqnum_id = seqnum_id;
2080 list[n_list].have_seqnum = have_seqnum;
2081
2082 sum += list[n_list].usage;
2083
2084 n_list ++;
2085 }
2086
2087 qsort(list, n_list, sizeof(struct vacuum_info), vacuum_compare);
2088
2089 for(i = 0; i < n_list; i++) {
2090 struct statvfs ss;
2091
2092 if (fstatvfs(dirfd(d), &ss) < 0) {
2093 r = -errno;
2094 goto finish;
2095 }
2096
2097 if (sum <= max_use &&
2098 (uint64_t) ss.f_bavail * (uint64_t) ss.f_bsize >= min_free)
2099 break;
2100
2101 if (unlinkat(dirfd(d), list[i].filename, 0) >= 0) {
2102 log_info("Deleted archived journal %s/%s.", directory, list[i].filename);
2103 sum -= list[i].usage;
2104 } else if (errno != ENOENT)
2105 log_warning("Failed to delete %s/%s: %m", directory, list[i].filename);
2106 }
2107
2108 finish:
2109 for (i = 0; i < n_list; i++)
2110 free(list[i].filename);
2111
2112 free(list);
2113
2114 if (d)
2115 closedir(d);
2116
2117 return r;
2118 }
2119
2120 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2121 uint64_t i, n;
2122 uint64_t q, xor_hash = 0;
2123 int r;
2124 EntryItem *items;
2125 dual_timestamp ts;
2126
2127 assert(from);
2128 assert(to);
2129 assert(o);
2130 assert(p);
2131
2132 if (!to->writable)
2133 return -EPERM;
2134
2135 ts.monotonic = le64toh(o->entry.monotonic);
2136 ts.realtime = le64toh(o->entry.realtime);
2137
2138 if (to->tail_entry_monotonic_valid &&
2139 ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2140 return -EINVAL;
2141
2142 if (ts.realtime < le64toh(to->header->tail_entry_realtime))
2143 return -EINVAL;
2144
2145 n = journal_file_entry_n_items(o);
2146 items = alloca(sizeof(EntryItem) * n);
2147
2148 for (i = 0; i < n; i++) {
2149 uint64_t l, h;
2150 le64_t le_hash;
2151 size_t t;
2152 void *data;
2153 Object *u;
2154
2155 q = le64toh(o->entry.items[i].object_offset);
2156 le_hash = o->entry.items[i].hash;
2157
2158 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2159 if (r < 0)
2160 return r;
2161
2162 if (le_hash != o->data.hash)
2163 return -EBADMSG;
2164
2165 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2166 t = (size_t) l;
2167
2168 /* We hit the limit on 32bit machines */
2169 if ((uint64_t) t != l)
2170 return -E2BIG;
2171
2172 if (o->object.flags & OBJECT_COMPRESSED) {
2173 #ifdef HAVE_XZ
2174 uint64_t rsize;
2175
2176 if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2177 return -EBADMSG;
2178
2179 data = from->compress_buffer;
2180 l = rsize;
2181 #else
2182 return -EPROTONOSUPPORT;
2183 #endif
2184 } else
2185 data = o->data.payload;
2186
2187 r = journal_file_append_data(to, data, l, &u, &h);
2188 if (r < 0)
2189 return r;
2190
2191 xor_hash ^= le64toh(u->data.hash);
2192 items[i].object_offset = htole64(h);
2193 items[i].hash = u->data.hash;
2194
2195 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2196 if (r < 0)
2197 return r;
2198 }
2199
2200 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2201 }
2202
2203 void journal_default_metrics(JournalMetrics *m, int fd) {
2204 uint64_t fs_size = 0;
2205 struct statvfs ss;
2206 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2207
2208 assert(m);
2209 assert(fd >= 0);
2210
2211 if (fstatvfs(fd, &ss) >= 0)
2212 fs_size = ss.f_frsize * ss.f_blocks;
2213
2214 if (m->max_use == (uint64_t) -1) {
2215
2216 if (fs_size > 0) {
2217 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2218
2219 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2220 m->max_use = DEFAULT_MAX_USE_UPPER;
2221
2222 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2223 m->max_use = DEFAULT_MAX_USE_LOWER;
2224 } else
2225 m->max_use = DEFAULT_MAX_USE_LOWER;
2226 } else {
2227 m->max_use = PAGE_ALIGN(m->max_use);
2228
2229 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2230 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2231 }
2232
2233 if (m->max_size == (uint64_t) -1) {
2234 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2235
2236 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2237 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2238 } else
2239 m->max_size = PAGE_ALIGN(m->max_size);
2240
2241 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2242 m->max_size = JOURNAL_FILE_SIZE_MIN;
2243
2244 if (m->max_size*2 > m->max_use)
2245 m->max_use = m->max_size*2;
2246
2247 if (m->min_size == (uint64_t) -1)
2248 m->min_size = JOURNAL_FILE_SIZE_MIN;
2249 else {
2250 m->min_size = PAGE_ALIGN(m->min_size);
2251
2252 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2253 m->min_size = JOURNAL_FILE_SIZE_MIN;
2254
2255 if (m->min_size > m->max_size)
2256 m->max_size = m->min_size;
2257 }
2258
2259 if (m->keep_free == (uint64_t) -1) {
2260
2261 if (fs_size > 0) {
2262 m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2263
2264 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2265 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2266
2267 } else
2268 m->keep_free = DEFAULT_KEEP_FREE;
2269 }
2270
2271 log_info("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2272 format_bytes(a, sizeof(a), m->max_use),
2273 format_bytes(b, sizeof(b), m->max_size),
2274 format_bytes(c, sizeof(c), m->min_size),
2275 format_bytes(d, sizeof(d), m->keep_free));
2276 }