]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/journal/journal-file.c
journal: make sure to refresh window position and pointer after we linked up a data...
[thirdparty/systemd.git] / src / journal / journal-file.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2011 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <sys/mman.h>
23 #include <errno.h>
24 #include <sys/uio.h>
25 #include <unistd.h>
26 #include <sys/statvfs.h>
27 #include <fcntl.h>
28 #include <stddef.h>
29
30 #include "journal-def.h"
31 #include "journal-file.h"
32 #include "lookup3.h"
33 #include "compress.h"
34
/* Size in bytes of the item area of the data and field hash tables
 * that are created for a new file */
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*16ULL)
#define DEFAULT_FIELD_HASH_TABLE_SIZE (2047ULL*16ULL)

/* Size we extend a mapping window to if less than this was asked for */
#define DEFAULT_WINDOW_SIZE (128ULL*1024ULL*1024ULL)

/* Data payloads smaller than this are never handed to the compressor */
#define COMPRESSION_SIZE_THRESHOLD (512ULL)

/* This is the minimum journal file size */
#define JOURNAL_FILE_SIZE_MIN (64ULL*1024ULL)

/* These are the lower and upper bounds if we deduce the max_use value
 * from the file system size */
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL)           /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL)   /* 4 GiB */

/* This is the upper bound if we deduce max_size from max_use */
#define DEFAULT_MAX_SIZE_UPPER (16ULL*1024ULL*1024ULL)         /* 16 MiB */

/* This is the upper bound if we deduce the keep_free value from the
 * file system size */
#define DEFAULT_KEEP_FREE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */

/* This is the keep_free value when we can't determine the system
 * size */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL)                    /* 1 MiB */

/* The eight magic bytes every journal file header starts with */
static const char signature[] = { 'L', 'P', 'K', 'S', 'H', 'H', 'R', 'H' };

/* Rounds x up to the next multiple of 8 */
#define ALIGN64(x) (((x) + 7ULL) & ~7ULL)
64
65 void journal_file_close(JournalFile *f) {
66 int t;
67
68 assert(f);
69
70 if (f->header && f->writable)
71 f->header->state = STATE_OFFLINE;
72
73
74 for (t = 0; t < _WINDOW_MAX; t++)
75 if (f->windows[t].ptr)
76 munmap(f->windows[t].ptr, f->windows[t].size);
77
78 if (f->fd >= 0)
79 close_nointr_nofail(f->fd);
80
81 free(f->path);
82
83 #ifdef HAVE_XZ
84 free(f->compress_buffer);
85 #endif
86
87 free(f);
88 }
89
90 static int journal_file_init_header(JournalFile *f, JournalFile *template) {
91 Header h;
92 ssize_t k;
93 int r;
94
95 assert(f);
96
97 zero(h);
98 memcpy(h.signature, signature, 8);
99 h.arena_offset = htole64(ALIGN64(sizeof(h)));
100
101 r = sd_id128_randomize(&h.file_id);
102 if (r < 0)
103 return r;
104
105 if (template) {
106 h.seqnum_id = template->header->seqnum_id;
107 h.seqnum = template->header->seqnum;
108 } else
109 h.seqnum_id = h.file_id;
110
111 k = pwrite(f->fd, &h, sizeof(h), 0);
112 if (k < 0)
113 return -errno;
114
115 if (k != sizeof(h))
116 return -EIO;
117
118 return 0;
119 }
120
121 static int journal_file_refresh_header(JournalFile *f) {
122 int r;
123 sd_id128_t boot_id;
124
125 assert(f);
126
127 r = sd_id128_get_machine(&f->header->machine_id);
128 if (r < 0)
129 return r;
130
131 r = sd_id128_get_boot(&boot_id);
132 if (r < 0)
133 return r;
134
135 if (sd_id128_equal(boot_id, f->header->boot_id))
136 f->tail_entry_monotonic_valid = true;
137
138 f->header->boot_id = boot_id;
139
140 f->header->state = STATE_ONLINE;
141
142 __sync_synchronize();
143
144 return 0;
145 }
146
147 static int journal_file_verify_header(JournalFile *f) {
148 assert(f);
149
150 if (memcmp(f->header, signature, 8))
151 return -EBADMSG;
152
153 #ifdef HAVE_XZ
154 if ((le64toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_COMPRESSED) != 0)
155 return -EPROTONOSUPPORT;
156 #else
157 if (f->header->incompatible_flags != 0)
158 return -EPROTONOSUPPORT;
159 #endif
160
161 if ((uint64_t) f->last_stat.st_size < (le64toh(f->header->arena_offset) + le64toh(f->header->arena_size)))
162 return -ENODATA;
163
164 if (f->writable) {
165 uint32_t state;
166 sd_id128_t machine_id;
167 int r;
168
169 r = sd_id128_get_machine(&machine_id);
170 if (r < 0)
171 return r;
172
173 if (!sd_id128_equal(machine_id, f->header->machine_id))
174 return -EHOSTDOWN;
175
176 state = f->header->state;
177
178 if (state == STATE_ONLINE)
179 log_debug("Journal file %s is already online. Assuming unclean closing. Ignoring.", f->path);
180 else if (state == STATE_ARCHIVED)
181 return -ESHUTDOWN;
182 else if (state != STATE_OFFLINE)
183 log_debug("Journal file %s has unknown state %u. Ignoring.", f->path, state);
184 }
185
186 return 0;
187 }
188
189 static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) {
190 uint64_t old_size, new_size;
191
192 assert(f);
193
194 /* We assume that this file is not sparse, and we know that
195 * for sure, since we always call posix_fallocate()
196 * ourselves */
197
198 old_size =
199 le64toh(f->header->arena_offset) +
200 le64toh(f->header->arena_size);
201
202 new_size = PAGE_ALIGN(offset + size);
203 if (new_size < le64toh(f->header->arena_offset))
204 new_size = le64toh(f->header->arena_offset);
205
206 if (new_size <= old_size)
207 return 0;
208
209 if (f->metrics.max_size > 0 &&
210 new_size > f->metrics.max_size)
211 return -E2BIG;
212
213 if (new_size > f->metrics.min_size &&
214 f->metrics.keep_free > 0) {
215 struct statvfs svfs;
216
217 if (fstatvfs(f->fd, &svfs) >= 0) {
218 uint64_t available;
219
220 available = svfs.f_bfree * svfs.f_bsize;
221
222 if (available >= f->metrics.keep_free)
223 available -= f->metrics.keep_free;
224 else
225 available = 0;
226
227 if (new_size - old_size > available)
228 return -E2BIG;
229 }
230 }
231
232 /* Note that the glibc fallocate() fallback is very
233 inefficient, hence we try to minimize the allocation area
234 as we can. */
235 if (posix_fallocate(f->fd, old_size, new_size - old_size) < 0)
236 return -errno;
237
238 if (fstat(f->fd, &f->last_stat) < 0)
239 return -errno;
240
241 f->header->arena_size = htole64(new_size - le64toh(f->header->arena_offset));
242
243 return 0;
244 }
245
246 static int journal_file_map(
247 JournalFile *f,
248 uint64_t offset,
249 uint64_t size,
250 void **_window,
251 uint64_t *_woffset,
252 uint64_t *_wsize,
253 void **ret) {
254
255 uint64_t woffset, wsize;
256 void *window;
257
258 assert(f);
259 assert(size > 0);
260 assert(ret);
261
262 woffset = offset & ~((uint64_t) page_size() - 1ULL);
263 wsize = size + (offset - woffset);
264 wsize = PAGE_ALIGN(wsize);
265
266 /* Avoid SIGBUS on invalid accesses */
267 if (woffset + wsize > (uint64_t) PAGE_ALIGN(f->last_stat.st_size))
268 return -EADDRNOTAVAIL;
269
270 window = mmap(NULL, wsize, f->prot, MAP_SHARED, f->fd, woffset);
271 if (window == MAP_FAILED)
272 return -errno;
273
274 if (_window)
275 *_window = window;
276
277 if (_woffset)
278 *_woffset = woffset;
279
280 if (_wsize)
281 *_wsize = wsize;
282
283 *ret = (uint8_t*) window + (offset - woffset);
284
285 return 0;
286 }
287
288 static int journal_file_move_to(JournalFile *f, int wt, uint64_t offset, uint64_t size, void **ret) {
289 void *p = NULL;
290 uint64_t delta;
291 int r;
292 Window *w;
293
294 assert(f);
295 assert(ret);
296 assert(wt >= 0);
297 assert(wt < _WINDOW_MAX);
298
299 if (offset + size > (uint64_t) f->last_stat.st_size) {
300 /* Hmm, out of range? Let's refresh the fstat() data
301 * first, before we trust that check. */
302
303 if (fstat(f->fd, &f->last_stat) < 0 ||
304 offset + size > (uint64_t) f->last_stat.st_size)
305 return -EADDRNOTAVAIL;
306 }
307
308 w = f->windows + wt;
309
310 if (_likely_(w->ptr &&
311 w->offset <= offset &&
312 w->offset + w->size >= offset + size)) {
313
314 *ret = (uint8_t*) w->ptr + (offset - w->offset);
315 return 0;
316 }
317
318 if (w->ptr) {
319 if (munmap(w->ptr, w->size) < 0)
320 return -errno;
321
322 w->ptr = NULL;
323 w->size = w->offset = 0;
324 }
325
326 if (size < DEFAULT_WINDOW_SIZE) {
327 /* If the default window size is larger then what was
328 * asked for extend the mapping a bit in the hope to
329 * minimize needed remappings later on. We add half
330 * the window space before and half behind the
331 * requested mapping */
332
333 delta = (DEFAULT_WINDOW_SIZE - size) / 2;
334
335 if (delta > offset)
336 delta = offset;
337
338 offset -= delta;
339 size = DEFAULT_WINDOW_SIZE;
340 } else
341 delta = 0;
342
343 if (offset + size > (uint64_t) f->last_stat.st_size)
344 size = (uint64_t) f->last_stat.st_size - offset;
345
346 if (size <= 0)
347 return -EADDRNOTAVAIL;
348
349 r = journal_file_map(f,
350 offset, size,
351 &w->ptr, &w->offset, &w->size,
352 &p);
353
354 if (r < 0)
355 return r;
356
357 *ret = (uint8_t*) p + delta;
358 return 0;
359 }
360
361 static bool verify_hash(Object *o) {
362 uint64_t h1, h2;
363
364 assert(o);
365
366 if (o->object.type == OBJECT_DATA && !(o->object.flags & OBJECT_COMPRESSED)) {
367 h1 = le64toh(o->data.hash);
368 h2 = hash64(o->data.payload, le64toh(o->object.size) - offsetof(Object, data.payload));
369 } else if (o->object.type == OBJECT_FIELD) {
370 h1 = le64toh(o->field.hash);
371 h2 = hash64(o->field.payload, le64toh(o->object.size) - offsetof(Object, field.payload));
372 } else
373 return true;
374
375 return h1 == h2;
376 }
377
378 int journal_file_move_to_object(JournalFile *f, int type, uint64_t offset, Object **ret) {
379 int r;
380 void *t;
381 Object *o;
382 uint64_t s;
383
384 assert(f);
385 assert(ret);
386 assert(type < _OBJECT_TYPE_MAX);
387
388 r = journal_file_move_to(f, type >= 0 ? type : WINDOW_UNKNOWN, offset, sizeof(ObjectHeader), &t);
389 if (r < 0)
390 return r;
391
392 o = (Object*) t;
393 s = le64toh(o->object.size);
394
395 if (s < sizeof(ObjectHeader))
396 return -EBADMSG;
397
398 if (type >= 0 && o->object.type != type)
399 return -EBADMSG;
400
401 if (s > sizeof(ObjectHeader)) {
402 r = journal_file_move_to(f, o->object.type, offset, s, &t);
403 if (r < 0)
404 return r;
405
406 o = (Object*) t;
407 }
408
409 if (!verify_hash(o))
410 return -EBADMSG;
411
412 *ret = o;
413 return 0;
414 }
415
416 static uint64_t journal_file_seqnum(JournalFile *f, uint64_t *seqnum) {
417 uint64_t r;
418
419 assert(f);
420
421 r = le64toh(f->header->seqnum) + 1;
422
423 if (seqnum) {
424 /* If an external seqnum counter was passed, we update
425 * both the local and the external one, and set it to
426 * the maximum of both */
427
428 if (*seqnum + 1 > r)
429 r = *seqnum + 1;
430
431 *seqnum = r;
432 }
433
434 f->header->seqnum = htole64(r);
435
436 if (f->header->first_seqnum == 0)
437 f->header->first_seqnum = htole64(r);
438
439 return r;
440 }
441
442 static int journal_file_append_object(JournalFile *f, int type, uint64_t size, Object **ret, uint64_t *offset) {
443 int r;
444 uint64_t p;
445 Object *tail, *o;
446 void *t;
447
448 assert(f);
449 assert(size >= sizeof(ObjectHeader));
450 assert(offset);
451 assert(ret);
452
453 p = le64toh(f->header->tail_object_offset);
454 if (p == 0)
455 p = le64toh(f->header->arena_offset);
456 else {
457 r = journal_file_move_to_object(f, -1, p, &tail);
458 if (r < 0)
459 return r;
460
461 p += ALIGN64(le64toh(tail->object.size));
462 }
463
464 r = journal_file_allocate(f, p, size);
465 if (r < 0)
466 return r;
467
468 r = journal_file_move_to(f, type, p, size, &t);
469 if (r < 0)
470 return r;
471
472 o = (Object*) t;
473
474 zero(o->object);
475 o->object.type = type;
476 o->object.size = htole64(size);
477
478 f->header->tail_object_offset = htole64(p);
479 f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1);
480
481 *ret = o;
482 *offset = p;
483
484 return 0;
485 }
486
487 static int journal_file_setup_data_hash_table(JournalFile *f) {
488 uint64_t s, p;
489 Object *o;
490 int r;
491
492 assert(f);
493
494 s = DEFAULT_DATA_HASH_TABLE_SIZE;
495 r = journal_file_append_object(f,
496 OBJECT_DATA_HASH_TABLE,
497 offsetof(Object, hash_table.items) + s,
498 &o, &p);
499 if (r < 0)
500 return r;
501
502 memset(o->hash_table.items, 0, s);
503
504 f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
505 f->header->data_hash_table_size = htole64(s);
506
507 return 0;
508 }
509
510 static int journal_file_setup_field_hash_table(JournalFile *f) {
511 uint64_t s, p;
512 Object *o;
513 int r;
514
515 assert(f);
516
517 s = DEFAULT_FIELD_HASH_TABLE_SIZE;
518 r = journal_file_append_object(f,
519 OBJECT_FIELD_HASH_TABLE,
520 offsetof(Object, hash_table.items) + s,
521 &o, &p);
522 if (r < 0)
523 return r;
524
525 memset(o->hash_table.items, 0, s);
526
527 f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items));
528 f->header->field_hash_table_size = htole64(s);
529
530 return 0;
531 }
532
533 static int journal_file_map_data_hash_table(JournalFile *f) {
534 uint64_t s, p;
535 void *t;
536 int r;
537
538 assert(f);
539
540 p = le64toh(f->header->data_hash_table_offset);
541 s = le64toh(f->header->data_hash_table_size);
542
543 r = journal_file_move_to(f,
544 WINDOW_DATA_HASH_TABLE,
545 p, s,
546 &t);
547 if (r < 0)
548 return r;
549
550 f->data_hash_table = t;
551 return 0;
552 }
553
554 static int journal_file_map_field_hash_table(JournalFile *f) {
555 uint64_t s, p;
556 void *t;
557 int r;
558
559 assert(f);
560
561 p = le64toh(f->header->field_hash_table_offset);
562 s = le64toh(f->header->field_hash_table_size);
563
564 r = journal_file_move_to(f,
565 WINDOW_FIELD_HASH_TABLE,
566 p, s,
567 &t);
568 if (r < 0)
569 return r;
570
571 f->field_hash_table = t;
572 return 0;
573 }
574
575 static int journal_file_link_data(JournalFile *f, Object *o, uint64_t offset, uint64_t hash) {
576 uint64_t p, h;
577 int r;
578
579 assert(f);
580 assert(o);
581 assert(offset > 0);
582 assert(o->object.type == OBJECT_DATA);
583
584 /* This might alter the window we are looking at */
585
586 o->data.next_hash_offset = o->data.next_field_offset = 0;
587 o->data.entry_offset = o->data.entry_array_offset = 0;
588 o->data.n_entries = 0;
589
590 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
591 p = le64toh(f->data_hash_table[h].head_hash_offset);
592 if (p == 0) {
593 /* Only entry in the hash table is easy */
594 f->data_hash_table[h].head_hash_offset = htole64(offset);
595 } else {
596 /* Move back to the previous data object, to patch in
597 * pointer */
598
599 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
600 if (r < 0)
601 return r;
602
603 o->data.next_hash_offset = htole64(offset);
604 }
605
606 f->data_hash_table[h].tail_hash_offset = htole64(offset);
607
608 return 0;
609 }
610
611 int journal_file_find_data_object_with_hash(
612 JournalFile *f,
613 const void *data, uint64_t size, uint64_t hash,
614 Object **ret, uint64_t *offset) {
615
616 uint64_t p, osize, h;
617 int r;
618
619 assert(f);
620 assert(data || size == 0);
621
622 osize = offsetof(Object, data.payload) + size;
623
624 if (f->header->data_hash_table_size == 0)
625 return -EBADMSG;
626
627 h = hash % (le64toh(f->header->data_hash_table_size) / sizeof(HashItem));
628 p = le64toh(f->data_hash_table[h].head_hash_offset);
629
630 while (p > 0) {
631 Object *o;
632
633 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
634 if (r < 0)
635 return r;
636
637 if (le64toh(o->data.hash) != hash)
638 goto next;
639
640 if (o->object.flags & OBJECT_COMPRESSED) {
641 #ifdef HAVE_XZ
642 uint64_t l, rsize;
643
644 l = le64toh(o->object.size);
645 if (l <= offsetof(Object, data.payload))
646 return -EBADMSG;
647
648 l -= offsetof(Object, data.payload);
649
650 if (!uncompress_blob(o->data.payload, l, &f->compress_buffer, &f->compress_buffer_size, &rsize))
651 return -EBADMSG;
652
653 if (rsize == size &&
654 memcmp(f->compress_buffer, data, size) == 0) {
655
656 if (ret)
657 *ret = o;
658
659 if (offset)
660 *offset = p;
661
662 return 1;
663 }
664 #else
665 return -EPROTONOSUPPORT;
666 #endif
667
668 } else if (le64toh(o->object.size) == osize &&
669 memcmp(o->data.payload, data, size) == 0) {
670
671 if (ret)
672 *ret = o;
673
674 if (offset)
675 *offset = p;
676
677 return 1;
678 }
679
680 next:
681 p = le64toh(o->data.next_hash_offset);
682 }
683
684 return 0;
685 }
686
687 int journal_file_find_data_object(
688 JournalFile *f,
689 const void *data, uint64_t size,
690 Object **ret, uint64_t *offset) {
691
692 uint64_t hash;
693
694 assert(f);
695 assert(data || size == 0);
696
697 hash = hash64(data, size);
698
699 return journal_file_find_data_object_with_hash(f,
700 data, size, hash,
701 ret, offset);
702 }
703
704 static int journal_file_append_data(
705 JournalFile *f,
706 const void *data, uint64_t size,
707 Object **ret, uint64_t *offset) {
708
709 uint64_t hash, p;
710 uint64_t osize;
711 Object *o;
712 int r;
713 bool compressed = false;
714
715 assert(f);
716 assert(data || size == 0);
717
718 hash = hash64(data, size);
719
720 r = journal_file_find_data_object_with_hash(f, data, size, hash, &o, &p);
721 if (r < 0)
722 return r;
723 else if (r > 0) {
724
725 if (ret)
726 *ret = o;
727
728 if (offset)
729 *offset = p;
730
731 return 0;
732 }
733
734 osize = offsetof(Object, data.payload) + size;
735 r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p);
736 if (r < 0)
737 return r;
738
739 o->data.hash = htole64(hash);
740
741 #ifdef HAVE_XZ
742 if (f->compress &&
743 size >= COMPRESSION_SIZE_THRESHOLD) {
744 uint64_t rsize;
745
746 compressed = compress_blob(data, size, o->data.payload, &rsize);
747
748 if (compressed) {
749 o->object.size = htole64(offsetof(Object, data.payload) + rsize);
750 o->object.flags |= OBJECT_COMPRESSED;
751
752 f->header->incompatible_flags = htole32(le32toh(f->header->incompatible_flags) | HEADER_INCOMPATIBLE_COMPRESSED);
753
754 log_debug("Compressed data object %lu -> %lu", (unsigned long) size, (unsigned long) rsize);
755 }
756 }
757 #endif
758
759 if (!compressed)
760 memcpy(o->data.payload, data, size);
761
762 r = journal_file_link_data(f, o, p, hash);
763 if (r < 0)
764 return r;
765
766 /* The linking might have altered the window, so let's
767 * refresh our pointer */
768 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
769 if (r < 0)
770 return r;
771
772 if (ret)
773 *ret = o;
774
775 if (offset)
776 *offset = p;
777
778 return 0;
779 }
780
781 uint64_t journal_file_entry_n_items(Object *o) {
782 assert(o);
783 assert(o->object.type == OBJECT_ENTRY);
784
785 return (le64toh(o->object.size) - offsetof(Object, entry.items)) / sizeof(EntryItem);
786 }
787
788 static uint64_t journal_file_entry_array_n_items(Object *o) {
789 assert(o);
790 assert(o->object.type == OBJECT_ENTRY_ARRAY);
791
792 return (le64toh(o->object.size) - offsetof(Object, entry_array.items)) / sizeof(uint64_t);
793 }
794
795 static int link_entry_into_array(JournalFile *f,
796 uint64_t *first,
797 uint64_t *idx,
798 uint64_t p) {
799 int r;
800 uint64_t n = 0, ap = 0, q, i, a, hidx;
801 Object *o;
802
803 assert(f);
804 assert(first);
805 assert(idx);
806 assert(p > 0);
807
808 a = le64toh(*first);
809 i = hidx = le64toh(*idx);
810 while (a > 0) {
811
812 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
813 if (r < 0)
814 return r;
815
816 n = journal_file_entry_array_n_items(o);
817 if (i < n) {
818 o->entry_array.items[i] = htole64(p);
819 *idx = htole64(hidx + 1);
820 return 0;
821 }
822
823 i -= n;
824 ap = a;
825 a = le64toh(o->entry_array.next_entry_array_offset);
826 }
827
828 if (hidx > n)
829 n = (hidx+1) * 2;
830 else
831 n = n * 2;
832
833 if (n < 4)
834 n = 4;
835
836 r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY,
837 offsetof(Object, entry_array.items) + n * sizeof(uint64_t),
838 &o, &q);
839 if (r < 0)
840 return r;
841
842 o->entry_array.items[i] = htole64(p);
843
844 if (ap == 0)
845 *first = htole64(q);
846 else {
847 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o);
848 if (r < 0)
849 return r;
850
851 o->entry_array.next_entry_array_offset = htole64(q);
852 }
853
854 *idx = htole64(hidx + 1);
855
856 return 0;
857 }
858
859 static int link_entry_into_array_plus_one(JournalFile *f,
860 uint64_t *extra,
861 uint64_t *first,
862 uint64_t *idx,
863 uint64_t p) {
864
865 int r;
866
867 assert(f);
868 assert(extra);
869 assert(first);
870 assert(idx);
871 assert(p > 0);
872
873 if (*idx == 0)
874 *extra = htole64(p);
875 else {
876 uint64_t i;
877
878 i = htole64(le64toh(*idx) - 1);
879 r = link_entry_into_array(f, first, &i, p);
880 if (r < 0)
881 return r;
882 }
883
884 *idx = htole64(le64toh(*idx) + 1);
885 return 0;
886 }
887
888 static int journal_file_link_entry_item(JournalFile *f, Object *o, uint64_t offset, uint64_t i) {
889 uint64_t p;
890 int r;
891 assert(f);
892 assert(o);
893 assert(offset > 0);
894
895 p = le64toh(o->entry.items[i].object_offset);
896 if (p == 0)
897 return -EINVAL;
898
899 r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
900 if (r < 0)
901 return r;
902
903 return link_entry_into_array_plus_one(f,
904 &o->data.entry_offset,
905 &o->data.entry_array_offset,
906 &o->data.n_entries,
907 offset);
908 }
909
910 static int journal_file_link_entry(JournalFile *f, Object *o, uint64_t offset) {
911 uint64_t n, i;
912 int r;
913
914 assert(f);
915 assert(o);
916 assert(offset > 0);
917 assert(o->object.type == OBJECT_ENTRY);
918
919 __sync_synchronize();
920
921 /* Link up the entry itself */
922 r = link_entry_into_array(f,
923 &f->header->entry_array_offset,
924 &f->header->n_entries,
925 offset);
926 if (r < 0)
927 return r;
928
929 /* log_debug("=> %s seqnr=%lu n_entries=%lu", f->path, (unsigned long) o->entry.seqnum, (unsigned long) f->header->n_entries); */
930
931 if (f->header->head_entry_realtime == 0)
932 f->header->head_entry_realtime = o->entry.realtime;
933
934 f->header->tail_entry_realtime = o->entry.realtime;
935 f->header->tail_entry_monotonic = o->entry.monotonic;
936
937 f->tail_entry_monotonic_valid = true;
938
939 /* Link up the items */
940 n = journal_file_entry_n_items(o);
941 for (i = 0; i < n; i++) {
942 r = journal_file_link_entry_item(f, o, offset, i);
943 if (r < 0)
944 return r;
945 }
946
947 return 0;
948 }
949
950 static int journal_file_append_entry_internal(
951 JournalFile *f,
952 const dual_timestamp *ts,
953 uint64_t xor_hash,
954 const EntryItem items[], unsigned n_items,
955 uint64_t *seqnum,
956 Object **ret, uint64_t *offset) {
957 uint64_t np;
958 uint64_t osize;
959 Object *o;
960 int r;
961
962 assert(f);
963 assert(items || n_items == 0);
964 assert(ts);
965
966 osize = offsetof(Object, entry.items) + (n_items * sizeof(EntryItem));
967
968 r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np);
969 if (r < 0)
970 return r;
971
972 o->entry.seqnum = htole64(journal_file_seqnum(f, seqnum));
973 memcpy(o->entry.items, items, n_items * sizeof(EntryItem));
974 o->entry.realtime = htole64(ts->realtime);
975 o->entry.monotonic = htole64(ts->monotonic);
976 o->entry.xor_hash = htole64(xor_hash);
977 o->entry.boot_id = f->header->boot_id;
978
979 r = journal_file_link_entry(f, o, np);
980 if (r < 0)
981 return r;
982
983 if (ret)
984 *ret = o;
985
986 if (offset)
987 *offset = np;
988
989 return 0;
990 }
991
992 void journal_file_post_change(JournalFile *f) {
993 assert(f);
994
995 /* inotify() does not receive IN_MODIFY events from file
996 * accesses done via mmap(). After each access we hence
997 * trigger IN_MODIFY by truncating the journal file to its
998 * current size which triggers IN_MODIFY. */
999
1000 __sync_synchronize();
1001
1002 if (ftruncate(f->fd, f->last_stat.st_size) < 0)
1003 log_error("Failed to to truncate file to its own size: %m");
1004 }
1005
1006 int journal_file_append_entry(JournalFile *f, const dual_timestamp *ts, const struct iovec iovec[], unsigned n_iovec, uint64_t *seqnum, Object **ret, uint64_t *offset) {
1007 unsigned i;
1008 EntryItem *items;
1009 int r;
1010 uint64_t xor_hash = 0;
1011 struct dual_timestamp _ts;
1012
1013 assert(f);
1014 assert(iovec || n_iovec == 0);
1015
1016 if (!f->writable)
1017 return -EPERM;
1018
1019 if (!ts) {
1020 dual_timestamp_get(&_ts);
1021 ts = &_ts;
1022 }
1023
1024 if (f->tail_entry_monotonic_valid &&
1025 ts->monotonic < le64toh(f->header->tail_entry_monotonic))
1026 return -EINVAL;
1027
1028 items = alloca(sizeof(EntryItem) * n_iovec);
1029
1030 for (i = 0; i < n_iovec; i++) {
1031 uint64_t p;
1032 Object *o;
1033
1034 r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p);
1035 if (r < 0)
1036 return r;
1037
1038 xor_hash ^= le64toh(o->data.hash);
1039 items[i].object_offset = htole64(p);
1040 items[i].hash = o->data.hash;
1041 }
1042
1043 r = journal_file_append_entry_internal(f, ts, xor_hash, items, n_iovec, seqnum, ret, offset);
1044
1045 journal_file_post_change(f);
1046
1047 return r;
1048 }
1049
1050 static int generic_array_get(JournalFile *f,
1051 uint64_t first,
1052 uint64_t i,
1053 Object **ret, uint64_t *offset) {
1054
1055 Object *o;
1056 uint64_t p = 0, a;
1057 int r;
1058
1059 assert(f);
1060
1061 a = first;
1062 while (a > 0) {
1063 uint64_t n;
1064
1065 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o);
1066 if (r < 0)
1067 return r;
1068
1069 n = journal_file_entry_array_n_items(o);
1070 if (i < n) {
1071 p = le64toh(o->entry_array.items[i]);
1072 break;
1073 }
1074
1075 i -= n;
1076 a = le64toh(o->entry_array.next_entry_array_offset);
1077 }
1078
1079 if (a <= 0 || p <= 0)
1080 return 0;
1081
1082 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1083 if (r < 0)
1084 return r;
1085
1086 if (ret)
1087 *ret = o;
1088
1089 if (offset)
1090 *offset = p;
1091
1092 return 1;
1093 }
1094
1095 static int generic_array_get_plus_one(JournalFile *f,
1096 uint64_t extra,
1097 uint64_t first,
1098 uint64_t i,
1099 Object **ret, uint64_t *offset) {
1100
1101 Object *o;
1102
1103 assert(f);
1104
1105 if (i == 0) {
1106 int r;
1107
1108 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1109 if (r < 0)
1110 return r;
1111
1112 if (ret)
1113 *ret = o;
1114
1115 if (offset)
1116 *offset = extra;
1117
1118 return 1;
1119 }
1120
1121 return generic_array_get(f, first, i-1, ret, offset);
1122 }
1123
/* Results of the test_object callbacks used for bisection. The tests
 * in this file return TEST_FOUND if the object matches the needle,
 * TEST_LEFT if the object sorts before the needle and TEST_RIGHT if
 * it sorts after it. */
enum {
        TEST_FOUND = 0,
        TEST_LEFT = 1,
        TEST_RIGHT = 2
};
1129
1130 static int generic_array_bisect(JournalFile *f,
1131 uint64_t first,
1132 uint64_t n,
1133 uint64_t needle,
1134 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1135 direction_t direction,
1136 Object **ret,
1137 uint64_t *offset,
1138 uint64_t *idx) {
1139
1140 uint64_t a, p, t = 0, i = 0, last_p = 0;
1141 bool subtract_one = false;
1142 Object *o, *array = NULL;
1143 int r;
1144
1145 assert(f);
1146 assert(test_object);
1147
1148 a = first;
1149 while (a > 0) {
1150 uint64_t left, right, k, lp;
1151
1152 r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array);
1153 if (r < 0)
1154 return r;
1155
1156 k = journal_file_entry_array_n_items(array);
1157 right = MIN(k, n);
1158 if (right <= 0)
1159 return 0;
1160
1161 i = right - 1;
1162 lp = p = le64toh(array->entry_array.items[i]);
1163 if (p <= 0)
1164 return -EBADMSG;
1165
1166 r = test_object(f, p, needle);
1167 if (r < 0)
1168 return r;
1169
1170 if (r == TEST_FOUND)
1171 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1172
1173 if (r == TEST_RIGHT) {
1174 left = 0;
1175 right -= 1;
1176 for (;;) {
1177 if (left == right) {
1178 if (direction == DIRECTION_UP)
1179 subtract_one = true;
1180
1181 i = left;
1182 goto found;
1183 }
1184
1185 assert(left < right);
1186
1187 i = (left + right) / 2;
1188 p = le64toh(array->entry_array.items[i]);
1189 if (p <= 0)
1190 return -EBADMSG;
1191
1192 r = test_object(f, p, needle);
1193 if (r < 0)
1194 return r;
1195
1196 if (r == TEST_FOUND)
1197 r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT;
1198
1199 if (r == TEST_RIGHT)
1200 right = i;
1201 else
1202 left = i + 1;
1203 }
1204 }
1205
1206 if (k > n)
1207 return 0;
1208
1209 last_p = lp;
1210
1211 n -= k;
1212 t += k;
1213 a = le64toh(array->entry_array.next_entry_array_offset);
1214 }
1215
1216 return 0;
1217
1218 found:
1219 if (subtract_one && t == 0 && i == 0)
1220 return 0;
1221
1222 if (subtract_one && i == 0)
1223 p = last_p;
1224 else if (subtract_one)
1225 p = le64toh(array->entry_array.items[i-1]);
1226 else
1227 p = le64toh(array->entry_array.items[i]);
1228
1229 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1230 if (r < 0)
1231 return r;
1232
1233 if (ret)
1234 *ret = o;
1235
1236 if (offset)
1237 *offset = p;
1238
1239 if (idx)
1240 *idx = t + i - (subtract_one ? 1 : 0);
1241
1242 return 1;
1243 }
1244
1245 static int generic_array_bisect_plus_one(JournalFile *f,
1246 uint64_t extra,
1247 uint64_t first,
1248 uint64_t n,
1249 uint64_t needle,
1250 int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle),
1251 direction_t direction,
1252 Object **ret,
1253 uint64_t *offset,
1254 uint64_t *idx) {
1255
1256 int r;
1257
1258 assert(f);
1259 assert(test_object);
1260
1261 if (n <= 0)
1262 return 0;
1263
1264 /* This bisects the array in object 'first', but first checks
1265 * an extra */
1266 r = test_object(f, extra, needle);
1267 if (r < 0)
1268 return r;
1269 else if (r == TEST_FOUND) {
1270 Object *o;
1271
1272 r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, &o);
1273 if (r < 0)
1274 return r;
1275
1276 if (ret)
1277 *ret = o;
1278
1279 if (offset)
1280 *offset = extra;
1281
1282 if (idx)
1283 *idx = 0;
1284
1285 return 1;
1286 } else if (r == TEST_RIGHT)
1287 return 0;
1288
1289 r = generic_array_bisect(f, first, n-1, needle, test_object, direction, ret, offset, idx);
1290
1291 if (r > 0)
1292 (*idx) ++;
1293
1294 return r;
1295 }
1296
1297 static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) {
1298 Object *o;
1299 int r;
1300
1301 assert(f);
1302 assert(p > 0);
1303
1304 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1305 if (r < 0)
1306 return r;
1307
1308 if (le64toh(o->entry.seqnum) == needle)
1309 return TEST_FOUND;
1310 else if (le64toh(o->entry.seqnum) < needle)
1311 return TEST_LEFT;
1312 else
1313 return TEST_RIGHT;
1314 }
1315
1316 int journal_file_move_to_entry_by_seqnum(
1317 JournalFile *f,
1318 uint64_t seqnum,
1319 direction_t direction,
1320 Object **ret,
1321 uint64_t *offset) {
1322
1323 return generic_array_bisect(f,
1324 le64toh(f->header->entry_array_offset),
1325 le64toh(f->header->n_entries),
1326 seqnum,
1327 test_object_seqnum,
1328 direction,
1329 ret, offset, NULL);
1330 }
1331
1332 static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) {
1333 Object *o;
1334 int r;
1335
1336 assert(f);
1337 assert(p > 0);
1338
1339 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1340 if (r < 0)
1341 return r;
1342
1343 if (le64toh(o->entry.realtime) == needle)
1344 return TEST_FOUND;
1345 else if (le64toh(o->entry.realtime) < needle)
1346 return TEST_LEFT;
1347 else
1348 return TEST_RIGHT;
1349 }
1350
1351 int journal_file_move_to_entry_by_realtime(
1352 JournalFile *f,
1353 uint64_t realtime,
1354 direction_t direction,
1355 Object **ret,
1356 uint64_t *offset) {
1357
1358 return generic_array_bisect(f,
1359 le64toh(f->header->entry_array_offset),
1360 le64toh(f->header->n_entries),
1361 realtime,
1362 test_object_realtime,
1363 direction,
1364 ret, offset, NULL);
1365 }
1366
1367 static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) {
1368 Object *o;
1369 int r;
1370
1371 assert(f);
1372 assert(p > 0);
1373
1374 r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o);
1375 if (r < 0)
1376 return r;
1377
1378 if (le64toh(o->entry.monotonic) == needle)
1379 return TEST_FOUND;
1380 else if (le64toh(o->entry.monotonic) < needle)
1381 return TEST_LEFT;
1382 else
1383 return TEST_RIGHT;
1384 }
1385
1386 int journal_file_move_to_entry_by_monotonic(
1387 JournalFile *f,
1388 sd_id128_t boot_id,
1389 uint64_t monotonic,
1390 direction_t direction,
1391 Object **ret,
1392 uint64_t *offset) {
1393
1394 char t[8+32+1] = "_BOOT_ID=";
1395 Object *o;
1396 int r;
1397
1398 sd_id128_to_string(boot_id, t + 8);
1399
1400 r = journal_file_find_data_object(f, t, strlen(t), &o, NULL);
1401 if (r < 0)
1402 return r;
1403 else if (r == 0)
1404 return -ENOENT;
1405
1406 return generic_array_bisect_plus_one(f,
1407 le64toh(o->data.entry_offset),
1408 le64toh(o->data.entry_array_offset),
1409 le64toh(o->data.n_entries),
1410 monotonic,
1411 test_object_monotonic,
1412 direction,
1413 ret, offset, NULL);
1414 }
1415
1416 static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) {
1417 assert(f);
1418 assert(p > 0);
1419
1420 if (p == needle)
1421 return TEST_FOUND;
1422 else if (p < needle)
1423 return TEST_LEFT;
1424 else
1425 return TEST_RIGHT;
1426 }
1427
1428 int journal_file_next_entry(
1429 JournalFile *f,
1430 Object *o, uint64_t p,
1431 direction_t direction,
1432 Object **ret, uint64_t *offset) {
1433
1434 uint64_t i, n;
1435 int r;
1436
1437 assert(f);
1438 assert(p > 0 || !o);
1439
1440 n = le64toh(f->header->n_entries);
1441 if (n <= 0)
1442 return 0;
1443
1444 if (!o)
1445 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1446 else {
1447 if (o->object.type != OBJECT_ENTRY)
1448 return -EINVAL;
1449
1450 r = generic_array_bisect(f,
1451 le64toh(f->header->entry_array_offset),
1452 le64toh(f->header->n_entries),
1453 p,
1454 test_object_offset,
1455 DIRECTION_DOWN,
1456 NULL, NULL,
1457 &i);
1458 if (r <= 0)
1459 return r;
1460
1461 if (direction == DIRECTION_DOWN) {
1462 if (i >= n - 1)
1463 return 0;
1464
1465 i++;
1466 } else {
1467 if (i <= 0)
1468 return 0;
1469
1470 i--;
1471 }
1472 }
1473
1474 /* And jump to it */
1475 return generic_array_get(f,
1476 le64toh(f->header->entry_array_offset),
1477 i,
1478 ret, offset);
1479 }
1480
1481 int journal_file_skip_entry(
1482 JournalFile *f,
1483 Object *o, uint64_t p,
1484 int64_t skip,
1485 Object **ret, uint64_t *offset) {
1486
1487 uint64_t i, n;
1488 int r;
1489
1490 assert(f);
1491 assert(o);
1492 assert(p > 0);
1493
1494 if (o->object.type != OBJECT_ENTRY)
1495 return -EINVAL;
1496
1497 r = generic_array_bisect(f,
1498 le64toh(f->header->entry_array_offset),
1499 le64toh(f->header->n_entries),
1500 p,
1501 test_object_offset,
1502 DIRECTION_DOWN,
1503 NULL, NULL,
1504 &i);
1505 if (r <= 0)
1506 return r;
1507
1508 /* Calculate new index */
1509 if (skip < 0) {
1510 if ((uint64_t) -skip >= i)
1511 i = 0;
1512 else
1513 i = i - (uint64_t) -skip;
1514 } else
1515 i += (uint64_t) skip;
1516
1517 n = le64toh(f->header->n_entries);
1518 if (n <= 0)
1519 return -EBADMSG;
1520
1521 if (i >= n)
1522 i = n-1;
1523
1524 return generic_array_get(f,
1525 le64toh(f->header->entry_array_offset),
1526 i,
1527 ret, offset);
1528 }
1529
1530 int journal_file_next_entry_for_data(
1531 JournalFile *f,
1532 Object *o, uint64_t p,
1533 uint64_t data_offset,
1534 direction_t direction,
1535 Object **ret, uint64_t *offset) {
1536
1537 uint64_t n, i;
1538 int r;
1539 Object *d;
1540
1541 assert(f);
1542 assert(p > 0 || !o);
1543
1544 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1545 if (r < 0)
1546 return r;
1547
1548 n = le64toh(d->data.n_entries);
1549 if (n <= 0)
1550 return n;
1551
1552 if (!o)
1553 i = direction == DIRECTION_DOWN ? 0 : n - 1;
1554 else {
1555 if (o->object.type != OBJECT_ENTRY)
1556 return -EINVAL;
1557
1558 r = generic_array_bisect_plus_one(f,
1559 le64toh(d->data.entry_offset),
1560 le64toh(d->data.entry_array_offset),
1561 le64toh(d->data.n_entries),
1562 p,
1563 test_object_offset,
1564 DIRECTION_DOWN,
1565 NULL, NULL,
1566 &i);
1567
1568 if (r <= 0)
1569 return r;
1570
1571 if (direction == DIRECTION_DOWN) {
1572 if (i >= n - 1)
1573 return 0;
1574
1575 i++;
1576 } else {
1577 if (i <= 0)
1578 return 0;
1579
1580 i--;
1581 }
1582
1583 }
1584
1585 return generic_array_get_plus_one(f,
1586 le64toh(d->data.entry_offset),
1587 le64toh(d->data.entry_array_offset),
1588 i,
1589 ret, offset);
1590 }
1591
1592 int journal_file_move_to_entry_by_seqnum_for_data(
1593 JournalFile *f,
1594 uint64_t data_offset,
1595 uint64_t seqnum,
1596 direction_t direction,
1597 Object **ret, uint64_t *offset) {
1598
1599 Object *d;
1600 int r;
1601
1602 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1603 if (r <= 0)
1604 return r;
1605
1606 return generic_array_bisect_plus_one(f,
1607 le64toh(d->data.entry_offset),
1608 le64toh(d->data.entry_array_offset),
1609 le64toh(d->data.n_entries),
1610 seqnum,
1611 test_object_seqnum,
1612 direction,
1613 ret, offset, NULL);
1614 }
1615
1616 int journal_file_move_to_entry_by_realtime_for_data(
1617 JournalFile *f,
1618 uint64_t data_offset,
1619 uint64_t realtime,
1620 direction_t direction,
1621 Object **ret, uint64_t *offset) {
1622
1623 Object *d;
1624 int r;
1625
1626 r = journal_file_move_to_object(f, OBJECT_DATA, data_offset, &d);
1627 if (r <= 0)
1628 return r;
1629
1630 return generic_array_bisect_plus_one(f,
1631 le64toh(d->data.entry_offset),
1632 le64toh(d->data.entry_array_offset),
1633 le64toh(d->data.n_entries),
1634 realtime,
1635 test_object_realtime,
1636 direction,
1637 ret, offset, NULL);
1638 }
1639
1640 void journal_file_dump(JournalFile *f) {
1641 char a[33], b[33], c[33];
1642 Object *o;
1643 int r;
1644 uint64_t p;
1645
1646 assert(f);
1647
1648 printf("File Path: %s\n"
1649 "File ID: %s\n"
1650 "Machine ID: %s\n"
1651 "Boot ID: %s\n"
1652 "Arena size: %llu\n"
1653 "Objects: %lu\n"
1654 "Entries: %lu\n",
1655 f->path,
1656 sd_id128_to_string(f->header->file_id, a),
1657 sd_id128_to_string(f->header->machine_id, b),
1658 sd_id128_to_string(f->header->boot_id, c),
1659 (unsigned long long) le64toh(f->header->arena_size),
1660 (unsigned long) le64toh(f->header->n_objects),
1661 (unsigned long) le64toh(f->header->n_entries));
1662
1663 p = le64toh(f->header->arena_offset);
1664 while (p != 0) {
1665 r = journal_file_move_to_object(f, -1, p, &o);
1666 if (r < 0)
1667 goto fail;
1668
1669 switch (o->object.type) {
1670
1671 case OBJECT_UNUSED:
1672 printf("Type: OBJECT_UNUSED\n");
1673 break;
1674
1675 case OBJECT_DATA:
1676 printf("Type: OBJECT_DATA\n");
1677 break;
1678
1679 case OBJECT_ENTRY:
1680 printf("Type: OBJECT_ENTRY %llu %llu %llu\n",
1681 (unsigned long long) le64toh(o->entry.seqnum),
1682 (unsigned long long) le64toh(o->entry.monotonic),
1683 (unsigned long long) le64toh(o->entry.realtime));
1684 break;
1685
1686 case OBJECT_FIELD_HASH_TABLE:
1687 printf("Type: OBJECT_FIELD_HASH_TABLE\n");
1688 break;
1689
1690 case OBJECT_DATA_HASH_TABLE:
1691 printf("Type: OBJECT_DATA_HASH_TABLE\n");
1692 break;
1693
1694 case OBJECT_ENTRY_ARRAY:
1695 printf("Type: OBJECT_ENTRY_ARRAY\n");
1696 break;
1697 }
1698
1699 if (o->object.flags & OBJECT_COMPRESSED)
1700 printf("Flags: COMPRESSED\n");
1701
1702 if (p == le64toh(f->header->tail_object_offset))
1703 p = 0;
1704 else
1705 p = p + ALIGN64(le64toh(o->object.size));
1706 }
1707
1708 return;
1709 fail:
1710 log_error("File corrupt");
1711 }
1712
1713 int journal_file_open(
1714 const char *fname,
1715 int flags,
1716 mode_t mode,
1717 JournalFile *template,
1718 JournalFile **ret) {
1719
1720 JournalFile *f;
1721 int r;
1722 bool newly_created = false;
1723
1724 assert(fname);
1725
1726 if ((flags & O_ACCMODE) != O_RDONLY &&
1727 (flags & O_ACCMODE) != O_RDWR)
1728 return -EINVAL;
1729
1730 f = new0(JournalFile, 1);
1731 if (!f)
1732 return -ENOMEM;
1733
1734 f->fd = -1;
1735 f->flags = flags;
1736 f->mode = mode;
1737 f->writable = (flags & O_ACCMODE) != O_RDONLY;
1738 f->prot = prot_from_flags(flags);
1739
1740 f->path = strdup(fname);
1741 if (!f->path) {
1742 r = -ENOMEM;
1743 goto fail;
1744 }
1745
1746 f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
1747 if (f->fd < 0) {
1748 r = -errno;
1749 goto fail;
1750 }
1751
1752 if (fstat(f->fd, &f->last_stat) < 0) {
1753 r = -errno;
1754 goto fail;
1755 }
1756
1757 if (f->last_stat.st_size == 0 && f->writable) {
1758 newly_created = true;
1759
1760 r = journal_file_init_header(f, template);
1761 if (r < 0)
1762 goto fail;
1763
1764 if (fstat(f->fd, &f->last_stat) < 0) {
1765 r = -errno;
1766 goto fail;
1767 }
1768 }
1769
1770 if (f->last_stat.st_size < (off_t) sizeof(Header)) {
1771 r = -EIO;
1772 goto fail;
1773 }
1774
1775 f->header = mmap(NULL, PAGE_ALIGN(sizeof(Header)), prot_from_flags(flags), MAP_SHARED, f->fd, 0);
1776 if (f->header == MAP_FAILED) {
1777 f->header = NULL;
1778 r = -errno;
1779 goto fail;
1780 }
1781
1782 if (!newly_created) {
1783 r = journal_file_verify_header(f);
1784 if (r < 0)
1785 goto fail;
1786 }
1787
1788 if (f->writable) {
1789 r = journal_file_refresh_header(f);
1790 if (r < 0)
1791 goto fail;
1792 }
1793
1794 if (newly_created) {
1795
1796 r = journal_file_setup_field_hash_table(f);
1797 if (r < 0)
1798 goto fail;
1799
1800 r = journal_file_setup_data_hash_table(f);
1801 if (r < 0)
1802 goto fail;
1803 }
1804
1805 r = journal_file_map_field_hash_table(f);
1806 if (r < 0)
1807 goto fail;
1808
1809 r = journal_file_map_data_hash_table(f);
1810 if (r < 0)
1811 goto fail;
1812
1813 if (ret)
1814 *ret = f;
1815
1816 return 0;
1817
1818 fail:
1819 journal_file_close(f);
1820
1821 return r;
1822 }
1823
1824 int journal_file_rotate(JournalFile **f) {
1825 char *p;
1826 size_t l;
1827 JournalFile *old_file, *new_file = NULL;
1828 int r;
1829
1830 assert(f);
1831 assert(*f);
1832
1833 old_file = *f;
1834
1835 if (!old_file->writable)
1836 return -EINVAL;
1837
1838 if (!endswith(old_file->path, ".journal"))
1839 return -EINVAL;
1840
1841 l = strlen(old_file->path);
1842
1843 p = new(char, l + 1 + 16 + 1 + 32 + 1 + 16 + 1);
1844 if (!p)
1845 return -ENOMEM;
1846
1847 memcpy(p, old_file->path, l - 8);
1848 p[l-8] = '@';
1849 sd_id128_to_string(old_file->header->seqnum_id, p + l - 8 + 1);
1850 snprintf(p + l - 8 + 1 + 32, 1 + 16 + 1 + 16 + 8 + 1,
1851 "-%016llx-%016llx.journal",
1852 (unsigned long long) le64toh((*f)->header->seqnum),
1853 (unsigned long long) le64toh((*f)->header->tail_entry_realtime));
1854
1855 r = rename(old_file->path, p);
1856 free(p);
1857
1858 if (r < 0)
1859 return -errno;
1860
1861 old_file->header->state = le32toh(STATE_ARCHIVED);
1862
1863 r = journal_file_open(old_file->path, old_file->flags, old_file->mode, old_file, &new_file);
1864 journal_file_close(old_file);
1865
1866 *f = new_file;
1867 return r;
1868 }
1869
/* Bookkeeping for one archived journal file that journal_directory_vacuum()
 * considers for deletion. */
struct vacuum_info {
        off_t usage;            /* disk usage accounted to this file, in bytes */
        char *filename;         /* heap-allocated copy of the directory entry name */

        uint64_t realtime;      /* realtime timestamp parsed from the file name */
        sd_id128_t seqnum_id;   /* sequence number ID parsed from the file name */
        uint64_t seqnum;        /* sequence number parsed from the file name */
};
1878
1879 static int vacuum_compare(const void *_a, const void *_b) {
1880 const struct vacuum_info *a, *b;
1881
1882 a = _a;
1883 b = _b;
1884
1885 if (sd_id128_equal(a->seqnum_id, b->seqnum_id)) {
1886 if (a->seqnum < b->seqnum)
1887 return -1;
1888 else if (a->seqnum > b->seqnum)
1889 return 1;
1890 else
1891 return 0;
1892 }
1893
1894 if (a->realtime < b->realtime)
1895 return -1;
1896 else if (a->realtime > b->realtime)
1897 return 1;
1898 else
1899 return memcmp(&a->seqnum_id, &b->seqnum_id, 16);
1900 }
1901
1902 int journal_directory_vacuum(const char *directory, uint64_t max_use, uint64_t min_free) {
1903 DIR *d;
1904 int r = 0;
1905 struct vacuum_info *list = NULL;
1906 unsigned n_list = 0, n_allocated = 0, i;
1907 uint64_t sum = 0;
1908
1909 assert(directory);
1910
1911 if (max_use <= 0)
1912 return 0;
1913
1914 d = opendir(directory);
1915 if (!d)
1916 return -errno;
1917
1918 for (;;) {
1919 int k;
1920 struct dirent buf, *de;
1921 size_t q;
1922 struct stat st;
1923 char *p;
1924 unsigned long long seqnum, realtime;
1925 sd_id128_t seqnum_id;
1926
1927 k = readdir_r(d, &buf, &de);
1928 if (k != 0) {
1929 r = -k;
1930 goto finish;
1931 }
1932
1933 if (!de)
1934 break;
1935
1936 if (!dirent_is_file_with_suffix(de, ".journal"))
1937 continue;
1938
1939 q = strlen(de->d_name);
1940
1941 if (q < 1 + 32 + 1 + 16 + 1 + 16 + 8)
1942 continue;
1943
1944 if (de->d_name[q-8-16-1] != '-' ||
1945 de->d_name[q-8-16-1-16-1] != '-' ||
1946 de->d_name[q-8-16-1-16-1-32-1] != '@')
1947 continue;
1948
1949 if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0)
1950 continue;
1951
1952 if (!S_ISREG(st.st_mode))
1953 continue;
1954
1955 p = strdup(de->d_name);
1956 if (!p) {
1957 r = -ENOMEM;
1958 goto finish;
1959 }
1960
1961 de->d_name[q-8-16-1-16-1] = 0;
1962 if (sd_id128_from_string(de->d_name + q-8-16-1-16-1-32, &seqnum_id) < 0) {
1963 free(p);
1964 continue;
1965 }
1966
1967 if (sscanf(de->d_name + q-8-16-1-16, "%16llx-%16llx.journal", &seqnum, &realtime) != 2) {
1968 free(p);
1969 continue;
1970 }
1971
1972 if (n_list >= n_allocated) {
1973 struct vacuum_info *j;
1974
1975 n_allocated = MAX(n_allocated * 2U, 8U);
1976 j = realloc(list, n_allocated * sizeof(struct vacuum_info));
1977 if (!j) {
1978 free(p);
1979 r = -ENOMEM;
1980 goto finish;
1981 }
1982
1983 list = j;
1984 }
1985
1986 list[n_list].filename = p;
1987 list[n_list].usage = (uint64_t) st.st_blksize * (uint64_t) st.st_blocks;
1988 list[n_list].seqnum = seqnum;
1989 list[n_list].realtime = realtime;
1990 list[n_list].seqnum_id = seqnum_id;
1991
1992 sum += list[n_list].usage;
1993
1994 n_list ++;
1995 }
1996
1997 qsort(list, n_list, sizeof(struct vacuum_info), vacuum_compare);
1998
1999 for(i = 0; i < n_list; i++) {
2000 struct statvfs ss;
2001
2002 if (fstatvfs(dirfd(d), &ss) < 0) {
2003 r = -errno;
2004 goto finish;
2005 }
2006
2007 if (sum <= max_use &&
2008 (uint64_t) ss.f_bavail * (uint64_t) ss.f_bsize >= min_free)
2009 break;
2010
2011 if (unlinkat(dirfd(d), list[i].filename, 0) >= 0) {
2012 log_debug("Deleted archived journal %s/%s.", directory, list[i].filename);
2013 sum -= list[i].usage;
2014 } else if (errno != ENOENT)
2015 log_warning("Failed to delete %s/%s: %m", directory, list[i].filename);
2016 }
2017
2018 finish:
2019 for (i = 0; i < n_list; i++)
2020 free(list[i].filename);
2021
2022 free(list);
2023
2024 if (d)
2025 closedir(d);
2026
2027 return r;
2028 }
2029
2030 int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
2031 uint64_t i, n;
2032 uint64_t q, xor_hash = 0;
2033 int r;
2034 EntryItem *items;
2035 dual_timestamp ts;
2036
2037 assert(from);
2038 assert(to);
2039 assert(o);
2040 assert(p);
2041
2042 if (!to->writable)
2043 return -EPERM;
2044
2045 ts.monotonic = le64toh(o->entry.monotonic);
2046 ts.realtime = le64toh(o->entry.realtime);
2047
2048 if (to->tail_entry_monotonic_valid &&
2049 ts.monotonic < le64toh(to->header->tail_entry_monotonic))
2050 return -EINVAL;
2051
2052 if (ts.realtime < le64toh(to->header->tail_entry_realtime))
2053 return -EINVAL;
2054
2055 n = journal_file_entry_n_items(o);
2056 items = alloca(sizeof(EntryItem) * n);
2057
2058 for (i = 0; i < n; i++) {
2059 uint64_t le_hash, l, h;
2060 size_t t;
2061 void *data;
2062 Object *u;
2063
2064 q = le64toh(o->entry.items[i].object_offset);
2065 le_hash = o->entry.items[i].hash;
2066
2067 r = journal_file_move_to_object(from, OBJECT_DATA, q, &o);
2068 if (r < 0)
2069 return r;
2070
2071 if (le_hash != o->data.hash)
2072 return -EBADMSG;
2073
2074 l = le64toh(o->object.size) - offsetof(Object, data.payload);
2075 t = (size_t) l;
2076
2077 /* We hit the limit on 32bit machines */
2078 if ((uint64_t) t != l)
2079 return -E2BIG;
2080
2081 if (o->object.flags & OBJECT_COMPRESSED) {
2082 #ifdef HAVE_XZ
2083 uint64_t rsize;
2084
2085 if (!uncompress_blob(o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize))
2086 return -EBADMSG;
2087
2088 data = from->compress_buffer;
2089 l = rsize;
2090 #else
2091 return -EPROTONOSUPPORT;
2092 #endif
2093 } else
2094 data = o->data.payload;
2095
2096 r = journal_file_append_data(to, data, l, &u, &h);
2097 if (r < 0)
2098 return r;
2099
2100 xor_hash ^= le64toh(u->data.hash);
2101 items[i].object_offset = htole64(h);
2102 items[i].hash = u->data.hash;
2103
2104 r = journal_file_move_to_object(from, OBJECT_ENTRY, p, &o);
2105 if (r < 0)
2106 return r;
2107 }
2108
2109 return journal_file_append_entry_internal(to, &ts, xor_hash, items, n, seqnum, ret, offset);
2110 }
2111
2112 void journal_default_metrics(JournalMetrics *m, int fd) {
2113 uint64_t fs_size = 0;
2114 struct statvfs ss;
2115 char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
2116
2117 assert(m);
2118 assert(fd >= 0);
2119
2120 if (fstatvfs(fd, &ss) >= 0)
2121 fs_size = ss.f_frsize * ss.f_blocks;
2122
2123 if (m->max_use == (uint64_t) -1) {
2124
2125 if (fs_size > 0) {
2126 m->max_use = PAGE_ALIGN(fs_size / 10); /* 10% of file system size */
2127
2128 if (m->max_use > DEFAULT_MAX_USE_UPPER)
2129 m->max_use = DEFAULT_MAX_USE_UPPER;
2130
2131 if (m->max_use < DEFAULT_MAX_USE_LOWER)
2132 m->max_use = DEFAULT_MAX_USE_LOWER;
2133 } else
2134 m->max_use = DEFAULT_MAX_USE_LOWER;
2135 } else {
2136 m->max_use = PAGE_ALIGN(m->max_use);
2137
2138 if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
2139 m->max_use = JOURNAL_FILE_SIZE_MIN*2;
2140 }
2141
2142 if (m->max_size == (uint64_t) -1) {
2143 m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
2144
2145 if (m->max_size > DEFAULT_MAX_SIZE_UPPER)
2146 m->max_size = DEFAULT_MAX_SIZE_UPPER;
2147 } else
2148 m->max_size = PAGE_ALIGN(m->max_size);
2149
2150 if (m->max_size < JOURNAL_FILE_SIZE_MIN)
2151 m->max_size = JOURNAL_FILE_SIZE_MIN;
2152
2153 if (m->max_size*2 > m->max_use)
2154 m->max_use = m->max_size*2;
2155
2156 if (m->min_size == (uint64_t) -1)
2157 m->min_size = JOURNAL_FILE_SIZE_MIN;
2158 else {
2159 m->min_size = PAGE_ALIGN(m->min_size);
2160
2161 if (m->min_size < JOURNAL_FILE_SIZE_MIN)
2162 m->min_size = JOURNAL_FILE_SIZE_MIN;
2163
2164 if (m->min_size > m->max_size)
2165 m->max_size = m->min_size;
2166 }
2167
2168 if (m->keep_free == (uint64_t) -1) {
2169
2170 if (fs_size > 0) {
2171 m->keep_free = PAGE_ALIGN(fs_size / 20); /* 5% of file system size */
2172
2173 if (m->keep_free > DEFAULT_KEEP_FREE_UPPER)
2174 m->keep_free = DEFAULT_KEEP_FREE_UPPER;
2175
2176 } else
2177 m->keep_free = DEFAULT_KEEP_FREE;
2178 }
2179
2180 log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
2181 format_bytes(a, sizeof(a), m->max_use),
2182 format_bytes(b, sizeof(b), m->max_size),
2183 format_bytes(c, sizeof(c), m->min_size),
2184 format_bytes(d, sizeof(d), m->keep_free));
2185 }