]>
Commit | Line | Data |
---|---|---|
db9ecf05 | 1 | /* SPDX-License-Identifier: LGPL-2.1-or-later */ |
cec736d2 | 2 | |
cec736d2 | 3 | #include <errno.h> |
cec736d2 | 4 | #include <fcntl.h> |
11689d2a | 5 | #include <linux/fs.h> |
65ddc2c5 | 6 | #include <linux/magic.h> |
ac2e41f5 | 7 | #include <pthread.h> |
07630cea LP |
8 | #include <stddef.h> |
9 | #include <sys/mman.h> | |
10 | #include <sys/statvfs.h> | |
11 | #include <sys/uio.h> | |
12 | #include <unistd.h> | |
fb0951b0 | 13 | |
a03d4359 ZJS |
14 | #include "sd-event.h" |
15 | ||
b5efdb8a | 16 | #include "alloc-util.h" |
c8b3094d | 17 | #include "chattr-util.h" |
07630cea | 18 | #include "compress.h" |
4ce534f4 | 19 | #include "env-util.h" |
3ffd4af2 | 20 | #include "fd-util.h" |
aa892669 | 21 | #include "format-util.h" |
11b29a96 | 22 | #include "fs-util.h" |
74e795ee | 23 | #include "id128-util.h" |
0284adc6 | 24 | #include "journal-authenticate.h" |
cec736d2 LP |
25 | #include "journal-def.h" |
26 | #include "journal-file.h" | |
d9799ea2 | 27 | #include "journal-internal.h" |
cec736d2 | 28 | #include "lookup3.h" |
0a970718 | 29 | #include "memory-util.h" |
9dfbae20 | 30 | #include "missing_threads.h" |
5d1ce257 | 31 | #include "path-util.h" |
34af7494 | 32 | #include "prioq.h" |
3df3e884 | 33 | #include "random-util.h" |
b58c888f | 34 | #include "set.h" |
760877e9 | 35 | #include "sort-util.h" |
3cc44114 | 36 | #include "stat-util.h" |
363b2b9a | 37 | #include "string-table.h" |
07630cea | 38 | #include "string-util.h" |
4761fd0f | 39 | #include "strv.h" |
bf819d3a | 40 | #include "sync-util.h" |
596c3c7f | 41 | #include "user-util.h" |
89a5a90c | 42 | #include "xattr-util.h" |
cec736d2 | 43 | |
4a92baf3 LP |
44 | #define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem)) |
45 | #define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem)) | |
cec736d2 | 46 | |
57850536 AG |
47 | #define DEFAULT_COMPRESS_THRESHOLD (512ULL) |
48 | #define MIN_COMPRESS_THRESHOLD (8ULL) | |
807e17f0 | 49 | |
babfc091 | 50 | /* This is the minimum journal file size */ |
b39907c7 YW |
51 | #define JOURNAL_FILE_SIZE_MIN (512 * U64_KB) /* 512 KiB */ |
52 | #define JOURNAL_COMPACT_SIZE_MAX ((uint64_t) UINT32_MAX) /* 4 GiB */ | |
babfc091 | 53 | |
b39907c7 YW |
54 | /* These are the lower and upper bounds if we deduce the max_use value from the file system size */ |
55 | #define MAX_USE_LOWER (1 * U64_MB) /* 1 MiB */ | |
56 | #define MAX_USE_UPPER (4 * U64_GB) /* 4 GiB */ | |
babfc091 | 57 | |
6aae0b1a ZJS |
58 | /* Those are the lower and upper bounds for the minimal use limit, |
59 | * i.e. how much we'll use even if keep_free suggests otherwise. */ | |
b39907c7 YW |
60 | #define MIN_USE_LOW (1 * U64_MB) /* 1 MiB */ |
61 | #define MIN_USE_HIGH (16 * U64_MB) /* 16 MiB */ | |
8580d1f7 | 62 | |
babfc091 | 63 | /* This is the upper bound if we deduce max_size from max_use */ |
b39907c7 | 64 | #define MAX_SIZE_UPPER (128 * U64_MB) /* 128 MiB */ |
babfc091 | 65 | |
b39907c7 YW |
66 | /* This is the upper bound if we deduce the keep_free value from the file system size */ |
67 | #define KEEP_FREE_UPPER (4 * U64_GB) /* 4 GiB */ | |
babfc091 | 68 | |
b39907c7 YW |
69 | /* This is the keep_free value when we can't determine the system size */ |
70 | #define DEFAULT_KEEP_FREE (1 * U64_MB) /* 1 MB */ | |
babfc091 | 71 | |
8580d1f7 | 72 | /* This is the default maximum number of journal files to keep around. */ |
6aae0b1a | 73 | #define DEFAULT_N_MAX_FILES 100 |
8580d1f7 | 74 | |
dca6219e LP |
75 | /* n_data was the first entry we added after the initial file format design */ |
76 | #define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data)) | |
cec736d2 | 77 | |
a4bcff5b LP |
78 | /* How many entries to keep in the entry array chain cache at max */ |
79 | #define CHAIN_CACHE_MAX 20 | |
80 | ||
a676e665 | 81 | /* How much to increase the journal file size at once each time we allocate something new. */ |
b39907c7 | 82 | #define FILE_SIZE_INCREASE (8 * U64_MB) /* 8MB */ |
a676e665 | 83 | |
2678031a LP |
84 | /* Reread fstat() of the file for detecting deletions at least this often */ |
85 | #define LAST_STAT_REFRESH_USEC (5*USEC_PER_SEC) | |
86 | ||
0dbe57ee LP |
87 | /* Longest hash chain to rotate after */ |
88 | #define HASH_CHAIN_DEPTH_MAX 100 | |
89 | ||
51804460 ZJS |
90 | #ifdef __clang__ |
91 | # pragma GCC diagnostic ignored "-Waddress-of-packed-member" | |
92 | #endif | |
93 | ||
0e58dc99 ZJS |
/* Maps the access mode contained in open(2) flags to the corresponding mmap(2) protection bits.
 * Any access mode other than O_RDONLY, O_WRONLY or O_RDWR is treated as a programming error. */
static int mmap_prot_from_open_flags(int flags) {
        int access_mode = flags & O_ACCMODE;

        if (access_mode == O_RDONLY)
                return PROT_READ;

        if (access_mode == O_WRONLY)
                return PROT_WRITE;

        if (access_mode == O_RDWR)
                return PROT_READ|PROT_WRITE;

        assert_not_reached();
}
106 | ||
e6d4a110 | 107 | int journal_file_tail_end_by_pread(JournalFile *f, uint64_t *ret_offset) { |
ab6e257b DDM |
108 | uint64_t p; |
109 | int r; | |
110 | ||
111 | assert(f); | |
112 | assert(f->header); | |
113 | assert(ret_offset); | |
114 | ||
e6d4a110 LP |
115 | /* Same as journal_file_tail_end_by_mmap() below, but operates with pread() to avoid the mmap cache |
116 | * (and thus is thread safe) */ | |
117 | ||
ab6e257b DDM |
118 | p = le64toh(f->header->tail_object_offset); |
119 | if (p == 0) | |
120 | p = le64toh(f->header->header_size); | |
121 | else { | |
e6d4a110 | 122 | Object tail; |
ab6e257b DDM |
123 | uint64_t sz; |
124 | ||
e5d84733 | 125 | r = journal_file_read_object_header(f, OBJECT_UNUSED, p, &tail); |
ab6e257b DDM |
126 | if (r < 0) |
127 | return r; | |
128 | ||
129 | sz = le64toh(tail.object.size); | |
130 | if (sz > UINT64_MAX - sizeof(uint64_t) + 1) | |
131 | return -EBADMSG; | |
132 | ||
133 | sz = ALIGN64(sz); | |
134 | if (p > UINT64_MAX - sz) | |
135 | return -EBADMSG; | |
136 | ||
137 | p += sz; | |
138 | } | |
139 | ||
140 | *ret_offset = p; | |
141 | ||
142 | return 0; | |
143 | } | |
144 | ||
e6d4a110 LP |
145 | int journal_file_tail_end_by_mmap(JournalFile *f, uint64_t *ret_offset) { |
146 | uint64_t p; | |
147 | int r; | |
148 | ||
149 | assert(f); | |
150 | assert(f->header); | |
151 | assert(ret_offset); | |
152 | ||
153 | /* Same as journal_file_tail_end_by_pread() above, but operates with the usual mmap logic */ | |
154 | ||
155 | p = le64toh(f->header->tail_object_offset); | |
156 | if (p == 0) | |
157 | p = le64toh(f->header->header_size); | |
158 | else { | |
159 | Object *tail; | |
160 | uint64_t sz; | |
161 | ||
162 | r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &tail); | |
163 | if (r < 0) | |
164 | return r; | |
165 | ||
166 | sz = le64toh(READ_NOW(tail->object.size)); | |
167 | if (sz > UINT64_MAX - sizeof(uint64_t) + 1) | |
168 | return -EBADMSG; | |
169 | ||
170 | sz = ALIGN64(sz); | |
171 | if (p > UINT64_MAX - sz) | |
172 | return -EBADMSG; | |
173 | ||
174 | p += sz; | |
175 | } | |
176 | ||
177 | *ret_offset = p; | |
178 | ||
179 | return 0; | |
180 | } | |
181 | ||
764721cc | 182 | int journal_file_set_offline_thread_join(JournalFile *f) { |
ac2e41f5 VC |
183 | int r; |
184 | ||
185 | assert(f); | |
186 | ||
187 | if (f->offline_state == OFFLINE_JOINED) | |
188 | return 0; | |
189 | ||
190 | r = pthread_join(f->offline_thread, NULL); | |
191 | if (r) | |
192 | return -r; | |
193 | ||
194 | f->offline_state = OFFLINE_JOINED; | |
26687bf8 | 195 | |
c3bd54bf | 196 | if (mmap_cache_fd_got_sigbus(f->cache_fd)) |
fa6ac760 LP |
197 | return -EIO; |
198 | ||
ac2e41f5 VC |
199 | return 0; |
200 | } | |
26687bf8 | 201 | |
ac2e41f5 | 202 | static int journal_file_set_online(JournalFile *f) { |
83bf6b67 | 203 | bool wait = true; |
ac2e41f5 VC |
204 | |
205 | assert(f); | |
206 | ||
4374d7ea | 207 | if (!journal_file_writable(f)) |
ac2e41f5 VC |
208 | return -EPERM; |
209 | ||
846e5418 | 210 | if (f->fd < 0 || !f->header) |
ac2e41f5 VC |
211 | return -EINVAL; |
212 | ||
83bf6b67 | 213 | while (wait) { |
ac2e41f5 VC |
214 | switch (f->offline_state) { |
215 | case OFFLINE_JOINED: | |
216 | /* No offline thread, no need to wait. */ | |
83bf6b67 | 217 | wait = false; |
ac2e41f5 VC |
218 | break; |
219 | ||
60c040f6 | 220 | case OFFLINE_SYNCING: { |
221 | OfflineState tmp_state = OFFLINE_SYNCING; | |
222 | if (!__atomic_compare_exchange_n(&f->offline_state, &tmp_state, OFFLINE_CANCEL, | |
223 | false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) | |
224 | continue; | |
225 | } | |
ac2e41f5 | 226 | /* Canceled syncing prior to offlining, no need to wait. */ |
83bf6b67 | 227 | wait = false; |
ac2e41f5 VC |
228 | break; |
229 | ||
60c040f6 | 230 | case OFFLINE_AGAIN_FROM_SYNCING: { |
231 | OfflineState tmp_state = OFFLINE_AGAIN_FROM_SYNCING; | |
232 | if (!__atomic_compare_exchange_n(&f->offline_state, &tmp_state, OFFLINE_CANCEL, | |
233 | false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) | |
234 | continue; | |
235 | } | |
ac2e41f5 | 236 | /* Canceled restart from syncing, no need to wait. */ |
83bf6b67 | 237 | wait = false; |
ac2e41f5 VC |
238 | break; |
239 | ||
60c040f6 | 240 | case OFFLINE_AGAIN_FROM_OFFLINING: { |
241 | OfflineState tmp_state = OFFLINE_AGAIN_FROM_OFFLINING; | |
242 | if (!__atomic_compare_exchange_n(&f->offline_state, &tmp_state, OFFLINE_CANCEL, | |
243 | false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST)) | |
244 | continue; | |
245 | } | |
ac2e41f5 | 246 | /* Canceled restart from offlining, must wait for offlining to complete however. */ |
4831981d | 247 | _fallthrough_; |
ac2e41f5 VC |
248 | default: { |
249 | int r; | |
250 | ||
251 | r = journal_file_set_offline_thread_join(f); | |
252 | if (r < 0) | |
253 | return r; | |
254 | ||
83bf6b67 | 255 | wait = false; |
ac2e41f5 VC |
256 | break; |
257 | } | |
258 | } | |
259 | } | |
26687bf8 | 260 | |
c3bd54bf | 261 | if (mmap_cache_fd_got_sigbus(f->cache_fd)) |
fa6ac760 LP |
262 | return -EIO; |
263 | ||
ac2e41f5 VC |
264 | switch (f->header->state) { |
265 | case STATE_ONLINE: | |
266 | return 0; | |
26687bf8 | 267 | |
ac2e41f5 VC |
268 | case STATE_OFFLINE: |
269 | f->header->state = STATE_ONLINE; | |
270 | (void) fsync(f->fd); | |
271 | return 0; | |
272 | ||
273 | default: | |
274 | return -EINVAL; | |
275 | } | |
26687bf8 OS |
276 | } |
277 | ||
804ae586 | 278 | JournalFile* journal_file_close(JournalFile *f) { |
c377a6f3 YW |
279 | if (!f) |
280 | return NULL; | |
cec736d2 | 281 | |
5b895f56 YW |
282 | assert(f->newest_boot_id_prioq_idx == PRIOQ_IDX_NULL); |
283 | ||
a8da6330 | 284 | if (f->cache_fd) |
c3bd54bf | 285 | mmap_cache_fd_free(f->cache_fd); |
cec736d2 | 286 | |
5d1ce257 LP |
287 | if (f->close_fd) |
288 | safe_close(f->fd); | |
cec736d2 | 289 | free(f->path); |
807e17f0 | 290 | |
4743015d | 291 | ordered_hashmap_free_free(f->chain_cache); |
a4bcff5b | 292 | |
d80b051c | 293 | #if HAVE_COMPRESSION |
807e17f0 LP |
294 | free(f->compress_buffer); |
295 | #endif | |
296 | ||
349cc4a5 | 297 | #if HAVE_GCRYPT |
cbdac0c3 YW |
298 | if (f->fss_file) { |
299 | size_t sz = PAGE_ALIGN(f->fss_file_size); | |
300 | assert(sz < SIZE_MAX); | |
301 | munmap(f->fss_file, sz); | |
302 | } else | |
b7c9ae91 LP |
303 | free(f->fsprg_state); |
304 | ||
305 | free(f->fsprg_seed); | |
7560fffc LP |
306 | |
307 | if (f->hmac) | |
308 | gcry_md_close(f->hmac); | |
309 | #endif | |
310 | ||
6b430fdb | 311 | return mfree(f); |
cec736d2 LP |
312 | } |
313 | ||
61297656 | 314 | static bool keyed_hash_requested(void) { |
9dfbae20 | 315 | static thread_local int cached = -1; |
61297656 DDM |
316 | int r; |
317 | ||
9dfbae20 YW |
318 | if (cached < 0) { |
319 | r = getenv_bool("SYSTEMD_JOURNAL_KEYED_HASH"); | |
320 | if (r < 0) { | |
321 | if (r != -ENXIO) | |
322 | log_debug_errno(r, "Failed to parse $SYSTEMD_JOURNAL_KEYED_HASH environment variable, ignoring: %m"); | |
323 | cached = true; | |
324 | } else | |
325 | cached = r; | |
326 | } | |
61297656 | 327 | |
9dfbae20 | 328 | return cached; |
61297656 DDM |
329 | } |
330 | ||
331 | static bool compact_mode_requested(void) { | |
9dfbae20 | 332 | static thread_local int cached = -1; |
61297656 DDM |
333 | int r; |
334 | ||
9dfbae20 YW |
335 | if (cached < 0) { |
336 | r = getenv_bool("SYSTEMD_JOURNAL_COMPACT"); | |
337 | if (r < 0) { | |
338 | if (r != -ENXIO) | |
339 | log_debug_errno(r, "Failed to parse $SYSTEMD_JOURNAL_COMPACT environment variable, ignoring: %m"); | |
340 | cached = true; | |
341 | } else | |
342 | cached = r; | |
343 | } | |
61297656 | 344 | |
9dfbae20 | 345 | return cached; |
61297656 DDM |
346 | } |
347 | ||
1f06ea74 YW |
#if HAVE_COMPRESSION
/* Parses $SYSTEMD_JOURNAL_COMPRESS. The variable may either be a boolean (true selects
 * DEFAULT_COMPRESSION, false selects COMPRESSION_NONE) or the name of a compression algorithm. If the
 * variable is unset, unparsable, or names an unsupported algorithm, DEFAULT_COMPRESSION is returned. */
static Compression getenv_compression(void) {
        const char *value;
        Compression algorithm;
        int b;

        value = getenv("SYSTEMD_JOURNAL_COMPRESS");
        if (!value)
                return DEFAULT_COMPRESSION;

        /* First try to interpret the value as a plain on/off switch */
        b = parse_boolean(value);
        if (b > 0)
                return DEFAULT_COMPRESSION;
        if (b == 0)
                return COMPRESSION_NONE;

        /* Not a boolean, hence it should be an algorithm name */
        algorithm = compression_from_string(value);
        if (algorithm < 0) {
                log_debug_errno(algorithm, "Failed to parse SYSTEMD_JOURNAL_COMPRESS value, ignoring: %s", value);
                return DEFAULT_COMPRESSION;
        }

        if (!compression_supported(algorithm)) {
                log_debug("Unsupported compression algorithm specified, ignoring: %s", value);
                return DEFAULT_COMPRESSION;
        }

        return algorithm;
}
#endif
376 | ||
377 | static Compression compression_requested(void) { | |
378 | #if HAVE_COMPRESSION | |
379 | static thread_local Compression cached = _COMPRESSION_INVALID; | |
380 | ||
381 | if (cached < 0) | |
382 | cached = getenv_compression(); | |
383 | ||
384 | return cached; | |
385 | #else | |
386 | return COMPRESSION_NONE; | |
387 | #endif | |
388 | } | |
389 | ||
ce92dc27 LP |
390 | static int journal_file_init_header( |
391 | JournalFile *f, | |
392 | JournalFileFlags file_flags, | |
393 | JournalFile *template) { | |
394 | ||
61297656 | 395 | bool seal = false; |
c3dd0dcb | 396 | ssize_t k; |
cec736d2 LP |
397 | int r; |
398 | ||
399 | assert(f); | |
400 | ||
4374d7ea DDM |
401 | #if HAVE_GCRYPT |
402 | /* Try to load the FSPRG state, and if we can't, then just don't do sealing */ | |
403 | seal = FLAGS_SET(file_flags, JOURNAL_SEAL) && journal_file_fss_load(f) >= 0; | |
404 | #endif | |
405 | ||
c3dd0dcb LP |
406 | Header h = { |
407 | .header_size = htole64(ALIGN64(sizeof(h))), | |
408 | .incompatible_flags = htole32( | |
1f06ea74 | 409 | FLAGS_SET(file_flags, JOURNAL_COMPRESS) * COMPRESSION_TO_HEADER_INCOMPATIBLE_FLAG(compression_requested()) | |
c3dd0dcb LP |
410 | keyed_hash_requested() * HEADER_INCOMPATIBLE_KEYED_HASH | |
411 | compact_mode_requested() * HEADER_INCOMPATIBLE_COMPACT), | |
9204fc64 | 412 | .compatible_flags = htole32( |
e375bc5f | 413 | (seal * (HEADER_COMPATIBLE_SEALED | HEADER_COMPATIBLE_SEALED_CONTINUOUS) ) | |
9204fc64 | 414 | HEADER_COMPATIBLE_TAIL_ENTRY_BOOT_ID), |
c3dd0dcb | 415 | }; |
cec736d2 | 416 | |
c3dd0dcb LP |
417 | assert_cc(sizeof(h.signature) == sizeof(HEADER_SIGNATURE)); |
418 | memcpy(h.signature, HEADER_SIGNATURE, sizeof(HEADER_SIGNATURE)); | |
7560fffc | 419 | |
cec736d2 LP |
420 | r = sd_id128_randomize(&h.file_id); |
421 | if (r < 0) | |
422 | return r; | |
423 | ||
8e64ec04 LP |
424 | r = sd_id128_get_machine(&h.machine_id); |
425 | if (r < 0 && !ERRNO_IS_MACHINE_ID_UNSET(r)) | |
426 | return r; /* If we have no valid machine ID (test environment?), let's simply leave the | |
427 | * machine ID field all zeroes. */ | |
428 | ||
0ac38b70 LP |
429 | if (template) { |
430 | h.seqnum_id = template->header->seqnum_id; | |
beec0085 | 431 | h.tail_entry_seqnum = template->header->tail_entry_seqnum; |
0ac38b70 LP |
432 | } else |
433 | h.seqnum_id = h.file_id; | |
cec736d2 LP |
434 | |
435 | k = pwrite(f->fd, &h, sizeof(h), 0); | |
436 | if (k < 0) | |
437 | return -errno; | |
cec736d2 LP |
438 | if (k != sizeof(h)) |
439 | return -EIO; | |
440 | ||
441 | return 0; | |
442 | } | |
443 | ||
444 | static int journal_file_refresh_header(JournalFile *f) { | |
fa6ac760 | 445 | int r; |
cec736d2 LP |
446 | |
447 | assert(f); | |
c88cc6af | 448 | assert(f->header); |
cec736d2 | 449 | |
9204fc64 LP |
450 | /* We used to update the header's boot ID field here, but we don't do that anymore, as per |
451 | * HEADER_COMPATIBLE_TAIL_ENTRY_BOOT_ID */ | |
cec736d2 | 452 | |
fa6ac760 | 453 | r = journal_file_set_online(f); |
b788cc23 | 454 | |
bf819d3a LP |
455 | /* Sync the online state to disk; likely just created a new file, also sync the directory this file |
456 | * is located in. */ | |
457 | (void) fsync_full(f->fd); | |
a0fe2a2d | 458 | |
fa6ac760 | 459 | return r; |
cec736d2 LP |
460 | } |
461 | ||
4214009f ZJS |
462 | static bool warn_wrong_flags(const JournalFile *f, bool compatible) { |
463 | const uint32_t any = compatible ? HEADER_COMPATIBLE_ANY : HEADER_INCOMPATIBLE_ANY, | |
464 | supported = compatible ? HEADER_COMPATIBLE_SUPPORTED : HEADER_INCOMPATIBLE_SUPPORTED; | |
465 | const char *type = compatible ? "compatible" : "incompatible"; | |
d89c8fdf ZJS |
466 | uint32_t flags; |
467 | ||
cc938e4a YW |
468 | assert(f); |
469 | assert(f->header); | |
470 | ||
4214009f ZJS |
471 | flags = le32toh(compatible ? f->header->compatible_flags : f->header->incompatible_flags); |
472 | ||
473 | if (flags & ~supported) { | |
474 | if (flags & ~any) | |
4761fd0f | 475 | log_debug("Journal file %s has unknown %s flags 0x%"PRIx32, |
4214009f ZJS |
476 | f->path, type, flags & ~any); |
477 | flags = (flags & any) & ~supported; | |
4761fd0f | 478 | if (flags) { |
87413812 | 479 | const char* strv[6]; |
86e68f38 | 480 | size_t n = 0; |
4761fd0f ZJS |
481 | _cleanup_free_ char *t = NULL; |
482 | ||
4ce534f4 LP |
483 | if (compatible) { |
484 | if (flags & HEADER_COMPATIBLE_SEALED) | |
485 | strv[n++] = "sealed"; | |
e375bc5f FD |
486 | if (flags & HEADER_COMPATIBLE_SEALED_CONTINUOUS) |
487 | strv[n++] = "sealed-continuous"; | |
4ce534f4 LP |
488 | } else { |
489 | if (flags & HEADER_INCOMPATIBLE_COMPRESSED_XZ) | |
490 | strv[n++] = "xz-compressed"; | |
491 | if (flags & HEADER_INCOMPATIBLE_COMPRESSED_LZ4) | |
492 | strv[n++] = "lz4-compressed"; | |
8653185a LP |
493 | if (flags & HEADER_INCOMPATIBLE_COMPRESSED_ZSTD) |
494 | strv[n++] = "zstd-compressed"; | |
4ce534f4 LP |
495 | if (flags & HEADER_INCOMPATIBLE_KEYED_HASH) |
496 | strv[n++] = "keyed-hash"; | |
87413812 DDM |
497 | if (flags & HEADER_INCOMPATIBLE_COMPACT) |
498 | strv[n++] = "compact"; | |
4ce534f4 | 499 | } |
4761fd0f ZJS |
500 | strv[n] = NULL; |
501 | assert(n < ELEMENTSOF(strv)); | |
502 | ||
503 | t = strv_join((char**) strv, ", "); | |
504 | log_debug("Journal file %s uses %s %s %s disabled at compilation time.", | |
505 | f->path, type, n > 1 ? "flags" : "flag", strnull(t)); | |
506 | } | |
4214009f ZJS |
507 | return true; |
508 | } | |
509 | ||
510 | return false; | |
511 | } | |
512 | ||
50cf2032 YW |
/* Checks whether an object offset stored in the header is plausible: it is either zero (i.e. unset), or
 * passes VALID64() and lies within [header_size, tail_object_offset]. */
static bool offset_is_valid(uint64_t offset, uint64_t header_size, uint64_t tail_object_offset) {
        if (offset == 0)
                return true;

        return VALID64(offset) &&
                offset >= header_size &&
                offset <= tail_object_offset;
}
524 | ||
525 | static bool hash_table_is_valid(uint64_t offset, uint64_t size, uint64_t header_size, uint64_t arena_size, uint64_t tail_object_offset) { | |
526 | if ((offset == 0) != (size == 0)) | |
527 | return false; | |
528 | if (offset == 0) | |
529 | return true; | |
530 | if (offset <= offsetof(Object, hash_table.items)) | |
531 | return false; | |
532 | offset -= offsetof(Object, hash_table.items); | |
533 | if (!offset_is_valid(offset, header_size, tail_object_offset)) | |
534 | return false; | |
535 | assert(offset <= header_size + arena_size); | |
536 | if (size > header_size + arena_size - offset) | |
537 | return false; | |
538 | return true; | |
539 | } | |
540 | ||
4214009f | 541 | static int journal_file_verify_header(JournalFile *f) { |
6f94e420 TS |
542 | uint64_t arena_size, header_size; |
543 | ||
cec736d2 | 544 | assert(f); |
c88cc6af | 545 | assert(f->header); |
cec736d2 | 546 | |
7560fffc | 547 | if (memcmp(f->header->signature, HEADER_SIGNATURE, 8)) |
cec736d2 LP |
548 | return -EBADMSG; |
549 | ||
4214009f ZJS |
550 | /* In both read and write mode we refuse to open files with incompatible |
551 | * flags we don't know. */ | |
552 | if (warn_wrong_flags(f, false)) | |
cec736d2 LP |
553 | return -EPROTONOSUPPORT; |
554 | ||
4214009f | 555 | /* When open for writing we refuse to open files with compatible flags, too. */ |
4374d7ea | 556 | if (journal_file_writable(f) && warn_wrong_flags(f, true)) |
d89c8fdf | 557 | return -EPROTONOSUPPORT; |
7560fffc | 558 | |
db11ac1a LP |
559 | if (f->header->state >= _STATE_MAX) |
560 | return -EBADMSG; | |
561 | ||
893e0f8f | 562 | header_size = le64toh(READ_NOW(f->header->header_size)); |
6f94e420 | 563 | |
dca6219e | 564 | /* The first addition was n_data, so check that we are at least this large */ |
6f94e420 | 565 | if (header_size < HEADER_SIZE_MIN) |
23b0b2b2 LP |
566 | return -EBADMSG; |
567 | ||
75bf2627 LP |
568 | /* When open for writing we refuse to open files with a mismatch of the header size, i.e. writing to |
569 | * files implementing older or new header structures. */ | |
570 | if (journal_file_writable(f) && header_size != sizeof(Header)) | |
571 | return -EPROTONOSUPPORT; | |
572 | ||
2f766aca | 573 | /* Don't write to journal files without the new boot ID update behavior guarantee. */ |
574 | if (journal_file_writable(f) && !JOURNAL_HEADER_TAIL_ENTRY_BOOT_ID(f->header)) | |
575 | return -EPROTONOSUPPORT; | |
576 | ||
8088cbd3 | 577 | if (JOURNAL_HEADER_SEALED(f->header) && !JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays)) |
beec0085 LP |
578 | return -EBADMSG; |
579 | ||
893e0f8f | 580 | arena_size = le64toh(READ_NOW(f->header->arena_size)); |
6f94e420 TS |
581 | |
582 | if (UINT64_MAX - header_size < arena_size || header_size + arena_size > (uint64_t) f->last_stat.st_size) | |
db11ac1a LP |
583 | return -ENODATA; |
584 | ||
50cf2032 YW |
585 | uint64_t tail_object_offset = le64toh(f->header->tail_object_offset); |
586 | if (!offset_is_valid(tail_object_offset, header_size, UINT64_MAX)) | |
587 | return -ENODATA; | |
588 | if (header_size + arena_size < tail_object_offset) | |
589 | return -ENODATA; | |
590 | if (header_size + arena_size - tail_object_offset < sizeof(ObjectHeader)) | |
591 | return -ENODATA; | |
592 | ||
593 | if (!hash_table_is_valid(le64toh(f->header->data_hash_table_offset), | |
594 | le64toh(f->header->data_hash_table_size), | |
595 | header_size, arena_size, tail_object_offset)) | |
596 | return -ENODATA; | |
597 | ||
598 | if (!hash_table_is_valid(le64toh(f->header->field_hash_table_offset), | |
599 | le64toh(f->header->field_hash_table_size), | |
600 | header_size, arena_size, tail_object_offset)) | |
601 | return -ENODATA; | |
602 | ||
603 | uint64_t entry_array_offset = le64toh(f->header->entry_array_offset); | |
604 | if (!offset_is_valid(entry_array_offset, header_size, tail_object_offset)) | |
605 | return -ENODATA; | |
606 | ||
607 | if (JOURNAL_HEADER_CONTAINS(f->header, tail_entry_array_offset)) { | |
608 | uint32_t offset = le32toh(f->header->tail_entry_array_offset); | |
609 | uint32_t n = le32toh(f->header->tail_entry_array_n_entries); | |
610 | ||
611 | if (!offset_is_valid(offset, header_size, tail_object_offset)) | |
612 | return -ENODATA; | |
613 | if (entry_array_offset > offset) | |
614 | return -ENODATA; | |
615 | if (entry_array_offset == 0 && offset != 0) | |
616 | return -ENODATA; | |
617 | if ((offset == 0) != (n == 0)) | |
618 | return -ENODATA; | |
619 | assert(offset <= header_size + arena_size); | |
620 | if ((uint64_t) n * journal_file_entry_array_item_size(f) > header_size + arena_size - offset) | |
621 | return -ENODATA; | |
622 | } | |
623 | ||
6ea51363 YW |
624 | if (JOURNAL_HEADER_CONTAINS(f->header, tail_entry_offset)) { |
625 | uint64_t offset = le64toh(f->header->tail_entry_offset); | |
626 | ||
627 | if (!offset_is_valid(offset, header_size, tail_object_offset)) | |
50cf2032 YW |
628 | return -ENODATA; |
629 | ||
6ea51363 YW |
630 | if (offset > 0) { |
631 | /* When there is an entry object, then these fields must be filled. */ | |
632 | if (sd_id128_is_null(f->header->tail_entry_boot_id)) | |
633 | return -ENODATA; | |
634 | if (!VALID_REALTIME(le64toh(f->header->head_entry_realtime))) | |
635 | return -ENODATA; | |
636 | if (!VALID_REALTIME(le64toh(f->header->tail_entry_realtime))) | |
637 | return -ENODATA; | |
638 | if (!VALID_MONOTONIC(le64toh(f->header->tail_entry_realtime))) | |
639 | return -ENODATA; | |
640 | } else { | |
641 | /* Otherwise, the fields must be zero. */ | |
642 | if (JOURNAL_HEADER_TAIL_ENTRY_BOOT_ID(f->header) && | |
643 | !sd_id128_is_null(f->header->tail_entry_boot_id)) | |
644 | return -ENODATA; | |
645 | if (f->header->head_entry_realtime != 0) | |
646 | return -ENODATA; | |
647 | if (f->header->tail_entry_realtime != 0) | |
648 | return -ENODATA; | |
649 | if (f->header->tail_entry_realtime != 0) | |
650 | return -ENODATA; | |
651 | } | |
652 | } | |
653 | ||
50cf2032 YW |
654 | /* Verify number of objects */ |
655 | uint64_t n_objects = le64toh(f->header->n_objects); | |
656 | if (n_objects > arena_size / sizeof(ObjectHeader)) | |
657 | return -ENODATA; | |
658 | ||
659 | uint64_t n_entries = le64toh(f->header->n_entries); | |
660 | if (n_entries > n_objects) | |
661 | return -ENODATA; | |
662 | ||
663 | if (JOURNAL_HEADER_CONTAINS(f->header, n_data) && | |
664 | le64toh(f->header->n_data) > n_objects) | |
665 | return -ENODATA; | |
666 | ||
667 | if (JOURNAL_HEADER_CONTAINS(f->header, n_fields) && | |
668 | le64toh(f->header->n_fields) > n_objects) | |
669 | return -ENODATA; | |
670 | ||
671 | if (JOURNAL_HEADER_CONTAINS(f->header, n_tags) && | |
672 | le64toh(f->header->n_tags) > n_objects) | |
db11ac1a LP |
673 | return -ENODATA; |
674 | ||
50cf2032 YW |
675 | if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays) && |
676 | le64toh(f->header->n_entry_arrays) > n_objects) | |
7762e02b LP |
677 | return -ENODATA; |
678 | ||
50cf2032 YW |
679 | if (JOURNAL_HEADER_CONTAINS(f->header, tail_entry_array_n_entries) && |
680 | le32toh(f->header->tail_entry_array_n_entries) > n_entries) | |
206f0f39 LP |
681 | return -ENODATA; |
682 | ||
4374d7ea | 683 | if (journal_file_writable(f)) { |
cec736d2 | 684 | sd_id128_t machine_id; |
ae739cc1 | 685 | uint8_t state; |
cec736d2 LP |
686 | int r; |
687 | ||
688 | r = sd_id128_get_machine(&machine_id); | |
c5ed77b2 | 689 | if (ERRNO_IS_NEG_MACHINE_ID_UNSET(r)) /* Gracefully handle the machine ID not being initialized yet */ |
07f1c7aa | 690 | machine_id = SD_ID128_NULL; |
c5ed77b2 ZJS |
691 | else if (r < 0) |
692 | return r; | |
cec736d2 LP |
693 | |
694 | if (!sd_id128_equal(machine_id, f->header->machine_id)) | |
2308cf7c LP |
695 | return log_debug_errno(SYNTHETIC_ERRNO(EHOSTDOWN), |
696 | "Trying to open journal file from different host for writing, refusing."); | |
cec736d2 | 697 | |
de190aef | 698 | state = f->header->state; |
cec736d2 | 699 | |
b288cdeb ZJS |
700 | if (state == STATE_ARCHIVED) |
701 | return -ESHUTDOWN; /* Already archived */ | |
0e833026 | 702 | if (state == STATE_ONLINE) |
baaa35ad ZJS |
703 | return log_debug_errno(SYNTHETIC_ERRNO(EBUSY), |
704 | "Journal file %s is already online. Assuming unclean closing.", | |
705 | f->path); | |
0e833026 | 706 | if (state != STATE_OFFLINE) |
baaa35ad ZJS |
707 | return log_debug_errno(SYNTHETIC_ERRNO(EBUSY), |
708 | "Journal file %s has unknown state %i.", | |
709 | f->path, state); | |
ae739cc1 | 710 | |
5b3cc0c8 YN |
711 | if (f->header->field_hash_table_size == 0 || f->header->data_hash_table_size == 0) |
712 | return -EBADMSG; | |
cec736d2 LP |
713 | } |
714 | ||
715 | return 0; | |
716 | } | |
717 | ||
28ca867a | 718 | int journal_file_fstat(JournalFile *f) { |
3cc44114 LP |
719 | int r; |
720 | ||
2678031a LP |
721 | assert(f); |
722 | assert(f->fd >= 0); | |
723 | ||
724 | if (fstat(f->fd, &f->last_stat) < 0) | |
725 | return -errno; | |
726 | ||
727 | f->last_stat_usec = now(CLOCK_MONOTONIC); | |
728 | ||
e9dd6984 | 729 | /* Refuse dealing with files that aren't regular */ |
3cc44114 LP |
730 | r = stat_verify_regular(&f->last_stat); |
731 | if (r < 0) | |
732 | return r; | |
8d6a4d33 | 733 | |
2678031a LP |
734 | /* Refuse appending to files that are already deleted */ |
735 | if (f->last_stat.st_nlink <= 0) | |
736 | return -EIDRM; | |
737 | ||
738 | return 0; | |
739 | } | |
740 | ||
cec736d2 | 741 | static int journal_file_allocate(JournalFile *f, uint64_t offset, uint64_t size) { |
893e0f8f | 742 | uint64_t old_size, new_size, old_header_size, old_arena_size; |
fec2aa2f | 743 | int r; |
cec736d2 LP |
744 | |
745 | assert(f); | |
c88cc6af | 746 | assert(f->header); |
cec736d2 | 747 | |
893e0f8f LP |
748 | /* We assume that this file is not sparse, and we know that for sure, since we always call |
749 | * posix_fallocate() ourselves */ | |
750 | ||
b39907c7 | 751 | if (size > PAGE_ALIGN_DOWN_U64(UINT64_MAX) - offset) |
893e0f8f | 752 | return -EINVAL; |
cec736d2 | 753 | |
c3bd54bf | 754 | if (mmap_cache_fd_got_sigbus(f->cache_fd)) |
fa6ac760 LP |
755 | return -EIO; |
756 | ||
893e0f8f LP |
757 | old_header_size = le64toh(READ_NOW(f->header->header_size)); |
758 | old_arena_size = le64toh(READ_NOW(f->header->arena_size)); | |
b39907c7 | 759 | if (old_arena_size > PAGE_ALIGN_DOWN_U64(UINT64_MAX) - old_header_size) |
893e0f8f LP |
760 | return -EBADMSG; |
761 | ||
762 | old_size = old_header_size + old_arena_size; | |
cec736d2 | 763 | |
b39907c7 | 764 | new_size = MAX(PAGE_ALIGN_U64(offset + size), old_header_size); |
bc85bfee | 765 | |
2678031a LP |
766 | if (new_size <= old_size) { |
767 | ||
768 | /* We already pre-allocated enough space, but before | |
769 | * we write to it, let's check with fstat() if the | |
770 | * file got deleted, in order make sure we don't throw | |
771 | * away the data immediately. Don't check fstat() for | |
772 | * all writes though, but only once ever 10s. */ | |
773 | ||
774 | if (f->last_stat_usec + LAST_STAT_REFRESH_USEC > now(CLOCK_MONOTONIC)) | |
775 | return 0; | |
776 | ||
777 | return journal_file_fstat(f); | |
778 | } | |
779 | ||
780 | /* Allocate more space. */ | |
cec736d2 | 781 | |
a676e665 | 782 | if (f->metrics.max_size > 0 && new_size > f->metrics.max_size) |
bc85bfee | 783 | return -E2BIG; |
cec736d2 | 784 | |
d06727ae DDM |
785 | /* Refuse to go over 4G in compact mode so offsets can be stored in 32-bit. */ |
786 | if (JOURNAL_HEADER_COMPACT(f->header) && new_size > UINT32_MAX) | |
787 | return -E2BIG; | |
788 | ||
a676e665 | 789 | if (new_size > f->metrics.min_size && f->metrics.keep_free > 0) { |
cec736d2 LP |
790 | struct statvfs svfs; |
791 | ||
792 | if (fstatvfs(f->fd, &svfs) >= 0) { | |
793 | uint64_t available; | |
794 | ||
ffee7b97 | 795 | available = LESS_BY(u64_multiply_safe(svfs.f_bfree, svfs.f_bsize), f->metrics.keep_free); |
cec736d2 LP |
796 | |
797 | if (new_size - old_size > available) | |
798 | return -E2BIG; | |
799 | } | |
800 | } | |
801 | ||
eda4b58b | 802 | /* Increase by larger blocks at once */ |
4dcaab9c | 803 | new_size = ROUND_UP(new_size, FILE_SIZE_INCREASE); |
eda4b58b LP |
804 | if (f->metrics.max_size > 0 && new_size > f->metrics.max_size) |
805 | new_size = f->metrics.max_size; | |
806 | ||
bc85bfee LP |
807 | /* Note that the glibc fallocate() fallback is very |
808 | inefficient, hence we try to minimize the allocation area | |
809 | as we can. */ | |
4c54768c IZ |
810 | r = posix_fallocate_loop(f->fd, old_size, new_size - old_size); |
811 | if (r < 0) | |
812 | return r; | |
cec736d2 | 813 | |
893e0f8f | 814 | f->header->arena_size = htole64(new_size - old_header_size); |
cec736d2 | 815 | |
2678031a | 816 | return journal_file_fstat(f); |
cec736d2 LP |
817 | } |
818 | ||
71139898 LP |
819 | static int journal_file_move_to( |
820 | JournalFile *f, | |
821 | ObjectType type, | |
822 | bool keep_always, | |
823 | uint64_t offset, | |
824 | uint64_t size, | |
258190a0 | 825 | void **ret) { |
71139898 | 826 | |
2678031a LP |
827 | int r; |
828 | ||
cec736d2 | 829 | assert(f); |
cec736d2 LP |
830 | assert(ret); |
831 | ||
31438511 YW |
832 | /* This function may clear, overwrite, or alter previously cached entries with the same type. After |
833 | * this function has been called, all previously read objects with the same type may be invalidated, | |
834 | * hence must be re-read before use. */ | |
df04b9ed | 835 | |
7762e02b LP |
836 | if (size <= 0) |
837 | return -EINVAL; | |
838 | ||
893e0f8f LP |
839 | if (size > UINT64_MAX - offset) |
840 | return -EBADMSG; | |
841 | ||
2a59ea54 | 842 | /* Avoid SIGBUS on invalid accesses */ |
4bbdcdb3 LP |
843 | if (offset + size > (uint64_t) f->last_stat.st_size) { |
844 | /* Hmm, out of range? Let's refresh the fstat() data | |
845 | * first, before we trust that check. */ | |
846 | ||
2678031a LP |
847 | r = journal_file_fstat(f); |
848 | if (r < 0) | |
849 | return r; | |
850 | ||
851 | if (offset + size > (uint64_t) f->last_stat.st_size) | |
4bbdcdb3 LP |
852 | return -EADDRNOTAVAIL; |
853 | } | |
854 | ||
1a25ab66 | 855 | return mmap_cache_fd_get(f->cache_fd, type_to_category(type), keep_always, offset, size, &f->last_stat, ret); |
cec736d2 LP |
856 | } |
857 | ||
e81710d3 | 858 | static uint64_t minimum_header_size(JournalFile *f, Object *o) { |
16e9f408 | 859 | |
b8e891e6 | 860 | static const uint64_t table[] = { |
909a6b87 YW |
861 | [OBJECT_DATA] = sizeof(DataObject), |
862 | [OBJECT_FIELD] = sizeof(FieldObject), | |
863 | [OBJECT_ENTRY] = sizeof(EntryObject), | |
864 | [OBJECT_DATA_HASH_TABLE] = sizeof(HashTableObject), | |
16e9f408 | 865 | [OBJECT_FIELD_HASH_TABLE] = sizeof(HashTableObject), |
909a6b87 YW |
866 | [OBJECT_ENTRY_ARRAY] = sizeof(EntryArrayObject), |
867 | [OBJECT_TAG] = sizeof(TagObject), | |
16e9f408 LP |
868 | }; |
869 | ||
e81710d3 DDM |
870 | assert(f); |
871 | assert(o); | |
872 | ||
873 | if (o->object.type == OBJECT_DATA) | |
874 | return journal_file_data_payload_offset(f); | |
875 | ||
16e9f408 LP |
876 | if (o->object.type >= ELEMENTSOF(table) || table[o->object.type] <= 0) |
877 | return sizeof(ObjectHeader); | |
878 | ||
879 | return table[o->object.type]; | |
880 | } | |
881 | ||
e81710d3 | 882 | static int check_object_header(JournalFile *f, Object *o, ObjectType type, uint64_t offset) { |
a30630f5 YW |
883 | uint64_t s; |
884 | ||
e81710d3 | 885 | assert(f); |
a30630f5 YW |
886 | assert(o); |
887 | ||
888 | s = le64toh(READ_NOW(o->object.size)); | |
889 | if (s == 0) | |
890 | return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), | |
891 | "Attempt to move to uninitialized object: %" PRIu64, | |
892 | offset); | |
893 | ||
894 | if (s < sizeof(ObjectHeader)) | |
895 | return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), | |
06762aa5 DDM |
896 | "Attempt to move to overly short object with size %"PRIu64": %" PRIu64, |
897 | s, offset); | |
a30630f5 | 898 | |
b72fd2af | 899 | if (o->object.type <= OBJECT_UNUSED || o->object.type >= _OBJECT_TYPE_MAX) |
a30630f5 | 900 | return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), |
b72fd2af YW |
901 | "Attempt to move to object with invalid type (%u): %" PRIu64, |
902 | o->object.type, offset); | |
a30630f5 YW |
903 | |
904 | if (type > OBJECT_UNUSED && o->object.type != type) | |
905 | return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), | |
06762aa5 DDM |
906 | "Found %s object while expecting %s object: %" PRIu64, |
907 | journal_object_type_to_string(o->object.type), | |
908 | journal_object_type_to_string(type), | |
a30630f5 YW |
909 | offset); |
910 | ||
e81710d3 | 911 | if (s < minimum_header_size(f, o)) |
a30630f5 | 912 | return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), |
06762aa5 DDM |
913 | "Size of %s object (%"PRIu64") is smaller than the minimum object size (%"PRIu64"): %" PRIu64, |
914 | journal_object_type_to_string(o->object.type), | |
915 | s, | |
916 | minimum_header_size(f, o), | |
a30630f5 YW |
917 | offset); |
918 | ||
919 | return 0; | |
920 | } | |
921 | ||
24754f36 TR |
922 | /* Lightweight object checks. We want this to be fast, so that we won't |
923 | * slowdown every journal_file_move_to_object() call too much. */ | |
99daf3ce | 924 | static int check_object(JournalFile *f, Object *o, uint64_t offset) { |
cc938e4a | 925 | assert(f); |
24754f36 TR |
926 | assert(o); |
927 | ||
928 | switch (o->object.type) { | |
929 | ||
a602d93e | 930 | case OBJECT_DATA: |
baaa35ad ZJS |
931 | if ((le64toh(o->data.entry_offset) == 0) ^ (le64toh(o->data.n_entries) == 0)) |
932 | return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), | |
06762aa5 | 933 | "Bad data n_entries: %" PRIu64 ": %" PRIu64, |
baaa35ad ZJS |
934 | le64toh(o->data.n_entries), |
935 | offset); | |
936 | ||
e81710d3 | 937 | if (le64toh(o->object.size) <= journal_file_data_payload_offset(f)) |
baaa35ad | 938 | return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), |
06762aa5 | 939 | "Bad data size (<= %zu): %" PRIu64 ": %" PRIu64, |
e81710d3 | 940 | journal_file_data_payload_offset(f), |
baaa35ad ZJS |
941 | le64toh(o->object.size), |
942 | offset); | |
24754f36 | 943 | |
10e8445b TR |
944 | if (!VALID64(le64toh(o->data.next_hash_offset)) || |
945 | !VALID64(le64toh(o->data.next_field_offset)) || | |
946 | !VALID64(le64toh(o->data.entry_offset)) || | |
baaa35ad ZJS |
947 | !VALID64(le64toh(o->data.entry_array_offset))) |
948 | return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), | |
949 | "Invalid offset, next_hash_offset=" OFSfmt ", next_field_offset=" OFSfmt ", entry_offset=" OFSfmt ", entry_array_offset=" OFSfmt ": %" PRIu64, | |
950 | le64toh(o->data.next_hash_offset), | |
951 | le64toh(o->data.next_field_offset), | |
952 | le64toh(o->data.entry_offset), | |
953 | le64toh(o->data.entry_array_offset), | |
954 | offset); | |
24754f36 TR |
955 | |
956 | break; | |
24754f36 TR |
957 | |
958 | case OBJECT_FIELD: | |
3a8099a8 | 959 | if (le64toh(o->object.size) <= offsetof(Object, field.payload)) |
baaa35ad ZJS |
960 | return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), |
961 | "Bad field size (<= %zu): %" PRIu64 ": %" PRIu64, | |
3a8099a8 | 962 | offsetof(Object, field.payload), |
baaa35ad ZJS |
963 | le64toh(o->object.size), |
964 | offset); | |
24754f36 | 965 | |
10e8445b | 966 | if (!VALID64(le64toh(o->field.next_hash_offset)) || |
baaa35ad ZJS |
967 | !VALID64(le64toh(o->field.head_data_offset))) |
968 | return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), | |
969 | "Invalid offset, next_hash_offset=" OFSfmt ", head_data_offset=" OFSfmt ": %" PRIu64, | |
970 | le64toh(o->field.next_hash_offset), | |
971 | le64toh(o->field.head_data_offset), | |
972 | offset); | |
24754f36 TR |
973 | break; |
974 | ||
893e0f8f LP |
975 | case OBJECT_ENTRY: { |
976 | uint64_t sz; | |
977 | ||
978 | sz = le64toh(READ_NOW(o->object.size)); | |
3a8099a8 | 979 | if (sz < offsetof(Object, entry.items) || |
a9089a66 | 980 | (sz - offsetof(Object, entry.items)) % journal_file_entry_item_size(f) != 0) |
baaa35ad ZJS |
981 | return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), |
982 | "Bad entry size (<= %zu): %" PRIu64 ": %" PRIu64, | |
3a8099a8 | 983 | offsetof(Object, entry.items), |
893e0f8f | 984 | sz, |
baaa35ad ZJS |
985 | offset); |
986 | ||
a9089a66 | 987 | if ((sz - offsetof(Object, entry.items)) / journal_file_entry_item_size(f) <= 0) |
baaa35ad ZJS |
988 | return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), |
989 | "Invalid number items in entry: %" PRIu64 ": %" PRIu64, | |
a9089a66 | 990 | (sz - offsetof(Object, entry.items)) / journal_file_entry_item_size(f), |
baaa35ad ZJS |
991 | offset); |
992 | ||
993 | if (le64toh(o->entry.seqnum) <= 0) | |
994 | return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), | |
995 | "Invalid entry seqnum: %" PRIx64 ": %" PRIu64, | |
996 | le64toh(o->entry.seqnum), | |
997 | offset); | |
998 | ||
999 | if (!VALID_REALTIME(le64toh(o->entry.realtime))) | |
1000 | return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), | |
1001 | "Invalid entry realtime timestamp: %" PRIu64 ": %" PRIu64, | |
1002 | le64toh(o->entry.realtime), | |
1003 | offset); | |
1004 | ||
1005 | if (!VALID_MONOTONIC(le64toh(o->entry.monotonic))) | |
1006 | return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), | |
1007 | "Invalid entry monotonic timestamp: %" PRIu64 ": %" PRIu64, | |
1008 | le64toh(o->entry.monotonic), | |
1009 | offset); | |
24754f36 | 1010 | |
c650d4cd YW |
1011 | if (sd_id128_is_null(o->entry.boot_id)) |
1012 | return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), | |
1013 | "Invalid object entry with an empty boot ID: %" PRIu64, | |
1014 | offset); | |
1015 | ||
24754f36 | 1016 | break; |
893e0f8f | 1017 | } |
24754f36 TR |
1018 | |
1019 | case OBJECT_DATA_HASH_TABLE: | |
893e0f8f LP |
1020 | case OBJECT_FIELD_HASH_TABLE: { |
1021 | uint64_t sz; | |
1022 | ||
1023 | sz = le64toh(READ_NOW(o->object.size)); | |
3a8099a8 DDM |
1024 | if (sz < offsetof(Object, hash_table.items) || |
1025 | (sz - offsetof(Object, hash_table.items)) % sizeof(HashItem) != 0 || | |
1026 | (sz - offsetof(Object, hash_table.items)) / sizeof(HashItem) <= 0) | |
baaa35ad ZJS |
1027 | return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), |
1028 | "Invalid %s hash table size: %" PRIu64 ": %" PRIu64, | |
06762aa5 | 1029 | journal_object_type_to_string(o->object.type), |
893e0f8f | 1030 | sz, |
baaa35ad | 1031 | offset); |
24754f36 TR |
1032 | |
1033 | break; | |
893e0f8f | 1034 | } |
24754f36 | 1035 | |
893e0f8f | 1036 | case OBJECT_ENTRY_ARRAY: { |
b5335da7 | 1037 | uint64_t sz, next; |
893e0f8f LP |
1038 | |
1039 | sz = le64toh(READ_NOW(o->object.size)); | |
3a8099a8 | 1040 | if (sz < offsetof(Object, entry_array.items) || |
99daf3ce DDM |
1041 | (sz - offsetof(Object, entry_array.items)) % journal_file_entry_array_item_size(f) != 0 || |
1042 | (sz - offsetof(Object, entry_array.items)) / journal_file_entry_array_item_size(f) <= 0) | |
baaa35ad ZJS |
1043 | return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), |
1044 | "Invalid object entry array size: %" PRIu64 ": %" PRIu64, | |
893e0f8f | 1045 | sz, |
baaa35ad | 1046 | offset); |
b5335da7 YW |
1047 | /* Here, we request that the offset of each entry array object is in strictly increasing order. */ |
1048 | next = le64toh(o->entry_array.next_entry_array_offset); | |
1049 | if (!VALID64(next) || (next > 0 && next <= offset)) | |
baaa35ad | 1050 | return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), |
b5335da7 YW |
1051 | "Invalid object entry array next_entry_array_offset: %" PRIu64 ": %" PRIu64, |
1052 | next, | |
baaa35ad | 1053 | offset); |
24754f36 TR |
1054 | |
1055 | break; | |
893e0f8f | 1056 | } |
24754f36 TR |
1057 | |
1058 | case OBJECT_TAG: | |
baaa35ad ZJS |
1059 | if (le64toh(o->object.size) != sizeof(TagObject)) |
1060 | return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), | |
1061 | "Invalid object tag size: %" PRIu64 ": %" PRIu64, | |
1062 | le64toh(o->object.size), | |
1063 | offset); | |
24754f36 | 1064 | |
baaa35ad ZJS |
1065 | if (!VALID_EPOCH(le64toh(o->tag.epoch))) |
1066 | return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), | |
1067 | "Invalid object tag epoch: %" PRIu64 ": %" PRIu64, | |
1068 | le64toh(o->tag.epoch), offset); | |
24754f36 TR |
1069 | |
1070 | break; | |
1071 | } | |
1072 | ||
1073 | return 0; | |
1074 | } | |
1075 | ||
78519831 | 1076 | int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset, Object **ret) { |
cec736d2 | 1077 | int r; |
cec736d2 | 1078 | Object *o; |
cec736d2 LP |
1079 | |
1080 | assert(f); | |
cec736d2 | 1081 | |
31438511 YW |
1082 | /* Even if this function fails, it may clear, overwrite, or alter previously cached entries with the |
1083 | * same type. After this function has been called, all previously read objects with the same type may | |
1084 | * be invalidated, hence must be re-read before use. */ | |
df04b9ed | 1085 | |
db11ac1a | 1086 | /* Objects may only be located at multiple of 64 bit */ |
baaa35ad ZJS |
1087 | if (!VALID64(offset)) |
1088 | return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), | |
da890466 | 1089 | "Attempt to move to %s object at non-64-bit boundary: %" PRIu64, |
06762aa5 | 1090 | journal_object_type_to_string(type), |
baaa35ad | 1091 | offset); |
db11ac1a | 1092 | |
50809d7a | 1093 | /* Object may not be located in the file header */ |
baaa35ad ZJS |
1094 | if (offset < le64toh(f->header->header_size)) |
1095 | return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), | |
06762aa5 DDM |
1096 | "Attempt to move to %s object located in file header: %" PRIu64, |
1097 | journal_object_type_to_string(type), | |
baaa35ad | 1098 | offset); |
50809d7a | 1099 | |
586fb79e | 1100 | r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), (void**) &o); |
cec736d2 LP |
1101 | if (r < 0) |
1102 | return r; | |
1103 | ||
e81710d3 | 1104 | r = check_object_header(f, o, type, offset); |
a30630f5 YW |
1105 | if (r < 0) |
1106 | return r; | |
baaa35ad | 1107 | |
a30630f5 YW |
1108 | r = journal_file_move_to(f, type, false, offset, le64toh(READ_NOW(o->object.size)), (void**) &o); |
1109 | if (r < 0) | |
1110 | return r; | |
cec736d2 | 1111 | |
e81710d3 | 1112 | r = check_object_header(f, o, type, offset); |
258190a0 VC |
1113 | if (r < 0) |
1114 | return r; | |
cec736d2 | 1115 | |
99daf3ce | 1116 | r = check_object(f, o, offset); |
24754f36 TR |
1117 | if (r < 0) |
1118 | return r; | |
1119 | ||
ded10e3a DDM |
1120 | if (ret) |
1121 | *ret = o; | |
1122 | ||
cec736d2 LP |
1123 | return 0; |
1124 | } | |
1125 | ||
a1b8d21f YW |
1126 | int journal_file_pin_object(JournalFile *f, Object *o) { |
1127 | assert(f); | |
1128 | assert(o); | |
1129 | ||
1130 | /* This attaches the mmap window that provides the object to the 'pinning' category. So, reading | |
1131 | * another object with the same type will not invalidate the object, until this function is called | |
1132 | * for another object. */ | |
1133 | return mmap_cache_fd_pin(f->cache_fd, type_to_category(o->object.type), o, le64toh(o->object.size)); | |
1134 | } | |
1135 | ||
e5d84733 | 1136 | int journal_file_read_object_header(JournalFile *f, ObjectType type, uint64_t offset, Object *ret) { |
e5d84733 LP |
1137 | ssize_t n; |
1138 | Object o; | |
1139 | int r; | |
117e2112 DDM |
1140 | |
1141 | assert(f); | |
117e2112 DDM |
1142 | |
1143 | /* Objects may only be located at multiple of 64 bit */ | |
1144 | if (!VALID64(offset)) | |
1145 | return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), | |
da890466 | 1146 | "Attempt to read %s object at non-64-bit boundary: %" PRIu64, |
06762aa5 | 1147 | journal_object_type_to_string(type), offset); |
117e2112 DDM |
1148 | |
1149 | /* Object may not be located in the file header */ | |
1150 | if (offset < le64toh(f->header->header_size)) | |
1151 | return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), | |
06762aa5 DDM |
1152 | "Attempt to read %s object located in file header: %" PRIu64, |
1153 | journal_object_type_to_string(type), offset); | |
117e2112 DDM |
1154 | |
1155 | /* This will likely read too much data but it avoids having to call pread() twice. */ | |
e5d84733 LP |
1156 | n = pread(f->fd, &o, sizeof(o), offset); |
1157 | if (n < 0) | |
06762aa5 DDM |
1158 | return log_debug_errno(errno, "Failed to read journal %s object at offset: %" PRIu64, |
1159 | journal_object_type_to_string(type), offset); | |
117e2112 | 1160 | |
e5d84733 LP |
1161 | if ((size_t) n < sizeof(o.object)) |
1162 | return log_debug_errno(SYNTHETIC_ERRNO(EIO), | |
06762aa5 DDM |
1163 | "Failed to read short %s object at offset: %" PRIu64, |
1164 | journal_object_type_to_string(type), offset); | |
117e2112 | 1165 | |
e81710d3 | 1166 | r = check_object_header(f, &o, type, offset); |
a30630f5 YW |
1167 | if (r < 0) |
1168 | return r; | |
117e2112 | 1169 | |
e81710d3 | 1170 | if ((size_t) n < minimum_header_size(f, &o)) |
e5d84733 | 1171 | return log_debug_errno(SYNTHETIC_ERRNO(EIO), |
06762aa5 DDM |
1172 | "Short read while reading %s object: %" PRIu64, |
1173 | journal_object_type_to_string(type), offset); | |
e5d84733 | 1174 | |
99daf3ce | 1175 | r = check_object(f, &o, offset); |
117e2112 DDM |
1176 | if (r < 0) |
1177 | return r; | |
1178 | ||
ded10e3a DDM |
1179 | if (ret) |
1180 | *ret = o; | |
1181 | ||
117e2112 DDM |
1182 | return 0; |
1183 | } | |
1184 | ||
a133189e LP |
/* Returns the sequence number following 'seqnum'. When we run out of sequence numbers we start again
 * at 1, skipping over both UINT64_MAX and 0. */
static uint64_t inc_seqnum(uint64_t seqnum) {
        return seqnum < UINT64_MAX-1 ? seqnum + 1 : 1;
}
1191 | ||
0eaee828 LP |
1192 | static uint64_t journal_file_entry_seqnum( |
1193 | JournalFile *f, | |
1194 | uint64_t *seqnum) { | |
1195 | ||
a133189e | 1196 | uint64_t next_seqnum; |
cec736d2 LP |
1197 | |
1198 | assert(f); | |
c88cc6af | 1199 | assert(f->header); |
cec736d2 | 1200 | |
0eaee828 LP |
1201 | /* Picks a new sequence number for the entry we are about to add and returns it. */ |
1202 | ||
a133189e | 1203 | next_seqnum = inc_seqnum(le64toh(f->header->tail_entry_seqnum)); |
c2373f84 | 1204 | |
a133189e LP |
1205 | /* If an external seqnum counter was passed, we update both the local and the external one, and set |
1206 | * it to the maximum of both */ | |
1207 | if (seqnum) | |
1208 | *seqnum = next_seqnum = MAX(inc_seqnum(*seqnum), next_seqnum); | |
c2373f84 | 1209 | |
a133189e | 1210 | f->header->tail_entry_seqnum = htole64(next_seqnum); |
cec736d2 | 1211 | |
beec0085 | 1212 | if (f->header->head_entry_seqnum == 0) |
a133189e | 1213 | f->header->head_entry_seqnum = htole64(next_seqnum); |
de190aef | 1214 | |
a133189e | 1215 | return next_seqnum; |
0eaee828 LP |
1216 | } |
1217 | ||
f4474e00 LP |
1218 | int journal_file_append_object( |
1219 | JournalFile *f, | |
1220 | ObjectType type, | |
1221 | uint64_t size, | |
cc938e4a | 1222 | Object **ret_object, |
f4474e00 LP |
1223 | uint64_t *ret_offset) { |
1224 | ||
cec736d2 LP |
1225 | int r; |
1226 | uint64_t p; | |
ab6e257b | 1227 | Object *o; |
cec736d2 LP |
1228 | |
1229 | assert(f); | |
c88cc6af | 1230 | assert(f->header); |
d05089d8 | 1231 | assert(type > OBJECT_UNUSED && type < _OBJECT_TYPE_MAX); |
cec736d2 | 1232 | assert(size >= sizeof(ObjectHeader)); |
cec736d2 | 1233 | |
26687bf8 OS |
1234 | r = journal_file_set_online(f); |
1235 | if (r < 0) | |
1236 | return r; | |
1237 | ||
e6d4a110 | 1238 | r = journal_file_tail_end_by_mmap(f, &p); |
ab6e257b DDM |
1239 | if (r < 0) |
1240 | return r; | |
cec736d2 LP |
1241 | |
1242 | r = journal_file_allocate(f, p, size); | |
1243 | if (r < 0) | |
1244 | return r; | |
1245 | ||
586fb79e | 1246 | r = journal_file_move_to(f, type, false, p, size, (void**) &o); |
cec736d2 LP |
1247 | if (r < 0) |
1248 | return r; | |
1249 | ||
71139898 LP |
1250 | o->object = (ObjectHeader) { |
1251 | .type = type, | |
1252 | .size = htole64(size), | |
1253 | }; | |
cec736d2 LP |
1254 | |
1255 | f->header->tail_object_offset = htole64(p); | |
cec736d2 LP |
1256 | f->header->n_objects = htole64(le64toh(f->header->n_objects) + 1); |
1257 | ||
cc938e4a YW |
1258 | if (ret_object) |
1259 | *ret_object = o; | |
f4474e00 LP |
1260 | |
1261 | if (ret_offset) | |
1262 | *ret_offset = p; | |
cec736d2 LP |
1263 | |
1264 | return 0; | |
1265 | } | |
1266 | ||
de190aef | 1267 | static int journal_file_setup_data_hash_table(JournalFile *f) { |
cec736d2 LP |
1268 | uint64_t s, p; |
1269 | Object *o; | |
1270 | int r; | |
1271 | ||
1272 | assert(f); | |
c88cc6af | 1273 | assert(f->header); |
cec736d2 | 1274 | |
070052ab LP |
1275 | /* We estimate that we need 1 hash table entry per 768 bytes |
1276 | of journal file and we want to make sure we never get | |
1277 | beyond 75% fill level. Calculate the hash table size for | |
1278 | the maximum file size based on these metrics. */ | |
4a92baf3 | 1279 | |
dfabe643 | 1280 | s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem); |
4a92baf3 LP |
1281 | if (s < DEFAULT_DATA_HASH_TABLE_SIZE) |
1282 | s = DEFAULT_DATA_HASH_TABLE_SIZE; | |
1283 | ||
5030c85a | 1284 | log_debug("Reserving %"PRIu64" entries in data hash table.", s / sizeof(HashItem)); |
4a92baf3 | 1285 | |
de190aef LP |
1286 | r = journal_file_append_object(f, |
1287 | OBJECT_DATA_HASH_TABLE, | |
1288 | offsetof(Object, hash_table.items) + s, | |
1289 | &o, &p); | |
cec736d2 LP |
1290 | if (r < 0) |
1291 | return r; | |
1292 | ||
29804cc1 | 1293 | memzero(o->hash_table.items, s); |
cec736d2 | 1294 | |
de190aef LP |
1295 | f->header->data_hash_table_offset = htole64(p + offsetof(Object, hash_table.items)); |
1296 | f->header->data_hash_table_size = htole64(s); | |
cec736d2 LP |
1297 | |
1298 | return 0; | |
1299 | } | |
1300 | ||
de190aef | 1301 | static int journal_file_setup_field_hash_table(JournalFile *f) { |
cec736d2 LP |
1302 | uint64_t s, p; |
1303 | Object *o; | |
1304 | int r; | |
1305 | ||
1306 | assert(f); | |
c88cc6af | 1307 | assert(f->header); |
cec736d2 | 1308 | |
3c1668da LP |
1309 | /* We use a fixed size hash table for the fields as this |
1310 | * number should grow very slowly only */ | |
1311 | ||
de190aef | 1312 | s = DEFAULT_FIELD_HASH_TABLE_SIZE; |
5030c85a LP |
1313 | log_debug("Reserving %"PRIu64" entries in field hash table.", s / sizeof(HashItem)); |
1314 | ||
de190aef LP |
1315 | r = journal_file_append_object(f, |
1316 | OBJECT_FIELD_HASH_TABLE, | |
1317 | offsetof(Object, hash_table.items) + s, | |
1318 | &o, &p); | |
cec736d2 LP |
1319 | if (r < 0) |
1320 | return r; | |
1321 | ||
29804cc1 | 1322 | memzero(o->hash_table.items, s); |
cec736d2 | 1323 | |
de190aef LP |
1324 | f->header->field_hash_table_offset = htole64(p + offsetof(Object, hash_table.items)); |
1325 | f->header->field_hash_table_size = htole64(s); | |
cec736d2 LP |
1326 | |
1327 | return 0; | |
1328 | } | |
1329 | ||
dade37d4 | 1330 | int journal_file_map_data_hash_table(JournalFile *f) { |
cec736d2 LP |
1331 | uint64_t s, p; |
1332 | void *t; | |
1333 | int r; | |
1334 | ||
1335 | assert(f); | |
c88cc6af | 1336 | assert(f->header); |
cec736d2 | 1337 | |
dade37d4 LP |
1338 | if (f->data_hash_table) |
1339 | return 0; | |
1340 | ||
de190aef LP |
1341 | p = le64toh(f->header->data_hash_table_offset); |
1342 | s = le64toh(f->header->data_hash_table_size); | |
cec736d2 | 1343 | |
de190aef | 1344 | r = journal_file_move_to(f, |
16e9f408 | 1345 | OBJECT_DATA_HASH_TABLE, |
fcde2389 | 1346 | true, |
de190aef | 1347 | p, s, |
258190a0 | 1348 | &t); |
cec736d2 LP |
1349 | if (r < 0) |
1350 | return r; | |
1351 | ||
de190aef | 1352 | f->data_hash_table = t; |
cec736d2 LP |
1353 | return 0; |
1354 | } | |
1355 | ||
dade37d4 | 1356 | int journal_file_map_field_hash_table(JournalFile *f) { |
cec736d2 LP |
1357 | uint64_t s, p; |
1358 | void *t; | |
1359 | int r; | |
1360 | ||
1361 | assert(f); | |
c88cc6af | 1362 | assert(f->header); |
cec736d2 | 1363 | |
dade37d4 LP |
1364 | if (f->field_hash_table) |
1365 | return 0; | |
1366 | ||
de190aef LP |
1367 | p = le64toh(f->header->field_hash_table_offset); |
1368 | s = le64toh(f->header->field_hash_table_size); | |
cec736d2 | 1369 | |
de190aef | 1370 | r = journal_file_move_to(f, |
16e9f408 | 1371 | OBJECT_FIELD_HASH_TABLE, |
fcde2389 | 1372 | true, |
de190aef | 1373 | p, s, |
258190a0 | 1374 | &t); |
cec736d2 LP |
1375 | if (r < 0) |
1376 | return r; | |
1377 | ||
de190aef | 1378 | f->field_hash_table = t; |
cec736d2 LP |
1379 | return 0; |
1380 | } | |
1381 | ||
3c1668da LP |
1382 | static int journal_file_link_field( |
1383 | JournalFile *f, | |
1384 | Object *o, | |
1385 | uint64_t offset, | |
1386 | uint64_t hash) { | |
1387 | ||
805d1486 | 1388 | uint64_t p, h, m; |
3c1668da LP |
1389 | int r; |
1390 | ||
1391 | assert(f); | |
c88cc6af | 1392 | assert(f->header); |
90d222c1 | 1393 | assert(f->field_hash_table); |
3c1668da LP |
1394 | assert(o); |
1395 | assert(offset > 0); | |
1396 | ||
1397 | if (o->object.type != OBJECT_FIELD) | |
1398 | return -EINVAL; | |
1399 | ||
893e0f8f | 1400 | m = le64toh(READ_NOW(f->header->field_hash_table_size)) / sizeof(HashItem); |
805d1486 LP |
1401 | if (m <= 0) |
1402 | return -EBADMSG; | |
3c1668da | 1403 | |
805d1486 | 1404 | /* This might alter the window we are looking at */ |
3c1668da LP |
1405 | o->field.next_hash_offset = o->field.head_data_offset = 0; |
1406 | ||
805d1486 | 1407 | h = hash % m; |
3c1668da LP |
1408 | p = le64toh(f->field_hash_table[h].tail_hash_offset); |
1409 | if (p == 0) | |
1410 | f->field_hash_table[h].head_hash_offset = htole64(offset); | |
1411 | else { | |
1412 | r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o); | |
1413 | if (r < 0) | |
1414 | return r; | |
1415 | ||
1416 | o->field.next_hash_offset = htole64(offset); | |
1417 | } | |
1418 | ||
1419 | f->field_hash_table[h].tail_hash_offset = htole64(offset); | |
1420 | ||
1421 | if (JOURNAL_HEADER_CONTAINS(f->header, n_fields)) | |
1422 | f->header->n_fields = htole64(le64toh(f->header->n_fields) + 1); | |
1423 | ||
1424 | return 0; | |
1425 | } | |
1426 | ||
1427 | static int journal_file_link_data( | |
1428 | JournalFile *f, | |
1429 | Object *o, | |
1430 | uint64_t offset, | |
1431 | uint64_t hash) { | |
1432 | ||
805d1486 | 1433 | uint64_t p, h, m; |
cec736d2 LP |
1434 | int r; |
1435 | ||
1436 | assert(f); | |
c88cc6af | 1437 | assert(f->header); |
90d222c1 | 1438 | assert(f->data_hash_table); |
cec736d2 LP |
1439 | assert(o); |
1440 | assert(offset > 0); | |
b588975f LP |
1441 | |
1442 | if (o->object.type != OBJECT_DATA) | |
1443 | return -EINVAL; | |
cec736d2 | 1444 | |
893e0f8f | 1445 | m = le64toh(READ_NOW(f->header->data_hash_table_size)) / sizeof(HashItem); |
805d1486 LP |
1446 | if (m <= 0) |
1447 | return -EBADMSG; | |
48496df6 | 1448 | |
805d1486 | 1449 | /* This might alter the window we are looking at */ |
de190aef LP |
1450 | o->data.next_hash_offset = o->data.next_field_offset = 0; |
1451 | o->data.entry_offset = o->data.entry_array_offset = 0; | |
1452 | o->data.n_entries = 0; | |
cec736d2 | 1453 | |
805d1486 | 1454 | h = hash % m; |
8db4213e | 1455 | p = le64toh(f->data_hash_table[h].tail_hash_offset); |
3c1668da | 1456 | if (p == 0) |
cec736d2 | 1457 | /* Only entry in the hash table is easy */ |
de190aef | 1458 | f->data_hash_table[h].head_hash_offset = htole64(offset); |
3c1668da | 1459 | else { |
48496df6 LP |
1460 | /* Move back to the previous data object, to patch in |
1461 | * pointer */ | |
cec736d2 | 1462 | |
de190aef | 1463 | r = journal_file_move_to_object(f, OBJECT_DATA, p, &o); |
cec736d2 LP |
1464 | if (r < 0) |
1465 | return r; | |
1466 | ||
de190aef | 1467 | o->data.next_hash_offset = htole64(offset); |
cec736d2 LP |
1468 | } |
1469 | ||
de190aef | 1470 | f->data_hash_table[h].tail_hash_offset = htole64(offset); |
cec736d2 | 1471 | |
dca6219e LP |
1472 | if (JOURNAL_HEADER_CONTAINS(f->header, n_data)) |
1473 | f->header->n_data = htole64(le64toh(f->header->n_data) + 1); | |
1474 | ||
cec736d2 LP |
1475 | return 0; |
1476 | } | |
1477 | ||
8a77f21d | 1478 | static int get_next_hash_offset( |
0dbe57ee LP |
1479 | JournalFile *f, |
1480 | uint64_t *p, | |
1481 | le64_t *next_hash_offset, | |
1482 | uint64_t *depth, | |
1483 | le64_t *header_max_depth) { | |
1484 | ||
1485 | uint64_t nextp; | |
1486 | ||
cc938e4a YW |
1487 | assert(f); |
1488 | assert(p); | |
1489 | assert(next_hash_offset); | |
1490 | assert(depth); | |
1491 | ||
0dbe57ee LP |
1492 | nextp = le64toh(READ_NOW(*next_hash_offset)); |
1493 | if (nextp > 0) { | |
1494 | if (nextp <= *p) /* Refuse going in loops */ | |
1495 | return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), | |
1496 | "Detected hash item loop in %s, refusing.", f->path); | |
1497 | ||
1498 | (*depth)++; | |
1499 | ||
1500 | /* If the depth of this hash chain is larger than all others we have seen so far, record it */ | |
4374d7ea | 1501 | if (header_max_depth && journal_file_writable(f)) |
0dbe57ee LP |
1502 | *header_max_depth = htole64(MAX(*depth, le64toh(*header_max_depth))); |
1503 | } | |
1504 | ||
1505 | *p = nextp; | |
1506 | return 0; | |
1507 | } | |
1508 | ||
3c1668da LP |
1509 | int journal_file_find_field_object_with_hash( |
1510 | JournalFile *f, | |
cc938e4a YW |
1511 | const void *field, |
1512 | uint64_t size, | |
1513 | uint64_t hash, | |
1514 | Object **ret_object, | |
1515 | uint64_t *ret_offset) { | |
3c1668da | 1516 | |
0dbe57ee | 1517 | uint64_t p, osize, h, m, depth = 0; |
3c1668da LP |
1518 | int r; |
1519 | ||
1520 | assert(f); | |
c88cc6af | 1521 | assert(f->header); |
cc938e4a YW |
1522 | assert(field); |
1523 | assert(size > 0); | |
3c1668da | 1524 | |
dade37d4 LP |
1525 | /* If the field hash table is empty, we can't find anything */ |
1526 | if (le64toh(f->header->field_hash_table_size) <= 0) | |
1527 | return 0; | |
1528 | ||
1529 | /* Map the field hash table, if it isn't mapped yet. */ | |
1530 | r = journal_file_map_field_hash_table(f); | |
1531 | if (r < 0) | |
1532 | return r; | |
1533 | ||
3c1668da LP |
1534 | osize = offsetof(Object, field.payload) + size; |
1535 | ||
893e0f8f | 1536 | m = le64toh(READ_NOW(f->header->field_hash_table_size)) / sizeof(HashItem); |
805d1486 | 1537 | if (m <= 0) |
3c1668da LP |
1538 | return -EBADMSG; |
1539 | ||
805d1486 | 1540 | h = hash % m; |
3c1668da | 1541 | p = le64toh(f->field_hash_table[h].head_hash_offset); |
3c1668da LP |
1542 | while (p > 0) { |
1543 | Object *o; | |
1544 | ||
1545 | r = journal_file_move_to_object(f, OBJECT_FIELD, p, &o); | |
1546 | if (r < 0) | |
1547 | return r; | |
1548 | ||
1549 | if (le64toh(o->field.hash) == hash && | |
1550 | le64toh(o->object.size) == osize && | |
1551 | memcmp(o->field.payload, field, size) == 0) { | |
1552 | ||
cc938e4a YW |
1553 | if (ret_object) |
1554 | *ret_object = o; | |
f4474e00 LP |
1555 | if (ret_offset) |
1556 | *ret_offset = p; | |
3c1668da LP |
1557 | |
1558 | return 1; | |
1559 | } | |
1560 | ||
8a77f21d | 1561 | r = get_next_hash_offset( |
0dbe57ee LP |
1562 | f, |
1563 | &p, | |
1564 | &o->field.next_hash_offset, | |
1565 | &depth, | |
1566 | JOURNAL_HEADER_CONTAINS(f->header, field_hash_chain_depth) ? &f->header->field_hash_chain_depth : NULL); | |
1567 | if (r < 0) | |
1568 | return r; | |
3c1668da LP |
1569 | } |
1570 | ||
1571 | return 0; | |
1572 | } | |
1573 | ||
4ce534f4 LP |
1574 | uint64_t journal_file_hash_data( |
1575 | JournalFile *f, | |
1576 | const void *data, | |
1577 | size_t sz) { | |
1578 | ||
1579 | assert(f); | |
cc938e4a | 1580 | assert(f->header); |
4ce534f4 LP |
1581 | assert(data || sz == 0); |
1582 | ||
1583 | /* We try to unify our codebase on siphash, hence new-styled journal files utilizing the keyed hash | |
1584 | * function use siphash. Old journal files use the Jenkins hash. */ | |
1585 | ||
1586 | if (JOURNAL_HEADER_KEYED_HASH(f->header)) | |
1587 | return siphash24(data, sz, f->header->file_id.bytes); | |
1588 | ||
1589 | return jenkins_hash64(data, sz); | |
1590 | } | |
1591 | ||
3c1668da LP |
1592 | int journal_file_find_field_object( |
1593 | JournalFile *f, | |
cc938e4a YW |
1594 | const void *field, |
1595 | uint64_t size, | |
1596 | Object **ret_object, | |
1597 | uint64_t *ret_offset) { | |
3c1668da | 1598 | |
3c1668da | 1599 | assert(f); |
cc938e4a YW |
1600 | assert(field); |
1601 | assert(size > 0); | |
3c1668da | 1602 | |
f4474e00 LP |
1603 | return journal_file_find_field_object_with_hash( |
1604 | f, | |
4ce534f4 LP |
1605 | field, size, |
1606 | journal_file_hash_data(f, field, size), | |
cc938e4a | 1607 | ret_object, ret_offset); |
3c1668da LP |
1608 | } |
1609 | ||
de190aef LP |
1610 | int journal_file_find_data_object_with_hash( |
1611 | JournalFile *f, | |
cc938e4a YW |
1612 | const void *data, |
1613 | uint64_t size, | |
1614 | uint64_t hash, | |
1615 | Object **ret_object, | |
1616 | uint64_t *ret_offset) { | |
48496df6 | 1617 | |
0e35afff | 1618 | uint64_t p, h, m, depth = 0; |
cec736d2 LP |
1619 | int r; |
1620 | ||
1621 | assert(f); | |
c88cc6af | 1622 | assert(f->header); |
cec736d2 LP |
1623 | assert(data || size == 0); |
1624 | ||
dade37d4 LP |
1625 | /* If there's no data hash table, then there's no entry. */ |
1626 | if (le64toh(f->header->data_hash_table_size) <= 0) | |
1627 | return 0; | |
1628 | ||
1629 | /* Map the data hash table, if it isn't mapped yet. */ | |
1630 | r = journal_file_map_data_hash_table(f); | |
1631 | if (r < 0) | |
1632 | return r; | |
1633 | ||
893e0f8f | 1634 | m = le64toh(READ_NOW(f->header->data_hash_table_size)) / sizeof(HashItem); |
805d1486 | 1635 | if (m <= 0) |
bc85bfee LP |
1636 | return -EBADMSG; |
1637 | ||
805d1486 | 1638 | h = hash % m; |
de190aef | 1639 | p = le64toh(f->data_hash_table[h].head_hash_offset); |
cec736d2 | 1640 | |
de190aef LP |
1641 | while (p > 0) { |
1642 | Object *o; | |
0e35afff DDM |
1643 | void *d; |
1644 | size_t rsize; | |
cec736d2 | 1645 | |
de190aef | 1646 | r = journal_file_move_to_object(f, OBJECT_DATA, p, &o); |
cec736d2 LP |
1647 | if (r < 0) |
1648 | return r; | |
1649 | ||
807e17f0 | 1650 | if (le64toh(o->data.hash) != hash) |
85a131e8 | 1651 | goto next; |
807e17f0 | 1652 | |
0e35afff DDM |
1653 | r = journal_file_data_payload(f, o, p, NULL, 0, 0, &d, &rsize); |
1654 | if (r < 0) | |
1655 | return r; | |
1656 | assert(r > 0); /* journal_file_data_payload() always returns > 0 if no field is provided. */ | |
807e17f0 | 1657 | |
0e35afff | 1658 | if (memcmp_nn(data, size, d, rsize) == 0) { |
cc938e4a YW |
1659 | if (ret_object) |
1660 | *ret_object = o; | |
cec736d2 | 1661 | |
f4474e00 LP |
1662 | if (ret_offset) |
1663 | *ret_offset = p; | |
cec736d2 | 1664 | |
de190aef | 1665 | return 1; |
cec736d2 LP |
1666 | } |
1667 | ||
85a131e8 | 1668 | next: |
8a77f21d | 1669 | r = get_next_hash_offset( |
0dbe57ee LP |
1670 | f, |
1671 | &p, | |
1672 | &o->data.next_hash_offset, | |
1673 | &depth, | |
1674 | JOURNAL_HEADER_CONTAINS(f->header, data_hash_chain_depth) ? &f->header->data_hash_chain_depth : NULL); | |
1675 | if (r < 0) | |
1676 | return r; | |
cec736d2 LP |
1677 | } |
1678 | ||
de190aef LP |
1679 | return 0; |
1680 | } | |
1681 | ||
1682 | int journal_file_find_data_object( | |
1683 | JournalFile *f, | |
cc938e4a YW |
1684 | const void *data, |
1685 | uint64_t size, | |
1686 | Object **ret_object, | |
1687 | uint64_t *ret_offset) { | |
de190aef | 1688 | |
de190aef LP |
1689 | assert(f); |
1690 | assert(data || size == 0); | |
1691 | ||
f4474e00 LP |
1692 | return journal_file_find_data_object_with_hash( |
1693 | f, | |
4ce534f4 LP |
1694 | data, size, |
1695 | journal_file_hash_data(f, data, size), | |
cc938e4a | 1696 | ret_object, ret_offset); |
de190aef LP |
1697 | } |
1698 | ||
/* Checks whether the first 'l' bytes of 'p' form a valid journal field name. Pass SIZE_MAX as 'l' to
 * use strlen(p) instead.
 *
 * We kinda enforce POSIX syntax recommendations for environment variables here, but make a couple of
 * additional requirements:
 *
 *   - the name must be between 1 and 64 characters long,
 *   - it must not start with a digit,
 *   - it must not start with '_' unless 'allow_protected' is true,
 *   - it may consist of 'A'-'Z', '0'-'9' and '_' only.
 *
 * http://pubs.opengroup.org/onlinepubs/000095399/basedefs/xbd_chap08.html */
bool journal_field_valid(const char *p, size_t l, bool allow_protected) {
        assert(p);

        if (l == SIZE_MAX)
                l = strlen(p);

        /* No empty field names, and no names longer than 64 chars */
        if (l <= 0 || l > 64)
                return false;

        /* Variables starting with an underscore are protected */
        if (p[0] == '_' && !allow_protected)
                return false;

        /* Don't allow digits as first character */
        if (p[0] >= '0' && p[0] <= '9')
                return false;

        /* Only allow A-Z0-9 and '_' */
        for (size_t i = 0; i < l; i++) {
                char c = p[i];
                bool is_upper = c >= 'A' && c <= 'Z';
                bool is_digit = c >= '0' && c <= '9';

                if (!is_upper && !is_digit && c != '_')
                        return false;
        }

        return true;
}
1736 | ||
3c1668da LP |
1737 | static int journal_file_append_field( |
1738 | JournalFile *f, | |
cc938e4a YW |
1739 | const void *field, |
1740 | uint64_t size, | |
1741 | Object **ret_object, | |
1742 | uint64_t *ret_offset) { | |
3c1668da LP |
1743 | |
1744 | uint64_t hash, p; | |
1745 | uint64_t osize; | |
1746 | Object *o; | |
1747 | int r; | |
1748 | ||
1749 | assert(f); | |
cc938e4a YW |
1750 | assert(field); |
1751 | assert(size > 0); | |
3c1668da | 1752 | |
f2bd0320 YW |
1753 | if (!journal_field_valid(field, size, true)) |
1754 | return -EBADMSG; | |
1755 | ||
4ce534f4 | 1756 | hash = journal_file_hash_data(f, field, size); |
3c1668da | 1757 | |
cc938e4a | 1758 | r = journal_file_find_field_object_with_hash(f, field, size, hash, ret_object, ret_offset); |
3c1668da LP |
1759 | if (r < 0) |
1760 | return r; | |
ded10e3a | 1761 | if (r > 0) |
3c1668da | 1762 | return 0; |
3c1668da LP |
1763 | |
1764 | osize = offsetof(Object, field.payload) + size; | |
1765 | r = journal_file_append_object(f, OBJECT_FIELD, osize, &o, &p); | |
8c92d4bb LP |
1766 | if (r < 0) |
1767 | return r; | |
3c1668da LP |
1768 | |
1769 | o->field.hash = htole64(hash); | |
1770 | memcpy(o->field.payload, field, size); | |
1771 | ||
1772 | r = journal_file_link_field(f, o, p, hash); | |
1773 | if (r < 0) | |
1774 | return r; | |
1775 | ||
ded10e3a DDM |
1776 | /* The linking might have altered the window, so let's only pass the offset to hmac which will |
1777 | * move to the object again if needed. */ | |
3c1668da | 1778 | |
349cc4a5 | 1779 | #if HAVE_GCRYPT |
ded10e3a | 1780 | r = journal_file_hmac_put_object(f, OBJECT_FIELD, NULL, p); |
3c1668da LP |
1781 | if (r < 0) |
1782 | return r; | |
1783 | #endif | |
1784 | ||
cc938e4a YW |
1785 | if (ret_object) { |
1786 | r = journal_file_move_to_object(f, OBJECT_FIELD, p, ret_object); | |
ded10e3a DDM |
1787 | if (r < 0) |
1788 | return r; | |
1789 | } | |
3c1668da | 1790 | |
f4474e00 LP |
1791 | if (ret_offset) |
1792 | *ret_offset = p; | |
3c1668da LP |
1793 | |
1794 | return 0; | |
1795 | } | |
1796 | ||
bfeaa62d | 1797 | static int maybe_compress_payload(JournalFile *f, uint8_t *dst, const uint8_t *src, uint64_t size, size_t *rsize) { |
cc938e4a YW |
1798 | assert(f); |
1799 | assert(f->header); | |
1800 | ||
e81710d3 | 1801 | #if HAVE_COMPRESSION |
2360352e YW |
1802 | Compression c; |
1803 | int r; | |
1804 | ||
1805 | c = JOURNAL_FILE_COMPRESSION(f); | |
1806 | if (c == COMPRESSION_NONE || size < f->compress_threshold_bytes) | |
bfeaa62d | 1807 | return 0; |
2360352e | 1808 | |
bfeaa62d YW |
1809 | r = compress_blob(c, src, size, dst, size - 1, rsize); |
1810 | if (r < 0) | |
1811 | return log_debug_errno(r, "Failed to compress data object using %s, ignoring: %m", compression_to_string(c)); | |
e81710d3 | 1812 | |
2360352e YW |
1813 | log_debug("Compressed data object %"PRIu64" -> %zu using %s", size, *rsize, compression_to_string(c)); |
1814 | ||
bfeaa62d | 1815 | return 1; /* compressed */ |
2360352e | 1816 | #else |
bfeaa62d | 1817 | return 0; |
2360352e | 1818 | #endif |
e81710d3 DDM |
1819 | } |
1820 | ||
48496df6 LP |
1821 | static int journal_file_append_data( |
1822 | JournalFile *f, | |
cc938e4a YW |
1823 | const void *data, |
1824 | uint64_t size, | |
1825 | Object **ret_object, | |
1826 | uint64_t *ret_offset) { | |
48496df6 | 1827 | |
e81710d3 | 1828 | uint64_t hash, p, osize; |
bc6b326d | 1829 | Object *o, *fo; |
e81710d3 | 1830 | size_t rsize = 0; |
3c1668da | 1831 | const void *eq; |
e81710d3 | 1832 | int r; |
de190aef LP |
1833 | |
1834 | assert(f); | |
bc6b326d DDM |
1835 | |
1836 | if (!data || size == 0) | |
1837 | return -EINVAL; | |
de190aef | 1838 | |
4ce534f4 | 1839 | hash = journal_file_hash_data(f, data, size); |
de190aef | 1840 | |
cc938e4a | 1841 | r = journal_file_find_data_object_with_hash(f, data, size, hash, ret_object, ret_offset); |
de190aef LP |
1842 | if (r < 0) |
1843 | return r; | |
ded10e3a | 1844 | if (r > 0) |
de190aef | 1845 | return 0; |
de190aef | 1846 | |
bc6b326d DDM |
1847 | eq = memchr(data, '=', size); |
1848 | if (!eq) | |
1849 | return -EINVAL; | |
1850 | ||
e81710d3 | 1851 | osize = journal_file_data_payload_offset(f) + size; |
de190aef | 1852 | r = journal_file_append_object(f, OBJECT_DATA, osize, &o, &p); |
cec736d2 LP |
1853 | if (r < 0) |
1854 | return r; | |
1855 | ||
cec736d2 | 1856 | o->data.hash = htole64(hash); |
807e17f0 | 1857 | |
bfeaa62d YW |
1858 | r = maybe_compress_payload(f, journal_file_data_payload_field(f, o), data, size, &rsize); |
1859 | if (r <= 0) | |
1860 | /* We don't really care failures, let's continue without compression */ | |
1861 | memcpy_safe(journal_file_data_payload_field(f, o), data, size); | |
1862 | else { | |
1863 | Compression c = JOURNAL_FILE_COMPRESSION(f); | |
1864 | ||
1865 | assert(c >= 0 && c < _COMPRESSION_MAX && c != COMPRESSION_NONE); | |
807e17f0 | 1866 | |
e81710d3 DDM |
1867 | o->object.size = htole64(journal_file_data_payload_offset(f) + rsize); |
1868 | o->object.flags |= COMPRESSION_TO_OBJECT_FLAG(c); | |
bfeaa62d | 1869 | } |
cec736d2 | 1870 | |
de190aef | 1871 | r = journal_file_link_data(f, o, p, hash); |
cec736d2 LP |
1872 | if (r < 0) |
1873 | return r; | |
1874 | ||
ded10e3a DDM |
1875 | /* The linking might have altered the window, so let's refresh our pointer. */ |
1876 | r = journal_file_move_to_object(f, OBJECT_DATA, p, &o); | |
33685a5a FB |
1877 | if (r < 0) |
1878 | return r; | |
33685a5a | 1879 | |
ded10e3a DDM |
1880 | #if HAVE_GCRYPT |
1881 | r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p); | |
48496df6 LP |
1882 | if (r < 0) |
1883 | return r; | |
ded10e3a | 1884 | #endif |
48496df6 | 1885 | |
bc6b326d | 1886 | /* Create field object ... */ |
e81710d3 | 1887 | r = journal_file_append_field(f, data, (uint8_t*) eq - (uint8_t*) data, &fo, NULL); |
bc6b326d DDM |
1888 | if (r < 0) |
1889 | return r; | |
3c1668da | 1890 | |
bc6b326d DDM |
1891 | /* ... and link it in. */ |
1892 | o->data.next_field_offset = fo->field.head_data_offset; | |
1893 | fo->field.head_data_offset = le64toh(p); | |
3c1668da | 1894 | |
cc938e4a YW |
1895 | if (ret_object) |
1896 | *ret_object = o; | |
cec736d2 | 1897 | |
f4474e00 LP |
1898 | if (ret_offset) |
1899 | *ret_offset = p; | |
cec736d2 LP |
1900 | |
1901 | return 0; | |
1902 | } | |
1903 | ||
0e35afff DDM |
1904 | static int maybe_decompress_payload( |
1905 | JournalFile *f, | |
1906 | uint8_t *payload, | |
1907 | uint64_t size, | |
1908 | Compression compression, | |
1909 | const char *field, | |
1910 | size_t field_length, | |
1911 | size_t data_threshold, | |
1912 | void **ret_data, | |
1913 | size_t *ret_size) { | |
1914 | ||
cc938e4a YW |
1915 | assert(f); |
1916 | ||
da890466 | 1917 | /* We can't read objects larger than 4G on a 32-bit machine */ |
0e35afff DDM |
1918 | if ((uint64_t) (size_t) size != size) |
1919 | return -E2BIG; | |
1920 | ||
1921 | if (compression != COMPRESSION_NONE) { | |
1922 | #if HAVE_COMPRESSION | |
1923 | size_t rsize; | |
1924 | int r; | |
1925 | ||
1926 | if (field) { | |
1927 | r = decompress_startswith(compression, payload, size, &f->compress_buffer, field, | |
1928 | field_length, '='); | |
1929 | if (r < 0) | |
1930 | return log_debug_errno(r, | |
1931 | "Cannot decompress %s object of length %" PRIu64 ": %m", | |
1932 | compression_to_string(compression), | |
1933 | size); | |
1934 | if (r == 0) { | |
cc938e4a YW |
1935 | if (ret_data) |
1936 | *ret_data = NULL; | |
1937 | if (ret_size) | |
1938 | *ret_size = 0; | |
0e35afff DDM |
1939 | return 0; |
1940 | } | |
1941 | } | |
1942 | ||
1943 | r = decompress_blob(compression, payload, size, &f->compress_buffer, &rsize, 0); | |
1944 | if (r < 0) | |
1945 | return r; | |
1946 | ||
1947 | if (ret_data) | |
1948 | *ret_data = f->compress_buffer; | |
1949 | if (ret_size) | |
1950 | *ret_size = rsize; | |
1951 | #else | |
1952 | return -EPROTONOSUPPORT; | |
1953 | #endif | |
1954 | } else { | |
1955 | if (field && (size < field_length + 1 || memcmp(payload, field, field_length) != 0 || payload[field_length] != '=')) { | |
cc938e4a YW |
1956 | if (ret_data) |
1957 | *ret_data = NULL; | |
1958 | if (ret_size) | |
1959 | *ret_size = 0; | |
0e35afff DDM |
1960 | return 0; |
1961 | } | |
1962 | ||
1963 | if (ret_data) | |
1964 | *ret_data = payload; | |
1965 | if (ret_size) | |
1966 | *ret_size = (size_t) size; | |
1967 | } | |
1968 | ||
1969 | return 1; | |
1970 | } | |
1971 | ||
1972 | int journal_file_data_payload( | |
1973 | JournalFile *f, | |
1974 | Object *o, | |
1975 | uint64_t offset, | |
1976 | const char *field, | |
1977 | size_t field_length, | |
1978 | size_t data_threshold, | |
1979 | void **ret_data, | |
1980 | size_t *ret_size) { | |
1981 | ||
1982 | uint64_t size; | |
1983 | Compression c; | |
1984 | int r; | |
1985 | ||
cc938e4a | 1986 | assert(f); |
0e35afff | 1987 | assert(!field == (field_length == 0)); /* These must be specified together. */ |
0e35afff DDM |
1988 | |
1989 | if (!o) { | |
1990 | r = journal_file_move_to_object(f, OBJECT_DATA, offset, &o); | |
1991 | if (r < 0) | |
1992 | return r; | |
1993 | } | |
1994 | ||
1995 | size = le64toh(READ_NOW(o->object.size)); | |
e81710d3 | 1996 | if (size < journal_file_data_payload_offset(f)) |
0e35afff DDM |
1997 | return -EBADMSG; |
1998 | ||
e81710d3 | 1999 | size -= journal_file_data_payload_offset(f); |
0e35afff DDM |
2000 | |
2001 | c = COMPRESSION_FROM_OBJECT(o); | |
2002 | if (c < 0) | |
2003 | return -EPROTONOSUPPORT; | |
2004 | ||
e81710d3 DDM |
2005 | return maybe_decompress_payload(f, journal_file_data_payload_field(f, o), size, c, field, |
2006 | field_length, data_threshold, ret_data, ret_size); | |
0e35afff DDM |
2007 | } |
2008 | ||
a9089a66 | 2009 | uint64_t journal_file_entry_n_items(JournalFile *f, Object *o) { |
893e0f8f | 2010 | uint64_t sz; |
a9089a66 DDM |
2011 | |
2012 | assert(f); | |
cec736d2 | 2013 | assert(o); |
b588975f LP |
2014 | |
2015 | if (o->object.type != OBJECT_ENTRY) | |
2016 | return 0; | |
cec736d2 | 2017 | |
893e0f8f LP |
2018 | sz = le64toh(READ_NOW(o->object.size)); |
2019 | if (sz < offsetof(Object, entry.items)) | |
2020 | return 0; | |
2021 | ||
a9089a66 | 2022 | return (sz - offsetof(Object, entry.items)) / journal_file_entry_item_size(f); |
cec736d2 LP |
2023 | } |
2024 | ||
99daf3ce | 2025 | uint64_t journal_file_entry_array_n_items(JournalFile *f, Object *o) { |
893e0f8f LP |
2026 | uint64_t sz; |
2027 | ||
cc938e4a | 2028 | assert(f); |
de190aef | 2029 | assert(o); |
b588975f LP |
2030 | |
2031 | if (o->object.type != OBJECT_ENTRY_ARRAY) | |
2032 | return 0; | |
de190aef | 2033 | |
893e0f8f LP |
2034 | sz = le64toh(READ_NOW(o->object.size)); |
2035 | if (sz < offsetof(Object, entry_array.items)) | |
2036 | return 0; | |
2037 | ||
99daf3ce | 2038 | return (sz - offsetof(Object, entry_array.items)) / journal_file_entry_array_item_size(f); |
de190aef LP |
2039 | } |
2040 | ||
fb9a24b6 | 2041 | uint64_t journal_file_hash_table_n_items(Object *o) { |
893e0f8f LP |
2042 | uint64_t sz; |
2043 | ||
fb9a24b6 | 2044 | assert(o); |
b588975f | 2045 | |
ec2ce0c5 | 2046 | if (!IN_SET(o->object.type, OBJECT_DATA_HASH_TABLE, OBJECT_FIELD_HASH_TABLE)) |
b588975f | 2047 | return 0; |
fb9a24b6 | 2048 | |
893e0f8f LP |
2049 | sz = le64toh(READ_NOW(o->object.size)); |
2050 | if (sz < offsetof(Object, hash_table.items)) | |
2051 | return 0; | |
2052 | ||
2053 | return (sz - offsetof(Object, hash_table.items)) / sizeof(HashItem); | |
fb9a24b6 LP |
2054 | } |
2055 | ||
99daf3ce DDM |
2056 | static void write_entry_array_item(JournalFile *f, Object *o, uint64_t i, uint64_t p) { |
2057 | assert(f); | |
2058 | assert(o); | |
2059 | ||
2060 | if (JOURNAL_HEADER_COMPACT(f->header)) { | |
2061 | assert(p <= UINT32_MAX); | |
2062 | o->entry_array.items.compact[i] = htole32(p); | |
2063 | } else | |
2064 | o->entry_array.items.regular[i] = htole64(p); | |
2065 | } | |
2066 | ||
cc938e4a YW |
2067 | static int link_entry_into_array( |
2068 | JournalFile *f, | |
2069 | le64_t *first, | |
2070 | le64_t *idx, | |
2071 | le32_t *tail, | |
2072 | le32_t *tidx, | |
2073 | uint64_t p) { | |
2074 | ||
de190aef LP |
2075 | uint64_t n = 0, ap = 0, q, i, a, hidx; |
2076 | Object *o; | |
cc938e4a | 2077 | int r; |
de190aef | 2078 | |
cec736d2 | 2079 | assert(f); |
c88cc6af | 2080 | assert(f->header); |
de190aef LP |
2081 | assert(first); |
2082 | assert(idx); | |
2083 | assert(p > 0); | |
cec736d2 | 2084 | |
e81710d3 DDM |
2085 | a = tail ? le32toh(*tail) : le64toh(*first); |
2086 | hidx = le64toh(READ_NOW(*idx)); | |
2087 | i = tidx ? le32toh(READ_NOW(*tidx)) : hidx; | |
de190aef | 2088 | |
cc938e4a | 2089 | while (a > 0) { |
de190aef LP |
2090 | r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o); |
2091 | if (r < 0) | |
2092 | return r; | |
cec736d2 | 2093 | |
99daf3ce | 2094 | n = journal_file_entry_array_n_items(f, o); |
de190aef | 2095 | if (i < n) { |
99daf3ce | 2096 | write_entry_array_item(f, o, i, p); |
de190aef | 2097 | *idx = htole64(hidx + 1); |
e81710d3 DDM |
2098 | if (tidx) |
2099 | *tidx = htole32(le32toh(*tidx) + 1); | |
de190aef LP |
2100 | return 0; |
2101 | } | |
cec736d2 | 2102 | |
de190aef LP |
2103 | i -= n; |
2104 | ap = a; | |
2105 | a = le64toh(o->entry_array.next_entry_array_offset); | |
2106 | } | |
2107 | ||
2108 | if (hidx > n) | |
2109 | n = (hidx+1) * 2; | |
2110 | else | |
2111 | n = n * 2; | |
2112 | ||
2113 | if (n < 4) | |
2114 | n = 4; | |
2115 | ||
2116 | r = journal_file_append_object(f, OBJECT_ENTRY_ARRAY, | |
99daf3ce | 2117 | offsetof(Object, entry_array.items) + n * journal_file_entry_array_item_size(f), |
de190aef | 2118 | &o, &q); |
cec736d2 LP |
2119 | if (r < 0) |
2120 | return r; | |
2121 | ||
349cc4a5 | 2122 | #if HAVE_GCRYPT |
5996c7c2 | 2123 | r = journal_file_hmac_put_object(f, OBJECT_ENTRY_ARRAY, o, q); |
b0af6f41 LP |
2124 | if (r < 0) |
2125 | return r; | |
feb12d3e | 2126 | #endif |
b0af6f41 | 2127 | |
99daf3ce | 2128 | write_entry_array_item(f, o, i, p); |
cec736d2 | 2129 | |
de190aef | 2130 | if (ap == 0) |
7be3aa17 | 2131 | *first = htole64(q); |
cec736d2 | 2132 | else { |
de190aef | 2133 | r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, ap, &o); |
cec736d2 LP |
2134 | if (r < 0) |
2135 | return r; | |
2136 | ||
de190aef LP |
2137 | o->entry_array.next_entry_array_offset = htole64(q); |
2138 | } | |
cec736d2 | 2139 | |
e81710d3 DDM |
2140 | if (tail) |
2141 | *tail = htole32(q); | |
2142 | ||
2dee23eb LP |
2143 | if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays)) |
2144 | f->header->n_entry_arrays = htole64(le64toh(f->header->n_entry_arrays) + 1); | |
2145 | ||
de190aef | 2146 | *idx = htole64(hidx + 1); |
e81710d3 DDM |
2147 | if (tidx) |
2148 | *tidx = htole32(1); | |
de190aef LP |
2149 | |
2150 | return 0; | |
2151 | } | |
cec736d2 | 2152 | |
cc938e4a YW |
2153 | static int link_entry_into_array_plus_one( |
2154 | JournalFile *f, | |
2155 | le64_t *extra, | |
2156 | le64_t *first, | |
2157 | le64_t *idx, | |
2158 | le32_t *tail, | |
2159 | le32_t *tidx, | |
2160 | uint64_t p) { | |
de190aef | 2161 | |
893e0f8f | 2162 | uint64_t hidx; |
de190aef LP |
2163 | int r; |
2164 | ||
2165 | assert(f); | |
2166 | assert(extra); | |
2167 | assert(first); | |
2168 | assert(idx); | |
2169 | assert(p > 0); | |
2170 | ||
893e0f8f LP |
2171 | hidx = le64toh(READ_NOW(*idx)); |
2172 | if (hidx == UINT64_MAX) | |
2173 | return -EBADMSG; | |
2174 | if (hidx == 0) | |
de190aef LP |
2175 | *extra = htole64(p); |
2176 | else { | |
4fd052ae | 2177 | le64_t i; |
de190aef | 2178 | |
893e0f8f | 2179 | i = htole64(hidx - 1); |
e81710d3 | 2180 | r = link_entry_into_array(f, first, &i, tail, tidx, p); |
de190aef LP |
2181 | if (r < 0) |
2182 | return r; | |
cec736d2 LP |
2183 | } |
2184 | ||
893e0f8f | 2185 | *idx = htole64(hidx + 1); |
de190aef LP |
2186 | return 0; |
2187 | } | |
2188 | ||
8f710237 YW |
2189 | static int journal_file_link_entry_item(JournalFile *f, uint64_t offset, uint64_t p) { |
2190 | Object *o; | |
de190aef | 2191 | int r; |
bfbd5be0 | 2192 | |
de190aef | 2193 | assert(f); |
de190aef LP |
2194 | assert(offset > 0); |
2195 | ||
de190aef | 2196 | r = journal_file_move_to_object(f, OBJECT_DATA, p, &o); |
cec736d2 LP |
2197 | if (r < 0) |
2198 | return r; | |
2199 | ||
de190aef LP |
2200 | return link_entry_into_array_plus_one(f, |
2201 | &o->data.entry_offset, | |
2202 | &o->data.entry_array_offset, | |
2203 | &o->data.n_entries, | |
e81710d3 DDM |
2204 | JOURNAL_HEADER_COMPACT(f->header) ? &o->data.compact.tail_entry_array_offset : NULL, |
2205 | JOURNAL_HEADER_COMPACT(f->header) ? &o->data.compact.tail_entry_array_n_entries : NULL, | |
de190aef | 2206 | offset); |
cec736d2 LP |
2207 | } |
2208 | ||
a9089a66 DDM |
2209 | static int journal_file_link_entry( |
2210 | JournalFile *f, | |
2211 | Object *o, | |
2212 | uint64_t offset, | |
2213 | const EntryItem items[], | |
2214 | size_t n_items) { | |
2215 | ||
cec736d2 LP |
2216 | int r; |
2217 | ||
2218 | assert(f); | |
c88cc6af | 2219 | assert(f->header); |
cec736d2 LP |
2220 | assert(o); |
2221 | assert(offset > 0); | |
b588975f LP |
2222 | |
2223 | if (o->object.type != OBJECT_ENTRY) | |
2224 | return -EINVAL; | |
cec736d2 | 2225 | |
60c040f6 | 2226 | __atomic_thread_fence(__ATOMIC_SEQ_CST); |
b788cc23 | 2227 | |
cec736d2 | 2228 | /* Link up the entry itself */ |
de190aef LP |
2229 | r = link_entry_into_array(f, |
2230 | &f->header->entry_array_offset, | |
2231 | &f->header->n_entries, | |
e81710d3 DDM |
2232 | JOURNAL_HEADER_CONTAINS(f->header, tail_entry_array_offset) ? &f->header->tail_entry_array_offset : NULL, |
2233 | JOURNAL_HEADER_CONTAINS(f->header, tail_entry_array_n_entries) ? &f->header->tail_entry_array_n_entries : NULL, | |
de190aef LP |
2234 | offset); |
2235 | if (r < 0) | |
2236 | return r; | |
cec736d2 | 2237 | |
507f22bd | 2238 | /* log_debug("=> %s seqnr=%"PRIu64" n_entries=%"PRIu64, f->path, o->entry.seqnum, f->header->n_entries); */ |
cec736d2 | 2239 | |
de190aef | 2240 | if (f->header->head_entry_realtime == 0) |
0ac38b70 | 2241 | f->header->head_entry_realtime = o->entry.realtime; |
cec736d2 | 2242 | |
0ac38b70 | 2243 | f->header->tail_entry_realtime = o->entry.realtime; |
de190aef | 2244 | f->header->tail_entry_monotonic = o->entry.monotonic; |
45a6a2aa YW |
2245 | if (JOURNAL_HEADER_CONTAINS(f->header, tail_entry_offset)) |
2246 | f->header->tail_entry_offset = htole64(offset); | |
34af7494 | 2247 | f->newest_mtime = 0; /* we have a new tail entry now, explicitly invalidate newest boot id/timestamp info */ |
de190aef | 2248 | |
cec736d2 | 2249 | /* Link up the items */ |
a9089a66 | 2250 | for (uint64_t i = 0; i < n_items; i++) { |
df535364 DDM |
2251 | int k; |
2252 | ||
2253 | /* If we fail to link an entry item because we can't allocate a new entry array, don't fail | |
2254 | * immediately but try to link the other entry items since it might still be possible to link | |
2255 | * those if they don't require a new entry array to be allocated. */ | |
2256 | ||
8f710237 | 2257 | k = journal_file_link_entry_item(f, offset, items[i].object_offset); |
df535364 DDM |
2258 | if (k == -E2BIG) |
2259 | r = k; | |
2260 | else if (k < 0) | |
2261 | return k; | |
cec736d2 LP |
2262 | } |
2263 | ||
df535364 | 2264 | return r; |
cec736d2 LP |
2265 | } |
2266 | ||
a9089a66 DDM |
2267 | static void write_entry_item(JournalFile *f, Object *o, uint64_t i, const EntryItem *item) { |
2268 | assert(f); | |
2269 | assert(o); | |
2270 | assert(item); | |
2271 | ||
2272 | if (JOURNAL_HEADER_COMPACT(f->header)) { | |
2273 | assert(item->object_offset <= UINT32_MAX); | |
2274 | o->entry.items.compact[i].object_offset = htole32(item->object_offset); | |
2275 | } else { | |
2276 | o->entry.items.regular[i].object_offset = htole64(item->object_offset); | |
2277 | o->entry.items.regular[i].hash = htole64(item->hash); | |
2278 | } | |
2279 | } | |
2280 | ||
cec736d2 LP |
2281 | static int journal_file_append_entry_internal( |
2282 | JournalFile *f, | |
2283 | const dual_timestamp *ts, | |
d180c349 | 2284 | const sd_id128_t *boot_id, |
51ab0afe | 2285 | const sd_id128_t *machine_id, |
cec736d2 | 2286 | uint64_t xor_hash, |
cc938e4a YW |
2287 | const EntryItem items[], |
2288 | size_t n_items, | |
de190aef | 2289 | uint64_t *seqnum, |
e5d60d1b | 2290 | sd_id128_t *seqnum_id, |
cc938e4a YW |
2291 | Object **ret_object, |
2292 | uint64_t *ret_offset) { | |
2293 | ||
cec736d2 LP |
2294 | uint64_t np; |
2295 | uint64_t osize; | |
2296 | Object *o; | |
2297 | int r; | |
2298 | ||
2299 | assert(f); | |
c88cc6af | 2300 | assert(f->header); |
de190aef | 2301 | assert(ts); |
b761ae0f | 2302 | assert(boot_id); |
1eede158 | 2303 | assert(!sd_id128_is_null(*boot_id)); |
cc938e4a | 2304 | assert(items || n_items == 0); |
cec736d2 | 2305 | |
ce92dc27 LP |
2306 | if (f->strict_order) { |
2307 | /* If requested be stricter with ordering in this journal file, to make searching via | |
2308 | * bisection fully deterministic. This is an optional feature, so that if desired journal | |
2309 | * files can be written where the ordering is not strictly enforced (in which case bisection | |
2310 | * will yield *a* result, but not the *only* result, when searching for points in | |
2311 | * time). Strict ordering mode is enabled when journald originally writes the files, but | |
2312 | * might not necessarily be if other tools (the remoting tools for example) write journal | |
2313 | * files from combined sources. | |
2314 | * | |
2315 | * Typically, if any of the errors generated here are seen journald will just rotate the | |
2316 | * journal files and start anew. */ | |
2317 | ||
2318 | if (ts->realtime < le64toh(f->header->tail_entry_realtime)) | |
2319 | return log_debug_errno(SYNTHETIC_ERRNO(EREMCHG), | |
2320 | "Realtime timestamp %" PRIu64 " smaller than previous realtime " | |
2321 | "timestamp %" PRIu64 ", refusing entry.", | |
2322 | ts->realtime, le64toh(f->header->tail_entry_realtime)); | |
2323 | ||
b761ae0f | 2324 | if (sd_id128_equal(*boot_id, f->header->tail_entry_boot_id) && |
addcecf6 | 2325 | ts->monotonic < le64toh(f->header->tail_entry_monotonic)) |
2326 | return log_debug_errno( | |
2327 | SYNTHETIC_ERRNO(ENOTNAM), | |
2328 | "Monotonic timestamp %" PRIu64 | |
2329 | " smaller than previous monotonic timestamp %" PRIu64 | |
2330 | " while having the same boot ID, refusing entry.", | |
2331 | ts->monotonic, | |
2332 | le64toh(f->header->tail_entry_monotonic)); | |
ce92dc27 | 2333 | } |
1d8d483f | 2334 | |
e5d60d1b LP |
2335 | if (seqnum_id) { |
2336 | /* Settle the passed in sequence number ID */ | |
2337 | ||
2338 | if (sd_id128_is_null(*seqnum_id)) | |
2339 | *seqnum_id = f->header->seqnum_id; /* Caller has none assigned, then copy the one from the file */ | |
2340 | else if (!sd_id128_equal(*seqnum_id, f->header->seqnum_id)) { | |
2341 | /* Different seqnum IDs? We can't allow entries from multiple IDs end up in the same journal.*/ | |
2342 | if (le64toh(f->header->n_entries) == 0) | |
2343 | f->header->seqnum_id = *seqnum_id; /* Caller has one, and file so far has no entries, then copy the one from the caller */ | |
2344 | else | |
2345 | return log_debug_errno(SYNTHETIC_ERRNO(EILSEQ), | |
2346 | "Sequence number IDs don't match, refusing entry."); | |
2347 | } | |
2348 | } | |
2349 | ||
51ab0afe LP |
2350 | if (machine_id && sd_id128_is_null(f->header->machine_id)) |
2351 | /* Initialize machine ID when not set yet */ | |
2352 | f->header->machine_id = *machine_id; | |
2353 | ||
a9089a66 | 2354 | osize = offsetof(Object, entry.items) + (n_items * journal_file_entry_item_size(f)); |
cec736d2 | 2355 | |
de190aef | 2356 | r = journal_file_append_object(f, OBJECT_ENTRY, osize, &o, &np); |
cec736d2 LP |
2357 | if (r < 0) |
2358 | return r; | |
2359 | ||
d98cc1f2 | 2360 | o->entry.seqnum = htole64(journal_file_entry_seqnum(f, seqnum)); |
de190aef LP |
2361 | o->entry.realtime = htole64(ts->realtime); |
2362 | o->entry.monotonic = htole64(ts->monotonic); | |
cec736d2 | 2363 | o->entry.xor_hash = htole64(xor_hash); |
b761ae0f | 2364 | o->entry.boot_id = f->header->tail_entry_boot_id = *boot_id; |
cec736d2 | 2365 | |
a9089a66 DDM |
2366 | for (size_t i = 0; i < n_items; i++) |
2367 | write_entry_item(f, o, i, &items[i]); | |
2368 | ||
349cc4a5 | 2369 | #if HAVE_GCRYPT |
5996c7c2 | 2370 | r = journal_file_hmac_put_object(f, OBJECT_ENTRY, o, np); |
b0af6f41 | 2371 | if (r < 0) |
b41b682b | 2372 | return r; |
feb12d3e | 2373 | #endif |
b0af6f41 | 2374 | |
a9089a66 | 2375 | r = journal_file_link_entry(f, o, np, items, n_items); |
cec736d2 | 2376 | if (r < 0) |
b41b682b | 2377 | return r; |
cec736d2 | 2378 | |
cc938e4a YW |
2379 | if (ret_object) |
2380 | *ret_object = o; | |
cec736d2 | 2381 | |
f4474e00 LP |
2382 | if (ret_offset) |
2383 | *ret_offset = np; | |
cec736d2 | 2384 | |
0eaee828 | 2385 | return r; |
cec736d2 LP |
2386 | } |
2387 | ||
cf244689 | 2388 | void journal_file_post_change(JournalFile *f) { |
50f20cfd LP |
2389 | assert(f); |
2390 | ||
c5236850 DT |
2391 | if (f->fd < 0) |
2392 | return; | |
2393 | ||
50f20cfd LP |
2394 | /* inotify() does not receive IN_MODIFY events from file |
2395 | * accesses done via mmap(). After each access we hence | |
2396 | * trigger IN_MODIFY by truncating the journal file to its | |
2397 | * current size which triggers IN_MODIFY. */ | |
2398 | ||
60c040f6 | 2399 | __atomic_thread_fence(__ATOMIC_SEQ_CST); |
bc85bfee | 2400 | |
50f20cfd | 2401 | if (ftruncate(f->fd, f->last_stat.st_size) < 0) |
e167d7fd | 2402 | log_debug_errno(errno, "Failed to truncate file to its own size: %m"); |
50f20cfd LP |
2403 | } |
2404 | ||
7a24f3bf VC |
2405 | static int post_change_thunk(sd_event_source *timer, uint64_t usec, void *userdata) { |
2406 | assert(userdata); | |
2407 | ||
2408 | journal_file_post_change(userdata); | |
2409 | ||
2410 | return 1; | |
2411 | } | |
2412 | ||
2413 | static void schedule_post_change(JournalFile *f) { | |
47f04c2a | 2414 | sd_event *e; |
b6cdfbe5 | 2415 | int r; |
7a24f3bf VC |
2416 | |
2417 | assert(f); | |
2418 | assert(f->post_change_timer); | |
2419 | ||
47f04c2a LP |
2420 | assert_se(e = sd_event_source_get_event(f->post_change_timer)); |
2421 | ||
288bd406 | 2422 | /* If we are already going down, post the change immediately. */ |
47f04c2a LP |
2423 | if (IN_SET(sd_event_get_state(e), SD_EVENT_EXITING, SD_EVENT_FINISHED)) |
2424 | goto fail; | |
2425 | ||
b6cdfbe5 | 2426 | r = sd_event_source_get_enabled(f->post_change_timer, NULL); |
7a24f3bf | 2427 | if (r < 0) { |
e167d7fd LP |
2428 | log_debug_errno(r, "Failed to get ftruncate timer state: %m"); |
2429 | goto fail; | |
7a24f3bf | 2430 | } |
b6cdfbe5 | 2431 | if (r > 0) |
7a24f3bf VC |
2432 | return; |
2433 | ||
39cf0351 | 2434 | r = sd_event_source_set_time_relative(f->post_change_timer, f->post_change_timer_period); |
7a24f3bf | 2435 | if (r < 0) { |
e167d7fd LP |
2436 | log_debug_errno(r, "Failed to set time for scheduling ftruncate: %m"); |
2437 | goto fail; | |
7a24f3bf VC |
2438 | } |
2439 | ||
ca5d90d4 | 2440 | r = sd_event_source_set_enabled(f->post_change_timer, SD_EVENT_ONESHOT); |
7a24f3bf | 2441 | if (r < 0) { |
e167d7fd LP |
2442 | log_debug_errno(r, "Failed to enable scheduled ftruncate: %m"); |
2443 | goto fail; | |
7a24f3bf | 2444 | } |
e167d7fd LP |
2445 | |
2446 | return; | |
2447 | ||
2448 | fail: | |
2449 | /* On failure, let's simply post the change immediately. */ | |
2450 | journal_file_post_change(f); | |
7a24f3bf VC |
2451 | } |
2452 | ||
2453 | /* Enable coalesced change posting in a timer on the provided sd_event instance */ | |
2454 | int journal_file_enable_post_change_timer(JournalFile *f, sd_event *e, usec_t t) { | |
2455 | _cleanup_(sd_event_source_unrefp) sd_event_source *timer = NULL; | |
2456 | int r; | |
2457 | ||
2458 | assert(f); | |
2459 | assert_return(!f->post_change_timer, -EINVAL); | |
2460 | assert(e); | |
2461 | assert(t); | |
2462 | ||
2463 | r = sd_event_add_time(e, &timer, CLOCK_MONOTONIC, 0, 0, post_change_thunk, f); | |
2464 | if (r < 0) | |
2465 | return r; | |
2466 | ||
2467 | r = sd_event_source_set_enabled(timer, SD_EVENT_OFF); | |
2468 | if (r < 0) | |
2469 | return r; | |
2470 | ||
1cc6c93a | 2471 | f->post_change_timer = TAKE_PTR(timer); |
7a24f3bf VC |
2472 | f->post_change_timer_period = t; |
2473 | ||
2474 | return r; | |
2475 | } | |
2476 | ||
93bab288 | 2477 | static int entry_item_cmp(const EntryItem *a, const EntryItem *b) { |
cc938e4a | 2478 | return CMP(ASSERT_PTR(a)->object_offset, ASSERT_PTR(b)->object_offset); |
1f2da9ec LP |
2479 | } |
2480 | ||
5ec9fbae | 2481 | static size_t remove_duplicate_entry_items(EntryItem items[], size_t n) { |
5ec9fbae DDM |
2482 | size_t j = 1; |
2483 | ||
cc938e4a YW |
2484 | assert(items || n == 0); |
2485 | ||
5ec9fbae DDM |
2486 | if (n <= 1) |
2487 | return n; | |
2488 | ||
2489 | for (size_t i = 1; i < n; i++) | |
2490 | if (items[i].object_offset != items[j - 1].object_offset) | |
2491 | items[j++] = items[i]; | |
2492 | ||
2493 | return j; | |
2494 | } | |
2495 | ||
d180c349 ZJS |
2496 | int journal_file_append_entry( |
2497 | JournalFile *f, | |
2498 | const dual_timestamp *ts, | |
2499 | const sd_id128_t *boot_id, | |
cc938e4a | 2500 | const struct iovec iovec[], |
b45a7215 | 2501 | size_t n_iovec, |
d180c349 | 2502 | uint64_t *seqnum, |
e5d60d1b | 2503 | sd_id128_t *seqnum_id, |
cc938e4a YW |
2504 | Object **ret_object, |
2505 | uint64_t *ret_offset) { | |
d180c349 | 2506 | |
52bcf45a | 2507 | _cleanup_free_ EntryItem *items_alloc = NULL; |
cec736d2 | 2508 | EntryItem *items; |
cec736d2 | 2509 | uint64_t xor_hash = 0; |
de190aef | 2510 | struct dual_timestamp _ts; |
51ab0afe | 2511 | sd_id128_t _boot_id, _machine_id, *machine_id; |
52bcf45a | 2512 | int r; |
cec736d2 LP |
2513 | |
2514 | assert(f); | |
c88cc6af | 2515 | assert(f->header); |
cc938e4a YW |
2516 | assert(iovec); |
2517 | assert(n_iovec > 0); | |
cec736d2 | 2518 | |
c6273953 | 2519 | if (ts) { |
baaa35ad ZJS |
2520 | if (!VALID_REALTIME(ts->realtime)) |
2521 | return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), | |
2522 | "Invalid realtime timestamp %" PRIu64 ", refusing entry.", | |
2523 | ts->realtime); | |
2524 | if (!VALID_MONOTONIC(ts->monotonic)) | |
2525 | return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), | |
2526 | "Invalid monotomic timestamp %" PRIu64 ", refusing entry.", | |
2527 | ts->monotonic); | |
c6273953 | 2528 | } else { |
fa5a0251 | 2529 | dual_timestamp_now(&_ts); |
de190aef LP |
2530 | ts = &_ts; |
2531 | } | |
2532 | ||
1eede158 YW |
2533 | if (boot_id) { |
2534 | if (sd_id128_is_null(*boot_id)) | |
2535 | return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), "Empty boot ID, refusing entry."); | |
2536 | } else { | |
bd524f49 LP |
2537 | r = sd_id128_get_boot(&_boot_id); |
2538 | if (r < 0) | |
2539 | return r; | |
2540 | ||
2541 | boot_id = &_boot_id; | |
2542 | } | |
2543 | ||
51ab0afe | 2544 | r = sd_id128_get_machine(&_machine_id); |
c5ed77b2 ZJS |
2545 | if (ERRNO_IS_NEG_MACHINE_ID_UNSET(r)) |
2546 | /* Gracefully handle the machine ID not being initialized yet */ | |
51ab0afe | 2547 | machine_id = NULL; |
c5ed77b2 ZJS |
2548 | else if (r < 0) |
2549 | return r; | |
2550 | else | |
51ab0afe LP |
2551 | machine_id = &_machine_id; |
2552 | ||
349cc4a5 | 2553 | #if HAVE_GCRYPT |
7560fffc LP |
2554 | r = journal_file_maybe_append_tag(f, ts->realtime); |
2555 | if (r < 0) | |
2556 | return r; | |
feb12d3e | 2557 | #endif |
7560fffc | 2558 | |
52bcf45a YW |
2559 | if (n_iovec < ALLOCA_MAX / sizeof(EntryItem) / 2) |
2560 | items = newa(EntryItem, n_iovec); | |
2561 | else { | |
2562 | items_alloc = new(EntryItem, n_iovec); | |
2563 | if (!items_alloc) | |
2564 | return -ENOMEM; | |
2565 | ||
2566 | items = items_alloc; | |
2567 | } | |
cec736d2 | 2568 | |
86e68f38 | 2569 | for (size_t i = 0; i < n_iovec; i++) { |
cec736d2 LP |
2570 | uint64_t p; |
2571 | Object *o; | |
2572 | ||
2573 | r = journal_file_append_data(f, iovec[i].iov_base, iovec[i].iov_len, &o, &p); | |
2574 | if (r < 0) | |
cf244689 | 2575 | return r; |
cec736d2 | 2576 | |
4ce534f4 LP |
2577 | /* When calculating the XOR hash field, we need to take special care if the "keyed-hash" |
2578 | * journal file flag is on. We use the XOR hash field to quickly determine the identity of a | |
2579 | * specific record, and give records with otherwise identical position (i.e. match in seqno, | |
2580 | * timestamp, …) a stable ordering. But for that we can't have it that the hash of the | |
2581 | * objects in each file is different since they are keyed. Hence let's calculate the Jenkins | |
2582 | * hash here for that. This also has the benefit that cursors for old and new journal files | |
2583 | * are completely identical (they include the XOR hash after all). For classic Jenkins-hash | |
2584 | * files things are easier, we can just take the value from the stored record directly. */ | |
2585 | ||
2586 | if (JOURNAL_HEADER_KEYED_HASH(f->header)) | |
2587 | xor_hash ^= jenkins_hash64(iovec[i].iov_base, iovec[i].iov_len); | |
2588 | else | |
2589 | xor_hash ^= le64toh(o->data.hash); | |
2590 | ||
d164ac77 | 2591 | items[i] = (EntryItem) { |
a9089a66 DDM |
2592 | .object_offset = p, |
2593 | .hash = le64toh(o->data.hash), | |
d164ac77 | 2594 | }; |
cec736d2 LP |
2595 | } |
2596 | ||
1f2da9ec LP |
2597 | /* Order by the position on disk, in order to improve seek |
2598 | * times for rotating media. */ | |
93bab288 | 2599 | typesafe_qsort(items, n_iovec, entry_item_cmp); |
5ec9fbae | 2600 | n_iovec = remove_duplicate_entry_items(items, n_iovec); |
1f2da9ec | 2601 | |
51ab0afe LP |
2602 | r = journal_file_append_entry_internal( |
2603 | f, | |
2604 | ts, | |
2605 | boot_id, | |
2606 | machine_id, | |
2607 | xor_hash, | |
2608 | items, | |
2609 | n_iovec, | |
2610 | seqnum, | |
2611 | seqnum_id, | |
2612 | ret_object, | |
2613 | ret_offset); | |
cec736d2 | 2614 | |
fa6ac760 LP |
2615 | /* If the memory mapping triggered a SIGBUS then we return an |
2616 | * IO error and ignore the error code passed down to us, since | |
2617 | * it is very likely just an effect of a nullified replacement | |
2618 | * mapping page */ | |
2619 | ||
c3bd54bf | 2620 | if (mmap_cache_fd_got_sigbus(f->cache_fd)) |
fa6ac760 LP |
2621 | r = -EIO; |
2622 | ||
7a24f3bf VC |
2623 | if (f->post_change_timer) |
2624 | schedule_post_change(f); | |
2625 | else | |
2626 | journal_file_post_change(f); | |
50f20cfd | 2627 | |
cec736d2 LP |
2628 | return r; |
2629 | } | |
2630 | ||
/* One cached position inside an entry array chain. Items are kept in a hashmap keyed by 'first'. */
typedef struct ChainCacheItem {
        /* Offset of the entry array object the chain starts with, i.e. either
         * le64toh(f->header->entry_array_offset) or le64toh(o->data.entry_offset). */
        uint64_t first;

        /* Offset of the entry array object this item refers to. */
        uint64_t array;

        /* Offset stored in the first item of the cached array. */
        uint64_t begin;

        /* Number of items contained in all arrays of the chain that precede the cached one. */
        uint64_t total;

        /* Index within the cached array that was looked at last; used to improve locality when
         * bisecting. */
        uint64_t last_index;
} ChainCacheItem;
2639 | ||
2640 | static void chain_cache_put( | |
4743015d | 2641 | OrderedHashmap *h, |
a4bcff5b LP |
2642 | ChainCacheItem *ci, |
2643 | uint64_t first, | |
2644 | uint64_t array, | |
2645 | uint64_t begin, | |
f268980d LP |
2646 | uint64_t total, |
2647 | uint64_t last_index) { | |
a4bcff5b | 2648 | |
cc938e4a YW |
2649 | assert(h); |
2650 | ||
a4bcff5b | 2651 | if (!ci) { |
34741aa3 LP |
2652 | /* If the chain item to cache for this chain is the |
2653 | * first one it's not worth caching anything */ | |
2654 | if (array == first) | |
2655 | return; | |
2656 | ||
29433089 | 2657 | if (ordered_hashmap_size(h) >= CHAIN_CACHE_MAX) { |
4743015d | 2658 | ci = ordered_hashmap_steal_first(h); |
29433089 LP |
2659 | assert(ci); |
2660 | } else { | |
a4bcff5b LP |
2661 | ci = new(ChainCacheItem, 1); |
2662 | if (!ci) | |
2663 | return; | |
2664 | } | |
2665 | ||
2666 | ci->first = first; | |
2667 | ||
4743015d | 2668 | if (ordered_hashmap_put(h, &ci->first, ci) < 0) { |
a4bcff5b LP |
2669 | free(ci); |
2670 | return; | |
2671 | } | |
2672 | } else | |
2673 | assert(ci->first == first); | |
2674 | ||
2675 | ci->array = array; | |
2676 | ci->begin = begin; | |
2677 | ci->total = total; | |
f268980d | 2678 | ci->last_index = last_index; |
a4bcff5b LP |
2679 | } |
2680 | ||
8d801e35 DDM |
2681 | static int bump_array_index(uint64_t *i, direction_t direction, uint64_t n) { |
2682 | assert(i); | |
2683 | ||
2684 | /* Increase or decrease the specified index, in the right direction. */ | |
2685 | ||
2686 | if (direction == DIRECTION_DOWN) { | |
2687 | if (*i >= n - 1) | |
2688 | return 0; | |
2689 | ||
2690 | (*i)++; | |
2691 | } else { | |
2692 | if (*i <= 0) | |
2693 | return 0; | |
2694 | ||
2695 | (*i)--; | |
2696 | } | |
2697 | ||
2698 | return 1; | |
2699 | } | |
2700 | ||
cc938e4a YW |
2701 | static int bump_entry_array( |
2702 | JournalFile *f, | |
b7264911 YW |
2703 | Object *o, /* the current entry array object. */ |
2704 | uint64_t offset, /* the offset of the entry array object. */ | |
2705 | uint64_t first, /* The offset of the first entry array object in the chain. */ | |
cc938e4a YW |
2706 | direction_t direction, |
2707 | uint64_t *ret) { | |
2708 | ||
aa00163d DDM |
2709 | int r; |
2710 | ||
2711 | assert(f); | |
aa00163d DDM |
2712 | assert(ret); |
2713 | ||
cc938e4a YW |
2714 | if (direction == DIRECTION_DOWN) { |
2715 | assert(o); | |
b7264911 YW |
2716 | assert(o->object.type == OBJECT_ENTRY_ARRAY); |
2717 | ||
de6eb806 | 2718 | *ret = le64toh(o->entry_array.next_entry_array_offset); |
b7264911 | 2719 | } else { |
aa00163d | 2720 | |
b7264911 YW |
2721 | /* Entry array chains are a singly linked list, so to find the previous array in the chain, we have |
2722 | * to start iterating from the top. */ | |
aa00163d | 2723 | |
b7264911 | 2724 | assert(offset > 0); |
aa00163d | 2725 | |
b7264911 YW |
2726 | uint64_t p = first, q = 0; |
2727 | while (p > 0 && p != offset) { | |
2728 | r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, p, &o); | |
2729 | if (r < 0) | |
2730 | return r; | |
aa00163d | 2731 | |
b7264911 YW |
2732 | q = p; |
2733 | p = le64toh(o->entry_array.next_entry_array_offset); | |
2734 | } | |
aa00163d | 2735 | |
b7264911 YW |
2736 | /* If we can't find the previous entry array in the entry array chain, we're likely dealing with a |
2737 | * corrupted journal file. */ | |
2738 | if (p == 0) | |
2739 | return -EBADMSG; | |
aa00163d | 2740 | |
b7264911 YW |
2741 | *ret = q; |
2742 | } | |
de6eb806 | 2743 | |
b7264911 | 2744 | return *ret > 0; |
aa00163d DDM |
2745 | } |
2746 | ||
f268980d LP |
2747 | static int generic_array_get( |
2748 | JournalFile *f, | |
3a23e418 YW |
2749 | uint64_t first, /* The offset of the first entry array object in the chain. */ |
2750 | uint64_t i, /* The index of the target object counted from the beginning of the entry array chain. */ | |
8d801e35 | 2751 | direction_t direction, |
3a23e418 YW |
2752 | Object **ret_object, /* The found object. */ |
2753 | uint64_t *ret_offset) { /* The offset of the found object. */ | |
de190aef | 2754 | |
e758735d | 2755 | uint64_t a, t = 0, k; |
a4bcff5b | 2756 | ChainCacheItem *ci; |
b63f09e4 | 2757 | Object *o = NULL; |
cc938e4a | 2758 | int r; |
cec736d2 LP |
2759 | |
2760 | assert(f); | |
2761 | ||
cc938e4a YW |
2762 | /* FIXME: fix return value assignment on success. */ |
2763 | ||
de190aef | 2764 | a = first; |
a4bcff5b LP |
2765 | |
2766 | /* Try the chain cache first */ | |
4743015d | 2767 | ci = ordered_hashmap_get(f->chain_cache, &first); |
a4bcff5b LP |
2768 | if (ci && i > ci->total) { |
2769 | a = ci->array; | |
2770 | i -= ci->total; | |
2771 | t = ci->total; | |
2772 | } | |
2773 | ||
de190aef | 2774 | while (a > 0) { |
de190aef | 2775 | r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o); |
952d1e78 DDM |
2776 | if (IN_SET(r, -EBADMSG, -EADDRNOTAVAIL)) { |
2777 | /* If there's corruption and we're going downwards, let's pretend we reached the | |
2778 | * final entry in the entry array chain. */ | |
2779 | ||
2780 | if (direction == DIRECTION_DOWN) | |
2781 | return 0; | |
2782 | ||
2783 | /* If there's corruption and we're going upwards, move back to the previous entry | |
2784 | * array and start iterating entries from there. */ | |
2785 | ||
952d1e78 | 2786 | i = UINT64_MAX; |
952d1e78 DDM |
2787 | break; |
2788 | } | |
de190aef LP |
2789 | if (r < 0) |
2790 | return r; | |
cec736d2 | 2791 | |
99daf3ce | 2792 | k = journal_file_entry_array_n_items(f, o); |
f85e79d3 YW |
2793 | if (k == 0) |
2794 | return 0; | |
2795 | ||
8d801e35 DDM |
2796 | if (i < k) |
2797 | break; | |
cec736d2 | 2798 | |
3a23e418 | 2799 | /* The index is larger than the number of elements in the array. Let's move to the next array. */ |
a4bcff5b LP |
2800 | i -= k; |
2801 | t += k; | |
de6eb806 | 2802 | a = le64toh(o->entry_array.next_entry_array_offset); |
de190aef LP |
2803 | } |
2804 | ||
8d801e35 DDM |
2805 | /* If we've found the right location, now look for the first non-corrupt entry object (in the right |
2806 | * direction). */ | |
2807 | ||
2808 | while (a > 0) { | |
8d801e35 | 2809 | if (i == UINT64_MAX) { |
b63f09e4 YW |
2810 | r = bump_entry_array(f, o, a, first, direction, &a); |
2811 | if (r <= 0) | |
2812 | return r; | |
2813 | ||
8d801e35 DDM |
2814 | r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &o); |
2815 | if (r < 0) | |
2816 | return r; | |
2817 | ||
99daf3ce | 2818 | k = journal_file_entry_array_n_items(f, o); |
8d801e35 DDM |
2819 | if (k == 0) |
2820 | break; | |
2821 | ||
fe6f2bd8 YW |
2822 | if (direction == DIRECTION_DOWN) |
2823 | i = 0; | |
2824 | else { | |
2825 | /* We moved to the previous array. The total must be decreased. */ | |
2826 | if (t < k) | |
2827 | return -EBADMSG; /* chain cache is broken ? */ | |
2828 | ||
2829 | i = k - 1; | |
2830 | t -= k; | |
2831 | } | |
8d801e35 DDM |
2832 | } |
2833 | ||
2834 | do { | |
e758735d YW |
2835 | uint64_t p; |
2836 | ||
99daf3ce | 2837 | p = journal_file_entry_array_item(f, o, i); |
8d801e35 | 2838 | |
cc938e4a | 2839 | r = journal_file_move_to_object(f, OBJECT_ENTRY, p, ret_object); |
ded10e3a DDM |
2840 | if (r >= 0) { |
2841 | /* Let's cache this item for the next invocation */ | |
99daf3ce | 2842 | chain_cache_put(f->chain_cache, ci, first, a, journal_file_entry_array_item(f, o, 0), t, i); |
ded10e3a DDM |
2843 | |
2844 | if (ret_offset) | |
2845 | *ret_offset = p; | |
2846 | ||
2847 | return 1; | |
2848 | } | |
8d801e35 DDM |
2849 | if (!IN_SET(r, -EADDRNOTAVAIL, -EBADMSG)) |
2850 | return r; | |
2851 | ||
2852 | /* OK, so this entry is borked. Most likely some entry didn't get synced to | |
d9b61db9 | 2853 | * disk properly, let's see if the next one might work for us instead. */ |
8d801e35 | 2854 | log_debug_errno(r, "Entry item %" PRIu64 " is bad, skipping over it.", i); |
d9b61db9 | 2855 | |
8d801e35 DDM |
2856 | } while (bump_array_index(&i, direction, k) > 0); |
2857 | ||
3a23e418 YW |
2858 | /* All entries tried in the above do-while loop are broken. Let's move to the next (or previous) array. */ |
2859 | ||
fe6f2bd8 YW |
2860 | if (direction == DIRECTION_DOWN) |
2861 | /* We are going to the next array, the total must be incremented. */ | |
2862 | t += k; | |
2863 | ||
8d801e35 DDM |
2864 | i = UINT64_MAX; |
2865 | } | |
2866 | ||
a4bcff5b | 2867 | return 0; |
de190aef LP |
2868 | } |
2869 | ||
/* Results of the test_object() callbacks and of generic_array_bisect_step(). */
enum {
        /* The tested object matches the needle. */
        TEST_FOUND,

        /* The tested object comes too early; what we are looking for is located further towards the
         * end of the chain. */
        TEST_LEFT,

        /* The tested object comes too late; what we are looking for is located further towards the
         * beginning of the chain. */
        TEST_RIGHT,

        /* Neither this array nor any earlier one contains a matching object, continue with the next
         * array. */
        TEST_GOTO_NEXT,

        /* Neither this array nor any later one contains a matching object, continue with the previous
         * array. */
        TEST_GOTO_PREVIOUS,
};
cec736d2 | 2879 | |
ab8f553d | 2880 | static int generic_array_bisect_step( |
fa107181 | 2881 | JournalFile *f, |
ab8f553d YW |
2882 | Object *array, /* entry array object */ |
2883 | uint64_t i, /* index of the entry item in the array we will test. */ | |
fa107181 YW |
2884 | uint64_t needle, |
2885 | int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle), | |
2886 | direction_t direction, | |
ab8f553d YW |
2887 | uint64_t *m, /* The maximum number of the entries we will check in the array. */ |
2888 | uint64_t *left, /* The index of the left boundary in the array. */ | |
2889 | uint64_t *right) { /* The index of the right boundary in the array. */ | |
fa107181 | 2890 | |
fa107181 YW |
2891 | uint64_t p; |
2892 | int r; | |
2893 | ||
2894 | assert(f); | |
f6548f05 | 2895 | assert(array); |
fa107181 | 2896 | assert(test_object); |
ab8f553d | 2897 | assert(m); |
fa107181 YW |
2898 | assert(left); |
2899 | assert(right); | |
2900 | assert(*left <= i); | |
2901 | assert(i <= *right); | |
ab8f553d | 2902 | assert(*right < *m); |
fa107181 YW |
2903 | |
2904 | p = journal_file_entry_array_item(f, array, i); | |
2905 | if (p <= 0) | |
2906 | r = -EBADMSG; | |
2907 | else | |
2908 | r = test_object(f, p, needle); | |
2909 | if (IN_SET(r, -EBADMSG, -EADDRNOTAVAIL)) { | |
2910 | log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short."); | |
ab8f553d | 2911 | |
a93be359 YW |
2912 | if (i == *left) { |
2913 | /* This happens on two situations: | |
2914 | * | |
2915 | * a) i == 0 (hence, *left == 0): | |
2916 | * The first entry in the array is corrupted, let's go back to the previous array. | |
2917 | * | |
2918 | * b) *right == *left or *left + 1, and we are going to downwards: | |
2919 | * In that case, the (i-1)-th object has been already tested in the previous call, | |
2920 | * which returned TEST_LEFT. See below. So, there is no matching entry in this | |
2921 | * array nor in the whole entry array chain. */ | |
2922 | assert(i == 0 || (*right - *left <= 1 && direction == DIRECTION_DOWN)); | |
ab8f553d | 2923 | return TEST_GOTO_PREVIOUS; |
a93be359 | 2924 | } |
ab8f553d YW |
2925 | |
2926 | /* Otherwise, cutting the array short. So, here we limit the number of elements we will see | |
2927 | * in this array, and set the right boundary to the last possibly non-corrupted object. */ | |
2928 | *m = i; | |
2929 | *right = i - 1; | |
2930 | return TEST_RIGHT; | |
fa107181 YW |
2931 | } |
2932 | if (r < 0) | |
2933 | return r; | |
2934 | ||
2935 | if (r == TEST_FOUND) | |
ab8f553d YW |
2936 | /* There may be multiple entries that match with the needle. When the direction is down, we |
2937 | * need to find the first matching entry, hence the right boundary can be moved, but the left | |
2938 | * one cannot. Similarly, when the direction is up, we need to find the last matching entry, | |
2939 | * hence the left boundary can be moved, but the right one cannot. */ | |
fa107181 YW |
2940 | r = direction == DIRECTION_DOWN ? TEST_RIGHT : TEST_LEFT; |
2941 | ||
ab8f553d YW |
2942 | if (r == TEST_RIGHT) { |
2943 | /* Currently, left --- needle --- i --- right, hence we can move the right boundary to i. */ | |
2944 | if (direction == DIRECTION_DOWN) | |
2945 | *right = i; | |
2946 | else { | |
2947 | if (i == 0) | |
2948 | return TEST_GOTO_PREVIOUS; | |
2949 | *right = i - 1; | |
2950 | } | |
2951 | } else { | |
2952 | /* Currently, left --- i --- needle --- right, hence we can move the left boundary to i. */ | |
2953 | if (direction == DIRECTION_DOWN) { | |
2954 | /* Note, here *m is always positive, as by the assertions at the beginning, we have | |
2955 | * 0 <= *left <= i <= *right < m */ | |
2956 | if (i == *m - 1) | |
2957 | return TEST_GOTO_NEXT; | |
2958 | ||
2959 | *left = i + 1; | |
2960 | } else | |
2961 | *left = i; | |
2962 | } | |
fa107181 YW |
2963 | |
2964 | return r; | |
2965 | } | |
2966 | ||
f268980d LP |
2967 | static int generic_array_bisect( |
2968 | JournalFile *f, | |
ab8f553d YW |
2969 | uint64_t first, /* The offset of the first entry array object in the chain. */ |
2970 | uint64_t n, /* The total number of elements in the chain of the entry array. */ | |
2971 | uint64_t needle, /* The target value (e.g. seqnum, monotonic, realtime, ...). */ | |
2972 | int (*test_object)(JournalFile *f, | |
2973 | uint64_t p, /* the offset of the (data or entry) object that will be tested. */ | |
2974 | uint64_t needle), | |
f268980d | 2975 | direction_t direction, |
ab8f553d YW |
2976 | Object **ret_object, /* The found object. */ |
2977 | uint64_t *ret_offset, /* The offset of the found object. */ | |
2978 | uint64_t *ret_idx) { /* The index of the found object counted from the beginning of the entry array chain. */ | |
f268980d | 2979 | |
9a390e86 DDM |
2980 | /* Given an entry array chain, this function finds the object "closest" to the given needle in the |
2981 | * chain, taking into account the provided direction. A function can be provided to determine how | |
2982 | * an object is matched against the given needle. | |
2983 | * | |
2984 | * Given a journal file, the offset of an object and the needle, the test_object() function should | |
ab8f553d YW |
2985 | * return TEST_RIGHT if the needle is located earlier in the entry array chain, TEST_LEFT if the |
2986 | * needle is located later in the entry array chain, and TEST_FOUND if the object matches the needle. | |
9a390e86 DDM |
2987 | * If test_object() returns TEST_FOUND for a specific object, that object's information will be used |
2988 | * to populate the return values of this function. If test_object() never returns TEST_FOUND, the | |
2989 | * return values are populated with the details of one of the objects closest to the needle. If the | |
2990 | * direction is DIRECTION_UP, the earlier object is used. Otherwise, the later object is used. | |
ab8f553d YW |
2991 | * If there are multiple objects that test_object() return TEST_FOUND for, then the first matching |
2992 | * object returned when direction is DIRECTION_DOWN. Otherwise the last object is returned. */ | |
9a390e86 | 2993 | |
ab8f553d | 2994 | uint64_t a, p, t = 0, i, last_index = UINT64_MAX; |
a4bcff5b | 2995 | ChainCacheItem *ci; |
03fdf957 | 2996 | Object *array; |
cc938e4a | 2997 | int r; |
cec736d2 | 2998 | |
de190aef LP |
2999 | assert(f); |
3000 | assert(test_object); | |
cec736d2 | 3001 | |
ab8f553d YW |
3002 | if (n <= 0) |
3003 | return 0; | |
3004 | ||
a4bcff5b | 3005 | /* Start with the first array in the chain */ |
de190aef | 3006 | a = first; |
a4bcff5b | 3007 | |
4743015d | 3008 | ci = ordered_hashmap_get(f->chain_cache, &first); |
96d4d024 | 3009 | if (ci && n > ci->total && ci->begin != 0) { |
de6b162d YW |
3010 | /* Ah, we have iterated this bisection array chain previously! Let's see if we can skip ahead |
3011 | * in the chain, as far as the last time. But we can't jump backwards in the chain, so let's | |
3012 | * check that first. */ | |
a4bcff5b LP |
3013 | |
3014 | r = test_object(f, ci->begin, needle); | |
5dca908a YW |
3015 | if (IN_SET(r, -EBADMSG, -EADDRNOTAVAIL)) |
3016 | log_debug_errno(r, "Cached entry is corrupted, ignoring: %m"); | |
3017 | else if (r < 0) | |
a4bcff5b | 3018 | return r; |
5dca908a | 3019 | else if (r == TEST_LEFT) { |
de6b162d YW |
3020 | /* OK, what we are looking for is right of the begin of this EntryArray, so let's |
3021 | * jump straight to previously cached array in the chain */ | |
a4bcff5b LP |
3022 | |
3023 | a = ci->array; | |
3024 | n -= ci->total; | |
3025 | t = ci->total; | |
f268980d | 3026 | last_index = ci->last_index; |
a4bcff5b LP |
3027 | } |
3028 | } | |
3029 | ||
de190aef | 3030 | while (a > 0) { |
a93be359 | 3031 | uint64_t left, right, k, m, m_original; |
de190aef LP |
3032 | |
3033 | r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array); | |
cec736d2 LP |
3034 | if (r < 0) |
3035 | return r; | |
3036 | ||
99daf3ce | 3037 | k = journal_file_entry_array_n_items(f, array); |
a93be359 | 3038 | m = m_original = MIN(k, n); |
ab8f553d | 3039 | if (m <= 0) |
de190aef | 3040 | return 0; |
cec736d2 | 3041 | |
ab8f553d YW |
3042 | left = 0; |
3043 | right = m - 1; | |
3044 | ||
3045 | if (direction == DIRECTION_UP) { | |
3046 | /* If we're going upwards, the last entry of the previous array may pass the test, | |
3047 | * and the first entry of the current array may not pass. In that case, the last | |
3048 | * entry of the previous array must be returned. Hence, we need to test the first | |
3049 | * entry of the current array. */ | |
3050 | r = generic_array_bisect_step(f, array, 0, needle, test_object, direction, &m, &left, &right); | |
3051 | if (r < 0) | |
3052 | return r; | |
3053 | if (r == TEST_GOTO_PREVIOUS) | |
3054 | goto previous; | |
bee6a291 | 3055 | } |
ab8f553d YW |
3056 | |
3057 | /* Test the last entry of this array, to determine if we should go to the next array. */ | |
3058 | r = generic_array_bisect_step(f, array, right, needle, test_object, direction, &m, &left, &right); | |
de190aef LP |
3059 | if (r < 0) |
3060 | return r; | |
ab8f553d YW |
3061 | if (r == TEST_GOTO_PREVIOUS) |
3062 | goto previous; | |
cec736d2 | 3063 | |
ab8f553d | 3064 | /* The expected entry should be in this array, (or the last entry of the previous array). */ |
de190aef | 3065 | if (r == TEST_RIGHT) { |
ab8f553d | 3066 | |
fa107181 YW |
3067 | /* If we cached the last index we looked at, let's try to not to jump too wildly |
3068 | * around and see if we can limit the range to look at early to the immediate | |
3069 | * neighbors of the last index we looked at. */ | |
f268980d | 3070 | |
ab8f553d YW |
3071 | if (last_index > 0 && left < last_index - 1 && last_index - 1 < right) { |
3072 | r = generic_array_bisect_step(f, array, last_index - 1, needle, test_object, direction, &m, &left, &right); | |
3073 | if (r < 0) | |
fa107181 | 3074 | return r; |
ab8f553d YW |
3075 | if (r == TEST_GOTO_PREVIOUS) |
3076 | goto previous; | |
fa107181 | 3077 | } |
f268980d | 3078 | |
ab8f553d YW |
3079 | if (last_index < UINT64_MAX && left < last_index + 1 && last_index + 1 < right) { |
3080 | r = generic_array_bisect_step(f, array, last_index + 1, needle, test_object, direction, &m, &left, &right); | |
3081 | if (r < 0) | |
fa107181 | 3082 | return r; |
ab8f553d YW |
3083 | if (r == TEST_GOTO_PREVIOUS) |
3084 | goto previous; | |
f268980d LP |
3085 | } |
3086 | ||
de190aef LP |
3087 | for (;;) { |
3088 | if (left == right) { | |
a93be359 YW |
3089 | /* We found one or more corrupted entries in generic_array_bisect_step(). |
3090 | * In that case, the entry pointed by 'right' may not be tested. | |
3091 | * | |
3092 | * When we are going to downwards, the entry object pointed by 'left' | |
3093 | * has not been tested yet, Hence, even if left == right, we still | |
3094 | * have to check the final entry to see if it actually matches. | |
3095 | * | |
3096 | * On the other hand, when we are going to upwards, the entry pointed | |
3097 | * by 'left' is always tested, So, it is not necessary to test the | |
3098 | * final entry again. */ | |
3099 | if (m != m_original && direction == DIRECTION_DOWN) { | |
3100 | r = generic_array_bisect_step(f, array, left, needle, test_object, direction, &m, &left, &right); | |
3101 | if (r < 0) | |
3102 | return r; | |
3103 | if (IN_SET(r, TEST_GOTO_PREVIOUS, TEST_GOTO_NEXT)) | |
3104 | return 0; /* The entry does not pass the test, or is corrupted */ | |
3105 | ||
3106 | assert(TEST_RIGHT); | |
3107 | assert(left == right); | |
3108 | } | |
3109 | ||
de190aef LP |
3110 | i = left; |
3111 | goto found; | |
3112 | } | |
3113 | ||
3114 | assert(left < right); | |
ab8f553d | 3115 | i = (left + right + (direction == DIRECTION_UP)) / 2; |
f268980d | 3116 | |
ab8f553d YW |
3117 | r = generic_array_bisect_step(f, array, i, needle, test_object, direction, &m, &left, &right); |
3118 | if (r < 0) | |
de190aef | 3119 | return r; |
ab8f553d YW |
3120 | if (r == TEST_GOTO_PREVIOUS) |
3121 | goto previous; | |
a93be359 YW |
3122 | if (r == TEST_GOTO_NEXT) |
3123 | return 0; /* Found a corrupt entry, and the array was cut short. */ | |
de190aef LP |
3124 | } |
3125 | } | |
3126 | ||
ab8f553d YW |
3127 | /* Not found in this array (or the last entry of this array should be returned), go to the next array. */ |
3128 | assert(r == (direction == DIRECTION_DOWN ? TEST_GOTO_NEXT : TEST_LEFT)); | |
3129 | ||
2173cbf8 | 3130 | if (k >= n) { |
cbdca852 | 3131 | if (direction == DIRECTION_UP) { |
ab8f553d YW |
3132 | assert(n > 0); |
3133 | i = n - 1; | |
cbdca852 LP |
3134 | goto found; |
3135 | } | |
3136 | ||
cec736d2 | 3137 | return 0; |
cbdca852 | 3138 | } |
cec736d2 | 3139 | |
de190aef LP |
3140 | n -= k; |
3141 | t += k; | |
f5fbe71d | 3142 | last_index = UINT64_MAX; |
de190aef | 3143 | a = le64toh(array->entry_array.next_entry_array_offset); |
cec736d2 LP |
3144 | } |
3145 | ||
3146 | return 0; | |
de190aef | 3147 | |
ab8f553d YW |
3148 | previous: |
3149 | /* Not found in the current array, return the last entry of the previous array. */ | |
3150 | assert(r == TEST_GOTO_PREVIOUS); | |
3151 | ||
3152 | /* The current array is the first in the chain. no previous array. */ | |
3153 | if (t == 0) | |
de190aef LP |
3154 | return 0; |
3155 | ||
ab8f553d YW |
3156 | /* When we are going downwards, there is no matching entries in the previous array. */ |
3157 | if (direction == DIRECTION_DOWN) | |
de190aef | 3158 | return 0; |
e562f131 | 3159 | |
ab8f553d YW |
3160 | /* Indicate to go to the previous array later. Note, do not move to the previous array here, |
3161 | * as that may invalidate the current array object in the mmap cache and | |
3162 | * journal_file_entry_array_item() below may read invalid address. */ | |
3163 | i = UINT64_MAX; | |
3164 | ||
3165 | found: | |
e562f131 YW |
3166 | p = journal_file_entry_array_item(f, array, 0); |
3167 | if (p <= 0) | |
3168 | return -EBADMSG; | |
3169 | ||
a4bcff5b | 3170 | /* Let's cache this item for the next invocation */ |
ab8f553d | 3171 | chain_cache_put(f->chain_cache, ci, first, a, p, t, i); |
a4bcff5b | 3172 | |
ab8f553d YW |
3173 | if (i == UINT64_MAX) { |
3174 | uint64_t m; | |
3175 | ||
3176 | /* Get the last entry of the previous array. */ | |
3177 | ||
3178 | r = bump_entry_array(f, NULL, a, first, DIRECTION_UP, &a); | |
3179 | if (r <= 0) | |
3180 | return r; | |
3181 | ||
3182 | r = journal_file_move_to_object(f, OBJECT_ENTRY_ARRAY, a, &array); | |
3183 | if (r < 0) | |
3184 | return r; | |
3185 | ||
3186 | m = journal_file_entry_array_n_items(f, array); | |
3187 | if (m == 0 || t < m) | |
3188 | return -EBADMSG; | |
3189 | ||
3190 | t -= m; | |
3191 | i = m - 1; | |
3192 | } | |
3193 | ||
3194 | p = journal_file_entry_array_item(f, array, i); | |
3195 | if (p == 0) | |
3196 | return -EBADMSG; | |
de190aef | 3197 | |
cc938e4a YW |
3198 | if (ret_object) { |
3199 | r = journal_file_move_to_object(f, OBJECT_ENTRY, p, ret_object); | |
ded10e3a DDM |
3200 | if (r < 0) |
3201 | return r; | |
3202 | } | |
de190aef | 3203 | |
f4474e00 LP |
3204 | if (ret_offset) |
3205 | *ret_offset = p; | |
de190aef | 3206 | |
f4474e00 | 3207 | if (ret_idx) |
ab8f553d | 3208 | *ret_idx = t + i; |
de190aef LP |
3209 | |
3210 | return 1; | |
cec736d2 LP |
3211 | } |
3212 | ||
7682aedb | 3213 | static int generic_array_bisect_for_data( |
f268980d | 3214 | JournalFile *f, |
7682aedb | 3215 | Object *d, |
f268980d LP |
3216 | uint64_t needle, |
3217 | int (*test_object)(JournalFile *f, uint64_t p, uint64_t needle), | |
3218 | direction_t direction, | |
cc938e4a | 3219 | Object **ret_object, |
00969140 | 3220 | uint64_t *ret_offset) { |
de190aef | 3221 | |
7682aedb | 3222 | uint64_t extra, first, n; |
cec736d2 LP |
3223 | int r; |
3224 | ||
3225 | assert(f); | |
7682aedb YW |
3226 | assert(d); |
3227 | assert(d->object.type == OBJECT_DATA); | |
de190aef | 3228 | assert(test_object); |
cec736d2 | 3229 | |
7682aedb | 3230 | n = le64toh(d->data.n_entries); |
de190aef LP |
3231 | if (n <= 0) |
3232 | return 0; | |
7682aedb YW |
3233 | n--; /* n_entries is the number of entries linked to the data object, including the 'extra' entry. */ |
3234 | ||
3235 | extra = le64toh(d->data.entry_offset); | |
3236 | first = le64toh(d->data.entry_array_offset); | |
cec736d2 | 3237 | |
d67078b4 | 3238 | /* This bisects the array in object 'first', but first checks an extra. */ |
de190aef LP |
3239 | r = test_object(f, extra, needle); |
3240 | if (r < 0) | |
3241 | return r; | |
a536e261 | 3242 | |
d67078b4 YW |
3243 | if (direction == DIRECTION_DOWN) { |
3244 | /* If we are going downwards, then we need to return the first object that passes the test. | |
3245 | * When there is no object that passes the test, we need to return the first object that | |
3246 | * test_object() returns TEST_RIGHT for. */ | |
3247 | if (IN_SET(r, | |
3248 | TEST_FOUND, /* The 'extra' object passes the test. Hence, this is the first | |
3249 | * object that passes the test. */ | |
3250 | TEST_RIGHT)) /* The 'extra' object is the first object that test_object() returns | |
3251 | * TEST_RIGHT for, and no object exists even in the chained arrays | |
3252 | * that passes the test. */ | |
3253 | goto use_extra; /* The 'extra' object is exactly the one we are looking for. It is | |
3254 | * not necessary to bisect the chained arrays. */ | |
3255 | ||
3256 | /* Otherwise, the 'extra' object is not the one we are looking for. Search in the arrays. */ | |
de190aef | 3257 | |
d67078b4 YW |
3258 | } else { |
3259 | /* If we are going upwards, then we need to return the last object that passes the test. | |
3260 | * When there is no object that passes the test, we need to return the the last object that | |
3261 | * test_object() returns TEST_LEFT for. */ | |
3262 | if (r == TEST_RIGHT) | |
3263 | return 0; /* Not only the 'extra' object, but also all objects in the chained arrays | |
3264 | * will never get TEST_FOUND or TEST_LEFT. The object we are looking for | |
3265 | * does not exist. */ | |
3266 | ||
3267 | /* Even if the 'extra' object passes the test, there may be multiple objects in the arrays | |
3268 | * that also pass the test. Hence, we need to bisect the arrays for finding the last matching | |
3269 | * object. */ | |
a536e261 | 3270 | } |
cec736d2 | 3271 | |
7682aedb | 3272 | r = generic_array_bisect(f, first, n, needle, test_object, direction, ret_object, ret_offset, NULL); |
d67078b4 YW |
3273 | if (r != 0) |
3274 | return r; /* When > 0, the found object is the first (or last, when DIRECTION_UP) object. | |
3275 | * Hence, return the found object now. */ | |
3276 | ||
3277 | /* No matching object found in the chained arrays. | |
3278 | * DIRECTION_DOWN : the 'extra' object neither matches the condition. There is no matching object. | |
3279 | * DIRECTION_UP : the 'extra' object matches the condition. So, return it. */ | |
3280 | if (direction == DIRECTION_DOWN) | |
3281 | return 0; | |
de190aef | 3282 | |
d67078b4 | 3283 | use_extra: |
cc938e4a YW |
3284 | if (ret_object) { |
3285 | r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, ret_object); | |
ded10e3a DDM |
3286 | if (r < 0) |
3287 | return r; | |
3288 | } | |
cbdca852 | 3289 | |
f4474e00 LP |
3290 | if (ret_offset) |
3291 | *ret_offset = extra; | |
cbdca852 | 3292 | |
cbdca852 LP |
3293 | return 1; |
3294 | } | |
3295 | ||
d1e8e8b5 | 3296 | static int test_object_offset(JournalFile *f, uint64_t p, uint64_t needle) { |
cbdca852 LP |
3297 | assert(f); |
3298 | assert(p > 0); | |
3299 | ||
3300 | if (p == needle) | |
3301 | return TEST_FOUND; | |
3302 | else if (p < needle) | |
3303 | return TEST_LEFT; | |
3304 | else | |
3305 | return TEST_RIGHT; | |
3306 | } | |
3307 | ||
578cd185 DDM |
3308 | int journal_file_move_to_entry_by_offset( |
3309 | JournalFile *f, | |
3310 | uint64_t p, | |
3311 | direction_t direction, | |
cc938e4a | 3312 | Object **ret_object, |
578cd185 DDM |
3313 | uint64_t *ret_offset) { |
3314 | ||
3315 | assert(f); | |
3316 | assert(f->header); | |
3317 | ||
3318 | return generic_array_bisect( | |
3319 | f, | |
3320 | le64toh(f->header->entry_array_offset), | |
3321 | le64toh(f->header->n_entries), | |
3322 | p, | |
3323 | test_object_offset, | |
3324 | direction, | |
cc938e4a | 3325 | ret_object, ret_offset, NULL); |
578cd185 DDM |
3326 | } |
3327 | ||
de190aef | 3328 | static int test_object_seqnum(JournalFile *f, uint64_t p, uint64_t needle) { |
893e0f8f | 3329 | uint64_t sq; |
de190aef LP |
3330 | Object *o; |
3331 | int r; | |
3332 | ||
3333 | assert(f); | |
3334 | assert(p > 0); | |
3335 | ||
3336 | r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o); | |
cec736d2 LP |
3337 | if (r < 0) |
3338 | return r; | |
3339 | ||
893e0f8f LP |
3340 | sq = le64toh(READ_NOW(o->entry.seqnum)); |
3341 | if (sq == needle) | |
de190aef | 3342 | return TEST_FOUND; |
893e0f8f | 3343 | else if (sq < needle) |
de190aef LP |
3344 | return TEST_LEFT; |
3345 | else | |
3346 | return TEST_RIGHT; | |
3347 | } | |
cec736d2 | 3348 | |
de190aef LP |
3349 | int journal_file_move_to_entry_by_seqnum( |
3350 | JournalFile *f, | |
3351 | uint64_t seqnum, | |
3352 | direction_t direction, | |
cc938e4a | 3353 | Object **ret_object, |
f4474e00 | 3354 | uint64_t *ret_offset) { |
cc938e4a | 3355 | |
c88cc6af VC |
3356 | assert(f); |
3357 | assert(f->header); | |
de190aef | 3358 | |
f4474e00 LP |
3359 | return generic_array_bisect( |
3360 | f, | |
3361 | le64toh(f->header->entry_array_offset), | |
3362 | le64toh(f->header->n_entries), | |
3363 | seqnum, | |
3364 | test_object_seqnum, | |
3365 | direction, | |
cc938e4a | 3366 | ret_object, ret_offset, NULL); |
de190aef | 3367 | } |
cec736d2 | 3368 | |
de190aef LP |
3369 | static int test_object_realtime(JournalFile *f, uint64_t p, uint64_t needle) { |
3370 | Object *o; | |
893e0f8f | 3371 | uint64_t rt; |
de190aef LP |
3372 | int r; |
3373 | ||
3374 | assert(f); | |
3375 | assert(p > 0); | |
3376 | ||
3377 | r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o); | |
3378 | if (r < 0) | |
3379 | return r; | |
3380 | ||
893e0f8f LP |
3381 | rt = le64toh(READ_NOW(o->entry.realtime)); |
3382 | if (rt == needle) | |
de190aef | 3383 | return TEST_FOUND; |
893e0f8f | 3384 | else if (rt < needle) |
de190aef LP |
3385 | return TEST_LEFT; |
3386 | else | |
3387 | return TEST_RIGHT; | |
cec736d2 LP |
3388 | } |
3389 | ||
de190aef LP |
3390 | int journal_file_move_to_entry_by_realtime( |
3391 | JournalFile *f, | |
3392 | uint64_t realtime, | |
3393 | direction_t direction, | |
cc938e4a | 3394 | Object **ret_object, |
f4474e00 | 3395 | uint64_t *ret_offset) { |
cc938e4a | 3396 | |
c88cc6af VC |
3397 | assert(f); |
3398 | assert(f->header); | |
de190aef | 3399 | |
f4474e00 LP |
3400 | return generic_array_bisect( |
3401 | f, | |
3402 | le64toh(f->header->entry_array_offset), | |
3403 | le64toh(f->header->n_entries), | |
3404 | realtime, | |
3405 | test_object_realtime, | |
3406 | direction, | |
cc938e4a | 3407 | ret_object, ret_offset, NULL); |
de190aef LP |
3408 | } |
3409 | ||
3410 | static int test_object_monotonic(JournalFile *f, uint64_t p, uint64_t needle) { | |
3411 | Object *o; | |
893e0f8f | 3412 | uint64_t m; |
de190aef LP |
3413 | int r; |
3414 | ||
3415 | assert(f); | |
3416 | assert(p > 0); | |
3417 | ||
3418 | r = journal_file_move_to_object(f, OBJECT_ENTRY, p, &o); | |
3419 | if (r < 0) | |
3420 | return r; | |
3421 | ||
893e0f8f LP |
3422 | m = le64toh(READ_NOW(o->entry.monotonic)); |
3423 | if (m == needle) | |
de190aef | 3424 | return TEST_FOUND; |
893e0f8f | 3425 | else if (m < needle) |
de190aef LP |
3426 | return TEST_LEFT; |
3427 | else | |
3428 | return TEST_RIGHT; | |
3429 | } | |
3430 | ||
2a560338 | 3431 | static int find_data_object_by_boot_id( |
47838ab3 ZJS |
3432 | JournalFile *f, |
3433 | sd_id128_t boot_id, | |
cc938e4a YW |
3434 | Object **ret_object, |
3435 | uint64_t *ret_offset) { | |
2a560338 | 3436 | |
fbd0b64f | 3437 | char t[STRLEN("_BOOT_ID=") + 32 + 1] = "_BOOT_ID="; |
47838ab3 | 3438 | |
cc938e4a YW |
3439 | assert(f); |
3440 | ||
47838ab3 | 3441 | sd_id128_to_string(boot_id, t + 9); |
cc938e4a | 3442 | return journal_file_find_data_object(f, t, sizeof(t) - 1, ret_object, ret_offset); |
47838ab3 ZJS |
3443 | } |
3444 | ||
de190aef LP |
3445 | int journal_file_move_to_entry_by_monotonic( |
3446 | JournalFile *f, | |
3447 | sd_id128_t boot_id, | |
3448 | uint64_t monotonic, | |
3449 | direction_t direction, | |
cc938e4a | 3450 | Object **ret_object, |
f4474e00 | 3451 | uint64_t *ret_offset) { |
de190aef | 3452 | |
de190aef LP |
3453 | Object *o; |
3454 | int r; | |
3455 | ||
cbdca852 | 3456 | assert(f); |
de190aef | 3457 | |
47838ab3 | 3458 | r = find_data_object_by_boot_id(f, boot_id, &o, NULL); |
304cb08f | 3459 | if (r <= 0) |
de190aef | 3460 | return r; |
de190aef | 3461 | |
7682aedb | 3462 | return generic_array_bisect_for_data( |
f4474e00 | 3463 | f, |
7682aedb | 3464 | o, |
f4474e00 LP |
3465 | monotonic, |
3466 | test_object_monotonic, | |
3467 | direction, | |
00969140 | 3468 | ret_object, ret_offset); |
de190aef LP |
3469 | } |
3470 | ||
1fc605b0 | 3471 | void journal_file_reset_location(JournalFile *f) { |
cc938e4a YW |
3472 | assert(f); |
3473 | ||
6573ef05 | 3474 | f->location_type = LOCATION_HEAD; |
1fc605b0 | 3475 | f->current_offset = 0; |
6573ef05 MS |
3476 | f->current_seqnum = 0; |
3477 | f->current_realtime = 0; | |
3478 | f->current_monotonic = 0; | |
3479 | zero(f->current_boot_id); | |
3480 | f->current_xor_hash = 0; | |
4aa33df8 YW |
3481 | |
3482 | /* Also reset the previous reading direction. Otherwise, next_beyond_location() may wrongly handle we | |
3483 | * already hit EOF. See issue #29216. */ | |
3484 | f->last_direction = _DIRECTION_INVALID; | |
6573ef05 MS |
3485 | } |
3486 | ||
950c07d4 | 3487 | void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) { |
cc938e4a YW |
3488 | assert(f); |
3489 | assert(o); | |
3490 | ||
6573ef05 MS |
3491 | f->location_type = LOCATION_SEEK; |
3492 | f->current_offset = offset; | |
3493 | f->current_seqnum = le64toh(o->entry.seqnum); | |
3494 | f->current_realtime = le64toh(o->entry.realtime); | |
3495 | f->current_monotonic = le64toh(o->entry.monotonic); | |
3496 | f->current_boot_id = o->entry.boot_id; | |
3497 | f->current_xor_hash = le64toh(o->entry.xor_hash); | |
1fc605b0 MS |
3498 | } |
3499 | ||
b6da4ed0 LP |
3500 | static bool check_properly_ordered(uint64_t new_offset, uint64_t old_offset, direction_t direction) { |
3501 | ||
3502 | /* Consider it an error if any of the two offsets is uninitialized */ | |
3503 | if (old_offset == 0 || new_offset == 0) | |
3504 | return false; | |
3505 | ||
3506 | /* If we go down, the new offset must be larger than the old one. */ | |
3507 | return direction == DIRECTION_DOWN ? | |
3508 | new_offset > old_offset : | |
3509 | new_offset < old_offset; | |
3510 | } | |
3511 | ||
de190aef LP |
3512 | int journal_file_next_entry( |
3513 | JournalFile *f, | |
f534928a | 3514 | uint64_t p, |
de190aef | 3515 | direction_t direction, |
cc938e4a YW |
3516 | Object **ret_object, |
3517 | uint64_t *ret_offset) { | |
de190aef | 3518 | |
77db35bf YW |
3519 | uint64_t i, n, q; |
3520 | Object *o; | |
cec736d2 LP |
3521 | int r; |
3522 | ||
3523 | assert(f); | |
c88cc6af | 3524 | assert(f->header); |
de190aef | 3525 | |
cc938e4a YW |
3526 | /* FIXME: fix return value assignment. */ |
3527 | ||
893e0f8f | 3528 | n = le64toh(READ_NOW(f->header->n_entries)); |
de190aef LP |
3529 | if (n <= 0) |
3530 | return 0; | |
cec736d2 | 3531 | |
77db35bf | 3532 | /* When the input offset 'p' is zero, return the first (or last on DIRECTION_UP) entry. */ |
f534928a | 3533 | if (p == 0) |
77db35bf | 3534 | return generic_array_get(f, |
de190aef | 3535 | le64toh(f->header->entry_array_offset), |
77db35bf YW |
3536 | direction == DIRECTION_DOWN ? 0 : n - 1, |
3537 | direction, | |
3538 | ret_object, ret_offset); | |
3539 | ||
3540 | /* Otherwise, first find the nearest entry object. */ | |
3541 | r = generic_array_bisect(f, | |
3542 | le64toh(f->header->entry_array_offset), | |
3543 | le64toh(f->header->n_entries), | |
3544 | p, | |
3545 | test_object_offset, | |
3546 | direction, | |
3547 | ret_object ? &o : NULL, &q, &i); | |
3548 | if (r <= 0) | |
3549 | return r; | |
de190aef | 3550 | |
77db35bf YW |
3551 | assert(direction == DIRECTION_DOWN ? p <= q : q <= p); |
3552 | ||
3553 | /* If the input offset 'p' points to an entry object, generic_array_bisect() should provides | |
3554 | * the same offset, and the index needs to be shifted. Otherwise, use the found object as is, | |
3555 | * as it is the nearest entry object from the input offset 'p'. */ | |
3556 | ||
3557 | if (p != q) | |
3558 | goto found; | |
3559 | ||
3560 | r = bump_array_index(&i, direction, n); | |
3561 | if (r <= 0) | |
3562 | return r; | |
cec736d2 | 3563 | |
de190aef | 3564 | /* And jump to it */ |
77db35bf | 3565 | r = generic_array_get(f, le64toh(f->header->entry_array_offset), i, direction, ret_object ? &o : NULL, &q); |
8d801e35 DDM |
3566 | if (r <= 0) |
3567 | return r; | |
fb099c8d | 3568 | |
b6da4ed0 | 3569 | /* Ensure our array is properly ordered. */ |
77db35bf | 3570 | if (!check_properly_ordered(q, p, direction)) |
baaa35ad | 3571 | return log_debug_errno(SYNTHETIC_ERRNO(EBADMSG), |
77db35bf | 3572 | "%s: entry array not properly ordered at entry index %" PRIu64, |
baaa35ad | 3573 | f->path, i); |
77db35bf YW |
3574 | found: |
3575 | if (ret_object) | |
3576 | *ret_object = o; | |
f4474e00 | 3577 | if (ret_offset) |
77db35bf | 3578 | *ret_offset = q; |
fb099c8d ZJS |
3579 | |
3580 | return 1; | |
de190aef | 3581 | } |
cec736d2 | 3582 | |
d37eeabc | 3583 | int journal_file_move_to_entry_for_data( |
de190aef | 3584 | JournalFile *f, |
ec50313d | 3585 | Object *d, |
de190aef | 3586 | direction_t direction, |
cc938e4a YW |
3587 | Object **ret_object, |
3588 | uint64_t *ret_offset) { | |
de190aef | 3589 | |
d37eeabc YW |
3590 | uint64_t extra, first, n; |
3591 | int r = 0; | |
cec736d2 LP |
3592 | |
3593 | assert(f); | |
ec50313d DDM |
3594 | assert(d); |
3595 | assert(d->object.type == OBJECT_DATA); | |
d37eeabc | 3596 | assert(IN_SET(direction, DIRECTION_DOWN, DIRECTION_UP)); |
cec736d2 | 3597 | |
cc938e4a YW |
3598 | /* FIXME: fix return value assignment. */ |
3599 | ||
d37eeabc YW |
3600 | /* This returns the first (when the direction is down, otherwise the last) entry linked to the |
3601 | * specified data object. */ | |
3602 | ||
3603 | n = le64toh(d->data.n_entries); | |
de190aef | 3604 | if (n <= 0) |
d37eeabc YW |
3605 | return 0; |
3606 | n--; /* n_entries is the number of entries linked to the data object, including the 'extra' entry. */ | |
cec736d2 | 3607 | |
d37eeabc YW |
3608 | extra = le64toh(d->data.entry_offset); |
3609 | first = le64toh(d->data.entry_array_offset); | |
cec736d2 | 3610 | |
d37eeabc YW |
3611 | if (direction == DIRECTION_DOWN && extra > 0) { |
3612 | /* When we are going downwards, first try to read the extra entry. */ | |
3613 | r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, ret_object); | |
3614 | if (r >= 0) | |
3615 | goto use_extra; | |
3616 | if (!IN_SET(r, -EADDRNOTAVAIL, -EBADMSG)) | |
3617 | return r; | |
3618 | } | |
ded5034e | 3619 | |
d37eeabc YW |
3620 | if (n > 0) { |
3621 | /* DIRECTION_DOWN : The extra entry is broken, falling back to the entries in the array. | |
3622 | * DIRECTION_UP : Try to find a valid entry in the array from the tail. */ | |
3623 | r = generic_array_get(f, | |
3624 | first, | |
3625 | direction == DIRECTION_DOWN ? 0 : n - 1, | |
3626 | direction, | |
3627 | ret_object, ret_offset); | |
3628 | if (!IN_SET(r, 0, -EADDRNOTAVAIL, -EBADMSG)) | |
3629 | return r; /* found or critical error. */ | |
3630 | } | |
3631 | ||
3632 | if (direction == DIRECTION_UP && extra > 0) { | |
3633 | /* No valid entry exists in the chained array, falling back to the extra entry. */ | |
3634 | r = journal_file_move_to_object(f, OBJECT_ENTRY, extra, ret_object); | |
3635 | if (r >= 0) | |
3636 | goto use_extra; | |
3637 | } | |
3638 | ||
3639 | return r; | |
3640 | ||
3641 | use_extra: | |
f4474e00 | 3642 | if (ret_offset) |
d37eeabc | 3643 | *ret_offset = extra; |
ded5034e LP |
3644 | |
3645 | return 1; | |
de190aef | 3646 | } |
cec736d2 | 3647 | |
cbdca852 LP |
3648 | int journal_file_move_to_entry_by_offset_for_data( |
3649 | JournalFile *f, | |
ec50313d | 3650 | Object *d, |
cbdca852 LP |
3651 | uint64_t p, |
3652 | direction_t direction, | |
f4474e00 | 3653 | Object **ret, uint64_t *ret_offset) { |
cbdca852 | 3654 | |
cbdca852 | 3655 | assert(f); |
ec50313d DDM |
3656 | assert(d); |
3657 | assert(d->object.type == OBJECT_DATA); | |
cbdca852 | 3658 | |
7682aedb | 3659 | return generic_array_bisect_for_data( |
f4474e00 | 3660 | f, |
7682aedb | 3661 | d, |
f4474e00 LP |
3662 | p, |
3663 | test_object_offset, | |
3664 | direction, | |
00969140 | 3665 | ret, ret_offset); |
cbdca852 LP |
3666 | } |
3667 | ||
3668 | int journal_file_move_to_entry_by_monotonic_for_data( | |
3669 | JournalFile *f, | |
ec50313d | 3670 | Object *d, |
cbdca852 LP |
3671 | sd_id128_t boot_id, |
3672 | uint64_t monotonic, | |
3673 | direction_t direction, | |
cc938e4a YW |
3674 | Object **ret_object, |
3675 | uint64_t *ret_offset) { | |
cbdca852 | 3676 | |
80f96c0c | 3677 | Object *o, *entry; |
7682aedb | 3678 | uint64_t z; |
cbdca852 | 3679 | int r; |
cbdca852 LP |
3680 | |
3681 | assert(f); | |
ec50313d DDM |
3682 | assert(d); |
3683 | assert(d->object.type == OBJECT_DATA); | |
3684 | ||
7682aedb YW |
3685 | /* First, pin the given data object, before reading the _BOOT_ID= data object below. */ |
3686 | r = journal_file_pin_object(f, d); | |
3687 | if (r < 0) | |
3688 | return r; | |
cbdca852 | 3689 | |
7682aedb | 3690 | /* Then, read a data object for _BOOT_ID= and seek by time. */ |
20c45e57 | 3691 | r = find_data_object_by_boot_id(f, boot_id, &o, NULL); |
304cb08f | 3692 | if (r <= 0) |
cbdca852 | 3693 | return r; |
cbdca852 | 3694 | |
7682aedb YW |
3695 | r = generic_array_bisect_for_data(f, |
3696 | o, | |
cbdca852 LP |
3697 | monotonic, |
3698 | test_object_monotonic, | |
3699 | direction, | |
00969140 | 3700 | NULL, &z); |
cbdca852 LP |
3701 | if (r <= 0) |
3702 | return r; | |
3703 | ||
80f96c0c | 3704 | /* And now, continue seeking until we find an entry that exists in both bisection arrays. */ |
cbdca852 | 3705 | for (;;) { |
80f96c0c YW |
3706 | uint64_t p; |
3707 | ||
3708 | /* The journal entry found by the above bisect_plus_one() may not have the specified data, | |
3709 | * that is, it may not be linked in the data object. So, we need to check that. */ | |
cbdca852 | 3710 | |
7682aedb YW |
3711 | r = journal_file_move_to_entry_by_offset_for_data( |
3712 | f, d, z, direction, ret_object ? &entry : NULL, &p); | |
cbdca852 LP |
3713 | if (r <= 0) |
3714 | return r; | |
80f96c0c YW |
3715 | if (p == z) |
3716 | break; /* The journal entry has the specified data. Yay! */ | |
3717 | ||
3718 | /* If the entry does not have the data, then move to the next (or previous, depends on the | |
3719 | * 'direction') entry linked to the data object. But, the next entry may be in another boot. | |
3720 | * So, we need to check that the entry has the matching boot ID. */ | |
cbdca852 | 3721 | |
7682aedb YW |
3722 | r = journal_file_move_to_entry_by_offset_for_data( |
3723 | f, o, p, direction, ret_object ? &entry : NULL, &z); | |
cbdca852 LP |
3724 | if (r <= 0) |
3725 | return r; | |
80f96c0c YW |
3726 | if (p == z) |
3727 | break; /* The journal entry has the specified boot ID. Yay! */ | |
cbdca852 | 3728 | |
80f96c0c | 3729 | /* If not, let's try to the next entry... */ |
cbdca852 | 3730 | } |
80f96c0c YW |
3731 | |
3732 | if (ret_object) | |
3733 | *ret_object = entry; | |
3734 | if (ret_offset) | |
3735 | *ret_offset = z; | |
3736 | return 1; | |
cbdca852 LP |
3737 | } |
3738 | ||
de190aef LP |
3739 | int journal_file_move_to_entry_by_seqnum_for_data( |
3740 | JournalFile *f, | |
ec50313d | 3741 | Object *d, |
de190aef LP |
3742 | uint64_t seqnum, |
3743 | direction_t direction, | |
cc938e4a YW |
3744 | Object **ret_object, |
3745 | uint64_t *ret_offset) { | |
cec736d2 | 3746 | |
91a31dde | 3747 | assert(f); |
ec50313d DDM |
3748 | assert(d); |
3749 | assert(d->object.type == OBJECT_DATA); | |
cec736d2 | 3750 | |
7682aedb | 3751 | return generic_array_bisect_for_data( |
f4474e00 | 3752 | f, |
7682aedb | 3753 | d, |
f4474e00 LP |
3754 | seqnum, |
3755 | test_object_seqnum, | |
3756 | direction, | |
00969140 | 3757 | ret_object, ret_offset); |
de190aef | 3758 | } |
cec736d2 | 3759 | |
de190aef LP |
3760 | int journal_file_move_to_entry_by_realtime_for_data( |
3761 | JournalFile *f, | |
ec50313d | 3762 | Object *d, |
de190aef LP |
3763 | uint64_t realtime, |
3764 | direction_t direction, | |
f4474e00 | 3765 | Object **ret, uint64_t *ret_offset) { |
de190aef | 3766 | |
91a31dde | 3767 | assert(f); |
ec50313d DDM |
3768 | assert(d); |
3769 | assert(d->object.type == OBJECT_DATA); | |
de190aef | 3770 | |
7682aedb | 3771 | return generic_array_bisect_for_data( |
f4474e00 | 3772 | f, |
7682aedb | 3773 | d, |
f4474e00 LP |
3774 | realtime, |
3775 | test_object_realtime, | |
3776 | direction, | |
00969140 | 3777 | ret, ret_offset); |
cec736d2 LP |
3778 | } |
3779 | ||
0284adc6 | 3780 | void journal_file_dump(JournalFile *f) { |
7560fffc | 3781 | Object *o; |
0284adc6 | 3782 | uint64_t p; |
cc938e4a | 3783 | int r; |
7560fffc LP |
3784 | |
3785 | assert(f); | |
c88cc6af | 3786 | assert(f->header); |
7560fffc | 3787 | |
0284adc6 | 3788 | journal_file_print_header(f); |
7560fffc | 3789 | |
893e0f8f | 3790 | p = le64toh(READ_NOW(f->header->header_size)); |
0284adc6 | 3791 | while (p != 0) { |
363b2b9a | 3792 | const char *s; |
acc50c92 | 3793 | Compression c; |
363b2b9a | 3794 | |
d05089d8 | 3795 | r = journal_file_move_to_object(f, OBJECT_UNUSED, p, &o); |
0284adc6 LP |
3796 | if (r < 0) |
3797 | goto fail; | |
7560fffc | 3798 | |
363b2b9a | 3799 | s = journal_object_type_to_string(o->object.type); |
7560fffc | 3800 | |
363b2b9a | 3801 | switch (o->object.type) { |
3c1668da | 3802 | |
0284adc6 | 3803 | case OBJECT_ENTRY: |
363b2b9a DDM |
3804 | assert(s); |
3805 | ||
3806 | printf("Type: %s seqnum=%"PRIu64" monotonic=%"PRIu64" realtime=%"PRIu64"\n", | |
3807 | s, | |
507f22bd ZJS |
3808 | le64toh(o->entry.seqnum), |
3809 | le64toh(o->entry.monotonic), | |
3810 | le64toh(o->entry.realtime)); | |
0284adc6 | 3811 | break; |
7560fffc | 3812 | |
0284adc6 | 3813 | case OBJECT_TAG: |
363b2b9a DDM |
3814 | assert(s); |
3815 | ||
3816 | printf("Type: %s seqnum=%"PRIu64" epoch=%"PRIu64"\n", | |
3817 | s, | |
507f22bd ZJS |
3818 | le64toh(o->tag.seqnum), |
3819 | le64toh(o->tag.epoch)); | |
0284adc6 | 3820 | break; |
3c1668da LP |
3821 | |
3822 | default: | |
363b2b9a DDM |
3823 | if (s) |
3824 | printf("Type: %s \n", s); | |
3825 | else | |
3826 | printf("Type: unknown (%i)", o->object.type); | |
3827 | ||
3c1668da | 3828 | break; |
0284adc6 | 3829 | } |
7560fffc | 3830 | |
acc50c92 LP |
3831 | c = COMPRESSION_FROM_OBJECT(o); |
3832 | if (c > COMPRESSION_NONE) | |
d89c8fdf | 3833 | printf("Flags: %s\n", |
acc50c92 | 3834 | compression_to_string(c)); |
7560fffc | 3835 | |
0284adc6 LP |
3836 | if (p == le64toh(f->header->tail_object_offset)) |
3837 | p = 0; | |
3838 | else | |
71139898 | 3839 | p += ALIGN64(le64toh(o->object.size)); |
0284adc6 | 3840 | } |
7560fffc | 3841 | |
0284adc6 LP |
3842 | return; |
3843 | fail: | |
3844 | log_error("File corrupt"); | |
7560fffc LP |
3845 | } |
3846 | ||
5e62ac8b ZJS |
3847 | /* Note: the lifetime of the compound literal is the immediately surrounding block. */ |
3848 | #define FORMAT_TIMESTAMP_SAFE(t) (FORMAT_TIMESTAMP(t) ?: " --- ") | |
718fe4b1 | 3849 | |
0284adc6 | 3850 | void journal_file_print_header(JournalFile *f) { |
a1a03e30 | 3851 | struct stat st; |
7560fffc LP |
3852 | |
3853 | assert(f); | |
c88cc6af | 3854 | assert(f->header); |
7560fffc | 3855 | |
2c54acb1 | 3856 | printf("File path: %s\n" |
0284adc6 LP |
3857 | "File ID: %s\n" |
3858 | "Machine ID: %s\n" | |
3859 | "Boot ID: %s\n" | |
2c54acb1 | 3860 | "Sequential number ID: %s\n" |
0284adc6 | 3861 | "State: %s\n" |
e375bc5f | 3862 | "Compatible flags:%s%s%s%s\n" |
87413812 | 3863 | "Incompatible flags:%s%s%s%s%s%s\n" |
507f22bd ZJS |
3864 | "Header size: %"PRIu64"\n" |
3865 | "Arena size: %"PRIu64"\n" | |
2c54acb1 TN |
3866 | "Data hash table size: %"PRIu64"\n" |
3867 | "Field hash table size: %"PRIu64"\n" | |
3868 | "Rotate suggested: %s\n" | |
3869 | "Head sequential number: %"PRIu64" (%"PRIx64")\n" | |
3870 | "Tail sequential number: %"PRIu64" (%"PRIx64")\n" | |
3871 | "Head realtime timestamp: %s (%"PRIx64")\n" | |
3872 | "Tail realtime timestamp: %s (%"PRIx64")\n" | |
3873 | "Tail monotonic timestamp: %s (%"PRIx64")\n" | |
507f22bd | 3874 | "Objects: %"PRIu64"\n" |
2c54acb1 | 3875 | "Entry objects: %"PRIu64"\n", |
0284adc6 | 3876 | f->path, |
85b55869 LP |
3877 | SD_ID128_TO_STRING(f->header->file_id), |
3878 | SD_ID128_TO_STRING(f->header->machine_id), | |
9204fc64 | 3879 | SD_ID128_TO_STRING(f->header->tail_entry_boot_id), |
85b55869 | 3880 | SD_ID128_TO_STRING(f->header->seqnum_id), |
3223f44f LP |
3881 | f->header->state == STATE_OFFLINE ? "OFFLINE" : |
3882 | f->header->state == STATE_ONLINE ? "ONLINE" : | |
3883 | f->header->state == STATE_ARCHIVED ? "ARCHIVED" : "UNKNOWN", | |
8088cbd3 | 3884 | JOURNAL_HEADER_SEALED(f->header) ? " SEALED" : "", |
e375bc5f | 3885 | JOURNAL_HEADER_SEALED_CONTINUOUS(f->header) ? " SEALED_CONTINUOUS" : "", |
9204fc64 | 3886 | JOURNAL_HEADER_TAIL_ENTRY_BOOT_ID(f->header) ? " TAIL_ENTRY_BOOT_ID" : "", |
d89c8fdf ZJS |
3887 | (le32toh(f->header->compatible_flags) & ~HEADER_COMPATIBLE_ANY) ? " ???" : "", |
3888 | JOURNAL_HEADER_COMPRESSED_XZ(f->header) ? " COMPRESSED-XZ" : "", | |
3889 | JOURNAL_HEADER_COMPRESSED_LZ4(f->header) ? " COMPRESSED-LZ4" : "", | |
8653185a | 3890 | JOURNAL_HEADER_COMPRESSED_ZSTD(f->header) ? " COMPRESSED-ZSTD" : "", |
4ce534f4 | 3891 | JOURNAL_HEADER_KEYED_HASH(f->header) ? " KEYED-HASH" : "", |
87413812 | 3892 | JOURNAL_HEADER_COMPACT(f->header) ? " COMPACT" : "", |
d89c8fdf | 3893 | (le32toh(f->header->incompatible_flags) & ~HEADER_INCOMPATIBLE_ANY) ? " ???" : "", |
507f22bd ZJS |
3894 | le64toh(f->header->header_size), |
3895 | le64toh(f->header->arena_size), | |
3896 | le64toh(f->header->data_hash_table_size) / sizeof(HashItem), | |
3897 | le64toh(f->header->field_hash_table_size) / sizeof(HashItem), | |
c8e6e1f1 | 3898 | yes_no(journal_file_rotate_suggested(f, 0, LOG_DEBUG)), |
0808b92f LP |
3899 | le64toh(f->header->head_entry_seqnum), le64toh(f->header->head_entry_seqnum), |
3900 | le64toh(f->header->tail_entry_seqnum), le64toh(f->header->tail_entry_seqnum), | |
5e62ac8b ZJS |
3901 | FORMAT_TIMESTAMP_SAFE(le64toh(f->header->head_entry_realtime)), le64toh(f->header->head_entry_realtime), |
3902 | FORMAT_TIMESTAMP_SAFE(le64toh(f->header->tail_entry_realtime)), le64toh(f->header->tail_entry_realtime), | |
5291f26d | 3903 | FORMAT_TIMESPAN(le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC), le64toh(f->header->tail_entry_monotonic), |
507f22bd ZJS |
3904 | le64toh(f->header->n_objects), |
3905 | le64toh(f->header->n_entries)); | |
7560fffc | 3906 | |
0284adc6 | 3907 | if (JOURNAL_HEADER_CONTAINS(f->header, n_data)) |
2c54acb1 TN |
3908 | printf("Data objects: %"PRIu64"\n" |
3909 | "Data hash table fill: %.1f%%\n", | |
507f22bd | 3910 | le64toh(f->header->n_data), |
0284adc6 | 3911 | 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)))); |
7560fffc | 3912 | |
0284adc6 | 3913 | if (JOURNAL_HEADER_CONTAINS(f->header, n_fields)) |
2c54acb1 TN |
3914 | printf("Field objects: %"PRIu64"\n" |
3915 | "Field hash table fill: %.1f%%\n", | |
507f22bd | 3916 | le64toh(f->header->n_fields), |
0284adc6 | 3917 | 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)))); |
3223f44f LP |
3918 | |
3919 | if (JOURNAL_HEADER_CONTAINS(f->header, n_tags)) | |
2c54acb1 | 3920 | printf("Tag objects: %"PRIu64"\n", |
507f22bd | 3921 | le64toh(f->header->n_tags)); |
3223f44f | 3922 | if (JOURNAL_HEADER_CONTAINS(f->header, n_entry_arrays)) |
2c54acb1 | 3923 | printf("Entry array objects: %"PRIu64"\n", |
507f22bd | 3924 | le64toh(f->header->n_entry_arrays)); |
a1a03e30 | 3925 | |
0dbe57ee LP |
3926 | if (JOURNAL_HEADER_CONTAINS(f->header, field_hash_chain_depth)) |
3927 | printf("Deepest field hash chain: %" PRIu64"\n", | |
3928 | f->header->field_hash_chain_depth); | |
3929 | ||
3930 | if (JOURNAL_HEADER_CONTAINS(f->header, data_hash_chain_depth)) | |
3931 | printf("Deepest data hash chain: %" PRIu64"\n", | |
3932 | f->header->data_hash_chain_depth); | |
3933 | ||
a1a03e30 | 3934 | if (fstat(f->fd, &st) >= 0) |
2b59bf51 | 3935 | printf("Disk usage: %s\n", FORMAT_BYTES((uint64_t) st.st_blocks * 512ULL)); |
7560fffc LP |
3936 | } |
3937 | ||
fc68c929 LP |
3938 | static int journal_file_warn_btrfs(JournalFile *f) { |
3939 | unsigned attrs; | |
3940 | int r; | |
3941 | ||
3942 | assert(f); | |
3943 | ||
3944 | /* Before we write anything, check if the COW logic is turned | |
3945 | * off on btrfs. Given our write pattern that is quite | |
3946 | * unfriendly to COW file systems this should greatly improve | |
3947 | * performance on COW file systems, such as btrfs, at the | |
3948 | * expense of data integrity features (which shouldn't be too | |
3949 | * bad, given that we do our own checksumming). */ | |
3950 | ||
65ddc2c5 | 3951 | r = fd_is_fs_type(f->fd, BTRFS_SUPER_MAGIC); |
fc68c929 | 3952 | if (r < 0) |
04cb8ee8 | 3953 | return log_ratelimit_warning_errno(r, JOURNAL_LOG_RATELIMIT, "Failed to determine if journal is on btrfs: %m"); |
79de6eb1 | 3954 | if (r == 0) |
fc68c929 LP |
3955 | return 0; |
3956 | ||
3957 | r = read_attr_fd(f->fd, &attrs); | |
3958 | if (r < 0) | |
04cb8ee8 | 3959 | return log_ratelimit_warning_errno(r, JOURNAL_LOG_RATELIMIT, "Failed to read file attributes: %m"); |
fc68c929 LP |
3960 | |
3961 | if (attrs & FS_NOCOW_FL) { | |
3962 | log_debug("Detected btrfs file system with copy-on-write disabled, all is good."); | |
3963 | return 0; | |
3964 | } | |
3965 | ||
04cb8ee8 DDM |
3966 | log_ratelimit_notice(JOURNAL_LOG_RATELIMIT, |
3967 | "Creating journal file %s on a btrfs file system, and copy-on-write is enabled. " | |
3968 | "This is likely to slow down journal access substantially, please consider turning " | |
3969 | "off the copy-on-write file attribute on the journal directory, using chattr +C.", | |
3970 | f->path); | |
fc68c929 LP |
3971 | |
3972 | return 1; | |
3973 | } | |
3974 | ||
87413812 | 3975 | static void journal_default_metrics(JournalMetrics *m, int fd, bool compact) { |
5517607a DDM |
3976 | struct statvfs ss; |
3977 | uint64_t fs_size = 0; | |
3978 | ||
3979 | assert(m); | |
3980 | assert(fd >= 0); | |
3981 | ||
3982 | if (fstatvfs(fd, &ss) >= 0) | |
ffee7b97 | 3983 | fs_size = u64_multiply_safe(ss.f_frsize, ss.f_blocks); |
5517607a DDM |
3984 | else |
3985 | log_debug_errno(errno, "Failed to determine disk size: %m"); | |
3986 | ||
3987 | if (m->max_use == UINT64_MAX) { | |
3988 | ||
3989 | if (fs_size > 0) | |
b39907c7 | 3990 | m->max_use = CLAMP(PAGE_ALIGN_U64(fs_size / 10), /* 10% of file system size */ |
5517607a DDM |
3991 | MAX_USE_LOWER, MAX_USE_UPPER); |
3992 | else | |
3993 | m->max_use = MAX_USE_LOWER; | |
3994 | } else { | |
b39907c7 | 3995 | m->max_use = PAGE_ALIGN_U64(m->max_use); |
5517607a DDM |
3996 | |
3997 | if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2) | |
3998 | m->max_use = JOURNAL_FILE_SIZE_MIN*2; | |
3999 | } | |
4000 | ||
4001 | if (m->min_use == UINT64_MAX) { | |
4002 | if (fs_size > 0) | |
b39907c7 | 4003 | m->min_use = CLAMP(PAGE_ALIGN_U64(fs_size / 50), /* 2% of file system size */ |
5517607a DDM |
4004 | MIN_USE_LOW, MIN_USE_HIGH); |
4005 | else | |
4006 | m->min_use = MIN_USE_LOW; | |
4007 | } | |
4008 | ||
4009 | if (m->min_use > m->max_use) | |
4010 | m->min_use = m->max_use; | |
4011 | ||
4012 | if (m->max_size == UINT64_MAX) | |
b39907c7 | 4013 | m->max_size = MIN(PAGE_ALIGN_U64(m->max_use / 8), /* 8 chunks */ |
5517607a DDM |
4014 | MAX_SIZE_UPPER); |
4015 | else | |
b39907c7 | 4016 | m->max_size = PAGE_ALIGN_U64(m->max_size); |
5517607a | 4017 | |
87413812 DDM |
4018 | if (compact && m->max_size > JOURNAL_COMPACT_SIZE_MAX) |
4019 | m->max_size = JOURNAL_COMPACT_SIZE_MAX; | |
4020 | ||
5517607a DDM |
4021 | if (m->max_size != 0) { |
4022 | if (m->max_size < JOURNAL_FILE_SIZE_MIN) | |
4023 | m->max_size = JOURNAL_FILE_SIZE_MIN; | |
4024 | ||
4025 | if (m->max_use != 0 && m->max_size*2 > m->max_use) | |
4026 | m->max_use = m->max_size*2; | |
4027 | } | |
4028 | ||
4029 | if (m->min_size == UINT64_MAX) | |
4030 | m->min_size = JOURNAL_FILE_SIZE_MIN; | |
4031 | else | |
b39907c7 | 4032 | m->min_size = CLAMP(PAGE_ALIGN_U64(m->min_size), |
5517607a DDM |
4033 | JOURNAL_FILE_SIZE_MIN, |
4034 | m->max_size ?: UINT64_MAX); | |
4035 | ||
4036 | if (m->keep_free == UINT64_MAX) { | |
4037 | if (fs_size > 0) | |
b39907c7 | 4038 | m->keep_free = MIN(PAGE_ALIGN_U64(fs_size / 20), /* 5% of file system size */ |
5517607a DDM |
4039 | KEEP_FREE_UPPER); |
4040 | else | |
4041 | m->keep_free = DEFAULT_KEEP_FREE; | |
4042 | } | |
4043 | ||
4044 | if (m->n_max_files == UINT64_MAX) | |
4045 | m->n_max_files = DEFAULT_N_MAX_FILES; | |
4046 | ||
4047 | log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64, | |
4048 | FORMAT_BYTES(m->min_use), | |
4049 | FORMAT_BYTES(m->max_use), | |
4050 | FORMAT_BYTES(m->max_size), | |
4051 | FORMAT_BYTES(m->min_size), | |
4052 | FORMAT_BYTES(m->keep_free), | |
4053 | m->n_max_files); | |
4054 | } | |
4055 | ||
0284adc6 | 4056 | int journal_file_open( |
5d1ce257 | 4057 | int fd, |
0284adc6 | 4058 | const char *fname, |
49615dbd LP |
4059 | int open_flags, |
4060 | JournalFileFlags file_flags, | |
0284adc6 | 4061 | mode_t mode, |
57850536 | 4062 | uint64_t compress_threshold_bytes, |
0284adc6 LP |
4063 | JournalMetrics *metrics, |
4064 | MMapCache *mmap_cache, | |
4065 | JournalFile *template, | |
4066 | JournalFile **ret) { | |
7560fffc | 4067 | |
fa6ac760 | 4068 | bool newly_created = false; |
0284adc6 | 4069 | JournalFile *f; |
fa6ac760 | 4070 | void *h; |
0284adc6 | 4071 | int r; |
7560fffc | 4072 | |
5d1ce257 | 4073 | assert(fd >= 0 || fname); |
ce92dc27 LP |
4074 | assert(file_flags >= 0); |
4075 | assert(file_flags <= _JOURNAL_FILE_FLAGS_MAX); | |
74fb5be6 | 4076 | assert(mmap_cache); |
cc938e4a | 4077 | assert(ret); |
7560fffc | 4078 | |
49615dbd | 4079 | if (!IN_SET((open_flags & O_ACCMODE), O_RDONLY, O_RDWR)) |
0284adc6 | 4080 | return -EINVAL; |
7560fffc | 4081 | |
49615dbd | 4082 | if ((open_flags & O_ACCMODE) == O_RDONLY && FLAGS_SET(open_flags, O_CREAT)) |
d120d897 LP |
4083 | return -EINVAL; |
4084 | ||
49615dbd | 4085 | if (fname && (open_flags & O_CREAT) && !endswith(fname, ".journal")) |
6eda13d3 | 4086 | return -EINVAL; |
7560fffc | 4087 | |
971b52c4 | 4088 | f = new(JournalFile, 1); |
0284adc6 LP |
4089 | if (!f) |
4090 | return -ENOMEM; | |
7560fffc | 4091 | |
971b52c4 LP |
4092 | *f = (JournalFile) { |
4093 | .fd = fd, | |
4094 | .mode = mode, | |
49615dbd | 4095 | .open_flags = open_flags, |
f5fbe71d | 4096 | .compress_threshold_bytes = compress_threshold_bytes == UINT64_MAX ? |
971b52c4 LP |
4097 | DEFAULT_COMPRESS_THRESHOLD : |
4098 | MAX(MIN_COMPRESS_THRESHOLD, compress_threshold_bytes), | |
ce92dc27 | 4099 | .strict_order = FLAGS_SET(file_flags, JOURNAL_STRICT_ORDER), |
34af7494 | 4100 | .newest_boot_id_prioq_idx = PRIOQ_IDX_NULL, |
4aa33df8 | 4101 | .last_direction = _DIRECTION_INVALID, |
971b52c4 | 4102 | }; |
7560fffc | 4103 | |
7645c77b | 4104 | if (fname) { |
5d1ce257 | 4105 | f->path = strdup(fname); |
7645c77b ZJS |
4106 | if (!f->path) { |
4107 | r = -ENOMEM; | |
4108 | goto fail; | |
4109 | } | |
4110 | } else { | |
817b1c5b LP |
4111 | assert(fd >= 0); |
4112 | ||
7645c77b ZJS |
4113 | /* If we don't know the path, fill in something explanatory and vaguely useful */ |
4114 | if (asprintf(&f->path, "/proc/self/%i", fd) < 0) { | |
4115 | r = -ENOMEM; | |
4116 | goto fail; | |
4117 | } | |
0284adc6 | 4118 | } |
7560fffc | 4119 | |
4743015d | 4120 | f->chain_cache = ordered_hashmap_new(&uint64_hash_ops); |
a4bcff5b LP |
4121 | if (!f->chain_cache) { |
4122 | r = -ENOMEM; | |
4123 | goto fail; | |
4124 | } | |
4125 | ||
0284adc6 | 4126 | if (f->fd < 0) { |
817b1c5b LP |
4127 | /* We pass O_NONBLOCK here, so that in case somebody pointed us to some character device node or FIFO |
4128 | * or so, we likely fail quickly than block for long. For regular files O_NONBLOCK has no effect, hence | |
4129 | * it doesn't hurt in that case. */ | |
4130 | ||
49615dbd | 4131 | f->fd = openat_report_new(AT_FDCWD, f->path, f->open_flags|O_CLOEXEC|O_NONBLOCK, f->mode, &newly_created); |
5d1ce257 | 4132 | if (f->fd < 0) { |
db5e7d75 | 4133 | r = f->fd; |
5d1ce257 LP |
4134 | goto fail; |
4135 | } | |
4136 | ||
4137 | /* fds we opened here by us should also be closed by us. */ | |
4138 | f->close_fd = true; | |
817b1c5b LP |
4139 | |
4140 | r = fd_nonblock(f->fd, false); | |
4141 | if (r < 0) | |
4142 | goto fail; | |
db5e7d75 LP |
4143 | |
4144 | if (!newly_created) { | |
4145 | r = journal_file_fstat(f); | |
4146 | if (r < 0) | |
4147 | goto fail; | |
4148 | } | |
4149 | } else { | |
4150 | r = journal_file_fstat(f); | |
4151 | if (r < 0) | |
4152 | goto fail; | |
4153 | ||
4154 | /* If we just got the fd passed in, we don't really know if we created the file anew */ | |
4374d7ea | 4155 | newly_created = f->last_stat.st_size == 0 && journal_file_writable(f); |
7560fffc | 4156 | } |
7560fffc | 4157 | |
8926a6a4 YW |
4158 | r = mmap_cache_add_fd(mmap_cache, f->fd, mmap_prot_from_open_flags(open_flags), &f->cache_fd); |
4159 | if (r < 0) | |
be7cdd8e | 4160 | goto fail; |
be7cdd8e | 4161 | |
db5e7d75 | 4162 | if (newly_created) { |
fc68c929 | 4163 | (void) journal_file_warn_btrfs(f); |
11689d2a | 4164 | |
4c2e1b39 LP |
4165 | /* Let's attach the creation time to the journal file, so that the vacuuming code knows the age of this |
4166 | * file even if the file might end up corrupted one day... Ideally we'd just use the creation time many | |
4167 | * file systems maintain for each file, but the API to query this is very new, hence let's emulate this | |
4168 | * via extended attributes. If extended attributes are not supported we'll just skip this, and rely | |
4169 | * solely on mtime/atime/ctime of the file. */ | |
4170 | (void) fd_setcrtime(f->fd, 0); | |
7560fffc | 4171 | |
4374d7ea | 4172 | r = journal_file_init_header(f, file_flags, template); |
0284adc6 LP |
4173 | if (r < 0) |
4174 | goto fail; | |
7560fffc | 4175 | |
2678031a LP |
4176 | r = journal_file_fstat(f); |
4177 | if (r < 0) | |
0284adc6 | 4178 | goto fail; |
0284adc6 | 4179 | } |
7560fffc | 4180 | |
0284adc6 | 4181 | if (f->last_stat.st_size < (off_t) HEADER_SIZE_MIN) { |
cfb571f3 | 4182 | r = -ENODATA; |
0284adc6 LP |
4183 | goto fail; |
4184 | } | |
7560fffc | 4185 | |
1a25ab66 | 4186 | r = mmap_cache_fd_get(f->cache_fd, MMAP_CACHE_CATEGORY_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h); |
5087825e LP |
4187 | if (r == -EINVAL) { |
4188 | /* Some file systems (jffs2 or p9fs) don't support mmap() properly (or only read-only | |
4189 | * mmap()), and return EINVAL in that case. Let's propagate that as a more recognizable error | |
4190 | * code. */ | |
4191 | r = -EAFNOSUPPORT; | |
4192 | goto fail; | |
4193 | } | |
977eaa1e | 4194 | if (r < 0) |
0284adc6 | 4195 | goto fail; |
7560fffc | 4196 | |
fa6ac760 LP |
4197 | f->header = h; |
4198 | ||
0284adc6 LP |
4199 | if (!newly_created) { |
4200 | r = journal_file_verify_header(f); | |
4201 | if (r < 0) | |
4202 | goto fail; | |
4203 | } | |
7560fffc | 4204 | |
349cc4a5 | 4205 | #if HAVE_GCRYPT |
4374d7ea | 4206 | if (!newly_created && journal_file_writable(f) && JOURNAL_HEADER_SEALED(f->header)) { |
baed47c3 | 4207 | r = journal_file_fss_load(f); |
0284adc6 LP |
4208 | if (r < 0) |
4209 | goto fail; | |
4210 | } | |
feb12d3e | 4211 | #endif |
cec736d2 | 4212 | |
4374d7ea | 4213 | if (journal_file_writable(f)) { |
4a92baf3 | 4214 | if (metrics) { |
87413812 | 4215 | journal_default_metrics(metrics, f->fd, JOURNAL_HEADER_COMPACT(f->header)); |
4a92baf3 LP |
4216 | f->metrics = *metrics; |
4217 | } else if (template) | |
4218 | f->metrics = template->metrics; | |
4219 | ||
cec736d2 LP |
4220 | r = journal_file_refresh_header(f); |
4221 | if (r < 0) | |
4222 | goto fail; | |
4223 | } | |
4224 | ||
349cc4a5 | 4225 | #if HAVE_GCRYPT |
baed47c3 | 4226 | r = journal_file_hmac_setup(f); |
14d10188 LP |
4227 | if (r < 0) |
4228 | goto fail; | |
feb12d3e | 4229 | #endif |
14d10188 | 4230 | |
cec736d2 | 4231 | if (newly_created) { |
de190aef | 4232 | r = journal_file_setup_field_hash_table(f); |
cec736d2 LP |
4233 | if (r < 0) |
4234 | goto fail; | |
4235 | ||
de190aef | 4236 | r = journal_file_setup_data_hash_table(f); |
cec736d2 LP |
4237 | if (r < 0) |
4238 | goto fail; | |
7560fffc | 4239 | |
349cc4a5 | 4240 | #if HAVE_GCRYPT |
7560fffc LP |
4241 | r = journal_file_append_first_tag(f); |
4242 | if (r < 0) | |
4243 | goto fail; | |
feb12d3e | 4244 | #endif |
cec736d2 LP |
4245 | } |
4246 | ||
c3bd54bf | 4247 | if (mmap_cache_fd_got_sigbus(f->cache_fd)) { |
fa6ac760 LP |
4248 | r = -EIO; |
4249 | goto fail; | |
4250 | } | |
4251 | ||
7a24f3bf | 4252 | if (template && template->post_change_timer) { |
e167d7fd LP |
4253 | r = journal_file_enable_post_change_timer( |
4254 | f, | |
4255 | sd_event_source_get_event(template->post_change_timer), | |
4256 | template->post_change_timer_period); | |
7a24f3bf | 4257 | |
7a24f3bf VC |
4258 | if (r < 0) |
4259 | goto fail; | |
4260 | } | |
4261 | ||
f8e2f4d6 | 4262 | /* The file is opened now successfully, thus we take possession of any passed in fd. */ |
5d1ce257 LP |
4263 | f->close_fd = true; |
4264 | ||
4374d7ea | 4265 | if (DEBUG_LOGGING) { |
2360352e YW |
4266 | static int last_seal = -1, last_keyed_hash = -1; |
4267 | static Compression last_compression = _COMPRESSION_INVALID; | |
4374d7ea DDM |
4268 | static uint64_t last_bytes = UINT64_MAX; |
4269 | ||
4270 | if (last_seal != JOURNAL_HEADER_SEALED(f->header) || | |
4271 | last_keyed_hash != JOURNAL_HEADER_KEYED_HASH(f->header) || | |
2360352e | 4272 | last_compression != JOURNAL_FILE_COMPRESSION(f) || |
4374d7ea DDM |
4273 | last_bytes != f->compress_threshold_bytes) { |
4274 | ||
4275 | log_debug("Journal effective settings seal=%s keyed_hash=%s compress=%s compress_threshold_bytes=%s", | |
4276 | yes_no(JOURNAL_HEADER_SEALED(f->header)), yes_no(JOURNAL_HEADER_KEYED_HASH(f->header)), | |
2360352e | 4277 | compression_to_string(JOURNAL_FILE_COMPRESSION(f)), FORMAT_BYTES(f->compress_threshold_bytes)); |
4374d7ea DDM |
4278 | last_seal = JOURNAL_HEADER_SEALED(f->header); |
4279 | last_keyed_hash = JOURNAL_HEADER_KEYED_HASH(f->header); | |
2360352e | 4280 | last_compression = JOURNAL_FILE_COMPRESSION(f); |
4374d7ea DDM |
4281 | last_bytes = f->compress_threshold_bytes; |
4282 | } | |
4283 | } | |
4284 | ||
0559d3a5 | 4285 | *ret = f; |
cec736d2 LP |
4286 | return 0; |
4287 | ||
4288 | fail: | |
c3bd54bf | 4289 | if (f->cache_fd && mmap_cache_fd_got_sigbus(f->cache_fd)) |
fa6ac760 LP |
4290 | r = -EIO; |
4291 | ||
69a3a6fd | 4292 | (void) journal_file_close(f); |
cec736d2 | 4293 | |
4bb37be0 FB |
4294 | if (newly_created && fd < 0) |
4295 | (void) unlink(fname); | |
4296 | ||
cec736d2 LP |
4297 | return r; |
4298 | } | |
0ac38b70 | 4299 | |
596c3c7f FB |
4300 | int journal_file_parse_uid_from_filename(const char *path, uid_t *ret_uid) { |
4301 | _cleanup_free_ char *buf = NULL, *p = NULL; | |
4302 | const char *a, *b, *at; | |
4303 | int r; | |
4304 | ||
4305 | /* This helper returns -EREMOTE when the filename doesn't match user online/offline journal | |
4306 | * pattern. Hence it currently doesn't parse archived or disposed user journals. */ | |
4307 | ||
4308 | assert(path); | |
4309 | assert(ret_uid); | |
4310 | ||
4311 | r = path_extract_filename(path, &p); | |
4312 | if (r < 0) | |
4313 | return r; | |
4314 | if (r == O_DIRECTORY) | |
4315 | return -EISDIR; | |
4316 | ||
4317 | a = startswith(p, "user-"); | |
4318 | if (!a) | |
4319 | return -EREMOTE; | |
4320 | b = endswith(p, ".journal"); | |
4321 | if (!b) | |
4322 | return -EREMOTE; | |
4323 | ||
4324 | at = strchr(a, '@'); | |
4325 | if (at) | |
4326 | return -EREMOTE; | |
4327 | ||
4328 | buf = strndup(a, b-a); | |
4329 | if (!buf) | |
4330 | return -ENOMEM; | |
4331 | ||
4332 | return parse_uid(buf, ret_uid); | |
4333 | } | |
4334 | ||
461955ef | 4335 | int journal_file_archive(JournalFile *f, char **ret_previous_path) { |
57535f47 | 4336 | _cleanup_free_ char *p = NULL; |
0ac38b70 LP |
4337 | |
4338 | assert(f); | |
0ac38b70 | 4339 | |
4374d7ea | 4340 | if (!journal_file_writable(f)) |
0ac38b70 LP |
4341 | return -EINVAL; |
4342 | ||
5d1ce257 | 4343 | /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse |
13e785f7 | 4344 | * rotation, since we don't know the actual path, and couldn't rename the file hence. */ |
7a4d21ad | 4345 | if (path_startswith(f->path, "/proc/self/fd")) |
5d1ce257 LP |
4346 | return -EINVAL; |
4347 | ||
7a4d21ad | 4348 | if (!endswith(f->path, ".journal")) |
0ac38b70 LP |
4349 | return -EINVAL; |
4350 | ||
7a4d21ad LP |
4351 | if (asprintf(&p, "%.*s@" SD_ID128_FORMAT_STR "-%016"PRIx64"-%016"PRIx64".journal", |
4352 | (int) strlen(f->path) - 8, f->path, | |
4353 | SD_ID128_FORMAT_VAL(f->header->seqnum_id), | |
4354 | le64toh(f->header->head_entry_seqnum), | |
4355 | le64toh(f->header->head_entry_realtime)) < 0) | |
0ac38b70 LP |
4356 | return -ENOMEM; |
4357 | ||
7a4d21ad LP |
4358 | /* Try to rename the file to the archived version. If the file already was deleted, we'll get ENOENT, let's |
4359 | * ignore that case. */ | |
4360 | if (rename(f->path, p) < 0 && errno != ENOENT) | |
0ac38b70 LP |
4361 | return -errno; |
4362 | ||
1fcefd88 | 4363 | /* Sync the rename to disk */ |
7a4d21ad LP |
4364 | (void) fsync_directory_of_file(f->fd); |
4365 | ||
461955ef DDM |
4366 | if (ret_previous_path) |
4367 | *ret_previous_path = f->path; | |
4368 | else | |
4369 | free(f->path); | |
4370 | ||
4371 | f->path = TAKE_PTR(p); | |
4372 | ||
7a4d21ad LP |
4373 | /* Set as archive so offlining commits w/state=STATE_ARCHIVED. Previously we would set old_file->header->state |
4374 | * to STATE_ARCHIVED directly here, but journal_file_set_offline() short-circuits when state != STATE_ONLINE, | |
4375 | * which would result in the rotated journal never getting fsync() called before closing. Now we simply queue | |
4376 | * the archive state by setting an archive bit, leaving the state as STATE_ONLINE so proper offlining | |
4377 | * occurs. */ | |
4378 | f->archive = true; | |
4379 | ||
7a4d21ad LP |
4380 | return 0; |
4381 | } | |
4382 | ||
68127658 LP |
4383 | int journal_file_dispose(int dir_fd, const char *fname) { |
4384 | _cleanup_free_ char *p = NULL; | |
68127658 LP |
4385 | |
4386 | assert(fname); | |
4387 | ||
24ee0f9d | 4388 | /* Renames a journal file to *.journal~, i.e. to mark it as corrupted or otherwise uncleanly shutdown. Note that |
68127658 LP |
4389 | * this is done without looking into the file or changing any of its contents. The idea is that this is called |
4390 | * whenever something is suspicious and we want to move the file away and make clear that it is not accessed | |
4391 | * for writing anymore. */ | |
4392 | ||
4393 | if (!endswith(fname, ".journal")) | |
4394 | return -EINVAL; | |
4395 | ||
4396 | if (asprintf(&p, "%.*s@%016" PRIx64 "-%016" PRIx64 ".journal~", | |
4397 | (int) strlen(fname) - 8, fname, | |
4398 | now(CLOCK_REALTIME), | |
4399 | random_u64()) < 0) | |
4400 | return -ENOMEM; | |
4401 | ||
4402 | if (renameat(dir_fd, fname, dir_fd, p) < 0) | |
4403 | return -errno; | |
4404 | ||
68127658 LP |
4405 | return 0; |
4406 | } | |
4407 | ||
e5d60d1b LP |
4408 | int journal_file_copy_entry( |
4409 | JournalFile *from, | |
4410 | JournalFile *to, | |
4411 | Object *o, | |
4412 | uint64_t p, | |
4413 | uint64_t *seqnum, | |
4414 | sd_id128_t *seqnum_id) { | |
4415 | ||
52bcf45a YW |
4416 | _cleanup_free_ EntryItem *items_alloc = NULL; |
4417 | EntryItem *items; | |
265b1dc0 | 4418 | uint64_t n, m = 0, xor_hash = 0; |
bf9b7728 | 4419 | sd_id128_t boot_id; |
f6a0cfa5 | 4420 | dual_timestamp ts; |
f6a0cfa5 | 4421 | int r; |
cf244689 LP |
4422 | |
4423 | assert(from); | |
4424 | assert(to); | |
4425 | assert(o); | |
cc938e4a | 4426 | assert(p > 0); |
cf244689 | 4427 | |
4374d7ea | 4428 | if (!journal_file_writable(to)) |
cf244689 LP |
4429 | return -EPERM; |
4430 | ||
d164ac77 DDM |
4431 | ts = (dual_timestamp) { |
4432 | .monotonic = le64toh(o->entry.monotonic), | |
4433 | .realtime = le64toh(o->entry.realtime), | |
4434 | }; | |
bf9b7728 | 4435 | boot_id = o->entry.boot_id; |
cf244689 | 4436 | |
a9089a66 | 4437 | n = journal_file_entry_n_items(from, o); |
85e38da2 YW |
4438 | if (n == 0) |
4439 | return 0; | |
52bcf45a YW |
4440 | |
4441 | if (n < ALLOCA_MAX / sizeof(EntryItem) / 2) | |
4442 | items = newa(EntryItem, n); | |
4443 | else { | |
4444 | items_alloc = new(EntryItem, n); | |
4445 | if (!items_alloc) | |
4446 | return -ENOMEM; | |
4447 | ||
4448 | items = items_alloc; | |
4449 | } | |
cf244689 | 4450 | |
f6a0cfa5 | 4451 | for (uint64_t i = 0; i < n; i++) { |
265b1dc0 | 4452 | uint64_t h, q; |
cf244689 | 4453 | void *data; |
0e35afff | 4454 | size_t l; |
cf244689 LP |
4455 | Object *u; |
4456 | ||
a9089a66 | 4457 | q = journal_file_entry_item_object_offset(from, o, i); |
0e35afff DDM |
4458 | r = journal_file_data_payload(from, NULL, q, NULL, 0, 0, &data, &l); |
4459 | if (IN_SET(r, -EADDRNOTAVAIL, -EBADMSG)) { | |
4460 | log_debug_errno(r, "Entry item %"PRIu64" data object is bad, skipping over it: %m", i); | |
31438511 | 4461 | continue; |
0e35afff | 4462 | } |
cf244689 LP |
4463 | if (r < 0) |
4464 | return r; | |
0e35afff | 4465 | assert(r > 0); |
cf244689 | 4466 | |
bc6b326d DDM |
4467 | if (l == 0) |
4468 | return -EBADMSG; | |
4469 | ||
cf244689 LP |
4470 | r = journal_file_append_data(to, data, l, &u, &h); |
4471 | if (r < 0) | |
4472 | return r; | |
4473 | ||
4ce534f4 LP |
4474 | if (JOURNAL_HEADER_KEYED_HASH(to->header)) |
4475 | xor_hash ^= jenkins_hash64(data, l); | |
4476 | else | |
4477 | xor_hash ^= le64toh(u->data.hash); | |
4478 | ||
f81409f8 | 4479 | items[m++] = (EntryItem) { |
a9089a66 DDM |
4480 | .object_offset = h, |
4481 | .hash = le64toh(u->data.hash), | |
d164ac77 | 4482 | }; |
cf244689 LP |
4483 | } |
4484 | ||
f81409f8 DDM |
4485 | if (m == 0) |
4486 | return 0; | |
4487 | ||
51ab0afe LP |
4488 | r = journal_file_append_entry_internal( |
4489 | to, | |
4490 | &ts, | |
bf9b7728 | 4491 | &boot_id, |
51ab0afe LP |
4492 | &from->header->machine_id, |
4493 | xor_hash, | |
4494 | items, | |
f81409f8 | 4495 | m, |
51ab0afe LP |
4496 | seqnum, |
4497 | seqnum_id, | |
4498 | /* ret_object= */ NULL, | |
4499 | /* ret_offset= */ NULL); | |
fa6ac760 | 4500 | |
c3bd54bf | 4501 | if (mmap_cache_fd_got_sigbus(to->cache_fd)) |
fa6ac760 LP |
4502 | return -EIO; |
4503 | ||
4504 | return r; | |
cf244689 | 4505 | } |
babfc091 | 4506 | |
8580d1f7 LP |
4507 | void journal_reset_metrics(JournalMetrics *m) { |
4508 | assert(m); | |
4509 | ||
4510 | /* Set everything to "pick automatic values". */ | |
4511 | ||
4512 | *m = (JournalMetrics) { | |
f5fbe71d YW |
4513 | .min_use = UINT64_MAX, |
4514 | .max_use = UINT64_MAX, | |
4515 | .min_size = UINT64_MAX, | |
4516 | .max_size = UINT64_MAX, | |
4517 | .keep_free = UINT64_MAX, | |
4518 | .n_max_files = UINT64_MAX, | |
8580d1f7 LP |
4519 | }; |
4520 | } | |
4521 | ||
cc938e4a | 4522 | int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *ret_from, usec_t *ret_to) { |
08984293 | 4523 | assert(f); |
c88cc6af | 4524 | assert(f->header); |
cc938e4a | 4525 | assert(ret_from || ret_to); |
08984293 | 4526 | |
cc938e4a | 4527 | if (ret_from) { |
162566a4 LP |
4528 | if (f->header->head_entry_realtime == 0) |
4529 | return -ENOENT; | |
08984293 | 4530 | |
cc938e4a | 4531 | *ret_from = le64toh(f->header->head_entry_realtime); |
08984293 LP |
4532 | } |
4533 | ||
cc938e4a | 4534 | if (ret_to) { |
162566a4 LP |
4535 | if (f->header->tail_entry_realtime == 0) |
4536 | return -ENOENT; | |
08984293 | 4537 | |
cc938e4a | 4538 | *ret_to = le64toh(f->header->tail_entry_realtime); |
08984293 LP |
4539 | } |
4540 | ||
4541 | return 1; | |
4542 | } | |
4543 | ||
cc938e4a | 4544 | int journal_file_get_cutoff_monotonic_usec(JournalFile *f, sd_id128_t boot_id, usec_t *ret_from, usec_t *ret_to) { |
08984293 LP |
4545 | Object *o; |
4546 | uint64_t p; | |
4547 | int r; | |
4548 | ||
4549 | assert(f); | |
cc938e4a YW |
4550 | assert(ret_from || ret_to); |
4551 | ||
4552 | /* FIXME: fix return value assignment on success with 0. */ | |
08984293 | 4553 | |
47838ab3 | 4554 | r = find_data_object_by_boot_id(f, boot_id, &o, &p); |
08984293 LP |
4555 | if (r <= 0) |
4556 | return r; | |
4557 | ||
4558 | if (le64toh(o->data.n_entries) <= 0) | |
4559 | return 0; | |
4560 | ||
cc938e4a | 4561 | if (ret_from) { |
08984293 LP |
4562 | r = journal_file_move_to_object(f, OBJECT_ENTRY, le64toh(o->data.entry_offset), &o); |
4563 | if (r < 0) | |
4564 | return r; | |
4565 | ||
cc938e4a | 4566 | *ret_from = le64toh(o->entry.monotonic); |
08984293 LP |
4567 | } |
4568 | ||
cc938e4a | 4569 | if (ret_to) { |
08984293 LP |
4570 | r = journal_file_move_to_object(f, OBJECT_DATA, p, &o); |
4571 | if (r < 0) | |
4572 | return r; | |
4573 | ||
d37eeabc | 4574 | r = journal_file_move_to_entry_for_data(f, o, DIRECTION_UP, &o, NULL); |
08984293 LP |
4575 | if (r <= 0) |
4576 | return r; | |
4577 | ||
cc938e4a | 4578 | *ret_to = le64toh(o->entry.monotonic); |
08984293 LP |
4579 | } |
4580 | ||
4581 | return 1; | |
4582 | } | |
dca6219e | 4583 | |
c8e6e1f1 | 4584 | bool journal_file_rotate_suggested(JournalFile *f, usec_t max_file_usec, int log_level) { |
dca6219e | 4585 | assert(f); |
c88cc6af | 4586 | assert(f->header); |
dca6219e LP |
4587 | |
4588 | /* If we gained new header fields we gained new features, | |
4589 | * hence suggest a rotation */ | |
361f9cbc | 4590 | if (le64toh(f->header->header_size) < sizeof(Header)) { |
04cb8ee8 DDM |
4591 | log_ratelimit_full(log_level, JOURNAL_LOG_RATELIMIT, |
4592 | "%s uses an outdated header, suggesting rotation.", f->path); | |
dca6219e | 4593 | return true; |
361f9cbc | 4594 | } |
dca6219e | 4595 | |
0dbe57ee LP |
4596 | /* Let's check if the hash tables grew over a certain fill level (75%, borrowing this value from |
4597 | * Java's hash table implementation), and if so suggest a rotation. To calculate the fill level we | |
4598 | * need the n_data field, which only exists in newer versions. */ | |
dca6219e LP |
4599 | |
4600 | if (JOURNAL_HEADER_CONTAINS(f->header, n_data)) | |
361f9cbc | 4601 | if (le64toh(f->header->n_data) * 4ULL > (le64toh(f->header->data_hash_table_size) / sizeof(HashItem)) * 3ULL) { |
8522691d | 4602 | log_ratelimit_full( |
d9799ea2 | 4603 | log_level, JOURNAL_LOG_RATELIMIT, |
8c29ac2f | 4604 | "Data hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items, %"PRIu64" file size, %"PRIu64" bytes per hash table item), suggesting rotation.", |
8522691d DDM |
4605 | f->path, |
4606 | 100.0 * (double) le64toh(f->header->n_data) / ((double) (le64toh(f->header->data_hash_table_size) / sizeof(HashItem))), | |
4607 | le64toh(f->header->n_data), | |
4608 | le64toh(f->header->data_hash_table_size) / sizeof(HashItem), | |
8c29ac2f | 4609 | (uint64_t) f->last_stat.st_size, |
8522691d | 4610 | f->last_stat.st_size / le64toh(f->header->n_data)); |
dca6219e | 4611 | return true; |
361f9cbc | 4612 | } |
dca6219e LP |
4613 | |
4614 | if (JOURNAL_HEADER_CONTAINS(f->header, n_fields)) | |
361f9cbc | 4615 | if (le64toh(f->header->n_fields) * 4ULL > (le64toh(f->header->field_hash_table_size) / sizeof(HashItem)) * 3ULL) { |
8522691d | 4616 | log_ratelimit_full( |
d9799ea2 | 4617 | log_level, JOURNAL_LOG_RATELIMIT, |
8522691d DDM |
4618 | "Field hash table of %s has a fill level at %.1f (%"PRIu64" of %"PRIu64" items), suggesting rotation.", |
4619 | f->path, | |
4620 | 100.0 * (double) le64toh(f->header->n_fields) / ((double) (le64toh(f->header->field_hash_table_size) / sizeof(HashItem))), | |
4621 | le64toh(f->header->n_fields), | |
4622 | le64toh(f->header->field_hash_table_size) / sizeof(HashItem)); | |
dca6219e | 4623 | return true; |
361f9cbc | 4624 | } |
dca6219e | 4625 | |
0dbe57ee LP |
4626 | /* If there are too many hash collisions somebody is most likely playing games with us. Hence, if our |
4627 | * longest chain is longer than some threshold, let's suggest rotation. */ | |
4628 | if (JOURNAL_HEADER_CONTAINS(f->header, data_hash_chain_depth) && | |
4629 | le64toh(f->header->data_hash_chain_depth) > HASH_CHAIN_DEPTH_MAX) { | |
8522691d | 4630 | log_ratelimit_full( |
d9799ea2 | 4631 | log_level, JOURNAL_LOG_RATELIMIT, |
8522691d DDM |
4632 | "Data hash table of %s has deepest hash chain of length %" PRIu64 ", suggesting rotation.", |
4633 | f->path, le64toh(f->header->data_hash_chain_depth)); | |
0dbe57ee LP |
4634 | return true; |
4635 | } | |
4636 | ||
4637 | if (JOURNAL_HEADER_CONTAINS(f->header, field_hash_chain_depth) && | |
4638 | le64toh(f->header->field_hash_chain_depth) > HASH_CHAIN_DEPTH_MAX) { | |
8522691d | 4639 | log_ratelimit_full( |
d9799ea2 | 4640 | log_level, JOURNAL_LOG_RATELIMIT, |
8522691d DDM |
4641 | "Field hash table of %s has deepest hash chain of length at %" PRIu64 ", suggesting rotation.", |
4642 | f->path, le64toh(f->header->field_hash_chain_depth)); | |
0dbe57ee LP |
4643 | return true; |
4644 | } | |
4645 | ||
0598fd4a LP |
4646 | /* Are the data objects properly indexed by field objects? */ |
4647 | if (JOURNAL_HEADER_CONTAINS(f->header, n_data) && | |
4648 | JOURNAL_HEADER_CONTAINS(f->header, n_fields) && | |
4649 | le64toh(f->header->n_data) > 0 && | |
012181ea | 4650 | le64toh(f->header->n_fields) == 0) { |
8522691d | 4651 | log_ratelimit_full( |
d9799ea2 | 4652 | log_level, JOURNAL_LOG_RATELIMIT, |
8522691d DDM |
4653 | "Data objects of %s are not indexed by field objects, suggesting rotation.", |
4654 | f->path); | |
0598fd4a | 4655 | return true; |
012181ea | 4656 | } |
0598fd4a | 4657 | |
fb0951b0 LP |
4658 | if (max_file_usec > 0) { |
4659 | usec_t t, h; | |
4660 | ||
4661 | h = le64toh(f->header->head_entry_realtime); | |
4662 | t = now(CLOCK_REALTIME); | |
4663 | ||
012181ea | 4664 | if (h > 0 && t > h + max_file_usec) { |
8522691d | 4665 | log_ratelimit_full( |
d9799ea2 | 4666 | log_level, JOURNAL_LOG_RATELIMIT, |
8522691d DDM |
4667 | "Oldest entry in %s is older than the configured file retention duration (%s), suggesting rotation.", |
4668 | f->path, FORMAT_TIMESPAN(max_file_usec, USEC_PER_SEC)); | |
fb0951b0 | 4669 | return true; |
012181ea | 4670 | } |
fb0951b0 LP |
4671 | } |
4672 | ||
dca6219e LP |
4673 | return false; |
4674 | } | |
363b2b9a DDM |
4675 | |
4676 | static const char * const journal_object_type_table[] = { | |
deb87cc8 YW |
4677 | [OBJECT_UNUSED] = "unused", |
4678 | [OBJECT_DATA] = "data", | |
4679 | [OBJECT_FIELD] = "field", | |
4680 | [OBJECT_ENTRY] = "entry", | |
4681 | [OBJECT_DATA_HASH_TABLE] = "data hash table", | |
363b2b9a | 4682 | [OBJECT_FIELD_HASH_TABLE] = "field hash table", |
deb87cc8 YW |
4683 | [OBJECT_ENTRY_ARRAY] = "entry array", |
4684 | [OBJECT_TAG] = "tag", | |
363b2b9a DDM |
4685 | }; |
4686 | ||
4687 | DEFINE_STRING_TABLE_LOOKUP_TO_STRING(journal_object_type, ObjectType); |