X-Git-Url: http://git.ipfire.org/?a=blobdiff_plain;f=src%2Fjournal%2Fjournal-file.c;h=d3e021473185b09750d547efbd7a3c55b5a825e5;hb=6b430fdb7c0c2c52ea69a7d56f23d739218b13d0;hp=80e34f6e8b4818c6f7aa53647165fd961c1a94fd;hpb=ec9ffa2cdd931d8ab8ac0d90b1d9eff180a07288;p=thirdparty%2Fsystemd.git diff --git a/src/journal/journal-file.c b/src/journal/journal-file.c index 80e34f6e8b4..d3e02147318 100644 --- a/src/journal/journal-file.c +++ b/src/journal/journal-file.c @@ -37,6 +37,7 @@ #include "journal-file.h" #include "lookup3.h" #include "parse-util.h" +#include "path-util.h" #include "random-util.h" #include "sd-event.h" #include "set.h" @@ -119,7 +120,7 @@ static void journal_file_set_offline_internal(JournalFile *f) { if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_OFFLINING)) continue; - f->header->state = STATE_OFFLINE; + f->header->state = f->archive ? STATE_ARCHIVED : STATE_OFFLINE; (void) fsync(f->fd); break; @@ -217,8 +218,10 @@ int journal_file_set_offline(JournalFile *f, bool wait) { if (!(f->fd >= 0 && f->header)) return -EINVAL; - if (f->header->state != STATE_ONLINE) - return 0; + /* An offlining journal is implicitly online and may modify f->header->state, + * we must also join any potentially lingering offline thread when not online. */ + if (!journal_file_is_offlining(f) && f->header->state != STATE_ONLINE) + return journal_file_set_offline_thread_join(f); /* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */ restarted = journal_file_set_offline_try_restart(f); @@ -330,8 +333,13 @@ JournalFile* journal_file_close(JournalFile *f) { #ifdef HAVE_GCRYPT /* Write the final tag */ - if (f->seal && f->writable) - journal_file_append_tag(f); + if (f->seal && f->writable) { + int r; + + r = journal_file_append_tag(f); + if (r < 0) + log_error_errno(r, "Failed to append tag when closing journal: %m"); + } #endif if (f->post_change_timer) { @@ -362,7 +370,8 @@ JournalFile* journal_file_close(JournalFile *f) { (void) btrfs_defrag_fd(f->fd); } - safe_close(f->fd); + if (f->close_fd) + safe_close(f->fd); free(f->path); mmap_cache_unref(f->mmap); @@ -385,8 +394,7 @@ JournalFile* journal_file_close(JournalFile *f) { gcry_md_close(f->hmac); #endif - free(f); - return NULL; + return mfree(f); } void journal_file_close_set(Set *s) { @@ -435,6 +443,39 @@ static int journal_file_init_header(JournalFile *f, JournalFile *template) { return 0; } +static int fsync_directory_of_file(int fd) { + _cleanup_free_ char *path = NULL, *dn = NULL; + _cleanup_close_ int dfd = -1; + struct stat st; + int r; + + if (fstat(fd, &st) < 0) + return -errno; + + if (!S_ISREG(st.st_mode)) + return -EBADFD; + + r = fd_get_path(fd, &path); + if (r < 0) + return r; + + if (!path_is_absolute(path)) + return -EINVAL; + + dn = dirname_malloc(path); + if (!dn) + return -ENOMEM; + + dfd = open(dn, O_RDONLY|O_CLOEXEC|O_DIRECTORY); + if (dfd < 0) + return -errno; + + if (fsync(dfd) < 0) + return -errno; + + return 0; +} + static int journal_file_refresh_header(JournalFile *f) { sd_id128_t boot_id; int r; @@ -460,6 +501,9 @@ static int journal_file_refresh_header(JournalFile *f) { /* Sync the online state to disk */ (void) fsync(f->fd); + /* We likely just created a new file, also sync the directory this file is located in. */ + (void) fsync_directory_of_file(f->fd); + return r; } @@ -523,8 +567,8 @@ static int journal_file_verify_header(JournalFile *f) { return -ENODATA; if (f->writable) { - uint8_t state; sd_id128_t machine_id; + uint8_t state; int r; r = sd_id128_get_machine(&machine_id); @@ -545,6 +589,14 @@ static int journal_file_verify_header(JournalFile *f) { log_debug("Journal file %s has unknown state %i.", f->path, state); return -EBUSY; } + + /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't + * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks + * bisection. */ + if (le64toh(f->header->tail_entry_realtime) > now(CLOCK_REALTIME)) { + log_debug("Journal file %s is from the future, refusing to append new data to it that'd be older.", f->path); + return -ETXTBSY; + } } f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header); @@ -702,8 +754,16 @@ int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset assert(ret); /* Objects may only be located at multiple of 64 bit */ - if (!VALID64(offset)) - return -EFAULT; + if (!VALID64(offset)) { + log_debug("Attempt to move to object at non-64bit boundary: %" PRIu64, offset); + return -EBADMSG; + } + + /* Object may not be located in the file header */ + if (offset < le64toh(f->header->header_size)) { + log_debug("Attempt to move to object located in file header: %" PRIu64, offset); + return -EBADMSG; + } r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t); if (r < 0) @@ -712,17 +772,29 @@ int journal_file_move_to_object(JournalFile *f, ObjectType type, uint64_t offset o = (Object*) t; s = le64toh(o->object.size); - if (s < sizeof(ObjectHeader)) + if (s == 0) { + log_debug("Attempt to move to uninitialized object: %" PRIu64, offset); return -EBADMSG; + } + if (s < sizeof(ObjectHeader)) { + log_debug("Attempt to move to overly short object: %" PRIu64, offset); + return -EBADMSG; + } - if (o->object.type <= OBJECT_UNUSED) + if (o->object.type <= OBJECT_UNUSED) { + log_debug("Attempt to move to object with invalid type: %" PRIu64, offset); return -EBADMSG; + } - if (s < minimum_header_size(o)) + if (s < minimum_header_size(o)) { + log_debug("Attempt to move to truncated object: %" PRIu64, offset); return -EBADMSG; + } - if (type > OBJECT_UNUSED && o->object.type != type) + if (type > OBJECT_UNUSED && o->object.type != type) { + log_debug("Attempt to move to object of unexpected type: %" PRIu64, offset); return -EBADMSG; + } if (s > sizeof(ObjectHeader)) { r = journal_file_move_to(f, type, false, offset, s, &t); @@ -1325,6 +1397,12 @@ static int journal_file_append_data( if (r < 0) return r; +#ifdef HAVE_GCRYPT + r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p); + if (r < 0) + return r; +#endif + /* The linking might have altered the window, so let's * refresh our pointer */ r = journal_file_move_to_object(f, OBJECT_DATA, p, &o); @@ -1349,12 +1427,6 @@ static int journal_file_append_data( fo->field.head_data_offset = le64toh(p); } -#ifdef HAVE_GCRYPT - r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p); - if (r < 0) - return r; -#endif - if (ret) *ret = o; @@ -1976,9 +2048,14 @@ static int generic_array_bisect( i = right - 1; lp = p = le64toh(array->entry_array.items[i]); if (p <= 0) - return -EBADMSG; - - r = test_object(f, p, needle); + r = -EBADMSG; + else + r = test_object(f, p, needle); + if (r == -EBADMSG) { + log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (1)"); + n = i; + continue; + } if (r < 0) return r; @@ -2054,9 +2131,14 @@ static int generic_array_bisect( p = le64toh(array->entry_array.items[i]); if (p <= 0) - return -EBADMSG; - - r = test_object(f, p, needle); + r = -EBADMSG; + else + r = test_object(f, p, needle); + if (r == -EBADMSG) { + log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (2)"); + right = n = i; + continue; + } if (r < 0) return r; @@ -2172,7 +2254,7 @@ static int generic_array_bisect_plus_one( goto found; if (r > 0 && idx) - (*idx) ++; + (*idx)++; return r; @@ -2413,6 +2495,37 @@ int journal_file_compare_locations(JournalFile *af, JournalFile *bf) { return 0; } +static int bump_array_index(uint64_t *i, direction_t direction, uint64_t n) { + + /* Increase or decrease the specified index, in the right direction. */ + + if (direction == DIRECTION_DOWN) { + if (*i >= n - 1) + return 0; + + (*i) ++; + } else { + if (*i <= 0) + return 0; + + (*i) --; + } + + return 1; +} + +static bool check_properly_ordered(uint64_t new_offset, uint64_t old_offset, direction_t direction) { + + /* Consider it an error if any of the two offsets is uninitialized */ + if (old_offset == 0 || new_offset == 0) + return false; + + /* If we go down, the new offset must be larger than the old one. */ + return direction == DIRECTION_DOWN ? + new_offset > old_offset : + new_offset < old_offset; +} + int journal_file_next_entry( JournalFile *f, uint64_t p, @@ -2443,31 +2556,34 @@ int journal_file_next_entry( if (r <= 0) return r; - if (direction == DIRECTION_DOWN) { - if (i >= n - 1) - return 0; - - i++; - } else { - if (i <= 0) - return 0; - - i--; - } + r = bump_array_index(&i, direction, n); + if (r <= 0) + return r; } /* And jump to it */ - r = generic_array_get(f, - le64toh(f->header->entry_array_offset), - i, - ret, &ofs); - if (r <= 0) - return r; + for (;;) { + r = generic_array_get(f, + le64toh(f->header->entry_array_offset), + i, + ret, &ofs); + if (r > 0) + break; + if (r != -EBADMSG) + return r; + + /* OK, so this entry is borked. Most likely some entry didn't get synced to disk properly, let's see if + * the next one might work for us instead. */ + log_debug_errno(r, "Entry item %" PRIu64 " is bad, skipping over it.", i); + + r = bump_array_index(&i, direction, n); + if (r <= 0) + return r; + } - if (p > 0 && - (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) { - log_debug("%s: entry array corrupted at entry %"PRIu64, - f->path, i); + /* Ensure our array is properly ordered. */ + if (p > 0 && !check_properly_ordered(ofs, p, direction)) { + log_debug("%s: entry array not properly ordered at entry %" PRIu64, f->path, i); return -EBADMSG; } @@ -2484,9 +2600,9 @@ int journal_file_next_entry_for_data( direction_t direction, Object **ret, uint64_t *offset) { - uint64_t n, i; - int r; + uint64_t i, n, ofs; Object *d; + int r; assert(f); assert(p > 0 || !o); @@ -2518,25 +2634,39 @@ int journal_file_next_entry_for_data( if (r <= 0) return r; - if (direction == DIRECTION_DOWN) { - if (i >= n - 1) - return 0; + r = bump_array_index(&i, direction, n); + if (r <= 0) + return r; + } - i++; - } else { - if (i <= 0) - return 0; + for (;;) { + r = generic_array_get_plus_one(f, + le64toh(d->data.entry_offset), + le64toh(d->data.entry_array_offset), + i, + ret, &ofs); + if (r > 0) + break; + if (r != -EBADMSG) + return r; - i--; - } + log_debug_errno(r, "Data entry item %" PRIu64 " is bad, skipping over it.", i); + + r = bump_array_index(&i, direction, n); + if (r <= 0) + return r; + } + /* Ensure our array is properly ordered. */ + if (p > 0 && check_properly_ordered(ofs, p, direction)) { + log_debug("%s data entry array not properly ordered at entry %" PRIu64, f->path, i); + return -EBADMSG; } - return generic_array_get_plus_one(f, - le64toh(d->data.entry_offset), - le64toh(d->data.entry_array_offset), - i, - ret, offset); + if (offset) + *offset = ofs; + + return 1; } int journal_file_move_to_entry_by_offset_for_data( @@ -2806,11 +2936,11 @@ void journal_file_print_header(JournalFile *f) { "Data Hash Table Size: %"PRIu64"\n" "Field Hash Table Size: %"PRIu64"\n" "Rotate Suggested: %s\n" - "Head Sequential Number: %"PRIu64"\n" - "Tail Sequential Number: %"PRIu64"\n" - "Head Realtime Timestamp: %s\n" - "Tail Realtime Timestamp: %s\n" - "Tail Monotonic Timestamp: %s\n" + "Head Sequential Number: %"PRIu64" (%"PRIx64")\n" + "Tail Sequential Number: %"PRIu64" (%"PRIx64")\n" + "Head Realtime Timestamp: %s (%"PRIx64")\n" + "Tail Realtime Timestamp: %s (%"PRIx64")\n" + "Tail Monotonic Timestamp: %s (%"PRIx64")\n" "Objects: %"PRIu64"\n" "Entry Objects: %"PRIu64"\n", f->path, @@ -2831,11 +2961,11 @@ void journal_file_print_header(JournalFile *f) { le64toh(f->header->data_hash_table_size) / sizeof(HashItem), le64toh(f->header->field_hash_table_size) / sizeof(HashItem), yes_no(journal_file_rotate_suggested(f, 0)), - le64toh(f->header->head_entry_seqnum), - le64toh(f->header->tail_entry_seqnum), - format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)), - format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)), - format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC), + le64toh(f->header->head_entry_seqnum), le64toh(f->header->head_entry_seqnum), + le64toh(f->header->tail_entry_seqnum), le64toh(f->header->tail_entry_seqnum), + format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)), le64toh(f->header->head_entry_realtime), + format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)), le64toh(f->header->tail_entry_realtime), + format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC), le64toh(f->header->tail_entry_monotonic), le64toh(f->header->n_objects), le64toh(f->header->n_entries)); @@ -2898,6 +3028,7 @@ static int journal_file_warn_btrfs(JournalFile *f) { } int journal_file_open( + int fd, const char *fname, int flags, mode_t mode, @@ -2914,22 +3045,24 @@ int journal_file_open( void *h; int r; - assert(fname); assert(ret); + assert(fd >= 0 || fname); if ((flags & O_ACCMODE) != O_RDONLY && (flags & O_ACCMODE) != O_RDWR) return -EINVAL; - if (!endswith(fname, ".journal") && - !endswith(fname, ".journal~")) - return -EINVAL; + if (fname) { + if (!endswith(fname, ".journal") && + !endswith(fname, ".journal~")) + return -EINVAL; + } f = new0(JournalFile, 1); if (!f) return -ENOMEM; - f->fd = -1; + f->fd = fd; f->mode = mode; f->flags = flags; @@ -2954,7 +3087,10 @@ int journal_file_open( } } - f->path = strdup(fname); + if (fname) + f->path = strdup(fname); + else /* If we don't know the path, fill in something explanatory and vaguely useful */ + asprintf(&f->path, "/proc/self/%i", fd); if (!f->path) { r = -ENOMEM; goto fail; @@ -2966,10 +3102,15 @@ int journal_file_open( goto fail; } - f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode); if (f->fd < 0) { - r = -errno; - goto fail; + f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode); + if (f->fd < 0) { + r = -errno; + goto fail; + } + + /* fds we opened here by us should also be closed by us. */ + f->close_fd = true; } r = journal_file_fstat(f); @@ -3090,6 +3231,9 @@ int journal_file_open( goto fail; } + /* The file is opened now successfully, thus we take possession of any passed in fd. */ + f->close_fd = true; + *ret = f; return 0; @@ -3116,6 +3260,11 @@ int journal_file_rotate(JournalFile **f, bool compress, bool seal, Set *deferred if (!old_file->writable) return -EINVAL; + /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse + * rotation, since we don't know the actual path, and couldn't rename the file hence.*/ + if (path_startswith(old_file->path, "/proc/self/fd")) + return -EINVAL; + if (!endswith(old_file->path, ".journal")) return -EINVAL; @@ -3135,14 +3284,23 @@ int journal_file_rotate(JournalFile **f, bool compress, bool seal, Set *deferred if (r < 0 && errno != ENOENT) return -errno; - old_file->header->state = STATE_ARCHIVED; + /* Sync the rename to disk */ + (void) fsync_directory_of_file(old_file->fd); + + /* Set as archive so offlining commits w/state=STATE_ARCHIVED. + * Previously we would set old_file->header->state to STATE_ARCHIVED directly here, + * but journal_file_set_offline() short-circuits when state != STATE_ONLINE, which + * would result in the rotated journal never getting fsync() called before closing. + * Now we simply queue the archive state by setting an archive bit, leaving the state + * as STATE_ONLINE so proper offlining occurs. */ + old_file->archive = true; /* Currently, btrfs is not very good with out write patterns * and fragments heavily. Let's defrag our journal files when * we archive them */ old_file->defrag_on_close = true; - r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, deferred_closes, old_file, &new_file); + r = journal_file_open(-1, old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, deferred_closes, old_file, &new_file); if (deferred_closes && set_put(deferred_closes, old_file) >= 0) @@ -3170,7 +3328,7 @@ int journal_file_open_reliably( size_t l; _cleanup_free_ char *p = NULL; - r = journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, deferred_closes, template, ret); + r = journal_file_open(-1, fname, flags, mode, compress, seal, metrics, mmap_cache, deferred_closes, template, ret); if (!IN_SET(r, -EBADMSG, /* corrupted */ -ENODATA, /* truncated */ @@ -3179,7 +3337,8 @@ int journal_file_open_reliably( -EBUSY, /* unclean shutdown */ -ESHUTDOWN, /* already archived */ -EIO, /* IO error, including SIGBUS on mmap */ - -EIDRM /* File has been deleted */)) + -EIDRM, /* File has been deleted */ + -ETXTBSY)) /* File is from the future */ return r; if ((flags & O_ACCMODE) == O_RDONLY) @@ -3206,12 +3365,12 @@ int journal_file_open_reliably( /* btrfs doesn't cope well with our write pattern and * fragments heavily. Let's defrag all files we rotate */ - (void) chattr_path(p, false, FS_NOCOW_FL); + (void) chattr_path(p, 0, FS_NOCOW_FL); (void) btrfs_defrag(p); log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname); - return journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, deferred_closes, template, ret); + return journal_file_open(-1, fname, flags, mode, compress, seal, metrics, mmap_cache, deferred_closes, template, ret); } int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {