#include "journal-file.h"
#include "lookup3.h"
#include "parse-util.h"
+#include "path-util.h"
#include "random-util.h"
#include "sd-event.h"
#include "set.h"
if (!__sync_bool_compare_and_swap(&f->offline_state, OFFLINE_SYNCING, OFFLINE_OFFLINING))
continue;
- f->header->state = STATE_OFFLINE;
+ f->header->state = f->archive ? STATE_ARCHIVED : STATE_OFFLINE;
(void) fsync(f->fd);
break;
if (!(f->fd >= 0 && f->header))
return -EINVAL;
- if (f->header->state != STATE_ONLINE)
- return 0;
+ /* An offlining journal is implicitly online and may modify f->header->state,
+ * we must also join any potentially lingering offline thread when not online. */
+ if (!journal_file_is_offlining(f) && f->header->state != STATE_ONLINE)
+ return journal_file_set_offline_thread_join(f);
/* Restart an in-flight offline thread and wait if needed, or join a lingering done one. */
restarted = journal_file_set_offline_try_restart(f);
#ifdef HAVE_GCRYPT
/* Write the final tag */
- if (f->seal && f->writable)
- journal_file_append_tag(f);
+ if (f->seal && f->writable) {
+ int r;
+
+ r = journal_file_append_tag(f);
+ if (r < 0)
+ log_error_errno(r, "Failed to append tag when closing journal: %m");
+ }
#endif
if (f->post_change_timer) {
(void) btrfs_defrag_fd(f->fd);
}
- safe_close(f->fd);
+ if (f->close_fd)
+ safe_close(f->fd);
free(f->path);
mmap_cache_unref(f->mmap);
gcry_md_close(f->hmac);
#endif
- free(f);
- return NULL;
+ return mfree(f);
}
void journal_file_close_set(Set *s) {
return 0;
}
+static int fsync_directory_of_file(int fd) {
+ _cleanup_free_ char *path = NULL, *dn = NULL;
+ _cleanup_close_ int dfd = -1;
+ struct stat st;
+ int r;
+
+ if (fstat(fd, &st) < 0)
+ return -errno;
+
+ if (!S_ISREG(st.st_mode))
+ return -EBADFD;
+
+ r = fd_get_path(fd, &path);
+ if (r < 0)
+ return r;
+
+ if (!path_is_absolute(path))
+ return -EINVAL;
+
+ dn = dirname_malloc(path);
+ if (!dn)
+ return -ENOMEM;
+
+ dfd = open(dn, O_RDONLY|O_CLOEXEC|O_DIRECTORY);
+ if (dfd < 0)
+ return -errno;
+
+ if (fsync(dfd) < 0)
+ return -errno;
+
+ return 0;
+}
+
static int journal_file_refresh_header(JournalFile *f) {
sd_id128_t boot_id;
int r;
/* Sync the online state to disk */
(void) fsync(f->fd);
+ /* We likely just created a new file, also sync the directory this file is located in. */
+ (void) fsync_directory_of_file(f->fd);
+
return r;
}
return -ENODATA;
if (f->writable) {
- uint8_t state;
sd_id128_t machine_id;
+ uint8_t state;
int r;
r = sd_id128_get_machine(&machine_id);
log_debug("Journal file %s has unknown state %i.", f->path, state);
return -EBUSY;
}
+
+ /* Don't permit appending to files from the future. Because otherwise the realtime timestamps wouldn't
+ * be strictly ordered in the entries in the file anymore, and we can't have that since it breaks
+ * bisection. */
+ if (le64toh(f->header->tail_entry_realtime) > now(CLOCK_REALTIME)) {
+ log_debug("Journal file %s is from the future, refusing to append new data to it that'd be older.", f->path);
+ return -ETXTBSY;
+ }
}
f->compress_xz = JOURNAL_HEADER_COMPRESSED_XZ(f->header);
assert(ret);
/* Objects may only be located at multiple of 64 bit */
- if (!VALID64(offset))
- return -EFAULT;
+ if (!VALID64(offset)) {
+ log_debug("Attempt to move to object at non-64bit boundary: %" PRIu64, offset);
+ return -EBADMSG;
+ }
+
+ /* Object may not be located in the file header */
+ if (offset < le64toh(f->header->header_size)) {
+ log_debug("Attempt to move to object located in file header: %" PRIu64, offset);
+ return -EBADMSG;
+ }
r = journal_file_move_to(f, type, false, offset, sizeof(ObjectHeader), &t);
if (r < 0)
o = (Object*) t;
s = le64toh(o->object.size);
- if (s < sizeof(ObjectHeader))
+ if (s == 0) {
+ log_debug("Attempt to move to uninitialized object: %" PRIu64, offset);
return -EBADMSG;
+ }
+ if (s < sizeof(ObjectHeader)) {
+ log_debug("Attempt to move to overly short object: %" PRIu64, offset);
+ return -EBADMSG;
+ }
- if (o->object.type <= OBJECT_UNUSED)
+ if (o->object.type <= OBJECT_UNUSED) {
+ log_debug("Attempt to move to object with invalid type: %" PRIu64, offset);
return -EBADMSG;
+ }
- if (s < minimum_header_size(o))
+ if (s < minimum_header_size(o)) {
+ log_debug("Attempt to move to truncated object: %" PRIu64, offset);
return -EBADMSG;
+ }
- if (type > OBJECT_UNUSED && o->object.type != type)
+ if (type > OBJECT_UNUSED && o->object.type != type) {
+ log_debug("Attempt to move to object of unexpected type: %" PRIu64, offset);
return -EBADMSG;
+ }
if (s > sizeof(ObjectHeader)) {
r = journal_file_move_to(f, type, false, offset, s, &t);
if (r < 0)
return r;
+#ifdef HAVE_GCRYPT
+ r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
+ if (r < 0)
+ return r;
+#endif
+
/* The linking might have altered the window, so let's
* refresh our pointer */
r = journal_file_move_to_object(f, OBJECT_DATA, p, &o);
fo->field.head_data_offset = le64toh(p);
}
-#ifdef HAVE_GCRYPT
- r = journal_file_hmac_put_object(f, OBJECT_DATA, o, p);
- if (r < 0)
- return r;
-#endif
-
if (ret)
*ret = o;
i = right - 1;
lp = p = le64toh(array->entry_array.items[i]);
if (p <= 0)
- return -EBADMSG;
-
- r = test_object(f, p, needle);
+ r = -EBADMSG;
+ else
+ r = test_object(f, p, needle);
+ if (r == -EBADMSG) {
+ log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (1)");
+ n = i;
+ continue;
+ }
if (r < 0)
return r;
p = le64toh(array->entry_array.items[i]);
if (p <= 0)
- return -EBADMSG;
-
- r = test_object(f, p, needle);
+ r = -EBADMSG;
+ else
+ r = test_object(f, p, needle);
+ if (r == -EBADMSG) {
+ log_debug_errno(r, "Encountered invalid entry while bisecting, cutting algorithm short. (2)");
+ right = n = i;
+ continue;
+ }
if (r < 0)
return r;
return 0;
}
+static int bump_array_index(uint64_t *i, direction_t direction, uint64_t n) {
+
+ /* Increase or decrease the specified index, in the right direction. */
+
+ if (direction == DIRECTION_DOWN) {
+ if (*i >= n - 1)
+ return 0;
+
+ (*i) ++;
+ } else {
+ if (*i <= 0)
+ return 0;
+
+ (*i) --;
+ }
+
+ return 1;
+}
+
+static bool check_properly_ordered(uint64_t new_offset, uint64_t old_offset, direction_t direction) {
+
+ /* Consider it an error if any of the two offsets is uninitialized */
+ if (old_offset == 0 || new_offset == 0)
+ return false;
+
+ /* If we go down, the new offset must be larger than the old one. */
+ return direction == DIRECTION_DOWN ?
+ new_offset > old_offset :
+ new_offset < old_offset;
+}
+
int journal_file_next_entry(
JournalFile *f,
uint64_t p,
if (r <= 0)
return r;
- if (direction == DIRECTION_DOWN) {
- if (i >= n - 1)
- return 0;
-
- i++;
- } else {
- if (i <= 0)
- return 0;
-
- i--;
- }
+ r = bump_array_index(&i, direction, n);
+ if (r <= 0)
+ return r;
}
/* And jump to it */
- r = generic_array_get(f,
- le64toh(f->header->entry_array_offset),
- i,
- ret, &ofs);
- if (r <= 0)
- return r;
+ for (;;) {
+ r = generic_array_get(f,
+ le64toh(f->header->entry_array_offset),
+ i,
+ ret, &ofs);
+ if (r > 0)
+ break;
+ if (r != -EBADMSG)
+ return r;
+
+ /* OK, so this entry is borked. Most likely some entry didn't get synced to disk properly, let's see if
+ * the next one might work for us instead. */
+ log_debug_errno(r, "Entry item %" PRIu64 " is bad, skipping over it.", i);
+
+ r = bump_array_index(&i, direction, n);
+ if (r <= 0)
+ return r;
+ }
- if (p > 0 &&
- (direction == DIRECTION_DOWN ? ofs <= p : ofs >= p)) {
- log_debug("%s: entry array corrupted at entry %"PRIu64,
- f->path, i);
+ /* Ensure our array is properly ordered. */
+ if (p > 0 && !check_properly_ordered(ofs, p, direction)) {
+ log_debug("%s: entry array not properly ordered at entry %" PRIu64, f->path, i);
return -EBADMSG;
}
direction_t direction,
Object **ret, uint64_t *offset) {
- uint64_t n, i;
- int r;
+ uint64_t i, n, ofs;
Object *d;
+ int r;
assert(f);
assert(p > 0 || !o);
if (r <= 0)
return r;
- if (direction == DIRECTION_DOWN) {
- if (i >= n - 1)
- return 0;
+ r = bump_array_index(&i, direction, n);
+ if (r <= 0)
+ return r;
+ }
- i++;
- } else {
- if (i <= 0)
- return 0;
+ for (;;) {
+ r = generic_array_get_plus_one(f,
+ le64toh(d->data.entry_offset),
+ le64toh(d->data.entry_array_offset),
+ i,
+ ret, &ofs);
+ if (r > 0)
+ break;
+ if (r != -EBADMSG)
+ return r;
- i--;
- }
+ log_debug_errno(r, "Data entry item %" PRIu64 " is bad, skipping over it.", i);
+
+ r = bump_array_index(&i, direction, n);
+ if (r <= 0)
+ return r;
+ }
+ /* Ensure our array is properly ordered. */
+ if (p > 0 && check_properly_ordered(ofs, p, direction)) {
+ log_debug("%s data entry array not properly ordered at entry %" PRIu64, f->path, i);
+ return -EBADMSG;
}
- return generic_array_get_plus_one(f,
- le64toh(d->data.entry_offset),
- le64toh(d->data.entry_array_offset),
- i,
- ret, offset);
+ if (offset)
+ *offset = ofs;
+
+ return 1;
}
int journal_file_move_to_entry_by_offset_for_data(
"Data Hash Table Size: %"PRIu64"\n"
"Field Hash Table Size: %"PRIu64"\n"
"Rotate Suggested: %s\n"
- "Head Sequential Number: %"PRIu64"\n"
- "Tail Sequential Number: %"PRIu64"\n"
- "Head Realtime Timestamp: %s\n"
- "Tail Realtime Timestamp: %s\n"
- "Tail Monotonic Timestamp: %s\n"
+ "Head Sequential Number: %"PRIu64" (%"PRIx64")\n"
+ "Tail Sequential Number: %"PRIu64" (%"PRIx64")\n"
+ "Head Realtime Timestamp: %s (%"PRIx64")\n"
+ "Tail Realtime Timestamp: %s (%"PRIx64")\n"
+ "Tail Monotonic Timestamp: %s (%"PRIx64")\n"
"Objects: %"PRIu64"\n"
"Entry Objects: %"PRIu64"\n",
f->path,
le64toh(f->header->data_hash_table_size) / sizeof(HashItem),
le64toh(f->header->field_hash_table_size) / sizeof(HashItem),
yes_no(journal_file_rotate_suggested(f, 0)),
- le64toh(f->header->head_entry_seqnum),
- le64toh(f->header->tail_entry_seqnum),
- format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)),
- format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)),
- format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC),
+ le64toh(f->header->head_entry_seqnum), le64toh(f->header->head_entry_seqnum),
+ le64toh(f->header->tail_entry_seqnum), le64toh(f->header->tail_entry_seqnum),
+ format_timestamp_safe(x, sizeof(x), le64toh(f->header->head_entry_realtime)), le64toh(f->header->head_entry_realtime),
+ format_timestamp_safe(y, sizeof(y), le64toh(f->header->tail_entry_realtime)), le64toh(f->header->tail_entry_realtime),
+ format_timespan(z, sizeof(z), le64toh(f->header->tail_entry_monotonic), USEC_PER_MSEC), le64toh(f->header->tail_entry_monotonic),
le64toh(f->header->n_objects),
le64toh(f->header->n_entries));
}
int journal_file_open(
+ int fd,
const char *fname,
int flags,
mode_t mode,
void *h;
int r;
- assert(fname);
assert(ret);
+ assert(fd >= 0 || fname);
if ((flags & O_ACCMODE) != O_RDONLY &&
(flags & O_ACCMODE) != O_RDWR)
return -EINVAL;
- if (!endswith(fname, ".journal") &&
- !endswith(fname, ".journal~"))
- return -EINVAL;
+ if (fname) {
+ if (!endswith(fname, ".journal") &&
+ !endswith(fname, ".journal~"))
+ return -EINVAL;
+ }
f = new0(JournalFile, 1);
if (!f)
return -ENOMEM;
- f->fd = -1;
+ f->fd = fd;
f->mode = mode;
f->flags = flags;
}
}
- f->path = strdup(fname);
+ if (fname)
+ f->path = strdup(fname);
+ else /* If we don't know the path, fill in something explanatory and vaguely useful */
+ asprintf(&f->path, "/proc/self/%i", fd);
if (!f->path) {
r = -ENOMEM;
goto fail;
goto fail;
}
- f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
if (f->fd < 0) {
- r = -errno;
- goto fail;
+ f->fd = open(f->path, f->flags|O_CLOEXEC, f->mode);
+ if (f->fd < 0) {
+ r = -errno;
+ goto fail;
+ }
+
+ /* fds we opened here by us should also be closed by us. */
+ f->close_fd = true;
}
r = journal_file_fstat(f);
goto fail;
}
+ /* The file is opened now successfully, thus we take possession of any passed in fd. */
+ f->close_fd = true;
+
*ret = f;
return 0;
if (!old_file->writable)
return -EINVAL;
+ /* Is this a journal file that was passed to us as fd? If so, we synthesized a path name for it, and we refuse
+ * rotation, since we don't know the actual path, and couldn't rename the file hence.*/
+ if (path_startswith(old_file->path, "/proc/self/fd"))
+ return -EINVAL;
+
if (!endswith(old_file->path, ".journal"))
return -EINVAL;
if (r < 0 && errno != ENOENT)
return -errno;
- old_file->header->state = STATE_ARCHIVED;
+ /* Sync the rename to disk */
+ (void) fsync_directory_of_file(old_file->fd);
+
+ /* Set as archive so offlining commits w/state=STATE_ARCHIVED.
+ * Previously we would set old_file->header->state to STATE_ARCHIVED directly here,
+ * but journal_file_set_offline() short-circuits when state != STATE_ONLINE, which
+ * would result in the rotated journal never getting fsync() called before closing.
+ * Now we simply queue the archive state by setting an archive bit, leaving the state
+ * as STATE_ONLINE so proper offlining occurs. */
+ old_file->archive = true;
/* Currently, btrfs is not very good with out write patterns
* and fragments heavily. Let's defrag our journal files when
* we archive them */
old_file->defrag_on_close = true;
- r = journal_file_open(old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, deferred_closes, old_file, &new_file);
+ r = journal_file_open(-1, old_file->path, old_file->flags, old_file->mode, compress, seal, NULL, old_file->mmap, deferred_closes, old_file, &new_file);
if (deferred_closes &&
set_put(deferred_closes, old_file) >= 0)
size_t l;
_cleanup_free_ char *p = NULL;
- r = journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, deferred_closes, template, ret);
+ r = journal_file_open(-1, fname, flags, mode, compress, seal, metrics, mmap_cache, deferred_closes, template, ret);
if (!IN_SET(r,
-EBADMSG, /* corrupted */
-ENODATA, /* truncated */
-EBUSY, /* unclean shutdown */
-ESHUTDOWN, /* already archived */
-EIO, /* IO error, including SIGBUS on mmap */
- -EIDRM /* File has been deleted */))
+ -EIDRM, /* File has been deleted */
+ -ETXTBSY)) /* File is from the future */
return r;
if ((flags & O_ACCMODE) == O_RDONLY)
/* btrfs doesn't cope well with our write pattern and
* fragments heavily. Let's defrag all files we rotate */
- (void) chattr_path(p, false, FS_NOCOW_FL);
+ (void) chattr_path(p, 0, FS_NOCOW_FL);
(void) btrfs_defrag(p);
log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
- return journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, deferred_closes, template, ret);
+ return journal_file_open(-1, fname, flags, mode, compress, seal, metrics, mmap_cache, deferred_closes, template, ret);
}
int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {