along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
-#include <sys/mman.h>
#include <errno.h>
-#include <sys/uio.h>
-#include <unistd.h>
-#include <sys/statvfs.h>
#include <fcntl.h>
+#include <linux/fs.h>
#include <stddef.h>
+#include <sys/mman.h>
+#include <sys/statvfs.h>
+#include <sys/uio.h>
+#include <unistd.h>
#include "btrfs-util.h"
-#include "journal-def.h"
-#include "journal-file.h"
+#include "compress.h"
#include "journal-authenticate.h"
+#include "journal-def.h"
#include "lookup3.h"
-#include "compress.h"
-#include "fsprg.h"
+#include "random-util.h"
+#include "string-util.h"
+#include "journal-file.h"
#define DEFAULT_DATA_HASH_TABLE_SIZE (2047ULL*sizeof(HashItem))
#define DEFAULT_FIELD_HASH_TABLE_SIZE (333ULL*sizeof(HashItem))
#define DEFAULT_MAX_USE_LOWER (1ULL*1024ULL*1024ULL) /* 1 MiB */
#define DEFAULT_MAX_USE_UPPER (4ULL*1024ULL*1024ULL*1024ULL) /* 4 GiB */
+/* This is the default minimal use limit, how much we'll use even if keep_free suggests otherwise. */
+#define DEFAULT_MIN_USE (1ULL*1024ULL*1024ULL) /* 1 MiB */
+
/* This is the upper bound if we deduce max_size from max_use */
#define DEFAULT_MAX_SIZE_UPPER (128ULL*1024ULL*1024ULL) /* 128 MiB */
* size */
#define DEFAULT_KEEP_FREE (1024ULL*1024ULL) /* 1 MB */
+/* This is the default maximum number of journal files to keep around. */
+#define DEFAULT_N_MAX_FILES (100)
+
/* n_data was the first entry we added after the initial file format design */
#define HEADER_SIZE_MIN ALIGN64(offsetof(Header, n_data))
return 0;
}
-void journal_file_close(JournalFile *f) {
+JournalFile* journal_file_close(JournalFile *f) {
assert(f);
#ifdef HAVE_GCRYPT
if (f->mmap && f->fd >= 0)
mmap_cache_close_fd(f->mmap, f->fd);
- if (f->fd >= 0 && f->defrag_on_close)
- btrfs_defrag_fd(f->fd);
+ if (f->fd >= 0 && f->defrag_on_close) {
+
+ /* Be friendly to btrfs: turn COW back on again now,
+ * and defragment the file. We won't write to the file
+ * ever again, hence remove all fragmentation, and
+ * reenable all the good bits COW usually provides
+ * (such as data checksumming). */
+
+ (void) chattr_fd(f->fd, 0, FS_NOCOW_FL);
+ (void) btrfs_defrag_fd(f->fd);
+ }
safe_close(f->fd);
free(f->path);
#ifdef HAVE_GCRYPT
if (f->fss_file)
munmap(f->fss_file, PAGE_ALIGN(f->fss_file_size));
- else if (f->fsprg_state)
+ else
free(f->fsprg_state);
free(f->fsprg_seed);
#endif
free(f);
+ return NULL;
}
static int journal_file_init_header(JournalFile *f, JournalFile *template) {
} else if (state == STATE_ARCHIVED)
return -ESHUTDOWN;
else if (state != STATE_OFFLINE) {
- log_debug("Journal file %s has unknown state %u.", f->path, state);
+ log_debug("Journal file %s has unknown state %i.", f->path, state);
return -EBUSY;
}
}
if (fstatvfs(f->fd, &svfs) >= 0) {
uint64_t available;
- available = svfs.f_bfree * svfs.f_bsize;
-
- if (available >= f->metrics.keep_free)
- available -= f->metrics.keep_free;
- else
- available = 0;
+ available = LESS_BY((uint64_t) svfs.f_bfree * (uint64_t) svfs.f_bsize, f->metrics.keep_free);
if (new_size - old_size > available)
return -E2BIG;
assert(f);
- /* We estimate that we need 1 hash table entry per 768 of
- journal file and we want to make sure we never get beyond
- 75% fill level. Calculate the hash table size for the
- maximum file size based on these metrics. */
+ /* We estimate that we need 1 hash table entry per 768 bytes
+ of journal file and we want to make sure we never get
+ beyond 75% fill level. Calculate the hash table size for
+ the maximum file size based on these metrics. */
s = (f->metrics.max_size * 4 / 768 / 3) * sizeof(HashItem);
if (s < DEFAULT_DATA_HASH_TABLE_SIZE)
return 0;
}
-static int journal_file_map_data_hash_table(JournalFile *f) {
+int journal_file_map_data_hash_table(JournalFile *f) {
uint64_t s, p;
void *t;
int r;
assert(f);
+ if (f->data_hash_table)
+ return 0;
+
p = le64toh(f->header->data_hash_table_offset);
s = le64toh(f->header->data_hash_table_size);
return 0;
}
-static int journal_file_map_field_hash_table(JournalFile *f) {
+int journal_file_map_field_hash_table(JournalFile *f) {
uint64_t s, p;
void *t;
int r;
assert(f);
+ if (f->field_hash_table)
+ return 0;
+
p = le64toh(f->header->field_hash_table_offset);
s = le64toh(f->header->field_hash_table_size);
assert(f);
assert(field && size > 0);
+ /* If the field hash table is empty, we can't find anything */
+ if (le64toh(f->header->field_hash_table_size) <= 0)
+ return 0;
+
+ /* Map the field hash table, if it isn't mapped yet. */
+ r = journal_file_map_field_hash_table(f);
+ if (r < 0)
+ return r;
+
osize = offsetof(Object, field.payload) + size;
m = le64toh(f->header->field_hash_table_size) / sizeof(HashItem);
-
if (m <= 0)
return -EBADMSG;
assert(f);
assert(data || size == 0);
+ /* If there's no data hash table, then there's no entry. */
+ if (le64toh(f->header->data_hash_table_size) <= 0)
+ return 0;
+
+ /* Map the data hash table, if it isn't mapped yet. */
+ r = journal_file_map_data_hash_table(f);
+ if (r < 0)
+ return r;
+
osize = offsetof(Object, data.payload) + size;
m = le64toh(f->header->data_hash_table_size) / sizeof(HashItem);
if (o->object.flags & OBJECT_COMPRESSION_MASK) {
#if defined(HAVE_XZ) || defined(HAVE_LZ4)
uint64_t l;
- size_t rsize;
+ size_t rsize = 0;
l = le64toh(o->object.size);
if (l <= offsetof(Object, data.payload))
#if defined(HAVE_XZ) || defined(HAVE_LZ4)
if (f->compress_xz &&
size >= COMPRESSION_SIZE_THRESHOLD) {
- size_t rsize;
+ size_t rsize = 0;
compression = compress_blob(data, size, o->data.payload, &rsize);
return TEST_RIGHT;
}
-static inline int find_data_object_by_boot_id(
+static int find_data_object_by_boot_id(
JournalFile *f,
sd_id128_t boot_id,
Object **o,
uint64_t *b) {
+
char t[sizeof("_BOOT_ID=")-1 + 32 + 1] = "_BOOT_ID=";
sd_id128_to_string(boot_id, t + 9);
f->current_xor_hash = 0;
}
-void journal_file_save_location(JournalFile *f, direction_t direction, Object *o, uint64_t offset) {
- f->last_direction = direction;
+void journal_file_save_location(JournalFile *f, Object *o, uint64_t offset) {
f->location_type = LOCATION_SEEK;
f->current_offset = offset;
f->current_seqnum = le64toh(o->entry.seqnum);
break;
default:
- printf("Type: unknown (%u)\n", o->object.type);
+ printf("Type: unknown (%i)\n", o->object.type);
break;
}
le64toh(f->header->n_entry_arrays));
if (fstat(f->fd, &st) >= 0)
- printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (off_t) st.st_blocks * 512ULL));
+ printf("Disk usage: %s\n", format_bytes(bytes, sizeof(bytes), (uint64_t) st.st_blocks * 512ULL));
+}
+
+static int journal_file_warn_btrfs(JournalFile *f) {
+ unsigned attrs;
+ int r;
+
+ assert(f);
+
+ /* Before we write anything, check if the COW logic is turned
+ * off on btrfs. Given our write pattern that is quite
+ * unfriendly to COW file systems this should greatly improve
+ * performance on COW file systems, such as btrfs, at the
+ * expense of data integrity features (which shouldn't be too
+ * bad, given that we do our own checksumming). */
+
+ r = btrfs_is_filesystem(f->fd);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to determine if journal is on btrfs: %m");
+ if (!r)
+ return 0;
+
+ r = read_attr_fd(f->fd, &attrs);
+ if (r < 0)
+ return log_warning_errno(r, "Failed to read file attributes: %m");
+
+ if (attrs & FS_NOCOW_FL) {
+ log_debug("Detected btrfs file system with copy-on-write disabled, all is good.");
+ return 0;
+ }
+
+ log_notice("Creating journal file %s on a btrfs file system, and copy-on-write is enabled. "
+ "This is likely to slow down journal access substantially, please consider turning "
+ "off the copy-on-write file attribute on the journal directory, using chattr +C.", f->path);
+
+ return 1;
}
int journal_file_open(
goto fail;
if (f->last_stat.st_size == 0 && f->writable) {
+
+ (void) journal_file_warn_btrfs(f);
+
/* Let's attach the creation time to the journal file,
* so that the vacuuming code knows the age of this
* file even if the file might end up corrupted one
* attributes are not supported we'll just skip this,
* and rely solely on mtime/atime/ctime of the file. */
- fd_setcrtime(f->fd, now(CLOCK_REALTIME));
+ fd_setcrtime(f->fd, 0);
#ifdef HAVE_GCRYPT
/* Try to load the FSPRG state, and if we can't, then
}
r = mmap_cache_get(f->mmap, f->fd, f->prot, CONTEXT_HEADER, true, 0, PAGE_ALIGN(sizeof(Header)), &f->last_stat, &h);
- if (r < 0) {
- r = -errno;
+ if (r < 0)
goto fail;
- }
f->header = h;
#endif
}
- r = journal_file_map_field_hash_table(f);
- if (r < 0)
- goto fail;
-
- r = journal_file_map_data_hash_table(f);
- if (r < 0)
- goto fail;
-
if (mmap_cache_got_sigbus(f->mmap, f->fd)) {
r = -EIO;
goto fail;
size_t l;
_cleanup_free_ char *p = NULL;
- r = journal_file_open(fname, flags, mode, compress, seal,
- metrics, mmap_cache, template, ret);
- if (r != -EBADMSG && /* corrupted */
- r != -ENODATA && /* truncated */
- r != -EHOSTDOWN && /* other machine */
- r != -EPROTONOSUPPORT && /* incompatible feature */
- r != -EBUSY && /* unclean shutdown */
- r != -ESHUTDOWN && /* already archived */
- r != -EIO /* IO error, including SIGBUS on mmap */)
+ r = journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
+ if (!IN_SET(r,
+ -EBADMSG, /* corrupted */
+ -ENODATA, /* truncated */
+ -EHOSTDOWN, /* other machine */
+ -EPROTONOSUPPORT, /* incompatible feature */
+ -EBUSY, /* unclean shutdown */
+ -ESHUTDOWN, /* already archived */
+ -EIO, /* IO error, including SIGBUS on mmap */
+ -EIDRM /* File has been deleted */))
return r;
if ((flags & O_ACCMODE) == O_RDONLY)
/* The file is corrupted. Rotate it away and try it again (but only once) */
l = strlen(fname);
- if (asprintf(&p, "%.*s@%016llx-%016" PRIx64 ".journal~",
+ if (asprintf(&p, "%.*s@%016"PRIx64 "-%016"PRIx64 ".journal~",
(int) l - 8, fname,
- (unsigned long long) now(CLOCK_REALTIME),
+ now(CLOCK_REALTIME),
random_u64()) < 0)
return -ENOMEM;
- r = rename(fname, p);
- if (r < 0)
+ if (rename(fname, p) < 0)
return -errno;
/* btrfs doesn't cope well with our write pattern and
* fragments heavily. Let's defrag all files we rotate */
+
+ (void) chattr_path(p, false, FS_NOCOW_FL);
(void) btrfs_defrag(p);
- log_warning("File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
+ log_warning_errno(r, "File %s corrupted or uncleanly shut down, renaming and replacing.", fname);
- return journal_file_open(fname, flags, mode, compress, seal,
- metrics, mmap_cache, template, ret);
+ return journal_file_open(fname, flags, mode, compress, seal, metrics, mmap_cache, template, ret);
}
int journal_file_copy_entry(JournalFile *from, JournalFile *to, Object *o, uint64_t p, uint64_t *seqnum, Object **ret, uint64_t *offset) {
if (o->object.flags & OBJECT_COMPRESSION_MASK) {
#if defined(HAVE_XZ) || defined(HAVE_LZ4)
- size_t rsize;
+ size_t rsize = 0;
r = decompress_blob(o->object.flags & OBJECT_COMPRESSION_MASK,
o->data.payload, l, &from->compress_buffer, &from->compress_buffer_size, &rsize, 0);
return r;
}
+void journal_reset_metrics(JournalMetrics *m) {
+ assert(m);
+
+ /* Set everything to "pick automatic values". */
+
+ *m = (JournalMetrics) {
+ .min_use = (uint64_t) -1,
+ .max_use = (uint64_t) -1,
+ .min_size = (uint64_t) -1,
+ .max_size = (uint64_t) -1,
+ .keep_free = (uint64_t) -1,
+ .n_max_files = (uint64_t) -1,
+ };
+}
+
void journal_default_metrics(JournalMetrics *m, int fd) {
- uint64_t fs_size = 0;
+ char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX], e[FORMAT_BYTES_MAX];
struct statvfs ss;
- char a[FORMAT_BYTES_MAX], b[FORMAT_BYTES_MAX], c[FORMAT_BYTES_MAX], d[FORMAT_BYTES_MAX];
+ uint64_t fs_size;
assert(m);
assert(fd >= 0);
if (fstatvfs(fd, &ss) >= 0)
fs_size = ss.f_frsize * ss.f_blocks;
+ else {
+ log_debug_errno(errno, "Failed to detremine disk size: %m");
+ fs_size = 0;
+ }
if (m->max_use == (uint64_t) -1) {
} else {
m->max_use = PAGE_ALIGN(m->max_use);
- if (m->max_use < JOURNAL_FILE_SIZE_MIN*2)
+ if (m->max_use != 0 && m->max_use < JOURNAL_FILE_SIZE_MIN*2)
m->max_use = JOURNAL_FILE_SIZE_MIN*2;
}
+ if (m->min_use == (uint64_t) -1)
+ m->min_use = DEFAULT_MIN_USE;
+
+ if (m->min_use > m->max_use)
+ m->min_use = m->max_use;
+
if (m->max_size == (uint64_t) -1) {
m->max_size = PAGE_ALIGN(m->max_use / 8); /* 8 chunks */
} else
m->max_size = PAGE_ALIGN(m->max_size);
- if (m->max_size < JOURNAL_FILE_SIZE_MIN)
- m->max_size = JOURNAL_FILE_SIZE_MIN;
+ if (m->max_size != 0) {
+ if (m->max_size < JOURNAL_FILE_SIZE_MIN)
+ m->max_size = JOURNAL_FILE_SIZE_MIN;
- if (m->max_size*2 > m->max_use)
- m->max_use = m->max_size*2;
+ if (m->max_use != 0 && m->max_size*2 > m->max_use)
+ m->max_use = m->max_size*2;
+ }
if (m->min_size == (uint64_t) -1)
m->min_size = JOURNAL_FILE_SIZE_MIN;
if (m->min_size < JOURNAL_FILE_SIZE_MIN)
m->min_size = JOURNAL_FILE_SIZE_MIN;
- if (m->min_size > m->max_size)
+ if (m->max_size != 0 && m->min_size > m->max_size)
m->max_size = m->min_size;
}
m->keep_free = DEFAULT_KEEP_FREE;
}
- log_debug("Fixed max_use=%s max_size=%s min_size=%s keep_free=%s",
- format_bytes(a, sizeof(a), m->max_use),
- format_bytes(b, sizeof(b), m->max_size),
- format_bytes(c, sizeof(c), m->min_size),
- format_bytes(d, sizeof(d), m->keep_free));
+ if (m->n_max_files == (uint64_t) -1)
+ m->n_max_files = DEFAULT_N_MAX_FILES;
+
+ log_debug("Fixed min_use=%s max_use=%s max_size=%s min_size=%s keep_free=%s n_max_files=%" PRIu64,
+ format_bytes(a, sizeof(a), m->min_use),
+ format_bytes(b, sizeof(b), m->max_use),
+ format_bytes(c, sizeof(c), m->max_size),
+ format_bytes(d, sizeof(d), m->min_size),
+ format_bytes(e, sizeof(e), m->keep_free),
+ m->n_max_files);
}
int journal_file_get_cutoff_realtime_usec(JournalFile *f, usec_t *from, usec_t *to) {