--- /dev/null
+.. SPDX-License-Identifier: GPL-2.0
+
+Casefolding
+===========
+
+bcachefs has support for case-insensitive file and directory
+lookups using the regular `chattr +F` (`S_CASEFOLD`, `FS_CASEFOLD_FL`)
+casefolding attributes.
+
+The main usecase for casefolding is compatibility with software written
+against other filesystems that rely on casefolded lookups
+(eg. NTFS and Wine/Proton).
+Taking advantage of file-system level casefolding can lead to great
+loading time gains in many applications and games.
+
+Casefolding support requires a kernel with `CONFIG_UNICODE` enabled.
+Once a directory has been flagged for casefolding, a feature bit
+is enabled on the superblock which marks the filesystem as using
+casefolding.
+When the feature bit for casefolding is enabled, it is no longer possible
+to mount that filesystem on kernels without `CONFIG_UNICODE` enabled.
+
+On the lookup/query side: casefolding is implemented by allocating a new
+string of `BCH_NAME_MAX` length using the `utf8_casefold` function to
+casefold the query string.
+
+On the dirent side: casefolding is implemented by ensuring the `bkey`'s
+hash is made from the casefolded string and storing the cached casefolded
+name with the regular name in the dirent.
+
+The structure looks like this::
+
+  Regular:    [dirent data][regular name][nul][nul]...
+  Casefolded: [dirent data][reg len][cf len][regular name][casefolded name][nul][nul]...
+
+(Do note, the number of `NUL`s here is merely for illustration; the count can vary
+ per-key, and they may not even be present if the key is aligned to `sizeof(u64)`.)
+
+This is efficient as it means that for all file lookups that require casefolding,
+it has identical performance to a regular lookup:
+a hash comparison and a `memcmp` of the name.
+
+Rationale
+---------
+
+Several designs were considered for this system:
+One was to introduce a dirent_v2, however that would be painful especially as
+the hash system only has support for a single key type. This would also need
+`BCH_NAME_MAX` to change between versions, and a new feature bit.
+
+Another option was to store without the two lengths, and just take the length of
+the regular name and casefolded name contiguously / 2 as the length. This would
+assume that the regular length == casefolded length, but that could potentially
+not be true, if the uppercase unicode glyph had a different UTF-8 encoding than
+the lowercase unicode glyph.
+It would be possible to disregard the casefold cache for those cases, but it was
+decided to simply encode the two string lengths in the key to avoid random
+performance issues if this edgecase was ever hit.
+
+The option settled on was to use a free bit in d_type to mark a dirent as having
+a casefold cache, and then treat the first 4 bytes of the name block as lengths.
+You can see this in the `d_cf_name_block` member of the union in `bch_dirent`.
+
+The feature bit was used to allow casefolding support to be enabled for the majority
+of users, while still allowing users who have no need for the feature to use bcachefs, as
+`CONFIG_UNICODE` can increase the kernel size by a significant amount due to the tables used,
+which may be the deciding factor in whether to use bcachefs on e.g. embedded platforms.
+
+Other filesystems like ext4 and f2fs have a super-block level option for casefolding
+encoding, but bcachefs currently does not provide this. ext4 and f2fs do not expose
+any encodings other than a single UTF-8 version. When future encodings are desirable,
+they will be added trivially using the opts mechanism.
+
+dentry/dcache considerations
+----------------------------
+
+Currently, in casefolded directories, bcachefs (like other filesystems) will not cache
+negative dentries.
+
+This is because currently doing so presents a problem in the following scenario:
+
+ - Lookup file "blAH" in a casefolded directory
+ - Creation of file "BLAH" in a casefolded directory
+ - Lookup file "blAH" in a casefolded directory
+
+The second lookup would wrongly fail if negative dentries were cached.
+
+This is slightly suboptimal, but could be fixed in future with some vfs work.
+
#include <linux/dcache.h>
+/*
+ * Casefold @str using the encoding in @info->cf_encoding.
+ *
+ * @out_cf is always reset to an empty qstr first.  On success it points at a
+ * buffer of BCH_NAME_MAX + 1 bytes obtained from bch2_trans_kmalloc() and its
+ * length is the value returned by utf8_casefold().
+ *
+ * Returns 0 on success, the allocation error if the buffer could not be
+ * obtained, or utf8_casefold()'s return value when that is <= 0 (so a zero
+ * length result returns 0 with @out_cf left empty).  Without CONFIG_UNICODE
+ * this always returns -EOPNOTSUPP.
+ */
+static int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info,
+			 const struct qstr *str, struct qstr *out_cf)
+{
+	*out_cf = (struct qstr) QSTR_INIT(NULL, 0);
+
+#ifndef CONFIG_UNICODE
+	return -EOPNOTSUPP;
+#else
+	unsigned char *cf_buf = bch2_trans_kmalloc(trans, BCH_NAME_MAX + 1);
+	if (IS_ERR(cf_buf))
+		return PTR_ERR(cf_buf);
+
+	int cf_len = utf8_casefold(info->cf_encoding, str, cf_buf, BCH_NAME_MAX + 1);
+	if (cf_len <= 0)
+		return cf_len;
+
+	*out_cf = (struct qstr) QSTR_INIT(cf_buf, cf_len);
+	return 0;
+#endif
+}
+
+/*
+ * Produce the name to use for a hashed lookup in a directory.
+ *
+ * If @info has no casefold encoding, @out_cf becomes a plain copy of @str and
+ * nothing is allocated.  Otherwise the work is delegated to bch2_casefold()
+ * and its return value is passed straight through.
+ */
+static inline int bch2_maybe_casefold(struct btree_trans *trans,
+				      const struct bch_hash_info *info,
+				      const struct qstr *str, struct qstr *out_cf)
+{
+	/* Common case: directory is not casefolded. */
+	if (likely(!info->cf_encoding)) {
+		*out_cf = *str;
+		return 0;
+	}
+
+	return bch2_casefold(trans, info, str, out_cf);
+}
+
static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d)
{
if (bkey_val_bytes(d.k) < offsetof(struct bch_dirent, d_name))
#endif
return bkey_bytes -
- offsetof(struct bch_dirent, d_name) -
+ (d.v->d_casefold
+ ? offsetof(struct bch_dirent, d_cf_name_block.d_names)
+ : offsetof(struct bch_dirent, d_name)) -
trailing_nuls;
}
struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d)
{
- return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d));
+ if (d.v->d_casefold) { /* casefolded layout: the regular name sits at the start of d_names */
+ unsigned name_len = le16_to_cpu(d.v->d_cf_name_block.d_name_len); /* length is stored explicitly, little-endian */
+ return (struct qstr) QSTR_INIT(&d.v->d_cf_name_block.d_names[0], name_len);
+ } else { /* regular layout: length comes from bch2_dirent_name_bytes() */
+ return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d));
+ }
+}
+
+/*
+ * Return the cached casefolded name stored in @d, or an empty qstr
+ * (NULL name, zero length) when the dirent is not marked d_casefold.
+ *
+ * In a casefolded dirent the folded name immediately follows the regular
+ * name inside d_cf_name_block.d_names, so its start is found by skipping
+ * d_name_len bytes.
+ */
+static struct qstr bch2_dirent_get_casefold_name(struct bkey_s_c_dirent d)
+{
+	if (!d.v->d_casefold)
+		return (struct qstr) QSTR_INIT(NULL, 0);
+
+	unsigned regular_len = le16_to_cpu(d.v->d_cf_name_block.d_name_len);
+	unsigned folded_len = le16_to_cpu(d.v->d_cf_name_block.d_cf_name_len);
+
+	return (struct qstr) QSTR_INIT(&d.v->d_cf_name_block.d_names[regular_len], folded_len);
+}
+
+/*
+ * Return the name a dirent is hashed and compared by: the casefolded name
+ * when the dirent is marked d_casefold, the regular name otherwise.
+ */
+static inline struct qstr bch2_dirent_get_lookup_name(struct bkey_s_c_dirent d)
+{
+	if (d.v->d_casefold)
+		return bch2_dirent_get_casefold_name(d);
+
+	return bch2_dirent_get_name(d);
}
static u64 bch2_dirent_hash(const struct bch_hash_info *info,
static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k)
{
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
- struct qstr name = bch2_dirent_get_name(d);
+ struct qstr name = bch2_dirent_get_lookup_name(d); /* casefolded name if the dirent has one, else the regular name */
return bch2_dirent_hash(info, &name);
}
static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r)
{
struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
- const struct qstr l_name = bch2_dirent_get_name(l);
+ const struct qstr l_name = bch2_dirent_get_lookup_name(l);
const struct qstr *r_name = _r;
return !qstr_eq(l_name, *r_name);
{
struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l);
struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r);
- const struct qstr l_name = bch2_dirent_get_name(l);
- const struct qstr r_name = bch2_dirent_get_name(r);
+ const struct qstr l_name = bch2_dirent_get_lookup_name(l);
+ const struct qstr r_name = bch2_dirent_get_lookup_name(r);
return !qstr_eq(l_name, r_name);
}
struct bkey_validate_context from)
{
struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k);
+ unsigned name_block_len = bch2_dirent_name_bytes(d);
struct qstr d_name = bch2_dirent_get_name(d);
+ struct qstr d_cf_name = bch2_dirent_get_casefold_name(d);
int ret = 0;
bkey_fsck_err_on(!d_name.len,
c, dirent_empty_name,
"empty name");
- bkey_fsck_err_on(bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len),
+ bkey_fsck_err_on(d_name.len + d_cf_name.len > name_block_len,
c, dirent_val_too_big,
- "value too big (%zu > %u)",
- bkey_val_u64s(k.k), dirent_val_u64s(d_name.len));
+ "dirent names exceed bkey size (%d + %d > %d)",
+ d_name.len, d_cf_name.len, name_block_len);
/*
* Check new keys don't exceed the max length
le64_to_cpu(d.v->d_inum) == d.k->p.inode,
c, dirent_to_itself,
"dirent points to own directory");
+
+ if (d.v->d_casefold) {
+ bkey_fsck_err_on(from.from == BKEY_VALIDATE_commit &&
+ d_cf_name.len > BCH_NAME_MAX,
+ c, dirent_cf_name_too_big,
+ "dirent w/ cf name too big (%u > %u)",
+ d_cf_name.len, BCH_NAME_MAX);
+
+ bkey_fsck_err_on(d_cf_name.len != strnlen(d_cf_name.name, d_cf_name.len),
+ c, dirent_stray_data_after_cf_name,
+ "dirent has stray data after cf name's NUL");
+ }
fsck_err:
return ret;
}
static struct bkey_i_dirent *dirent_alloc_key(struct btree_trans *trans,
subvol_inum dir,
u8 type,
- int name_len, u64 dst)
+ int name_len, int cf_name_len,
+ u64 dst)
{
struct bkey_i_dirent *dirent;
- unsigned u64s = BKEY_U64s + dirent_val_u64s(name_len);
+ unsigned u64s = BKEY_U64s + dirent_val_u64s(name_len, cf_name_len);
BUG_ON(u64s > U8_MAX);
}
dirent->v.d_type = type;
+ dirent->v.d_unused = 0;
+ dirent->v.d_casefold = cf_name_len ? 1 : 0;
return dirent;
}
static void dirent_init_regular_name(struct bkey_i_dirent *dirent,
const struct qstr *name)
{
+ EBUG_ON(dirent->v.d_casefold);
+
memcpy(&dirent->v.d_name[0], name->name, name->len);
memset(&dirent->v.d_name[name->len], 0,
bkey_val_bytes(&dirent->k) -
name->len);
}
+/*
+ * Fill in the name block of a casefolded dirent.
+ *
+ * Layout written into d_cf_name_block:
+ *   [d_name_len][d_cf_name_len][name][cf_name][zero padding to end of value]
+ *
+ * Both lengths are stored little-endian, matching the le16_to_cpu() done by
+ * bch2_dirent_get_name() and bch2_dirent_get_casefold_name().
+ *
+ * @dirent must be marked d_casefold and @cf_name must be non-empty.
+ */
+static void dirent_init_casefolded_name(struct bkey_i_dirent *dirent,
+					const struct qstr *name,
+					const struct qstr *cf_name)
+{
+	EBUG_ON(!dirent->v.d_casefold);
+	EBUG_ON(!cf_name->len);
+
+	dirent->v.d_cf_name_block.d_name_len = cpu_to_le16(name->len);
+	dirent->v.d_cf_name_block.d_cf_name_len = cpu_to_le16(cf_name->len);
+	memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len);
+	memcpy(&dirent->v.d_cf_name_block.d_names[name->len], cf_name->name, cf_name->len);
+
+	/*
+	 * The padding is what remains of the value after the header and BOTH
+	 * names.  The sum is parenthesised so cf_name->len is subtracted, not
+	 * added; adding it would zero 2 * cf_name->len bytes past the value.
+	 */
+	memset(&dirent->v.d_cf_name_block.d_names[name->len + cf_name->len], 0,
+	       bkey_val_bytes(&dirent->k) -
+	       offsetof(struct bch_dirent, d_cf_name_block.d_names) -
+	       (name->len + cf_name->len));
+
+	EBUG_ON(bch2_dirent_get_casefold_name(dirent_i_to_s_c(dirent)).len != cf_name->len);
+}
+
static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans,
subvol_inum dir,
u8 type,
const struct qstr *name,
+ const struct qstr *cf_name, /* casefolded form of @name, or NULL for a regular dirent */
u64 dst)
{
struct bkey_i_dirent *dirent;
if (name->len > BCH_NAME_MAX)
return ERR_PTR(-ENAMETOOLONG);
- dirent = dirent_alloc_key(trans, dir, type, name->len, dst);
+ dirent = dirent_alloc_key(trans, dir, type, name->len, cf_name ? cf_name->len : 0, dst); /* a cf length of 0 requests a non-casefolded key */
if (IS_ERR(dirent))
return dirent;
- dirent_init_regular_name(dirent, name);
+ if (cf_name) /* choose the name layout from whether a casefolded name was supplied */
+ dirent_init_casefolded_name(dirent, name, cf_name);
+ else
+ dirent_init_regular_name(dirent, name);
- EBUG_ON(bch2_dirent_name_bytes(dirent_i_to_s_c(dirent)) != name->len);
+ EBUG_ON(bch2_dirent_get_name(dirent_i_to_s_c(dirent)).len != name->len); /* the stored name must read back with the same length */
return dirent;
}
struct bkey_i_dirent *dirent;
int ret;
- dirent = dirent_create_key(trans, dir_inum, type, name, dst_inum);
+ dirent = dirent_create_key(trans, dir_inum, type, name, NULL, dst_inum);
ret = PTR_ERR_OR_ZERO(dirent);
if (ret)
return ret;
struct bkey_i_dirent *dirent;
int ret;
- dirent = dirent_create_key(trans, dir, type, name, dst_inum);
+ if (hash_info->cf_encoding) {
+ struct qstr cf_name;
+ ret = bch2_casefold(trans, hash_info, name, &cf_name);
+ if (ret)
+ return ret;
+ dirent = dirent_create_key(trans, dir, type, name, &cf_name, dst_inum);
+ } else {
+ dirent = dirent_create_key(trans, dir, type, name, NULL, dst_inum);
+ }
+
ret = PTR_ERR_OR_ZERO(dirent);
if (ret)
return ret;
const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset,
enum bch_rename_mode mode)
{
+ struct qstr src_name_lookup, dst_name_lookup;
struct btree_iter src_iter = { NULL };
struct btree_iter dst_iter = { NULL };
struct bkey_s_c old_src, old_dst = bkey_s_c_null;
memset(dst_inum, 0, sizeof(*dst_inum));
/* Lookup src: */
+ ret = bch2_maybe_casefold(trans, src_hash, src_name, &src_name_lookup);
+ if (ret)
+ goto out;
old_src = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc,
- src_hash, src_dir, src_name,
+ src_hash, src_dir, &src_name_lookup,
BTREE_ITER_intent);
ret = bkey_err(old_src);
if (ret)
goto out;
/* Lookup dst: */
+ ret = bch2_maybe_casefold(trans, dst_hash, dst_name, &dst_name_lookup);
+ if (ret)
+ goto out;
if (mode == BCH_RENAME) {
/*
* Note that we're _not_ checking if the target already exists -
* correctness:
*/
ret = bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc,
- dst_hash, dst_dir, dst_name);
+ dst_hash, dst_dir, &dst_name_lookup);
if (ret)
goto out;
} else {
old_dst = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc,
- dst_hash, dst_dir, dst_name,
+ dst_hash, dst_dir, &dst_name_lookup,
BTREE_ITER_intent);
ret = bkey_err(old_dst);
if (ret)
*src_offset = dst_iter.pos.offset;
/* Create new dst key: */
- new_dst = dirent_create_key(trans, dst_dir, 0, dst_name, 0);
+ new_dst = dirent_create_key(trans, dst_dir, 0, dst_name,
+ dst_hash->cf_encoding ? &dst_name_lookup : NULL, 0);
ret = PTR_ERR_OR_ZERO(new_dst);
if (ret)
goto out;
/* Create new src key: */
if (mode == BCH_RENAME_EXCHANGE) {
- new_src = dirent_create_key(trans, src_dir, 0, src_name, 0);
+ new_src = dirent_create_key(trans, src_dir, 0, src_name,
+ src_hash->cf_encoding ? &src_name_lookup : NULL, 0);
ret = PTR_ERR_OR_ZERO(new_src);
if (ret)
goto out;
const struct qstr *name, subvol_inum *inum,
unsigned flags)
{
+ struct qstr lookup_name;
+ int ret = bch2_maybe_casefold(trans, hash_info, name, &lookup_name);
+ if (ret)
+ return ret;
+
struct bkey_s_c k = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc,
- hash_info, dir, name, flags);
- int ret = bkey_err(k);
+ hash_info, dir, &lookup_name, flags);
+ ret = bkey_err(k);
if (ret)
goto err;