domain_id=%s Specify a trusted domain ID for fscache mode so that
different images with the same blobs, identified by blob IDs,
can share storage within the same trusted domain.
+ It is also used when inode page sharing is enabled, so that
+ filesystems within the same trusted domain can share their
+ page cache.
fsoffset=%llu Specify block-aligned filesystem offset for the primary device.
+inode_share Enable inode page sharing for this filesystem. Inodes with
+ identical content under the same domain ID can share the
+ page cache.
=================== =========================================================
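+
+For instance, with an image that carries the on-disk ishare xattrs, page
+cache sharing could be enabled as follows (device, mount point and
+domain name are illustrative)::
+
+	mount -t erofs -o domain_id=mydomain,inode_share /dev/loop0 /mnt/erofs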
Sysfs Entries
erofs-$(CONFIG_EROFS_FS_ZIP_ACCEL) += decompressor_crypto.o
erofs-$(CONFIG_EROFS_FS_BACKED_BY_FILE) += fileio.o
erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o
+erofs-$(CONFIG_EROFS_FS_PAGE_CACHE_SHARE) += ishare.o
#define EROFS_MOUNT_DAX_ALWAYS 0x00000040
#define EROFS_MOUNT_DAX_NEVER 0x00000080
#define EROFS_MOUNT_DIRECT_IO 0x00000100
+#define EROFS_MOUNT_INODE_SHARE 0x00000200
#define clear_opt(opt, option) ((opt)->mount_opt &= ~EROFS_MOUNT_##option)
#define set_opt(opt, option) ((opt)->mount_opt |= EROFS_MOUNT_##option)
/* default readahead size of directories */
#define EROFS_DIR_RA_BYTES 16384
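+/* an opaque fingerprint identifying inode content for page cache sharing */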
+struct erofs_inode_fingerprint {
+ u8 *opaque;
+ int size;
+};
+
struct erofs_inode {
erofs_nid_t nid;
};
#endif /* CONFIG_EROFS_FS_ZIP */
};
+#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE
+ struct list_head ishare_list;
+ union {
+ /* for anonymous shared inodes */
+ struct {
+ struct erofs_inode_fingerprint fingerprint;
+ spinlock_t ishare_lock;
+ };
+ /* for real inodes bound to a shared inode */
+ struct inode *sharedinode;
+ };
+#endif
/* the corresponding vfs inode */
struct inode vfs_inode;
};
extern const struct file_operations erofs_file_fops;
extern const struct file_operations erofs_dir_fops;
+extern const struct file_operations erofs_ishare_fops;
extern const struct iomap_ops z_erofs_iomap_report_ops;
static inline void erofs_fscache_submit_bio(struct bio *bio) {}
#endif
+#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE
+int __init erofs_init_ishare(void);
+void erofs_exit_ishare(void);
+bool erofs_ishare_fill_inode(struct inode *inode);
+void erofs_ishare_free_inode(struct inode *inode);
+#else
+static inline int erofs_init_ishare(void) { return 0; }
+static inline void erofs_exit_ishare(void) {}
+static inline bool erofs_ishare_fill_inode(struct inode *inode) { return false; }
+static inline void erofs_ishare_free_inode(struct inode *inode) {}
+#endif
+
long erofs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg);
long erofs_compat_ioctl(struct file *filp, unsigned int cmd,
unsigned long arg);
--- /dev/null
+// SPDX-License-Identifier: GPL-2.0-or-later
+/*
+ * Copyright (C) 2024, Alibaba Cloud
+ */
+#include <linux/xxhash.h>
+#include <linux/mount.h>
+#include "internal.h"
+#include "xattr.h"
+
+#include "../internal.h"
+
+static struct vfsmount *erofs_ishare_mnt;
+
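+/* iget5 test/set callbacks: two shared inodes match iff their fingerprints match */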
+static int erofs_ishare_iget5_eq(struct inode *inode, void *data)
+{
+ struct erofs_inode_fingerprint *fp1 = &EROFS_I(inode)->fingerprint;
+ struct erofs_inode_fingerprint *fp2 = data;
+
+ return fp1->size == fp2->size &&
+ !memcmp(fp1->opaque, fp2->opaque, fp2->size);
+}
+
+static int erofs_ishare_iget5_set(struct inode *inode, void *data)
+{
+ struct erofs_inode *vi = EROFS_I(inode);
+
+ vi->fingerprint = *(struct erofs_inode_fingerprint *)data;
+ INIT_LIST_HEAD(&vi->ishare_list);
+ spin_lock_init(&vi->ishare_lock);
+ return 0;
+}
+
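+/*
+ * Bind @inode to an anonymous shared inode keyed by its content
+ * fingerprint: inodes with identical fingerprints resolve to the same
+ * shared inode and therefore share one page cache.
+ */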
+bool erofs_ishare_fill_inode(struct inode *inode)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb);
+ struct erofs_inode *vi = EROFS_I(inode);
+ struct erofs_inode_fingerprint fp;
+ struct inode *sharedinode;
+ unsigned long hash;
+
+ if (erofs_xattr_fill_inode_fingerprint(&fp, inode, sbi->domain_id))
+ return false;
+ hash = xxh32(fp.opaque, fp.size, 0);
+ sharedinode = iget5_locked(erofs_ishare_mnt->mnt_sb, hash,
+ erofs_ishare_iget5_eq, erofs_ishare_iget5_set,
+ &fp);
+ if (!sharedinode) {
+ kfree(fp.opaque);
+ return false;
+ }
+
+ if (inode_state_read_once(sharedinode) & I_NEW) {
+ if (erofs_inode_set_aops(sharedinode, inode, true)) {
+ iget_failed(sharedinode);
+ kfree(fp.opaque);
+ return false;
+ }
+ sharedinode->i_size = vi->vfs_inode.i_size;
+ unlock_new_inode(sharedinode);
+ } else {
+ kfree(fp.opaque);
+ if (sharedinode->i_size != vi->vfs_inode.i_size) {
+ _erofs_printk(inode->i_sb, KERN_WARNING
+ "size(%lld:%lld) not matches for the same fingerprint\n",
+ vi->vfs_inode.i_size, sharedinode->i_size);
+ iput(sharedinode);
+ return false;
+ }
+ }
+ vi->sharedinode = sharedinode;
+ INIT_LIST_HEAD(&vi->ishare_list);
+ spin_lock(&EROFS_I(sharedinode)->ishare_lock);
+ list_add(&vi->ishare_list, &EROFS_I(sharedinode)->ishare_list);
+ spin_unlock(&EROFS_I(sharedinode)->ishare_lock);
+ return true;
+}
+
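+/* detach @inode from its shared inode and drop the reference taken at bind time */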
+void erofs_ishare_free_inode(struct inode *inode)
+{
+ struct erofs_inode *vi = EROFS_I(inode);
+ struct inode *sharedinode = vi->sharedinode;
+
+ if (!sharedinode)
+ return;
+ spin_lock(&EROFS_I(sharedinode)->ishare_lock);
+ list_del(&vi->ishare_list);
+ spin_unlock(&EROFS_I(sharedinode)->ishare_lock);
+ iput(sharedinode);
+ vi->sharedinode = NULL;
+}
+
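+/*
+ * Wrap the shared inode in an empty backing file on open, so that reads
+ * and mmaps are served from the shared page cache while the user-visible
+ * path still refers to the file actually opened.
+ */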
+static int erofs_ishare_file_open(struct inode *inode, struct file *file)
+{
+ struct inode *sharedinode = EROFS_I(inode)->sharedinode;
+ struct file *realfile;
+
+ if (file->f_flags & O_DIRECT)
+ return -EINVAL;
+ realfile = alloc_empty_backing_file(O_RDONLY|O_NOATIME, current_cred());
+ if (IS_ERR(realfile))
+ return PTR_ERR(realfile);
+ ihold(sharedinode);
+ realfile->f_op = &erofs_file_fops;
+ realfile->f_inode = sharedinode;
+ realfile->f_mapping = sharedinode->i_mapping;
+ path_get(&file->f_path);
+ backing_file_set_user_path(realfile, &file->f_path);
+
+ file_ra_state_init(&realfile->f_ra, file->f_mapping);
+ realfile->private_data = EROFS_I(inode);
+ file->private_data = realfile;
+ return 0;
+}
+
+static int erofs_ishare_file_release(struct inode *inode, struct file *file)
+{
+ struct file *realfile = file->private_data;
+
+ iput(realfile->f_inode);
+ fput(realfile);
+ file->private_data = NULL;
+ return 0;
+}
+
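+/* serve buffered reads from the shared inode by cloning the kiocb onto the backing file */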
+static ssize_t erofs_ishare_file_read_iter(struct kiocb *iocb,
+ struct iov_iter *to)
+{
+ struct file *realfile = iocb->ki_filp->private_data;
+ struct kiocb dedup_iocb;
+ ssize_t nread;
+
+ if (!iov_iter_count(to))
+ return 0;
+ kiocb_clone(&dedup_iocb, iocb, realfile);
+ nread = filemap_read(&dedup_iocb, to, 0);
+ iocb->ki_pos = dedup_iocb.ki_pos;
+ return nread;
+}
+
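+/* map the backing file so that page faults are satisfied from the shared page cache */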
+static int erofs_ishare_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct file *realfile = file->private_data;
+
+ vma_set_file(vma, realfile);
+ return generic_file_readonly_mmap(file, vma);
+}
+
+const struct file_operations erofs_ishare_fops = {
+ .open = erofs_ishare_file_open,
+ .llseek = generic_file_llseek,
+ .read_iter = erofs_ishare_file_read_iter,
+ .mmap = erofs_ishare_mmap,
+ .release = erofs_ishare_file_release,
+ .get_unmapped_area = thp_get_unmapped_area,
+ .splice_read = filemap_splice_read,
+};
+
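+/*
+ * All shared anonymous inodes live on a single kernel-internal mount of
+ * erofs_anon_fs_type, set up once at module initialization.
+ */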
+int __init erofs_init_ishare(void)
+{
+ erofs_ishare_mnt = kern_mount(&erofs_anon_fs_type);
+ return PTR_ERR_OR_ZERO(erofs_ishare_mnt);
+}
+
+void erofs_exit_ishare(void)
+{
+ kern_unmount(erofs_ishare_mnt);
+}
enum {
Opt_user_xattr, Opt_acl, Opt_cache_strategy, Opt_dax, Opt_dax_enum,
Opt_device, Opt_fsid, Opt_domain_id, Opt_directio, Opt_fsoffset,
+ Opt_inode_share,
};
static const struct constant_table erofs_param_cache_strategy[] = {
fsparam_string("domain_id", Opt_domain_id),
fsparam_flag_no("directio", Opt_directio),
fsparam_u64("fsoffset", Opt_fsoffset),
+ fsparam_flag("inode_share", Opt_inode_share),
{}
};
if (!sbi->fsid)
return -ENOMEM;
break;
+#endif
+#if defined(CONFIG_EROFS_FS_ONDEMAND) || defined(CONFIG_EROFS_FS_PAGE_CACHE_SHARE)
case Opt_domain_id:
kfree_sensitive(sbi->domain_id);
sbi->domain_id = no_free_ptr(param->string);
case Opt_fsoffset:
sbi->dif0.fsoff = result.uint_64;
break;
+ case Opt_inode_share:
+#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE
+ set_opt(&sbi->opt, INODE_SHARE);
+#else
+ errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name);
+#endif
+ break;
}
return 0;
}
sb->s_maxbytes = MAX_LFS_FILESIZE;
sb->s_op = &erofs_sops;
+ if (!sbi->domain_id && test_opt(&sbi->opt, INODE_SHARE)) {
+ errorfc(fc, "domain_id is needed when inode_ishare is on");
+ return -EINVAL;
+ }
+ if (test_opt(&sbi->opt, DAX_ALWAYS) && test_opt(&sbi->opt, INODE_SHARE)) {
+ errorfc(fc, "FSDAX is not allowed when inode_ishare is on");
+ return -EINVAL;
+ }
+
sbi->blkszbits = PAGE_SHIFT;
if (!sb->s_bdev) {
/*
erofs_info(sb, "unsupported blocksize for DAX");
clear_opt(&sbi->opt, DAX_ALWAYS);
}
+ if (test_opt(&sbi->opt, INODE_SHARE) && !erofs_sb_has_ishare_xattrs(sbi)) {
+ erofs_info(sb, "on-disk ishare xattrs not found. Turning off inode_share.");
+ clear_opt(&sbi->opt, INODE_SHARE);
+ }
+ if (test_opt(&sbi->opt, INODE_SHARE))
+ erofs_info(sb, "EXPERIMENTAL EROFS page cache share support in use. Use at your own risk!");
sb->s_time_gran = 1;
sb->s_xattr = erofs_xattr_handlers;
};
MODULE_ALIAS_FS("erofs");
-#if defined(CONFIG_EROFS_FS_ONDEMAND)
+#if defined(CONFIG_EROFS_FS_ONDEMAND) || defined(CONFIG_EROFS_FS_PAGE_CACHE_SHARE)
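+/* anonymous inodes may carry a fingerprint buffer that must be freed as well */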
+static void erofs_free_anon_inode(struct inode *inode)
+{
+ struct erofs_inode *vi = EROFS_I(inode);
+
+#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE
+ kfree(vi->fingerprint.opaque);
+#endif
+ kmem_cache_free(erofs_inode_cachep, vi);
+}
+
+static const struct super_operations erofs_anon_sops = {
+ .alloc_inode = erofs_alloc_inode,
+ .drop_inode = inode_just_drop,
+ .free_inode = erofs_free_anon_inode,
+};
+
static int erofs_anon_init_fs_context(struct fs_context *fc)
{
- return init_pseudo(fc, EROFS_SUPER_MAGIC) ? 0 : -ENOMEM;
+ struct pseudo_fs_context *ctx;
+
+ ctx = init_pseudo(fc, EROFS_SUPER_MAGIC);
+ if (!ctx)
+ return -ENOMEM;
+ ctx->ops = &erofs_anon_sops;
+ return 0;
}
struct file_system_type erofs_anon_fs_type = {
if (err)
goto sysfs_err;
+ err = erofs_init_ishare();
+ if (err)
+ goto ishare_err;
+
err = register_filesystem(&erofs_fs_type);
if (err)
goto fs_err;
return 0;
fs_err:
+ erofs_exit_ishare();
+ishare_err:
erofs_exit_sysfs();
sysfs_err:
z_erofs_exit_subsystem();
/* Ensure all RCU free inodes / pclusters are safe to be destroyed. */
rcu_barrier();
+ erofs_exit_ishare();
erofs_exit_sysfs();
z_erofs_exit_subsystem();
erofs_exit_shrinker();
#endif
if (sbi->dif0.fsoff)
seq_printf(seq, ",fsoffset=%llu", sbi->dif0.fsoff);
+ if (test_opt(opt, INODE_SHARE))
+ seq_puts(seq, ",inode_share");
return 0;
}
dax_break_layout_final(inode);
#endif
+ erofs_ishare_free_inode(inode);
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
}
return acl;
}
#endif
+
+#ifdef CONFIG_EROFS_FS_PAGE_CACHE_SHARE
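+/*
+ * Build the content fingerprint from the on-disk ishare xattr value,
+ * suffixed with the domain_id so that sharing never crosses trusted
+ * domains.
+ */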
+int erofs_xattr_fill_inode_fingerprint(struct erofs_inode_fingerprint *fp,
+ struct inode *inode, const char *domain_id)
+{
+ struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb);
+ struct erofs_xattr_prefix_item *prefix;
+ const char *infix;
+ int valuelen, base_index;
+
+ if (!test_opt(&sbi->opt, INODE_SHARE))
+ return -EOPNOTSUPP;
+ if (!sbi->xattr_prefixes)
+ return -EINVAL;
+ prefix = sbi->xattr_prefixes + sbi->ishare_xattr_prefix_id;
+ infix = prefix->prefix->infix;
+ base_index = prefix->prefix->base_index;
+ valuelen = erofs_getxattr(inode, base_index, infix, NULL, 0);
+ if (valuelen <= 0 || valuelen > (1 << sbi->blkszbits))
+ return -EFSCORRUPTED;
+ fp->size = valuelen + (domain_id ? strlen(domain_id) : 0);
+ fp->opaque = kmalloc(fp->size, GFP_KERNEL);
+ if (!fp->opaque)
+ return -ENOMEM;
+ if (valuelen != erofs_getxattr(inode, base_index, infix,
+ fp->opaque, valuelen)) {
+ kfree(fp->opaque);
+ fp->opaque = NULL;
+ return -EFSCORRUPTED;
+ }
+ if (domain_id)
+	memcpy(fp->opaque + valuelen, domain_id, fp->size - valuelen);
+ return 0;
+}
+#endif
#define erofs_get_acl (NULL)
#endif
+int erofs_xattr_fill_inode_fingerprint(struct erofs_inode_fingerprint *fp,
+ struct inode *inode, const char *domain_id);
+
#endif