From: Sasha Levin Date: Mon, 9 Sep 2024 12:47:27 +0000 (-0400) Subject: Fixes for 6.1 X-Git-Tag: v4.19.322~44^2~4 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=2a1e70b9ad163763148c23d0c571b622849579d8;p=thirdparty%2Fkernel%2Fstable-queue.git Fixes for 6.1 Signed-off-by: Sasha Levin --- diff --git a/queue-6.1/fuse-add-expire-only-mode-to-fuse_notify_inval_entry.patch b/queue-6.1/fuse-add-expire-only-mode-to-fuse_notify_inval_entry.patch new file mode 100644 index 00000000000..e9c741dcc04 --- /dev/null +++ b/queue-6.1/fuse-add-expire-only-mode-to-fuse_notify_inval_entry.patch @@ -0,0 +1,140 @@ +From c9b538beeee36e0c0ad92404ef5c7c4071e1aed2 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 28 Oct 2022 14:25:21 +0200 +Subject: fuse: add "expire only" mode to FUSE_NOTIFY_INVAL_ENTRY + +From: Miklos Szeredi + +[ Upstream commit 4f8d37020e1fd0bf6ee9381ba918135ef3712efd ] + +Add a flag to entry expiration that lets the filesystem expire a dentry +without kicking it out from the cache immediately. + +This makes a difference for overmounted dentries, where plain invalidation +would detach all submounts before dropping the dentry from the cache. If +only expiry is set on the dentry, then any overmounts are left alone and +until ->d_revalidate() is called. + +Note: ->d_revalidate() is not called for the case of following a submount, +so invalidation will only be triggered for the non-overmounted case. The +dentry could also be mounted in a different mount instance, in which case +any submounts will still be detached. + +Suggested-by: Jakob Blomer +Signed-off-by: Miklos Szeredi +Stable-dep-of: 3002240d1649 ("fuse: fix memory leak in fuse_create_open") +Signed-off-by: Sasha Levin +--- + fs/fuse/dev.c | 4 ++-- + fs/fuse/dir.c | 6 ++++-- + fs/fuse/fuse_i.h | 2 +- + include/uapi/linux/fuse.h | 13 +++++++++++-- + 4 files changed, 18 insertions(+), 7 deletions(-) + +diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c +index 96a717f73ce3..61bef919c042 100644 +--- a/fs/fuse/dev.c ++++ b/fs/fuse/dev.c +@@ -1498,7 +1498,7 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, + buf[outarg.namelen] = 0; + + down_read(&fc->killsb); +- err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name); ++ err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name, outarg.flags); + up_read(&fc->killsb); + kfree(buf); + return err; +@@ -1546,7 +1546,7 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size, + buf[outarg.namelen] = 0; + + down_read(&fc->killsb); +- err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name); ++ err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name, 0); + up_read(&fc->killsb); + kfree(buf); + return err; +diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c +index 936a24b646ce..8474003aa54d 100644 +--- a/fs/fuse/dir.c ++++ b/fs/fuse/dir.c +@@ -1174,7 +1174,7 @@ int fuse_update_attributes(struct inode *inode, struct file *file, u32 mask) + } + + int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, +- u64 child_nodeid, struct qstr *name) ++ u64 child_nodeid, struct qstr *name, u32 flags) + { + int err = -ENOTDIR; + struct inode *parent; +@@ -1201,7 +1201,9 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, + goto unlock; + + fuse_dir_changed(parent); +- fuse_invalidate_entry(entry); ++ if (!(flags & FUSE_EXPIRE_ONLY)) ++ d_invalidate(entry); ++ fuse_invalidate_entry_cache(entry); + + if (child_nodeid != 0 && d_really_is_positive(entry)) { + inode_lock(d_inode(entry)); +diff --git 
a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h +index 66c2a9999468..cb464e5b171a 100644 +--- a/fs/fuse/fuse_i.h ++++ b/fs/fuse/fuse_i.h +@@ -1235,7 +1235,7 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid, + * then the dentry is unhashed (d_delete()). + */ + int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid, +- u64 child_nodeid, struct qstr *name); ++ u64 child_nodeid, struct qstr *name, u32 flags); + + int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file, + bool isdir); +diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h +index 76ee8f9e024a..39cfb343faa8 100644 +--- a/include/uapi/linux/fuse.h ++++ b/include/uapi/linux/fuse.h +@@ -197,6 +197,9 @@ + * + * 7.37 + * - add FUSE_TMPFILE ++ * ++ * 7.38 ++ * - add FUSE_EXPIRE_ONLY flag to fuse_notify_inval_entry + */ + + #ifndef _LINUX_FUSE_H +@@ -232,7 +235,7 @@ + #define FUSE_KERNEL_VERSION 7 + + /** Minor version number of this interface */ +-#define FUSE_KERNEL_MINOR_VERSION 37 ++#define FUSE_KERNEL_MINOR_VERSION 38 + + /** The node ID of the root inode */ + #define FUSE_ROOT_ID 1 +@@ -491,6 +494,12 @@ struct fuse_file_lock { + */ + #define FUSE_SETXATTR_ACL_KILL_SGID (1 << 0) + ++/** ++ * notify_inval_entry flags ++ * FUSE_EXPIRE_ONLY ++ */ ++#define FUSE_EXPIRE_ONLY (1 << 0) ++ + enum fuse_opcode { + FUSE_LOOKUP = 1, + FUSE_FORGET = 2, /* no reply */ +@@ -919,7 +928,7 @@ struct fuse_notify_inval_inode_out { + struct fuse_notify_inval_entry_out { + uint64_t parent; + uint32_t namelen; +- uint32_t padding; ++ uint32_t flags; + }; + + struct fuse_notify_delete_out { +-- +2.43.0 + diff --git a/queue-6.1/fuse-add-request-extension.patch b/queue-6.1/fuse-add-request-extension.patch new file mode 100644 index 00000000000..565f6b1aa54 --- /dev/null +++ b/queue-6.1/fuse-add-request-extension.patch @@ -0,0 +1,255 @@ +From d202afd43db8622cd99dfeff19a12cffd97646ca Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 10 Nov 2022 15:46:33 +0100 +Subject: fuse: add request extension + +From: Miklos Szeredi + +[ Upstream commit 15d937d7ca8c55d2b0ce9116e20c780fdd0b67cc ] + +Will need to add supplementary groups to create messages, so add the +general concept of a request extension. A request extension is appended to +the end of the main request. It has a header indicating the size and type +of the extension. + +The create security context (fuse_secctx_*) is similar to the generic +request extension, so include that as well in a backward compatible manner. + +Add the total extension length to the request header. 
The offset of the +extension block within the request can be calculated by: + + inh->len - inh->total_extlen * 8 + +Signed-off-by: Miklos Szeredi +Stable-dep-of: 3002240d1649 ("fuse: fix memory leak in fuse_create_open") +Signed-off-by: Sasha Levin +--- + fs/fuse/dev.c | 2 ++ + fs/fuse/dir.c | 66 ++++++++++++++++++++++----------------- + fs/fuse/fuse_i.h | 6 ++-- + include/uapi/linux/fuse.h | 28 ++++++++++++++++- + 4 files changed, 71 insertions(+), 31 deletions(-) + +diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c +index 61bef919c042..7e0d4f08a0cf 100644 +--- a/fs/fuse/dev.c ++++ b/fs/fuse/dev.c +@@ -476,6 +476,8 @@ static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args) + req->in.h.opcode = args->opcode; + req->in.h.nodeid = args->nodeid; + req->args = args; ++ if (args->is_ext) ++ req->in.h.total_extlen = args->in_args[args->ext_idx].size / 8; + if (args->end) + __set_bit(FR_ASYNC, &req->flags); + } +diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c +index 8474003aa54d..3b7887312ac0 100644 +--- a/fs/fuse/dir.c ++++ b/fs/fuse/dir.c +@@ -470,7 +470,7 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, + } + + static int get_security_context(struct dentry *entry, umode_t mode, +- void **security_ctx, u32 *security_ctxlen) ++ struct fuse_in_arg *ext) + { + struct fuse_secctx *fctx; + struct fuse_secctx_header *header; +@@ -517,14 +517,42 @@ static int get_security_context(struct dentry *entry, umode_t mode, + + memcpy(ptr, ctx, ctxlen); + } +- *security_ctxlen = total_len; +- *security_ctx = header; ++ ext->size = total_len; ++ ext->value = header; + err = 0; + out_err: + kfree(ctx); + return err; + } + ++static int get_create_ext(struct fuse_args *args, struct dentry *dentry, ++ umode_t mode) ++{ ++ struct fuse_conn *fc = get_fuse_conn_super(dentry->d_sb); ++ struct fuse_in_arg ext = { .size = 0, .value = NULL }; ++ int err = 0; ++ ++ if (fc->init_security) ++ err = get_security_context(dentry, mode, &ext); ++ ++ if (!err && ext.size) { ++ WARN_ON(args->in_numargs >= ARRAY_SIZE(args->in_args)); ++ args->is_ext = true; ++ args->ext_idx = args->in_numargs++; ++ args->in_args[args->ext_idx] = ext; ++ } else { ++ kfree(ext.value); ++ } ++ ++ return err; ++} ++ ++static void free_ext_value(struct fuse_args *args) ++{ ++ if (args->is_ext) ++ kfree(args->in_args[args->ext_idx].value); ++} ++ + /* + * Atomic create+open operation + * +@@ -545,8 +573,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, + struct fuse_entry_out outentry; + struct fuse_inode *fi; + struct fuse_file *ff; +- void *security_ctx = NULL; +- u32 security_ctxlen; + bool trunc = flags & O_TRUNC; + + /* Userspace expects S_IFREG in create mode */ +@@ -590,19 +616,12 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, + args.out_args[1].size = sizeof(outopen); + args.out_args[1].value = &outopen; + +- if (fm->fc->init_security) { +- err = get_security_context(entry, mode, &security_ctx, +- &security_ctxlen); +- if (err) +- goto out_put_forget_req; +- +- args.in_numargs = 3; +- args.in_args[2].size = security_ctxlen; +- args.in_args[2].value = security_ctx; +- } ++ err = get_create_ext(&args, entry, mode); ++ if (err) ++ goto out_put_forget_req; + + err = fuse_simple_request(fm, &args); +- kfree(security_ctx); ++ free_ext_value(&args); + if (err) + goto out_free_ff; + +@@ -709,8 +728,6 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, + struct dentry *d; + int err; + struct fuse_forget_link *forget; +- void *security_ctx = NULL; 
+- u32 security_ctxlen; + + if (fuse_is_bad(dir)) + return -EIO; +@@ -725,21 +742,14 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args, + args->out_args[0].size = sizeof(outarg); + args->out_args[0].value = &outarg; + +- if (fm->fc->init_security && args->opcode != FUSE_LINK) { +- err = get_security_context(entry, mode, &security_ctx, +- &security_ctxlen); ++ if (args->opcode != FUSE_LINK) { ++ err = get_create_ext(args, entry, mode); + if (err) + goto out_put_forget_req; +- +- BUG_ON(args->in_numargs != 2); +- +- args->in_numargs = 3; +- args->in_args[2].size = security_ctxlen; +- args->in_args[2].value = security_ctx; + } + + err = fuse_simple_request(fm, args); +- kfree(security_ctx); ++ free_ext_value(args); + if (err) + goto out_put_forget_req; + +diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h +index cb464e5b171a..6c3ec70c1b70 100644 +--- a/fs/fuse/fuse_i.h ++++ b/fs/fuse/fuse_i.h +@@ -264,8 +264,9 @@ struct fuse_page_desc { + struct fuse_args { + uint64_t nodeid; + uint32_t opcode; +- unsigned short in_numargs; +- unsigned short out_numargs; ++ uint8_t in_numargs; ++ uint8_t out_numargs; ++ uint8_t ext_idx; + bool force:1; + bool noreply:1; + bool nocreds:1; +@@ -276,6 +277,7 @@ struct fuse_args { + bool page_zeroing:1; + bool page_replace:1; + bool may_block:1; ++ bool is_ext:1; + struct fuse_in_arg in_args[3]; + struct fuse_arg out_args[2]; + void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error); +diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h +index e3c54109bae9..c71f12429e3d 100644 +--- a/include/uapi/linux/fuse.h ++++ b/include/uapi/linux/fuse.h +@@ -201,6 +201,9 @@ + * 7.38 + * - add FUSE_EXPIRE_ONLY flag to fuse_notify_inval_entry + * - add FOPEN_PARALLEL_DIRECT_WRITES ++ * - add total_extlen to fuse_in_header ++ * - add FUSE_MAX_NR_SECCTX ++ * - add extension header + */ + + #ifndef _LINUX_FUSE_H +@@ -503,6 +506,15 @@ struct fuse_file_lock { + */ + #define FUSE_EXPIRE_ONLY (1 << 0) + ++/** ++ * extension type ++ * FUSE_MAX_NR_SECCTX: maximum value of &fuse_secctx_header.nr_secctx ++ */ ++enum fuse_ext_type { ++ /* Types 0..31 are reserved for fuse_secctx_header */ ++ FUSE_MAX_NR_SECCTX = 31, ++}; ++ + enum fuse_opcode { + FUSE_LOOKUP = 1, + FUSE_FORGET = 2, /* no reply */ +@@ -886,7 +898,8 @@ struct fuse_in_header { + uint32_t uid; + uint32_t gid; + uint32_t pid; +- uint32_t padding; ++ uint16_t total_extlen; /* length of extensions in 8byte units */ ++ uint16_t padding; + }; + + struct fuse_out_header { +@@ -1047,4 +1060,17 @@ struct fuse_secctx_header { + uint32_t nr_secctx; + }; + ++/** ++ * struct fuse_ext_header - extension header ++ * @size: total size of this extension including this header ++ * @type: type of extension ++ * ++ * This is made compatible with fuse_secctx_header by using type values > ++ * FUSE_MAX_NR_SECCTX ++ */ ++struct fuse_ext_header { ++ uint32_t size; ++ uint32_t type; ++}; ++ + #endif /* _LINUX_FUSE_H */ +-- +2.43.0 + diff --git a/queue-6.1/fuse-allow-non-extending-parallel-direct-writes-on-t.patch b/queue-6.1/fuse-allow-non-extending-parallel-direct-writes-on-t.patch new file mode 100644 index 00000000000..61238baa9fb --- /dev/null +++ b/queue-6.1/fuse-allow-non-extending-parallel-direct-writes-on-t.patch @@ -0,0 +1,168 @@ +From 7d1ee672f2616d775939b066f475cd5168f32362 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 17 Jun 2022 12:40:27 +0530 +Subject: fuse: allow non-extending parallel direct writes on the same file + +From: Dharmendra Singh + +[ Upstream commit 
153524053bbb0d27bb2e0be36d1b46862e9ce74c ] + +In general, as of now, in FUSE, direct writes on the same file are +serialized over inode lock i.e we hold inode lock for the full duration of +the write request. I could not find in fuse code and git history a comment +which clearly explains why this exclusive lock is taken for direct writes. +Following might be the reasons for acquiring an exclusive lock but not be +limited to + + 1) Our guess is some USER space fuse implementations might be relying on + this lock for serialization. + + 2) The lock protects against file read/write size races. + + 3) Ruling out any issues arising from partial write failures. + +This patch relaxes the exclusive lock for direct non-extending writes only. +File size extending writes might not need the lock either, but we are not +entirely sure if there is a risk to introduce any kind of regression. +Furthermore, benchmarking with fio does not show a difference between patch +versions that take on file size extension a) an exclusive lock and b) a +shared lock. + +A possible example of an issue with i_size extending writes are write error +cases. Some writes might succeed and others might fail for file system +internal reasons - for example ENOSPACE. With parallel file size extending +writes it _might_ be difficult to revert the action of the failing write, +especially to restore the right i_size. + +With these changes, we allow non-extending parallel direct writes on the +same file with the help of a flag called FOPEN_PARALLEL_DIRECT_WRITES. If +this flag is set on the file (flag is passed from libfuse to fuse kernel as +part of file open/create), we do not take exclusive lock anymore, but +instead use a shared lock that allows non-extending writes to run in +parallel. FUSE implementations which rely on this inode lock for +serialization can continue to do so and serialized direct writes are still +the default. Implementations that do not do write serialization need to be +updated and need to set the FOPEN_PARALLEL_DIRECT_WRITES flag in their file +open/create reply. + +On patch review there were concerns that network file systems (or vfs +multiple mounts of the same file system) might have issues with parallel +writes. We believe this is not the case, as this is just a local lock, +which network file systems could not rely on anyway. I.e. this lock is +just for local consistency. 
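For the server side, the opt-in is per open file. A minimal low-level
open handler could look like the sketch below. One hedge up front: it
assumes a libfuse recent enough to expose a parallel_direct_writes bit
on struct fuse_file_info, which the library would translate into
FOPEN_PARALLEL_DIRECT_WRITES in the open reply; that field name is an
assumption here, not something this kernel patch defines.

#define FUSE_USE_VERSION 312
#include <fuse_lowlevel.h>

/* Sketch: opt one file handle in to shared-lock direct writes.
 * Assumes libfuse exposes a parallel_direct_writes bit on
 * struct fuse_file_info (an assumption, not part of this patch)
 * and maps it to FOPEN_PARALLEL_DIRECT_WRITES in fuse_open_out. */
static void sketch_open(fuse_req_t req, fuse_ino_t ino,
                        struct fuse_file_info *fi)
{
	(void)ino;
	fi->direct_io = 1;              /* writes bypass the page cache */
	fi->parallel_direct_writes = 1; /* skip the exclusive inode lock
	                                   for non-extending writes */
	fuse_reply_open(req, fi);
}

Servers that rely on the inode lock for serialization simply leave the
flag clear and keep the default exclusive-lock behavior.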
+ +Signed-off-by: Dharmendra Singh +Signed-off-by: Bernd Schubert +Signed-off-by: Miklos Szeredi +Stable-dep-of: 3002240d1649 ("fuse: fix memory leak in fuse_create_open") +Signed-off-by: Sasha Levin +--- + fs/fuse/file.c | 43 ++++++++++++++++++++++++++++++++++++--- + include/uapi/linux/fuse.h | 3 +++ + 2 files changed, 43 insertions(+), 3 deletions(-) + +diff --git a/fs/fuse/file.c b/fs/fuse/file.c +index e6ec4338a9c5..0df1311afb87 100644 +--- a/fs/fuse/file.c ++++ b/fs/fuse/file.c +@@ -1563,14 +1563,47 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to) + return res; + } + ++static bool fuse_direct_write_extending_i_size(struct kiocb *iocb, ++ struct iov_iter *iter) ++{ ++ struct inode *inode = file_inode(iocb->ki_filp); ++ ++ return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode); ++} ++ + static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) + { + struct inode *inode = file_inode(iocb->ki_filp); ++ struct file *file = iocb->ki_filp; ++ struct fuse_file *ff = file->private_data; + struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb); + ssize_t res; ++ bool exclusive_lock = ++ !(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES) || ++ iocb->ki_flags & IOCB_APPEND || ++ fuse_direct_write_extending_i_size(iocb, from); ++ ++ /* ++ * Take exclusive lock if ++ * - Parallel direct writes are disabled - a user space decision ++ * - Parallel direct writes are enabled and i_size is being extended. ++ * This might not be needed at all, but needs further investigation. ++ */ ++ if (exclusive_lock) ++ inode_lock(inode); ++ else { ++ inode_lock_shared(inode); ++ ++ /* A race with truncate might have come up as the decision for ++ * the lock type was done without holding the lock, check again. ++ */ ++ if (fuse_direct_write_extending_i_size(iocb, from)) { ++ inode_unlock_shared(inode); ++ inode_lock(inode); ++ exclusive_lock = true; ++ } ++ } + +- /* Don't allow parallel writes to the same file */ +- inode_lock(inode); + res = generic_write_checks(iocb, from); + if (res > 0) { + if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) { +@@ -1581,7 +1614,10 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) + fuse_write_update_attr(inode, iocb->ki_pos, res); + } + } +- inode_unlock(inode); ++ if (exclusive_lock) ++ inode_unlock(inode); ++ else ++ inode_unlock_shared(inode); + + return res; + } +@@ -2937,6 +2973,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter) + + if (iov_iter_rw(iter) == WRITE) { + fuse_write_update_attr(inode, pos, ret); ++ /* For extending writes we already hold exclusive lock */ + if (ret < 0 && offset + count > i_size) + fuse_do_truncate(file); + } +diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h +index 39cfb343faa8..e3c54109bae9 100644 +--- a/include/uapi/linux/fuse.h ++++ b/include/uapi/linux/fuse.h +@@ -200,6 +200,7 @@ + * + * 7.38 + * - add FUSE_EXPIRE_ONLY flag to fuse_notify_inval_entry ++ * - add FOPEN_PARALLEL_DIRECT_WRITES + */ + + #ifndef _LINUX_FUSE_H +@@ -307,6 +308,7 @@ struct fuse_file_lock { + * FOPEN_CACHE_DIR: allow caching this directory + * FOPEN_STREAM: the file is stream-like (no file position at all) + * FOPEN_NOFLUSH: don't flush data cache on close (unless FUSE_WRITEBACK_CACHE) ++ * FOPEN_PARALLEL_DIRECT_WRITES: Allow concurrent direct writes on the same inode + */ + #define FOPEN_DIRECT_IO (1 << 0) + #define FOPEN_KEEP_CACHE (1 << 1) +@@ -314,6 +316,7 @@ struct fuse_file_lock { + #define FOPEN_CACHE_DIR (1 << 3) + #define FOPEN_STREAM 
(1 << 4) + #define FOPEN_NOFLUSH (1 << 5) ++#define FOPEN_PARALLEL_DIRECT_WRITES (1 << 6) + + /** + * INIT request/reply flags +-- +2.43.0 + diff --git a/queue-6.1/fuse-fix-memory-leak-in-fuse_create_open.patch b/queue-6.1/fuse-fix-memory-leak-in-fuse_create_open.patch new file mode 100644 index 00000000000..b1b76bca6eb --- /dev/null +++ b/queue-6.1/fuse-fix-memory-leak-in-fuse_create_open.patch @@ -0,0 +1,37 @@ +From a84b7dea9c08ad2f60802e7b8618a354d227351b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 23 Aug 2024 16:51:46 +0800 +Subject: fuse: fix memory leak in fuse_create_open + +From: yangyun + +[ Upstream commit 3002240d16494d798add0575e8ba1f284258ab34 ] + +The memory of struct fuse_file is allocated but not freed +when get_create_ext return error. + +Fixes: 3e2b6fdbdc9a ("fuse: send security context of inode on file") +Cc: stable@vger.kernel.org # v5.17 +Signed-off-by: yangyun +Signed-off-by: Miklos Szeredi +Signed-off-by: Sasha Levin +--- + fs/fuse/dir.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c +index 3b7887312ac0..aa2be4c1ea8f 100644 +--- a/fs/fuse/dir.c ++++ b/fs/fuse/dir.c +@@ -618,7 +618,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry, + + err = get_create_ext(&args, entry, mode); + if (err) +- goto out_put_forget_req; ++ goto out_free_ff; + + err = fuse_simple_request(fm, &args); + free_ext_value(&args); +-- +2.43.0 + diff --git a/queue-6.1/mm-fix-pmd_read_atomic.patch b/queue-6.1/mm-fix-pmd_read_atomic.patch new file mode 100644 index 00000000000..0a61371dd51 --- /dev/null +++ b/queue-6.1/mm-fix-pmd_read_atomic.patch @@ -0,0 +1,173 @@ +From 736b31e4394c8b483561e4ac1fd71eefb2bbec03 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 26 Nov 2020 17:16:22 +0100 +Subject: mm: Fix pmd_read_atomic() + +From: Peter Zijlstra + +[ Upstream commit 024d232ae4fcd7a7ce8ea239607d6c1246d7adc8 ] + +AFAICT there's no reason to do anything different than what we do for +PTEs. Make it so (also affects SH). + +Signed-off-by: Peter Zijlstra (Intel) +Link: https://lkml.kernel.org/r/20221022114424.711181252%40infradead.org +Stable-dep-of: 71c186efc1b2 ("userfaultfd: fix checks for huge PMDs") +Signed-off-by: Sasha Levin +--- + arch/x86/include/asm/pgtable-3level.h | 56 --------------------------- + include/linux/pgtable.h | 47 +++++++++++++++++----- + 2 files changed, 37 insertions(+), 66 deletions(-) + +diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h +index 28556d22feb8..94f50b0100a5 100644 +--- a/arch/x86/include/asm/pgtable-3level.h ++++ b/arch/x86/include/asm/pgtable-3level.h +@@ -34,62 +34,6 @@ static inline void native_set_pte(pte_t *ptep, pte_t pte) + ptep->pte_low = pte.pte_low; + } + +-#define pmd_read_atomic pmd_read_atomic +-/* +- * pte_offset_map_lock() on 32-bit PAE kernels was reading the pmd_t with +- * a "*pmdp" dereference done by GCC. Problem is, in certain places +- * where pte_offset_map_lock() is called, concurrent page faults are +- * allowed, if the mmap_lock is hold for reading. An example is mincore +- * vs page faults vs MADV_DONTNEED. On the page fault side +- * pmd_populate() rightfully does a set_64bit(), but if we're reading the +- * pmd_t with a "*pmdp" on the mincore side, a SMP race can happen +- * because GCC will not read the 64-bit value of the pmd atomically. 
+- * +- * To fix this all places running pte_offset_map_lock() while holding the +- * mmap_lock in read mode, shall read the pmdp pointer using this +- * function to know if the pmd is null or not, and in turn to know if +- * they can run pte_offset_map_lock() or pmd_trans_huge() or other pmd +- * operations. +- * +- * Without THP if the mmap_lock is held for reading, the pmd can only +- * transition from null to not null while pmd_read_atomic() runs. So +- * we can always return atomic pmd values with this function. +- * +- * With THP if the mmap_lock is held for reading, the pmd can become +- * trans_huge or none or point to a pte (and in turn become "stable") +- * at any time under pmd_read_atomic(). We could read it truly +- * atomically here with an atomic64_read() for the THP enabled case (and +- * it would be a whole lot simpler), but to avoid using cmpxchg8b we +- * only return an atomic pmdval if the low part of the pmdval is later +- * found to be stable (i.e. pointing to a pte). We are also returning a +- * 'none' (zero) pmdval if the low part of the pmd is zero. +- * +- * In some cases the high and low part of the pmdval returned may not be +- * consistent if THP is enabled (the low part may point to previously +- * mapped hugepage, while the high part may point to a more recently +- * mapped hugepage), but pmd_none_or_trans_huge_or_clear_bad() only +- * needs the low part of the pmd to be read atomically to decide if the +- * pmd is unstable or not, with the only exception when the low part +- * of the pmd is zero, in which case we return a 'none' pmd. +- */ +-static inline pmd_t pmd_read_atomic(pmd_t *pmdp) +-{ +- pmdval_t ret; +- u32 *tmp = (u32 *)pmdp; +- +- ret = (pmdval_t) (*tmp); +- if (ret) { +- /* +- * If the low part is null, we must not read the high part +- * or we can end up with a partial pmd. +- */ +- smp_rmb(); +- ret |= ((pmdval_t)*(tmp + 1)) << 32; +- } +- +- return (pmd_t) { .pmd = ret }; +-} +- + static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) + { + set_64bit((unsigned long long *)(ptep), native_pte_val(pte)); +diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h +index 5f0d7d0b9471..8f31e2ff6b58 100644 +--- a/include/linux/pgtable.h ++++ b/include/linux/pgtable.h +@@ -316,6 +316,13 @@ static inline pte_t ptep_get(pte_t *ptep) + } + #endif + ++#ifndef __HAVE_ARCH_PMDP_GET ++static inline pmd_t pmdp_get(pmd_t *pmdp) ++{ ++ return READ_ONCE(*pmdp); ++} ++#endif ++ + #ifdef CONFIG_GUP_GET_PTE_LOW_HIGH + /* + * WARNING: only to be used in the get_user_pages_fast() implementation. +@@ -361,15 +368,42 @@ static inline pte_t ptep_get_lockless(pte_t *ptep) + + return pte; + } +-#else /* CONFIG_GUP_GET_PTE_LOW_HIGH */ ++#define ptep_get_lockless ptep_get_lockless ++ ++#if CONFIG_PGTABLE_LEVELS > 2 ++static inline pmd_t pmdp_get_lockless(pmd_t *pmdp) ++{ ++ pmd_t pmd; ++ ++ do { ++ pmd.pmd_low = pmdp->pmd_low; ++ smp_rmb(); ++ pmd.pmd_high = pmdp->pmd_high; ++ smp_rmb(); ++ } while (unlikely(pmd.pmd_low != pmdp->pmd_low)); ++ ++ return pmd; ++} ++#define pmdp_get_lockless pmdp_get_lockless ++#endif /* CONFIG_PGTABLE_LEVELS > 2 */ ++#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */ ++ + /* + * We require that the PTE can be read atomically. 
+ */ ++#ifndef ptep_get_lockless + static inline pte_t ptep_get_lockless(pte_t *ptep) + { + return ptep_get(ptep); + } +-#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */ ++#endif ++ ++#ifndef pmdp_get_lockless ++static inline pmd_t pmdp_get_lockless(pmd_t *pmdp) ++{ ++ return pmdp_get(pmdp); ++} ++#endif + + #ifdef CONFIG_TRANSPARENT_HUGEPAGE + #ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR +@@ -1339,17 +1373,10 @@ static inline int pud_trans_unstable(pud_t *pud) + #endif + } + +-#ifndef pmd_read_atomic + static inline pmd_t pmd_read_atomic(pmd_t *pmdp) + { +- /* +- * Depend on compiler for an atomic pmd read. NOTE: this is +- * only going to work, if the pmdval_t isn't larger than +- * an unsigned long. +- */ +- return *pmdp; ++ return pmdp_get_lockless(pmdp); + } +-#endif + + #ifndef arch_needs_pgtable_deposit + #define arch_needs_pgtable_deposit() (false) +-- +2.43.0 + diff --git a/queue-6.1/mm-rename-pmd_read_atomic.patch b/queue-6.1/mm-rename-pmd_read_atomic.patch new file mode 100644 index 00000000000..ef80ca31aa3 --- /dev/null +++ b/queue-6.1/mm-rename-pmd_read_atomic.patch @@ -0,0 +1,142 @@ +From ffc5769c6f6c603e01a725883e4d5760ee72939b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 26 Nov 2020 17:20:28 +0100 +Subject: mm: Rename pmd_read_atomic() + +From: Peter Zijlstra + +[ Upstream commit dab6e717429e5ec795d558a0e9a5337a1ed33a3d ] + +There's no point in having the identical routines for PTE/PMD have +different names. + +Signed-off-by: Peter Zijlstra (Intel) +Link: https://lkml.kernel.org/r/20221022114424.841277397%40infradead.org +Stable-dep-of: 71c186efc1b2 ("userfaultfd: fix checks for huge PMDs") +Signed-off-by: Sasha Levin +--- + include/linux/pgtable.h | 9 ++------- + mm/hmm.c | 2 +- + mm/khugepaged.c | 2 +- + mm/mapping_dirty_helpers.c | 2 +- + mm/mprotect.c | 2 +- + mm/userfaultfd.c | 2 +- + mm/vmscan.c | 4 ++-- + 7 files changed, 9 insertions(+), 14 deletions(-) + +diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h +index 8f31e2ff6b58..3e3c00e80b65 100644 +--- a/include/linux/pgtable.h ++++ b/include/linux/pgtable.h +@@ -1373,11 +1373,6 @@ static inline int pud_trans_unstable(pud_t *pud) + #endif + } + +-static inline pmd_t pmd_read_atomic(pmd_t *pmdp) +-{ +- return pmdp_get_lockless(pmdp); +-} +- + #ifndef arch_needs_pgtable_deposit + #define arch_needs_pgtable_deposit() (false) + #endif +@@ -1404,13 +1399,13 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp) + */ + static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd) + { +- pmd_t pmdval = pmd_read_atomic(pmd); ++ pmd_t pmdval = pmdp_get_lockless(pmd); + /* + * The barrier will stabilize the pmdval in a register or on + * the stack so that it will stop changing under the code. + * + * When CONFIG_TRANSPARENT_HUGEPAGE=y on x86 32bit PAE, +- * pmd_read_atomic is allowed to return a not atomic pmdval ++ * pmdp_get_lockless is allowed to return a not atomic pmdval + * (for example pointing to an hugepage that has never been + * mapped in the pmd). The below checks will only care about + * the low part of the pmd with 32bit PAE x86 anyway, with the +diff --git a/mm/hmm.c b/mm/hmm.c +index 3850fb625dda..39cf50de76d7 100644 +--- a/mm/hmm.c ++++ b/mm/hmm.c +@@ -361,7 +361,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp, + * huge or device mapping one and compute corresponding pfn + * values. 
+ */ +- pmd = pmd_read_atomic(pmdp); ++ pmd = pmdp_get_lockless(pmdp); + barrier(); + if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd)) + goto again; +diff --git a/mm/khugepaged.c b/mm/khugepaged.c +index 085fca1fa27a..47010c3b5c4d 100644 +--- a/mm/khugepaged.c ++++ b/mm/khugepaged.c +@@ -866,7 +866,7 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm, + if (!*pmd) + return SCAN_PMD_NULL; + +- pmde = pmd_read_atomic(*pmd); ++ pmde = pmdp_get_lockless(*pmd); + + #ifdef CONFIG_TRANSPARENT_HUGEPAGE + /* See comments in pmd_none_or_trans_huge_or_clear_bad() */ +diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c +index 1b0ab8fcfd8b..175e424b9ab1 100644 +--- a/mm/mapping_dirty_helpers.c ++++ b/mm/mapping_dirty_helpers.c +@@ -126,7 +126,7 @@ static int clean_record_pte(pte_t *pte, unsigned long addr, + static int wp_clean_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end, + struct mm_walk *walk) + { +- pmd_t pmdval = pmd_read_atomic(pmd); ++ pmd_t pmdval = pmdp_get_lockless(pmd); + + if (!pmd_trans_unstable(&pmdval)) + return 0; +diff --git a/mm/mprotect.c b/mm/mprotect.c +index 668bfaa6ed2a..f006bafe338f 100644 +--- a/mm/mprotect.c ++++ b/mm/mprotect.c +@@ -294,7 +294,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb, + */ + static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd) + { +- pmd_t pmdval = pmd_read_atomic(pmd); ++ pmd_t pmdval = pmdp_get_lockless(pmd); + + /* See pmd_none_or_trans_huge_or_clear_bad for info on barrier */ + #ifdef CONFIG_TRANSPARENT_HUGEPAGE +diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c +index 992a0a16846f..5d873aadec76 100644 +--- a/mm/userfaultfd.c ++++ b/mm/userfaultfd.c +@@ -641,7 +641,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, + break; + } + +- dst_pmdval = pmd_read_atomic(dst_pmd); ++ dst_pmdval = pmdp_get_lockless(dst_pmd); + /* + * If the dst_pmd is mapped as THP don't + * override it and just be strict. +diff --git a/mm/vmscan.c b/mm/vmscan.c +index 4cd0cbf9c121..f5fa1c76d9e6 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -4068,9 +4068,9 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end, + /* walk_pte_range() may call get_next_vma() */ + vma = args->vma; + for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) { +- pmd_t val = pmd_read_atomic(pmd + i); ++ pmd_t val = pmdp_get_lockless(pmd + i); + +- /* for pmd_read_atomic() */ ++ /* for pmdp_get_lockless() */ + barrier(); + + next = pmd_addr_end(addr, end); +-- +2.43.0 + diff --git a/queue-6.1/net-mana-fix-error-handling-in-mana_create_txq-rxq-s.patch b/queue-6.1/net-mana-fix-error-handling-in-mana_create_txq-rxq-s.patch new file mode 100644 index 00000000000..f4437bf9f6f --- /dev/null +++ b/queue-6.1/net-mana-fix-error-handling-in-mana_create_txq-rxq-s.patch @@ -0,0 +1,125 @@ +From b1c444c28722aa7d59105a8826c4a2da636d24f5 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 2 Sep 2024 05:43:47 -0700 +Subject: net: mana: Fix error handling in mana_create_txq/rxq's NAPI cleanup + +From: Souradeep Chakrabarti + +[ Upstream commit b6ecc662037694488bfff7c9fd21c405df8411f2 ] + +Currently napi_disable() gets called during rxq and txq cleanup, +even before napi is enabled and hrtimer is initialized. It causes +kernel panic. + +? page_fault_oops+0x136/0x2b0 + ? page_counter_cancel+0x2e/0x80 + ? do_user_addr_fault+0x2f2/0x640 + ? refill_obj_stock+0xc4/0x110 + ? exc_page_fault+0x71/0x160 + ? asm_exc_page_fault+0x27/0x30 + ? __mmdrop+0x10/0x180 + ? 
__mmdrop+0xec/0x180 + ? hrtimer_active+0xd/0x50 + hrtimer_try_to_cancel+0x2c/0xf0 + hrtimer_cancel+0x15/0x30 + napi_disable+0x65/0x90 + mana_destroy_rxq+0x4c/0x2f0 + mana_create_rxq.isra.0+0x56c/0x6d0 + ? mana_uncfg_vport+0x50/0x50 + mana_alloc_queues+0x21b/0x320 + ? skb_dequeue+0x5f/0x80 + +Cc: stable@vger.kernel.org +Fixes: e1b5683ff62e ("net: mana: Move NAPI from EQ to CQ") +Signed-off-by: Souradeep Chakrabarti +Reviewed-by: Haiyang Zhang +Reviewed-by: Shradha Gupta +Signed-off-by: David S. Miller +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/microsoft/mana/mana.h | 2 ++ + drivers/net/ethernet/microsoft/mana/mana_en.c | 22 +++++++++++-------- + 2 files changed, 15 insertions(+), 9 deletions(-) + +diff --git a/drivers/net/ethernet/microsoft/mana/mana.h b/drivers/net/ethernet/microsoft/mana/mana.h +index 41c99eabf40a..2b00d6a29117 100644 +--- a/drivers/net/ethernet/microsoft/mana/mana.h ++++ b/drivers/net/ethernet/microsoft/mana/mana.h +@@ -86,6 +86,8 @@ struct mana_txq { + + atomic_t pending_sends; + ++ bool napi_initialized; ++ + struct mana_stats_tx stats; + }; + +diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c +index e7d1ce68f05e..b52612eef0a6 100644 +--- a/drivers/net/ethernet/microsoft/mana/mana_en.c ++++ b/drivers/net/ethernet/microsoft/mana/mana_en.c +@@ -1391,10 +1391,12 @@ static void mana_destroy_txq(struct mana_port_context *apc) + + for (i = 0; i < apc->num_queues; i++) { + napi = &apc->tx_qp[i].tx_cq.napi; +- napi_synchronize(napi); +- napi_disable(napi); +- netif_napi_del(napi); +- ++ if (apc->tx_qp[i].txq.napi_initialized) { ++ napi_synchronize(napi); ++ napi_disable(napi); ++ netif_napi_del(napi); ++ apc->tx_qp[i].txq.napi_initialized = false; ++ } + mana_destroy_wq_obj(apc, GDMA_SQ, apc->tx_qp[i].tx_object); + + mana_deinit_cq(apc, &apc->tx_qp[i].tx_cq); +@@ -1450,6 +1452,7 @@ static int mana_create_txq(struct mana_port_context *apc, + txq->ndev = net; + txq->net_txq = netdev_get_tx_queue(net, i); + txq->vp_offset = apc->tx_vp_offset; ++ txq->napi_initialized = false; + skb_queue_head_init(&txq->pending_skbs); + + memset(&spec, 0, sizeof(spec)); +@@ -1514,6 +1517,7 @@ static int mana_create_txq(struct mana_port_context *apc, + + netif_napi_add_tx(net, &cq->napi, mana_poll); + napi_enable(&cq->napi); ++ txq->napi_initialized = true; + + mana_gd_ring_cq(cq->gdma_cq, SET_ARM_BIT); + } +@@ -1525,7 +1529,7 @@ static int mana_create_txq(struct mana_port_context *apc, + } + + static void mana_destroy_rxq(struct mana_port_context *apc, +- struct mana_rxq *rxq, bool validate_state) ++ struct mana_rxq *rxq, bool napi_initialized) + + { + struct gdma_context *gc = apc->ac->gdma_dev->gdma_context; +@@ -1539,15 +1543,15 @@ static void mana_destroy_rxq(struct mana_port_context *apc, + + napi = &rxq->rx_cq.napi; + +- if (validate_state) ++ if (napi_initialized) { + napi_synchronize(napi); + +- napi_disable(napi); ++ napi_disable(napi); + ++ netif_napi_del(napi); ++ } + xdp_rxq_info_unreg(&rxq->xdp_rxq); + +- netif_napi_del(napi); +- + mana_destroy_wq_obj(apc, GDMA_RQ, rxq->rxobj); + + mana_deinit_cq(apc, &rxq->rx_cq); +-- +2.43.0 + diff --git a/queue-6.1/rust-macros-provide-correct-provenance-when-construc.patch b/queue-6.1/rust-macros-provide-correct-provenance-when-construc.patch new file mode 100644 index 00000000000..a80f8f0f701 --- /dev/null +++ b/queue-6.1/rust-macros-provide-correct-provenance-when-construc.patch @@ -0,0 +1,63 @@ +From 5b320b29ddf985d8de92c3afa9aebe13ecd5cfad Mon Sep 17 00:00:00 2001 +From: 
Sasha Levin +Date: Wed, 28 Aug 2024 11:01:29 -0700 +Subject: rust: macros: provide correct provenance when constructing + THIS_MODULE + +From: Boqun Feng + +[ Upstream commit a5a3c952e82c1ada12bf8c55b73af26f1a454bd2 ] + +Currently while defining `THIS_MODULE` symbol in `module!()`, the +pointer used to construct `ThisModule` is derived from an immutable +reference of `__this_module`, which means the pointer doesn't have +the provenance for writing, and that means any write to that pointer +is UB regardless of data races or not. However, the usage of +`THIS_MODULE` includes passing this pointer to functions that may write +to it (probably in unsafe code), and this will create soundness issues. + +One way to fix this is using `addr_of_mut!()` but that requires the +unstable feature "const_mut_refs". So instead of `addr_of_mut()!`, +an extern static `Opaque` is used here: since `Opaque` is transparent +to `T`, an extern static `Opaque` will just wrap the C symbol (defined +in a C compile unit) in an `Opaque`, which provides a pointer with +writable provenance via `Opaque::get()`. This fix the potential UBs +because of pointer provenance unmatched. + +Reported-by: Alice Ryhl +Signed-off-by: Boqun Feng +Reviewed-by: Alice Ryhl +Reviewed-by: Trevor Gross +Reviewed-by: Benno Lossin +Reviewed-by: Gary Guo +Closes: https://rust-for-linux.zulipchat.com/#narrow/stream/x/topic/x/near/465412664 +Fixes: 1fbde52bde73 ("rust: add `macros` crate") +Cc: stable@vger.kernel.org # 6.6.x: be2ca1e03965: ("rust: types: Make Opaque::get const") +Link: https://lore.kernel.org/r/20240828180129.4046355-1-boqun.feng@gmail.com +[ Fixed two typos, reworded title. - Miguel ] +Signed-off-by: Miguel Ojeda +Signed-off-by: Sasha Levin +--- + rust/macros/module.rs | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/rust/macros/module.rs b/rust/macros/module.rs +index 031028b3dc41..071b96639a2e 100644 +--- a/rust/macros/module.rs ++++ b/rust/macros/module.rs +@@ -183,7 +183,11 @@ pub(crate) fn module(ts: TokenStream) -> TokenStream { + // freed until the module is unloaded. 
+ #[cfg(MODULE)] + static THIS_MODULE: kernel::ThisModule = unsafe {{ +- kernel::ThisModule::from_ptr(&kernel::bindings::__this_module as *const _ as *mut _) ++ extern \"C\" {{ ++ static __this_module: kernel::types::Opaque; ++ }} ++ ++ kernel::ThisModule::from_ptr(__this_module.get()) + }}; + #[cfg(not(MODULE))] + static THIS_MODULE: kernel::ThisModule = unsafe {{ +-- +2.43.0 + diff --git a/queue-6.1/series b/queue-6.1/series index 2e0a1d96d76..eeb82f7d258 100644 --- a/queue-6.1/series +++ b/queue-6.1/series @@ -133,3 +133,13 @@ selftests-mptcp-join-check-re-re-adding-id-0-signal.patch io_uring-io-wq-stop-setting-pf_no_setaffinity-on-io-wq-workers.patch io_uring-sqpoll-do-not-set-pf_no_setaffinity-on-sqpoll-threads.patch tcp-process-the-3rd-ack-with-sk_socket-for-tfo-mptcp.patch +rust-macros-provide-correct-provenance-when-construc.patch +fuse-add-expire-only-mode-to-fuse_notify_inval_entry.patch +fuse-allow-non-extending-parallel-direct-writes-on-t.patch +fuse-add-request-extension.patch +fuse-fix-memory-leak-in-fuse_create_open.patch +x86-mm-pae-make-pmd_t-similar-to-pte_t.patch +mm-fix-pmd_read_atomic.patch +mm-rename-pmd_read_atomic.patch +userfaultfd-fix-checks-for-huge-pmds.patch +net-mana-fix-error-handling-in-mana_create_txq-rxq-s.patch diff --git a/queue-6.1/userfaultfd-fix-checks-for-huge-pmds.patch b/queue-6.1/userfaultfd-fix-checks-for-huge-pmds.patch new file mode 100644 index 00000000000..d08b8c4b443 --- /dev/null +++ b/queue-6.1/userfaultfd-fix-checks-for-huge-pmds.patch @@ -0,0 +1,146 @@ +From 170b9e3f04840e9c1a1e14c687b689e84d8f0d9d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 13 Aug 2024 22:25:21 +0200 +Subject: userfaultfd: fix checks for huge PMDs + +From: Jann Horn + +[ Upstream commit 71c186efc1b2cf1aeabfeff3b9bd5ac4c5ac14d8 ] + +Patch series "userfaultfd: fix races around pmd_trans_huge() check", v2. + +The pmd_trans_huge() code in mfill_atomic() is wrong in three different +ways depending on kernel version: + +1. The pmd_trans_huge() check is racy and can lead to a BUG_ON() (if you hit + the right two race windows) - I've tested this in a kernel build with + some extra mdelay() calls. See the commit message for a description + of the race scenario. + On older kernels (before 6.5), I think the same bug can even + theoretically lead to accessing transhuge page contents as a page table + if you hit the right 5 narrow race windows (I haven't tested this case). +2. As pointed out by Qi Zheng, pmd_trans_huge() is not sufficient for + detecting PMDs that don't point to page tables. + On older kernels (before 6.5), you'd just have to win a single fairly + wide race to hit this. + I've tested this on 6.1 stable by racing migration (with a mdelay() + patched into try_to_migrate()) against UFFDIO_ZEROPAGE - on my x86 + VM, that causes a kernel oops in ptlock_ptr(). +3. On newer kernels (>=6.5), for shmem mappings, khugepaged is allowed + to yank page tables out from under us (though I haven't tested that), + so I think the BUG_ON() checks in mfill_atomic() are just wrong. + +I decided to write two separate fixes for these (one fix for bugs 1+2, one +fix for bug 3), so that the first fix can be backported to kernels +affected by bugs 1+2. + +This patch (of 2): + +This fixes two issues. 
+ +I discovered that the following race can occur: + + mfill_atomic other thread + ============ ============ + + pmdp_get_lockless() [reads none pmd] + + + + __pte_alloc [no-op] + + + BUG_ON(pmd_none(*dst_pmd)) + +I have experimentally verified this in a kernel with extra mdelay() calls; +the BUG_ON(pmd_none(*dst_pmd)) triggers. + +On kernels newer than commit 0d940a9b270b ("mm/pgtable: allow +pte_offset_map[_lock]() to fail"), this can't lead to anything worse than +a BUG_ON(), since the page table access helpers are actually designed to +deal with page tables concurrently disappearing; but on older kernels +(<=6.4), I think we could probably theoretically race past the two +BUG_ON() checks and end up treating a hugepage as a page table. + +The second issue is that, as Qi Zheng pointed out, there are other types +of huge PMDs that pmd_trans_huge() can't catch: devmap PMDs and swap PMDs +(in particular, migration PMDs). + +On <=6.4, this is worse than the first issue: If mfill_atomic() runs on a +PMD that contains a migration entry (which just requires winning a single, +fairly wide race), it will pass the PMD to pte_offset_map_lock(), which +assumes that the PMD points to a page table. + +Breakage follows: First, the kernel tries to take the PTE lock (which will +crash or maybe worse if there is no "struct page" for the address bits in +the migration entry PMD - I think at least on X86 there usually is no +corresponding "struct page" thanks to the PTE inversion mitigation, amd64 +looks different). + +If that didn't crash, the kernel would next try to write a PTE into what +it wrongly thinks is a page table. + +As part of fixing these issues, get rid of the check for pmd_trans_huge() +before __pte_alloc() - that's redundant, we're going to have to check for +that after the __pte_alloc() anyway. + +Backport note: pmdp_get_lockless() is pmd_read_atomic() in older kernels. + +Link: https://lkml.kernel.org/r/20240813-uffd-thp-flip-fix-v2-0-5efa61078a41@google.com +Link: https://lkml.kernel.org/r/20240813-uffd-thp-flip-fix-v2-1-5efa61078a41@google.com +Fixes: c1a4de99fada ("userfaultfd: mcopy_atomic|mfill_zeropage: UFFDIO_COPY|UFFDIO_ZEROPAGE preparation") +Signed-off-by: Jann Horn +Acked-by: David Hildenbrand +Cc: Andrea Arcangeli +Cc: Hugh Dickins +Cc: Jann Horn +Cc: Pavel Emelyanov +Cc: Qi Zheng +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Sasha Levin +--- + mm/userfaultfd.c | 22 ++++++++++++---------- + 1 file changed, 12 insertions(+), 10 deletions(-) + +diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c +index 5d873aadec76..50c01a7eb705 100644 +--- a/mm/userfaultfd.c ++++ b/mm/userfaultfd.c +@@ -642,21 +642,23 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, + } + + dst_pmdval = pmdp_get_lockless(dst_pmd); +- /* +- * If the dst_pmd is mapped as THP don't +- * override it and just be strict. +- */ +- if (unlikely(pmd_trans_huge(dst_pmdval))) { +- err = -EEXIST; +- break; +- } + if (unlikely(pmd_none(dst_pmdval)) && + unlikely(__pte_alloc(dst_mm, dst_pmd))) { + err = -ENOMEM; + break; + } +- /* If an huge pmd materialized from under us fail */ +- if (unlikely(pmd_trans_huge(*dst_pmd))) { ++ dst_pmdval = pmdp_get_lockless(dst_pmd); ++ /* ++ * If the dst_pmd is THP don't override it and just be strict. ++ * (This includes the case where the PMD used to be THP and ++ * changed back to none after __pte_alloc().) 
++ */ ++ if (unlikely(!pmd_present(dst_pmdval) || pmd_trans_huge(dst_pmdval) || ++ pmd_devmap(dst_pmdval))) { ++ err = -EEXIST; ++ break; ++ } ++ if (unlikely(pmd_bad(dst_pmdval))) { + err = -EFAULT; + break; + } +-- +2.43.0 + diff --git a/queue-6.1/x86-mm-pae-make-pmd_t-similar-to-pte_t.patch b/queue-6.1/x86-mm-pae-make-pmd_t-similar-to-pte_t.patch new file mode 100644 index 00000000000..9390f00740d --- /dev/null +++ b/queue-6.1/x86-mm-pae-make-pmd_t-similar-to-pte_t.patch @@ -0,0 +1,156 @@ +From 4eae6e27b21e427bd48973eec754e0833489829f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 26 Nov 2020 17:02:29 +0100 +Subject: x86/mm/pae: Make pmd_t similar to pte_t + +From: Peter Zijlstra + +[ Upstream commit fbfdec9989e69e0b17aa3bf32fcb22d04cc33301 ] + +Instead of mucking about with at least 2 different ways of fudging +it, do the same thing we do for pte_t. + +Signed-off-by: Peter Zijlstra (Intel) +Link: https://lkml.kernel.org/r/20221022114424.580310787%40infradead.org +Stable-dep-of: 71c186efc1b2 ("userfaultfd: fix checks for huge PMDs") +Signed-off-by: Sasha Levin +--- + arch/x86/include/asm/pgtable-3level.h | 42 +++++++-------------- + arch/x86/include/asm/pgtable-3level_types.h | 7 ++++ + arch/x86/include/asm/pgtable_64_types.h | 1 + + arch/x86/include/asm/pgtable_types.h | 4 +- + 4 files changed, 23 insertions(+), 31 deletions(-) + +diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h +index 28421a887209..28556d22feb8 100644 +--- a/arch/x86/include/asm/pgtable-3level.h ++++ b/arch/x86/include/asm/pgtable-3level.h +@@ -87,7 +87,7 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp) + ret |= ((pmdval_t)*(tmp + 1)) << 32; + } + +- return (pmd_t) { ret }; ++ return (pmd_t) { .pmd = ret }; + } + + static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte) +@@ -121,12 +121,11 @@ static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr, + ptep->pte_high = 0; + } + +-static inline void native_pmd_clear(pmd_t *pmd) ++static inline void native_pmd_clear(pmd_t *pmdp) + { +- u32 *tmp = (u32 *)pmd; +- *tmp = 0; ++ pmdp->pmd_low = 0; + smp_wmb(); +- *(tmp + 1) = 0; ++ pmdp->pmd_high = 0; + } + + static inline void native_pud_clear(pud_t *pudp) +@@ -162,25 +161,17 @@ static inline pte_t native_ptep_get_and_clear(pte_t *ptep) + #define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp) + #endif + +-union split_pmd { +- struct { +- u32 pmd_low; +- u32 pmd_high; +- }; +- pmd_t pmd; +-}; +- + #ifdef CONFIG_SMP + static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp) + { +- union split_pmd res, *orig = (union split_pmd *)pmdp; ++ pmd_t res; + + /* xchg acts as a barrier before setting of the high bits */ +- res.pmd_low = xchg(&orig->pmd_low, 0); +- res.pmd_high = orig->pmd_high; +- orig->pmd_high = 0; ++ res.pmd_low = xchg(&pmdp->pmd_low, 0); ++ res.pmd_high = READ_ONCE(pmdp->pmd_high); ++ WRITE_ONCE(pmdp->pmd_high, 0); + +- return res.pmd; ++ return res; + } + #else + #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp) +@@ -199,17 +190,12 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma, + * anybody. 
+ */ + if (!(pmd_val(pmd) & _PAGE_PRESENT)) { +- union split_pmd old, new, *ptr; +- +- ptr = (union split_pmd *)pmdp; +- +- new.pmd = pmd; +- + /* xchg acts as a barrier before setting of the high bits */ +- old.pmd_low = xchg(&ptr->pmd_low, new.pmd_low); +- old.pmd_high = ptr->pmd_high; +- ptr->pmd_high = new.pmd_high; +- return old.pmd; ++ old.pmd_low = xchg(&pmdp->pmd_low, pmd.pmd_low); ++ old.pmd_high = READ_ONCE(pmdp->pmd_high); ++ WRITE_ONCE(pmdp->pmd_high, pmd.pmd_high); ++ ++ return old; + } + + do { +diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h +index 56baf43befb4..80911349519e 100644 +--- a/arch/x86/include/asm/pgtable-3level_types.h ++++ b/arch/x86/include/asm/pgtable-3level_types.h +@@ -18,6 +18,13 @@ typedef union { + }; + pteval_t pte; + } pte_t; ++ ++typedef union { ++ struct { ++ unsigned long pmd_low, pmd_high; ++ }; ++ pmdval_t pmd; ++} pmd_t; + #endif /* !__ASSEMBLY__ */ + + #define SHARED_KERNEL_PMD (!static_cpu_has(X86_FEATURE_PTI)) +diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h +index 6c7f7c526450..4ea3755f2444 100644 +--- a/arch/x86/include/asm/pgtable_64_types.h ++++ b/arch/x86/include/asm/pgtable_64_types.h +@@ -19,6 +19,7 @@ typedef unsigned long pgdval_t; + typedef unsigned long pgprotval_t; + + typedef struct { pteval_t pte; } pte_t; ++typedef struct { pmdval_t pmd; } pmd_t; + + #ifdef CONFIG_X86_5LEVEL + extern unsigned int __pgtable_l5_enabled; +diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h +index e3028373f0b4..d0e9654d7272 100644 +--- a/arch/x86/include/asm/pgtable_types.h ++++ b/arch/x86/include/asm/pgtable_types.h +@@ -363,11 +363,9 @@ static inline pudval_t native_pud_val(pud_t pud) + #endif + + #if CONFIG_PGTABLE_LEVELS > 2 +-typedef struct { pmdval_t pmd; } pmd_t; +- + static inline pmd_t native_make_pmd(pmdval_t val) + { +- return (pmd_t) { val }; ++ return (pmd_t) { .pmd = val }; + } + + static inline pmdval_t native_pmd_val(pmd_t pmd) +-- +2.43.0 +
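To round out the FUSE side of this queue: the "expire only" mode added
by the first patch is driven entirely from userspace. Below is a rough
sketch, not a definitive implementation, of how a server could emit the
notification using only the uapi structures shown above. Assumptions:
fuse_fd is the session's /dev/fuse descriptor, notifications travel as
unsolicited replies (unique == 0, with the notify code carried in
fuse_out_header.error), and the kernel expects a trailing NUL byte
after the name.

#include <linux/fuse.h>
#include <stdint.h>
#include <string.h>
#include <sys/uio.h>
#include <unistd.h>

/* Ask the kernel to expire (rather than forcibly invalidate) a dentry,
 * leaving any submounts attached until ->d_revalidate() runs.
 * fuse_fd is the session's /dev/fuse file descriptor (assumed). */
static int notify_expire_entry(int fuse_fd, uint64_t parent_nodeid,
			       const char *name)
{
	struct fuse_notify_inval_entry_out outarg = {
		.parent  = parent_nodeid,
		.namelen = (uint32_t)strlen(name),
		.flags   = FUSE_EXPIRE_ONLY,
	};
	struct fuse_out_header out = {
		.len    = sizeof(out) + sizeof(outarg) + outarg.namelen + 1,
		.error  = FUSE_NOTIFY_INVAL_ENTRY, /* notify code, unique == 0 */
		.unique = 0,
	};
	struct iovec iov[] = {
		{ &out,         sizeof(out)        },
		{ &outarg,      sizeof(outarg)     },
		{ (void *)name, outarg.namelen + 1 }, /* name + NUL */
	};

	return writev(fuse_fd, iov, 3) == (ssize_t)out.len ? 0 : -1;
}

A server that writes flags == 0 gets the old invalidate semantics
unchanged, which is why reusing the former padding field is backward
compatible: unpatched servers already send zeros there.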