Fixes for 6.1

author Sasha Levin <sashal@kernel.org>

Mon, 9 Sep 2024 12:47:27 +0000 (08:47 -0400)

committer Sasha Levin <sashal@kernel.org>

Mon, 9 Sep 2024 12:47:27 +0000 (08:47 -0400)
author Sasha Levin <sashal@kernel.org>
Mon, 9 Sep 2024 12:47:27 +0000 (08:47 -0400)
committer Sasha Levin <sashal@kernel.org>
Mon, 9 Sep 2024 12:47:27 +0000 (08:47 -0400)
diff --git a/queue-6.1/fuse-add-expire-only-mode-to-fuse_notify_inval_entry.patch b/queue-6.1/fuse-add-expire-only-mode-to-fuse_notify_inval_entry.patch

new file mode 100644 (file)

index 0000000..e9c741d
--- /dev/null
+++ b/queue-6.1/fuse-add-expire-only-mode-to-fuse_notify_inval_entry.patch
@@ -0,0 +1,140 @@
+From c9b538beeee36e0c0ad92404ef5c7c4071e1aed2 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 28 Oct 2022 14:25:21 +0200
+Subject: fuse: add "expire only" mode to FUSE_NOTIFY_INVAL_ENTRY
+
+From: Miklos Szeredi <mszeredi@redhat.com>
+
+[ Upstream commit 4f8d37020e1fd0bf6ee9381ba918135ef3712efd ]
+
+Add a flag to entry expiration that lets the filesystem expire a dentry
+without kicking it out from the cache immediately.
+
+This makes a difference for overmounted dentries, where plain invalidation
+would detach all submounts before dropping the dentry from the cache.  If
+only expiry is set on the dentry, then any overmounts are left alone
+until ->d_revalidate() is called.
+
+Note: ->d_revalidate() is not called for the case of following a submount,
+so invalidation will only be triggered for the non-overmounted case.  The
+dentry could also be mounted in a different mount instance, in which case
+any submounts will still be detached.
+
+Suggested-by: Jakob Blomer <jblomer@cern.ch>
+Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
+Stable-dep-of: 3002240d1649 ("fuse: fix memory leak in fuse_create_open")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/fuse/dev.c             |  4 ++--
+ fs/fuse/dir.c             |  6 ++++--
+ fs/fuse/fuse_i.h          |  2 +-
+ include/uapi/linux/fuse.h | 13 +++++++++++--
+ 4 files changed, 18 insertions(+), 7 deletions(-)
+
+diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
+index 96a717f73ce3..61bef919c042 100644
+--- a/fs/fuse/dev.c
++++ b/fs/fuse/dev.c
+@@ -1498,7 +1498,7 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size,
+       buf[outarg.namelen] = 0;
+ 
+       down_read(&fc->killsb);
+-      err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name);
++      err = fuse_reverse_inval_entry(fc, outarg.parent, 0, &name, outarg.flags);
+       up_read(&fc->killsb);
+       kfree(buf);
+       return err;
+@@ -1546,7 +1546,7 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size,
+       buf[outarg.namelen] = 0;
+ 
+       down_read(&fc->killsb);
+-      err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name);
++      err = fuse_reverse_inval_entry(fc, outarg.parent, outarg.child, &name, 0);
+       up_read(&fc->killsb);
+       kfree(buf);
+       return err;
+diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
+index 936a24b646ce..8474003aa54d 100644
+--- a/fs/fuse/dir.c
++++ b/fs/fuse/dir.c
+@@ -1174,7 +1174,7 @@ int fuse_update_attributes(struct inode *inode, struct file *file, u32 mask)
+ }
+ 
+ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
+-                           u64 child_nodeid, struct qstr *name)
++                           u64 child_nodeid, struct qstr *name, u32 flags)
+ {
+       int err = -ENOTDIR;
+       struct inode *parent;
+@@ -1201,7 +1201,9 @@ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
+               goto unlock;
+ 
+       fuse_dir_changed(parent);
+-      fuse_invalidate_entry(entry);
++      if (!(flags & FUSE_EXPIRE_ONLY))
++              d_invalidate(entry);
++      fuse_invalidate_entry_cache(entry);
+ 
+       if (child_nodeid != 0 && d_really_is_positive(entry)) {
+               inode_lock(d_inode(entry));
+diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
+index 66c2a9999468..cb464e5b171a 100644
+--- a/fs/fuse/fuse_i.h
++++ b/fs/fuse/fuse_i.h
+@@ -1235,7 +1235,7 @@ int fuse_reverse_inval_inode(struct fuse_conn *fc, u64 nodeid,
+  * then the dentry is unhashed (d_delete()).
+  */
+ int fuse_reverse_inval_entry(struct fuse_conn *fc, u64 parent_nodeid,
+-                           u64 child_nodeid, struct qstr *name);
++                           u64 child_nodeid, struct qstr *name, u32 flags);
+ 
+ int fuse_do_open(struct fuse_mount *fm, u64 nodeid, struct file *file,
+                bool isdir);
+diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
+index 76ee8f9e024a..39cfb343faa8 100644
+--- a/include/uapi/linux/fuse.h
++++ b/include/uapi/linux/fuse.h
+@@ -197,6 +197,9 @@
+  *
+  *  7.37
+  *  - add FUSE_TMPFILE
++ *
++ *  7.38
++ *  - add FUSE_EXPIRE_ONLY flag to fuse_notify_inval_entry
+  */
+ 
+ #ifndef _LINUX_FUSE_H
+@@ -232,7 +235,7 @@
+ #define FUSE_KERNEL_VERSION 7
+ 
+ /** Minor version number of this interface */
+-#define FUSE_KERNEL_MINOR_VERSION 37
++#define FUSE_KERNEL_MINOR_VERSION 38
+ 
+ /** The node ID of the root inode */
+ #define FUSE_ROOT_ID 1
+@@ -491,6 +494,12 @@ struct fuse_file_lock {
+  */
+ #define FUSE_SETXATTR_ACL_KILL_SGID   (1 << 0)
+ 
++/**
++ * notify_inval_entry flags
++ * FUSE_EXPIRE_ONLY
++ */
++#define FUSE_EXPIRE_ONLY              (1 << 0)
++
+ enum fuse_opcode {
+       FUSE_LOOKUP             = 1,
+       FUSE_FORGET             = 2,  /* no reply */
+@@ -919,7 +928,7 @@ struct fuse_notify_inval_inode_out {
+ struct fuse_notify_inval_entry_out {
+       uint64_t        parent;
+       uint32_t        namelen;
+-      uint32_t        padding;
++      uint32_t        flags;
+ };
+ 
+ struct fuse_notify_delete_out {
+-- 
+2.43.0
+
diff --git a/queue-6.1/fuse-add-request-extension.patch b/queue-6.1/fuse-add-request-extension.patch

new file mode 100644 (file)

index 0000000..565f6b1
--- /dev/null
+++ b/queue-6.1/fuse-add-request-extension.patch
@@ -0,0 +1,255 @@
+From d202afd43db8622cd99dfeff19a12cffd97646ca Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 10 Nov 2022 15:46:33 +0100
+Subject: fuse: add request extension
+
+From: Miklos Szeredi <mszeredi@redhat.com>
+
+[ Upstream commit 15d937d7ca8c55d2b0ce9116e20c780fdd0b67cc ]
+
+Will need to add supplementary groups to create messages, so add the
+general concept of a request extension.  A request extension is appended to
+the end of the main request.  It has a header indicating the size and type
+of the extension.
+
+The create security context (fuse_secctx_*) is similar to the generic
+request extension, so include that as well in a backward compatible manner.
+
+Add the total extension length to the request header.  The offset of the
+extension block within the request can be calculated by:
+
+  inh->len - inh->total_extlen * 8
+
+Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
+Stable-dep-of: 3002240d1649 ("fuse: fix memory leak in fuse_create_open")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/fuse/dev.c             |  2 ++
+ fs/fuse/dir.c             | 66 ++++++++++++++++++++++-----------------
+ fs/fuse/fuse_i.h          |  6 ++--
+ include/uapi/linux/fuse.h | 28 ++++++++++++++++-
+ 4 files changed, 71 insertions(+), 31 deletions(-)
+
+diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c
+index 61bef919c042..7e0d4f08a0cf 100644
+--- a/fs/fuse/dev.c
++++ b/fs/fuse/dev.c
+@@ -476,6 +476,8 @@ static void fuse_args_to_req(struct fuse_req *req, struct fuse_args *args)
+       req->in.h.opcode = args->opcode;
+       req->in.h.nodeid = args->nodeid;
+       req->args = args;
++      if (args->is_ext)
++              req->in.h.total_extlen = args->in_args[args->ext_idx].size / 8;
+       if (args->end)
+               __set_bit(FR_ASYNC, &req->flags);
+ }
+diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
+index 8474003aa54d..3b7887312ac0 100644
+--- a/fs/fuse/dir.c
++++ b/fs/fuse/dir.c
+@@ -470,7 +470,7 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry,
+ }
+ 
+ static int get_security_context(struct dentry *entry, umode_t mode,
+-                              void **security_ctx, u32 *security_ctxlen)
++                              struct fuse_in_arg *ext)
+ {
+       struct fuse_secctx *fctx;
+       struct fuse_secctx_header *header;
+@@ -517,14 +517,42 @@ static int get_security_context(struct dentry *entry, umode_t mode,
+ 
+               memcpy(ptr, ctx, ctxlen);
+       }
+-      *security_ctxlen = total_len;
+-      *security_ctx = header;
++      ext->size = total_len;
++      ext->value = header;
+       err = 0;
+ out_err:
+       kfree(ctx);
+       return err;
+ }
+ 
++static int get_create_ext(struct fuse_args *args, struct dentry *dentry,
++                        umode_t mode)
++{
++      struct fuse_conn *fc = get_fuse_conn_super(dentry->d_sb);
++      struct fuse_in_arg ext = { .size = 0, .value = NULL };
++      int err = 0;
++
++      if (fc->init_security)
++              err = get_security_context(dentry, mode, &ext);
++
++      if (!err && ext.size) {
++              WARN_ON(args->in_numargs >= ARRAY_SIZE(args->in_args));
++              args->is_ext = true;
++              args->ext_idx = args->in_numargs++;
++              args->in_args[args->ext_idx] = ext;
++      } else {
++              kfree(ext.value);
++      }
++
++      return err;
++}
++
++static void free_ext_value(struct fuse_args *args)
++{
++      if (args->is_ext)
++              kfree(args->in_args[args->ext_idx].value);
++}
++
+ /*
+  * Atomic create+open operation
+  *
+@@ -545,8 +573,6 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
+       struct fuse_entry_out outentry;
+       struct fuse_inode *fi;
+       struct fuse_file *ff;
+-      void *security_ctx = NULL;
+-      u32 security_ctxlen;
+       bool trunc = flags & O_TRUNC;
+ 
+       /* Userspace expects S_IFREG in create mode */
+@@ -590,19 +616,12 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
+       args.out_args[1].size = sizeof(outopen);
+       args.out_args[1].value = &outopen;
+ 
+-      if (fm->fc->init_security) {
+-              err = get_security_context(entry, mode, &security_ctx,
+-                                         &security_ctxlen);
+-              if (err)
+-                      goto out_put_forget_req;
+-
+-              args.in_numargs = 3;
+-              args.in_args[2].size = security_ctxlen;
+-              args.in_args[2].value = security_ctx;
+-      }
++      err = get_create_ext(&args, entry, mode);
++      if (err)
++              goto out_put_forget_req;
+ 
+       err = fuse_simple_request(fm, &args);
+-      kfree(security_ctx);
++      free_ext_value(&args);
+       if (err)
+               goto out_free_ff;
+ 
+@@ -709,8 +728,6 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args,
+       struct dentry *d;
+       int err;
+       struct fuse_forget_link *forget;
+-      void *security_ctx = NULL;
+-      u32 security_ctxlen;
+ 
+       if (fuse_is_bad(dir))
+               return -EIO;
+@@ -725,21 +742,14 @@ static int create_new_entry(struct fuse_mount *fm, struct fuse_args *args,
+       args->out_args[0].size = sizeof(outarg);
+       args->out_args[0].value = &outarg;
+ 
+-      if (fm->fc->init_security && args->opcode != FUSE_LINK) {
+-              err = get_security_context(entry, mode, &security_ctx,
+-                                         &security_ctxlen);
++      if (args->opcode != FUSE_LINK) {
++              err = get_create_ext(args, entry, mode);
+               if (err)
+                       goto out_put_forget_req;
+-
+-              BUG_ON(args->in_numargs != 2);
+-
+-              args->in_numargs = 3;
+-              args->in_args[2].size = security_ctxlen;
+-              args->in_args[2].value = security_ctx;
+       }
+ 
+       err = fuse_simple_request(fm, args);
+-      kfree(security_ctx);
++      free_ext_value(args);
+       if (err)
+               goto out_put_forget_req;
+ 
+diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h
+index cb464e5b171a..6c3ec70c1b70 100644
+--- a/fs/fuse/fuse_i.h
++++ b/fs/fuse/fuse_i.h
+@@ -264,8 +264,9 @@ struct fuse_page_desc {
+ struct fuse_args {
+       uint64_t nodeid;
+       uint32_t opcode;
+-      unsigned short in_numargs;
+-      unsigned short out_numargs;
++      uint8_t in_numargs;
++      uint8_t out_numargs;
++      uint8_t ext_idx;
+       bool force:1;
+       bool noreply:1;
+       bool nocreds:1;
+@@ -276,6 +277,7 @@ struct fuse_args {
+       bool page_zeroing:1;
+       bool page_replace:1;
+       bool may_block:1;
++      bool is_ext:1;
+       struct fuse_in_arg in_args[3];
+       struct fuse_arg out_args[2];
+       void (*end)(struct fuse_mount *fm, struct fuse_args *args, int error);
+diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
+index e3c54109bae9..c71f12429e3d 100644
+--- a/include/uapi/linux/fuse.h
++++ b/include/uapi/linux/fuse.h
+@@ -201,6 +201,9 @@
+  *  7.38
+  *  - add FUSE_EXPIRE_ONLY flag to fuse_notify_inval_entry
+  *  - add FOPEN_PARALLEL_DIRECT_WRITES
++ *  - add total_extlen to fuse_in_header
++ *  - add FUSE_MAX_NR_SECCTX
++ *  - add extension header
+  */
+ 
+ #ifndef _LINUX_FUSE_H
+@@ -503,6 +506,15 @@ struct fuse_file_lock {
+  */
+ #define FUSE_EXPIRE_ONLY              (1 << 0)
+ 
++/**
++ * extension type
++ * FUSE_MAX_NR_SECCTX: maximum value of &fuse_secctx_header.nr_secctx
++ */
++enum fuse_ext_type {
++      /* Types 0..31 are reserved for fuse_secctx_header */
++      FUSE_MAX_NR_SECCTX      = 31,
++};
++
+ enum fuse_opcode {
+       FUSE_LOOKUP             = 1,
+       FUSE_FORGET             = 2,  /* no reply */
+@@ -886,7 +898,8 @@ struct fuse_in_header {
+       uint32_t        uid;
+       uint32_t        gid;
+       uint32_t        pid;
+-      uint32_t        padding;
++      uint16_t        total_extlen; /* length of extensions in 8byte units */
++      uint16_t        padding;
+ };
+ 
+ struct fuse_out_header {
+@@ -1047,4 +1060,17 @@ struct fuse_secctx_header {
+       uint32_t        nr_secctx;
+ };
+ 
++/**
++ * struct fuse_ext_header - extension header
++ * @size: total size of this extension including this header
++ * @type: type of extension
++ *
++ * This is made compatible with fuse_secctx_header by using type values >
++ * FUSE_MAX_NR_SECCTX
++ */
++struct fuse_ext_header {
++      uint32_t        size;
++      uint32_t        type;
++};
++
+ #endif /* _LINUX_FUSE_H */
+-- 
+2.43.0
+
diff --git a/queue-6.1/fuse-allow-non-extending-parallel-direct-writes-on-t.patch b/queue-6.1/fuse-allow-non-extending-parallel-direct-writes-on-t.patch

new file mode 100644 (file)

index 0000000..61238ba
--- /dev/null
+++ b/queue-6.1/fuse-allow-non-extending-parallel-direct-writes-on-t.patch
@@ -0,0 +1,168 @@
+From 7d1ee672f2616d775939b066f475cd5168f32362 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 17 Jun 2022 12:40:27 +0530
+Subject: fuse: allow non-extending parallel direct writes on the same file
+
+From: Dharmendra Singh <dsingh@ddn.com>
+
+[ Upstream commit 153524053bbb0d27bb2e0be36d1b46862e9ce74c ]
+
+In general, as of now, in FUSE, direct writes on the same file are
+serialized over inode lock, i.e. we hold inode lock for the full duration of
+the write request.  I could not find in fuse code and git history a comment
+which clearly explains why this exclusive lock is taken for direct writes.
+The following might be the reasons for acquiring an exclusive lock, but the
+list is not necessarily exhaustive:
+
+ 1) Our guess is some USER space fuse implementations might be relying on
+    this lock for serialization.
+
+ 2) The lock protects against file read/write size races.
+
+ 3) Ruling out any issues arising from partial write failures.
+
+This patch relaxes the exclusive lock for direct non-extending writes only.
+File size extending writes might not need the lock either, but we are not
+entirely sure if there is a risk to introduce any kind of regression.
+Furthermore, benchmarking with fio does not show a difference between patch
+versions that take on file size extension a) an exclusive lock and b) a
+shared lock.
+
+A possible example of an issue with i_size extending writes is the write error
+case.  Some writes might succeed and others might fail for file system
+internal reasons - for example ENOSPC.  With parallel file size extending
+writes it _might_ be difficult to revert the action of the failing write,
+especially to restore the right i_size.
+
+With these changes, we allow non-extending parallel direct writes on the
+same file with the help of a flag called FOPEN_PARALLEL_DIRECT_WRITES.  If
+this flag is set on the file (flag is passed from libfuse to fuse kernel as
+part of file open/create), we do not take exclusive lock anymore, but
+instead use a shared lock that allows non-extending writes to run in
+parallel.  FUSE implementations which rely on this inode lock for
+serialization can continue to do so and serialized direct writes are still
+the default.  Implementations that do not do write serialization need to be
+updated and need to set the FOPEN_PARALLEL_DIRECT_WRITES flag in their file
+open/create reply.
+
+On patch review there were concerns that network file systems (or vfs
+multiple mounts of the same file system) might have issues with parallel
+writes.  We believe this is not the case, as this is just a local lock,
+which network file systems could not rely on anyway.  I.e. this lock is
+just for local consistency.
+
+Signed-off-by: Dharmendra Singh <dsingh@ddn.com>
+Signed-off-by: Bernd Schubert <bschubert@ddn.com>
+Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
+Stable-dep-of: 3002240d1649 ("fuse: fix memory leak in fuse_create_open")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/fuse/file.c            | 43 ++++++++++++++++++++++++++++++++++++---
+ include/uapi/linux/fuse.h |  3 +++
+ 2 files changed, 43 insertions(+), 3 deletions(-)
+
+diff --git a/fs/fuse/file.c b/fs/fuse/file.c
+index e6ec4338a9c5..0df1311afb87 100644
+--- a/fs/fuse/file.c
++++ b/fs/fuse/file.c
+@@ -1563,14 +1563,47 @@ static ssize_t fuse_direct_read_iter(struct kiocb *iocb, struct iov_iter *to)
+       return res;
+ }
+ 
++static bool fuse_direct_write_extending_i_size(struct kiocb *iocb,
++                                             struct iov_iter *iter)
++{
++      struct inode *inode = file_inode(iocb->ki_filp);
++
++      return iocb->ki_pos + iov_iter_count(iter) > i_size_read(inode);
++}
++
+ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
+ {
+       struct inode *inode = file_inode(iocb->ki_filp);
++      struct file *file = iocb->ki_filp;
++      struct fuse_file *ff = file->private_data;
+       struct fuse_io_priv io = FUSE_IO_PRIV_SYNC(iocb);
+       ssize_t res;
++      bool exclusive_lock =
++              !(ff->open_flags & FOPEN_PARALLEL_DIRECT_WRITES) ||
++              iocb->ki_flags & IOCB_APPEND ||
++              fuse_direct_write_extending_i_size(iocb, from);
++
++      /*
++       * Take exclusive lock if
++       * - Parallel direct writes are disabled - a user space decision
++       * - Parallel direct writes are enabled and i_size is being extended.
++       *   This might not be needed at all, but needs further investigation.
++       */
++      if (exclusive_lock)
++              inode_lock(inode);
++      else {
++              inode_lock_shared(inode);
++
++              /* A race with truncate might have come up as the decision for
++               * the lock type was done without holding the lock, check again.
++               */
++              if (fuse_direct_write_extending_i_size(iocb, from)) {
++                      inode_unlock_shared(inode);
++                      inode_lock(inode);
++                      exclusive_lock = true;
++              }
++      }
+ 
+-      /* Don't allow parallel writes to the same file */
+-      inode_lock(inode);
+       res = generic_write_checks(iocb, from);
+       if (res > 0) {
+               if (!is_sync_kiocb(iocb) && iocb->ki_flags & IOCB_DIRECT) {
+@@ -1581,7 +1614,10 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from)
+                       fuse_write_update_attr(inode, iocb->ki_pos, res);
+               }
+       }
+-      inode_unlock(inode);
++      if (exclusive_lock)
++              inode_unlock(inode);
++      else
++              inode_unlock_shared(inode);
+ 
+       return res;
+ }
+@@ -2937,6 +2973,7 @@ fuse_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
+ 
+       if (iov_iter_rw(iter) == WRITE) {
+               fuse_write_update_attr(inode, pos, ret);
++              /* For extending writes we already hold exclusive lock */
+               if (ret < 0 && offset + count > i_size)
+                       fuse_do_truncate(file);
+       }
+diff --git a/include/uapi/linux/fuse.h b/include/uapi/linux/fuse.h
+index 39cfb343faa8..e3c54109bae9 100644
+--- a/include/uapi/linux/fuse.h
++++ b/include/uapi/linux/fuse.h
+@@ -200,6 +200,7 @@
+  *
+  *  7.38
+  *  - add FUSE_EXPIRE_ONLY flag to fuse_notify_inval_entry
++ *  - add FOPEN_PARALLEL_DIRECT_WRITES
+  */
+ 
+ #ifndef _LINUX_FUSE_H
+@@ -307,6 +308,7 @@ struct fuse_file_lock {
+  * FOPEN_CACHE_DIR: allow caching this directory
+  * FOPEN_STREAM: the file is stream-like (no file position at all)
+  * FOPEN_NOFLUSH: don't flush data cache on close (unless FUSE_WRITEBACK_CACHE)
++ * FOPEN_PARALLEL_DIRECT_WRITES: Allow concurrent direct writes on the same inode
+  */
+ #define FOPEN_DIRECT_IO               (1 << 0)
+ #define FOPEN_KEEP_CACHE      (1 << 1)
+@@ -314,6 +316,7 @@ struct fuse_file_lock {
+ #define FOPEN_CACHE_DIR               (1 << 3)
+ #define FOPEN_STREAM          (1 << 4)
+ #define FOPEN_NOFLUSH         (1 << 5)
++#define FOPEN_PARALLEL_DIRECT_WRITES  (1 << 6)
+ 
+ /**
+  * INIT request/reply flags
+-- 
+2.43.0
+
diff --git a/queue-6.1/fuse-fix-memory-leak-in-fuse_create_open.patch b/queue-6.1/fuse-fix-memory-leak-in-fuse_create_open.patch

new file mode 100644 (file)

index 0000000..b1b76bc
--- /dev/null
+++ b/queue-6.1/fuse-fix-memory-leak-in-fuse_create_open.patch
@@ -0,0 +1,37 @@
+From a84b7dea9c08ad2f60802e7b8618a354d227351b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 23 Aug 2024 16:51:46 +0800
+Subject: fuse: fix memory leak in fuse_create_open
+
+From: yangyun <yangyun50@huawei.com>
+
+[ Upstream commit 3002240d16494d798add0575e8ba1f284258ab34 ]
+
+The memory of struct fuse_file is allocated but not freed
+when get_create_ext() returns an error.
+
+Fixes: 3e2b6fdbdc9a ("fuse: send security context of inode on file")
+Cc: stable@vger.kernel.org # v5.17
+Signed-off-by: yangyun <yangyun50@huawei.com>
+Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/fuse/dir.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
+index 3b7887312ac0..aa2be4c1ea8f 100644
+--- a/fs/fuse/dir.c
++++ b/fs/fuse/dir.c
+@@ -618,7 +618,7 @@ static int fuse_create_open(struct inode *dir, struct dentry *entry,
+ 
+       err = get_create_ext(&args, entry, mode);
+       if (err)
+-              goto out_put_forget_req;
++              goto out_free_ff;
+ 
+       err = fuse_simple_request(fm, &args);
+       free_ext_value(&args);
+-- 
+2.43.0
+
diff --git a/queue-6.1/mm-fix-pmd_read_atomic.patch b/queue-6.1/mm-fix-pmd_read_atomic.patch

new file mode 100644 (file)

index 0000000..0a61371
--- /dev/null
+++ b/queue-6.1/mm-fix-pmd_read_atomic.patch
@@ -0,0 +1,173 @@
+From 736b31e4394c8b483561e4ac1fd71eefb2bbec03 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 26 Nov 2020 17:16:22 +0100
+Subject: mm: Fix pmd_read_atomic()
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+[ Upstream commit 024d232ae4fcd7a7ce8ea239607d6c1246d7adc8 ]
+
+AFAICT there's no reason to do anything different than what we do for
+PTEs. Make it so (also affects SH).
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lkml.kernel.org/r/20221022114424.711181252%40infradead.org
+Stable-dep-of: 71c186efc1b2 ("userfaultfd: fix checks for huge PMDs")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/pgtable-3level.h | 56 ---------------------------
+ include/linux/pgtable.h               | 47 +++++++++++++++++-----
+ 2 files changed, 37 insertions(+), 66 deletions(-)
+
+diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
+index 28556d22feb8..94f50b0100a5 100644
+--- a/arch/x86/include/asm/pgtable-3level.h
++++ b/arch/x86/include/asm/pgtable-3level.h
+@@ -34,62 +34,6 @@ static inline void native_set_pte(pte_t *ptep, pte_t pte)
+       ptep->pte_low = pte.pte_low;
+ }
+ 
+-#define pmd_read_atomic pmd_read_atomic
+-/*
+- * pte_offset_map_lock() on 32-bit PAE kernels was reading the pmd_t with
+- * a "*pmdp" dereference done by GCC. Problem is, in certain places
+- * where pte_offset_map_lock() is called, concurrent page faults are
+- * allowed, if the mmap_lock is hold for reading. An example is mincore
+- * vs page faults vs MADV_DONTNEED. On the page fault side
+- * pmd_populate() rightfully does a set_64bit(), but if we're reading the
+- * pmd_t with a "*pmdp" on the mincore side, a SMP race can happen
+- * because GCC will not read the 64-bit value of the pmd atomically.
+- *
+- * To fix this all places running pte_offset_map_lock() while holding the
+- * mmap_lock in read mode, shall read the pmdp pointer using this
+- * function to know if the pmd is null or not, and in turn to know if
+- * they can run pte_offset_map_lock() or pmd_trans_huge() or other pmd
+- * operations.
+- *
+- * Without THP if the mmap_lock is held for reading, the pmd can only
+- * transition from null to not null while pmd_read_atomic() runs. So
+- * we can always return atomic pmd values with this function.
+- *
+- * With THP if the mmap_lock is held for reading, the pmd can become
+- * trans_huge or none or point to a pte (and in turn become "stable")
+- * at any time under pmd_read_atomic(). We could read it truly
+- * atomically here with an atomic64_read() for the THP enabled case (and
+- * it would be a whole lot simpler), but to avoid using cmpxchg8b we
+- * only return an atomic pmdval if the low part of the pmdval is later
+- * found to be stable (i.e. pointing to a pte). We are also returning a
+- * 'none' (zero) pmdval if the low part of the pmd is zero.
+- *
+- * In some cases the high and low part of the pmdval returned may not be
+- * consistent if THP is enabled (the low part may point to previously
+- * mapped hugepage, while the high part may point to a more recently
+- * mapped hugepage), but pmd_none_or_trans_huge_or_clear_bad() only
+- * needs the low part of the pmd to be read atomically to decide if the
+- * pmd is unstable or not, with the only exception when the low part
+- * of the pmd is zero, in which case we return a 'none' pmd.
+- */
+-static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
+-{
+-      pmdval_t ret;
+-      u32 *tmp = (u32 *)pmdp;
+-
+-      ret = (pmdval_t) (*tmp);
+-      if (ret) {
+-              /*
+-               * If the low part is null, we must not read the high part
+-               * or we can end up with a partial pmd.
+-               */
+-              smp_rmb();
+-              ret |= ((pmdval_t)*(tmp + 1)) << 32;
+-      }
+-
+-      return (pmd_t) { .pmd = ret };
+-}
+-
+ static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
+ {
+       set_64bit((unsigned long long *)(ptep), native_pte_val(pte));
+diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
+index 5f0d7d0b9471..8f31e2ff6b58 100644
+--- a/include/linux/pgtable.h
++++ b/include/linux/pgtable.h
+@@ -316,6 +316,13 @@ static inline pte_t ptep_get(pte_t *ptep)
+ }
+ #endif
+ 
++#ifndef __HAVE_ARCH_PMDP_GET
++static inline pmd_t pmdp_get(pmd_t *pmdp)
++{
++      return READ_ONCE(*pmdp);
++}
++#endif
++
+ #ifdef CONFIG_GUP_GET_PTE_LOW_HIGH
+ /*
+  * WARNING: only to be used in the get_user_pages_fast() implementation.
+@@ -361,15 +368,42 @@ static inline pte_t ptep_get_lockless(pte_t *ptep)
+ 
+       return pte;
+ }
+-#else /* CONFIG_GUP_GET_PTE_LOW_HIGH */
++#define ptep_get_lockless ptep_get_lockless
++
++#if CONFIG_PGTABLE_LEVELS > 2
++static inline pmd_t pmdp_get_lockless(pmd_t *pmdp)
++{
++      pmd_t pmd;
++
++      do {
++              pmd.pmd_low = pmdp->pmd_low;
++              smp_rmb();
++              pmd.pmd_high = pmdp->pmd_high;
++              smp_rmb();
++      } while (unlikely(pmd.pmd_low != pmdp->pmd_low));
++
++      return pmd;
++}
++#define pmdp_get_lockless pmdp_get_lockless
++#endif /* CONFIG_PGTABLE_LEVELS > 2 */
++#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */
++
+ /*
+  * We require that the PTE can be read atomically.
+  */
++#ifndef ptep_get_lockless
+ static inline pte_t ptep_get_lockless(pte_t *ptep)
+ {
+       return ptep_get(ptep);
+ }
+-#endif /* CONFIG_GUP_GET_PTE_LOW_HIGH */
++#endif
++
++#ifndef pmdp_get_lockless
++static inline pmd_t pmdp_get_lockless(pmd_t *pmdp)
++{
++      return pmdp_get(pmdp);
++}
++#endif
+ 
+ #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+ #ifndef __HAVE_ARCH_PMDP_HUGE_GET_AND_CLEAR
+@@ -1339,17 +1373,10 @@ static inline int pud_trans_unstable(pud_t *pud)
+ #endif
+ }
+ 
+-#ifndef pmd_read_atomic
+ static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
+ {
+-      /*
+-       * Depend on compiler for an atomic pmd read. NOTE: this is
+-       * only going to work, if the pmdval_t isn't larger than
+-       * an unsigned long.
+-       */
+-      return *pmdp;
++      return pmdp_get_lockless(pmdp);
+ }
+-#endif
+ 
+ #ifndef arch_needs_pgtable_deposit
+ #define arch_needs_pgtable_deposit() (false)
+-- 
+2.43.0
+
diff --git a/queue-6.1/mm-rename-pmd_read_atomic.patch b/queue-6.1/mm-rename-pmd_read_atomic.patch

new file mode 100644 (file)

index 0000000..ef80ca3
--- /dev/null
+++ b/queue-6.1/mm-rename-pmd_read_atomic.patch
@@ -0,0 +1,142 @@
+From ffc5769c6f6c603e01a725883e4d5760ee72939b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 26 Nov 2020 17:20:28 +0100
+Subject: mm: Rename pmd_read_atomic()
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+[ Upstream commit dab6e717429e5ec795d558a0e9a5337a1ed33a3d ]
+
+There's no point in having the identical routines for PTE/PMD have
+different names.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lkml.kernel.org/r/20221022114424.841277397%40infradead.org
+Stable-dep-of: 71c186efc1b2 ("userfaultfd: fix checks for huge PMDs")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/pgtable.h    | 9 ++-------
+ mm/hmm.c                   | 2 +-
+ mm/khugepaged.c            | 2 +-
+ mm/mapping_dirty_helpers.c | 2 +-
+ mm/mprotect.c              | 2 +-
+ mm/userfaultfd.c           | 2 +-
+ mm/vmscan.c                | 4 ++--
+ 7 files changed, 9 insertions(+), 14 deletions(-)
+
+diff --git a/include/linux/pgtable.h b/include/linux/pgtable.h
+index 8f31e2ff6b58..3e3c00e80b65 100644
+--- a/include/linux/pgtable.h
++++ b/include/linux/pgtable.h
+@@ -1373,11 +1373,6 @@ static inline int pud_trans_unstable(pud_t *pud)
+ #endif
+ }
+ 
+-static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
+-{
+-      return pmdp_get_lockless(pmdp);
+-}
+-
+ #ifndef arch_needs_pgtable_deposit
+ #define arch_needs_pgtable_deposit() (false)
+ #endif
+@@ -1404,13 +1399,13 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
+  */
+ static inline int pmd_none_or_trans_huge_or_clear_bad(pmd_t *pmd)
+ {
+-      pmd_t pmdval = pmd_read_atomic(pmd);
++      pmd_t pmdval = pmdp_get_lockless(pmd);
+       /*
+        * The barrier will stabilize the pmdval in a register or on
+        * the stack so that it will stop changing under the code.
+        *
+        * When CONFIG_TRANSPARENT_HUGEPAGE=y on x86 32bit PAE,
+-       * pmd_read_atomic is allowed to return a not atomic pmdval
++       * pmdp_get_lockless is allowed to return a not atomic pmdval
+        * (for example pointing to an hugepage that has never been
+        * mapped in the pmd). The below checks will only care about
+        * the low part of the pmd with 32bit PAE x86 anyway, with the
+diff --git a/mm/hmm.c b/mm/hmm.c
+index 3850fb625dda..39cf50de76d7 100644
+--- a/mm/hmm.c
++++ b/mm/hmm.c
+@@ -361,7 +361,7 @@ static int hmm_vma_walk_pmd(pmd_t *pmdp,
+                * huge or device mapping one and compute corresponding pfn
+                * values.
+                */
+-              pmd = pmd_read_atomic(pmdp);
++              pmd = pmdp_get_lockless(pmdp);
+               barrier();
+               if (!pmd_devmap(pmd) && !pmd_trans_huge(pmd))
+                       goto again;
+diff --git a/mm/khugepaged.c b/mm/khugepaged.c
+index 085fca1fa27a..47010c3b5c4d 100644
+--- a/mm/khugepaged.c
++++ b/mm/khugepaged.c
+@@ -866,7 +866,7 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm,
+       if (!*pmd)
+               return SCAN_PMD_NULL;
+ 
+-      pmde = pmd_read_atomic(*pmd);
++      pmde = pmdp_get_lockless(*pmd);
+ 
+ #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+       /* See comments in pmd_none_or_trans_huge_or_clear_bad() */
+diff --git a/mm/mapping_dirty_helpers.c b/mm/mapping_dirty_helpers.c
+index 1b0ab8fcfd8b..175e424b9ab1 100644
+--- a/mm/mapping_dirty_helpers.c
++++ b/mm/mapping_dirty_helpers.c
+@@ -126,7 +126,7 @@ static int clean_record_pte(pte_t *pte, unsigned long addr,
+ static int wp_clean_pmd_entry(pmd_t *pmd, unsigned long addr, unsigned long end,
+                             struct mm_walk *walk)
+ {
+-      pmd_t pmdval = pmd_read_atomic(pmd);
++      pmd_t pmdval = pmdp_get_lockless(pmd);
+ 
+       if (!pmd_trans_unstable(&pmdval))
+               return 0;
+diff --git a/mm/mprotect.c b/mm/mprotect.c
+index 668bfaa6ed2a..f006bafe338f 100644
+--- a/mm/mprotect.c
++++ b/mm/mprotect.c
+@@ -294,7 +294,7 @@ static unsigned long change_pte_range(struct mmu_gather *tlb,
+  */
+ static inline int pmd_none_or_clear_bad_unless_trans_huge(pmd_t *pmd)
+ {
+-      pmd_t pmdval = pmd_read_atomic(pmd);
++      pmd_t pmdval = pmdp_get_lockless(pmd);
+ 
+       /* See pmd_none_or_trans_huge_or_clear_bad for info on barrier */
+ #ifdef CONFIG_TRANSPARENT_HUGEPAGE
+diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
+index 992a0a16846f..5d873aadec76 100644
+--- a/mm/userfaultfd.c
++++ b/mm/userfaultfd.c
+@@ -641,7 +641,7 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
+                       break;
+               }
+ 
+-              dst_pmdval = pmd_read_atomic(dst_pmd);
++              dst_pmdval = pmdp_get_lockless(dst_pmd);
+               /*
+                * If the dst_pmd is mapped as THP don't
+                * override it and just be strict.
+diff --git a/mm/vmscan.c b/mm/vmscan.c
+index 4cd0cbf9c121..f5fa1c76d9e6 100644
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -4068,9 +4068,9 @@ static void walk_pmd_range(pud_t *pud, unsigned long start, unsigned long end,
+       /* walk_pte_range() may call get_next_vma() */
+       vma = args->vma;
+       for (i = pmd_index(start), addr = start; addr != end; i++, addr = next) {
+-              pmd_t val = pmd_read_atomic(pmd + i);
++              pmd_t val = pmdp_get_lockless(pmd + i);
+ 
+-              /* for pmd_read_atomic() */
++              /* for pmdp_get_lockless() */
+               barrier();
+ 
+               next = pmd_addr_end(addr, end);
+-- 
+2.43.0
+
diff --git a/queue-6.1/net-mana-fix-error-handling-in-mana_create_txq-rxq-s.patch b/queue-6.1/net-mana-fix-error-handling-in-mana_create_txq-rxq-s.patch

new file mode 100644 (file)

index 0000000..f4437bf
--- /dev/null
+++ b/queue-6.1/net-mana-fix-error-handling-in-mana_create_txq-rxq-s.patch
@@ -0,0 +1,125 @@
+From b1c444c28722aa7d59105a8826c4a2da636d24f5 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 2 Sep 2024 05:43:47 -0700
+Subject: net: mana: Fix error handling in mana_create_txq/rxq's NAPI cleanup
+
+From: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
+
+[ Upstream commit b6ecc662037694488bfff7c9fd21c405df8411f2 ]
+
+Currently napi_disable() gets called during rxq and txq cleanup,
+even before napi is enabled and hrtimer is initialized. This causes a
+kernel panic.
+
+? page_fault_oops+0x136/0x2b0
+  ? page_counter_cancel+0x2e/0x80
+  ? do_user_addr_fault+0x2f2/0x640
+  ? refill_obj_stock+0xc4/0x110
+  ? exc_page_fault+0x71/0x160
+  ? asm_exc_page_fault+0x27/0x30
+  ? __mmdrop+0x10/0x180
+  ? __mmdrop+0xec/0x180
+  ? hrtimer_active+0xd/0x50
+  hrtimer_try_to_cancel+0x2c/0xf0
+  hrtimer_cancel+0x15/0x30
+  napi_disable+0x65/0x90
+  mana_destroy_rxq+0x4c/0x2f0
+  mana_create_rxq.isra.0+0x56c/0x6d0
+  ? mana_uncfg_vport+0x50/0x50
+  mana_alloc_queues+0x21b/0x320
+  ? skb_dequeue+0x5f/0x80
+
+Cc: stable@vger.kernel.org
+Fixes: e1b5683ff62e ("net: mana: Move NAPI from EQ to CQ")
+Signed-off-by: Souradeep Chakrabarti <schakrabarti@linux.microsoft.com>
+Reviewed-by: Haiyang Zhang <haiyangz@microsoft.com>
+Reviewed-by: Shradha Gupta <shradhagupta@linux.microsoft.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/microsoft/mana/mana.h    |  2 ++
+ drivers/net/ethernet/microsoft/mana/mana_en.c | 22 +++++++++++--------
+ 2 files changed, 15 insertions(+), 9 deletions(-)
+
+diff --git a/drivers/net/ethernet/microsoft/mana/mana.h b/drivers/net/ethernet/microsoft/mana/mana.h
+index 41c99eabf40a..2b00d6a29117 100644
+--- a/drivers/net/ethernet/microsoft/mana/mana.h
++++ b/drivers/net/ethernet/microsoft/mana/mana.h
+@@ -86,6 +86,8 @@ struct mana_txq {
+ 
+       atomic_t pending_sends;
+ 
++      bool napi_initialized;
++
+       struct mana_stats_tx stats;
+ };
+ 
+diff --git a/drivers/net/ethernet/microsoft/mana/mana_en.c b/drivers/net/ethernet/microsoft/mana/mana_en.c
+index e7d1ce68f05e..b52612eef0a6 100644
+--- a/drivers/net/ethernet/microsoft/mana/mana_en.c
++++ b/drivers/net/ethernet/microsoft/mana/mana_en.c
+@@ -1391,10 +1391,12 @@ static void mana_destroy_txq(struct mana_port_context *apc)
+ 
+       for (i = 0; i < apc->num_queues; i++) {
+               napi = &apc->tx_qp[i].tx_cq.napi;
+-              napi_synchronize(napi);
+-              napi_disable(napi);
+-              netif_napi_del(napi);
+-
++              if (apc->tx_qp[i].txq.napi_initialized) {
++                      napi_synchronize(napi);
++                      napi_disable(napi);
++                      netif_napi_del(napi);
++                      apc->tx_qp[i].txq.napi_initialized = false;
++              }
+               mana_destroy_wq_obj(apc, GDMA_SQ, apc->tx_qp[i].tx_object);
+ 
+               mana_deinit_cq(apc, &apc->tx_qp[i].tx_cq);
+@@ -1450,6 +1452,7 @@ static int mana_create_txq(struct mana_port_context *apc,
+               txq->ndev = net;
+               txq->net_txq = netdev_get_tx_queue(net, i);
+               txq->vp_offset = apc->tx_vp_offset;
++              txq->napi_initialized = false;
+               skb_queue_head_init(&txq->pending_skbs);
+ 
+               memset(&spec, 0, sizeof(spec));
+@@ -1514,6 +1517,7 @@ static int mana_create_txq(struct mana_port_context *apc,
+ 
+               netif_napi_add_tx(net, &cq->napi, mana_poll);
+               napi_enable(&cq->napi);
++              txq->napi_initialized = true;
+ 
+               mana_gd_ring_cq(cq->gdma_cq, SET_ARM_BIT);
+       }
+@@ -1525,7 +1529,7 @@ static int mana_create_txq(struct mana_port_context *apc,
+ }
+ 
+ static void mana_destroy_rxq(struct mana_port_context *apc,
+-                           struct mana_rxq *rxq, bool validate_state)
++                           struct mana_rxq *rxq, bool napi_initialized)
+ 
+ {
+       struct gdma_context *gc = apc->ac->gdma_dev->gdma_context;
+@@ -1539,15 +1543,15 @@ static void mana_destroy_rxq(struct mana_port_context *apc,
+ 
+       napi = &rxq->rx_cq.napi;
+ 
+-      if (validate_state)
++      if (napi_initialized) {
+               napi_synchronize(napi);
+ 
+-      napi_disable(napi);
++              napi_disable(napi);
+ 
++              netif_napi_del(napi);
++      }
+       xdp_rxq_info_unreg(&rxq->xdp_rxq);
+ 
+-      netif_napi_del(napi);
+-
+       mana_destroy_wq_obj(apc, GDMA_RQ, rxq->rxobj);
+ 
+       mana_deinit_cq(apc, &rxq->rx_cq);
+-- 
+2.43.0
+
diff --git a/queue-6.1/rust-macros-provide-correct-provenance-when-construc.patch b/queue-6.1/rust-macros-provide-correct-provenance-when-construc.patch

new file mode 100644 (file)

index 0000000..a80f8f0
--- /dev/null
+++ b/queue-6.1/rust-macros-provide-correct-provenance-when-construc.patch
@@ -0,0 +1,63 @@
+From 5b320b29ddf985d8de92c3afa9aebe13ecd5cfad Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 28 Aug 2024 11:01:29 -0700
+Subject: rust: macros: provide correct provenance when constructing
+ THIS_MODULE
+
+From: Boqun Feng <boqun.feng@gmail.com>
+
+[ Upstream commit a5a3c952e82c1ada12bf8c55b73af26f1a454bd2 ]
+
+Currently while defining `THIS_MODULE` symbol in `module!()`, the
+pointer used to construct `ThisModule` is derived from an immutable
+reference of `__this_module`, which means the pointer doesn't have
+the provenance for writing, and that means any write to that pointer
+is UB regardless of data races or not. However, the usage of
+`THIS_MODULE` includes passing this pointer to functions that may write
+to it (probably in unsafe code), and this will create soundness issues.
+
+One way to fix this is using `addr_of_mut!()` but that requires the
+unstable feature "const_mut_refs". So instead of `addr_of_mut!()`,
+an extern static `Opaque` is used here: since `Opaque<T>` is transparent
+to `T`, an extern static `Opaque` will just wrap the C symbol (defined
+in a C compile unit) in an `Opaque`, which provides a pointer with
+writable provenance via `Opaque::get()`. This fixes the potential UB
+caused by mismatched pointer provenance.
+
+Reported-by: Alice Ryhl <aliceryhl@google.com>
+Signed-off-by: Boqun Feng <boqun.feng@gmail.com>
+Reviewed-by: Alice Ryhl <aliceryhl@google.com>
+Reviewed-by: Trevor Gross <tmgross@umich.edu>
+Reviewed-by: Benno Lossin <benno.lossin@proton.me>
+Reviewed-by: Gary Guo <gary@garyguo.net>
+Closes: https://rust-for-linux.zulipchat.com/#narrow/stream/x/topic/x/near/465412664
+Fixes: 1fbde52bde73 ("rust: add `macros` crate")
+Cc: stable@vger.kernel.org # 6.6.x: be2ca1e03965: ("rust: types: Make Opaque::get const")
+Link: https://lore.kernel.org/r/20240828180129.4046355-1-boqun.feng@gmail.com
+[ Fixed two typos, reworded title. - Miguel ]
+Signed-off-by: Miguel Ojeda <ojeda@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ rust/macros/module.rs | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+diff --git a/rust/macros/module.rs b/rust/macros/module.rs
+index 031028b3dc41..071b96639a2e 100644
+--- a/rust/macros/module.rs
++++ b/rust/macros/module.rs
+@@ -183,7 +183,11 @@ pub(crate) fn module(ts: TokenStream) -> TokenStream {
+             // freed until the module is unloaded.
+             #[cfg(MODULE)]
+             static THIS_MODULE: kernel::ThisModule = unsafe {{
+-                kernel::ThisModule::from_ptr(&kernel::bindings::__this_module as *const _ as *mut _)
++                extern \"C\" {{
++                    static __this_module: kernel::types::Opaque<kernel::bindings::module>;
++                }}
++
++                kernel::ThisModule::from_ptr(__this_module.get())
+             }};
+             #[cfg(not(MODULE))]
+             static THIS_MODULE: kernel::ThisModule = unsafe {{
+-- 
+2.43.0
+
diff --git a/queue-6.1/series b/queue-6.1/series

index 2e0a1d96d76cdf7a903f54f93c9683630c853711..eeb82f7d258e5a2a03178a9422404942857b9a44 100644 (file)
--- a/queue-6.1/series
+++ b/queue-6.1/series
@@ -133,3 +133,13 @@ selftests-mptcp-join-check-re-re-adding-id-0-signal.patch
  io_uring-io-wq-stop-setting-pf_no_setaffinity-on-io-wq-workers.patch
  io_uring-sqpoll-do-not-set-pf_no_setaffinity-on-sqpoll-threads.patch
  tcp-process-the-3rd-ack-with-sk_socket-for-tfo-mptcp.patch
+rust-macros-provide-correct-provenance-when-construc.patch
+fuse-add-expire-only-mode-to-fuse_notify_inval_entry.patch
+fuse-allow-non-extending-parallel-direct-writes-on-t.patch
+fuse-add-request-extension.patch
+fuse-fix-memory-leak-in-fuse_create_open.patch
+x86-mm-pae-make-pmd_t-similar-to-pte_t.patch
+mm-fix-pmd_read_atomic.patch
+mm-rename-pmd_read_atomic.patch
+userfaultfd-fix-checks-for-huge-pmds.patch
+net-mana-fix-error-handling-in-mana_create_txq-rxq-s.patch
diff --git a/queue-6.1/userfaultfd-fix-checks-for-huge-pmds.patch b/queue-6.1/userfaultfd-fix-checks-for-huge-pmds.patch

new file mode 100644 (file)

index 0000000..d08b8c4
--- /dev/null
+++ b/queue-6.1/userfaultfd-fix-checks-for-huge-pmds.patch
@@ -0,0 +1,146 @@
+From 170b9e3f04840e9c1a1e14c687b689e84d8f0d9d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 13 Aug 2024 22:25:21 +0200
+Subject: userfaultfd: fix checks for huge PMDs
+
+From: Jann Horn <jannh@google.com>
+
+[ Upstream commit 71c186efc1b2cf1aeabfeff3b9bd5ac4c5ac14d8 ]
+
+Patch series "userfaultfd: fix races around pmd_trans_huge() check", v2.
+
+The pmd_trans_huge() code in mfill_atomic() is wrong in three different
+ways depending on kernel version:
+
+1. The pmd_trans_huge() check is racy and can lead to a BUG_ON() (if you hit
+   the right two race windows) - I've tested this in a kernel build with
+   some extra mdelay() calls. See the commit message for a description
+   of the race scenario.
+   On older kernels (before 6.5), I think the same bug can even
+   theoretically lead to accessing transhuge page contents as a page table
+   if you hit the right 5 narrow race windows (I haven't tested this case).
+2. As pointed out by Qi Zheng, pmd_trans_huge() is not sufficient for
+   detecting PMDs that don't point to page tables.
+   On older kernels (before 6.5), you'd just have to win a single fairly
+   wide race to hit this.
+   I've tested this on 6.1 stable by racing migration (with a mdelay()
+   patched into try_to_migrate()) against UFFDIO_ZEROPAGE - on my x86
+   VM, that causes a kernel oops in ptlock_ptr().
+3. On newer kernels (>=6.5), for shmem mappings, khugepaged is allowed
+   to yank page tables out from under us (though I haven't tested that),
+   so I think the BUG_ON() checks in mfill_atomic() are just wrong.
+
+I decided to write two separate fixes for these (one fix for bugs 1+2, one
+fix for bug 3), so that the first fix can be backported to kernels
+affected by bugs 1+2.
+
+This patch (of 2):
+
+This fixes two issues.
+
+I discovered that the following race can occur:
+
+  mfill_atomic                other thread
+  ============                ============
+                              <zap PMD>
+  pmdp_get_lockless() [reads none pmd]
+  <bail if trans_huge>
+  <if none:>
+                              <pagefault creates transhuge zeropage>
+    __pte_alloc [no-op]
+                              <zap PMD>
+  <bail if pmd_trans_huge(*dst_pmd)>
+  BUG_ON(pmd_none(*dst_pmd))
+
+I have experimentally verified this in a kernel with extra mdelay() calls;
+the BUG_ON(pmd_none(*dst_pmd)) triggers.
+
+On kernels newer than commit 0d940a9b270b ("mm/pgtable: allow
+pte_offset_map[_lock]() to fail"), this can't lead to anything worse than
+a BUG_ON(), since the page table access helpers are actually designed to
+deal with page tables concurrently disappearing; but on older kernels
+(<=6.4), I think we could probably theoretically race past the two
+BUG_ON() checks and end up treating a hugepage as a page table.
+
+The second issue is that, as Qi Zheng pointed out, there are other types
+of huge PMDs that pmd_trans_huge() can't catch: devmap PMDs and swap PMDs
+(in particular, migration PMDs).
+
+On <=6.4, this is worse than the first issue: If mfill_atomic() runs on a
+PMD that contains a migration entry (which just requires winning a single,
+fairly wide race), it will pass the PMD to pte_offset_map_lock(), which
+assumes that the PMD points to a page table.
+
+Breakage follows: First, the kernel tries to take the PTE lock (which will
+crash or maybe worse if there is no "struct page" for the address bits in
+the migration entry PMD - I think at least on X86 there usually is no
+corresponding "struct page" thanks to the PTE inversion mitigation, amd64
+looks different).
+
+If that didn't crash, the kernel would next try to write a PTE into what
+it wrongly thinks is a page table.
+
+As part of fixing these issues, get rid of the check for pmd_trans_huge()
+before __pte_alloc() - that's redundant, we're going to have to check for
+that after the __pte_alloc() anyway.
+
+Backport note: pmdp_get_lockless() is pmd_read_atomic() in older kernels.
+
+Link: https://lkml.kernel.org/r/20240813-uffd-thp-flip-fix-v2-0-5efa61078a41@google.com
+Link: https://lkml.kernel.org/r/20240813-uffd-thp-flip-fix-v2-1-5efa61078a41@google.com
+Fixes: c1a4de99fada ("userfaultfd: mcopy_atomic|mfill_zeropage: UFFDIO_COPY|UFFDIO_ZEROPAGE preparation")
+Signed-off-by: Jann Horn <jannh@google.com>
+Acked-by: David Hildenbrand <david@redhat.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Jann Horn <jannh@google.com>
+Cc: Pavel Emelyanov <xemul@virtuozzo.com>
+Cc: Qi Zheng <zhengqi.arch@bytedance.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ mm/userfaultfd.c | 22 ++++++++++++----------
+ 1 file changed, 12 insertions(+), 10 deletions(-)
+
+diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
+index 5d873aadec76..50c01a7eb705 100644
+--- a/mm/userfaultfd.c
++++ b/mm/userfaultfd.c
+@@ -642,21 +642,23 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm,
+               }
+ 
+               dst_pmdval = pmdp_get_lockless(dst_pmd);
+-              /*
+-               * If the dst_pmd is mapped as THP don't
+-               * override it and just be strict.
+-               */
+-              if (unlikely(pmd_trans_huge(dst_pmdval))) {
+-                      err = -EEXIST;
+-                      break;
+-              }
+               if (unlikely(pmd_none(dst_pmdval)) &&
+                   unlikely(__pte_alloc(dst_mm, dst_pmd))) {
+                       err = -ENOMEM;
+                       break;
+               }
+-              /* If an huge pmd materialized from under us fail */
+-              if (unlikely(pmd_trans_huge(*dst_pmd))) {
++              dst_pmdval = pmdp_get_lockless(dst_pmd);
++              /*
++               * If the dst_pmd is THP don't override it and just be strict.
++               * (This includes the case where the PMD used to be THP and
++               * changed back to none after __pte_alloc().)
++               */
++              if (unlikely(!pmd_present(dst_pmdval) || pmd_trans_huge(dst_pmdval) ||
++                           pmd_devmap(dst_pmdval))) {
++                      err = -EEXIST;
++                      break;
++              }
++              if (unlikely(pmd_bad(dst_pmdval))) {
+                       err = -EFAULT;
+                       break;
+               }
+-- 
+2.43.0
+
diff --git a/queue-6.1/x86-mm-pae-make-pmd_t-similar-to-pte_t.patch b/queue-6.1/x86-mm-pae-make-pmd_t-similar-to-pte_t.patch

new file mode 100644 (file)

index 0000000..9390f00
--- /dev/null
+++ b/queue-6.1/x86-mm-pae-make-pmd_t-similar-to-pte_t.patch
@@ -0,0 +1,156 @@
+From 4eae6e27b21e427bd48973eec754e0833489829f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 26 Nov 2020 17:02:29 +0100
+Subject: x86/mm/pae: Make pmd_t similar to pte_t
+
+From: Peter Zijlstra <peterz@infradead.org>
+
+[ Upstream commit fbfdec9989e69e0b17aa3bf32fcb22d04cc33301 ]
+
+Instead of mucking about with at least 2 different ways of fudging
+it, do the same thing we do for pte_t.
+
+Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Link: https://lkml.kernel.org/r/20221022114424.580310787%40infradead.org
+Stable-dep-of: 71c186efc1b2 ("userfaultfd: fix checks for huge PMDs")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/x86/include/asm/pgtable-3level.h       | 42 +++++++--------------
+ arch/x86/include/asm/pgtable-3level_types.h |  7 ++++
+ arch/x86/include/asm/pgtable_64_types.h     |  1 +
+ arch/x86/include/asm/pgtable_types.h        |  4 +-
+ 4 files changed, 23 insertions(+), 31 deletions(-)
+
+diff --git a/arch/x86/include/asm/pgtable-3level.h b/arch/x86/include/asm/pgtable-3level.h
+index 28421a887209..28556d22feb8 100644
+--- a/arch/x86/include/asm/pgtable-3level.h
++++ b/arch/x86/include/asm/pgtable-3level.h
+@@ -87,7 +87,7 @@ static inline pmd_t pmd_read_atomic(pmd_t *pmdp)
+               ret |= ((pmdval_t)*(tmp + 1)) << 32;
+       }
+ 
+-      return (pmd_t) { ret };
++      return (pmd_t) { .pmd = ret };
+ }
+ 
+ static inline void native_set_pte_atomic(pte_t *ptep, pte_t pte)
+@@ -121,12 +121,11 @@ static inline void native_pte_clear(struct mm_struct *mm, unsigned long addr,
+       ptep->pte_high = 0;
+ }
+ 
+-static inline void native_pmd_clear(pmd_t *pmd)
++static inline void native_pmd_clear(pmd_t *pmdp)
+ {
+-      u32 *tmp = (u32 *)pmd;
+-      *tmp = 0;
++      pmdp->pmd_low = 0;
+       smp_wmb();
+-      *(tmp + 1) = 0;
++      pmdp->pmd_high = 0;
+ }
+ 
+ static inline void native_pud_clear(pud_t *pudp)
+@@ -162,25 +161,17 @@ static inline pte_t native_ptep_get_and_clear(pte_t *ptep)
+ #define native_ptep_get_and_clear(xp) native_local_ptep_get_and_clear(xp)
+ #endif
+ 
+-union split_pmd {
+-      struct {
+-              u32 pmd_low;
+-              u32 pmd_high;
+-      };
+-      pmd_t pmd;
+-};
+-
+ #ifdef CONFIG_SMP
+ static inline pmd_t native_pmdp_get_and_clear(pmd_t *pmdp)
+ {
+-      union split_pmd res, *orig = (union split_pmd *)pmdp;
++      pmd_t res;
+ 
+       /* xchg acts as a barrier before setting of the high bits */
+-      res.pmd_low = xchg(&orig->pmd_low, 0);
+-      res.pmd_high = orig->pmd_high;
+-      orig->pmd_high = 0;
++      res.pmd_low = xchg(&pmdp->pmd_low, 0);
++      res.pmd_high = READ_ONCE(pmdp->pmd_high);
++      WRITE_ONCE(pmdp->pmd_high, 0);
+ 
+-      return res.pmd;
++      return res;
+ }
+ #else
+ #define native_pmdp_get_and_clear(xp) native_local_pmdp_get_and_clear(xp)
+@@ -199,17 +190,12 @@ static inline pmd_t pmdp_establish(struct vm_area_struct *vma,
+        * anybody.
+        */
+       if (!(pmd_val(pmd) & _PAGE_PRESENT)) {
+-              union split_pmd old, new, *ptr;
+-
+-              ptr = (union split_pmd *)pmdp;
+-
+-              new.pmd = pmd;
+-
+               /* xchg acts as a barrier before setting of the high bits */
+-              old.pmd_low = xchg(&ptr->pmd_low, new.pmd_low);
+-              old.pmd_high = ptr->pmd_high;
+-              ptr->pmd_high = new.pmd_high;
+-              return old.pmd;
++              old.pmd_low = xchg(&pmdp->pmd_low, pmd.pmd_low);
++              old.pmd_high = READ_ONCE(pmdp->pmd_high);
++              WRITE_ONCE(pmdp->pmd_high, pmd.pmd_high);
++
++              return old;
+       }
+ 
+       do {
+diff --git a/arch/x86/include/asm/pgtable-3level_types.h b/arch/x86/include/asm/pgtable-3level_types.h
+index 56baf43befb4..80911349519e 100644
+--- a/arch/x86/include/asm/pgtable-3level_types.h
++++ b/arch/x86/include/asm/pgtable-3level_types.h
+@@ -18,6 +18,13 @@ typedef union {
+       };
+       pteval_t pte;
+ } pte_t;
++
++typedef union {
++      struct {
++              unsigned long pmd_low, pmd_high;
++      };
++      pmdval_t pmd;
++} pmd_t;
+ #endif        /* !__ASSEMBLY__ */
+ 
+ #define SHARED_KERNEL_PMD     (!static_cpu_has(X86_FEATURE_PTI))
+diff --git a/arch/x86/include/asm/pgtable_64_types.h b/arch/x86/include/asm/pgtable_64_types.h
+index 6c7f7c526450..4ea3755f2444 100644
+--- a/arch/x86/include/asm/pgtable_64_types.h
++++ b/arch/x86/include/asm/pgtable_64_types.h
+@@ -19,6 +19,7 @@ typedef unsigned long        pgdval_t;
+ typedef unsigned long pgprotval_t;
+ 
+ typedef struct { pteval_t pte; } pte_t;
++typedef struct { pmdval_t pmd; } pmd_t;
+ 
+ #ifdef CONFIG_X86_5LEVEL
+ extern unsigned int __pgtable_l5_enabled;
+diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h
+index e3028373f0b4..d0e9654d7272 100644
+--- a/arch/x86/include/asm/pgtable_types.h
++++ b/arch/x86/include/asm/pgtable_types.h
+@@ -363,11 +363,9 @@ static inline pudval_t native_pud_val(pud_t pud)
+ #endif
+ 
+ #if CONFIG_PGTABLE_LEVELS > 2
+-typedef struct { pmdval_t pmd; } pmd_t;
+-
+ static inline pmd_t native_make_pmd(pmdval_t val)
+ {
+-      return (pmd_t) { val };
++      return (pmd_t) { .pmd = val };
+ }
+ 
+ static inline pmdval_t native_pmd_val(pmd_t pmd)
+-- 
+2.43.0
+
author	Sasha Levin <sashal@kernel.org>
	Mon, 9 Sep 2024 12:47:27 +0000 (08:47 -0400)
committer	Sasha Levin <sashal@kernel.org>
	Mon, 9 Sep 2024 12:47:27 +0000 (08:47 -0400)
queue-6.1/fuse-add-expire-only-mode-to-fuse_notify_inval_entry.patch	[new file with mode: 0644]	patch \| blob
queue-6.1/fuse-add-request-extension.patch	[new file with mode: 0644]	patch \| blob
queue-6.1/fuse-allow-non-extending-parallel-direct-writes-on-t.patch	[new file with mode: 0644]	patch \| blob
queue-6.1/fuse-fix-memory-leak-in-fuse_create_open.patch	[new file with mode: 0644]	patch \| blob
queue-6.1/mm-fix-pmd_read_atomic.patch	[new file with mode: 0644]	patch \| blob
queue-6.1/mm-rename-pmd_read_atomic.patch	[new file with mode: 0644]	patch \| blob
queue-6.1/net-mana-fix-error-handling-in-mana_create_txq-rxq-s.patch	[new file with mode: 0644]	patch \| blob
queue-6.1/rust-macros-provide-correct-provenance-when-construc.patch	[new file with mode: 0644]	patch \| blob
queue-6.1/series		patch \| blob \| blame \| history
queue-6.1/userfaultfd-fix-checks-for-huge-pmds.patch	[new file with mode: 0644]	patch \| blob
queue-6.1/x86-mm-pae-make-pmd_t-similar-to-pte_t.patch	[new file with mode: 0644]	patch \| blob