Fixes for 5.15

author Sasha Levin <sashal@kernel.org>

Sat, 10 Sep 2022 17:15:16 +0000 (13:15 -0400)

committer Sasha Levin <sashal@kernel.org>

Sat, 10 Sep 2022 17:15:16 +0000 (13:15 -0400)
author Sasha Levin <sashal@kernel.org>
Sat, 10 Sep 2022 17:15:16 +0000 (13:15 -0400)
committer Sasha Levin <sashal@kernel.org>
Sat, 10 Sep 2022 17:15:16 +0000 (13:15 -0400)
diff --git a/queue-5.15/cgroup-elide-write-locking-threadgroup_rwsem-when-up.patch b/queue-5.15/cgroup-elide-write-locking-threadgroup_rwsem-when-up.patch

new file mode 100644 (file)

index 0000000..b16cdb4
--- /dev/null
+++ b/queue-5.15/cgroup-elide-write-locking-threadgroup_rwsem-when-up.patch
@@ -0,0 +1,81 @@
+From d549ecab002cd5302c23865230be6dccd58b4c3b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Jul 2022 18:38:15 -1000
+Subject: cgroup: Elide write-locking threadgroup_rwsem when updating csses on
+ an empty subtree
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Tejun Heo <tj@kernel.org>
+
+[ Upstream commit 671c11f0619e5ccb380bcf0f062f69ba95fc974a ]
+
+cgroup_update_dfl_csses() write-locks the threadgroup_rwsem as updating the
+csses can trigger process migrations. However, if the subtree doesn't
+contain any tasks, there aren't gonna be any cgroup migrations. This
+condition can be trivially detected by testing whether
+mgctx.preloaded_src_csets is empty. Elide write-locking threadgroup_rwsem if
+the subtree is empty.
+
+After this optimization, the usage pattern of creating a cgroup, enabling
+the necessary controllers, and then seeding it with CLONE_INTO_CGROUP and
+then removing the cgroup after it becomes empty doesn't need to write-lock
+threadgroup_rwsem at all.
+
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Cc: Christian Brauner <brauner@kernel.org>
+Cc: Michal Koutný <mkoutny@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/cgroup/cgroup.c | 16 +++++++++++++---
+ 1 file changed, 13 insertions(+), 3 deletions(-)
+
+diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
+index 416dd7db3fb2c..baebd1c7667b7 100644
+--- a/kernel/cgroup/cgroup.c
++++ b/kernel/cgroup/cgroup.c
+@@ -2949,12 +2949,11 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
+       struct cgroup_subsys_state *d_css;
+       struct cgroup *dsct;
+       struct css_set *src_cset;
++      bool has_tasks;
+       int ret;
+ 
+       lockdep_assert_held(&cgroup_mutex);
+ 
+-      percpu_down_write(&cgroup_threadgroup_rwsem);
+-
+       /* look up all csses currently attached to @cgrp's subtree */
+       spin_lock_irq(&css_set_lock);
+       cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
+@@ -2965,6 +2964,16 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
+       }
+       spin_unlock_irq(&css_set_lock);
+ 
++      /*
++       * We need to write-lock threadgroup_rwsem while migrating tasks.
++       * However, if there are no source csets for @cgrp, changing its
++       * controllers isn't gonna produce any task migrations and the
++       * write-locking can be skipped safely.
++       */
++      has_tasks = !list_empty(&mgctx.preloaded_src_csets);
++      if (has_tasks)
++              percpu_down_write(&cgroup_threadgroup_rwsem);
++
+       /* NULL dst indicates self on default hierarchy */
+       ret = cgroup_migrate_prepare_dst(&mgctx);
+       if (ret)
+@@ -2984,7 +2993,8 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
+       ret = cgroup_migrate_execute(&mgctx);
+ out_finish:
+       cgroup_migrate_finish(&mgctx);
+-      percpu_up_write(&cgroup_threadgroup_rwsem);
++      if (has_tasks)
++              percpu_up_write(&cgroup_threadgroup_rwsem);
+       return ret;
+ }
+ 
+-- 
+2.35.1
+
diff --git a/queue-5.15/cgroup-fix-threadgroup_rwsem-cpus_read_lock-deadlock.patch b/queue-5.15/cgroup-fix-threadgroup_rwsem-cpus_read_lock-deadlock.patch

new file mode 100644 (file)

index 0000000..aacdb27
--- /dev/null
+++ b/queue-5.15/cgroup-fix-threadgroup_rwsem-cpus_read_lock-deadlock.patch
@@ -0,0 +1,207 @@
+From 9964e8bbe57dcf67d4b53897cfc14a248f45657d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 15 Aug 2022 13:27:38 -1000
+Subject: cgroup: Fix threadgroup_rwsem <-> cpus_read_lock() deadlock
+
+From: Tejun Heo <tj@kernel.org>
+
+[ Upstream commit 4f7e7236435ca0abe005c674ebd6892c6e83aeb3 ]
+
+Bringing up a CPU may involve creating and destroying tasks which requires
+read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside
+cpus_read_lock(). However, cpuset's ->attach(), which may be called with
+threadgroup_rwsem write-locked, also wants to disable CPU hotplug and
+acquires cpus_read_lock(), leading to a deadlock.
+
+Fix it by guaranteeing that ->attach() is always called with CPU hotplug
+disabled and removing cpus_read_lock() call from cpuset_attach().
+
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Reviewed-and-tested-by: Imran Khan <imran.f.khan@oracle.com>
+Reported-and-tested-by: Xuewen Yan <xuewen.yan@unisoc.com>
+Fixes: 05c7b7a92cc8 ("cgroup/cpuset: Fix a race between cpuset_attach() and cpu hotplug")
+Cc: stable@vger.kernel.org # v5.17+
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/cgroup/cgroup.c | 77 +++++++++++++++++++++++++++++-------------
+ kernel/cgroup/cpuset.c |  3 +-
+ 2 files changed, 55 insertions(+), 25 deletions(-)
+
+diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
+index baebd1c7667b7..75c3881af0784 100644
+--- a/kernel/cgroup/cgroup.c
++++ b/kernel/cgroup/cgroup.c
+@@ -2345,6 +2345,47 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
+ }
+ EXPORT_SYMBOL_GPL(task_cgroup_path);
+ 
++/**
++ * cgroup_attach_lock - Lock for ->attach()
++ * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem
++ *
++ * cgroup migration sometimes needs to stabilize threadgroups against forks and
++ * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()
++ * implementations (e.g. cpuset), also need to disable CPU hotplug.
++ * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can
++ * lead to deadlocks.
++ *
++ * Bringing up a CPU may involve creating and destroying tasks which requires
++ * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside
++ * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while
++ * write-locking threadgroup_rwsem, the locking order is reversed and we end up
++ * waiting for an on-going CPU hotplug operation which in turn is waiting for
++ * the threadgroup_rwsem to be released to create new tasks. For more details:
++ *
++ *   http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu
++ *
++ * Resolve the situation by always acquiring cpus_read_lock() before optionally
++ * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
++ * CPU hotplug is disabled on entry.
++ */
++static void cgroup_attach_lock(bool lock_threadgroup)
++{
++      cpus_read_lock();
++      if (lock_threadgroup)
++              percpu_down_write(&cgroup_threadgroup_rwsem);
++}
++
++/**
++ * cgroup_attach_unlock - Undo cgroup_attach_lock()
++ * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
++ */
++static void cgroup_attach_unlock(bool lock_threadgroup)
++{
++      if (lock_threadgroup)
++              percpu_up_write(&cgroup_threadgroup_rwsem);
++      cpus_read_unlock();
++}
++
+ /**
+  * cgroup_migrate_add_task - add a migration target task to a migration context
+  * @task: target task
+@@ -2821,8 +2862,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
+ }
+ 
+ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
+-                                           bool *locked)
+-      __acquires(&cgroup_threadgroup_rwsem)
++                                           bool *threadgroup_locked)
+ {
+       struct task_struct *tsk;
+       pid_t pid;
+@@ -2839,12 +2879,8 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
+        * Therefore, we can skip the global lock.
+        */
+       lockdep_assert_held(&cgroup_mutex);
+-      if (pid || threadgroup) {
+-              percpu_down_write(&cgroup_threadgroup_rwsem);
+-              *locked = true;
+-      } else {
+-              *locked = false;
+-      }
++      *threadgroup_locked = pid || threadgroup;
++      cgroup_attach_lock(*threadgroup_locked);
+ 
+       rcu_read_lock();
+       if (pid) {
+@@ -2875,17 +2911,14 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
+       goto out_unlock_rcu;
+ 
+ out_unlock_threadgroup:
+-      if (*locked) {
+-              percpu_up_write(&cgroup_threadgroup_rwsem);
+-              *locked = false;
+-      }
++      cgroup_attach_unlock(*threadgroup_locked);
++      *threadgroup_locked = false;
+ out_unlock_rcu:
+       rcu_read_unlock();
+       return tsk;
+ }
+ 
+-void cgroup_procs_write_finish(struct task_struct *task, bool locked)
+-      __releases(&cgroup_threadgroup_rwsem)
++void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked)
+ {
+       struct cgroup_subsys *ss;
+       int ssid;
+@@ -2893,8 +2926,8 @@ void cgroup_procs_write_finish(struct task_struct *task, bool locked)
+       /* release reference from cgroup_procs_write_start() */
+       put_task_struct(task);
+ 
+-      if (locked)
+-              percpu_up_write(&cgroup_threadgroup_rwsem);
++      cgroup_attach_unlock(threadgroup_locked);
++
+       for_each_subsys(ss, ssid)
+               if (ss->post_attach)
+                       ss->post_attach();
+@@ -2971,8 +3004,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
+        * write-locking can be skipped safely.
+        */
+       has_tasks = !list_empty(&mgctx.preloaded_src_csets);
+-      if (has_tasks)
+-              percpu_down_write(&cgroup_threadgroup_rwsem);
++      cgroup_attach_lock(has_tasks);
+ 
+       /* NULL dst indicates self on default hierarchy */
+       ret = cgroup_migrate_prepare_dst(&mgctx);
+@@ -2993,8 +3025,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
+       ret = cgroup_migrate_execute(&mgctx);
+ out_finish:
+       cgroup_migrate_finish(&mgctx);
+-      if (has_tasks)
+-              percpu_up_write(&cgroup_threadgroup_rwsem);
++      cgroup_attach_unlock(has_tasks);
+       return ret;
+ }
+ 
+@@ -4942,13 +4973,13 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
+       struct task_struct *task;
+       const struct cred *saved_cred;
+       ssize_t ret;
+-      bool locked;
++      bool threadgroup_locked;
+ 
+       dst_cgrp = cgroup_kn_lock_live(of->kn, false);
+       if (!dst_cgrp)
+               return -ENODEV;
+ 
+-      task = cgroup_procs_write_start(buf, threadgroup, &locked);
++      task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked);
+       ret = PTR_ERR_OR_ZERO(task);
+       if (ret)
+               goto out_unlock;
+@@ -4974,7 +5005,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
+       ret = cgroup_attach_task(dst_cgrp, task, threadgroup);
+ 
+ out_finish:
+-      cgroup_procs_write_finish(task, locked);
++      cgroup_procs_write_finish(task, threadgroup_locked);
+ out_unlock:
+       cgroup_kn_unlock(of->kn);
+ 
+diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
+index 9c5b659db63f4..3213d3c8ea0a8 100644
+--- a/kernel/cgroup/cpuset.c
++++ b/kernel/cgroup/cpuset.c
+@@ -2249,7 +2249,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
+       cgroup_taskset_first(tset, &css);
+       cs = css_cs(css);
+ 
+-      cpus_read_lock();
++      lockdep_assert_cpus_held();     /* see cgroup_attach_lock() */
+       percpu_down_write(&cpuset_rwsem);
+ 
+       guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+@@ -2303,7 +2303,6 @@ static void cpuset_attach(struct cgroup_taskset *tset)
+               wake_up(&cpuset_attach_wq);
+ 
+       percpu_up_write(&cpuset_rwsem);
+-      cpus_read_unlock();
+ }
+ 
+ /* The various types of files and directories in a cpuset file system */
+-- 
+2.35.1
+
diff --git a/queue-5.15/nfs-fix-another-fsync-issue-after-a-server-reboot.patch b/queue-5.15/nfs-fix-another-fsync-issue-after-a-server-reboot.patch

new file mode 100644 (file)

index 0000000..31e58b8
--- /dev/null
+++ b/queue-5.15/nfs-fix-another-fsync-issue-after-a-server-reboot.patch
@@ -0,0 +1,125 @@
+From 1d960ddcb28ad2a6a22bac66b4b9e0a363f9dd45 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 13 Aug 2022 08:22:25 -0400
+Subject: NFS: Fix another fsync() issue after a server reboot
+
+From: Trond Myklebust <trond.myklebust@hammerspace.com>
+
+[ Upstream commit 67f4b5dc49913abcdb5cc736e73674e2f352f81d ]
+
+Currently, when the writeback code detects a server reboot, it redirties
+any pages that were not committed to disk, and it sets the flag
+NFS_CONTEXT_RESEND_WRITES in the nfs_open_context of the file descriptor
+that dirtied the file. While this allows the file descriptor in question
+to redrive its own writes, it violates the fsync() requirement that we
+should be synchronising all writes to disk.
+While the problem is infrequent, we do see corner cases where an
+untimely server reboot causes the fsync() call to abandon its attempt to
+sync data to disk and causes data corruption issues due to missed error
+conditions or similar.
+
+In order to tighten up the client's ability to deal with this situation
+without introducing livelocks, add a counter that records the number of
+times pages are redirtied due to a server reboot-like condition, and use
+that in fsync() to redrive the sync to disk.
+
+Fixes: 2197e9b06c22 ("NFS: Fix up fsync() when the server rebooted")
+Cc: stable@vger.kernel.org
+Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/nfs/file.c          | 15 ++++++---------
+ fs/nfs/inode.c         |  1 +
+ fs/nfs/write.c         |  6 ++++--
+ include/linux/nfs_fs.h |  1 +
+ 4 files changed, 12 insertions(+), 11 deletions(-)
+
+diff --git a/fs/nfs/file.c b/fs/nfs/file.c
+index a8693cc50c7ca..ad5114e480097 100644
+--- a/fs/nfs/file.c
++++ b/fs/nfs/file.c
+@@ -223,8 +223,10 @@ nfs_file_fsync_commit(struct file *file, int datasync)
+ int
+ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+ {
+-      struct nfs_open_context *ctx = nfs_file_open_context(file);
+       struct inode *inode = file_inode(file);
++      struct nfs_inode *nfsi = NFS_I(inode);
++      long save_nredirtied = atomic_long_read(&nfsi->redirtied_pages);
++      long nredirtied;
+       int ret;
+ 
+       trace_nfs_fsync_enter(inode);
+@@ -239,15 +241,10 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+               ret = pnfs_sync_inode(inode, !!datasync);
+               if (ret != 0)
+                       break;
+-              if (!test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags))
++              nredirtied = atomic_long_read(&nfsi->redirtied_pages);
++              if (nredirtied == save_nredirtied)
+                       break;
+-              /*
+-               * If nfs_file_fsync_commit detected a server reboot, then
+-               * resend all dirty pages that might have been covered by
+-               * the NFS_CONTEXT_RESEND_WRITES flag
+-               */
+-              start = 0;
+-              end = LLONG_MAX;
++              save_nredirtied = nredirtied;
+       }
+ 
+       trace_nfs_fsync_exit(inode, ret);
+diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
+index dc057ab6b30d1..e4524635a129a 100644
+--- a/fs/nfs/inode.c
++++ b/fs/nfs/inode.c
+@@ -434,6 +434,7 @@ nfs_ilookup(struct super_block *sb, struct nfs_fattr *fattr, struct nfs_fh *fh)
+ static void nfs_inode_init_regular(struct nfs_inode *nfsi)
+ {
+       atomic_long_set(&nfsi->nrequests, 0);
++      atomic_long_set(&nfsi->redirtied_pages, 0);
+       INIT_LIST_HEAD(&nfsi->commit_info.list);
+       atomic_long_set(&nfsi->commit_info.ncommit, 0);
+       atomic_set(&nfsi->commit_info.rpcs_out, 0);
+diff --git a/fs/nfs/write.c b/fs/nfs/write.c
+index cdb29fd235492..be70874bc3292 100644
+--- a/fs/nfs/write.c
++++ b/fs/nfs/write.c
+@@ -1394,10 +1394,12 @@ static void nfs_initiate_write(struct nfs_pgio_header *hdr,
+  */
+ static void nfs_redirty_request(struct nfs_page *req)
+ {
++      struct nfs_inode *nfsi = NFS_I(page_file_mapping(req->wb_page)->host);
++
+       /* Bump the transmission count */
+       req->wb_nio++;
+       nfs_mark_request_dirty(req);
+-      set_bit(NFS_CONTEXT_RESEND_WRITES, &nfs_req_openctx(req)->flags);
++      atomic_long_inc(&nfsi->redirtied_pages);
+       nfs_end_page_writeback(req);
+       nfs_release_request(req);
+ }
+@@ -1870,7 +1872,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
+               /* We have a mismatch. Write the page again */
+               dprintk_cont(" mismatch\n");
+               nfs_mark_request_dirty(req);
+-              set_bit(NFS_CONTEXT_RESEND_WRITES, &nfs_req_openctx(req)->flags);
++              atomic_long_inc(&NFS_I(data->inode)->redirtied_pages);
+       next:
+               nfs_unlock_and_release_request(req);
+               /* Latency breaker */
+diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
+index d0855352cd6fc..71467d661fb66 100644
+--- a/include/linux/nfs_fs.h
++++ b/include/linux/nfs_fs.h
+@@ -180,6 +180,7 @@ struct nfs_inode {
+               /* Regular file */
+               struct {
+                       atomic_long_t   nrequests;
++                      atomic_long_t   redirtied_pages;
+                       struct nfs_mds_commit_info commit_info;
+                       struct mutex    commit_mutex;
+               };
+-- 
+2.35.1
+
diff --git a/queue-5.15/nfs-further-optimisations-for-ls-l.patch b/queue-5.15/nfs-further-optimisations-for-ls-l.patch

new file mode 100644 (file)

index 0000000..d4a51c5
--- /dev/null
+++ b/queue-5.15/nfs-further-optimisations-for-ls-l.patch
@@ -0,0 +1,134 @@
+From 824880fe84e32e08ef97657f71ae5568a73d6899 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 28 Sep 2021 14:33:44 -0400
+Subject: NFS: Further optimisations for 'ls -l'
+
+From: Trond Myklebust <trond.myklebust@hammerspace.com>
+
+[ Upstream commit ff81dfb5d721fff87bd516c558847f6effb70031 ]
+
+If a user is doing 'ls -l', we have a heuristic in GETATTR that tells
+the readdir code to try to use READDIRPLUS in order to refresh the inode
+attributes. In certain circumstances, we also try to invalidate the
+remaining directory entries in order to ensure this refresh.
+
+If there are multiple readers of the directory, we probably should avoid
+invalidating the page cache, since the heuristic breaks down in that
+situation anyway.
+
+Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
+Tested-by: Benjamin Coddington <bcodding@redhat.com>
+Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/nfs/dir.c           | 16 +++++++++++-----
+ include/linux/nfs_fs.h |  5 ++---
+ 2 files changed, 13 insertions(+), 8 deletions(-)
+
+diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
+index 78219396788b4..32c3d0c454b19 100644
+--- a/fs/nfs/dir.c
++++ b/fs/nfs/dir.c
+@@ -78,6 +78,7 @@ static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir
+               ctx->attr_gencount = nfsi->attr_gencount;
+               ctx->dir_cookie = 0;
+               ctx->dup_cookie = 0;
++              ctx->page_index = 0;
+               spin_lock(&dir->i_lock);
+               if (list_empty(&nfsi->open_files) &&
+                   (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER))
+@@ -85,6 +86,7 @@ static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir
+                                             NFS_INO_INVALID_DATA |
+                                                     NFS_INO_REVAL_FORCED);
+               list_add(&ctx->list, &nfsi->open_files);
++              clear_bit(NFS_INO_FORCE_READDIR, &nfsi->flags);
+               spin_unlock(&dir->i_lock);
+               return ctx;
+       }
+@@ -626,8 +628,7 @@ void nfs_force_use_readdirplus(struct inode *dir)
+       if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) &&
+           !list_empty(&nfsi->open_files)) {
+               set_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags);
+-              invalidate_mapping_pages(dir->i_mapping,
+-                      nfsi->page_index + 1, -1);
++              set_bit(NFS_INO_FORCE_READDIR, &nfsi->flags);
+       }
+ }
+ 
+@@ -938,10 +939,8 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
+                              sizeof(nfsi->cookieverf));
+       }
+       res = nfs_readdir_search_array(desc);
+-      if (res == 0) {
+-              nfsi->page_index = desc->page_index;
++      if (res == 0)
+               return 0;
+-      }
+       nfs_readdir_page_unlock_and_put_cached(desc);
+       return res;
+ }
+@@ -1081,6 +1080,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
+       struct nfs_inode *nfsi = NFS_I(inode);
+       struct nfs_open_dir_context *dir_ctx = file->private_data;
+       struct nfs_readdir_descriptor *desc;
++      pgoff_t page_index;
+       int res;
+ 
+       dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n",
+@@ -1111,10 +1111,15 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
+       desc->dir_cookie = dir_ctx->dir_cookie;
+       desc->dup_cookie = dir_ctx->dup_cookie;
+       desc->duped = dir_ctx->duped;
++      page_index = dir_ctx->page_index;
+       desc->attr_gencount = dir_ctx->attr_gencount;
+       memcpy(desc->verf, dir_ctx->verf, sizeof(desc->verf));
+       spin_unlock(&file->f_lock);
+ 
++      if (test_and_clear_bit(NFS_INO_FORCE_READDIR, &nfsi->flags) &&
++          list_is_singular(&nfsi->open_files))
++              invalidate_mapping_pages(inode->i_mapping, page_index + 1, -1);
++
+       do {
+               res = readdir_search_pagecache(desc);
+ 
+@@ -1151,6 +1156,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
+       dir_ctx->dup_cookie = desc->dup_cookie;
+       dir_ctx->duped = desc->duped;
+       dir_ctx->attr_gencount = desc->attr_gencount;
++      dir_ctx->page_index = desc->page_index;
+       memcpy(dir_ctx->verf, desc->verf, sizeof(dir_ctx->verf));
+       spin_unlock(&file->f_lock);
+ 
+diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
+index 66b6cc24ab8c9..be8625d8a10a7 100644
+--- a/include/linux/nfs_fs.h
++++ b/include/linux/nfs_fs.h
+@@ -103,6 +103,7 @@ struct nfs_open_dir_context {
+       __be32  verf[NFS_DIR_VERIFIER_SIZE];
+       __u64 dir_cookie;
+       __u64 dup_cookie;
++      pgoff_t page_index;
+       signed char duped;
+ };
+ 
+@@ -181,9 +182,6 @@ struct nfs_inode {
+       struct rw_semaphore     rmdir_sem;
+       struct mutex            commit_mutex;
+ 
+-      /* track last access to cached pages */
+-      unsigned long           page_index;
+-
+ #if IS_ENABLED(CONFIG_NFS_V4)
+       struct nfs4_cached_acl  *nfs4_acl;
+         /* NFSv4 state */
+@@ -272,6 +270,7 @@ struct nfs4_copy_state {
+ #define NFS_INO_INVALIDATING  (3)             /* inode is being invalidated */
+ #define NFS_INO_FSCACHE               (5)             /* inode can be cached by FS-Cache */
+ #define NFS_INO_FSCACHE_LOCK  (6)             /* FS-Cache cookie management lock */
++#define NFS_INO_FORCE_READDIR (7)             /* force readdirplus */
+ #define NFS_INO_LAYOUTCOMMIT  (9)             /* layoutcommit required */
+ #define NFS_INO_LAYOUTCOMMITTING (10)         /* layoutcommit inflight */
+ #define NFS_INO_LAYOUTSTATS   (11)            /* layoutstats inflight */
+-- 
+2.35.1
+
diff --git a/queue-5.15/nfs-save-some-space-in-the-inode.patch b/queue-5.15/nfs-save-some-space-in-the-inode.patch

new file mode 100644 (file)

index 0000000..d3fc8e8
--- /dev/null
+++ b/queue-5.15/nfs-save-some-space-in-the-inode.patch
@@ -0,0 +1,147 @@
+From a3af8f4666db612da3ab84dde6b1889b01fb91e5 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 28 Sep 2021 17:41:41 -0400
+Subject: NFS: Save some space in the inode
+
+From: Trond Myklebust <trond.myklebust@hammerspace.com>
+
+[ Upstream commit e591b298d7ecb851e200f65946e3d53fe78a3c4f ]
+
+Save some space in the nfs_inode by setting up an anonymous union with
+the fields that are peculiar to a specific type of filesystem object.
+
+Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/nfs/inode.c         | 26 ++++++++++++++++++--------
+ include/linux/nfs_fs.h | 42 ++++++++++++++++++++++++------------------
+ 2 files changed, 42 insertions(+), 26 deletions(-)
+
+diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
+index cb407af9e9e92..dc057ab6b30d1 100644
+--- a/fs/nfs/inode.c
++++ b/fs/nfs/inode.c
+@@ -431,6 +431,22 @@ nfs_ilookup(struct super_block *sb, struct nfs_fattr *fattr, struct nfs_fh *fh)
+       return inode;
+ }
+ 
++static void nfs_inode_init_regular(struct nfs_inode *nfsi)
++{
++      atomic_long_set(&nfsi->nrequests, 0);
++      INIT_LIST_HEAD(&nfsi->commit_info.list);
++      atomic_long_set(&nfsi->commit_info.ncommit, 0);
++      atomic_set(&nfsi->commit_info.rpcs_out, 0);
++      mutex_init(&nfsi->commit_mutex);
++}
++
++static void nfs_inode_init_dir(struct nfs_inode *nfsi)
++{
++      nfsi->cache_change_attribute = 0;
++      memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
++      init_rwsem(&nfsi->rmdir_sem);
++}
++
+ /*
+  * This is our front-end to iget that looks up inodes by file handle
+  * instead of inode number.
+@@ -485,10 +501,12 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
+               if (S_ISREG(inode->i_mode)) {
+                       inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops;
+                       inode->i_data.a_ops = &nfs_file_aops;
++                      nfs_inode_init_regular(nfsi);
+               } else if (S_ISDIR(inode->i_mode)) {
+                       inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
+                       inode->i_fop = &nfs_dir_operations;
+                       inode->i_data.a_ops = &nfs_dir_aops;
++                      nfs_inode_init_dir(nfsi);
+                       /* Deal with crossing mountpoints */
+                       if (fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT ||
+                                       fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) {
+@@ -514,7 +532,6 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
+               inode->i_uid = make_kuid(&init_user_ns, -2);
+               inode->i_gid = make_kgid(&init_user_ns, -2);
+               inode->i_blocks = 0;
+-              memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
+               nfsi->write_io = 0;
+               nfsi->read_io = 0;
+ 
+@@ -2282,14 +2299,7 @@ static void init_once(void *foo)
+       INIT_LIST_HEAD(&nfsi->open_files);
+       INIT_LIST_HEAD(&nfsi->access_cache_entry_lru);
+       INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
+-      INIT_LIST_HEAD(&nfsi->commit_info.list);
+-      atomic_long_set(&nfsi->nrequests, 0);
+-      atomic_long_set(&nfsi->commit_info.ncommit, 0);
+-      atomic_set(&nfsi->commit_info.rpcs_out, 0);
+-      init_rwsem(&nfsi->rmdir_sem);
+-      mutex_init(&nfsi->commit_mutex);
+       nfs4_init_once(nfsi);
+-      nfsi->cache_change_attribute = 0;
+ }
+ 
+ static int __init nfs_init_inodecache(void)
+diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
+index be8625d8a10a7..d0855352cd6fc 100644
+--- a/include/linux/nfs_fs.h
++++ b/include/linux/nfs_fs.h
+@@ -155,33 +155,39 @@ struct nfs_inode {
+       unsigned long           attrtimeo_timestamp;
+ 
+       unsigned long           attr_gencount;
+-      /* "Generation counter" for the attribute cache. This is
+-       * bumped whenever we update the metadata on the
+-       * server.
+-       */
+-      unsigned long           cache_change_attribute;
+ 
+       struct rb_root          access_cache;
+       struct list_head        access_cache_entry_lru;
+       struct list_head        access_cache_inode_lru;
+ 
+-      /*
+-       * This is the cookie verifier used for NFSv3 readdir
+-       * operations
+-       */
+-      __be32                  cookieverf[NFS_DIR_VERIFIER_SIZE];
+-
+-      atomic_long_t           nrequests;
+-      struct nfs_mds_commit_info commit_info;
++      union {
++              /* Directory */
++              struct {
++                      /* "Generation counter" for the attribute cache.
++                       * This is bumped whenever we update the metadata
++                       * on the server.
++                       */
++                      unsigned long   cache_change_attribute;
++                      /*
++                       * This is the cookie verifier used for NFSv3 readdir
++                       * operations
++                       */
++                      __be32          cookieverf[NFS_DIR_VERIFIER_SIZE];
++                      /* Readers: in-flight sillydelete RPC calls */
++                      /* Writers: rmdir */
++                      struct rw_semaphore     rmdir_sem;
++              };
++              /* Regular file */
++              struct {
++                      atomic_long_t   nrequests;
++                      struct nfs_mds_commit_info commit_info;
++                      struct mutex    commit_mutex;
++              };
++      };
+ 
+       /* Open contexts for shared mmap writes */
+       struct list_head        open_files;
+ 
+-      /* Readers: in-flight sillydelete RPC calls */
+-      /* Writers: rmdir */
+-      struct rw_semaphore     rmdir_sem;
+-      struct mutex            commit_mutex;
+-
+ #if IS_ENABLED(CONFIG_NFS_V4)
+       struct nfs4_cached_acl  *nfs4_acl;
+         /* NFSv4 state */
+-- 
+2.35.1
+
diff --git a/queue-5.15/riscv-dts-microchip-mpfs-fix-reference-clock-node.patch b/queue-5.15/riscv-dts-microchip-mpfs-fix-reference-clock-node.patch

new file mode 100644 (file)

index 0000000..52491d8
--- /dev/null
+++ b/queue-5.15/riscv-dts-microchip-mpfs-fix-reference-clock-node.patch
@@ -0,0 +1,79 @@
+From 4fa8e7d65f7a720ac13f22aab2ce447b921dbf97 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 17 Dec 2021 13:49:26 +0100
+Subject: riscv: dts: microchip: mpfs: Fix reference clock node
+
+From: Geert Uytterhoeven <geert@linux-m68k.org>
+
+[ Upstream commit 9d7b3078628f591e4007210c0d5d3f94805cff55 ]
+
+"make dtbs_check" reports:
+
+    arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dt.yaml: soc: refclk: {'compatible': ['fixed-clock'], '#clock-cells': [[0]], 'clock-frequency': [[600000000]], 'clock-output-names': ['msspllclk'], 'phandle': [[7]]} should not be valid under {'type': 'object'}
+       From schema: dtschema/schemas/simple-bus.yaml
+
+Fix this by moving the node out of the "soc" subnode.
+While at it, rename it to "msspllclk", and drop the now superfluous
+"clock-output-names" property.
+Move the actual clock-frequency value to the board DTS, since it is not
+set until bitstream programming time.
+
+Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
+Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
+Reviewed-by: Conor Dooley <conor.dooley@microchip.com>
+Tested-by: Conor Dooley <conor.dooley@microchip.com>
+Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../boot/dts/microchip/microchip-mpfs-icicle-kit.dts |  4 ++++
+ arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi    | 12 +++++-------
+ 2 files changed, 9 insertions(+), 7 deletions(-)
+
+diff --git a/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts b/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts
+index cce5eca31f257..4b69ab4ff30a2 100644
+--- a/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts
++++ b/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts
+@@ -40,6 +40,10 @@
+       };
+ };
+ 
++&refclk {
++      clock-frequency = <600000000>;
++};
++
+ &serial0 {
+       status = "okay";
+ };
+diff --git a/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi b/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi
+index 4ef4bcb748729..9279ccf20009a 100644
+--- a/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi
++++ b/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi
+@@ -139,6 +139,11 @@
+               };
+       };
+ 
++      refclk: msspllclk {
++              compatible = "fixed-clock";
++              #clock-cells = <0>;
++      };
++
+       soc {
+               #address-cells = <2>;
+               #size-cells = <2>;
+@@ -188,13 +193,6 @@
+                       #dma-cells = <1>;
+               };
+ 
+-              refclk: refclk {
+-                      compatible = "fixed-clock";
+-                      #clock-cells = <0>;
+-                      clock-frequency = <600000000>;
+-                      clock-output-names = "msspllclk";
+-              };
+-
+               clkcfg: clkcfg@20002000 {
+                       compatible = "microchip,mpfs-clkcfg";
+                       reg = <0x0 0x20002000 0x0 0x1000>;
+-- 
+2.35.1
+
diff --git a/queue-5.15/series b/queue-5.15/series

index 0bf0d7eb766e41dc11d50fefad106bc5ec122845..1451c2e7b21efcdabe1d304a8a4d15124edff4cb 100644 (file)
--- a/queue-5.15/series
+++ b/queue-5.15/series
@@ -40,3 +40,9 @@ nvmet-fix-a-use-after-free.patch
  drm-i915-implement-waedplinkratedatareload.patch
  scsi-mpt3sas-fix-use-after-free-warning.patch
  scsi-lpfc-add-missing-destroy_workqueue-in-error-path.patch
+nfs-further-optimisations-for-ls-l.patch
+nfs-save-some-space-in-the-inode.patch
+nfs-fix-another-fsync-issue-after-a-server-reboot.patch
+cgroup-elide-write-locking-threadgroup_rwsem-when-up.patch
+cgroup-fix-threadgroup_rwsem-cpus_read_lock-deadlock.patch
+riscv-dts-microchip-mpfs-fix-reference-clock-node.patch
author	Sasha Levin <sashal@kernel.org>
	Sat, 10 Sep 2022 17:15:16 +0000 (13:15 -0400)
committer	Sasha Levin <sashal@kernel.org>
	Sat, 10 Sep 2022 17:15:16 +0000 (13:15 -0400)
queue-5.15/cgroup-elide-write-locking-threadgroup_rwsem-when-up.patch	[new file with mode: 0644]	patch \| blob
queue-5.15/cgroup-fix-threadgroup_rwsem-cpus_read_lock-deadlock.patch	[new file with mode: 0644]	patch \| blob
queue-5.15/nfs-fix-another-fsync-issue-after-a-server-reboot.patch	[new file with mode: 0644]	patch \| blob
queue-5.15/nfs-further-optimisations-for-ls-l.patch	[new file with mode: 0644]	patch \| blob
queue-5.15/nfs-save-some-space-in-the-inode.patch	[new file with mode: 0644]	patch \| blob
queue-5.15/riscv-dts-microchip-mpfs-fix-reference-clock-node.patch	[new file with mode: 0644]	patch \| blob
queue-5.15/series		patch \| blob \| blame \| history