From: Sasha Levin <sashal@kernel.org>
Date: Sat, 10 Sep 2022 17:15:16 +0000 (-0400)
Subject: Fixes for 5.15
X-Git-Tag: v5.19.9~48
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=eca155cb697c9f903287102b1c30e77d43581c69;p=thirdparty%2Fkernel%2Fstable-queue.git

Fixes for 5.15

Signed-off-by: Sasha Levin <sashal@kernel.org>
---

diff --git a/queue-5.15/cgroup-elide-write-locking-threadgroup_rwsem-when-up.patch b/queue-5.15/cgroup-elide-write-locking-threadgroup_rwsem-when-up.patch
new file mode 100644
index 00000000000..b16cdb48b8f
--- /dev/null
+++ b/queue-5.15/cgroup-elide-write-locking-threadgroup_rwsem-when-up.patch
@@ -0,0 +1,81 @@
+From d549ecab002cd5302c23865230be6dccd58b4c3b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 14 Jul 2022 18:38:15 -1000
+Subject: cgroup: Elide write-locking threadgroup_rwsem when updating csses on
+ an empty subtree
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Tejun Heo <tj@kernel.org>
+
+[ Upstream commit 671c11f0619e5ccb380bcf0f062f69ba95fc974a ]
+
+cgroup_update_dfl_csses() write-locks the threadgroup_rwsem as updating the
+csses can trigger process migrations. However, if the subtree doesn't
+contain any tasks, there aren't gonna be any cgroup migrations. This
+condition can be trivially detected by testing whether
+mgctx.preloaded_src_csets is empty. Elide write-locking threadgroup_rwsem if
+the subtree is empty.
+
+After this optimization, the usage pattern of creating a cgroup, enabling
+the necessary controllers, and then seeding it with CLONE_INTO_CGROUP and
+then removing the cgroup after it becomes empty doesn't need to write-lock
+threadgroup_rwsem at all.
+
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Cc: Christian Brauner <brauner@kernel.org>
+Cc: Michal Koutný <mkoutny@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/cgroup/cgroup.c | 16 +++++++++++++---
+ 1 file changed, 13 insertions(+), 3 deletions(-)
+
+diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
+index 416dd7db3fb2c..baebd1c7667b7 100644
+--- a/kernel/cgroup/cgroup.c
++++ b/kernel/cgroup/cgroup.c
+@@ -2949,12 +2949,11 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
+ 	struct cgroup_subsys_state *d_css;
+ 	struct cgroup *dsct;
+ 	struct css_set *src_cset;
++	bool has_tasks;
+ 	int ret;
+ 
+ 	lockdep_assert_held(&cgroup_mutex);
+ 
+-	percpu_down_write(&cgroup_threadgroup_rwsem);
+-
+ 	/* look up all csses currently attached to @cgrp's subtree */
+ 	spin_lock_irq(&css_set_lock);
+ 	cgroup_for_each_live_descendant_pre(dsct, d_css, cgrp) {
+@@ -2965,6 +2964,16 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
+ 	}
+ 	spin_unlock_irq(&css_set_lock);
+ 
++	/*
++	 * We need to write-lock threadgroup_rwsem while migrating tasks.
++	 * However, if there are no source csets for @cgrp, changing its
++	 * controllers isn't gonna produce any task migrations and the
++	 * write-locking can be skipped safely.
++	 */
++	has_tasks = !list_empty(&mgctx.preloaded_src_csets);
++	if (has_tasks)
++		percpu_down_write(&cgroup_threadgroup_rwsem);
++
+ 	/* NULL dst indicates self on default hierarchy */
+ 	ret = cgroup_migrate_prepare_dst(&mgctx);
+ 	if (ret)
+@@ -2984,7 +2993,8 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
+ 	ret = cgroup_migrate_execute(&mgctx);
+ out_finish:
+ 	cgroup_migrate_finish(&mgctx);
+-	percpu_up_write(&cgroup_threadgroup_rwsem);
++	if (has_tasks)
++		percpu_up_write(&cgroup_threadgroup_rwsem);
+ 	return ret;
+ }
+ 
+-- 
+2.35.1
+
diff --git a/queue-5.15/cgroup-fix-threadgroup_rwsem-cpus_read_lock-deadlock.patch b/queue-5.15/cgroup-fix-threadgroup_rwsem-cpus_read_lock-deadlock.patch
new file mode 100644
index 00000000000..aacdb276936
--- /dev/null
+++ b/queue-5.15/cgroup-fix-threadgroup_rwsem-cpus_read_lock-deadlock.patch
@@ -0,0 +1,207 @@
+From 9964e8bbe57dcf67d4b53897cfc14a248f45657d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 15 Aug 2022 13:27:38 -1000
+Subject: cgroup: Fix threadgroup_rwsem <-> cpus_read_lock() deadlock
+
+From: Tejun Heo <tj@kernel.org>
+
+[ Upstream commit 4f7e7236435ca0abe005c674ebd6892c6e83aeb3 ]
+
+Bringing up a CPU may involve creating and destroying tasks which requires
+read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside
+cpus_read_lock(). However, cpuset's ->attach(), which may be called with
+threadgroup_rwsem write-locked, also wants to disable CPU hotplug and
+acquires cpus_read_lock(), leading to a deadlock.
+
+Fix it by guaranteeing that ->attach() is always called with CPU hotplug
+disabled and removing cpus_read_lock() call from cpuset_attach().
+
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Reviewed-and-tested-by: Imran Khan <imran.f.khan@oracle.com>
+Reported-and-tested-by: Xuewen Yan <xuewen.yan@unisoc.com>
+Fixes: 05c7b7a92cc8 ("cgroup/cpuset: Fix a race between cpuset_attach() and cpu hotplug")
+Cc: stable@vger.kernel.org # v5.17+
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/cgroup/cgroup.c | 77 +++++++++++++++++++++++++++++-------------
+ kernel/cgroup/cpuset.c |  3 +-
+ 2 files changed, 55 insertions(+), 25 deletions(-)
+
+diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c
+index baebd1c7667b7..75c3881af0784 100644
+--- a/kernel/cgroup/cgroup.c
++++ b/kernel/cgroup/cgroup.c
+@@ -2345,6 +2345,47 @@ int task_cgroup_path(struct task_struct *task, char *buf, size_t buflen)
+ }
+ EXPORT_SYMBOL_GPL(task_cgroup_path);
+ 
++/**
++ * cgroup_attach_lock - Lock for ->attach()
++ * @lock_threadgroup: whether to down_write cgroup_threadgroup_rwsem
++ *
++ * cgroup migration sometimes needs to stabilize threadgroups against forks and
++ * exits by write-locking cgroup_threadgroup_rwsem. However, some ->attach()
++ * implementations (e.g. cpuset), also need to disable CPU hotplug.
++ * Unfortunately, letting ->attach() operations acquire cpus_read_lock() can
++ * lead to deadlocks.
++ *
++ * Bringing up a CPU may involve creating and destroying tasks which requires
++ * read-locking threadgroup_rwsem, so threadgroup_rwsem nests inside
++ * cpus_read_lock(). If we call an ->attach() which acquires the cpus lock while
++ * write-locking threadgroup_rwsem, the locking order is reversed and we end up
++ * waiting for an on-going CPU hotplug operation which in turn is waiting for
++ * the threadgroup_rwsem to be released to create new tasks. For more details:
++ *
++ *   http://lkml.kernel.org/r/20220711174629.uehfmqegcwn2lqzu@wubuntu
++ *
++ * Resolve the situation by always acquiring cpus_read_lock() before optionally
++ * write-locking cgroup_threadgroup_rwsem. This allows ->attach() to assume that
++ * CPU hotplug is disabled on entry.
++ */
++static void cgroup_attach_lock(bool lock_threadgroup)
++{
++	cpus_read_lock();
++	if (lock_threadgroup)
++		percpu_down_write(&cgroup_threadgroup_rwsem);
++}
++
++/**
++ * cgroup_attach_unlock - Undo cgroup_attach_lock()
++ * @lock_threadgroup: whether to up_write cgroup_threadgroup_rwsem
++ */
++static void cgroup_attach_unlock(bool lock_threadgroup)
++{
++	if (lock_threadgroup)
++		percpu_up_write(&cgroup_threadgroup_rwsem);
++	cpus_read_unlock();
++}
++
+ /**
+  * cgroup_migrate_add_task - add a migration target task to a migration context
+  * @task: target task
+@@ -2821,8 +2862,7 @@ int cgroup_attach_task(struct cgroup *dst_cgrp, struct task_struct *leader,
+ }
+ 
+ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
+-					     bool *locked)
+-	__acquires(&cgroup_threadgroup_rwsem)
++					     bool *threadgroup_locked)
+ {
+ 	struct task_struct *tsk;
+ 	pid_t pid;
+@@ -2839,12 +2879,8 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
+ 	 * Therefore, we can skip the global lock.
+ 	 */
+ 	lockdep_assert_held(&cgroup_mutex);
+-	if (pid || threadgroup) {
+-		percpu_down_write(&cgroup_threadgroup_rwsem);
+-		*locked = true;
+-	} else {
+-		*locked = false;
+-	}
++	*threadgroup_locked = pid || threadgroup;
++	cgroup_attach_lock(*threadgroup_locked);
+ 
+ 	rcu_read_lock();
+ 	if (pid) {
+@@ -2875,17 +2911,14 @@ struct task_struct *cgroup_procs_write_start(char *buf, bool threadgroup,
+ 	goto out_unlock_rcu;
+ 
+ out_unlock_threadgroup:
+-	if (*locked) {
+-		percpu_up_write(&cgroup_threadgroup_rwsem);
+-		*locked = false;
+-	}
++	cgroup_attach_unlock(*threadgroup_locked);
++	*threadgroup_locked = false;
+ out_unlock_rcu:
+ 	rcu_read_unlock();
+ 	return tsk;
+ }
+ 
+-void cgroup_procs_write_finish(struct task_struct *task, bool locked)
+-	__releases(&cgroup_threadgroup_rwsem)
++void cgroup_procs_write_finish(struct task_struct *task, bool threadgroup_locked)
+ {
+ 	struct cgroup_subsys *ss;
+ 	int ssid;
+@@ -2893,8 +2926,8 @@ void cgroup_procs_write_finish(struct task_struct *task, bool locked)
+ 	/* release reference from cgroup_procs_write_start() */
+ 	put_task_struct(task);
+ 
+-	if (locked)
+-		percpu_up_write(&cgroup_threadgroup_rwsem);
++	cgroup_attach_unlock(threadgroup_locked);
++
+ 	for_each_subsys(ss, ssid)
+ 		if (ss->post_attach)
+ 			ss->post_attach();
+@@ -2971,8 +3004,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
+ 	 * write-locking can be skipped safely.
+ 	 */
+ 	has_tasks = !list_empty(&mgctx.preloaded_src_csets);
+-	if (has_tasks)
+-		percpu_down_write(&cgroup_threadgroup_rwsem);
++	cgroup_attach_lock(has_tasks);
+ 
+ 	/* NULL dst indicates self on default hierarchy */
+ 	ret = cgroup_migrate_prepare_dst(&mgctx);
+@@ -2993,8 +3025,7 @@ static int cgroup_update_dfl_csses(struct cgroup *cgrp)
+ 	ret = cgroup_migrate_execute(&mgctx);
+ out_finish:
+ 	cgroup_migrate_finish(&mgctx);
+-	if (has_tasks)
+-		percpu_up_write(&cgroup_threadgroup_rwsem);
++	cgroup_attach_unlock(has_tasks);
+ 	return ret;
+ }
+ 
+@@ -4942,13 +4973,13 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
+ 	struct task_struct *task;
+ 	const struct cred *saved_cred;
+ 	ssize_t ret;
+-	bool locked;
++	bool threadgroup_locked;
+ 
+ 	dst_cgrp = cgroup_kn_lock_live(of->kn, false);
+ 	if (!dst_cgrp)
+ 		return -ENODEV;
+ 
+-	task = cgroup_procs_write_start(buf, threadgroup, &locked);
++	task = cgroup_procs_write_start(buf, threadgroup, &threadgroup_locked);
+ 	ret = PTR_ERR_OR_ZERO(task);
+ 	if (ret)
+ 		goto out_unlock;
+@@ -4974,7 +5005,7 @@ static ssize_t __cgroup_procs_write(struct kernfs_open_file *of, char *buf,
+ 	ret = cgroup_attach_task(dst_cgrp, task, threadgroup);
+ 
+ out_finish:
+-	cgroup_procs_write_finish(task, locked);
++	cgroup_procs_write_finish(task, threadgroup_locked);
+ out_unlock:
+ 	cgroup_kn_unlock(of->kn);
+ 
+diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c
+index 9c5b659db63f4..3213d3c8ea0a8 100644
+--- a/kernel/cgroup/cpuset.c
++++ b/kernel/cgroup/cpuset.c
+@@ -2249,7 +2249,7 @@ static void cpuset_attach(struct cgroup_taskset *tset)
+ 	cgroup_taskset_first(tset, &css);
+ 	cs = css_cs(css);
+ 
+-	cpus_read_lock();
++	lockdep_assert_cpus_held();	/* see cgroup_attach_lock() */
+ 	percpu_down_write(&cpuset_rwsem);
+ 
+ 	guarantee_online_mems(cs, &cpuset_attach_nodemask_to);
+@@ -2303,7 +2303,6 @@ static void cpuset_attach(struct cgroup_taskset *tset)
+ 		wake_up(&cpuset_attach_wq);
+ 
+ 	percpu_up_write(&cpuset_rwsem);
+-	cpus_read_unlock();
+ }
+ 
+ /* The various types of files and directories in a cpuset file system */
+-- 
+2.35.1
+
diff --git a/queue-5.15/nfs-fix-another-fsync-issue-after-a-server-reboot.patch b/queue-5.15/nfs-fix-another-fsync-issue-after-a-server-reboot.patch
new file mode 100644
index 00000000000..31e58b897b9
--- /dev/null
+++ b/queue-5.15/nfs-fix-another-fsync-issue-after-a-server-reboot.patch
@@ -0,0 +1,125 @@
+From 1d960ddcb28ad2a6a22bac66b4b9e0a363f9dd45 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 13 Aug 2022 08:22:25 -0400
+Subject: NFS: Fix another fsync() issue after a server reboot
+
+From: Trond Myklebust <trond.myklebust@hammerspace.com>
+
+[ Upstream commit 67f4b5dc49913abcdb5cc736e73674e2f352f81d ]
+
+Currently, when the writeback code detects a server reboot, it redirties
+any pages that were not committed to disk, and it sets the flag
+NFS_CONTEXT_RESEND_WRITES in the nfs_open_context of the file descriptor
+that dirtied the file. While this allows the file descriptor in question
+to redrive its own writes, it violates the fsync() requirement that we
+should be synchronising all writes to disk.
+While the problem is infrequent, we do see corner cases where an
+untimely server reboot causes the fsync() call to abandon its attempt to
+sync data to disk and causes data corruption issues due to missed error
+conditions or similar.
+
+In order to tighten up the client's ability to deal with this situation
+without introducing livelocks, add a counter that records the number of
+times pages are redirtied due to a server reboot-like condition, and use
+that in fsync() to redrive the sync to disk.
+
+Fixes: 2197e9b06c22 ("NFS: Fix up fsync() when the server rebooted")
+Cc: stable@vger.kernel.org
+Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/nfs/file.c          | 15 ++++++---------
+ fs/nfs/inode.c         |  1 +
+ fs/nfs/write.c         |  6 ++++--
+ include/linux/nfs_fs.h |  1 +
+ 4 files changed, 12 insertions(+), 11 deletions(-)
+
+diff --git a/fs/nfs/file.c b/fs/nfs/file.c
+index a8693cc50c7ca..ad5114e480097 100644
+--- a/fs/nfs/file.c
++++ b/fs/nfs/file.c
+@@ -223,8 +223,10 @@ nfs_file_fsync_commit(struct file *file, int datasync)
+ int
+ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+ {
+-	struct nfs_open_context *ctx = nfs_file_open_context(file);
+ 	struct inode *inode = file_inode(file);
++	struct nfs_inode *nfsi = NFS_I(inode);
++	long save_nredirtied = atomic_long_read(&nfsi->redirtied_pages);
++	long nredirtied;
+ 	int ret;
+ 
+ 	trace_nfs_fsync_enter(inode);
+@@ -239,15 +241,10 @@ nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync)
+ 		ret = pnfs_sync_inode(inode, !!datasync);
+ 		if (ret != 0)
+ 			break;
+-		if (!test_and_clear_bit(NFS_CONTEXT_RESEND_WRITES, &ctx->flags))
++		nredirtied = atomic_long_read(&nfsi->redirtied_pages);
++		if (nredirtied == save_nredirtied)
+ 			break;
+-		/*
+-		 * If nfs_file_fsync_commit detected a server reboot, then
+-		 * resend all dirty pages that might have been covered by
+-		 * the NFS_CONTEXT_RESEND_WRITES flag
+-		 */
+-		start = 0;
+-		end = LLONG_MAX;
++		save_nredirtied = nredirtied;
+ 	}
+ 
+ 	trace_nfs_fsync_exit(inode, ret);
+diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
+index dc057ab6b30d1..e4524635a129a 100644
+--- a/fs/nfs/inode.c
++++ b/fs/nfs/inode.c
+@@ -434,6 +434,7 @@ nfs_ilookup(struct super_block *sb, struct nfs_fattr *fattr, struct nfs_fh *fh)
+ static void nfs_inode_init_regular(struct nfs_inode *nfsi)
+ {
+ 	atomic_long_set(&nfsi->nrequests, 0);
++	atomic_long_set(&nfsi->redirtied_pages, 0);
+ 	INIT_LIST_HEAD(&nfsi->commit_info.list);
+ 	atomic_long_set(&nfsi->commit_info.ncommit, 0);
+ 	atomic_set(&nfsi->commit_info.rpcs_out, 0);
+diff --git a/fs/nfs/write.c b/fs/nfs/write.c
+index cdb29fd235492..be70874bc3292 100644
+--- a/fs/nfs/write.c
++++ b/fs/nfs/write.c
+@@ -1394,10 +1394,12 @@ static void nfs_initiate_write(struct nfs_pgio_header *hdr,
+  */
+ static void nfs_redirty_request(struct nfs_page *req)
+ {
++	struct nfs_inode *nfsi = NFS_I(page_file_mapping(req->wb_page)->host);
++
+ 	/* Bump the transmission count */
+ 	req->wb_nio++;
+ 	nfs_mark_request_dirty(req);
+-	set_bit(NFS_CONTEXT_RESEND_WRITES, &nfs_req_openctx(req)->flags);
++	atomic_long_inc(&nfsi->redirtied_pages);
+ 	nfs_end_page_writeback(req);
+ 	nfs_release_request(req);
+ }
+@@ -1870,7 +1872,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
+ 		/* We have a mismatch. Write the page again */
+ 		dprintk_cont(" mismatch\n");
+ 		nfs_mark_request_dirty(req);
+-		set_bit(NFS_CONTEXT_RESEND_WRITES, &nfs_req_openctx(req)->flags);
++		atomic_long_inc(&NFS_I(data->inode)->redirtied_pages);
+ 	next:
+ 		nfs_unlock_and_release_request(req);
+ 		/* Latency breaker */
+diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
+index d0855352cd6fc..71467d661fb66 100644
+--- a/include/linux/nfs_fs.h
++++ b/include/linux/nfs_fs.h
+@@ -180,6 +180,7 @@ struct nfs_inode {
+ 		/* Regular file */
+ 		struct {
+ 			atomic_long_t	nrequests;
++			atomic_long_t	redirtied_pages;
+ 			struct nfs_mds_commit_info commit_info;
+ 			struct mutex	commit_mutex;
+ 		};
+-- 
+2.35.1
+
diff --git a/queue-5.15/nfs-further-optimisations-for-ls-l.patch b/queue-5.15/nfs-further-optimisations-for-ls-l.patch
new file mode 100644
index 00000000000..d4a51c508e0
--- /dev/null
+++ b/queue-5.15/nfs-further-optimisations-for-ls-l.patch
@@ -0,0 +1,134 @@
+From 824880fe84e32e08ef97657f71ae5568a73d6899 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 28 Sep 2021 14:33:44 -0400
+Subject: NFS: Further optimisations for 'ls -l'
+
+From: Trond Myklebust <trond.myklebust@hammerspace.com>
+
+[ Upstream commit ff81dfb5d721fff87bd516c558847f6effb70031 ]
+
+If a user is doing 'ls -l', we have a heuristic in GETATTR that tells
+the readdir code to try to use READDIRPLUS in order to refresh the inode
+attributes. In certain circumstances, we also try to invalidate the
+remaining directory entries in order to ensure this refresh.
+
+If there are multiple readers of the directory, we probably should avoid
+invalidating the page cache, since the heuristic breaks down in that
+situation anyway.
+
+Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
+Tested-by: Benjamin Coddington <bcodding@redhat.com>
+Reviewed-by: Benjamin Coddington <bcodding@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/nfs/dir.c           | 16 +++++++++++-----
+ include/linux/nfs_fs.h |  5 ++---
+ 2 files changed, 13 insertions(+), 8 deletions(-)
+
+diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
+index 78219396788b4..32c3d0c454b19 100644
+--- a/fs/nfs/dir.c
++++ b/fs/nfs/dir.c
+@@ -78,6 +78,7 @@ static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir
+ 		ctx->attr_gencount = nfsi->attr_gencount;
+ 		ctx->dir_cookie = 0;
+ 		ctx->dup_cookie = 0;
++		ctx->page_index = 0;
+ 		spin_lock(&dir->i_lock);
+ 		if (list_empty(&nfsi->open_files) &&
+ 		    (nfsi->cache_validity & NFS_INO_DATA_INVAL_DEFER))
+@@ -85,6 +86,7 @@ static struct nfs_open_dir_context *alloc_nfs_open_dir_context(struct inode *dir
+ 					      NFS_INO_INVALID_DATA |
+ 						      NFS_INO_REVAL_FORCED);
+ 		list_add(&ctx->list, &nfsi->open_files);
++		clear_bit(NFS_INO_FORCE_READDIR, &nfsi->flags);
+ 		spin_unlock(&dir->i_lock);
+ 		return ctx;
+ 	}
+@@ -626,8 +628,7 @@ void nfs_force_use_readdirplus(struct inode *dir)
+ 	if (nfs_server_capable(dir, NFS_CAP_READDIRPLUS) &&
+ 	    !list_empty(&nfsi->open_files)) {
+ 		set_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags);
+-		invalidate_mapping_pages(dir->i_mapping,
+-			nfsi->page_index + 1, -1);
++		set_bit(NFS_INO_FORCE_READDIR, &nfsi->flags);
+ 	}
+ }
+ 
+@@ -938,10 +939,8 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc)
+ 			       sizeof(nfsi->cookieverf));
+ 	}
+ 	res = nfs_readdir_search_array(desc);
+-	if (res == 0) {
+-		nfsi->page_index = desc->page_index;
++	if (res == 0)
+ 		return 0;
+-	}
+ 	nfs_readdir_page_unlock_and_put_cached(desc);
+ 	return res;
+ }
+@@ -1081,6 +1080,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
+ 	struct nfs_inode *nfsi = NFS_I(inode);
+ 	struct nfs_open_dir_context *dir_ctx = file->private_data;
+ 	struct nfs_readdir_descriptor *desc;
++	pgoff_t page_index;
+ 	int res;
+ 
+ 	dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n",
+@@ -1111,10 +1111,15 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
+ 	desc->dir_cookie = dir_ctx->dir_cookie;
+ 	desc->dup_cookie = dir_ctx->dup_cookie;
+ 	desc->duped = dir_ctx->duped;
++	page_index = dir_ctx->page_index;
+ 	desc->attr_gencount = dir_ctx->attr_gencount;
+ 	memcpy(desc->verf, dir_ctx->verf, sizeof(desc->verf));
+ 	spin_unlock(&file->f_lock);
+ 
++	if (test_and_clear_bit(NFS_INO_FORCE_READDIR, &nfsi->flags) &&
++	    list_is_singular(&nfsi->open_files))
++		invalidate_mapping_pages(inode->i_mapping, page_index + 1, -1);
++
+ 	do {
+ 		res = readdir_search_pagecache(desc);
+ 
+@@ -1151,6 +1156,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
+ 	dir_ctx->dup_cookie = desc->dup_cookie;
+ 	dir_ctx->duped = desc->duped;
+ 	dir_ctx->attr_gencount = desc->attr_gencount;
++	dir_ctx->page_index = desc->page_index;
+ 	memcpy(dir_ctx->verf, desc->verf, sizeof(dir_ctx->verf));
+ 	spin_unlock(&file->f_lock);
+ 
+diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
+index 66b6cc24ab8c9..be8625d8a10a7 100644
+--- a/include/linux/nfs_fs.h
++++ b/include/linux/nfs_fs.h
+@@ -103,6 +103,7 @@ struct nfs_open_dir_context {
+ 	__be32	verf[NFS_DIR_VERIFIER_SIZE];
+ 	__u64 dir_cookie;
+ 	__u64 dup_cookie;
++	pgoff_t page_index;
+ 	signed char duped;
+ };
+ 
+@@ -181,9 +182,6 @@ struct nfs_inode {
+ 	struct rw_semaphore	rmdir_sem;
+ 	struct mutex		commit_mutex;
+ 
+-	/* track last access to cached pages */
+-	unsigned long		page_index;
+-
+ #if IS_ENABLED(CONFIG_NFS_V4)
+ 	struct nfs4_cached_acl	*nfs4_acl;
+         /* NFSv4 state */
+@@ -272,6 +270,7 @@ struct nfs4_copy_state {
+ #define NFS_INO_INVALIDATING	(3)		/* inode is being invalidated */
+ #define NFS_INO_FSCACHE		(5)		/* inode can be cached by FS-Cache */
+ #define NFS_INO_FSCACHE_LOCK	(6)		/* FS-Cache cookie management lock */
++#define NFS_INO_FORCE_READDIR	(7)		/* force readdirplus */
+ #define NFS_INO_LAYOUTCOMMIT	(9)		/* layoutcommit required */
+ #define NFS_INO_LAYOUTCOMMITTING (10)		/* layoutcommit inflight */
+ #define NFS_INO_LAYOUTSTATS	(11)		/* layoutstats inflight */
+-- 
+2.35.1
+
diff --git a/queue-5.15/nfs-save-some-space-in-the-inode.patch b/queue-5.15/nfs-save-some-space-in-the-inode.patch
new file mode 100644
index 00000000000..d3fc8e8592e
--- /dev/null
+++ b/queue-5.15/nfs-save-some-space-in-the-inode.patch
@@ -0,0 +1,147 @@
+From a3af8f4666db612da3ab84dde6b1889b01fb91e5 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 28 Sep 2021 17:41:41 -0400
+Subject: NFS: Save some space in the inode
+
+From: Trond Myklebust <trond.myklebust@hammerspace.com>
+
+[ Upstream commit e591b298d7ecb851e200f65946e3d53fe78a3c4f ]
+
+Save some space in the nfs_inode by setting up an anonymous union with
+the fields that are peculiar to a specific type of filesystem object.
+
+Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/nfs/inode.c         | 26 ++++++++++++++++++--------
+ include/linux/nfs_fs.h | 42 ++++++++++++++++++++++++------------------
+ 2 files changed, 42 insertions(+), 26 deletions(-)
+
+diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
+index cb407af9e9e92..dc057ab6b30d1 100644
+--- a/fs/nfs/inode.c
++++ b/fs/nfs/inode.c
+@@ -431,6 +431,22 @@ nfs_ilookup(struct super_block *sb, struct nfs_fattr *fattr, struct nfs_fh *fh)
+ 	return inode;
+ }
+ 
++static void nfs_inode_init_regular(struct nfs_inode *nfsi)
++{
++	atomic_long_set(&nfsi->nrequests, 0);
++	INIT_LIST_HEAD(&nfsi->commit_info.list);
++	atomic_long_set(&nfsi->commit_info.ncommit, 0);
++	atomic_set(&nfsi->commit_info.rpcs_out, 0);
++	mutex_init(&nfsi->commit_mutex);
++}
++
++static void nfs_inode_init_dir(struct nfs_inode *nfsi)
++{
++	nfsi->cache_change_attribute = 0;
++	memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
++	init_rwsem(&nfsi->rmdir_sem);
++}
++
+ /*
+  * This is our front-end to iget that looks up inodes by file handle
+  * instead of inode number.
+@@ -485,10 +501,12 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
+ 		if (S_ISREG(inode->i_mode)) {
+ 			inode->i_fop = NFS_SB(sb)->nfs_client->rpc_ops->file_ops;
+ 			inode->i_data.a_ops = &nfs_file_aops;
++			nfs_inode_init_regular(nfsi);
+ 		} else if (S_ISDIR(inode->i_mode)) {
+ 			inode->i_op = NFS_SB(sb)->nfs_client->rpc_ops->dir_inode_ops;
+ 			inode->i_fop = &nfs_dir_operations;
+ 			inode->i_data.a_ops = &nfs_dir_aops;
++			nfs_inode_init_dir(nfsi);
+ 			/* Deal with crossing mountpoints */
+ 			if (fattr->valid & NFS_ATTR_FATTR_MOUNTPOINT ||
+ 					fattr->valid & NFS_ATTR_FATTR_V4_REFERRAL) {
+@@ -514,7 +532,6 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st
+ 		inode->i_uid = make_kuid(&init_user_ns, -2);
+ 		inode->i_gid = make_kgid(&init_user_ns, -2);
+ 		inode->i_blocks = 0;
+-		memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
+ 		nfsi->write_io = 0;
+ 		nfsi->read_io = 0;
+ 
+@@ -2282,14 +2299,7 @@ static void init_once(void *foo)
+ 	INIT_LIST_HEAD(&nfsi->open_files);
+ 	INIT_LIST_HEAD(&nfsi->access_cache_entry_lru);
+ 	INIT_LIST_HEAD(&nfsi->access_cache_inode_lru);
+-	INIT_LIST_HEAD(&nfsi->commit_info.list);
+-	atomic_long_set(&nfsi->nrequests, 0);
+-	atomic_long_set(&nfsi->commit_info.ncommit, 0);
+-	atomic_set(&nfsi->commit_info.rpcs_out, 0);
+-	init_rwsem(&nfsi->rmdir_sem);
+-	mutex_init(&nfsi->commit_mutex);
+ 	nfs4_init_once(nfsi);
+-	nfsi->cache_change_attribute = 0;
+ }
+ 
+ static int __init nfs_init_inodecache(void)
+diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
+index be8625d8a10a7..d0855352cd6fc 100644
+--- a/include/linux/nfs_fs.h
++++ b/include/linux/nfs_fs.h
+@@ -155,33 +155,39 @@ struct nfs_inode {
+ 	unsigned long		attrtimeo_timestamp;
+ 
+ 	unsigned long		attr_gencount;
+-	/* "Generation counter" for the attribute cache. This is
+-	 * bumped whenever we update the metadata on the
+-	 * server.
+-	 */
+-	unsigned long		cache_change_attribute;
+ 
+ 	struct rb_root		access_cache;
+ 	struct list_head	access_cache_entry_lru;
+ 	struct list_head	access_cache_inode_lru;
+ 
+-	/*
+-	 * This is the cookie verifier used for NFSv3 readdir
+-	 * operations
+-	 */
+-	__be32			cookieverf[NFS_DIR_VERIFIER_SIZE];
+-
+-	atomic_long_t		nrequests;
+-	struct nfs_mds_commit_info commit_info;
++	union {
++		/* Directory */
++		struct {
++			/* "Generation counter" for the attribute cache.
++			 * This is bumped whenever we update the metadata
++			 * on the server.
++			 */
++			unsigned long	cache_change_attribute;
++			/*
++			 * This is the cookie verifier used for NFSv3 readdir
++			 * operations
++			 */
++			__be32		cookieverf[NFS_DIR_VERIFIER_SIZE];
++			/* Readers: in-flight sillydelete RPC calls */
++			/* Writers: rmdir */
++			struct rw_semaphore	rmdir_sem;
++		};
++		/* Regular file */
++		struct {
++			atomic_long_t	nrequests;
++			struct nfs_mds_commit_info commit_info;
++			struct mutex	commit_mutex;
++		};
++	};
+ 
+ 	/* Open contexts for shared mmap writes */
+ 	struct list_head	open_files;
+ 
+-	/* Readers: in-flight sillydelete RPC calls */
+-	/* Writers: rmdir */
+-	struct rw_semaphore	rmdir_sem;
+-	struct mutex		commit_mutex;
+-
+ #if IS_ENABLED(CONFIG_NFS_V4)
+ 	struct nfs4_cached_acl	*nfs4_acl;
+         /* NFSv4 state */
+-- 
+2.35.1
+
diff --git a/queue-5.15/riscv-dts-microchip-mpfs-fix-reference-clock-node.patch b/queue-5.15/riscv-dts-microchip-mpfs-fix-reference-clock-node.patch
new file mode 100644
index 00000000000..52491d8a1f7
--- /dev/null
+++ b/queue-5.15/riscv-dts-microchip-mpfs-fix-reference-clock-node.patch
@@ -0,0 +1,79 @@
+From 4fa8e7d65f7a720ac13f22aab2ce447b921dbf97 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 17 Dec 2021 13:49:26 +0100
+Subject: riscv: dts: microchip: mpfs: Fix reference clock node
+
+From: Geert Uytterhoeven <geert@linux-m68k.org>
+
+[ Upstream commit 9d7b3078628f591e4007210c0d5d3f94805cff55 ]
+
+"make dtbs_check" reports:
+
+    arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dt.yaml: soc: refclk: {'compatible': ['fixed-clock'], '#clock-cells': [[0]], 'clock-frequency': [[600000000]], 'clock-output-names': ['msspllclk'], 'phandle': [[7]]} should not be valid under {'type': 'object'}
+	From schema: dtschema/schemas/simple-bus.yaml
+
+Fix this by moving the node out of the "soc" subnode.
+While at it, rename it to "msspllclk", and drop the now superfluous
+"clock-output-names" property.
+Move the actual clock-frequency value to the board DTS, since it is not
+set until bitstream programming time.
+
+Signed-off-by: Geert Uytterhoeven <geert@linux-m68k.org>
+Acked-by: Krzysztof Kozlowski <krzysztof.kozlowski@canonical.com>
+Reviewed-by: Conor Dooley <conor.dooley@microchip.com>
+Tested-by: Conor Dooley <conor.dooley@microchip.com>
+Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../boot/dts/microchip/microchip-mpfs-icicle-kit.dts |  4 ++++
+ arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi    | 12 +++++-------
+ 2 files changed, 9 insertions(+), 7 deletions(-)
+
+diff --git a/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts b/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts
+index cce5eca31f257..4b69ab4ff30a2 100644
+--- a/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts
++++ b/arch/riscv/boot/dts/microchip/microchip-mpfs-icicle-kit.dts
+@@ -40,6 +40,10 @@
+ 	};
+ };
+ 
++&refclk {
++	clock-frequency = <600000000>;
++};
++
+ &serial0 {
+ 	status = "okay";
+ };
+diff --git a/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi b/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi
+index 4ef4bcb748729..9279ccf20009a 100644
+--- a/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi
++++ b/arch/riscv/boot/dts/microchip/microchip-mpfs.dtsi
+@@ -139,6 +139,11 @@
+ 		};
+ 	};
+ 
++	refclk: msspllclk {
++		compatible = "fixed-clock";
++		#clock-cells = <0>;
++	};
++
+ 	soc {
+ 		#address-cells = <2>;
+ 		#size-cells = <2>;
+@@ -188,13 +193,6 @@
+ 			#dma-cells = <1>;
+ 		};
+ 
+-		refclk: refclk {
+-			compatible = "fixed-clock";
+-			#clock-cells = <0>;
+-			clock-frequency = <600000000>;
+-			clock-output-names = "msspllclk";
+-		};
+-
+ 		clkcfg: clkcfg@20002000 {
+ 			compatible = "microchip,mpfs-clkcfg";
+ 			reg = <0x0 0x20002000 0x0 0x1000>;
+-- 
+2.35.1
+
diff --git a/queue-5.15/series b/queue-5.15/series
index 0bf0d7eb766..1451c2e7b21 100644
--- a/queue-5.15/series
+++ b/queue-5.15/series
@@ -40,3 +40,9 @@ nvmet-fix-a-use-after-free.patch
 drm-i915-implement-waedplinkratedatareload.patch
 scsi-mpt3sas-fix-use-after-free-warning.patch
 scsi-lpfc-add-missing-destroy_workqueue-in-error-path.patch
+nfs-further-optimisations-for-ls-l.patch
+nfs-save-some-space-in-the-inode.patch
+nfs-fix-another-fsync-issue-after-a-server-reboot.patch
+cgroup-elide-write-locking-threadgroup_rwsem-when-up.patch
+cgroup-fix-threadgroup_rwsem-cpus_read_lock-deadlock.patch
+riscv-dts-microchip-mpfs-fix-reference-clock-node.patch