From a3d0608952a1ce0ff94e0f73ebb51e79041d27b3 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman
Date: Tue, 30 Nov 2021 11:38:07 +0100
Subject: [PATCH] 4.4-stable patches

added patches:
	fuse-fix-page-stealing.patch
	shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses.patch
---
 queue-4.4/fuse-fix-page-stealing.patch        |  61 +++
 ...fuse-release-pipe-buf-after-last-use.patch |   7 +-
 queue-4.4/series                              |   2 +
 ...upport-objects-from-several-ipc-nses.patch | 390 ++++++++++++++++++
 4 files changed, 456 insertions(+), 4 deletions(-)
 create mode 100644 queue-4.4/fuse-fix-page-stealing.patch
 create mode 100644 queue-4.4/shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses.patch

diff --git a/queue-4.4/fuse-fix-page-stealing.patch b/queue-4.4/fuse-fix-page-stealing.patch
new file mode 100644
index 00000000000..33cd2bb38b1
--- /dev/null
+++ b/queue-4.4/fuse-fix-page-stealing.patch
@@ -0,0 +1,61 @@
+From 712a951025c0667ff00b25afc360f74e639dfabe Mon Sep 17 00:00:00 2001
+From: Miklos Szeredi
+Date: Tue, 2 Nov 2021 11:10:37 +0100
+Subject: fuse: fix page stealing
+
+From: Miklos Szeredi
+
+commit 712a951025c0667ff00b25afc360f74e639dfabe upstream.
+
+It is possible to trigger a crash by splicing anon pipe bufs to the fuse
+device.
+
+The reason for this is that anon_pipe_buf_release() will reuse buf->page if
+the refcount is 1, but that page might have already been stolen and its
+flags modified (e.g. PG_lru added).
+
+This happens in the unlikely case of fuse_dev_splice_write() getting around
+to calling pipe_buf_release() after a page has been stolen, added to the
+page cache and removed from the page cache.
+
+Fix by calling pipe_buf_release() right after the page was inserted into
+the page cache.  In this case the page has an elevated refcount so any
+release function will know that the page isn't reusable.
+
+Reported-by: Frank Dinoff
+Link: https://lore.kernel.org/r/CAAmZXrsGg2xsP1CK+cbuEMumtrqdvD-NKnWzhNcvn71RV3c1yw@mail.gmail.com/
+Fixes: dd3bb14f44a6 ("fuse: support splice() writing to fuse device")
+Cc: <stable@vger.kernel.org> # v2.6.35
+Signed-off-by: Miklos Szeredi
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ fs/fuse/dev.c |   10 +++++++++-
+ 1 file changed, 9 insertions(+), 1 deletion(-)
+
+--- a/fs/fuse/dev.c
++++ b/fs/fuse/dev.c
+@@ -922,6 +922,13 @@ static int fuse_try_move_page(struct fus
+ 		return err;
+ 	}
+ 
++	/*
++	 * Release while we have extra ref on stolen page.  Otherwise
++	 * anon_pipe_buf_release() might think the page can be reused.
++	 */
++	buf->ops->release(cs->pipe, buf);
++	buf->ops = NULL;
++
+ 	page_cache_get(newpage);
+ 
+ 	if (!(buf->flags & PIPE_BUF_FLAG_LRU))
+@@ -2090,7 +2097,8 @@ static ssize_t fuse_dev_splice_write(str
+ out_free:
+ 	for (idx = 0; idx < nbuf; idx++) {
+ 		struct pipe_buffer *buf = &bufs[idx];
+-		buf->ops->release(pipe, buf);
++		if (buf->ops)
++			buf->ops->release(pipe, buf);
+ 	}
+ 	pipe_unlock(pipe);
+ 
diff --git a/queue-4.4/fuse-release-pipe-buf-after-last-use.patch b/queue-4.4/fuse-release-pipe-buf-after-last-use.patch
index 4a75c6e12f2..869f1bd7a9e 100644
--- a/queue-4.4/fuse-release-pipe-buf-after-last-use.patch
+++ b/queue-4.4/fuse-release-pipe-buf-after-last-use.patch
@@ -19,7 +19,6 @@ Fixes: 712a951025c0 ("fuse: fix page stealing")
 Cc: <stable@vger.kernel.org> # v2.6.35
 Signed-off-by: Miklos Szeredi
 Signed-off-by: Greg Kroah-Hartman
-
 ---
  fs/fuse/dev.c |   10 +++++-----
  1 file changed, 5 insertions(+), 5 deletions(-)
@@ -38,9 +37,9 @@ Signed-off-by: Greg Kroah-Hartman
  	/*
  	 * This is a new and locked page, it shouldn't be mapped or
  	 * have any special flags on it
-@@ -922,11 +927,6 @@ static int fuse_try_move_page(struct fus
- 	return err;
- 	}
+@@ -929,11 +934,6 @@ static int fuse_try_move_page(struct fus
+ 	buf->ops->release(cs->pipe, buf);
+ 	buf->ops = NULL;
 
 -	page_cache_get(newpage);
 -
diff --git a/queue-4.4/series b/queue-4.4/series
index a7828cb1759..eaeaf1c2b70 100644
--- a/queue-4.4/series
+++ b/queue-4.4/series
@@ -19,7 +19,9 @@ tracing-check-pid-filtering-when-creating-events.patch
 hugetlbfs-flush-tlbs-correctly-after-huge_pmd_unshare.patch
 proc-vmcore-fix-clearing-user-buffer-by-properly-using-clear_user.patch
 nfc-add-nci_unreg-flag-to-eliminate-the-race.patch
+fuse-fix-page-stealing.patch
 fuse-release-pipe-buf-after-last-use.patch
+shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses.patch
 xen-sync-include-xen-interface-io-ring.h-with-xen-s-newest-version.patch
 xen-blkfront-read-response-from-backend-only-once.patch
 xen-blkfront-don-t-take-local-copy-of-a-request-from-the-ring-page.patch
diff --git a/queue-4.4/shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses.patch b/queue-4.4/shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses.patch
new file mode 100644
index 00000000000..e65edab55b4
--- /dev/null
+++ b/queue-4.4/shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses.patch
@@ -0,0 +1,390 @@
+From 85b6d24646e4125c591639841169baa98a2da503 Mon Sep 17 00:00:00 2001
+From: Alexander Mikhalitsyn
+Date: Fri, 19 Nov 2021 16:43:21 -0800
+Subject: shm: extend forced shm destroy to support objects from several IPC nses
+
+From: Alexander Mikhalitsyn
+
+commit 85b6d24646e4125c591639841169baa98a2da503 upstream.
+
+Currently, the exit_shm() function is not designed to work properly when
+task->sysvshm.shm_clist holds shm objects from different IPC namespaces.
+
+This is a real pain when sysctl kernel.shm_rmid_forced = 1, because it
+leads to a use-after-free (a reproducer exists).
+
+This is an attempt to fix the problem by extending the exit_shm mechanism
+to handle destruction of shm objects from several IPC namespaces.
+
+To achieve that we do several things:
+
+1. add a namespace (non-refcounted) pointer to the struct shmid_kernel
+
+2. during new shm object creation (newseg()/shmget syscall) we
+   initialize this pointer with the current task's IPC namespace
+
+3. exit_shm() is fully reworked such that it traverses all shp's in
+   task->sysvshm.shm_clist and gets the IPC namespace not from the
+   current task as it was before, but from the shp object itself, then
+   calls shm_destroy(shp, ns).
+
+Note: We need to be really careful here because, as noted in (1), our
+pointer to the IPC namespace is not refcounted. To be on the safe side
+we use the special helper get_ipc_ns_not_zero(), which takes a reference
+on the IPC namespace only if it is not in the "state of destruction".
+
+Q/A
+
+Q: Why can we access shp->ns memory using a non-refcounted pointer?
+A: Because the shp object's lifetime is always shorter than that of the
+   IPC namespace, so if we get the shp object from task->sysvshm.shm_clist
+   while holding task_lock(task), nobody can steal our namespace.
+
+Q: Does this patch change the semantics of unshare/setns/clone syscalls?
+A: No. It just fixes a previously uncovered case in which a process may
+   leave an IPC namespace without getting task->sysvshm.shm_clist cleaned up.
+
+Link: https://lkml.kernel.org/r/67bb03e5-f79c-1815-e2bf-949c67047418@colorfullife.com
+Link: https://lkml.kernel.org/r/20211109151501.4921-1-manfred@colorfullife.com
+Fixes: ab602f79915 ("shm: make exit_shm work proportional to task activity")
+Co-developed-by: Manfred Spraul
+Signed-off-by: Manfred Spraul
+Signed-off-by: Alexander Mikhalitsyn
+Cc: "Eric W. Biederman"
+Cc: Davidlohr Bueso
+Cc: Greg KH
+Cc: Andrei Vagin
+Cc: Pavel Tikhomirov
+Cc: Vasily Averin
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ include/linux/ipc_namespace.h |   15 +++
+ include/linux/sched.h         |    2 
+ include/linux/shm.h           |   13 ++-
+ ipc/shm.c                     |  176 +++++++++++++++++++++++++++++++-----------
+ 4 files changed, 159 insertions(+), 47 deletions(-)
+
+--- a/include/linux/ipc_namespace.h
++++ b/include/linux/ipc_namespace.h
+@@ -123,6 +123,16 @@ static inline struct ipc_namespace *get_
+ 	return ns;
+ }
+ 
++static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns)
++{
++	if (ns) {
++		if (atomic_inc_not_zero(&ns->count))
++			return ns;
++	}
++
++	return NULL;
++}
++
+ extern void put_ipc_ns(struct ipc_namespace *ns);
+ #else
+ static inline struct ipc_namespace *copy_ipcs(unsigned long flags,
+@@ -138,6 +148,11 @@ static inline struct ipc_namespace *get_
+ {
+ 	return ns;
+ }
++
++static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns)
++{
++	return ns;
++}
+ 
+ static inline void put_ipc_ns(struct ipc_namespace *ns)
+ {
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -2792,7 +2792,7 @@ static inline int thread_group_empty(str
+  * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
+  * subscriptions and synchronises with wait4().  Also used in procfs.  Also
+  * pins the final release of task.io_context.  Also protects ->cpuset and
+- * ->cgroup.subsys[]. And ->vfork_done.
++ * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist.
+  *
+  * Nests both inside and outside of read_lock(&tasklist_lock).
+  * It must not be nested with write_lock_irq(&tasklist_lock),
+--- a/include/linux/shm.h
++++ b/include/linux/shm.h
+@@ -19,9 +19,18 @@ struct shmid_kernel /* private to the ke
+ 	pid_t			shm_lprid;
+ 	struct user_struct	*mlock_user;
+ 
+-	/* The task created the shm object.  NULL if the task is dead. */
++	/*
++	 * The task created the shm object, for
++	 * task_lock(shp->shm_creator)
++	 */
+ 	struct task_struct	*shm_creator;
+-	struct list_head	shm_clist;	/* list by creator */
++
++	/*
++	 * List by creator. task_lock(->shm_creator) required for read/write.
++	 * If list_empty(), then the creator is dead already.
++	 */
++	struct list_head	shm_clist;
++	struct ipc_namespace	*ns;
+ };
+ 
+ /* shm_mode upper byte flags */
+--- a/ipc/shm.c
++++ b/ipc/shm.c
+@@ -90,6 +90,7 @@ static void do_shm_rmid(struct ipc_names
+ {
+ 	struct shmid_kernel *shp;
+ 	shp = container_of(ipcp, struct shmid_kernel, shm_perm);
++	WARN_ON(ns != shp->ns);
+ 
+ 	if (shp->shm_nattch) {
+ 		shp->shm_perm.mode |= SHM_DEST;
+@@ -180,10 +181,43 @@ static void shm_rcu_free(struct rcu_head
+ 	ipc_rcu_free(head);
+ }
+ 
+-static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s)
++/*
++ * It has to be called with shp locked.
++ * It must be called before ipc_rmid()
++ */
++static inline void shm_clist_rm(struct shmid_kernel *shp)
++{
++	struct task_struct *creator;
++
++	/* ensure that shm_creator does not disappear */
++	rcu_read_lock();
++
++	/*
++	 * A concurrent exit_shm may do a list_del_init() as well.
++	 * Just do nothing if exit_shm already did the work
++	 */
++	if (!list_empty(&shp->shm_clist)) {
++		/*
++		 * shp->shm_creator is guaranteed to be valid *only*
++		 * if shp->shm_clist is not empty.
++		 */
++		creator = shp->shm_creator;
++
++		task_lock(creator);
++		/*
++		 * list_del_init() is a nop if the entry was already removed
++		 * from the list.
++		 */
++		list_del_init(&shp->shm_clist);
++		task_unlock(creator);
++	}
++	rcu_read_unlock();
++}
++
++static inline void shm_rmid(struct shmid_kernel *s)
+ {
+-	list_del(&s->shm_clist);
+-	ipc_rmid(&shm_ids(ns), &s->shm_perm);
++	shm_clist_rm(s);
++	ipc_rmid(&shm_ids(s->ns), &s->shm_perm);
+ }
+ 
+ 
+@@ -238,7 +272,7 @@ static void shm_destroy(struct ipc_names
+ 	shm_file = shp->shm_file;
+ 	shp->shm_file = NULL;
+ 	ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT;
+-	shm_rmid(ns, shp);
++	shm_rmid(shp);
+ 	shm_unlock(shp);
+ 	if (!is_file_hugepages(shm_file))
+ 		shmem_lock(shm_file, 0, shp->mlock_user);
+@@ -259,10 +293,10 @@ static void shm_destroy(struct ipc_names
+  *
+  * 2) sysctl kernel.shm_rmid_forced is set to 1.
+  */
+-static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
++static bool shm_may_destroy(struct shmid_kernel *shp)
+ {
+ 	return (shp->shm_nattch == 0) &&
+-	       (ns->shm_rmid_forced ||
++	       (shp->ns->shm_rmid_forced ||
+ 		(shp->shm_perm.mode & SHM_DEST));
+ }
+ 
+@@ -293,7 +327,7 @@ static void shm_close(struct vm_area_str
+ 	shp->shm_lprid = task_tgid_vnr(current);
+ 	shp->shm_dtim = get_seconds();
+ 	shp->shm_nattch--;
+-	if (shm_may_destroy(ns, shp))
++	if (shm_may_destroy(shp))
+ 		shm_destroy(ns, shp);
+ 	else
+ 		shm_unlock(shp);
+@@ -314,10 +348,10 @@ static int shm_try_destroy_orphaned(int
+ 	 *
+ 	 * As shp->* are changed under rwsem, it's safe to skip shp locking.
+ 	 */
+-	if (shp->shm_creator != NULL)
++	if (!list_empty(&shp->shm_clist))
+ 		return 0;
+ 
+-	if (shm_may_destroy(ns, shp)) {
++	if (shm_may_destroy(shp)) {
+ 		shm_lock_by_ptr(shp);
+ 		shm_destroy(ns, shp);
+ 	}
+@@ -335,48 +369,97 @@ void shm_destroy_orphaned(struct ipc_nam
+ /* Locking assumes this will only be called with task == current */
+ void exit_shm(struct task_struct *task)
+ {
+-	struct ipc_namespace *ns = task->nsproxy->ipc_ns;
+-	struct shmid_kernel *shp, *n;
++	for (;;) {
++		struct shmid_kernel *shp;
++		struct ipc_namespace *ns;
+ 
+-	if (list_empty(&task->sysvshm.shm_clist))
+-		return;
++		task_lock(task);
++
++		if (list_empty(&task->sysvshm.shm_clist)) {
++			task_unlock(task);
++			break;
++		}
++
++		shp = list_first_entry(&task->sysvshm.shm_clist, struct shmid_kernel,
++				shm_clist);
+ 
+-	/*
+-	 * If kernel.shm_rmid_forced is not set then only keep track of
+-	 * which shmids are orphaned, so that a later set of the sysctl
+-	 * can clean them up.
+-	 */
+-	if (!ns->shm_rmid_forced) {
+-		down_read(&shm_ids(ns).rwsem);
+-		list_for_each_entry(shp, &task->sysvshm.shm_clist, shm_clist)
+-			shp->shm_creator = NULL;
+ 		/*
+-		 * Only under read lock but we are only called on current
+-		 * so no entry on the list will be shared.
++		 * 1) Get pointer to the ipc namespace. It is worth to say
++		 * that this pointer is guaranteed to be valid because
++		 * shp lifetime is always shorter than namespace lifetime
++		 * in which shp lives.
++		 * We taken task_lock it means that shp won't be freed.
+ 		 */
+-		list_del(&task->sysvshm.shm_clist);
+-		up_read(&shm_ids(ns).rwsem);
+-		return;
+-	}
++		ns = shp->ns;
+ 
+-	/*
+-	 * Destroy all already created segments, that were not yet mapped,
+-	 * and mark any mapped as orphan to cover the sysctl toggling.
+-	 * Destroy is skipped if shm_may_destroy() returns false.
+-	 */
+-	down_write(&shm_ids(ns).rwsem);
+-	list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) {
+-		shp->shm_creator = NULL;
++		/*
++		 * 2) If kernel.shm_rmid_forced is not set then only keep track of
++		 * which shmids are orphaned, so that a later set of the sysctl
++		 * can clean them up.
++		 */
++		if (!ns->shm_rmid_forced)
++			goto unlink_continue;
+ 
+-		if (shm_may_destroy(ns, shp)) {
+-			shm_lock_by_ptr(shp);
+-			shm_destroy(ns, shp);
++		/*
++		 * 3) get a reference to the namespace.
++		 *    The refcount could be already 0. If it is 0, then
++		 *    the shm objects will be free by free_ipc_work().
++		 */
++		ns = get_ipc_ns_not_zero(ns);
++		if (!ns) {
++unlink_continue:
++			list_del_init(&shp->shm_clist);
++			task_unlock(task);
++			continue;
+ 		}
+-	}
+ 
+-	/* Remove the list head from any segments still attached. */
+-	list_del(&task->sysvshm.shm_clist);
+-	up_write(&shm_ids(ns).rwsem);
++		/*
++		 * 4) get a reference to shp.
++		 *    This cannot fail: shm_clist_rm() is called before
++		 *    ipc_rmid(), thus the refcount cannot be 0.
++		 */
++		WARN_ON(!ipc_rcu_getref(&shp->shm_perm));
++
++		/*
++		 * 5) unlink the shm segment from the list of segments
++		 *    created by current.
++		 *    This must be done last. After unlinking,
++		 *    only the refcounts obtained above prevent IPC_RMID
++		 *    from destroying the segment or the namespace.
++		 */
++		list_del_init(&shp->shm_clist);
++
++		task_unlock(task);
++
++		/*
++		 * 6) we have all references
++		 *    Thus lock & if needed destroy shp.
++		 */
++		down_write(&shm_ids(ns).rwsem);
++		shm_lock_by_ptr(shp);
++		/*
++		 * rcu_read_lock was implicitly taken in shm_lock_by_ptr, it's
++		 * safe to call ipc_rcu_putref here
++		 */
++		ipc_rcu_putref(&shp->shm_perm, shm_rcu_free);
++
++		if (ipc_valid_object(&shp->shm_perm)) {
++			if (shm_may_destroy(shp))
++				shm_destroy(ns, shp);
++			else
++				shm_unlock(shp);
++		} else {
++			/*
++			 * Someone else deleted the shp from namespace
++			 * idr/kht while we have waited.
++			 * Just unlock and continue.
++			 */
++			shm_unlock(shp);
++		}
++
++		up_write(&shm_ids(ns).rwsem);
++		put_ipc_ns(ns); /* paired with get_ipc_ns_not_zero */
++	}
+ }
+ 
+ static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+@@ -607,7 +690,11 @@ static int newseg(struct ipc_namespace *
+ 		goto no_id;
+ 	}
+ 
++	shp->ns = ns;
++
++	task_lock(current);
+ 	list_add(&shp->shm_clist, &current->sysvshm.shm_clist);
++	task_unlock(current);
+ 
+ 	/*
+ 	 * shmid gets reported as "inode#" in /proc/pid/maps.
+@@ -1252,7 +1339,8 @@ out_nattch:
+ 	down_write(&shm_ids(ns).rwsem);
+ 	shp = shm_lock(ns, shmid);
+ 	shp->shm_nattch--;
+-
++	if (shm_may_destroy(shp))
+ 		shm_destroy(ns, shp);
+ 	else
+ 		shm_unlock(shp);
-- 
2.39.5
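
A note on the fuse fix queued above: the reordering is safe because, by the
time pipe_buf_release() runs, the stolen page has already been inserted into
the page cache and therefore carries an elevated refcount, so
anon_pipe_buf_release() cannot mistake it for a page it may recycle. The toy
model below (plain userspace C; every type and name is hypothetical, not the
kernel's) sketches that ordering under those assumptions:

#include <stdio.h>

/* Toy model (all names hypothetical) of the ordering enforced by the
 * fuse fix: release the pipe buffer while the stolen page still has an
 * elevated refcount, so a release callback that recycles pages at
 * refcount 1 cannot reuse a page that was stolen for the page cache. */

struct page { int refcount; int reused; };

struct pipe_buf {
	struct page *page;
	void (*release)(struct pipe_buf *buf);
};

/* Mimics anon_pipe_buf_release(): recycles the page if it holds the
 * only reference. */
static void anon_release(struct pipe_buf *buf)
{
	if (--buf->page->refcount == 0)
		buf->page->reused = 1;	/* page handed out for reuse */
}

int main(void)
{
	struct page pg = { .refcount = 1 };
	struct pipe_buf buf = { .page = &pg, .release = anon_release };

	pg.refcount++;		/* "steal": page cache takes its reference */

	buf.release(&buf);	/* sees refcount 2 -> 1: no reuse */
	buf.release = NULL;	/* the cleanup loop will skip released bufs */

	printf("reused after release: %d (refcount %d)\n",
	       pg.reused, pg.refcount);
	return 0;
}

Running the two steps in the opposite order would let anon_release() see
refcount 1 and recycle a page that is no longer the pipe's to reuse, which is
exactly the crash the patch describes.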
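The get_ipc_ns_not_zero() helper added by the shm patch is built on
atomic_inc_not_zero(): it takes a namespace reference only when the refcount
has not already dropped to zero, i.e. the namespace is not in the "state of
destruction". A minimal sketch of that idiom using C11 atomics (illustrative
only; the kernel's atomic_t API is different):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Illustrative only: the atomic_inc_not_zero() idiom behind
 * get_ipc_ns_not_zero().  Take a reference only if the object is not
 * already on its way to destruction (count == 0). */
static bool ref_get_not_zero(atomic_int *count)
{
	int old = atomic_load(count);

	while (old != 0) {
		/* Bump the count only if it is still what we last saw. */
		if (atomic_compare_exchange_weak(count, &old, old + 1))
			return true;
		/* CAS failure reloaded 'old'; re-check and retry. */
	}
	return false;	/* already zero: do not resurrect the object */
}

int main(void)
{
	atomic_int live = 1, dying = 0;

	printf("live:  %d\n", ref_get_not_zero(&live));	/* 1: ref taken */
	printf("dying: %d\n", ref_get_not_zero(&dying));	/* 0: refused */
	return 0;
}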
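Finally, the shm patch lets shm_clist_rm() and exit_shm() race to unlink the
same segment from the creator's shm_clist: both run list_del_init() under
task_lock(), the loser's call degrades to a no-op because the entry is left
self-linked, and list_empty() doubles as the "creator is dead" test. The
demonstration below uses a simplified userspace re-implementation of the list
primitives (not the kernel's list.h):

#include <stdbool.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void list_init(struct list_head *h)  { h->next = h->prev = h; }
static bool list_empty(const struct list_head *h) { return h->next == h; }

static void list_add(struct list_head *e, struct list_head *head)
{
	e->next = head->next;
	e->prev = head;
	head->next->prev = e;
	head->next = e;
}

/* Unlink and re-initialize: a second call on the same entry is a no-op
 * because the entry is left pointing at itself. */
static void list_del_init(struct list_head *e)
{
	e->prev->next = e->next;
	e->next->prev = e->prev;
	list_init(e);
}

int main(void)
{
	struct list_head clist, entry;

	list_init(&clist);
	list_add(&entry, &clist);

	list_del_init(&entry);		/* first remover wins */
	list_del_init(&entry);		/* racing remover: harmless no-op */

	printf("entry unlinked: %d, creator list empty: %d\n",
	       list_empty(&entry), list_empty(&clist));
	return 0;
}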