From a3d0608952a1ce0ff94e0f73ebb51e79041d27b3 Mon Sep 17 00:00:00 2001
From: Greg Kroah-Hartman
Date: Tue, 30 Nov 2021 11:38:07 +0100
Subject: [PATCH] 4.4-stable patches

added patches:
	fuse-fix-page-stealing.patch
	shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses.patch
---
 queue-4.4/fuse-fix-page-stealing.patch        |  61 +++
 ...fuse-release-pipe-buf-after-last-use.patch |   7 +-
 queue-4.4/series                              |   2 +
 ...upport-objects-from-several-ipc-nses.patch | 390 ++++++++++++++++++
 4 files changed, 456 insertions(+), 4 deletions(-)
 create mode 100644 queue-4.4/fuse-fix-page-stealing.patch
 create mode 100644 queue-4.4/shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses.patch

diff --git a/queue-4.4/fuse-fix-page-stealing.patch b/queue-4.4/fuse-fix-page-stealing.patch
new file mode 100644
index 00000000000..33cd2bb38b1
--- /dev/null
+++ b/queue-4.4/fuse-fix-page-stealing.patch
@@ -0,0 +1,61 @@
+From 712a951025c0667ff00b25afc360f74e639dfabe Mon Sep 17 00:00:00 2001
+From: Miklos Szeredi
+Date: Tue, 2 Nov 2021 11:10:37 +0100
+Subject: fuse: fix page stealing
+
+From: Miklos Szeredi
+
+commit 712a951025c0667ff00b25afc360f74e639dfabe upstream.
+
+It is possible to trigger a crash by splicing anon pipe bufs to the fuse
+device.
+
+The reason for this is that anon_pipe_buf_release() will reuse buf->page if
+the refcount is 1, but that page might have already been stolen and its
+flags modified (e.g. PG_lru added).
+
+This happens in the unlikely case of fuse_dev_splice_write() getting around
+to calling pipe_buf_release() after a page has been stolen, added to the
+page cache and removed from the page cache.
+
+Fix by calling pipe_buf_release() right after the page was inserted into
+the page cache.  In this case the page has an elevated refcount so any
+release function will know that the page isn't reusable.
+
+Reported-by: Frank Dinoff
+Link: https://lore.kernel.org/r/CAAmZXrsGg2xsP1CK+cbuEMumtrqdvD-NKnWzhNcvn71RV3c1yw@mail.gmail.com/
+Fixes: dd3bb14f44a6 ("fuse: support splice() writing to fuse device")
+Cc: <stable@vger.kernel.org> # v2.6.35
+Signed-off-by: Miklos Szeredi
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ fs/fuse/dev.c |   10 +++++++++-
+ 1 file changed, 9 insertions(+), 1 deletion(-)
+
+--- a/fs/fuse/dev.c
++++ b/fs/fuse/dev.c
+@@ -922,6 +922,13 @@ static int fuse_try_move_page(struct fus
+ 		return err;
+ 	}
+ 
++	/*
++	 * Release while we have extra ref on stolen page.  Otherwise
++	 * anon_pipe_buf_release() might think the page can be reused.
++	 */
++	buf->ops->release(cs->pipe, buf);
++	buf->ops = NULL;
++
+ 	page_cache_get(newpage);
+ 
+ 	if (!(buf->flags & PIPE_BUF_FLAG_LRU))
+@@ -2090,7 +2097,8 @@ static ssize_t fuse_dev_splice_write(str
+ out_free:
+ 	for (idx = 0; idx < nbuf; idx++) {
+ 		struct pipe_buffer *buf = &bufs[idx];
+-		buf->ops->release(pipe, buf);
++		if (buf->ops)
++			buf->ops->release(pipe, buf);
+ 	}
+ 	pipe_unlock(pipe);
+ 
diff --git a/queue-4.4/fuse-release-pipe-buf-after-last-use.patch b/queue-4.4/fuse-release-pipe-buf-after-last-use.patch
index 4a75c6e12f2..869f1bd7a9e 100644
--- a/queue-4.4/fuse-release-pipe-buf-after-last-use.patch
+++ b/queue-4.4/fuse-release-pipe-buf-after-last-use.patch
@@ -19,7 +19,6 @@ Fixes: 712a951025c0 ("fuse: fix page stealing")
 Cc: <stable@vger.kernel.org> # v2.6.35
 Signed-off-by: Miklos Szeredi
 Signed-off-by: Greg Kroah-Hartman
-
 ---
  fs/fuse/dev.c |   10 +++++-----
  1 file changed, 5 insertions(+), 5 deletions(-)
@@ -38,9 +37,9 @@ Signed-off-by: Greg Kroah-Hartman
  	/*
  	 * This is a new and locked page, it shouldn't be mapped or
  	 * have any special flags on it
-@@ -922,11 +927,6 @@ static int fuse_try_move_page(struct fus
- 	return err;
- 	}
+@@ -929,11 +934,6 @@ static int fuse_try_move_page(struct fus
+ 	buf->ops->release(cs->pipe, buf);
+ 	buf->ops = NULL;
 
 -	page_cache_get(newpage);
 -
diff --git a/queue-4.4/series b/queue-4.4/series
index a7828cb1759..eaeaf1c2b70 100644
--- a/queue-4.4/series
+++ b/queue-4.4/series
@@ -19,7 +19,9 @@ tracing-check-pid-filtering-when-creating-events.patch
 hugetlbfs-flush-tlbs-correctly-after-huge_pmd_unshare.patch
 proc-vmcore-fix-clearing-user-buffer-by-properly-using-clear_user.patch
 nfc-add-nci_unreg-flag-to-eliminate-the-race.patch
+fuse-fix-page-stealing.patch
 fuse-release-pipe-buf-after-last-use.patch
+shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses.patch
 xen-sync-include-xen-interface-io-ring.h-with-xen-s-newest-version.patch
 xen-blkfront-read-response-from-backend-only-once.patch
 xen-blkfront-don-t-take-local-copy-of-a-request-from-the-ring-page.patch
diff --git a/queue-4.4/shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses.patch b/queue-4.4/shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses.patch
new file mode 100644
index 00000000000..e65edab55b4
--- /dev/null
+++ b/queue-4.4/shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses.patch
@@ -0,0 +1,390 @@
+From 85b6d24646e4125c591639841169baa98a2da503 Mon Sep 17 00:00:00 2001
+From: Alexander Mikhalitsyn
+Date: Fri, 19 Nov 2021 16:43:21 -0800
+Subject: shm: extend forced shm destroy to support objects from several IPC nses
+
+From: Alexander Mikhalitsyn
+
+commit 85b6d24646e4125c591639841169baa98a2da503 upstream.
+
+Currently, the exit_shm() function is not designed to work properly when
+task->sysvshm.shm_clist holds shm objects from different IPC namespaces.
+
+This is a real pain when sysctl kernel.shm_rmid_forced = 1, because it
+leads to a use-after-free (a reproducer exists).
+
+This is an attempt to fix the problem by extending the exit_shm mechanism
+to handle destruction of shm objects from several IPC namespaces.
+
+To achieve that we do several things:
+
+1. add a namespace (non-refcounted) pointer to the struct shmid_kernel
+
+2. during new shm object creation (newseg()/shmget syscall) we
+   initialize this pointer with the current task's IPC namespace
+
+3. exit_shm() is fully reworked such that it traverses all shp's in
+   task->sysvshm.shm_clist and gets the IPC namespace not from the
+   current task as it was before, but from the shp object itself, then
+   calls shm_destroy(shp, ns).
+
+Note: We need to be really careful here because, as noted in (1), our
+pointer to the IPC namespace is not refcounted. To be on the safe side
+we use the special helper get_ipc_ns_not_zero(), which takes a reference
+on the IPC namespace only if it is not in the "state of destruction".
+
+Q/A
+
+Q: Why can we access shp->ns memory using a non-refcounted pointer?
+A: Because the shp object's lifetime is always shorter than that of the
+   IPC namespace, so if we get the shp object from task->sysvshm.shm_clist
+   while holding task_lock(task), nobody can steal our namespace.
+
+Q: Does this patch change the semantics of unshare/setns/clone syscalls?
+A: No. It just fixes a previously uncovered case in which a process may
+   leave an IPC namespace without getting task->sysvshm.shm_clist cleaned up.
+
+Link: https://lkml.kernel.org/r/67bb03e5-f79c-1815-e2bf-949c67047418@colorfullife.com
+Link: https://lkml.kernel.org/r/20211109151501.4921-1-manfred@colorfullife.com
+Fixes: ab602f79915 ("shm: make exit_shm work proportional to task activity")
+Co-developed-by: Manfred Spraul
+Signed-off-by: Manfred Spraul
+Signed-off-by: Alexander Mikhalitsyn
+Cc: "Eric W. Biederman"
+Cc: Davidlohr Bueso
+Cc: Greg KH
+Cc: Andrei Vagin
+Cc: Pavel Tikhomirov
+Cc: Vasily Averin
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton
+Signed-off-by: Linus Torvalds
+Signed-off-by: Greg Kroah-Hartman
+
+---
+ include/linux/ipc_namespace.h |   15 +++
+ include/linux/sched.h         |    2 
+ include/linux/shm.h           |   13 ++-
+ ipc/shm.c                     |  176 +++++++++++++++++++++++++++++++-----------
+ 4 files changed, 159 insertions(+), 47 deletions(-)
+
+--- a/include/linux/ipc_namespace.h
++++ b/include/linux/ipc_namespace.h
+@@ -123,6 +123,16 @@ static inline struct ipc_namespace *get_
+ 	return ns;
+ }
+ 
++static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns)
++{
++	if (ns) {
++		if (atomic_inc_not_zero(&ns->count))
++			return ns;
++	}
++
++	return NULL;
++}
++
+ extern void put_ipc_ns(struct ipc_namespace *ns);
+ #else
+ static inline struct ipc_namespace *copy_ipcs(unsigned long flags,
+@@ -138,6 +148,11 @@ static inline struct ipc_namespace *get_
+ {
+ 	return ns;
+ }
++
++static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns)
++{
++	return ns;
++}
+ 
+ static inline void put_ipc_ns(struct ipc_namespace *ns)
+ {
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -2792,7 +2792,7 @@ static inline int thread_group_empty(str
+  * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
+  * subscriptions and synchronises with wait4().  Also used in procfs.  Also
+  * pins the final release of task.io_context.  Also protects ->cpuset and
+- * ->cgroup.subsys[]. And ->vfork_done.
++ * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist.
+  *
+  * Nests both inside and outside of read_lock(&tasklist_lock).
+  * It must not be nested with write_lock_irq(&tasklist_lock),
+--- a/include/linux/shm.h
++++ b/include/linux/shm.h
+@@ -19,9 +19,18 @@ struct shmid_kernel /* private to the ke
+ 	pid_t			shm_lprid;
+ 	struct user_struct	*mlock_user;
+ 
+-	/* The task created the shm object.  NULL if the task is dead. */
++	/*
++	 * The task created the shm object, for
++	 * task_lock(shp->shm_creator)
++	 */
+ 	struct task_struct	*shm_creator;
+-	struct list_head	shm_clist;	/* list by creator */
++
++	/*
++	 * List by creator. task_lock(->shm_creator) required for read/write.
++	 * If list_empty(), then the creator is dead already.
++	 */
++	struct list_head	shm_clist;
++	struct ipc_namespace	*ns;
+ };
+ 
+ /* shm_mode upper byte flags */
+--- a/ipc/shm.c
++++ b/ipc/shm.c
+@@ -90,6 +90,7 @@ static void do_shm_rmid(struct ipc_names
+ {
+ 	struct shmid_kernel *shp;
+ 	shp = container_of(ipcp, struct shmid_kernel, shm_perm);
++	WARN_ON(ns != shp->ns);
+ 
+ 	if (shp->shm_nattch) {
+ 		shp->shm_perm.mode |= SHM_DEST;
+@@ -180,10 +181,43 @@ static void shm_rcu_free(struct rcu_head
+ 	ipc_rcu_free(head);
+ }
+ 
+-static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s)
++/*
++ * It has to be called with shp locked.
++ * It must be called before ipc_rmid()
++ */
++static inline void shm_clist_rm(struct shmid_kernel *shp)
++{
++	struct task_struct *creator;
++
++	/* ensure that shm_creator does not disappear */
++	rcu_read_lock();
++
++	/*
++	 * A concurrent exit_shm may do a list_del_init() as well.
++	 * Just do nothing if exit_shm already did the work
++	 */
++	if (!list_empty(&shp->shm_clist)) {
++		/*
++		 * shp->shm_creator is guaranteed to be valid *only*
++		 * if shp->shm_clist is not empty.
++		 */
++		creator = shp->shm_creator;
++
++		task_lock(creator);
++		/*
++		 * list_del_init() is a nop if the entry was already removed
++		 * from the list.
++		 */
++		list_del_init(&shp->shm_clist);
++		task_unlock(creator);
++	}
++	rcu_read_unlock();
++}
++
++static inline void shm_rmid(struct shmid_kernel *s)
+ {
+-	list_del(&s->shm_clist);
+-	ipc_rmid(&shm_ids(ns), &s->shm_perm);
++	shm_clist_rm(s);
++	ipc_rmid(&shm_ids(s->ns), &s->shm_perm);
+ }
+ 
+ 
+@@ -238,7 +272,7 @@ static void shm_destroy(struct ipc_names
+ 	shm_file = shp->shm_file;
+ 	shp->shm_file = NULL;
+ 	ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT;
+-	shm_rmid(ns, shp);
++	shm_rmid(shp);
+ 	shm_unlock(shp);
+ 	if (!is_file_hugepages(shm_file))
+ 		shmem_lock(shm_file, 0, shp->mlock_user);
+@@ -259,10 +293,10 @@ static void shm_destroy(struct ipc_names
+  *
+  * 2) sysctl kernel.shm_rmid_forced is set to 1.
+  */
+-static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
++static bool shm_may_destroy(struct shmid_kernel *shp)
+ {
+ 	return (shp->shm_nattch == 0) &&
+-	       (ns->shm_rmid_forced ||
++	       (shp->ns->shm_rmid_forced ||
+ 		(shp->shm_perm.mode & SHM_DEST));
+ }
+ 
+@@ -293,7 +327,7 @@ static void shm_close(struct vm_area_str
+ 	shp->shm_lprid = task_tgid_vnr(current);
+ 	shp->shm_dtim = get_seconds();
+ 	shp->shm_nattch--;
+-	if (shm_may_destroy(ns, shp))
++	if (shm_may_destroy(shp))
+ 		shm_destroy(ns, shp);
+ 	else
+ 		shm_unlock(shp);
+@@ -314,10 +348,10 @@ static int shm_try_destroy_orphaned(int
+ 	 *
+ 	 * As shp->* are changed under rwsem, it's safe to skip shp locking.
+ 	 */
+-	if (shp->shm_creator != NULL)
++	if (!list_empty(&shp->shm_clist))
+ 		return 0;
+ 
+-	if (shm_may_destroy(ns, shp)) {
++	if (shm_may_destroy(shp)) {
+ 		shm_lock_by_ptr(shp);
+ 		shm_destroy(ns, shp);
+ 	}
+@@ -335,48 +369,97 @@ void shm_destroy_orphaned(struct ipc_nam
+ /* Locking assumes this will only be called with task == current */
+ void exit_shm(struct task_struct *task)
+ {
+-	struct ipc_namespace *ns = task->nsproxy->ipc_ns;
+-	struct shmid_kernel *shp, *n;
++	for (;;) {
++		struct shmid_kernel *shp;
++		struct ipc_namespace *ns;
+ 
+-	if (list_empty(&task->sysvshm.shm_clist))
+-		return;
++		task_lock(task);
++
++		if (list_empty(&task->sysvshm.shm_clist)) {
++			task_unlock(task);
++			break;
++		}
++
++		shp = list_first_entry(&task->sysvshm.shm_clist, struct shmid_kernel,
++				shm_clist);
+ 
+-	/*
+-	 * If kernel.shm_rmid_forced is not set then only keep track of
+-	 * which shmids are orphaned, so that a later set of the sysctl
+-	 * can clean them up.
+-	 */
+-	if (!ns->shm_rmid_forced) {
+-		down_read(&shm_ids(ns).rwsem);
+-		list_for_each_entry(shp, &task->sysvshm.shm_clist, shm_clist)
+-			shp->shm_creator = NULL;
+ 		/*
+-		 * Only under read lock but we are only called on current
+-		 * so no entry on the list will be shared.
++		 * 1) Get pointer to the ipc namespace. It is worth to say
++		 * that this pointer is guaranteed to be valid because
++		 * shp lifetime is always shorter than namespace lifetime
++		 * in which shp lives.
++		 * We taken task_lock it means that shp won't be freed.
+ 		 */
+-		list_del(&task->sysvshm.shm_clist);
+-		up_read(&shm_ids(ns).rwsem);
+-		return;
+-	}
++		ns = shp->ns;
+ 
+-	/*
+-	 * Destroy all already created segments, that were not yet mapped,
+-	 * and mark any mapped as orphan to cover the sysctl toggling.
+-	 * Destroy is skipped if shm_may_destroy() returns false.
+-	 */
+-	down_write(&shm_ids(ns).rwsem);
+-	list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) {
+-		shp->shm_creator = NULL;
++		/*
++		 * 2) If kernel.shm_rmid_forced is not set then only keep track of
++		 * which shmids are orphaned, so that a later set of the sysctl
++		 * can clean them up.
++		 */
++		if (!ns->shm_rmid_forced)
++			goto unlink_continue;
+ 
+-		if (shm_may_destroy(ns, shp)) {
+-			shm_lock_by_ptr(shp);
+-			shm_destroy(ns, shp);
++		/*
++		 * 3) get a reference to the namespace.
++		 *    The refcount could be already 0. If it is 0, then
++		 *    the shm objects will be free by free_ipc_work().
++		 */
++		ns = get_ipc_ns_not_zero(ns);
++		if (!ns) {
++unlink_continue:
++			list_del_init(&shp->shm_clist);
++			task_unlock(task);
++			continue;
+ 		}
+-	}
+ 
+-	/* Remove the list head from any segments still attached. */
+-	list_del(&task->sysvshm.shm_clist);
+-	up_write(&shm_ids(ns).rwsem);
++		/*
++		 * 4) get a reference to shp.
++		 *    This cannot fail: shm_clist_rm() is called before
++		 *    ipc_rmid(), thus the refcount cannot be 0.
++		 */
++		WARN_ON(!ipc_rcu_getref(&shp->shm_perm));
++
++		/*
++		 * 5) unlink the shm segment from the list of segments
++		 *    created by current.
++		 *    This must be done last. After unlinking,
++		 *    only the refcounts obtained above prevent IPC_RMID
++		 *    from destroying the segment or the namespace.
++		 */
++		list_del_init(&shp->shm_clist);
++
++		task_unlock(task);
++
++		/*
++		 * 6) we have all references
++		 *    Thus lock & if needed destroy shp.
++		 */
++		down_write(&shm_ids(ns).rwsem);
++		shm_lock_by_ptr(shp);
++		/*
++		 * rcu_read_lock was implicitly taken in shm_lock_by_ptr, it's
++		 * safe to call ipc_rcu_putref here
++		 */
++		ipc_rcu_putref(&shp->shm_perm, shm_rcu_free);
++
++		if (ipc_valid_object(&shp->shm_perm)) {
++			if (shm_may_destroy(shp))
++				shm_destroy(ns, shp);
++			else
++				shm_unlock(shp);
++		} else {
++			/*
++			 * Someone else deleted the shp from namespace
++			 * idr/kht while we have waited.
++			 * Just unlock and continue.
++			 */
++			shm_unlock(shp);
++		}
++
++		up_write(&shm_ids(ns).rwsem);
++		put_ipc_ns(ns); /* paired with get_ipc_ns_not_zero */
++	}
+ }
+ 
+ static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
+@@ -607,7 +690,11 @@ static int newseg(struct ipc_namespace *
+ 		goto no_id;
+ 	}
+ 
++	shp->ns = ns;
++
++	task_lock(current);
+ 	list_add(&shp->shm_clist, &current->sysvshm.shm_clist);
++	task_unlock(current);
+ 
+ 	/*
+ 	 * shmid gets reported as "inode#" in /proc/pid/maps.
+@@ -1252,7 +1339,8 @@ out_nattch:
+ 	down_write(&shm_ids(ns).rwsem);
+ 	shp = shm_lock(ns, shmid);
+ 	shp->shm_nattch--;
+-
++	if (shm_may_destroy(shp))
+ 		shm_destroy(ns, shp);
+ 	else
+ 		shm_unlock(shp);
-- 
2.39.5
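
A note on the fuse fix queued above: the reordering is safe because, by the
time pipe_buf_release() runs, the stolen page has already been inserted into
the page cache and therefore carries an elevated refcount, so
anon_pipe_buf_release() cannot mistake it for a page it may recycle. The toy
model below (plain userspace C; every type and name is hypothetical, not the
kernel's) sketches that ordering under those assumptions:

#include <stdio.h>

/* Toy model (all names hypothetical) of the ordering enforced by the
 * fuse fix: release the pipe buffer while the stolen page still has an
 * elevated refcount, so a release callback that recycles pages at
 * refcount 1 cannot reuse a page that was stolen for the page cache. */

struct page { int refcount; int reused; };

struct pipe_buf {
	struct page *page;
	void (*release)(struct pipe_buf *buf);
};

/* Mimics anon_pipe_buf_release(): recycles the page if it holds the
 * only reference. */
static void anon_release(struct pipe_buf *buf)
{
	if (--buf->page->refcount == 0)
		buf->page->reused = 1;	/* page handed out for reuse */
}

int main(void)
{
	struct page pg = { .refcount = 1 };
	struct pipe_buf buf = { .page = &pg, .release = anon_release };

	pg.refcount++;		/* "steal": page cache takes its reference */

	buf.release(&buf);	/* sees refcount 2 -> 1: no reuse */
	buf.release = NULL;	/* the cleanup loop will skip released bufs */

	printf("reused after release: %d (refcount %d)\n",
	       pg.reused, pg.refcount);
	return 0;
}

Running the two steps in the opposite order would let anon_release() see
refcount 1 and recycle a page that is no longer the pipe's to reuse, which is
exactly the crash the patch describes.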
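The get_ipc_ns_not_zero() helper added by the shm patch is built on
atomic_inc_not_zero(): it takes a namespace reference only when the refcount
has not already dropped to zero, i.e. the namespace is not in the "state of
destruction". A minimal sketch of that idiom using C11 atomics (illustrative
only; the kernel's atomic_t API is different):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

/* Illustrative only: the atomic_inc_not_zero() idiom behind
 * get_ipc_ns_not_zero().  Take a reference only if the object is not
 * already on its way to destruction (count == 0). */
static bool ref_get_not_zero(atomic_int *count)
{
	int old = atomic_load(count);

	while (old != 0) {
		/* Bump the count only if it is still what we last saw. */
		if (atomic_compare_exchange_weak(count, &old, old + 1))
			return true;
		/* CAS failure reloaded 'old'; re-check and retry. */
	}
	return false;	/* already zero: do not resurrect the object */
}

int main(void)
{
	atomic_int live = 1, dying = 0;

	printf("live:  %d\n", ref_get_not_zero(&live));	/* 1: ref taken */
	printf("dying: %d\n", ref_get_not_zero(&dying));	/* 0: refused */
	return 0;
}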
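Finally, the shm patch lets shm_clist_rm() and exit_shm() race to unlink the
same segment from the creator's shm_clist: both run list_del_init() under
task_lock(), the loser's call degrades to a no-op because the entry is left
self-linked, and list_empty() doubles as the "creator is dead" test. The
demonstration below uses a simplified userspace re-implementation of the list
primitives (not the kernel's list.h):

#include <stdbool.h>
#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void list_init(struct list_head *h)  { h->next = h->prev = h; }
static bool list_empty(const struct list_head *h) { return h->next == h; }

static void list_add(struct list_head *e, struct list_head *head)
{
	e->next = head->next;
	e->prev = head;
	head->next->prev = e;
	head->next = e;
}

/* Unlink and re-initialize: a second call on the same entry is a no-op
 * because the entry is left pointing at itself. */
static void list_del_init(struct list_head *e)
{
	e->prev->next = e->next;
	e->next->prev = e->prev;
	list_init(e);
}

int main(void)
{
	struct list_head clist, entry;

	list_init(&clist);
	list_add(&entry, &clist);

	list_del_init(&entry);		/* first remover wins */
	list_del_init(&entry);		/* racing remover: harmless no-op */

	printf("entry unlinked: %d, creator list empty: %d\n",
	       list_empty(&entry), list_empty(&clist));
	return 0;
}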