From: Greg Kroah-Hartman Date: Tue, 30 Nov 2021 10:38:28 +0000 (+0100) Subject: 4.9-stable patches X-Git-Tag: v5.15.6~4 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=82a6028d110ecb50f5a962c1a328a38b3efadb90;p=thirdparty%2Fkernel%2Fstable-queue.git 4.9-stable patches added patches: shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses.patch --- diff --git a/queue-4.9/series b/queue-4.9/series index 873b3f7d9b7..14638b295ec 100644 --- a/queue-4.9/series +++ b/queue-4.9/series @@ -34,3 +34,4 @@ xen-netfront-don-t-read-data-from-request-on-the-ring-page.patch xen-netfront-disentangle-tx_skb_freelist.patch xen-netfront-don-t-trust-the-backend-response-data-blindly.patch tty-hvc-replace-bug_on-with-negative-return-value.patch +shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses.patch diff --git a/queue-4.9/shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses.patch b/queue-4.9/shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses.patch new file mode 100644 index 00000000000..57ea8627de9 --- /dev/null +++ b/queue-4.9/shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses.patch @@ -0,0 +1,389 @@ +From 85b6d24646e4125c591639841169baa98a2da503 Mon Sep 17 00:00:00 2001 +From: Alexander Mikhalitsyn +Date: Fri, 19 Nov 2021 16:43:21 -0800 +Subject: shm: extend forced shm destroy to support objects from several IPC nses + +From: Alexander Mikhalitsyn + +commit 85b6d24646e4125c591639841169baa98a2da503 upstream. + +Currently, the exit_shm() function not designed to work properly when +task->sysvshm.shm_clist holds shm objects from different IPC namespaces. + +This is a real pain when sysctl kernel.shm_rmid_forced = 1, because it +leads to use-after-free (reproducer exists). + +This is an attempt to fix the problem by extending exit_shm mechanism to +handle shm's destroy from several IPC ns'es. + +To achieve that we do several things: + +1. add a namespace (non-refcounted) pointer to the struct shmid_kernel + +2. during new shm object creation (newseg()/shmget syscall) we + initialize this pointer by current task IPC ns + +3. exit_shm() fully reworked such that it traverses over all shp's in + task->sysvshm.shm_clist and gets IPC namespace not from current task + as it was before but from shp's object itself, then call + shm_destroy(shp, ns). + +Note: We need to be really careful here, because as it was said before +(1), our pointer to IPC ns non-refcnt'ed. To be on the safe side we +using special helper get_ipc_ns_not_zero() which allows to get IPC ns +refcounter only if IPC ns not in the "state of destruction". + +Q/A + +Q: Why can we access shp->ns memory using non-refcounted pointer? +A: Because shp object lifetime is always shorther than IPC namespace + lifetime, so, if we get shp object from the task->sysvshm.shm_clist + while holding task_lock(task) nobody can steal our namespace. + +Q: Does this patch change semantics of unshare/setns/clone syscalls? +A: No. It's just fixes non-covered case when process may leave IPC + namespace without getting task->sysvshm.shm_clist list cleaned up. + +Link: https://lkml.kernel.org/r/67bb03e5-f79c-1815-e2bf-949c67047418@colorfullife.com +Link: https://lkml.kernel.org/r/20211109151501.4921-1-manfred@colorfullife.com +Fixes: ab602f79915 ("shm: make exit_shm work proportional to task activity") +Co-developed-by: Manfred Spraul +Signed-off-by: Manfred Spraul +Signed-off-by: Alexander Mikhalitsyn +Cc: "Eric W. Biederman" +Cc: Davidlohr Bueso +Cc: Greg KH +Cc: Andrei Vagin +Cc: Pavel Tikhomirov +Cc: Vasily Averin +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/ipc_namespace.h | 15 +++ + include/linux/sched.h | 2 + include/linux/shm.h | 13 ++- + ipc/shm.c | 176 +++++++++++++++++++++++++++++++----------- + 4 files changed, 159 insertions(+), 47 deletions(-) + +--- a/include/linux/ipc_namespace.h ++++ b/include/linux/ipc_namespace.h +@@ -122,6 +122,16 @@ static inline struct ipc_namespace *get_ + return ns; + } + ++static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns) ++{ ++ if (ns) { ++ if (atomic_inc_not_zero(&ns->count)) ++ return ns; ++ } ++ ++ return NULL; ++} ++ + extern void put_ipc_ns(struct ipc_namespace *ns); + #else + static inline struct ipc_namespace *copy_ipcs(unsigned long flags, +@@ -137,6 +147,11 @@ static inline struct ipc_namespace *get_ + { + return ns; + } ++ ++static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns) ++{ ++ return ns; ++} + + static inline void put_ipc_ns(struct ipc_namespace *ns) + { +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -3106,7 +3106,7 @@ static inline int thread_group_empty(str + * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring + * subscriptions and synchronises with wait4(). Also used in procfs. Also + * pins the final release of task.io_context. Also protects ->cpuset and +- * ->cgroup.subsys[]. And ->vfork_done. ++ * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist. + * + * Nests both inside and outside of read_lock(&tasklist_lock). + * It must not be nested with write_lock_irq(&tasklist_lock), +--- a/include/linux/shm.h ++++ b/include/linux/shm.h +@@ -19,9 +19,18 @@ struct shmid_kernel /* private to the ke + pid_t shm_lprid; + struct user_struct *mlock_user; + +- /* The task created the shm object. NULL if the task is dead. */ ++ /* ++ * The task created the shm object, for ++ * task_lock(shp->shm_creator) ++ */ + struct task_struct *shm_creator; +- struct list_head shm_clist; /* list by creator */ ++ ++ /* ++ * List by creator. task_lock(->shm_creator) required for read/write. ++ * If list_empty(), then the creator is dead already. ++ */ ++ struct list_head shm_clist; ++ struct ipc_namespace *ns; + }; + + /* shm_mode upper byte flags */ +--- a/ipc/shm.c ++++ b/ipc/shm.c +@@ -90,6 +90,7 @@ static void do_shm_rmid(struct ipc_names + { + struct shmid_kernel *shp; + shp = container_of(ipcp, struct shmid_kernel, shm_perm); ++ WARN_ON(ns != shp->ns); + + if (shp->shm_nattch) { + shp->shm_perm.mode |= SHM_DEST; +@@ -180,10 +181,43 @@ static void shm_rcu_free(struct rcu_head + ipc_rcu_free(head); + } + +-static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s) ++/* ++ * It has to be called with shp locked. ++ * It must be called before ipc_rmid() ++ */ ++static inline void shm_clist_rm(struct shmid_kernel *shp) ++{ ++ struct task_struct *creator; ++ ++ /* ensure that shm_creator does not disappear */ ++ rcu_read_lock(); ++ ++ /* ++ * A concurrent exit_shm may do a list_del_init() as well. ++ * Just do nothing if exit_shm already did the work ++ */ ++ if (!list_empty(&shp->shm_clist)) { ++ /* ++ * shp->shm_creator is guaranteed to be valid *only* ++ * if shp->shm_clist is not empty. ++ */ ++ creator = shp->shm_creator; ++ ++ task_lock(creator); ++ /* ++ * list_del_init() is a nop if the entry was already removed ++ * from the list. ++ */ ++ list_del_init(&shp->shm_clist); ++ task_unlock(creator); ++ } ++ rcu_read_unlock(); ++} ++ ++static inline void shm_rmid(struct shmid_kernel *s) + { +- list_del(&s->shm_clist); +- ipc_rmid(&shm_ids(ns), &s->shm_perm); ++ shm_clist_rm(s); ++ ipc_rmid(&shm_ids(s->ns), &s->shm_perm); + } + + +@@ -238,7 +272,7 @@ static void shm_destroy(struct ipc_names + shm_file = shp->shm_file; + shp->shm_file = NULL; + ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT; +- shm_rmid(ns, shp); ++ shm_rmid(shp); + shm_unlock(shp); + if (!is_file_hugepages(shm_file)) + shmem_lock(shm_file, 0, shp->mlock_user); +@@ -259,10 +293,10 @@ static void shm_destroy(struct ipc_names + * + * 2) sysctl kernel.shm_rmid_forced is set to 1. + */ +-static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) ++static bool shm_may_destroy(struct shmid_kernel *shp) + { + return (shp->shm_nattch == 0) && +- (ns->shm_rmid_forced || ++ (shp->ns->shm_rmid_forced || + (shp->shm_perm.mode & SHM_DEST)); + } + +@@ -293,7 +327,7 @@ static void shm_close(struct vm_area_str + shp->shm_lprid = task_tgid_vnr(current); + shp->shm_dtim = get_seconds(); + shp->shm_nattch--; +- if (shm_may_destroy(ns, shp)) ++ if (shm_may_destroy(shp)) + shm_destroy(ns, shp); + else + shm_unlock(shp); +@@ -314,10 +348,10 @@ static int shm_try_destroy_orphaned(int + * + * As shp->* are changed under rwsem, it's safe to skip shp locking. + */ +- if (shp->shm_creator != NULL) ++ if (!list_empty(&shp->shm_clist)) + return 0; + +- if (shm_may_destroy(ns, shp)) { ++ if (shm_may_destroy(shp)) { + shm_lock_by_ptr(shp); + shm_destroy(ns, shp); + } +@@ -335,48 +369,97 @@ void shm_destroy_orphaned(struct ipc_nam + /* Locking assumes this will only be called with task == current */ + void exit_shm(struct task_struct *task) + { +- struct ipc_namespace *ns = task->nsproxy->ipc_ns; +- struct shmid_kernel *shp, *n; ++ for (;;) { ++ struct shmid_kernel *shp; ++ struct ipc_namespace *ns; + +- if (list_empty(&task->sysvshm.shm_clist)) +- return; ++ task_lock(task); ++ ++ if (list_empty(&task->sysvshm.shm_clist)) { ++ task_unlock(task); ++ break; ++ } ++ ++ shp = list_first_entry(&task->sysvshm.shm_clist, struct shmid_kernel, ++ shm_clist); + +- /* +- * If kernel.shm_rmid_forced is not set then only keep track of +- * which shmids are orphaned, so that a later set of the sysctl +- * can clean them up. +- */ +- if (!ns->shm_rmid_forced) { +- down_read(&shm_ids(ns).rwsem); +- list_for_each_entry(shp, &task->sysvshm.shm_clist, shm_clist) +- shp->shm_creator = NULL; + /* +- * Only under read lock but we are only called on current +- * so no entry on the list will be shared. ++ * 1) Get pointer to the ipc namespace. It is worth to say ++ * that this pointer is guaranteed to be valid because ++ * shp lifetime is always shorter than namespace lifetime ++ * in which shp lives. ++ * We taken task_lock it means that shp won't be freed. + */ +- list_del(&task->sysvshm.shm_clist); +- up_read(&shm_ids(ns).rwsem); +- return; +- } ++ ns = shp->ns; + +- /* +- * Destroy all already created segments, that were not yet mapped, +- * and mark any mapped as orphan to cover the sysctl toggling. +- * Destroy is skipped if shm_may_destroy() returns false. +- */ +- down_write(&shm_ids(ns).rwsem); +- list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) { +- shp->shm_creator = NULL; ++ /* ++ * 2) If kernel.shm_rmid_forced is not set then only keep track of ++ * which shmids are orphaned, so that a later set of the sysctl ++ * can clean them up. ++ */ ++ if (!ns->shm_rmid_forced) ++ goto unlink_continue; + +- if (shm_may_destroy(ns, shp)) { +- shm_lock_by_ptr(shp); +- shm_destroy(ns, shp); ++ /* ++ * 3) get a reference to the namespace. ++ * The refcount could be already 0. If it is 0, then ++ * the shm objects will be free by free_ipc_work(). ++ */ ++ ns = get_ipc_ns_not_zero(ns); ++ if (!ns) { ++unlink_continue: ++ list_del_init(&shp->shm_clist); ++ task_unlock(task); ++ continue; + } +- } + +- /* Remove the list head from any segments still attached. */ +- list_del(&task->sysvshm.shm_clist); +- up_write(&shm_ids(ns).rwsem); ++ /* ++ * 4) get a reference to shp. ++ * This cannot fail: shm_clist_rm() is called before ++ * ipc_rmid(), thus the refcount cannot be 0. ++ */ ++ WARN_ON(!ipc_rcu_getref(&shp->shm_perm)); ++ ++ /* ++ * 5) unlink the shm segment from the list of segments ++ * created by current. ++ * This must be done last. After unlinking, ++ * only the refcounts obtained above prevent IPC_RMID ++ * from destroying the segment or the namespace. ++ */ ++ list_del_init(&shp->shm_clist); ++ ++ task_unlock(task); ++ ++ /* ++ * 6) we have all references ++ * Thus lock & if needed destroy shp. ++ */ ++ down_write(&shm_ids(ns).rwsem); ++ shm_lock_by_ptr(shp); ++ /* ++ * rcu_read_lock was implicitly taken in shm_lock_by_ptr, it's ++ * safe to call ipc_rcu_putref here ++ */ ++ ipc_rcu_putref(&shp->shm_perm, shm_rcu_free); ++ ++ if (ipc_valid_object(&shp->shm_perm)) { ++ if (shm_may_destroy(shp)) ++ shm_destroy(ns, shp); ++ else ++ shm_unlock(shp); ++ } else { ++ /* ++ * Someone else deleted the shp from namespace ++ * idr/kht while we have waited. ++ * Just unlock and continue. ++ */ ++ shm_unlock(shp); ++ } ++ ++ up_write(&shm_ids(ns).rwsem); ++ put_ipc_ns(ns); /* paired with get_ipc_ns_not_zero */ ++ } + } + + static int shm_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +@@ -621,7 +704,11 @@ static int newseg(struct ipc_namespace * + goto no_id; + } + ++ shp->ns = ns; ++ ++ task_lock(current); + list_add(&shp->shm_clist, ¤t->sysvshm.shm_clist); ++ task_unlock(current); + + /* + * shmid gets reported as "inode#" in /proc/pid/maps. +@@ -1270,7 +1357,8 @@ out_nattch: + down_write(&shm_ids(ns).rwsem); + shp = shm_lock(ns, shmid); + shp->shm_nattch--; +- if (shm_may_destroy(ns, shp)) ++ ++ if (shm_may_destroy(shp)) + shm_destroy(ns, shp); + else + shm_unlock(shp);