From: Greg Kroah-Hartman Date: Mon, 29 Nov 2021 12:49:27 +0000 (+0100) Subject: 5.4-stable patches X-Git-Tag: v5.15.6~30 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=55fdfdb03c0b3cc5e1c186b5cd7fc32ffeeea21d;p=thirdparty%2Fkernel%2Fstable-queue.git 5.4-stable patches added patches: s390-mm-validate-vma-in-pgste-manipulation-functions.patch shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses.patch tracing-check-pid-filtering-when-creating-events.patch --- diff --git a/queue-5.4/s390-mm-validate-vma-in-pgste-manipulation-functions.patch b/queue-5.4/s390-mm-validate-vma-in-pgste-manipulation-functions.patch new file mode 100644 index 00000000000..b851c0be849 --- /dev/null +++ b/queue-5.4/s390-mm-validate-vma-in-pgste-manipulation-functions.patch @@ -0,0 +1,86 @@ +From fe3d10024073f06f04c74b9674bd71ccc1d787cf Mon Sep 17 00:00:00 2001 +From: David Hildenbrand +Date: Thu, 9 Sep 2021 18:22:42 +0200 +Subject: s390/mm: validate VMA in PGSTE manipulation functions + +From: David Hildenbrand + +commit fe3d10024073f06f04c74b9674bd71ccc1d787cf upstream. + +We should not walk/touch page tables outside of VMA boundaries when +holding only the mmap sem in read mode. Evil user space can modify the +VMA layout just before this function runs and e.g., trigger races with +page table removal code since commit dd2283f2605e ("mm: mmap: zap pages +with read mmap_sem in munmap"). gfn_to_hva() will only translate using +KVM memory regions, but won't validate the VMA. + +Further, we should not allocate page tables outside of VMA boundaries: if +evil user space decides to map hugetlbfs to these ranges, bad things will +happen because we suddenly have PTE or PMD page tables where we +shouldn't have them. + +Similarly, we have to check if we suddenly find a hugetlbfs VMA, before +calling get_locked_pte(). 
+ +Fixes: 2d42f9477320 ("s390/kvm: Add PGSTE manipulation functions") +Signed-off-by: David Hildenbrand +Reviewed-by: Claudio Imbrenda +Acked-by: Heiko Carstens +Link: https://lore.kernel.org/r/20210909162248.14969-4-david@redhat.com +Signed-off-by: Christian Borntraeger +Signed-off-by: Greg Kroah-Hartman + +--- + arch/s390/mm/pgtable.c | 13 +++++++++++++ + 1 file changed, 13 insertions(+) + +--- a/arch/s390/mm/pgtable.c ++++ b/arch/s390/mm/pgtable.c +@@ -970,6 +970,7 @@ EXPORT_SYMBOL(get_guest_storage_key); + int pgste_perform_essa(struct mm_struct *mm, unsigned long hva, int orc, + unsigned long *oldpte, unsigned long *oldpgste) + { ++ struct vm_area_struct *vma; + unsigned long pgstev; + spinlock_t *ptl; + pgste_t pgste; +@@ -979,6 +980,10 @@ int pgste_perform_essa(struct mm_struct + WARN_ON_ONCE(orc > ESSA_MAX); + if (unlikely(orc > ESSA_MAX)) + return -EINVAL; ++ ++ vma = find_vma(mm, hva); ++ if (!vma || hva < vma->vm_start || is_vm_hugetlb_page(vma)) ++ return -EFAULT; + ptep = get_locked_pte(mm, hva, &ptl); + if (unlikely(!ptep)) + return -EFAULT; +@@ -1071,10 +1076,14 @@ EXPORT_SYMBOL(pgste_perform_essa); + int set_pgste_bits(struct mm_struct *mm, unsigned long hva, + unsigned long bits, unsigned long value) + { ++ struct vm_area_struct *vma; + spinlock_t *ptl; + pgste_t new; + pte_t *ptep; + ++ vma = find_vma(mm, hva); ++ if (!vma || hva < vma->vm_start || is_vm_hugetlb_page(vma)) ++ return -EFAULT; + ptep = get_locked_pte(mm, hva, &ptl); + if (unlikely(!ptep)) + return -EFAULT; +@@ -1099,9 +1108,13 @@ EXPORT_SYMBOL(set_pgste_bits); + */ + int get_pgste(struct mm_struct *mm, unsigned long hva, unsigned long *pgstep) + { ++ struct vm_area_struct *vma; + spinlock_t *ptl; + pte_t *ptep; + ++ vma = find_vma(mm, hva); ++ if (!vma || hva < vma->vm_start || is_vm_hugetlb_page(vma)) ++ return -EFAULT; + ptep = get_locked_pte(mm, hva, &ptl); + if (unlikely(!ptep)) + return -EFAULT; diff --git a/queue-5.4/series b/queue-5.4/series index 18a2170d69f..59662d89f3d 
100644 --- a/queue-5.4/series +++ b/queue-5.4/series @@ -76,3 +76,6 @@ net-mscc-ocelot-correctly-report-the-timestamping-rx.patch f2fs-set-sbi_need_fsck-flag-when-inconsistent-node-b.patch smb3-do-not-error-on-fsync-when-readonly.patch vhost-vsock-fix-incorrect-used-length-reported-to-the-guest.patch +tracing-check-pid-filtering-when-creating-events.patch +s390-mm-validate-vma-in-pgste-manipulation-functions.patch +shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses.patch diff --git a/queue-5.4/shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses.patch b/queue-5.4/shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses.patch new file mode 100644 index 00000000000..fddb64cd69c --- /dev/null +++ b/queue-5.4/shm-extend-forced-shm-destroy-to-support-objects-from-several-ipc-nses.patch @@ -0,0 +1,387 @@ +From 85b6d24646e4125c591639841169baa98a2da503 Mon Sep 17 00:00:00 2001 +From: Alexander Mikhalitsyn +Date: Fri, 19 Nov 2021 16:43:21 -0800 +Subject: shm: extend forced shm destroy to support objects from several IPC nses + +From: Alexander Mikhalitsyn + +commit 85b6d24646e4125c591639841169baa98a2da503 upstream. + +Currently, the exit_shm() function not designed to work properly when +task->sysvshm.shm_clist holds shm objects from different IPC namespaces. + +This is a real pain when sysctl kernel.shm_rmid_forced = 1, because it +leads to use-after-free (reproducer exists). + +This is an attempt to fix the problem by extending exit_shm mechanism to +handle shm's destroy from several IPC ns'es. + +To achieve that we do several things: + +1. add a namespace (non-refcounted) pointer to the struct shmid_kernel + +2. during new shm object creation (newseg()/shmget syscall) we + initialize this pointer by current task IPC ns + +3. 
exit_shm() fully reworked such that it traverses over all shp's in + task->sysvshm.shm_clist and gets IPC namespace not from current task + as it was before but from shp's object itself, then calls + shm_destroy(shp, ns). + +Note: We need to be really careful here, because as it was said before +(1), our pointer to IPC ns is non-refcnt'ed. To be on the safe side we +use the special helper get_ipc_ns_not_zero() which allows to get IPC ns +refcounter only if IPC ns not in the "state of destruction". + +Q/A + +Q: Why can we access shp->ns memory using non-refcounted pointer? +A: Because shp object lifetime is always shorter than IPC namespace + lifetime, so, if we get shp object from the task->sysvshm.shm_clist + while holding task_lock(task) nobody can steal our namespace. + +Q: Does this patch change semantics of unshare/setns/clone syscalls? +A: No. It just fixes a non-covered case when a process may leave IPC + namespace without getting task->sysvshm.shm_clist list cleaned up. + +Link: https://lkml.kernel.org/r/67bb03e5-f79c-1815-e2bf-949c67047418@colorfullife.com +Link: https://lkml.kernel.org/r/20211109151501.4921-1-manfred@colorfullife.com +Fixes: ab602f79915 ("shm: make exit_shm work proportional to task activity") +Co-developed-by: Manfred Spraul +Signed-off-by: Manfred Spraul +Signed-off-by: Alexander Mikhalitsyn +Cc: "Eric W. 
Biederman" +Cc: Davidlohr Bueso +Cc: Greg KH +Cc: Andrei Vagin +Cc: Pavel Tikhomirov +Cc: Vasily Averin +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + include/linux/ipc_namespace.h | 15 +++ + include/linux/sched/task.h | 2 + ipc/shm.c | 189 +++++++++++++++++++++++++++++++----------- + 3 files changed, 159 insertions(+), 47 deletions(-) + +--- a/include/linux/ipc_namespace.h ++++ b/include/linux/ipc_namespace.h +@@ -130,6 +130,16 @@ static inline struct ipc_namespace *get_ + return ns; + } + ++static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns) ++{ ++ if (ns) { ++ if (refcount_inc_not_zero(&ns->count)) ++ return ns; ++ } ++ ++ return NULL; ++} ++ + extern void put_ipc_ns(struct ipc_namespace *ns); + #else + static inline struct ipc_namespace *copy_ipcs(unsigned long flags, +@@ -145,6 +155,11 @@ static inline struct ipc_namespace *get_ + { + return ns; + } ++ ++static inline struct ipc_namespace *get_ipc_ns_not_zero(struct ipc_namespace *ns) ++{ ++ return ns; ++} + + static inline void put_ipc_ns(struct ipc_namespace *ns) + { +--- a/include/linux/sched/task.h ++++ b/include/linux/sched/task.h +@@ -157,7 +157,7 @@ static inline struct vm_struct *task_sta + * Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring + * subscriptions and synchronises with wait4(). Also used in procfs. Also + * pins the final release of task.io_context. Also protects ->cpuset and +- * ->cgroup.subsys[]. And ->vfork_done. ++ * ->cgroup.subsys[]. And ->vfork_done. And ->sysvshm.shm_clist. + * + * Nests both inside and outside of read_lock(&tasklist_lock). + * It must not be nested with write_lock_irq(&tasklist_lock), +--- a/ipc/shm.c ++++ b/ipc/shm.c +@@ -62,9 +62,18 @@ struct shmid_kernel /* private to the ke + struct pid *shm_lprid; + struct user_struct *mlock_user; + +- /* The task created the shm object. NULL if the task is dead. 
*/ ++ /* ++ * The task created the shm object, for ++ * task_lock(shp->shm_creator) ++ */ + struct task_struct *shm_creator; +- struct list_head shm_clist; /* list by creator */ ++ ++ /* ++ * List by creator. task_lock(->shm_creator) required for read/write. ++ * If list_empty(), then the creator is dead already. ++ */ ++ struct list_head shm_clist; ++ struct ipc_namespace *ns; + } __randomize_layout; + + /* shm_mode upper byte flags */ +@@ -115,6 +124,7 @@ static void do_shm_rmid(struct ipc_names + struct shmid_kernel *shp; + + shp = container_of(ipcp, struct shmid_kernel, shm_perm); ++ WARN_ON(ns != shp->ns); + + if (shp->shm_nattch) { + shp->shm_perm.mode |= SHM_DEST; +@@ -225,10 +235,43 @@ static void shm_rcu_free(struct rcu_head + kvfree(shp); + } + +-static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s) ++/* ++ * It has to be called with shp locked. ++ * It must be called before ipc_rmid() ++ */ ++static inline void shm_clist_rm(struct shmid_kernel *shp) ++{ ++ struct task_struct *creator; ++ ++ /* ensure that shm_creator does not disappear */ ++ rcu_read_lock(); ++ ++ /* ++ * A concurrent exit_shm may do a list_del_init() as well. ++ * Just do nothing if exit_shm already did the work ++ */ ++ if (!list_empty(&shp->shm_clist)) { ++ /* ++ * shp->shm_creator is guaranteed to be valid *only* ++ * if shp->shm_clist is not empty. ++ */ ++ creator = shp->shm_creator; ++ ++ task_lock(creator); ++ /* ++ * list_del_init() is a nop if the entry was already removed ++ * from the list. 
++ */ ++ list_del_init(&shp->shm_clist); ++ task_unlock(creator); ++ } ++ rcu_read_unlock(); ++} ++ ++static inline void shm_rmid(struct shmid_kernel *s) + { +- list_del(&s->shm_clist); +- ipc_rmid(&shm_ids(ns), &s->shm_perm); ++ shm_clist_rm(s); ++ ipc_rmid(&shm_ids(s->ns), &s->shm_perm); + } + + +@@ -283,7 +326,7 @@ static void shm_destroy(struct ipc_names + shm_file = shp->shm_file; + shp->shm_file = NULL; + ns->shm_tot -= (shp->shm_segsz + PAGE_SIZE - 1) >> PAGE_SHIFT; +- shm_rmid(ns, shp); ++ shm_rmid(shp); + shm_unlock(shp); + if (!is_file_hugepages(shm_file)) + shmem_lock(shm_file, 0, shp->mlock_user); +@@ -306,10 +349,10 @@ static void shm_destroy(struct ipc_names + * + * 2) sysctl kernel.shm_rmid_forced is set to 1. + */ +-static bool shm_may_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) ++static bool shm_may_destroy(struct shmid_kernel *shp) + { + return (shp->shm_nattch == 0) && +- (ns->shm_rmid_forced || ++ (shp->ns->shm_rmid_forced || + (shp->shm_perm.mode & SHM_DEST)); + } + +@@ -340,7 +383,7 @@ static void shm_close(struct vm_area_str + ipc_update_pid(&shp->shm_lprid, task_tgid(current)); + shp->shm_dtim = ktime_get_real_seconds(); + shp->shm_nattch--; +- if (shm_may_destroy(ns, shp)) ++ if (shm_may_destroy(shp)) + shm_destroy(ns, shp); + else + shm_unlock(shp); +@@ -361,10 +404,10 @@ static int shm_try_destroy_orphaned(int + * + * As shp->* are changed under rwsem, it's safe to skip shp locking. 
+ */ +- if (shp->shm_creator != NULL) ++ if (!list_empty(&shp->shm_clist)) + return 0; + +- if (shm_may_destroy(ns, shp)) { ++ if (shm_may_destroy(shp)) { + shm_lock_by_ptr(shp); + shm_destroy(ns, shp); + } +@@ -382,48 +425,97 @@ void shm_destroy_orphaned(struct ipc_nam + /* Locking assumes this will only be called with task == current */ + void exit_shm(struct task_struct *task) + { +- struct ipc_namespace *ns = task->nsproxy->ipc_ns; +- struct shmid_kernel *shp, *n; ++ for (;;) { ++ struct shmid_kernel *shp; ++ struct ipc_namespace *ns; + +- if (list_empty(&task->sysvshm.shm_clist)) +- return; ++ task_lock(task); ++ ++ if (list_empty(&task->sysvshm.shm_clist)) { ++ task_unlock(task); ++ break; ++ } ++ ++ shp = list_first_entry(&task->sysvshm.shm_clist, struct shmid_kernel, ++ shm_clist); + +- /* +- * If kernel.shm_rmid_forced is not set then only keep track of +- * which shmids are orphaned, so that a later set of the sysctl +- * can clean them up. +- */ +- if (!ns->shm_rmid_forced) { +- down_read(&shm_ids(ns).rwsem); +- list_for_each_entry(shp, &task->sysvshm.shm_clist, shm_clist) +- shp->shm_creator = NULL; + /* +- * Only under read lock but we are only called on current +- * so no entry on the list will be shared. ++ * 1) Get pointer to the ipc namespace. It is worth to say ++ * that this pointer is guaranteed to be valid because ++ * shp lifetime is always shorter than namespace lifetime ++ * in which shp lives. ++ * We taken task_lock it means that shp won't be freed. + */ +- list_del(&task->sysvshm.shm_clist); +- up_read(&shm_ids(ns).rwsem); +- return; +- } ++ ns = shp->ns; + +- /* +- * Destroy all already created segments, that were not yet mapped, +- * and mark any mapped as orphan to cover the sysctl toggling. +- * Destroy is skipped if shm_may_destroy() returns false. 
+- */ +- down_write(&shm_ids(ns).rwsem); +- list_for_each_entry_safe(shp, n, &task->sysvshm.shm_clist, shm_clist) { +- shp->shm_creator = NULL; ++ /* ++ * 2) If kernel.shm_rmid_forced is not set then only keep track of ++ * which shmids are orphaned, so that a later set of the sysctl ++ * can clean them up. ++ */ ++ if (!ns->shm_rmid_forced) ++ goto unlink_continue; + +- if (shm_may_destroy(ns, shp)) { +- shm_lock_by_ptr(shp); +- shm_destroy(ns, shp); ++ /* ++ * 3) get a reference to the namespace. ++ * The refcount could be already 0. If it is 0, then ++ * the shm objects will be free by free_ipc_work(). ++ */ ++ ns = get_ipc_ns_not_zero(ns); ++ if (!ns) { ++unlink_continue: ++ list_del_init(&shp->shm_clist); ++ task_unlock(task); ++ continue; + } +- } + +- /* Remove the list head from any segments still attached. */ +- list_del(&task->sysvshm.shm_clist); +- up_write(&shm_ids(ns).rwsem); ++ /* ++ * 4) get a reference to shp. ++ * This cannot fail: shm_clist_rm() is called before ++ * ipc_rmid(), thus the refcount cannot be 0. ++ */ ++ WARN_ON(!ipc_rcu_getref(&shp->shm_perm)); ++ ++ /* ++ * 5) unlink the shm segment from the list of segments ++ * created by current. ++ * This must be done last. After unlinking, ++ * only the refcounts obtained above prevent IPC_RMID ++ * from destroying the segment or the namespace. ++ */ ++ list_del_init(&shp->shm_clist); ++ ++ task_unlock(task); ++ ++ /* ++ * 6) we have all references ++ * Thus lock & if needed destroy shp. ++ */ ++ down_write(&shm_ids(ns).rwsem); ++ shm_lock_by_ptr(shp); ++ /* ++ * rcu_read_lock was implicitly taken in shm_lock_by_ptr, it's ++ * safe to call ipc_rcu_putref here ++ */ ++ ipc_rcu_putref(&shp->shm_perm, shm_rcu_free); ++ ++ if (ipc_valid_object(&shp->shm_perm)) { ++ if (shm_may_destroy(shp)) ++ shm_destroy(ns, shp); ++ else ++ shm_unlock(shp); ++ } else { ++ /* ++ * Someone else deleted the shp from namespace ++ * idr/kht while we have waited. ++ * Just unlock and continue. 
++ */ ++ shm_unlock(shp); ++ } ++ ++ up_write(&shm_ids(ns).rwsem); ++ put_ipc_ns(ns); /* paired with get_ipc_ns_not_zero */ ++ } + } + + static vm_fault_t shm_fault(struct vm_fault *vmf) +@@ -680,7 +772,11 @@ static int newseg(struct ipc_namespace * + if (error < 0) + goto no_id; + ++ shp->ns = ns; ++ ++ task_lock(current); + list_add(&shp->shm_clist, ¤t->sysvshm.shm_clist); ++ task_unlock(current); + + /* + * shmid gets reported as "inode#" in /proc/pid/maps. +@@ -1575,7 +1671,8 @@ out_nattch: + down_write(&shm_ids(ns).rwsem); + shp = shm_lock(ns, shmid); + shp->shm_nattch--; +- if (shm_may_destroy(ns, shp)) ++ ++ if (shm_may_destroy(shp)) + shm_destroy(ns, shp); + else + shm_unlock(shp); diff --git a/queue-5.4/tracing-check-pid-filtering-when-creating-events.patch b/queue-5.4/tracing-check-pid-filtering-when-creating-events.patch new file mode 100644 index 00000000000..a046d6b9333 --- /dev/null +++ b/queue-5.4/tracing-check-pid-filtering-when-creating-events.patch @@ -0,0 +1,48 @@ +From 6cb206508b621a9a0a2c35b60540e399225c8243 Mon Sep 17 00:00:00 2001 +From: "Steven Rostedt (VMware)" +Date: Fri, 26 Nov 2021 13:35:26 -0500 +Subject: tracing: Check pid filtering when creating events + +From: Steven Rostedt (VMware) + +commit 6cb206508b621a9a0a2c35b60540e399225c8243 upstream. + +When pid filtering is activated in an instance, all of the events trace +files for that instance has the PID_FILTER flag set. This determines +whether or not pid filtering needs to be done on the event, otherwise the +event is executed as normal. + +If pid filtering is enabled when an event is created (via a dynamic event +or modules), its flag is not updated to reflect the current state, and the +events are not filtered properly. 
+ +Cc: stable@vger.kernel.org +Fixes: 3fdaf80f4a836 ("tracing: Implement event pid filtering") +Signed-off-by: Steven Rostedt (VMware) +Signed-off-by: Greg Kroah-Hartman +--- + kernel/trace/trace_events.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +--- a/kernel/trace/trace_events.c ++++ b/kernel/trace/trace_events.c +@@ -2247,12 +2247,19 @@ static struct trace_event_file * + trace_create_new_event(struct trace_event_call *call, + struct trace_array *tr) + { ++ struct trace_pid_list *pid_list; + struct trace_event_file *file; + + file = kmem_cache_alloc(file_cachep, GFP_TRACE); + if (!file) + return NULL; + ++ pid_list = rcu_dereference_protected(tr->filtered_pids, ++ lockdep_is_held(&event_mutex)); ++ ++ if (pid_list) ++ file->flags |= EVENT_FILE_FL_PID_FILTER; ++ + file->event_call = call; + file->tr = tr; + atomic_set(&file->sm_ref, 0);