--- /dev/null
+From 2bd110339288c18823dcace602b63b0d8627e520 Mon Sep 17 00:00:00 2001
+From: John Sperbeck <jsperbeck@google.com>
+Date: Sun, 21 May 2023 19:29:53 +0000
+Subject: cgroup: always put cset in cgroup_css_set_put_fork
+
+From: John Sperbeck <jsperbeck@google.com>
+
+commit 2bd110339288c18823dcace602b63b0d8627e520 upstream.
+
+A successful call to cgroup_css_set_fork() will always have taken
+a ref on kargs->cset (regardless of CLONE_INTO_CGROUP), so always
+do a corresponding put in cgroup_css_set_put_fork().
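+
+For reference, a condensed sketch of the intended pairing (not the full
+kernel code; locking and the CLONE_INTO_CGROUP-only steps are omitted):
+
+	/* cgroup_css_set_fork(): pins a cset for the child in all cases */
+	cset = task_css_set(current);
+	get_css_set(cset);	/* ref taken even without CLONE_INTO_CGROUP */
+	kargs->cset = cset;
+
+	/* ...so cgroup_css_set_put_fork() must drop it unconditionally */
+	if (kargs->cset) {
+		put_css_set(kargs->cset);
+		kargs->cset = NULL;
+	}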
+
+Without this, a cset and its contained css structures will be
+leaked for some fork failures. The following script reproduces
+the leak for a fork failure due to exceeding pids.max in the
+pids controller. A similar thing can happen if we jump to the
+bad_fork_cancel_cgroup label in copy_process().
+
+[ -z "$1" ] && echo "Usage $0 pids-root" && exit 1
+PID_ROOT=$1
+CGROUP=$PID_ROOT/foo
+
+[ -e $CGROUP ] && rmdir $CGROUP
+mkdir $CGROUP
+echo 5 > $CGROUP/pids.max
+echo $$ > $CGROUP/cgroup.procs
+
+fork_bomb()
+{
+ set -e
+ for i in $(seq 10); do
+ /bin/sleep 3600 &
+ done
+}
+
+(fork_bomb) &
+wait
+echo $$ > $PID_ROOT/cgroup.procs
+kill $(cat $CGROUP/cgroup.procs)
+rmdir $CGROUP
+
+Fixes: ef2c41cf38a7 ("clone3: allow spawning processes into cgroups")
+Cc: stable@vger.kernel.org # v5.7+
+Signed-off-by: John Sperbeck <jsperbeck@google.com>
+Signed-off-by: Tejun Heo <tj@kernel.org>
+[TJM: This backport accommodates the lack of cgroup_unlock]
+Signed-off-by: T.J. Mercier <tjmercier@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/cgroup/cgroup.c | 17 ++++++++---------
+ 1 file changed, 8 insertions(+), 9 deletions(-)
+
+--- a/kernel/cgroup/cgroup.c
++++ b/kernel/cgroup/cgroup.c
+@@ -6138,19 +6138,18 @@ err:
+ static void cgroup_css_set_put_fork(struct kernel_clone_args *kargs)
+ __releases(&cgroup_threadgroup_rwsem) __releases(&cgroup_mutex)
+ {
++ struct cgroup *cgrp = kargs->cgrp;
++ struct css_set *cset = kargs->cset;
++
+ cgroup_threadgroup_change_end(current);
+
+- if (kargs->flags & CLONE_INTO_CGROUP) {
+- struct cgroup *cgrp = kargs->cgrp;
+- struct css_set *cset = kargs->cset;
++ if (cset) {
++ put_css_set(cset);
++ kargs->cset = NULL;
++ }
+
++ if (kargs->flags & CLONE_INTO_CGROUP) {
+ mutex_unlock(&cgroup_mutex);
+-
+- if (cset) {
+- put_css_set(cset);
+- kargs->cset = NULL;
+- }
+-
+ if (cgrp) {
+ cgroup_put(cgrp);
+ kargs->cgrp = NULL;
--- /dev/null
+From 5da7cb193db32da783a3f3e77d8b639989321d48 Mon Sep 17 00:00:00 2001
+From: Ziwei Dai <ziwei.dai@unisoc.com>
+Date: Fri, 31 Mar 2023 20:42:09 +0800
+Subject: rcu/kvfree: Avoid freeing new kfree_rcu() memory after old grace period
+
+From: Ziwei Dai <ziwei.dai@unisoc.com>
+
+commit 5da7cb193db32da783a3f3e77d8b639989321d48 upstream.
+
+Memory passed to kvfree_rcu() that is to be freed is tracked by a
+per-CPU kfree_rcu_cpu structure, which in turn contains pointers
+to kvfree_rcu_bulk_data structures that contain pointers to memory
+that has not yet been handed to RCU, along with a kfree_rcu_cpu_work
+structure that tracks the memory that has already been handed to RCU.
+These structures track three categories of memory: (1) Memory for
+kfree(), (2) Memory for kvfree(), and (3) Memory for both that arrived
+during an OOM episode. The first two categories are tracked in a
+cache-friendly manner involving a dynamically allocated page of pointers
+(the aforementioned kvfree_rcu_bulk_data structures), while the third
+uses a simple (but decidedly cache-unfriendly) linked list through the
+rcu_head structures in each block of memory.
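+
+As a rough sketch of the per-CPU structures involved (field names as in
+this tree; unrelated fields omitted):
+
+	struct kfree_rcu_cpu {				/* not yet handed to RCU */
+		struct rcu_head *head;			/* category (3), emergency path */
+		struct kvfree_rcu_bulk_data *bkvhead[FREE_N_CHANNELS];
+							/* categories (1) and (2) */
+		struct kfree_rcu_cpu_work krw_arr[KFREE_N_BATCHES];
+		...
+	};
+
+	struct kfree_rcu_cpu_work {			/* already handed to RCU */
+		struct rcu_work rcu_work;
+		struct rcu_head *head_free;
+		struct kvfree_rcu_bulk_data *bkvhead_free[FREE_N_CHANNELS];
+		struct kfree_rcu_cpu *krcp;
+	};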
+
+On a given CPU, these three categories are handled as a unit, with that
+CPU's kfree_rcu_cpu_work structure having one pointer for each of the
+three categories. Clearly, new memory for a given category cannot be
+placed in the corresponding kfree_rcu_cpu_work structure until any old
+memory has had its grace period elapse and thus has been removed. And
+the kfree_rcu_monitor() function does in fact check for this.
+
+Except that the kfree_rcu_monitor() function checks these pointers one
+at a time. This means that if the previous kfree_rcu() memory passed
+to RCU had only category 1 and the current one has only category 2, the
+kfree_rcu_monitor() function will send that current category-2 memory
+along immediately. This can result in memory being freed too soon,
+that is, out from under unsuspecting RCU readers.
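+
+Concretely, the old gate in this tree's queue_kfree_rcu_work() was the
+per-channel test removed below:
+
+	if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) ||
+	    (krcp->bkvhead[1] && !krwp->bkvhead_free[1]) ||
+	    (krcp->head && !krwp->head_free)) {
+		/* detach whichever channels are idle, then queue_rcu_work() */
+	}
+
+so new memory in one channel could be attached to a krwp whose other
+channels were still riding out an older, already-queued grace period,
+and then be freed as soon as that older grace period ends.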
+
+To see this, consider the following sequence of events, in which:
+
+o Task A on CPU 0 calls rcu_read_lock(), then uses "from_cset",
+ then is preempted.
+
+o CPU 1 calls kfree_rcu(cset, rcu_head) in order to free "from_cset"
+ after a later grace period.  Except that, because of this bug,
+ "from_cset" is instead freed right after the previous grace period
+ ended, that is, immediately.  Task A resumes and references
+ "from_cset"'s member, after which nothing good happens.
+
+In full detail:
+
+CPU 0 CPU 1
+---------------------- ----------------------
+count_memcg_event_mm()
+|rcu_read_lock() <---
+|mem_cgroup_from_task()
+ |// css_set_ptr is the "from_cset" mentioned on CPU 1
+ |css_set_ptr = rcu_dereference((task)->cgroups)
+ |// Hard irq comes, current task is scheduled out.
+
+ cgroup_attach_task()
+ |cgroup_migrate()
+ |cgroup_migrate_execute()
+ |css_set_move_task(task, from_cset, to_cset, true)
+ |cgroup_move_task(task, to_cset)
+ |rcu_assign_pointer(.., to_cset)
+ |...
+ |cgroup_migrate_finish()
+ |put_css_set_locked(from_cset)
+ |from_cset->refcount return 0
+ |kfree_rcu(cset, rcu_head) // free from_cset after new gp
+ |add_ptr_to_bulk_krc_lock()
+ |schedule_delayed_work(&krcp->monitor_work, ..)
+
+ kfree_rcu_monitor()
+ |krcp->bulk_head[0]'s work attached to krwp->bulk_head_free[]
+ |queue_rcu_work(system_wq, &krwp->rcu_work)
+ |if rwork->rcu.work is not in WORK_STRUCT_PENDING_BIT state,
+ |call_rcu(&rwork->rcu, rcu_work_rcufn) <--- request new gp
+
+ // There is a previous call_rcu(.., rcu_work_rcufn)
+ // gp end, rcu_work_rcufn() is called.
+ rcu_work_rcufn()
+ |__queue_work(.., rwork->wq, &rwork->work);
+
+ |kfree_rcu_work()
+ |krwp->bulk_head_free[0] bulk is freed before new gp end!!!
+ |The "from_cset" is freed before new gp end.
+
+// the task resumes some time later.
+ |css_set_ptr->subsys[subsys_id] <--- Causes a kernel crash, because css_set_ptr is already freed.
+
+This commit therefore causes kfree_rcu_monitor() to refrain from moving
+kfree_rcu() memory to the kfree_rcu_cpu_work structure until the RCU
+grace period has completed for all three categories.
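+
+In this backport the change lands in queue_kfree_rcu_work(); roughly,
+the reworked loop becomes (detach details elided):
+
+	for (i = 0; i < KFREE_N_BATCHES; i++) {
+		krwp = &(krcp->krw_arr[i]);
+
+		/* skip a krwp with any channel still in flight */
+		if (need_wait_for_krwp_work(krwp))
+			continue;
+
+		/* otherwise detach all pending channels as one unit */
+		if (need_offload_krc(krcp)) {
+			/* move bkvhead[]/head into krwp and ... */
+			queue_rcu_work(system_wq, &krwp->rcu_work);
+		}
+	}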
+
+v2: Use helper function instead of inserted code block at kfree_rcu_monitor().
+
+Fixes: 34c881745549 ("rcu: Support kfree_bulk() interface in kfree_rcu()")
+Fixes: 5f3c8d620447 ("rcu/tree: Maintain separate array for vmalloc ptrs")
+Reported-by: Mukesh Ojha <quic_mojha@quicinc.com>
+Signed-off-by: Ziwei Dai <ziwei.dai@unisoc.com>
+Reviewed-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
+Tested-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
+Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
+Signed-off-by: Suren Baghdasaryan <surenb@google.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/rcu/tree.c | 49 +++++++++++++++++++++++++++++++++++--------------
+ 1 file changed, 35 insertions(+), 14 deletions(-)
+
+--- a/kernel/rcu/tree.c
++++ b/kernel/rcu/tree.c
+@@ -3281,6 +3281,30 @@ static void kfree_rcu_work(struct work_s
+ }
+ }
+
++static bool
++need_offload_krc(struct kfree_rcu_cpu *krcp)
++{
++ int i;
++
++ for (i = 0; i < FREE_N_CHANNELS; i++)
++ if (krcp->bkvhead[i])
++ return true;
++
++ return !!krcp->head;
++}
++
++static bool
++need_wait_for_krwp_work(struct kfree_rcu_cpu_work *krwp)
++{
++ int i;
++
++ for (i = 0; i < FREE_N_CHANNELS; i++)
++ if (krwp->bkvhead_free[i])
++ return true;
++
++ return !!krwp->head_free;
++}
++
+ /*
+ * Schedule the kfree batch RCU work to run in workqueue context after a GP.
+ *
+@@ -3298,16 +3322,13 @@ static inline bool queue_kfree_rcu_work(
+ for (i = 0; i < KFREE_N_BATCHES; i++) {
+ krwp = &(krcp->krw_arr[i]);
+
+- /*
+- * Try to detach bkvhead or head and attach it over any
+- * available corresponding free channel. It can be that
+- * a previous RCU batch is in progress, it means that
+- * immediately to queue another one is not possible so
+- * return false to tell caller to retry.
+- */
+- if ((krcp->bkvhead[0] && !krwp->bkvhead_free[0]) ||
+- (krcp->bkvhead[1] && !krwp->bkvhead_free[1]) ||
+- (krcp->head && !krwp->head_free)) {
++ // Try to detach bkvhead or head and attach it, but only when
++ // all channels are free. Any busy channel means that at this krwp
++ // there is on-going RCU work handling a previous free batch.
++ if (need_wait_for_krwp_work(krwp))
++ continue;
++
++ if (need_offload_krc(krcp)) {
+ // Channel 1 corresponds to SLAB ptrs.
+ // Channel 2 corresponds to vmalloc ptrs.
+ for (j = 0; j < FREE_N_CHANNELS; j++) {
+@@ -3334,12 +3355,12 @@ static inline bool queue_kfree_rcu_work(
+ */
+ queue_rcu_work(system_wq, &krwp->rcu_work);
+ }
+-
+- // Repeat if any "free" corresponding channel is still busy.
+- if (krcp->bkvhead[0] || krcp->bkvhead[1] || krcp->head)
+- repeat = true;
+ }
+
++ // Repeat if any "free" corresponding channel is still busy.
++ if (need_offload_krc(krcp))
++ repeat = true;
++
+ return !repeat;
+ }
+