Fixes for 5.15
author    Sasha Levin <sashal@kernel.org>
          Wed, 3 May 2023 22:32:14 +0000 (18:32 -0400)
committer Sasha Levin <sashal@kernel.org>
          Wed, 3 May 2023 22:32:14 +0000 (18:32 -0400)
Signed-off-by: Sasha Levin <sashal@kernel.org>
queue-5.15/series
queue-5.15/writeback-cgroup-fix-null-ptr-deref-write-in-bdi_spl.patch [new file with mode: 0644]

diff --git a/queue-5.15/series b/queue-5.15/series
index 03c019d263e64d81f0955bdbfba6225995750583..8d24bbe4e232cf9ca17ffe0c212192c89de7259c 100644
--- a/queue-5.15/series
+++ b/queue-5.15/series
@@ -7,3 +7,4 @@ selftests-mount-fix-mount_setattr_test-builds-failed.patch
 asm-generic-io.h-suppress-endianness-warnings-for-re.patch
 x86-cpu-add-model-number-for-intel-arrow-lake-proces.patch
 wireguard-timers-cast-enum-limits-members-to-int-in-prints.patch
+writeback-cgroup-fix-null-ptr-deref-write-in-bdi_spl.patch
diff --git a/queue-5.15/writeback-cgroup-fix-null-ptr-deref-write-in-bdi_spl.patch b/queue-5.15/writeback-cgroup-fix-null-ptr-deref-write-in-bdi_spl.patch
new file mode 100644
index 0000000..7a24a69
--- /dev/null
+++ b/queue-5.15/writeback-cgroup-fix-null-ptr-deref-write-in-bdi_spl.patch
@@ -0,0 +1,178 @@
+From 5d42748b4de13444317b872e60bd60d4f29cec99 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 10 Apr 2023 21:08:26 +0800
+Subject: writeback, cgroup: fix null-ptr-deref write in bdi_split_work_to_wbs
+
+From: Baokun Li <libaokun1@huawei.com>
+
+[ Upstream commit 1ba1199ec5747f475538c0d25a32804e5ba1dfde ]
+
+KASAN reports a null-ptr-deref:
+==================================================================
+BUG: KASAN: null-ptr-deref in bdi_split_work_to_wbs+0x5c5/0x7b0
+Write of size 8 at addr 0000000000000000 by task sync/943
+CPU: 5 PID: 943 Comm: sync Not tainted 6.3.0-rc5-next-20230406-dirty #461
+Call Trace:
+ <TASK>
+ dump_stack_lvl+0x7f/0xc0
+ print_report+0x2ba/0x340
+ kasan_report+0xc4/0x120
+ kasan_check_range+0x1b7/0x2e0
+ __kasan_check_write+0x24/0x40
+ bdi_split_work_to_wbs+0x5c5/0x7b0
+ sync_inodes_sb+0x195/0x630
+ sync_inodes_one_sb+0x3a/0x50
+ iterate_supers+0x106/0x1b0
+ ksys_sync+0x98/0x160
+[...]
+==================================================================
+
+The race that causes the above issue is as follows:
+
+           cpu1                     cpu2
+-------------------------|-------------------------
+inode_switch_wbs
+ INIT_WORK(&isw->work, inode_switch_wbs_work_fn)
+ queue_rcu_work(isw_wq, &isw->work)
+ // queue_work async
+  inode_switch_wbs_work_fn
+   wb_put_many(old_wb, nr_switched)
+    percpu_ref_put_many
+     ref->data->release(ref)
+     cgwb_release
+      queue_work(cgwb_release_wq, &wb->release_work)
+      // queue_work async
+       &wb->release_work
+       cgwb_release_workfn
+                            ksys_sync
+                             iterate_supers
+                              sync_inodes_one_sb
+                               sync_inodes_sb
+                                bdi_split_work_to_wbs
+                                 kmalloc(sizeof(*work), GFP_ATOMIC)
+                                 // alloc memory failed
+        percpu_ref_exit
+         ref->data = NULL
+         kfree(data)
+                                 wb_get(wb)
+                                  percpu_ref_get(&wb->refcnt)
+                                   percpu_ref_get_many(ref, 1)
+                                    atomic_long_add(nr, &ref->data->count)
+                                     atomic64_add(i, v)
+                                     // trigger null-ptr-deref
+
+bdi_split_work_to_wbs() traverses &bdi->wb_list to split work across all
+wbs.  If the allocation of a new work item fails, the on-stack fallback
+work is used, and the reference count of the current wb is only taken
+afterwards.  If a cgroup writeback membership switch occurs before the
+reference is taken and the current wb is released as old_wb, then calling
+wb_get() or wb_put() will trigger the null pointer dereference above.
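+
+As a minimal illustrative sketch (simplified from the functions named in
+the trace above, not part of this patch), the unsafe pattern is:
+
+	/* cpu1: cgwb_release_workfn() tears down the percpu ref */
+	percpu_ref_exit(&wb->refcnt);	/* frees ref->data, sets it to NULL */
+
+	/* cpu2: bdi_split_work_to_wbs(), on the on-stack fallback path
+	 * after kmalloc() failure, takes a reference on the current wb:
+	 */
+	wb_get(wb);	/* percpu_ref_get(&wb->refcnt)
+			 * -> atomic_long_add(1, &ref->data->count)
+			 * -> write through the NULL ref->data
+			 */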
+
+This issue was introduced in v4.3-rc7 (see fix tag1).  Both
+sync_inodes_sb() and __writeback_inodes_sb_nr() can trigger it through
+their calls to bdi_split_work_to_wbs().  For the sync_inodes_sb() path,
+commit 7fc5854f8c6e ("writeback: synchronize sync(2) against cgroup
+writeback membership switches") originally reduced the likelihood of the
+issue by adding wb_switch_rwsem, but v5.14-rc1 (see fix tag2) removed
+"inode_io_list_del_locked(inode, old_wb)" from inode_switch_wbs_work_fn(),
+so wb->state still contains WB_has_dirty_io and old_wb is no longer
+skipped when traversing wbs in bdi_split_work_to_wbs(), which made the
+issue easily reproducible again.
+
+To solve this problem, percpu_ref_exit() is now called under RCU
+protection to avoid the race between cgwb_release_workfn() and
+bdi_split_work_to_wbs().  Moreover, wb_get() is replaced with wb_tryget()
+in bdi_split_work_to_wbs(), and the current wb is skipped if wb_tryget()
+fails because that wb has already been shut down.
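+
+Sketched in isolation (illustrative only; the actual hunks follow below),
+the two sides pair up like this after the fix:
+
+	/* reader: bdi_split_work_to_wbs() iterates under RCU */
+	rcu_read_lock();
+	list_for_each_entry_continue_rcu(wb, &bdi->wb_list, bdi_node) {
+		if (!wb_tryget(wb))	/* refcount already dropped to zero */
+			continue;	/* wb is being shut down, skip it */
+		/* ... queue the work item, drop the reference later ... */
+	}
+	rcu_read_unlock();
+
+	/* releaser: cgwb_release_workfn() defers the teardown */
+	call_rcu(&wb->rcu, cgwb_free_rcu);	/* percpu_ref_exit() + kfree()
+						 * run only after all current
+						 * RCU readers have finished
+						 */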
+
+Link: https://lkml.kernel.org/r/20230410130826.1492525-1-libaokun1@huawei.com
+Fixes: b817525a4a80 ("writeback: bdi_writeback iteration must not skip dying ones")
+Signed-off-by: Baokun Li <libaokun1@huawei.com>
+Reviewed-by: Jan Kara <jack@suse.cz>
+Acked-by: Tejun Heo <tj@kernel.org>
+Cc: Alexander Viro <viro@zeniv.linux.org.uk>
+Cc: Andreas Dilger <adilger.kernel@dilger.ca>
+Cc: Christian Brauner <brauner@kernel.org>
+Cc: Dennis Zhou <dennis@kernel.org>
+Cc: Hou Tao <houtao1@huawei.com>
+Cc: yangerkun <yangerkun@huawei.com>
+Cc: Zhang Yi <yi.zhang@huawei.com>
+Cc: Jens Axboe <axboe@kernel.dk>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/fs-writeback.c | 17 ++++++++++-------
+ mm/backing-dev.c  | 12 ++++++++++--
+ 2 files changed, 20 insertions(+), 9 deletions(-)
+
+diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
+index f4a5a0c2858a1..fbc3f0ef38c02 100644
+--- a/fs/fs-writeback.c
++++ b/fs/fs-writeback.c
+@@ -1009,6 +1009,16 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
+                       continue;
+               }
++              /*
++               * If wb_tryget fails, the wb has been shutdown, skip it.
++               *
++               * Pin @wb so that it stays on @bdi->wb_list.  This allows
++               * continuing iteration from @wb after dropping and
++               * regrabbing rcu read lock.
++               */
++              if (!wb_tryget(wb))
++                      continue;
++
+               /* alloc failed, execute synchronously using on-stack fallback */
+               work = &fallback_work;
+               *work = *base_work;
+@@ -1017,13 +1027,6 @@ static void bdi_split_work_to_wbs(struct backing_dev_info *bdi,
+               work->done = &fallback_work_done;
+               wb_queue_work(wb, work);
+-
+-              /*
+-               * Pin @wb so that it stays on @bdi->wb_list.  This allows
+-               * continuing iteration from @wb after dropping and
+-               * regrabbing rcu read lock.
+-               */
+-              wb_get(wb);
+               last_wb = wb;
+               rcu_read_unlock();
+diff --git a/mm/backing-dev.c b/mm/backing-dev.c
+index 142e118ade87a..afdd132768455 100644
+--- a/mm/backing-dev.c
++++ b/mm/backing-dev.c
+@@ -385,6 +385,15 @@ static LIST_HEAD(offline_cgwbs);
+ static void cleanup_offline_cgwbs_workfn(struct work_struct *work);
+ static DECLARE_WORK(cleanup_offline_cgwbs_work, cleanup_offline_cgwbs_workfn);
++static void cgwb_free_rcu(struct rcu_head *rcu_head)
++{
++      struct bdi_writeback *wb = container_of(rcu_head,
++                      struct bdi_writeback, rcu);
++
++      percpu_ref_exit(&wb->refcnt);
++      kfree(wb);
++}
++
+ static void cgwb_release_workfn(struct work_struct *work)
+ {
+       struct bdi_writeback *wb = container_of(work, struct bdi_writeback,
+@@ -407,10 +416,9 @@ static void cgwb_release_workfn(struct work_struct *work)
+       list_del(&wb->offline_node);
+       spin_unlock_irq(&cgwb_lock);
+-      percpu_ref_exit(&wb->refcnt);
+       wb_exit(wb);
+       WARN_ON_ONCE(!list_empty(&wb->b_attached));
+-      kfree_rcu(wb, rcu);
++      call_rcu(&wb->rcu, cgwb_free_rcu);
+ }
+ static void cgwb_release(struct percpu_ref *refcnt)
+-- 
+2.39.2
+