git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
5.4-stable patches
author: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 21 Feb 2024 08:51:49 +0000 (09:51 +0100)
committer: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Wed, 21 Feb 2024 08:51:49 +0000 (09:51 +0100)
added patches:
mm-memcontrol-decouple-reference-counting-from-page-accounting.patch
nilfs2-fix-potential-bug-in-end_buffer_async_write.patch
nilfs2-replace-warn_ons-for-invalid-dat-metadata-block-requests.patch
sched-membarrier-reduce-the-ability-to-hammer-on-sys_membarrier.patch

queue-5.4/mm-memcontrol-decouple-reference-counting-from-page-accounting.patch [new file with mode: 0644]
queue-5.4/nilfs2-fix-potential-bug-in-end_buffer_async_write.patch [new file with mode: 0644]
queue-5.4/nilfs2-replace-warn_ons-for-invalid-dat-metadata-block-requests.patch [new file with mode: 0644]
queue-5.4/sched-membarrier-reduce-the-ability-to-hammer-on-sys_membarrier.patch [new file with mode: 0644]
queue-5.4/series

diff --git a/queue-5.4/mm-memcontrol-decouple-reference-counting-from-page-accounting.patch b/queue-5.4/mm-memcontrol-decouple-reference-counting-from-page-accounting.patch
new file mode 100644 (file)
index 0000000..058fc0c
--- /dev/null
@@ -0,0 +1,104 @@
+From 1a3e1f40962c445b997151a542314f3c6097f8c3 Mon Sep 17 00:00:00 2001
+From: Johannes Weiner <hannes@cmpxchg.org>
+Date: Thu, 6 Aug 2020 23:20:45 -0700
+Subject: mm: memcontrol: decouple reference counting from page accounting
+
+From: Johannes Weiner <hannes@cmpxchg.org>
+
+commit 1a3e1f40962c445b997151a542314f3c6097f8c3 upstream.
+
+The reference counting of a memcg is currently coupled directly to how
+many 4k pages are charged to it.  This doesn't work well with Roman's new
+slab controller, which maintains pools of objects and doesn't want to keep
+an extra balance sheet for the pages backing those objects.
+
+This unusual refcounting design (reference counts usually track pointers
+to an object) is only for historical reasons: memcg used to not take any
+css references and simply stalled offlining until all charges had been
+reparented and the page counters had dropped to zero.  When we got rid of
+the reparenting requirement, the simple mechanical translation was to take
+a reference for every charge.
+
+More historical context can be found in commit e8ea14cc6ead ("mm:
+memcontrol: take a css reference for each charged page"), commit
+64f219938941 ("mm: memcontrol: remove obsolete kmemcg pinning tricks") and
+commit b2052564e66d ("mm: memcontrol: continue cache reclaim from offlined
+groups").
+
+The new slab controller exposes the limitations in this scheme, so let's
+switch it to a more idiomatic reference counting model based on actual
+kernel pointers to the memcg:
+
+- The per-cpu stock holds a reference to the memcg it's caching
+
+- User pages hold a reference for their page->mem_cgroup. Transparent
+  huge pages will no longer acquire tail references in advance, we'll
+  get them if needed during the split.
+
+- Kernel pages hold a reference for their page->mem_cgroup
+
+- Pages allocated in the root cgroup will acquire and release css
+  references for simplicity. css_get() and css_put() optimize that.
+
+- The current memcg_charge_slab() already hacked around the per-charge
+  references; this change gets rid of that as well.
+
+- tcp accounting will handle reference in mem_cgroup_sk_{alloc,free}
+
+Roman:
+1) Rebased on top of the current mm tree: added css_get() in
+   mem_cgroup_charge(), dropped mem_cgroup_try_charge() part
+2) I've reformatted commit references in the commit log to make
+   checkpatch.pl happy.
+
+[hughd@google.com: remove css_put_many() from __mem_cgroup_clear_mc()]
+  Link: http://lkml.kernel.org/r/alpine.LSU.2.11.2007302011450.2347@eggly.anvils
+
+Signed-off-by: Johannes Weiner <hannes@cmpxchg.org>
+Signed-off-by: Roman Gushchin <guro@fb.com>
+Signed-off-by: Hugh Dickins <hughd@google.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Reviewed-by: Shakeel Butt <shakeelb@google.com>
+Acked-by: Roman Gushchin <guro@fb.com>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: Christoph Lameter <cl@linux.com>
+Cc: Tejun Heo <tj@kernel.org>
+Cc: Vlastimil Babka <vbabka@suse.cz>
+Link: http://lkml.kernel.org/r/20200623174037.3951353-6-guro@fb.com
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Fixes: cdec2e4265df ("memcg: coalesce charging via percpu storage")
+Signed-off-by: GONG, Ruiqi <gongruiqi1@huawei.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/memcontrol.c |    6 ++++++
+ 1 file changed, 6 insertions(+)
+
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -2214,6 +2214,9 @@ static void drain_stock(struct memcg_sto
+ {
+       struct mem_cgroup *old = stock->cached;
++      if (!old)
++              return;
++
+       if (stock->nr_pages) {
+               page_counter_uncharge(&old->memory, stock->nr_pages);
+               if (do_memsw_account())
+@@ -2221,6 +2224,8 @@ static void drain_stock(struct memcg_sto
+               css_put_many(&old->css, stock->nr_pages);
+               stock->nr_pages = 0;
+       }
++
++      css_put(&old->css);
+       stock->cached = NULL;
+ }
+@@ -2256,6 +2261,7 @@ static void refill_stock(struct mem_cgro
+       stock = this_cpu_ptr(&memcg_stock);
+       if (stock->cached != memcg) { /* reset if necessary */
+               drain_stock(stock);
++              css_get(&memcg->css);
+               stock->cached = memcg;
+       }
+       stock->nr_pages += nr_pages;
diff --git a/queue-5.4/nilfs2-fix-potential-bug-in-end_buffer_async_write.patch b/queue-5.4/nilfs2-fix-potential-bug-in-end_buffer_async_write.patch
new file mode 100644 (file)
index 0000000..084cdb5
--- /dev/null
@@ -0,0 +1,99 @@
+From 5bc09b397cbf1221f8a8aacb1152650c9195b02b Mon Sep 17 00:00:00 2001
+From: Ryusuke Konishi <konishi.ryusuke@gmail.com>
+Date: Sun, 4 Feb 2024 01:16:45 +0900
+Subject: nilfs2: fix potential bug in end_buffer_async_write
+
+From: Ryusuke Konishi <konishi.ryusuke@gmail.com>
+
+commit 5bc09b397cbf1221f8a8aacb1152650c9195b02b upstream.
+
+According to a syzbot report, end_buffer_async_write(), which handles the
+completion of block device writes, may detect abnormal condition of the
+buffer async_write flag and cause a BUG_ON failure when using nilfs2.
+
+Nilfs2 itself does not use end_buffer_async_write().  But, the async_write
+flag is now used as a marker by commit 7f42ec394156 ("nilfs2: fix issue
+with race condition of competition between segments for dirty blocks") as
+a means of resolving double list insertion of dirty blocks in
+nilfs_lookup_dirty_data_buffers() and nilfs_lookup_node_buffers() and the
+resulting crash.
+
+This modification is safe as long as it is used for file data and b-tree
+node blocks where the page caches are independent.  However, it was
+irrelevant and redundant to also introduce async_write for segment summary
+and super root blocks that share buffers with the backing device.  This
+led to the possibility that the BUG_ON check in end_buffer_async_write
+would fail as described above, if independent writebacks of the backing
+device occurred in parallel.
+
+The use of async_write for segment summary buffers has already been
+removed in a previous change.
+
+Fix this issue by removing the manipulation of the async_write flag for
+the remaining super root block buffer.
+
+Link: https://lkml.kernel.org/r/20240203161645.4992-1-konishi.ryusuke@gmail.com
+Fixes: 7f42ec394156 ("nilfs2: fix issue with race condition of competition between segments for dirty blocks")
+Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
+Reported-by: syzbot+5c04210f7c7f897c1e7f@syzkaller.appspotmail.com
+Closes: https://lkml.kernel.org/r/00000000000019a97c05fd42f8c8@google.com
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/nilfs2/segment.c |    8 +++++---
+ 1 file changed, 5 insertions(+), 3 deletions(-)
+
+--- a/fs/nilfs2/segment.c
++++ b/fs/nilfs2/segment.c
+@@ -1702,7 +1702,6 @@ static void nilfs_segctor_prepare_write(
+               list_for_each_entry(bh, &segbuf->sb_payload_buffers,
+                                   b_assoc_buffers) {
+-                      set_buffer_async_write(bh);
+                       if (bh == segbuf->sb_super_root) {
+                               if (bh->b_page != bd_page) {
+                                       lock_page(bd_page);
+@@ -1713,6 +1712,7 @@ static void nilfs_segctor_prepare_write(
+                               }
+                               break;
+                       }
++                      set_buffer_async_write(bh);
+                       if (bh->b_page != fs_page) {
+                               nilfs_begin_page_io(fs_page);
+                               fs_page = bh->b_page;
+@@ -1798,7 +1798,6 @@ static void nilfs_abort_logs(struct list
+               list_for_each_entry(bh, &segbuf->sb_payload_buffers,
+                                   b_assoc_buffers) {
+-                      clear_buffer_async_write(bh);
+                       if (bh == segbuf->sb_super_root) {
+                               clear_buffer_uptodate(bh);
+                               if (bh->b_page != bd_page) {
+@@ -1807,6 +1806,7 @@ static void nilfs_abort_logs(struct list
+                               }
+                               break;
+                       }
++                      clear_buffer_async_write(bh);
+                       if (bh->b_page != fs_page) {
+                               nilfs_end_page_io(fs_page, err);
+                               fs_page = bh->b_page;
+@@ -1894,8 +1894,9 @@ static void nilfs_segctor_complete_write
+                                BIT(BH_Delay) | BIT(BH_NILFS_Volatile) |
+                                BIT(BH_NILFS_Redirected));
+-                      set_mask_bits(&bh->b_state, clear_bits, set_bits);
+                       if (bh == segbuf->sb_super_root) {
++                              set_buffer_uptodate(bh);
++                              clear_buffer_dirty(bh);
+                               if (bh->b_page != bd_page) {
+                                       end_page_writeback(bd_page);
+                                       bd_page = bh->b_page;
+@@ -1903,6 +1904,7 @@ static void nilfs_segctor_complete_write
+                               update_sr = true;
+                               break;
+                       }
++                      set_mask_bits(&bh->b_state, clear_bits, set_bits);
+                       if (bh->b_page != fs_page) {
+                               nilfs_end_page_io(fs_page, 0);
+                               fs_page = bh->b_page;
diff --git a/queue-5.4/nilfs2-replace-warn_ons-for-invalid-dat-metadata-block-requests.patch b/queue-5.4/nilfs2-replace-warn_ons-for-invalid-dat-metadata-block-requests.patch
new file mode 100644 (file)
index 0000000..90404dc
--- /dev/null
@@ -0,0 +1,82 @@
+From 5124a0a549857c4b87173280e192eea24dea72ad Mon Sep 17 00:00:00 2001
+From: Ryusuke Konishi <konishi.ryusuke@gmail.com>
+Date: Fri, 27 Jan 2023 01:41:14 +0900
+Subject: nilfs2: replace WARN_ONs for invalid DAT metadata block requests
+
+From: Ryusuke Konishi <konishi.ryusuke@gmail.com>
+
+commit 5124a0a549857c4b87173280e192eea24dea72ad upstream.
+
+If DAT metadata file block access fails due to corruption of the DAT file
+or abnormal virtual block numbers held by b-trees or inodes, a kernel
+warning is generated.
+
+This replaces the WARN_ONs by error output, so that a kernel, booted with
+panic_on_warn, does not panic.  This patch also replaces the detected
+return code -ENOENT with another internal code -EINVAL to notify the bmap
+layer of metadata corruption.  When the bmap layer sees -EINVAL, it
+handles the abnormal situation with nilfs_bmap_convert_error() and finally
+returns code -EIO as it should.
+
+Link: https://lkml.kernel.org/r/0000000000005cc3d205ea23ddcf@google.com
+Link: https://lkml.kernel.org/r/20230126164114.6911-1-konishi.ryusuke@gmail.com
+Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
+Reported-by: <syzbot+5d5d25f90f195a3cfcb4@syzkaller.appspotmail.com>
+Tested-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/nilfs2/dat.c |   27 +++++++++++++++++----------
+ 1 file changed, 17 insertions(+), 10 deletions(-)
+
+--- a/fs/nilfs2/dat.c
++++ b/fs/nilfs2/dat.c
+@@ -40,8 +40,21 @@ static inline struct nilfs_dat_info *NIL
+ static int nilfs_dat_prepare_entry(struct inode *dat,
+                                  struct nilfs_palloc_req *req, int create)
+ {
+-      return nilfs_palloc_get_entry_block(dat, req->pr_entry_nr,
+-                                          create, &req->pr_entry_bh);
++      int ret;
++
++      ret = nilfs_palloc_get_entry_block(dat, req->pr_entry_nr,
++                                         create, &req->pr_entry_bh);
++      if (unlikely(ret == -ENOENT)) {
++              nilfs_err(dat->i_sb,
++                        "DAT doesn't have a block to manage vblocknr = %llu",
++                        (unsigned long long)req->pr_entry_nr);
++              /*
++               * Return internal code -EINVAL to notify bmap layer of
++               * metadata corruption.
++               */
++              ret = -EINVAL;
++      }
++      return ret;
+ }
+ static void nilfs_dat_commit_entry(struct inode *dat,
+@@ -123,11 +136,7 @@ static void nilfs_dat_commit_free(struct
+ int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req)
+ {
+-      int ret;
+-
+-      ret = nilfs_dat_prepare_entry(dat, req, 0);
+-      WARN_ON(ret == -ENOENT);
+-      return ret;
++      return nilfs_dat_prepare_entry(dat, req, 0);
+ }
+ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req,
+@@ -154,10 +163,8 @@ int nilfs_dat_prepare_end(struct inode *
+       int ret;
+       ret = nilfs_dat_prepare_entry(dat, req, 0);
+-      if (ret < 0) {
+-              WARN_ON(ret == -ENOENT);
++      if (ret < 0)
+               return ret;
+-      }
+       kaddr = kmap_atomic(req->pr_entry_bh->b_page);
+       entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
diff --git a/queue-5.4/sched-membarrier-reduce-the-ability-to-hammer-on-sys_membarrier.patch b/queue-5.4/sched-membarrier-reduce-the-ability-to-hammer-on-sys_membarrier.patch
new file mode 100644 (file)
index 0000000..d17634a
--- /dev/null
@@ -0,0 +1,86 @@
+From 944d5fe50f3f03daacfea16300e656a1691c4a23 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds <torvalds@linuxfoundation.org>
+Date: Sun, 4 Feb 2024 15:25:12 +0000
+Subject: sched/membarrier: reduce the ability to hammer on sys_membarrier
+
+From: Linus Torvalds <torvalds@linuxfoundation.org>
+
+commit 944d5fe50f3f03daacfea16300e656a1691c4a23 upstream.
+
+On some systems, sys_membarrier can be very expensive, causing overall
+slowdowns for everything.  So put a lock on the path in order to
+serialize the accesses to prevent the ability for this to be called at
+too high of a frequency and saturate the machine.
+
+Reviewed-and-tested-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
+Acked-by: Borislav Petkov <bp@alien8.de>
+Fixes: 22e4ebb97582 ("membarrier: Provide expedited private command")
+Fixes: c5f58bd58f43 ("membarrier: Provide GLOBAL_EXPEDITED command")
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+[ converted to explicit mutex_*() calls - cleanup.h is not in this stable
+  branch - gregkh ]
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/sched/membarrier.c |    9 +++++++++
+ 1 file changed, 9 insertions(+)
+
+--- a/kernel/sched/membarrier.c
++++ b/kernel/sched/membarrier.c
+@@ -25,6 +25,8 @@
+       | MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED                     \
+       | MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)
++static DEFINE_MUTEX(membarrier_ipi_mutex);
++
+ static void ipi_mb(void *info)
+ {
+       smp_mb();       /* IPIs should be serializing but paranoid. */
+@@ -97,6 +99,7 @@ static int membarrier_global_expedited(v
+       if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+               return -ENOMEM;
++      mutex_lock(&membarrier_ipi_mutex);
+       cpus_read_lock();
+       rcu_read_lock();
+       for_each_online_cpu(cpu) {
+@@ -143,6 +146,8 @@ static int membarrier_global_expedited(v
+        * rq->curr modification in scheduler.
+        */
+       smp_mb();       /* exit from system call is not a mb */
++      mutex_unlock(&membarrier_ipi_mutex);
++
+       return 0;
+ }
+@@ -178,6 +183,7 @@ static int membarrier_private_expedited(
+       if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+               return -ENOMEM;
++      mutex_lock(&membarrier_ipi_mutex);
+       cpus_read_lock();
+       rcu_read_lock();
+       for_each_online_cpu(cpu) {
+@@ -212,6 +218,7 @@ static int membarrier_private_expedited(
+        * rq->curr modification in scheduler.
+        */
+       smp_mb();       /* exit from system call is not a mb */
++      mutex_unlock(&membarrier_ipi_mutex);
+       return 0;
+ }
+@@ -253,6 +260,7 @@ static int sync_runqueues_membarrier_sta
+        * between threads which are users of @mm has its membarrier state
+        * updated.
+        */
++      mutex_lock(&membarrier_ipi_mutex);
+       cpus_read_lock();
+       rcu_read_lock();
+       for_each_online_cpu(cpu) {
+@@ -269,6 +277,7 @@ static int sync_runqueues_membarrier_sta
+       free_cpumask_var(tmpmask);
+       cpus_read_unlock();
++      mutex_unlock(&membarrier_ipi_mutex);
+       return 0;
+ }
diff --git a/queue-5.4/series b/queue-5.4/series
index 03f6e25ebacf434ae1845f911d5ef576c74abccf..884ffc2f8c08c8275013a5ea19c2d85e65734553 100644 (file)
@@ -253,3 +253,7 @@ kvm-arm64-vgic-its-avoid-potential-uaf-in-lpi-transl.patch
 netfilter-ipset-fix-performance-regression-in-swap-operation.patch
 netfilter-ipset-missing-gc-cancellations-fixed.patch
 net-prevent-mss-overflow-in-skb_segment.patch
+sched-membarrier-reduce-the-ability-to-hammer-on-sys_membarrier.patch
+mm-memcontrol-decouple-reference-counting-from-page-accounting.patch
+nilfs2-fix-potential-bug-in-end_buffer_async_write.patch
+nilfs2-replace-warn_ons-for-invalid-dat-metadata-block-requests.patch