From: Greg Kroah-Hartman
Date: Wed, 21 Feb 2024 08:51:49 +0000 (+0100)
Subject: 5.4-stable patches
X-Git-Tag: v4.19.307~39
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=84282aee0cc74d5136448394a6afae8cd2e97a10;p=thirdparty%2Fkernel%2Fstable-queue.git

5.4-stable patches

added patches:
	mm-memcontrol-decouple-reference-counting-from-page-accounting.patch
	nilfs2-fix-potential-bug-in-end_buffer_async_write.patch
	nilfs2-replace-warn_ons-for-invalid-dat-metadata-block-requests.patch
	sched-membarrier-reduce-the-ability-to-hammer-on-sys_membarrier.patch
---

diff --git a/queue-5.4/mm-memcontrol-decouple-reference-counting-from-page-accounting.patch b/queue-5.4/mm-memcontrol-decouple-reference-counting-from-page-accounting.patch
new file mode 100644
index 00000000000..058fc0c8ce7
--- /dev/null
+++ b/queue-5.4/mm-memcontrol-decouple-reference-counting-from-page-accounting.patch
@@ -0,0 +1,104 @@
+From 1a3e1f40962c445b997151a542314f3c6097f8c3 Mon Sep 17 00:00:00 2001
+From: Johannes Weiner
+Date: Thu, 6 Aug 2020 23:20:45 -0700
+Subject: mm: memcontrol: decouple reference counting from page accounting
+
+From: Johannes Weiner
+
+commit 1a3e1f40962c445b997151a542314f3c6097f8c3 upstream.
+
+The reference counting of a memcg is currently coupled directly to how
+many 4k pages are charged to it. This doesn't work well with Roman's new
+slab controller, which maintains pools of objects and doesn't want to keep
+an extra balance sheet for the pages backing those objects.
+
+This unusual refcounting design (reference counts usually track pointers
+to an object) is only for historical reasons: memcg used to not take any
+css references and simply stalled offlining until all charges had been
+reparented and the page counters had dropped to zero. When we got rid of
+the reparenting requirement, the simple mechanical translation was to take
+a reference for every charge.
+
+More historical context can be found in commit e8ea14cc6ead ("mm:
+memcontrol: take a css reference for each charged page"), commit
+64f219938941 ("mm: memcontrol: remove obsolete kmemcg pinning tricks") and
+commit b2052564e66d ("mm: memcontrol: continue cache reclaim from offlined
+groups").
+
+The new slab controller exposes the limitations in this scheme, so let's
+switch it to a more idiomatic reference counting model based on actual
+kernel pointers to the memcg:
+
+- The per-cpu stock holds a reference to the memcg its caching
+
+- User pages hold a reference for their page->mem_cgroup. Transparent
+  huge pages will no longer acquire tail references in advance, we'll
+  get them if needed during the split.
+
+- Kernel pages hold a reference for their page->mem_cgroup
+
+- Pages allocated in the root cgroup will acquire and release css
+  references for simplicity. css_get() and css_put() optimize that.
+
+- The current memcg_charge_slab() already hacked around the per-charge
+  references; this change gets rid of that as well.
+
+- tcp accounting will handle reference in mem_cgroup_sk_{alloc,free}
+
+Roman:
+1) Rebased on top of the current mm tree: added css_get() in
+   mem_cgroup_charge(), dropped mem_cgroup_try_charge() part
+2) I've reformatted commit references in the commit log to make
+   checkpatch.pl happy.
+
+[hughd@google.com: remove css_put_many() from __mem_cgroup_clear_mc()]
+  Link: http://lkml.kernel.org/r/alpine.LSU.2.11.2007302011450.2347@eggly.anvils
+
+Signed-off-by: Johannes Weiner
+Signed-off-by: Roman Gushchin
+Signed-off-by: Hugh Dickins
+Signed-off-by: Andrew Morton
+Reviewed-by: Shakeel Butt
+Acked-by: Roman Gushchin
+Acked-by: Michal Hocko
+Cc: Christoph Lameter
+Cc: Tejun Heo
+Cc: Vlastimil Babka
+Link: http://lkml.kernel.org/r/20200623174037.3951353-6-guro@fb.com
+Signed-off-by: Linus Torvalds
+Fixes: cdec2e4265df ("memcg: coalesce charging via percpu storage")
+Signed-off-by: GONG, Ruiqi
+Signed-off-by: Greg Kroah-Hartman
+---
+ mm/memcontrol.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -2214,6 +2214,9 @@ static void drain_stock(struct memcg_sto
+ {
+ 	struct mem_cgroup *old = stock->cached;
+ 
++	if (!old)
++		return;
++
+ 	if (stock->nr_pages) {
+ 		page_counter_uncharge(&old->memory, stock->nr_pages);
+ 		if (do_memsw_account())
+@@ -2221,6 +2224,8 @@ static void drain_stock(struct memcg_sto
+ 		css_put_many(&old->css, stock->nr_pages);
+ 		stock->nr_pages = 0;
+ 	}
++
++	css_put(&old->css);
+ 	stock->cached = NULL;
+ }
+ 
+@@ -2256,6 +2261,7 @@ static void refill_stock(struct mem_cgro
+ 	stock = this_cpu_ptr(&memcg_stock);
+ 	if (stock->cached != memcg) { /* reset if necessary */
+ 		drain_stock(stock);
++		css_get(&memcg->css);
+ 		stock->cached = memcg;
+ 	}
+ 	stock->nr_pages += nr_pages;
diff --git a/queue-5.4/nilfs2-fix-potential-bug-in-end_buffer_async_write.patch b/queue-5.4/nilfs2-fix-potential-bug-in-end_buffer_async_write.patch
new file mode 100644
index 00000000000..084cdb505f7
--- /dev/null
+++ b/queue-5.4/nilfs2-fix-potential-bug-in-end_buffer_async_write.patch
@@ -0,0 +1,99 @@
+From 5bc09b397cbf1221f8a8aacb1152650c9195b02b Mon Sep 17 00:00:00 2001
+From: Ryusuke Konishi
+Date: Sun, 4 Feb 2024 01:16:45 +0900
+Subject: nilfs2: fix potential bug in end_buffer_async_write
+
+From: Ryusuke Konishi
+
+commit 5bc09b397cbf1221f8a8aacb1152650c9195b02b upstream.
+
+According to a syzbot report, end_buffer_async_write(), which handles the
+completion of block device writes, may detect abnormal condition of the
+buffer async_write flag and cause a BUG_ON failure when using nilfs2.
+
+Nilfs2 itself does not use end_buffer_async_write(). But, the async_write
+flag is now used as a marker by commit 7f42ec394156 ("nilfs2: fix issue
+with race condition of competition between segments for dirty blocks") as
+a means of resolving double list insertion of dirty blocks in
+nilfs_lookup_dirty_data_buffers() and nilfs_lookup_node_buffers() and the
+resulting crash.
+
+This modification is safe as long as it is used for file data and b-tree
+node blocks where the page caches are independent. However, it was
+irrelevant and redundant to also introduce async_write for segment summary
+and super root blocks that share buffers with the backing device. This
+led to the possibility that the BUG_ON check in end_buffer_async_write
+would fail as described above, if independent writebacks of the backing
+device occurred in parallel.
+
+The use of async_write for segment summary buffers has already been
+removed in a previous change.
+
+Fix this issue by removing the manipulation of the async_write flag for
+the remaining super root block buffer.
+
+Link: https://lkml.kernel.org/r/20240203161645.4992-1-konishi.ryusuke@gmail.com
+Fixes: 7f42ec394156 ("nilfs2: fix issue with race condition of competition between segments for dirty blocks")
+Signed-off-by: Ryusuke Konishi
+Reported-by: syzbot+5c04210f7c7f897c1e7f@syzkaller.appspotmail.com
+Closes: https://lkml.kernel.org/r/00000000000019a97c05fd42f8c8@google.com
+Cc:
+Signed-off-by: Andrew Morton
+Signed-off-by: Greg Kroah-Hartman
+---
+ fs/nilfs2/segment.c | 8 +++++---
+ 1 file changed, 5 insertions(+), 3 deletions(-)
+
+--- a/fs/nilfs2/segment.c
++++ b/fs/nilfs2/segment.c
+@@ -1702,7 +1702,6 @@ static void nilfs_segctor_prepare_write(
+ 
+ 		list_for_each_entry(bh, &segbuf->sb_payload_buffers,
+ 				    b_assoc_buffers) {
+-			set_buffer_async_write(bh);
+ 			if (bh == segbuf->sb_super_root) {
+ 				if (bh->b_page != bd_page) {
+ 					lock_page(bd_page);
+@@ -1713,6 +1712,7 @@ static void nilfs_segctor_prepare_write(
+ 				}
+ 				break;
+ 			}
++			set_buffer_async_write(bh);
+ 			if (bh->b_page != fs_page) {
+ 				nilfs_begin_page_io(fs_page);
+ 				fs_page = bh->b_page;
+@@ -1798,7 +1798,6 @@ static void nilfs_abort_logs(struct list
+ 
+ 		list_for_each_entry(bh, &segbuf->sb_payload_buffers,
+ 				    b_assoc_buffers) {
+-			clear_buffer_async_write(bh);
+ 			if (bh == segbuf->sb_super_root) {
+ 				clear_buffer_uptodate(bh);
+ 				if (bh->b_page != bd_page) {
+@@ -1807,6 +1806,7 @@ static void nilfs_abort_logs(struct list
+ 				}
+ 				break;
+ 			}
++			clear_buffer_async_write(bh);
+ 			if (bh->b_page != fs_page) {
+ 				nilfs_end_page_io(fs_page, err);
+ 				fs_page = bh->b_page;
+@@ -1894,8 +1894,9 @@ static void nilfs_segctor_complete_write
+ 				 BIT(BH_Delay) | BIT(BH_NILFS_Volatile) |
+ 				 BIT(BH_NILFS_Redirected));
+ 
+-			set_mask_bits(&bh->b_state, clear_bits, set_bits);
+ 			if (bh == segbuf->sb_super_root) {
++				set_buffer_uptodate(bh);
++				clear_buffer_dirty(bh);
+ 				if (bh->b_page != bd_page) {
+ 					end_page_writeback(bd_page);
+ 					bd_page = bh->b_page;
+@@ -1903,6 +1904,7 @@ static void nilfs_segctor_complete_write
+ 				update_sr = true;
+ 				break;
+ 			}
++			set_mask_bits(&bh->b_state, clear_bits, set_bits);
+ 			if (bh->b_page != fs_page) {
+ 				nilfs_end_page_io(fs_page, 0);
+ 				fs_page = bh->b_page;
diff --git a/queue-5.4/nilfs2-replace-warn_ons-for-invalid-dat-metadata-block-requests.patch b/queue-5.4/nilfs2-replace-warn_ons-for-invalid-dat-metadata-block-requests.patch
new file mode 100644
index 00000000000..90404dc64c9
--- /dev/null
+++ b/queue-5.4/nilfs2-replace-warn_ons-for-invalid-dat-metadata-block-requests.patch
@@ -0,0 +1,82 @@
+From 5124a0a549857c4b87173280e192eea24dea72ad Mon Sep 17 00:00:00 2001
+From: Ryusuke Konishi
+Date: Fri, 27 Jan 2023 01:41:14 +0900
+Subject: nilfs2: replace WARN_ONs for invalid DAT metadata block requests
+
+From: Ryusuke Konishi
+
+commit 5124a0a549857c4b87173280e192eea24dea72ad upstream.
+
+If DAT metadata file block access fails due to corruption of the DAT file
+or abnormal virtual block numbers held by b-trees or inodes, a kernel
+warning is generated.
+
+This replaces the WARN_ONs by error output, so that a kernel, booted with
+panic_on_warn, does not panic. This patch also replaces the detected
+return code -ENOENT with another internal code -EINVAL to notify the bmap
+layer of metadata corruption. When the bmap layer sees -EINVAL, it
+handles the abnormal situation with nilfs_bmap_convert_error() and finally
+returns code -EIO as it should.
+
+Link: https://lkml.kernel.org/r/0000000000005cc3d205ea23ddcf@google.com
+Link: https://lkml.kernel.org/r/20230126164114.6911-1-konishi.ryusuke@gmail.com
+Signed-off-by: Ryusuke Konishi
+Reported-by:
+Tested-by: Ryusuke Konishi
+Signed-off-by: Andrew Morton
+Signed-off-by: Greg Kroah-Hartman
+---
+ fs/nilfs2/dat.c | 27 +++++++++++++++++----------
+ 1 file changed, 17 insertions(+), 10 deletions(-)
+
+--- a/fs/nilfs2/dat.c
++++ b/fs/nilfs2/dat.c
+@@ -40,8 +40,21 @@ static inline struct nilfs_dat_info *NIL
+ static int nilfs_dat_prepare_entry(struct inode *dat,
+ 				   struct nilfs_palloc_req *req, int create)
+ {
+-	return nilfs_palloc_get_entry_block(dat, req->pr_entry_nr,
+-					    create, &req->pr_entry_bh);
++	int ret;
++
++	ret = nilfs_palloc_get_entry_block(dat, req->pr_entry_nr,
++					   create, &req->pr_entry_bh);
++	if (unlikely(ret == -ENOENT)) {
++		nilfs_err(dat->i_sb,
++			  "DAT doesn't have a block to manage vblocknr = %llu",
++			  (unsigned long long)req->pr_entry_nr);
++		/*
++		 * Return internal code -EINVAL to notify bmap layer of
++		 * metadata corruption.
++		 */
++		ret = -EINVAL;
++	}
++	return ret;
+ }
+ 
+ static void nilfs_dat_commit_entry(struct inode *dat,
+@@ -123,11 +136,7 @@ static void nilfs_dat_commit_free(struct
+ 
+ int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req)
+ {
+-	int ret;
+-
+-	ret = nilfs_dat_prepare_entry(dat, req, 0);
+-	WARN_ON(ret == -ENOENT);
+-	return ret;
++	return nilfs_dat_prepare_entry(dat, req, 0);
+ }
+ 
+ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req,
+@@ -154,10 +163,8 @@ int nilfs_dat_prepare_end(struct inode *
+ 	int ret;
+ 
+ 	ret = nilfs_dat_prepare_entry(dat, req, 0);
+-	if (ret < 0) {
+-		WARN_ON(ret == -ENOENT);
++	if (ret < 0)
+ 		return ret;
+-	}
+ 
+ 	kaddr = kmap_atomic(req->pr_entry_bh->b_page);
+ 	entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
diff --git a/queue-5.4/sched-membarrier-reduce-the-ability-to-hammer-on-sys_membarrier.patch b/queue-5.4/sched-membarrier-reduce-the-ability-to-hammer-on-sys_membarrier.patch
new file mode 100644
index 00000000000..d17634a5d92
--- /dev/null
+++ b/queue-5.4/sched-membarrier-reduce-the-ability-to-hammer-on-sys_membarrier.patch
@@ -0,0 +1,86 @@
+From 944d5fe50f3f03daacfea16300e656a1691c4a23 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds
+Date: Sun, 4 Feb 2024 15:25:12 +0000
+Subject: sched/membarrier: reduce the ability to hammer on sys_membarrier
+
+From: Linus Torvalds
+
+commit 944d5fe50f3f03daacfea16300e656a1691c4a23 upstream.
+
+On some systems, sys_membarrier can be very expensive, causing overall
+slowdowns for everything. So put a lock on the path in order to
+serialize the accesses to prevent the ability for this to be called at
+too high of a frequency and saturate the machine.
+
+Reviewed-and-tested-by: Mathieu Desnoyers
+Acked-by: Borislav Petkov
+Fixes: 22e4ebb97582 ("membarrier: Provide expedited private command")
+Fixes: c5f58bd58f43 ("membarrier: Provide GLOBAL_EXPEDITED command")
+Signed-off-by: Linus Torvalds
+[ converted to explicit mutex_*() calls - cleanup.h is not in this stable
+ branch - gregkh ]
+Signed-off-by: Greg Kroah-Hartman
+---
+ kernel/sched/membarrier.c | 9 +++++++++
+ 1 file changed, 9 insertions(+)
+
+--- a/kernel/sched/membarrier.c
++++ b/kernel/sched/membarrier.c
+@@ -25,6 +25,8 @@
+ 	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \
+ 	| MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)
+ 
++static DEFINE_MUTEX(membarrier_ipi_mutex);
++
+ static void ipi_mb(void *info)
+ {
+ 	smp_mb();	/* IPIs should be serializing but paranoid. */
+@@ -97,6 +99,7 @@ static int membarrier_global_expedited(v
+ 	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+ 		return -ENOMEM;
+ 
++	mutex_lock(&membarrier_ipi_mutex);
+ 	cpus_read_lock();
+ 	rcu_read_lock();
+ 	for_each_online_cpu(cpu) {
+@@ -143,6 +146,8 @@ static int membarrier_global_expedited(v
+ 	 * rq->curr modification in scheduler.
+ 	 */
+ 	smp_mb();	/* exit from system call is not a mb */
++	mutex_unlock(&membarrier_ipi_mutex);
++
+ 	return 0;
+ }
+ 
+@@ -178,6 +183,7 @@ static int membarrier_private_expedited(
+ 	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+ 		return -ENOMEM;
+ 
++	mutex_lock(&membarrier_ipi_mutex);
+ 	cpus_read_lock();
+ 	rcu_read_lock();
+ 	for_each_online_cpu(cpu) {
+@@ -212,6 +218,7 @@ static int membarrier_private_expedited(
+ 	 * rq->curr modification in scheduler.
+ 	 */
+ 	smp_mb();	/* exit from system call is not a mb */
++	mutex_unlock(&membarrier_ipi_mutex);
+ 
+ 	return 0;
+ }
+@@ -253,6 +260,7 @@ static int sync_runqueues_membarrier_sta
+ 	 * between threads which are users of @mm has its membarrier state
+ 	 * updated.
+ 	 */
++	mutex_lock(&membarrier_ipi_mutex);
+ 	cpus_read_lock();
+ 	rcu_read_lock();
+ 	for_each_online_cpu(cpu) {
+@@ -269,6 +277,7 @@ static int sync_runqueues_membarrier_sta
+ 
+ 	free_cpumask_var(tmpmask);
+ 	cpus_read_unlock();
++	mutex_unlock(&membarrier_ipi_mutex);
+ 
+ 	return 0;
+ }
diff --git a/queue-5.4/series b/queue-5.4/series
index 03f6e25ebac..884ffc2f8c0 100644
--- a/queue-5.4/series
+++ b/queue-5.4/series
@@ -253,3 +253,7 @@ kvm-arm64-vgic-its-avoid-potential-uaf-in-lpi-transl.patch
 netfilter-ipset-fix-performance-regression-in-swap-operation.patch
 netfilter-ipset-missing-gc-cancellations-fixed.patch
 net-prevent-mss-overflow-in-skb_segment.patch
+sched-membarrier-reduce-the-ability-to-hammer-on-sys_membarrier.patch
+mm-memcontrol-decouple-reference-counting-from-page-accounting.patch
+nilfs2-fix-potential-bug-in-end_buffer_async_write.patch
+nilfs2-replace-warn_ons-for-invalid-dat-metadata-block-requests.patch