From: Greg Kroah-Hartman
Date: Wed, 21 Feb 2024 08:51:49 +0000 (+0100)
Subject: 5.4-stable patches
X-Git-Tag: v4.19.307~39
X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=84282aee0cc74d5136448394a6afae8cd2e97a10;p=thirdparty%2Fkernel%2Fstable-queue.git

5.4-stable patches

added patches:
	mm-memcontrol-decouple-reference-counting-from-page-accounting.patch
	nilfs2-fix-potential-bug-in-end_buffer_async_write.patch
	nilfs2-replace-warn_ons-for-invalid-dat-metadata-block-requests.patch
	sched-membarrier-reduce-the-ability-to-hammer-on-sys_membarrier.patch
---

diff --git a/queue-5.4/mm-memcontrol-decouple-reference-counting-from-page-accounting.patch b/queue-5.4/mm-memcontrol-decouple-reference-counting-from-page-accounting.patch
new file mode 100644
index 00000000000..058fc0c8ce7
--- /dev/null
+++ b/queue-5.4/mm-memcontrol-decouple-reference-counting-from-page-accounting.patch
@@ -0,0 +1,104 @@
+From 1a3e1f40962c445b997151a542314f3c6097f8c3 Mon Sep 17 00:00:00 2001
+From: Johannes Weiner
+Date: Thu, 6 Aug 2020 23:20:45 -0700
+Subject: mm: memcontrol: decouple reference counting from page accounting
+
+From: Johannes Weiner
+
+commit 1a3e1f40962c445b997151a542314f3c6097f8c3 upstream.
+
+The reference counting of a memcg is currently coupled directly to how
+many 4k pages are charged to it. This doesn't work well with Roman's new
+slab controller, which maintains pools of objects and doesn't want to keep
+an extra balance sheet for the pages backing those objects.
+
+This unusual refcounting design (reference counts usually track pointers
+to an object) is only for historical reasons: memcg used to not take any
+css references and simply stalled offlining until all charges had been
+reparented and the page counters had dropped to zero. When we got rid of
+the reparenting requirement, the simple mechanical translation was to take
+a reference for every charge.
+
+More historical context can be found in commit e8ea14cc6ead ("mm:
+memcontrol: take a css reference for each charged page"), commit
+64f219938941 ("mm: memcontrol: remove obsolete kmemcg pinning tricks") and
+commit b2052564e66d ("mm: memcontrol: continue cache reclaim from offlined
+groups").
+
+The new slab controller exposes the limitations in this scheme, so let's
+switch it to a more idiomatic reference counting model based on actual
+kernel pointers to the memcg:
+
+- The per-cpu stock holds a reference to the memcg its caching
+
+- User pages hold a reference for their page->mem_cgroup. Transparent
+  huge pages will no longer acquire tail references in advance, we'll
+  get them if needed during the split.
+
+- Kernel pages hold a reference for their page->mem_cgroup
+
+- Pages allocated in the root cgroup will acquire and release css
+  references for simplicity. css_get() and css_put() optimize that.
+
+- The current memcg_charge_slab() already hacked around the per-charge
+  references; this change gets rid of that as well.
+
+- tcp accounting will handle reference in mem_cgroup_sk_{alloc,free}
+
+Roman:
+1) Rebased on top of the current mm tree: added css_get() in
+   mem_cgroup_charge(), dropped mem_cgroup_try_charge() part
+2) I've reformatted commit references in the commit log to make
+   checkpatch.pl happy.
+
+[hughd@google.com: remove css_put_many() from __mem_cgroup_clear_mc()]
+  Link: http://lkml.kernel.org/r/alpine.LSU.2.11.2007302011450.2347@eggly.anvils
+
+Signed-off-by: Johannes Weiner
+Signed-off-by: Roman Gushchin
+Signed-off-by: Hugh Dickins
+Signed-off-by: Andrew Morton
+Reviewed-by: Shakeel Butt
+Acked-by: Roman Gushchin
+Acked-by: Michal Hocko
+Cc: Christoph Lameter
+Cc: Tejun Heo
+Cc: Vlastimil Babka
+Link: http://lkml.kernel.org/r/20200623174037.3951353-6-guro@fb.com
+Signed-off-by: Linus Torvalds
+Fixes: cdec2e4265df ("memcg: coalesce charging via percpu storage")
+Signed-off-by: GONG, Ruiqi
+Signed-off-by: Greg Kroah-Hartman
+---
+ mm/memcontrol.c | 6 ++++++
+ 1 file changed, 6 insertions(+)
+
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -2214,6 +2214,9 @@ static void drain_stock(struct memcg_sto
+ {
+ 	struct mem_cgroup *old = stock->cached;
+ 
++	if (!old)
++		return;
++
+ 	if (stock->nr_pages) {
+ 		page_counter_uncharge(&old->memory, stock->nr_pages);
+ 		if (do_memsw_account())
+@@ -2221,6 +2224,8 @@ static void drain_stock(struct memcg_sto
+ 		css_put_many(&old->css, stock->nr_pages);
+ 		stock->nr_pages = 0;
+ 	}
++
++	css_put(&old->css);
+ 	stock->cached = NULL;
+ }
+ 
+@@ -2256,6 +2261,7 @@ static void refill_stock(struct mem_cgro
+ 	stock = this_cpu_ptr(&memcg_stock);
+ 	if (stock->cached != memcg) { /* reset if necessary */
+ 		drain_stock(stock);
++		css_get(&memcg->css);
+ 		stock->cached = memcg;
+ 	}
+ 	stock->nr_pages += nr_pages;
diff --git a/queue-5.4/nilfs2-fix-potential-bug-in-end_buffer_async_write.patch b/queue-5.4/nilfs2-fix-potential-bug-in-end_buffer_async_write.patch
new file mode 100644
index 00000000000..084cdb505f7
--- /dev/null
+++ b/queue-5.4/nilfs2-fix-potential-bug-in-end_buffer_async_write.patch
@@ -0,0 +1,99 @@
+From 5bc09b397cbf1221f8a8aacb1152650c9195b02b Mon Sep 17 00:00:00 2001
+From: Ryusuke Konishi
+Date: Sun, 4 Feb 2024 01:16:45 +0900
+Subject: nilfs2: fix potential bug in end_buffer_async_write
+
+From: Ryusuke Konishi
+
+commit 5bc09b397cbf1221f8a8aacb1152650c9195b02b upstream.
+
+According to a syzbot report, end_buffer_async_write(), which handles the
+completion of block device writes, may detect abnormal condition of the
+buffer async_write flag and cause a BUG_ON failure when using nilfs2.
+
+Nilfs2 itself does not use end_buffer_async_write(). But, the async_write
+flag is now used as a marker by commit 7f42ec394156 ("nilfs2: fix issue
+with race condition of competition between segments for dirty blocks") as
+a means of resolving double list insertion of dirty blocks in
+nilfs_lookup_dirty_data_buffers() and nilfs_lookup_node_buffers() and the
+resulting crash.
+
+This modification is safe as long as it is used for file data and b-tree
+node blocks where the page caches are independent. However, it was
+irrelevant and redundant to also introduce async_write for segment summary
+and super root blocks that share buffers with the backing device. This
+led to the possibility that the BUG_ON check in end_buffer_async_write
+would fail as described above, if independent writebacks of the backing
+device occurred in parallel.
+
+The use of async_write for segment summary buffers has already been
+removed in a previous change.
+
+Fix this issue by removing the manipulation of the async_write flag for
+the remaining super root block buffer.
+
+Link: https://lkml.kernel.org/r/20240203161645.4992-1-konishi.ryusuke@gmail.com
+Fixes: 7f42ec394156 ("nilfs2: fix issue with race condition of competition between segments for dirty blocks")
+Signed-off-by: Ryusuke Konishi
+Reported-by: syzbot+5c04210f7c7f897c1e7f@syzkaller.appspotmail.com
+Closes: https://lkml.kernel.org/r/00000000000019a97c05fd42f8c8@google.com
+Cc:
+Signed-off-by: Andrew Morton
+Signed-off-by: Greg Kroah-Hartman
+---
+ fs/nilfs2/segment.c | 8 +++++---
+ 1 file changed, 5 insertions(+), 3 deletions(-)
+
+--- a/fs/nilfs2/segment.c
++++ b/fs/nilfs2/segment.c
+@@ -1702,7 +1702,6 @@ static void nilfs_segctor_prepare_write(
+ 
+ 		list_for_each_entry(bh, &segbuf->sb_payload_buffers,
+ 				    b_assoc_buffers) {
+-			set_buffer_async_write(bh);
+ 			if (bh == segbuf->sb_super_root) {
+ 				if (bh->b_page != bd_page) {
+ 					lock_page(bd_page);
+@@ -1713,6 +1712,7 @@ static void nilfs_segctor_prepare_write(
+ 				}
+ 				break;
+ 			}
++			set_buffer_async_write(bh);
+ 			if (bh->b_page != fs_page) {
+ 				nilfs_begin_page_io(fs_page);
+ 				fs_page = bh->b_page;
+@@ -1798,7 +1798,6 @@ static void nilfs_abort_logs(struct list
+ 
+ 		list_for_each_entry(bh, &segbuf->sb_payload_buffers,
+ 				    b_assoc_buffers) {
+-			clear_buffer_async_write(bh);
+ 			if (bh == segbuf->sb_super_root) {
+ 				clear_buffer_uptodate(bh);
+ 				if (bh->b_page != bd_page) {
+@@ -1807,6 +1806,7 @@ static void nilfs_abort_logs(struct list
+ 				}
+ 				break;
+ 			}
++			clear_buffer_async_write(bh);
+ 			if (bh->b_page != fs_page) {
+ 				nilfs_end_page_io(fs_page, err);
+ 				fs_page = bh->b_page;
+@@ -1894,8 +1894,9 @@ static void nilfs_segctor_complete_write
+ 				 BIT(BH_Delay) | BIT(BH_NILFS_Volatile) |
+ 				 BIT(BH_NILFS_Redirected));
+ 
+-			set_mask_bits(&bh->b_state, clear_bits, set_bits);
+ 			if (bh == segbuf->sb_super_root) {
++				set_buffer_uptodate(bh);
++				clear_buffer_dirty(bh);
+ 				if (bh->b_page != bd_page) {
+ 					end_page_writeback(bd_page);
+ 					bd_page = bh->b_page;
+@@ -1903,6 +1904,7 @@ static void nilfs_segctor_complete_write
+ 				update_sr = true;
+ 				break;
+ 			}
++			set_mask_bits(&bh->b_state, clear_bits, set_bits);
+ 			if (bh->b_page != fs_page) {
+ 				nilfs_end_page_io(fs_page, 0);
+ 				fs_page = bh->b_page;
diff --git a/queue-5.4/nilfs2-replace-warn_ons-for-invalid-dat-metadata-block-requests.patch b/queue-5.4/nilfs2-replace-warn_ons-for-invalid-dat-metadata-block-requests.patch
new file mode 100644
index 00000000000..90404dc64c9
--- /dev/null
+++ b/queue-5.4/nilfs2-replace-warn_ons-for-invalid-dat-metadata-block-requests.patch
@@ -0,0 +1,82 @@
+From 5124a0a549857c4b87173280e192eea24dea72ad Mon Sep 17 00:00:00 2001
+From: Ryusuke Konishi
+Date: Fri, 27 Jan 2023 01:41:14 +0900
+Subject: nilfs2: replace WARN_ONs for invalid DAT metadata block requests
+
+From: Ryusuke Konishi
+
+commit 5124a0a549857c4b87173280e192eea24dea72ad upstream.
+
+If DAT metadata file block access fails due to corruption of the DAT file
+or abnormal virtual block numbers held by b-trees or inodes, a kernel
+warning is generated.
+
+This replaces the WARN_ONs by error output, so that a kernel, booted with
+panic_on_warn, does not panic. This patch also replaces the detected
+return code -ENOENT with another internal code -EINVAL to notify the bmap
+layer of metadata corruption. When the bmap layer sees -EINVAL, it
+handles the abnormal situation with nilfs_bmap_convert_error() and finally
+returns code -EIO as it should.
+
+Link: https://lkml.kernel.org/r/0000000000005cc3d205ea23ddcf@google.com
+Link: https://lkml.kernel.org/r/20230126164114.6911-1-konishi.ryusuke@gmail.com
+Signed-off-by: Ryusuke Konishi
+Reported-by:
+Tested-by: Ryusuke Konishi
+Signed-off-by: Andrew Morton
+Signed-off-by: Greg Kroah-Hartman
+---
+ fs/nilfs2/dat.c | 27 +++++++++++++++++----------
+ 1 file changed, 17 insertions(+), 10 deletions(-)
+
+--- a/fs/nilfs2/dat.c
++++ b/fs/nilfs2/dat.c
+@@ -40,8 +40,21 @@ static inline struct nilfs_dat_info *NIL
+ static int nilfs_dat_prepare_entry(struct inode *dat,
+ 				   struct nilfs_palloc_req *req, int create)
+ {
+-	return nilfs_palloc_get_entry_block(dat, req->pr_entry_nr,
+-					    create, &req->pr_entry_bh);
++	int ret;
++
++	ret = nilfs_palloc_get_entry_block(dat, req->pr_entry_nr,
++					   create, &req->pr_entry_bh);
++	if (unlikely(ret == -ENOENT)) {
++		nilfs_err(dat->i_sb,
++			  "DAT doesn't have a block to manage vblocknr = %llu",
++			  (unsigned long long)req->pr_entry_nr);
++		/*
++		 * Return internal code -EINVAL to notify bmap layer of
++		 * metadata corruption.
++		 */
++		ret = -EINVAL;
++	}
++	return ret;
+ }
+ 
+ static void nilfs_dat_commit_entry(struct inode *dat,
+@@ -123,11 +136,7 @@ static void nilfs_dat_commit_free(struct
+ 
+ int nilfs_dat_prepare_start(struct inode *dat, struct nilfs_palloc_req *req)
+ {
+-	int ret;
+-
+-	ret = nilfs_dat_prepare_entry(dat, req, 0);
+-	WARN_ON(ret == -ENOENT);
+-	return ret;
++	return nilfs_dat_prepare_entry(dat, req, 0);
+ }
+ 
+ void nilfs_dat_commit_start(struct inode *dat, struct nilfs_palloc_req *req,
+@@ -154,10 +163,8 @@ int nilfs_dat_prepare_end(struct inode *
+ 	int ret;
+ 
+ 	ret = nilfs_dat_prepare_entry(dat, req, 0);
+-	if (ret < 0) {
+-		WARN_ON(ret == -ENOENT);
++	if (ret < 0)
+ 		return ret;
+-	}
+ 
+ 	kaddr = kmap_atomic(req->pr_entry_bh->b_page);
+ 	entry = nilfs_palloc_block_get_entry(dat, req->pr_entry_nr,
diff --git a/queue-5.4/sched-membarrier-reduce-the-ability-to-hammer-on-sys_membarrier.patch b/queue-5.4/sched-membarrier-reduce-the-ability-to-hammer-on-sys_membarrier.patch
new file mode 100644
index 00000000000..d17634a5d92
--- /dev/null
+++ b/queue-5.4/sched-membarrier-reduce-the-ability-to-hammer-on-sys_membarrier.patch
@@ -0,0 +1,86 @@
+From 944d5fe50f3f03daacfea16300e656a1691c4a23 Mon Sep 17 00:00:00 2001
+From: Linus Torvalds
+Date: Sun, 4 Feb 2024 15:25:12 +0000
+Subject: sched/membarrier: reduce the ability to hammer on sys_membarrier
+
+From: Linus Torvalds
+
+commit 944d5fe50f3f03daacfea16300e656a1691c4a23 upstream.
+
+On some systems, sys_membarrier can be very expensive, causing overall
+slowdowns for everything. So put a lock on the path in order to
+serialize the accesses to prevent the ability for this to be called at
+too high of a frequency and saturate the machine.
+
+Reviewed-and-tested-by: Mathieu Desnoyers
+Acked-by: Borislav Petkov
+Fixes: 22e4ebb97582 ("membarrier: Provide expedited private command")
+Fixes: c5f58bd58f43 ("membarrier: Provide GLOBAL_EXPEDITED command")
+Signed-off-by: Linus Torvalds
+[ converted to explicit mutex_*() calls - cleanup.h is not in this stable
+ branch - gregkh ]
+Signed-off-by: Greg Kroah-Hartman
+---
+ kernel/sched/membarrier.c | 9 +++++++++
+ 1 file changed, 9 insertions(+)
+
+--- a/kernel/sched/membarrier.c
++++ b/kernel/sched/membarrier.c
+@@ -25,6 +25,8 @@
+ 	| MEMBARRIER_CMD_REGISTER_PRIVATE_EXPEDITED \
+ 	| MEMBARRIER_PRIVATE_EXPEDITED_SYNC_CORE_BITMASK)
+ 
++static DEFINE_MUTEX(membarrier_ipi_mutex);
++
+ static void ipi_mb(void *info)
+ {
+ 	smp_mb();	/* IPIs should be serializing but paranoid. */
+@@ -97,6 +99,7 @@ static int membarrier_global_expedited(v
+ 	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+ 		return -ENOMEM;
+ 
++	mutex_lock(&membarrier_ipi_mutex);
+ 	cpus_read_lock();
+ 	rcu_read_lock();
+ 	for_each_online_cpu(cpu) {
+@@ -143,6 +146,8 @@ static int membarrier_global_expedited(v
+ 	 * rq->curr modification in scheduler.
+ 	 */
+ 	smp_mb();	/* exit from system call is not a mb */
++	mutex_unlock(&membarrier_ipi_mutex);
++
+ 	return 0;
+ }
+ 
+@@ -178,6 +183,7 @@ static int membarrier_private_expedited(
+ 	if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL))
+ 		return -ENOMEM;
+ 
++	mutex_lock(&membarrier_ipi_mutex);
+ 	cpus_read_lock();
+ 	rcu_read_lock();
+ 	for_each_online_cpu(cpu) {
+@@ -212,6 +218,7 @@ static int membarrier_private_expedited(
+ 	 * rq->curr modification in scheduler.
+ 	 */
+ 	smp_mb();	/* exit from system call is not a mb */
++	mutex_unlock(&membarrier_ipi_mutex);
+ 
+ 	return 0;
+ }
+@@ -253,6 +260,7 @@ static int sync_runqueues_membarrier_sta
+ 	 * between threads which are users of @mm has its membarrier state
+ 	 * updated.
+ 	 */
++	mutex_lock(&membarrier_ipi_mutex);
+ 	cpus_read_lock();
+ 	rcu_read_lock();
+ 	for_each_online_cpu(cpu) {
+@@ -269,6 +277,7 @@ static int sync_runqueues_membarrier_sta
+ 
+ 	free_cpumask_var(tmpmask);
+ 	cpus_read_unlock();
++	mutex_unlock(&membarrier_ipi_mutex);
+ 
+ 	return 0;
+ }
diff --git a/queue-5.4/series b/queue-5.4/series
index 03f6e25ebac..884ffc2f8c0 100644
--- a/queue-5.4/series
+++ b/queue-5.4/series
@@ -253,3 +253,7 @@ kvm-arm64-vgic-its-avoid-potential-uaf-in-lpi-transl.patch
 netfilter-ipset-fix-performance-regression-in-swap-operation.patch
 netfilter-ipset-missing-gc-cancellations-fixed.patch
 net-prevent-mss-overflow-in-skb_segment.patch
+sched-membarrier-reduce-the-ability-to-hammer-on-sys_membarrier.patch
+mm-memcontrol-decouple-reference-counting-from-page-accounting.patch
+nilfs2-fix-potential-bug-in-end_buffer_async_write.patch
+nilfs2-replace-warn_ons-for-invalid-dat-metadata-block-requests.patch