From 94ac92fb7c1d554405250226651a7aa1c361f55a Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sun, 9 May 2021 16:03:30 +0200 Subject: [PATCH] 5.12-stable patches added patches: dm-raid-fix-inconclusive-reshape-layout-on-fast-raid4-5-6-table-reload-sequences.patch exfat-fix-erroneous-discard-when-clear-cluster-bit.patch fuse-fix-write-deadlock.patch md-raid1-properly-indicate-failure-when-ending-a-failed-write-request.patch mm-page_alloc-ignore-init_on_free-1-for-debug_pagealloc-1.patch rcu-nocb-fix-missed-nocb_timer-requeue.patch sfc-adjust-efx-xdp_tx_queue_count-with-the-real-number-of-initialized-queues.patch sfc-farch-fix-tx-queue-lookup-in-tx-event-handling.patch sfc-farch-fix-tx-queue-lookup-in-tx-flush-done-handling.patch --- ...ast-raid4-5-6-table-reload-sequences.patch | 134 +++++++++++++++ ...neous-discard-when-clear-cluster-bit.patch | 60 +++++++ queue-5.12/fuse-fix-write-deadlock.patch | 162 ++++++++++++++++++ ...e-when-ending-a-failed-write-request.patch | 35 ++++ ...init_on_free-1-for-debug_pagealloc-1.patch | 103 +++++++++++ ...u-nocb-fix-missed-nocb_timer-requeue.patch | 122 +++++++++++++ queue-5.12/series | 9 + ...he-real-number-of-initialized-queues.patch | 62 +++++++ ...tx-queue-lookup-in-tx-event-handling.patch | 43 +++++ ...eue-lookup-in-tx-flush-done-handling.patch | 46 +++++ 10 files changed, 776 insertions(+) create mode 100644 queue-5.12/dm-raid-fix-inconclusive-reshape-layout-on-fast-raid4-5-6-table-reload-sequences.patch create mode 100644 queue-5.12/exfat-fix-erroneous-discard-when-clear-cluster-bit.patch create mode 100644 queue-5.12/fuse-fix-write-deadlock.patch create mode 100644 queue-5.12/md-raid1-properly-indicate-failure-when-ending-a-failed-write-request.patch create mode 100644 queue-5.12/mm-page_alloc-ignore-init_on_free-1-for-debug_pagealloc-1.patch create mode 100644 queue-5.12/rcu-nocb-fix-missed-nocb_timer-requeue.patch create mode 100644 
queue-5.12/sfc-adjust-efx-xdp_tx_queue_count-with-the-real-number-of-initialized-queues.patch create mode 100644 queue-5.12/sfc-farch-fix-tx-queue-lookup-in-tx-event-handling.patch create mode 100644 queue-5.12/sfc-farch-fix-tx-queue-lookup-in-tx-flush-done-handling.patch diff --git a/queue-5.12/dm-raid-fix-inconclusive-reshape-layout-on-fast-raid4-5-6-table-reload-sequences.patch b/queue-5.12/dm-raid-fix-inconclusive-reshape-layout-on-fast-raid4-5-6-table-reload-sequences.patch new file mode 100644 index 00000000000..0c1c6fd8216 --- /dev/null +++ b/queue-5.12/dm-raid-fix-inconclusive-reshape-layout-on-fast-raid4-5-6-table-reload-sequences.patch @@ -0,0 +1,134 @@ +From f99a8e4373eeacb279bc9696937a55adbff7a28a Mon Sep 17 00:00:00 2001 +From: Heinz Mauelshagen +Date: Wed, 21 Apr 2021 23:32:36 +0200 +Subject: dm raid: fix inconclusive reshape layout on fast raid4/5/6 table reload sequences + +From: Heinz Mauelshagen + +commit f99a8e4373eeacb279bc9696937a55adbff7a28a upstream. + +If fast table reloads occur during an ongoing reshape of raid4/5/6 +devices the target may race reading a superblock vs the MD resync +thread; causing an inconclusive reshape state to be read in its +constructor. + +lvm2 test lvconvert-raid-reshape-stripes-load-reload.sh can cause +BUG_ON() to trigger in md_run(), e.g.: +"kernel BUG at drivers/md/raid5.c:7567!". + +Scenario triggering the bug: + +1. the MD sync thread calls end_reshape() from raid5_sync_request() + when done reshaping. However end_reshape() _only_ updates the + reshape position to MaxSector keeping the changed layout + configuration though (i.e. any delta disks, chunk sector or RAID + algorithm changes). That inconclusive configuration is stored in + the superblock. + +2. dm-raid constructs a mapping, loading named inconsistent superblock + as of step 1 before step 3 is able to finish resetting the reshape + state completely, and calls md_run() which leads to mentioned bug + in raid5.c. + +3. 
the MD RAID personality's finish_reshape() is called; which resets + the reshape information on chunk sectors, delta disks, etc. This + explains why the bug is rarely seen on multi-core machines, as MD's + finish_reshape() superblock update races with the dm-raid + constructor's superblock load in step 2. + +Fix identifies inconclusive superblock content in the dm-raid +constructor and resets it before calling md_run(), factoring out +identifying checks into rs_is_layout_change() to share in existing +rs_reshape_requested() and new rs_reset_inconclusive_reshape(). Also +enhance a comment and remove an empty line. + +Cc: stable@vger.kernel.org +Signed-off-by: Heinz Mauelshagen +Signed-off-by: Mike Snitzer +Signed-off-by: Greg Kroah-Hartman +--- + drivers/md/dm-raid.c | 34 ++++++++++++++++++++++++++++------ + 1 file changed, 28 insertions(+), 6 deletions(-) + +--- a/drivers/md/dm-raid.c ++++ b/drivers/md/dm-raid.c +@@ -1868,6 +1868,14 @@ static bool rs_takeover_requested(struct + return rs->md.new_level != rs->md.level; + } + ++/* True if layout is set to reshape. */ ++static bool rs_is_layout_change(struct raid_set *rs, bool use_mddev) ++{ ++ return (use_mddev ? 
rs->md.delta_disks : rs->delta_disks) || ++ rs->md.new_layout != rs->md.layout || ++ rs->md.new_chunk_sectors != rs->md.chunk_sectors; ++} ++ + /* True if @rs is requested to reshape by ctr */ + static bool rs_reshape_requested(struct raid_set *rs) + { +@@ -1880,9 +1888,7 @@ static bool rs_reshape_requested(struct + if (rs_is_raid0(rs)) + return false; + +- change = mddev->new_layout != mddev->layout || +- mddev->new_chunk_sectors != mddev->chunk_sectors || +- rs->delta_disks; ++ change = rs_is_layout_change(rs, false); + + /* Historical case to support raid1 reshape without delta disks */ + if (rs_is_raid1(rs)) { +@@ -2817,7 +2823,7 @@ static sector_t _get_reshape_sectors(str + } + + /* +- * ++ * Reshape: + * - change raid layout + * - change chunk size + * - add disks +@@ -2927,6 +2933,20 @@ static int rs_setup_reshape(struct raid_ + } + + /* ++ * If the md resync thread has updated superblock with max reshape position ++ * at the end of a reshape but not (yet) reset the layout configuration ++ * changes -> reset the latter. ++ */ ++static void rs_reset_inconclusive_reshape(struct raid_set *rs) ++{ ++ if (!rs_is_reshaping(rs) && rs_is_layout_change(rs, true)) { ++ rs_set_cur(rs); ++ rs->md.delta_disks = 0; ++ rs->md.reshape_backwards = 0; ++ } ++} ++ ++/* + * Enable/disable discard support on RAID set depending on + * RAID level and discard properties of underlying RAID members. + */ +@@ -3212,11 +3232,14 @@ size_check: + if (r) + goto bad; + ++ /* Catch any inconclusive reshape superblock content. */ ++ rs_reset_inconclusive_reshape(rs); ++ + /* Start raid set read-only and assumed clean to change in raid_resume() */ + rs->md.ro = 1; + rs->md.in_sync = 1; + +- /* Keep array frozen */ ++ /* Keep array frozen until resume. 
*/ + set_bit(MD_RECOVERY_FROZEN, &rs->md.recovery); + + /* Has to be held on running the array */ +@@ -3230,7 +3253,6 @@ size_check: + } + + r = md_start(&rs->md); +- + if (r) { + ti->error = "Failed to start raid array"; + mddev_unlock(&rs->md); diff --git a/queue-5.12/exfat-fix-erroneous-discard-when-clear-cluster-bit.patch b/queue-5.12/exfat-fix-erroneous-discard-when-clear-cluster-bit.patch new file mode 100644 index 00000000000..9da46006b2d --- /dev/null +++ b/queue-5.12/exfat-fix-erroneous-discard-when-clear-cluster-bit.patch @@ -0,0 +1,60 @@ +From 77edfc6e51055b61cae2f54c8e6c3bb7c762e4fe Mon Sep 17 00:00:00 2001 +From: Hyeongseok Kim +Date: Thu, 4 Mar 2021 09:15:34 +0900 +Subject: exfat: fix erroneous discard when clear cluster bit + +From: Hyeongseok Kim + +commit 77edfc6e51055b61cae2f54c8e6c3bb7c762e4fe upstream. + +If mounted with discard option, exFAT issues discard command when clear +cluster bit to remove file. But the input parameter of cluster-to-sector +calculation is abnormally added by reserved cluster size which is 2, +leading to discard unrelated sectors included in target+2 cluster. +With fixing this, remove the wrong comments in set/clear/find bitmap +functions. + +Fixes: 1e49a94cf707 ("exfat: add bitmap operations") +Cc: stable@vger.kernel.org # v5.7+ +Signed-off-by: Hyeongseok Kim +Acked-by: Sungjong Seo +Signed-off-by: Namjae Jeon +Signed-off-by: Greg Kroah-Hartman +--- + fs/exfat/balloc.c | 11 +---------- + 1 file changed, 1 insertion(+), 10 deletions(-) + +--- a/fs/exfat/balloc.c ++++ b/fs/exfat/balloc.c +@@ -141,10 +141,6 @@ void exfat_free_bitmap(struct exfat_sb_i + kfree(sbi->vol_amap); + } + +-/* +- * If the value of "clu" is 0, it means cluster 2 which is the first cluster of +- * the cluster heap. 
+- */ + int exfat_set_bitmap(struct inode *inode, unsigned int clu) + { + int i, b; +@@ -162,10 +158,6 @@ int exfat_set_bitmap(struct inode *inode + return 0; + } + +-/* +- * If the value of "clu" is 0, it means cluster 2 which is the first cluster of +- * the cluster heap. +- */ + void exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync) + { + int i, b; +@@ -186,8 +178,7 @@ void exfat_clear_bitmap(struct inode *in + int ret_discard; + + ret_discard = sb_issue_discard(sb, +- exfat_cluster_to_sector(sbi, clu + +- EXFAT_RESERVED_CLUSTERS), ++ exfat_cluster_to_sector(sbi, clu), + (1 << sbi->sect_per_clus_bits), GFP_NOFS, 0); + + if (ret_discard == -EOPNOTSUPP) { diff --git a/queue-5.12/fuse-fix-write-deadlock.patch b/queue-5.12/fuse-fix-write-deadlock.patch new file mode 100644 index 00000000000..da79b315964 --- /dev/null +++ b/queue-5.12/fuse-fix-write-deadlock.patch @@ -0,0 +1,162 @@ +From 4f06dd92b5d0a6f8eec6a34b8d6ef3e1f4ac1e10 Mon Sep 17 00:00:00 2001 +From: Vivek Goyal +Date: Wed, 21 Oct 2020 16:12:49 -0400 +Subject: fuse: fix write deadlock + +From: Vivek Goyal + +commit 4f06dd92b5d0a6f8eec6a34b8d6ef3e1f4ac1e10 upstream. + +There are two modes for write(2) and friends in fuse: + +a) write through (update page cache, send sync WRITE request to userspace) + +b) buffered write (update page cache, async writeout later) + +The write through method kept all the page cache pages locked that were +used for the request. Keeping more than one page locked is deadlock prone +and Qian Cai demonstrated this with trinity fuzzing. + +The reason for keeping the pages locked is that concurrent mapped reads +shouldn't try to pull possibly stale data into the page cache. + +For full page writes, the easy way to fix this is to make the cached page +be the authoritative source by marking the page PG_uptodate immediately. +After this the page can be safely unlocked, since mapped/cached reads will +take the written data from the cache. 
+ +Concurrent mapped writes will now cause data in the original WRITE request +to be updated; this however doesn't cause any data inconsistency and this +scenario should be exceedingly rare anyway. + +If the WRITE request returns with an error in the above case, currently the +page is not marked uptodate; this means that a concurrent read will always +read consistent data. After this patch the page is uptodate between +writing to the cache and receiving the error: there's a window where a cached +read will read the wrong data. While theoretically this could be a +regression, it is unlikely to be one in practice, since this is normal for +buffered writes. + +In case of a partial page write to an already uptodate page the locking is +also unnecessary, with the above caveats. + +Partial write of a not uptodate page still needs to be handled. One way +would be to read the complete page before doing the write. This is not +possible, since it might break filesystems that don't expect any READ +requests when the file was opened O_WRONLY. + +The other solution is to serialize the synchronous write with reads from +the partial pages. The easiest way to do this is to keep the partial pages +locked. The problem is that a write() may involve two such pages (one head +and one tail). This patch fixes it by only locking the partial tail page. +If there's a partial head page as well, then split that off as a separate +WRITE request. 
+ +Reported-by: Qian Cai +Link: https://lore.kernel.org/linux-fsdevel/4794a3fa3742a5e84fb0f934944204b55730829b.camel@lca.pw/ +Fixes: ea9b9907b82a ("fuse: implement perform_write") +Cc: # v2.6.26 +Signed-off-by: Vivek Goyal +Signed-off-by: Miklos Szeredi +Signed-off-by: Greg Kroah-Hartman +--- + fs/fuse/file.c | 41 +++++++++++++++++++++++++++++------------ + fs/fuse/fuse_i.h | 1 + + 2 files changed, 30 insertions(+), 12 deletions(-) + +--- a/fs/fuse/file.c ++++ b/fs/fuse/file.c +@@ -1099,6 +1099,7 @@ static ssize_t fuse_send_write_pages(str + struct fuse_file *ff = file->private_data; + struct fuse_mount *fm = ff->fm; + unsigned int offset, i; ++ bool short_write; + int err; + + for (i = 0; i < ap->num_pages; i++) +@@ -1113,32 +1114,38 @@ static ssize_t fuse_send_write_pages(str + if (!err && ia->write.out.size > count) + err = -EIO; + ++ short_write = ia->write.out.size < count; + offset = ap->descs[0].offset; + count = ia->write.out.size; + for (i = 0; i < ap->num_pages; i++) { + struct page *page = ap->pages[i]; + +- if (!err && !offset && count >= PAGE_SIZE) +- SetPageUptodate(page); +- +- if (count > PAGE_SIZE - offset) +- count -= PAGE_SIZE - offset; +- else +- count = 0; +- offset = 0; +- +- unlock_page(page); ++ if (err) { ++ ClearPageUptodate(page); ++ } else { ++ if (count >= PAGE_SIZE - offset) ++ count -= PAGE_SIZE - offset; ++ else { ++ if (short_write) ++ ClearPageUptodate(page); ++ count = 0; ++ } ++ offset = 0; ++ } ++ if (ia->write.page_locked && (i == ap->num_pages - 1)) ++ unlock_page(page); + put_page(page); + } + + return err; + } + +-static ssize_t fuse_fill_write_pages(struct fuse_args_pages *ap, ++static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, + struct address_space *mapping, + struct iov_iter *ii, loff_t pos, + unsigned int max_pages) + { ++ struct fuse_args_pages *ap = &ia->ap; + struct fuse_conn *fc = get_fuse_conn(mapping->host); + unsigned offset = pos & (PAGE_SIZE - 1); + size_t count = 0; +@@ -1191,6 +1198,16 @@ static 
ssize_t fuse_fill_write_pages(str + if (offset == PAGE_SIZE) + offset = 0; + ++ /* If we copied full page, mark it uptodate */ ++ if (tmp == PAGE_SIZE) ++ SetPageUptodate(page); ++ ++ if (PageUptodate(page)) { ++ unlock_page(page); ++ } else { ++ ia->write.page_locked = true; ++ break; ++ } + if (!fc->big_writes) + break; + } while (iov_iter_count(ii) && count < fc->max_write && +@@ -1234,7 +1251,7 @@ static ssize_t fuse_perform_write(struct + break; + } + +- count = fuse_fill_write_pages(ap, mapping, ii, pos, nr_pages); ++ count = fuse_fill_write_pages(&ia, mapping, ii, pos, nr_pages); + if (count <= 0) { + err = count; + } else { +--- a/fs/fuse/fuse_i.h ++++ b/fs/fuse/fuse_i.h +@@ -912,6 +912,7 @@ struct fuse_io_args { + struct { + struct fuse_write_in in; + struct fuse_write_out out; ++ bool page_locked; + } write; + }; + struct fuse_args_pages ap; diff --git a/queue-5.12/md-raid1-properly-indicate-failure-when-ending-a-failed-write-request.patch b/queue-5.12/md-raid1-properly-indicate-failure-when-ending-a-failed-write-request.patch new file mode 100644 index 00000000000..acfa801c7e9 --- /dev/null +++ b/queue-5.12/md-raid1-properly-indicate-failure-when-ending-a-failed-write-request.patch @@ -0,0 +1,35 @@ +From 2417b9869b81882ab90fd5ed1081a1cb2d4db1dd Mon Sep 17 00:00:00 2001 +From: Paul Clements +Date: Thu, 15 Apr 2021 17:17:57 -0400 +Subject: md/raid1: properly indicate failure when ending a failed write request + +From: Paul Clements + +commit 2417b9869b81882ab90fd5ed1081a1cb2d4db1dd upstream. + +This patch addresses a data corruption bug in raid1 arrays using bitmaps. +Without this fix, the bitmap bits for the failed I/O end up being cleared. + +Since we are in the failure leg of raid1_end_write_request, the request +either needs to be retried (R1BIO_WriteError) or failed (R1BIO_Degraded). 
+ +Fixes: eeba6809d8d5 ("md/raid1: end bio when the device faulty") +Cc: stable@vger.kernel.org # v5.2+ +Signed-off-by: Paul Clements +Signed-off-by: Song Liu +Signed-off-by: Greg Kroah-Hartman +--- + drivers/md/raid1.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/drivers/md/raid1.c ++++ b/drivers/md/raid1.c +@@ -478,6 +478,8 @@ static void raid1_end_write_request(stru + if (!test_bit(Faulty, &rdev->flags)) + set_bit(R1BIO_WriteError, &r1_bio->state); + else { ++ /* Fail the request */ ++ set_bit(R1BIO_Degraded, &r1_bio->state); + /* Finished with this branch */ + r1_bio->bios[mirror] = NULL; + to_put = bio; diff --git a/queue-5.12/mm-page_alloc-ignore-init_on_free-1-for-debug_pagealloc-1.patch b/queue-5.12/mm-page_alloc-ignore-init_on_free-1-for-debug_pagealloc-1.patch new file mode 100644 index 00000000000..de75211d926 --- /dev/null +++ b/queue-5.12/mm-page_alloc-ignore-init_on_free-1-for-debug_pagealloc-1.patch @@ -0,0 +1,103 @@ +From 9df65f522536719682bccd24245ff94db956256c Mon Sep 17 00:00:00 2001 +From: Sergei Trofimovich +Date: Thu, 29 Apr 2021 23:02:11 -0700 +Subject: mm: page_alloc: ignore init_on_free=1 for debug_pagealloc=1 + +From: Sergei Trofimovich + +commit 9df65f522536719682bccd24245ff94db956256c upstream. + +On !ARCH_SUPPORTS_DEBUG_PAGEALLOC (like ia64) debug_pagealloc=1 implies +page_poison=on: + + if (page_poisoning_enabled() || + (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && + debug_pagealloc_enabled())) + static_branch_enable(&_page_poisoning_enabled); + +page_poison=on needs to override init_on_free=1. + +Before the change it did not work as expected for the following case: +- have PAGE_POISONING=y +- have page_poison unset +- have !ARCH_SUPPORTS_DEBUG_PAGEALLOC arch (like ia64) +- have init_on_free=1 +- have debug_pagealloc=1 + +That way we get both keys enabled: +- static_branch_enable(&init_on_free); +- static_branch_enable(&_page_poisoning_enabled); + +which leads to poisoned pages returned for __GFP_ZERO pages. 
+ +After the change we execute only: +- static_branch_enable(&_page_poisoning_enabled); + and ignore init_on_free=1. + +Link: https://lkml.kernel.org/r/20210329222555.3077928-1-slyfox@gentoo.org +Link: https://lkml.org/lkml/2021/3/26/443 +Fixes: 8db26a3d4735 ("mm, page_poison: use static key more efficiently") +Signed-off-by: Sergei Trofimovich +Acked-by: Vlastimil Babka +Reviewed-by: David Hildenbrand +Cc: Andrey Konovalov +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + mm/page_alloc.c | 30 +++++++++++++++++------------- + 1 file changed, 17 insertions(+), 13 deletions(-) + +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -764,32 +764,36 @@ static inline void clear_page_guard(stru + */ + void init_mem_debugging_and_hardening(void) + { ++ bool page_poisoning_requested = false; ++ ++#ifdef CONFIG_PAGE_POISONING ++ /* ++ * Page poisoning is debug page alloc for some arches. If ++ * either of those options are enabled, enable poisoning. ++ */ ++ if (page_poisoning_enabled() || ++ (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && ++ debug_pagealloc_enabled())) { ++ static_branch_enable(&_page_poisoning_enabled); ++ page_poisoning_requested = true; ++ } ++#endif ++ + if (_init_on_alloc_enabled_early) { +- if (page_poisoning_enabled()) ++ if (page_poisoning_requested) + pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " + "will take precedence over init_on_alloc\n"); + else + static_branch_enable(&init_on_alloc); + } + if (_init_on_free_enabled_early) { +- if (page_poisoning_enabled()) ++ if (page_poisoning_requested) + pr_info("mem auto-init: CONFIG_PAGE_POISONING is on, " + "will take precedence over init_on_free\n"); + else + static_branch_enable(&init_on_free); + } + +-#ifdef CONFIG_PAGE_POISONING +- /* +- * Page poisoning is debug page alloc for some arches. If +- * either of those options are enabled, enable poisoning. 
+- */ +- if (page_poisoning_enabled() || +- (!IS_ENABLED(CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC) && +- debug_pagealloc_enabled())) +- static_branch_enable(&_page_poisoning_enabled); +-#endif +- + #ifdef CONFIG_DEBUG_PAGEALLOC + if (!debug_pagealloc_enabled()) + return; diff --git a/queue-5.12/rcu-nocb-fix-missed-nocb_timer-requeue.patch b/queue-5.12/rcu-nocb-fix-missed-nocb_timer-requeue.patch new file mode 100644 index 00000000000..040d6fac1e3 --- /dev/null +++ b/queue-5.12/rcu-nocb-fix-missed-nocb_timer-requeue.patch @@ -0,0 +1,122 @@ +From b2fcf2102049f6e56981e0ab3d9b633b8e2741da Mon Sep 17 00:00:00 2001 +From: Frederic Weisbecker +Date: Tue, 23 Feb 2021 01:09:59 +0100 +Subject: rcu/nocb: Fix missed nocb_timer requeue + +From: Frederic Weisbecker + +commit b2fcf2102049f6e56981e0ab3d9b633b8e2741da upstream. + +This sequence of events can lead to a failure to requeue a CPU's +->nocb_timer: + +1. There are no callbacks queued for any CPU covered by CPU 0-2's + ->nocb_gp_kthread. Note that ->nocb_gp_kthread is associated + with CPU 0. + +2. CPU 1 enqueues its first callback with interrupts disabled, and + thus must defer awakening its ->nocb_gp_kthread. It therefore + queues its rcu_data structure's ->nocb_timer. At this point, + CPU 1's rdp->nocb_defer_wakeup is RCU_NOCB_WAKE. + +3. CPU 2, which shares the same ->nocb_gp_kthread, also enqueues a + callback, but with interrupts enabled, allowing it to directly + awaken the ->nocb_gp_kthread. + +4. The newly awakened ->nocb_gp_kthread associates both CPU 1's + and CPU 2's callbacks with a future grace period and arranges + for that grace period to be started. + +5. This ->nocb_gp_kthread goes to sleep waiting for the end of this + future grace period. + +6. This grace period elapses before the CPU 1's timer fires. + This is normally improbable given that the timer is set for only + one jiffy, but timers can be delayed. Besides, it is possible + that the kernel was built with CONFIG_RCU_STRICT_GRACE_PERIOD=y. + +7. 
The grace period ends, so rcu_gp_kthread awakens the + ->nocb_gp_kthread, which in turn awakens both CPU 1's and + CPU 2's ->nocb_cb_kthread. Then ->nocb_gp_kthread sleeps + waiting for more newly queued callbacks. + +8. CPU 1's ->nocb_cb_kthread invokes its callback, then sleeps + waiting for more invocable callbacks. + +9. Note that neither kthread updated any ->nocb_timer state, + so CPU 1's ->nocb_defer_wakeup is still set to RCU_NOCB_WAKE. + +10. CPU 1 enqueues its second callback, this time with interrupts + enabled so it can wake directly ->nocb_gp_kthread. + It does so by calling wake_nocb_gp() which also cancels the + pending timer that got queued in step 2. But that doesn't reset + CPU 1's ->nocb_defer_wakeup which is still set to RCU_NOCB_WAKE. + So CPU 1's ->nocb_defer_wakeup and its ->nocb_timer are now + desynchronized. + +11. ->nocb_gp_kthread associates the callback queued in 10 with a new + grace period, arranges for that grace period to start and sleeps + waiting for it to complete. + +12. The grace period ends, rcu_gp_kthread awakens ->nocb_gp_kthread, + which in turn wakes up CPU 1's ->nocb_cb_kthread which then + invokes the callback queued in 10. + +13. CPU 1 enqueues its third callback, this time with interrupts + disabled so it must queue a timer for a deferred wakeup. However + the value of its ->nocb_defer_wakeup is RCU_NOCB_WAKE which + incorrectly indicates that a timer is already queued. Instead, + CPU 1's ->nocb_timer was cancelled in 10. CPU 1 therefore fails + to queue the ->nocb_timer. + +14. CPU 1 has its pending callback and it may go unnoticed until + some other CPU ever wakes up ->nocb_gp_kthread or CPU 1 ever + calls an explicit deferred wakeup, for example, during idle entry. + +This commit fixes this bug by resetting rdp->nocb_defer_wakeup every time +we delete the ->nocb_timer. + +It is quite possible that there is a similar scenario involving +->nocb_bypass_timer and ->nocb_defer_wakeup. 
However, despite some +effort from several people, a failure scenario has not yet been located. +However, that by no means guarantees that no such scenario exists. +Finding a failure scenario is left as an exercise for the reader, and the +"Fixes:" tag below relates to ->nocb_bypass_timer instead of ->nocb_timer. + +Fixes: d1b222c6be1f (rcu/nocb: Add bypass callback queueing) +Cc: +Cc: Josh Triplett +Cc: Lai Jiangshan +Cc: Joel Fernandes +Cc: Boqun Feng +Reviewed-by: Neeraj Upadhyay +Signed-off-by: Frederic Weisbecker +Signed-off-by: Paul E. McKenney +Signed-off-by: Greg Kroah-Hartman +--- + kernel/rcu/tree_plugin.h | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +--- a/kernel/rcu/tree_plugin.h ++++ b/kernel/rcu/tree_plugin.h +@@ -1646,7 +1646,11 @@ static bool wake_nocb_gp(struct rcu_data + rcu_nocb_unlock_irqrestore(rdp, flags); + return false; + } +- del_timer(&rdp->nocb_timer); ++ ++ if (READ_ONCE(rdp->nocb_defer_wakeup) > RCU_NOCB_WAKE_NOT) { ++ WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); ++ del_timer(&rdp->nocb_timer); ++ } + rcu_nocb_unlock_irqrestore(rdp, flags); + raw_spin_lock_irqsave(&rdp_gp->nocb_gp_lock, flags); + if (force || READ_ONCE(rdp_gp->nocb_gp_sleep)) { +@@ -2265,7 +2269,6 @@ static bool do_nocb_deferred_wakeup_comm + return false; + } + ndw = READ_ONCE(rdp->nocb_defer_wakeup); +- WRITE_ONCE(rdp->nocb_defer_wakeup, RCU_NOCB_WAKE_NOT); + ret = wake_nocb_gp(rdp, ndw == RCU_NOCB_WAKE_FORCE, flags); + trace_rcu_nocb_wake(rcu_state.name, rdp->cpu, TPS("DeferredWake")); + diff --git a/queue-5.12/series b/queue-5.12/series index c71615c380f..e8e05c42e76 100644 --- a/queue-5.12/series +++ b/queue-5.12/series @@ -317,3 +317,12 @@ tpm-efi-use-local-variable-for-calculating-final-log-size.patch tpm-vtpm_proxy-avoid-reading-host-log-when-using-a-virtual-device.patch crypto-arm-curve25519-move-.fpu-after-.arch.patch crypto-rng-fix-crypto_rng_reset-refcounting-when-crypto_stats.patch 
+md-raid1-properly-indicate-failure-when-ending-a-failed-write-request.patch +dm-raid-fix-inconclusive-reshape-layout-on-fast-raid4-5-6-table-reload-sequences.patch +fuse-fix-write-deadlock.patch +mm-page_alloc-ignore-init_on_free-1-for-debug_pagealloc-1.patch +exfat-fix-erroneous-discard-when-clear-cluster-bit.patch +sfc-farch-fix-tx-queue-lookup-in-tx-flush-done-handling.patch +sfc-farch-fix-tx-queue-lookup-in-tx-event-handling.patch +sfc-adjust-efx-xdp_tx_queue_count-with-the-real-number-of-initialized-queues.patch +rcu-nocb-fix-missed-nocb_timer-requeue.patch diff --git a/queue-5.12/sfc-adjust-efx-xdp_tx_queue_count-with-the-real-number-of-initialized-queues.patch b/queue-5.12/sfc-adjust-efx-xdp_tx_queue_count-with-the-real-number-of-initialized-queues.patch new file mode 100644 index 00000000000..3ec98834563 --- /dev/null +++ b/queue-5.12/sfc-adjust-efx-xdp_tx_queue_count-with-the-real-number-of-initialized-queues.patch @@ -0,0 +1,62 @@ +From 99ba0ea616aabdc8e26259fd722503e012199a76 Mon Sep 17 00:00:00 2001 +From: Ignat Korchagin +Date: Tue, 27 Apr 2021 22:09:38 +0100 +Subject: sfc: adjust efx->xdp_tx_queue_count with the real number of initialized queues + +From: Ignat Korchagin + +commit 99ba0ea616aabdc8e26259fd722503e012199a76 upstream. + +efx->xdp_tx_queue_count is initially initialized to num_possible_cpus() and is +later used to allocate and traverse efx->xdp_tx_queues lookup array. However, +we may end up not initializing all the array slots with real queues during +probing. 
This results, for example, in a NULL pointer dereference, when running +"# ethtool -S ", similar to below + +[2570283.664955][T4126959] BUG: kernel NULL pointer dereference, address: 00000000000000f8 +[2570283.681283][T4126959] #PF: supervisor read access in kernel mode +[2570283.695678][T4126959] #PF: error_code(0x0000) - not-present page +[2570283.710013][T4126959] PGD 0 P4D 0 +[2570283.721649][T4126959] Oops: 0000 [#1] SMP PTI +[2570283.734108][T4126959] CPU: 23 PID: 4126959 Comm: ethtool Tainted: G O 5.10.20-cloudflare-2021.3.1 #1 +[2570283.752641][T4126959] Hardware name: +[2570283.781408][T4126959] RIP: 0010:efx_ethtool_get_stats+0x2ca/0x330 [sfc] +[2570283.796073][T4126959] Code: 00 85 c0 74 39 48 8b 95 a8 0f 00 00 48 85 d2 74 2d 31 c0 eb 07 48 8b 95 a8 0f 00 00 48 63 c8 49 83 c4 08 83 c0 01 48 8b 14 ca <48> 8b 92 f8 00 00 00 49 89 54 24 f8 39 85 a0 0f 00 00 77 d7 48 8b +[2570283.831259][T4126959] RSP: 0018:ffffb79a77657ce8 EFLAGS: 00010202 +[2570283.845121][T4126959] RAX: 0000000000000019 RBX: ffffb799cd0c9280 RCX: 0000000000000018 +[2570283.860872][T4126959] RDX: 0000000000000000 RSI: ffff96dd970ce000 RDI: 0000000000000005 +[2570283.876525][T4126959] RBP: ffff96dd86f0a000 R08: ffff96dd970ce480 R09: 000000000000005f +[2570283.892014][T4126959] R10: ffffb799cd0c9fff R11: ffffb799cd0c9000 R12: ffffb799cd0c94f8 +[2570283.907406][T4126959] R13: ffffffffc11b1090 R14: ffff96dd970ce000 R15: ffffffffc11cd66c +[2570283.922705][T4126959] FS: 00007fa7723f8740(0000) GS:ffff96f51fac0000(0000) knlGS:0000000000000000 +[2570283.938848][T4126959] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +[2570283.952524][T4126959] CR2: 00000000000000f8 CR3: 0000001a73e6e006 CR4: 00000000007706e0 +[2570283.967529][T4126959] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +[2570283.982400][T4126959] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 +[2570283.997308][T4126959] PKRU: 55555554 +[2570284.007649][T4126959] Call Trace: 
+[2570284.017598][T4126959] dev_ethtool+0x1832/0x2830 + +Fix this by adjusting efx->xdp_tx_queue_count after probing to reflect the true +value of initialized slots in efx->xdp_tx_queues. + +Signed-off-by: Ignat Korchagin +Fixes: e26ca4b53582 ("sfc: reduce the number of requested xdp ev queues") +Cc: # 5.12.x +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/sfc/efx_channels.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/drivers/net/ethernet/sfc/efx_channels.c ++++ b/drivers/net/ethernet/sfc/efx_channels.c +@@ -914,6 +914,8 @@ int efx_set_channels(struct efx_nic *efx + } + } + } ++ if (xdp_queue_number) ++ efx->xdp_tx_queue_count = xdp_queue_number; + + rc = netif_set_real_num_tx_queues(efx->net_dev, efx->n_tx_channels); + if (rc) diff --git a/queue-5.12/sfc-farch-fix-tx-queue-lookup-in-tx-event-handling.patch b/queue-5.12/sfc-farch-fix-tx-queue-lookup-in-tx-event-handling.patch new file mode 100644 index 00000000000..abb15fd82b7 --- /dev/null +++ b/queue-5.12/sfc-farch-fix-tx-queue-lookup-in-tx-event-handling.patch @@ -0,0 +1,43 @@ +From 83b09a1807415608b387c7bc748d329fefc5617e Mon Sep 17 00:00:00 2001 +From: Edward Cree +Date: Tue, 20 Apr 2021 13:28:28 +0100 +Subject: sfc: farch: fix TX queue lookup in TX event handling + +From: Edward Cree + +commit 83b09a1807415608b387c7bc748d329fefc5617e upstream. + +We're starting from a TXQ label, not a TXQ type, so + efx_channel_get_tx_queue() is inappropriate (and could return NULL, + leading to panics). + +Fixes: 12804793b17c ("sfc: decouple TXQ type from label") +Cc: stable@vger.kernel.org +Signed-off-by: Edward Cree +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/sfc/farch.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +--- a/drivers/net/ethernet/sfc/farch.c ++++ b/drivers/net/ethernet/sfc/farch.c +@@ -835,14 +835,14 @@ efx_farch_handle_tx_event(struct efx_cha + /* Transmit completion */ + tx_ev_desc_ptr = EFX_QWORD_FIELD(*event, FSF_AZ_TX_EV_DESC_PTR); + tx_ev_q_label = EFX_QWORD_FIELD(*event, FSF_AZ_TX_EV_Q_LABEL); +- tx_queue = efx_channel_get_tx_queue( +- channel, tx_ev_q_label % EFX_MAX_TXQ_PER_CHANNEL); ++ tx_queue = channel->tx_queue + ++ (tx_ev_q_label % EFX_MAX_TXQ_PER_CHANNEL); + efx_xmit_done(tx_queue, tx_ev_desc_ptr); + } else if (EFX_QWORD_FIELD(*event, FSF_AZ_TX_EV_WQ_FF_FULL)) { + /* Rewrite the FIFO write pointer */ + tx_ev_q_label = EFX_QWORD_FIELD(*event, FSF_AZ_TX_EV_Q_LABEL); +- tx_queue = efx_channel_get_tx_queue( +- channel, tx_ev_q_label % EFX_MAX_TXQ_PER_CHANNEL); ++ tx_queue = channel->tx_queue + ++ (tx_ev_q_label % EFX_MAX_TXQ_PER_CHANNEL); + + netif_tx_lock(efx->net_dev); + efx_farch_notify_tx_desc(tx_queue); diff --git a/queue-5.12/sfc-farch-fix-tx-queue-lookup-in-tx-flush-done-handling.patch b/queue-5.12/sfc-farch-fix-tx-queue-lookup-in-tx-flush-done-handling.patch new file mode 100644 index 00000000000..3102922a54c --- /dev/null +++ b/queue-5.12/sfc-farch-fix-tx-queue-lookup-in-tx-flush-done-handling.patch @@ -0,0 +1,46 @@ +From 5b1faa92289b53cad654123ed2bc8e10f6ddd4ac Mon Sep 17 00:00:00 2001 +From: Edward Cree +Date: Tue, 20 Apr 2021 13:27:22 +0100 +Subject: sfc: farch: fix TX queue lookup in TX flush done handling + +From: Edward Cree + +commit 5b1faa92289b53cad654123ed2bc8e10f6ddd4ac upstream. + +We're starting from a TXQ instance number ('qid'), not a TXQ type, so + efx_get_tx_queue() is inappropriate (and could return NULL, leading + to panics). 
+ +Fixes: 12804793b17c ("sfc: decouple TXQ type from label") +Reported-by: Trevor Hemsley +Cc: stable@vger.kernel.org +Signed-off-by: Edward Cree +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/sfc/farch.c | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +--- a/drivers/net/ethernet/sfc/farch.c ++++ b/drivers/net/ethernet/sfc/farch.c +@@ -1081,16 +1081,16 @@ static void + efx_farch_handle_tx_flush_done(struct efx_nic *efx, efx_qword_t *event) + { + struct efx_tx_queue *tx_queue; ++ struct efx_channel *channel; + int qid; + + qid = EFX_QWORD_FIELD(*event, FSF_AZ_DRIVER_EV_SUBDATA); + if (qid < EFX_MAX_TXQ_PER_CHANNEL * (efx->n_tx_channels + efx->n_extra_tx_channels)) { +- tx_queue = efx_get_tx_queue(efx, qid / EFX_MAX_TXQ_PER_CHANNEL, +- qid % EFX_MAX_TXQ_PER_CHANNEL); +- if (atomic_cmpxchg(&tx_queue->flush_outstanding, 1, 0)) { ++ channel = efx_get_tx_channel(efx, qid / EFX_MAX_TXQ_PER_CHANNEL); ++ tx_queue = channel->tx_queue + (qid % EFX_MAX_TXQ_PER_CHANNEL); ++ if (atomic_cmpxchg(&tx_queue->flush_outstanding, 1, 0)) + efx_farch_magic_event(tx_queue->channel, + EFX_CHANNEL_MAGIC_TX_DRAIN(tx_queue)); +- } + } + } + -- 2.47.3