From 2b8c242ac869eae3d96b712fdb9940e9cd1e0d69 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Tue, 1 Aug 2023 07:57:53 +0200 Subject: [PATCH] 6.4-stable patches added patches: io_uring-gate-iowait-schedule-on-having-pending-requests.patch iommufd-set-end-correctly-when-doing-batch-carry.patch net-dsa-qca8k-enable-use_single_write-for-qca8xxx.patch net-dsa-qca8k-fix-broken-search_and_del.patch net-dsa-qca8k-fix-mdb-add-del-case-with-0-vid.patch net-dsa-qca8k-fix-search_and_insert-wrong-handling-of-new-rule.patch net-ipa-only-reset-hashed-tables-when-supported.patch proc-vmcore-fix-signedness-bug-in-read_from_oldmem.patch virtio-net-fix-race-between-set-queues-and-probe.patch xen-speed-up-grant-table-reclaim.patch --- ...-schedule-on-having-pending-requests.patch | 82 ++++++++++ ...end-correctly-when-doing-batch-carry.patch | 92 +++++++++++ ...-enable-use_single_write-for-qca8xxx.patch | 88 +++++++++++ ...-dsa-qca8k-fix-broken-search_and_del.patch | 44 ++++++ ...ca8k-fix-mdb-add-del-case-with-0-vid.patch | 47 ++++++ ...nd_insert-wrong-handling-of-new-rule.patch | 73 +++++++++ ...y-reset-hashed-tables-when-supported.patch | 101 +++++++++++++ ...x-signedness-bug-in-read_from_oldmem.patch | 45 ++++++ queue-6.4/series | 10 ++ ...ix-race-between-set-queues-and-probe.patch | 47 ++++++ .../xen-speed-up-grant-table-reclaim.patch | 143 ++++++++++++++++++ 11 files changed, 772 insertions(+) create mode 100644 queue-6.4/io_uring-gate-iowait-schedule-on-having-pending-requests.patch create mode 100644 queue-6.4/iommufd-set-end-correctly-when-doing-batch-carry.patch create mode 100644 queue-6.4/net-dsa-qca8k-enable-use_single_write-for-qca8xxx.patch create mode 100644 queue-6.4/net-dsa-qca8k-fix-broken-search_and_del.patch create mode 100644 queue-6.4/net-dsa-qca8k-fix-mdb-add-del-case-with-0-vid.patch create mode 100644 queue-6.4/net-dsa-qca8k-fix-search_and_insert-wrong-handling-of-new-rule.patch create mode 100644 queue-6.4/net-ipa-only-reset-hashed-tables-when-supported.patch create mode 100644 queue-6.4/proc-vmcore-fix-signedness-bug-in-read_from_oldmem.patch create mode 100644 queue-6.4/virtio-net-fix-race-between-set-queues-and-probe.patch create mode 100644 queue-6.4/xen-speed-up-grant-table-reclaim.patch diff --git a/queue-6.4/io_uring-gate-iowait-schedule-on-having-pending-requests.patch b/queue-6.4/io_uring-gate-iowait-schedule-on-having-pending-requests.patch new file mode 100644 index 00000000000..f441ec49c6d --- /dev/null +++ b/queue-6.4/io_uring-gate-iowait-schedule-on-having-pending-requests.patch @@ -0,0 +1,82 @@ +From 7b72d661f1f2f950ab8c12de7e2bc48bdac8ed69 Mon Sep 17 00:00:00 2001 +From: Jens Axboe +Date: Mon, 24 Jul 2023 11:28:17 -0600 +Subject: io_uring: gate iowait schedule on having pending requests + +From: Jens Axboe + +commit 7b72d661f1f2f950ab8c12de7e2bc48bdac8ed69 upstream. + +A previous commit made all cqring waits marked as iowait, as a way to +improve performance for short schedules with pending IO. However, for +use cases that have a special reaper thread that does nothing but +wait on events on the ring, this causes a cosmetic issue where we +know have one core marked as being "busy" with 100% iowait. + +While this isn't a grave issue, it is confusing to users. Rather than +always mark us as being in iowait, gate setting of current->in_iowait +to 1 by whether or not the waiting task has pending requests. + +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/io-uring/CAMEGJJ2RxopfNQ7GNLhr7X9=bHXKo+G5OOe0LUq=+UgLXsv1Xg@mail.gmail.com/ +Link: https://bugzilla.kernel.org/show_bug.cgi?id=217699 +Link: https://bugzilla.kernel.org/show_bug.cgi?id=217700 +Reported-by: Oleksandr Natalenko +Reported-by: Phil Elwell +Tested-by: Andres Freund +Fixes: 8a796565cec3 ("io_uring: Use io_schedule* in cqring wait") +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + io_uring/io_uring.c | 23 +++++++++++++++++------ + 1 file changed, 17 insertions(+), 6 deletions(-) + +--- a/io_uring/io_uring.c ++++ b/io_uring/io_uring.c +@@ -2579,11 +2579,20 @@ int io_run_task_work_sig(struct io_ring_ + return 0; + } + ++static bool current_pending_io(void) ++{ ++ struct io_uring_task *tctx = current->io_uring; ++ ++ if (!tctx) ++ return false; ++ return percpu_counter_read_positive(&tctx->inflight); ++} ++ + /* when returns >0, the caller should retry */ + static inline int io_cqring_wait_schedule(struct io_ring_ctx *ctx, + struct io_wait_queue *iowq) + { +- int token, ret; ++ int io_wait, ret; + + if (unlikely(READ_ONCE(ctx->check_cq))) + return 1; +@@ -2597,17 +2606,19 @@ static inline int io_cqring_wait_schedul + return 0; + + /* +- * Use io_schedule_prepare/finish, so cpufreq can take into account +- * that the task is waiting for IO - turns out to be important for low +- * QD IO. ++ * Mark us as being in io_wait if we have pending requests, so cpufreq ++ * can take into account that the task is waiting for IO - turns out ++ * to be important for low QD IO. + */ +- token = io_schedule_prepare(); ++ io_wait = current->in_iowait; ++ if (current_pending_io()) ++ current->in_iowait = 1; + ret = 0; + if (iowq->timeout == KTIME_MAX) + schedule(); + else if (!schedule_hrtimeout(&iowq->timeout, HRTIMER_MODE_ABS)) + ret = -ETIME; +- io_schedule_finish(token); ++ current->in_iowait = io_wait; + return ret; + } + diff --git a/queue-6.4/iommufd-set-end-correctly-when-doing-batch-carry.patch b/queue-6.4/iommufd-set-end-correctly-when-doing-batch-carry.patch new file mode 100644 index 00000000000..1f4d1575950 --- /dev/null +++ b/queue-6.4/iommufd-set-end-correctly-when-doing-batch-carry.patch @@ -0,0 +1,92 @@ +From b7c822fa6b7701b17e139f1c562fc24135880ed4 Mon Sep 17 00:00:00 2001 +From: Jason Gunthorpe +Date: Tue, 25 Jul 2023 16:05:50 -0300 +Subject: iommufd: Set end correctly when doing batch carry + +From: Jason Gunthorpe + +commit b7c822fa6b7701b17e139f1c562fc24135880ed4 upstream. + +Even though the test suite covers this it somehow became obscured that +this wasn't working. + +The test iommufd_ioas.mock_domain.access_domain_destory would blow up +rarely. + +end should be set to 1 because this just pushed an item, the carry, to the +pfns list. + +Sometimes the test would blow up with: + + BUG: kernel NULL pointer dereference, address: 0000000000000000 + #PF: supervisor read access in kernel mode + #PF: error_code(0x0000) - not-present page + PGD 0 P4D 0 + Oops: 0000 [#1] SMP + CPU: 5 PID: 584 Comm: iommufd Not tainted 6.5.0-rc1-dirty #1236 + Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 + RIP: 0010:batch_unpin+0xa2/0x100 [iommufd] + Code: 17 48 81 fe ff ff 07 00 77 70 48 8b 15 b7 be 97 e2 48 85 d2 74 14 48 8b 14 fa 48 85 d2 74 0b 40 0f b6 f6 48 c1 e6 04 48 01 f2 <48> 8b 3a 48 c1 e0 06 89 ca 48 89 de 48 83 e7 f0 48 01 c7 e8 96 dc + RSP: 0018:ffffc90001677a58 EFLAGS: 00010246 + RAX: 00007f7e2646f000 RBX: 0000000000000000 RCX: 0000000000000001 + RDX: 0000000000000000 RSI: 00000000fefc4c8d RDI: 0000000000fefc4c + RBP: ffffc90001677a80 R08: 0000000000000048 R09: 0000000000000200 + R10: 0000000000030b98 R11: ffffffff81f3bb40 R12: 0000000000000001 + R13: ffff888101f75800 R14: ffffc90001677ad0 R15: 00000000000001fe + FS: 00007f9323679740(0000) GS:ffff8881ba540000(0000) knlGS:0000000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + CR2: 0000000000000000 CR3: 0000000105ede003 CR4: 00000000003706a0 + DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 + DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 + Call Trace: + + ? show_regs+0x5c/0x70 + ? __die+0x1f/0x60 + ? page_fault_oops+0x15d/0x440 + ? lock_release+0xbc/0x240 + ? exc_page_fault+0x4a4/0x970 + ? asm_exc_page_fault+0x27/0x30 + ? batch_unpin+0xa2/0x100 [iommufd] + ? batch_unpin+0xba/0x100 [iommufd] + __iopt_area_unfill_domain+0x198/0x430 [iommufd] + ? __mutex_lock+0x8c/0xb80 + ? __mutex_lock+0x6aa/0xb80 + ? xa_erase+0x28/0x30 + ? iopt_table_remove_domain+0x162/0x320 [iommufd] + ? lock_release+0xbc/0x240 + iopt_area_unfill_domain+0xd/0x10 [iommufd] + iopt_table_remove_domain+0x195/0x320 [iommufd] + iommufd_hw_pagetable_destroy+0xb3/0x110 [iommufd] + iommufd_object_destroy_user+0x8e/0xf0 [iommufd] + iommufd_device_detach+0xc5/0x140 [iommufd] + iommufd_selftest_destroy+0x1f/0x70 [iommufd] + iommufd_object_destroy_user+0x8e/0xf0 [iommufd] + iommufd_destroy+0x3a/0x50 [iommufd] + iommufd_fops_ioctl+0xfb/0x170 [iommufd] + __x64_sys_ioctl+0x40d/0x9a0 + do_syscall_64+0x3c/0x80 + entry_SYSCALL_64_after_hwframe+0x46/0xb0 + +Link: https://lore.kernel.org/r/3-v1-85aacb2af554+bc-iommufd_syz3_jgg@nvidia.com +Cc: +Fixes: f394576eb11d ("iommufd: PFN handling for iopt_pages") +Reviewed-by: Kevin Tian +Tested-by: Nicolin Chen +Reported-by: Nicolin Chen +Signed-off-by: Jason Gunthorpe +Signed-off-by: Greg Kroah-Hartman +--- + drivers/iommu/iommufd/pages.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/iommu/iommufd/pages.c ++++ b/drivers/iommu/iommufd/pages.c +@@ -297,7 +297,7 @@ static void batch_clear_carry(struct pfn + batch->pfns[0] = batch->pfns[batch->end - 1] + + (batch->npfns[batch->end - 1] - keep_pfns); + batch->npfns[0] = keep_pfns; +- batch->end = 0; ++ batch->end = 1; + } + + static void batch_skip_carry(struct pfn_batch *batch, unsigned int skip_pfns) diff --git a/queue-6.4/net-dsa-qca8k-enable-use_single_write-for-qca8xxx.patch b/queue-6.4/net-dsa-qca8k-enable-use_single_write-for-qca8xxx.patch new file mode 100644 index 00000000000..e22a38f6fed --- /dev/null +++ b/queue-6.4/net-dsa-qca8k-enable-use_single_write-for-qca8xxx.patch @@ -0,0 +1,88 @@ +From 2c39dd025da489cf87d26469d9f5ff19715324a0 Mon Sep 17 00:00:00 2001 +From: Christian Marangi +Date: Mon, 24 Jul 2023 05:25:28 +0200 +Subject: net: dsa: qca8k: enable use_single_write for qca8xxx + +From: Christian Marangi + +commit 2c39dd025da489cf87d26469d9f5ff19715324a0 upstream. + +The qca8xxx switch supports 2 way to write reg values, a slow way using +mdio and a fast way by sending specially crafted mgmt packet to +read/write reg. + +The fast way can support up to 32 bytes of data as eth packet are used +to send/receive. + +This correctly works for almost the entire regmap of the switch but with +the use of some kernel selftests for dsa drivers it was found a funny +and interesting hw defect/limitation. + +For some specific reg, bulk write won't work and will result in writing +only part of the requested regs resulting in half data written. This was +especially hard to track and discover due to the total strangeness of +the problem and also by the specific regs where this occurs. + +This occurs in the specific regs of the ATU table, where multiple entry +needs to be written to compose the entire entry. +It was discovered that with a bulk write of 12 bytes on +QCA8K_REG_ATU_DATA0 only QCA8K_REG_ATU_DATA0 and QCA8K_REG_ATU_DATA2 +were written, but QCA8K_REG_ATU_DATA1 was always zero. +Tcpdump was used to make sure the specially crafted packet was correct +and this was confirmed. + +The problem was hard to track as the lack of QCA8K_REG_ATU_DATA1 +resulted in an entry somehow possible as the first bytes of the mac +address are set in QCA8K_REG_ATU_DATA0 and the entry type is set in +QCA8K_REG_ATU_DATA2. + +Funlly enough writing QCA8K_REG_ATU_DATA1 results in the same problem +with QCA8K_REG_ATU_DATA2 empty and QCA8K_REG_ATU_DATA1 and +QCA8K_REG_ATU_FUNC correctly written. +A speculation on the problem might be that there are some kind of +indirection internally when accessing these regs and they can't be +accessed all together, due to the fact that it's really a table mapped +somewhere in the switch SRAM. + +Even more funny is the fact that every other reg was tested with all +kind of combination and they are not affected by this problem. Read +operation was also tested and always worked so it's not affected by this +problem. + +The problem is not present if we limit writing a single reg at times. + +To handle this hardware defect, enable use_single_write so that bulk +api can correctly split the write in multiple different operation +effectively reverting to a non-bulk write. + +Cc: Mark Brown +Fixes: c766e077d927 ("net: dsa: qca8k: convert to regmap read/write API") +Signed-off-by: Christian Marangi +Cc: stable@vger.kernel.org +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/dsa/qca/qca8k-8xxx.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/drivers/net/dsa/qca/qca8k-8xxx.c b/drivers/net/dsa/qca/qca8k-8xxx.c +index 09b80644c11b..efe9380d4a15 100644 +--- a/drivers/net/dsa/qca/qca8k-8xxx.c ++++ b/drivers/net/dsa/qca/qca8k-8xxx.c +@@ -576,8 +576,11 @@ static struct regmap_config qca8k_regmap_config = { + .rd_table = &qca8k_readable_table, + .disable_locking = true, /* Locking is handled by qca8k read/write */ + .cache_type = REGCACHE_NONE, /* Explicitly disable CACHE */ +- .max_raw_read = 32, /* mgmt eth can read/write up to 8 registers at time */ +- .max_raw_write = 32, ++ .max_raw_read = 32, /* mgmt eth can read up to 8 registers at time */ ++ /* ATU regs suffer from a bug where some data are not correctly ++ * written. Disable bulk write to correctly write ATU entry. ++ */ ++ .use_single_write = true, + }; + + static int +-- +2.41.0 + diff --git a/queue-6.4/net-dsa-qca8k-fix-broken-search_and_del.patch b/queue-6.4/net-dsa-qca8k-fix-broken-search_and_del.patch new file mode 100644 index 00000000000..86218fd1719 --- /dev/null +++ b/queue-6.4/net-dsa-qca8k-fix-broken-search_and_del.patch @@ -0,0 +1,44 @@ +From ae70dcb9d9ecaf7d9836d3e1b5bef654d7ef5680 Mon Sep 17 00:00:00 2001 +From: Christian Marangi +Date: Mon, 24 Jul 2023 05:25:30 +0200 +Subject: net: dsa: qca8k: fix broken search_and_del + +From: Christian Marangi + +commit ae70dcb9d9ecaf7d9836d3e1b5bef654d7ef5680 upstream. + +On deleting an MDB entry for a port, fdb_search_and_del is used. +An FDB entry can't be modified so it needs to be deleted and readded +again with the new portmap (and the port deleted as requested) + +We use the SEARCH operator to search the entry to edit by vid and mac +address and then we check the aging if we actually found an entry. + +Currently the code suffer from a bug where the searched fdb entry is +never read again with the found values (if found) resulting in the code +always returning -EINVAL as aging was always 0. + +Fix this by correctly read the fdb entry after it was searched. + +Fixes: ba8f870dfa63 ("net: dsa: qca8k: add support for mdb_add/del") +Signed-off-by: Christian Marangi +Cc: stable@vger.kernel.org +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/dsa/qca/qca8k-common.c | 4 ++++ + 1 file changed, 4 insertions(+) + +--- a/drivers/net/dsa/qca/qca8k-common.c ++++ b/drivers/net/dsa/qca/qca8k-common.c +@@ -293,6 +293,10 @@ static int qca8k_fdb_search_and_del(stru + if (ret < 0) + goto exit; + ++ ret = qca8k_fdb_read(priv, &fdb); ++ if (ret < 0) ++ goto exit; ++ + /* Rule doesn't exist. Why delete? */ + if (!fdb.aging) { + ret = -EINVAL; diff --git a/queue-6.4/net-dsa-qca8k-fix-mdb-add-del-case-with-0-vid.patch b/queue-6.4/net-dsa-qca8k-fix-mdb-add-del-case-with-0-vid.patch new file mode 100644 index 00000000000..574fbd5f5d5 --- /dev/null +++ b/queue-6.4/net-dsa-qca8k-fix-mdb-add-del-case-with-0-vid.patch @@ -0,0 +1,47 @@ +From dfd739f182b00b02bd7470ed94d112684cc04fa2 Mon Sep 17 00:00:00 2001 +From: Christian Marangi +Date: Mon, 24 Jul 2023 05:25:31 +0200 +Subject: net: dsa: qca8k: fix mdb add/del case with 0 VID + +From: Christian Marangi + +commit dfd739f182b00b02bd7470ed94d112684cc04fa2 upstream. + +The qca8k switch doesn't support using 0 as VID and require a default +VID to be always set. MDB add/del function doesn't currently handle +this and are currently setting the default VID. + +Fix this by correctly handling this corner case and internally use the +default VID for VID 0 case. + +Fixes: ba8f870dfa63 ("net: dsa: qca8k: add support for mdb_add/del") +Signed-off-by: Christian Marangi +Cc: stable@vger.kernel.org +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/dsa/qca/qca8k-common.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +--- a/drivers/net/dsa/qca/qca8k-common.c ++++ b/drivers/net/dsa/qca/qca8k-common.c +@@ -816,6 +816,9 @@ int qca8k_port_mdb_add(struct dsa_switch + const u8 *addr = mdb->addr; + u16 vid = mdb->vid; + ++ if (!vid) ++ vid = QCA8K_PORT_VID_DEF; ++ + return qca8k_fdb_search_and_insert(priv, BIT(port), addr, vid, + QCA8K_ATU_STATUS_STATIC); + } +@@ -828,6 +831,9 @@ int qca8k_port_mdb_del(struct dsa_switch + const u8 *addr = mdb->addr; + u16 vid = mdb->vid; + ++ if (!vid) ++ vid = QCA8K_PORT_VID_DEF; ++ + return qca8k_fdb_search_and_del(priv, BIT(port), addr, vid); + } + diff --git a/queue-6.4/net-dsa-qca8k-fix-search_and_insert-wrong-handling-of-new-rule.patch b/queue-6.4/net-dsa-qca8k-fix-search_and_insert-wrong-handling-of-new-rule.patch new file mode 100644 index 00000000000..5c08fe0fca7 --- /dev/null +++ b/queue-6.4/net-dsa-qca8k-fix-search_and_insert-wrong-handling-of-new-rule.patch @@ -0,0 +1,73 @@ +From 80248d4160894d7e40b04111bdbaa4ff93fc4bd7 Mon Sep 17 00:00:00 2001 +From: Christian Marangi +Date: Mon, 24 Jul 2023 05:25:29 +0200 +Subject: net: dsa: qca8k: fix search_and_insert wrong handling of new rule + +From: Christian Marangi + +commit 80248d4160894d7e40b04111bdbaa4ff93fc4bd7 upstream. + +On inserting a mdb entry, fdb_search_and_insert is used to add a port to +the qca8k target entry in the FDB db. + +A FDB entry can't be modified so it needs to be removed and insert again +with the new values. + +To detect if an entry already exist, the SEARCH operation is used and we +check the aging of the entry. If the entry is not 0, the entry exist and +we proceed to delete it. + +Current code have 2 main problem: +- The condition to check if the FDB entry exist is wrong and should be + the opposite. +- When a FDB entry doesn't exist, aging was never actually set to the + STATIC value resulting in allocating an invalid entry. + +Fix both problem by adding aging support to the function, calling the +function with STATIC as aging by default and finally by correct the +condition to check if the entry actually exist. + +Fixes: ba8f870dfa63 ("net: dsa: qca8k: add support for mdb_add/del") +Signed-off-by: Christian Marangi +Cc: stable@vger.kernel.org +Signed-off-by: David S. Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/dsa/qca/qca8k-common.c | 9 ++++++--- + 1 file changed, 6 insertions(+), 3 deletions(-) + +--- a/drivers/net/dsa/qca/qca8k-common.c ++++ b/drivers/net/dsa/qca/qca8k-common.c +@@ -244,7 +244,7 @@ void qca8k_fdb_flush(struct qca8k_priv * + } + + static int qca8k_fdb_search_and_insert(struct qca8k_priv *priv, u8 port_mask, +- const u8 *mac, u16 vid) ++ const u8 *mac, u16 vid, u8 aging) + { + struct qca8k_fdb fdb = { 0 }; + int ret; +@@ -261,10 +261,12 @@ static int qca8k_fdb_search_and_insert(s + goto exit; + + /* Rule exist. Delete first */ +- if (!fdb.aging) { ++ if (fdb.aging) { + ret = qca8k_fdb_access(priv, QCA8K_FDB_PURGE, -1); + if (ret) + goto exit; ++ } else { ++ fdb.aging = aging; + } + + /* Add port to fdb portmask */ +@@ -810,7 +812,8 @@ int qca8k_port_mdb_add(struct dsa_switch + const u8 *addr = mdb->addr; + u16 vid = mdb->vid; + +- return qca8k_fdb_search_and_insert(priv, BIT(port), addr, vid); ++ return qca8k_fdb_search_and_insert(priv, BIT(port), addr, vid, ++ QCA8K_ATU_STATUS_STATIC); + } + + int qca8k_port_mdb_del(struct dsa_switch *ds, int port, diff --git a/queue-6.4/net-ipa-only-reset-hashed-tables-when-supported.patch b/queue-6.4/net-ipa-only-reset-hashed-tables-when-supported.patch new file mode 100644 index 00000000000..20f7c0bc79e --- /dev/null +++ b/queue-6.4/net-ipa-only-reset-hashed-tables-when-supported.patch @@ -0,0 +1,101 @@ +From e11ec2b868af2b351c6c1e2e50eb711cc5423a10 Mon Sep 17 00:00:00 2001 +From: Alex Elder +Date: Mon, 24 Jul 2023 17:40:55 -0500 +Subject: net: ipa: only reset hashed tables when supported + +From: Alex Elder + +commit e11ec2b868af2b351c6c1e2e50eb711cc5423a10 upstream. + +Last year, the code that manages GSI channel transactions switched +from using spinlock-protected linked lists to using indexes into the +ring buffer used for a channel. Recently, Google reported seeing +transaction reference count underflows occasionally during shutdown. + +Doug Anderson found a way to reproduce the issue reliably, and +bisected the issue to the commit that eliminated the linked lists +and the lock. The root cause was ultimately determined to be +related to unused transactions being committed as part of the modem +shutdown cleanup activity. Unused transactions are not normally +expected (except in error cases). + +The modem uses some ranges of IPA-resident memory, and whenever it +shuts down we zero those ranges. In ipa_filter_reset_table() a +transaction is allocated to zero modem filter table entries. If +hashing is not supported, hashed table memory should not be zeroed. +But currently nothing prevents that, and the result is an unused +transaction. Something similar occurs when we zero routing table +entries for the modem. + +By preventing any attempt to clear hashed tables when hashing is not +supported, the reference count underflow is avoided in this case. + +Note that there likely remains an issue with properly freeing unused +transactions (if they occur due to errors). This patch addresses +only the underflows that Google originally reported. + +Cc: # 6.1.x +Fixes: d338ae28d8a8 ("net: ipa: kill all other transaction lists") +Tested-by: Douglas Anderson +Signed-off-by: Alex Elder +Link: https://lore.kernel.org/r/20230724224055.1688854-1-elder@linaro.org +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ipa/ipa_table.c | 20 +++++++++++--------- + 1 file changed, 11 insertions(+), 9 deletions(-) + +--- a/drivers/net/ipa/ipa_table.c ++++ b/drivers/net/ipa/ipa_table.c +@@ -273,16 +273,15 @@ static int ipa_filter_reset(struct ipa * + if (ret) + return ret; + +- ret = ipa_filter_reset_table(ipa, true, false, modem); +- if (ret) ++ ret = ipa_filter_reset_table(ipa, false, true, modem); ++ if (ret || !ipa_table_hash_support(ipa)) + return ret; + +- ret = ipa_filter_reset_table(ipa, false, true, modem); ++ ret = ipa_filter_reset_table(ipa, true, false, modem); + if (ret) + return ret; +- ret = ipa_filter_reset_table(ipa, true, true, modem); + +- return ret; ++ return ipa_filter_reset_table(ipa, true, true, modem); + } + + /* The AP routes and modem routes are each contiguous within the +@@ -291,12 +290,13 @@ static int ipa_filter_reset(struct ipa * + * */ + static int ipa_route_reset(struct ipa *ipa, bool modem) + { ++ bool hash_support = ipa_table_hash_support(ipa); + u32 modem_route_count = ipa->modem_route_count; + struct gsi_trans *trans; + u16 first; + u16 count; + +- trans = ipa_cmd_trans_alloc(ipa, 4); ++ trans = ipa_cmd_trans_alloc(ipa, hash_support ? 4 : 2); + if (!trans) { + dev_err(&ipa->pdev->dev, + "no transaction for %s route reset\n", +@@ -313,10 +313,12 @@ static int ipa_route_reset(struct ipa *i + } + + ipa_table_reset_add(trans, false, false, false, first, count); +- ipa_table_reset_add(trans, false, true, false, first, count); +- + ipa_table_reset_add(trans, false, false, true, first, count); +- ipa_table_reset_add(trans, false, true, true, first, count); ++ ++ if (hash_support) { ++ ipa_table_reset_add(trans, false, true, false, first, count); ++ ipa_table_reset_add(trans, false, true, true, first, count); ++ } + + gsi_trans_commit_wait(trans); + diff --git a/queue-6.4/proc-vmcore-fix-signedness-bug-in-read_from_oldmem.patch b/queue-6.4/proc-vmcore-fix-signedness-bug-in-read_from_oldmem.patch new file mode 100644 index 00000000000..96e8685e9c3 --- /dev/null +++ b/queue-6.4/proc-vmcore-fix-signedness-bug-in-read_from_oldmem.patch @@ -0,0 +1,45 @@ +From 641db40f3afe7998011bfabc726dba3e698f8196 Mon Sep 17 00:00:00 2001 +From: Dan Carpenter +Date: Tue, 25 Jul 2023 20:03:16 +0300 +Subject: proc/vmcore: fix signedness bug in read_from_oldmem() + +From: Dan Carpenter + +commit 641db40f3afe7998011bfabc726dba3e698f8196 upstream. + +The bug is the error handling: + + if (tmp < nr_bytes) { + +"tmp" can hold negative error codes but because "nr_bytes" is type size_t +the negative error codes are treated as very high positive values +(success). Fix this by changing "nr_bytes" to type ssize_t. The +"nr_bytes" variable is used to store values between 1 and PAGE_SIZE and +they can fit in ssize_t without any issue. + +Link: https://lkml.kernel.org/r/b55f7eed-1c65-4adc-95d1-6c7c65a54a6e@moroto.mountain +Fixes: 5d8de293c224 ("vmcore: convert copy_oldmem_page() to take an iov_iter") +Signed-off-by: Dan Carpenter +Reviewed-by: Matthew Wilcox (Oracle) +Acked-by: Baoquan He +Cc: Dave Young +Cc: Vivek Goyal +Cc: Alexey Dobriyan +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + fs/proc/vmcore.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/proc/vmcore.c ++++ b/fs/proc/vmcore.c +@@ -132,7 +132,7 @@ ssize_t read_from_oldmem(struct iov_iter + u64 *ppos, bool encrypted) + { + unsigned long pfn, offset; +- size_t nr_bytes; ++ ssize_t nr_bytes; + ssize_t read = 0, tmp; + int idx; + diff --git a/queue-6.4/series b/queue-6.4/series index ca17b05e9ec..638524666f2 100644 --- a/queue-6.4/series +++ b/queue-6.4/series @@ -196,3 +196,13 @@ tpm_tis-explicitly-check-for-error-code.patch irq-bcm6345-l1-do-not-assume-a-fixed-block-to-cpu-ma.patch irqchip-gic-v4.1-properly-lock-vpes-when-doing-a-dir.patch locking-rtmutex-fix-task-pi_waiters-integrity.patch +proc-vmcore-fix-signedness-bug-in-read_from_oldmem.patch +xen-speed-up-grant-table-reclaim.patch +virtio-net-fix-race-between-set-queues-and-probe.patch +net-ipa-only-reset-hashed-tables-when-supported.patch +net-dsa-qca8k-enable-use_single_write-for-qca8xxx.patch +net-dsa-qca8k-fix-search_and_insert-wrong-handling-of-new-rule.patch +net-dsa-qca8k-fix-broken-search_and_del.patch +net-dsa-qca8k-fix-mdb-add-del-case-with-0-vid.patch +io_uring-gate-iowait-schedule-on-having-pending-requests.patch +iommufd-set-end-correctly-when-doing-batch-carry.patch diff --git a/queue-6.4/virtio-net-fix-race-between-set-queues-and-probe.patch b/queue-6.4/virtio-net-fix-race-between-set-queues-and-probe.patch new file mode 100644 index 00000000000..102e932f89e --- /dev/null +++ b/queue-6.4/virtio-net-fix-race-between-set-queues-and-probe.patch @@ -0,0 +1,47 @@ +From 25266128fe16d5632d43ada34c847d7b8daba539 Mon Sep 17 00:00:00 2001 +From: Jason Wang +Date: Tue, 25 Jul 2023 03:20:49 -0400 +Subject: virtio-net: fix race between set queues and probe + +From: Jason Wang + +commit 25266128fe16d5632d43ada34c847d7b8daba539 upstream. + +A race were found where set_channels could be called after registering +but before virtnet_set_queues() in virtnet_probe(). Fixing this by +moving the virtnet_set_queues() before netdevice registering. While at +it, use _virtnet_set_queues() to avoid holding rtnl as the device is +not even registered at that time. + +Cc: stable@vger.kernel.org +Fixes: a220871be66f ("virtio-net: correctly enable multiqueue") +Signed-off-by: Jason Wang +Acked-by: Michael S. Tsirkin +Reviewed-by: Xuan Zhuo +Link: https://lore.kernel.org/r/20230725072049.617289-1-jasowang@redhat.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/virtio_net.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/drivers/net/virtio_net.c ++++ b/drivers/net/virtio_net.c +@@ -4110,6 +4110,8 @@ static int virtnet_probe(struct virtio_d + if (vi->has_rss || vi->has_rss_hash_report) + virtnet_init_default_rss(vi); + ++ _virtnet_set_queues(vi, vi->curr_queue_pairs); ++ + /* serialize netdev register + virtio_device_ready() with ndo_open() */ + rtnl_lock(); + +@@ -4148,8 +4150,6 @@ static int virtnet_probe(struct virtio_d + goto free_unregister_netdev; + } + +- virtnet_set_queues(vi, vi->curr_queue_pairs); +- + /* Assume link up if device can't report link status, + otherwise get link status from config. */ + netif_carrier_off(dev); diff --git a/queue-6.4/xen-speed-up-grant-table-reclaim.patch b/queue-6.4/xen-speed-up-grant-table-reclaim.patch new file mode 100644 index 00000000000..356343a1000 --- /dev/null +++ b/queue-6.4/xen-speed-up-grant-table-reclaim.patch @@ -0,0 +1,143 @@ +From c04e9894846c663f3278a414f34416e6e45bbe68 Mon Sep 17 00:00:00 2001 +From: Demi Marie Obenour +Date: Wed, 26 Jul 2023 12:52:41 -0400 +Subject: xen: speed up grant-table reclaim + +From: Demi Marie Obenour + +commit c04e9894846c663f3278a414f34416e6e45bbe68 upstream. + +When a grant entry is still in use by the remote domain, Linux must put +it on a deferred list. Normally, this list is very short, because +the PV network and block protocols expect the backend to unmap the grant +first. However, Qubes OS's GUI protocol is subject to the constraints +of the X Window System, and as such winds up with the frontend unmapping +the window first. As a result, the list can grow very large, resulting +in a massive memory leak and eventual VM freeze. + +To partially solve this problem, make the number of entries that the VM +will attempt to free at each iteration tunable. The default is still +10, but it can be overridden via a module parameter. + +This is Cc: stable because (when combined with appropriate userspace +changes) it fixes a severe performance and stability problem for Qubes +OS users. + +Cc: stable@vger.kernel.org +Signed-off-by: Demi Marie Obenour +Reviewed-by: Juergen Gross +Link: https://lore.kernel.org/r/20230726165354.1252-1-demi@invisiblethingslab.com +Signed-off-by: Juergen Gross +Signed-off-by: Greg Kroah-Hartman +--- + Documentation/ABI/testing/sysfs-module | 11 +++++++++ + drivers/xen/grant-table.c | 40 +++++++++++++++++++++++---------- + 2 files changed, 40 insertions(+), 11 deletions(-) + +--- a/Documentation/ABI/testing/sysfs-module ++++ b/Documentation/ABI/testing/sysfs-module +@@ -60,3 +60,14 @@ Description: Module taint flags: + C staging driver module + E unsigned module + == ===================== ++ ++What: /sys/module/grant_table/parameters/free_per_iteration ++Date: July 2023 ++KernelVersion: 6.5 but backported to all supported stable branches ++Contact: Xen developer discussion ++Description: Read and write number of grant entries to attempt to free per iteration. ++ ++ Note: Future versions of Xen and Linux may provide a better ++ interface for controlling the rate of deferred grant reclaim ++ or may not need it at all. ++Users: Qubes OS (https://www.qubes-os.org) +--- a/drivers/xen/grant-table.c ++++ b/drivers/xen/grant-table.c +@@ -498,14 +498,21 @@ static LIST_HEAD(deferred_list); + static void gnttab_handle_deferred(struct timer_list *); + static DEFINE_TIMER(deferred_timer, gnttab_handle_deferred); + ++static atomic64_t deferred_count; ++static atomic64_t leaked_count; ++static unsigned int free_per_iteration = 10; ++module_param(free_per_iteration, uint, 0600); ++ + static void gnttab_handle_deferred(struct timer_list *unused) + { +- unsigned int nr = 10; ++ unsigned int nr = READ_ONCE(free_per_iteration); ++ const bool ignore_limit = nr == 0; + struct deferred_entry *first = NULL; + unsigned long flags; ++ size_t freed = 0; + + spin_lock_irqsave(&gnttab_list_lock, flags); +- while (nr--) { ++ while ((ignore_limit || nr--) && !list_empty(&deferred_list)) { + struct deferred_entry *entry + = list_first_entry(&deferred_list, + struct deferred_entry, list); +@@ -515,10 +522,14 @@ static void gnttab_handle_deferred(struc + list_del(&entry->list); + spin_unlock_irqrestore(&gnttab_list_lock, flags); + if (_gnttab_end_foreign_access_ref(entry->ref)) { ++ uint64_t ret = atomic64_dec_return(&deferred_count); ++ + put_free_entry(entry->ref); +- pr_debug("freeing g.e. %#x (pfn %#lx)\n", +- entry->ref, page_to_pfn(entry->page)); ++ pr_debug("freeing g.e. %#x (pfn %#lx), %llu remaining\n", ++ entry->ref, page_to_pfn(entry->page), ++ (unsigned long long)ret); + put_page(entry->page); ++ freed++; + kfree(entry); + entry = NULL; + } else { +@@ -530,21 +541,22 @@ static void gnttab_handle_deferred(struc + spin_lock_irqsave(&gnttab_list_lock, flags); + if (entry) + list_add_tail(&entry->list, &deferred_list); +- else if (list_empty(&deferred_list)) +- break; + } +- if (!list_empty(&deferred_list) && !timer_pending(&deferred_timer)) { ++ if (list_empty(&deferred_list)) ++ WARN_ON(atomic64_read(&deferred_count)); ++ else if (!timer_pending(&deferred_timer)) { + deferred_timer.expires = jiffies + HZ; + add_timer(&deferred_timer); + } + spin_unlock_irqrestore(&gnttab_list_lock, flags); ++ pr_debug("Freed %zu references", freed); + } + + static void gnttab_add_deferred(grant_ref_t ref, struct page *page) + { + struct deferred_entry *entry; + gfp_t gfp = (in_atomic() || irqs_disabled()) ? GFP_ATOMIC : GFP_KERNEL; +- const char *what = KERN_WARNING "leaking"; ++ uint64_t leaked, deferred; + + entry = kmalloc(sizeof(*entry), gfp); + if (!page) { +@@ -567,10 +579,16 @@ static void gnttab_add_deferred(grant_re + add_timer(&deferred_timer); + } + spin_unlock_irqrestore(&gnttab_list_lock, flags); +- what = KERN_DEBUG "deferring"; ++ deferred = atomic64_inc_return(&deferred_count); ++ leaked = atomic64_read(&leaked_count); ++ pr_debug("deferring g.e. %#x (pfn %#lx) (total deferred %llu, total leaked %llu)\n", ++ ref, page ? page_to_pfn(page) : -1, deferred, leaked); ++ } else { ++ deferred = atomic64_read(&deferred_count); ++ leaked = atomic64_inc_return(&leaked_count); ++ pr_warn("leaking g.e. %#x (pfn %#lx) (total deferred %llu, total leaked %llu)\n", ++ ref, page ? page_to_pfn(page) : -1, deferred, leaked); + } +- printk("%s g.e. %#x (pfn %#lx)\n", +- what, ref, page ? page_to_pfn(page) : -1); + } + + int gnttab_try_end_foreign_access(grant_ref_t ref) -- 2.47.3