From: Greg Kroah-Hartman Date: Tue, 22 Apr 2025 12:33:04 +0000 (+0200) Subject: 5.15-stable patches X-Git-Tag: v6.1.135~43 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=e28ad00bc5fc61f406af894407d8d588498662e8;p=thirdparty%2Fkernel%2Fstable-queue.git 5.15-stable patches added patches: blk-cgroup-support-to-track-if-policy-is-online.patch blk-iocost-do-not-warn-if-iocg-was-already-offlined.patch bpf-avoid-holding-freeze_mutex-during-mmap-operation.patch bpf-check-rcu_read_lock_trace_held-before-calling-bpf-map-helpers.patch ext4-fix-timer-use-after-free-on-failed-mount.patch ipvs-properly-dereference-pe-in-ip_vs_add_service.patch net-openvswitch-fix-race-on-port-output.patch openvswitch-fix-lockup-on-tx-to-unregistering-netdev-with-carrier.patch sched-task_stack-fix-object_is_on_stack-for-kasan-tagged-pointers.patch --- diff --git a/queue-5.15/blk-cgroup-support-to-track-if-policy-is-online.patch b/queue-5.15/blk-cgroup-support-to-track-if-policy-is-online.patch new file mode 100644 index 0000000000..f28eaf2fec --- /dev/null +++ b/queue-5.15/blk-cgroup-support-to-track-if-policy-is-online.patch @@ -0,0 +1,120 @@ +From dfd6200a095440b663099d8d42f1efb0175a1ce3 Mon Sep 17 00:00:00 2001 +From: Yu Kuai +Date: Thu, 19 Jan 2023 19:03:49 +0800 +Subject: blk-cgroup: support to track if policy is online + +From: Yu Kuai + +commit dfd6200a095440b663099d8d42f1efb0175a1ce3 upstream. + +A new field 'online' is added to blkg_policy_data to fix following +2 problem: + +1) In blkcg_activate_policy(), if pd_alloc_fn() with 'GFP_NOWAIT' + failed, 'queue_lock' will be dropped and pd_alloc_fn() will try again + without 'GFP_NOWAIT'. In the meantime, remove cgroup can race with + it, and pd_offline_fn() will be called without pd_init_fn() and + pd_online_fn(). This way null-ptr-deference can be triggered. 
+ +2) In order to synchronize pd_free_fn() from blkg_free_workfn() and + blkcg_deactivate_policy(), 'list_del_init(&blkg->q_node)' will be + delayed to blkg_free_workfn(), hence pd_offline_fn() can be called + first in blkg_destroy(), and then blkcg_deactivate_policy() will + call it again, we must prevent it. + +The new field 'online' will be set after pd_online_fn() and will be +cleared after pd_offline_fn(), in the meantime pd_offline_fn() will only +be called if 'online' is set. + +Signed-off-by: Yu Kuai +Acked-by: Tejun Heo +Reviewed-by: Christoph Hellwig +Link: https://lore.kernel.org/r/20230119110350.2287325-3-yukuai1@huaweicloud.com +Signed-off-by: Jens Axboe +Signed-off-by: Bin Lan +Signed-off-by: He Zhe +Signed-off-by: Greg Kroah-Hartman +--- + block/blk-cgroup.c | 24 +++++++++++++++++------- + include/linux/blk-cgroup.h | 1 + + 2 files changed, 18 insertions(+), 7 deletions(-) + +--- a/block/blk-cgroup.c ++++ b/block/blk-cgroup.c +@@ -192,6 +192,7 @@ static struct blkcg_gq *blkg_alloc(struc + blkg->pd[i] = pd; + pd->blkg = blkg; + pd->plid = i; ++ pd->online = false; + } + + return blkg; +@@ -289,8 +290,11 @@ static struct blkcg_gq *blkg_create(stru + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + +- if (blkg->pd[i] && pol->pd_online_fn) +- pol->pd_online_fn(blkg->pd[i]); ++ if (blkg->pd[i]) { ++ if (pol->pd_online_fn) ++ pol->pd_online_fn(blkg->pd[i]); ++ blkg->pd[i]->online = true; ++ } + } + } + blkg->online = true; +@@ -390,8 +394,11 @@ static void blkg_destroy(struct blkcg_gq + for (i = 0; i < BLKCG_MAX_POLS; i++) { + struct blkcg_policy *pol = blkcg_policy[i]; + +- if (blkg->pd[i] && pol->pd_offline_fn) +- pol->pd_offline_fn(blkg->pd[i]); ++ if (blkg->pd[i] && blkg->pd[i]->online) { ++ if (pol->pd_offline_fn) ++ pol->pd_offline_fn(blkg->pd[i]); ++ blkg->pd[i]->online = false; ++ } + } + + blkg->online = false; +@@ -1367,6 +1374,7 @@ retry: + blkg->pd[pol->plid] = pd; + pd->blkg = blkg; + pd->plid = pol->plid; 
++ pd->online = false; + } + + /* all allocated, init in the same order */ +@@ -1374,9 +1382,11 @@ retry: + list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) + pol->pd_init_fn(blkg->pd[pol->plid]); + +- if (pol->pd_online_fn) +- list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) ++ list_for_each_entry_reverse(blkg, &q->blkg_list, q_node) { ++ if (pol->pd_online_fn) + pol->pd_online_fn(blkg->pd[pol->plid]); ++ blkg->pd[pol->plid]->online = true; ++ } + + __set_bit(pol->plid, q->blkcg_pols); + ret = 0; +@@ -1438,7 +1448,7 @@ void blkcg_deactivate_policy(struct requ + + spin_lock(&blkcg->lock); + if (blkg->pd[pol->plid]) { +- if (pol->pd_offline_fn) ++ if (blkg->pd[pol->plid]->online && pol->pd_offline_fn) + pol->pd_offline_fn(blkg->pd[pol->plid]); + pol->pd_free_fn(blkg->pd[pol->plid]); + blkg->pd[pol->plid] = NULL; +--- a/include/linux/blk-cgroup.h ++++ b/include/linux/blk-cgroup.h +@@ -92,6 +92,7 @@ struct blkg_policy_data { + /* the blkg and policy id this per-policy data belongs to */ + struct blkcg_gq *blkg; + int plid; ++ bool online; + }; + + /* diff --git a/queue-5.15/blk-iocost-do-not-warn-if-iocg-was-already-offlined.patch b/queue-5.15/blk-iocost-do-not-warn-if-iocg-was-already-offlined.patch new file mode 100644 index 0000000000..d3429a023e --- /dev/null +++ b/queue-5.15/blk-iocost-do-not-warn-if-iocg-was-already-offlined.patch @@ -0,0 +1,59 @@ +From 01bc4fda9ea0a6b52f12326486f07a4910666cf6 Mon Sep 17 00:00:00 2001 +From: Li Nan +Date: Fri, 19 Apr 2024 17:32:57 +0800 +Subject: blk-iocost: do not WARN if iocg was already offlined + +From: Li Nan + +commit 01bc4fda9ea0a6b52f12326486f07a4910666cf6 upstream. + +In iocg_pay_debt(), warn is triggered if 'active_list' is empty, which +is intended to confirm iocg is active when it has debt. 
However, warn +can be triggered during a blkcg or disk removal, if iocg_waitq_timer_fn() +is run at that time: + + WARNING: CPU: 0 PID: 2344971 at block/blk-iocost.c:1402 iocg_pay_debt+0x14c/0x190 + Call trace: + iocg_pay_debt+0x14c/0x190 + iocg_kick_waitq+0x438/0x4c0 + iocg_waitq_timer_fn+0xd8/0x130 + __run_hrtimer+0x144/0x45c + __hrtimer_run_queues+0x16c/0x244 + hrtimer_interrupt+0x2cc/0x7b0 + +The warn in this situation is meaningless. Since this iocg is being +removed, the state of the 'active_list' is irrelevant, and 'waitq_timer' +is canceled after removing 'active_list' in ioc_pd_free(), which ensures +iocg is freed after iocg_waitq_timer_fn() returns. + +Therefore, add the check if iocg was already offlined to avoid warn +when removing a blkcg or disk. + +Signed-off-by: Li Nan +Reviewed-by: Yu Kuai +Acked-by: Tejun Heo +Link: https://lore.kernel.org/r/20240419093257.3004211-1-linan666@huaweicloud.com +Signed-off-by: Jens Axboe +Signed-off-by: Bin Lan +Signed-off-by: He Zhe +Signed-off-by: Greg Kroah-Hartman +--- + block/blk-iocost.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +--- a/block/blk-iocost.c ++++ b/block/blk-iocost.c +@@ -1435,8 +1435,11 @@ static void iocg_pay_debt(struct ioc_gq + lockdep_assert_held(&iocg->ioc->lock); + lockdep_assert_held(&iocg->waitq.lock); + +- /* make sure that nobody messed with @iocg */ +- WARN_ON_ONCE(list_empty(&iocg->active_list)); ++ /* ++ * make sure that nobody messed with @iocg. Check iocg->pd.online ++ * to avoid warn when removing blkcg or disk. 
++ */ ++ WARN_ON_ONCE(list_empty(&iocg->active_list) && iocg->pd.online); + WARN_ON_ONCE(iocg->inuse > 1); + + iocg->abs_vdebt -= min(abs_vpay, iocg->abs_vdebt); diff --git a/queue-5.15/bpf-avoid-holding-freeze_mutex-during-mmap-operation.patch b/queue-5.15/bpf-avoid-holding-freeze_mutex-during-mmap-operation.patch new file mode 100644 index 0000000000..24d56cc694 --- /dev/null +++ b/queue-5.15/bpf-avoid-holding-freeze_mutex-during-mmap-operation.patch @@ -0,0 +1,75 @@ +From bc27c52eea189e8f7492d40739b7746d67b65beb Mon Sep 17 00:00:00 2001 +From: Andrii Nakryiko +Date: Tue, 28 Jan 2025 17:22:46 -0800 +Subject: bpf: avoid holding freeze_mutex during mmap operation + +From: Andrii Nakryiko + +commit bc27c52eea189e8f7492d40739b7746d67b65beb upstream. + +We use map->freeze_mutex to prevent races between map_freeze() and +memory mapping BPF map contents with writable permissions. The way we +naively do this means we'll hold freeze_mutex for entire duration of all +the mm and VMA manipulations, which is completely unnecessary. This can +potentially also lead to deadlocks, as reported by syzbot in [0]. + +So, instead, hold freeze_mutex only during writeability checks, bump +(proactively) "write active" count for the map, unlock the mutex and +proceed with mmap logic. And only if something went wrong during mmap +logic, then undo that "write active" counter increment. 
+ + [0] https://lore.kernel.org/bpf/678dcbc9.050a0220.303755.0066.GAE@google.com/ + +Fixes: fc9702273e2e ("bpf: Add mmap() support for BPF_MAP_TYPE_ARRAY") +Reported-by: syzbot+4dc041c686b7c816a71e@syzkaller.appspotmail.com +Signed-off-by: Andrii Nakryiko +Link: https://lore.kernel.org/r/20250129012246.1515826-2-andrii@kernel.org +Signed-off-by: Alexei Starovoitov +Signed-off-by: David Sauerwein +Signed-off-by: Greg Kroah-Hartman +--- + kernel/bpf/syscall.c | 17 ++++++++++------- + 1 file changed, 10 insertions(+), 7 deletions(-) + +--- a/kernel/bpf/syscall.c ++++ b/kernel/bpf/syscall.c +@@ -654,7 +654,7 @@ static const struct vm_operations_struct + static int bpf_map_mmap(struct file *filp, struct vm_area_struct *vma) + { + struct bpf_map *map = filp->private_data; +- int err; ++ int err = 0; + + if (!map->ops->map_mmap || map_value_has_spin_lock(map) || + map_value_has_timer(map)) +@@ -679,7 +679,12 @@ static int bpf_map_mmap(struct file *fil + err = -EACCES; + goto out; + } ++ bpf_map_write_active_inc(map); + } ++out: ++ mutex_unlock(&map->freeze_mutex); ++ if (err) ++ return err; + + /* set default open/close callbacks */ + vma->vm_ops = &bpf_map_default_vmops; +@@ -690,13 +695,11 @@ static int bpf_map_mmap(struct file *fil + vma->vm_flags &= ~VM_MAYWRITE; + + err = map->ops->map_mmap(map, vma); +- if (err) +- goto out; ++ if (err) { ++ if (vma->vm_flags & VM_WRITE) ++ bpf_map_write_active_dec(map); ++ } + +- if (vma->vm_flags & VM_MAYWRITE) +- bpf_map_write_active_inc(map); +-out: +- mutex_unlock(&map->freeze_mutex); + return err; + } + diff --git a/queue-5.15/bpf-check-rcu_read_lock_trace_held-before-calling-bpf-map-helpers.patch b/queue-5.15/bpf-check-rcu_read_lock_trace_held-before-calling-bpf-map-helpers.patch new file mode 100644 index 0000000000..9cc503e88b --- /dev/null +++ b/queue-5.15/bpf-check-rcu_read_lock_trace_held-before-calling-bpf-map-helpers.patch @@ -0,0 +1,100 @@ +From 169410eba271afc9f0fb476d996795aa26770c6d Mon Sep 17 00:00:00 2001 +From: 
Hou Tao +Date: Mon, 4 Dec 2023 22:04:19 +0800 +Subject: bpf: Check rcu_read_lock_trace_held() before calling bpf map helpers + +From: Hou Tao + +commit 169410eba271afc9f0fb476d996795aa26770c6d upstream. + +These three bpf_map_{lookup,update,delete}_elem() helpers are also +available for sleepable bpf program, so add the corresponding lock +assertion for sleepable bpf program, otherwise the following warning +will be reported when a sleepable bpf program manipulates bpf map under +interpreter mode (aka bpf_jit_enable=0): + + WARNING: CPU: 3 PID: 4985 at kernel/bpf/helpers.c:40 ...... + CPU: 3 PID: 4985 Comm: test_progs Not tainted 6.6.0+ #2 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996) ...... + RIP: 0010:bpf_map_lookup_elem+0x54/0x60 + ...... + Call Trace: + + ? __warn+0xa5/0x240 + ? bpf_map_lookup_elem+0x54/0x60 + ? report_bug+0x1ba/0x1f0 + ? handle_bug+0x40/0x80 + ? exc_invalid_op+0x18/0x50 + ? asm_exc_invalid_op+0x1b/0x20 + ? __pfx_bpf_map_lookup_elem+0x10/0x10 + ? rcu_lockdep_current_cpu_online+0x65/0xb0 + ? rcu_is_watching+0x23/0x50 + ? bpf_map_lookup_elem+0x54/0x60 + ? __pfx_bpf_map_lookup_elem+0x10/0x10 + ___bpf_prog_run+0x513/0x3b70 + __bpf_prog_run32+0x9d/0xd0 + ? __bpf_prog_enter_sleepable_recur+0xad/0x120 + ? __bpf_prog_enter_sleepable_recur+0x3e/0x120 + bpf_trampoline_6442580665+0x4d/0x1000 + __x64_sys_getpgid+0x5/0x30 + ? 
do_syscall_64+0x36/0xb0 + entry_SYSCALL_64_after_hwframe+0x6e/0x76 + + +Signed-off-by: Hou Tao +Link: https://lore.kernel.org/r/20231204140425.1480317-2-houtao@huaweicloud.com +Signed-off-by: Alexei Starovoitov +Signed-off-by: Cliff Liu +Signed-off-by: He Zhe +Signed-off-by: Greg Kroah-Hartman +--- + kernel/bpf/helpers.c | 14 +++++++++----- + 1 file changed, 9 insertions(+), 5 deletions(-) + +--- a/kernel/bpf/helpers.c ++++ b/kernel/bpf/helpers.c +@@ -3,6 +3,7 @@ + */ + #include + #include ++#include + #include + #include + #include +@@ -24,12 +25,13 @@ + * + * Different map implementations will rely on rcu in map methods + * lookup/update/delete, therefore eBPF programs must run under rcu lock +- * if program is allowed to access maps, so check rcu_read_lock_held in +- * all three functions. ++ * if program is allowed to access maps, so check rcu_read_lock_held() or ++ * rcu_read_lock_trace_held() in all three functions. + */ + BPF_CALL_2(bpf_map_lookup_elem, struct bpf_map *, map, void *, key) + { +- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); ++ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && ++ !rcu_read_lock_bh_held()); + return (unsigned long) map->ops->map_lookup_elem(map, key); + } + +@@ -45,7 +47,8 @@ const struct bpf_func_proto bpf_map_look + BPF_CALL_4(bpf_map_update_elem, struct bpf_map *, map, void *, key, + void *, value, u64, flags) + { +- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); ++ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && ++ !rcu_read_lock_bh_held()); + return map->ops->map_update_elem(map, key, value, flags); + } + +@@ -62,7 +65,8 @@ const struct bpf_func_proto bpf_map_upda + + BPF_CALL_2(bpf_map_delete_elem, struct bpf_map *, map, void *, key) + { +- WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held()); ++ WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_trace_held() && ++ !rcu_read_lock_bh_held()); + return map->ops->map_delete_elem(map, 
key); + } + diff --git a/queue-5.15/ext4-fix-timer-use-after-free-on-failed-mount.patch b/queue-5.15/ext4-fix-timer-use-after-free-on-failed-mount.patch new file mode 100644 index 0000000000..aabbc0c8dc --- /dev/null +++ b/queue-5.15/ext4-fix-timer-use-after-free-on-failed-mount.patch @@ -0,0 +1,49 @@ +From 0ce160c5bdb67081a62293028dc85758a8efb22a Mon Sep 17 00:00:00 2001 +From: Xiaxi Shen +Date: Sun, 14 Jul 2024 21:33:36 -0700 +Subject: ext4: fix timer use-after-free on failed mount + +From: Xiaxi Shen + +commit 0ce160c5bdb67081a62293028dc85758a8efb22a upstream. + +Syzbot has found an ODEBUG bug in ext4_fill_super + +The del_timer_sync function cancels the s_err_report timer, +which reminds about filesystem errors daily. We should +guarantee the timer is no longer active before kfree(sbi). + +When filesystem mounting fails, the flow goes to failed_mount3, +where an error occurs when ext4_stop_mmpd is called, causing +a read I/O failure. This triggers the ext4_handle_error function +that ultimately re-arms the timer, +leaving the s_err_report timer active before kfree(sbi) is called. + +Fix the issue by canceling the s_err_report timer after calling ext4_stop_mmpd. 
+ +Signed-off-by: Xiaxi Shen +Reported-and-tested-by: syzbot+59e0101c430934bc9a36@syzkaller.appspotmail.com +Closes: https://syzkaller.appspot.com/bug?extid=59e0101c430934bc9a36 +Link: https://patch.msgid.link/20240715043336.98097-1-shenxiaxi26@gmail.com +Signed-off-by: Theodore Ts'o +Cc: stable@kernel.org +[Minor context change fixed] +Signed-off-by: Xiangyu Chen +Signed-off-by: He Zhe +Signed-off-by: Greg Kroah-Hartman +--- + fs/ext4/super.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/ext4/super.c ++++ b/fs/ext4/super.c +@@ -5085,8 +5085,8 @@ failed_mount3a: + failed_mount3: + /* flush s_error_work before sbi destroy */ + flush_work(&sbi->s_error_work); +- del_timer_sync(&sbi->s_err_report); + ext4_stop_mmpd(sbi); ++ del_timer_sync(&sbi->s_err_report); + failed_mount2: + rcu_read_lock(); + group_desc = rcu_dereference(sbi->s_group_desc); diff --git a/queue-5.15/ipvs-properly-dereference-pe-in-ip_vs_add_service.patch b/queue-5.15/ipvs-properly-dereference-pe-in-ip_vs_add_service.patch new file mode 100644 index 0000000000..80884c5a8a --- /dev/null +++ b/queue-5.15/ipvs-properly-dereference-pe-in-ip_vs_add_service.patch @@ -0,0 +1,53 @@ +From cbd070a4ae62f119058973f6d2c984e325bce6e7 Mon Sep 17 00:00:00 2001 +From: Chen Hanxiao +Date: Thu, 27 Jun 2024 14:15:15 +0800 +Subject: ipvs: properly dereference pe in ip_vs_add_service + +From: Chen Hanxiao + +commit cbd070a4ae62f119058973f6d2c984e325bce6e7 upstream. 
+ +Use pe directly to resolve sparse warning: + + net/netfilter/ipvs/ip_vs_ctl.c:1471:27: warning: dereference of noderef expression + +Fixes: 39b972231536 ("ipvs: handle connections started by real-servers") +Signed-off-by: Chen Hanxiao +Acked-by: Julian Anastasov +Acked-by: Simon Horman +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Cliff Liu +Signed-off-by: He Zhe +Signed-off-by: Greg Kroah-Hartman +--- + net/netfilter/ipvs/ip_vs_ctl.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +--- a/net/netfilter/ipvs/ip_vs_ctl.c ++++ b/net/netfilter/ipvs/ip_vs_ctl.c +@@ -1384,20 +1384,20 @@ ip_vs_add_service(struct netns_ipvs *ipv + sched = NULL; + } + +- /* Bind the ct retriever */ +- RCU_INIT_POINTER(svc->pe, pe); +- pe = NULL; +- + /* Update the virtual service counters */ + if (svc->port == FTPPORT) + atomic_inc(&ipvs->ftpsvc_counter); + else if (svc->port == 0) + atomic_inc(&ipvs->nullsvc_counter); +- if (svc->pe && svc->pe->conn_out) ++ if (pe && pe->conn_out) + atomic_inc(&ipvs->conn_out_counter); + + ip_vs_start_estimator(ipvs, &svc->stats); + ++ /* Bind the ct retriever */ ++ RCU_INIT_POINTER(svc->pe, pe); ++ pe = NULL; ++ + /* Count only IPv4 services for old get/setsockopt interface */ + if (svc->af == AF_INET) + ipvs->num_services++; diff --git a/queue-5.15/net-openvswitch-fix-race-on-port-output.patch b/queue-5.15/net-openvswitch-fix-race-on-port-output.patch new file mode 100644 index 0000000000..06ebb39e68 --- /dev/null +++ b/queue-5.15/net-openvswitch-fix-race-on-port-output.patch @@ -0,0 +1,235 @@ +From 066b86787fa3d97b7aefb5ac0a99a22dad2d15f8 Mon Sep 17 00:00:00 2001 +From: Felix Huettner +Date: Wed, 5 Apr 2023 07:53:41 +0000 +Subject: net: openvswitch: fix race on port output + +From: Felix Huettner + +commit 066b86787fa3d97b7aefb5ac0a99a22dad2d15f8 upstream. + +assume the following setup on a single machine: +1. An openvswitch instance with one bridge and default flows +2. two network namespaces "server" and "client" +3. 
two ovs interfaces "server" and "client" on the bridge +4. for each ovs interface a veth pair with a matching name and 32 rx and + tx queues +5. move the ends of the veth pairs to the respective network namespaces +6. assign ip addresses to each of the veth ends in the namespaces (needs + to be the same subnet) +7. start some http server on the server network namespace +8. test if a client in the client namespace can reach the http server + +when following the actions below the host has a chance of getting a cpu +stuck in an infinite loop: +1. send a large number of parallel requests to the http server (around + 3000 curls should work) +2. in parallel delete the network namespace (do not delete interfaces or + stop the server, just kill the namespace) + +there is a low chance that this will cause the below kernel cpu stuck +message. If this does not happen just retry. +Below there is also the output of bpftrace for the functions mentioned +in the output. + +The series of events happening here is: +1. the network namespace is deleted calling + `unregister_netdevice_many_notify` somewhere in the process +2. this sets first `NETREG_UNREGISTERING` on both ends of the veth and + then runs `synchronize_net` +3. it then calls `call_netdevice_notifiers` with `NETDEV_UNREGISTER` +4. this is then handled by `dp_device_event` which calls + `ovs_netdev_detach_dev` (if a vport is found, which is the case for + the veth interface attached to ovs) +5. this removes the rx_handlers of the device but does not prevent + packets from being sent to the device +6. `dp_device_event` then queues the vport deletion to work in + background as an ovs_lock is needed that we do not hold in the + unregistration path +7. `unregister_netdevice_many_notify` continues to call + `netdev_unregister_kobject` which sets `real_num_tx_queues` to 0 +8. port deletion continues (but details are not relevant for this issue) +9. at some future point the background task deletes the vport + +If after 7. but before 9. 
a packet is send to the ovs vport (which is +not deleted at this point in time) which forwards it to the +`dev_queue_xmit` flow even though the device is unregistering. +In `skb_tx_hash` (which is called in the `dev_queue_xmit`) path there is +a while loop (if the packet has a rx_queue recorded) that is infinite if +`dev->real_num_tx_queues` is zero. + +To prevent this from happening we update `do_output` to handle devices +without carrier the same as if the device is not found (which would +be the code path after 9. is done). + +Additionally we now produce a warning in `skb_tx_hash` if we will hit +the infinite loop. + +bpftrace (first word is function name): + +__dev_queue_xmit server: real_num_tx_queues: 1, cpu: 2, pid: 28024, tid: 28024, skb_addr: 0xffff9edb6f207000, reg_state: 1 +netdev_core_pick_tx server: addr: 0xffff9f0a46d4a000 real_num_tx_queues: 1, cpu: 2, pid: 28024, tid: 28024, skb_addr: 0xffff9edb6f207000, reg_state: 1 +dp_device_event server: real_num_tx_queues: 1 cpu 9, pid: 21024, tid: 21024, event 2, reg_state: 1 +synchronize_rcu_expedited: cpu 9, pid: 21024, tid: 21024 +synchronize_rcu_expedited: cpu 9, pid: 21024, tid: 21024 +synchronize_rcu_expedited: cpu 9, pid: 21024, tid: 21024 +synchronize_rcu_expedited: cpu 9, pid: 21024, tid: 21024 +dp_device_event server: real_num_tx_queues: 1 cpu 9, pid: 21024, tid: 21024, event 6, reg_state: 2 +ovs_netdev_detach_dev server: real_num_tx_queues: 1 cpu 9, pid: 21024, tid: 21024, reg_state: 2 +netdev_rx_handler_unregister server: real_num_tx_queues: 1, cpu: 9, pid: 21024, tid: 21024, reg_state: 2 +synchronize_rcu_expedited: cpu 9, pid: 21024, tid: 21024 +netdev_rx_handler_unregister ret server: real_num_tx_queues: 1, cpu: 9, pid: 21024, tid: 21024, reg_state: 2 +dp_device_event server: real_num_tx_queues: 1 cpu 9, pid: 21024, tid: 21024, event 27, reg_state: 2 +dp_device_event server: real_num_tx_queues: 1 cpu 9, pid: 21024, tid: 21024, event 22, reg_state: 2 +dp_device_event server: real_num_tx_queues: 1 
cpu 9, pid: 21024, tid: 21024, event 18, reg_state: 2 +netdev_unregister_kobject: real_num_tx_queues: 1, cpu: 9, pid: 21024, tid: 21024 +synchronize_rcu_expedited: cpu 9, pid: 21024, tid: 21024 +ovs_vport_send server: real_num_tx_queues: 0, cpu: 2, pid: 28024, tid: 28024, skb_addr: 0xffff9edb6f207000, reg_state: 2 +__dev_queue_xmit server: real_num_tx_queues: 0, cpu: 2, pid: 28024, tid: 28024, skb_addr: 0xffff9edb6f207000, reg_state: 2 +netdev_core_pick_tx server: addr: 0xffff9f0a46d4a000 real_num_tx_queues: 0, cpu: 2, pid: 28024, tid: 28024, skb_addr: 0xffff9edb6f207000, reg_state: 2 +broken device server: real_num_tx_queues: 0, cpu: 2, pid: 28024, tid: 28024 +ovs_dp_detach_port server: real_num_tx_queues: 0 cpu 9, pid: 9124, tid: 9124, reg_state: 2 +synchronize_rcu_expedited: cpu 9, pid: 33604, tid: 33604 + +stuck message: + +watchdog: BUG: soft lockup - CPU#5 stuck for 26s! [curl:1929279] +Modules linked in: veth pktgen bridge stp llc ip_set_hash_net nft_counter xt_set nft_compat nf_tables ip_set_hash_ip ip_set nfnetlink_cttimeout nfnetlink openvswitch nsh nf_conncount nf_nat nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 tls binfmt_misc nls_iso8859_1 input_leds joydev serio_raw dm_multipath scsi_dh_rdac scsi_dh_emc scsi_dh_alua sch_fq_codel drm efi_pstore virtio_rng ip_tables x_tables autofs4 btrfs blake2b_generic zstd_compress raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx xor raid6_pq libcrc32c raid1 raid0 multipath linear hid_generic usbhid hid crct10dif_pclmul crc32_pclmul ghash_clmulni_intel aesni_intel virtio_net ahci net_failover crypto_simd cryptd psmouse libahci virtio_blk failover +CPU: 5 PID: 1929279 Comm: curl Not tainted 5.15.0-67-generic #74-Ubuntu +Hardware name: OpenStack Foundation OpenStack Nova, BIOS rel-1.16.0-0-gd239552ce722-prebuilt.qemu.org 04/01/2014 +RIP: 0010:netdev_pick_tx+0xf1/0x320 +Code: 00 00 8d 48 ff 0f b7 c1 66 39 ca 0f 86 e9 01 00 00 45 0f b7 ff 41 39 c7 0f 87 5b 01 00 00 44 29 f8 41 39 c7 0f 87 4f 01 00 
00 f2 0f 1f 44 00 00 49 8b 94 24 28 04 00 00 48 85 d2 0f 84 53 01 +RSP: 0018:ffffb78b40298820 EFLAGS: 00000246 +RAX: 0000000000000000 RBX: ffff9c8773adc2e0 RCX: 000000000000083f +RDX: 0000000000000000 RSI: ffff9c8773adc2e0 RDI: ffff9c870a25e000 +RBP: ffffb78b40298858 R08: 0000000000000001 R09: 0000000000000000 +R10: 0000000000000000 R11: 0000000000000000 R12: ffff9c870a25e000 +R13: ffff9c870a25e000 R14: ffff9c87fe043480 R15: 0000000000000000 +FS: 00007f7b80008f00(0000) GS:ffff9c8e5f740000(0000) knlGS:0000000000000000 +CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +CR2: 00007f7b80f6a0b0 CR3: 0000000329d66000 CR4: 0000000000350ee0 +Call Trace: + + netdev_core_pick_tx+0xa4/0xb0 + __dev_queue_xmit+0xf8/0x510 + ? __bpf_prog_exit+0x1e/0x30 + dev_queue_xmit+0x10/0x20 + ovs_vport_send+0xad/0x170 [openvswitch] + do_output+0x59/0x180 [openvswitch] + do_execute_actions+0xa80/0xaa0 [openvswitch] + ? kfree+0x1/0x250 + ? kfree+0x1/0x250 + ? kprobe_perf_func+0x4f/0x2b0 + ? flow_lookup.constprop.0+0x5c/0x110 [openvswitch] + ovs_execute_actions+0x4c/0x120 [openvswitch] + ovs_dp_process_packet+0xa1/0x200 [openvswitch] + ? ovs_ct_update_key.isra.0+0xa8/0x120 [openvswitch] + ? ovs_ct_fill_key+0x1d/0x30 [openvswitch] + ? ovs_flow_key_extract+0x2db/0x350 [openvswitch] + ovs_vport_receive+0x77/0xd0 [openvswitch] + ? __htab_map_lookup_elem+0x4e/0x60 + ? bpf_prog_680e8aff8547aec1_kfree+0x3b/0x714 + ? trace_call_bpf+0xc8/0x150 + ? kfree+0x1/0x250 + ? kfree+0x1/0x250 + ? kprobe_perf_func+0x4f/0x2b0 + ? kprobe_perf_func+0x4f/0x2b0 + ? __mod_memcg_lruvec_state+0x63/0xe0 + netdev_port_receive+0xc4/0x180 [openvswitch] + ? netdev_port_receive+0x180/0x180 [openvswitch] + netdev_frame_hook+0x1f/0x40 [openvswitch] + __netif_receive_skb_core.constprop.0+0x23d/0xf00 + __netif_receive_skb_one_core+0x3f/0xa0 + __netif_receive_skb+0x15/0x60 + process_backlog+0x9e/0x170 + __napi_poll+0x33/0x180 + net_rx_action+0x126/0x280 + ? ttwu_do_activate+0x72/0xf0 + __do_softirq+0xd9/0x2e7 + ? 
rcu_report_exp_cpu_mult+0x1b0/0x1b0 + do_softirq+0x7d/0xb0 + + + __local_bh_enable_ip+0x54/0x60 + ip_finish_output2+0x191/0x460 + __ip_finish_output+0xb7/0x180 + ip_finish_output+0x2e/0xc0 + ip_output+0x78/0x100 + ? __ip_finish_output+0x180/0x180 + ip_local_out+0x5e/0x70 + __ip_queue_xmit+0x184/0x440 + ? tcp_syn_options+0x1f9/0x300 + ip_queue_xmit+0x15/0x20 + __tcp_transmit_skb+0x910/0x9c0 + ? __mod_memcg_state+0x44/0xa0 + tcp_connect+0x437/0x4e0 + ? ktime_get_with_offset+0x60/0xf0 + tcp_v4_connect+0x436/0x530 + __inet_stream_connect+0xd4/0x3a0 + ? kprobe_perf_func+0x4f/0x2b0 + ? aa_sk_perm+0x43/0x1c0 + inet_stream_connect+0x3b/0x60 + __sys_connect_file+0x63/0x70 + __sys_connect+0xa6/0xd0 + ? setfl+0x108/0x170 + ? do_fcntl+0xe8/0x5a0 + __x64_sys_connect+0x18/0x20 + do_syscall_64+0x5c/0xc0 + ? __x64_sys_fcntl+0xa9/0xd0 + ? exit_to_user_mode_prepare+0x37/0xb0 + ? syscall_exit_to_user_mode+0x27/0x50 + ? do_syscall_64+0x69/0xc0 + ? __sys_setsockopt+0xea/0x1e0 + ? exit_to_user_mode_prepare+0x37/0xb0 + ? syscall_exit_to_user_mode+0x27/0x50 + ? __x64_sys_setsockopt+0x1f/0x30 + ? do_syscall_64+0x69/0xc0 + ? irqentry_exit+0x1d/0x30 + ? 
exc_page_fault+0x89/0x170 + entry_SYSCALL_64_after_hwframe+0x61/0xcb +RIP: 0033:0x7f7b8101c6a7 +Code: 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 90 f3 0f 1e fa 64 8b 04 25 18 00 00 00 85 c0 75 10 b8 2a 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 51 c3 48 83 ec 18 89 54 24 0c 48 89 34 24 89 +RSP: 002b:00007ffffd6b2198 EFLAGS: 00000246 ORIG_RAX: 000000000000002a +RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f7b8101c6a7 +RDX: 0000000000000010 RSI: 00007ffffd6b2360 RDI: 0000000000000005 +RBP: 0000561f1370d560 R08: 00002795ad21d1ac R09: 0030312e302e302e +R10: 00007ffffd73f080 R11: 0000000000000246 R12: 0000561f1370c410 +R13: 0000000000000000 R14: 0000000000000005 R15: 0000000000000000 + + +Fixes: 7f8a436eaa2c ("openvswitch: Add conntrack action") +Co-developed-by: Luca Czesla +Signed-off-by: Luca Czesla +Signed-off-by: Felix Huettner +Reviewed-by: Eric Dumazet +Reviewed-by: Simon Horman +Link: https://lore.kernel.org/r/ZC0pBXBAgh7c76CA@kernel-bug-kernel-bug +Signed-off-by: Jakub Kicinski +Signed-off-by: Carlos Soto +Signed-off-by: Florian Fainelli +Signed-off-by: Greg Kroah-Hartman +--- + net/core/dev.c | 1 + + net/openvswitch/actions.c | 2 +- + 2 files changed, 2 insertions(+), 1 deletion(-) + +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -3231,6 +3231,7 @@ static u16 skb_tx_hash(const struct net_ + } + + if (skb_rx_queue_recorded(skb)) { ++ BUILD_BUG_ON_INVALID(qcount == 0); + hash = skb_get_rx_queue(skb); + if (hash >= qoffset) + hash -= qoffset; +--- a/net/openvswitch/actions.c ++++ b/net/openvswitch/actions.c +@@ -913,7 +913,7 @@ static void do_output(struct datapath *d + { + struct vport *vport = ovs_vport_rcu(dp, out_port); + +- if (likely(vport)) { ++ if (likely(vport && netif_carrier_ok(vport->dev))) { + u16 mru = OVS_CB(skb)->mru; + u32 cutlen = OVS_CB(skb)->cutlen; + diff --git a/queue-5.15/openvswitch-fix-lockup-on-tx-to-unregistering-netdev-with-carrier.patch 
b/queue-5.15/openvswitch-fix-lockup-on-tx-to-unregistering-netdev-with-carrier.patch new file mode 100644 index 0000000000..a17c6ea1b0 --- /dev/null +++ b/queue-5.15/openvswitch-fix-lockup-on-tx-to-unregistering-netdev-with-carrier.patch @@ -0,0 +1,77 @@ +From 47e55e4b410f7d552e43011baa5be1aab4093990 Mon Sep 17 00:00:00 2001 +From: Ilya Maximets +Date: Thu, 9 Jan 2025 13:21:24 +0100 +Subject: openvswitch: fix lockup on tx to unregistering netdev with carrier + +From: Ilya Maximets + +commit 47e55e4b410f7d552e43011baa5be1aab4093990 upstream. + +Commit in a fixes tag attempted to fix the issue in the following +sequence of calls: + + do_output + -> ovs_vport_send + -> dev_queue_xmit + -> __dev_queue_xmit + -> netdev_core_pick_tx + -> skb_tx_hash + +When device is unregistering, the 'dev->real_num_tx_queues' goes to +zero and the 'while (unlikely(hash >= qcount))' loop inside the +'skb_tx_hash' becomes infinite, locking up the core forever. + +But unfortunately, checking just the carrier status is not enough to +fix the issue, because some devices may still be in unregistering +state while reporting carrier status OK. + +One example of such device is a net/dummy. It sets carrier ON +on start, but it doesn't implement .ndo_stop to set the carrier off. +And it makes sense, because dummy doesn't really have a carrier. +Therefore, while this device is unregistering, it's still easy to hit +the infinite loop in the skb_tx_hash() from the OVS datapath. There +might be other drivers that do the same, but dummy by itself is +important for the OVS ecosystem, because it is frequently used as a +packet sink for tcpdump while debugging OVS deployments. And when the +issue is hit, the only way to recover is to reboot. + +Fix that by also checking if the device is running. The running +state is handled by the net core during unregistering, so it covers +unregistering case better, and we don't really need to send packets +to devices that are not running anyway. 
+ +While only checking the running state might be enough, the carrier +check is preserved. The running and the carrier states seem disjoined +throughout the code and different drivers. And other core functions +like __dev_direct_xmit() check both before attempting to transmit +a packet. So, it seems safer to check both flags in OVS as well. + +Fixes: 066b86787fa3 ("net: openvswitch: fix race on port output") +Reported-by: Friedrich Weber +Closes: https://mail.openvswitch.org/pipermail/ovs-discuss/2025-January/053423.html +Signed-off-by: Ilya Maximets +Tested-by: Friedrich Weber +Reviewed-by: Aaron Conole +Link: https://patch.msgid.link/20250109122225.4034688-1-i.maximets@ovn.org +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +Signed-off-by: Carlos Soto +Signed-off-by: Florian Fainelli +Signed-off-by: Greg Kroah-Hartman +--- + net/openvswitch/actions.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/net/openvswitch/actions.c ++++ b/net/openvswitch/actions.c +@@ -913,7 +913,9 @@ static void do_output(struct datapath *d + { + struct vport *vport = ovs_vport_rcu(dp, out_port); + +- if (likely(vport && netif_carrier_ok(vport->dev))) { ++ if (likely(vport && ++ netif_running(vport->dev) && ++ netif_carrier_ok(vport->dev))) { + u16 mru = OVS_CB(skb)->mru; + u32 cutlen = OVS_CB(skb)->cutlen; + diff --git a/queue-5.15/sched-task_stack-fix-object_is_on_stack-for-kasan-tagged-pointers.patch b/queue-5.15/sched-task_stack-fix-object_is_on_stack-for-kasan-tagged-pointers.patch new file mode 100644 index 0000000000..7231aeab6e --- /dev/null +++ b/queue-5.15/sched-task_stack-fix-object_is_on_stack-for-kasan-tagged-pointers.patch @@ -0,0 +1,92 @@ +From fd7b4f9f46d46acbc7af3a439bb0d869efdc5c58 Mon Sep 17 00:00:00 2001 +From: Qun-Wei Lin +Date: Wed, 13 Nov 2024 12:25:43 +0800 +Subject: sched/task_stack: fix object_is_on_stack() for KASAN tagged pointers + +From: Qun-Wei Lin + +commit fd7b4f9f46d46acbc7af3a439bb0d869efdc5c58 upstream. 
+ +When CONFIG_KASAN_SW_TAGS and CONFIG_KASAN_STACK are enabled, the +object_is_on_stack() function may produce incorrect results due to the +presence of tags in the obj pointer, while the stack pointer does not have +tags. This discrepancy can lead to incorrect stack object detection and +subsequently trigger warnings if CONFIG_DEBUG_OBJECTS is also enabled. + +Example of the warning: + +ODEBUG: object 3eff800082ea7bb0 is NOT on stack ffff800082ea0000, but annotated. +------------[ cut here ]------------ +WARNING: CPU: 0 PID: 1 at lib/debugobjects.c:557 __debug_object_init+0x330/0x364 +Modules linked in: +CPU: 0 UID: 0 PID: 1 Comm: swapper/0 Not tainted 6.12.0-rc5 #4 +Hardware name: linux,dummy-virt (DT) +pstate: 600000c5 (nZCv daIF -PAN -UAO -TCO -DIT -SSBS BTYPE=--) +pc : __debug_object_init+0x330/0x364 +lr : __debug_object_init+0x330/0x364 +sp : ffff800082ea7b40 +x29: ffff800082ea7b40 x28: 98ff0000c0164518 x27: 98ff0000c0164534 +x26: ffff800082d93ec8 x25: 0000000000000001 x24: 1cff0000c00172a0 +x23: 0000000000000000 x22: ffff800082d93ed0 x21: ffff800081a24418 +x20: 3eff800082ea7bb0 x19: efff800000000000 x18: 0000000000000000 +x17: 00000000000000ff x16: 0000000000000047 x15: 206b63617473206e +x14: 0000000000000018 x13: ffff800082ea7780 x12: 0ffff800082ea78e +x11: 0ffff800082ea790 x10: 0ffff800082ea79d x9 : 34d77febe173e800 +x8 : 34d77febe173e800 x7 : 0000000000000001 x6 : 0000000000000001 +x5 : feff800082ea74b8 x4 : ffff800082870a90 x3 : ffff80008018d3c4 +x2 : 0000000000000001 x1 : ffff800082858810 x0 : 0000000000000050 +Call trace: + __debug_object_init+0x330/0x364 + debug_object_init_on_stack+0x30/0x3c + schedule_hrtimeout_range_clock+0xac/0x26c + schedule_hrtimeout+0x1c/0x30 + wait_task_inactive+0x1d4/0x25c + kthread_bind_mask+0x28/0x98 + init_rescuer+0x1e8/0x280 + workqueue_init+0x1a0/0x3cc + kernel_init_freeable+0x118/0x200 + kernel_init+0x28/0x1f0 + ret_from_fork+0x10/0x20 +---[ end trace 0000000000000000 ]--- +ODEBUG: object 3eff800082ea7bb0 is NOT on 
stack ffff800082ea0000, but annotated. +------------[ cut here ]------------ + +Link: https://lkml.kernel.org/r/20241113042544.19095-1-qun-wei.lin@mediatek.com +Signed-off-by: Qun-Wei Lin +Cc: Andrew Yang +Cc: AngeloGioacchino Del Regno +Cc: Casper Li +Cc: Catalin Marinas +Cc: Chinwen Chang +Cc: Kent Overstreet +Cc: Matthias Brugger +Cc: Pasha Tatashin +Cc: Shakeel Butt +Cc: +Signed-off-by: Andrew Morton +[Minor context change fixed] +Signed-off-by: Zhi Yang +Signed-off-by: He Zhe +Signed-off-by: Greg Kroah-Hartman +--- + include/linux/sched/task_stack.h | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/include/linux/sched/task_stack.h ++++ b/include/linux/sched/task_stack.h +@@ -8,6 +8,7 @@ + + #include <linux/sched.h> + #include <linux/magic.h> ++#include <linux/kasan.h> + + #ifdef CONFIG_THREAD_INFO_IN_TASK + +@@ -86,6 +87,7 @@ static inline int object_is_on_stack(con + { + void *stack = task_stack_page(current); + ++ obj = kasan_reset_tag(obj); + return (obj >= stack) && (obj < (stack + THREAD_SIZE)); + } + diff --git a/queue-5.15/series b/queue-5.15/series index 918d005566..adb95b48fd 100644 --- a/queue-5.15/series +++ b/queue-5.15/series @@ -227,3 +227,12 @@ smb-client-fix-null-ptr-deref-in-crypto_aead_setkey.patch smb-server-fix-potential-null-ptr-deref-of-lease_ctx_info-in-smb2_open.patch smb-client-fix-potential-deadlock-when-releasing-mids.patch smb-client-fix-potential-uaf-in-cifs_stats_proc_show.patch +sched-task_stack-fix-object_is_on_stack-for-kasan-tagged-pointers.patch +bpf-avoid-holding-freeze_mutex-during-mmap-operation.patch +bpf-check-rcu_read_lock_trace_held-before-calling-bpf-map-helpers.patch +blk-cgroup-support-to-track-if-policy-is-online.patch +blk-iocost-do-not-warn-if-iocg-was-already-offlined.patch +ext4-fix-timer-use-after-free-on-failed-mount.patch +ipvs-properly-dereference-pe-in-ip_vs_add_service.patch +net-openvswitch-fix-race-on-port-output.patch +openvswitch-fix-lockup-on-tx-to-unregistering-netdev-with-carrier.patch