From 98ef9739fd4f5cbc6693b32df8d7f430b8b1f052 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Sat, 7 Feb 2026 16:43:29 +0100 Subject: [PATCH] 6.6-stable patches added patches: hfsplus-fix-slab-out-of-bounds-read-in-hfsplus_uni2asc.patch netfilter-nft_set_pipapo-clamp-maximum-map-bucket-size-to-int_max.patch sched-rt-fix-race-in-push_rt_task.patch ublk-fix-deadlock-when-reading-partition-table.patch --- ...ut-of-bounds-read-in-hfsplus_uni2asc.patch | 176 +++++++++++ ...p-maximum-map-bucket-size-to-int_max.patch | 49 +++ .../sched-rt-fix-race-in-push_rt_task.patch | 292 ++++++++++++++++++ queue-6.6/series | 4 + ...eadlock-when-reading-partition-table.patch | 107 +++++++ 5 files changed, 628 insertions(+) create mode 100644 queue-6.6/hfsplus-fix-slab-out-of-bounds-read-in-hfsplus_uni2asc.patch create mode 100644 queue-6.6/netfilter-nft_set_pipapo-clamp-maximum-map-bucket-size-to-int_max.patch create mode 100644 queue-6.6/sched-rt-fix-race-in-push_rt_task.patch create mode 100644 queue-6.6/ublk-fix-deadlock-when-reading-partition-table.patch diff --git a/queue-6.6/hfsplus-fix-slab-out-of-bounds-read-in-hfsplus_uni2asc.patch b/queue-6.6/hfsplus-fix-slab-out-of-bounds-read-in-hfsplus_uni2asc.patch new file mode 100644 index 0000000000..e84dc21511 --- /dev/null +++ b/queue-6.6/hfsplus-fix-slab-out-of-bounds-read-in-hfsplus_uni2asc.patch @@ -0,0 +1,176 @@ +From bea3e1d4467bcf292c8e54f080353d556d355e26 Mon Sep 17 00:00:00 2001 +From: Kang Chen +Date: Tue, 9 Sep 2025 11:13:16 +0800 +Subject: hfsplus: fix slab-out-of-bounds read in hfsplus_uni2asc() + +From: Kang Chen + +commit bea3e1d4467bcf292c8e54f080353d556d355e26 upstream. + +BUG: KASAN: slab-out-of-bounds in hfsplus_uni2asc+0xa71/0xb90 fs/hfsplus/unicode.c:186 +Read of size 2 at addr ffff8880289ef218 by task syz.6.248/14290 + +CPU: 0 UID: 0 PID: 14290 Comm: syz.6.248 Not tainted 6.16.4 #1 PREEMPT(full) +Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 04/01/2014 +Call Trace: + + __dump_stack lib/dump_stack.c:94 [inline] + dump_stack_lvl+0x116/0x1b0 lib/dump_stack.c:120 + print_address_description mm/kasan/report.c:378 [inline] + print_report+0xca/0x5f0 mm/kasan/report.c:482 + kasan_report+0xca/0x100 mm/kasan/report.c:595 + hfsplus_uni2asc+0xa71/0xb90 fs/hfsplus/unicode.c:186 + hfsplus_listxattr+0x5b6/0xbd0 fs/hfsplus/xattr.c:738 + vfs_listxattr+0xbe/0x140 fs/xattr.c:493 + listxattr+0xee/0x190 fs/xattr.c:924 + filename_listxattr fs/xattr.c:958 [inline] + path_listxattrat+0x143/0x360 fs/xattr.c:988 + do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline] + do_syscall_64+0xcb/0x4c0 arch/x86/entry/syscall_64.c:94 + entry_SYSCALL_64_after_hwframe+0x77/0x7f +RIP: 0033:0x7fe0e9fae16d +Code: 02 b8 ff ff ff ff c3 66 0f 1f 44 00 00 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48 +RSP: 002b:00007fe0eae67f98 EFLAGS: 00000246 ORIG_RAX: 00000000000000c3 +RAX: ffffffffffffffda RBX: 00007fe0ea205fa0 RCX: 00007fe0e9fae16d +RDX: 0000000000000000 RSI: 0000000000000000 RDI: 0000200000000000 +RBP: 00007fe0ea0480f0 R08: 0000000000000000 R09: 0000000000000000 +R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000 +R13: 00007fe0ea206038 R14: 00007fe0ea205fa0 R15: 00007fe0eae48000 + + +Allocated by task 14290: + kasan_save_stack+0x24/0x50 mm/kasan/common.c:47 + kasan_save_track+0x14/0x30 mm/kasan/common.c:68 + poison_kmalloc_redzone mm/kasan/common.c:377 [inline] + __kasan_kmalloc+0xaa/0xb0 mm/kasan/common.c:394 + 
kasan_kmalloc include/linux/kasan.h:260 [inline]
+ __do_kmalloc_node mm/slub.c:4333 [inline]
+ __kmalloc_noprof+0x219/0x540 mm/slub.c:4345
+ kmalloc_noprof include/linux/slab.h:909 [inline]
+ hfsplus_find_init+0x95/0x1f0 fs/hfsplus/bfind.c:21
+ hfsplus_listxattr+0x331/0xbd0 fs/hfsplus/xattr.c:697
+ vfs_listxattr+0xbe/0x140 fs/xattr.c:493
+ listxattr+0xee/0x190 fs/xattr.c:924
+ filename_listxattr fs/xattr.c:958 [inline]
+ path_listxattrat+0x143/0x360 fs/xattr.c:988
+ do_syscall_x64 arch/x86/entry/syscall_64.c:63 [inline]
+ do_syscall_64+0xcb/0x4c0 arch/x86/entry/syscall_64.c:94
+ entry_SYSCALL_64_after_hwframe+0x77/0x7f
+
+When hfsplus_uni2asc is called from hfsplus_listxattr,
+it actually passes in a struct hfsplus_attr_unistr*.
+The size of the corresponding structure is different from that of hfsplus_unistr,
+so the previous fix (94458781aee6) is insufficient.
+The pointer into the unicode buffer can still go beyond the allocated memory.
+
+This patch introduces two wrapper functions, hfsplus_uni2asc_xattr_str and
+hfsplus_uni2asc_str, to process the two unicode buffers,
+struct hfsplus_attr_unistr* and struct hfsplus_unistr* respectively.
+When the ustrlen value is bigger than the allocated memory size,
+it is limited to a safe size.
+
+Fixes: 94458781aee6 ("hfsplus: fix slab-out-of-bounds read in hfsplus_uni2asc()")
+Signed-off-by: Kang Chen
+Reviewed-by: Viacheslav Dubeyko
+Signed-off-by: Viacheslav Dubeyko
+Link: https://lore.kernel.org/r/20250909031316.1647094-1-k.chen@smail.nju.edu.cn
+Signed-off-by: Viacheslav Dubeyko
+Signed-off-by: Jianqiang kang
+Signed-off-by: Greg Kroah-Hartman
+---
+ fs/hfsplus/dir.c | 2 +-
+ fs/hfsplus/hfsplus_fs.h | 8 ++++++--
+ fs/hfsplus/unicode.c | 24 +++++++++++++++++++-----
+ fs/hfsplus/xattr.c | 6 +++---
+ 4 files changed, 29 insertions(+), 11 deletions(-)
+
+--- a/fs/hfsplus/dir.c
++++ b/fs/hfsplus/dir.c
+@@ -204,7 +204,7 @@ static int hfsplus_readdir(struct file *
+ fd.entrylength);
+ type = be16_to_cpu(entry.type);
+ len = NLS_MAX_CHARSET_SIZE * HFSPLUS_MAX_STRLEN;
+- err = hfsplus_uni2asc(sb, &fd.key->cat.name, strbuf, &len);
++ err = hfsplus_uni2asc_str(sb, &fd.key->cat.name, strbuf, &len);
+ if (err)
+ goto out;
+ if (type == HFSPLUS_FOLDER) {
+--- a/fs/hfsplus/hfsplus_fs.h
++++ b/fs/hfsplus/hfsplus_fs.h
+@@ -518,8 +518,12 @@ int hfsplus_strcasecmp(const struct hfsp
+ const struct hfsplus_unistr *s2);
+ int hfsplus_strcmp(const struct hfsplus_unistr *s1,
+ const struct hfsplus_unistr *s2);
+-int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr,
+- char *astr, int *len_p);
++int hfsplus_uni2asc_str(struct super_block *sb,
++ const struct hfsplus_unistr *ustr, char *astr,
++ int *len_p);
++int hfsplus_uni2asc_xattr_str(struct super_block *sb,
++ const struct hfsplus_attr_unistr *ustr,
++ char *astr, int *len_p);
+ int hfsplus_asc2uni(struct super_block *sb, struct hfsplus_unistr *ustr,
+ int max_unistr_len, const char *astr, int len);
+ int hfsplus_hash_dentry(const struct dentry *dentry, struct qstr *str);
+--- a/fs/hfsplus/unicode.c
++++ b/fs/hfsplus/unicode.c
+@@ -143,9 +143,8 @@ static u16 *hfsplus_compose_lookup(u16 *
+ return NULL;
+ }
+
+-int hfsplus_uni2asc(struct super_block *sb,
+- const struct hfsplus_unistr *ustr,
+- char *astr, int *len_p)
++static int hfsplus_uni2asc(struct super_block *sb, const struct hfsplus_unistr *ustr,
++ int max_len, char *astr, int *len_p)
+ {
+ const hfsplus_unichr *ip;
+ struct nls_table *nls = HFSPLUS_SB(sb)->nls;
+@@ -158,8 +157,8 @@ int hfsplus_uni2asc(struct super_block *
+ ip = ustr->unicode; + + ustrlen = be16_to_cpu(ustr->length); +- if (ustrlen > HFSPLUS_MAX_STRLEN) { +- ustrlen = HFSPLUS_MAX_STRLEN; ++ if (ustrlen > max_len) { ++ ustrlen = max_len; + pr_err("invalid length %u has been corrected to %d\n", + be16_to_cpu(ustr->length), ustrlen); + } +@@ -280,6 +279,21 @@ out: + return res; + } + ++inline int hfsplus_uni2asc_str(struct super_block *sb, ++ const struct hfsplus_unistr *ustr, char *astr, ++ int *len_p) ++{ ++ return hfsplus_uni2asc(sb, ustr, HFSPLUS_MAX_STRLEN, astr, len_p); ++} ++ ++inline int hfsplus_uni2asc_xattr_str(struct super_block *sb, ++ const struct hfsplus_attr_unistr *ustr, ++ char *astr, int *len_p) ++{ ++ return hfsplus_uni2asc(sb, (const struct hfsplus_unistr *)ustr, ++ HFSPLUS_ATTR_MAX_STRLEN, astr, len_p); ++} ++ + /* + * Convert one or more ASCII characters into a single unicode character. + * Returns the number of ASCII characters corresponding to the unicode char. +--- a/fs/hfsplus/xattr.c ++++ b/fs/hfsplus/xattr.c +@@ -737,9 +737,9 @@ ssize_t hfsplus_listxattr(struct dentry + goto end_listxattr; + + xattr_name_len = NLS_MAX_CHARSET_SIZE * HFSPLUS_ATTR_MAX_STRLEN; +- if (hfsplus_uni2asc(inode->i_sb, +- (const struct hfsplus_unistr *)&fd.key->attr.key_name, +- strbuf, &xattr_name_len)) { ++ if (hfsplus_uni2asc_xattr_str(inode->i_sb, ++ &fd.key->attr.key_name, strbuf, ++ &xattr_name_len)) { + pr_err("unicode conversion failed\n"); + res = -EIO; + goto end_listxattr; diff --git a/queue-6.6/netfilter-nft_set_pipapo-clamp-maximum-map-bucket-size-to-int_max.patch b/queue-6.6/netfilter-nft_set_pipapo-clamp-maximum-map-bucket-size-to-int_max.patch new file mode 100644 index 0000000000..332e07ba7e --- /dev/null +++ b/queue-6.6/netfilter-nft_set_pipapo-clamp-maximum-map-bucket-size-to-int_max.patch @@ -0,0 +1,49 @@ +From b85e3367a5716ed3662a4fe266525190d2af76df Mon Sep 17 00:00:00 2001 +From: Pablo Neira Ayuso +Date: Tue, 22 Apr 2025 21:52:44 +0200 +Subject: netfilter: nft_set_pipapo: clamp maximum map bucket size to INT_MAX + +From: Pablo Neira Ayuso + +commit b85e3367a5716ed3662a4fe266525190d2af76df upstream. + +Otherwise, it is possible to hit WARN_ON_ONCE in __kvmalloc_node_noprof() +when resizing hashtable because __GFP_NOWARN is unset. 
+
+Similar to:
+
+ b541ba7d1f5a ("netfilter: conntrack: clamp maximum hashtable size to INT_MAX")
+
+Reviewed-by: Stefano Brivio
+Signed-off-by: Pablo Neira Ayuso
+[ Keerthana: Handle freeing new_lt ]
+Signed-off-by: Keerthana K
+Signed-off-by: Greg Kroah-Hartman
+---
+ net/netfilter/nft_set_pipapo.c | 8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+--- a/net/netfilter/nft_set_pipapo.c
++++ b/net/netfilter/nft_set_pipapo.c
+@@ -667,6 +667,11 @@ static int pipapo_resize(struct nft_pipa
+ }
+
+ mt:
++ if (rules > (INT_MAX / sizeof(*new_mt))) {
++ kvfree(new_lt);
++ return -ENOMEM;
++ }
++
+ new_mt = kvmalloc(rules * sizeof(*new_mt), GFP_KERNEL);
+ if (!new_mt) {
+ kvfree(new_lt);
+@@ -1359,6 +1364,9 @@ static struct nft_pipapo_match *pipapo_c
+ src->bsize * sizeof(*dst->lt) *
+ src->groups * NFT_PIPAPO_BUCKETS(src->bb));
+
++ if (src->rules > (INT_MAX / sizeof(*src->mt)))
++ goto out_mt;
++
+ dst->mt = kvmalloc(src->rules * sizeof(*src->mt), GFP_KERNEL);
+ if (!dst->mt)
+ goto out_mt;
diff --git a/queue-6.6/sched-rt-fix-race-in-push_rt_task.patch b/queue-6.6/sched-rt-fix-race-in-push_rt_task.patch
new file mode 100644
index 0000000000..7b22273903
--- /dev/null
+++ b/queue-6.6/sched-rt-fix-race-in-push_rt_task.patch
@@ -0,0 +1,292 @@
+From 690e47d1403e90b7f2366f03b52ed3304194c793 Mon Sep 17 00:00:00 2001
+From: Harshit Agarwal
+Date: Tue, 25 Feb 2025 18:05:53 +0000
+Subject: sched/rt: Fix race in push_rt_task
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Harshit Agarwal
+
+commit 690e47d1403e90b7f2366f03b52ed3304194c793 upstream.
+
+Overview
+========
+When a CPU chooses to call push_rt_task and picks a task to push to
+another CPU's runqueue then it will call the find_lock_lowest_rq method
+which would take a double lock on both CPUs' runqueues. If one of the
+locks isn't readily available, it may lead to dropping the current
+runqueue lock and reacquiring both the locks at once. During this window
+it is possible that the task is already migrated and is running on some
+other CPU. These cases are already handled. However, if the task is
+migrated and has already been executed and another CPU is now trying to
+wake it up (ttwu) such that it is queued again on the runqueue
+(on_rq is 1) and also if the task was run by the same CPU, then the
+current checks will pass even though the task was migrated out and is no
+longer in the pushable tasks list.
+
+Crashes
+=======
+This bug resulted in quite a few flavors of crashes triggering kernel
+panics with various crash signatures such as assert failures, page
+faults, null pointer dereferences, and queue corruption errors all
+coming from the scheduler itself.
+
+Some of the crashes:
+-> kernel BUG at kernel/sched/rt.c:1616! BUG_ON(idx >= MAX_RT_PRIO)
+ Call Trace:
+ ? __die_body+0x1a/0x60
+ ? die+0x2a/0x50
+ ? do_trap+0x85/0x100
+ ? pick_next_task_rt+0x6e/0x1d0
+ ? do_error_trap+0x64/0xa0
+ ? pick_next_task_rt+0x6e/0x1d0
+ ? exc_invalid_op+0x4c/0x60
+ ? pick_next_task_rt+0x6e/0x1d0
+ ? asm_exc_invalid_op+0x12/0x20
+ ? pick_next_task_rt+0x6e/0x1d0
+ __schedule+0x5cb/0x790
+ ? update_ts_time_stats+0x55/0x70
+ schedule_idle+0x1e/0x40
+ do_idle+0x15e/0x200
+ cpu_startup_entry+0x19/0x20
+ start_secondary+0x117/0x160
+ secondary_startup_64_no_verify+0xb0/0xbb
+
+-> BUG: kernel NULL pointer dereference, address: 00000000000000c0
+ Call Trace:
+ ? __die_body+0x1a/0x60
+ ? no_context+0x183/0x350
+ ? __warn+0x8a/0xe0
+ ? exc_page_fault+0x3d6/0x520
+ ? asm_exc_page_fault+0x1e/0x30
+ ?
pick_next_task_rt+0xb5/0x1d0 + ? pick_next_task_rt+0x8c/0x1d0 + __schedule+0x583/0x7e0 + ? update_ts_time_stats+0x55/0x70 + schedule_idle+0x1e/0x40 + do_idle+0x15e/0x200 + cpu_startup_entry+0x19/0x20 + start_secondary+0x117/0x160 + secondary_startup_64_no_verify+0xb0/0xbb + +-> BUG: unable to handle page fault for address: ffff9464daea5900 + kernel BUG at kernel/sched/rt.c:1861! BUG_ON(rq->cpu != task_cpu(p)) + +-> kernel BUG at kernel/sched/rt.c:1055! BUG_ON(!rq->nr_running) + Call Trace: + ? __die_body+0x1a/0x60 + ? die+0x2a/0x50 + ? do_trap+0x85/0x100 + ? dequeue_top_rt_rq+0xa2/0xb0 + ? do_error_trap+0x64/0xa0 + ? dequeue_top_rt_rq+0xa2/0xb0 + ? exc_invalid_op+0x4c/0x60 + ? dequeue_top_rt_rq+0xa2/0xb0 + ? asm_exc_invalid_op+0x12/0x20 + ? dequeue_top_rt_rq+0xa2/0xb0 + dequeue_rt_entity+0x1f/0x70 + dequeue_task_rt+0x2d/0x70 + __schedule+0x1a8/0x7e0 + ? blk_finish_plug+0x25/0x40 + schedule+0x3c/0xb0 + futex_wait_queue_me+0xb6/0x120 + futex_wait+0xd9/0x240 + do_futex+0x344/0xa90 + ? get_mm_exe_file+0x30/0x60 + ? audit_exe_compare+0x58/0x70 + ? audit_filter_rules.constprop.26+0x65e/0x1220 + __x64_sys_futex+0x148/0x1f0 + do_syscall_64+0x30/0x80 + entry_SYSCALL_64_after_hwframe+0x62/0xc7 + +-> BUG: unable to handle page fault for address: ffff8cf3608bc2c0 + Call Trace: + ? __die_body+0x1a/0x60 + ? no_context+0x183/0x350 + ? spurious_kernel_fault+0x171/0x1c0 + ? exc_page_fault+0x3b6/0x520 + ? plist_check_list+0x15/0x40 + ? plist_check_list+0x2e/0x40 + ? asm_exc_page_fault+0x1e/0x30 + ? _cond_resched+0x15/0x30 + ? futex_wait_queue_me+0xc8/0x120 + ? futex_wait+0xd9/0x240 + ? try_to_wake_up+0x1b8/0x490 + ? futex_wake+0x78/0x160 + ? do_futex+0xcd/0xa90 + ? plist_check_list+0x15/0x40 + ? plist_check_list+0x2e/0x40 + ? plist_del+0x6a/0xd0 + ? plist_check_list+0x15/0x40 + ? plist_check_list+0x2e/0x40 + ? dequeue_pushable_task+0x20/0x70 + ? __schedule+0x382/0x7e0 + ? asm_sysvec_reschedule_ipi+0xa/0x20 + ? schedule+0x3c/0xb0 + ? exit_to_user_mode_prepare+0x9e/0x150 + ? irqentry_exit_to_user_mode+0x5/0x30 + ? asm_sysvec_reschedule_ipi+0x12/0x20 + +Above are some of the common examples of the crashes that were observed +due to this issue. + +Details +======= +Let's look at the following scenario to understand this race. + +1) CPU A enters push_rt_task + a) CPU A has chosen next_task = task p. + b) CPU A calls find_lock_lowest_rq(Task p, CPU Z’s rq). + c) CPU A identifies CPU X as a destination CPU (X < Z). + d) CPU A enters double_lock_balance(CPU Z’s rq, CPU X’s rq). + e) Since X is lower than Z, CPU A unlocks CPU Z’s rq. Someone else has + locked CPU X’s rq, and thus, CPU A must wait. + +2) At CPU Z + a) Previous task has completed execution and thus, CPU Z enters + schedule, locks its own rq after CPU A releases it. + b) CPU Z dequeues previous task and begins executing task p. + c) CPU Z unlocks its rq. + d) Task p yields the CPU (ex. by doing IO or waiting to acquire a + lock) which triggers the schedule function on CPU Z. + e) CPU Z enters schedule again, locks its own rq, and dequeues task p. + f) As part of dequeue, it sets p.on_rq = 0 and unlocks its rq. + +3) At CPU B + a) CPU B enters try_to_wake_up with input task p. + b) Since CPU Z dequeued task p, p.on_rq = 0, and CPU B updates + B.state = WAKING. + c) CPU B via select_task_rq determines CPU Y as the target CPU. + +4) The race + a) CPU A acquires CPU X’s lock and relocks CPU Z. + b) CPU A reads task p.cpu = Z and incorrectly concludes task p is + still on CPU Z. 
+ c) CPU A failed to notice task p had been dequeued from CPU Z while + CPU A was waiting for locks in double_lock_balance. If CPU A knew + that task p had been dequeued, it would return NULL forcing + push_rt_task to give up the task p's migration. + d) CPU B updates task p.cpu = Y and calls ttwu_queue. + e) CPU B locks Ys rq. CPU B enqueues task p onto Y and sets task + p.on_rq = 1. + f) CPU B unlocks CPU Y, triggering memory synchronization. + g) CPU A reads task p.on_rq = 1, cementing its assumption that task p + has not migrated. + h) CPU A decides to migrate p to CPU X. + +This leads to A dequeuing p from Y's queue and various crashes down the +line. + +Solution +======== +The solution here is fairly simple. After obtaining the lock (at 4a), +the check is enhanced to make sure that the task is still at the head of +the pushable tasks list. If not, then it is anyway not suitable for +being pushed out. + +Testing +======= +The fix is tested on a cluster of 3 nodes, where the panics due to this +are hit every couple of days. A fix similar to this was deployed on such +cluster and was stable for more than 30 days. + +Co-developed-by: Jon Kohler +Signed-off-by: Jon Kohler +Co-developed-by: Gauri Patwardhan +Signed-off-by: Gauri Patwardhan +Co-developed-by: Rahul Chunduru +Signed-off-by: Rahul Chunduru +Signed-off-by: Harshit Agarwal +Signed-off-by: Peter Zijlstra (Intel) +Reviewed-by: "Steven Rostedt (Google)" +Reviewed-by: Phil Auld +Tested-by: Will Ton +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/20250225180553.167995-1-harshit@nutanix.com +Signed-off-by: Rajani Kantha <681739313@139.com> +Signed-off-by: Greg Kroah-Hartman +--- + kernel/sched/rt.c | 52 +++++++++++++++++++++++++--------------------------- + 1 file changed, 25 insertions(+), 27 deletions(-) + +--- a/kernel/sched/rt.c ++++ b/kernel/sched/rt.c +@@ -1963,6 +1963,26 @@ static int find_lowest_rq(struct task_st + return -1; + } + ++static struct task_struct *pick_next_pushable_task(struct rq *rq) ++{ ++ struct task_struct *p; ++ ++ if (!has_pushable_tasks(rq)) ++ return NULL; ++ ++ p = plist_first_entry(&rq->rt.pushable_tasks, ++ struct task_struct, pushable_tasks); ++ ++ BUG_ON(rq->cpu != task_cpu(p)); ++ BUG_ON(task_current(rq, p)); ++ BUG_ON(p->nr_cpus_allowed <= 1); ++ ++ BUG_ON(!task_on_rq_queued(p)); ++ BUG_ON(!rt_task(p)); ++ ++ return p; ++} ++ + /* Will lock the rq it finds */ + static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) + { +@@ -1993,18 +2013,16 @@ static struct rq *find_lock_lowest_rq(st + /* + * We had to unlock the run queue. In + * the mean time, task could have +- * migrated already or had its affinity changed. +- * Also make sure that it wasn't scheduled on its rq. ++ * migrated already or had its affinity changed, ++ * therefore check if the task is still at the ++ * head of the pushable tasks list. + * It is possible the task was scheduled, set + * "migrate_disabled" and then got preempted, so we must + * check the task migration disable flag here too. 
+ */ +- if (unlikely(task_rq(task) != rq || ++ if (unlikely(is_migration_disabled(task) || + !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask) || +- task_on_cpu(rq, task) || +- !rt_task(task) || +- is_migration_disabled(task) || +- !task_on_rq_queued(task))) { ++ task != pick_next_pushable_task(rq))) { + + double_unlock_balance(rq, lowest_rq); + lowest_rq = NULL; +@@ -2024,26 +2042,6 @@ static struct rq *find_lock_lowest_rq(st + return lowest_rq; + } + +-static struct task_struct *pick_next_pushable_task(struct rq *rq) +-{ +- struct task_struct *p; +- +- if (!has_pushable_tasks(rq)) +- return NULL; +- +- p = plist_first_entry(&rq->rt.pushable_tasks, +- struct task_struct, pushable_tasks); +- +- BUG_ON(rq->cpu != task_cpu(p)); +- BUG_ON(task_current(rq, p)); +- BUG_ON(p->nr_cpus_allowed <= 1); +- +- BUG_ON(!task_on_rq_queued(p)); +- BUG_ON(!rt_task(p)); +- +- return p; +-} +- + /* + * If the current CPU has more than one RT task, see if the non + * running task can migrate over to a CPU that is running a task diff --git a/queue-6.6/series b/queue-6.6/series index a5128ece6e..c6185d1a89 100644 --- a/queue-6.6/series +++ b/queue-6.6/series @@ -10,3 +10,7 @@ rbd-check-for-eod-after-exclusive-lock-is-ensured-to-be-held.patch arm-9468-1-fix-memset64-on-big-endian.patch revert-drm-amd-check-if-aspm-is-enabled-from-pcie-subsystem.patch kvm-don-t-clobber-irqfd-routing-type-when-deassigning-irqfd.patch +netfilter-nft_set_pipapo-clamp-maximum-map-bucket-size-to-int_max.patch +hfsplus-fix-slab-out-of-bounds-read-in-hfsplus_uni2asc.patch +ublk-fix-deadlock-when-reading-partition-table.patch +sched-rt-fix-race-in-push_rt_task.patch diff --git a/queue-6.6/ublk-fix-deadlock-when-reading-partition-table.patch b/queue-6.6/ublk-fix-deadlock-when-reading-partition-table.patch new file mode 100644 index 0000000000..b716b955cb --- /dev/null +++ b/queue-6.6/ublk-fix-deadlock-when-reading-partition-table.patch @@ -0,0 +1,107 @@ +From c258f5c4502c9667bccf5d76fa731ab9c96687c1 Mon Sep 17 00:00:00 2001 +From: Ming Lei +Date: Fri, 12 Dec 2025 22:34:15 +0800 +Subject: ublk: fix deadlock when reading partition table + +From: Ming Lei + +commit c258f5c4502c9667bccf5d76fa731ab9c96687c1 upstream. + +When one process(such as udev) opens ublk block device (e.g., to read +the partition table via bdev_open()), a deadlock[1] can occur: + +1. bdev_open() grabs disk->open_mutex +2. The process issues read I/O to ublk backend to read partition table +3. In __ublk_complete_rq(), blk_update_request() or blk_mq_end_request() + runs bio->bi_end_io() callbacks +4. If this triggers fput() on file descriptor of ublk block device, the + work may be deferred to current task's task work (see fput() implementation) +5. This eventually calls blkdev_release() from the same context +6. blkdev_release() tries to grab disk->open_mutex again +7. Deadlock: same task waiting for a mutex it already holds + +The fix is to run blk_update_request() and blk_mq_end_request() with bottom +halves disabled. This forces blkdev_release() to run in kernel work-queue +context instead of current task work context, and allows ublk server to make +forward progress, and avoids the deadlock. + +Fixes: 71f28f3136af ("ublk_drv: add io_uring based userspace block driver") +Link: https://github.com/ublk-org/ublksrv/issues/170 [1] +Signed-off-by: Ming Lei +Reviewed-by: Caleb Sander Mateos +[axboe: rewrite comment in ublk] +Signed-off-by: Jens Axboe +[ The fix omits the change in __ublk_do_auto_buf_reg() since this function + doesn't exist in Linux 6.6. 
] +Signed-off-by: Alva Lan +Signed-off-by: Greg Kroah-Hartman +--- + drivers/block/ublk_drv.c | 30 +++++++++++++++++++++++++++--- + 1 file changed, 27 insertions(+), 3 deletions(-) + +--- a/drivers/block/ublk_drv.c ++++ b/drivers/block/ublk_drv.c +@@ -1050,6 +1050,13 @@ static inline bool ubq_daemon_is_dying(s + return ubq->ubq_daemon->flags & PF_EXITING; + } + ++static void ublk_end_request(struct request *req, blk_status_t error) ++{ ++ local_bh_disable(); ++ blk_mq_end_request(req, error); ++ local_bh_enable(); ++} ++ + /* todo: handle partial completion */ + static inline void __ublk_complete_rq(struct request *req) + { +@@ -1057,6 +1064,7 @@ static inline void __ublk_complete_rq(st + struct ublk_io *io = &ubq->ios[req->tag]; + unsigned int unmapped_bytes; + blk_status_t res = BLK_STS_OK; ++ bool requeue; + + /* called from ublk_abort_queue() code path */ + if (io->flags & UBLK_IO_FLAG_ABORTED) { +@@ -1094,14 +1102,30 @@ static inline void __ublk_complete_rq(st + if (unlikely(unmapped_bytes < io->res)) + io->res = unmapped_bytes; + +- if (blk_update_request(req, BLK_STS_OK, io->res)) ++ /* ++ * Run bio->bi_end_io() with softirqs disabled. If the final fput ++ * happens off this path, then that will prevent ublk's blkdev_release() ++ * from being called on current's task work, see fput() implementation. ++ * ++ * Otherwise, ublk server may not provide forward progress in case of ++ * reading the partition table from bdev_open() with disk->open_mutex ++ * held, and causes dead lock as we could already be holding ++ * disk->open_mutex here. ++ * ++ * Preferably we would not be doing IO with a mutex held that is also ++ * used for release, but this work-around will suffice for now. ++ */ ++ local_bh_disable(); ++ requeue = blk_update_request(req, BLK_STS_OK, io->res); ++ local_bh_enable(); ++ if (requeue) + blk_mq_requeue_request(req, true); + else + __blk_mq_end_request(req, BLK_STS_OK); + + return; + exit: +- blk_mq_end_request(req, res); ++ ublk_end_request(req, res); + } + + static void ublk_complete_rq(struct kref *ref) +@@ -1160,7 +1184,7 @@ static inline void __ublk_abort_rq(struc + if (ublk_queue_can_use_recovery(ubq)) + blk_mq_requeue_request(rq, false); + else +- blk_mq_end_request(rq, BLK_STS_IOERR); ++ ublk_end_request(rq, BLK_STS_IOERR); + + mod_delayed_work(system_wq, &ubq->dev->monitor_work, 0); + } -- 2.47.3