From 11094dba5f8d0df69938f47f5edc5472ba790eca Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Fri, 17 Oct 2025 08:42:10 -0400 Subject: [PATCH] Fixes for all trees Signed-off-by: Sasha Levin --- ...fy-inode-mode-when-loading-from-disk.patch | 51 ++++++++ ..._options-to-set-initramfs-mount-opti.patch | 116 ++++++++++++++++++ ...fy-inode-mode-when-loading-from-disk.patch | 46 +++++++ ...-a-judgment-for-ns-null-in-pid_nr_ns.patch | 95 ++++++++++++++ ...pid_nr_ns-ns-null-safe-for-zombie-ca.patch | 48 ++++++++ queue-5.10/series | 5 + ...fy-inode-mode-when-loading-from-disk.patch | 51 ++++++++ ..._options-to-set-initramfs-mount-opti.patch | 116 ++++++++++++++++++ ...fy-inode-mode-when-loading-from-disk.patch | 46 +++++++ ...-a-judgment-for-ns-null-in-pid_nr_ns.patch | 95 ++++++++++++++ ...pid_nr_ns-ns-null-safe-for-zombie-ca.patch | 48 ++++++++ queue-5.15/series | 7 ++ ...excessively-long-inode-switching-tim.patch | 102 +++++++++++++++ ...softlockup-when-switching-many-inode.patch | 65 ++++++++++ ...fy-inode-mode-when-loading-from-disk.patch | 51 ++++++++ ..._options-to-set-initramfs-mount-opti.patch | 116 ++++++++++++++++++ ...fy-inode-mode-when-loading-from-disk.patch | 46 +++++++ ...-a-judgment-for-ns-null-in-pid_nr_ns.patch | 95 ++++++++++++++ queue-5.4/series | 4 + ...fy-inode-mode-when-loading-from-disk.patch | 51 ++++++++ ..._options-to-set-initramfs-mount-opti.patch | 116 ++++++++++++++++++ ...fy-inode-mode-when-loading-from-disk.patch | 46 +++++++ ...-a-judgment-for-ns-null-in-pid_nr_ns.patch | 95 ++++++++++++++ ...pid_nr_ns-ns-null-safe-for-zombie-ca.patch | 48 ++++++++ queue-6.1/series | 7 ++ ...excessively-long-inode-switching-tim.patch | 102 +++++++++++++++ ...softlockup-when-switching-many-inode.patch | 65 ++++++++++ ...e_range-limit-size-if-in-compat-mode.patch | 66 ++++++++++ ...fy-inode-mode-when-loading-from-disk.patch | 51 ++++++++ ..._options-to-set-initramfs-mount-opti.patch | 116 ++++++++++++++++++ ...lic-avoid-interrupt-id-0-handling-du.patch | 64 
++++++++++ ...sifive-plic-make-use-of-__assign_bit.patch | 51 ++++++++ ...fy-inode-mode-when-loading-from-disk.patch | 46 +++++++ ...-a-judgment-for-ns-null-in-pid_nr_ns.patch | 95 ++++++++++++++ ...pid_nr_ns-ns-null-safe-for-zombie-ca.patch | 48 ++++++++ queue-6.12/series | 10 ++ ...excessively-long-inode-switching-tim.patch | 102 +++++++++++++++ ...softlockup-when-switching-many-inode.patch | 65 ++++++++++ ...e_range-limit-size-if-in-compat-mode.patch | 66 ++++++++++ ...fy-inode-mode-when-loading-from-disk.patch | 51 ++++++++ ..._options-to-set-initramfs-mount-opti.patch | 116 ++++++++++++++++++ ...on-file-io-when-there-is-no-inline_d.patch | 95 ++++++++++++++ ...lic-avoid-interrupt-id-0-handling-du.patch | 64 ++++++++++ ...fy-inode-mode-when-loading-from-disk.patch | 46 +++++++ ...ve-dtrt-if-mnt_ns-had-never-been-add.patch | 40 ++++++ ...handle-null-values-in-mnt_ns_release.patch | 35 ++++++ .../nsfs-validate-extensible-ioctls.patch | 38 ++++++ ...-a-judgment-for-ns-null-in-pid_nr_ns.patch | 95 ++++++++++++++ ...pid_nr_ns-ns-null-safe-for-zombie-ca.patch | 48 ++++++++ .../pidfs-validate-extensible-ioctls.patch | 59 +++++++++ queue-6.17/series | 14 +++ ...excessively-long-inode-switching-tim.patch | 102 +++++++++++++++ ...softlockup-when-switching-many-inode.patch | 65 ++++++++++ ...fy-inode-mode-when-loading-from-disk.patch | 51 ++++++++ ..._options-to-set-initramfs-mount-opti.patch | 116 ++++++++++++++++++ ...lic-avoid-interrupt-id-0-handling-du.patch | 64 ++++++++++ ...sifive-plic-make-use-of-__assign_bit.patch | 51 ++++++++ ...fy-inode-mode-when-loading-from-disk.patch | 46 +++++++ ...-a-judgment-for-ns-null-in-pid_nr_ns.patch | 95 ++++++++++++++ ...pid_nr_ns-ns-null-safe-for-zombie-ca.patch | 48 ++++++++ queue-6.6/series | 9 ++ ...excessively-long-inode-switching-tim.patch | 102 +++++++++++++++ ...softlockup-when-switching-many-inode.patch | 65 ++++++++++ 63 files changed, 4028 insertions(+) create mode 100644 
queue-5.10/cramfs-verify-inode-mode-when-loading-from-disk.patch create mode 100644 queue-5.10/fs-add-initramfs_options-to-set-initramfs-mount-opti.patch create mode 100644 queue-5.10/minixfs-verify-inode-mode-when-loading-from-disk.patch create mode 100644 queue-5.10/pid-add-a-judgment-for-ns-null-in-pid_nr_ns.patch create mode 100644 queue-5.10/pid-make-__task_pid_nr_ns-ns-null-safe-for-zombie-ca.patch create mode 100644 queue-5.15/cramfs-verify-inode-mode-when-loading-from-disk.patch create mode 100644 queue-5.15/fs-add-initramfs_options-to-set-initramfs-mount-opti.patch create mode 100644 queue-5.15/minixfs-verify-inode-mode-when-loading-from-disk.patch create mode 100644 queue-5.15/pid-add-a-judgment-for-ns-null-in-pid_nr_ns.patch create mode 100644 queue-5.15/pid-make-__task_pid_nr_ns-ns-null-safe-for-zombie-ca.patch create mode 100644 queue-5.15/writeback-avoid-excessively-long-inode-switching-tim.patch create mode 100644 queue-5.15/writeback-avoid-softlockup-when-switching-many-inode.patch create mode 100644 queue-5.4/cramfs-verify-inode-mode-when-loading-from-disk.patch create mode 100644 queue-5.4/fs-add-initramfs_options-to-set-initramfs-mount-opti.patch create mode 100644 queue-5.4/minixfs-verify-inode-mode-when-loading-from-disk.patch create mode 100644 queue-5.4/pid-add-a-judgment-for-ns-null-in-pid_nr_ns.patch create mode 100644 queue-6.1/cramfs-verify-inode-mode-when-loading-from-disk.patch create mode 100644 queue-6.1/fs-add-initramfs_options-to-set-initramfs-mount-opti.patch create mode 100644 queue-6.1/minixfs-verify-inode-mode-when-loading-from-disk.patch create mode 100644 queue-6.1/pid-add-a-judgment-for-ns-null-in-pid_nr_ns.patch create mode 100644 queue-6.1/pid-make-__task_pid_nr_ns-ns-null-safe-for-zombie-ca.patch create mode 100644 queue-6.1/writeback-avoid-excessively-long-inode-switching-tim.patch create mode 100644 queue-6.1/writeback-avoid-softlockup-when-switching-many-inode.patch create mode 100644 
queue-6.12/copy_file_range-limit-size-if-in-compat-mode.patch create mode 100644 queue-6.12/cramfs-verify-inode-mode-when-loading-from-disk.patch create mode 100644 queue-6.12/fs-add-initramfs_options-to-set-initramfs-mount-opti.patch create mode 100644 queue-6.12/irqchip-sifive-plic-avoid-interrupt-id-0-handling-du.patch create mode 100644 queue-6.12/irqchip-sifive-plic-make-use-of-__assign_bit.patch create mode 100644 queue-6.12/minixfs-verify-inode-mode-when-loading-from-disk.patch create mode 100644 queue-6.12/pid-add-a-judgment-for-ns-null-in-pid_nr_ns.patch create mode 100644 queue-6.12/pid-make-__task_pid_nr_ns-ns-null-safe-for-zombie-ca.patch create mode 100644 queue-6.12/writeback-avoid-excessively-long-inode-switching-tim.patch create mode 100644 queue-6.12/writeback-avoid-softlockup-when-switching-many-inode.patch create mode 100644 queue-6.17/copy_file_range-limit-size-if-in-compat-mode.patch create mode 100644 queue-6.17/cramfs-verify-inode-mode-when-loading-from-disk.patch create mode 100644 queue-6.17/fs-add-initramfs_options-to-set-initramfs-mount-opti.patch create mode 100644 queue-6.17/iomap-error-out-on-file-io-when-there-is-no-inline_d.patch create mode 100644 queue-6.17/irqchip-sifive-plic-avoid-interrupt-id-0-handling-du.patch create mode 100644 queue-6.17/minixfs-verify-inode-mode-when-loading-from-disk.patch create mode 100644 queue-6.17/mnt_ns_tree_remove-dtrt-if-mnt_ns-had-never-been-add.patch create mode 100644 queue-6.17/mount-handle-null-values-in-mnt_ns_release.patch create mode 100644 queue-6.17/nsfs-validate-extensible-ioctls.patch create mode 100644 queue-6.17/pid-add-a-judgment-for-ns-null-in-pid_nr_ns.patch create mode 100644 queue-6.17/pid-make-__task_pid_nr_ns-ns-null-safe-for-zombie-ca.patch create mode 100644 queue-6.17/pidfs-validate-extensible-ioctls.patch create mode 100644 queue-6.17/writeback-avoid-excessively-long-inode-switching-tim.patch create mode 100644 
queue-6.17/writeback-avoid-softlockup-when-switching-many-inode.patch create mode 100644 queue-6.6/cramfs-verify-inode-mode-when-loading-from-disk.patch create mode 100644 queue-6.6/fs-add-initramfs_options-to-set-initramfs-mount-opti.patch create mode 100644 queue-6.6/irqchip-sifive-plic-avoid-interrupt-id-0-handling-du.patch create mode 100644 queue-6.6/irqchip-sifive-plic-make-use-of-__assign_bit.patch create mode 100644 queue-6.6/minixfs-verify-inode-mode-when-loading-from-disk.patch create mode 100644 queue-6.6/pid-add-a-judgment-for-ns-null-in-pid_nr_ns.patch create mode 100644 queue-6.6/pid-make-__task_pid_nr_ns-ns-null-safe-for-zombie-ca.patch create mode 100644 queue-6.6/writeback-avoid-excessively-long-inode-switching-tim.patch create mode 100644 queue-6.6/writeback-avoid-softlockup-when-switching-many-inode.patch diff --git a/queue-5.10/cramfs-verify-inode-mode-when-loading-from-disk.patch b/queue-5.10/cramfs-verify-inode-mode-when-loading-from-disk.patch new file mode 100644 index 0000000000..c62b1dbd4c --- /dev/null +++ b/queue-5.10/cramfs-verify-inode-mode-when-loading-from-disk.patch @@ -0,0 +1,51 @@ +From 89480951e6756b2332eac1ec7f21d95d4951d528 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 30 Aug 2025 19:01:01 +0900 +Subject: cramfs: Verify inode mode when loading from disk + +From: Tetsuo Handa + +[ Upstream commit 7f9d34b0a7cb93d678ee7207f0634dbf79e47fe5 ] + +The inode mode loaded from corrupted disk can be invalid. Do like what +commit 0a9e74051313 ("isofs: Verify inode mode when loading from disk") +does. 
+ +Reported-by: syzbot +Closes: https://syzkaller.appspot.com/bug?extid=895c23f6917da440ed0d +Signed-off-by: Tetsuo Handa +Link: https://lore.kernel.org/429b3ef1-13de-4310-9a8e-c2dc9a36234a@I-love.SAKURA.ne.jp +Acked-by: Nicolas Pitre +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + fs/cramfs/inode.c | 11 ++++++++++- + 1 file changed, 10 insertions(+), 1 deletion(-) + +diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c +index 4b90cfd1ec360..fda92e56ccd69 100644 +--- a/fs/cramfs/inode.c ++++ b/fs/cramfs/inode.c +@@ -117,9 +117,18 @@ static struct inode *get_cramfs_inode(struct super_block *sb, + inode_nohighmem(inode); + inode->i_data.a_ops = &cramfs_aops; + break; +- default: ++ case S_IFCHR: ++ case S_IFBLK: ++ case S_IFIFO: ++ case S_IFSOCK: + init_special_inode(inode, cramfs_inode->mode, + old_decode_dev(cramfs_inode->size)); ++ break; ++ default: ++ printk(KERN_DEBUG "CRAMFS: Invalid file type 0%04o for inode %lu.\n", ++ inode->i_mode, inode->i_ino); ++ iget_failed(inode); ++ return ERR_PTR(-EIO); + } + + inode->i_mode = cramfs_inode->mode; +-- +2.51.0 + diff --git a/queue-5.10/fs-add-initramfs_options-to-set-initramfs-mount-opti.patch b/queue-5.10/fs-add-initramfs_options-to-set-initramfs-mount-opti.patch new file mode 100644 index 0000000000..866343781a --- /dev/null +++ b/queue-5.10/fs-add-initramfs_options-to-set-initramfs-mount-opti.patch @@ -0,0 +1,116 @@ +From be98da7c43d8586c95a63076c6b69a07923ee23e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 15 Aug 2025 20:14:59 +0800 +Subject: fs: Add 'initramfs_options' to set initramfs mount options +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Lichen Liu + +[ Upstream commit 278033a225e13ec21900f0a92b8351658f5377f2 ] + +When CONFIG_TMPFS is enabled, the initial root filesystem is a tmpfs. +By default, a tmpfs mount is limited to using 50% of the available RAM +for its content. 
This can be problematic in memory-constrained +environments, particularly during a kdump capture. + +In a kdump scenario, the capture kernel boots with a limited amount of +memory specified by the 'crashkernel' parameter. If the initramfs is +large, it may fail to unpack into the tmpfs rootfs due to insufficient +space. This is because to get X MB of usable space in tmpfs, 2*X MB of +memory must be available for the mount. This leads to an OOM failure +during the early boot process, preventing a successful crash dump. + +This patch introduces a new kernel command-line parameter, +initramfs_options, which allows passing specific mount options directly +to the rootfs when it is first mounted. This gives users control over +the rootfs behavior. + +For example, a user can now specify initramfs_options=size=75% to allow +the tmpfs to use up to 75% of the available memory. This can +significantly reduce the memory pressure for kdump. + +Consider a practical example: + +To unpack a 48MB initramfs, the tmpfs needs 48MB of usable space. With +the default 50% limit, this requires a memory pool of 96MB to be +available for the tmpfs mount. The total memory requirement is therefore +approximately: 16MB (vmlinuz) + 48MB (loaded initramfs) + 48MB (unpacked +kernel) + 96MB (for tmpfs) + 12MB (runtime overhead) ≈ 220MB. + +By using initramfs_options=size=75%, the memory pool required for the +48MB tmpfs is reduced to 48MB / 0.75 = 64MB. This reduces the total +memory requirement by 32MB (96MB - 64MB), allowing the kdump to succeed +with a smaller crashkernel size, such as 192MB. + +An alternative approach of reusing the existing rootflags parameter was +considered. However, a new, dedicated initramfs_options parameter was +chosen to avoid altering the current behavior of rootflags (which +applies to the final root filesystem) and to prevent any potential +regressions. 
+ +Also add documentation for the new kernel parameter "initramfs_options" + +This approach is inspired by prior discussions and patches on the topic. +Ref: https://www.lightofdawn.org/blog/?viewDetailed=00128 +Ref: https://landley.net/notes-2015.html#01-01-2015 +Ref: https://lkml.org/lkml/2021/6/29/783 +Ref: https://www.kernel.org/doc/html/latest/filesystems/ramfs-rootfs-initramfs.html#what-is-rootfs + +Signed-off-by: Lichen Liu +Link: https://lore.kernel.org/20250815121459.3391223-1-lichliu@redhat.com +Tested-by: Rob Landley +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + Documentation/admin-guide/kernel-parameters.txt | 3 +++ + fs/namespace.c | 11 ++++++++++- + 2 files changed, 13 insertions(+), 1 deletion(-) + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index bac4b1493222a..0f1605d78e83d 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -4846,6 +4846,9 @@ + + rootflags= [KNL] Set root filesystem mount option string + ++ initramfs_options= [KNL] ++ Specify mount options for the initramfs mount. ++ + rootfstype= [KNL] Set root filesystem type + + rootwait [KNL] Wait (indefinitely) for root device to show up.
+diff --git a/fs/namespace.c b/fs/namespace.c +index d1751f9b6f1ce..e9b8d516f1919 100644 +--- a/fs/namespace.c ++++ b/fs/namespace.c +@@ -62,6 +62,15 @@ static int __init set_mphash_entries(char *str) + } + __setup("mphash_entries=", set_mphash_entries); + ++static char * __initdata initramfs_options; ++static int __init initramfs_options_setup(char *str) ++{ ++ initramfs_options = str; ++ return 1; ++} ++ ++__setup("initramfs_options=", initramfs_options_setup); ++ + static u64 event; + static DEFINE_IDA(mnt_id_ida); + static DEFINE_IDA(mnt_group_ida); +@@ -3913,7 +3922,7 @@ static void __init init_mount_tree(void) + struct mnt_namespace *ns; + struct path root; + +- mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL); ++ mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", initramfs_options); + if (IS_ERR(mnt)) + panic("Can't create rootfs"); + +-- +2.51.0 + diff --git a/queue-5.10/minixfs-verify-inode-mode-when-loading-from-disk.patch b/queue-5.10/minixfs-verify-inode-mode-when-loading-from-disk.patch new file mode 100644 index 0000000000..7c7afda64a --- /dev/null +++ b/queue-5.10/minixfs-verify-inode-mode-when-loading-from-disk.patch @@ -0,0 +1,46 @@ +From 8d17883806bb2f9429da4ecb4c13c8b45dc8c4af Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 13 Aug 2025 00:17:44 +0900 +Subject: minixfs: Verify inode mode when loading from disk + +From: Tetsuo Handa + +[ Upstream commit 73861970938ad1323eb02bbbc87f6fbd1e5bacca ] + +The inode mode loaded from corrupted disk can be invalid. Do like what +commit 0a9e74051313 ("isofs: Verify inode mode when loading from disk") +does. 
+ +Reported-by: syzbot +Closes: https://syzkaller.appspot.com/bug?extid=895c23f6917da440ed0d +Signed-off-by: Tetsuo Handa +Link: https://lore.kernel.org/ec982681-84b8-4624-94fa-8af15b77cbd2@I-love.SAKURA.ne.jp +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + fs/minix/inode.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +diff --git a/fs/minix/inode.c b/fs/minix/inode.c +index e938f5b1e4b94..7636e789eb49b 100644 +--- a/fs/minix/inode.c ++++ b/fs/minix/inode.c +@@ -469,8 +469,14 @@ void minix_set_inode(struct inode *inode, dev_t rdev) + inode->i_op = &minix_symlink_inode_operations; + inode_nohighmem(inode); + inode->i_mapping->a_ops = &minix_aops; +- } else ++ } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || ++ S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + init_special_inode(inode, inode->i_mode, rdev); ++ } else { ++ printk(KERN_DEBUG "MINIX-fs: Invalid file type 0%04o for inode %lu.\n", ++ inode->i_mode, inode->i_ino); ++ make_bad_inode(inode); ++ } + } + + /* +-- +2.51.0 + diff --git a/queue-5.10/pid-add-a-judgment-for-ns-null-in-pid_nr_ns.patch b/queue-5.10/pid-add-a-judgment-for-ns-null-in-pid_nr_ns.patch new file mode 100644 index 0000000000..862947a551 --- /dev/null +++ b/queue-5.10/pid-add-a-judgment-for-ns-null-in-pid_nr_ns.patch @@ -0,0 +1,95 @@ +From 7b31773999c2f0b5a8192246075c7c97ec1efa96 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 2 Aug 2025 10:21:23 +0800 +Subject: pid: Add a judgment for ns null in pid_nr_ns + +From: gaoxiang17 + +[ Upstream commit 006568ab4c5ca2309ceb36fa553e390b4aa9c0c7 ] + +__task_pid_nr_ns + ns = task_active_pid_ns(current); + pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); + if (pid && ns->level <= pid->level) { + +Sometimes null is returned for task_active_pid_ns. Then it will trigger kernel panic in pid_nr_ns. 
+ +For example: + Unable to handle kernel NULL pointer dereference at virtual address 0000000000000058 + Mem abort info: + ESR = 0x0000000096000007 + EC = 0x25: DABT (current EL), IL = 32 bits + SET = 0, FnV = 0 + EA = 0, S1PTW = 0 + FSC = 0x07: level 3 translation fault + Data abort info: + ISV = 0, ISS = 0x00000007, ISS2 = 0x00000000 + CM = 0, WnR = 0, TnD = 0, TagAccess = 0 + GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 + user pgtable: 4k pages, 39-bit VAs, pgdp=00000002175aa000 + [0000000000000058] pgd=08000002175ab003, p4d=08000002175ab003, pud=08000002175ab003, pmd=08000002175be003, pte=0000000000000000 + pstate: 834000c5 (Nzcv daIF +PAN -UAO +TCO +DIT -SSBS BTYPE=--) + pc : __task_pid_nr_ns+0x74/0xd0 + lr : __task_pid_nr_ns+0x24/0xd0 + sp : ffffffc08001bd10 + x29: ffffffc08001bd10 x28: ffffffd4422b2000 x27: 0000000000000001 + x26: ffffffd442821168 x25: ffffffd442821000 x24: 00000f89492eab31 + x23: 00000000000000c0 x22: ffffff806f5693c0 x21: ffffff806f5693c0 + x20: 0000000000000001 x19: 0000000000000000 x18: 0000000000000000 + x17: 00000000529c6ef0 x16: 00000000529c6ef0 x15: 00000000023a1adc + x14: 0000000000000003 x13: 00000000007ef6d8 x12: 001167c391c78800 + x11: 00ffffffffffffff x10: 0000000000000000 x9 : 0000000000000001 + x8 : ffffff80816fa3c0 x7 : 0000000000000000 x6 : 49534d702d535449 + x5 : ffffffc080c4c2c0 x4 : ffffffd43ee128c8 x3 : ffffffd43ee124dc + x2 : 0000000000000000 x1 : 0000000000000001 x0 : ffffff806f5693c0 + Call trace: + __task_pid_nr_ns+0x74/0xd0 + ... + __handle_irq_event_percpu+0xd4/0x284 + handle_irq_event+0x48/0xb0 + handle_fasteoi_irq+0x160/0x2d8 + generic_handle_domain_irq+0x44/0x60 + gic_handle_irq+0x4c/0x114 + call_on_irq_stack+0x3c/0x74 + do_interrupt_handler+0x4c/0x84 + el1_interrupt+0x34/0x58 + el1h_64_irq_handler+0x18/0x24 + el1h_64_irq+0x68/0x6c + account_kernel_stack+0x60/0x144 + exit_task_stack_account+0x1c/0x80 + do_exit+0x7e4/0xaf8 + ... 
+ get_signal+0x7bc/0x8d8 + do_notify_resume+0x128/0x828 + el0_svc+0x6c/0x70 + el0t_64_sync_handler+0x68/0xbc + el0t_64_sync+0x1a8/0x1ac + Code: 35fffe54 911a02a8 f9400108 b4000128 (b9405a69) + ---[ end trace 0000000000000000 ]--- + Kernel panic - not syncing: Oops: Fatal exception in interrupt + +Signed-off-by: gaoxiang17 +Link: https://lore.kernel.org/20250802022123.3536934-1-gxxa03070307@gmail.com +Reviewed-by: Baoquan He +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + kernel/pid.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/pid.c b/kernel/pid.c +index 0820f2c50bb0c..a5a08476f3756 100644 +--- a/kernel/pid.c ++++ b/kernel/pid.c +@@ -474,7 +474,7 @@ pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) + struct upid *upid; + pid_t nr = 0; + +- if (pid && ns->level <= pid->level) { ++ if (pid && ns && ns->level <= pid->level) { + upid = &pid->numbers[ns->level]; + if (upid->ns == ns) + nr = upid->nr; +-- +2.51.0 + diff --git a/queue-5.10/pid-make-__task_pid_nr_ns-ns-null-safe-for-zombie-ca.patch b/queue-5.10/pid-make-__task_pid_nr_ns-ns-null-safe-for-zombie-ca.patch new file mode 100644 index 0000000000..bf1bcf3218 --- /dev/null +++ b/queue-5.10/pid-make-__task_pid_nr_ns-ns-null-safe-for-zombie-ca.patch @@ -0,0 +1,48 @@ +From f713b9c102237b7107812f051d9be3bf7fd94baa Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 10 Aug 2025 19:36:04 +0200 +Subject: pid: make __task_pid_nr_ns(ns => NULL) safe for zombie callers +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Oleg Nesterov + +[ Upstream commit abdfd4948e45c51b19162cf8b3f5003f8f53c9b9 ] + +task_pid_vnr(another_task) will crash if the caller was already reaped. +The pid_alive(current) check can't really help, the parent/debugger can +call release_task() right after this check. 
+ +This also means that even task_ppid_nr_ns(current, NULL) is not safe, +pid_alive() only ensures that it is safe to dereference ->real_parent. + +Change __task_pid_nr_ns() to ensure ns != NULL. + +Originally-by: 高翔 +Link: https://lore.kernel.org/all/20250802022123.3536934-1-gxxa03070307@gmail.com/ +Signed-off-by: Oleg Nesterov +Link: https://lore.kernel.org/20250810173604.GA19991@redhat.com +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + kernel/pid.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/kernel/pid.c b/kernel/pid.c +index a5a08476f3756..6dfec6bef6ab3 100644 +--- a/kernel/pid.c ++++ b/kernel/pid.c +@@ -497,7 +497,8 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, + rcu_read_lock(); + if (!ns) + ns = task_active_pid_ns(current); +- nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); ++ if (ns) ++ nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); + rcu_read_unlock(); + + return nr; +-- +2.51.0 + diff --git a/queue-5.10/series b/queue-5.10/series index 6baeadbb80..5cc570dc1c 100644 --- a/queue-5.10/series +++ b/queue-5.10/series @@ -186,3 +186,8 @@ asoc-codecs-wcd934x-simplify-with-dev_err_probe.patch asoc-wcd934x-fix-error-handling-in-wcd934x_codec_parse_data.patch kvm-x86-don-t-re-check-l1-intercepts-when-completing-userspace-i-o.patch net-9p-fix-double-req-put-in-p9_fd_cancelled.patch +minixfs-verify-inode-mode-when-loading-from-disk.patch +pid-add-a-judgment-for-ns-null-in-pid_nr_ns.patch +pid-make-__task_pid_nr_ns-ns-null-safe-for-zombie-ca.patch +fs-add-initramfs_options-to-set-initramfs-mount-opti.patch +cramfs-verify-inode-mode-when-loading-from-disk.patch diff --git a/queue-5.15/cramfs-verify-inode-mode-when-loading-from-disk.patch b/queue-5.15/cramfs-verify-inode-mode-when-loading-from-disk.patch new file mode 100644 index 0000000000..3c7419fe03 --- /dev/null +++ b/queue-5.15/cramfs-verify-inode-mode-when-loading-from-disk.patch @@ -0,0 +1,51 @@ +From 
5dd72f483fef349192cfc9aa03658bd4d7983bf9 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 30 Aug 2025 19:01:01 +0900 +Subject: cramfs: Verify inode mode when loading from disk + +From: Tetsuo Handa + +[ Upstream commit 7f9d34b0a7cb93d678ee7207f0634dbf79e47fe5 ] + +The inode mode loaded from corrupted disk can be invalid. Do like what +commit 0a9e74051313 ("isofs: Verify inode mode when loading from disk") +does. + +Reported-by: syzbot +Closes: https://syzkaller.appspot.com/bug?extid=895c23f6917da440ed0d +Signed-off-by: Tetsuo Handa +Link: https://lore.kernel.org/429b3ef1-13de-4310-9a8e-c2dc9a36234a@I-love.SAKURA.ne.jp +Acked-by: Nicolas Pitre +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + fs/cramfs/inode.c | 11 ++++++++++- + 1 file changed, 10 insertions(+), 1 deletion(-) + +diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c +index 2be65269a987c..c893066e77ab4 100644 +--- a/fs/cramfs/inode.c ++++ b/fs/cramfs/inode.c +@@ -117,9 +117,18 @@ static struct inode *get_cramfs_inode(struct super_block *sb, + inode_nohighmem(inode); + inode->i_data.a_ops = &cramfs_aops; + break; +- default: ++ case S_IFCHR: ++ case S_IFBLK: ++ case S_IFIFO: ++ case S_IFSOCK: + init_special_inode(inode, cramfs_inode->mode, + old_decode_dev(cramfs_inode->size)); ++ break; ++ default: ++ printk(KERN_DEBUG "CRAMFS: Invalid file type 0%04o for inode %lu.\n", ++ inode->i_mode, inode->i_ino); ++ iget_failed(inode); ++ return ERR_PTR(-EIO); + } + + inode->i_mode = cramfs_inode->mode; +-- +2.51.0 + diff --git a/queue-5.15/fs-add-initramfs_options-to-set-initramfs-mount-opti.patch b/queue-5.15/fs-add-initramfs_options-to-set-initramfs-mount-opti.patch new file mode 100644 index 0000000000..42d4c69bd4 --- /dev/null +++ b/queue-5.15/fs-add-initramfs_options-to-set-initramfs-mount-opti.patch @@ -0,0 +1,116 @@ +From b1daf8b079662619a11b485e5bc2c63431d58fae Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 15 Aug 2025 20:14:59 +0800 +Subject: fs: Add 'initramfs_options' 
to set initramfs mount options +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Lichen Liu + +[ Upstream commit 278033a225e13ec21900f0a92b8351658f5377f2 ] + +When CONFIG_TMPFS is enabled, the initial root filesystem is a tmpfs. +By default, a tmpfs mount is limited to using 50% of the available RAM +for its content. This can be problematic in memory-constrained +environments, particularly during a kdump capture. + +In a kdump scenario, the capture kernel boots with a limited amount of +memory specified by the 'crashkernel' parameter. If the initramfs is +large, it may fail to unpack into the tmpfs rootfs due to insufficient +space. This is because to get X MB of usable space in tmpfs, 2*X MB of +memory must be available for the mount. This leads to an OOM failure +during the early boot process, preventing a successful crash dump. + +This patch introduces a new kernel command-line parameter, +initramfs_options, which allows passing specific mount options directly +to the rootfs when it is first mounted. This gives users control over +the rootfs behavior. + +For example, a user can now specify initramfs_options=size=75% to allow +the tmpfs to use up to 75% of the available memory. This can +significantly reduce the memory pressure for kdump. + +Consider a practical example: + +To unpack a 48MB initramfs, the tmpfs needs 48MB of usable space. With +the default 50% limit, this requires a memory pool of 96MB to be +available for the tmpfs mount. The total memory requirement is therefore +approximately: 16MB (vmlinuz) + 48MB (loaded initramfs) + 48MB (unpacked +kernel) + 96MB (for tmpfs) + 12MB (runtime overhead) ≈ 220MB. + +By using initramfs_options=size=75%, the memory pool required for the +48MB tmpfs is reduced to 48MB / 0.75 = 64MB. This reduces the total +memory requirement by 32MB (96MB - 64MB), allowing the kdump to succeed +with a smaller crashkernel size, such as 192MB. 
+ +An alternative approach of reusing the existing rootflags parameter was +considered. However, a new, dedicated initramfs_options parameter was +chosen to avoid altering the current behavior of rootflags (which +applies to the final root filesystem) and to prevent any potential +regressions. + +Also add documentation for the new kernel parameter "initramfs_options" + +This approach is inspired by prior discussions and patches on the topic. +Ref: https://www.lightofdawn.org/blog/?viewDetailed=00128 +Ref: https://landley.net/notes-2015.html#01-01-2015 +Ref: https://lkml.org/lkml/2021/6/29/783 +Ref: https://www.kernel.org/doc/html/latest/filesystems/ramfs-rootfs-initramfs.html#what-is-rootfs + +Signed-off-by: Lichen Liu +Link: https://lore.kernel.org/20250815121459.3391223-1-lichliu@redhat.com +Tested-by: Rob Landley +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + Documentation/admin-guide/kernel-parameters.txt | 3 +++ + fs/namespace.c | 11 ++++++++++- + 2 files changed, 13 insertions(+), 1 deletion(-) + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index 25e07ac5c1caf..ae09a6c701f02 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -5148,6 +5148,9 @@ + + rootflags= [KNL] Set root filesystem mount option string + ++ initramfs_options= [KNL] ++ Specify mount options for the initramfs mount. ++ + rootfstype= [KNL] Set root filesystem type + + rootwait [KNL] Wait (indefinitely) for root device to show up.
+diff --git a/fs/namespace.c b/fs/namespace.c +index 35d63bb3b22dc..ae1b8530eb939 100644 +--- a/fs/namespace.c ++++ b/fs/namespace.c +@@ -64,6 +64,15 @@ static int __init set_mphash_entries(char *str) + } + __setup("mphash_entries=", set_mphash_entries); + ++static char * __initdata initramfs_options; ++static int __init initramfs_options_setup(char *str) ++{ ++ initramfs_options = str; ++ return 1; ++} ++ ++__setup("initramfs_options=", initramfs_options_setup); ++ + static u64 event; + static DEFINE_IDA(mnt_id_ida); + static DEFINE_IDA(mnt_group_ida); +@@ -4352,7 +4361,7 @@ static void __init init_mount_tree(void) + struct mnt_namespace *ns; + struct path root; + +- mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL); ++ mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", initramfs_options); + if (IS_ERR(mnt)) + panic("Can't create rootfs"); + +-- +2.51.0 + diff --git a/queue-5.15/minixfs-verify-inode-mode-when-loading-from-disk.patch b/queue-5.15/minixfs-verify-inode-mode-when-loading-from-disk.patch new file mode 100644 index 0000000000..8b2c07dac8 --- /dev/null +++ b/queue-5.15/minixfs-verify-inode-mode-when-loading-from-disk.patch @@ -0,0 +1,46 @@ +From 4c54504b2b04bf7ba7e1b330fc715a8490170f7e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 13 Aug 2025 00:17:44 +0900 +Subject: minixfs: Verify inode mode when loading from disk + +From: Tetsuo Handa + +[ Upstream commit 73861970938ad1323eb02bbbc87f6fbd1e5bacca ] + +The inode mode loaded from corrupted disk can be invalid. Do like what +commit 0a9e74051313 ("isofs: Verify inode mode when loading from disk") +does. 
+ +Reported-by: syzbot +Closes: https://syzkaller.appspot.com/bug?extid=895c23f6917da440ed0d +Signed-off-by: Tetsuo Handa +Link: https://lore.kernel.org/ec982681-84b8-4624-94fa-8af15b77cbd2@I-love.SAKURA.ne.jp +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + fs/minix/inode.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +diff --git a/fs/minix/inode.c b/fs/minix/inode.c +index d4bd94234ef73..807ae40b64b06 100644 +--- a/fs/minix/inode.c ++++ b/fs/minix/inode.c +@@ -470,8 +470,14 @@ void minix_set_inode(struct inode *inode, dev_t rdev) + inode->i_op = &minix_symlink_inode_operations; + inode_nohighmem(inode); + inode->i_mapping->a_ops = &minix_aops; +- } else ++ } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || ++ S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + init_special_inode(inode, inode->i_mode, rdev); ++ } else { ++ printk(KERN_DEBUG "MINIX-fs: Invalid file type 0%04o for inode %lu.\n", ++ inode->i_mode, inode->i_ino); ++ make_bad_inode(inode); ++ } + } + + /* +-- +2.51.0 + diff --git a/queue-5.15/pid-add-a-judgment-for-ns-null-in-pid_nr_ns.patch b/queue-5.15/pid-add-a-judgment-for-ns-null-in-pid_nr_ns.patch new file mode 100644 index 0000000000..572edf455d --- /dev/null +++ b/queue-5.15/pid-add-a-judgment-for-ns-null-in-pid_nr_ns.patch @@ -0,0 +1,95 @@ +From 0f279b5b334740e8c06827f36456a21c1ed8a5ec Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 2 Aug 2025 10:21:23 +0800 +Subject: pid: Add a judgment for ns null in pid_nr_ns + +From: gaoxiang17 + +[ Upstream commit 006568ab4c5ca2309ceb36fa553e390b4aa9c0c7 ] + +__task_pid_nr_ns + ns = task_active_pid_ns(current); + pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); + if (pid && ns->level <= pid->level) { + +Sometimes null is returned for task_active_pid_ns. Then it will trigger kernel panic in pid_nr_ns. 
+ +For example: + Unable to handle kernel NULL pointer dereference at virtual address 0000000000000058 + Mem abort info: + ESR = 0x0000000096000007 + EC = 0x25: DABT (current EL), IL = 32 bits + SET = 0, FnV = 0 + EA = 0, S1PTW = 0 + FSC = 0x07: level 3 translation fault + Data abort info: + ISV = 0, ISS = 0x00000007, ISS2 = 0x00000000 + CM = 0, WnR = 0, TnD = 0, TagAccess = 0 + GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 + user pgtable: 4k pages, 39-bit VAs, pgdp=00000002175aa000 + [0000000000000058] pgd=08000002175ab003, p4d=08000002175ab003, pud=08000002175ab003, pmd=08000002175be003, pte=0000000000000000 + pstate: 834000c5 (Nzcv daIF +PAN -UAO +TCO +DIT -SSBS BTYPE=--) + pc : __task_pid_nr_ns+0x74/0xd0 + lr : __task_pid_nr_ns+0x24/0xd0 + sp : ffffffc08001bd10 + x29: ffffffc08001bd10 x28: ffffffd4422b2000 x27: 0000000000000001 + x26: ffffffd442821168 x25: ffffffd442821000 x24: 00000f89492eab31 + x23: 00000000000000c0 x22: ffffff806f5693c0 x21: ffffff806f5693c0 + x20: 0000000000000001 x19: 0000000000000000 x18: 0000000000000000 + x17: 00000000529c6ef0 x16: 00000000529c6ef0 x15: 00000000023a1adc + x14: 0000000000000003 x13: 00000000007ef6d8 x12: 001167c391c78800 + x11: 00ffffffffffffff x10: 0000000000000000 x9 : 0000000000000001 + x8 : ffffff80816fa3c0 x7 : 0000000000000000 x6 : 49534d702d535449 + x5 : ffffffc080c4c2c0 x4 : ffffffd43ee128c8 x3 : ffffffd43ee124dc + x2 : 0000000000000000 x1 : 0000000000000001 x0 : ffffff806f5693c0 + Call trace: + __task_pid_nr_ns+0x74/0xd0 + ... + __handle_irq_event_percpu+0xd4/0x284 + handle_irq_event+0x48/0xb0 + handle_fasteoi_irq+0x160/0x2d8 + generic_handle_domain_irq+0x44/0x60 + gic_handle_irq+0x4c/0x114 + call_on_irq_stack+0x3c/0x74 + do_interrupt_handler+0x4c/0x84 + el1_interrupt+0x34/0x58 + el1h_64_irq_handler+0x18/0x24 + el1h_64_irq+0x68/0x6c + account_kernel_stack+0x60/0x144 + exit_task_stack_account+0x1c/0x80 + do_exit+0x7e4/0xaf8 + ... 
+ get_signal+0x7bc/0x8d8 + do_notify_resume+0x128/0x828 + el0_svc+0x6c/0x70 + el0t_64_sync_handler+0x68/0xbc + el0t_64_sync+0x1a8/0x1ac + Code: 35fffe54 911a02a8 f9400108 b4000128 (b9405a69) + ---[ end trace 0000000000000000 ]--- + Kernel panic - not syncing: Oops: Fatal exception in interrupt + +Signed-off-by: gaoxiang17 +Link: https://lore.kernel.org/20250802022123.3536934-1-gxxa03070307@gmail.com +Reviewed-by: Baoquan He +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + kernel/pid.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/pid.c b/kernel/pid.c +index efe87db446836..61f6649568b25 100644 +--- a/kernel/pid.c ++++ b/kernel/pid.c +@@ -474,7 +474,7 @@ pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) + struct upid *upid; + pid_t nr = 0; + +- if (pid && ns->level <= pid->level) { ++ if (pid && ns && ns->level <= pid->level) { + upid = &pid->numbers[ns->level]; + if (upid->ns == ns) + nr = upid->nr; +-- +2.51.0 + diff --git a/queue-5.15/pid-make-__task_pid_nr_ns-ns-null-safe-for-zombie-ca.patch b/queue-5.15/pid-make-__task_pid_nr_ns-ns-null-safe-for-zombie-ca.patch new file mode 100644 index 0000000000..ba4572dd73 --- /dev/null +++ b/queue-5.15/pid-make-__task_pid_nr_ns-ns-null-safe-for-zombie-ca.patch @@ -0,0 +1,48 @@ +From fc8d3c7d133deb82d720bed679b17f22d97c0d81 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 10 Aug 2025 19:36:04 +0200 +Subject: pid: make __task_pid_nr_ns(ns => NULL) safe for zombie callers +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Oleg Nesterov + +[ Upstream commit abdfd4948e45c51b19162cf8b3f5003f8f53c9b9 ] + +task_pid_vnr(another_task) will crash if the caller was already reaped. +The pid_alive(current) check can't really help, the parent/debugger can +call release_task() right after this check. 
+ +This also means that even task_ppid_nr_ns(current, NULL) is not safe, +pid_alive() only ensures that it is safe to dereference ->real_parent. + +Change __task_pid_nr_ns() to ensure ns != NULL. + +Originally-by: 高翔 +Link: https://lore.kernel.org/all/20250802022123.3536934-1-gxxa03070307@gmail.com/ +Signed-off-by: Oleg Nesterov +Link: https://lore.kernel.org/20250810173604.GA19991@redhat.com +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + kernel/pid.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/kernel/pid.c b/kernel/pid.c +index 61f6649568b25..18f67751d0a51 100644 +--- a/kernel/pid.c ++++ b/kernel/pid.c +@@ -497,7 +497,8 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, + rcu_read_lock(); + if (!ns) + ns = task_active_pid_ns(current); +- nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); ++ if (ns) ++ nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); + rcu_read_unlock(); + + return nr; +-- +2.51.0 + diff --git a/queue-5.15/series b/queue-5.15/series index a84e9b8572..9230657765 100644 --- a/queue-5.15/series +++ b/queue-5.15/series @@ -258,3 +258,10 @@ minmax.h-use-build_bug_on_msg-for-the-lo-hi-test-in-clamp.patch minmax.h-move-all-the-clamp-definitions-after-the-min-max-ones.patch minmax.h-simplify-the-variants-of-clamp.patch minmax.h-remove-some-defines-that-are-only-expanded-once.patch +minixfs-verify-inode-mode-when-loading-from-disk.patch +pid-add-a-judgment-for-ns-null-in-pid_nr_ns.patch +pid-make-__task_pid_nr_ns-ns-null-safe-for-zombie-ca.patch +fs-add-initramfs_options-to-set-initramfs-mount-opti.patch +cramfs-verify-inode-mode-when-loading-from-disk.patch +writeback-avoid-softlockup-when-switching-many-inode.patch +writeback-avoid-excessively-long-inode-switching-tim.patch diff --git a/queue-5.15/writeback-avoid-excessively-long-inode-switching-tim.patch b/queue-5.15/writeback-avoid-excessively-long-inode-switching-tim.patch new file mode 100644 index 
0000000000..c319a7fc5d --- /dev/null +++ b/queue-5.15/writeback-avoid-excessively-long-inode-switching-tim.patch @@ -0,0 +1,102 @@ +From 255bfd15a1f4ee94521f194b99f4f247d5425302 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 12 Sep 2025 12:38:37 +0200 +Subject: writeback: Avoid excessively long inode switching times + +From: Jan Kara + +[ Upstream commit 9a6ebbdbd41235ea3bc0c4f39e2076599b8113cc ] + +With lazytime mount option enabled we can be switching many dirty inodes +on cgroup exit to the parent cgroup. The numbers observed in practice +when systemd slice of a large cron job exits can easily reach hundreds +of thousands or millions. The logic in inode_do_switch_wbs() which sorts +the inode into appropriate place in b_dirty list of the target wb +however has linear complexity in the number of dirty inodes thus overall +time complexity of switching all the inodes is quadratic leading to +workers being pegged for hours consuming 100% of the CPU and switching +inodes to the parent wb. + +Simple reproducer of the issue: + FILES=10000 + # Filesystem mounted with lazytime mount option + MNT=/mnt/ + echo "Creating files and switching timestamps" + for (( j = 0; j < 50; j ++ )); do + mkdir $MNT/dir$j + for (( i = 0; i < $FILES; i++ )); do + echo "foo" >$MNT/dir$j/file$i + done + touch -a -t 202501010000 $MNT/dir$j/file* + done + wait + echo "Syncing and flushing" + sync + echo 3 >/proc/sys/vm/drop_caches + + echo "Reading all files from a cgroup" + mkdir /sys/fs/cgroup/unified/mycg1 || exit + echo $$ >/sys/fs/cgroup/unified/mycg1/cgroup.procs || exit + for (( j = 0; j < 50; j ++ )); do + cat /mnt/dir$j/file* >/dev/null & + done + wait + echo "Switching wbs" + # Now rmdir the cgroup after the script exits + +We need to maintain b_dirty list ordering to keep writeback happy so +instead of sorting inode into appropriate place just append it at the +end of the list and clobber dirtied_time_when. 
This may result in inode +writeback starting later after cgroup switch however cgroup switches are +rare so it shouldn't matter much. Since the cgroup had write access to +the inode, there are no practical concerns of the possible DoS issues. + +Acked-by: Tejun Heo +Signed-off-by: Jan Kara +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + fs/fs-writeback.c | 21 +++++++++++---------- + 1 file changed, 11 insertions(+), 10 deletions(-) + +diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c +index 3b002ac407434..095eaa896cbe2 100644 +--- a/fs/fs-writeback.c ++++ b/fs/fs-writeback.c +@@ -418,22 +418,23 @@ static bool inode_do_switch_wbs(struct inode *inode, + * Transfer to @new_wb's IO list if necessary. If the @inode is dirty, + * the specific list @inode was on is ignored and the @inode is put on + * ->b_dirty which is always correct including from ->b_dirty_time. +- * The transfer preserves @inode->dirtied_when ordering. If the @inode +- * was clean, it means it was on the b_attached list, so move it onto +- * the b_attached list of @new_wb. ++ * If the @inode was clean, it means it was on the b_attached list, so ++ * move it onto the b_attached list of @new_wb. + */ + if (!list_empty(&inode->i_io_list)) { + inode->i_wb = new_wb; + + if (inode->i_state & I_DIRTY_ALL) { +- struct inode *pos; +- +- list_for_each_entry(pos, &new_wb->b_dirty, i_io_list) +- if (time_after_eq(inode->dirtied_when, +- pos->dirtied_when)) +- break; ++ /* ++ * We need to keep b_dirty list sorted by ++ * dirtied_time_when. However properly sorting the ++ * inode in the list gets too expensive when switching ++ * many inodes. So just attach inode at the end of the ++ * dirty list and clobber the dirtied_time_when. 
++ */ ++ inode->dirtied_time_when = jiffies; + inode_io_list_move_locked(inode, new_wb, +- pos->i_io_list.prev); ++ &new_wb->b_dirty); + } else { + inode_cgwb_move_to_attached(inode, new_wb); + } +-- +2.51.0 + diff --git a/queue-5.15/writeback-avoid-softlockup-when-switching-many-inode.patch b/queue-5.15/writeback-avoid-softlockup-when-switching-many-inode.patch new file mode 100644 index 0000000000..3ee46ad452 --- /dev/null +++ b/queue-5.15/writeback-avoid-softlockup-when-switching-many-inode.patch @@ -0,0 +1,65 @@ +From 4d825e5cd058cfb89aab66765f65f134a245b793 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 12 Sep 2025 12:38:36 +0200 +Subject: writeback: Avoid softlockup when switching many inodes + +From: Jan Kara + +[ Upstream commit 66c14dccd810d42ec5c73bb8a9177489dfd62278 ] + +process_inode_switch_wbs_work() can be switching over 100 inodes to a +different cgroup. Since switching an inode requires counting all dirty & +under-writeback pages in the address space of each inode, this can take +a significant amount of time. Add a possibility to reschedule after +processing each inode to avoid softlockups. + +Acked-by: Tejun Heo +Signed-off-by: Jan Kara +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + fs/fs-writeback.c | 11 ++++++++++- + 1 file changed, 10 insertions(+), 1 deletion(-) + +diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c +index cb3f1790a296e..3b002ac407434 100644 +--- a/fs/fs-writeback.c ++++ b/fs/fs-writeback.c +@@ -475,6 +475,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) + */ + down_read(&bdi->wb_switch_rwsem); + ++ inodep = isw->inodes; + /* + * By the time control reaches here, RCU grace period has passed + * since I_WB_SWITCH assertion and all wb stat update transactions +@@ -485,6 +486,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) + * gives us exclusion against all wb related operations on @inode + * including IO list manipulations and stat updates. 
+ */ ++relock: + if (old_wb < new_wb) { + spin_lock(&old_wb->list_lock); + spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING); +@@ -493,10 +495,17 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) + spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING); + } + +- for (inodep = isw->inodes; *inodep; inodep++) { ++ while (*inodep) { + WARN_ON_ONCE((*inodep)->i_wb != old_wb); + if (inode_do_switch_wbs(*inodep, old_wb, new_wb)) + nr_switched++; ++ inodep++; ++ if (*inodep && need_resched()) { ++ spin_unlock(&new_wb->list_lock); ++ spin_unlock(&old_wb->list_lock); ++ cond_resched(); ++ goto relock; ++ } + } + + spin_unlock(&new_wb->list_lock); +-- +2.51.0 + diff --git a/queue-5.4/cramfs-verify-inode-mode-when-loading-from-disk.patch b/queue-5.4/cramfs-verify-inode-mode-when-loading-from-disk.patch new file mode 100644 index 0000000000..c8d7d65387 --- /dev/null +++ b/queue-5.4/cramfs-verify-inode-mode-when-loading-from-disk.patch @@ -0,0 +1,51 @@ +From 1992ae4798a422a35ace1530e04c7e7b6e96bc2b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 30 Aug 2025 19:01:01 +0900 +Subject: cramfs: Verify inode mode when loading from disk + +From: Tetsuo Handa + +[ Upstream commit 7f9d34b0a7cb93d678ee7207f0634dbf79e47fe5 ] + +The inode mode loaded from corrupted disk can be invalid. Do like what +commit 0a9e74051313 ("isofs: Verify inode mode when loading from disk") +does. 
+ +Reported-by: syzbot +Closes: https://syzkaller.appspot.com/bug?extid=895c23f6917da440ed0d +Signed-off-by: Tetsuo Handa +Link: https://lore.kernel.org/429b3ef1-13de-4310-9a8e-c2dc9a36234a@I-love.SAKURA.ne.jp +Acked-by: Nicolas Pitre +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + fs/cramfs/inode.c | 11 ++++++++++- + 1 file changed, 10 insertions(+), 1 deletion(-) + +diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c +index 2f04024c3588e..82c45ca453216 100644 +--- a/fs/cramfs/inode.c ++++ b/fs/cramfs/inode.c +@@ -117,9 +117,18 @@ static struct inode *get_cramfs_inode(struct super_block *sb, + inode_nohighmem(inode); + inode->i_data.a_ops = &cramfs_aops; + break; +- default: ++ case S_IFCHR: ++ case S_IFBLK: ++ case S_IFIFO: ++ case S_IFSOCK: + init_special_inode(inode, cramfs_inode->mode, + old_decode_dev(cramfs_inode->size)); ++ break; ++ default: ++ printk(KERN_DEBUG "CRAMFS: Invalid file type 0%04o for inode %lu.\n", ++ inode->i_mode, inode->i_ino); ++ iget_failed(inode); ++ return ERR_PTR(-EIO); + } + + inode->i_mode = cramfs_inode->mode; +-- +2.51.0 + diff --git a/queue-5.4/fs-add-initramfs_options-to-set-initramfs-mount-opti.patch b/queue-5.4/fs-add-initramfs_options-to-set-initramfs-mount-opti.patch new file mode 100644 index 0000000000..f70bf95b42 --- /dev/null +++ b/queue-5.4/fs-add-initramfs_options-to-set-initramfs-mount-opti.patch @@ -0,0 +1,116 @@ +From a53cc9813a0ede779e5d69661df27c6461368baa Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 15 Aug 2025 20:14:59 +0800 +Subject: fs: Add 'initramfs_options' to set initramfs mount options +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Lichen Liu + +[ Upstream commit 278033a225e13ec21900f0a92b8351658f5377f2 ] + +When CONFIG_TMPFS is enabled, the initial root filesystem is a tmpfs. +By default, a tmpfs mount is limited to using 50% of the available RAM +for its content. 
This can be problematic in memory-constrained +environments, particularly during a kdump capture. + +In a kdump scenario, the capture kernel boots with a limited amount of +memory specified by the 'crashkernel' parameter. If the initramfs is +large, it may fail to unpack into the tmpfs rootfs due to insufficient +space. This is because to get X MB of usable space in tmpfs, 2*X MB of +memory must be available for the mount. This leads to an OOM failure +during the early boot process, preventing a successful crash dump. + +This patch introduces a new kernel command-line parameter, +initramfs_options, which allows passing specific mount options directly +to the rootfs when it is first mounted. This gives users control over +the rootfs behavior. + +For example, a user can now specify initramfs_options=size=75% to allow +the tmpfs to use up to 75% of the available memory. This can +significantly reduce the memory pressure for kdump. + +Consider a practical example: + +To unpack a 48MB initramfs, the tmpfs needs 48MB of usable space. With +the default 50% limit, this requires a memory pool of 96MB to be +available for the tmpfs mount. The total memory requirement is therefore +approximately: 16MB (vmlinuz) + 48MB (loaded initramfs) + 48MB (unpacked +kernel) + 96MB (for tmpfs) + 12MB (runtime overhead) ≈ 220MB. + +By using initramfs_options=size=75%, the memory pool required for the +48MB tmpfs is reduced to 48MB / 0.75 = 64MB. This reduces the total +memory requirement by 32MB (96MB - 64MB), allowing the kdump to succeed +with a smaller crashkernel size, such as 192MB. + +An alternative approach of reusing the existing rootflags parameter was +considered. However, a new, dedicated initramfs_options parameter was +chosen to avoid altering the current behavior of rootflags (which +applies to the final root filesystem) and to prevent any potential +regressions. 
+ +Also add documentation for the new kernel parameter "initramfs_options" + +This approach is inspired by prior discussions and patches on the topic. +Ref: https://www.lightofdawn.org/blog/?viewDetailed=00128 +Ref: https://landley.net/notes-2015.html#01-01-2015 +Ref: https://lkml.org/lkml/2021/6/29/783 +Ref: https://www.kernel.org/doc/html/latest/filesystems/ramfs-rootfs-initramfs.html#what-is-rootfs + +Signed-off-by: Lichen Liu +Link: https://lore.kernel.org/20250815121459.3391223-1-lichliu@redhat.com +Tested-by: Rob Landley +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + Documentation/admin-guide/kernel-parameters.txt | 3 +++ + fs/namespace.c | 11 ++++++++++- + 2 files changed, 13 insertions(+), 1 deletion(-) + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index 9975dcab99c35..5c2594d7c9ac9 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -4409,6 +4409,9 @@ + + rootflags= [KNL] Set root filesystem mount option string + ++ initramfs_options= [KNL] ++ Specify mount options for the initramfs mount. ++ + rootfstype= [KNL] Set root filesystem type + + rootwait [KNL] Wait (indefinitely) for root device to show up. 
+diff --git a/fs/namespace.c b/fs/namespace.c +index c87f847c959d9..3c6f0586ae218 100644 +--- a/fs/namespace.c ++++ b/fs/namespace.c +@@ -62,6 +62,15 @@ static int __init set_mphash_entries(char *str) + } + __setup("mphash_entries=", set_mphash_entries); + ++static char * __initdata initramfs_options; ++static int __init initramfs_options_setup(char *str) ++{ ++ initramfs_options = str; ++ return 1; ++} ++ ++__setup("initramfs_options=", initramfs_options_setup); ++ + static u64 event; + static DEFINE_IDA(mnt_id_ida); + static DEFINE_IDA(mnt_group_ida); +@@ -3829,7 +3838,7 @@ static void __init init_mount_tree(void) + struct mnt_namespace *ns; + struct path root; + +- mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL); ++ mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", initramfs_options); + if (IS_ERR(mnt)) + panic("Can't create rootfs"); + +-- +2.51.0 + diff --git a/queue-5.4/minixfs-verify-inode-mode-when-loading-from-disk.patch b/queue-5.4/minixfs-verify-inode-mode-when-loading-from-disk.patch new file mode 100644 index 0000000000..7b123bd97e --- /dev/null +++ b/queue-5.4/minixfs-verify-inode-mode-when-loading-from-disk.patch @@ -0,0 +1,46 @@ +From d7406d8bc361b52e4a3c69f7a334d6601b2ebf29 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 13 Aug 2025 00:17:44 +0900 +Subject: minixfs: Verify inode mode when loading from disk + +From: Tetsuo Handa + +[ Upstream commit 73861970938ad1323eb02bbbc87f6fbd1e5bacca ] + +The inode mode loaded from corrupted disk can be invalid. Do like what +commit 0a9e74051313 ("isofs: Verify inode mode when loading from disk") +does. 
+ +Reported-by: syzbot +Closes: https://syzkaller.appspot.com/bug?extid=895c23f6917da440ed0d +Signed-off-by: Tetsuo Handa +Link: https://lore.kernel.org/ec982681-84b8-4624-94fa-8af15b77cbd2@I-love.SAKURA.ne.jp +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + fs/minix/inode.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +diff --git a/fs/minix/inode.c b/fs/minix/inode.c +index 3fffc709afd43..c026706aec0cc 100644 +--- a/fs/minix/inode.c ++++ b/fs/minix/inode.c +@@ -470,8 +470,14 @@ void minix_set_inode(struct inode *inode, dev_t rdev) + inode->i_op = &minix_symlink_inode_operations; + inode_nohighmem(inode); + inode->i_mapping->a_ops = &minix_aops; +- } else ++ } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || ++ S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + init_special_inode(inode, inode->i_mode, rdev); ++ } else { ++ printk(KERN_DEBUG "MINIX-fs: Invalid file type 0%04o for inode %lu.\n", ++ inode->i_mode, inode->i_ino); ++ make_bad_inode(inode); ++ } + } + + /* +-- +2.51.0 + diff --git a/queue-5.4/pid-add-a-judgment-for-ns-null-in-pid_nr_ns.patch b/queue-5.4/pid-add-a-judgment-for-ns-null-in-pid_nr_ns.patch new file mode 100644 index 0000000000..23be414a98 --- /dev/null +++ b/queue-5.4/pid-add-a-judgment-for-ns-null-in-pid_nr_ns.patch @@ -0,0 +1,95 @@ +From 2ed2880b4416be2bf1e542d96667d589becdb374 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 2 Aug 2025 10:21:23 +0800 +Subject: pid: Add a judgment for ns null in pid_nr_ns + +From: gaoxiang17 + +[ Upstream commit 006568ab4c5ca2309ceb36fa553e390b4aa9c0c7 ] + +__task_pid_nr_ns + ns = task_active_pid_ns(current); + pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); + if (pid && ns->level <= pid->level) { + +Sometimes null is returned for task_active_pid_ns. Then it will trigger kernel panic in pid_nr_ns. 
+ +For example: + Unable to handle kernel NULL pointer dereference at virtual address 0000000000000058 + Mem abort info: + ESR = 0x0000000096000007 + EC = 0x25: DABT (current EL), IL = 32 bits + SET = 0, FnV = 0 + EA = 0, S1PTW = 0 + FSC = 0x07: level 3 translation fault + Data abort info: + ISV = 0, ISS = 0x00000007, ISS2 = 0x00000000 + CM = 0, WnR = 0, TnD = 0, TagAccess = 0 + GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 + user pgtable: 4k pages, 39-bit VAs, pgdp=00000002175aa000 + [0000000000000058] pgd=08000002175ab003, p4d=08000002175ab003, pud=08000002175ab003, pmd=08000002175be003, pte=0000000000000000 + pstate: 834000c5 (Nzcv daIF +PAN -UAO +TCO +DIT -SSBS BTYPE=--) + pc : __task_pid_nr_ns+0x74/0xd0 + lr : __task_pid_nr_ns+0x24/0xd0 + sp : ffffffc08001bd10 + x29: ffffffc08001bd10 x28: ffffffd4422b2000 x27: 0000000000000001 + x26: ffffffd442821168 x25: ffffffd442821000 x24: 00000f89492eab31 + x23: 00000000000000c0 x22: ffffff806f5693c0 x21: ffffff806f5693c0 + x20: 0000000000000001 x19: 0000000000000000 x18: 0000000000000000 + x17: 00000000529c6ef0 x16: 00000000529c6ef0 x15: 00000000023a1adc + x14: 0000000000000003 x13: 00000000007ef6d8 x12: 001167c391c78800 + x11: 00ffffffffffffff x10: 0000000000000000 x9 : 0000000000000001 + x8 : ffffff80816fa3c0 x7 : 0000000000000000 x6 : 49534d702d535449 + x5 : ffffffc080c4c2c0 x4 : ffffffd43ee128c8 x3 : ffffffd43ee124dc + x2 : 0000000000000000 x1 : 0000000000000001 x0 : ffffff806f5693c0 + Call trace: + __task_pid_nr_ns+0x74/0xd0 + ... + __handle_irq_event_percpu+0xd4/0x284 + handle_irq_event+0x48/0xb0 + handle_fasteoi_irq+0x160/0x2d8 + generic_handle_domain_irq+0x44/0x60 + gic_handle_irq+0x4c/0x114 + call_on_irq_stack+0x3c/0x74 + do_interrupt_handler+0x4c/0x84 + el1_interrupt+0x34/0x58 + el1h_64_irq_handler+0x18/0x24 + el1h_64_irq+0x68/0x6c + account_kernel_stack+0x60/0x144 + exit_task_stack_account+0x1c/0x80 + do_exit+0x7e4/0xaf8 + ... 
+ get_signal+0x7bc/0x8d8 + do_notify_resume+0x128/0x828 + el0_svc+0x6c/0x70 + el0t_64_sync_handler+0x68/0xbc + el0t_64_sync+0x1a8/0x1ac + Code: 35fffe54 911a02a8 f9400108 b4000128 (b9405a69) + ---[ end trace 0000000000000000 ]--- + Kernel panic - not syncing: Oops: Fatal exception in interrupt + +Signed-off-by: gaoxiang17 +Link: https://lore.kernel.org/20250802022123.3536934-1-gxxa03070307@gmail.com +Reviewed-by: Baoquan He +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + kernel/pid.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/pid.c b/kernel/pid.c +index 0a9f2e4372176..3a7b71258047f 100644 +--- a/kernel/pid.c ++++ b/kernel/pid.c +@@ -407,7 +407,7 @@ pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) + struct upid *upid; + pid_t nr = 0; + +- if (pid && ns->level <= pid->level) { ++ if (pid && ns && ns->level <= pid->level) { + upid = &pid->numbers[ns->level]; + if (upid->ns == ns) + nr = upid->nr; +-- +2.51.0 + diff --git a/queue-5.4/series b/queue-5.4/series index 71c6f097a6..27bac166d6 100644 --- a/queue-5.4/series +++ b/queue-5.4/series @@ -142,3 +142,7 @@ mfd-intel_soc_pmic_chtdc_ti-drop-unneeded-assignment-for-cache_type.patch mfd-intel_soc_pmic_chtdc_ti-set-use_single_read-regmap_config-flag.patch dm-fix-null-pointer-dereference-in-__dm_suspend.patch tracing-fix-race-condition-in-kprobe-initialization-causing-null-pointer-dereference.patch +minixfs-verify-inode-mode-when-loading-from-disk.patch +pid-add-a-judgment-for-ns-null-in-pid_nr_ns.patch +fs-add-initramfs_options-to-set-initramfs-mount-opti.patch +cramfs-verify-inode-mode-when-loading-from-disk.patch diff --git a/queue-6.1/cramfs-verify-inode-mode-when-loading-from-disk.patch b/queue-6.1/cramfs-verify-inode-mode-when-loading-from-disk.patch new file mode 100644 index 0000000000..e110ab58de --- /dev/null +++ b/queue-6.1/cramfs-verify-inode-mode-when-loading-from-disk.patch @@ -0,0 +1,51 @@ +From 0f3e4f92ce8ab5ba39827abbc2ee28eb560949f5 Mon 
Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 30 Aug 2025 19:01:01 +0900 +Subject: cramfs: Verify inode mode when loading from disk + +From: Tetsuo Handa + +[ Upstream commit 7f9d34b0a7cb93d678ee7207f0634dbf79e47fe5 ] + +The inode mode loaded from corrupted disk can be invalid. Do like what +commit 0a9e74051313 ("isofs: Verify inode mode when loading from disk") +does. + +Reported-by: syzbot +Closes: https://syzkaller.appspot.com/bug?extid=895c23f6917da440ed0d +Signed-off-by: Tetsuo Handa +Link: https://lore.kernel.org/429b3ef1-13de-4310-9a8e-c2dc9a36234a@I-love.SAKURA.ne.jp +Acked-by: Nicolas Pitre +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + fs/cramfs/inode.c | 11 ++++++++++- + 1 file changed, 10 insertions(+), 1 deletion(-) + +diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c +index 6dae27d6f553f..9979187a4b3c5 100644 +--- a/fs/cramfs/inode.c ++++ b/fs/cramfs/inode.c +@@ -117,9 +117,18 @@ static struct inode *get_cramfs_inode(struct super_block *sb, + inode_nohighmem(inode); + inode->i_data.a_ops = &cramfs_aops; + break; +- default: ++ case S_IFCHR: ++ case S_IFBLK: ++ case S_IFIFO: ++ case S_IFSOCK: + init_special_inode(inode, cramfs_inode->mode, + old_decode_dev(cramfs_inode->size)); ++ break; ++ default: ++ printk(KERN_DEBUG "CRAMFS: Invalid file type 0%04o for inode %lu.\n", ++ inode->i_mode, inode->i_ino); ++ iget_failed(inode); ++ return ERR_PTR(-EIO); + } + + inode->i_mode = cramfs_inode->mode; +-- +2.51.0 + diff --git a/queue-6.1/fs-add-initramfs_options-to-set-initramfs-mount-opti.patch b/queue-6.1/fs-add-initramfs_options-to-set-initramfs-mount-opti.patch new file mode 100644 index 0000000000..a7c5118409 --- /dev/null +++ b/queue-6.1/fs-add-initramfs_options-to-set-initramfs-mount-opti.patch @@ -0,0 +1,116 @@ +From f1eec19b191a431847c3444f440eeead3ec1c9e8 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 15 Aug 2025 20:14:59 +0800 +Subject: fs: Add 'initramfs_options' to set initramfs mount options +MIME-Version: 
1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Lichen Liu + +[ Upstream commit 278033a225e13ec21900f0a92b8351658f5377f2 ] + +When CONFIG_TMPFS is enabled, the initial root filesystem is a tmpfs. +By default, a tmpfs mount is limited to using 50% of the available RAM +for its content. This can be problematic in memory-constrained +environments, particularly during a kdump capture. + +In a kdump scenario, the capture kernel boots with a limited amount of +memory specified by the 'crashkernel' parameter. If the initramfs is +large, it may fail to unpack into the tmpfs rootfs due to insufficient +space. This is because to get X MB of usable space in tmpfs, 2*X MB of +memory must be available for the mount. This leads to an OOM failure +during the early boot process, preventing a successful crash dump. + +This patch introduces a new kernel command-line parameter, +initramfs_options, which allows passing specific mount options directly +to the rootfs when it is first mounted. This gives users control over +the rootfs behavior. + +For example, a user can now specify initramfs_options=size=75% to allow +the tmpfs to use up to 75% of the available memory. This can +significantly reduce the memory pressure for kdump. + +Consider a practical example: + +To unpack a 48MB initramfs, the tmpfs needs 48MB of usable space. With +the default 50% limit, this requires a memory pool of 96MB to be +available for the tmpfs mount. The total memory requirement is therefore +approximately: 16MB (vmlinuz) + 48MB (loaded initramfs) + 48MB (unpacked +kernel) + 96MB (for tmpfs) + 12MB (runtime overhead) ≈ 220MB. + +By using initramfs_options=size=75%, the memory pool required for the +48MB tmpfs is reduced to 48MB / 0.75 = 64MB. This reduces the total +memory requirement by 32MB (96MB - 64MB), allowing the kdump to succeed +with a smaller crashkernel size, such as 192MB. 
+ +An alternative approach of reusing the existing rootflags parameter was +considered. However, a new, dedicated initramfs_options parameter was +chosen to avoid altering the current behavior of rootflags (which +applies to the final root filesystem) and to prevent any potential +regressions. + +Also add documentation for the new kernel parameter "initramfs_options" + +This approach is inspired by prior discussions and patches on the topic. +Ref: https://www.lightofdawn.org/blog/?viewDetailed=00128 +Ref: https://landley.net/notes-2015.html#01-01-2015 +Ref: https://lkml.org/lkml/2021/6/29/783 +Ref: https://www.kernel.org/doc/html/latest/filesystems/ramfs-rootfs-initramfs.html#what-is-rootfs + +Signed-off-by: Lichen Liu +Link: https://lore.kernel.org/20250815121459.3391223-1-lichliu@redhat.com +Tested-by: Rob Landley +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + Documentation/admin-guide/kernel-parameters.txt | 3 +++ + fs/namespace.c | 11 ++++++++++- + 2 files changed, 13 insertions(+), 1 deletion(-) + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index cce2731727392..05ab068c1cc6d 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -5446,6 +5446,9 @@ + + rootflags= [KNL] Set root filesystem mount option string + ++ initramfs_options= [KNL] ++ Specify mount options for the initramfs mount. ++ + rootfstype= [KNL] Set root filesystem type + + rootwait [KNL] Wait (indefinitely) for root device to show up. 
+diff --git a/fs/namespace.c b/fs/namespace.c +index 2a76269f2a4e7..f22f76d9c22f9 100644 +--- a/fs/namespace.c ++++ b/fs/namespace.c +@@ -64,6 +64,15 @@ static int __init set_mphash_entries(char *str) + } + __setup("mphash_entries=", set_mphash_entries); + ++static char * __initdata initramfs_options; ++static int __init initramfs_options_setup(char *str) ++{ ++ initramfs_options = str; ++ return 1; ++} ++ ++__setup("initramfs_options=", initramfs_options_setup); ++ + static u64 event; + static DEFINE_IDA(mnt_id_ida); + static DEFINE_IDA(mnt_group_ida); +@@ -4414,7 +4423,7 @@ static void __init init_mount_tree(void) + struct mnt_namespace *ns; + struct path root; + +- mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL); ++ mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", initramfs_options); + if (IS_ERR(mnt)) + panic("Can't create rootfs"); + +-- +2.51.0 + diff --git a/queue-6.1/minixfs-verify-inode-mode-when-loading-from-disk.patch b/queue-6.1/minixfs-verify-inode-mode-when-loading-from-disk.patch new file mode 100644 index 0000000000..56105de169 --- /dev/null +++ b/queue-6.1/minixfs-verify-inode-mode-when-loading-from-disk.patch @@ -0,0 +1,46 @@ +From 4e45893bcda29bb2a129e7992345e4a15ba240a8 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 13 Aug 2025 00:17:44 +0900 +Subject: minixfs: Verify inode mode when loading from disk + +From: Tetsuo Handa + +[ Upstream commit 73861970938ad1323eb02bbbc87f6fbd1e5bacca ] + +The inode mode loaded from corrupted disk can be invalid. Do like what +commit 0a9e74051313 ("isofs: Verify inode mode when loading from disk") +does. 
+ +Reported-by: syzbot +Closes: https://syzkaller.appspot.com/bug?extid=895c23f6917da440ed0d +Signed-off-by: Tetsuo Handa +Link: https://lore.kernel.org/ec982681-84b8-4624-94fa-8af15b77cbd2@I-love.SAKURA.ne.jp +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + fs/minix/inode.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +diff --git a/fs/minix/inode.c b/fs/minix/inode.c +index da8bdd1712a70..9da6903e306c6 100644 +--- a/fs/minix/inode.c ++++ b/fs/minix/inode.c +@@ -470,8 +470,14 @@ void minix_set_inode(struct inode *inode, dev_t rdev) + inode->i_op = &minix_symlink_inode_operations; + inode_nohighmem(inode); + inode->i_mapping->a_ops = &minix_aops; +- } else ++ } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || ++ S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + init_special_inode(inode, inode->i_mode, rdev); ++ } else { ++ printk(KERN_DEBUG "MINIX-fs: Invalid file type 0%04o for inode %lu.\n", ++ inode->i_mode, inode->i_ino); ++ make_bad_inode(inode); ++ } + } + + /* +-- +2.51.0 + diff --git a/queue-6.1/pid-add-a-judgment-for-ns-null-in-pid_nr_ns.patch b/queue-6.1/pid-add-a-judgment-for-ns-null-in-pid_nr_ns.patch new file mode 100644 index 0000000000..011cf90eb8 --- /dev/null +++ b/queue-6.1/pid-add-a-judgment-for-ns-null-in-pid_nr_ns.patch @@ -0,0 +1,95 @@ +From fdc9413906a876918f08ce133c1995c2f7dee73c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 2 Aug 2025 10:21:23 +0800 +Subject: pid: Add a judgment for ns null in pid_nr_ns + +From: gaoxiang17 + +[ Upstream commit 006568ab4c5ca2309ceb36fa553e390b4aa9c0c7 ] + +__task_pid_nr_ns + ns = task_active_pid_ns(current); + pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); + if (pid && ns->level <= pid->level) { + +Sometimes null is returned for task_active_pid_ns. Then it will trigger kernel panic in pid_nr_ns. 
+ +For example: + Unable to handle kernel NULL pointer dereference at virtual address 0000000000000058 + Mem abort info: + ESR = 0x0000000096000007 + EC = 0x25: DABT (current EL), IL = 32 bits + SET = 0, FnV = 0 + EA = 0, S1PTW = 0 + FSC = 0x07: level 3 translation fault + Data abort info: + ISV = 0, ISS = 0x00000007, ISS2 = 0x00000000 + CM = 0, WnR = 0, TnD = 0, TagAccess = 0 + GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 + user pgtable: 4k pages, 39-bit VAs, pgdp=00000002175aa000 + [0000000000000058] pgd=08000002175ab003, p4d=08000002175ab003, pud=08000002175ab003, pmd=08000002175be003, pte=0000000000000000 + pstate: 834000c5 (Nzcv daIF +PAN -UAO +TCO +DIT -SSBS BTYPE=--) + pc : __task_pid_nr_ns+0x74/0xd0 + lr : __task_pid_nr_ns+0x24/0xd0 + sp : ffffffc08001bd10 + x29: ffffffc08001bd10 x28: ffffffd4422b2000 x27: 0000000000000001 + x26: ffffffd442821168 x25: ffffffd442821000 x24: 00000f89492eab31 + x23: 00000000000000c0 x22: ffffff806f5693c0 x21: ffffff806f5693c0 + x20: 0000000000000001 x19: 0000000000000000 x18: 0000000000000000 + x17: 00000000529c6ef0 x16: 00000000529c6ef0 x15: 00000000023a1adc + x14: 0000000000000003 x13: 00000000007ef6d8 x12: 001167c391c78800 + x11: 00ffffffffffffff x10: 0000000000000000 x9 : 0000000000000001 + x8 : ffffff80816fa3c0 x7 : 0000000000000000 x6 : 49534d702d535449 + x5 : ffffffc080c4c2c0 x4 : ffffffd43ee128c8 x3 : ffffffd43ee124dc + x2 : 0000000000000000 x1 : 0000000000000001 x0 : ffffff806f5693c0 + Call trace: + __task_pid_nr_ns+0x74/0xd0 + ... + __handle_irq_event_percpu+0xd4/0x284 + handle_irq_event+0x48/0xb0 + handle_fasteoi_irq+0x160/0x2d8 + generic_handle_domain_irq+0x44/0x60 + gic_handle_irq+0x4c/0x114 + call_on_irq_stack+0x3c/0x74 + do_interrupt_handler+0x4c/0x84 + el1_interrupt+0x34/0x58 + el1h_64_irq_handler+0x18/0x24 + el1h_64_irq+0x68/0x6c + account_kernel_stack+0x60/0x144 + exit_task_stack_account+0x1c/0x80 + do_exit+0x7e4/0xaf8 + ... 
+ get_signal+0x7bc/0x8d8 + do_notify_resume+0x128/0x828 + el0_svc+0x6c/0x70 + el0t_64_sync_handler+0x68/0xbc + el0t_64_sync+0x1a8/0x1ac + Code: 35fffe54 911a02a8 f9400108 b4000128 (b9405a69) + ---[ end trace 0000000000000000 ]--- + Kernel panic - not syncing: Oops: Fatal exception in interrupt + +Signed-off-by: gaoxiang17 +Link: https://lore.kernel.org/20250802022123.3536934-1-gxxa03070307@gmail.com +Reviewed-by: Baoquan He +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + kernel/pid.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/pid.c b/kernel/pid.c +index 8bce3aebc949f..e1d0c9d952278 100644 +--- a/kernel/pid.c ++++ b/kernel/pid.c +@@ -474,7 +474,7 @@ pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) + struct upid *upid; + pid_t nr = 0; + +- if (pid && ns->level <= pid->level) { ++ if (pid && ns && ns->level <= pid->level) { + upid = &pid->numbers[ns->level]; + if (upid->ns == ns) + nr = upid->nr; +-- +2.51.0 + diff --git a/queue-6.1/pid-make-__task_pid_nr_ns-ns-null-safe-for-zombie-ca.patch b/queue-6.1/pid-make-__task_pid_nr_ns-ns-null-safe-for-zombie-ca.patch new file mode 100644 index 0000000000..095135efe6 --- /dev/null +++ b/queue-6.1/pid-make-__task_pid_nr_ns-ns-null-safe-for-zombie-ca.patch @@ -0,0 +1,48 @@ +From ecf91d0814b73954603515999846941820223193 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 10 Aug 2025 19:36:04 +0200 +Subject: pid: make __task_pid_nr_ns(ns => NULL) safe for zombie callers +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Oleg Nesterov + +[ Upstream commit abdfd4948e45c51b19162cf8b3f5003f8f53c9b9 ] + +task_pid_vnr(another_task) will crash if the caller was already reaped. +The pid_alive(current) check can't really help, the parent/debugger can +call release_task() right after this check. 
+ +This also means that even task_ppid_nr_ns(current, NULL) is not safe, +pid_alive() only ensures that it is safe to dereference ->real_parent. + +Change __task_pid_nr_ns() to ensure ns != NULL. + +Originally-by: 高翔 +Link: https://lore.kernel.org/all/20250802022123.3536934-1-gxxa03070307@gmail.com/ +Signed-off-by: Oleg Nesterov +Link: https://lore.kernel.org/20250810173604.GA19991@redhat.com +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + kernel/pid.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/kernel/pid.c b/kernel/pid.c +index e1d0c9d952278..62a8349267de1 100644 +--- a/kernel/pid.c ++++ b/kernel/pid.c +@@ -497,7 +497,8 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, + rcu_read_lock(); + if (!ns) + ns = task_active_pid_ns(current); +- nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); ++ if (ns) ++ nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); + rcu_read_unlock(); + + return nr; +-- +2.51.0 + diff --git a/queue-6.1/series b/queue-6.1/series index 18f31b8c9c..fe1b0e1050 100644 --- a/queue-6.1/series +++ b/queue-6.1/series @@ -158,3 +158,10 @@ asm-generic-io-add-_ret_ip_-to-mmio-trace-for-more-accurate-debug-info.patch asm-generic-io.h-suppress-endianness-warnings-for-relaxed-accessors.patch asm-generic-io.h-skip-trace-helpers-if-rwmmio-events-are-disabled.patch mptcp-pm-in-kernel-usable-client-side-with-c-flag.patch +minixfs-verify-inode-mode-when-loading-from-disk.patch +pid-add-a-judgment-for-ns-null-in-pid_nr_ns.patch +pid-make-__task_pid_nr_ns-ns-null-safe-for-zombie-ca.patch +fs-add-initramfs_options-to-set-initramfs-mount-opti.patch +cramfs-verify-inode-mode-when-loading-from-disk.patch +writeback-avoid-softlockup-when-switching-many-inode.patch +writeback-avoid-excessively-long-inode-switching-tim.patch diff --git a/queue-6.1/writeback-avoid-excessively-long-inode-switching-tim.patch b/queue-6.1/writeback-avoid-excessively-long-inode-switching-tim.patch 
new file mode 100644 index 0000000000..904555ef45 --- /dev/null +++ b/queue-6.1/writeback-avoid-excessively-long-inode-switching-tim.patch @@ -0,0 +1,102 @@ +From 72326df292a4827988a5837c9a54c38ca3b6ec36 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 12 Sep 2025 12:38:37 +0200 +Subject: writeback: Avoid excessively long inode switching times + +From: Jan Kara + +[ Upstream commit 9a6ebbdbd41235ea3bc0c4f39e2076599b8113cc ] + +With lazytime mount option enabled we can be switching many dirty inodes +on cgroup exit to the parent cgroup. The numbers observed in practice +when systemd slice of a large cron job exits can easily reach hundreds +of thousands or millions. The logic in inode_do_switch_wbs() which sorts +the inode into appropriate place in b_dirty list of the target wb +however has linear complexity in the number of dirty inodes thus overall +time complexity of switching all the inodes is quadratic leading to +workers being pegged for hours consuming 100% of the CPU and switching +inodes to the parent wb. + +Simple reproducer of the issue: + FILES=10000 + # Filesystem mounted with lazytime mount option + MNT=/mnt/ + echo "Creating files and switching timestamps" + for (( j = 0; j < 50; j ++ )); do + mkdir $MNT/dir$j + for (( i = 0; i < $FILES; i++ )); do + echo "foo" >$MNT/dir$j/file$i + done + touch -a -t 202501010000 $MNT/dir$j/file* + done + wait + echo "Syncing and flushing" + sync + echo 3 >/proc/sys/vm/drop_caches + + echo "Reading all files from a cgroup" + mkdir /sys/fs/cgroup/unified/mycg1 || exit + echo $$ >/sys/fs/cgroup/unified/mycg1/cgroup.procs || exit + for (( j = 0; j < 50; j ++ )); do + cat /mnt/dir$j/file* >/dev/null & + done + wait + echo "Switching wbs" + # Now rmdir the cgroup after the script exits + +We need to maintain b_dirty list ordering to keep writeback happy so +instead of sorting inode into appropriate place just append it at the +end of the list and clobber dirtied_time_when. 
This may result in inode +writeback starting later after cgroup switch however cgroup switches are +rare so it shouldn't matter much. Since the cgroup had write access to +the inode, there are no practical concerns of the possible DoS issues. + +Acked-by: Tejun Heo +Signed-off-by: Jan Kara +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + fs/fs-writeback.c | 21 +++++++++++---------- + 1 file changed, 11 insertions(+), 10 deletions(-) + +diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c +index 07473cf2a7c9b..75e8c102c5eef 100644 +--- a/fs/fs-writeback.c ++++ b/fs/fs-writeback.c +@@ -420,22 +420,23 @@ static bool inode_do_switch_wbs(struct inode *inode, + * Transfer to @new_wb's IO list if necessary. If the @inode is dirty, + * the specific list @inode was on is ignored and the @inode is put on + * ->b_dirty which is always correct including from ->b_dirty_time. +- * The transfer preserves @inode->dirtied_when ordering. If the @inode +- * was clean, it means it was on the b_attached list, so move it onto +- * the b_attached list of @new_wb. ++ * If the @inode was clean, it means it was on the b_attached list, so ++ * move it onto the b_attached list of @new_wb. + */ + if (!list_empty(&inode->i_io_list)) { + inode->i_wb = new_wb; + + if (inode->i_state & I_DIRTY_ALL) { +- struct inode *pos; +- +- list_for_each_entry(pos, &new_wb->b_dirty, i_io_list) +- if (time_after_eq(inode->dirtied_when, +- pos->dirtied_when)) +- break; ++ /* ++ * We need to keep b_dirty list sorted by ++ * dirtied_time_when. However properly sorting the ++ * inode in the list gets too expensive when switching ++ * many inodes. So just attach inode at the end of the ++ * dirty list and clobber the dirtied_time_when. 
++ */ ++ inode->dirtied_time_when = jiffies; + inode_io_list_move_locked(inode, new_wb, +- pos->i_io_list.prev); ++ &new_wb->b_dirty); + } else { + inode_cgwb_move_to_attached(inode, new_wb); + } +-- +2.51.0 + diff --git a/queue-6.1/writeback-avoid-softlockup-when-switching-many-inode.patch b/queue-6.1/writeback-avoid-softlockup-when-switching-many-inode.patch new file mode 100644 index 0000000000..dd4134177f --- /dev/null +++ b/queue-6.1/writeback-avoid-softlockup-when-switching-many-inode.patch @@ -0,0 +1,65 @@ +From 5cfd0e6adfdedb4d45aeeb61ffcbdd1acb9e8d8e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 12 Sep 2025 12:38:36 +0200 +Subject: writeback: Avoid softlockup when switching many inodes + +From: Jan Kara + +[ Upstream commit 66c14dccd810d42ec5c73bb8a9177489dfd62278 ] + +process_inode_switch_wbs_work() can be switching over 100 inodes to a +different cgroup. Since switching an inode requires counting all dirty & +under-writeback pages in the address space of each inode, this can take +a significant amount of time. Add a possibility to reschedule after +processing each inode to avoid softlockups. + +Acked-by: Tejun Heo +Signed-off-by: Jan Kara +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + fs/fs-writeback.c | 11 ++++++++++- + 1 file changed, 10 insertions(+), 1 deletion(-) + +diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c +index 41f8ae8a416fb..07473cf2a7c9b 100644 +--- a/fs/fs-writeback.c ++++ b/fs/fs-writeback.c +@@ -477,6 +477,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) + */ + down_read(&bdi->wb_switch_rwsem); + ++ inodep = isw->inodes; + /* + * By the time control reaches here, RCU grace period has passed + * since I_WB_SWITCH assertion and all wb stat update transactions +@@ -487,6 +488,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) + * gives us exclusion against all wb related operations on @inode + * including IO list manipulations and stat updates. 
+ */ ++relock: + if (old_wb < new_wb) { + spin_lock(&old_wb->list_lock); + spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING); +@@ -495,10 +497,17 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) + spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING); + } + +- for (inodep = isw->inodes; *inodep; inodep++) { ++ while (*inodep) { + WARN_ON_ONCE((*inodep)->i_wb != old_wb); + if (inode_do_switch_wbs(*inodep, old_wb, new_wb)) + nr_switched++; ++ inodep++; ++ if (*inodep && need_resched()) { ++ spin_unlock(&new_wb->list_lock); ++ spin_unlock(&old_wb->list_lock); ++ cond_resched(); ++ goto relock; ++ } + } + + spin_unlock(&new_wb->list_lock); +-- +2.51.0 + diff --git a/queue-6.12/copy_file_range-limit-size-if-in-compat-mode.patch b/queue-6.12/copy_file_range-limit-size-if-in-compat-mode.patch new file mode 100644 index 0000000000..080a2afb7f --- /dev/null +++ b/queue-6.12/copy_file_range-limit-size-if-in-compat-mode.patch @@ -0,0 +1,66 @@ +From 2f9b6661cd9ebb2b99dc7a3584f5eeeab267e370 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 13 Aug 2025 17:11:05 +0200 +Subject: copy_file_range: limit size if in compat mode + +From: Miklos Szeredi + +[ Upstream commit f8f59a2c05dc16d19432e3154a9ac7bc385f4b92 ] + +If the process runs in 32-bit compat mode, copy_file_range results can be +in the in-band error range. In this case limit copy length to MAX_RW_COUNT +to prevent a signed overflow. 
+ +Reported-by: Florian Weimer +Closes: https://lore.kernel.org/all/lhuh5ynl8z5.fsf@oldenburg.str.redhat.com/ +Signed-off-by: Miklos Szeredi +Link: https://lore.kernel.org/20250813151107.99856-1-mszeredi@redhat.com +Reviewed-by: Amir Goldstein +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + fs/read_write.c | 14 +++++++++----- + 1 file changed, 9 insertions(+), 5 deletions(-) + +diff --git a/fs/read_write.c b/fs/read_write.c +index befec0b5c537a..46408bab92385 100644 +--- a/fs/read_write.c ++++ b/fs/read_write.c +@@ -1600,6 +1600,13 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, + if (len == 0) + return 0; + ++ /* ++ * Make sure return value doesn't overflow in 32bit compat mode. Also ++ * limit the size for all cases except when calling ->copy_file_range(). ++ */ ++ if (splice || !file_out->f_op->copy_file_range || in_compat_syscall()) ++ len = min_t(size_t, MAX_RW_COUNT, len); ++ + file_start_write(file_out); + + /* +@@ -1613,9 +1620,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, + len, flags); + } else if (!splice && file_in->f_op->remap_file_range && samesb) { + ret = file_in->f_op->remap_file_range(file_in, pos_in, +- file_out, pos_out, +- min_t(loff_t, MAX_RW_COUNT, len), +- REMAP_FILE_CAN_SHORTEN); ++ file_out, pos_out, len, REMAP_FILE_CAN_SHORTEN); + /* fallback to splice */ + if (ret <= 0) + splice = true; +@@ -1648,8 +1653,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, + * to splicing from input file, while file_start_write() is held on + * the output file on a different sb. 
+ */ +- ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, +- min_t(size_t, len, MAX_RW_COUNT), 0); ++ ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, len, 0); + done: + if (ret > 0) { + fsnotify_access(file_in); +-- +2.51.0 + diff --git a/queue-6.12/cramfs-verify-inode-mode-when-loading-from-disk.patch b/queue-6.12/cramfs-verify-inode-mode-when-loading-from-disk.patch new file mode 100644 index 0000000000..ef148d1e24 --- /dev/null +++ b/queue-6.12/cramfs-verify-inode-mode-when-loading-from-disk.patch @@ -0,0 +1,51 @@ +From 1f031cd7b076a4542583c81a7517497430ae246b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 30 Aug 2025 19:01:01 +0900 +Subject: cramfs: Verify inode mode when loading from disk + +From: Tetsuo Handa + +[ Upstream commit 7f9d34b0a7cb93d678ee7207f0634dbf79e47fe5 ] + +The inode mode loaded from corrupted disk can be invalid. Do like what +commit 0a9e74051313 ("isofs: Verify inode mode when loading from disk") +does. + +Reported-by: syzbot +Closes: https://syzkaller.appspot.com/bug?extid=895c23f6917da440ed0d +Signed-off-by: Tetsuo Handa +Link: https://lore.kernel.org/429b3ef1-13de-4310-9a8e-c2dc9a36234a@I-love.SAKURA.ne.jp +Acked-by: Nicolas Pitre +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + fs/cramfs/inode.c | 11 ++++++++++- + 1 file changed, 10 insertions(+), 1 deletion(-) + +diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c +index b84d1747a0205..e7d192f7ab3b4 100644 +--- a/fs/cramfs/inode.c ++++ b/fs/cramfs/inode.c +@@ -117,9 +117,18 @@ static struct inode *get_cramfs_inode(struct super_block *sb, + inode_nohighmem(inode); + inode->i_data.a_ops = &cramfs_aops; + break; +- default: ++ case S_IFCHR: ++ case S_IFBLK: ++ case S_IFIFO: ++ case S_IFSOCK: + init_special_inode(inode, cramfs_inode->mode, + old_decode_dev(cramfs_inode->size)); ++ break; ++ default: ++ printk(KERN_DEBUG "CRAMFS: Invalid file type 0%04o for inode %lu.\n", ++ inode->i_mode, inode->i_ino); ++ iget_failed(inode); ++ return 
ERR_PTR(-EIO); + } + + inode->i_mode = cramfs_inode->mode; +-- +2.51.0 + diff --git a/queue-6.12/fs-add-initramfs_options-to-set-initramfs-mount-opti.patch b/queue-6.12/fs-add-initramfs_options-to-set-initramfs-mount-opti.patch new file mode 100644 index 0000000000..30ab7479fc --- /dev/null +++ b/queue-6.12/fs-add-initramfs_options-to-set-initramfs-mount-opti.patch @@ -0,0 +1,116 @@ +From 0c73650be4836726549af5eff48302970c99a5f7 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 15 Aug 2025 20:14:59 +0800 +Subject: fs: Add 'initramfs_options' to set initramfs mount options +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Lichen Liu + +[ Upstream commit 278033a225e13ec21900f0a92b8351658f5377f2 ] + +When CONFIG_TMPFS is enabled, the initial root filesystem is a tmpfs. +By default, a tmpfs mount is limited to using 50% of the available RAM +for its content. This can be problematic in memory-constrained +environments, particularly during a kdump capture. + +In a kdump scenario, the capture kernel boots with a limited amount of +memory specified by the 'crashkernel' parameter. If the initramfs is +large, it may fail to unpack into the tmpfs rootfs due to insufficient +space. This is because to get X MB of usable space in tmpfs, 2*X MB of +memory must be available for the mount. This leads to an OOM failure +during the early boot process, preventing a successful crash dump. + +This patch introduces a new kernel command-line parameter, +initramfs_options, which allows passing specific mount options directly +to the rootfs when it is first mounted. This gives users control over +the rootfs behavior. + +For example, a user can now specify initramfs_options=size=75% to allow +the tmpfs to use up to 75% of the available memory. This can +significantly reduce the memory pressure for kdump. + +Consider a practical example: + +To unpack a 48MB initramfs, the tmpfs needs 48MB of usable space. 
With +the default 50% limit, this requires a memory pool of 96MB to be +available for the tmpfs mount. The total memory requirement is therefore +approximately: 16MB (vmlinuz) + 48MB (loaded initramfs) + 48MB (unpacked +kernel) + 96MB (for tmpfs) + 12MB (runtime overhead) ≈ 220MB. + +By using initramfs_options=size=75%, the memory pool required for the +48MB tmpfs is reduced to 48MB / 0.75 = 64MB. This reduces the total +memory requirement by 32MB (96MB - 64MB), allowing the kdump to succeed +with a smaller crashkernel size, such as 192MB. + +An alternative approach of reusing the existing rootflags parameter was +considered. However, a new, dedicated initramfs_options parameter was +chosen to avoid altering the current behavior of rootflags (which +applies to the final root filesystem) and to prevent any potential +regressions. + +Also add documentation for the new kernel parameter "initramfs_options" + +This approach is inspired by prior discussions and patches on the topic. +Ref: https://www.lightofdawn.org/blog/?viewDetailed=00128 +Ref: https://landley.net/notes-2015.html#01-01-2015 +Ref: https://lkml.org/lkml/2021/6/29/783 +Ref: https://www.kernel.org/doc/html/latest/filesystems/ramfs-rootfs-initramfs.html#what-is-rootfs + +Signed-off-by: Lichen Liu +Link: https://lore.kernel.org/20250815121459.3391223-1-lichliu@redhat.com +Tested-by: Rob Landley +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + Documentation/admin-guide/kernel-parameters.txt | 3 +++ + fs/namespace.c | 11 ++++++++++- + 2 files changed, 13 insertions(+), 1 deletion(-) + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index 8724c2c580b88..e88505e945d52 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -5923,6 +5923,9 @@ + + rootflags= [KNL] Set root filesystem mount option string + ++ initramfs_options= [KNL] ++ Specify mount options for for 
the initramfs mount. ++ + rootfstype= [KNL] Set root filesystem type + + rootwait [KNL] Wait (indefinitely) for root device to show up. +diff --git a/fs/namespace.c b/fs/namespace.c +index 7606969412493..f5e46c2595b11 100644 +--- a/fs/namespace.c ++++ b/fs/namespace.c +@@ -65,6 +65,15 @@ static int __init set_mphash_entries(char *str) + } + __setup("mphash_entries=", set_mphash_entries); + ++static char * __initdata initramfs_options; ++static int __init initramfs_options_setup(char *str) ++{ ++ initramfs_options = str; ++ return 1; ++} ++ ++__setup("initramfs_options=", initramfs_options_setup); ++ + static u64 event; + static DEFINE_IDA(mnt_id_ida); + static DEFINE_IDA(mnt_group_ida); +@@ -5566,7 +5575,7 @@ static void __init init_mount_tree(void) + struct mnt_namespace *ns; + struct path root; + +- mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL); ++ mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", initramfs_options); + if (IS_ERR(mnt)) + panic("Can't create rootfs"); + +-- +2.51.0 + diff --git a/queue-6.12/irqchip-sifive-plic-avoid-interrupt-id-0-handling-du.patch b/queue-6.12/irqchip-sifive-plic-avoid-interrupt-id-0-handling-du.patch new file mode 100644 index 0000000000..773d171e5c --- /dev/null +++ b/queue-6.12/irqchip-sifive-plic-avoid-interrupt-id-0-handling-du.patch @@ -0,0 +1,64 @@ +From 058647f6c5b3d0a4209840a51e23fa14443ee006 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 23 Sep 2025 15:43:19 +0100 +Subject: irqchip/sifive-plic: Avoid interrupt ID 0 handling during + suspend/resume + +From: Lucas Zampieri + +[ Upstream commit f75e07bf5226da640fa99a0594687c780d9bace4 ] + +According to the PLIC specification[1], global interrupt sources are +assigned small unsigned integer identifiers beginning at the value 1. +An interrupt ID of 0 is reserved to mean "no interrupt". 
+ +The current plic_irq_resume() and plic_irq_suspend() functions incorrectly +start the loop from index 0, which accesses the register space for the +reserved interrupt ID 0. + +Change the loop to start from index 1, skipping the reserved +interrupt ID 0 as per the PLIC specification. + +This prevents potential undefined behavior when accessing the reserved +register space during suspend/resume cycles. + +Fixes: e80f0b6a2cf3 ("irqchip/irq-sifive-plic: Add syscore callbacks for hibernation") +Co-developed-by: Jia Wang +Signed-off-by: Jia Wang +Co-developed-by: Charles Mirabile +Signed-off-by: Charles Mirabile +Signed-off-by: Lucas Zampieri +Signed-off-by: Thomas Gleixner +Link: https://github.com/riscv/riscv-plic-spec/releases/tag/1.0.0 +Signed-off-by: Sasha Levin +--- + drivers/irqchip/irq-sifive-plic.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/drivers/irqchip/irq-sifive-plic.c b/drivers/irqchip/irq-sifive-plic.c +index bf69a4802b71e..9c4af7d588463 100644 +--- a/drivers/irqchip/irq-sifive-plic.c ++++ b/drivers/irqchip/irq-sifive-plic.c +@@ -252,7 +252,8 @@ static int plic_irq_suspend(void) + + priv = per_cpu_ptr(&plic_handlers, smp_processor_id())->priv; + +- for (i = 0; i < priv->nr_irqs; i++) { ++ /* irq ID 0 is reserved */ ++ for (i = 1; i < priv->nr_irqs; i++) { + __assign_bit(i, priv->prio_save, + readl(priv->regs + PRIORITY_BASE + i * PRIORITY_PER_ID)); + } +@@ -283,7 +284,8 @@ static void plic_irq_resume(void) + + priv = per_cpu_ptr(&plic_handlers, smp_processor_id())->priv; + +- for (i = 0; i < priv->nr_irqs; i++) { ++ /* irq ID 0 is reserved */ ++ for (i = 1; i < priv->nr_irqs; i++) { + index = BIT_WORD(i); + writel((priv->prio_save[index] & BIT_MASK(i)) ? 
1 : 0, + priv->regs + PRIORITY_BASE + i * PRIORITY_PER_ID); +-- +2.51.0 + diff --git a/queue-6.12/irqchip-sifive-plic-make-use-of-__assign_bit.patch b/queue-6.12/irqchip-sifive-plic-make-use-of-__assign_bit.patch new file mode 100644 index 0000000000..caf7ade5be --- /dev/null +++ b/queue-6.12/irqchip-sifive-plic-make-use-of-__assign_bit.patch @@ -0,0 +1,51 @@ +From c65fedccb3cc2eed80b6eb8cfe52677a574b2a08 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 2 Sep 2024 21:08:24 +0800 +Subject: irqchip/sifive-plic: Make use of __assign_bit() + +From: Hongbo Li + +[ Upstream commit 40d7af5375a4e27d8576d9d11954ac213d06f09e ] + +Replace the open coded + +if (foo) + __set_bit(n, bar); + else + __clear_bit(n, bar); + +with __assign_bit(). No functional change intended. + +Signed-off-by: Hongbo Li +Signed-off-by: Thomas Gleixner +Reviewed-by: Palmer Dabbelt +Link: https://lore.kernel.org/all/20240902130824.2878644-1-lihongbo22@huawei.com +Stable-dep-of: f75e07bf5226 ("irqchip/sifive-plic: Avoid interrupt ID 0 handling during suspend/resume") +Signed-off-by: Sasha Levin +--- + drivers/irqchip/irq-sifive-plic.c | 9 ++++----- + 1 file changed, 4 insertions(+), 5 deletions(-) + +diff --git a/drivers/irqchip/irq-sifive-plic.c b/drivers/irqchip/irq-sifive-plic.c +index 36dbcf2d728a5..bf69a4802b71e 100644 +--- a/drivers/irqchip/irq-sifive-plic.c ++++ b/drivers/irqchip/irq-sifive-plic.c +@@ -252,11 +252,10 @@ static int plic_irq_suspend(void) + + priv = per_cpu_ptr(&plic_handlers, smp_processor_id())->priv; + +- for (i = 0; i < priv->nr_irqs; i++) +- if (readl(priv->regs + PRIORITY_BASE + i * PRIORITY_PER_ID)) +- __set_bit(i, priv->prio_save); +- else +- __clear_bit(i, priv->prio_save); ++ for (i = 0; i < priv->nr_irqs; i++) { ++ __assign_bit(i, priv->prio_save, ++ readl(priv->regs + PRIORITY_BASE + i * PRIORITY_PER_ID)); ++ } + + for_each_cpu(cpu, cpu_present_mask) { + struct plic_handler *handler = per_cpu_ptr(&plic_handlers, cpu); +-- +2.51.0 + diff --git 
a/queue-6.12/minixfs-verify-inode-mode-when-loading-from-disk.patch b/queue-6.12/minixfs-verify-inode-mode-when-loading-from-disk.patch new file mode 100644 index 0000000000..cfcca5d8db --- /dev/null +++ b/queue-6.12/minixfs-verify-inode-mode-when-loading-from-disk.patch @@ -0,0 +1,46 @@ +From 087bb55affd7ea5762b35c3b173cd2b283945ed6 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 13 Aug 2025 00:17:44 +0900 +Subject: minixfs: Verify inode mode when loading from disk + +From: Tetsuo Handa + +[ Upstream commit 73861970938ad1323eb02bbbc87f6fbd1e5bacca ] + +The inode mode loaded from corrupted disk can be invalid. Do like what +commit 0a9e74051313 ("isofs: Verify inode mode when loading from disk") +does. + +Reported-by: syzbot +Closes: https://syzkaller.appspot.com/bug?extid=895c23f6917da440ed0d +Signed-off-by: Tetsuo Handa +Link: https://lore.kernel.org/ec982681-84b8-4624-94fa-8af15b77cbd2@I-love.SAKURA.ne.jp +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + fs/minix/inode.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +diff --git a/fs/minix/inode.c b/fs/minix/inode.c +index f007e389d5d29..fc01f9dc8c391 100644 +--- a/fs/minix/inode.c ++++ b/fs/minix/inode.c +@@ -491,8 +491,14 @@ void minix_set_inode(struct inode *inode, dev_t rdev) + inode->i_op = &minix_symlink_inode_operations; + inode_nohighmem(inode); + inode->i_mapping->a_ops = &minix_aops; +- } else ++ } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || ++ S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + init_special_inode(inode, inode->i_mode, rdev); ++ } else { ++ printk(KERN_DEBUG "MINIX-fs: Invalid file type 0%04o for inode %lu.\n", ++ inode->i_mode, inode->i_ino); ++ make_bad_inode(inode); ++ } + } + + /* +-- +2.51.0 + diff --git a/queue-6.12/pid-add-a-judgment-for-ns-null-in-pid_nr_ns.patch b/queue-6.12/pid-add-a-judgment-for-ns-null-in-pid_nr_ns.patch new file mode 100644 index 0000000000..44cc37cb76 --- /dev/null +++ 
b/queue-6.12/pid-add-a-judgment-for-ns-null-in-pid_nr_ns.patch @@ -0,0 +1,95 @@ +From 3f235527ebaae3b7942faa087d4f67ae5f839dfc Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 2 Aug 2025 10:21:23 +0800 +Subject: pid: Add a judgment for ns null in pid_nr_ns + +From: gaoxiang17 + +[ Upstream commit 006568ab4c5ca2309ceb36fa553e390b4aa9c0c7 ] + +__task_pid_nr_ns + ns = task_active_pid_ns(current); + pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); + if (pid && ns->level <= pid->level) { + +Sometimes null is returned for task_active_pid_ns. Then it will trigger kernel panic in pid_nr_ns. + +For example: + Unable to handle kernel NULL pointer dereference at virtual address 0000000000000058 + Mem abort info: + ESR = 0x0000000096000007 + EC = 0x25: DABT (current EL), IL = 32 bits + SET = 0, FnV = 0 + EA = 0, S1PTW = 0 + FSC = 0x07: level 3 translation fault + Data abort info: + ISV = 0, ISS = 0x00000007, ISS2 = 0x00000000 + CM = 0, WnR = 0, TnD = 0, TagAccess = 0 + GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 + user pgtable: 4k pages, 39-bit VAs, pgdp=00000002175aa000 + [0000000000000058] pgd=08000002175ab003, p4d=08000002175ab003, pud=08000002175ab003, pmd=08000002175be003, pte=0000000000000000 + pstate: 834000c5 (Nzcv daIF +PAN -UAO +TCO +DIT -SSBS BTYPE=--) + pc : __task_pid_nr_ns+0x74/0xd0 + lr : __task_pid_nr_ns+0x24/0xd0 + sp : ffffffc08001bd10 + x29: ffffffc08001bd10 x28: ffffffd4422b2000 x27: 0000000000000001 + x26: ffffffd442821168 x25: ffffffd442821000 x24: 00000f89492eab31 + x23: 00000000000000c0 x22: ffffff806f5693c0 x21: ffffff806f5693c0 + x20: 0000000000000001 x19: 0000000000000000 x18: 0000000000000000 + x17: 00000000529c6ef0 x16: 00000000529c6ef0 x15: 00000000023a1adc + x14: 0000000000000003 x13: 00000000007ef6d8 x12: 001167c391c78800 + x11: 00ffffffffffffff x10: 0000000000000000 x9 : 0000000000000001 + x8 : ffffff80816fa3c0 x7 : 0000000000000000 x6 : 49534d702d535449 + x5 : ffffffc080c4c2c0 x4 : ffffffd43ee128c8 x3 : ffffffd43ee124dc + x2 : 
0000000000000000 x1 : 0000000000000001 x0 : ffffff806f5693c0 + Call trace: + __task_pid_nr_ns+0x74/0xd0 + ... + __handle_irq_event_percpu+0xd4/0x284 + handle_irq_event+0x48/0xb0 + handle_fasteoi_irq+0x160/0x2d8 + generic_handle_domain_irq+0x44/0x60 + gic_handle_irq+0x4c/0x114 + call_on_irq_stack+0x3c/0x74 + do_interrupt_handler+0x4c/0x84 + el1_interrupt+0x34/0x58 + el1h_64_irq_handler+0x18/0x24 + el1h_64_irq+0x68/0x6c + account_kernel_stack+0x60/0x144 + exit_task_stack_account+0x1c/0x80 + do_exit+0x7e4/0xaf8 + ... + get_signal+0x7bc/0x8d8 + do_notify_resume+0x128/0x828 + el0_svc+0x6c/0x70 + el0t_64_sync_handler+0x68/0xbc + el0t_64_sync+0x1a8/0x1ac + Code: 35fffe54 911a02a8 f9400108 b4000128 (b9405a69) + ---[ end trace 0000000000000000 ]--- + Kernel panic - not syncing: Oops: Fatal exception in interrupt + +Signed-off-by: gaoxiang17 +Link: https://lore.kernel.org/20250802022123.3536934-1-gxxa03070307@gmail.com +Reviewed-by: Baoquan He +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + kernel/pid.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/pid.c b/kernel/pid.c +index 2715afb77eab8..b80c3bfb58d07 100644 +--- a/kernel/pid.c ++++ b/kernel/pid.c +@@ -487,7 +487,7 @@ pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) + struct upid *upid; + pid_t nr = 0; + +- if (pid && ns->level <= pid->level) { ++ if (pid && ns && ns->level <= pid->level) { + upid = &pid->numbers[ns->level]; + if (upid->ns == ns) + nr = upid->nr; +-- +2.51.0 + diff --git a/queue-6.12/pid-make-__task_pid_nr_ns-ns-null-safe-for-zombie-ca.patch b/queue-6.12/pid-make-__task_pid_nr_ns-ns-null-safe-for-zombie-ca.patch new file mode 100644 index 0000000000..d9c61ad727 --- /dev/null +++ b/queue-6.12/pid-make-__task_pid_nr_ns-ns-null-safe-for-zombie-ca.patch @@ -0,0 +1,48 @@ +From d0a57bd87b2bc11529d3849fe657d04b5de485d9 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 10 Aug 2025 19:36:04 +0200 +Subject: pid: make __task_pid_nr_ns(ns => NULL) 
safe for zombie callers +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Oleg Nesterov + +[ Upstream commit abdfd4948e45c51b19162cf8b3f5003f8f53c9b9 ] + +task_pid_vnr(another_task) will crash if the caller was already reaped. +The pid_alive(current) check can't really help, the parent/debugger can +call release_task() right after this check. + +This also means that even task_ppid_nr_ns(current, NULL) is not safe, +pid_alive() only ensures that it is safe to dereference ->real_parent. + +Change __task_pid_nr_ns() to ensure ns != NULL. + +Originally-by: 高翔 +Link: https://lore.kernel.org/all/20250802022123.3536934-1-gxxa03070307@gmail.com/ +Signed-off-by: Oleg Nesterov +Link: https://lore.kernel.org/20250810173604.GA19991@redhat.com +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + kernel/pid.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/kernel/pid.c b/kernel/pid.c +index b80c3bfb58d07..8fdc3a5f87c7d 100644 +--- a/kernel/pid.c ++++ b/kernel/pid.c +@@ -510,7 +510,8 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, + rcu_read_lock(); + if (!ns) + ns = task_active_pid_ns(current); +- nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); ++ if (ns) ++ nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); + rcu_read_unlock(); + + return nr; +-- +2.51.0 + diff --git a/queue-6.12/series b/queue-6.12/series index 95bb15cb5e..951af66de6 100644 --- a/queue-6.12/series +++ b/queue-6.12/series @@ -260,3 +260,13 @@ s390-bpf-centralize-frame-offset-calculations.patch s390-bpf-describe-the-frame-using-a-struct-instead-of-constants.patch s390-bpf-write-back-tail-call-counter-for-bpf_pseudo_call.patch s390-bpf-write-back-tail-call-counter-for-bpf_tramp_f_call_orig.patch +irqchip-sifive-plic-make-use-of-__assign_bit.patch +irqchip-sifive-plic-avoid-interrupt-id-0-handling-du.patch +copy_file_range-limit-size-if-in-compat-mode.patch 
+minixfs-verify-inode-mode-when-loading-from-disk.patch +pid-add-a-judgment-for-ns-null-in-pid_nr_ns.patch +pid-make-__task_pid_nr_ns-ns-null-safe-for-zombie-ca.patch +fs-add-initramfs_options-to-set-initramfs-mount-opti.patch +cramfs-verify-inode-mode-when-loading-from-disk.patch +writeback-avoid-softlockup-when-switching-many-inode.patch +writeback-avoid-excessively-long-inode-switching-tim.patch diff --git a/queue-6.12/writeback-avoid-excessively-long-inode-switching-tim.patch b/queue-6.12/writeback-avoid-excessively-long-inode-switching-tim.patch new file mode 100644 index 0000000000..f2ca30cf46 --- /dev/null +++ b/queue-6.12/writeback-avoid-excessively-long-inode-switching-tim.patch @@ -0,0 +1,102 @@ +From 610a1224b6c329c6463202ca1bee301c9b4502d2 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 12 Sep 2025 12:38:37 +0200 +Subject: writeback: Avoid excessively long inode switching times + +From: Jan Kara + +[ Upstream commit 9a6ebbdbd41235ea3bc0c4f39e2076599b8113cc ] + +With lazytime mount option enabled we can be switching many dirty inodes +on cgroup exit to the parent cgroup. The numbers observed in practice +when systemd slice of a large cron job exits can easily reach hundreds +of thousands or millions. The logic in inode_do_switch_wbs() which sorts +the inode into appropriate place in b_dirty list of the target wb +however has linear complexity in the number of dirty inodes thus overall +time complexity of switching all the inodes is quadratic leading to +workers being pegged for hours consuming 100% of the CPU and switching +inodes to the parent wb. 
+ +Simple reproducer of the issue: + FILES=10000 + # Filesystem mounted with lazytime mount option + MNT=/mnt/ + echo "Creating files and switching timestamps" + for (( j = 0; j < 50; j ++ )); do + mkdir $MNT/dir$j + for (( i = 0; i < $FILES; i++ )); do + echo "foo" >$MNT/dir$j/file$i + done + touch -a -t 202501010000 $MNT/dir$j/file* + done + wait + echo "Syncing and flushing" + sync + echo 3 >/proc/sys/vm/drop_caches + + echo "Reading all files from a cgroup" + mkdir /sys/fs/cgroup/unified/mycg1 || exit + echo $$ >/sys/fs/cgroup/unified/mycg1/cgroup.procs || exit + for (( j = 0; j < 50; j ++ )); do + cat /mnt/dir$j/file* >/dev/null & + done + wait + echo "Switching wbs" + # Now rmdir the cgroup after the script exits + +We need to maintain b_dirty list ordering to keep writeback happy so +instead of sorting inode into appropriate place just append it at the +end of the list and clobber dirtied_time_when. This may result in inode +writeback starting later after cgroup switch however cgroup switches are +rare so it shouldn't matter much. Since the cgroup had write access to +the inode, there are no practical concerns of the possible DoS issues. + +Acked-by: Tejun Heo +Signed-off-by: Jan Kara +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + fs/fs-writeback.c | 21 +++++++++++---------- + 1 file changed, 11 insertions(+), 10 deletions(-) + +diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c +index eff778dc0386c..28edfad85c628 100644 +--- a/fs/fs-writeback.c ++++ b/fs/fs-writeback.c +@@ -446,22 +446,23 @@ static bool inode_do_switch_wbs(struct inode *inode, + * Transfer to @new_wb's IO list if necessary. If the @inode is dirty, + * the specific list @inode was on is ignored and the @inode is put on + * ->b_dirty which is always correct including from ->b_dirty_time. +- * The transfer preserves @inode->dirtied_when ordering. If the @inode +- * was clean, it means it was on the b_attached list, so move it onto +- * the b_attached list of @new_wb. 
++ * If the @inode was clean, it means it was on the b_attached list, so ++ * move it onto the b_attached list of @new_wb. + */ + if (!list_empty(&inode->i_io_list)) { + inode->i_wb = new_wb; + + if (inode->i_state & I_DIRTY_ALL) { +- struct inode *pos; +- +- list_for_each_entry(pos, &new_wb->b_dirty, i_io_list) +- if (time_after_eq(inode->dirtied_when, +- pos->dirtied_when)) +- break; ++ /* ++ * We need to keep b_dirty list sorted by ++ * dirtied_time_when. However properly sorting the ++ * inode in the list gets too expensive when switching ++ * many inodes. So just attach inode at the end of the ++ * dirty list and clobber the dirtied_time_when. ++ */ ++ inode->dirtied_time_when = jiffies; + inode_io_list_move_locked(inode, new_wb, +- pos->i_io_list.prev); ++ &new_wb->b_dirty); + } else { + inode_cgwb_move_to_attached(inode, new_wb); + } +-- +2.51.0 + diff --git a/queue-6.12/writeback-avoid-softlockup-when-switching-many-inode.patch b/queue-6.12/writeback-avoid-softlockup-when-switching-many-inode.patch new file mode 100644 index 0000000000..77c42e8efe --- /dev/null +++ b/queue-6.12/writeback-avoid-softlockup-when-switching-many-inode.patch @@ -0,0 +1,65 @@ +From b730971ba2912b8af3125a63069f27aa7da3826c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 12 Sep 2025 12:38:36 +0200 +Subject: writeback: Avoid softlockup when switching many inodes + +From: Jan Kara + +[ Upstream commit 66c14dccd810d42ec5c73bb8a9177489dfd62278 ] + +process_inode_switch_wbs_work() can be switching over 100 inodes to a +different cgroup. Since switching an inode requires counting all dirty & +under-writeback pages in the address space of each inode, this can take +a significant amount of time. Add a possibility to reschedule after +processing each inode to avoid softlockups. 
+ +Acked-by: Tejun Heo +Signed-off-by: Jan Kara +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + fs/fs-writeback.c | 11 ++++++++++- + 1 file changed, 10 insertions(+), 1 deletion(-) + +diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c +index 4ae226778d646..eff778dc0386c 100644 +--- a/fs/fs-writeback.c ++++ b/fs/fs-writeback.c +@@ -503,6 +503,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) + */ + down_read(&bdi->wb_switch_rwsem); + ++ inodep = isw->inodes; + /* + * By the time control reaches here, RCU grace period has passed + * since I_WB_SWITCH assertion and all wb stat update transactions +@@ -513,6 +514,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) + * gives us exclusion against all wb related operations on @inode + * including IO list manipulations and stat updates. + */ ++relock: + if (old_wb < new_wb) { + spin_lock(&old_wb->list_lock); + spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING); +@@ -521,10 +523,17 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) + spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING); + } + +- for (inodep = isw->inodes; *inodep; inodep++) { ++ while (*inodep) { + WARN_ON_ONCE((*inodep)->i_wb != old_wb); + if (inode_do_switch_wbs(*inodep, old_wb, new_wb)) + nr_switched++; ++ inodep++; ++ if (*inodep && need_resched()) { ++ spin_unlock(&new_wb->list_lock); ++ spin_unlock(&old_wb->list_lock); ++ cond_resched(); ++ goto relock; ++ } + } + + spin_unlock(&new_wb->list_lock); +-- +2.51.0 + diff --git a/queue-6.17/copy_file_range-limit-size-if-in-compat-mode.patch b/queue-6.17/copy_file_range-limit-size-if-in-compat-mode.patch new file mode 100644 index 0000000000..09e4e4c40c --- /dev/null +++ b/queue-6.17/copy_file_range-limit-size-if-in-compat-mode.patch @@ -0,0 +1,66 @@ +From 239d59f3832f0839ae7db629aaa876403b2deb28 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 13 Aug 2025 17:11:05 +0200 +Subject: copy_file_range: limit size if in 
compat mode + +From: Miklos Szeredi + +[ Upstream commit f8f59a2c05dc16d19432e3154a9ac7bc385f4b92 ] + +If the process runs in 32-bit compat mode, copy_file_range results can be +in the in-band error range. In this case limit copy length to MAX_RW_COUNT +to prevent a signed overflow. + +Reported-by: Florian Weimer +Closes: https://lore.kernel.org/all/lhuh5ynl8z5.fsf@oldenburg.str.redhat.com/ +Signed-off-by: Miklos Szeredi +Link: https://lore.kernel.org/20250813151107.99856-1-mszeredi@redhat.com +Reviewed-by: Amir Goldstein +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + fs/read_write.c | 14 +++++++++----- + 1 file changed, 9 insertions(+), 5 deletions(-) + +diff --git a/fs/read_write.c b/fs/read_write.c +index c5b6265d984ba..833bae068770a 100644 +--- a/fs/read_write.c ++++ b/fs/read_write.c +@@ -1576,6 +1576,13 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, + if (len == 0) + return 0; + ++ /* ++ * Make sure return value doesn't overflow in 32bit compat mode. Also ++ * limit the size for all cases except when calling ->copy_file_range(). ++ */ ++ if (splice || !file_out->f_op->copy_file_range || in_compat_syscall()) ++ len = min_t(size_t, MAX_RW_COUNT, len); ++ + file_start_write(file_out); + + /* +@@ -1589,9 +1596,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, + len, flags); + } else if (!splice && file_in->f_op->remap_file_range && samesb) { + ret = file_in->f_op->remap_file_range(file_in, pos_in, +- file_out, pos_out, +- min_t(loff_t, MAX_RW_COUNT, len), +- REMAP_FILE_CAN_SHORTEN); ++ file_out, pos_out, len, REMAP_FILE_CAN_SHORTEN); + /* fallback to splice */ + if (ret <= 0) + splice = true; +@@ -1624,8 +1629,7 @@ ssize_t vfs_copy_file_range(struct file *file_in, loff_t pos_in, + * to splicing from input file, while file_start_write() is held on + * the output file on a different sb. 
+ */ +- ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, +- min_t(size_t, len, MAX_RW_COUNT), 0); ++ ret = do_splice_direct(file_in, &pos_in, file_out, &pos_out, len, 0); + done: + if (ret > 0) { + fsnotify_access(file_in); +-- +2.51.0 + diff --git a/queue-6.17/cramfs-verify-inode-mode-when-loading-from-disk.patch b/queue-6.17/cramfs-verify-inode-mode-when-loading-from-disk.patch new file mode 100644 index 0000000000..6a86650036 --- /dev/null +++ b/queue-6.17/cramfs-verify-inode-mode-when-loading-from-disk.patch @@ -0,0 +1,51 @@ +From aed3cc20fd7d92905a3bab5081abb6a58ddb7371 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 30 Aug 2025 19:01:01 +0900 +Subject: cramfs: Verify inode mode when loading from disk + +From: Tetsuo Handa + +[ Upstream commit 7f9d34b0a7cb93d678ee7207f0634dbf79e47fe5 ] + +The inode mode loaded from corrupted disk can be invalid. Do like what +commit 0a9e74051313 ("isofs: Verify inode mode when loading from disk") +does. + +Reported-by: syzbot +Closes: https://syzkaller.appspot.com/bug?extid=895c23f6917da440ed0d +Signed-off-by: Tetsuo Handa +Link: https://lore.kernel.org/429b3ef1-13de-4310-9a8e-c2dc9a36234a@I-love.SAKURA.ne.jp +Acked-by: Nicolas Pitre +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + fs/cramfs/inode.c | 11 ++++++++++- + 1 file changed, 10 insertions(+), 1 deletion(-) + +diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c +index 56c8005b24a34..ca54bf24b719f 100644 +--- a/fs/cramfs/inode.c ++++ b/fs/cramfs/inode.c +@@ -116,9 +116,18 @@ static struct inode *get_cramfs_inode(struct super_block *sb, + inode_nohighmem(inode); + inode->i_data.a_ops = &cramfs_aops; + break; +- default: ++ case S_IFCHR: ++ case S_IFBLK: ++ case S_IFIFO: ++ case S_IFSOCK: + init_special_inode(inode, cramfs_inode->mode, + old_decode_dev(cramfs_inode->size)); ++ break; ++ default: ++ printk(KERN_DEBUG "CRAMFS: Invalid file type 0%04o for inode %lu.\n", ++ inode->i_mode, inode->i_ino); ++ iget_failed(inode); ++ return 
ERR_PTR(-EIO); + } + + inode->i_mode = cramfs_inode->mode; +-- +2.51.0 + diff --git a/queue-6.17/fs-add-initramfs_options-to-set-initramfs-mount-opti.patch b/queue-6.17/fs-add-initramfs_options-to-set-initramfs-mount-opti.patch new file mode 100644 index 0000000000..8830757c72 --- /dev/null +++ b/queue-6.17/fs-add-initramfs_options-to-set-initramfs-mount-opti.patch @@ -0,0 +1,116 @@ +From caf84963cbd30f5b57f96f509db7fb5a213b7cdc Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 15 Aug 2025 20:14:59 +0800 +Subject: fs: Add 'initramfs_options' to set initramfs mount options +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Lichen Liu + +[ Upstream commit 278033a225e13ec21900f0a92b8351658f5377f2 ] + +When CONFIG_TMPFS is enabled, the initial root filesystem is a tmpfs. +By default, a tmpfs mount is limited to using 50% of the available RAM +for its content. This can be problematic in memory-constrained +environments, particularly during a kdump capture. + +In a kdump scenario, the capture kernel boots with a limited amount of +memory specified by the 'crashkernel' parameter. If the initramfs is +large, it may fail to unpack into the tmpfs rootfs due to insufficient +space. This is because to get X MB of usable space in tmpfs, 2*X MB of +memory must be available for the mount. This leads to an OOM failure +during the early boot process, preventing a successful crash dump. + +This patch introduces a new kernel command-line parameter, +initramfs_options, which allows passing specific mount options directly +to the rootfs when it is first mounted. This gives users control over +the rootfs behavior. + +For example, a user can now specify initramfs_options=size=75% to allow +the tmpfs to use up to 75% of the available memory. This can +significantly reduce the memory pressure for kdump. + +Consider a practical example: + +To unpack a 48MB initramfs, the tmpfs needs 48MB of usable space. 
With +the default 50% limit, this requires a memory pool of 96MB to be +available for the tmpfs mount. The total memory requirement is therefore +approximately: 16MB (vmlinuz) + 48MB (loaded initramfs) + 48MB (unpacked +kernel) + 96MB (for tmpfs) + 12MB (runtime overhead) ≈ 220MB. + +By using initramfs_options=size=75%, the memory pool required for the +48MB tmpfs is reduced to 48MB / 0.75 = 64MB. This reduces the total +memory requirement by 32MB (96MB - 64MB), allowing the kdump to succeed +with a smaller crashkernel size, such as 192MB. + +An alternative approach of reusing the existing rootflags parameter was +considered. However, a new, dedicated initramfs_options parameter was +chosen to avoid altering the current behavior of rootflags (which +applies to the final root filesystem) and to prevent any potential +regressions. + +Also add documentation for the new kernel parameter "initramfs_options" + +This approach is inspired by prior discussions and patches on the topic. +Ref: https://www.lightofdawn.org/blog/?viewDetailed=00128 +Ref: https://landley.net/notes-2015.html#01-01-2015 +Ref: https://lkml.org/lkml/2021/6/29/783 +Ref: https://www.kernel.org/doc/html/latest/filesystems/ramfs-rootfs-initramfs.html#what-is-rootfs + +Signed-off-by: Lichen Liu +Link: https://lore.kernel.org/20250815121459.3391223-1-lichliu@redhat.com +Tested-by: Rob Landley +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + Documentation/admin-guide/kernel-parameters.txt | 3 +++ + fs/namespace.c | 11 ++++++++++- + 2 files changed, 13 insertions(+), 1 deletion(-) + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index 5a7a83c411e9c..e92c0056e4e0a 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -6429,6 +6429,9 @@ + + rootflags= [KNL] Set root filesystem mount option string + ++ initramfs_options= [KNL] ++ Specify mount options for for 
the initramfs mount. ++ + rootfstype= [KNL] Set root filesystem type + + rootwait [KNL] Wait (indefinitely) for root device to show up. +diff --git a/fs/namespace.c b/fs/namespace.c +index 46e654247274f..38609066cf330 100644 +--- a/fs/namespace.c ++++ b/fs/namespace.c +@@ -65,6 +65,15 @@ static int __init set_mphash_entries(char *str) + } + __setup("mphash_entries=", set_mphash_entries); + ++static char * __initdata initramfs_options; ++static int __init initramfs_options_setup(char *str) ++{ ++ initramfs_options = str; ++ return 1; ++} ++ ++__setup("initramfs_options=", initramfs_options_setup); ++ + static u64 event; + static DEFINE_XARRAY_FLAGS(mnt_id_xa, XA_FLAGS_ALLOC); + static DEFINE_IDA(mnt_group_ida); +@@ -6127,7 +6136,7 @@ static void __init init_mount_tree(void) + struct mnt_namespace *ns; + struct path root; + +- mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL); ++ mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", initramfs_options); + if (IS_ERR(mnt)) + panic("Can't create rootfs"); + +-- +2.51.0 + diff --git a/queue-6.17/iomap-error-out-on-file-io-when-there-is-no-inline_d.patch b/queue-6.17/iomap-error-out-on-file-io-when-there-is-no-inline_d.patch new file mode 100644 index 0000000000..5cbb422d93 --- /dev/null +++ b/queue-6.17/iomap-error-out-on-file-io-when-there-is-no-inline_d.patch @@ -0,0 +1,95 @@ +From f72becab157a099604e0cbcb6fcb3b8283f2c872 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 16 Sep 2025 08:00:45 -0700 +Subject: iomap: error out on file IO when there is no inline_data buffer + +From: Darrick J. Wong + +[ Upstream commit 6a96fb653b6481ec73e9627ade216b299e4de9ea ] + +Return IO errors if an ->iomap_begin implementation returns an +IOMAP_INLINE buffer but forgets to set the inline_data pointer. +Filesystems should never do this, but we could help fs developers (me) +fix their bugs by handling this more gracefully than crashing the +kernel. + +Signed-off-by: Darrick J. 
Wong +Link: https://lore.kernel.org/175803480324.966383.7414345025943296442.stgit@frogsfrogsfrogs +Reviewed-by: Christoph Hellwig +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + fs/iomap/buffered-io.c | 15 ++++++++++----- + fs/iomap/direct-io.c | 3 +++ + 2 files changed, 13 insertions(+), 5 deletions(-) + +diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c +index fd827398afd2f..6fa653d83f703 100644 +--- a/fs/iomap/buffered-io.c ++++ b/fs/iomap/buffered-io.c +@@ -304,6 +304,9 @@ static int iomap_read_inline_data(const struct iomap_iter *iter, + size_t size = i_size_read(iter->inode) - iomap->offset; + size_t offset = offset_in_folio(folio, iomap->offset); + ++ if (WARN_ON_ONCE(!iomap->inline_data)) ++ return -EIO; ++ + if (folio_test_uptodate(folio)) + return 0; + +@@ -894,7 +897,7 @@ static bool __iomap_write_end(struct inode *inode, loff_t pos, size_t len, + return true; + } + +-static void iomap_write_end_inline(const struct iomap_iter *iter, ++static bool iomap_write_end_inline(const struct iomap_iter *iter, + struct folio *folio, loff_t pos, size_t copied) + { + const struct iomap *iomap = &iter->iomap; +@@ -903,12 +906,16 @@ static void iomap_write_end_inline(const struct iomap_iter *iter, + WARN_ON_ONCE(!folio_test_uptodate(folio)); + BUG_ON(!iomap_inline_data_valid(iomap)); + ++ if (WARN_ON_ONCE(!iomap->inline_data)) ++ return false; ++ + flush_dcache_folio(folio); + addr = kmap_local_folio(folio, pos); + memcpy(iomap_inline_data(iomap, pos), addr, copied); + kunmap_local(addr); + + mark_inode_dirty(iter->inode); ++ return true; + } + + /* +@@ -921,10 +928,8 @@ static bool iomap_write_end(struct iomap_iter *iter, size_t len, size_t copied, + const struct iomap *srcmap = iomap_iter_srcmap(iter); + loff_t pos = iter->pos; + +- if (srcmap->type == IOMAP_INLINE) { +- iomap_write_end_inline(iter, folio, pos, copied); +- return true; +- } ++ if (srcmap->type == IOMAP_INLINE) ++ return iomap_write_end_inline(iter, folio, pos, copied); 
+ + if (srcmap->flags & IOMAP_F_BUFFER_HEAD) { + size_t bh_written; +diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c +index b84f6af2eb4c8..46aa85af13dc5 100644 +--- a/fs/iomap/direct-io.c ++++ b/fs/iomap/direct-io.c +@@ -519,6 +519,9 @@ static int iomap_dio_inline_iter(struct iomap_iter *iomi, struct iomap_dio *dio) + loff_t pos = iomi->pos; + u64 copied; + ++ if (WARN_ON_ONCE(!inline_data)) ++ return -EIO; ++ + if (WARN_ON_ONCE(!iomap_inline_data_valid(iomap))) + return -EIO; + +-- +2.51.0 + diff --git a/queue-6.17/irqchip-sifive-plic-avoid-interrupt-id-0-handling-du.patch b/queue-6.17/irqchip-sifive-plic-avoid-interrupt-id-0-handling-du.patch new file mode 100644 index 0000000000..3361ef2431 --- /dev/null +++ b/queue-6.17/irqchip-sifive-plic-avoid-interrupt-id-0-handling-du.patch @@ -0,0 +1,64 @@ +From aab342170636f6a2ce9aa86e3821cb205d32bc2f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 23 Sep 2025 15:43:19 +0100 +Subject: irqchip/sifive-plic: Avoid interrupt ID 0 handling during + suspend/resume + +From: Lucas Zampieri + +[ Upstream commit f75e07bf5226da640fa99a0594687c780d9bace4 ] + +According to the PLIC specification[1], global interrupt sources are +assigned small unsigned integer identifiers beginning at the value 1. +An interrupt ID of 0 is reserved to mean "no interrupt". + +The current plic_irq_resume() and plic_irq_suspend() functions incorrectly +start the loop from index 0, which accesses the register space for the +reserved interrupt ID 0. + +Change the loop to start from index 1, skipping the reserved +interrupt ID 0 as per the PLIC specification. + +This prevents potential undefined behavior when accessing the reserved +register space during suspend/resume cycles. 
+ +Fixes: e80f0b6a2cf3 ("irqchip/irq-sifive-plic: Add syscore callbacks for hibernation") +Co-developed-by: Jia Wang +Signed-off-by: Jia Wang +Co-developed-by: Charles Mirabile +Signed-off-by: Charles Mirabile +Signed-off-by: Lucas Zampieri +Signed-off-by: Thomas Gleixner +Link: https://github.com/riscv/riscv-plic-spec/releases/tag/1.0.0 +Signed-off-by: Sasha Levin +--- + drivers/irqchip/irq-sifive-plic.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/drivers/irqchip/irq-sifive-plic.c b/drivers/irqchip/irq-sifive-plic.c +index bf69a4802b71e..9c4af7d588463 100644 +--- a/drivers/irqchip/irq-sifive-plic.c ++++ b/drivers/irqchip/irq-sifive-plic.c +@@ -252,7 +252,8 @@ static int plic_irq_suspend(void) + + priv = per_cpu_ptr(&plic_handlers, smp_processor_id())->priv; + +- for (i = 0; i < priv->nr_irqs; i++) { ++ /* irq ID 0 is reserved */ ++ for (i = 1; i < priv->nr_irqs; i++) { + __assign_bit(i, priv->prio_save, + readl(priv->regs + PRIORITY_BASE + i * PRIORITY_PER_ID)); + } +@@ -283,7 +284,8 @@ static void plic_irq_resume(void) + + priv = per_cpu_ptr(&plic_handlers, smp_processor_id())->priv; + +- for (i = 0; i < priv->nr_irqs; i++) { ++ /* irq ID 0 is reserved */ ++ for (i = 1; i < priv->nr_irqs; i++) { + index = BIT_WORD(i); + writel((priv->prio_save[index] & BIT_MASK(i)) ? 
1 : 0, + priv->regs + PRIORITY_BASE + i * PRIORITY_PER_ID); +-- +2.51.0 + diff --git a/queue-6.17/minixfs-verify-inode-mode-when-loading-from-disk.patch b/queue-6.17/minixfs-verify-inode-mode-when-loading-from-disk.patch new file mode 100644 index 0000000000..fc4cf3e77a --- /dev/null +++ b/queue-6.17/minixfs-verify-inode-mode-when-loading-from-disk.patch @@ -0,0 +1,46 @@ +From 437eff349e57c5cfea1494e285a2389f6a18f651 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 13 Aug 2025 00:17:44 +0900 +Subject: minixfs: Verify inode mode when loading from disk + +From: Tetsuo Handa + +[ Upstream commit 73861970938ad1323eb02bbbc87f6fbd1e5bacca ] + +The inode mode loaded from corrupted disk can be invalid. Do like what +commit 0a9e74051313 ("isofs: Verify inode mode when loading from disk") +does. + +Reported-by: syzbot +Closes: https://syzkaller.appspot.com/bug?extid=895c23f6917da440ed0d +Signed-off-by: Tetsuo Handa +Link: https://lore.kernel.org/ec982681-84b8-4624-94fa-8af15b77cbd2@I-love.SAKURA.ne.jp +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + fs/minix/inode.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +diff --git a/fs/minix/inode.c b/fs/minix/inode.c +index df9d11479caf1..32db676127a9e 100644 +--- a/fs/minix/inode.c ++++ b/fs/minix/inode.c +@@ -492,8 +492,14 @@ void minix_set_inode(struct inode *inode, dev_t rdev) + inode->i_op = &minix_symlink_inode_operations; + inode_nohighmem(inode); + inode->i_mapping->a_ops = &minix_aops; +- } else ++ } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || ++ S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + init_special_inode(inode, inode->i_mode, rdev); ++ } else { ++ printk(KERN_DEBUG "MINIX-fs: Invalid file type 0%04o for inode %lu.\n", ++ inode->i_mode, inode->i_ino); ++ make_bad_inode(inode); ++ } + } + + /* +-- +2.51.0 + diff --git a/queue-6.17/mnt_ns_tree_remove-dtrt-if-mnt_ns-had-never-been-add.patch 
b/queue-6.17/mnt_ns_tree_remove-dtrt-if-mnt_ns-had-never-been-add.patch new file mode 100644 index 0000000000..a8def395d2 --- /dev/null +++ b/queue-6.17/mnt_ns_tree_remove-dtrt-if-mnt_ns-had-never-been-add.patch @@ -0,0 +1,40 @@ +From 534f26572b7d52ed25e7ce4a308b6b573a791d66 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 26 Aug 2025 16:35:55 -0400 +Subject: mnt_ns_tree_remove(): DTRT if mnt_ns had never been added to + mnt_ns_list + +From: Al Viro + +[ Upstream commit 38f4885088fc5ad41b8b0a2a2cfc73d01e709e5c ] + +Actual removal is done under the lock, but for checking if need to bother +the lockless RB_EMPTY_NODE() is safe - either that namespace had never +been added to mnt_ns_tree, in which case the node will stay empty, or +whoever had allocated it has called mnt_ns_tree_add() and it has already +run to completion. After that point RB_EMPTY_NODE() will become false and +will remain false, no matter what we do with other nodes in the tree. + +Reviewed-by: Christian Brauner +Signed-off-by: Al Viro +Signed-off-by: Sasha Levin +--- + fs/namespace.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/namespace.c b/fs/namespace.c +index 38609066cf330..fc4cbbefa70e2 100644 +--- a/fs/namespace.c ++++ b/fs/namespace.c +@@ -196,7 +196,7 @@ static void mnt_ns_release_rcu(struct rcu_head *rcu) + static void mnt_ns_tree_remove(struct mnt_namespace *ns) + { + /* remove from global mount namespace list */ +- if (!is_anon_ns(ns)) { ++ if (!RB_EMPTY_NODE(&ns->mnt_ns_tree_node)) { + mnt_ns_tree_write_lock(); + rb_erase(&ns->mnt_ns_tree_node, &mnt_ns_tree); + list_bidir_del_rcu(&ns->mnt_ns_list); +-- +2.51.0 + diff --git a/queue-6.17/mount-handle-null-values-in-mnt_ns_release.patch b/queue-6.17/mount-handle-null-values-in-mnt_ns_release.patch new file mode 100644 index 0000000000..53cf94e202 --- /dev/null +++ b/queue-6.17/mount-handle-null-values-in-mnt_ns_release.patch @@ -0,0 +1,35 @@ +From d9058cb0a4d40a4c557fc71b42b59eaa328f78aa Mon Sep 17
00:00:00 2001 +From: Sasha Levin +Date: Mon, 29 Sep 2025 11:41:16 +0200 +Subject: mount: handle NULL values in mnt_ns_release() + +From: Christian Brauner + +[ Upstream commit 6c7ca6a02f8f9549a438a08a23c6327580ecf3d6 ] + +When calling in listmount() mnt_ns_release() may be passed a NULL +pointer. Handle that case gracefully. + +Signed-off-by: Christian Brauner +Signed-off-by: Linus Torvalds +Signed-off-by: Sasha Levin +--- + fs/namespace.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/fs/namespace.c b/fs/namespace.c +index fc4cbbefa70e2..c8c2376bb2424 100644 +--- a/fs/namespace.c ++++ b/fs/namespace.c +@@ -180,7 +180,7 @@ static void mnt_ns_tree_add(struct mnt_namespace *ns) + static void mnt_ns_release(struct mnt_namespace *ns) + { + /* keep alive for {list,stat}mount() */ +- if (refcount_dec_and_test(&ns->passive)) { ++ if (ns && refcount_dec_and_test(&ns->passive)) { + fsnotify_mntns_delete(ns); + put_user_ns(ns->user_ns); + kfree(ns); +-- +2.51.0 + diff --git a/queue-6.17/nsfs-validate-extensible-ioctls.patch b/queue-6.17/nsfs-validate-extensible-ioctls.patch new file mode 100644 index 0000000000..001099df16 --- /dev/null +++ b/queue-6.17/nsfs-validate-extensible-ioctls.patch @@ -0,0 +1,38 @@ +From c95d11d21c5a2c1e5d66978735cec7e89395caa5 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 12 Sep 2025 13:52:26 +0200 +Subject: nsfs: validate extensible ioctls + +From: Christian Brauner + +[ Upstream commit f8527a29f4619f74bc30a9845ea87abb9a6faa1e ] + +Validate extensible ioctls stricter than we do now. + +Reviewed-by: Jan Kara +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + fs/nsfs.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/fs/nsfs.c b/fs/nsfs.c +index 59aa801347a7d..34f0b35d3ead7 100644 +--- a/fs/nsfs.c ++++ b/fs/nsfs.c +@@ -169,9 +169,11 @@ static bool nsfs_ioctl_valid(unsigned int cmd) + /* Extensible ioctls require some extra handling. 
*/ + switch (_IOC_NR(cmd)) { + case _IOC_NR(NS_MNT_GET_INFO): ++ return extensible_ioctl_valid(cmd, NS_MNT_GET_INFO, MNT_NS_INFO_SIZE_VER0); + case _IOC_NR(NS_MNT_GET_NEXT): ++ return extensible_ioctl_valid(cmd, NS_MNT_GET_NEXT, MNT_NS_INFO_SIZE_VER0); + case _IOC_NR(NS_MNT_GET_PREV): +- return (_IOC_TYPE(cmd) == _IOC_TYPE(cmd)); ++ return extensible_ioctl_valid(cmd, NS_MNT_GET_PREV, MNT_NS_INFO_SIZE_VER0); + } + + return false; +-- +2.51.0 + diff --git a/queue-6.17/pid-add-a-judgment-for-ns-null-in-pid_nr_ns.patch b/queue-6.17/pid-add-a-judgment-for-ns-null-in-pid_nr_ns.patch new file mode 100644 index 0000000000..881d3664ad --- /dev/null +++ b/queue-6.17/pid-add-a-judgment-for-ns-null-in-pid_nr_ns.patch @@ -0,0 +1,95 @@ +From 611e274ae9760c9a32ab1e615ce9c761690b92d7 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 2 Aug 2025 10:21:23 +0800 +Subject: pid: Add a judgment for ns null in pid_nr_ns + +From: gaoxiang17 + +[ Upstream commit 006568ab4c5ca2309ceb36fa553e390b4aa9c0c7 ] + +__task_pid_nr_ns + ns = task_active_pid_ns(current); + pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); + if (pid && ns->level <= pid->level) { + +Sometimes null is returned for task_active_pid_ns. Then it will trigger kernel panic in pid_nr_ns. 
+ +For example: + Unable to handle kernel NULL pointer dereference at virtual address 0000000000000058 + Mem abort info: + ESR = 0x0000000096000007 + EC = 0x25: DABT (current EL), IL = 32 bits + SET = 0, FnV = 0 + EA = 0, S1PTW = 0 + FSC = 0x07: level 3 translation fault + Data abort info: + ISV = 0, ISS = 0x00000007, ISS2 = 0x00000000 + CM = 0, WnR = 0, TnD = 0, TagAccess = 0 + GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 + user pgtable: 4k pages, 39-bit VAs, pgdp=00000002175aa000 + [0000000000000058] pgd=08000002175ab003, p4d=08000002175ab003, pud=08000002175ab003, pmd=08000002175be003, pte=0000000000000000 + pstate: 834000c5 (Nzcv daIF +PAN -UAO +TCO +DIT -SSBS BTYPE=--) + pc : __task_pid_nr_ns+0x74/0xd0 + lr : __task_pid_nr_ns+0x24/0xd0 + sp : ffffffc08001bd10 + x29: ffffffc08001bd10 x28: ffffffd4422b2000 x27: 0000000000000001 + x26: ffffffd442821168 x25: ffffffd442821000 x24: 00000f89492eab31 + x23: 00000000000000c0 x22: ffffff806f5693c0 x21: ffffff806f5693c0 + x20: 0000000000000001 x19: 0000000000000000 x18: 0000000000000000 + x17: 00000000529c6ef0 x16: 00000000529c6ef0 x15: 00000000023a1adc + x14: 0000000000000003 x13: 00000000007ef6d8 x12: 001167c391c78800 + x11: 00ffffffffffffff x10: 0000000000000000 x9 : 0000000000000001 + x8 : ffffff80816fa3c0 x7 : 0000000000000000 x6 : 49534d702d535449 + x5 : ffffffc080c4c2c0 x4 : ffffffd43ee128c8 x3 : ffffffd43ee124dc + x2 : 0000000000000000 x1 : 0000000000000001 x0 : ffffff806f5693c0 + Call trace: + __task_pid_nr_ns+0x74/0xd0 + ... + __handle_irq_event_percpu+0xd4/0x284 + handle_irq_event+0x48/0xb0 + handle_fasteoi_irq+0x160/0x2d8 + generic_handle_domain_irq+0x44/0x60 + gic_handle_irq+0x4c/0x114 + call_on_irq_stack+0x3c/0x74 + do_interrupt_handler+0x4c/0x84 + el1_interrupt+0x34/0x58 + el1h_64_irq_handler+0x18/0x24 + el1h_64_irq+0x68/0x6c + account_kernel_stack+0x60/0x144 + exit_task_stack_account+0x1c/0x80 + do_exit+0x7e4/0xaf8 + ... 
+ get_signal+0x7bc/0x8d8 + do_notify_resume+0x128/0x828 + el0_svc+0x6c/0x70 + el0t_64_sync_handler+0x68/0xbc + el0t_64_sync+0x1a8/0x1ac + Code: 35fffe54 911a02a8 f9400108 b4000128 (b9405a69) + ---[ end trace 0000000000000000 ]--- + Kernel panic - not syncing: Oops: Fatal exception in interrupt + +Signed-off-by: gaoxiang17 +Link: https://lore.kernel.org/20250802022123.3536934-1-gxxa03070307@gmail.com +Reviewed-by: Baoquan He +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + kernel/pid.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/pid.c b/kernel/pid.c +index d94ce02505012..296cd04c24bae 100644 +--- a/kernel/pid.c ++++ b/kernel/pid.c +@@ -491,7 +491,7 @@ pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) + struct upid *upid; + pid_t nr = 0; + +- if (pid && ns->level <= pid->level) { ++ if (pid && ns && ns->level <= pid->level) { + upid = &pid->numbers[ns->level]; + if (upid->ns == ns) + nr = upid->nr; +-- +2.51.0 + diff --git a/queue-6.17/pid-make-__task_pid_nr_ns-ns-null-safe-for-zombie-ca.patch b/queue-6.17/pid-make-__task_pid_nr_ns-ns-null-safe-for-zombie-ca.patch new file mode 100644 index 0000000000..8b359d7c0c --- /dev/null +++ b/queue-6.17/pid-make-__task_pid_nr_ns-ns-null-safe-for-zombie-ca.patch @@ -0,0 +1,48 @@ +From 00e585e46d9e321e24f568a4dc53de36a3dd0735 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 10 Aug 2025 19:36:04 +0200 +Subject: pid: make __task_pid_nr_ns(ns => NULL) safe for zombie callers +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Oleg Nesterov + +[ Upstream commit abdfd4948e45c51b19162cf8b3f5003f8f53c9b9 ] + +task_pid_vnr(another_task) will crash if the caller was already reaped. +The pid_alive(current) check can't really help, the parent/debugger can +call release_task() right after this check. 
+ +This also means that even task_ppid_nr_ns(current, NULL) is not safe, +pid_alive() only ensures that it is safe to dereference ->real_parent. + +Change __task_pid_nr_ns() to ensure ns != NULL. + +Originally-by: 高翔 +Link: https://lore.kernel.org/all/20250802022123.3536934-1-gxxa03070307@gmail.com/ +Signed-off-by: Oleg Nesterov +Link: https://lore.kernel.org/20250810173604.GA19991@redhat.com +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + kernel/pid.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/kernel/pid.c b/kernel/pid.c +index 296cd04c24bae..2dbcc4dd90cc0 100644 +--- a/kernel/pid.c ++++ b/kernel/pid.c +@@ -514,7 +514,8 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, + rcu_read_lock(); + if (!ns) + ns = task_active_pid_ns(current); +- nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); ++ if (ns) ++ nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); + rcu_read_unlock(); + + return nr; +-- +2.51.0 + diff --git a/queue-6.17/pidfs-validate-extensible-ioctls.patch b/queue-6.17/pidfs-validate-extensible-ioctls.patch new file mode 100644 index 0000000000..c9a6e4c575 --- /dev/null +++ b/queue-6.17/pidfs-validate-extensible-ioctls.patch @@ -0,0 +1,59 @@ +From 9e028a4439f7572bba16a80d460649e02433ce17 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 12 Sep 2025 13:52:24 +0200 +Subject: pidfs: validate extensible ioctls + +From: Christian Brauner + +[ Upstream commit 3c17001b21b9f168c957ced9384abe969019b609 ] + +Validate extensible ioctls stricter than we do now. 
+ +Reviewed-by: Aleksa Sarai +Reviewed-by: Jan Kara +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + fs/pidfs.c | 2 +- + include/linux/fs.h | 14 ++++++++++++++ + 2 files changed, 15 insertions(+), 1 deletion(-) + +diff --git a/fs/pidfs.c b/fs/pidfs.c +index 108e7527f837f..2c9c7636253af 100644 +--- a/fs/pidfs.c ++++ b/fs/pidfs.c +@@ -440,7 +440,7 @@ static bool pidfs_ioctl_valid(unsigned int cmd) + * erronously mistook the file descriptor for a pidfd. + * This is not perfect but will catch most cases. + */ +- return (_IOC_TYPE(cmd) == _IOC_TYPE(PIDFD_GET_INFO)); ++ return extensible_ioctl_valid(cmd, PIDFD_GET_INFO, PIDFD_INFO_SIZE_VER0); + } + + return false; +diff --git a/include/linux/fs.h b/include/linux/fs.h +index 74f2bfc519263..ed02715261036 100644 +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -4025,4 +4025,18 @@ static inline bool vfs_empty_path(int dfd, const char __user *path) + + int generic_atomic_write_valid(struct kiocb *iocb, struct iov_iter *iter); + ++static inline bool extensible_ioctl_valid(unsigned int cmd_a, ++ unsigned int cmd_b, size_t min_size) ++{ ++ if (_IOC_DIR(cmd_a) != _IOC_DIR(cmd_b)) ++ return false; ++ if (_IOC_TYPE(cmd_a) != _IOC_TYPE(cmd_b)) ++ return false; ++ if (_IOC_NR(cmd_a) != _IOC_NR(cmd_b)) ++ return false; ++ if (_IOC_SIZE(cmd_a) < min_size) ++ return false; ++ return true; ++} ++ + #endif /* _LINUX_FS_H */ +-- +2.51.0 + diff --git a/queue-6.17/series b/queue-6.17/series index 1de0ca9ed3..66963d4e26 100644 --- a/queue-6.17/series +++ b/queue-6.17/series @@ -355,3 +355,17 @@ arm64-dts-qcom-qcs615-add-missing-dt-property-in-qup-ses.patch acpi-property-disregard-references-in-data-only-subnode-lists.patch acpi-property-add-code-comments-explaining-what-is-going-on.patch acpi-property-do-not-pass-null-handles-to-acpi_attach_data.patch +irqchip-sifive-plic-avoid-interrupt-id-0-handling-du.patch +copy_file_range-limit-size-if-in-compat-mode.patch +minixfs-verify-inode-mode-when-loading-from-disk.patch 
+pid-add-a-judgment-for-ns-null-in-pid_nr_ns.patch +pid-make-__task_pid_nr_ns-ns-null-safe-for-zombie-ca.patch +fs-add-initramfs_options-to-set-initramfs-mount-opti.patch +cramfs-verify-inode-mode-when-loading-from-disk.patch +nsfs-validate-extensible-ioctls.patch +mnt_ns_tree_remove-dtrt-if-mnt_ns-had-never-been-add.patch +writeback-avoid-softlockup-when-switching-many-inode.patch +writeback-avoid-excessively-long-inode-switching-tim.patch +iomap-error-out-on-file-io-when-there-is-no-inline_d.patch +pidfs-validate-extensible-ioctls.patch +mount-handle-null-values-in-mnt_ns_release.patch diff --git a/queue-6.17/writeback-avoid-excessively-long-inode-switching-tim.patch b/queue-6.17/writeback-avoid-excessively-long-inode-switching-tim.patch new file mode 100644 index 0000000000..9009dfb81a --- /dev/null +++ b/queue-6.17/writeback-avoid-excessively-long-inode-switching-tim.patch @@ -0,0 +1,102 @@ +From 020fc909cdeb18a6098135b891ea7a85fd31e3c0 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 12 Sep 2025 12:38:37 +0200 +Subject: writeback: Avoid excessively long inode switching times + +From: Jan Kara + +[ Upstream commit 9a6ebbdbd41235ea3bc0c4f39e2076599b8113cc ] + +With lazytime mount option enabled we can be switching many dirty inodes +on cgroup exit to the parent cgroup. The numbers observed in practice +when systemd slice of a large cron job exits can easily reach hundreds +of thousands or millions. The logic in inode_do_switch_wbs() which sorts +the inode into appropriate place in b_dirty list of the target wb +however has linear complexity in the number of dirty inodes thus overall +time complexity of switching all the inodes is quadratic leading to +workers being pegged for hours consuming 100% of the CPU and switching +inodes to the parent wb. 
+ +Simple reproducer of the issue: + FILES=10000 + # Filesystem mounted with lazytime mount option + MNT=/mnt/ + echo "Creating files and switching timestamps" + for (( j = 0; j < 50; j ++ )); do + mkdir $MNT/dir$j + for (( i = 0; i < $FILES; i++ )); do + echo "foo" >$MNT/dir$j/file$i + done + touch -a -t 202501010000 $MNT/dir$j/file* + done + wait + echo "Syncing and flushing" + sync + echo 3 >/proc/sys/vm/drop_caches + + echo "Reading all files from a cgroup" + mkdir /sys/fs/cgroup/unified/mycg1 || exit + echo $$ >/sys/fs/cgroup/unified/mycg1/cgroup.procs || exit + for (( j = 0; j < 50; j ++ )); do + cat /mnt/dir$j/file* >/dev/null & + done + wait + echo "Switching wbs" + # Now rmdir the cgroup after the script exits + +We need to maintain b_dirty list ordering to keep writeback happy so +instead of sorting inode into appropriate place just append it at the +end of the list and clobber dirtied_time_when. This may result in inode +writeback starting later after cgroup switch however cgroup switches are +rare so it shouldn't matter much. Since the cgroup had write access to +the inode, there are no practical concerns of the possible DoS issues. + +Acked-by: Tejun Heo +Signed-off-by: Jan Kara +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + fs/fs-writeback.c | 21 +++++++++++---------- + 1 file changed, 11 insertions(+), 10 deletions(-) + +diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c +index b4aa78da7d94e..3bfc430ef74dc 100644 +--- a/fs/fs-writeback.c ++++ b/fs/fs-writeback.c +@@ -445,22 +445,23 @@ static bool inode_do_switch_wbs(struct inode *inode, + * Transfer to @new_wb's IO list if necessary. If the @inode is dirty, + * the specific list @inode was on is ignored and the @inode is put on + * ->b_dirty which is always correct including from ->b_dirty_time. +- * The transfer preserves @inode->dirtied_when ordering. If the @inode +- * was clean, it means it was on the b_attached list, so move it onto +- * the b_attached list of @new_wb. 
++ * If the @inode was clean, it means it was on the b_attached list, so ++ * move it onto the b_attached list of @new_wb. + */ + if (!list_empty(&inode->i_io_list)) { + inode->i_wb = new_wb; + + if (inode->i_state & I_DIRTY_ALL) { +- struct inode *pos; +- +- list_for_each_entry(pos, &new_wb->b_dirty, i_io_list) +- if (time_after_eq(inode->dirtied_when, +- pos->dirtied_when)) +- break; ++ /* ++ * We need to keep b_dirty list sorted by ++ * dirtied_time_when. However properly sorting the ++ * inode in the list gets too expensive when switching ++ * many inodes. So just attach inode at the end of the ++ * dirty list and clobber the dirtied_time_when. ++ */ ++ inode->dirtied_time_when = jiffies; + inode_io_list_move_locked(inode, new_wb, +- pos->i_io_list.prev); ++ &new_wb->b_dirty); + } else { + inode_cgwb_move_to_attached(inode, new_wb); + } +-- +2.51.0 + diff --git a/queue-6.17/writeback-avoid-softlockup-when-switching-many-inode.patch b/queue-6.17/writeback-avoid-softlockup-when-switching-many-inode.patch new file mode 100644 index 0000000000..07f180c838 --- /dev/null +++ b/queue-6.17/writeback-avoid-softlockup-when-switching-many-inode.patch @@ -0,0 +1,65 @@ +From 87fc87f4e4b1fe44ad237c159d2bebfb3e7ea99c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 12 Sep 2025 12:38:36 +0200 +Subject: writeback: Avoid softlockup when switching many inodes + +From: Jan Kara + +[ Upstream commit 66c14dccd810d42ec5c73bb8a9177489dfd62278 ] + +process_inode_switch_wbs_work() can be switching over 100 inodes to a +different cgroup. Since switching an inode requires counting all dirty & +under-writeback pages in the address space of each inode, this can take +a significant amount of time. Add a possibility to reschedule after +processing each inode to avoid softlockups. 
+ +Acked-by: Tejun Heo +Signed-off-by: Jan Kara +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + fs/fs-writeback.c | 11 ++++++++++- + 1 file changed, 10 insertions(+), 1 deletion(-) + +diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c +index a07b8cf73ae27..b4aa78da7d94e 100644 +--- a/fs/fs-writeback.c ++++ b/fs/fs-writeback.c +@@ -502,6 +502,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) + */ + down_read(&bdi->wb_switch_rwsem); + ++ inodep = isw->inodes; + /* + * By the time control reaches here, RCU grace period has passed + * since I_WB_SWITCH assertion and all wb stat update transactions +@@ -512,6 +513,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) + * gives us exclusion against all wb related operations on @inode + * including IO list manipulations and stat updates. + */ ++relock: + if (old_wb < new_wb) { + spin_lock(&old_wb->list_lock); + spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING); +@@ -520,10 +522,17 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) + spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING); + } + +- for (inodep = isw->inodes; *inodep; inodep++) { ++ while (*inodep) { + WARN_ON_ONCE((*inodep)->i_wb != old_wb); + if (inode_do_switch_wbs(*inodep, old_wb, new_wb)) + nr_switched++; ++ inodep++; ++ if (*inodep && need_resched()) { ++ spin_unlock(&new_wb->list_lock); ++ spin_unlock(&old_wb->list_lock); ++ cond_resched(); ++ goto relock; ++ } + } + + spin_unlock(&new_wb->list_lock); +-- +2.51.0 + diff --git a/queue-6.6/cramfs-verify-inode-mode-when-loading-from-disk.patch b/queue-6.6/cramfs-verify-inode-mode-when-loading-from-disk.patch new file mode 100644 index 0000000000..af2ec5794c --- /dev/null +++ b/queue-6.6/cramfs-verify-inode-mode-when-loading-from-disk.patch @@ -0,0 +1,51 @@ +From d6191824ac6e01b466929288aa2b099570f46283 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 30 Aug 2025 19:01:01 +0900 +Subject: cramfs: Verify inode mode when 
loading from disk + +From: Tetsuo Handa + +[ Upstream commit 7f9d34b0a7cb93d678ee7207f0634dbf79e47fe5 ] + +The inode mode loaded from corrupted disk can be invalid. Do like what +commit 0a9e74051313 ("isofs: Verify inode mode when loading from disk") +does. + +Reported-by: syzbot +Closes: https://syzkaller.appspot.com/bug?extid=895c23f6917da440ed0d +Signed-off-by: Tetsuo Handa +Link: https://lore.kernel.org/429b3ef1-13de-4310-9a8e-c2dc9a36234a@I-love.SAKURA.ne.jp +Acked-by: Nicolas Pitre +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + fs/cramfs/inode.c | 11 ++++++++++- + 1 file changed, 10 insertions(+), 1 deletion(-) + +diff --git a/fs/cramfs/inode.c b/fs/cramfs/inode.c +index 2fbf97077ce91..3f06362985b5a 100644 +--- a/fs/cramfs/inode.c ++++ b/fs/cramfs/inode.c +@@ -117,9 +117,18 @@ static struct inode *get_cramfs_inode(struct super_block *sb, + inode_nohighmem(inode); + inode->i_data.a_ops = &cramfs_aops; + break; +- default: ++ case S_IFCHR: ++ case S_IFBLK: ++ case S_IFIFO: ++ case S_IFSOCK: + init_special_inode(inode, cramfs_inode->mode, + old_decode_dev(cramfs_inode->size)); ++ break; ++ default: ++ printk(KERN_DEBUG "CRAMFS: Invalid file type 0%04o for inode %lu.\n", ++ inode->i_mode, inode->i_ino); ++ iget_failed(inode); ++ return ERR_PTR(-EIO); + } + + inode->i_mode = cramfs_inode->mode; +-- +2.51.0 + diff --git a/queue-6.6/fs-add-initramfs_options-to-set-initramfs-mount-opti.patch b/queue-6.6/fs-add-initramfs_options-to-set-initramfs-mount-opti.patch new file mode 100644 index 0000000000..d94c75920e --- /dev/null +++ b/queue-6.6/fs-add-initramfs_options-to-set-initramfs-mount-opti.patch @@ -0,0 +1,116 @@ +From 516dcf829e0b2836fc94885dec4c34f96bb89804 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 15 Aug 2025 20:14:59 +0800 +Subject: fs: Add 'initramfs_options' to set initramfs mount options +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Lichen Liu + +[ Upstream commit 
278033a225e13ec21900f0a92b8351658f5377f2 ] + +When CONFIG_TMPFS is enabled, the initial root filesystem is a tmpfs. +By default, a tmpfs mount is limited to using 50% of the available RAM +for its content. This can be problematic in memory-constrained +environments, particularly during a kdump capture. + +In a kdump scenario, the capture kernel boots with a limited amount of +memory specified by the 'crashkernel' parameter. If the initramfs is +large, it may fail to unpack into the tmpfs rootfs due to insufficient +space. This is because to get X MB of usable space in tmpfs, 2*X MB of +memory must be available for the mount. This leads to an OOM failure +during the early boot process, preventing a successful crash dump. + +This patch introduces a new kernel command-line parameter, +initramfs_options, which allows passing specific mount options directly +to the rootfs when it is first mounted. This gives users control over +the rootfs behavior. + +For example, a user can now specify initramfs_options=size=75% to allow +the tmpfs to use up to 75% of the available memory. This can +significantly reduce the memory pressure for kdump. + +Consider a practical example: + +To unpack a 48MB initramfs, the tmpfs needs 48MB of usable space. With +the default 50% limit, this requires a memory pool of 96MB to be +available for the tmpfs mount. The total memory requirement is therefore +approximately: 16MB (vmlinuz) + 48MB (loaded initramfs) + 48MB (unpacked +kernel) + 96MB (for tmpfs) + 12MB (runtime overhead) ≈ 220MB. + +By using initramfs_options=size=75%, the memory pool required for the +48MB tmpfs is reduced to 48MB / 0.75 = 64MB. This reduces the total +memory requirement by 32MB (96MB - 64MB), allowing the kdump to succeed +with a smaller crashkernel size, such as 192MB. + +An alternative approach of reusing the existing rootflags parameter was +considered. 
However, a new, dedicated initramfs_options parameter was +chosen to avoid altering the current behavior of rootflags (which +applies to the final root filesystem) and to prevent any potential +regressions. + +Also add documentation for the new kernel parameter "initramfs_options" + +This approach is inspired by prior discussions and patches on the topic. +Ref: https://www.lightofdawn.org/blog/?viewDetailed=00128 +Ref: https://landley.net/notes-2015.html#01-01-2015 +Ref: https://lkml.org/lkml/2021/6/29/783 +Ref: https://www.kernel.org/doc/html/latest/filesystems/ramfs-rootfs-initramfs.html#what-is-rootfs + +Signed-off-by: Lichen Liu +Link: https://lore.kernel.org/20250815121459.3391223-1-lichliu@redhat.com +Tested-by: Rob Landley +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + Documentation/admin-guide/kernel-parameters.txt | 3 +++ + fs/namespace.c | 11 ++++++++++- + 2 files changed, 13 insertions(+), 1 deletion(-) + +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index 60d48ebbc2cb0..fff3ca50c6c26 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -5638,6 +5638,9 @@ + + rootflags= [KNL] Set root filesystem mount option string + ++ initramfs_options= [KNL] ++ Specify mount options for the initramfs mount. ++ + rootfstype= [KNL] Set root filesystem type + + rootwait [KNL] Wait (indefinitely) for root device to show up. 
+diff --git a/fs/namespace.c b/fs/namespace.c +index f79226472251b..646d9e7d41ee8 100644 +--- a/fs/namespace.c ++++ b/fs/namespace.c +@@ -64,6 +64,15 @@ static int __init set_mphash_entries(char *str) + } + __setup("mphash_entries=", set_mphash_entries); + ++static char * __initdata initramfs_options; ++static int __init initramfs_options_setup(char *str) ++{ ++ initramfs_options = str; ++ return 1; ++} ++ ++__setup("initramfs_options=", initramfs_options_setup); ++ + static u64 event; + static DEFINE_IDA(mnt_id_ida); + static DEFINE_IDA(mnt_group_ida); +@@ -4728,7 +4737,7 @@ static void __init init_mount_tree(void) + struct mnt_namespace *ns; + struct path root; + +- mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", NULL); ++ mnt = vfs_kern_mount(&rootfs_fs_type, 0, "rootfs", initramfs_options); + if (IS_ERR(mnt)) + panic("Can't create rootfs"); + +-- +2.51.0 + diff --git a/queue-6.6/irqchip-sifive-plic-avoid-interrupt-id-0-handling-du.patch b/queue-6.6/irqchip-sifive-plic-avoid-interrupt-id-0-handling-du.patch new file mode 100644 index 0000000000..8235fa387f --- /dev/null +++ b/queue-6.6/irqchip-sifive-plic-avoid-interrupt-id-0-handling-du.patch @@ -0,0 +1,64 @@ +From bf7a84c0673734f3223aeab43a839908ceac24db Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 23 Sep 2025 15:43:19 +0100 +Subject: irqchip/sifive-plic: Avoid interrupt ID 0 handling during + suspend/resume + +From: Lucas Zampieri + +[ Upstream commit f75e07bf5226da640fa99a0594687c780d9bace4 ] + +According to the PLIC specification[1], global interrupt sources are +assigned small unsigned integer identifiers beginning at the value 1. +An interrupt ID of 0 is reserved to mean "no interrupt". + +The current plic_irq_resume() and plic_irq_suspend() functions incorrectly +start the loop from index 0, which accesses the register space for the +reserved interrupt ID 0. + +Change the loop to start from index 1, skipping the reserved +interrupt ID 0 as per the PLIC specification. 
+ +This prevents potential undefined behavior when accessing the reserved +register space during suspend/resume cycles. + +Fixes: e80f0b6a2cf3 ("irqchip/irq-sifive-plic: Add syscore callbacks for hibernation") +Co-developed-by: Jia Wang +Signed-off-by: Jia Wang +Co-developed-by: Charles Mirabile +Signed-off-by: Charles Mirabile +Signed-off-by: Lucas Zampieri +Signed-off-by: Thomas Gleixner +Link: https://github.com/riscv/riscv-plic-spec/releases/tag/1.0.0 +Signed-off-by: Sasha Levin +--- + drivers/irqchip/irq-sifive-plic.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/drivers/irqchip/irq-sifive-plic.c b/drivers/irqchip/irq-sifive-plic.c +index 0fcd37108b67e..2d20cf9d84cea 100644 +--- a/drivers/irqchip/irq-sifive-plic.c ++++ b/drivers/irqchip/irq-sifive-plic.c +@@ -248,7 +248,8 @@ static int plic_irq_suspend(void) + + priv = per_cpu_ptr(&plic_handlers, smp_processor_id())->priv; + +- for (i = 0; i < priv->nr_irqs; i++) { ++ /* irq ID 0 is reserved */ ++ for (i = 1; i < priv->nr_irqs; i++) { + __assign_bit(i, priv->prio_save, + readl(priv->regs + PRIORITY_BASE + i * PRIORITY_PER_ID)); + } +@@ -278,7 +279,8 @@ static void plic_irq_resume(void) + + priv = per_cpu_ptr(&plic_handlers, smp_processor_id())->priv; + +- for (i = 0; i < priv->nr_irqs; i++) { ++ /* irq ID 0 is reserved */ ++ for (i = 1; i < priv->nr_irqs; i++) { + index = BIT_WORD(i); + writel((priv->prio_save[index] & BIT_MASK(i)) ? 
1 : 0, + priv->regs + PRIORITY_BASE + i * PRIORITY_PER_ID); +-- +2.51.0 + diff --git a/queue-6.6/irqchip-sifive-plic-make-use-of-__assign_bit.patch b/queue-6.6/irqchip-sifive-plic-make-use-of-__assign_bit.patch new file mode 100644 index 0000000000..c752824a5a --- /dev/null +++ b/queue-6.6/irqchip-sifive-plic-make-use-of-__assign_bit.patch @@ -0,0 +1,51 @@ +From ada1def3fe5a7e2bd853ee04dc0d0e4287fbc9d8 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 2 Sep 2024 21:08:24 +0800 +Subject: irqchip/sifive-plic: Make use of __assign_bit() + +From: Hongbo Li + +[ Upstream commit 40d7af5375a4e27d8576d9d11954ac213d06f09e ] + +Replace the open coded + +if (foo) + __set_bit(n, bar); + else + __clear_bit(n, bar); + +with __assign_bit(). No functional change intended. + +Signed-off-by: Hongbo Li +Signed-off-by: Thomas Gleixner +Reviewed-by: Palmer Dabbelt +Link: https://lore.kernel.org/all/20240902130824.2878644-1-lihongbo22@huawei.com +Stable-dep-of: f75e07bf5226 ("irqchip/sifive-plic: Avoid interrupt ID 0 handling during suspend/resume") +Signed-off-by: Sasha Levin +--- + drivers/irqchip/irq-sifive-plic.c | 9 ++++----- + 1 file changed, 4 insertions(+), 5 deletions(-) + +diff --git a/drivers/irqchip/irq-sifive-plic.c b/drivers/irqchip/irq-sifive-plic.c +index 5728996691549..0fcd37108b67e 100644 +--- a/drivers/irqchip/irq-sifive-plic.c ++++ b/drivers/irqchip/irq-sifive-plic.c +@@ -248,11 +248,10 @@ static int plic_irq_suspend(void) + + priv = per_cpu_ptr(&plic_handlers, smp_processor_id())->priv; + +- for (i = 0; i < priv->nr_irqs; i++) +- if (readl(priv->regs + PRIORITY_BASE + i * PRIORITY_PER_ID)) +- __set_bit(i, priv->prio_save); +- else +- __clear_bit(i, priv->prio_save); ++ for (i = 0; i < priv->nr_irqs; i++) { ++ __assign_bit(i, priv->prio_save, ++ readl(priv->regs + PRIORITY_BASE + i * PRIORITY_PER_ID)); ++ } + + for_each_cpu(cpu, cpu_present_mask) { + struct plic_handler *handler = per_cpu_ptr(&plic_handlers, cpu); +-- +2.51.0 + diff --git 
a/queue-6.6/minixfs-verify-inode-mode-when-loading-from-disk.patch b/queue-6.6/minixfs-verify-inode-mode-when-loading-from-disk.patch new file mode 100644 index 0000000000..cd9c539c9c --- /dev/null +++ b/queue-6.6/minixfs-verify-inode-mode-when-loading-from-disk.patch @@ -0,0 +1,46 @@ +From 5d53e0ce33564aebcf764c0ed37fb03ce36a965d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 13 Aug 2025 00:17:44 +0900 +Subject: minixfs: Verify inode mode when loading from disk + +From: Tetsuo Handa + +[ Upstream commit 73861970938ad1323eb02bbbc87f6fbd1e5bacca ] + +The inode mode loaded from corrupted disk can be invalid. Do like what +commit 0a9e74051313 ("isofs: Verify inode mode when loading from disk") +does. + +Reported-by: syzbot +Closes: https://syzkaller.appspot.com/bug?extid=895c23f6917da440ed0d +Signed-off-by: Tetsuo Handa +Link: https://lore.kernel.org/ec982681-84b8-4624-94fa-8af15b77cbd2@I-love.SAKURA.ne.jp +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + fs/minix/inode.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +diff --git a/fs/minix/inode.c b/fs/minix/inode.c +index df575473c1cc0..ee8a6fe360e72 100644 +--- a/fs/minix/inode.c ++++ b/fs/minix/inode.c +@@ -470,8 +470,14 @@ void minix_set_inode(struct inode *inode, dev_t rdev) + inode->i_op = &minix_symlink_inode_operations; + inode_nohighmem(inode); + inode->i_mapping->a_ops = &minix_aops; +- } else ++ } else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || ++ S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { + init_special_inode(inode, inode->i_mode, rdev); ++ } else { ++ printk(KERN_DEBUG "MINIX-fs: Invalid file type 0%04o for inode %lu.\n", ++ inode->i_mode, inode->i_ino); ++ make_bad_inode(inode); ++ } + } + + /* +-- +2.51.0 + diff --git a/queue-6.6/pid-add-a-judgment-for-ns-null-in-pid_nr_ns.patch b/queue-6.6/pid-add-a-judgment-for-ns-null-in-pid_nr_ns.patch new file mode 100644 index 0000000000..385515bb5a --- /dev/null +++ 
b/queue-6.6/pid-add-a-judgment-for-ns-null-in-pid_nr_ns.patch @@ -0,0 +1,95 @@ +From 9694c9b5a0f7267934f619b147c175af4bf654b9 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 2 Aug 2025 10:21:23 +0800 +Subject: pid: Add a judgment for ns null in pid_nr_ns + +From: gaoxiang17 + +[ Upstream commit 006568ab4c5ca2309ceb36fa553e390b4aa9c0c7 ] + +__task_pid_nr_ns + ns = task_active_pid_ns(current); + pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); + if (pid && ns->level <= pid->level) { + +Sometimes null is returned for task_active_pid_ns. Then it will trigger kernel panic in pid_nr_ns. + +For example: + Unable to handle kernel NULL pointer dereference at virtual address 0000000000000058 + Mem abort info: + ESR = 0x0000000096000007 + EC = 0x25: DABT (current EL), IL = 32 bits + SET = 0, FnV = 0 + EA = 0, S1PTW = 0 + FSC = 0x07: level 3 translation fault + Data abort info: + ISV = 0, ISS = 0x00000007, ISS2 = 0x00000000 + CM = 0, WnR = 0, TnD = 0, TagAccess = 0 + GCS = 0, Overlay = 0, DirtyBit = 0, Xs = 0 + user pgtable: 4k pages, 39-bit VAs, pgdp=00000002175aa000 + [0000000000000058] pgd=08000002175ab003, p4d=08000002175ab003, pud=08000002175ab003, pmd=08000002175be003, pte=0000000000000000 + pstate: 834000c5 (Nzcv daIF +PAN -UAO +TCO +DIT -SSBS BTYPE=--) + pc : __task_pid_nr_ns+0x74/0xd0 + lr : __task_pid_nr_ns+0x24/0xd0 + sp : ffffffc08001bd10 + x29: ffffffc08001bd10 x28: ffffffd4422b2000 x27: 0000000000000001 + x26: ffffffd442821168 x25: ffffffd442821000 x24: 00000f89492eab31 + x23: 00000000000000c0 x22: ffffff806f5693c0 x21: ffffff806f5693c0 + x20: 0000000000000001 x19: 0000000000000000 x18: 0000000000000000 + x17: 00000000529c6ef0 x16: 00000000529c6ef0 x15: 00000000023a1adc + x14: 0000000000000003 x13: 00000000007ef6d8 x12: 001167c391c78800 + x11: 00ffffffffffffff x10: 0000000000000000 x9 : 0000000000000001 + x8 : ffffff80816fa3c0 x7 : 0000000000000000 x6 : 49534d702d535449 + x5 : ffffffc080c4c2c0 x4 : ffffffd43ee128c8 x3 : ffffffd43ee124dc + x2 : 
0000000000000000 x1 : 0000000000000001 x0 : ffffff806f5693c0 + Call trace: + __task_pid_nr_ns+0x74/0xd0 + ... + __handle_irq_event_percpu+0xd4/0x284 + handle_irq_event+0x48/0xb0 + handle_fasteoi_irq+0x160/0x2d8 + generic_handle_domain_irq+0x44/0x60 + gic_handle_irq+0x4c/0x114 + call_on_irq_stack+0x3c/0x74 + do_interrupt_handler+0x4c/0x84 + el1_interrupt+0x34/0x58 + el1h_64_irq_handler+0x18/0x24 + el1h_64_irq+0x68/0x6c + account_kernel_stack+0x60/0x144 + exit_task_stack_account+0x1c/0x80 + do_exit+0x7e4/0xaf8 + ... + get_signal+0x7bc/0x8d8 + do_notify_resume+0x128/0x828 + el0_svc+0x6c/0x70 + el0t_64_sync_handler+0x68/0xbc + el0t_64_sync+0x1a8/0x1ac + Code: 35fffe54 911a02a8 f9400108 b4000128 (b9405a69) + ---[ end trace 0000000000000000 ]--- + Kernel panic - not syncing: Oops: Fatal exception in interrupt + +Signed-off-by: gaoxiang17 +Link: https://lore.kernel.org/20250802022123.3536934-1-gxxa03070307@gmail.com +Reviewed-by: Baoquan He +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + kernel/pid.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/kernel/pid.c b/kernel/pid.c +index 6500ef956f2f8..e57adc00cb779 100644 +--- a/kernel/pid.c ++++ b/kernel/pid.c +@@ -477,7 +477,7 @@ pid_t pid_nr_ns(struct pid *pid, struct pid_namespace *ns) + struct upid *upid; + pid_t nr = 0; + +- if (pid && ns->level <= pid->level) { ++ if (pid && ns && ns->level <= pid->level) { + upid = &pid->numbers[ns->level]; + if (upid->ns == ns) + nr = upid->nr; +-- +2.51.0 + diff --git a/queue-6.6/pid-make-__task_pid_nr_ns-ns-null-safe-for-zombie-ca.patch b/queue-6.6/pid-make-__task_pid_nr_ns-ns-null-safe-for-zombie-ca.patch new file mode 100644 index 0000000000..71581b2175 --- /dev/null +++ b/queue-6.6/pid-make-__task_pid_nr_ns-ns-null-safe-for-zombie-ca.patch @@ -0,0 +1,48 @@ +From 94ebb937895b3a258a3422b9afee350632d69ff9 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 10 Aug 2025 19:36:04 +0200 +Subject: pid: make __task_pid_nr_ns(ns => NULL) safe 
for zombie callers +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Oleg Nesterov + +[ Upstream commit abdfd4948e45c51b19162cf8b3f5003f8f53c9b9 ] + +task_pid_vnr(another_task) will crash if the caller was already reaped. +The pid_alive(current) check can't really help, the parent/debugger can +call release_task() right after this check. + +This also means that even task_ppid_nr_ns(current, NULL) is not safe, +pid_alive() only ensures that it is safe to dereference ->real_parent. + +Change __task_pid_nr_ns() to ensure ns != NULL. + +Originally-by: 高翔 +Link: https://lore.kernel.org/all/20250802022123.3536934-1-gxxa03070307@gmail.com/ +Signed-off-by: Oleg Nesterov +Link: https://lore.kernel.org/20250810173604.GA19991@redhat.com +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + kernel/pid.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/kernel/pid.c b/kernel/pid.c +index e57adc00cb779..69922b2e7ed15 100644 +--- a/kernel/pid.c ++++ b/kernel/pid.c +@@ -500,7 +500,8 @@ pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type, + rcu_read_lock(); + if (!ns) + ns = task_active_pid_ns(current); +- nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); ++ if (ns) ++ nr = pid_nr_ns(rcu_dereference(*task_pid_ptr(task, type)), ns); + rcu_read_unlock(); + + return nr; +-- +2.51.0 + diff --git a/queue-6.6/series b/queue-6.6/series index 6f7c96df79..dfc2ca96c8 100644 --- a/queue-6.6/series +++ b/queue-6.6/series @@ -189,3 +189,12 @@ s390-bpf-write-back-tail-call-counter-for-bpf_pseudo_call.patch s390-bpf-write-back-tail-call-counter-for-bpf_tramp_f_call_orig.patch selftests-mm-skip-soft-dirty-tests-when-config_mem_soft_dirty-is-disabled.patch mptcp-pm-in-kernel-usable-client-side-with-c-flag.patch +irqchip-sifive-plic-make-use-of-__assign_bit.patch +irqchip-sifive-plic-avoid-interrupt-id-0-handling-du.patch +minixfs-verify-inode-mode-when-loading-from-disk.patch 
+pid-add-a-judgment-for-ns-null-in-pid_nr_ns.patch +pid-make-__task_pid_nr_ns-ns-null-safe-for-zombie-ca.patch +fs-add-initramfs_options-to-set-initramfs-mount-opti.patch +cramfs-verify-inode-mode-when-loading-from-disk.patch +writeback-avoid-softlockup-when-switching-many-inode.patch +writeback-avoid-excessively-long-inode-switching-tim.patch diff --git a/queue-6.6/writeback-avoid-excessively-long-inode-switching-tim.patch b/queue-6.6/writeback-avoid-excessively-long-inode-switching-tim.patch new file mode 100644 index 0000000000..145a205f58 --- /dev/null +++ b/queue-6.6/writeback-avoid-excessively-long-inode-switching-tim.patch @@ -0,0 +1,102 @@ +From 11c76462dd44df84c01eb56f712a52f07819be26 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 12 Sep 2025 12:38:37 +0200 +Subject: writeback: Avoid excessively long inode switching times + +From: Jan Kara + +[ Upstream commit 9a6ebbdbd41235ea3bc0c4f39e2076599b8113cc ] + +With lazytime mount option enabled we can be switching many dirty inodes +on cgroup exit to the parent cgroup. The numbers observed in practice +when systemd slice of a large cron job exits can easily reach hundreds +of thousands or millions. The logic in inode_do_switch_wbs() which sorts +the inode into appropriate place in b_dirty list of the target wb +however has linear complexity in the number of dirty inodes thus overall +time complexity of switching all the inodes is quadratic leading to +workers being pegged for hours consuming 100% of the CPU and switching +inodes to the parent wb. 
+ +Simple reproducer of the issue: + FILES=10000 + # Filesystem mounted with lazytime mount option + MNT=/mnt/ + echo "Creating files and switching timestamps" + for (( j = 0; j < 50; j ++ )); do + mkdir $MNT/dir$j + for (( i = 0; i < $FILES; i++ )); do + echo "foo" >$MNT/dir$j/file$i + done + touch -a -t 202501010000 $MNT/dir$j/file* + done + wait + echo "Syncing and flushing" + sync + echo 3 >/proc/sys/vm/drop_caches + + echo "Reading all files from a cgroup" + mkdir /sys/fs/cgroup/unified/mycg1 || exit + echo $$ >/sys/fs/cgroup/unified/mycg1/cgroup.procs || exit + for (( j = 0; j < 50; j ++ )); do + cat /mnt/dir$j/file* >/dev/null & + done + wait + echo "Switching wbs" + # Now rmdir the cgroup after the script exits + +We need to maintain b_dirty list ordering to keep writeback happy so +instead of sorting inode into appropriate place just append it at the +end of the list and clobber dirtied_time_when. This may result in inode +writeback starting later after cgroup switch however cgroup switches are +rare so it shouldn't matter much. Since the cgroup had write access to +the inode, there are no practical concerns of the possible DoS issues. + +Acked-by: Tejun Heo +Signed-off-by: Jan Kara +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + fs/fs-writeback.c | 21 +++++++++++---------- + 1 file changed, 11 insertions(+), 10 deletions(-) + +diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c +index 0454a1f0fc636..274fae88b498e 100644 +--- a/fs/fs-writeback.c ++++ b/fs/fs-writeback.c +@@ -422,22 +422,23 @@ static bool inode_do_switch_wbs(struct inode *inode, + * Transfer to @new_wb's IO list if necessary. If the @inode is dirty, + * the specific list @inode was on is ignored and the @inode is put on + * ->b_dirty which is always correct including from ->b_dirty_time. +- * The transfer preserves @inode->dirtied_when ordering. If the @inode +- * was clean, it means it was on the b_attached list, so move it onto +- * the b_attached list of @new_wb. 
++ * If the @inode was clean, it means it was on the b_attached list, so ++ * move it onto the b_attached list of @new_wb. + */ + if (!list_empty(&inode->i_io_list)) { + inode->i_wb = new_wb; + + if (inode->i_state & I_DIRTY_ALL) { +- struct inode *pos; +- +- list_for_each_entry(pos, &new_wb->b_dirty, i_io_list) +- if (time_after_eq(inode->dirtied_when, +- pos->dirtied_when)) +- break; ++ /* ++ * We need to keep b_dirty list sorted by ++ * dirtied_time_when. However properly sorting the ++ * inode in the list gets too expensive when switching ++ * many inodes. So just attach inode at the end of the ++ * dirty list and clobber the dirtied_time_when. ++ */ ++ inode->dirtied_time_when = jiffies; + inode_io_list_move_locked(inode, new_wb, +- pos->i_io_list.prev); ++ &new_wb->b_dirty); + } else { + inode_cgwb_move_to_attached(inode, new_wb); + } +-- +2.51.0 + diff --git a/queue-6.6/writeback-avoid-softlockup-when-switching-many-inode.patch b/queue-6.6/writeback-avoid-softlockup-when-switching-many-inode.patch new file mode 100644 index 0000000000..9894c809ae --- /dev/null +++ b/queue-6.6/writeback-avoid-softlockup-when-switching-many-inode.patch @@ -0,0 +1,65 @@ +From 1823f0dc3123706c07e1aeb86cf0175ac090becd Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 12 Sep 2025 12:38:36 +0200 +Subject: writeback: Avoid softlockup when switching many inodes + +From: Jan Kara + +[ Upstream commit 66c14dccd810d42ec5c73bb8a9177489dfd62278 ] + +process_inode_switch_wbs_work() can be switching over 100 inodes to a +different cgroup. Since switching an inode requires counting all dirty & +under-writeback pages in the address space of each inode, this can take +a significant amount of time. Add a possibility to reschedule after +processing each inode to avoid softlockups. 
+ +Acked-by: Tejun Heo +Signed-off-by: Jan Kara +Signed-off-by: Christian Brauner +Signed-off-by: Sasha Levin +--- + fs/fs-writeback.c | 11 ++++++++++- + 1 file changed, 10 insertions(+), 1 deletion(-) + +diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c +index ed110568d6127..0454a1f0fc636 100644 +--- a/fs/fs-writeback.c ++++ b/fs/fs-writeback.c +@@ -479,6 +479,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) + */ + down_read(&bdi->wb_switch_rwsem); + ++ inodep = isw->inodes; + /* + * By the time control reaches here, RCU grace period has passed + * since I_WB_SWITCH assertion and all wb stat update transactions +@@ -489,6 +490,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) + * gives us exclusion against all wb related operations on @inode + * including IO list manipulations and stat updates. + */ ++relock: + if (old_wb < new_wb) { + spin_lock(&old_wb->list_lock); + spin_lock_nested(&new_wb->list_lock, SINGLE_DEPTH_NESTING); +@@ -497,10 +499,17 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) + spin_lock_nested(&old_wb->list_lock, SINGLE_DEPTH_NESTING); + } + +- for (inodep = isw->inodes; *inodep; inodep++) { ++ while (*inodep) { + WARN_ON_ONCE((*inodep)->i_wb != old_wb); + if (inode_do_switch_wbs(*inodep, old_wb, new_wb)) + nr_switched++; ++ inodep++; ++ if (*inodep && need_resched()) { ++ spin_unlock(&new_wb->list_lock); ++ spin_unlock(&old_wb->list_lock); ++ cond_resched(); ++ goto relock; ++ } + } + + spin_unlock(&new_wb->list_lock); +-- +2.51.0 + -- 2.47.3