From 030cf1f3244153a3b0c765b2b492c7c16bdc8043 Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 13 Jun 2024 13:03:22 +0200 Subject: [PATCH] 6.9-stable patches added patches: alsa-seq-fix-incorrect-ump-type-for-system-messages.patch bpf-fix-multi-uprobe-pid-filtering-logic.patch btrfs-fix-crash-on-racing-fsync-and-size-extending-write-into-prealloc.patch btrfs-fix-leak-of-qgroup-extent-records-after-transaction-abort.patch btrfs-protect-folio-private-when-attaching-extent-buffer-folios.patch btrfs-qgroup-fix-qgroup-id-collision-across-mounts.patch btrfs-qgroup-update-rescan-message-levels-and-error-codes.patch btrfs-re-introduce-norecovery-mount-option.patch cifs-fix-creating-sockets-when-using-sfu-mount-options.patch edac-amd64-convert-pcibios_-return-codes-to-errnos.patch edac-igen6-convert-pcibios_-return-codes-to-errnos.patch eventfs-fix-a-possible-null-pointer-dereference-in-eventfs_find_events.patch eventfs-keep-the-directories-from-having-the-same-inode-number-as-files.patch nfs-fix-read_plus-when-server-doesn-t-support-op_read_plus.patch nfs-fix-undefined-behavior-in-nfs_block_bits.patch nilfs2-fix-nilfs_empty_dir-misjudgment-and-long-loop-on-i-o-errors.patch nilfs2-fix-potential-kernel-bug-due-to-lack-of-writeback-flag-waiting.patch powerpc-64-bpf-fix-tail-calls-for-pcrel-addressing.patch powerpc-bpf-enforce-full-ordering-for-atomic-operations-with-bpf_fetch.patch smb-client-fix-deadlock-in-smb2_find_smb_tcon.patch tracefs-clear-event_inode-flag-in-tracefs_drop_inode.patch --- ...correct-ump-type-for-system-messages.patch | 40 +++ ...fix-multi-uprobe-pid-filtering-logic.patch | 94 ++++++ ...d-size-extending-write-into-prealloc.patch | 218 ++++++++++++++ ...tent-records-after-transaction-abort.patch | 60 ++++ ...-when-attaching-extent-buffer-folios.patch | 279 ++++++++++++++++++ ...ix-qgroup-id-collision-across-mounts.patch | 66 +++++ ...escan-message-levels-and-error-codes.patch | 65 ++++ ...re-introduce-norecovery-mount-option.patch | 68 +++++ 
...sockets-when-using-sfu-mount-options.patch | 65 ++++ ...vert-pcibios_-return-codes-to-errnos.patch | 65 ++++ ...vert-pcibios_-return-codes-to-errnos.patch | 51 ++++ ...r-dereference-in-eventfs_find_events.patch | 42 +++ ...aving-the-same-inode-number-as-files.patch | 44 +++ ...-server-doesn-t-support-op_read_plus.patch | 41 +++ ...undefined-behavior-in-nfs_block_bits.patch | 38 +++ ...judgment-and-long-loop-on-i-o-errors.patch | 46 +++ ...ue-to-lack-of-writeback-flag-waiting.patch | 76 +++++ ...-fix-tail-calls-for-pcrel-addressing.patch | 109 +++++++ ...for-atomic-operations-with-bpf_fetch.patch | 138 +++++++++ queue-6.9/series | 21 ++ ...t-fix-deadlock-in-smb2_find_smb_tcon.patch | 34 +++ ...ent_inode-flag-in-tracefs_drop_inode.patch | 95 ++++++ 22 files changed, 1755 insertions(+) create mode 100644 queue-6.9/alsa-seq-fix-incorrect-ump-type-for-system-messages.patch create mode 100644 queue-6.9/bpf-fix-multi-uprobe-pid-filtering-logic.patch create mode 100644 queue-6.9/btrfs-fix-crash-on-racing-fsync-and-size-extending-write-into-prealloc.patch create mode 100644 queue-6.9/btrfs-fix-leak-of-qgroup-extent-records-after-transaction-abort.patch create mode 100644 queue-6.9/btrfs-protect-folio-private-when-attaching-extent-buffer-folios.patch create mode 100644 queue-6.9/btrfs-qgroup-fix-qgroup-id-collision-across-mounts.patch create mode 100644 queue-6.9/btrfs-qgroup-update-rescan-message-levels-and-error-codes.patch create mode 100644 queue-6.9/btrfs-re-introduce-norecovery-mount-option.patch create mode 100644 queue-6.9/cifs-fix-creating-sockets-when-using-sfu-mount-options.patch create mode 100644 queue-6.9/edac-amd64-convert-pcibios_-return-codes-to-errnos.patch create mode 100644 queue-6.9/edac-igen6-convert-pcibios_-return-codes-to-errnos.patch create mode 100644 queue-6.9/eventfs-fix-a-possible-null-pointer-dereference-in-eventfs_find_events.patch create mode 100644 queue-6.9/eventfs-keep-the-directories-from-having-the-same-inode-number-as-files.patch 
create mode 100644 queue-6.9/nfs-fix-read_plus-when-server-doesn-t-support-op_read_plus.patch create mode 100644 queue-6.9/nfs-fix-undefined-behavior-in-nfs_block_bits.patch create mode 100644 queue-6.9/nilfs2-fix-nilfs_empty_dir-misjudgment-and-long-loop-on-i-o-errors.patch create mode 100644 queue-6.9/nilfs2-fix-potential-kernel-bug-due-to-lack-of-writeback-flag-waiting.patch create mode 100644 queue-6.9/powerpc-64-bpf-fix-tail-calls-for-pcrel-addressing.patch create mode 100644 queue-6.9/powerpc-bpf-enforce-full-ordering-for-atomic-operations-with-bpf_fetch.patch create mode 100644 queue-6.9/smb-client-fix-deadlock-in-smb2_find_smb_tcon.patch create mode 100644 queue-6.9/tracefs-clear-event_inode-flag-in-tracefs_drop_inode.patch diff --git a/queue-6.9/alsa-seq-fix-incorrect-ump-type-for-system-messages.patch b/queue-6.9/alsa-seq-fix-incorrect-ump-type-for-system-messages.patch new file mode 100644 index 00000000000..1ed656e762f --- /dev/null +++ b/queue-6.9/alsa-seq-fix-incorrect-ump-type-for-system-messages.patch @@ -0,0 +1,40 @@ +From edb32776196afa393c074d6a2733e3a69e66b299 Mon Sep 17 00:00:00 2001 +From: Takashi Iwai +Date: Wed, 29 May 2024 10:37:59 +0200 +Subject: ALSA: seq: Fix incorrect UMP type for system messages + +From: Takashi Iwai + +commit edb32776196afa393c074d6a2733e3a69e66b299 upstream. + +When converting a legacy system message to a UMP packet, it forgot to +modify the UMP type field but keeping the default type (either type 2 +or 4). Correct to the right type for system messages. 
+ +Fixes: e9e02819a98a ("ALSA: seq: Automatic conversion of UMP events") +Cc: +Link: https://lore.kernel.org/r/20240529083800.5742-1-tiwai@suse.de +Signed-off-by: Takashi Iwai +Signed-off-by: Greg Kroah-Hartman +--- + sound/core/seq/seq_ump_convert.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/sound/core/seq/seq_ump_convert.c ++++ b/sound/core/seq/seq_ump_convert.c +@@ -740,6 +740,7 @@ static int system_1p_ev_to_ump_midi1(con + union snd_ump_midi1_msg *data, + unsigned char status) + { ++ data->system.type = UMP_MSG_TYPE_SYSTEM; // override + data->system.status = status; + data->system.parm1 = event->data.control.value & 0x7f; + return 1; +@@ -751,6 +752,7 @@ static int system_2p_ev_to_ump_midi1(con + union snd_ump_midi1_msg *data, + unsigned char status) + { ++ data->system.type = UMP_MSG_TYPE_SYSTEM; // override + data->system.status = status; + data->system.parm1 = event->data.control.value & 0x7f; + data->system.parm2 = (event->data.control.value >> 7) & 0x7f; diff --git a/queue-6.9/bpf-fix-multi-uprobe-pid-filtering-logic.patch b/queue-6.9/bpf-fix-multi-uprobe-pid-filtering-logic.patch new file mode 100644 index 00000000000..9d4db3090c7 --- /dev/null +++ b/queue-6.9/bpf-fix-multi-uprobe-pid-filtering-logic.patch @@ -0,0 +1,94 @@ +From 46ba0e49b64232adac35a2bc892f1710c5b0fb7f Mon Sep 17 00:00:00 2001 +From: Andrii Nakryiko +Date: Tue, 21 May 2024 09:33:57 -0700 +Subject: bpf: fix multi-uprobe PID filtering logic + +From: Andrii Nakryiko + +commit 46ba0e49b64232adac35a2bc892f1710c5b0fb7f upstream. + +Current implementation of PID filtering logic for multi-uprobes in +uprobe_prog_run() is filtering down to exact *thread*, while the intent +for PID filtering is to filter by *process* instead. The check in +uprobe_prog_run() also differs from the analogous one in +uprobe_multi_link_filter() for some reason. The latter is correct, +checking task->mm, not the task itself. + +Fix the check in uprobe_prog_run() to perform the same task->mm check. 
+ +While doing this, we also update the get_pid_task() call to use the +PIDTYPE_TGID type of lookup, given the intent is to get a representative +task of an entire process. This doesn't change behavior, but seems more +logical. It now holds the thread group leader task, not any random thread +task. + +Last but not least, given multi-uprobe support is half-broken due to +this PID filtering logic (depending on whether PID filtering is +important or not), we need to make it easy for user space consumers +(including libbpf) to detect whether PID filtering logic was +already fixed. + +We do it here by adding an early check on the passed pid parameter. If it's +negative (and so has no chance of being a valid PID), we return -EINVAL. +Previous behavior would eventually return -ESRCH ("No process found"), +given there can't be any process with negative PID. This subtle change +won't make any practical change in behavior, but will allow applications +to detect PID filtering fixes easily. Libbpf fixes take advantage of +this in the next patch. 
+ +Cc: stable@vger.kernel.org +Acked-by: Jiri Olsa +Fixes: b733eeade420 ("bpf: Add pid filter support for uprobe_multi link") +Signed-off-by: Andrii Nakryiko +Link: https://lore.kernel.org/r/20240521163401.3005045-2-andrii@kernel.org +Signed-off-by: Alexei Starovoitov +Signed-off-by: Greg Kroah-Hartman +--- + kernel/trace/bpf_trace.c | 8 ++++---- + tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c | 2 +- + 2 files changed, 5 insertions(+), 5 deletions(-) + +--- a/kernel/trace/bpf_trace.c ++++ b/kernel/trace/bpf_trace.c +@@ -3260,7 +3260,7 @@ static int uprobe_prog_run(struct bpf_up + struct bpf_run_ctx *old_run_ctx; + int err = 0; + +- if (link->task && current != link->task) ++ if (link->task && current->mm != link->task->mm) + return 0; + + if (sleepable) +@@ -3361,8 +3361,9 @@ int bpf_uprobe_multi_link_attach(const u + upath = u64_to_user_ptr(attr->link_create.uprobe_multi.path); + uoffsets = u64_to_user_ptr(attr->link_create.uprobe_multi.offsets); + cnt = attr->link_create.uprobe_multi.cnt; ++ pid = attr->link_create.uprobe_multi.pid; + +- if (!upath || !uoffsets || !cnt) ++ if (!upath || !uoffsets || !cnt || pid < 0) + return -EINVAL; + if (cnt > MAX_UPROBE_MULTI_CNT) + return -E2BIG; +@@ -3386,10 +3387,9 @@ int bpf_uprobe_multi_link_attach(const u + goto error_path_put; + } + +- pid = attr->link_create.uprobe_multi.pid; + if (pid) { + rcu_read_lock(); +- task = get_pid_task(find_vpid(pid), PIDTYPE_PID); ++ task = get_pid_task(find_vpid(pid), PIDTYPE_TGID); + rcu_read_unlock(); + if (!task) { + err = -ESRCH; +--- a/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c ++++ b/tools/testing/selftests/bpf/prog_tests/uprobe_multi_test.c +@@ -397,7 +397,7 @@ static void test_attach_api_fails(void) + link_fd = bpf_link_create(prog_fd, 0, BPF_TRACE_UPROBE_MULTI, &opts); + if (!ASSERT_ERR(link_fd, "link_fd")) + goto cleanup; +- ASSERT_EQ(link_fd, -ESRCH, "pid_is_wrong"); ++ ASSERT_EQ(link_fd, -EINVAL, "pid_is_wrong"); + + cleanup: + if (link_fd >= 0) 
diff --git a/queue-6.9/btrfs-fix-crash-on-racing-fsync-and-size-extending-write-into-prealloc.patch b/queue-6.9/btrfs-fix-crash-on-racing-fsync-and-size-extending-write-into-prealloc.patch new file mode 100644 index 00000000000..a1732028c19 --- /dev/null +++ b/queue-6.9/btrfs-fix-crash-on-racing-fsync-and-size-extending-write-into-prealloc.patch @@ -0,0 +1,218 @@ +From 9d274c19a71b3a276949933859610721a453946b Mon Sep 17 00:00:00 2001 +From: Omar Sandoval +Date: Fri, 24 May 2024 13:58:11 -0700 +Subject: btrfs: fix crash on racing fsync and size-extending write into prealloc + +From: Omar Sandoval + +commit 9d274c19a71b3a276949933859610721a453946b upstream. + +We have been seeing crashes on duplicate keys in +btrfs_set_item_key_safe(): + + BTRFS critical (device vdb): slot 4 key (450 108 8192) new key (450 108 8192) + ------------[ cut here ]------------ + kernel BUG at fs/btrfs/ctree.c:2620! + invalid opcode: 0000 [#1] PREEMPT SMP PTI + CPU: 0 PID: 3139 Comm: xfs_io Kdump: loaded Not tainted 6.9.0 #6 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-2.fc40 04/01/2014 + RIP: 0010:btrfs_set_item_key_safe+0x11f/0x290 [btrfs] + +With the following stack trace: + + #0 btrfs_set_item_key_safe (fs/btrfs/ctree.c:2620:4) + #1 btrfs_drop_extents (fs/btrfs/file.c:411:4) + #2 log_one_extent (fs/btrfs/tree-log.c:4732:9) + #3 btrfs_log_changed_extents (fs/btrfs/tree-log.c:4955:9) + #4 btrfs_log_inode (fs/btrfs/tree-log.c:6626:9) + #5 btrfs_log_inode_parent (fs/btrfs/tree-log.c:7070:8) + #6 btrfs_log_dentry_safe (fs/btrfs/tree-log.c:7171:8) + #7 btrfs_sync_file (fs/btrfs/file.c:1933:8) + #8 vfs_fsync_range (fs/sync.c:188:9) + #9 vfs_fsync (fs/sync.c:202:9) + #10 do_fsync (fs/sync.c:212:9) + #11 __do_sys_fdatasync (fs/sync.c:225:9) + #12 __se_sys_fdatasync (fs/sync.c:223:1) + #13 __x64_sys_fdatasync (fs/sync.c:223:1) + #14 do_syscall_x64 (arch/x86/entry/common.c:52:14) + #15 do_syscall_64 (arch/x86/entry/common.c:83:7) + #16 entry_SYSCALL_64+0xaf/0x14c 
(arch/x86/entry/entry_64.S:121) + +So we're logging a changed extent from fsync, which is splitting an +extent in the log tree. But this split part already exists in the tree, +triggering the BUG(). + +This is the state of the log tree at the time of the crash, dumped with +drgn (https://github.com/osandov/drgn/blob/main/contrib/btrfs_tree.py) +to get more details than btrfs_print_leaf() gives us: + + >>> print_extent_buffer(prog.crashed_thread().stack_trace()[0]["eb"]) + leaf 33439744 level 0 items 72 generation 9 owner 18446744073709551610 + leaf 33439744 flags 0x100000000000000 + fs uuid e5bd3946-400c-4223-8923-190ef1f18677 + chunk uuid d58cb17e-6d02-494a-829a-18b7d8a399da + item 0 key (450 INODE_ITEM 0) itemoff 16123 itemsize 160 + generation 7 transid 9 size 8192 nbytes 8473563889606862198 + block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 + sequence 204 flags 0x10(PREALLOC) + atime 1716417703.220000000 (2024-05-22 15:41:43) + ctime 1716417704.983333333 (2024-05-22 15:41:44) + mtime 1716417704.983333333 (2024-05-22 15:41:44) + otime 17592186044416.000000000 (559444-03-08 01:40:16) + item 1 key (450 INODE_REF 256) itemoff 16110 itemsize 13 + index 195 namelen 3 name: 193 + item 2 key (450 XATTR_ITEM 1640047104) itemoff 16073 itemsize 37 + location key (0 UNKNOWN.0 0) type XATTR + transid 7 data_len 1 name_len 6 + name: user.a + data a + item 3 key (450 EXTENT_DATA 0) itemoff 16020 itemsize 53 + generation 9 type 1 (regular) + extent data disk byte 303144960 nr 12288 + extent data offset 0 nr 4096 ram 12288 + extent compression 0 (none) + item 4 key (450 EXTENT_DATA 4096) itemoff 15967 itemsize 53 + generation 9 type 2 (prealloc) + prealloc data disk byte 303144960 nr 12288 + prealloc data offset 4096 nr 8192 + item 5 key (450 EXTENT_DATA 8192) itemoff 15914 itemsize 53 + generation 9 type 2 (prealloc) + prealloc data disk byte 303144960 nr 12288 + prealloc data offset 8192 nr 4096 + ... 
+ +So the real problem happened earlier: notice that items 4 (4k-12k) and 5 +(8k-12k) overlap. Both are prealloc extents. Item 4 straddles i_size and +item 5 starts at i_size. + +Here is the state of the filesystem tree at the time of the crash: + + >>> root = prog.crashed_thread().stack_trace()[2]["inode"].root + >>> ret, nodes, slots = btrfs_search_slot(root, BtrfsKey(450, 0, 0)) + >>> print_extent_buffer(nodes[0]) + leaf 30425088 level 0 items 184 generation 9 owner 5 + leaf 30425088 flags 0x100000000000000 + fs uuid e5bd3946-400c-4223-8923-190ef1f18677 + chunk uuid d58cb17e-6d02-494a-829a-18b7d8a399da + ... + item 179 key (450 INODE_ITEM 0) itemoff 4907 itemsize 160 + generation 7 transid 7 size 4096 nbytes 12288 + block group 0 mode 100600 links 1 uid 0 gid 0 rdev 0 + sequence 6 flags 0x10(PREALLOC) + atime 1716417703.220000000 (2024-05-22 15:41:43) + ctime 1716417703.220000000 (2024-05-22 15:41:43) + mtime 1716417703.220000000 (2024-05-22 15:41:43) + otime 1716417703.220000000 (2024-05-22 15:41:43) + item 180 key (450 INODE_REF 256) itemoff 4894 itemsize 13 + index 195 namelen 3 name: 193 + item 181 key (450 XATTR_ITEM 1640047104) itemoff 4857 itemsize 37 + location key (0 UNKNOWN.0 0) type XATTR + transid 7 data_len 1 name_len 6 + name: user.a + data a + item 182 key (450 EXTENT_DATA 0) itemoff 4804 itemsize 53 + generation 9 type 1 (regular) + extent data disk byte 303144960 nr 12288 + extent data offset 0 nr 8192 ram 12288 + extent compression 0 (none) + item 183 key (450 EXTENT_DATA 8192) itemoff 4751 itemsize 53 + generation 9 type 2 (prealloc) + prealloc data disk byte 303144960 nr 12288 + prealloc data offset 8192 nr 4096 + +Item 5 in the log tree corresponds to item 183 in the filesystem tree, +but nothing matches item 4. Furthermore, item 183 is the last item in +the leaf. + +btrfs_log_prealloc_extents() is responsible for logging prealloc extents +beyond i_size. It first truncates any previously logged prealloc extents +that start beyond i_size. 
Then, it walks the filesystem tree and copies +the prealloc extent items to the log tree. + +If it hits the end of a leaf, then it calls btrfs_next_leaf(), which +unlocks the tree and does another search. However, while the filesystem +tree is unlocked, an ordered extent completion may modify the tree. In +particular, it may insert an extent item that overlaps with an extent +item that was already copied to the log tree. + +This may manifest in several ways depending on the exact scenario, +including an EEXIST error that is silently translated to a full sync, +overlapping items in the log tree, or this crash. This particular crash +is triggered by the following sequence of events: + +- Initially, the file has i_size=4k, a regular extent from 0-4k, and a + prealloc extent beyond i_size from 4k-12k. The prealloc extent item is + the last item in its B-tree leaf. +- The file is fsync'd, which copies its inode item and both extent items + to the log tree. +- An xattr is set on the file, which sets the + BTRFS_INODE_COPY_EVERYTHING flag. +- The range 4k-8k in the file is written using direct I/O. i_size is + extended to 8k, but the ordered extent is still in flight. +- The file is fsync'd. Since BTRFS_INODE_COPY_EVERYTHING is set, this + calls copy_inode_items_to_log(), which calls + btrfs_log_prealloc_extents(). +- btrfs_log_prealloc_extents() finds the 4k-12k prealloc extent in the + filesystem tree. Since it starts before i_size, it skips it. Since it + is the last item in its B-tree leaf, it calls btrfs_next_leaf(). +- btrfs_next_leaf() unlocks the path. +- The ordered extent completion runs, which converts the 4k-8k part of + the prealloc extent to written and inserts the remaining prealloc part + from 8k-12k. +- btrfs_next_leaf() does a search and finds the new prealloc extent + 8k-12k. +- btrfs_log_prealloc_extents() copies the 8k-12k prealloc extent into + the log tree. 
Note that it overlaps with the 4k-12k prealloc extent + that was copied to the log tree by the first fsync. +- fsync calls btrfs_log_changed_extents(), which tries to log the 4k-8k + extent that was written. +- This tries to drop the range 4k-8k in the log tree, which requires + adjusting the start of the 4k-12k prealloc extent in the log tree to + 8k. +- btrfs_set_item_key_safe() sees that there is already an extent + starting at 8k in the log tree and calls BUG(). + +Fix this by detecting when we're about to insert an overlapping file +extent item in the log tree and truncating the part that would overlap. + +CC: stable@vger.kernel.org # 6.1+ +Reviewed-by: Filipe Manana +Signed-off-by: Omar Sandoval +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/tree-log.c | 17 +++++++++++------ + 1 file changed, 11 insertions(+), 6 deletions(-) + +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -4856,18 +4856,23 @@ static int btrfs_log_prealloc_extents(st + path->slots[0]++; + continue; + } +- if (!dropped_extents) { +- /* +- * Avoid logging extent items logged in past fsync calls +- * and leading to duplicate keys in the log tree. +- */ ++ /* ++ * Avoid overlapping items in the log tree. The first time we ++ * get here, get rid of everything from a past fsync. After ++ * that, if the current extent starts before the end of the last ++ * extent we copied, truncate the last one. This can happen if ++ * an ordered extent completion modifies the subvolume tree ++ * while btrfs_next_leaf() has the tree unlocked. 
++ */ ++ if (!dropped_extents || key.offset < truncate_offset) { + ret = truncate_inode_items(trans, root->log_root, inode, +- truncate_offset, ++ min(key.offset, truncate_offset), + BTRFS_EXTENT_DATA_KEY); + if (ret) + goto out; + dropped_extents = true; + } ++ truncate_offset = btrfs_file_extent_end(path); + if (ins_nr == 0) + start_slot = slot; + ins_nr++; diff --git a/queue-6.9/btrfs-fix-leak-of-qgroup-extent-records-after-transaction-abort.patch b/queue-6.9/btrfs-fix-leak-of-qgroup-extent-records-after-transaction-abort.patch new file mode 100644 index 00000000000..7c17f238a8e --- /dev/null +++ b/queue-6.9/btrfs-fix-leak-of-qgroup-extent-records-after-transaction-abort.patch @@ -0,0 +1,60 @@ +From fb33eb2ef0d88e75564983ef057b44c5b7e4fded Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Mon, 3 Jun 2024 12:49:08 +0100 +Subject: btrfs: fix leak of qgroup extent records after transaction abort + +From: Filipe Manana + +commit fb33eb2ef0d88e75564983ef057b44c5b7e4fded upstream. + +Qgroup extent records are created when delayed ref heads are created and +then released after accounting extents at btrfs_qgroup_account_extents(), +called during the transaction commit path. + +If a transaction is aborted we free the qgroup records by calling +btrfs_qgroup_destroy_extent_records() at btrfs_destroy_delayed_refs(), +unless we don't have delayed references. We are incorrectly assuming +that no delayed references means we don't have qgroup extent records. + +We can currently have no delayed references because we ran them all +during a transaction commit and the transaction was aborted after that +due to some error in the commit path. + +So fix this by ensuring we call btrfs_qgroup_destroy_extent_records() at +btrfs_destroy_delayed_refs() even if we don't have any delayed references. 
+ +Reported-by: syzbot+0fecc032fa134afd49df@syzkaller.appspotmail.com +Link: https://lore.kernel.org/linux-btrfs/0000000000004e7f980619f91835@google.com/ +Fixes: 81f7eb00ff5b ("btrfs: destroy qgroup extent records on transaction abort") +CC: stable@vger.kernel.org # 6.1+ +Reviewed-by: Josef Bacik +Reviewed-by: Qu Wenruo +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/disk-io.c | 10 +--------- + 1 file changed, 1 insertion(+), 9 deletions(-) + +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -4544,18 +4544,10 @@ static void btrfs_destroy_delayed_refs(s + struct btrfs_fs_info *fs_info) + { + struct rb_node *node; +- struct btrfs_delayed_ref_root *delayed_refs; ++ struct btrfs_delayed_ref_root *delayed_refs = &trans->delayed_refs; + struct btrfs_delayed_ref_node *ref; + +- delayed_refs = &trans->delayed_refs; +- + spin_lock(&delayed_refs->lock); +- if (atomic_read(&delayed_refs->num_entries) == 0) { +- spin_unlock(&delayed_refs->lock); +- btrfs_debug(fs_info, "delayed_refs has NO entry"); +- return; +- } +- + while ((node = rb_first_cached(&delayed_refs->href_root)) != NULL) { + struct btrfs_delayed_ref_head *head; + struct rb_node *n; diff --git a/queue-6.9/btrfs-protect-folio-private-when-attaching-extent-buffer-folios.patch b/queue-6.9/btrfs-protect-folio-private-when-attaching-extent-buffer-folios.patch new file mode 100644 index 00000000000..eefb650e2b2 --- /dev/null +++ b/queue-6.9/btrfs-protect-folio-private-when-attaching-extent-buffer-folios.patch @@ -0,0 +1,279 @@ +From f3a5367c679d31473d3fbb391675055b4792c309 Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Thu, 6 Jun 2024 11:01:51 +0930 +Subject: btrfs: protect folio::private when attaching extent buffer folios +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Qu Wenruo + +commit f3a5367c679d31473d3fbb391675055b4792c309 upstream. 
+ +[BUG] +Since v6.8 there are rare kernel crashes reported by various people, +the common factor is bad page status error messages like this: + + BUG: Bad page state in process kswapd0 pfn:d6e840 + page: refcount:0 mapcount:0 mapping:000000007512f4f2 index:0x2796c2c7c + pfn:0xd6e840 + aops:btree_aops ino:1 + flags: 0x17ffffe0000008(uptodate|node=0|zone=2|lastcpupid=0x3fffff) + page_type: 0xffffffff() + raw: 0017ffffe0000008 dead000000000100 dead000000000122 ffff88826d0be4c0 + raw: 00000002796c2c7c 0000000000000000 00000000ffffffff 0000000000000000 + page dumped because: non-NULL mapping + +[CAUSE] +Commit 09e6cef19c9f ("btrfs: refactor alloc_extent_buffer() to +allocate-then-attach method") changes the sequence when allocating a new +extent buffer. + +Previously we always called grab_extent_buffer() under +mapping->i_private_lock, to ensure the safety on modification on +folio::private (which is a pointer to extent buffer for regular +sectorsize). + +This can lead to the following race: + +Thread A is trying to allocate an extent buffer at bytenr X, with 4 +4K pages, meanwhile thread B is trying to release the page at X + 4K +(the second page of the extent buffer at X). + + Thread A | Thread B +-----------------------------------+------------------------------------- + | btree_release_folio() + | | This is for the page at X + 4K, + | | Not page X. + | | +alloc_extent_buffer() | |- release_extent_buffer() +|- filemap_add_folio() for the | | |- atomic_dec_and_test(eb->refs) +| page at bytenr X (the first | | | +| page). | | | +| Which returned -EEXIST. | | | +| | | | +|- filemap_lock_folio() | | | +| Returned the first page locked. 
| | | +| | | | +|- grab_extent_buffer() | | | +| |- atomic_inc_not_zero() | | | +| | Returned false | | | +| |- folio_detach_private() | | |- folio_detach_private() for X +| |- folio_test_private() | | |- folio_test_private() + | Returned true | | | Returned true + |- folio_put() | |- folio_put() + +Now there are two puts on the same folio at folio X, leading to refcount +underflow of the folio X, and eventually causing the BUG_ON() on the +page->mapping. + +The condition is not that easy to hit: + +- The release must be triggered for the middle page of an eb + If the release is on the same first page of an eb, page lock would kick + in and prevent the race. + +- folio_detach_private() has a very small race window + It's only between folio_test_private() and folio_clear_private(). + +That's exactly when mapping->i_private_lock is used to prevent such race, +and commit 09e6cef19c9f ("btrfs: refactor alloc_extent_buffer() to +allocate-then-attach method") screwed that up. + +At that time, I thought the page lock would kick in as +filemap_release_folio() also requires the page to be locked, but forgot +the filemap_release_folio() only locks one page, not all pages of an +extent buffer. + +[FIX] +Move all the code requiring i_private_lock into +attach_eb_folio_to_filemap(), so that everything is done with proper +lock protection. + +Furthermore to prevent future problems, add an extra +lockdep_assert_locked() to ensure we're holding the proper lock. 
+ +A reproducer that is able to hit the race (takes a few minutes with +instrumented code inserting delays to alloc_extent_buffer()): + + #!/bin/sh + drop_caches () { + while(true); do + echo 3 > /proc/sys/vm/drop_caches + echo 1 > /proc/sys/vm/compact_memory + done + } + + run_tar () { + while(true); do + for x in `seq 1 80` ; do + tar cf /dev/zero /mnt > /dev/null & + done + wait + done + } + + mkfs.btrfs -f -d single -m single /dev/vda + mount -o noatime /dev/vda /mnt + # create 200,000 files, 1K each + ./simoop -n 200000 -E -f 1k /mnt + drop_caches & + (run_tar) + +Reported-by: Linus Torvalds +Link: https://lore.kernel.org/linux-btrfs/CAHk-=wgt362nGfScVOOii8cgKn2LVVHeOvOA7OBwg1OwbuJQcw@mail.gmail.com/ +Reported-by: Mikhail Gavrilov +Link: https://lore.kernel.org/lkml/CABXGCsPktcHQOvKTbPaTwegMExije=Gpgci5NW=hqORo-s7diA@mail.gmail.com/ +Reported-by: Toralf Förster +Link: https://lore.kernel.org/linux-btrfs/e8b3311c-9a75-4903-907f-fc0f7a3fe423@gmx.de/ +Reported-by: syzbot+f80b066392366b4af85e@syzkaller.appspotmail.com +Fixes: 09e6cef19c9f ("btrfs: refactor alloc_extent_buffer() to allocate-then-attach method") +CC: stable@vger.kernel.org # 6.8+ +CC: Chris Mason +Reviewed-by: Filipe Manana +Reviewed-by: Josef Bacik +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/extent_io.c | 60 ++++++++++++++++++++++++++------------------------- + 1 file changed, 31 insertions(+), 29 deletions(-) + +--- a/fs/btrfs/extent_io.c ++++ b/fs/btrfs/extent_io.c +@@ -3662,6 +3662,8 @@ static struct extent_buffer *grab_extent + struct folio *folio = page_folio(page); + struct extent_buffer *exists; + ++ lockdep_assert_held(&page->mapping->i_private_lock); ++ + /* + * For subpage case, we completely rely on radix tree to ensure we + * don't try to insert two ebs for the same bytenr. 
So here we always +@@ -3729,13 +3731,14 @@ static int check_eb_alignment(struct btr + * The caller needs to free the existing folios and retry using the same order. + */ + static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, ++ struct btrfs_subpage *prealloc, + struct extent_buffer **found_eb_ret) + { + + struct btrfs_fs_info *fs_info = eb->fs_info; + struct address_space *mapping = fs_info->btree_inode->i_mapping; + const unsigned long index = eb->start >> PAGE_SHIFT; +- struct folio *existing_folio; ++ struct folio *existing_folio = NULL; + int ret; + + ASSERT(found_eb_ret); +@@ -3747,12 +3750,14 @@ retry: + ret = filemap_add_folio(mapping, eb->folios[i], index + i, + GFP_NOFS | __GFP_NOFAIL); + if (!ret) +- return 0; ++ goto finish; + + existing_folio = filemap_lock_folio(mapping, index + i); + /* The page cache only exists for a very short time, just retry. */ +- if (IS_ERR(existing_folio)) ++ if (IS_ERR(existing_folio)) { ++ existing_folio = NULL; + goto retry; ++ } + + /* For now, we should only have single-page folios for btree inode. */ + ASSERT(folio_nr_pages(existing_folio) == 1); +@@ -3763,14 +3768,13 @@ retry: + return -EAGAIN; + } + +- if (fs_info->nodesize < PAGE_SIZE) { +- /* +- * We're going to reuse the existing page, can drop our page +- * and subpage structure now. +- */ ++finish: ++ spin_lock(&mapping->i_private_lock); ++ if (existing_folio && fs_info->nodesize < PAGE_SIZE) { ++ /* We're going to reuse the existing page, can drop our folio now. */ + __free_page(folio_page(eb->folios[i], 0)); + eb->folios[i] = existing_folio; +- } else { ++ } else if (existing_folio) { + struct extent_buffer *existing_eb; + + existing_eb = grab_extent_buffer(fs_info, +@@ -3778,6 +3782,7 @@ retry: + if (existing_eb) { + /* The extent buffer still exists, we can use it directly. 
*/ + *found_eb_ret = existing_eb; ++ spin_unlock(&mapping->i_private_lock); + folio_unlock(existing_folio); + folio_put(existing_folio); + return 1; +@@ -3786,6 +3791,22 @@ retry: + __free_page(folio_page(eb->folios[i], 0)); + eb->folios[i] = existing_folio; + } ++ eb->folio_size = folio_size(eb->folios[i]); ++ eb->folio_shift = folio_shift(eb->folios[i]); ++ /* Should not fail, as we have preallocated the memory. */ ++ ret = attach_extent_buffer_folio(eb, eb->folios[i], prealloc); ++ ASSERT(!ret); ++ /* ++ * To inform we have an extra eb under allocation, so that ++ * detach_extent_buffer_page() won't release the folio private when the ++ * eb hasn't been inserted into radix tree yet. ++ * ++ * The ref will be decreased when the eb releases the page, in ++ * detach_extent_buffer_page(). Thus needs no special handling in the ++ * error path. ++ */ ++ btrfs_folio_inc_eb_refs(fs_info, eb->folios[i]); ++ spin_unlock(&mapping->i_private_lock); + return 0; + } + +@@ -3797,7 +3818,6 @@ struct extent_buffer *alloc_extent_buffe + int attached = 0; + struct extent_buffer *eb; + struct extent_buffer *existing_eb = NULL; +- struct address_space *mapping = fs_info->btree_inode->i_mapping; + struct btrfs_subpage *prealloc = NULL; + u64 lockdep_owner = owner_root; + bool page_contig = true; +@@ -3863,7 +3883,7 @@ reallocate: + for (int i = 0; i < num_folios; i++) { + struct folio *folio; + +- ret = attach_eb_folio_to_filemap(eb, i, &existing_eb); ++ ret = attach_eb_folio_to_filemap(eb, i, prealloc, &existing_eb); + if (ret > 0) { + ASSERT(existing_eb); + goto out; +@@ -3900,24 +3920,6 @@ reallocate: + * and free the allocated page. 
+ */ + folio = eb->folios[i]; +- eb->folio_size = folio_size(folio); +- eb->folio_shift = folio_shift(folio); +- spin_lock(&mapping->i_private_lock); +- /* Should not fail, as we have preallocated the memory */ +- ret = attach_extent_buffer_folio(eb, folio, prealloc); +- ASSERT(!ret); +- /* +- * To inform we have extra eb under allocation, so that +- * detach_extent_buffer_page() won't release the folio private +- * when the eb hasn't yet been inserted into radix tree. +- * +- * The ref will be decreased when the eb released the page, in +- * detach_extent_buffer_page(). +- * Thus needs no special handling in error path. +- */ +- btrfs_folio_inc_eb_refs(fs_info, folio); +- spin_unlock(&mapping->i_private_lock); +- + WARN_ON(btrfs_folio_test_dirty(fs_info, folio, eb->start, eb->len)); + + /* diff --git a/queue-6.9/btrfs-qgroup-fix-qgroup-id-collision-across-mounts.patch b/queue-6.9/btrfs-qgroup-fix-qgroup-id-collision-across-mounts.patch new file mode 100644 index 00000000000..479d3164fc5 --- /dev/null +++ b/queue-6.9/btrfs-qgroup-fix-qgroup-id-collision-across-mounts.patch @@ -0,0 +1,66 @@ +From 2b8aa78cf1279ec5e418baa26bfed5df682568d8 Mon Sep 17 00:00:00 2001 +From: Boris Burkov +Date: Thu, 9 May 2024 15:34:40 -0700 +Subject: btrfs: qgroup: fix qgroup id collision across mounts + +From: Boris Burkov + +commit 2b8aa78cf1279ec5e418baa26bfed5df682568d8 upstream. + +If we delete subvolumes whose ID is the largest in the filesystem, then +unmount and mount again, then btrfs_init_root_free_objectid on the +tree_root will select a subvolid smaller than that one and thus allow +reusing it. + +If we are also using qgroups (and particularly squotas) it is possible +to delete the subvol without deleting the qgroup. In that case, we will +be able to create a new subvol whose id already has a level 0 qgroup. +This will result in re-using that qgroup which would then lead to +incorrect accounting. 
+ +Fixes: 6ed05643ddb1 ("btrfs: create qgroup earlier in snapshot creation") +CC: stable@vger.kernel.org # 6.7+ +Reviewed-by: Qu Wenruo +Signed-off-by: Boris Burkov +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/qgroup.c | 20 ++++++++++++++++++++ + 1 file changed, 20 insertions(+) + +--- a/fs/btrfs/qgroup.c ++++ b/fs/btrfs/qgroup.c +@@ -468,6 +468,7 @@ int btrfs_read_qgroup_config(struct btrf + } + if (!qgroup) { + struct btrfs_qgroup *prealloc; ++ struct btrfs_root *tree_root = fs_info->tree_root; + + prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL); + if (!prealloc) { +@@ -475,6 +476,25 @@ int btrfs_read_qgroup_config(struct btrf + goto out; + } + qgroup = add_qgroup_rb(fs_info, prealloc, found_key.offset); ++ /* ++ * If a qgroup exists for a subvolume ID, it is possible ++ * that subvolume has been deleted, in which case ++ * re-using that ID would lead to incorrect accounting. ++ * ++ * Ensure that we skip any such subvol ids. ++ * ++ * We don't need to lock because this is only called ++ * during mount before we start doing things like creating ++ * subvolumes. ++ */ ++ if (is_fstree(qgroup->qgroupid) && ++ qgroup->qgroupid > tree_root->free_objectid) ++ /* ++ * Don't need to check against BTRFS_LAST_FREE_OBJECTID, ++ * as it will get checked on the next call to ++ * btrfs_get_free_objectid. 
++ */ ++ tree_root->free_objectid = qgroup->qgroupid + 1; + } + ret = btrfs_sysfs_add_one_qgroup(fs_info, qgroup); + if (ret < 0) diff --git a/queue-6.9/btrfs-qgroup-update-rescan-message-levels-and-error-codes.patch b/queue-6.9/btrfs-qgroup-update-rescan-message-levels-and-error-codes.patch new file mode 100644 index 00000000000..c8f28e4ca0c --- /dev/null +++ b/queue-6.9/btrfs-qgroup-update-rescan-message-levels-and-error-codes.patch @@ -0,0 +1,65 @@ +From 1fa7603d569b9e738e9581937ba8725cd7d39b48 Mon Sep 17 00:00:00 2001 +From: David Sterba +Date: Thu, 2 May 2024 22:45:58 +0200 +Subject: btrfs: qgroup: update rescan message levels and error codes + +From: David Sterba + +commit 1fa7603d569b9e738e9581937ba8725cd7d39b48 upstream. + +On filesystems without enabled quotas there's still a warning message in +the logs when rescan is called. In that case it's not a problem that +should be reported, rescan can be called unconditionally. Change the +error code to ENOTCONN which is used for 'quotas not enabled' elsewhere. + +Remove message (also a warning) when rescan is called during an ongoing +rescan, this brings no useful information and the error code is +sufficient. + +Change message levels to debug for now, they can be removed eventually. 
+ +CC: stable@vger.kernel.org # 6.6+ +Reviewed-by: Boris Burkov +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/qgroup.c | 12 +++++------- + 1 file changed, 5 insertions(+), 7 deletions(-) + +--- a/fs/btrfs/qgroup.c ++++ b/fs/btrfs/qgroup.c +@@ -3826,14 +3826,14 @@ qgroup_rescan_init(struct btrfs_fs_info + /* we're resuming qgroup rescan at mount time */ + if (!(fs_info->qgroup_flags & + BTRFS_QGROUP_STATUS_FLAG_RESCAN)) { +- btrfs_warn(fs_info, ++ btrfs_debug(fs_info, + "qgroup rescan init failed, qgroup rescan is not queued"); + ret = -EINVAL; + } else if (!(fs_info->qgroup_flags & + BTRFS_QGROUP_STATUS_FLAG_ON)) { +- btrfs_warn(fs_info, ++ btrfs_debug(fs_info, + "qgroup rescan init failed, qgroup is not enabled"); +- ret = -EINVAL; ++ ret = -ENOTCONN; + } + + if (ret) +@@ -3844,14 +3844,12 @@ qgroup_rescan_init(struct btrfs_fs_info + + if (init_flags) { + if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) { +- btrfs_warn(fs_info, +- "qgroup rescan is already in progress"); + ret = -EINPROGRESS; + } else if (!(fs_info->qgroup_flags & + BTRFS_QGROUP_STATUS_FLAG_ON)) { +- btrfs_warn(fs_info, ++ btrfs_debug(fs_info, + "qgroup rescan init failed, qgroup is not enabled"); +- ret = -EINVAL; ++ ret = -ENOTCONN; + } else if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_DISABLED) { + /* Quota disable is in progress */ + ret = -EBUSY; diff --git a/queue-6.9/btrfs-re-introduce-norecovery-mount-option.patch b/queue-6.9/btrfs-re-introduce-norecovery-mount-option.patch new file mode 100644 index 00000000000..be11a394a44 --- /dev/null +++ b/queue-6.9/btrfs-re-introduce-norecovery-mount-option.patch @@ -0,0 +1,68 @@ +From 440861b1a03c72cc7be4a307e178dcaa6894479b Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Tue, 21 May 2024 19:27:31 +0930 +Subject: btrfs: re-introduce 'norecovery' mount option + +From: Qu Wenruo + +commit 440861b1a03c72cc7be4a307e178dcaa6894479b upstream. 
+ +Although 'norecovery' mount option was marked as deprecated for a long +time and a warning message was printed during the deprecation window, +it's still actively utilized by several projects that need a safer way +to mount a btrfs without any writes. + +Furthermore this 'norecovery' mount option is supported by other major +filesystems, which makes it less clear what's our motivation to remove +it. + +Re-introduce the 'norecovery' mount option, and output a message to recommend +'rescue=nologreplay' option. + +Link: https://lore.kernel.org/linux-btrfs/ZkxZT0J-z0GYvfy8@gardel-login/#t +Link: https://github.com/systemd/systemd/pull/32892 +Link: https://bugzilla.suse.com/show_bug.cgi?id=1222429 +Reported-by: Lennart Poettering +Reported-by: Jiri Slaby +Fixes: a1912f712188 ("btrfs: remove code for inode_cache and recovery mount options") +CC: stable@vger.kernel.org # 6.8+ +Reviewed-by: Johannes Thumshirn +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/super.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +--- a/fs/btrfs/super.c ++++ b/fs/btrfs/super.c +@@ -119,6 +119,7 @@ enum { + Opt_thread_pool, + Opt_treelog, + Opt_user_subvol_rm_allowed, ++ Opt_norecovery, + + /* Rescue options */ + Opt_rescue, +@@ -245,6 +246,8 @@ static const struct fs_parameter_spec bt + __fsparam(NULL, "nologreplay", Opt_nologreplay, fs_param_deprecated, NULL), + /* Deprecated, with alias rescue=usebackuproot */ + __fsparam(NULL, "usebackuproot", Opt_usebackuproot, fs_param_deprecated, NULL), ++ /* For compatibility only, alias for "rescue=nologreplay". */ ++ fsparam_flag("norecovery", Opt_norecovery), + + /* Debugging options. 
*/ + fsparam_flag_no("enospc_debug", Opt_enospc_debug), +@@ -438,6 +441,11 @@ static int btrfs_parse_param(struct fs_c + "'nologreplay' is deprecated, use 'rescue=nologreplay' instead"); + btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY); + break; ++ case Opt_norecovery: ++ btrfs_info(NULL, ++"'norecovery' is for compatibility only, recommended to use 'rescue=nologreplay'"); ++ btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY); ++ break; + case Opt_flushoncommit: + if (result.negated) + btrfs_clear_opt(ctx->mount_opt, FLUSHONCOMMIT); diff --git a/queue-6.9/cifs-fix-creating-sockets-when-using-sfu-mount-options.patch b/queue-6.9/cifs-fix-creating-sockets-when-using-sfu-mount-options.patch new file mode 100644 index 00000000000..5a4e503c4ec --- /dev/null +++ b/queue-6.9/cifs-fix-creating-sockets-when-using-sfu-mount-options.patch @@ -0,0 +1,65 @@ +From 518549c120e671c4906f77d1802b97e9b23f673a Mon Sep 17 00:00:00 2001 +From: Steve French +Date: Wed, 29 May 2024 18:16:56 -0500 +Subject: cifs: fix creating sockets when using sfu mount options + +From: Steve French + +commit 518549c120e671c4906f77d1802b97e9b23f673a upstream. + +When running fstest generic/423 with sfu mount option, it +was being skipped due to inability to create sockets: + + generic/423 [not run] cifs does not support mknod/mkfifo + +which can also be easily reproduced with their af_unix tool: + + ./src/af_unix /mnt1/socket-two bind: Operation not permitted + +Fix sfu mount option to allow creating and reporting sockets. 
+ +Cc: stable@vger.kernel.org +Signed-off-by: Steve French +Signed-off-by: Greg Kroah-Hartman +--- + fs/smb/client/cifspdu.h | 2 +- + fs/smb/client/inode.c | 4 ++++ + fs/smb/client/smb2ops.c | 3 +++ + 3 files changed, 8 insertions(+), 1 deletion(-) + +--- a/fs/smb/client/cifspdu.h ++++ b/fs/smb/client/cifspdu.h +@@ -2574,7 +2574,7 @@ typedef struct { + + + struct win_dev { +- unsigned char type[8]; /* IntxCHR or IntxBLK or LnxFIFO*/ ++ unsigned char type[8]; /* IntxCHR or IntxBLK or LnxFIFO or LnxSOCK */ + __le64 major; + __le64 minor; + } __attribute__((packed)); +--- a/fs/smb/client/inode.c ++++ b/fs/smb/client/inode.c +@@ -591,6 +591,10 @@ cifs_sfu_type(struct cifs_fattr *fattr, + mnr = le64_to_cpu(*(__le64 *)(pbuf+16)); + fattr->cf_rdev = MKDEV(mjr, mnr); + } ++ } else if (memcmp("LnxSOCK", pbuf, 8) == 0) { ++ cifs_dbg(FYI, "Socket\n"); ++ fattr->cf_mode |= S_IFSOCK; ++ fattr->cf_dtype = DT_SOCK; + } else if (memcmp("IntxLNK", pbuf, 7) == 0) { + cifs_dbg(FYI, "Symlink\n"); + fattr->cf_mode |= S_IFLNK; +--- a/fs/smb/client/smb2ops.c ++++ b/fs/smb/client/smb2ops.c +@@ -4996,6 +4996,9 @@ static int __cifs_sfu_make_node(unsigned + pdev.major = cpu_to_le64(MAJOR(dev)); + pdev.minor = cpu_to_le64(MINOR(dev)); + break; ++ case S_IFSOCK: ++ strscpy(pdev.type, "LnxSOCK"); ++ break; + case S_IFIFO: + strscpy(pdev.type, "LnxFIFO"); + break; diff --git a/queue-6.9/edac-amd64-convert-pcibios_-return-codes-to-errnos.patch b/queue-6.9/edac-amd64-convert-pcibios_-return-codes-to-errnos.patch new file mode 100644 index 00000000000..6052e098120 --- /dev/null +++ b/queue-6.9/edac-amd64-convert-pcibios_-return-codes-to-errnos.patch @@ -0,0 +1,65 @@ +From 3ec8ebd8a5b782d56347ae884de880af26f93996 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= +Date: Mon, 27 May 2024 16:22:34 +0300 +Subject: EDAC/amd64: Convert PCIBIOS_* return codes to errnos +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Ilpo Järvinen + 
+commit 3ec8ebd8a5b782d56347ae884de880af26f93996 upstream. + +gpu_get_node_map() uses pci_read_config_dword() that returns PCIBIOS_* +codes. The return code is then returned all the way into the module +init function amd64_edac_init() that returns it as is. The module init +functions, however, should return normal errnos. + +Convert PCIBIOS_* return codes using pcibios_err_to_errno() into normal +errno before returning it from gpu_get_node_map(). + +For consistency, convert also the other similar cases which return +PCIBIOS_* codes even if they do not have any bugs at the moment. + +Fixes: 4251566ebc1c ("EDAC/amd64: Cache and use GPU node map") +Signed-off-by: Ilpo Järvinen +Signed-off-by: Borislav Petkov (AMD) +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/20240527132236.13875-1-ilpo.jarvinen@linux.intel.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/edac/amd64_edac.c | 8 +++++--- + 1 file changed, 5 insertions(+), 3 deletions(-) + +--- a/drivers/edac/amd64_edac.c ++++ b/drivers/edac/amd64_edac.c +@@ -81,7 +81,7 @@ int __amd64_read_pci_cfg_dword(struct pc + amd64_warn("%s: error reading F%dx%03x.\n", + func, PCI_FUNC(pdev->devfn), offset); + +- return err; ++ return pcibios_err_to_errno(err); + } + + int __amd64_write_pci_cfg_dword(struct pci_dev *pdev, int offset, +@@ -94,7 +94,7 @@ int __amd64_write_pci_cfg_dword(struct p + amd64_warn("%s: error writing to F%dx%03x.\n", + func, PCI_FUNC(pdev->devfn), offset); + +- return err; ++ return pcibios_err_to_errno(err); + } + + /* +@@ -1025,8 +1025,10 @@ static int gpu_get_node_map(struct amd64 + } + + ret = pci_read_config_dword(pdev, REG_LOCAL_NODE_TYPE_MAP, &tmp); +- if (ret) ++ if (ret) { ++ ret = pcibios_err_to_errno(ret); + goto out; ++ } + + gpu_node_map.node_count = FIELD_GET(LNTM_NODE_COUNT, tmp); + gpu_node_map.base_node_id = FIELD_GET(LNTM_BASE_NODE_ID, tmp); diff --git a/queue-6.9/edac-igen6-convert-pcibios_-return-codes-to-errnos.patch
b/queue-6.9/edac-igen6-convert-pcibios_-return-codes-to-errnos.patch new file mode 100644 index 00000000000..fc919d89b52 --- /dev/null +++ b/queue-6.9/edac-igen6-convert-pcibios_-return-codes-to-errnos.patch @@ -0,0 +1,51 @@ +From f8367a74aebf88dc8b58a0db6a6c90b4cb8fc9d3 Mon Sep 17 00:00:00 2001 +From: =?UTF-8?q?Ilpo=20J=C3=A4rvinen?= +Date: Mon, 27 May 2024 16:22:35 +0300 +Subject: EDAC/igen6: Convert PCIBIOS_* return codes to errnos +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Ilpo Järvinen + +commit f8367a74aebf88dc8b58a0db6a6c90b4cb8fc9d3 upstream. + +errcmd_enable_error_reporting() uses pci_{read,write}_config_word() +that return PCIBIOS_* codes. The return code is then returned all the +way into the probe function igen6_probe() that returns it as is. The +probe functions, however, should return normal errnos. + +Convert PCIBIOS_* return codes using pcibios_err_to_errno() into normal +errno before returning it from errcmd_enable_error_reporting().
+ +Fixes: 10590a9d4f23 ("EDAC/igen6: Add EDAC driver for Intel client SoCs using IBECC") +Signed-off-by: Ilpo Järvinen +Signed-off-by: Borislav Petkov (AMD) +Reviewed-by: Qiuxu Zhuo +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/20240527132236.13875-2-ilpo.jarvinen@linux.intel.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/edac/igen6_edac.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/drivers/edac/igen6_edac.c ++++ b/drivers/edac/igen6_edac.c +@@ -800,7 +800,7 @@ static int errcmd_enable_error_reporting + + rc = pci_read_config_word(imc->pdev, ERRCMD_OFFSET, &errcmd); + if (rc) +- return rc; ++ return pcibios_err_to_errno(rc); + + if (enable) + errcmd |= ERRCMD_CE | ERRSTS_UE; +@@ -809,7 +809,7 @@ static int errcmd_enable_error_reporting + + rc = pci_write_config_word(imc->pdev, ERRCMD_OFFSET, errcmd); + if (rc) +- return rc; ++ return pcibios_err_to_errno(rc); + + return 0; + } diff --git a/queue-6.9/eventfs-fix-a-possible-null-pointer-dereference-in-eventfs_find_events.patch b/queue-6.9/eventfs-fix-a-possible-null-pointer-dereference-in-eventfs_find_events.patch new file mode 100644 index 00000000000..39b1c58c51f --- /dev/null +++ b/queue-6.9/eventfs-fix-a-possible-null-pointer-dereference-in-eventfs_find_events.patch @@ -0,0 +1,42 @@ +From d4e9a968738bf66d3bb852dd5588d4c7afd6d7f4 Mon Sep 17 00:00:00 2001 +From: Hao Ge +Date: Mon, 13 May 2024 13:33:38 +0800 +Subject: eventfs: Fix a possible null pointer dereference in eventfs_find_events() + +From: Hao Ge + +commit d4e9a968738bf66d3bb852dd5588d4c7afd6d7f4 upstream. + +In function eventfs_find_events, there is a potential null pointer +that may be caused by calling update_events_attr which will perform +some operations on the members of the ei struct when ei is NULL. + +Hence, when ei->is_freed is set, return NULL directly.
+ +Link: https://lore.kernel.org/linux-trace-kernel/20240513053338.63017-1-hao.ge@linux.dev + +Cc: stable@vger.kernel.org +Fixes: 8186fff7ab64 ("tracefs/eventfs: Use root and instance inodes as default ownership") +Signed-off-by: Hao Ge +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Greg Kroah-Hartman +--- + fs/tracefs/event_inode.c | 7 +++---- + 1 file changed, 3 insertions(+), 4 deletions(-) + +--- a/fs/tracefs/event_inode.c ++++ b/fs/tracefs/event_inode.c +@@ -345,10 +345,9 @@ static struct eventfs_inode *eventfs_fin + * If the ei is being freed, the ownership of the children + * doesn't matter. + */ +- if (ei->is_freed) { +- ei = NULL; +- break; +- } ++ if (ei->is_freed) ++ return NULL; ++ + // Walk upwards until you find the events inode + } while (!ei->is_events); + diff --git a/queue-6.9/eventfs-keep-the-directories-from-having-the-same-inode-number-as-files.patch b/queue-6.9/eventfs-keep-the-directories-from-having-the-same-inode-number-as-files.patch new file mode 100644 index 00000000000..f297499f8b9 --- /dev/null +++ b/queue-6.9/eventfs-keep-the-directories-from-having-the-same-inode-number-as-files.patch @@ -0,0 +1,44 @@ +From 8898e7f288c47d450a3cf1511c791a03550c0789 Mon Sep 17 00:00:00 2001 +From: "Steven Rostedt (Google)" +Date: Thu, 23 May 2024 01:14:26 -0400 +Subject: eventfs: Keep the directories from having the same inode number as files + +From: Steven Rostedt (Google) + +commit 8898e7f288c47d450a3cf1511c791a03550c0789 upstream. + +The directories require unique inode numbers but all the eventfs files +have the same inode number. Prevent the directories from having the same +inode numbers as the files as that can confuse some tooling. 
+ +Link: https://lore.kernel.org/linux-trace-kernel/20240523051539.428826685@goodmis.org + +Cc: stable@vger.kernel.org +Cc: Masami Hiramatsu +Cc: Mark Rutland +Cc: Mathieu Desnoyers +Cc: Andrew Morton +Cc: Masahiro Yamada +Fixes: 834bf76add3e6 ("eventfs: Save directory inodes in the eventfs_inode structure") +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Greg Kroah-Hartman +--- + fs/tracefs/event_inode.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +--- a/fs/tracefs/event_inode.c ++++ b/fs/tracefs/event_inode.c +@@ -50,8 +50,12 @@ static struct eventfs_root_inode *get_ro + /* Just try to make something consistent and unique */ + static int eventfs_dir_ino(struct eventfs_inode *ei) + { +- if (!ei->ino) ++ if (!ei->ino) { + ei->ino = get_next_ino(); ++ /* Must not have the file inode number */ ++ if (ei->ino == EVENTFS_FILE_INODE_INO) ++ ei->ino = get_next_ino(); ++ } + + return ei->ino; + } diff --git a/queue-6.9/nfs-fix-read_plus-when-server-doesn-t-support-op_read_plus.patch b/queue-6.9/nfs-fix-read_plus-when-server-doesn-t-support-op_read_plus.patch new file mode 100644 index 00000000000..f56bf80b486 --- /dev/null +++ b/queue-6.9/nfs-fix-read_plus-when-server-doesn-t-support-op_read_plus.patch @@ -0,0 +1,41 @@ +From f06d1b10cb016d5aaecdb1804fefca025387bd10 Mon Sep 17 00:00:00 2001 +From: Anna Schumaker +Date: Thu, 25 Apr 2024 16:24:29 -0400 +Subject: NFS: Fix READ_PLUS when server doesn't support OP_READ_PLUS + +From: Anna Schumaker + +commit f06d1b10cb016d5aaecdb1804fefca025387bd10 upstream. + +Olga showed me a case where the client was sending multiple READ_PLUS +calls to the server in parallel, and the server replied +NFS4ERR_OPNOTSUPP to each. The client would fall back to READ for the +first reply, but fail to retry the other calls. + +I fix this by removing the test for NFS_CAP_READ_PLUS in +nfs4_read_plus_not_supported(). 
This allows us to reschedule any +READ_PLUS call that has a NFS4ERR_OPNOTSUPP return value, even after the +capability has been cleared. + +Reported-by: Olga Kornievskaia +Fixes: c567552612ec ("NFS: Add READ_PLUS data segment support") +Cc: stable@vger.kernel.org # v5.10+ +Signed-off-by: Anna Schumaker +Reviewed-by: Benjamin Coddington +Signed-off-by: Trond Myklebust +Signed-off-by: Greg Kroah-Hartman +--- + fs/nfs/nfs4proc.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/nfs/nfs4proc.c ++++ b/fs/nfs/nfs4proc.c +@@ -5456,7 +5456,7 @@ static bool nfs4_read_plus_not_supported + struct rpc_message *msg = &task->tk_msg; + + if (msg->rpc_proc == &nfs4_procedures[NFSPROC4_CLNT_READ_PLUS] && +- server->caps & NFS_CAP_READ_PLUS && task->tk_status == -ENOTSUPP) { ++ task->tk_status == -ENOTSUPP) { + server->caps &= ~NFS_CAP_READ_PLUS; + msg->rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_READ]; + rpc_restart_call_prepare(task); diff --git a/queue-6.9/nfs-fix-undefined-behavior-in-nfs_block_bits.patch b/queue-6.9/nfs-fix-undefined-behavior-in-nfs_block_bits.patch new file mode 100644 index 00000000000..4afc984e286 --- /dev/null +++ b/queue-6.9/nfs-fix-undefined-behavior-in-nfs_block_bits.patch @@ -0,0 +1,38 @@ +From 3c0a2e0b0ae661457c8505fecc7be5501aa7a715 Mon Sep 17 00:00:00 2001 +From: Sergey Shtylyov +Date: Fri, 10 May 2024 23:24:04 +0300 +Subject: nfs: fix undefined behavior in nfs_block_bits() + +From: Sergey Shtylyov + +commit 3c0a2e0b0ae661457c8505fecc7be5501aa7a715 upstream. + +Shifting *signed int* typed constant 1 left by 31 bits causes undefined +behavior. Specify the correct *unsigned long* type by using 1UL instead. + +Found by Linux Verification Center (linuxtesting.org) with the Svace static +analysis tool. 
+ +Cc: stable@vger.kernel.org +Signed-off-by: Sergey Shtylyov +Reviewed-by: Benjamin Coddington +Signed-off-by: Trond Myklebust +Signed-off-by: Greg Kroah-Hartman +--- + fs/nfs/internal.h | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/fs/nfs/internal.h ++++ b/fs/nfs/internal.h +@@ -710,9 +710,9 @@ unsigned long nfs_block_bits(unsigned lo + if ((bsize & (bsize - 1)) || nrbitsp) { + unsigned char nrbits; + +- for (nrbits = 31; nrbits && !(bsize & (1 << nrbits)); nrbits--) ++ for (nrbits = 31; nrbits && !(bsize & (1UL << nrbits)); nrbits--) + ; +- bsize = 1 << nrbits; ++ bsize = 1UL << nrbits; + if (nrbitsp) + *nrbitsp = nrbits; + } diff --git a/queue-6.9/nilfs2-fix-nilfs_empty_dir-misjudgment-and-long-loop-on-i-o-errors.patch b/queue-6.9/nilfs2-fix-nilfs_empty_dir-misjudgment-and-long-loop-on-i-o-errors.patch new file mode 100644 index 00000000000..68c4f625af5 --- /dev/null +++ b/queue-6.9/nilfs2-fix-nilfs_empty_dir-misjudgment-and-long-loop-on-i-o-errors.patch @@ -0,0 +1,46 @@ +From 7373a51e7998b508af7136530f3a997b286ce81c Mon Sep 17 00:00:00 2001 +From: Ryusuke Konishi +Date: Tue, 4 Jun 2024 22:42:55 +0900 +Subject: nilfs2: fix nilfs_empty_dir() misjudgment and long loop on I/O errors + +From: Ryusuke Konishi + +commit 7373a51e7998b508af7136530f3a997b286ce81c upstream. + +The error handling in nilfs_empty_dir() when a directory folio/page read +fails is incorrect, as in the old ext2 implementation, and if the +folio/page cannot be read or nilfs_check_folio() fails, it will falsely +determine the directory as empty and corrupt the file system. + +In addition, since nilfs_empty_dir() does not immediately return on a +failed folio/page read, but continues to loop, this can cause a long loop +with I/O if i_size of the directory's inode is also corrupted, causing the +log writer thread to wait and hang, as reported by syzbot. 
+ +Fix these issues by making nilfs_empty_dir() immediately return a false +value (0) if it fails to get a directory folio/page. + +Link: https://lkml.kernel.org/r/20240604134255.7165-1-konishi.ryusuke@gmail.com +Signed-off-by: Ryusuke Konishi +Reported-by: syzbot+c8166c541d3971bf6c87@syzkaller.appspotmail.com +Closes: https://syzkaller.appspot.com/bug?extid=c8166c541d3971bf6c87 +Fixes: 2ba466d74ed7 ("nilfs2: directory entry operations") +Tested-by: Ryusuke Konishi +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + fs/nilfs2/dir.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/nilfs2/dir.c ++++ b/fs/nilfs2/dir.c +@@ -608,7 +608,7 @@ int nilfs_empty_dir(struct inode *inode) + + kaddr = nilfs_get_folio(inode, i, &folio); + if (IS_ERR(kaddr)) +- continue; ++ return 0; + + de = (struct nilfs_dir_entry *)kaddr; + kaddr += nilfs_last_byte(inode, i) - NILFS_DIR_REC_LEN(1); diff --git a/queue-6.9/nilfs2-fix-potential-kernel-bug-due-to-lack-of-writeback-flag-waiting.patch b/queue-6.9/nilfs2-fix-potential-kernel-bug-due-to-lack-of-writeback-flag-waiting.patch new file mode 100644 index 00000000000..aa5a7ffdeb7 --- /dev/null +++ b/queue-6.9/nilfs2-fix-potential-kernel-bug-due-to-lack-of-writeback-flag-waiting.patch @@ -0,0 +1,76 @@ +From a4ca369ca221bb7e06c725792ac107f0e48e82e7 Mon Sep 17 00:00:00 2001 +From: Ryusuke Konishi +Date: Thu, 30 May 2024 23:15:56 +0900 +Subject: nilfs2: fix potential kernel bug due to lack of writeback flag waiting + +From: Ryusuke Konishi + +commit a4ca369ca221bb7e06c725792ac107f0e48e82e7 upstream. + +Destructive writes to a block device on which nilfs2 is mounted can cause +a kernel bug in the folio/page writeback start routine or writeback end +routine (__folio_start_writeback in the log below): + + kernel BUG at mm/page-writeback.c:3070! + Oops: invalid opcode: 0000 [#1] PREEMPT SMP KASAN PTI + ... 
+ RIP: 0010:__folio_start_writeback+0xbaa/0x10e0 + Code: 25 ff 0f 00 00 0f 84 18 01 00 00 e8 40 ca c6 ff e9 17 f6 ff ff + e8 36 ca c6 ff 4c 89 f7 48 c7 c6 80 c0 12 84 e8 e7 b3 0f 00 90 <0f> + 0b e8 1f ca c6 ff 4c 89 f7 48 c7 c6 a0 c6 12 84 e8 d0 b3 0f 00 + ... + Call Trace: + + nilfs_segctor_do_construct+0x4654/0x69d0 [nilfs2] + nilfs_segctor_construct+0x181/0x6b0 [nilfs2] + nilfs_segctor_thread+0x548/0x11c0 [nilfs2] + kthread+0x2f0/0x390 + ret_from_fork+0x4b/0x80 + ret_from_fork_asm+0x1a/0x30 + + +This is because when the log writer starts a writeback for segment summary +blocks or a super root block that use the backing device's page cache, it +does not wait for the ongoing folio/page writeback, resulting in an +inconsistent writeback state. + +Fix this issue by waiting for ongoing writebacks when putting +folios/pages on the backing device into writeback state. + +Link: https://lkml.kernel.org/r/20240530141556.4411-1-konishi.ryusuke@gmail.com +Fixes: 9ff05123e3bf ("nilfs2: segment constructor") +Signed-off-by: Ryusuke Konishi +Tested-by: Ryusuke Konishi +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + fs/nilfs2/segment.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/fs/nilfs2/segment.c ++++ b/fs/nilfs2/segment.c +@@ -1652,6 +1652,7 @@ static void nilfs_segctor_prepare_write( + if (bh->b_folio != bd_folio) { + if (bd_folio) { + folio_lock(bd_folio); ++ folio_wait_writeback(bd_folio); + folio_clear_dirty_for_io(bd_folio); + folio_start_writeback(bd_folio); + folio_unlock(bd_folio); +@@ -1665,6 +1666,7 @@ static void nilfs_segctor_prepare_write( + if (bh == segbuf->sb_super_root) { + if (bh->b_folio != bd_folio) { + folio_lock(bd_folio); ++ folio_wait_writeback(bd_folio); + folio_clear_dirty_for_io(bd_folio); + folio_start_writeback(bd_folio); + folio_unlock(bd_folio); +@@ -1681,6 +1683,7 @@ static void nilfs_segctor_prepare_write( + } + if (bd_folio) { + folio_lock(bd_folio); ++ folio_wait_writeback(bd_folio); + 
folio_clear_dirty_for_io(bd_folio); + folio_start_writeback(bd_folio); + folio_unlock(bd_folio); diff --git a/queue-6.9/powerpc-64-bpf-fix-tail-calls-for-pcrel-addressing.patch b/queue-6.9/powerpc-64-bpf-fix-tail-calls-for-pcrel-addressing.patch new file mode 100644 index 00000000000..a710ef76bb8 --- /dev/null +++ b/queue-6.9/powerpc-64-bpf-fix-tail-calls-for-pcrel-addressing.patch @@ -0,0 +1,109 @@ +From 2ecfe59cd7de1f202e9af2516a61fbbf93d0bd4d Mon Sep 17 00:00:00 2001 +From: Hari Bathini +Date: Thu, 2 May 2024 23:02:04 +0530 +Subject: powerpc/64/bpf: fix tail calls for PCREL addressing + +From: Hari Bathini + +commit 2ecfe59cd7de1f202e9af2516a61fbbf93d0bd4d upstream. + +With PCREL addressing, there is no kernel TOC. So, it is not setup in +prologue when PCREL addressing is used. But the number of instructions +to skip on a tail call was not adjusted accordingly. That resulted in +not so obvious failures while using tailcalls. 'tailcalls' selftest +crashed the system with the below call trace: + + bpf_test_run+0xe8/0x3cc (unreliable) + bpf_prog_test_run_skb+0x348/0x778 + __sys_bpf+0xb04/0x2b00 + sys_bpf+0x28/0x38 + system_call_exception+0x168/0x340 + system_call_vectored_common+0x15c/0x2ec + +Also, as bpf programs are always module addresses and a bpf helper in +general is a core kernel text address, using PC relative addressing +often fails with "out of range of pcrel address" error. Switch to +using kernel base for relative addressing to handle this better. 
+ +Fixes: 7e3a68be42e1 ("powerpc/64: vmlinux support building with PCREL addresing") +Cc: stable@vger.kernel.org # v6.4+ +Signed-off-by: Hari Bathini +Signed-off-by: Michael Ellerman +Link: https://msgid.link/20240502173205.142794-1-hbathini@linux.ibm.com +Signed-off-by: Greg Kroah-Hartman +--- + arch/powerpc/net/bpf_jit_comp64.c | 30 ++++++++++++++++-------------- + 1 file changed, 16 insertions(+), 14 deletions(-) + +--- a/arch/powerpc/net/bpf_jit_comp64.c ++++ b/arch/powerpc/net/bpf_jit_comp64.c +@@ -202,7 +202,8 @@ void bpf_jit_build_epilogue(u32 *image, + EMIT(PPC_RAW_BLR()); + } + +-static int bpf_jit_emit_func_call_hlp(u32 *image, struct codegen_context *ctx, u64 func) ++static int ++bpf_jit_emit_func_call_hlp(u32 *image, u32 *fimage, struct codegen_context *ctx, u64 func) + { + unsigned long func_addr = func ? ppc_function_entry((void *)func) : 0; + long reladdr; +@@ -211,19 +212,20 @@ static int bpf_jit_emit_func_call_hlp(u3 + return -EINVAL; + + if (IS_ENABLED(CONFIG_PPC_KERNEL_PCREL)) { +- reladdr = func_addr - CTX_NIA(ctx); ++ reladdr = func_addr - local_paca->kernelbase; + + if (reladdr >= (long)SZ_8G || reladdr < -(long)SZ_8G) { +- pr_err("eBPF: address of %ps out of range of pcrel address.\n", +- (void *)func); ++ pr_err("eBPF: address of %ps out of range of 34-bit relative address.\n", ++ (void *)func); + return -ERANGE; + } +- /* pla r12,addr */ +- EMIT(PPC_PREFIX_MLS | __PPC_PRFX_R(1) | IMM_H18(reladdr)); +- EMIT(PPC_INST_PADDI | ___PPC_RT(_R12) | IMM_L(reladdr)); +- EMIT(PPC_RAW_MTCTR(_R12)); +- EMIT(PPC_RAW_BCTR()); +- ++ EMIT(PPC_RAW_LD(_R12, _R13, offsetof(struct paca_struct, kernelbase))); ++ /* Align for subsequent prefix instruction */ ++ if (!IS_ALIGNED((unsigned long)fimage + CTX_NIA(ctx), 8)) ++ EMIT(PPC_RAW_NOP()); ++ /* paddi r12,r12,addr */ ++ EMIT(PPC_PREFIX_MLS | __PPC_PRFX_R(0) | IMM_H18(reladdr)); ++ EMIT(PPC_INST_PADDI | ___PPC_RT(_R12) | ___PPC_RA(_R12) | IMM_L(reladdr)); + } else { + reladdr = func_addr - kernel_toc_addr(); + 
if (reladdr > 0x7FFFFFFF || reladdr < -(0x80000000L)) { +@@ -233,9 +235,9 @@ static int bpf_jit_emit_func_call_hlp(u3 + + EMIT(PPC_RAW_ADDIS(_R12, _R2, PPC_HA(reladdr))); + EMIT(PPC_RAW_ADDI(_R12, _R12, PPC_LO(reladdr))); +- EMIT(PPC_RAW_MTCTR(_R12)); +- EMIT(PPC_RAW_BCTRL()); + } ++ EMIT(PPC_RAW_MTCTR(_R12)); ++ EMIT(PPC_RAW_BCTRL()); + + return 0; + } +@@ -285,7 +287,7 @@ static int bpf_jit_emit_tail_call(u32 *i + int b2p_index = bpf_to_ppc(BPF_REG_3); + int bpf_tailcall_prologue_size = 8; + +- if (IS_ENABLED(CONFIG_PPC64_ELF_ABI_V2)) ++ if (!IS_ENABLED(CONFIG_PPC_KERNEL_PCREL) && IS_ENABLED(CONFIG_PPC64_ELF_ABI_V2)) + bpf_tailcall_prologue_size += 4; /* skip past the toc load */ + + /* +@@ -993,7 +995,7 @@ emit_clear: + return ret; + + if (func_addr_fixed) +- ret = bpf_jit_emit_func_call_hlp(image, ctx, func_addr); ++ ret = bpf_jit_emit_func_call_hlp(image, fimage, ctx, func_addr); + else + ret = bpf_jit_emit_func_call_rel(image, fimage, ctx, func_addr); + diff --git a/queue-6.9/powerpc-bpf-enforce-full-ordering-for-atomic-operations-with-bpf_fetch.patch b/queue-6.9/powerpc-bpf-enforce-full-ordering-for-atomic-operations-with-bpf_fetch.patch new file mode 100644 index 00000000000..fa49925d5b9 --- /dev/null +++ b/queue-6.9/powerpc-bpf-enforce-full-ordering-for-atomic-operations-with-bpf_fetch.patch @@ -0,0 +1,138 @@ +From b1e7cee96127468c2483cf10c2899c9b5cf79bf8 Mon Sep 17 00:00:00 2001 +From: Puranjay Mohan +Date: Mon, 13 May 2024 10:02:48 +0000 +Subject: powerpc/bpf: enforce full ordering for ATOMIC operations with BPF_FETCH + +From: Puranjay Mohan + +commit b1e7cee96127468c2483cf10c2899c9b5cf79bf8 upstream. + +The Linux Kernel Memory Model [1][2] requires RMW operations that have a +return value to be fully ordered. + +BPF atomic operations with BPF_FETCH (including BPF_XCHG and +BPF_CMPXCHG) return a value back so they need to be JITed to fully +ordered operations. POWERPC currently emits relaxed operations for +these. 
+ +We can show this by running the following litmus-test: + + PPC SB+atomic_add+fetch + + { + 0:r0=x; (* dst reg assuming offset is 0 *) + 0:r1=2; (* src reg *) + 0:r2=1; + 0:r4=y; (* P0 writes to this, P1 reads this *) + 0:r5=z; (* P1 writes to this, P0 reads this *) + 0:r6=0; + + 1:r2=1; + 1:r4=y; + 1:r5=z; + } + + P0 | P1 ; + stw r2, 0(r4) | stw r2,0(r5) ; + | ; + loop:lwarx r3, r6, r0 | ; + mr r8, r3 | ; + add r3, r3, r1 | sync ; + stwcx. r3, r6, r0 | ; + bne loop | ; + mr r1, r8 | ; + | ; + lwa r7, 0(r5) | lwa r7,0(r4) ; + + ~exists(0:r7=0 /\ 1:r7=0) + + Witnesses + Positive: 9 Negative: 3 + Condition ~exists (0:r7=0 /\ 1:r7=0) + Observation SB+atomic_add+fetch Sometimes 3 9 + +This test shows that the older store in P0 is reordered with a newer +load to a different address, even though there is an RMW operation with +fetch between them. Adding a sync before and after RMW fixes the issue: + + Witnesses + Positive: 9 Negative: 0 + Condition ~exists (0:r7=0 /\ 1:r7=0) + Observation SB+atomic_add+fetch Never 0 9 + +[1] https://www.kernel.org/doc/Documentation/memory-barriers.txt +[2] https://www.kernel.org/doc/Documentation/atomic_t.txt + +Fixes: aea7ef8a82c0 ("powerpc/bpf/32: add support for BPF_ATOMIC bitwise operations") +Fixes: 2d9206b22743 ("powerpc/bpf/32: Add instructions for atomic_[cmp]xchg") +Fixes: dbe6e2456fb0 ("powerpc/bpf/64: add support for atomic fetch operations") +Fixes: 1e82dfaa7819 ("powerpc/bpf/64: Add instructions for atomic_[cmp]xchg") +Cc: stable@vger.kernel.org # v6.0+ +Signed-off-by: Puranjay Mohan +Reviewed-by: Christophe Leroy +Reviewed-by: Naveen N Rao +Acked-by: Paul E.
McKenney +Signed-off-by: Michael Ellerman +Link: https://msgid.link/20240513100248.110535-1-puranjay@kernel.org +Signed-off-by: Greg Kroah-Hartman +--- + arch/powerpc/net/bpf_jit_comp32.c | 12 ++++++++++++ + arch/powerpc/net/bpf_jit_comp64.c | 12 ++++++++++++ + 2 files changed, 24 insertions(+) + +--- a/arch/powerpc/net/bpf_jit_comp32.c ++++ b/arch/powerpc/net/bpf_jit_comp32.c +@@ -900,6 +900,15 @@ int bpf_jit_build_body(struct bpf_prog * + + /* Get offset into TMP_REG */ + EMIT(PPC_RAW_LI(tmp_reg, off)); ++ /* ++ * Enforce full ordering for operations with BPF_FETCH by emitting a 'sync' ++ * before and after the operation. ++ * ++ * This is a requirement in the Linux Kernel Memory Model. ++ * See __cmpxchg_u32() in asm/cmpxchg.h as an example. ++ */ ++ if ((imm & BPF_FETCH) && IS_ENABLED(CONFIG_SMP)) ++ EMIT(PPC_RAW_SYNC()); + tmp_idx = ctx->idx * 4; + /* load value from memory into r0 */ + EMIT(PPC_RAW_LWARX(_R0, tmp_reg, dst_reg, 0)); +@@ -953,6 +962,9 @@ int bpf_jit_build_body(struct bpf_prog * + + /* For the BPF_FETCH variant, get old data into src_reg */ + if (imm & BPF_FETCH) { ++ /* Emit 'sync' to enforce full ordering */ ++ if (IS_ENABLED(CONFIG_SMP)) ++ EMIT(PPC_RAW_SYNC()); + EMIT(PPC_RAW_MR(ret_reg, ax_reg)); + if (!fp->aux->verifier_zext) + EMIT(PPC_RAW_LI(ret_reg - 1, 0)); /* higher 32-bit */ +--- a/arch/powerpc/net/bpf_jit_comp64.c ++++ b/arch/powerpc/net/bpf_jit_comp64.c +@@ -805,6 +805,15 @@ emit_clear: + + /* Get offset into TMP_REG_1 */ + EMIT(PPC_RAW_LI(tmp1_reg, off)); ++ /* ++ * Enforce full ordering for operations with BPF_FETCH by emitting a 'sync' ++ * before and after the operation. ++ * ++ * This is a requirement in the Linux Kernel Memory Model. ++ * See __cmpxchg_u64() in asm/cmpxchg.h as an example. 
++ */ ++ if ((imm & BPF_FETCH) && IS_ENABLED(CONFIG_SMP)) ++ EMIT(PPC_RAW_SYNC()); + tmp_idx = ctx->idx * 4; + /* load value from memory into TMP_REG_2 */ + if (size == BPF_DW) +@@ -867,6 +876,9 @@ emit_clear: + PPC_BCC_SHORT(COND_NE, tmp_idx); + + if (imm & BPF_FETCH) { ++ /* Emit 'sync' to enforce full ordering */ ++ if (IS_ENABLED(CONFIG_SMP)) ++ EMIT(PPC_RAW_SYNC()); + EMIT(PPC_RAW_MR(ret_reg, _R0)); + /* + * Skip unnecessary zero-extension for 32-bit cmpxchg. diff --git a/queue-6.9/series b/queue-6.9/series index ac6b0e0a17d..a38cf68965b 100644 --- a/queue-6.9/series +++ b/queue-6.9/series @@ -134,3 +134,24 @@ riscv-enable-have_arch_huge_vmap-for-xip-kernel.patch asoc-sof-ipc4-topology-fix-input-format-query-of-process-modules-without-base-extension.patch alsa-ump-don-t-clear-bank-selection-after-sending-a-program-change.patch alsa-ump-don-t-accept-an-invalid-ump-protocol-number.patch +edac-amd64-convert-pcibios_-return-codes-to-errnos.patch +edac-igen6-convert-pcibios_-return-codes-to-errnos.patch +cifs-fix-creating-sockets-when-using-sfu-mount-options.patch +nfs-fix-undefined-behavior-in-nfs_block_bits.patch +nfs-fix-read_plus-when-server-doesn-t-support-op_read_plus.patch +eventfs-fix-a-possible-null-pointer-dereference-in-eventfs_find_events.patch +eventfs-keep-the-directories-from-having-the-same-inode-number-as-files.patch +tracefs-clear-event_inode-flag-in-tracefs_drop_inode.patch +btrfs-qgroup-update-rescan-message-levels-and-error-codes.patch +btrfs-qgroup-fix-qgroup-id-collision-across-mounts.patch +btrfs-protect-folio-private-when-attaching-extent-buffer-folios.patch +btrfs-fix-crash-on-racing-fsync-and-size-extending-write-into-prealloc.patch +btrfs-fix-leak-of-qgroup-extent-records-after-transaction-abort.patch +btrfs-re-introduce-norecovery-mount-option.patch +alsa-seq-fix-incorrect-ump-type-for-system-messages.patch +bpf-fix-multi-uprobe-pid-filtering-logic.patch +powerpc-64-bpf-fix-tail-calls-for-pcrel-addressing.patch 
+powerpc-bpf-enforce-full-ordering-for-atomic-operations-with-bpf_fetch.patch +nilfs2-fix-potential-kernel-bug-due-to-lack-of-writeback-flag-waiting.patch +nilfs2-fix-nilfs_empty_dir-misjudgment-and-long-loop-on-i-o-errors.patch +smb-client-fix-deadlock-in-smb2_find_smb_tcon.patch diff --git a/queue-6.9/smb-client-fix-deadlock-in-smb2_find_smb_tcon.patch b/queue-6.9/smb-client-fix-deadlock-in-smb2_find_smb_tcon.patch new file mode 100644 index 00000000000..dd714f0837f --- /dev/null +++ b/queue-6.9/smb-client-fix-deadlock-in-smb2_find_smb_tcon.patch @@ -0,0 +1,34 @@ +From 02c418774f76a0a36a6195c9dbf8971eb4130a15 Mon Sep 17 00:00:00 2001 +From: Enzo Matsumiya +Date: Thu, 6 Jun 2024 13:13:13 -0300 +Subject: smb: client: fix deadlock in smb2_find_smb_tcon() + +From: Enzo Matsumiya + +commit 02c418774f76a0a36a6195c9dbf8971eb4130a15 upstream. + +Unlock cifs_tcp_ses_lock before calling cifs_put_smb_ses() to avoid such +deadlock. + +Cc: stable@vger.kernel.org +Signed-off-by: Enzo Matsumiya +Reviewed-by: Shyam Prasad N +Reviewed-by: Paulo Alcantara (Red Hat) +Signed-off-by: Steve French +Signed-off-by: Greg Kroah-Hartman +--- + fs/smb/client/smb2transport.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/smb/client/smb2transport.c ++++ b/fs/smb/client/smb2transport.c +@@ -216,8 +216,8 @@ smb2_find_smb_tcon(struct TCP_Server_Inf + } + tcon = smb2_find_smb_sess_tcon_unlocked(ses, tid); + if (!tcon) { +- cifs_put_smb_ses(ses); + spin_unlock(&cifs_tcp_ses_lock); ++ cifs_put_smb_ses(ses); + return NULL; + } + spin_unlock(&cifs_tcp_ses_lock); diff --git a/queue-6.9/tracefs-clear-event_inode-flag-in-tracefs_drop_inode.patch b/queue-6.9/tracefs-clear-event_inode-flag-in-tracefs_drop_inode.patch new file mode 100644 index 00000000000..dbfa66a2509 --- /dev/null +++ b/queue-6.9/tracefs-clear-event_inode-flag-in-tracefs_drop_inode.patch @@ -0,0 +1,95 @@ +From 0bcfd9aa4dafa03b88d68bf66b694df2a3e76cf3 Mon Sep 17 00:00:00 2001 +From: "Steven Rostedt (Google)" +Date: 
Thu, 23 May 2024 01:14:29 -0400 +Subject: tracefs: Clear EVENT_INODE flag in tracefs_drop_inode() + +From: Steven Rostedt (Google) + +commit 0bcfd9aa4dafa03b88d68bf66b694df2a3e76cf3 upstream. + +When the inode is being dropped from the dentry, the TRACEFS_EVENT_INODE +flag needs to be cleared to prevent a remount from calling +eventfs_remount() on the tracefs_inode private data. There's a race +between when the inode is dropped (and the dentry freed) and when the inode is +actually freed. If a remount happens between the two, the eventfs_inode +could be accessed after it is freed (only the dentry keeps a ref count on +it). + +Currently the TRACEFS_EVENT_INODE flag is cleared from the dentry iput() +function. But this is incorrect, as it is possible that the inode has +another reference to it. The flag should only be cleared when the inode is +really being dropped and has no more references. That happens in the +drop_inode callback of the inode, as that gets called when the last +reference of the inode is released. + +Remove the tracefs_d_iput() function and move its logic to the more +appropriate tracefs_drop_inode() callback function. + +Link: https://lore.kernel.org/linux-trace-kernel/20240523051539.908205106@goodmis.org + +Cc: stable@vger.kernel.org +Cc: Masami Hiramatsu +Cc: Mark Rutland +Cc: Mathieu Desnoyers +Cc: Andrew Morton +Cc: Masahiro Yamada +Fixes: baa23a8d4360d ("tracefs: Reset permissions on remount if permissions are options") +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Greg Kroah-Hartman +--- + fs/tracefs/inode.c | 33 +++++++++++++++++---------------- + 1 file changed, 17 insertions(+), 16 deletions(-) + +--- a/fs/tracefs/inode.c ++++ b/fs/tracefs/inode.c +@@ -439,10 +439,26 @@ static int tracefs_show_options(struct s + return 0; + } + ++static int tracefs_drop_inode(struct inode *inode) ++{ ++ struct tracefs_inode *ti = get_tracefs(inode); ++ ++ /* ++ * This inode is being freed and cannot be used for ++ * eventfs.
Clear the flag so that it doesn't call into ++ * eventfs during the remount flag updates. The eventfs_inode ++ * gets freed after an RCU cycle, so the content will still ++ * be safe if the iteration is going on now. ++ */ ++ ti->flags &= ~TRACEFS_EVENT_INODE; ++ ++ return 1; ++} ++ + static const struct super_operations tracefs_super_operations = { + .alloc_inode = tracefs_alloc_inode, + .free_inode = tracefs_free_inode, +- .drop_inode = generic_delete_inode, ++ .drop_inode = tracefs_drop_inode, + .statfs = simple_statfs, + .remount_fs = tracefs_remount, + .show_options = tracefs_show_options, +@@ -469,22 +485,7 @@ static int tracefs_d_revalidate(struct d + return !(ei && ei->is_freed); + } + +-static void tracefs_d_iput(struct dentry *dentry, struct inode *inode) +-{ +- struct tracefs_inode *ti = get_tracefs(inode); +- +- /* +- * This inode is being freed and cannot be used for +- * eventfs. Clear the flag so that it doesn't call into +- * eventfs during the remount flag updates. The eventfs_inode +- * gets freed after an RCU cycle, so the content will still +- * be safe if the iteration is going on now. +- */ +- ti->flags &= ~TRACEFS_EVENT_INODE; +-} +- + static const struct dentry_operations tracefs_dentry_operations = { +- .d_iput = tracefs_d_iput, + .d_revalidate = tracefs_d_revalidate, + .d_release = tracefs_d_release, + }; -- 2.47.3