From a19512b77ddd6e82c5d29a1f773797402a38b408 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Sat, 8 May 2021 12:09:49 -0400 Subject: [PATCH] Fixes for 5.11 Signed-off-by: Sasha Levin --- ...rd-.note.gnu.property-sections-in-vd.patch | 78 +++++ ...hen-picking-most-recent-mod-log-oper.patch | 329 ++++++++++++++++++ queue-5.11/series | 2 + 3 files changed, 409 insertions(+) create mode 100644 queue-5.11/arm64-vdso-discard-.note.gnu.property-sections-in-vd.patch create mode 100644 queue-5.11/btrfs-fix-race-when-picking-most-recent-mod-log-oper.patch diff --git a/queue-5.11/arm64-vdso-discard-.note.gnu.property-sections-in-vd.patch b/queue-5.11/arm64-vdso-discard-.note.gnu.property-sections-in-vd.patch new file mode 100644 index 00000000000..213ba461acf --- /dev/null +++ b/queue-5.11/arm64-vdso-discard-.note.gnu.property-sections-in-vd.patch @@ -0,0 +1,78 @@ +From e8c33f3d9dfa83a64393ea9df973a6e95afa8103 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 23 Apr 2021 13:51:59 -0700 +Subject: arm64/vdso: Discard .note.gnu.property sections in vDSO + +From: Bill Wendling + +[ Upstream commit 388708028e6937f3fc5fc19aeeb847f8970f489c ] + +The arm64 assembler in binutils 2.32 and above generates a program +property note in a note section, .note.gnu.property, to encode used x86 +ISAs and features. But the kernel linker script only contains a single +NOTE segment: + + PHDRS + { + text PT_LOAD FLAGS(5) FILEHDR PHDRS; /* PF_R|PF_X */ + dynamic PT_DYNAMIC FLAGS(4); /* PF_R */ + note PT_NOTE FLAGS(4); /* PF_R */ + } + +The NOTE segment generated by the vDSO linker script is aligned to 4 bytes. +But the .note.gnu.property section must be aligned to 8 bytes on arm64. + + $ readelf -n vdso64.so + + Displaying notes found in: .note + Owner Data size Description + Linux 0x00000004 Unknown note type: (0x00000000) + description data: 06 00 00 00 + readelf: Warning: note with invalid namesz and/or descsz found at offset 0x20 + readelf: Warning: type: 0x78, namesize: 0x00000100, descsize: 0x756e694c, alignment: 8 + +Since the note.gnu.property section in the vDSO is not checked by the +dynamic linker, discard the .note.gnu.property sections in the vDSO. + +Similar to commit 4caffe6a28d31 ("x86/vdso: Discard .note.gnu.property +sections in vDSO"), but for arm64. + +Signed-off-by: Bill Wendling +Reviewed-by: Kees Cook +Acked-by: Ard Biesheuvel +Link: https://lore.kernel.org/r/20210423205159.830854-1-morbo@google.com +Signed-off-by: Catalin Marinas +Signed-off-by: Sasha Levin +--- + arch/arm64/kernel/vdso/vdso.lds.S | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +diff --git a/arch/arm64/kernel/vdso/vdso.lds.S b/arch/arm64/kernel/vdso/vdso.lds.S +index 61dbb4c838ef..a5e61e09ea92 100644 +--- a/arch/arm64/kernel/vdso/vdso.lds.S ++++ b/arch/arm64/kernel/vdso/vdso.lds.S +@@ -31,6 +31,13 @@ SECTIONS + .gnu.version_d : { *(.gnu.version_d) } + .gnu.version_r : { *(.gnu.version_r) } + ++ /* ++ * Discard .note.gnu.property sections which are unused and have ++ * different alignment requirement from vDSO note sections. ++ */ ++ /DISCARD/ : { ++ *(.note.GNU-stack .note.gnu.property) ++ } + .note : { *(.note.*) } :text :note + + . = ALIGN(16); +@@ -48,7 +55,6 @@ SECTIONS + PROVIDE(end = .); + + /DISCARD/ : { +- *(.note.GNU-stack) + *(.data .data.* .gnu.linkonce.d.* .sdata*) + *(.bss .sbss .dynbss .dynsbss) + *(.eh_frame .eh_frame_hdr) +-- +2.30.2 + diff --git a/queue-5.11/btrfs-fix-race-when-picking-most-recent-mod-log-oper.patch b/queue-5.11/btrfs-fix-race-when-picking-most-recent-mod-log-oper.patch new file mode 100644 index 00000000000..0d9389c1de9 --- /dev/null +++ b/queue-5.11/btrfs-fix-race-when-picking-most-recent-mod-log-oper.patch @@ -0,0 +1,329 @@ +From 391cb88133b21a549c5d223db626045400249f02 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 20 Apr 2021 10:55:44 +0100 +Subject: btrfs: fix race when picking most recent mod log operation for an old + root + +From: Filipe Manana + +[ Upstream commit f9690f426b2134cc3e74bfc5d9dfd6a4b2ca5281 ] + +Commit dbcc7d57bffc0c ("btrfs: fix race when cloning extent buffer during +rewind of an old root"), fixed a race when we need to rewind the extent +buffer of an old root. It was caused by picking a new mod log operation +for the extent buffer while getting a cloned extent buffer with an outdated +number of items (off by -1), because we cloned the extent buffer without +locking it first. + +However there is still another similar race, but in the opposite direction. +The cloned extent buffer has a number of items that does not match the +number of tree mod log operations that are going to be replayed. This is +because right after we got the last (most recent) tree mod log operation to +replay and before locking and cloning the extent buffer, another task adds +a new pointer to the extent buffer, which results in adding a new tree mod +log operation and incrementing the number of items in the extent buffer. +So after cloning we have mismatch between the number of items in the extent +buffer and the number of mod log operations we are going to apply to it. +This results in hitting a BUG_ON() that produces the following stack trace: + + ------------[ cut here ]------------ + kernel BUG at fs/btrfs/tree-mod-log.c:675! + invalid opcode: 0000 [#1] SMP KASAN PTI + CPU: 3 PID: 4811 Comm: crawl_1215 Tainted: G W 5.12.0-7d1efdf501f8-misc-next+ #99 + Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.12.0-1 04/01/2014 + RIP: 0010:tree_mod_log_rewind+0x3b1/0x3c0 + Code: 05 48 8d 74 10 (...) + RSP: 0018:ffffc90001027090 EFLAGS: 00010293 + RAX: 0000000000000000 RBX: ffff8880a8514600 RCX: ffffffffaa9e59b6 + RDX: 0000000000000007 RSI: dffffc0000000000 RDI: ffff8880a851462c + RBP: ffffc900010270e0 R08: 00000000000000c0 R09: ffffed1004333417 + R10: ffff88802199a0b7 R11: ffffed1004333416 R12: 000000000000000e + R13: ffff888135af8748 R14: ffff88818766ff00 R15: ffff8880a851462c + FS: 00007f29acf62700(0000) GS:ffff8881f2200000(0000) knlGS:0000000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + CR2: 00007f0e6013f718 CR3: 000000010d42e003 CR4: 0000000000170ee0 + Call Trace: + btrfs_get_old_root+0x16a/0x5c0 + ? lock_downgrade+0x400/0x400 + btrfs_search_old_slot+0x192/0x520 + ? btrfs_search_slot+0x1090/0x1090 + ? free_extent_buffer.part.61+0xd7/0x140 + ? free_extent_buffer+0x13/0x20 + resolve_indirect_refs+0x3e9/0xfc0 + ? lock_downgrade+0x400/0x400 + ? __kasan_check_read+0x11/0x20 + ? add_prelim_ref.part.11+0x150/0x150 + ? lock_downgrade+0x400/0x400 + ? __kasan_check_read+0x11/0x20 + ? lock_acquired+0xbb/0x620 + ? __kasan_check_write+0x14/0x20 + ? do_raw_spin_unlock+0xa8/0x140 + ? rb_insert_color+0x340/0x360 + ? prelim_ref_insert+0x12d/0x430 + find_parent_nodes+0x5c3/0x1830 + ? stack_trace_save+0x87/0xb0 + ? resolve_indirect_refs+0xfc0/0xfc0 + ? fs_reclaim_acquire+0x67/0xf0 + ? __kasan_check_read+0x11/0x20 + ? lockdep_hardirqs_on_prepare+0x210/0x210 + ? fs_reclaim_acquire+0x67/0xf0 + ? __kasan_check_read+0x11/0x20 + ? ___might_sleep+0x10f/0x1e0 + ? __kasan_kmalloc+0x9d/0xd0 + ? trace_hardirqs_on+0x55/0x120 + btrfs_find_all_roots_safe+0x142/0x1e0 + ? find_parent_nodes+0x1830/0x1830 + ? trace_hardirqs_on+0x55/0x120 + ? ulist_free+0x1f/0x30 + ? btrfs_inode_flags_to_xflags+0x50/0x50 + iterate_extent_inodes+0x20e/0x580 + ? tree_backref_for_extent+0x230/0x230 + ? release_extent_buffer+0x225/0x280 + ? read_extent_buffer+0xdd/0x110 + ? lock_downgrade+0x400/0x400 + ? __kasan_check_read+0x11/0x20 + ? lock_acquired+0xbb/0x620 + ? __kasan_check_write+0x14/0x20 + ? do_raw_spin_unlock+0xa8/0x140 + ? _raw_spin_unlock+0x22/0x30 + ? release_extent_buffer+0x225/0x280 + iterate_inodes_from_logical+0x129/0x170 + ? iterate_inodes_from_logical+0x129/0x170 + ? btrfs_inode_flags_to_xflags+0x50/0x50 + ? iterate_extent_inodes+0x580/0x580 + ? __vmalloc_node+0x92/0xb0 + ? init_data_container+0x34/0xb0 + ? init_data_container+0x34/0xb0 + ? kvmalloc_node+0x60/0x80 + btrfs_ioctl_logical_to_ino+0x158/0x230 + btrfs_ioctl+0x2038/0x4360 + ? __kasan_check_write+0x14/0x20 + ? mmput+0x3b/0x220 + ? btrfs_ioctl_get_supported_features+0x30/0x30 + ? __kasan_check_read+0x11/0x20 + ? __kasan_check_read+0x11/0x20 + ? lock_release+0xc8/0x650 + ? __might_fault+0x64/0xd0 + ? __kasan_check_read+0x11/0x20 + ? lock_downgrade+0x400/0x400 + ? lockdep_hardirqs_on_prepare+0x210/0x210 + ? lockdep_hardirqs_on_prepare+0x13/0x210 + ? _raw_spin_unlock_irqrestore+0x51/0x63 + ? __kasan_check_read+0x11/0x20 + ? do_vfs_ioctl+0xfc/0x9d0 + ? ioctl_file_clone+0xe0/0xe0 + ? lock_downgrade+0x400/0x400 + ? lockdep_hardirqs_on_prepare+0x210/0x210 + ? __kasan_check_read+0x11/0x20 + ? lock_release+0xc8/0x650 + ? __task_pid_nr_ns+0xd3/0x250 + ? __kasan_check_read+0x11/0x20 + ? __fget_files+0x160/0x230 + ? __fget_light+0xf2/0x110 + __x64_sys_ioctl+0xc3/0x100 + do_syscall_64+0x37/0x80 + entry_SYSCALL_64_after_hwframe+0x44/0xae + RIP: 0033:0x7f29ae85b427 + Code: 00 00 90 48 8b (...) + RSP: 002b:00007f29acf5fcf8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 + RAX: ffffffffffffffda RBX: 00007f29acf5ff40 RCX: 00007f29ae85b427 + RDX: 00007f29acf5ff48 RSI: 00000000c038943b RDI: 0000000000000003 + RBP: 0000000001000000 R08: 0000000000000000 R09: 00007f29acf60120 + R10: 00005640d5fc7b00 R11: 0000000000000246 R12: 0000000000000003 + R13: 00007f29acf5ff48 R14: 00007f29acf5ff40 R15: 00007f29acf5fef8 + Modules linked in: + ---[ end trace 85e5fce078dfbe04 ]--- + + (gdb) l *(tree_mod_log_rewind+0x3b1) + 0xffffffff819e5b21 is in tree_mod_log_rewind (fs/btrfs/tree-mod-log.c:675). + 670 * the modification. As we're going backwards, we do the + 671 * opposite of each operation here. + 672 */ + 673 switch (tm->op) { + 674 case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING: + 675 BUG_ON(tm->slot < n); + 676 fallthrough; + 677 case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING: + 678 case BTRFS_MOD_LOG_KEY_REMOVE: + 679 btrfs_set_node_key(eb, &tm->key, tm->slot); + (gdb) quit + +The following steps explain in more detail how it happens: + +1) We have one tree mod log user (through fiemap or the logical ino ioctl), + with a sequence number of 1, so we have fs_info->tree_mod_seq == 1. + This is task A; + +2) Another task is at ctree.c:balance_level() and we have eb X currently as + the root of the tree, and we promote its single child, eb Y, as the new + root. + + Then, at ctree.c:balance_level(), we call: + + ret = btrfs_tree_mod_log_insert_root(root->node, child, true); + +3) At btrfs_tree_mod_log_insert_root() we create a tree mod log operation + of type BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, with a ->logical field + pointing to ebX->start. We only have one item in eb X, so we create + only one tree mod log operation, and store in the "tm_list" array; + +4) Then, still at btrfs_tree_mod_log_insert_root(), we create a tree mod + log element of operation type BTRFS_MOD_LOG_ROOT_REPLACE, ->logical set + to ebY->start, ->old_root.logical set to ebX->start, ->old_root.level + set to the level of eb X and ->generation set to the generation of eb X; + +5) Then btrfs_tree_mod_log_insert_root() calls tree_mod_log_free_eb() with + "tm_list" as argument. After that, tree_mod_log_free_eb() calls + tree_mod_log_insert(). This inserts the mod log operation of type + BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING from step 3 into the rbtree + with a sequence number of 2 (and fs_info->tree_mod_seq set to 2); + +6) Then, after inserting the "tm_list" single element into the tree mod + log rbtree, the BTRFS_MOD_LOG_ROOT_REPLACE element is inserted, which + gets the sequence number 3 (and fs_info->tree_mod_seq set to 3); + +7) Back to ctree.c:balance_level(), we free eb X by calling + btrfs_free_tree_block() on it. Because eb X was created in the current + transaction, has no other references and writeback did not happen for + it, we add it back to the free space cache/tree; + +8) Later some other task B allocates the metadata extent from eb X, since + it is marked as free space in the space cache/tree, and uses it as a + node for some other btree; + +9) The tree mod log user task calls btrfs_search_old_slot(), which calls + btrfs_get_old_root(), and finally that calls tree_mod_log_oldest_root() + with time_seq == 1 and eb_root == eb Y; + +10) The first iteration of the while loop finds the tree mod log element + with sequence number 3, for the logical address of eb Y and of type + BTRFS_MOD_LOG_ROOT_REPLACE; + +11) Because the operation type is BTRFS_MOD_LOG_ROOT_REPLACE, we don't + break out of the loop, and set root_logical to point to + tm->old_root.logical, which corresponds to the logical address of + eb X; + +12) On the next iteration of the while loop, the call to + tree_mod_log_search_oldest() returns the smallest tree mod log element + for the logical address of eb X, which has a sequence number of 2, an + operation type of BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING and + corresponds to the old slot 0 of eb X (eb X had only 1 item in it + before being freed at step 7); + +13) We then break out of the while loop and return the tree mod log + operation of type BTRFS_MOD_LOG_ROOT_REPLACE (eb Y), and not the one + for slot 0 of eb X, to btrfs_get_old_root(); + +14) At btrfs_get_old_root(), we process the BTRFS_MOD_LOG_ROOT_REPLACE + operation and set "logical" to the logical address of eb X, which was + the old root. We then call tree_mod_log_search() passing it the logical + address of eb X and time_seq == 1; + +15) But before calling tree_mod_log_search(), task B locks eb X, adds a + key to eb X, which results in adding a tree mod log operation of type + BTRFS_MOD_LOG_KEY_ADD, with a sequence number of 4, to the tree mod + log, and increments the number of items in eb X from 0 to 1. + Now fs_info->tree_mod_seq has a value of 4; + +16) Task A then calls tree_mod_log_search(), which returns the most recent + tree mod log operation for eb X, which is the one just added by task B + at the previous step, with a sequence number of 4, a type of + BTRFS_MOD_LOG_KEY_ADD and for slot 0; + +17) Before task A locks and clones eb X, task A adds another key to eb X, + which results in adding a new BTRFS_MOD_LOG_KEY_ADD mod log operation, + with a sequence number of 5, for slot 1 of eb X, increments the + number of items in eb X from 1 to 2, and unlocks eb X. + Now fs_info->tree_mod_seq has a value of 5; + +18) Task A then locks eb X and clones it. The clone has a value of 2 for + the number of items and the pointer "tm" points to the tree mod log + operation with sequence number 4, not the most recent one with a + sequence number of 5, so there is mismatch between the number of + mod log operations that are going to be applied to the cloned version + of eb X and the number of items in the clone; + +19) Task A then calls tree_mod_log_rewind() with the clone of eb X, the + tree mod log operation with sequence number 4 and a type of + BTRFS_MOD_LOG_KEY_ADD, and time_seq == 1; + +20) At tree_mod_log_rewind(), we set the local variable "n" with a value + of 2, which is the number of items in the clone of eb X. + + Then in the first iteration of the while loop, we process the mod log + operation with sequence number 4, which is targeted at slot 0 and has + a type of BTRFS_MOD_LOG_KEY_ADD. This results in decrementing "n" from + 2 to 1. + + Then we pick the next tree mod log operation for eb X, which is the + tree mod log operation with a sequence number of 2, a type of + BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING and for slot 0, it is the one + added in step 5 to the tree mod log tree. + + We go back to the top of the loop to process this mod log operation, + and because its slot is 0 and "n" has a value of 1, we hit the BUG_ON: + + (...) + switch (tm->op) { + case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING: + BUG_ON(tm->slot < n); + fallthrough; + (...) + +Fix this by checking for a more recent tree mod log operation after locking +and cloning the extent buffer of the old root node, and use it as the first +operation to apply to the cloned extent buffer when rewinding it. + +Stable backport notes: due to moved code and renames, in =< 5.11 the +change should be applied to ctree.c:get_old_root. + +Reported-by: Zygo Blaxell +Link: https://lore.kernel.org/linux-btrfs/20210404040732.GZ32440@hungrycats.org/ +Fixes: 834328a8493079 ("Btrfs: tree mod log's old roots could still be part of the tree") +CC: stable@vger.kernel.org # 4.4+ +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/ctree.c | 20 ++++++++++++++++++++ + 1 file changed, 20 insertions(+) + +diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c +index 33fe5d839c11..e7c619eb55e7 100644 +--- a/fs/btrfs/ctree.c ++++ b/fs/btrfs/ctree.c +@@ -1365,10 +1365,30 @@ get_old_root(struct btrfs_root *root, u64 time_seq) + "failed to read tree block %llu from get_old_root", + logical); + } else { ++ struct tree_mod_elem *tm2; ++ + btrfs_tree_read_lock(old); + eb = btrfs_clone_extent_buffer(old); ++ /* ++ * After the lookup for the most recent tree mod operation ++ * above and before we locked and cloned the extent buffer ++ * 'old', a new tree mod log operation may have been added. ++ * So lookup for a more recent one to make sure the number ++ * of mod log operations we replay is consistent with the ++ * number of items we have in the cloned extent buffer, ++ * otherwise we can hit a BUG_ON when rewinding the extent ++ * buffer. ++ */ ++ tm2 = tree_mod_log_search(fs_info, logical, time_seq); + btrfs_tree_read_unlock(old); + free_extent_buffer(old); ++ ASSERT(tm2); ++ ASSERT(tm2 == tm || tm2->seq > tm->seq); ++ if (!tm2 || tm2->seq < tm->seq) { ++ free_extent_buffer(eb); ++ return NULL; ++ } ++ tm = tm2; + } + } else if (old_root) { + eb_root_owner = btrfs_header_owner(eb_root); +-- +2.30.2 + diff --git a/queue-5.11/series b/queue-5.11/series index 82b8aca3c1a..40035b59d7b 100644 --- a/queue-5.11/series +++ b/queue-5.11/series @@ -250,3 +250,5 @@ alsa-hda-realtek-fix-mic-boost-on-intel-nuc-8.patch alsa-hda-realtek-headset-mic-issue-on-hp-platform.patch alsa-hda-realtek-fix-static-noise-on-alc285-lenovo-laptops.patch alsa-hda-realtek-add-quirk-for-intel-clevo-pcx0dx.patch +btrfs-fix-race-when-picking-most-recent-mod-log-oper.patch +arm64-vdso-discard-.note.gnu.property-sections-in-vd.patch -- 2.47.3