From: Greg Kroah-Hartman Date: Mon, 26 Jul 2021 08:53:33 +0000 (+0200) Subject: 5.13-stable patches X-Git-Tag: v4.4.277~48 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=c710575b9b89177de9a6a8594c2f7c4fa5073aad;p=thirdparty%2Fkernel%2Fstable-queue.git 5.13-stable patches added patches: btrfs-check-for-missing-device-in-btrfs_trim_fs.patch btrfs-fix-lock-inversion-problem-when-doing-qgroup-extent-tracing.patch btrfs-fix-unpersisted-i_size-on-fsync-after-expanding-truncate.patch bus-mhi-core-validate-channel-id-when-processing-command-completions.patch bus-mhi-pci_generic-apply-no-op-for-wake-using-sideband-wake-boolean.patch bus-mhi-pci_generic-fix-inbound-ipcr-channel.patch driver-core-auxiliary-bus-fix-memory-leak-when-driver_register-fail.patch firmware-efi-tell-memblock-about-efi-iomem-reservations.patch io_uring-explicitly-count-entries-for-poll-reqs.patch io_uring-fix-early-fdput-of-file.patch io_uring-remove-double-poll-entry-on-arm-failure.patch ixgbe-fix-packet-corruption-due-to-missing-dma-sync.patch media-ngene-fix-out-of-bounds-bug-in-ngene_command_config_free_buf.patch posix-cpu-timers-fix-rearm-racing-against-process-tick.patch selftest-use-mmap-instead-of-posix_memalign-to-allocate-memory.patch tracepoints-update-static_call-before-tp_funcs-when-adding-a-tracepoint.patch tracing-fix-bug-in-rb_per_cpu_empty-that-might-cause-deadloop.patch tracing-histogram-rename-cpu-to-common_cpu.patch tracing-synthetic-event-field_pos-is-an-index-not-a-boolean.patch userfaultfd-do-not-untag-user-pointers.patch --- diff --git a/queue-5.13/btrfs-check-for-missing-device-in-btrfs_trim_fs.patch b/queue-5.13/btrfs-check-for-missing-device-in-btrfs_trim_fs.patch new file mode 100644 index 00000000000..cdd96cb950d --- /dev/null +++ b/queue-5.13/btrfs-check-for-missing-device-in-btrfs_trim_fs.patch @@ -0,0 +1,80 @@ +From 16a200f66ede3f9afa2e51d90ade017aaa18d213 Mon Sep 17 00:00:00 2001 +From: Anand Jain +Date: Sun, 4 Jul 2021 19:14:39 +0800 +Subject: btrfs: 
check for missing device in btrfs_trim_fs + +From: Anand Jain + +commit 16a200f66ede3f9afa2e51d90ade017aaa18d213 upstream. + +A fstrim on a degraded raid1 can trigger the following null pointer +dereference: + + BTRFS info (device loop0): allowing degraded mounts + BTRFS info (device loop0): disk space caching is enabled + BTRFS info (device loop0): has skinny extents + BTRFS warning (device loop0): devid 2 uuid 97ac16f7-e14d-4db1-95bc-3d489b424adb is missing + BTRFS warning (device loop0): devid 2 uuid 97ac16f7-e14d-4db1-95bc-3d489b424adb is missing + BTRFS info (device loop0): enabling ssd optimizations + BUG: kernel NULL pointer dereference, address: 0000000000000620 + PGD 0 P4D 0 + Oops: 0000 [#1] SMP NOPTI + CPU: 0 PID: 4574 Comm: fstrim Not tainted 5.13.0-rc7+ #31 + Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006 + RIP: 0010:btrfs_trim_fs+0x199/0x4a0 [btrfs] + RSP: 0018:ffff959541797d28 EFLAGS: 00010293 + RAX: 0000000000000000 RBX: ffff946f84eca508 RCX: a7a67937adff8608 + RDX: ffff946e8122d000 RSI: 0000000000000000 RDI: ffffffffc02fdbf0 + RBP: ffff946ea4615000 R08: 0000000000000001 R09: 0000000000000000 + R10: 0000000000000000 R11: ffff946e8122d960 R12: 0000000000000000 + R13: ffff959541797db8 R14: ffff946e8122d000 R15: ffff959541797db8 + FS: 00007f55917a5080(0000) GS:ffff946f9bc00000(0000) knlGS:0000000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + CR2: 0000000000000620 CR3: 000000002d2c8001 CR4: 00000000000706f0 + Call Trace: + btrfs_ioctl_fitrim+0x167/0x260 [btrfs] + btrfs_ioctl+0x1c00/0x2fe0 [btrfs] + ? selinux_file_ioctl+0x140/0x240 + ? syscall_trace_enter.constprop.0+0x188/0x240 + ? 
__x64_sys_ioctl+0x83/0xb0 + __x64_sys_ioctl+0x83/0xb0 + +Reproducer: + + $ mkfs.btrfs -fq -d raid1 -m raid1 /dev/loop0 /dev/loop1 + $ mount /dev/loop0 /btrfs + $ umount /btrfs + $ btrfs dev scan --forget + $ mount -o degraded /dev/loop0 /btrfs + + $ fstrim /btrfs + +The reason is we call btrfs_trim_free_extents() for the missing device, +which uses device->bdev (NULL for missing device) to find if the device +supports discard. + +Fix is to check if the device is missing before calling +btrfs_trim_free_extents(). + +CC: stable@vger.kernel.org # 5.4+ +Reviewed-by: Filipe Manana +Signed-off-by: Anand Jain +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/extent-tree.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -6034,6 +6034,9 @@ int btrfs_trim_fs(struct btrfs_fs_info * + mutex_lock(&fs_info->fs_devices->device_list_mutex); + devices = &fs_info->fs_devices->devices; + list_for_each_entry(device, devices, dev_list) { ++ if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) ++ continue; ++ + ret = btrfs_trim_free_extents(device, &group_trimmed); + if (ret) { + dev_failed++; diff --git a/queue-5.13/btrfs-fix-lock-inversion-problem-when-doing-qgroup-extent-tracing.patch b/queue-5.13/btrfs-fix-lock-inversion-problem-when-doing-qgroup-extent-tracing.patch new file mode 100644 index 00000000000..2029bd1d29e --- /dev/null +++ b/queue-5.13/btrfs-fix-lock-inversion-problem-when-doing-qgroup-extent-tracing.patch @@ -0,0 +1,388 @@ +From 8949b9a114019b03fbd0d03d65b8647cba4feef3 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Wed, 21 Jul 2021 17:31:48 +0100 +Subject: btrfs: fix lock inversion problem when doing qgroup extent tracing + +From: Filipe Manana + +commit 8949b9a114019b03fbd0d03d65b8647cba4feef3 upstream. 
+ +At btrfs_qgroup_trace_extent_post() we call btrfs_find_all_roots() with a +NULL value as the transaction handle argument, which makes that function +take the commit_root_sem semaphore, which is necessary when we don't hold +a transaction handle or any other mechanism to prevent a transaction +commit from wiping out commit roots. + +However btrfs_qgroup_trace_extent_post() can be called in a context where +we are holding a write lock on an extent buffer from a subvolume tree, +namely from btrfs_truncate_inode_items(), called either during truncate +or unlink operations. In this case we end up with a lock inversion problem +because the commit_root_sem is a higher level lock, always supposed to be +acquired before locking any extent buffer. + +Lockdep detects this lock inversion problem since we switched the extent +buffer locks from custom locks to semaphores, and when running btrfs/158 +from fstests, it reported the following trace: + +[ 9057.626435] ====================================================== +[ 9057.627541] WARNING: possible circular locking dependency detected +[ 9057.628334] 5.14.0-rc2-btrfs-next-93 #1 Not tainted +[ 9057.628961] ------------------------------------------------------ +[ 9057.629867] kworker/u16:4/30781 is trying to acquire lock: +[ 9057.630824] ffff8e2590f58760 (btrfs-tree-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x24/0x110 [btrfs] +[ 9057.632542] + but task is already holding lock: +[ 9057.633551] ffff8e25582d4b70 (&fs_info->commit_root_sem){++++}-{3:3}, at: iterate_extent_inodes+0x10b/0x280 [btrfs] +[ 9057.635255] + which lock already depends on the new lock. 
+ +[ 9057.636292] + the existing dependency chain (in reverse order) is: +[ 9057.637240] + -> #1 (&fs_info->commit_root_sem){++++}-{3:3}: +[ 9057.638138] down_read+0x46/0x140 +[ 9057.638648] btrfs_find_all_roots+0x41/0x80 [btrfs] +[ 9057.639398] btrfs_qgroup_trace_extent_post+0x37/0x70 [btrfs] +[ 9057.640283] btrfs_add_delayed_data_ref+0x418/0x490 [btrfs] +[ 9057.641114] btrfs_free_extent+0x35/0xb0 [btrfs] +[ 9057.641819] btrfs_truncate_inode_items+0x424/0xf70 [btrfs] +[ 9057.642643] btrfs_evict_inode+0x454/0x4f0 [btrfs] +[ 9057.643418] evict+0xcf/0x1d0 +[ 9057.643895] do_unlinkat+0x1e9/0x300 +[ 9057.644525] do_syscall_64+0x3b/0xc0 +[ 9057.645110] entry_SYSCALL_64_after_hwframe+0x44/0xae +[ 9057.645835] + -> #0 (btrfs-tree-00){++++}-{3:3}: +[ 9057.646600] __lock_acquire+0x130e/0x2210 +[ 9057.647248] lock_acquire+0xd7/0x310 +[ 9057.647773] down_read_nested+0x4b/0x140 +[ 9057.648350] __btrfs_tree_read_lock+0x24/0x110 [btrfs] +[ 9057.649175] btrfs_read_lock_root_node+0x31/0x40 [btrfs] +[ 9057.650010] btrfs_search_slot+0x537/0xc00 [btrfs] +[ 9057.650849] scrub_print_warning_inode+0x89/0x370 [btrfs] +[ 9057.651733] iterate_extent_inodes+0x1e3/0x280 [btrfs] +[ 9057.652501] scrub_print_warning+0x15d/0x2f0 [btrfs] +[ 9057.653264] scrub_handle_errored_block.isra.0+0x135f/0x1640 [btrfs] +[ 9057.654295] scrub_bio_end_io_worker+0x101/0x2e0 [btrfs] +[ 9057.655111] btrfs_work_helper+0xf8/0x400 [btrfs] +[ 9057.655831] process_one_work+0x247/0x5a0 +[ 9057.656425] worker_thread+0x55/0x3c0 +[ 9057.656993] kthread+0x155/0x180 +[ 9057.657494] ret_from_fork+0x22/0x30 +[ 9057.658030] + other info that might help us debug this: + +[ 9057.659064] Possible unsafe locking scenario: + +[ 9057.659824] CPU0 CPU1 +[ 9057.660402] ---- ---- +[ 9057.660988] lock(&fs_info->commit_root_sem); +[ 9057.661581] lock(btrfs-tree-00); +[ 9057.662348] lock(&fs_info->commit_root_sem); +[ 9057.663254] lock(btrfs-tree-00); +[ 9057.663690] + *** DEADLOCK *** + +[ 9057.664437] 4 locks held by 
kworker/u16:4/30781: +[ 9057.665023] #0: ffff8e25922a1148 ((wq_completion)btrfs-scrub){+.+.}-{0:0}, at: process_one_work+0x1c7/0x5a0 +[ 9057.666260] #1: ffffabb3451ffe70 ((work_completion)(&work->normal_work)){+.+.}-{0:0}, at: process_one_work+0x1c7/0x5a0 +[ 9057.667639] #2: ffff8e25922da198 (&ret->mutex){+.+.}-{3:3}, at: scrub_handle_errored_block.isra.0+0x5d2/0x1640 [btrfs] +[ 9057.669017] #3: ffff8e25582d4b70 (&fs_info->commit_root_sem){++++}-{3:3}, at: iterate_extent_inodes+0x10b/0x280 [btrfs] +[ 9057.670408] + stack backtrace: +[ 9057.670976] CPU: 7 PID: 30781 Comm: kworker/u16:4 Not tainted 5.14.0-rc2-btrfs-next-93 #1 +[ 9057.672030] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014 +[ 9057.673492] Workqueue: btrfs-scrub btrfs_work_helper [btrfs] +[ 9057.674258] Call Trace: +[ 9057.674588] dump_stack_lvl+0x57/0x72 +[ 9057.675083] check_noncircular+0xf3/0x110 +[ 9057.675611] __lock_acquire+0x130e/0x2210 +[ 9057.676132] lock_acquire+0xd7/0x310 +[ 9057.676605] ? __btrfs_tree_read_lock+0x24/0x110 [btrfs] +[ 9057.677313] ? lock_is_held_type+0xe8/0x140 +[ 9057.677849] down_read_nested+0x4b/0x140 +[ 9057.678349] ? __btrfs_tree_read_lock+0x24/0x110 [btrfs] +[ 9057.679068] __btrfs_tree_read_lock+0x24/0x110 [btrfs] +[ 9057.679760] btrfs_read_lock_root_node+0x31/0x40 [btrfs] +[ 9057.680458] btrfs_search_slot+0x537/0xc00 [btrfs] +[ 9057.681083] ? _raw_spin_unlock+0x29/0x40 +[ 9057.681594] ? btrfs_find_all_roots_safe+0x11f/0x140 [btrfs] +[ 9057.682336] scrub_print_warning_inode+0x89/0x370 [btrfs] +[ 9057.683058] ? btrfs_find_all_roots_safe+0x11f/0x140 [btrfs] +[ 9057.683834] ? scrub_write_block_to_dev_replace+0xb0/0xb0 [btrfs] +[ 9057.684632] iterate_extent_inodes+0x1e3/0x280 [btrfs] +[ 9057.685316] scrub_print_warning+0x15d/0x2f0 [btrfs] +[ 9057.685977] ? 
___ratelimit+0xa4/0x110 +[ 9057.686460] scrub_handle_errored_block.isra.0+0x135f/0x1640 [btrfs] +[ 9057.687316] scrub_bio_end_io_worker+0x101/0x2e0 [btrfs] +[ 9057.688021] btrfs_work_helper+0xf8/0x400 [btrfs] +[ 9057.688649] ? lock_is_held_type+0xe8/0x140 +[ 9057.689180] process_one_work+0x247/0x5a0 +[ 9057.689696] worker_thread+0x55/0x3c0 +[ 9057.690175] ? process_one_work+0x5a0/0x5a0 +[ 9057.690731] kthread+0x155/0x180 +[ 9057.691158] ? set_kthread_struct+0x40/0x40 +[ 9057.691697] ret_from_fork+0x22/0x30 + +Fix this by making btrfs_find_all_roots() never attempt to lock the +commit_root_sem when it is called from btrfs_qgroup_trace_extent_post(). + +We can't just pass a non-NULL transaction handle to btrfs_find_all_roots() +from btrfs_qgroup_trace_extent_post(), because that would make backref +lookup not use commit roots and acquire read locks on extent buffers, and +therefore could deadlock when btrfs_qgroup_trace_extent_post() is called +from the btrfs_truncate_inode_items() code path which has acquired a write +lock on an extent buffer of the subvolume btree. 
+ +CC: stable@vger.kernel.org # 4.19+ +Reviewed-by: Qu Wenruo +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/backref.c | 6 +++--- + fs/btrfs/backref.h | 3 ++- + fs/btrfs/delayed-ref.c | 4 ++-- + fs/btrfs/qgroup.c | 38 ++++++++++++++++++++++++++++++-------- + fs/btrfs/qgroup.h | 2 +- + fs/btrfs/tests/qgroup-tests.c | 20 ++++++++++---------- + 6 files changed, 48 insertions(+), 25 deletions(-) + +--- a/fs/btrfs/backref.c ++++ b/fs/btrfs/backref.c +@@ -1488,15 +1488,15 @@ static int btrfs_find_all_roots_safe(str + int btrfs_find_all_roots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, + u64 time_seq, struct ulist **roots, +- bool ignore_offset) ++ bool ignore_offset, bool skip_commit_root_sem) + { + int ret; + +- if (!trans) ++ if (!trans && !skip_commit_root_sem) + down_read(&fs_info->commit_root_sem); + ret = btrfs_find_all_roots_safe(trans, fs_info, bytenr, + time_seq, roots, ignore_offset); +- if (!trans) ++ if (!trans && !skip_commit_root_sem) + up_read(&fs_info->commit_root_sem); + return ret; + } +--- a/fs/btrfs/backref.h ++++ b/fs/btrfs/backref.h +@@ -47,7 +47,8 @@ int btrfs_find_all_leafs(struct btrfs_tr + const u64 *extent_item_pos, bool ignore_offset); + int btrfs_find_all_roots(struct btrfs_trans_handle *trans, + struct btrfs_fs_info *fs_info, u64 bytenr, +- u64 time_seq, struct ulist **roots, bool ignore_offset); ++ u64 time_seq, struct ulist **roots, bool ignore_offset, ++ bool skip_commit_root_sem); + char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path, + u32 name_len, unsigned long name_off, + struct extent_buffer *eb_in, u64 parent, +--- a/fs/btrfs/delayed-ref.c ++++ b/fs/btrfs/delayed-ref.c +@@ -1000,7 +1000,7 @@ int btrfs_add_delayed_tree_ref(struct bt + kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref); + + if (qrecord_inserted) +- btrfs_qgroup_trace_extent_post(fs_info, record); ++ btrfs_qgroup_trace_extent_post(trans, record); 
+ + return 0; + } +@@ -1095,7 +1095,7 @@ int btrfs_add_delayed_data_ref(struct bt + + + if (qrecord_inserted) +- return btrfs_qgroup_trace_extent_post(fs_info, record); ++ return btrfs_qgroup_trace_extent_post(trans, record); + return 0; + } + +--- a/fs/btrfs/qgroup.c ++++ b/fs/btrfs/qgroup.c +@@ -1704,17 +1704,39 @@ int btrfs_qgroup_trace_extent_nolock(str + return 0; + } + +-int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info *fs_info, ++int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, + struct btrfs_qgroup_extent_record *qrecord) + { + struct ulist *old_root; + u64 bytenr = qrecord->bytenr; + int ret; + +- ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, &old_root, false); ++ /* ++ * We are always called in a context where we are already holding a ++ * transaction handle. Often we are called when adding a data delayed ++ * reference from btrfs_truncate_inode_items() (truncating or unlinking), ++ * in which case we will be holding a write lock on extent buffer from a ++ * subvolume tree. In this case we can't allow btrfs_find_all_roots() to ++ * acquire fs_info->commit_root_sem, because that is a higher level lock ++ * that must be acquired before locking any extent buffers. ++ * ++ * So we want btrfs_find_all_roots() to not acquire the commit_root_sem ++ * but we can't pass it a non-NULL transaction handle, because otherwise ++ * it would not use commit roots and would lock extent buffers, causing ++ * a deadlock if it ends up trying to read lock the same extent buffer ++ * that was previously write locked at btrfs_truncate_inode_items(). ++ * ++ * So pass a NULL transaction handle to btrfs_find_all_roots() and ++ * explicitly tell it to not acquire the commit_root_sem - if we are ++ * holding a transaction handle we don't need its protection. 
++ */ ++ ASSERT(trans != NULL); ++ ++ ret = btrfs_find_all_roots(NULL, trans->fs_info, bytenr, 0, &old_root, ++ false, true); + if (ret < 0) { +- fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; +- btrfs_warn(fs_info, ++ trans->fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; ++ btrfs_warn(trans->fs_info, + "error accounting new delayed refs extent (err code: %d), quota inconsistent", + ret); + return 0; +@@ -1758,7 +1780,7 @@ int btrfs_qgroup_trace_extent(struct btr + kfree(record); + return 0; + } +- return btrfs_qgroup_trace_extent_post(fs_info, record); ++ return btrfs_qgroup_trace_extent_post(trans, record); + } + + int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans, +@@ -2629,7 +2651,7 @@ int btrfs_qgroup_account_extents(struct + /* Search commit root to find old_roots */ + ret = btrfs_find_all_roots(NULL, fs_info, + record->bytenr, 0, +- &record->old_roots, false); ++ &record->old_roots, false, false); + if (ret < 0) + goto cleanup; + } +@@ -2645,7 +2667,7 @@ int btrfs_qgroup_account_extents(struct + * current root. It's safe inside commit_transaction(). + */ + ret = btrfs_find_all_roots(trans, fs_info, +- record->bytenr, BTRFS_SEQ_LAST, &new_roots, false); ++ record->bytenr, BTRFS_SEQ_LAST, &new_roots, false, false); + if (ret < 0) + goto cleanup; + if (qgroup_to_skip) { +@@ -3179,7 +3201,7 @@ static int qgroup_rescan_leaf(struct btr + num_bytes = found.offset; + + ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0, +- &roots, false); ++ &roots, false, false); + if (ret < 0) + goto out; + /* For rescan, just pass old_roots as NULL */ +--- a/fs/btrfs/qgroup.h ++++ b/fs/btrfs/qgroup.h +@@ -298,7 +298,7 @@ int btrfs_qgroup_trace_extent_nolock( + * using current root, then we can move all expensive backref walk out of + * transaction committing, but not now as qgroup accounting will be wrong again. 
+ */ +-int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info *fs_info, ++int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans, + struct btrfs_qgroup_extent_record *qrecord); + + /* +--- a/fs/btrfs/tests/qgroup-tests.c ++++ b/fs/btrfs/tests/qgroup-tests.c +@@ -224,7 +224,7 @@ static int test_no_shared_qgroup(struct + * quota. + */ + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, +- false); ++ false, false); + if (ret) { + ulist_free(old_roots); + test_err("couldn't find old roots: %d", ret); +@@ -237,7 +237,7 @@ static int test_no_shared_qgroup(struct + return ret; + + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, +- false); ++ false, false); + if (ret) { + ulist_free(old_roots); + ulist_free(new_roots); +@@ -261,7 +261,7 @@ static int test_no_shared_qgroup(struct + new_roots = NULL; + + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, +- false); ++ false, false); + if (ret) { + ulist_free(old_roots); + test_err("couldn't find old roots: %d", ret); +@@ -273,7 +273,7 @@ static int test_no_shared_qgroup(struct + return -EINVAL; + + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, +- false); ++ false, false); + if (ret) { + ulist_free(old_roots); + ulist_free(new_roots); +@@ -325,7 +325,7 @@ static int test_multiple_refs(struct btr + } + + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, +- false); ++ false, false); + if (ret) { + ulist_free(old_roots); + test_err("couldn't find old roots: %d", ret); +@@ -338,7 +338,7 @@ static int test_multiple_refs(struct btr + return ret; + + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, +- false); ++ false, false); + if (ret) { + ulist_free(old_roots); + ulist_free(new_roots); +@@ -360,7 +360,7 @@ static int test_multiple_refs(struct btr + } + + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, +- false); ++ false, false); + if (ret) { + ulist_free(old_roots); + 
test_err("couldn't find old roots: %d", ret); +@@ -373,7 +373,7 @@ static int test_multiple_refs(struct btr + return ret; + + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, +- false); ++ false, false); + if (ret) { + ulist_free(old_roots); + ulist_free(new_roots); +@@ -401,7 +401,7 @@ static int test_multiple_refs(struct btr + } + + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, +- false); ++ false, false); + if (ret) { + ulist_free(old_roots); + test_err("couldn't find old roots: %d", ret); +@@ -414,7 +414,7 @@ static int test_multiple_refs(struct btr + return ret; + + ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, +- false); ++ false, false); + if (ret) { + ulist_free(old_roots); + ulist_free(new_roots); diff --git a/queue-5.13/btrfs-fix-unpersisted-i_size-on-fsync-after-expanding-truncate.patch b/queue-5.13/btrfs-fix-unpersisted-i_size-on-fsync-after-expanding-truncate.patch new file mode 100644 index 00000000000..c99f46e0f72 --- /dev/null +++ b/queue-5.13/btrfs-fix-unpersisted-i_size-on-fsync-after-expanding-truncate.patch @@ -0,0 +1,150 @@ +From 9acc8103ab594f72250788cb45a43427f36d685d Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Tue, 6 Jul 2021 15:41:15 +0100 +Subject: btrfs: fix unpersisted i_size on fsync after expanding truncate + +From: Filipe Manana + +commit 9acc8103ab594f72250788cb45a43427f36d685d upstream. + +If we have an inode that does not have the full sync flag set, was changed +in the current transaction, then it is logged while logging some other +inode (like its parent directory for example), its i_size is increased by +a truncate operation, the log is synced through an fsync of some other +inode and then finally we explicitly call fsync on our inode, the new +i_size is not persisted. 
+ +The following example shows how to trigger it, with comments explaining +how and why the issue happens: + + $ mkfs.btrfs -f /dev/sdc + $ mount /dev/sdc /mnt + + $ touch /mnt/foo + $ xfs_io -f -c "pwrite -S 0xab 0 1M" /mnt/bar + + $ sync + + # Fsync bar, this will be a noop since the file has not yet been + # modified in the current transaction. The goal here is to clear + # BTRFS_INODE_NEEDS_FULL_SYNC from the inode's runtime flags. + $ xfs_io -c "fsync" /mnt/bar + + # Now rename both files, without changing their parent directory. + $ mv /mnt/bar /mnt/bar2 + $ mv /mnt/foo /mnt/foo2 + + # Increase the size of bar2 with a truncate operation. + $ xfs_io -c "truncate 2M" /mnt/bar2 + + # Now fsync foo2, this results in logging its parent inode (the root + # directory), and logging the parent results in logging the inode of + # file bar2 (its inode item and the new name). The inode of file bar2 + # is logged with an i_size of 0 bytes since it's logged in + # LOG_INODE_EXISTS mode, meaning we are only logging its names (and + # xattrs if it had any) and the i_size of the inode will not be changed + # when the log is replayed. + $ xfs_io -c "fsync" /mnt/foo2 + + # Now explicitly fsync bar2. This resulted in doing nothing, not + # logging the inode with the new i_size of 2M and the hole from file + # offset 1M to 2M. Because the inode did not have the flag + # BTRFS_INODE_NEEDS_FULL_SYNC set, when it was logged through the + # fsync of file foo2, its last_log_commit field was updated, + # resulting in this explicit fsync of file bar2 not doing anything. + $ xfs_io -c "fsync" /mnt/bar2 + + # File bar2 content and size before a power failure. + $ od -A d -t x1 /mnt/bar2 + 0000000 ab ab ab ab ab ab ab ab ab ab ab ab ab ab ab ab + * + 1048576 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 + * + 2097152 + + <power failure> + + # Mount the filesystem to replay the log.
+ $ mount /dev/sdc /mnt + + # Read the file again, should have the same content and size as before + # the power failure happened, but it doesn't, i_size is still at 1M. + $ od -A d -t x1 /mnt/bar2 + 0000000 ab ab ab ab ab ab ab ab ab ab ab ab ab ab ab ab + * + 1048576 + +This started to happen after commit 209ecbb8585bf6 ("btrfs: remove stale +comment and logic from btrfs_inode_in_log()"), since btrfs_inode_in_log() +no longer checks if the inode's list of modified extents is not empty. +However, checking that list is not the right way to address this case +and the check was added a long time ago in commit 125c4cf9f37c98 +("Btrfs: set inode's logged_trans/last_log_commit after ranged fsync") +for a different purpose, to address consecutive ranged fsyncs. + +The reason that checking for the list emptiness makes this test pass is +because during an expanding truncate we create an extent map to represent +a hole from the old i_size to the new i_size, and add that extent map to +the list of modified extents in the inode. However if we are low on +available memory and we can not allocate a new extent map, then we don't +treat it as an error and just set the full sync flag on the inode, so that +the next fsync does not rely on the list of modified extents - so checking +for the emptiness of the list to decide if the inode needs to be logged is +not reliable, and results in not logging the inode if it was not possible +to allocate the extent map for the hole. + +Fix this by ensuring that if we are only logging that an inode exists +(inode item, names/references and xattrs), we don't update the inode's +last_log_commit even if it does not have the full sync runtime flag set. + +A test case for fstests follows soon.
+ +CC: stable@vger.kernel.org # 5.13+ +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/tree-log.c | 31 ++++++++++++++++++++++--------- + 1 file changed, 22 insertions(+), 9 deletions(-) + +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -5515,16 +5515,29 @@ log_extents: + spin_lock(&inode->lock); + inode->logged_trans = trans->transid; + /* +- * Don't update last_log_commit if we logged that an inode exists +- * after it was loaded to memory (full_sync bit set). +- * This is to prevent data loss when we do a write to the inode, +- * then the inode gets evicted after all delalloc was flushed, +- * then we log it exists (due to a rename for example) and then +- * fsync it. This last fsync would do nothing (not logging the +- * extents previously written). ++ * Don't update last_log_commit if we logged that an inode exists. ++ * We do this for two reasons: ++ * ++ * 1) We might have had buffered writes to this inode that were ++ * flushed and had their ordered extents completed in this ++ * transaction, but we did not previously log the inode with ++ * LOG_INODE_ALL. Later the inode was evicted and after that ++ * it was loaded again and this LOG_INODE_EXISTS log operation ++ * happened. We must make sure that if an explicit fsync against ++ * the inode is performed later, it logs the new extents, an ++ * updated inode item, etc, and syncs the log. The same logic ++ * applies to direct IO writes instead of buffered writes. ++ * ++ * 2) When we log the inode with LOG_INODE_EXISTS, its inode item ++ * is logged with an i_size of 0 or whatever value was logged ++ * before. 
If later the i_size of the inode is increased by a ++ * truncate operation, the log is synced through an fsync of ++ * some other inode and then finally an explicit fsync against ++ * this inode is made, we must make sure this fsync logs the ++ * inode with the new i_size, the hole between old i_size and ++ * the new i_size, and syncs the log. + */ +- if (inode_only != LOG_INODE_EXISTS || +- !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags)) ++ if (inode_only != LOG_INODE_EXISTS) + inode->last_log_commit = inode->last_sub_trans; + spin_unlock(&inode->lock); + } diff --git a/queue-5.13/bus-mhi-core-validate-channel-id-when-processing-command-completions.patch b/queue-5.13/bus-mhi-core-validate-channel-id-when-processing-command-completions.patch new file mode 100644 index 00000000000..80962ec226e --- /dev/null +++ b/queue-5.13/bus-mhi-core-validate-channel-id-when-processing-command-completions.patch @@ -0,0 +1,56 @@ +From 546362a9ef2ef40b57c6605f14e88ced507f8dd0 Mon Sep 17 00:00:00 2001 +From: Bhaumik Bhatt +Date: Fri, 16 Jul 2021 13:21:05 +0530 +Subject: bus: mhi: core: Validate channel ID when processing command completions + +From: Bhaumik Bhatt + +commit 546362a9ef2ef40b57c6605f14e88ced507f8dd0 upstream. + +MHI reads the channel ID from the event ring element sent by the +device which can be any value between 0 and 255. In order to +prevent any out of bound accesses, add a check against the maximum +number of channels supported by the controller and those channels +not configured yet so as to skip processing of that event ring +element. 
+ +Link: https://lore.kernel.org/r/1624558141-11045-1-git-send-email-bbhatt@codeaurora.org +Fixes: 1d3173a3bae7 ("bus: mhi: core: Add support for processing events from client device") +Cc: stable@vger.kernel.org #5.10 +Reviewed-by: Hemant Kumar +Reviewed-by: Manivannan Sadhasivam +Reviewed-by: Jeffrey Hugo +Signed-off-by: Bhaumik Bhatt +Signed-off-by: Manivannan Sadhasivam +Link: https://lore.kernel.org/r/20210716075106.49938-3-manivannan.sadhasivam@linaro.org +Signed-off-by: Greg Kroah-Hartman +--- + drivers/bus/mhi/core/main.c | 17 ++++++++++++----- + 1 file changed, 12 insertions(+), 5 deletions(-) + +--- a/drivers/bus/mhi/core/main.c ++++ b/drivers/bus/mhi/core/main.c +@@ -773,11 +773,18 @@ static void mhi_process_cmd_completion(s + cmd_pkt = mhi_to_virtual(mhi_ring, ptr); + + chan = MHI_TRE_GET_CMD_CHID(cmd_pkt); +- mhi_chan = &mhi_cntrl->mhi_chan[chan]; +- write_lock_bh(&mhi_chan->lock); +- mhi_chan->ccs = MHI_TRE_GET_EV_CODE(tre); +- complete(&mhi_chan->completion); +- write_unlock_bh(&mhi_chan->lock); ++ ++ if (chan < mhi_cntrl->max_chan && ++ mhi_cntrl->mhi_chan[chan].configured) { ++ mhi_chan = &mhi_cntrl->mhi_chan[chan]; ++ write_lock_bh(&mhi_chan->lock); ++ mhi_chan->ccs = MHI_TRE_GET_EV_CODE(tre); ++ complete(&mhi_chan->completion); ++ write_unlock_bh(&mhi_chan->lock); ++ } else { ++ dev_err(&mhi_cntrl->mhi_dev->dev, ++ "Completion packet for invalid channel ID: %d\n", chan); ++ } + + mhi_del_ring_element(mhi_cntrl, mhi_ring); + } diff --git a/queue-5.13/bus-mhi-pci_generic-apply-no-op-for-wake-using-sideband-wake-boolean.patch b/queue-5.13/bus-mhi-pci_generic-apply-no-op-for-wake-using-sideband-wake-boolean.patch new file mode 100644 index 00000000000..6ff445028ab --- /dev/null +++ b/queue-5.13/bus-mhi-pci_generic-apply-no-op-for-wake-using-sideband-wake-boolean.patch @@ -0,0 +1,113 @@ +From 56f6f4c4eb2a710ec8878dd9373d3d2b2eb75f5c Mon Sep 17 00:00:00 2001 +From: Bhaumik Bhatt +Date: Fri, 16 Jul 2021 13:21:04 +0530 +Subject: bus: mhi: pci_generic: 
Apply no-op for wake using sideband wake boolean + +From: Bhaumik Bhatt + +commit 56f6f4c4eb2a710ec8878dd9373d3d2b2eb75f5c upstream. + +Devices such as SDX24 do not have the provision for inband wake +doorbell in the form of channel 127 and instead have a sideband +GPIO for it. Newer devices such as SDX55 or SDX65 support inband +wake method by default. Ensure the functionality is used based on +this such that device wake stays held when a client driver uses +mhi_device_get() API or the equivalent debugfs entry. + +Link: https://lore.kernel.org/r/1624560809-30610-1-git-send-email-bbhatt@codeaurora.org +Fixes: e3e5e6508fc1 ("bus: mhi: pci_generic: No-Op for device_wake operations") +Cc: stable@vger.kernel.org #5.12 +Reviewed-by: Manivannan Sadhasivam +Signed-off-by: Bhaumik Bhatt +Signed-off-by: Manivannan Sadhasivam +Link: https://lore.kernel.org/r/20210716075106.49938-2-manivannan.sadhasivam@linaro.org +Signed-off-by: Greg Kroah-Hartman +--- + drivers/bus/mhi/pci_generic.c | 27 +++++++++++++++++++-------- + 1 file changed, 19 insertions(+), 8 deletions(-) + +--- a/drivers/bus/mhi/pci_generic.c ++++ b/drivers/bus/mhi/pci_generic.c +@@ -32,6 +32,8 @@ + * @edl: emergency download mode firmware path (if any) + * @bar_num: PCI base address register to use for MHI MMIO register space + * @dma_data_width: DMA transfer word size (32 or 64 bits) ++ * @sideband_wake: Devices using dedicated sideband GPIO for wakeup instead ++ * of inband wake support (such as sdx24) + */ + struct mhi_pci_dev_info { + const struct mhi_controller_config *config; +@@ -40,6 +42,7 @@ struct mhi_pci_dev_info { + const char *edl; + unsigned int bar_num; + unsigned int dma_data_width; ++ bool sideband_wake; + }; + + #define MHI_CHANNEL_CONFIG_UL(ch_num, ch_name, el_count, ev_ring) \ +@@ -242,7 +245,8 @@ static const struct mhi_pci_dev_info mhi + .edl = "qcom/sdx65m/edl.mbn", + .config = &modem_qcom_v1_mhiv_config, + .bar_num = MHI_PCI_DEFAULT_BAR_NUM, +- .dma_data_width = 32 ++ .dma_data_width = 
32, ++ .sideband_wake = false, + }; + + static const struct mhi_pci_dev_info mhi_qcom_sdx55_info = { +@@ -251,7 +255,8 @@ static const struct mhi_pci_dev_info mhi + .edl = "qcom/sdx55m/edl.mbn", + .config = &modem_qcom_v1_mhiv_config, + .bar_num = MHI_PCI_DEFAULT_BAR_NUM, +- .dma_data_width = 32 ++ .dma_data_width = 32, ++ .sideband_wake = false, + }; + + static const struct mhi_pci_dev_info mhi_qcom_sdx24_info = { +@@ -259,7 +264,8 @@ static const struct mhi_pci_dev_info mhi + .edl = "qcom/prog_firehose_sdx24.mbn", + .config = &modem_qcom_v1_mhiv_config, + .bar_num = MHI_PCI_DEFAULT_BAR_NUM, +- .dma_data_width = 32 ++ .dma_data_width = 32, ++ .sideband_wake = true, + }; + + static const struct mhi_channel_config mhi_quectel_em1xx_channels[] = { +@@ -301,7 +307,8 @@ static const struct mhi_pci_dev_info mhi + .edl = "qcom/prog_firehose_sdx24.mbn", + .config = &modem_quectel_em1xx_config, + .bar_num = MHI_PCI_DEFAULT_BAR_NUM, +- .dma_data_width = 32 ++ .dma_data_width = 32, ++ .sideband_wake = true, + }; + + static const struct mhi_channel_config mhi_foxconn_sdx55_channels[] = { +@@ -339,7 +346,8 @@ static const struct mhi_pci_dev_info mhi + .edl = "qcom/sdx55m/edl.mbn", + .config = &modem_foxconn_sdx55_config, + .bar_num = MHI_PCI_DEFAULT_BAR_NUM, +- .dma_data_width = 32 ++ .dma_data_width = 32, ++ .sideband_wake = false, + }; + + static const struct pci_device_id mhi_pci_id_table[] = { +@@ -640,9 +648,12 @@ static int mhi_pci_probe(struct pci_dev + mhi_cntrl->status_cb = mhi_pci_status_cb; + mhi_cntrl->runtime_get = mhi_pci_runtime_get; + mhi_cntrl->runtime_put = mhi_pci_runtime_put; +- mhi_cntrl->wake_get = mhi_pci_wake_get_nop; +- mhi_cntrl->wake_put = mhi_pci_wake_put_nop; +- mhi_cntrl->wake_toggle = mhi_pci_wake_toggle_nop; ++ ++ if (info->sideband_wake) { ++ mhi_cntrl->wake_get = mhi_pci_wake_get_nop; ++ mhi_cntrl->wake_put = mhi_pci_wake_put_nop; ++ mhi_cntrl->wake_toggle = mhi_pci_wake_toggle_nop; ++ } + + err = mhi_pci_claim(mhi_cntrl, info->bar_num, 
DMA_BIT_MASK(info->dma_data_width)); + if (err) diff --git a/queue-5.13/bus-mhi-pci_generic-fix-inbound-ipcr-channel.patch b/queue-5.13/bus-mhi-pci_generic-fix-inbound-ipcr-channel.patch new file mode 100644 index 00000000000..6895cb8c22e --- /dev/null +++ b/queue-5.13/bus-mhi-pci_generic-fix-inbound-ipcr-channel.patch @@ -0,0 +1,63 @@ +From b8a97f2a65388394f433bf0730293a94f7d49046 Mon Sep 17 00:00:00 2001 +From: Loic Poulain +Date: Fri, 16 Jul 2021 13:21:06 +0530 +Subject: bus: mhi: pci_generic: Fix inbound IPCR channel + +From: Loic Poulain + +commit b8a97f2a65388394f433bf0730293a94f7d49046 upstream. + +The qrtr-mhi client driver assumes that inbound buffers are +automatically allocated and queued by the MHI core, but this +doesn't happen for mhi pci devices since IPCR inbound channel is +not flagged with auto_queue, causing unusable IPCR (qrtr) +feature. Fix that. + +Link: https://lore.kernel.org/r/1625736749-24947-1-git-send-email-loic.poulain@linaro.org +[mani: fixed a spelling mistake in commit description] +Fixes: 855a70c12021 ("bus: mhi: Add MHI PCI support for WWAN modems") +Cc: stable@vger.kernel.org #5.10 +Reviewed-by: Hemant kumar +Reviewed-by: Manivannan Sadhasivam +Signed-off-by: Loic Poulain +Signed-off-by: Manivannan Sadhasivam +Link: https://lore.kernel.org/r/20210716075106.49938-4-manivannan.sadhasivam@linaro.org +Signed-off-by: Greg Kroah-Hartman +--- + drivers/bus/mhi/pci_generic.c | 18 +++++++++++++++++- + 1 file changed, 17 insertions(+), 1 deletion(-) + +--- a/drivers/bus/mhi/pci_generic.c ++++ b/drivers/bus/mhi/pci_generic.c +@@ -75,6 +75,22 @@ struct mhi_pci_dev_info { + .doorbell_mode_switch = false, \ + } + ++#define MHI_CHANNEL_CONFIG_DL_AUTOQUEUE(ch_num, ch_name, el_count, ev_ring) \ ++ { \ ++ .num = ch_num, \ ++ .name = ch_name, \ ++ .num_elements = el_count, \ ++ .event_ring = ev_ring, \ ++ .dir = DMA_FROM_DEVICE, \ ++ .ee_mask = BIT(MHI_EE_AMSS), \ ++ .pollcfg = 0, \ ++ .doorbell = MHI_DB_BRST_DISABLE, \ ++ .lpm_notify = false, \ ++ 
.offload_channel = false, \ ++ .doorbell_mode_switch = false, \ ++ .auto_queue = true, \ ++ } ++ + #define MHI_EVENT_CONFIG_CTRL(ev_ring, el_count) \ + { \ + .num_elements = el_count, \ +@@ -213,7 +229,7 @@ static const struct mhi_channel_config m + MHI_CHANNEL_CONFIG_UL(14, "QMI", 4, 0), + MHI_CHANNEL_CONFIG_DL(15, "QMI", 4, 0), + MHI_CHANNEL_CONFIG_UL(20, "IPCR", 8, 0), +- MHI_CHANNEL_CONFIG_DL(21, "IPCR", 8, 0), ++ MHI_CHANNEL_CONFIG_DL_AUTOQUEUE(21, "IPCR", 8, 0), + MHI_CHANNEL_CONFIG_UL_FP(34, "FIREHOSE", 32, 0), + MHI_CHANNEL_CONFIG_DL_FP(35, "FIREHOSE", 32, 0), + MHI_CHANNEL_CONFIG_HW_UL(100, "IP_HW0", 128, 2), diff --git a/queue-5.13/driver-core-auxiliary-bus-fix-memory-leak-when-driver_register-fail.patch b/queue-5.13/driver-core-auxiliary-bus-fix-memory-leak-when-driver_register-fail.patch new file mode 100644 index 00000000000..22732fdf06c --- /dev/null +++ b/queue-5.13/driver-core-auxiliary-bus-fix-memory-leak-when-driver_register-fail.patch @@ -0,0 +1,47 @@ +From 4afa0c22eed33cfe0c590742387f0d16f32412f3 Mon Sep 17 00:00:00 2001 +From: Peter Ujfalusi +Date: Tue, 13 Jul 2021 12:34:38 +0300 +Subject: driver core: auxiliary bus: Fix memory leak when driver_register() fail + +From: Peter Ujfalusi + +commit 4afa0c22eed33cfe0c590742387f0d16f32412f3 upstream. 
+ +If driver_register() returns with error we need to free the memory +allocated for auxdrv->driver.name before returning from +__auxiliary_driver_register() + +Fixes: 7de3697e9cbd4 ("Add auxiliary bus support") +Reviewed-by: Dan Williams +Cc: stable +Signed-off-by: Peter Ujfalusi +Link: https://lore.kernel.org/r/20210713093438.3173-1-peter.ujfalusi@linux.intel.com +Signed-off-by: Greg Kroah-Hartman +--- + drivers/base/auxiliary.c | 8 +++++++- + 1 file changed, 7 insertions(+), 1 deletion(-) + +--- a/drivers/base/auxiliary.c ++++ b/drivers/base/auxiliary.c +@@ -231,6 +231,8 @@ EXPORT_SYMBOL_GPL(auxiliary_find_device) + int __auxiliary_driver_register(struct auxiliary_driver *auxdrv, + struct module *owner, const char *modname) + { ++ int ret; ++ + if (WARN_ON(!auxdrv->probe) || WARN_ON(!auxdrv->id_table)) + return -EINVAL; + +@@ -246,7 +248,11 @@ int __auxiliary_driver_register(struct a + auxdrv->driver.bus = &auxiliary_bus_type; + auxdrv->driver.mod_name = modname; + +- return driver_register(&auxdrv->driver); ++ ret = driver_register(&auxdrv->driver); ++ if (ret) ++ kfree(auxdrv->driver.name); ++ ++ return ret; + } + EXPORT_SYMBOL_GPL(__auxiliary_driver_register); + diff --git a/queue-5.13/firmware-efi-tell-memblock-about-efi-iomem-reservations.patch b/queue-5.13/firmware-efi-tell-memblock-about-efi-iomem-reservations.patch new file mode 100644 index 00000000000..907fee2ea65 --- /dev/null +++ b/queue-5.13/firmware-efi-tell-memblock-about-efi-iomem-reservations.patch @@ -0,0 +1,66 @@ +From 2bab693a608bdf614b9fcd44083c5100f34b9f77 Mon Sep 17 00:00:00 2001 +From: Marc Zyngier +Date: Tue, 13 Jul 2021 19:43:26 +0100 +Subject: firmware/efi: Tell memblock about EFI iomem reservations + +From: Marc Zyngier + +commit 2bab693a608bdf614b9fcd44083c5100f34b9f77 upstream. + +kexec_load_file() relies on the memblock infrastructure to avoid +stamping over regions of memory that are essential to the survival +of the system. 
+ +However, nobody seems to agree how to flag these regions as reserved, +and (for example) EFI only publishes its reservations in /proc/iomem +for the benefit of the traditional, userspace based kexec tool. + +On arm64 platforms with GICv3, this can result in the payload being +placed at the location of the LPI tables. Shock, horror! + +Let's augment the EFI reservation code with a memblock_reserve() call, +protecting our dear tables from the secondary kernel invasion. + +Reported-by: Moritz Fischer +Tested-by: Moritz Fischer +Signed-off-by: Marc Zyngier +Cc: stable@vger.kernel.org +Cc: Ard Biesheuvel +Cc: James Morse +Cc: Catalin Marinas +Cc: Will Deacon +Signed-off-by: Ard Biesheuvel +Signed-off-by: Greg Kroah-Hartman +--- + drivers/firmware/efi/efi.c | 13 ++++++++++++- + 1 file changed, 12 insertions(+), 1 deletion(-) + +--- a/drivers/firmware/efi/efi.c ++++ b/drivers/firmware/efi/efi.c +@@ -896,6 +896,7 @@ static int __init efi_memreserve_map_roo + static int efi_mem_reserve_iomem(phys_addr_t addr, u64 size) + { + struct resource *res, *parent; ++ int ret; + + res = kzalloc(sizeof(struct resource), GFP_ATOMIC); + if (!res) +@@ -908,7 +909,17 @@ static int efi_mem_reserve_iomem(phys_ad + + /* we expect a conflict with a 'System RAM' region */ + parent = request_resource_conflict(&iomem_resource, res); +- return parent ? request_resource(parent, res) : 0; ++ ret = parent ? request_resource(parent, res) : 0; ++ ++ /* ++ * Given that efi_mem_reserve_iomem() can be called at any ++ * time, only call memblock_reserve() if the architecture ++ * keeps the infrastructure around. 
++ */ ++ if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK) && !ret) ++ memblock_reserve(addr, size); ++ ++ return ret; + } + + int __ref efi_mem_reserve_persistent(phys_addr_t addr, u64 size) diff --git a/queue-5.13/io_uring-explicitly-count-entries-for-poll-reqs.patch b/queue-5.13/io_uring-explicitly-count-entries-for-poll-reqs.patch new file mode 100644 index 00000000000..305a5fe12cb --- /dev/null +++ b/queue-5.13/io_uring-explicitly-count-entries-for-poll-reqs.patch @@ -0,0 +1,74 @@ +From 68b11e8b1562986c134764433af64e97d30c9fc0 Mon Sep 17 00:00:00 2001 +From: Pavel Begunkov +Date: Tue, 20 Jul 2021 10:50:43 +0100 +Subject: io_uring: explicitly count entries for poll reqs + +From: Pavel Begunkov + +commit 68b11e8b1562986c134764433af64e97d30c9fc0 upstream. + +If __io_queue_proc() fails to add a second poll entry, e.g. kmalloc() +failed, but it goes on with a third waitqueue, it may succeed and +overwrite the error status. Count the number of poll entries we added, +so we can set pt->error to zero at the beginning and find out when the +mentioned scenario happens. + +Cc: stable@vger.kernel.org +Fixes: 18bceab101add ("io_uring: allow POLL_ADD with double poll_wait() users") +Signed-off-by: Pavel Begunkov +Link: https://lore.kernel.org/r/9d6b9e561f88bcc0163623b74a76c39f712151c3.1626774457.git.asml.silence@gmail.com +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + fs/io_uring.c | 16 ++++++++++------ + 1 file changed, 10 insertions(+), 6 deletions(-) + +--- a/fs/io_uring.c ++++ b/fs/io_uring.c +@@ -4805,6 +4805,7 @@ IO_NETOP_FN(recv); + struct io_poll_table { + struct poll_table_struct pt; + struct io_kiocb *req; ++ int nr_entries; + int error; + }; + +@@ -5002,11 +5003,11 @@ static void __io_queue_proc(struct io_po + struct io_kiocb *req = pt->req; + + /* +- * If poll->head is already set, it's because the file being polled +- * uses multiple waitqueues for poll handling (eg one for read, one +- * for write). Setup a separate io_poll_iocb if this happens. 
++ * The file being polled uses multiple waitqueues for poll handling ++ * (e.g. one for read, one for write). Setup a separate io_poll_iocb ++ * if this happens. + */ +- if (unlikely(poll->head)) { ++ if (unlikely(pt->nr_entries)) { + struct io_poll_iocb *poll_one = poll; + + /* already have a 2nd entry, fail a third attempt */ +@@ -5034,7 +5035,7 @@ static void __io_queue_proc(struct io_po + *poll_ptr = poll; + } + +- pt->error = 0; ++ pt->nr_entries++; + poll->head = head; + + if (poll->events & EPOLLEXCLUSIVE) +@@ -5112,9 +5113,12 @@ static __poll_t __io_arm_poll_handler(st + + ipt->pt._key = mask; + ipt->req = req; +- ipt->error = -EINVAL; ++ ipt->error = 0; ++ ipt->nr_entries = 0; + + mask = vfs_poll(req->file, &ipt->pt) & poll->events; ++ if (unlikely(!ipt->nr_entries) && !ipt->error) ++ ipt->error = -EINVAL; + + spin_lock_irq(&ctx->completion_lock); + if (likely(poll->head)) { diff --git a/queue-5.13/io_uring-fix-early-fdput-of-file.patch b/queue-5.13/io_uring-fix-early-fdput-of-file.patch new file mode 100644 index 00000000000..52e94186da3 --- /dev/null +++ b/queue-5.13/io_uring-fix-early-fdput-of-file.patch @@ -0,0 +1,41 @@ +From 0cc936f74bcacb039b7533aeac0a887dfc896bf6 Mon Sep 17 00:00:00 2001 +From: Jens Axboe +Date: Thu, 22 Jul 2021 17:08:07 -0600 +Subject: io_uring: fix early fdput() of file + +From: Jens Axboe + +commit 0cc936f74bcacb039b7533aeac0a887dfc896bf6 upstream. + +A previous commit shuffled some code around, and inadvertently used +struct file after fdput() had been called on it. As we can't touch +the file post fdput() dropping our reference, move the fdput() to +after that has been done. 
+ +Cc: Pavel Begunkov +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/io-uring/YPnqM0fY3nM5RdRI@zeniv-ca.linux.org.uk/ +Fixes: f2a48dd09b8e ("io_uring: refactor io_sq_offload_create()") +Reported-by: Al Viro +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + fs/io_uring.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +--- a/fs/io_uring.c ++++ b/fs/io_uring.c +@@ -7953,9 +7953,11 @@ static int io_sq_offload_create(struct i + f = fdget(p->wq_fd); + if (!f.file) + return -ENXIO; +- fdput(f); +- if (f.file->f_op != &io_uring_fops) ++ if (f.file->f_op != &io_uring_fops) { ++ fdput(f); + return -EINVAL; ++ } ++ fdput(f); + } + if (ctx->flags & IORING_SETUP_SQPOLL) { + struct task_struct *tsk; diff --git a/queue-5.13/io_uring-remove-double-poll-entry-on-arm-failure.patch b/queue-5.13/io_uring-remove-double-poll-entry-on-arm-failure.patch new file mode 100644 index 00000000000..a03f38511ce --- /dev/null +++ b/queue-5.13/io_uring-remove-double-poll-entry-on-arm-failure.patch @@ -0,0 +1,46 @@ +From 46fee9ab02cb24979bbe07631fc3ae95ae08aa3e Mon Sep 17 00:00:00 2001 +From: Pavel Begunkov +Date: Tue, 20 Jul 2021 10:50:44 +0100 +Subject: io_uring: remove double poll entry on arm failure + +From: Pavel Begunkov + +commit 46fee9ab02cb24979bbe07631fc3ae95ae08aa3e upstream. + +__io_queue_proc() can enqueue both poll entries and still fail +afterwards, so the callers trying to cancel it should also try to remove +the second poll entry (if any). 
+ +For example, it may leave the request alive referencing a io_uring +context but not accessible for cancellation: + +[ 282.599913][ T1620] task:iou-sqp-23145 state:D stack:28720 pid:23155 ppid: 8844 flags:0x00004004 +[ 282.609927][ T1620] Call Trace: +[ 282.613711][ T1620] __schedule+0x93a/0x26f0 +[ 282.634647][ T1620] schedule+0xd3/0x270 +[ 282.638874][ T1620] io_uring_cancel_generic+0x54d/0x890 +[ 282.660346][ T1620] io_sq_thread+0xaac/0x1250 +[ 282.696394][ T1620] ret_from_fork+0x1f/0x30 + +Cc: stable@vger.kernel.org +Fixes: 18bceab101add ("io_uring: allow POLL_ADD with double poll_wait() users") +Reported-and-tested-by: syzbot+ac957324022b7132accf@syzkaller.appspotmail.com +Signed-off-by: Pavel Begunkov +Link: https://lore.kernel.org/r/0ec1228fc5eda4cb524eeda857da8efdc43c331c.1626774457.git.asml.silence@gmail.com +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + fs/io_uring.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/fs/io_uring.c ++++ b/fs/io_uring.c +@@ -5121,6 +5121,8 @@ static __poll_t __io_arm_poll_handler(st + ipt->error = -EINVAL; + + spin_lock_irq(&ctx->completion_lock); ++ if (ipt->error) ++ io_poll_remove_double(req); + if (likely(poll->head)) { + spin_lock(&poll->head->lock); + if (unlikely(list_empty(&poll->wait.entry))) { diff --git a/queue-5.13/ixgbe-fix-packet-corruption-due-to-missing-dma-sync.patch b/queue-5.13/ixgbe-fix-packet-corruption-due-to-missing-dma-sync.patch new file mode 100644 index 00000000000..2c8104c28fe --- /dev/null +++ b/queue-5.13/ixgbe-fix-packet-corruption-due-to-missing-dma-sync.patch @@ -0,0 +1,55 @@ +From 09cfae9f13d51700b0fecf591dcd658fc5375428 Mon Sep 17 00:00:00 2001 +From: Markus Boehme +Date: Tue, 20 Jul 2021 16:26:19 -0700 +Subject: ixgbe: Fix packet corruption due to missing DMA sync + +From: Markus Boehme + +commit 09cfae9f13d51700b0fecf591dcd658fc5375428 upstream. 
+ +When receiving a packet with multiple fragments, hardware may still +touch the first fragment until the entire packet has been received. The +driver therefore keeps the first fragment mapped for DMA until end of +packet has been asserted, and delays its dma_sync call until then. + +The driver tries to fit multiple receive buffers on one page. When using +3K receive buffers (e.g. using Jumbo frames and legacy-rx is turned +off/build_skb is being used) on an architecture with 4K pages, the +driver allocates an order 1 compound page and uses one page per receive +buffer. To determine the correct offset for a delayed DMA sync of the +first fragment of a multi-fragment packet, the driver then cannot just +use PAGE_MASK on the DMA address but has to construct a mask based on +the actual size of the backing page. + +Using PAGE_MASK in the 3K RX buffer/4K page architecture configuration +will always sync the first page of a compound page. With the SWIOTLB +enabled this can lead to corrupted packets (zeroed out first fragment, +re-used garbage from another packet) and various consequences, such as +slow/stalling data transfers and connection resets. For example, testing +on a link with MTU exceeding 3058 bytes on a host with SWIOTLB enabled +(e.g. "iommu=soft swiotlb=262144,force") TCP transfers quickly fizzle +out without this patch. + +Cc: stable@vger.kernel.org +Fixes: 0c5661ecc5dd7 ("ixgbe: fix crash in build_skb Rx code path") +Signed-off-by: Markus Boehme +Tested-by: Tony Brelinski +Signed-off-by: Tony Nguyen +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c ++++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +@@ -1825,7 +1825,8 @@ static void ixgbe_dma_sync_frag(struct i + struct sk_buff *skb) + { + if (ring_uses_build_skb(rx_ring)) { +- unsigned long offset = (unsigned long)(skb->data) & ~PAGE_MASK; ++ unsigned long mask = (unsigned long)ixgbe_rx_pg_size(rx_ring) - 1; ++ unsigned long offset = (unsigned long)(skb->data) & mask; + + dma_sync_single_range_for_cpu(rx_ring->dev, + IXGBE_CB(skb)->dma, diff --git a/queue-5.13/media-ngene-fix-out-of-bounds-bug-in-ngene_command_config_free_buf.patch b/queue-5.13/media-ngene-fix-out-of-bounds-bug-in-ngene_command_config_free_buf.patch new file mode 100644 index 00000000000..d233006b8b4 --- /dev/null +++ b/queue-5.13/media-ngene-fix-out-of-bounds-bug-in-ngene_command_config_free_buf.patch @@ -0,0 +1,82 @@ +From 8d4abca95ecc82fc8c41912fa0085281f19cc29f Mon Sep 17 00:00:00 2001 +From: "Gustavo A. R. Silva" +Date: Mon, 19 Apr 2021 18:43:32 -0500 +Subject: media: ngene: Fix out-of-bounds bug in ngene_command_config_free_buf() + +From: Gustavo A. R. Silva + +commit 8d4abca95ecc82fc8c41912fa0085281f19cc29f upstream. 
+ +Fix an 11-year old bug in ngene_command_config_free_buf() while +addressing the following warnings caught with -Warray-bounds: + +arch/alpha/include/asm/string.h:22:16: warning: '__builtin_memcpy' offset [12, 16] from the object at 'com' is out of the bounds of referenced subobject 'config' with type 'unsigned char' at offset 10 [-Warray-bounds] +arch/x86/include/asm/string_32.h:182:25: warning: '__builtin_memcpy' offset [12, 16] from the object at 'com' is out of the bounds of referenced subobject 'config' with type 'unsigned char' at offset 10 [-Warray-bounds] + +The problem is that the original code is trying to copy 6 bytes of +data into a one-byte size member _config_ of the wrong structure +FW_CONFIGURE_BUFFERS, in a single call to memcpy(). This causes a +legitimate compiler warning because memcpy() overruns the length +of &com.cmd.ConfigureBuffers.config. It seems that the right +structure is FW_CONFIGURE_FREE_BUFFERS, instead, because it contains +6 more members apart from the header _hdr_. Also, the name of +the function ngene_command_config_free_buf() suggests that the actual +intention is to ConfigureFreeBuffers, instead of ConfigureBuffers +(which takes place in the function ngene_command_config_buf(), above). + +Fix this by enclosing those 6 members of struct FW_CONFIGURE_FREE_BUFFERS +into new struct config, and use &com.cmd.ConfigureFreeBuffers.config as +the destination address, instead of &com.cmd.ConfigureBuffers.config, +when calling memcpy(). + +This also helps with the ongoing efforts to globally enable +-Warray-bounds and get us closer to being able to tighten the +FORTIFY_SOURCE routines on memcpy(). + +Link: https://github.com/KSPP/linux/issues/109 +Fixes: dae52d009fc9 ("V4L/DVB: ngene: Initial check-in") +Cc: stable@vger.kernel.org +Reported-by: kernel test robot +Reviewed-by: Kees Cook +Signed-off-by: Gustavo A. R. 
Silva +Link: https://lore.kernel.org/linux-hardening/20210420001631.GA45456@embeddedor/ +Signed-off-by: Greg Kroah-Hartman +--- + drivers/media/pci/ngene/ngene-core.c | 2 +- + drivers/media/pci/ngene/ngene.h | 14 ++++++++------ + 2 files changed, 9 insertions(+), 7 deletions(-) + +--- a/drivers/media/pci/ngene/ngene-core.c ++++ b/drivers/media/pci/ngene/ngene-core.c +@@ -385,7 +385,7 @@ static int ngene_command_config_free_buf + + com.cmd.hdr.Opcode = CMD_CONFIGURE_FREE_BUFFER; + com.cmd.hdr.Length = 6; +- memcpy(&com.cmd.ConfigureBuffers.config, config, 6); ++ memcpy(&com.cmd.ConfigureFreeBuffers.config, config, 6); + com.in_len = 6; + com.out_len = 0; + +--- a/drivers/media/pci/ngene/ngene.h ++++ b/drivers/media/pci/ngene/ngene.h +@@ -407,12 +407,14 @@ enum _BUFFER_CONFIGS { + + struct FW_CONFIGURE_FREE_BUFFERS { + struct FW_HEADER hdr; +- u8 UVI1_BufferLength; +- u8 UVI2_BufferLength; +- u8 TVO_BufferLength; +- u8 AUD1_BufferLength; +- u8 AUD2_BufferLength; +- u8 TVA_BufferLength; ++ struct { ++ u8 UVI1_BufferLength; ++ u8 UVI2_BufferLength; ++ u8 TVO_BufferLength; ++ u8 AUD1_BufferLength; ++ u8 AUD2_BufferLength; ++ u8 TVA_BufferLength; ++ } __packed config; + } __attribute__ ((__packed__)); + + struct FW_CONFIGURE_UART { diff --git a/queue-5.13/posix-cpu-timers-fix-rearm-racing-against-process-tick.patch b/queue-5.13/posix-cpu-timers-fix-rearm-racing-against-process-tick.patch new file mode 100644 index 00000000000..85675c183aa --- /dev/null +++ b/queue-5.13/posix-cpu-timers-fix-rearm-racing-against-process-tick.patch @@ -0,0 +1,73 @@ +From 1a3402d93c73bf6bb4df6d7c2aac35abfc3c50e2 Mon Sep 17 00:00:00 2001 +From: Frederic Weisbecker +Date: Thu, 3 Jun 2021 01:15:59 +0200 +Subject: posix-cpu-timers: Fix rearm racing against process tick + +From: Frederic Weisbecker + +commit 1a3402d93c73bf6bb4df6d7c2aac35abfc3c50e2 upstream. 
+ +Since the process wide cputime counter is started locklessly from +posix_cpu_timer_rearm(), it can be concurrently stopped by operations +on other timers from the same thread group, such as in the following +unlucky scenario: + + CPU 0 CPU 1 + ----- ----- + timer_settime(TIMER B) + posix_cpu_timer_rearm(TIMER A) + cpu_clock_sample_group() + (pct->timers_active already true) + + handle_posix_cpu_timers() + check_process_timers() + stop_process_timers() + pct->timers_active = false + arm_timer(TIMER A) + + tick -> run_posix_cpu_timers() + // sees !pct->timers_active, ignore + // our TIMER A + +Fix this with simply locking process wide cputime counting start and +timer arm in the same block. + +Acked-by: Peter Zijlstra (Intel) +Signed-off-by: Frederic Weisbecker +Fixes: 60f2ceaa8111 ("posix-cpu-timers: Remove unnecessary locking around cpu_clock_sample_group") +Cc: stable@vger.kernel.org +Cc: Oleg Nesterov +Cc: Thomas Gleixner +Cc: Ingo Molnar +Cc: Eric W. Biederman +Signed-off-by: Greg Kroah-Hartman +--- + kernel/time/posix-cpu-timers.c | 10 +++++----- + 1 file changed, 5 insertions(+), 5 deletions(-) + +--- a/kernel/time/posix-cpu-timers.c ++++ b/kernel/time/posix-cpu-timers.c +@@ -991,6 +991,11 @@ static void posix_cpu_timer_rearm(struct + if (!p) + goto out; + ++ /* Protect timer list r/w in arm_timer() */ ++ sighand = lock_task_sighand(p, &flags); ++ if (unlikely(sighand == NULL)) ++ goto out; ++ + /* + * Fetch the current sample and update the timer's expiry time. + */ +@@ -1001,11 +1006,6 @@ static void posix_cpu_timer_rearm(struct + + bump_cpu_timer(timer, now); + +- /* Protect timer list r/w in arm_timer() */ +- sighand = lock_task_sighand(p, &flags); +- if (unlikely(sighand == NULL)) +- goto out; +- + /* + * Now re-arm for the new expiry time. 
+ */ diff --git a/queue-5.13/selftest-use-mmap-instead-of-posix_memalign-to-allocate-memory.patch b/queue-5.13/selftest-use-mmap-instead-of-posix_memalign-to-allocate-memory.patch new file mode 100644 index 00000000000..5f9b2e789b7 --- /dev/null +++ b/queue-5.13/selftest-use-mmap-instead-of-posix_memalign-to-allocate-memory.patch @@ -0,0 +1,56 @@ +From 0db282ba2c12c1515d490d14a1ff696643ab0f1b Mon Sep 17 00:00:00 2001 +From: Peter Collingbourne +Date: Fri, 23 Jul 2021 15:50:04 -0700 +Subject: selftest: use mmap instead of posix_memalign to allocate memory + +From: Peter Collingbourne + +commit 0db282ba2c12c1515d490d14a1ff696643ab0f1b upstream. + +This test passes pointers obtained from anon_allocate_area to the +userfaultfd and mremap APIs. This causes a problem if the system +allocator returns tagged pointers because with the tagged address ABI +the kernel rejects tagged addresses passed to these APIs, which would +end up causing the test to fail. To make this test compatible with such +system allocators, stop using the system allocator to allocate memory in +anon_allocate_area, and instead just use mmap. 
+ +Link: https://lkml.kernel.org/r/20210714195437.118982-3-pcc@google.com +Link: https://linux-review.googlesource.com/id/Icac91064fcd923f77a83e8e133f8631c5b8fc241 +Fixes: c47174fc362a ("userfaultfd: selftest") +Co-developed-by: Lokesh Gidra +Signed-off-by: Lokesh Gidra +Signed-off-by: Peter Collingbourne +Reviewed-by: Catalin Marinas +Cc: Vincenzo Frascino +Cc: Dave Martin +Cc: Will Deacon +Cc: Andrea Arcangeli +Cc: Alistair Delva +Cc: William McVicker +Cc: Evgenii Stepanov +Cc: Mitch Phillips +Cc: Andrey Konovalov +Cc: stable@vger.kernel.org [5.4] +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + tools/testing/selftests/vm/userfaultfd.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +--- a/tools/testing/selftests/vm/userfaultfd.c ++++ b/tools/testing/selftests/vm/userfaultfd.c +@@ -197,8 +197,10 @@ static int anon_release_pages(char *rel_ + + static void anon_allocate_area(void **alloc_area) + { +- if (posix_memalign(alloc_area, page_size, nr_pages * page_size)) { +- fprintf(stderr, "out of memory\n"); ++ *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE, ++ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); ++ if (*alloc_area == MAP_FAILED) { ++ fprintf(stderr, "mmap of anonymous memory failed"); + *alloc_area = NULL; + } + } diff --git a/queue-5.13/series b/queue-5.13/series index 832a4633fd4..57d520c5c75 100644 --- a/queue-5.13/series +++ b/queue-5.13/series @@ -174,3 +174,23 @@ usb-dwc2-gadget-fix-sending-zero-length-packet-in-ddma-mode.patch usb-typec-tipd-don-t-block-probing-of-consumer-of-connector-nodes.patch usb-typec-stusb160x-register-role-switch-before-interrupt-registration.patch usb-typec-stusb160x-don-t-block-probing-of-consumer-of-connector-nodes.patch +firmware-efi-tell-memblock-about-efi-iomem-reservations.patch +tracepoints-update-static_call-before-tp_funcs-when-adding-a-tracepoint.patch +tracing-histogram-rename-cpu-to-common_cpu.patch 
+tracing-fix-bug-in-rb_per_cpu_empty-that-might-cause-deadloop.patch +tracing-synthetic-event-field_pos-is-an-index-not-a-boolean.patch +btrfs-check-for-missing-device-in-btrfs_trim_fs.patch +btrfs-fix-unpersisted-i_size-on-fsync-after-expanding-truncate.patch +btrfs-fix-lock-inversion-problem-when-doing-qgroup-extent-tracing.patch +media-ngene-fix-out-of-bounds-bug-in-ngene_command_config_free_buf.patch +ixgbe-fix-packet-corruption-due-to-missing-dma-sync.patch +driver-core-auxiliary-bus-fix-memory-leak-when-driver_register-fail.patch +bus-mhi-pci_generic-apply-no-op-for-wake-using-sideband-wake-boolean.patch +bus-mhi-core-validate-channel-id-when-processing-command-completions.patch +bus-mhi-pci_generic-fix-inbound-ipcr-channel.patch +posix-cpu-timers-fix-rearm-racing-against-process-tick.patch +selftest-use-mmap-instead-of-posix_memalign-to-allocate-memory.patch +io_uring-explicitly-count-entries-for-poll-reqs.patch +io_uring-remove-double-poll-entry-on-arm-failure.patch +io_uring-fix-early-fdput-of-file.patch +userfaultfd-do-not-untag-user-pointers.patch diff --git a/queue-5.13/tracepoints-update-static_call-before-tp_funcs-when-adding-a-tracepoint.patch b/queue-5.13/tracepoints-update-static_call-before-tp_funcs-when-adding-a-tracepoint.patch new file mode 100644 index 00000000000..fae097c6c58 --- /dev/null +++ b/queue-5.13/tracepoints-update-static_call-before-tp_funcs-when-adding-a-tracepoint.patch @@ -0,0 +1,120 @@ +From 352384d5c84ebe40fa77098cc234fe173247d8ef Mon Sep 17 00:00:00 2001 +From: "Steven Rostedt (VMware)" +Date: Thu, 22 Jul 2021 21:52:18 -0400 +Subject: tracepoints: Update static_call before tp_funcs when adding a tracepoint + +From: Steven Rostedt (VMware) + +commit 352384d5c84ebe40fa77098cc234fe173247d8ef upstream. 
+ +Because of the significant overhead that retpolines pose on indirect +calls, the tracepoint code was updated to use the new "static_calls" that +can modify the running code to directly call a function instead of using +an indirect caller, and this function can be changed at runtime. + +In the tracepoint code that calls all the registered callbacks that are +attached to a tracepoint, the following is done: + + it_func_ptr = rcu_dereference_raw((&__tracepoint_##name)->funcs); + if (it_func_ptr) { + __data = (it_func_ptr)->data; + static_call(tp_func_##name)(__data, args); + } + +If there's just a single callback, the static_call is updated to just call +that callback directly. Once another handler is added, then the static +caller is updated to call the iterator, that simply loops over all the +funcs in the array and calls each of the callbacks like the old method +using indirect calling. + +The issue was discovered with a race between updating the funcs array and +updating the static_call. The funcs array was updated first and then the +static_call was updated. This is not an issue as long as the first element +in the old array is the same as the first element in the new array. But +that assumption is incorrect, because callbacks also have a priority +field, and if there's a callback added that has a higher priority than the +callback on the old array, then it will become the first callback in the +new array. This means that it is possible to call the old callback with +the new callback data element, which can cause a kernel panic. 
+ + static_call = callback1() + funcs[] = {callback1,data1}; + callback2 has higher priority than callback1 + + CPU 1 CPU 2 + ----- ----- + + new_funcs = {callback2,data2}, + {callback1,data1} + + rcu_assign_pointer(tp->funcs, new_funcs); + + /* + * Now tp->funcs has the new array + * but the static_call still calls callback1 + */ + + it_func_ptr = tp->funcs [ new_funcs ] + data = it_func_ptr->data [ data2 ] + static_call(callback1, data); + + /* Now callback1 is called with + * callback2's data */ + + [ KERNEL PANIC ] + + update_static_call(iterator); + +To prevent this from happening, always switch the static_call to the +iterator before assigning the tp->funcs to the new array. The iterator will +always properly match the callback with its data. + +To trigger this bug: + + In one terminal: + + while :; do hackbench 50; done + + In another terminal + + echo 1 > /sys/kernel/tracing/events/sched/sched_waking/enable + while :; do + echo 1 > /sys/kernel/tracing/set_event_pid; + sleep 0.5 + echo 0 > /sys/kernel/tracing/set_event_pid; + sleep 0.5 + done + +And it doesn't take long to crash. This is because the set_event_pid adds +a callback to the sched_waking tracepoint with a high priority, which will +be called before the sched_waking trace event callback is called. + +Note, the removal to a single callback updates the array first, before +changing the static_call to single callback, which is the proper order as +the first element in the array is the same as what the static_call is +being changed to. 
+ +Link: https://lore.kernel.org/io-uring/4ebea8f0-58c9-e571-fd30-0ce4f6f09c70@samba.org/ + +Cc: stable@vger.kernel.org +Fixes: d25e37d89dd2f ("tracepoint: Optimize using static_call()") +Reported-by: Stefan Metzmacher +tested-by: Stefan Metzmacher +Signed-off-by: Steven Rostedt (VMware) +Signed-off-by: Greg Kroah-Hartman +--- + kernel/tracepoint.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/kernel/tracepoint.c ++++ b/kernel/tracepoint.c +@@ -299,8 +299,8 @@ static int tracepoint_add_func(struct tr + * a pointer to it. This array is referenced by __DO_TRACE from + * include/linux/tracepoint.h using rcu_dereference_sched(). + */ +- rcu_assign_pointer(tp->funcs, tp_funcs); + tracepoint_update_call(tp, tp_funcs, false); ++ rcu_assign_pointer(tp->funcs, tp_funcs); + static_key_enable(&tp->key); + + release_probes(old); diff --git a/queue-5.13/tracing-fix-bug-in-rb_per_cpu_empty-that-might-cause-deadloop.patch b/queue-5.13/tracing-fix-bug-in-rb_per_cpu_empty-that-might-cause-deadloop.patch new file mode 100644 index 00000000000..28384b0bdc3 --- /dev/null +++ b/queue-5.13/tracing-fix-bug-in-rb_per_cpu_empty-that-might-cause-deadloop.patch @@ -0,0 +1,102 @@ +From 67f0d6d9883c13174669f88adac4f0ee656cc16a Mon Sep 17 00:00:00 2001 +From: Haoran Luo +Date: Wed, 21 Jul 2021 14:12:07 +0000 +Subject: tracing: Fix bug in rb_per_cpu_empty() that might cause deadloop. + +From: Haoran Luo + +commit 67f0d6d9883c13174669f88adac4f0ee656cc16a upstream. + +The "rb_per_cpu_empty()" misinterpret the condition (as not-empty) when +"head_page" and "commit_page" of "struct ring_buffer_per_cpu" points to +the same buffer page, whose "buffer_data_page" is empty and "read" field +is non-zero. + +An error scenario could be constructed as followed (kernel perspective): + +1. All pages in the buffer has been accessed by reader(s) so that all of +them will have non-zero "read" field. + +2. 
Read and clear all buffer pages so that "rb_num_of_entries()" will +return 0 rendering there's no more data to read. It is also required +that the "read_page", "commit_page" and "tail_page" point to the same +page, while "head_page" is the next page of them. + +3. Invoke "ring_buffer_lock_reserve()" with a large enough "length" +so that it shoots past the end of the current tail buffer page. Now the +"head_page", "commit_page" and "tail_page" point to the same page. + +4. Discard current event with "ring_buffer_discard_commit()", so that +"head_page", "commit_page" and "tail_page" point to a page whose buffer +data page is now empty. + +When the error scenario has been constructed, "tracing_read_pipe" will +be trapped inside a deadloop: "trace_empty()" returns 0 since +"rb_per_cpu_empty()" returns 0 when it hits the CPU containing such +constructed ring buffer. Then "trace_find_next_entry_inc()" always +returns NULL since "rb_num_of_entries()" reports there's no more entry +to read. Finally "trace_seq_to_user()" returns "-EBUSY" spanking +"tracing_read_pipe" back to the start of the "waitagain" loop. + +I've also written a proof-of-concept script to construct the scenario +and trigger the bug automatically; you can use it to trace and validate +my reasoning above: + + https://github.com/aegistudio/RingBufferDetonator.git + +Tests have been carried out on linux kernel 5.14-rc2 +(2734d6c1b1a089fb593ef6a23d4b70903526fe0c), my fixed version +of kernel (for testing whether my update fixes the bug) and +some older kernels (for range of affected kernels). Test results are +also attached to the proof-of-concept repository. 
+ +Link: https://lore.kernel.org/linux-trace-devel/YPaNxsIlb2yjSi5Y@aegistudio/ +Link: https://lore.kernel.org/linux-trace-devel/YPgrN85WL9VyrZ55@aegistudio + +Cc: stable@vger.kernel.org +Fixes: bf41a158cacba ("ring-buffer: make reentrant") +Suggested-by: Linus Torvalds +Signed-off-by: Haoran Luo +Signed-off-by: Steven Rostedt (VMware) +Signed-off-by: Greg Kroah-Hartman +--- + kernel/trace/ring_buffer.c | 28 ++++++++++++++++++++++++---- + 1 file changed, 24 insertions(+), 4 deletions(-) + +--- a/kernel/trace/ring_buffer.c ++++ b/kernel/trace/ring_buffer.c +@@ -3880,10 +3880,30 @@ static bool rb_per_cpu_empty(struct ring + if (unlikely(!head)) + return true; + +- return reader->read == rb_page_commit(reader) && +- (commit == reader || +- (commit == head && +- head->read == rb_page_commit(commit))); ++ /* Reader should exhaust content in reader page */ ++ if (reader->read != rb_page_commit(reader)) ++ return false; ++ ++ /* ++ * If writers are committing on the reader page, knowing all ++ * committed content has been read, the ring buffer is empty. ++ */ ++ if (commit == reader) ++ return true; ++ ++ /* ++ * If writers are committing on a page other than reader page ++ * and head page, there should always be content to read. ++ */ ++ if (commit != head) ++ return false; ++ ++ /* ++ * Writers are committing on the head page, we just need ++ * to care about there're committed data, and the reader will ++ * swap reader page with head page when it is to read data. 
++ */ ++ return rb_page_commit(commit) == 0; + } + + /** diff --git a/queue-5.13/tracing-histogram-rename-cpu-to-common_cpu.patch b/queue-5.13/tracing-histogram-rename-cpu-to-common_cpu.patch new file mode 100644 index 00000000000..c993cb91e8c --- /dev/null +++ b/queue-5.13/tracing-histogram-rename-cpu-to-common_cpu.patch @@ -0,0 +1,152 @@ +From 1e3bac71c5053c99d438771fc9fa5082ae5d90aa Mon Sep 17 00:00:00 2001 +From: "Steven Rostedt (VMware)" +Date: Wed, 21 Jul 2021 11:00:53 -0400 +Subject: tracing/histogram: Rename "cpu" to "common_cpu" + +From: Steven Rostedt (VMware) + +commit 1e3bac71c5053c99d438771fc9fa5082ae5d90aa upstream. + +Currently the histogram logic allows the user to write "cpu" in as an +event field, and it will record the CPU that the event happened on. + +The problem with this is that there's a lot of events that have "cpu" +as a real field, and using "cpu" as the CPU it ran on, makes it +impossible to run histograms on the "cpu" field of events. + +For example, if I want to have a histogram on the count of the +workqueue_queue_work event on its cpu field, running: + + ># echo 'hist:keys=cpu' > events/workqueue/workqueue_queue_work/trigger + +Gives a misleading and wrong result. + +Change the command to "common_cpu" as no event should have "common_*" +fields as that's a reserved name for fields used by all events. And +this makes sense here as common_cpu would be a field used by all events. 
+ +Now we can even do: + + ># echo 'hist:keys=common_cpu,cpu if cpu < 100' > events/workqueue/workqueue_queue_work/trigger + ># cat events/workqueue/workqueue_queue_work/hist + # event histogram + # + # trigger info: hist:keys=common_cpu,cpu:vals=hitcount:sort=hitcount:size=2048 if cpu < 100 [active] + # + + { common_cpu: 0, cpu: 2 } hitcount: 1 + { common_cpu: 0, cpu: 4 } hitcount: 1 + { common_cpu: 7, cpu: 7 } hitcount: 1 + { common_cpu: 0, cpu: 7 } hitcount: 1 + { common_cpu: 0, cpu: 1 } hitcount: 1 + { common_cpu: 0, cpu: 6 } hitcount: 2 + { common_cpu: 0, cpu: 5 } hitcount: 2 + { common_cpu: 1, cpu: 1 } hitcount: 4 + { common_cpu: 6, cpu: 6 } hitcount: 4 + { common_cpu: 5, cpu: 5 } hitcount: 14 + { common_cpu: 4, cpu: 4 } hitcount: 26 + { common_cpu: 0, cpu: 0 } hitcount: 39 + { common_cpu: 2, cpu: 2 } hitcount: 184 + +Now for backward compatibility, I added a trick. If "cpu" is used, and +the field is not found, it will fall back to "common_cpu" and work as +it did before. This way, it will still work for old programs that use +"cpu" to get the actual CPU, but if the event has a "cpu" as a field, it +will get that event's "cpu" field, which is probably what it wants +anyway. + +I updated the tracefs/README to include documentation about both the +common_timestamp and the common_cpu. This way, if that text is present in +the README, then an application can know that common_cpu is supported over +just plain "cpu". 
+ +Link: https://lkml.kernel.org/r/20210721110053.26b4f641@oasis.local.home + +Cc: Namhyung Kim +Cc: Ingo Molnar +Cc: Andrew Morton +Cc: stable@vger.kernel.org +Fixes: 8b7622bf94a44 ("tracing: Add cpu field for hist triggers") +Reviewed-by: Tom Zanussi +Reviewed-by: Masami Hiramatsu +Signed-off-by: Steven Rostedt (VMware) +Signed-off-by: Greg Kroah-Hartman +--- + Documentation/trace/histogram.rst | 2 +- + kernel/trace/trace.c | 4 ++++ + kernel/trace/trace_events_hist.c | 22 ++++++++++++++++------ + 3 files changed, 21 insertions(+), 7 deletions(-) + +--- a/Documentation/trace/histogram.rst ++++ b/Documentation/trace/histogram.rst +@@ -191,7 +191,7 @@ Documentation written by Tom Zanussi + with the event, in nanoseconds. May be + modified by .usecs to have timestamps + interpreted as microseconds. +- cpu int the cpu on which the event occurred. ++ common_cpu int the cpu on which the event occurred. + ====================== ==== ======================================= + + Extended error information +--- a/kernel/trace/trace.c ++++ b/kernel/trace/trace.c +@@ -5565,6 +5565,10 @@ static const char readme_msg[] = + "\t [:name=histname1]\n" + "\t [:.]\n" + "\t [if ]\n\n" ++ "\t Note, special fields can be used as well:\n" ++ "\t common_timestamp - to record current timestamp\n" ++ "\t common_cpu - to record the CPU the event happened on\n" ++ "\n" + "\t When a matching event is hit, an entry is added to a hash\n" + "\t table using the key(s) and value(s) named, and the value of a\n" + "\t sum called 'hitcount' is incremented. 
Keys and values\n" +--- a/kernel/trace/trace_events_hist.c ++++ b/kernel/trace/trace_events_hist.c +@@ -1111,7 +1111,7 @@ static const char *hist_field_name(struc + field->flags & HIST_FIELD_FL_ALIAS) + field_name = hist_field_name(field->operands[0], ++level); + else if (field->flags & HIST_FIELD_FL_CPU) +- field_name = "cpu"; ++ field_name = "common_cpu"; + else if (field->flags & HIST_FIELD_FL_EXPR || + field->flags & HIST_FIELD_FL_VAR_REF) { + if (field->system) { +@@ -1991,14 +1991,24 @@ parse_field(struct hist_trigger_data *hi + hist_data->enable_timestamps = true; + if (*flags & HIST_FIELD_FL_TIMESTAMP_USECS) + hist_data->attrs->ts_in_usecs = true; +- } else if (strcmp(field_name, "cpu") == 0) ++ } else if (strcmp(field_name, "common_cpu") == 0) + *flags |= HIST_FIELD_FL_CPU; + else { + field = trace_find_event_field(file->event_call, field_name); + if (!field || !field->size) { +- hist_err(tr, HIST_ERR_FIELD_NOT_FOUND, errpos(field_name)); +- field = ERR_PTR(-EINVAL); +- goto out; ++ /* ++ * For backward compatibility, if field_name ++ * was "cpu", then we treat this the same as ++ * common_cpu. 
++ */ ++ if (strcmp(field_name, "cpu") == 0) { ++ *flags |= HIST_FIELD_FL_CPU; ++ } else { ++ hist_err(tr, HIST_ERR_FIELD_NOT_FOUND, ++ errpos(field_name)); ++ field = ERR_PTR(-EINVAL); ++ goto out; ++ } + } + } + out: +@@ -5085,7 +5095,7 @@ static void hist_field_print(struct seq_ + seq_printf(m, "%s=", hist_field->var.name); + + if (hist_field->flags & HIST_FIELD_FL_CPU) +- seq_puts(m, "cpu"); ++ seq_puts(m, "common_cpu"); + else if (field_name) { + if (hist_field->flags & HIST_FIELD_FL_VAR_REF || + hist_field->flags & HIST_FIELD_FL_ALIAS) diff --git a/queue-5.13/tracing-synthetic-event-field_pos-is-an-index-not-a-boolean.patch b/queue-5.13/tracing-synthetic-event-field_pos-is-an-index-not-a-boolean.patch new file mode 100644 index 00000000000..1a612ed0b60 --- /dev/null +++ b/queue-5.13/tracing-synthetic-event-field_pos-is-an-index-not-a-boolean.patch @@ -0,0 +1,98 @@ +From 3b13911a2fd0dd0146c9777a254840c5466cf120 Mon Sep 17 00:00:00 2001 +From: "Steven Rostedt (VMware)" +Date: Wed, 21 Jul 2021 19:10:08 -0400 +Subject: tracing: Synthetic event field_pos is an index not a boolean + +From: Steven Rostedt (VMware) + +commit 3b13911a2fd0dd0146c9777a254840c5466cf120 upstream. 
+ +Performing the following: + + ># echo 'wakeup_lat s32 pid; u64 delta; char wake_comm[]' > synthetic_events + ># echo 'hist:keys=pid:__arg__1=common_timestamp.usecs' > events/sched/sched_waking/trigger + ># echo 'hist:keys=next_pid:pid=next_pid,delta=common_timestamp.usecs-$__arg__1:onmatch(sched.sched_waking).trace(wakeup_lat,$pid,$delta,prev_comm)'\ + > events/sched/sched_switch/trigger + ># echo 1 > events/synthetic/enable + +Crashed the kernel: + + BUG: kernel NULL pointer dereference, address: 000000000000001b + #PF: supervisor read access in kernel mode + #PF: error_code(0x0000) - not-present page + PGD 0 P4D 0 + Oops: 0000 [#1] PREEMPT SMP + CPU: 7 PID: 0 Comm: swapper/7 Not tainted 5.13.0-rc5-test+ #104 + Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 v03.03 07/14/2016 + RIP: 0010:strlen+0x0/0x20 + Code: f6 82 80 2b 0b bc 20 74 11 0f b6 50 01 48 83 c0 01 f6 82 80 2b 0b bc + 20 75 ef c3 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 <80> 3f 00 74 10 + 48 89 f8 48 83 c0 01 80 38 9 f8 c3 31 + RSP: 0018:ffffaa75000d79d0 EFLAGS: 00010046 + RAX: 0000000000000002 RBX: ffff9cdb55575270 RCX: 0000000000000000 + RDX: ffff9cdb58c7a320 RSI: ffffaa75000d7b40 RDI: 000000000000001b + RBP: ffffaa75000d7b40 R08: ffff9cdb40a4f010 R09: ffffaa75000d7ab8 + R10: ffff9cdb4398c700 R11: 0000000000000008 R12: ffff9cdb58c7a320 + R13: ffff9cdb55575270 R14: ffff9cdb58c7a000 R15: 0000000000000018 + FS: 0000000000000000(0000) GS:ffff9cdb5aa00000(0000) knlGS:0000000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + CR2: 000000000000001b CR3: 00000000c0612006 CR4: 00000000001706e0 + Call Trace: + trace_event_raw_event_synth+0x90/0x1d0 + action_trace+0x5b/0x70 + event_hist_trigger+0x4bd/0x4e0 + ? cpumask_next_and+0x20/0x30 + ? update_sd_lb_stats.constprop.0+0xf6/0x840 + ? __lock_acquire.constprop.0+0x125/0x550 + ? find_held_lock+0x32/0x90 + ? sched_clock_cpu+0xe/0xd0 + ? lock_release+0x155/0x440 + ? update_load_avg+0x8c/0x6f0 + ? 
enqueue_entity+0x18a/0x920 + ? __rb_reserve_next+0xe5/0x460 + ? ring_buffer_lock_reserve+0x12a/0x3f0 + event_triggers_call+0x52/0xe0 + trace_event_buffer_commit+0x1ae/0x240 + trace_event_raw_event_sched_switch+0x114/0x170 + __traceiter_sched_switch+0x39/0x50 + __schedule+0x431/0xb00 + schedule_idle+0x28/0x40 + do_idle+0x198/0x2e0 + cpu_startup_entry+0x19/0x20 + secondary_startup_64_no_verify+0xc2/0xcb + +The reason is that the dynamic events array keeps track of the field +position of the fields array, via the field_pos variable in the +synth_field structure. Unfortunately, that field is a boolean for some +reason, which means any field_pos greater than 1 will be a bug (in this +case it was 2). + +Link: https://lkml.kernel.org/r/20210721191008.638bce34@oasis.local.home + +Cc: Masami Hiramatsu +Cc: Namhyung Kim +Cc: Ingo Molnar +Cc: Andrew Morton +Cc: stable@vger.kernel.org +Fixes: bd82631d7ccdc ("tracing: Add support for dynamic strings to synthetic events") +Reviewed-by: Tom Zanussi +Signed-off-by: Steven Rostedt (VMware) +Signed-off-by: Greg Kroah-Hartman +--- + kernel/trace/trace_synth.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/kernel/trace/trace_synth.h ++++ b/kernel/trace/trace_synth.h +@@ -14,10 +14,10 @@ struct synth_field { + char *name; + size_t size; + unsigned int offset; ++ unsigned int field_pos; + bool is_signed; + bool is_string; + bool is_dynamic; +- bool field_pos; + }; + + struct synth_event { diff --git a/queue-5.13/userfaultfd-do-not-untag-user-pointers.patch b/queue-5.13/userfaultfd-do-not-untag-user-pointers.patch new file mode 100644 index 00000000000..895ec361bd0 --- /dev/null +++ b/queue-5.13/userfaultfd-do-not-untag-user-pointers.patch @@ -0,0 +1,205 @@ +From e71e2ace5721a8b921dca18b045069e7bb411277 Mon Sep 17 00:00:00 2001 +From: Peter Collingbourne +Date: Fri, 23 Jul 2021 15:50:01 -0700 +Subject: userfaultfd: do not untag user pointers + +From: Peter Collingbourne + +commit e71e2ace5721a8b921dca18b045069e7bb411277 
upstream. + +Patch series "userfaultfd: do not untag user pointers", v5. + +If a user program uses userfaultfd on ranges of heap memory, it may end +up passing a tagged pointer to the kernel in the range.start field of +the UFFDIO_REGISTER ioctl. This can happen when using an MTE-capable +allocator, or on Android if using the Tagged Pointers feature for MTE +readiness [1]. + +When a fault subsequently occurs, the tag is stripped from the fault +address returned to the application in the fault.address field of struct +uffd_msg. However, from the application's perspective, the tagged +address *is* the memory address, so if the application is unaware of +memory tags, it may get confused by receiving an address that is, from +its point of view, outside of the bounds of the allocation. We observed +this behavior in the kselftest for userfaultfd [2] but other +applications could have the same problem. + +Address this by not untagging pointers passed to the userfaultfd ioctls. +Instead, let the system call fail. Also change the kselftest to use +mmap so that it doesn't encounter this problem. + +[1] https://source.android.com/devices/tech/debug/tagged-pointers +[2] tools/testing/selftests/vm/userfaultfd.c + +This patch (of 2): + +Do not untag pointers passed to the userfaultfd ioctls. Instead, let +the system call fail. This will provide an early indication of problems +with tag-unaware userspace code instead of letting the code get confused +later, and is consistent with how we decided to handle brk/mmap/mremap +in commit dcde237319e6 ("mm: Avoid creating virtual address aliases in +brk()/mmap()/mremap()"), as well as being consistent with the existing +tagged address ABI documentation relating to how ioctl arguments are +handled. + +The code change is a revert of commit 7d0325749a6c ("userfaultfd: untag +user pointers") plus some fixups to some additional calls to +validate_range that have appeared since then. 
+ +[1] https://source.android.com/devices/tech/debug/tagged-pointers +[2] tools/testing/selftests/vm/userfaultfd.c + +Link: https://lkml.kernel.org/r/20210714195437.118982-1-pcc@google.com +Link: https://lkml.kernel.org/r/20210714195437.118982-2-pcc@google.com +Link: https://linux-review.googlesource.com/id/I761aa9f0344454c482b83fcfcce547db0a25501b +Fixes: 63f0c6037965 ("arm64: Introduce prctl() options to control the tagged user addresses ABI") +Signed-off-by: Peter Collingbourne +Reviewed-by: Andrey Konovalov +Reviewed-by: Catalin Marinas +Cc: Alistair Delva +Cc: Andrea Arcangeli +Cc: Dave Martin +Cc: Evgenii Stepanov +Cc: Lokesh Gidra +Cc: Mitch Phillips +Cc: Vincenzo Frascino +Cc: Will Deacon +Cc: William McVicker +Cc: [5.4] +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman +--- + Documentation/arm64/tagged-address-abi.rst | 26 ++++++++++++++++++-------- + fs/userfaultfd.c | 26 ++++++++++++-------------- + 2 files changed, 30 insertions(+), 22 deletions(-) + +--- a/Documentation/arm64/tagged-address-abi.rst ++++ b/Documentation/arm64/tagged-address-abi.rst +@@ -45,14 +45,24 @@ how the user addresses are used by the k + + 1. User addresses not accessed by the kernel but used for address space + management (e.g. ``mprotect()``, ``madvise()``). The use of valid +- tagged pointers in this context is allowed with the exception of +- ``brk()``, ``mmap()`` and the ``new_address`` argument to +- ``mremap()`` as these have the potential to alias with existing +- user addresses. +- +- NOTE: This behaviour changed in v5.6 and so some earlier kernels may +- incorrectly accept valid tagged pointers for the ``brk()``, +- ``mmap()`` and ``mremap()`` system calls. ++ tagged pointers in this context is allowed with these exceptions: ++ ++ - ``brk()``, ``mmap()`` and the ``new_address`` argument to ++ ``mremap()`` as these have the potential to alias with existing ++ user addresses. 
++ ++ NOTE: This behaviour changed in v5.6 and so some earlier kernels may ++ incorrectly accept valid tagged pointers for the ``brk()``, ++ ``mmap()`` and ``mremap()`` system calls. ++ ++ - The ``range.start``, ``start`` and ``dst`` arguments to the ++ ``UFFDIO_*`` ``ioctl()``s used on a file descriptor obtained from ++ ``userfaultfd()``, as fault addresses subsequently obtained by reading ++ the file descriptor will be untagged, which may otherwise confuse ++ tag-unaware programs. ++ ++ NOTE: This behaviour changed in v5.14 and so some earlier kernels may ++ incorrectly accept valid tagged pointers for this system call. + + 2. User addresses accessed by the kernel (e.g. ``write()``). This ABI + relaxation is disabled by default and the application thread needs to +--- a/fs/userfaultfd.c ++++ b/fs/userfaultfd.c +@@ -1236,23 +1236,21 @@ static __always_inline void wake_userfau + } + + static __always_inline int validate_range(struct mm_struct *mm, +- __u64 *start, __u64 len) ++ __u64 start, __u64 len) + { + __u64 task_size = mm->task_size; + +- *start = untagged_addr(*start); +- +- if (*start & ~PAGE_MASK) ++ if (start & ~PAGE_MASK) + return -EINVAL; + if (len & ~PAGE_MASK) + return -EINVAL; + if (!len) + return -EINVAL; +- if (*start < mmap_min_addr) ++ if (start < mmap_min_addr) + return -EINVAL; +- if (*start >= task_size) ++ if (start >= task_size) + return -EINVAL; +- if (len > task_size - *start) ++ if (len > task_size - start) + return -EINVAL; + return 0; + } +@@ -1313,7 +1311,7 @@ static int userfaultfd_register(struct u + vm_flags |= VM_UFFD_MINOR; + } + +- ret = validate_range(mm, &uffdio_register.range.start, ++ ret = validate_range(mm, uffdio_register.range.start, + uffdio_register.range.len); + if (ret) + goto out; +@@ -1519,7 +1517,7 @@ static int userfaultfd_unregister(struct + if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister))) + goto out; + +- ret = validate_range(mm, &uffdio_unregister.start, ++ ret = validate_range(mm, 
uffdio_unregister.start, + uffdio_unregister.len); + if (ret) + goto out; +@@ -1668,7 +1666,7 @@ static int userfaultfd_wake(struct userf + if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake))) + goto out; + +- ret = validate_range(ctx->mm, &uffdio_wake.start, uffdio_wake.len); ++ ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len); + if (ret) + goto out; + +@@ -1708,7 +1706,7 @@ static int userfaultfd_copy(struct userf + sizeof(uffdio_copy)-sizeof(__s64))) + goto out; + +- ret = validate_range(ctx->mm, &uffdio_copy.dst, uffdio_copy.len); ++ ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len); + if (ret) + goto out; + /* +@@ -1765,7 +1763,7 @@ static int userfaultfd_zeropage(struct u + sizeof(uffdio_zeropage)-sizeof(__s64))) + goto out; + +- ret = validate_range(ctx->mm, &uffdio_zeropage.range.start, ++ ret = validate_range(ctx->mm, uffdio_zeropage.range.start, + uffdio_zeropage.range.len); + if (ret) + goto out; +@@ -1815,7 +1813,7 @@ static int userfaultfd_writeprotect(stru + sizeof(struct uffdio_writeprotect))) + return -EFAULT; + +- ret = validate_range(ctx->mm, &uffdio_wp.range.start, ++ ret = validate_range(ctx->mm, uffdio_wp.range.start, + uffdio_wp.range.len); + if (ret) + return ret; +@@ -1863,7 +1861,7 @@ static int userfaultfd_continue(struct u + sizeof(uffdio_continue) - (sizeof(__s64)))) + goto out; + +- ret = validate_range(ctx->mm, &uffdio_continue.range.start, ++ ret = validate_range(ctx->mm, uffdio_continue.range.start, + uffdio_continue.range.len); + if (ret) + goto out;