--- /dev/null
+From 16a200f66ede3f9afa2e51d90ade017aaa18d213 Mon Sep 17 00:00:00 2001
+From: Anand Jain <anand.jain@oracle.com>
+Date: Sun, 4 Jul 2021 19:14:39 +0800
+Subject: btrfs: check for missing device in btrfs_trim_fs
+
+From: Anand Jain <anand.jain@oracle.com>
+
+commit 16a200f66ede3f9afa2e51d90ade017aaa18d213 upstream.
+
+A fstrim on a degraded raid1 can trigger the following null pointer
+dereference:
+
+ BTRFS info (device loop0): allowing degraded mounts
+ BTRFS info (device loop0): disk space caching is enabled
+ BTRFS info (device loop0): has skinny extents
+ BTRFS warning (device loop0): devid 2 uuid 97ac16f7-e14d-4db1-95bc-3d489b424adb is missing
+ BTRFS warning (device loop0): devid 2 uuid 97ac16f7-e14d-4db1-95bc-3d489b424adb is missing
+ BTRFS info (device loop0): enabling ssd optimizations
+ BUG: kernel NULL pointer dereference, address: 0000000000000620
+ PGD 0 P4D 0
+ Oops: 0000 [#1] SMP NOPTI
+ CPU: 0 PID: 4574 Comm: fstrim Not tainted 5.13.0-rc7+ #31
+ Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
+ RIP: 0010:btrfs_trim_fs+0x199/0x4a0 [btrfs]
+ RSP: 0018:ffff959541797d28 EFLAGS: 00010293
+ RAX: 0000000000000000 RBX: ffff946f84eca508 RCX: a7a67937adff8608
+ RDX: ffff946e8122d000 RSI: 0000000000000000 RDI: ffffffffc02fdbf0
+ RBP: ffff946ea4615000 R08: 0000000000000001 R09: 0000000000000000
+ R10: 0000000000000000 R11: ffff946e8122d960 R12: 0000000000000000
+ R13: ffff959541797db8 R14: ffff946e8122d000 R15: ffff959541797db8
+ FS: 00007f55917a5080(0000) GS:ffff946f9bc00000(0000) knlGS:0000000000000000
+ CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+ CR2: 0000000000000620 CR3: 000000002d2c8001 CR4: 00000000000706f0
+ Call Trace:
+ btrfs_ioctl_fitrim+0x167/0x260 [btrfs]
+ btrfs_ioctl+0x1c00/0x2fe0 [btrfs]
+ ? selinux_file_ioctl+0x140/0x240
+ ? syscall_trace_enter.constprop.0+0x188/0x240
+ ? __x64_sys_ioctl+0x83/0xb0
+ __x64_sys_ioctl+0x83/0xb0
+
+Reproducer:
+
+ $ mkfs.btrfs -fq -d raid1 -m raid1 /dev/loop0 /dev/loop1
+ $ mount /dev/loop0 /btrfs
+ $ umount /btrfs
+ $ btrfs dev scan --forget
+ $ mount -o degraded /dev/loop0 /btrfs
+
+ $ fstrim /btrfs
+
+The reason is that we call btrfs_trim_free_extents() for the missing
+device, which dereferences device->bdev (NULL for a missing device) to
+check whether the device supports discard.
+
+The fix is to check whether the device is missing before calling
+btrfs_trim_free_extents().
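+
+A condensed sketch of the change (the full hunk is in the diff below):
+
+  list_for_each_entry(device, devices, dev_list) {
+          /* a missing device has no bdev, so it cannot be trimmed */
+          if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
+                  continue;
+
+          ret = btrfs_trim_free_extents(device, &group_trimmed);
+  }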
+
+CC: stable@vger.kernel.org # 5.4+
+Reviewed-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: Anand Jain <anand.jain@oracle.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/extent-tree.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/fs/btrfs/extent-tree.c
++++ b/fs/btrfs/extent-tree.c
+@@ -6034,6 +6034,9 @@ int btrfs_trim_fs(struct btrfs_fs_info *
+ mutex_lock(&fs_info->fs_devices->device_list_mutex);
+ devices = &fs_info->fs_devices->devices;
+ list_for_each_entry(device, devices, dev_list) {
++ if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
++ continue;
++
+ ret = btrfs_trim_free_extents(device, &group_trimmed);
+ if (ret) {
+ dev_failed++;
--- /dev/null
+From 8949b9a114019b03fbd0d03d65b8647cba4feef3 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Wed, 21 Jul 2021 17:31:48 +0100
+Subject: btrfs: fix lock inversion problem when doing qgroup extent tracing
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 8949b9a114019b03fbd0d03d65b8647cba4feef3 upstream.
+
+At btrfs_qgroup_trace_extent_post() we call btrfs_find_all_roots() with a
+NULL value as the transaction handle argument, which makes that function
+take the commit_root_sem semaphore, which is necessary when we don't hold
+a transaction handle or any other mechanism to prevent a transaction
+commit from wiping out commit roots.
+
+However btrfs_qgroup_trace_extent_post() can be called in a context where
+we are holding a write lock on an extent buffer from a subvolume tree,
+namely from btrfs_truncate_inode_items(), called either during truncate
+or unlink operations. In this case we end up with a lock inversion problem
+because the commit_root_sem is a higher level lock, always supposed to be
+acquired before locking any extent buffer.
+
+Lockdep detects this lock inversion problem since we switched the extent
+buffer locks from custom locks to semaphores, and when running btrfs/158
+from fstests, it reported the following trace:
+
+[ 9057.626435] ======================================================
+[ 9057.627541] WARNING: possible circular locking dependency detected
+[ 9057.628334] 5.14.0-rc2-btrfs-next-93 #1 Not tainted
+[ 9057.628961] ------------------------------------------------------
+[ 9057.629867] kworker/u16:4/30781 is trying to acquire lock:
+[ 9057.630824] ffff8e2590f58760 (btrfs-tree-00){++++}-{3:3}, at: __btrfs_tree_read_lock+0x24/0x110 [btrfs]
+[ 9057.632542]
+ but task is already holding lock:
+[ 9057.633551] ffff8e25582d4b70 (&fs_info->commit_root_sem){++++}-{3:3}, at: iterate_extent_inodes+0x10b/0x280 [btrfs]
+[ 9057.635255]
+ which lock already depends on the new lock.
+
+[ 9057.636292]
+ the existing dependency chain (in reverse order) is:
+[ 9057.637240]
+ -> #1 (&fs_info->commit_root_sem){++++}-{3:3}:
+[ 9057.638138] down_read+0x46/0x140
+[ 9057.638648] btrfs_find_all_roots+0x41/0x80 [btrfs]
+[ 9057.639398] btrfs_qgroup_trace_extent_post+0x37/0x70 [btrfs]
+[ 9057.640283] btrfs_add_delayed_data_ref+0x418/0x490 [btrfs]
+[ 9057.641114] btrfs_free_extent+0x35/0xb0 [btrfs]
+[ 9057.641819] btrfs_truncate_inode_items+0x424/0xf70 [btrfs]
+[ 9057.642643] btrfs_evict_inode+0x454/0x4f0 [btrfs]
+[ 9057.643418] evict+0xcf/0x1d0
+[ 9057.643895] do_unlinkat+0x1e9/0x300
+[ 9057.644525] do_syscall_64+0x3b/0xc0
+[ 9057.645110] entry_SYSCALL_64_after_hwframe+0x44/0xae
+[ 9057.645835]
+ -> #0 (btrfs-tree-00){++++}-{3:3}:
+[ 9057.646600] __lock_acquire+0x130e/0x2210
+[ 9057.647248] lock_acquire+0xd7/0x310
+[ 9057.647773] down_read_nested+0x4b/0x140
+[ 9057.648350] __btrfs_tree_read_lock+0x24/0x110 [btrfs]
+[ 9057.649175] btrfs_read_lock_root_node+0x31/0x40 [btrfs]
+[ 9057.650010] btrfs_search_slot+0x537/0xc00 [btrfs]
+[ 9057.650849] scrub_print_warning_inode+0x89/0x370 [btrfs]
+[ 9057.651733] iterate_extent_inodes+0x1e3/0x280 [btrfs]
+[ 9057.652501] scrub_print_warning+0x15d/0x2f0 [btrfs]
+[ 9057.653264] scrub_handle_errored_block.isra.0+0x135f/0x1640 [btrfs]
+[ 9057.654295] scrub_bio_end_io_worker+0x101/0x2e0 [btrfs]
+[ 9057.655111] btrfs_work_helper+0xf8/0x400 [btrfs]
+[ 9057.655831] process_one_work+0x247/0x5a0
+[ 9057.656425] worker_thread+0x55/0x3c0
+[ 9057.656993] kthread+0x155/0x180
+[ 9057.657494] ret_from_fork+0x22/0x30
+[ 9057.658030]
+ other info that might help us debug this:
+
+[ 9057.659064] Possible unsafe locking scenario:
+
+[ 9057.659824] CPU0 CPU1
+[ 9057.660402] ---- ----
+[ 9057.660988] lock(&fs_info->commit_root_sem);
+[ 9057.661581] lock(btrfs-tree-00);
+[ 9057.662348] lock(&fs_info->commit_root_sem);
+[ 9057.663254] lock(btrfs-tree-00);
+[ 9057.663690]
+ *** DEADLOCK ***
+
+[ 9057.664437] 4 locks held by kworker/u16:4/30781:
+[ 9057.665023] #0: ffff8e25922a1148 ((wq_completion)btrfs-scrub){+.+.}-{0:0}, at: process_one_work+0x1c7/0x5a0
+[ 9057.666260] #1: ffffabb3451ffe70 ((work_completion)(&work->normal_work)){+.+.}-{0:0}, at: process_one_work+0x1c7/0x5a0
+[ 9057.667639] #2: ffff8e25922da198 (&ret->mutex){+.+.}-{3:3}, at: scrub_handle_errored_block.isra.0+0x5d2/0x1640 [btrfs]
+[ 9057.669017] #3: ffff8e25582d4b70 (&fs_info->commit_root_sem){++++}-{3:3}, at: iterate_extent_inodes+0x10b/0x280 [btrfs]
+[ 9057.670408]
+ stack backtrace:
+[ 9057.670976] CPU: 7 PID: 30781 Comm: kworker/u16:4 Not tainted 5.14.0-rc2-btrfs-next-93 #1
+[ 9057.672030] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.14.0-0-g155821a1990b-prebuilt.qemu.org 04/01/2014
+[ 9057.673492] Workqueue: btrfs-scrub btrfs_work_helper [btrfs]
+[ 9057.674258] Call Trace:
+[ 9057.674588] dump_stack_lvl+0x57/0x72
+[ 9057.675083] check_noncircular+0xf3/0x110
+[ 9057.675611] __lock_acquire+0x130e/0x2210
+[ 9057.676132] lock_acquire+0xd7/0x310
+[ 9057.676605] ? __btrfs_tree_read_lock+0x24/0x110 [btrfs]
+[ 9057.677313] ? lock_is_held_type+0xe8/0x140
+[ 9057.677849] down_read_nested+0x4b/0x140
+[ 9057.678349] ? __btrfs_tree_read_lock+0x24/0x110 [btrfs]
+[ 9057.679068] __btrfs_tree_read_lock+0x24/0x110 [btrfs]
+[ 9057.679760] btrfs_read_lock_root_node+0x31/0x40 [btrfs]
+[ 9057.680458] btrfs_search_slot+0x537/0xc00 [btrfs]
+[ 9057.681083] ? _raw_spin_unlock+0x29/0x40
+[ 9057.681594] ? btrfs_find_all_roots_safe+0x11f/0x140 [btrfs]
+[ 9057.682336] scrub_print_warning_inode+0x89/0x370 [btrfs]
+[ 9057.683058] ? btrfs_find_all_roots_safe+0x11f/0x140 [btrfs]
+[ 9057.683834] ? scrub_write_block_to_dev_replace+0xb0/0xb0 [btrfs]
+[ 9057.684632] iterate_extent_inodes+0x1e3/0x280 [btrfs]
+[ 9057.685316] scrub_print_warning+0x15d/0x2f0 [btrfs]
+[ 9057.685977] ? ___ratelimit+0xa4/0x110
+[ 9057.686460] scrub_handle_errored_block.isra.0+0x135f/0x1640 [btrfs]
+[ 9057.687316] scrub_bio_end_io_worker+0x101/0x2e0 [btrfs]
+[ 9057.688021] btrfs_work_helper+0xf8/0x400 [btrfs]
+[ 9057.688649] ? lock_is_held_type+0xe8/0x140
+[ 9057.689180] process_one_work+0x247/0x5a0
+[ 9057.689696] worker_thread+0x55/0x3c0
+[ 9057.690175] ? process_one_work+0x5a0/0x5a0
+[ 9057.690731] kthread+0x155/0x180
+[ 9057.691158] ? set_kthread_struct+0x40/0x40
+[ 9057.691697] ret_from_fork+0x22/0x30
+
+Fix this by making btrfs_find_all_roots() never attempt to lock the
+commit_root_sem when it is called from btrfs_qgroup_trace_extent_post().
+
+We can't just pass a non-NULL transaction handle to btrfs_find_all_roots()
+from btrfs_qgroup_trace_extent_post(), because that would make backref
+lookup not use commit roots and acquire read locks on extent buffers, and
+therefore could deadlock when btrfs_qgroup_trace_extent_post() is called
+from the btrfs_truncate_inode_items() code path which has acquired a write
+lock on an extent buffer of the subvolume btree.
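+
+In code terms (names as in the diff below), the tracing path now passes
+a NULL handle, to keep using commit roots, plus a flag saying the
+semaphore is not needed:
+
+  /* NULL trans: use commit roots; true: skip taking commit_root_sem */
+  ret = btrfs_find_all_roots(NULL, trans->fs_info, bytenr, 0, &old_root,
+                             false, true);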
+
+CC: stable@vger.kernel.org # 4.19+
+Reviewed-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/backref.c | 6 +++---
+ fs/btrfs/backref.h | 3 ++-
+ fs/btrfs/delayed-ref.c | 4 ++--
+ fs/btrfs/qgroup.c | 38 ++++++++++++++++++++++++++++++--------
+ fs/btrfs/qgroup.h | 2 +-
+ fs/btrfs/tests/qgroup-tests.c | 20 ++++++++++----------
+ 6 files changed, 48 insertions(+), 25 deletions(-)
+
+--- a/fs/btrfs/backref.c
++++ b/fs/btrfs/backref.c
+@@ -1488,15 +1488,15 @@ static int btrfs_find_all_roots_safe(str
+ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+ u64 time_seq, struct ulist **roots,
+- bool ignore_offset)
++ bool ignore_offset, bool skip_commit_root_sem)
+ {
+ int ret;
+
+- if (!trans)
++ if (!trans && !skip_commit_root_sem)
+ down_read(&fs_info->commit_root_sem);
+ ret = btrfs_find_all_roots_safe(trans, fs_info, bytenr,
+ time_seq, roots, ignore_offset);
+- if (!trans)
++ if (!trans && !skip_commit_root_sem)
+ up_read(&fs_info->commit_root_sem);
+ return ret;
+ }
+--- a/fs/btrfs/backref.h
++++ b/fs/btrfs/backref.h
+@@ -47,7 +47,8 @@ int btrfs_find_all_leafs(struct btrfs_tr
+ const u64 *extent_item_pos, bool ignore_offset);
+ int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
+ struct btrfs_fs_info *fs_info, u64 bytenr,
+- u64 time_seq, struct ulist **roots, bool ignore_offset);
++ u64 time_seq, struct ulist **roots, bool ignore_offset,
++ bool skip_commit_root_sem);
+ char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
+ u32 name_len, unsigned long name_off,
+ struct extent_buffer *eb_in, u64 parent,
+--- a/fs/btrfs/delayed-ref.c
++++ b/fs/btrfs/delayed-ref.c
+@@ -1000,7 +1000,7 @@ int btrfs_add_delayed_tree_ref(struct bt
+ kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
+
+ if (qrecord_inserted)
+- btrfs_qgroup_trace_extent_post(fs_info, record);
++ btrfs_qgroup_trace_extent_post(trans, record);
+
+ return 0;
+ }
+@@ -1095,7 +1095,7 @@ int btrfs_add_delayed_data_ref(struct bt
+
+
+ if (qrecord_inserted)
+- return btrfs_qgroup_trace_extent_post(fs_info, record);
++ return btrfs_qgroup_trace_extent_post(trans, record);
+ return 0;
+ }
+
+--- a/fs/btrfs/qgroup.c
++++ b/fs/btrfs/qgroup.c
+@@ -1704,17 +1704,39 @@ int btrfs_qgroup_trace_extent_nolock(str
+ return 0;
+ }
+
+-int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info *fs_info,
++int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
+ struct btrfs_qgroup_extent_record *qrecord)
+ {
+ struct ulist *old_root;
+ u64 bytenr = qrecord->bytenr;
+ int ret;
+
+- ret = btrfs_find_all_roots(NULL, fs_info, bytenr, 0, &old_root, false);
++ /*
++ * We are always called in a context where we are already holding a
++ * transaction handle. Often we are called when adding a data delayed
++ * reference from btrfs_truncate_inode_items() (truncating or unlinking),
++ * in which case we will be holding a write lock on extent buffer from a
++ * subvolume tree. In this case we can't allow btrfs_find_all_roots() to
++ * acquire fs_info->commit_root_sem, because that is a higher level lock
++ * that must be acquired before locking any extent buffers.
++ *
++ * So we want btrfs_find_all_roots() to not acquire the commit_root_sem
++ * but we can't pass it a non-NULL transaction handle, because otherwise
++ * it would not use commit roots and would lock extent buffers, causing
++ * a deadlock if it ends up trying to read lock the same extent buffer
++ * that was previously write locked at btrfs_truncate_inode_items().
++ *
++ * So pass a NULL transaction handle to btrfs_find_all_roots() and
++ * explicitly tell it to not acquire the commit_root_sem - if we are
++ * holding a transaction handle we don't need its protection.
++ */
++ ASSERT(trans != NULL);
++
++ ret = btrfs_find_all_roots(NULL, trans->fs_info, bytenr, 0, &old_root,
++ false, true);
+ if (ret < 0) {
+- fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
+- btrfs_warn(fs_info,
++ trans->fs_info->qgroup_flags |= BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT;
++ btrfs_warn(trans->fs_info,
+ "error accounting new delayed refs extent (err code: %d), quota inconsistent",
+ ret);
+ return 0;
+@@ -1758,7 +1780,7 @@ int btrfs_qgroup_trace_extent(struct btr
+ kfree(record);
+ return 0;
+ }
+- return btrfs_qgroup_trace_extent_post(fs_info, record);
++ return btrfs_qgroup_trace_extent_post(trans, record);
+ }
+
+ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
+@@ -2629,7 +2651,7 @@ int btrfs_qgroup_account_extents(struct
+ /* Search commit root to find old_roots */
+ ret = btrfs_find_all_roots(NULL, fs_info,
+ record->bytenr, 0,
+- &record->old_roots, false);
++ &record->old_roots, false, false);
+ if (ret < 0)
+ goto cleanup;
+ }
+@@ -2645,7 +2667,7 @@ int btrfs_qgroup_account_extents(struct
+ * current root. It's safe inside commit_transaction().
+ */
+ ret = btrfs_find_all_roots(trans, fs_info,
+- record->bytenr, BTRFS_SEQ_LAST, &new_roots, false);
++ record->bytenr, BTRFS_SEQ_LAST, &new_roots, false, false);
+ if (ret < 0)
+ goto cleanup;
+ if (qgroup_to_skip) {
+@@ -3179,7 +3201,7 @@ static int qgroup_rescan_leaf(struct btr
+ num_bytes = found.offset;
+
+ ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
+- &roots, false);
++ &roots, false, false);
+ if (ret < 0)
+ goto out;
+ /* For rescan, just pass old_roots as NULL */
+--- a/fs/btrfs/qgroup.h
++++ b/fs/btrfs/qgroup.h
+@@ -298,7 +298,7 @@ int btrfs_qgroup_trace_extent_nolock(
+ * using current root, then we can move all expensive backref walk out of
+ * transaction committing, but not now as qgroup accounting will be wrong again.
+ */
+-int btrfs_qgroup_trace_extent_post(struct btrfs_fs_info *fs_info,
++int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
+ struct btrfs_qgroup_extent_record *qrecord);
+
+ /*
+--- a/fs/btrfs/tests/qgroup-tests.c
++++ b/fs/btrfs/tests/qgroup-tests.c
+@@ -224,7 +224,7 @@ static int test_no_shared_qgroup(struct
+ * quota.
+ */
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
+- false);
++ false, false);
+ if (ret) {
+ ulist_free(old_roots);
+ test_err("couldn't find old roots: %d", ret);
+@@ -237,7 +237,7 @@ static int test_no_shared_qgroup(struct
+ return ret;
+
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
+- false);
++ false, false);
+ if (ret) {
+ ulist_free(old_roots);
+ ulist_free(new_roots);
+@@ -261,7 +261,7 @@ static int test_no_shared_qgroup(struct
+ new_roots = NULL;
+
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
+- false);
++ false, false);
+ if (ret) {
+ ulist_free(old_roots);
+ test_err("couldn't find old roots: %d", ret);
+@@ -273,7 +273,7 @@ static int test_no_shared_qgroup(struct
+ return -EINVAL;
+
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
+- false);
++ false, false);
+ if (ret) {
+ ulist_free(old_roots);
+ ulist_free(new_roots);
+@@ -325,7 +325,7 @@ static int test_multiple_refs(struct btr
+ }
+
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
+- false);
++ false, false);
+ if (ret) {
+ ulist_free(old_roots);
+ test_err("couldn't find old roots: %d", ret);
+@@ -338,7 +338,7 @@ static int test_multiple_refs(struct btr
+ return ret;
+
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
+- false);
++ false, false);
+ if (ret) {
+ ulist_free(old_roots);
+ ulist_free(new_roots);
+@@ -360,7 +360,7 @@ static int test_multiple_refs(struct btr
+ }
+
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
+- false);
++ false, false);
+ if (ret) {
+ ulist_free(old_roots);
+ test_err("couldn't find old roots: %d", ret);
+@@ -373,7 +373,7 @@ static int test_multiple_refs(struct btr
+ return ret;
+
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
+- false);
++ false, false);
+ if (ret) {
+ ulist_free(old_roots);
+ ulist_free(new_roots);
+@@ -401,7 +401,7 @@ static int test_multiple_refs(struct btr
+ }
+
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
+- false);
++ false, false);
+ if (ret) {
+ ulist_free(old_roots);
+ test_err("couldn't find old roots: %d", ret);
+@@ -414,7 +414,7 @@ static int test_multiple_refs(struct btr
+ return ret;
+
+ ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
+- false);
++ false, false);
+ if (ret) {
+ ulist_free(old_roots);
+ ulist_free(new_roots);
--- /dev/null
+From 9acc8103ab594f72250788cb45a43427f36d685d Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Tue, 6 Jul 2021 15:41:15 +0100
+Subject: btrfs: fix unpersisted i_size on fsync after expanding truncate
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 9acc8103ab594f72250788cb45a43427f36d685d upstream.
+
+If we have an inode that does not have the full sync flag set and was
+changed in the current transaction, and then the following happens in
+order: it is logged while logging some other inode (its parent directory,
+for example), its i_size is increased by a truncate operation, the log is
+synced through an fsync of some other inode, and finally we explicitly
+call fsync on our inode, then the new i_size is not persisted.
+
+The following example shows how to trigger it, with comments explaining
+how and why the issue happens:
+
+ $ mkfs.btrfs -f /dev/sdc
+ $ mount /dev/sdc /mnt
+
+ $ touch /mnt/foo
+ $ xfs_io -f -c "pwrite -S 0xab 0 1M" /mnt/bar
+
+ $ sync
+
+ # Fsync bar, this will be a noop since the file has not yet been
+ # modified in the current transaction. The goal here is to clear
+ # BTRFS_INODE_NEEDS_FULL_SYNC from the inode's runtime flags.
+ $ xfs_io -c "fsync" /mnt/bar
+
+ # Now rename both files, without changing their parent directory.
+ $ mv /mnt/bar /mnt/bar2
+ $ mv /mnt/foo /mnt/foo2
+
+ # Increase the size of bar2 with a truncate operation.
+ $ xfs_io -c "truncate 2M" /mnt/bar2
+
+ # Now fsync foo2, this results in logging its parent inode (the root
+ # directory), and logging the parent results in logging the inode of
+ # file bar2 (its inode item and the new name). The inode of file bar2
+ # is logged with an i_size of 0 bytes since it's logged in
+ # LOG_INODE_EXISTS mode, meaning we are only logging its names (and
+ # xattrs if it had any) and the i_size of the inode will not be changed
+ # when the log is replayed.
+ $ xfs_io -c "fsync" /mnt/foo2
+
+ # Now explicitly fsync bar2. This resulted in doing nothing, not
+ # logging the inode with the new i_size of 2M and the hole from file
+ # offset 1M to 2M. Because the inode did not have the flag
+ # BTRFS_INODE_NEEDS_FULL_SYNC set, when it was logged through the
+ # fsync of file foo2, its last_log_commit field was updated,
+ # resulting in this explicit fsync of file bar2 not doing anything.
+ $ xfs_io -c "fsync" /mnt/bar2
+
+ # File bar2 content and size before a power failure.
+ $ od -A d -t x1 /mnt/bar2
+ 0000000 ab ab ab ab ab ab ab ab ab ab ab ab ab ab ab ab
+ *
+ 1048576 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
+ *
+ 2097152
+
+ <power failure>
+
+ # Mount the filesystem to replay the log.
+ $ mount /dev/sdc /mnt
+
+ # Read the file again, should have the same content and size as before
+ # the power failure happened, but it doesn't, i_size is still at 1M.
+ $ od -A d -t x1 /mnt/bar2
+ 0000000 ab ab ab ab ab ab ab ab ab ab ab ab ab ab ab ab
+ *
+ 1048576
+
+This started to happen after commit 209ecbb8585bf6 ("btrfs: remove stale
+comment and logic from btrfs_inode_in_log()"), since btrfs_inode_in_log()
+no longer checks if the inode's list of modified extents is not empty.
+However, checking that list is not the right way to address this case
+and the check was added a long time ago in commit 125c4cf9f37c98
+("Btrfs: set inode's logged_trans/last_log_commit after ranged fsync")
+for a different purpose, to address consecutive ranged fsyncs.
+
+The reason that checking for the list emptiness makes this test pass is
+because during an expanding truncate we create an extent map to represent
+a hole from the old i_size to the new i_size, and add that extent map to
+the list of modified extents in the inode. However if we are low on
+available memory and we can not allocate a new extent map, then we don't
+treat it as an error and just set the full sync flag on the inode, so that
+the next fsync does not rely on the list of modified extents - so checking
+for the emptiness of the list to decide if the inode needs to be logged is
+not reliable, and results in not logging the inode if it was not possible
+to allocate the extent map for the hole.
+
+Fix this by ensuring that if we are only logging that an inode exists
+(inode item, names/references and xattrs), we don't update the inode's
+last_log_commit even if it does not have the full sync runtime flag set.
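+
+In code terms (see the tree-log.c hunk below), the guard around the
+last_log_commit update is reduced to the logging mode alone:
+
+  if (inode_only != LOG_INODE_EXISTS)
+          inode->last_log_commit = inode->last_sub_trans;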
+
+A test case for fstests follows soon.
+
+CC: stable@vger.kernel.org # 5.13+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/tree-log.c | 31 ++++++++++++++++++++++---------
+ 1 file changed, 22 insertions(+), 9 deletions(-)
+
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -5515,16 +5515,29 @@ log_extents:
+ spin_lock(&inode->lock);
+ inode->logged_trans = trans->transid;
+ /*
+- * Don't update last_log_commit if we logged that an inode exists
+- * after it was loaded to memory (full_sync bit set).
+- * This is to prevent data loss when we do a write to the inode,
+- * then the inode gets evicted after all delalloc was flushed,
+- * then we log it exists (due to a rename for example) and then
+- * fsync it. This last fsync would do nothing (not logging the
+- * extents previously written).
++ * Don't update last_log_commit if we logged that an inode exists.
++ * We do this for two reasons:
++ *
++ * 1) We might have had buffered writes to this inode that were
++ * flushed and had their ordered extents completed in this
++ * transaction, but we did not previously log the inode with
++ * LOG_INODE_ALL. Later the inode was evicted and after that
++ * it was loaded again and this LOG_INODE_EXISTS log operation
++ * happened. We must make sure that if an explicit fsync against
++ * the inode is performed later, it logs the new extents, an
++ * updated inode item, etc, and syncs the log. The same logic
++ * applies to direct IO writes instead of buffered writes.
++ *
++ * 2) When we log the inode with LOG_INODE_EXISTS, its inode item
++ * is logged with an i_size of 0 or whatever value was logged
++ * before. If later the i_size of the inode is increased by a
++ * truncate operation, the log is synced through an fsync of
++ * some other inode and then finally an explicit fsync against
++ * this inode is made, we must make sure this fsync logs the
++ * inode with the new i_size, the hole between old i_size and
++ * the new i_size, and syncs the log.
+ */
+- if (inode_only != LOG_INODE_EXISTS ||
+- !test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags))
++ if (inode_only != LOG_INODE_EXISTS)
+ inode->last_log_commit = inode->last_sub_trans;
+ spin_unlock(&inode->lock);
+ }
--- /dev/null
+From 546362a9ef2ef40b57c6605f14e88ced507f8dd0 Mon Sep 17 00:00:00 2001
+From: Bhaumik Bhatt <bbhatt@codeaurora.org>
+Date: Fri, 16 Jul 2021 13:21:05 +0530
+Subject: bus: mhi: core: Validate channel ID when processing command completions
+
+From: Bhaumik Bhatt <bbhatt@codeaurora.org>
+
+commit 546362a9ef2ef40b57c6605f14e88ced507f8dd0 upstream.
+
+MHI reads the channel ID from the event ring element sent by the
+device, which can be any value between 0 and 255. To prevent any
+out-of-bounds accesses, add a check against the maximum number of
+channels supported by the controller, and skip processing of the
+event ring element if the channel ID is out of range or the channel
+is not configured yet.
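+
+A condensed sketch of the added validation (fields as in the diff below):
+
+  chan = MHI_TRE_GET_CMD_CHID(cmd_pkt);
+  if (chan < mhi_cntrl->max_chan &&
+      mhi_cntrl->mhi_chan[chan].configured) {
+          /* complete the command on the channel as before */
+  } else {
+          dev_err(&mhi_cntrl->mhi_dev->dev,
+                  "Completion packet for invalid channel ID: %d\n", chan);
+  }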
+
+Link: https://lore.kernel.org/r/1624558141-11045-1-git-send-email-bbhatt@codeaurora.org
+Fixes: 1d3173a3bae7 ("bus: mhi: core: Add support for processing events from client device")
+Cc: stable@vger.kernel.org #5.10
+Reviewed-by: Hemant Kumar <hemantk@codeaurora.org>
+Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
+Reviewed-by: Jeffrey Hugo <quic_jhugo@quicinc.com>
+Signed-off-by: Bhaumik Bhatt <bbhatt@codeaurora.org>
+Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
+Link: https://lore.kernel.org/r/20210716075106.49938-3-manivannan.sadhasivam@linaro.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/bus/mhi/core/main.c | 17 ++++++++++++-----
+ 1 file changed, 12 insertions(+), 5 deletions(-)
+
+--- a/drivers/bus/mhi/core/main.c
++++ b/drivers/bus/mhi/core/main.c
+@@ -773,11 +773,18 @@ static void mhi_process_cmd_completion(s
+ cmd_pkt = mhi_to_virtual(mhi_ring, ptr);
+
+ chan = MHI_TRE_GET_CMD_CHID(cmd_pkt);
+- mhi_chan = &mhi_cntrl->mhi_chan[chan];
+- write_lock_bh(&mhi_chan->lock);
+- mhi_chan->ccs = MHI_TRE_GET_EV_CODE(tre);
+- complete(&mhi_chan->completion);
+- write_unlock_bh(&mhi_chan->lock);
++
++ if (chan < mhi_cntrl->max_chan &&
++ mhi_cntrl->mhi_chan[chan].configured) {
++ mhi_chan = &mhi_cntrl->mhi_chan[chan];
++ write_lock_bh(&mhi_chan->lock);
++ mhi_chan->ccs = MHI_TRE_GET_EV_CODE(tre);
++ complete(&mhi_chan->completion);
++ write_unlock_bh(&mhi_chan->lock);
++ } else {
++ dev_err(&mhi_cntrl->mhi_dev->dev,
++ "Completion packet for invalid channel ID: %d\n", chan);
++ }
+
+ mhi_del_ring_element(mhi_cntrl, mhi_ring);
+ }
--- /dev/null
+From 56f6f4c4eb2a710ec8878dd9373d3d2b2eb75f5c Mon Sep 17 00:00:00 2001
+From: Bhaumik Bhatt <bbhatt@codeaurora.org>
+Date: Fri, 16 Jul 2021 13:21:04 +0530
+Subject: bus: mhi: pci_generic: Apply no-op for wake using sideband wake boolean
+
+From: Bhaumik Bhatt <bbhatt@codeaurora.org>
+
+commit 56f6f4c4eb2a710ec8878dd9373d3d2b2eb75f5c upstream.
+
+Devices such as SDX24 do not have the provision for inband wake
+doorbell in the form of channel 127 and instead have a sideband
+GPIO for it. Newer devices such as SDX55 or SDX65 support the inband
+wake method by default. Use the right mechanism for each device so that
+device wake stays held when a client driver uses the mhi_device_get()
+API or the equivalent debugfs entry.
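+
+A condensed sketch of the probe-time selection (as in the diff below):
+the no-op wake callbacks are installed only for sideband-wake devices,
+so inband-wake devices keep the default device wake handling:
+
+  if (info->sideband_wake) {
+          mhi_cntrl->wake_get = mhi_pci_wake_get_nop;
+          mhi_cntrl->wake_put = mhi_pci_wake_put_nop;
+          mhi_cntrl->wake_toggle = mhi_pci_wake_toggle_nop;
+  }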
+
+Link: https://lore.kernel.org/r/1624560809-30610-1-git-send-email-bbhatt@codeaurora.org
+Fixes: e3e5e6508fc1 ("bus: mhi: pci_generic: No-Op for device_wake operations")
+Cc: stable@vger.kernel.org #5.12
+Reviewed-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
+Signed-off-by: Bhaumik Bhatt <bbhatt@codeaurora.org>
+Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
+Link: https://lore.kernel.org/r/20210716075106.49938-2-manivannan.sadhasivam@linaro.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/bus/mhi/pci_generic.c | 27 +++++++++++++++++++--------
+ 1 file changed, 19 insertions(+), 8 deletions(-)
+
+--- a/drivers/bus/mhi/pci_generic.c
++++ b/drivers/bus/mhi/pci_generic.c
+@@ -32,6 +32,8 @@
+ * @edl: emergency download mode firmware path (if any)
+ * @bar_num: PCI base address register to use for MHI MMIO register space
+ * @dma_data_width: DMA transfer word size (32 or 64 bits)
++ * @sideband_wake: Devices using dedicated sideband GPIO for wakeup instead
++ * of inband wake support (such as sdx24)
+ */
+ struct mhi_pci_dev_info {
+ const struct mhi_controller_config *config;
+@@ -40,6 +42,7 @@ struct mhi_pci_dev_info {
+ const char *edl;
+ unsigned int bar_num;
+ unsigned int dma_data_width;
++ bool sideband_wake;
+ };
+
+ #define MHI_CHANNEL_CONFIG_UL(ch_num, ch_name, el_count, ev_ring) \
+@@ -242,7 +245,8 @@ static const struct mhi_pci_dev_info mhi
+ .edl = "qcom/sdx65m/edl.mbn",
+ .config = &modem_qcom_v1_mhiv_config,
+ .bar_num = MHI_PCI_DEFAULT_BAR_NUM,
+- .dma_data_width = 32
++ .dma_data_width = 32,
++ .sideband_wake = false,
+ };
+
+ static const struct mhi_pci_dev_info mhi_qcom_sdx55_info = {
+@@ -251,7 +255,8 @@ static const struct mhi_pci_dev_info mhi
+ .edl = "qcom/sdx55m/edl.mbn",
+ .config = &modem_qcom_v1_mhiv_config,
+ .bar_num = MHI_PCI_DEFAULT_BAR_NUM,
+- .dma_data_width = 32
++ .dma_data_width = 32,
++ .sideband_wake = false,
+ };
+
+ static const struct mhi_pci_dev_info mhi_qcom_sdx24_info = {
+@@ -259,7 +264,8 @@ static const struct mhi_pci_dev_info mhi
+ .edl = "qcom/prog_firehose_sdx24.mbn",
+ .config = &modem_qcom_v1_mhiv_config,
+ .bar_num = MHI_PCI_DEFAULT_BAR_NUM,
+- .dma_data_width = 32
++ .dma_data_width = 32,
++ .sideband_wake = true,
+ };
+
+ static const struct mhi_channel_config mhi_quectel_em1xx_channels[] = {
+@@ -301,7 +307,8 @@ static const struct mhi_pci_dev_info mhi
+ .edl = "qcom/prog_firehose_sdx24.mbn",
+ .config = &modem_quectel_em1xx_config,
+ .bar_num = MHI_PCI_DEFAULT_BAR_NUM,
+- .dma_data_width = 32
++ .dma_data_width = 32,
++ .sideband_wake = true,
+ };
+
+ static const struct mhi_channel_config mhi_foxconn_sdx55_channels[] = {
+@@ -339,7 +346,8 @@ static const struct mhi_pci_dev_info mhi
+ .edl = "qcom/sdx55m/edl.mbn",
+ .config = &modem_foxconn_sdx55_config,
+ .bar_num = MHI_PCI_DEFAULT_BAR_NUM,
+- .dma_data_width = 32
++ .dma_data_width = 32,
++ .sideband_wake = false,
+ };
+
+ static const struct pci_device_id mhi_pci_id_table[] = {
+@@ -640,9 +648,12 @@ static int mhi_pci_probe(struct pci_dev
+ mhi_cntrl->status_cb = mhi_pci_status_cb;
+ mhi_cntrl->runtime_get = mhi_pci_runtime_get;
+ mhi_cntrl->runtime_put = mhi_pci_runtime_put;
+- mhi_cntrl->wake_get = mhi_pci_wake_get_nop;
+- mhi_cntrl->wake_put = mhi_pci_wake_put_nop;
+- mhi_cntrl->wake_toggle = mhi_pci_wake_toggle_nop;
++
++ if (info->sideband_wake) {
++ mhi_cntrl->wake_get = mhi_pci_wake_get_nop;
++ mhi_cntrl->wake_put = mhi_pci_wake_put_nop;
++ mhi_cntrl->wake_toggle = mhi_pci_wake_toggle_nop;
++ }
+
+ err = mhi_pci_claim(mhi_cntrl, info->bar_num, DMA_BIT_MASK(info->dma_data_width));
+ if (err)
--- /dev/null
+From b8a97f2a65388394f433bf0730293a94f7d49046 Mon Sep 17 00:00:00 2001
+From: Loic Poulain <loic.poulain@linaro.org>
+Date: Fri, 16 Jul 2021 13:21:06 +0530
+Subject: bus: mhi: pci_generic: Fix inbound IPCR channel
+
+From: Loic Poulain <loic.poulain@linaro.org>
+
+commit b8a97f2a65388394f433bf0730293a94f7d49046 upstream.
+
+The qrtr-mhi client driver assumes that inbound buffers are
+automatically allocated and queued by the MHI core, but this
+doesn't happen for MHI PCI devices since the IPCR inbound channel is
+not flagged with auto_queue, leaving the IPCR (qrtr) feature
+unusable. Fix that.
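+
+In code terms (macro introduced in the diff below), the inbound IPCR
+channel now uses an auto-queueing variant of the DL channel config,
+which sets .auto_queue = true:
+
+  MHI_CHANNEL_CONFIG_UL(20, "IPCR", 8, 0),
+  MHI_CHANNEL_CONFIG_DL_AUTOQUEUE(21, "IPCR", 8, 0),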
+
+Link: https://lore.kernel.org/r/1625736749-24947-1-git-send-email-loic.poulain@linaro.org
+[mani: fixed a spelling mistake in commit description]
+Fixes: 855a70c12021 ("bus: mhi: Add MHI PCI support for WWAN modems")
+Cc: stable@vger.kernel.org #5.10
+Reviewed-by: Hemant kumar <hemantk@codeaurora.org>
+Reviewed-by: Manivannan Sadhasivam <mani@kernel.org>
+Signed-off-by: Loic Poulain <loic.poulain@linaro.org>
+Signed-off-by: Manivannan Sadhasivam <manivannan.sadhasivam@linaro.org>
+Link: https://lore.kernel.org/r/20210716075106.49938-4-manivannan.sadhasivam@linaro.org
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/bus/mhi/pci_generic.c | 18 +++++++++++++++++-
+ 1 file changed, 17 insertions(+), 1 deletion(-)
+
+--- a/drivers/bus/mhi/pci_generic.c
++++ b/drivers/bus/mhi/pci_generic.c
+@@ -75,6 +75,22 @@ struct mhi_pci_dev_info {
+ .doorbell_mode_switch = false, \
+ }
+
++#define MHI_CHANNEL_CONFIG_DL_AUTOQUEUE(ch_num, ch_name, el_count, ev_ring) \
++ { \
++ .num = ch_num, \
++ .name = ch_name, \
++ .num_elements = el_count, \
++ .event_ring = ev_ring, \
++ .dir = DMA_FROM_DEVICE, \
++ .ee_mask = BIT(MHI_EE_AMSS), \
++ .pollcfg = 0, \
++ .doorbell = MHI_DB_BRST_DISABLE, \
++ .lpm_notify = false, \
++ .offload_channel = false, \
++ .doorbell_mode_switch = false, \
++ .auto_queue = true, \
++ }
++
+ #define MHI_EVENT_CONFIG_CTRL(ev_ring, el_count) \
+ { \
+ .num_elements = el_count, \
+@@ -213,7 +229,7 @@ static const struct mhi_channel_config m
+ MHI_CHANNEL_CONFIG_UL(14, "QMI", 4, 0),
+ MHI_CHANNEL_CONFIG_DL(15, "QMI", 4, 0),
+ MHI_CHANNEL_CONFIG_UL(20, "IPCR", 8, 0),
+- MHI_CHANNEL_CONFIG_DL(21, "IPCR", 8, 0),
++ MHI_CHANNEL_CONFIG_DL_AUTOQUEUE(21, "IPCR", 8, 0),
+ MHI_CHANNEL_CONFIG_UL_FP(34, "FIREHOSE", 32, 0),
+ MHI_CHANNEL_CONFIG_DL_FP(35, "FIREHOSE", 32, 0),
+ MHI_CHANNEL_CONFIG_HW_UL(100, "IP_HW0", 128, 2),
--- /dev/null
+From 4afa0c22eed33cfe0c590742387f0d16f32412f3 Mon Sep 17 00:00:00 2001
+From: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
+Date: Tue, 13 Jul 2021 12:34:38 +0300
+Subject: driver core: auxiliary bus: Fix memory leak when driver_register() fail
+
+From: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
+
+commit 4afa0c22eed33cfe0c590742387f0d16f32412f3 upstream.
+
+If driver_register() returns with an error, we need to free the memory
+allocated for auxdrv->driver.name before returning from
+__auxiliary_driver_register().
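+
+A condensed sketch of the fixed error path (as in the diff below):
+
+  ret = driver_register(&auxdrv->driver);
+  if (ret)
+          kfree(auxdrv->driver.name);	/* allocated earlier in this function */
+
+  return ret;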
+
+Fixes: 7de3697e9cbd4 ("Add auxiliary bus support")
+Reviewed-by: Dan Williams <dan.j.williams@intel.com>
+Cc: stable <stable@vger.kernel.org>
+Signed-off-by: Peter Ujfalusi <peter.ujfalusi@linux.intel.com>
+Link: https://lore.kernel.org/r/20210713093438.3173-1-peter.ujfalusi@linux.intel.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/base/auxiliary.c | 8 +++++++-
+ 1 file changed, 7 insertions(+), 1 deletion(-)
+
+--- a/drivers/base/auxiliary.c
++++ b/drivers/base/auxiliary.c
+@@ -231,6 +231,8 @@ EXPORT_SYMBOL_GPL(auxiliary_find_device)
+ int __auxiliary_driver_register(struct auxiliary_driver *auxdrv,
+ struct module *owner, const char *modname)
+ {
++ int ret;
++
+ if (WARN_ON(!auxdrv->probe) || WARN_ON(!auxdrv->id_table))
+ return -EINVAL;
+
+@@ -246,7 +248,11 @@ int __auxiliary_driver_register(struct a
+ auxdrv->driver.bus = &auxiliary_bus_type;
+ auxdrv->driver.mod_name = modname;
+
+- return driver_register(&auxdrv->driver);
++ ret = driver_register(&auxdrv->driver);
++ if (ret)
++ kfree(auxdrv->driver.name);
++
++ return ret;
+ }
+ EXPORT_SYMBOL_GPL(__auxiliary_driver_register);
+
--- /dev/null
+From 2bab693a608bdf614b9fcd44083c5100f34b9f77 Mon Sep 17 00:00:00 2001
+From: Marc Zyngier <maz@kernel.org>
+Date: Tue, 13 Jul 2021 19:43:26 +0100
+Subject: firmware/efi: Tell memblock about EFI iomem reservations
+
+From: Marc Zyngier <maz@kernel.org>
+
+commit 2bab693a608bdf614b9fcd44083c5100f34b9f77 upstream.
+
+kexec_load_file() relies on the memblock infrastructure to avoid
+stamping over regions of memory that are essential to the survival
+of the system.
+
+However, nobody seems to agree on how to flag these regions as reserved,
+and (for example) EFI only publishes its reservations in /proc/iomem
+for the benefit of the traditional, userspace based kexec tool.
+
+On arm64 platforms with GICv3, this can result in the payload being
+placed at the location of the LPI tables. Shock, horror!
+
+Let's augment the EFI reservation code with a memblock_reserve() call,
+protecting our dear tables from the secondary kernel invasion.
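+
+A condensed sketch of the addition (as in the diff below); the memblock
+reservation is only attempted where the architecture keeps the memblock
+infrastructure around after boot:
+
+  if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK) && !ret)
+          memblock_reserve(addr, size);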
+
+Reported-by: Moritz Fischer <mdf@kernel.org>
+Tested-by: Moritz Fischer <mdf@kernel.org>
+Signed-off-by: Marc Zyngier <maz@kernel.org>
+Cc: stable@vger.kernel.org
+Cc: Ard Biesheuvel <ardb@kernel.org>
+Cc: James Morse <james.morse@arm.com>
+Cc: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Will Deacon <will@kernel.org>
+Signed-off-by: Ard Biesheuvel <ardb@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/firmware/efi/efi.c | 13 ++++++++++++-
+ 1 file changed, 12 insertions(+), 1 deletion(-)
+
+--- a/drivers/firmware/efi/efi.c
++++ b/drivers/firmware/efi/efi.c
+@@ -896,6 +896,7 @@ static int __init efi_memreserve_map_roo
+ static int efi_mem_reserve_iomem(phys_addr_t addr, u64 size)
+ {
+ struct resource *res, *parent;
++ int ret;
+
+ res = kzalloc(sizeof(struct resource), GFP_ATOMIC);
+ if (!res)
+@@ -908,7 +909,17 @@ static int efi_mem_reserve_iomem(phys_ad
+
+ /* we expect a conflict with a 'System RAM' region */
+ parent = request_resource_conflict(&iomem_resource, res);
+- return parent ? request_resource(parent, res) : 0;
++ ret = parent ? request_resource(parent, res) : 0;
++
++ /*
++ * Given that efi_mem_reserve_iomem() can be called at any
++ * time, only call memblock_reserve() if the architecture
++ * keeps the infrastructure around.
++ */
++ if (IS_ENABLED(CONFIG_ARCH_KEEP_MEMBLOCK) && !ret)
++ memblock_reserve(addr, size);
++
++ return ret;
+ }
+
+ int __ref efi_mem_reserve_persistent(phys_addr_t addr, u64 size)
--- /dev/null
+From 68b11e8b1562986c134764433af64e97d30c9fc0 Mon Sep 17 00:00:00 2001
+From: Pavel Begunkov <asml.silence@gmail.com>
+Date: Tue, 20 Jul 2021 10:50:43 +0100
+Subject: io_uring: explicitly count entries for poll reqs
+
+From: Pavel Begunkov <asml.silence@gmail.com>
+
+commit 68b11e8b1562986c134764433af64e97d30c9fc0 upstream.
+
+If __io_queue_proc() fails to add a second poll entry, e.g. because
+kmalloc() failed, but it goes on with a third waitqueue, the third
+attempt may succeed and overwrite the error status. Count the number of
+poll entries we added, so we can set pt->error to zero at the beginning
+and find out when the mentioned scenario happens.
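+
+A condensed sketch of the new accounting (fields as in the diff below):
+
+  ipt->error = 0;	/* start clean instead of -EINVAL */
+  ipt->nr_entries = 0;	/* bumped by __io_queue_proc() per entry added */
+
+  mask = vfs_poll(req->file, &ipt->pt) & poll->events;
+  if (unlikely(!ipt->nr_entries) && !ipt->error)
+          ipt->error = -EINVAL;	/* no waitqueue entry was ever added */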
+
+Cc: stable@vger.kernel.org
+Fixes: 18bceab101add ("io_uring: allow POLL_ADD with double poll_wait() users")
+Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
+Link: https://lore.kernel.org/r/9d6b9e561f88bcc0163623b74a76c39f712151c3.1626774457.git.asml.silence@gmail.com
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/io_uring.c | 16 ++++++++++------
+ 1 file changed, 10 insertions(+), 6 deletions(-)
+
+--- a/fs/io_uring.c
++++ b/fs/io_uring.c
+@@ -4805,6 +4805,7 @@ IO_NETOP_FN(recv);
+ struct io_poll_table {
+ struct poll_table_struct pt;
+ struct io_kiocb *req;
++ int nr_entries;
+ int error;
+ };
+
+@@ -5002,11 +5003,11 @@ static void __io_queue_proc(struct io_po
+ struct io_kiocb *req = pt->req;
+
+ /*
+- * If poll->head is already set, it's because the file being polled
+- * uses multiple waitqueues for poll handling (eg one for read, one
+- * for write). Setup a separate io_poll_iocb if this happens.
++ * The file being polled uses multiple waitqueues for poll handling
++ * (e.g. one for read, one for write). Setup a separate io_poll_iocb
++ * if this happens.
+ */
+- if (unlikely(poll->head)) {
++ if (unlikely(pt->nr_entries)) {
+ struct io_poll_iocb *poll_one = poll;
+
+ /* already have a 2nd entry, fail a third attempt */
+@@ -5034,7 +5035,7 @@ static void __io_queue_proc(struct io_po
+ *poll_ptr = poll;
+ }
+
+- pt->error = 0;
++ pt->nr_entries++;
+ poll->head = head;
+
+ if (poll->events & EPOLLEXCLUSIVE)
+@@ -5112,9 +5113,12 @@ static __poll_t __io_arm_poll_handler(st
+
+ ipt->pt._key = mask;
+ ipt->req = req;
+- ipt->error = -EINVAL;
++ ipt->error = 0;
++ ipt->nr_entries = 0;
+
+ mask = vfs_poll(req->file, &ipt->pt) & poll->events;
++ if (unlikely(!ipt->nr_entries) && !ipt->error)
++ ipt->error = -EINVAL;
+
+ spin_lock_irq(&ctx->completion_lock);
+ if (likely(poll->head)) {
--- /dev/null
+From 0cc936f74bcacb039b7533aeac0a887dfc896bf6 Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@kernel.dk>
+Date: Thu, 22 Jul 2021 17:08:07 -0600
+Subject: io_uring: fix early fdput() of file
+
+From: Jens Axboe <axboe@kernel.dk>
+
+commit 0cc936f74bcacb039b7533aeac0a887dfc896bf6 upstream.
+
+A previous commit shuffled some code around, and inadvertently used
+struct file after fdput() had been called on it. As we can't touch
+the file post fdput() dropping our reference, move the fdput() to
+after that has been done.
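+
+A condensed sketch of the corrected ordering (as in the diff below):
+
+  f = fdget(p->wq_fd);
+  if (!f.file)
+          return -ENXIO;
+  if (f.file->f_op != &io_uring_fops) {
+          fdput(f);
+          return -EINVAL;
+  }
+  fdput(f);	/* only after the last access to f.file */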
+
+Cc: Pavel Begunkov <asml.silence@gmail.com>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/io-uring/YPnqM0fY3nM5RdRI@zeniv-ca.linux.org.uk/
+Fixes: f2a48dd09b8e ("io_uring: refactor io_sq_offload_create()")
+Reported-by: Al Viro <viro@zeniv.linux.org.uk>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/io_uring.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+--- a/fs/io_uring.c
++++ b/fs/io_uring.c
+@@ -7953,9 +7953,11 @@ static int io_sq_offload_create(struct i
+ f = fdget(p->wq_fd);
+ if (!f.file)
+ return -ENXIO;
+- fdput(f);
+- if (f.file->f_op != &io_uring_fops)
++ if (f.file->f_op != &io_uring_fops) {
++ fdput(f);
+ return -EINVAL;
++ }
++ fdput(f);
+ }
+ if (ctx->flags & IORING_SETUP_SQPOLL) {
+ struct task_struct *tsk;
--- /dev/null
+From 46fee9ab02cb24979bbe07631fc3ae95ae08aa3e Mon Sep 17 00:00:00 2001
+From: Pavel Begunkov <asml.silence@gmail.com>
+Date: Tue, 20 Jul 2021 10:50:44 +0100
+Subject: io_uring: remove double poll entry on arm failure
+
+From: Pavel Begunkov <asml.silence@gmail.com>
+
+commit 46fee9ab02cb24979bbe07631fc3ae95ae08aa3e upstream.
+
+__io_queue_proc() can enqueue both poll entries and still fail
+afterwards, so the callers trying to cancel it should also try to remove
+the second poll entry (if any).
+
+For example, it may leave the request alive, referencing an io_uring
+context, but not accessible for cancellation:
+
+[ 282.599913][ T1620] task:iou-sqp-23145 state:D stack:28720 pid:23155 ppid: 8844 flags:0x00004004
+[ 282.609927][ T1620] Call Trace:
+[ 282.613711][ T1620] __schedule+0x93a/0x26f0
+[ 282.634647][ T1620] schedule+0xd3/0x270
+[ 282.638874][ T1620] io_uring_cancel_generic+0x54d/0x890
+[ 282.660346][ T1620] io_sq_thread+0xaac/0x1250
+[ 282.696394][ T1620] ret_from_fork+0x1f/0x30
+
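+A condensed sketch of the fix (as in the diff below): on arming failure,
+remove the second poll entry before checking poll->head:
+
+  spin_lock_irq(&ctx->completion_lock);
+  if (ipt->error)
+          io_poll_remove_double(req);
+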
+Cc: stable@vger.kernel.org
+Fixes: 18bceab101add ("io_uring: allow POLL_ADD with double poll_wait() users")
+Reported-and-tested-by: syzbot+ac957324022b7132accf@syzkaller.appspotmail.com
+Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
+Link: https://lore.kernel.org/r/0ec1228fc5eda4cb524eeda857da8efdc43c331c.1626774457.git.asml.silence@gmail.com
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/io_uring.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/fs/io_uring.c
++++ b/fs/io_uring.c
+@@ -5121,6 +5121,8 @@ static __poll_t __io_arm_poll_handler(st
+ ipt->error = -EINVAL;
+
+ spin_lock_irq(&ctx->completion_lock);
++ if (ipt->error)
++ io_poll_remove_double(req);
+ if (likely(poll->head)) {
+ spin_lock(&poll->head->lock);
+ if (unlikely(list_empty(&poll->wait.entry))) {
--- /dev/null
+From 09cfae9f13d51700b0fecf591dcd658fc5375428 Mon Sep 17 00:00:00 2001
+From: Markus Boehme <markubo@amazon.com>
+Date: Tue, 20 Jul 2021 16:26:19 -0700
+Subject: ixgbe: Fix packet corruption due to missing DMA sync
+
+From: Markus Boehme <markubo@amazon.com>
+
+commit 09cfae9f13d51700b0fecf591dcd658fc5375428 upstream.
+
+When receiving a packet with multiple fragments, hardware may still
+touch the first fragment until the entire packet has been received. The
+driver therefore keeps the first fragment mapped for DMA until end of
+packet has been asserted, and delays its dma_sync call until then.
+
+The driver tries to fit multiple receive buffers on one page. When using
+3K receive buffers (e.g. with jumbo frames and legacy-rx turned off,
+i.e. build_skb is being used) on an architecture with 4K pages, the
+driver allocates an order 1 compound page and uses one page per receive
+buffer. To determine the correct offset for a delayed DMA sync of the
+first fragment of a multi-fragment packet, the driver then cannot just
+use PAGE_MASK on the DMA address but has to construct a mask based on
+the actual size of the backing page.
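+
+For example, with 3K buffers on a 4K-page architecture, the backing
+page is an order 1 compound page of 8192 bytes, so the mask must keep
+bit 12 (a condensed sketch of the diff below):
+
+  /* PAGE_MASK (~0xfff) would always land in the first 4K of the page */
+  unsigned long mask = (unsigned long)ixgbe_rx_pg_size(rx_ring) - 1; /* 0x1fff */
+  unsigned long offset = (unsigned long)(skb->data) & mask;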
+
+Using PAGE_MASK in the 3K RX buffer/4K page architecture configuration
+will always sync the first page of a compound page. With the SWIOTLB
+enabled this can lead to corrupted packets (zeroed out first fragment,
+re-used garbage from another packet) and various consequences, such as
+slow/stalling data transfers and connection resets. For example, testing
+on a link with MTU exceeding 3058 bytes on a host with SWIOTLB enabled
+(e.g. "iommu=soft swiotlb=262144,force") TCP transfers quickly fizzle
+out without this patch.
+
+Cc: stable@vger.kernel.org
+Fixes: 0c5661ecc5dd7 ("ixgbe: fix crash in build_skb Rx code path")
+Signed-off-by: Markus Boehme <markubo@amazon.com>
+Tested-by: Tony Brelinski <tonyx.brelinski@intel.com>
+Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/intel/ixgbe/ixgbe_main.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
++++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+@@ -1825,7 +1825,8 @@ static void ixgbe_dma_sync_frag(struct i
+ struct sk_buff *skb)
+ {
+ if (ring_uses_build_skb(rx_ring)) {
+- unsigned long offset = (unsigned long)(skb->data) & ~PAGE_MASK;
++ unsigned long mask = (unsigned long)ixgbe_rx_pg_size(rx_ring) - 1;
++ unsigned long offset = (unsigned long)(skb->data) & mask;
+
+ dma_sync_single_range_for_cpu(rx_ring->dev,
+ IXGBE_CB(skb)->dma,
--- /dev/null
+From 8d4abca95ecc82fc8c41912fa0085281f19cc29f Mon Sep 17 00:00:00 2001
+From: "Gustavo A. R. Silva" <gustavoars@kernel.org>
+Date: Mon, 19 Apr 2021 18:43:32 -0500
+Subject: media: ngene: Fix out-of-bounds bug in ngene_command_config_free_buf()
+
+From: Gustavo A. R. Silva <gustavoars@kernel.org>
+
+commit 8d4abca95ecc82fc8c41912fa0085281f19cc29f upstream.
+
+Fix an 11-year old bug in ngene_command_config_free_buf() while
+addressing the following warnings caught with -Warray-bounds:
+
+arch/alpha/include/asm/string.h:22:16: warning: '__builtin_memcpy' offset [12, 16] from the object at 'com' is out of the bounds of referenced subobject 'config' with type 'unsigned char' at offset 10 [-Warray-bounds]
+arch/x86/include/asm/string_32.h:182:25: warning: '__builtin_memcpy' offset [12, 16] from the object at 'com' is out of the bounds of referenced subobject 'config' with type 'unsigned char' at offset 10 [-Warray-bounds]
+
+The problem is that the original code is trying to copy 6 bytes of
+data into a one-byte member _config_ of the wrong structure
+FW_CONFIGURE_BUFFERS, in a single call to memcpy(). This causes a
+legitimate compiler warning because memcpy() overruns the length
+of &com.cmd.ConfigureBuffers.config. It seems that the right
+structure is FW_CONFIGURE_FREE_BUFFERS, instead, because it contains
+6 more members apart from the header _hdr_. Also, the name of
+the function ngene_command_config_free_buf() suggests that the actual
+intention is to ConfigureFreeBuffers, instead of ConfigureBuffers
+(which takes place in the function ngene_command_config_buf(), above).
+
+Fix this by enclosing those 6 members of struct FW_CONFIGURE_FREE_BUFFERS
+into new struct config, and use &com.cmd.ConfigureFreeBuffers.config as
+the destination address, instead of &com.cmd.ConfigureBuffers.config,
+when calling memcpy().
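+
+A condensed sketch of the reworked layout and copy (as in the diff
+below), which makes the 6-byte memcpy() land in a 6-byte destination:
+
+  struct FW_CONFIGURE_FREE_BUFFERS {
+          struct FW_HEADER hdr;
+          struct {
+                  u8 UVI1_BufferLength;
+                  u8 UVI2_BufferLength;
+                  u8 TVO_BufferLength;
+                  u8 AUD1_BufferLength;
+                  u8 AUD2_BufferLength;
+                  u8 TVA_BufferLength;
+          } __packed config;
+  } __attribute__ ((__packed__));
+
+  memcpy(&com.cmd.ConfigureFreeBuffers.config, config, 6);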
+
+This also helps with the ongoing efforts to globally enable
+-Warray-bounds and get us closer to being able to tighten the
+FORTIFY_SOURCE routines on memcpy().
+
+Link: https://github.com/KSPP/linux/issues/109
+Fixes: dae52d009fc9 ("V4L/DVB: ngene: Initial check-in")
+Cc: stable@vger.kernel.org
+Reported-by: kernel test robot <lkp@intel.com>
+Reviewed-by: Kees Cook <keescook@chromium.org>
+Signed-off-by: Gustavo A. R. Silva <gustavoars@kernel.org>
+Link: https://lore.kernel.org/linux-hardening/20210420001631.GA45456@embeddedor/
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/media/pci/ngene/ngene-core.c | 2 +-
+ drivers/media/pci/ngene/ngene.h | 14 ++++++++------
+ 2 files changed, 9 insertions(+), 7 deletions(-)
+
+--- a/drivers/media/pci/ngene/ngene-core.c
++++ b/drivers/media/pci/ngene/ngene-core.c
+@@ -385,7 +385,7 @@ static int ngene_command_config_free_buf
+
+ com.cmd.hdr.Opcode = CMD_CONFIGURE_FREE_BUFFER;
+ com.cmd.hdr.Length = 6;
+- memcpy(&com.cmd.ConfigureBuffers.config, config, 6);
++ memcpy(&com.cmd.ConfigureFreeBuffers.config, config, 6);
+ com.in_len = 6;
+ com.out_len = 0;
+
+--- a/drivers/media/pci/ngene/ngene.h
++++ b/drivers/media/pci/ngene/ngene.h
+@@ -407,12 +407,14 @@ enum _BUFFER_CONFIGS {
+
+ struct FW_CONFIGURE_FREE_BUFFERS {
+ struct FW_HEADER hdr;
+- u8 UVI1_BufferLength;
+- u8 UVI2_BufferLength;
+- u8 TVO_BufferLength;
+- u8 AUD1_BufferLength;
+- u8 AUD2_BufferLength;
+- u8 TVA_BufferLength;
++ struct {
++ u8 UVI1_BufferLength;
++ u8 UVI2_BufferLength;
++ u8 TVO_BufferLength;
++ u8 AUD1_BufferLength;
++ u8 AUD2_BufferLength;
++ u8 TVA_BufferLength;
++ } __packed config;
+ } __attribute__ ((__packed__));
+
+ struct FW_CONFIGURE_UART {
--- /dev/null
+From 1a3402d93c73bf6bb4df6d7c2aac35abfc3c50e2 Mon Sep 17 00:00:00 2001
+From: Frederic Weisbecker <frederic@kernel.org>
+Date: Thu, 3 Jun 2021 01:15:59 +0200
+Subject: posix-cpu-timers: Fix rearm racing against process tick
+
+From: Frederic Weisbecker <frederic@kernel.org>
+
+commit 1a3402d93c73bf6bb4df6d7c2aac35abfc3c50e2 upstream.
+
+Since the process wide cputime counter is started locklessly from
+posix_cpu_timer_rearm(), it can be concurrently stopped by operations
+on other timers from the same thread group, such as in the following
+unlucky scenario:
+
+ CPU 0 CPU 1
+ ----- -----
+ timer_settime(TIMER B)
+ posix_cpu_timer_rearm(TIMER A)
+ cpu_clock_sample_group()
+ (pct->timers_active already true)
+
+ handle_posix_cpu_timers()
+ check_process_timers()
+ stop_process_timers()
+ pct->timers_active = false
+ arm_timer(TIMER A)
+
+ tick -> run_posix_cpu_timers()
+ // sees !pct->timers_active, ignore
+ // our TIMER A
+
+Fix this by simply performing the process-wide cputime counting start
+and the timer arming within the same locked section.
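+
+A condensed sketch of the reordering (as in the diff below): the sighand
+lock is taken before sampling, so starting the process-wide counter and
+arming the timer cannot race with stop_process_timers():
+
+  /* Protect timer list r/w in arm_timer() */
+  sighand = lock_task_sighand(p, &flags);
+  if (unlikely(sighand == NULL))
+          goto out;
+
+  /* ... sample the clock, bump the timer, then arm_timer() ... */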
+
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
+Fixes: 60f2ceaa8111 ("posix-cpu-timers: Remove unnecessary locking around cpu_clock_sample_group")
+Cc: stable@vger.kernel.org
+Cc: Oleg Nesterov <oleg@redhat.com>
+Cc: Thomas Gleixner <tglx@linutronix.de>
+Cc: Ingo Molnar <mingo@kernel.org>
+Cc: Eric W. Biederman <ebiederm@xmission.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/time/posix-cpu-timers.c | 10 +++++-----
+ 1 file changed, 5 insertions(+), 5 deletions(-)
+
+--- a/kernel/time/posix-cpu-timers.c
++++ b/kernel/time/posix-cpu-timers.c
+@@ -991,6 +991,11 @@ static void posix_cpu_timer_rearm(struct
+ if (!p)
+ goto out;
+
++ /* Protect timer list r/w in arm_timer() */
++ sighand = lock_task_sighand(p, &flags);
++ if (unlikely(sighand == NULL))
++ goto out;
++
+ /*
+ * Fetch the current sample and update the timer's expiry time.
+ */
+@@ -1001,11 +1006,6 @@ static void posix_cpu_timer_rearm(struct
+
+ bump_cpu_timer(timer, now);
+
+- /* Protect timer list r/w in arm_timer() */
+- sighand = lock_task_sighand(p, &flags);
+- if (unlikely(sighand == NULL))
+- goto out;
+-
+ /*
+ * Now re-arm for the new expiry time.
+ */
--- /dev/null
+From 0db282ba2c12c1515d490d14a1ff696643ab0f1b Mon Sep 17 00:00:00 2001
+From: Peter Collingbourne <pcc@google.com>
+Date: Fri, 23 Jul 2021 15:50:04 -0700
+Subject: selftest: use mmap instead of posix_memalign to allocate memory
+
+From: Peter Collingbourne <pcc@google.com>
+
+commit 0db282ba2c12c1515d490d14a1ff696643ab0f1b upstream.
+
+This test passes pointers obtained from anon_allocate_area to the
+userfaultfd and mremap APIs. This causes a problem if the system
+allocator returns tagged pointers because with the tagged address ABI
+the kernel rejects tagged addresses passed to these APIs, which would
+end up causing the test to fail. To make this test compatible with such
+system allocators, stop using the system allocator to allocate memory in
+anon_allocate_area, and instead just use mmap.
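+
+A condensed sketch of the replacement (as in the diff below):
+
+  *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
+                     MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
+  if (*alloc_area == MAP_FAILED) {
+          fprintf(stderr, "mmap of anonymous memory failed");
+          *alloc_area = NULL;
+  }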
+
+Link: https://lkml.kernel.org/r/20210714195437.118982-3-pcc@google.com
+Link: https://linux-review.googlesource.com/id/Icac91064fcd923f77a83e8e133f8631c5b8fc241
+Fixes: c47174fc362a ("userfaultfd: selftest")
+Co-developed-by: Lokesh Gidra <lokeshgidra@google.com>
+Signed-off-by: Lokesh Gidra <lokeshgidra@google.com>
+Signed-off-by: Peter Collingbourne <pcc@google.com>
+Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
+Cc: Dave Martin <Dave.Martin@arm.com>
+Cc: Will Deacon <will@kernel.org>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Alistair Delva <adelva@google.com>
+Cc: William McVicker <willmcvicker@google.com>
+Cc: Evgenii Stepanov <eugenis@google.com>
+Cc: Mitch Phillips <mitchp@google.com>
+Cc: Andrey Konovalov <andreyknvl@gmail.com>
+Cc: <stable@vger.kernel.org> [5.4]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ tools/testing/selftests/vm/userfaultfd.c | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+--- a/tools/testing/selftests/vm/userfaultfd.c
++++ b/tools/testing/selftests/vm/userfaultfd.c
+@@ -197,8 +197,10 @@ static int anon_release_pages(char *rel_
+
+ static void anon_allocate_area(void **alloc_area)
+ {
+- if (posix_memalign(alloc_area, page_size, nr_pages * page_size)) {
+- fprintf(stderr, "out of memory\n");
++ *alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
++ MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
++	if (*alloc_area == MAP_FAILED) {
++ fprintf(stderr, "mmap of anonymous memory failed");
+ *alloc_area = NULL;
+ }
+ }
usb-typec-tipd-don-t-block-probing-of-consumer-of-connector-nodes.patch
usb-typec-stusb160x-register-role-switch-before-interrupt-registration.patch
usb-typec-stusb160x-don-t-block-probing-of-consumer-of-connector-nodes.patch
+firmware-efi-tell-memblock-about-efi-iomem-reservations.patch
+tracepoints-update-static_call-before-tp_funcs-when-adding-a-tracepoint.patch
+tracing-histogram-rename-cpu-to-common_cpu.patch
+tracing-fix-bug-in-rb_per_cpu_empty-that-might-cause-deadloop.patch
+tracing-synthetic-event-field_pos-is-an-index-not-a-boolean.patch
+btrfs-check-for-missing-device-in-btrfs_trim_fs.patch
+btrfs-fix-unpersisted-i_size-on-fsync-after-expanding-truncate.patch
+btrfs-fix-lock-inversion-problem-when-doing-qgroup-extent-tracing.patch
+media-ngene-fix-out-of-bounds-bug-in-ngene_command_config_free_buf.patch
+ixgbe-fix-packet-corruption-due-to-missing-dma-sync.patch
+driver-core-auxiliary-bus-fix-memory-leak-when-driver_register-fail.patch
+bus-mhi-pci_generic-apply-no-op-for-wake-using-sideband-wake-boolean.patch
+bus-mhi-core-validate-channel-id-when-processing-command-completions.patch
+bus-mhi-pci_generic-fix-inbound-ipcr-channel.patch
+posix-cpu-timers-fix-rearm-racing-against-process-tick.patch
+selftest-use-mmap-instead-of-posix_memalign-to-allocate-memory.patch
+io_uring-explicitly-count-entries-for-poll-reqs.patch
+io_uring-remove-double-poll-entry-on-arm-failure.patch
+io_uring-fix-early-fdput-of-file.patch
+userfaultfd-do-not-untag-user-pointers.patch
--- /dev/null
+From 352384d5c84ebe40fa77098cc234fe173247d8ef Mon Sep 17 00:00:00 2001
+From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
+Date: Thu, 22 Jul 2021 21:52:18 -0400
+Subject: tracepoints: Update static_call before tp_funcs when adding a tracepoint
+
+From: Steven Rostedt (VMware) <rostedt@goodmis.org>
+
+commit 352384d5c84ebe40fa77098cc234fe173247d8ef upstream.
+
+Because of the significant overhead that retpolines impose on indirect
+calls, the tracepoint code was updated to use the new "static_call"
+mechanism, which patches the running code to call a function directly
+instead of through an indirect call, and the called function can be
+changed at runtime.
+
+In the tracepoint code that calls all the registered callbacks that are
+attached to a tracepoint, the following is done:
+
+ it_func_ptr = rcu_dereference_raw((&__tracepoint_##name)->funcs);
+ if (it_func_ptr) {
+ __data = (it_func_ptr)->data;
+ static_call(tp_func_##name)(__data, args);
+ }
+
+If there's just a single callback, the static_call is updated to call
+that callback directly. Once another callback is added, the static_call
+is updated to call the iterator, which simply loops over all the funcs
+in the array and calls each of the callbacks the old way, via indirect
+calls.
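+
+For reference, the iterator pairs each callback with its own data by
+walking the array, roughly as below (simplified from the __DO_TRACE()
+loop in include/linux/tracepoint.h; "proto" and "args" stand in for the
+tracepoint's argument list):
+
+	it_func_ptr = rcu_dereference_raw(tp->funcs);
+	if (it_func_ptr) {
+		do {
+			it_func = READ_ONCE(it_func_ptr->func);
+			__data = it_func_ptr->data;
+			/* func and data come from the same array entry */
+			((void (*)(void *, proto))(it_func))(__data, args);
+		} while ((++it_func_ptr)->func);
+	}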
+
+The issue was discovered with a race between updating the funcs array and
+updating the static_call. The funcs array was updated first and then the
+static_call was updated. This is not an issue as long as the first element
+in the old array is the same as the first element in the new array. But
+that assumption is incorrect, because callbacks also have a priority
+field, and if there's a callback added that has a higher priority than the
+callback on the old array, then it will become the first callback in the
+new array. This means that it is possible to call the old callback with
+the new callback data element, which can cause a kernel panic.
+
+ static_call = callback1()
+ funcs[] = {callback1,data1};
+ callback2 has higher priority than callback1
+
+ CPU 1 CPU 2
+ ----- -----
+
+ new_funcs = {callback2,data2},
+ {callback1,data1}
+
+ rcu_assign_pointer(tp->funcs, new_funcs);
+
+ /*
+ * Now tp->funcs has the new array
+ * but the static_call still calls callback1
+ */
+
+ it_func_ptr = tp->funcs [ new_funcs ]
+ data = it_func_ptr->data [ data2 ]
+ static_call(callback1, data);
+
+ /* Now callback1 is called with
+ * callback2's data */
+
+ [ KERNEL PANIC ]
+
+ update_static_call(iterator);
+
+To prevent this from happening, always switch the static_call to the
+iterator before assigning the tp->funcs to the new array. The iterator will
+always properly match the callback with its data.
+
+To trigger this bug:
+
+ In one terminal:
+
+ while :; do hackbench 50; done
+
+ In another terminal
+
+ echo 1 > /sys/kernel/tracing/events/sched/sched_waking/enable
+ while :; do
+ echo 1 > /sys/kernel/tracing/set_event_pid;
+ sleep 0.5
+ echo 0 > /sys/kernel/tracing/set_event_pid;
+ sleep 0.5
+ done
+
+And it doesn't take long to crash. This is because the set_event_pid adds
+a callback to the sched_waking tracepoint with a high priority, which will
+be called before the sched_waking trace event callback is called.
+
+Note, removal down to a single callback updates the array first, before
+changing the static_call to that single callback. This is the proper
+order, as the first element in the array is the same as what the
+static_call is being changed to.
+
+Link: https://lore.kernel.org/io-uring/4ebea8f0-58c9-e571-fd30-0ce4f6f09c70@samba.org/
+
+Cc: stable@vger.kernel.org
+Fixes: d25e37d89dd2f ("tracepoint: Optimize using static_call()")
+Reported-by: Stefan Metzmacher <metze@samba.org>
+Tested-by: Stefan Metzmacher <metze@samba.org>
+Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/tracepoint.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/kernel/tracepoint.c
++++ b/kernel/tracepoint.c
+@@ -299,8 +299,8 @@ static int tracepoint_add_func(struct tr
+ * a pointer to it. This array is referenced by __DO_TRACE from
+ * include/linux/tracepoint.h using rcu_dereference_sched().
+ */
+- rcu_assign_pointer(tp->funcs, tp_funcs);
+ tracepoint_update_call(tp, tp_funcs, false);
++ rcu_assign_pointer(tp->funcs, tp_funcs);
+ static_key_enable(&tp->key);
+
+ release_probes(old);
--- /dev/null
+From 67f0d6d9883c13174669f88adac4f0ee656cc16a Mon Sep 17 00:00:00 2001
+From: Haoran Luo <www@aegistudio.net>
+Date: Wed, 21 Jul 2021 14:12:07 +0000
+Subject: tracing: Fix bug in rb_per_cpu_empty() that might cause deadloop.
+
+From: Haoran Luo <www@aegistudio.net>
+
+commit 67f0d6d9883c13174669f88adac4f0ee656cc16a upstream.
+
+The "rb_per_cpu_empty()" function misinterprets the condition (as
+not-empty) when "head_page" and "commit_page" of "struct
+ring_buffer_per_cpu" point to the same buffer page, whose
+"buffer_data_page" is empty and whose "read" field is non-zero.
+
+An error scenario can be constructed as follows (kernel perspective):
+
+1. All pages in the buffer have been accessed by reader(s), so all of
+them have a non-zero "read" field.
+
+2. Read and clear all buffer pages so that "rb_num_of_entries()" will
+return 0, indicating there is no more data to read. It is also required
+that "reader_page", "commit_page" and "tail_page" point to the same
+page, while "head_page" is the next page after them.
+
+3. Invoke "ring_buffer_lock_reserve()" with a "length" large enough to
+shoot past the end of the current tail buffer page. Now "head_page",
+"commit_page" and "tail_page" point to the same page.
+
+4. Discard the current event with "ring_buffer_discard_commit()", so
+that "head_page", "commit_page" and "tail_page" point to a page whose
+buffer data page is now empty.
+
+Once the error scenario has been constructed, "tracing_read_pipe" is
+trapped inside a deadloop: "trace_empty()" returns 0 since
+"rb_per_cpu_empty()" returns 0 when it hits the CPU containing the
+constructed ring buffer. Then "trace_find_next_entry_inc()" always
+returns NULL since "rb_num_of_entries()" reports there are no more
+entries to read. Finally "trace_seq_to_user()" returns -EBUSY, kicking
+"tracing_read_pipe" back to the start of the "waitagain" loop.
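+
+To see why, here is a hand evaluation of the old compound check (the
+code removed in the hunk below) against the constructed state; each
+clause is annotated with the value it takes:
+
+	return reader->read == rb_page_commit(reader) && /* true: reader page exhausted */
+	       (commit == reader ||                      /* false: commit is on head page */
+		(commit == head &&                       /* true */
+		 head->read == rb_page_commit(commit))); /* false: read != 0, commit page empty */
+	/* => false: the empty ring buffer is reported as non-empty */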
+
+I've also written a proof-of-concept script to construct the scenario
+and trigger the bug automatically; you can use it to trace and validate
+my reasoning above:
+
+ https://github.com/aegistudio/RingBufferDetonator.git
+
+Tests have been carried out on Linux kernel 5.14-rc2
+(2734d6c1b1a089fb593ef6a23d4b70903526fe0c), my fixed version
+of the kernel (to verify that the update fixes the bug) and
+some older kernels (to determine the range of affected kernels). Test
+results are also attached to the proof-of-concept repository.
+
+Link: https://lore.kernel.org/linux-trace-devel/YPaNxsIlb2yjSi5Y@aegistudio/
+Link: https://lore.kernel.org/linux-trace-devel/YPgrN85WL9VyrZ55@aegistudio
+
+Cc: stable@vger.kernel.org
+Fixes: bf41a158cacba ("ring-buffer: make reentrant")
+Suggested-by: Linus Torvalds <torvalds@linuxfoundation.org>
+Signed-off-by: Haoran Luo <www@aegistudio.net>
+Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/trace/ring_buffer.c | 28 ++++++++++++++++++++++++----
+ 1 file changed, 24 insertions(+), 4 deletions(-)
+
+--- a/kernel/trace/ring_buffer.c
++++ b/kernel/trace/ring_buffer.c
+@@ -3880,10 +3880,30 @@ static bool rb_per_cpu_empty(struct ring
+ if (unlikely(!head))
+ return true;
+
+- return reader->read == rb_page_commit(reader) &&
+- (commit == reader ||
+- (commit == head &&
+- head->read == rb_page_commit(commit)));
++ /* Reader should exhaust content in reader page */
++ if (reader->read != rb_page_commit(reader))
++ return false;
++
++ /*
++ * If writers are committing on the reader page, knowing all
++ * committed content has been read, the ring buffer is empty.
++ */
++ if (commit == reader)
++ return true;
++
++ /*
++ * If writers are committing on a page other than reader page
++ * and head page, there should always be content to read.
++ */
++ if (commit != head)
++ return false;
++
++ /*
++	 * Writers are committing on the head page, so we only need
++	 * to check whether there is committed data; the reader will
++	 * swap the reader page with the head page when reading.
++ */
++ return rb_page_commit(commit) == 0;
+ }
+
+ /**
--- /dev/null
+From 1e3bac71c5053c99d438771fc9fa5082ae5d90aa Mon Sep 17 00:00:00 2001
+From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
+Date: Wed, 21 Jul 2021 11:00:53 -0400
+Subject: tracing/histogram: Rename "cpu" to "common_cpu"
+
+From: Steven Rostedt (VMware) <rostedt@goodmis.org>
+
+commit 1e3bac71c5053c99d438771fc9fa5082ae5d90aa upstream.
+
+Currently the histogram logic allows the user to write "cpu" as an
+event field, and it will record the CPU that the event happened on.
+
+The problem with this is that a lot of events have "cpu" as a real
+field, and using "cpu" for the CPU the event ran on makes it impossible
+to run histograms on the "cpu" field of those events.
+
+For example, if I want to have a histogram on the count of the
+workqueue_queue_work event on its cpu field, running:
+
+ ># echo 'hist:keys=cpu' > events/workqueue/workqueue_queue_work/trigger
+
+gives a misleading and wrong result.
+
+Change the command to "common_cpu", as no event should have "common_*"
+fields; that prefix is reserved for fields used by all events. And the
+name makes sense here, as common_cpu would be a field used by all
+events.
+
+Now we can even do:
+
+ ># echo 'hist:keys=common_cpu,cpu if cpu < 100' > events/workqueue/workqueue_queue_work/trigger
+ ># cat events/workqueue/workqueue_queue_work/hist
+ # event histogram
+ #
+ # trigger info: hist:keys=common_cpu,cpu:vals=hitcount:sort=hitcount:size=2048 if cpu < 100 [active]
+ #
+
+ { common_cpu: 0, cpu: 2 } hitcount: 1
+ { common_cpu: 0, cpu: 4 } hitcount: 1
+ { common_cpu: 7, cpu: 7 } hitcount: 1
+ { common_cpu: 0, cpu: 7 } hitcount: 1
+ { common_cpu: 0, cpu: 1 } hitcount: 1
+ { common_cpu: 0, cpu: 6 } hitcount: 2
+ { common_cpu: 0, cpu: 5 } hitcount: 2
+ { common_cpu: 1, cpu: 1 } hitcount: 4
+ { common_cpu: 6, cpu: 6 } hitcount: 4
+ { common_cpu: 5, cpu: 5 } hitcount: 14
+ { common_cpu: 4, cpu: 4 } hitcount: 26
+ { common_cpu: 0, cpu: 0 } hitcount: 39
+ { common_cpu: 2, cpu: 2 } hitcount: 184
+
+Now, for backward compatibility, I added a trick: if "cpu" is used and
+the field is not found, it will fall back to "common_cpu" and work as
+it did before. This way, old programs that use "cpu" to get the actual
+CPU still work, but if the event has "cpu" as a field, they will get
+that event's "cpu" field, which is probably what they want anyway.
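+
+For example (illustrative; sched_switch has no "cpu" field of its own),
+the fallback keeps an old-style trigger working:
+
+ ># echo 'hist:keys=cpu' > events/sched/sched_switch/trigger
+
+The field lookup for "cpu" fails, so the trigger behaves exactly like
+'hist:keys=common_cpu' and records the CPU the event ran on, as before.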
+
+I updated the tracefs/README to include documentation about both the
+common_timestamp and the common_cpu. This way, if that text is present in
+the README, then an application can know that common_cpu is supported over
+just plain "cpu".
+
+Link: https://lkml.kernel.org/r/20210721110053.26b4f641@oasis.local.home
+
+Cc: Namhyung Kim <namhyung@kernel.org>
+Cc: Ingo Molnar <mingo@kernel.org>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: stable@vger.kernel.org
+Fixes: 8b7622bf94a44 ("tracing: Add cpu field for hist triggers")
+Reviewed-by: Tom Zanussi <zanussi@kernel.org>
+Reviewed-by: Masami Hiramatsu <mhiramat@kernel.org>
+Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ Documentation/trace/histogram.rst | 2 +-
+ kernel/trace/trace.c | 4 ++++
+ kernel/trace/trace_events_hist.c | 22 ++++++++++++++++------
+ 3 files changed, 21 insertions(+), 7 deletions(-)
+
+--- a/Documentation/trace/histogram.rst
++++ b/Documentation/trace/histogram.rst
+@@ -191,7 +191,7 @@ Documentation written by Tom Zanussi
+ with the event, in nanoseconds. May be
+ modified by .usecs to have timestamps
+ interpreted as microseconds.
+- cpu int the cpu on which the event occurred.
++ common_cpu int the cpu on which the event occurred.
+ ====================== ==== =======================================
+
+ Extended error information
+--- a/kernel/trace/trace.c
++++ b/kernel/trace/trace.c
+@@ -5565,6 +5565,10 @@ static const char readme_msg[] =
+ "\t [:name=histname1]\n"
+ "\t [:<handler>.<action>]\n"
+ "\t [if <filter>]\n\n"
++ "\t Note, special fields can be used as well:\n"
++ "\t common_timestamp - to record current timestamp\n"
++ "\t common_cpu - to record the CPU the event happened on\n"
++ "\n"
+ "\t When a matching event is hit, an entry is added to a hash\n"
+ "\t table using the key(s) and value(s) named, and the value of a\n"
+ "\t sum called 'hitcount' is incremented. Keys and values\n"
+--- a/kernel/trace/trace_events_hist.c
++++ b/kernel/trace/trace_events_hist.c
+@@ -1111,7 +1111,7 @@ static const char *hist_field_name(struc
+ field->flags & HIST_FIELD_FL_ALIAS)
+ field_name = hist_field_name(field->operands[0], ++level);
+ else if (field->flags & HIST_FIELD_FL_CPU)
+- field_name = "cpu";
++ field_name = "common_cpu";
+ else if (field->flags & HIST_FIELD_FL_EXPR ||
+ field->flags & HIST_FIELD_FL_VAR_REF) {
+ if (field->system) {
+@@ -1991,14 +1991,24 @@ parse_field(struct hist_trigger_data *hi
+ hist_data->enable_timestamps = true;
+ if (*flags & HIST_FIELD_FL_TIMESTAMP_USECS)
+ hist_data->attrs->ts_in_usecs = true;
+- } else if (strcmp(field_name, "cpu") == 0)
++ } else if (strcmp(field_name, "common_cpu") == 0)
+ *flags |= HIST_FIELD_FL_CPU;
+ else {
+ field = trace_find_event_field(file->event_call, field_name);
+ if (!field || !field->size) {
+- hist_err(tr, HIST_ERR_FIELD_NOT_FOUND, errpos(field_name));
+- field = ERR_PTR(-EINVAL);
+- goto out;
++ /*
++ * For backward compatibility, if field_name
++ * was "cpu", then we treat this the same as
++ * common_cpu.
++ */
++ if (strcmp(field_name, "cpu") == 0) {
++ *flags |= HIST_FIELD_FL_CPU;
++ } else {
++ hist_err(tr, HIST_ERR_FIELD_NOT_FOUND,
++ errpos(field_name));
++ field = ERR_PTR(-EINVAL);
++ goto out;
++ }
+ }
+ }
+ out:
+@@ -5085,7 +5095,7 @@ static void hist_field_print(struct seq_
+ seq_printf(m, "%s=", hist_field->var.name);
+
+ if (hist_field->flags & HIST_FIELD_FL_CPU)
+- seq_puts(m, "cpu");
++ seq_puts(m, "common_cpu");
+ else if (field_name) {
+ if (hist_field->flags & HIST_FIELD_FL_VAR_REF ||
+ hist_field->flags & HIST_FIELD_FL_ALIAS)
--- /dev/null
+From 3b13911a2fd0dd0146c9777a254840c5466cf120 Mon Sep 17 00:00:00 2001
+From: "Steven Rostedt (VMware)" <rostedt@goodmis.org>
+Date: Wed, 21 Jul 2021 19:10:08 -0400
+Subject: tracing: Synthetic event field_pos is an index not a boolean
+
+From: Steven Rostedt (VMware) <rostedt@goodmis.org>
+
+commit 3b13911a2fd0dd0146c9777a254840c5466cf120 upstream.
+
+Performing the following:
+
+ ># echo 'wakeup_lat s32 pid; u64 delta; char wake_comm[]' > synthetic_events
+ ># echo 'hist:keys=pid:__arg__1=common_timestamp.usecs' > events/sched/sched_waking/trigger
+ ># echo 'hist:keys=next_pid:pid=next_pid,delta=common_timestamp.usecs-$__arg__1:onmatch(sched.sched_waking).trace(wakeup_lat,$pid,$delta,prev_comm)'\
+ > events/sched/sched_switch/trigger
+ ># echo 1 > events/synthetic/enable
+
+Crashed the kernel:
+
+ BUG: kernel NULL pointer dereference, address: 000000000000001b
+ #PF: supervisor read access in kernel mode
+ #PF: error_code(0x0000) - not-present page
+ PGD 0 P4D 0
+ Oops: 0000 [#1] PREEMPT SMP
+ CPU: 7 PID: 0 Comm: swapper/7 Not tainted 5.13.0-rc5-test+ #104
+ Hardware name: Hewlett-Packard HP Compaq Pro 6300 SFF/339A, BIOS K01 v03.03 07/14/2016
+ RIP: 0010:strlen+0x0/0x20
+ Code: f6 82 80 2b 0b bc 20 74 11 0f b6 50 01 48 83 c0 01 f6 82 80 2b 0b bc
+ 20 75 ef c3 66 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 <80> 3f 00 74 10
+ 48 89 f8 48 83 c0 01 80 38 9 f8 c3 31
+ RSP: 0018:ffffaa75000d79d0 EFLAGS: 00010046
+ RAX: 0000000000000002 RBX: ffff9cdb55575270 RCX: 0000000000000000
+ RDX: ffff9cdb58c7a320 RSI: ffffaa75000d7b40 RDI: 000000000000001b
+ RBP: ffffaa75000d7b40 R08: ffff9cdb40a4f010 R09: ffffaa75000d7ab8
+ R10: ffff9cdb4398c700 R11: 0000000000000008 R12: ffff9cdb58c7a320
+ R13: ffff9cdb55575270 R14: ffff9cdb58c7a000 R15: 0000000000000018
+ FS: 0000000000000000(0000) GS:ffff9cdb5aa00000(0000) knlGS:0000000000000000
+ CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+ CR2: 000000000000001b CR3: 00000000c0612006 CR4: 00000000001706e0
+ Call Trace:
+ trace_event_raw_event_synth+0x90/0x1d0
+ action_trace+0x5b/0x70
+ event_hist_trigger+0x4bd/0x4e0
+ ? cpumask_next_and+0x20/0x30
+ ? update_sd_lb_stats.constprop.0+0xf6/0x840
+ ? __lock_acquire.constprop.0+0x125/0x550
+ ? find_held_lock+0x32/0x90
+ ? sched_clock_cpu+0xe/0xd0
+ ? lock_release+0x155/0x440
+ ? update_load_avg+0x8c/0x6f0
+ ? enqueue_entity+0x18a/0x920
+ ? __rb_reserve_next+0xe5/0x460
+ ? ring_buffer_lock_reserve+0x12a/0x3f0
+ event_triggers_call+0x52/0xe0
+ trace_event_buffer_commit+0x1ae/0x240
+ trace_event_raw_event_sched_switch+0x114/0x170
+ __traceiter_sched_switch+0x39/0x50
+ __schedule+0x431/0xb00
+ schedule_idle+0x28/0x40
+ do_idle+0x198/0x2e0
+ cpu_startup_entry+0x19/0x20
+ secondary_startup_64_no_verify+0xc2/0xcb
+
+The reason is that the dynamic events array keeps track of the field
+position within the fields array via the field_pos member of the
+synth_field structure. Unfortunately, that member was declared as a
+boolean, which means any field_pos greater than 1 gets truncated to 1
+(in this case it was 2).
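+
+In C, assigning any non-zero value to a bool yields 1, so the index
+silently collapses. A standalone illustration (not kernel code):
+
+	bool field_pos = 2;           /* _Bool stores any non-zero value as 1 */
+	unsigned int idx = field_pos; /* idx == 1, not 2 */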
+
+Link: https://lkml.kernel.org/r/20210721191008.638bce34@oasis.local.home
+
+Cc: Masami Hiramatsu <mhiramat@kernel.org>
+Cc: Namhyung Kim <namhyung@kernel.org>
+Cc: Ingo Molnar <mingo@kernel.org>
+Cc: Andrew Morton <akpm@linux-foundation.org>
+Cc: stable@vger.kernel.org
+Fixes: bd82631d7ccdc ("tracing: Add support for dynamic strings to synthetic events")
+Reviewed-by: Tom Zanussi <zanussi@kernel.org>
+Signed-off-by: Steven Rostedt (VMware) <rostedt@goodmis.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/trace/trace_synth.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/kernel/trace/trace_synth.h
++++ b/kernel/trace/trace_synth.h
+@@ -14,10 +14,10 @@ struct synth_field {
+ char *name;
+ size_t size;
+ unsigned int offset;
++ unsigned int field_pos;
+ bool is_signed;
+ bool is_string;
+ bool is_dynamic;
+- bool field_pos;
+ };
+
+ struct synth_event {
--- /dev/null
+From e71e2ace5721a8b921dca18b045069e7bb411277 Mon Sep 17 00:00:00 2001
+From: Peter Collingbourne <pcc@google.com>
+Date: Fri, 23 Jul 2021 15:50:01 -0700
+Subject: userfaultfd: do not untag user pointers
+
+From: Peter Collingbourne <pcc@google.com>
+
+commit e71e2ace5721a8b921dca18b045069e7bb411277 upstream.
+
+Patch series "userfaultfd: do not untag user pointers", v5.
+
+If a user program uses userfaultfd on ranges of heap memory, it may end
+up passing a tagged pointer to the kernel in the range.start field of
+the UFFDIO_REGISTER ioctl. This can happen when using an MTE-capable
+allocator, or on Android if using the Tagged Pointers feature for MTE
+readiness [1].
+
+When a fault subsequently occurs, the tag is stripped from the fault
+address returned to the application in the fault.address field of struct
+uffd_msg. However, from the application's perspective, the tagged
+address *is* the memory address, so if the application is unaware of
+memory tags, it may get confused by receiving an address that is, from
+its point of view, outside of the bounds of the allocation. We observed
+this behavior in the kselftest for userfaultfd [2] but other
+applications could have the same problem.
+
+Address this by not untagging pointers passed to the userfaultfd ioctls.
+Instead, let the system call fail. Also change the kselftest to use
+mmap so that it doesn't encounter this problem.
+
+[1] https://source.android.com/devices/tech/debug/tagged-pointers
+[2] tools/testing/selftests/vm/userfaultfd.c
+
+This patch (of 2):
+
+Do not untag pointers passed to the userfaultfd ioctls. Instead, let
+the system call fail. This will provide an early indication of problems
+with tag-unaware userspace code instead of letting the code get confused
+later, and is consistent with how we decided to handle brk/mmap/mremap
+in commit dcde237319e6 ("mm: Avoid creating virtual address aliases in
+brk()/mmap()/mremap()"), as well as being consistent with the existing
+tagged address ABI documentation relating to how ioctl arguments are
+handled.
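+
+As an illustration (hypothetical userspace snippet, not part of the
+patch), registering a range whose start address carries a tag now fails
+outright instead of being silently untagged:
+
+	struct uffdio_register reg = {
+		.range = { .start = tagged_ptr, .len = len }, /* top byte != 0 */
+		.mode = UFFDIO_REGISTER_MODE_MISSING,
+	};
+	if (ioctl(uffd, UFFDIO_REGISTER, &reg))
+		/* validate_range() now sees the tag bits and fails with -EINVAL */
+		perror("UFFDIO_REGISTER");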
+
+The code change is a revert of commit 7d0325749a6c ("userfaultfd: untag
+user pointers") plus fixups for the additional calls to validate_range
+that have appeared since then.
+
+Link: https://lkml.kernel.org/r/20210714195437.118982-1-pcc@google.com
+Link: https://lkml.kernel.org/r/20210714195437.118982-2-pcc@google.com
+Link: https://linux-review.googlesource.com/id/I761aa9f0344454c482b83fcfcce547db0a25501b
+Fixes: 63f0c6037965 ("arm64: Introduce prctl() options to control the tagged user addresses ABI")
+Signed-off-by: Peter Collingbourne <pcc@google.com>
+Reviewed-by: Andrey Konovalov <andreyknvl@gmail.com>
+Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Alistair Delva <adelva@google.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Dave Martin <Dave.Martin@arm.com>
+Cc: Evgenii Stepanov <eugenis@google.com>
+Cc: Lokesh Gidra <lokeshgidra@google.com>
+Cc: Mitch Phillips <mitchp@google.com>
+Cc: Vincenzo Frascino <vincenzo.frascino@arm.com>
+Cc: Will Deacon <will@kernel.org>
+Cc: William McVicker <willmcvicker@google.com>
+Cc: <stable@vger.kernel.org> [5.4]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ Documentation/arm64/tagged-address-abi.rst | 26 ++++++++++++++++++--------
+ fs/userfaultfd.c | 26 ++++++++++++--------------
+ 2 files changed, 30 insertions(+), 22 deletions(-)
+
+--- a/Documentation/arm64/tagged-address-abi.rst
++++ b/Documentation/arm64/tagged-address-abi.rst
+@@ -45,14 +45,24 @@ how the user addresses are used by the k
+
+ 1. User addresses not accessed by the kernel but used for address space
+ management (e.g. ``mprotect()``, ``madvise()``). The use of valid
+- tagged pointers in this context is allowed with the exception of
+- ``brk()``, ``mmap()`` and the ``new_address`` argument to
+- ``mremap()`` as these have the potential to alias with existing
+- user addresses.
+-
+- NOTE: This behaviour changed in v5.6 and so some earlier kernels may
+- incorrectly accept valid tagged pointers for the ``brk()``,
+- ``mmap()`` and ``mremap()`` system calls.
++ tagged pointers in this context is allowed with these exceptions:
++
++ - ``brk()``, ``mmap()`` and the ``new_address`` argument to
++ ``mremap()`` as these have the potential to alias with existing
++ user addresses.
++
++ NOTE: This behaviour changed in v5.6 and so some earlier kernels may
++ incorrectly accept valid tagged pointers for the ``brk()``,
++ ``mmap()`` and ``mremap()`` system calls.
++
++ - The ``range.start``, ``start`` and ``dst`` arguments to the
++ ``UFFDIO_*`` ``ioctl()``s used on a file descriptor obtained from
++ ``userfaultfd()``, as fault addresses subsequently obtained by reading
++ the file descriptor will be untagged, which may otherwise confuse
++ tag-unaware programs.
++
++ NOTE: This behaviour changed in v5.14 and so some earlier kernels may
++ incorrectly accept valid tagged pointers for this system call.
+
+ 2. User addresses accessed by the kernel (e.g. ``write()``). This ABI
+ relaxation is disabled by default and the application thread needs to
+--- a/fs/userfaultfd.c
++++ b/fs/userfaultfd.c
+@@ -1236,23 +1236,21 @@ static __always_inline void wake_userfau
+ }
+
+ static __always_inline int validate_range(struct mm_struct *mm,
+- __u64 *start, __u64 len)
++ __u64 start, __u64 len)
+ {
+ __u64 task_size = mm->task_size;
+
+- *start = untagged_addr(*start);
+-
+- if (*start & ~PAGE_MASK)
++ if (start & ~PAGE_MASK)
+ return -EINVAL;
+ if (len & ~PAGE_MASK)
+ return -EINVAL;
+ if (!len)
+ return -EINVAL;
+- if (*start < mmap_min_addr)
++ if (start < mmap_min_addr)
+ return -EINVAL;
+- if (*start >= task_size)
++ if (start >= task_size)
+ return -EINVAL;
+- if (len > task_size - *start)
++ if (len > task_size - start)
+ return -EINVAL;
+ return 0;
+ }
+@@ -1313,7 +1311,7 @@ static int userfaultfd_register(struct u
+ vm_flags |= VM_UFFD_MINOR;
+ }
+
+- ret = validate_range(mm, &uffdio_register.range.start,
++ ret = validate_range(mm, uffdio_register.range.start,
+ uffdio_register.range.len);
+ if (ret)
+ goto out;
+@@ -1519,7 +1517,7 @@ static int userfaultfd_unregister(struct
+ if (copy_from_user(&uffdio_unregister, buf, sizeof(uffdio_unregister)))
+ goto out;
+
+- ret = validate_range(mm, &uffdio_unregister.start,
++ ret = validate_range(mm, uffdio_unregister.start,
+ uffdio_unregister.len);
+ if (ret)
+ goto out;
+@@ -1668,7 +1666,7 @@ static int userfaultfd_wake(struct userf
+ if (copy_from_user(&uffdio_wake, buf, sizeof(uffdio_wake)))
+ goto out;
+
+- ret = validate_range(ctx->mm, &uffdio_wake.start, uffdio_wake.len);
++ ret = validate_range(ctx->mm, uffdio_wake.start, uffdio_wake.len);
+ if (ret)
+ goto out;
+
+@@ -1708,7 +1706,7 @@ static int userfaultfd_copy(struct userf
+ sizeof(uffdio_copy)-sizeof(__s64)))
+ goto out;
+
+- ret = validate_range(ctx->mm, &uffdio_copy.dst, uffdio_copy.len);
++ ret = validate_range(ctx->mm, uffdio_copy.dst, uffdio_copy.len);
+ if (ret)
+ goto out;
+ /*
+@@ -1765,7 +1763,7 @@ static int userfaultfd_zeropage(struct u
+ sizeof(uffdio_zeropage)-sizeof(__s64)))
+ goto out;
+
+- ret = validate_range(ctx->mm, &uffdio_zeropage.range.start,
++ ret = validate_range(ctx->mm, uffdio_zeropage.range.start,
+ uffdio_zeropage.range.len);
+ if (ret)
+ goto out;
+@@ -1815,7 +1813,7 @@ static int userfaultfd_writeprotect(stru
+ sizeof(struct uffdio_writeprotect)))
+ return -EFAULT;
+
+- ret = validate_range(ctx->mm, &uffdio_wp.range.start,
++ ret = validate_range(ctx->mm, uffdio_wp.range.start,
+ uffdio_wp.range.len);
+ if (ret)
+ return ret;
+@@ -1863,7 +1861,7 @@ static int userfaultfd_continue(struct u
+ sizeof(uffdio_continue) - (sizeof(__s64))))
+ goto out;
+
+- ret = validate_range(ctx->mm, &uffdio_continue.range.start,
++ ret = validate_range(ctx->mm, uffdio_continue.range.start,
+ uffdio_continue.range.len);
+ if (ret)
+ goto out;