--- /dev/null
+From ef01f4e25c1760920e2c94f1c232350277ace69b Mon Sep 17 00:00:00 2001
+From: Paul Moore <paul@paul-moore.com>
+Date: Fri, 6 Jan 2023 10:43:59 -0500
+Subject: bpf: restore the ebpf program ID for BPF_AUDIT_UNLOAD and PERF_BPF_EVENT_PROG_UNLOAD
+
+From: Paul Moore <paul@paul-moore.com>
+
+commit ef01f4e25c1760920e2c94f1c232350277ace69b upstream.
+
+When changing the ebpf program put() routines to support being called
+from within IRQ context the program ID was reset to zero prior to
+calling the perf event and audit UNLOAD record generators, which
+resulted in problems as the ebpf program ID was bogus (always zero).
+This patch addresses this problem by removing an unnecessary call to
+bpf_prog_free_id() in __bpf_prog_offload_destroy() and adjusting
+__bpf_prog_put() to only call bpf_prog_free_id() after audit and perf
+have finished their bpf program unload tasks in
+bpf_prog_put_deferred(). For the record, no one can determine, or
+remember, why it was necessary to free the program ID, and remove it
+from the IDR, prior to executing bpf_prog_put_deferred();
+regardless, both Stanislav and Alexei agree that the approach in this
+patch should be safe.
+
+It is worth noting that when moving the bpf_prog_free_id() call, the
+do_idr_lock parameter was forced to true as the ebpf devs determined
+this was correct, as the do_idr_lock should always be true. The
+do_idr_lock parameter will be removed in a follow-up patch, but it
+was kept here to keep the patch small in an effort to ease any stable
+backports.
+
+I also modified the bpf_audit_prog() logic used to associate the
+AUDIT_BPF record with other associated records, e.g. @ctx != NULL.
+Instead of keying off the operation, it now keys off the execution
+context, e.g. '!in_irq() && !irqs_disabled()', which is much more
+appropriate and should help better connect the UNLOAD operations with
+the associated audit state (other audit records).
+
+Cc: stable@vger.kernel.org
+Fixes: d809e134be7a ("bpf: Prepare bpf_prog_put() to be called from irq context.")
+Reported-by: Burn Alting <burn.alting@iinet.net.au>
+Reported-by: Jiri Olsa <olsajiri@gmail.com>
+Suggested-by: Stanislav Fomichev <sdf@google.com>
+Suggested-by: Alexei Starovoitov <alexei.starovoitov@gmail.com>
+Signed-off-by: Paul Moore <paul@paul-moore.com>
+Acked-by: Stanislav Fomichev <sdf@google.com>
+Link: https://lore.kernel.org/r/20230106154400.74211-1-paul@paul-moore.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/bpf/offload.c | 3 ---
+ kernel/bpf/syscall.c | 6 ++----
+ 2 files changed, 2 insertions(+), 7 deletions(-)
+
+--- a/kernel/bpf/offload.c
++++ b/kernel/bpf/offload.c
+@@ -216,9 +216,6 @@ static void __bpf_prog_offload_destroy(s
+ if (offload->dev_state)
+ offload->offdev->ops->destroy(prog);
+
+- /* Make sure BPF_PROG_GET_NEXT_ID can't find this dead program */
+- bpf_prog_free_id(prog, true);
+-
+ list_del_init(&offload->offloads);
+ kfree(offload);
+ prog->aux->offload = NULL;
+--- a/kernel/bpf/syscall.c
++++ b/kernel/bpf/syscall.c
+@@ -1958,7 +1958,7 @@ static void bpf_audit_prog(const struct
+ return;
+ if (audit_enabled == AUDIT_OFF)
+ return;
+- if (op == BPF_AUDIT_LOAD)
++ if (!in_irq() && !irqs_disabled())
+ ctx = audit_context();
+ ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF);
+ if (unlikely(!ab))
+@@ -2053,6 +2053,7 @@ static void bpf_prog_put_deferred(struct
+ prog = aux->prog;
+ perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
+ bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
++ bpf_prog_free_id(prog, true);
+ __bpf_prog_put_noref(prog, true);
+ }
+
+@@ -2061,9 +2062,6 @@ static void __bpf_prog_put(struct bpf_pr
+ struct bpf_prog_aux *aux = prog->aux;
+
+ if (atomic64_dec_and_test(&aux->refcnt)) {
+- /* bpf_prog_free_id() must be called first */
+- bpf_prog_free_id(prog, do_idr_lock);
+-
+ if (in_irq() || irqs_disabled()) {
+ INIT_WORK(&aux->work, bpf_prog_put_deferred);
+ schedule_work(&aux->work);
--- /dev/null
+From ed02363fbbed52a3f5ea0d188edd09045a806eb5 Mon Sep 17 00:00:00 2001
+From: Qu Wenruo <wqu@suse.com>
+Date: Mon, 12 Dec 2022 10:19:37 +0800
+Subject: btrfs: add extra error messages to cover non-ENOMEM errors from device_add_list()
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit ed02363fbbed52a3f5ea0d188edd09045a806eb5 upstream.
+
+[BUG]
+When test case btrfs/219 (aka, mount a registered device but with a lower
+generation) failed, there is not any useful information for the end user
+to find out what's going wrong.
+
+The mount failure just looks like this:
+
+ # mount -o loop /tmp/219.img2 /mnt/btrfs/
+ mount: /mnt/btrfs: mount(2) system call failed: File exists.
+ dmesg(1) may have more information after failed mount system call.
+
+While the dmesg contains nothing but the loop device change:
+
+ loop1: detected capacity change from 0 to 524288
+
+[CAUSE]
+In device_list_add() we have a lot of extra checks to reject invalid
+cases.
+
+That function also contains the regular device scan result like the
+following prompt:
+
+ BTRFS: device fsid 6222333e-f9f1-47e6-b306-55ddd4dcaef4 devid 1 transid 8 /dev/loop0 scanned by systemd-udevd (3027)
+
+But unfortunately not all errors have their own error messages, thus if
+we hit something wrong in device_list_add(), there may be no error
+messages at all.
+
+[FIX]
+Add error messages for all non-ENOMEM errors.
+
+For ENOMEM, I'd say we're in a much worse situation, and there should be
+some OOM messages way before our call sites.
+
+CC: stable@vger.kernel.org # 6.0+
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/volumes.c | 11 ++++++++++-
+ 1 file changed, 10 insertions(+), 1 deletion(-)
+
+--- a/fs/btrfs/volumes.c
++++ b/fs/btrfs/volumes.c
+@@ -770,8 +770,11 @@ static noinline struct btrfs_device *dev
+ BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
+
+ error = lookup_bdev(path, &path_devt);
+- if (error)
++ if (error) {
++ btrfs_err(NULL, "failed to lookup block device for path %s: %d",
++ path, error);
+ return ERR_PTR(error);
++ }
+
+ if (fsid_change_in_progress) {
+ if (!has_metadata_uuid)
+@@ -836,6 +839,9 @@ static noinline struct btrfs_device *dev
+
+ if (!device) {
+ if (fs_devices->opened) {
++ btrfs_err(NULL,
++ "device %s belongs to fsid %pU, and the fs is already mounted",
++ path, fs_devices->fsid);
+ mutex_unlock(&fs_devices->device_list_mutex);
+ return ERR_PTR(-EBUSY);
+ }
+@@ -910,6 +916,9 @@ static noinline struct btrfs_device *dev
+ * generation are equal.
+ */
+ mutex_unlock(&fs_devices->device_list_mutex);
++ btrfs_err(NULL,
++"device %s already registered with a higher generation, found %llu expect %llu",
++ path, found_transid, device->generation);
+ return ERR_PTR(-EEXIST);
+ }
+
--- /dev/null
+From 94cd63ae679973edeb5ea95ec25a54467c3e54c8 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Tue, 10 Jan 2023 14:56:36 +0000
+Subject: btrfs: add missing setup of log for full commit at add_conflicting_inode()
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 94cd63ae679973edeb5ea95ec25a54467c3e54c8 upstream.
+
+When logging conflicting inodes, if we reach the maximum limit of inodes,
+we return BTRFS_LOG_FORCE_COMMIT to force a transaction commit. However
+we don't mark the log for full commit (with btrfs_set_log_full_commit()),
+which means that once we leave the log transaction and before we commit
+the transaction, some other task may sync the log, which is incomplete
+as we have not logged all conflicting inodes, leading to an inconsistent
+state in case that log ends up being replayed.
+
+So also call btrfs_set_log_full_commit() at add_conflicting_inode().
+
+Fixes: e09d94c9e448 ("btrfs: log conflicting inodes without holding log mutex of the initial inode")
+CC: stable@vger.kernel.org # 6.1
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/tree-log.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -5626,8 +5626,10 @@ static int add_conflicting_inode(struct
+ * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction
+ * commits.
+ */
+- if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES)
++ if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) {
++ btrfs_set_log_full_commit(trans);
+ return BTRFS_LOG_FORCE_COMMIT;
++ }
+
+ inode = btrfs_iget(root->fs_info->sb, ino, root);
+ /*
--- /dev/null
+From 09e44868f1e03c7825ca4283256abedc95e249a3 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Tue, 10 Jan 2023 14:56:38 +0000
+Subject: btrfs: do not abort transaction on failure to update log root
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 09e44868f1e03c7825ca4283256abedc95e249a3 upstream.
+
+When syncing a log, if we fail to update a log root in the log root tree,
+we are aborting the transaction if the failure was not -ENOSPC. This is
+excessive because there is a chance that a transaction commit can succeed,
+and therefore avoid turning the filesystem into RO mode. All we need to be
+careful about is to mark the log for a full commit, which we already do,
+to make sure no one commits a super block pointing to an outdated log root
+tree.
+
+So don't abort the transaction if we fail to update a log root in the log
+root tree, and log an error if the failure is not -ENOSPC, so that it does
+not go completely unnoticed.
+
+CC: stable@vger.kernel.org # 6.0+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/tree-log.c | 11 ++++-------
+ 1 file changed, 4 insertions(+), 7 deletions(-)
+
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -3075,15 +3075,12 @@ int btrfs_sync_log(struct btrfs_trans_ha
+
+ blk_finish_plug(&plug);
+ btrfs_set_log_full_commit(trans);
+-
+- if (ret != -ENOSPC) {
+- btrfs_abort_transaction(trans, ret);
+- mutex_unlock(&log_root_tree->log_mutex);
+- goto out;
+- }
++ if (ret != -ENOSPC)
++ btrfs_err(fs_info,
++ "failed to update log for root %llu ret %d",
++ root->root_key.objectid, ret);
+ btrfs_wait_tree_log_extents(log, mark);
+ mutex_unlock(&log_root_tree->log_mutex);
+- ret = BTRFS_LOG_FORCE_COMMIT;
+ goto out;
+ }
+
--- /dev/null
+From 16199ad9eb6db60a6b10794a09fc1ac6d09312ff Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Tue, 10 Jan 2023 14:56:37 +0000
+Subject: btrfs: do not abort transaction on failure to write log tree when syncing log
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 16199ad9eb6db60a6b10794a09fc1ac6d09312ff upstream.
+
+When syncing the log, if we fail to write log tree extent buffers, we mark
+the log for a full commit and abort the transaction. However we don't need
+to abort the transaction, all we really need to do is to make sure no one
+can commit a superblock pointing to new log tree roots. Just because we
+got a failure writing extent buffers for a log tree, it does not mean we
+will also fail to do a transaction commit.
+
+One particular case is if due to a bug somewhere, when writing log tree
+extent buffers, the tree checker detects some corruption and the writeout
+fails because of that. Aborting the transaction can be very disruptive for
+a user, especially if the issue happened on a root filesystem. One example
+is the scenario in the Link tag below, where an isolated corruption on log
+tree leaves was causing transaction aborts when syncing the log.
+
+Link: https://lore.kernel.org/linux-btrfs/ae169fc6-f504-28f0-a098-6fa6a4dfb612@leemhuis.info/
+CC: stable@vger.kernel.org # 5.15+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/disk-io.c | 9 ++++++++-
+ fs/btrfs/tree-log.c | 2 --
+ 2 files changed, 8 insertions(+), 3 deletions(-)
+
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -344,7 +344,14 @@ error:
+ btrfs_print_tree(eb, 0);
+ btrfs_err(fs_info, "block=%llu write time tree block corruption detected",
+ eb->start);
+- WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
++ /*
++ * Be noisy if this is an extent buffer from a log tree. We don't abort
++ * a transaction in case there's a bad log tree extent buffer, we just
++ * fallback to a transaction commit. Still we want to know when there is
++ * a bad log tree extent buffer, as that may signal a bug somewhere.
++ */
++ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG) ||
++ btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID);
+ return ret;
+ }
+
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -3011,7 +3011,6 @@ int btrfs_sync_log(struct btrfs_trans_ha
+ ret = 0;
+ if (ret) {
+ blk_finish_plug(&plug);
+- btrfs_abort_transaction(trans, ret);
+ btrfs_set_log_full_commit(trans);
+ mutex_unlock(&root->log_mutex);
+ goto out;
+@@ -3143,7 +3142,6 @@ int btrfs_sync_log(struct btrfs_trans_ha
+ goto out_wake_log_root;
+ } else if (ret) {
+ btrfs_set_log_full_commit(trans);
+- btrfs_abort_transaction(trans, ret);
+ mutex_unlock(&log_root_tree->log_mutex);
+ goto out_wake_log_root;
+ }
--- /dev/null
+From 8bb6898da6271d82d8e76d8088d66b971a7dcfa6 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Tue, 10 Jan 2023 14:56:35 +0000
+Subject: btrfs: fix directory logging due to race with concurrent index key deletion
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 8bb6898da6271d82d8e76d8088d66b971a7dcfa6 upstream.
+
+Sometimes we log a directory without holding its VFS lock, so while we are
+logging it, dir index entries may be added or removed. This typically
+happens when logging a dentry from a parent directory that points to a
+new directory, through log_new_dir_dentries(), or when while logging
+some other inode we also need to log its parent directories (through
+btrfs_log_all_parents()).
+
+This means that while we are at log_dir_items(), we may not find a dir
+index key we found before, because it was deleted in the meanwhile, so
+a call to btrfs_search_slot() may return 1 (key not found). In that case
+we return from log_dir_items() with a success value (the variable 'err'
+has a value of 0). This can lead to a few problems, especially in the case
+where the variable 'last_offset' has a value of (u64)-1 (and it's
+initialized to that when it was declared):
+
+1) By returning from log_dir_items() with success (0) and a value of
+ (u64)-1 for '*last_offset_ret', we end up not logging any other dir
+ index keys that follow the missing, just deleted, index key. The
+ (u64)-1 value makes log_directory_changes() not call log_dir_items()
+ again;
+
+2) Before returning with success (0), log_dir_items(), will log a dir
+ index range item covering a range from the last old dentry index
+ (stored in the variable 'last_old_dentry_offset') to the value of
+ 'last_offset'. If 'last_offset' has a value of (u64)-1, then it means
+ if the log is persisted and replayed after a power failure, it will
+ cause deletion of all the directory entries that have an index number
+ between last_old_dentry_offset + 1 and (u64)-1;
+
+3) We can end up returning from log_dir_items() with
+ ctx->last_dir_item_offset having a lower value than
+ inode->last_dir_index_offset, because the former is set to the current
+ key we are processing at process_dir_items_leaf(), and at the end of
+ log_directory_changes() we set inode->last_dir_index_offset to the
+ current value of ctx->last_dir_item_offset. So if for example a
+ deletion of a lower dir index key happened, we set
+ ctx->last_dir_item_offset to that index value, then if we return from
+ log_dir_items() because btrfs_search_slot() returned 1, we end up
+ returning from log_dir_items() with success (0) and then
+ log_directory_changes() sets inode->last_dir_index_offset to a lower
+ value than it had before.
+ This can result in unpredictable and unexpected behaviour when we
+ need to log again the directory in the same transaction, and can result
+ in ending up with a log tree leaf that has duplicated keys, as we do
+ batch insertions of dir index keys into a log tree.
+
+So fix this by making log_dir_items() move on to the next dir index key
+if it does not find the one it was looking for.
+
+Reported-by: David Arendt <admin@prnet.org>
+Link: https://lore.kernel.org/linux-btrfs/ae169fc6-f504-28f0-a098-6fa6a4dfb612@leemhuis.info/
+CC: stable@vger.kernel.org # 4.14+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/tree-log.c | 21 +++++++++++++++------
+ 1 file changed, 15 insertions(+), 6 deletions(-)
+
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -3888,17 +3888,26 @@ static noinline int log_dir_items(struct
+ btrfs_release_path(path);
+
+ /*
+- * Find the first key from this transaction again. See the note for
+- * log_new_dir_dentries, if we're logging a directory recursively we
+- * won't be holding its i_mutex, which means we can modify the directory
+- * while we're logging it. If we remove an entry between our first
+- * search and this search we'll not find the key again and can just
+- * bail.
++ * Find the first key from this transaction again or the one we were at
++ * in the loop below in case we had to reschedule. We may be logging the
++ * directory without holding its VFS lock, which happen when logging new
++ * dentries (through log_new_dir_dentries()) or in some cases when we
++ * need to log the parent directory of an inode. This means a dir index
++ * key might be deleted from the inode's root, and therefore we may not
++ * find it anymore. If we can't find it, just move to the next key. We
++ * can not bail out and ignore, because if we do that we will simply
++ * not log dir index keys that come after the one that was just deleted
++ * and we can end up logging a dir index range that ends at (u64)-1
++ * (@last_offset is initialized to that), resulting in removing dir
++ * entries we should not remove at log replay time.
+ */
+ search:
+ ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
++ if (ret > 0)
++ ret = btrfs_next_item(root, path);
+ if (ret < 0)
+ err = ret;
++ /* If ret is 1, there are no more keys in the inode's root. */
+ if (ret != 0)
+ goto done;
+
--- /dev/null
+From 1f55ee6d0901d915801618bda0af4e5b937e3db7 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Thu, 12 Jan 2023 14:17:20 +0000
+Subject: btrfs: fix invalid leaf access due to inline extent during lseek
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 1f55ee6d0901d915801618bda0af4e5b937e3db7 upstream.
+
+During lseek, for SEEK_DATA and SEEK_HOLE modes, we access the disk_bytenr
+of an extent without checking its type. However inline extents have their
+data starting the offset of the disk_bytenr field, so accessing that field
+when we have an inline extent can result in either of the following:
+
+1) Interpret the inline extent's data as a disk_bytenr value;
+
+2) In case the inline data is less than 8 bytes, we access part of some
+ other item in the leaf, or unused space in the leaf;
+
+3) In case the inline data is less than 8 bytes and the extent item is
+ the first item in the leaf, we can access beyond the leaf's limit.
+
+So fix this by not accessing the disk_bytenr field if we have an inline
+extent.
+
+Fixes: b6e833567ea1 ("btrfs: make hole and data seeking a lot more efficient")
+Reported-by: Matthias Schoepfer <matthias.schoepfer@googlemail.com>
+Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=216908
+Link: https://lore.kernel.org/linux-btrfs/7f25442f-b121-2a3a-5a3d-22bcaae83cd4@leemhuis.info/
+CC: stable@vger.kernel.org # 6.1
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/file.c | 13 ++++++++++---
+ 1 file changed, 10 insertions(+), 3 deletions(-)
+
+--- a/fs/btrfs/file.c
++++ b/fs/btrfs/file.c
+@@ -3838,6 +3838,7 @@ static loff_t find_desired_extent(struct
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_file_extent_item *extent;
+ u64 extent_end;
++ u8 type;
+
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+@@ -3892,10 +3893,16 @@ static loff_t find_desired_extent(struct
+
+ extent = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
++ type = btrfs_file_extent_type(leaf, extent);
+
+- if (btrfs_file_extent_disk_bytenr(leaf, extent) == 0 ||
+- btrfs_file_extent_type(leaf, extent) ==
+- BTRFS_FILE_EXTENT_PREALLOC) {
++ /*
++ * Can't access the extent's disk_bytenr field if this is an
++ * inline extent, since at that offset, it's where the extent
++ * data starts.
++ */
++ if (type == BTRFS_FILE_EXTENT_PREALLOC ||
++ (type == BTRFS_FILE_EXTENT_REG &&
++ btrfs_file_extent_disk_bytenr(leaf, extent) == 0)) {
+ /*
+ * Explicit hole or prealloc extent, search for delalloc.
+ * A prealloc extent is treated like a hole.
--- /dev/null
+From 6d3d970b2735b967650d319be27268fedc5598d1 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Tue, 10 Jan 2023 14:56:34 +0000
+Subject: btrfs: fix missing error handling when logging directory items
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 6d3d970b2735b967650d319be27268fedc5598d1 upstream.
+
+When logging a directory, at log_dir_items(), if we get an error when
+attempting to search the subvolume tree for a dir index item, we end up
+returning 0 (success) from log_dir_items() because 'err' is left with a
+value of 0.
+
+This can lead to a few problems, especially in the case the variable
+'last_offset' has a value of (u64)-1 (and it's initialized to that when
+it was declared):
+
+1) By returning from log_dir_items() with success (0) and a value of
+ (u64)-1 for '*last_offset_ret', we end up not logging any other dir
+ index keys that follow the missing, just deleted, index key. The
+ (u64)-1 value makes log_directory_changes() not call log_dir_items()
+ again;
+
+2) Before returning with success (0), log_dir_items(), will log a dir
+ index range item covering a range from the last old dentry index
+ (stored in the variable 'last_old_dentry_offset') to the value of
+ 'last_offset'. If 'last_offset' has a value of (u64)-1, then it means
+ if the log is persisted and replayed after a power failure, it will
+ cause deletion of all the directory entries that have an index number
+ between last_old_dentry_offset + 1 and (u64)-1;
+
+3) We can end up returning from log_dir_items() with
+ ctx->last_dir_item_offset having a lower value than
+ inode->last_dir_index_offset, because the former is set to the current
+ key we are processing at process_dir_items_leaf(), and at the end of
+ log_directory_changes() we set inode->last_dir_index_offset to the
+ current value of ctx->last_dir_item_offset. So if for example a
+ deletion of a lower dir index key happened, we set
+ ctx->last_dir_item_offset to that index value, then if we return from
+ log_dir_items() because btrfs_search_slot() returned an error, we end up
+ returning without any error from log_dir_items() and then
+ log_directory_changes() sets inode->last_dir_index_offset to a lower
+ value than it had before.
+ This can result in unpredictable and unexpected behaviour when we
+ need to log again the directory in the same transaction, and can result
+ in ending up with a log tree leaf that has duplicated keys, as we do
+ batch insertions of dir index keys into a log tree.
+
+Fix this by setting 'err' to the value of 'ret' in case
+btrfs_search_slot() or btrfs_previous_item() returned an error. That will
+result in falling back to a full transaction commit.
+
+Reported-by: David Arendt <admin@prnet.org>
+Link: https://lore.kernel.org/linux-btrfs/ae169fc6-f504-28f0-a098-6fa6a4dfb612@leemhuis.info/
+Fixes: e02119d5a7b4 ("Btrfs: Add a write ahead tree log to optimize synchronous operations")
+CC: stable@vger.kernel.org # 4.14+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/tree-log.c | 9 +++++++++
+ 1 file changed, 9 insertions(+)
+
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -3857,7 +3857,10 @@ static noinline int log_dir_items(struct
+ path->slots[0]);
+ if (tmp.type == BTRFS_DIR_INDEX_KEY)
+ last_old_dentry_offset = tmp.offset;
++ } else if (ret < 0) {
++ err = ret;
+ }
++
+ goto done;
+ }
+
+@@ -3877,7 +3880,11 @@ static noinline int log_dir_items(struct
+ */
+ if (tmp.type == BTRFS_DIR_INDEX_KEY)
+ last_old_dentry_offset = tmp.offset;
++ } else if (ret < 0) {
++ err = ret;
++ goto done;
+ }
++
+ btrfs_release_path(path);
+
+ /*
+@@ -3890,6 +3897,8 @@ static noinline int log_dir_items(struct
+ */
+ search:
+ ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
++ if (ret < 0)
++ err = ret;
+ if (ret != 0)
+ goto done;
+
--- /dev/null
+From b7adbf9ada3513d2092362c8eac5cddc5b651f5c Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Thu, 12 Jan 2023 16:31:08 +0000
+Subject: btrfs: fix race between quota rescan and disable leading to NULL pointer deref
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit b7adbf9ada3513d2092362c8eac5cddc5b651f5c upstream.
+
+If we have one task trying to start the quota rescan worker while another
+one is trying to disable quotas, we can end up hitting a race that results
+in the quota rescan worker doing a NULL pointer dereference. The steps for
+this are the following:
+
+1) Quotas are enabled;
+
+2) Task A calls the quota rescan ioctl and enters btrfs_qgroup_rescan().
+ It calls qgroup_rescan_init() which returns 0 (success) and then joins a
+ transaction and commits it;
+
+3) Task B calls the quota disable ioctl and enters btrfs_quota_disable().
+ It clears the bit BTRFS_FS_QUOTA_ENABLED from fs_info->flags and calls
+ btrfs_qgroup_wait_for_completion(), which returns immediately since the
+ rescan worker is not yet running.
+ Then it starts a transaction and locks fs_info->qgroup_ioctl_lock;
+
+4) Task A queues the rescan worker, by calling btrfs_queue_work();
+
+5) The rescan worker starts, and calls rescan_should_stop() at the start
+ of its while loop, which results in 0 iterations of the loop, since
+ the flag BTRFS_FS_QUOTA_ENABLED was cleared from fs_info->flags by
+ task B at step 3);
+
+6) Task B sets fs_info->quota_root to NULL;
+
+7) The rescan worker tries to start a transaction and uses
+ fs_info->quota_root as the root argument for btrfs_start_transaction().
+ This results in a NULL pointer dereference down the call chain of
+ btrfs_start_transaction(). The stack trace is something like the one
+ reported in Link tag below:
+
+ general protection fault, probably for non-canonical address 0xdffffc0000000041: 0000 [#1] PREEMPT SMP KASAN
+ KASAN: null-ptr-deref in range [0x0000000000000208-0x000000000000020f]
+ CPU: 1 PID: 34 Comm: kworker/u4:2 Not tainted 6.1.0-syzkaller-13872-gb6bb9676f216 #0
+ Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/26/2022
+ Workqueue: btrfs-qgroup-rescan btrfs_work_helper
+ RIP: 0010:start_transaction+0x48/0x10f0 fs/btrfs/transaction.c:564
+ Code: 48 89 fb 48 (...)
+ RSP: 0018:ffffc90000ab7ab0 EFLAGS: 00010206
+ RAX: 0000000000000041 RBX: 0000000000000208 RCX: ffff88801779ba80
+ RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000000
+ RBP: dffffc0000000000 R08: 0000000000000001 R09: fffff52000156f5d
+ R10: fffff52000156f5d R11: 1ffff92000156f5c R12: 0000000000000000
+ R13: 0000000000000001 R14: 0000000000000001 R15: 0000000000000003
+ FS: 0000000000000000(0000) GS:ffff8880b9900000(0000) knlGS:0000000000000000
+ CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+ CR2: 00007f2bea75b718 CR3: 000000001d0cc000 CR4: 00000000003506e0
+ DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+ DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+ Call Trace:
+ <TASK>
+ btrfs_qgroup_rescan_worker+0x3bb/0x6a0 fs/btrfs/qgroup.c:3402
+ btrfs_work_helper+0x312/0x850 fs/btrfs/async-thread.c:280
+ process_one_work+0x877/0xdb0 kernel/workqueue.c:2289
+ worker_thread+0xb14/0x1330 kernel/workqueue.c:2436
+ kthread+0x266/0x300 kernel/kthread.c:376
+ ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308
+ </TASK>
+ Modules linked in:
+
+So fix this by having the rescan worker function not attempt to start a
+transaction if it didn't do any rescan work.
+
+Reported-by: syzbot+96977faa68092ad382c4@syzkaller.appspotmail.com
+Link: https://lore.kernel.org/linux-btrfs/000000000000e5454b05f065a803@google.com/
+Fixes: e804861bd4e6 ("btrfs: fix deadlock between quota disable and qgroup rescan worker")
+CC: stable@vger.kernel.org # 5.4+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/qgroup.c | 25 +++++++++++++++++--------
+ 1 file changed, 17 insertions(+), 8 deletions(-)
+
+--- a/fs/btrfs/qgroup.c
++++ b/fs/btrfs/qgroup.c
+@@ -3348,6 +3348,7 @@ static void btrfs_qgroup_rescan_worker(s
+ int err = -ENOMEM;
+ int ret = 0;
+ bool stopped = false;
++ bool did_leaf_rescans = false;
+
+ path = btrfs_alloc_path();
+ if (!path)
+@@ -3368,6 +3369,7 @@ static void btrfs_qgroup_rescan_worker(s
+ }
+
+ err = qgroup_rescan_leaf(trans, path);
++ did_leaf_rescans = true;
+
+ if (err > 0)
+ btrfs_commit_transaction(trans);
+@@ -3388,16 +3390,23 @@ out:
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+ /*
+- * only update status, since the previous part has already updated the
+- * qgroup info.
++ * Only update status, since the previous part has already updated the
++ * qgroup info, and only if we did any actual work. This also prevents
++ * race with a concurrent quota disable, which has already set
++ * fs_info->quota_root to NULL and cleared BTRFS_FS_QUOTA_ENABLED at
++ * btrfs_quota_disable().
+ */
+- trans = btrfs_start_transaction(fs_info->quota_root, 1);
+- if (IS_ERR(trans)) {
+- err = PTR_ERR(trans);
++ if (did_leaf_rescans) {
++ trans = btrfs_start_transaction(fs_info->quota_root, 1);
++ if (IS_ERR(trans)) {
++ err = PTR_ERR(trans);
++ trans = NULL;
++ btrfs_err(fs_info,
++ "fail to start transaction for status update: %d",
++ err);
++ }
++ } else {
+ trans = NULL;
+- btrfs_err(fs_info,
+- "fail to start transaction for status update: %d",
+- err);
+ }
+
+ mutex_lock(&fs_info->qgroup_rescan_lock);
--- /dev/null
+From 75181406b4eafacc531ff2ee5fb032bd93317e2b Mon Sep 17 00:00:00 2001
+From: Qu Wenruo <wqu@suse.com>
+Date: Tue, 10 Jan 2023 15:14:17 +0800
+Subject: btrfs: qgroup: do not warn on record without old_roots populated
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit 75181406b4eafacc531ff2ee5fb032bd93317e2b upstream.
+
+[BUG]
+There are some reports from the mailing list that since v6.1 kernel, the
+WARN_ON() inside btrfs_qgroup_account_extent() gets triggered during
+rescan:
+
+ WARNING: CPU: 3 PID: 6424 at fs/btrfs/qgroup.c:2756 btrfs_qgroup_account_extents+0x1ae/0x260 [btrfs]
+ CPU: 3 PID: 6424 Comm: snapperd Tainted: P OE 6.1.2-1-default #1 openSUSE Tumbleweed 05c7a1b1b61d5627475528f71f50444637b5aad7
+ RIP: 0010:btrfs_qgroup_account_extents+0x1ae/0x260 [btrfs]
+ Call Trace:
+ <TASK>
+ btrfs_commit_transaction+0x30c/0xb40 [btrfs c39c9c546c241c593f03bd6d5f39ea1b676250f6]
+ ? start_transaction+0xc3/0x5b0 [btrfs c39c9c546c241c593f03bd6d5f39ea1b676250f6]
+ btrfs_qgroup_rescan+0x42/0xc0 [btrfs c39c9c546c241c593f03bd6d5f39ea1b676250f6]
+ btrfs_ioctl+0x1ab9/0x25c0 [btrfs c39c9c546c241c593f03bd6d5f39ea1b676250f6]
+ ? __rseq_handle_notify_resume+0xa9/0x4a0
+ ? mntput_no_expire+0x4a/0x240
+ ? __seccomp_filter+0x319/0x4d0
+ __x64_sys_ioctl+0x90/0xd0
+ do_syscall_64+0x5b/0x80
+ ? syscall_exit_to_user_mode+0x17/0x40
+ ? do_syscall_64+0x67/0x80
+ entry_SYSCALL_64_after_hwframe+0x63/0xcd
+ RIP: 0033:0x7fd9b790d9bf
+ </TASK>
+
+[CAUSE]
+Since commit e15e9f43c7ca ("btrfs: introduce
+BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING to skip qgroup accounting"), if
+our qgroup is already in inconsistent state, we will no longer do the
+time-consuming backref walk.
+
+This can leave some qgroup records without a valid old_roots ulist.
+Normally this is fine, as btrfs_qgroup_account_extents() would also skip
+those records if we have NO_ACCOUNTING flag set.
+
+But there is a small window, if we have NO_ACCOUNTING flag set, and
+inserted some qgroup_record without an old_roots ulist, but then the user
+triggered a qgroup rescan.
+
+During btrfs_qgroup_rescan(), we firstly clear NO_ACCOUNTING flag, then
+commit current transaction.
+
+And since we have a qgroup_record with old_roots = NULL, we trigger the
+WARN_ON() during btrfs_qgroup_account_extents().
+
+[FIX]
+Unfortunately due to the introduction of NO_ACCOUNTING flag, the
+assumption that every qgroup_record would have its old_roots populated
+is no longer correct.
+
+Fix the false alerts and drop the WARN_ON().
+
+Reported-by: Lukas Straub <lukasstraub2@web.de>
+Reported-by: HanatoK <summersnow9403@gmail.com>
+Fixes: e15e9f43c7ca ("btrfs: introduce BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING to skip qgroup accounting")
+CC: stable@vger.kernel.org # 6.1
+Link: https://lore.kernel.org/linux-btrfs/2403c697-ddaf-58ad-3829-0335fc89df09@gmail.com/
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/qgroup.c | 14 ++++++++++++--
+ 1 file changed, 12 insertions(+), 2 deletions(-)
+
+--- a/fs/btrfs/qgroup.c
++++ b/fs/btrfs/qgroup.c
+@@ -2751,9 +2751,19 @@ int btrfs_qgroup_account_extents(struct
+ BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) {
+ /*
+ * Old roots should be searched when inserting qgroup
+- * extent record
++ * extent record.
++ *
++ * But for INCONSISTENT (NO_ACCOUNTING) -> rescan case,
++ * we may have some record inserted during
++ * NO_ACCOUNTING (thus no old_roots populated), but
++ * later we start rescan, which clears NO_ACCOUNTING,
++ * leaving some inserted records without old_roots
++ * populated.
++ *
++ * Those cases are rare and should not cause too much
++ * time spent during commit_transaction().
+ */
+- if (WARN_ON(!record->old_roots)) {
++ if (!record->old_roots) {
+ /* Search commit root to find old_roots */
+ ret = btrfs_find_all_roots(NULL, fs_info,
+ record->bytenr, 0,
--- /dev/null
+From 30b2b2196d6e4cc24cbec633535a2404f258ce69 Mon Sep 17 00:00:00 2001
+From: Enzo Matsumiya <ematsumiya@suse.de>
+Date: Wed, 18 Jan 2023 14:06:57 -0300
+Subject: cifs: do not include page data when checking signature
+
+From: Enzo Matsumiya <ematsumiya@suse.de>
+
+commit 30b2b2196d6e4cc24cbec633535a2404f258ce69 upstream.
+
+On async reads, page data is allocated before sending. When the
+response is received but it has no data to fill (e.g.
+STATUS_END_OF_FILE), __calc_signature() will still include the pages in
+its computation, leading to an invalid signature check.
+
+This patch fixes this by not setting the async read smb_rqst page data
+(zeroed by default) if its got_bytes is 0.
+
+This can be reproduced/verified with xfstests generic/465.
+
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Enzo Matsumiya <ematsumiya@suse.de>
+Reviewed-by: Paulo Alcantara (SUSE) <pc@cjr.nz>
+Signed-off-by: Steve French <stfrench@microsoft.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/cifs/smb2pdu.c | 15 +++++++++------
+ 1 file changed, 9 insertions(+), 6 deletions(-)
+
+--- a/fs/cifs/smb2pdu.c
++++ b/fs/cifs/smb2pdu.c
+@@ -4162,12 +4162,15 @@ smb2_readv_callback(struct mid_q_entry *
+ (struct smb2_hdr *)rdata->iov[0].iov_base;
+ struct cifs_credits credits = { .value = 0, .instance = 0 };
+ struct smb_rqst rqst = { .rq_iov = &rdata->iov[1],
+- .rq_nvec = 1,
+- .rq_pages = rdata->pages,
+- .rq_offset = rdata->page_offset,
+- .rq_npages = rdata->nr_pages,
+- .rq_pagesz = rdata->pagesz,
+- .rq_tailsz = rdata->tailsz };
++ .rq_nvec = 1, };
++
++ if (rdata->got_bytes) {
++ rqst.rq_pages = rdata->pages;
++ rqst.rq_offset = rdata->page_offset;
++ rqst.rq_npages = rdata->nr_pages;
++ rqst.rq_pagesz = rdata->pagesz;
++ rqst.rq_tailsz = rdata->tailsz;
++ }
+
+ WARN_ONCE(rdata->server != mid->server,
+ "rdata server %p != mid server %p",
--- /dev/null
+From 0e678153f5be7e6c8d28835f5a678618da4b7a9c Mon Sep 17 00:00:00 2001
+From: David Hildenbrand <david@redhat.com>
+Date: Thu, 22 Dec 2022 21:55:10 +0100
+Subject: mm/hugetlb: fix PTE marker handling in hugetlb_change_protection()
+
+From: David Hildenbrand <david@redhat.com>
+
+commit 0e678153f5be7e6c8d28835f5a678618da4b7a9c upstream.
+
+Patch series "mm/hugetlb: uffd-wp fixes for hugetlb_change_protection()".
+
+Playing with virtio-mem and background snapshots (using uffd-wp) on
+hugetlb in QEMU, I managed to trigger a VM_BUG_ON(). Looking into the
+details, hugetlb_change_protection() seems to not handle uffd-wp correctly
+in all cases.
+
+Patch #1 fixes my test case. I don't have reproducers for patch #2, as it
+requires running into migration entries.
+
+I did not yet check in detail yet if !hugetlb code requires similar care.
+
+
+This patch (of 2):
+
+There are two problematic cases when stumbling over a PTE marker in
+hugetlb_change_protection():
+
+(1) We protect an uffd-wp PTE marker a second time using uffd-wp: we will
+ end up in the "!huge_pte_none(pte)" case and mess up the PTE marker.
+
+(2) We unprotect a uffd-wp PTE marker: we will similarly end up in the
+ "!huge_pte_none(pte)" case even though we cleared the PTE, because
+ the "pte" variable is stale. We'll mess up the PTE marker.
+
+For example, if we later stumble over such a "wrongly modified" PTE marker,
+we'll treat it like a present PTE that maps some garbage page.
+
+This can, for example, be triggered by mapping a memfd backed by huge
+pages, registering uffd-wp, uffd-wp'ing an unmapped page and (a)
+uffd-wp'ing it a second time; or (b) uffd-unprotecting it; or (c)
+unregistering uffd-wp. Then, if we trigger fallocate(FALLOC_FL_PUNCH_HOLE)
+on that file range, we will run into a VM_BUG_ON:
+
+[ 195.039560] page:00000000ba1f2987 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x0
+[ 195.039565] flags: 0x7ffffc0001000(reserved|node=0|zone=0|lastcpupid=0x1fffff)
+[ 195.039568] raw: 0007ffffc0001000 ffffe742c0000008 ffffe742c0000008 0000000000000000
+[ 195.039569] raw: 0000000000000000 0000000000000000 00000001ffffffff 0000000000000000
+[ 195.039569] page dumped because: VM_BUG_ON_PAGE(compound && !PageHead(page))
+[ 195.039573] ------------[ cut here ]------------
+[ 195.039574] kernel BUG at mm/rmap.c:1346!
+[ 195.039579] invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
+[ 195.039581] CPU: 7 PID: 4777 Comm: qemu-system-x86 Not tainted 6.0.12-200.fc36.x86_64 #1
+[ 195.039583] Hardware name: LENOVO 20WNS1F81N/20WNS1F81N, BIOS N35ET50W (1.50 ) 09/15/2022
+[ 195.039584] RIP: 0010:page_remove_rmap+0x45b/0x550
+[ 195.039588] Code: [...]
+[ 195.039589] RSP: 0018:ffffbc03c3633ba8 EFLAGS: 00010292
+[ 195.039591] RAX: 0000000000000040 RBX: ffffe742c0000000 RCX: 0000000000000000
+[ 195.039592] RDX: 0000000000000002 RSI: ffffffff8e7aac1a RDI: 00000000ffffffff
+[ 195.039592] RBP: 0000000000000001 R08: 0000000000000000 R09: ffffbc03c3633a08
+[ 195.039593] R10: 0000000000000003 R11: ffffffff8f146328 R12: ffff9b04c42754b0
+[ 195.039594] R13: ffffffff8fcc6328 R14: ffffbc03c3633c80 R15: ffff9b0484ab9100
+[ 195.039595] FS: 00007fc7aaf68640(0000) GS:ffff9b0bbf7c0000(0000) knlGS:0000000000000000
+[ 195.039596] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[ 195.039597] CR2: 000055d402c49110 CR3: 0000000159392003 CR4: 0000000000772ee0
+[ 195.039598] PKRU: 55555554
+[ 195.039599] Call Trace:
+[ 195.039600] <TASK>
+[ 195.039602] __unmap_hugepage_range+0x33b/0x7d0
+[ 195.039605] unmap_hugepage_range+0x55/0x70
+[ 195.039608] hugetlb_vmdelete_list+0x77/0xa0
+[ 195.039611] hugetlbfs_fallocate+0x410/0x550
+[ 195.039612] ? _raw_spin_unlock_irqrestore+0x23/0x40
+[ 195.039616] vfs_fallocate+0x12e/0x360
+[ 195.039618] __x64_sys_fallocate+0x40/0x70
+[ 195.039620] do_syscall_64+0x58/0x80
+[ 195.039623] ? syscall_exit_to_user_mode+0x17/0x40
+[ 195.039624] ? do_syscall_64+0x67/0x80
+[ 195.039626] entry_SYSCALL_64_after_hwframe+0x63/0xcd
+[ 195.039628] RIP: 0033:0x7fc7b590651f
+[ 195.039653] Code: [...]
+[ 195.039654] RSP: 002b:00007fc7aaf66e70 EFLAGS: 00000293 ORIG_RAX: 000000000000011d
+[ 195.039655] RAX: ffffffffffffffda RBX: 0000558ef4b7f370 RCX: 00007fc7b590651f
+[ 195.039656] RDX: 0000000018000000 RSI: 0000000000000003 RDI: 000000000000000c
+[ 195.039657] RBP: 0000000008000000 R08: 0000000000000000 R09: 0000000000000073
+[ 195.039658] R10: 0000000008000000 R11: 0000000000000293 R12: 0000000018000000
+[ 195.039658] R13: 00007fb8bbe00000 R14: 000000000000000c R15: 0000000000001000
+[ 195.039661] </TASK>
+
+Fix it by not going into the "!huge_pte_none(pte)" case if we stumble over
+an exclusive marker. spin_unlock() + continue would get the job done.
+
+However, instead, make it clearer that there are no fall-through
+statements: we process each case (hwpoison, migration, marker, !none,
+none) and then unlock the page table to continue with the next PTE. Let's
+avoid "continue" statements and use a single spin_unlock() at the end.
+
+Link: https://lkml.kernel.org/r/20221222205511.675832-1-david@redhat.com
+Link: https://lkml.kernel.org/r/20221222205511.675832-2-david@redhat.com
+Fixes: 60dfaad65aa9 ("mm/hugetlb: allow uffd wr-protect none ptes")
+Signed-off-by: David Hildenbrand <david@redhat.com>
+Reviewed-by: Peter Xu <peterx@redhat.com>
+Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
+Cc: Miaohe Lin <linmiaohe@huawei.com>
+Cc: Muchun Song <muchun.song@linux.dev>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/hugetlb.c | 21 +++++++--------------
+ 1 file changed, 7 insertions(+), 14 deletions(-)
+
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -6623,10 +6623,8 @@ unsigned long hugetlb_change_protection(
+ }
+ pte = huge_ptep_get(ptep);
+ if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
+- spin_unlock(ptl);
+- continue;
+- }
+- if (unlikely(is_hugetlb_entry_migration(pte))) {
++ /* Nothing to do. */
++ } else if (unlikely(is_hugetlb_entry_migration(pte))) {
+ swp_entry_t entry = pte_to_swp_entry(pte);
+ struct page *page = pfn_swap_entry_to_page(entry);
+
+@@ -6647,18 +6645,13 @@ unsigned long hugetlb_change_protection(
+ set_huge_pte_at(mm, address, ptep, newpte);
+ pages++;
+ }
+- spin_unlock(ptl);
+- continue;
+- }
+- if (unlikely(pte_marker_uffd_wp(pte))) {
+- /*
+- * This is changing a non-present pte into a none pte,
+- * no need for huge_ptep_modify_prot_start/commit().
+- */
++ } else if (unlikely(is_pte_marker(pte))) {
++ /* No other markers apply for now. */
++ WARN_ON_ONCE(!pte_marker_uffd_wp(pte));
+ if (uffd_wp_resolve)
++ /* Safe to modify directly (non-present->none). */
+ huge_pte_clear(mm, address, ptep, psize);
+- }
+- if (!huge_pte_none(pte)) {
++ } else if (!huge_pte_none(pte)) {
+ pte_t old_pte;
+ unsigned int shift = huge_page_shift(hstate_vma(vma));
+
--- /dev/null
+From 44f86392bdd165da7e43d3c772aeb1e128ffd6c8 Mon Sep 17 00:00:00 2001
+From: David Hildenbrand <david@redhat.com>
+Date: Thu, 22 Dec 2022 21:55:11 +0100
+Subject: mm/hugetlb: fix uffd-wp handling for migration entries in hugetlb_change_protection()
+
+From: David Hildenbrand <david@redhat.com>
+
+commit 44f86392bdd165da7e43d3c772aeb1e128ffd6c8 upstream.
+
+We have to update the uffd-wp SWP PTE bit independent of the type of
+migration entry. Currently, if we're unlucky and we want to install/clear
+the uffd-wp bit just while we're migrating a read-only mapped hugetlb
+page, we would miss to set/clear the uffd-wp bit.
+
+Further, if we're processing a readable-exclusive migration entry and
+neither want to set or clear the uffd-wp bit, we could currently end up
+losing the uffd-wp bit. Note that the same would hold for writable
+migrating entries, however, having a writable migration entry with the
+uffd-wp bit set would already mean that something went wrong.
+
+Note that the change from !is_readable_migration_entry ->
+writable_migration_entry is harmless and actually cleaner, as raised by
+Miaohe Lin and discussed in [1].
+
+[1] https://lkml.kernel.org/r/90dd6a93-4500-e0de-2bf0-bf522c311b0c@huawei.com
+
+Link: https://lkml.kernel.org/r/20221222205511.675832-3-david@redhat.com
+Fixes: 60dfaad65aa9 ("mm/hugetlb: allow uffd wr-protect none ptes")
+Signed-off-by: David Hildenbrand <david@redhat.com>
+Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
+Cc: Miaohe Lin <linmiaohe@huawei.com>
+Cc: Muchun Song <muchun.song@linux.dev>
+Cc: Peter Xu <peterx@redhat.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/hugetlb.c | 17 +++++++++--------
+ 1 file changed, 9 insertions(+), 8 deletions(-)
+
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -6627,10 +6627,9 @@ unsigned long hugetlb_change_protection(
+ } else if (unlikely(is_hugetlb_entry_migration(pte))) {
+ swp_entry_t entry = pte_to_swp_entry(pte);
+ struct page *page = pfn_swap_entry_to_page(entry);
++ pte_t newpte = pte;
+
+- if (!is_readable_migration_entry(entry)) {
+- pte_t newpte;
+-
++ if (is_writable_migration_entry(entry)) {
+ if (PageAnon(page))
+ entry = make_readable_exclusive_migration_entry(
+ swp_offset(entry));
+@@ -6638,13 +6637,15 @@ unsigned long hugetlb_change_protection(
+ entry = make_readable_migration_entry(
+ swp_offset(entry));
+ newpte = swp_entry_to_pte(entry);
+- if (uffd_wp)
+- newpte = pte_swp_mkuffd_wp(newpte);
+- else if (uffd_wp_resolve)
+- newpte = pte_swp_clear_uffd_wp(newpte);
+- set_huge_pte_at(mm, address, ptep, newpte);
+ pages++;
+ }
++
++ if (uffd_wp)
++ newpte = pte_swp_mkuffd_wp(newpte);
++ else if (uffd_wp_resolve)
++ newpte = pte_swp_clear_uffd_wp(newpte);
++ if (!pte_same(pte, newpte))
++ set_huge_pte_at(mm, address, ptep, newpte);
+ } else if (unlikely(is_pte_marker(pte))) {
+ /* No other markers apply for now. */
+ WARN_ON_ONCE(!pte_marker_uffd_wp(pte));
--- /dev/null
+From fed15f1345dc8a7fc8baa81e8b55c3ba010d7f4b Mon Sep 17 00:00:00 2001
+From: Peter Xu <peterx@redhat.com>
+Date: Wed, 4 Jan 2023 17:52:05 -0500
+Subject: mm/hugetlb: pre-allocate pgtable pages for uffd wr-protects
+
+From: Peter Xu <peterx@redhat.com>
+
+commit fed15f1345dc8a7fc8baa81e8b55c3ba010d7f4b upstream.
+
+Userfaultfd-wp uses pte markers to mark wr-protected pages for both shmem
+and hugetlb. Shmem has pre-allocation ready for markers, but hugetlb path
+was overlooked.
+
+Doing so by calling huge_pte_alloc() if the initial pgtable walk fails to
+find the huge ptep. It's possible that huge_pte_alloc() can fail with
+high memory pressure, in that case stop the loop immediately and fail
+silently. This is not the most ideal solution but it matches with what we
+do with shmem meanwhile it avoids the splat in dmesg.
+
+Link: https://lkml.kernel.org/r/20230104225207.1066932-2-peterx@redhat.com
+Fixes: 60dfaad65aa9 ("mm/hugetlb: allow uffd wr-protect none ptes")
+Signed-off-by: Peter Xu <peterx@redhat.com>
+Reported-by: James Houghton <jthoughton@google.com>
+Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
+Acked-by: David Hildenbrand <david@redhat.com>
+Acked-by: James Houghton <jthoughton@google.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Axel Rasmussen <axelrasmussen@google.com>
+Cc: Muchun Song <songmuchun@bytedance.com>
+Cc: Nadav Amit <nadav.amit@gmail.com>
+Cc: <stable@vger.kernel.org> [5.19+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/hugetlb.c | 13 +++++++++++--
+ 1 file changed, 11 insertions(+), 2 deletions(-)
+
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -6604,8 +6604,17 @@ unsigned long hugetlb_change_protection(
+ spinlock_t *ptl;
+ ptep = huge_pte_offset(mm, address, psize);
+ if (!ptep) {
+- address |= last_addr_mask;
+- continue;
++ if (!uffd_wp) {
++ address |= last_addr_mask;
++ continue;
++ }
++ /*
++ * Userfaultfd wr-protect requires pgtable
++ * pre-allocations to install pte markers.
++ */
++ ptep = huge_pte_alloc(mm, vma, address, psize);
++ if (!ptep)
++ break;
+ }
+ ptl = huge_pte_lock(h, mm, ptep);
+ if (huge_pmd_unshare(mm, vma, address, ptep)) {
--- /dev/null
+From 52dc031088f00e323140ece4004e70c33153c6dd Mon Sep 17 00:00:00 2001
+From: Zach O'Keefe <zokeefe@google.com>
+Date: Sat, 24 Dec 2022 00:20:34 -0800
+Subject: mm/MADV_COLLAPSE: don't expand collapse when vm_end is past requested end
+
+From: Zach O'Keefe <zokeefe@google.com>
+
+commit 52dc031088f00e323140ece4004e70c33153c6dd upstream.
+
+MADV_COLLAPSE acts on one hugepage-aligned/sized region at a time, until
+it has collapsed all eligible memory contained within the bounds supplied
+by the user.
+
+At the top of each hugepage iteration we (re)lock mmap_lock and
+(re)validate the VMA for eligibility and update variables that might have
+changed while mmap_lock was dropped. One thing that might occur is that
+the VMA could be resized, and as such, we refetch vma->vm_end to make sure
+we don't collapse past the end of the VMA's new end.
+
+However, it's possible that when refetching vma->vm_end that we expand the
+region acted on by MADV_COLLAPSE if vma->vm_end is greater than size+len
+supplied by the user.
+
+The consequence here is that we may attempt to collapse more memory than
+requested, possibly yielding either "too much success" or "false failure"
+user-visible results. An example of the former is if we MADV_COLLAPSE the
+first 4MiB of a 2TiB mmap()'d file, the incorrect refetch would cause the
+operation to block for much longer than anticipated as we attempt to
+collapse the entire TiB region. An example of the latter is that applying
+MADV_COLLAPSE to a 4MiB file mapped to the start of a 6MiB VMA will
+successfully collapse the first 4MiB, then incorrectly attempt to collapse
+the last hugepage-aligned/sized region -- fail (since readahead/page cache
+lookup will fail) -- and report a failure to the user.
+
+I don't believe there is a kernel stability concern here as we always
+(re)validate the VMA / region accordingly. Also as Hugh mentions, the
+user-visible effects are: we try to collapse more memory than requested
+by the user, and/or failing an operation that should have otherwise
+succeeded. An example is trying to collapse a 4MiB file contained
+within a 12MiB VMA.
+
+Don't expand the acted-on region when refetching vma->vm_end.
+
+Link: https://lkml.kernel.org/r/20221224082035.3197140-1-zokeefe@google.com
+Fixes: 4d24de9425f7 ("mm: MADV_COLLAPSE: refetch vm_end after reacquiring mmap_lock")
+Signed-off-by: Zach O'Keefe <zokeefe@google.com>
+Reported-by: Hugh Dickins <hughd@google.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/khugepaged.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/mm/khugepaged.c
++++ b/mm/khugepaged.c
+@@ -2644,7 +2644,7 @@ int madvise_collapse(struct vm_area_stru
+ goto out_nolock;
+ }
+
+- hend = vma->vm_end & HPAGE_PMD_MASK;
++ hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
+ }
+ mmap_assert_locked(mm);
+ memset(cc->node_load, 0, sizeof(cc->node_load));
--- /dev/null
+From 51d3d5eb74ff53b92dcff48b30ae2ed8edd85a32 Mon Sep 17 00:00:00 2001
+From: David Hildenbrand <david@redhat.com>
+Date: Fri, 9 Dec 2022 09:09:12 +0100
+Subject: mm/userfaultfd: enable writenotify while userfaultfd-wp is enabled for a VMA
+
+From: David Hildenbrand <david@redhat.com>
+
+commit 51d3d5eb74ff53b92dcff48b30ae2ed8edd85a32 upstream.
+
+Currently, we don't enable writenotify when enabling userfaultfd-wp on a
+shared writable mapping (for now only shmem and hugetlb). The consequence
+is that vma->vm_page_prot will still include write permissions, to be set
+as default for all PTEs that get remapped (e.g., mprotect(), NUMA hinting,
+page migration, ...).
+
+So far, vma->vm_page_prot is assumed to be a safe default, meaning that we
+only add permissions (e.g., mkwrite) but not remove permissions (e.g.,
+wrprotect). For example, when enabling softdirty tracking, we enable
+writenotify. With uffd-wp on shared mappings, that changed. More details
+on vma->vm_page_prot semantics were summarized in [1].
+
+This is problematic for uffd-wp: we'd have to manually check for a uffd-wp
+PTEs/PMDs and manually write-protect PTEs/PMDs, which is error prone.
+Prone to such issues is any code that uses vma->vm_page_prot to set PTE
+permissions: primarily pte_modify() and mk_pte().
+
+Instead, let's enable writenotify such that PTEs/PMDs/... will be mapped
+write-protected as default and we will only allow selected PTEs that are
+definitely safe to be mapped without write-protection (see
+can_change_pte_writable()) to be writable. In the future, we might want
+to enable write-bit recovery -- e.g., can_change_pte_writable() -- at more
+locations, for example, also when removing uffd-wp protection.
+
+This fixes two known cases:
+
+(a) remove_migration_pte() mapping uffd-wp'ed PTEs writable, resulting
+ in uffd-wp not triggering on write access.
+(b) do_numa_page() / do_huge_pmd_numa_page() mapping uffd-wp'ed PTEs/PMDs
+ writable, resulting in uffd-wp not triggering on write access.
+
+Note that do_numa_page() / do_huge_pmd_numa_page() can be reached even
+without NUMA hinting (which currently doesn't seem to be applicable to
+shmem), for example, by using uffd-wp with a PROT_WRITE shmem VMA. On
+such a VMA, userfaultfd-wp is currently non-functional.
+
+Note that when enabling userfaultfd-wp, there is no need to walk page
+tables to enforce the new default protection for the PTEs: we know that
+they cannot be uffd-wp'ed yet, because that can only happen after enabling
+uffd-wp for the VMA in general.
+
+Also note that this makes mprotect() on ranges with uffd-wp'ed PTEs not
+accidentally set the write bit -- which would result in uffd-wp not
+triggering on later write access. This commit makes uffd-wp on shmem
+behave just like uffd-wp on anonymous memory in that regard, even though,
+mixing mprotect with uffd-wp is controversial.
+
+[1] https://lkml.kernel.org/r/92173bad-caa3-6b43-9d1e-9a471fdbc184@redhat.com
+
+Link: https://lkml.kernel.org/r/20221209080912.7968-1-david@redhat.com
+Fixes: b1f9e876862d ("mm/uffd: enable write protection for shmem & hugetlbfs")
+Signed-off-by: David Hildenbrand <david@redhat.com>
+Reported-by: Ives van Hoorne <ives@codesandbox.io>
+Debugged-by: Peter Xu <peterx@redhat.com>
+Acked-by: Peter Xu <peterx@redhat.com>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Alistair Popple <apopple@nvidia.com>
+Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
+Cc: Nadav Amit <nadav.amit@gmail.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/userfaultfd.c | 28 ++++++++++++++++++++++------
+ mm/mmap.c | 4 ++++
+ 2 files changed, 26 insertions(+), 6 deletions(-)
+
+--- a/fs/userfaultfd.c
++++ b/fs/userfaultfd.c
+@@ -108,6 +108,21 @@ static bool userfaultfd_is_initialized(s
+ return ctx->features & UFFD_FEATURE_INITIALIZED;
+ }
+
++static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
++ vm_flags_t flags)
++{
++ const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP;
++
++ vma->vm_flags = flags;
++ /*
++ * For shared mappings, we want to enable writenotify while
++ * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
++ * recalculate vma->vm_page_prot whenever userfaultfd-wp changes.
++ */
++ if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed)
++ vma_set_page_prot(vma);
++}
++
+ static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
+ int wake_flags, void *key)
+ {
+@@ -618,7 +633,8 @@ static void userfaultfd_event_wait_compl
+ for_each_vma(vmi, vma) {
+ if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
+ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+- vma->vm_flags &= ~__VM_UFFD_FLAGS;
++ userfaultfd_set_vm_flags(vma,
++ vma->vm_flags & ~__VM_UFFD_FLAGS);
+ }
+ }
+ mmap_write_unlock(mm);
+@@ -652,7 +668,7 @@ int dup_userfaultfd(struct vm_area_struc
+ octx = vma->vm_userfaultfd_ctx.ctx;
+ if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
+ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+- vma->vm_flags &= ~__VM_UFFD_FLAGS;
++ userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
+ return 0;
+ }
+
+@@ -733,7 +749,7 @@ void mremap_userfaultfd_prep(struct vm_a
+ } else {
+ /* Drop uffd context if remap feature not enabled */
+ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+- vma->vm_flags &= ~__VM_UFFD_FLAGS;
++ userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
+ }
+ }
+
+@@ -895,7 +911,7 @@ static int userfaultfd_release(struct in
+ prev = vma;
+ }
+
+- vma->vm_flags = new_flags;
++ userfaultfd_set_vm_flags(vma, new_flags);
+ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+ }
+ mmap_write_unlock(mm);
+@@ -1463,7 +1479,7 @@ static int userfaultfd_register(struct u
+ * the next vma was merged into the current one and
+ * the current one has not been updated yet.
+ */
+- vma->vm_flags = new_flags;
++ userfaultfd_set_vm_flags(vma, new_flags);
+ vma->vm_userfaultfd_ctx.ctx = ctx;
+
+ if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
+@@ -1651,7 +1667,7 @@ static int userfaultfd_unregister(struct
+ * the next vma was merged into the current one and
+ * the current one has not been updated yet.
+ */
+- vma->vm_flags = new_flags;
++ userfaultfd_set_vm_flags(vma, new_flags);
+ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+
+ skip:
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -1524,6 +1524,10 @@ int vma_wants_writenotify(struct vm_area
+ if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma))
+ return 1;
+
++ /* Do we need write faults for uffd-wp tracking? */
++ if (userfaultfd_wp(vma))
++ return 1;
++
+ /* Specialty mapping? */
+ if (vm_flags & VM_PFNMAP)
+ return 0;
--- /dev/null
+From 1e336aa0c0250ec84c6f16efac40c9f0138e367d Mon Sep 17 00:00:00 2001
+From: Haibo Chen <haibo.chen@nxp.com>
+Date: Wed, 7 Dec 2022 19:23:15 +0800
+Subject: mmc: sdhci-esdhc-imx: correct the tuning start tap and step setting
+
+From: Haibo Chen <haibo.chen@nxp.com>
+
+commit 1e336aa0c0250ec84c6f16efac40c9f0138e367d upstream.
+
+Current code logic may be impacted by the setting of ROM/Bootloader,
+so unmask these bits first, then set these bits accordingly.
+
+Fixes: 2b16cf326b70 ("mmc: sdhci-esdhc-imx: move tuning static configuration into hwinit function")
+Signed-off-by: Haibo Chen <haibo.chen@nxp.com>
+Acked-by: Adrian Hunter <adrian.hunter@intel.com>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20221207112315.1812222-1-haibo.chen@nxp.com
+Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/mmc/host/sdhci-esdhc-imx.c | 22 +++++++++++++++-------
+ 1 file changed, 15 insertions(+), 7 deletions(-)
+
+--- a/drivers/mmc/host/sdhci-esdhc-imx.c
++++ b/drivers/mmc/host/sdhci-esdhc-imx.c
+@@ -107,6 +107,7 @@
+ #define ESDHC_TUNING_START_TAP_DEFAULT 0x1
+ #define ESDHC_TUNING_START_TAP_MASK 0x7f
+ #define ESDHC_TUNING_CMD_CRC_CHECK_DISABLE (1 << 7)
++#define ESDHC_TUNING_STEP_DEFAULT 0x1
+ #define ESDHC_TUNING_STEP_MASK 0x00070000
+ #define ESDHC_TUNING_STEP_SHIFT 16
+
+@@ -1361,7 +1362,7 @@ static void sdhci_esdhc_imx_hwinit(struc
+ struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
+ struct pltfm_imx_data *imx_data = sdhci_pltfm_priv(pltfm_host);
+ struct cqhci_host *cq_host = host->mmc->cqe_private;
+- int tmp;
++ u32 tmp;
+
+ if (esdhc_is_usdhc(imx_data)) {
+ /*
+@@ -1416,17 +1417,24 @@ static void sdhci_esdhc_imx_hwinit(struc
+
+ if (imx_data->socdata->flags & ESDHC_FLAG_STD_TUNING) {
+ tmp = readl(host->ioaddr + ESDHC_TUNING_CTRL);
+- tmp |= ESDHC_STD_TUNING_EN |
+- ESDHC_TUNING_START_TAP_DEFAULT;
+- if (imx_data->boarddata.tuning_start_tap) {
+- tmp &= ~ESDHC_TUNING_START_TAP_MASK;
++ tmp |= ESDHC_STD_TUNING_EN;
++
++ /*
++ * ROM code or bootloader may config the start tap
++ * and step, unmask them first.
++ */
++ tmp &= ~(ESDHC_TUNING_START_TAP_MASK | ESDHC_TUNING_STEP_MASK);
++ if (imx_data->boarddata.tuning_start_tap)
+ tmp |= imx_data->boarddata.tuning_start_tap;
+- }
++ else
++ tmp |= ESDHC_TUNING_START_TAP_DEFAULT;
+
+ if (imx_data->boarddata.tuning_step) {
+- tmp &= ~ESDHC_TUNING_STEP_MASK;
+ tmp |= imx_data->boarddata.tuning_step
+ << ESDHC_TUNING_STEP_SHIFT;
++ } else {
++ tmp |= ESDHC_TUNING_STEP_DEFAULT
++ << ESDHC_TUNING_STEP_SHIFT;
+ }
+
+ /* Disable the CMD CRC check for tuning, if not, need to
--- /dev/null
+From 8509419758f2cc28dd05370385af0d91573b76b4 Mon Sep 17 00:00:00 2001
+From: Samuel Holland <samuel@sholland.org>
+Date: Tue, 9 Aug 2022 21:25:09 -0500
+Subject: mmc: sunxi-mmc: Fix clock refcount imbalance during unbind
+
+From: Samuel Holland <samuel@sholland.org>
+
+commit 8509419758f2cc28dd05370385af0d91573b76b4 upstream.
+
+If the controller is suspended by runtime PM, the clock is already
+disabled, so do not try to disable it again during removal. Use
+pm_runtime_disable() to flush any pending runtime PM transitions.
+
+Fixes: 9a8e1e8cc2c0 ("mmc: sunxi: Add runtime_pm support")
+Signed-off-by: Samuel Holland <samuel@sholland.org>
+Acked-by: Jernej Skrabec <jernej.skrabec@gmail.com>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20220810022509.43743-1-samuel@sholland.org
+Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/mmc/host/sunxi-mmc.c | 8 +++++---
+ 1 file changed, 5 insertions(+), 3 deletions(-)
+
+--- a/drivers/mmc/host/sunxi-mmc.c
++++ b/drivers/mmc/host/sunxi-mmc.c
+@@ -1492,9 +1492,11 @@ static int sunxi_mmc_remove(struct platf
+ struct sunxi_mmc_host *host = mmc_priv(mmc);
+
+ mmc_remove_host(mmc);
+- pm_runtime_force_suspend(&pdev->dev);
+- disable_irq(host->irq);
+- sunxi_mmc_disable(host);
++ pm_runtime_disable(&pdev->dev);
++ if (!pm_runtime_status_suspended(&pdev->dev)) {
++ disable_irq(host->irq);
++ sunxi_mmc_disable(host);
++ }
+ dma_free_coherent(&pdev->dev, PAGE_SIZE, host->sg_cpu, host->sg_dma);
+ mmc_free_host(mmc);
+
--- /dev/null
+From 43d5f5d63699724d47f0d9e0eae516a260d232b4 Mon Sep 17 00:00:00 2001
+From: Ben Dooks <ben.dooks@codethink.co.uk>
+Date: Fri, 6 Jan 2023 13:44:56 +0000
+Subject: riscv: dts: sifive: fu740: fix size of pcie 32bit memory
+
+From: Ben Dooks <ben.dooks@codethink.co.uk>
+
+commit 43d5f5d63699724d47f0d9e0eae516a260d232b4 upstream.
+
+The 32-bit memory resource is needed for non-prefetchable memory
+allocations on the PCIe bus, however with some cards (such as the
+SM768) the system fails to allocate memory from this.
+
+Checking the allocation against the datasheet, it looks like there
+has been a mis-calcualation of the resource for the first memory
+region (0x0060090000..0x0070ffffff) which in the data-sheet for
+the fu740 (v1p2) is from 0x0060000000..0x007fffffff. Changing
+this to allocate from 0x0060090000..0x007fffffff fixes the probing
+issues.
+
+Fixes: ae80d5148085 ("riscv: dts: Add PCIe support for the SiFive FU740-C000 SoC")
+Cc: Paul Walmsley <paul.walmsley@sifive.com>
+Cc: Greentime Hu <greentime.hu@sifive.com>
+Signed-off-by: Ben Dooks <ben.dooks@codethink.co.uk>
+Cc: stable@vger.kernel.org
+Tested-by: Ron Economos <re@w6rz.net> # from IRC
+Reviewed-by: Conor Dooley <conor.dooley@microchip.com>
+Signed-off-by: Conor Dooley <conor.dooley@microchip.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/riscv/boot/dts/sifive/fu740-c000.dtsi | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/riscv/boot/dts/sifive/fu740-c000.dtsi
++++ b/arch/riscv/boot/dts/sifive/fu740-c000.dtsi
+@@ -328,7 +328,7 @@
+ bus-range = <0x0 0xff>;
+ ranges = <0x81000000 0x0 0x60080000 0x0 0x60080000 0x0 0x10000>, /* I/O */
+ <0x82000000 0x0 0x60090000 0x0 0x60090000 0x0 0xff70000>, /* mem */
+- <0x82000000 0x0 0x70000000 0x0 0x70000000 0x0 0x1000000>, /* mem */
++ <0x82000000 0x0 0x70000000 0x0 0x70000000 0x0 0x10000000>, /* mem */
+ <0xc3000000 0x20 0x00000000 0x20 0x00000000 0x20 0x00000000>; /* mem prefetchable */
+ num-lanes = <0x8>;
+ interrupts = <56>, <57>, <58>, <59>, <60>, <61>, <62>, <63>, <64>;
usb-core-hub-disable-autosuspend-for-ti-tusb8041.patch
comedi-adv_pci1760-fix-pwm-instruction-handling.patch
acpi-prm-check-whether-efi-runtime-is-available.patch
+mmc-sunxi-mmc-fix-clock-refcount-imbalance-during-unbind.patch
+mmc-sdhci-esdhc-imx-correct-the-tuning-start-tap-and-step-setting.patch
+mm-hugetlb-fix-pte-marker-handling-in-hugetlb_change_protection.patch
+mm-hugetlb-fix-uffd-wp-handling-for-migration-entries-in-hugetlb_change_protection.patch
+mm-hugetlb-pre-allocate-pgtable-pages-for-uffd-wr-protects.patch
+mm-userfaultfd-enable-writenotify-while-userfaultfd-wp-is-enabled-for-a-vma.patch
+mm-madv_collapse-don-t-expand-collapse-when-vm_end-is-past-requested-end.patch
+btrfs-add-extra-error-messages-to-cover-non-enomem-errors-from-device_add_list.patch
+btrfs-fix-missing-error-handling-when-logging-directory-items.patch
+btrfs-fix-directory-logging-due-to-race-with-concurrent-index-key-deletion.patch
+btrfs-add-missing-setup-of-log-for-full-commit-at-add_conflicting_inode.patch
+btrfs-do-not-abort-transaction-on-failure-to-write-log-tree-when-syncing-log.patch
+btrfs-do-not-abort-transaction-on-failure-to-update-log-root.patch
+btrfs-qgroup-do-not-warn-on-record-without-old_roots-populated.patch
+btrfs-fix-invalid-leaf-access-due-to-inline-extent-during-lseek.patch
+btrfs-fix-race-between-quota-rescan-and-disable-leading-to-null-pointer-deref.patch
+cifs-do-not-include-page-data-when-checking-signature.patch
+thunderbolt-disable-xdomain-lane-1-only-in-software-connection-manager.patch
+thunderbolt-use-correct-function-to-calculate-maximum-usb3-link-rate.patch
+thunderbolt-do-not-report-errors-if-on-board-retimers-are-found.patch
+thunderbolt-do-not-call-pm-runtime-functions-in-tb_retimer_scan.patch
+riscv-dts-sifive-fu740-fix-size-of-pcie-32bit-memory.patch
+bpf-restore-the-ebpf-program-id-for-bpf_audit_unload-and-perf_bpf_event_prog_unload.patch
--- /dev/null
+From 84ee211c83212f4d35b56e0603acdcc41f860f1b Mon Sep 17 00:00:00 2001
+From: Mika Westerberg <mika.westerberg@linux.intel.com>
+Date: Thu, 8 Sep 2022 09:45:22 +0300
+Subject: thunderbolt: Disable XDomain lane 1 only in software connection manager
+
+From: Mika Westerberg <mika.westerberg@linux.intel.com>
+
+commit 84ee211c83212f4d35b56e0603acdcc41f860f1b upstream.
+
+When firmware connection manager is in use we should not touch the lane
+adapter (well or any) configuration space so do this only when we know
+that the software connection manager is active.
+
+Fixes: 8e1de7042596 ("thunderbolt: Add support for XDomain lane bonding")
+Cc: stable@vger.kernel.org
+Acked-by: Yehezkel Bernat <YehezkelShB@gmail.com>
+Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/thunderbolt/xdomain.c | 17 ++++++++++++-----
+ 1 file changed, 12 insertions(+), 5 deletions(-)
+
+diff --git a/drivers/thunderbolt/xdomain.c b/drivers/thunderbolt/xdomain.c
+index cfa83486c9da..3c51e47dd86b 100644
+--- a/drivers/thunderbolt/xdomain.c
++++ b/drivers/thunderbolt/xdomain.c
+@@ -1419,12 +1419,19 @@ static int tb_xdomain_get_properties(struct tb_xdomain *xd)
+ * registered, we notify the userspace that it has changed.
+ */
+ if (!update) {
+- struct tb_port *port;
++ /*
++ * Now disable lane 1 if bonding was not enabled. Do
++ * this only if bonding was possible at the beginning
++ * (that is we are the connection manager and there are
++ * two lanes).
++ */
++ if (xd->bonding_possible) {
++ struct tb_port *port;
+
+- /* Now disable lane 1 if bonding was not enabled */
+- port = tb_port_at(xd->route, tb_xdomain_parent(xd));
+- if (!port->bonded)
+- tb_port_disable(port->dual_link_port);
++ port = tb_port_at(xd->route, tb_xdomain_parent(xd));
++ if (!port->bonded)
++ tb_port_disable(port->dual_link_port);
++ }
+
+ if (device_add(&xd->dev)) {
+ dev_err(&xd->dev, "failed to add XDomain device\n");
+--
+2.39.1
+
--- /dev/null
+From 23257cfc1cb7202fd0065e9f4a6a0aac1c04c4a9 Mon Sep 17 00:00:00 2001
+From: Mika Westerberg <mika.westerberg@linux.intel.com>
+Date: Thu, 29 Dec 2022 14:10:30 +0200
+Subject: thunderbolt: Do not call PM runtime functions in tb_retimer_scan()
+
+From: Mika Westerberg <mika.westerberg@linux.intel.com>
+
+commit 23257cfc1cb7202fd0065e9f4a6a0aac1c04c4a9 upstream.
+
+We cannot call PM runtime functions in tb_retimer_scan() because it will
+also be called when retimers are scanned from userspace (happens when
+there is no device connected on ChromeOS for instance) and at the same
+USB4 port runtime resume hook. This leads to hang because neither can
+proceed.
+
+Fix this by runtime resuming USB4 ports in tb_scan_port() instead. This
+makes sure the ports are runtime PM active when retimers are added under
+it while avoiding the reported hang as well.
+
+Reported-by: Utkarsh Patel <utkarsh.h.patel@intel.com>
+Fixes: 1e56c88adecc ("thunderbolt: Runtime resume USB4 port when retimers are scanned")
+Cc: stable@vger.kernel.org
+Acked-by: Yehezkel Bernat <YehezkelShB@gmail.com>
+Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/thunderbolt/retimer.c | 17 +++--------------
+ drivers/thunderbolt/tb.c | 20 +++++++++++++++-----
+ 2 files changed, 18 insertions(+), 19 deletions(-)
+
+--- a/drivers/thunderbolt/retimer.c
++++ b/drivers/thunderbolt/retimer.c
+@@ -427,13 +427,6 @@ int tb_retimer_scan(struct tb_port *port
+ {
+ u32 status[TB_MAX_RETIMER_INDEX + 1] = {};
+ int ret, i, last_idx = 0;
+- struct usb4_port *usb4;
+-
+- usb4 = port->usb4;
+- if (!usb4)
+- return 0;
+-
+- pm_runtime_get_sync(&usb4->dev);
+
+ /*
+ * Send broadcast RT to make sure retimer indices facing this
+@@ -441,7 +434,7 @@ int tb_retimer_scan(struct tb_port *port
+ */
+ ret = usb4_port_enumerate_retimers(port);
+ if (ret)
+- goto out;
++ return ret;
+
+ /*
+ * Enable sideband channel for each retimer. We can do this
+@@ -471,11 +464,11 @@ int tb_retimer_scan(struct tb_port *port
+ break;
+ }
+
+- ret = 0;
+ if (!last_idx)
+- goto out;
++ return 0;
+
+ /* Add on-board retimers if they do not exist already */
++ ret = 0;
+ for (i = 1; i <= last_idx; i++) {
+ struct tb_retimer *rt;
+
+@@ -489,10 +482,6 @@ int tb_retimer_scan(struct tb_port *port
+ }
+ }
+
+-out:
+- pm_runtime_mark_last_busy(&usb4->dev);
+- pm_runtime_put_autosuspend(&usb4->dev);
+-
+ return ret;
+ }
+
+--- a/drivers/thunderbolt/tb.c
++++ b/drivers/thunderbolt/tb.c
+@@ -628,11 +628,15 @@ static void tb_scan_port(struct tb_port
+ * Downstream switch is reachable through two ports.
+ * Only scan on the primary port (link_nr == 0).
+ */
++
++ if (port->usb4)
++ pm_runtime_get_sync(&port->usb4->dev);
++
+ if (tb_wait_for_port(port, false) <= 0)
+- return;
++ goto out_rpm_put;
+ if (port->remote) {
+ tb_port_dbg(port, "port already has a remote\n");
+- return;
++ goto out_rpm_put;
+ }
+
+ tb_retimer_scan(port, true);
+@@ -647,12 +651,12 @@ static void tb_scan_port(struct tb_port
+ */
+ if (PTR_ERR(sw) == -EIO || PTR_ERR(sw) == -EADDRNOTAVAIL)
+ tb_scan_xdomain(port);
+- return;
++ goto out_rpm_put;
+ }
+
+ if (tb_switch_configure(sw)) {
+ tb_switch_put(sw);
+- return;
++ goto out_rpm_put;
+ }
+
+ /*
+@@ -681,7 +685,7 @@ static void tb_scan_port(struct tb_port
+
+ if (tb_switch_add(sw)) {
+ tb_switch_put(sw);
+- return;
++ goto out_rpm_put;
+ }
+
+ /* Link the switches using both links if available */
+@@ -733,6 +737,12 @@ static void tb_scan_port(struct tb_port
+
+ tb_add_dp_resources(sw);
+ tb_scan_switch(sw);
++
++out_rpm_put:
++ if (port->usb4) {
++ pm_runtime_mark_last_busy(&port->usb4->dev);
++ pm_runtime_put_autosuspend(&port->usb4->dev);
++ }
+ }
+
+ static void tb_deactivate_and_free_tunnel(struct tb_tunnel *tunnel)
--- /dev/null
+From c28f3d80383571d3630df1a0e89500d23e855924 Mon Sep 17 00:00:00 2001
+From: Utkarsh Patel <utkarsh.h.patel@intel.com>
+Date: Thu, 22 Dec 2022 20:22:46 -0800
+Subject: thunderbolt: Do not report errors if on-board retimers are found
+
+From: Utkarsh Patel <utkarsh.h.patel@intel.com>
+
+commit c28f3d80383571d3630df1a0e89500d23e855924 upstream.
+
+Currently we return an error even if on-board retimers are found and
+that's not expected. Fix this to return an error only if there was one
+and 0 otherwise.
+
+Fixes: 1e56c88adecc ("thunderbolt: Runtime resume USB4 port when retimers are scanned")
+Cc: stable@vger.kernel.org
+Signed-off-by: Utkarsh Patel <utkarsh.h.patel@intel.com>
+Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/thunderbolt/retimer.c | 5 ++---
+ 1 file changed, 2 insertions(+), 3 deletions(-)
+
+--- a/drivers/thunderbolt/retimer.c
++++ b/drivers/thunderbolt/retimer.c
+@@ -471,10 +471,9 @@ int tb_retimer_scan(struct tb_port *port
+ break;
+ }
+
+- if (!last_idx) {
+- ret = 0;
++ ret = 0;
++ if (!last_idx)
+ goto out;
+- }
+
+ /* Add on-board retimers if they do not exist already */
+ for (i = 1; i <= last_idx; i++) {
--- /dev/null
+From e8ff07fb33026c5c1bb5b81293496faba5d68059 Mon Sep 17 00:00:00 2001
+From: Mika Westerberg <mika.westerberg@linux.intel.com>
+Date: Fri, 20 May 2022 13:35:19 +0300
+Subject: thunderbolt: Use correct function to calculate maximum USB3 link rate
+
+From: Mika Westerberg <mika.westerberg@linux.intel.com>
+
+commit e8ff07fb33026c5c1bb5b81293496faba5d68059 upstream.
+
+We need to take minimum of both sides of the USB3 link into consideration,
+not just the downstream port. Fix this by calling tb_usb3_max_link_rate()
+instead.
+
+Fixes: 0bd680cd900c ("thunderbolt: Add USB3 bandwidth management")
+Cc: stable@vger.kernel.org
+Acked-by: Yehezkel Bernat <YehezkelShB@gmail.com>
+Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/thunderbolt/tunnel.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/thunderbolt/tunnel.c
++++ b/drivers/thunderbolt/tunnel.c
+@@ -1275,7 +1275,7 @@ static void tb_usb3_reclaim_available_ba
+ return;
+ } else if (!ret) {
+ /* Use maximum link rate if the link valid is not set */
+- ret = usb4_usb3_port_max_link_rate(tunnel->src_port);
++ ret = tb_usb3_max_link_rate(tunnel->dst_port, tunnel->src_port);
+ if (ret < 0) {
+ tb_tunnel_warn(tunnel, "failed to read maximum link rate\n");
+ return;