--- /dev/null
+From ef01f4e25c1760920e2c94f1c232350277ace69b Mon Sep 17 00:00:00 2001
+From: Paul Moore <paul@paul-moore.com>
+Date: Fri, 6 Jan 2023 10:43:59 -0500
+Subject: bpf: restore the ebpf program ID for BPF_AUDIT_UNLOAD and PERF_BPF_EVENT_PROG_UNLOAD
+
+From: Paul Moore <paul@paul-moore.com>
+
+commit ef01f4e25c1760920e2c94f1c232350277ace69b upstream.
+
+When changing the ebpf program put() routines to support being called
+from within IRQ context the program ID was reset to zero prior to
+calling the perf event and audit UNLOAD record generators, which
+resulted in problems as the ebpf program ID was bogus (always zero).
+This patch addresses this problem by removing an unnecessary call to
+bpf_prog_free_id() in __bpf_prog_offload_destroy() and adjusting
+__bpf_prog_put() to only call bpf_prog_free_id() after audit and perf
+have finished their bpf program unload tasks in
+bpf_prog_put_deferred(). For the record, no one can determine, or
+remember, why it was necessary to free the program ID, and remove it
+from the IDR, prior to executing bpf_prog_put_deferred();
+regardless, both Stanislav and Alexei agree that the approach in this
+patch should be safe.
+
+It is worth noting that when moving the bpf_prog_free_id() call, the
+do_idr_lock parameter was forced to true as the ebpf devs determined
+this was correct, as the do_idr_lock should always be true. The
+do_idr_lock parameter will be removed in a follow-up patch, but it
+was kept here to keep the patch small in an effort to ease any stable
+backports.
+
+I also modified the bpf_audit_prog() logic used to associate the
+AUDIT_BPF record with other associated records, e.g. @ctx != NULL.
+Instead of keying off the operation, it now keys off the execution
+context, e.g. '!in_irq() && !irqs_disabled()', which is much more
+appropriate and should help better connect the UNLOAD operations with
+the associated audit state (other audit records).
+
+Cc: stable@vger.kernel.org
+Fixes: d809e134be7a ("bpf: Prepare bpf_prog_put() to be called from irq context.")
+Reported-by: Burn Alting <burn.alting@iinet.net.au>
+Reported-by: Jiri Olsa <olsajiri@gmail.com>
+Suggested-by: Stanislav Fomichev <sdf@google.com>
+Suggested-by: Alexei Starovoitov <alexei.starovoitov@gmail.com>
+Signed-off-by: Paul Moore <paul@paul-moore.com>
+Acked-by: Stanislav Fomichev <sdf@google.com>
+Link: https://lore.kernel.org/r/20230106154400.74211-1-paul@paul-moore.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/bpf/offload.c | 3 ---
+ kernel/bpf/syscall.c | 6 ++----
+ 2 files changed, 2 insertions(+), 7 deletions(-)
+
+--- a/kernel/bpf/offload.c
++++ b/kernel/bpf/offload.c
+@@ -216,9 +216,6 @@ static void __bpf_prog_offload_destroy(s
+ if (offload->dev_state)
+ offload->offdev->ops->destroy(prog);
+
+- /* Make sure BPF_PROG_GET_NEXT_ID can't find this dead program */
+- bpf_prog_free_id(prog, true);
+-
+ list_del_init(&offload->offloads);
+ kfree(offload);
+ prog->aux->offload = NULL;
+--- a/kernel/bpf/syscall.c
++++ b/kernel/bpf/syscall.c
+@@ -1958,7 +1958,7 @@ static void bpf_audit_prog(const struct
+ return;
+ if (audit_enabled == AUDIT_OFF)
+ return;
+- if (op == BPF_AUDIT_LOAD)
++ if (!in_irq() && !irqs_disabled())
+ ctx = audit_context();
+ ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF);
+ if (unlikely(!ab))
+@@ -2053,6 +2053,7 @@ static void bpf_prog_put_deferred(struct
+ prog = aux->prog;
+ perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0);
+ bpf_audit_prog(prog, BPF_AUDIT_UNLOAD);
++ bpf_prog_free_id(prog, true);
+ __bpf_prog_put_noref(prog, true);
+ }
+
+@@ -2061,9 +2062,6 @@ static void __bpf_prog_put(struct bpf_pr
+ struct bpf_prog_aux *aux = prog->aux;
+
+ if (atomic64_dec_and_test(&aux->refcnt)) {
+- /* bpf_prog_free_id() must be called first */
+- bpf_prog_free_id(prog, do_idr_lock);
+-
+ if (in_irq() || irqs_disabled()) {
+ INIT_WORK(&aux->work, bpf_prog_put_deferred);
+ schedule_work(&aux->work);
--- /dev/null
+From ed02363fbbed52a3f5ea0d188edd09045a806eb5 Mon Sep 17 00:00:00 2001
+From: Qu Wenruo <wqu@suse.com>
+Date: Mon, 12 Dec 2022 10:19:37 +0800
+Subject: btrfs: add extra error messages to cover non-ENOMEM errors from device_add_list()
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit ed02363fbbed52a3f5ea0d188edd09045a806eb5 upstream.
+
+[BUG]
+When test case btrfs/219 (aka, mount a registered device but with a lower
+generation) failed, there is not any useful information for the end user
+to find out what's going wrong.
+
+The mount failure just looks like this:
+
+ # mount -o loop /tmp/219.img2 /mnt/btrfs/
+ mount: /mnt/btrfs: mount(2) system call failed: File exists.
+ dmesg(1) may have more information after failed mount system call.
+
+While the dmesg contains nothing but the loop device change:
+
+ loop1: detected capacity change from 0 to 524288
+
+[CAUSE]
+In device_list_add() we have a lot of extra checks to reject invalid
+cases.
+
+That function also contains the regular device scan result like the
+following prompt:
+
+ BTRFS: device fsid 6222333e-f9f1-47e6-b306-55ddd4dcaef4 devid 1 transid 8 /dev/loop0 scanned by systemd-udevd (3027)
+
+But unfortunately not all errors have their own error messages, thus if
+we hit something wrong in device_list_add(), there may be no error
+messages at all.
+
+[FIX]
+Add error messages for all non-ENOMEM errors.
+
+For ENOMEM, I'd say we're in a much worse situation, and there should be
+some OOM messages way before our call sites.
+
+CC: stable@vger.kernel.org # 6.0+
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/volumes.c | 11 ++++++++++-
+ 1 file changed, 10 insertions(+), 1 deletion(-)
+
+--- a/fs/btrfs/volumes.c
++++ b/fs/btrfs/volumes.c
+@@ -770,8 +770,11 @@ static noinline struct btrfs_device *dev
+ BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
+
+ error = lookup_bdev(path, &path_devt);
+- if (error)
++ if (error) {
++ btrfs_err(NULL, "failed to lookup block device for path %s: %d",
++ path, error);
+ return ERR_PTR(error);
++ }
+
+ if (fsid_change_in_progress) {
+ if (!has_metadata_uuid)
+@@ -836,6 +839,9 @@ static noinline struct btrfs_device *dev
+
+ if (!device) {
+ if (fs_devices->opened) {
++ btrfs_err(NULL,
++ "device %s belongs to fsid %pU, and the fs is already mounted",
++ path, fs_devices->fsid);
+ mutex_unlock(&fs_devices->device_list_mutex);
+ return ERR_PTR(-EBUSY);
+ }
+@@ -910,6 +916,9 @@ static noinline struct btrfs_device *dev
+ * generation are equal.
+ */
+ mutex_unlock(&fs_devices->device_list_mutex);
++ btrfs_err(NULL,
++"device %s already registered with a higher generation, found %llu expect %llu",
++ path, found_transid, device->generation);
+ return ERR_PTR(-EEXIST);
+ }
+
--- /dev/null
+From 94cd63ae679973edeb5ea95ec25a54467c3e54c8 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Tue, 10 Jan 2023 14:56:36 +0000
+Subject: btrfs: add missing setup of log for full commit at add_conflicting_inode()
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 94cd63ae679973edeb5ea95ec25a54467c3e54c8 upstream.
+
+When logging conflicting inodes, if we reach the maximum limit of inodes,
+we return BTRFS_LOG_FORCE_COMMIT to force a transaction commit. However
+we don't mark the log for full commit (with btrfs_set_log_full_commit()),
+which means that once we leave the log transaction and before we commit
+the transaction, some other task may sync the log, which is incomplete
+as we have not logged all conflicting inodes, leading to an inconsistent
+state in case that log ends up being replayed.
+
+So also call btrfs_set_log_full_commit() at add_conflicting_inode().
+
+Fixes: e09d94c9e448 ("btrfs: log conflicting inodes without holding log mutex of the initial inode")
+CC: stable@vger.kernel.org # 6.1
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/tree-log.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -5626,8 +5626,10 @@ static int add_conflicting_inode(struct
+ * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction
+ * commits.
+ */
+- if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES)
++ if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) {
++ btrfs_set_log_full_commit(trans);
+ return BTRFS_LOG_FORCE_COMMIT;
++ }
+
+ inode = btrfs_iget(root->fs_info->sb, ino, root);
+ /*
--- /dev/null
+From 09e44868f1e03c7825ca4283256abedc95e249a3 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Tue, 10 Jan 2023 14:56:38 +0000
+Subject: btrfs: do not abort transaction on failure to update log root
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 09e44868f1e03c7825ca4283256abedc95e249a3 upstream.
+
+When syncing a log, if we fail to update a log root in the log root tree,
+we are aborting the transaction if the failure was not -ENOSPC. This is
+excessive because there is a chance that a transaction commit can succeed,
+and therefore avoid turning the filesystem into RO mode. All we need to be
+careful about is to mark the log for a full commit, which we already do,
+to make sure no one commits a super block pointing to an outdated log root
+tree.
+
+So don't abort the transaction if we fail to update a log root in the log
+root tree, and log an error if the failure is not -ENOSPC, so that it does
+not go completely unnoticed.
+
+CC: stable@vger.kernel.org # 6.0+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/tree-log.c | 11 ++++-------
+ 1 file changed, 4 insertions(+), 7 deletions(-)
+
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -3075,15 +3075,12 @@ int btrfs_sync_log(struct btrfs_trans_ha
+
+ blk_finish_plug(&plug);
+ btrfs_set_log_full_commit(trans);
+-
+- if (ret != -ENOSPC) {
+- btrfs_abort_transaction(trans, ret);
+- mutex_unlock(&log_root_tree->log_mutex);
+- goto out;
+- }
++ if (ret != -ENOSPC)
++ btrfs_err(fs_info,
++ "failed to update log for root %llu ret %d",
++ root->root_key.objectid, ret);
+ btrfs_wait_tree_log_extents(log, mark);
+ mutex_unlock(&log_root_tree->log_mutex);
+- ret = BTRFS_LOG_FORCE_COMMIT;
+ goto out;
+ }
+
--- /dev/null
+From 16199ad9eb6db60a6b10794a09fc1ac6d09312ff Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Tue, 10 Jan 2023 14:56:37 +0000
+Subject: btrfs: do not abort transaction on failure to write log tree when syncing log
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 16199ad9eb6db60a6b10794a09fc1ac6d09312ff upstream.
+
+When syncing the log, if we fail to write log tree extent buffers, we mark
+the log for a full commit and abort the transaction. However we don't need
+to abort the transaction, all we really need to do is to make sure no one
+can commit a superblock pointing to new log tree roots. Just because we
+got a failure writing extent buffers for a log tree, it does not mean we
+will also fail to do a transaction commit.
+
+One particular case is if due to a bug somewhere, when writing log tree
+extent buffers, the tree checker detects some corruption and the writeout
+fails because of that. Aborting the transaction can be very disruptive for
+a user, especially if the issue happened on a root filesystem. One example
+is the scenario in the Link tag below, where an isolated corruption on log
+tree leaves was causing transaction aborts when syncing the log.
+
+Link: https://lore.kernel.org/linux-btrfs/ae169fc6-f504-28f0-a098-6fa6a4dfb612@leemhuis.info/
+CC: stable@vger.kernel.org # 5.15+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/disk-io.c | 9 ++++++++-
+ fs/btrfs/tree-log.c | 2 --
+ 2 files changed, 8 insertions(+), 3 deletions(-)
+
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -344,7 +344,14 @@ error:
+ btrfs_print_tree(eb, 0);
+ btrfs_err(fs_info, "block=%llu write time tree block corruption detected",
+ eb->start);
+- WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
++ /*
++ * Be noisy if this is an extent buffer from a log tree. We don't abort
++ * a transaction in case there's a bad log tree extent buffer, we just
++ * fallback to a transaction commit. Still we want to know when there is
++ * a bad log tree extent buffer, as that may signal a bug somewhere.
++ */
++ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG) ||
++ btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID);
+ return ret;
+ }
+
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -3011,7 +3011,6 @@ int btrfs_sync_log(struct btrfs_trans_ha
+ ret = 0;
+ if (ret) {
+ blk_finish_plug(&plug);
+- btrfs_abort_transaction(trans, ret);
+ btrfs_set_log_full_commit(trans);
+ mutex_unlock(&root->log_mutex);
+ goto out;
+@@ -3143,7 +3142,6 @@ int btrfs_sync_log(struct btrfs_trans_ha
+ goto out_wake_log_root;
+ } else if (ret) {
+ btrfs_set_log_full_commit(trans);
+- btrfs_abort_transaction(trans, ret);
+ mutex_unlock(&log_root_tree->log_mutex);
+ goto out_wake_log_root;
+ }
--- /dev/null
+From 8bb6898da6271d82d8e76d8088d66b971a7dcfa6 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Tue, 10 Jan 2023 14:56:35 +0000
+Subject: btrfs: fix directory logging due to race with concurrent index key deletion
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 8bb6898da6271d82d8e76d8088d66b971a7dcfa6 upstream.
+
+Sometimes we log a directory without holding its VFS lock, so while we are
+logging it, dir index entries may be added or removed. This typically
+happens when logging a dentry from a parent directory that points to a
+new directory, through log_new_dir_dentries(), or when while logging
+some other inode we also need to log its parent directories (through
+btrfs_log_all_parents()).
+
+This means that while we are at log_dir_items(), we may not find a dir
+index key we found before, because it was deleted in the meanwhile, so
+a call to btrfs_search_slot() may return 1 (key not found). In that case
+we return from log_dir_items() with a success value (the variable 'err'
+has a value of 0). This can lead to a few problems, especially in the case
+where the variable 'last_offset' has a value of (u64)-1 (and it's
+initialized to that when it was declared):
+
+1) By returning from log_dir_items() with success (0) and a value of
+ (u64)-1 for '*last_offset_ret', we end up not logging any other dir
+ index keys that follow the missing, just deleted, index key. The
+ (u64)-1 value makes log_directory_changes() not call log_dir_items()
+ again;
+
+2) Before returning with success (0), log_dir_items(), will log a dir
+ index range item covering a range from the last old dentry index
+ (stored in the variable 'last_old_dentry_offset') to the value of
+ 'last_offset'. If 'last_offset' has a value of (u64)-1, then it means
+ if the log is persisted and replayed after a power failure, it will
+ cause deletion of all the directory entries that have an index number
+ between last_old_dentry_offset + 1 and (u64)-1;
+
+3) We can end up returning from log_dir_items() with
+ ctx->last_dir_item_offset having a lower value than
+ inode->last_dir_index_offset, because the former is set to the current
+ key we are processing at process_dir_items_leaf(), and at the end of
+ log_directory_changes() we set inode->last_dir_index_offset to the
+ current value of ctx->last_dir_item_offset. So if for example a
+ deletion of a lower dir index key happened, we set
+ ctx->last_dir_item_offset to that index value, then if we return from
+ log_dir_items() because btrfs_search_slot() returned 1, we end up
+ returning from log_dir_items() with success (0) and then
+ log_directory_changes() sets inode->last_dir_index_offset to a lower
+ value than it had before.
+ This can result in unpredictable and unexpected behaviour when we
+ need to log again the directory in the same transaction, and can result
+ in ending up with a log tree leaf that has duplicated keys, as we do
+ batch insertions of dir index keys into a log tree.
+
+So fix this by making log_dir_items() move on to the next dir index key
+if it does not find the one it was looking for.
+
+Reported-by: David Arendt <admin@prnet.org>
+Link: https://lore.kernel.org/linux-btrfs/ae169fc6-f504-28f0-a098-6fa6a4dfb612@leemhuis.info/
+CC: stable@vger.kernel.org # 4.14+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/tree-log.c | 21 +++++++++++++++------
+ 1 file changed, 15 insertions(+), 6 deletions(-)
+
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -3888,17 +3888,26 @@ static noinline int log_dir_items(struct
+ btrfs_release_path(path);
+
+ /*
+- * Find the first key from this transaction again. See the note for
+- * log_new_dir_dentries, if we're logging a directory recursively we
+- * won't be holding its i_mutex, which means we can modify the directory
+- * while we're logging it. If we remove an entry between our first
+- * search and this search we'll not find the key again and can just
+- * bail.
++ * Find the first key from this transaction again or the one we were at
++ * in the loop below in case we had to reschedule. We may be logging the
++ * directory without holding its VFS lock, which happen when logging new
++ * dentries (through log_new_dir_dentries()) or in some cases when we
++ * need to log the parent directory of an inode. This means a dir index
++ * key might be deleted from the inode's root, and therefore we may not
++ * find it anymore. If we can't find it, just move to the next key. We
++ * can not bail out and ignore, because if we do that we will simply
++ * not log dir index keys that come after the one that was just deleted
++ * and we can end up logging a dir index range that ends at (u64)-1
++ * (@last_offset is initialized to that), resulting in removing dir
++ * entries we should not remove at log replay time.
+ */
+ search:
+ ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
++ if (ret > 0)
++ ret = btrfs_next_item(root, path);
+ if (ret < 0)
+ err = ret;
++ /* If ret is 1, there are no more keys in the inode's root. */
+ if (ret != 0)
+ goto done;
+
--- /dev/null
+From 1f55ee6d0901d915801618bda0af4e5b937e3db7 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Thu, 12 Jan 2023 14:17:20 +0000
+Subject: btrfs: fix invalid leaf access due to inline extent during lseek
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 1f55ee6d0901d915801618bda0af4e5b937e3db7 upstream.
+
+During lseek, for SEEK_DATA and SEEK_HOLE modes, we access the disk_bytenr
+of an extent without checking its type. However inline extents have their
+data starting the offset of the disk_bytenr field, so accessing that field
+when we have an inline extent can result in either of the following:
+
+1) Interpret the inline extent's data as a disk_bytenr value;
+
+2) In case the inline data is less than 8 bytes, we access part of some
+ other item in the leaf, or unused space in the leaf;
+
+3) In case the inline data is less than 8 bytes and the extent item is
+ the first item in the leaf, we can access beyond the leaf's limit.
+
+So fix this by not accessing the disk_bytenr field if we have an inline
+extent.
+
+Fixes: b6e833567ea1 ("btrfs: make hole and data seeking a lot more efficient")
+Reported-by: Matthias Schoepfer <matthias.schoepfer@googlemail.com>
+Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=216908
+Link: https://lore.kernel.org/linux-btrfs/7f25442f-b121-2a3a-5a3d-22bcaae83cd4@leemhuis.info/
+CC: stable@vger.kernel.org # 6.1
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/file.c | 13 ++++++++++---
+ 1 file changed, 10 insertions(+), 3 deletions(-)
+
+--- a/fs/btrfs/file.c
++++ b/fs/btrfs/file.c
+@@ -3838,6 +3838,7 @@ static loff_t find_desired_extent(struct
+ struct extent_buffer *leaf = path->nodes[0];
+ struct btrfs_file_extent_item *extent;
+ u64 extent_end;
++ u8 type;
+
+ if (path->slots[0] >= btrfs_header_nritems(leaf)) {
+ ret = btrfs_next_leaf(root, path);
+@@ -3892,10 +3893,16 @@ static loff_t find_desired_extent(struct
+
+ extent = btrfs_item_ptr(leaf, path->slots[0],
+ struct btrfs_file_extent_item);
++ type = btrfs_file_extent_type(leaf, extent);
+
+- if (btrfs_file_extent_disk_bytenr(leaf, extent) == 0 ||
+- btrfs_file_extent_type(leaf, extent) ==
+- BTRFS_FILE_EXTENT_PREALLOC) {
++ /*
++ * Can't access the extent's disk_bytenr field if this is an
++ * inline extent, since at that offset, it's where the extent
++ * data starts.
++ */
++ if (type == BTRFS_FILE_EXTENT_PREALLOC ||
++ (type == BTRFS_FILE_EXTENT_REG &&
++ btrfs_file_extent_disk_bytenr(leaf, extent) == 0)) {
+ /*
+ * Explicit hole or prealloc extent, search for delalloc.
+ * A prealloc extent is treated like a hole.
--- /dev/null
+From 6d3d970b2735b967650d319be27268fedc5598d1 Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Tue, 10 Jan 2023 14:56:34 +0000
+Subject: btrfs: fix missing error handling when logging directory items
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit 6d3d970b2735b967650d319be27268fedc5598d1 upstream.
+
+When logging a directory, at log_dir_items(), if we get an error when
+attempting to search the subvolume tree for a dir index item, we end up
+returning 0 (success) from log_dir_items() because 'err' is left with a
+value of 0.
+
+This can lead to a few problems, especially in the case the variable
+'last_offset' has a value of (u64)-1 (and it's initialized to that when
+it was declared):
+
+1) By returning from log_dir_items() with success (0) and a value of
+ (u64)-1 for '*last_offset_ret', we end up not logging any other dir
+ index keys that follow the missing, just deleted, index key. The
+ (u64)-1 value makes log_directory_changes() not call log_dir_items()
+ again;
+
+2) Before returning with success (0), log_dir_items(), will log a dir
+ index range item covering a range from the last old dentry index
+ (stored in the variable 'last_old_dentry_offset') to the value of
+ 'last_offset'. If 'last_offset' has a value of (u64)-1, then it means
+ if the log is persisted and replayed after a power failure, it will
+ cause deletion of all the directory entries that have an index number
+ between last_old_dentry_offset + 1 and (u64)-1;
+
+3) We can end up returning from log_dir_items() with
+ ctx->last_dir_item_offset having a lower value than
+ inode->last_dir_index_offset, because the former is set to the current
+ key we are processing at process_dir_items_leaf(), and at the end of
+ log_directory_changes() we set inode->last_dir_index_offset to the
+ current value of ctx->last_dir_item_offset. So if for example a
+ deletion of a lower dir index key happened, we set
+ ctx->last_dir_item_offset to that index value, then if we return from
+ log_dir_items() because btrfs_search_slot() returned an error, we end up
+ returning without any error from log_dir_items() and then
+ log_directory_changes() sets inode->last_dir_index_offset to a lower
+ value than it had before.
+ This can result in unpredictable and unexpected behaviour when we
+ need to log again the directory in the same transaction, and can result
+ in ending up with a log tree leaf that has duplicated keys, as we do
+ batch insertions of dir index keys into a log tree.
+
+Fix this by setting 'err' to the value of 'ret' in case
+btrfs_search_slot() or btrfs_previous_item() returned an error. That will
+result in falling back to a full transaction commit.
+
+Reported-by: David Arendt <admin@prnet.org>
+Link: https://lore.kernel.org/linux-btrfs/ae169fc6-f504-28f0-a098-6fa6a4dfb612@leemhuis.info/
+Fixes: e02119d5a7b4 ("Btrfs: Add a write ahead tree log to optimize synchronous operations")
+CC: stable@vger.kernel.org # 4.14+
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/tree-log.c | 9 +++++++++
+ 1 file changed, 9 insertions(+)
+
+--- a/fs/btrfs/tree-log.c
++++ b/fs/btrfs/tree-log.c
+@@ -3857,7 +3857,10 @@ static noinline int log_dir_items(struct
+ path->slots[0]);
+ if (tmp.type == BTRFS_DIR_INDEX_KEY)
+ last_old_dentry_offset = tmp.offset;
++ } else if (ret < 0) {
++ err = ret;
+ }
++
+ goto done;
+ }
+
+@@ -3877,7 +3880,11 @@ static noinline int log_dir_items(struct
+ */
+ if (tmp.type == BTRFS_DIR_INDEX_KEY)
+ last_old_dentry_offset = tmp.offset;
++ } else if (ret < 0) {
++ err = ret;
++ goto done;
+ }
++
+ btrfs_release_path(path);
+
+ /*
+@@ -3890,6 +3897,8 @@ static noinline int log_dir_items(struct
+ */
+ search:
+ ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0);
++ if (ret < 0)
++ err = ret;
+ if (ret != 0)
+ goto done;
+
--- /dev/null
+From b7adbf9ada3513d2092362c8eac5cddc5b651f5c Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Thu, 12 Jan 2023 16:31:08 +0000
+Subject: btrfs: fix race between quota rescan and disable leading to NULL pointer deref
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit b7adbf9ada3513d2092362c8eac5cddc5b651f5c upstream.
+
+If we have one task trying to start the quota rescan worker while another
+one is trying to disable quotas, we can end up hitting a race that results
+in the quota rescan worker doing a NULL pointer dereference. The steps for
+this are the following:
+
+1) Quotas are enabled;
+
+2) Task A calls the quota rescan ioctl and enters btrfs_qgroup_rescan().
+ It calls qgroup_rescan_init() which returns 0 (success) and then joins a
+ transaction and commits it;
+
+3) Task B calls the quota disable ioctl and enters btrfs_quota_disable().
+ It clears the bit BTRFS_FS_QUOTA_ENABLED from fs_info->flags and calls
+ btrfs_qgroup_wait_for_completion(), which returns immediately since the
+ rescan worker is not yet running.
+ Then it starts a transaction and locks fs_info->qgroup_ioctl_lock;
+
+4) Task A queues the rescan worker, by calling btrfs_queue_work();
+
+5) The rescan worker starts, and calls rescan_should_stop() at the start
+ of its while loop, which results in 0 iterations of the loop, since
+ the flag BTRFS_FS_QUOTA_ENABLED was cleared from fs_info->flags by
+ task B at step 3);
+
+6) Task B sets fs_info->quota_root to NULL;
+
+7) The rescan worker tries to start a transaction and uses
+ fs_info->quota_root as the root argument for btrfs_start_transaction().
+ This results in a NULL pointer dereference down the call chain of
+ btrfs_start_transaction(). The stack trace is something like the one
+ reported in Link tag below:
+
+ general protection fault, probably for non-canonical address 0xdffffc0000000041: 0000 [#1] PREEMPT SMP KASAN
+ KASAN: null-ptr-deref in range [0x0000000000000208-0x000000000000020f]
+ CPU: 1 PID: 34 Comm: kworker/u4:2 Not tainted 6.1.0-syzkaller-13872-gb6bb9676f216 #0
+ Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/26/2022
+ Workqueue: btrfs-qgroup-rescan btrfs_work_helper
+ RIP: 0010:start_transaction+0x48/0x10f0 fs/btrfs/transaction.c:564
+ Code: 48 89 fb 48 (...)
+ RSP: 0018:ffffc90000ab7ab0 EFLAGS: 00010206
+ RAX: 0000000000000041 RBX: 0000000000000208 RCX: ffff88801779ba80
+ RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000000
+ RBP: dffffc0000000000 R08: 0000000000000001 R09: fffff52000156f5d
+ R10: fffff52000156f5d R11: 1ffff92000156f5c R12: 0000000000000000
+ R13: 0000000000000001 R14: 0000000000000001 R15: 0000000000000003
+ FS: 0000000000000000(0000) GS:ffff8880b9900000(0000) knlGS:0000000000000000
+ CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+ CR2: 00007f2bea75b718 CR3: 000000001d0cc000 CR4: 00000000003506e0
+ DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+ DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+ Call Trace:
+ <TASK>
+ btrfs_qgroup_rescan_worker+0x3bb/0x6a0 fs/btrfs/qgroup.c:3402
+ btrfs_work_helper+0x312/0x850 fs/btrfs/async-thread.c:280
+ process_one_work+0x877/0xdb0 kernel/workqueue.c:2289
+ worker_thread+0xb14/0x1330 kernel/workqueue.c:2436
+ kthread+0x266/0x300 kernel/kthread.c:376
+ ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308
+ </TASK>
+ Modules linked in:
+
+So fix this by having the rescan worker function not attempt to start a
+transaction if it didn't do any rescan work.
+
+Reported-by: syzbot+96977faa68092ad382c4@syzkaller.appspotmail.com
+Link: https://lore.kernel.org/linux-btrfs/000000000000e5454b05f065a803@google.com/
+Fixes: e804861bd4e6 ("btrfs: fix deadlock between quota disable and qgroup rescan worker")
+CC: stable@vger.kernel.org # 5.4+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/qgroup.c | 25 +++++++++++++++++--------
+ 1 file changed, 17 insertions(+), 8 deletions(-)
+
+--- a/fs/btrfs/qgroup.c
++++ b/fs/btrfs/qgroup.c
+@@ -3348,6 +3348,7 @@ static void btrfs_qgroup_rescan_worker(s
+ int err = -ENOMEM;
+ int ret = 0;
+ bool stopped = false;
++ bool did_leaf_rescans = false;
+
+ path = btrfs_alloc_path();
+ if (!path)
+@@ -3368,6 +3369,7 @@ static void btrfs_qgroup_rescan_worker(s
+ }
+
+ err = qgroup_rescan_leaf(trans, path);
++ did_leaf_rescans = true;
+
+ if (err > 0)
+ btrfs_commit_transaction(trans);
+@@ -3388,16 +3390,23 @@ out:
+ mutex_unlock(&fs_info->qgroup_rescan_lock);
+
+ /*
+- * only update status, since the previous part has already updated the
+- * qgroup info.
++ * Only update status, since the previous part has already updated the
++ * qgroup info, and only if we did any actual work. This also prevents
++ * race with a concurrent quota disable, which has already set
++ * fs_info->quota_root to NULL and cleared BTRFS_FS_QUOTA_ENABLED at
++ * btrfs_quota_disable().
+ */
+- trans = btrfs_start_transaction(fs_info->quota_root, 1);
+- if (IS_ERR(trans)) {
+- err = PTR_ERR(trans);
++ if (did_leaf_rescans) {
++ trans = btrfs_start_transaction(fs_info->quota_root, 1);
++ if (IS_ERR(trans)) {
++ err = PTR_ERR(trans);
++ trans = NULL;
++ btrfs_err(fs_info,
++ "fail to start transaction for status update: %d",
++ err);
++ }
++ } else {
+ trans = NULL;
+- btrfs_err(fs_info,
+- "fail to start transaction for status update: %d",
+- err);
+ }
+
+ mutex_lock(&fs_info->qgroup_rescan_lock);
--- /dev/null
+From 75181406b4eafacc531ff2ee5fb032bd93317e2b Mon Sep 17 00:00:00 2001
+From: Qu Wenruo <wqu@suse.com>
+Date: Tue, 10 Jan 2023 15:14:17 +0800
+Subject: btrfs: qgroup: do not warn on record without old_roots populated
+
+From: Qu Wenruo <wqu@suse.com>
+
+commit 75181406b4eafacc531ff2ee5fb032bd93317e2b upstream.
+
+[BUG]
+There are some reports from the mailing list that since v6.1 kernel, the
+WARN_ON() inside btrfs_qgroup_account_extent() gets triggered during
+rescan:
+
+ WARNING: CPU: 3 PID: 6424 at fs/btrfs/qgroup.c:2756 btrfs_qgroup_account_extents+0x1ae/0x260 [btrfs]
+ CPU: 3 PID: 6424 Comm: snapperd Tainted: P OE 6.1.2-1-default #1 openSUSE Tumbleweed 05c7a1b1b61d5627475528f71f50444637b5aad7
+ RIP: 0010:btrfs_qgroup_account_extents+0x1ae/0x260 [btrfs]
+ Call Trace:
+ <TASK>
+ btrfs_commit_transaction+0x30c/0xb40 [btrfs c39c9c546c241c593f03bd6d5f39ea1b676250f6]
+ ? start_transaction+0xc3/0x5b0 [btrfs c39c9c546c241c593f03bd6d5f39ea1b676250f6]
+ btrfs_qgroup_rescan+0x42/0xc0 [btrfs c39c9c546c241c593f03bd6d5f39ea1b676250f6]
+ btrfs_ioctl+0x1ab9/0x25c0 [btrfs c39c9c546c241c593f03bd6d5f39ea1b676250f6]
+ ? __rseq_handle_notify_resume+0xa9/0x4a0
+ ? mntput_no_expire+0x4a/0x240
+ ? __seccomp_filter+0x319/0x4d0
+ __x64_sys_ioctl+0x90/0xd0
+ do_syscall_64+0x5b/0x80
+ ? syscall_exit_to_user_mode+0x17/0x40
+ ? do_syscall_64+0x67/0x80
+ entry_SYSCALL_64_after_hwframe+0x63/0xcd
+ RIP: 0033:0x7fd9b790d9bf
+ </TASK>
+
+[CAUSE]
+Since commit e15e9f43c7ca ("btrfs: introduce
+BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING to skip qgroup accounting"), if
+our qgroup is already in inconsistent state, we will no longer do the
+time-consuming backref walk.
+
+This can leave some qgroup records without a valid old_roots ulist.
+Normally this is fine, as btrfs_qgroup_account_extents() would also skip
+those records if we have NO_ACCOUNTING flag set.
+
+But there is a small window, if we have NO_ACCOUNTING flag set, and
+inserted some qgroup_record without an old_roots ulist, but then the user
+triggered a qgroup rescan.
+
+During btrfs_qgroup_rescan(), we firstly clear NO_ACCOUNTING flag, then
+commit current transaction.
+
+And since we have a qgroup_record with old_roots = NULL, we trigger the
+WARN_ON() during btrfs_qgroup_account_extents().
+
+[FIX]
+Unfortunately due to the introduction of NO_ACCOUNTING flag, the
+assumption that every qgroup_record would have its old_roots populated
+is no longer correct.
+
+Fix the false alerts and drop the WARN_ON().
+
+Reported-by: Lukas Straub <lukasstraub2@web.de>
+Reported-by: HanatoK <summersnow9403@gmail.com>
+Fixes: e15e9f43c7ca ("btrfs: introduce BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING to skip qgroup accounting")
+CC: stable@vger.kernel.org # 6.1
+Link: https://lore.kernel.org/linux-btrfs/2403c697-ddaf-58ad-3829-0335fc89df09@gmail.com/
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/qgroup.c | 14 ++++++++++++--
+ 1 file changed, 12 insertions(+), 2 deletions(-)
+
+--- a/fs/btrfs/qgroup.c
++++ b/fs/btrfs/qgroup.c
+@@ -2751,9 +2751,19 @@ int btrfs_qgroup_account_extents(struct
+ BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) {
+ /*
+ * Old roots should be searched when inserting qgroup
+- * extent record
++ * extent record.
++ *
++ * But for INCONSISTENT (NO_ACCOUNTING) -> rescan case,
++ * we may have some record inserted during
++ * NO_ACCOUNTING (thus no old_roots populated), but
++ * later we start rescan, which clears NO_ACCOUNTING,
++ * leaving some inserted records without old_roots
++ * populated.
++ *
++ * Those cases are rare and should not cause too much
++ * time spent during commit_transaction().
+ */
+- if (WARN_ON(!record->old_roots)) {
++ if (!record->old_roots) {
+ /* Search commit root to find old_roots */
+ ret = btrfs_find_all_roots(NULL, fs_info,
+ record->bytenr, 0,
--- /dev/null
+From 30b2b2196d6e4cc24cbec633535a2404f258ce69 Mon Sep 17 00:00:00 2001
+From: Enzo Matsumiya <ematsumiya@suse.de>
+Date: Wed, 18 Jan 2023 14:06:57 -0300
+Subject: cifs: do not include page data when checking signature
+
+From: Enzo Matsumiya <ematsumiya@suse.de>
+
+commit 30b2b2196d6e4cc24cbec633535a2404f258ce69 upstream.
+
+On async reads, page data is allocated before sending. When the
+response is received but it has no data to fill (e.g.
+STATUS_END_OF_FILE), __calc_signature() will still include the pages in
+its computation, leading to an invalid signature check.
+
+This patch fixes this by not setting the async read smb_rqst page data
+(zeroed by default) if its got_bytes is 0.
+
+This can be reproduced/verified with xfstests generic/465.
+
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Enzo Matsumiya <ematsumiya@suse.de>
+Reviewed-by: Paulo Alcantara (SUSE) <pc@cjr.nz>
+Signed-off-by: Steve French <stfrench@microsoft.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/cifs/smb2pdu.c | 15 +++++++++------
+ 1 file changed, 9 insertions(+), 6 deletions(-)
+
+--- a/fs/cifs/smb2pdu.c
++++ b/fs/cifs/smb2pdu.c
+@@ -4162,12 +4162,15 @@ smb2_readv_callback(struct mid_q_entry *
+ (struct smb2_hdr *)rdata->iov[0].iov_base;
+ struct cifs_credits credits = { .value = 0, .instance = 0 };
+ struct smb_rqst rqst = { .rq_iov = &rdata->iov[1],
+- .rq_nvec = 1,
+- .rq_pages = rdata->pages,
+- .rq_offset = rdata->page_offset,
+- .rq_npages = rdata->nr_pages,
+- .rq_pagesz = rdata->pagesz,
+- .rq_tailsz = rdata->tailsz };
++ .rq_nvec = 1, };
++
++ if (rdata->got_bytes) {
++ rqst.rq_pages = rdata->pages;
++ rqst.rq_offset = rdata->page_offset;
++ rqst.rq_npages = rdata->nr_pages;
++ rqst.rq_pagesz = rdata->pagesz;
++ rqst.rq_tailsz = rdata->tailsz;
++ }
+
+ WARN_ONCE(rdata->server != mid->server,
+ "rdata server %p != mid server %p",
--- /dev/null
+From 0e678153f5be7e6c8d28835f5a678618da4b7a9c Mon Sep 17 00:00:00 2001
+From: David Hildenbrand <david@redhat.com>
+Date: Thu, 22 Dec 2022 21:55:10 +0100
+Subject: mm/hugetlb: fix PTE marker handling in hugetlb_change_protection()
+
+From: David Hildenbrand <david@redhat.com>
+
+commit 0e678153f5be7e6c8d28835f5a678618da4b7a9c upstream.
+
+Patch series "mm/hugetlb: uffd-wp fixes for hugetlb_change_protection()".
+
+Playing with virtio-mem and background snapshots (using uffd-wp) on
+hugetlb in QEMU, I managed to trigger a VM_BUG_ON(). Looking into the
+details, hugetlb_change_protection() seems to not handle uffd-wp correctly
+in all cases.
+
+Patch #1 fixes my test case. I don't have reproducers for patch #2, as it
+requires running into migration entries.
+
+I did not yet check in detail yet if !hugetlb code requires similar care.
+
+
+This patch (of 2):
+
+There are two problematic cases when stumbling over a PTE marker in
+hugetlb_change_protection():
+
+(1) We protect an uffd-wp PTE marker a second time using uffd-wp: we will
+ end up in the "!huge_pte_none(pte)" case and mess up the PTE marker.
+
+(2) We unprotect a uffd-wp PTE marker: we will similarly end up in the
+ "!huge_pte_none(pte)" case even though we cleared the PTE, because
+ the "pte" variable is stale. We'll mess up the PTE marker.
+
+For example, if we later stumble over such a "wrongly modified" PTE marker,
+we'll treat it like a present PTE that maps some garbage page.
+
+This can, for example, be triggered by mapping a memfd backed by huge
+pages, registering uffd-wp, uffd-wp'ing an unmapped page and (a)
+uffd-wp'ing it a second time; or (b) uffd-unprotecting it; or (c)
+unregistering uffd-wp. Then, if we trigger fallocate(FALLOC_FL_PUNCH_HOLE)
+on that file range, we will run into a VM_BUG_ON:
+
+[ 195.039560] page:00000000ba1f2987 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x0
+[ 195.039565] flags: 0x7ffffc0001000(reserved|node=0|zone=0|lastcpupid=0x1fffff)
+[ 195.039568] raw: 0007ffffc0001000 ffffe742c0000008 ffffe742c0000008 0000000000000000
+[ 195.039569] raw: 0000000000000000 0000000000000000 00000001ffffffff 0000000000000000
+[ 195.039569] page dumped because: VM_BUG_ON_PAGE(compound && !PageHead(page))
+[ 195.039573] ------------[ cut here ]------------
+[ 195.039574] kernel BUG at mm/rmap.c:1346!
+[ 195.039579] invalid opcode: 0000 [#1] PREEMPT SMP NOPTI
+[ 195.039581] CPU: 7 PID: 4777 Comm: qemu-system-x86 Not tainted 6.0.12-200.fc36.x86_64 #1
+[ 195.039583] Hardware name: LENOVO 20WNS1F81N/20WNS1F81N, BIOS N35ET50W (1.50 ) 09/15/2022
+[ 195.039584] RIP: 0010:page_remove_rmap+0x45b/0x550
+[ 195.039588] Code: [...]
+[ 195.039589] RSP: 0018:ffffbc03c3633ba8 EFLAGS: 00010292
+[ 195.039591] RAX: 0000000000000040 RBX: ffffe742c0000000 RCX: 0000000000000000
+[ 195.039592] RDX: 0000000000000002 RSI: ffffffff8e7aac1a RDI: 00000000ffffffff
+[ 195.039592] RBP: 0000000000000001 R08: 0000000000000000 R09: ffffbc03c3633a08
+[ 195.039593] R10: 0000000000000003 R11: ffffffff8f146328 R12: ffff9b04c42754b0
+[ 195.039594] R13: ffffffff8fcc6328 R14: ffffbc03c3633c80 R15: ffff9b0484ab9100
+[ 195.039595] FS: 00007fc7aaf68640(0000) GS:ffff9b0bbf7c0000(0000) knlGS:0000000000000000
+[ 195.039596] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[ 195.039597] CR2: 000055d402c49110 CR3: 0000000159392003 CR4: 0000000000772ee0
+[ 195.039598] PKRU: 55555554
+[ 195.039599] Call Trace:
+[ 195.039600] <TASK>
+[ 195.039602] __unmap_hugepage_range+0x33b/0x7d0
+[ 195.039605] unmap_hugepage_range+0x55/0x70
+[ 195.039608] hugetlb_vmdelete_list+0x77/0xa0
+[ 195.039611] hugetlbfs_fallocate+0x410/0x550
+[ 195.039612] ? _raw_spin_unlock_irqrestore+0x23/0x40
+[ 195.039616] vfs_fallocate+0x12e/0x360
+[ 195.039618] __x64_sys_fallocate+0x40/0x70
+[ 195.039620] do_syscall_64+0x58/0x80
+[ 195.039623] ? syscall_exit_to_user_mode+0x17/0x40
+[ 195.039624] ? do_syscall_64+0x67/0x80
+[ 195.039626] entry_SYSCALL_64_after_hwframe+0x63/0xcd
+[ 195.039628] RIP: 0033:0x7fc7b590651f
+[ 195.039653] Code: [...]
+[ 195.039654] RSP: 002b:00007fc7aaf66e70 EFLAGS: 00000293 ORIG_RAX: 000000000000011d
+[ 195.039655] RAX: ffffffffffffffda RBX: 0000558ef4b7f370 RCX: 00007fc7b590651f
+[ 195.039656] RDX: 0000000018000000 RSI: 0000000000000003 RDI: 000000000000000c
+[ 195.039657] RBP: 0000000008000000 R08: 0000000000000000 R09: 0000000000000073
+[ 195.039658] R10: 0000000008000000 R11: 0000000000000293 R12: 0000000018000000
+[ 195.039658] R13: 00007fb8bbe00000 R14: 000000000000000c R15: 0000000000001000
+[ 195.039661] </TASK>
+
+Fix it by not going into the "!huge_pte_none(pte)" case if we stumble over
+an exclusive marker. spin_unlock() + continue would get the job done.
+
+However, instead, make it clearer that there are no fall-through
+statements: we process each case (hwpoison, migration, marker, !none,
+none) and then unlock the page table to continue with the next PTE. Let's
+avoid "continue" statements and use a single spin_unlock() at the end.
+
+Link: https://lkml.kernel.org/r/20221222205511.675832-1-david@redhat.com
+Link: https://lkml.kernel.org/r/20221222205511.675832-2-david@redhat.com
+Fixes: 60dfaad65aa9 ("mm/hugetlb: allow uffd wr-protect none ptes")
+Signed-off-by: David Hildenbrand <david@redhat.com>
+Reviewed-by: Peter Xu <peterx@redhat.com>
+Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
+Cc: Miaohe Lin <linmiaohe@huawei.com>
+Cc: Muchun Song <muchun.song@linux.dev>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/hugetlb.c | 21 +++++++--------------
+ 1 file changed, 7 insertions(+), 14 deletions(-)
+
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -6623,10 +6623,8 @@ unsigned long hugetlb_change_protection(
+ }
+ pte = huge_ptep_get(ptep);
+ if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
+- spin_unlock(ptl);
+- continue;
+- }
+- if (unlikely(is_hugetlb_entry_migration(pte))) {
++ /* Nothing to do. */
++ } else if (unlikely(is_hugetlb_entry_migration(pte))) {
+ swp_entry_t entry = pte_to_swp_entry(pte);
+ struct page *page = pfn_swap_entry_to_page(entry);
+
+@@ -6647,18 +6645,13 @@ unsigned long hugetlb_change_protection(
+ set_huge_pte_at(mm, address, ptep, newpte);
+ pages++;
+ }
+- spin_unlock(ptl);
+- continue;
+- }
+- if (unlikely(pte_marker_uffd_wp(pte))) {
+- /*
+- * This is changing a non-present pte into a none pte,
+- * no need for huge_ptep_modify_prot_start/commit().
+- */
++ } else if (unlikely(is_pte_marker(pte))) {
++ /* No other markers apply for now. */
++ WARN_ON_ONCE(!pte_marker_uffd_wp(pte));
+ if (uffd_wp_resolve)
++ /* Safe to modify directly (non-present->none). */
+ huge_pte_clear(mm, address, ptep, psize);
+- }
+- if (!huge_pte_none(pte)) {
++ } else if (!huge_pte_none(pte)) {
+ pte_t old_pte;
+ unsigned int shift = huge_page_shift(hstate_vma(vma));
+
--- /dev/null
+From 44f86392bdd165da7e43d3c772aeb1e128ffd6c8 Mon Sep 17 00:00:00 2001
+From: David Hildenbrand <david@redhat.com>
+Date: Thu, 22 Dec 2022 21:55:11 +0100
+Subject: mm/hugetlb: fix uffd-wp handling for migration entries in hugetlb_change_protection()
+
+From: David Hildenbrand <david@redhat.com>
+
+commit 44f86392bdd165da7e43d3c772aeb1e128ffd6c8 upstream.
+
+We have to update the uffd-wp SWP PTE bit independent of the type of
+migration entry. Currently, if we're unlucky and we want to install/clear
+the uffd-wp bit just while we're migrating a read-only mapped hugetlb
+page, we would miss to set/clear the uffd-wp bit.
+
+Further, if we're processing a readable-exclusive migration entry and
+neither want to set or clear the uffd-wp bit, we could currently end up
+losing the uffd-wp bit. Note that the same would hold for writable
+migrating entries, however, having a writable migration entry with the
+uffd-wp bit set would already mean that something went wrong.
+
+Note that the change from !is_readable_migration_entry ->
+writable_migration_entry is harmless and actually cleaner, as raised by
+Miaohe Lin and discussed in [1].
+
+[1] https://lkml.kernel.org/r/90dd6a93-4500-e0de-2bf0-bf522c311b0c@huawei.com
+
+Link: https://lkml.kernel.org/r/20221222205511.675832-3-david@redhat.com
+Fixes: 60dfaad65aa9 ("mm/hugetlb: allow uffd wr-protect none ptes")
+Signed-off-by: David Hildenbrand <david@redhat.com>
+Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
+Cc: Miaohe Lin <linmiaohe@huawei.com>
+Cc: Muchun Song <muchun.song@linux.dev>
+Cc: Peter Xu <peterx@redhat.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/hugetlb.c | 17 +++++++++--------
+ 1 file changed, 9 insertions(+), 8 deletions(-)
+
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -6627,10 +6627,9 @@ unsigned long hugetlb_change_protection(
+ } else if (unlikely(is_hugetlb_entry_migration(pte))) {
+ swp_entry_t entry = pte_to_swp_entry(pte);
+ struct page *page = pfn_swap_entry_to_page(entry);
++ pte_t newpte = pte;
+
+- if (!is_readable_migration_entry(entry)) {
+- pte_t newpte;
+-
++ if (is_writable_migration_entry(entry)) {
+ if (PageAnon(page))
+ entry = make_readable_exclusive_migration_entry(
+ swp_offset(entry));
+@@ -6638,13 +6637,15 @@ unsigned long hugetlb_change_protection(
+ entry = make_readable_migration_entry(
+ swp_offset(entry));
+ newpte = swp_entry_to_pte(entry);
+- if (uffd_wp)
+- newpte = pte_swp_mkuffd_wp(newpte);
+- else if (uffd_wp_resolve)
+- newpte = pte_swp_clear_uffd_wp(newpte);
+- set_huge_pte_at(mm, address, ptep, newpte);
+ pages++;
+ }
++
++ if (uffd_wp)
++ newpte = pte_swp_mkuffd_wp(newpte);
++ else if (uffd_wp_resolve)
++ newpte = pte_swp_clear_uffd_wp(newpte);
++ if (!pte_same(pte, newpte))
++ set_huge_pte_at(mm, address, ptep, newpte);
+ } else if (unlikely(is_pte_marker(pte))) {
+ /* No other markers apply for now. */
+ WARN_ON_ONCE(!pte_marker_uffd_wp(pte));
--- /dev/null
+From fed15f1345dc8a7fc8baa81e8b55c3ba010d7f4b Mon Sep 17 00:00:00 2001
+From: Peter Xu <peterx@redhat.com>
+Date: Wed, 4 Jan 2023 17:52:05 -0500
+Subject: mm/hugetlb: pre-allocate pgtable pages for uffd wr-protects
+
+From: Peter Xu <peterx@redhat.com>
+
+commit fed15f1345dc8a7fc8baa81e8b55c3ba010d7f4b upstream.
+
+Userfaultfd-wp uses pte markers to mark wr-protected pages for both shmem
+and hugetlb. Shmem has pre-allocation ready for markers, but hugetlb path
+was overlooked.
+
+Doing so by calling huge_pte_alloc() if the initial pgtable walk fails to
+find the huge ptep. It's possible that huge_pte_alloc() can fail with
+high memory pressure, in that case stop the loop immediately and fail
+silently. This is not the most ideal solution but it matches with what we
+do with shmem meanwhile it avoids the splat in dmesg.
+
+Link: https://lkml.kernel.org/r/20230104225207.1066932-2-peterx@redhat.com
+Fixes: 60dfaad65aa9 ("mm/hugetlb: allow uffd wr-protect none ptes")
+Signed-off-by: Peter Xu <peterx@redhat.com>
+Reported-by: James Houghton <jthoughton@google.com>
+Reviewed-by: Mike Kravetz <mike.kravetz@oracle.com>
+Acked-by: David Hildenbrand <david@redhat.com>
+Acked-by: James Houghton <jthoughton@google.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Axel Rasmussen <axelrasmussen@google.com>
+Cc: Muchun Song <songmuchun@bytedance.com>
+Cc: Nadav Amit <nadav.amit@gmail.com>
+Cc: <stable@vger.kernel.org> [5.19+]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/hugetlb.c | 13 +++++++++++--
+ 1 file changed, 11 insertions(+), 2 deletions(-)
+
+--- a/mm/hugetlb.c
++++ b/mm/hugetlb.c
+@@ -6604,8 +6604,17 @@ unsigned long hugetlb_change_protection(
+ spinlock_t *ptl;
+ ptep = huge_pte_offset(mm, address, psize);
+ if (!ptep) {
+- address |= last_addr_mask;
+- continue;
++ if (!uffd_wp) {
++ address |= last_addr_mask;
++ continue;
++ }
++ /*
++ * Userfaultfd wr-protect requires pgtable
++ * pre-allocations to install pte markers.
++ */
++ ptep = huge_pte_alloc(mm, vma, address, psize);
++ if (!ptep)
++ break;
+ }
+ ptl = huge_pte_lock(h, mm, ptep);
+ if (huge_pmd_unshare(mm, vma, address, ptep)) {
--- /dev/null
+From 52dc031088f00e323140ece4004e70c33153c6dd Mon Sep 17 00:00:00 2001
+From: Zach O'Keefe <zokeefe@google.com>
+Date: Sat, 24 Dec 2022 00:20:34 -0800
+Subject: mm/MADV_COLLAPSE: don't expand collapse when vm_end is past requested end
+
+From: Zach O'Keefe <zokeefe@google.com>
+
+commit 52dc031088f00e323140ece4004e70c33153c6dd upstream.
+
+MADV_COLLAPSE acts on one hugepage-aligned/sized region at a time, until
+it has collapsed all eligible memory contained within the bounds supplied
+by the user.
+
+At the top of each hugepage iteration we (re)lock mmap_lock and
+(re)validate the VMA for eligibility and update variables that might have
+changed while mmap_lock was dropped. One thing that might occur is that
+the VMA could be resized, and as such, we refetch vma->vm_end to make sure
+we don't collapse past the end of the VMA's new end.
+
+However, it's possible that when refetching vma->vm_end that we expand the
+region acted on by MADV_COLLAPSE if vma->vm_end is greater than size+len
+supplied by the user.
+
+The consequence here is that we may attempt to collapse more memory than
+requested, possibly yielding either "too much success" or "false failure"
+user-visible results. An example of the former is if we MADV_COLLAPSE the
+first 4MiB of a 2TiB mmap()'d file, the incorrect refetch would cause the
+operation to block for much longer than anticipated as we attempt to
+collapse the entire TiB region. An example of the latter is that applying
+MADV_COLLAPSE to a 4MiB file mapped to the start of a 6MiB VMA will
+successfully collapse the first 4MiB, then incorrectly attempt to collapse
+the last hugepage-aligned/sized region -- fail (since readahead/page cache
+lookup will fail) -- and report a failure to the user.
+
+I don't believe there is a kernel stability concern here as we always
+(re)validate the VMA / region accordingly. Also as Hugh mentions, the
+user-visible effects are: we try to collapse more memory than requested
+by the user, and/or failing an operation that should have otherwise
+succeeded. An example is trying to collapse a 4MiB file contained
+within a 12MiB VMA.
+
+Don't expand the acted-on region when refetching vma->vm_end.
+
+Link: https://lkml.kernel.org/r/20221224082035.3197140-1-zokeefe@google.com
+Fixes: 4d24de9425f7 ("mm: MADV_COLLAPSE: refetch vm_end after reacquiring mmap_lock")
+Signed-off-by: Zach O'Keefe <zokeefe@google.com>
+Reported-by: Hugh Dickins <hughd@google.com>
+Cc: Yang Shi <shy828301@gmail.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/khugepaged.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/mm/khugepaged.c
++++ b/mm/khugepaged.c
+@@ -2644,7 +2644,7 @@ int madvise_collapse(struct vm_area_stru
+ goto out_nolock;
+ }
+
+- hend = vma->vm_end & HPAGE_PMD_MASK;
++ hend = min(hend, vma->vm_end & HPAGE_PMD_MASK);
+ }
+ mmap_assert_locked(mm);
+ memset(cc->node_load, 0, sizeof(cc->node_load));
--- /dev/null
+From 51d3d5eb74ff53b92dcff48b30ae2ed8edd85a32 Mon Sep 17 00:00:00 2001
+From: David Hildenbrand <david@redhat.com>
+Date: Fri, 9 Dec 2022 09:09:12 +0100
+Subject: mm/userfaultfd: enable writenotify while userfaultfd-wp is enabled for a VMA
+
+From: David Hildenbrand <david@redhat.com>
+
+commit 51d3d5eb74ff53b92dcff48b30ae2ed8edd85a32 upstream.
+
+Currently, we don't enable writenotify when enabling userfaultfd-wp on a
+shared writable mapping (for now only shmem and hugetlb). The consequence
+is that vma->vm_page_prot will still include write permissions, to be set
+as default for all PTEs that get remapped (e.g., mprotect(), NUMA hinting,
+page migration, ...).
+
+So far, vma->vm_page_prot is assumed to be a safe default, meaning that we
+only add permissions (e.g., mkwrite) but not remove permissions (e.g.,
+wrprotect). For example, when enabling softdirty tracking, we enable
+writenotify. With uffd-wp on shared mappings, that changed. More details
+on vma->vm_page_prot semantics were summarized in [1].
+
+This is problematic for uffd-wp: we'd have to manually check for a uffd-wp
+PTEs/PMDs and manually write-protect PTEs/PMDs, which is error prone.
+Prone to such issues is any code that uses vma->vm_page_prot to set PTE
+permissions: primarily pte_modify() and mk_pte().
+
+Instead, let's enable writenotify such that PTEs/PMDs/... will be mapped
+write-protected as default and we will only allow selected PTEs that are
+definitely safe to be mapped without write-protection (see
+can_change_pte_writable()) to be writable. In the future, we might want
+to enable write-bit recovery -- e.g., can_change_pte_writable() -- at more
+locations, for example, also when removing uffd-wp protection.
+
+This fixes two known cases:
+
+(a) remove_migration_pte() mapping uffd-wp'ed PTEs writable, resulting
+ in uffd-wp not triggering on write access.
+(b) do_numa_page() / do_huge_pmd_numa_page() mapping uffd-wp'ed PTEs/PMDs
+ writable, resulting in uffd-wp not triggering on write access.
+
+Note that do_numa_page() / do_huge_pmd_numa_page() can be reached even
+without NUMA hinting (which currently doesn't seem to be applicable to
+shmem), for example, by using uffd-wp with a PROT_WRITE shmem VMA. On
+such a VMA, userfaultfd-wp is currently non-functional.
+
+Note that when enabling userfaultfd-wp, there is no need to walk page
+tables to enforce the new default protection for the PTEs: we know that
+they cannot be uffd-wp'ed yet, because that can only happen after enabling
+uffd-wp for the VMA in general.
+
+Also note that this makes mprotect() on ranges with uffd-wp'ed PTEs not
+accidentally set the write bit -- which would result in uffd-wp not
+triggering on later write access. This commit makes uffd-wp on shmem
+behave just like uffd-wp on anonymous memory in that regard, even though,
+mixing mprotect with uffd-wp is controversial.
+
+[1] https://lkml.kernel.org/r/92173bad-caa3-6b43-9d1e-9a471fdbc184@redhat.com
+
+Link: https://lkml.kernel.org/r/20221209080912.7968-1-david@redhat.com
+Fixes: b1f9e876862d ("mm/uffd: enable write protection for shmem & hugetlbfs")
+Signed-off-by: David Hildenbrand <david@redhat.com>
+Reported-by: Ives van Hoorne <ives@codesandbox.io>
+Debugged-by: Peter Xu <peterx@redhat.com>
+Acked-by: Peter Xu <peterx@redhat.com>
+Cc: Hugh Dickins <hughd@google.com>
+Cc: Alistair Popple <apopple@nvidia.com>
+Cc: Mike Rapoport <rppt@linux.vnet.ibm.com>
+Cc: Nadav Amit <nadav.amit@gmail.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/userfaultfd.c | 28 ++++++++++++++++++++++------
+ mm/mmap.c | 4 ++++
+ 2 files changed, 26 insertions(+), 6 deletions(-)
+
+--- a/fs/userfaultfd.c
++++ b/fs/userfaultfd.c
+@@ -108,6 +108,21 @@ static bool userfaultfd_is_initialized(s
+ return ctx->features & UFFD_FEATURE_INITIALIZED;
+ }
+
++static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
++ vm_flags_t flags)
++{
++ const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP;
++
++ vma->vm_flags = flags;
++ /*
++ * For shared mappings, we want to enable writenotify while
++ * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
++ * recalculate vma->vm_page_prot whenever userfaultfd-wp changes.
++ */
++ if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed)
++ vma_set_page_prot(vma);
++}
++
+ static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
+ int wake_flags, void *key)
+ {
+@@ -618,7 +633,8 @@ static void userfaultfd_event_wait_compl
+ for_each_vma(vmi, vma) {
+ if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
+ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+- vma->vm_flags &= ~__VM_UFFD_FLAGS;
++ userfaultfd_set_vm_flags(vma,
++ vma->vm_flags & ~__VM_UFFD_FLAGS);
+ }
+ }
+ mmap_write_unlock(mm);
+@@ -652,7 +668,7 @@ int dup_userfaultfd(struct vm_area_struc
+ octx = vma->vm_userfaultfd_ctx.ctx;
+ if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
+ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+- vma->vm_flags &= ~__VM_UFFD_FLAGS;
++ userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
+ return 0;
+ }
+
+@@ -733,7 +749,7 @@ void mremap_userfaultfd_prep(struct vm_a
+ } else {
+ /* Drop uffd context if remap feature not enabled */
+ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+- vma->vm_flags &= ~__VM_UFFD_FLAGS;
++ userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
+ }
+ }
+
+@@ -895,7 +911,7 @@ static int userfaultfd_release(struct in
+ prev = vma;
+ }
+
+- vma->vm_flags = new_flags;
++ userfaultfd_set_vm_flags(vma, new_flags);
+ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+ }
+ mmap_write_unlock(mm);
+@@ -1463,7 +1479,7 @@ static int userfaultfd_register(struct u
+ * the next vma was merged into the current one and
+ * the current one has not been updated yet.
+ */
+- vma->vm_flags = new_flags;
++ userfaultfd_set_vm_flags(vma, new_flags);
+ vma->vm_userfaultfd_ctx.ctx = ctx;
+
+ if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
+@@ -1651,7 +1667,7 @@ static int userfaultfd_unregister(struct
+ * the next vma was merged into the current one and
+ * the current one has not been updated yet.
+ */
+- vma->vm_flags = new_flags;
++ userfaultfd_set_vm_flags(vma, new_flags);
+ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+
+ skip:
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -1524,6 +1524,10 @@ int vma_wants_writenotify(struct vm_area
+ if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma))
+ return 1;
+
++ /* Do we need write faults for uffd-wp tracking? */
++ if (userfaultfd_wp(vma))
++ return 1;
++
+ /* Specialty mapping? */
+ if (vm_flags & VM_PFNMAP)
+ return 0;
--- /dev/null
+From 1e336aa0c0250ec84c6f16efac40c9f0138e367d Mon Sep 17 00:00:00 2001
+From: Haibo Chen <haibo.chen@nxp.com>
+Date: Wed, 7 Dec 2022 19:23:15 +0800
+Subject: mmc: sdhci-esdhc-imx: correct the tuning start tap and step setting
+
+From: Haibo Chen <haibo.chen@nxp.com>
+
+commit 1e336aa0c0250ec84c6f16efac40c9f0138e367d upstream.
+
+Current code logic may be impacted by the setting of ROM/Bootloader,
+so unmask these bits first, then set these bits accordingly.
+
+Fixes: 2b16cf326b70 ("mmc: sdhci-esdhc-imx: move tuning static configuration into hwinit function")
+Signed-off-by: Haibo Chen <haibo.chen@nxp.com>
+Acked-by: Adrian Hunter <adrian.hunter@intel.com>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20221207112315.1812222-1-haibo.chen@nxp.com
+Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/mmc/host/sdhci-esdhc-imx.c | 22 +++++++++++++++-------
+ 1 file changed, 15 insertions(+), 7 deletions(-)
+
+--- a/drivers/mmc/host/sdhci-esdhc-imx.c
++++ b/drivers/mmc/host/sdhci-esdhc-imx.c
+@@ -107,6 +107,7 @@
+ #define ESDHC_TUNING_START_TAP_DEFAULT 0x1
+ #define ESDHC_TUNING_START_TAP_MASK 0x7f
+ #define ESDHC_TUNING_CMD_CRC_CHECK_DISABLE (1 << 7)
++#define ESDHC_TUNING_STEP_DEFAULT 0x1
+ #define ESDHC_TUNING_STEP_MASK 0x00070000
+ #define ESDHC_TUNING_STEP_SHIFT 16
+
+@@ -1361,7 +1362,7 @@ static void sdhci_esdhc_imx_hwinit(struc
+ struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
+ struct pltfm_imx_data *imx_data = sdhci_pltfm_priv(pltfm_host);
+ struct cqhci_host *cq_host = host->mmc->cqe_private;
+- int tmp;
++ u32 tmp;
+
+ if (esdhc_is_usdhc(imx_data)) {
+ /*
+@@ -1416,17 +1417,24 @@ static void sdhci_esdhc_imx_hwinit(struc
+
+ if (imx_data->socdata->flags & ESDHC_FLAG_STD_TUNING) {
+ tmp = readl(host->ioaddr + ESDHC_TUNING_CTRL);
+- tmp |= ESDHC_STD_TUNING_EN |
+- ESDHC_TUNING_START_TAP_DEFAULT;
+- if (imx_data->boarddata.tuning_start_tap) {
+- tmp &= ~ESDHC_TUNING_START_TAP_MASK;
++ tmp |= ESDHC_STD_TUNING_EN;
++
++ /*
++ * ROM code or bootloader may config the start tap
++ * and step, unmask them first.
++ */
++ tmp &= ~(ESDHC_TUNING_START_TAP_MASK | ESDHC_TUNING_STEP_MASK);
++ if (imx_data->boarddata.tuning_start_tap)
+ tmp |= imx_data->boarddata.tuning_start_tap;
+- }
++ else
++ tmp |= ESDHC_TUNING_START_TAP_DEFAULT;
+
+ if (imx_data->boarddata.tuning_step) {
+- tmp &= ~ESDHC_TUNING_STEP_MASK;
+ tmp |= imx_data->boarddata.tuning_step
+ << ESDHC_TUNING_STEP_SHIFT;
++ } else {
++ tmp |= ESDHC_TUNING_STEP_DEFAULT
++ << ESDHC_TUNING_STEP_SHIFT;
+ }
+
+ /* Disable the CMD CRC check for tuning, if not, need to
--- /dev/null
+From 8509419758f2cc28dd05370385af0d91573b76b4 Mon Sep 17 00:00:00 2001
+From: Samuel Holland <samuel@sholland.org>
+Date: Tue, 9 Aug 2022 21:25:09 -0500
+Subject: mmc: sunxi-mmc: Fix clock refcount imbalance during unbind
+
+From: Samuel Holland <samuel@sholland.org>
+
+commit 8509419758f2cc28dd05370385af0d91573b76b4 upstream.
+
+If the controller is suspended by runtime PM, the clock is already
+disabled, so do not try to disable it again during removal. Use
+pm_runtime_disable() to flush any pending runtime PM transitions.
+
+Fixes: 9a8e1e8cc2c0 ("mmc: sunxi: Add runtime_pm support")
+Signed-off-by: Samuel Holland <samuel@sholland.org>
+Acked-by: Jernej Skrabec <jernej.skrabec@gmail.com>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20220810022509.43743-1-samuel@sholland.org
+Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/mmc/host/sunxi-mmc.c | 8 +++++---
+ 1 file changed, 5 insertions(+), 3 deletions(-)
+
+--- a/drivers/mmc/host/sunxi-mmc.c
++++ b/drivers/mmc/host/sunxi-mmc.c
+@@ -1492,9 +1492,11 @@ static int sunxi_mmc_remove(struct platf
+ struct sunxi_mmc_host *host = mmc_priv(mmc);
+
+ mmc_remove_host(mmc);
+- pm_runtime_force_suspend(&pdev->dev);
+- disable_irq(host->irq);
+- sunxi_mmc_disable(host);
++ pm_runtime_disable(&pdev->dev);
++ if (!pm_runtime_status_suspended(&pdev->dev)) {
++ disable_irq(host->irq);
++ sunxi_mmc_disable(host);
++ }
+ dma_free_coherent(&pdev->dev, PAGE_SIZE, host->sg_cpu, host->sg_dma);
+ mmc_free_host(mmc);
+
--- /dev/null
+From 43d5f5d63699724d47f0d9e0eae516a260d232b4 Mon Sep 17 00:00:00 2001
+From: Ben Dooks <ben.dooks@codethink.co.uk>
+Date: Fri, 6 Jan 2023 13:44:56 +0000
+Subject: riscv: dts: sifive: fu740: fix size of pcie 32bit memory
+
+From: Ben Dooks <ben.dooks@codethink.co.uk>
+
+commit 43d5f5d63699724d47f0d9e0eae516a260d232b4 upstream.
+
+The 32-bit memory resource is needed for non-prefetchable memory
+allocations on the PCIe bus, however with some cards (such as the
+SM768) the system fails to allocate memory from this.
+
+Checking the allocation against the datasheet, it looks like there
+has been a mis-calcualation of the resource for the first memory
+region (0x0060090000..0x0070ffffff) which in the data-sheet for
+the fu740 (v1p2) is from 0x0060000000..0x007fffffff. Changing
+this to allocate from 0x0060090000..0x007fffffff fixes the probing
+issues.
+
+Fixes: ae80d5148085 ("riscv: dts: Add PCIe support for the SiFive FU740-C000 SoC")
+Cc: Paul Walmsley <paul.walmsley@sifive.com>
+Cc: Greentime Hu <greentime.hu@sifive.com>
+Signed-off-by: Ben Dooks <ben.dooks@codethink.co.uk>
+Cc: stable@vger.kernel.org
+Tested-by: Ron Economos <re@w6rz.net> # from IRC
+Reviewed-by: Conor Dooley <conor.dooley@microchip.com>
+Signed-off-by: Conor Dooley <conor.dooley@microchip.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/riscv/boot/dts/sifive/fu740-c000.dtsi | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/riscv/boot/dts/sifive/fu740-c000.dtsi
++++ b/arch/riscv/boot/dts/sifive/fu740-c000.dtsi
+@@ -328,7 +328,7 @@
+ bus-range = <0x0 0xff>;
+ ranges = <0x81000000 0x0 0x60080000 0x0 0x60080000 0x0 0x10000>, /* I/O */
+ <0x82000000 0x0 0x60090000 0x0 0x60090000 0x0 0xff70000>, /* mem */
+- <0x82000000 0x0 0x70000000 0x0 0x70000000 0x0 0x1000000>, /* mem */
++ <0x82000000 0x0 0x70000000 0x0 0x70000000 0x0 0x10000000>, /* mem */
+ <0xc3000000 0x20 0x00000000 0x20 0x00000000 0x20 0x00000000>; /* mem prefetchable */
+ num-lanes = <0x8>;
+ interrupts = <56>, <57>, <58>, <59>, <60>, <61>, <62>, <63>, <64>;
usb-core-hub-disable-autosuspend-for-ti-tusb8041.patch
comedi-adv_pci1760-fix-pwm-instruction-handling.patch
acpi-prm-check-whether-efi-runtime-is-available.patch
+mmc-sunxi-mmc-fix-clock-refcount-imbalance-during-unbind.patch
+mmc-sdhci-esdhc-imx-correct-the-tuning-start-tap-and-step-setting.patch
+mm-hugetlb-fix-pte-marker-handling-in-hugetlb_change_protection.patch
+mm-hugetlb-fix-uffd-wp-handling-for-migration-entries-in-hugetlb_change_protection.patch
+mm-hugetlb-pre-allocate-pgtable-pages-for-uffd-wr-protects.patch
+mm-userfaultfd-enable-writenotify-while-userfaultfd-wp-is-enabled-for-a-vma.patch
+mm-madv_collapse-don-t-expand-collapse-when-vm_end-is-past-requested-end.patch
+btrfs-add-extra-error-messages-to-cover-non-enomem-errors-from-device_add_list.patch
+btrfs-fix-missing-error-handling-when-logging-directory-items.patch
+btrfs-fix-directory-logging-due-to-race-with-concurrent-index-key-deletion.patch
+btrfs-add-missing-setup-of-log-for-full-commit-at-add_conflicting_inode.patch
+btrfs-do-not-abort-transaction-on-failure-to-write-log-tree-when-syncing-log.patch
+btrfs-do-not-abort-transaction-on-failure-to-update-log-root.patch
+btrfs-qgroup-do-not-warn-on-record-without-old_roots-populated.patch
+btrfs-fix-invalid-leaf-access-due-to-inline-extent-during-lseek.patch
+btrfs-fix-race-between-quota-rescan-and-disable-leading-to-null-pointer-deref.patch
+cifs-do-not-include-page-data-when-checking-signature.patch
+thunderbolt-disable-xdomain-lane-1-only-in-software-connection-manager.patch
+thunderbolt-use-correct-function-to-calculate-maximum-usb3-link-rate.patch
+thunderbolt-do-not-report-errors-if-on-board-retimers-are-found.patch
+thunderbolt-do-not-call-pm-runtime-functions-in-tb_retimer_scan.patch
+riscv-dts-sifive-fu740-fix-size-of-pcie-32bit-memory.patch
+bpf-restore-the-ebpf-program-id-for-bpf_audit_unload-and-perf_bpf_event_prog_unload.patch
--- /dev/null
+From 84ee211c83212f4d35b56e0603acdcc41f860f1b Mon Sep 17 00:00:00 2001
+From: Mika Westerberg <mika.westerberg@linux.intel.com>
+Date: Thu, 8 Sep 2022 09:45:22 +0300
+Subject: thunderbolt: Disable XDomain lane 1 only in software connection manager
+
+From: Mika Westerberg <mika.westerberg@linux.intel.com>
+
+commit 84ee211c83212f4d35b56e0603acdcc41f860f1b upstream.
+
+When firmware connection manager is in use we should not touch the lane
+adapter (well or any) configuration space so do this only when we know
+that the software connection manager is active.
+
+Fixes: 8e1de7042596 ("thunderbolt: Add support for XDomain lane bonding")
+Cc: stable@vger.kernel.org
+Acked-by: Yehezkel Bernat <YehezkelShB@gmail.com>
+Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/thunderbolt/xdomain.c | 17 ++++++++++++-----
+ 1 file changed, 12 insertions(+), 5 deletions(-)
+
+diff --git a/drivers/thunderbolt/xdomain.c b/drivers/thunderbolt/xdomain.c
+index cfa83486c9da..3c51e47dd86b 100644
+--- a/drivers/thunderbolt/xdomain.c
++++ b/drivers/thunderbolt/xdomain.c
+@@ -1419,12 +1419,19 @@ static int tb_xdomain_get_properties(struct tb_xdomain *xd)
+ * registered, we notify the userspace that it has changed.
+ */
+ if (!update) {
+- struct tb_port *port;
++ /*
++ * Now disable lane 1 if bonding was not enabled. Do
++ * this only if bonding was possible at the beginning
++ * (that is we are the connection manager and there are
++ * two lanes).
++ */
++ if (xd->bonding_possible) {
++ struct tb_port *port;
+
+- /* Now disable lane 1 if bonding was not enabled */
+- port = tb_port_at(xd->route, tb_xdomain_parent(xd));
+- if (!port->bonded)
+- tb_port_disable(port->dual_link_port);
++ port = tb_port_at(xd->route, tb_xdomain_parent(xd));
++ if (!port->bonded)
++ tb_port_disable(port->dual_link_port);
++ }
+
+ if (device_add(&xd->dev)) {
+ dev_err(&xd->dev, "failed to add XDomain device\n");
+--
+2.39.1
+
--- /dev/null
+From 23257cfc1cb7202fd0065e9f4a6a0aac1c04c4a9 Mon Sep 17 00:00:00 2001
+From: Mika Westerberg <mika.westerberg@linux.intel.com>
+Date: Thu, 29 Dec 2022 14:10:30 +0200
+Subject: thunderbolt: Do not call PM runtime functions in tb_retimer_scan()
+
+From: Mika Westerberg <mika.westerberg@linux.intel.com>
+
+commit 23257cfc1cb7202fd0065e9f4a6a0aac1c04c4a9 upstream.
+
+We cannot call PM runtime functions in tb_retimer_scan() because it will
+also be called when retimers are scanned from userspace (happens when
+there is no device connected on ChromeOS for instance) and at the same
+USB4 port runtime resume hook. This leads to hang because neither can
+proceed.
+
+Fix this by runtime resuming USB4 ports in tb_scan_port() instead. This
+makes sure the ports are runtime PM active when retimers are added under
+it while avoiding the reported hang as well.
+
+Reported-by: Utkarsh Patel <utkarsh.h.patel@intel.com>
+Fixes: 1e56c88adecc ("thunderbolt: Runtime resume USB4 port when retimers are scanned")
+Cc: stable@vger.kernel.org
+Acked-by: Yehezkel Bernat <YehezkelShB@gmail.com>
+Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/thunderbolt/retimer.c | 17 +++--------------
+ drivers/thunderbolt/tb.c | 20 +++++++++++++++-----
+ 2 files changed, 18 insertions(+), 19 deletions(-)
+
+--- a/drivers/thunderbolt/retimer.c
++++ b/drivers/thunderbolt/retimer.c
+@@ -427,13 +427,6 @@ int tb_retimer_scan(struct tb_port *port
+ {
+ u32 status[TB_MAX_RETIMER_INDEX + 1] = {};
+ int ret, i, last_idx = 0;
+- struct usb4_port *usb4;
+-
+- usb4 = port->usb4;
+- if (!usb4)
+- return 0;
+-
+- pm_runtime_get_sync(&usb4->dev);
+
+ /*
+ * Send broadcast RT to make sure retimer indices facing this
+@@ -441,7 +434,7 @@ int tb_retimer_scan(struct tb_port *port
+ */
+ ret = usb4_port_enumerate_retimers(port);
+ if (ret)
+- goto out;
++ return ret;
+
+ /*
+ * Enable sideband channel for each retimer. We can do this
+@@ -471,11 +464,11 @@ int tb_retimer_scan(struct tb_port *port
+ break;
+ }
+
+- ret = 0;
+ if (!last_idx)
+- goto out;
++ return 0;
+
+ /* Add on-board retimers if they do not exist already */
++ ret = 0;
+ for (i = 1; i <= last_idx; i++) {
+ struct tb_retimer *rt;
+
+@@ -489,10 +482,6 @@ int tb_retimer_scan(struct tb_port *port
+ }
+ }
+
+-out:
+- pm_runtime_mark_last_busy(&usb4->dev);
+- pm_runtime_put_autosuspend(&usb4->dev);
+-
+ return ret;
+ }
+
+--- a/drivers/thunderbolt/tb.c
++++ b/drivers/thunderbolt/tb.c
+@@ -628,11 +628,15 @@ static void tb_scan_port(struct tb_port
+ * Downstream switch is reachable through two ports.
+ * Only scan on the primary port (link_nr == 0).
+ */
++
++ if (port->usb4)
++ pm_runtime_get_sync(&port->usb4->dev);
++
+ if (tb_wait_for_port(port, false) <= 0)
+- return;
++ goto out_rpm_put;
+ if (port->remote) {
+ tb_port_dbg(port, "port already has a remote\n");
+- return;
++ goto out_rpm_put;
+ }
+
+ tb_retimer_scan(port, true);
+@@ -647,12 +651,12 @@ static void tb_scan_port(struct tb_port
+ */
+ if (PTR_ERR(sw) == -EIO || PTR_ERR(sw) == -EADDRNOTAVAIL)
+ tb_scan_xdomain(port);
+- return;
++ goto out_rpm_put;
+ }
+
+ if (tb_switch_configure(sw)) {
+ tb_switch_put(sw);
+- return;
++ goto out_rpm_put;
+ }
+
+ /*
+@@ -681,7 +685,7 @@ static void tb_scan_port(struct tb_port
+
+ if (tb_switch_add(sw)) {
+ tb_switch_put(sw);
+- return;
++ goto out_rpm_put;
+ }
+
+ /* Link the switches using both links if available */
+@@ -733,6 +737,12 @@ static void tb_scan_port(struct tb_port
+
+ tb_add_dp_resources(sw);
+ tb_scan_switch(sw);
++
++out_rpm_put:
++ if (port->usb4) {
++ pm_runtime_mark_last_busy(&port->usb4->dev);
++ pm_runtime_put_autosuspend(&port->usb4->dev);
++ }
+ }
+
+ static void tb_deactivate_and_free_tunnel(struct tb_tunnel *tunnel)
--- /dev/null
+From c28f3d80383571d3630df1a0e89500d23e855924 Mon Sep 17 00:00:00 2001
+From: Utkarsh Patel <utkarsh.h.patel@intel.com>
+Date: Thu, 22 Dec 2022 20:22:46 -0800
+Subject: thunderbolt: Do not report errors if on-board retimers are found
+
+From: Utkarsh Patel <utkarsh.h.patel@intel.com>
+
+commit c28f3d80383571d3630df1a0e89500d23e855924 upstream.
+
+Currently we return an error even if on-board retimers are found and
+that's not expected. Fix this to return an error only if there was one
+and 0 otherwise.
+
+Fixes: 1e56c88adecc ("thunderbolt: Runtime resume USB4 port when retimers are scanned")
+Cc: stable@vger.kernel.org
+Signed-off-by: Utkarsh Patel <utkarsh.h.patel@intel.com>
+Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/thunderbolt/retimer.c | 5 ++---
+ 1 file changed, 2 insertions(+), 3 deletions(-)
+
+--- a/drivers/thunderbolt/retimer.c
++++ b/drivers/thunderbolt/retimer.c
+@@ -471,10 +471,9 @@ int tb_retimer_scan(struct tb_port *port
+ break;
+ }
+
+- if (!last_idx) {
+- ret = 0;
++ ret = 0;
++ if (!last_idx)
+ goto out;
+- }
+
+ /* Add on-board retimers if they do not exist already */
+ for (i = 1; i <= last_idx; i++) {
--- /dev/null
+From e8ff07fb33026c5c1bb5b81293496faba5d68059 Mon Sep 17 00:00:00 2001
+From: Mika Westerberg <mika.westerberg@linux.intel.com>
+Date: Fri, 20 May 2022 13:35:19 +0300
+Subject: thunderbolt: Use correct function to calculate maximum USB3 link rate
+
+From: Mika Westerberg <mika.westerberg@linux.intel.com>
+
+commit e8ff07fb33026c5c1bb5b81293496faba5d68059 upstream.
+
+We need to take minimum of both sides of the USB3 link into consideration,
+not just the downstream port. Fix this by calling tb_usb3_max_link_rate()
+instead.
+
+Fixes: 0bd680cd900c ("thunderbolt: Add USB3 bandwidth management")
+Cc: stable@vger.kernel.org
+Acked-by: Yehezkel Bernat <YehezkelShB@gmail.com>
+Signed-off-by: Mika Westerberg <mika.westerberg@linux.intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/thunderbolt/tunnel.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/thunderbolt/tunnel.c
++++ b/drivers/thunderbolt/tunnel.c
+@@ -1275,7 +1275,7 @@ static void tb_usb3_reclaim_available_ba
+ return;
+ } else if (!ret) {
+ /* Use maximum link rate if the link valid is not set */
+- ret = usb4_usb3_port_max_link_rate(tunnel->src_port);
++ ret = tb_usb3_max_link_rate(tunnel->dst_port, tunnel->src_port);
+ if (ret < 0) {
+ tb_tunnel_warn(tunnel, "failed to read maximum link rate\n");
+ return;