From: Greg Kroah-Hartman Date: Sun, 22 Jan 2023 12:59:46 +0000 (+0100) Subject: 6.1-stable patches X-Git-Tag: v4.14.304~35 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=abda1fe7499e98c15c30d74a66e6532e8e70f829;p=thirdparty%2Fkernel%2Fstable-queue.git 6.1-stable patches added patches: bpf-restore-the-ebpf-program-id-for-bpf_audit_unload-and-perf_bpf_event_prog_unload.patch btrfs-add-extra-error-messages-to-cover-non-enomem-errors-from-device_add_list.patch btrfs-add-missing-setup-of-log-for-full-commit-at-add_conflicting_inode.patch btrfs-do-not-abort-transaction-on-failure-to-update-log-root.patch btrfs-do-not-abort-transaction-on-failure-to-write-log-tree-when-syncing-log.patch btrfs-fix-directory-logging-due-to-race-with-concurrent-index-key-deletion.patch btrfs-fix-invalid-leaf-access-due-to-inline-extent-during-lseek.patch btrfs-fix-missing-error-handling-when-logging-directory-items.patch btrfs-fix-race-between-quota-rescan-and-disable-leading-to-null-pointer-deref.patch btrfs-qgroup-do-not-warn-on-record-without-old_roots-populated.patch cifs-do-not-include-page-data-when-checking-signature.patch mm-hugetlb-fix-pte-marker-handling-in-hugetlb_change_protection.patch mm-hugetlb-fix-uffd-wp-handling-for-migration-entries-in-hugetlb_change_protection.patch mm-hugetlb-pre-allocate-pgtable-pages-for-uffd-wr-protects.patch mm-madv_collapse-don-t-expand-collapse-when-vm_end-is-past-requested-end.patch mm-userfaultfd-enable-writenotify-while-userfaultfd-wp-is-enabled-for-a-vma.patch mmc-sdhci-esdhc-imx-correct-the-tuning-start-tap-and-step-setting.patch mmc-sunxi-mmc-fix-clock-refcount-imbalance-during-unbind.patch riscv-dts-sifive-fu740-fix-size-of-pcie-32bit-memory.patch thunderbolt-disable-xdomain-lane-1-only-in-software-connection-manager.patch thunderbolt-do-not-call-pm-runtime-functions-in-tb_retimer_scan.patch thunderbolt-do-not-report-errors-if-on-board-retimers-are-found.patch thunderbolt-use-correct-function-to-calculate-maximum-usb3-link-rate.patch --- diff --git a/queue-6.1/bpf-restore-the-ebpf-program-id-for-bpf_audit_unload-and-perf_bpf_event_prog_unload.patch b/queue-6.1/bpf-restore-the-ebpf-program-id-for-bpf_audit_unload-and-perf_bpf_event_prog_unload.patch new file mode 100644 index 00000000000..89d657d645f --- /dev/null +++ b/queue-6.1/bpf-restore-the-ebpf-program-id-for-bpf_audit_unload-and-perf_bpf_event_prog_unload.patch @@ -0,0 +1,94 @@ +From ef01f4e25c1760920e2c94f1c232350277ace69b Mon Sep 17 00:00:00 2001 +From: Paul Moore +Date: Fri, 6 Jan 2023 10:43:59 -0500 +Subject: bpf: restore the ebpf program ID for BPF_AUDIT_UNLOAD and PERF_BPF_EVENT_PROG_UNLOAD + +From: Paul Moore + +commit ef01f4e25c1760920e2c94f1c232350277ace69b upstream. + +When changing the ebpf program put() routines to support being called +from within IRQ context the program ID was reset to zero prior to +calling the perf event and audit UNLOAD record generators, which +resulted in problems as the ebpf program ID was bogus (always zero). +This patch addresses this problem by removing an unnecessary call to +bpf_prog_free_id() in __bpf_prog_offload_destroy() and adjusting +__bpf_prog_put() to only call bpf_prog_free_id() after audit and perf +have finished their bpf program unload tasks in +bpf_prog_put_deferred(). For the record, no one can determine, or +remember, why it was necessary to free the program ID, and remove it +from the IDR, prior to executing bpf_prog_put_deferred(); +regardless, both Stanislav and Alexei agree that the approach in this +patch should be safe. 
+ +It is worth noting that when moving the bpf_prog_free_id() call, the +do_idr_lock parameter was forced to true as the ebpf devs determined +this was correct, as the do_idr_lock should always be true. The +do_idr_lock parameter will be removed in a follow-up patch, but it +was kept here to keep the patch small in an effort to ease any stable +backports. + +I also modified the bpf_audit_prog() logic used to associate the +AUDIT_BPF record with other associated records, e.g. @ctx != NULL. +Instead of keying off the operation, it now keys off the execution +context, e.g. '!in_irq() && !irqs_disabled()', which is much more +appropriate and should help better connect the UNLOAD operations with +the associated audit state (other audit records). + +Cc: stable@vger.kernel.org +Fixes: d809e134be7a ("bpf: Prepare bpf_prog_put() to be called from irq context.") +Reported-by: Burn Alting +Reported-by: Jiri Olsa +Suggested-by: Stanislav Fomichev +Suggested-by: Alexei Starovoitov +Signed-off-by: Paul Moore +Acked-by: Stanislav Fomichev +Link: https://lore.kernel.org/r/20230106154400.74211-1-paul@paul-moore.com +Signed-off-by: Alexei Starovoitov +Signed-off-by: Greg Kroah-Hartman +--- + kernel/bpf/offload.c | 3 --- + kernel/bpf/syscall.c | 6 ++---- + 2 files changed, 2 insertions(+), 7 deletions(-) + +--- a/kernel/bpf/offload.c ++++ b/kernel/bpf/offload.c +@@ -216,9 +216,6 @@ static void __bpf_prog_offload_destroy(s + if (offload->dev_state) + offload->offdev->ops->destroy(prog); + +- /* Make sure BPF_PROG_GET_NEXT_ID can't find this dead program */ +- bpf_prog_free_id(prog, true); +- + list_del_init(&offload->offloads); + kfree(offload); + prog->aux->offload = NULL; +--- a/kernel/bpf/syscall.c ++++ b/kernel/bpf/syscall.c +@@ -1958,7 +1958,7 @@ static void bpf_audit_prog(const struct + return; + if (audit_enabled == AUDIT_OFF) + return; +- if (op == BPF_AUDIT_LOAD) ++ if (!in_irq() && !irqs_disabled()) + ctx = audit_context(); + ab = audit_log_start(ctx, GFP_ATOMIC, AUDIT_BPF); + if (unlikely(!ab)) +@@ -2053,6 +2053,7 @@ static void bpf_prog_put_deferred(struct + prog = aux->prog; + perf_event_bpf_event(prog, PERF_BPF_EVENT_PROG_UNLOAD, 0); + bpf_audit_prog(prog, BPF_AUDIT_UNLOAD); ++ bpf_prog_free_id(prog, true); + __bpf_prog_put_noref(prog, true); + } + +@@ -2061,9 +2062,6 @@ static void __bpf_prog_put(struct bpf_pr + struct bpf_prog_aux *aux = prog->aux; + + if (atomic64_dec_and_test(&aux->refcnt)) { +- /* bpf_prog_free_id() must be called first */ +- bpf_prog_free_id(prog, do_idr_lock); +- + if (in_irq() || irqs_disabled()) { + INIT_WORK(&aux->work, bpf_prog_put_deferred); + schedule_work(&aux->work); diff --git a/queue-6.1/btrfs-add-extra-error-messages-to-cover-non-enomem-errors-from-device_add_list.patch b/queue-6.1/btrfs-add-extra-error-messages-to-cover-non-enomem-errors-from-device_add_list.patch new file mode 100644 index 00000000000..53b74ecde9d --- /dev/null +++ b/queue-6.1/btrfs-add-extra-error-messages-to-cover-non-enomem-errors-from-device_add_list.patch @@ -0,0 +1,87 @@ +From ed02363fbbed52a3f5ea0d188edd09045a806eb5 Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Mon, 12 Dec 2022 10:19:37 +0800 +Subject: btrfs: add extra error messages to cover non-ENOMEM errors from device_add_list() + +From: Qu Wenruo + +commit ed02363fbbed52a3f5ea0d188edd09045a806eb5 upstream. + +[BUG] +When test case btrfs/219 (aka, mount a registered device but with a lower +generation) fails, there is no useful information for the end user +to find out what's going wrong.
+ +The mount failure just looks like this: + + # mount -o loop /tmp/219.img2 /mnt/btrfs/ + mount: /mnt/btrfs: mount(2) system call failed: File exists. + dmesg(1) may have more information after failed mount system call. + +While the dmesg contains nothing but the loop device change: + + loop1: detected capacity change from 0 to 524288 + +[CAUSE] +In device_list_add() we have a lot of extra checks to reject invalid +cases. + +That function also contains the regular device scan result like the +following prompt: + + BTRFS: device fsid 6222333e-f9f1-47e6-b306-55ddd4dcaef4 devid 1 transid 8 /dev/loop0 scanned by systemd-udevd (3027) + +But unfortunately not all errors have their own error messages, thus if +we hit something wrong in device_add_list(), there may be no error +messages at all. + +[FIX] +Add errors message for all non-ENOMEM errors. + +For ENOMEM, I'd say we're in a much worse situation, and there should be +some OOM messages way before our call sites. + +CC: stable@vger.kernel.org # 6.0+ +Signed-off-by: Qu Wenruo +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/volumes.c | 11 ++++++++++- + 1 file changed, 10 insertions(+), 1 deletion(-) + +--- a/fs/btrfs/volumes.c ++++ b/fs/btrfs/volumes.c +@@ -770,8 +770,11 @@ static noinline struct btrfs_device *dev + BTRFS_SUPER_FLAG_CHANGING_FSID_V2); + + error = lookup_bdev(path, &path_devt); +- if (error) ++ if (error) { ++ btrfs_err(NULL, "failed to lookup block device for path %s: %d", ++ path, error); + return ERR_PTR(error); ++ } + + if (fsid_change_in_progress) { + if (!has_metadata_uuid) +@@ -836,6 +839,9 @@ static noinline struct btrfs_device *dev + + if (!device) { + if (fs_devices->opened) { ++ btrfs_err(NULL, ++ "device %s belongs to fsid %pU, and the fs is already mounted", ++ path, fs_devices->fsid); + mutex_unlock(&fs_devices->device_list_mutex); + return ERR_PTR(-EBUSY); + } +@@ -910,6 +916,9 @@ static noinline struct btrfs_device *dev + * generation are equal. + */ + mutex_unlock(&fs_devices->device_list_mutex); ++ btrfs_err(NULL, ++"device %s already registered with a higher generation, found %llu expect %llu", ++ path, found_transid, device->generation); + return ERR_PTR(-EEXIST); + } + diff --git a/queue-6.1/btrfs-add-missing-setup-of-log-for-full-commit-at-add_conflicting_inode.patch b/queue-6.1/btrfs-add-missing-setup-of-log-for-full-commit-at-add_conflicting_inode.patch new file mode 100644 index 00000000000..cbe54c7d3d8 --- /dev/null +++ b/queue-6.1/btrfs-add-missing-setup-of-log-for-full-commit-at-add_conflicting_inode.patch @@ -0,0 +1,43 @@ +From 94cd63ae679973edeb5ea95ec25a54467c3e54c8 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Tue, 10 Jan 2023 14:56:36 +0000 +Subject: btrfs: add missing setup of log for full commit at add_conflicting_inode() + +From: Filipe Manana + +commit 94cd63ae679973edeb5ea95ec25a54467c3e54c8 upstream. + +When logging conflicting inodes, if we reach the maximum limit of inodes, +we return BTRFS_LOG_FORCE_COMMIT to force a transaction commit. However +we don't mark the log for full commit (with btrfs_set_log_full_commit()), +which means that once we leave the log transaction and before we commit +the transaction, some other task may sync the log, which is incomplete +as we have not logged all conflicting inodes, leading to some inconsistent +in case that log ends up being replayed. + +So also call btrfs_set_log_full_commit() at add_conflicting_inode(). 
+ +Fixes: e09d94c9e448 ("btrfs: log conflicting inodes without holding log mutex of the initial inode") +CC: stable@vger.kernel.org # 6.1 +Reviewed-by: Josef Bacik +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/tree-log.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -5626,8 +5626,10 @@ static int add_conflicting_inode(struct + * LOG_INODE_EXISTS mode) and slow down other fsyncs or transaction + * commits. + */ +- if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) ++ if (ctx->num_conflict_inodes >= MAX_CONFLICT_INODES) { ++ btrfs_set_log_full_commit(trans); + return BTRFS_LOG_FORCE_COMMIT; ++ } + + inode = btrfs_iget(root->fs_info->sb, ino, root); + /* diff --git a/queue-6.1/btrfs-do-not-abort-transaction-on-failure-to-update-log-root.patch b/queue-6.1/btrfs-do-not-abort-transaction-on-failure-to-update-log-root.patch new file mode 100644 index 00000000000..620d797e297 --- /dev/null +++ b/queue-6.1/btrfs-do-not-abort-transaction-on-failure-to-update-log-root.patch @@ -0,0 +1,52 @@ +From 09e44868f1e03c7825ca4283256abedc95e249a3 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Tue, 10 Jan 2023 14:56:38 +0000 +Subject: btrfs: do not abort transaction on failure to update log root + +From: Filipe Manana + +commit 09e44868f1e03c7825ca4283256abedc95e249a3 upstream. + +When syncing a log, if we fail to update a log root in the log root tree, +we are aborting the transaction if the failure was not -ENOSPC. This is +excessive because there is a chance that a transaction commit can succeed, +and therefore avoid to turn the filesystem into RO mode. All we need to be +careful about is to mark the log for a full commit, which we already do, +to make sure no one commits a super block pointing to an outdated log root +tree. + +So don't abort the transaction if we fail to update a log root in the log +root tree, and log an error if the failure is not -ENOSPC, so that it does +not go completely unnoticed. 
+ +CC: stable@vger.kernel.org # 6.0+ +Reviewed-by: Josef Bacik +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/tree-log.c | 11 ++++------- + 1 file changed, 4 insertions(+), 7 deletions(-) + +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -3075,15 +3075,12 @@ int btrfs_sync_log(struct btrfs_trans_ha + + blk_finish_plug(&plug); + btrfs_set_log_full_commit(trans); +- +- if (ret != -ENOSPC) { +- btrfs_abort_transaction(trans, ret); +- mutex_unlock(&log_root_tree->log_mutex); +- goto out; +- } ++ if (ret != -ENOSPC) ++ btrfs_err(fs_info, ++ "failed to update log for root %llu ret %d", ++ root->root_key.objectid, ret); + btrfs_wait_tree_log_extents(log, mark); + mutex_unlock(&log_root_tree->log_mutex); +- ret = BTRFS_LOG_FORCE_COMMIT; + goto out; + } + diff --git a/queue-6.1/btrfs-do-not-abort-transaction-on-failure-to-write-log-tree-when-syncing-log.patch b/queue-6.1/btrfs-do-not-abort-transaction-on-failure-to-write-log-tree-when-syncing-log.patch new file mode 100644 index 00000000000..17487d4ed65 --- /dev/null +++ b/queue-6.1/btrfs-do-not-abort-transaction-on-failure-to-write-log-tree-when-syncing-log.patch @@ -0,0 +1,70 @@ +From 16199ad9eb6db60a6b10794a09fc1ac6d09312ff Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Tue, 10 Jan 2023 14:56:37 +0000 +Subject: btrfs: do not abort transaction on failure to write log tree when syncing log + +From: Filipe Manana + +commit 16199ad9eb6db60a6b10794a09fc1ac6d09312ff upstream. + +When syncing the log, if we fail to write log tree extent buffers, we mark +the log for a full commit and abort the transaction. However we don't need +to abort the transaction, all we really need to do is to make sure no one +can commit a superblock pointing to new log tree roots. Just because we +got a failure writing extent buffers for a log tree, it does not mean we +will also fail to do a transaction commit. + +One particular case is if due to a bug somewhere, when writing log tree +extent buffers, the tree checker detects some corruption and the writeout +fails because of that. Aborting the transaction can be very disruptive for +a user, specially if the issue happened on a root filesystem. One example +is the scenario in the Link tag below, where an isolated corruption on log +tree leaves was causing transaction aborts when syncing the log. + +Link: https://lore.kernel.org/linux-btrfs/ae169fc6-f504-28f0-a098-6fa6a4dfb612@leemhuis.info/ +CC: stable@vger.kernel.org # 5.15+ +Reviewed-by: Josef Bacik +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/disk-io.c | 9 ++++++++- + fs/btrfs/tree-log.c | 2 -- + 2 files changed, 8 insertions(+), 3 deletions(-) + +--- a/fs/btrfs/disk-io.c ++++ b/fs/btrfs/disk-io.c +@@ -344,7 +344,14 @@ error: + btrfs_print_tree(eb, 0); + btrfs_err(fs_info, "block=%llu write time tree block corruption detected", + eb->start); +- WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); ++ /* ++ * Be noisy if this is an extent buffer from a log tree. We don't abort ++ * a transaction in case there's a bad log tree extent buffer, we just ++ * fallback to a transaction commit. Still we want to know when there is ++ * a bad log tree extent buffer, as that may signal a bug somewhere. 
++ */ ++ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG) || ++ btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID); + return ret; + } + +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -3011,7 +3011,6 @@ int btrfs_sync_log(struct btrfs_trans_ha + ret = 0; + if (ret) { + blk_finish_plug(&plug); +- btrfs_abort_transaction(trans, ret); + btrfs_set_log_full_commit(trans); + mutex_unlock(&root->log_mutex); + goto out; +@@ -3143,7 +3142,6 @@ int btrfs_sync_log(struct btrfs_trans_ha + goto out_wake_log_root; + } else if (ret) { + btrfs_set_log_full_commit(trans); +- btrfs_abort_transaction(trans, ret); + mutex_unlock(&log_root_tree->log_mutex); + goto out_wake_log_root; + } diff --git a/queue-6.1/btrfs-fix-directory-logging-due-to-race-with-concurrent-index-key-deletion.patch b/queue-6.1/btrfs-fix-directory-logging-due-to-race-with-concurrent-index-key-deletion.patch new file mode 100644 index 00000000000..65d0e8b0b57 --- /dev/null +++ b/queue-6.1/btrfs-fix-directory-logging-due-to-race-with-concurrent-index-key-deletion.patch @@ -0,0 +1,104 @@ +From 8bb6898da6271d82d8e76d8088d66b971a7dcfa6 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Tue, 10 Jan 2023 14:56:35 +0000 +Subject: btrfs: fix directory logging due to race with concurrent index key deletion + +From: Filipe Manana + +commit 8bb6898da6271d82d8e76d8088d66b971a7dcfa6 upstream. + +Sometimes we log a directory without holding its VFS lock, so while we +logging it, dir index entries may be added or removed. This typically +happens when logging a dentry from a parent directory that points to a +new directory, through log_new_dir_dentries(), or when while logging +some other inode we also need to log its parent directories (through +btrfs_log_all_parents()). + +This means that while we are at log_dir_items(), we may not find a dir +index key we found before, because it was deleted in the meanwhile, so +a call to btrfs_search_slot() may return 1 (key not found). In that case +we return from log_dir_items() with a success value (the variable 'err' +has a value of 0). This can lead to a few problems, specially in the case +where the variable 'last_offset' has a value of (u64)-1 (and it's +initialized to that when it was declared): + +1) By returning from log_dir_items() with success (0) and a value of + (u64)-1 for '*last_offset_ret', we end up not logging any other dir + index keys that follow the missing, just deleted, index key. The + (u64)-1 value makes log_directory_changes() not call log_dir_items() + again; + +2) Before returning with success (0), log_dir_items(), will log a dir + index range item covering a range from the last old dentry index + (stored in the variable 'last_old_dentry_offset') to the value of + 'last_offset'. If 'last_offset' has a value of (u64)-1, then it means + if the log is persisted and replayed after a power failure, it will + cause deletion of all the directory entries that have an index number + between last_old_dentry_offset + 1 and (u64)-1; + +3) We can end up returning from log_dir_items() with + ctx->last_dir_item_offset having a lower value than + inode->last_dir_index_offset, because the former is set to the current + key we are processing at process_dir_items_leaf(), and at the end of + log_directory_changes() we set inode->last_dir_index_offset to the + current value of ctx->last_dir_item_offset. 
So if for example a + deletion of a lower dir index key happened, we set + ctx->last_dir_item_offset to that index value, then if we return from + log_dir_items() because btrfs_search_slot() returned 1, we end up + returning from log_dir_items() with success (0) and then + log_directory_changes() sets inode->last_dir_index_offset to a lower + value than it had before. + This can result in unpredictable and unexpected behaviour when we + need to log again the directory in the same transaction, and can result + in ending up with a log tree leaf that has duplicated keys, as we do + batch insertions of dir index keys into a log tree. + +So fix this by making log_dir_items() move on to the next dir index key +if it does not find the one it was looking for. + +Reported-by: David Arendt +Link: https://lore.kernel.org/linux-btrfs/ae169fc6-f504-28f0-a098-6fa6a4dfb612@leemhuis.info/ +CC: stable@vger.kernel.org # 4.14+ +Reviewed-by: Josef Bacik +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/tree-log.c | 21 +++++++++++++++------ + 1 file changed, 15 insertions(+), 6 deletions(-) + +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -3888,17 +3888,26 @@ static noinline int log_dir_items(struct + btrfs_release_path(path); + + /* +- * Find the first key from this transaction again. See the note for +- * log_new_dir_dentries, if we're logging a directory recursively we +- * won't be holding its i_mutex, which means we can modify the directory +- * while we're logging it. If we remove an entry between our first +- * search and this search we'll not find the key again and can just +- * bail. ++ * Find the first key from this transaction again or the one we were at ++ * in the loop below in case we had to reschedule. We may be logging the ++ * directory without holding its VFS lock, which happen when logging new ++ * dentries (through log_new_dir_dentries()) or in some cases when we ++ * need to log the parent directory of an inode. This means a dir index ++ * key might be deleted from the inode's root, and therefore we may not ++ * find it anymore. If we can't find it, just move to the next key. We ++ * can not bail out and ignore, because if we do that we will simply ++ * not log dir index keys that come after the one that was just deleted ++ * and we can end up logging a dir index range that ends at (u64)-1 ++ * (@last_offset is initialized to that), resulting in removing dir ++ * entries we should not remove at log replay time. + */ + search: + ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); ++ if (ret > 0) ++ ret = btrfs_next_item(root, path); + if (ret < 0) + err = ret; ++ /* If ret is 1, there are no more keys in the inode's root. */ + if (ret != 0) + goto done; + diff --git a/queue-6.1/btrfs-fix-invalid-leaf-access-due-to-inline-extent-during-lseek.patch b/queue-6.1/btrfs-fix-invalid-leaf-access-due-to-inline-extent-during-lseek.patch new file mode 100644 index 00000000000..d84a42c55f4 --- /dev/null +++ b/queue-6.1/btrfs-fix-invalid-leaf-access-due-to-inline-extent-during-lseek.patch @@ -0,0 +1,68 @@ +From 1f55ee6d0901d915801618bda0af4e5b937e3db7 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Thu, 12 Jan 2023 14:17:20 +0000 +Subject: btrfs: fix invalid leaf access due to inline extent during lseek + +From: Filipe Manana + +commit 1f55ee6d0901d915801618bda0af4e5b937e3db7 upstream. + +During lseek, for SEEK_DATA and SEEK_HOLE modes, we access the disk_bytenr +of an extent without checking its type. 
However inline extents have their +data starting the offset of the disk_bytenr field, so accessing that field +when we have an inline extent can result in either of the following: + +1) Interpret the inline extent's data as a disk_bytenr value; + +2) In case the inline data is less than 8 bytes, we access part of some + other item in the leaf, or unused space in the leaf; + +3) In case the inline data is less than 8 bytes and the extent item is + the first item in the leaf, we can access beyond the leaf's limit. + +So fix this by not accessing the disk_bytenr field if we have an inline +extent. + +Fixes: b6e833567ea1 ("btrfs: make hole and data seeking a lot more efficient") +Reported-by: Matthias Schoepfer +Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=216908 +Link: https://lore.kernel.org/linux-btrfs/7f25442f-b121-2a3a-5a3d-22bcaae83cd4@leemhuis.info/ +CC: stable@vger.kernel.org # 6.1 +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/file.c | 13 ++++++++++--- + 1 file changed, 10 insertions(+), 3 deletions(-) + +--- a/fs/btrfs/file.c ++++ b/fs/btrfs/file.c +@@ -3838,6 +3838,7 @@ static loff_t find_desired_extent(struct + struct extent_buffer *leaf = path->nodes[0]; + struct btrfs_file_extent_item *extent; + u64 extent_end; ++ u8 type; + + if (path->slots[0] >= btrfs_header_nritems(leaf)) { + ret = btrfs_next_leaf(root, path); +@@ -3892,10 +3893,16 @@ static loff_t find_desired_extent(struct + + extent = btrfs_item_ptr(leaf, path->slots[0], + struct btrfs_file_extent_item); ++ type = btrfs_file_extent_type(leaf, extent); + +- if (btrfs_file_extent_disk_bytenr(leaf, extent) == 0 || +- btrfs_file_extent_type(leaf, extent) == +- BTRFS_FILE_EXTENT_PREALLOC) { ++ /* ++ * Can't access the extent's disk_bytenr field if this is an ++ * inline extent, since at that offset, it's where the extent ++ * data starts. ++ */ ++ if (type == BTRFS_FILE_EXTENT_PREALLOC || ++ (type == BTRFS_FILE_EXTENT_REG && ++ btrfs_file_extent_disk_bytenr(leaf, extent) == 0)) { + /* + * Explicit hole or prealloc extent, search for delalloc. + * A prealloc extent is treated like a hole. diff --git a/queue-6.1/btrfs-fix-missing-error-handling-when-logging-directory-items.patch b/queue-6.1/btrfs-fix-missing-error-handling-when-logging-directory-items.patch new file mode 100644 index 00000000000..aa9d50f0ca5 --- /dev/null +++ b/queue-6.1/btrfs-fix-missing-error-handling-when-logging-directory-items.patch @@ -0,0 +1,99 @@ +From 6d3d970b2735b967650d319be27268fedc5598d1 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Tue, 10 Jan 2023 14:56:34 +0000 +Subject: btrfs: fix missing error handling when logging directory items + +From: Filipe Manana + +commit 6d3d970b2735b967650d319be27268fedc5598d1 upstream. + +When logging a directory, at log_dir_items(), if we get an error when +attempting to search the subvolume tree for a dir index item, we end up +returning 0 (success) from log_dir_items() because 'err' is left with a +value of 0. + +This can lead to a few problems, specially in the case the variable +'last_offset' has a value of (u64)-1 (and it's initialized to that when +it was declared): + +1) By returning from log_dir_items() with success (0) and a value of + (u64)-1 for '*last_offset_ret', we end up not logging any other dir + index keys that follow the missing, just deleted, index key. 
The + (u64)-1 value makes log_directory_changes() not call log_dir_items() + again; + +2) Before returning with success (0), log_dir_items(), will log a dir + index range item covering a range from the last old dentry index + (stored in the variable 'last_old_dentry_offset') to the value of + 'last_offset'. If 'last_offset' has a value of (u64)-1, then it means + if the log is persisted and replayed after a power failure, it will + cause deletion of all the directory entries that have an index number + between last_old_dentry_offset + 1 and (u64)-1; + +3) We can end up returning from log_dir_items() with + ctx->last_dir_item_offset having a lower value than + inode->last_dir_index_offset, because the former is set to the current + key we are processing at process_dir_items_leaf(), and at the end of + log_directory_changes() we set inode->last_dir_index_offset to the + current value of ctx->last_dir_item_offset. So if for example a + deletion of a lower dir index key happened, we set + ctx->last_dir_item_offset to that index value, then if we return from + log_dir_items() because btrfs_search_slot() returned an error, we end up + returning without any error from log_dir_items() and then + log_directory_changes() sets inode->last_dir_index_offset to a lower + value than it had before. + This can result in unpredictable and unexpected behaviour when we + need to log again the directory in the same transaction, and can result + in ending up with a log tree leaf that has duplicated keys, as we do + batch insertions of dir index keys into a log tree. + +Fix this by setting 'err' to the value of 'ret' in case +btrfs_search_slot() or btrfs_previous_item() returned an error. That will +result in falling back to a full transaction commit. + +Reported-by: David Arendt +Link: https://lore.kernel.org/linux-btrfs/ae169fc6-f504-28f0-a098-6fa6a4dfb612@leemhuis.info/ +Fixes: e02119d5a7b4 ("Btrfs: Add a write ahead tree log to optimize synchronous operations") +CC: stable@vger.kernel.org # 4.14+ +Reviewed-by: Josef Bacik +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/tree-log.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -3857,7 +3857,10 @@ static noinline int log_dir_items(struct + path->slots[0]); + if (tmp.type == BTRFS_DIR_INDEX_KEY) + last_old_dentry_offset = tmp.offset; ++ } else if (ret < 0) { ++ err = ret; + } ++ + goto done; + } + +@@ -3877,7 +3880,11 @@ static noinline int log_dir_items(struct + */ + if (tmp.type == BTRFS_DIR_INDEX_KEY) + last_old_dentry_offset = tmp.offset; ++ } else if (ret < 0) { ++ err = ret; ++ goto done; + } ++ + btrfs_release_path(path); + + /* +@@ -3890,6 +3897,8 @@ static noinline int log_dir_items(struct + */ + search: + ret = btrfs_search_slot(NULL, root, &min_key, path, 0, 0); ++ if (ret < 0) ++ err = ret; + if (ret != 0) + goto done; + diff --git a/queue-6.1/btrfs-fix-race-between-quota-rescan-and-disable-leading-to-null-pointer-deref.patch b/queue-6.1/btrfs-fix-race-between-quota-rescan-and-disable-leading-to-null-pointer-deref.patch new file mode 100644 index 00000000000..1cb586dc0a8 --- /dev/null +++ b/queue-6.1/btrfs-fix-race-between-quota-rescan-and-disable-leading-to-null-pointer-deref.patch @@ -0,0 +1,134 @@ +From b7adbf9ada3513d2092362c8eac5cddc5b651f5c Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Thu, 12 Jan 2023 16:31:08 +0000 +Subject: btrfs: fix race between quota rescan and disable leading to NULL pointer deref + +From: 
Filipe Manana + +commit b7adbf9ada3513d2092362c8eac5cddc5b651f5c upstream. + +If we have one task trying to start the quota rescan worker while another +one is trying to disable quotas, we can end up hitting a race that results +in the quota rescan worker doing a NULL pointer dereference. The steps for +this are the following: + +1) Quotas are enabled; + +2) Task A calls the quota rescan ioctl and enters btrfs_qgroup_rescan(). + It calls qgroup_rescan_init() which returns 0 (success) and then joins a + transaction and commits it; + +3) Task B calls the quota disable ioctl and enters btrfs_quota_disable(). + It clears the bit BTRFS_FS_QUOTA_ENABLED from fs_info->flags and calls + btrfs_qgroup_wait_for_completion(), which returns immediately since the + rescan worker is not yet running. + Then it starts a transaction and locks fs_info->qgroup_ioctl_lock; + +4) Task A queues the rescan worker, by calling btrfs_queue_work(); + +5) The rescan worker starts, and calls rescan_should_stop() at the start + of its while loop, which results in 0 iterations of the loop, since + the flag BTRFS_FS_QUOTA_ENABLED was cleared from fs_info->flags by + task B at step 3); + +6) Task B sets fs_info->quota_root to NULL; + +7) The rescan worker tries to start a transaction and uses + fs_info->quota_root as the root argument for btrfs_start_transaction(). + This results in a NULL pointer dereference down the call chain of + btrfs_start_transaction(). The stack trace is something like the one + reported in Link tag below: + + general protection fault, probably for non-canonical address 0xdffffc0000000041: 0000 [#1] PREEMPT SMP KASAN + KASAN: null-ptr-deref in range [0x0000000000000208-0x000000000000020f] + CPU: 1 PID: 34 Comm: kworker/u4:2 Not tainted 6.1.0-syzkaller-13872-gb6bb9676f216 #0 + Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/26/2022 + Workqueue: btrfs-qgroup-rescan btrfs_work_helper + RIP: 0010:start_transaction+0x48/0x10f0 fs/btrfs/transaction.c:564 + Code: 48 89 fb 48 (...) + RSP: 0018:ffffc90000ab7ab0 EFLAGS: 00010206 + RAX: 0000000000000041 RBX: 0000000000000208 RCX: ffff88801779ba80 + RDX: 0000000000000000 RSI: 0000000000000001 RDI: 0000000000000000 + RBP: dffffc0000000000 R08: 0000000000000001 R09: fffff52000156f5d + R10: fffff52000156f5d R11: 1ffff92000156f5c R12: 0000000000000000 + R13: 0000000000000001 R14: 0000000000000001 R15: 0000000000000003 + FS: 0000000000000000(0000) GS:ffff8880b9900000(0000) knlGS:0000000000000000 + CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + CR2: 00007f2bea75b718 CR3: 000000001d0cc000 CR4: 00000000003506e0 + DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 + DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 + Call Trace: + + btrfs_qgroup_rescan_worker+0x3bb/0x6a0 fs/btrfs/qgroup.c:3402 + btrfs_work_helper+0x312/0x850 fs/btrfs/async-thread.c:280 + process_one_work+0x877/0xdb0 kernel/workqueue.c:2289 + worker_thread+0xb14/0x1330 kernel/workqueue.c:2436 + kthread+0x266/0x300 kernel/kthread.c:376 + ret_from_fork+0x1f/0x30 arch/x86/entry/entry_64.S:308 + + Modules linked in: + +So fix this by having the rescan worker function not attempt to start a +transaction if it didn't do any rescan work. 
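+
+The window is easy to hit with two racing ioctls. Below is a minimal
+sketch (a hypothetical reproducer, not the syzbot one; it assumes root,
+qgroups already enabled, and a btrfs filesystem mounted at /mnt, using
+only the definitions from linux/btrfs.h):
+
+  #include <fcntl.h>
+  #include <pthread.h>
+  #include <string.h>
+  #include <sys/ioctl.h>
+  #include <linux/btrfs.h>
+
+  static int fd;  /* fd of the btrfs mount point directory */
+
+  static void *rescan(void *arg)  /* task A in the steps above */
+  {
+          struct btrfs_ioctl_quota_rescan_args args;
+
+          memset(&args, 0, sizeof(args));
+          ioctl(fd, BTRFS_IOC_QUOTA_RESCAN, &args);  /* queues the worker */
+          return NULL;
+  }
+
+  static void *disable(void *arg)  /* task B in the steps above */
+  {
+          struct btrfs_ioctl_quota_ctl_args args;
+
+          memset(&args, 0, sizeof(args));
+          args.cmd = BTRFS_QUOTA_CTL_DISABLE;
+          ioctl(fd, BTRFS_IOC_QUOTA_CTL, &args);  /* clears quota_root */
+          return NULL;
+  }
+
+  int main(void)
+  {
+          pthread_t a, b;
+
+          fd = open("/mnt", O_RDONLY);
+          pthread_create(&a, NULL, rescan, NULL);
+          pthread_create(&b, NULL, disable, NULL);
+          pthread_join(a, NULL);
+          pthread_join(b, NULL);
+          return 0;
+  }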
+ +Reported-by: syzbot+96977faa68092ad382c4@syzkaller.appspotmail.com +Link: https://lore.kernel.org/linux-btrfs/000000000000e5454b05f065a803@google.com/ +Fixes: e804861bd4e6 ("btrfs: fix deadlock between quota disable and qgroup rescan worker") +CC: stable@vger.kernel.org # 5.4+ +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/qgroup.c | 25 +++++++++++++++++-------- + 1 file changed, 17 insertions(+), 8 deletions(-) + +--- a/fs/btrfs/qgroup.c ++++ b/fs/btrfs/qgroup.c +@@ -3348,6 +3348,7 @@ static void btrfs_qgroup_rescan_worker(s + int err = -ENOMEM; + int ret = 0; + bool stopped = false; ++ bool did_leaf_rescans = false; + + path = btrfs_alloc_path(); + if (!path) +@@ -3368,6 +3369,7 @@ static void btrfs_qgroup_rescan_worker(s + } + + err = qgroup_rescan_leaf(trans, path); ++ did_leaf_rescans = true; + + if (err > 0) + btrfs_commit_transaction(trans); +@@ -3388,16 +3390,23 @@ out: + mutex_unlock(&fs_info->qgroup_rescan_lock); + + /* +- * only update status, since the previous part has already updated the +- * qgroup info. ++ * Only update status, since the previous part has already updated the ++ * qgroup info, and only if we did any actual work. This also prevents ++ * race with a concurrent quota disable, which has already set ++ * fs_info->quota_root to NULL and cleared BTRFS_FS_QUOTA_ENABLED at ++ * btrfs_quota_disable(). + */ +- trans = btrfs_start_transaction(fs_info->quota_root, 1); +- if (IS_ERR(trans)) { +- err = PTR_ERR(trans); ++ if (did_leaf_rescans) { ++ trans = btrfs_start_transaction(fs_info->quota_root, 1); ++ if (IS_ERR(trans)) { ++ err = PTR_ERR(trans); ++ trans = NULL; ++ btrfs_err(fs_info, ++ "fail to start transaction for status update: %d", ++ err); ++ } ++ } else { + trans = NULL; +- btrfs_err(fs_info, +- "fail to start transaction for status update: %d", +- err); + } + + mutex_lock(&fs_info->qgroup_rescan_lock); diff --git a/queue-6.1/btrfs-qgroup-do-not-warn-on-record-without-old_roots-populated.patch b/queue-6.1/btrfs-qgroup-do-not-warn-on-record-without-old_roots-populated.patch new file mode 100644 index 00000000000..4bfa94e88e1 --- /dev/null +++ b/queue-6.1/btrfs-qgroup-do-not-warn-on-record-without-old_roots-populated.patch @@ -0,0 +1,97 @@ +From 75181406b4eafacc531ff2ee5fb032bd93317e2b Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Tue, 10 Jan 2023 15:14:17 +0800 +Subject: btrfs: qgroup: do not warn on record without old_roots populated + +From: Qu Wenruo + +commit 75181406b4eafacc531ff2ee5fb032bd93317e2b upstream. + +[BUG] +There are some reports from the mailing list that since v6.1 kernel, the +WARN_ON() inside btrfs_qgroup_account_extent() gets triggered during +rescan: + + WARNING: CPU: 3 PID: 6424 at fs/btrfs/qgroup.c:2756 btrfs_qgroup_account_extents+0x1ae/0x260 [btrfs] + CPU: 3 PID: 6424 Comm: snapperd Tainted: P OE 6.1.2-1-default #1 openSUSE Tumbleweed 05c7a1b1b61d5627475528f71f50444637b5aad7 + RIP: 0010:btrfs_qgroup_account_extents+0x1ae/0x260 [btrfs] + Call Trace: + + btrfs_commit_transaction+0x30c/0xb40 [btrfs c39c9c546c241c593f03bd6d5f39ea1b676250f6] + ? start_transaction+0xc3/0x5b0 [btrfs c39c9c546c241c593f03bd6d5f39ea1b676250f6] + btrfs_qgroup_rescan+0x42/0xc0 [btrfs c39c9c546c241c593f03bd6d5f39ea1b676250f6] + btrfs_ioctl+0x1ab9/0x25c0 [btrfs c39c9c546c241c593f03bd6d5f39ea1b676250f6] + ? __rseq_handle_notify_resume+0xa9/0x4a0 + ? mntput_no_expire+0x4a/0x240 + ? __seccomp_filter+0x319/0x4d0 + __x64_sys_ioctl+0x90/0xd0 + do_syscall_64+0x5b/0x80 + ? 
syscall_exit_to_user_mode+0x17/0x40 + ? do_syscall_64+0x67/0x80 + entry_SYSCALL_64_after_hwframe+0x63/0xcd + RIP: 0033:0x7fd9b790d9bf + + +[CAUSE] +Since commit e15e9f43c7ca ("btrfs: introduce +BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING to skip qgroup accounting"), if +our qgroup is already in inconsistent state, we will no longer do the +time-consuming backref walk. + +This can leave some qgroup records without a valid old_roots ulist. +Normally this is fine, as btrfs_qgroup_account_extents() would also skip +those records if we have NO_ACCOUNTING flag set. + +But there is a small window, if we have NO_ACCOUNTING flag set, and +inserted some qgroup_record without a old_roots ulist, but then the user +triggered a qgroup rescan. + +During btrfs_qgroup_rescan(), we firstly clear NO_ACCOUNTING flag, then +commit current transaction. + +And since we have a qgroup_record with old_roots = NULL, we trigger the +WARN_ON() during btrfs_qgroup_account_extents(). + +[FIX] +Unfortunately due to the introduction of NO_ACCOUNTING flag, the +assumption that every qgroup_record would have its old_roots populated +is no longer correct. + +Fix the false alerts and drop the WARN_ON(). + +Reported-by: Lukas Straub +Reported-by: HanatoK +Fixes: e15e9f43c7ca ("btrfs: introduce BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING to skip qgroup accounting") +CC: stable@vger.kernel.org # 6.1 +Link: https://lore.kernel.org/linux-btrfs/2403c697-ddaf-58ad-3829-0335fc89df09@gmail.com/ +Signed-off-by: Qu Wenruo +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman +--- + fs/btrfs/qgroup.c | 14 ++++++++++++-- + 1 file changed, 12 insertions(+), 2 deletions(-) + +--- a/fs/btrfs/qgroup.c ++++ b/fs/btrfs/qgroup.c +@@ -2751,9 +2751,19 @@ int btrfs_qgroup_account_extents(struct + BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) { + /* + * Old roots should be searched when inserting qgroup +- * extent record ++ * extent record. ++ * ++ * But for INCONSISTENT (NO_ACCOUNTING) -> rescan case, ++ * we may have some record inserted during ++ * NO_ACCOUNTING (thus no old_roots populated), but ++ * later we start rescan, which clears NO_ACCOUNTING, ++ * leaving some inserted records without old_roots ++ * populated. ++ * ++ * Those cases are rare and should not cause too much ++ * time spent during commit_transaction(). + */ +- if (WARN_ON(!record->old_roots)) { ++ if (!record->old_roots) { + /* Search commit root to find old_roots */ + ret = btrfs_find_all_roots(NULL, fs_info, + record->bytenr, 0, diff --git a/queue-6.1/cifs-do-not-include-page-data-when-checking-signature.patch b/queue-6.1/cifs-do-not-include-page-data-when-checking-signature.patch new file mode 100644 index 00000000000..548852ab6a3 --- /dev/null +++ b/queue-6.1/cifs-do-not-include-page-data-when-checking-signature.patch @@ -0,0 +1,52 @@ +From 30b2b2196d6e4cc24cbec633535a2404f258ce69 Mon Sep 17 00:00:00 2001 +From: Enzo Matsumiya +Date: Wed, 18 Jan 2023 14:06:57 -0300 +Subject: cifs: do not include page data when checking signature + +From: Enzo Matsumiya + +commit 30b2b2196d6e4cc24cbec633535a2404f258ce69 upstream. + +On async reads, page data is allocated before sending. When the +response is received but it has no data to fill (e.g. +STATUS_END_OF_FILE), __calc_signature() will still include the pages in +its computation, leading to an invalid signature check. + +This patch fixes this by not setting the async read smb_rqst page data +(zeroed by default) if its got_bytes is 0. + +This can be reproduced/verified with xfstests generic/465. 
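+
+generic/465 races direct I/O readers against an appending writer, so some
+reads are answered by the server with STATUS_END_OF_FILE and no payload.
+A hand-rolled sketch of the same idea (hypothetical path; assumes a cifs
+mount at /mnt/cifs with signing enabled -- the xfstests case remains the
+canonical reproducer):
+
+  #define _GNU_SOURCE
+  #include <fcntl.h>
+  #include <stdlib.h>
+  #include <unistd.h>
+
+  int main(void)
+  {
+          const char *path = "/mnt/cifs/racefile";
+          const size_t sz = 1 << 20;
+          char *buf;
+
+          close(open(path, O_WRONLY | O_CREAT | O_TRUNC, 0644));
+          if (fork() == 0) {  /* appending writer */
+                  char *b = calloc(1, sz);
+                  int wfd = open(path, O_WRONLY | O_APPEND);
+
+                  for (int i = 0; i < 64; i++)
+                          write(wfd, b, sz);
+                  _exit(0);
+          }
+
+          /* direct I/O reader chasing the writer: reads that land past
+           * the server's current EOF complete with zero bytes of data */
+          posix_memalign((void **)&buf, 4096, sz);
+          int rfd = open(path, O_RDONLY | O_DIRECT);
+
+          for (off_t off = 0; off < 64L * sz; off += sz)
+                  pread(rfd, buf, sz, off);
+          return 0;
+  }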
+ +Cc: +Signed-off-by: Enzo Matsumiya +Reviewed-by: Paulo Alcantara (SUSE) +Signed-off-by: Steve French +Signed-off-by: Greg Kroah-Hartman +--- + fs/cifs/smb2pdu.c | 15 +++++++++------ + 1 file changed, 9 insertions(+), 6 deletions(-) + +--- a/fs/cifs/smb2pdu.c ++++ b/fs/cifs/smb2pdu.c +@@ -4162,12 +4162,15 @@ smb2_readv_callback(struct mid_q_entry * + (struct smb2_hdr *)rdata->iov[0].iov_base; + struct cifs_credits credits = { .value = 0, .instance = 0 }; + struct smb_rqst rqst = { .rq_iov = &rdata->iov[1], +- .rq_nvec = 1, +- .rq_pages = rdata->pages, +- .rq_offset = rdata->page_offset, +- .rq_npages = rdata->nr_pages, +- .rq_pagesz = rdata->pagesz, +- .rq_tailsz = rdata->tailsz }; ++ .rq_nvec = 1, }; ++ ++ if (rdata->got_bytes) { ++ rqst.rq_pages = rdata->pages; ++ rqst.rq_offset = rdata->page_offset; ++ rqst.rq_npages = rdata->nr_pages; ++ rqst.rq_pagesz = rdata->pagesz; ++ rqst.rq_tailsz = rdata->tailsz; ++ } + + WARN_ONCE(rdata->server != mid->server, + "rdata server %p != mid server %p", diff --git a/queue-6.1/mm-hugetlb-fix-pte-marker-handling-in-hugetlb_change_protection.patch b/queue-6.1/mm-hugetlb-fix-pte-marker-handling-in-hugetlb_change_protection.patch new file mode 100644 index 00000000000..0e9a475ad1e --- /dev/null +++ b/queue-6.1/mm-hugetlb-fix-pte-marker-handling-in-hugetlb_change_protection.patch @@ -0,0 +1,150 @@ +From 0e678153f5be7e6c8d28835f5a678618da4b7a9c Mon Sep 17 00:00:00 2001 +From: David Hildenbrand +Date: Thu, 22 Dec 2022 21:55:10 +0100 +Subject: mm/hugetlb: fix PTE marker handling in hugetlb_change_protection() + +From: David Hildenbrand + +commit 0e678153f5be7e6c8d28835f5a678618da4b7a9c upstream. + +Patch series "mm/hugetlb: uffd-wp fixes for hugetlb_change_protection()". + +Playing with virtio-mem and background snapshots (using uffd-wp) on +hugetlb in QEMU, I managed to trigger a VM_BUG_ON(). Looking into the +details, hugetlb_change_protection() seems to not handle uffd-wp correctly +in all cases. + +Patch #1 fixes my test case. I don't have reproducers for patch #2, as it +requires running into migration entries. + +I did not yet check in detail yet if !hugetlb code requires similar care. + + +This patch (of 2): + +There are two problematic cases when stumbling over a PTE marker in +hugetlb_change_protection(): + +(1) We protect an uffd-wp PTE marker a second time using uffd-wp: we will + end up in the "!huge_pte_none(pte)" case and mess up the PTE marker. + +(2) We unprotect a uffd-wp PTE marker: we will similarly end up in the + "!huge_pte_none(pte)" case even though we cleared the PTE, because + the "pte" variable is stale. We'll mess up the PTE marker. + +For example, if we later stumble over such a "wrongly modified" PTE marker, +we'll treat it like a present PTE that maps some garbage page. + +This can, for example, be triggered by mapping a memfd backed by huge +pages, registering uffd-wp, uffd-wp'ing an unmapped page and (a) +uffd-wp'ing it a second time; or (b) uffd-unprotecting it; or (c) +unregistering uffd-wp. 
Then, if we trigger fallocate(FALLOC_FL_PUNCH_HOLE) +on that file range, we will run into a VM_BUG_ON: + +[ 195.039560] page:00000000ba1f2987 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x0 +[ 195.039565] flags: 0x7ffffc0001000(reserved|node=0|zone=0|lastcpupid=0x1fffff) +[ 195.039568] raw: 0007ffffc0001000 ffffe742c0000008 ffffe742c0000008 0000000000000000 +[ 195.039569] raw: 0000000000000000 0000000000000000 00000001ffffffff 0000000000000000 +[ 195.039569] page dumped because: VM_BUG_ON_PAGE(compound && !PageHead(page)) +[ 195.039573] ------------[ cut here ]------------ +[ 195.039574] kernel BUG at mm/rmap.c:1346! +[ 195.039579] invalid opcode: 0000 [#1] PREEMPT SMP NOPTI +[ 195.039581] CPU: 7 PID: 4777 Comm: qemu-system-x86 Not tainted 6.0.12-200.fc36.x86_64 #1 +[ 195.039583] Hardware name: LENOVO 20WNS1F81N/20WNS1F81N, BIOS N35ET50W (1.50 ) 09/15/2022 +[ 195.039584] RIP: 0010:page_remove_rmap+0x45b/0x550 +[ 195.039588] Code: [...] +[ 195.039589] RSP: 0018:ffffbc03c3633ba8 EFLAGS: 00010292 +[ 195.039591] RAX: 0000000000000040 RBX: ffffe742c0000000 RCX: 0000000000000000 +[ 195.039592] RDX: 0000000000000002 RSI: ffffffff8e7aac1a RDI: 00000000ffffffff +[ 195.039592] RBP: 0000000000000001 R08: 0000000000000000 R09: ffffbc03c3633a08 +[ 195.039593] R10: 0000000000000003 R11: ffffffff8f146328 R12: ffff9b04c42754b0 +[ 195.039594] R13: ffffffff8fcc6328 R14: ffffbc03c3633c80 R15: ffff9b0484ab9100 +[ 195.039595] FS: 00007fc7aaf68640(0000) GS:ffff9b0bbf7c0000(0000) knlGS:0000000000000000 +[ 195.039596] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +[ 195.039597] CR2: 000055d402c49110 CR3: 0000000159392003 CR4: 0000000000772ee0 +[ 195.039598] PKRU: 55555554 +[ 195.039599] Call Trace: +[ 195.039600] <TASK> +[ 195.039602] __unmap_hugepage_range+0x33b/0x7d0 +[ 195.039605] unmap_hugepage_range+0x55/0x70 +[ 195.039608] hugetlb_vmdelete_list+0x77/0xa0 +[ 195.039611] hugetlbfs_fallocate+0x410/0x550 +[ 195.039612] ? _raw_spin_unlock_irqrestore+0x23/0x40 +[ 195.039616] vfs_fallocate+0x12e/0x360 +[ 195.039618] __x64_sys_fallocate+0x40/0x70 +[ 195.039620] do_syscall_64+0x58/0x80 +[ 195.039623] ? syscall_exit_to_user_mode+0x17/0x40 +[ 195.039624] ? do_syscall_64+0x67/0x80 +[ 195.039626] entry_SYSCALL_64_after_hwframe+0x63/0xcd +[ 195.039628] RIP: 0033:0x7fc7b590651f +[ 195.039653] Code: [...] +[ 195.039654] RSP: 002b:00007fc7aaf66e70 EFLAGS: 00000293 ORIG_RAX: 000000000000011d +[ 195.039655] RAX: ffffffffffffffda RBX: 0000558ef4b7f370 RCX: 00007fc7b590651f +[ 195.039656] RDX: 0000000018000000 RSI: 0000000000000003 RDI: 000000000000000c +[ 195.039657] RBP: 0000000008000000 R08: 0000000000000000 R09: 0000000000000073 +[ 195.039658] R10: 0000000008000000 R11: 0000000000000293 R12: 0000000018000000 +[ 195.039658] R13: 00007fb8bbe00000 R14: 000000000000000c R15: 0000000000001000 +[ 195.039661] </TASK> + +Fix it by not going into the "!huge_pte_none(pte)" case if we stumble over +an exclusive marker. spin_unlock() + continue would get the job done. + +However, instead, make it clearer that there are no fall-through +statements: we process each case (hwpoison, migration, marker, !none, +none) and then unlock the page table to continue with the next PTE. Let's +avoid "continue" statements and use a single spin_unlock() at the end.
+ +Link: https://lkml.kernel.org/r/20221222205511.675832-1-david@redhat.com +Link: https://lkml.kernel.org/r/20221222205511.675832-2-david@redhat.com +Fixes: 60dfaad65aa9 ("mm/hugetlb: allow uffd wr-protect none ptes") +Signed-off-by: David Hildenbrand +Reviewed-by: Peter Xu +Reviewed-by: Mike Kravetz +Cc: Miaohe Lin +Cc: Muchun Song +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/hugetlb.c | 21 +++++++-------------- + 1 file changed, 7 insertions(+), 14 deletions(-) + +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -6623,10 +6623,8 @@ unsigned long hugetlb_change_protection( + } + pte = huge_ptep_get(ptep); + if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) { +- spin_unlock(ptl); +- continue; +- } +- if (unlikely(is_hugetlb_entry_migration(pte))) { ++ /* Nothing to do. */ ++ } else if (unlikely(is_hugetlb_entry_migration(pte))) { + swp_entry_t entry = pte_to_swp_entry(pte); + struct page *page = pfn_swap_entry_to_page(entry); + +@@ -6647,18 +6645,13 @@ unsigned long hugetlb_change_protection( + set_huge_pte_at(mm, address, ptep, newpte); + pages++; + } +- spin_unlock(ptl); +- continue; +- } +- if (unlikely(pte_marker_uffd_wp(pte))) { +- /* +- * This is changing a non-present pte into a none pte, +- * no need for huge_ptep_modify_prot_start/commit(). +- */ ++ } else if (unlikely(is_pte_marker(pte))) { ++ /* No other markers apply for now. */ ++ WARN_ON_ONCE(!pte_marker_uffd_wp(pte)); + if (uffd_wp_resolve) ++ /* Safe to modify directly (non-present->none). */ + huge_pte_clear(mm, address, ptep, psize); +- } +- if (!huge_pte_none(pte)) { ++ } else if (!huge_pte_none(pte)) { + pte_t old_pte; + unsigned int shift = huge_page_shift(hstate_vma(vma)); + diff --git a/queue-6.1/mm-hugetlb-fix-uffd-wp-handling-for-migration-entries-in-hugetlb_change_protection.patch b/queue-6.1/mm-hugetlb-fix-uffd-wp-handling-for-migration-entries-in-hugetlb_change_protection.patch new file mode 100644 index 00000000000..df80e8c1dde --- /dev/null +++ b/queue-6.1/mm-hugetlb-fix-uffd-wp-handling-for-migration-entries-in-hugetlb_change_protection.patch @@ -0,0 +1,76 @@ +From 44f86392bdd165da7e43d3c772aeb1e128ffd6c8 Mon Sep 17 00:00:00 2001 +From: David Hildenbrand +Date: Thu, 22 Dec 2022 21:55:11 +0100 +Subject: mm/hugetlb: fix uffd-wp handling for migration entries in hugetlb_change_protection() + +From: David Hildenbrand + +commit 44f86392bdd165da7e43d3c772aeb1e128ffd6c8 upstream. + +We have to update the uffd-wp SWP PTE bit independent of the type of +migration entry. Currently, if we're unlucky and we want to install/clear +the uffd-wp bit just while we're migrating a read-only mapped hugetlb +page, we would miss to set/clear the uffd-wp bit. + +Further, if we're processing a readable-exclusive migration entry and +neither want to set or clear the uffd-wp bit, we could currently end up +losing the uffd-wp bit. Note that the same would hold for writable +migrating entries, however, having a writable migration entry with the +uffd-wp bit set would already mean that something went wrong. + +Note that the change from !is_readable_migration_entry -> +writable_migration_entry is harmless and actually cleaner, as raised by +Miaohe Lin and discussed in [1]. 
+ +[1] https://lkml.kernel.org/r/90dd6a93-4500-e0de-2bf0-bf522c311b0c@huawei.com + +Link: https://lkml.kernel.org/r/20221222205511.675832-3-david@redhat.com +Fixes: 60dfaad65aa9 ("mm/hugetlb: allow uffd wr-protect none ptes") +Signed-off-by: David Hildenbrand +Reviewed-by: Mike Kravetz +Cc: Miaohe Lin +Cc: Muchun Song +Cc: Peter Xu +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/hugetlb.c | 17 +++++++++-------- + 1 file changed, 9 insertions(+), 8 deletions(-) + +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -6627,10 +6627,9 @@ unsigned long hugetlb_change_protection( + } else if (unlikely(is_hugetlb_entry_migration(pte))) { + swp_entry_t entry = pte_to_swp_entry(pte); + struct page *page = pfn_swap_entry_to_page(entry); ++ pte_t newpte = pte; + +- if (!is_readable_migration_entry(entry)) { +- pte_t newpte; +- ++ if (is_writable_migration_entry(entry)) { + if (PageAnon(page)) + entry = make_readable_exclusive_migration_entry( + swp_offset(entry)); +@@ -6638,13 +6637,15 @@ unsigned long hugetlb_change_protection( + entry = make_readable_migration_entry( + swp_offset(entry)); + newpte = swp_entry_to_pte(entry); +- if (uffd_wp) +- newpte = pte_swp_mkuffd_wp(newpte); +- else if (uffd_wp_resolve) +- newpte = pte_swp_clear_uffd_wp(newpte); +- set_huge_pte_at(mm, address, ptep, newpte); + pages++; + } ++ ++ if (uffd_wp) ++ newpte = pte_swp_mkuffd_wp(newpte); ++ else if (uffd_wp_resolve) ++ newpte = pte_swp_clear_uffd_wp(newpte); ++ if (!pte_same(pte, newpte)) ++ set_huge_pte_at(mm, address, ptep, newpte); + } else if (unlikely(is_pte_marker(pte))) { + /* No other markers apply for now. */ + WARN_ON_ONCE(!pte_marker_uffd_wp(pte)); diff --git a/queue-6.1/mm-hugetlb-pre-allocate-pgtable-pages-for-uffd-wr-protects.patch b/queue-6.1/mm-hugetlb-pre-allocate-pgtable-pages-for-uffd-wr-protects.patch new file mode 100644 index 00000000000..c17b53d2b85 --- /dev/null +++ b/queue-6.1/mm-hugetlb-pre-allocate-pgtable-pages-for-uffd-wr-protects.patch @@ -0,0 +1,59 @@ +From fed15f1345dc8a7fc8baa81e8b55c3ba010d7f4b Mon Sep 17 00:00:00 2001 +From: Peter Xu +Date: Wed, 4 Jan 2023 17:52:05 -0500 +Subject: mm/hugetlb: pre-allocate pgtable pages for uffd wr-protects + +From: Peter Xu + +commit fed15f1345dc8a7fc8baa81e8b55c3ba010d7f4b upstream. + +Userfaultfd-wp uses pte markers to mark wr-protected pages for both shmem +and hugetlb. Shmem has pre-allocation ready for markers, but hugetlb path +was overlooked. + +Doing so by calling huge_pte_alloc() if the initial pgtable walk fails to +find the huge ptep. It's possible that huge_pte_alloc() can fail with +high memory pressure, in that case stop the loop immediately and fail +silently. This is not the most ideal solution but it matches with what we +do with shmem meanwhile it avoids the splat in dmesg. 
+ +Link: https://lkml.kernel.org/r/20230104225207.1066932-2-peterx@redhat.com +Fixes: 60dfaad65aa9 ("mm/hugetlb: allow uffd wr-protect none ptes") +Signed-off-by: Peter Xu +Reported-by: James Houghton +Reviewed-by: Mike Kravetz +Acked-by: David Hildenbrand +Acked-by: James Houghton +Cc: Andrea Arcangeli +Cc: Axel Rasmussen +Cc: Muchun Song +Cc: Nadav Amit +Cc: stable@vger.kernel.org [5.19+] +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/hugetlb.c | 13 +++++++++++-- + 1 file changed, 11 insertions(+), 2 deletions(-) + +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -6604,8 +6604,17 @@ unsigned long hugetlb_change_protection( + spinlock_t *ptl; + ptep = huge_pte_offset(mm, address, psize); + if (!ptep) { +- address |= last_addr_mask; +- continue; ++ if (!uffd_wp) { ++ address |= last_addr_mask; ++ continue; ++ } ++ /* ++ * Userfaultfd wr-protect requires pgtable ++ * pre-allocations to install pte markers. ++ */ ++ ptep = huge_pte_alloc(mm, vma, address, psize); ++ if (!ptep) ++ break; + } + ptl = huge_pte_lock(h, mm, ptep); + if (huge_pmd_unshare(mm, vma, address, ptep)) { diff --git a/queue-6.1/mm-madv_collapse-don-t-expand-collapse-when-vm_end-is-past-requested-end.patch b/queue-6.1/mm-madv_collapse-don-t-expand-collapse-when-vm_end-is-past-requested-end.patch new file mode 100644 index 00000000000..29bf16c97fc --- /dev/null +++ b/queue-6.1/mm-madv_collapse-don-t-expand-collapse-when-vm_end-is-past-requested-end.patch @@ -0,0 +1,66 @@ +From 52dc031088f00e323140ece4004e70c33153c6dd Mon Sep 17 00:00:00 2001 +From: Zach O'Keefe +Date: Sat, 24 Dec 2022 00:20:34 -0800 +Subject: mm/MADV_COLLAPSE: don't expand collapse when vm_end is past requested end + +From: Zach O'Keefe + +commit 52dc031088f00e323140ece4004e70c33153c6dd upstream. + +MADV_COLLAPSE acts on one hugepage-aligned/sized region at a time, until +it has collapsed all eligible memory contained within the bounds supplied +by the user. + +At the top of each hugepage iteration we (re)lock mmap_lock and +(re)validate the VMA for eligibility and update variables that might have +changed while mmap_lock was dropped. One thing that might occur is that +the VMA could be resized, and as such, we refetch vma->vm_end to make sure +we don't collapse past the end of the VMA's new end. + +However, it's possible that when refetching vma->vm_end we expand the +region acted on by MADV_COLLAPSE if vma->vm_end is greater than size+len +supplied by the user. + +The consequence here is that we may attempt to collapse more memory than +requested, possibly yielding either "too much success" or "false failure" +user-visible results. An example of the former is if we MADV_COLLAPSE the +first 4MiB of a 2TiB mmap()'d file, the incorrect refetch would cause the +operation to block for much longer than anticipated as we attempt to +collapse the entire TiB region. An example of the latter is that applying +MADV_COLLAPSE to a 4MiB file mapped to the start of a 6MiB VMA will +successfully collapse the first 4MiB, then incorrectly attempt to collapse +the last hugepage-aligned/sized region -- fail (since readahead/page cache +lookup will fail) -- and report a failure to the user. + +I don't believe there is a kernel stability concern here as we always +(re)validate the VMA / region accordingly. Also as Hugh mentions, the +user-visible effects are: we try to collapse more memory than requested +by the user, and/or failing an operation that should have otherwise +succeeded. An example is trying to collapse a 4MiB file contained +within a 12MiB VMA.
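+
+The "false failure" example above translates to a short sketch
+(hypothetical file path; assumes a 6.1+ kernel with MADV_COLLAPSE and a
+pre-existing 4MiB file, over-mapped so the single VMA is 6MiB long):
+
+  #include <fcntl.h>
+  #include <sys/mman.h>
+
+  #ifndef MADV_COLLAPSE
+  #define MADV_COLLAPSE 25
+  #endif
+
+  int main(void)
+  {
+          int fd = open("/tmp/file-4m", O_RDONLY);  /* 4MiB file */
+
+          /* 6MiB mapping of a 4MiB file: one VMA whose vm_end lies
+           * 2MiB past the last file-backed page */
+          char *p = mmap(NULL, 6UL << 20, PROT_READ, MAP_PRIVATE, fd, 0);
+
+          /* collapse only the first 4MiB; before this fix the refetched
+           * vma->vm_end could expand the range to the full 6MiB and the
+           * call would report failure despite collapsing the 4MiB asked
+           * for */
+          return madvise(p, 4UL << 20, MADV_COLLAPSE);
+  }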
+ +Don't expand the acted-on region when refetching vma->vm_end. + +Link: https://lkml.kernel.org/r/20221224082035.3197140-1-zokeefe@google.com +Fixes: 4d24de9425f7 ("mm: MADV_COLLAPSE: refetch vm_end after reacquiring mmap_lock") +Signed-off-by: Zach O'Keefe +Reported-by: Hugh Dickins +Cc: Yang Shi +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Greg Kroah-Hartman +--- + mm/khugepaged.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/mm/khugepaged.c ++++ b/mm/khugepaged.c +@@ -2644,7 +2644,7 @@ int madvise_collapse(struct vm_area_stru + goto out_nolock; + } + +- hend = vma->vm_end & HPAGE_PMD_MASK; ++ hend = min(hend, vma->vm_end & HPAGE_PMD_MASK); + } + mmap_assert_locked(mm); + memset(cc->node_load, 0, sizeof(cc->node_load)); diff --git a/queue-6.1/mm-userfaultfd-enable-writenotify-while-userfaultfd-wp-is-enabled-for-a-vma.patch b/queue-6.1/mm-userfaultfd-enable-writenotify-while-userfaultfd-wp-is-enabled-for-a-vma.patch new file mode 100644 index 00000000000..974375d6965 --- /dev/null +++ b/queue-6.1/mm-userfaultfd-enable-writenotify-while-userfaultfd-wp-is-enabled-for-a-vma.patch @@ -0,0 +1,169 @@ +From 51d3d5eb74ff53b92dcff48b30ae2ed8edd85a32 Mon Sep 17 00:00:00 2001 +From: David Hildenbrand +Date: Fri, 9 Dec 2022 09:09:12 +0100 +Subject: mm/userfaultfd: enable writenotify while userfaultfd-wp is enabled for a VMA + +From: David Hildenbrand + +commit 51d3d5eb74ff53b92dcff48b30ae2ed8edd85a32 upstream. + +Currently, we don't enable writenotify when enabling userfaultfd-wp on a +shared writable mapping (for now only shmem and hugetlb). The consequence +is that vma->vm_page_prot will still include write permissions, to be set +as default for all PTEs that get remapped (e.g., mprotect(), NUMA hinting, +page migration, ...). + +So far, vma->vm_page_prot is assumed to be a safe default, meaning that we +only add permissions (e.g., mkwrite) but not remove permissions (e.g., +wrprotect). For example, when enabling softdirty tracking, we enable +writenotify. With uffd-wp on shared mappings, that changed. More details +on vma->vm_page_prot semantics were summarized in [1]. + +This is problematic for uffd-wp: we'd have to manually check for a uffd-wp +PTEs/PMDs and manually write-protect PTEs/PMDs, which is error prone. +Prone to such issues is any code that uses vma->vm_page_prot to set PTE +permissions: primarily pte_modify() and mk_pte(). + +Instead, let's enable writenotify such that PTEs/PMDs/... will be mapped +write-protected as default and we will only allow selected PTEs that are +definitely safe to be mapped without write-protection (see +can_change_pte_writable()) to be writable. In the future, we might want +to enable write-bit recovery -- e.g., can_change_pte_writable() -- at more +locations, for example, also when removing uffd-wp protection. + +This fixes two known cases: + +(a) remove_migration_pte() mapping uffd-wp'ed PTEs writable, resulting + in uffd-wp not triggering on write access. +(b) do_numa_page() / do_huge_pmd_numa_page() mapping uffd-wp'ed PTEs/PMDs + writable, resulting in uffd-wp not triggering on write access. + +Note that do_numa_page() / do_huge_pmd_numa_page() can be reached even +without NUMA hinting (which currently doesn't seem to be applicable to +shmem), for example, by using uffd-wp with a PROT_WRITE shmem VMA. On +such a VMA, userfaultfd-wp is currently non-functional. 
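+
+For context, the affected setup looks like the sketch below (hedged,
+error handling omitted; assumes a kernel with userfaultfd-wp support and
+enough privilege, e.g. root or vm.unprivileged_userfaultfd=1):
+
+  #define _GNU_SOURCE
+  #include <fcntl.h>
+  #include <sys/ioctl.h>
+  #include <sys/mman.h>
+  #include <sys/syscall.h>
+  #include <unistd.h>
+  #include <linux/userfaultfd.h>
+
+  int main(void)
+  {
+          long uffd = syscall(__NR_userfaultfd, O_CLOEXEC);
+          struct uffdio_api api = { .api = UFFD_API,
+                                    .features = UFFD_FEATURE_PAGEFAULT_FLAG_WP };
+          int fd = memfd_create("wp", 0);  /* shmem backing */
+          char *p;
+
+          ftruncate(fd, 4096);
+          /* the problematic combination: shared and PROT_WRITE */
+          p = mmap(NULL, 4096, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+
+          ioctl(uffd, UFFDIO_API, &api);
+          struct uffdio_register reg = {
+                  .range = { .start = (unsigned long)p, .len = 4096 },
+                  .mode = UFFDIO_REGISTER_MODE_WP,
+          };
+          ioctl(uffd, UFFDIO_REGISTER, &reg);
+
+          p[0] = 1;  /* populate the pte, currently writable */
+          struct uffdio_writeprotect wp = {
+                  .range = reg.range,
+                  .mode = UFFDIO_WRITEPROTECT_MODE_WP,
+          };
+          ioctl(uffd, UFFDIO_WRITEPROTECT, &wp);
+
+          /* without this fix, anything that remaps the pte from
+           * vma->vm_page_prot (page migration, NUMA hinting) re-adds
+           * the write bit, and a later write never notifies userspace */
+          return 0;
+  }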
+
+Note that when enabling userfaultfd-wp, there is no need to walk page
+tables to enforce the new default protection for the PTEs: we know that
+they cannot be uffd-wp'ed yet, because that can only happen after enabling
+uffd-wp for the VMA in general.
+
+Also note that this makes mprotect() on ranges with uffd-wp'ed PTEs not
+accidentally set the write bit -- which would result in uffd-wp not
+triggering on later write access. This commit makes uffd-wp on shmem
+behave just like uffd-wp on anonymous memory in that regard, even though
+mixing mprotect() with uffd-wp is controversial.
+
+[1] https://lkml.kernel.org/r/92173bad-caa3-6b43-9d1e-9a471fdbc184@redhat.com
+
+Link: https://lkml.kernel.org/r/20221209080912.7968-1-david@redhat.com
+Fixes: b1f9e876862d ("mm/uffd: enable write protection for shmem & hugetlbfs")
+Signed-off-by: David Hildenbrand
+Reported-by: Ives van Hoorne
+Debugged-by: Peter Xu
+Acked-by: Peter Xu
+Cc: Hugh Dickins
+Cc: Alistair Popple
+Cc: Mike Rapoport
+Cc: Nadav Amit
+Cc: Andrea Arcangeli
+Cc: 
+Signed-off-by: Andrew Morton
+Signed-off-by: Greg Kroah-Hartman
+---
+ fs/userfaultfd.c | 28 ++++++++++++++++++++++------
+ mm/mmap.c | 4 ++++
+ 2 files changed, 26 insertions(+), 6 deletions(-)
+
+--- a/fs/userfaultfd.c
++++ b/fs/userfaultfd.c
+@@ -108,6 +108,21 @@ static bool userfaultfd_is_initialized(s
+ return ctx->features & UFFD_FEATURE_INITIALIZED;
+ }
+
++static void userfaultfd_set_vm_flags(struct vm_area_struct *vma,
++ vm_flags_t flags)
++{
++ const bool uffd_wp_changed = (vma->vm_flags ^ flags) & VM_UFFD_WP;
++
++ vma->vm_flags = flags;
++ /*
++ * For shared mappings, we want to enable writenotify while
++ * userfaultfd-wp is enabled (see vma_wants_writenotify()). We'll simply
++ * recalculate vma->vm_page_prot whenever userfaultfd-wp changes.
++ */
++ if ((vma->vm_flags & VM_SHARED) && uffd_wp_changed)
++ vma_set_page_prot(vma);
++}
++
+ static int userfaultfd_wake_function(wait_queue_entry_t *wq, unsigned mode,
+ int wake_flags, void *key)
+ {
+@@ -618,7 +633,8 @@ static void userfaultfd_event_wait_compl
+ for_each_vma(vmi, vma) {
+ if (vma->vm_userfaultfd_ctx.ctx == release_new_ctx) {
+ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+- vma->vm_flags &= ~__VM_UFFD_FLAGS;
++ userfaultfd_set_vm_flags(vma,
++ vma->vm_flags & ~__VM_UFFD_FLAGS);
+ }
+ }
+ mmap_write_unlock(mm);
+@@ -652,7 +668,7 @@ int dup_userfaultfd(struct vm_area_struc
+ octx = vma->vm_userfaultfd_ctx.ctx;
+ if (!octx || !(octx->features & UFFD_FEATURE_EVENT_FORK)) {
+ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+- vma->vm_flags &= ~__VM_UFFD_FLAGS;
++ userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
+ return 0;
+ }
+
+@@ -733,7 +749,7 @@ void mremap_userfaultfd_prep(struct vm_a
+ } else {
+ /* Drop uffd context if remap feature not enabled */
+ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+- vma->vm_flags &= ~__VM_UFFD_FLAGS;
++ userfaultfd_set_vm_flags(vma, vma->vm_flags & ~__VM_UFFD_FLAGS);
+ }
+ }
+
+@@ -895,7 +911,7 @@ static int userfaultfd_release(struct in
+ prev = vma;
+ }
+
+- vma->vm_flags = new_flags;
++ userfaultfd_set_vm_flags(vma, new_flags);
+ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+ }
+ mmap_write_unlock(mm);
+@@ -1463,7 +1479,7 @@ static int userfaultfd_register(struct u
+ * the next vma was merged into the current one and
+ * the current one has not been updated yet.
+ */
+- vma->vm_flags = new_flags;
++ userfaultfd_set_vm_flags(vma, new_flags);
+ vma->vm_userfaultfd_ctx.ctx = ctx;
+
+ if (is_vm_hugetlb_page(vma) && uffd_disable_huge_pmd_share(vma))
+@@ -1651,7 +1667,7 @@ static int userfaultfd_unregister(struct
+ * the next vma was merged into the current one and
+ * the current one has not been updated yet.
+ */
+- vma->vm_flags = new_flags;
++ userfaultfd_set_vm_flags(vma, new_flags);
+ vma->vm_userfaultfd_ctx = NULL_VM_UFFD_CTX;
+
+ skip:
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -1524,6 +1524,10 @@ int vma_wants_writenotify(struct vm_area
+ if (vma_soft_dirty_enabled(vma) && !is_vm_hugetlb_page(vma))
+ return 1;
+
++ /* Do we need write faults for uffd-wp tracking? */
++ if (userfaultfd_wp(vma))
++ return 1;
++
+ /* Specialty mapping? */
+ if (vm_flags & VM_PFNMAP)
+ return 0;
diff --git a/queue-6.1/mmc-sdhci-esdhc-imx-correct-the-tuning-start-tap-and-step-setting.patch b/queue-6.1/mmc-sdhci-esdhc-imx-correct-the-tuning-start-tap-and-step-setting.patch
new file mode 100644
index 00000000000..04591214798
--- /dev/null
+++ b/queue-6.1/mmc-sdhci-esdhc-imx-correct-the-tuning-start-tap-and-step-setting.patch
@@ -0,0 +1,73 @@
+From 1e336aa0c0250ec84c6f16efac40c9f0138e367d Mon Sep 17 00:00:00 2001
+From: Haibo Chen
+Date: Wed, 7 Dec 2022 19:23:15 +0800
+Subject: mmc: sdhci-esdhc-imx: correct the tuning start tap and step setting
+
+From: Haibo Chen
+
+commit 1e336aa0c0250ec84c6f16efac40c9f0138e367d upstream.
+
+Current code logic may be impacted by settings left by the ROM code or
+bootloader, so unmask these bits first, then set these bits accordingly.
+
+Fixes: 2b16cf326b70 ("mmc: sdhci-esdhc-imx: move tuning static configuration into hwinit function")
+Signed-off-by: Haibo Chen
+Acked-by: Adrian Hunter
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20221207112315.1812222-1-haibo.chen@nxp.com
+Signed-off-by: Ulf Hansson
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/mmc/host/sdhci-esdhc-imx.c | 22 +++++++++++++++-------
+ 1 file changed, 15 insertions(+), 7 deletions(-)
+
+--- a/drivers/mmc/host/sdhci-esdhc-imx.c
++++ b/drivers/mmc/host/sdhci-esdhc-imx.c
+@@ -107,6 +107,7 @@
+ #define ESDHC_TUNING_START_TAP_DEFAULT 0x1
+ #define ESDHC_TUNING_START_TAP_MASK 0x7f
+ #define ESDHC_TUNING_CMD_CRC_CHECK_DISABLE (1 << 7)
++#define ESDHC_TUNING_STEP_DEFAULT 0x1
+ #define ESDHC_TUNING_STEP_MASK 0x00070000
+ #define ESDHC_TUNING_STEP_SHIFT 16
+
+@@ -1361,7 +1362,7 @@ static void sdhci_esdhc_imx_hwinit(struc
+ struct sdhci_pltfm_host *pltfm_host = sdhci_priv(host);
+ struct pltfm_imx_data *imx_data = sdhci_pltfm_priv(pltfm_host);
+ struct cqhci_host *cq_host = host->mmc->cqe_private;
+- int tmp;
++ u32 tmp;
+
+ if (esdhc_is_usdhc(imx_data)) {
+ /*
+@@ -1416,17 +1417,24 @@ static void sdhci_esdhc_imx_hwinit(struc
+
+ if (imx_data->socdata->flags & ESDHC_FLAG_STD_TUNING) {
+ tmp = readl(host->ioaddr + ESDHC_TUNING_CTRL);
+- tmp |= ESDHC_STD_TUNING_EN |
+- ESDHC_TUNING_START_TAP_DEFAULT;
+- if (imx_data->boarddata.tuning_start_tap) {
+- tmp &= ~ESDHC_TUNING_START_TAP_MASK;
++ tmp |= ESDHC_STD_TUNING_EN;
++
++ /*
++ * ROM code or bootloader may config the start tap
++ * and step, unmask them first.
++ */
++ tmp &= ~(ESDHC_TUNING_START_TAP_MASK | ESDHC_TUNING_STEP_MASK);
++ if (imx_data->boarddata.tuning_start_tap)
+ tmp |= imx_data->boarddata.tuning_start_tap;
+- }
++ else
++ tmp |= ESDHC_TUNING_START_TAP_DEFAULT;
+
+ if (imx_data->boarddata.tuning_step) {
+- tmp &= ~ESDHC_TUNING_STEP_MASK;
+ tmp |= imx_data->boarddata.tuning_step
+ << ESDHC_TUNING_STEP_SHIFT;
++ } else {
++ tmp |= ESDHC_TUNING_STEP_DEFAULT
++ << ESDHC_TUNING_STEP_SHIFT;
+ }
+
+ /* Disable the CMD CRC check for tuning, if not, need to
diff --git a/queue-6.1/mmc-sunxi-mmc-fix-clock-refcount-imbalance-during-unbind.patch b/queue-6.1/mmc-sunxi-mmc-fix-clock-refcount-imbalance-during-unbind.patch
new file mode 100644
index 00000000000..33faae030d6
--- /dev/null
+++ b/queue-6.1/mmc-sunxi-mmc-fix-clock-refcount-imbalance-during-unbind.patch
@@ -0,0 +1,41 @@
+From 8509419758f2cc28dd05370385af0d91573b76b4 Mon Sep 17 00:00:00 2001
+From: Samuel Holland
+Date: Tue, 9 Aug 2022 21:25:09 -0500
+Subject: mmc: sunxi-mmc: Fix clock refcount imbalance during unbind
+
+From: Samuel Holland
+
+commit 8509419758f2cc28dd05370385af0d91573b76b4 upstream.
+
+If the controller is suspended by runtime PM, the clock is already
+disabled, so do not try to disable it again during removal. Use
+pm_runtime_disable() to flush any pending runtime PM transitions.
+
+Fixes: 9a8e1e8cc2c0 ("mmc: sunxi: Add runtime_pm support")
+Signed-off-by: Samuel Holland
+Acked-by: Jernej Skrabec
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20220810022509.43743-1-samuel@sholland.org
+Signed-off-by: Ulf Hansson
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/mmc/host/sunxi-mmc.c | 8 +++++---
+ 1 file changed, 5 insertions(+), 3 deletions(-)
+
+--- a/drivers/mmc/host/sunxi-mmc.c
++++ b/drivers/mmc/host/sunxi-mmc.c
+@@ -1492,9 +1492,11 @@ static int sunxi_mmc_remove(struct platf
+ struct sunxi_mmc_host *host = mmc_priv(mmc);
+
+ mmc_remove_host(mmc);
+- pm_runtime_force_suspend(&pdev->dev);
+- disable_irq(host->irq);
+- sunxi_mmc_disable(host);
++ pm_runtime_disable(&pdev->dev);
++ if (!pm_runtime_status_suspended(&pdev->dev)) {
++ disable_irq(host->irq);
++ sunxi_mmc_disable(host);
++ }
+ dma_free_coherent(&pdev->dev, PAGE_SIZE, host->sg_cpu, host->sg_dma);
+ mmc_free_host(mmc);
+
diff --git a/queue-6.1/riscv-dts-sifive-fu740-fix-size-of-pcie-32bit-memory.patch b/queue-6.1/riscv-dts-sifive-fu740-fix-size-of-pcie-32bit-memory.patch
new file mode 100644
index 00000000000..61edb902876
--- /dev/null
+++ b/queue-6.1/riscv-dts-sifive-fu740-fix-size-of-pcie-32bit-memory.patch
@@ -0,0 +1,44 @@
+From 43d5f5d63699724d47f0d9e0eae516a260d232b4 Mon Sep 17 00:00:00 2001
+From: Ben Dooks
+Date: Fri, 6 Jan 2023 13:44:56 +0000
+Subject: riscv: dts: sifive: fu740: fix size of pcie 32bit memory
+
+From: Ben Dooks
+
+commit 43d5f5d63699724d47f0d9e0eae516a260d232b4 upstream.
+
+The 32-bit memory resource is needed for non-prefetchable memory
+allocations on the PCIe bus, however with some cards (such as the
+SM768) the system fails to allocate memory from this region.
+
+Checking the allocation against the datasheet, it looks like there
+has been a miscalculation of the resource for the first memory
+region (0x0060090000..0x0070ffffff) which in the datasheet for
+the fu740 (v1p2) is from 0x0060000000..0x007fffffff. Changing
+this to allocate from 0x0060090000..0x007fffffff fixes the probing
+issues.
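+
+A quick userspace check of the hex arithmetic (an illustration, not part
+of the patch): the second 32-bit window starts at 0x70000000, so a size
+of 0x1000000 ends at 0x70ffffff while the datasheet allows the region to
+extend to 0x7fffffff:
+
+	#include <stdio.h>
+
+	int main(void)
+	{
+		unsigned long base = 0x70000000UL;
+
+		printf("old end: 0x%lx\n", base + 0x1000000UL - 1);  /* 0x70ffffff */
+		printf("new end: 0x%lx\n", base + 0x10000000UL - 1); /* 0x7fffffff */
+		return 0;
+	}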
+
+Fixes: ae80d5148085 ("riscv: dts: Add PCIe support for the SiFive FU740-C000 SoC")
+Cc: Paul Walmsley
+Cc: Greentime Hu
+Signed-off-by: Ben Dooks
+Cc: stable@vger.kernel.org
+Tested-by: Ron Economos # from IRC
+Reviewed-by: Conor Dooley
+Signed-off-by: Conor Dooley
+Signed-off-by: Greg Kroah-Hartman
+---
+ arch/riscv/boot/dts/sifive/fu740-c000.dtsi | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/arch/riscv/boot/dts/sifive/fu740-c000.dtsi
++++ b/arch/riscv/boot/dts/sifive/fu740-c000.dtsi
+@@ -328,7 +328,7 @@
+ bus-range = <0x0 0xff>;
+ ranges = <0x81000000 0x0 0x60080000 0x0 0x60080000 0x0 0x10000>, /* I/O */
+ <0x82000000 0x0 0x60090000 0x0 0x60090000 0x0 0xff70000>, /* mem */
+- <0x82000000 0x0 0x70000000 0x0 0x70000000 0x0 0x1000000>, /* mem */
++ <0x82000000 0x0 0x70000000 0x0 0x70000000 0x0 0x10000000>, /* mem */
+ <0xc3000000 0x20 0x00000000 0x20 0x00000000 0x20 0x00000000>; /* mem prefetchable */
+ num-lanes = <0x8>;
+ interrupts = <56>, <57>, <58>, <59>, <60>, <61>, <62>, <63>, <64>;
diff --git a/queue-6.1/series b/queue-6.1/series
index 3963c792c12..771e71c8459 100644
--- a/queue-6.1/series
+++ b/queue-6.1/series
@@ -71,3 +71,26 @@ misc-fastrpc-fix-use-after-free-race-condition-for-maps.patch
 usb-core-hub-disable-autosuspend-for-ti-tusb8041.patch
 comedi-adv_pci1760-fix-pwm-instruction-handling.patch
 acpi-prm-check-whether-efi-runtime-is-available.patch
+mmc-sunxi-mmc-fix-clock-refcount-imbalance-during-unbind.patch
+mmc-sdhci-esdhc-imx-correct-the-tuning-start-tap-and-step-setting.patch
+mm-hugetlb-fix-pte-marker-handling-in-hugetlb_change_protection.patch
+mm-hugetlb-fix-uffd-wp-handling-for-migration-entries-in-hugetlb_change_protection.patch
+mm-hugetlb-pre-allocate-pgtable-pages-for-uffd-wr-protects.patch
+mm-userfaultfd-enable-writenotify-while-userfaultfd-wp-is-enabled-for-a-vma.patch
+mm-madv_collapse-don-t-expand-collapse-when-vm_end-is-past-requested-end.patch
+btrfs-add-extra-error-messages-to-cover-non-enomem-errors-from-device_add_list.patch
+btrfs-fix-missing-error-handling-when-logging-directory-items.patch
+btrfs-fix-directory-logging-due-to-race-with-concurrent-index-key-deletion.patch
+btrfs-add-missing-setup-of-log-for-full-commit-at-add_conflicting_inode.patch
+btrfs-do-not-abort-transaction-on-failure-to-write-log-tree-when-syncing-log.patch
+btrfs-do-not-abort-transaction-on-failure-to-update-log-root.patch
+btrfs-qgroup-do-not-warn-on-record-without-old_roots-populated.patch
+btrfs-fix-invalid-leaf-access-due-to-inline-extent-during-lseek.patch
+btrfs-fix-race-between-quota-rescan-and-disable-leading-to-null-pointer-deref.patch
+cifs-do-not-include-page-data-when-checking-signature.patch
+thunderbolt-disable-xdomain-lane-1-only-in-software-connection-manager.patch
+thunderbolt-use-correct-function-to-calculate-maximum-usb3-link-rate.patch
+thunderbolt-do-not-report-errors-if-on-board-retimers-are-found.patch
+thunderbolt-do-not-call-pm-runtime-functions-in-tb_retimer_scan.patch
+riscv-dts-sifive-fu740-fix-size-of-pcie-32bit-memory.patch
+bpf-restore-the-ebpf-program-id-for-bpf_audit_unload-and-perf_bpf_event_prog_unload.patch
diff --git a/queue-6.1/thunderbolt-disable-xdomain-lane-1-only-in-software-connection-manager.patch b/queue-6.1/thunderbolt-disable-xdomain-lane-1-only-in-software-connection-manager.patch
new file mode 100644
index 00000000000..d9e0411aa3d
--- /dev/null
+++ b/queue-6.1/thunderbolt-disable-xdomain-lane-1-only-in-software-connection-manager.patch
@@ -0,0 +1,54 @@
+From 84ee211c83212f4d35b56e0603acdcc41f860f1b Mon Sep 17 00:00:00 2001
+From: Mika Westerberg
+Date: Thu, 8 Sep 2022 09:45:22 +0300
+Subject: thunderbolt: Disable XDomain lane 1 only in software connection manager
+
+From: Mika Westerberg
+
+commit 84ee211c83212f4d35b56e0603acdcc41f860f1b upstream.
+
+When the firmware connection manager is in use we should not touch the
+lane adapter (or any other) configuration space, so do this only when we
+know that the software connection manager is active.
+
+Fixes: 8e1de7042596 ("thunderbolt: Add support for XDomain lane bonding")
+Cc: stable@vger.kernel.org
+Acked-by: Yehezkel Bernat
+Signed-off-by: Mika Westerberg
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/thunderbolt/xdomain.c | 17 ++++++++++++-----
+ 1 file changed, 12 insertions(+), 5 deletions(-)
+
+diff --git a/drivers/thunderbolt/xdomain.c b/drivers/thunderbolt/xdomain.c
+index cfa83486c9da..3c51e47dd86b 100644
+--- a/drivers/thunderbolt/xdomain.c
++++ b/drivers/thunderbolt/xdomain.c
+@@ -1419,12 +1419,19 @@ static int tb_xdomain_get_properties(struct tb_xdomain *xd)
+ * registered, we notify the userspace that it has changed.
+ */
+ if (!update) {
+- struct tb_port *port;
++ /*
++ * Now disable lane 1 if bonding was not enabled. Do
++ * this only if bonding was possible at the beginning
++ * (that is we are the connection manager and there are
++ * two lanes).
++ */
++ if (xd->bonding_possible) {
++ struct tb_port *port;
+
+- /* Now disable lane 1 if bonding was not enabled */
+- port = tb_port_at(xd->route, tb_xdomain_parent(xd));
+- if (!port->bonded)
+- tb_port_disable(port->dual_link_port);
++ port = tb_port_at(xd->route, tb_xdomain_parent(xd));
++ if (!port->bonded)
++ tb_port_disable(port->dual_link_port);
++ }
+
+ if (device_add(&xd->dev)) {
+ dev_err(&xd->dev, "failed to add XDomain device\n");
+-- 
+2.39.1
+
diff --git a/queue-6.1/thunderbolt-do-not-call-pm-runtime-functions-in-tb_retimer_scan.patch b/queue-6.1/thunderbolt-do-not-call-pm-runtime-functions-in-tb_retimer_scan.patch
new file mode 100644
index 00000000000..3c35a17b00c
--- /dev/null
+++ b/queue-6.1/thunderbolt-do-not-call-pm-runtime-functions-in-tb_retimer_scan.patch
@@ -0,0 +1,137 @@
+From 23257cfc1cb7202fd0065e9f4a6a0aac1c04c4a9 Mon Sep 17 00:00:00 2001
+From: Mika Westerberg
+Date: Thu, 29 Dec 2022 14:10:30 +0200
+Subject: thunderbolt: Do not call PM runtime functions in tb_retimer_scan()
+
+From: Mika Westerberg
+
+commit 23257cfc1cb7202fd0065e9f4a6a0aac1c04c4a9 upstream.
+
+We cannot call PM runtime functions in tb_retimer_scan() because it will
+also be called when retimers are scanned from userspace (happens when
+there is no device connected on ChromeOS for instance) and, at the same
+time, from the USB4 port runtime resume hook. This leads to a hang
+because neither can proceed.
+
+Fix this by runtime resuming USB4 ports in tb_scan_port() instead. This
+makes sure the ports are runtime PM active when retimers are added under
+them while avoiding the reported hang as well.
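+
+Condensed to a sketch (an illustration only; the real tb_scan_port() has
+several more exit paths, all funneled through the new out_rpm_put label
+in the diff below):
+
+	static void scan_port_sketch(struct tb_port *port)
+	{
+		/* resume the USB4 port once, before any retimer access */
+		if (port->usb4)
+			pm_runtime_get_sync(&port->usb4->dev);
+
+		if (tb_wait_for_port(port, false) <= 0)
+			goto out_rpm_put;
+
+		/* ... retimer and switch scanning runs here ... */
+
+	out_rpm_put:
+		/* balance the get on every path out of the function */
+		if (port->usb4) {
+			pm_runtime_mark_last_busy(&port->usb4->dev);
+			pm_runtime_put_autosuspend(&port->usb4->dev);
+		}
+	}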
+
+Reported-by: Utkarsh Patel
+Fixes: 1e56c88adecc ("thunderbolt: Runtime resume USB4 port when retimers are scanned")
+Cc: stable@vger.kernel.org
+Acked-by: Yehezkel Bernat
+Signed-off-by: Mika Westerberg
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/thunderbolt/retimer.c | 17 +++--------------
+ drivers/thunderbolt/tb.c | 20 +++++++++++++++-----
+ 2 files changed, 18 insertions(+), 19 deletions(-)
+
+--- a/drivers/thunderbolt/retimer.c
++++ b/drivers/thunderbolt/retimer.c
+@@ -427,13 +427,6 @@ int tb_retimer_scan(struct tb_port *port
+ {
+ u32 status[TB_MAX_RETIMER_INDEX + 1] = {};
+ int ret, i, last_idx = 0;
+- struct usb4_port *usb4;
+-
+- usb4 = port->usb4;
+- if (!usb4)
+- return 0;
+-
+- pm_runtime_get_sync(&usb4->dev);
+
+ /*
+ * Send broadcast RT to make sure retimer indices facing this
+@@ -441,7 +434,7 @@ int tb_retimer_scan(struct tb_port *port
+ */
+ ret = usb4_port_enumerate_retimers(port);
+ if (ret)
+- goto out;
++ return ret;
+
+ /*
+ * Enable sideband channel for each retimer. We can do this
+@@ -471,11 +464,11 @@ int tb_retimer_scan(struct tb_port *port
+ break;
+ }
+
+- ret = 0;
+ if (!last_idx)
+- goto out;
++ return 0;
+
+ /* Add on-board retimers if they do not exist already */
++ ret = 0;
+ for (i = 1; i <= last_idx; i++) {
+ struct tb_retimer *rt;
+
+@@ -489,10 +482,6 @@ int tb_retimer_scan(struct tb_port *port
+ }
+ }
+
+-out:
+- pm_runtime_mark_last_busy(&usb4->dev);
+- pm_runtime_put_autosuspend(&usb4->dev);
+-
+ return ret;
+ }
+
+--- a/drivers/thunderbolt/tb.c
++++ b/drivers/thunderbolt/tb.c
+@@ -628,11 +628,15 @@ static void tb_scan_port(struct tb_port
+ * Downstream switch is reachable through two ports.
+ * Only scan on the primary port (link_nr == 0).
+ */
++
++ if (port->usb4)
++ pm_runtime_get_sync(&port->usb4->dev);
++
+ if (tb_wait_for_port(port, false) <= 0)
+- return;
++ goto out_rpm_put;
+ if (port->remote) {
+ tb_port_dbg(port, "port already has a remote\n");
+- return;
++ goto out_rpm_put;
+ }
+
+ tb_retimer_scan(port, true);
+@@ -647,12 +651,12 @@ static void tb_scan_port(struct tb_port
+ */
+ if (PTR_ERR(sw) == -EIO || PTR_ERR(sw) == -EADDRNOTAVAIL)
+ tb_scan_xdomain(port);
+- return;
++ goto out_rpm_put;
+ }
+
+ if (tb_switch_configure(sw)) {
+ tb_switch_put(sw);
+- return;
++ goto out_rpm_put;
+ }
+
+ /*
+@@ -681,7 +685,7 @@ static void tb_scan_port(struct tb_port
+
+ if (tb_switch_add(sw)) {
+ tb_switch_put(sw);
+- return;
++ goto out_rpm_put;
+ }
+
+ /* Link the switches using both links if available */
+@@ -733,6 +737,12 @@ static void tb_scan_port(struct tb_port
+
+ tb_add_dp_resources(sw);
+ tb_scan_switch(sw);
++
++out_rpm_put:
++ if (port->usb4) {
++ pm_runtime_mark_last_busy(&port->usb4->dev);
++ pm_runtime_put_autosuspend(&port->usb4->dev);
++ }
+ }
+
+ static void tb_deactivate_and_free_tunnel(struct tb_tunnel *tunnel)
diff --git a/queue-6.1/thunderbolt-do-not-report-errors-if-on-board-retimers-are-found.patch b/queue-6.1/thunderbolt-do-not-report-errors-if-on-board-retimers-are-found.patch
new file mode 100644
index 00000000000..9b4dae207ba
--- /dev/null
+++ b/queue-6.1/thunderbolt-do-not-report-errors-if-on-board-retimers-are-found.patch
@@ -0,0 +1,37 @@
+From c28f3d80383571d3630df1a0e89500d23e855924 Mon Sep 17 00:00:00 2001
+From: Utkarsh Patel
+Date: Thu, 22 Dec 2022 20:22:46 -0800
+Subject: thunderbolt: Do not report errors if on-board retimers are found
+
+From: Utkarsh Patel
+
+commit c28f3d80383571d3630df1a0e89500d23e855924 upstream.
+
+Currently we return an error even if on-board retimers are found and
+that's not expected. Fix this to return an error only if there was one,
+and 0 otherwise.
+
+Fixes: 1e56c88adecc ("thunderbolt: Runtime resume USB4 port when retimers are scanned")
+Cc: stable@vger.kernel.org
+Signed-off-by: Utkarsh Patel
+Signed-off-by: Mika Westerberg
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/thunderbolt/retimer.c | 5 ++---
+ 1 file changed, 2 insertions(+), 3 deletions(-)
+
+--- a/drivers/thunderbolt/retimer.c
++++ b/drivers/thunderbolt/retimer.c
+@@ -471,10 +471,9 @@ int tb_retimer_scan(struct tb_port *port
+ break;
+ }
+
+- if (!last_idx) {
+- ret = 0;
++ ret = 0;
++ if (!last_idx)
+ goto out;
+- }
+
+ /* Add on-board retimers if they do not exist already */
+ for (i = 1; i <= last_idx; i++) {
diff --git a/queue-6.1/thunderbolt-use-correct-function-to-calculate-maximum-usb3-link-rate.patch b/queue-6.1/thunderbolt-use-correct-function-to-calculate-maximum-usb3-link-rate.patch
new file mode 100644
index 00000000000..ab90316180d
--- /dev/null
+++ b/queue-6.1/thunderbolt-use-correct-function-to-calculate-maximum-usb3-link-rate.patch
@@ -0,0 +1,33 @@
+From e8ff07fb33026c5c1bb5b81293496faba5d68059 Mon Sep 17 00:00:00 2001
+From: Mika Westerberg
+Date: Fri, 20 May 2022 13:35:19 +0300
+Subject: thunderbolt: Use correct function to calculate maximum USB3 link rate
+
+From: Mika Westerberg
+
+commit e8ff07fb33026c5c1bb5b81293496faba5d68059 upstream.
+
+We need to take the minimum of both sides of the USB3 link into
+consideration, not just the downstream port. Fix this by calling
+tb_usb3_max_link_rate() instead.
+
+Fixes: 0bd680cd900c ("thunderbolt: Add USB3 bandwidth management")
+Cc: stable@vger.kernel.org
+Acked-by: Yehezkel Bernat
+Signed-off-by: Mika Westerberg
+Signed-off-by: Greg Kroah-Hartman
+---
+ drivers/thunderbolt/tunnel.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/thunderbolt/tunnel.c
++++ b/drivers/thunderbolt/tunnel.c
+@@ -1275,7 +1275,7 @@ static void tb_usb3_reclaim_available_ba
+ return;
+ } else if (!ret) {
+ /* Use maximum link rate if the link valid is not set */
+- ret = usb4_usb3_port_max_link_rate(tunnel->src_port);
++ ret = tb_usb3_max_link_rate(tunnel->dst_port, tunnel->src_port);
+ if (ret < 0) {
+ tb_tunnel_warn(tunnel, "failed to read maximum link rate\n");
+ return;