6.3-stable patches
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 23 Jun 2023 09:53:48 +0000 (11:53 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Fri, 23 Jun 2023 09:53:48 +0000 (11:53 +0200)
added patches:
acpi-sleep-avoid-breaking-s3-wakeup-due-to-might_sleep.patch
bpf-ensure-main-program-has-an-extable.patch
cgroup-do-not-corrupt-task-iteration-when-rebinding-subsystem.patch
cgroup-freezer-hold-cpu_hotplug_lock-before-freezer_mutex-in-freezer_css_-online-offline.patch
drivers-hv-vmbus-call-hv_synic_free-if-hv_synic_alloc-fails.patch
drivers-hv-vmbus-fix-vmbus_wait_for_unload-to-scan-present-cpus.patch
io_uring-net-clear-msg_controllen-on-partial-sendmsg-retry.patch
io_uring-net-disable-partial-retries-for-recvmsg-with-cmsg.patch
kvm-avoid-illegal-stage2-mapping-on-invalid-memory-slot.patch
memfd-check-for-non-null-file_seals-in-memfd_create-syscall.patch
mm-mprotect-fix-do_mprotect_pkey-limit-check.patch
mm-vmalloc-do-not-output-a-spurious-warning-when-huge-vmalloc-fails.patch
mmc-bcm2835-fix-deferred-probing.patch
mmc-litex_mmc-set-probe_prefer_asynchronous.patch
mmc-meson-gx-fix-deferred-probing.patch
mmc-meson-gx-remove-redundant-mmc_request_done-call-from-irq-context.patch
mmc-mmci-stm32-fix-max-busy-timeout-calculation.patch
mmc-sdhci-msm-disable-broken-64-bit-dma-on-msm8916.patch
mmc-sdhci-spear-fix-deferred-probing.patch
mmc-sunxi-fix-deferred-probing.patch
mptcp-consolidate-fallback-and-non-fallback-state-machine.patch
mptcp-ensure-listener-is-unhashed-before-updating-the-sk-status.patch
mptcp-fix-possible-divide-by-zero-in-recvmsg.patch
mptcp-fix-possible-list-corruption-on-passive-mpj.patch
mptcp-handle-correctly-disconnect-failures.patch
net-mdio-fix-the-wrong-parameters.patch
nilfs2-fix-buffer-corruption-due-to-concurrent-device-reads.patch
nilfs2-prevent-general-protection-fault-in-nilfs_clear_dirty_page.patch
pci-hv-add-a-per-bus-mutex-state_lock.patch
pci-hv-fix-a-race-condition-bug-in-hv_pci_query_relations.patch
pci-hv-fix-a-race-condition-in-hv_irq_unmask-that-can-cause-panic.patch
pci-hv-remove-the-useless-hv_pcichild_state-from-struct-hv_pci_dev.patch
revert-pci-hv-fix-a-timing-issue-which-causes-kdump-to-fail-occasionally.patch
scripts-fix-the-gfp-flags-header-path-in-gfp-translate.patch
thermal-intel-intel_soc_dts_iosf-fix-reporting-wrong-temperatures.patch
wifi-iwlwifi-pcie-handle-so-f-device-for-pci-id-0x7af0.patch
writeback-fix-dereferencing-null-mapping-host-on-writeback_page_template.patch

38 files changed:
queue-6.3/acpi-sleep-avoid-breaking-s3-wakeup-due-to-might_sleep.patch [new file with mode: 0644]
queue-6.3/bpf-ensure-main-program-has-an-extable.patch [new file with mode: 0644]
queue-6.3/cgroup-do-not-corrupt-task-iteration-when-rebinding-subsystem.patch [new file with mode: 0644]
queue-6.3/cgroup-freezer-hold-cpu_hotplug_lock-before-freezer_mutex-in-freezer_css_-online-offline.patch [new file with mode: 0644]
queue-6.3/drivers-hv-vmbus-call-hv_synic_free-if-hv_synic_alloc-fails.patch [new file with mode: 0644]
queue-6.3/drivers-hv-vmbus-fix-vmbus_wait_for_unload-to-scan-present-cpus.patch [new file with mode: 0644]
queue-6.3/io_uring-net-clear-msg_controllen-on-partial-sendmsg-retry.patch [new file with mode: 0644]
queue-6.3/io_uring-net-disable-partial-retries-for-recvmsg-with-cmsg.patch [new file with mode: 0644]
queue-6.3/kvm-avoid-illegal-stage2-mapping-on-invalid-memory-slot.patch [new file with mode: 0644]
queue-6.3/memfd-check-for-non-null-file_seals-in-memfd_create-syscall.patch [new file with mode: 0644]
queue-6.3/mm-mprotect-fix-do_mprotect_pkey-limit-check.patch [new file with mode: 0644]
queue-6.3/mm-vmalloc-do-not-output-a-spurious-warning-when-huge-vmalloc-fails.patch [new file with mode: 0644]
queue-6.3/mmc-bcm2835-fix-deferred-probing.patch [new file with mode: 0644]
queue-6.3/mmc-litex_mmc-set-probe_prefer_asynchronous.patch [new file with mode: 0644]
queue-6.3/mmc-meson-gx-fix-deferred-probing.patch [new file with mode: 0644]
queue-6.3/mmc-meson-gx-remove-redundant-mmc_request_done-call-from-irq-context.patch [new file with mode: 0644]
queue-6.3/mmc-mmci-stm32-fix-max-busy-timeout-calculation.patch [new file with mode: 0644]
queue-6.3/mmc-sdhci-msm-disable-broken-64-bit-dma-on-msm8916.patch [new file with mode: 0644]
queue-6.3/mmc-sdhci-spear-fix-deferred-probing.patch [new file with mode: 0644]
queue-6.3/mmc-sunxi-fix-deferred-probing.patch [new file with mode: 0644]
queue-6.3/mptcp-consolidate-fallback-and-non-fallback-state-machine.patch [new file with mode: 0644]
queue-6.3/mptcp-ensure-listener-is-unhashed-before-updating-the-sk-status.patch [new file with mode: 0644]
queue-6.3/mptcp-fix-possible-divide-by-zero-in-recvmsg.patch [new file with mode: 0644]
queue-6.3/mptcp-fix-possible-list-corruption-on-passive-mpj.patch [new file with mode: 0644]
queue-6.3/mptcp-handle-correctly-disconnect-failures.patch [new file with mode: 0644]
queue-6.3/net-mdio-fix-the-wrong-parameters.patch [new file with mode: 0644]
queue-6.3/nilfs2-fix-buffer-corruption-due-to-concurrent-device-reads.patch [new file with mode: 0644]
queue-6.3/nilfs2-prevent-general-protection-fault-in-nilfs_clear_dirty_page.patch [new file with mode: 0644]
queue-6.3/pci-hv-add-a-per-bus-mutex-state_lock.patch [new file with mode: 0644]
queue-6.3/pci-hv-fix-a-race-condition-bug-in-hv_pci_query_relations.patch [new file with mode: 0644]
queue-6.3/pci-hv-fix-a-race-condition-in-hv_irq_unmask-that-can-cause-panic.patch [new file with mode: 0644]
queue-6.3/pci-hv-remove-the-useless-hv_pcichild_state-from-struct-hv_pci_dev.patch [new file with mode: 0644]
queue-6.3/revert-pci-hv-fix-a-timing-issue-which-causes-kdump-to-fail-occasionally.patch [new file with mode: 0644]
queue-6.3/scripts-fix-the-gfp-flags-header-path-in-gfp-translate.patch [new file with mode: 0644]
queue-6.3/series
queue-6.3/thermal-intel-intel_soc_dts_iosf-fix-reporting-wrong-temperatures.patch [new file with mode: 0644]
queue-6.3/wifi-iwlwifi-pcie-handle-so-f-device-for-pci-id-0x7af0.patch [new file with mode: 0644]
queue-6.3/writeback-fix-dereferencing-null-mapping-host-on-writeback_page_template.patch [new file with mode: 0644]

diff --git a/queue-6.3/acpi-sleep-avoid-breaking-s3-wakeup-due-to-might_sleep.patch b/queue-6.3/acpi-sleep-avoid-breaking-s3-wakeup-due-to-might_sleep.patch
new file mode 100644 (file)
index 0000000..b2dd66f
--- /dev/null
@@ -0,0 +1,86 @@
+From 22db06337f590d01d79f60f181d8dfe5a9ef9085 Mon Sep 17 00:00:00 2001
+From: "Rafael J. Wysocki" <rafael.j.wysocki@intel.com>
+Date: Wed, 14 Jun 2023 17:29:21 +0200
+Subject: ACPI: sleep: Avoid breaking S3 wakeup due to might_sleep()
+
+From: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+
+commit 22db06337f590d01d79f60f181d8dfe5a9ef9085 upstream.
+
+The addition of might_sleep() to down_timeout() caused the latter to
+enable interrupts unconditionally in some cases, which in turn broke
+the ACPI S3 wakeup path in acpi_suspend_enter(), where down_timeout()
+is called by acpi_disable_all_gpes() via acpi_ut_acquire_mutex().
+
+Namely, if CONFIG_DEBUG_ATOMIC_SLEEP is set, might_sleep() causes
+might_resched() to be used and if CONFIG_PREEMPT_VOLUNTARY is set,
+this triggers __cond_resched() which may call preempt_schedule_common(),
+so __schedule() gets invoked and it ends up with enabled interrupts (in
+the prev == next case).
+
+Now, enabling interrupts early in the S3 wakeup path causes the kernel
+to crash.
+
+Address this by modifying acpi_suspend_enter() to disable GPEs without
+attempting to acquire the sleeping lock which is not needed in that code
+path anyway.
+
+Fixes: 99409b935c9a ("locking/semaphore: Add might_sleep() to down_*() family")
+Reported-by: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
+Cc: 5.15+ <stable@vger.kernel.org> # 5.15+
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/acpi/acpica/achware.h |    2 --
+ drivers/acpi/sleep.c          |   16 ++++++++++++----
+ include/acpi/acpixf.h         |    1 +
+ 3 files changed, 13 insertions(+), 6 deletions(-)
+
+--- a/drivers/acpi/acpica/achware.h
++++ b/drivers/acpi/acpica/achware.h
+@@ -101,8 +101,6 @@ acpi_status
+ acpi_hw_get_gpe_status(struct acpi_gpe_event_info *gpe_event_info,
+                      acpi_event_status *event_status);
+-acpi_status acpi_hw_disable_all_gpes(void);
+-
+ acpi_status acpi_hw_enable_all_runtime_gpes(void);
+ acpi_status acpi_hw_enable_all_wakeup_gpes(void);
+--- a/drivers/acpi/sleep.c
++++ b/drivers/acpi/sleep.c
+@@ -636,11 +636,19 @@ static int acpi_suspend_enter(suspend_st
+       }
+       /*
+-       * Disable and clear GPE status before interrupt is enabled. Some GPEs
+-       * (like wakeup GPE) haven't handler, this can avoid such GPE misfire.
+-       * acpi_leave_sleep_state will reenable specific GPEs later
++       * Disable all GPE and clear their status bits before interrupts are
++       * enabled. Some GPEs (like wakeup GPEs) have no handlers and this can
++       * prevent them from producing spurious interrups.
++       *
++       * acpi_leave_sleep_state() will reenable specific GPEs later.
++       *
++       * Because this code runs on one CPU with disabled interrupts (all of
++       * the other CPUs are offline at this time), it need not acquire any
++       * sleeping locks which may trigger an implicit preemption point even
++       * if there is no contention, so avoid doing that by using a low-level
++       * library routine here.
+        */
+-      acpi_disable_all_gpes();
++      acpi_hw_disable_all_gpes();
+       /* Allow EC transactions to happen. */
+       acpi_ec_unblock_transactions();
+--- a/include/acpi/acpixf.h
++++ b/include/acpi/acpixf.h
+@@ -761,6 +761,7 @@ ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_sta
+                                                    acpi_event_status
+                                                    *event_status))
+ ACPI_HW_DEPENDENT_RETURN_UINT32(u32 acpi_dispatch_gpe(acpi_handle gpe_device, u32 gpe_number))
++ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_hw_disable_all_gpes(void))
+ ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_disable_all_gpes(void))
+ ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_enable_all_runtime_gpes(void))
+ ACPI_HW_DEPENDENT_RETURN_STATUS(acpi_status acpi_enable_all_wakeup_gpes(void))
diff --git a/queue-6.3/bpf-ensure-main-program-has-an-extable.patch b/queue-6.3/bpf-ensure-main-program-has-an-extable.patch
new file mode 100644 (file)
index 0000000..29ed02e
--- /dev/null
@@ -0,0 +1,64 @@
+From 0108a4e9f3584a7a2c026d1601b0682ff7335d95 Mon Sep 17 00:00:00 2001
+From: Krister Johansen <kjlx@templeofstupid.com>
+Date: Mon, 12 Jun 2023 17:44:40 -0700
+Subject: bpf: ensure main program has an extable
+
+From: Krister Johansen <kjlx@templeofstupid.com>
+
+commit 0108a4e9f3584a7a2c026d1601b0682ff7335d95 upstream.
+
+When subprograms are in use, the main program is not jit'd after the
+subprograms because jit_subprogs sets a value for prog->bpf_func upon
+success.  Subsequent calls to the JIT are bypassed when this value is
+non-NULL.  This leads to a situation where the main program and its
+func[0] counterpart are both in the bpf kallsyms tree, but only func[0]
+has an extable.  Extables are only created during JIT.  Now there are
+two nearly identical program ksym entries in the tree, but only one has
+an extable.  Depending upon how the entries are placed, there's a chance
+that a fault will call search_extable on the aux with the NULL entry.
+
+Since jit_subprogs already copies state from func[0] to the main
+program, include the extable pointer in this state duplication.
+Additionally, ensure that the copy of the main program in func[0] is not
+added to the bpf_prog_kallsyms table. Instead, let the main program get
+added later in bpf_prog_load().  This ensures there is only a single
+copy of the main program in the kallsyms table, and that its tag matches
+the tag observed by tooling like bpftool.
+
+Cc: stable@vger.kernel.org
+Fixes: 1c2a088a6626 ("bpf: x64: add JIT support for multi-function programs")
+Signed-off-by: Krister Johansen <kjlx@templeofstupid.com>
+Acked-by: Yonghong Song <yhs@fb.com>
+Acked-by: Ilya Leoshkevich <iii@linux.ibm.com>
+Tested-by: Ilya Leoshkevich <iii@linux.ibm.com>
+Link: https://lore.kernel.org/r/6de9b2f4b4724ef56efbb0339daaa66c8b68b1e7.1686616663.git.kjlx@templeofstupid.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/bpf/verifier.c |    7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+--- a/kernel/bpf/verifier.c
++++ b/kernel/bpf/verifier.c
+@@ -16198,9 +16198,10 @@ static int jit_subprogs(struct bpf_verif
+       }
+       /* finally lock prog and jit images for all functions and
+-       * populate kallsysm
++       * populate kallsysm. Begin at the first subprogram, since
++       * bpf_prog_load will add the kallsyms for the main program.
+        */
+-      for (i = 0; i < env->subprog_cnt; i++) {
++      for (i = 1; i < env->subprog_cnt; i++) {
+               bpf_prog_lock_ro(func[i]);
+               bpf_prog_kallsyms_add(func[i]);
+       }
+@@ -16226,6 +16227,8 @@ static int jit_subprogs(struct bpf_verif
+       prog->jited = 1;
+       prog->bpf_func = func[0]->bpf_func;
+       prog->jited_len = func[0]->jited_len;
++      prog->aux->extable = func[0]->aux->extable;
++      prog->aux->num_exentries = func[0]->aux->num_exentries;
+       prog->aux->func = func;
+       prog->aux->func_cnt = env->subprog_cnt;
+       bpf_prog_jit_attempt_done(prog);
diff --git a/queue-6.3/cgroup-do-not-corrupt-task-iteration-when-rebinding-subsystem.patch b/queue-6.3/cgroup-do-not-corrupt-task-iteration-when-rebinding-subsystem.patch
new file mode 100644 (file)
index 0000000..f14d144
--- /dev/null
@@ -0,0 +1,125 @@
+From 6f363f5aa845561f7ea496d8b1175e3204470486 Mon Sep 17 00:00:00 2001
+From: Xiu Jianfeng <xiujianfeng@huawei.com>
+Date: Sat, 10 Jun 2023 17:26:43 +0800
+Subject: cgroup: Do not corrupt task iteration when rebinding subsystem
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Xiu Jianfeng <xiujianfeng@huawei.com>
+
+commit 6f363f5aa845561f7ea496d8b1175e3204470486 upstream.
+
+We found a refcount UAF bug as follows:
+
+refcount_t: addition on 0; use-after-free.
+WARNING: CPU: 1 PID: 342 at lib/refcount.c:25 refcount_warn_saturate+0xa0/0x148
+Workqueue: events cpuset_hotplug_workfn
+Call trace:
+ refcount_warn_saturate+0xa0/0x148
+ __refcount_add.constprop.0+0x5c/0x80
+ css_task_iter_advance_css_set+0xd8/0x210
+ css_task_iter_advance+0xa8/0x120
+ css_task_iter_next+0x94/0x158
+ update_tasks_root_domain+0x58/0x98
+ rebuild_root_domains+0xa0/0x1b0
+ rebuild_sched_domains_locked+0x144/0x188
+ cpuset_hotplug_workfn+0x138/0x5a0
+ process_one_work+0x1e8/0x448
+ worker_thread+0x228/0x3e0
+ kthread+0xe0/0xf0
+ ret_from_fork+0x10/0x20
+
+then a kernel panic will be triggered as below:
+
+Unable to handle kernel paging request at virtual address 00000000c0000010
+Call trace:
+ cgroup_apply_control_disable+0xa4/0x16c
+ rebind_subsystems+0x224/0x590
+ cgroup_destroy_root+0x64/0x2e0
+ css_free_rwork_fn+0x198/0x2a0
+ process_one_work+0x1d4/0x4bc
+ worker_thread+0x158/0x410
+ kthread+0x108/0x13c
+ ret_from_fork+0x10/0x18
+
+The race that causes this bug is shown below:
+
+(hotplug cpu)                | (umount cpuset)
+mutex_lock(&cpuset_mutex)    | mutex_lock(&cgroup_mutex)
+cpuset_hotplug_workfn        |
+ rebuild_root_domains        |  rebind_subsystems
+  update_tasks_root_domain   |   spin_lock_irq(&css_set_lock)
+   css_task_iter_start       |    list_move_tail(&cset->e_cset_node[ss->id]
+   while(css_task_iter_next) |                  &dcgrp->e_csets[ss->id]);
+   css_task_iter_end         |   spin_unlock_irq(&css_set_lock)
+mutex_unlock(&cpuset_mutex)  | mutex_unlock(&cgroup_mutex)
+
+Inside css_task_iter_start/next/end, css_set_lock is held and then
+released, so while tasks are being iterated (left side) the css_set may
+be moved to another list (right side). it->cset_head then points to the
+old list head while it->cset_pos->next points to the head node of the
+new list, which cannot be used as a struct css_set.
+
+To fix this issue, move only scgrp's css_sets (instead of walking all
+css_sets) and patch any in-flight iterators by updating it->cset_head,
+so that iteration remains correct.
+
+Reported-by: Gaosheng Cui <cuigaosheng1@huawei.com>
+Link: https://www.spinics.net/lists/cgroups/msg37935.html
+Suggested-by: Michal Koutný <mkoutny@suse.com>
+Link: https://lore.kernel.org/all/20230526114139.70274-1-xiujianfeng@huaweicloud.com/
+Signed-off-by: Xiu Jianfeng <xiujianfeng@huawei.com>
+Fixes: 2d8f243a5e6e ("cgroup: implement cgroup->e_csets[]")
+Cc: stable@vger.kernel.org # v3.16+
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/cgroup/cgroup.c |   20 +++++++++++++++++---
+ 1 file changed, 17 insertions(+), 3 deletions(-)
+
+--- a/kernel/cgroup/cgroup.c
++++ b/kernel/cgroup/cgroup.c
+@@ -1788,7 +1788,7 @@ int rebind_subsystems(struct cgroup_root
+ {
+       struct cgroup *dcgrp = &dst_root->cgrp;
+       struct cgroup_subsys *ss;
+-      int ssid, i, ret;
++      int ssid, ret;
+       u16 dfl_disable_ss_mask = 0;
+       lockdep_assert_held(&cgroup_mutex);
+@@ -1832,7 +1832,8 @@ int rebind_subsystems(struct cgroup_root
+               struct cgroup_root *src_root = ss->root;
+               struct cgroup *scgrp = &src_root->cgrp;
+               struct cgroup_subsys_state *css = cgroup_css(scgrp, ss);
+-              struct css_set *cset;
++              struct css_set *cset, *cset_pos;
++              struct css_task_iter *it;
+               WARN_ON(!css || cgroup_css(dcgrp, ss));
+@@ -1850,9 +1851,22 @@ int rebind_subsystems(struct cgroup_root
+               css->cgroup = dcgrp;
+               spin_lock_irq(&css_set_lock);
+-              hash_for_each(css_set_table, i, cset, hlist)
++              WARN_ON(!list_empty(&dcgrp->e_csets[ss->id]));
++              list_for_each_entry_safe(cset, cset_pos, &scgrp->e_csets[ss->id],
++                                       e_cset_node[ss->id]) {
+                       list_move_tail(&cset->e_cset_node[ss->id],
+                                      &dcgrp->e_csets[ss->id]);
++                      /*
++                       * all css_sets of scgrp together in same order to dcgrp,
++                       * patch in-flight iterators to preserve correct iteration.
++                       * since the iterator is always advanced right away and
++                       * finished when it->cset_pos meets it->cset_head, so only
++                       * update it->cset_head is enough here.
++                       */
++                      list_for_each_entry(it, &cset->task_iters, iters_node)
++                              if (it->cset_head == &scgrp->e_csets[ss->id])
++                                      it->cset_head = &dcgrp->e_csets[ss->id];
++              }
+               spin_unlock_irq(&css_set_lock);
+               if (ss->css_rstat_flush) {
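As a side note on the mechanics of the fix above: it replaces a hash walk over every css_set with list_for_each_entry_safe() over scgrp's own e_csets list, because entries are moved onto dcgrp's list while the walk is in progress. A minimal, hypothetical kernel-style sketch of that move-while-iterating pattern (the demo_* names are invented, and the additional it->cset_head patching of in-flight css_task_iters is not reproduced here):

```c
#include <linux/list.h>

struct demo_item {
	struct list_head node;
};

/*
 * Move every entry from @src to @dst.  list_for_each_entry_safe() caches
 * the next entry up front, so list_move_tail() on the current entry is
 * safe; a plain list_for_each_entry() would chase pointers into @dst and
 * never meet the old head again.
 */
static void demo_move_all(struct list_head *src, struct list_head *dst)
{
	struct demo_item *item, *tmp;

	list_for_each_entry_safe(item, tmp, src, node)
		list_move_tail(&item->node, dst);
}
```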
diff --git a/queue-6.3/cgroup-freezer-hold-cpu_hotplug_lock-before-freezer_mutex-in-freezer_css_-online-offline.patch b/queue-6.3/cgroup-freezer-hold-cpu_hotplug_lock-before-freezer_mutex-in-freezer_css_-online-offline.patch
new file mode 100644 (file)
index 0000000..fb82174
--- /dev/null
@@ -0,0 +1,66 @@
+From f0cc749254d12c78e93dae3b27b21dc9546843d0 Mon Sep 17 00:00:00 2001
+From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
+Date: Sun, 11 Jun 2023 22:48:12 +0900
+Subject: cgroup,freezer: hold cpu_hotplug_lock before freezer_mutex in freezer_css_{online,offline}()
+
+From: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
+
+commit f0cc749254d12c78e93dae3b27b21dc9546843d0 upstream.
+
+syzbot is again reporting circular locking dependency between
+cpu_hotplug_lock and freezer_mutex. Do like what we did with
+commit 57dcd64c7e036299 ("cgroup,freezer: hold cpu_hotplug_lock
+before freezer_mutex").
+
+Reported-by: syzbot <syzbot+2ab700fe1829880a2ec6@syzkaller.appspotmail.com>
+Closes: https://syzkaller.appspot.com/bug?extid=2ab700fe1829880a2ec6
+Signed-off-by: Tetsuo Handa <penguin-kernel@I-love.SAKURA.ne.jp>
+Tested-by: syzbot <syzbot+2ab700fe1829880a2ec6@syzkaller.appspotmail.com>
+Fixes: f5d39b020809 ("freezer,sched: Rewrite core freezer logic")
+Cc: stable@vger.kernel.org # v6.1+
+Signed-off-by: Tejun Heo <tj@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/cgroup/legacy_freezer.c |    8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+
+--- a/kernel/cgroup/legacy_freezer.c
++++ b/kernel/cgroup/legacy_freezer.c
+@@ -108,16 +108,18 @@ static int freezer_css_online(struct cgr
+       struct freezer *freezer = css_freezer(css);
+       struct freezer *parent = parent_freezer(freezer);
++      cpus_read_lock();
+       mutex_lock(&freezer_mutex);
+       freezer->state |= CGROUP_FREEZER_ONLINE;
+       if (parent && (parent->state & CGROUP_FREEZING)) {
+               freezer->state |= CGROUP_FREEZING_PARENT | CGROUP_FROZEN;
+-              static_branch_inc(&freezer_active);
++              static_branch_inc_cpuslocked(&freezer_active);
+       }
+       mutex_unlock(&freezer_mutex);
++      cpus_read_unlock();
+       return 0;
+ }
+@@ -132,14 +134,16 @@ static void freezer_css_offline(struct c
+ {
+       struct freezer *freezer = css_freezer(css);
++      cpus_read_lock();
+       mutex_lock(&freezer_mutex);
+       if (freezer->state & CGROUP_FREEZING)
+-              static_branch_dec(&freezer_active);
++              static_branch_dec_cpuslocked(&freezer_active);
+       freezer->state = 0;
+       mutex_unlock(&freezer_mutex);
++      cpus_read_unlock();
+ }
+ static void freezer_css_free(struct cgroup_subsys_state *css)
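The fix above is purely about lock ordering: cpus_read_lock() (cpu_hotplug_lock) is always taken before freezer_mutex, and the *_cpuslocked static-branch helpers assert that the hotplug lock is already held, matching the ordering established by the earlier commit 57dcd64c7e036299 quoted in the changelog. As a generic, userspace-only illustration of why every path must take nested locks in the same order (hypothetical locks, not the kernel code):

```c
#include <pthread.h>

/* Outer lock (think cpu_hotplug_lock) and inner lock (think freezer_mutex). */
static pthread_mutex_t lock_outer = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t lock_inner = PTHREAD_MUTEX_INITIALIZER;

/*
 * Every caller acquires the outer lock before the inner one.  If a path
 * took them in the opposite order, two threads could each hold one lock
 * while waiting for the other: the circular dependency (ABBA deadlock)
 * that the lockdep report from syzbot is warning about.
 */
static void demo_locked_work(void (*work)(void *), void *arg)
{
	pthread_mutex_lock(&lock_outer);
	pthread_mutex_lock(&lock_inner);
	work(arg);
	pthread_mutex_unlock(&lock_inner);
	pthread_mutex_unlock(&lock_outer);
}
```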
diff --git a/queue-6.3/drivers-hv-vmbus-call-hv_synic_free-if-hv_synic_alloc-fails.patch b/queue-6.3/drivers-hv-vmbus-call-hv_synic_free-if-hv_synic_alloc-fails.patch
new file mode 100644 (file)
index 0000000..d9cab5b
--- /dev/null
@@ -0,0 +1,50 @@
+From ec97e112985c2581ee61854a4b74f080f6cdfc2c Mon Sep 17 00:00:00 2001
+From: Dexuan Cui <decui@microsoft.com>
+Date: Thu, 4 May 2023 15:41:55 -0700
+Subject: Drivers: hv: vmbus: Call hv_synic_free() if hv_synic_alloc() fails
+
+From: Dexuan Cui <decui@microsoft.com>
+
+commit ec97e112985c2581ee61854a4b74f080f6cdfc2c upstream.
+
+Commit 572086325ce9 ("Drivers: hv: vmbus: Cleanup synic memory free path")
+says "Any memory allocations that succeeded will be freed when the caller
+cleans up by calling hv_synic_free()", but if the get_zeroed_page() in
+hv_synic_alloc() fails, currently hv_synic_free() is not really called
+in vmbus_bus_init(), consequently there will be a memory leak, e.g.
+hv_context.hv_numa_map is not freed in the error path. Fix this by
+updating the goto labels.
+
+Cc: stable@kernel.org
+Signed-off-by: Dexuan Cui <decui@microsoft.com>
+Fixes: 4df4cb9e99f8 ("x86/hyperv: Initialize clockevents earlier in CPU onlining")
+Reviewed-by: Michael Kelley <mikelley@microsoft.com>
+Link: https://lore.kernel.org/r/20230504224155.10484-1-decui@microsoft.com
+Signed-off-by: Wei Liu <wei.liu@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/hv/vmbus_drv.c |    5 ++---
+ 1 file changed, 2 insertions(+), 3 deletions(-)
+
+--- a/drivers/hv/vmbus_drv.c
++++ b/drivers/hv/vmbus_drv.c
+@@ -1525,7 +1525,7 @@ static int vmbus_bus_init(void)
+       ret = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "hyperv/vmbus:online",
+                               hv_synic_init, hv_synic_cleanup);
+       if (ret < 0)
+-              goto err_cpuhp;
++              goto err_alloc;
+       hyperv_cpuhp_online = ret;
+       ret = vmbus_connect();
+@@ -1577,9 +1577,8 @@ static int vmbus_bus_init(void)
+ err_connect:
+       cpuhp_remove_state(hyperv_cpuhp_online);
+-err_cpuhp:
+-      hv_synic_free();
+ err_alloc:
++      hv_synic_free();
+       if (vmbus_irq == -1) {
+               hv_remove_vmbus_handler();
+       } else {
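The fix is in the ordering of the error labels: once cpuhp_setup_state() fails after hv_synic_alloc() has already succeeded, the unwind path must still reach hv_synic_free(). A generic, hypothetical sketch of the goto-ladder idiom this relies on (plain C, not the Hyper-V code):

```c
#include <stdlib.h>

/*
 * Each failure jumps to the label that frees everything allocated before
 * that point, so the labels must stay ordered from latest to earliest
 * allocation.  Jumping to a label that sits too late in the ladder skips
 * a free, which is the kind of leak the patch above fixes.
 */
static int demo_init(void **a, void **b, void **c)
{
	*a = malloc(32);
	if (!*a)
		return -1;

	*b = malloc(32);
	if (!*b)
		goto err_free_a;

	*c = malloc(32);
	if (!*c)
		goto err_free_b;

	return 0;

err_free_b:
	free(*b);
err_free_a:
	free(*a);
	return -1;
}
```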
diff --git a/queue-6.3/drivers-hv-vmbus-fix-vmbus_wait_for_unload-to-scan-present-cpus.patch b/queue-6.3/drivers-hv-vmbus-fix-vmbus_wait_for_unload-to-scan-present-cpus.patch
new file mode 100644 (file)
index 0000000..dd59333
--- /dev/null
@@ -0,0 +1,84 @@
+From 320805ab61e5f1e2a5729ae266e16bec2904050c Mon Sep 17 00:00:00 2001
+From: Michael Kelley <mikelley@microsoft.com>
+Date: Thu, 18 May 2023 08:13:52 -0700
+Subject: Drivers: hv: vmbus: Fix vmbus_wait_for_unload() to scan present CPUs
+
+From: Michael Kelley <mikelley@microsoft.com>
+
+commit 320805ab61e5f1e2a5729ae266e16bec2904050c upstream.
+
+vmbus_wait_for_unload() may be called in the panic path after other
+CPUs are stopped. vmbus_wait_for_unload() currently loops through
+online CPUs looking for the UNLOAD response message. But the values of
+CONFIG_KEXEC_CORE and crash_kexec_post_notifiers affect the path used
+to stop the other CPUs, and in one of the paths the stopped CPUs
+are removed from cpu_online_mask. This removal happens in both
+x86/x64 and arm64 architectures. In such a case, vmbus_wait_for_unload()
+only checks the panic'ing CPU, and misses the UNLOAD response message
+except when the panic'ing CPU is CPU 0. vmbus_wait_for_unload()
+eventually times out, but only after waiting 100 seconds.
+
+Fix this by looping through *present* CPUs in vmbus_wait_for_unload().
+The cpu_present_mask is not modified by stopping the other CPUs in the
+panic path, nor should it be.
+
+Also, in a CoCo VM the synic_message_page is not allocated in
+hv_synic_alloc(), but is set and cleared in hv_synic_enable_regs()
+and hv_synic_disable_regs() such that it is set only when the CPU is
+online.  If not all present CPUs are online when vmbus_wait_for_unload()
+is called, the synic_message_page might be NULL. Add a check for this.
+
+Fixes: cd95aad55793 ("Drivers: hv: vmbus: handle various crash scenarios")
+Cc: stable@vger.kernel.org
+Reported-by: John Starks <jostarks@microsoft.com>
+Signed-off-by: Michael Kelley <mikelley@microsoft.com>
+Reviewed-by: Vitaly Kuznetsov <vkuznets@redhat.com>
+Link: https://lore.kernel.org/r/1684422832-38476-1-git-send-email-mikelley@microsoft.com
+Signed-off-by: Wei Liu <wei.liu@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/hv/channel_mgmt.c |   18 ++++++++++++++++--
+ 1 file changed, 16 insertions(+), 2 deletions(-)
+
+--- a/drivers/hv/channel_mgmt.c
++++ b/drivers/hv/channel_mgmt.c
+@@ -829,11 +829,22 @@ static void vmbus_wait_for_unload(void)
+               if (completion_done(&vmbus_connection.unload_event))
+                       goto completed;
+-              for_each_online_cpu(cpu) {
++              for_each_present_cpu(cpu) {
+                       struct hv_per_cpu_context *hv_cpu
+                               = per_cpu_ptr(hv_context.cpu_context, cpu);
++                      /*
++                       * In a CoCo VM the synic_message_page is not allocated
++                       * in hv_synic_alloc(). Instead it is set/cleared in
++                       * hv_synic_enable_regs() and hv_synic_disable_regs()
++                       * such that it is set only when the CPU is online. If
++                       * not all present CPUs are online, the message page
++                       * might be NULL, so skip such CPUs.
++                       */
+                       page_addr = hv_cpu->synic_message_page;
++                      if (!page_addr)
++                              continue;
++
+                       msg = (struct hv_message *)page_addr
+                               + VMBUS_MESSAGE_SINT;
+@@ -867,11 +878,14 @@ completed:
+        * maybe-pending messages on all CPUs to be able to receive new
+        * messages after we reconnect.
+        */
+-      for_each_online_cpu(cpu) {
++      for_each_present_cpu(cpu) {
+               struct hv_per_cpu_context *hv_cpu
+                       = per_cpu_ptr(hv_context.cpu_context, cpu);
+               page_addr = hv_cpu->synic_message_page;
++              if (!page_addr)
++                      continue;
++
+               msg = (struct hv_message *)page_addr + VMBUS_MESSAGE_SINT;
+               msg->header.message_type = HVMSG_NONE;
+       }
diff --git a/queue-6.3/io_uring-net-clear-msg_controllen-on-partial-sendmsg-retry.patch b/queue-6.3/io_uring-net-clear-msg_controllen-on-partial-sendmsg-retry.patch
new file mode 100644 (file)
index 0000000..53b0dc3
--- /dev/null
@@ -0,0 +1,33 @@
+From b1dc492087db0f2e5a45f1072a743d04618dd6be Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@kernel.dk>
+Date: Mon, 19 Jun 2023 09:35:34 -0600
+Subject: io_uring/net: clear msg_controllen on partial sendmsg retry
+
+From: Jens Axboe <axboe@kernel.dk>
+
+commit b1dc492087db0f2e5a45f1072a743d04618dd6be upstream.
+
+If we have cmsg attached AND we transferred partial data at least, clear
+msg_controllen on retry so we don't attempt to send that again.
+
+Cc: stable@vger.kernel.org # 5.10+
+Fixes: cac9e4418f4c ("io_uring/net: save msghdr->msg_control for retries")
+Reported-by: Stefan Metzmacher <metze@samba.org>
+Reviewed-by: Stefan Metzmacher <metze@samba.org>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ io_uring/net.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/io_uring/net.c
++++ b/io_uring/net.c
+@@ -326,6 +326,8 @@ int io_sendmsg(struct io_kiocb *req, uns
+               if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK))
+                       return io_setup_async_msg(req, kmsg, issue_flags);
+               if (ret > 0 && io_net_retry(sock, flags)) {
++                      kmsg->msg.msg_controllen = 0;
++                      kmsg->msg.msg_control = NULL;
+                       sr->done_io += ret;
+                       req->flags |= REQ_F_PARTIAL_IO;
+                       return io_setup_async_msg(req, kmsg, issue_flags);
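For context, msg_controllen describes the ancillary-data (cmsg) buffer attached to a sendmsg() call; the fix above clears it on a partial-send retry so the same control block is not handed to the kernel twice. A small, self-contained userspace example of what such a request looks like: an ordinary sendmsg() over a socketpair passing a file descriptor via SCM_RIGHTS (names and sizes are purely illustrative):

```c
#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	int sv[2], fd_to_pass = STDIN_FILENO;
	char data = 'x';
	struct iovec iov = { .iov_base = &data, .iov_len = 1 };
	union {				/* properly aligned cmsg buffer */
		char buf[CMSG_SPACE(sizeof(int))];
		struct cmsghdr align;
	} u;
	struct msghdr msg = {
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = u.buf,
		.msg_controllen = sizeof(u.buf),  /* the field cleared on retry */
	};
	struct cmsghdr *cmsg;

	if (socketpair(AF_UNIX, SOCK_STREAM, 0, sv) < 0)
		return 1;

	cmsg = CMSG_FIRSTHDR(&msg);
	cmsg->cmsg_level = SOL_SOCKET;
	cmsg->cmsg_type = SCM_RIGHTS;
	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
	memcpy(CMSG_DATA(cmsg), &fd_to_pass, sizeof(int));

	if (sendmsg(sv[0], &msg, 0) < 0)
		perror("sendmsg");

	close(sv[0]);
	close(sv[1]);
	return 0;
}
```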
diff --git a/queue-6.3/io_uring-net-disable-partial-retries-for-recvmsg-with-cmsg.patch b/queue-6.3/io_uring-net-disable-partial-retries-for-recvmsg-with-cmsg.patch
new file mode 100644 (file)
index 0000000..79cace7
--- /dev/null
@@ -0,0 +1,53 @@
+From 78d0d2063bab954d19a1696feae4c7706a626d48 Mon Sep 17 00:00:00 2001
+From: Jens Axboe <axboe@kernel.dk>
+Date: Mon, 19 Jun 2023 09:41:05 -0600
+Subject: io_uring/net: disable partial retries for recvmsg with cmsg
+
+From: Jens Axboe <axboe@kernel.dk>
+
+commit 78d0d2063bab954d19a1696feae4c7706a626d48 upstream.
+
+We cannot sanely handle partial retries for recvmsg if we have cmsg
+attached. If we don't, then we'd just be overwriting the initial cmsg
+header on retries. Alternatively we could increment and handle this
+appropriately, but it doesn't seem worth the complication.
+
+Move the MSG_WAITALL check into the non-multishot case while at it,
+since MSG_WAITALL is explicitly disabled for multishot anyway.
+
+Link: https://lore.kernel.org/io-uring/0b0d4411-c8fd-4272-770b-e030af6919a0@kernel.dk/
+Cc: stable@vger.kernel.org # 5.10+
+Reported-by: Stefan Metzmacher <metze@samba.org>
+Reviewed-by: Stefan Metzmacher <metze@samba.org>
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ io_uring/net.c |   11 +++++++----
+ 1 file changed, 7 insertions(+), 4 deletions(-)
+
+--- a/io_uring/net.c
++++ b/io_uring/net.c
+@@ -789,16 +789,19 @@ retry_multishot:
+       flags = sr->msg_flags;
+       if (force_nonblock)
+               flags |= MSG_DONTWAIT;
+-      if (flags & MSG_WAITALL)
+-              min_ret = iov_iter_count(&kmsg->msg.msg_iter);
+       kmsg->msg.msg_get_inq = 1;
+-      if (req->flags & REQ_F_APOLL_MULTISHOT)
++      if (req->flags & REQ_F_APOLL_MULTISHOT) {
+               ret = io_recvmsg_multishot(sock, sr, kmsg, flags,
+                                          &mshot_finished);
+-      else
++      } else {
++              /* disable partial retry for recvmsg with cmsg attached */
++              if (flags & MSG_WAITALL && !kmsg->msg.msg_controllen)
++                      min_ret = iov_iter_count(&kmsg->msg.msg_iter);
++
+               ret = __sys_recvmsg_sock(sock, &kmsg->msg, sr->umsg,
+                                        kmsg->uaddr, flags);
++      }
+       if (ret < min_ret) {
+               if (ret == -EAGAIN && force_nonblock) {
diff --git a/queue-6.3/kvm-avoid-illegal-stage2-mapping-on-invalid-memory-slot.patch b/queue-6.3/kvm-avoid-illegal-stage2-mapping-on-invalid-memory-slot.patch
new file mode 100644 (file)
index 0000000..7028ac1
--- /dev/null
@@ -0,0 +1,120 @@
+From 2230f9e1171a2e9731422a14d1bbc313c0b719d1 Mon Sep 17 00:00:00 2001
+From: Gavin Shan <gshan@redhat.com>
+Date: Thu, 15 Jun 2023 15:42:59 +1000
+Subject: KVM: Avoid illegal stage2 mapping on invalid memory slot
+
+From: Gavin Shan <gshan@redhat.com>
+
+commit 2230f9e1171a2e9731422a14d1bbc313c0b719d1 upstream.
+
+We ran into a guest hang in the edk2 firmware when KSM is running on
+the host. The edk2 firmware waits for status 0x80 from QEMU's pflash
+device (TYPE_PFLASH_CFI01) during a sector erase or
+buffered write. The status is returned by reading the memory region of
+the pflash device and the read request should have been forwarded to QEMU
+and emulated by it. Unfortunately, the read request is covered by an
+illegal stage2 mapping when the guest hang issue occurs. The read request
+is completed with QEMU bypassed and wrong status is fetched. The edk2
+firmware runs into an infinite loop with the wrong status.
+
+The illegal stage2 mapping is populated due to same page sharing by KSM
+at (C) even though the associated memory slot has been marked as invalid at (B)
+when the memory slot is requested to be deleted. It's notable that the
+active and inactive memory slots can't be swapped when we're in the middle
+of kvm_mmu_notifier_change_pte() because kvm->mn_active_invalidate_count
+is elevated, and kvm_swap_active_memslots() will busy-loop until it reaches
+zero again. Besides, the swap from the active to the inactive memory
+slots is also avoided by holding &kvm->srcu in __kvm_handle_hva_range(),
+corresponding to synchronize_srcu_expedited() in kvm_swap_active_memslots().
+
+  CPU-A                    CPU-B
+  -----                    -----
+                           ioctl(kvm_fd, KVM_SET_USER_MEMORY_REGION)
+                           kvm_vm_ioctl_set_memory_region
+                           kvm_set_memory_region
+                           __kvm_set_memory_region
+                           kvm_set_memslot(kvm, old, NULL, KVM_MR_DELETE)
+                             kvm_invalidate_memslot
+                               kvm_copy_memslot
+                               kvm_replace_memslot
+                               kvm_swap_active_memslots        (A)
+                               kvm_arch_flush_shadow_memslot   (B)
+  same page sharing by KSM
+  kvm_mmu_notifier_invalidate_range_start
+        :
+  kvm_mmu_notifier_change_pte
+    kvm_handle_hva_range
+    __kvm_handle_hva_range
+    kvm_set_spte_gfn            (C)
+        :
+  kvm_mmu_notifier_invalidate_range_end
+
+Fix the issue by skipping the invalid memory slot at (C) to avoid the
+illegal stage2 mapping so that the read request for the pflash's status
+is forwarded to QEMU and emulated by it. In this way, the correct pflash's
+status can be returned from QEMU to break the infinite loop in the edk2
+firmware.
+
+We tried a git-bisect and the first problematic commit is cd4c71835228 ("
+KVM: arm64: Convert to the gfn-based MMU notifier callbacks"). With this,
+clean_dcache_guest_page() is called after the memory slots are iterated
+in kvm_mmu_notifier_change_pte(); before that commit it was called
+before the memory slots were iterated. This change effectively
+enlarges the race window between kvm_mmu_notifier_change_pte()
+and memory slot removal so that we're able to reproduce the issue in a
+practical test case. However, the issue exists since commit d5d8184d35c9
+("KVM: ARM: Memory virtualization setup").
+
+Cc: stable@vger.kernel.org # v3.9+
+Fixes: d5d8184d35c9 ("KVM: ARM: Memory virtualization setup")
+Reported-by: Shuai Hu <hshuai@redhat.com>
+Reported-by: Zhenyu Zhang <zhenyzha@redhat.com>
+Signed-off-by: Gavin Shan <gshan@redhat.com>
+Reviewed-by: David Hildenbrand <david@redhat.com>
+Reviewed-by: Oliver Upton <oliver.upton@linux.dev>
+Reviewed-by: Peter Xu <peterx@redhat.com>
+Reviewed-by: Sean Christopherson <seanjc@google.com>
+Reviewed-by: Shaoqin Huang <shahuang@redhat.com>
+Message-Id: <20230615054259.14911-1-gshan@redhat.com>
+Signed-off-by: Paolo Bonzini <pbonzini@redhat.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ virt/kvm/kvm_main.c |   20 +++++++++++++++++++-
+ 1 file changed, 19 insertions(+), 1 deletion(-)
+
+--- a/virt/kvm/kvm_main.c
++++ b/virt/kvm/kvm_main.c
+@@ -683,6 +683,24 @@ static __always_inline int kvm_handle_hv
+       return __kvm_handle_hva_range(kvm, &range);
+ }
++
++static bool kvm_change_spte_gfn(struct kvm *kvm, struct kvm_gfn_range *range)
++{
++      /*
++       * Skipping invalid memslots is correct if and only change_pte() is
++       * surrounded by invalidate_range_{start,end}(), which is currently
++       * guaranteed by the primary MMU.  If that ever changes, KVM needs to
++       * unmap the memslot instead of skipping the memslot to ensure that KVM
++       * doesn't hold references to the old PFN.
++       */
++      WARN_ON_ONCE(!READ_ONCE(kvm->mn_active_invalidate_count));
++
++      if (range->slot->flags & KVM_MEMSLOT_INVALID)
++              return false;
++
++      return kvm_set_spte_gfn(kvm, range);
++}
++
+ static void kvm_mmu_notifier_change_pte(struct mmu_notifier *mn,
+                                       struct mm_struct *mm,
+                                       unsigned long address,
+@@ -704,7 +722,7 @@ static void kvm_mmu_notifier_change_pte(
+       if (!READ_ONCE(kvm->mmu_invalidate_in_progress))
+               return;
+-      kvm_handle_hva_range(mn, address, address + 1, pte, kvm_set_spte_gfn);
++      kvm_handle_hva_range(mn, address, address + 1, pte, kvm_change_spte_gfn);
+ }
+ void kvm_mmu_invalidate_begin(struct kvm *kvm, unsigned long start,
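For reference, the right-hand column of the diagram above (memory-slot deletion) corresponds to the ordinary KVM userspace API: deleting a slot is just KVM_SET_USER_MEMORY_REGION with memory_size set to 0, which is what drives kvm_set_memslot(..., KVM_MR_DELETE). A minimal, hypothetical sketch of that path with error handling trimmed (slot number, addresses and sizes are arbitrary):

```c
#include <fcntl.h>
#include <linux/kvm.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	int kvm = open("/dev/kvm", O_RDWR | O_CLOEXEC);
	int vm = ioctl(kvm, KVM_CREATE_VM, 0);
	void *mem = mmap(NULL, 0x10000, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	struct kvm_userspace_memory_region region = {
		.slot = 0,
		.guest_phys_addr = 0x100000,
		.memory_size = 0x10000,
		.userspace_addr = (unsigned long)mem,
	};

	/* Create the slot... */
	ioctl(vm, KVM_SET_USER_MEMORY_REGION, &region);

	/*
	 * ...and delete it again: memory_size == 0 requests KVM_MR_DELETE,
	 * i.e. the kvm_invalidate_memslot()/kvm_arch_flush_shadow_memslot()
	 * sequence shown on the right-hand side of the diagram.
	 */
	region.memory_size = 0;
	ioctl(vm, KVM_SET_USER_MEMORY_REGION, &region);

	close(vm);
	close(kvm);
	return 0;
}
```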
diff --git a/queue-6.3/memfd-check-for-non-null-file_seals-in-memfd_create-syscall.patch b/queue-6.3/memfd-check-for-non-null-file_seals-in-memfd_create-syscall.patch
new file mode 100644 (file)
index 0000000..6925a81
--- /dev/null
@@ -0,0 +1,46 @@
+From 935d44acf621aa0688fef8312dec3e5940f38f4e Mon Sep 17 00:00:00 2001
+From: Roberto Sassu <roberto.sassu@huawei.com>
+Date: Wed, 7 Jun 2023 15:24:27 +0200
+Subject: memfd: check for non-NULL file_seals in memfd_create() syscall
+
+From: Roberto Sassu <roberto.sassu@huawei.com>
+
+commit 935d44acf621aa0688fef8312dec3e5940f38f4e upstream.
+
+Ensure that file_seals is non-NULL before using it in the memfd_create()
+syscall.  One situation in which memfd_file_seals_ptr() can return a
+NULL pointer is when CONFIG_SHMEM=n, which would oops the kernel.
+
+Link: https://lkml.kernel.org/r/20230607132427.2867435-1-roberto.sassu@huaweicloud.com
+Fixes: 47b9012ecdc7 ("shmem: add sealing support to hugetlb-backed memfd")
+Signed-off-by: Roberto Sassu <roberto.sassu@huawei.com>
+Cc: Marc-André Lureau <marcandre.lureau@redhat.com>
+Cc: Mike Kravetz <mike.kravetz@oracle.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/memfd.c |    9 ++++++---
+ 1 file changed, 6 insertions(+), 3 deletions(-)
+
+--- a/mm/memfd.c
++++ b/mm/memfd.c
+@@ -375,12 +375,15 @@ SYSCALL_DEFINE2(memfd_create,
+               inode->i_mode &= ~0111;
+               file_seals = memfd_file_seals_ptr(file);
+-              *file_seals &= ~F_SEAL_SEAL;
+-              *file_seals |= F_SEAL_EXEC;
++              if (file_seals) {
++                      *file_seals &= ~F_SEAL_SEAL;
++                      *file_seals |= F_SEAL_EXEC;
++              }
+       } else if (flags & MFD_ALLOW_SEALING) {
+               /* MFD_EXEC and MFD_ALLOW_SEALING are set */
+               file_seals = memfd_file_seals_ptr(file);
+-              *file_seals &= ~F_SEAL_SEAL;
++              if (file_seals)
++                      *file_seals &= ~F_SEAL_SEAL;
+       }
+       fd_install(fd, file);
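For context, the syscall being hardened here is reachable from ordinary userspace, and whether a seals word exists at all depends on the backing implementation (shmem vs. hugetlbfs, and CONFIG_SHMEM). A small, self-contained example of creating a sealable memfd and manipulating its seals, purely to illustrate the interface involved:

```c
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	/* MFD_ALLOW_SEALING is the branch that clears F_SEAL_SEAL above. */
	int fd = memfd_create("demo", MFD_CLOEXEC | MFD_ALLOW_SEALING);
	int seals;

	if (fd < 0) {
		perror("memfd_create");
		return 1;
	}

	seals = fcntl(fd, F_GET_SEALS);
	printf("initial seals: 0x%x\n", seals);

	/* Example: forbid any future growth or shrinking of the file. */
	if (fcntl(fd, F_ADD_SEALS, F_SEAL_GROW | F_SEAL_SHRINK) < 0)
		perror("F_ADD_SEALS");

	close(fd);
	return 0;
}
```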
diff --git a/queue-6.3/mm-mprotect-fix-do_mprotect_pkey-limit-check.patch b/queue-6.3/mm-mprotect-fix-do_mprotect_pkey-limit-check.patch
new file mode 100644 (file)
index 0000000..6c4b28f
--- /dev/null
@@ -0,0 +1,39 @@
+From 77795f900e2a07c1cbedc375789aefb43843b6c2 Mon Sep 17 00:00:00 2001
+From: "Liam R. Howlett" <Liam.Howlett@oracle.com>
+Date: Tue, 6 Jun 2023 14:29:12 -0400
+Subject: mm/mprotect: fix do_mprotect_pkey() limit check
+
+From: Liam R. Howlett <Liam.Howlett@oracle.com>
+
+commit 77795f900e2a07c1cbedc375789aefb43843b6c2 upstream.
+
+The return of do_mprotect_pkey() can still be incorrectly returned as
+success if there is a gap that spans to or beyond the end address passed
+in.  Update the check to ensure that the end address has indeed been seen.
+
+Link: https://lore.kernel.org/all/CABi2SkXjN+5iFoBhxk71t3cmunTk-s=rB4T7qo0UQRh17s49PQ@mail.gmail.com/
+Link: https://lkml.kernel.org/r/20230606182912.586576-1-Liam.Howlett@oracle.com
+Fixes: 82f951340f25 ("mm/mprotect: fix do_mprotect_pkey() return on error")
+Signed-off-by: Liam R. Howlett <Liam.Howlett@oracle.com>
+Reported-by: Jeff Xu <jeffxu@chromium.org>
+Reviewed-by: Lorenzo Stoakes <lstoakes@gmail.com>
+Acked-by: David Hildenbrand <david@redhat.com>
+Acked-by: Vlastimil Babka <vbabka@suse.cz>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/mprotect.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/mm/mprotect.c
++++ b/mm/mprotect.c
+@@ -838,7 +838,7 @@ static int do_mprotect_pkey(unsigned lon
+       }
+       tlb_finish_mmu(&tlb);
+-      if (!error && vma_iter_end(&vmi) < end)
++      if (!error && tmp < end)
+               error = -ENOMEM;
+ out:
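The behaviour being restored here is visible from userspace: mprotect() must fail with ENOMEM when the requested range contains an unmapped gap, including a gap that runs up to or past the end address. A small, self-contained demonstration (not taken from the patch; it simply exercises the documented semantics):

```c
#include <errno.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
	long page = sysconf(_SC_PAGESIZE);
	char *base = mmap(NULL, 3 * page, PROT_READ | PROT_WRITE,
			  MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (base == MAP_FAILED)
		return 1;

	/* Punch a hole covering the last two pages of the mapping. */
	munmap(base + page, 2 * page);

	/*
	 * The range now has a gap that spans to the end address, so this
	 * call must return -1 with errno == ENOMEM rather than succeed.
	 */
	if (mprotect(base, 3 * page, PROT_READ) == -1)
		printf("mprotect failed as expected: errno=%d (ENOMEM=%d)\n",
		       errno, ENOMEM);
	else
		printf("unexpected success\n");

	return 0;
}
```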
diff --git a/queue-6.3/mm-vmalloc-do-not-output-a-spurious-warning-when-huge-vmalloc-fails.patch b/queue-6.3/mm-vmalloc-do-not-output-a-spurious-warning-when-huge-vmalloc-fails.patch
new file mode 100644 (file)
index 0000000..e35e20f
--- /dev/null
@@ -0,0 +1,69 @@
+From 95a301eefa82057571207edd06ea36218985a75e Mon Sep 17 00:00:00 2001
+From: Lorenzo Stoakes <lstoakes@gmail.com>
+Date: Mon, 5 Jun 2023 21:11:07 +0100
+Subject: mm/vmalloc: do not output a spurious warning when huge vmalloc() fails
+
+From: Lorenzo Stoakes <lstoakes@gmail.com>
+
+commit 95a301eefa82057571207edd06ea36218985a75e upstream.
+
+In __vmalloc_area_node() we always warn_alloc() when an allocation
+performed by vm_area_alloc_pages() fails unless it was due to a pending
+fatal signal.
+
+However, huge page allocations instigated either by vmalloc_huge() or
+__vmalloc_node_range() (or a caller that invokes this like kvmalloc() or
+kvmalloc_node()) always fall back to order-0 allocations if the huge page
+allocation fails.
+
+This renders the warning useless and noisy, especially as all callers
+appear to be aware that this may fall back.  This has already resulted in
+at least one bug report from a user who was confused by this (see link).
+
+Therefore, simply update the code to only output this warning for order-0
+pages when no fatal signal is pending.
+
+Link: https://bugzilla.suse.com/show_bug.cgi?id=1211410
+Link: https://lkml.kernel.org/r/20230605201107.83298-1-lstoakes@gmail.com
+Fixes: 80b1d8fdfad1 ("mm: vmalloc: correct use of __GFP_NOWARN mask in __vmalloc_area_node()")
+Signed-off-by: Lorenzo Stoakes <lstoakes@gmail.com>
+Acked-by: Vlastimil Babka <vbabka@suse.cz>
+Reviewed-by: Baoquan He <bhe@redhat.com>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Reviewed-by: Uladzislau Rezki (Sony) <urezki@gmail.com>
+Reviewed-by: David Hildenbrand <david@redhat.com>
+Cc: Christoph Hellwig <hch@infradead.org>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/vmalloc.c |   17 +++++++++++++----
+ 1 file changed, 13 insertions(+), 4 deletions(-)
+
+--- a/mm/vmalloc.c
++++ b/mm/vmalloc.c
+@@ -3046,11 +3046,20 @@ static void *__vmalloc_area_node(struct
+        * allocation request, free them via vfree() if any.
+        */
+       if (area->nr_pages != nr_small_pages) {
+-              /* vm_area_alloc_pages() can also fail due to a fatal signal */
+-              if (!fatal_signal_pending(current))
++              /*
++               * vm_area_alloc_pages() can fail due to insufficient memory but
++               * also:-
++               *
++               * - a pending fatal signal
++               * - insufficient huge page-order pages
++               *
++               * Since we always retry allocations at order-0 in the huge page
++               * case a warning for either is spurious.
++               */
++              if (!fatal_signal_pending(current) && page_order == 0)
+                       warn_alloc(gfp_mask, NULL,
+-                              "vmalloc error: size %lu, page order %u, failed to allocate pages",
+-                              area->nr_pages * PAGE_SIZE, page_order);
++                              "vmalloc error: size %lu, failed to allocate pages",
++                              area->nr_pages * PAGE_SIZE);
+               goto fail;
+       }
diff --git a/queue-6.3/mmc-bcm2835-fix-deferred-probing.patch b/queue-6.3/mmc-bcm2835-fix-deferred-probing.patch
new file mode 100644 (file)
index 0000000..c025e7a
--- /dev/null
@@ -0,0 +1,39 @@
+From 71150ac12558bcd9d75e6e24cf7c872c2efd80f3 Mon Sep 17 00:00:00 2001
+From: Sergey Shtylyov <s.shtylyov@omp.ru>
+Date: Sat, 17 Jun 2023 23:36:11 +0300
+Subject: mmc: bcm2835: fix deferred probing
+
+From: Sergey Shtylyov <s.shtylyov@omp.ru>
+
+commit 71150ac12558bcd9d75e6e24cf7c872c2efd80f3 upstream.
+
+The driver overrides the error codes and IRQ0 returned by platform_get_irq()
+to -EINVAL, so if it returns -EPROBE_DEFER, the driver will fail the probe
+permanently instead of deferring it. Switch to propagating the error
+codes upstream.  Since commit ce753ad1549c ("platform: finally disallow IRQ0
+in platform_get_irq() and its ilk") IRQ0 is no longer returned by those APIs,
+so we now can safely ignore it...
+
+Fixes: 660fc733bd74 ("mmc: bcm2835: Add new driver for the sdhost controller.")
+Cc: stable@vger.kernel.org # v5.19+
+Signed-off-by: Sergey Shtylyov <s.shtylyov@omp.ru>
+Link: https://lore.kernel.org/r/20230617203622.6812-2-s.shtylyov@omp.ru
+Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/mmc/host/bcm2835.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/drivers/mmc/host/bcm2835.c
++++ b/drivers/mmc/host/bcm2835.c
+@@ -1403,8 +1403,8 @@ static int bcm2835_probe(struct platform
+       host->max_clk = clk_get_rate(clk);
+       host->irq = platform_get_irq(pdev, 0);
+-      if (host->irq <= 0) {
+-              ret = -EINVAL;
++      if (host->irq < 0) {
++              ret = host->irq;
+               goto err;
+       }
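The same reasoning applies to the other deferred-probing fixes in this batch (meson-gx, sdhci-spear, sunxi): since commit ce753ad1549c, platform_get_irq() returns either a valid non-zero IRQ number or a negative errno, so a probe routine should just propagate negative values. A hypothetical, kernel-style sketch of the idiom (demo_probe and its device are invented for illustration):

```c
#include <linux/platform_device.h>

static int demo_probe(struct platform_device *pdev)
{
	int irq = platform_get_irq(pdev, 0);

	/*
	 * Propagate the error as-is: -EPROBE_DEFER tells the driver core
	 * to retry the probe later, whereas overriding it with -EINVAL
	 * would fail the probe permanently.
	 */
	if (irq < 0)
		return irq;

	/* ... request the IRQ and finish initialization here ... */
	return 0;
}
```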
diff --git a/queue-6.3/mmc-litex_mmc-set-probe_prefer_asynchronous.patch b/queue-6.3/mmc-litex_mmc-set-probe_prefer_asynchronous.patch
new file mode 100644 (file)
index 0000000..d53ee05
--- /dev/null
@@ -0,0 +1,36 @@
+From f334ad47683606b682b4166b800d8b372d315436 Mon Sep 17 00:00:00 2001
+From: Jisheng Zhang <jszhang@kernel.org>
+Date: Sat, 17 Jun 2023 16:53:19 +0800
+Subject: mmc: litex_mmc: set PROBE_PREFER_ASYNCHRONOUS
+
+From: Jisheng Zhang <jszhang@kernel.org>
+
+commit f334ad47683606b682b4166b800d8b372d315436 upstream.
+
+mmc host drivers should have enabled the asynchronous probe option, but
+it seems like we didn't set it for litex_mmc when introducing litex mmc
+support, so let's set it now.
+
+Tested with linux-on-litex-vexriscv on sipeed tang nano 20K fpga.
+
+Signed-off-by: Jisheng Zhang <jszhang@kernel.org>
+Acked-by: Gabriel Somlo <gsomlo@gmail.com>
+Fixes: 92e099104729 ("mmc: Add driver for LiteX's LiteSDCard interface")
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20230617085319.2139-1-jszhang@kernel.org
+Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/mmc/host/litex_mmc.c |    1 +
+ 1 file changed, 1 insertion(+)
+
+--- a/drivers/mmc/host/litex_mmc.c
++++ b/drivers/mmc/host/litex_mmc.c
+@@ -649,6 +649,7 @@ static struct platform_driver litex_mmc_
+       .driver = {
+               .name = "litex-mmc",
+               .of_match_table = litex_match,
++              .probe_type = PROBE_PREFER_ASYNCHRONOUS,
+       },
+ };
+ module_platform_driver(litex_mmc_driver);
diff --git a/queue-6.3/mmc-meson-gx-fix-deferred-probing.patch b/queue-6.3/mmc-meson-gx-fix-deferred-probing.patch
new file mode 100644 (file)
index 0000000..a6cd0be
--- /dev/null
@@ -0,0 +1,40 @@
+From b8ada54fa1b83f3b6480d4cced71354301750153 Mon Sep 17 00:00:00 2001
+From: Sergey Shtylyov <s.shtylyov@omp.ru>
+Date: Sat, 17 Jun 2023 23:36:12 +0300
+Subject: mmc: meson-gx: fix deferred probing
+
+From: Sergey Shtylyov <s.shtylyov@omp.ru>
+
+commit b8ada54fa1b83f3b6480d4cced71354301750153 upstream.
+
+The driver overrides the error codes and IRQ0 returned by platform_get_irq()
+to -EINVAL, so if it returns -EPROBE_DEFER, the driver will fail the probe
+permanently instead of deferring it. Switch to propagating the error
+codes upstream.  Since commit ce753ad1549c ("platform: finally disallow IRQ0
+in platform_get_irq() and its ilk") IRQ0 is no longer returned by those APIs,
+so we now can safely ignore it...
+
+Fixes: cbcaac6d7dd2 ("mmc: meson-gx-mmc: Fix platform_get_irq's error checking")
+Cc: stable@vger.kernel.org # v5.19+
+Signed-off-by: Sergey Shtylyov <s.shtylyov@omp.ru>
+Reviewed-by: Neil Armstrong <neil.armstrong@linaro.org>
+Link: https://lore.kernel.org/r/20230617203622.6812-3-s.shtylyov@omp.ru
+Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/mmc/host/meson-gx-mmc.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/drivers/mmc/host/meson-gx-mmc.c
++++ b/drivers/mmc/host/meson-gx-mmc.c
+@@ -1202,8 +1202,8 @@ static int meson_mmc_probe(struct platfo
+               return PTR_ERR(host->regs);
+       host->irq = platform_get_irq(pdev, 0);
+-      if (host->irq <= 0)
+-              return -EINVAL;
++      if (host->irq < 0)
++              return host->irq;
+       cd_irq = platform_get_irq_optional(pdev, 1);
+       mmc_gpio_set_cd_irq(mmc, cd_irq);
diff --git a/queue-6.3/mmc-meson-gx-remove-redundant-mmc_request_done-call-from-irq-context.patch b/queue-6.3/mmc-meson-gx-remove-redundant-mmc_request_done-call-from-irq-context.patch
new file mode 100644 (file)
index 0000000..54155ca
--- /dev/null
@@ -0,0 +1,101 @@
+From 3c40eb8145325b0f5b93b8a169146078cb2c49d6 Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Martin=20Hundeb=C3=B8ll?= <martin@geanix.com>
+Date: Wed, 7 Jun 2023 10:27:12 +0200
+Subject: mmc: meson-gx: remove redundant mmc_request_done() call from irq context
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Martin Hundebøll <martin@geanix.com>
+
+commit 3c40eb8145325b0f5b93b8a169146078cb2c49d6 upstream.
+
+The call to mmc_request_done() can schedule, so it must not be called
+from irq context. Wake the irq thread if it needs to be called, and let
+its existing logic do its work.
+
+Fixes the following kernel bug, which appears when running an RT patched
+kernel on the AmLogic Meson AXG A113X SoC:
+[   11.111407] BUG: scheduling while atomic: kworker/0:1H/75/0x00010001
+[   11.111438] Modules linked in:
+[   11.111451] CPU: 0 PID: 75 Comm: kworker/0:1H Not tainted 6.4.0-rc3-rt2-rtx-00081-gfd07f41ed6b4-dirty #1
+[   11.111461] Hardware name: RTX AXG A113X Linux Platform Board (DT)
+[   11.111469] Workqueue: kblockd blk_mq_run_work_fn
+[   11.111492] Call trace:
+[   11.111497]  dump_backtrace+0xac/0xe8
+[   11.111510]  show_stack+0x18/0x28
+[   11.111518]  dump_stack_lvl+0x48/0x60
+[   11.111530]  dump_stack+0x18/0x24
+[   11.111537]  __schedule_bug+0x4c/0x68
+[   11.111548]  __schedule+0x80/0x574
+[   11.111558]  schedule_loop+0x2c/0x50
+[   11.111567]  schedule_rtlock+0x14/0x20
+[   11.111576]  rtlock_slowlock_locked+0x468/0x730
+[   11.111587]  rt_spin_lock+0x40/0x64
+[   11.111596]  __wake_up_common_lock+0x5c/0xc4
+[   11.111610]  __wake_up+0x18/0x24
+[   11.111620]  mmc_blk_mq_req_done+0x68/0x138
+[   11.111633]  mmc_request_done+0x104/0x118
+[   11.111644]  meson_mmc_request_done+0x38/0x48
+[   11.111654]  meson_mmc_irq+0x128/0x1f0
+[   11.111663]  __handle_irq_event_percpu+0x70/0x114
+[   11.111674]  handle_irq_event_percpu+0x18/0x4c
+[   11.111683]  handle_irq_event+0x80/0xb8
+[   11.111691]  handle_fasteoi_irq+0xa4/0x120
+[   11.111704]  handle_irq_desc+0x20/0x38
+[   11.111712]  generic_handle_domain_irq+0x1c/0x28
+[   11.111721]  gic_handle_irq+0x8c/0xa8
+[   11.111735]  call_on_irq_stack+0x24/0x4c
+[   11.111746]  do_interrupt_handler+0x88/0x94
+[   11.111757]  el1_interrupt+0x34/0x64
+[   11.111769]  el1h_64_irq_handler+0x18/0x24
+[   11.111779]  el1h_64_irq+0x64/0x68
+[   11.111786]  __add_wait_queue+0x0/0x4c
+[   11.111795]  mmc_blk_rw_wait+0x84/0x118
+[   11.111804]  mmc_blk_mq_issue_rq+0x5c4/0x654
+[   11.111814]  mmc_mq_queue_rq+0x194/0x214
+[   11.111822]  blk_mq_dispatch_rq_list+0x3ac/0x528
+[   11.111834]  __blk_mq_sched_dispatch_requests+0x340/0x4d0
+[   11.111847]  blk_mq_sched_dispatch_requests+0x38/0x70
+[   11.111858]  blk_mq_run_work_fn+0x3c/0x70
+[   11.111865]  process_one_work+0x17c/0x1f0
+[   11.111876]  worker_thread+0x1d4/0x26c
+[   11.111885]  kthread+0xe4/0xf4
+[   11.111894]  ret_from_fork+0x10/0x20
+
+Fixes: 51c5d8447bd7 ("MMC: meson: initial support for GX platforms")
+Cc: stable@vger.kernel.org
+Signed-off-by: Martin Hundebøll <martin@geanix.com>
+Link: https://lore.kernel.org/r/20230607082713.517157-1-martin@geanix.com
+Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/mmc/host/meson-gx-mmc.c |   10 ++--------
+ 1 file changed, 2 insertions(+), 8 deletions(-)
+
+--- a/drivers/mmc/host/meson-gx-mmc.c
++++ b/drivers/mmc/host/meson-gx-mmc.c
+@@ -1006,11 +1006,8 @@ static irqreturn_t meson_mmc_irq(int irq
+               if (data && !cmd->error)
+                       data->bytes_xfered = data->blksz * data->blocks;
+-              if (meson_mmc_bounce_buf_read(data) ||
+-                  meson_mmc_get_next_command(cmd))
+-                      ret = IRQ_WAKE_THREAD;
+-              else
+-                      ret = IRQ_HANDLED;
++
++              return IRQ_WAKE_THREAD;
+       }
+ out:
+@@ -1022,9 +1019,6 @@ out:
+               writel(start, host->regs + SD_EMMC_START);
+       }
+-      if (ret == IRQ_HANDLED)
+-              meson_mmc_request_done(host->mmc, cmd->mrq);
+-
+       return ret;
+ }
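The underlying idiom is the split hard/threaded interrupt handler: the hard handler does only non-sleeping work and returns IRQ_WAKE_THREAD whenever something that may sleep (here, mmc_request_done() and the wake-ups behind it) still has to run. A generic, hypothetical sketch of that pattern (the demo_* names are invented and this is not the meson-gx code):

```c
#include <linux/device.h>
#include <linux/interrupt.h>

static irqreturn_t demo_hardirq(int irq, void *dev_id)
{
	/* Acknowledge/collect hardware status only; no sleeping calls here. */
	return IRQ_WAKE_THREAD;
}

static irqreturn_t demo_thread_fn(int irq, void *dev_id)
{
	/*
	 * Runs in process context, so work that may sleep (completing the
	 * request, waking waiters) is allowed here.
	 */
	return IRQ_HANDLED;
}

static int demo_setup(struct device *dev, int irq, void *priv)
{
	return devm_request_threaded_irq(dev, irq, demo_hardirq,
					 demo_thread_fn, 0, "demo", priv);
}
```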
diff --git a/queue-6.3/mmc-mmci-stm32-fix-max-busy-timeout-calculation.patch b/queue-6.3/mmc-mmci-stm32-fix-max-busy-timeout-calculation.patch
new file mode 100644 (file)
index 0000000..9bdc566
--- /dev/null
@@ -0,0 +1,36 @@
+From 47b3ad6b7842f49d374a01b054a4b1461a621bdc Mon Sep 17 00:00:00 2001
+From: Christophe Kerello <christophe.kerello@foss.st.com>
+Date: Tue, 13 Jun 2023 15:41:46 +0200
+Subject: mmc: mmci: stm32: fix max busy timeout calculation
+
+From: Christophe Kerello <christophe.kerello@foss.st.com>
+
+commit 47b3ad6b7842f49d374a01b054a4b1461a621bdc upstream.
+
+The way that the timeout is currently calculated could lead to a u64
+timeout value in mmci_start_command(). This value is then truncated into
+a u32 register, which leads to MMC erase failures with some SD cards.
+
+Fixes: 8266c585f489 ("mmc: mmci: add hardware busy timeout feature")
+Signed-off-by: Yann Gautier <yann.gautier@foss.st.com>
+Signed-off-by: Christophe Kerello <christophe.kerello@foss.st.com>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20230613134146.418016-1-yann.gautier@foss.st.com
+Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/mmc/host/mmci.c |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/drivers/mmc/host/mmci.c
++++ b/drivers/mmc/host/mmci.c
+@@ -1735,7 +1735,8 @@ static void mmci_set_max_busy_timeout(st
+               return;
+       if (host->variant->busy_timeout && mmc->actual_clock)
+-              max_busy_timeout = ~0UL / (mmc->actual_clock / MSEC_PER_SEC);
++              max_busy_timeout = U32_MAX / DIV_ROUND_UP(mmc->actual_clock,
++                                                        MSEC_PER_SEC);
+       mmc->max_busy_timeout = max_busy_timeout;
+ }
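As an aside, the overflow is easy to reproduce outside the kernel. The stand-alone C program below is an illustration only: the 400 kHz clock is an assumed example value, and MSEC_PER_SEC/DIV_ROUND_UP are redefined locally to mirror the kernel macros. It shows how the old formula can exceed what fits in the 32-bit busy-timeout register while the fixed one stays bounded.

/*
 * Illustration only, not driver code: compare the old and new
 * max-busy-timeout formulas from the patch above.
 */
#include <stdint.h>
#include <stdio.h>

#define MSEC_PER_SEC		1000UL
#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

int main(void)
{
	unsigned long actual_clock = 400000;	/* assumed example clock in Hz */

	/* Old formula: ~0UL is 64 bits wide on LP64, so the result can
	 * exceed what fits in the 32-bit busy-timeout register. */
	unsigned long old_max = ~0UL / (actual_clock / MSEC_PER_SEC);

	/* New formula: bounded by U32_MAX; DIV_ROUND_UP also avoids a
	 * divide by zero for clocks below 1 kHz. */
	uint32_t new_max = UINT32_MAX / DIV_ROUND_UP(actual_clock, MSEC_PER_SEC);

	printf("old: %lu ms (truncated to u32: %u ms)\n",
	       old_max, (unsigned int)old_max);
	printf("new: %u ms\n", (unsigned int)new_max);
	return 0;
}

On an LP64 build the "old" value is far above U32_MAX, which is what ended up truncated into the register.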
diff --git a/queue-6.3/mmc-sdhci-msm-disable-broken-64-bit-dma-on-msm8916.patch b/queue-6.3/mmc-sdhci-msm-disable-broken-64-bit-dma-on-msm8916.patch
new file mode 100644 (file)
index 0000000..187c0e1
--- /dev/null
@@ -0,0 +1,73 @@
+From e6f9e590b72e12bbb86b1b8be7e1981f357392ad Mon Sep 17 00:00:00 2001
+From: Stephan Gerhold <stephan@gerhold.net>
+Date: Thu, 18 May 2023 11:39:36 +0200
+Subject: mmc: sdhci-msm: Disable broken 64-bit DMA on MSM8916
+
+From: Stephan Gerhold <stephan@gerhold.net>
+
+commit e6f9e590b72e12bbb86b1b8be7e1981f357392ad upstream.
+
+While SDHCI claims to support 64-bit DMA on MSM8916 it does not seem to
+be properly functional. It is not immediately obvious because SDHCI is
+usually used with IOMMU bypassed on this SoC, and all physical memory
+has 32-bit addresses. But when trying to enable the IOMMU it quickly
+fails with an error such as the following:
+
+  arm-smmu 1e00000.iommu: Unhandled context fault:
+    fsr=0x402, iova=0xfffff200, fsynr=0xe0000, cbfrsynra=0x140, cb=3
+  mmc1: ADMA error: 0x02000000
+  mmc1: sdhci: ============ SDHCI REGISTER DUMP ===========
+  mmc1: sdhci: Sys addr:  0x00000000 | Version:  0x00002e02
+  mmc1: sdhci: Blk size:  0x00000008 | Blk cnt:  0x00000000
+  mmc1: sdhci: Argument:  0x00000000 | Trn mode: 0x00000013
+  mmc1: sdhci: Present:   0x03f80206 | Host ctl: 0x00000019
+  mmc1: sdhci: Power:     0x0000000f | Blk gap:  0x00000000
+  mmc1: sdhci: Wake-up:   0x00000000 | Clock:    0x00000007
+  mmc1: sdhci: Timeout:   0x0000000a | Int stat: 0x00000001
+  mmc1: sdhci: Int enab:  0x03ff900b | Sig enab: 0x03ff100b
+  mmc1: sdhci: ACmd stat: 0x00000000 | Slot int: 0x00000000
+  mmc1: sdhci: Caps:      0x322dc8b2 | Caps_1:   0x00008007
+  mmc1: sdhci: Cmd:       0x0000333a | Max curr: 0x00000000
+  mmc1: sdhci: Resp[0]:   0x00000920 | Resp[1]:  0x5b590000
+  mmc1: sdhci: Resp[2]:   0xe6487f80 | Resp[3]:  0x0a404094
+  mmc1: sdhci: Host ctl2: 0x00000008
+  mmc1: sdhci: ADMA Err:  0x00000001 | ADMA Ptr: 0x0000000ffffff224
+  mmc1: sdhci_msm: ----------- VENDOR REGISTER DUMP -----------
+  mmc1: sdhci_msm: DLL sts: 0x00000000 | DLL cfg:  0x60006400 | DLL cfg2: 0x00000000
+  mmc1: sdhci_msm: DLL cfg3: 0x00000000 | DLL usr ctl:  0x00000000 | DDR cfg: 0x00000000
+  mmc1: sdhci_msm: Vndr func: 0x00018a9c | Vndr func2 : 0xf88018a8 Vndr func3: 0x00000000
+  mmc1: sdhci: ============================================
+  mmc1: sdhci: fffffffff200: DMA 0x0000ffffffffe100, LEN 0x0008, Attr=0x21
+  mmc1: sdhci: fffffffff20c: DMA 0x0000000000000000, LEN 0x0000, Attr=0x03
+
+Looking closely, it's obvious that only the 32-bit part of the address
+(0xfffff200) arrives at the SMMU; the higher 16 bits (0xffff...) get
+lost somewhere. This might not be a limitation of the SDHCI itself but
+perhaps the bus/interconnect it is connected to, or even the connection
+to the SMMU.
+
+Work around this by setting SDHCI_QUIRK2_BROKEN_64_BIT_DMA to avoid
+using 64-bit addresses.
+
+Signed-off-by: Stephan Gerhold <stephan@gerhold.net>
+Acked-by: Adrian Hunter <adrian.hunter@intel.com>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20230518-msm8916-64bit-v1-1-5694b0f35211@gerhold.net
+Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/mmc/host/sdhci-msm.c |    3 +++
+ 1 file changed, 3 insertions(+)
+
+--- a/drivers/mmc/host/sdhci-msm.c
++++ b/drivers/mmc/host/sdhci-msm.c
+@@ -2479,6 +2479,9 @@ static inline void sdhci_msm_get_of_prop
+               msm_host->ddr_config = DDR_CONFIG_POR_VAL;
+       of_property_read_u32(node, "qcom,dll-config", &msm_host->dll_config);
++
++      if (of_device_is_compatible(node, "qcom,msm8916-sdhci"))
++              host->quirks2 |= SDHCI_QUIRK2_BROKEN_64_BIT_DMA;
+ }
+ static int sdhci_msm_gcc_reset(struct device *dev, struct sdhci_host *host)
diff --git a/queue-6.3/mmc-sdhci-spear-fix-deferred-probing.patch b/queue-6.3/mmc-sdhci-spear-fix-deferred-probing.patch
new file mode 100644 (file)
index 0000000..7b12a10
--- /dev/null
@@ -0,0 +1,41 @@
+From 8d0caeedcd05a721f3cc2537b0ea212ec4027307 Mon Sep 17 00:00:00 2001
+From: Sergey Shtylyov <s.shtylyov@omp.ru>
+Date: Sat, 17 Jun 2023 23:36:19 +0300
+Subject: mmc: sdhci-spear: fix deferred probing
+
+From: Sergey Shtylyov <s.shtylyov@omp.ru>
+
+commit 8d0caeedcd05a721f3cc2537b0ea212ec4027307 upstream.
+
+The driver overrides the error codes and IRQ0 returned by platform_get_irq()
+with -EINVAL, so if it returns -EPROBE_DEFER, the driver will fail the probe
+permanently instead of deferring it. Switch to propagating the error
+codes upstream.  Since commit ce753ad1549c ("platform: finally disallow IRQ0
+in platform_get_irq() and its ilk") IRQ0 is no longer returned by those APIs,
+so we can now safely drop the check for it.
+
+Fixes: 682798a596a6 ("mmc: sdhci-spear: Handle return value of platform_get_irq")
+Cc: stable@vger.kernel.org # v5.19+
+Signed-off-by: Sergey Shtylyov <s.shtylyov@omp.ru>
+Acked-by: Viresh Kumar <viresh.kumar@linaro.org>
+Acked-by: Adrian Hunter <adrian.hunter@intel.com>
+Link: https://lore.kernel.org/r/20230617203622.6812-10-s.shtylyov@omp.ru
+Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/mmc/host/sdhci-spear.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/drivers/mmc/host/sdhci-spear.c
++++ b/drivers/mmc/host/sdhci-spear.c
+@@ -65,8 +65,8 @@ static int sdhci_probe(struct platform_d
+       host->hw_name = "sdhci";
+       host->ops = &sdhci_pltfm_ops;
+       host->irq = platform_get_irq(pdev, 0);
+-      if (host->irq <= 0) {
+-              ret = -EINVAL;
++      if (host->irq < 0) {
++              ret = host->irq;
+               goto err_host;
+       }
+       host->quirks = SDHCI_QUIRK_BROKEN_ADMA;
diff --git a/queue-6.3/mmc-sunxi-fix-deferred-probing.patch b/queue-6.3/mmc-sunxi-fix-deferred-probing.patch
new file mode 100644 (file)
index 0000000..db0b5a2
--- /dev/null
@@ -0,0 +1,40 @@
+From c2df53c5806cfd746dae08e07bc8c4ad247c3b70 Mon Sep 17 00:00:00 2001
+From: Sergey Shtylyov <s.shtylyov@omp.ru>
+Date: Sat, 17 Jun 2023 23:36:21 +0300
+Subject: mmc: sunxi: fix deferred probing
+
+From: Sergey Shtylyov <s.shtylyov@omp.ru>
+
+commit c2df53c5806cfd746dae08e07bc8c4ad247c3b70 upstream.
+
+The driver overrides the error codes and IRQ0 returned by platform_get_irq()
+with -EINVAL, so if it returns -EPROBE_DEFER, the driver will fail the probe
+permanently instead of deferring it. Switch to propagating the error
+codes upstream.  Since commit ce753ad1549c ("platform: finally disallow IRQ0
+in platform_get_irq() and its ilk") IRQ0 is no longer returned by those APIs,
+so we can now safely drop the check for it.
+
+Fixes: 2408a08583d2 ("mmc: sunxi-mmc: Handle return value of platform_get_irq")
+Cc: stable@vger.kernel.org # v5.19+
+Signed-off-by: Sergey Shtylyov <s.shtylyov@omp.ru>
+Reviewed-by: Jernej Skrabec <jernej.skrabec@gmail.com>
+Link: https://lore.kernel.org/r/20230617203622.6812-12-s.shtylyov@omp.ru
+Signed-off-by: Ulf Hansson <ulf.hansson@linaro.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/mmc/host/sunxi-mmc.c |    4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+--- a/drivers/mmc/host/sunxi-mmc.c
++++ b/drivers/mmc/host/sunxi-mmc.c
+@@ -1350,8 +1350,8 @@ static int sunxi_mmc_resource_request(st
+               return ret;
+       host->irq = platform_get_irq(pdev, 0);
+-      if (host->irq <= 0) {
+-              ret = -EINVAL;
++      if (host->irq < 0) {
++              ret = host->irq;
+               goto error_disable_mmc;
+       }
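The two fixes above (sdhci-spear and sunxi) apply the same deferred-probing rule. A minimal hedged sketch of the pattern follows; the device and handler names are invented for illustration and are not taken from either driver.

/*
 * Hedged sketch of the deferred-probing pattern: pass platform_get_irq()'s
 * error code up unchanged so -EPROBE_DEFER keeps working; since commit
 * ce753ad1549c the helper never returns IRQ0, so no "== 0" check is needed.
 */
#include <linux/interrupt.h>
#include <linux/platform_device.h>

static irqreturn_t example_isr(int irq, void *data)
{
	return IRQ_HANDLED;
}

static int example_probe(struct platform_device *pdev)
{
	int irq = platform_get_irq(pdev, 0);

	if (irq < 0)
		return irq;	/* may be -EPROBE_DEFER; do not replace it */

	return devm_request_irq(&pdev->dev, irq, example_isr, 0,
				dev_name(&pdev->dev), NULL);
}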
diff --git a/queue-6.3/mptcp-consolidate-fallback-and-non-fallback-state-machine.patch b/queue-6.3/mptcp-consolidate-fallback-and-non-fallback-state-machine.patch
new file mode 100644 (file)
index 0000000..16c945a
--- /dev/null
@@ -0,0 +1,205 @@
+From 81c1d029016001f994ce1c46849c5e9900d8eab8 Mon Sep 17 00:00:00 2001
+From: Paolo Abeni <pabeni@redhat.com>
+Date: Tue, 20 Jun 2023 18:24:21 +0200
+Subject: mptcp: consolidate fallback and non fallback state machine
+
+From: Paolo Abeni <pabeni@redhat.com>
+
+commit 81c1d029016001f994ce1c46849c5e9900d8eab8 upstream.
+
+An orphaned msk releases the used resources via the worker
+when the latter first sees the msk in CLOSED status.
+
+If the msk status transitions to TCP_CLOSE in the release callback
+invoked by the worker's final release_sock(), that instance of the
+worker will not take any action.
+
+Additionally, the MPTCP code prevents scheduling the worker once the
+socket reaches the CLOSE status: such an msk's resources will be leaked.
+
+The only code path that can trigger the above scenario is the
+__mptcp_check_send_data_fin() in fallback mode.
+
+Address the issue by removing the special handling of fallback sockets
+in __mptcp_check_send_data_fin(), consolidating the state machine
+for fallback and non-fallback sockets.
+
+Since fallback sockets do not send or receive data_fin, the mptcp
+code can update the msk internal status to match the next step in the
+state machine every time a data_fin (ack) should be generated or
+received.
+
+As a consequence we can remove a bunch of checks for fallback from
+the fastpath.
+
+Fixes: 6e628cd3a8f7 ("mptcp: use mptcp release_cb for delayed tasks")
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Reviewed-by: Mat Martineau <martineau@kernel.org>
+Signed-off-by: Matthieu Baerts <matthieu.baerts@tessares.net>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/mptcp/protocol.c |   41 +++++++++++++++--------------------------
+ net/mptcp/subflow.c  |   17 ++++++++++-------
+ 2 files changed, 25 insertions(+), 33 deletions(-)
+
+--- a/net/mptcp/protocol.c
++++ b/net/mptcp/protocol.c
+@@ -44,7 +44,7 @@ enum {
+ static struct percpu_counter mptcp_sockets_allocated ____cacheline_aligned_in_smp;
+ static void __mptcp_destroy_sock(struct sock *sk);
+-static void __mptcp_check_send_data_fin(struct sock *sk);
++static void mptcp_check_send_data_fin(struct sock *sk);
+ DEFINE_PER_CPU(struct mptcp_delegated_action, mptcp_delegated_actions);
+ static struct net_device mptcp_napi_dev;
+@@ -411,8 +411,7 @@ static bool mptcp_pending_data_fin_ack(s
+ {
+       struct mptcp_sock *msk = mptcp_sk(sk);
+-      return !__mptcp_check_fallback(msk) &&
+-             ((1 << sk->sk_state) &
++      return ((1 << sk->sk_state) &
+               (TCPF_FIN_WAIT1 | TCPF_CLOSING | TCPF_LAST_ACK)) &&
+              msk->write_seq == READ_ONCE(msk->snd_una);
+ }
+@@ -570,9 +569,6 @@ static bool mptcp_check_data_fin(struct
+       u64 rcv_data_fin_seq;
+       bool ret = false;
+-      if (__mptcp_check_fallback(msk))
+-              return ret;
+-
+       /* Need to ack a DATA_FIN received from a peer while this side
+        * of the connection is in ESTABLISHED, FIN_WAIT1, or FIN_WAIT2.
+        * msk->rcv_data_fin was set when parsing the incoming options
+@@ -610,7 +606,8 @@ static bool mptcp_check_data_fin(struct
+               }
+               ret = true;
+-              mptcp_send_ack(msk);
++              if (!__mptcp_check_fallback(msk))
++                      mptcp_send_ack(msk);
+               mptcp_close_wake_up(sk);
+       }
+       return ret;
+@@ -1596,7 +1593,7 @@ out:
+       if (!mptcp_timer_pending(sk))
+               mptcp_reset_timer(sk);
+       if (do_check_data_fin)
+-              __mptcp_check_send_data_fin(sk);
++              mptcp_check_send_data_fin(sk);
+ }
+ static void __mptcp_subflow_push_pending(struct sock *sk, struct sock *ssk, bool first)
+@@ -2651,8 +2648,6 @@ static void mptcp_worker(struct work_str
+       if (unlikely((1 << state) & (TCPF_CLOSE | TCPF_LISTEN)))
+               goto unlock;
+-      mptcp_check_data_fin_ack(sk);
+-
+       mptcp_check_fastclose(msk);
+       mptcp_pm_nl_work(msk);
+@@ -2660,7 +2655,8 @@ static void mptcp_worker(struct work_str
+       if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
+               mptcp_check_for_eof(msk);
+-      __mptcp_check_send_data_fin(sk);
++      mptcp_check_send_data_fin(sk);
++      mptcp_check_data_fin_ack(sk);
+       mptcp_check_data_fin(sk);
+       if (test_and_clear_bit(MPTCP_WORK_CLOSE_SUBFLOW, &msk->flags))
+@@ -2803,6 +2799,12 @@ void mptcp_subflow_shutdown(struct sock
+                       pr_debug("Fallback");
+                       ssk->sk_shutdown |= how;
+                       tcp_shutdown(ssk, how);
++
++                      /* simulate the data_fin ack reception to let the state
++                       * machine move forward
++                       */
++                      WRITE_ONCE(mptcp_sk(sk)->snd_una, mptcp_sk(sk)->snd_nxt);
++                      mptcp_schedule_work(sk);
+               } else {
+                       pr_debug("Sending DATA_FIN on subflow %p", ssk);
+                       tcp_send_ack(ssk);
+@@ -2842,7 +2844,7 @@ static int mptcp_close_state(struct sock
+       return next & TCP_ACTION_FIN;
+ }
+-static void __mptcp_check_send_data_fin(struct sock *sk)
++static void mptcp_check_send_data_fin(struct sock *sk)
+ {
+       struct mptcp_subflow_context *subflow;
+       struct mptcp_sock *msk = mptcp_sk(sk);
+@@ -2860,19 +2862,6 @@ static void __mptcp_check_send_data_fin(
+       WRITE_ONCE(msk->snd_nxt, msk->write_seq);
+-      /* fallback socket will not get data_fin/ack, can move to the next
+-       * state now
+-       */
+-      if (__mptcp_check_fallback(msk)) {
+-              WRITE_ONCE(msk->snd_una, msk->write_seq);
+-              if ((1 << sk->sk_state) & (TCPF_CLOSING | TCPF_LAST_ACK)) {
+-                      inet_sk_state_store(sk, TCP_CLOSE);
+-                      mptcp_close_wake_up(sk);
+-              } else if (sk->sk_state == TCP_FIN_WAIT1) {
+-                      inet_sk_state_store(sk, TCP_FIN_WAIT2);
+-              }
+-      }
+-
+       mptcp_for_each_subflow(msk, subflow) {
+               struct sock *tcp_sk = mptcp_subflow_tcp_sock(subflow);
+@@ -2892,7 +2881,7 @@ static void __mptcp_wr_shutdown(struct s
+       WRITE_ONCE(msk->write_seq, msk->write_seq + 1);
+       WRITE_ONCE(msk->snd_data_fin_enable, 1);
+-      __mptcp_check_send_data_fin(sk);
++      mptcp_check_send_data_fin(sk);
+ }
+ static void __mptcp_destroy_sock(struct sock *sk)
+--- a/net/mptcp/subflow.c
++++ b/net/mptcp/subflow.c
+@@ -1749,14 +1749,16 @@ static void subflow_state_change(struct
+ {
+       struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(sk);
+       struct sock *parent = subflow->conn;
++      struct mptcp_sock *msk;
+       __subflow_state_change(sk);
++      msk = mptcp_sk(parent);
+       if (subflow_simultaneous_connect(sk)) {
+               mptcp_propagate_sndbuf(parent, sk);
+               mptcp_do_fallback(sk);
+-              mptcp_rcv_space_init(mptcp_sk(parent), sk);
+-              pr_fallback(mptcp_sk(parent));
++              mptcp_rcv_space_init(msk, sk);
++              pr_fallback(msk);
+               subflow->conn_finished = 1;
+               mptcp_set_connected(parent);
+       }
+@@ -1772,11 +1774,12 @@ static void subflow_state_change(struct
+       subflow_sched_work_if_closed(mptcp_sk(parent), sk);
+-      if (__mptcp_check_fallback(mptcp_sk(parent)) &&
+-          !subflow->rx_eof && subflow_is_done(sk)) {
+-              subflow->rx_eof = 1;
+-              mptcp_subflow_eof(parent);
+-      }
++      /* when the fallback subflow closes the rx side, trigger a 'dummy'
++       * ingress data fin, so that the msk state will follow along
++       */
++      if (__mptcp_check_fallback(msk) && subflow_is_done(sk) && msk->first == sk &&
++          mptcp_update_rcv_data_fin(msk, READ_ONCE(msk->ack_seq), true))
++              mptcp_schedule_work(parent);
+ }
+ void mptcp_subflow_queue_clean(struct sock *listener_sk, struct sock *listener_ssk)
diff --git a/queue-6.3/mptcp-ensure-listener-is-unhashed-before-updating-the-sk-status.patch b/queue-6.3/mptcp-ensure-listener-is-unhashed-before-updating-the-sk-status.patch
new file mode 100644 (file)
index 0000000..67effa5
--- /dev/null
@@ -0,0 +1,104 @@
+From 57fc0f1ceaa4016354cf6f88533e20b56190e41a Mon Sep 17 00:00:00 2001
+From: Paolo Abeni <pabeni@redhat.com>
+Date: Tue, 20 Jun 2023 18:24:23 +0200
+Subject: mptcp: ensure listener is unhashed before updating the sk status
+
+From: Paolo Abeni <pabeni@redhat.com>
+
+commit 57fc0f1ceaa4016354cf6f88533e20b56190e41a upstream.
+
+The MPTCP protocol accesses the listener subflow in a lockless
+manner in a couple of places (poll, diag). That works only if
+the msk itself leaves the listener status only after the
+subflow itself has been closed/disconnected. Otherwise we risk
+a deadlock in diag, as reported by Christoph.
+
+Address the issue by ensuring that the first subflow (the listener
+one) is always disconnected before updating the msk socket status.
+
+Reported-by: Christoph Paasch <cpaasch@apple.com>
+Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/407
+Fixes: b29fcfb54cd7 ("mptcp: full disconnect implementation")
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Reviewed-by: Matthieu Baerts <matthieu.baerts@tessares.net>
+Signed-off-by: Matthieu Baerts <matthieu.baerts@tessares.net>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/mptcp/pm_netlink.c |    1 +
+ net/mptcp/protocol.c   |   31 +++++++++++++++++++------------
+ 2 files changed, 20 insertions(+), 12 deletions(-)
+
+--- a/net/mptcp/pm_netlink.c
++++ b/net/mptcp/pm_netlink.c
+@@ -1047,6 +1047,7 @@ static int mptcp_pm_nl_create_listen_soc
+       if (err)
+               return err;
++      inet_sk_state_store(newsk, TCP_LISTEN);
+       err = kernel_listen(ssock, backlog);
+       if (err)
+               return err;
+--- a/net/mptcp/protocol.c
++++ b/net/mptcp/protocol.c
+@@ -2385,13 +2385,6 @@ static void __mptcp_close_ssk(struct soc
+               kfree_rcu(subflow, rcu);
+       } else {
+               /* otherwise tcp will dispose of the ssk and subflow ctx */
+-              if (ssk->sk_state == TCP_LISTEN) {
+-                      tcp_set_state(ssk, TCP_CLOSE);
+-                      mptcp_subflow_queue_clean(sk, ssk);
+-                      inet_csk_listen_stop(ssk);
+-                      mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED);
+-              }
+-
+               __tcp_close(ssk, 0);
+               /* close acquired an extra ref */
+@@ -2926,10 +2919,24 @@ static __poll_t mptcp_check_readable(str
+       return EPOLLIN | EPOLLRDNORM;
+ }
+-static void mptcp_listen_inuse_dec(struct sock *sk)
++static void mptcp_check_listen_stop(struct sock *sk)
+ {
+-      if (inet_sk_state_load(sk) == TCP_LISTEN)
+-              sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
++      struct sock *ssk;
++
++      if (inet_sk_state_load(sk) != TCP_LISTEN)
++              return;
++
++      sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
++      ssk = mptcp_sk(sk)->first;
++      if (WARN_ON_ONCE(!ssk || inet_sk_state_load(ssk) != TCP_LISTEN))
++              return;
++
++      lock_sock_nested(ssk, SINGLE_DEPTH_NESTING);
++      mptcp_subflow_queue_clean(sk, ssk);
++      inet_csk_listen_stop(ssk);
++      mptcp_event_pm_listener(ssk, MPTCP_EVENT_LISTENER_CLOSED);
++      tcp_set_state(ssk, TCP_CLOSE);
++      release_sock(ssk);
+ }
+ bool __mptcp_close(struct sock *sk, long timeout)
+@@ -2942,7 +2949,7 @@ bool __mptcp_close(struct sock *sk, long
+       WRITE_ONCE(sk->sk_shutdown, SHUTDOWN_MASK);
+       if ((1 << sk->sk_state) & (TCPF_LISTEN | TCPF_CLOSE)) {
+-              mptcp_listen_inuse_dec(sk);
++              mptcp_check_listen_stop(sk);
+               inet_sk_state_store(sk, TCP_CLOSE);
+               goto cleanup;
+       }
+@@ -3056,7 +3063,7 @@ static int mptcp_disconnect(struct sock
+       if (msk->fastopening)
+               return -EBUSY;
+-      mptcp_listen_inuse_dec(sk);
++      mptcp_check_listen_stop(sk);
+       inet_sk_state_store(sk, TCP_CLOSE);
+       mptcp_stop_timer(sk);
diff --git a/queue-6.3/mptcp-fix-possible-divide-by-zero-in-recvmsg.patch b/queue-6.3/mptcp-fix-possible-divide-by-zero-in-recvmsg.patch
new file mode 100644 (file)
index 0000000..0addfb3
--- /dev/null
@@ -0,0 +1,95 @@
+From 0ad529d9fd2bfa3fc619552a8d2fb2f2ef0bce2e Mon Sep 17 00:00:00 2001
+From: Paolo Abeni <pabeni@redhat.com>
+Date: Tue, 20 Jun 2023 18:24:19 +0200
+Subject: mptcp: fix possible divide by zero in recvmsg()
+
+From: Paolo Abeni <pabeni@redhat.com>
+
+commit 0ad529d9fd2bfa3fc619552a8d2fb2f2ef0bce2e upstream.
+
+Christoph reported a divide by zero bug in mptcp_recvmsg():
+
+divide error: 0000 [#1] PREEMPT SMP
+CPU: 1 PID: 19978 Comm: syz-executor.6 Not tainted 6.4.0-rc2-gffcc7899081b #20
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014
+RIP: 0010:__tcp_select_window+0x30e/0x420 net/ipv4/tcp_output.c:3018
+Code: 11 ff 0f b7 cd c1 e9 0c b8 ff ff ff ff d3 e0 89 c1 f7 d1 01 cb 21 c3 eb 17 e8 2e 83 11 ff 31 db eb 0e e8 25 83 11 ff 89 d8 99 <f7> 7c 24 04 29 d3 65 48 8b 04 25 28 00 00 00 48 3b 44 24 10 75 60
+RSP: 0018:ffffc90000a07a18 EFLAGS: 00010246
+RAX: 000000000000ffd7 RBX: 000000000000ffd7 RCX: 0000000000040000
+RDX: 0000000000000000 RSI: 000000000003ffff RDI: 0000000000040000
+RBP: 000000000000ffd7 R08: ffffffff820cf297 R09: 0000000000000001
+R10: 0000000000000000 R11: ffffffff8103d1a0 R12: 0000000000003f00
+R13: 0000000000300000 R14: ffff888101cf3540 R15: 0000000000180000
+FS:  00007f9af4c09640(0000) GS:ffff88813bd00000(0000) knlGS:0000000000000000
+CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+CR2: 0000001b33824000 CR3: 000000012f241001 CR4: 0000000000170ee0
+Call Trace:
+ <TASK>
+ __tcp_cleanup_rbuf+0x138/0x1d0 net/ipv4/tcp.c:1611
+ mptcp_recvmsg+0xcb8/0xdd0 net/mptcp/protocol.c:2034
+ inet_recvmsg+0x127/0x1f0 net/ipv4/af_inet.c:861
+ ____sys_recvmsg+0x269/0x2b0 net/socket.c:1019
+ ___sys_recvmsg+0xe6/0x260 net/socket.c:2764
+ do_recvmmsg+0x1a5/0x470 net/socket.c:2858
+ __do_sys_recvmmsg net/socket.c:2937 [inline]
+ __se_sys_recvmmsg net/socket.c:2953 [inline]
+ __x64_sys_recvmmsg+0xa6/0x130 net/socket.c:2953
+ do_syscall_x64 arch/x86/entry/common.c:50 [inline]
+ do_syscall_64+0x47/0xa0 arch/x86/entry/common.c:80
+ entry_SYSCALL_64_after_hwframe+0x72/0xdc
+RIP: 0033:0x7f9af58fc6a9
+Code: 5c c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d 4f 37 0d 00 f7 d8 64 89 01 48
+RSP: 002b:00007f9af4c08cd8 EFLAGS: 00000246 ORIG_RAX: 000000000000012b
+RAX: ffffffffffffffda RBX: 00000000006bc050 RCX: 00007f9af58fc6a9
+RDX: 0000000000000001 RSI: 0000000020000140 RDI: 0000000000000004
+RBP: 0000000000000000 R08: 0000000000000000 R09: 0000000000000000
+R10: 0000000000000f00 R11: 0000000000000246 R12: 00000000006bc05c
+R13: fffffffffffffea8 R14: 00000000006bc050 R15: 000000000001fe40
+ </TASK>
+
+mptcp_recvmsg is allowed to release the msk socket lock when
+blocking, and before re-acquiring it another thread could have
+switched the sock to TCP_LISTEN status - with a prior
+connect(AF_UNSPEC) - also clearing icsk_ack.rcv_mss.
+
+Address the issue by preventing the disconnect if some other process is
+concurrently performing a blocking syscall on the same socket, akin to
+commit 4faeee0cf8a5 ("tcp: deny tcp_disconnect() when threads are waiting").
+
+Fixes: a6b118febbab ("mptcp: add receive buffer auto-tuning")
+Cc: stable@vger.kernel.org
+Reported-by: Christoph Paasch <cpaasch@apple.com>
+Closes: https://github.com/multipath-tcp/mptcp_net-next/issues/404
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Tested-by: Christoph Paasch <cpaasch@apple.com>
+Reviewed-by: Matthieu Baerts <matthieu.baerts@tessares.net>
+Signed-off-by: Matthieu Baerts <matthieu.baerts@tessares.net>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/mptcp/protocol.c |    7 +++++++
+ 1 file changed, 7 insertions(+)
+
+--- a/net/mptcp/protocol.c
++++ b/net/mptcp/protocol.c
+@@ -3054,6 +3054,12 @@ static int mptcp_disconnect(struct sock
+ {
+       struct mptcp_sock *msk = mptcp_sk(sk);
++      /* Deny disconnect if other threads are blocked in sk_wait_event()
++       * or inet_wait_for_connect().
++       */
++      if (sk->sk_wait_pending)
++              return -EBUSY;
++
+       /* We are on the fastopen error path. We can't call straight into the
+        * subflows cleanup code due to lock nesting (we are already under
+        * msk->firstsocket lock).
+@@ -3120,6 +3126,7 @@ struct sock *mptcp_sk_clone_init(const s
+               inet_sk(nsk)->pinet6 = mptcp_inet6_sk(nsk);
+ #endif
++      nsk->sk_wait_pending = 0;
+       __mptcp_init_sock(nsk);
+       msk = mptcp_sk(nsk);
diff --git a/queue-6.3/mptcp-fix-possible-list-corruption-on-passive-mpj.patch b/queue-6.3/mptcp-fix-possible-list-corruption-on-passive-mpj.patch
new file mode 100644 (file)
index 0000000..3655c52
--- /dev/null
@@ -0,0 +1,78 @@
+From 56a666c48b038e91b76471289e2cf60c79d326b9 Mon Sep 17 00:00:00 2001
+From: Paolo Abeni <pabeni@redhat.com>
+Date: Tue, 20 Jun 2023 18:24:20 +0200
+Subject: mptcp: fix possible list corruption on passive MPJ
+
+From: Paolo Abeni <pabeni@redhat.com>
+
+commit 56a666c48b038e91b76471289e2cf60c79d326b9 upstream.
+
+At passive MPJ time, if the msk socket lock is held by the user,
+the new subflow is appended to the msk->join_list under the msk
+data lock.
+
+In mptcp_release_cb()/__mptcp_flush_join_list(), the subflows in
+that list are moved from the join_list into the conn_list under the
+msk socket lock.
+
+Append and removal could race, possibly corrupting that list.
+Address the issue by splicing the join list into a temporary one while
+still holding the msk data lock.
+
+Found by code inspection, the race itself should be almost impossible
+to trigger in practice.
+
+Fixes: 3e5014909b56 ("mptcp: cleanup MPJ subflow list handling")
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Reviewed-by: Matthieu Baerts <matthieu.baerts@tessares.net>
+Signed-off-by: Matthieu Baerts <matthieu.baerts@tessares.net>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/mptcp/protocol.c |   12 +++++++++---
+ 1 file changed, 9 insertions(+), 3 deletions(-)
+
+--- a/net/mptcp/protocol.c
++++ b/net/mptcp/protocol.c
+@@ -837,12 +837,12 @@ static bool __mptcp_finish_join(struct m
+       return true;
+ }
+-static void __mptcp_flush_join_list(struct sock *sk)
++static void __mptcp_flush_join_list(struct sock *sk, struct list_head *join_list)
+ {
+       struct mptcp_subflow_context *tmp, *subflow;
+       struct mptcp_sock *msk = mptcp_sk(sk);
+-      list_for_each_entry_safe(subflow, tmp, &msk->join_list, node) {
++      list_for_each_entry_safe(subflow, tmp, join_list, node) {
+               struct sock *ssk = mptcp_subflow_tcp_sock(subflow);
+               bool slow = lock_sock_fast(ssk);
+@@ -3314,9 +3314,14 @@ static void mptcp_release_cb(struct sock
+       for (;;) {
+               unsigned long flags = (msk->cb_flags & MPTCP_FLAGS_PROCESS_CTX_NEED) |
+                                     msk->push_pending;
++              struct list_head join_list;
++
+               if (!flags)
+                       break;
++              INIT_LIST_HEAD(&join_list);
++              list_splice_init(&msk->join_list, &join_list);
++
+               /* the following actions acquire the subflow socket lock
+                *
+                * 1) can't be invoked in atomic scope
+@@ -3327,8 +3332,9 @@ static void mptcp_release_cb(struct sock
+               msk->push_pending = 0;
+               msk->cb_flags &= ~flags;
+               spin_unlock_bh(&sk->sk_lock.slock);
++
+               if (flags & BIT(MPTCP_FLUSH_JOIN_LIST))
+-                      __mptcp_flush_join_list(sk);
++                      __mptcp_flush_join_list(sk, &join_list);
+               if (flags & BIT(MPTCP_PUSH_PENDING))
+                       __mptcp_push_pending(sk, 0);
+               if (flags & BIT(MPTCP_RETRANSMIT))
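The splice-then-walk idiom used by the fix is a generic kernel pattern. The sketch below is an illustration with invented names, not mptcp code: the shared list is detached under the data lock, and the private copy is walked with the lock dropped.

/*
 * Hedged sketch of the generic idiom: detach the shared list under the data
 * lock, then walk the private copy without that lock held. Concurrent adders
 * keep appending to the (now empty) shared list and cannot corrupt the walk.
 */
#include <linux/list.h>
#include <linux/spinlock.h>

struct item {
	struct list_head node;
};

static LIST_HEAD(shared_list);
static DEFINE_SPINLOCK(data_lock);

static void process(struct item *it)
{
	/* placeholder for per-item work done without the data lock */
}

static void flush_shared_list(void)
{
	struct item *it, *tmp;
	LIST_HEAD(local);

	spin_lock_bh(&data_lock);
	list_splice_init(&shared_list, &local);
	spin_unlock_bh(&data_lock);

	list_for_each_entry_safe(it, tmp, &local, node)
		process(it);
}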
diff --git a/queue-6.3/mptcp-handle-correctly-disconnect-failures.patch b/queue-6.3/mptcp-handle-correctly-disconnect-failures.patch
new file mode 100644 (file)
index 0000000..5173b9a
--- /dev/null
@@ -0,0 +1,83 @@
+From c2b2ae3925b65070adb27d5a31a31c376f26dec7 Mon Sep 17 00:00:00 2001
+From: Paolo Abeni <pabeni@redhat.com>
+Date: Tue, 20 Jun 2023 18:24:18 +0200
+Subject: mptcp: handle correctly disconnect() failures
+
+From: Paolo Abeni <pabeni@redhat.com>
+
+commit c2b2ae3925b65070adb27d5a31a31c376f26dec7 upstream.
+
+Currently the mptcp code assumes that disconnect() can fail only
+at mptcp_sendmsg_fastopen() time - to avoid a deadlock scenario - and
+doesn't even bother returning an error code.
+
+Soon mptcp_disconnect() will handle more error conditions: let's track
+them explicitly.
+
+As a bonus, explicitly annotate TCP-level disconnect as not failing:
+the mptcp code never blocks waiting for events on the subflows.
+
+Fixes: 7d803344fdc3 ("mptcp: fix deadlock in fastopen error path")
+Cc: stable@vger.kernel.org
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Tested-by: Christoph Paasch <cpaasch@apple.com>
+Reviewed-by: Matthieu Baerts <matthieu.baerts@tessares.net>
+Signed-off-by: Matthieu Baerts <matthieu.baerts@tessares.net>
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/mptcp/protocol.c |   20 ++++++++++++++------
+ 1 file changed, 14 insertions(+), 6 deletions(-)
+
+--- a/net/mptcp/protocol.c
++++ b/net/mptcp/protocol.c
+@@ -1696,7 +1696,13 @@ static int mptcp_sendmsg_fastopen(struct
+               if (ret && ret != -EINPROGRESS && ret != -ERESTARTSYS && ret != -EINTR)
+                       *copied_syn = 0;
+       } else if (ret && ret != -EINPROGRESS) {
+-              mptcp_disconnect(sk, 0);
++              /* The disconnect() op called by tcp_sendmsg_fastopen()/
++               * __inet_stream_connect() can fail, due to looking check,
++               * see mptcp_disconnect().
++               * Attempt it again outside the problematic scope.
++               */
++              if (!mptcp_disconnect(sk, 0))
++                      sk->sk_socket->state = SS_UNCONNECTED;
+       }
+       return ret;
+@@ -2360,7 +2366,10 @@ static void __mptcp_close_ssk(struct soc
+       need_push = (flags & MPTCP_CF_PUSH) && __mptcp_retransmit_pending_data(sk);
+       if (!dispose_it) {
+-              tcp_disconnect(ssk, 0);
++              /* The MPTCP code never wait on the subflow sockets, TCP-level
++               * disconnect should never fail
++               */
++              WARN_ON_ONCE(tcp_disconnect(ssk, 0));
+               msk->subflow->state = SS_UNCONNECTED;
+               mptcp_subflow_ctx_reset(subflow);
+               release_sock(ssk);
+@@ -2787,7 +2796,7 @@ void mptcp_subflow_shutdown(struct sock
+                       break;
+               fallthrough;
+       case TCP_SYN_SENT:
+-              tcp_disconnect(ssk, O_NONBLOCK);
++              WARN_ON_ONCE(tcp_disconnect(ssk, O_NONBLOCK));
+               break;
+       default:
+               if (__mptcp_check_fallback(mptcp_sk(sk))) {
+@@ -3047,11 +3056,10 @@ static int mptcp_disconnect(struct sock
+       /* We are on the fastopen error path. We can't call straight into the
+        * subflows cleanup code due to lock nesting (we are already under
+-       * msk->firstsocket lock). Do nothing and leave the cleanup to the
+-       * caller.
++       * msk->firstsocket lock).
+        */
+       if (msk->fastopening)
+-              return 0;
++              return -EBUSY;
+       mptcp_listen_inuse_dec(sk);
+       inet_sk_state_store(sk, TCP_CLOSE);
diff --git a/queue-6.3/net-mdio-fix-the-wrong-parameters.patch b/queue-6.3/net-mdio-fix-the-wrong-parameters.patch
new file mode 100644 (file)
index 0000000..7097005
--- /dev/null
@@ -0,0 +1,39 @@
+From 408c090002c8ca5da3da1417d1d675583379fae6 Mon Sep 17 00:00:00 2001
+From: Jiawen Wu <jiawenwu@trustnetic.com>
+Date: Mon, 19 Jun 2023 17:49:48 +0800
+Subject: net: mdio: fix the wrong parameters
+
+From: Jiawen Wu <jiawenwu@trustnetic.com>
+
+commit 408c090002c8ca5da3da1417d1d675583379fae6 upstream.
+
+PHY address and device address are passed in the wrong order.
+
+Cc: stable@vger.kernel.org
+Fixes: 4e4aafcddbbf ("net: mdio: Add dedicated C45 API to MDIO bus drivers")
+Signed-off-by: Jiawen Wu <jiawenwu@trustnetic.com>
+Reviewed-by: Andrew Lunn <andrew@lunn.ch>
+Reviewed-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
+Link: https://lore.kernel.org/r/20230619094948.84452-1-jiawenwu@trustnetic.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/phy/mdio_bus.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/net/phy/mdio_bus.c b/drivers/net/phy/mdio_bus.c
+index 389f33a12534..8b3618d3da4a 100644
+--- a/drivers/net/phy/mdio_bus.c
++++ b/drivers/net/phy/mdio_bus.c
+@@ -1287,7 +1287,7 @@ EXPORT_SYMBOL_GPL(mdiobus_modify_changed);
+  * @mask: bit mask of bits to clear
+  * @set: bit mask of bits to set
+  */
+-int mdiobus_c45_modify_changed(struct mii_bus *bus, int devad, int addr,
++int mdiobus_c45_modify_changed(struct mii_bus *bus, int addr, int devad,
+                              u32 regnum, u16 mask, u16 set)
+ {
+       int err;
+-- 
+2.41.0
+
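This class of bug is easy to miss because the compiler checks only the parameter types, not their names. The small stand-alone example below uses an invented function name (not the mdio API) and shows a prototype and definition that disagree on parameter order yet compile cleanly.

/*
 * Illustration of why the compiler cannot catch this class of bug:
 * parameter names in a prototype are not checked against the definition,
 * so two swapped same-typed parameters go unnoticed until runtime.
 */
#include <stdio.h>

/* prototype as a caller sees it */
int c45_read(int addr, int devad);

int main(void)
{
	/* caller passes (addr=1, devad=3)... */
	return c45_read(1, 3);
}

/* ...but the definition names them in the opposite order, silently
 * treating 1 as devad and 3 as addr. */
int c45_read(int devad, int addr)
{
	printf("addr=%d devad=%d\n", addr, devad);
	return 0;
}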
diff --git a/queue-6.3/nilfs2-fix-buffer-corruption-due-to-concurrent-device-reads.patch b/queue-6.3/nilfs2-fix-buffer-corruption-due-to-concurrent-device-reads.patch
new file mode 100644 (file)
index 0000000..1b780fd
--- /dev/null
@@ -0,0 +1,147 @@
+From 679bd7ebdd315bf457a4740b306ae99f1d0a403d Mon Sep 17 00:00:00 2001
+From: Ryusuke Konishi <konishi.ryusuke@gmail.com>
+Date: Fri, 9 Jun 2023 12:57:32 +0900
+Subject: nilfs2: fix buffer corruption due to concurrent device reads
+
+From: Ryusuke Konishi <konishi.ryusuke@gmail.com>
+
+commit 679bd7ebdd315bf457a4740b306ae99f1d0a403d upstream.
+
+As a result of analysis of a syzbot report, it turned out that in three
+cases where nilfs2 allocates block device buffers directly via sb_getblk,
+concurrent reads to the device can corrupt the allocated buffers.
+
+Nilfs2 uses sb_getblk for segment summary blocks, which make up a log
+header, for the super root block, which is the trailer, and when moving
+and writing the second super block after an fs resize.
+
+In any of these, since the uptodate flag is not set when storing metadata
+to be written in the allocated buffers, the stored metadata will be
+overwritten if a device read of the same block occurs concurrently before
+the write.  This causes metadata corruption and misbehavior in the log
+write itself, causing warnings in nilfs_btree_assign() as reported.
+
+Fix these issues by setting an uptodate flag on the buffer head on the
+first or before modifying each buffer obtained with sb_getblk, and
+clearing the flag on failure.
+
+When setting the uptodate flag, the lock_buffer/unlock_buffer pair is used
+to perform necessary exclusive control, and the buffer is filled to ensure
+that uninitialized bytes are not mixed into the data read from others.  As
+for buffers for segment summary blocks, they are filled incrementally, so
+if the uptodate flag was unset on their allocation, set the flag and zero
+fill the buffer once at that point.
+
+Also, regarding the superblock move routine, the starting point of the
+memset call to zerofill the block is incorrectly specified, which can
+cause a buffer overflow on file systems with block sizes greater than
+4KiB.  In addition, if the superblock is moved within a large block, it is
+necessary to assume the possibility that the data in the superblock will
+be destroyed by zero-filling before copying.  So fix these potential
+issues as well.
+
+Link: https://lkml.kernel.org/r/20230609035732.20426-1-konishi.ryusuke@gmail.com
+Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
+Reported-by: syzbot+31837fe952932efc8fb9@syzkaller.appspotmail.com
+Closes: https://lkml.kernel.org/r/00000000000030000a05e981f475@google.com
+Tested-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/nilfs2/segbuf.c  |    6 ++++++
+ fs/nilfs2/segment.c |    7 +++++++
+ fs/nilfs2/super.c   |   23 ++++++++++++++++++++++-
+ 3 files changed, 35 insertions(+), 1 deletion(-)
+
+--- a/fs/nilfs2/segbuf.c
++++ b/fs/nilfs2/segbuf.c
+@@ -101,6 +101,12 @@ int nilfs_segbuf_extend_segsum(struct ni
+       if (unlikely(!bh))
+               return -ENOMEM;
++      lock_buffer(bh);
++      if (!buffer_uptodate(bh)) {
++              memset(bh->b_data, 0, bh->b_size);
++              set_buffer_uptodate(bh);
++      }
++      unlock_buffer(bh);
+       nilfs_segbuf_add_segsum_buffer(segbuf, bh);
+       return 0;
+ }
+--- a/fs/nilfs2/segment.c
++++ b/fs/nilfs2/segment.c
+@@ -981,10 +981,13 @@ static void nilfs_segctor_fill_in_super_
+       unsigned int isz, srsz;
+       bh_sr = NILFS_LAST_SEGBUF(&sci->sc_segbufs)->sb_super_root;
++
++      lock_buffer(bh_sr);
+       raw_sr = (struct nilfs_super_root *)bh_sr->b_data;
+       isz = nilfs->ns_inode_size;
+       srsz = NILFS_SR_BYTES(isz);
++      raw_sr->sr_sum = 0;  /* Ensure initialization within this update */
+       raw_sr->sr_bytes = cpu_to_le16(srsz);
+       raw_sr->sr_nongc_ctime
+               = cpu_to_le64(nilfs_doing_gc() ?
+@@ -998,6 +1001,8 @@ static void nilfs_segctor_fill_in_super_
+       nilfs_write_inode_common(nilfs->ns_sufile, (void *)raw_sr +
+                                NILFS_SR_SUFILE_OFFSET(isz), 1);
+       memset((void *)raw_sr + srsz, 0, nilfs->ns_blocksize - srsz);
++      set_buffer_uptodate(bh_sr);
++      unlock_buffer(bh_sr);
+ }
+ static void nilfs_redirty_inodes(struct list_head *head)
+@@ -1780,6 +1785,7 @@ static void nilfs_abort_logs(struct list
+       list_for_each_entry(segbuf, logs, sb_list) {
+               list_for_each_entry(bh, &segbuf->sb_segsum_buffers,
+                                   b_assoc_buffers) {
++                      clear_buffer_uptodate(bh);
+                       if (bh->b_page != bd_page) {
+                               if (bd_page)
+                                       end_page_writeback(bd_page);
+@@ -1791,6 +1797,7 @@ static void nilfs_abort_logs(struct list
+                                   b_assoc_buffers) {
+                       clear_buffer_async_write(bh);
+                       if (bh == segbuf->sb_super_root) {
++                              clear_buffer_uptodate(bh);
+                               if (bh->b_page != bd_page) {
+                                       end_page_writeback(bd_page);
+                                       bd_page = bh->b_page;
+--- a/fs/nilfs2/super.c
++++ b/fs/nilfs2/super.c
+@@ -372,10 +372,31 @@ static int nilfs_move_2nd_super(struct s
+               goto out;
+       }
+       nsbp = (void *)nsbh->b_data + offset;
+-      memset(nsbp, 0, nilfs->ns_blocksize);
++      lock_buffer(nsbh);
+       if (sb2i >= 0) {
++              /*
++               * The position of the second superblock only changes by 4KiB,
++               * which is larger than the maximum superblock data size
++               * (= 1KiB), so there is no need to use memmove() to allow
++               * overlap between source and destination.
++               */
+               memcpy(nsbp, nilfs->ns_sbp[sb2i], nilfs->ns_sbsize);
++
++              /*
++               * Zero fill after copy to avoid overwriting in case of move
++               * within the same block.
++               */
++              memset(nsbh->b_data, 0, offset);
++              memset((void *)nsbp + nilfs->ns_sbsize, 0,
++                     nsbh->b_size - offset - nilfs->ns_sbsize);
++      } else {
++              memset(nsbh->b_data, 0, nsbh->b_size);
++      }
++      set_buffer_uptodate(nsbh);
++      unlock_buffer(nsbh);
++
++      if (sb2i >= 0) {
+               brelse(nilfs->ns_sbh[sb2i]);
+               nilfs->ns_sbh[sb2i] = nsbh;
+               nilfs->ns_sbp[sb2i] = nsbp;
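Abstracted away from nilfs2, the fix boils down to the hypothetical helper below (the helper name is made up); it shows the lock_buffer/set_buffer_uptodate pattern the commit message describes for buffers obtained via sb_getblk().

/*
 * Hedged sketch of the pattern: fill a buffer obtained via sb_getblk() and
 * set its uptodate flag under the buffer lock, so a concurrent device read
 * of the same block is skipped instead of overwriting the caller's data.
 */
#include <linux/buffer_head.h>
#include <linux/string.h>

static struct buffer_head *get_zeroed_meta_block(struct super_block *sb,
						 sector_t blocknr)
{
	struct buffer_head *bh = sb_getblk(sb, blocknr);

	if (unlikely(!bh))
		return NULL;

	lock_buffer(bh);
	if (!buffer_uptodate(bh)) {
		memset(bh->b_data, 0, bh->b_size);
		set_buffer_uptodate(bh);
	}
	unlock_buffer(bh);
	return bh;
}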
diff --git a/queue-6.3/nilfs2-prevent-general-protection-fault-in-nilfs_clear_dirty_page.patch b/queue-6.3/nilfs2-prevent-general-protection-fault-in-nilfs_clear_dirty_page.patch
new file mode 100644 (file)
index 0000000..ebd09c4
--- /dev/null
@@ -0,0 +1,56 @@
+From 782e53d0c14420858dbf0f8f797973c150d3b6d7 Mon Sep 17 00:00:00 2001
+From: Ryusuke Konishi <konishi.ryusuke@gmail.com>
+Date: Mon, 12 Jun 2023 11:14:56 +0900
+Subject: nilfs2: prevent general protection fault in nilfs_clear_dirty_page()
+
+From: Ryusuke Konishi <konishi.ryusuke@gmail.com>
+
+commit 782e53d0c14420858dbf0f8f797973c150d3b6d7 upstream.
+
+In a syzbot stress test that deliberately causes file system errors on
+nilfs2 with a corrupted disk image, it has been reported that
+nilfs_clear_dirty_page() called from nilfs_clear_dirty_pages() can cause a
+general protection fault.
+
+In nilfs_clear_dirty_pages(), when looking up dirty pages from the page
+cache and calling nilfs_clear_dirty_page() for each dirty page/folio
+retrieved, the back reference from the argument page to "mapping" may have
+been changed to NULL (and possibly others).  It is necessary to check this
+after locking the page/folio.
+
+So, fix this issue by not calling nilfs_clear_dirty_page() on a page/folio
+after locking it in nilfs_clear_dirty_pages() if the back reference
+"mapping" from the page/folio is different from the "mapping" that held
+the page/folio just before.
+
+Link: https://lkml.kernel.org/r/20230612021456.3682-1-konishi.ryusuke@gmail.com
+Signed-off-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
+Reported-by: syzbot+53369d11851d8f26735c@syzkaller.appspotmail.com
+Closes: https://lkml.kernel.org/r/000000000000da4f6b05eb9bf593@google.com
+Tested-by: Ryusuke Konishi <konishi.ryusuke@gmail.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/nilfs2/page.c |   10 +++++++++-
+ 1 file changed, 9 insertions(+), 1 deletion(-)
+
+--- a/fs/nilfs2/page.c
++++ b/fs/nilfs2/page.c
+@@ -370,7 +370,15 @@ void nilfs_clear_dirty_pages(struct addr
+                       struct folio *folio = fbatch.folios[i];
+                       folio_lock(folio);
+-                      nilfs_clear_dirty_page(&folio->page, silent);
++
++                      /*
++                       * This folio may have been removed from the address
++                       * space by truncation or invalidation when the lock
++                       * was acquired.  Skip processing in that case.
++                       */
++                      if (likely(folio->mapping == mapping))
++                              nilfs_clear_dirty_page(&folio->page, silent);
++
+                       folio_unlock(folio);
+               }
+               folio_batch_release(&fbatch);
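The underlying rule is a general page-cache one. A minimal hedged sketch of the re-check after locking follows; the helper name is invented and this is not nilfs2 code.

/*
 * Hedged sketch: after locking a folio found through a mapping lookup,
 * re-check folio->mapping before touching it; truncation or invalidation
 * may have detached the folio while it was unlocked.
 */
#include <linux/pagemap.h>

static void handle_if_still_attached(struct address_space *mapping,
				     struct folio *folio)
{
	folio_lock(folio);
	if (likely(folio->mapping == mapping)) {
		/* still belongs to this file's page cache: safe to process */
	}
	folio_unlock(folio);
}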
diff --git a/queue-6.3/pci-hv-add-a-per-bus-mutex-state_lock.patch b/queue-6.3/pci-hv-add-a-per-bus-mutex-state_lock.patch
new file mode 100644 (file)
index 0000000..686ccb1
--- /dev/null
@@ -0,0 +1,163 @@
+From 067d6ec7ed5b49380688e06c1e5f883a71bef4fe Mon Sep 17 00:00:00 2001
+From: Dexuan Cui <decui@microsoft.com>
+Date: Wed, 14 Jun 2023 21:44:51 -0700
+Subject: PCI: hv: Add a per-bus mutex state_lock
+
+From: Dexuan Cui <decui@microsoft.com>
+
+commit 067d6ec7ed5b49380688e06c1e5f883a71bef4fe upstream.
+
+In the case of fast device addition/removal, it's possible that
+hv_eject_device_work() can start to run before create_root_hv_pci_bus()
+starts to run; as a result, the pci_get_domain_bus_and_slot() in
+hv_eject_device_work() can return a 'pdev' of NULL, and
+hv_eject_device_work() can remove the 'hpdev', and immediately send a
+message PCI_EJECTION_COMPLETE to the host, and the host immediately
+unassigns the PCI device from the guest; meanwhile,
+create_root_hv_pci_bus() and the PCI device driver can be probing the
+dead PCI device and reporting timeout errors.
+
+Fix the issue by adding a per-bus mutex 'state_lock' and grabbing the
+mutex before powering on the PCI bus in hv_pci_enter_d0(): when
+hv_eject_device_work() starts to run, it's able to find the 'pdev' and call
+pci_stop_and_remove_bus_device(pdev): if the PCI device driver has
+loaded, the PCI device driver's probe() function is already called in
+create_root_hv_pci_bus() -> pci_bus_add_devices(), and now
+hv_eject_device_work() -> pci_stop_and_remove_bus_device() is able
+to call the PCI device driver's remove() function and remove the device
+reliably; if the PCI device driver hasn't loaded yet, the function call
+hv_eject_device_work() -> pci_stop_and_remove_bus_device() is able to
+remove the PCI device reliably and the PCI device driver's probe()
+function won't be called; if the PCI device driver's probe() is already
+running (e.g., systemd-udev is loading the PCI device driver), it must
+be holding the per-device lock, and after the probe() finishes and releases
+the lock, hv_eject_device_work() -> pci_stop_and_remove_bus_device() is
+able to proceed to remove the device reliably.
+
+Fixes: 4daace0d8ce8 ("PCI: hv: Add paravirtual PCI front-end for Microsoft Hyper-V VMs")
+Signed-off-by: Dexuan Cui <decui@microsoft.com>
+Reviewed-by: Michael Kelley <mikelley@microsoft.com>
+Acked-by: Lorenzo Pieralisi <lpieralisi@kernel.org>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20230615044451.5580-6-decui@microsoft.com
+Signed-off-by: Wei Liu <wei.liu@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/pci/controller/pci-hyperv.c |   29 ++++++++++++++++++++++++++---
+ 1 file changed, 26 insertions(+), 3 deletions(-)
+
+--- a/drivers/pci/controller/pci-hyperv.c
++++ b/drivers/pci/controller/pci-hyperv.c
+@@ -489,7 +489,10 @@ struct hv_pcibus_device {
+       struct fwnode_handle *fwnode;
+       /* Protocol version negotiated with the host */
+       enum pci_protocol_version_t protocol_version;
++
++      struct mutex state_lock;
+       enum hv_pcibus_state state;
++
+       struct hv_device *hdev;
+       resource_size_t low_mmio_space;
+       resource_size_t high_mmio_space;
+@@ -2512,6 +2515,8 @@ static void pci_devices_present_work(str
+       if (!dr)
+               return;
++      mutex_lock(&hbus->state_lock);
++
+       /* First, mark all existing children as reported missing. */
+       spin_lock_irqsave(&hbus->device_list_lock, flags);
+       list_for_each_entry(hpdev, &hbus->children, list_entry) {
+@@ -2593,6 +2598,8 @@ static void pci_devices_present_work(str
+               break;
+       }
++      mutex_unlock(&hbus->state_lock);
++
+       kfree(dr);
+ }
+@@ -2741,6 +2748,8 @@ static void hv_eject_device_work(struct
+       hpdev = container_of(work, struct hv_pci_dev, wrk);
+       hbus = hpdev->hbus;
++      mutex_lock(&hbus->state_lock);
++
+       /*
+        * Ejection can come before or after the PCI bus has been set up, so
+        * attempt to find it and tear down the bus state, if it exists.  This
+@@ -2777,6 +2786,8 @@ static void hv_eject_device_work(struct
+       put_pcichild(hpdev);
+       put_pcichild(hpdev);
+       /* hpdev has been freed. Do not use it any more. */
++
++      mutex_unlock(&hbus->state_lock);
+ }
+ /**
+@@ -3567,6 +3578,7 @@ static int hv_pci_probe(struct hv_device
+               return -ENOMEM;
+       hbus->bridge = bridge;
++      mutex_init(&hbus->state_lock);
+       hbus->state = hv_pcibus_init;
+       hbus->wslot_res_allocated = -1;
+@@ -3675,9 +3687,11 @@ static int hv_pci_probe(struct hv_device
+       if (ret)
+               goto free_irq_domain;
++      mutex_lock(&hbus->state_lock);
++
+       ret = hv_pci_enter_d0(hdev);
+       if (ret)
+-              goto free_irq_domain;
++              goto release_state_lock;
+       ret = hv_pci_allocate_bridge_windows(hbus);
+       if (ret)
+@@ -3695,12 +3709,15 @@ static int hv_pci_probe(struct hv_device
+       if (ret)
+               goto free_windows;
++      mutex_unlock(&hbus->state_lock);
+       return 0;
+ free_windows:
+       hv_pci_free_bridge_windows(hbus);
+ exit_d0:
+       (void) hv_pci_bus_exit(hdev, true);
++release_state_lock:
++      mutex_unlock(&hbus->state_lock);
+ free_irq_domain:
+       irq_domain_remove(hbus->irq_domain);
+ free_fwnode:
+@@ -3950,20 +3967,26 @@ static int hv_pci_resume(struct hv_devic
+       if (ret)
+               goto out;
++      mutex_lock(&hbus->state_lock);
++
+       ret = hv_pci_enter_d0(hdev);
+       if (ret)
+-              goto out;
++              goto release_state_lock;
+       ret = hv_send_resources_allocated(hdev);
+       if (ret)
+-              goto out;
++              goto release_state_lock;
+       prepopulate_bars(hbus);
+       hv_pci_restore_msi_state(hbus);
+       hbus->state = hv_pcibus_installed;
++      mutex_unlock(&hbus->state_lock);
+       return 0;
++
++release_state_lock:
++      mutex_unlock(&hbus->state_lock);
+ out:
+       vmbus_close(hdev->channel);
+       return ret;
diff --git a/queue-6.3/pci-hv-fix-a-race-condition-bug-in-hv_pci_query_relations.patch b/queue-6.3/pci-hv-fix-a-race-condition-bug-in-hv_pci_query_relations.patch
new file mode 100644 (file)
index 0000000..c9531ab
--- /dev/null
@@ -0,0 +1,57 @@
+From 440b5e3663271b0ffbd4908115044a6a51fb938b Mon Sep 17 00:00:00 2001
+From: Dexuan Cui <decui@microsoft.com>
+Date: Wed, 14 Jun 2023 21:44:47 -0700
+Subject: PCI: hv: Fix a race condition bug in hv_pci_query_relations()
+
+From: Dexuan Cui <decui@microsoft.com>
+
+commit 440b5e3663271b0ffbd4908115044a6a51fb938b upstream.
+
+Since day 1 of the driver, there has been a race between
+hv_pci_query_relations() and survey_child_resources(): during fast
+device hotplug, hv_pci_query_relations() may error out due to
+device-remove and the stack variable 'comp' is no longer valid;
+however, pci_devices_present_work() -> survey_child_resources() ->
+complete() may be running on another CPU and accessing the no-longer-valid
+'comp'. Fix the race by flushing the workqueue before we exit from
+hv_pci_query_relations().
+
+Fixes: 4daace0d8ce8 ("PCI: hv: Add paravirtual PCI front-end for Microsoft Hyper-V VMs")
+Signed-off-by: Dexuan Cui <decui@microsoft.com>
+Reviewed-by: Michael Kelley <mikelley@microsoft.com>
+Acked-by: Lorenzo Pieralisi <lpieralisi@kernel.org>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20230615044451.5580-2-decui@microsoft.com
+Signed-off-by: Wei Liu <wei.liu@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/pci/controller/pci-hyperv.c |   18 ++++++++++++++++++
+ 1 file changed, 18 insertions(+)
+
+--- a/drivers/pci/controller/pci-hyperv.c
++++ b/drivers/pci/controller/pci-hyperv.c
+@@ -3308,6 +3308,24 @@ static int hv_pci_query_relations(struct
+       if (!ret)
+               ret = wait_for_response(hdev, &comp);
++      /*
++       * In the case of fast device addition/removal, it's possible that
++       * vmbus_sendpacket() or wait_for_response() returns -ENODEV but we
++       * already got a PCI_BUS_RELATIONS* message from the host and the
++       * channel callback already scheduled a work to hbus->wq, which can be
++       * running pci_devices_present_work() -> survey_child_resources() ->
++       * complete(&hbus->survey_event), even after hv_pci_query_relations()
++       * exits and the stack variable 'comp' is no longer valid; as a result,
++       * a hang or a page fault may happen when the complete() calls
++       * raw_spin_lock_irqsave(). Flush hbus->wq before we exit from
++       * hv_pci_query_relations() to avoid the issues. Note: if 'ret' is
++       * -ENODEV, there can't be any more work item scheduled to hbus->wq
++       * after the flush_workqueue(): see vmbus_onoffer_rescind() ->
++       * vmbus_reset_channel_cb(), vmbus_rescind_cleanup() ->
++       * channel->rescind = true.
++       */
++      flush_workqueue(hbus->wq);
++
+       return ret;
+ }
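The hazard being closed here is generic: a queued work item may still dereference an on-stack completion after the waiter has bailed out. The sketch below uses invented names and is not the Hyper-V code; it only models the flush-before-return rule.

/*
 * Hedged sketch of the hazard and fix: if the wait can fail or return early,
 * flush the workqueue before the function returns so the work callback
 * cannot touch the on-stack completion afterwards.
 */
#include <linux/completion.h>
#include <linux/workqueue.h>

struct survey_ctx {
	struct work_struct work;
	struct completion *done;	/* points at a caller's stack variable */
};

static void survey_fn(struct work_struct *work)
{
	struct survey_ctx *ctx = container_of(work, struct survey_ctx, work);

	complete(ctx->done);	/* must not run after 'comp' leaves scope */
}

static int query_with_flush(struct workqueue_struct *wq, struct survey_ctx *ctx)
{
	DECLARE_COMPLETION_ONSTACK(comp);
	int ret;

	ctx->done = &comp;
	INIT_WORK(&ctx->work, survey_fn);
	queue_work(wq, &ctx->work);

	ret = wait_for_completion_interruptible(&comp);

	/* Even on error, make sure survey_fn() has finished before the
	 * stack frame holding 'comp' is torn down. */
	flush_workqueue(wq);
	return ret;
}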
diff --git a/queue-6.3/pci-hv-fix-a-race-condition-in-hv_irq_unmask-that-can-cause-panic.patch b/queue-6.3/pci-hv-fix-a-race-condition-in-hv_irq_unmask-that-can-cause-panic.patch
new file mode 100644 (file)
index 0000000..307f776
--- /dev/null
@@ -0,0 +1,73 @@
+From 2738d5ab7929a845b654cd171a1e275c37eb428e Mon Sep 17 00:00:00 2001
+From: Dexuan Cui <decui@microsoft.com>
+Date: Wed, 14 Jun 2023 21:44:48 -0700
+Subject: PCI: hv: Fix a race condition in hv_irq_unmask() that can cause panic
+
+From: Dexuan Cui <decui@microsoft.com>
+
+commit 2738d5ab7929a845b654cd171a1e275c37eb428e upstream.
+
+When the host tries to remove a PCI device, the host first sends a
+PCI_EJECT message to the guest, and the guest is supposed to gracefully
+remove the PCI device and send a PCI_EJECTION_COMPLETE message to the host;
+the host then sends a VMBus message CHANNELMSG_RESCIND_CHANNELOFFER to
+the guest (when the guest receives this message, the device is already
+unassigned from the guest) and the guest can do some final cleanup work;
+if the guest fails to respond to the PCI_EJECT message within one minute,
+the host sends the VMBus message CHANNELMSG_RESCIND_CHANNELOFFER and
+removes the PCI device forcibly.
+
+In the case of fast device addition/removal, it's possible that the PCI
+device driver is still configuring MSI-X interrupts when the guest receives
+the PCI_EJECT message; the channel callback calls hv_pci_eject_device(),
+which sets hpdev->state to hv_pcichild_ejecting, and schedules a work
+hv_eject_device_work(); if the PCI device driver is calling
+pci_alloc_irq_vectors() -> ... -> hv_compose_msi_msg(), we can break the
+while loop in hv_compose_msi_msg() due to the updated hpdev->state, and
+leave data->chip_data with its default value of NULL; later, when the PCI
+device driver calls request_irq() -> ... -> hv_irq_unmask(), the guest
+crashes in hv_arch_irq_unmask() due to data->chip_data being NULL.
+
+Fix the issue by not testing hpdev->state in the while loop: when the
+guest receives PCI_EJECT, the device is still assigned to the guest, and
+the guest has one minute to finish the device removal gracefully. We don't
+really need to (and we should not) test hpdev->state in the loop.
+
+Fixes: de0aa7b2f97d ("PCI: hv: Fix 2 hang issues in hv_compose_msi_msg()")
+Signed-off-by: Dexuan Cui <decui@microsoft.com>
+Reviewed-by: Michael Kelley <mikelley@microsoft.com>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20230615044451.5580-3-decui@microsoft.com
+Signed-off-by: Wei Liu <wei.liu@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/pci/controller/pci-hyperv.c |   11 +++++------
+ 1 file changed, 5 insertions(+), 6 deletions(-)
+
+--- a/drivers/pci/controller/pci-hyperv.c
++++ b/drivers/pci/controller/pci-hyperv.c
+@@ -634,6 +634,11 @@ static void hv_arch_irq_unmask(struct ir
+       pbus = pdev->bus;
+       hbus = container_of(pbus->sysdata, struct hv_pcibus_device, sysdata);
+       int_desc = data->chip_data;
++      if (!int_desc) {
++              dev_warn(&hbus->hdev->device, "%s() can not unmask irq %u\n",
++                       __func__, data->irq);
++              return;
++      }
+       spin_lock_irqsave(&hbus->retarget_msi_interrupt_lock, flags);
+@@ -1902,12 +1907,6 @@ static void hv_compose_msi_msg(struct ir
+               hv_pci_onchannelcallback(hbus);
+               spin_unlock_irqrestore(&channel->sched_lock, flags);
+-              if (hpdev->state == hv_pcichild_ejecting) {
+-                      dev_err_once(&hbus->hdev->device,
+-                                   "the device is being ejected\n");
+-                      goto enable_tasklet;
+-              }
+-
+               udelay(100);
+       }
diff --git a/queue-6.3/pci-hv-remove-the-useless-hv_pcichild_state-from-struct-hv_pci_dev.patch b/queue-6.3/pci-hv-remove-the-useless-hv_pcichild_state-from-struct-hv_pci_dev.patch
new file mode 100644 (file)
index 0000000..3efd573
--- /dev/null
@@ -0,0 +1,62 @@
+From add9195e69c94b32e96f78c2f9cea68f0e850b3f Mon Sep 17 00:00:00 2001
+From: Dexuan Cui <decui@microsoft.com>
+Date: Wed, 14 Jun 2023 21:44:49 -0700
+Subject: PCI: hv: Remove the useless hv_pcichild_state from struct hv_pci_dev
+
+From: Dexuan Cui <decui@microsoft.com>
+
+commit add9195e69c94b32e96f78c2f9cea68f0e850b3f upstream.
+
+The hpdev->state field is never really useful: its only use, in
+hv_pci_eject_device() and hv_eject_device_work(), is unnecessary.
+
+Signed-off-by: Dexuan Cui <decui@microsoft.com>
+Reviewed-by: Michael Kelley <mikelley@microsoft.com>
+Acked-by: Lorenzo Pieralisi <lpieralisi@kernel.org>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20230615044451.5580-4-decui@microsoft.com
+Signed-off-by: Wei Liu <wei.liu@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/pci/controller/pci-hyperv.c |   12 ------------
+ 1 file changed, 12 deletions(-)
+
+--- a/drivers/pci/controller/pci-hyperv.c
++++ b/drivers/pci/controller/pci-hyperv.c
+@@ -553,19 +553,10 @@ struct hv_dr_state {
+       struct hv_pcidev_description func[];
+ };
+-enum hv_pcichild_state {
+-      hv_pcichild_init = 0,
+-      hv_pcichild_requirements,
+-      hv_pcichild_resourced,
+-      hv_pcichild_ejecting,
+-      hv_pcichild_maximum
+-};
+-
+ struct hv_pci_dev {
+       /* List protected by pci_rescan_remove_lock */
+       struct list_head list_entry;
+       refcount_t refs;
+-      enum hv_pcichild_state state;
+       struct pci_slot *pci_slot;
+       struct hv_pcidev_description desc;
+       bool reported_missing;
+@@ -2751,8 +2742,6 @@ static void hv_eject_device_work(struct
+       hpdev = container_of(work, struct hv_pci_dev, wrk);
+       hbus = hpdev->hbus;
+-      WARN_ON(hpdev->state != hv_pcichild_ejecting);
+-
+       /*
+        * Ejection can come before or after the PCI bus has been set up, so
+        * attempt to find it and tear down the bus state, if it exists.  This
+@@ -2809,7 +2798,6 @@ static void hv_pci_eject_device(struct h
+               return;
+       }
+-      hpdev->state = hv_pcichild_ejecting;
+       get_pcichild(hpdev);
+       INIT_WORK(&hpdev->wrk, hv_eject_device_work);
+       queue_work(hbus->wq, &hpdev->wrk);
diff --git a/queue-6.3/revert-pci-hv-fix-a-timing-issue-which-causes-kdump-to-fail-occasionally.patch b/queue-6.3/revert-pci-hv-fix-a-timing-issue-which-causes-kdump-to-fail-occasionally.patch
new file mode 100644 (file)
index 0000000..00bc63f
--- /dev/null
@@ -0,0 +1,144 @@
+From a847234e24d03d01a9566d1d9dcce018cc018d67 Mon Sep 17 00:00:00 2001
+From: Dexuan Cui <decui@microsoft.com>
+Date: Wed, 14 Jun 2023 21:44:50 -0700
+Subject: Revert "PCI: hv: Fix a timing issue which causes kdump to fail occasionally"
+
+From: Dexuan Cui <decui@microsoft.com>
+
+commit a847234e24d03d01a9566d1d9dcce018cc018d67 upstream.
+
+This reverts commit d6af2ed29c7c1c311b96dac989dcb991e90ee195.
+
+The statement "the hv_pci_bus_exit() call releases structures of all its
+child devices" in commit d6af2ed29c7c is not true: in the path
+hv_pci_probe() -> hv_pci_enter_d0() -> hv_pci_bus_exit(hdev, true): the
+parameter "keep_devs" is true, so hv_pci_bus_exit() does *not* release the
+child "struct hv_pci_dev *hpdev" that is created earlier in
+pci_devices_present_work() -> new_pcichild_device().
+
+Commit d6af2ed29c7c was originally made in July 2020 for RHEL 7.7, which
+used the old version of hv_pci_bus_exit(); when the commit was rebased
+and merged upstream, nobody noticed that it was no longer necessary. The
+commit itself doesn't cause any issue, but it makes hv_pci_probe() more
+complicated. Revert it to facilitate some upcoming changes to
+hv_pci_probe(). A minimal sketch of the retry-once control flow that
+hv_pci_enter_d0() regains here follows the patch below.
+
+Signed-off-by: Dexuan Cui <decui@microsoft.com>
+Reviewed-by: Michael Kelley <mikelley@microsoft.com>
+Acked-by: Wei Hu <weh@microsoft.com>
+Cc: stable@vger.kernel.org
+Link: https://lore.kernel.org/r/20230615044451.5580-5-decui@microsoft.com
+Signed-off-by: Wei Liu <wei.liu@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/pci/controller/pci-hyperv.c |   71 +++++++++++++++++-------------------
+ 1 file changed, 34 insertions(+), 37 deletions(-)
+
+--- a/drivers/pci/controller/pci-hyperv.c
++++ b/drivers/pci/controller/pci-hyperv.c
+@@ -3238,8 +3238,10 @@ static int hv_pci_enter_d0(struct hv_dev
+       struct pci_bus_d0_entry *d0_entry;
+       struct hv_pci_compl comp_pkt;
+       struct pci_packet *pkt;
++      bool retry = true;
+       int ret;
++enter_d0_retry:
+       /*
+        * Tell the host that the bus is ready to use, and moved into the
+        * powered-on state.  This includes telling the host which region
+@@ -3266,6 +3268,38 @@ static int hv_pci_enter_d0(struct hv_dev
+       if (ret)
+               goto exit;
++      /*
++       * In certain case (Kdump) the pci device of interest was
++       * not cleanly shut down and resource is still held on host
++       * side, the host could return invalid device status.
++       * We need to explicitly request host to release the resource
++       * and try to enter D0 again.
++       */
++      if (comp_pkt.completion_status < 0 && retry) {
++              retry = false;
++
++              dev_err(&hdev->device, "Retrying D0 Entry\n");
++
++              /*
++               * Hv_pci_bus_exit() calls hv_send_resource_released()
++               * to free up resources of its child devices.
++               * In the kdump kernel we need to set the
++               * wslot_res_allocated to 255 so it scans all child
++               * devices to release resources allocated in the
++               * normal kernel before panic happened.
++               */
++              hbus->wslot_res_allocated = 255;
++
++              ret = hv_pci_bus_exit(hdev, true);
++
++              if (ret == 0) {
++                      kfree(pkt);
++                      goto enter_d0_retry;
++              }
++              dev_err(&hdev->device,
++                      "Retrying D0 failed with ret %d\n", ret);
++      }
++
+       if (comp_pkt.completion_status < 0) {
+               dev_err(&hdev->device,
+                       "PCI Pass-through VSP failed D0 Entry with status %x\n",
+@@ -3511,7 +3545,6 @@ static int hv_pci_probe(struct hv_device
+       struct hv_pcibus_device *hbus;
+       u16 dom_req, dom;
+       char *name;
+-      bool enter_d0_retry = true;
+       int ret;
+       /*
+@@ -3651,47 +3684,11 @@ static int hv_pci_probe(struct hv_device
+       if (ret)
+               goto free_fwnode;
+-retry:
+       ret = hv_pci_query_relations(hdev);
+       if (ret)
+               goto free_irq_domain;
+       ret = hv_pci_enter_d0(hdev);
+-      /*
+-       * In certain case (Kdump) the pci device of interest was
+-       * not cleanly shut down and resource is still held on host
+-       * side, the host could return invalid device status.
+-       * We need to explicitly request host to release the resource
+-       * and try to enter D0 again.
+-       * Since the hv_pci_bus_exit() call releases structures
+-       * of all its child devices, we need to start the retry from
+-       * hv_pci_query_relations() call, requesting host to send
+-       * the synchronous child device relations message before this
+-       * information is needed in hv_send_resources_allocated()
+-       * call later.
+-       */
+-      if (ret == -EPROTO && enter_d0_retry) {
+-              enter_d0_retry = false;
+-
+-              dev_err(&hdev->device, "Retrying D0 Entry\n");
+-
+-              /*
+-               * Hv_pci_bus_exit() calls hv_send_resources_released()
+-               * to free up resources of its child devices.
+-               * In the kdump kernel we need to set the
+-               * wslot_res_allocated to 255 so it scans all child
+-               * devices to release resources allocated in the
+-               * normal kernel before panic happened.
+-               */
+-              hbus->wslot_res_allocated = 255;
+-              ret = hv_pci_bus_exit(hdev, true);
+-
+-              if (ret == 0)
+-                      goto retry;
+-
+-              dev_err(&hdev->device,
+-                      "Retrying D0 failed with ret %d\n", ret);
+-      }
+       if (ret)
+               goto free_irq_domain;
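
A minimal C sketch, with invented names throughout, of the retry-once control flow that this revert moves back into hv_pci_enter_d0(): a bool flag plus a label lets the function release stale host-side state and retry D0 entry exactly once, which is the pattern visible in the first hunk above.

	#include <stdbool.h>
	#include <stdio.h>

	/* Stand-in for the D0-entry request; pretend the first attempt fails
	 * the way it can after a kdump, and the second attempt succeeds. */
	static int try_enter_d0(int attempt)
	{
		return attempt == 0 ? -1 : 0;
	}

	static int enter_d0_with_retry(void)
	{
		bool retry = true;
		int attempt = 0;
		int ret;

	enter_d0_retry:
		ret = try_enter_d0(attempt++);
		if (ret < 0 && retry) {
			retry = false;	/* allow exactly one retry */
			fprintf(stderr, "Retrying D0 Entry\n");
			/* ... ask the host to release resources held by the
			 * previous kernel before trying again ... */
			goto enter_d0_retry;
		}
		return ret;
	}

	int main(void)
	{
		printf("enter_d0_with_retry() = %d\n", enter_d0_with_retry());
		return 0;
	}
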
diff --git a/queue-6.3/scripts-fix-the-gfp-flags-header-path-in-gfp-translate.patch b/queue-6.3/scripts-fix-the-gfp-flags-header-path-in-gfp-translate.patch
new file mode 100644 (file)
index 0000000..cb35df8
--- /dev/null
@@ -0,0 +1,46 @@
+From 2049a7d0cbc6ac8e370e836ed68597be04a7dc49 Mon Sep 17 00:00:00 2001
+From: Prathu Baronia <prathubaronia2011@gmail.com>
+Date: Thu, 8 Jun 2023 21:14:49 +0530
+Subject: scripts: fix the gfp flags header path in gfp-translate
+
+From: Prathu Baronia <prathubaronia2011@gmail.com>
+
+commit 2049a7d0cbc6ac8e370e836ed68597be04a7dc49 upstream.
+
+The gfp flags have been moved to gfp_types.h, so update the path in the
+gfp-translate script. A minimal sketch of the translation the script
+performs follows the patch below.
+
+Link: https://lkml.kernel.org/r/20230608154450.21758-1-prathubaronia2011@gmail.com
+Fixes: cb5a065b4ea9c ("headers/deps: mm: Split <linux/gfp_types.h> out of <linux/gfp.h>")
+Signed-off-by: Prathu Baronia <prathubaronia2011@gmail.com>
+Reviewed-by: David Hildenbrand <david@redhat.com>
+Cc: Masahiro Yamada <masahiroy@kernel.org>
+Cc: Nathan Chancellor <nathan@kernel.org>
+Cc: Nick Desaulniers <ndesaulniers@google.com>
+Cc: Nicolas Schier <nicolas@fjasle.eu>
+Cc: Ingo Molnar <mingo@kernel.org>
+Cc: Yury Norov <yury.norov@gmail.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ scripts/gfp-translate |    6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+--- a/scripts/gfp-translate
++++ b/scripts/gfp-translate
+@@ -63,11 +63,11 @@ fi
+ # Extract GFP flags from the kernel source
+ TMPFILE=`mktemp -t gfptranslate-XXXXXX` || exit 1
+-grep -q ___GFP $SOURCE/include/linux/gfp.h
++grep -q ___GFP $SOURCE/include/linux/gfp_types.h
+ if [ $? -eq 0 ]; then
+-      grep "^#define ___GFP" $SOURCE/include/linux/gfp.h | sed -e 's/u$//' | grep -v GFP_BITS > $TMPFILE
++      grep "^#define ___GFP" $SOURCE/include/linux/gfp_types.h | sed -e 's/u$//' | grep -v GFP_BITS > $TMPFILE
+ else
+-      grep "^#define __GFP" $SOURCE/include/linux/gfp.h | sed -e 's/(__force gfp_t)//' | sed -e 's/u)/)/' | grep -v GFP_BITS | sed -e 's/)\//) \//' > $TMPFILE
++      grep "^#define __GFP" $SOURCE/include/linux/gfp_types.h | sed -e 's/(__force gfp_t)//' | sed -e 's/u)/)/' | grep -v GFP_BITS | sed -e 's/)\//) \//' > $TMPFILE
+ fi
+ # Parse the flags
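
For context, a small C sketch of what gfp-translate does once it has extracted the ___GFP defines from gfp_types.h: it checks a given gfp mask against each known flag bit and prints the names that are set. The flag names and values below are purely illustrative placeholders, not taken from any kernel version.

	#include <stdio.h>

	struct gfp_flag {
		const char *name;
		unsigned long bit;
	};

	/* Illustrative entries only; the real script reads the ___GFP
	 * defines from $SOURCE/include/linux/gfp_types.h. */
	static const struct gfp_flag flags[] = {
		{ "___GFP_EXAMPLE_A", 0x01ul },
		{ "___GFP_EXAMPLE_B", 0x02ul },
		{ "___GFP_EXAMPLE_C", 0x10ul },
	};

	int main(void)
	{
		unsigned long mask = 0x11ul;	/* example input mask */
		size_t i;

		printf("gfp mask 0x%lx contains:\n", mask);
		for (i = 0; i < sizeof(flags) / sizeof(flags[0]); i++)
			if (mask & flags[i].bit)
				printf("  %s\n", flags[i].name);
		return 0;
	}
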
index debb6d06cf0c2e486db02373860e954c0a300dd6..46ab58e384ede5611ba8efb26e27106659a36bf2 100644 (file)
@@ -48,3 +48,40 @@ selftests-mptcp-join-skip-mpc-backups-tests-if-not-supported.patch
 selftests-mptcp-join-skip-pm-listener-tests-if-not-supported.patch
 selftests-mptcp-join-uniform-listener-tests.patch
 selftests-mptcp-join-skip-mixed-tests-if-not-supported.patch
+memfd-check-for-non-null-file_seals-in-memfd_create-syscall.patch
+writeback-fix-dereferencing-null-mapping-host-on-writeback_page_template.patch
+scripts-fix-the-gfp-flags-header-path-in-gfp-translate.patch
+nilfs2-fix-buffer-corruption-due-to-concurrent-device-reads.patch
+nilfs2-prevent-general-protection-fault-in-nilfs_clear_dirty_page.patch
+acpi-sleep-avoid-breaking-s3-wakeup-due-to-might_sleep.patch
+thermal-intel-intel_soc_dts_iosf-fix-reporting-wrong-temperatures.patch
+kvm-avoid-illegal-stage2-mapping-on-invalid-memory-slot.patch
+mm-vmalloc-do-not-output-a-spurious-warning-when-huge-vmalloc-fails.patch
+mm-mprotect-fix-do_mprotect_pkey-limit-check.patch
+drivers-hv-vmbus-call-hv_synic_free-if-hv_synic_alloc-fails.patch
+drivers-hv-vmbus-fix-vmbus_wait_for_unload-to-scan-present-cpus.patch
+pci-hv-fix-a-race-condition-bug-in-hv_pci_query_relations.patch
+revert-pci-hv-fix-a-timing-issue-which-causes-kdump-to-fail-occasionally.patch
+pci-hv-remove-the-useless-hv_pcichild_state-from-struct-hv_pci_dev.patch
+pci-hv-fix-a-race-condition-in-hv_irq_unmask-that-can-cause-panic.patch
+pci-hv-add-a-per-bus-mutex-state_lock.patch
+io_uring-net-clear-msg_controllen-on-partial-sendmsg-retry.patch
+io_uring-net-disable-partial-retries-for-recvmsg-with-cmsg.patch
+mptcp-handle-correctly-disconnect-failures.patch
+mptcp-fix-possible-divide-by-zero-in-recvmsg.patch
+mptcp-fix-possible-list-corruption-on-passive-mpj.patch
+mptcp-consolidate-fallback-and-non-fallback-state-machine.patch
+mptcp-ensure-listener-is-unhashed-before-updating-the-sk-status.patch
+cgroup-do-not-corrupt-task-iteration-when-rebinding-subsystem.patch
+cgroup-freezer-hold-cpu_hotplug_lock-before-freezer_mutex-in-freezer_css_-online-offline.patch
+net-mdio-fix-the-wrong-parameters.patch
+mmc-litex_mmc-set-probe_prefer_asynchronous.patch
+mmc-sdhci-msm-disable-broken-64-bit-dma-on-msm8916.patch
+mmc-meson-gx-remove-redundant-mmc_request_done-call-from-irq-context.patch
+mmc-mmci-stm32-fix-max-busy-timeout-calculation.patch
+mmc-sdhci-spear-fix-deferred-probing.patch
+mmc-bcm2835-fix-deferred-probing.patch
+mmc-sunxi-fix-deferred-probing.patch
+mmc-meson-gx-fix-deferred-probing.patch
+bpf-ensure-main-program-has-an-extable.patch
+wifi-iwlwifi-pcie-handle-so-f-device-for-pci-id-0x7af0.patch
diff --git a/queue-6.3/thermal-intel-intel_soc_dts_iosf-fix-reporting-wrong-temperatures.patch b/queue-6.3/thermal-intel-intel_soc_dts_iosf-fix-reporting-wrong-temperatures.patch
new file mode 100644 (file)
index 0000000..e0f5b41
--- /dev/null
@@ -0,0 +1,42 @@
+From 0bb619f9227aa370330d2b309733d74750705053 Mon Sep 17 00:00:00 2001
+From: Hans de Goede <hdegoede@redhat.com>
+Date: Wed, 14 Jun 2023 12:07:56 +0200
+Subject: thermal/intel/intel_soc_dts_iosf: Fix reporting wrong temperatures
+
+From: Hans de Goede <hdegoede@redhat.com>
+
+commit 0bb619f9227aa370330d2b309733d74750705053 upstream.
+
+Since commit 955fb8719efb ("thermal/intel/intel_soc_dts_iosf: Use Intel
+TCC library") intel_soc_dts_iosf is reporting the wrong temperature.
+
+The driver expects tj_max to be in milli-degrees Celsius, but after
+the switch to the TCC library it is now in degrees Celsius, so
+instead of e.g. 90000 it is set to 90, causing a temperature 45
+degrees below tj_max to be reported as -44910 milli-degrees
+instead of as 45000 milli-degrees.
+
+Fix this by adding back the lost factor of 1000. A short sketch of
+the arithmetic follows the patch below.
+
+Fixes: 955fb8719efb ("thermal/intel/intel_soc_dts_iosf: Use Intel TCC library")
+Reported-by: Bernhard Krug <b.krug@elektronenpumpe.de>
+Signed-off-by: Hans de Goede <hdegoede@redhat.com>
+Acked-by: Zhang Rui <rui.zhang@intel.com>
+Cc: 6.3+ <stable@vger.kernel.org> # 6.3+
+Signed-off-by: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/thermal/intel/intel_soc_dts_iosf.c |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/drivers/thermal/intel/intel_soc_dts_iosf.c
++++ b/drivers/thermal/intel/intel_soc_dts_iosf.c
+@@ -401,7 +401,7 @@ struct intel_soc_dts_sensors *intel_soc_
+       spin_lock_init(&sensors->intr_notify_lock);
+       mutex_init(&sensors->dts_update_lock);
+       sensors->intr_type = intr_type;
+-      sensors->tj_max = tj_max;
++      sensors->tj_max = tj_max * 1000;
+       if (intr_type == INTEL_SOC_DTS_INTERRUPT_NONE)
+               notification = false;
+       else
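
A short C sketch of the arithmetic described in the commit message, using illustrative names only: storing tj_max in plain degrees while subtracting a milli-degree offset yields -44910, whereas multiplying by 1000 first gives the expected 45000.

	#include <stdio.h>

	/* In this sketch the reported temperature is tj_max minus the sensor
	 * offset, both of which must be in milli-degrees Celsius. */
	static long report_temp(long tj_max_mC, long offset_mC)
	{
		return tj_max_mC - offset_mC;
	}

	int main(void)
	{
		long tj_max_C = 90;	/* what the TCC library now returns */
		long offset_mC = 45000;	/* sensor reads 45 C below tj_max */

		printf("broken: %ld\n", report_temp(tj_max_C, offset_mC));		/* -44910 */
		printf("fixed:  %ld\n", report_temp(tj_max_C * 1000, offset_mC));	/* 45000 */
		return 0;
	}
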
diff --git a/queue-6.3/wifi-iwlwifi-pcie-handle-so-f-device-for-pci-id-0x7af0.patch b/queue-6.3/wifi-iwlwifi-pcie-handle-so-f-device-for-pci-id-0x7af0.patch
new file mode 100644 (file)
index 0000000..7e0cd14
--- /dev/null
@@ -0,0 +1,34 @@
+From 4e9f0ec38852c18faa9689322e758575af33e5d4 Mon Sep 17 00:00:00 2001
+From: Mukesh Sisodiya <mukesh.sisodiya@intel.com>
+Date: Mon, 19 Jun 2023 17:02:34 +0200
+Subject: wifi: iwlwifi: pcie: Handle SO-F device for PCI id 0x7AF0
+
+From: Mukesh Sisodiya <mukesh.sisodiya@intel.com>
+
+commit 4e9f0ec38852c18faa9689322e758575af33e5d4 upstream.
+
+Add support for AX1690i and AX1690s devices with
+PCIe ID 0x7AF0.
+
+Cc: stable@vger.kernel.org # 6.1+
+Signed-off-by: Mukesh Sisodiya <mukesh.sisodiya@intel.com>
+Signed-off-by: Gregory Greenman <gregory.greenman@intel.com>
+Signed-off-by: Johannes Berg <johannes.berg@intel.com>
+Link: https://lore.kernel.org/r/20230619150233.461290-2-johannes@sipsolutions.net
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/wireless/intel/iwlwifi/pcie/drv.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/drivers/net/wireless/intel/iwlwifi/pcie/drv.c
++++ b/drivers/net/wireless/intel/iwlwifi/pcie/drv.c
+@@ -547,6 +547,8 @@ static const struct iwl_dev_info iwl_dev
+       IWL_DEV_INFO(0x54F0, 0x1692, iwlax411_2ax_cfg_so_gf4_a0, iwl_ax411_killer_1690i_name),
+       IWL_DEV_INFO(0x7A70, 0x1691, iwlax411_2ax_cfg_so_gf4_a0, iwl_ax411_killer_1690s_name),
+       IWL_DEV_INFO(0x7A70, 0x1692, iwlax411_2ax_cfg_so_gf4_a0, iwl_ax411_killer_1690i_name),
++      IWL_DEV_INFO(0x7AF0, 0x1691, iwlax411_2ax_cfg_so_gf4_a0, iwl_ax411_killer_1690s_name),
++      IWL_DEV_INFO(0x7AF0, 0x1692, iwlax411_2ax_cfg_so_gf4_a0, iwl_ax411_killer_1690i_name),
+       IWL_DEV_INFO(0x271C, 0x0214, iwl9260_2ac_cfg, iwl9260_1_name),
+       IWL_DEV_INFO(0x7E40, 0x1691, iwl_cfg_ma_a0_gf4_a0, iwl_ax411_killer_1690s_name),
diff --git a/queue-6.3/writeback-fix-dereferencing-null-mapping-host-on-writeback_page_template.patch b/queue-6.3/writeback-fix-dereferencing-null-mapping-host-on-writeback_page_template.patch
new file mode 100644 (file)
index 0000000..4b8146d
--- /dev/null
@@ -0,0 +1,97 @@
+From 54abe19e00cfcc5a72773d15cd00ed19ab763439 Mon Sep 17 00:00:00 2001
+From: Rafael Aquini <aquini@redhat.com>
+Date: Tue, 6 Jun 2023 19:36:13 -0400
+Subject: writeback: fix dereferencing NULL mapping->host on writeback_page_template
+
+From: Rafael Aquini <aquini@redhat.com>
+
+commit 54abe19e00cfcc5a72773d15cd00ed19ab763439 upstream.
+
+When commit 19343b5bdd16 ("mm/page-writeback: introduce tracepoint for
+wait_on_page_writeback()") repurposed the writeback_dirty_page trace event
+as a template to create its new wait_on_page_writeback trace event, it
+ended up opening a window to NULL pointer dereference crashes due to the
+(infrequent) occurrence of a race where an access to a page in the
+swap-cache happens concurrently with the moment this page is being written
+to disk and the tracepoint is enabled:
+
+    BUG: kernel NULL pointer dereference, address: 0000000000000040
+    #PF: supervisor read access in kernel mode
+    #PF: error_code(0x0000) - not-present page
+    PGD 800000010ec0a067 P4D 800000010ec0a067 PUD 102353067 PMD 0
+    Oops: 0000 [#1] PREEMPT SMP PTI
+    CPU: 1 PID: 1320 Comm: shmem-worker Kdump: loaded Not tainted 6.4.0-rc5+ #13
+    Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS edk2-20230301gitf80f052277c8-1.fc37 03/01/2023
+    RIP: 0010:trace_event_raw_event_writeback_folio_template+0x76/0xf0
+    Code: 4d 85 e4 74 5c 49 8b 3c 24 e8 06 98 ee ff 48 89 c7 e8 9e 8b ee ff ba 20 00 00 00 48 89 ef 48 89 c6 e8 fe d4 1a 00 49 8b 04 24 <48> 8b 40 40 48 89 43 28 49 8b 45 20 48 89 e7 48 89 43 30 e8 a2 4d
+    RSP: 0000:ffffaad580b6fb60 EFLAGS: 00010246
+    RAX: 0000000000000000 RBX: ffff90e38035c01c RCX: 0000000000000000
+    RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffff90e38035c044
+    RBP: ffff90e38035c024 R08: 0000000000000002 R09: 0000000000000006
+    R10: ffff90e38035c02e R11: 0000000000000020 R12: ffff90e380bac000
+    R13: ffffe3a7456d9200 R14: 0000000000001b81 R15: ffffe3a7456d9200
+    FS:  00007f2e4e8a15c0(0000) GS:ffff90e3fbc80000(0000) knlGS:0000000000000000
+    CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+    CR2: 0000000000000040 CR3: 00000001150c6003 CR4: 0000000000170ee0
+    Call Trace:
+     <TASK>
+     ? __die+0x20/0x70
+     ? page_fault_oops+0x76/0x170
+     ? kernelmode_fixup_or_oops+0x84/0x110
+     ? exc_page_fault+0x65/0x150
+     ? asm_exc_page_fault+0x22/0x30
+     ? trace_event_raw_event_writeback_folio_template+0x76/0xf0
+     folio_wait_writeback+0x6b/0x80
+     shmem_swapin_folio+0x24a/0x500
+     ? filemap_get_entry+0xe3/0x140
+     shmem_get_folio_gfp+0x36e/0x7c0
+     ? find_busiest_group+0x43/0x1a0
+     shmem_fault+0x76/0x2a0
+     ? __update_load_avg_cfs_rq+0x281/0x2f0
+     __do_fault+0x33/0x130
+     do_read_fault+0x118/0x160
+     do_pte_missing+0x1ed/0x2a0
+     __handle_mm_fault+0x566/0x630
+     handle_mm_fault+0x91/0x210
+     do_user_addr_fault+0x22c/0x740
+     exc_page_fault+0x65/0x150
+     asm_exc_page_fault+0x22/0x30
+
+This problem arises from the fact that the repurposed writeback_dirty_page
+trace event code was written assuming that every pointer to mapping
+(struct address_space) would come from a file-mapped page-cache object,
+thus mapping->host would always be populated, and that was a valid case
+before commit 19343b5bdd16.  The swap-cache address space
+(swapper_spaces), however, doesn't populate its ->host (struct inode)
+pointer, thus leading to the crashes in the corner case described above.
+
+Commit 19343b5bdd16 ended up breaking the assignment of __entry->name and
+__entry->ino for the wait_on_page_writeback tracepoint -- both dependent
+on mapping->host carrying a pointer to a valid inode.  The assignment of
+__entry->name was fixed by commit 68f23b89067f ("memcg: fix a crash in
+wb_workfn when a device disappears"), and this commit fixes the remaining
+case, __entry->ino. A minimal sketch of the guard follows the patch
+below.
+
+Link: https://lkml.kernel.org/r/20230606233613.1290819-1-aquini@redhat.com
+Fixes: 19343b5bdd16 ("mm/page-writeback: introduce tracepoint for wait_on_page_writeback()")
+Signed-off-by: Rafael Aquini <aquini@redhat.com>
+Reviewed-by: Yafang Shao <laoar.shao@gmail.com>
+Cc: Aristeu Rozanski <aris@redhat.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/trace/events/writeback.h |    2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/include/trace/events/writeback.h
++++ b/include/trace/events/writeback.h
+@@ -68,7 +68,7 @@ DECLARE_EVENT_CLASS(writeback_folio_temp
+               strscpy_pad(__entry->name,
+                           bdi_dev_name(mapping ? inode_to_bdi(mapping->host) :
+                                        NULL), 32);
+-              __entry->ino = mapping ? mapping->host->i_ino : 0;
++              __entry->ino = (mapping && mapping->host) ? mapping->host->i_ino : 0;
+               __entry->index = folio->index;
+       ),
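
A minimal C sketch of the guard in the one-line fix above, using illustrative stand-in types: the inode number is only read when both the mapping and its host inode exist, which is exactly the case the swap-cache address_space (NULL ->host) violates.

	#include <stdio.h>

	struct fake_inode { unsigned long i_ino; };
	struct fake_address_space { struct fake_inode *host; };

	/* Mirrors: (mapping && mapping->host) ? mapping->host->i_ino : 0 */
	static unsigned long trace_ino(const struct fake_address_space *mapping)
	{
		return (mapping && mapping->host) ? mapping->host->i_ino : 0;
	}

	int main(void)
	{
		struct fake_inode inode = { .i_ino = 1234 };
		struct fake_address_space file_mapping = { .host = &inode };
		struct fake_address_space swap_mapping = { .host = NULL };	/* swapper_spaces case */

		printf("file-backed: %lu\n", trace_ino(&file_mapping));	/* 1234 */
		printf("swap-cache:  %lu\n", trace_ino(&swap_mapping));	/* 0, no crash */
		printf("no mapping:  %lu\n", trace_ino(NULL));		/* 0, no crash */
		return 0;
	}
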