From 981db53b683bf16f99fa1f06933edf4bf31de653 Mon Sep 17 00:00:00 2001 From: Sasha Levin Date: Sat, 21 Jan 2023 23:27:49 -0500 Subject: [PATCH] Fixes for 5.15 Signed-off-by: Sasha Levin --- ...-fix-mute-micmute-leds-don-t-work-fo.patch | 37 +++++ ...-fix-mute-micmute-leds-for-a-hp-prob.patch | 38 ++++++ ...-removal-of-the-firmware-framebuffer.patch | 86 ++++++++++++ ...le-runtime-pm-on-several-sienna-cich.patch | 69 ++++++++++ ...e-infinite-retry-read-efivars-after-.patch | 46 +++++++ ...provide-a-eventfd_signal_mask-helper.patch | 120 ++++++++++++++++ ...dd-epoll_uring_wake-poll-wakeup-flag.patch | 119 ++++++++++++++++ ...nshare-some-pmds-when-splitting-vmas.patch | 128 ++++++++++++++++++ ...g-for-disabling-provided-buffer-recy.patch | 60 ++++++++ ...ng-allow-re-poll-if-we-made-progress.patch | 53 ++++++++ ...o-not-recalculate-ppos-unnecessarily.patch | 100 ++++++++++++++ ...ate-task_work-run-on-tif_notify_sign.patch | 47 +++++++ ...recv-and-recvmsg-handle-msg_waitall-.patch | 107 +++++++++++++++ ...that-cached-task-references-are-alwa.patch | 55 ++++++++ ...x-async-accept-on-o_nonblock-sockets.patch | 50 +++++++ ...ing-improve-send-recv-error-handling.patch | 126 +++++++++++++++++ ...-epoll_uring_wake-for-eventfd-signal.patch | 87 ++++++++++++ ...ve-duplicated-calls-to-io_kiocb_ppos.patch | 65 +++++++++ ...defer-fsnotify-calls-to-task-context.patch | 122 +++++++++++++++++ ...t-msg_waitall-for-ioring_op_send-msg.patch | 111 +++++++++++++++ ...pdate-kiocb-ki_pos-at-execution-time.patch | 86 ++++++++++++ queue-5.15/series | 21 +++ 22 files changed, 1733 insertions(+) create mode 100644 queue-5.15/alsa-hda-realtek-fix-mute-micmute-leds-don-t-work-fo.patch create mode 100644 queue-5.15/alsa-hda-realtek-fix-mute-micmute-leds-for-a-hp-prob.patch create mode 100644 queue-5.15/drm-amd-delay-removal-of-the-firmware-framebuffer.patch create mode 100644 queue-5.15/drm-amdgpu-disable-runtime-pm-on-several-sienna-cich.patch create mode 100644 queue-5.15/efi-fix-userspace-infinite-retry-read-efivars-after-.patch create mode 100644 queue-5.15/eventfd-provide-a-eventfd_signal_mask-helper.patch create mode 100644 queue-5.15/eventpoll-add-epoll_uring_wake-poll-wakeup-flag.patch create mode 100644 queue-5.15/hugetlb-unshare-some-pmds-when-splitting-vmas.patch create mode 100644 queue-5.15/io_uring-add-flag-for-disabling-provided-buffer-recy.patch create mode 100644 queue-5.15/io_uring-allow-re-poll-if-we-made-progress.patch create mode 100644 queue-5.15/io_uring-do-not-recalculate-ppos-unnecessarily.patch create mode 100644 queue-5.15/io_uring-don-t-gate-task_work-run-on-tif_notify_sign.patch create mode 100644 queue-5.15/io_uring-ensure-recv-and-recvmsg-handle-msg_waitall-.patch create mode 100644 queue-5.15/io_uring-ensure-that-cached-task-references-are-alwa.patch create mode 100644 queue-5.15/io_uring-fix-async-accept-on-o_nonblock-sockets.patch create mode 100644 queue-5.15/io_uring-improve-send-recv-error-handling.patch create mode 100644 queue-5.15/io_uring-pass-in-epoll_uring_wake-for-eventfd-signal.patch create mode 100644 queue-5.15/io_uring-remove-duplicated-calls-to-io_kiocb_ppos.patch create mode 100644 queue-5.15/io_uring-rw-defer-fsnotify-calls-to-task-context.patch create mode 100644 queue-5.15/io_uring-support-msg_waitall-for-ioring_op_send-msg.patch create mode 100644 queue-5.15/io_uring-update-kiocb-ki_pos-at-execution-time.patch diff --git a/queue-5.15/alsa-hda-realtek-fix-mute-micmute-leds-don-t-work-fo.patch b/queue-5.15/alsa-hda-realtek-fix-mute-micmute-leds-don-t-work-fo.patch new 
file mode 100644 index 00000000000..02423717b28 --- /dev/null +++ b/queue-5.15/alsa-hda-realtek-fix-mute-micmute-leds-don-t-work-fo.patch @@ -0,0 +1,37 @@ +From cf8d448a5fbdf2a0300125d11c2920002d9313b9 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 5 Jan 2023 12:41:53 +0800 +Subject: ALSA: hda/realtek: fix mute/micmute LEDs don't work for a HP platform + +From: Jeremy Szu + +[ Upstream commit 9c694fbfe6f36017b060ad74c7565cb379852e40 ] + +There is a HP platform uses ALC236 codec which using GPIO2 to control +mute LED and GPIO1 to control micmute LED. +Thus, add a quirk to make them work. + +Signed-off-by: Jeremy Szu +Cc: +Link: https://lore.kernel.org/r/20230105044154.8242-1-jeremy.szu@canonical.com +Signed-off-by: Takashi Iwai +Signed-off-by: Sasha Levin +--- + sound/pci/hda/patch_realtek.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c +index feb337083573..74fe0fe85834 100644 +--- a/sound/pci/hda/patch_realtek.c ++++ b/sound/pci/hda/patch_realtek.c +@@ -9078,6 +9078,7 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { + SND_PCI_QUIRK(0x103c, 0x8aab, "HP EliteBook 650 G9 (MB 8AA9)", ALC236_FIXUP_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8b5d, "HP", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), + SND_PCI_QUIRK(0x103c, 0x8b5e, "HP", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), ++ SND_PCI_QUIRK(0x103c, 0x8bf0, "HP", ALC236_FIXUP_HP_GPIO_LED), + SND_PCI_QUIRK(0x1043, 0x103e, "ASUS X540SA", ALC256_FIXUP_ASUS_MIC), + SND_PCI_QUIRK(0x1043, 0x103f, "ASUS TX300", ALC282_FIXUP_ASUS_TX300), + SND_PCI_QUIRK(0x1043, 0x106d, "Asus K53BE", ALC269_FIXUP_LIMIT_INT_MIC_BOOST), +-- +2.39.0 + diff --git a/queue-5.15/alsa-hda-realtek-fix-mute-micmute-leds-for-a-hp-prob.patch b/queue-5.15/alsa-hda-realtek-fix-mute-micmute-leds-for-a-hp-prob.patch new file mode 100644 index 00000000000..e50c3958d7d --- /dev/null +++ b/queue-5.15/alsa-hda-realtek-fix-mute-micmute-leds-for-a-hp-prob.patch @@ -0,0 +1,38 @@ +From a2739279bac7a3b48e5be5b874b4d40407ba163e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 28 Nov 2022 10:28:47 +0800 +Subject: ALSA: hda/realtek: fix mute/micmute LEDs for a HP ProBook + +From: Andy Chi + +[ Upstream commit 1d8025ec722d5e011f9299c46274eb21fb54a428 ] + +There is a HP ProBook which using ALC236 codec and need the +ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF quirk to make mute LED and +micmute LED work. 
+ +Signed-off-by: Andy Chi +Cc: +Link: https://lore.kernel.org/r/20221128022849.13759-1-andy.chi@canonical.com +Signed-off-by: Takashi Iwai +Signed-off-by: Sasha Levin +--- + sound/pci/hda/patch_realtek.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c +index c7321f5842b3..feb337083573 100644 +--- a/sound/pci/hda/patch_realtek.c ++++ b/sound/pci/hda/patch_realtek.c +@@ -9076,6 +9076,8 @@ static const struct snd_pci_quirk alc269_fixup_tbl[] = { + SND_PCI_QUIRK(0x103c, 0x8aa3, "HP ProBook 450 G9 (MB 8AA1)", ALC236_FIXUP_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8aa8, "HP EliteBook 640 G9 (MB 8AA6)", ALC236_FIXUP_HP_GPIO_LED), + SND_PCI_QUIRK(0x103c, 0x8aab, "HP EliteBook 650 G9 (MB 8AA9)", ALC236_FIXUP_HP_GPIO_LED), ++ SND_PCI_QUIRK(0x103c, 0x8b5d, "HP", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), ++ SND_PCI_QUIRK(0x103c, 0x8b5e, "HP", ALC236_FIXUP_HP_MUTE_LED_MICMUTE_VREF), + SND_PCI_QUIRK(0x1043, 0x103e, "ASUS X540SA", ALC256_FIXUP_ASUS_MIC), + SND_PCI_QUIRK(0x1043, 0x103f, "ASUS TX300", ALC282_FIXUP_ASUS_TX300), + SND_PCI_QUIRK(0x1043, 0x106d, "Asus K53BE", ALC269_FIXUP_LIMIT_INT_MIC_BOOST), +-- +2.39.0 + diff --git a/queue-5.15/drm-amd-delay-removal-of-the-firmware-framebuffer.patch b/queue-5.15/drm-amd-delay-removal-of-the-firmware-framebuffer.patch new file mode 100644 index 00000000000..7937c7adeb6 --- /dev/null +++ b/queue-5.15/drm-amd-delay-removal-of-the-firmware-framebuffer.patch @@ -0,0 +1,86 @@ +From cf3b8fb9b542f4bfd3beb12c8f3bb9c1166a3a1d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 20 Jan 2023 18:01:19 -0500 +Subject: drm/amd: Delay removal of the firmware framebuffer + +[ Upstream commit 1923bc5a56daeeabd7e9093bad2febcd6af2416a ] + +Removing the firmware framebuffer from the driver means that even +if the driver doesn't support the IP blocks in a GPU it will no +longer be functional after the driver fails to initialize. + +This change will ensure that unsupported IP blocks at least cause +the driver to work with the EFI framebuffer. 
+ +Cc: stable@vger.kernel.org +Suggested-by: Alex Deucher +Reviewed-by: Alex Deucher +Reviewed-by: Lijo Lazar +Signed-off-by: Mario Limonciello +Signed-off-by: Alex Deucher +Signed-off-by: Sasha Levin +--- + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 8 ++++++++ + drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 6 ------ + 2 files changed, 8 insertions(+), 6 deletions(-) + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +index 0d998bc830c2..b5fe2c91f58c 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +@@ -32,6 +32,7 @@ + #include + #include + ++#include + #include + #include + #include +@@ -89,6 +90,8 @@ MODULE_FIRMWARE("amdgpu/yellow_carp_gpu_info.bin"); + + #define AMDGPU_RESUME_MS 2000 + ++static const struct drm_driver amdgpu_kms_driver; ++ + const char *amdgpu_asic_name[] = { + "TAHITI", + "PITCAIRN", +@@ -3637,6 +3640,11 @@ int amdgpu_device_init(struct amdgpu_device *adev, + if (r) + return r; + ++ /* Get rid of things like offb */ ++ r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver); ++ if (r) ++ return r; ++ + /* doorbell bar mapping and doorbell index init*/ + amdgpu_device_doorbell_init(adev); + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +index cabbf02eb054..c95cee3d4c9a 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +@@ -23,7 +23,6 @@ + */ + + #include +-#include + #include + #include + #include +@@ -2067,11 +2066,6 @@ static int amdgpu_pci_probe(struct pci_dev *pdev, + size = pci_resource_len(pdev, 0); + is_fw_fb = amdgpu_is_fw_framebuffer(base, size); + +- /* Get rid of things like offb */ +- ret = drm_aperture_remove_conflicting_pci_framebuffers(pdev, &amdgpu_kms_driver); +- if (ret) +- return ret; +- + adev = devm_drm_dev_alloc(&pdev->dev, &amdgpu_kms_driver, typeof(*adev), ddev); + if (IS_ERR(adev)) + return PTR_ERR(adev); +-- +2.39.0 + diff --git a/queue-5.15/drm-amdgpu-disable-runtime-pm-on-several-sienna-cich.patch b/queue-5.15/drm-amdgpu-disable-runtime-pm-on-several-sienna-cich.patch new file mode 100644 index 00000000000..2b295cde49b --- /dev/null +++ b/queue-5.15/drm-amdgpu-disable-runtime-pm-on-several-sienna-cich.patch @@ -0,0 +1,69 @@ +From c825716dde20494ca5f44c4c81158973736999f4 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 27 Apr 2022 15:51:02 +0800 +Subject: drm/amdgpu: disable runtime pm on several sienna cichlid cards(v2) + +From: Guchun Chen + +[ Upstream commit d1acd68b2b8924c804e1e3cc1bc5fa4d6b76176c ] + +Disable runtime power management on several sienna cichlid +cards, otherwise SMU will possibly fail to be resumed from +runtime suspend. Will drop this after a clean solution between +kernel driver and SMU FW is available. + +amdgpu 0000:63:00.0: amdgpu: GECC is enabled +amdgpu 0000:63:00.0: amdgpu: SECUREDISPLAY: securedisplay ta ucode is not available +amdgpu 0000:63:00.0: amdgpu: SMU is resuming... +amdgpu 0000:63:00.0: amdgpu: SMU: I'm not done with your command: SMN_C2PMSG_66:0x0000000E SMN_C2PMSG_82:0x00000080 +amdgpu 0000:63:00.0: amdgpu: Failed to SetDriverDramAddr! +amdgpu 0000:63:00.0: amdgpu: Failed to setup smc hw! +[drm:amdgpu_device_ip_resume_phase2 [amdgpu]] *ERROR* resume of IP block failed -62 +amdgpu 0000:63:00.0: amdgpu: amdgpu_device_ip_resume failed (-62) + +v2: seperate to a function. 
+ +Signed-off-by: Guchun Chen +Reviewed-by: Evan Quan +Signed-off-by: Alex Deucher +Stable-dep-of: 1923bc5a56da ("drm/amd: Delay removal of the firmware framebuffer") +Signed-off-by: Sasha Levin +--- + drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c | 14 ++++++++++++++ + 1 file changed, 14 insertions(+) + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +index 6744427577b3..43e30b9a2e02 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_kms.c +@@ -43,6 +43,17 @@ + #include "amdgpu_display.h" + #include "amdgpu_ras.h" + ++static void amdgpu_runtime_pm_quirk(struct amdgpu_device *adev) ++{ ++ /* ++ * Add below quirk on several sienna_cichlid cards to disable ++ * runtime pm to fix EMI failures. ++ */ ++ if (((adev->pdev->device == 0x73A1) && (adev->pdev->revision == 0x00)) || ++ ((adev->pdev->device == 0x73BF) && (adev->pdev->revision == 0xCF))) ++ adev->runpm = false; ++} ++ + void amdgpu_unregister_gpu_instance(struct amdgpu_device *adev) + { + struct amdgpu_gpu_instance *gpu_instance; +@@ -201,6 +212,9 @@ int amdgpu_driver_load_kms(struct amdgpu_device *adev, unsigned long flags) + */ + if (adev->is_fw_fb) + adev->runpm = false; ++ ++ amdgpu_runtime_pm_quirk(adev); ++ + if (adev->runpm) + dev_info(adev->dev, "Using BACO for runtime pm\n"); + } +-- +2.39.0 + diff --git a/queue-5.15/efi-fix-userspace-infinite-retry-read-efivars-after-.patch b/queue-5.15/efi-fix-userspace-infinite-retry-read-efivars-after-.patch new file mode 100644 index 00000000000..4d1096a8823 --- /dev/null +++ b/queue-5.15/efi-fix-userspace-infinite-retry-read-efivars-after-.patch @@ -0,0 +1,46 @@ +From c368463eedf59bc987512cbbacc2f0609421bfaa Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 27 Dec 2022 23:09:36 +0800 +Subject: efi: fix userspace infinite retry read efivars after EFI runtime + services page fault + +From: Ding Hui + +[ Upstream commit e006ac3003080177cf0b673441a4241f77aaecce ] + +After [1][2], if we catch exceptions due to EFI runtime service, we will +clear EFI_RUNTIME_SERVICES bit to disable EFI runtime service, then the +subsequent routine which invoke the EFI runtime service should fail. + +But the userspace cat efivars through /sys/firmware/efi/efivars/ will stuck +and infinite loop calling read() due to efivarfs_file_read() return -EINTR. + +The -EINTR is converted from EFI_ABORTED by efi_status_to_err(), and is +an improper return value in this situation, so let virt_efi_xxx() return +EFI_DEVICE_ERROR and converted to -EIO to invoker. 
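The retry behavior described above is worth seeing from the userspace side. Below is a minimal sketch of the conventional EINTR retry loop that turned the old -EINTR return into an infinite spin; the variable path is a made-up example, and any efivars entry shows the same pattern. An -EIO return falls out of the loop and is reported:

	#include <errno.h>
	#include <fcntl.h>
	#include <stdio.h>
	#include <unistd.h>

	int main(void)
	{
		char buf[4096];
		ssize_t n;
		/* hypothetical variable name; the GUID is EFI_GLOBAL_VARIABLE */
		int fd = open("/sys/firmware/efi/efivars/SomeVar-8be4df61-93ca-11d2-aa0d-00e098032b8c",
			      O_RDONLY);

		if (fd < 0)
			return 1;

		/*
		 * Classic retry idiom: -EINTR means "interrupted, try again".
		 * If the kernel keeps returning -EINTR after runtime services
		 * were disabled, this spins forever; -EIO breaks out.
		 */
		do {
			n = read(fd, buf, sizeof(buf));
		} while (n < 0 && errno == EINTR);

		if (n < 0)
			perror("read efivar");
		close(fd);
		return n < 0;
	}

Any program using this common idiom against efivars would hang under the old behavior, which is why -EIO is the better mapping for a permanently failed runtime service.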
+ +Cc: +Fixes: 3425d934fc03 ("efi/x86: Handle page faults occurring while running EFI runtime services") +Fixes: 23715a26c8d8 ("arm64: efi: Recover from synchronous exceptions occurring in firmware") +Signed-off-by: Ding Hui +Signed-off-by: Ard Biesheuvel +Signed-off-by: Sasha Levin +--- + drivers/firmware/efi/runtime-wrappers.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/drivers/firmware/efi/runtime-wrappers.c b/drivers/firmware/efi/runtime-wrappers.c +index f3e54f6616f0..60075e0e4943 100644 +--- a/drivers/firmware/efi/runtime-wrappers.c ++++ b/drivers/firmware/efi/runtime-wrappers.c +@@ -62,6 +62,7 @@ struct efi_runtime_work efi_rts_work; + \ + if (!efi_enabled(EFI_RUNTIME_SERVICES)) { \ + pr_warn_once("EFI Runtime Services are disabled!\n"); \ ++ efi_rts_work.status = EFI_DEVICE_ERROR; \ + goto exit; \ + } \ + \ +-- +2.39.0 + diff --git a/queue-5.15/eventfd-provide-a-eventfd_signal_mask-helper.patch b/queue-5.15/eventfd-provide-a-eventfd_signal_mask-helper.patch new file mode 100644 index 00000000000..abe4f326d05 --- /dev/null +++ b/queue-5.15/eventfd-provide-a-eventfd_signal_mask-helper.patch @@ -0,0 +1,120 @@ +From 54b5c7b71df2983aa7b663b231216bc6fb51ba6b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 20 Nov 2022 10:13:44 -0700 +Subject: eventfd: provide a eventfd_signal_mask() helper + +From: Jens Axboe + +[ Upstream commit 03e02acda8e267a8183e1e0ed289ff1ef9cd7ed8 ] + +This is identical to eventfd_signal(), but it allows the caller to pass +in a mask to be used for the poll wakeup key. The use case is avoiding +repeated multishot triggers if we have a dependency between eventfd and +io_uring. + +If we setup an eventfd context and register that as the io_uring eventfd, +and at the same time queue a multishot poll request for the eventfd +context, then any CQE posted will repeatedly trigger the multishot request +until it terminates when the CQ ring overflows. + +In preparation for io_uring detecting this circular dependency, add the +mentioned helper so that io_uring can pass in EPOLL_URING as part of the +poll wakeup key. + +Cc: stable@vger.kernel.org # 6.0 +[axboe: fold in !CONFIG_EVENTFD fix from Zhang Qilong] +Signed-off-by: Jens Axboe +Signed-off-by: Sasha Levin +--- + fs/eventfd.c | 37 +++++++++++++++++++++---------------- + include/linux/eventfd.h | 7 +++++++ + 2 files changed, 28 insertions(+), 16 deletions(-) + +diff --git a/fs/eventfd.c b/fs/eventfd.c +index c0ffee99ad23..249ca6c0b784 100644 +--- a/fs/eventfd.c ++++ b/fs/eventfd.c +@@ -43,21 +43,7 @@ struct eventfd_ctx { + int id; + }; + +-/** +- * eventfd_signal - Adds @n to the eventfd counter. +- * @ctx: [in] Pointer to the eventfd context. +- * @n: [in] Value of the counter to be added to the eventfd internal counter. +- * The value cannot be negative. +- * +- * This function is supposed to be called by the kernel in paths that do not +- * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX +- * value, and we signal this as overflow condition by returning a EPOLLERR +- * to poll(2). +- * +- * Returns the amount by which the counter was incremented. This will be less +- * than @n if the counter has overflowed. 
+- */ +-__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n) ++__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, unsigned mask) + { + unsigned long flags; + +@@ -78,12 +64,31 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n) + n = ULLONG_MAX - ctx->count; + ctx->count += n; + if (waitqueue_active(&ctx->wqh)) +- wake_up_locked_poll(&ctx->wqh, EPOLLIN); ++ wake_up_locked_poll(&ctx->wqh, EPOLLIN | mask); + current->in_eventfd = 0; + spin_unlock_irqrestore(&ctx->wqh.lock, flags); + + return n; + } ++ ++/** ++ * eventfd_signal - Adds @n to the eventfd counter. ++ * @ctx: [in] Pointer to the eventfd context. ++ * @n: [in] Value of the counter to be added to the eventfd internal counter. ++ * The value cannot be negative. ++ * ++ * This function is supposed to be called by the kernel in paths that do not ++ * allow sleeping. In this function we allow the counter to reach the ULLONG_MAX ++ * value, and we signal this as overflow condition by returning a EPOLLERR ++ * to poll(2). ++ * ++ * Returns the amount by which the counter was incremented. This will be less ++ * than @n if the counter has overflowed. ++ */ ++__u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n) ++{ ++ return eventfd_signal_mask(ctx, n, 0); ++} + EXPORT_SYMBOL_GPL(eventfd_signal); + + static void eventfd_free_ctx(struct eventfd_ctx *ctx) +diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h +index 3cd202d3eefb..36a486505b08 100644 +--- a/include/linux/eventfd.h ++++ b/include/linux/eventfd.h +@@ -40,6 +40,7 @@ struct file *eventfd_fget(int fd); + struct eventfd_ctx *eventfd_ctx_fdget(int fd); + struct eventfd_ctx *eventfd_ctx_fileget(struct file *file); + __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n); ++__u64 eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, unsigned mask); + int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait, + __u64 *cnt); + void eventfd_ctx_do_read(struct eventfd_ctx *ctx, __u64 *cnt); +@@ -66,6 +67,12 @@ static inline int eventfd_signal(struct eventfd_ctx *ctx, __u64 n) + return -ENOSYS; + } + ++static inline int eventfd_signal_mask(struct eventfd_ctx *ctx, __u64 n, ++ unsigned mask) ++{ ++ return -ENOSYS; ++} ++ + static inline void eventfd_ctx_put(struct eventfd_ctx *ctx) + { + +-- +2.39.0 + diff --git a/queue-5.15/eventpoll-add-epoll_uring_wake-poll-wakeup-flag.patch b/queue-5.15/eventpoll-add-epoll_uring_wake-poll-wakeup-flag.patch new file mode 100644 index 00000000000..e89c674e41d --- /dev/null +++ b/queue-5.15/eventpoll-add-epoll_uring_wake-poll-wakeup-flag.patch @@ -0,0 +1,119 @@ +From 7ac2c34204d7579365219f9337decd8c82bc08b1 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 20 Nov 2022 10:10:53 -0700 +Subject: eventpoll: add EPOLL_URING_WAKE poll wakeup flag + +From: Jens Axboe + +[ Upstream commit caf1aeaffc3b09649a56769e559333ae2c4f1802 ] + +We can have dependencies between epoll and io_uring. Consider an epoll +context, identified by the epfd file descriptor, and an io_uring file +descriptor identified by iofd. If we add iofd to the epfd context, and +arm a multishot poll request for epfd with iofd, then the multishot +poll request will repeatedly trigger and generate events until terminated +by CQ ring overflow. This isn't a desired behavior. + +Add EPOLL_URING so that io_uring can pass it in as part of the poll wakeup +key, and io_uring can check for that to detect a potential recursive +invocation. 
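The dependency shape is easier to see with plain eventfd and epoll. A minimal userspace sketch follows; note that EPOLL_URING_WAKE itself is a kernel-internal wakeup-key bit and is never visible at this level:

	#include <stdint.h>
	#include <stdio.h>
	#include <sys/epoll.h>
	#include <sys/eventfd.h>
	#include <unistd.h>

	int main(void)
	{
		int efd = eventfd(0, EFD_NONBLOCK);
		int epfd = epoll_create1(0);
		struct epoll_event ev = { .events = EPOLLIN };
		struct epoll_event out;
		uint64_t val = 1;

		ev.data.fd = efd;
		epoll_ctl(epfd, EPOLL_CTL_ADD, efd, &ev);

		/* Signaling the eventfd wakes the epoll waiter with EPOLLIN. */
		write(efd, &val, sizeof(val));

		printf("events ready: %d\n", epoll_wait(epfd, &out, 1, 0));
		close(epfd);
		close(efd);
		return 0;
	}

If that same eventfd were registered as an io_uring CQ eventfd and watched by a multishot poll request, every CQE would signal the eventfd, which would fire the poll, which would post another CQE; the new key bit is what lets io_uring recognize its own wakeup and break the cycle.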
+ +Cc: stable@vger.kernel.org # 6.0 +Signed-off-by: Jens Axboe +Signed-off-by: Sasha Levin +--- + fs/eventpoll.c | 18 ++++++++++-------- + include/uapi/linux/eventpoll.h | 6 ++++++ + 2 files changed, 16 insertions(+), 8 deletions(-) + +diff --git a/fs/eventpoll.c b/fs/eventpoll.c +index cf326c53db0f..1ec197825544 100644 +--- a/fs/eventpoll.c ++++ b/fs/eventpoll.c +@@ -484,7 +484,8 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) + */ + #ifdef CONFIG_DEBUG_LOCK_ALLOC + +-static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi) ++static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, ++ unsigned pollflags) + { + struct eventpoll *ep_src; + unsigned long flags; +@@ -515,16 +516,17 @@ static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi) + } + spin_lock_irqsave_nested(&ep->poll_wait.lock, flags, nests); + ep->nests = nests + 1; +- wake_up_locked_poll(&ep->poll_wait, EPOLLIN); ++ wake_up_locked_poll(&ep->poll_wait, EPOLLIN | pollflags); + ep->nests = 0; + spin_unlock_irqrestore(&ep->poll_wait.lock, flags); + } + + #else + +-static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi) ++static void ep_poll_safewake(struct eventpoll *ep, struct epitem *epi, ++ unsigned pollflags) + { +- wake_up_poll(&ep->poll_wait, EPOLLIN); ++ wake_up_poll(&ep->poll_wait, EPOLLIN | pollflags); + } + + #endif +@@ -735,7 +737,7 @@ static void ep_free(struct eventpoll *ep) + + /* We need to release all tasks waiting for these file */ + if (waitqueue_active(&ep->poll_wait)) +- ep_poll_safewake(ep, NULL); ++ ep_poll_safewake(ep, NULL, 0); + + /* + * We need to lock this because we could be hit by +@@ -1201,7 +1203,7 @@ static int ep_poll_callback(wait_queue_entry_t *wait, unsigned mode, int sync, v + + /* We have to call this outside the lock */ + if (pwake) +- ep_poll_safewake(ep, epi); ++ ep_poll_safewake(ep, epi, pollflags & EPOLL_URING_WAKE); + + if (!(epi->event.events & EPOLLEXCLUSIVE)) + ewake = 1; +@@ -1546,7 +1548,7 @@ static int ep_insert(struct eventpoll *ep, const struct epoll_event *event, + + /* We have to call this outside the lock */ + if (pwake) +- ep_poll_safewake(ep, NULL); ++ ep_poll_safewake(ep, NULL, 0); + + return 0; + } +@@ -1622,7 +1624,7 @@ static int ep_modify(struct eventpoll *ep, struct epitem *epi, + + /* We have to call this outside the lock */ + if (pwake) +- ep_poll_safewake(ep, NULL); ++ ep_poll_safewake(ep, NULL, 0); + + return 0; + } +diff --git a/include/uapi/linux/eventpoll.h b/include/uapi/linux/eventpoll.h +index 8a3432d0f0dc..e687658843b1 100644 +--- a/include/uapi/linux/eventpoll.h ++++ b/include/uapi/linux/eventpoll.h +@@ -41,6 +41,12 @@ + #define EPOLLMSG (__force __poll_t)0x00000400 + #define EPOLLRDHUP (__force __poll_t)0x00002000 + ++/* ++ * Internal flag - wakeup generated by io_uring, used to detect recursion back ++ * into the io_uring poll handler. 
++ */ ++#define EPOLL_URING_WAKE ((__force __poll_t)(1U << 27)) ++ + /* Set exclusive wakeup mode for the target file descriptor */ + #define EPOLLEXCLUSIVE ((__force __poll_t)(1U << 28)) + +-- +2.39.0 + diff --git a/queue-5.15/hugetlb-unshare-some-pmds-when-splitting-vmas.patch b/queue-5.15/hugetlb-unshare-some-pmds-when-splitting-vmas.patch new file mode 100644 index 00000000000..f902b6891fc --- /dev/null +++ b/queue-5.15/hugetlb-unshare-some-pmds-when-splitting-vmas.patch @@ -0,0 +1,128 @@ +From 8b58f4839634ed814a53f71c0831a2e4fa0e9292 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 4 Jan 2023 23:19:10 +0000 +Subject: hugetlb: unshare some PMDs when splitting VMAs + +From: James Houghton + +[ Upstream commit b30c14cd61025eeea2f2e8569606cd167ba9ad2d ] + +PMD sharing can only be done in PUD_SIZE-aligned pieces of VMAs; however, +it is possible that HugeTLB VMAs are split without unsharing the PMDs +first. + +Without this fix, it is possible to hit the uffd-wp-related WARN_ON_ONCE +in hugetlb_change_protection [1]. The key there is that +hugetlb_unshare_all_pmds will not attempt to unshare PMDs in +non-PUD_SIZE-aligned sections of the VMA. + +It might seem ideal to unshare in hugetlb_vm_op_open, but we need to +unshare in both the new and old VMAs, so unsharing in hugetlb_vm_op_split +seems natural. + +[1]: https://lore.kernel.org/linux-mm/CADrL8HVeOkj0QH5VZZbRzybNE8CG-tEGFshnA+bG9nMgcWtBSg@mail.gmail.com/ + +Link: https://lkml.kernel.org/r/20230104231910.1464197-1-jthoughton@google.com +Fixes: 6dfeaff93be1 ("hugetlb/userfaultfd: unshare all pmds for hugetlbfs when register wp") +Signed-off-by: James Houghton +Reviewed-by: Mike Kravetz +Acked-by: Peter Xu +Cc: Axel Rasmussen +Cc: Muchun Song +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Sasha Levin +--- + mm/hugetlb.c | 44 +++++++++++++++++++++++++++++++++++--------- + 1 file changed, 35 insertions(+), 9 deletions(-) + +diff --git a/mm/hugetlb.c b/mm/hugetlb.c +index e7bd42f23667..8599f16d4aa4 100644 +--- a/mm/hugetlb.c ++++ b/mm/hugetlb.c +@@ -82,6 +82,8 @@ struct mutex *hugetlb_fault_mutex_table ____cacheline_aligned_in_smp; + + /* Forward declaration */ + static int hugetlb_acct_memory(struct hstate *h, long delta); ++static void hugetlb_unshare_pmds(struct vm_area_struct *vma, ++ unsigned long start, unsigned long end); + + static inline bool subpool_is_free(struct hugepage_subpool *spool) + { +@@ -4164,6 +4166,25 @@ static int hugetlb_vm_op_split(struct vm_area_struct *vma, unsigned long addr) + { + if (addr & ~(huge_page_mask(hstate_vma(vma)))) + return -EINVAL; ++ ++ /* ++ * PMD sharing is only possible for PUD_SIZE-aligned address ranges ++ * in HugeTLB VMAs. If we will lose PUD_SIZE alignment due to this ++ * split, unshare PMDs in the PUD_SIZE interval surrounding addr now. ++ */ ++ if (addr & ~PUD_MASK) { ++ /* ++ * hugetlb_vm_op_split is called right before we attempt to ++ * split the VMA. We will need to unshare PMDs in the old and ++ * new VMAs, so let's unshare before we split. ++ */ ++ unsigned long floor = addr & PUD_MASK; ++ unsigned long ceil = floor + PUD_SIZE; ++ ++ if (floor >= vma->vm_start && ceil <= vma->vm_end) ++ hugetlb_unshare_pmds(vma, floor, ceil); ++ } ++ + return 0; + } + +@@ -6349,26 +6370,21 @@ void move_hugetlb_state(struct page *oldpage, struct page *newpage, int reason) + } + } + +-/* +- * This function will unconditionally remove all the shared pmd pgtable entries +- * within the specific vma for a hugetlbfs memory range. 
+- */ +-void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) ++static void hugetlb_unshare_pmds(struct vm_area_struct *vma, ++ unsigned long start, ++ unsigned long end) + { + struct hstate *h = hstate_vma(vma); + unsigned long sz = huge_page_size(h); + struct mm_struct *mm = vma->vm_mm; + struct mmu_notifier_range range; +- unsigned long address, start, end; ++ unsigned long address; + spinlock_t *ptl; + pte_t *ptep; + + if (!(vma->vm_flags & VM_MAYSHARE)) + return; + +- start = ALIGN(vma->vm_start, PUD_SIZE); +- end = ALIGN_DOWN(vma->vm_end, PUD_SIZE); +- + if (start >= end) + return; + +@@ -6400,6 +6416,16 @@ void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) + mmu_notifier_invalidate_range_end(&range); + } + ++/* ++ * This function will unconditionally remove all the shared pmd pgtable entries ++ * within the specific vma for a hugetlbfs memory range. ++ */ ++void hugetlb_unshare_all_pmds(struct vm_area_struct *vma) ++{ ++ hugetlb_unshare_pmds(vma, ALIGN(vma->vm_start, PUD_SIZE), ++ ALIGN_DOWN(vma->vm_end, PUD_SIZE)); ++} ++ + #ifdef CONFIG_CMA + static bool cma_reserve_called __initdata; + +-- +2.39.0 + diff --git a/queue-5.15/io_uring-add-flag-for-disabling-provided-buffer-recy.patch b/queue-5.15/io_uring-add-flag-for-disabling-provided-buffer-recy.patch new file mode 100644 index 00000000000..b1427bdf943 --- /dev/null +++ b/queue-5.15/io_uring-add-flag-for-disabling-provided-buffer-recy.patch @@ -0,0 +1,60 @@ +From e9452c133eee9a693368f144cdbea07cfe5fe492 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 23 Mar 2022 09:30:05 -0600 +Subject: io_uring: add flag for disabling provided buffer recycling + +From: Jens Axboe + +commit 8a3e8ee56417f5e0e66580d93941ed9d6f4c8274 upstream. + +If we need to continue doing this IO, then we don't want a potentially +selected buffer recycled. Add a flag for that. + +Set this for recv/recvmsg if they do partial IO. 
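For reference, the semantics being preserved across these retries are those of MSG_WAITALL on stream sockets. A hedged userspace equivalent of what the kernel-side retry accomplishes (accumulate partial receives rather than returning short):

	#include <errno.h>
	#include <sys/types.h>
	#include <sys/socket.h>

	/*
	 * Userspace analogue of MSG_WAITALL on a stream socket: keep
	 * receiving until the full length arrives, EOF, or a real error.
	 */
	static ssize_t recv_all(int fd, void *buf, size_t len)
	{
		size_t done = 0;

		while (done < len) {
			ssize_t n = recv(fd, (char *)buf + done, len - done, 0);

			if (n == 0)
				break;		/* peer closed */
			if (n < 0) {
				if (errno == EINTR)
					continue;
				return done ? (ssize_t)done : -1;
			}
			done += n;
		}
		return done;
	}

The io_net_retry() helper introduced in the related patches below applies the same rule only to SOCK_STREAM and SOCK_SEQPACKET, since datagram sockets have no meaningful "rest of the data" to wait for.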
+ +Signed-off-by: Jens Axboe +Signed-off-by: Sasha Levin +--- + io_uring/io_uring.c | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c +index 04441e981624..2350d43aa782 100644 +--- a/io_uring/io_uring.c ++++ b/io_uring/io_uring.c +@@ -740,6 +740,7 @@ enum { + REQ_F_CREDS_BIT, + REQ_F_REFCOUNT_BIT, + REQ_F_ARM_LTIMEOUT_BIT, ++ REQ_F_PARTIAL_IO_BIT, + /* keep async read/write and isreg together and in order */ + REQ_F_NOWAIT_READ_BIT, + REQ_F_NOWAIT_WRITE_BIT, +@@ -795,6 +796,8 @@ enum { + REQ_F_REFCOUNT = BIT(REQ_F_REFCOUNT_BIT), + /* there is a linked timeout that has to be armed */ + REQ_F_ARM_LTIMEOUT = BIT(REQ_F_ARM_LTIMEOUT_BIT), ++ /* request has already done partial IO */ ++ REQ_F_PARTIAL_IO = BIT(REQ_F_PARTIAL_IO_BIT), + }; + + struct async_poll { +@@ -5123,6 +5126,7 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) + ret = -EINTR; + if (ret > 0 && io_net_retry(sock, flags)) { + sr->done_io += ret; ++ req->flags |= REQ_F_PARTIAL_IO; + return io_setup_async_msg(req, kmsg); + } + req_set_fail(req); +@@ -5196,6 +5200,7 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) + sr->len -= ret; + sr->buf += ret; + sr->done_io += ret; ++ req->flags |= REQ_F_PARTIAL_IO; + return -EAGAIN; + } + req_set_fail(req); +-- +2.39.0 + diff --git a/queue-5.15/io_uring-allow-re-poll-if-we-made-progress.patch b/queue-5.15/io_uring-allow-re-poll-if-we-made-progress.patch new file mode 100644 index 00000000000..f5435836ab3 --- /dev/null +++ b/queue-5.15/io_uring-allow-re-poll-if-we-made-progress.patch @@ -0,0 +1,53 @@ +From 5e575c1b387f1229b83e69da35f905ead24efd01 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 21 Jan 2023 10:39:22 -0700 +Subject: io_uring: allow re-poll if we made progress + +From: Jens Axboe + +commit 10c873334febaeea9aa0c25c10b5ac0951b77a5f upstream. + +We currently check REQ_F_POLLED before arming async poll for a +notification to retry. If it's set, then we don't allow poll and will +punt to io-wq instead. This is done to prevent a situation where a buggy +driver will repeatedly return that there's space/data available yet we +get -EAGAIN. + +However, if we already transferred data, then it should be safe to rely +on poll again. Gate the check on whether or not REQ_F_PARTIAL_IO is +also set. 
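The gating expression used in the diff below is a compact bitmask idiom: abort only when POLLED is set and PARTIAL_IO is clear. A self-contained sketch with illustrative flag values (the real bits live in io_uring's request flags):

	#include <stdbool.h>

	/* Illustrative flag bits, not the kernel's actual values. */
	#define F_POLLED	(1u << 0)
	#define F_PARTIAL_IO	(1u << 1)

	/*
	 * Abort a second poll-arm only if we already polled once AND made
	 * no progress: (flags & (POLLED|PARTIAL_IO)) == POLLED is true
	 * exactly when POLLED is set and PARTIAL_IO is clear.
	 */
	static bool should_abort_repoll(unsigned flags)
	{
		return (flags & (F_POLLED | F_PARTIAL_IO)) == F_POLLED;
	}

Testing both bits against the pair mask in one comparison expresses the condition without a second branch.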
+ +Signed-off-by: Jens Axboe +Signed-off-by: Sasha Levin +--- + io_uring/io_uring.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c +index 3fb76863fed4..997a7264e1d4 100644 +--- a/io_uring/io_uring.c ++++ b/io_uring/io_uring.c +@@ -5853,7 +5853,7 @@ static int io_arm_poll_handler(struct io_kiocb *req) + + if (!req->file || !file_can_poll(req->file)) + return IO_APOLL_ABORTED; +- if (req->flags & REQ_F_POLLED) ++ if ((req->flags & (REQ_F_POLLED|REQ_F_PARTIAL_IO)) == REQ_F_POLLED) + return IO_APOLL_ABORTED; + if (!def->pollin && !def->pollout) + return IO_APOLL_ABORTED; +@@ -5869,7 +5869,10 @@ static int io_arm_poll_handler(struct io_kiocb *req) + mask |= POLLOUT | POLLWRNORM; + } + +- apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); ++ if (req->flags & REQ_F_POLLED) ++ apoll = req->apoll; ++ else ++ apoll = kmalloc(sizeof(*apoll), GFP_ATOMIC); + if (unlikely(!apoll)) + return IO_APOLL_ABORTED; + apoll->double_poll = NULL; +-- +2.39.0 + diff --git a/queue-5.15/io_uring-do-not-recalculate-ppos-unnecessarily.patch b/queue-5.15/io_uring-do-not-recalculate-ppos-unnecessarily.patch new file mode 100644 index 00000000000..a84f710c98b --- /dev/null +++ b/queue-5.15/io_uring-do-not-recalculate-ppos-unnecessarily.patch @@ -0,0 +1,100 @@ +From 7fb0202e019fb7add27e0d80961adf708996a5b9 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 22 Feb 2022 02:55:03 -0800 +Subject: io_uring: do not recalculate ppos unnecessarily + +From: Dylan Yudaken + +commit b4aec40015953b65f2f114641e7fd7714c8df8e6 upstream. + +There is a slight optimisation to be had by calculating the correct pos +pointer inside io_kiocb_update_pos and then using that later. + +It seems code size drops by a bit: +000000000000a1b0 0000000000000400 t io_read +000000000000a5b0 0000000000000319 t io_write + +vs +000000000000a1b0 00000000000003f6 t io_read +000000000000a5b0 0000000000000310 t io_write + +Signed-off-by: Dylan Yudaken +Reviewed-by: Pavel Begunkov +Signed-off-by: Jens Axboe +Signed-off-by: Sasha Levin +--- + io_uring/io_uring.c | 18 ++++++++++++------ + 1 file changed, 12 insertions(+), 6 deletions(-) + +diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c +index d9396cfaa4f3..73d261004c4a 100644 +--- a/io_uring/io_uring.c ++++ b/io_uring/io_uring.c +@@ -3003,18 +3003,22 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) + } + } + +-static inline void io_kiocb_update_pos(struct io_kiocb *req) ++static inline loff_t *io_kiocb_update_pos(struct io_kiocb *req) + { + struct kiocb *kiocb = &req->rw.kiocb; ++ bool is_stream = req->file->f_mode & FMODE_STREAM; + + if (kiocb->ki_pos == -1) { +- if (!(req->file->f_mode & FMODE_STREAM)) { ++ if (!is_stream) { + req->flags |= REQ_F_CUR_POS; + kiocb->ki_pos = req->file->f_pos; ++ return &kiocb->ki_pos; + } else { + kiocb->ki_pos = 0; ++ return NULL; + } + } ++ return is_stream ? 
NULL : &kiocb->ki_pos; + } + + static void kiocb_done(struct kiocb *kiocb, ssize_t ret, +@@ -3540,6 +3544,7 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + struct iov_iter_state __state, *state; + ssize_t ret, ret2; ++ loff_t *ppos; + + if (rw) { + iter = &rw->iter; +@@ -3572,9 +3577,9 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) + return ret ?: -EAGAIN; + } + +- io_kiocb_update_pos(req); ++ ppos = io_kiocb_update_pos(req); + +- ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result); ++ ret = rw_verify_area(READ, req->file, ppos, req->result); + if (unlikely(ret)) { + kfree(iovec); + return ret; +@@ -3678,6 +3683,7 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) + bool force_nonblock = issue_flags & IO_URING_F_NONBLOCK; + struct iov_iter_state __state, *state; + ssize_t ret, ret2; ++ loff_t *ppos; + + if (rw) { + iter = &rw->iter; +@@ -3708,9 +3714,9 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) + (req->flags & REQ_F_ISREG)) + goto copy_iov; + +- io_kiocb_update_pos(req); ++ ppos = io_kiocb_update_pos(req); + +- ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result); ++ ret = rw_verify_area(WRITE, req->file, ppos, req->result); + if (unlikely(ret)) + goto out_free; + +-- +2.39.0 + diff --git a/queue-5.15/io_uring-don-t-gate-task_work-run-on-tif_notify_sign.patch b/queue-5.15/io_uring-don-t-gate-task_work-run-on-tif_notify_sign.patch new file mode 100644 index 00000000000..8cc7577a4a5 --- /dev/null +++ b/queue-5.15/io_uring-don-t-gate-task_work-run-on-tif_notify_sign.patch @@ -0,0 +1,47 @@ +From a1d18b218b3c9ad3c44f17686ed39780d552a243 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 20 Jan 2023 20:50:24 -0700 +Subject: io_uring: don't gate task_work run on TIF_NOTIFY_SIGNAL + +From: Jens Axboe + +commit 46a525e199e4037516f7e498c18f065b09df32ac upstream. + +This isn't a reliable mechanism to tell if we have task_work pending, we +really should be looking at whether we have any items queued. This is +problematic if forward progress is gated on running said task_work. One +such example is reading from a pipe, where the write side has been closed +right before the read is started. The fput() of the file queues TWA_RESUME +task_work, and we need that task_work to be run before ->release() is +called for the pipe. If ->release() isn't called, then the read will sit +forever waiting on data that will never arise. + +Fix this by io_run_task_work() so it checks if we have task_work pending +rather than rely on TIF_NOTIFY_SIGNAL for that. The latter obviously +doesn't work for task_work that is queued without TWA_SIGNAL. 
+ +Reported-by: Christiano Haesbaert +Cc: stable@vger.kernel.org +Link: https://github.com/axboe/liburing/issues/665 +Signed-off-by: Jens Axboe +Signed-off-by: Sasha Levin +--- + io_uring/io-wq.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/io_uring/io-wq.c b/io_uring/io-wq.c +index 87bc38b47103..81485c1a9879 100644 +--- a/io_uring/io-wq.c ++++ b/io_uring/io-wq.c +@@ -513,7 +513,7 @@ static struct io_wq_work *io_get_next_work(struct io_wqe_acct *acct, + + static bool io_flush_signals(void) + { +- if (unlikely(test_thread_flag(TIF_NOTIFY_SIGNAL))) { ++ if (test_thread_flag(TIF_NOTIFY_SIGNAL) || current->task_works) { + __set_current_state(TASK_RUNNING); + tracehook_notify_signal(); + return true; +-- +2.39.0 + diff --git a/queue-5.15/io_uring-ensure-recv-and-recvmsg-handle-msg_waitall-.patch b/queue-5.15/io_uring-ensure-recv-and-recvmsg-handle-msg_waitall-.patch new file mode 100644 index 00000000000..6c18140646b --- /dev/null +++ b/queue-5.15/io_uring-ensure-recv-and-recvmsg-handle-msg_waitall-.patch @@ -0,0 +1,107 @@ +From 512090d1131532991640fe4a0a1bf5a72e9f451b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 21 Jan 2023 10:21:22 -0700 +Subject: io_uring: ensure recv and recvmsg handle MSG_WAITALL correctly + +From: Jens Axboe + +commit 7ba89d2af17aa879dda30f5d5d3f152e587fc551 upstream. + +We currently don't attempt to get the full asked for length even if +MSG_WAITALL is set, if we get a partial receive. If we do see a partial +receive, then just note how many bytes we did and return -EAGAIN to +get it retried. + +The iov is advanced appropriately for the vector based case, and we +manually bump the buffer and remainder for the non-vector case. + +Cc: stable@vger.kernel.org +Reported-by: Constantine Gavrilov +Signed-off-by: Jens Axboe +Signed-off-by: Sasha Levin +--- + io_uring/io_uring.c | 28 ++++++++++++++++++++++++++++ + 1 file changed, 28 insertions(+) + +diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c +index 93023562d548..04441e981624 100644 +--- a/io_uring/io_uring.c ++++ b/io_uring/io_uring.c +@@ -578,6 +578,7 @@ struct io_sr_msg { + int msg_flags; + int bgid; + size_t len; ++ size_t done_io; + struct io_buffer *kbuf; + }; + +@@ -5063,12 +5064,21 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) + if (req->ctx->compat) + sr->msg_flags |= MSG_CMSG_COMPAT; + #endif ++ sr->done_io = 0; + return 0; + } + ++static bool io_net_retry(struct socket *sock, int flags) ++{ ++ if (!(flags & MSG_WAITALL)) ++ return false; ++ return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET; ++} ++ + static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) + { + struct io_async_msghdr iomsg, *kmsg; ++ struct io_sr_msg *sr = &req->sr_msg; + struct socket *sock; + struct io_buffer *kbuf; + unsigned flags; +@@ -5111,6 +5121,10 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) + return io_setup_async_msg(req, kmsg); + if (ret == -ERESTARTSYS) + ret = -EINTR; ++ if (ret > 0 && io_net_retry(sock, flags)) { ++ sr->done_io += ret; ++ return io_setup_async_msg(req, kmsg); ++ } + req_set_fail(req); + } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { + req_set_fail(req); +@@ -5122,6 +5136,10 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) + if (kmsg->free_iov) + kfree(kmsg->free_iov); + req->flags &= ~REQ_F_NEED_CLEANUP; ++ if (ret >= 0) ++ ret += sr->done_io; ++ else if (sr->done_io) ++ ret = sr->done_io; + __io_req_complete(req, 
issue_flags, ret, cflags); + return 0; + } +@@ -5174,12 +5192,22 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; ++ if (ret > 0 && io_net_retry(sock, flags)) { ++ sr->len -= ret; ++ sr->buf += ret; ++ sr->done_io += ret; ++ return -EAGAIN; ++ } + req_set_fail(req); + } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { + req_set_fail(req); + } + if (req->flags & REQ_F_BUFFER_SELECTED) + cflags = io_put_recv_kbuf(req); ++ if (ret >= 0) ++ ret += sr->done_io; ++ else if (sr->done_io) ++ ret = sr->done_io; + __io_req_complete(req, issue_flags, ret, cflags); + return 0; + } +-- +2.39.0 + diff --git a/queue-5.15/io_uring-ensure-that-cached-task-references-are-alwa.patch b/queue-5.15/io_uring-ensure-that-cached-task-references-are-alwa.patch new file mode 100644 index 00000000000..8c44856ddc5 --- /dev/null +++ b/queue-5.15/io_uring-ensure-that-cached-task-references-are-alwa.patch @@ -0,0 +1,55 @@ +From dccf54bc7a377c5df8da5d2dcf86ac76bf255e61 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 21 Jan 2023 12:36:08 -0700 +Subject: io_uring: ensure that cached task references are always put on exit + +From: Jens Axboe + +commit e775f93f2ab976a2cdb4a7b53063cbe890904f73 upstream. + +io_uring caches task references to avoid doing atomics for each of them +per request. If a request is put from the same task that allocated it, +then we can maintain a per-ctx cache of them. This obviously relies +on io_uring always pruning caches in a reliable way, and there's +currently a case off io_uring fd release where we can miss that. + +One example is a ring setup with IOPOLL, which relies on the task +polling for completions, which will free them. However, if such a task +submits a request and then exits or closes the ring without reaping +the completion, then ring release will reap and put. If release happens +from that very same task, the completed request task refs will get +put back into the cache pool. This is problematic, as we're now beyond +the point of pruning caches. + +Manually drop these caches after doing an IOPOLL reap. This releases +references from the current task, which is enough. If another task +happens to be doing the release, then the caching will not be +triggered and there's no issue. 
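The caching scheme itself is a standard batched-refcount pattern. An illustrative sketch — not the actual io_uring structures — showing why an exit path must not repopulate the cache after the final drain:

	#include <stdatomic.h>
	#include <stdio.h>

	/* Illustrative only -- not io_uring's real types. */
	struct tctx {
		atomic_int node_refs;	/* the shared, atomic counter */
		int cached_refs;	/* refs this task holds privately */
	};

	/* Issuing a request consumes one cached ref (no atomic here). */
	static void req_get(struct tctx *t)
	{
		if (!t->cached_refs) {
			atomic_fetch_add(&t->node_refs, 64);
			t->cached_refs = 64;
		}
		t->cached_refs--;
	}

	/* A completion reaped by the same task returns the ref to the cache. */
	static void req_put_same_task(struct tctx *t)
	{
		t->cached_refs++;
	}

	/*
	 * Exit must drain the cache; any put that lands in the cache
	 * *after* this point strands references.  That is the window the
	 * patch closes by dropping cached refs again after the final reap.
	 */
	static void tctx_drain(struct tctx *t)
	{
		atomic_fetch_sub(&t->node_refs, t->cached_refs);
		t->cached_refs = 0;
	}

	int main(void)
	{
		static struct tctx t;

		req_get(&t);		/* issue */
		req_put_same_task(&t);	/* reap  */
		tctx_drain(&t);		/* exit  */
		printf("leaked refs: %d\n", atomic_load(&t.node_refs));
		return 0;
	}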
+ +Cc: stable@vger.kernel.org +Fixes: e98e49b2bbf7 ("io_uring: extend task put optimisations") +Reported-by: Homin Rhee +Signed-off-by: Jens Axboe +Signed-off-by: Sasha Levin +--- + io_uring/io_uring.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c +index e1e15d40d758..2caef6417260 100644 +--- a/io_uring/io_uring.c ++++ b/io_uring/io_uring.c +@@ -9684,6 +9684,10 @@ static void io_ring_ctx_wait_and_kill(struct io_ring_ctx *ctx) + /* if we failed setting up the ctx, we might not have any rings */ + io_iopoll_try_reap_events(ctx); + ++ /* drop cached put refs after potentially doing completions */ ++ if (current->io_uring) ++ io_uring_drop_tctx_refs(current); ++ + INIT_WORK(&ctx->exit_work, io_ring_exit_work); + /* + * Use system_unbound_wq to avoid spawning tons of event kworkers +-- +2.39.0 + diff --git a/queue-5.15/io_uring-fix-async-accept-on-o_nonblock-sockets.patch b/queue-5.15/io_uring-fix-async-accept-on-o_nonblock-sockets.patch new file mode 100644 index 00000000000..bd6707c3192 --- /dev/null +++ b/queue-5.15/io_uring-fix-async-accept-on-o_nonblock-sockets.patch @@ -0,0 +1,50 @@ +From ca760f9e0df16e3351879479cdb05b61ee1789fd Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 21 Jan 2023 09:13:12 -0700 +Subject: io_uring: fix async accept on O_NONBLOCK sockets + +From: Dylan Yudaken + +commit a73825ba70c93e1eb39a845bb3d9885a787f8ffe upstream. + +Do not set REQ_F_NOWAIT if the socket is non blocking. When enabled this +causes the accept to immediately post a CQE with EAGAIN, which means you +cannot perform an accept SQE on a NONBLOCK socket asynchronously. + +By removing the flag if there is no pending accept then poll is armed as +usual and when a connection comes in the CQE is posted. + +Signed-off-by: Dylan Yudaken +Link: https://lore.kernel.org/r/20220324143435.2875844-1-dylany@fb.com +Signed-off-by: Jens Axboe +Signed-off-by: Sasha Levin +--- + io_uring/io_uring.c | 5 ++--- + 1 file changed, 2 insertions(+), 3 deletions(-) + +diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c +index 997a7264e1d4..e1e15d40d758 100644 +--- a/io_uring/io_uring.c ++++ b/io_uring/io_uring.c +@@ -5272,9 +5272,6 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) + struct file *file; + int ret, fd; + +- if (req->file->f_flags & O_NONBLOCK) +- req->flags |= REQ_F_NOWAIT; +- + if (!fixed) { + fd = __get_unused_fd_flags(accept->flags, accept->nofile); + if (unlikely(fd < 0)) +@@ -5286,6 +5283,8 @@ static int io_accept(struct io_kiocb *req, unsigned int issue_flags) + if (!fixed) + put_unused_fd(fd); + ret = PTR_ERR(file); ++ /* safe to retry */ ++ req->flags |= REQ_F_PARTIAL_IO; + if (ret == -EAGAIN && force_nonblock) + return -EAGAIN; + if (ret == -ERESTARTSYS) +-- +2.39.0 + diff --git a/queue-5.15/io_uring-improve-send-recv-error-handling.patch b/queue-5.15/io_uring-improve-send-recv-error-handling.patch new file mode 100644 index 00000000000..ee2387dc075 --- /dev/null +++ b/queue-5.15/io_uring-improve-send-recv-error-handling.patch @@ -0,0 +1,126 @@ +From 7ec39780241bf364da91b8ac8da42370c0ae0a0d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 23 Nov 2021 00:07:47 +0000 +Subject: io_uring: improve send/recv error handling + +From: Pavel Begunkov + +commit 7297ce3d59449de49d3c9e1f64ae25488750a1fc upstream. + +Hide all error handling under common if block, removes two extra ifs on +the success path and keeps the handling more condensed. 
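The restructuring pattern in isolation: before, the -EAGAIN/-ERESTARTSYS special cases executed on every call; after, the success path runs no error checks at all. A compilable sketch with hypothetical stand-ins for the io_uring internals:

	#include <errno.h>
	#include <stdio.h>

	#define ERESTARTSYS 512	/* kernel-internal; not in userspace errno.h */

	/* Hypothetical stand-ins for the real handlers. */
	static long do_op(void)       { return -EAGAIN; }
	static long retry_async(void) { return 0; }
	static void mark_failed(void) { puts("failed"); }

	/* Shape after the patch: error handling lives under one branch. */
	static long issue(long min_ret, int nonblock)
	{
		long ret = do_op();

		if (ret < min_ret) {
			if (ret == -EAGAIN && nonblock)
				return retry_async();
			if (ret == -ERESTARTSYS)
				ret = -EINTR;
			mark_failed();
		}
		return ret;
	}

	int main(void)
	{
		return (int)issue(1, 1);
	}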
+ +Signed-off-by: Pavel Begunkov +Link: https://lore.kernel.org/r/5761545158a12968f3caf30f747eea65ed75dfc1.1637524285.git.asml.silence@gmail.com +Signed-off-by: Jens Axboe +Signed-off-by: Sasha Levin +--- + io_uring/io_uring.c | 55 +++++++++++++++++++++++++-------------------- + 1 file changed, 31 insertions(+), 24 deletions(-) + +diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c +index d855e668f37c..93023562d548 100644 +--- a/io_uring/io_uring.c ++++ b/io_uring/io_uring.c +@@ -4866,17 +4866,18 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags) + min_ret = iov_iter_count(&kmsg->msg.msg_iter); + + ret = __sys_sendmsg_sock(sock, &kmsg->msg, flags); +- if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN) +- return io_setup_async_msg(req, kmsg); +- if (ret == -ERESTARTSYS) +- ret = -EINTR; + ++ if (ret < min_ret) { ++ if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) ++ return io_setup_async_msg(req, kmsg); ++ if (ret == -ERESTARTSYS) ++ ret = -EINTR; ++ req_set_fail(req); ++ } + /* fast path, check for non-NULL to avoid function call */ + if (kmsg->free_iov) + kfree(kmsg->free_iov); + req->flags &= ~REQ_F_NEED_CLEANUP; +- if (ret < min_ret) +- req_set_fail(req); + __io_req_complete(req, issue_flags, ret, 0); + return 0; + } +@@ -4912,13 +4913,13 @@ static int io_send(struct io_kiocb *req, unsigned int issue_flags) + + msg.msg_flags = flags; + ret = sock_sendmsg(sock, &msg); +- if ((issue_flags & IO_URING_F_NONBLOCK) && ret == -EAGAIN) +- return -EAGAIN; +- if (ret == -ERESTARTSYS) +- ret = -EINTR; +- +- if (ret < min_ret) ++ if (ret < min_ret) { ++ if (ret == -EAGAIN && (issue_flags & IO_URING_F_NONBLOCK)) ++ return -EAGAIN; ++ if (ret == -ERESTARTSYS) ++ ret = -EINTR; + req_set_fail(req); ++ } + __io_req_complete(req, issue_flags, ret, 0); + return 0; + } +@@ -5105,10 +5106,15 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) + + ret = __sys_recvmsg_sock(sock, &kmsg->msg, req->sr_msg.umsg, + kmsg->uaddr, flags); +- if (force_nonblock && ret == -EAGAIN) +- return io_setup_async_msg(req, kmsg); +- if (ret == -ERESTARTSYS) +- ret = -EINTR; ++ if (ret < min_ret) { ++ if (ret == -EAGAIN && force_nonblock) ++ return io_setup_async_msg(req, kmsg); ++ if (ret == -ERESTARTSYS) ++ ret = -EINTR; ++ req_set_fail(req); ++ } else if ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { ++ req_set_fail(req); ++ } + + if (req->flags & REQ_F_BUFFER_SELECTED) + cflags = io_put_recv_kbuf(req); +@@ -5116,8 +5122,6 @@ static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) + if (kmsg->free_iov) + kfree(kmsg->free_iov); + req->flags &= ~REQ_F_NEED_CLEANUP; +- if (ret < min_ret || ((flags & MSG_WAITALL) && (kmsg->msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC)))) +- req_set_fail(req); + __io_req_complete(req, issue_flags, ret, cflags); + return 0; + } +@@ -5164,15 +5168,18 @@ static int io_recv(struct io_kiocb *req, unsigned int issue_flags) + min_ret = iov_iter_count(&msg.msg_iter); + + ret = sock_recvmsg(sock, &msg, flags); +- if (force_nonblock && ret == -EAGAIN) +- return -EAGAIN; +- if (ret == -ERESTARTSYS) +- ret = -EINTR; + out_free: ++ if (ret < min_ret) { ++ if (ret == -EAGAIN && force_nonblock) ++ return -EAGAIN; ++ if (ret == -ERESTARTSYS) ++ ret = -EINTR; ++ req_set_fail(req); ++ } else if ((flags & MSG_WAITALL) && (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC))) { ++ req_set_fail(req); ++ } + if (req->flags & REQ_F_BUFFER_SELECTED) + cflags = io_put_recv_kbuf(req); +- if (ret < min_ret || ((flags & MSG_WAITALL) 
&& (msg.msg_flags & (MSG_TRUNC | MSG_CTRUNC)))) +- req_set_fail(req); + __io_req_complete(req, issue_flags, ret, cflags); + return 0; + } +-- +2.39.0 + diff --git a/queue-5.15/io_uring-pass-in-epoll_uring_wake-for-eventfd-signal.patch b/queue-5.15/io_uring-pass-in-epoll_uring_wake-for-eventfd-signal.patch new file mode 100644 index 00000000000..e6147b843a9 --- /dev/null +++ b/queue-5.15/io_uring-pass-in-epoll_uring_wake-for-eventfd-signal.patch @@ -0,0 +1,87 @@ +From 9408d99dda879989d1b7d1084aeed040eac529fb Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 23 Dec 2022 07:04:49 -0700 +Subject: io_uring: pass in EPOLL_URING_WAKE for eventfd signaling and wakeups + +From: Jens Axboe + +[ Upstream commit 4464853277d0ccdb9914608dd1332f0fa2f9846f ] + +Pass in EPOLL_URING_WAKE when signaling eventfd or doing poll related +wakups, so that we can check for a circular event dependency between +eventfd and epoll. If this flag is set when our wakeup handlers are +called, then we know we have a dependency that needs to terminate +multishot requests. + +eventfd and epoll are the only such possible dependencies. + +Cc: stable@vger.kernel.org # 6.0 +Signed-off-by: Jens Axboe +Signed-off-by: Sasha Levin +--- + io_uring/io_uring.c | 27 ++++++++++++++++++++------- + 1 file changed, 20 insertions(+), 7 deletions(-) + +diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c +index 9a01188ff45a..d855e668f37c 100644 +--- a/io_uring/io_uring.c ++++ b/io_uring/io_uring.c +@@ -1629,13 +1629,15 @@ static void io_cqring_ev_posted(struct io_ring_ctx *ctx) + * wake as many waiters as we need to. + */ + if (wq_has_sleeper(&ctx->cq_wait)) +- wake_up_all(&ctx->cq_wait); ++ __wake_up(&ctx->cq_wait, TASK_NORMAL, 0, ++ poll_to_key(EPOLL_URING_WAKE | EPOLLIN)); + if (ctx->sq_data && waitqueue_active(&ctx->sq_data->wait)) + wake_up(&ctx->sq_data->wait); + if (io_should_trigger_evfd(ctx)) +- eventfd_signal(ctx->cq_ev_fd, 1); ++ eventfd_signal_mask(ctx->cq_ev_fd, 1, EPOLL_URING_WAKE); + if (waitqueue_active(&ctx->poll_wait)) +- wake_up_interruptible(&ctx->poll_wait); ++ __wake_up(&ctx->poll_wait, TASK_INTERRUPTIBLE, 0, ++ poll_to_key(EPOLL_URING_WAKE | EPOLLIN)); + } + + static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) +@@ -1645,12 +1647,14 @@ static void io_cqring_ev_posted_iopoll(struct io_ring_ctx *ctx) + + if (ctx->flags & IORING_SETUP_SQPOLL) { + if (waitqueue_active(&ctx->cq_wait)) +- wake_up_all(&ctx->cq_wait); ++ __wake_up(&ctx->cq_wait, TASK_NORMAL, 0, ++ poll_to_key(EPOLL_URING_WAKE | EPOLLIN)); + } + if (io_should_trigger_evfd(ctx)) +- eventfd_signal(ctx->cq_ev_fd, 1); ++ eventfd_signal_mask(ctx->cq_ev_fd, 1, EPOLL_URING_WAKE); + if (waitqueue_active(&ctx->poll_wait)) +- wake_up_interruptible(&ctx->poll_wait); ++ __wake_up(&ctx->poll_wait, TASK_INTERRUPTIBLE, 0, ++ poll_to_key(EPOLL_URING_WAKE | EPOLLIN)); + } + + /* Returns true if there are no backlogged entries after the flush */ +@@ -5636,8 +5640,17 @@ static int io_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, + if (mask && !(mask & poll->events)) + return 0; + +- if (io_poll_get_ownership(req)) ++ if (io_poll_get_ownership(req)) { ++ /* ++ * If we trigger a multishot poll off our own wakeup path, ++ * disable multishot as there is a circular dependency between ++ * CQ posting and triggering the event. 
++ */ ++ if (mask & EPOLL_URING_WAKE) ++ poll->events |= EPOLLONESHOT; ++ + __io_poll_execute(req, mask); ++ } + return 1; + } + +-- +2.39.0 + diff --git a/queue-5.15/io_uring-remove-duplicated-calls-to-io_kiocb_ppos.patch b/queue-5.15/io_uring-remove-duplicated-calls-to-io_kiocb_ppos.patch new file mode 100644 index 00000000000..e8a3c8279d0 --- /dev/null +++ b/queue-5.15/io_uring-remove-duplicated-calls-to-io_kiocb_ppos.patch @@ -0,0 +1,65 @@ +From 014aa661545c102fdef146dc638677d98689a1e4 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 22 Feb 2022 02:55:01 -0800 +Subject: io_uring: remove duplicated calls to io_kiocb_ppos + +From: Dylan Yudaken + +commit af9c45ecebaf1b428306f41421f4bcffe439f735 upstream. + +io_kiocb_ppos is called in both branches, and it seems that the compiler +does not fuse this. Fusing removes a few bytes from loop_rw_iter. + +Before: +$ nm -S fs/io_uring.o | grep loop_rw_iter +0000000000002430 0000000000000124 t loop_rw_iter + +After: +$ nm -S fs/io_uring.o | grep loop_rw_iter +0000000000002430 000000000000010d t loop_rw_iter + +Signed-off-by: Dylan Yudaken +Reviewed-by: Pavel Begunkov +Signed-off-by: Jens Axboe +Signed-off-by: Sasha Levin +--- + io_uring/io_uring.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c +index 2caef6417260..14297add8485 100644 +--- a/io_uring/io_uring.c ++++ b/io_uring/io_uring.c +@@ -3303,6 +3303,7 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) + struct kiocb *kiocb = &req->rw.kiocb; + struct file *file = req->file; + ssize_t ret = 0; ++ loff_t *ppos; + + /* + * Don't support polled IO through this interface, and we can't +@@ -3314,6 +3315,8 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) + if (kiocb->ki_flags & IOCB_NOWAIT) + return -EAGAIN; + ++ ppos = io_kiocb_ppos(kiocb); ++ + while (iov_iter_count(iter)) { + struct iovec iovec; + ssize_t nr; +@@ -3327,10 +3330,10 @@ static ssize_t loop_rw_iter(int rw, struct io_kiocb *req, struct iov_iter *iter) + + if (rw == READ) { + nr = file->f_op->read(file, iovec.iov_base, +- iovec.iov_len, io_kiocb_ppos(kiocb)); ++ iovec.iov_len, ppos); + } else { + nr = file->f_op->write(file, iovec.iov_base, +- iovec.iov_len, io_kiocb_ppos(kiocb)); ++ iovec.iov_len, ppos); + } + + if (nr < 0) { +-- +2.39.0 + diff --git a/queue-5.15/io_uring-rw-defer-fsnotify-calls-to-task-context.patch b/queue-5.15/io_uring-rw-defer-fsnotify-calls-to-task-context.patch new file mode 100644 index 00000000000..f9854a3a30c --- /dev/null +++ b/queue-5.15/io_uring-rw-defer-fsnotify-calls-to-task-context.patch @@ -0,0 +1,122 @@ +From d555c91cfc2f24961e823f0c717e2fcf92a8ed6a Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 21 Jan 2023 13:38:51 -0700 +Subject: io_uring/rw: defer fsnotify calls to task context + +From: Jens Axboe + +commit b000145e9907809406d8164c3b2b8861d95aecd1 upstream. + +We can't call these off the kiocb completion as that might be off +soft/hard irq context. Defer the calls to when we process the +task_work for this request. 
That avoids valid complaints like: + +stack backtrace: +CPU: 1 PID: 0 Comm: swapper/1 Not tainted 6.0.0-rc6-syzkaller-00321-g105a36f3694e #0 +Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 08/26/2022 +Call Trace: + + __dump_stack lib/dump_stack.c:88 [inline] + dump_stack_lvl+0xcd/0x134 lib/dump_stack.c:106 + print_usage_bug kernel/locking/lockdep.c:3961 [inline] + valid_state kernel/locking/lockdep.c:3973 [inline] + mark_lock_irq kernel/locking/lockdep.c:4176 [inline] + mark_lock.part.0.cold+0x18/0xd8 kernel/locking/lockdep.c:4632 + mark_lock kernel/locking/lockdep.c:4596 [inline] + mark_usage kernel/locking/lockdep.c:4527 [inline] + __lock_acquire+0x11d9/0x56d0 kernel/locking/lockdep.c:5007 + lock_acquire kernel/locking/lockdep.c:5666 [inline] + lock_acquire+0x1ab/0x570 kernel/locking/lockdep.c:5631 + __fs_reclaim_acquire mm/page_alloc.c:4674 [inline] + fs_reclaim_acquire+0x115/0x160 mm/page_alloc.c:4688 + might_alloc include/linux/sched/mm.h:271 [inline] + slab_pre_alloc_hook mm/slab.h:700 [inline] + slab_alloc mm/slab.c:3278 [inline] + __kmem_cache_alloc_lru mm/slab.c:3471 [inline] + kmem_cache_alloc+0x39/0x520 mm/slab.c:3491 + fanotify_alloc_fid_event fs/notify/fanotify/fanotify.c:580 [inline] + fanotify_alloc_event fs/notify/fanotify/fanotify.c:813 [inline] + fanotify_handle_event+0x1130/0x3f40 fs/notify/fanotify/fanotify.c:948 + send_to_group fs/notify/fsnotify.c:360 [inline] + fsnotify+0xafb/0x1680 fs/notify/fsnotify.c:570 + __fsnotify_parent+0x62f/0xa60 fs/notify/fsnotify.c:230 + fsnotify_parent include/linux/fsnotify.h:77 [inline] + fsnotify_file include/linux/fsnotify.h:99 [inline] + fsnotify_access include/linux/fsnotify.h:309 [inline] + __io_complete_rw_common+0x485/0x720 io_uring/rw.c:195 + io_complete_rw+0x1a/0x1f0 io_uring/rw.c:228 + iomap_dio_complete_work fs/iomap/direct-io.c:144 [inline] + iomap_dio_bio_end_io+0x438/0x5e0 fs/iomap/direct-io.c:178 + bio_endio+0x5f9/0x780 block/bio.c:1564 + req_bio_endio block/blk-mq.c:695 [inline] + blk_update_request+0x3fc/0x1300 block/blk-mq.c:825 + scsi_end_request+0x7a/0x9a0 drivers/scsi/scsi_lib.c:541 + scsi_io_completion+0x173/0x1f70 drivers/scsi/scsi_lib.c:971 + scsi_complete+0x122/0x3b0 drivers/scsi/scsi_lib.c:1438 + blk_complete_reqs+0xad/0xe0 block/blk-mq.c:1022 + __do_softirq+0x1d3/0x9c6 kernel/softirq.c:571 + invoke_softirq kernel/softirq.c:445 [inline] + __irq_exit_rcu+0x123/0x180 kernel/softirq.c:650 + irq_exit_rcu+0x5/0x20 kernel/softirq.c:662 + common_interrupt+0xa9/0xc0 arch/x86/kernel/irq.c:240 + +Fixes: f63cf5192fe3 ("io_uring: ensure that fsnotify is always called") +Link: https://lore.kernel.org/all/20220929135627.ykivmdks2w5vzrwg@quack3/ +Reported-by: syzbot+dfcc5f4da15868df7d4d@syzkaller.appspotmail.com +Reported-by: Jan Kara +Signed-off-by: Jens Axboe +Signed-off-by: Sasha Levin +--- + io_uring/io_uring.c | 22 +++++++++++++++------- + 1 file changed, 15 insertions(+), 7 deletions(-) + +diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c +index 73d261004c4a..78ed38d778f8 100644 +--- a/io_uring/io_uring.c ++++ b/io_uring/io_uring.c +@@ -2705,12 +2705,6 @@ static bool io_rw_should_reissue(struct io_kiocb *req) + + static bool __io_complete_rw_common(struct io_kiocb *req, long res) + { +- if (req->rw.kiocb.ki_flags & IOCB_WRITE) { +- kiocb_end_write(req); +- fsnotify_modify(req->file); +- } else { +- fsnotify_access(req->file); +- } + if (res != req->result) { + if ((res == -EAGAIN || res == -EOPNOTSUPP) && + io_rw_should_reissue(req)) { +@@ -2763,6 +2757,20 @@ static void 
__io_complete_rw(struct io_kiocb *req, long res, long res2,
+ __io_req_complete(req, issue_flags, io_fixup_rw_res(req, res), io_put_rw_kbuf(req));
+ }
+
++static void io_req_rw_complete(struct io_kiocb *req, bool *locked)
++{
++ struct io_rw *rw = &req->rw;
++
++ if (rw->kiocb.ki_flags & IOCB_WRITE) {
++ kiocb_end_write(req);
++ fsnotify_modify(req->file);
++ } else {
++ fsnotify_access(req->file);
++ }
++
++ io_req_task_complete(req, locked);
++}
++
+ static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
+ {
+ struct io_kiocb *req = container_of(kiocb, struct io_kiocb, rw.kiocb);
+@@ -2770,7 +2778,7 @@ static void io_complete_rw(struct kiocb *kiocb, long res, long res2)
+ if (__io_complete_rw_common(req, res))
+ return;
+ req->result = io_fixup_rw_res(req, res);
+- req->io_task_work.func = io_req_task_complete;
++ req->io_task_work.func = io_req_rw_complete;
+ io_req_task_work_add(req);
+ }
+
+--
+2.39.0
+
diff --git a/queue-5.15/io_uring-support-msg_waitall-for-ioring_op_send-msg.patch b/queue-5.15/io_uring-support-msg_waitall-for-ioring_op_send-msg.patch
new file mode 100644
index 00000000000..3f188b8a503
--- /dev/null
+++ b/queue-5.15/io_uring-support-msg_waitall-for-ioring_op_send-msg.patch
@@ -0,0 +1,111 @@
+From 02dc72701a3f7be6535e276753836451b95360bc Mon Sep 17 00:00:00 2001
+From: Sasha Levin
+Date: Wed, 20 Apr 2022 19:21:36 -0600
+Subject: io_uring: support MSG_WAITALL for IORING_OP_SEND(MSG)
+
+From: Jens Axboe
+
+commit 4c3c09439c08b03d9503df0ca4c7619c5842892e upstream.
+
+Like commit 7ba89d2af17a for recv/recvmsg, support MSG_WAITALL for the
+send side. If this flag is set and we do a short send, retry for a
+stream or seqpacket socket.
+
+Signed-off-by: Jens Axboe
+Signed-off-by: Sasha Levin
+---
+ io_uring/io_uring.c | 36 +++++++++++++++++++++++++++++-------
+ 1 file changed, 29 insertions(+), 7 deletions(-)
+
+diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c
+index 2350d43aa782..3fb76863fed4 100644
+--- a/io_uring/io_uring.c
++++ b/io_uring/io_uring.c
+@@ -4777,6 +4777,13 @@ static int io_sync_file_range(struct io_kiocb *req, unsigned int issue_flags)
+ }
+
+ #if defined(CONFIG_NET)
++static bool io_net_retry(struct socket *sock, int flags)
++{
++ if (!(flags & MSG_WAITALL))
++ return false;
++ return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET;
++}
++
+ static int io_setup_async_msg(struct io_kiocb *req,
+ struct io_async_msghdr *kmsg)
+ {
+@@ -4840,12 +4847,14 @@ static int io_sendmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe)
+ if (req->ctx->compat)
+ sr->msg_flags |= MSG_CMSG_COMPAT;
+ #endif
++ sr->done_io = 0;
+ return 0;
+ }
+
+ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
+ {
+ struct io_async_msghdr iomsg, *kmsg;
++ struct io_sr_msg *sr = &req->sr_msg;
+ struct socket *sock;
+ unsigned flags;
+ int min_ret = 0;
+@@ -4876,12 +4885,21 @@ static int io_sendmsg(struct io_kiocb *req, unsigned int issue_flags)
+ return io_setup_async_msg(req, kmsg);
+ if (ret == -ERESTARTSYS)
+ ret = -EINTR;
++ if (ret > 0 && io_net_retry(sock, flags)) {
++ sr->done_io += ret;
++ req->flags |= REQ_F_PARTIAL_IO;
++ return io_setup_async_msg(req, kmsg);
++ }
+ req_set_fail(req);
+ }
+ /* fast path, check for non-NULL to avoid function call */
+ if (kmsg->free_iov)
+ kfree(kmsg->free_iov);
+ req->flags &= ~REQ_F_NEED_CLEANUP;
++ if (ret >= 0)
++ ret += sr->done_io;
++ else if (sr->done_io)
++ ret = sr->done_io;
+ __io_req_complete(req, issue_flags, ret, 0);
+ return 0;
+ }
+@@ -4922,8 +4940,19 @@ static int
io_send(struct io_kiocb *req, unsigned int issue_flags) + return -EAGAIN; + if (ret == -ERESTARTSYS) + ret = -EINTR; ++ if (ret > 0 && io_net_retry(sock, flags)) { ++ sr->len -= ret; ++ sr->buf += ret; ++ sr->done_io += ret; ++ req->flags |= REQ_F_PARTIAL_IO; ++ return -EAGAIN; ++ } + req_set_fail(req); + } ++ if (ret >= 0) ++ ret += sr->done_io; ++ else if (sr->done_io) ++ ret = sr->done_io; + __io_req_complete(req, issue_flags, ret, 0); + return 0; + } +@@ -5071,13 +5100,6 @@ static int io_recvmsg_prep(struct io_kiocb *req, const struct io_uring_sqe *sqe) + return 0; + } + +-static bool io_net_retry(struct socket *sock, int flags) +-{ +- if (!(flags & MSG_WAITALL)) +- return false; +- return sock->type == SOCK_STREAM || sock->type == SOCK_SEQPACKET; +-} +- + static int io_recvmsg(struct io_kiocb *req, unsigned int issue_flags) + { + struct io_async_msghdr iomsg, *kmsg; +-- +2.39.0 + diff --git a/queue-5.15/io_uring-update-kiocb-ki_pos-at-execution-time.patch b/queue-5.15/io_uring-update-kiocb-ki_pos-at-execution-time.patch new file mode 100644 index 00000000000..3574360f235 --- /dev/null +++ b/queue-5.15/io_uring-update-kiocb-ki_pos-at-execution-time.patch @@ -0,0 +1,86 @@ +From 3fb298fa47c2bbf1201d03fd211c893c7c71643e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 22 Feb 2022 02:55:02 -0800 +Subject: io_uring: update kiocb->ki_pos at execution time + +From: Dylan Yudaken + +commit d34e1e5b396a0dbaa4a29b7138df662cfb9d8e8e upstream. + +Update kiocb->ki_pos at execution time rather than in io_prep_rw(). +io_prep_rw() happens before the job is enqueued to a worker and so the +offset might be read multiple times before being executed once. + +Ensures that the file position in a set of _linked_ SQEs will be only +obtained after earlier SQEs have completed, and so will include their +incremented file position. 
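+
+[ Editor's note: a minimal userspace sketch of the "resolve the position
+late" idea described above; the names and helpers below are illustrative
+only and are not kernel or io_uring API. ]
+
+    #include <unistd.h>
+
+    struct sketch_req {
+        int   fd;
+        off_t requested;    /* -1 means "use the file's position" */
+    };
+
+    /* prep time: only record what was asked for */
+    static void sketch_prep(struct sketch_req *r, int fd, off_t off)
+    {
+        r->fd = fd;
+        r->requested = off;
+    }
+
+    /* execution time: resolve -1 against the current position, so an
+     * earlier linked request that advanced the file is observed */
+    static off_t sketch_resolve(struct sketch_req *r)
+    {
+        if (r->requested == -1)
+            return lseek(r->fd, 0, SEEK_CUR);
+        return r->requested;
+    }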
+ +Signed-off-by: Dylan Yudaken +Reviewed-by: Pavel Begunkov +Signed-off-by: Jens Axboe +Signed-off-by: Sasha Levin +--- + io_uring/io_uring.c | 26 ++++++++++++++++++-------- + 1 file changed, 18 insertions(+), 8 deletions(-) + +diff --git a/io_uring/io_uring.c b/io_uring/io_uring.c +index 14297add8485..d9396cfaa4f3 100644 +--- a/io_uring/io_uring.c ++++ b/io_uring/io_uring.c +@@ -2922,14 +2922,6 @@ static int io_prep_rw(struct io_kiocb *req, const struct io_uring_sqe *sqe, + req->flags |= REQ_F_ISREG; + + kiocb->ki_pos = READ_ONCE(sqe->off); +- if (kiocb->ki_pos == -1) { +- if (!(file->f_mode & FMODE_STREAM)) { +- req->flags |= REQ_F_CUR_POS; +- kiocb->ki_pos = file->f_pos; +- } else { +- kiocb->ki_pos = 0; +- } +- } + kiocb->ki_hint = ki_hint_validate(file_write_hint(kiocb->ki_filp)); + kiocb->ki_flags = iocb_flags(kiocb->ki_filp); + ret = kiocb_set_rw_flags(kiocb, READ_ONCE(sqe->rw_flags)); +@@ -3011,6 +3003,20 @@ static inline void io_rw_done(struct kiocb *kiocb, ssize_t ret) + } + } + ++static inline void io_kiocb_update_pos(struct io_kiocb *req) ++{ ++ struct kiocb *kiocb = &req->rw.kiocb; ++ ++ if (kiocb->ki_pos == -1) { ++ if (!(req->file->f_mode & FMODE_STREAM)) { ++ req->flags |= REQ_F_CUR_POS; ++ kiocb->ki_pos = req->file->f_pos; ++ } else { ++ kiocb->ki_pos = 0; ++ } ++ } ++} ++ + static void kiocb_done(struct kiocb *kiocb, ssize_t ret, + unsigned int issue_flags) + { +@@ -3566,6 +3572,8 @@ static int io_read(struct io_kiocb *req, unsigned int issue_flags) + return ret ?: -EAGAIN; + } + ++ io_kiocb_update_pos(req); ++ + ret = rw_verify_area(READ, req->file, io_kiocb_ppos(kiocb), req->result); + if (unlikely(ret)) { + kfree(iovec); +@@ -3700,6 +3708,8 @@ static int io_write(struct io_kiocb *req, unsigned int issue_flags) + (req->flags & REQ_F_ISREG)) + goto copy_iov; + ++ io_kiocb_update_pos(req); ++ + ret = rw_verify_area(WRITE, req->file, io_kiocb_ppos(kiocb), req->result); + if (unlikely(ret)) + goto out_free; +-- +2.39.0 + diff --git a/queue-5.15/series b/queue-5.15/series index 9b18c26d059..af7f23ebe12 100644 --- a/queue-5.15/series +++ b/queue-5.15/series @@ -18,3 +18,24 @@ wifi-mac80211-sdata-can-be-null-during-ampdu-start.patch add-exception-protection-processing-for-vd-in-axi_chan_handle_err-function.patch zonefs-detect-append-writes-at-invalid-locations.patch nilfs2-fix-general-protection-fault-in-nilfs_btree_insert.patch +efi-fix-userspace-infinite-retry-read-efivars-after-.patch +alsa-hda-realtek-fix-mute-micmute-leds-for-a-hp-prob.patch +alsa-hda-realtek-fix-mute-micmute-leds-don-t-work-fo.patch +drm-amdgpu-disable-runtime-pm-on-several-sienna-cich.patch +drm-amd-delay-removal-of-the-firmware-framebuffer.patch +hugetlb-unshare-some-pmds-when-splitting-vmas.patch +io_uring-don-t-gate-task_work-run-on-tif_notify_sign.patch +eventpoll-add-epoll_uring_wake-poll-wakeup-flag.patch +eventfd-provide-a-eventfd_signal_mask-helper.patch +io_uring-pass-in-epoll_uring_wake-for-eventfd-signal.patch +io_uring-improve-send-recv-error-handling.patch +io_uring-ensure-recv-and-recvmsg-handle-msg_waitall-.patch +io_uring-add-flag-for-disabling-provided-buffer-recy.patch +io_uring-support-msg_waitall-for-ioring_op_send-msg.patch +io_uring-allow-re-poll-if-we-made-progress.patch +io_uring-fix-async-accept-on-o_nonblock-sockets.patch +io_uring-ensure-that-cached-task-references-are-alwa.patch +io_uring-remove-duplicated-calls-to-io_kiocb_ppos.patch +io_uring-update-kiocb-ki_pos-at-execution-time.patch +io_uring-do-not-recalculate-ppos-unnecessarily.patch 
+io_uring-rw-defer-fsnotify-calls-to-task-context.patch -- 2.47.3
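
[ Editor's note: to close out the io_uring fixes above, a small userspace
sketch of the MSG_WAITALL retry rule that the IORING_OP_SEND(MSG) patch
implements. All names are illustrative; this is not the kernel code. ]

#include <errno.h>
#include <sys/socket.h>

/* mirrors the patch's io_net_retry() rule: only retry a short send when
 * MSG_WAITALL was requested and the socket type can resume a transfer */
static int net_retry_ok(int sock_type, int flags)
{
    if (!(flags & MSG_WAITALL))
        return 0;
    return sock_type == SOCK_STREAM || sock_type == SOCK_SEQPACKET;
}

/* accumulates partial progress across retries, much as the patch does
 * with sr->done_io: report bytes already sent if an error lands mid-way */
static ssize_t send_waitall(int fd, int sock_type, const char *buf,
                            size_t len, int flags)
{
    size_t done = 0;

    while (done < len) {
        ssize_t n = send(fd, buf + done, len - done, flags);

        if (n <= 0) {
            if (n < 0 && errno == EINTR)
                continue;
            return done ? (ssize_t)done : n;
        }
        done += n;
        if (!net_retry_ok(sock_type, flags))
            break;  /* a short send is final without MSG_WAITALL */
    }
    return done;
}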