git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
5.15-stable patches
author    Greg Kroah-Hartman <gregkh@linuxfoundation.org>
          Thu, 27 Jan 2022 15:48:32 +0000 (16:48 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
          Thu, 27 Jan 2022 15:48:32 +0000 (16:48 +0100)
added patches:
bnx2x-invalidate-fastpath-hsi-version-for-vfs.patch
bnx2x-utilize-firmware-7.13.21.0.patch
drm-amdgpu-use-correct-viewport_dimension-for-dcn2.patch
io_uring-fix-not-released-cached-task-refs.patch
memcg-better-bounds-on-the-memcg-stats-updates.patch
memcg-flush-stats-only-if-updated.patch
memcg-unify-memcg-stat-flushing.patch
rcu-tighten-rcu_advance_cbs_nowake-checks.patch
select-fix-indefinitely-sleeping-task-in-poll_schedule_timeout.patch

queue-5.15/bnx2x-invalidate-fastpath-hsi-version-for-vfs.patch [new file with mode: 0644]
queue-5.15/bnx2x-utilize-firmware-7.13.21.0.patch [new file with mode: 0644]
queue-5.15/drm-amdgpu-use-correct-viewport_dimension-for-dcn2.patch [new file with mode: 0644]
queue-5.15/io_uring-fix-not-released-cached-task-refs.patch [new file with mode: 0644]
queue-5.15/memcg-better-bounds-on-the-memcg-stats-updates.patch [new file with mode: 0644]
queue-5.15/memcg-flush-stats-only-if-updated.patch [new file with mode: 0644]
queue-5.15/memcg-unify-memcg-stat-flushing.patch [new file with mode: 0644]
queue-5.15/rcu-tighten-rcu_advance_cbs_nowake-checks.patch [new file with mode: 0644]
queue-5.15/select-fix-indefinitely-sleeping-task-in-poll_schedule_timeout.patch [new file with mode: 0644]
queue-5.15/series

diff --git a/queue-5.15/bnx2x-invalidate-fastpath-hsi-version-for-vfs.patch b/queue-5.15/bnx2x-invalidate-fastpath-hsi-version-for-vfs.patch
new file mode 100644 (file)
index 0000000..6ef65c1
--- /dev/null
@@ -0,0 +1,56 @@
+From foo@baz Thu Jan 27 04:22:51 PM CET 2022
+From: Manish Chopra <manishc@marvell.com>
+Date: Tue, 25 Jan 2022 10:57:49 -0800
+Subject: bnx2x: Invalidate fastpath HSI version for VFs
+To: <stable@vger.kernel.org>
+Cc: <aelior@marvell.com>, <gregkh@linuxfoundation.org>, <manishc@marvell.com>
+Message-ID: <20220125185749.26774-2-manishc@marvell.com>
+
+From: Manish Chopra <manishc@marvell.com>
+
+commit 802d4d207e75d7208ff75adb712b556c1e91cf1c upstream
+
+Commit 0a6890b9b4df ("bnx2x: Utilize FW 7.13.15.0.") added
+validation of the fastpath HSI version for the different client
+inits. That validation was never meant for SR-IOV VF clients, and
+it resulted in firmware asserts when running VF clients with a
+different fastpath HSI version.
+
+This patch, along with the new firmware support in patch #1,
+fixes this behavior by not validating the fastpath HSI version
+for the VFs.
+
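+In essence (a condensed sketch of the hunk below, not the verbatim
+change), VF enablement now clears the fastpath HSI validity word for
+the VF's function ID whenever the firmware advertises the capability:
+
+	u16 abs_fid = FW_VF_HANDLE(abs_vfid);
+
+	/* tell the new firmware to skip FP HSI validation for this VF */
+	if (bp->fw_cap & FW_CAP_INVALIDATE_VF_FP_HSI)
+		REG_WR8(bp, BAR_XSTRORM_INTMEM +
+			    XSTORM_ETH_FUNCTION_INFO_FP_HSI_VALID_E2_OFFSET(abs_fid),
+			0);
+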
+Fixes: 0a6890b9b4df ("bnx2x: Utilize FW 7.13.15.0.")
+Signed-off-by: Manish Chopra <manishc@marvell.com>
+Signed-off-by: Prabhakar Kushwaha <pkushwaha@marvell.com>
+Signed-off-by: Alok Prasad <palok@marvell.com>
+Signed-off-by: Ariel Elior <aelior@marvell.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c |   13 +++++++++++--
+ 1 file changed, 11 insertions(+), 2 deletions(-)
+
+--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c
++++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_sriov.c
+@@ -758,9 +758,18 @@ static void bnx2x_vf_igu_reset(struct bn
+ void bnx2x_vf_enable_access(struct bnx2x *bp, u8 abs_vfid)
+ {
++      u16 abs_fid;
++
++      abs_fid = FW_VF_HANDLE(abs_vfid);
++
+       /* set the VF-PF association in the FW */
+-      storm_memset_vf_to_pf(bp, FW_VF_HANDLE(abs_vfid), BP_FUNC(bp));
+-      storm_memset_func_en(bp, FW_VF_HANDLE(abs_vfid), 1);
++      storm_memset_vf_to_pf(bp, abs_fid, BP_FUNC(bp));
++      storm_memset_func_en(bp, abs_fid, 1);
++
++      /* Invalidate fp_hsi version for vfs */
++      if (bp->fw_cap & FW_CAP_INVALIDATE_VF_FP_HSI)
++              REG_WR8(bp, BAR_XSTRORM_INTMEM +
++                          XSTORM_ETH_FUNCTION_INFO_FP_HSI_VALID_E2_OFFSET(abs_fid), 0);
+       /* clear vf errors*/
+       bnx2x_vf_semi_clear_err(bp, abs_vfid);
diff --git a/queue-5.15/bnx2x-utilize-firmware-7.13.21.0.patch b/queue-5.15/bnx2x-utilize-firmware-7.13.21.0.patch
new file mode 100644 (file)
index 0000000..f0abca0
--- /dev/null
@@ -0,0 +1,257 @@
+From foo@baz Thu Jan 27 04:22:51 PM CET 2022
+From: Manish Chopra <manishc@marvell.com>
+Date: Tue, 25 Jan 2022 10:57:48 -0800
+Subject: bnx2x: Utilize firmware 7.13.21.0
+To: <stable@vger.kernel.org>
+Cc: <aelior@marvell.com>, <gregkh@linuxfoundation.org>, <manishc@marvell.com>
+Message-ID: <20220125185749.26774-1-manishc@marvell.com>
+
+From: Manish Chopra <manishc@marvell.com>
+
+commit b7a49f73059fe6147b6b78e8f674ce0d21237432 upstream
+
+This new firmware addresses a few important issues and adds the
+enhancements listed below:
+
+- Support direct invalidation of FP HSI Ver per function ID, required for
+  invalidating FP HSI Ver prior to each VF start, as there is no VF start
+- BRB hardware block parity error detection support for the driver
+- Fix the FCoE underrun flow
+- Fix PSOD during FCoE BFS over the NIC ports after preboot driver
+- Maintains backward compatibility
+
+This patch incorporates the new firmware 7.13.21.0 in the bnx2x driver.
+
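+The driver-side core of the change is a load-with-fallback sequence; a
+simplified sketch of the bnx2x_init_firmware() hunk below (error paths
+trimmed):
+
+	/* prefer the 7.13.21.0 blob; fall back to 7.13.15.0 if absent */
+	rc = request_firmware(&bp->firmware, fw_file_name, &bp->pdev->dev);
+	if (rc) {
+		rc = request_firmware(&bp->firmware, fw_file_name_v15,
+				      &bp->pdev->dev);
+		if (rc)
+			return rc;
+		bp->fw_rev = BCM_5710_FW_REVISION_VERSION_V15;
+	} else {
+		/* only the new firmware can invalidate VF FP HSI versions */
+		bp->fw_cap |= FW_CAP_INVALIDATE_VF_FP_HSI;
+		bp->fw_rev = BCM_5710_FW_REVISION_VERSION;
+	}
+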
+Signed-off-by: Manish Chopra <manishc@marvell.com>
+Signed-off-by: Prabhakar Kushwaha <pkushwaha@marvell.com>
+Signed-off-by: Alok Prasad <palok@marvell.com>
+Signed-off-by: Ariel Elior <aelior@marvell.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/broadcom/bnx2x/bnx2x.h         |   11 ++
+ drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c     |    6 -
+ drivers/net/ethernet/broadcom/bnx2x/bnx2x_fw_defs.h |    2 
+ drivers/net/ethernet/broadcom/bnx2x/bnx2x_hsi.h     |    3 
+ drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c    |   75 ++++++++++++++------
+ 5 files changed, 69 insertions(+), 28 deletions(-)
+
+--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
++++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x.h
+@@ -1850,6 +1850,14 @@ struct bnx2x {
+       /* Vxlan/Geneve related information */
+       u16 udp_tunnel_ports[BNX2X_UDP_PORT_MAX];
++
++#define FW_CAP_INVALIDATE_VF_FP_HSI   BIT(0)
++      u32 fw_cap;
++
++      u32 fw_major;
++      u32 fw_minor;
++      u32 fw_rev;
++      u32 fw_eng;
+ };
+ /* Tx queues may be less or equal to Rx queues */
+@@ -2525,5 +2533,6 @@ void bnx2x_register_phc(struct bnx2x *bp
+  * Meant for implicit re-load flows.
+  */
+ int bnx2x_vlan_reconfigure_vid(struct bnx2x *bp);
+-
++int bnx2x_init_firmware(struct bnx2x *bp);
++void bnx2x_release_firmware(struct bnx2x *bp);
+ #endif /* bnx2x.h */
+--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
++++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_cmn.c
+@@ -2364,10 +2364,8 @@ int bnx2x_compare_fw_ver(struct bnx2x *b
+       if (load_code != FW_MSG_CODE_DRV_LOAD_COMMON_CHIP &&
+           load_code != FW_MSG_CODE_DRV_LOAD_COMMON) {
+               /* build my FW version dword */
+-              u32 my_fw = (BCM_5710_FW_MAJOR_VERSION) +
+-                      (BCM_5710_FW_MINOR_VERSION << 8) +
+-                      (BCM_5710_FW_REVISION_VERSION << 16) +
+-                      (BCM_5710_FW_ENGINEERING_VERSION << 24);
++              u32 my_fw = (bp->fw_major) + (bp->fw_minor << 8) +
++                              (bp->fw_rev << 16) + (bp->fw_eng << 24);
+               /* read loaded FW from chip */
+               u32 loaded_fw = REG_RD(bp, XSEM_REG_PRAM);
+--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_fw_defs.h
++++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_fw_defs.h
+@@ -241,6 +241,8 @@
+       IRO[221].m2))
+ #define XSTORM_VF_TO_PF_OFFSET(funcId) \
+       (IRO[48].base + ((funcId) * IRO[48].m1))
++#define XSTORM_ETH_FUNCTION_INFO_FP_HSI_VALID_E2_OFFSET(fid)  \
++      (IRO[386].base + ((fid) * IRO[386].m1))
+ #define COMMON_ASM_INVALID_ASSERT_OPCODE 0x0
+ /* eth hsi version */
+--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_hsi.h
++++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_hsi.h
+@@ -3024,7 +3024,8 @@ struct afex_stats {
+ #define BCM_5710_FW_MAJOR_VERSION                     7
+ #define BCM_5710_FW_MINOR_VERSION                     13
+-#define BCM_5710_FW_REVISION_VERSION          15
++#define BCM_5710_FW_REVISION_VERSION          21
++#define BCM_5710_FW_REVISION_VERSION_V15      15
+ #define BCM_5710_FW_ENGINEERING_VERSION               0
+ #define BCM_5710_FW_COMPILE_FLAGS                     1
+--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
++++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_main.c
+@@ -74,9 +74,19 @@
+       __stringify(BCM_5710_FW_MINOR_VERSION) "."      \
+       __stringify(BCM_5710_FW_REVISION_VERSION) "."   \
+       __stringify(BCM_5710_FW_ENGINEERING_VERSION)
++
++#define FW_FILE_VERSION_V15                           \
++      __stringify(BCM_5710_FW_MAJOR_VERSION) "."      \
++      __stringify(BCM_5710_FW_MINOR_VERSION) "."      \
++      __stringify(BCM_5710_FW_REVISION_VERSION_V15) "."       \
++      __stringify(BCM_5710_FW_ENGINEERING_VERSION)
++
+ #define FW_FILE_NAME_E1               "bnx2x/bnx2x-e1-" FW_FILE_VERSION ".fw"
+ #define FW_FILE_NAME_E1H      "bnx2x/bnx2x-e1h-" FW_FILE_VERSION ".fw"
+ #define FW_FILE_NAME_E2               "bnx2x/bnx2x-e2-" FW_FILE_VERSION ".fw"
++#define FW_FILE_NAME_E1_V15   "bnx2x/bnx2x-e1-" FW_FILE_VERSION_V15 ".fw"
++#define FW_FILE_NAME_E1H_V15  "bnx2x/bnx2x-e1h-" FW_FILE_VERSION_V15 ".fw"
++#define FW_FILE_NAME_E2_V15   "bnx2x/bnx2x-e2-" FW_FILE_VERSION_V15 ".fw"
+ /* Time in jiffies before concluding the transmitter is hung */
+ #define TX_TIMEOUT            (5*HZ)
+@@ -747,9 +757,7 @@ static int bnx2x_mc_assert(struct bnx2x
+                 CHIP_IS_E1(bp) ? "everest1" :
+                 CHIP_IS_E1H(bp) ? "everest1h" :
+                 CHIP_IS_E2(bp) ? "everest2" : "everest3",
+-                BCM_5710_FW_MAJOR_VERSION,
+-                BCM_5710_FW_MINOR_VERSION,
+-                BCM_5710_FW_REVISION_VERSION);
++                bp->fw_major, bp->fw_minor, bp->fw_rev);
+       return rc;
+ }
+@@ -12302,6 +12310,15 @@ static int bnx2x_init_bp(struct bnx2x *b
+       bnx2x_read_fwinfo(bp);
++      if (IS_PF(bp)) {
++              rc = bnx2x_init_firmware(bp);
++
++              if (rc) {
++                      bnx2x_free_mem_bp(bp);
++                      return rc;
++              }
++      }
++
+       func = BP_FUNC(bp);
+       /* need to reset chip if undi was active */
+@@ -12314,6 +12331,7 @@ static int bnx2x_init_bp(struct bnx2x *b
+               rc = bnx2x_prev_unload(bp);
+               if (rc) {
++                      bnx2x_release_firmware(bp);
+                       bnx2x_free_mem_bp(bp);
+                       return rc;
+               }
+@@ -13311,16 +13329,11 @@ static int bnx2x_check_firmware(struct b
+       /* Check FW version */
+       offset = be32_to_cpu(fw_hdr->fw_version.offset);
+       fw_ver = firmware->data + offset;
+-      if ((fw_ver[0] != BCM_5710_FW_MAJOR_VERSION) ||
+-          (fw_ver[1] != BCM_5710_FW_MINOR_VERSION) ||
+-          (fw_ver[2] != BCM_5710_FW_REVISION_VERSION) ||
+-          (fw_ver[3] != BCM_5710_FW_ENGINEERING_VERSION)) {
++      if (fw_ver[0] != bp->fw_major || fw_ver[1] != bp->fw_minor ||
++          fw_ver[2] != bp->fw_rev || fw_ver[3] != bp->fw_eng) {
+               BNX2X_ERR("Bad FW version:%d.%d.%d.%d. Should be %d.%d.%d.%d\n",
+-                     fw_ver[0], fw_ver[1], fw_ver[2], fw_ver[3],
+-                     BCM_5710_FW_MAJOR_VERSION,
+-                     BCM_5710_FW_MINOR_VERSION,
+-                     BCM_5710_FW_REVISION_VERSION,
+-                     BCM_5710_FW_ENGINEERING_VERSION);
++                        fw_ver[0], fw_ver[1], fw_ver[2], fw_ver[3],
++                        bp->fw_major, bp->fw_minor, bp->fw_rev, bp->fw_eng);
+               return -EINVAL;
+       }
+@@ -13398,34 +13411,51 @@ do {                                                                 \
+            (u8 *)bp->arr, len);                                       \
+ } while (0)
+-static int bnx2x_init_firmware(struct bnx2x *bp)
++int bnx2x_init_firmware(struct bnx2x *bp)
+ {
+-      const char *fw_file_name;
++      const char *fw_file_name, *fw_file_name_v15;
+       struct bnx2x_fw_file_hdr *fw_hdr;
+       int rc;
+       if (bp->firmware)
+               return 0;
+-      if (CHIP_IS_E1(bp))
++      if (CHIP_IS_E1(bp)) {
+               fw_file_name = FW_FILE_NAME_E1;
+-      else if (CHIP_IS_E1H(bp))
++              fw_file_name_v15 = FW_FILE_NAME_E1_V15;
++      } else if (CHIP_IS_E1H(bp)) {
+               fw_file_name = FW_FILE_NAME_E1H;
+-      else if (!CHIP_IS_E1x(bp))
++              fw_file_name_v15 = FW_FILE_NAME_E1H_V15;
++      } else if (!CHIP_IS_E1x(bp)) {
+               fw_file_name = FW_FILE_NAME_E2;
+-      else {
++              fw_file_name_v15 = FW_FILE_NAME_E2_V15;
++      } else {
+               BNX2X_ERR("Unsupported chip revision\n");
+               return -EINVAL;
+       }
++
+       BNX2X_DEV_INFO("Loading %s\n", fw_file_name);
+       rc = request_firmware(&bp->firmware, fw_file_name, &bp->pdev->dev);
+       if (rc) {
+-              BNX2X_ERR("Can't load firmware file %s\n",
+-                        fw_file_name);
+-              goto request_firmware_exit;
++              BNX2X_DEV_INFO("Trying to load older fw %s\n", fw_file_name_v15);
++
++              /* try to load prev version */
++              rc = request_firmware(&bp->firmware, fw_file_name_v15, &bp->pdev->dev);
++
++              if (rc)
++                      goto request_firmware_exit;
++
++              bp->fw_rev = BCM_5710_FW_REVISION_VERSION_V15;
++      } else {
++              bp->fw_cap |= FW_CAP_INVALIDATE_VF_FP_HSI;
++              bp->fw_rev = BCM_5710_FW_REVISION_VERSION;
+       }
++      bp->fw_major = BCM_5710_FW_MAJOR_VERSION;
++      bp->fw_minor = BCM_5710_FW_MINOR_VERSION;
++      bp->fw_eng = BCM_5710_FW_ENGINEERING_VERSION;
++
+       rc = bnx2x_check_firmware(bp);
+       if (rc) {
+               BNX2X_ERR("Corrupt firmware file %s\n", fw_file_name);
+@@ -13481,7 +13511,7 @@ request_firmware_exit:
+       return rc;
+ }
+-static void bnx2x_release_firmware(struct bnx2x *bp)
++void bnx2x_release_firmware(struct bnx2x *bp)
+ {
+       kfree(bp->init_ops_offsets);
+       kfree(bp->init_ops);
+@@ -13998,6 +14028,7 @@ static int bnx2x_init_one(struct pci_dev
+       return 0;
+ init_one_freemem:
++      bnx2x_release_firmware(bp);
+       bnx2x_free_mem_bp(bp);
+ init_one_exit:
diff --git a/queue-5.15/drm-amdgpu-use-correct-viewport_dimension-for-dcn2.patch b/queue-5.15/drm-amdgpu-use-correct-viewport_dimension-for-dcn2.patch
new file mode 100644 (file)
index 0000000..5ae6a63
--- /dev/null
@@ -0,0 +1,81 @@
+From dc5d4aff2e99c312df8abbe1ee9a731d2913bc1b Mon Sep 17 00:00:00 2001
+From: Harry Wentland <harry.wentland@amd.com>
+Date: Tue, 4 Jan 2022 10:45:41 -0500
+Subject: drm/amdgpu: Use correct VIEWPORT_DIMENSION for DCN2
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Harry Wentland <harry.wentland@amd.com>
+
+commit dc5d4aff2e99c312df8abbe1ee9a731d2913bc1b upstream.
+
+For some reason this file isn't using the appropriate DCN
+register headers, which means that on DCN2 we're getting the
+VIEWPORT_DIMENSION offset wrong.
+
+This means that we're not carving out the framebuffer memory
+correctly for a framebuffer allocated by EFI, and therefore see
+corruption when loading amdgpu before the display driver takes
+over control of the framebuffer scanout.
+
+Fix this by checking the DCE_HWIP and picking the correct offset
+accordingly.
+
+Long-term we should expose this info from DC as GMC shouldn't
+need to know about DCN registers.
+
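+The fix, in outline (simplified from the gmc_v9_0 hunk below), is to
+read the viewport through a DCN2-specific register offset for Renoir
+instead of the DCN1 one that Raven uses:
+
+	switch (adev->asic_type) {
+	case CHIP_RAVEN:	/* DCN1 register map */
+		viewport = RREG32_SOC15(DCE, 0,
+				mmHUBP0_DCSURF_PRI_VIEWPORT_DIMENSION);
+		break;
+	case CHIP_RENOIR:	/* DCN2 register map: different offset */
+		viewport = RREG32_SOC15(DCE, 0,
+				mmHUBP0_DCSURF_PRI_VIEWPORT_DIMENSION_DCN2);
+		break;
+	}
+	/* reserved size = viewport height * width * 4 bytes per pixel */
+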
+Cc: stable@vger.kernel.org
+Signed-off-by: Harry Wentland <harry.wentland@amd.com>
+Reviewed-by: Huang Rui <ray.huang@amd.com>
+Acked-by: Christian König <christian.koenig@amd.com>
+Reviewed-by: Alex Deucher <alexander.deucher@amd.com>
+Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
+Signed-off-by: Mario Limonciello <mario.limonciello@amd.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c |   14 +++++++++++++-
+ 1 file changed, 13 insertions(+), 1 deletion(-)
+
+--- a/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
++++ b/drivers/gpu/drm/amd/amdgpu/gmc_v9_0.c
+@@ -72,6 +72,9 @@
+ #define mmDCHUBBUB_SDPIF_MMIO_CNTRL_0                                                                  0x049d
+ #define mmDCHUBBUB_SDPIF_MMIO_CNTRL_0_BASE_IDX                                                         2
++#define mmHUBP0_DCSURF_PRI_VIEWPORT_DIMENSION_DCN2                                                          0x05ea
++#define mmHUBP0_DCSURF_PRI_VIEWPORT_DIMENSION_DCN2_BASE_IDX                                                 2
++
+ static const char *gfxhub_client_ids[] = {
+       "CB",
+@@ -1103,6 +1106,8 @@ static unsigned gmc_v9_0_get_vbios_fb_si
+       u32 d1vga_control = RREG32_SOC15(DCE, 0, mmD1VGA_CONTROL);
+       unsigned size;
++      /* TODO move to DC so GMC doesn't need to hard-code DCN registers */
++
+       if (REG_GET_FIELD(d1vga_control, D1VGA_CONTROL, D1VGA_MODE_ENABLE)) {
+               size = AMDGPU_VBIOS_VGA_ALLOCATION;
+       } else {
+@@ -1110,11 +1115,18 @@ static unsigned gmc_v9_0_get_vbios_fb_si
+               switch (adev->asic_type) {
+               case CHIP_RAVEN:
+-              case CHIP_RENOIR:
+                       viewport = RREG32_SOC15(DCE, 0, mmHUBP0_DCSURF_PRI_VIEWPORT_DIMENSION);
+                       size = (REG_GET_FIELD(viewport,
+                                             HUBP0_DCSURF_PRI_VIEWPORT_DIMENSION, PRI_VIEWPORT_HEIGHT) *
+                               REG_GET_FIELD(viewport,
++                                            HUBP0_DCSURF_PRI_VIEWPORT_DIMENSION, PRI_VIEWPORT_WIDTH) *
++                              4);
++                      break;
++              case CHIP_RENOIR:
++                      viewport = RREG32_SOC15(DCE, 0, mmHUBP0_DCSURF_PRI_VIEWPORT_DIMENSION_DCN2);
++                      size = (REG_GET_FIELD(viewport,
++                                            HUBP0_DCSURF_PRI_VIEWPORT_DIMENSION, PRI_VIEWPORT_HEIGHT) *
++                              REG_GET_FIELD(viewport,
+                                             HUBP0_DCSURF_PRI_VIEWPORT_DIMENSION, PRI_VIEWPORT_WIDTH) *
+                               4);
+                       break;
diff --git a/queue-5.15/io_uring-fix-not-released-cached-task-refs.patch b/queue-5.15/io_uring-fix-not-released-cached-task-refs.patch
new file mode 100644 (file)
index 0000000..0e9f43a
--- /dev/null
@@ -0,0 +1,93 @@
+From 3cc7fdb9f90a25ae92250bf9e6cf3b9556b230e9 Mon Sep 17 00:00:00 2001
+From: Pavel Begunkov <asml.silence@gmail.com>
+Date: Sun, 9 Jan 2022 00:53:22 +0000
+Subject: io_uring: fix not released cached task refs
+
+From: Pavel Begunkov <asml.silence@gmail.com>
+
+commit 3cc7fdb9f90a25ae92250bf9e6cf3b9556b230e9 upstream.
+
+tctx_task_work() may get run after io_uring cancellation, and so
+there will be no one to put the task refs cached in the tctx that may
+have been added back by tw handlers using the inline completion
+infrastructure. Call io_uring_drop_tctx_refs() at the end of the main
+tw handler to release them.
+
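+Concretely (a condensed view of the change below), the main tw handler
+now ends with:
+
+	ctx_flush_and_put(ctx, &locked);
+
+	/* relaxed read is enough as only the task itself sets ->in_idle */
+	if (unlikely(atomic_read(&tctx->in_idle)))
+		io_uring_drop_tctx_refs(current);
+
+where io_uring_drop_tctx_refs() returns the cached refs to the inflight
+counter and drops the corresponding task_struct references.
+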
+Cc: stable@vger.kernel.org # 5.15+
+Reported-by: Lukas Bulwahn <lukas.bulwahn@gmail.com>
+Fixes: e98e49b2bbf7 ("io_uring: extend task put optimisations")
+Signed-off-by: Pavel Begunkov <asml.silence@gmail.com>
+Link: https://lore.kernel.org/r/69f226b35fbdb996ab799a8bbc1c06bf634ccec1.1641688805.git.asml.silence@gmail.com
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/io_uring.c |   34 +++++++++++++++++++++-------------
+ 1 file changed, 21 insertions(+), 13 deletions(-)
+
+--- a/fs/io_uring.c
++++ b/fs/io_uring.c
+@@ -1760,6 +1760,18 @@ static inline void io_get_task_refs(int
+               io_task_refs_refill(tctx);
+ }
++static __cold void io_uring_drop_tctx_refs(struct task_struct *task)
++{
++      struct io_uring_task *tctx = task->io_uring;
++      unsigned int refs = tctx->cached_refs;
++
++      if (refs) {
++              tctx->cached_refs = 0;
++              percpu_counter_sub(&tctx->inflight, refs);
++              put_task_struct_many(task, refs);
++      }
++}
++
+ static bool io_cqring_event_overflow(struct io_ring_ctx *ctx, u64 user_data,
+                                    long res, unsigned int cflags)
+ {
+@@ -2200,6 +2212,10 @@ static void tctx_task_work(struct callba
+       }
+       ctx_flush_and_put(ctx, &locked);
++
++      /* relaxed read is enough as only the task itself sets ->in_idle */
++      if (unlikely(atomic_read(&tctx->in_idle)))
++              io_uring_drop_tctx_refs(current);
+ }
+ static void io_req_task_work_add(struct io_kiocb *req)
+@@ -9766,18 +9782,6 @@ static s64 tctx_inflight(struct io_uring
+       return percpu_counter_sum(&tctx->inflight);
+ }
+-static void io_uring_drop_tctx_refs(struct task_struct *task)
+-{
+-      struct io_uring_task *tctx = task->io_uring;
+-      unsigned int refs = tctx->cached_refs;
+-
+-      if (refs) {
+-              tctx->cached_refs = 0;
+-              percpu_counter_sub(&tctx->inflight, refs);
+-              put_task_struct_many(task, refs);
+-      }
+-}
+-
+ /*
+  * Find any io_uring ctx that this task has registered or done IO on, and cancel
+  * requests. @sqd should be not-null IFF it's an SQPOLL thread cancellation.
+@@ -9834,10 +9838,14 @@ static void io_uring_cancel_generic(bool
+                       schedule();
+               finish_wait(&tctx->wait, &wait);
+       } while (1);
+-      atomic_dec(&tctx->in_idle);
+       io_uring_clean_tctx(tctx);
+       if (cancel_all) {
++              /*
++               * We shouldn't run task_works after cancel, so just leave
++               * ->in_idle set for normal exit.
++               */
++              atomic_dec(&tctx->in_idle);
+               /* for exec all current's requests should be gone, kill tctx */
+               __io_uring_free(current);
+       }
diff --git a/queue-5.15/memcg-better-bounds-on-the-memcg-stats-updates.patch b/queue-5.15/memcg-better-bounds-on-the-memcg-stats-updates.patch
new file mode 100644 (file)
index 0000000..3519093
--- /dev/null
@@ -0,0 +1,99 @@
+From 5b3be698a872c490dbed524f3e2463701ab21339 Mon Sep 17 00:00:00 2001
+From: Shakeel Butt <shakeelb@google.com>
+Date: Fri, 14 Jan 2022 14:05:39 -0800
+Subject: memcg: better bounds on the memcg stats updates
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Shakeel Butt <shakeelb@google.com>
+
+commit 5b3be698a872c490dbed524f3e2463701ab21339 upstream.
+
+Commit 11192d9c124d ("memcg: flush stats only if updated") added
+tracking of memcg stats updates which is used by the readers to flush
+only if the updates are over a certain threshold.  However each
+individual update can correspond to a large value change for a given
+stat.  For example adding or removing a hugepage to an LRU changes the
+stat by thp_nr_pages (512 on x86_64).
+
+Treating the update related to THP as one can keep the stat off, in
+theory, by (thp_nr_pages * nr_cpus * CHARGE_BATCH) before flush.
+
+To handle such scenarios, this patch makes the tracking consider the
+stat update value as well, instead of just the update event.  In
+addition, let the async flusher unconditionally flush the stats, to
+put a time limit on the stats skew; hopefully far fewer readers would
+then need to flush.
+
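+As a worked example (assuming MEMCG_CHARGE_BATCH is 32 and 4 KiB base
+pages): on a 64-CPU x86_64 machine the old event-based counting could,
+in theory, let a stat drift by
+
+	thp_nr_pages * nr_cpus * MEMCG_CHARGE_BATCH
+	    = 512 * 64 * 32 = 1,048,576 pages (4 GiB)
+
+before a flush, whereas weighting each event by abs(val) bounds the
+drift in pages rather than in update events.
+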
+Link: https://lkml.kernel.org/r/20211118065350.697046-1-shakeelb@google.com
+Signed-off-by: Shakeel Butt <shakeelb@google.com>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: "Michal Koutný" <mkoutny@suse.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Ivan Babrou <ivan@cloudflare.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/memcontrol.c |   20 +++++++++++++-------
+ 1 file changed, 13 insertions(+), 7 deletions(-)
+
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -651,11 +651,17 @@ static DEFINE_SPINLOCK(stats_flush_lock)
+ static DEFINE_PER_CPU(unsigned int, stats_updates);
+ static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
+-static inline void memcg_rstat_updated(struct mem_cgroup *memcg)
++static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
+ {
++      unsigned int x;
++
+       cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
+-      if (!(__this_cpu_inc_return(stats_updates) % MEMCG_CHARGE_BATCH))
+-              atomic_inc(&stats_flush_threshold);
++
++      x = __this_cpu_add_return(stats_updates, abs(val));
++      if (x > MEMCG_CHARGE_BATCH) {
++              atomic_add(x / MEMCG_CHARGE_BATCH, &stats_flush_threshold);
++              __this_cpu_write(stats_updates, 0);
++      }
+ }
+ static void __mem_cgroup_flush_stats(void)
+@@ -678,7 +684,7 @@ void mem_cgroup_flush_stats(void)
+ static void flush_memcg_stats_dwork(struct work_struct *w)
+ {
+-      mem_cgroup_flush_stats();
++      __mem_cgroup_flush_stats();
+       queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ);
+ }
+@@ -694,7 +700,7 @@ void __mod_memcg_state(struct mem_cgroup
+               return;
+       __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
+-      memcg_rstat_updated(memcg);
++      memcg_rstat_updated(memcg, val);
+ }
+ /* idx can be of type enum memcg_stat_item or node_stat_item. */
+@@ -727,7 +733,7 @@ void __mod_memcg_lruvec_state(struct lru
+       /* Update lruvec */
+       __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
+-      memcg_rstat_updated(memcg);
++      memcg_rstat_updated(memcg, val);
+ }
+ /**
+@@ -829,7 +835,7 @@ void __count_memcg_events(struct mem_cgr
+               return;
+       __this_cpu_add(memcg->vmstats_percpu->events[idx], count);
+-      memcg_rstat_updated(memcg);
++      memcg_rstat_updated(memcg, count);
+ }
+ static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
diff --git a/queue-5.15/memcg-flush-stats-only-if-updated.patch b/queue-5.15/memcg-flush-stats-only-if-updated.patch
new file mode 100644 (file)
index 0000000..c95a9fb
--- /dev/null
@@ -0,0 +1,174 @@
+From 11192d9c124d58d66449b163ed0d2cdff03761a1 Mon Sep 17 00:00:00 2001
+From: Shakeel Butt <shakeelb@google.com>
+Date: Fri, 5 Nov 2021 13:37:31 -0700
+Subject: memcg: flush stats only if updated
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Shakeel Butt <shakeelb@google.com>
+
+commit 11192d9c124d58d66449b163ed0d2cdff03761a1 upstream.
+
+At the moment, the kernel flushes the memcg stats on every refault and
+also on every reclaim iteration.  Although rstat maintains a per-cpu
+update tree, on a flush the kernel still has to go through every cpu's
+rstat update tree to check if there is anything to flush.  This patch
+adds tracking on the stats update side to make the flush side more
+clever, by skipping the flush if there is no update.
+
+The stats update codepath is very performance sensitive for many
+workloads and benchmarks.  So, we cannot follow what commit
+aa48e47e3906 ("memcg: infrastructure to flush memcg stats") did, which
+was triggering an async flush through queue_work() and caused a lot of
+performance regression reports.  That got reverted by commit
+1f828223b799 ("memcg: flush lruvec stats in the refault").
+
+In this patch we kept the stats update codepath very minimal and let
+the stats reader side flush the stats only when the updates are over a
+specific threshold.  For now the threshold is (nr_cpus * CHARGE_BATCH).
+
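+The resulting scheme, distilled from the hunks below: updates bump a
+per-cpu counter that feeds a global threshold, and readers flush only
+once the threshold is crossed:
+
+	void mem_cgroup_flush_stats(void)
+	{
+		if (atomic_read(&stats_flush_threshold) > num_online_cpus())
+			__mem_cgroup_flush_stats();
+	}
+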
+To evaluate the impact of this patch, an 8 GiB tmpfs file was created
+on a system with swap-on-zram, and the file was pushed to swap through
+the memory.force_empty interface.  On reading the whole file, the
+memcg stat flush in the refault code path is triggered.  With this
+patch, we observed a 63% reduction in the read time of the 8 GiB file.
+
+Link: https://lkml.kernel.org/r/20211001190040.48086-1-shakeelb@google.com
+Signed-off-by: Shakeel Butt <shakeelb@google.com>
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Michal Hocko <mhocko@kernel.org>
+Reviewed-by: "Michal Koutný" <mkoutny@suse.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Ivan Babrou <ivan@cloudflare.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/memcontrol.c |   78 +++++++++++++++++++++++++++++++++++++++-----------------
+ 1 file changed, 55 insertions(+), 23 deletions(-)
+
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -103,11 +103,6 @@ static bool do_memsw_account(void)
+       return !cgroup_subsys_on_dfl(memory_cgrp_subsys) && !cgroup_memory_noswap;
+ }
+-/* memcg and lruvec stats flushing */
+-static void flush_memcg_stats_dwork(struct work_struct *w);
+-static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
+-static DEFINE_SPINLOCK(stats_flush_lock);
+-
+ #define THRESHOLDS_EVENTS_TARGET 128
+ #define SOFTLIMIT_EVENTS_TARGET 1024
+@@ -635,6 +630,56 @@ mem_cgroup_largest_soft_limit_node(struc
+       return mz;
+ }
++/*
++ * memcg and lruvec stats flushing
++ *
++ * Many codepaths leading to stats update or read are performance sensitive and
++ * adding stats flushing in such codepaths is not desirable. So, to optimize the
++ * flushing the kernel does:
++ *
++ * 1) Periodically and asynchronously flush the stats every 2 seconds to not let
++ *    rstat update tree grow unbounded.
++ *
++ * 2) Flush the stats synchronously on reader side only when there are more than
++ *    (MEMCG_CHARGE_BATCH * nr_cpus) update events. Though this optimization
++ *    will let stats be out of sync by atmost (MEMCG_CHARGE_BATCH * nr_cpus) but
++ *    only for 2 seconds due to (1).
++ */
++static void flush_memcg_stats_dwork(struct work_struct *w);
++static DECLARE_DEFERRABLE_WORK(stats_flush_dwork, flush_memcg_stats_dwork);
++static DEFINE_SPINLOCK(stats_flush_lock);
++static DEFINE_PER_CPU(unsigned int, stats_updates);
++static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
++
++static inline void memcg_rstat_updated(struct mem_cgroup *memcg)
++{
++      cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
++      if (!(__this_cpu_inc_return(stats_updates) % MEMCG_CHARGE_BATCH))
++              atomic_inc(&stats_flush_threshold);
++}
++
++static void __mem_cgroup_flush_stats(void)
++{
++      if (!spin_trylock(&stats_flush_lock))
++              return;
++
++      cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup);
++      atomic_set(&stats_flush_threshold, 0);
++      spin_unlock(&stats_flush_lock);
++}
++
++void mem_cgroup_flush_stats(void)
++{
++      if (atomic_read(&stats_flush_threshold) > num_online_cpus())
++              __mem_cgroup_flush_stats();
++}
++
++static void flush_memcg_stats_dwork(struct work_struct *w)
++{
++      mem_cgroup_flush_stats();
++      queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ);
++}
++
+ /**
+  * __mod_memcg_state - update cgroup memory statistics
+  * @memcg: the memory cgroup
+@@ -647,7 +692,7 @@ void __mod_memcg_state(struct mem_cgroup
+               return;
+       __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
+-      cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
++      memcg_rstat_updated(memcg);
+ }
+ /* idx can be of type enum memcg_stat_item or node_stat_item. */
+@@ -675,10 +720,12 @@ void __mod_memcg_lruvec_state(struct lru
+       memcg = pn->memcg;
+       /* Update memcg */
+-      __mod_memcg_state(memcg, idx, val);
++      __this_cpu_add(memcg->vmstats_percpu->state[idx], val);
+       /* Update lruvec */
+       __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val);
++
++      memcg_rstat_updated(memcg);
+ }
+ /**
+@@ -780,7 +827,7 @@ void __count_memcg_events(struct mem_cgr
+               return;
+       __this_cpu_add(memcg->vmstats_percpu->events[idx], count);
+-      cgroup_rstat_updated(memcg->css.cgroup, smp_processor_id());
++      memcg_rstat_updated(memcg);
+ }
+ static unsigned long memcg_events(struct mem_cgroup *memcg, int event)
+@@ -5330,21 +5377,6 @@ static void mem_cgroup_css_reset(struct
+       memcg_wb_domain_size_changed(memcg);
+ }
+-void mem_cgroup_flush_stats(void)
+-{
+-      if (!spin_trylock(&stats_flush_lock))
+-              return;
+-
+-      cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup);
+-      spin_unlock(&stats_flush_lock);
+-}
+-
+-static void flush_memcg_stats_dwork(struct work_struct *w)
+-{
+-      mem_cgroup_flush_stats();
+-      queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ);
+-}
+-
+ static void mem_cgroup_css_rstat_flush(struct cgroup_subsys_state *css, int cpu)
+ {
+       struct mem_cgroup *memcg = mem_cgroup_from_css(css);
diff --git a/queue-5.15/memcg-unify-memcg-stat-flushing.patch b/queue-5.15/memcg-unify-memcg-stat-flushing.patch
new file mode 100644 (file)
index 0000000..99005d6
--- /dev/null
@@ -0,0 +1,109 @@
+From fd25a9e0e23b995fd0ba5e2f00a1099452cbc3cf Mon Sep 17 00:00:00 2001
+From: Shakeel Butt <shakeelb@google.com>
+Date: Fri, 5 Nov 2021 13:37:34 -0700
+Subject: memcg: unify memcg stat flushing
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Shakeel Butt <shakeelb@google.com>
+
+commit fd25a9e0e23b995fd0ba5e2f00a1099452cbc3cf upstream.
+
+The memcg stats can be flushed in multiple contexts, and potentially
+in parallel too.  For example, multiple parallel user space readers of
+memcg stats will contend on the rstat locks with each other.  There is
+no need for that: we just need one flusher, and everyone else can
+benefit.
+
+In addition, after aa48e47e3906 ("memcg: infrastructure to flush memcg
+stats") the kernel periodically flushes the memcg stats from the root,
+so the other flushers will potentially have much less work to do.
+
+Link: https://lkml.kernel.org/r/20211001190040.48086-2-shakeelb@google.com
+Signed-off-by: Shakeel Butt <shakeelb@google.com>
+Acked-by: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Michal Hocko <mhocko@kernel.org>
+Cc: "Michal Koutný" <mkoutny@suse.com>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Cc: Ivan Babrou <ivan@cloudflare.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/memcontrol.c |   19 ++++++++++---------
+ 1 file changed, 10 insertions(+), 9 deletions(-)
+
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -660,12 +660,14 @@ static inline void memcg_rstat_updated(s
+ static void __mem_cgroup_flush_stats(void)
+ {
+-      if (!spin_trylock(&stats_flush_lock))
++      unsigned long flag;
++
++      if (!spin_trylock_irqsave(&stats_flush_lock, flag))
+               return;
+       cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup);
+       atomic_set(&stats_flush_threshold, 0);
+-      spin_unlock(&stats_flush_lock);
++      spin_unlock_irqrestore(&stats_flush_lock, flag);
+ }
+ void mem_cgroup_flush_stats(void)
+@@ -1461,7 +1463,7 @@ static char *memory_stat_format(struct m
+        *
+        * Current memory state:
+        */
+-      cgroup_rstat_flush(memcg->css.cgroup);
++      mem_cgroup_flush_stats();
+       for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
+               u64 size;
+@@ -3554,8 +3556,7 @@ static unsigned long mem_cgroup_usage(st
+       unsigned long val;
+       if (mem_cgroup_is_root(memcg)) {
+-              /* mem_cgroup_threshold() calls here from irqsafe context */
+-              cgroup_rstat_flush_irqsafe(memcg->css.cgroup);
++              mem_cgroup_flush_stats();
+               val = memcg_page_state(memcg, NR_FILE_PAGES) +
+                       memcg_page_state(memcg, NR_ANON_MAPPED);
+               if (swap)
+@@ -3936,7 +3937,7 @@ static int memcg_numa_stat_show(struct s
+       int nid;
+       struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+-      cgroup_rstat_flush(memcg->css.cgroup);
++      mem_cgroup_flush_stats();
+       for (stat = stats; stat < stats + ARRAY_SIZE(stats); stat++) {
+               seq_printf(m, "%s=%lu", stat->name,
+@@ -4008,7 +4009,7 @@ static int memcg_stat_show(struct seq_fi
+       BUILD_BUG_ON(ARRAY_SIZE(memcg1_stat_names) != ARRAY_SIZE(memcg1_stats));
+-      cgroup_rstat_flush(memcg->css.cgroup);
++      mem_cgroup_flush_stats();
+       for (i = 0; i < ARRAY_SIZE(memcg1_stats); i++) {
+               unsigned long nr;
+@@ -4511,7 +4512,7 @@ void mem_cgroup_wb_stats(struct bdi_writ
+       struct mem_cgroup *memcg = mem_cgroup_from_css(wb->memcg_css);
+       struct mem_cgroup *parent;
+-      cgroup_rstat_flush_irqsafe(memcg->css.cgroup);
++      mem_cgroup_flush_stats();
+       *pdirty = memcg_page_state(memcg, NR_FILE_DIRTY);
+       *pwriteback = memcg_page_state(memcg, NR_WRITEBACK);
+@@ -6394,7 +6395,7 @@ static int memory_numa_stat_show(struct
+       int i;
+       struct mem_cgroup *memcg = mem_cgroup_from_seq(m);
+-      cgroup_rstat_flush(memcg->css.cgroup);
++      mem_cgroup_flush_stats();
+       for (i = 0; i < ARRAY_SIZE(memory_stats); i++) {
+               int nid;
diff --git a/queue-5.15/rcu-tighten-rcu_advance_cbs_nowake-checks.patch b/queue-5.15/rcu-tighten-rcu_advance_cbs_nowake-checks.patch
new file mode 100644 (file)
index 0000000..dfaacc9
--- /dev/null
@@ -0,0 +1,43 @@
+From 614ddad17f22a22e035e2ea37a04815f50362017 Mon Sep 17 00:00:00 2001
+From: "Paul E. McKenney" <paulmck@kernel.org>
+Date: Fri, 17 Sep 2021 15:04:48 -0700
+Subject: rcu: Tighten rcu_advance_cbs_nowake() checks
+
+From: Paul E. McKenney <paulmck@kernel.org>
+
+commit 614ddad17f22a22e035e2ea37a04815f50362017 upstream.
+
+Currently, rcu_advance_cbs_nowake() checks that a grace period is in
+progress; however, that grace period could end just after the check.
+This commit rechecks that a grace period is still in progress while
+holding the rcu_node structure's lock.  The grace period cannot end
+while the current CPU's rcu_node structure's ->lock is held, thus
+avoiding false positives from the WARN_ON_ONCE().
+
+As Daniel Vacek noted, it is not necessary for the rcu_node structure
+to have a CPU that has not yet passed through its quiescent state.
+
+Tested-by: Guillaume Morin <guillaume@morinfr.org>
+Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ kernel/rcu/tree.c |    7 ++++---
+ 1 file changed, 4 insertions(+), 3 deletions(-)
+
+--- a/kernel/rcu/tree.c
++++ b/kernel/rcu/tree.c
+@@ -1594,10 +1594,11 @@ static void __maybe_unused rcu_advance_c
+                                                 struct rcu_data *rdp)
+ {
+       rcu_lockdep_assert_cblist_protected(rdp);
+-      if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) ||
+-          !raw_spin_trylock_rcu_node(rnp))
++      if (!rcu_seq_state(rcu_seq_current(&rnp->gp_seq)) || !raw_spin_trylock_rcu_node(rnp))
+               return;
+-      WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp));
++      // The grace period cannot end while we hold the rcu_node lock.
++      if (rcu_seq_state(rcu_seq_current(&rnp->gp_seq)))
++              WARN_ON_ONCE(rcu_advance_cbs(rnp, rdp));
+       raw_spin_unlock_rcu_node(rnp);
+ }
diff --git a/queue-5.15/select-fix-indefinitely-sleeping-task-in-poll_schedule_timeout.patch b/queue-5.15/select-fix-indefinitely-sleeping-task-in-poll_schedule_timeout.patch
new file mode 100644 (file)
index 0000000..9d11e52
--- /dev/null
@@ -0,0 +1,135 @@
+From 68514dacf2715d11b91ca50d88de047c086fea9c Mon Sep 17 00:00:00 2001
+From: Jan Kara <jack@suse.cz>
+Date: Mon, 10 Jan 2022 19:19:23 +0100
+Subject: select: Fix indefinitely sleeping task in poll_schedule_timeout()
+
+From: Jan Kara <jack@suse.cz>
+
+commit 68514dacf2715d11b91ca50d88de047c086fea9c upstream.
+
+A task can end up indefinitely sleeping in do_select() ->
+poll_schedule_timeout() when the following race happens:
+
+  TASK1 (thread1)             TASK2                   TASK1 (thread2)
+  do_select()
+    setup poll_wqueues table
+    with 'fd'
+                              write data to 'fd'
+                                pollwake()
+                                  table->triggered = 1
+                                                      closes 'fd' thread1 is
+                                                        waiting for
+    poll_schedule_timeout()
+      - sees table->triggered
+      table->triggered = 0
+      return -EINTR
+    loop back in do_select()
+
+But at this point when TASK1 loops back, the fdget() in the setup of
+poll_wqueues fails.  So now so we never find 'fd' is ready for reading
+and sleep in poll_schedule_timeout() indefinitely.
+
+Treat an fd that got closed as an fd on which some event happened.
+This makes sure we cannot block indefinitely in do_select().
+
+Another option would be to return -EBADF in this case, but that has
+the potential of subtly breaking applications that exercise this
+behavior and for which it happens to work.  So returning the fd as
+active seems like a safer choice.
+
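+Mechanically (condensed from the diff below), the mask now defaults to
+EPOLLNVAL when fdget() fails, and EPOLLNVAL is added to the three
+readiness sets so a closed fd counts as ready:
+
+	#define POLLIN_SET  (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN | \
+			     EPOLLHUP | EPOLLERR | EPOLLNVAL)
+
+	mask = EPOLLNVAL;		/* assume the fd is gone */
+	f = fdget(i);
+	if (f.file) {			/* still open: poll it normally */
+		mask = vfs_poll(f.file, wait);
+		fdput(f);
+	}
+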
+Suggested-by: Linus Torvalds <torvalds@linux-foundation.org>
+CC: stable@vger.kernel.org
+Signed-off-by: Jan Kara <jack@suse.cz>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/select.c |   63 +++++++++++++++++++++++++++++++-----------------------------
+ 1 file changed, 33 insertions(+), 30 deletions(-)
+
+--- a/fs/select.c
++++ b/fs/select.c
+@@ -458,9 +458,11 @@ get_max:
+       return max;
+ }
+-#define POLLIN_SET (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN | EPOLLHUP | EPOLLERR)
+-#define POLLOUT_SET (EPOLLWRBAND | EPOLLWRNORM | EPOLLOUT | EPOLLERR)
+-#define POLLEX_SET (EPOLLPRI)
++#define POLLIN_SET (EPOLLRDNORM | EPOLLRDBAND | EPOLLIN | EPOLLHUP | EPOLLERR |\
++                      EPOLLNVAL)
++#define POLLOUT_SET (EPOLLWRBAND | EPOLLWRNORM | EPOLLOUT | EPOLLERR |\
++                       EPOLLNVAL)
++#define POLLEX_SET (EPOLLPRI | EPOLLNVAL)
+ static inline void wait_key_set(poll_table *wait, unsigned long in,
+                               unsigned long out, unsigned long bit,
+@@ -527,6 +529,7 @@ static int do_select(int n, fd_set_bits
+                                       break;
+                               if (!(bit & all_bits))
+                                       continue;
++                              mask = EPOLLNVAL;
+                               f = fdget(i);
+                               if (f.file) {
+                                       wait_key_set(wait, in, out, bit,
+@@ -534,34 +537,34 @@ static int do_select(int n, fd_set_bits
+                                       mask = vfs_poll(f.file, wait);
+                                       fdput(f);
+-                                      if ((mask & POLLIN_SET) && (in & bit)) {
+-                                              res_in |= bit;
+-                                              retval++;
+-                                              wait->_qproc = NULL;
+-                                      }
+-                                      if ((mask & POLLOUT_SET) && (out & bit)) {
+-                                              res_out |= bit;
+-                                              retval++;
+-                                              wait->_qproc = NULL;
+-                                      }
+-                                      if ((mask & POLLEX_SET) && (ex & bit)) {
+-                                              res_ex |= bit;
+-                                              retval++;
+-                                              wait->_qproc = NULL;
+-                                      }
+-                                      /* got something, stop busy polling */
+-                                      if (retval) {
+-                                              can_busy_loop = false;
+-                                              busy_flag = 0;
+-
+-                                      /*
+-                                       * only remember a returned
+-                                       * POLL_BUSY_LOOP if we asked for it
+-                                       */
+-                                      } else if (busy_flag & mask)
+-                                              can_busy_loop = true;
+-
+                               }
++                              if ((mask & POLLIN_SET) && (in & bit)) {
++                                      res_in |= bit;
++                                      retval++;
++                                      wait->_qproc = NULL;
++                              }
++                              if ((mask & POLLOUT_SET) && (out & bit)) {
++                                      res_out |= bit;
++                                      retval++;
++                                      wait->_qproc = NULL;
++                              }
++                              if ((mask & POLLEX_SET) && (ex & bit)) {
++                                      res_ex |= bit;
++                                      retval++;
++                                      wait->_qproc = NULL;
++                              }
++                              /* got something, stop busy polling */
++                              if (retval) {
++                                      can_busy_loop = false;
++                                      busy_flag = 0;
++
++                              /*
++                               * only remember a returned
++                               * POLL_BUSY_LOOP if we asked for it
++                               */
++                              } else if (busy_flag & mask)
++                                      can_busy_loop = true;
++
+                       }
+                       if (res_in)
+                               *rinp = res_in;
diff --git a/queue-5.15/series b/queue-5.15/series
index 638a593af76f3a753355c1927d8a342cd2f38706..2eb031c3c8b1e469883b1f27a3e9910fbcd70115 100644 (file)
@@ -1,2 +1,11 @@
 drm-i915-flush-tlbs-before-releasing-backing-store.patch
 drm-amd-display-reset-dcn31-smu-mailbox-on-failures.patch
+io_uring-fix-not-released-cached-task-refs.patch
+bnx2x-utilize-firmware-7.13.21.0.patch
+bnx2x-invalidate-fastpath-hsi-version-for-vfs.patch
+memcg-flush-stats-only-if-updated.patch
+memcg-unify-memcg-stat-flushing.patch
+memcg-better-bounds-on-the-memcg-stats-updates.patch
+rcu-tighten-rcu_advance_cbs_nowake-checks.patch
+select-fix-indefinitely-sleeping-task-in-poll_schedule_timeout.patch
+drm-amdgpu-use-correct-viewport_dimension-for-dcn2.patch