git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
5.15-stable patches
author    Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 25 Apr 2022 10:27:43 +0000 (12:27 +0200)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 25 Apr 2022 10:27:43 +0000 (12:27 +0200)
added patches:
ata-pata_marvell-check-the-bmdma_addr-beforing-reading.patch
dma-at_xdmac-fix-a-missing-check-on-list-iterator.patch
dmaengine-imx-sdma-fix-init-of-uart-scripts.patch
edac-synopsys-read-the-error-count-from-the-correct-register.patch
memcg-sync-flush-only-if-periodic-flush-is-delayed.patch
mm-hugetlb-allow-for-high-userspace-addresses.patch
mm-memory-failure.c-skip-huge_zero_page-in-memory_failure.patch
mm-mmu_notifier.c-fix-race-in-mmu_interval_notifier_remove.patch
net-atlantic-invert-deep-par-in-pm-functions-preventing-null-derefs.patch
oom_kill.c-futex-delay-the-oom-reaper-to-allow-time-for-proper-futex-cleanup.patch

queue-5.15/ata-pata_marvell-check-the-bmdma_addr-beforing-reading.patch [new file with mode: 0644]
queue-5.15/dma-at_xdmac-fix-a-missing-check-on-list-iterator.patch [new file with mode: 0644]
queue-5.15/dmaengine-imx-sdma-fix-init-of-uart-scripts.patch [new file with mode: 0644]
queue-5.15/edac-synopsys-read-the-error-count-from-the-correct-register.patch [new file with mode: 0644]
queue-5.15/memcg-sync-flush-only-if-periodic-flush-is-delayed.patch [new file with mode: 0644]
queue-5.15/mm-hugetlb-allow-for-high-userspace-addresses.patch [new file with mode: 0644]
queue-5.15/mm-memory-failure.c-skip-huge_zero_page-in-memory_failure.patch [new file with mode: 0644]
queue-5.15/mm-mmu_notifier.c-fix-race-in-mmu_interval_notifier_remove.patch [new file with mode: 0644]
queue-5.15/net-atlantic-invert-deep-par-in-pm-functions-preventing-null-derefs.patch [new file with mode: 0644]
queue-5.15/oom_kill.c-futex-delay-the-oom-reaper-to-allow-time-for-proper-futex-cleanup.patch [new file with mode: 0644]
queue-5.15/series

diff --git a/queue-5.15/ata-pata_marvell-check-the-bmdma_addr-beforing-reading.patch b/queue-5.15/ata-pata_marvell-check-the-bmdma_addr-beforing-reading.patch
new file mode 100644 (file)
index 0000000..c79f199
--- /dev/null
@@ -0,0 +1,40 @@
+From aafa9f958342db36c17ac2a7f1b841032c96feb4 Mon Sep 17 00:00:00 2001
+From: Zheyu Ma <zheyuma97@gmail.com>
+Date: Thu, 21 Apr 2022 09:39:20 +0800
+Subject: ata: pata_marvell: Check the 'bmdma_addr' beforing reading
+
+From: Zheyu Ma <zheyuma97@gmail.com>
+
+commit aafa9f958342db36c17ac2a7f1b841032c96feb4 upstream.
+
+Before detecting the cable type on the DMA BAR, the driver should check
+whether 'bmdma_addr' is zero, which means the adapter does not
+support DMA; otherwise we will get the following error:
+
+[    5.146634] Bad IO access at port 0x1 (return inb(port))
+[    5.147206] WARNING: CPU: 2 PID: 303 at lib/iomap.c:44 ioread8+0x4a/0x60
+[    5.150856] RIP: 0010:ioread8+0x4a/0x60
+[    5.160238] Call Trace:
+[    5.160470]  <TASK>
+[    5.160674]  marvell_cable_detect+0x6e/0xc0 [pata_marvell]
+[    5.161728]  ata_eh_recover+0x3520/0x6cc0
+[    5.168075]  ata_do_eh+0x49/0x3c0
+
+Signed-off-by: Zheyu Ma <zheyuma97@gmail.com>
+Signed-off-by: Damien Le Moal <damien.lemoal@opensource.wdc.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/ata/pata_marvell.c |    2 ++
+ 1 file changed, 2 insertions(+)
+
+--- a/drivers/ata/pata_marvell.c
++++ b/drivers/ata/pata_marvell.c
+@@ -83,6 +83,8 @@ static int marvell_cable_detect(struct a
+       switch(ap->port_no)
+       {
+       case 0:
++              if (!ap->ioaddr.bmdma_addr)
++                      return ATA_CBL_PATA_UNK;
+               if (ioread8(ap->ioaddr.bmdma_addr + 1) & 1)
+                       return ATA_CBL_PATA40;
+               return ATA_CBL_PATA80;
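
The guard generalizes beyond libata: a zero 'bmdma_addr' means the optional
DMA BAR was never mapped, so any ioread through it is what trips the
"Bad IO access" warning above. Below is a minimal userspace C sketch of the
same early-return shape; 'struct port_addrs', 'cable_detect()' and the CBL_*
constants are illustrative stand-ins, not libata API.

```
#include <stdio.h>
#include <stdint.h>

#define CBL_UNKNOWN 0
#define CBL_PATA40  1
#define CBL_PATA80  2

struct port_addrs {
	volatile uint8_t *bmdma_addr; /* NULL when the adapter has no DMA BAR */
};

/* Modeled on marvell_cable_detect(): bail out before touching the
 * DMA registers if the base address was never set up. */
static int cable_detect(struct port_addrs *p)
{
	if (!p->bmdma_addr)
		return CBL_UNKNOWN;   /* no DMA support: report unknown cable */
	if (p->bmdma_addr[1] & 1)     /* stand-in for ioread8(bmdma_addr + 1) */
		return CBL_PATA40;
	return CBL_PATA80;
}

int main(void)
{
	struct port_addrs no_dma = { .bmdma_addr = NULL };

	/* Prints 0 (unknown) instead of faulting on a bogus read. */
	printf("cable: %d\n", cable_detect(&no_dma));
	return 0;
}
```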
diff --git a/queue-5.15/dma-at_xdmac-fix-a-missing-check-on-list-iterator.patch b/queue-5.15/dma-at_xdmac-fix-a-missing-check-on-list-iterator.patch
new file mode 100644 (file)
index 0000000..9575a7f
--- /dev/null
@@ -0,0 +1,57 @@
+From 206680c4e46b62fd8909385e0874a36952595b85 Mon Sep 17 00:00:00 2001
+From: Xiaomeng Tong <xiam0nd.tong@gmail.com>
+Date: Sun, 27 Mar 2022 14:11:54 +0800
+Subject: dma: at_xdmac: fix a missing check on list iterator
+
+From: Xiaomeng Tong <xiam0nd.tong@gmail.com>
+
+commit 206680c4e46b62fd8909385e0874a36952595b85 upstream.
+
+The bug is here:
+       __func__, desc, &desc->tx_dma_desc.phys, ret, cookie, residue);
+
+The list iterator 'desc' will point to a bogus position containing
+HEAD if the list is empty or no element is found. To avoid dev_dbg()
+printing an invalid address, use a new variable 'iter' as the list
+iterator, while using the original variable 'desc' as a dedicated
+pointer to the found element.
+
+Cc: stable@vger.kernel.org
+Fixes: 82e2424635f4c ("dmaengine: xdmac: fix print warning on dma_addr_t variable")
+Signed-off-by: Xiaomeng Tong <xiam0nd.tong@gmail.com>
+Link: https://lore.kernel.org/r/20220327061154.4867-1-xiam0nd.tong@gmail.com
+Signed-off-by: Vinod Koul <vkoul@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/dma/at_xdmac.c |   12 +++++++-----
+ 1 file changed, 7 insertions(+), 5 deletions(-)
+
+--- a/drivers/dma/at_xdmac.c
++++ b/drivers/dma/at_xdmac.c
+@@ -1450,7 +1450,7 @@ at_xdmac_tx_status(struct dma_chan *chan
+ {
+       struct at_xdmac_chan    *atchan = to_at_xdmac_chan(chan);
+       struct at_xdmac         *atxdmac = to_at_xdmac(atchan->chan.device);
+-      struct at_xdmac_desc    *desc, *_desc;
++      struct at_xdmac_desc    *desc, *_desc, *iter;
+       struct list_head        *descs_list;
+       enum dma_status         ret;
+       int                     residue, retry;
+@@ -1565,11 +1565,13 @@ at_xdmac_tx_status(struct dma_chan *chan
+        * microblock.
+        */
+       descs_list = &desc->descs_list;
+-      list_for_each_entry_safe(desc, _desc, descs_list, desc_node) {
+-              dwidth = at_xdmac_get_dwidth(desc->lld.mbr_cfg);
+-              residue -= (desc->lld.mbr_ubc & 0xffffff) << dwidth;
+-              if ((desc->lld.mbr_nda & 0xfffffffc) == cur_nda)
++      list_for_each_entry_safe(iter, _desc, descs_list, desc_node) {
++              dwidth = at_xdmac_get_dwidth(iter->lld.mbr_cfg);
++              residue -= (iter->lld.mbr_ubc & 0xffffff) << dwidth;
++              if ((iter->lld.mbr_nda & 0xfffffffc) == cur_nda) {
++                      desc = iter;
+                       break;
++              }
+       }
+       residue += cur_ubc << dwidth;
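
The hazard fixed here is generic to list_for_each_entry()-style loops: if the
loop ends without a break, the iterator holds a type-punned pointer into the
list head, not a valid element. A small plain-C sketch of the "iterate with
'iter', record the match in a dedicated pointer" shape, with an ordinary
singly linked list standing in for the kernel list API:

```
#include <stdio.h>
#include <stddef.h>

struct node {
	int key;
	struct node *next;
};

int main(void)
{
	struct node c = { 3, NULL }, b = { 2, &c }, a = { 1, &b };
	struct node *head = &a;

	/* Iterate with 'iter'; only assign 'found' on an actual match,
	 * mirroring how the patch keeps 'desc' valid for the later
	 * dev_dbg() call. */
	struct node *found = NULL;
	for (struct node *iter = head; iter; iter = iter->next) {
		if (iter->key == 2) {
			found = iter;
			break;
		}
	}

	if (found)
		printf("found key=%d\n", found->key);
	else
		printf("no match; 'found' stayed NULL instead of dangling\n");
	return 0;
}
```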
diff --git a/queue-5.15/dmaengine-imx-sdma-fix-init-of-uart-scripts.patch b/queue-5.15/dmaengine-imx-sdma-fix-init-of-uart-scripts.patch
new file mode 100644 (file)
index 0000000..0c62d8b
--- /dev/null
@@ -0,0 +1,103 @@
+From a3ae97f4c87d9570e7e9a3e3324c443757f6e29a Mon Sep 17 00:00:00 2001
+From: Kevin Groeneveld <kgroeneveld@lenbrook.com>
+Date: Sun, 10 Apr 2022 18:31:18 -0400
+Subject: dmaengine: imx-sdma: fix init of uart scripts
+
+From: Kevin Groeneveld <kgroeneveld@lenbrook.com>
+
+commit a3ae97f4c87d9570e7e9a3e3324c443757f6e29a upstream.
+
+Commit b98ce2f4e32b ("dmaengine: imx-sdma: add uart rom script") broke
+uart rx on imx5 when using sdma firmware from the older Freescale 2.6.35
+kernel. In this case reading addr->uartXX_2_mcu_addr was going out of
+bounds of the firmware memory and corrupting the uart script addresses.
+
+Simply adding a bounds check before accessing addr->uartXX_2_mcu_addr
+does not work as the uartXX_2_mcu_addr members are now beyond the size
+of the older firmware and the uart addresses would never be populated
+in that case. There are other ways to fix this, but overall it seems
+clearer to me to revert the uartXX_2_mcu_ram_addr structure
+entries back to uartXX_2_mcu_addr, change the newer entries to
+uartXX_2_mcu_rom_addr and update the logic accordingly.
+
+I have tested this patch on:
+1. An i.MX53 system with sdma firmware from Freescale 2.6.35 kernel.
+   Without this patch uart rx is broken in this scenario, with the
+   patch uart rx is restored.
+2. An i.MX6D system with no external sdma firmware. uart is okay with
+   or without this patch.
+3. An i.MX8MM system using current sdma-imx7d.bin firmware from
+   linux-firmware. uart is okay with or without this patch and I
+   confirmed the rom version of the uart script is being used which was
+   the intention and reason for commit b98ce2f4e32b ("dmaengine:
+   imx-sdma: add uart rom script") in the first place.
+
+Fixes: b98ce2f4e32b ("dmaengine: imx-sdma: add uart rom script")
+Cc: stable@vger.kernel.org
+Signed-off-by: Kevin Groeneveld <kgroeneveld@lenbrook.com>
+Reviewed-by: Lucas Stach <l.stach@pengutronix.de>
+Reviewed-by: Fabio Estevam <festevam@gmail.com>
+Acked-by: Russell King (Oracle) <rmk+kernel@armlinux.org.uk>
+Link: https://lore.kernel.org/r/20220410223118.15086-1-kgroeneveld@lenbrook.com
+Signed-off-by: Vinod Koul <vkoul@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/dma/imx-sdma.c |   28 ++++++++++++++--------------
+ 1 file changed, 14 insertions(+), 14 deletions(-)
+
+--- a/drivers/dma/imx-sdma.c
++++ b/drivers/dma/imx-sdma.c
+@@ -198,12 +198,12 @@ struct sdma_script_start_addrs {
+       s32 per_2_firi_addr;
+       s32 mcu_2_firi_addr;
+       s32 uart_2_per_addr;
+-      s32 uart_2_mcu_ram_addr;
++      s32 uart_2_mcu_addr;
+       s32 per_2_app_addr;
+       s32 mcu_2_app_addr;
+       s32 per_2_per_addr;
+       s32 uartsh_2_per_addr;
+-      s32 uartsh_2_mcu_ram_addr;
++      s32 uartsh_2_mcu_addr;
+       s32 per_2_shp_addr;
+       s32 mcu_2_shp_addr;
+       s32 ata_2_mcu_addr;
+@@ -232,8 +232,8 @@ struct sdma_script_start_addrs {
+       s32 mcu_2_ecspi_addr;
+       s32 mcu_2_sai_addr;
+       s32 sai_2_mcu_addr;
+-      s32 uart_2_mcu_addr;
+-      s32 uartsh_2_mcu_addr;
++      s32 uart_2_mcu_rom_addr;
++      s32 uartsh_2_mcu_rom_addr;
+       /* End of v3 array */
+       s32 mcu_2_zqspi_addr;
+       /* End of v4 array */
+@@ -1780,17 +1780,17 @@ static void sdma_add_scripts(struct sdma
+                       saddr_arr[i] = addr_arr[i];
+       /*
+-       * get uart_2_mcu_addr/uartsh_2_mcu_addr rom script specially because
+-       * they are now replaced by uart_2_mcu_ram_addr/uartsh_2_mcu_ram_addr
+-       * to be compatible with legacy freescale/nxp sdma firmware, and they
+-       * are located in the bottom part of sdma_script_start_addrs which are
+-       * beyond the SDMA_SCRIPT_ADDRS_ARRAY_SIZE_V1.
++       * For compatibility with NXP internal legacy kernel before 4.19 which
++       * is based on uart ram script and mainline kernel based on uart rom
++       * script, both uart ram/rom scripts are present in newer sdma
++       * firmware. Use the rom versions if they are present (V3 or newer).
+        */
+-      if (addr->uart_2_mcu_addr)
+-              sdma->script_addrs->uart_2_mcu_addr = addr->uart_2_mcu_addr;
+-      if (addr->uartsh_2_mcu_addr)
+-              sdma->script_addrs->uartsh_2_mcu_addr = addr->uartsh_2_mcu_addr;
+-
++      if (sdma->script_number >= SDMA_SCRIPT_ADDRS_ARRAY_SIZE_V3) {
++              if (addr->uart_2_mcu_rom_addr)
++                      sdma->script_addrs->uart_2_mcu_addr = addr->uart_2_mcu_rom_addr;
++              if (addr->uartsh_2_mcu_rom_addr)
++                      sdma->script_addrs->uartsh_2_mcu_addr = addr->uartsh_2_mcu_rom_addr;
++      }
+ }
+ static void sdma_load_firmware(const struct firmware *fw, void *context)
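
The core of the fix is gating struct-field access on the firmware's reported
script count: the ROM uart slots sit past the V1 boundary and simply do not
exist in legacy images. A hedged sketch of that gating; the layout and the
SCRIPTS_* constants are illustrative, not the real sdma_script_start_addrs:

```
#include <stdio.h>
#include <stdint.h>

/* Illustrative script-count boundaries, not the real SDMA values. */
#define SCRIPTS_V1 4
#define SCRIPTS_V3 6

struct script_addrs {
	int32_t uart_2_mcu_addr;     /* V1 slot: RAM script (legacy firmware) */
	int32_t misc_addr[3];
	int32_t uart_2_mcu_rom_addr; /* V3+ slot: ROM script (newer firmware) */
	int32_t pad;
};

static void add_scripts(const struct script_addrs *fw, int script_number,
			struct script_addrs *out)
{
	out->uart_2_mcu_addr = fw->uart_2_mcu_addr;
	/* Only read the ROM slot if this firmware's script count says it
	 * exists -- old images simply end before the V3 fields. */
	if (script_number >= SCRIPTS_V3 && fw->uart_2_mcu_rom_addr)
		out->uart_2_mcu_addr = fw->uart_2_mcu_rom_addr;
}

int main(void)
{
	struct script_addrs fw = { .uart_2_mcu_addr = 0x1000,
				   .uart_2_mcu_rom_addr = 0x2000 };
	struct script_addrs out = { 0 };

	add_scripts(&fw, SCRIPTS_V1, &out);
	printf("v1 firmware -> uart addr 0x%x\n", (unsigned)out.uart_2_mcu_addr);
	add_scripts(&fw, SCRIPTS_V3, &out);
	printf("v3 firmware -> uart addr 0x%x\n", (unsigned)out.uart_2_mcu_addr);
	return 0;
}
```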
diff --git a/queue-5.15/edac-synopsys-read-the-error-count-from-the-correct-register.patch b/queue-5.15/edac-synopsys-read-the-error-count-from-the-correct-register.patch
new file mode 100644 (file)
index 0000000..7d6c370
--- /dev/null
@@ -0,0 +1,61 @@
+From e2932d1f6f055b2af2114c7e64a26dc1b5593d0c Mon Sep 17 00:00:00 2001
+From: Shubhrajyoti Datta <shubhrajyoti.datta@xilinx.com>
+Date: Thu, 14 Apr 2022 15:58:13 +0530
+Subject: EDAC/synopsys: Read the error count from the correct register
+
+From: Shubhrajyoti Datta <shubhrajyoti.datta@xilinx.com>
+
+commit e2932d1f6f055b2af2114c7e64a26dc1b5593d0c upstream.
+
+Currently, the error count is read wrongly from the status register. Read
+the count from the proper error count register (ERRCNT).
+
+  [ bp: Massage. ]
+
+Fixes: b500b4a029d5 ("EDAC, synopsys: Add ECC support for ZynqMP DDR controller")
+Signed-off-by: Shubhrajyoti Datta <shubhrajyoti.datta@xilinx.com>
+Signed-off-by: Borislav Petkov <bp@suse.de>
+Acked-by: Michal Simek <michal.simek@xilinx.com>
+Cc: <stable@vger.kernel.org>
+Link: https://lore.kernel.org/r/20220414102813.4468-1-shubhrajyoti.datta@xilinx.com
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/edac/synopsys_edac.c |   16 +++++++++++-----
+ 1 file changed, 11 insertions(+), 5 deletions(-)
+
+--- a/drivers/edac/synopsys_edac.c
++++ b/drivers/edac/synopsys_edac.c
+@@ -163,6 +163,11 @@
+ #define ECC_STAT_CECNT_SHIFT          8
+ #define ECC_STAT_BITNUM_MASK          0x7F
++/* ECC error count register definitions */
++#define ECC_ERRCNT_UECNT_MASK         0xFFFF0000
++#define ECC_ERRCNT_UECNT_SHIFT                16
++#define ECC_ERRCNT_CECNT_MASK         0xFFFF
++
+ /* DDR QOS Interrupt register definitions */
+ #define DDR_QOS_IRQ_STAT_OFST         0x20200
+ #define DDR_QOSUE_MASK                        0x4
+@@ -418,15 +423,16 @@ static int zynqmp_get_error_info(struct
+       base = priv->baseaddr;
+       p = &priv->stat;
++      regval = readl(base + ECC_ERRCNT_OFST);
++      p->ce_cnt = regval & ECC_ERRCNT_CECNT_MASK;
++      p->ue_cnt = (regval & ECC_ERRCNT_UECNT_MASK) >> ECC_ERRCNT_UECNT_SHIFT;
++      if (!p->ce_cnt)
++              goto ue_err;
++
+       regval = readl(base + ECC_STAT_OFST);
+       if (!regval)
+               return 1;
+-      p->ce_cnt = (regval & ECC_STAT_CECNT_MASK) >> ECC_STAT_CECNT_SHIFT;
+-      p->ue_cnt = (regval & ECC_STAT_UECNT_MASK) >> ECC_STAT_UECNT_SHIFT;
+-      if (!p->ce_cnt)
+-              goto ue_err;
+-
+       p->ceinfo.bitpos = (regval & ECC_STAT_BITNUM_MASK);
+       regval = readl(base + ECC_CEADDR0_OFST);
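
The corrected read is a plain mask-and-shift decode of the 32-bit ERRCNT
register: correctable count in bits 15:0, uncorrectable count in bits 31:16.
A standalone sketch of the decode using the masks defined in the patch (the
register value here is fabricated for the example):

```
#include <stdio.h>
#include <stdint.h>

#define ECC_ERRCNT_UECNT_MASK  0xFFFF0000u
#define ECC_ERRCNT_UECNT_SHIFT 16
#define ECC_ERRCNT_CECNT_MASK  0xFFFFu

int main(void)
{
	/* Pretend readl(base + ECC_ERRCNT_OFST) returned this value:
	 * 2 uncorrectable errors in bits 31:16, 5 correctable in 15:0. */
	uint32_t regval = (2u << ECC_ERRCNT_UECNT_SHIFT) | 5u;

	uint32_t ce_cnt = regval & ECC_ERRCNT_CECNT_MASK;
	uint32_t ue_cnt = (regval & ECC_ERRCNT_UECNT_MASK) >>
			  ECC_ERRCNT_UECNT_SHIFT;

	printf("ce=%u ue=%u\n", ce_cnt, ue_cnt); /* ce=5 ue=2 */
	return 0;
}
```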
diff --git a/queue-5.15/memcg-sync-flush-only-if-periodic-flush-is-delayed.patch b/queue-5.15/memcg-sync-flush-only-if-periodic-flush-is-delayed.patch
new file mode 100644 (file)
index 0000000..8d87b5b
--- /dev/null
@@ -0,0 +1,130 @@
+From 9b3016154c913b2e7ec5ae5c9a42eb9e732d86aa Mon Sep 17 00:00:00 2001
+From: Shakeel Butt <shakeelb@google.com>
+Date: Thu, 21 Apr 2022 16:35:40 -0700
+Subject: memcg: sync flush only if periodic flush is delayed
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Shakeel Butt <shakeelb@google.com>
+
+commit 9b3016154c913b2e7ec5ae5c9a42eb9e732d86aa upstream.
+
+Daniel Dao has reported [1] a regression on workloads that may trigger a
+lot of refaults (anon and file).  The underlying issue is that flushing
+rstat is expensive.  Although rstat flushes are batched with (nr_cpus *
+MEMCG_BATCH) stat updates, it seems like there are workloads which
+genuinely do stat updates larger than the batch value within a short
+amount of time.  Since the rstat flush can happen in performance
+critical codepaths like page faults, such workloads can suffer greatly.
+
+This patch fixes this regression by making the rstat flushing
+conditional in the performance critical codepaths.  More specifically,
+the kernel relies on the async periodic rstat flusher to flush the stats
+and only if the periodic flusher is delayed by more than twice the
+amount of its normal time window then the kernel allows rstat flushing
+from the performance critical codepaths.
+
+Now the question: what are the side-effects of this change? The worst
+that can happen is the refault codepath will see 4-second-old lruvec stats
+and may cause false (or missed) activations of the refaulted page, which
+may under- or overestimate the workingset size.  Though that is not very
+concerning as the kernel can already miss or do false activations.
+
+There are two more codepaths whose flushing behavior is not changed by
+this patch and we may need to come back to them in the future.  One is the
+writeback stats used by dirty throttling and the second is the deactivation
+heuristic in the reclaim path.  For now we are keeping an eye on them, and
+if there are reports of regressions due to these codepaths, we will
+reevaluate then.
+
+Link: https://lore.kernel.org/all/CA+wXwBSyO87ZX5PVwdHm-=dBjZYECGmfnydUicUyrQqndgX2MQ@mail.gmail.com [1]
+Link: https://lkml.kernel.org/r/20220304184040.1304781-1-shakeelb@google.com
+Fixes: 1f828223b799 ("memcg: flush lruvec stats in the refault")
+Signed-off-by: Shakeel Butt <shakeelb@google.com>
+Reported-by: Daniel Dao <dqminh@cloudflare.com>
+Tested-by: Ivan Babrou <ivan@cloudflare.com>
+Cc: Michal Hocko <mhocko@suse.com>
+Cc: Roman Gushchin <roman.gushchin@linux.dev>
+Cc: Johannes Weiner <hannes@cmpxchg.org>
+Cc: Michal Koutný <mkoutny@suse.com>
+Cc: Frank Hofmann <fhofmann@cloudflare.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/memcontrol.h |    5 +++++
+ mm/memcontrol.c            |   12 +++++++++++-
+ mm/workingset.c            |    2 +-
+ 3 files changed, 17 insertions(+), 2 deletions(-)
+
+--- a/include/linux/memcontrol.h
++++ b/include/linux/memcontrol.h
+@@ -1002,6 +1002,7 @@ static inline unsigned long lruvec_page_
+ }
+ void mem_cgroup_flush_stats(void);
++void mem_cgroup_flush_stats_delayed(void);
+ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx,
+                             int val);
+@@ -1422,6 +1423,10 @@ static inline void mem_cgroup_flush_stat
+ {
+ }
++static inline void mem_cgroup_flush_stats_delayed(void)
++{
++}
++
+ static inline void __mod_memcg_lruvec_state(struct lruvec *lruvec,
+                                           enum node_stat_item idx, int val)
+ {
+--- a/mm/memcontrol.c
++++ b/mm/memcontrol.c
+@@ -650,6 +650,9 @@ static DECLARE_DEFERRABLE_WORK(stats_flu
+ static DEFINE_SPINLOCK(stats_flush_lock);
+ static DEFINE_PER_CPU(unsigned int, stats_updates);
+ static atomic_t stats_flush_threshold = ATOMIC_INIT(0);
++static u64 flush_next_time;
++
++#define FLUSH_TIME (2UL*HZ)
+ static inline void memcg_rstat_updated(struct mem_cgroup *memcg, int val)
+ {
+@@ -671,6 +674,7 @@ static void __mem_cgroup_flush_stats(voi
+       if (!spin_trylock_irqsave(&stats_flush_lock, flag))
+               return;
++      flush_next_time = jiffies_64 + 2*FLUSH_TIME;
+       cgroup_rstat_flush_irqsafe(root_mem_cgroup->css.cgroup);
+       atomic_set(&stats_flush_threshold, 0);
+       spin_unlock_irqrestore(&stats_flush_lock, flag);
+@@ -682,10 +686,16 @@ void mem_cgroup_flush_stats(void)
+               __mem_cgroup_flush_stats();
+ }
++void mem_cgroup_flush_stats_delayed(void)
++{
++      if (time_after64(jiffies_64, flush_next_time))
++              mem_cgroup_flush_stats();
++}
++
+ static void flush_memcg_stats_dwork(struct work_struct *w)
+ {
+       __mem_cgroup_flush_stats();
+-      queue_delayed_work(system_unbound_wq, &stats_flush_dwork, 2UL*HZ);
++      queue_delayed_work(system_unbound_wq, &stats_flush_dwork, FLUSH_TIME);
+ }
+ /**
+--- a/mm/workingset.c
++++ b/mm/workingset.c
+@@ -352,7 +352,7 @@ void workingset_refault(struct page *pag
+       inc_lruvec_state(lruvec, WORKINGSET_REFAULT_BASE + file);
+-      mem_cgroup_flush_stats();
++      mem_cgroup_flush_stats_delayed();
+       /*
+        * Compare the distance to the existing workingset size. We
+        * don't activate pages that couldn't stay resident even if
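
The mechanism is a deadline check: the periodic flusher re-arms a "flush
expected by" timestamp at twice its period, and the hot paths flush
synchronously only once that deadline has been missed. A minimal sketch of
the gating, with wall-clock seconds standing in for jiffies:

```
#include <stdio.h>
#include <time.h>

#define FLUSH_PERIOD 2 /* seconds; the patch uses 2*HZ jiffies */

static time_t flush_next_time;

static void do_flush(void)
{
	/* Re-arm the deadline at twice the period, as in the patch, so
	 * the sync path fires only when the periodic worker is late. */
	flush_next_time = time(NULL) + 2 * FLUSH_PERIOD;
	printf("flushed stats\n");
}

/* Hot-path variant: skip the flush unless the periodic one is overdue. */
static void flush_delayed(void)
{
	if (time(NULL) > flush_next_time)
		do_flush();
	else
		printf("periodic flush on schedule; skipping sync flush\n");
}

int main(void)
{
	do_flush();      /* periodic worker runs, deadline re-armed */
	flush_delayed(); /* within the window: skipped */
	flush_next_time = 0;
	flush_delayed(); /* deadline missed: flushes */
	return 0;
}
```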
diff --git a/queue-5.15/mm-hugetlb-allow-for-high-userspace-addresses.patch b/queue-5.15/mm-hugetlb-allow-for-high-userspace-addresses.patch
new file mode 100644 (file)
index 0000000..d45ae17
--- /dev/null
@@ -0,0 +1,145 @@
+From 5f24d5a579d1eace79d505b148808a850b417d4c Mon Sep 17 00:00:00 2001
+From: Christophe Leroy <christophe.leroy@csgroup.eu>
+Date: Thu, 21 Apr 2022 16:35:46 -0700
+Subject: mm, hugetlb: allow for "high" userspace addresses
+
+From: Christophe Leroy <christophe.leroy@csgroup.eu>
+
+commit 5f24d5a579d1eace79d505b148808a850b417d4c upstream.
+
+This is a fix for commit f6795053dac8 ("mm: mmap: Allow for "high"
+userspace addresses") for hugetlb.
+
+This patch adds support for "high" userspace addresses that are
+optionally supported on the system and have to be requested via a hint
+mechanism ("high" addr parameter to mmap).
+
+Architectures such as powerpc and x86 achieve this by making changes to
+their architectural versions of the hugetlb_get_unmapped_area() function.
+However, arm64 uses the generic version of that function.
+
+So take into account arch_get_mmap_base() and arch_get_mmap_end() in
+hugetlb_get_unmapped_area().  To allow that, move those two macros out
+of mm/mmap.c into include/linux/sched/mm.h.
+
+If these macros are not defined in architectural code then they default
+to (TASK_SIZE) and (base) so should not introduce any behavioural
+changes to architectures that do not define them.
+
+For the time being, only ARM64 is affected by this change.
+
+Catalin (ARM64) said
+ "We should have fixed hugetlb_get_unmapped_area() as well when we added
+  support for 52-bit VA. The reason for commit f6795053dac8 was to
+  prevent normal mmap() from returning addresses above 48-bit by default
+  as some user-space had hard assumptions about this.
+
+  It's a slight ABI change if you do this for hugetlb_get_unmapped_area()
+  but I doubt anyone would notice. It's more likely that the current
+  behaviour would cause issues, so I'd rather have them consistent.
+
+  Basically when arm64 gained support for 52-bit addresses we did not
+  want user-space calling mmap() to suddenly get such high addresses,
+  otherwise we could have inadvertently broken some programs (similar
+  behaviour to x86 here). Hence we added commit f6795053dac8. But we
+  missed hugetlbfs which could still get such high mmap() addresses. So
+  in theory that's a potential regression that should have been addressed
+  at the same time as commit f6795053dac8 (and before arm64 enabled
+  52-bit addresses)"
+
+Link: https://lkml.kernel.org/r/ab847b6edb197bffdfe189e70fb4ac76bfe79e0d.1650033747.git.christophe.leroy@csgroup.eu
+Fixes: f6795053dac8 ("mm: mmap: Allow for "high" userspace addresses")
+Signed-off-by: Christophe Leroy <christophe.leroy@csgroup.eu>
+Reviewed-by: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Steve Capper <steve.capper@arm.com>
+Cc: Will Deacon <will.deacon@arm.com>
+Cc: <stable@vger.kernel.org>   [5.0.x]
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/hugetlbfs/inode.c     |    9 +++++----
+ include/linux/sched/mm.h |    8 ++++++++
+ mm/mmap.c                |    8 --------
+ 3 files changed, 13 insertions(+), 12 deletions(-)
+
+--- a/fs/hugetlbfs/inode.c
++++ b/fs/hugetlbfs/inode.c
+@@ -206,7 +206,7 @@ hugetlb_get_unmapped_area_bottomup(struc
+       info.flags = 0;
+       info.length = len;
+       info.low_limit = current->mm->mmap_base;
+-      info.high_limit = TASK_SIZE;
++      info.high_limit = arch_get_mmap_end(addr);
+       info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+       info.align_offset = 0;
+       return vm_unmapped_area(&info);
+@@ -222,7 +222,7 @@ hugetlb_get_unmapped_area_topdown(struct
+       info.flags = VM_UNMAPPED_AREA_TOPDOWN;
+       info.length = len;
+       info.low_limit = max(PAGE_SIZE, mmap_min_addr);
+-      info.high_limit = current->mm->mmap_base;
++      info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
+       info.align_mask = PAGE_MASK & ~huge_page_mask(h);
+       info.align_offset = 0;
+       addr = vm_unmapped_area(&info);
+@@ -237,7 +237,7 @@ hugetlb_get_unmapped_area_topdown(struct
+               VM_BUG_ON(addr != -ENOMEM);
+               info.flags = 0;
+               info.low_limit = current->mm->mmap_base;
+-              info.high_limit = TASK_SIZE;
++              info.high_limit = arch_get_mmap_end(addr);
+               addr = vm_unmapped_area(&info);
+       }
+@@ -251,6 +251,7 @@ hugetlb_get_unmapped_area(struct file *f
+       struct mm_struct *mm = current->mm;
+       struct vm_area_struct *vma;
+       struct hstate *h = hstate_file(file);
++      const unsigned long mmap_end = arch_get_mmap_end(addr);
+       if (len & ~huge_page_mask(h))
+               return -EINVAL;
+@@ -266,7 +267,7 @@ hugetlb_get_unmapped_area(struct file *f
+       if (addr) {
+               addr = ALIGN(addr, huge_page_size(h));
+               vma = find_vma(mm, addr);
+-              if (TASK_SIZE - len >= addr &&
++              if (mmap_end - len >= addr &&
+                   (!vma || addr + len <= vm_start_gap(vma)))
+                       return addr;
+       }
+--- a/include/linux/sched/mm.h
++++ b/include/linux/sched/mm.h
+@@ -106,6 +106,14 @@ static inline void mm_update_next_owner(
+ #endif /* CONFIG_MEMCG */
+ #ifdef CONFIG_MMU
++#ifndef arch_get_mmap_end
++#define arch_get_mmap_end(addr)       (TASK_SIZE)
++#endif
++
++#ifndef arch_get_mmap_base
++#define arch_get_mmap_base(addr, base) (base)
++#endif
++
+ extern void arch_pick_mmap_layout(struct mm_struct *mm,
+                                 struct rlimit *rlim_stack);
+ extern unsigned long
+--- a/mm/mmap.c
++++ b/mm/mmap.c
+@@ -2113,14 +2113,6 @@ unsigned long vm_unmapped_area(struct vm
+       return addr;
+ }
+-#ifndef arch_get_mmap_end
+-#define arch_get_mmap_end(addr)       (TASK_SIZE)
+-#endif
+-
+-#ifndef arch_get_mmap_base
+-#define arch_get_mmap_base(addr, base) (base)
+-#endif
+-
+ /* Get an address range which is currently unmapped.
+  * For shmat() with addr=0.
+  *
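
The macro move relies on a common kernel idiom: a generic header supplies
#ifndef fallbacks, and an architecture that needs different limits defines
the macros before that point. A toy standalone sketch of the idiom, using a
made-up TASK_SIZE value:

```
#include <stdio.h>

/* An architecture header could define these first; the #ifndef
 * fallbacks mirror the ones moved into linux/sched/mm.h. */
#define TASK_SIZE 0x0000800000000000UL

#ifndef arch_get_mmap_end
#define arch_get_mmap_end(addr)        (TASK_SIZE)
#endif

#ifndef arch_get_mmap_base
#define arch_get_mmap_base(addr, base) (base)
#endif

int main(void)
{
	unsigned long mmap_base = 0x00007f0000000000UL;

	/* No arch override in this build, so the defaults apply. */
	printf("high limit: %#lx\n", arch_get_mmap_end(0));
	printf("base:       %#lx\n", arch_get_mmap_base(0, mmap_base));
	return 0;
}
```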
diff --git a/queue-5.15/mm-memory-failure.c-skip-huge_zero_page-in-memory_failure.patch b/queue-5.15/mm-memory-failure.c-skip-huge_zero_page-in-memory_failure.patch
new file mode 100644 (file)
index 0000000..ff30abb
--- /dev/null
@@ -0,0 +1,95 @@
+From d173d5417fb67411e623d394aab986d847e47dad Mon Sep 17 00:00:00 2001
+From: Xu Yu <xuyu@linux.alibaba.com>
+Date: Thu, 21 Apr 2022 16:35:37 -0700
+Subject: mm/memory-failure.c: skip huge_zero_page in memory_failure()
+
+From: Xu Yu <xuyu@linux.alibaba.com>
+
+commit d173d5417fb67411e623d394aab986d847e47dad upstream.
+
+Kernel panic when injecting memory_failure for the global
+huge_zero_page, when CONFIG_DEBUG_VM is enabled, as follows.
+
+  Injecting memory failure for pfn 0x109ff9 at process virtual address 0x20ff9000
+  page:00000000fb053fc3 refcount:2 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x109e00
+  head:00000000fb053fc3 order:9 compound_mapcount:0 compound_pincount:0
+  flags: 0x17fffc000010001(locked|head|node=0|zone=2|lastcpupid=0x1ffff)
+  raw: 017fffc000010001 0000000000000000 dead000000000122 0000000000000000
+  raw: 0000000000000000 0000000000000000 00000002ffffffff 0000000000000000
+  page dumped because: VM_BUG_ON_PAGE(is_huge_zero_page(head))
+  ------------[ cut here ]------------
+  kernel BUG at mm/huge_memory.c:2499!
+  invalid opcode: 0000 [#1] PREEMPT SMP PTI
+  CPU: 6 PID: 553 Comm: split_bug Not tainted 5.18.0-rc1+ #11
+  Hardware name: Alibaba Cloud Alibaba Cloud ECS, BIOS 3288b3c 04/01/2014
+  RIP: 0010:split_huge_page_to_list+0x66a/0x880
+  Code: 84 9b fb ff ff 48 8b 7c 24 08 31 f6 e8 9f 5d 2a 00 b8 b8 02 00 00 e9 e8 fb ff ff 48 c7 c6 e8 47 3c 82 4c b
+  RSP: 0018:ffffc90000dcbdf8 EFLAGS: 00010246
+  RAX: 000000000000003c RBX: 0000000000000001 RCX: 0000000000000000
+  RDX: 0000000000000000 RSI: ffffffff823e4c4f RDI: 00000000ffffffff
+  RBP: ffff88843fffdb40 R08: 0000000000000000 R09: 00000000fffeffff
+  R10: ffffc90000dcbc48 R11: ffffffff82d68448 R12: ffffea0004278000
+  R13: ffffffff823c6203 R14: 0000000000109ff9 R15: ffffea000427fe40
+  FS:  00007fc375a26740(0000) GS:ffff88842fd80000(0000) knlGS:0000000000000000
+  CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+  CR2: 00007fc3757c9290 CR3: 0000000102174006 CR4: 00000000003706e0
+  DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+  DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+  Call Trace:
+   try_to_split_thp_page+0x3a/0x130
+   memory_failure+0x128/0x800
+   madvise_inject_error.cold+0x8b/0xa1
+   __x64_sys_madvise+0x54/0x60
+   do_syscall_64+0x35/0x80
+   entry_SYSCALL_64_after_hwframe+0x44/0xae
+  RIP: 0033:0x7fc3754f8bf9
+  Code: 01 00 48 81 c4 80 00 00 00 e9 f1 fe ff ff 0f 1f 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 8
+  RSP: 002b:00007ffeda93a1d8 EFLAGS: 00000217 ORIG_RAX: 000000000000001c
+  RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007fc3754f8bf9
+  RDX: 0000000000000064 RSI: 0000000000003000 RDI: 0000000020ff9000
+  RBP: 00007ffeda93a200 R08: 0000000000000000 R09: 0000000000000000
+  R10: 00000000ffffffff R11: 0000000000000217 R12: 0000000000400490
+  R13: 00007ffeda93a2e0 R14: 0000000000000000 R15: 0000000000000000
+
+This makes memory_failure() bail out explicitly on huge_zero_page
+before the split, so the panic above won't happen again.
+
+Link: https://lkml.kernel.org/r/497d3835612610e370c74e697ea3c721d1d55b9c.1649775850.git.xuyu@linux.alibaba.com
+Fixes: 6a46079cf57a ("HWPOISON: The high level memory error handler in the VM v7")
+Signed-off-by: Xu Yu <xuyu@linux.alibaba.com>
+Reported-by: Abaci <abaci@linux.alibaba.com>
+Suggested-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
+Acked-by: Naoya Horiguchi <naoya.horiguchi@nec.com>
+Reviewed-by: Miaohe Lin <linmiaohe@huawei.com>
+Cc: Anshuman Khandual <anshuman.khandual@arm.com>
+Cc: Oscar Salvador <osalvador@suse.de>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/memory-failure.c |   13 +++++++++++++
+ 1 file changed, 13 insertions(+)
+
+--- a/mm/memory-failure.c
++++ b/mm/memory-failure.c
+@@ -1690,6 +1690,19 @@ try_again:
+       if (PageTransHuge(hpage)) {
+               /*
++               * Bail out before SetPageHasHWPoisoned() if hpage is
++               * huge_zero_page, although PG_has_hwpoisoned is not
++               * checked in set_huge_zero_page().
++               *
++               * TODO: Handle memory failure of huge_zero_page thoroughly.
++               */
++              if (is_huge_zero_page(hpage)) {
++                      action_result(pfn, MF_MSG_UNSPLIT_THP, MF_IGNORED);
++                      res = -EBUSY;
++                      goto unlock_mutex;
++              }
++
++              /*
+                * The flag must be set after the refcount is bumped
+                * otherwise it may race with THP split.
+                * And the flag can't be set in get_hwpoison_page() since
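
The shape of the fix is a sentinel check before a destructive operation: the
global huge_zero_page is shared system-wide and must never be split, so
memory_failure() now detects it up front and reports MF_IGNORED/-EBUSY
instead of hitting the VM_BUG_ON in the split path. A toy sketch of that
guard; the types and helpers are illustrative, not the kernel's:

```
#include <stdio.h>

struct page { int is_huge_zero; };

/* One shared zero huge page for the whole system, as in the kernel. */
static struct page huge_zero_page = { .is_huge_zero = 1 };

static int try_to_split(struct page *p)
{
	/* The kernel's split path BUG()s on huge_zero_page; the guard
	 * below must keep us from ever reaching it. */
	return p->is_huge_zero ? -1 : 0;
}

static int handle_failure(struct page *p)
{
	if (p->is_huge_zero) {
		/* Bail out before the split, mirroring the patch. */
		printf("huge zero page: ignored, -EBUSY\n");
		return -16; /* EBUSY */
	}
	return try_to_split(p);
}

int main(void)
{
	handle_failure(&huge_zero_page); /* handled gracefully, no panic */
	return 0;
}
```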
diff --git a/queue-5.15/mm-mmu_notifier.c-fix-race-in-mmu_interval_notifier_remove.patch b/queue-5.15/mm-mmu_notifier.c-fix-race-in-mmu_interval_notifier_remove.patch
new file mode 100644 (file)
index 0000000..8a79293
--- /dev/null
@@ -0,0 +1,84 @@
+From 319561669a59d8e9206ab311ae5433ef92fd79d1 Mon Sep 17 00:00:00 2001
+From: Alistair Popple <apopple@nvidia.com>
+Date: Thu, 21 Apr 2022 16:36:10 -0700
+Subject: mm/mmu_notifier.c: fix race in mmu_interval_notifier_remove()
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Alistair Popple <apopple@nvidia.com>
+
+commit 319561669a59d8e9206ab311ae5433ef92fd79d1 upstream.
+
+In some cases it is possible for mmu_interval_notifier_remove() to race
+with mn_tree_inv_end() allowing it to return while the notifier data
+structure is still in use.  Consider the following sequence:
+
+  CPU0 - mn_tree_inv_end()            CPU1 - mmu_interval_notifier_remove()
+  ----------------------------------- ------------------------------------
+                                      spin_lock(subscriptions->lock);
+                                      seq = subscriptions->invalidate_seq;
+  spin_lock(subscriptions->lock);     spin_unlock(subscriptions->lock);
+  subscriptions->invalidate_seq++;
+                                      wait_event(invalidate_seq != seq);
+                                      return;
+  interval_tree_remove(interval_sub); kfree(interval_sub);
+  spin_unlock(subscriptions->lock);
+  wake_up_all();
+
+As the wait_event() condition is true it will return immediately.  This
+can lead to use-after-free type errors if the caller frees the data
+structure containing the interval notifier subscription while it is
+still on a deferred list.  Fix this by taking the appropriate lock when
+reading invalidate_seq to ensure proper synchronisation.
+
+I observed this whilst running stress testing during some development.
+You do have to be pretty unlucky, but it leads to the usual problems of
+use-after-free (memory corruption, kernel crash, difficult to diagnose
+WARN_ON, etc).
+
+Link: https://lkml.kernel.org/r/20220420043734.476348-1-apopple@nvidia.com
+Fixes: 99cb252f5e68 ("mm/mmu_notifier: add an interval tree notifier")
+Signed-off-by: Alistair Popple <apopple@nvidia.com>
+Signed-off-by: Jason Gunthorpe <jgg@nvidia.com>
+Cc: Christian König <christian.koenig@amd.com>
+Cc: John Hubbard <jhubbard@nvidia.com>
+Cc: Ralph Campbell <rcampbell@nvidia.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/mmu_notifier.c |   14 +++++++++++++-
+ 1 file changed, 13 insertions(+), 1 deletion(-)
+
+--- a/mm/mmu_notifier.c
++++ b/mm/mmu_notifier.c
+@@ -1036,6 +1036,18 @@ int mmu_interval_notifier_insert_locked(
+ }
+ EXPORT_SYMBOL_GPL(mmu_interval_notifier_insert_locked);
++static bool
++mmu_interval_seq_released(struct mmu_notifier_subscriptions *subscriptions,
++                        unsigned long seq)
++{
++      bool ret;
++
++      spin_lock(&subscriptions->lock);
++      ret = subscriptions->invalidate_seq != seq;
++      spin_unlock(&subscriptions->lock);
++      return ret;
++}
++
+ /**
+  * mmu_interval_notifier_remove - Remove a interval notifier
+  * @interval_sub: Interval subscription to unregister
+@@ -1083,7 +1095,7 @@ void mmu_interval_notifier_remove(struct
+       lock_map_release(&__mmu_notifier_invalidate_range_start_map);
+       if (seq)
+               wait_event(subscriptions->wq,
+-                         READ_ONCE(subscriptions->invalidate_seq) != seq);
++                         mmu_interval_seq_released(subscriptions, seq));
+       /* pairs with mmgrab in mmu_interval_notifier_insert() */
+       mmdrop(mm);
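
The fix generalizes to any wait_event() whose predicate reads state that
another CPU updates under a lock: the predicate must take the same lock, or
it can observe the new sequence before the updater has finished and return
while the structure is still in use. A userspace pthreads sketch of the
locked-predicate shape (a condition variable stands in for wait_event() and
wake_up_all()):

```
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  wq   = PTHREAD_COND_INITIALIZER;
static unsigned long invalidate_seq;

/* Updater side, as in mn_tree_inv_end(): bump the sequence and wake
 * waiters while holding the same lock the waiters read under. */
static void *invalidator(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	invalidate_seq++;
	pthread_cond_broadcast(&wq); /* wake_up_all() analog */
	pthread_mutex_unlock(&lock);
	return NULL;
}

int main(void)
{
	unsigned long seq = invalidate_seq;
	pthread_t t;

	pthread_create(&t, NULL, invalidator, NULL);

	/* Waiter side: the predicate is evaluated under the lock, which
	 * is what mmu_interval_seq_released() gives wait_event() in the
	 * patch. A plain unlocked read could win the race and let the
	 * caller free the subscription while it is still in use. */
	pthread_mutex_lock(&lock);
	while (invalidate_seq == seq)
		pthread_cond_wait(&wq, &lock);
	pthread_mutex_unlock(&lock);

	pthread_join(t, NULL);
	printf("sequence advanced; safe to free the subscription\n");
	return 0;
}
```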
diff --git a/queue-5.15/net-atlantic-invert-deep-par-in-pm-functions-preventing-null-derefs.patch b/queue-5.15/net-atlantic-invert-deep-par-in-pm-functions-preventing-null-derefs.patch
new file mode 100644 (file)
index 0000000..369826a
--- /dev/null
@@ -0,0 +1,95 @@
+From cbe6c3a8f8f4315b96e46e1a1c70393c06d95a4c Mon Sep 17 00:00:00 2001
+From: Manuel Ullmann <labre@posteo.de>
+Date: Mon, 18 Apr 2022 00:20:01 +0200
+Subject: net: atlantic: invert deep par in pm functions, preventing null derefs
+
+From: Manuel Ullmann <labre@posteo.de>
+
+commit cbe6c3a8f8f4315b96e46e1a1c70393c06d95a4c upstream.
+
+This will reset deeply on freeze and thaw instead of suspend and
+resume, preventing null pointer dereferences of the uninitialized ring
+0 buffer while thawing.
+
+The impact is an indefinitely hanging kernel. You can't switch
+consoles after this and the only possible user interaction is SysRq.
+
+BUG: kernel NULL pointer dereference
+RIP: 0010:aq_ring_rx_fill+0xcf/0x210 [atlantic]
+aq_vec_init+0x85/0xe0 [atlantic]
+aq_nic_init+0xf7/0x1d0 [atlantic]
+atl_resume_common+0x4f/0x100 [atlantic]
+pci_pm_thaw+0x42/0xa0
+
+resolves in aq_ring.o to
+
+```
+0000000000000ae0 <aq_ring_rx_fill>:
+{
+/* ... */
+ baf:  48 8b 43 08             mov    0x8(%rbx),%rax
+               buff->flags = 0U; /* buff is NULL */
+```
+
+The bug has been present since the introduction of the new pm code in
+8aaa112a57c1 ("net: atlantic: refactoring pm logic") and was hidden
+until 8ce84271697a ("net: atlantic: changes for multi-TC support"),
+which refactored the aq_vec_{free,alloc} functions into
+aq_vec_{,ring}_{free,alloc}, but is technically not wrong. The
+original functions just always reinitialized the buffers on S3/S4. If
+the interface is down before freezing, the bug does not occur. It does
+not matter, whether the initrd contains and loads the module before
+thawing.
+
+So the fix is to invert the boolean parameter deep in all pm function
+calls, which was clearly intended to be set like that.
+
+The first report was on GitHub [1], which you have to guess from the
+resume logs in the posted dmesg snippet. Recently I posted one on
+Bugzilla [2], since I did not have an AQC device until now.
+
+#regzbot introduced: 8ce84271697a
+#regzbot from: koo5 <kolman.jindrich@gmail.com>
+#regzbot monitor: https://github.com/Aquantia/AQtion/issues/32
+
+Fixes: 8aaa112a57c1 ("net: atlantic: refactoring pm logic")
+Link: https://github.com/Aquantia/AQtion/issues/32 [1]
+Link: https://bugzilla.kernel.org/show_bug.cgi?id=215798 [2]
+Cc: stable@vger.kernel.org
+Reported-by: koo5 <kolman.jindrich@gmail.com>
+Signed-off-by: Manuel Ullmann <labre@posteo.de>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c |    8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+--- a/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c
++++ b/drivers/net/ethernet/aquantia/atlantic/aq_pci_func.c
+@@ -444,22 +444,22 @@ err_exit:
+ static int aq_pm_freeze(struct device *dev)
+ {
+-      return aq_suspend_common(dev, false);
++      return aq_suspend_common(dev, true);
+ }
+ static int aq_pm_suspend_poweroff(struct device *dev)
+ {
+-      return aq_suspend_common(dev, true);
++      return aq_suspend_common(dev, false);
+ }
+ static int aq_pm_thaw(struct device *dev)
+ {
+-      return atl_resume_common(dev, false);
++      return atl_resume_common(dev, true);
+ }
+ static int aq_pm_resume_restore(struct device *dev)
+ {
+-      return atl_resume_common(dev, true);
++      return atl_resume_common(dev, false);
+ }
+ static const struct dev_pm_ops aq_pm_ops = {
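
The whole fix is the truth table of the 'deep' flag across the four
dev_pm_ops callbacks: hibernation freeze/thaw take the deep path so ring 0
is rebuilt before any RX fill touches it, while suspend/resume do not. A
compact sketch of the corrected mapping; suspend_common()/resume_common()
are stand-ins for the driver's helpers:

```
#include <stdbool.h>
#include <stdio.h>

/* Stand-ins for aq_suspend_common()/atl_resume_common(); 'deep'
 * controls whether the rings are torn down and later reinitialized. */
static int suspend_common(bool deep)
{
	printf("suspend deep=%d\n", deep);
	return 0;
}

static int resume_common(bool deep)
{
	printf("resume  deep=%d\n", deep);
	return 0;
}

/* Corrected mapping from the patch: freeze/thaw are the deep pair. */
static int pm_freeze(void)           { return suspend_common(true);  }
static int pm_suspend_poweroff(void) { return suspend_common(false); }
static int pm_thaw(void)             { return resume_common(true);   }
static int pm_resume_restore(void)   { return resume_common(false);  }

int main(void)
{
	pm_freeze();
	pm_thaw();
	pm_suspend_poweroff();
	pm_resume_restore();
	return 0;
}
```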
diff --git a/queue-5.15/oom_kill.c-futex-delay-the-oom-reaper-to-allow-time-for-proper-futex-cleanup.patch b/queue-5.15/oom_kill.c-futex-delay-the-oom-reaper-to-allow-time-for-proper-futex-cleanup.patch
new file mode 100644 (file)
index 0000000..b2ab4be
--- /dev/null
@@ -0,0 +1,206 @@
+From e4a38402c36e42df28eb1a5394be87e6571fb48a Mon Sep 17 00:00:00 2001
+From: Nico Pache <npache@redhat.com>
+Date: Thu, 21 Apr 2022 16:36:01 -0700
+Subject: oom_kill.c: futex: delay the OOM reaper to allow time for proper futex cleanup
+
+From: Nico Pache <npache@redhat.com>
+
+commit e4a38402c36e42df28eb1a5394be87e6571fb48a upstream.
+
+The pthread struct is allocated on PRIVATE|ANONYMOUS memory [1] which
+can be targeted by the oom reaper.  This mapping is used to store the
+futex robust list head; the kernel does not keep a copy of the robust
+list and instead references a userspace address to maintain the
+robustness during a process death.
+
+A race can occur between exit_mm and the oom reaper that allows the oom
+reaper to free the memory of the futex robust list before the exit path
+has handled the futex death:
+
+    CPU1                               CPU2
+    --------------------------------------------------------------------
+    page_fault
+    do_exit "signal"
+    wake_oom_reaper
+                                        oom_reaper
+                                        oom_reap_task_mm (invalidates mm)
+    exit_mm
+    exit_mm_release
+    futex_exit_release
+    futex_cleanup
+    exit_robust_list
+    get_user (EFAULT- can't access memory)
+
+If get_user() fails with EFAULT, the kernel will be unable to recover
+the waiters on the robust_list, leaving userspace mutexes hung
+indefinitely.
+
+Delay the OOM reaper, allowing more time for the exit path to perform
+the futex cleanup.
+
+Reproducer: https://gitlab.com/jsavitz/oom_futex_reproducer
+
+Based on a patch by Michal Hocko.
+
+Link: https://elixir.bootlin.com/glibc/glibc-2.35/source/nptl/allocatestack.c#L370 [1]
+Link: https://lkml.kernel.org/r/20220414144042.677008-1-npache@redhat.com
+Fixes: 212925802454 ("mm: oom: let oom_reap_task and exit_mmap run concurrently")
+Signed-off-by: Joel Savitz <jsavitz@redhat.com>
+Signed-off-by: Nico Pache <npache@redhat.com>
+Co-developed-by: Joel Savitz <jsavitz@redhat.com>
+Suggested-by: Thomas Gleixner <tglx@linutronix.de>
+Acked-by: Thomas Gleixner <tglx@linutronix.de>
+Acked-by: Michal Hocko <mhocko@suse.com>
+Cc: Rafael Aquini <aquini@redhat.com>
+Cc: Waiman Long <longman@redhat.com>
+Cc: Herton R. Krzesinski <herton@redhat.com>
+Cc: Juri Lelli <juri.lelli@redhat.com>
+Cc: Vincent Guittot <vincent.guittot@linaro.org>
+Cc: Dietmar Eggemann <dietmar.eggemann@arm.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Ben Segall <bsegall@google.com>
+Cc: Mel Gorman <mgorman@suse.de>
+Cc: Daniel Bristot de Oliveira <bristot@redhat.com>
+Cc: David Rientjes <rientjes@google.com>
+Cc: Andrea Arcangeli <aarcange@redhat.com>
+Cc: Davidlohr Bueso <dave@stgolabs.net>
+Cc: Peter Zijlstra <peterz@infradead.org>
+Cc: Ingo Molnar <mingo@redhat.com>
+Cc: Joel Savitz <jsavitz@redhat.com>
+Cc: Darren Hart <dvhart@infradead.org>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/linux/sched.h |    1 
+ mm/oom_kill.c         |   54 +++++++++++++++++++++++++++++++++++++-------------
+ 2 files changed, 41 insertions(+), 14 deletions(-)
+
+--- a/include/linux/sched.h
++++ b/include/linux/sched.h
+@@ -1436,6 +1436,7 @@ struct task_struct {
+       int                             pagefault_disabled;
+ #ifdef CONFIG_MMU
+       struct task_struct              *oom_reaper_list;
++      struct timer_list               oom_reaper_timer;
+ #endif
+ #ifdef CONFIG_VMAP_STACK
+       struct vm_struct                *stack_vm_area;
+--- a/mm/oom_kill.c
++++ b/mm/oom_kill.c
+@@ -635,7 +635,7 @@ done:
+        */
+       set_bit(MMF_OOM_SKIP, &mm->flags);
+-      /* Drop a reference taken by wake_oom_reaper */
++      /* Drop a reference taken by queue_oom_reaper */
+       put_task_struct(tsk);
+ }
+@@ -645,12 +645,12 @@ static int oom_reaper(void *unused)
+               struct task_struct *tsk = NULL;
+               wait_event_freezable(oom_reaper_wait, oom_reaper_list != NULL);
+-              spin_lock(&oom_reaper_lock);
++              spin_lock_irq(&oom_reaper_lock);
+               if (oom_reaper_list != NULL) {
+                       tsk = oom_reaper_list;
+                       oom_reaper_list = tsk->oom_reaper_list;
+               }
+-              spin_unlock(&oom_reaper_lock);
++              spin_unlock_irq(&oom_reaper_lock);
+               if (tsk)
+                       oom_reap_task(tsk);
+@@ -659,22 +659,48 @@ static int oom_reaper(void *unused)
+       return 0;
+ }
+-static void wake_oom_reaper(struct task_struct *tsk)
++static void wake_oom_reaper(struct timer_list *timer)
+ {
+-      /* mm is already queued? */
+-      if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
++      struct task_struct *tsk = container_of(timer, struct task_struct,
++                      oom_reaper_timer);
++      struct mm_struct *mm = tsk->signal->oom_mm;
++      unsigned long flags;
++
++      /* The victim managed to terminate on its own - see exit_mmap */
++      if (test_bit(MMF_OOM_SKIP, &mm->flags)) {
++              put_task_struct(tsk);
+               return;
++      }
+-      get_task_struct(tsk);
+-
+-      spin_lock(&oom_reaper_lock);
++      spin_lock_irqsave(&oom_reaper_lock, flags);
+       tsk->oom_reaper_list = oom_reaper_list;
+       oom_reaper_list = tsk;
+-      spin_unlock(&oom_reaper_lock);
++      spin_unlock_irqrestore(&oom_reaper_lock, flags);
+       trace_wake_reaper(tsk->pid);
+       wake_up(&oom_reaper_wait);
+ }
++/*
++ * Give the OOM victim time to exit naturally before invoking the oom_reaping.
++ * The timers timeout is arbitrary... the longer it is, the longer the worst
++ * case scenario for the OOM can take. If it is too small, the oom_reaper can
++ * get in the way and release resources needed by the process exit path.
++ * e.g. The futex robust list can sit in Anon|Private memory that gets reaped
++ * before the exit path is able to wake the futex waiters.
++ */
++#define OOM_REAPER_DELAY (2*HZ)
++static void queue_oom_reaper(struct task_struct *tsk)
++{
++      /* mm is already queued? */
++      if (test_and_set_bit(MMF_OOM_REAP_QUEUED, &tsk->signal->oom_mm->flags))
++              return;
++
++      get_task_struct(tsk);
++      timer_setup(&tsk->oom_reaper_timer, wake_oom_reaper, 0);
++      tsk->oom_reaper_timer.expires = jiffies + OOM_REAPER_DELAY;
++      add_timer(&tsk->oom_reaper_timer);
++}
++
+ static int __init oom_init(void)
+ {
+       oom_reaper_th = kthread_run(oom_reaper, NULL, "oom_reaper");
+@@ -682,7 +708,7 @@ static int __init oom_init(void)
+ }
+ subsys_initcall(oom_init)
+ #else
+-static inline void wake_oom_reaper(struct task_struct *tsk)
++static inline void queue_oom_reaper(struct task_struct *tsk)
+ {
+ }
+ #endif /* CONFIG_MMU */
+@@ -933,7 +959,7 @@ static void __oom_kill_process(struct ta
+       rcu_read_unlock();
+       if (can_oom_reap)
+-              wake_oom_reaper(victim);
++              queue_oom_reaper(victim);
+       mmdrop(mm);
+       put_task_struct(victim);
+@@ -969,7 +995,7 @@ static void oom_kill_process(struct oom_
+       task_lock(victim);
+       if (task_will_free_mem(victim)) {
+               mark_oom_victim(victim);
+-              wake_oom_reaper(victim);
++              queue_oom_reaper(victim);
+               task_unlock(victim);
+               put_task_struct(victim);
+               return;
+@@ -1067,7 +1093,7 @@ bool out_of_memory(struct oom_control *o
+        */
+       if (task_will_free_mem(current)) {
+               mark_oom_victim(current);
+-              wake_oom_reaper(current);
++              queue_oom_reaper(current);
+               return true;
+       }
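
Two ingredients make the delay work: a timer_list embedded in the task, and
container_of() in the callback to recover the owning task from the timer
pointer. A userspace sketch of that pattern with a simplified timer type
(the real kernel uses timer_setup()/add_timer() and fires after
OOM_REAPER_DELAY = 2*HZ):

```
#include <stddef.h>
#include <stdio.h>

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

struct timer_list {
	void (*fn)(struct timer_list *);
	unsigned long expires;
};

struct task {
	int pid;
	struct timer_list oom_reaper_timer; /* embedded, as in task_struct */
};

/* Timer callback recovers the owning task from the embedded timer --
 * exactly the container_of() step in the patched wake_oom_reaper(). */
static void wake_reaper(struct timer_list *timer)
{
	struct task *tsk = container_of(timer, struct task,
					oom_reaper_timer);
	printf("reaping pid %d after the grace period\n", tsk->pid);
}

int main(void)
{
	struct task victim = { .pid = 1234 };

	/* queue_oom_reaper() analog: arm the timer instead of waking
	 * the reaper immediately, giving the exit path time to run. */
	victim.oom_reaper_timer.fn = wake_reaper;
	victim.oom_reaper_timer.expires = 2; /* stands in for 2*HZ */

	/* A timer subsystem would fire this after 'expires'; for the
	 * sketch, invoke the callback directly. */
	victim.oom_reaper_timer.fn(&victim.oom_reaper_timer);
	return 0;
}
```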
diff --git a/queue-5.15/series b/queue-5.15/series
index f86bbf7bb68286c5076f886626a9ac6d1607b453..bc76116261360a08ccf8e19986a11dd467c50f9b 100644 (file)
--- a/queue-5.15/series
@@ -74,3 +74,13 @@ vfs-filename_create-fix-incorrect-intent.patch
 nvme-add-a-quirk-to-disable-namespace-identifiers.patch
 nvme-pci-disable-namespace-identifiers-for-the-maxio.patch
 nvme-pci-disable-namespace-identifiers-for-qemu-cont.patch
+edac-synopsys-read-the-error-count-from-the-correct-register.patch
+mm-memory-failure.c-skip-huge_zero_page-in-memory_failure.patch
+memcg-sync-flush-only-if-periodic-flush-is-delayed.patch
+mm-hugetlb-allow-for-high-userspace-addresses.patch
+oom_kill.c-futex-delay-the-oom-reaper-to-allow-time-for-proper-futex-cleanup.patch
+mm-mmu_notifier.c-fix-race-in-mmu_interval_notifier_remove.patch
+ata-pata_marvell-check-the-bmdma_addr-beforing-reading.patch
+dma-at_xdmac-fix-a-missing-check-on-list-iterator.patch
+dmaengine-imx-sdma-fix-init-of-uart-scripts.patch
+net-atlantic-invert-deep-par-in-pm-functions-preventing-null-derefs.patch