--- /dev/null
+From 6321f5fb70d502d95de8a212a7b484c297ec9644 Mon Sep 17 00:00:00 2001
+From: Joshua Washington <joshwash@google.com>
+Date: Wed, 18 Dec 2024 05:34:11 -0800
+Subject: gve: clean XDP queues in gve_tx_stop_ring_gqi
+
+From: Joshua Washington <joshwash@google.com>
+
+commit 6321f5fb70d502d95de8a212a7b484c297ec9644 upstream.
+
+When stopping XDP TX rings, the XDP clean function needs to be called to
+clean out the entire queue, similar to what happens in the normal TX
+queue case. Otherwise, the FIFO won't be cleared correctly, and
+xsk_tx_completed won't be reported.
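+
+Schematically, the XDP clean path must both free QPL FIFO space and
+report AF_XDP completions, neither of which gve_clean_tx_done() does for
+XSK descriptors. A rough sketch (loop internals elided, variable names
+illustrative):
+
+	for (j = 0; j < to_do; j++) {
+		idx = tx->done++ & tx->mask;
+		/* ... unmap/clear tx->info[idx], tallying space_freed
+		 * and xsk_complete as descriptors are retired ...
+		 */
+	}
+	gve_tx_free_fifo(&tx->tx_fifo, space_freed);
+	if (xsk_complete > 0 && tx->xsk_pool)
+		xsk_tx_completed(tx->xsk_pool, xsk_complete);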
+
+Fixes: 75eaae158b1b ("gve: Add XDP DROP and TX support for GQI-QPL format")
+Cc: stable@vger.kernel.org
+Signed-off-by: Joshua Washington <joshwash@google.com>
+Signed-off-by: Praveen Kaligineedi <pkaligineedi@google.com>
+Reviewed-by: Praveen Kaligineedi <pkaligineedi@google.com>
+Reviewed-by: Willem de Bruijn <willemb@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/google/gve/gve_tx.c | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+--- a/drivers/net/ethernet/google/gve/gve_tx.c
++++ b/drivers/net/ethernet/google/gve/gve_tx.c
+@@ -206,7 +206,10 @@ void gve_tx_stop_ring_gqi(struct gve_pri
+ return;
+
+ gve_remove_napi(priv, ntfy_idx);
+- gve_clean_tx_done(priv, tx, priv->tx_desc_cnt, false);
++ if (tx->q_num < priv->tx_cfg.num_queues)
++ gve_clean_tx_done(priv, tx, priv->tx_desc_cnt, false);
++ else
++ gve_clean_xdp_done(priv, tx, priv->tx_desc_cnt);
+ netdev_tx_reset_queue(tx->netdev_txq);
+ gve_tx_remove_from_block(priv, idx);
+ }
--- /dev/null
+From de63ac44a527b2c5067551dbd70d939fe151325a Mon Sep 17 00:00:00 2001
+From: Joshua Washington <joshwash@google.com>
+Date: Wed, 18 Dec 2024 05:34:15 -0800
+Subject: gve: fix XDP allocation path in edge cases
+
+From: Joshua Washington <joshwash@google.com>
+
+commit de63ac44a527b2c5067551dbd70d939fe151325a upstream.
+
+This patch fixes a number of consistency issues in the queue allocation
+path related to XDP.
+
+As it stands, the number of allocated XDP queues changes in three
+different scenarios.
+1) Adding an XDP program while the interface is up via
+ gve_add_xdp_queues
+2) Removing an XDP program while the interface is up via
+ gve_remove_xdp_queues
+3) After queues have been allocated and the old queue memory has been
+ removed in gve_queues_start.
+
+However, gve_(add|remove)_xdp_queues can only be called while the
+interface is up, and the number of queues stored in priv isn't updated
+until _after_ XDP queues have been allocated in the normal queue
+allocation path. Together, these mean that if an XDP program is added
+while the interface is down, XDP queues won't be added until the
+_second_ ifup, not the first.
+
+Given the expectation that the number of XDP queues is equal to the
+number of RX queues, scenario (3) has another problematic implication.
+When changing the number of queues while an XDP program is loaded, the
+number of XDP queues must be updated as well, as there is logic in the
+driver (gve_xdp_tx_queue_id()) which relies on every RX queue having a
+corresponding XDP TX queue. However, the number of XDP queues stored in
+priv would not be updated until _after_ a close/open, leading to a
+mismatch between the number of XDP queues reported and the number of XDP
+queues which actually exist after the queue count update completes.
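+
+For reference, that logic boils down to an index mapping along these
+lines (a sketch assuming, as in this driver, that dedicated XDP TX
+queues are laid out after the regular TX queues):
+
+	/* RX queue i transmits on XDP TX queue i, which lives right
+	 * after the priv->tx_cfg.num_queues regular TX queues, so every
+	 * RX queue needs a counterpart for the returned id to be valid.
+	 */
+	static inline u32 gve_xdp_tx_queue_id(struct gve_priv *priv,
+					      u32 queue_id)
+	{
+		return priv->tx_cfg.num_queues + queue_id;
+	}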
+
+This patch remedies these issues by doing the following:
+1) The allocation config getter function is set up to retrieve the
+ _expected_ number of XDP queues to allocate instead of relying
+ on the value stored in `priv` which is only updated once the queues
+ have been allocated.
+2) When adjusting queues, XDP queues are adjusted to match the number of
+ RX queues when XDP is enabled. This only works in the case when
+ queues are live, so part (1) of the fix must still be available in
+ the case that queues are adjusted when there is an XDP program and
+ the interface is down.
+
+Fixes: 5f08cd3d6423 ("gve: Alloc before freeing when adjusting queues")
+Cc: stable@vger.kernel.org
+Signed-off-by: Joshua Washington <joshwash@google.com>
+Signed-off-by: Praveen Kaligineedi <pkaligineedi@google.com>
+Reviewed-by: Praveen Kaligineedi <pkaligineedi@google.com>
+Reviewed-by: Shailend Chand <shailend@google.com>
+Reviewed-by: Willem de Bruijn <willemb@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/google/gve/gve_main.c | 9 ++++++++-
+ 1 file changed, 8 insertions(+), 1 deletion(-)
+
+--- a/drivers/net/ethernet/google/gve/gve_main.c
++++ b/drivers/net/ethernet/google/gve/gve_main.c
+@@ -930,11 +930,13 @@ static void gve_init_sync_stats(struct g
+ static void gve_tx_get_curr_alloc_cfg(struct gve_priv *priv,
+ struct gve_tx_alloc_rings_cfg *cfg)
+ {
++ int num_xdp_queues = priv->xdp_prog ? priv->rx_cfg.num_queues : 0;
++
+ cfg->qcfg = &priv->tx_cfg;
+ cfg->raw_addressing = !gve_is_qpl(priv);
+ cfg->ring_size = priv->tx_desc_cnt;
+ cfg->start_idx = 0;
+- cfg->num_rings = gve_num_tx_queues(priv);
++ cfg->num_rings = priv->tx_cfg.num_queues + num_xdp_queues;
+ cfg->tx = priv->tx;
+ }
+
+@@ -1843,6 +1845,7 @@ int gve_adjust_queues(struct gve_priv *p
+ {
+ struct gve_tx_alloc_rings_cfg tx_alloc_cfg = {0};
+ struct gve_rx_alloc_rings_cfg rx_alloc_cfg = {0};
++ int num_xdp_queues;
+ int err;
+
+ gve_get_curr_alloc_cfgs(priv, &tx_alloc_cfg, &rx_alloc_cfg);
+@@ -1853,6 +1856,10 @@ int gve_adjust_queues(struct gve_priv *p
+ rx_alloc_cfg.qcfg = &new_rx_config;
+ tx_alloc_cfg.num_rings = new_tx_config.num_queues;
+
++ /* Add dedicated XDP TX queues if enabled. */
++ num_xdp_queues = priv->xdp_prog ? new_rx_config.num_queues : 0;
++ tx_alloc_cfg.num_rings += num_xdp_queues;
++
+ if (netif_running(priv->dev)) {
+ err = gve_adjust_config(priv, &tx_alloc_cfg, &rx_alloc_cfg);
+ return err;
--- /dev/null
+From ff7c2dea9dd1a436fc79d6273adffdcc4a7ffea3 Mon Sep 17 00:00:00 2001
+From: Joshua Washington <joshwash@google.com>
+Date: Wed, 18 Dec 2024 05:34:12 -0800
+Subject: gve: guard XDP xmit NDO on existence of xdp queues
+
+From: Joshua Washington <joshwash@google.com>
+
+commit ff7c2dea9dd1a436fc79d6273adffdcc4a7ffea3 upstream.
+
+In GVE, dedicated XDP queues only exist when an XDP program is installed
+and the interface is up. As such, the NDO XDP XMIT callback should
+return early if either of these conditions is false.
+
+In the case of no loaded XDP program, priv->num_xdp_queues=0, which can
+cause a divide-by-zero error. In the case of interface down,
+num_xdp_queues remains untouched to persist the XDP queue count for the
+next interface up, but the TX pointer itself would be NULL.
+
+The XDP xmit callback also needs to synchronize with a device
+transitioning from open to close. This synchronization will happen via
+the GVE_PRIV_FLAGS_NAPI_ENABLED bit along with a synchronize_net() call,
+which waits for any RCU critical sections at call-time to complete.
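+
+The ordering can be pictured roughly as follows (illustrative; the
+redirect path invokes .ndo_xdp_xmit from an RCU read-side/softirq
+section):
+
+	close path (gve_turndown)         redirect path (gve_xdp_xmit)
+	-------------------------         ----------------------------
+	clear NAPI_ENABLED bit            rcu_read_lock()
+	synchronize_net() --- waits --->  if (!gve_get_napi_enabled(priv))
+	                                          return -ENETDOWN;
+	tear down queues                  ...any in-flight xmit finishes
+	                                  rcu_read_unlock()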
+
+Fixes: 39a7f4aa3e4a ("gve: Add XDP REDIRECT support for GQI-QPL format")
+Cc: stable@vger.kernel.org
+Signed-off-by: Joshua Washington <joshwash@google.com>
+Signed-off-by: Praveen Kaligineedi <pkaligineedi@google.com>
+Reviewed-by: Praveen Kaligineedi <pkaligineedi@google.com>
+Reviewed-by: Shailend Chand <shailend@google.com>
+Reviewed-by: Willem de Bruijn <willemb@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/google/gve/gve_main.c | 3 +++
+ drivers/net/ethernet/google/gve/gve_tx.c | 5 ++++-
+ 2 files changed, 7 insertions(+), 1 deletion(-)
+
+--- a/drivers/net/ethernet/google/gve/gve_main.c
++++ b/drivers/net/ethernet/google/gve/gve_main.c
+@@ -1904,6 +1904,9 @@ static void gve_turndown(struct gve_priv
+
+ gve_clear_napi_enabled(priv);
+ gve_clear_report_stats(priv);
++
++ /* Make sure that all traffic is finished processing. */
++ synchronize_net();
+ }
+
+ static void gve_turnup(struct gve_priv *priv)
+--- a/drivers/net/ethernet/google/gve/gve_tx.c
++++ b/drivers/net/ethernet/google/gve/gve_tx.c
+@@ -837,9 +837,12 @@ int gve_xdp_xmit(struct net_device *dev,
+ struct gve_tx_ring *tx;
+ int i, err = 0, qid;
+
+- if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK))
++ if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK) || !priv->xdp_prog)
+ return -EINVAL;
+
++ if (!gve_get_napi_enabled(priv))
++ return -ENETDOWN;
++
+ qid = gve_xdp_tx_queue_id(priv,
+ smp_processor_id() % priv->num_xdp_queues);
+
--- /dev/null
+From 40338d7987d810fcaa95c500b1068a52b08eec9b Mon Sep 17 00:00:00 2001
+From: Joshua Washington <joshwash@google.com>
+Date: Wed, 18 Dec 2024 05:34:13 -0800
+Subject: gve: guard XSK operations on the existence of queues
+
+From: Joshua Washington <joshwash@google.com>
+
+commit 40338d7987d810fcaa95c500b1068a52b08eec9b upstream.
+
+This patch predicates the enabling and disabling of XSK pools on the
+existence of queues. As it stands, if the interface is down, disabling
+or enabling XSK pools would result in a crash, as the RX queue pointer
+would be NULL. XSK pool registration will occur as part of the next
+interface up.
+
+Similarly, xsk_wakeup needs to be guarded against queues disappearing
+while the function is executing, so a check against the
+GVE_PRIV_FLAGS_NAPI_ENABLED flag is added to synchronize with the
+disabling of the bit and the synchronize_net() in gve_turndown.
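+
+Concretely, the guard has to run before any queue dereference so that it
+pairs with the flag clearing and synchronize_net() in gve_turndown (a
+sketch of the intended ordering, not the full function):
+
+	if (!gve_get_napi_enabled(priv))
+		return -ENETDOWN;	/* queues may already be gone */
+
+	/* Safe from here on: gve_turndown() clears the bit and then
+	 * waits in synchronize_net() before the queues are freed.
+	 */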
+
+Fixes: fd8e40321a12 ("gve: Add AF_XDP zero-copy support for GQI-QPL format")
+Cc: stable@vger.kernel.org
+Signed-off-by: Joshua Washington <joshwash@google.com>
+Signed-off-by: Praveen Kaligineedi <pkaligineedi@google.com>
+Reviewed-by: Praveen Kaligineedi <pkaligineedi@google.com>
+Reviewed-by: Shailend Chand <shailend@google.com>
+Reviewed-by: Willem de Bruijn <willemb@google.com>
+Reviewed-by: Larysa Zaremba <larysa.zaremba@intel.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/google/gve/gve_main.c | 22 ++++++++++------------
+ 1 file changed, 10 insertions(+), 12 deletions(-)
+
+--- a/drivers/net/ethernet/google/gve/gve_main.c
++++ b/drivers/net/ethernet/google/gve/gve_main.c
+@@ -1631,8 +1631,8 @@ static int gve_xsk_pool_enable(struct ne
+ if (err)
+ return err;
+
+- /* If XDP prog is not installed, return */
+- if (!priv->xdp_prog)
++ /* If XDP prog is not installed or interface is down, return. */
++ if (!priv->xdp_prog || !netif_running(dev))
+ return 0;
+
+ rx = &priv->rx[qid];
+@@ -1677,21 +1677,16 @@ static int gve_xsk_pool_disable(struct n
+ if (qid >= priv->rx_cfg.num_queues)
+ return -EINVAL;
+
+- /* If XDP prog is not installed, unmap DMA and return */
+- if (!priv->xdp_prog)
++ /* If XDP prog is not installed or interface is down, unmap DMA and
++ * return.
++ */
++ if (!priv->xdp_prog || !netif_running(dev))
+ goto done;
+
+- tx_qid = gve_xdp_tx_queue_id(priv, qid);
+- if (!netif_running(dev)) {
+- priv->rx[qid].xsk_pool = NULL;
+- xdp_rxq_info_unreg(&priv->rx[qid].xsk_rxq);
+- priv->tx[tx_qid].xsk_pool = NULL;
+- goto done;
+- }
+-
+ napi_rx = &priv->ntfy_blocks[priv->rx[qid].ntfy_id].napi;
+ napi_disable(napi_rx); /* make sure current rx poll is done */
+
++ tx_qid = gve_xdp_tx_queue_id(priv, qid);
+ napi_tx = &priv->ntfy_blocks[priv->tx[tx_qid].ntfy_id].napi;
+ napi_disable(napi_tx); /* make sure current tx poll is done */
+
+@@ -1719,6 +1714,9 @@ static int gve_xsk_wakeup(struct net_dev
+ struct gve_priv *priv = netdev_priv(dev);
+ int tx_queue_id = gve_xdp_tx_queue_id(priv, queue_id);
+
++ if (!gve_get_napi_enabled(priv))
++ return -ENETDOWN;
++
+ if (queue_id >= priv->rx_cfg.num_queues || !priv->xdp_prog)
+ return -EINVAL;
+
--- /dev/null
+From fb3a9a1165cea104b5ab3753e88218e4497b01c1 Mon Sep 17 00:00:00 2001
+From: Joshua Washington <joshwash@google.com>
+Date: Fri, 20 Dec 2024 19:28:06 -0800
+Subject: gve: trigger RX NAPI instead of TX NAPI in gve_xsk_wakeup
+
+From: Joshua Washington <joshwash@google.com>
+
+commit fb3a9a1165cea104b5ab3753e88218e4497b01c1 upstream.
+
+Commit ba0925c34e0f ("gve: process XSK TX descriptors as part of RX NAPI")
+moved XSK TX processing to be part of the RX NAPI. However, that commit
+did not include triggering the RX NAPI in gve_xsk_wakeup. This is
+necessary because the TX NAPI only processes TX completions, meaning
+that a TX wakeup would not actually trigger XSK descriptor processing.
+Also, the branch on XDP_WAKEUP_TX was supposed to have been removed, as
+the NAPI should be scheduled whether the wakeup is for RX or TX.
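+
+For context, napi_if_scheduled_mark_missed() returns true only when the
+NAPI is already scheduled (setting NAPIF_STATE_MISSED so the poll runs
+again); otherwise the wakeup must schedule it explicitly. An annotated
+sketch of the retained pattern:
+
+	napi = &priv->ntfy_blocks[gve_rx_idx_to_ntfy(priv, queue_id)].napi;
+	if (!napi_if_scheduled_mark_missed(napi)) {
+		/* Not running: schedule the RX NAPI, whose poll now
+		 * also processes XSK TX descriptors, and let the
+		 * local_bh_enable() pairing kick SoftIRQ processing.
+		 */
+		local_bh_disable();
+		napi_schedule(napi);
+		local_bh_enable();
+	}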
+
+Fixes: ba0925c34e0f ("gve: process XSK TX descriptors as part of RX NAPI")
+Cc: stable@vger.kernel.org
+Signed-off-by: Joshua Washington <joshwash@google.com>
+Signed-off-by: Praveen Kaligineedi <pkaligineedi@google.com>
+Link: https://patch.msgid.link/20241221032807.302244-1-pkaligineedi@google.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/net/ethernet/google/gve/gve_main.c | 21 +++++++--------------
+ 1 file changed, 7 insertions(+), 14 deletions(-)
+
+--- a/drivers/net/ethernet/google/gve/gve_main.c
++++ b/drivers/net/ethernet/google/gve/gve_main.c
+@@ -1714,7 +1714,7 @@ done:
+ static int gve_xsk_wakeup(struct net_device *dev, u32 queue_id, u32 flags)
+ {
+ struct gve_priv *priv = netdev_priv(dev);
+- int tx_queue_id = gve_xdp_tx_queue_id(priv, queue_id);
++ struct napi_struct *napi;
+
+ if (!gve_get_napi_enabled(priv))
+ return -ENETDOWN;
+@@ -1722,19 +1722,12 @@ static int gve_xsk_wakeup(struct net_dev
+ if (queue_id >= priv->rx_cfg.num_queues || !priv->xdp_prog)
+ return -EINVAL;
+
+- if (flags & XDP_WAKEUP_TX) {
+- struct gve_tx_ring *tx = &priv->tx[tx_queue_id];
+- struct napi_struct *napi =
+- &priv->ntfy_blocks[tx->ntfy_id].napi;
+-
+- if (!napi_if_scheduled_mark_missed(napi)) {
+- /* Call local_bh_enable to trigger SoftIRQ processing */
+- local_bh_disable();
+- napi_schedule(napi);
+- local_bh_enable();
+- }
+-
+- tx->xdp_xsk_wakeup++;
++ napi = &priv->ntfy_blocks[gve_rx_idx_to_ntfy(priv, queue_id)].napi;
++ if (!napi_if_scheduled_mark_missed(napi)) {
++ /* Call local_bh_enable to trigger SoftIRQ processing */
++ local_bh_disable();
++ napi_schedule(napi);
++ local_bh_enable();
+ }
+
+ return 0;
--- /dev/null
+From cddc76b165161a02ff14c4d84d0f5266d9d32b9e Mon Sep 17 00:00:00 2001
+From: Alessandro Carminati <acarmina@redhat.com>
+Date: Tue, 17 Dec 2024 14:20:33 +0000
+Subject: mm/kmemleak: fix sleeping function called from invalid context at print message
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Alessandro Carminati <acarmina@redhat.com>
+
+commit cddc76b165161a02ff14c4d84d0f5266d9d32b9e upstream.
+
+Address a bug in the kernel that triggers a "sleeping function called from
+invalid context" warning when /sys/kernel/debug/kmemleak is printed under
+specific conditions:
+- CONFIG_PREEMPT_RT=y
+- Set SELinux as the LSM for the system
+- Set kptr_restrict to 1
+- kmemleak buffer contains at least one item
+
+BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48
+in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 136, name: cat
+preempt_count: 1, expected: 0
+RCU nest depth: 2, expected: 2
+6 locks held by cat/136:
+ #0: ffff32e64bcbf950 (&p->lock){+.+.}-{3:3}, at: seq_read_iter+0xb8/0xe30
+ #1: ffffafe6aaa9dea0 (scan_mutex){+.+.}-{3:3}, at: kmemleak_seq_start+0x34/0x128
+ #3: ffff32e6546b1cd0 (&object->lock){....}-{2:2}, at: kmemleak_seq_show+0x3c/0x1e0
+ #4: ffffafe6aa8d8560 (rcu_read_lock){....}-{1:2}, at: has_ns_capability_noaudit+0x8/0x1b0
+ #5: ffffafe6aabbc0f8 (notif_lock){+.+.}-{2:2}, at: avc_compute_av+0xc4/0x3d0
+irq event stamp: 136660
+hardirqs last enabled at (136659): [<ffffafe6a80fd7a0>] _raw_spin_unlock_irqrestore+0xa8/0xd8
+hardirqs last disabled at (136660): [<ffffafe6a80fd85c>] _raw_spin_lock_irqsave+0x8c/0xb0
+softirqs last enabled at (0): [<ffffafe6a5d50b28>] copy_process+0x11d8/0x3df8
+softirqs last disabled at (0): [<0000000000000000>] 0x0
+Preemption disabled at:
+[<ffffafe6a6598a4c>] kmemleak_seq_show+0x3c/0x1e0
+CPU: 1 UID: 0 PID: 136 Comm: cat Tainted: G E 6.11.0-rt7+ #34
+Tainted: [E]=UNSIGNED_MODULE
+Hardware name: linux,dummy-virt (DT)
+Call trace:
+ dump_backtrace+0xa0/0x128
+ show_stack+0x1c/0x30
+ dump_stack_lvl+0xe8/0x198
+ dump_stack+0x18/0x20
+ rt_spin_lock+0x8c/0x1a8
+ avc_perm_nonode+0xa0/0x150
+ cred_has_capability.isra.0+0x118/0x218
+ selinux_capable+0x50/0x80
+ security_capable+0x7c/0xd0
+ has_ns_capability_noaudit+0x94/0x1b0
+ has_capability_noaudit+0x20/0x30
+ restricted_pointer+0x21c/0x4b0
+ pointer+0x298/0x760
+ vsnprintf+0x330/0xf70
+ seq_printf+0x178/0x218
+ print_unreferenced+0x1a4/0x2d0
+ kmemleak_seq_show+0xd0/0x1e0
+ seq_read_iter+0x354/0xe30
+ seq_read+0x250/0x378
+ full_proxy_read+0xd8/0x148
+ vfs_read+0x190/0x918
+ ksys_read+0xf0/0x1e0
+ __arm64_sys_read+0x70/0xa8
+ invoke_syscall.constprop.0+0xd4/0x1d8
+ el0_svc+0x50/0x158
+ el0t_64_sync+0x17c/0x180
+
+%pS and %pK, in the same backtrace line, are redundant, and %pS can
+defeat the purpose of %pK in certain contexts.
+
+%pS alone already provides the necessary information, and if it cannot
+resolve the symbol, it falls back to printing the raw address, voiding
+the original intent behind the %pK.
+
+Additionally, %pK requires a CAP_SYSLOG privilege check enforced through
+the LSM, which can trigger a "sleeping function called from invalid
+context" warning under PREEMPT_RT kernels when the check occurs in an
+atomic context. This issue may also affect other LSMs.
+
+This change avoids the unnecessary privilege check and resolves the
+sleeping function warning without any loss of information.
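+
+For illustration (symbol and offsets hypothetical), a backtrace line as
+printed by the old and new format strings:
+
+	old ("[<%pK>] %pS"):  [<00000000abcd1234>] kmem_cache_alloc+0x1c/0x2b0
+	new ("%pS"):          kmem_cache_alloc+0x1c/0x2b0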
+
+Link: https://lkml.kernel.org/r/20241217142032.55793-1-acarmina@redhat.com
+Fixes: 3a6f33d86baa ("mm/kmemleak: use %pK to display kernel pointers in backtrace")
+Signed-off-by: Alessandro Carminati <acarmina@redhat.com>
+Acked-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Acked-by: Catalin Marinas <catalin.marinas@arm.com>
+Cc: Clément Léger <clement.leger@bootlin.com>
+Cc: Alessandro Carminati <acarmina@redhat.com>
+Cc: Eric Chanudet <echanude@redhat.com>
+Cc: Gabriele Paoloni <gpaoloni@redhat.com>
+Cc: Juri Lelli <juri.lelli@redhat.com>
+Cc: Steven Rostedt <rostedt@goodmis.org>
+Cc: Thomas Weißschuh <thomas.weissschuh@linutronix.de>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/kmemleak.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+--- a/mm/kmemleak.c
++++ b/mm/kmemleak.c
+@@ -373,7 +373,7 @@ static void print_unreferenced(struct se
+
+ for (i = 0; i < nr_entries; i++) {
+ void *ptr = (void *)entries[i];
+- warn_or_seq_printf(seq, " [<%pK>] %pS\n", ptr, ptr);
++ warn_or_seq_printf(seq, " %pS\n", ptr);
+ }
+ }
+
--- /dev/null
+From 158cdce87c8c172787063998ad5dd3e2f658b963 Mon Sep 17 00:00:00 2001
+From: Yafang Shao <laoar.shao@gmail.com>
+Date: Fri, 6 Dec 2024 16:30:25 +0800
+Subject: mm/readahead: fix large folio support in async readahead
+
+From: Yafang Shao <laoar.shao@gmail.com>
+
+commit 158cdce87c8c172787063998ad5dd3e2f658b963 upstream.
+
+When testing large folio support with XFS on our servers, we observed that
+only a few large folios are mapped when reading large files via mmap.
+After a thorough analysis, I identified it was caused by the
+`/sys/block/*/queue/read_ahead_kb` setting. On our test servers, this
+parameter is set to 128KB. After I tuned it to 2MB, large folios worked
+as expected. However, I believe large folio behavior should not depend
+on the value of read_ahead_kb. It would be more robust if the kernel
+could automatically adapt to it.
+
+With /sys/block/*/queue/read_ahead_kb set to 128KB and performing a
+sequential read on a 1GB file using MADV_HUGEPAGE, the differences in
+/proc/meminfo are as follows:
+
+- before this patch
+ FileHugePages: 18432 kB
+ FilePmdMapped: 4096 kB
+
+- after this patch
+ FileHugePages: 1067008 kB
+ FilePmdMapped: 1048576 kB
+
+This shows that after applying the patch, the entire 1GB file is mapped to
+huge pages. The stable list is CCed, as without this patch, large folios
+don't function optimally in the readahead path.
+
+It's worth noting that if read_ahead_kb is set to a larger value that
+isn't aligned with huge page sizes (e.g., 4MB + 128KB), it may still fail
+to map to hugepages.
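+
+The effect of the one-line change, spelled out (a sketch; max_pages
+derives from read_ahead_kb, e.g. 128KB -> 32 pages of 4KB):
+
+	old: ra->size = get_next_ra_size(ra, max_pages);
+	     /* window can fall back to the read_ahead_kb-derived size */
+	new: ra->size = max(ra->size, get_next_ra_size(ra, max_pages));
+	     /* a 512-page (2MB) window set up for MADV_HUGEPAGE is kept */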
+
+Link: https://lkml.kernel.org/r/20241108141710.9721-1-laoar.shao@gmail.com
+Link: https://lkml.kernel.org/r/20241206083025.3478-1-laoar.shao@gmail.com
+Fixes: 4687fdbb805a ("mm/filemap: Support VM_HUGEPAGE for file mappings")
+Signed-off-by: Yafang Shao <laoar.shao@gmail.com>
+Tested-by: kernel test robot <oliver.sang@intel.com>
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: David Hildenbrand <david@redhat.com>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/readahead.c | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+--- a/mm/readahead.c
++++ b/mm/readahead.c
+@@ -641,7 +641,11 @@ void page_cache_async_ra(struct readahea
+ 1UL << order);
+ if (index == expected) {
+ ra->start += ra->size;
+- ra->size = get_next_ra_size(ra, max_pages);
++ /*
++ * In the case of MADV_HUGEPAGE, the actual size might exceed
++ * the readahead window.
++ */
++ ra->size = max(ra->size, get_next_ra_size(ra, max_pages));
+ ra->async_size = ra->size;
+ goto readit;
+ }
--- /dev/null
+From 6aaced5abd32e2a57cd94fd64f824514d0361da8 Mon Sep 17 00:00:00 2001
+From: Seiji Nishikawa <snishika@redhat.com>
+Date: Sun, 1 Dec 2024 01:12:34 +0900
+Subject: mm: vmscan: account for free pages to prevent infinite loop in throttle_direct_reclaim()
+
+From: Seiji Nishikawa <snishika@redhat.com>
+
+commit 6aaced5abd32e2a57cd94fd64f824514d0361da8 upstream.
+
+The task sometimes continues looping in throttle_direct_reclaim() because
+allow_direct_reclaim(pgdat) keeps returning false.
+
+ #0 [ffff80002cb6f8d0] __switch_to at ffff8000080095ac
+ #1 [ffff80002cb6f900] __schedule at ffff800008abbd1c
+ #2 [ffff80002cb6f990] schedule at ffff800008abc50c
+ #3 [ffff80002cb6f9b0] throttle_direct_reclaim at ffff800008273550
+ #4 [ffff80002cb6fa20] try_to_free_pages at ffff800008277b68
+ #5 [ffff80002cb6fae0] __alloc_pages_nodemask at ffff8000082c4660
+ #6 [ffff80002cb6fc50] alloc_pages_vma at ffff8000082e4a98
+ #7 [ffff80002cb6fca0] do_anonymous_page at ffff80000829f5a8
+ #8 [ffff80002cb6fce0] __handle_mm_fault at ffff8000082a5974
+ #9 [ffff80002cb6fd90] handle_mm_fault at ffff8000082a5bd4
+
+At this point, the pgdat contains the following two zones:
+
+ NODE: 4 ZONE: 0 ADDR: ffff00817fffe540 NAME: "DMA32"
+ SIZE: 20480 MIN/LOW/HIGH: 11/28/45
+ VM_STAT:
+ NR_FREE_PAGES: 359
+ NR_ZONE_INACTIVE_ANON: 18813
+ NR_ZONE_ACTIVE_ANON: 0
+ NR_ZONE_INACTIVE_FILE: 50
+ NR_ZONE_ACTIVE_FILE: 0
+ NR_ZONE_UNEVICTABLE: 0
+ NR_ZONE_WRITE_PENDING: 0
+ NR_MLOCK: 0
+ NR_BOUNCE: 0
+ NR_ZSPAGES: 0
+ NR_FREE_CMA_PAGES: 0
+
+ NODE: 4 ZONE: 1 ADDR: ffff00817fffec00 NAME: "Normal"
+ SIZE: 8454144 PRESENT: 98304 MIN/LOW/HIGH: 68/166/264
+ VM_STAT:
+ NR_FREE_PAGES: 146
+ NR_ZONE_INACTIVE_ANON: 94668
+ NR_ZONE_ACTIVE_ANON: 3
+ NR_ZONE_INACTIVE_FILE: 735
+ NR_ZONE_ACTIVE_FILE: 78
+ NR_ZONE_UNEVICTABLE: 0
+ NR_ZONE_WRITE_PENDING: 0
+ NR_MLOCK: 0
+ NR_BOUNCE: 0
+ NR_ZSPAGES: 0
+ NR_FREE_CMA_PAGES: 0
+
+In allow_direct_reclaim(), while processing ZONE_DMA32, the sum of
+inactive/active file-backed pages calculated in zone_reclaimable_pages()
+based on the result of zone_page_state_snapshot() is zero.
+
+Additionally, since this system lacks swap, the calculation of inactive/
+active anonymous pages is skipped.
+
+ crash> p nr_swap_pages
+ nr_swap_pages = $1937 = {
+ counter = 0
+ }
+
+As a result, ZONE_DMA32 is deemed unreclaimable and skipped, moving on to
+the processing of the next zone, ZONE_NORMAL, despite ZONE_DMA32 having
+free pages significantly exceeding the high watermark.
+
+The problem is that the pgdat->kswapd_failures hasn't been incremented.
+
+ crash> px ((struct pglist_data *) 0xffff00817fffe540)->kswapd_failures
+ $1935 = 0x0
+
+This is because the node is deemed balanced. The node balancing logic in
+balance_pgdat() evaluates all zones collectively. If one or more zones
+(e.g., ZONE_DMA32) have enough free pages to meet their watermarks, the
+entire node is deemed balanced. This causes balance_pgdat() to exit early
+before incrementing the kswapd_failures, as it considers the overall
+memory state acceptable, even though some zones (like ZONE_NORMAL) remain
+under significant pressure.
+
+This patch modifies zone_reclaimable_pages() to include free pages
+(NR_FREE_PAGES) in its calculation when no other reclaimable pages
+(file-backed or anonymous) are available. This prevents zones like
+ZONE_DMA32, which have sufficient free pages, from being mistakenly
+deemed unreclaimable and skipped. Counting them ensures proper node
+balancing, avoids masking pressure on other zones like ZONE_NORMAL, and
+prevents the infinite loop in throttle_direct_reclaim() caused by
+allow_direct_reclaim(pgdat) repeatedly returning false.
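+
+Plugging in the ZONE_DMA32 numbers from the dump above (illustrating
+only the change in zone_reclaimable_pages(), not the full
+allow_direct_reclaim() arithmetic):
+
+	file pages (per zone_page_state_snapshot())  = 0
+	anon pages (nr_swap_pages == 0, so skipped)  = 0
+	old: nr = 0 -> zone treated as unreclaimable and skipped
+	new: nr = NR_FREE_PAGES = 359 -> zone counted, and its free pages
+	     (well above the high watermark of 45) are visible to
+	     allow_direct_reclaim()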
+
+[akpm@linux-foundation.org: coding-style cleanups]
+Link: https://lkml.kernel.org/r/20241130164346.436469-1-snishika@redhat.com
+Link: https://lkml.kernel.org/r/20241130161236.433747-2-snishika@redhat.com
+Fixes: 5a1c84b404a7 ("mm: remove reclaim and compaction retry approximations")
+Signed-off-by: Seiji Nishikawa <snishika@redhat.com>
+Cc: Mel Gorman <mgorman@techsingularity.net>
+Cc: <stable@vger.kernel.org>
+Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ mm/vmscan.c | 9 ++++++++-
+ 1 file changed, 8 insertions(+), 1 deletion(-)
+
+--- a/mm/vmscan.c
++++ b/mm/vmscan.c
+@@ -374,7 +374,14 @@ unsigned long zone_reclaimable_pages(str
+ if (can_reclaim_anon_pages(NULL, zone_to_nid(zone), NULL))
+ nr += zone_page_state_snapshot(zone, NR_ZONE_INACTIVE_ANON) +
+ zone_page_state_snapshot(zone, NR_ZONE_ACTIVE_ANON);
+-
++ /*
++ * If there are no reclaimable file-backed or anonymous pages,
++ * ensure zones with sufficient free pages are not skipped.
++ * This prevents zones like DMA32 from being ignored in reclaim
++ * scenarios where they can still help alleviate memory pressure.
++ */
++ if (nr == 0)
++ nr = zone_page_state_snapshot(zone, NR_FREE_PAGES);
+ return nr;
+ }
+
mm-shmem-fix-incorrect-index-alignment-for-within_size-policy.patch
fs-proc-task_mmu-fix-pagemap-flags-with-pmd-thp-entries-on-32bit.patch
gve-process-xsk-tx-descriptors-as-part-of-rx-napi.patch
+gve-clean-xdp-queues-in-gve_tx_stop_ring_gqi.patch
+gve-guard-xsk-operations-on-the-existence-of-queues.patch
+gve-fix-xdp-allocation-path-in-edge-cases.patch
+gve-guard-xdp-xmit-ndo-on-existence-of-xdp-queues.patch
+gve-trigger-rx-napi-instead-of-tx-napi-in-gve_xsk_wakeup.patch
+mm-readahead-fix-large-folio-support-in-async-readahead.patch
+mm-kmemleak-fix-sleeping-function-called-from-invalid-context-at-print-message.patch
+mm-vmscan-account-for-free-pages-to-prevent-infinite-loop-in-throttle_direct_reclaim.patch