]> git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
net: Extend NAPI threaded polling to allow kthread based busy polling
authorSamiullah Khawaja <skhawaja@google.com>
Tue, 28 Oct 2025 20:30:05 +0000 (20:30 +0000)
committerJakub Kicinski <kuba@kernel.org>
Tue, 4 Nov 2025 02:11:40 +0000 (18:11 -0800)
Add a new state NAPI_STATE_THREADED_BUSY_POLL to the NAPI state enum to
enable and disable threaded busy polling.

When threaded busy polling is enabled for a NAPI, enable
NAPI_STATE_THREADED also.

When the threaded NAPI is scheduled, set NAPI_STATE_IN_BUSY_POLL to
signal napi_complete_done not to rearm interrupts.

Whenever NAPI_STATE_THREADED_BUSY_POLL is unset, NAPI_STATE_IN_BUSY_POLL
is also unset; napi_complete_done then clears the
NAPI_STATE_SCHED_THREADED bit as well, which in turn makes the kthread
go to sleep.

Signed-off-by: Samiullah Khawaja <skhawaja@google.com>
Reviewed-by: Willem de Bruijn <willemb@google.com>
Acked-by: Martin Karsten <mkarsten@uwaterloo.ca>
Tested-by: Martin Karsten <mkarsten@uwaterloo.ca>
Link: https://patch.msgid.link/20251028203007.575686-2-skhawaja@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Documentation/netlink/specs/netdev.yaml
Documentation/networking/napi.rst
include/linux/netdevice.h
include/uapi/linux/netdev.h
net/core/dev.c
net/core/dev.h
net/core/netdev-genl-gen.c
tools/include/uapi/linux/netdev.h

index e00d3fa1c152d7165e9485d6d383a2cc9cef7cfd..10c412b7433f761d68cfb2bb2bd498075dc74b35 100644 (file)
@@ -88,7 +88,7 @@ definitions:
   -
     name: napi-threaded
     type: enum
-    entries: [disabled, enabled]
+    entries: [disabled, enabled, busy-poll]
 
 attribute-sets:
   -
@@ -291,7 +291,8 @@ attribute-sets:
         name: threaded
         doc: Whether the NAPI is configured to operate in threaded polling
              mode. If this is set to enabled then the NAPI context operates
-             in threaded polling mode.
+             in threaded polling mode. If this is set to busy-poll, then the
+             threaded polling mode also busy polls.
         type: u32
         enum: napi-threaded
   -
index 7dd60366f4ff3948a58c0b038cd7ab57f1401500..4e008efebb352ad17e6e898a16cab7217d45b2d6 100644 (file)
@@ -263,7 +263,9 @@ are not well known).
 Busy polling is enabled by either setting ``SO_BUSY_POLL`` on
 selected sockets or using the global ``net.core.busy_poll`` and
 ``net.core.busy_read`` sysctls. An io_uring API for NAPI busy polling
-also exists.
+also exists. Threaded polling of NAPI also has a mode to busy poll for
+packets (:ref:`threaded busy polling<threaded_busy_poll>`) using the NAPI
+processing kthread.
 
 epoll-based busy polling
 ------------------------
@@ -426,6 +428,52 @@ Therefore, setting ``gro_flush_timeout`` and ``napi_defer_hard_irqs`` is
 the recommended usage, because otherwise setting ``irq-suspend-timeout``
 might not have any discernible effect.
 
+.. _threaded_busy_poll:
+
+Threaded NAPI busy polling
+--------------------------
+
+Threaded NAPI busy polling extends threaded NAPI and adds support for
+continuous busy polling of the NAPI. This can be useful for forwarding or
+AF_XDP applications.
+
+Threaded NAPI busy polling can be enabled on a per-NIC-queue basis using Netlink.
+
+For example, using the following script:
+
+.. code-block:: bash
+
+  $ ynl --family netdev --do napi-set \
+            --json='{"id": 66, "threaded": "busy-poll"}'
+
+The kernel will create a kthread that busy polls on this NAPI.
+
+The user may elect to set the CPU affinity of this kthread to an unused CPU
+core to improve how often the NAPI is polled at the expense of wasted CPU
+cycles. Note that this will keep the CPU core busy with 100% usage.
+
+Once threaded busy polling is enabled for a NAPI, the PID of the kthread can
+be retrieved using Netlink so the affinity of the kthread can be set up.
+
+For example, the following script can be used to fetch the PID:
+
+.. code-block:: bash
+
+  $ ynl --family netdev --do napi-get --json='{"id": 66}'
+
+This will output something like the following, where `258` is the PID of the
+kthread that is polling this NAPI.
+
+.. code-block:: bash
+
+  $ {'defer-hard-irqs': 0,
+     'gro-flush-timeout': 0,
+     'id': 66,
+     'ifindex': 2,
+     'irq-suspend-timeout': 0,
+     'pid': 258,
+     'threaded': 'busy-poll'}
+
 .. _threaded:
 
 Threaded NAPI
index 9c1e5042c5e7646c0aa9e8f4e160c78ea27a639a..e808071dbb7d3770decb3d8202c381836f1f1aa6 100644 (file)
@@ -423,11 +423,12 @@ enum {
        NAPI_STATE_NPSVC,               /* Netpoll - don't dequeue from poll_list */
        NAPI_STATE_LISTED,              /* NAPI added to system lists */
        NAPI_STATE_NO_BUSY_POLL,        /* Do not add in napi_hash, no busy polling */
-       NAPI_STATE_IN_BUSY_POLL,        /* sk_busy_loop() owns this NAPI */
+       NAPI_STATE_IN_BUSY_POLL,        /* Do not rearm NAPI interrupt */
        NAPI_STATE_PREFER_BUSY_POLL,    /* prefer busy-polling over softirq processing*/
        NAPI_STATE_THREADED,            /* The poll is performed inside its own thread*/
        NAPI_STATE_SCHED_THREADED,      /* Napi is currently scheduled in threaded mode */
        NAPI_STATE_HAS_NOTIFIER,        /* Napi has an IRQ notifier */
+       NAPI_STATE_THREADED_BUSY_POLL,  /* The threaded NAPI poller will busy poll */
 };
 
 enum {
@@ -442,6 +443,7 @@ enum {
        NAPIF_STATE_THREADED            = BIT(NAPI_STATE_THREADED),
        NAPIF_STATE_SCHED_THREADED      = BIT(NAPI_STATE_SCHED_THREADED),
        NAPIF_STATE_HAS_NOTIFIER        = BIT(NAPI_STATE_HAS_NOTIFIER),
+       NAPIF_STATE_THREADED_BUSY_POLL  = BIT(NAPI_STATE_THREADED_BUSY_POLL),
 };
 
 enum gro_result {
index 48eb49aa03d41cfd8ba04303099553c562ef2d7c..048c8de1a130dbc08886202aac6c145342ee7782 100644 (file)
@@ -80,6 +80,7 @@ enum netdev_qstats_scope {
 enum netdev_napi_threaded {
        NETDEV_NAPI_THREADED_DISABLED,
        NETDEV_NAPI_THREADED_ENABLED,
+       NETDEV_NAPI_THREADED_BUSY_POLL,
 };
 
 enum {
index dccc1176f3c6565f96a7e2b5f42d009ef6435496..2c1de5fb97d93c8711797168249d7e4f8a2ae0a3 100644 (file)
@@ -7089,7 +7089,8 @@ static void napi_stop_kthread(struct napi_struct *napi)
                 */
                if ((val & NAPIF_STATE_SCHED_THREADED) ||
                    !(val & NAPIF_STATE_SCHED)) {
-                       new = val & (~NAPIF_STATE_THREADED);
+                       new = val & (~(NAPIF_STATE_THREADED |
+                                      NAPIF_STATE_THREADED_BUSY_POLL));
                } else {
                        msleep(20);
                        continue;
@@ -7113,6 +7114,16 @@ static void napi_stop_kthread(struct napi_struct *napi)
        napi->thread = NULL;
 }
 
+static void napi_set_threaded_state(struct napi_struct *napi,
+                                   enum netdev_napi_threaded threaded_mode)
+{
+       bool threaded = threaded_mode != NETDEV_NAPI_THREADED_DISABLED;
+       bool busy_poll = threaded_mode == NETDEV_NAPI_THREADED_BUSY_POLL;
+
+       assign_bit(NAPI_STATE_THREADED, &napi->state, threaded);
+       assign_bit(NAPI_STATE_THREADED_BUSY_POLL, &napi->state, busy_poll);
+}
+
 int napi_set_threaded(struct napi_struct *napi,
                      enum netdev_napi_threaded threaded)
 {
@@ -7139,7 +7150,7 @@ int napi_set_threaded(struct napi_struct *napi,
        } else {
                /* Make sure kthread is created before THREADED bit is set. */
                smp_mb__before_atomic();
-               assign_bit(NAPI_STATE_THREADED, &napi->state, threaded);
+               napi_set_threaded_state(napi, threaded);
        }
 
        return 0;
@@ -7531,7 +7542,9 @@ void napi_disable_locked(struct napi_struct *n)
                }
 
                new = val | NAPIF_STATE_SCHED | NAPIF_STATE_NPSVC;
-               new &= ~(NAPIF_STATE_THREADED | NAPIF_STATE_PREFER_BUSY_POLL);
+               new &= ~(NAPIF_STATE_THREADED |
+                        NAPIF_STATE_THREADED_BUSY_POLL |
+                        NAPIF_STATE_PREFER_BUSY_POLL);
        } while (!try_cmpxchg(&n->state, &val, new));
 
        hrtimer_cancel(&n->timer);
@@ -7743,7 +7756,7 @@ static int napi_thread_wait(struct napi_struct *napi)
        return -1;
 }
 
-static void napi_threaded_poll_loop(struct napi_struct *napi)
+static void napi_threaded_poll_loop(struct napi_struct *napi, bool busy_poll)
 {
        struct bpf_net_context __bpf_net_ctx, *bpf_net_ctx;
        struct softnet_data *sd;
@@ -7772,22 +7785,47 @@ static void napi_threaded_poll_loop(struct napi_struct *napi)
                }
                skb_defer_free_flush();
                bpf_net_ctx_clear(bpf_net_ctx);
+
+               /* When busy poll is enabled, the old packets are not flushed in
+                * napi_complete_done. So flush them here.
+                */
+               if (busy_poll)
+                       gro_flush_normal(&napi->gro, HZ >= 1000);
                local_bh_enable();
 
+               /* Call cond_resched here to avoid watchdog warnings. */
+               if (repoll || busy_poll) {
+                       rcu_softirq_qs_periodic(last_qs);
+                       cond_resched();
+               }
+
                if (!repoll)
                        break;
-
-               rcu_softirq_qs_periodic(last_qs);
-               cond_resched();
        }
 }
 
 static int napi_threaded_poll(void *data)
 {
        struct napi_struct *napi = data;
+       bool want_busy_poll;
+       bool in_busy_poll;
+       unsigned long val;
+
+       while (!napi_thread_wait(napi)) {
+               val = READ_ONCE(napi->state);
+
+               want_busy_poll = val & NAPIF_STATE_THREADED_BUSY_POLL;
+               in_busy_poll = val & NAPIF_STATE_IN_BUSY_POLL;
 
-       while (!napi_thread_wait(napi))
-               napi_threaded_poll_loop(napi);
+               if (unlikely(val & NAPIF_STATE_DISABLE))
+                       want_busy_poll = false;
+
+               if (want_busy_poll != in_busy_poll)
+                       assign_bit(NAPI_STATE_IN_BUSY_POLL, &napi->state,
+                                  want_busy_poll);
+
+               napi_threaded_poll_loop(napi, want_busy_poll);
+       }
 
        return 0;
 }
@@ -13097,7 +13135,7 @@ static void run_backlog_napi(unsigned int cpu)
 {
        struct softnet_data *sd = per_cpu_ptr(&softnet_data, cpu);
 
-       napi_threaded_poll_loop(&sd->backlog);
+       napi_threaded_poll_loop(&sd->backlog, false);
 }
 
 static void backlog_napi_setup(unsigned int cpu)
index 900880e8b5b4b9492eca23a4d9201045e6bf7f74..4d872a79bafbc817b7213f967f33404730c2a3ff 100644 (file)
@@ -317,6 +317,9 @@ static inline void napi_set_irq_suspend_timeout(struct napi_struct *n,
 
 static inline enum netdev_napi_threaded napi_get_threaded(struct napi_struct *n)
 {
+       if (test_bit(NAPI_STATE_THREADED_BUSY_POLL, &n->state))
+               return NETDEV_NAPI_THREADED_BUSY_POLL;
+
        if (test_bit(NAPI_STATE_THREADED, &n->state))
                return NETDEV_NAPI_THREADED_ENABLED;
 
index e9a2a6f26cb7d8b1b4451553127fe451571fe5e8..ff20435c45d2542cb65a2fd2f127ba6a779cadb7 100644 (file)
@@ -97,7 +97,7 @@ static const struct nla_policy netdev_napi_set_nl_policy[NETDEV_A_NAPI_THREADED
        [NETDEV_A_NAPI_DEFER_HARD_IRQS] = NLA_POLICY_FULL_RANGE(NLA_U32, &netdev_a_napi_defer_hard_irqs_range),
        [NETDEV_A_NAPI_GRO_FLUSH_TIMEOUT] = { .type = NLA_UINT, },
        [NETDEV_A_NAPI_IRQ_SUSPEND_TIMEOUT] = { .type = NLA_UINT, },
-       [NETDEV_A_NAPI_THREADED] = NLA_POLICY_MAX(NLA_U32, 1),
+       [NETDEV_A_NAPI_THREADED] = NLA_POLICY_MAX(NLA_U32, 2),
 };
 
 /* NETDEV_CMD_BIND_TX - do */
index 48eb49aa03d41cfd8ba04303099553c562ef2d7c..048c8de1a130dbc08886202aac6c145342ee7782 100644 (file)
@@ -80,6 +80,7 @@ enum netdev_qstats_scope {
 enum netdev_napi_threaded {
        NETDEV_NAPI_THREADED_DISABLED,
        NETDEV_NAPI_THREADED_ENABLED,
+       NETDEV_NAPI_THREADED_BUSY_POLL,
 };
 
 enum {