git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
net-sysfs: use rps_tag_ptr and remove metadata from rps_dev_flow_table
author Eric Dumazet <edumazet@google.com>
Mon, 2 Mar 2026 18:14:32 +0000 (18:14 +0000)
committer Jakub Kicinski <kuba@kernel.org>
Thu, 5 Mar 2026 00:54:10 +0000 (16:54 -0800)
Instead of storing the @log at the beginning of rps_dev_flow_table,
use the 5 low-order bits of the rps_tag_ptr to store the log of the size.

This removes a potential cache line miss (for light traffic).

This allows us to switch to one high-order allocation instead of vmalloc()
when CONFIG_RFS_ACCEL is not set.

Signed-off-by: Eric Dumazet <edumazet@google.com>
Reviewed-by: Kuniyuki Iwashima <kuniyu@google.com>
Link: https://patch.msgid.link/20260302181432.1836150-8-edumazet@google.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
include/net/netdev_rx_queue.h
include/net/rps.h
net/core/dev.c
net/core/net-sysfs.c

index cfa72c4853876c6fcb84b5c551580d9205f7b29d..08f81329fc11dc86767f9da661be8c7194dc1da2 100644 (file)
@@ -8,13 +8,14 @@
 #include <net/xdp.h>
 #include <net/page_pool/types.h>
 #include <net/netdev_queues.h>
+#include <net/rps-types.h>
 
 /* This structure contains an instance of an RX queue. */
 struct netdev_rx_queue {
        struct xdp_rxq_info             xdp_rxq;
 #ifdef CONFIG_RPS
        struct rps_map __rcu            *rps_map;
-       struct rps_dev_flow_table __rcu *rps_flow_table;
+       rps_tag_ptr                     rps_flow_table;
 #endif
        struct kobject                  kobj;
        const struct attribute_group    **groups;
index e900480e828b487c721b3ef392f4abb427ad442c..e33c6a2fa8bbca3555ecccbbf9132d01cc433c36 100644 (file)
@@ -39,16 +39,6 @@ struct rps_dev_flow {
 };
 #define RPS_NO_FILTER 0xffff
 
-/*
- * The rps_dev_flow_table structure contains a table of flow mappings.
- */
-struct rps_dev_flow_table {
-       u8                      log;
-       struct rps_dev_flow     flows[];
-};
-#define RPS_DEV_FLOW_TABLE_SIZE(_num) (sizeof(struct rps_dev_flow_table) + \
-    ((_num) * sizeof(struct rps_dev_flow)))
-
 /*
  * The rps_sock_flow_table contains mappings of flows to the last CPU
  * on which they were processed by the application (set in recvmsg).
index 7ae87be81afc9d970ca4d4a5d4475d4d671586f9..b470487788a21c2e34c97477084e36dcc94c9a8d 100644 (file)
@@ -4968,16 +4968,16 @@ EXPORT_SYMBOL(rps_needed);
 struct static_key_false rfs_needed __read_mostly;
 EXPORT_SYMBOL(rfs_needed);
 
-static u32 rfs_slot(u32 hash, const struct rps_dev_flow_table *flow_table)
+static u32 rfs_slot(u32 hash, rps_tag_ptr tag_ptr)
 {
-       return hash_32(hash, flow_table->log);
+       return hash_32(hash, rps_tag_to_log(tag_ptr));
 }
 
 #ifdef CONFIG_RFS_ACCEL
 /**
  * rps_flow_is_active - check whether the flow is recently active.
  * @rflow: Specific flow to check activity.
- * @flow_table: per-queue flowtable that @rflow belongs to.
+ * @log: ilog2(hashsize).
  * @cpu: CPU saved in @rflow.
  *
  * If the CPU has processed many packets since the flow's last activity
@@ -4986,7 +4986,7 @@ static u32 rfs_slot(u32 hash, const struct rps_dev_flow_table *flow_table)
  * Return: true if flow was recently active.
  */
 static bool rps_flow_is_active(struct rps_dev_flow *rflow,
-                              struct rps_dev_flow_table *flow_table,
+                              u8 log,
                               unsigned int cpu)
 {
        unsigned int flow_last_active;
@@ -4999,7 +4999,7 @@ static bool rps_flow_is_active(struct rps_dev_flow *rflow,
        flow_last_active = READ_ONCE(rflow->last_qtail);
 
        return (int)(sd_input_head - flow_last_active) <
-               (int)(10 << flow_table->log);
+               (int)(10 << log);
 }
 #endif
 
@@ -5011,9 +5011,10 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
                u32 head;
 #ifdef CONFIG_RFS_ACCEL
                struct netdev_rx_queue *rxqueue;
-               struct rps_dev_flow_table *flow_table;
+               struct rps_dev_flow *flow_table;
                struct rps_dev_flow *old_rflow;
                struct rps_dev_flow *tmp_rflow;
+               rps_tag_ptr q_tag_ptr;
                unsigned int tmp_cpu;
                u16 rxq_index;
                u32 flow_id;
@@ -5028,16 +5029,18 @@ set_rps_cpu(struct net_device *dev, struct sk_buff *skb,
                        goto out;
 
                rxqueue = dev->_rx + rxq_index;
-               flow_table = rcu_dereference(rxqueue->rps_flow_table);
-               if (!flow_table)
+               q_tag_ptr = READ_ONCE(rxqueue->rps_flow_table);
+               if (!q_tag_ptr)
                        goto out;
 
-               flow_id = rfs_slot(hash, flow_table);
-               tmp_rflow = &flow_table->flows[flow_id];
+               flow_id = rfs_slot(hash, q_tag_ptr);
+               flow_table = rps_tag_to_table(q_tag_ptr);
+               tmp_rflow = flow_table + flow_id;
                tmp_cpu = READ_ONCE(tmp_rflow->cpu);
 
                if (READ_ONCE(tmp_rflow->filter) != RPS_NO_FILTER) {
-                       if (rps_flow_is_active(tmp_rflow, flow_table,
+                       if (rps_flow_is_active(tmp_rflow,
+                                              rps_tag_to_log(q_tag_ptr),
                                               tmp_cpu)) {
                                if (hash != READ_ONCE(tmp_rflow->hash) ||
                                    next_cpu == tmp_cpu)
@@ -5076,8 +5079,7 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
                       struct rps_dev_flow **rflowp)
 {
        struct netdev_rx_queue *rxqueue = dev->_rx;
-       struct rps_dev_flow_table *flow_table;
-       rps_tag_ptr global_tag_ptr;
+       rps_tag_ptr global_tag_ptr, q_tag_ptr;
        struct rps_map *map;
        int cpu = -1;
        u32 tcpu;
@@ -5098,9 +5100,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
 
        /* Avoid computing hash if RFS/RPS is not active for this rxqueue */
 
-       flow_table = rcu_dereference(rxqueue->rps_flow_table);
+       q_tag_ptr = READ_ONCE(rxqueue->rps_flow_table);
        map = rcu_dereference(rxqueue->rps_map);
-       if (!flow_table && !map)
+       if (!q_tag_ptr && !map)
                goto done;
 
        skb_reset_network_header(skb);
@@ -5109,8 +5111,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
                goto done;
 
        global_tag_ptr = READ_ONCE(net_hotdata.rps_sock_flow_table);
-       if (flow_table && global_tag_ptr) {
+       if (q_tag_ptr && global_tag_ptr) {
                struct rps_sock_flow_table *sock_flow_table;
+               struct rps_dev_flow *flow_table;
                struct rps_dev_flow *rflow;
                u32 next_cpu;
                u32 flow_id;
@@ -5130,7 +5133,9 @@ static int get_rps_cpu(struct net_device *dev, struct sk_buff *skb,
                /* OK, now we know there is a match,
                 * we can look at the local (per receive queue) flow table
                 */
-               rflow = &flow_table->flows[rfs_slot(hash, flow_table)];
+               flow_id = rfs_slot(hash, q_tag_ptr);
+               flow_table = rps_tag_to_table(q_tag_ptr);
+               rflow = flow_table + flow_id;
                tcpu = rflow->cpu;
 
                /*
@@ -5190,19 +5195,23 @@ bool rps_may_expire_flow(struct net_device *dev, u16 rxq_index,
                         u32 flow_id, u16 filter_id)
 {
        struct netdev_rx_queue *rxqueue = dev->_rx + rxq_index;
-       struct rps_dev_flow_table *flow_table;
+       struct rps_dev_flow *flow_table;
        struct rps_dev_flow *rflow;
+       rps_tag_ptr q_tag_ptr;
        bool expire = true;
+       u8 log;
 
        rcu_read_lock();
-       flow_table = rcu_dereference(rxqueue->rps_flow_table);
-       if (flow_table && flow_id < (1UL << flow_table->log)) {
+       q_tag_ptr = READ_ONCE(rxqueue->rps_flow_table);
+       log = rps_tag_to_log(q_tag_ptr);
+       if (q_tag_ptr && flow_id < (1UL << log)) {
                unsigned int cpu;
 
-               rflow = &flow_table->flows[flow_id];
+               flow_table = rps_tag_to_table(q_tag_ptr);
+               rflow = flow_table + flow_id;
                cpu = READ_ONCE(rflow->cpu);
                if (READ_ONCE(rflow->filter) == filter_id &&
-                   rps_flow_is_active(rflow, flow_table, cpu))
+                   rps_flow_is_active(rflow, log, cpu))
                        expire = false;
        }
        rcu_read_unlock();
index fd6f81930bc6437957f32206c84db87ee242fede..2ce011fae2490b3bd950cf8d9089e7d71cc0fd7a 100644 (file)
@@ -1060,14 +1060,12 @@ out:
 static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
                                           char *buf)
 {
-       struct rps_dev_flow_table *flow_table;
        unsigned long val = 0;
+       rps_tag_ptr tag_ptr;
 
-       rcu_read_lock();
-       flow_table = rcu_dereference(queue->rps_flow_table);
-       if (flow_table)
-               val = 1UL << flow_table->log;
-       rcu_read_unlock();
+       tag_ptr = READ_ONCE(queue->rps_flow_table);
+       if (tag_ptr)
+               val = 1UL << rps_tag_to_log(tag_ptr);
 
        return sysfs_emit(buf, "%lu\n", val);
 }
@@ -1075,8 +1073,10 @@ static ssize_t show_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
 static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
                                            const char *buf, size_t len)
 {
+       rps_tag_ptr otag, tag_ptr = 0UL;
+       struct rps_dev_flow *table;
        unsigned long mask, count;
-       struct rps_dev_flow_table *table, *old_table;
+       size_t sz;
        int rc;
 
        if (!capable(CAP_NET_ADMIN))
@@ -1093,38 +1093,36 @@ static ssize_t store_rps_dev_flow_table_cnt(struct netdev_rx_queue *queue,
                 */
                while ((mask | (mask >> 1)) != mask)
                        mask |= (mask >> 1);
-               /* On 64 bit arches, must check mask fits in table->mask (u32),
-                * and on 32bit arches, must check
-                * RPS_DEV_FLOW_TABLE_SIZE(mask + 1) doesn't overflow.
-                */
-#if BITS_PER_LONG > 32
-               if (mask > (unsigned long)(u32)mask)
-                       return -EINVAL;
-#else
-               if (mask > (ULONG_MAX - RPS_DEV_FLOW_TABLE_SIZE(1))
-                               / sizeof(struct rps_dev_flow)) {
-                       /* Enforce a limit to prevent overflow */
+
+               /* Do not accept too large tables. */
+               if (mask > (INT_MAX / sizeof(*table) - 1))
                        return -EINVAL;
-               }
-#endif
-               table = vmalloc(RPS_DEV_FLOW_TABLE_SIZE(mask + 1));
+
+               sz = max_t(size_t, sizeof(*table) * (mask + 1),
+                          PAGE_SIZE);
+               if (sz <= (PAGE_SIZE << PAGE_ALLOC_COSTLY_ORDER) ||
+                   is_power_of_2(sizeof(*table)))
+                       table = kvmalloc(sz, GFP_KERNEL);
+               else
+                       table = vmalloc(sz);
                if (!table)
                        return -ENOMEM;
-
-               table->log = ilog2(mask) + 1;
+               tag_ptr = (rps_tag_ptr)table;
+               if (rps_tag_to_log(tag_ptr)) {
+                       pr_err_once("store_rps_dev_flow_table_cnt() got a non page aligned allocation.\n");
+                       kvfree(table);
+                       return -ENOMEM;
+               }
+               tag_ptr |= (ilog2(mask) + 1);
                for (count = 0; count <= mask; count++) {
-                       table->flows[count].cpu = RPS_NO_CPU;
-                       table->flows[count].filter = RPS_NO_FILTER;
+                       table[count].cpu = RPS_NO_CPU;
+                       table[count].filter = RPS_NO_FILTER;
                }
-       } else {
-               table = NULL;
        }
 
-       old_table = unrcu_pointer(xchg(&queue->rps_flow_table,
-                                      RCU_INITIALIZER(table)));
-
-       if (old_table)
-               kvfree_rcu_mightsleep(old_table);
+       otag = xchg(&queue->rps_flow_table, tag_ptr);
+       if (otag)
+               kvfree_rcu_mightsleep(rps_tag_to_table(otag));
 
        return len;
 }
@@ -1150,7 +1148,7 @@ static void rx_queue_release(struct kobject *kobj)
 {
        struct netdev_rx_queue *queue = to_rx_queue(kobj);
 #ifdef CONFIG_RPS
-       struct rps_dev_flow_table *old_table;
+       rps_tag_ptr tag_ptr;
        struct rps_map *map;
 
        map = rcu_dereference_protected(queue->rps_map, 1);
@@ -1159,9 +1157,9 @@ static void rx_queue_release(struct kobject *kobj)
                kfree_rcu(map, rcu);
        }
 
-       old_table = unrcu_pointer(xchg(&queue->rps_flow_table, NULL));
-       if (old_table)
-               kvfree_rcu_mightsleep(old_table);
+       tag_ptr = xchg(&queue->rps_flow_table, 0UL);
+       if (tag_ptr)
+               kvfree_rcu_mightsleep(rps_tag_to_table(tag_ptr));
 #endif
 
        memset(kobj, 0, sizeof(*kobj));