git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
net: introduce per netns packet chains
author Paolo Abeni <pabeni@redhat.com>
Thu, 20 Mar 2025 18:22:38 +0000 (19:22 +0100)
committer Jakub Kicinski <kuba@kernel.org>
Mon, 24 Mar 2025 20:58:22 +0000 (13:58 -0700)
Currently network taps unbound to any interface are linked in the
global ptype_all list, affecting the performance in all the network
namespaces.

Add per netns ptype chains, so that in the mentioned case only
the netns owning the packet socket(s) is affected.

While at it, drop the global ptype_all list: no in-kernel user
registers a tap on "any" type without specifying either the target
device or the target namespace (and IMHO doing that would not make
any sense).

Note that this adds a conditional in the fast path (to check for
the per netns ptype_specific list) and increases the dataset size by
a cacheline (owing to the per netns lists).

Reviewed-by: Sabrina Dubroca <sd@queasysnail.net>
Signed-off-by: Paolo Abeni <pabeni@redhat.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Link: https://patch.msgid.link/ae405f98875ee87f8150c460ad162de7e466f8a7.1742494826.git.pabeni@redhat.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
include/linux/netdevice.h
include/net/hotdata.h
include/net/net_namespace.h
net/core/dev.c
net/core/hotdata.c
net/core/net-procfs.c
net/core/net_namespace.c

index 0c5b1f7f8f3af8049ee7aa25817351e998bf6354..f22cca7c03add6b035afb1a51b5978087507e680 100644 (file)
@@ -4278,7 +4278,17 @@ static __always_inline int ____dev_forward_skb(struct net_device *dev,
        return 0;
 }
 
-bool dev_nit_active(struct net_device *dev);
+bool dev_nit_active_rcu(const struct net_device *dev);
+static inline bool dev_nit_active(const struct net_device *dev)
+{
+       bool ret;
+
+       rcu_read_lock();
+       ret = dev_nit_active_rcu(dev);
+       rcu_read_unlock();
+       return ret;
+}
+
 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev);
 
 static inline void __dev_put(struct net_device *dev)
index 30e9570beb2afbcb4ece641d6042bdc0de80bd38..fda94b2647ffa242c256c95ae929d9ef25e54f96 100644 (file)
@@ -23,7 +23,6 @@ struct net_hotdata {
        struct net_offload      udpv6_offload;
 #endif
        struct list_head        offload_base;
-       struct list_head        ptype_all;
        struct kmem_cache       *skbuff_cache;
        struct kmem_cache       *skbuff_fclone_cache;
        struct kmem_cache       *skb_small_head_cache;
index f467a66abc6b16b690a99037a3dea2e355910661..bd57d8fb54f145e1f041b8ab25985a47c029e757 100644 (file)
@@ -83,6 +83,9 @@ struct net {
        struct llist_node       defer_free_list;
        struct llist_node       cleanup_list;   /* namespaces on death row */
 
+       struct list_head ptype_all;
+       struct list_head ptype_specific;
+
 #ifdef CONFIG_KEYS
        struct key_tag          *key_domain;    /* Key domain of operation tag */
 #endif
index 2355603417650fe10d075c8e85416a488e00626d..bcf81c3ff6a32d56cf0613a4165677492987dd4d 100644 (file)
@@ -572,10 +572,18 @@ static inline void netdev_set_addr_lockdep_class(struct net_device *dev)
 
 static inline struct list_head *ptype_head(const struct packet_type *pt)
 {
-       if (pt->type == htons(ETH_P_ALL))
-               return pt->dev ? &pt->dev->ptype_all : &net_hotdata.ptype_all;
-       else
-               return pt->dev ? &pt->dev->ptype_specific :
+       if (pt->type == htons(ETH_P_ALL)) {
+               if (!pt->af_packet_net && !pt->dev)
+                       return NULL;
+
+               return pt->dev ? &pt->dev->ptype_all :
+                                &pt->af_packet_net->ptype_all;
+       }
+
+       if (pt->dev)
+               return &pt->dev->ptype_specific;
+
+       return pt->af_packet_net ? &pt->af_packet_net->ptype_specific :
                                 &ptype_base[ntohs(pt->type) & PTYPE_HASH_MASK];
 }
 
@@ -596,6 +604,9 @@ void dev_add_pack(struct packet_type *pt)
 {
        struct list_head *head = ptype_head(pt);
 
+       if (WARN_ON_ONCE(!head))
+               return;
+
        spin_lock(&ptype_lock);
        list_add_rcu(&pt->list, head);
        spin_unlock(&ptype_lock);
@@ -620,6 +631,9 @@ void __dev_remove_pack(struct packet_type *pt)
        struct list_head *head = ptype_head(pt);
        struct packet_type *pt1;
 
+       if (!head)
+               return;
+
        spin_lock(&ptype_lock);
 
        list_for_each_entry(pt1, head, list) {
@@ -2441,16 +2455,21 @@ static inline bool skb_loop_sk(struct packet_type *ptype, struct sk_buff *skb)
 }
 
 /**
- * dev_nit_active - return true if any network interface taps are in use
+ * dev_nit_active_rcu - return true if any network interface taps are in use
+ *
+ * The caller must hold the RCU lock
  *
  * @dev: network device to check for the presence of taps
  */
-bool dev_nit_active(struct net_device *dev)
+bool dev_nit_active_rcu(const struct net_device *dev)
 {
-       return !list_empty(&net_hotdata.ptype_all) ||
+       /* Callers may hold either RCU or RCU BH lock */
+       WARN_ON_ONCE(!rcu_read_lock_held() && !rcu_read_lock_bh_held());
+
+       return !list_empty(&dev_net(dev)->ptype_all) ||
               !list_empty(&dev->ptype_all);
 }
-EXPORT_SYMBOL_GPL(dev_nit_active);
+EXPORT_SYMBOL_GPL(dev_nit_active_rcu);
 
 /*
  *     Support routine. Sends outgoing frames to any network
@@ -2459,11 +2478,12 @@ EXPORT_SYMBOL_GPL(dev_nit_active);
 
 void dev_queue_xmit_nit(struct sk_buff *skb, struct net_device *dev)
 {
-       struct list_head *ptype_list = &net_hotdata.ptype_all;
        struct packet_type *ptype, *pt_prev = NULL;
+       struct list_head *ptype_list;
        struct sk_buff *skb2 = NULL;
 
        rcu_read_lock();
+       ptype_list = &dev_net_rcu(dev)->ptype_all;
 again:
        list_for_each_entry_rcu(ptype, ptype_list, list) {
                if (READ_ONCE(ptype->ignore_outgoing))
@@ -2507,7 +2527,7 @@ again:
                pt_prev = ptype;
        }
 
-       if (ptype_list == &net_hotdata.ptype_all) {
+       if (ptype_list != &dev->ptype_all) {
                ptype_list = &dev->ptype_all;
                goto again;
        }
@@ -3752,7 +3772,7 @@ static int xmit_one(struct sk_buff *skb, struct net_device *dev,
        unsigned int len;
        int rc;
 
-       if (dev_nit_active(dev))
+       if (dev_nit_active_rcu(dev))
                dev_queue_xmit_nit(skb, dev);
 
        len = skb->len;
@@ -5696,7 +5716,8 @@ another_round:
        if (pfmemalloc)
                goto skip_taps;
 
-       list_for_each_entry_rcu(ptype, &net_hotdata.ptype_all, list) {
+       list_for_each_entry_rcu(ptype, &dev_net_rcu(skb->dev)->ptype_all,
+                               list) {
                if (pt_prev)
                        ret = deliver_skb(skb, pt_prev, orig_dev);
                pt_prev = ptype;
@@ -5808,6 +5829,14 @@ check_vlan_id:
                deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
                                       &ptype_base[ntohs(type) &
                                                   PTYPE_HASH_MASK]);
+
+               /* orig_dev and skb->dev could belong to different netns;
+                * Even in such case we need to traverse only the list
+                * coming from skb->dev, as the ptype owner (packet socket)
+                * will use dev_net(skb->dev) to do namespace filtering.
+                */
+               deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
+                                      &dev_net_rcu(skb->dev)->ptype_specific);
        }
 
        deliver_ptype_list_skb(skb, &pt_prev, orig_dev, type,
index d0aaaaa556f229ded4e1997bf814a2b690b46920..0bc893d5f07b03b31e08967a2238f63d218020d7 100644 (file)
@@ -7,7 +7,6 @@
 
 struct net_hotdata net_hotdata __cacheline_aligned = {
        .offload_base = LIST_HEAD_INIT(net_hotdata.offload_base),
-       .ptype_all = LIST_HEAD_INIT(net_hotdata.ptype_all),
        .gro_normal_batch = 8,
 
        .netdev_budget = 300,
index fa6d3969734a6ec154c3444d1b25ee93edfc5588..3e92bf0f9060b19a3d9b0ace6bb4005352fedafe 100644 (file)
@@ -185,7 +185,13 @@ static void *ptype_get_idx(struct seq_file *seq, loff_t pos)
                }
        }
 
-       list_for_each_entry_rcu(pt, &net_hotdata.ptype_all, list) {
+       list_for_each_entry_rcu(pt, &seq_file_net(seq)->ptype_all, list) {
+               if (i == pos)
+                       return pt;
+               ++i;
+       }
+
+       list_for_each_entry_rcu(pt, &seq_file_net(seq)->ptype_specific, list) {
                if (i == pos)
                        return pt;
                ++i;
@@ -210,6 +216,7 @@ static void *ptype_seq_start(struct seq_file *seq, loff_t *pos)
 
 static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 {
+       struct net *net = seq_file_net(seq);
        struct net_device *dev;
        struct packet_type *pt;
        struct list_head *nxt;
@@ -232,15 +239,22 @@ static void *ptype_seq_next(struct seq_file *seq, void *v, loff_t *pos)
                                goto found;
                        }
                }
-
-               nxt = net_hotdata.ptype_all.next;
-               goto ptype_all;
+               nxt = net->ptype_all.next;
+               goto net_ptype_all;
        }
 
-       if (pt->type == htons(ETH_P_ALL)) {
-ptype_all:
-               if (nxt != &net_hotdata.ptype_all)
+       if (pt->af_packet_net) {
+net_ptype_all:
+               if (nxt != &net->ptype_all && nxt != &net->ptype_specific)
                        goto found;
+
+               if (nxt == &net->ptype_all) {
+                       /* continue with ->ptype_specific if it's not empty */
+                       nxt = net->ptype_specific.next;
+                       if (nxt != &net->ptype_specific)
+                               goto found;
+               }
+
                hash = 0;
                nxt = ptype_base[0].next;
        } else
index 4303f2a4926243e2c0ff0c0387383cd8e0658019..b0dfdf791ece5aa8fefdc2aea1ff4a9d9c399d72 100644 (file)
@@ -340,6 +340,8 @@ static __net_init void preinit_net(struct net *net, struct user_namespace *user_
        lock_set_cmp_fn(&net->rtnl_mutex, rtnl_net_lock_cmp_fn, NULL);
 #endif
 
+       INIT_LIST_HEAD(&net->ptype_all);
+       INIT_LIST_HEAD(&net->ptype_specific);
        preinit_net_sysctl(net);
 }