git.ipfire.org Git - thirdparty/kernel/linux.git/commitdiff
netkit: Add single device mode for netkit
author Daniel Borkmann <daniel@iogearbox.net>
Thu, 2 Apr 2026 23:10:27 +0000 (01:10 +0200)
committer Jakub Kicinski <kuba@kernel.org>
Fri, 10 Apr 2026 01:21:47 +0000 (18:21 -0700)
Add a single device mode for netkit instead of netkit pairs. The primary
target for the paired devices is to connect network namespaces, of course,
and support has been implemented in projects like Cilium [0]. For rxq
leasing, the plan is to support two main scenarios related to single device
mode:

* For the use-case of io_uring zero-copy, the control plane can either
  set up a netkit pair where the peer device can perform rxq leasing which
  is then tied to the lifetime of the peer device, or the control plane
  can use a regular netkit pair to connect the hostns to a Pod/container
  and dynamically add/remove rxq leasing through a single device without
  having to interrupt the device pair. In the case of io_uring, the memory
  pool is used as skb non-linear pages, and thus the skb will go its way
  through the regular stack into netkit. Things like the netkit policy when
  no BPF is attached, skb scrubbing, etc. apply as-is in case the paired
  devices are used, or if the backend memory is tied to the single device
  and traffic goes through a paired device.

* For the use-case of AF_XDP, the control plane needs to use netkit in the
  single device mode. The single device mode currently enforces only a
  pass policy when no BPF is attached, and does not yet support BPF link
  attachments for AF_XDP. skbs sent to that device get dropped at the
  moment. Given AF_XDP operates at a lower layer of the stack, tying this
  to the netkit pair did not make sense. In the future, the plan is to allow
  BPF at the XDP layer which can: i) process traffic coming from the AF_XDP
  application (e.g. QEMU with AF_XDP backend) to filter egress traffic or
  to push selected egress traffic up to the single netkit device to the
  local stack (e.g. DHCP requests), and ii) vice versa, push skbs sent to the
  single netkit device into the AF_XDP application (e.g. DHCP replies). Also,
  the control-plane can dynamically manage rxq leasing for the single
  netkit device without having to interrupt (e.g. down/up cycle) the main
  netkit pair for the Pod which has traffic going in and out.

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Co-developed-by: David Wei <dw@davidwei.uk>
Signed-off-by: David Wei <dw@davidwei.uk>
Reviewed-by: Jordan Rife <jordan@jrife.io>
Reviewed-by: Nikolay Aleksandrov <razor@blackwall.org>
Link: https://docs.cilium.io/en/stable/operations/performance/tuning/#netkit-device-mode [0]
Link: https://patch.msgid.link/20260402231031.447597-11-daniel@iogearbox.net
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Documentation/netlink/specs/rt-link.yaml
drivers/net/netkit.c
include/uapi/linux/if_link.h

index df4b56beb818715e41c443ea6ce27f649bd0e674..fcb5aaf0926f4bc67cb3478359daf026e1e61f3b 100644 (file)
@@ -825,6 +825,13 @@ definitions:
     entries:
       - name: none
       - name: default
+  -
+    name: netkit-pairing
+    type: enum
+    enum-name: netkit-pairing
+    entries:
+      - name: pair
+      - name: single
   -
     name: ovpn-mode
     enum-name: ovpn-mode
@@ -2299,6 +2306,10 @@ attribute-sets:
       -
         name: tailroom
         type: u16
+      -
+        name: pairing
+        type: u32
+        enum: netkit-pairing
   -
     name: linkinfo-ovpn-attrs
     name-prefix: ifla-ovpn-
index 5c0e01396e064779813bfd018dcce88e3352ddfe..96c098a6db0dc637e2c3b880bd170ae7b215f091 100644 (file)
@@ -26,6 +26,7 @@ struct netkit {
 
        __cacheline_group_begin(netkit_slowpath);
        enum netkit_mode mode;
+       enum netkit_pairing pair;
        bool primary;
        u32 headroom;
        __cacheline_group_end(netkit_slowpath);
@@ -135,6 +136,10 @@ static int netkit_open(struct net_device *dev)
        struct netkit *nk = netkit_priv(dev);
        struct net_device *peer = rtnl_dereference(nk->peer);
 
+       if (nk->pair == NETKIT_DEVICE_SINGLE) {
+               netif_carrier_on(dev);
+               return 0;
+       }
        if (!peer)
                return -ENOTCONN;
        if (peer->flags & IFF_UP) {
@@ -194,16 +199,17 @@ static void netkit_set_headroom(struct net_device *dev, int headroom)
 
        rcu_read_lock();
        peer = rcu_dereference(nk->peer);
-       if (unlikely(!peer))
-               goto out;
-
-       nk2 = netkit_priv(peer);
-       nk->headroom = headroom;
-       headroom = max(nk->headroom, nk2->headroom);
+       if (!peer) {
+               nk->headroom = headroom;
+               dev->needed_headroom = headroom;
+       } else {
+               nk2 = netkit_priv(peer);
+               nk->headroom = headroom;
+               headroom = max(nk->headroom, nk2->headroom);
 
-       peer->needed_headroom = headroom;
-       dev->needed_headroom = headroom;
-out:
+               peer->needed_headroom = headroom;
+               dev->needed_headroom = headroom;
+       }
        rcu_read_unlock();
 }
 
@@ -335,15 +341,17 @@ static int netkit_new_link(struct net_device *dev,
        enum netkit_scrub scrub_prim = NETKIT_SCRUB_DEFAULT;
        enum netkit_scrub scrub_peer = NETKIT_SCRUB_DEFAULT;
        struct nlattr *peer_tb[IFLA_MAX + 1], **tbp, *attr;
+       enum netkit_pairing pair = NETKIT_DEVICE_PAIR;
        enum netkit_action policy_prim = NETKIT_PASS;
        enum netkit_action policy_peer = NETKIT_PASS;
+       bool seen_peer = false, seen_scrub = false;
        struct nlattr **data = params->data;
        enum netkit_mode mode = NETKIT_L3;
        unsigned char ifname_assign_type;
        struct nlattr **tb = params->tb;
        u16 headroom = 0, tailroom = 0;
        struct ifinfomsg *ifmp = NULL;
-       struct net_device *peer;
+       struct net_device *peer = NULL;
        char ifname[IFNAMSIZ];
        struct netkit *nk;
        int err;
@@ -380,6 +388,13 @@ static int netkit_new_link(struct net_device *dev,
                        headroom = nla_get_u16(data[IFLA_NETKIT_HEADROOM]);
                if (data[IFLA_NETKIT_TAILROOM])
                        tailroom = nla_get_u16(data[IFLA_NETKIT_TAILROOM]);
+               if (data[IFLA_NETKIT_PAIRING])
+                       pair = nla_get_u32(data[IFLA_NETKIT_PAIRING]);
+
+               seen_scrub = data[IFLA_NETKIT_SCRUB];
+               seen_peer = data[IFLA_NETKIT_PEER_INFO] ||
+                           data[IFLA_NETKIT_PEER_SCRUB] ||
+                           data[IFLA_NETKIT_PEER_POLICY];
        }
 
        if (ifmp && tbp[IFLA_IFNAME]) {
@@ -392,45 +407,47 @@ static int netkit_new_link(struct net_device *dev,
        if (mode != NETKIT_L2 &&
            (tb[IFLA_ADDRESS] || tbp[IFLA_ADDRESS]))
                return -EOPNOTSUPP;
+       if (pair == NETKIT_DEVICE_SINGLE &&
+           (tb != tbp || seen_peer || seen_scrub ||
+            policy_prim != NETKIT_PASS))
+               return -EOPNOTSUPP;
 
-       peer = rtnl_create_link(peer_net, ifname, ifname_assign_type,
-                               &netkit_link_ops, tbp, extack);
-       if (IS_ERR(peer))
-               return PTR_ERR(peer);
-
-       netif_inherit_tso_max(peer, dev);
-       if (headroom) {
-               peer->needed_headroom = headroom;
-               dev->needed_headroom = headroom;
-       }
-       if (tailroom) {
-               peer->needed_tailroom = tailroom;
-               dev->needed_tailroom = tailroom;
-       }
-
-       if (mode == NETKIT_L2 && !(ifmp && tbp[IFLA_ADDRESS]))
-               eth_hw_addr_random(peer);
-       if (ifmp && dev->ifindex)
-               peer->ifindex = ifmp->ifi_index;
-
-       nk = netkit_priv(peer);
-       nk->primary = false;
-       nk->policy = policy_peer;
-       nk->scrub = scrub_peer;
-       nk->mode = mode;
-       nk->headroom = headroom;
-       bpf_mprog_bundle_init(&nk->bundle);
+       if (pair == NETKIT_DEVICE_PAIR) {
+               peer = rtnl_create_link(peer_net, ifname, ifname_assign_type,
+                                       &netkit_link_ops, tbp, extack);
+               if (IS_ERR(peer))
+                       return PTR_ERR(peer);
+
+               netif_inherit_tso_max(peer, dev);
+               if (headroom)
+                       peer->needed_headroom = headroom;
+               if (tailroom)
+                       peer->needed_tailroom = tailroom;
+               if (mode == NETKIT_L2 && !(ifmp && tbp[IFLA_ADDRESS]))
+                       eth_hw_addr_random(peer);
+               if (ifmp && dev->ifindex)
+                       peer->ifindex = ifmp->ifi_index;
 
-       err = register_netdevice(peer);
-       if (err < 0)
-               goto err_register_peer;
-       netif_carrier_off(peer);
-       if (mode == NETKIT_L2)
-               dev_change_flags(peer, peer->flags & ~IFF_NOARP, NULL);
+               nk = netkit_priv(peer);
+               nk->primary = false;
+               nk->policy = policy_peer;
+               nk->scrub = scrub_peer;
+               nk->mode = mode;
+               nk->pair = pair;
+               nk->headroom = headroom;
+               bpf_mprog_bundle_init(&nk->bundle);
+
+               err = register_netdevice(peer);
+               if (err < 0)
+                       goto err_register_peer;
+               netif_carrier_off(peer);
+               if (mode == NETKIT_L2)
+                       dev_change_flags(peer, peer->flags & ~IFF_NOARP, NULL);
 
-       err = rtnl_configure_link(peer, NULL, 0, NULL);
-       if (err < 0)
-               goto err_configure_peer;
+               err = rtnl_configure_link(peer, NULL, 0, NULL);
+               if (err < 0)
+                       goto err_configure_peer;
+       }
 
        if (mode == NETKIT_L2 && !tb[IFLA_ADDRESS])
                eth_hw_addr_random(dev);
@@ -438,12 +455,17 @@ static int netkit_new_link(struct net_device *dev,
                nla_strscpy(dev->name, tb[IFLA_IFNAME], IFNAMSIZ);
        else
                strscpy(dev->name, "nk%d", IFNAMSIZ);
+       if (headroom)
+               dev->needed_headroom = headroom;
+       if (tailroom)
+               dev->needed_tailroom = tailroom;
 
        nk = netkit_priv(dev);
        nk->primary = true;
        nk->policy = policy_prim;
        nk->scrub = scrub_prim;
        nk->mode = mode;
+       nk->pair = pair;
        nk->headroom = headroom;
        bpf_mprog_bundle_init(&nk->bundle);
 
@@ -455,10 +477,12 @@ static int netkit_new_link(struct net_device *dev,
                dev_change_flags(dev, dev->flags & ~IFF_NOARP, NULL);
 
        rcu_assign_pointer(netkit_priv(dev)->peer, peer);
-       rcu_assign_pointer(netkit_priv(peer)->peer, dev);
+       if (peer)
+               rcu_assign_pointer(netkit_priv(peer)->peer, dev);
        return 0;
 err_configure_peer:
-       unregister_netdevice(peer);
+       if (peer)
+               unregister_netdevice(peer);
        return err;
 err_register_peer:
        free_netdev(peer);
@@ -518,6 +542,8 @@ static struct net_device *netkit_dev_fetch(struct net *net, u32 ifindex, u32 whi
        nk = netkit_priv(dev);
        if (!nk->primary)
                return ERR_PTR(-EACCES);
+       if (nk->pair == NETKIT_DEVICE_SINGLE)
+               return ERR_PTR(-EOPNOTSUPP);
        if (which == BPF_NETKIT_PEER) {
                dev = rcu_dereference_rtnl(nk->peer);
                if (!dev)
@@ -879,6 +905,7 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[],
                { IFLA_NETKIT_PEER_INFO,  "peer info" },
                { IFLA_NETKIT_HEADROOM,   "headroom" },
                { IFLA_NETKIT_TAILROOM,   "tailroom" },
+               { IFLA_NETKIT_PAIRING,    "pairing" },
        };
 
        if (!nk->primary) {
@@ -898,9 +925,11 @@ static int netkit_change_link(struct net_device *dev, struct nlattr *tb[],
        }
 
        if (data[IFLA_NETKIT_POLICY]) {
+               err = -EOPNOTSUPP;
                attr = data[IFLA_NETKIT_POLICY];
                policy = nla_get_u32(attr);
-               err = netkit_check_policy(policy, attr, extack);
+               if (nk->pair == NETKIT_DEVICE_PAIR)
+                       err = netkit_check_policy(policy, attr, extack);
                if (err)
                        return err;
                WRITE_ONCE(nk->policy, policy);
@@ -931,6 +960,7 @@ static size_t netkit_get_size(const struct net_device *dev)
               nla_total_size(sizeof(u8))  + /* IFLA_NETKIT_PRIMARY */
               nla_total_size(sizeof(u16)) + /* IFLA_NETKIT_HEADROOM */
               nla_total_size(sizeof(u16)) + /* IFLA_NETKIT_TAILROOM */
+              nla_total_size(sizeof(u32)) + /* IFLA_NETKIT_PAIRING */
               0;
 }
 
@@ -951,6 +981,8 @@ static int netkit_fill_info(struct sk_buff *skb, const struct net_device *dev)
                return -EMSGSIZE;
        if (nla_put_u16(skb, IFLA_NETKIT_TAILROOM, dev->needed_tailroom))
                return -EMSGSIZE;
+       if (nla_put_u32(skb, IFLA_NETKIT_PAIRING, nk->pair))
+               return -EMSGSIZE;
 
        if (peer) {
                nk = netkit_priv(peer);
@@ -972,6 +1004,7 @@ static const struct nla_policy netkit_policy[IFLA_NETKIT_MAX + 1] = {
        [IFLA_NETKIT_TAILROOM]          = { .type = NLA_U16 },
        [IFLA_NETKIT_SCRUB]             = NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT),
        [IFLA_NETKIT_PEER_SCRUB]        = NLA_POLICY_MAX(NLA_U32, NETKIT_SCRUB_DEFAULT),
+       [IFLA_NETKIT_PAIRING]           = NLA_POLICY_MAX(NLA_U32, NETKIT_DEVICE_SINGLE),
        [IFLA_NETKIT_PRIMARY]           = { .type = NLA_REJECT,
                                            .reject_message = "Primary attribute is read-only" },
 };
index 83a96c56b8cad81d676c46e8a09e6c69619a0ec4..280bb178051232f9630deb2d41f96285790f3ef8 100644 (file)
@@ -1296,6 +1296,11 @@ enum netkit_mode {
        NETKIT_L3,
 };
 
+enum netkit_pairing {
+       NETKIT_DEVICE_PAIR,
+       NETKIT_DEVICE_SINGLE,
+};
+
 /* NETKIT_SCRUB_NONE leaves clearing skb->{mark,priority} up to
  * the BPF program if attached. This also means the latter can
  * consume the two fields if they were populated earlier.
@@ -1320,6 +1325,7 @@ enum {
        IFLA_NETKIT_PEER_SCRUB,
        IFLA_NETKIT_HEADROOM,
        IFLA_NETKIT_TAILROOM,
+       IFLA_NETKIT_PAIRING,
        __IFLA_NETKIT_MAX,
 };
 #define IFLA_NETKIT_MAX        (__IFLA_NETKIT_MAX - 1)