]> git.ipfire.org Git - thirdparty/kernel/stable.git/commitdiff
tcp/udp: Make early_demux back namespacified.
authorKuniyuki Iwashima <kuniyu@amazon.com>
Wed, 13 Jul 2022 17:52:07 +0000 (10:52 -0700)
committerGreg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 10 Nov 2022 14:47:22 +0000 (15:47 +0100)
commit 11052589cf5c0bab3b4884d423d5f60c38fcf25d upstream.

Commit e21145a9871a ("ipv4: namespacify ip_early_demux sysctl knob") made
it possible to enable/disable early_demux on a per-netns basis.  Then, we
introduced two knobs, tcp_early_demux and udp_early_demux, to switch it for
TCP/UDP in commit dddb64bcb346 ("net: Add sysctl to toggle early demux for
tcp and udp").  However, the .proc_handler() was wrong and actually
disabled us from changing the behaviour in each netns.

We can execute early_demux if net.ipv4.ip_early_demux is on and each proto
.early_demux() handler is not NULL.  When we toggle (tcp|udp)_early_demux,
the change itself is saved in each netns variable, but the .early_demux()
handler is a global variable, so the handler is switched based on the
init_net's sysctl variable.  Thus, netns (tcp|udp)_early_demux knobs have
nothing to do with the logic.  Whether we CAN execute proto .early_demux()
is always decided by init_net's sysctl knob, and whether we DO it or not is
by each netns ip_early_demux knob.

This patch namespacifies (tcp|udp)_early_demux again.  For now, the users
of the .early_demux() handler are TCP and UDP only, and they are called
directly to avoid retpoline.  So, we can remove the .early_demux() handler
from inet6?_protos and need not dereference them in ip6?_rcv_finish_core().
If another proto needs .early_demux(), we can restore it at that time.

Fixes: dddb64bcb346 ("net: Add sysctl to toggle early demux for tcp and udp")
Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
Link: https://lore.kernel.org/r/20220713175207.7727-1-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
include/net/protocol.h
include/net/tcp.h
include/net/udp.h
net/ipv4/af_inet.c
net/ipv4/ip_input.c
net/ipv4/sysctl_net_ipv4.c
net/ipv6/ip6_input.c
net/ipv6/tcp_ipv6.c
net/ipv6/udp.c

index 4fc75f7ae23beb47674b04df27ac6ef4594679fe..312a27393e0b7863af6b7f43e00e1fd3685b8dcf 100644 (file)
@@ -39,8 +39,6 @@
 
 /* This is used to register protocols. */
 struct net_protocol {
-       int                     (*early_demux)(struct sk_buff *skb);
-       int                     (*early_demux_handler)(struct sk_buff *skb);
        int                     (*handler)(struct sk_buff *skb);
        void                    (*err_handler)(struct sk_buff *skb, u32 info);
        unsigned int            no_policy:1,
@@ -54,8 +52,6 @@ struct net_protocol {
 
 #if IS_ENABLED(CONFIG_IPV6)
 struct inet6_protocol {
-       void    (*early_demux)(struct sk_buff *skb);
-       void    (*early_demux_handler)(struct sk_buff *skb);
        int     (*handler)(struct sk_buff *skb);
 
        void    (*err_handler)(struct sk_buff *skb,
index 003638f73ffad35691d4185b8dc2ab08183814df..4f97c0e2d5f3455375ceb04bd1644980291f14aa 100644 (file)
@@ -890,6 +890,8 @@ static inline int tcp_v6_sdif(const struct sk_buff *skb)
 #endif
        return 0;
 }
+
+void tcp_v6_early_demux(struct sk_buff *skb);
 #endif
 
 static inline bool inet_exact_dif_match(struct net *net, struct sk_buff *skb)
index 18391015233ef26daf3651b11f112ec1a63bf790..07135de001668e221e5a6deefbc840a1a3590cfa 100644 (file)
@@ -173,6 +173,7 @@ typedef struct sock *(*udp_lookup_t)(struct sk_buff *skb, __be16 sport,
 struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
                                 struct udphdr *uh, udp_lookup_t lookup);
 int udp_gro_complete(struct sk_buff *skb, int nhoff, udp_lookup_t lookup);
+void udp_v6_early_demux(struct sk_buff *skb);
 
 static inline struct udphdr *udp_gro_udphdr(struct sk_buff *skb)
 {
index 06ad21597d68e14f0474a7e4d997c2c0a4850d22..1eee002b12d252d1b207a2464cbed4e70742e8de 100644 (file)
@@ -1608,12 +1608,7 @@ static const struct net_protocol igmp_protocol = {
 };
 #endif
 
-/* thinking of making this const? Don't.
- * early_demux can change based on sysctl.
- */
-static struct net_protocol tcp_protocol = {
-       .early_demux    =       tcp_v4_early_demux,
-       .early_demux_handler =  tcp_v4_early_demux,
+static const struct net_protocol tcp_protocol = {
        .handler        =       tcp_v4_rcv,
        .err_handler    =       tcp_v4_err,
        .no_policy      =       1,
@@ -1621,12 +1616,7 @@ static struct net_protocol tcp_protocol = {
        .icmp_strict_tag_validation = 1,
 };
 
-/* thinking of making this const? Don't.
- * early_demux can change based on sysctl.
- */
-static struct net_protocol udp_protocol = {
-       .early_demux =  udp_v4_early_demux,
-       .early_demux_handler =  udp_v4_early_demux,
+static const struct net_protocol udp_protocol = {
        .handler =      udp_rcv,
        .err_handler =  udp_err,
        .no_policy =    1,
index 6fc45d3a1f8a921ba2428a299ca63fb395afbf89..155476df8f9a77584bb1f83163f2597f5791b23d 100644 (file)
@@ -307,10 +307,11 @@ drop:
        return true;
 }
 
+int udp_v4_early_demux(struct sk_buff *);
+int tcp_v4_early_demux(struct sk_buff *);
 static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
        const struct iphdr *iph = ip_hdr(skb);
-       int (*edemux)(struct sk_buff *skb);
        struct net_device *dev = skb->dev;
        struct rtable *rt;
        int err;
@@ -322,20 +323,29 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
        if (!skb)
                return NET_RX_SUCCESS;
 
-       if (net->ipv4.sysctl_ip_early_demux &&
+       if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) &&
            !skb_dst(skb) &&
            !skb->sk &&
            !ip_is_fragment(iph)) {
-               const struct net_protocol *ipprot;
-               int protocol = iph->protocol;
+               switch (iph->protocol) {
+               case IPPROTO_TCP:
+                       if (READ_ONCE(net->ipv4.sysctl_tcp_early_demux)) {
+                               tcp_v4_early_demux(skb);
 
-               ipprot = rcu_dereference(inet_protos[protocol]);
-               if (ipprot && (edemux = READ_ONCE(ipprot->early_demux))) {
-                       err = edemux(skb);
-                       if (unlikely(err))
-                               goto drop_error;
-                       /* must reload iph, skb->head might have changed */
-                       iph = ip_hdr(skb);
+                               /* must reload iph, skb->head might have changed */
+                               iph = ip_hdr(skb);
+                       }
+                       break;
+               case IPPROTO_UDP:
+                       if (READ_ONCE(net->ipv4.sysctl_udp_early_demux)) {
+                               err = udp_v4_early_demux(skb);
+                               if (unlikely(err))
+                                       goto drop_error;
+
+                               /* must reload iph, skb->head might have changed */
+                               iph = ip_hdr(skb);
+                       }
+                       break;
                }
        }
 
index 78771272f61363c48bd50bb2b71b9cef47dcd7dc..df118d5b1e28f3d1e8eaf2c1b0927e59bba3ac8b 100644 (file)
@@ -311,61 +311,6 @@ bad_key:
        return ret;
 }
 
-static void proc_configure_early_demux(int enabled, int protocol)
-{
-       struct net_protocol *ipprot;
-#if IS_ENABLED(CONFIG_IPV6)
-       struct inet6_protocol *ip6prot;
-#endif
-
-       rcu_read_lock();
-
-       ipprot = rcu_dereference(inet_protos[protocol]);
-       if (ipprot)
-               ipprot->early_demux = enabled ? ipprot->early_demux_handler :
-                                               NULL;
-
-#if IS_ENABLED(CONFIG_IPV6)
-       ip6prot = rcu_dereference(inet6_protos[protocol]);
-       if (ip6prot)
-               ip6prot->early_demux = enabled ? ip6prot->early_demux_handler :
-                                                NULL;
-#endif
-       rcu_read_unlock();
-}
-
-static int proc_tcp_early_demux(struct ctl_table *table, int write,
-                               void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-       int ret = 0;
-
-       ret = proc_dointvec(table, write, buffer, lenp, ppos);
-
-       if (write && !ret) {
-               int enabled = init_net.ipv4.sysctl_tcp_early_demux;
-
-               proc_configure_early_demux(enabled, IPPROTO_TCP);
-       }
-
-       return ret;
-}
-
-static int proc_udp_early_demux(struct ctl_table *table, int write,
-                               void __user *buffer, size_t *lenp, loff_t *ppos)
-{
-       int ret = 0;
-
-       ret = proc_dointvec(table, write, buffer, lenp, ppos);
-
-       if (write && !ret) {
-               int enabled = init_net.ipv4.sysctl_udp_early_demux;
-
-               proc_configure_early_demux(enabled, IPPROTO_UDP);
-       }
-
-       return ret;
-}
-
 static int proc_tfo_blackhole_detect_timeout(struct ctl_table *table,
                                             int write,
                                             void __user *buffer,
@@ -853,14 +798,14 @@ static struct ctl_table ipv4_net_table[] = {
                .data           = &init_net.ipv4.sysctl_udp_early_demux,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-               .proc_handler   = proc_udp_early_demux
+               .proc_handler   = proc_douintvec,
        },
        {
                .procname       = "tcp_early_demux",
                .data           = &init_net.ipv4.sysctl_tcp_early_demux,
                .maxlen         = sizeof(int),
                .mode           = 0644,
-               .proc_handler   = proc_tcp_early_demux
+               .proc_handler   = proc_douintvec,
        },
        {
                .procname       = "ip_default_ttl",
index 9ee208a348f559b27dcc56ee80bf9ad51630cb45..62b5dac7be30169e4b377cb5c4977ec2e61971c3 100644 (file)
 #include <net/inet_ecn.h>
 #include <net/dst_metadata.h>
 
+void udp_v6_early_demux(struct sk_buff *);
+void tcp_v6_early_demux(struct sk_buff *);
 int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
 {
-       void (*edemux)(struct sk_buff *skb);
-
        /* if ingress device is enslaved to an L3 master device pass the
         * skb to its handler for processing
         */
@@ -58,13 +58,20 @@ int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
        if (!skb)
                return NET_RX_SUCCESS;
 
-       if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) {
-               const struct inet6_protocol *ipprot;
-
-               ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]);
-               if (ipprot && (edemux = READ_ONCE(ipprot->early_demux)))
-                       edemux(skb);
+       if (READ_ONCE(net->ipv4.sysctl_ip_early_demux) &&
+           !skb_dst(skb) && !skb->sk) {
+               switch (ipv6_hdr(skb)->nexthdr) {
+               case IPPROTO_TCP:
+                       if (READ_ONCE(net->ipv4.sysctl_tcp_early_demux))
+                               tcp_v6_early_demux(skb);
+                       break;
+               case IPPROTO_UDP:
+                       if (READ_ONCE(net->ipv4.sysctl_udp_early_demux))
+                               udp_v6_early_demux(skb);
+                       break;
+               }
        }
+
        if (!skb_valid_dst(skb))
                ip6_route_input(skb);
 
index 711e5565e3a5b233f42fec97a223d88e5a9eabdf..4ef55062d37c784d1a6898b0b33a7a63b2ce6cb4 100644 (file)
@@ -1635,7 +1635,7 @@ do_time_wait:
        goto discard_it;
 }
 
-static void tcp_v6_early_demux(struct sk_buff *skb)
+void tcp_v6_early_demux(struct sk_buff *skb)
 {
        const struct ipv6hdr *hdr;
        const struct tcphdr *th;
@@ -1991,12 +1991,7 @@ struct proto tcpv6_prot = {
        .diag_destroy           = tcp_abort,
 };
 
-/* thinking of making this const? Don't.
- * early_demux can change based on sysctl.
- */
-static struct inet6_protocol tcpv6_protocol = {
-       .early_demux    =       tcp_v6_early_demux,
-       .early_demux_handler =  tcp_v6_early_demux,
+static const struct inet6_protocol tcpv6_protocol = {
        .handler        =       tcp_v6_rcv,
        .err_handler    =       tcp_v6_err,
        .flags          =       INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,
index 685efa6a8c32d54bd8cbbb4e2f91ba8fd7d017c5..0d4f82f9ebfd5f296582143a04e3f0a5c8fa626b 100644 (file)
@@ -932,7 +932,7 @@ static struct sock *__udp6_lib_demux_lookup(struct net *net,
        return NULL;
 }
 
-static void udp_v6_early_demux(struct sk_buff *skb)
+void udp_v6_early_demux(struct sk_buff *skb)
 {
        struct net *net = dev_net(skb->dev);
        const struct udphdr *uh;
@@ -1491,12 +1491,7 @@ int compat_udpv6_getsockopt(struct sock *sk, int level, int optname,
 }
 #endif
 
-/* thinking of making this const? Don't.
- * early_demux can change based on sysctl.
- */
-static struct inet6_protocol udpv6_protocol = {
-       .early_demux    =       udp_v6_early_demux,
-       .early_demux_handler =  udp_v6_early_demux,
+static const struct inet6_protocol udpv6_protocol = {
        .handler        =       udpv6_rcv,
        .err_handler    =       udpv6_err,
        .flags          =       INET6_PROTO_NOPOLICY|INET6_PROTO_FINAL,