// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window clamping.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD;
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 */
#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/bitops.h>
#include <linux/kernel.h>
#include <linux/memblock.h>
#include <linux/socket.h>
#include <linux/errno.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst_metadata.h>
#include <net/inet_dscp.h>
#include <net/net_namespace.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/ip_fib.h>
#include <net/nexthop.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#include <linux/sysctl.h>
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>

#include "fib_lookup.h"
#define RT_GC_TIMEOUT (300*HZ)

#define DEFAULT_MIN_PMTU (512 + 20 + 20)
#define DEFAULT_MTU_EXPIRES (10 * 60 * HZ)
#define DEFAULT_MIN_ADVMSS 256
static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;

static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
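
/* With the defaults above, redirect rate limiting backs off in steps of
 * ip_rt_redirect_load = HZ/50 (about 20 ms) and resets after
 * ip_rt_redirect_silence = (HZ/50) << 10 (about 20 s) of quiet, while ICMP
 * errors draw from a 5 * HZ token bucket at a cost of HZ per packet, i.e.
 * roughly one error per second once the initial burst of five is spent
 * (see ip_rt_send_redirect() and ip_error() below).
 */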
/*
 *	Interface to generic destination cache.
 */

INDIRECT_CALLABLE_SCOPE
struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
INDIRECT_CALLABLE_SCOPE
unsigned int ipv4_mtu(const struct dst_entry *dst);
static void ipv4_negative_advice(struct sock *sk,
				 struct dst_entry *dst);
static void ipv4_link_failure(struct sk_buff *skb);
static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			      struct sk_buff *skb, u32 mtu,
			      bool confirm_neigh);
static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
			   struct sk_buff *skb);
static void ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr);
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.redirect =		ip_do_redirect,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
	.confirm_neigh =	ipv4_confirm_neigh,
};
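
/* ipv4_dst_ops is the per-family dst_entry operations table: every IPv4
 * rtable embeds a dst_entry that points back here, so the generic dst layer
 * dispatches validation, PMTU updates, redirects and neighbour lookups to
 * the handlers declared above.
 */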
#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
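
/* The table is indexed via rt_tos2priority(), i.e. with the four TOS bits
 * of the IPv4 header (ECN bit stripped, field shifted right by one), and
 * maps each TOS value to a TC_PRIO_* queueing band.
 */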
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#ifndef CONFIG_PREEMPT_RT
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
#else
#define RT_CACHE_STAT_INC(field) this_cpu_inc(rt_cache_stat.field)
#endif
#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (*pos)
		return NULL;
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	(*pos)++;
	return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_puts(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x "
			"%08x %08x %08x %08x %08x %08x "
			"%08x %08x %08x %08x\n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   0, /* st->in_hit */
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,
		   0, /* st->out_hit */
		   st->out_slow_tot,
		   st->out_slow_mc,
		   0, /* st->gc_total */
		   0, /* st->gc_ignored */
		   0, /* st->gc_goal_miss */
		   0, /* st->gc_dst_overflow */
		   0, /* st->in_hlist_search */
		   0  /* st->out_hlist_search */
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};
#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}
#endif
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_create_seq("rt_cache", 0444, net->proc_net,
			      &rt_cache_seq_ops);
	if (!pde)
		goto err1;

	pde = proc_create_seq("rt_cache", 0444, net->proc_net_stat,
			      &rt_cpu_seq_ops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create_single("rt_acct", 0, net->proc_net,
				 rt_acct_proc_show);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */
static inline bool rt_is_expired(const struct rtable *rth)
{
	bool res;

	rcu_read_lock();
	res = rth->rt_genid != rt_genid_ipv4(dev_net_rcu(rth->dst.dev));
	rcu_read_unlock();

	return res;
}

void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}
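
/* Flushing the cache is O(1): bumping the per-netns generation id does not
 * walk any table, it simply makes rt_is_expired() above return true for
 * every rtable created under the old id, so stale entries are discarded
 * lazily the next time they are validated.
 */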
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr)
{
	const struct rtable *rt = container_of(dst, struct rtable, dst);
	struct net_device *dev = dst->dev;
	struct neighbour *n;

	rcu_read_lock();

	if (likely(rt->rt_gw_family == AF_INET)) {
		n = ip_neigh_gw4(dev, rt->rt_gw4);
	} else if (rt->rt_gw_family == AF_INET6) {
		n = ip_neigh_gw6(dev, &rt->rt_gw6);
	} else {
		__be32 pkey;

		pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
		n = ip_neigh_gw4(dev, pkey);
	}

	if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
		n = NULL;

	rcu_read_unlock();

	return n;
}
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	const struct rtable *rt = container_of(dst, struct rtable, dst);
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;

	if (rt->rt_gw_family == AF_INET) {
		pkey = (const __be32 *)&rt->rt_gw4;
	} else if (rt->rt_gw_family == AF_INET6) {
		return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
	} else if (!daddr ||
		   (rt->rt_flags &
		    (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
		return;
	}
	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}
/* Hash tables of size 2048..262144 depending on RAM size.
 * Each bucket uses 8 bytes.
 */
static u32 ip_idents_mask __read_mostly;
static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
static u32 ip_idents_reserve(u32 hash, int segs)
{
	u32 bucket, old, now = (u32)jiffies;
	atomic_t *p_id;
	u32 *p_tstamp;
	u32 delta = 0;

	bucket = hash & ip_idents_mask;
	p_tstamp = ip_tstamps + bucket;
	p_id = ip_idents + bucket;
	old = READ_ONCE(*p_tstamp);

	if (old != now && cmpxchg(p_tstamp, old, now) == old)
		delta = get_random_u32_below(now - old);

	/* If UBSAN reports an error there, please make sure your compiler
	 * supports -fno-strict-overflow before reporting it that was a bug
	 * in UBSAN, and it has been fixed in GCC-8.
	 */
	return atomic_add_return(segs + delta, p_id) - segs;
}
void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
	u32 hash, id;

	/* Note the following code is not safe, but this is okay. */
	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
		get_random_bytes(&net->ipv4.ip_id_key,
				 sizeof(net->ipv4.ip_id_key));

	hash = siphash_3u32((__force u32)iph->daddr,
			    (__force u32)iph->saddr,
			    iph->protocol,
			    &net->ipv4.ip_id_key);
	id = ip_idents_reserve(hash, segs);
	iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);
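
/* ip_idents_reserve() hands out a block of 'segs' consecutive IDs with a
 * single atomic_add_return(); the random 'delta' is only mixed in when the
 * bucket has been idle for at least one jiffy, so a busy flow still gets
 * sequential IDs while an idle generator reveals little about how many
 * packets were sent in between.
 */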
static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
			     const struct sock *sk, const struct iphdr *iph,
			     int oif, __u8 tos, u8 prot, u32 mark,
			     int flow_flags)
{
	__u8 scope = RT_SCOPE_UNIVERSE;

	if (sk) {
		oif = sk->sk_bound_dev_if;
		mark = READ_ONCE(sk->sk_mark);
		tos = ip_sock_rt_tos(sk);
		scope = ip_sock_rt_scope(sk);
		prot = inet_test_bit(HDRINCL, sk) ? IPPROTO_RAW :
						    sk->sk_protocol;
	}

	flowi4_init_output(fl4, oif, mark, tos & INET_DSCP_MASK, scope,
			   prot, flow_flags, iph->daddr, iph->saddr, 0, 0,
			   sock_net_uid(net, sk));
}
static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
			       const struct sock *sk)
{
	const struct net *net = dev_net(skb->dev);
	const struct iphdr *iph = ip_hdr(skb);
	int oif = skb->dev->ifindex;
	u8 prot = iph->protocol;
	u32 mark = skb->mark;
	__u8 tos = iph->tos;

	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}
static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;
	flowi4_init_output(fl4, sk->sk_bound_dev_if, READ_ONCE(sk->sk_mark),
			   ip_sock_rt_tos(sk),
			   ip_sock_rt_scope(sk),
			   inet_test_bit(HDRINCL, sk) ?
				IPPROTO_RAW : sk->sk_protocol,
			   inet_sk_flowi_flags(sk),
			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
	rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (skb)
		build_skb_flow_key(fl4, skb, sk);
	else
		build_sk_flow_key(fl4, sk);
}
static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
	struct rtable *rt;

	rt = rcu_dereference(fnhe->fnhe_rth_input);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
	rt = rcu_dereference(fnhe->fnhe_rth_output);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
}
static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
{
	struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
	struct fib_nh_exception *fnhe, *oldest = NULL;

	for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
		fnhe = rcu_dereference_protected(*fnhe_p,
						 lockdep_is_held(&fnhe_lock));
		if (!fnhe)
			break;
		if (!oldest ||
		    time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
			oldest = fnhe;
			oldest_p = fnhe_p;
		}
	}
	fnhe_flush_routes(oldest);
	*oldest_p = oldest->fnhe_next;
	kfree_rcu(oldest, rcu);
}

static u32 fnhe_hashfun(__be32 daddr)
{
	static siphash_aligned_key_t fnhe_hash_key;
	u64 hval;

	net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
	hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
	return hash_64(hval, FNHE_HASH_SHIFT);
}
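
/* fnhe_hashfun() uses a boot-time random siphash key so that a remote peer
 * cannot aim many destination addresses at a single exception bucket;
 * hash_64() then folds the 64-bit value down to FNHE_HASH_SHIFT bits.
 */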
static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
	rt->rt_pmtu = fnhe->fnhe_pmtu;
	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
	rt->dst.expires = fnhe->fnhe_expires;

	if (fnhe->fnhe_gw) {
		rt->rt_flags |= RTCF_REDIRECTED;
		rt->rt_uses_gateway = 1;
		rt->rt_gw_family = AF_INET;
		rt->rt_gw4 = fnhe->fnhe_gw;
	}
}
static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
				  __be32 gw, u32 pmtu, bool lock,
				  unsigned long expires)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe;
	struct rtable *rt;
	u32 genid, hval;
	unsigned int i;
	int depth;

	genid = fnhe_genid(dev_net(nhc->nhc_dev));
	hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference(nhc->nhc_exceptions);
	if (!hash) {
		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
		if (!hash)
			goto out_unlock;
		rcu_assign_pointer(nhc->nhc_exceptions, hash);
	}

	hash += hval;

	depth = 0;
	for (fnhe = rcu_dereference(hash->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			break;
		depth++;
	}

	if (fnhe) {
		if (fnhe->fnhe_genid != genid)
			fnhe->fnhe_genid = genid;
		if (gw)
			fnhe->fnhe_gw = gw;
		if (pmtu) {
			fnhe->fnhe_pmtu = pmtu;
			fnhe->fnhe_mtu_locked = lock;
		}
		fnhe->fnhe_expires = max(1UL, expires);
		/* Update all cached dsts too */
		rt = rcu_dereference(fnhe->fnhe_rth_input);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
		rt = rcu_dereference(fnhe->fnhe_rth_output);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
	} else {
		/* Randomize max depth to avoid some side channels attacks. */
		int max_depth = FNHE_RECLAIM_DEPTH +
				get_random_u32_below(FNHE_RECLAIM_DEPTH);

		while (depth > max_depth) {
			fnhe_remove_oldest(hash);
			depth--;
		}

		fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
		if (!fnhe)
			goto out_unlock;

		fnhe->fnhe_next = hash->chain;

		fnhe->fnhe_genid = genid;
		fnhe->fnhe_daddr = daddr;
		fnhe->fnhe_gw = gw;
		fnhe->fnhe_pmtu = pmtu;
		fnhe->fnhe_mtu_locked = lock;
		fnhe->fnhe_expires = max(1UL, expires);

		rcu_assign_pointer(hash->chain, fnhe);

		/* Exception created; mark the cached routes for the nexthop
		 * stale, so anyone caching it rechecks if this exception
		 * applies to them.
		 */
		rt = rcu_dereference(nhc->nhc_rth_input);
		if (rt)
			rt->dst.obsolete = DST_OBSOLETE_KILL;

		for_each_possible_cpu(i) {
			struct rtable __rcu **prt;

			prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
			rt = rcu_dereference(*prt);
			if (rt)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
		}
	}

	fnhe->fnhe_stamp = jiffies;

out_unlock:
	spin_unlock_bh(&fnhe_lock);
}
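
/* Both callers appear below: __ip_do_redirect() records a learned gateway
 * (gw set, pmtu 0) with an ip_rt_gc_timeout lifetime, and
 * __ip_rt_update_pmtu() records a path MTU (gw 0, pmtu set) that expires
 * after net->ipv4.ip_rt_mtu_expires unless it is refreshed.
 */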
737 static void __ip_do_redirect(struct rtable
*rt
, struct sk_buff
*skb
, struct flowi4
*fl4
,
740 __be32 new_gw
= icmp_hdr(skb
)->un
.gateway
;
741 __be32 old_gw
= ip_hdr(skb
)->saddr
;
742 struct net_device
*dev
= skb
->dev
;
743 struct in_device
*in_dev
;
744 struct fib_result res
;
748 switch (icmp_hdr(skb
)->code
& 7) {
750 case ICMP_REDIR_NETTOS
:
751 case ICMP_REDIR_HOST
:
752 case ICMP_REDIR_HOSTTOS
:
759 if (rt
->rt_gw_family
!= AF_INET
|| rt
->rt_gw4
!= old_gw
)
762 in_dev
= __in_dev_get_rcu(dev
);
767 if (new_gw
== old_gw
|| !IN_DEV_RX_REDIRECTS(in_dev
) ||
768 ipv4_is_multicast(new_gw
) || ipv4_is_lbcast(new_gw
) ||
769 ipv4_is_zeronet(new_gw
))
770 goto reject_redirect
;
772 if (!IN_DEV_SHARED_MEDIA(in_dev
)) {
773 if (!inet_addr_onlink(in_dev
, new_gw
, old_gw
))
774 goto reject_redirect
;
775 if (IN_DEV_SEC_REDIRECTS(in_dev
) && ip_fib_check_default(new_gw
, dev
))
776 goto reject_redirect
;
778 if (inet_addr_type(net
, new_gw
) != RTN_UNICAST
)
779 goto reject_redirect
;
782 n
= __ipv4_neigh_lookup(rt
->dst
.dev
, (__force u32
)new_gw
);
784 n
= neigh_create(&arp_tbl
, &new_gw
, rt
->dst
.dev
);
786 if (!(READ_ONCE(n
->nud_state
) & NUD_VALID
)) {
787 neigh_event_send(n
, NULL
);
789 if (fib_lookup(net
, fl4
, &res
, 0) == 0) {
790 struct fib_nh_common
*nhc
;
792 fib_select_path(net
, &res
, fl4
, skb
);
793 nhc
= FIB_RES_NHC(res
);
794 update_or_create_fnhe(nhc
, fl4
->daddr
, new_gw
,
796 jiffies
+ ip_rt_gc_timeout
);
799 rt
->dst
.obsolete
= DST_OBSOLETE_KILL
;
800 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE
, n
);
807 #ifdef CONFIG_IP_ROUTE_VERBOSE
808 if (IN_DEV_LOG_MARTIANS(in_dev
)) {
809 const struct iphdr
*iph
= (const struct iphdr
*) skb
->data
;
810 __be32 daddr
= iph
->daddr
;
811 __be32 saddr
= iph
->saddr
;
813 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
814 " Advised path = %pI4 -> %pI4\n",
815 &old_gw
, dev
->name
, &new_gw
,
822 static void ip_do_redirect(struct dst_entry
*dst
, struct sock
*sk
, struct sk_buff
*skb
)
826 const struct iphdr
*iph
= (const struct iphdr
*) skb
->data
;
827 struct net
*net
= dev_net(skb
->dev
);
828 int oif
= skb
->dev
->ifindex
;
829 u8 prot
= iph
->protocol
;
830 u32 mark
= skb
->mark
;
833 rt
= dst_rtable(dst
);
835 __build_flow_key(net
, &fl4
, sk
, iph
, oif
, tos
, prot
, mark
, 0);
836 __ip_do_redirect(rt
, skb
, &fl4
, true);
839 static void ipv4_negative_advice(struct sock
*sk
,
840 struct dst_entry
*dst
)
842 struct rtable
*rt
= dst_rtable(dst
);
844 if ((dst
->obsolete
> 0) ||
845 (rt
->rt_flags
& RTCF_REDIRECTED
) ||
852 * 1. The first ip_rt_redirect_number redirects are sent
853 * with exponential backoff, then we stop sending them at all,
854 * assuming that the host ignores our redirects.
855 * 2. If we did not see packets requiring redirects
856 * during ip_rt_redirect_silence, we assume that the host
857 * forgot redirected route and start to send redirects again.
859 * This algorithm is much cheaper and more intelligent than dumb load limiting
862 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
863 * and "frag. need" (breaks PMTU discovery) in icmp.c.
866 void ip_rt_send_redirect(struct sk_buff
*skb
)
868 struct rtable
*rt
= skb_rtable(skb
);
869 struct in_device
*in_dev
;
870 struct inet_peer
*peer
;
876 in_dev
= __in_dev_get_rcu(rt
->dst
.dev
);
877 if (!in_dev
|| !IN_DEV_TX_REDIRECTS(in_dev
)) {
881 log_martians
= IN_DEV_LOG_MARTIANS(in_dev
);
882 vif
= l3mdev_master_ifindex_rcu(rt
->dst
.dev
);
884 net
= dev_net(rt
->dst
.dev
);
885 peer
= inet_getpeer_v4(net
->ipv4
.peers
, ip_hdr(skb
)->saddr
, vif
);
888 icmp_send(skb
, ICMP_REDIRECT
, ICMP_REDIR_HOST
,
889 rt_nexthop(rt
, ip_hdr(skb
)->daddr
));
893 /* No redirected packets during ip_rt_redirect_silence;
894 * reset the algorithm.
896 if (time_after(jiffies
, peer
->rate_last
+ ip_rt_redirect_silence
)) {
897 peer
->rate_tokens
= 0;
898 peer
->n_redirects
= 0;
901 /* Too many ignored redirects; do not send anything
902 * set dst.rate_last to the last seen redirected packet.
904 if (peer
->n_redirects
>= ip_rt_redirect_number
) {
905 peer
->rate_last
= jiffies
;
909 /* Check for load limit; set rate_last to the latest sent
912 if (peer
->n_redirects
== 0 ||
915 (ip_rt_redirect_load
<< peer
->n_redirects
)))) {
916 __be32 gw
= rt_nexthop(rt
, ip_hdr(skb
)->daddr
);
918 icmp_send(skb
, ICMP_REDIRECT
, ICMP_REDIR_HOST
, gw
);
919 peer
->rate_last
= jiffies
;
921 if (IS_ENABLED(CONFIG_IP_ROUTE_VERBOSE
) && log_martians
&&
922 peer
->n_redirects
== ip_rt_redirect_number
)
923 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
924 &ip_hdr(skb
)->saddr
, inet_iif(skb
),
925 &ip_hdr(skb
)->daddr
, &gw
);
931 static int ip_error(struct sk_buff
*skb
)
933 struct rtable
*rt
= skb_rtable(skb
);
934 struct net_device
*dev
= skb
->dev
;
935 struct in_device
*in_dev
;
936 struct inet_peer
*peer
;
943 if (netif_is_l3_master(skb
->dev
)) {
944 dev
= __dev_get_by_index(dev_net(skb
->dev
), IPCB(skb
)->iif
);
949 in_dev
= __in_dev_get_rcu(dev
);
951 /* IP on this device is disabled. */
955 net
= dev_net(rt
->dst
.dev
);
956 if (!IN_DEV_FORWARD(in_dev
)) {
957 switch (rt
->dst
.error
) {
959 SKB_DR_SET(reason
, IP_INADDRERRORS
);
960 __IP_INC_STATS(net
, IPSTATS_MIB_INADDRERRORS
);
964 SKB_DR_SET(reason
, IP_INNOROUTES
);
965 __IP_INC_STATS(net
, IPSTATS_MIB_INNOROUTES
);
971 switch (rt
->dst
.error
) {
976 code
= ICMP_HOST_UNREACH
;
979 code
= ICMP_NET_UNREACH
;
980 SKB_DR_SET(reason
, IP_INNOROUTES
);
981 __IP_INC_STATS(net
, IPSTATS_MIB_INNOROUTES
);
984 code
= ICMP_PKT_FILTERED
;
989 peer
= inet_getpeer_v4(net
->ipv4
.peers
, ip_hdr(skb
)->saddr
,
990 l3mdev_master_ifindex_rcu(skb
->dev
));
994 peer
->rate_tokens
+= now
- peer
->rate_last
;
995 if (peer
->rate_tokens
> ip_rt_error_burst
)
996 peer
->rate_tokens
= ip_rt_error_burst
;
997 peer
->rate_last
= now
;
998 if (peer
->rate_tokens
>= ip_rt_error_cost
)
999 peer
->rate_tokens
-= ip_rt_error_cost
;
1006 icmp_send(skb
, ICMP_DEST_UNREACH
, code
, 0);
1008 out
: kfree_skb_reason(skb
, reason
);
1012 static void __ip_rt_update_pmtu(struct rtable
*rt
, struct flowi4
*fl4
, u32 mtu
)
1014 struct dst_entry
*dst
= &rt
->dst
;
1015 struct fib_result res
;
1020 if (ip_mtu_locked(dst
))
1023 old_mtu
= ipv4_mtu(dst
);
1028 net
= dev_net_rcu(dst
->dev
);
1029 if (mtu
< net
->ipv4
.ip_rt_min_pmtu
) {
1031 mtu
= min(old_mtu
, net
->ipv4
.ip_rt_min_pmtu
);
1034 if (rt
->rt_pmtu
== mtu
&& !lock
&&
1035 time_before(jiffies
, dst
->expires
- net
->ipv4
.ip_rt_mtu_expires
/ 2))
1038 if (fib_lookup(net
, fl4
, &res
, 0) == 0) {
1039 struct fib_nh_common
*nhc
;
1041 fib_select_path(net
, &res
, fl4
, NULL
);
1042 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1043 if (fib_info_num_path(res
.fi
) > 1) {
1046 for (nhsel
= 0; nhsel
< fib_info_num_path(res
.fi
); nhsel
++) {
1047 nhc
= fib_info_nhc(res
.fi
, nhsel
);
1048 update_or_create_fnhe(nhc
, fl4
->daddr
, 0, mtu
, lock
,
1049 jiffies
+ net
->ipv4
.ip_rt_mtu_expires
);
1053 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1054 nhc
= FIB_RES_NHC(res
);
1055 update_or_create_fnhe(nhc
, fl4
->daddr
, 0, mtu
, lock
,
1056 jiffies
+ net
->ipv4
.ip_rt_mtu_expires
);
1062 static void ip_rt_update_pmtu(struct dst_entry
*dst
, struct sock
*sk
,
1063 struct sk_buff
*skb
, u32 mtu
,
1066 struct rtable
*rt
= dst_rtable(dst
);
1069 ip_rt_build_flow_key(&fl4
, sk
, skb
);
1071 /* Don't make lookup fail for bridged encapsulations */
1072 if (skb
&& netif_is_any_bridge_port(skb
->dev
))
1075 __ip_rt_update_pmtu(rt
, &fl4
, mtu
);
1078 void ipv4_update_pmtu(struct sk_buff
*skb
, struct net
*net
, u32 mtu
,
1079 int oif
, u8 protocol
)
1081 const struct iphdr
*iph
= (const struct iphdr
*)skb
->data
;
1084 u32 mark
= IP4_REPLY_MARK(net
, skb
->mark
);
1086 __build_flow_key(net
, &fl4
, NULL
, iph
, oif
, iph
->tos
, protocol
, mark
,
1088 rt
= __ip_route_output_key(net
, &fl4
);
1090 __ip_rt_update_pmtu(rt
, &fl4
, mtu
);
1094 EXPORT_SYMBOL_GPL(ipv4_update_pmtu
);
1096 static void __ipv4_sk_update_pmtu(struct sk_buff
*skb
, struct sock
*sk
, u32 mtu
)
1098 const struct iphdr
*iph
= (const struct iphdr
*)skb
->data
;
1102 __build_flow_key(sock_net(sk
), &fl4
, sk
, iph
, 0, 0, 0, 0, 0);
1104 if (!fl4
.flowi4_mark
)
1105 fl4
.flowi4_mark
= IP4_REPLY_MARK(sock_net(sk
), skb
->mark
);
1107 rt
= __ip_route_output_key(sock_net(sk
), &fl4
);
1109 __ip_rt_update_pmtu(rt
, &fl4
, mtu
);
1114 void ipv4_sk_update_pmtu(struct sk_buff
*skb
, struct sock
*sk
, u32 mtu
)
1116 const struct iphdr
*iph
= (const struct iphdr
*)skb
->data
;
1119 struct dst_entry
*odst
= NULL
;
1121 struct net
*net
= sock_net(sk
);
1125 if (!ip_sk_accept_pmtu(sk
))
1128 odst
= sk_dst_get(sk
);
1130 if (sock_owned_by_user(sk
) || !odst
) {
1131 __ipv4_sk_update_pmtu(skb
, sk
, mtu
);
1135 __build_flow_key(net
, &fl4
, sk
, iph
, 0, 0, 0, 0, 0);
1137 rt
= dst_rtable(odst
);
1138 if (odst
->obsolete
&& !odst
->ops
->check(odst
, 0)) {
1139 rt
= ip_route_output_flow(sock_net(sk
), &fl4
, sk
);
1146 __ip_rt_update_pmtu(dst_rtable(xfrm_dst_path(&rt
->dst
)), &fl4
, mtu
);
1148 if (!dst_check(&rt
->dst
, 0)) {
1150 dst_release(&rt
->dst
);
1152 rt
= ip_route_output_flow(sock_net(sk
), &fl4
, sk
);
1160 sk_dst_set(sk
, &rt
->dst
);
1166 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu
);
1168 void ipv4_redirect(struct sk_buff
*skb
, struct net
*net
,
1169 int oif
, u8 protocol
)
1171 const struct iphdr
*iph
= (const struct iphdr
*)skb
->data
;
1175 __build_flow_key(net
, &fl4
, NULL
, iph
, oif
, iph
->tos
, protocol
, 0, 0);
1176 rt
= __ip_route_output_key(net
, &fl4
);
1178 __ip_do_redirect(rt
, skb
, &fl4
, false);
1182 EXPORT_SYMBOL_GPL(ipv4_redirect
);
1184 void ipv4_sk_redirect(struct sk_buff
*skb
, struct sock
*sk
)
1186 const struct iphdr
*iph
= (const struct iphdr
*)skb
->data
;
1189 struct net
*net
= sock_net(sk
);
1191 __build_flow_key(net
, &fl4
, sk
, iph
, 0, 0, 0, 0, 0);
1192 rt
= __ip_route_output_key(net
, &fl4
);
1194 __ip_do_redirect(rt
, skb
, &fl4
, false);
1198 EXPORT_SYMBOL_GPL(ipv4_sk_redirect
);
1200 INDIRECT_CALLABLE_SCOPE
struct dst_entry
*ipv4_dst_check(struct dst_entry
*dst
,
1203 struct rtable
*rt
= dst_rtable(dst
);
1205 /* All IPV4 dsts are created with ->obsolete set to the value
1206 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1207 * into this function always.
1209 * When a PMTU/redirect information update invalidates a route,
1210 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1211 * DST_OBSOLETE_DEAD.
1213 if (dst
->obsolete
!= DST_OBSOLETE_FORCE_CHK
|| rt_is_expired(rt
))
1217 EXPORT_INDIRECT_CALLABLE(ipv4_dst_check
);
1219 static void ipv4_send_dest_unreach(struct sk_buff
*skb
)
1221 struct net_device
*dev
;
1222 struct ip_options opt
;
1225 /* Recompile ip options since IPCB may not be valid anymore.
1226 * Also check we have a reasonable ipv4 header.
1228 if (!pskb_network_may_pull(skb
, sizeof(struct iphdr
)) ||
1229 ip_hdr(skb
)->version
!= 4 || ip_hdr(skb
)->ihl
< 5)
1232 memset(&opt
, 0, sizeof(opt
));
1233 if (ip_hdr(skb
)->ihl
> 5) {
1234 if (!pskb_network_may_pull(skb
, ip_hdr(skb
)->ihl
* 4))
1236 opt
.optlen
= ip_hdr(skb
)->ihl
* 4 - sizeof(struct iphdr
);
1239 dev
= skb
->dev
? skb
->dev
: skb_rtable(skb
)->dst
.dev
;
1240 res
= __ip_options_compile(dev_net(dev
), &opt
, skb
, NULL
);
1246 __icmp_send(skb
, ICMP_DEST_UNREACH
, ICMP_HOST_UNREACH
, 0, &opt
);
1249 static void ipv4_link_failure(struct sk_buff
*skb
)
1253 ipv4_send_dest_unreach(skb
);
1255 rt
= skb_rtable(skb
);
1257 dst_set_expires(&rt
->dst
, 0);
1260 static int ip_rt_bug(struct net
*net
, struct sock
*sk
, struct sk_buff
*skb
)
1262 pr_debug("%s: %pI4 -> %pI4, %s\n",
1263 __func__
, &ip_hdr(skb
)->saddr
, &ip_hdr(skb
)->daddr
,
1264 skb
->dev
? skb
->dev
->name
: "?");
1271 * We do not cache source address of outgoing interface,
1272 * because it is used only by IP RR, TS and SRR options,
1273 * so that it out of fast path.
1275 * BTW remember: "addr" is allowed to be not aligned
1279 void ip_rt_get_source(u8
*addr
, struct sk_buff
*skb
, struct rtable
*rt
)
1283 if (rt_is_output_route(rt
))
1284 src
= ip_hdr(skb
)->saddr
;
1286 struct fib_result res
;
1287 struct iphdr
*iph
= ip_hdr(skb
);
1288 struct flowi4 fl4
= {
1289 .daddr
= iph
->daddr
,
1290 .saddr
= iph
->saddr
,
1291 .flowi4_tos
= inet_dscp_to_dsfield(ip4h_dscp(iph
)),
1292 .flowi4_oif
= rt
->dst
.dev
->ifindex
,
1293 .flowi4_iif
= skb
->dev
->ifindex
,
1294 .flowi4_mark
= skb
->mark
,
1298 if (fib_lookup(dev_net(rt
->dst
.dev
), &fl4
, &res
, 0) == 0)
1299 src
= fib_result_prefsrc(dev_net(rt
->dst
.dev
), &res
);
1301 src
= inet_select_addr(rt
->dst
.dev
,
1302 rt_nexthop(rt
, iph
->daddr
),
1306 memcpy(addr
, &src
, 4);
1309 #ifdef CONFIG_IP_ROUTE_CLASSID
1310 static void set_class_tag(struct rtable
*rt
, u32 tag
)
1312 if (!(rt
->dst
.tclassid
& 0xFFFF))
1313 rt
->dst
.tclassid
|= tag
& 0xFFFF;
1314 if (!(rt
->dst
.tclassid
& 0xFFFF0000))
1315 rt
->dst
.tclassid
|= tag
& 0xFFFF0000;
1319 static unsigned int ipv4_default_advmss(const struct dst_entry
*dst
)
1321 unsigned int header_size
= sizeof(struct tcphdr
) + sizeof(struct iphdr
);
1322 unsigned int advmss
;
1326 net
= dev_net_rcu(dst
->dev
);
1327 advmss
= max_t(unsigned int, ipv4_mtu(dst
) - header_size
,
1328 net
->ipv4
.ip_rt_min_advmss
);
1331 return min(advmss
, IPV4_MAX_PMTU
- header_size
);
1334 INDIRECT_CALLABLE_SCOPE
unsigned int ipv4_mtu(const struct dst_entry
*dst
)
1336 return ip_dst_mtu_maybe_forward(dst
, false);
1338 EXPORT_INDIRECT_CALLABLE(ipv4_mtu
);
1340 static void ip_del_fnhe(struct fib_nh_common
*nhc
, __be32 daddr
)
1342 struct fnhe_hash_bucket
*hash
;
1343 struct fib_nh_exception
*fnhe
, __rcu
**fnhe_p
;
1344 u32 hval
= fnhe_hashfun(daddr
);
1346 spin_lock_bh(&fnhe_lock
);
1348 hash
= rcu_dereference_protected(nhc
->nhc_exceptions
,
1349 lockdep_is_held(&fnhe_lock
));
1352 fnhe_p
= &hash
->chain
;
1353 fnhe
= rcu_dereference_protected(*fnhe_p
, lockdep_is_held(&fnhe_lock
));
1355 if (fnhe
->fnhe_daddr
== daddr
) {
1356 rcu_assign_pointer(*fnhe_p
, rcu_dereference_protected(
1357 fnhe
->fnhe_next
, lockdep_is_held(&fnhe_lock
)));
1358 /* set fnhe_daddr to 0 to ensure it won't bind with
1359 * new dsts in rt_bind_exception().
1361 fnhe
->fnhe_daddr
= 0;
1362 fnhe_flush_routes(fnhe
);
1363 kfree_rcu(fnhe
, rcu
);
1366 fnhe_p
= &fnhe
->fnhe_next
;
1367 fnhe
= rcu_dereference_protected(fnhe
->fnhe_next
,
1368 lockdep_is_held(&fnhe_lock
));
1371 spin_unlock_bh(&fnhe_lock
);
1374 static struct fib_nh_exception
*find_exception(struct fib_nh_common
*nhc
,
1377 struct fnhe_hash_bucket
*hash
= rcu_dereference(nhc
->nhc_exceptions
);
1378 struct fib_nh_exception
*fnhe
;
1384 hval
= fnhe_hashfun(daddr
);
1386 for (fnhe
= rcu_dereference(hash
[hval
].chain
); fnhe
;
1387 fnhe
= rcu_dereference(fnhe
->fnhe_next
)) {
1388 if (fnhe
->fnhe_daddr
== daddr
) {
1389 if (fnhe
->fnhe_expires
&&
1390 time_after(jiffies
, fnhe
->fnhe_expires
)) {
1391 ip_del_fnhe(nhc
, daddr
);
1401 * 1. mtu on route is locked - use it
1402 * 2. mtu from nexthop exception
1403 * 3. mtu from egress device
1406 u32
ip_mtu_from_fib_result(struct fib_result
*res
, __be32 daddr
)
1408 struct fib_nh_common
*nhc
= res
->nhc
;
1409 struct net_device
*dev
= nhc
->nhc_dev
;
1410 struct fib_info
*fi
= res
->fi
;
1413 if (READ_ONCE(dev_net(dev
)->ipv4
.sysctl_ip_fwd_use_pmtu
) ||
1414 fi
->fib_metrics
->metrics
[RTAX_LOCK
- 1] & (1 << RTAX_MTU
))
1418 struct fib_nh_exception
*fnhe
;
1420 fnhe
= find_exception(nhc
, daddr
);
1421 if (fnhe
&& !time_after_eq(jiffies
, fnhe
->fnhe_expires
))
1422 mtu
= fnhe
->fnhe_pmtu
;
1426 mtu
= min(READ_ONCE(dev
->mtu
), IP_MAX_MTU
);
1428 return mtu
- lwtunnel_headroom(nhc
->nhc_lwtstate
, mtu
);
1431 static bool rt_bind_exception(struct rtable
*rt
, struct fib_nh_exception
*fnhe
,
1432 __be32 daddr
, const bool do_cache
)
1436 spin_lock_bh(&fnhe_lock
);
1438 if (daddr
== fnhe
->fnhe_daddr
) {
1439 struct rtable __rcu
**porig
;
1440 struct rtable
*orig
;
1441 int genid
= fnhe_genid(dev_net(rt
->dst
.dev
));
1443 if (rt_is_input_route(rt
))
1444 porig
= &fnhe
->fnhe_rth_input
;
1446 porig
= &fnhe
->fnhe_rth_output
;
1447 orig
= rcu_dereference(*porig
);
1449 if (fnhe
->fnhe_genid
!= genid
) {
1450 fnhe
->fnhe_genid
= genid
;
1452 fnhe
->fnhe_pmtu
= 0;
1453 fnhe
->fnhe_expires
= 0;
1454 fnhe
->fnhe_mtu_locked
= false;
1455 fnhe_flush_routes(fnhe
);
1458 fill_route_from_fnhe(rt
, fnhe
);
1461 rt
->rt_gw_family
= AF_INET
;
1466 rcu_assign_pointer(*porig
, rt
);
1468 dst_dev_put(&orig
->dst
);
1469 dst_release(&orig
->dst
);
1474 fnhe
->fnhe_stamp
= jiffies
;
1476 spin_unlock_bh(&fnhe_lock
);
1481 static bool rt_cache_route(struct fib_nh_common
*nhc
, struct rtable
*rt
)
1483 struct rtable
*orig
, *prev
, **p
;
1486 if (rt_is_input_route(rt
)) {
1487 p
= (struct rtable
**)&nhc
->nhc_rth_input
;
1489 p
= (struct rtable
**)raw_cpu_ptr(nhc
->nhc_pcpu_rth_output
);
1493 /* hold dst before doing cmpxchg() to avoid race condition
1497 prev
= cmpxchg(p
, orig
, rt
);
1500 rt_add_uncached_list(orig
);
1501 dst_release(&orig
->dst
);
1504 dst_release(&rt
->dst
);
1511 struct uncached_list
{
1513 struct list_head head
;
1516 static DEFINE_PER_CPU_ALIGNED(struct uncached_list
, rt_uncached_list
);
1518 void rt_add_uncached_list(struct rtable
*rt
)
1520 struct uncached_list
*ul
= raw_cpu_ptr(&rt_uncached_list
);
1522 rt
->dst
.rt_uncached_list
= ul
;
1524 spin_lock_bh(&ul
->lock
);
1525 list_add_tail(&rt
->dst
.rt_uncached
, &ul
->head
);
1526 spin_unlock_bh(&ul
->lock
);
1529 void rt_del_uncached_list(struct rtable
*rt
)
1531 if (!list_empty(&rt
->dst
.rt_uncached
)) {
1532 struct uncached_list
*ul
= rt
->dst
.rt_uncached_list
;
1534 spin_lock_bh(&ul
->lock
);
1535 list_del_init(&rt
->dst
.rt_uncached
);
1536 spin_unlock_bh(&ul
->lock
);
1540 static void ipv4_dst_destroy(struct dst_entry
*dst
)
1542 ip_dst_metrics_put(dst
);
1543 rt_del_uncached_list(dst_rtable(dst
));
1546 void rt_flush_dev(struct net_device
*dev
)
1548 struct rtable
*rt
, *safe
;
1551 for_each_possible_cpu(cpu
) {
1552 struct uncached_list
*ul
= &per_cpu(rt_uncached_list
, cpu
);
1554 if (list_empty(&ul
->head
))
1557 spin_lock_bh(&ul
->lock
);
1558 list_for_each_entry_safe(rt
, safe
, &ul
->head
, dst
.rt_uncached
) {
1559 if (rt
->dst
.dev
!= dev
)
1561 rt
->dst
.dev
= blackhole_netdev
;
1562 netdev_ref_replace(dev
, blackhole_netdev
,
1563 &rt
->dst
.dev_tracker
, GFP_ATOMIC
);
1564 list_del_init(&rt
->dst
.rt_uncached
);
1566 spin_unlock_bh(&ul
->lock
);
1570 static bool rt_cache_valid(const struct rtable
*rt
)
1573 rt
->dst
.obsolete
== DST_OBSOLETE_FORCE_CHK
&&
1577 static void rt_set_nexthop(struct rtable
*rt
, __be32 daddr
,
1578 const struct fib_result
*res
,
1579 struct fib_nh_exception
*fnhe
,
1580 struct fib_info
*fi
, u16 type
, u32 itag
,
1581 const bool do_cache
)
1583 bool cached
= false;
1586 struct fib_nh_common
*nhc
= FIB_RES_NHC(*res
);
1588 if (nhc
->nhc_gw_family
&& nhc
->nhc_scope
== RT_SCOPE_LINK
) {
1589 rt
->rt_uses_gateway
= 1;
1590 rt
->rt_gw_family
= nhc
->nhc_gw_family
;
1591 /* only INET and INET6 are supported */
1592 if (likely(nhc
->nhc_gw_family
== AF_INET
))
1593 rt
->rt_gw4
= nhc
->nhc_gw
.ipv4
;
1595 rt
->rt_gw6
= nhc
->nhc_gw
.ipv6
;
1598 ip_dst_init_metrics(&rt
->dst
, fi
->fib_metrics
);
1600 #ifdef CONFIG_IP_ROUTE_CLASSID
1601 if (nhc
->nhc_family
== AF_INET
) {
1604 nh
= container_of(nhc
, struct fib_nh
, nh_common
);
1605 rt
->dst
.tclassid
= nh
->nh_tclassid
;
1608 rt
->dst
.lwtstate
= lwtstate_get(nhc
->nhc_lwtstate
);
1610 cached
= rt_bind_exception(rt
, fnhe
, daddr
, do_cache
);
1612 cached
= rt_cache_route(nhc
, rt
);
1613 if (unlikely(!cached
)) {
1614 /* Routes we intend to cache in nexthop exception or
1615 * FIB nexthop have the DST_NOCACHE bit clear.
1616 * However, if we are unsuccessful at storing this
1617 * route into the cache we really need to set it.
1620 rt
->rt_gw_family
= AF_INET
;
1623 rt_add_uncached_list(rt
);
1626 rt_add_uncached_list(rt
);
1628 #ifdef CONFIG_IP_ROUTE_CLASSID
1629 #ifdef CONFIG_IP_MULTIPLE_TABLES
1630 set_class_tag(rt
, res
->tclassid
);
1632 set_class_tag(rt
, itag
);
1636 struct rtable
*rt_dst_alloc(struct net_device
*dev
,
1637 unsigned int flags
, u16 type
,
1642 rt
= dst_alloc(&ipv4_dst_ops
, dev
, DST_OBSOLETE_FORCE_CHK
,
1643 (noxfrm
? DST_NOXFRM
: 0));
1646 rt
->rt_genid
= rt_genid_ipv4(dev_net(dev
));
1647 rt
->rt_flags
= flags
;
1649 rt
->rt_is_input
= 0;
1652 rt
->rt_mtu_locked
= 0;
1653 rt
->rt_uses_gateway
= 0;
1654 rt
->rt_gw_family
= 0;
1657 rt
->dst
.output
= ip_output
;
1658 if (flags
& RTCF_LOCAL
)
1659 rt
->dst
.input
= ip_local_deliver
;
1664 EXPORT_SYMBOL(rt_dst_alloc
);
1666 struct rtable
*rt_dst_clone(struct net_device
*dev
, struct rtable
*rt
)
1668 struct rtable
*new_rt
;
1670 new_rt
= dst_alloc(&ipv4_dst_ops
, dev
, DST_OBSOLETE_FORCE_CHK
,
1674 new_rt
->rt_genid
= rt_genid_ipv4(dev_net(dev
));
1675 new_rt
->rt_flags
= rt
->rt_flags
;
1676 new_rt
->rt_type
= rt
->rt_type
;
1677 new_rt
->rt_is_input
= rt
->rt_is_input
;
1678 new_rt
->rt_iif
= rt
->rt_iif
;
1679 new_rt
->rt_pmtu
= rt
->rt_pmtu
;
1680 new_rt
->rt_mtu_locked
= rt
->rt_mtu_locked
;
1681 new_rt
->rt_gw_family
= rt
->rt_gw_family
;
1682 if (rt
->rt_gw_family
== AF_INET
)
1683 new_rt
->rt_gw4
= rt
->rt_gw4
;
1684 else if (rt
->rt_gw_family
== AF_INET6
)
1685 new_rt
->rt_gw6
= rt
->rt_gw6
;
1687 new_rt
->dst
.input
= rt
->dst
.input
;
1688 new_rt
->dst
.output
= rt
->dst
.output
;
1689 new_rt
->dst
.error
= rt
->dst
.error
;
1690 new_rt
->dst
.lastuse
= jiffies
;
1691 new_rt
->dst
.lwtstate
= lwtstate_get(rt
->dst
.lwtstate
);
1695 EXPORT_SYMBOL(rt_dst_clone
);
1697 /* called in rcu_read_lock() section */
1698 enum skb_drop_reason
1699 ip_mc_validate_source(struct sk_buff
*skb
, __be32 daddr
, __be32 saddr
,
1700 dscp_t dscp
, struct net_device
*dev
,
1701 struct in_device
*in_dev
, u32
*itag
)
1703 enum skb_drop_reason reason
;
1705 /* Primary sanity checks. */
1707 return SKB_DROP_REASON_NOT_SPECIFIED
;
1709 if (ipv4_is_multicast(saddr
) || ipv4_is_lbcast(saddr
))
1710 return SKB_DROP_REASON_IP_INVALID_SOURCE
;
1712 if (skb
->protocol
!= htons(ETH_P_IP
))
1713 return SKB_DROP_REASON_INVALID_PROTO
;
1715 if (ipv4_is_loopback(saddr
) && !IN_DEV_ROUTE_LOCALNET(in_dev
))
1716 return SKB_DROP_REASON_IP_LOCALNET
;
1718 if (ipv4_is_zeronet(saddr
)) {
1719 if (!ipv4_is_local_multicast(daddr
) &&
1720 ip_hdr(skb
)->protocol
!= IPPROTO_IGMP
)
1721 return SKB_DROP_REASON_IP_INVALID_SOURCE
;
1723 reason
= fib_validate_source_reason(skb
, saddr
, 0, dscp
, 0,
1728 return SKB_NOT_DROPPED_YET
;
1731 /* called in rcu_read_lock() section */
1732 static enum skb_drop_reason
1733 ip_route_input_mc(struct sk_buff
*skb
, __be32 daddr
, __be32 saddr
,
1734 dscp_t dscp
, struct net_device
*dev
, int our
)
1736 struct in_device
*in_dev
= __in_dev_get_rcu(dev
);
1737 unsigned int flags
= RTCF_MULTICAST
;
1738 enum skb_drop_reason reason
;
1742 reason
= ip_mc_validate_source(skb
, daddr
, saddr
, dscp
, dev
, in_dev
,
1748 flags
|= RTCF_LOCAL
;
1750 if (IN_DEV_ORCONF(in_dev
, NOPOLICY
))
1751 IPCB(skb
)->flags
|= IPSKB_NOPOLICY
;
1753 rth
= rt_dst_alloc(dev_net(dev
)->loopback_dev
, flags
, RTN_MULTICAST
,
1756 return SKB_DROP_REASON_NOMEM
;
1758 #ifdef CONFIG_IP_ROUTE_CLASSID
1759 rth
->dst
.tclassid
= itag
;
1761 rth
->dst
.output
= ip_rt_bug
;
1762 rth
->rt_is_input
= 1;
1764 #ifdef CONFIG_IP_MROUTE
1765 if (!ipv4_is_local_multicast(daddr
) && IN_DEV_MFORWARD(in_dev
))
1766 rth
->dst
.input
= ip_mr_input
;
1768 RT_CACHE_STAT_INC(in_slow_mc
);
1771 skb_dst_set(skb
, &rth
->dst
);
1772 return SKB_NOT_DROPPED_YET
;
1776 static void ip_handle_martian_source(struct net_device
*dev
,
1777 struct in_device
*in_dev
,
1778 struct sk_buff
*skb
,
1782 RT_CACHE_STAT_INC(in_martian_src
);
1783 #ifdef CONFIG_IP_ROUTE_VERBOSE
1784 if (IN_DEV_LOG_MARTIANS(in_dev
) && net_ratelimit()) {
1786 * RFC1812 recommendation, if source is martian,
1787 * the only hint is MAC header.
1789 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1790 &daddr
, &saddr
, dev
->name
);
1791 if (dev
->hard_header_len
&& skb_mac_header_was_set(skb
)) {
1792 print_hex_dump(KERN_WARNING
, "ll header: ",
1793 DUMP_PREFIX_OFFSET
, 16, 1,
1794 skb_mac_header(skb
),
1795 dev
->hard_header_len
, false);
1801 /* called in rcu_read_lock() section */
1802 static enum skb_drop_reason
1803 __mkroute_input(struct sk_buff
*skb
, const struct fib_result
*res
,
1804 struct in_device
*in_dev
, __be32 daddr
,
1805 __be32 saddr
, dscp_t dscp
)
1807 enum skb_drop_reason reason
= SKB_DROP_REASON_NOT_SPECIFIED
;
1808 struct fib_nh_common
*nhc
= FIB_RES_NHC(*res
);
1809 struct net_device
*dev
= nhc
->nhc_dev
;
1810 struct fib_nh_exception
*fnhe
;
1813 struct in_device
*out_dev
;
1817 /* get a working reference to the output device */
1818 out_dev
= __in_dev_get_rcu(dev
);
1820 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1824 err
= fib_validate_source(skb
, saddr
, daddr
, dscp
, FIB_RES_OIF(*res
),
1825 in_dev
->dev
, in_dev
, &itag
);
1828 ip_handle_martian_source(in_dev
->dev
, in_dev
, skb
, daddr
,
1834 do_cache
= res
->fi
&& !itag
;
1835 if (out_dev
== in_dev
&& err
&& IN_DEV_TX_REDIRECTS(out_dev
) &&
1836 skb
->protocol
== htons(ETH_P_IP
)) {
1839 gw
= nhc
->nhc_gw_family
== AF_INET
? nhc
->nhc_gw
.ipv4
: 0;
1840 if (IN_DEV_SHARED_MEDIA(out_dev
) ||
1841 inet_addr_onlink(out_dev
, saddr
, gw
))
1842 IPCB(skb
)->flags
|= IPSKB_DOREDIRECT
;
1845 if (skb
->protocol
!= htons(ETH_P_IP
)) {
1846 /* Not IP (i.e. ARP). Do not create route, if it is
1847 * invalid for proxy arp. DNAT routes are always valid.
1849 * Proxy arp feature have been extended to allow, ARP
1850 * replies back to the same interface, to support
1851 * Private VLAN switch technologies. See arp.c.
1853 if (out_dev
== in_dev
&&
1854 IN_DEV_PROXY_ARP_PVLAN(in_dev
) == 0) {
1855 reason
= SKB_DROP_REASON_ARP_PVLAN_DISABLE
;
1860 if (IN_DEV_ORCONF(in_dev
, NOPOLICY
))
1861 IPCB(skb
)->flags
|= IPSKB_NOPOLICY
;
1863 fnhe
= find_exception(nhc
, daddr
);
1866 rth
= rcu_dereference(fnhe
->fnhe_rth_input
);
1868 rth
= rcu_dereference(nhc
->nhc_rth_input
);
1869 if (rt_cache_valid(rth
)) {
1870 skb_dst_set_noref(skb
, &rth
->dst
);
1875 rth
= rt_dst_alloc(out_dev
->dev
, 0, res
->type
,
1876 IN_DEV_ORCONF(out_dev
, NOXFRM
));
1878 reason
= SKB_DROP_REASON_NOMEM
;
1882 rth
->rt_is_input
= 1;
1883 RT_CACHE_STAT_INC(in_slow_tot
);
1885 rth
->dst
.input
= ip_forward
;
1887 rt_set_nexthop(rth
, daddr
, res
, fnhe
, res
->fi
, res
->type
, itag
,
1889 lwtunnel_set_redirect(&rth
->dst
);
1890 skb_dst_set(skb
, &rth
->dst
);
1892 reason
= SKB_NOT_DROPPED_YET
;
1897 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1898 /* To make ICMP packets follow the right flow, the multipath hash is
1899 * calculated from the inner IP addresses.
1901 static void ip_multipath_l3_keys(const struct sk_buff
*skb
,
1902 struct flow_keys
*hash_keys
)
1904 const struct iphdr
*outer_iph
= ip_hdr(skb
);
1905 const struct iphdr
*key_iph
= outer_iph
;
1906 const struct iphdr
*inner_iph
;
1907 const struct icmphdr
*icmph
;
1908 struct iphdr _inner_iph
;
1909 struct icmphdr _icmph
;
1911 if (likely(outer_iph
->protocol
!= IPPROTO_ICMP
))
1914 if (unlikely((outer_iph
->frag_off
& htons(IP_OFFSET
)) != 0))
1917 icmph
= skb_header_pointer(skb
, outer_iph
->ihl
* 4, sizeof(_icmph
),
1922 if (!icmp_is_err(icmph
->type
))
1925 inner_iph
= skb_header_pointer(skb
,
1926 outer_iph
->ihl
* 4 + sizeof(_icmph
),
1927 sizeof(_inner_iph
), &_inner_iph
);
1931 key_iph
= inner_iph
;
1933 hash_keys
->addrs
.v4addrs
.src
= key_iph
->saddr
;
1934 hash_keys
->addrs
.v4addrs
.dst
= key_iph
->daddr
;
1937 static u32
fib_multipath_custom_hash_outer(const struct net
*net
,
1938 const struct sk_buff
*skb
,
1941 u32 hash_fields
= READ_ONCE(net
->ipv4
.sysctl_fib_multipath_hash_fields
);
1942 struct flow_keys keys
, hash_keys
;
1944 if (!(hash_fields
& FIB_MULTIPATH_HASH_FIELD_OUTER_MASK
))
1947 memset(&hash_keys
, 0, sizeof(hash_keys
));
1948 skb_flow_dissect_flow_keys(skb
, &keys
, FLOW_DISSECTOR_F_STOP_AT_ENCAP
);
1950 hash_keys
.control
.addr_type
= FLOW_DISSECTOR_KEY_IPV4_ADDRS
;
1951 if (hash_fields
& FIB_MULTIPATH_HASH_FIELD_SRC_IP
)
1952 hash_keys
.addrs
.v4addrs
.src
= keys
.addrs
.v4addrs
.src
;
1953 if (hash_fields
& FIB_MULTIPATH_HASH_FIELD_DST_IP
)
1954 hash_keys
.addrs
.v4addrs
.dst
= keys
.addrs
.v4addrs
.dst
;
1955 if (hash_fields
& FIB_MULTIPATH_HASH_FIELD_IP_PROTO
)
1956 hash_keys
.basic
.ip_proto
= keys
.basic
.ip_proto
;
1957 if (hash_fields
& FIB_MULTIPATH_HASH_FIELD_SRC_PORT
)
1958 hash_keys
.ports
.src
= keys
.ports
.src
;
1959 if (hash_fields
& FIB_MULTIPATH_HASH_FIELD_DST_PORT
)
1960 hash_keys
.ports
.dst
= keys
.ports
.dst
;
1962 *p_has_inner
= !!(keys
.control
.flags
& FLOW_DIS_ENCAPSULATION
);
1963 return fib_multipath_hash_from_keys(net
, &hash_keys
);
1966 static u32
fib_multipath_custom_hash_inner(const struct net
*net
,
1967 const struct sk_buff
*skb
,
1970 u32 hash_fields
= READ_ONCE(net
->ipv4
.sysctl_fib_multipath_hash_fields
);
1971 struct flow_keys keys
, hash_keys
;
1973 /* We assume the packet carries an encapsulation, but if none was
1974 * encountered during dissection of the outer flow, then there is no
1975 * point in calling the flow dissector again.
1980 if (!(hash_fields
& FIB_MULTIPATH_HASH_FIELD_INNER_MASK
))
1983 memset(&hash_keys
, 0, sizeof(hash_keys
));
1984 skb_flow_dissect_flow_keys(skb
, &keys
, 0);
1986 if (!(keys
.control
.flags
& FLOW_DIS_ENCAPSULATION
))
1989 if (keys
.control
.addr_type
== FLOW_DISSECTOR_KEY_IPV4_ADDRS
) {
1990 hash_keys
.control
.addr_type
= FLOW_DISSECTOR_KEY_IPV4_ADDRS
;
1991 if (hash_fields
& FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP
)
1992 hash_keys
.addrs
.v4addrs
.src
= keys
.addrs
.v4addrs
.src
;
1993 if (hash_fields
& FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP
)
1994 hash_keys
.addrs
.v4addrs
.dst
= keys
.addrs
.v4addrs
.dst
;
1995 } else if (keys
.control
.addr_type
== FLOW_DISSECTOR_KEY_IPV6_ADDRS
) {
1996 hash_keys
.control
.addr_type
= FLOW_DISSECTOR_KEY_IPV6_ADDRS
;
1997 if (hash_fields
& FIB_MULTIPATH_HASH_FIELD_INNER_SRC_IP
)
1998 hash_keys
.addrs
.v6addrs
.src
= keys
.addrs
.v6addrs
.src
;
1999 if (hash_fields
& FIB_MULTIPATH_HASH_FIELD_INNER_DST_IP
)
2000 hash_keys
.addrs
.v6addrs
.dst
= keys
.addrs
.v6addrs
.dst
;
2001 if (hash_fields
& FIB_MULTIPATH_HASH_FIELD_INNER_FLOWLABEL
)
2002 hash_keys
.tags
.flow_label
= keys
.tags
.flow_label
;
2005 if (hash_fields
& FIB_MULTIPATH_HASH_FIELD_INNER_IP_PROTO
)
2006 hash_keys
.basic
.ip_proto
= keys
.basic
.ip_proto
;
2007 if (hash_fields
& FIB_MULTIPATH_HASH_FIELD_INNER_SRC_PORT
)
2008 hash_keys
.ports
.src
= keys
.ports
.src
;
2009 if (hash_fields
& FIB_MULTIPATH_HASH_FIELD_INNER_DST_PORT
)
2010 hash_keys
.ports
.dst
= keys
.ports
.dst
;
2012 return fib_multipath_hash_from_keys(net
, &hash_keys
);
2015 static u32
fib_multipath_custom_hash_skb(const struct net
*net
,
2016 const struct sk_buff
*skb
)
2018 u32 mhash
, mhash_inner
;
2019 bool has_inner
= true;
2021 mhash
= fib_multipath_custom_hash_outer(net
, skb
, &has_inner
);
2022 mhash_inner
= fib_multipath_custom_hash_inner(net
, skb
, has_inner
);
2024 return jhash_2words(mhash
, mhash_inner
, 0);
2027 static u32
fib_multipath_custom_hash_fl4(const struct net
*net
,
2028 const struct flowi4
*fl4
)
2030 u32 hash_fields
= READ_ONCE(net
->ipv4
.sysctl_fib_multipath_hash_fields
);
2031 struct flow_keys hash_keys
;
2033 if (!(hash_fields
& FIB_MULTIPATH_HASH_FIELD_OUTER_MASK
))
2036 memset(&hash_keys
, 0, sizeof(hash_keys
));
2037 hash_keys
.control
.addr_type
= FLOW_DISSECTOR_KEY_IPV4_ADDRS
;
2038 if (hash_fields
& FIB_MULTIPATH_HASH_FIELD_SRC_IP
)
2039 hash_keys
.addrs
.v4addrs
.src
= fl4
->saddr
;
2040 if (hash_fields
& FIB_MULTIPATH_HASH_FIELD_DST_IP
)
2041 hash_keys
.addrs
.v4addrs
.dst
= fl4
->daddr
;
2042 if (hash_fields
& FIB_MULTIPATH_HASH_FIELD_IP_PROTO
)
2043 hash_keys
.basic
.ip_proto
= fl4
->flowi4_proto
;
2044 if (hash_fields
& FIB_MULTIPATH_HASH_FIELD_SRC_PORT
) {
2045 if (fl4
->flowi4_flags
& FLOWI_FLAG_ANY_SPORT
)
2046 hash_keys
.ports
.src
= (__force __be16
)get_random_u16();
2048 hash_keys
.ports
.src
= fl4
->fl4_sport
;
2050 if (hash_fields
& FIB_MULTIPATH_HASH_FIELD_DST_PORT
)
2051 hash_keys
.ports
.dst
= fl4
->fl4_dport
;
2053 return fib_multipath_hash_from_keys(net
, &hash_keys
);
2056 /* if skb is set it will be used and fl4 can be NULL */
2057 int fib_multipath_hash(const struct net
*net
, const struct flowi4
*fl4
,
2058 const struct sk_buff
*skb
, struct flow_keys
*flkeys
)
2060 u32 multipath_hash
= fl4
? fl4
->flowi4_multipath_hash
: 0;
2061 struct flow_keys hash_keys
;
2064 switch (READ_ONCE(net
->ipv4
.sysctl_fib_multipath_hash_policy
)) {
2066 memset(&hash_keys
, 0, sizeof(hash_keys
));
2067 hash_keys
.control
.addr_type
= FLOW_DISSECTOR_KEY_IPV4_ADDRS
;
2069 ip_multipath_l3_keys(skb
, &hash_keys
);
2071 hash_keys
.addrs
.v4addrs
.src
= fl4
->saddr
;
2072 hash_keys
.addrs
.v4addrs
.dst
= fl4
->daddr
;
2074 mhash
= fib_multipath_hash_from_keys(net
, &hash_keys
);
2077 /* skb is currently provided only when forwarding */
2079 unsigned int flag
= FLOW_DISSECTOR_F_STOP_AT_ENCAP
;
2080 struct flow_keys keys
;
2082 /* short-circuit if we already have L4 hash present */
2084 return skb_get_hash_raw(skb
) >> 1;
2086 memset(&hash_keys
, 0, sizeof(hash_keys
));
2089 skb_flow_dissect_flow_keys(skb
, &keys
, flag
);
2093 hash_keys
.control
.addr_type
= FLOW_DISSECTOR_KEY_IPV4_ADDRS
;
2094 hash_keys
.addrs
.v4addrs
.src
= flkeys
->addrs
.v4addrs
.src
;
2095 hash_keys
.addrs
.v4addrs
.dst
= flkeys
->addrs
.v4addrs
.dst
;
2096 hash_keys
.ports
.src
= flkeys
->ports
.src
;
2097 hash_keys
.ports
.dst
= flkeys
->ports
.dst
;
2098 hash_keys
.basic
.ip_proto
= flkeys
->basic
.ip_proto
;
2100 memset(&hash_keys
, 0, sizeof(hash_keys
));
2101 hash_keys
.control
.addr_type
= FLOW_DISSECTOR_KEY_IPV4_ADDRS
;
2102 hash_keys
.addrs
.v4addrs
.src
= fl4
->saddr
;
2103 hash_keys
.addrs
.v4addrs
.dst
= fl4
->daddr
;
2104 if (fl4
->flowi4_flags
& FLOWI_FLAG_ANY_SPORT
)
2105 hash_keys
.ports
.src
= (__force __be16
)get_random_u16();
2107 hash_keys
.ports
.src
= fl4
->fl4_sport
;
2108 hash_keys
.ports
.dst
= fl4
->fl4_dport
;
2109 hash_keys
.basic
.ip_proto
= fl4
->flowi4_proto
;
2111 mhash
= fib_multipath_hash_from_keys(net
, &hash_keys
);
2114 memset(&hash_keys
, 0, sizeof(hash_keys
));
2115 /* skb is currently provided only when forwarding */
2117 struct flow_keys keys
;
2119 skb_flow_dissect_flow_keys(skb
, &keys
, 0);
2120 /* Inner can be v4 or v6 */
2121 if (keys
.control
.addr_type
== FLOW_DISSECTOR_KEY_IPV4_ADDRS
) {
2122 hash_keys
.control
.addr_type
= FLOW_DISSECTOR_KEY_IPV4_ADDRS
;
2123 hash_keys
.addrs
.v4addrs
.src
= keys
.addrs
.v4addrs
.src
;
2124 hash_keys
.addrs
.v4addrs
.dst
= keys
.addrs
.v4addrs
.dst
;
2125 } else if (keys
.control
.addr_type
== FLOW_DISSECTOR_KEY_IPV6_ADDRS
) {
2126 hash_keys
.control
.addr_type
= FLOW_DISSECTOR_KEY_IPV6_ADDRS
;
2127 hash_keys
.addrs
.v6addrs
.src
= keys
.addrs
.v6addrs
.src
;
2128 hash_keys
.addrs
.v6addrs
.dst
= keys
.addrs
.v6addrs
.dst
;
2129 hash_keys
.tags
.flow_label
= keys
.tags
.flow_label
;
2130 hash_keys
.basic
.ip_proto
= keys
.basic
.ip_proto
;
2132 /* Same as case 0 */
2133 hash_keys
.control
.addr_type
= FLOW_DISSECTOR_KEY_IPV4_ADDRS
;
2134 ip_multipath_l3_keys(skb
, &hash_keys
);
2137 /* Same as case 0 */
2138 hash_keys
.control
.addr_type
= FLOW_DISSECTOR_KEY_IPV4_ADDRS
;
2139 hash_keys
.addrs
.v4addrs
.src
= fl4
->saddr
;
2140 hash_keys
.addrs
.v4addrs
.dst
= fl4
->daddr
;
2142 mhash
= fib_multipath_hash_from_keys(net
, &hash_keys
);
2146 mhash
= fib_multipath_custom_hash_skb(net
, skb
);
2148 mhash
= fib_multipath_custom_hash_fl4(net
, fl4
);
2153 mhash
= jhash_2words(mhash
, multipath_hash
, 0);
2157 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
2159 static enum skb_drop_reason
2160 ip_mkroute_input(struct sk_buff
*skb
, struct fib_result
*res
,
2161 struct in_device
*in_dev
, __be32 daddr
,
2162 __be32 saddr
, dscp_t dscp
, struct flow_keys
*hkeys
)
2164 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2165 if (res
->fi
&& fib_info_num_path(res
->fi
) > 1) {
2166 int h
= fib_multipath_hash(res
->fi
->fib_net
, NULL
, skb
, hkeys
);
2168 fib_select_multipath(res
, h
, NULL
);
2169 IPCB(skb
)->flags
|= IPSKB_MULTIPATH
;
2173 /* create a routing cache entry */
2174 return __mkroute_input(skb
, res
, in_dev
, daddr
, saddr
, dscp
);
2177 /* Implements all the saddr-related checks as ip_route_input_slow(),
2178 * assuming daddr is valid and the destination is not a local broadcast one.
2179 * Uses the provided hint instead of performing a route lookup.
2181 enum skb_drop_reason
2182 ip_route_use_hint(struct sk_buff
*skb
, __be32 daddr
, __be32 saddr
,
2183 dscp_t dscp
, struct net_device
*dev
,
2184 const struct sk_buff
*hint
)
2186 enum skb_drop_reason reason
= SKB_DROP_REASON_NOT_SPECIFIED
;
2187 struct in_device
*in_dev
= __in_dev_get_rcu(dev
);
2188 struct rtable
*rt
= skb_rtable(hint
);
2189 struct net
*net
= dev_net(dev
);
2195 if (ipv4_is_multicast(saddr
) || ipv4_is_lbcast(saddr
)) {
2196 reason
= SKB_DROP_REASON_IP_INVALID_SOURCE
;
2197 goto martian_source
;
2200 if (ipv4_is_zeronet(saddr
)) {
2201 reason
= SKB_DROP_REASON_IP_INVALID_SOURCE
;
2202 goto martian_source
;
2205 if (ipv4_is_loopback(saddr
) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev
, net
)) {
2206 reason
= SKB_DROP_REASON_IP_LOCALNET
;
2207 goto martian_source
;
2210 if (rt
->rt_type
!= RTN_LOCAL
)
2211 goto skip_validate_source
;
2213 reason
= fib_validate_source_reason(skb
, saddr
, daddr
, dscp
, 0, dev
,
2216 goto martian_source
;
2218 skip_validate_source
:
2219 skb_dst_copy(skb
, hint
);
2220 return SKB_NOT_DROPPED_YET
;
2223 ip_handle_martian_source(dev
, in_dev
, skb
, daddr
, saddr
);
2227 /* get device for dst_alloc with local routes */
2228 static struct net_device
*ip_rt_get_dev(struct net
*net
,
2229 const struct fib_result
*res
)
2231 struct fib_nh_common
*nhc
= res
->fi
? res
->nhc
: NULL
;
2232 struct net_device
*dev
= NULL
;
2235 dev
= l3mdev_master_dev_rcu(nhc
->nhc_dev
);
2237 return dev
? : net
->loopback_dev
;
2241 * NOTE. We drop all the packets that has local source
2242 * addresses, because every properly looped back packet
2243 * must have correct destination already attached by output routine.
2244 * Changes in the enforced policies must be applied also to
2245 * ip_route_use_hint().
2247 * Such approach solves two big problems:
2248 * 1. Not simplex devices are handled properly.
2249 * 2. IP spoofing attempts are filtered with 100% of guarantee.
2250 * called with rcu_read_lock()
static enum skb_drop_reason
ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
		    dscp_t dscp, struct net_device *dev,
		    struct fib_result *res)
{
	enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flow_keys *flkeys = NULL, _flkeys;
	struct net *net = dev_net(dev);
	struct ip_tunnel_info *tun_info;
	int err = -EINVAL;
	unsigned int flags = 0;
	u32 itag = 0;
	struct rtable *rth;
	struct flowi4 fl4;
	bool do_cache = true;

	/* IP on this device is disabled. */
	if (!in_dev)
		goto out;

	/* Check for the weirdest martians, which cannot be detected
	 * by fib_lookup().
	 */
	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
	else
		fl4.flowi4_tun_key.tun_id = 0;
	skb_dst_drop(skb);

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr)) {
		reason = SKB_DROP_REASON_IP_INVALID_SOURCE;
		goto martian_source;
	}

	res->fi = NULL;
	res->table = NULL;
	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I do not even know whether to fix it or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr)) {
		reason = SKB_DROP_REASON_IP_INVALID_SOURCE;
		goto martian_source;
	}

	if (ipv4_is_zeronet(daddr)) {
		reason = SKB_DROP_REASON_IP_INVALID_DEST;
		goto martian_destination;
	}

	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
	 * twice, and calls it at most once when daddr and/or saddr are loopback
	 * addresses.
	 */
	if (ipv4_is_loopback(daddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) {
			reason = SKB_DROP_REASON_IP_LOCALNET;
			goto martian_destination;
		}
	} else if (ipv4_is_loopback(saddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net)) {
			reason = SKB_DROP_REASON_IP_LOCALNET;
			goto martian_source;
		}
	}

	/*
	 *	Now we are ready to route the packet.
	 */
	fl4.flowi4_l3mdev = 0;
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = inet_dscp_to_dsfield(dscp);
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.flowi4_flags = 0;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	fl4.flowi4_uid = sock_net_uid(net, NULL);
	fl4.flowi4_multipath_hash = 0;

	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
		flkeys = &_flkeys;
	} else {
		fl4.flowi4_proto = 0;
		fl4.fl4_sport = 0;
		fl4.fl4_dport = 0;
	}

	err = fib_lookup(net, &fl4, res, 0);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			err = -EHOSTUNREACH;
		goto no_route;
	}

	if (res->type == RTN_BROADCAST) {
		if (IN_DEV_BFORWARD(in_dev))
			goto make_route;
		/* do not cache if bc_forwarding is enabled */
		if (IPV4_DEVCONF_ALL_RO(net, BC_FORWARDING))
			do_cache = false;
		goto brd_input;
	}

	if (res->type == RTN_LOCAL) {
		reason = fib_validate_source_reason(skb, saddr, daddr, dscp,
						    0, dev, in_dev, &itag);
		if (reason)
			goto martian_source;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev)) {
		err = -EHOSTUNREACH;
		goto no_route;
	}
	if (res->type != RTN_UNICAST) {
		reason = SKB_DROP_REASON_IP_INVALID_DEST;
		goto martian_destination;
	}

make_route:
	reason = ip_mkroute_input(skb, res, in_dev, daddr, saddr, dscp,
				  flkeys);

out:
	return reason;

brd_input:
	if (skb->protocol != htons(ETH_P_IP)) {
		reason = SKB_DROP_REASON_INVALID_PROTO;
		goto out;
	}

	if (!ipv4_is_zeronet(saddr)) {
		reason = fib_validate_source_reason(skb, saddr, 0, dscp, 0,
						    dev, in_dev, &itag);
		if (reason)
			goto martian_source;
	}
	flags |= RTCF_BROADCAST;
	res->type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	if (IN_DEV_ORCONF(in_dev, NOPOLICY))
		IPCB(skb)->flags |= IPSKB_NOPOLICY;

	do_cache &= res->fi && !itag;
	if (do_cache) {
		struct fib_nh_common *nhc = FIB_RES_NHC(*res);

		rth = rcu_dereference(nhc->nhc_rth_input);
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			reason = SKB_NOT_DROPPED_YET;
			goto out;
		}
	}

	rth = rt_dst_alloc(ip_rt_get_dev(net, res),
			   flags | RTCF_LOCAL, res->type, false);
	if (!rth)
		goto e_nobufs;

	rth->dst.output = ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_is_input = 1;

	RT_CACHE_STAT_INC(in_slow_tot);
	if (res->type == RTN_UNREACHABLE) {
		rth->dst.input = ip_error;
		rth->dst.error = -err;
		rth->rt_flags &= ~RTCF_LOCAL;
	}

	if (do_cache) {
		struct fib_nh_common *nhc = FIB_RES_NHC(*res);

		rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
			WARN_ON(rth->dst.input == lwtunnel_input);
			rth->dst.lwtstate->orig_input = rth->dst.input;
			rth->dst.input = lwtunnel_input;
		}

		if (unlikely(!rt_cache_route(nhc, rth)))
			rt_add_uncached_list(rth);
	}
	skb_dst_set(skb, &rth->dst);
	reason = SKB_NOT_DROPPED_YET;
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	res->type = RTN_UNREACHABLE;
	res->fi = NULL;
	res->table = NULL;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
				     &daddr, &saddr, dev->name);
#endif
	goto out;

e_nobufs:
	reason = SKB_DROP_REASON_NOMEM;
	goto out;

martian_source:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
/* called with rcu_read_lock held */
static enum skb_drop_reason
ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
		   dscp_t dscp, struct net_device *dev,
		   struct fib_result *res)
{
	/* Multicast recognition logic is moved from the route cache to here.
	 * The problem was that too many Ethernet cards have broken/missing
	 * hardware multicast filters :-( As a result, a host on a multicast
	 * network acquires a lot of useless route cache entries, sort of
	 * SDR messages from all over the world. Now we try to get rid of them.
	 * Really, provided the software IP multicast filter is organized
	 * reasonably (at least, hashed), it does not result in a slowdown
	 * compared with route cache reject entries.
	 * Note that multicast routers are not affected, because a
	 * route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		enum skb_drop_reason reason = SKB_DROP_REASON_NOT_SPECIFIED;
		struct in_device *in_dev = __in_dev_get_rcu(dev);
		int our = 0;

		if (!in_dev)
			return reason;

		our = ip_check_mc_rcu(in_dev, daddr, saddr,
				      ip_hdr(skb)->protocol);

		/* check l3 master if no match yet */
		if (!our && netif_is_l3_slave(dev)) {
			struct in_device *l3_in_dev;

			l3_in_dev = __in_dev_get_rcu(skb->dev);
			if (l3_in_dev)
				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
						      ip_hdr(skb)->protocol);
		}

		if (our
#ifdef CONFIG_IP_MROUTE
		    ||
		    (!ipv4_is_local_multicast(daddr) &&
		     IN_DEV_MFORWARD(in_dev))
#endif
		   ) {
			reason = ip_route_input_mc(skb, daddr, saddr, dscp,
						   dev, our);
		}
		return reason;
	}

	return ip_route_input_slow(skb, daddr, saddr, dscp, dev, res);
}
enum skb_drop_reason ip_route_input_noref(struct sk_buff *skb, __be32 daddr,
					  __be32 saddr, dscp_t dscp,
					  struct net_device *dev)
{
	enum skb_drop_reason reason;
	struct fib_result res;

	rcu_read_lock();
	reason = ip_route_input_rcu(skb, daddr, saddr, dscp, dev, &res);
	rcu_read_unlock();

	return reason;
}
EXPORT_SYMBOL(ip_route_input_noref);
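/* Illustrative sketch (not part of the original file): how a caller typically
 * consumes ip_route_input_noref().  The helper below is hypothetical; it only
 * shows the contract that a zero return (SKB_NOT_DROPPED_YET) means a dst has
 * been attached to the skb, while any other value is a drop reason the packet
 * should be freed with.
 */
#if 0	/* illustration only, not compiled */
static enum skb_drop_reason example_route_incoming(struct sk_buff *skb,
						   struct net_device *dev)
{
	const struct iphdr *iph = ip_hdr(skb);
	enum skb_drop_reason reason;

	reason = ip_route_input_noref(skb, iph->daddr, iph->saddr,
				      inet_dsfield_to_dscp(iph->tos), dev);
	if (reason)
		kfree_skb_reason(skb, reason);	/* martian source, no route, ... */

	return reason;
}
#endif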
/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4, int orig_oif,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct fib_nh_exception *fnhe;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;
	bool do_cache;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(fl4->saddr) &&
		    !(dev_out->flags & IFF_LOOPBACK) &&
		    !netif_is_l3_master(dev_out))
			return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	do_cache = true;
	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		else
			do_cache = false;
		/* If a multicast route does not exist, use the
		 * default one, but do not gateway in this case.
		 * Yes, it is a hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
		   (orig_oif != dev_out->ifindex)) {
		/* For local routes that require a particular output interface
		 * we do not want to cache the result.  Caching the result
		 * causes incorrect behaviour when there are multiple source
		 * addresses on the interface, the end result being that if the
		 * intended recipient is waiting on that interface for the
		 * packet he won't receive it because it will be delivered on
		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
		 * be set to the loopback interface as well.
		 */
		do_cache = false;
	}

	fnhe = NULL;
	do_cache &= fi != NULL;
	if (fi) {
		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
		struct rtable __rcu **prth;

		fnhe = find_exception(nhc, fl4->daddr);
		if (!do_cache)
			goto add;
		if (fnhe) {
			prth = &fnhe->fnhe_rth_output;
		} else {
			if (unlikely(fl4->flowi4_flags &
				     FLOWI_FLAG_KNOWN_NH &&
				     !(nhc->nhc_gw_family &&
				       nhc->nhc_scope == RT_SCOPE_LINK))) {
				do_cache = false;
				goto add;
			}
			prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
		}
		rth = rcu_dereference(*prth);
		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
			return rth;
	}

add:
	rth = rt_dst_alloc(dev_out, flags, type,
			   IN_DEV_ORCONF(in_dev, NOXFRM));
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->rt_iif = orig_oif;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
	lwtunnel_set_redirect(&rth->dst);

	return rth;
}
/*
 * Major route resolver routine.
 */

struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
					const struct sk_buff *skb)
{
	struct fib_result res = {
		.type		= RTN_UNSPEC,
		.fi		= NULL,
		.table		= NULL,
		.tclassid	= 0,
	};
	struct rtable *rth;

	fl4->flowi4_iif = LOOPBACK_IFINDEX;
	fl4->flowi4_tos &= INET_DSCP_MASK;

	rcu_read_lock();
	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
	rcu_read_unlock();

	return rth;
}
EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
					    struct fib_result *res,
					    const struct sk_buff *skb)
{
	struct net_device *dev_out = NULL;
	int orig_oif = fl4->flowi4_oif;
	unsigned int flags = 0;
	struct rtable *rth;
	int err;

	if (fl4->saddr) {
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr)) {
			rth = ERR_PTR(-EINVAL);
			goto out;
		}

		rth = ERR_PTR(-ENETUNREACH);

		/* I removed the check for oif == dev_out->oif here.
		 * It was wrong for two reasons:
		 * 1. ip_dev_find(net, saddr) can return the wrong iface, if saddr
		 *    is assigned to multiple interfaces.
		 * 2. Moreover, we are allowed to send packets with the saddr
		 *    of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (!dev_out)
				goto out;

			/* Special hack: user can direct multicasts
			 * and limited broadcast via the necessary interface
			 * without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			 * This hack is not just for fun, it allows
			 * vic, vat and friends to work.
			 * They bind the socket to loopback, set ttl to zero
			 * and expect that it will work.
			 * From the viewpoint of the routing cache they are broken,
			 * because we are not allowed to build a multicast path
			 * with a loopback source addr (look, the routing cache
			 * cannot know that ttl is zero, so that the packet
			 * will not leave this host and the route is valid).
			 * Luckily, this hack is a good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}

	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (!dev_out)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr) ||
		    fl4->flowi4_proto == IPPROTO_IGMP) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl4->daddr) {
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = LOOPBACK_IFINDEX;
		res->type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	err = fib_lookup(net, fl4, res, 0);
	if (err) {
		res->fi = NULL;
		res->table = NULL;
		if (fl4->flowi4_oif &&
		    (ipv4_is_multicast(fl4->daddr) || !fl4->flowi4_l3mdev)) {
			/* Apparently, the routing tables are wrong. Assume
			 * that the destination is on link.
			 *
			 * Because we are allowed to send to an iface
			 * even if it has NO routes and NO assigned
			 * addresses. When oif is specified, routing
			 * tables are looked up with only one purpose:
			 * to catch if the destination is gatewayed, rather than
			 * direct. Moreover, if MSG_DONTROUTE is set,
			 * we send the packet, ignoring both routing tables
			 * and ifaddr state. --ANK
			 *
			 * We could make it even if oif is unknown,
			 * likely IPv6, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res->type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(err);
		goto out;
	}

	if (res->type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res->fi->fib_prefsrc)
				fl4->saddr = res->fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}

		/* L3 master device is the loopback for that domain */
		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
			net->loopback_dev;

		/* make sure orig_oif points to fib result device even
		 * though packet rx/tx happens over loopback or l3mdev
		 */
		orig_oif = FIB_RES_OIF(*res);

		fl4->flowi4_oif = dev_out->ifindex;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	fib_select_path(net, res, fl4, skb);

	dev_out = FIB_RES_DEV(*res);

make_route:
	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);

out:
	return rth;
}
static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			= AF_INET,
	.default_advmss		= ipv4_default_advmss,
	.neigh_lookup		= ipv4_neigh_lookup,
	.check			= dst_blackhole_check,
	.cow_metrics		= dst_blackhole_cow_metrics,
	.update_pmtu		= dst_blackhole_update_pmtu,
	.redirect		= dst_blackhole_redirect,
	.mtu			= dst_blackhole_mtu,
};
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *ort = dst_rtable(dst_orig);
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, DST_OBSOLETE_DEAD, 0);
	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->input = dst_discard;
		new->output = dst_discard_out;

		new->dev = net->loopback_dev;
		netdev_hold(new->dev, &new->dev_tracker, GFP_ATOMIC);

		rt->rt_is_input = ort->rt_is_input;
		rt->rt_iif = ort->rt_iif;
		rt->rt_pmtu = ort->rt_pmtu;
		rt->rt_mtu_locked = ort->rt_mtu_locked;

		rt->rt_genid = rt_genid_ipv4(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_uses_gateway = ort->rt_uses_gateway;
		rt->rt_gw_family = ort->rt_gw_family;
		if (rt->rt_gw_family == AF_INET)
			rt->rt_gw4 = ort->rt_gw4;
		else if (rt->rt_gw_family == AF_INET6)
			rt->rt_gw6 = ort->rt_gw6;
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
				    const struct sock *sk)
{
	struct rtable *rt = __ip_route_output_key(net, flp4);

	if (IS_ERR(rt))
		return rt;

	if (flp4->flowi4_proto) {
		flp4->flowi4_oif = rt->dst.dev->ifindex;
		rt = dst_rtable(xfrm_lookup_route(net, &rt->dst,
						  flowi4_to_flowi(flp4),
						  sk, 0));
	}

	return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);
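/* Illustrative sketch (not part of the original file): typical use of the
 * output lookup exported above.  The helper is hypothetical; it only shows
 * the ERR_PTR()/ip_rt_put() contract of ip_route_output_flow().
 */
#if 0	/* illustration only, not compiled */
static int example_output_lookup(struct net *net, __be32 daddr, __be32 saddr)
{
	struct flowi4 fl4 = {
		.daddr		= daddr,
		.saddr		= saddr,
		.flowi4_proto	= IPPROTO_UDP,
	};
	struct rtable *rt;

	rt = ip_route_output_flow(net, &fl4, NULL);
	if (IS_ERR(rt))
		return PTR_ERR(rt);		/* e.g. -ENETUNREACH */

	/* ... transmit via rt->dst.dev ... */

	ip_rt_put(rt);				/* release the dst reference */
	return 0;
}
#endif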
/* called with rcu_read_lock held */
static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
			struct rtable *rt, u32 table_id, dscp_t dscp,
			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
			u32 seq, unsigned int flags)
{
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	u32 error;
	u32 metrics[RTAX_MAX];

	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
	if (!nlh)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= inet_dscp_to_dsfield(dscp);
	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table_id))
		goto nla_put_failure;
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;
	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
		r->rtm_flags |= RTCF_DOREDIRECT;

	if (nla_put_in_addr(skb, RTA_DST, dst))
		goto nla_put_failure;
	if (src) {
		r->rtm_src_len = 32;
		if (nla_put_in_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
	}
	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
	if (rt->dst.lwtstate &&
	    lwtunnel_fill_encap(skb, rt->dst.lwtstate, RTA_ENCAP, RTA_ENCAP_TYPE) < 0)
		goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid &&
	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
		goto nla_put_failure;
#endif
	if (fl4 && !rt_is_input_route(rt) &&
	    fl4->saddr != src) {
		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
			goto nla_put_failure;
	}
	if (rt->rt_uses_gateway) {
		if (rt->rt_gw_family == AF_INET &&
		    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
			goto nla_put_failure;
		} else if (rt->rt_gw_family == AF_INET6) {
			int alen = sizeof(struct in6_addr);
			struct nlattr *nla;
			struct rtvia *via;

			nla = nla_reserve(skb, RTA_VIA, alen + 2);
			if (!nla)
				goto nla_put_failure;

			via = nla_data(nla);
			via->rtvia_family = AF_INET6;
			memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
		}
	}

	expires = rt->dst.expires;
	if (expires) {
		unsigned long now = jiffies;

		if (time_before(now, expires))
			expires -= now;
		else
			expires = 0;
	}

	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt_pmtu && expires)
		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
	if (rt->rt_mtu_locked && expires)
		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (fl4) {
		if (fl4->flowi4_mark &&
		    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
			goto nla_put_failure;

		if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
		    nla_put_u32(skb, RTA_UID,
				from_kuid_munged(current_user_ns(),
						 fl4->flowi4_uid)))
			goto nla_put_failure;

		if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
			if (ipv4_is_multicast(dst) &&
			    !ipv4_is_local_multicast(dst) &&
			    IPV4_DEVCONF_ALL_RO(net, MC_FORWARDING)) {
				int err = ipmr_get_route(net, skb,
							 fl4->saddr, fl4->daddr,
							 r, portid);

				if (err <= 0) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				}
			} else
#endif
				if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
					goto nla_put_failure;
		}
	}

	error = rt->dst.error;

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
			    struct netlink_callback *cb, u32 table_id,
			    struct fnhe_hash_bucket *bucket, int genid,
			    int *fa_index, int fa_start, unsigned int flags)
{
	int i;

	for (i = 0; i < FNHE_HASH_SIZE; i++) {
		struct fib_nh_exception *fnhe;

		for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
		     fnhe = rcu_dereference(fnhe->fnhe_next)) {
			struct rtable *rt;
			int err;

			if (*fa_index < fa_start)
				goto next;

			if (fnhe->fnhe_genid != genid)
				goto next;

			if (fnhe->fnhe_expires &&
			    time_after(jiffies, fnhe->fnhe_expires))
				goto next;

			rt = rcu_dereference(fnhe->fnhe_rth_input);
			if (!rt)
				rt = rcu_dereference(fnhe->fnhe_rth_output);
			if (!rt)
				goto next;

			err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
					   table_id, 0, NULL, skb,
					   NETLINK_CB(cb->skb).portid,
					   cb->nlh->nlmsg_seq, flags);
			if (err)
				return err;
next:
			(*fa_index)++;
		}
	}

	return 0;
}
int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
		       u32 table_id, struct fib_info *fi,
		       int *fa_index, int fa_start, unsigned int flags)
{
	struct net *net = sock_net(cb->skb->sk);
	int nhsel, genid = fnhe_genid(net);

	for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
		struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
		struct fnhe_hash_bucket *bucket;
		int err;

		if (nhc->nhc_flags & RTNH_F_DEAD)
			continue;

		rcu_read_lock();
		bucket = rcu_dereference(nhc->nhc_exceptions);
		err = 0;
		if (bucket)
			err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
					       genid, fa_index, fa_start,
					       flags);
		rcu_read_unlock();
		if (err)
			return err;
	}

	return 0;
}
static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
						   u8 ip_proto, __be16 sport,
						   __be16 dport)
{
	struct sk_buff *skb;
	struct iphdr *iph;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return NULL;

	/* Reserve room for dummy headers, this skb can pass
	 * through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);
	skb->protocol = htons(ETH_P_IP);
	iph = skb_put(skb, sizeof(struct iphdr));
	iph->protocol = ip_proto;
	iph->saddr = src;
	iph->daddr = dst;
	iph->version = 0x4;
	iph->frag_off = 0;
	iph->ihl = 0x5;
	skb_set_transport_header(skb, skb->len);

	switch (iph->protocol) {
	case IPPROTO_UDP: {
		struct udphdr *udph;

		udph = skb_put_zero(skb, sizeof(struct udphdr));
		udph->source = sport;
		udph->dest = dport;
		udph->len = htons(sizeof(struct udphdr));
		udph->check = 0;
		break;
	}
	case IPPROTO_TCP: {
		struct tcphdr *tcph;

		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
		tcph->source = sport;
		tcph->dest = dport;
		tcph->doff = sizeof(struct tcphdr) / 4;
		tcph->rst = 1;
		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
					    src, dst, 0);
		break;
	}
	case IPPROTO_ICMP: {
		struct icmphdr *icmph;

		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
		icmph->type = ICMP_ECHO;
		icmph->code = 0;
	}
	}

	return skb;
}
static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
				       const struct nlmsghdr *nlh,
				       struct nlattr **tb,
				       struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	int i, err;

	rtm = nlmsg_payload(nlh, sizeof(*rtm));
	if (!rtm) {
		NL_SET_ERR_MSG(extack,
			       "ipv4: Invalid header for route get request");
		return -EINVAL;
	}

	if (!netlink_strict_get_check(skb))
		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
					      rtm_ipv4_policy, extack);

	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
	    rtm->rtm_table || rtm->rtm_protocol ||
	    rtm->rtm_scope || rtm->rtm_type) {
		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
		return -EINVAL;
	}

	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
			       RTM_F_LOOKUP_TABLE |
			       RTM_F_FIB_MATCH)) {
		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
		return -EINVAL;
	}

	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
					    rtm_ipv4_policy, extack);
	if (err)
		return err;

	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
		return -EINVAL;
	}

	for (i = 0; i <= RTA_MAX; i++) {
		if (!tb[i])
			continue;

		switch (i) {
		case RTA_IIF:
		case RTA_OIF:
		case RTA_SRC:
		case RTA_DST:
		case RTA_IP_PROTO:
		case RTA_SPORT:
		case RTA_DPORT:
		case RTA_MARK:
		case RTA_UID:
			break;
		default:
			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
			return -EINVAL;
		}
	}

	return 0;
}
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			     struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX + 1];
	u32 table_id = RT_TABLE_MAIN;
	__be16 sport = 0, dport = 0;
	struct fib_result res = {};
	u8 ip_proto = IPPROTO_UDP;
	struct rtable *rt = NULL;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi4 fl4 = {};
	__be32 dst = 0;
	__be32 src = 0;
	kuid_t uid;
	u32 iif;
	int err;
	int mark;
	dscp_t dscp;

	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		return err;

	rtm = nlmsg_data(nlh);
	src = nla_get_in_addr_default(tb[RTA_SRC], 0);
	dst = nla_get_in_addr_default(tb[RTA_DST], 0);
	iif = nla_get_u32_default(tb[RTA_IIF], 0);
	mark = nla_get_u32_default(tb[RTA_MARK], 0);
	dscp = inet_dsfield_to_dscp(rtm->rtm_tos);
	if (tb[RTA_UID])
		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
	else
		uid = (iif ? INVALID_UID : current_uid());

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &ip_proto, AF_INET, extack);
		if (err)
			return err;
	}

	if (tb[RTA_SPORT])
		sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		dport = nla_get_be16(tb[RTA_DPORT]);

	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
	if (!skb)
		return -ENOBUFS;

	fl4.daddr = dst;
	fl4.saddr = src;
	fl4.flowi4_tos = inet_dscp_to_dsfield(dscp);
	fl4.flowi4_oif = nla_get_u32_default(tb[RTA_OIF], 0);
	fl4.flowi4_mark = mark;
	fl4.flowi4_uid = uid;
	if (sport)
		fl4.fl4_sport = sport;
	if (dport)
		fl4.fl4_dport = dport;
	fl4.flowi4_proto = ip_proto;

	rcu_read_lock();

	if (iif) {
		struct net_device *dev;

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			err = -ENODEV;
			goto errout_rcu;
		}

		fl4.flowi4_iif = iif; /* for rt_fill_info */
		skb->dev = dev;
		skb->mark = mark;
		err = ip_route_input_rcu(skb, dst, src, dscp, dev,
					 &res) ? -EINVAL : 0;

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		fl4.flowi4_iif = LOOPBACK_IFINDEX;
		skb->dev = net->loopback_dev;
		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
		else
			skb_dst_set(skb, &rt->dst);
	}

	if (err)
		goto errout_rcu;

	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
		table_id = res.table ? res.table->tb_id : 0;

	/* reset skb for netlink reply msg */
	skb_trim(skb, 0);
	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb_reset_mac_header(skb);

	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
		struct fib_rt_info fri;

		if (!res.fi) {
			err = fib_props[res.type].error;
			if (!err)
				err = -EHOSTUNREACH;
			goto errout_rcu;
		}
		fri.fi = res.fi;
		fri.tb_id = table_id;
		fri.dst = res.prefix;
		fri.dst_len = res.prefixlen;
		fri.dscp = res.dscp;
		fri.type = rt->rt_type;
		fri.offload = 0;
		fri.trap = 0;
		fri.offload_failed = 0;
		if (res.fa_head) {
			struct fib_alias *fa;

			hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
				u8 slen = 32 - fri.dst_len;

				if (fa->fa_slen == slen &&
				    fa->tb_id == fri.tb_id &&
				    fa->fa_dscp == fri.dscp &&
				    fa->fa_info == res.fi &&
				    fa->fa_type == fri.type) {
					fri.offload = READ_ONCE(fa->offload);
					fri.trap = READ_ONCE(fa->trap);
					fri.offload_failed =
						READ_ONCE(fa->offload_failed);
					break;
				}
			}
		}
		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
	} else {
		err = rt_fill_info(net, dst, src, rt, table_id, res.dscp, &fl4,
				   skb, NETLINK_CB(in_skb).portid,
				   nlh->nlmsg_seq, 0);
	}
	if (err < 0)
		goto errout_rcu;

	rcu_read_unlock();

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);

errout_free:
	return err;

errout_rcu:
	rcu_read_unlock();
	kfree_skb(skb);
	goto errout_free;
}
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev));
}
#ifdef CONFIG_SYSCTL
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;

static int ipv4_sysctl_rtcache_flush(const struct ctl_table *__ctl, int write,
				     void *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net = (struct net *)__ctl->extra1;

	if (write) {
		rt_cache_flush(net);
		fnhe_genid_bump(net);
		return 0;
	}

	return -EINVAL;
}
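/* Illustrative note (not from the original source): the handler above backs
 * the per-netns "flush" sysctl registered below; writing any value to it
 * bumps the rt/fnhe generation counters so that cached dsts and nexthop
 * exceptions are invalidated lazily, e.g.:
 *
 *	echo 1 > /proc/sys/net/ipv4/route/flush
 */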
static struct ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.proc_handler	= proc_dointvec,
	},
};
static const char ipv4_route_flush_procname[] = "flush";

static struct ctl_table ipv4_route_netns_table[] = {
	{
		.procname	= ipv4_route_flush_procname,
		.maxlen		= sizeof(int),
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{
		.procname	= "min_pmtu",
		.data		= &init_net.ipv4.ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &ip_min_valid_pmtu,
	},
	{
		.procname	= "mtu_expires",
		.data		= &init_net.ipv4.ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &init_net.ipv4.ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.proc_handler	= proc_dointvec,
	},
};
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;
	size_t table_size = ARRAY_SIZE(ipv4_route_netns_table);

	tbl = ipv4_route_netns_table;
	if (!net_eq(net, &init_net)) {
		int i;

		tbl = kmemdup(tbl, sizeof(ipv4_route_netns_table), GFP_KERNEL);
		if (!tbl)
			goto err_dup;

		/* Don't export non-whitelisted sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns) {
			if (tbl[0].procname != ipv4_route_flush_procname)
				table_size = 0;
		}

		/* Update the variables to point into the current struct net
		 * except for the first element (flush).
		 */
		for (i = 1; i < table_size; i++)
			tbl[i].data += (void *)net - (void *)&init_net;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl_sz(net, "net/ipv4/route",
						     tbl, table_size);
	if (!net->ipv4.route_hdr)
		goto err_reg;

	return 0;

err_reg:
	if (tbl != ipv4_route_netns_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}
static __net_exit void sysctl_route_net_exit(struct net *net)
{
	const struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_netns_table);
	kfree(tbl);
}
static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif
static __net_init int netns_ip_rt_init(struct net *net)
{
	/* Set default values for namespaceified sysctls */
	net->ipv4.ip_rt_min_pmtu = DEFAULT_MIN_PMTU;
	net->ipv4.ip_rt_mtu_expires = DEFAULT_MTU_EXPIRES;
	net->ipv4.ip_rt_min_advmss = DEFAULT_MIN_ADVMSS;
	return 0;
}
static struct pernet_operations __net_initdata ip_rt_ops = {
	.init = netns_ip_rt_init,
};
static __net_init int rt_genid_init(struct net *net)
{
	atomic_set(&net->ipv4.rt_genid, 0);
	atomic_set(&net->fnhe_genid, 0);
	atomic_set(&net->ipv4.dev_addr_genid, get_random_u32());
	return 0;
}
static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
static int __net_init ipv4_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv4.peers = bp;
	return 0;
}
static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}
static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init	= ipv4_inetpeer_init,
	.exit	= ipv4_inetpeer_exit,
};
#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
static const struct rtnl_msg_handler ip_rt_rtnl_msg_handlers[] __initconst = {
	{.protocol = PF_INET, .msgtype = RTM_GETROUTE,
	 .doit = inet_rtm_getroute, .flags = RTNL_FLAG_DOIT_UNLOCKED},
};
int __init ip_rt_init(void)
{
	void *idents_hash;
	int cpu;

	/* For modern hosts, this will use 2 MB of memory */
	idents_hash = alloc_large_system_hash("IP idents",
					      sizeof(*ip_idents) + sizeof(*ip_tstamps),
					      0,
					      16, /* one bucket per 64 KB */
					      HASH_ZERO,
					      NULL,
					      &ip_idents_mask,
					      2048,
					      256 * 1024);

	ip_idents = idents_hash;

	get_random_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));

	ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}
#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep = KMEM_CACHE(rtable,
					      SLAB_HWCACHE_ALIGN | SLAB_PANIC);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register_many(ip_rt_rtnl_msg_handlers);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&ip_rt_ops);
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return 0;
}
#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif