// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/nexthop.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#include "fib_lookup.h"

#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static u32 ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;

static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;

/*
 *	Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu,
					   bool confirm_neigh);
static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static void		 ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr);
static void		 ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.redirect =		ip_do_redirect,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
	.confirm_neigh =	ipv4_confirm_neigh,
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

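/* Editorial note, not in the original source: callers are expected to index
 * this table through the rt_tos2priority() helper (declared in
 * include/net/route.h), which uses the four TOS bits as
 * ip_tos2prio[IPTOS_TOS(tos) >> 1]. For example, TOS 0x10 (IPTOS_LOWDELAY)
 * maps to index 8, i.e. TC_PRIO_INTERACTIVE.
 */
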
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (*pos)
		return NULL;
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cache_seq_ops);
}

static const struct proc_ops rt_cache_proc_ops = {
	.proc_open	= rt_cache_seq_open,
	.proc_read	= seq_read,
	.proc_lseek	= seq_lseek,
	.proc_release	= seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	(*pos)++;
	return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   0, /* st->in_hit */
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   0, /* st->out_hit */
		   st->out_slow_tot,
		   st->out_slow_mc,

		   0, /* st->gc_total */
		   0, /* st->gc_ignored */
		   0, /* st->gc_goal_miss */
		   0, /* st->gc_dst_overflow */
		   0, /* st->in_hlist_search */
		   0  /* st->out_hlist_search */
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct proc_ops rt_cpu_proc_ops = {
	.proc_open	= rt_cpu_seq_open,
	.proc_read	= seq_read,
	.proc_lseek	= seq_lseek,
	.proc_release	= seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_create("rt_cache", 0444, net->proc_net,
			  &rt_cache_proc_ops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", 0444,
			  net->proc_net_stat, &rt_cpu_proc_ops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create_single("rt_acct", 0, net->proc_net,
				 rt_acct_proc_show);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =	{
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr)
{
	const struct rtable *rt = container_of(dst, struct rtable, dst);
	struct net_device *dev = dst->dev;
	struct neighbour *n;

	rcu_read_lock_bh();

	if (likely(rt->rt_gw_family == AF_INET)) {
		n = ip_neigh_gw4(dev, rt->rt_gw4);
	} else if (rt->rt_gw_family == AF_INET6) {
		n = ip_neigh_gw6(dev, &rt->rt_gw6);
	} else {
		__be32 pkey;

		pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
		n = ip_neigh_gw4(dev, pkey);
	}

	if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
		n = NULL;

	rcu_read_unlock_bh();

	return n;
}

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	const struct rtable *rt = container_of(dst, struct rtable, dst);
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;

	if (rt->rt_gw_family == AF_INET) {
		pkey = (const __be32 *)&rt->rt_gw4;
	} else if (rt->rt_gw_family == AF_INET6) {
		return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
	} else if (!daddr ||
		   (rt->rt_flags &
		    (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
		return;
	}
	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
	u32 old = READ_ONCE(*p_tstamp);
	u32 now = (u32)jiffies;
	u32 delta = 0;

	if (old != now && cmpxchg(p_tstamp, old, now) == old)
		delta = prandom_u32_max(now - old);

	/* If UBSAN reports an error there, please make sure your compiler
	 * supports -fno-strict-overflow before reporting it that was a bug
	 * in UBSAN, and it has been fixed in GCC-8.
	 */
	return atomic_add_return(segs + delta, p_id) - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);

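/* Editorial sketch, not in the original source: if a bucket's timestamp is
 * 1000 jiffies old when it is next used, delta is drawn uniformly from
 * [0, 1000), so the IDs handed out by a seldom-used generator advance by a
 * random stride and an observer cannot count the packets sent in between.
 */
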
void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
	u32 hash, id;

	/* Note the following code is not safe, but this is okay. */
	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
		get_random_bytes(&net->ipv4.ip_id_key,
				 sizeof(net->ipv4.ip_id_key));

	hash = siphash_3u32((__force u32)iph->daddr,
			    (__force u32)iph->saddr,
			    iph->protocol,
			    &net->ipv4.ip_id_key);
	id = ip_idents_reserve(hash, segs);
	iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
			     const struct sock *sk,
			     const struct iphdr *iph,
			     int oif, u8 tos,
			     u8 prot, u32 mark, int flow_flags)
{
	if (sk) {
		const struct inet_sock *inet = inet_sk(sk);

		oif = sk->sk_bound_dev_if;
		mark = sk->sk_mark;
		tos = RT_CONN_FLAGS(sk);
		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
	}
	flowi4_init_output(fl4, oif, mark, tos,
			   RT_SCOPE_UNIVERSE, prot,
			   flow_flags,
			   iph->daddr, iph->saddr, 0, 0,
			   sock_net_uid(net, sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
			       const struct sock *sk)
{
	const struct net *net = dev_net(skb->dev);
	const struct iphdr *iph = ip_hdr(skb);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;
	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
			   inet_sk_flowi_flags(sk),
			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
	rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (skb)
		build_skb_flow_key(fl4, skb, sk);
	else
		build_sk_flow_key(fl4, sk);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
	struct rtable *rt;

	rt = rcu_dereference(fnhe->fnhe_rth_input);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
	rt = rcu_dereference(fnhe->fnhe_rth_output);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
	struct fib_nh_exception *fnhe, *oldest;

	oldest = rcu_dereference(hash->chain);
	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
			oldest = fnhe;
	}
	fnhe_flush_routes(oldest);
	return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
	static u32 fnhe_hashrnd __read_mostly;
	u32 hval;

	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
	hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
	return hash_32(hval, FNHE_HASH_SHIFT);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
	rt->rt_pmtu = fnhe->fnhe_pmtu;
	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
	rt->dst.expires = fnhe->fnhe_expires;

	if (fnhe->fnhe_gw) {
		rt->rt_flags |= RTCF_REDIRECTED;
		rt->rt_uses_gateway = 1;
		rt->rt_gw_family = AF_INET;
		rt->rt_gw4 = fnhe->fnhe_gw;
	}
}

static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
				  __be32 gw, u32 pmtu, bool lock,
				  unsigned long expires)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe;
	struct rtable *rt;
	u32 genid, hval;
	unsigned int i;
	int depth;

	genid = fnhe_genid(dev_net(nhc->nhc_dev));
	hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference(nhc->nhc_exceptions);
	if (!hash) {
		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
		if (!hash)
			goto out_unlock;
		rcu_assign_pointer(nhc->nhc_exceptions, hash);
	}

	hash += hval;

	depth = 0;
	for (fnhe = rcu_dereference(hash->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			break;
		depth++;
	}

	if (fnhe) {
		if (fnhe->fnhe_genid != genid)
			fnhe->fnhe_genid = genid;
		if (gw)
			fnhe->fnhe_gw = gw;
		if (pmtu) {
			fnhe->fnhe_pmtu = pmtu;
			fnhe->fnhe_mtu_locked = lock;
		}
		fnhe->fnhe_expires = max(1UL, expires);
		/* Update all cached dsts too */
		rt = rcu_dereference(fnhe->fnhe_rth_input);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
		rt = rcu_dereference(fnhe->fnhe_rth_output);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
	} else {
		if (depth > FNHE_RECLAIM_DEPTH)
			fnhe = fnhe_oldest(hash);
		else {
			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
			if (!fnhe)
				goto out_unlock;

			fnhe->fnhe_next = hash->chain;
			rcu_assign_pointer(hash->chain, fnhe);
		}
		fnhe->fnhe_genid = genid;
		fnhe->fnhe_daddr = daddr;
		fnhe->fnhe_gw = gw;
		fnhe->fnhe_pmtu = pmtu;
		fnhe->fnhe_mtu_locked = lock;
		fnhe->fnhe_expires = max(1UL, expires);

		/* Exception created; mark the cached routes for the nexthop
		 * stale, so anyone caching it rechecks if this exception
		 * applies to them.
		 */
		rt = rcu_dereference(nhc->nhc_rth_input);
		if (rt)
			rt->dst.obsolete = DST_OBSOLETE_KILL;

		for_each_possible_cpu(i) {
			struct rtable __rcu **prt;
			prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
			rt = rcu_dereference(*prt);
			if (rt)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
		}
	}

	fnhe->fnhe_stamp = jiffies;

out_unlock:
	spin_unlock_bh(&fnhe_lock);
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
			     bool kill_route)
{
	__be32 new_gw = icmp_hdr(skb)->un.gateway;
	__be32 old_gw = ip_hdr(skb)->saddr;
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct fib_result res;
	struct neighbour *n;
	struct net *net;

	switch (icmp_hdr(skb)->code & 7) {
	case ICMP_REDIR_NET:
	case ICMP_REDIR_NETTOS:
	case ICMP_REDIR_HOST:
	case ICMP_REDIR_HOSTTOS:
		break;

	default:
		return;
	}

	if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
		return;

	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
	if (!n)
		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
	if (!IS_ERR(n)) {
		if (!(n->nud_state & NUD_VALID)) {
			neigh_event_send(n, NULL);
		} else {
			if (fib_lookup(net, fl4, &res, 0) == 0) {
				struct fib_nh_common *nhc = FIB_RES_NHC(res);

				update_or_create_fnhe(nhc, fl4->daddr, new_gw,
						0, false,
						jiffies + ip_rt_gc_timeout);
			}
			if (kill_route)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
		}
		neigh_release(n);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev)) {
		const struct iphdr *iph = (const struct iphdr *) skb->data;
		__be32 daddr = iph->daddr;
		__be32 saddr = iph->saddr;

		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     "  Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
	}
#endif
	;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct rtable *rt;
	struct flowi4 fl4;
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct net *net = dev_net(skb->dev);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	rt = (struct rtable *) dst;

	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
	__ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->dst.expires) {
			ip_rt_put(rt);
			ret = NULL;
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start sending redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

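/* Editorial worked example, not in the original source: with the defaults
 * above (ip_rt_redirect_load = HZ/50, ip_rt_redirect_number = 9), the k-th
 * redirect to a peer is sent only once (HZ/50) << k has elapsed since the
 * previous one; after nine apparently ignored redirects we go quiet until
 * ip_rt_redirect_silence, (HZ/50) << 10 (roughly 20 seconds), has passed.
 */
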
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	struct net *net;
	int log_martians;
	int vif;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
	rcu_read_unlock();

	net = dev_net(rt->dst.dev);
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
			  rt_nexthop(rt, ip_hdr(skb)->daddr));
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
		peer->rate_tokens = 0;
		peer->n_redirects = 0;
	}

	/* Too many ignored redirects; do not send anything;
	 * set rate_last to the last seen redirected packet.
	 */
	if (peer->n_redirects >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		goto out_put_peer;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->n_redirects == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->n_redirects)))) {
		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
		peer->rate_last = jiffies;
		++peer->n_redirects;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->n_redirects == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, inet_iif(skb),
					     &ip_hdr(skb)->daddr, &gw);
#endif
	}
out_put_peer:
	inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct inet_peer *peer;
	unsigned long now;
	struct net *net;
	bool send;
	int code;

	if (netif_is_l3_master(skb->dev)) {
		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
		if (!dev)
			goto out;
	}

	in_dev = __in_dev_get_rcu(dev);

	/* IP on this device is disabled. */
	if (!in_dev)
		goto out;

	net = dev_net(rt->dst.dev);
	if (!IN_DEV_FORWARD(in_dev)) {
		switch (rt->dst.error) {
		case EHOSTUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
			break;

		case ENETUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
			break;
		}
		goto out;
	}

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
			       l3mdev_master_ifindex(skb->dev), 1);

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
		inet_putpeer(peer);
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct dst_entry *dst = &rt->dst;
	u32 old_mtu = ipv4_mtu(dst);
	struct fib_result res;
	bool lock = false;

	if (ip_mtu_locked(dst))
		return;

	if (old_mtu < mtu)
		return;

	if (mtu < ip_rt_min_pmtu) {
		lock = true;
		mtu = min(old_mtu, ip_rt_min_pmtu);
	}

	if (rt->rt_pmtu == mtu && !lock &&
	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
		return;

	rcu_read_lock();
	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
		struct fib_nh_common *nhc = FIB_RES_NHC(res);

		update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
				      jiffies + ip_rt_mtu_expires);
	}
	rcu_read_unlock();
}

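/* Editorial note, not in the original source: if an ICMP fragmentation-needed
 * message advertises an MTU below ip_rt_min_pmtu (512 + 20 + 20 = 552 by
 * default), the exception is clamped to min(old_mtu, ip_rt_min_pmtu) and
 * marked locked, so a forged tiny MTU cannot drive the path MTU arbitrarily
 * low.
 */
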
static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			      struct sk_buff *skb, u32 mtu,
			      bool confirm_neigh)
{
	struct rtable *rt = (struct rtable *) dst;
	struct flowi4 fl4;

	ip_rt_build_flow_key(&fl4, sk, skb);
	__ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
		      int oif, u8 protocol)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	u32 mark = IP4_REPLY_MARK(net, skb->mark);

	__build_flow_key(net, &fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

	if (!fl4.flowi4_mark)
		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

	rt = __ip_route_output_key(sock_net(sk), &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct dst_entry *odst = NULL;
	bool new = false;
	struct net *net = sock_net(sk);

	bh_lock_sock(sk);

	if (!ip_sk_accept_pmtu(sk))
		goto out;

	odst = sk_dst_get(sk);

	if (sock_owned_by_user(sk) || !odst) {
		__ipv4_sk_update_pmtu(skb, sk, mtu);
		goto out;
	}

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

	rt = (struct rtable *)odst;
	if (odst->obsolete && !odst->ops->check(odst, 0)) {
		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	__ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);

	if (!dst_check(&rt->dst, 0)) {
		if (new)
			dst_release(&rt->dst);

		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	if (new)
		sk_dst_set(sk, &rt->dst);

out:
	bh_unlock_sock(sk);
	dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
		   int oif, u8 protocol)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(net, &fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, 0, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct net *net = sock_net(sk);

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	/* All IPV4 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 *
	 * When a PMTU/redirect information update invalidates a route,
	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
	 * DST_OBSOLETE_DEAD.
	 */
	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
		return NULL;
	return dst;
}

static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
	struct ip_options opt;
	int res;

	/* Recompile ip options since IPCB may not be valid anymore.
	 * Also check we have a reasonable ipv4 header.
	 */
	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
	    ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
		return;

	memset(&opt, 0, sizeof(opt));
	if (ip_hdr(skb)->ihl > 5) {
		if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
			return;
		opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);

		rcu_read_lock();
		res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
		rcu_read_unlock();

		if (res)
			return;
	}
	__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	ipv4_send_dest_unreach(skb);

	rt = skb_rtable(skb);
	if (rt)
		dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	pr_debug("%s: %pI4 -> %pI4, %s\n",
		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		 skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}

/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by IP RR, TS and SRR options,
 * so it stays out of the fast path.
 *
 * BTW remember: "addr" is allowed to be unaligned
 * in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct iphdr *iph = ip_hdr(skb);
		struct flowi4 fl4 = {
			.daddr = iph->daddr,
			.saddr = iph->saddr,
			.flowi4_tos = RT_TOS(iph->tos),
			.flowi4_oif = rt->dst.dev->ifindex,
			.flowi4_iif = skb->dev->ifindex,
			.flowi4_mark = skb->mark,
		};

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
		else
			src = inet_select_addr(rt->dst.dev,
					       rt_nexthop(rt, iph->daddr),
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
				    ip_rt_min_advmss);

	return min(advmss, IPV4_MAX_PMTU - header_size);
}

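/* Editorial worked example, not in the original source: for a route with an
 * effective MTU of 1500, the advertised MSS is 1500 - 40 (IPv4 + TCP
 * headers) = 1460, bounded below by ip_rt_min_advmss (256) and above by
 * IPV4_MAX_PMTU - 40.
 */
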
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = rt->rt_pmtu;

	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
		mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu)
		return mtu;

	mtu = READ_ONCE(dst->dev->mtu);

	if (unlikely(ip_mtu_locked(dst))) {
		if (rt->rt_uses_gateway && mtu > 576)
			mtu = 576;
	}

	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference_protected(nhc->nhc_exceptions,
					 lockdep_is_held(&fnhe_lock));
	hash += hval;

	fnhe_p = &hash->chain;
	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
	while (fnhe) {
		if (fnhe->fnhe_daddr == daddr) {
			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
			/* set fnhe_daddr to 0 to ensure it won't bind with
			 * new dsts in rt_bind_exception().
			 */
			fnhe->fnhe_daddr = 0;
			fnhe_flush_routes(fnhe);
			kfree_rcu(fnhe, rcu);
			break;
		}
		fnhe_p = &fnhe->fnhe_next;
		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
						 lockdep_is_held(&fnhe_lock));
	}

	spin_unlock_bh(&fnhe_lock);
}

static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
					       __be32 daddr)
{
	struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
	struct fib_nh_exception *fnhe;
	u32 hval;

	if (!hash)
		return NULL;

	hval = fnhe_hashfun(daddr);

	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr) {
			if (fnhe->fnhe_expires &&
			    time_after(jiffies, fnhe->fnhe_expires)) {
				ip_del_fnhe(nhc, daddr);
				break;
			}
			return fnhe;
		}
	}
	return NULL;
}

/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 */

u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
{
	struct fib_nh_common *nhc = res->nhc;
	struct net_device *dev = nhc->nhc_dev;
	struct fib_info *fi = res->fi;
	u32 mtu = 0;

	if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
		mtu = fi->fib_mtu;

	if (likely(!mtu)) {
		struct fib_nh_exception *fnhe;

		fnhe = find_exception(nhc, daddr);
		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
			mtu = fnhe->fnhe_pmtu;
	}

	if (likely(!mtu))
		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);

	return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
}

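/* Editorial note, not in the original source: in practice this means a FIB
 * metric such as "mtu lock 1400" always wins; otherwise an unexpired
 * nexthop exception (say a 1280-byte PMTU learned from ICMP) is used; only
 * then does the egress device MTU apply.
 */
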
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
			      __be32 daddr, const bool do_cache)
{
	bool ret = false;

	spin_lock_bh(&fnhe_lock);

	if (daddr == fnhe->fnhe_daddr) {
		struct rtable __rcu **porig;
		struct rtable *orig;
		int genid = fnhe_genid(dev_net(rt->dst.dev));

		if (rt_is_input_route(rt))
			porig = &fnhe->fnhe_rth_input;
		else
			porig = &fnhe->fnhe_rth_output;
		orig = rcu_dereference(*porig);

		if (fnhe->fnhe_genid != genid) {
			fnhe->fnhe_genid = genid;
			fnhe->fnhe_gw = 0;
			fnhe->fnhe_pmtu = 0;
			fnhe->fnhe_expires = 0;
			fnhe->fnhe_mtu_locked = false;
			fnhe_flush_routes(fnhe);
			orig = NULL;
		}
		fill_route_from_fnhe(rt, fnhe);
		if (!rt->rt_gw4) {
			rt->rt_gw4 = daddr;
			rt->rt_gw_family = AF_INET;
		}

		if (do_cache) {
			dst_hold(&rt->dst);
			rcu_assign_pointer(*porig, rt);
			if (orig) {
				dst_dev_put(&orig->dst);
				dst_release(&orig->dst);
			}
			ret = true;
		}

		fnhe->fnhe_stamp = jiffies;
	}
	spin_unlock_bh(&fnhe_lock);

	return ret;
}

static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
{
	struct rtable *orig, *prev, **p;
	bool ret = true;

	if (rt_is_input_route(rt)) {
		p = (struct rtable **)&nhc->nhc_rth_input;
	} else {
		p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
	}
	orig = *p;

	/* hold dst before doing cmpxchg() to avoid race condition
	 * on this dst
	 */
	dst_hold(&rt->dst);
	prev = cmpxchg(p, orig, rt);
	if (prev == orig) {
		if (orig) {
			rt_add_uncached_list(orig);
			dst_release(&orig->dst);
		}
	} else {
		dst_release(&rt->dst);
		ret = false;
	}

	return ret;
}

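/* Editorial note, not in the original source: the cmpxchg() above publishes
 * rt only if the per-nexthop slot still holds the value read into orig, so
 * two CPUs racing to cache a route cannot leak the loser's dst: the loser
 * just drops its extra reference, while a displaced entry is parked on the
 * uncached list before being released.
 */
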
5055c371
ED
1497struct uncached_list {
1498 spinlock_t lock;
1499 struct list_head head;
1500};
1501
1502static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
caacf05e 1503
510c321b 1504void rt_add_uncached_list(struct rtable *rt)
caacf05e 1505{
5055c371
ED
1506 struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1507
1508 rt->rt_uncached_list = ul;
1509
1510 spin_lock_bh(&ul->lock);
1511 list_add_tail(&rt->rt_uncached, &ul->head);
1512 spin_unlock_bh(&ul->lock);
caacf05e
DM
1513}
1514
510c321b 1515void rt_del_uncached_list(struct rtable *rt)
caacf05e 1516{
78df76a0 1517 if (!list_empty(&rt->rt_uncached)) {
5055c371
ED
1518 struct uncached_list *ul = rt->rt_uncached_list;
1519
1520 spin_lock_bh(&ul->lock);
caacf05e 1521 list_del(&rt->rt_uncached);
5055c371 1522 spin_unlock_bh(&ul->lock);
caacf05e
DM
1523 }
1524}
1525
510c321b
XL
1526static void ipv4_dst_destroy(struct dst_entry *dst)
1527{
510c321b
XL
1528 struct rtable *rt = (struct rtable *)dst;
1529
1620a336 1530 ip_dst_metrics_put(dst);
510c321b
XL
1531 rt_del_uncached_list(rt);
1532}
1533
caacf05e
DM
1534void rt_flush_dev(struct net_device *dev)
1535{
5055c371
ED
1536 struct rtable *rt;
1537 int cpu;
1538
1539 for_each_possible_cpu(cpu) {
1540 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
caacf05e 1541
5055c371
ED
1542 spin_lock_bh(&ul->lock);
1543 list_for_each_entry(rt, &ul->head, rt_uncached) {
caacf05e
DM
1544 if (rt->dst.dev != dev)
1545 continue;
8d7017fd 1546 rt->dst.dev = blackhole_netdev;
caacf05e
DM
1547 dev_hold(rt->dst.dev);
1548 dev_put(dev);
1549 }
5055c371 1550 spin_unlock_bh(&ul->lock);
4895c771
DM
1551 }
1552}
1553
4331debc 1554static bool rt_cache_valid(const struct rtable *rt)
d2d68ba9 1555{
4331debc
ED
1556 return rt &&
1557 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1558 !rt_is_expired(rt);
d2d68ba9
DM
1559}
1560
f2bb4bed 1561static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
5e2b61f7 1562 const struct fib_result *res,
f2bb4bed 1563 struct fib_nh_exception *fnhe,
a4c2fd7f
WW
1564 struct fib_info *fi, u16 type, u32 itag,
1565 const bool do_cache)
1da177e4 1566{
caacf05e
DM
1567 bool cached = false;
1568
1da177e4 1569 if (fi) {
eba618ab 1570 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
4895c771 1571
0f5f7d7b 1572 if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
77d5bc7e 1573 rt->rt_uses_gateway = 1;
0f5f7d7b
DA
1574 rt->rt_gw_family = nhc->nhc_gw_family;
1575 /* only INET and INET6 are supported */
1576 if (likely(nhc->nhc_gw_family == AF_INET))
1577 rt->rt_gw4 = nhc->nhc_gw.ipv4;
1578 else
1579 rt->rt_gw6 = nhc->nhc_gw.ipv6;
155e8336 1580 }
0f5f7d7b 1581
e1255ed4
DA
1582 ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1583
c7066f70 1584#ifdef CONFIG_IP_ROUTE_CLASSID
dcb1ecb5 1585 if (nhc->nhc_family == AF_INET) {
87063a1f
DA
1586 struct fib_nh *nh;
1587
1588 nh = container_of(nhc, struct fib_nh, nh_common);
1589 rt->dst.tclassid = nh->nh_tclassid;
1590 }
1da177e4 1591#endif
87063a1f 1592 rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
c5038a83 1593 if (unlikely(fnhe))
a4c2fd7f
WW
1594 cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1595 else if (do_cache)
87063a1f 1596 cached = rt_cache_route(nhc, rt);
155e8336
JA
1597 if (unlikely(!cached)) {
1598 /* Routes we intend to cache in nexthop exception or
1599 * FIB nexthop have the DST_NOCACHE bit clear.
1600 * However, if we are unsuccessful at storing this
1601 * route into the cache we really need to set it.
1602 */
1550c171
DA
1603 if (!rt->rt_gw4) {
1604 rt->rt_gw_family = AF_INET;
1605 rt->rt_gw4 = daddr;
1606 }
155e8336
JA
1607 rt_add_uncached_list(rt);
1608 }
1609 } else
caacf05e 1610 rt_add_uncached_list(rt);
defb3519 1611
c7066f70 1612#ifdef CONFIG_IP_ROUTE_CLASSID
1da177e4 1613#ifdef CONFIG_IP_MULTIPLE_TABLES
85b91b03 1614 set_class_tag(rt, res->tclassid);
1da177e4
LT
1615#endif
1616 set_class_tag(rt, itag);
1617#endif
1da177e4
LT
1618}
1619
9ab179d8
DA
1620struct rtable *rt_dst_alloc(struct net_device *dev,
1621 unsigned int flags, u16 type,
af13b3c3 1622 bool nopolicy, bool noxfrm)
0c4dcd58 1623{
d08c4f35
DA
1624 struct rtable *rt;
1625
1626 rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
d08c4f35 1627 (nopolicy ? DST_NOPOLICY : 0) |
b2a9c0ed 1628 (noxfrm ? DST_NOXFRM : 0));
d08c4f35
DA
1629
1630 if (rt) {
1631 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1632 rt->rt_flags = flags;
1633 rt->rt_type = type;
1634 rt->rt_is_input = 0;
1635 rt->rt_iif = 0;
1636 rt->rt_pmtu = 0;
d52e5a7e 1637 rt->rt_mtu_locked = 0;
77d5bc7e 1638 rt->rt_uses_gateway = 0;
1550c171
DA
1639 rt->rt_gw_family = 0;
1640 rt->rt_gw4 = 0;
d08c4f35
DA
1641 INIT_LIST_HEAD(&rt->rt_uncached);
1642
1643 rt->dst.output = ip_output;
1644 if (flags & RTCF_LOCAL)
1645 rt->dst.input = ip_local_deliver;
1646 }
1647
1648 return rt;
0c4dcd58 1649}
9ab179d8 1650EXPORT_SYMBOL(rt_dst_alloc);
0c4dcd58 1651
5b18f128
SS
1652struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1653{
1654 struct rtable *new_rt;
1655
1656 new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1657 rt->dst.flags);
1658
1659 if (new_rt) {
1660 new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1661 new_rt->rt_flags = rt->rt_flags;
1662 new_rt->rt_type = rt->rt_type;
1663 new_rt->rt_is_input = rt->rt_is_input;
1664 new_rt->rt_iif = rt->rt_iif;
1665 new_rt->rt_pmtu = rt->rt_pmtu;
1666 new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1667 new_rt->rt_gw_family = rt->rt_gw_family;
1668 if (rt->rt_gw_family == AF_INET)
1669 new_rt->rt_gw4 = rt->rt_gw4;
1670 else if (rt->rt_gw_family == AF_INET6)
1671 new_rt->rt_gw6 = rt->rt_gw6;
1672 INIT_LIST_HEAD(&new_rt->rt_uncached);
1673
5b18f128
SS
1674 new_rt->dst.input = rt->dst.input;
1675 new_rt->dst.output = rt->dst.output;
1676 new_rt->dst.error = rt->dst.error;
1677 new_rt->dst.lastuse = jiffies;
1678 new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1679 }
1680 return new_rt;
1681}
1682EXPORT_SYMBOL(rt_dst_clone);
1683
96d36220 1684/* called in rcu_read_lock() section */
bc044e8d
PA
1685int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1686 u8 tos, struct net_device *dev,
1687 struct in_device *in_dev, u32 *itag)
1da177e4 1688{
b5f7e755 1689 int err;
1da177e4
LT
1690
1691 /* Primary sanity checks. */
51456b29 1692 if (!in_dev)
1da177e4
LT
1693 return -EINVAL;
1694
1e637c74 1695 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
d0daebc3 1696 skb->protocol != htons(ETH_P_IP))
bc044e8d 1697 return -EINVAL;
1da177e4 1698
75fea73d 1699 if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
bc044e8d 1700 return -EINVAL;
d0daebc3 1701
f97c1e0c 1702 if (ipv4_is_zeronet(saddr)) {
1d2f4ebb
EC
1703 if (!ipv4_is_local_multicast(daddr) &&
1704 ip_hdr(skb)->protocol != IPPROTO_IGMP)
bc044e8d 1705 return -EINVAL;
b5f7e755 1706 } else {
9e56e380 1707 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
bc044e8d 1708 in_dev, itag);
b5f7e755 1709 if (err < 0)
bc044e8d 1710 return err;
b5f7e755 1711 }
bc044e8d
PA
1712 return 0;
1713}
1714
1715/* called in rcu_read_lock() section */
1716static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1717 u8 tos, struct net_device *dev, int our)
1718{
1719 struct in_device *in_dev = __in_dev_get_rcu(dev);
1720 unsigned int flags = RTCF_MULTICAST;
1721 struct rtable *rth;
1722 u32 itag = 0;
1723 int err;
1724
1725 err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1726 if (err)
1727 return err;
1728
d08c4f35
DA
1729 if (our)
1730 flags |= RTCF_LOCAL;
1731
1732 rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
af13b3c3 1733 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
1da177e4 1734 if (!rth)
bc044e8d 1735 return -ENOBUFS;
1da177e4 1736
cf911662
DM
1737#ifdef CONFIG_IP_ROUTE_CLASSID
1738 rth->dst.tclassid = itag;
1739#endif
d8d1f30b 1740 rth->dst.output = ip_rt_bug;
9917e1e8 1741 rth->rt_is_input= 1;
1da177e4
LT
1742
1743#ifdef CONFIG_IP_MROUTE
f97c1e0c 1744 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
d8d1f30b 1745 rth->dst.input = ip_mr_input;
1da177e4
LT
1746#endif
1747 RT_CACHE_STAT_INC(in_slow_mc);
1748
89aef892
DM
1749 skb_dst_set(skb, &rth->dst);
1750 return 0;
1751}
1752
1753
1754static void ip_handle_martian_source(struct net_device *dev,
1755 struct in_device *in_dev,
1756 struct sk_buff *skb,
1757 __be32 daddr,
1758 __be32 saddr)
1759{
1760 RT_CACHE_STAT_INC(in_martian_src);
1761#ifdef CONFIG_IP_ROUTE_VERBOSE
1762 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1763 /*
1764		 * RFC 1812 recommendation: if the source is martian,
1765		 * the only hint is the MAC header.
1766 */
058bd4d2 1767 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
673d57e7 1768 &daddr, &saddr, dev->name);
98e399f8 1769 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1770 print_hex_dump(KERN_WARNING, "ll header: ",
1771 DUMP_PREFIX_OFFSET, 16, 1,
1772 skb_mac_header(skb),
b2c85100 1773 dev->hard_header_len, false);
1774 }
1775 }
1776#endif
1777}
1778
47360228 1779/* called in rcu_read_lock() section */
5969f71d 1780static int __mkroute_input(struct sk_buff *skb,
982721f3 1781 const struct fib_result *res,
5969f71d 1782 struct in_device *in_dev,
c6cffba4 1783 __be32 daddr, __be32 saddr, u32 tos)
1da177e4 1784{
1785 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1786 struct net_device *dev = nhc->nhc_dev;
2ffae99d 1787 struct fib_nh_exception *fnhe;
1788 struct rtable *rth;
1789 int err;
1790 struct in_device *out_dev;
d2d68ba9 1791 bool do_cache;
fbdc0ad0 1792 u32 itag = 0;
1793
1794 /* get a working reference to the output device */
eba618ab 1795 out_dev = __in_dev_get_rcu(dev);
51456b29 1796 if (!out_dev) {
e87cc472 1797 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1798 return -EINVAL;
1799 }
1800
5c04c819 1801 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
9e56e380 1802 in_dev->dev, in_dev, &itag);
1da177e4 1803 if (err < 0) {
e905a9ed 1804 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1da177e4 1805 saddr);
e905a9ed 1806
1807 goto cleanup;
1808 }
1809
1810 do_cache = res->fi && !itag;
1811 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
eba618ab 1812 skb->protocol == htons(ETH_P_IP)) {
bdf00467 1813 __be32 gw;
eba618ab 1814
bdf00467 1815 gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1816 if (IN_DEV_SHARED_MEDIA(out_dev) ||
1817 inet_addr_onlink(out_dev, saddr, gw))
1818 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1819 }
1820
1821 if (skb->protocol != htons(ETH_P_IP)) {
1822		/* Not IP (i.e. ARP). Do not create a route if it is
1823		 * invalid for proxy arp. DNAT routes are always valid.
1824		 *
1825		 * The proxy arp feature has been extended to allow ARP
1826		 * replies back to the same interface, to support
1827		 * Private VLAN switch technologies. See arp.c.
1828		 */
1829 if (out_dev == in_dev &&
1830 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1831 err = -EINVAL;
1832 goto cleanup;
1833 }
1834 }
1835
a5995e71 1836 fnhe = find_exception(nhc, daddr);
e81da0e1 1837 if (do_cache) {
94720e3a 1838 if (fnhe)
2ffae99d 1839 rth = rcu_dereference(fnhe->fnhe_rth_input);
94720e3a 1840 else
0f457a36 1841 rth = rcu_dereference(nhc->nhc_rth_input);
1842 if (rt_cache_valid(rth)) {
1843 skb_dst_set_noref(skb, &rth->dst);
1844 goto out;
1845 }
1846 }
f2bb4bed 1847
d08c4f35 1848 rth = rt_dst_alloc(out_dev->dev, 0, res->type,
5c1e6aa3 1849 IN_DEV_CONF_GET(in_dev, NOPOLICY),
af13b3c3 1850 IN_DEV_CONF_GET(out_dev, NOXFRM));
1851 if (!rth) {
1852 err = -ENOBUFS;
1853 goto cleanup;
1854 }
1855
9917e1e8 1856 rth->rt_is_input = 1;
a6254864 1857 RT_CACHE_STAT_INC(in_slow_tot);
1da177e4 1858
d8d1f30b 1859 rth->dst.input = ip_forward;
1da177e4 1860
1861 rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1862 do_cache);
9942895b 1863 lwtunnel_set_redirect(&rth->dst);
c6cffba4 1864 skb_dst_set(skb, &rth->dst);
d2d68ba9 1865out:
1866 err = 0;
1867 cleanup:
1da177e4 1868 return err;
e905a9ed 1869}
1da177e4 1870
79a13159 1871#ifdef CONFIG_IP_ROUTE_MULTIPATH
79a13159 1872/* To make ICMP packets follow the right flow, the multipath hash is
bf4e0a3d 1873 * calculated from the inner IP addresses.
79a13159 1874 */
1875static void ip_multipath_l3_keys(const struct sk_buff *skb,
1876 struct flow_keys *hash_keys)
1877{
1878 const struct iphdr *outer_iph = ip_hdr(skb);
6f74b6c2 1879 const struct iphdr *key_iph = outer_iph;
bf4e0a3d 1880 const struct iphdr *inner_iph;
1881 const struct icmphdr *icmph;
1882 struct iphdr _inner_iph;
1883 struct icmphdr _icmph;
1884
bf4e0a3d 1885 if (likely(outer_iph->protocol != IPPROTO_ICMP))
6f74b6c2 1886 goto out;
1887
1888 if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
6f74b6c2 1889 goto out;
1890
1891 icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1892 &_icmph);
1893 if (!icmph)
6f74b6c2 1894 goto out;
79a13159 1895
54074f1d 1896 if (!icmp_is_err(icmph->type))
6f74b6c2 1897 goto out;
1898
1899 inner_iph = skb_header_pointer(skb,
1900 outer_iph->ihl * 4 + sizeof(_icmph),
1901 sizeof(_inner_iph), &_inner_iph);
1902 if (!inner_iph)
1903 goto out;
1904
1905 key_iph = inner_iph;
1906out:
1907 hash_keys->addrs.v4addrs.src = key_iph->saddr;
1908 hash_keys->addrs.v4addrs.dst = key_iph->daddr;
bf4e0a3d 1909}
79a13159 1910
bf4e0a3d 1911/* if skb is set it will be used and fl4 can be NULL */
7efc0b6b 1912int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
e37b1e97 1913 const struct sk_buff *skb, struct flow_keys *flkeys)
bf4e0a3d 1914{
2a8e4997 1915 u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1916 struct flow_keys hash_keys;
1917 u32 mhash;
79a13159 1918
1919 switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1920 case 0:
1921 memset(&hash_keys, 0, sizeof(hash_keys));
1922 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1923 if (skb) {
1924 ip_multipath_l3_keys(skb, &hash_keys);
1925 } else {
1926 hash_keys.addrs.v4addrs.src = fl4->saddr;
1927 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1928 }
1929 break;
1930 case 1:
1931 /* skb is currently provided only when forwarding */
1932 if (skb) {
1933 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1934 struct flow_keys keys;
1935
1936 /* short-circuit if we already have L4 hash present */
1937 if (skb->l4_hash)
1938 return skb_get_hash_raw(skb) >> 1;
ec7127a5 1939
bf4e0a3d 1940 memset(&hash_keys, 0, sizeof(hash_keys));
1fe4b118 1941
ec7127a5 1942 if (!flkeys) {
e37b1e97 1943 skb_flow_dissect_flow_keys(skb, &keys, flag);
ec7127a5 1944 flkeys = &keys;
e37b1e97 1945 }
1946
1947 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1948 hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1949 hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1950 hash_keys.ports.src = flkeys->ports.src;
1951 hash_keys.ports.dst = flkeys->ports.dst;
1952 hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1953 } else {
1954 memset(&hash_keys, 0, sizeof(hash_keys));
1955 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1956 hash_keys.addrs.v4addrs.src = fl4->saddr;
1957 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1958 hash_keys.ports.src = fl4->fl4_sport;
1959 hash_keys.ports.dst = fl4->fl4_dport;
1960 hash_keys.basic.ip_proto = fl4->flowi4_proto;
1961 }
1962 break;
1963 case 2:
1964 memset(&hash_keys, 0, sizeof(hash_keys));
1965 /* skb is currently provided only when forwarding */
1966 if (skb) {
1967 struct flow_keys keys;
1968
1969 skb_flow_dissect_flow_keys(skb, &keys, 0);
1970 /* Inner can be v4 or v6 */
1971 if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
1972 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1973 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1974 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1975 } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
1976 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1977 hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
1978 hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
1979 hash_keys.tags.flow_label = keys.tags.flow_label;
1980 hash_keys.basic.ip_proto = keys.basic.ip_proto;
1981 } else {
1982 /* Same as case 0 */
1983 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1984 ip_multipath_l3_keys(skb, &hash_keys);
1985 }
1986 } else {
1987 /* Same as case 0 */
828b2b44 1988 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1989 hash_keys.addrs.v4addrs.src = fl4->saddr;
1990 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1991 }
1992 break;
1993 }
1994 mhash = flow_hash_from_keys(&hash_keys);
79a13159 1995
24ba1440 1996 if (multipath_hash)
1997 mhash = jhash_2words(mhash, multipath_hash, 0);
1998
1999 return mhash >> 1;
2000}
2001#endif /* CONFIG_IP_ROUTE_MULTIPATH */
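/* Editorial note, not from the original source: the switch above is keyed
 * by the net.ipv4.fib_multipath_hash_policy sysctl, whose cases in this
 * file are:
 *   0 - L3 addresses (ICMP errors are hashed on the inner IP header)
 *   1 - L4 five-tuple (short-circuits to skb->hash when an l4 hash exists)
 *   2 - inner L3 for encapsulated packets, otherwise outer L3
 * e.g. (illustrative):  sysctl -w net.ipv4.fib_multipath_hash_policy=1
 */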
2002
2003static int ip_mkroute_input(struct sk_buff *skb,
2004 struct fib_result *res,
5969f71d 2005 struct in_device *in_dev,
2006 __be32 daddr, __be32 saddr, u32 tos,
2007 struct flow_keys *hkeys)
1da177e4 2008{
1da177e4 2009#ifdef CONFIG_IP_ROUTE_MULTIPATH
5481d73f 2010 if (res->fi && fib_info_num_path(res->fi) > 1) {
7efc0b6b 2011 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
0e884c78 2012
2013 fib_select_multipath(res, h);
2014 }
2015#endif
2016
2017 /* create a routing cache entry */
c6cffba4 2018 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2019}
2020
2021/* Implements all the saddr-related checks as ip_route_input_slow(),
2022 * assuming daddr is valid and the destination is not a local broadcast one.
2023 * Uses the provided hint instead of performing a route lookup.
2024 */
2025int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2026 u8 tos, struct net_device *dev,
2027 const struct sk_buff *hint)
2028{
2029 struct in_device *in_dev = __in_dev_get_rcu(dev);
2030 struct rtable *rt = (struct rtable *)hint;
2031 struct net *net = dev_net(dev);
2032 int err = -EINVAL;
2033 u32 tag = 0;
2034
2035 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2036 goto martian_source;
2037
2038 if (ipv4_is_zeronet(saddr))
2039 goto martian_source;
2040
2041 if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2042 goto martian_source;
2043
2044 if (rt->rt_type != RTN_LOCAL)
2045 goto skip_validate_source;
2046
2047 tos &= IPTOS_RT_MASK;
2048 err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2049 if (err < 0)
2050 goto martian_source;
2051
2052skip_validate_source:
2053 skb_dst_copy(skb, hint);
2054 return 0;
2055
2056martian_source:
2057 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2058 return err;
2059}
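/* Editorial note, not from the original source: the "hint" above is the dst
 * of a previously routed skb. Listified receive (ip_list_rcv_finish() in
 * ip_input.c) uses it so a burst of packets to the same daddr performs the
 * full lookup once and only re-runs the source-address checks per packet.
 */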
2060
2061/*
2062 * NOTE. We drop all packets that have local source
2063 * addresses, because every properly looped-back packet
2064 * must have the correct destination already attached by the output routine.
2065 * Changes in the enforced policies must also be applied to
2066 * ip_route_use_hint().
2067 *
2068 * Such an approach solves two big problems:
2069 * 1. Non-simplex devices are handled properly.
2070 * 2. IP spoofing attempts are filtered with a 100% guarantee.
ebc0ffae 2071 * called with rcu_read_lock()
2072 */
2073
9e12bb22 2074static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2075 u8 tos, struct net_device *dev,
2076 struct fib_result *res)
1da177e4 2077{
96d36220 2078 struct in_device *in_dev = __in_dev_get_rcu(dev);
2079 struct flow_keys *flkeys = NULL, _flkeys;
2080 struct net *net = dev_net(dev);
1b7179d3 2081 struct ip_tunnel_info *tun_info;
e37b1e97 2082 int err = -EINVAL;
95c96174 2083 unsigned int flags = 0;
1da177e4 2084 u32 itag = 0;
95c96174 2085 struct rtable *rth;
e37b1e97 2086 struct flowi4 fl4;
0a90478b 2087 bool do_cache = true;
2088
2089 /* IP on this device is disabled. */
2090
2091 if (!in_dev)
2092 goto out;
2093
2094	/* Check for the weirdest martians, which cannot be detected
2095 by fib_lookup.
2096 */
2097
61adedf3 2098 tun_info = skb_tunnel_info(skb);
46fa062a 2099 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2100 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2101 else
2102 fl4.flowi4_tun_key.tun_id = 0;
2103 skb_dst_drop(skb);
2104
d0daebc3 2105 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2106 goto martian_source;
2107
2108 res->fi = NULL;
2109 res->table = NULL;
27a954bd 2110 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2111 goto brd_input;
2112
2113 /* Accept zero addresses only to limited broadcast;
2114	 * I do not even know whether to fix it or not. Waiting for complaints :-)
2115 */
f97c1e0c 2116 if (ipv4_is_zeronet(saddr))
2117 goto martian_source;
2118
d0daebc3 2119 if (ipv4_is_zeronet(daddr))
2120 goto martian_destination;
2121
2122	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2123	 * and calls it at most once when daddr and/or saddr are loopback addresses
2124 */
2125 if (ipv4_is_loopback(daddr)) {
2126 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
d0daebc3 2127 goto martian_destination;
2128 } else if (ipv4_is_loopback(saddr)) {
2129 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2130 goto martian_source;
2131 }
2132
2133 /*
2134 * Now we are ready to route packet.
2135 */
68a5e3dd 2136 fl4.flowi4_oif = 0;
e0d56fdd 2137 fl4.flowi4_iif = dev->ifindex;
2138 fl4.flowi4_mark = skb->mark;
2139 fl4.flowi4_tos = tos;
2140 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
b84f7878 2141 fl4.flowi4_flags = 0;
2142 fl4.daddr = daddr;
2143 fl4.saddr = saddr;
8bcfd092 2144 fl4.flowi4_uid = sock_net_uid(net, NULL);
e37b1e97 2145
5a847a6e 2146 if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
e37b1e97 2147 flkeys = &_flkeys;
2148 } else {
2149 fl4.flowi4_proto = 0;
2150 fl4.fl4_sport = 0;
2151 fl4.fl4_dport = 0;
2152 }
e37b1e97 2153
5510cdf7 2154 err = fib_lookup(net, &fl4, res, 0);
2155 if (err != 0) {
2156 if (!IN_DEV_FORWARD(in_dev))
2157 err = -EHOSTUNREACH;
1da177e4 2158 goto no_route;
cd0f0b95 2159 }
1da177e4 2160
2161 if (res->type == RTN_BROADCAST) {
2162 if (IN_DEV_BFORWARD(in_dev))
2163 goto make_route;
2164 /* not do cache if bc_forwarding is enabled */
2165 if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2166 do_cache = false;
1da177e4 2167 goto brd_input;
5cbf777c 2168 }
1da177e4 2169
5510cdf7 2170 if (res->type == RTN_LOCAL) {
5c04c819 2171 err = fib_validate_source(skb, saddr, daddr, tos,
0d5edc68 2172 0, dev, in_dev, &itag);
b5f7e755 2173 if (err < 0)
0d753960 2174 goto martian_source;
2175 goto local_input;
2176 }
2177
2178 if (!IN_DEV_FORWARD(in_dev)) {
2179 err = -EHOSTUNREACH;
251da413 2180 goto no_route;
cd0f0b95 2181 }
5510cdf7 2182 if (res->type != RTN_UNICAST)
2183 goto martian_destination;
2184
5cbf777c 2185make_route:
e37b1e97 2186 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2187out: return err;
2188
2189brd_input:
2190 if (skb->protocol != htons(ETH_P_IP))
2191 goto e_inval;
2192
41347dcd 2193 if (!ipv4_is_zeronet(saddr)) {
2194 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2195 in_dev, &itag);
1da177e4 2196 if (err < 0)
0d753960 2197 goto martian_source;
2198 }
2199 flags |= RTCF_BROADCAST;
5510cdf7 2200 res->type = RTN_BROADCAST;
2201 RT_CACHE_STAT_INC(in_brd);
2202
2203local_input:
2204 do_cache &= res->fi && !itag;
2205 if (do_cache) {
2206 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
eba618ab 2207
2208 rth = rcu_dereference(nhc->nhc_rth_input);
2209 if (rt_cache_valid(rth)) {
2210 skb_dst_set_noref(skb, &rth->dst);
2211 err = 0;
2212 goto out;
2213 }
2214 }
2215
f5a0aab8 2216 rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
5510cdf7 2217 flags | RTCF_LOCAL, res->type,
af13b3c3 2218 IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
2219 if (!rth)
2220 goto e_nobufs;
2221
d8d1f30b 2222	rth->dst.output = ip_rt_bug;
2223#ifdef CONFIG_IP_ROUTE_CLASSID
2224 rth->dst.tclassid = itag;
2225#endif
9917e1e8 2226 rth->rt_is_input = 1;
571e7226 2227
a6254864 2228 RT_CACHE_STAT_INC(in_slow_tot);
5510cdf7 2229 if (res->type == RTN_UNREACHABLE) {
2230		rth->dst.input = ip_error;
2231		rth->dst.error = -err;
2232 rth->rt_flags &= ~RTCF_LOCAL;
2233 }
efd85700 2234
dcdfdf56 2235 if (do_cache) {
eba618ab 2236 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
efd85700 2237
eba618ab 2238 rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2239 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2240 WARN_ON(rth->dst.input == lwtunnel_input);
2241 rth->dst.lwtstate->orig_input = rth->dst.input;
2242 rth->dst.input = lwtunnel_input;
2243 }
2244
87063a1f 2245 if (unlikely(!rt_cache_route(nhc, rth)))
dcdfdf56 2246 rt_add_uncached_list(rth);
dcdfdf56 2247 }
89aef892 2248 skb_dst_set(skb, &rth->dst);
b23dd4fe 2249 err = 0;
ebc0ffae 2250 goto out;
2251
2252no_route:
2253 RT_CACHE_STAT_INC(in_no_route);
2254 res->type = RTN_UNREACHABLE;
2255 res->fi = NULL;
2256 res->table = NULL;
2257 goto local_input;
2258
2259 /*
2260 * Do not cache martian addresses: they should be logged (RFC1812)
2261 */
2262martian_destination:
2263 RT_CACHE_STAT_INC(in_martian_dst);
2264#ifdef CONFIG_IP_ROUTE_VERBOSE
2265 if (IN_DEV_LOG_MARTIANS(in_dev))
2266 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2267 &daddr, &saddr, dev->name);
1da177e4 2268#endif
2c2910a4 2269
2270e_inval:
2271 err = -EINVAL;
ebc0ffae 2272 goto out;
2273
2274e_nobufs:
2275 err = -ENOBUFS;
ebc0ffae 2276 goto out;
2277
2278martian_source:
2279 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
ebc0ffae 2280 goto out;
2281}
2282
2283int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2284 u8 tos, struct net_device *dev)
1da177e4 2285{
2286 struct fib_result res;
2287 int err;
1da177e4 2288
6e28099d 2289 tos &= IPTOS_RT_MASK;
96d36220 2290 rcu_read_lock();
2291 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2292 rcu_read_unlock();
96d36220 2293
2294 return err;
2295}
2296EXPORT_SYMBOL(ip_route_input_noref);
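/* Editorial note, not from the original source: a minimal caller sketch for
 * ip_route_input_noref(), modelled on ip_rcv_finish_core() in ip_input.c:
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				       iph->tos, skb->dev);
 *	if (err)
 *		goto drop;
 */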
2297
2298/* called with rcu_read_lock held */
2299int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2300 u8 tos, struct net_device *dev, struct fib_result *res)
2301{
2302	/* Multicast recognition logic was moved from the route cache to here.
2303	   The problem was that too many Ethernet cards have broken/missing
2304	   hardware multicast filters :-( As a result, a host on a multicast
2305	   network acquires a lot of useless route cache entries, sort of
2306	   SDR messages from all the world. Now we try to get rid of them.
2307	   Really, provided the software IP multicast filter is organized
2308	   reasonably (at least, hashed), it does not result in a slowdown
2309	   compared with route cache reject entries.
2310	   Note that multicast routers are not affected, because a
2311	   route cache entry is created eventually.
2312	 */
f97c1e0c 2313 if (ipv4_is_multicast(daddr)) {
96d36220 2314 struct in_device *in_dev = __in_dev_get_rcu(dev);
e58e4159 2315 int our = 0;
5510cdf7 2316 int err = -EINVAL;
1da177e4 2317
2318 if (!in_dev)
2319 return err;
2320 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2321 ip_hdr(skb)->protocol);
2322
2323 /* check l3 master if no match yet */
22c74764 2324 if (!our && netif_is_l3_slave(dev)) {
2325 struct in_device *l3_in_dev;
2326
2327 l3_in_dev = __in_dev_get_rcu(skb->dev);
2328 if (l3_in_dev)
2329 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2330 ip_hdr(skb)->protocol);
2331 }
2332
e58e4159 2333 if (our
1da177e4 2334#ifdef CONFIG_IP_MROUTE
2335 ||
2336 (!ipv4_is_local_multicast(daddr) &&
2337 IN_DEV_MFORWARD(in_dev))
1da177e4 2338#endif
e58e4159 2339 ) {
5510cdf7 2340 err = ip_route_input_mc(skb, daddr, saddr,
e58e4159 2341 tos, dev, our);
1da177e4 2342 }
5510cdf7 2343 return err;
1da177e4 2344 }
2345
2346 return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2347}
2348
ebc0ffae 2349/* called with rcu_read_lock() */
982721f3 2350static struct rtable *__mkroute_output(const struct fib_result *res,
1a00fee4 2351 const struct flowi4 *fl4, int orig_oif,
f61759e6 2352 struct net_device *dev_out,
5ada5527 2353 unsigned int flags)
1da177e4 2354{
982721f3 2355 struct fib_info *fi = res->fi;
f2bb4bed 2356 struct fib_nh_exception *fnhe;
5ada5527 2357 struct in_device *in_dev;
982721f3 2358 u16 type = res->type;
5ada5527 2359 struct rtable *rth;
c92b9655 2360 bool do_cache;
1da177e4 2361
2362 in_dev = __in_dev_get_rcu(dev_out);
2363 if (!in_dev)
5ada5527 2364 return ERR_PTR(-EINVAL);
1da177e4 2365
d0daebc3 2366 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2367 if (ipv4_is_loopback(fl4->saddr) &&
2368 !(dev_out->flags & IFF_LOOPBACK) &&
2369 !netif_is_l3_master(dev_out))
2370 return ERR_PTR(-EINVAL);
2371
68a5e3dd 2372 if (ipv4_is_lbcast(fl4->daddr))
982721f3 2373 type = RTN_BROADCAST;
68a5e3dd 2374 else if (ipv4_is_multicast(fl4->daddr))
982721f3 2375 type = RTN_MULTICAST;
68a5e3dd 2376 else if (ipv4_is_zeronet(fl4->daddr))
5ada5527 2377 return ERR_PTR(-EINVAL);
2378
2379 if (dev_out->flags & IFF_LOOPBACK)
2380 flags |= RTCF_LOCAL;
2381
63617421 2382 do_cache = true;
982721f3 2383 if (type == RTN_BROADCAST) {
1da177e4 2384 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2385 fi = NULL;
2386 } else if (type == RTN_MULTICAST) {
dd28d1a0 2387 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2388 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2389 fl4->flowi4_proto))
1da177e4 2390 flags &= ~RTCF_LOCAL;
2391 else
2392 do_cache = false;
1da177e4 2393		/* If a multicast route does not exist, use
2394		 * the default one, but do not gateway in this case.
2395		 * Yes, it is a hack.
1da177e4 2396		 */
2397 if (fi && res->prefixlen < 4)
2398 fi = NULL;
2399 } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2400 (orig_oif != dev_out->ifindex)) {
2401 /* For local routes that require a particular output interface
2402 * we do not want to cache the result. Caching the result
2403 * causes incorrect behaviour when there are multiple source
2404 * addresses on the interface, the end result being that if the
2405 * intended recipient is waiting on that interface for the
2406	 * packet, it won't receive it, because it will be delivered on
2407 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2408 * be set to the loopback interface as well.
2409 */
94720e3a 2410 do_cache = false;
2411 }
2412
f2bb4bed 2413 fnhe = NULL;
63617421 2414 do_cache &= fi != NULL;
94720e3a 2415 if (fi) {
eba618ab 2416 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
c5038a83 2417 struct rtable __rcu **prth;
d26b3a7c 2418
a5995e71 2419 fnhe = find_exception(nhc, fl4->daddr);
2420 if (!do_cache)
2421 goto add;
deed49df 2422 if (fnhe) {
2ffae99d 2423 prth = &fnhe->fnhe_rth_output;
2424 } else {
2425 if (unlikely(fl4->flowi4_flags &
2426 FLOWI_FLAG_KNOWN_NH &&
bdf00467 2427 !(nhc->nhc_gw_family &&
eba618ab 2428 nhc->nhc_scope == RT_SCOPE_LINK))) {
2429 do_cache = false;
2430 goto add;
c92b9655 2431 }
0f457a36 2432 prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
c92b9655 2433 }
c5038a83 2434 rth = rcu_dereference(*prth);
9df16efa 2435 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
c5038a83 2436 return rth;
f2bb4bed 2437 }
2438
2439add:
d08c4f35 2440 rth = rt_dst_alloc(dev_out, flags, type,
5c1e6aa3 2441 IN_DEV_CONF_GET(in_dev, NOPOLICY),
af13b3c3 2442 IN_DEV_CONF_GET(in_dev, NOXFRM));
8391d07b 2443 if (!rth)
5ada5527 2444 return ERR_PTR(-ENOBUFS);
8391d07b 2445
9438c871 2446 rth->rt_iif = orig_oif;
b7503e0c 2447
2448 RT_CACHE_STAT_INC(out_slow_tot);
2449
1da177e4 2450 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
e905a9ed 2451 if (flags & RTCF_LOCAL &&
1da177e4 2452 !(dev_out->flags & IFF_LOOPBACK)) {
d8d1f30b 2453 rth->dst.output = ip_mc_output;
2454 RT_CACHE_STAT_INC(out_slow_mc);
2455 }
2456#ifdef CONFIG_IP_MROUTE
982721f3 2457 if (type == RTN_MULTICAST) {
1da177e4 2458 if (IN_DEV_MFORWARD(in_dev) &&
813b3b5d 2459 !ipv4_is_local_multicast(fl4->daddr)) {
2460 rth->dst.input = ip_mr_input;
2461 rth->dst.output = ip_mc_output;
2462 }
2463 }
2464#endif
2465 }
2466
a4c2fd7f 2467 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
9942895b 2468 lwtunnel_set_redirect(&rth->dst);
1da177e4 2469
5ada5527 2470 return rth;
2471}
2472
2473/*
2474 * Major route resolver routine.
2475 */
2476
2477struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2478 const struct sk_buff *skb)
1da177e4 2479{
f61759e6 2480 __u8 tos = RT_FL_TOS(fl4);
2481 struct fib_result res = {
2482 .type = RTN_UNSPEC,
2483 .fi = NULL,
2484 .table = NULL,
2485 .tclassid = 0,
2486 };
5ada5527 2487 struct rtable *rth;
1da177e4 2488
1fb9489b 2489 fl4->flowi4_iif = LOOPBACK_IFINDEX;
2490 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2491 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2492 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
44713b67 2493
010c2708 2494 rcu_read_lock();
2495 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2496 rcu_read_unlock();
2497
2498 return rth;
2499}
2500EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
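/* Editorial note, not from the original source: __ip_route_output_key() in
 * include/net/route.h is a thin wrapper around the function above. A hedged
 * output-lookup sketch (net, daddr and oif are placeholders):
 *
 *	struct flowi4 fl4 = { .daddr = daddr, .flowi4_oif = oif };
 *	struct rtable *rt = ip_route_output_key(net, &fl4);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 */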
2501
2502struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2503 struct fib_result *res,
2504 const struct sk_buff *skb)
2505{
2506 struct net_device *dev_out = NULL;
2507 int orig_oif = fl4->flowi4_oif;
2508 unsigned int flags = 0;
2509 struct rtable *rth;
595e0651 2510 int err;
3abd1ade 2511
813b3b5d 2512 if (fl4->saddr) {
2513 if (ipv4_is_multicast(fl4->saddr) ||
2514 ipv4_is_lbcast(fl4->saddr) ||
2515 ipv4_is_zeronet(fl4->saddr)) {
2516 rth = ERR_PTR(-EINVAL);
1da177e4 2517 goto out;
2518 }
2519
2520 rth = ERR_PTR(-ENETUNREACH);
1da177e4 2521
2522		/* I removed the check for oif == dev_out->oif here.
2523		   It was wrong for two reasons:
2524		   1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2525		      is assigned to multiple interfaces.
2526		   2. Moreover, we are allowed to send packets with a saddr
2527		      of another iface. --ANK
2528		 */
2529
2530 if (fl4->flowi4_oif == 0 &&
2531 (ipv4_is_multicast(fl4->daddr) ||
2532 ipv4_is_lbcast(fl4->daddr))) {
a210d01a 2533 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 2534 dev_out = __ip_dev_find(net, fl4->saddr, false);
51456b29 2535 if (!dev_out)
2536 goto out;
2537
2538			/* Special hack: the user can direct multicasts
2539			   and limited broadcast via the necessary interface
2540			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2541			   This hack is not just for fun, it allows
2542			   vic, vat and friends to work.
2543			   They bind a socket to loopback, set ttl to zero
2544			   and expect that it will work.
2545			   From the viewpoint of the routing cache they are broken,
2546			   because we are not allowed to build a multicast path
2547			   with a loopback source addr (look, the routing cache
2548			   cannot know that ttl is zero, so the packet
2549			   will not leave this host and the route is valid).
2550			   Luckily, this hack is a good workaround.
2551			 */
2552
813b3b5d 2553 fl4->flowi4_oif = dev_out->ifindex;
2554 goto make_route;
2555 }
a210d01a 2556
813b3b5d 2557 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
a210d01a 2558 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 2559 if (!__ip_dev_find(net, fl4->saddr, false))
a210d01a 2560 goto out;
a210d01a 2561 }
2562 }
2563
2564
2565 if (fl4->flowi4_oif) {
2566 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
b23dd4fe 2567 rth = ERR_PTR(-ENODEV);
51456b29 2568 if (!dev_out)
1da177e4 2569 goto out;
2570
2571 /* RACE: Check return value of inet_select_addr instead. */
fc75fc83 2572 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
b23dd4fe 2573 rth = ERR_PTR(-ENETUNREACH);
2574 goto out;
2575 }
813b3b5d 2576 if (ipv4_is_local_multicast(fl4->daddr) ||
2577 ipv4_is_lbcast(fl4->daddr) ||
2578 fl4->flowi4_proto == IPPROTO_IGMP) {
2579 if (!fl4->saddr)
2580 fl4->saddr = inet_select_addr(dev_out, 0,
2581 RT_SCOPE_LINK);
2582 goto make_route;
2583 }
0a7e2260 2584 if (!fl4->saddr) {
2585 if (ipv4_is_multicast(fl4->daddr))
2586 fl4->saddr = inet_select_addr(dev_out, 0,
2587 fl4->flowi4_scope);
2588 else if (!fl4->daddr)
2589 fl4->saddr = inet_select_addr(dev_out, 0,
2590 RT_SCOPE_HOST);
2591 }
2592 }
2593
2594 if (!fl4->daddr) {
2595 fl4->daddr = fl4->saddr;
2596 if (!fl4->daddr)
2597 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
b40afd0e 2598 dev_out = net->loopback_dev;
1fb9489b 2599 fl4->flowi4_oif = LOOPBACK_IFINDEX;
3abd1ade 2600 res->type = RTN_LOCAL;
2601 flags |= RTCF_LOCAL;
2602 goto make_route;
2603 }
2604
3abd1ade 2605 err = fib_lookup(net, fl4, res, 0);
0315e382 2606 if (err) {
2607 res->fi = NULL;
2608 res->table = NULL;
6104e112 2609 if (fl4->flowi4_oif &&
2610 (ipv4_is_multicast(fl4->daddr) ||
2611 !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2612			/* Apparently, the routing tables are wrong. Assume
2613			   that the destination is on-link.
2614
2615			   WHY? DW.
2616			   Because we are allowed to send to an iface
2617			   even if it has NO routes and NO assigned
2618			   addresses. When oif is specified, routing
2619			   tables are looked up with only one purpose:
2620			   to catch if the destination is gatewayed rather than
2621			   direct. Moreover, if MSG_DONTROUTE is set,
2622			   we send the packet, ignoring both routing tables
2623			   and ifaddr state. --ANK
2624
2625
2626			   We could do this even if oif is unknown,
2627			   as IPv6 likely does, but we do not.
2628			 */
2629
2630 if (fl4->saddr == 0)
2631 fl4->saddr = inet_select_addr(dev_out, 0,
2632 RT_SCOPE_LINK);
3abd1ade 2633 res->type = RTN_UNICAST;
2634 goto make_route;
2635 }
0315e382 2636 rth = ERR_PTR(err);
2637 goto out;
2638 }
1da177e4 2639
3abd1ade 2640 if (res->type == RTN_LOCAL) {
813b3b5d 2641 if (!fl4->saddr) {
2642 if (res->fi->fib_prefsrc)
2643 fl4->saddr = res->fi->fib_prefsrc;
9fc3bbb4 2644 else
813b3b5d 2645 fl4->saddr = fl4->daddr;
9fc3bbb4 2646 }
2647
2648 /* L3 master device is the loopback for that domain */
3abd1ade 2649 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
b7c8487c 2650 net->loopback_dev;
2651
2652 /* make sure orig_oif points to fib result device even
2653 * though packet rx/tx happens over loopback or l3mdev
2654 */
2655 orig_oif = FIB_RES_OIF(*res);
2656
813b3b5d 2657 fl4->flowi4_oif = dev_out->ifindex;
2658 flags |= RTCF_LOCAL;
2659 goto make_route;
2660 }
2661
3abd1ade 2662 fib_select_path(net, res, fl4, skb);
1da177e4 2663
3abd1ade 2664 dev_out = FIB_RES_DEV(*res);
813b3b5d 2665 fl4->flowi4_oif = dev_out->ifindex;
2666
2667
2668make_route:
3abd1ade 2669 rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
1da177e4 2670
010c2708 2671out:
b23dd4fe 2672 return rth;
1da177e4 2673}
d8c97a94 2674
2675static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2676{
2677 return NULL;
2678}
2679
ebb762f2 2680static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
ec831ea7 2681{
2682 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2683
2684 return mtu ? : dst->dev->mtu;
2685}
2686
6700c270 2687static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2688 struct sk_buff *skb, u32 mtu,
2689 bool confirm_neigh)
2690{
2691}
2692
2693static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2694 struct sk_buff *skb)
2695{
2696}
2697
2698static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2699 unsigned long old)
2700{
2701 return NULL;
2702}
2703
2704static struct dst_ops ipv4_dst_blackhole_ops = {
2705 .family = AF_INET,
ae2688d5 2706 .check = ipv4_blackhole_dst_check,
ebb762f2 2707 .mtu = ipv4_blackhole_mtu,
214f45c9 2708 .default_advmss = ipv4_default_advmss,
14e50e57 2709 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
b587ee3b 2710 .redirect = ipv4_rt_blackhole_redirect,
0972ddb2 2711 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
d3aaeb38 2712 .neigh_lookup = ipv4_neigh_lookup,
2713};
2714
2774c131 2715struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
14e50e57 2716{
2774c131 2717 struct rtable *ort = (struct rtable *) dst_orig;
f5b0a874 2718 struct rtable *rt;
14e50e57 2719
6c0e7284 2720 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
14e50e57 2721 if (rt) {
d8d1f30b 2722 struct dst_entry *new = &rt->dst;
14e50e57 2723
14e50e57 2724 new->__use = 1;
352e512c 2725 new->input = dst_discard;
ede2059d 2726 new->output = dst_discard_out;
14e50e57 2727
1dbe3252 2728 new->dev = net->loopback_dev;
2729 if (new->dev)
2730 dev_hold(new->dev);
2731
9917e1e8 2732 rt->rt_is_input = ort->rt_is_input;
5e2b61f7 2733 rt->rt_iif = ort->rt_iif;
5943634f 2734 rt->rt_pmtu = ort->rt_pmtu;
d52e5a7e 2735 rt->rt_mtu_locked = ort->rt_mtu_locked;
14e50e57 2736
ca4c3fc2 2737 rt->rt_genid = rt_genid_ipv4(net);
2738 rt->rt_flags = ort->rt_flags;
2739 rt->rt_type = ort->rt_type;
77d5bc7e 2740 rt->rt_uses_gateway = ort->rt_uses_gateway;
2741 rt->rt_gw_family = ort->rt_gw_family;
2742 if (rt->rt_gw_family == AF_INET)
2743 rt->rt_gw4 = ort->rt_gw4;
2744 else if (rt->rt_gw_family == AF_INET6)
2745 rt->rt_gw6 = ort->rt_gw6;
14e50e57 2746
caacf05e 2747 INIT_LIST_HEAD(&rt->rt_uncached);
2748 }
2749
2750 dst_release(dst_orig);
2751
2752 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2753}
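/* Editorial note, not from the original source: a blackhole route keeps a
 * flow alive while silently discarding traffic (input and output are the
 * dst_discard*() stubs above); xfrm swaps it in while an IPsec state is
 * still being resolved, so packets are dropped rather than sent in clear.
 */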
2754
9d6ec938 2755struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
6f9c9615 2756 const struct sock *sk)
1da177e4 2757{
9d6ec938 2758 struct rtable *rt = __ip_route_output_key(net, flp4);
1da177e4 2759
2760 if (IS_ERR(rt))
2761 return rt;
1da177e4 2762
56157872 2763 if (flp4->flowi4_proto)
2764 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2765 flowi4_to_flowi(flp4),
2766 sk, 0);
1da177e4 2767
b23dd4fe 2768 return rt;
1da177e4 2769}
2770EXPORT_SYMBOL_GPL(ip_route_output_flow);
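/* Editorial note, not from the original source: this is the usual entry
 * point for locally generated, socket-attached traffic; when flowi4_proto
 * is set, the plain FIB result is additionally passed through
 * xfrm_lookup_route() so IPsec policy can transform or blackhole it.
 */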
2771
2772struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
2773 struct net_device *dev,
2774 struct net *net, __be32 *saddr,
2775 const struct ip_tunnel_info *info,
2776 u8 protocol, bool use_cache)
2777{
2778#ifdef CONFIG_DST_CACHE
2779 struct dst_cache *dst_cache;
2780#endif
2781 struct rtable *rt = NULL;
2782 struct flowi4 fl4;
2783 __u8 tos;
2784
2785#ifdef CONFIG_DST_CACHE
2786 dst_cache = (struct dst_cache *)&info->dst_cache;
2787 if (use_cache) {
2788 rt = dst_cache_get_ip4(dst_cache, saddr);
2789 if (rt)
2790 return rt;
2791 }
2792#endif
2793 memset(&fl4, 0, sizeof(fl4));
2794 fl4.flowi4_mark = skb->mark;
2795 fl4.flowi4_proto = protocol;
2796 fl4.daddr = info->key.u.ipv4.dst;
2797 fl4.saddr = info->key.u.ipv4.src;
2798 tos = info->key.tos;
2799 fl4.flowi4_tos = RT_TOS(tos);
2800
2801 rt = ip_route_output_key(net, &fl4);
2802 if (IS_ERR(rt)) {
2803 netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
2804 return ERR_PTR(-ENETUNREACH);
2805 }
2806 if (rt->dst.dev == dev) { /* is this necessary? */
2807 netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
2808 ip_rt_put(rt);
2809 return ERR_PTR(-ELOOP);
2810 }
2811#ifdef CONFIG_DST_CACHE
2812 if (use_cache)
2813 dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
2814#endif
2815 *saddr = fl4.saddr;
2816 return rt;
2817}
2818EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
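/* Editorial note, not from the original source: collect-metadata tunnel
 * drivers are the intended callers here; use_cache is typically the result
 * of ip_tunnel_dst_cache_usable(), so repeated transmissions to the same
 * tunnel endpoint can reuse info->dst_cache instead of a fresh FIB lookup.
 */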
2819
3765d35e 2820/* called with rcu_read_lock held */
2821static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2822 struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2823 struct sk_buff *skb, u32 portid, u32 seq,
2824 unsigned int flags)
1da177e4 2825{
1da177e4 2826 struct rtmsg *r;
be403ea1 2827 struct nlmsghdr *nlh;
2bc8ca40 2828 unsigned long expires = 0;
f185071d 2829 u32 error;
521f5490 2830 u32 metrics[RTAX_MAX];
be403ea1 2831
e93fb3e9 2832 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
51456b29 2833 if (!nlh)
26932566 2834 return -EMSGSIZE;
2835
2836 r = nlmsg_data(nlh);
2837 r->rtm_family = AF_INET;
2838 r->rtm_dst_len = 32;
2839 r->rtm_src_len = 0;
d948974c 2840 r->rtm_tos = fl4 ? fl4->flowi4_tos : 0;
8a430ed5 2841 r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
c36ba660 2842 if (nla_put_u32(skb, RTA_TABLE, table_id))
f3756b79 2843 goto nla_put_failure;
2844 r->rtm_type = rt->rt_type;
2845 r->rtm_scope = RT_SCOPE_UNIVERSE;
2846 r->rtm_protocol = RTPROT_UNSPEC;
2847 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2848 if (rt->rt_flags & RTCF_NOTIFY)
2849 r->rtm_flags |= RTM_F_NOTIFY;
2850 if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2851 r->rtm_flags |= RTCF_DOREDIRECT;
be403ea1 2852
930345ea 2853 if (nla_put_in_addr(skb, RTA_DST, dst))
f3756b79 2854 goto nla_put_failure;
1a00fee4 2855 if (src) {
1da177e4 2856 r->rtm_src_len = 32;
930345ea 2857 if (nla_put_in_addr(skb, RTA_SRC, src))
f3756b79 2858 goto nla_put_failure;
1da177e4 2859 }
2860 if (rt->dst.dev &&
2861 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2862 goto nla_put_failure;
c7066f70 2863#ifdef CONFIG_IP_ROUTE_CLASSID
2864 if (rt->dst.tclassid &&
2865 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2866 goto nla_put_failure;
1da177e4 2867#endif
d948974c 2868 if (fl4 && !rt_is_input_route(rt) &&
d6c0a4f6 2869 fl4->saddr != src) {
930345ea 2870 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2871 goto nla_put_failure;
2872 }
2873 if (rt->rt_uses_gateway) {
2874 if (rt->rt_gw_family == AF_INET &&
2875 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
0f5f7d7b 2876 goto nla_put_failure;
2877 } else if (rt->rt_gw_family == AF_INET6) {
2878 int alen = sizeof(struct in6_addr);
2879 struct nlattr *nla;
2880 struct rtvia *via;
2881
2882 nla = nla_reserve(skb, RTA_VIA, alen + 2);
2883 if (!nla)
2884 goto nla_put_failure;
2885
2886 via = nla_data(nla);
2887 via->rtvia_family = AF_INET6;
2888 memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2889 }
0f5f7d7b 2890 }
be403ea1 2891
2892 expires = rt->dst.expires;
2893 if (expires) {
2894 unsigned long now = jiffies;
2895
2896 if (time_before(now, expires))
2897 expires -= now;
2898 else
2899 expires = 0;
2900 }
2901
521f5490 2902 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
ee9a8f7a 2903 if (rt->rt_pmtu && expires)
521f5490 2904 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2905 if (rt->rt_mtu_locked && expires)
2906 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
521f5490 2907 if (rtnetlink_put_metrics(skb, metrics) < 0)
2908 goto nla_put_failure;
2909
2910 if (fl4) {
2911 if (fl4->flowi4_mark &&
2912 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2913 goto nla_put_failure;
622ec2c9 2914
2915 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2916 nla_put_u32(skb, RTA_UID,
2917 from_kuid_munged(current_user_ns(),
2918 fl4->flowi4_uid)))
2919 goto nla_put_failure;
be403ea1 2920
d948974c 2921 if (rt_is_input_route(rt)) {
8caaf7b6 2922#ifdef CONFIG_IP_MROUTE
2923 if (ipv4_is_multicast(dst) &&
2924 !ipv4_is_local_multicast(dst) &&
2925 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2926 int err = ipmr_get_route(net, skb,
2927 fl4->saddr, fl4->daddr,
2928 r, portid);
2929
2930 if (err <= 0) {
2931 if (err == 0)
2932 return 0;
2933 goto nla_put_failure;
2934 }
2935 } else
8caaf7b6 2936#endif
2937 if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2938 goto nla_put_failure;
2939 }
2940 }
2941
2942 error = rt->dst.error;
2943
f185071d 2944 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
e3703b3d 2945 goto nla_put_failure;
be403ea1 2946
2947 nlmsg_end(skb, nlh);
2948 return 0;
1da177e4 2949
be403ea1 2950nla_put_failure:
2951 nlmsg_cancel(skb, nlh);
2952 return -EMSGSIZE;
2953}
2954
2955static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
2956 struct netlink_callback *cb, u32 table_id,
2957 struct fnhe_hash_bucket *bucket, int genid,
e93fb3e9 2958 int *fa_index, int fa_start, unsigned int flags)
2959{
2960 int i;
2961
2962 for (i = 0; i < FNHE_HASH_SIZE; i++) {
2963 struct fib_nh_exception *fnhe;
2964
2965 for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
2966 fnhe = rcu_dereference(fnhe->fnhe_next)) {
2967 struct rtable *rt;
2968 int err;
2969
2970 if (*fa_index < fa_start)
2971 goto next;
2972
2973 if (fnhe->fnhe_genid != genid)
2974 goto next;
2975
2976 if (fnhe->fnhe_expires &&
2977 time_after(jiffies, fnhe->fnhe_expires))
2978 goto next;
2979
2980 rt = rcu_dereference(fnhe->fnhe_rth_input);
2981 if (!rt)
2982 rt = rcu_dereference(fnhe->fnhe_rth_output);
2983 if (!rt)
2984 goto next;
2985
2986 err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
2987 table_id, NULL, skb,
2988 NETLINK_CB(cb->skb).portid,
e93fb3e9 2989 cb->nlh->nlmsg_seq, flags);
2990 if (err)
2991 return err;
2992next:
2993 (*fa_index)++;
2994 }
2995 }
2996
2997 return 0;
2998}
2999
3000int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
3001 u32 table_id, struct fib_info *fi,
e93fb3e9 3002 int *fa_index, int fa_start, unsigned int flags)
3003{
3004 struct net *net = sock_net(cb->skb->sk);
3005 int nhsel, genid = fnhe_genid(net);
3006
3007 for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
3008 struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
3009 struct fnhe_hash_bucket *bucket;
3010 int err;
3011
3012 if (nhc->nhc_flags & RTNH_F_DEAD)
3013 continue;
3014
93ed54b1 3015 rcu_read_lock();
ee28906f 3016 bucket = rcu_dereference(nhc->nhc_exceptions);
3017 err = 0;
3018 if (bucket)
3019 err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
3020 genid, fa_index, fa_start,
3021 flags);
93ed54b1 3022 rcu_read_unlock();
3023 if (err)
3024 return err;
3025 }
3026
3027 return 0;
3028}
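/* Editorial note, not from the original source: the two functions above
 * feed cached PMTU/redirect exceptions into RTM_GETROUTE dumps; they appear
 * to be what surfaces as the RTM_F_CLONED entries of "ip route show cache".
 */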
3029
3030static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
3031 u8 ip_proto, __be16 sport,
3032 __be16 dport)
3033{
3034 struct sk_buff *skb;
3035 struct iphdr *iph;
3036
3037 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3038 if (!skb)
3039 return NULL;
3040
3041	/* Reserve room for dummy headers; this skb can pass
3042	 * through a good chunk of the routing engine.
3043 */
3044 skb_reset_mac_header(skb);
3045 skb_reset_network_header(skb);
3046 skb->protocol = htons(ETH_P_IP);
3047 iph = skb_put(skb, sizeof(struct iphdr));
3048 iph->protocol = ip_proto;
3049 iph->saddr = src;
3050 iph->daddr = dst;
3051 iph->version = 0x4;
3052 iph->frag_off = 0;
3053 iph->ihl = 0x5;
3054 skb_set_transport_header(skb, skb->len);
3055
3056 switch (iph->protocol) {
3057 case IPPROTO_UDP: {
3058 struct udphdr *udph;
3059
3060 udph = skb_put_zero(skb, sizeof(struct udphdr));
3061 udph->source = sport;
3062 udph->dest = dport;
3063 udph->len = sizeof(struct udphdr);
3064 udph->check = 0;
3065 break;
3066 }
3067 case IPPROTO_TCP: {
3068 struct tcphdr *tcph;
3069
3070 tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3071 tcph->source = sport;
3072 tcph->dest = dport;
3073 tcph->doff = sizeof(struct tcphdr) / 4;
3074 tcph->rst = 1;
3075 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3076 src, dst, 0);
3077 break;
3078 }
3079 case IPPROTO_ICMP: {
3080 struct icmphdr *icmph;
3081
3082 icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3083 icmph->type = ICMP_ECHO;
3084 icmph->code = 0;
3085 }
3086 }
3087
3088 return skb;
3089}
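/* Editorial note, not from the original source: the skb built above is a
 * minimal dummy; version/ihl are set but lengths and checksums are only
 * partially filled, since it just needs enough L3/L4 state for the route
 * lookup and the early flow dissection.
 */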
3090
3091static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3092 const struct nlmsghdr *nlh,
3093 struct nlattr **tb,
3094 struct netlink_ext_ack *extack)
3095{
3096 struct rtmsg *rtm;
3097 int i, err;
3098
3099 if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3100 NL_SET_ERR_MSG(extack,
3101 "ipv4: Invalid header for route get request");
3102 return -EINVAL;
3103 }
3104
3105 if (!netlink_strict_get_check(skb))
3106 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3107 rtm_ipv4_policy, extack);
3108
3109 rtm = nlmsg_data(nlh);
3110 if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3111 (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3112 rtm->rtm_table || rtm->rtm_protocol ||
3113 rtm->rtm_scope || rtm->rtm_type) {
3114 NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3115 return -EINVAL;
3116 }
3117
3118 if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3119 RTM_F_LOOKUP_TABLE |
3120 RTM_F_FIB_MATCH)) {
3121 NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3122 return -EINVAL;
3123 }
3124
3125 err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3126 rtm_ipv4_policy, extack);
3127 if (err)
3128 return err;
3129
3130 if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3131 (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3132 NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3133 return -EINVAL;
3134 }
3135
3136 for (i = 0; i <= RTA_MAX; i++) {
3137 if (!tb[i])
3138 continue;
3139
3140 switch (i) {
3141 case RTA_IIF:
3142 case RTA_OIF:
3143 case RTA_SRC:
3144 case RTA_DST:
3145 case RTA_IP_PROTO:
3146 case RTA_SPORT:
3147 case RTA_DPORT:
3148 case RTA_MARK:
3149 case RTA_UID:
3150 break;
3151 default:
3152 NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3153 return -EINVAL;
3154 }
3155 }
3156
3157 return 0;
3158}
3159
3160static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3161 struct netlink_ext_ack *extack)
1da177e4 3162{
3b1e0a65 3163 struct net *net = sock_net(in_skb->sk);
d889ce3b 3164 struct nlattr *tb[RTA_MAX+1];
3165 u32 table_id = RT_TABLE_MAIN;
3166 __be16 sport = 0, dport = 0;
3765d35e 3167 struct fib_result res = {};
404eb77e 3168 u8 ip_proto = IPPROTO_UDP;
1da177e4 3169 struct rtable *rt = NULL;
3170 struct sk_buff *skb;
3171 struct rtmsg *rtm;
e8e3fbe9 3172 struct flowi4 fl4 = {};
3173 __be32 dst = 0;
3174 __be32 src = 0;
404eb77e 3175 kuid_t uid;
9e12bb22 3176 u32 iif;
d889ce3b 3177 int err;
963bfeee 3178 int mark;
1da177e4 3179
a00302b6 3180 err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
d889ce3b 3181 if (err < 0)
404eb77e 3182 return err;
3183
3184 rtm = nlmsg_data(nlh);
3185 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3186 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
d889ce3b 3187 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
963bfeee 3188 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3189 if (tb[RTA_UID])
3190 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3191 else
3192 uid = (iif ? INVALID_UID : current_uid());
1da177e4 3193
3194 if (tb[RTA_IP_PROTO]) {
3195 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
5e1a99ea 3196 &ip_proto, AF_INET, extack);
3197 if (err)
3198 return err;
3199 }
bbadb9a2 3200
3201 if (tb[RTA_SPORT])
3202 sport = nla_get_be16(tb[RTA_SPORT]);
bbadb9a2 3203
3204 if (tb[RTA_DPORT])
3205 dport = nla_get_be16(tb[RTA_DPORT]);
3206
3207 skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3208 if (!skb)
3209 return -ENOBUFS;
bbadb9a2 3210
3211 fl4.daddr = dst;
3212 fl4.saddr = src;
3213 fl4.flowi4_tos = rtm->rtm_tos;
3214 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3215 fl4.flowi4_mark = mark;
622ec2c9 3216 fl4.flowi4_uid = uid;
3217 if (sport)
3218 fl4.fl4_sport = sport;
3219 if (dport)
3220 fl4.fl4_dport = dport;
3221 fl4.flowi4_proto = ip_proto;
d6c0a4f6 3222
3223 rcu_read_lock();
3224
1da177e4 3225 if (iif) {
3226 struct net_device *dev;
3227
3765d35e 3228 dev = dev_get_by_index_rcu(net, iif);
51456b29 3229 if (!dev) {
d889ce3b 3230 err = -ENODEV;
404eb77e 3231 goto errout_rcu;
3232 }
3233
404eb77e 3234 fl4.flowi4_iif = iif; /* for rt_fill_info */
1da177e4 3235 skb->dev = dev;
963bfeee 3236 skb->mark = mark;
3237 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
3238 dev, &res);
d889ce3b 3239
511c3f92 3240 rt = skb_rtable(skb);
3241 if (err == 0 && rt->dst.error)
3242 err = -rt->dst.error;
1da177e4 3243 } else {
6503a304 3244 fl4.flowi4_iif = LOOPBACK_IFINDEX;
21f94775 3245 skb->dev = net->loopback_dev;
3765d35e 3246 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3247 err = 0;
3248 if (IS_ERR(rt))
3249 err = PTR_ERR(rt);
3250 else
3251 skb_dst_set(skb, &rt->dst);
1da177e4 3252 }
d889ce3b 3253
1da177e4 3254 if (err)
404eb77e 3255 goto errout_rcu;
1da177e4 3256
3257 if (rtm->rtm_flags & RTM_F_NOTIFY)
3258 rt->rt_flags |= RTCF_NOTIFY;
3259
c36ba660 3260 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
68e813aa 3261 table_id = res.table ? res.table->tb_id : 0;
c36ba660 3262
3263 /* reset skb for netlink reply msg */
3264 skb_trim(skb, 0);
3265 skb_reset_network_header(skb);
3266 skb_reset_transport_header(skb);
3267 skb_reset_mac_header(skb);
3268
bc3aae2b 3269 if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3270 struct fib_rt_info fri;
3271
3272 if (!res.fi) {
3273 err = fib_props[res.type].error;
3274 if (!err)
3275 err = -EHOSTUNREACH;
404eb77e 3276 goto errout_rcu;
bc3aae2b 3277 }
3278 fri.fi = res.fi;
3279 fri.tb_id = table_id;
3280 fri.dst = res.prefix;
3281 fri.dst_len = res.prefixlen;
3282 fri.tos = fl4.flowi4_tos;
3283 fri.type = rt->rt_type;
3284 fri.offload = 0;
3285 fri.trap = 0;
3286 if (res.fa_head) {
3287 struct fib_alias *fa;
3288
3289 hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
3290 u8 slen = 32 - fri.dst_len;
3291
3292 if (fa->fa_slen == slen &&
3293 fa->tb_id == fri.tb_id &&
3294 fa->fa_tos == fri.tos &&
3295 fa->fa_info == res.fi &&
3296 fa->fa_type == fri.type) {
3297 fri.offload = fa->offload;
3298 fri.trap = fa->trap;
3299 break;
3300 }
3301 }
3302 }
b6179813 3303 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
1e301fd0 3304 nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
bc3aae2b 3305 } else {
404eb77e 3306 err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3307 NETLINK_CB(in_skb).portid,
3308 nlh->nlmsg_seq, 0);
bc3aae2b 3309 }
7b46a644 3310 if (err < 0)
404eb77e 3311 goto errout_rcu;
1da177e4 3312
3313 rcu_read_unlock();
3314
15e47304 3315 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
1da177e4 3316
d889ce3b 3317errout_free:
3318 return err;
3319errout_rcu:
3765d35e 3320 rcu_read_unlock();
1da177e4 3321 kfree_skb(skb);
404eb77e 3322 goto errout_free;
3323}
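/* Editorial note, not from the original source: this handler serves
 * RTM_GETROUTE, e.g. (illustrative) "ip route get 192.0.2.1 iif eth0";
 * adding "fibmatch" sets RTM_F_FIB_MATCH so the reply describes the matched
 * FIB entry rather than the resolved dst.
 */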
3324
3325void ip_rt_multicast_event(struct in_device *in_dev)
3326{
4ccfe6d4 3327 rt_cache_flush(dev_net(in_dev->dev));
3328}
3329
3330#ifdef CONFIG_SYSCTL
3331static int ip_rt_gc_interval __read_mostly = 60 * HZ;
3332static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
3333static int ip_rt_gc_elasticity __read_mostly = 8;
773daa3c 3334static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
082c7ca4 3335
fe2c6338 3336static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
8d65af78 3337 void __user *buffer,
3338 size_t *lenp, loff_t *ppos)
3339{
3340 struct net *net = (struct net *)__ctl->extra1;
3341
1da177e4 3342 if (write) {
3343 rt_cache_flush(net);
3344 fnhe_genid_bump(net);
1da177e4 3345 return 0;
e905a9ed 3346 }
3347
3348 return -EINVAL;
3349}
3350
fe2c6338 3351static struct ctl_table ipv4_route_table[] = {
1da177e4 3352 {
3353 .procname = "gc_thresh",
3354 .data = &ipv4_dst_ops.gc_thresh,
3355 .maxlen = sizeof(int),
3356 .mode = 0644,
6d9f239a 3357 .proc_handler = proc_dointvec,
3358 },
3359 {
3360 .procname = "max_size",
3361 .data = &ip_rt_max_size,
3362 .maxlen = sizeof(int),
3363 .mode = 0644,
6d9f239a 3364 .proc_handler = proc_dointvec,
3365 },
3366 {
3367 /* Deprecated. Use gc_min_interval_ms */
e905a9ed 3368
3369 .procname = "gc_min_interval",
3370 .data = &ip_rt_gc_min_interval,
3371 .maxlen = sizeof(int),
3372 .mode = 0644,
6d9f239a 3373 .proc_handler = proc_dointvec_jiffies,
3374 },
3375 {
3376 .procname = "gc_min_interval_ms",
3377 .data = &ip_rt_gc_min_interval,
3378 .maxlen = sizeof(int),
3379 .mode = 0644,
6d9f239a 3380 .proc_handler = proc_dointvec_ms_jiffies,
3381 },
3382 {
3383 .procname = "gc_timeout",
3384 .data = &ip_rt_gc_timeout,
3385 .maxlen = sizeof(int),
3386 .mode = 0644,
6d9f239a 3387 .proc_handler = proc_dointvec_jiffies,
1da177e4 3388 },
3389 {
3390 .procname = "gc_interval",
3391 .data = &ip_rt_gc_interval,
3392 .maxlen = sizeof(int),
3393 .mode = 0644,
3394 .proc_handler = proc_dointvec_jiffies,
3395 },
1da177e4 3396 {
3397 .procname = "redirect_load",
3398 .data = &ip_rt_redirect_load,
3399 .maxlen = sizeof(int),
3400 .mode = 0644,
6d9f239a 3401 .proc_handler = proc_dointvec,
3402 },
3403 {
3404 .procname = "redirect_number",
3405 .data = &ip_rt_redirect_number,
3406 .maxlen = sizeof(int),
3407 .mode = 0644,
6d9f239a 3408 .proc_handler = proc_dointvec,
3409 },
3410 {
3411 .procname = "redirect_silence",
3412 .data = &ip_rt_redirect_silence,
3413 .maxlen = sizeof(int),
3414 .mode = 0644,
6d9f239a 3415 .proc_handler = proc_dointvec,
3416 },
3417 {
3418 .procname = "error_cost",
3419 .data = &ip_rt_error_cost,
3420 .maxlen = sizeof(int),
3421 .mode = 0644,
6d9f239a 3422 .proc_handler = proc_dointvec,
3423 },
3424 {
3425 .procname = "error_burst",
3426 .data = &ip_rt_error_burst,
3427 .maxlen = sizeof(int),
3428 .mode = 0644,
6d9f239a 3429 .proc_handler = proc_dointvec,
3430 },
3431 {
3432 .procname = "gc_elasticity",
3433 .data = &ip_rt_gc_elasticity,
3434 .maxlen = sizeof(int),
3435 .mode = 0644,
6d9f239a 3436 .proc_handler = proc_dointvec,
3437 },
3438 {
3439 .procname = "mtu_expires",
3440 .data = &ip_rt_mtu_expires,
3441 .maxlen = sizeof(int),
3442 .mode = 0644,
6d9f239a 3443 .proc_handler = proc_dointvec_jiffies,
3444 },
3445 {
3446 .procname = "min_pmtu",
3447 .data = &ip_rt_min_pmtu,
3448 .maxlen = sizeof(int),
3449 .mode = 0644,
3450 .proc_handler = proc_dointvec_minmax,
3451 .extra1 = &ip_min_valid_pmtu,
3452 },
3453 {
3454 .procname = "min_adv_mss",
3455 .data = &ip_rt_min_advmss,
3456 .maxlen = sizeof(int),
3457 .mode = 0644,
6d9f239a 3458 .proc_handler = proc_dointvec,
1da177e4 3459 },
f8572d8f 3460 { }
1da177e4 3461};
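/* Editorial note, not from the original source: the table above surfaces as
 * /proc/sys/net/ipv4/route/<name>, e.g. (illustrative)
 *	sysctl -w net.ipv4.route.min_pmtu=552
 * The per-netns "flush" entry below is write-only and simply invalidates
 * the cache via ipv4_sysctl_rtcache_flush().
 */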
39a23e75 3462
3463static const char ipv4_route_flush_procname[] = "flush";
3464
3465static struct ctl_table ipv4_route_flush_table[] = {
3466 {
5cdda5f1 3467 .procname = ipv4_route_flush_procname,
3468 .maxlen = sizeof(int),
3469 .mode = 0200,
6d9f239a 3470 .proc_handler = ipv4_sysctl_rtcache_flush,
39a23e75 3471 },
f8572d8f 3472 { },
3473};
3474
3475static __net_init int sysctl_route_net_init(struct net *net)
3476{
3477 struct ctl_table *tbl;
3478
3479 tbl = ipv4_route_flush_table;
09ad9bc7 3480 if (!net_eq(net, &init_net)) {
39a23e75 3481 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
51456b29 3482 if (!tbl)
39a23e75 3483 goto err_dup;
464dc801 3484
3485 /* Don't export non-whitelisted sysctls to unprivileged users */
3486 if (net->user_ns != &init_user_ns) {
3487 if (tbl[0].procname != ipv4_route_flush_procname)
3488 tbl[0].procname = NULL;
3489 }
3490 }
3491 tbl[0].extra1 = net;
3492
ec8f23ce 3493 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
51456b29 3494 if (!net->ipv4.route_hdr)
39a23e75
DL
3495 goto err_reg;
3496 return 0;
3497
3498err_reg:
3499 if (tbl != ipv4_route_flush_table)
3500 kfree(tbl);
3501err_dup:
3502 return -ENOMEM;
3503}
3504
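/*
 * Teardown mirrors init: unregister the sysctl header first, then free
 * the kmemdup()ed copy.  The BUG_ON() ensures we never kfree() the
 * static ipv4_route_flush_table template itself.
 */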
static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif

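/*
 * Generation counters: cached dsts record the rt_genid they were created
 * under and are treated as stale once it changes, which makes a "flush"
 * O(1).  fnhe_genid plays the same role for fib next-hop exceptions, and
 * dev_addr_genid is seeded randomly so its values differ across boots.
 */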
static __net_init int rt_genid_init(struct net *net)
{
	atomic_set(&net->ipv4.rt_genid, 0);
	atomic_set(&net->fnhe_genid, 0);
	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};

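/*
 * Each netns gets its own inet_peer_base, the root of the tree tracking
 * long-lived per-remote-host state (ICMP rate limiting and the like).
 * Exit invalidates the tree before freeing the base so no peer entries
 * outlive their namespace.
 */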
static int __net_init ipv4_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv4.peers = bp;
	return 0;
}

static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init = ipv4_inetpeer_init,
	.exit = ipv4_inetpeer_exit,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

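/*
 * Boot-time initialization: allocate the IP ID state, set up the per-cpu
 * uncached-route lists and the dst slab caches, then bring up devinet,
 * the FIB, /proc, xfrm, and the netlink and pernet hooks.
 */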
int __init ip_rt_init(void)
{
	int cpu;

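	/*
	 * ip_idents is an array of IP_IDENTS_SZ atomic counters backing
	 * IP ID generation: a flow hashes to one bucket, and the matching
	 * ip_tstamps slot records when that bucket was last used so the
	 * counter can jump by a random stride after idle periods, keeping
	 * IDs hard to predict.
	 */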
	ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
				  GFP_KERNEL);
	if (!ip_idents)
		panic("IP: failed to allocate ip_idents\n");

	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));

	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
	if (!ip_tstamps)
		panic("IP: failed to allocate ip_tstamps\n");

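	/*
	 * rt_uncached_list threads together rtables that live outside any
	 * cache, so rt_flush_dev() can still find them and drop their
	 * device references when a netdevice disappears.
	 */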
	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}
#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct),
				    __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

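	/*
	 * One slab serves both the ordinary and the blackhole dst_ops.
	 * With the routing cache long gone, gc_thresh and ip_rt_max_size
	 * are set to "unlimited"; dst_entries_init() just provides the
	 * percpu entry counters the dst core still expects.
	 */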
	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;

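	/*
	 * Remaining bring-up: device/address handling (devinet), the FIB
	 * itself, /proc statistics, optional xfrm hooks, and the
	 * RTM_GETROUTE handler, which RTNL_FLAG_DOIT_UNLOCKED allows to
	 * run without taking the RTNL lock.
	 */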
	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
		      RTNL_FLAG_DOIT_UNLOCKED);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return 0;
}
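/*
 * ipv4_route_table (the tunables) is registered here once, for init_net
 * only; the pernet code above only ever registers the per-namespace
 * "flush" entry.  This is called early in IPv4 bring-up (from
 * inet_init()), hence the comment below about init order.
 */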
#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif