// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD;
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *		Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/nexthop.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#include "fib_lookup.h"

#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static u32 ip_rt_min_pmtu __read_mostly	= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;

static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;

/*
 * Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu,
					   bool confirm_neigh);
static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static void		 ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr);
static void		 ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.redirect =		ip_do_redirect,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
	.confirm_neigh =	ipv4_confirm_neigh,
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

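/* Usage sketch for the table above: the four TOS bits are shifted
 * right by one to index ip_tos2prio, which is what the
 * rt_tos2priority() helper in <net/route.h> does, roughly:
 *
 *	static inline char rt_tos2priority(u8 tos)
 *	{
 *		return ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *	}
 *
 * e.g. IPTOS_LOWDELAY (0x10) gives index 8, i.e. TC_PRIO_INTERACTIVE.
 */
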
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (*pos)
		return NULL;
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cache_seq_ops);
}

static const struct proc_ops rt_cache_proc_ops = {
	.proc_open	= rt_cache_seq_open,
	.proc_read	= seq_read,
	.proc_lseek	= seq_lseek,
	.proc_release	= seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	(*pos)++;
	return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   0, /* st->in_hit */
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   0, /* st->out_hit */
		   st->out_slow_tot,
		   st->out_slow_mc,

		   0, /* st->gc_total */
		   0, /* st->gc_ignored */
		   0, /* st->gc_goal_miss */
		   0, /* st->gc_dst_overflow */
		   0, /* st->in_hlist_search */
		   0  /* st->out_hlist_search */
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct proc_ops rt_cpu_proc_ops = {
	.proc_open	= rt_cpu_seq_open,
	.proc_read	= seq_read,
	.proc_lseek	= seq_lseek,
	.proc_release	= seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_create("rt_cache", 0444, net->proc_net,
			  &rt_cache_proc_ops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", 0444,
			  net->proc_net_stat, &rt_cpu_proc_ops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create_single("rt_acct", 0, net->proc_net,
			rt_acct_proc_show);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr)
{
	const struct rtable *rt = container_of(dst, struct rtable, dst);
	struct net_device *dev = dst->dev;
	struct neighbour *n;

	rcu_read_lock_bh();

	if (likely(rt->rt_gw_family == AF_INET)) {
		n = ip_neigh_gw4(dev, rt->rt_gw4);
	} else if (rt->rt_gw_family == AF_INET6) {
		n = ip_neigh_gw6(dev, &rt->rt_gw6);
	} else {
		__be32 pkey;

		pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
		n = ip_neigh_gw4(dev, pkey);
	}

	if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
		n = NULL;

	rcu_read_unlock_bh();

	return n;
}

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	const struct rtable *rt = container_of(dst, struct rtable, dst);
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;

	if (rt->rt_gw_family == AF_INET) {
		pkey = (const __be32 *)&rt->rt_gw4;
	} else if (rt->rt_gw_family == AF_INET6) {
		return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
	} else if (!daddr ||
		   (rt->rt_flags &
		    (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
		return;
	}
	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

#define IP_IDENTS_SZ 2048u

static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
	u32 old = READ_ONCE(*p_tstamp);
	u32 now = (u32)jiffies;
	u32 new, delta = 0;

	if (old != now && cmpxchg(p_tstamp, old, now) == old)
		delta = prandom_u32_max(now - old);

	/* Do not use atomic_add_return() as it makes UBSAN unhappy */
	do {
		old = (u32)atomic_read(p_id);
		new = old + delta + segs;
	} while (atomic_cmpxchg(p_id, old, new) != old);

	return new - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);

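/* Worked example of the perturbation above, assuming HZ=1000: if a
 * bucket was last used 3 seconds ago (now - old == 3000), the next
 * reservation adds a random delta in [0, 3000) on top of the requested
 * segment count, so sampling IDs before and after the idle period does
 * not reveal how many packets were sent in between.
 */
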
void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
	u32 hash, id;

	/* Note the following code is not safe, but this is okay. */
	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
		get_random_bytes(&net->ipv4.ip_id_key,
				 sizeof(net->ipv4.ip_id_key));

	hash = siphash_3u32((__force u32)iph->daddr,
			    (__force u32)iph->saddr,
			    iph->protocol,
			    &net->ipv4.ip_id_key);
	id = ip_idents_reserve(hash, segs);
	iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
			     const struct sock *sk,
			     const struct iphdr *iph,
			     int oif, u8 tos,
			     u8 prot, u32 mark, int flow_flags)
{
	if (sk) {
		const struct inet_sock *inet = inet_sk(sk);

		oif = sk->sk_bound_dev_if;
		mark = sk->sk_mark;
		tos = RT_CONN_FLAGS(sk);
		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
	}
	flowi4_init_output(fl4, oif, mark, tos,
			   RT_SCOPE_UNIVERSE, prot,
			   flow_flags,
			   iph->daddr, iph->saddr, 0, 0,
			   sock_net_uid(net, sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
			       const struct sock *sk)
{
	const struct net *net = dev_net(skb->dev);
	const struct iphdr *iph = ip_hdr(skb);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;
	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
			   inet_sk_flowi_flags(sk),
			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
	rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (skb)
		build_skb_flow_key(fl4, skb, sk);
	else
		build_sk_flow_key(fl4, sk);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
	struct rtable *rt;

	rt = rcu_dereference(fnhe->fnhe_rth_input);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
	rt = rcu_dereference(fnhe->fnhe_rth_output);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
	struct fib_nh_exception *fnhe, *oldest;

	oldest = rcu_dereference(hash->chain);
	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
			oldest = fnhe;
	}
	fnhe_flush_routes(oldest);
	return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
	static u32 fnhe_hashrnd __read_mostly;
	u32 hval;

	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
	hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
	return hash_32(hval, FNHE_HASH_SHIFT);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
	rt->rt_pmtu = fnhe->fnhe_pmtu;
	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
	rt->dst.expires = fnhe->fnhe_expires;

	if (fnhe->fnhe_gw) {
		rt->rt_flags |= RTCF_REDIRECTED;
		rt->rt_uses_gateway = 1;
		rt->rt_gw_family = AF_INET;
		rt->rt_gw4 = fnhe->fnhe_gw;
	}
}

static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
				  __be32 gw, u32 pmtu, bool lock,
				  unsigned long expires)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe;
	struct rtable *rt;
	u32 genid, hval;
	unsigned int i;
	int depth;

	genid = fnhe_genid(dev_net(nhc->nhc_dev));
	hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference(nhc->nhc_exceptions);
	if (!hash) {
		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
		if (!hash)
			goto out_unlock;
		rcu_assign_pointer(nhc->nhc_exceptions, hash);
	}

	hash += hval;

	depth = 0;
	for (fnhe = rcu_dereference(hash->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			break;
		depth++;
	}

	if (fnhe) {
		if (fnhe->fnhe_genid != genid)
			fnhe->fnhe_genid = genid;
		if (gw)
			fnhe->fnhe_gw = gw;
		if (pmtu) {
			fnhe->fnhe_pmtu = pmtu;
			fnhe->fnhe_mtu_locked = lock;
		}
		fnhe->fnhe_expires = max(1UL, expires);
		/* Update all cached dsts too */
		rt = rcu_dereference(fnhe->fnhe_rth_input);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
		rt = rcu_dereference(fnhe->fnhe_rth_output);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
	} else {
		if (depth > FNHE_RECLAIM_DEPTH)
			fnhe = fnhe_oldest(hash);
		else {
			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
			if (!fnhe)
				goto out_unlock;

			fnhe->fnhe_next = hash->chain;
			rcu_assign_pointer(hash->chain, fnhe);
		}
		fnhe->fnhe_genid = genid;
		fnhe->fnhe_daddr = daddr;
		fnhe->fnhe_gw = gw;
		fnhe->fnhe_pmtu = pmtu;
		fnhe->fnhe_mtu_locked = lock;
		fnhe->fnhe_expires = max(1UL, expires);

		/* Exception created; mark the cached routes for the nexthop
		 * stale, so anyone caching it rechecks if this exception
		 * applies to them.
		 */
		rt = rcu_dereference(nhc->nhc_rth_input);
		if (rt)
			rt->dst.obsolete = DST_OBSOLETE_KILL;

		for_each_possible_cpu(i) {
			struct rtable __rcu **prt;

			prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
			rt = rcu_dereference(*prt);
			if (rt)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
		}
	}

	fnhe->fnhe_stamp = jiffies;

out_unlock:
	spin_unlock_bh(&fnhe_lock);
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
			     bool kill_route)
{
	__be32 new_gw = icmp_hdr(skb)->un.gateway;
	__be32 old_gw = ip_hdr(skb)->saddr;
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct fib_result res;
	struct neighbour *n;
	struct net *net;

	switch (icmp_hdr(skb)->code & 7) {
	case ICMP_REDIR_NET:
	case ICMP_REDIR_NETTOS:
	case ICMP_REDIR_HOST:
	case ICMP_REDIR_HOSTTOS:
		break;

	default:
		return;
	}

	if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
		return;

	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
	if (!n)
		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
	if (!IS_ERR(n)) {
		if (!(n->nud_state & NUD_VALID)) {
			neigh_event_send(n, NULL);
		} else {
			if (fib_lookup(net, fl4, &res, 0) == 0) {
				struct fib_nh_common *nhc = FIB_RES_NHC(res);

				update_or_create_fnhe(nhc, fl4->daddr, new_gw,
						0, false,
						jiffies + ip_rt_gc_timeout);
			}
			if (kill_route)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
		}
		neigh_release(n);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev)) {
		const struct iphdr *iph = (const struct iphdr *) skb->data;
		__be32 daddr = iph->daddr;
		__be32 saddr = iph->saddr;

		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     "  Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
	}
#endif
	;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct rtable *rt;
	struct flowi4 fl4;
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct net *net = dev_net(skb->dev);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	rt = (struct rtable *) dst;

	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
	__ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->dst.expires) {
			ip_rt_put(rt);
			ret = NULL;
		}
	}
	return ret;
}

/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	struct net *net;
	int log_martians;
	int vif;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
	rcu_read_unlock();

	net = dev_net(rt->dst.dev);
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
			  rt_nexthop(rt, ip_hdr(skb)->daddr));
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
		peer->rate_tokens = 0;
		peer->n_redirects = 0;
	}

	/* Too many ignored redirects; do not send anything.
	 * Set rate_last to the last seen redirected packet.
	 */
	if (peer->n_redirects >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		goto out_put_peer;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->n_redirects == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->n_redirects)))) {
		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
		peer->rate_last = jiffies;
		++peer->n_redirects;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->n_redirects == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, inet_iif(skb),
					     &ip_hdr(skb)->daddr, &gw);
#endif
	}
out_put_peer:
	inet_putpeer(peer);
}

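/* Backoff sketch, assuming the default tunables above (redirect_number
 * = 9, redirect_load = HZ/50): the k-th redirect is only sent once
 * jiffies > rate_last + (HZ/50 << k), so the required gap doubles from
 * 20ms up to roughly 5s; after 9 ignored redirects we go silent until
 * ip_rt_redirect_silence (~20s) passes without triggering packets,
 * which resets the counters.
 */
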
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct inet_peer *peer;
	unsigned long now;
	struct net *net;
	bool send;
	int code;

	if (netif_is_l3_master(skb->dev)) {
		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
		if (!dev)
			goto out;
	}

	in_dev = __in_dev_get_rcu(dev);

	/* IP on this device is disabled. */
	if (!in_dev)
		goto out;

	net = dev_net(rt->dst.dev);
	if (!IN_DEV_FORWARD(in_dev)) {
		switch (rt->dst.error) {
		case EHOSTUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
			break;

		case ENETUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
			break;
		}
		goto out;
	}

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
			       l3mdev_master_ifindex(skb->dev), 1);

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
		inet_putpeer(peer);
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct dst_entry *dst = &rt->dst;
	u32 old_mtu = ipv4_mtu(dst);
	struct fib_result res;
	bool lock = false;

	if (ip_mtu_locked(dst))
		return;

	if (old_mtu < mtu)
		return;

	if (mtu < ip_rt_min_pmtu) {
		lock = true;
		mtu = min(old_mtu, ip_rt_min_pmtu);
	}

	if (rt->rt_pmtu == mtu && !lock &&
	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
		return;

	rcu_read_lock();
	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
		struct fib_nh_common *nhc = FIB_RES_NHC(res);

		update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
				      jiffies + ip_rt_mtu_expires);
	}
	rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			      struct sk_buff *skb, u32 mtu,
			      bool confirm_neigh)
{
	struct rtable *rt = (struct rtable *) dst;
	struct flowi4 fl4;

	ip_rt_build_flow_key(&fl4, sk, skb);
	__ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
		      int oif, u8 protocol)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	u32 mark = IP4_REPLY_MARK(net, skb->mark);

	__build_flow_key(net, &fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

	if (!fl4.flowi4_mark)
		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

	rt = __ip_route_output_key(sock_net(sk), &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct dst_entry *odst = NULL;
	bool new = false;
	struct net *net = sock_net(sk);

	bh_lock_sock(sk);

	if (!ip_sk_accept_pmtu(sk))
		goto out;

	odst = sk_dst_get(sk);

	if (sock_owned_by_user(sk) || !odst) {
		__ipv4_sk_update_pmtu(skb, sk, mtu);
		goto out;
	}

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

	rt = (struct rtable *)odst;
	if (odst->obsolete && !odst->ops->check(odst, 0)) {
		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	__ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);

	if (!dst_check(&rt->dst, 0)) {
		if (new)
			dst_release(&rt->dst);

		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	if (new)
		sk_dst_set(sk, &rt->dst);

out:
	bh_unlock_sock(sk);
	dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
		   int oif, u8 protocol)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(net, &fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, 0, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct net *net = sock_net(sk);

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	/* All IPV4 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 *
	 * When a PMTU/redirect information update invalidates a route,
	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
	 * DST_OBSOLETE_DEAD.
	 */
	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
		return NULL;
	return dst;
}

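/* Caller-side contract, as used by ipv4_sk_update_pmtu() above: a
 * cached route is revalidated with dst_check(&rt->dst, 0); a NULL
 * result (obsolete flipped away from DST_OBSOLETE_FORCE_CHK, or the
 * namespace genid bumped by rt_cache_flush()) tells the caller to drop
 * the dst and redo the route lookup.
 */
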
static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
	struct ip_options opt;
	int res;

	/* Recompile ip options since IPCB may not be valid anymore.
	 * Also check we have a reasonable ipv4 header.
	 */
	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
	    ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
		return;

	memset(&opt, 0, sizeof(opt));
	if (ip_hdr(skb)->ihl > 5) {
		if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
			return;
		opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);

		rcu_read_lock();
		res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
		rcu_read_unlock();

		if (res)
			return;
	}
	__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
}

static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	ipv4_send_dest_unreach(skb);

	rt = skb_rtable(skb);
	if (rt)
		dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	pr_debug("%s: %pI4 -> %pI4, %s\n",
		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		 skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}

/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by IP RR, TS and SRR options,
 * so that it is out of the fast path.
 *
 * BTW remember: "addr" is allowed to be not aligned
 * in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct iphdr *iph = ip_hdr(skb);
		struct flowi4 fl4 = {
			.daddr = iph->daddr,
			.saddr = iph->saddr,
			.flowi4_tos = RT_TOS(iph->tos),
			.flowi4_oif = rt->dst.dev->ifindex,
			.flowi4_iif = skb->dev->ifindex,
			.flowi4_mark = skb->mark,
		};

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
		else
			src = inet_select_addr(rt->dst.dev,
					       rt_nexthop(rt, iph->daddr),
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
				    ip_rt_min_advmss);

	return min(advmss, IPV4_MAX_PMTU - header_size);
}

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = rt->rt_pmtu;

	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
		mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu)
		return mtu;

	mtu = READ_ONCE(dst->dev->mtu);

	if (unlikely(ip_mtu_locked(dst))) {
		if (rt->rt_uses_gateway && mtu > 576)
			mtu = 576;
	}

	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference_protected(nhc->nhc_exceptions,
					 lockdep_is_held(&fnhe_lock));
	hash += hval;

	fnhe_p = &hash->chain;
	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
	while (fnhe) {
		if (fnhe->fnhe_daddr == daddr) {
			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
			/* set fnhe_daddr to 0 to ensure it won't bind with
			 * new dsts in rt_bind_exception().
			 */
			fnhe->fnhe_daddr = 0;
			fnhe_flush_routes(fnhe);
			kfree_rcu(fnhe, rcu);
			break;
		}
		fnhe_p = &fnhe->fnhe_next;
		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
						 lockdep_is_held(&fnhe_lock));
	}

	spin_unlock_bh(&fnhe_lock);
}

static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
					       __be32 daddr)
{
	struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
	struct fib_nh_exception *fnhe;
	u32 hval;

	if (!hash)
		return NULL;

	hval = fnhe_hashfun(daddr);

	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr) {
			if (fnhe->fnhe_expires &&
			    time_after(jiffies, fnhe->fnhe_expires)) {
				ip_del_fnhe(nhc, daddr);
				break;
			}
			return fnhe;
		}
	}
	return NULL;
}

/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 */

u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
{
	struct fib_nh_common *nhc = res->nhc;
	struct net_device *dev = nhc->nhc_dev;
	struct fib_info *fi = res->fi;
	u32 mtu = 0;

	if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
		mtu = fi->fib_mtu;

	if (likely(!mtu)) {
		struct fib_nh_exception *fnhe;

		fnhe = find_exception(nhc, daddr);
		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
			mtu = fnhe->fnhe_pmtu;
	}

	if (likely(!mtu))
		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);

	return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
}

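/* Example of the selection order: with a device MTU of 1500 and a
 * route whose MTU metric of 1400 is locked, step 1 wins and 1400 is
 * used even if a smaller path MTU was learned. Without a locked
 * metric but with a still-valid nexthop exception of 1280, step 2
 * yields 1280; otherwise step 3 falls back to min(1500, IP_MAX_MTU),
 * minus any lwtunnel encapsulation headroom.
 */
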
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
			      __be32 daddr, const bool do_cache)
{
	bool ret = false;

	spin_lock_bh(&fnhe_lock);

	if (daddr == fnhe->fnhe_daddr) {
		struct rtable __rcu **porig;
		struct rtable *orig;
		int genid = fnhe_genid(dev_net(rt->dst.dev));

		if (rt_is_input_route(rt))
			porig = &fnhe->fnhe_rth_input;
		else
			porig = &fnhe->fnhe_rth_output;
		orig = rcu_dereference(*porig);

		if (fnhe->fnhe_genid != genid) {
			fnhe->fnhe_genid = genid;
			fnhe->fnhe_gw = 0;
			fnhe->fnhe_pmtu = 0;
			fnhe->fnhe_expires = 0;
			fnhe->fnhe_mtu_locked = false;
			fnhe_flush_routes(fnhe);
			orig = NULL;
		}
		fill_route_from_fnhe(rt, fnhe);
		if (!rt->rt_gw4) {
			rt->rt_gw4 = daddr;
			rt->rt_gw_family = AF_INET;
		}

		if (do_cache) {
			dst_hold(&rt->dst);
			rcu_assign_pointer(*porig, rt);
			if (orig) {
				dst_dev_put(&orig->dst);
				dst_release(&orig->dst);
			}
			ret = true;
		}

		fnhe->fnhe_stamp = jiffies;
	}
	spin_unlock_bh(&fnhe_lock);

	return ret;
}

static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
{
	struct rtable *orig, *prev, **p;
	bool ret = true;

	if (rt_is_input_route(rt)) {
		p = (struct rtable **)&nhc->nhc_rth_input;
	} else {
		p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
	}
	orig = *p;

	/* hold dst before doing cmpxchg() to avoid race condition
	 * on this dst
	 */
	dst_hold(&rt->dst);
	prev = cmpxchg(p, orig, rt);
	if (prev == orig) {
		if (orig) {
			rt_add_uncached_list(orig);
			dst_release(&orig->dst);
		}
	} else {
		dst_release(&rt->dst);
		ret = false;
	}

	return ret;
}

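/* On the race noted above: the reference is taken before the cmpxchg()
 * so that, if we win the swap, the route is already pinned when it
 * becomes visible through the cache pointer; if we lose, the extra
 * reference is simply dropped again and the caller falls back to the
 * uncached path.
 */
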
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);

void rt_add_uncached_list(struct rtable *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);

	rt->rt_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt_del_uncached_list(struct rtable *rt)
{
	if (!list_empty(&rt->rt_uncached)) {
		struct uncached_list *ul = rt->rt_uncached_list;

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt_uncached);
		spin_unlock_bh(&ul->lock);
	}
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;

	ip_dst_metrics_put(dst);
	rt_del_uncached_list(rt);
}

void rt_flush_dev(struct net_device *dev)
{
	struct rtable *rt;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt_uncached) {
			if (rt->dst.dev != dev)
				continue;
			rt->dst.dev = blackhole_netdev;
			dev_hold(rt->dst.dev);
			dev_put(dev);
		}
		spin_unlock_bh(&ul->lock);
	}
}

static bool rt_cache_valid(const struct rtable *rt)
{
	return	rt &&
		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
		!rt_is_expired(rt);
}

f2bb4bed 1565 struct fib_nh_exception *fnhe,
a4c2fd7f
WW
1566 struct fib_info *fi, u16 type, u32 itag,
1567 const bool do_cache)
1da177e4 1568{
caacf05e
DM
1569 bool cached = false;
1570
1da177e4 1571 if (fi) {
eba618ab 1572 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
4895c771 1573
0f5f7d7b 1574 if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
77d5bc7e 1575 rt->rt_uses_gateway = 1;
0f5f7d7b
DA
1576 rt->rt_gw_family = nhc->nhc_gw_family;
1577 /* only INET and INET6 are supported */
1578 if (likely(nhc->nhc_gw_family == AF_INET))
1579 rt->rt_gw4 = nhc->nhc_gw.ipv4;
1580 else
1581 rt->rt_gw6 = nhc->nhc_gw.ipv6;
155e8336 1582 }
0f5f7d7b 1583
e1255ed4
DA
1584 ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1585
c7066f70 1586#ifdef CONFIG_IP_ROUTE_CLASSID
dcb1ecb5 1587 if (nhc->nhc_family == AF_INET) {
87063a1f
DA
1588 struct fib_nh *nh;
1589
1590 nh = container_of(nhc, struct fib_nh, nh_common);
1591 rt->dst.tclassid = nh->nh_tclassid;
1592 }
1da177e4 1593#endif
87063a1f 1594 rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
c5038a83 1595 if (unlikely(fnhe))
a4c2fd7f
WW
1596 cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1597 else if (do_cache)
87063a1f 1598 cached = rt_cache_route(nhc, rt);
155e8336
JA
1599 if (unlikely(!cached)) {
1600 /* Routes we intend to cache in nexthop exception or
1601 * FIB nexthop have the DST_NOCACHE bit clear.
1602 * However, if we are unsuccessful at storing this
1603 * route into the cache we really need to set it.
1604 */
1550c171
DA
1605 if (!rt->rt_gw4) {
1606 rt->rt_gw_family = AF_INET;
1607 rt->rt_gw4 = daddr;
1608 }
155e8336
JA
1609 rt_add_uncached_list(rt);
1610 }
1611 } else
caacf05e 1612 rt_add_uncached_list(rt);
defb3519 1613
c7066f70 1614#ifdef CONFIG_IP_ROUTE_CLASSID
1da177e4 1615#ifdef CONFIG_IP_MULTIPLE_TABLES
85b91b03 1616 set_class_tag(rt, res->tclassid);
1da177e4
LT
1617#endif
1618 set_class_tag(rt, itag);
1619#endif
1da177e4
LT
1620}
1621
struct rtable *rt_dst_alloc(struct net_device *dev,
			    unsigned int flags, u16 type,
			    bool nopolicy, bool noxfrm)
{
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
		       (nopolicy ? DST_NOPOLICY : 0) |
		       (noxfrm ? DST_NOXFRM : 0));

	if (rt) {
		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
		rt->rt_flags = flags;
		rt->rt_type = type;
		rt->rt_is_input = 0;
		rt->rt_iif = 0;
		rt->rt_pmtu = 0;
		rt->rt_mtu_locked = 0;
		rt->rt_uses_gateway = 0;
		rt->rt_gw_family = 0;
		rt->rt_gw4 = 0;
		INIT_LIST_HEAD(&rt->rt_uncached);

		rt->dst.output = ip_output;
		if (flags & RTCF_LOCAL)
			rt->dst.input = ip_local_deliver;
	}

	return rt;
}
EXPORT_SYMBOL(rt_dst_alloc);

struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
{
	struct rtable *new_rt;

	new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
			   rt->dst.flags);

	if (new_rt) {
		new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
		new_rt->rt_flags = rt->rt_flags;
		new_rt->rt_type = rt->rt_type;
		new_rt->rt_is_input = rt->rt_is_input;
		new_rt->rt_iif = rt->rt_iif;
		new_rt->rt_pmtu = rt->rt_pmtu;
		new_rt->rt_mtu_locked = rt->rt_mtu_locked;
		new_rt->rt_gw_family = rt->rt_gw_family;
		if (rt->rt_gw_family == AF_INET)
			new_rt->rt_gw4 = rt->rt_gw4;
		else if (rt->rt_gw_family == AF_INET6)
			new_rt->rt_gw6 = rt->rt_gw6;
		INIT_LIST_HEAD(&new_rt->rt_uncached);

		new_rt->dst.input = rt->dst.input;
		new_rt->dst.output = rt->dst.output;
		new_rt->dst.error = rt->dst.error;
		new_rt->dst.lastuse = jiffies;
		new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
	}
	return new_rt;
}
EXPORT_SYMBOL(rt_dst_clone);

/* called in rcu_read_lock() section */
int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev,
			  struct in_device *in_dev, u32 *itag)
{
	int err;

	/* Primary sanity checks. */
	if (!in_dev)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		return -EINVAL;

	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
		return -EINVAL;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr) &&
		    ip_hdr(skb)->protocol != IPPROTO_IGMP)
			return -EINVAL;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, itag);
		if (err < 0)
			return err;
	}
	return 0;
}

/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	unsigned int flags = RTCF_MULTICAST;
	struct rtable *rth;
	u32 itag = 0;
	int err;

	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
	if (err)
		return err;

	if (our)
		flags |= RTCF_LOCAL;

	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		return -ENOBUFS;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;
	rth->rt_is_input = 1;

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	skb_dst_set(skb, &rth->dst);
	return 0;
}


static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 * RFC1812 recommendation: if the source is martian,
		 * the only hint we can give is the MAC header.
		 */
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, false);
		}
	}
#endif
}

/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos)
{
	struct fib_nh_common *nhc = FIB_RES_NHC(*res);
	struct net_device *dev = nhc->nhc_dev;
	struct fib_nh_exception *fnhe;
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	bool do_cache;
	u32 itag = 0;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(dev);
	if (!out_dev) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	do_cache = res->fi && !itag;
	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
	    skb->protocol == htons(ETH_P_IP)) {
		__be32 gw;

		gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
		if (IN_DEV_SHARED_MEDIA(out_dev) ||
		    inet_addr_onlink(out_dev, saddr, gw))
			IPCB(skb)->flags |= IPSKB_DOREDIRECT;
	}

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route if it is
		 * invalid for proxy arp.  DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back out the same interface, to support
		 * Private VLAN switch technologies.  See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	fnhe = find_exception(nhc, daddr);
	if (do_cache) {
		if (fnhe)
			rth = rcu_dereference(fnhe->fnhe_rth_input);
		else
			rth = rcu_dereference(nhc->nhc_rth_input);
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			goto out;
		}
	}

	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM));
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_is_input = 1;
	RT_CACHE_STAT_INC(in_slow_tot);

	rth->dst.input = ip_forward;

	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
		       do_cache);
	lwtunnel_set_redirect(&rth->dst);
	skb_dst_set(skb, &rth->dst);
out:
	err = 0;
cleanup:
	return err;
}

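/*
 * Note on the fast path above: when do_cache holds, __mkroute_input()
 * first tries a per-destination exception dst (fnhe_rth_input, created by
 * PMTU updates or redirects) and then the per-nexthop cache
 * (nhc_rth_input); only on a miss does it allocate a fresh rtable, which
 * rt_set_nexthop() may cache for subsequent packets.
 */
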
#ifdef CONFIG_IP_ROUTE_MULTIPATH
/* To make ICMP packets follow the right flow, the multipath hash is
 * calculated from the inner IP addresses.
 */
static void ip_multipath_l3_keys(const struct sk_buff *skb,
				 struct flow_keys *hash_keys)
{
	const struct iphdr *outer_iph = ip_hdr(skb);
	const struct iphdr *key_iph = outer_iph;
	const struct iphdr *inner_iph;
	const struct icmphdr *icmph;
	struct iphdr _inner_iph;
	struct icmphdr _icmph;

	if (likely(outer_iph->protocol != IPPROTO_ICMP))
		goto out;

	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
		goto out;

	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
				   &_icmph);
	if (!icmph)
		goto out;

	if (!icmp_is_err(icmph->type))
		goto out;

	inner_iph = skb_header_pointer(skb,
				       outer_iph->ihl * 4 + sizeof(_icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
out:
	hash_keys->addrs.v4addrs.src = key_iph->saddr;
	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
}

/* if skb is set it will be used and fl4 can be NULL */
int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
	struct flow_keys hash_keys;
	u32 mhash;

	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
		if (skb) {
			ip_multipath_l3_keys(skb, &hash_keys);
		} else {
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
		}
		break;
	case 1:
		/* skb is currently provided only when forwarding */
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}

			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
			hash_keys.ports.src = fl4->fl4_sport;
			hash_keys.ports.dst = fl4->fl4_dport;
			hash_keys.basic.ip_proto = fl4->flowi4_proto;
		}
		break;
	case 2:
		memset(&hash_keys, 0, sizeof(hash_keys));
		/* skb is currently provided only when forwarding */
		if (skb) {
			struct flow_keys keys;

			skb_flow_dissect_flow_keys(skb, &keys, 0);
			/* Inner can be v4 or v6 */
			if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
				hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
				hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
			} else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
				hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
				hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
				hash_keys.tags.flow_label = keys.tags.flow_label;
				hash_keys.basic.ip_proto = keys.basic.ip_proto;
			} else {
				/* Same as case 0 */
				hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
				ip_multipath_l3_keys(skb, &hash_keys);
			}
		} else {
			/* Same as case 0 */
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	if (multipath_hash)
		mhash = jhash_2words(mhash, multipath_hash, 0);

	return mhash >> 1;
}
#endif /* CONFIG_IP_ROUTE_MULTIPATH */
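
/*
 * Illustrative notes on fib_multipath_hash().  The sysctl semantics below
 * are as documented for net.ipv4.fib_multipath_hash_policy, which selects
 * the switch case above:
 *   0 - L3 source/destination addresses (the inner ones for ICMP errors),
 *   1 - L4 five-tuple,
 *   2 - L3 of the inner packet for encapsulated traffic.
 * A sketch of nexthop selection with the resulting hash, as done by
 * ip_mkroute_input() below:
 *
 *	int h = fib_multipath_hash(net, NULL, skb, NULL);
 *
 *	fib_select_multipath(res, h);
 */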

static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos,
			    struct flow_keys *hkeys)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && fib_info_num_path(res->fi) > 1) {
		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);

		fib_select_multipath(res, h);
	}
#endif

	/* create a routing cache entry */
	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}

/* Implements all the saddr-related checks as ip_route_input_slow(),
 * assuming daddr is valid and the destination is not a local broadcast one.
 * Uses the provided hint instead of performing a route lookup.
 */
int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
		      u8 tos, struct net_device *dev,
		      const struct sk_buff *hint)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct rtable *rt = (struct rtable *)hint;
	struct net *net = dev_net(dev);
	int err = -EINVAL;
	u32 tag = 0;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
		goto martian_source;

	if (rt->rt_type != RTN_LOCAL)
		goto skip_validate_source;

	tos &= IPTOS_RT_MASK;
	err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
	if (err < 0)
		goto martian_source;

skip_validate_source:
	skb_dst_copy(skb, hint);
	return 0;

martian_source:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	return err;
}

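/*
 * Illustrative sketch (the caller shown is an assumption, not taken from
 * this file): a batched receive loop that has already routed one packet
 * can reuse its dst for the next packet to the same daddr, falling back
 * to a full lookup otherwise:
 *
 *	if (hint && daddr == prev_daddr)
 *		err = ip_route_use_hint(skb, daddr, saddr, tos, dev, hint);
 *	else
 *		err = ip_route_input_noref(skb, daddr, saddr, tos, dev);
 */
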
/*
 * NOTE. We drop all packets that have a local source address, because
 * every properly looped-back packet must have the correct destination
 * already attached by the output routine.  Changes in the enforced
 * policies must also be applied to ip_route_use_hint().
 *
 * Such an approach solves two big problems:
 * 1. Non-simplex devices are handled properly.
 * 2. IP spoofing attempts are filtered with 100% guarantee.
 *
 * called with rcu_read_lock()
 */

static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev,
			       struct fib_result *res)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flow_keys *flkeys = NULL, _flkeys;
	struct net *net = dev_net(dev);
	struct ip_tunnel_info *tun_info;
	int err = -EINVAL;
	unsigned int flags = 0;
	u32 itag = 0;
	struct rtable *rth;
	struct flowi4 fl4;
	bool do_cache = true;

	/* IP on this device is disabled. */

	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which cannot be detected
	 * by fib_lookup.
	 */

	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
	else
		fl4.flowi4_tun_key.tun_id = 0;
	skb_dst_drop(skb);

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		goto martian_source;

	res->fi = NULL;
	res->table = NULL;
	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I am not even sure whether this should be fixed.
	 * Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr))
		goto martian_destination;

	/* The following code tries to avoid calling
	 * IN_DEV_NET_ROUTE_LOCALNET(), calling it at most once when daddr
	 * and/or saddr are loopback addresses.
	 */
	if (ipv4_is_loopback(daddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_destination;
	} else if (ipv4_is_loopback(saddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_source;
	}

	/*
	 * Now we are ready to route the packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.flowi4_flags = 0;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	fl4.flowi4_uid = sock_net_uid(net, NULL);

	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
		flkeys = &_flkeys;
	} else {
		fl4.flowi4_proto = 0;
		fl4.fl4_sport = 0;
		fl4.fl4_dport = 0;
	}

	err = fib_lookup(net, &fl4, res, 0);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			err = -EHOSTUNREACH;
		goto no_route;
	}

	if (res->type == RTN_BROADCAST) {
		if (IN_DEV_BFORWARD(in_dev))
			goto make_route;
		/* do not cache if bc_forwarding is enabled */
		if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
			do_cache = false;
		goto brd_input;
	}

	if (res->type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  0, dev, in_dev, &itag);
		if (err < 0)
			goto martian_source;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev)) {
		err = -EHOSTUNREACH;
		goto no_route;
	}
	if (res->type != RTN_UNICAST)
		goto martian_destination;

make_route:
	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (!ipv4_is_zeronet(saddr)) {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto martian_source;
	}
	flags |= RTCF_BROADCAST;
	res->type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	do_cache &= res->fi && !itag;
	if (do_cache) {
		struct fib_nh_common *nhc = FIB_RES_NHC(*res);

		rth = rcu_dereference(nhc->nhc_rth_input);
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			err = 0;
			goto out;
		}
	}

	rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
			   flags | RTCF_LOCAL, res->type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false);
	if (!rth)
		goto e_nobufs;

	rth->dst.output = ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_is_input = 1;

	RT_CACHE_STAT_INC(in_slow_tot);
	if (res->type == RTN_UNREACHABLE) {
		rth->dst.input = ip_error;
		rth->dst.error = -err;
		rth->rt_flags &= ~RTCF_LOCAL;
	}

	if (do_cache) {
		struct fib_nh_common *nhc = FIB_RES_NHC(*res);

		rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
			WARN_ON(rth->dst.input == lwtunnel_input);
			rth->dst.lwtstate->orig_input = rth->dst.input;
			rth->dst.input = lwtunnel_input;
		}

		if (unlikely(!rt_cache_route(nhc, rth)))
			rt_add_uncached_list(rth);
	}
	skb_dst_set(skb, &rth->dst);
	err = 0;
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	res->type = RTN_UNREACHABLE;
	res->fi = NULL;
	res->table = NULL;
	goto local_input;

	/*
	 * Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
				     &daddr, &saddr, dev->name);
#endif

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}

int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			 u8 tos, struct net_device *dev)
{
	struct fib_result res;
	int err;

	tos &= IPTOS_RT_MASK;
	rcu_read_lock();
	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
	rcu_read_unlock();

	return err;
}
EXPORT_SYMBOL(ip_route_input_noref);

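/*
 * Illustrative sketch (not from this file): the canonical input-path
 * caller pattern; on success the skb carries its dst and can be handed
 * to dst_input():
 *
 *	err = ip_route_input_noref(skb, ip_hdr(skb)->daddr,
 *				   ip_hdr(skb)->saddr,
 *				   ip_hdr(skb)->tos, skb->dev);
 *	if (unlikely(err))
 *		goto drop;
 *	return dst_input(skb);
 */
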
/* called with rcu_read_lock held */
int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
		       u8 tos, struct net_device *dev, struct fib_result *res)
{
	/* Multicast recognition logic is moved from the route cache to here.
	 * The problem was that too many Ethernet cards have broken/missing
	 * hardware multicast filters :-(. As a result a host on a
	 * multicasting network acquires a lot of useless route cache
	 * entries, sort of SDR messages from all the world.  Now we try
	 * to get rid of them.  Really, provided the software IP multicast
	 * filter is organized reasonably (at least, hashed), it does not
	 * result in a slowdown compared with route cache reject entries.
	 * Note that multicast routers are not affected, because a route
	 * cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);
		int our = 0;
		int err = -EINVAL;

		if (!in_dev)
			return err;
		our = ip_check_mc_rcu(in_dev, daddr, saddr,
				      ip_hdr(skb)->protocol);

		/* check l3 master if no match yet */
		if (!our && netif_is_l3_slave(dev)) {
			struct in_device *l3_in_dev;

			l3_in_dev = __in_dev_get_rcu(skb->dev);
			if (l3_in_dev)
				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
						      ip_hdr(skb)->protocol);
		}

		if (our
#ifdef CONFIG_IP_MROUTE
		    ||
		    (!ipv4_is_local_multicast(daddr) &&
		     IN_DEV_MFORWARD(in_dev))
#endif
		   ) {
			err = ip_route_input_mc(skb, daddr, saddr,
						tos, dev, our);
		}
		return err;
	}

	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
}

/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4, int orig_oif,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct fib_nh_exception *fnhe;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;
	bool do_cache;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(fl4->saddr) &&
		    !(dev_out->flags & IFF_LOOPBACK) &&
		    !netif_is_l3_master(dev_out))
			return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	do_cache = true;
	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		else
			do_cache = false;
		/* If a multicast route does not exist, use the
		 * default one, but do not gateway in this case.
		 * Yes, it is a hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
		   (orig_oif != dev_out->ifindex)) {
		/* For local routes that require a particular output interface
		 * we do not want to cache the result.  Caching the result
		 * causes incorrect behaviour when there are multiple source
		 * addresses on the interface, the end result being that if the
		 * intended recipient is waiting on that interface for the
		 * packet he won't receive it because it will be delivered on
		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
		 * be set to the loopback interface as well.
		 */
		do_cache = false;
	}

	fnhe = NULL;
	do_cache &= fi != NULL;
	if (fi) {
		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
		struct rtable __rcu **prth;

		fnhe = find_exception(nhc, fl4->daddr);
		if (!do_cache)
			goto add;
		if (fnhe) {
			prth = &fnhe->fnhe_rth_output;
		} else {
			if (unlikely(fl4->flowi4_flags &
				     FLOWI_FLAG_KNOWN_NH &&
				     !(nhc->nhc_gw_family &&
				       nhc->nhc_scope == RT_SCOPE_LINK))) {
				do_cache = false;
				goto add;
			}
			prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
		}
		rth = rcu_dereference(*prth);
		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
			return rth;
	}

add:
	rth = rt_dst_alloc(dev_out, flags, type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM));
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->rt_iif = orig_oif;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
	lwtunnel_set_redirect(&rth->dst);

	return rth;
}

/*
 * Major route resolver routine.
 */

struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
					const struct sk_buff *skb)
{
	__u8 tos = RT_FL_TOS(fl4);
	struct fib_result res = {
		.type		= RTN_UNSPEC,
		.fi		= NULL,
		.table		= NULL,
		.tclassid	= 0,
	};
	struct rtable *rth;

	fl4->flowi4_iif = LOOPBACK_IFINDEX;
	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
			     RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);

	rcu_read_lock();
	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
	rcu_read_unlock();

	return rth;
}
EXPORT_SYMBOL_GPL(ip_route_output_key_hash);

struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
					    struct fib_result *res,
					    const struct sk_buff *skb)
{
	struct net_device *dev_out = NULL;
	int orig_oif = fl4->flowi4_oif;
	unsigned int flags = 0;
	struct rtable *rth;
	int err;

	if (fl4->saddr) {
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr)) {
			rth = ERR_PTR(-EINVAL);
			goto out;
		}

		rth = ERR_PTR(-ENETUNREACH);

		/* I removed the check for oif == dev_out->oif here.
		 * It was wrong for two reasons:
		 * 1. ip_dev_find(net, saddr) can return the wrong iface,
		 *    if saddr is assigned to multiple interfaces.
		 * 2. Moreover, we are allowed to send packets with saddr
		 *    of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (!dev_out)
				goto out;

			/* Special hack: user can direct multicasts
			 * and limited broadcast via the necessary interface
			 * without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			 * This hack is not just for fun, it allows
			 * vic, vat and friends to work.
			 * They bind a socket to loopback, set ttl to zero
			 * and expect that it will work.
			 * From the viewpoint of the routing cache they are
			 * broken, because we are not allowed to build a
			 * multicast path with a loopback source addr (look,
			 * the routing cache cannot know that ttl is zero, so
			 * that the packet will not leave this host and the
			 * route is valid).
			 * Luckily, this hack is a good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}

	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (!dev_out)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr) ||
		    fl4->flowi4_proto == IPPROTO_IGMP) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl4->daddr) {
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = LOOPBACK_IFINDEX;
		res->type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	err = fib_lookup(net, fl4, res, 0);
	if (err) {
		res->fi = NULL;
		res->table = NULL;
		if (fl4->flowi4_oif &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
			/* Apparently, routing tables are wrong.  Assume
			 * that the destination is on-link.
			 *
			 * WHY? DW.
			 * Because we are allowed to send to an iface
			 * even if it has NO routes and NO assigned
			 * addresses.  When oif is specified, routing
			 * tables are looked up with only one purpose:
			 * to catch if the destination is gatewayed, rather
			 * than direct.  Moreover, if MSG_DONTROUTE is set,
			 * we send the packet, ignoring both routing tables
			 * and ifaddr state. --ANK
			 *
			 * We could make it even if oif is unknown,
			 * likely IPv6, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res->type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(err);
		goto out;
	}

	if (res->type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res->fi->fib_prefsrc)
				fl4->saddr = res->fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}

		/* L3 master device is the loopback for that domain */
		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
			net->loopback_dev;

		/* make sure orig_oif points to the fib result device even
		 * though packet rx/tx happens over loopback or l3mdev
		 */
		orig_oif = FIB_RES_OIF(*res);

		fl4->flowi4_oif = dev_out->ifindex;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	fib_select_path(net, res, fl4, skb);

	dev_out = FIB_RES_DEV(*res);
	fl4->flowi4_oif = dev_out->ifindex;

make_route:
	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);

out:
	return rth;
}

static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					  struct sk_buff *skb, u32 mtu,
					  bool confirm_neigh)
{
}

static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				       struct sk_buff *skb)
{
}

static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			= AF_INET,
	.check			= ipv4_blackhole_dst_check,
	.mtu			= ipv4_blackhole_mtu,
	.default_advmss		= ipv4_default_advmss,
	.update_pmtu		= ipv4_rt_blackhole_update_pmtu,
	.redirect		= ipv4_rt_blackhole_redirect,
	.cow_metrics		= ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		= ipv4_neigh_lookup,
};

struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *ort = (struct rtable *) dst_orig;
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		new->dev = net->loopback_dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->rt_is_input = ort->rt_is_input;
		rt->rt_iif = ort->rt_iif;
		rt->rt_pmtu = ort->rt_pmtu;
		rt->rt_mtu_locked = ort->rt_mtu_locked;

		rt->rt_genid = rt_genid_ipv4(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_uses_gateway = ort->rt_uses_gateway;
		rt->rt_gw_family = ort->rt_gw_family;
		if (rt->rt_gw_family == AF_INET)
			rt->rt_gw4 = ort->rt_gw4;
		else if (rt->rt_gw_family == AF_INET6)
			rt->rt_gw6 = ort->rt_gw6;

		INIT_LIST_HEAD(&rt->rt_uncached);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}

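/*
 * Illustrative sketch (an assumption about the caller, not taken from
 * this file): code holding a dst that must stop transmitting, yet stay
 * safe to dereference, can swap it for a blackhole copy whose input and
 * output handlers silently discard:
 *
 *	dst = ipv4_blackhole_route(net, dst);
 *	if (IS_ERR(dst))
 *		return PTR_ERR(dst);
 */
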
struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
				    const struct sock *sk)
{
	struct rtable *rt = __ip_route_output_key(net, flp4);

	if (IS_ERR(rt))
		return rt;

	if (flp4->flowi4_proto)
		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
							flowi4_to_flowi(flp4),
							sk, 0);

	return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);

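/*
 * Illustrative sketch (not from this file; variable values are
 * hypothetical): a typical output lookup for a UDP flow, mirroring the
 * flowi4 setup used by ip_route_output_tunnel() below:
 *
 *	struct flowi4 fl4;
 *	struct rtable *rt;
 *
 *	memset(&fl4, 0, sizeof(fl4));
 *	fl4.daddr = daddr;
 *	fl4.flowi4_proto = IPPROTO_UDP;
 *	fl4.fl4_dport = dport;
 *	rt = ip_route_output_flow(net, &fl4, sk);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 */
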
struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
				      struct net_device *dev,
				      struct net *net, __be32 *saddr,
				      const struct ip_tunnel_info *info,
				      u8 protocol, bool use_cache)
{
#ifdef CONFIG_DST_CACHE
	struct dst_cache *dst_cache;
#endif
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__u8 tos;

#ifdef CONFIG_DST_CACHE
	dst_cache = (struct dst_cache *)&info->dst_cache;
	if (use_cache) {
		rt = dst_cache_get_ip4(dst_cache, saddr);
		if (rt)
			return rt;
	}
#endif
	memset(&fl4, 0, sizeof(fl4));
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_proto = protocol;
	fl4.daddr = info->key.u.ipv4.dst;
	fl4.saddr = info->key.u.ipv4.src;
	tos = info->key.tos;
	fl4.flowi4_tos = RT_TOS(tos);

	rt = ip_route_output_key(net, &fl4);
	if (IS_ERR(rt)) {
		netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
		return ERR_PTR(-ENETUNREACH);
	}
	if (rt->dst.dev == dev) { /* is this necessary? */
		netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
		ip_rt_put(rt);
		return ERR_PTR(-ELOOP);
	}
#ifdef CONFIG_DST_CACHE
	if (use_cache)
		dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
#endif
	*saddr = fl4.saddr;
	return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_tunnel);

/* called with rcu_read_lock held */
static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
			struct rtable *rt, u32 table_id, struct flowi4 *fl4,
			struct sk_buff *skb, u32 portid, u32 seq,
			unsigned int flags)
{
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	u32 error;
	u32 metrics[RTAX_MAX];

	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
	if (!nlh)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= fl4 ? fl4->flowi4_tos : 0;
	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table_id))
		goto nla_put_failure;
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;
	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
		r->rtm_flags |= RTCF_DOREDIRECT;

	if (nla_put_in_addr(skb, RTA_DST, dst))
		goto nla_put_failure;
	if (src) {
		r->rtm_src_len = 32;
		if (nla_put_in_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
	}
	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid &&
	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
		goto nla_put_failure;
#endif
	if (fl4 && !rt_is_input_route(rt) &&
	    fl4->saddr != src) {
		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
			goto nla_put_failure;
	}
	if (rt->rt_uses_gateway) {
		if (rt->rt_gw_family == AF_INET &&
		    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
			goto nla_put_failure;
		} else if (rt->rt_gw_family == AF_INET6) {
			int alen = sizeof(struct in6_addr);
			struct nlattr *nla;
			struct rtvia *via;

			nla = nla_reserve(skb, RTA_VIA, alen + 2);
			if (!nla)
				goto nla_put_failure;

			via = nla_data(nla);
			via->rtvia_family = AF_INET6;
			memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
		}
	}

	expires = rt->dst.expires;
	if (expires) {
		unsigned long now = jiffies;

		if (time_before(now, expires))
			expires -= now;
		else
			expires = 0;
	}

	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt_pmtu && expires)
		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
	if (rt->rt_mtu_locked && expires)
		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (fl4) {
		if (fl4->flowi4_mark &&
		    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
			goto nla_put_failure;

		if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
		    nla_put_u32(skb, RTA_UID,
				from_kuid_munged(current_user_ns(),
						 fl4->flowi4_uid)))
			goto nla_put_failure;

		if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
			if (ipv4_is_multicast(dst) &&
			    !ipv4_is_local_multicast(dst) &&
			    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
				int err = ipmr_get_route(net, skb,
							 fl4->saddr, fl4->daddr,
							 r, portid);

				if (err <= 0) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				}
			} else
#endif
				if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
					goto nla_put_failure;
		}
	}

	error = rt->dst.error;

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}

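/*
 * The RTM_NEWROUTE message assembled above is what userspace receives in
 * reply to an RTM_GETROUTE request, e.g. (illustrative):
 *
 *	$ ip route get 192.0.2.1
 *
 * The request itself is validated and handled by
 * inet_rtm_valid_getroute_req() and inet_rtm_getroute() below.
 */
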
static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
			    struct netlink_callback *cb, u32 table_id,
			    struct fnhe_hash_bucket *bucket, int genid,
			    int *fa_index, int fa_start, unsigned int flags)
{
	int i;

	for (i = 0; i < FNHE_HASH_SIZE; i++) {
		struct fib_nh_exception *fnhe;

		for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
		     fnhe = rcu_dereference(fnhe->fnhe_next)) {
			struct rtable *rt;
			int err;

			if (*fa_index < fa_start)
				goto next;

			if (fnhe->fnhe_genid != genid)
				goto next;

			if (fnhe->fnhe_expires &&
			    time_after(jiffies, fnhe->fnhe_expires))
				goto next;

			rt = rcu_dereference(fnhe->fnhe_rth_input);
			if (!rt)
				rt = rcu_dereference(fnhe->fnhe_rth_output);
			if (!rt)
				goto next;

			err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
					   table_id, NULL, skb,
					   NETLINK_CB(cb->skb).portid,
					   cb->nlh->nlmsg_seq, flags);
			if (err)
				return err;
next:
			(*fa_index)++;
		}
	}

	return 0;
}

int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
		       u32 table_id, struct fib_info *fi,
		       int *fa_index, int fa_start, unsigned int flags)
{
	struct net *net = sock_net(cb->skb->sk);
	int nhsel, genid = fnhe_genid(net);

	for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
		struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
		struct fnhe_hash_bucket *bucket;
		int err;

		if (nhc->nhc_flags & RTNH_F_DEAD)
			continue;

		rcu_read_lock();
		bucket = rcu_dereference(nhc->nhc_exceptions);
		err = 0;
		if (bucket)
			err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
					       genid, fa_index, fa_start,
					       flags);
		rcu_read_unlock();
		if (err)
			return err;
	}

	return 0;
}

static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
						   u8 ip_proto, __be16 sport,
						   __be16 dport)
{
	struct sk_buff *skb;
	struct iphdr *iph;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return NULL;

	/* Reserve room for dummy headers; this skb can pass
	 * through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);
	skb->protocol = htons(ETH_P_IP);
	iph = skb_put(skb, sizeof(struct iphdr));
	iph->protocol = ip_proto;
	iph->saddr = src;
	iph->daddr = dst;
	iph->version = 0x4;
	iph->frag_off = 0;
	iph->ihl = 0x5;
	skb_set_transport_header(skb, skb->len);

	switch (iph->protocol) {
	case IPPROTO_UDP: {
		struct udphdr *udph;

		udph = skb_put_zero(skb, sizeof(struct udphdr));
		udph->source = sport;
		udph->dest = dport;
		udph->len = sizeof(struct udphdr);
		udph->check = 0;
		break;
	}
	case IPPROTO_TCP: {
		struct tcphdr *tcph;

		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
		tcph->source	= sport;
		tcph->dest	= dport;
		tcph->doff	= sizeof(struct tcphdr) / 4;
		tcph->rst = 1;
		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
					    src, dst, 0);
		break;
	}
	case IPPROTO_ICMP: {
		struct icmphdr *icmph;

		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
		icmph->type = ICMP_ECHO;
		icmph->code = 0;
	}
	}

	return skb;
}

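/*
 * Note (an inference from the request attributes handled below): the
 * dummy L3/L4 headers built above make the lookup behave as if a real
 * packet had arrived, so fib rules and the multipath hash can match on
 * the RTA_IP_PROTO, RTA_SPORT and RTA_DPORT values supplied by userspace.
 */
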
static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
				       const struct nlmsghdr *nlh,
				       struct nlattr **tb,
				       struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	int i, err;

	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
		NL_SET_ERR_MSG(extack,
			       "ipv4: Invalid header for route get request");
		return -EINVAL;
	}

	if (!netlink_strict_get_check(skb))
		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
					      rtm_ipv4_policy, extack);

	rtm = nlmsg_data(nlh);
	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
	    rtm->rtm_table || rtm->rtm_protocol ||
	    rtm->rtm_scope || rtm->rtm_type) {
		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
		return -EINVAL;
	}

	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
			       RTM_F_LOOKUP_TABLE |
			       RTM_F_FIB_MATCH)) {
		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
		return -EINVAL;
	}

	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
					    rtm_ipv4_policy, extack);
	if (err)
		return err;

	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
		return -EINVAL;
	}

	for (i = 0; i <= RTA_MAX; i++) {
		if (!tb[i])
			continue;

		switch (i) {
		case RTA_IIF:
		case RTA_OIF:
		case RTA_SRC:
		case RTA_DST:
		case RTA_IP_PROTO:
		case RTA_SPORT:
		case RTA_DPORT:
		case RTA_MARK:
		case RTA_UID:
			break;
		default:
			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
			return -EINVAL;
		}
	}

	return 0;
}

static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			     struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	u32 table_id = RT_TABLE_MAIN;
	__be16 sport = 0, dport = 0;
	struct fib_result res = {};
	u8 ip_proto = IPPROTO_UDP;
	struct rtable *rt = NULL;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi4 fl4 = {};
	__be32 dst = 0;
	__be32 src = 0;
	kuid_t uid;
	u32 iif;
	int err;
	int mark;

	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		return err;

	rtm = nlmsg_data(nlh);
	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
	if (tb[RTA_UID])
		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
	else
		uid = (iif ? INVALID_UID : current_uid());

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &ip_proto, AF_INET, extack);
		if (err)
			return err;
	}

	if (tb[RTA_SPORT])
		sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		dport = nla_get_be16(tb[RTA_DPORT]);

	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
	if (!skb)
		return -ENOBUFS;

	fl4.daddr = dst;
	fl4.saddr = src;
	fl4.flowi4_tos = rtm->rtm_tos;
	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
	fl4.flowi4_mark = mark;
	fl4.flowi4_uid = uid;
	if (sport)
		fl4.fl4_sport = sport;
	if (dport)
		fl4.fl4_dport = dport;
	fl4.flowi4_proto = ip_proto;

	rcu_read_lock();

	if (iif) {
		struct net_device *dev;

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			err = -ENODEV;
			goto errout_rcu;
		}

		fl4.flowi4_iif = iif; /* for rt_fill_info */
		skb->dev = dev;
		skb->mark = mark;
		err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
					 dev, &res);

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		fl4.flowi4_iif = LOOPBACK_IFINDEX;
		skb->dev = net->loopback_dev;
		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
		else
			skb_dst_set(skb, &rt->dst);
	}

	if (err)
		goto errout_rcu;

	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
		table_id = res.table ? res.table->tb_id : 0;

	/* reset skb for netlink reply msg */
	skb_trim(skb, 0);
	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb_reset_mac_header(skb);

	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
		struct fib_rt_info fri;

		if (!res.fi) {
			err = fib_props[res.type].error;
			if (!err)
				err = -EHOSTUNREACH;
			goto errout_rcu;
		}
		fri.fi = res.fi;
		fri.tb_id = table_id;
		fri.dst = res.prefix;
		fri.dst_len = res.prefixlen;
		fri.tos = fl4.flowi4_tos;
		fri.type = rt->rt_type;
		fri.offload = 0;
		fri.trap = 0;
		if (res.fa_head) {
			struct fib_alias *fa;

			hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
				u8 slen = 32 - fri.dst_len;

				if (fa->fa_slen == slen &&
				    fa->tb_id == fri.tb_id &&
				    fa->fa_tos == fri.tos &&
				    fa->fa_info == res.fi &&
				    fa->fa_type == fri.type) {
					fri.offload = fa->offload;
					fri.trap = fa->trap;
					break;
				}
			}
		}
		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
	} else {
		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
				   NETLINK_CB(in_skb).portid,
				   nlh->nlmsg_seq, 0);
	}
	if (err < 0)
		goto errout_rcu;

	rcu_read_unlock();

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);

errout_free:
	return err;
errout_rcu:
	rcu_read_unlock();
	kfree_skb(skb);
	goto errout_free;
}

void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev));
}

3332#ifdef CONFIG_SYSCTL
082c7ca4
G
3333static int ip_rt_gc_interval __read_mostly = 60 * HZ;
3334static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
3335static int ip_rt_gc_elasticity __read_mostly = 8;
773daa3c 3336static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
082c7ca4 3337
fe2c6338 3338static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
8d65af78 3339 void __user *buffer,
1da177e4
LT
3340 size_t *lenp, loff_t *ppos)
3341{
5aad1de5
TT
3342 struct net *net = (struct net *)__ctl->extra1;
3343
1da177e4 3344 if (write) {
5aad1de5
TT
3345 rt_cache_flush(net);
3346 fnhe_genid_bump(net);
1da177e4 3347 return 0;
e905a9ed 3348 }
1da177e4
LT
3349
3350 return -EINVAL;
3351}
3352
fe2c6338 3353static struct ctl_table ipv4_route_table[] = {
1da177e4 3354 {
1da177e4
LT
3355 .procname = "gc_thresh",
3356 .data = &ipv4_dst_ops.gc_thresh,
3357 .maxlen = sizeof(int),
3358 .mode = 0644,
6d9f239a 3359 .proc_handler = proc_dointvec,
1da177e4
LT
3360 },
3361 {
1da177e4
LT
3362 .procname = "max_size",
3363 .data = &ip_rt_max_size,
3364 .maxlen = sizeof(int),
3365 .mode = 0644,
6d9f239a 3366 .proc_handler = proc_dointvec,
1da177e4
LT
3367 },
3368 {
3369 /* Deprecated. Use gc_min_interval_ms */
e905a9ed 3370
1da177e4
LT
3371 .procname = "gc_min_interval",
3372 .data = &ip_rt_gc_min_interval,
3373 .maxlen = sizeof(int),
3374 .mode = 0644,
6d9f239a 3375 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
3376 },
3377 {
1da177e4
LT
3378 .procname = "gc_min_interval_ms",
3379 .data = &ip_rt_gc_min_interval,
3380 .maxlen = sizeof(int),
3381 .mode = 0644,
6d9f239a 3382 .proc_handler = proc_dointvec_ms_jiffies,
1da177e4
LT
3383 },
3384 {
1da177e4
LT
3385 .procname = "gc_timeout",
3386 .data = &ip_rt_gc_timeout,
3387 .maxlen = sizeof(int),
3388 .mode = 0644,
6d9f239a 3389 .proc_handler = proc_dointvec_jiffies,
1da177e4 3390 },
9f28a2fc
ED
3391 {
3392 .procname = "gc_interval",
3393 .data = &ip_rt_gc_interval,
3394 .maxlen = sizeof(int),
3395 .mode = 0644,
3396 .proc_handler = proc_dointvec_jiffies,
3397 },
1da177e4 3398 {
1da177e4
LT
3399 .procname = "redirect_load",
3400 .data = &ip_rt_redirect_load,
3401 .maxlen = sizeof(int),
3402 .mode = 0644,
6d9f239a 3403 .proc_handler = proc_dointvec,
1da177e4
LT
3404 },
3405 {
1da177e4
LT
3406 .procname = "redirect_number",
3407 .data = &ip_rt_redirect_number,
3408 .maxlen = sizeof(int),
3409 .mode = 0644,
6d9f239a 3410 .proc_handler = proc_dointvec,
1da177e4
LT
3411 },
3412 {
1da177e4
LT
3413 .procname = "redirect_silence",
3414 .data = &ip_rt_redirect_silence,
3415 .maxlen = sizeof(int),
3416 .mode = 0644,
6d9f239a 3417 .proc_handler = proc_dointvec,
1da177e4
LT
3418 },
3419 {
1da177e4
LT
3420 .procname = "error_cost",
3421 .data = &ip_rt_error_cost,
3422 .maxlen = sizeof(int),
3423 .mode = 0644,
6d9f239a 3424 .proc_handler = proc_dointvec,
1da177e4
LT
3425 },
3426 {
1da177e4
LT
3427 .procname = "error_burst",
3428 .data = &ip_rt_error_burst,
3429 .maxlen = sizeof(int),
3430 .mode = 0644,
6d9f239a 3431 .proc_handler = proc_dointvec,
1da177e4
LT
3432 },
3433 {
1da177e4
LT
3434 .procname = "gc_elasticity",
3435 .data = &ip_rt_gc_elasticity,
3436 .maxlen = sizeof(int),
3437 .mode = 0644,
6d9f239a 3438 .proc_handler = proc_dointvec,
1da177e4
LT
3439 },
3440 {
1da177e4
LT
3441 .procname = "mtu_expires",
3442 .data = &ip_rt_mtu_expires,
3443 .maxlen = sizeof(int),
3444 .mode = 0644,
6d9f239a 3445 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
3446 },
3447 {
1da177e4
LT
3448 .procname = "min_pmtu",
3449 .data = &ip_rt_min_pmtu,
3450 .maxlen = sizeof(int),
3451 .mode = 0644,
c7272c2f
SD
3452 .proc_handler = proc_dointvec_minmax,
3453 .extra1 = &ip_min_valid_pmtu,
1da177e4
LT
3454 },
3455 {
1da177e4
LT
3456 .procname = "min_adv_mss",
3457 .data = &ip_rt_min_advmss,
3458 .maxlen = sizeof(int),
3459 .mode = 0644,
6d9f239a 3460 .proc_handler = proc_dointvec,
1da177e4 3461 },
f8572d8f 3462 { }
1da177e4 3463};
39a23e75 3464
5cdda5f1
CB
3465static const char ipv4_route_flush_procname[] = "flush";
3466
39a23e75
DL
3467static struct ctl_table ipv4_route_flush_table[] = {
3468 {
5cdda5f1 3469 .procname = ipv4_route_flush_procname,
39a23e75
DL
3470 .maxlen = sizeof(int),
3471 .mode = 0200,
6d9f239a 3472 .proc_handler = ipv4_sysctl_rtcache_flush,
39a23e75 3473 },
f8572d8f 3474 { },
39a23e75
DL
3475};

static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (!tbl)
			goto err_dup;

		/* Don't export non-whitelisted sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns) {
			if (tbl[0].procname != ipv4_route_flush_procname)
				tbl[0].procname = NULL;
		}
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
	if (!net->ipv4.route_hdr)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init	= sysctl_route_net_init,
	.exit	= sysctl_route_net_exit,
};
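/*
 * The shape of the hooks above: pernet_operations run once per network
 * namespace.  The initial namespace uses the static flush table
 * directly; every other namespace gets a kmemdup()'d copy, because
 * each copy's ->extra1 must point at that namespace's own struct net.
 * The exit hook consequently frees only copies, which is why reaching
 * kfree() with the static table is treated as a bug.
 */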
#endif

static __net_init int rt_genid_init(struct net *net)
{
	atomic_set(&net->ipv4.rt_genid, 0);
	atomic_set(&net->fnhe_genid, 0);
	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
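/*
 * The generation counters initialized above are the cheap cache
 * invalidation mechanism: each cached route records the rt_genid it
 * was created under, and a later mismatch with the current counter
 * marks it stale.  A flush therefore never walks anything; it simply
 * increments the counter.
 */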

static int __net_init ipv4_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv4.peers = bp;
	return 0;
}

static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init	= ipv4_inetpeer_init,
	.exit	= ipv4_inetpeer_exit,
};
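/*
 * The inetpeer base set up above roots a per-namespace tree of
 * long-lived, per-remote-address state (used, among other things, for
 * ICMP rate limiting).  On exit, net->ipv4.peers is cleared before the
 * tree is invalidated and the base freed.
 */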

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
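/*
 * When routes carry realms (CONFIG_IP_ROUTE_CLASSID), ip_rt_acct holds
 * 256 per-cpu accounting slots, one per realm value; they are summed
 * across CPUs when userspace reads /proc/net/rt_acct.
 */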

int __init ip_rt_init(void)
{
	int cpu;

	ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
				  GFP_KERNEL);
	if (!ip_idents)
		panic("IP: failed to allocate ip_idents\n");

	prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));

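	/*
	 * ip_idents/ip_tstamps back the IPv4 ID generator: IDs are
	 * handed out per hash bucket, and the bucket's last-use
	 * timestamp lets the generator add a random stride after idle
	 * periods.  Pre-filling with random bytes keeps the initial
	 * IDs unpredictable.
	 */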
	ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
	if (!ip_tstamps)
		panic("IP: failed to allocate ip_tstamps\n");

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}
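	/*
	 * The lists initialized above hold "uncached" routes, ones not
	 * reachable through the FIB, so they can still be found and
	 * flushed when their output device disappears.  Per-cpu list
	 * heads and locks avoid a global contention point.
	 */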
#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct),
				    __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

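	/*
	 * The blackhole ops share the rtable slab on purpose: blackhole
	 * routes are built as copies of an existing rtable (see
	 * ipv4_blackhole_route()), so both come from objects of the
	 * same size and cache.
	 */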
	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;

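	/*
	 * With the old routing cache gone, garbage collection is
	 * effectively disabled: the GC threshold is unreachable and the
	 * size cap is INT_MAX.  dst entries are reclaimed by refcount
	 * and invalidated by generation-id checks instead.
	 */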
	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
		      RTNL_FLAG_DOIT_UNLOCKED);

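	/*
	 * RTM_GETROUTE (what "ip route get" issues) is safe without the
	 * RTNL mutex, hence RTNL_FLAG_DOIT_UNLOCKED.  No dump handler
	 * is passed here; route dumps are registered by the FIB code
	 * set up in ip_fib_init().
	 */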
#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return 0;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif