]> git.ipfire.org Git - thirdparty/kernel/stable.git/blame - net/ipv4/route.c
net: remove dst gc related code
[thirdparty/kernel/stable.git] / net / ipv4 / route.c
CommitLineData
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
02c30a84 8 * Authors: Ross Biro
1da177e4
LT
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
e905a9ed 21 * Alan Cox : Super /proc >4K
1da177e4
LT
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
e905a9ed 39 *
1da177e4
LT
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
bb1d23b0 55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
cef2685e
IS
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
1da177e4
LT
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
afd46503
JP
65#define pr_fmt(fmt) "IPv4: " fmt
66
1da177e4 67#include <linux/module.h>
7c0f6ba6 68#include <linux/uaccess.h>
1da177e4
LT
69#include <linux/bitops.h>
70#include <linux/types.h>
71#include <linux/kernel.h>
1da177e4
LT
72#include <linux/mm.h>
73#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
82#include <linux/skbuff.h>
1da177e4
LT
83#include <linux/inetdevice.h>
84#include <linux/igmp.h>
85#include <linux/pkt_sched.h>
86#include <linux/mroute.h>
87#include <linux/netfilter_ipv4.h>
88#include <linux/random.h>
1da177e4
LT
89#include <linux/rcupdate.h>
90#include <linux/times.h>
5a0e3ad6 91#include <linux/slab.h>
73f156a6 92#include <linux/jhash.h>
352e512c 93#include <net/dst.h>
1b7179d3 94#include <net/dst_metadata.h>
457c4cbc 95#include <net/net_namespace.h>
1da177e4
LT
96#include <net/protocol.h>
97#include <net/ip.h>
98#include <net/route.h>
99#include <net/inetpeer.h>
100#include <net/sock.h>
101#include <net/ip_fib.h>
102#include <net/arp.h>
103#include <net/tcp.h>
104#include <net/icmp.h>
105#include <net/xfrm.h>
571e7226 106#include <net/lwtunnel.h>
8d71740c 107#include <net/netevent.h>
63f3444f 108#include <net/rtnetlink.h>
1da177e4
LT
109#ifdef CONFIG_SYSCTL
110#include <linux/sysctl.h>
7426a564 111#include <linux/kmemleak.h>
1da177e4 112#endif
6e5714ea 113#include <net/secure_seq.h>
1b7179d3 114#include <net/ip_tunnels.h>
385add90 115#include <net/l3mdev.h>
1da177e4 116
b6179813
RP
117#include "fib_lookup.h"
118
/* Mask a flow's TOS down to the route-relevant bits plus RTO_ONLINK. */
#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

/* Tunables for redirect/error rate limiting and PMTU handling. */
static int ip_rt_max_size;
/* Stop sending redirects for a peer after this many were ignored. */
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
/* Quiet period after which the redirect backoff is reset:
 * redirect_load << (redirect_number + 1).
 */
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
/* A learned path MTU exception lives for 10 minutes. */
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
/* 512 payload + 20 IP + 20 TCP: classical lower bound for PMTU. */
static int ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;

static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
1da177e4
LT
135/*
136 * Interface to generic destination cache.
137 */
138
139static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
0dbaee3b 140static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
ebb762f2 141static unsigned int ipv4_mtu(const struct dst_entry *dst);
1da177e4
LT
142static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
143static void ipv4_link_failure(struct sk_buff *skb);
6700c270
DM
144static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
145 struct sk_buff *skb, u32 mtu);
146static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
147 struct sk_buff *skb);
caacf05e 148static void ipv4_dst_destroy(struct dst_entry *dst);
1da177e4 149
62fa8a84
DM
/* IPv4 dsts never copy-on-write metrics through this hook; if we get
 * here something is wrong, so warn once and report no metrics.
 */
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}
155
f894cbf8
DM
156static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
157 struct sk_buff *skb,
158 const void *daddr);
63fca65d 159static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
d3aaeb38 160
1da177e4
LT
/* Hooks the IPv4 routing code into the generic dst-entry layer. */
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.redirect =		ip_do_redirect,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
	.confirm_neigh =	ipv4_confirm_neigh,
};
176
#define ECN_OR_COST(class)	TC_PRIO_##class

/* Map the 4-bit IP TOS field to a traffic-control priority band.
 * Odd entries cover the ECN/cost variants of each class.
 */
const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

/* Per-cpu counters exported via /proc/net/stat/rt_cache. */
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
1da177e4 201
1da177e4 202#ifdef CONFIG_PROC_FS
1da177e4
LT
203static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
204{
29e75252 205 if (*pos)
89aef892 206 return NULL;
29e75252 207 return SEQ_START_TOKEN;
1da177e4
LT
208}
209
210static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
211{
1da177e4 212 ++*pos;
89aef892 213 return NULL;
1da177e4
LT
214}
215
/* Nothing to undo: rt_cache_seq_start() takes no locks or references. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}
219
/* Emit only the legacy /proc/net/rt_cache column header; the cache
 * itself no longer exists, so no data rows are ever printed.
 */
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	return 0;
}
229
/* seq_file iterator for /proc/net/rt_cache (header-only output). */
static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};
236
/* open() handler for /proc/net/rt_cache. */
static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cache_seq_ops);
}
241
/* File operations for /proc/net/rt_cache. */
static const struct file_operations rt_cache_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
249
250
251static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
252{
253 int cpu;
254
255 if (*pos == 0)
256 return SEQ_START_TOKEN;
257
0f23174a 258 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
1da177e4
LT
259 if (!cpu_possible(cpu))
260 continue;
261 *pos = cpu+1;
2f970d83 262 return &per_cpu(rt_cache_stat, cpu);
1da177e4
LT
263 }
264 return NULL;
265}
266
267static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
268{
269 int cpu;
270
0f23174a 271 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
1da177e4
LT
272 if (!cpu_possible(cpu))
273 continue;
274 *pos = cpu+1;
2f970d83 275 return &per_cpu(rt_cache_stat, cpu);
1da177e4
LT
276 }
277 return NULL;
e905a9ed 278
1da177e4
LT
279}
280
/* Nothing to release: the start handler takes no locks. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}
285
/* Print one row of /proc/net/stat/rt_cache. Several historical fields
 * (in_hit, out_hit, gc_*) were removed along with the route cache and
 * are emitted as literal zeros to keep the column layout stable for
 * userspace parsers.
 */
static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   0, /* st->in_hit */
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   0, /* st->out_hit */
		   st->out_slow_tot,
		   st->out_slow_mc,

		   0, /* st->gc_total */
		   0, /* st->gc_ignored */
		   0, /* st->gc_goal_miss */
		   0, /* st->gc_dst_overflow */
		   0, /* st->in_hlist_search */
		   0  /* st->out_hlist_search */
		);
	return 0;
}
319
/* seq_file iterator for /proc/net/stat/rt_cache. */
static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};
326
327
/* open() handler for /proc/net/stat/rt_cache. */
static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}
332
/* File operations for /proc/net/stat/rt_cache. */
static const struct file_operations rt_cpu_seq_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
340
c7066f70 341#ifdef CONFIG_IP_ROUTE_CLASSID
/* Sum the per-cpu ip_rt_acct counters (256 classes, one slot per
 * routing realm) into a temporary array and dump it as raw binary to
 * /proc/net/rt_acct.
 */
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	/* Accumulate every possible CPU's counters per class. */
	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}
78c686e9 365
a661c419
AD
/* open() handler for /proc/net/rt_acct (single-shot seq_file). */
static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}
a661c419
AD
370
/* File operations for /proc/net/rt_acct. */
static const struct file_operations rt_acct_proc_fops = {
	.owner	 = THIS_MODULE,
	.open	 = rt_acct_proc_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};
78c686e9 378#endif
107f1634 379
/* Per-netns setup: create /proc/net/rt_cache, /proc/net/stat/rt_cache
 * and (with CONFIG_IP_ROUTE_CLASSID) /proc/net/rt_acct. On any failure
 * the entries created so far are torn down via the goto chain and
 * -ENOMEM is returned.
 */
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
			  &rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", S_IRUGO,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}
73b38711
DL
410
/* Per-netns teardown: mirror of ip_rt_do_proc_init(). */
static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}
419
/* Register the proc entries for every network namespace. */
static struct pernet_operations ip_rt_proc_ops __net_initdata =	{
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
/* No procfs: initialization trivially succeeds. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */
e905a9ed 436
4331debc 437static inline bool rt_is_expired(const struct rtable *rth)
e84f84f2 438{
ca4c3fc2 439 return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
e84f84f2
DL
440}
441
/* Invalidate all cached IPv4 routes in @net by bumping the generation
 * counter; stale entries are then rejected by rt_is_expired().
 */
void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}
446
f894cbf8
DM
/* Resolve the neighbour entry for a dst: prefer the route's gateway,
 * else the packet's destination address, else @daddr as given. Creates
 * an ARP entry if none exists yet.
 */
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr)
{
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;
	struct neighbour *n;

	rt = (const struct rtable *) dst;
	if (rt->rt_gateway)
		pkey = (const __be32 *) &rt->rt_gateway;
	else if (skb)
		pkey = &ip_hdr(skb)->daddr;

	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(&arp_tbl, pkey, dev);
}
467
63fca65d
JA
/* Confirm reachability of the neighbour behind @dst. Uses the route's
 * gateway when set; otherwise @daddr, except that multicast, broadcast
 * and local routes (or a missing daddr) have no neighbour to confirm.
 */
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;

	rt = (const struct rtable *)dst;
	if (rt->rt_gateway)
		pkey = (const __be32 *)&rt->rt_gateway;
	else if (!daddr ||
		 (rt->rt_flags &
		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
		return;

	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}
484
#define IP_IDENTS_SZ 2048u

/* Hash-indexed arrays of IP-ID generators and their last-use stamps;
 * allocated at boot (see ip_rt_init elsewhere in this file).
 */
static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
	u32 old = ACCESS_ONCE(*p_tstamp);
	u32 now = (u32)jiffies;
	u32 new, delta = 0;

	/* Only the CPU that wins the timestamp cmpxchg applies a random
	 * perturbation proportional to the generator's idle time.
	 */
	if (old != now && cmpxchg(p_tstamp, old, now) == old)
		delta = prandom_u32_max(now - old);

	/* Do not use atomic_add_return() as it makes UBSAN unhappy */
	do {
		old = (u32)atomic_read(p_id);
		new = old + delta + segs;
	} while (atomic_cmpxchg(p_id, old, new) != old);

	/* Return the first id of the reserved [new - segs, new) range. */
	return new - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
1da177e4 514
/* Choose the IP-ID for @iph, reserving @segs consecutive ids so GSO
 * segments get contiguous values. The generator is selected by hashing
 * (daddr, saddr, protocol) with a boot-time random key, mixed per-netns.
 */
void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
	static u32 ip_idents_hashrnd __read_mostly;
	u32 hash, id;

	net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));

	hash = jhash_3words((__force u32)iph->daddr,
			    (__force u32)iph->saddr,
			    iph->protocol ^ net_hash_mix(net),
			    ip_idents_hashrnd);
	id = ip_idents_reserve(hash, segs);
	iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);
1da177e4 530
e2d118a1
LC
/* Build an output flow key from an IP header. When a socket is given,
 * its bound device, mark, TOS and protocol override the packet-derived
 * values.
 */
static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
			     const struct sock *sk,
			     const struct iphdr *iph,
			     int oif, u8 tos,
			     u8 prot, u32 mark, int flow_flags)
{
	if (sk) {
		const struct inet_sock *inet = inet_sk(sk);

		oif = sk->sk_bound_dev_if;
		mark = sk->sk_mark;
		tos = RT_CONN_FLAGS(sk);
		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
	}
	flowi4_init_output(fl4, oif, mark, tos,
			   RT_SCOPE_UNIVERSE, prot,
			   flow_flags,
			   iph->daddr, iph->saddr, 0, 0,
			   sock_net_uid(net, sk));
}
551
5abf7f7e
ED
552static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
553 const struct sock *sk)
4895c771 554{
d109e61b 555 const struct net *net = dev_net(skb->dev);
4895c771
DM
556 const struct iphdr *iph = ip_hdr(skb);
557 int oif = skb->dev->ifindex;
558 u8 tos = RT_TOS(iph->tos);
559 u8 prot = iph->protocol;
560 u32 mark = skb->mark;
561
d109e61b 562 __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
4895c771
DM
563}
564
/* Derive a flow key purely from a connected socket's state. With a
 * strict source route option the first-hop (faddr) replaces the
 * destination; inet_opt is read under RCU.
 */
static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;
	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
			   inet_sk_flowi_flags(sk),
			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
	rcu_read_unlock();
}
582
5abf7f7e
ED
/* Build a flow key from the skb when one is available, otherwise fall
 * back to the socket's connection state.
 */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (!skb) {
		build_sk_flow_key(fl4, sk);
		return;
	}
	build_skb_flow_key(fl4, skb, sk);
}
591
/* Serializes all writers of the per-nexthop exception hash tables. */
static DEFINE_SPINLOCK(fnhe_lock);

/* Detach and drop the cached input/output routes hanging off an
 * exception entry. Caller holds fnhe_lock; readers are RCU-protected,
 * hence RCU_INIT_POINTER before releasing each dst.
 */
static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
	struct rtable *rt;

	rt = rcu_dereference(fnhe->fnhe_rth_input);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
	rt = rcu_dereference(fnhe->fnhe_rth_output);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
}
611
/* Pick the least-recently-stamped exception in a bucket chain, flush
 * its cached routes and hand it back for reuse. Caller holds fnhe_lock
 * and guarantees the chain is non-empty.
 */
static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
	struct fib_nh_exception *fnhe, *oldest;

	oldest = rcu_dereference(hash->chain);
	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
			oldest = fnhe;
	}
	fnhe_flush_routes(oldest);
	return oldest;
}
625
d3a25c98
DM
/* Hash a destination address into an exception-table bucket index,
 * keyed with a boot-time random value to resist bucket-targeting.
 */
static inline u32 fnhe_hashfun(__be32 daddr)
{
	static u32 fnhe_hashrnd __read_mostly;
	u32 hval;

	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
	hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
	return hash_32(hval, FNHE_HASH_SHIFT);
}
635
387aa65a
TT
/* Copy an exception's learned PMTU/expiry — and redirect gateway, if
 * any — into a cached route.
 */
static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
	rt->rt_pmtu = fnhe->fnhe_pmtu;
	rt->dst.expires = fnhe->fnhe_expires;

	if (fnhe->fnhe_gw) {
		rt->rt_flags |= RTCF_REDIRECTED;
		rt->rt_gateway = fnhe->fnhe_gw;
		rt->rt_uses_gateway = 1;
	}
}
647
aee06da6
JA
/* Record a per-destination exception (redirect gateway and/or learned
 * PMTU) on nexthop @nh. Updates an existing entry in place; otherwise
 * allocates one (recycling the oldest when the chain is too deep) and
 * invalidates the nexthop's cached routes so they re-check exceptions.
 * All modifications are serialized by fnhe_lock.
 */
static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
				  u32 pmtu, unsigned long expires)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe;
	struct rtable *rt;
	unsigned int i;
	int depth;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	/* Lazily allocate the bucket array on first exception. */
	hash = rcu_dereference(nh->nh_exceptions);
	if (!hash) {
		hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
		if (!hash)
			goto out_unlock;
		rcu_assign_pointer(nh->nh_exceptions, hash);
	}

	hash += hval;

	depth = 0;
	for (fnhe = rcu_dereference(hash->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			break;
		depth++;
	}

	if (fnhe) {
		if (gw)
			fnhe->fnhe_gw = gw;
		if (pmtu) {
			fnhe->fnhe_pmtu = pmtu;
			/* 0 means "no expiry", so clamp to at least 1. */
			fnhe->fnhe_expires = max(1UL, expires);
		}
		/* Update all cached dsts too */
		rt = rcu_dereference(fnhe->fnhe_rth_input);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
		rt = rcu_dereference(fnhe->fnhe_rth_output);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
	} else {
		if (depth > FNHE_RECLAIM_DEPTH)
			fnhe = fnhe_oldest(hash);
		else {
			fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
			if (!fnhe)
				goto out_unlock;

			fnhe->fnhe_next = hash->chain;
			rcu_assign_pointer(hash->chain, fnhe);
		}
		fnhe->fnhe_genid = fnhe_genid(dev_net(nh->nh_dev));
		fnhe->fnhe_daddr = daddr;
		fnhe->fnhe_gw = gw;
		fnhe->fnhe_pmtu = pmtu;
		fnhe->fnhe_expires = expires;

		/* Exception created; mark the cached routes for the nexthop
		 * stale, so anyone caching it rechecks if this exception
		 * applies to them.
		 */
		rt = rcu_dereference(nh->nh_rth_input);
		if (rt)
			rt->dst.obsolete = DST_OBSOLETE_KILL;

		for_each_possible_cpu(i) {
			struct rtable __rcu **prt;
			prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
			rt = rcu_dereference(*prt);
			if (rt)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
		}
	}

	fnhe->fnhe_stamp = jiffies;

out_unlock:
	spin_unlock_bh(&fnhe_lock);
}
731
ceb33206
DM
/* Process an ICMP redirect for route @rt: validate the advertised
 * gateway (sanity of the address, on-link/shared-media policy), resolve
 * its neighbour, and if it is reachable record the new gateway as a
 * nexthop exception. With @kill_route the current dst is obsoleted so
 * callers re-look-up. Rejected redirects are optionally logged.
 */
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
			     bool kill_route)
{
	__be32 new_gw = icmp_hdr(skb)->un.gateway;
	__be32 old_gw = ip_hdr(skb)->saddr;
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct fib_result res;
	struct neighbour *n;
	struct net *net;

	/* Only the four defined redirect codes are acted upon. */
	switch (icmp_hdr(skb)->code & 7) {
	case ICMP_REDIR_NET:
	case ICMP_REDIR_NETTOS:
	case ICMP_REDIR_HOST:
	case ICMP_REDIR_HOSTTOS:
		break;

	default:
		return;
	}

	/* The redirect must come from our current gateway. */
	if (rt->rt_gateway != old_gw)
		return;

	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
	if (!n)
		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
	if (!IS_ERR(n)) {
		if (!(n->nud_state & NUD_VALID)) {
			/* Kick resolution; the redirect is dropped for now. */
			neigh_event_send(n, NULL);
		} else {
			if (fib_lookup(net, fl4, &res, 0) == 0) {
				struct fib_nh *nh = &FIB_RES_NH(res);

				update_or_create_fnhe(nh, fl4->daddr, new_gw,
						0, jiffies + ip_rt_gc_timeout);
			}
			if (kill_route)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
		}
		neigh_release(n);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev)) {
		const struct iphdr *iph = (const struct iphdr *) skb->data;
		__be32 daddr = iph->daddr;
		__be32 saddr = iph->saddr;

		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     " Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
	}
#endif
	;
}
813
4895c771
DM
/* dst_ops->redirect hook: rebuild the flow key from the skb/socket and
 * delegate to __ip_do_redirect(), obsoleting the current route.
 */
static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct rtable *rt;
	struct flowi4 fl4;
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct net *net = dev_net(skb->dev);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	rt = (struct rtable *) dst;

	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
	__ip_do_redirect(rt, skb, &fl4, true);
}
830
1da177e4
LT
/* dst_ops->negative_advice hook: drop the caller's reference and return
 * NULL when the route is obsolete, was created by a redirect, or
 * carries an expiring (PMTU) exception — forcing a fresh lookup.
 */
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->dst.expires) {
			ip_rt_put(rt);
			ret = NULL;
		}
	}
	return ret;
}
848
/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	struct net *net;
	int log_martians;
	int vif;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
	rcu_read_unlock();

	net = dev_net(rt->dst.dev);
	/* Per-source-peer state carries the backoff counters; without a
	 * peer entry we fall back to an unthrottled redirect.
	 */
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
			  rt_nexthop(rt, ip_hdr(skb)->daddr));
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		goto out_put_peer;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, inet_iif(skb),
					     &ip_hdr(skb)->daddr, &gw);
#endif
	}
out_put_peer:
	inet_putpeer(peer);
}
929
/* Input-path error handler: translate a route's dst.error into an ICMP
 * destination-unreachable message, rate limited per source peer by a
 * token bucket (burst ip_rt_error_burst, cost ip_rt_error_cost). The
 * skb is always consumed.
 */
static int ip_error(struct sk_buff *skb)
{
	struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
	struct rtable *rt = skb_rtable(skb);
	struct inet_peer *peer;
	unsigned long now;
	struct net *net;
	bool send;
	int code;

	/* IP on this device is disabled. */
	if (!in_dev)
		goto out;

	net = dev_net(rt->dst.dev);
	if (!IN_DEV_FORWARD(in_dev)) {
		/* Not forwarding: only count the error, never send ICMP. */
		switch (rt->dst.error) {
		case EHOSTUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
			break;

		case ENETUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
			break;
		}
		goto out;
	}

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
			       l3mdev_master_ifindex(skb->dev), 1);

	send = true;
	if (peer) {
		now = jiffies;
		/* Refill tokens by elapsed time, capped at the burst. */
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
		inet_putpeer(peer);
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}
1da177e4 996
d851c12b 997static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1da177e4 998{
d851c12b 999 struct dst_entry *dst = &rt->dst;
4895c771 1000 struct fib_result res;
2c8cec5c 1001
fa1e492a
SK
1002 if (dst_metric_locked(dst, RTAX_MTU))
1003 return;
1004
cb6ccf09 1005 if (ipv4_mtu(dst) < mtu)
3cdaa5be
LW
1006 return;
1007
5943634f
DM
1008 if (mtu < ip_rt_min_pmtu)
1009 mtu = ip_rt_min_pmtu;
2c8cec5c 1010
f016229e
TT
1011 if (rt->rt_pmtu == mtu &&
1012 time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1013 return;
1014
c5ae7d41 1015 rcu_read_lock();
0eeb075f 1016 if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
4895c771 1017 struct fib_nh *nh = &FIB_RES_NH(res);
4895c771 1018
aee06da6
JA
1019 update_or_create_fnhe(nh, fl4->daddr, 0, mtu,
1020 jiffies + ip_rt_mtu_expires);
4895c771 1021 }
c5ae7d41 1022 rcu_read_unlock();
1da177e4
LT
1023}
1024
4895c771
DM
1025static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1026 struct sk_buff *skb, u32 mtu)
1027{
1028 struct rtable *rt = (struct rtable *) dst;
1029 struct flowi4 fl4;
1030
1031 ip_rt_build_flow_key(&fl4, sk, skb);
d851c12b 1032 __ip_rt_update_pmtu(rt, &fl4, mtu);
4895c771
DM
1033}
1034
36393395
DM
1035void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1036 int oif, u32 mark, u8 protocol, int flow_flags)
1037{
4895c771 1038 const struct iphdr *iph = (const struct iphdr *) skb->data;
36393395
DM
1039 struct flowi4 fl4;
1040 struct rtable *rt;
1041
1b3c61dc
LC
1042 if (!mark)
1043 mark = IP4_REPLY_MARK(net, skb->mark);
1044
e2d118a1 1045 __build_flow_key(net, &fl4, NULL, iph, oif,
4895c771 1046 RT_TOS(iph->tos), protocol, mark, flow_flags);
36393395
DM
1047 rt = __ip_route_output_key(net, &fl4);
1048 if (!IS_ERR(rt)) {
4895c771 1049 __ip_rt_update_pmtu(rt, &fl4, mtu);
36393395
DM
1050 ip_rt_put(rt);
1051 }
1052}
1053EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1054
9cb3a50c 1055static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
36393395 1056{
4895c771
DM
1057 const struct iphdr *iph = (const struct iphdr *) skb->data;
1058 struct flowi4 fl4;
1059 struct rtable *rt;
36393395 1060
e2d118a1 1061 __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1b3c61dc
LC
1062
1063 if (!fl4.flowi4_mark)
1064 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1065
4895c771
DM
1066 rt = __ip_route_output_key(sock_net(sk), &fl4);
1067 if (!IS_ERR(rt)) {
1068 __ip_rt_update_pmtu(rt, &fl4, mtu);
1069 ip_rt_put(rt);
1070 }
36393395 1071}
9cb3a50c
SK
1072
1073void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1074{
1075 const struct iphdr *iph = (const struct iphdr *) skb->data;
1076 struct flowi4 fl4;
1077 struct rtable *rt;
7f502361 1078 struct dst_entry *odst = NULL;
b44108db 1079 bool new = false;
e2d118a1 1080 struct net *net = sock_net(sk);
9cb3a50c
SK
1081
1082 bh_lock_sock(sk);
482fc609
HFS
1083
1084 if (!ip_sk_accept_pmtu(sk))
1085 goto out;
1086
7f502361 1087 odst = sk_dst_get(sk);
9cb3a50c 1088
7f502361 1089 if (sock_owned_by_user(sk) || !odst) {
9cb3a50c
SK
1090 __ipv4_sk_update_pmtu(skb, sk, mtu);
1091 goto out;
1092 }
1093
e2d118a1 1094 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
9cb3a50c 1095
7f502361 1096 rt = (struct rtable *)odst;
51456b29 1097 if (odst->obsolete && !odst->ops->check(odst, 0)) {
9cb3a50c
SK
1098 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1099 if (IS_ERR(rt))
1100 goto out;
b44108db
SK
1101
1102 new = true;
9cb3a50c
SK
1103 }
1104
1105 __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1106
7f502361 1107 if (!dst_check(&rt->dst, 0)) {
b44108db
SK
1108 if (new)
1109 dst_release(&rt->dst);
1110
9cb3a50c
SK
1111 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1112 if (IS_ERR(rt))
1113 goto out;
1114
b44108db 1115 new = true;
9cb3a50c
SK
1116 }
1117
b44108db 1118 if (new)
7f502361 1119 sk_dst_set(sk, &rt->dst);
9cb3a50c
SK
1120
1121out:
1122 bh_unlock_sock(sk);
7f502361 1123 dst_release(odst);
9cb3a50c 1124}
36393395 1125EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
f39925db 1126
b42597e2
DM
1127void ipv4_redirect(struct sk_buff *skb, struct net *net,
1128 int oif, u32 mark, u8 protocol, int flow_flags)
1129{
4895c771 1130 const struct iphdr *iph = (const struct iphdr *) skb->data;
b42597e2
DM
1131 struct flowi4 fl4;
1132 struct rtable *rt;
1133
e2d118a1 1134 __build_flow_key(net, &fl4, NULL, iph, oif,
4895c771 1135 RT_TOS(iph->tos), protocol, mark, flow_flags);
b42597e2
DM
1136 rt = __ip_route_output_key(net, &fl4);
1137 if (!IS_ERR(rt)) {
ceb33206 1138 __ip_do_redirect(rt, skb, &fl4, false);
b42597e2
DM
1139 ip_rt_put(rt);
1140 }
1141}
1142EXPORT_SYMBOL_GPL(ipv4_redirect);
1143
1144void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1145{
4895c771
DM
1146 const struct iphdr *iph = (const struct iphdr *) skb->data;
1147 struct flowi4 fl4;
1148 struct rtable *rt;
e2d118a1 1149 struct net *net = sock_net(sk);
b42597e2 1150
e2d118a1
LC
1151 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1152 rt = __ip_route_output_key(net, &fl4);
4895c771 1153 if (!IS_ERR(rt)) {
ceb33206 1154 __ip_do_redirect(rt, skb, &fl4, false);
4895c771
DM
1155 ip_rt_put(rt);
1156 }
b42597e2
DM
1157}
1158EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1159
efbc368d
DM
1160static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1161{
1162 struct rtable *rt = (struct rtable *) dst;
1163
ceb33206
DM
1164 /* All IPV4 dsts are created with ->obsolete set to the value
1165 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1166 * into this function always.
1167 *
387aa65a
TT
1168 * When a PMTU/redirect information update invalidates a route,
1169 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1170 * DST_OBSOLETE_DEAD by dst_free().
ceb33206 1171 */
387aa65a 1172 if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
efbc368d 1173 return NULL;
d11a4dc1 1174 return dst;
1da177e4
LT
1175}
1176
1da177e4
LT
1177static void ipv4_link_failure(struct sk_buff *skb)
1178{
1179 struct rtable *rt;
1180
1181 icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1182
511c3f92 1183 rt = skb_rtable(skb);
5943634f
DM
1184 if (rt)
1185 dst_set_expires(&rt->dst, 0);
1da177e4
LT
1186}
1187
ede2059d 1188static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1da177e4 1189{
91df42be
JP
1190 pr_debug("%s: %pI4 -> %pI4, %s\n",
1191 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1192 skb->dev ? skb->dev->name : "?");
1da177e4 1193 kfree_skb(skb);
c378a9c0 1194 WARN_ON(1);
1da177e4
LT
1195 return 0;
1196}
1197
1198/*
1199 We do not cache source address of outgoing interface,
1200 because it is used only by IP RR, TS and SRR options,
1201 so that it out of fast path.
1202
1203 BTW remember: "addr" is allowed to be not aligned
1204 in IP options!
1205 */
1206
8e36360a 1207void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1da177e4 1208{
a61ced5d 1209 __be32 src;
1da177e4 1210
c7537967 1211 if (rt_is_output_route(rt))
c5be24ff 1212 src = ip_hdr(skb)->saddr;
ebc0ffae 1213 else {
8e36360a
DM
1214 struct fib_result res;
1215 struct flowi4 fl4;
1216 struct iphdr *iph;
1217
1218 iph = ip_hdr(skb);
1219
1220 memset(&fl4, 0, sizeof(fl4));
1221 fl4.daddr = iph->daddr;
1222 fl4.saddr = iph->saddr;
b0fe4a31 1223 fl4.flowi4_tos = RT_TOS(iph->tos);
8e36360a
DM
1224 fl4.flowi4_oif = rt->dst.dev->ifindex;
1225 fl4.flowi4_iif = skb->dev->ifindex;
1226 fl4.flowi4_mark = skb->mark;
5e2b61f7 1227
ebc0ffae 1228 rcu_read_lock();
0eeb075f 1229 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
436c3b66 1230 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
ebc0ffae 1231 else
f8126f1d
DM
1232 src = inet_select_addr(rt->dst.dev,
1233 rt_nexthop(rt, iph->daddr),
1234 RT_SCOPE_UNIVERSE);
ebc0ffae
ED
1235 rcu_read_unlock();
1236 }
1da177e4
LT
1237 memcpy(addr, &src, 4);
1238}
1239
c7066f70 1240#ifdef CONFIG_IP_ROUTE_CLASSID
1da177e4
LT
1241static void set_class_tag(struct rtable *rt, u32 tag)
1242{
d8d1f30b
CG
1243 if (!(rt->dst.tclassid & 0xFFFF))
1244 rt->dst.tclassid |= tag & 0xFFFF;
1245 if (!(rt->dst.tclassid & 0xFFFF0000))
1246 rt->dst.tclassid |= tag & 0xFFFF0000;
1da177e4
LT
1247}
1248#endif
1249
0dbaee3b
DM
1250static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1251{
7ed14d97
GF
1252 unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1253 unsigned int advmss = max_t(unsigned int, dst->dev->mtu - header_size,
1254 ip_rt_min_advmss);
0dbaee3b 1255
7ed14d97 1256 return min(advmss, IPV4_MAX_PMTU - header_size);
0dbaee3b
DM
1257}
1258
ebb762f2 1259static unsigned int ipv4_mtu(const struct dst_entry *dst)
d33e4553 1260{
261663b0 1261 const struct rtable *rt = (const struct rtable *) dst;
5943634f
DM
1262 unsigned int mtu = rt->rt_pmtu;
1263
98d75c37 1264 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
5943634f 1265 mtu = dst_metric_raw(dst, RTAX_MTU);
618f9bc7 1266
38d523e2 1267 if (mtu)
618f9bc7
SK
1268 return mtu;
1269
1270 mtu = dst->dev->mtu;
d33e4553
DM
1271
1272 if (unlikely(dst_metric_locked(dst, RTAX_MTU))) {
155e8336 1273 if (rt->rt_uses_gateway && mtu > 576)
d33e4553
DM
1274 mtu = 576;
1275 }
1276
14972cbd
RP
1277 mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1278
1279 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
d33e4553
DM
1280}
1281
f2bb4bed 1282static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
4895c771 1283{
caa41527 1284 struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
4895c771
DM
1285 struct fib_nh_exception *fnhe;
1286 u32 hval;
1287
f2bb4bed
DM
1288 if (!hash)
1289 return NULL;
1290
d3a25c98 1291 hval = fnhe_hashfun(daddr);
4895c771
DM
1292
1293 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1294 fnhe = rcu_dereference(fnhe->fnhe_next)) {
f2bb4bed
DM
1295 if (fnhe->fnhe_daddr == daddr)
1296 return fnhe;
1297 }
1298 return NULL;
1299}
aee06da6 1300
caacf05e 1301static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
f2bb4bed
DM
1302 __be32 daddr)
1303{
caacf05e
DM
1304 bool ret = false;
1305
c5038a83 1306 spin_lock_bh(&fnhe_lock);
f2bb4bed 1307
c5038a83 1308 if (daddr == fnhe->fnhe_daddr) {
2ffae99d
TT
1309 struct rtable __rcu **porig;
1310 struct rtable *orig;
5aad1de5 1311 int genid = fnhe_genid(dev_net(rt->dst.dev));
2ffae99d
TT
1312
1313 if (rt_is_input_route(rt))
1314 porig = &fnhe->fnhe_rth_input;
1315 else
1316 porig = &fnhe->fnhe_rth_output;
1317 orig = rcu_dereference(*porig);
5aad1de5
TT
1318
1319 if (fnhe->fnhe_genid != genid) {
1320 fnhe->fnhe_genid = genid;
13d82bf5
SK
1321 fnhe->fnhe_gw = 0;
1322 fnhe->fnhe_pmtu = 0;
1323 fnhe->fnhe_expires = 0;
2ffae99d
TT
1324 fnhe_flush_routes(fnhe);
1325 orig = NULL;
13d82bf5 1326 }
387aa65a
TT
1327 fill_route_from_fnhe(rt, fnhe);
1328 if (!rt->rt_gateway)
155e8336 1329 rt->rt_gateway = daddr;
f2bb4bed 1330
2ffae99d 1331 if (!(rt->dst.flags & DST_NOCACHE)) {
0830106c 1332 dst_hold(&rt->dst);
2ffae99d 1333 rcu_assign_pointer(*porig, rt);
0830106c 1334 if (orig) {
95c47f9c 1335 dst_dev_put(&orig->dst);
0830106c 1336 dst_release(&orig->dst);
0830106c 1337 }
2ffae99d
TT
1338 ret = true;
1339 }
c5038a83
DM
1340
1341 fnhe->fnhe_stamp = jiffies;
c5038a83
DM
1342 }
1343 spin_unlock_bh(&fnhe_lock);
caacf05e
DM
1344
1345 return ret;
54764bb6
ED
1346}
1347
caacf05e 1348static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
f2bb4bed 1349{
d26b3a7c 1350 struct rtable *orig, *prev, **p;
caacf05e 1351 bool ret = true;
f2bb4bed 1352
d26b3a7c 1353 if (rt_is_input_route(rt)) {
54764bb6 1354 p = (struct rtable **)&nh->nh_rth_input;
d26b3a7c 1355 } else {
903ceff7 1356 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
d26b3a7c 1357 }
f2bb4bed
DM
1358 orig = *p;
1359
0830106c
WW
1360 /* hold dst before doing cmpxchg() to avoid race condition
1361 * on this dst
1362 */
1363 dst_hold(&rt->dst);
f2bb4bed
DM
1364 prev = cmpxchg(p, orig, rt);
1365 if (prev == orig) {
0830106c 1366 if (orig) {
95c47f9c 1367 dst_dev_put(&orig->dst);
0830106c 1368 dst_release(&orig->dst);
0830106c
WW
1369 }
1370 } else {
1371 dst_release(&rt->dst);
caacf05e 1372 ret = false;
0830106c 1373 }
caacf05e
DM
1374
1375 return ret;
1376}
1377
5055c371
ED
1378struct uncached_list {
1379 spinlock_t lock;
1380 struct list_head head;
1381};
1382
1383static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
caacf05e
DM
1384
1385static void rt_add_uncached_list(struct rtable *rt)
1386{
5055c371
ED
1387 struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1388
1389 rt->rt_uncached_list = ul;
1390
1391 spin_lock_bh(&ul->lock);
1392 list_add_tail(&rt->rt_uncached, &ul->head);
1393 spin_unlock_bh(&ul->lock);
caacf05e
DM
1394}
1395
1396static void ipv4_dst_destroy(struct dst_entry *dst)
1397{
3fb07daf 1398 struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
caacf05e
DM
1399 struct rtable *rt = (struct rtable *) dst;
1400
3fb07daf
ED
1401 if (p != &dst_default_metrics && atomic_dec_and_test(&p->refcnt))
1402 kfree(p);
1403
78df76a0 1404 if (!list_empty(&rt->rt_uncached)) {
5055c371
ED
1405 struct uncached_list *ul = rt->rt_uncached_list;
1406
1407 spin_lock_bh(&ul->lock);
caacf05e 1408 list_del(&rt->rt_uncached);
5055c371 1409 spin_unlock_bh(&ul->lock);
caacf05e
DM
1410 }
1411}
1412
1413void rt_flush_dev(struct net_device *dev)
1414{
5055c371
ED
1415 struct net *net = dev_net(dev);
1416 struct rtable *rt;
1417 int cpu;
1418
1419 for_each_possible_cpu(cpu) {
1420 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
caacf05e 1421
5055c371
ED
1422 spin_lock_bh(&ul->lock);
1423 list_for_each_entry(rt, &ul->head, rt_uncached) {
caacf05e
DM
1424 if (rt->dst.dev != dev)
1425 continue;
1426 rt->dst.dev = net->loopback_dev;
1427 dev_hold(rt->dst.dev);
1428 dev_put(dev);
1429 }
5055c371 1430 spin_unlock_bh(&ul->lock);
4895c771
DM
1431 }
1432}
1433
4331debc 1434static bool rt_cache_valid(const struct rtable *rt)
d2d68ba9 1435{
4331debc
ED
1436 return rt &&
1437 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1438 !rt_is_expired(rt);
d2d68ba9
DM
1439}
1440
f2bb4bed 1441static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
5e2b61f7 1442 const struct fib_result *res,
f2bb4bed 1443 struct fib_nh_exception *fnhe,
982721f3 1444 struct fib_info *fi, u16 type, u32 itag)
1da177e4 1445{
caacf05e
DM
1446 bool cached = false;
1447
1da177e4 1448 if (fi) {
4895c771
DM
1449 struct fib_nh *nh = &FIB_RES_NH(*res);
1450
155e8336 1451 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
4895c771 1452 rt->rt_gateway = nh->nh_gw;
155e8336
JA
1453 rt->rt_uses_gateway = 1;
1454 }
3fb07daf
ED
1455 dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1456 if (fi->fib_metrics != &dst_default_metrics) {
1457 rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1458 atomic_inc(&fi->fib_metrics->refcnt);
1459 }
c7066f70 1460#ifdef CONFIG_IP_ROUTE_CLASSID
f2bb4bed 1461 rt->dst.tclassid = nh->nh_tclassid;
1da177e4 1462#endif
61adedf3 1463 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
c5038a83 1464 if (unlikely(fnhe))
caacf05e 1465 cached = rt_bind_exception(rt, fnhe, daddr);
c5038a83 1466 else if (!(rt->dst.flags & DST_NOCACHE))
caacf05e 1467 cached = rt_cache_route(nh, rt);
155e8336
JA
1468 if (unlikely(!cached)) {
1469 /* Routes we intend to cache in nexthop exception or
1470 * FIB nexthop have the DST_NOCACHE bit clear.
1471 * However, if we are unsuccessful at storing this
1472 * route into the cache we really need to set it.
1473 */
1474 rt->dst.flags |= DST_NOCACHE;
1475 if (!rt->rt_gateway)
1476 rt->rt_gateway = daddr;
1477 rt_add_uncached_list(rt);
1478 }
1479 } else
caacf05e 1480 rt_add_uncached_list(rt);
defb3519 1481
c7066f70 1482#ifdef CONFIG_IP_ROUTE_CLASSID
1da177e4 1483#ifdef CONFIG_IP_MULTIPLE_TABLES
85b91b03 1484 set_class_tag(rt, res->tclassid);
1da177e4
LT
1485#endif
1486 set_class_tag(rt, itag);
1487#endif
1da177e4
LT
1488}
1489
9ab179d8
DA
1490struct rtable *rt_dst_alloc(struct net_device *dev,
1491 unsigned int flags, u16 type,
1492 bool nopolicy, bool noxfrm, bool will_cache)
0c4dcd58 1493{
d08c4f35
DA
1494 struct rtable *rt;
1495
1496 rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1497 (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1498 (nopolicy ? DST_NOPOLICY : 0) |
b838d5e1
WW
1499 (noxfrm ? DST_NOXFRM : 0) |
1500 DST_NOGC);
d08c4f35
DA
1501
1502 if (rt) {
1503 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1504 rt->rt_flags = flags;
1505 rt->rt_type = type;
1506 rt->rt_is_input = 0;
1507 rt->rt_iif = 0;
1508 rt->rt_pmtu = 0;
1509 rt->rt_gateway = 0;
1510 rt->rt_uses_gateway = 0;
b7503e0c 1511 rt->rt_table_id = 0;
d08c4f35
DA
1512 INIT_LIST_HEAD(&rt->rt_uncached);
1513
1514 rt->dst.output = ip_output;
1515 if (flags & RTCF_LOCAL)
1516 rt->dst.input = ip_local_deliver;
1517 }
1518
1519 return rt;
0c4dcd58 1520}
9ab179d8 1521EXPORT_SYMBOL(rt_dst_alloc);
0c4dcd58 1522
96d36220 1523/* called in rcu_read_lock() section */
9e12bb22 1524static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1da177e4
LT
1525 u8 tos, struct net_device *dev, int our)
1526{
1da177e4 1527 struct rtable *rth;
96d36220 1528 struct in_device *in_dev = __in_dev_get_rcu(dev);
d08c4f35 1529 unsigned int flags = RTCF_MULTICAST;
1da177e4 1530 u32 itag = 0;
b5f7e755 1531 int err;
1da177e4
LT
1532
1533 /* Primary sanity checks. */
1534
51456b29 1535 if (!in_dev)
1da177e4
LT
1536 return -EINVAL;
1537
1e637c74 1538 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
d0daebc3 1539 skb->protocol != htons(ETH_P_IP))
1da177e4
LT
1540 goto e_inval;
1541
75fea73d
AD
1542 if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1543 goto e_inval;
d0daebc3 1544
f97c1e0c
JP
1545 if (ipv4_is_zeronet(saddr)) {
1546 if (!ipv4_is_local_multicast(daddr))
1da177e4 1547 goto e_inval;
b5f7e755 1548 } else {
9e56e380
DM
1549 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1550 in_dev, &itag);
b5f7e755
ED
1551 if (err < 0)
1552 goto e_err;
1553 }
d08c4f35
DA
1554 if (our)
1555 flags |= RTCF_LOCAL;
1556
1557 rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
f2bb4bed 1558 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1da177e4
LT
1559 if (!rth)
1560 goto e_nobufs;
1561
cf911662
DM
1562#ifdef CONFIG_IP_ROUTE_CLASSID
1563 rth->dst.tclassid = itag;
1564#endif
d8d1f30b 1565 rth->dst.output = ip_rt_bug;
9917e1e8 1566 rth->rt_is_input= 1;
1da177e4
LT
1567
1568#ifdef CONFIG_IP_MROUTE
f97c1e0c 1569 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
d8d1f30b 1570 rth->dst.input = ip_mr_input;
1da177e4
LT
1571#endif
1572 RT_CACHE_STAT_INC(in_slow_mc);
1573
89aef892
DM
1574 skb_dst_set(skb, &rth->dst);
1575 return 0;
1da177e4
LT
1576
1577e_nobufs:
1da177e4 1578 return -ENOBUFS;
1da177e4 1579e_inval:
96d36220 1580 return -EINVAL;
b5f7e755 1581e_err:
b5f7e755 1582 return err;
1da177e4
LT
1583}
1584
1585
1586static void ip_handle_martian_source(struct net_device *dev,
1587 struct in_device *in_dev,
1588 struct sk_buff *skb,
9e12bb22
AV
1589 __be32 daddr,
1590 __be32 saddr)
1da177e4
LT
1591{
1592 RT_CACHE_STAT_INC(in_martian_src);
1593#ifdef CONFIG_IP_ROUTE_VERBOSE
1594 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1595 /*
1596 * RFC1812 recommendation, if source is martian,
1597 * the only hint is MAC header.
1598 */
058bd4d2 1599 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
673d57e7 1600 &daddr, &saddr, dev->name);
98e399f8 1601 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
058bd4d2
JP
1602 print_hex_dump(KERN_WARNING, "ll header: ",
1603 DUMP_PREFIX_OFFSET, 16, 1,
1604 skb_mac_header(skb),
1605 dev->hard_header_len, true);
1da177e4
LT
1606 }
1607 }
1608#endif
1609}
1610
deed49df
XL
1611static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1612{
1613 struct fnhe_hash_bucket *hash;
1614 struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1615 u32 hval = fnhe_hashfun(daddr);
1616
1617 spin_lock_bh(&fnhe_lock);
1618
1619 hash = rcu_dereference_protected(nh->nh_exceptions,
1620 lockdep_is_held(&fnhe_lock));
1621 hash += hval;
1622
1623 fnhe_p = &hash->chain;
1624 fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1625 while (fnhe) {
1626 if (fnhe->fnhe_daddr == daddr) {
1627 rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1628 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1629 fnhe_flush_routes(fnhe);
1630 kfree_rcu(fnhe, rcu);
1631 break;
1632 }
1633 fnhe_p = &fnhe->fnhe_next;
1634 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1635 lockdep_is_held(&fnhe_lock));
1636 }
1637
1638 spin_unlock_bh(&fnhe_lock);
1639}
1640
efd85700
TG
1641static void set_lwt_redirect(struct rtable *rth)
1642{
1643 if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1644 rth->dst.lwtstate->orig_output = rth->dst.output;
1645 rth->dst.output = lwtunnel_output;
1646 }
1647
1648 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1649 rth->dst.lwtstate->orig_input = rth->dst.input;
1650 rth->dst.input = lwtunnel_input;
1651 }
1652}
1653
47360228 1654/* called in rcu_read_lock() section */
5969f71d 1655static int __mkroute_input(struct sk_buff *skb,
982721f3 1656 const struct fib_result *res,
5969f71d 1657 struct in_device *in_dev,
c6cffba4 1658 __be32 daddr, __be32 saddr, u32 tos)
1da177e4 1659{
2ffae99d 1660 struct fib_nh_exception *fnhe;
1da177e4
LT
1661 struct rtable *rth;
1662 int err;
1663 struct in_device *out_dev;
d2d68ba9 1664 bool do_cache;
fbdc0ad0 1665 u32 itag = 0;
1da177e4
LT
1666
1667 /* get a working reference to the output device */
47360228 1668 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
51456b29 1669 if (!out_dev) {
e87cc472 1670 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1da177e4
LT
1671 return -EINVAL;
1672 }
1673
5c04c819 1674 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
9e56e380 1675 in_dev->dev, in_dev, &itag);
1da177e4 1676 if (err < 0) {
e905a9ed 1677 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1da177e4 1678 saddr);
e905a9ed 1679
1da177e4
LT
1680 goto cleanup;
1681 }
1682
e81da0e1
JA
1683 do_cache = res->fi && !itag;
1684 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
df4d9254 1685 skb->protocol == htons(ETH_P_IP) &&
1da177e4 1686 (IN_DEV_SHARED_MEDIA(out_dev) ||
df4d9254
HFS
1687 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1688 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1da177e4
LT
1689
1690 if (skb->protocol != htons(ETH_P_IP)) {
1691 /* Not IP (i.e. ARP). Do not create route, if it is
1692 * invalid for proxy arp. DNAT routes are always valid.
65324144
JDB
1693 *
1694 * Proxy arp feature have been extended to allow, ARP
1695 * replies back to the same interface, to support
1696 * Private VLAN switch technologies. See arp.c.
1da177e4 1697 */
65324144
JDB
1698 if (out_dev == in_dev &&
1699 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1da177e4
LT
1700 err = -EINVAL;
1701 goto cleanup;
1702 }
1703 }
1704
2ffae99d 1705 fnhe = find_exception(&FIB_RES_NH(*res), daddr);
e81da0e1 1706 if (do_cache) {
deed49df 1707 if (fnhe) {
2ffae99d 1708 rth = rcu_dereference(fnhe->fnhe_rth_input);
deed49df
XL
1709 if (rth && rth->dst.expires &&
1710 time_after(jiffies, rth->dst.expires)) {
1711 ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1712 fnhe = NULL;
1713 } else {
1714 goto rt_cache;
1715 }
1716 }
1717
1718 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
2ffae99d 1719
deed49df 1720rt_cache:
e81da0e1
JA
1721 if (rt_cache_valid(rth)) {
1722 skb_dst_set_noref(skb, &rth->dst);
1723 goto out;
d2d68ba9
DM
1724 }
1725 }
f2bb4bed 1726
d08c4f35 1727 rth = rt_dst_alloc(out_dev->dev, 0, res->type,
5c1e6aa3 1728 IN_DEV_CONF_GET(in_dev, NOPOLICY),
d2d68ba9 1729 IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1da177e4
LT
1730 if (!rth) {
1731 err = -ENOBUFS;
1732 goto cleanup;
1733 }
1734
9917e1e8 1735 rth->rt_is_input = 1;
b7503e0c
DA
1736 if (res->table)
1737 rth->rt_table_id = res->table->tb_id;
a6254864 1738 RT_CACHE_STAT_INC(in_slow_tot);
1da177e4 1739
d8d1f30b 1740 rth->dst.input = ip_forward;
1da177e4 1741
2ffae99d 1742 rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
efd85700 1743 set_lwt_redirect(rth);
c6cffba4 1744 skb_dst_set(skb, &rth->dst);
d2d68ba9 1745out:
1da177e4
LT
1746 err = 0;
1747 cleanup:
1da177e4 1748 return err;
e905a9ed 1749}
1da177e4 1750
79a13159 1751#ifdef CONFIG_IP_ROUTE_MULTIPATH
79a13159 1752/* To make ICMP packets follow the right flow, the multipath hash is
bf4e0a3d 1753 * calculated from the inner IP addresses.
79a13159 1754 */
bf4e0a3d
NA
1755static void ip_multipath_l3_keys(const struct sk_buff *skb,
1756 struct flow_keys *hash_keys)
79a13159
PN
1757{
1758 const struct iphdr *outer_iph = ip_hdr(skb);
bf4e0a3d 1759 const struct iphdr *inner_iph;
79a13159
PN
1760 const struct icmphdr *icmph;
1761 struct iphdr _inner_iph;
bf4e0a3d
NA
1762 struct icmphdr _icmph;
1763
1764 hash_keys->addrs.v4addrs.src = outer_iph->saddr;
1765 hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
1766 if (likely(outer_iph->protocol != IPPROTO_ICMP))
1767 return;
79a13159
PN
1768
1769 if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
bf4e0a3d 1770 return;
79a13159
PN
1771
1772 icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1773 &_icmph);
1774 if (!icmph)
bf4e0a3d 1775 return;
79a13159
PN
1776
1777 if (icmph->type != ICMP_DEST_UNREACH &&
1778 icmph->type != ICMP_REDIRECT &&
1779 icmph->type != ICMP_TIME_EXCEEDED &&
bf4e0a3d
NA
1780 icmph->type != ICMP_PARAMETERPROB)
1781 return;
79a13159
PN
1782
1783 inner_iph = skb_header_pointer(skb,
1784 outer_iph->ihl * 4 + sizeof(_icmph),
1785 sizeof(_inner_iph), &_inner_iph);
1786 if (!inner_iph)
bf4e0a3d
NA
1787 return;
1788 hash_keys->addrs.v4addrs.src = inner_iph->saddr;
1789 hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
1790}
79a13159 1791
bf4e0a3d
NA
1792/* if skb is set it will be used and fl4 can be NULL */
1793int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
1794 const struct sk_buff *skb)
1795{
1796 struct net *net = fi->fib_net;
1797 struct flow_keys hash_keys;
1798 u32 mhash;
79a13159 1799
bf4e0a3d
NA
1800 switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1801 case 0:
1802 memset(&hash_keys, 0, sizeof(hash_keys));
1803 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1804 if (skb) {
1805 ip_multipath_l3_keys(skb, &hash_keys);
1806 } else {
1807 hash_keys.addrs.v4addrs.src = fl4->saddr;
1808 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1809 }
1810 break;
1811 case 1:
1812 /* skb is currently provided only when forwarding */
1813 if (skb) {
1814 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1815 struct flow_keys keys;
1816
1817 /* short-circuit if we already have L4 hash present */
1818 if (skb->l4_hash)
1819 return skb_get_hash_raw(skb) >> 1;
1820 memset(&hash_keys, 0, sizeof(hash_keys));
1821 skb_flow_dissect_flow_keys(skb, &keys, flag);
1822 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1823 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1824 hash_keys.ports.src = keys.ports.src;
1825 hash_keys.ports.dst = keys.ports.dst;
1826 hash_keys.basic.ip_proto = keys.basic.ip_proto;
1827 } else {
1828 memset(&hash_keys, 0, sizeof(hash_keys));
1829 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1830 hash_keys.addrs.v4addrs.src = fl4->saddr;
1831 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1832 hash_keys.ports.src = fl4->fl4_sport;
1833 hash_keys.ports.dst = fl4->fl4_dport;
1834 hash_keys.basic.ip_proto = fl4->flowi4_proto;
1835 }
1836 break;
1837 }
1838 mhash = flow_hash_from_keys(&hash_keys);
79a13159 1839
bf4e0a3d
NA
1840 return mhash >> 1;
1841}
1842EXPORT_SYMBOL_GPL(fib_multipath_hash);
79a13159
PN
1843#endif /* CONFIG_IP_ROUTE_MULTIPATH */
1844
5969f71d
SH
1845static int ip_mkroute_input(struct sk_buff *skb,
1846 struct fib_result *res,
5969f71d
SH
1847 struct in_device *in_dev,
1848 __be32 daddr, __be32 saddr, u32 tos)
1da177e4 1849{
1da177e4 1850#ifdef CONFIG_IP_ROUTE_MULTIPATH
0e884c78 1851 if (res->fi && res->fi->fib_nhs > 1) {
bf4e0a3d 1852 int h = fib_multipath_hash(res->fi, NULL, skb);
0e884c78 1853
0e884c78
PN
1854 fib_select_multipath(res, h);
1855 }
1da177e4
LT
1856#endif
1857
1858 /* create a routing cache entry */
c6cffba4 1859 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1da177e4
LT
1860}
1861
1da177e4
LT
1862/*
1863 * NOTE. We drop all the packets that has local source
1864 * addresses, because every properly looped back packet
1865 * must have correct destination already attached by output routine.
1866 *
1867 * Such approach solves two big problems:
1868 * 1. Not simplex devices are handled properly.
1869 * 2. IP spoofing attempts are filtered with 100% of guarantee.
ebc0ffae 1870 * called with rcu_read_lock()
1da177e4
LT
1871 */
1872
9e12bb22 1873static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
5510cdf7
DA
1874 u8 tos, struct net_device *dev,
1875 struct fib_result *res)
1da177e4 1876{
96d36220 1877 struct in_device *in_dev = __in_dev_get_rcu(dev);
1b7179d3 1878 struct ip_tunnel_info *tun_info;
68a5e3dd 1879 struct flowi4 fl4;
95c96174 1880 unsigned int flags = 0;
1da177e4 1881 u32 itag = 0;
95c96174 1882 struct rtable *rth;
1da177e4 1883 int err = -EINVAL;
5e73ea1a 1884 struct net *net = dev_net(dev);
d2d68ba9 1885 bool do_cache;
1da177e4
LT
1886
1887 /* IP on this device is disabled. */
1888
1889 if (!in_dev)
1890 goto out;
1891
1892 /* Check for the most weird martians, which can be not detected
1893 by fib_lookup.
1894 */
1895
61adedf3 1896 tun_info = skb_tunnel_info(skb);
46fa062a 1897 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1b7179d3
TG
1898 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1899 else
1900 fl4.flowi4_tun_key.tun_id = 0;
f38a9eb1
TG
1901 skb_dst_drop(skb);
1902
d0daebc3 1903 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1da177e4
LT
1904 goto martian_source;
1905
5510cdf7
DA
1906 res->fi = NULL;
1907 res->table = NULL;
27a954bd 1908 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1da177e4
LT
1909 goto brd_input;
1910
1911 /* Accept zero addresses only to limited broadcast;
1912 * I even do not know to fix it or not. Waiting for complains :-)
1913 */
f97c1e0c 1914 if (ipv4_is_zeronet(saddr))
1da177e4
LT
1915 goto martian_source;
1916
d0daebc3 1917 if (ipv4_is_zeronet(daddr))
1da177e4
LT
1918 goto martian_destination;
1919
9eb43e76
ED
1920 /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1921 * and call it once if daddr or/and saddr are loopback addresses
1922 */
1923 if (ipv4_is_loopback(daddr)) {
1924 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
d0daebc3 1925 goto martian_destination;
9eb43e76
ED
1926 } else if (ipv4_is_loopback(saddr)) {
1927 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
d0daebc3
TG
1928 goto martian_source;
1929 }
1930
1da177e4
LT
1931 /*
1932 * Now we are ready to route packet.
1933 */
68a5e3dd 1934 fl4.flowi4_oif = 0;
e0d56fdd 1935 fl4.flowi4_iif = dev->ifindex;
68a5e3dd
DM
1936 fl4.flowi4_mark = skb->mark;
1937 fl4.flowi4_tos = tos;
1938 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
b84f7878 1939 fl4.flowi4_flags = 0;
68a5e3dd
DM
1940 fl4.daddr = daddr;
1941 fl4.saddr = saddr;
8bcfd092 1942 fl4.flowi4_uid = sock_net_uid(net, NULL);
5510cdf7 1943 err = fib_lookup(net, &fl4, res, 0);
cd0f0b95
DJ
1944 if (err != 0) {
1945 if (!IN_DEV_FORWARD(in_dev))
1946 err = -EHOSTUNREACH;
1da177e4 1947 goto no_route;
cd0f0b95 1948 }
1da177e4 1949
5510cdf7 1950 if (res->type == RTN_BROADCAST)
1da177e4
LT
1951 goto brd_input;
1952
5510cdf7 1953 if (res->type == RTN_LOCAL) {
5c04c819 1954 err = fib_validate_source(skb, saddr, daddr, tos,
0d5edc68 1955 0, dev, in_dev, &itag);
b5f7e755 1956 if (err < 0)
0d753960 1957 goto martian_source;
1da177e4
LT
1958 goto local_input;
1959 }
1960
cd0f0b95
DJ
1961 if (!IN_DEV_FORWARD(in_dev)) {
1962 err = -EHOSTUNREACH;
251da413 1963 goto no_route;
cd0f0b95 1964 }
5510cdf7 1965 if (res->type != RTN_UNICAST)
1da177e4
LT
1966 goto martian_destination;
1967
5510cdf7 1968 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1da177e4
LT
1969out: return err;
1970
1971brd_input:
1972 if (skb->protocol != htons(ETH_P_IP))
1973 goto e_inval;
1974
41347dcd 1975 if (!ipv4_is_zeronet(saddr)) {
9e56e380
DM
1976 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1977 in_dev, &itag);
1da177e4 1978 if (err < 0)
0d753960 1979 goto martian_source;
1da177e4
LT
1980 }
1981 flags |= RTCF_BROADCAST;
5510cdf7 1982 res->type = RTN_BROADCAST;
1da177e4
LT
1983 RT_CACHE_STAT_INC(in_brd);
1984
1985local_input:
d2d68ba9 1986 do_cache = false;
5510cdf7 1987 if (res->fi) {
fe3edf45 1988 if (!itag) {
5510cdf7 1989 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
d2d68ba9 1990 if (rt_cache_valid(rth)) {
c6cffba4
DM
1991 skb_dst_set_noref(skb, &rth->dst);
1992 err = 0;
1993 goto out;
d2d68ba9
DM
1994 }
1995 do_cache = true;
1996 }
1997 }
1998
f5a0aab8 1999 rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
5510cdf7 2000 flags | RTCF_LOCAL, res->type,
d2d68ba9 2001 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1da177e4
LT
2002 if (!rth)
2003 goto e_nobufs;
2004
d8d1f30b 2005 rth->dst.output= ip_rt_bug;
cf911662
DM
2006#ifdef CONFIG_IP_ROUTE_CLASSID
2007 rth->dst.tclassid = itag;
2008#endif
9917e1e8 2009 rth->rt_is_input = 1;
5510cdf7
DA
2010 if (res->table)
2011 rth->rt_table_id = res->table->tb_id;
571e7226 2012
a6254864 2013 RT_CACHE_STAT_INC(in_slow_tot);
5510cdf7 2014 if (res->type == RTN_UNREACHABLE) {
d8d1f30b
CG
2015 rth->dst.input= ip_error;
2016 rth->dst.error= -err;
1da177e4
LT
2017 rth->rt_flags &= ~RTCF_LOCAL;
2018 }
efd85700 2019
dcdfdf56 2020 if (do_cache) {
5510cdf7 2021 struct fib_nh *nh = &FIB_RES_NH(*res);
efd85700
TG
2022
2023 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2024 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2025 WARN_ON(rth->dst.input == lwtunnel_input);
2026 rth->dst.lwtstate->orig_input = rth->dst.input;
2027 rth->dst.input = lwtunnel_input;
2028 }
2029
2030 if (unlikely(!rt_cache_route(nh, rth))) {
dcdfdf56
AS
2031 rth->dst.flags |= DST_NOCACHE;
2032 rt_add_uncached_list(rth);
2033 }
2034 }
89aef892 2035 skb_dst_set(skb, &rth->dst);
b23dd4fe 2036 err = 0;
ebc0ffae 2037 goto out;
1da177e4
LT
2038
2039no_route:
2040 RT_CACHE_STAT_INC(in_no_route);
5510cdf7
DA
2041 res->type = RTN_UNREACHABLE;
2042 res->fi = NULL;
2043 res->table = NULL;
1da177e4
LT
2044 goto local_input;
2045
2046 /*
2047 * Do not cache martian addresses: they should be logged (RFC1812)
2048 */
2049martian_destination:
2050 RT_CACHE_STAT_INC(in_martian_dst);
2051#ifdef CONFIG_IP_ROUTE_VERBOSE
e87cc472
JP
2052 if (IN_DEV_LOG_MARTIANS(in_dev))
2053 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2054 &daddr, &saddr, dev->name);
1da177e4 2055#endif
2c2910a4 2056
1da177e4
LT
2057e_inval:
2058 err = -EINVAL;
ebc0ffae 2059 goto out;
1da177e4
LT
2060
2061e_nobufs:
2062 err = -ENOBUFS;
ebc0ffae 2063 goto out;
1da177e4
LT
2064
2065martian_source:
2066 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
ebc0ffae 2067 goto out;
1da177e4
LT
2068}
2069
c6cffba4
DM
/* Public entry point for input-path route lookup.  Wraps
 * ip_route_input_rcu() in an RCU read-side critical section so callers
 * need not hold rcu_read_lock() themselves.  On success the resolved
 * route is attached to @skb as its dst; the local fib_result is only
 * scratch space for the lookup.
 */
int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			 u8 tos, struct net_device *dev)
{
	struct fib_result res;
	int err;

	/* Only the routing-relevant bits of the TOS byte take part in
	 * the lookup.
	 */
	tos &= IPTOS_RT_MASK;
	rcu_read_lock();
	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
	rcu_read_unlock();

	return err;
}
EXPORT_SYMBOL(ip_route_input_noref);
2084
/* called with rcu_read_lock held */
int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
		       u8 tos, struct net_device *dev, struct fib_result *res)
{
	/* Multicast recognition logic is moved from route cache to here.
	   The problem was that too many Ethernet cards have broken/missing
	   hardware multicast filters :-( As result the host on multicasting
	   network acquires a lot of useless route cache entries, sort of
	   SDR messages from all the world. Now we try to get rid of them.
	   Really, provided software IP multicast filter is organized
	   reasonably (at least, hashed), it does not result in a slowdown
	   comparing with route cache reject entries.
	   Note, that multicast routers are not affected, because
	   route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);
		int our = 0;
		int err = -EINVAL;

		/* Is this host a member of the destination group on the
		 * receiving device?
		 */
		if (in_dev)
			our = ip_check_mc_rcu(in_dev, daddr, saddr,
					      ip_hdr(skb)->protocol);

		/* check l3 master if no match yet */
		if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
			struct in_device *l3_in_dev;

			l3_in_dev = __in_dev_get_rcu(skb->dev);
			if (l3_in_dev)
				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
						      ip_hdr(skb)->protocol);
		}

		/* Accept the packet if we are a group member, or (with
		 * multicast routing compiled in) if the device forwards
		 * non-link-local multicast.
		 */
		if (our
#ifdef CONFIG_IP_MROUTE
			||
		    (!ipv4_is_local_multicast(daddr) &&
		     IN_DEV_MFORWARD(in_dev))
#endif
		   ) {
			err = ip_route_input_mc(skb, daddr, saddr,
						tos, dev, our);
		}
		return err;
	}

	/* Unicast/broadcast destinations take the full slow path. */
	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
}
2134
/* called with rcu_read_lock() */
/* Build (or fetch from the nexthop cache) the rtable for an output route
 * described by @res/@fl4, to be emitted through @dev_out.  Returns the
 * rtable or an ERR_PTR on failure.
 */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4, int orig_oif,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct fib_nh_exception *fnhe;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;
	bool do_cache;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	/* Loopback source addresses may only leave via loopback or an
	 * l3 master device, unless route_localnet is enabled.
	 */
	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(fl4->saddr) &&
		    !(dev_out->flags & IFF_LOOPBACK) &&
		    !netif_is_l3_master(dev_out))
			return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	do_cache = true;
	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		else
			do_cache = false;
		/* If multicast route do not exist use
		 * default one, but do not gateway in this case.
		 * Yes, it is hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
		   (orig_oif != dev_out->ifindex)) {
		/* For local routes that require a particular output interface
		 * we do not want to cache the result.  Caching the result
		 * causes incorrect behaviour when there are multiple source
		 * addresses on the interface, the end result being that if the
		 * intended recipient is waiting on that interface for the
		 * packet he won't receive it because it will be delivered on
		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
		 * be set to the loopback interface as well.
		 */
		fi = NULL;
	}

	fnhe = NULL;
	do_cache &= fi != NULL;
	if (do_cache) {
		struct rtable __rcu **prth;
		struct fib_nh *nh = &FIB_RES_NH(*res);

		/* Prefer a per-destination exception entry (e.g. PMTU or
		 * redirect learned state) over the per-cpu nexthop cache.
		 */
		fnhe = find_exception(nh, fl4->daddr);
		if (fnhe) {
			prth = &fnhe->fnhe_rth_output;
			rth = rcu_dereference(*prth);
			if (rth && rth->dst.expires &&
			    time_after(jiffies, rth->dst.expires)) {
				/* Exception expired: drop it and fall back
				 * to building a fresh route below.
				 */
				ip_del_fnhe(nh, fl4->daddr);
				fnhe = NULL;
			} else {
				goto rt_cache;
			}
		}

		/* FLOWI_FLAG_KNOWN_NH without an on-link gateway cannot be
		 * served from the shared per-cpu cache.
		 */
		if (unlikely(fl4->flowi4_flags &
			     FLOWI_FLAG_KNOWN_NH &&
			     !(nh->nh_gw &&
			       nh->nh_scope == RT_SCOPE_LINK))) {
			do_cache = false;
			goto add;
		}
		prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
		rth = rcu_dereference(*prth);

rt_cache:
		/* rth was set on every path reaching this label. */
		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
			return rth;
	}

add:
	rth = rt_dst_alloc(dev_out, flags, type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM),
			   do_cache);
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->rt_iif = orig_oif ? : 0;
	if (res->table)
		rth->rt_table_id = res->table->tb_id;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
	set_lwt_redirect(rth);

	return rth;
}
2269
1da177e4
LT
2270/*
2271 * Major route resolver routine.
2272 */
2273
3abd1ade
DA
/* Output-route lookup entry point: normalize the flow's TOS/scope fields,
 * then perform the lookup under rcu_read_lock() via
 * ip_route_output_key_hash_rcu().  Returns the rtable or an ERR_PTR.
 */
struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
					const struct sk_buff *skb)
{
	__u8 tos = RT_FL_TOS(fl4);
	struct fib_result res;
	struct rtable *rth;

	res.tclassid	= 0;
	res.fi		= NULL;
	res.table	= NULL;

	fl4->flowi4_iif = LOOPBACK_IFINDEX;
	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	/* RTO_ONLINK in the TOS requests a link-scope-only lookup. */
	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
			 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);

	rcu_read_lock();
	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
	rcu_read_unlock();

	return rth;
}
EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2297
/* Core output-route resolver; caller holds rcu_read_lock().  Validates
 * saddr/oif constraints, performs the FIB lookup, selects the output
 * device and source address, and finally builds the rtable via
 * __mkroute_output().  Returns the rtable or an ERR_PTR.
 */
struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
					    struct fib_result *res,
					    const struct sk_buff *skb)
{
	struct net_device *dev_out = NULL;
	int orig_oif = fl4->flowi4_oif;
	unsigned int flags = 0;
	struct rtable *rth;
	int err = -ENETUNREACH;

	if (fl4->saddr) {
		rth = ERR_PTR(-EINVAL);
		/* Multicast, limited-broadcast and zeronet source
		 * addresses are never valid.
		 */
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr))
			goto out;

		/* I removed check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return wrong iface, if saddr
		      is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with saddr
		      of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (!dev_out)
				goto out;

			/* Special hack: user can direct multicasts
			   and limited broadcast via necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic,vat and friends to work.
			   They bind socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of routing cache they are broken,
			   because we are not allowed to build multicast path
			   with loopback source addr (look, routing cache
			   cannot know, that ttl is zero, so that packet
			   will not leave this host and route is valid).
			   Luckily, this hack is good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}


	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (!dev_out)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		/* Link-local multicast, limited broadcast and IGMP bypass
		 * the FIB: route directly out of the requested device.
		 */
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr) ||
		    fl4->flowi4_proto == IPPROTO_IGMP) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl4->daddr) {
		/* No destination at all: loop the packet back to ourselves. */
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = LOOPBACK_IFINDEX;
		res->type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	err = fib_lookup(net, fl4, res, 0);
	if (err) {
		res->fi = NULL;
		res->table = NULL;
		if (fl4->flowi4_oif &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
			/* Apparently, routing tables are wrong. Assume,
			   that the destination is on link.

			   WHY? DW.
			   Because we are allowed to send to iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch if destination is gatewayed, rather than
			   direct. Moreover, if MSG_DONTROUTE is set,
			   we send packet, ignoring both routing tables
			   and ifaddr state. --ANK


			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res->type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(err);
		goto out;
	}

	if (res->type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res->fi->fib_prefsrc)
				fl4->saddr = res->fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}

		/* L3 master device is the loopback for that domain */
		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
			net->loopback_dev;
		fl4->flowi4_oif = dev_out->ifindex;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	/* Multipath/nexthop selection for unicast routes. */
	fib_select_path(net, res, fl4, skb);

	dev_out = FIB_RES_DEV(*res);
	fl4->flowi4_oif = dev_out->ifindex;


make_route:
	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);

out:
	return rth;
}
d8c97a94 2461
ae2688d5
JW
/* Blackhole dst_ops .check: always report the entry as invalid (NULL)
 * so it is never reused after a cache check.
 */
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}
2466
ebb762f2 2467static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
ec831ea7 2468{
618f9bc7
SK
2469 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2470
2471 return mtu ? : dst->dev->mtu;
ec831ea7
RD
2472}
2473
6700c270
DM
/* Blackhole dst_ops .update_pmtu: intentionally a no-op — blackhole
 * routes ignore PMTU updates.
 */
static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					  struct sk_buff *skb, u32 mtu)
{
}
2478
6700c270
DM
/* Blackhole dst_ops .redirect: intentionally a no-op — blackhole routes
 * ignore ICMP redirects.
 */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				       struct sk_buff *skb)
{
}
2483
0972ddb2
HB
/* Blackhole dst_ops .cow_metrics: never copy-on-write metrics; return
 * NULL so the metrics stay read-only.
 */
static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}
2489
14e50e57
DM
/* dst_ops for blackhole routes created by ipv4_blackhole_route(): the
 * no-op handlers above plus the regular advmss/neigh helpers.
 */
static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			=	AF_INET,
	.check			=	ipv4_blackhole_dst_check,
	.mtu			=	ipv4_blackhole_mtu,
	.default_advmss		=	ipv4_default_advmss,
	.update_pmtu		=	ipv4_rt_blackhole_update_pmtu,
	.redirect		=	ipv4_rt_blackhole_redirect,
	.cow_metrics		=	ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		=	ipv4_neigh_lookup,
};
2500
/* Replace @dst_orig with a "blackhole" clone that silently discards all
 * traffic (input and output are dst_discard*), copying over the routing
 * attributes so lookups against it still make sense.  Consumes the
 * reference on @dst_orig; returns the new dst or ERR_PTR(-ENOMEM).
 */
struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *ort = (struct rtable *) dst_orig;
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, DST_NOGC);
	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		/* All traffic through this dst is dropped. */
		new->input = dst_discard;
		new->output = dst_discard_out;

		new->dev = net->loopback_dev;
		if (new->dev)
			dev_hold(new->dev);

		/* Mirror the original route's identity/attributes. */
		rt->rt_is_input = ort->rt_is_input;
		rt->rt_iif = ort->rt_iif;
		rt->rt_pmtu = ort->rt_pmtu;

		rt->rt_genid = rt_genid_ipv4(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_gateway = ort->rt_gateway;
		rt->rt_uses_gateway = ort->rt_uses_gateway;

		INIT_LIST_HEAD(&rt->rt_uncached);
	}

	/* Drop the caller's reference to the original dst in all cases. */
	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
2535
/* Resolve an output route for @flp4 and, when a transport protocol is
 * set, pass the result through the xfrm (IPsec) lookup so policy can
 * transform or blackhole it.  Returns the rtable or an ERR_PTR.
 */
struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
				    const struct sock *sk)
{
	struct rtable *rt = __ip_route_output_key(net, flp4);

	if (IS_ERR(rt))
		return rt;

	if (flp4->flowi4_proto)
		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
							flowi4_to_flowi(flp4),
							sk, 0);

	return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);
2552
/* called with rcu_read_lock held */
/* Serialize the route attached to @skb into an RTM_NEWROUTE netlink
 * message appended to that same skb.  Returns 0 on success, -EMSGSIZE
 * if the message does not fit.
 */
static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
			struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
			u32 seq)
{
	struct rtable *rt = skb_rtable(skb);
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	u32 error;
	u32 metrics[RTAX_MAX];

	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
	if (!nlh)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= fl4->flowi4_tos;
	/* Table ids >= 256 do not fit the legacy u8 field; report
	 * RT_TABLE_COMPAT there and the real id in the RTA_TABLE attr.
	 */
	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table_id))
		goto nla_put_failure;
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;
	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
		r->rtm_flags |= RTCF_DOREDIRECT;

	if (nla_put_in_addr(skb, RTA_DST, dst))
		goto nla_put_failure;
	if (src) {
		r->rtm_src_len = 32;
		if (nla_put_in_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
	}
	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid &&
	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
		goto nla_put_failure;
#endif
	if (!rt_is_input_route(rt) &&
	    fl4->saddr != src) {
		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
			goto nla_put_failure;
	}
	if (rt->rt_uses_gateway &&
	    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
		goto nla_put_failure;

	/* Convert the absolute expiry to a remaining-time delta. */
	expires = rt->dst.expires;
	if (expires) {
		unsigned long now = jiffies;

		if (time_before(now, expires))
			expires -= now;
		else
			expires = 0;
	}

	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	/* A still-valid learned PMTU overrides the static MTU metric. */
	if (rt->rt_pmtu && expires)
		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (fl4->flowi4_mark &&
	    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
		goto nla_put_failure;

	if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
	    nla_put_u32(skb, RTA_UID,
			from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
		goto nla_put_failure;

	error = rt->dst.error;

	if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
		if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
		    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
			int err = ipmr_get_route(net, skb,
						 fl4->saddr, fl4->daddr,
						 r, portid);

			if (err <= 0) {
				if (err == 0)
					return 0;
				goto nla_put_failure;
			}
		} else
#endif
			if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
				goto nla_put_failure;
	}

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
2666
c21ef3e3
DA
/* Handle an RTM_GETROUTE netlink request: build a dummy skb, resolve the
 * route for the requested src/dst (input path when RTA_IIF is given,
 * output path otherwise), serialize the result and unicast it back to
 * the requester.
 */
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			     struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct rtmsg *rtm;
	struct nlattr *tb[RTA_MAX+1];
	struct fib_result res = {};
	struct rtable *rt = NULL;
	struct flowi4 fl4;
	__be32 dst = 0;
	__be32 src = 0;
	u32 iif;
	int err;
	int mark;
	struct sk_buff *skb;
	u32 table_id = RT_TABLE_MAIN;
	kuid_t uid;

	err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
			  extack);
	if (err < 0)
		goto errout;

	rtm = nlmsg_data(nlh);

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb) {
		err = -ENOBUFS;
		goto errout;
	}

	/* Reserve room for dummy headers, this skb can pass
	   through good chunk of routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);

	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
	if (tb[RTA_UID])
		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
	else
		uid = (iif ? INVALID_UID : current_uid());

	/* Bugfix: need to give ip_route_input enough of an IP header to
	 * not gag.
	 */
	ip_hdr(skb)->protocol = IPPROTO_UDP;
	ip_hdr(skb)->saddr = src;
	ip_hdr(skb)->daddr = dst;

	skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));

	memset(&fl4, 0, sizeof(fl4));
	fl4.daddr = dst;
	fl4.saddr = src;
	fl4.flowi4_tos = rtm->rtm_tos;
	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
	fl4.flowi4_mark = mark;
	fl4.flowi4_uid = uid;

	rcu_read_lock();

	if (iif) {
		/* RTA_IIF present: simulate packet reception on that
		 * device and use the input routing path.
		 */
		struct net_device *dev;

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			err = -ENODEV;
			goto errout_free;
		}

		skb->protocol	= htons(ETH_P_IP);
		skb->dev	= dev;
		skb->mark	= mark;
		err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
					 dev, &res);

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
	}

	if (err)
		goto errout_free;

	skb_dst_set(skb, &rt->dst);
	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
		table_id = rt->rt_table_id;

	/* RTM_F_FIB_MATCH asks for the matching FIB entry itself rather
	 * than the resolved route.
	 */
	if (rtm->rtm_flags & RTM_F_FIB_MATCH)
		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
				    rt->rt_type, res.prefix, res.prefixlen,
				    fl4.flowi4_tos, res.fi, 0);
	else
		err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
				   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
	if (err < 0)
		goto errout_free;

	rcu_read_unlock();

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
errout:
	return err;

errout_free:
	rcu_read_unlock();
	kfree_skb(skb);
	goto errout;
}
2789
1da177e4
LT
/* Multicast configuration changed on @in_dev: flush the routing cache
 * for that device's namespace.
 */
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev));
}
2794
2795#ifdef CONFIG_SYSCTL
082c7ca4
G
/* Defaults for the GC tunables exported through ipv4_route_table below. */
static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_gc_elasticity __read_mostly	= 8;
2799
/* Handler for the write-only net.ipv4.route.flush sysctl: any write
 * flushes the namespace's route cache and bumps the fnhe generation so
 * cached exceptions are invalidated too.  Reads are rejected.
 */
static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
					void __user *buffer,
					size_t *lenp, loff_t *ppos)
{
	struct net *net = (struct net *)__ctl->extra1;

	if (write) {
		rt_cache_flush(net);
		fnhe_genid_bump(net);
		return 0;
	}

	return -EINVAL;
}
2814
/* Global (non-per-netns) route sysctls under net.ipv4.route. */
static struct ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */

		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
39a23e75 2925
39a23e75
DL
/* Per-netns template for the write-only net.ipv4.route.flush sysctl;
 * duplicated per namespace in sysctl_route_net_init().
 */
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};
2935
/* Register the per-namespace net.ipv4.route sysctl table.  Non-init
 * namespaces get their own kmemdup'ed copy of the template; extra1 of
 * the flush entry carries the owning netns for the handler.
 */
static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (!tbl)
			goto err_dup;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			tbl[0].procname = NULL;
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
	if (!net->ipv4.route_hdr)
		goto err_reg;
	return 0;

err_reg:
	/* Only free a duplicated table, never the static template. */
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}
2963
2964static __net_exit void sysctl_route_net_exit(struct net *net)
2965{
2966 struct ctl_table *tbl;
2967
2968 tbl = net->ipv4.route_hdr->ctl_table_arg;
2969 unregister_net_sysctl_table(net->ipv4.route_hdr);
2970 BUG_ON(tbl == ipv4_route_flush_table);
2971 kfree(tbl);
2972}
2973
/* Hook the flush sysctl into namespace creation/destruction */
static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
1da177e4
LT
2978#endif
2979
/* Per-namespace init: start the route and fnhe generation counters at
 * zero and seed the device-address generation id with random bytes.
 */
static __net_init int rt_genid_init(struct net *net)
{
	atomic_set(&net->ipv4.rt_genid, 0);
	atomic_set(&net->fnhe_genid, 0);
	get_random_bytes(&net->ipv4.dev_addr_genid,
			 sizeof(net->ipv4.dev_addr_genid));
	return 0;
}
2988
3ee94372
NH
/* Generation counters only need per-namespace init; no teardown needed */
static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};
2992
c3426b47
DM
2993static int __net_init ipv4_inetpeer_init(struct net *net)
2994{
2995 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2996
2997 if (!bp)
2998 return -ENOMEM;
2999 inet_peer_base_init(bp);
3000 net->ipv4.peers = bp;
3001 return 0;
3002}
3003
3004static void __net_exit ipv4_inetpeer_exit(struct net *net)
3005{
3006 struct inet_peer_base *bp = net->ipv4.peers;
3007
3008 net->ipv4.peers = NULL;
56a6b248 3009 inetpeer_invalidate_tree(bp);
c3426b47
DM
3010 kfree(bp);
3011}
3012
/* Per-namespace lifetime of the inet_peer tree */
static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init = ipv4_inetpeer_init,
	.exit = ipv4_inetpeer_exit,
};
9f5e97e5 3017
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-cpu accounting buckets for route classids; allocated in
 * ip_rt_init() (256 entries per cpu).
 */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
1da177e4 3021
1da177e4
LT
3022int __init ip_rt_init(void)
3023{
424c4b70 3024 int rc = 0;
5055c371 3025 int cpu;
1da177e4 3026
73f156a6
ED
3027 ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
3028 if (!ip_idents)
3029 panic("IP: failed to allocate ip_idents\n");
3030
3031 prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3032
355b590c
ED
3033 ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3034 if (!ip_tstamps)
3035 panic("IP: failed to allocate ip_tstamps\n");
3036
5055c371
ED
3037 for_each_possible_cpu(cpu) {
3038 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3039
3040 INIT_LIST_HEAD(&ul->head);
3041 spin_lock_init(&ul->lock);
3042 }
c7066f70 3043#ifdef CONFIG_IP_ROUTE_CLASSID
0dcec8c2 3044 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
1da177e4
LT
3045 if (!ip_rt_acct)
3046 panic("IP: failed to allocate ip_rt_acct\n");
1da177e4
LT
3047#endif
3048
e5d679f3
AD
3049 ipv4_dst_ops.kmem_cachep =
3050 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
20c2df83 3051 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1da177e4 3052
14e50e57
DM
3053 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3054
fc66f95c
ED
3055 if (dst_entries_init(&ipv4_dst_ops) < 0)
3056 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3057
3058 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3059 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3060
89aef892
DM
3061 ipv4_dst_ops.gc_thresh = ~0;
3062 ip_rt_max_size = INT_MAX;
1da177e4 3063
1da177e4
LT
3064 devinet_init();
3065 ip_fib_init();
3066
73b38711 3067 if (ip_rt_proc_init())
058bd4d2 3068 pr_err("Unable to create route proc files\n");
1da177e4
LT
3069#ifdef CONFIG_XFRM
3070 xfrm_init();
703fb94e 3071 xfrm4_init();
1da177e4 3072#endif
c7ac8679 3073 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
63f3444f 3074
39a23e75
DL
3075#ifdef CONFIG_SYSCTL
3076 register_pernet_subsys(&sysctl_route_ops);
3077#endif
3ee94372 3078 register_pernet_subsys(&rt_genid_ops);
c3426b47 3079 register_pernet_subsys(&ipv4_inetpeer_ops);
1da177e4
LT
3080 return rc;
3081}
3082
a1bc6eb4 3083#ifdef CONFIG_SYSCTL
eeb61f71
AV
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 *
 * Registers the static ipv4_route_table sysctls for the initial
 * namespace, early in boot, before the pernet machinery handles the
 * per-namespace entries.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
a1bc6eb4 3092#endif