]> git.ipfire.org Git - thirdparty/kernel/stable.git/blame - net/ipv4/route.c
tls: don't use stack memory in a scatterlist
[thirdparty/kernel/stable.git] / net / ipv4 / route.c
CommitLineData
1da177e4
LT
1/*
2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
5 *
6 * ROUTE - implementation of the IP router.
7 *
02c30a84 8 * Authors: Ross Biro
1da177e4
LT
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 *
14 * Fixes:
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
e905a9ed 21 * Alan Cox : Super /proc >4K
1da177e4
LT
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window
24 * clamper.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
e905a9ed 39 *
1da177e4
LT
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
bb1d23b0 55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
cef2685e
IS
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
1da177e4
LT
58 *
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
63 */
64
afd46503
JP
65#define pr_fmt(fmt) "IPv4: " fmt
66
1da177e4 67#include <linux/module.h>
7c0f6ba6 68#include <linux/uaccess.h>
1da177e4
LT
69#include <linux/bitops.h>
70#include <linux/types.h>
71#include <linux/kernel.h>
1da177e4
LT
72#include <linux/mm.h>
73#include <linux/string.h>
74#include <linux/socket.h>
75#include <linux/sockios.h>
76#include <linux/errno.h>
77#include <linux/in.h>
78#include <linux/inet.h>
79#include <linux/netdevice.h>
80#include <linux/proc_fs.h>
81#include <linux/init.h>
82#include <linux/skbuff.h>
1da177e4
LT
83#include <linux/inetdevice.h>
84#include <linux/igmp.h>
85#include <linux/pkt_sched.h>
86#include <linux/mroute.h>
87#include <linux/netfilter_ipv4.h>
88#include <linux/random.h>
1da177e4
LT
89#include <linux/rcupdate.h>
90#include <linux/times.h>
5a0e3ad6 91#include <linux/slab.h>
73f156a6 92#include <linux/jhash.h>
352e512c 93#include <net/dst.h>
1b7179d3 94#include <net/dst_metadata.h>
457c4cbc 95#include <net/net_namespace.h>
1da177e4
LT
96#include <net/protocol.h>
97#include <net/ip.h>
98#include <net/route.h>
99#include <net/inetpeer.h>
100#include <net/sock.h>
101#include <net/ip_fib.h>
102#include <net/arp.h>
103#include <net/tcp.h>
104#include <net/icmp.h>
105#include <net/xfrm.h>
571e7226 106#include <net/lwtunnel.h>
8d71740c 107#include <net/netevent.h>
63f3444f 108#include <net/rtnetlink.h>
1da177e4
LT
109#ifdef CONFIG_SYSCTL
110#include <linux/sysctl.h>
111#endif
6e5714ea 112#include <net/secure_seq.h>
1b7179d3 113#include <net/ip_tunnels.h>
385add90 114#include <net/l3mdev.h>
1da177e4 115
b6179813
RP
116#include "fib_lookup.h"
117
68a5e3dd 118#define RT_FL_TOS(oldflp4) \
f61759e6 119 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
1da177e4 120
1da177e4
LT
121#define RT_GC_TIMEOUT (300*HZ)
122
1da177e4 123static int ip_rt_max_size;
817bc4db
SH
124static int ip_rt_redirect_number __read_mostly = 9;
125static int ip_rt_redirect_load __read_mostly = HZ / 50;
126static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
127static int ip_rt_error_cost __read_mostly = HZ;
128static int ip_rt_error_burst __read_mostly = 5 * HZ;
817bc4db 129static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
c7272c2f 130static u32 ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
817bc4db 131static int ip_rt_min_advmss __read_mostly = 256;
9f28a2fc 132
deed49df 133static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
c7272c2f 134
1da177e4
LT
135/*
136 * Interface to generic destination cache.
137 */
138
139static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
0dbaee3b 140static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
ebb762f2 141static unsigned int ipv4_mtu(const struct dst_entry *dst);
1da177e4
LT
142static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
143static void ipv4_link_failure(struct sk_buff *skb);
6700c270
DM
144static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
145 struct sk_buff *skb, u32 mtu);
146static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
147 struct sk_buff *skb);
caacf05e 148static void ipv4_dst_destroy(struct dst_entry *dst);
1da177e4 149
62fa8a84
DM
/* Copy-on-write of metrics is never expected for IPv4 dsts; warn loudly
 * and return failure if it ever happens.
 */
static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}
155
f894cbf8
DM
156static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
157 struct sk_buff *skb,
158 const void *daddr);
63fca65d 159static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
d3aaeb38 160
1da177e4
LT
/* Destination-cache operations for IPv4 routes; wired into every
 * struct rtable's dst_entry.
 */
static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.redirect =		ip_do_redirect,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
	.confirm_neigh =	ipv4_confirm_neigh,
};
176
177#define ECN_OR_COST(class) TC_PRIO_##class
178
/* Map the 4-bit IP TOS field (index) to a traffic-control priority.
 * Odd entries are the ECN variants, via ECN_OR_COST().
 */
const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
1da177e4 198
2f970d83 199static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
3ed66e91 200#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
1da177e4 201
1da177e4 202#ifdef CONFIG_PROC_FS
1da177e4
LT
203static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
204{
29e75252 205 if (*pos)
89aef892 206 return NULL;
29e75252 207 return SEQ_START_TOKEN;
1da177e4
LT
208}
209
210static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
211{
1da177e4 212 ++*pos;
89aef892 213 return NULL;
1da177e4
LT
214}
215
/* Nothing to release for this iterator. */
static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

/* Emit only the legacy header line; there are no cache entries to dump. */
static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	return 0;
}
229
/* seq_file plumbing for /proc/net/rt_cache (header-only output). */
static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
248
249
/* /proc/net/stat/rt_cache iterator start.
 * Position 0 is the header; position N maps to CPU N-1, skipping
 * impossible CPUs. *pos is updated so _next can resume after this CPU.
 */
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}
265
/* Advance to the next possible CPU's stats, or NULL when exhausted.
 * Note the off-by-one vs _start: here *pos already names the next CPU.
 */
static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;

}

/* Nothing to release for this iterator. */
static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}
284
/* Print one per-CPU stats record (or the header for the start token).
 * Several columns are hardwired to 0: those counters were removed along
 * with the old routing cache, but the column layout is kept for ABI
 * compatibility with existing parsers.
 */
static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   0, /* st->in_hit */
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,

		   0, /* st->out_hit */
		   st->out_slow_tot,
		   st->out_slow_mc,

		   0, /* st->gc_total */
		   0, /* st->gc_ignored */
		   0, /* st->gc_goal_miss */
		   0, /* st->gc_dst_overflow */
		   0, /* st->in_hlist_search */
		   0  /* st->out_hlist_search */
		);
	return 0;
}
318
/* seq_file plumbing for /proc/net/stat/rt_cache (per-CPU counters). */
static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
338
c7066f70 339#ifdef CONFIG_IP_ROUTE_CLASSID
/* /proc/net/rt_acct: sum the 256 per-class ip_rt_acct counters across
 * all possible CPUs into a temporary buffer and emit it raw (binary).
 */
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}
78c686e9 363
a661c419
AD
/* single_open wrapper: the whole rt_acct dump is produced in one show. */
static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
	return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
	.open	 = rt_acct_proc_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release,
};
78c686e9 375#endif
107f1634 376
/* Create the per-netns proc entries: /proc/net/rt_cache,
 * /proc/net/stat/rt_cache and (with CONFIG_IP_ROUTE_CLASSID)
 * /proc/net/rt_acct. On failure, unwind whatever was created.
 */
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_create("rt_cache", 0444, net->proc_net,
			  &rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", 0444,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}
73b38711
DL
407
/* Remove the proc entries created by ip_rt_do_proc_init(). */
static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}
416
/* Hook the proc setup/teardown into network-namespace lifetime. */
static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
/* CONFIG_PROC_FS disabled: nothing to register. */
static inline int ip_rt_proc_init(void)
{
	return 0;
}
1da177e4 432#endif /* CONFIG_PROC_FS */
e905a9ed 433
/* A cached route is stale once its generation id no longer matches the
 * current generation of its network namespace.
 */
static inline bool rt_is_expired(const struct rtable *rth)
{
	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

/* Invalidate every cached IPv4 route in @net by bumping the generation. */
void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}
443
f894cbf8
DM
/* Find (or create) the ARP neighbour entry for this dst.
 * Key preference: the route's gateway if set, else the packet's
 * destination address, else the raw @daddr passed in.
 */
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr)
{
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;
	struct neighbour *n;

	rt = (const struct rtable *) dst;
	if (rt->rt_gateway)
		pkey = (const __be32 *) &rt->rt_gateway;
	else if (skb)
		pkey = &ip_hdr(skb)->daddr;

	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
	if (n)
		return n;
	/* Not cached: create a new neighbour entry for this key. */
	return neigh_create(&arp_tbl, pkey, dev);
}
464
63fca65d
JA
/* Confirm reachability of the neighbour behind this dst.
 * Uses the gateway when the route has one; otherwise falls back to
 * @daddr, bailing out for NULL keys and mcast/bcast/local routes where
 * the destination address does not identify a single neighbour.
 */
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;

	rt = (const struct rtable *)dst;
	if (rt->rt_gateway)
		pkey = (const __be32 *)&rt->rt_gateway;
	else if (!daddr ||
		 (rt->rt_flags &
		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
		return;

	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}
481
04ca6973 482#define IP_IDENTS_SZ 2048u
04ca6973 483
355b590c
ED
484static atomic_t *ip_idents __read_mostly;
485static u32 *ip_tstamps __read_mostly;
04ca6973
ED
486
487/* In order to protect privacy, we add a perturbation to identifiers
488 * if one generator is seldom used. This makes hard for an attacker
489 * to infer how many packets were sent between two points in time.
490 */
/* Reserve @segs consecutive IP IDs from the bucket selected by @hash
 * and return the first one.
 *
 * If the bucket was idle (timestamp differs from the current jiffy) a
 * random delta is added so an observer cannot infer how many packets
 * were sent in between; the cmpxchg ensures only one CPU applies it.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
	u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
	atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
	u32 old = READ_ONCE(*p_tstamp);
	u32 now = (u32)jiffies;
	u32 new, delta = 0;

	if (old != now && cmpxchg(p_tstamp, old, now) == old)
		delta = prandom_u32_max(now - old);

	/* Do not use atomic_add_return() as it makes UBSAN unhappy
	 * (the counter is expected to wrap around u32).
	 */
	do {
		old = (u32)atomic_read(p_id);
		new = old + delta + segs;
	} while (atomic_cmpxchg(p_id, old, new) != old);

	return new - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
1da177e4 511
/* Fill iph->id for a packet spanning @segs segments.
 * The ident bucket is chosen by hashing (daddr, saddr, protocol) with a
 * boot-time random key and the per-netns mix, so different flows (and
 * different namespaces) use different counters.
 */
void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
	static u32 ip_idents_hashrnd __read_mostly;
	u32 hash, id;

	net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));

	hash = jhash_3words((__force u32)iph->daddr,
			    (__force u32)iph->saddr,
			    iph->protocol ^ net_hash_mix(net),
			    ip_idents_hashrnd);
	id = ip_idents_reserve(hash, segs);
	iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);
1da177e4 527
e2d118a1
LC
/* Initialize @fl4 from an IP header plus per-packet metadata.
 * When a socket is given, its bound device, mark, TOS and protocol
 * override the packet-derived values, matching output-path semantics.
 */
static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
			     const struct sock *sk,
			     const struct iphdr *iph,
			     int oif, u8 tos,
			     u8 prot, u32 mark, int flow_flags)
{
	if (sk) {
		const struct inet_sock *inet = inet_sk(sk);

		oif = sk->sk_bound_dev_if;
		mark = sk->sk_mark;
		tos = RT_CONN_FLAGS(sk);
		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
	}
	flowi4_init_output(fl4, oif, mark, tos,
			   RT_SCOPE_UNIVERSE, prot,
			   flow_flags,
			   iph->daddr, iph->saddr, 0, 0,
			   sock_net_uid(net, sk));
}
548
5abf7f7e
ED
/* Build a flow key from a received skb (and optional socket). */
static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
			       const struct sock *sk)
{
	const struct net *net = dev_net(skb->dev);
	const struct iphdr *iph = ip_hdr(skb);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}
561
/* Build a flow key from a connected socket alone (no packet).
 * With a source-route option the first hop (faddr) replaces the
 * destination; inet_opt is read under RCU since it can change.
 */
static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;
	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
			   inet_sk_flowi_flags(sk),
			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
	rcu_read_unlock();
}
579
5abf7f7e
ED
/* Build a flow key from whichever context is available: prefer the
 * packet when one exists, otherwise derive everything from the socket.
 */
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (!skb) {
		build_sk_flow_key(fl4, sk);
		return;
	}
	build_skb_flow_key(fl4, skb, sk);
}
588
c5038a83 589static DEFINE_SPINLOCK(fnhe_lock);
4895c771 590
2ffae99d
TT
/* Drop the cached input and output routes attached to a next-hop
 * exception: clear the RCU pointers, then release the dst's device
 * reference and the dst itself.
 */
static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
	struct rtable *rt;

	rt = rcu_dereference(fnhe->fnhe_rth_input);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
	rt = rcu_dereference(fnhe->fnhe_rth_output);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
}
608
/* Pick the least-recently stamped exception in a bucket for reuse,
 * flushing its cached routes first.
 * Assumes the chain is non-empty: the only caller invokes this when
 * the bucket depth already exceeds FNHE_RECLAIM_DEPTH.
 */
static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
	struct fib_nh_exception *fnhe, *oldest;

	oldest = rcu_dereference(hash->chain);
	for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
			oldest = fnhe;
	}
	fnhe_flush_routes(oldest);
	return oldest;
}
622
d3a25c98
DM
/* Hash a destination address into an FNHE bucket index, keyed with a
 * boot-time random value so bucket placement is not predictable.
 */
static inline u32 fnhe_hashfun(__be32 daddr)
{
	static u32 fnhe_hashrnd __read_mostly;
	u32 hval;

	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
	hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
	return hash_32(hval, FNHE_HASH_SHIFT);
}
632
387aa65a
TT
/* Copy the exception's learned state (PMTU, expiry, redirect gateway)
 * into a cached route so subsequent lookups see it.
 */
static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
	rt->rt_pmtu = fnhe->fnhe_pmtu;
	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
	rt->dst.expires = fnhe->fnhe_expires;

	if (fnhe->fnhe_gw) {
		rt->rt_flags |= RTCF_REDIRECTED;
		rt->rt_gateway = fnhe->fnhe_gw;
		rt->rt_uses_gateway = 1;
	}
}
645
aee06da6 646static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
d52e5a7e 647 u32 pmtu, bool lock, unsigned long expires)
4895c771 648{
aee06da6 649 struct fnhe_hash_bucket *hash;
4895c771 650 struct fib_nh_exception *fnhe;
387aa65a 651 struct rtable *rt;
cebe84c6 652 u32 genid, hval;
387aa65a 653 unsigned int i;
4895c771 654 int depth;
cebe84c6
XL
655
656 genid = fnhe_genid(dev_net(nh->nh_dev));
657 hval = fnhe_hashfun(daddr);
aee06da6 658
c5038a83 659 spin_lock_bh(&fnhe_lock);
4895c771 660
caa41527 661 hash = rcu_dereference(nh->nh_exceptions);
4895c771 662 if (!hash) {
aee06da6 663 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
4895c771 664 if (!hash)
aee06da6 665 goto out_unlock;
caa41527 666 rcu_assign_pointer(nh->nh_exceptions, hash);
4895c771
DM
667 }
668
4895c771
DM
669 hash += hval;
670
671 depth = 0;
672 for (fnhe = rcu_dereference(hash->chain); fnhe;
673 fnhe = rcu_dereference(fnhe->fnhe_next)) {
674 if (fnhe->fnhe_daddr == daddr)
aee06da6 675 break;
4895c771
DM
676 depth++;
677 }
678
aee06da6 679 if (fnhe) {
cebe84c6
XL
680 if (fnhe->fnhe_genid != genid)
681 fnhe->fnhe_genid = genid;
aee06da6
JA
682 if (gw)
683 fnhe->fnhe_gw = gw;
d52e5a7e 684 if (pmtu) {
aee06da6 685 fnhe->fnhe_pmtu = pmtu;
d52e5a7e
SD
686 fnhe->fnhe_mtu_locked = lock;
687 }
e39d5246 688 fnhe->fnhe_expires = max(1UL, expires);
387aa65a 689 /* Update all cached dsts too */
2ffae99d
TT
690 rt = rcu_dereference(fnhe->fnhe_rth_input);
691 if (rt)
692 fill_route_from_fnhe(rt, fnhe);
693 rt = rcu_dereference(fnhe->fnhe_rth_output);
387aa65a
TT
694 if (rt)
695 fill_route_from_fnhe(rt, fnhe);
aee06da6
JA
696 } else {
697 if (depth > FNHE_RECLAIM_DEPTH)
698 fnhe = fnhe_oldest(hash);
699 else {
700 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
701 if (!fnhe)
702 goto out_unlock;
703
704 fnhe->fnhe_next = hash->chain;
705 rcu_assign_pointer(hash->chain, fnhe);
706 }
cebe84c6 707 fnhe->fnhe_genid = genid;
aee06da6
JA
708 fnhe->fnhe_daddr = daddr;
709 fnhe->fnhe_gw = gw;
710 fnhe->fnhe_pmtu = pmtu;
d52e5a7e 711 fnhe->fnhe_mtu_locked = lock;
94720e3a 712 fnhe->fnhe_expires = max(1UL, expires);
387aa65a
TT
713
714 /* Exception created; mark the cached routes for the nexthop
715 * stale, so anyone caching it rechecks if this exception
716 * applies to them.
717 */
2ffae99d
TT
718 rt = rcu_dereference(nh->nh_rth_input);
719 if (rt)
720 rt->dst.obsolete = DST_OBSOLETE_KILL;
721
387aa65a
TT
722 for_each_possible_cpu(i) {
723 struct rtable __rcu **prt;
724 prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
725 rt = rcu_dereference(*prt);
726 if (rt)
727 rt->dst.obsolete = DST_OBSOLETE_KILL;
728 }
4895c771 729 }
4895c771 730
4895c771 731 fnhe->fnhe_stamp = jiffies;
aee06da6
JA
732
733out_unlock:
c5038a83 734 spin_unlock_bh(&fnhe_lock);
4895c771
DM
735}
736
ceb33206
DM
/* Process an ICMP redirect for @rt.
 * The new gateway is validated (sane redirect code, route still via the
 * old gateway, not multicast/broadcast/zeronet, on-link unless shared
 * media) before an FNHE is recorded for the flow; @kill_route
 * additionally marks the current dst obsolete so it gets replaced.
 */
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
			     bool kill_route)
{
	__be32 new_gw = icmp_hdr(skb)->un.gateway;
	__be32 old_gw = ip_hdr(skb)->saddr;
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct fib_result res;
	struct neighbour *n;
	struct net *net;

	switch (icmp_hdr(skb)->code & 7) {
	case ICMP_REDIR_NET:
	case ICMP_REDIR_NETTOS:
	case ICMP_REDIR_HOST:
	case ICMP_REDIR_HOSTTOS:
		break;

	default:
		return;
	}

	/* Only the gateway we are currently using may redirect us. */
	if (rt->rt_gateway != old_gw)
		return;

	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
	if (!n)
		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
	if (!IS_ERR(n)) {
		if (!(n->nud_state & NUD_VALID)) {
			/* Kick resolution; the redirect is only honoured
			 * once the new gateway is reachable.
			 */
			neigh_event_send(n, NULL);
		} else {
			if (fib_lookup(net, fl4, &res, 0) == 0) {
				struct fib_nh *nh = &FIB_RES_NH(res);

				update_or_create_fnhe(nh, fl4->daddr, new_gw,
						0, false,
						jiffies + ip_rt_gc_timeout);
			}
			if (kill_route)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
		}
		neigh_release(n);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev)) {
		const struct iphdr *iph = (const struct iphdr *) skb->data;
		__be32 daddr = iph->daddr;
		__be32 saddr = iph->saddr;

		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     " Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
	}
#endif
	;
}
819
4895c771
DM
/* dst_ops->redirect entry point: rebuild the flow key for the offending
 * packet and hand off to __ip_do_redirect(), killing the current route.
 */
static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct rtable *rt;
	struct flowi4 fl4;
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct net *net = dev_net(skb->dev);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	rt = (struct rtable *) dst;

	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
	__ip_do_redirect(rt, skb, &fl4, true);
}
836
1da177e4
LT
/* dst_ops->negative_advice: called when a user of the route saw poor
 * results. Drop the route (returning NULL) if it is already obsolete,
 * was installed by a redirect, or carries an expiry - forcing a fresh
 * lookup next time. Otherwise keep it.
 */
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->dst.expires) {
			ip_rt_put(rt);
			ret = NULL;
		}
	}
	return ret;
}
854
855/*
856 * Algorithm:
857 * 1. The first ip_rt_redirect_number redirects are sent
858 * with exponential backoff, then we stop sending them at all,
859 * assuming that the host ignores our redirects.
860 * 2. If we did not see packets requiring redirects
861 * during ip_rt_redirect_silence, we assume that the host
862 * forgot redirected route and start to send redirects again.
863 *
864 * This algorithm is much cheaper and more intelligent than dumb load limiting
865 * in icmp.c.
866 *
867 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
868 * and "frag. need" (breaks PMTU discovery) in icmp.c.
869 */
870
/* Send an ICMP redirect for @skb, rate-limited per source host via the
 * inet_peer cache (see the algorithm description above).
 *
 * NOTE(review): peer->rate_tokens/rate_last are the same fields that
 * ip_error() below uses for ICMP error rate limiting on the same peer,
 * so the two mechanisms can interfere - verify against upstream, which
 * later added a separate n_redirects counter for this.
 */
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	struct net *net;
	int log_martians;
	int vif;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
	rcu_read_unlock();

	net = dev_net(rt->dst.dev);
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
	if (!peer) {
		/* No peer entry: send unthrottled. */
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
			  rt_nexthop(rt, ip_hdr(skb)->daddr));
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
		peer->rate_tokens = 0;

	/* Too many ignored redirects; do not send anything
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->rate_tokens >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		goto out_put_peer;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->rate_tokens == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->rate_tokens)))) {
		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
		peer->rate_last = jiffies;
		++peer->rate_tokens;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->rate_tokens == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, inet_iif(skb),
					     &ip_hdr(skb)->daddr, &gw);
#endif
	}
out_put_peer:
	inet_putpeer(peer);
}
935
/* Handle a packet routed to an error dst: pick the matching ICMP
 * unreachable code from rt->dst.error, rate-limit per source host via
 * the inet_peer token bucket, send the ICMP error, and always consume
 * the skb. Returns 0 unconditionally.
 */
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct inet_peer *peer;
	unsigned long now;
	struct net *net;
	bool send;
	int code;

	/* On an l3mdev (VRF) device, account against the real ingress
	 * interface instead.
	 */
	if (netif_is_l3_master(skb->dev)) {
		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
		if (!dev)
			goto out;
	}

	in_dev = __in_dev_get_rcu(dev);

	/* IP on this device is disabled. */
	if (!in_dev)
		goto out;

	net = dev_net(rt->dst.dev);
	if (!IN_DEV_FORWARD(in_dev)) {
		/* Not forwarding: only bump SNMP counters, never send
		 * ICMP errors on behalf of a host-only interface.
		 */
		switch (rt->dst.error) {
		case EHOSTUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
			break;

		case ENETUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
			break;
		}
		goto out;
	}

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
			       l3mdev_master_ifindex(skb->dev), 1);

	/* Token bucket: tokens accrue with elapsed jiffies (capped at
	 * ip_rt_error_burst); each ICMP error costs ip_rt_error_cost.
	 */
	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
		inet_putpeer(peer);
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}
1da177e4 1011
/* Record a new path MTU for the destination described by @fl4 on route
 * @rt, by creating/updating a nexthop exception (fnhe) in the FIB.
 * MTUs below ip_rt_min_pmtu are clamped and the entry is "locked" so it
 * will not be raised again by later, larger reports.
 */
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct dst_entry *dst = &rt->dst;
	struct fib_result res;
	bool lock = false;

	/* MTU administratively locked: never override. */
	if (ip_mtu_locked(dst))
		return;

	/* Only ever shrink the effective MTU. */
	if (ipv4_mtu(dst) < mtu)
		return;

	if (mtu < ip_rt_min_pmtu) {
		lock = true;
		mtu = ip_rt_min_pmtu;
	}

	/* Same value and the entry is still comfortably fresh (more than
	 * half its lifetime left): skip the FIB lookup entirely.
	 */
	if (rt->rt_pmtu == mtu &&
	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
		return;

	rcu_read_lock();
	if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
		struct fib_nh *nh = &FIB_RES_NH(res);

		update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
				      jiffies + ip_rt_mtu_expires);
	}
	rcu_read_unlock();
}
1042
4895c771
DM
1043static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1044 struct sk_buff *skb, u32 mtu)
1045{
1046 struct rtable *rt = (struct rtable *) dst;
1047 struct flowi4 fl4;
1048
1049 ip_rt_build_flow_key(&fl4, sk, skb);
d851c12b 1050 __ip_rt_update_pmtu(rt, &fl4, mtu);
4895c771
DM
1051}
1052
36393395
DM
1053void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1054 int oif, u32 mark, u8 protocol, int flow_flags)
1055{
4895c771 1056 const struct iphdr *iph = (const struct iphdr *) skb->data;
36393395
DM
1057 struct flowi4 fl4;
1058 struct rtable *rt;
1059
1b3c61dc
LC
1060 if (!mark)
1061 mark = IP4_REPLY_MARK(net, skb->mark);
1062
e2d118a1 1063 __build_flow_key(net, &fl4, NULL, iph, oif,
4895c771 1064 RT_TOS(iph->tos), protocol, mark, flow_flags);
36393395
DM
1065 rt = __ip_route_output_key(net, &fl4);
1066 if (!IS_ERR(rt)) {
4895c771 1067 __ip_rt_update_pmtu(rt, &fl4, mtu);
36393395
DM
1068 ip_rt_put(rt);
1069 }
1070}
1071EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1072
9cb3a50c 1073static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
36393395 1074{
4895c771
DM
1075 const struct iphdr *iph = (const struct iphdr *) skb->data;
1076 struct flowi4 fl4;
1077 struct rtable *rt;
36393395 1078
e2d118a1 1079 __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1b3c61dc
LC
1080
1081 if (!fl4.flowi4_mark)
1082 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1083
4895c771
DM
1084 rt = __ip_route_output_key(sock_net(sk), &fl4);
1085 if (!IS_ERR(rt)) {
1086 __ip_rt_update_pmtu(rt, &fl4, mtu);
1087 ip_rt_put(rt);
1088 }
36393395 1089}
9cb3a50c
SK
1090
1091void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1092{
1093 const struct iphdr *iph = (const struct iphdr *) skb->data;
1094 struct flowi4 fl4;
1095 struct rtable *rt;
7f502361 1096 struct dst_entry *odst = NULL;
b44108db 1097 bool new = false;
e2d118a1 1098 struct net *net = sock_net(sk);
9cb3a50c
SK
1099
1100 bh_lock_sock(sk);
482fc609
HFS
1101
1102 if (!ip_sk_accept_pmtu(sk))
1103 goto out;
1104
7f502361 1105 odst = sk_dst_get(sk);
9cb3a50c 1106
7f502361 1107 if (sock_owned_by_user(sk) || !odst) {
9cb3a50c
SK
1108 __ipv4_sk_update_pmtu(skb, sk, mtu);
1109 goto out;
1110 }
1111
e2d118a1 1112 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
9cb3a50c 1113
7f502361 1114 rt = (struct rtable *)odst;
51456b29 1115 if (odst->obsolete && !odst->ops->check(odst, 0)) {
9cb3a50c
SK
1116 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1117 if (IS_ERR(rt))
1118 goto out;
b44108db
SK
1119
1120 new = true;
9cb3a50c
SK
1121 }
1122
0f6c480f 1123 __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
9cb3a50c 1124
7f502361 1125 if (!dst_check(&rt->dst, 0)) {
b44108db
SK
1126 if (new)
1127 dst_release(&rt->dst);
1128
9cb3a50c
SK
1129 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1130 if (IS_ERR(rt))
1131 goto out;
1132
b44108db 1133 new = true;
9cb3a50c
SK
1134 }
1135
b44108db 1136 if (new)
7f502361 1137 sk_dst_set(sk, &rt->dst);
9cb3a50c
SK
1138
1139out:
1140 bh_unlock_sock(sk);
7f502361 1141 dst_release(odst);
9cb3a50c 1142}
36393395 1143EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
f39925db 1144
b42597e2
DM
1145void ipv4_redirect(struct sk_buff *skb, struct net *net,
1146 int oif, u32 mark, u8 protocol, int flow_flags)
1147{
4895c771 1148 const struct iphdr *iph = (const struct iphdr *) skb->data;
b42597e2
DM
1149 struct flowi4 fl4;
1150 struct rtable *rt;
1151
e2d118a1 1152 __build_flow_key(net, &fl4, NULL, iph, oif,
4895c771 1153 RT_TOS(iph->tos), protocol, mark, flow_flags);
b42597e2
DM
1154 rt = __ip_route_output_key(net, &fl4);
1155 if (!IS_ERR(rt)) {
ceb33206 1156 __ip_do_redirect(rt, skb, &fl4, false);
b42597e2
DM
1157 ip_rt_put(rt);
1158 }
1159}
1160EXPORT_SYMBOL_GPL(ipv4_redirect);
1161
1162void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1163{
4895c771
DM
1164 const struct iphdr *iph = (const struct iphdr *) skb->data;
1165 struct flowi4 fl4;
1166 struct rtable *rt;
e2d118a1 1167 struct net *net = sock_net(sk);
b42597e2 1168
e2d118a1
LC
1169 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1170 rt = __ip_route_output_key(net, &fl4);
4895c771 1171 if (!IS_ERR(rt)) {
ceb33206 1172 __ip_do_redirect(rt, skb, &fl4, false);
4895c771
DM
1173 ip_rt_put(rt);
1174 }
b42597e2
DM
1175}
1176EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1177
efbc368d
DM
1178static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1179{
1180 struct rtable *rt = (struct rtable *) dst;
1181
ceb33206
DM
1182 /* All IPV4 dsts are created with ->obsolete set to the value
1183 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1184 * into this function always.
1185 *
387aa65a
TT
1186 * When a PMTU/redirect information update invalidates a route,
1187 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1188 * DST_OBSOLETE_DEAD by dst_free().
ceb33206 1189 */
387aa65a 1190 if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
efbc368d 1191 return NULL;
d11a4dc1 1192 return dst;
1da177e4
LT
1193}
1194
1da177e4
LT
/* dst_ops.link_failure hook: neighbour resolution for this route failed.
 * Report host-unreachable to the sender and force the cached route to
 * expire immediately so the next lookup re-resolves.
 *
 * NOTE(review): later upstream kernels recompile the skb's IP options
 * before icmp_send() and take rcu_read_lock() around the rtable access
 * here — verify whether those hardening fixes apply to this tree.
 */
static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);

	rt = skb_rtable(skb);
	if (rt)
		/* expires = 0: next dst_check() invalidates this dst. */
		dst_set_expires(&rt->dst, 0);
}
1205
ede2059d 1206static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1da177e4 1207{
91df42be
JP
1208 pr_debug("%s: %pI4 -> %pI4, %s\n",
1209 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1210 skb->dev ? skb->dev->name : "?");
1da177e4 1211 kfree_skb(skb);
c378a9c0 1212 WARN_ON(1);
1da177e4
LT
1213 return 0;
1214}
1215
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */
1224
/* Fill @addr (4 bytes, possibly unaligned) with the source address this
 * host would use on route @rt — used by the IP RR/TS/SRR option code.
 */
void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		/* Locally generated packet: its header already carries it. */
		src = ip_hdr(skb)->saddr;
	else {
		/* Forwarded packet: redo the FIB lookup to learn the
		 * preferred source, falling back to an address selection
		 * on the output device.
		 */
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev,
					       rt_nexthop(rt, iph->daddr),
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	/* memcpy, not assignment: "addr" may be unaligned (IP options). */
	memcpy(addr, &src, 4);
}
1257
c7066f70 1258#ifdef CONFIG_IP_ROUTE_CLASSID
1da177e4
LT
1259static void set_class_tag(struct rtable *rt, u32 tag)
1260{
d8d1f30b
CG
1261 if (!(rt->dst.tclassid & 0xFFFF))
1262 rt->dst.tclassid |= tag & 0xFFFF;
1263 if (!(rt->dst.tclassid & 0xFFFF0000))
1264 rt->dst.tclassid |= tag & 0xFFFF0000;
1da177e4
LT
1265}
1266#endif
1267
0dbaee3b
DM
1268static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1269{
7ed14d97 1270 unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
164a5e7a 1271 unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
7ed14d97 1272 ip_rt_min_advmss);
0dbaee3b 1273
7ed14d97 1274 return min(advmss, IPV4_MAX_PMTU - header_size);
0dbaee3b
DM
1275}
1276
/* Effective MTU for a route, in precedence order: unexpired learned
 * path MTU, explicit RTAX_MTU metric, then the device MTU (possibly
 * clamped for locked gatewayed routes), minus any lwtunnel headroom.
 */
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = rt->rt_pmtu;

	/* Learned PMTU only counts while unexpired. */
	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
		mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu)
		return mtu;

	/* READ_ONCE: the device MTU can change concurrently. */
	mtu = READ_ONCE(dst->dev->mtu);

	if (unlikely(ip_mtu_locked(dst))) {
		/* Historic clamp for locked routes via a gateway. */
		if (rt->rt_uses_gateway && mtu > 576)
			mtu = 576;
	}

	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
1299
94720e3a
JA
/* Remove the nexthop exception for @daddr from @nh's hash bucket, if
 * present.  Unlinks under fnhe_lock, flushes any routes cached on the
 * exception, and frees it after a grace period (kfree_rcu) so that
 * concurrent RCU readers in find_exception() stay safe.
 */
static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference_protected(nh->nh_exceptions,
					 lockdep_is_held(&fnhe_lock));
	hash += hval;

	/* Walk the chain keeping a pointer to the link to rewrite. */
	fnhe_p = &hash->chain;
	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
	while (fnhe) {
		if (fnhe->fnhe_daddr == daddr) {
			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
			fnhe_flush_routes(fnhe);
			kfree_rcu(fnhe, rcu);
			break;
		}
		fnhe_p = &fnhe->fnhe_next;
		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
						 lockdep_is_held(&fnhe_lock));
	}

	spin_unlock_bh(&fnhe_lock);
}
1329
/* Look up the nexthop exception for @daddr on @nh.  Must run inside an
 * RCU read-side section.  An entry whose expiry has passed is deleted
 * on the spot and treated as absent.  Returns NULL when no live entry
 * exists.
 */
static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
	struct fib_nh_exception *fnhe;
	u32 hval;

	if (!hash)
		return NULL;

	hval = fnhe_hashfun(daddr);

	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr) {
			/* Lazily reap an expired entry. */
			if (fnhe->fnhe_expires &&
			    time_after(jiffies, fnhe->fnhe_expires)) {
				ip_del_fnhe(nh, daddr);
				break;
			}
			return fnhe;
		}
	}
	return NULL;
}
aee06da6 1354
/* Attach route @rt to nexthop exception @fnhe (copying the exception's
 * gw/pmtu data into the route), and — when @do_cache — publish it as
 * the exception's cached input or output route.  Returns true iff the
 * route was cached.  All fnhe mutation happens under fnhe_lock.
 */
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
			      __be32 daddr, const bool do_cache)
{
	bool ret = false;

	spin_lock_bh(&fnhe_lock);

	/* Re-check under the lock: the exception may have been rebound
	 * to another destination meanwhile.
	 */
	if (daddr == fnhe->fnhe_daddr) {
		struct rtable __rcu **porig;
		struct rtable *orig;
		int genid = fnhe_genid(dev_net(rt->dst.dev));

		if (rt_is_input_route(rt))
			porig = &fnhe->fnhe_rth_input;
		else
			porig = &fnhe->fnhe_rth_output;
		orig = rcu_dereference(*porig);

		/* Stale generation: wipe all learned state first. */
		if (fnhe->fnhe_genid != genid) {
			fnhe->fnhe_genid = genid;
			fnhe->fnhe_gw = 0;
			fnhe->fnhe_pmtu = 0;
			fnhe->fnhe_expires = 0;
			fnhe->fnhe_mtu_locked = false;
			fnhe_flush_routes(fnhe);
			orig = NULL;
		}
		fill_route_from_fnhe(rt, fnhe);
		if (!rt->rt_gateway)
			rt->rt_gateway = daddr;

		if (do_cache) {
			/* Take a ref for the cache slot, then drop the
			 * displaced route's device and cache references.
			 */
			dst_hold(&rt->dst);
			rcu_assign_pointer(*porig, rt);
			if (orig) {
				dst_dev_put(&orig->dst);
				dst_release(&orig->dst);
			}
			ret = true;
		}

		fnhe->fnhe_stamp = jiffies;
	}
	spin_unlock_bh(&fnhe_lock);

	return ret;
}
1402
/* Publish @rt as the cached route on @nh: the shared input slot for
 * input routes, or this CPU's per-cpu output slot otherwise.  Lockless
 * via cmpxchg; on success the displaced route's references are dropped,
 * on a lost race our extra hold is released and false is returned.
 */
static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
	struct rtable *orig, *prev, **p;
	bool ret = true;

	if (rt_is_input_route(rt)) {
		p = (struct rtable **)&nh->nh_rth_input;
	} else {
		p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
	}
	orig = *p;

	/* hold dst before doing cmpxchg() to avoid race condition
	 * on this dst
	 */
	dst_hold(&rt->dst);
	prev = cmpxchg(p, orig, rt);
	if (prev == orig) {
		if (orig) {
			dst_dev_put(&orig->dst);
			dst_release(&orig->dst);
		}
	} else {
		/* Someone else installed a route first: undo our hold. */
		dst_release(&rt->dst);
		ret = false;
	}

	return ret;
}
1432
5055c371
ED
/* Per-cpu list of routes that are not cached on a FIB nexthop; walked
 * by rt_flush_dev() when a device goes away.
 */
struct uncached_list {
	spinlock_t lock;
	struct list_head head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
caacf05e 1439
510c321b 1440void rt_add_uncached_list(struct rtable *rt)
caacf05e 1441{
5055c371
ED
1442 struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1443
1444 rt->rt_uncached_list = ul;
1445
1446 spin_lock_bh(&ul->lock);
1447 list_add_tail(&rt->rt_uncached, &ul->head);
1448 spin_unlock_bh(&ul->lock);
caacf05e
DM
1449}
1450
510c321b 1451void rt_del_uncached_list(struct rtable *rt)
caacf05e 1452{
78df76a0 1453 if (!list_empty(&rt->rt_uncached)) {
5055c371
ED
1454 struct uncached_list *ul = rt->rt_uncached_list;
1455
1456 spin_lock_bh(&ul->lock);
caacf05e 1457 list_del(&rt->rt_uncached);
5055c371 1458 spin_unlock_bh(&ul->lock);
caacf05e
DM
1459 }
1460}
1461
510c321b
XL
/* dst_ops.destroy hook: drop our share of the (possibly FIB-shared)
 * metrics block and take the route off the uncached list.
 */
static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
	struct rtable *rt = (struct rtable *)dst;

	/* Refcounted metrics are freed with the last route using them;
	 * the static default block is never freed.
	 */
	if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
		kfree(p);

	rt_del_uncached_list(rt);
}
1472
caacf05e
DM
/* Device teardown: retarget every uncached route still pointing at @dev
 * to the netns loopback device, transferring the device reference, so
 * the real device can be unregistered safely.
 */
void rt_flush_dev(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct rtable *rt;
	int cpu;

	/* Uncached routes live on per-cpu lists: scan them all. */
	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt_uncached) {
			if (rt->dst.dev != dev)
				continue;
			rt->dst.dev = net->loopback_dev;
			/* Swap device refs: hold loopback, release @dev. */
			dev_hold(rt->dst.dev);
			dev_put(dev);
		}
		spin_unlock_bh(&ul->lock);
	}
}
1493
4331debc 1494static bool rt_cache_valid(const struct rtable *rt)
d2d68ba9 1495{
4331debc
ED
1496 return rt &&
1497 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1498 !rt_is_expired(rt);
d2d68ba9
DM
1499}
1500
/* Finish initializing route @rt from the FIB lookup result: gateway,
 * metrics, classid, lwtunnel state — then either cache it (on the fnhe
 * or the nexthop) or put it on the uncached list.
 */
static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
			   const struct fib_result *res,
			   struct fib_nh_exception *fnhe,
			   struct fib_info *fi, u16 type, u32 itag,
			   const bool do_cache)
{
	bool cached = false;

	if (fi) {
		struct fib_nh *nh = &FIB_RES_NH(*res);

		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
			rt->rt_gateway = nh->nh_gw;
			rt->rt_uses_gateway = 1;
		}
		/* Share the fib_info's metrics; take a ref unless it is
		 * the static default block.
		 */
		dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
		if (fi->fib_metrics != &dst_default_metrics) {
			rt->dst._metrics |= DST_METRICS_REFCOUNTED;
			refcount_inc(&fi->fib_metrics->refcnt);
		}
#ifdef CONFIG_IP_ROUTE_CLASSID
		rt->dst.tclassid = nh->nh_tclassid;
#endif
		rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
		if (unlikely(fnhe))
			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
		else if (do_cache)
			cached = rt_cache_route(nh, rt);
		if (unlikely(!cached)) {
			/* Routes we intend to cache in nexthop exception or
			 * FIB nexthop have the DST_NOCACHE bit clear.
			 * However, if we are unsuccessful at storing this
			 * route into the cache we really need to set it.
			 */
			if (!rt->rt_gateway)
				rt->rt_gateway = daddr;
			rt_add_uncached_list(rt);
		}
	} else
		rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, res->tclassid);
#endif
	set_class_tag(rt, itag);
#endif
}
1549
9ab179d8
DA
/* Allocate and zero-initialize an IPv4 rtable dst for @dev.  Flags map
 * caller intent onto dst flags; routes that will be cached skip
 * DST_HOST.  Output defaults to ip_output; RTCF_LOCAL routes deliver
 * locally on input.  Returns NULL on allocation failure.
 */
struct rtable *rt_dst_alloc(struct net_device *dev,
			    unsigned int flags, u16 type,
			    bool nopolicy, bool noxfrm, bool will_cache)
{
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
		       (will_cache ? 0 : DST_HOST) |
		       (nopolicy ? DST_NOPOLICY : 0) |
		       (noxfrm ? DST_NOXFRM : 0));

	if (rt) {
		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
		rt->rt_flags = flags;
		rt->rt_type = type;
		rt->rt_is_input = 0;
		rt->rt_iif = 0;
		rt->rt_pmtu = 0;
		rt->rt_mtu_locked = 0;
		rt->rt_gateway = 0;
		rt->rt_uses_gateway = 0;
		INIT_LIST_HEAD(&rt->rt_uncached);

		rt->dst.output = ip_output;
		if (flags & RTCF_LOCAL)
			rt->dst.input = ip_local_deliver;
	}

	return rt;
}
EXPORT_SYMBOL(rt_dst_alloc);
0c4dcd58 1581
/* called in rcu_read_lock() section */
/* Validate the source address of a multicast packet received on @dev.
 * Returns 0 when acceptable (filling *itag from the FIB validation),
 * -EINVAL for martian sources, or the fib_validate_source() error.
 */
int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev,
			  struct in_device *in_dev, u32 *itag)
{
	int err;

	/* Primary sanity checks. */
	if (!in_dev)
		return -EINVAL;

	/* Source must not itself be multicast/broadcast, and only real
	 * IP frames qualify.
	 */
	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		return -EINVAL;

	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
		return -EINVAL;

	if (ipv4_is_zeronet(saddr)) {
		/* 0.0.0.0 sources are only valid for link-local groups
		 * (e.g. IGMP from hosts still acquiring an address).
		 */
		if (!ipv4_is_local_multicast(daddr))
			return -EINVAL;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, itag);
		if (err < 0)
			return err;
	}
	return 0;
}
1611
/* called in rcu_read_lock() section */
/* Build the input route for a multicast packet: validate the source,
 * allocate an RTCF_MULTICAST (plus RTCF_LOCAL when @our) route and
 * attach it to the skb.  Returns 0 or a negative errno.
 */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	unsigned int flags = RTCF_MULTICAST;
	struct rtable *rth;
	u32 itag = 0;
	int err;

	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
	if (err)
		return err;

	if (our)
		flags |= RTCF_LOCAL;

	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
	if (!rth)
		return -ENOBUFS;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	/* Multicast input routes must never be used for output. */
	rth->dst.output = ip_rt_bug;
	rth->rt_is_input= 1;

#ifdef CONFIG_IP_MROUTE
	/* Non-link-local groups go through the multicast router. */
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	skb_dst_set(skb, &rth->dst);
	return 0;
}
1649
1650
/* Account and (when martian logging is enabled) report a packet whose
 * source address failed validation, dumping the MAC header as the only
 * available hint per RFC 1812.
 */
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 * RFC1812 recommendation, if source is martian,
		 * the only hint is MAC header.
		 */
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, true);
		}
	}
#endif
}
1675
/* called in rcu_read_lock() section */
/* Build (or reuse from cache) the forwarding route for an input packet
 * whose FIB lookup resolved to @res.  On success the dst is attached to
 * the skb and 0 is returned; otherwise a negative errno.
 */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos)
{
	struct fib_nh_exception *fnhe;
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	bool do_cache;
	u32 itag = 0;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (!out_dev) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	/* Only cache when the FIB gave us a nexthop and the source needs
	 * no per-packet class tag.
	 */
	do_cache = res->fi && !itag;
	/* Hairpin forwarding on shared media: mark for an ICMP redirect
	 * (err > 0 here means fib_validate_source flagged it).
	 */
	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
	    skb->protocol == htons(ETH_P_IP) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		IPCB(skb)->flags |= IPSKB_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * Proxy arp feature have been extended to allow, ARP
		 * replies back to the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	/* Prefer a still-valid cached route (exception or nexthop). */
	fnhe = find_exception(&FIB_RES_NH(*res), daddr);
	if (do_cache) {
		if (fnhe)
			rth = rcu_dereference(fnhe->fnhe_rth_input);
		else
			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			goto out;
		}
	}

	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_is_input = 1;
	RT_CACHE_STAT_INC(in_slow_tot);

	rth->dst.input = ip_forward;

	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
		       do_cache);
	lwtunnel_set_redirect(&rth->dst);
	skb_dst_set(skb, &rth->dst);
out:
	err = 0;
 cleanup:
	return err;
}
1da177e4 1761
79a13159 1762#ifdef CONFIG_IP_ROUTE_MULTIPATH
/* To make ICMP packets follow the right flow, the multipath hash is
 * calculated from the inner IP addresses.
 */
static void ip_multipath_l3_keys(const struct sk_buff *skb,
				 struct flow_keys *hash_keys)
{
	const struct iphdr *outer_iph = ip_hdr(skb);
	/* Default to hashing the outer header's addresses. */
	const struct iphdr *key_iph = outer_iph;
	const struct iphdr *inner_iph;
	const struct icmphdr *icmph;
	struct iphdr _inner_iph;
	struct icmphdr _icmph;

	if (likely(outer_iph->protocol != IPPROTO_ICMP))
		goto out;

	/* Non-first fragments carry no ICMP header to inspect. */
	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
		goto out;

	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
				   &_icmph);
	if (!icmph)
		goto out;

	/* Only error messages embed the offending packet's header. */
	if (icmph->type != ICMP_DEST_UNREACH &&
	    icmph->type != ICMP_REDIRECT &&
	    icmph->type != ICMP_TIME_EXCEEDED &&
	    icmph->type != ICMP_PARAMETERPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       outer_iph->ihl * 4 + sizeof(_icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	/* Hash on the embedded (original) packet's addresses instead. */
	key_iph = inner_iph;
out:
	hash_keys->addrs.v4addrs.src = key_iph->saddr;
	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
}
79a13159 1804
bf4e0a3d 1805/* if skb is set it will be used and fl4 can be NULL */
7efc0b6b 1806int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
e37b1e97 1807 const struct sk_buff *skb, struct flow_keys *flkeys)
bf4e0a3d 1808{
bf4e0a3d
NA
1809 struct flow_keys hash_keys;
1810 u32 mhash;
79a13159 1811
bf4e0a3d
NA
1812 switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1813 case 0:
1814 memset(&hash_keys, 0, sizeof(hash_keys));
1815 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1816 if (skb) {
1817 ip_multipath_l3_keys(skb, &hash_keys);
1818 } else {
1819 hash_keys.addrs.v4addrs.src = fl4->saddr;
1820 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1821 }
1822 break;
1823 case 1:
1824 /* skb is currently provided only when forwarding */
1825 if (skb) {
1826 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1827 struct flow_keys keys;
1828
1829 /* short-circuit if we already have L4 hash present */
1830 if (skb->l4_hash)
1831 return skb_get_hash_raw(skb) >> 1;
ec7127a5 1832
bf4e0a3d 1833 memset(&hash_keys, 0, sizeof(hash_keys));
1fe4b118 1834
ec7127a5 1835 if (!flkeys) {
e37b1e97 1836 skb_flow_dissect_flow_keys(skb, &keys, flag);
ec7127a5 1837 flkeys = &keys;
e37b1e97 1838 }
ec7127a5
DA
1839
1840 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1841 hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1842 hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1843 hash_keys.ports.src = flkeys->ports.src;
1844 hash_keys.ports.dst = flkeys->ports.dst;
1845 hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
bf4e0a3d
NA
1846 } else {
1847 memset(&hash_keys, 0, sizeof(hash_keys));
1848 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1849 hash_keys.addrs.v4addrs.src = fl4->saddr;
1850 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1851 hash_keys.ports.src = fl4->fl4_sport;
1852 hash_keys.ports.dst = fl4->fl4_dport;
1853 hash_keys.basic.ip_proto = fl4->flowi4_proto;
1854 }
1855 break;
1856 }
1857 mhash = flow_hash_from_keys(&hash_keys);
79a13159 1858
bf4e0a3d
NA
1859 return mhash >> 1;
1860}
79a13159
PN
1861#endif /* CONFIG_IP_ROUTE_MULTIPATH */
1862
5969f71d
SH
/* Select the nexthop (multipath-aware) for the FIB result and build the
 * input route via __mkroute_input().
 */
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos,
			    struct flow_keys *hkeys)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1) {
		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);

		fib_select_multipath(res, h);
	}
#endif

	/* create a routing cache entry */
	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}
1880
1da177e4
LT
1881/*
1882 * NOTE. We drop all the packets that has local source
1883 * addresses, because every properly looped back packet
1884 * must have correct destination already attached by output routine.
1885 *
1886 * Such approach solves two big problems:
1887 * 1. Not simplex devices are handled properly.
1888 * 2. IP spoofing attempts are filtered with 100% of guarantee.
ebc0ffae 1889 * called with rcu_read_lock()
1da177e4
LT
1890 */
1891
9e12bb22 1892static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
5510cdf7
DA
1893 u8 tos, struct net_device *dev,
1894 struct fib_result *res)
1da177e4 1895{
96d36220 1896 struct in_device *in_dev = __in_dev_get_rcu(dev);
e37b1e97
RP
1897 struct flow_keys *flkeys = NULL, _flkeys;
1898 struct net *net = dev_net(dev);
1b7179d3 1899 struct ip_tunnel_info *tun_info;
e37b1e97 1900 int err = -EINVAL;
95c96174 1901 unsigned int flags = 0;
1da177e4 1902 u32 itag = 0;
95c96174 1903 struct rtable *rth;
e37b1e97 1904 struct flowi4 fl4;
d2d68ba9 1905 bool do_cache;
1da177e4
LT
1906
1907 /* IP on this device is disabled. */
1908
1909 if (!in_dev)
1910 goto out;
1911
1912 /* Check for the most weird martians, which can be not detected
1913 by fib_lookup.
1914 */
1915
61adedf3 1916 tun_info = skb_tunnel_info(skb);
46fa062a 1917 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1b7179d3
TG
1918 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1919 else
1920 fl4.flowi4_tun_key.tun_id = 0;
f38a9eb1
TG
1921 skb_dst_drop(skb);
1922
d0daebc3 1923 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1da177e4
LT
1924 goto martian_source;
1925
5510cdf7
DA
1926 res->fi = NULL;
1927 res->table = NULL;
27a954bd 1928 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1da177e4
LT
1929 goto brd_input;
1930
1931 /* Accept zero addresses only to limited broadcast;
1932 * I even do not know to fix it or not. Waiting for complains :-)
1933 */
f97c1e0c 1934 if (ipv4_is_zeronet(saddr))
1da177e4
LT
1935 goto martian_source;
1936
d0daebc3 1937 if (ipv4_is_zeronet(daddr))
1da177e4
LT
1938 goto martian_destination;
1939
9eb43e76
ED
1940 /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1941 * and call it once if daddr or/and saddr are loopback addresses
1942 */
1943 if (ipv4_is_loopback(daddr)) {
1944 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
d0daebc3 1945 goto martian_destination;
9eb43e76
ED
1946 } else if (ipv4_is_loopback(saddr)) {
1947 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
d0daebc3
TG
1948 goto martian_source;
1949 }
1950
1da177e4
LT
1951 /*
1952 * Now we are ready to route packet.
1953 */
68a5e3dd 1954 fl4.flowi4_oif = 0;
e0d56fdd 1955 fl4.flowi4_iif = dev->ifindex;
68a5e3dd
DM
1956 fl4.flowi4_mark = skb->mark;
1957 fl4.flowi4_tos = tos;
1958 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
b84f7878 1959 fl4.flowi4_flags = 0;
68a5e3dd
DM
1960 fl4.daddr = daddr;
1961 fl4.saddr = saddr;
8bcfd092 1962 fl4.flowi4_uid = sock_net_uid(net, NULL);
e37b1e97
RP
1963
1964 if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys))
1965 flkeys = &_flkeys;
1966
5510cdf7 1967 err = fib_lookup(net, &fl4, res, 0);
cd0f0b95
DJ
1968 if (err != 0) {
1969 if (!IN_DEV_FORWARD(in_dev))
1970 err = -EHOSTUNREACH;
1da177e4 1971 goto no_route;
cd0f0b95 1972 }
1da177e4 1973
5510cdf7 1974 if (res->type == RTN_BROADCAST)
1da177e4
LT
1975 goto brd_input;
1976
5510cdf7 1977 if (res->type == RTN_LOCAL) {
5c04c819 1978 err = fib_validate_source(skb, saddr, daddr, tos,
0d5edc68 1979 0, dev, in_dev, &itag);
b5f7e755 1980 if (err < 0)
0d753960 1981 goto martian_source;
1da177e4
LT
1982 goto local_input;
1983 }
1984
cd0f0b95
DJ
1985 if (!IN_DEV_FORWARD(in_dev)) {
1986 err = -EHOSTUNREACH;
251da413 1987 goto no_route;
cd0f0b95 1988 }
5510cdf7 1989 if (res->type != RTN_UNICAST)
1da177e4
LT
1990 goto martian_destination;
1991
e37b1e97 1992 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
1da177e4
LT
1993out: return err;
1994
1995brd_input:
1996 if (skb->protocol != htons(ETH_P_IP))
1997 goto e_inval;
1998
41347dcd 1999 if (!ipv4_is_zeronet(saddr)) {
9e56e380
DM
2000 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2001 in_dev, &itag);
1da177e4 2002 if (err < 0)
0d753960 2003 goto martian_source;
1da177e4
LT
2004 }
2005 flags |= RTCF_BROADCAST;
5510cdf7 2006 res->type = RTN_BROADCAST;
1da177e4
LT
2007 RT_CACHE_STAT_INC(in_brd);
2008
2009local_input:
d2d68ba9 2010 do_cache = false;
5510cdf7 2011 if (res->fi) {
fe3edf45 2012 if (!itag) {
5510cdf7 2013 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
d2d68ba9 2014 if (rt_cache_valid(rth)) {
c6cffba4
DM
2015 skb_dst_set_noref(skb, &rth->dst);
2016 err = 0;
2017 goto out;
d2d68ba9
DM
2018 }
2019 do_cache = true;
2020 }
2021 }
2022
f5a0aab8 2023 rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
5510cdf7 2024 flags | RTCF_LOCAL, res->type,
d2d68ba9 2025 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1da177e4
LT
2026 if (!rth)
2027 goto e_nobufs;
2028
d8d1f30b 2029 rth->dst.output= ip_rt_bug;
cf911662
DM
2030#ifdef CONFIG_IP_ROUTE_CLASSID
2031 rth->dst.tclassid = itag;
2032#endif
9917e1e8 2033 rth->rt_is_input = 1;
571e7226 2034
a6254864 2035 RT_CACHE_STAT_INC(in_slow_tot);
5510cdf7 2036 if (res->type == RTN_UNREACHABLE) {
d8d1f30b
CG
2037 rth->dst.input= ip_error;
2038 rth->dst.error= -err;
1da177e4
LT
2039 rth->rt_flags &= ~RTCF_LOCAL;
2040 }
efd85700 2041
dcdfdf56 2042 if (do_cache) {
5510cdf7 2043 struct fib_nh *nh = &FIB_RES_NH(*res);
efd85700
TG
2044
2045 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2046 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2047 WARN_ON(rth->dst.input == lwtunnel_input);
2048 rth->dst.lwtstate->orig_input = rth->dst.input;
2049 rth->dst.input = lwtunnel_input;
2050 }
2051
a4c2fd7f 2052 if (unlikely(!rt_cache_route(nh, rth)))
dcdfdf56 2053 rt_add_uncached_list(rth);
dcdfdf56 2054 }
89aef892 2055 skb_dst_set(skb, &rth->dst);
b23dd4fe 2056 err = 0;
ebc0ffae 2057 goto out;
1da177e4
LT
2058
2059no_route:
2060 RT_CACHE_STAT_INC(in_no_route);
5510cdf7
DA
2061 res->type = RTN_UNREACHABLE;
2062 res->fi = NULL;
2063 res->table = NULL;
1da177e4
LT
2064 goto local_input;
2065
2066 /*
2067 * Do not cache martian addresses: they should be logged (RFC1812)
2068 */
2069martian_destination:
2070 RT_CACHE_STAT_INC(in_martian_dst);
2071#ifdef CONFIG_IP_ROUTE_VERBOSE
e87cc472
JP
2072 if (IN_DEV_LOG_MARTIANS(in_dev))
2073 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2074 &daddr, &saddr, dev->name);
1da177e4 2075#endif
2c2910a4 2076
1da177e4
LT
2077e_inval:
2078 err = -EINVAL;
ebc0ffae 2079 goto out;
1da177e4
LT
2080
2081e_nobufs:
2082 err = -ENOBUFS;
ebc0ffae 2083 goto out;
1da177e4
LT
2084
2085martian_source:
2086 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
ebc0ffae 2087 goto out;
1da177e4
LT
2088}
2089
c6cffba4
DM
2090int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2091 u8 tos, struct net_device *dev)
1da177e4 2092{
5510cdf7
DA
2093 struct fib_result res;
2094 int err;
1da177e4 2095
6e28099d 2096 tos &= IPTOS_RT_MASK;
96d36220 2097 rcu_read_lock();
5510cdf7
DA
2098 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2099 rcu_read_unlock();
96d36220 2100
5510cdf7
DA
2101 return err;
2102}
2103EXPORT_SYMBOL(ip_route_input_noref);
2104
2105/* called with rcu_read_lock held */
2106int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2107 u8 tos, struct net_device *dev, struct fib_result *res)
2108{
1da177e4
LT
2109 /* Multicast recognition logic is moved from route cache to here.
2110 The problem was that too many Ethernet cards have broken/missing
2111 hardware multicast filters :-( As result the host on multicasting
2112 network acquires a lot of useless route cache entries, sort of
2113 SDR messages from all the world. Now we try to get rid of them.
2114 Really, provided software IP multicast filter is organized
2115 reasonably (at least, hashed), it does not result in a slowdown
2116 comparing with route cache reject entries.
2117 Note, that multicast routers are not affected, because
2118 route cache entry is created eventually.
2119 */
f97c1e0c 2120 if (ipv4_is_multicast(daddr)) {
96d36220 2121 struct in_device *in_dev = __in_dev_get_rcu(dev);
e58e4159 2122 int our = 0;
5510cdf7 2123 int err = -EINVAL;
1da177e4 2124
e58e4159
DA
2125 if (in_dev)
2126 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2127 ip_hdr(skb)->protocol);
2128
2129 /* check l3 master if no match yet */
2130 if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
2131 struct in_device *l3_in_dev;
2132
2133 l3_in_dev = __in_dev_get_rcu(skb->dev);
2134 if (l3_in_dev)
2135 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2136 ip_hdr(skb)->protocol);
2137 }
2138
e58e4159 2139 if (our
1da177e4 2140#ifdef CONFIG_IP_MROUTE
e58e4159
DA
2141 ||
2142 (!ipv4_is_local_multicast(daddr) &&
2143 IN_DEV_MFORWARD(in_dev))
1da177e4 2144#endif
e58e4159 2145 ) {
5510cdf7 2146 err = ip_route_input_mc(skb, daddr, saddr,
e58e4159 2147 tos, dev, our);
1da177e4 2148 }
5510cdf7 2149 return err;
1da177e4 2150 }
5510cdf7
DA
2151
2152 return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
1da177e4
LT
2153}
2154
ebc0ffae 2155/* called with rcu_read_lock() */
982721f3 2156static struct rtable *__mkroute_output(const struct fib_result *res,
1a00fee4 2157 const struct flowi4 *fl4, int orig_oif,
f61759e6 2158 struct net_device *dev_out,
5ada5527 2159 unsigned int flags)
1da177e4 2160{
982721f3 2161 struct fib_info *fi = res->fi;
f2bb4bed 2162 struct fib_nh_exception *fnhe;
5ada5527 2163 struct in_device *in_dev;
982721f3 2164 u16 type = res->type;
5ada5527 2165 struct rtable *rth;
c92b9655 2166 bool do_cache;
1da177e4 2167
d0daebc3
TG
2168 in_dev = __in_dev_get_rcu(dev_out);
2169 if (!in_dev)
5ada5527 2170 return ERR_PTR(-EINVAL);
1da177e4 2171
d0daebc3 2172 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
5f02ce24
DA
2173 if (ipv4_is_loopback(fl4->saddr) &&
2174 !(dev_out->flags & IFF_LOOPBACK) &&
2175 !netif_is_l3_master(dev_out))
d0daebc3
TG
2176 return ERR_PTR(-EINVAL);
2177
68a5e3dd 2178 if (ipv4_is_lbcast(fl4->daddr))
982721f3 2179 type = RTN_BROADCAST;
68a5e3dd 2180 else if (ipv4_is_multicast(fl4->daddr))
982721f3 2181 type = RTN_MULTICAST;
68a5e3dd 2182 else if (ipv4_is_zeronet(fl4->daddr))
5ada5527 2183 return ERR_PTR(-EINVAL);
1da177e4
LT
2184
2185 if (dev_out->flags & IFF_LOOPBACK)
2186 flags |= RTCF_LOCAL;
2187
63617421 2188 do_cache = true;
982721f3 2189 if (type == RTN_BROADCAST) {
1da177e4 2190 flags |= RTCF_BROADCAST | RTCF_LOCAL;
982721f3
DM
2191 fi = NULL;
2192 } else if (type == RTN_MULTICAST) {
dd28d1a0 2193 flags |= RTCF_MULTICAST | RTCF_LOCAL;
813b3b5d
DM
2194 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2195 fl4->flowi4_proto))
1da177e4 2196 flags &= ~RTCF_LOCAL;
63617421
JA
2197 else
2198 do_cache = false;
1da177e4 2199 /* If multicast route do not exist use
dd28d1a0
ED
2200 * default one, but do not gateway in this case.
2201 * Yes, it is hack.
1da177e4 2202 */
982721f3
DM
2203 if (fi && res->prefixlen < 4)
2204 fi = NULL;
d6d5e999
CF
2205 } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2206 (orig_oif != dev_out->ifindex)) {
2207 /* For local routes that require a particular output interface
2208 * we do not want to cache the result. Caching the result
2209 * causes incorrect behaviour when there are multiple source
2210 * addresses on the interface, the end result being that if the
2211 * intended recipient is waiting on that interface for the
2212 * packet he won't receive it because it will be delivered on
2213 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2214 * be set to the loopback interface as well.
2215 */
94720e3a 2216 do_cache = false;
1da177e4
LT
2217 }
2218
f2bb4bed 2219 fnhe = NULL;
63617421 2220 do_cache &= fi != NULL;
94720e3a 2221 if (fi) {
c5038a83 2222 struct rtable __rcu **prth;
c92b9655 2223 struct fib_nh *nh = &FIB_RES_NH(*res);
d26b3a7c 2224
c92b9655 2225 fnhe = find_exception(nh, fl4->daddr);
94720e3a
JA
2226 if (!do_cache)
2227 goto add;
deed49df 2228 if (fnhe) {
2ffae99d 2229 prth = &fnhe->fnhe_rth_output;
94720e3a
JA
2230 } else {
2231 if (unlikely(fl4->flowi4_flags &
2232 FLOWI_FLAG_KNOWN_NH &&
2233 !(nh->nh_gw &&
2234 nh->nh_scope == RT_SCOPE_LINK))) {
2235 do_cache = false;
2236 goto add;
c92b9655 2237 }
94720e3a 2238 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
c92b9655 2239 }
c5038a83 2240 rth = rcu_dereference(*prth);
9df16efa 2241 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
c5038a83 2242 return rth;
f2bb4bed 2243 }
c92b9655
JA
2244
2245add:
d08c4f35 2246 rth = rt_dst_alloc(dev_out, flags, type,
5c1e6aa3 2247 IN_DEV_CONF_GET(in_dev, NOPOLICY),
f2bb4bed 2248 IN_DEV_CONF_GET(in_dev, NOXFRM),
c92b9655 2249 do_cache);
8391d07b 2250 if (!rth)
5ada5527 2251 return ERR_PTR(-ENOBUFS);
8391d07b 2252
9438c871 2253 rth->rt_iif = orig_oif;
b7503e0c 2254
1da177e4
LT
2255 RT_CACHE_STAT_INC(out_slow_tot);
2256
1da177e4 2257 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
e905a9ed 2258 if (flags & RTCF_LOCAL &&
1da177e4 2259 !(dev_out->flags & IFF_LOOPBACK)) {
d8d1f30b 2260 rth->dst.output = ip_mc_output;
1da177e4
LT
2261 RT_CACHE_STAT_INC(out_slow_mc);
2262 }
2263#ifdef CONFIG_IP_MROUTE
982721f3 2264 if (type == RTN_MULTICAST) {
1da177e4 2265 if (IN_DEV_MFORWARD(in_dev) &&
813b3b5d 2266 !ipv4_is_local_multicast(fl4->daddr)) {
d8d1f30b
CG
2267 rth->dst.input = ip_mr_input;
2268 rth->dst.output = ip_mc_output;
1da177e4
LT
2269 }
2270 }
2271#endif
2272 }
2273
a4c2fd7f 2274 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
9942895b 2275 lwtunnel_set_redirect(&rth->dst);
1da177e4 2276
5ada5527 2277 return rth;
1da177e4
LT
2278}
2279
1da177e4
LT
2280/*
2281 * Major route resolver routine.
2282 */
2283
3abd1ade
DA
2284struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2285 const struct sk_buff *skb)
1da177e4 2286{
f61759e6 2287 __u8 tos = RT_FL_TOS(fl4);
d0ea2b12
ED
2288 struct fib_result res = {
2289 .type = RTN_UNSPEC,
2290 .fi = NULL,
2291 .table = NULL,
2292 .tclassid = 0,
2293 };
5ada5527 2294 struct rtable *rth;
1da177e4 2295
1fb9489b 2296 fl4->flowi4_iif = LOOPBACK_IFINDEX;
813b3b5d
DM
2297 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2298 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2299 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
44713b67 2300
010c2708 2301 rcu_read_lock();
3abd1ade
DA
2302 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2303 rcu_read_unlock();
2304
2305 return rth;
2306}
2307EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2308
2309struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2310 struct fib_result *res,
2311 const struct sk_buff *skb)
2312{
2313 struct net_device *dev_out = NULL;
2314 int orig_oif = fl4->flowi4_oif;
2315 unsigned int flags = 0;
2316 struct rtable *rth;
2317 int err = -ENETUNREACH;
2318
813b3b5d 2319 if (fl4->saddr) {
b23dd4fe 2320 rth = ERR_PTR(-EINVAL);
813b3b5d
DM
2321 if (ipv4_is_multicast(fl4->saddr) ||
2322 ipv4_is_lbcast(fl4->saddr) ||
2323 ipv4_is_zeronet(fl4->saddr))
1da177e4
LT
2324 goto out;
2325
1da177e4
LT
2326 /* I removed check for oif == dev_out->oif here.
2327 It was wrong for two reasons:
1ab35276
DL
2328 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2329 is assigned to multiple interfaces.
1da177e4
LT
2330 2. Moreover, we are allowed to send packets with saddr
2331 of another iface. --ANK
2332 */
2333
813b3b5d
DM
2334 if (fl4->flowi4_oif == 0 &&
2335 (ipv4_is_multicast(fl4->daddr) ||
2336 ipv4_is_lbcast(fl4->daddr))) {
a210d01a 2337 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 2338 dev_out = __ip_dev_find(net, fl4->saddr, false);
51456b29 2339 if (!dev_out)
a210d01a
JA
2340 goto out;
2341
1da177e4
LT
2342 /* Special hack: user can direct multicasts
2343 and limited broadcast via necessary interface
2344 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2345 This hack is not just for fun, it allows
2346 vic,vat and friends to work.
2347 They bind socket to loopback, set ttl to zero
2348 and expect that it will work.
2349 From the viewpoint of routing cache they are broken,
2350 because we are not allowed to build multicast path
2351 with loopback source addr (look, routing cache
2352 cannot know, that ttl is zero, so that packet
2353 will not leave this host and route is valid).
2354 Luckily, this hack is good workaround.
2355 */
2356
813b3b5d 2357 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2358 goto make_route;
2359 }
a210d01a 2360
813b3b5d 2361 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
a210d01a 2362 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
813b3b5d 2363 if (!__ip_dev_find(net, fl4->saddr, false))
a210d01a 2364 goto out;
a210d01a 2365 }
1da177e4
LT
2366 }
2367
2368
813b3b5d
DM
2369 if (fl4->flowi4_oif) {
2370 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
b23dd4fe 2371 rth = ERR_PTR(-ENODEV);
51456b29 2372 if (!dev_out)
1da177e4 2373 goto out;
e5ed6399
HX
2374
2375 /* RACE: Check return value of inet_select_addr instead. */
fc75fc83 2376 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
b23dd4fe 2377 rth = ERR_PTR(-ENETUNREACH);
fc75fc83
ED
2378 goto out;
2379 }
813b3b5d 2380 if (ipv4_is_local_multicast(fl4->daddr) ||
6a211654
AL
2381 ipv4_is_lbcast(fl4->daddr) ||
2382 fl4->flowi4_proto == IPPROTO_IGMP) {
813b3b5d
DM
2383 if (!fl4->saddr)
2384 fl4->saddr = inet_select_addr(dev_out, 0,
2385 RT_SCOPE_LINK);
1da177e4
LT
2386 goto make_route;
2387 }
0a7e2260 2388 if (!fl4->saddr) {
813b3b5d
DM
2389 if (ipv4_is_multicast(fl4->daddr))
2390 fl4->saddr = inet_select_addr(dev_out, 0,
2391 fl4->flowi4_scope);
2392 else if (!fl4->daddr)
2393 fl4->saddr = inet_select_addr(dev_out, 0,
2394 RT_SCOPE_HOST);
1da177e4
LT
2395 }
2396 }
2397
813b3b5d
DM
2398 if (!fl4->daddr) {
2399 fl4->daddr = fl4->saddr;
2400 if (!fl4->daddr)
2401 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
b40afd0e 2402 dev_out = net->loopback_dev;
1fb9489b 2403 fl4->flowi4_oif = LOOPBACK_IFINDEX;
3abd1ade 2404 res->type = RTN_LOCAL;
1da177e4
LT
2405 flags |= RTCF_LOCAL;
2406 goto make_route;
2407 }
2408
3abd1ade 2409 err = fib_lookup(net, fl4, res, 0);
0315e382 2410 if (err) {
3abd1ade
DA
2411 res->fi = NULL;
2412 res->table = NULL;
6104e112 2413 if (fl4->flowi4_oif &&
e58e4159
DA
2414 (ipv4_is_multicast(fl4->daddr) ||
2415 !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
1da177e4
LT
2416 /* Apparently, routing tables are wrong. Assume,
2417 that the destination is on link.
2418
2419 WHY? DW.
2420 Because we are allowed to send to iface
2421 even if it has NO routes and NO assigned
2422 addresses. When oif is specified, routing
2423 tables are looked up with only one purpose:
2424 to catch if destination is gatewayed, rather than
2425 direct. Moreover, if MSG_DONTROUTE is set,
2426 we send packet, ignoring both routing tables
2427 and ifaddr state. --ANK
2428
2429
2430 We could make it even if oif is unknown,
2431 likely IPv6, but we do not.
2432 */
2433
813b3b5d
DM
2434 if (fl4->saddr == 0)
2435 fl4->saddr = inet_select_addr(dev_out, 0,
2436 RT_SCOPE_LINK);
3abd1ade 2437 res->type = RTN_UNICAST;
1da177e4
LT
2438 goto make_route;
2439 }
0315e382 2440 rth = ERR_PTR(err);
1da177e4
LT
2441 goto out;
2442 }
1da177e4 2443
3abd1ade 2444 if (res->type == RTN_LOCAL) {
813b3b5d 2445 if (!fl4->saddr) {
3abd1ade
DA
2446 if (res->fi->fib_prefsrc)
2447 fl4->saddr = res->fi->fib_prefsrc;
9fc3bbb4 2448 else
813b3b5d 2449 fl4->saddr = fl4->daddr;
9fc3bbb4 2450 }
5f02ce24
DA
2451
2452 /* L3 master device is the loopback for that domain */
3abd1ade 2453 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
b7c8487c 2454 net->loopback_dev;
839da4d9
DA
2455
2456 /* make sure orig_oif points to fib result device even
2457 * though packet rx/tx happens over loopback or l3mdev
2458 */
2459 orig_oif = FIB_RES_OIF(*res);
2460
813b3b5d 2461 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2462 flags |= RTCF_LOCAL;
2463 goto make_route;
2464 }
2465
3abd1ade 2466 fib_select_path(net, res, fl4, skb);
1da177e4 2467
3abd1ade 2468 dev_out = FIB_RES_DEV(*res);
813b3b5d 2469 fl4->flowi4_oif = dev_out->ifindex;
1da177e4
LT
2470
2471
2472make_route:
3abd1ade 2473 rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
1da177e4 2474
010c2708 2475out:
b23dd4fe 2476 return rth;
1da177e4 2477}
d8c97a94 2478
ae2688d5
JW
2479static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2480{
2481 return NULL;
2482}
2483
ebb762f2 2484static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
ec831ea7 2485{
618f9bc7
SK
2486 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2487
2488 return mtu ? : dst->dev->mtu;
ec831ea7
RD
2489}
2490
6700c270
DM
2491static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2492 struct sk_buff *skb, u32 mtu)
14e50e57
DM
2493{
2494}
2495
6700c270
DM
2496static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2497 struct sk_buff *skb)
b587ee3b
DM
2498{
2499}
2500
0972ddb2
HB
2501static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2502 unsigned long old)
2503{
2504 return NULL;
2505}
2506
14e50e57
DM
2507static struct dst_ops ipv4_dst_blackhole_ops = {
2508 .family = AF_INET,
ae2688d5 2509 .check = ipv4_blackhole_dst_check,
ebb762f2 2510 .mtu = ipv4_blackhole_mtu,
214f45c9 2511 .default_advmss = ipv4_default_advmss,
14e50e57 2512 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
b587ee3b 2513 .redirect = ipv4_rt_blackhole_redirect,
0972ddb2 2514 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
d3aaeb38 2515 .neigh_lookup = ipv4_neigh_lookup,
14e50e57
DM
2516};
2517
2774c131 2518struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
14e50e57 2519{
2774c131 2520 struct rtable *ort = (struct rtable *) dst_orig;
f5b0a874 2521 struct rtable *rt;
14e50e57 2522
6c0e7284 2523 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
14e50e57 2524 if (rt) {
d8d1f30b 2525 struct dst_entry *new = &rt->dst;
14e50e57 2526
14e50e57 2527 new->__use = 1;
352e512c 2528 new->input = dst_discard;
ede2059d 2529 new->output = dst_discard_out;
14e50e57 2530
1dbe3252 2531 new->dev = net->loopback_dev;
14e50e57
DM
2532 if (new->dev)
2533 dev_hold(new->dev);
2534
9917e1e8 2535 rt->rt_is_input = ort->rt_is_input;
5e2b61f7 2536 rt->rt_iif = ort->rt_iif;
5943634f 2537 rt->rt_pmtu = ort->rt_pmtu;
d52e5a7e 2538 rt->rt_mtu_locked = ort->rt_mtu_locked;
14e50e57 2539
ca4c3fc2 2540 rt->rt_genid = rt_genid_ipv4(net);
14e50e57
DM
2541 rt->rt_flags = ort->rt_flags;
2542 rt->rt_type = ort->rt_type;
14e50e57 2543 rt->rt_gateway = ort->rt_gateway;
155e8336 2544 rt->rt_uses_gateway = ort->rt_uses_gateway;
14e50e57 2545
caacf05e 2546 INIT_LIST_HEAD(&rt->rt_uncached);
14e50e57
DM
2547 }
2548
2774c131
DM
2549 dst_release(dst_orig);
2550
2551 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
14e50e57
DM
2552}
2553
9d6ec938 2554struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
6f9c9615 2555 const struct sock *sk)
1da177e4 2556{
9d6ec938 2557 struct rtable *rt = __ip_route_output_key(net, flp4);
1da177e4 2558
b23dd4fe
DM
2559 if (IS_ERR(rt))
2560 return rt;
1da177e4 2561
56157872 2562 if (flp4->flowi4_proto)
f92ee619
SK
2563 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2564 flowi4_to_flowi(flp4),
2565 sk, 0);
1da177e4 2566
b23dd4fe 2567 return rt;
1da177e4 2568}
d8c97a94
ACM
2569EXPORT_SYMBOL_GPL(ip_route_output_flow);
2570
3765d35e 2571/* called with rcu_read_lock held */
c36ba660 2572static int rt_fill_info(struct net *net, __be32 dst, __be32 src, u32 table_id,
15e47304 2573 struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
ba52d61e 2574 u32 seq)
1da177e4 2575{
ba52d61e 2576 struct rtable *rt = skb_rtable(skb);
1da177e4 2577 struct rtmsg *r;
be403ea1 2578 struct nlmsghdr *nlh;
2bc8ca40 2579 unsigned long expires = 0;
f185071d 2580 u32 error;
521f5490 2581 u32 metrics[RTAX_MAX];
be403ea1 2582
d3166e0c 2583 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
51456b29 2584 if (!nlh)
26932566 2585 return -EMSGSIZE;
be403ea1
TG
2586
2587 r = nlmsg_data(nlh);
1da177e4
LT
2588 r->rtm_family = AF_INET;
2589 r->rtm_dst_len = 32;
2590 r->rtm_src_len = 0;
d6c0a4f6 2591 r->rtm_tos = fl4->flowi4_tos;
8a430ed5 2592 r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
c36ba660 2593 if (nla_put_u32(skb, RTA_TABLE, table_id))
f3756b79 2594 goto nla_put_failure;
1da177e4
LT
2595 r->rtm_type = rt->rt_type;
2596 r->rtm_scope = RT_SCOPE_UNIVERSE;
2597 r->rtm_protocol = RTPROT_UNSPEC;
2598 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2599 if (rt->rt_flags & RTCF_NOTIFY)
2600 r->rtm_flags |= RTM_F_NOTIFY;
df4d9254
HFS
2601 if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2602 r->rtm_flags |= RTCF_DOREDIRECT;
be403ea1 2603
930345ea 2604 if (nla_put_in_addr(skb, RTA_DST, dst))
f3756b79 2605 goto nla_put_failure;
1a00fee4 2606 if (src) {
1da177e4 2607 r->rtm_src_len = 32;
930345ea 2608 if (nla_put_in_addr(skb, RTA_SRC, src))
f3756b79 2609 goto nla_put_failure;
1da177e4 2610 }
f3756b79
DM
2611 if (rt->dst.dev &&
2612 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2613 goto nla_put_failure;
c7066f70 2614#ifdef CONFIG_IP_ROUTE_CLASSID
f3756b79
DM
2615 if (rt->dst.tclassid &&
2616 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2617 goto nla_put_failure;
1da177e4 2618#endif
41347dcd 2619 if (!rt_is_input_route(rt) &&
d6c0a4f6 2620 fl4->saddr != src) {
930345ea 2621 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
f3756b79
DM
2622 goto nla_put_failure;
2623 }
155e8336 2624 if (rt->rt_uses_gateway &&
930345ea 2625 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
f3756b79 2626 goto nla_put_failure;
be403ea1 2627
ee9a8f7a
SK
2628 expires = rt->dst.expires;
2629 if (expires) {
2630 unsigned long now = jiffies;
2631
2632 if (time_before(now, expires))
2633 expires -= now;
2634 else
2635 expires = 0;
2636 }
2637
521f5490 2638 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
ee9a8f7a 2639 if (rt->rt_pmtu && expires)
521f5490 2640 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
d52e5a7e
SD
2641 if (rt->rt_mtu_locked && expires)
2642 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
521f5490 2643 if (rtnetlink_put_metrics(skb, metrics) < 0)
be403ea1
TG
2644 goto nla_put_failure;
2645
b4869889 2646 if (fl4->flowi4_mark &&
68aaed54 2647 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
f3756b79 2648 goto nla_put_failure;
963bfeee 2649
622ec2c9
LC
2650 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2651 nla_put_u32(skb, RTA_UID,
2652 from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2653 goto nla_put_failure;
2654
d8d1f30b 2655 error = rt->dst.error;
be403ea1 2656
c7537967 2657 if (rt_is_input_route(rt)) {
8caaf7b6
ND
2658#ifdef CONFIG_IP_MROUTE
2659 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2660 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2661 int err = ipmr_get_route(net, skb,
2662 fl4->saddr, fl4->daddr,
9f09eaea 2663 r, portid);
2cf75070 2664
8caaf7b6 2665 if (err <= 0) {
0c8d803f
DA
2666 if (err == 0)
2667 return 0;
2668 goto nla_put_failure;
8caaf7b6
ND
2669 }
2670 } else
2671#endif
91146153 2672 if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
8caaf7b6 2673 goto nla_put_failure;
1da177e4
LT
2674 }
2675
f185071d 2676 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
e3703b3d 2677 goto nla_put_failure;
be403ea1 2678
053c095a
JB
2679 nlmsg_end(skb, nlh);
2680 return 0;
1da177e4 2681
be403ea1 2682nla_put_failure:
26932566
PM
2683 nlmsg_cancel(skb, nlh);
2684 return -EMSGSIZE;
1da177e4
LT
2685}
2686
c21ef3e3
DA
2687static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2688 struct netlink_ext_ack *extack)
1da177e4 2689{
3b1e0a65 2690 struct net *net = sock_net(in_skb->sk);
d889ce3b
TG
2691 struct rtmsg *rtm;
2692 struct nlattr *tb[RTA_MAX+1];
3765d35e 2693 struct fib_result res = {};
1da177e4 2694 struct rtable *rt = NULL;
d6c0a4f6 2695 struct flowi4 fl4;
9e12bb22
AV
2696 __be32 dst = 0;
2697 __be32 src = 0;
2698 u32 iif;
d889ce3b 2699 int err;
963bfeee 2700 int mark;
1da177e4 2701 struct sk_buff *skb;
c36ba660 2702 u32 table_id = RT_TABLE_MAIN;
622ec2c9 2703 kuid_t uid;
1da177e4 2704
fceb6435 2705 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
c21ef3e3 2706 extack);
d889ce3b
TG
2707 if (err < 0)
2708 goto errout;
2709
2710 rtm = nlmsg_data(nlh);
2711
1da177e4 2712 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
51456b29 2713 if (!skb) {
d889ce3b
TG
2714 err = -ENOBUFS;
2715 goto errout;
2716 }
1da177e4
LT
2717
2718 /* Reserve room for dummy headers, this skb can pass
2719 through good chunk of routing engine.
2720 */
459a98ed 2721 skb_reset_mac_header(skb);
c1d2bbe1 2722 skb_reset_network_header(skb);
d2c962b8 2723
67b61f6c
JB
2724 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2725 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
d889ce3b 2726 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
963bfeee 2727 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
622ec2c9
LC
2728 if (tb[RTA_UID])
2729 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2730 else
2731 uid = (iif ? INVALID_UID : current_uid());
1da177e4 2732
bbadb9a2
FL
2733 /* Bugfix: need to give ip_route_input enough of an IP header to
2734 * not gag.
2735 */
2736 ip_hdr(skb)->protocol = IPPROTO_UDP;
2737 ip_hdr(skb)->saddr = src;
2738 ip_hdr(skb)->daddr = dst;
2739
2740 skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2741
d6c0a4f6
DM
2742 memset(&fl4, 0, sizeof(fl4));
2743 fl4.daddr = dst;
2744 fl4.saddr = src;
2745 fl4.flowi4_tos = rtm->rtm_tos;
2746 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2747 fl4.flowi4_mark = mark;
622ec2c9 2748 fl4.flowi4_uid = uid;
d6c0a4f6 2749
3765d35e
DA
2750 rcu_read_lock();
2751
1da177e4 2752 if (iif) {
d889ce3b
TG
2753 struct net_device *dev;
2754
3765d35e 2755 dev = dev_get_by_index_rcu(net, iif);
51456b29 2756 if (!dev) {
d889ce3b
TG
2757 err = -ENODEV;
2758 goto errout_free;
2759 }
2760
1da177e4
LT
2761 skb->protocol = htons(ETH_P_IP);
2762 skb->dev = dev;
963bfeee 2763 skb->mark = mark;
3765d35e
DA
2764 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2765 dev, &res);
d889ce3b 2766
511c3f92 2767 rt = skb_rtable(skb);
d8d1f30b
CG
2768 if (err == 0 && rt->dst.error)
2769 err = -rt->dst.error;
1da177e4 2770 } else {
6503a304 2771 fl4.flowi4_iif = LOOPBACK_IFINDEX;
3765d35e 2772 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
b23dd4fe
DM
2773 err = 0;
2774 if (IS_ERR(rt))
2775 err = PTR_ERR(rt);
2c87d63a
FW
2776 else
2777 skb_dst_set(skb, &rt->dst);
1da177e4 2778 }
d889ce3b 2779
1da177e4 2780 if (err)
d889ce3b 2781 goto errout_free;
1da177e4 2782
1da177e4
LT
2783 if (rtm->rtm_flags & RTM_F_NOTIFY)
2784 rt->rt_flags |= RTCF_NOTIFY;
2785
c36ba660 2786 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
68e813aa 2787 table_id = res.table ? res.table->tb_id : 0;
c36ba660 2788
bc3aae2b
RP
2789 if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2790 if (!res.fi) {
2791 err = fib_props[res.type].error;
2792 if (!err)
2793 err = -EHOSTUNREACH;
2794 goto errout_free;
2795 }
b6179813
RP
2796 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2797 nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2798 rt->rt_type, res.prefix, res.prefixlen,
2799 fl4.flowi4_tos, res.fi, 0);
bc3aae2b 2800 } else {
b6179813 2801 err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
ba52d61e 2802 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
bc3aae2b 2803 }
7b46a644 2804 if (err < 0)
d889ce3b 2805 goto errout_free;
1da177e4 2806
3765d35e
DA
2807 rcu_read_unlock();
2808
15e47304 2809 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
d889ce3b 2810errout:
2942e900 2811 return err;
1da177e4 2812
d889ce3b 2813errout_free:
3765d35e 2814 rcu_read_unlock();
1da177e4 2815 kfree_skb(skb);
d889ce3b 2816 goto errout;
1da177e4
LT
2817}
2818
1da177e4
LT
2819void ip_rt_multicast_event(struct in_device *in_dev)
2820{
4ccfe6d4 2821 rt_cache_flush(dev_net(in_dev->dev));
1da177e4
LT
2822}
2823
2824#ifdef CONFIG_SYSCTL
082c7ca4
G
2825static int ip_rt_gc_interval __read_mostly = 60 * HZ;
2826static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
2827static int ip_rt_gc_elasticity __read_mostly = 8;
773daa3c 2828static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
082c7ca4 2829
fe2c6338 2830static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
8d65af78 2831 void __user *buffer,
1da177e4
LT
2832 size_t *lenp, loff_t *ppos)
2833{
5aad1de5
TT
2834 struct net *net = (struct net *)__ctl->extra1;
2835
1da177e4 2836 if (write) {
5aad1de5
TT
2837 rt_cache_flush(net);
2838 fnhe_genid_bump(net);
1da177e4 2839 return 0;
e905a9ed 2840 }
1da177e4
LT
2841
2842 return -EINVAL;
2843}
2844
fe2c6338 2845static struct ctl_table ipv4_route_table[] = {
1da177e4 2846 {
1da177e4
LT
2847 .procname = "gc_thresh",
2848 .data = &ipv4_dst_ops.gc_thresh,
2849 .maxlen = sizeof(int),
2850 .mode = 0644,
6d9f239a 2851 .proc_handler = proc_dointvec,
1da177e4
LT
2852 },
2853 {
1da177e4
LT
2854 .procname = "max_size",
2855 .data = &ip_rt_max_size,
2856 .maxlen = sizeof(int),
2857 .mode = 0644,
6d9f239a 2858 .proc_handler = proc_dointvec,
1da177e4
LT
2859 },
2860 {
2861 /* Deprecated. Use gc_min_interval_ms */
e905a9ed 2862
1da177e4
LT
2863 .procname = "gc_min_interval",
2864 .data = &ip_rt_gc_min_interval,
2865 .maxlen = sizeof(int),
2866 .mode = 0644,
6d9f239a 2867 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
2868 },
2869 {
1da177e4
LT
2870 .procname = "gc_min_interval_ms",
2871 .data = &ip_rt_gc_min_interval,
2872 .maxlen = sizeof(int),
2873 .mode = 0644,
6d9f239a 2874 .proc_handler = proc_dointvec_ms_jiffies,
1da177e4
LT
2875 },
2876 {
1da177e4
LT
2877 .procname = "gc_timeout",
2878 .data = &ip_rt_gc_timeout,
2879 .maxlen = sizeof(int),
2880 .mode = 0644,
6d9f239a 2881 .proc_handler = proc_dointvec_jiffies,
1da177e4 2882 },
9f28a2fc
ED
2883 {
2884 .procname = "gc_interval",
2885 .data = &ip_rt_gc_interval,
2886 .maxlen = sizeof(int),
2887 .mode = 0644,
2888 .proc_handler = proc_dointvec_jiffies,
2889 },
1da177e4 2890 {
1da177e4
LT
2891 .procname = "redirect_load",
2892 .data = &ip_rt_redirect_load,
2893 .maxlen = sizeof(int),
2894 .mode = 0644,
6d9f239a 2895 .proc_handler = proc_dointvec,
1da177e4
LT
2896 },
2897 {
1da177e4
LT
2898 .procname = "redirect_number",
2899 .data = &ip_rt_redirect_number,
2900 .maxlen = sizeof(int),
2901 .mode = 0644,
6d9f239a 2902 .proc_handler = proc_dointvec,
1da177e4
LT
2903 },
2904 {
1da177e4
LT
2905 .procname = "redirect_silence",
2906 .data = &ip_rt_redirect_silence,
2907 .maxlen = sizeof(int),
2908 .mode = 0644,
6d9f239a 2909 .proc_handler = proc_dointvec,
1da177e4
LT
2910 },
2911 {
1da177e4
LT
2912 .procname = "error_cost",
2913 .data = &ip_rt_error_cost,
2914 .maxlen = sizeof(int),
2915 .mode = 0644,
6d9f239a 2916 .proc_handler = proc_dointvec,
1da177e4
LT
2917 },
2918 {
1da177e4
LT
2919 .procname = "error_burst",
2920 .data = &ip_rt_error_burst,
2921 .maxlen = sizeof(int),
2922 .mode = 0644,
6d9f239a 2923 .proc_handler = proc_dointvec,
1da177e4
LT
2924 },
2925 {
1da177e4
LT
2926 .procname = "gc_elasticity",
2927 .data = &ip_rt_gc_elasticity,
2928 .maxlen = sizeof(int),
2929 .mode = 0644,
6d9f239a 2930 .proc_handler = proc_dointvec,
1da177e4
LT
2931 },
2932 {
1da177e4
LT
2933 .procname = "mtu_expires",
2934 .data = &ip_rt_mtu_expires,
2935 .maxlen = sizeof(int),
2936 .mode = 0644,
6d9f239a 2937 .proc_handler = proc_dointvec_jiffies,
1da177e4
LT
2938 },
2939 {
1da177e4
LT
2940 .procname = "min_pmtu",
2941 .data = &ip_rt_min_pmtu,
2942 .maxlen = sizeof(int),
2943 .mode = 0644,
c7272c2f
SD
2944 .proc_handler = proc_dointvec_minmax,
2945 .extra1 = &ip_min_valid_pmtu,
1da177e4
LT
2946 },
2947 {
1da177e4
LT
2948 .procname = "min_adv_mss",
2949 .data = &ip_rt_min_advmss,
2950 .maxlen = sizeof(int),
2951 .mode = 0644,
6d9f239a 2952 .proc_handler = proc_dointvec,
1da177e4 2953 },
f8572d8f 2954 { }
1da177e4 2955};
39a23e75 2956
/* Per-netns sysctl table holding the single write-only "flush" entry;
 * writing to it invokes ipv4_sysctl_rtcache_flush(), which reads the
 * owning netns back out of extra1 (set in sysctl_route_net_init()). */
static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= "flush",
		.maxlen		= sizeof(int),
		.mode		= 0200,	/* write-only: flushing is an action, not a value */
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },	/* sentinel */
};
2966
2967static __net_init int sysctl_route_net_init(struct net *net)
2968{
2969 struct ctl_table *tbl;
2970
2971 tbl = ipv4_route_flush_table;
09ad9bc7 2972 if (!net_eq(net, &init_net)) {
39a23e75 2973 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
51456b29 2974 if (!tbl)
39a23e75 2975 goto err_dup;
464dc801
EB
2976
2977 /* Don't export sysctls to unprivileged users */
2978 if (net->user_ns != &init_user_ns)
2979 tbl[0].procname = NULL;
39a23e75
DL
2980 }
2981 tbl[0].extra1 = net;
2982
ec8f23ce 2983 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
51456b29 2984 if (!net->ipv4.route_hdr)
39a23e75
DL
2985 goto err_reg;
2986 return 0;
2987
2988err_reg:
2989 if (tbl != ipv4_route_flush_table)
2990 kfree(tbl);
2991err_dup:
2992 return -ENOMEM;
2993}
2994
2995static __net_exit void sysctl_route_net_exit(struct net *net)
2996{
2997 struct ctl_table *tbl;
2998
2999 tbl = net->ipv4.route_hdr->ctl_table_arg;
3000 unregister_net_sysctl_table(net->ipv4.route_hdr);
3001 BUG_ON(tbl == ipv4_route_flush_table);
3002 kfree(tbl);
3003}
3004
3005static __net_initdata struct pernet_operations sysctl_route_ops = {
3006 .init = sysctl_route_net_init,
3007 .exit = sysctl_route_net_exit,
3008};
1da177e4
LT
3009#endif
3010
3ee94372 3011static __net_init int rt_genid_init(struct net *net)
9f5e97e5 3012{
ca4c3fc2 3013 atomic_set(&net->ipv4.rt_genid, 0);
5aad1de5 3014 atomic_set(&net->fnhe_genid, 0);
7aed9f72 3015 atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
9f5e97e5
DL
3016 return 0;
3017}
3018
3ee94372
NH
3019static __net_initdata struct pernet_operations rt_genid_ops = {
3020 .init = rt_genid_init,
9f5e97e5
DL
3021};
3022
c3426b47
DM
3023static int __net_init ipv4_inetpeer_init(struct net *net)
3024{
3025 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3026
3027 if (!bp)
3028 return -ENOMEM;
3029 inet_peer_base_init(bp);
3030 net->ipv4.peers = bp;
3031 return 0;
3032}
3033
3034static void __net_exit ipv4_inetpeer_exit(struct net *net)
3035{
3036 struct inet_peer_base *bp = net->ipv4.peers;
3037
3038 net->ipv4.peers = NULL;
56a6b248 3039 inetpeer_invalidate_tree(bp);
c3426b47
DM
3040 kfree(bp);
3041}
3042
/* Per-namespace inetpeer storage lifetime hooks. */
static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init = ipv4_inetpeer_init,
	.exit = ipv4_inetpeer_exit,
};
9f5e97e5 3047
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-CPU route classid accounting; allocated in ip_rt_init()
 * (256 buckets per CPU — presumably one per classid; see the
 * __alloc_percpu call there). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
1da177e4 3051
1da177e4
LT
3052int __init ip_rt_init(void)
3053{
5055c371 3054 int cpu;
1da177e4 3055
73f156a6
ED
3056 ip_idents = kmalloc(IP_IDENTS_SZ * sizeof(*ip_idents), GFP_KERNEL);
3057 if (!ip_idents)
3058 panic("IP: failed to allocate ip_idents\n");
3059
3060 prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3061
355b590c
ED
3062 ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3063 if (!ip_tstamps)
3064 panic("IP: failed to allocate ip_tstamps\n");
3065
5055c371
ED
3066 for_each_possible_cpu(cpu) {
3067 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3068
3069 INIT_LIST_HEAD(&ul->head);
3070 spin_lock_init(&ul->lock);
3071 }
c7066f70 3072#ifdef CONFIG_IP_ROUTE_CLASSID
0dcec8c2 3073 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
1da177e4
LT
3074 if (!ip_rt_acct)
3075 panic("IP: failed to allocate ip_rt_acct\n");
1da177e4
LT
3076#endif
3077
e5d679f3
AD
3078 ipv4_dst_ops.kmem_cachep =
3079 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
20c2df83 3080 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
1da177e4 3081
14e50e57
DM
3082 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3083
fc66f95c
ED
3084 if (dst_entries_init(&ipv4_dst_ops) < 0)
3085 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3086
3087 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3088 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3089
89aef892
DM
3090 ipv4_dst_ops.gc_thresh = ~0;
3091 ip_rt_max_size = INT_MAX;
1da177e4 3092
1da177e4
LT
3093 devinet_init();
3094 ip_fib_init();
3095
73b38711 3096 if (ip_rt_proc_init())
058bd4d2 3097 pr_err("Unable to create route proc files\n");
1da177e4
LT
3098#ifdef CONFIG_XFRM
3099 xfrm_init();
703fb94e 3100 xfrm4_init();
1da177e4 3101#endif
394f51ab
FW
3102 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3103 RTNL_FLAG_DOIT_UNLOCKED);
63f3444f 3104
39a23e75
DL
3105#ifdef CONFIG_SYSCTL
3106 register_pernet_subsys(&sysctl_route_ops);
3107#endif
3ee94372 3108 register_pernet_subsys(&rt_genid_ops);
c3426b47 3109 register_pernet_subsys(&ipv4_inetpeer_ops);
1bcdca3f 3110 return 0;
1da177e4
LT
3111}
3112
#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
/* Early registration of the static ipv4_route_table for init_net,
 * done separately from ip_rt_init() because of the init-order issue
 * lamented above. */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif