]> git.ipfire.org Git - thirdparty/kernel/stable.git/blame - net/ipv6/route.c
vrf: Move fib6_table into net_vrf
[thirdparty/kernel/stable.git] / net / ipv6 / route.c
CommitLineData
1da177e4
LT
1/*
2 * Linux INET6 implementation
3 * FIB front-end.
4 *
5 * Authors:
1ab1457c 6 * Pedro Roque <roque@di.fc.ul.pt>
1da177e4 7 *
1da177e4
LT
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
12 */
13
14/* Changes:
15 *
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
c0bece9f
YH
23 * Ville Nuorvala
24 * Fixed routing subtrees.
1da177e4
LT
25 */
26
f3213831
JP
27#define pr_fmt(fmt) "IPv6: " fmt
28
4fc268d2 29#include <linux/capability.h>
1da177e4 30#include <linux/errno.h>
bc3b2d7f 31#include <linux/export.h>
1da177e4
LT
32#include <linux/types.h>
33#include <linux/times.h>
34#include <linux/socket.h>
35#include <linux/sockios.h>
36#include <linux/net.h>
37#include <linux/route.h>
38#include <linux/netdevice.h>
39#include <linux/in6.h>
7bc570c8 40#include <linux/mroute6.h>
1da177e4 41#include <linux/init.h>
1da177e4 42#include <linux/if_arp.h>
1da177e4
LT
43#include <linux/proc_fs.h>
44#include <linux/seq_file.h>
5b7c931d 45#include <linux/nsproxy.h>
5a0e3ad6 46#include <linux/slab.h>
35732d01 47#include <linux/jhash.h>
457c4cbc 48#include <net/net_namespace.h>
1da177e4
LT
49#include <net/snmp.h>
50#include <net/ipv6.h>
51#include <net/ip6_fib.h>
52#include <net/ip6_route.h>
53#include <net/ndisc.h>
54#include <net/addrconf.h>
55#include <net/tcp.h>
56#include <linux/rtnetlink.h>
57#include <net/dst.h>
904af04d 58#include <net/dst_metadata.h>
1da177e4 59#include <net/xfrm.h>
8d71740c 60#include <net/netevent.h>
21713ebc 61#include <net/netlink.h>
51ebd318 62#include <net/nexthop.h>
19e42e45 63#include <net/lwtunnel.h>
904af04d 64#include <net/ip_tunnels.h>
ca254490 65#include <net/l3mdev.h>
b811580d 66#include <trace/events/fib6.h>
1da177e4 67
7c0f6ba6 68#include <linux/uaccess.h>
1da177e4
LT
69
70#ifdef CONFIG_SYSCTL
71#include <linux/sysctl.h>
72#endif
73
/*
 * Result of the next-hop neighbour check performed while scoring a route.
 * Negative values are failures (callers test "< 0"); the positive value
 * means the route may be used.
 */
enum rt6_nud_state {
	RT6_NUD_FAIL_HARD	= -3,	/* route is skipped by find_match() */
	RT6_NUD_FAIL_PROBE	= -2,	/* neighbour is in NUD_FAILED state */
	RT6_NUD_FAIL_DO_RR	= -1,	/* usable, but request round-robin */
	RT6_NUD_SUCCEED		= 1
};
80
83a09abd 81static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
1da177e4 82static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
0dbaee3b 83static unsigned int ip6_default_advmss(const struct dst_entry *dst);
ebb762f2 84static unsigned int ip6_mtu(const struct dst_entry *dst);
1da177e4
LT
85static struct dst_entry *ip6_negative_advice(struct dst_entry *);
86static void ip6_dst_destroy(struct dst_entry *);
87static void ip6_dst_ifdown(struct dst_entry *,
88 struct net_device *dev, int how);
569d3645 89static int ip6_dst_gc(struct dst_ops *ops);
1da177e4
LT
90
91static int ip6_pkt_discard(struct sk_buff *skb);
ede2059d 92static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
7150aede 93static int ip6_pkt_prohibit(struct sk_buff *skb);
ede2059d 94static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
1da177e4 95static void ip6_link_failure(struct sk_buff *skb);
6700c270
DM
96static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
97 struct sk_buff *skb, u32 mtu);
98static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
99 struct sk_buff *skb);
4b32b5ad 100static void rt6_dst_from_metrics_check(struct rt6_info *rt);
52bd4c0c 101static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
16a16cd3
DA
102static size_t rt6_nlmsg_size(struct rt6_info *rt);
103static int rt6_fill_node(struct net *net,
104 struct sk_buff *skb, struct rt6_info *rt,
105 struct in6_addr *dst, struct in6_addr *src,
106 int iif, int type, u32 portid, u32 seq,
107 unsigned int flags);
35732d01
WW
108static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
109 struct in6_addr *daddr,
110 struct in6_addr *saddr);
1da177e4 111
70ceb4f5 112#ifdef CONFIG_IPV6_ROUTE_INFO
efa2cea0 113static struct rt6_info *rt6_add_route_info(struct net *net,
b71d1d42 114 const struct in6_addr *prefix, int prefixlen,
830218c1
DA
115 const struct in6_addr *gwaddr,
116 struct net_device *dev,
95c96174 117 unsigned int pref);
efa2cea0 118static struct rt6_info *rt6_get_route_info(struct net *net,
b71d1d42 119 const struct in6_addr *prefix, int prefixlen,
830218c1
DA
120 const struct in6_addr *gwaddr,
121 struct net_device *dev);
70ceb4f5
YH
122#endif
123
8d0b94af
MKL
124struct uncached_list {
125 spinlock_t lock;
126 struct list_head head;
127};
128
129static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
130
510c321b 131void rt6_uncached_list_add(struct rt6_info *rt)
8d0b94af
MKL
132{
133 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
134
8d0b94af
MKL
135 rt->rt6i_uncached_list = ul;
136
137 spin_lock_bh(&ul->lock);
138 list_add_tail(&rt->rt6i_uncached, &ul->head);
139 spin_unlock_bh(&ul->lock);
140}
141
510c321b 142void rt6_uncached_list_del(struct rt6_info *rt)
8d0b94af
MKL
143{
144 if (!list_empty(&rt->rt6i_uncached)) {
145 struct uncached_list *ul = rt->rt6i_uncached_list;
81eb8447 146 struct net *net = dev_net(rt->dst.dev);
8d0b94af
MKL
147
148 spin_lock_bh(&ul->lock);
149 list_del(&rt->rt6i_uncached);
81eb8447 150 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
8d0b94af
MKL
151 spin_unlock_bh(&ul->lock);
152 }
153}
154
155static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
156{
157 struct net_device *loopback_dev = net->loopback_dev;
158 int cpu;
159
e332bc67
EB
160 if (dev == loopback_dev)
161 return;
162
8d0b94af
MKL
163 for_each_possible_cpu(cpu) {
164 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
165 struct rt6_info *rt;
166
167 spin_lock_bh(&ul->lock);
168 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
169 struct inet6_dev *rt_idev = rt->rt6i_idev;
170 struct net_device *rt_dev = rt->dst.dev;
171
e332bc67 172 if (rt_idev->dev == dev) {
8d0b94af
MKL
173 rt->rt6i_idev = in6_dev_get(loopback_dev);
174 in6_dev_put(rt_idev);
175 }
176
e332bc67 177 if (rt_dev == dev) {
8d0b94af
MKL
178 rt->dst.dev = loopback_dev;
179 dev_hold(rt->dst.dev);
180 dev_put(rt_dev);
181 }
182 }
183 spin_unlock_bh(&ul->lock);
184 }
185}
186
d52d3997
MKL
187static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
188{
3a2232e9 189 return dst_metrics_write_ptr(&rt->from->dst);
d52d3997
MKL
190}
191
06582540
DM
192static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
193{
4b32b5ad 194 struct rt6_info *rt = (struct rt6_info *)dst;
06582540 195
d52d3997
MKL
196 if (rt->rt6i_flags & RTF_PCPU)
197 return rt6_pcpu_cow_metrics(rt);
198 else if (rt->rt6i_flags & RTF_CACHE)
4b32b5ad
MKL
199 return NULL;
200 else
3b471175 201 return dst_cow_metrics_generic(dst, old);
06582540
DM
202}
203
f894cbf8
DM
204static inline const void *choose_neigh_daddr(struct rt6_info *rt,
205 struct sk_buff *skb,
206 const void *daddr)
39232973
DM
207{
208 struct in6_addr *p = &rt->rt6i_gateway;
209
a7563f34 210 if (!ipv6_addr_any(p))
39232973 211 return (const void *) p;
f894cbf8
DM
212 else if (skb)
213 return &ipv6_hdr(skb)->daddr;
39232973
DM
214 return daddr;
215}
216
f894cbf8
DM
217static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
218 struct sk_buff *skb,
219 const void *daddr)
d3aaeb38 220{
39232973
DM
221 struct rt6_info *rt = (struct rt6_info *) dst;
222 struct neighbour *n;
223
f894cbf8 224 daddr = choose_neigh_daddr(rt, skb, daddr);
8e022ee6 225 n = __ipv6_neigh_lookup(dst->dev, daddr);
f83c7790
DM
226 if (n)
227 return n;
228 return neigh_create(&nd_tbl, daddr, dst->dev);
229}
230
63fca65d
JA
231static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
232{
233 struct net_device *dev = dst->dev;
234 struct rt6_info *rt = (struct rt6_info *)dst;
235
236 daddr = choose_neigh_daddr(rt, NULL, daddr);
237 if (!daddr)
238 return;
239 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
240 return;
241 if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
242 return;
243 __ipv6_confirm_neigh(dev, daddr);
244}
245
9a7ec3a9 246static struct dst_ops ip6_dst_ops_template = {
1da177e4 247 .family = AF_INET6,
1da177e4
LT
248 .gc = ip6_dst_gc,
249 .gc_thresh = 1024,
250 .check = ip6_dst_check,
0dbaee3b 251 .default_advmss = ip6_default_advmss,
ebb762f2 252 .mtu = ip6_mtu,
06582540 253 .cow_metrics = ipv6_cow_metrics,
1da177e4
LT
254 .destroy = ip6_dst_destroy,
255 .ifdown = ip6_dst_ifdown,
256 .negative_advice = ip6_negative_advice,
257 .link_failure = ip6_link_failure,
258 .update_pmtu = ip6_rt_update_pmtu,
6e157b6a 259 .redirect = rt6_do_redirect,
9f8955cc 260 .local_out = __ip6_local_out,
d3aaeb38 261 .neigh_lookup = ip6_neigh_lookup,
63fca65d 262 .confirm_neigh = ip6_confirm_neigh,
1da177e4
LT
263};
264
ebb762f2 265static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
ec831ea7 266{
618f9bc7
SK
267 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
268
269 return mtu ? : dst->dev->mtu;
ec831ea7
RD
270}
271
6700c270
DM
272static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
273 struct sk_buff *skb, u32 mtu)
14e50e57
DM
274{
275}
276
6700c270
DM
/* Redirects are ignored on blackhole routes. */
static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				      struct sk_buff *skb)
{
	/* intentionally empty */
}
281
14e50e57
DM
282static struct dst_ops ip6_dst_blackhole_ops = {
283 .family = AF_INET6,
14e50e57
DM
284 .destroy = ip6_dst_destroy,
285 .check = ip6_dst_check,
ebb762f2 286 .mtu = ip6_blackhole_mtu,
214f45c9 287 .default_advmss = ip6_default_advmss,
14e50e57 288 .update_pmtu = ip6_rt_blackhole_update_pmtu,
b587ee3b 289 .redirect = ip6_rt_blackhole_redirect,
0a1f5962 290 .cow_metrics = dst_cow_metrics_generic,
d3aaeb38 291 .neigh_lookup = ip6_neigh_lookup,
14e50e57
DM
292};
293
62fa8a84 294static const u32 ip6_template_metrics[RTAX_MAX] = {
14edd87d 295 [RTAX_HOPLIMIT - 1] = 0,
62fa8a84
DM
296};
297
fb0af4c7 298static const struct rt6_info ip6_null_entry_template = {
d8d1f30b
CG
299 .dst = {
300 .__refcnt = ATOMIC_INIT(1),
301 .__use = 1,
2c20cbd7 302 .obsolete = DST_OBSOLETE_FORCE_CHK,
d8d1f30b 303 .error = -ENETUNREACH,
d8d1f30b
CG
304 .input = ip6_pkt_discard,
305 .output = ip6_pkt_discard_out,
1da177e4
LT
306 },
307 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
4f724279 308 .rt6i_protocol = RTPROT_KERNEL,
1da177e4
LT
309 .rt6i_metric = ~(u32) 0,
310 .rt6i_ref = ATOMIC_INIT(1),
311};
312
101367c2
TG
313#ifdef CONFIG_IPV6_MULTIPLE_TABLES
314
fb0af4c7 315static const struct rt6_info ip6_prohibit_entry_template = {
d8d1f30b
CG
316 .dst = {
317 .__refcnt = ATOMIC_INIT(1),
318 .__use = 1,
2c20cbd7 319 .obsolete = DST_OBSOLETE_FORCE_CHK,
d8d1f30b 320 .error = -EACCES,
d8d1f30b
CG
321 .input = ip6_pkt_prohibit,
322 .output = ip6_pkt_prohibit_out,
101367c2
TG
323 },
324 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
4f724279 325 .rt6i_protocol = RTPROT_KERNEL,
101367c2
TG
326 .rt6i_metric = ~(u32) 0,
327 .rt6i_ref = ATOMIC_INIT(1),
328};
329
fb0af4c7 330static const struct rt6_info ip6_blk_hole_entry_template = {
d8d1f30b
CG
331 .dst = {
332 .__refcnt = ATOMIC_INIT(1),
333 .__use = 1,
2c20cbd7 334 .obsolete = DST_OBSOLETE_FORCE_CHK,
d8d1f30b 335 .error = -EINVAL,
d8d1f30b 336 .input = dst_discard,
ede2059d 337 .output = dst_discard_out,
101367c2
TG
338 },
339 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
4f724279 340 .rt6i_protocol = RTPROT_KERNEL,
101367c2
TG
341 .rt6i_metric = ~(u32) 0,
342 .rt6i_ref = ATOMIC_INIT(1),
343};
344
345#endif
346
ebfa45f0
MKL
347static void rt6_info_init(struct rt6_info *rt)
348{
349 struct dst_entry *dst = &rt->dst;
350
351 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
352 INIT_LIST_HEAD(&rt->rt6i_siblings);
353 INIT_LIST_HEAD(&rt->rt6i_uncached);
354}
355
1da177e4 356/* allocate dst with ip6_dst_ops */
d52d3997
MKL
357static struct rt6_info *__ip6_dst_alloc(struct net *net,
358 struct net_device *dev,
ad706862 359 int flags)
1da177e4 360{
97bab73f 361 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
b2a9c0ed 362 1, DST_OBSOLETE_FORCE_CHK, flags);
cf911662 363
81eb8447 364 if (rt) {
ebfa45f0 365 rt6_info_init(rt);
81eb8447
WW
366 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
367 }
8104891b 368
cf911662 369 return rt;
1da177e4
LT
370}
371
9ab179d8
DA
372struct rt6_info *ip6_dst_alloc(struct net *net,
373 struct net_device *dev,
374 int flags)
d52d3997 375{
ad706862 376 struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
d52d3997
MKL
377
378 if (rt) {
379 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
bfd8e5a4 380 if (!rt->rt6i_pcpu) {
587fea74 381 dst_release_immediate(&rt->dst);
d52d3997
MKL
382 return NULL;
383 }
384 }
385
386 return rt;
387}
9ab179d8 388EXPORT_SYMBOL(ip6_dst_alloc);
d52d3997 389
1da177e4
LT
390static void ip6_dst_destroy(struct dst_entry *dst)
391{
392 struct rt6_info *rt = (struct rt6_info *)dst;
35732d01 393 struct rt6_exception_bucket *bucket;
3a2232e9 394 struct rt6_info *from = rt->from;
8d0b94af 395 struct inet6_dev *idev;
1da177e4 396
4b32b5ad 397 dst_destroy_metrics_generic(dst);
87775312 398 free_percpu(rt->rt6i_pcpu);
8d0b94af
MKL
399 rt6_uncached_list_del(rt);
400
401 idev = rt->rt6i_idev;
38308473 402 if (idev) {
1da177e4
LT
403 rt->rt6i_idev = NULL;
404 in6_dev_put(idev);
1ab1457c 405 }
35732d01
WW
406 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket, 1);
407 if (bucket) {
408 rt->rt6i_exception_bucket = NULL;
409 kfree(bucket);
410 }
1716a961 411
3a2232e9
DM
412 rt->from = NULL;
413 dst_release(&from->dst);
b3419363
DM
414}
415
1da177e4
LT
416static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
417 int how)
418{
419 struct rt6_info *rt = (struct rt6_info *)dst;
420 struct inet6_dev *idev = rt->rt6i_idev;
5a3e55d6 421 struct net_device *loopback_dev =
c346dca1 422 dev_net(dev)->loopback_dev;
1da177e4 423
e5645f51
WW
424 if (idev && idev->dev != loopback_dev) {
425 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
426 if (loopback_idev) {
427 rt->rt6i_idev = loopback_idev;
428 in6_dev_put(idev);
97cac082 429 }
1da177e4
LT
430 }
431}
432
5973fb1e
MKL
433static bool __rt6_check_expired(const struct rt6_info *rt)
434{
435 if (rt->rt6i_flags & RTF_EXPIRES)
436 return time_after(jiffies, rt->dst.expires);
437 else
438 return false;
439}
440
a50feda5 441static bool rt6_check_expired(const struct rt6_info *rt)
1da177e4 442{
1716a961
G
443 if (rt->rt6i_flags & RTF_EXPIRES) {
444 if (time_after(jiffies, rt->dst.expires))
a50feda5 445 return true;
3a2232e9 446 } else if (rt->from) {
1e2ea8ad 447 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
3a2232e9 448 rt6_check_expired(rt->from);
1716a961 449 }
a50feda5 450 return false;
1da177e4
LT
451}
452
b4bac172
DA
453static struct rt6_info *rt6_multipath_select(const struct net *net,
454 struct rt6_info *match,
52bd4c0c 455 struct flowi6 *fl6, int oif,
b75cc8f9 456 const struct sk_buff *skb,
52bd4c0c 457 int strict)
51ebd318
ND
458{
459 struct rt6_info *sibling, *next_sibling;
51ebd318 460
b673d6cc
JS
461 /* We might have already computed the hash for ICMPv6 errors. In such
462 * case it will always be non-zero. Otherwise now is the time to do it.
463 */
464 if (!fl6->mp_hash)
b4bac172 465 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
b673d6cc 466
3d709f69
IS
467 if (fl6->mp_hash <= atomic_read(&match->rt6i_nh_upper_bound))
468 return match;
469
470 list_for_each_entry_safe(sibling, next_sibling, &match->rt6i_siblings,
471 rt6i_siblings) {
472 if (fl6->mp_hash > atomic_read(&sibling->rt6i_nh_upper_bound))
473 continue;
474 if (rt6_score_route(sibling, oif, strict) < 0)
475 break;
476 match = sibling;
477 break;
478 }
479
51ebd318
ND
480 return match;
481}
482
1da177e4 483/*
66f5d6ce 484 * Route lookup. rcu_read_lock() should be held.
1da177e4
LT
485 */
486
8ed67789
DL
487static inline struct rt6_info *rt6_device_match(struct net *net,
488 struct rt6_info *rt,
b71d1d42 489 const struct in6_addr *saddr,
1da177e4 490 int oif,
d420895e 491 int flags)
1da177e4
LT
492{
493 struct rt6_info *local = NULL;
494 struct rt6_info *sprt;
495
8067bb8c
IS
496 if (!oif && ipv6_addr_any(saddr) && !(rt->rt6i_nh_flags & RTNH_F_DEAD))
497 return rt;
dd3abc4e 498
071fb37e 499 for (sprt = rt; sprt; sprt = rcu_dereference(sprt->rt6_next)) {
d1918542 500 struct net_device *dev = sprt->dst.dev;
dd3abc4e 501
8067bb8c
IS
502 if (sprt->rt6i_nh_flags & RTNH_F_DEAD)
503 continue;
504
dd3abc4e 505 if (oif) {
1da177e4
LT
506 if (dev->ifindex == oif)
507 return sprt;
508 if (dev->flags & IFF_LOOPBACK) {
38308473 509 if (!sprt->rt6i_idev ||
1da177e4 510 sprt->rt6i_idev->dev->ifindex != oif) {
17fb0b2b 511 if (flags & RT6_LOOKUP_F_IFACE)
1da177e4 512 continue;
17fb0b2b
DA
513 if (local &&
514 local->rt6i_idev->dev->ifindex == oif)
1da177e4
LT
515 continue;
516 }
517 local = sprt;
518 }
dd3abc4e
YH
519 } else {
520 if (ipv6_chk_addr(net, saddr, dev,
521 flags & RT6_LOOKUP_F_IFACE))
522 return sprt;
1da177e4 523 }
dd3abc4e 524 }
1da177e4 525
dd3abc4e 526 if (oif) {
1da177e4
LT
527 if (local)
528 return local;
529
d420895e 530 if (flags & RT6_LOOKUP_F_IFACE)
8ed67789 531 return net->ipv6.ip6_null_entry;
1da177e4 532 }
8067bb8c
IS
533
534 return rt->rt6i_nh_flags & RTNH_F_DEAD ? net->ipv6.ip6_null_entry : rt;
1da177e4
LT
535}
536
27097255 537#ifdef CONFIG_IPV6_ROUTER_PREF
c2f17e82
HFS
538struct __rt6_probe_work {
539 struct work_struct work;
540 struct in6_addr target;
541 struct net_device *dev;
542};
543
544static void rt6_probe_deferred(struct work_struct *w)
545{
546 struct in6_addr mcaddr;
547 struct __rt6_probe_work *work =
548 container_of(w, struct __rt6_probe_work, work);
549
550 addrconf_addr_solict_mult(&work->target, &mcaddr);
adc176c5 551 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
c2f17e82 552 dev_put(work->dev);
662f5533 553 kfree(work);
c2f17e82
HFS
554}
555
27097255
YH
556static void rt6_probe(struct rt6_info *rt)
557{
990edb42 558 struct __rt6_probe_work *work;
f2c31e32 559 struct neighbour *neigh;
27097255
YH
560 /*
561 * Okay, this does not seem to be appropriate
562 * for now, however, we need to check if it
563 * is really so; aka Router Reachability Probing.
564 *
565 * Router Reachability Probe MUST be rate-limited
566 * to no more than one per minute.
567 */
2152caea 568 if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
7ff74a59 569 return;
2152caea
YH
570 rcu_read_lock_bh();
571 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
572 if (neigh) {
8d6c31bf
MKL
573 if (neigh->nud_state & NUD_VALID)
574 goto out;
575
990edb42 576 work = NULL;
2152caea 577 write_lock(&neigh->lock);
990edb42
MKL
578 if (!(neigh->nud_state & NUD_VALID) &&
579 time_after(jiffies,
580 neigh->updated +
581 rt->rt6i_idev->cnf.rtr_probe_interval)) {
582 work = kmalloc(sizeof(*work), GFP_ATOMIC);
583 if (work)
584 __neigh_set_probe_once(neigh);
c2f17e82 585 }
2152caea 586 write_unlock(&neigh->lock);
990edb42
MKL
587 } else {
588 work = kmalloc(sizeof(*work), GFP_ATOMIC);
f2c31e32 589 }
990edb42
MKL
590
591 if (work) {
592 INIT_WORK(&work->work, rt6_probe_deferred);
593 work->target = rt->rt6i_gateway;
594 dev_hold(rt->dst.dev);
595 work->dev = rt->dst.dev;
596 schedule_work(&work->work);
597 }
598
8d6c31bf 599out:
2152caea 600 rcu_read_unlock_bh();
27097255
YH
601}
602#else
/* Router probing is compiled out without CONFIG_IPV6_ROUTER_PREF. */
static inline void rt6_probe(struct rt6_info *rt)
{
	/* nothing to do */
}
606#endif
607
1da177e4 608/*
554cfb7e 609 * Default Router Selection (RFC 2461 6.3.6)
1da177e4 610 */
b6f99a21 611static inline int rt6_check_dev(struct rt6_info *rt, int oif)
554cfb7e 612{
d1918542 613 struct net_device *dev = rt->dst.dev;
161980f4 614 if (!oif || dev->ifindex == oif)
554cfb7e 615 return 2;
161980f4
DM
616 if ((dev->flags & IFF_LOOPBACK) &&
617 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
618 return 1;
619 return 0;
554cfb7e 620}
1da177e4 621
afc154e9 622static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
1da177e4 623{
f2c31e32 624 struct neighbour *neigh;
afc154e9 625 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
f2c31e32 626
4d0c5911
YH
627 if (rt->rt6i_flags & RTF_NONEXTHOP ||
628 !(rt->rt6i_flags & RTF_GATEWAY))
afc154e9 629 return RT6_NUD_SUCCEED;
145a3621
YH
630
631 rcu_read_lock_bh();
632 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
633 if (neigh) {
634 read_lock(&neigh->lock);
554cfb7e 635 if (neigh->nud_state & NUD_VALID)
afc154e9 636 ret = RT6_NUD_SUCCEED;
398bcbeb 637#ifdef CONFIG_IPV6_ROUTER_PREF
a5a81f0b 638 else if (!(neigh->nud_state & NUD_FAILED))
afc154e9 639 ret = RT6_NUD_SUCCEED;
7e980569
JB
640 else
641 ret = RT6_NUD_FAIL_PROBE;
398bcbeb 642#endif
145a3621 643 read_unlock(&neigh->lock);
afc154e9
HFS
644 } else {
645 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
7e980569 646 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
a5a81f0b 647 }
145a3621
YH
648 rcu_read_unlock_bh();
649
a5a81f0b 650 return ret;
1da177e4
LT
651}
652
554cfb7e
YH
653static int rt6_score_route(struct rt6_info *rt, int oif,
654 int strict)
1da177e4 655{
a5a81f0b 656 int m;
1ab1457c 657
4d0c5911 658 m = rt6_check_dev(rt, oif);
77d16f45 659 if (!m && (strict & RT6_LOOKUP_F_IFACE))
afc154e9 660 return RT6_NUD_FAIL_HARD;
ebacaaa0
YH
661#ifdef CONFIG_IPV6_ROUTER_PREF
662 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
663#endif
afc154e9
HFS
664 if (strict & RT6_LOOKUP_F_REACHABLE) {
665 int n = rt6_check_neigh(rt);
666 if (n < 0)
667 return n;
668 }
554cfb7e
YH
669 return m;
670}
671
f11e6659 672static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
afc154e9
HFS
673 int *mpri, struct rt6_info *match,
674 bool *do_rr)
554cfb7e 675{
f11e6659 676 int m;
afc154e9 677 bool match_do_rr = false;
35103d11 678 struct inet6_dev *idev = rt->rt6i_idev;
35103d11 679
8067bb8c
IS
680 if (rt->rt6i_nh_flags & RTNH_F_DEAD)
681 goto out;
682
14c5206c
IS
683 if (idev->cnf.ignore_routes_with_linkdown &&
684 rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
d5d32e4b 685 !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
35103d11 686 goto out;
f11e6659
DM
687
688 if (rt6_check_expired(rt))
689 goto out;
690
691 m = rt6_score_route(rt, oif, strict);
7e980569 692 if (m == RT6_NUD_FAIL_DO_RR) {
afc154e9
HFS
693 match_do_rr = true;
694 m = 0; /* lowest valid score */
7e980569 695 } else if (m == RT6_NUD_FAIL_HARD) {
f11e6659 696 goto out;
afc154e9
HFS
697 }
698
699 if (strict & RT6_LOOKUP_F_REACHABLE)
700 rt6_probe(rt);
f11e6659 701
7e980569 702 /* note that m can be RT6_NUD_FAIL_PROBE at this point */
f11e6659 703 if (m > *mpri) {
afc154e9 704 *do_rr = match_do_rr;
f11e6659
DM
705 *mpri = m;
706 match = rt;
f11e6659 707 }
f11e6659
DM
708out:
709 return match;
710}
711
712static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
8d1040e8 713 struct rt6_info *leaf,
f11e6659 714 struct rt6_info *rr_head,
afc154e9
HFS
715 u32 metric, int oif, int strict,
716 bool *do_rr)
f11e6659 717{
9fbdcfaf 718 struct rt6_info *rt, *match, *cont;
554cfb7e 719 int mpri = -1;
1da177e4 720
f11e6659 721 match = NULL;
9fbdcfaf 722 cont = NULL;
071fb37e 723 for (rt = rr_head; rt; rt = rcu_dereference(rt->rt6_next)) {
9fbdcfaf
SK
724 if (rt->rt6i_metric != metric) {
725 cont = rt;
726 break;
727 }
728
729 match = find_match(rt, oif, strict, &mpri, match, do_rr);
730 }
731
66f5d6ce 732 for (rt = leaf; rt && rt != rr_head;
071fb37e 733 rt = rcu_dereference(rt->rt6_next)) {
9fbdcfaf
SK
734 if (rt->rt6i_metric != metric) {
735 cont = rt;
736 break;
737 }
738
afc154e9 739 match = find_match(rt, oif, strict, &mpri, match, do_rr);
9fbdcfaf
SK
740 }
741
742 if (match || !cont)
743 return match;
744
071fb37e 745 for (rt = cont; rt; rt = rcu_dereference(rt->rt6_next))
afc154e9 746 match = find_match(rt, oif, strict, &mpri, match, do_rr);
1da177e4 747
f11e6659
DM
748 return match;
749}
1da177e4 750
8d1040e8
WW
751static struct rt6_info *rt6_select(struct net *net, struct fib6_node *fn,
752 int oif, int strict)
f11e6659 753{
66f5d6ce 754 struct rt6_info *leaf = rcu_dereference(fn->leaf);
f11e6659 755 struct rt6_info *match, *rt0;
afc154e9 756 bool do_rr = false;
17ecf590 757 int key_plen;
1da177e4 758
87b1af8d 759 if (!leaf || leaf == net->ipv6.ip6_null_entry)
8d1040e8
WW
760 return net->ipv6.ip6_null_entry;
761
66f5d6ce 762 rt0 = rcu_dereference(fn->rr_ptr);
f11e6659 763 if (!rt0)
66f5d6ce 764 rt0 = leaf;
1da177e4 765
17ecf590
WW
766 /* Double check to make sure fn is not an intermediate node
767 * and fn->leaf does not points to its child's leaf
768 * (This might happen if all routes under fn are deleted from
769 * the tree and fib6_repair_tree() is called on the node.)
770 */
771 key_plen = rt0->rt6i_dst.plen;
772#ifdef CONFIG_IPV6_SUBTREES
773 if (rt0->rt6i_src.plen)
774 key_plen = rt0->rt6i_src.plen;
775#endif
776 if (fn->fn_bit != key_plen)
777 return net->ipv6.ip6_null_entry;
778
8d1040e8 779 match = find_rr_leaf(fn, leaf, rt0, rt0->rt6i_metric, oif, strict,
afc154e9 780 &do_rr);
1da177e4 781
afc154e9 782 if (do_rr) {
071fb37e 783 struct rt6_info *next = rcu_dereference(rt0->rt6_next);
f11e6659 784
554cfb7e 785 /* no entries matched; do round-robin */
f11e6659 786 if (!next || next->rt6i_metric != rt0->rt6i_metric)
8d1040e8 787 next = leaf;
f11e6659 788
66f5d6ce
WW
789 if (next != rt0) {
790 spin_lock_bh(&leaf->rt6i_table->tb6_lock);
791 /* make sure next is not being deleted from the tree */
792 if (next->rt6i_node)
793 rcu_assign_pointer(fn->rr_ptr, next);
794 spin_unlock_bh(&leaf->rt6i_table->tb6_lock);
795 }
1da177e4 796 }
1da177e4 797
a02cec21 798 return match ? match : net->ipv6.ip6_null_entry;
1da177e4
LT
799}
800
8b9df265
MKL
801static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
802{
803 return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
804}
805
70ceb4f5
YH
806#ifdef CONFIG_IPV6_ROUTE_INFO
807int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
b71d1d42 808 const struct in6_addr *gwaddr)
70ceb4f5 809{
c346dca1 810 struct net *net = dev_net(dev);
70ceb4f5
YH
811 struct route_info *rinfo = (struct route_info *) opt;
812 struct in6_addr prefix_buf, *prefix;
813 unsigned int pref;
4bed72e4 814 unsigned long lifetime;
70ceb4f5
YH
815 struct rt6_info *rt;
816
817 if (len < sizeof(struct route_info)) {
818 return -EINVAL;
819 }
820
821 /* Sanity check for prefix_len and length */
822 if (rinfo->length > 3) {
823 return -EINVAL;
824 } else if (rinfo->prefix_len > 128) {
825 return -EINVAL;
826 } else if (rinfo->prefix_len > 64) {
827 if (rinfo->length < 2) {
828 return -EINVAL;
829 }
830 } else if (rinfo->prefix_len > 0) {
831 if (rinfo->length < 1) {
832 return -EINVAL;
833 }
834 }
835
836 pref = rinfo->route_pref;
837 if (pref == ICMPV6_ROUTER_PREF_INVALID)
3933fc95 838 return -EINVAL;
70ceb4f5 839
4bed72e4 840 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
70ceb4f5
YH
841
842 if (rinfo->length == 3)
843 prefix = (struct in6_addr *)rinfo->prefix;
844 else {
845 /* this function is safe */
846 ipv6_addr_prefix(&prefix_buf,
847 (struct in6_addr *)rinfo->prefix,
848 rinfo->prefix_len);
849 prefix = &prefix_buf;
850 }
851
f104a567
DJ
852 if (rinfo->prefix_len == 0)
853 rt = rt6_get_dflt_router(gwaddr, dev);
854 else
855 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
830218c1 856 gwaddr, dev);
70ceb4f5
YH
857
858 if (rt && !lifetime) {
e0a1ad73 859 ip6_del_rt(rt);
70ceb4f5
YH
860 rt = NULL;
861 }
862
863 if (!rt && lifetime)
830218c1
DA
864 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
865 dev, pref);
70ceb4f5
YH
866 else if (rt)
867 rt->rt6i_flags = RTF_ROUTEINFO |
868 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
869
870 if (rt) {
1716a961
G
871 if (!addrconf_finite_timeout(lifetime))
872 rt6_clean_expires(rt);
873 else
874 rt6_set_expires(rt, jiffies + HZ * lifetime);
875
94e187c0 876 ip6_rt_put(rt);
70ceb4f5
YH
877 }
878 return 0;
879}
880#endif
881
a3c00e46
MKL
882static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
883 struct in6_addr *saddr)
884{
66f5d6ce 885 struct fib6_node *pn, *sn;
a3c00e46
MKL
886 while (1) {
887 if (fn->fn_flags & RTN_TL_ROOT)
888 return NULL;
66f5d6ce
WW
889 pn = rcu_dereference(fn->parent);
890 sn = FIB6_SUBTREE(pn);
891 if (sn && sn != fn)
892 fn = fib6_lookup(sn, NULL, saddr);
a3c00e46
MKL
893 else
894 fn = pn;
895 if (fn->fn_flags & RTN_RTINFO)
896 return fn;
897 }
898}
c71099ac 899
d3843fe5
WW
900static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
901 bool null_fallback)
902{
903 struct rt6_info *rt = *prt;
904
905 if (dst_hold_safe(&rt->dst))
906 return true;
907 if (null_fallback) {
908 rt = net->ipv6.ip6_null_entry;
909 dst_hold(&rt->dst);
910 } else {
911 rt = NULL;
912 }
913 *prt = rt;
914 return false;
915}
916
8ed67789
DL
917static struct rt6_info *ip6_pol_route_lookup(struct net *net,
918 struct fib6_table *table,
b75cc8f9
DA
919 struct flowi6 *fl6,
920 const struct sk_buff *skb,
921 int flags)
1da177e4 922{
2b760fcf 923 struct rt6_info *rt, *rt_cache;
1da177e4 924 struct fib6_node *fn;
1da177e4 925
b6cdbc85
DA
926 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
927 flags &= ~RT6_LOOKUP_F_IFACE;
928
66f5d6ce 929 rcu_read_lock();
4c9483b2 930 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
c71099ac 931restart:
66f5d6ce
WW
932 rt = rcu_dereference(fn->leaf);
933 if (!rt) {
934 rt = net->ipv6.ip6_null_entry;
935 } else {
936 rt = rt6_device_match(net, rt, &fl6->saddr,
937 fl6->flowi6_oif, flags);
938 if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
b4bac172 939 rt = rt6_multipath_select(net, rt, fl6, fl6->flowi6_oif,
b75cc8f9 940 skb, flags);
66f5d6ce 941 }
a3c00e46
MKL
942 if (rt == net->ipv6.ip6_null_entry) {
943 fn = fib6_backtrack(fn, &fl6->saddr);
944 if (fn)
945 goto restart;
946 }
2b760fcf
WW
947 /* Search through exception table */
948 rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
949 if (rt_cache)
950 rt = rt_cache;
951
d3843fe5
WW
952 if (ip6_hold_safe(net, &rt, true))
953 dst_use_noref(&rt->dst, jiffies);
954
66f5d6ce 955 rcu_read_unlock();
b811580d 956
b65f164d 957 trace_fib6_table_lookup(net, rt, table, fl6);
b811580d 958
c71099ac
TG
959 return rt;
960
961}
962
67ba4152 963struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
b75cc8f9 964 const struct sk_buff *skb, int flags)
ea6e574e 965{
b75cc8f9 966 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
ea6e574e
FW
967}
968EXPORT_SYMBOL_GPL(ip6_route_lookup);
969
9acd9f3a 970struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
b75cc8f9
DA
971 const struct in6_addr *saddr, int oif,
972 const struct sk_buff *skb, int strict)
c71099ac 973{
4c9483b2
DM
974 struct flowi6 fl6 = {
975 .flowi6_oif = oif,
976 .daddr = *daddr,
c71099ac
TG
977 };
978 struct dst_entry *dst;
77d16f45 979 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
c71099ac 980
adaa70bb 981 if (saddr) {
4c9483b2 982 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
adaa70bb
TG
983 flags |= RT6_LOOKUP_F_HAS_SADDR;
984 }
985
b75cc8f9 986 dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
c71099ac
TG
987 if (dst->error == 0)
988 return (struct rt6_info *) dst;
989
990 dst_release(dst);
991
1da177e4
LT
992 return NULL;
993}
7159039a
YH
994EXPORT_SYMBOL(rt6_lookup);
995
c71099ac 996/* ip6_ins_rt is called with FREE table->tb6_lock.
1cfb71ee
WW
997 * It takes new route entry, the addition fails by any reason the
998 * route is released.
999 * Caller must hold dst before calling it.
1da177e4
LT
1000 */
1001
e5fd387a 1002static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
333c4301
DA
1003 struct mx6_config *mxc,
1004 struct netlink_ext_ack *extack)
1da177e4
LT
1005{
1006 int err;
c71099ac 1007 struct fib6_table *table;
1da177e4 1008
c71099ac 1009 table = rt->rt6i_table;
66f5d6ce 1010 spin_lock_bh(&table->tb6_lock);
333c4301 1011 err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
66f5d6ce 1012 spin_unlock_bh(&table->tb6_lock);
1da177e4
LT
1013
1014 return err;
1015}
1016
40e22e8f
TG
1017int ip6_ins_rt(struct rt6_info *rt)
1018{
e715b6d3
FW
1019 struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
1020 struct mx6_config mxc = { .mx = NULL, };
1021
1cfb71ee
WW
1022 /* Hold dst to account for the reference from the fib6 tree */
1023 dst_hold(&rt->dst);
333c4301 1024 return __ip6_ins_rt(rt, &info, &mxc, NULL);
40e22e8f
TG
1025}
1026
4832c30d
DA
1027/* called with rcu_lock held */
1028static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
1029{
1030 struct net_device *dev = rt->dst.dev;
1031
98d11291 1032 if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
4832c30d
DA
1033 /* for copies of local routes, dst->dev needs to be the
1034 * device if it is a master device, the master device if
1035 * device is enslaved, and the loopback as the default
1036 */
1037 if (netif_is_l3_slave(dev) &&
1038 !rt6_need_strict(&rt->rt6i_dst.addr))
1039 dev = l3mdev_master_dev_rcu(dev);
1040 else if (!netif_is_l3_master(dev))
1041 dev = dev_net(dev)->loopback_dev;
1042 /* last case is netif_is_l3_master(dev) is true in which
1043 * case we want dev returned to be dev
1044 */
1045 }
1046
1047 return dev;
1048}
1049
8b9df265
MKL
1050static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
1051 const struct in6_addr *daddr,
1052 const struct in6_addr *saddr)
1da177e4 1053{
4832c30d 1054 struct net_device *dev;
1da177e4
LT
1055 struct rt6_info *rt;
1056
1057 /*
1058 * Clone the route.
1059 */
1060
d52d3997 1061 if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
3a2232e9 1062 ort = ort->from;
1da177e4 1063
4832c30d
DA
1064 rcu_read_lock();
1065 dev = ip6_rt_get_dev_rcu(ort);
1066 rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
1067 rcu_read_unlock();
83a09abd
MKL
1068 if (!rt)
1069 return NULL;
1070
1071 ip6_rt_copy_init(rt, ort);
1072 rt->rt6i_flags |= RTF_CACHE;
1073 rt->rt6i_metric = 0;
1074 rt->dst.flags |= DST_HOST;
1075 rt->rt6i_dst.addr = *daddr;
1076 rt->rt6i_dst.plen = 128;
1da177e4 1077
83a09abd
MKL
1078 if (!rt6_is_gw_or_nonexthop(ort)) {
1079 if (ort->rt6i_dst.plen != 128 &&
1080 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
1081 rt->rt6i_flags |= RTF_ANYCAST;
1da177e4 1082#ifdef CONFIG_IPV6_SUBTREES
83a09abd
MKL
1083 if (rt->rt6i_src.plen && saddr) {
1084 rt->rt6i_src.addr = *saddr;
1085 rt->rt6i_src.plen = 128;
8b9df265 1086 }
83a09abd 1087#endif
95a9a5ba 1088 }
1da177e4 1089
95a9a5ba
YH
1090 return rt;
1091}
1da177e4 1092
d52d3997
MKL
1093static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1094{
4832c30d 1095 struct net_device *dev;
d52d3997
MKL
1096 struct rt6_info *pcpu_rt;
1097
4832c30d
DA
1098 rcu_read_lock();
1099 dev = ip6_rt_get_dev_rcu(rt);
1100 pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
1101 rcu_read_unlock();
d52d3997
MKL
1102 if (!pcpu_rt)
1103 return NULL;
1104 ip6_rt_copy_init(pcpu_rt, rt);
1105 pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1106 pcpu_rt->rt6i_flags |= RTF_PCPU;
1107 return pcpu_rt;
1108}
1109
66f5d6ce 1110/* It should be called with rcu_read_lock() acquired */
d52d3997
MKL
1111static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1112{
a73e4195 1113 struct rt6_info *pcpu_rt, **p;
d52d3997
MKL
1114
1115 p = this_cpu_ptr(rt->rt6i_pcpu);
1116 pcpu_rt = *p;
1117
d3843fe5 1118 if (pcpu_rt && ip6_hold_safe(NULL, &pcpu_rt, false))
a73e4195 1119 rt6_dst_from_metrics_check(pcpu_rt);
d3843fe5 1120
a73e4195
MKL
1121 return pcpu_rt;
1122}
1123
1124static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1125{
1126 struct rt6_info *pcpu_rt, *prev, **p;
d52d3997
MKL
1127
1128 pcpu_rt = ip6_rt_pcpu_alloc(rt);
1129 if (!pcpu_rt) {
1130 struct net *net = dev_net(rt->dst.dev);
1131
9c7370a1
MKL
1132 dst_hold(&net->ipv6.ip6_null_entry->dst);
1133 return net->ipv6.ip6_null_entry;
d52d3997
MKL
1134 }
1135
a94b9367
WW
1136 dst_hold(&pcpu_rt->dst);
1137 p = this_cpu_ptr(rt->rt6i_pcpu);
1138 prev = cmpxchg(p, NULL, pcpu_rt);
951f788a 1139 BUG_ON(prev);
a94b9367 1140
d52d3997
MKL
1141 rt6_dst_from_metrics_check(pcpu_rt);
1142 return pcpu_rt;
1143}
1144
35732d01
WW
1145/* exception hash table implementation
1146 */
1147static DEFINE_SPINLOCK(rt6_exception_lock);
1148
1149/* Remove rt6_ex from hash table and free the memory
1150 * Caller must hold rt6_exception_lock
1151 */
1152static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1153 struct rt6_exception *rt6_ex)
1154{
b2427e67 1155 struct net *net;
81eb8447 1156
35732d01
WW
1157 if (!bucket || !rt6_ex)
1158 return;
b2427e67
CIK
1159
1160 net = dev_net(rt6_ex->rt6i->dst.dev);
35732d01
WW
1161 rt6_ex->rt6i->rt6i_node = NULL;
1162 hlist_del_rcu(&rt6_ex->hlist);
1163 rt6_release(rt6_ex->rt6i);
1164 kfree_rcu(rt6_ex, rcu);
1165 WARN_ON_ONCE(!bucket->depth);
1166 bucket->depth--;
81eb8447 1167 net->ipv6.rt6_stats->fib_rt_cache--;
35732d01
WW
1168}
1169
1170/* Remove oldest rt6_ex in bucket and free the memory
1171 * Caller must hold rt6_exception_lock
1172 */
1173static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1174{
1175 struct rt6_exception *rt6_ex, *oldest = NULL;
1176
1177 if (!bucket)
1178 return;
1179
1180 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1181 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1182 oldest = rt6_ex;
1183 }
1184 rt6_remove_exception(bucket, oldest);
1185}
1186
1187static u32 rt6_exception_hash(const struct in6_addr *dst,
1188 const struct in6_addr *src)
1189{
1190 static u32 seed __read_mostly;
1191 u32 val;
1192
1193 net_get_random_once(&seed, sizeof(seed));
1194 val = jhash(dst, sizeof(*dst), seed);
1195
1196#ifdef CONFIG_IPV6_SUBTREES
1197 if (src)
1198 val = jhash(src, sizeof(*src), val);
1199#endif
1200 return hash_32(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1201}
1202
1203/* Helper function to find the cached rt in the hash table
1204 * and update bucket pointer to point to the bucket for this
1205 * (daddr, saddr) pair
1206 * Caller must hold rt6_exception_lock
1207 */
1208static struct rt6_exception *
1209__rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1210 const struct in6_addr *daddr,
1211 const struct in6_addr *saddr)
1212{
1213 struct rt6_exception *rt6_ex;
1214 u32 hval;
1215
1216 if (!(*bucket) || !daddr)
1217 return NULL;
1218
1219 hval = rt6_exception_hash(daddr, saddr);
1220 *bucket += hval;
1221
1222 hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1223 struct rt6_info *rt6 = rt6_ex->rt6i;
1224 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1225
1226#ifdef CONFIG_IPV6_SUBTREES
1227 if (matched && saddr)
1228 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1229#endif
1230 if (matched)
1231 return rt6_ex;
1232 }
1233 return NULL;
1234}
1235
1236/* Helper function to find the cached rt in the hash table
1237 * and update bucket pointer to point to the bucket for this
1238 * (daddr, saddr) pair
1239 * Caller must hold rcu_read_lock()
1240 */
1241static struct rt6_exception *
1242__rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1243 const struct in6_addr *daddr,
1244 const struct in6_addr *saddr)
1245{
1246 struct rt6_exception *rt6_ex;
1247 u32 hval;
1248
1249 WARN_ON_ONCE(!rcu_read_lock_held());
1250
1251 if (!(*bucket) || !daddr)
1252 return NULL;
1253
1254 hval = rt6_exception_hash(daddr, saddr);
1255 *bucket += hval;
1256
1257 hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1258 struct rt6_info *rt6 = rt6_ex->rt6i;
1259 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1260
1261#ifdef CONFIG_IPV6_SUBTREES
1262 if (matched && saddr)
1263 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1264#endif
1265 if (matched)
1266 return rt6_ex;
1267 }
1268 return NULL;
1269}
1270
1271static int rt6_insert_exception(struct rt6_info *nrt,
1272 struct rt6_info *ort)
1273{
81eb8447 1274 struct net *net = dev_net(ort->dst.dev);
35732d01
WW
1275 struct rt6_exception_bucket *bucket;
1276 struct in6_addr *src_key = NULL;
1277 struct rt6_exception *rt6_ex;
1278 int err = 0;
1279
1280 /* ort can't be a cache or pcpu route */
1281 if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
3a2232e9 1282 ort = ort->from;
35732d01
WW
1283 WARN_ON_ONCE(ort->rt6i_flags & (RTF_CACHE | RTF_PCPU));
1284
1285 spin_lock_bh(&rt6_exception_lock);
1286
1287 if (ort->exception_bucket_flushed) {
1288 err = -EINVAL;
1289 goto out;
1290 }
1291
1292 bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1293 lockdep_is_held(&rt6_exception_lock));
1294 if (!bucket) {
1295 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1296 GFP_ATOMIC);
1297 if (!bucket) {
1298 err = -ENOMEM;
1299 goto out;
1300 }
1301 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1302 }
1303
1304#ifdef CONFIG_IPV6_SUBTREES
1305 /* rt6i_src.plen != 0 indicates ort is in subtree
1306 * and exception table is indexed by a hash of
1307 * both rt6i_dst and rt6i_src.
1308 * Otherwise, the exception table is indexed by
1309 * a hash of only rt6i_dst.
1310 */
1311 if (ort->rt6i_src.plen)
1312 src_key = &nrt->rt6i_src.addr;
1313#endif
60006a48
WW
1314
1315 /* Update rt6i_prefsrc as it could be changed
1316 * in rt6_remove_prefsrc()
1317 */
1318 nrt->rt6i_prefsrc = ort->rt6i_prefsrc;
f5bbe7ee
WW
1319 /* rt6_mtu_change() might lower mtu on ort.
1320 * Only insert this exception route if its mtu
1321 * is less than ort's mtu value.
1322 */
1323 if (nrt->rt6i_pmtu >= dst_mtu(&ort->dst)) {
1324 err = -EINVAL;
1325 goto out;
1326 }
60006a48 1327
35732d01
WW
1328 rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1329 src_key);
1330 if (rt6_ex)
1331 rt6_remove_exception(bucket, rt6_ex);
1332
1333 rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1334 if (!rt6_ex) {
1335 err = -ENOMEM;
1336 goto out;
1337 }
1338 rt6_ex->rt6i = nrt;
1339 rt6_ex->stamp = jiffies;
1340 atomic_inc(&nrt->rt6i_ref);
1341 nrt->rt6i_node = ort->rt6i_node;
1342 hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1343 bucket->depth++;
81eb8447 1344 net->ipv6.rt6_stats->fib_rt_cache++;
35732d01
WW
1345
1346 if (bucket->depth > FIB6_MAX_DEPTH)
1347 rt6_exception_remove_oldest(bucket);
1348
1349out:
1350 spin_unlock_bh(&rt6_exception_lock);
1351
1352 /* Update fn->fn_sernum to invalidate all cached dst */
b886d5f2 1353 if (!err) {
922c2ac8 1354 spin_lock_bh(&ort->rt6i_table->tb6_lock);
35732d01 1355 fib6_update_sernum(ort);
922c2ac8 1356 spin_unlock_bh(&ort->rt6i_table->tb6_lock);
b886d5f2
PA
1357 fib6_force_start_gc(net);
1358 }
35732d01
WW
1359
1360 return err;
1361}
1362
1363void rt6_flush_exceptions(struct rt6_info *rt)
1364{
1365 struct rt6_exception_bucket *bucket;
1366 struct rt6_exception *rt6_ex;
1367 struct hlist_node *tmp;
1368 int i;
1369
1370 spin_lock_bh(&rt6_exception_lock);
1371 /* Prevent rt6_insert_exception() to recreate the bucket list */
1372 rt->exception_bucket_flushed = 1;
1373
1374 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1375 lockdep_is_held(&rt6_exception_lock));
1376 if (!bucket)
1377 goto out;
1378
1379 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1380 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1381 rt6_remove_exception(bucket, rt6_ex);
1382 WARN_ON_ONCE(bucket->depth);
1383 bucket++;
1384 }
1385
1386out:
1387 spin_unlock_bh(&rt6_exception_lock);
1388}
1389
1390/* Find cached rt in the hash table inside passed in rt
1391 * Caller has to hold rcu_read_lock()
1392 */
1393static struct rt6_info *rt6_find_cached_rt(struct rt6_info *rt,
1394 struct in6_addr *daddr,
1395 struct in6_addr *saddr)
1396{
1397 struct rt6_exception_bucket *bucket;
1398 struct in6_addr *src_key = NULL;
1399 struct rt6_exception *rt6_ex;
1400 struct rt6_info *res = NULL;
1401
1402 bucket = rcu_dereference(rt->rt6i_exception_bucket);
1403
1404#ifdef CONFIG_IPV6_SUBTREES
1405 /* rt6i_src.plen != 0 indicates rt is in subtree
1406 * and exception table is indexed by a hash of
1407 * both rt6i_dst and rt6i_src.
1408 * Otherwise, the exception table is indexed by
1409 * a hash of only rt6i_dst.
1410 */
1411 if (rt->rt6i_src.plen)
1412 src_key = saddr;
1413#endif
1414 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1415
1416 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1417 res = rt6_ex->rt6i;
1418
1419 return res;
1420}
1421
1422/* Remove the passed in cached rt from the hash table that contains it */
1423int rt6_remove_exception_rt(struct rt6_info *rt)
1424{
35732d01 1425 struct rt6_exception_bucket *bucket;
3a2232e9 1426 struct rt6_info *from = rt->from;
35732d01
WW
1427 struct in6_addr *src_key = NULL;
1428 struct rt6_exception *rt6_ex;
1429 int err;
1430
1431 if (!from ||
442d713b 1432 !(rt->rt6i_flags & RTF_CACHE))
35732d01
WW
1433 return -EINVAL;
1434
1435 if (!rcu_access_pointer(from->rt6i_exception_bucket))
1436 return -ENOENT;
1437
1438 spin_lock_bh(&rt6_exception_lock);
1439 bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1440 lockdep_is_held(&rt6_exception_lock));
1441#ifdef CONFIG_IPV6_SUBTREES
1442 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1443 * and exception table is indexed by a hash of
1444 * both rt6i_dst and rt6i_src.
1445 * Otherwise, the exception table is indexed by
1446 * a hash of only rt6i_dst.
1447 */
1448 if (from->rt6i_src.plen)
1449 src_key = &rt->rt6i_src.addr;
1450#endif
1451 rt6_ex = __rt6_find_exception_spinlock(&bucket,
1452 &rt->rt6i_dst.addr,
1453 src_key);
1454 if (rt6_ex) {
1455 rt6_remove_exception(bucket, rt6_ex);
1456 err = 0;
1457 } else {
1458 err = -ENOENT;
1459 }
1460
1461 spin_unlock_bh(&rt6_exception_lock);
1462 return err;
1463}
1464
1465/* Find rt6_ex which contains the passed in rt cache and
1466 * refresh its stamp
1467 */
1468static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1469{
35732d01 1470 struct rt6_exception_bucket *bucket;
3a2232e9 1471 struct rt6_info *from = rt->from;
35732d01
WW
1472 struct in6_addr *src_key = NULL;
1473 struct rt6_exception *rt6_ex;
1474
1475 if (!from ||
442d713b 1476 !(rt->rt6i_flags & RTF_CACHE))
35732d01
WW
1477 return;
1478
1479 rcu_read_lock();
1480 bucket = rcu_dereference(from->rt6i_exception_bucket);
1481
1482#ifdef CONFIG_IPV6_SUBTREES
1483 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1484 * and exception table is indexed by a hash of
1485 * both rt6i_dst and rt6i_src.
1486 * Otherwise, the exception table is indexed by
1487 * a hash of only rt6i_dst.
1488 */
1489 if (from->rt6i_src.plen)
1490 src_key = &rt->rt6i_src.addr;
1491#endif
1492 rt6_ex = __rt6_find_exception_rcu(&bucket,
1493 &rt->rt6i_dst.addr,
1494 src_key);
1495 if (rt6_ex)
1496 rt6_ex->stamp = jiffies;
1497
1498 rcu_read_unlock();
1499}
1500
60006a48
WW
1501static void rt6_exceptions_remove_prefsrc(struct rt6_info *rt)
1502{
1503 struct rt6_exception_bucket *bucket;
1504 struct rt6_exception *rt6_ex;
1505 int i;
1506
1507 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1508 lockdep_is_held(&rt6_exception_lock));
1509
1510 if (bucket) {
1511 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1512 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1513 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1514 }
1515 bucket++;
1516 }
1517 }
1518}
1519
e9fa1495
SB
1520static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1521 struct rt6_info *rt, int mtu)
1522{
1523 /* If the new MTU is lower than the route PMTU, this new MTU will be the
1524 * lowest MTU in the path: always allow updating the route PMTU to
1525 * reflect PMTU decreases.
1526 *
1527 * If the new MTU is higher, and the route PMTU is equal to the local
1528 * MTU, this means the old MTU is the lowest in the path, so allow
1529 * updating it: if other nodes now have lower MTUs, PMTU discovery will
1530 * handle this.
1531 */
1532
1533 if (dst_mtu(&rt->dst) >= mtu)
1534 return true;
1535
1536 if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1537 return true;
1538
1539 return false;
1540}
1541
1542static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1543 struct rt6_info *rt, int mtu)
f5bbe7ee
WW
1544{
1545 struct rt6_exception_bucket *bucket;
1546 struct rt6_exception *rt6_ex;
1547 int i;
1548
1549 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1550 lockdep_is_held(&rt6_exception_lock));
1551
e9fa1495
SB
1552 if (!bucket)
1553 return;
1554
1555 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1556 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1557 struct rt6_info *entry = rt6_ex->rt6i;
1558
1559 /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1560 * route), the metrics of its rt->dst.from have already
1561 * been updated.
1562 */
1563 if (entry->rt6i_pmtu &&
1564 rt6_mtu_change_route_allowed(idev, entry, mtu))
1565 entry->rt6i_pmtu = mtu;
f5bbe7ee 1566 }
e9fa1495 1567 bucket++;
f5bbe7ee
WW
1568 }
1569}
1570
b16cb459
WW
1571#define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
1572
1573static void rt6_exceptions_clean_tohost(struct rt6_info *rt,
1574 struct in6_addr *gateway)
1575{
1576 struct rt6_exception_bucket *bucket;
1577 struct rt6_exception *rt6_ex;
1578 struct hlist_node *tmp;
1579 int i;
1580
1581 if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1582 return;
1583
1584 spin_lock_bh(&rt6_exception_lock);
1585 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1586 lockdep_is_held(&rt6_exception_lock));
1587
1588 if (bucket) {
1589 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1590 hlist_for_each_entry_safe(rt6_ex, tmp,
1591 &bucket->chain, hlist) {
1592 struct rt6_info *entry = rt6_ex->rt6i;
1593
1594 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1595 RTF_CACHE_GATEWAY &&
1596 ipv6_addr_equal(gateway,
1597 &entry->rt6i_gateway)) {
1598 rt6_remove_exception(bucket, rt6_ex);
1599 }
1600 }
1601 bucket++;
1602 }
1603 }
1604
1605 spin_unlock_bh(&rt6_exception_lock);
1606}
1607
c757faa8
WW
1608static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1609 struct rt6_exception *rt6_ex,
1610 struct fib6_gc_args *gc_args,
1611 unsigned long now)
1612{
1613 struct rt6_info *rt = rt6_ex->rt6i;
1614
1859bac0
PA
1615 /* we are pruning and obsoleting aged-out and non gateway exceptions
1616 * even if others have still references to them, so that on next
1617 * dst_check() such references can be dropped.
1618 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1619 * expired, independently from their aging, as per RFC 8201 section 4
1620 */
31afeb42
WW
1621 if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1622 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1623 RT6_TRACE("aging clone %p\n", rt);
1624 rt6_remove_exception(bucket, rt6_ex);
1625 return;
1626 }
1627 } else if (time_after(jiffies, rt->dst.expires)) {
1628 RT6_TRACE("purging expired route %p\n", rt);
c757faa8
WW
1629 rt6_remove_exception(bucket, rt6_ex);
1630 return;
31afeb42
WW
1631 }
1632
1633 if (rt->rt6i_flags & RTF_GATEWAY) {
c757faa8
WW
1634 struct neighbour *neigh;
1635 __u8 neigh_flags = 0;
1636
1bfa26ff
ED
1637 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1638 if (neigh)
c757faa8 1639 neigh_flags = neigh->flags;
1bfa26ff 1640
c757faa8
WW
1641 if (!(neigh_flags & NTF_ROUTER)) {
1642 RT6_TRACE("purging route %p via non-router but gateway\n",
1643 rt);
1644 rt6_remove_exception(bucket, rt6_ex);
1645 return;
1646 }
1647 }
31afeb42 1648
c757faa8
WW
1649 gc_args->more++;
1650}
1651
1652void rt6_age_exceptions(struct rt6_info *rt,
1653 struct fib6_gc_args *gc_args,
1654 unsigned long now)
1655{
1656 struct rt6_exception_bucket *bucket;
1657 struct rt6_exception *rt6_ex;
1658 struct hlist_node *tmp;
1659 int i;
1660
1661 if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1662 return;
1663
1bfa26ff
ED
1664 rcu_read_lock_bh();
1665 spin_lock(&rt6_exception_lock);
c757faa8
WW
1666 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1667 lockdep_is_held(&rt6_exception_lock));
1668
1669 if (bucket) {
1670 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1671 hlist_for_each_entry_safe(rt6_ex, tmp,
1672 &bucket->chain, hlist) {
1673 rt6_age_examine_exception(bucket, rt6_ex,
1674 gc_args, now);
1675 }
1676 bucket++;
1677 }
1678 }
1bfa26ff
ED
1679 spin_unlock(&rt6_exception_lock);
1680 rcu_read_unlock_bh();
c757faa8
WW
1681}
1682
9ff74384 1683struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
b75cc8f9
DA
1684 int oif, struct flowi6 *fl6,
1685 const struct sk_buff *skb, int flags)
1da177e4 1686{
367efcb9 1687 struct fib6_node *fn, *saved_fn;
2b760fcf 1688 struct rt6_info *rt, *rt_cache;
c71099ac 1689 int strict = 0;
1da177e4 1690
77d16f45 1691 strict |= flags & RT6_LOOKUP_F_IFACE;
d5d32e4b 1692 strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
367efcb9
MKL
1693 if (net->ipv6.devconf_all->forwarding == 0)
1694 strict |= RT6_LOOKUP_F_REACHABLE;
1da177e4 1695
66f5d6ce 1696 rcu_read_lock();
1da177e4 1697
4c9483b2 1698 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
367efcb9 1699 saved_fn = fn;
1da177e4 1700
ca254490
DA
1701 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1702 oif = 0;
1703
a3c00e46 1704redo_rt6_select:
8d1040e8 1705 rt = rt6_select(net, fn, oif, strict);
52bd4c0c 1706 if (rt->rt6i_nsiblings)
b4bac172 1707 rt = rt6_multipath_select(net, rt, fl6, oif, skb, strict);
a3c00e46
MKL
1708 if (rt == net->ipv6.ip6_null_entry) {
1709 fn = fib6_backtrack(fn, &fl6->saddr);
1710 if (fn)
1711 goto redo_rt6_select;
367efcb9
MKL
1712 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1713 /* also consider unreachable route */
1714 strict &= ~RT6_LOOKUP_F_REACHABLE;
1715 fn = saved_fn;
1716 goto redo_rt6_select;
367efcb9 1717 }
a3c00e46
MKL
1718 }
1719
2b760fcf
WW
1720 /*Search through exception table */
1721 rt_cache = rt6_find_cached_rt(rt, &fl6->daddr, &fl6->saddr);
1722 if (rt_cache)
1723 rt = rt_cache;
fb9de91e 1724
d3843fe5 1725 if (rt == net->ipv6.ip6_null_entry) {
66f5d6ce 1726 rcu_read_unlock();
d3843fe5 1727 dst_hold(&rt->dst);
b65f164d 1728 trace_fib6_table_lookup(net, rt, table, fl6);
d3843fe5
WW
1729 return rt;
1730 } else if (rt->rt6i_flags & RTF_CACHE) {
1731 if (ip6_hold_safe(net, &rt, true)) {
1732 dst_use_noref(&rt->dst, jiffies);
1733 rt6_dst_from_metrics_check(rt);
1734 }
66f5d6ce 1735 rcu_read_unlock();
b65f164d 1736 trace_fib6_table_lookup(net, rt, table, fl6);
d52d3997 1737 return rt;
3da59bd9
MKL
1738 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1739 !(rt->rt6i_flags & RTF_GATEWAY))) {
1740 /* Create a RTF_CACHE clone which will not be
1741 * owned by the fib6 tree. It is for the special case where
1742 * the daddr in the skb during the neighbor look-up is different
1743 * from the fl6->daddr used to look-up route here.
1744 */
1745
1746 struct rt6_info *uncached_rt;
1747
d3843fe5
WW
1748 if (ip6_hold_safe(net, &rt, true)) {
1749 dst_use_noref(&rt->dst, jiffies);
1750 } else {
66f5d6ce 1751 rcu_read_unlock();
d3843fe5
WW
1752 uncached_rt = rt;
1753 goto uncached_rt_out;
1754 }
66f5d6ce 1755 rcu_read_unlock();
d52d3997 1756
3da59bd9
MKL
1757 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1758 dst_release(&rt->dst);
c71099ac 1759
1cfb71ee
WW
1760 if (uncached_rt) {
1761 /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1762 * No need for another dst_hold()
1763 */
8d0b94af 1764 rt6_uncached_list_add(uncached_rt);
81eb8447 1765 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1cfb71ee 1766 } else {
3da59bd9 1767 uncached_rt = net->ipv6.ip6_null_entry;
1cfb71ee
WW
1768 dst_hold(&uncached_rt->dst);
1769 }
b811580d 1770
d3843fe5 1771uncached_rt_out:
b65f164d 1772 trace_fib6_table_lookup(net, uncached_rt, table, fl6);
3da59bd9 1773 return uncached_rt;
3da59bd9 1774
d52d3997
MKL
1775 } else {
1776 /* Get a percpu copy */
1777
1778 struct rt6_info *pcpu_rt;
1779
d3843fe5 1780 dst_use_noref(&rt->dst, jiffies);
951f788a 1781 local_bh_disable();
d52d3997 1782 pcpu_rt = rt6_get_pcpu_route(rt);
d52d3997 1783
951f788a 1784 if (!pcpu_rt) {
a94b9367
WW
1785 /* atomic_inc_not_zero() is needed when using rcu */
1786 if (atomic_inc_not_zero(&rt->rt6i_ref)) {
951f788a 1787 /* No dst_hold() on rt is needed because grabbing
a94b9367
WW
1788 * rt->rt6i_ref makes sure rt can't be released.
1789 */
a94b9367
WW
1790 pcpu_rt = rt6_make_pcpu_route(rt);
1791 rt6_release(rt);
1792 } else {
1793 /* rt is already removed from tree */
a94b9367
WW
1794 pcpu_rt = net->ipv6.ip6_null_entry;
1795 dst_hold(&pcpu_rt->dst);
1796 }
9c7370a1 1797 }
951f788a
ED
1798 local_bh_enable();
1799 rcu_read_unlock();
b65f164d 1800 trace_fib6_table_lookup(net, pcpu_rt, table, fl6);
d52d3997
MKL
1801 return pcpu_rt;
1802 }
1da177e4 1803}
9ff74384 1804EXPORT_SYMBOL_GPL(ip6_pol_route);
1da177e4 1805
b75cc8f9
DA
1806static struct rt6_info *ip6_pol_route_input(struct net *net,
1807 struct fib6_table *table,
1808 struct flowi6 *fl6,
1809 const struct sk_buff *skb,
1810 int flags)
4acad72d 1811{
b75cc8f9 1812 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
4acad72d
PE
1813}
1814
d409b847
MB
1815struct dst_entry *ip6_route_input_lookup(struct net *net,
1816 struct net_device *dev,
b75cc8f9
DA
1817 struct flowi6 *fl6,
1818 const struct sk_buff *skb,
1819 int flags)
72331bc0
SL
1820{
1821 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1822 flags |= RT6_LOOKUP_F_IFACE;
1823
b75cc8f9 1824 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
72331bc0 1825}
d409b847 1826EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
72331bc0 1827
23aebdac 1828static void ip6_multipath_l3_keys(const struct sk_buff *skb,
5e5d6fed
RP
1829 struct flow_keys *keys,
1830 struct flow_keys *flkeys)
23aebdac
JS
1831{
1832 const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1833 const struct ipv6hdr *key_iph = outer_iph;
5e5d6fed 1834 struct flow_keys *_flkeys = flkeys;
23aebdac
JS
1835 const struct ipv6hdr *inner_iph;
1836 const struct icmp6hdr *icmph;
1837 struct ipv6hdr _inner_iph;
1838
1839 if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
1840 goto out;
1841
1842 icmph = icmp6_hdr(skb);
1843 if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1844 icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1845 icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1846 icmph->icmp6_type != ICMPV6_PARAMPROB)
1847 goto out;
1848
1849 inner_iph = skb_header_pointer(skb,
1850 skb_transport_offset(skb) + sizeof(*icmph),
1851 sizeof(_inner_iph), &_inner_iph);
1852 if (!inner_iph)
1853 goto out;
1854
1855 key_iph = inner_iph;
5e5d6fed 1856 _flkeys = NULL;
23aebdac 1857out:
5e5d6fed
RP
1858 if (_flkeys) {
1859 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
1860 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
1861 keys->tags.flow_label = _flkeys->tags.flow_label;
1862 keys->basic.ip_proto = _flkeys->basic.ip_proto;
1863 } else {
1864 keys->addrs.v6addrs.src = key_iph->saddr;
1865 keys->addrs.v6addrs.dst = key_iph->daddr;
1866 keys->tags.flow_label = ip6_flowinfo(key_iph);
1867 keys->basic.ip_proto = key_iph->nexthdr;
1868 }
23aebdac
JS
1869}
1870
1871/* if skb is set it will be used and fl6 can be NULL */
b4bac172
DA
1872u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
1873 const struct sk_buff *skb, struct flow_keys *flkeys)
23aebdac
JS
1874{
1875 struct flow_keys hash_keys;
9a2a537a 1876 u32 mhash;
23aebdac 1877
bbfa047a 1878 switch (ip6_multipath_hash_policy(net)) {
b4bac172
DA
1879 case 0:
1880 memset(&hash_keys, 0, sizeof(hash_keys));
1881 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1882 if (skb) {
1883 ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
1884 } else {
1885 hash_keys.addrs.v6addrs.src = fl6->saddr;
1886 hash_keys.addrs.v6addrs.dst = fl6->daddr;
1887 hash_keys.tags.flow_label = (__force u32)fl6->flowlabel;
1888 hash_keys.basic.ip_proto = fl6->flowi6_proto;
1889 }
1890 break;
1891 case 1:
1892 if (skb) {
1893 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1894 struct flow_keys keys;
1895
1896 /* short-circuit if we already have L4 hash present */
1897 if (skb->l4_hash)
1898 return skb_get_hash_raw(skb) >> 1;
1899
1900 memset(&hash_keys, 0, sizeof(hash_keys));
1901
1902 if (!flkeys) {
1903 skb_flow_dissect_flow_keys(skb, &keys, flag);
1904 flkeys = &keys;
1905 }
1906 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1907 hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
1908 hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
1909 hash_keys.ports.src = flkeys->ports.src;
1910 hash_keys.ports.dst = flkeys->ports.dst;
1911 hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1912 } else {
1913 memset(&hash_keys, 0, sizeof(hash_keys));
1914 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1915 hash_keys.addrs.v6addrs.src = fl6->saddr;
1916 hash_keys.addrs.v6addrs.dst = fl6->daddr;
1917 hash_keys.ports.src = fl6->fl6_sport;
1918 hash_keys.ports.dst = fl6->fl6_dport;
1919 hash_keys.basic.ip_proto = fl6->flowi6_proto;
1920 }
1921 break;
23aebdac 1922 }
9a2a537a 1923 mhash = flow_hash_from_keys(&hash_keys);
23aebdac 1924
9a2a537a 1925 return mhash >> 1;
23aebdac
JS
1926}
1927
c71099ac
TG
1928void ip6_route_input(struct sk_buff *skb)
1929{
b71d1d42 1930 const struct ipv6hdr *iph = ipv6_hdr(skb);
c346dca1 1931 struct net *net = dev_net(skb->dev);
adaa70bb 1932 int flags = RT6_LOOKUP_F_HAS_SADDR;
904af04d 1933 struct ip_tunnel_info *tun_info;
4c9483b2 1934 struct flowi6 fl6 = {
e0d56fdd 1935 .flowi6_iif = skb->dev->ifindex,
4c9483b2
DM
1936 .daddr = iph->daddr,
1937 .saddr = iph->saddr,
6502ca52 1938 .flowlabel = ip6_flowinfo(iph),
4c9483b2
DM
1939 .flowi6_mark = skb->mark,
1940 .flowi6_proto = iph->nexthdr,
c71099ac 1941 };
5e5d6fed 1942 struct flow_keys *flkeys = NULL, _flkeys;
adaa70bb 1943
904af04d 1944 tun_info = skb_tunnel_info(skb);
46fa062a 1945 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
904af04d 1946 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
5e5d6fed
RP
1947
1948 if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
1949 flkeys = &_flkeys;
1950
23aebdac 1951 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
b4bac172 1952 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
06e9d040 1953 skb_dst_drop(skb);
b75cc8f9
DA
1954 skb_dst_set(skb,
1955 ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
c71099ac
TG
1956}
1957
b75cc8f9
DA
1958static struct rt6_info *ip6_pol_route_output(struct net *net,
1959 struct fib6_table *table,
1960 struct flowi6 *fl6,
1961 const struct sk_buff *skb,
1962 int flags)
1da177e4 1963{
b75cc8f9 1964 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
c71099ac
TG
1965}
1966
6f21c96a
PA
1967struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1968 struct flowi6 *fl6, int flags)
c71099ac 1969{
d46a9d67 1970 bool any_src;
c71099ac 1971
4c1feac5
DA
1972 if (rt6_need_strict(&fl6->daddr)) {
1973 struct dst_entry *dst;
1974
1975 dst = l3mdev_link_scope_lookup(net, fl6);
1976 if (dst)
1977 return dst;
1978 }
ca254490 1979
1fb9489b 1980 fl6->flowi6_iif = LOOPBACK_IFINDEX;
4dc27d1c 1981
d46a9d67 1982 any_src = ipv6_addr_any(&fl6->saddr);
741a11d9 1983 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
d46a9d67 1984 (fl6->flowi6_oif && any_src))
77d16f45 1985 flags |= RT6_LOOKUP_F_IFACE;
c71099ac 1986
d46a9d67 1987 if (!any_src)
adaa70bb 1988 flags |= RT6_LOOKUP_F_HAS_SADDR;
0c9a2ac1
YH
1989 else if (sk)
1990 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
adaa70bb 1991
b75cc8f9 1992 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
1da177e4 1993}
6f21c96a 1994EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1da177e4 1995
2774c131 1996struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
14e50e57 1997{
5c1e6aa3 1998 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1dbe3252 1999 struct net_device *loopback_dev = net->loopback_dev;
14e50e57
DM
2000 struct dst_entry *new = NULL;
2001
1dbe3252 2002 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
62cf27e5 2003 DST_OBSOLETE_DEAD, 0);
14e50e57 2004 if (rt) {
0a1f5962 2005 rt6_info_init(rt);
81eb8447 2006 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
8104891b 2007
0a1f5962 2008 new = &rt->dst;
14e50e57 2009 new->__use = 1;
352e512c 2010 new->input = dst_discard;
ede2059d 2011 new->output = dst_discard_out;
14e50e57 2012
0a1f5962 2013 dst_copy_metrics(new, &ort->dst);
14e50e57 2014
1dbe3252 2015 rt->rt6i_idev = in6_dev_get(loopback_dev);
4e3fd7a0 2016 rt->rt6i_gateway = ort->rt6i_gateway;
0a1f5962 2017 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
14e50e57
DM
2018 rt->rt6i_metric = 0;
2019
2020 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2021#ifdef CONFIG_IPV6_SUBTREES
2022 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2023#endif
14e50e57
DM
2024 }
2025
69ead7af
DM
2026 dst_release(dst_orig);
2027 return new ? new : ERR_PTR(-ENOMEM);
14e50e57 2028}
14e50e57 2029
1da177e4
LT
2030/*
2031 * Destination cache support functions
2032 */
2033
4b32b5ad
MKL
2034static void rt6_dst_from_metrics_check(struct rt6_info *rt)
2035{
3a2232e9
DM
2036 if (rt->from &&
2037 dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(&rt->from->dst))
2038 dst_init_metrics(&rt->dst, dst_metrics_ptr(&rt->from->dst), true);
4b32b5ad
MKL
2039}
2040
3da59bd9
MKL
2041static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
2042{
36143645 2043 u32 rt_cookie = 0;
c5cff856
WW
2044
2045 if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
3da59bd9
MKL
2046 return NULL;
2047
2048 if (rt6_check_expired(rt))
2049 return NULL;
2050
2051 return &rt->dst;
2052}
2053
2054static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
2055{
5973fb1e
MKL
2056 if (!__rt6_check_expired(rt) &&
2057 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
3a2232e9 2058 rt6_check(rt->from, cookie))
3da59bd9
MKL
2059 return &rt->dst;
2060 else
2061 return NULL;
2062}
2063
1da177e4
LT
2064static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2065{
2066 struct rt6_info *rt;
2067
2068 rt = (struct rt6_info *) dst;
2069
6f3118b5
ND
2070 /* All IPV6 dsts are created with ->obsolete set to the value
2071 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2072 * into this function always.
2073 */
e3bc10bd 2074
4b32b5ad
MKL
2075 rt6_dst_from_metrics_check(rt);
2076
02bcf4e0 2077 if (rt->rt6i_flags & RTF_PCPU ||
3a2232e9 2078 (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->from))
3da59bd9
MKL
2079 return rt6_dst_from_check(rt, cookie);
2080 else
2081 return rt6_check(rt, cookie);
1da177e4
LT
2082}
2083
2084static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2085{
2086 struct rt6_info *rt = (struct rt6_info *) dst;
2087
2088 if (rt) {
54c1a859
YH
2089 if (rt->rt6i_flags & RTF_CACHE) {
2090 if (rt6_check_expired(rt)) {
2091 ip6_del_rt(rt);
2092 dst = NULL;
2093 }
2094 } else {
1da177e4 2095 dst_release(dst);
54c1a859
YH
2096 dst = NULL;
2097 }
1da177e4 2098 }
54c1a859 2099 return dst;
1da177e4
LT
2100}
2101
2102static void ip6_link_failure(struct sk_buff *skb)
2103{
2104 struct rt6_info *rt;
2105
3ffe533c 2106 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1da177e4 2107
adf30907 2108 rt = (struct rt6_info *) skb_dst(skb);
1da177e4 2109 if (rt) {
1eb4f758 2110 if (rt->rt6i_flags & RTF_CACHE) {
ad65a2f0
WW
2111 if (dst_hold_safe(&rt->dst))
2112 ip6_del_rt(rt);
c5cff856
WW
2113 } else {
2114 struct fib6_node *fn;
2115
2116 rcu_read_lock();
2117 fn = rcu_dereference(rt->rt6i_node);
2118 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2119 fn->fn_sernum = -1;
2120 rcu_read_unlock();
1eb4f758 2121 }
1da177e4
LT
2122 }
2123}
2124
45e4fd26
MKL
2125static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2126{
2127 struct net *net = dev_net(rt->dst.dev);
2128
2129 rt->rt6i_flags |= RTF_MODIFIED;
2130 rt->rt6i_pmtu = mtu;
2131 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2132}
2133
0d3f6d29
MKL
2134static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2135{
2136 return !(rt->rt6i_flags & RTF_CACHE) &&
4e587ea7
WW
2137 (rt->rt6i_flags & RTF_PCPU ||
2138 rcu_access_pointer(rt->rt6i_node));
0d3f6d29
MKL
2139}
2140
45e4fd26
MKL
2141static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2142 const struct ipv6hdr *iph, u32 mtu)
1da177e4 2143{
0dec879f 2144 const struct in6_addr *daddr, *saddr;
67ba4152 2145 struct rt6_info *rt6 = (struct rt6_info *)dst;
1da177e4 2146
45e4fd26
MKL
2147 if (rt6->rt6i_flags & RTF_LOCAL)
2148 return;
81aded24 2149
19bda36c
XL
2150 if (dst_metric_locked(dst, RTAX_MTU))
2151 return;
2152
0dec879f
JA
2153 if (iph) {
2154 daddr = &iph->daddr;
2155 saddr = &iph->saddr;
2156 } else if (sk) {
2157 daddr = &sk->sk_v6_daddr;
2158 saddr = &inet6_sk(sk)->saddr;
2159 } else {
2160 daddr = NULL;
2161 saddr = NULL;
2162 }
2163 dst_confirm_neigh(dst, daddr);
45e4fd26
MKL
2164 mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2165 if (mtu >= dst_mtu(dst))
2166 return;
9d289715 2167
0d3f6d29 2168 if (!rt6_cache_allowed_for_pmtu(rt6)) {
45e4fd26 2169 rt6_do_update_pmtu(rt6, mtu);
2b760fcf
WW
2170 /* update rt6_ex->stamp for cache */
2171 if (rt6->rt6i_flags & RTF_CACHE)
2172 rt6_update_exception_stamp_rt(rt6);
0dec879f 2173 } else if (daddr) {
45e4fd26
MKL
2174 struct rt6_info *nrt6;
2175
45e4fd26
MKL
2176 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
2177 if (nrt6) {
2178 rt6_do_update_pmtu(nrt6, mtu);
2b760fcf
WW
2179 if (rt6_insert_exception(nrt6, rt6))
2180 dst_release_immediate(&nrt6->dst);
45e4fd26 2181 }
1da177e4
LT
2182 }
2183}
2184
45e4fd26
MKL
2185static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2186 struct sk_buff *skb, u32 mtu)
2187{
2188 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
2189}
2190
42ae66c8 2191void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
e2d118a1 2192 int oif, u32 mark, kuid_t uid)
81aded24
DM
2193{
2194 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2195 struct dst_entry *dst;
2196 struct flowi6 fl6;
2197
2198 memset(&fl6, 0, sizeof(fl6));
2199 fl6.flowi6_oif = oif;
1b3c61dc 2200 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
81aded24
DM
2201 fl6.daddr = iph->daddr;
2202 fl6.saddr = iph->saddr;
6502ca52 2203 fl6.flowlabel = ip6_flowinfo(iph);
e2d118a1 2204 fl6.flowi6_uid = uid;
81aded24
DM
2205
2206 dst = ip6_route_output(net, NULL, &fl6);
2207 if (!dst->error)
45e4fd26 2208 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
81aded24
DM
2209 dst_release(dst);
2210}
2211EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2212
2213void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2214{
33c162a9
MKL
2215 struct dst_entry *dst;
2216
81aded24 2217 ip6_update_pmtu(skb, sock_net(sk), mtu,
e2d118a1 2218 sk->sk_bound_dev_if, sk->sk_mark, sk->sk_uid);
33c162a9
MKL
2219
2220 dst = __sk_dst_get(sk);
2221 if (!dst || !dst->obsolete ||
2222 dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2223 return;
2224
2225 bh_lock_sock(sk);
2226 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2227 ip6_datagram_dst_update(sk, false);
2228 bh_unlock_sock(sk);
81aded24
DM
2229}
2230EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2231
7d6850f7
AK
2232void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2233 const struct flowi6 *fl6)
2234{
2235#ifdef CONFIG_IPV6_SUBTREES
2236 struct ipv6_pinfo *np = inet6_sk(sk);
2237#endif
2238
2239 ip6_dst_store(sk, dst,
2240 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2241 &sk->sk_v6_daddr : NULL,
2242#ifdef CONFIG_IPV6_SUBTREES
2243 ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2244 &np->saddr :
2245#endif
2246 NULL);
2247}
2248
b55b76b2
DJ
2249/* Handle redirects */
2250struct ip6rd_flowi {
2251 struct flowi6 fl6;
2252 struct in6_addr gateway;
2253};
2254
2255static struct rt6_info *__ip6_route_redirect(struct net *net,
2256 struct fib6_table *table,
2257 struct flowi6 *fl6,
b75cc8f9 2258 const struct sk_buff *skb,
b55b76b2
DJ
2259 int flags)
2260{
2261 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2b760fcf 2262 struct rt6_info *rt, *rt_cache;
b55b76b2
DJ
2263 struct fib6_node *fn;
2264
2265 /* Get the "current" route for this destination and
67c408cf 2266 * check if the redirect has come from appropriate router.
b55b76b2
DJ
2267 *
2268 * RFC 4861 specifies that redirects should only be
2269 * accepted if they come from the nexthop to the target.
2270 * Due to the way the routes are chosen, this notion
2271 * is a bit fuzzy and one might need to check all possible
2272 * routes.
2273 */
2274
66f5d6ce 2275 rcu_read_lock();
b55b76b2
DJ
2276 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2277restart:
66f5d6ce 2278 for_each_fib6_node_rt_rcu(fn) {
8067bb8c
IS
2279 if (rt->rt6i_nh_flags & RTNH_F_DEAD)
2280 continue;
b55b76b2
DJ
2281 if (rt6_check_expired(rt))
2282 continue;
2283 if (rt->dst.error)
2284 break;
2285 if (!(rt->rt6i_flags & RTF_GATEWAY))
2286 continue;
2287 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
2288 continue;
2b760fcf
WW
2289 /* rt_cache's gateway might be different from its 'parent'
2290 * in the case of an ip redirect.
2291 * So we keep searching in the exception table if the gateway
2292 * is different.
2293 */
2294 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway)) {
2295 rt_cache = rt6_find_cached_rt(rt,
2296 &fl6->daddr,
2297 &fl6->saddr);
2298 if (rt_cache &&
2299 ipv6_addr_equal(&rdfl->gateway,
2300 &rt_cache->rt6i_gateway)) {
2301 rt = rt_cache;
2302 break;
2303 }
b55b76b2 2304 continue;
2b760fcf 2305 }
b55b76b2
DJ
2306 break;
2307 }
2308
2309 if (!rt)
2310 rt = net->ipv6.ip6_null_entry;
2311 else if (rt->dst.error) {
2312 rt = net->ipv6.ip6_null_entry;
b0a1ba59
MKL
2313 goto out;
2314 }
2315
2316 if (rt == net->ipv6.ip6_null_entry) {
a3c00e46
MKL
2317 fn = fib6_backtrack(fn, &fl6->saddr);
2318 if (fn)
2319 goto restart;
b55b76b2 2320 }
a3c00e46 2321
b0a1ba59 2322out:
d3843fe5 2323 ip6_hold_safe(net, &rt, true);
b55b76b2 2324
66f5d6ce 2325 rcu_read_unlock();
b55b76b2 2326
b65f164d 2327 trace_fib6_table_lookup(net, rt, table, fl6);
b55b76b2
DJ
2328 return rt;
2329};
2330
2331static struct dst_entry *ip6_route_redirect(struct net *net,
b75cc8f9
DA
2332 const struct flowi6 *fl6,
2333 const struct sk_buff *skb,
2334 const struct in6_addr *gateway)
b55b76b2
DJ
2335{
2336 int flags = RT6_LOOKUP_F_HAS_SADDR;
2337 struct ip6rd_flowi rdfl;
2338
2339 rdfl.fl6 = *fl6;
2340 rdfl.gateway = *gateway;
2341
b75cc8f9 2342 return fib6_rule_lookup(net, &rdfl.fl6, skb,
b55b76b2
DJ
2343 flags, __ip6_route_redirect);
2344}
2345
e2d118a1
LC
2346void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2347 kuid_t uid)
3a5ad2ee
DM
2348{
2349 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2350 struct dst_entry *dst;
2351 struct flowi6 fl6;
2352
2353 memset(&fl6, 0, sizeof(fl6));
e374c618 2354 fl6.flowi6_iif = LOOPBACK_IFINDEX;
3a5ad2ee
DM
2355 fl6.flowi6_oif = oif;
2356 fl6.flowi6_mark = mark;
3a5ad2ee
DM
2357 fl6.daddr = iph->daddr;
2358 fl6.saddr = iph->saddr;
6502ca52 2359 fl6.flowlabel = ip6_flowinfo(iph);
e2d118a1 2360 fl6.flowi6_uid = uid;
3a5ad2ee 2361
b75cc8f9 2362 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
b55b76b2 2363 rt6_do_redirect(dst, NULL, skb);
3a5ad2ee
DM
2364 dst_release(dst);
2365}
2366EXPORT_SYMBOL_GPL(ip6_redirect);
2367
c92a59ec
DJ
2368void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2369 u32 mark)
2370{
2371 const struct ipv6hdr *iph = ipv6_hdr(skb);
2372 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2373 struct dst_entry *dst;
2374 struct flowi6 fl6;
2375
2376 memset(&fl6, 0, sizeof(fl6));
e374c618 2377 fl6.flowi6_iif = LOOPBACK_IFINDEX;
c92a59ec
DJ
2378 fl6.flowi6_oif = oif;
2379 fl6.flowi6_mark = mark;
c92a59ec
DJ
2380 fl6.daddr = msg->dest;
2381 fl6.saddr = iph->daddr;
e2d118a1 2382 fl6.flowi6_uid = sock_net_uid(net, NULL);
c92a59ec 2383
b75cc8f9 2384 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
b55b76b2 2385 rt6_do_redirect(dst, NULL, skb);
c92a59ec
DJ
2386 dst_release(dst);
2387}
2388
3a5ad2ee
DM
2389void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2390{
e2d118a1
LC
2391 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2392 sk->sk_uid);
3a5ad2ee
DM
2393}
2394EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2395
0dbaee3b 2396static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1da177e4 2397{
0dbaee3b
DM
2398 struct net_device *dev = dst->dev;
2399 unsigned int mtu = dst_mtu(dst);
2400 struct net *net = dev_net(dev);
2401
1da177e4
LT
2402 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2403
5578689a
DL
2404 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2405 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1da177e4
LT
2406
2407 /*
1ab1457c
YH
2408 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2409 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2410 * IPV6_MAXPLEN is also valid and means: "any MSS,
1da177e4
LT
2411 * rely only on pmtu discovery"
2412 */
2413 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2414 mtu = IPV6_MAXPLEN;
2415 return mtu;
2416}
2417
ebb762f2 2418static unsigned int ip6_mtu(const struct dst_entry *dst)
d33e4553 2419{
4b32b5ad
MKL
2420 const struct rt6_info *rt = (const struct rt6_info *)dst;
2421 unsigned int mtu = rt->rt6i_pmtu;
d33e4553 2422 struct inet6_dev *idev;
618f9bc7 2423
4b32b5ad
MKL
2424 if (mtu)
2425 goto out;
2426
2427 mtu = dst_metric_raw(dst, RTAX_MTU);
618f9bc7 2428 if (mtu)
30f78d8e 2429 goto out;
618f9bc7
SK
2430
2431 mtu = IPV6_MIN_MTU;
d33e4553
DM
2432
2433 rcu_read_lock();
2434 idev = __in6_dev_get(dst->dev);
2435 if (idev)
2436 mtu = idev->cnf.mtu6;
2437 rcu_read_unlock();
2438
30f78d8e 2439out:
14972cbd
RP
2440 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2441
2442 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
d33e4553
DM
2443}
2444
3b00944c 2445struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
87a11578 2446 struct flowi6 *fl6)
1da177e4 2447{
87a11578 2448 struct dst_entry *dst;
1da177e4
LT
2449 struct rt6_info *rt;
2450 struct inet6_dev *idev = in6_dev_get(dev);
c346dca1 2451 struct net *net = dev_net(dev);
1da177e4 2452
38308473 2453 if (unlikely(!idev))
122bdf67 2454 return ERR_PTR(-ENODEV);
1da177e4 2455
ad706862 2456 rt = ip6_dst_alloc(net, dev, 0);
38308473 2457 if (unlikely(!rt)) {
1da177e4 2458 in6_dev_put(idev);
87a11578 2459 dst = ERR_PTR(-ENOMEM);
1da177e4
LT
2460 goto out;
2461 }
2462
8e2ec639 2463 rt->dst.flags |= DST_HOST;
588753f1 2464 rt->dst.input = ip6_input;
8e2ec639 2465 rt->dst.output = ip6_output;
550bab42 2466 rt->rt6i_gateway = fl6->daddr;
87a11578 2467 rt->rt6i_dst.addr = fl6->daddr;
8e2ec639
YZ
2468 rt->rt6i_dst.plen = 128;
2469 rt->rt6i_idev = idev;
14edd87d 2470 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1da177e4 2471
4c981e28 2472 /* Add this dst into uncached_list so that rt6_disable_ip() can
587fea74
WW
2473 * do proper release of the net_device
2474 */
2475 rt6_uncached_list_add(rt);
81eb8447 2476 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1da177e4 2477
87a11578
DM
2478 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2479
1da177e4 2480out:
87a11578 2481 return dst;
1da177e4
LT
2482}
2483
569d3645 2484static int ip6_dst_gc(struct dst_ops *ops)
1da177e4 2485{
86393e52 2486 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
7019b78e
DL
2487 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2488 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2489 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2490 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2491 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
fc66f95c 2492 int entries;
7019b78e 2493
fc66f95c 2494 entries = dst_entries_get_fast(ops);
49a18d86 2495 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
fc66f95c 2496 entries <= rt_max_size)
1da177e4
LT
2497 goto out;
2498
6891a346 2499 net->ipv6.ip6_rt_gc_expire++;
14956643 2500 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
fc66f95c
ED
2501 entries = dst_entries_get_slow(ops);
2502 if (entries < ops->gc_thresh)
7019b78e 2503 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1da177e4 2504out:
7019b78e 2505 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
fc66f95c 2506 return entries > rt_max_size;
1da177e4
LT
2507}
2508
e715b6d3
FW
2509static int ip6_convert_metrics(struct mx6_config *mxc,
2510 const struct fib6_config *cfg)
2511{
6670e152 2512 struct net *net = cfg->fc_nlinfo.nl_net;
c3a8d947 2513 bool ecn_ca = false;
e715b6d3
FW
2514 struct nlattr *nla;
2515 int remaining;
2516 u32 *mp;
2517
63159f29 2518 if (!cfg->fc_mx)
e715b6d3
FW
2519 return 0;
2520
2521 mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
2522 if (unlikely(!mp))
2523 return -ENOMEM;
2524
2525 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
2526 int type = nla_type(nla);
1bb14807 2527 u32 val;
e715b6d3 2528
1bb14807
DB
2529 if (!type)
2530 continue;
2531 if (unlikely(type > RTAX_MAX))
2532 goto err;
ea697639 2533
1bb14807
DB
2534 if (type == RTAX_CC_ALGO) {
2535 char tmp[TCP_CA_NAME_MAX];
e715b6d3 2536
1bb14807 2537 nla_strlcpy(tmp, nla, sizeof(tmp));
6670e152 2538 val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
1bb14807
DB
2539 if (val == TCP_CA_UNSPEC)
2540 goto err;
2541 } else {
2542 val = nla_get_u32(nla);
e715b6d3 2543 }
626abd59
PA
2544 if (type == RTAX_HOPLIMIT && val > 255)
2545 val = 255;
b8d3e416
DB
2546 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
2547 goto err;
1bb14807
DB
2548
2549 mp[type - 1] = val;
2550 __set_bit(type - 1, mxc->mx_valid);
e715b6d3
FW
2551 }
2552
c3a8d947
DB
2553 if (ecn_ca) {
2554 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
2555 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
2556 }
e715b6d3 2557
c3a8d947 2558 mxc->mx = mp;
e715b6d3
FW
2559 return 0;
2560 err:
2561 kfree(mp);
2562 return -EINVAL;
2563}
1da177e4 2564
8c14586f
DA
2565static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2566 struct fib6_config *cfg,
f4797b33
DA
2567 const struct in6_addr *gw_addr,
2568 u32 tbid, int flags)
8c14586f
DA
2569{
2570 struct flowi6 fl6 = {
2571 .flowi6_oif = cfg->fc_ifindex,
2572 .daddr = *gw_addr,
2573 .saddr = cfg->fc_prefsrc,
2574 };
2575 struct fib6_table *table;
2576 struct rt6_info *rt;
8c14586f 2577
f4797b33 2578 table = fib6_get_table(net, tbid);
8c14586f
DA
2579 if (!table)
2580 return NULL;
2581
2582 if (!ipv6_addr_any(&cfg->fc_prefsrc))
2583 flags |= RT6_LOOKUP_F_HAS_SADDR;
2584
f4797b33 2585 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
b75cc8f9 2586 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
8c14586f
DA
2587
2588 /* if table lookup failed, fall back to full lookup */
2589 if (rt == net->ipv6.ip6_null_entry) {
2590 ip6_rt_put(rt);
2591 rt = NULL;
2592 }
2593
2594 return rt;
2595}
2596
fc1e64e1
DA
2597static int ip6_route_check_nh_onlink(struct net *net,
2598 struct fib6_config *cfg,
9fbb704c 2599 const struct net_device *dev,
fc1e64e1
DA
2600 struct netlink_ext_ack *extack)
2601{
44750f84 2602 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
fc1e64e1
DA
2603 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2604 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2605 struct rt6_info *grt;
2606 int err;
2607
2608 err = 0;
2609 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2610 if (grt) {
58e354c0
DA
2611 if (!grt->dst.error &&
2612 (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
44750f84
DA
2613 NL_SET_ERR_MSG(extack,
2614 "Nexthop has invalid gateway or device mismatch");
fc1e64e1
DA
2615 err = -EINVAL;
2616 }
2617
2618 ip6_rt_put(grt);
2619 }
2620
2621 return err;
2622}
2623
1edce99f
DA
2624static int ip6_route_check_nh(struct net *net,
2625 struct fib6_config *cfg,
2626 struct net_device **_dev,
2627 struct inet6_dev **idev)
2628{
2629 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2630 struct net_device *dev = _dev ? *_dev : NULL;
2631 struct rt6_info *grt = NULL;
2632 int err = -EHOSTUNREACH;
2633
2634 if (cfg->fc_table) {
f4797b33
DA
2635 int flags = RT6_LOOKUP_F_IFACE;
2636
2637 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2638 cfg->fc_table, flags);
1edce99f
DA
2639 if (grt) {
2640 if (grt->rt6i_flags & RTF_GATEWAY ||
2641 (dev && dev != grt->dst.dev)) {
2642 ip6_rt_put(grt);
2643 grt = NULL;
2644 }
2645 }
2646 }
2647
2648 if (!grt)
b75cc8f9 2649 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
1edce99f
DA
2650
2651 if (!grt)
2652 goto out;
2653
2654 if (dev) {
2655 if (dev != grt->dst.dev) {
2656 ip6_rt_put(grt);
2657 goto out;
2658 }
2659 } else {
2660 *_dev = dev = grt->dst.dev;
2661 *idev = grt->rt6i_idev;
2662 dev_hold(dev);
2663 in6_dev_hold(grt->rt6i_idev);
2664 }
2665
2666 if (!(grt->rt6i_flags & RTF_GATEWAY))
2667 err = 0;
2668
2669 ip6_rt_put(grt);
2670
2671out:
2672 return err;
2673}
2674
9fbb704c
DA
2675static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2676 struct net_device **_dev, struct inet6_dev **idev,
2677 struct netlink_ext_ack *extack)
2678{
2679 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2680 int gwa_type = ipv6_addr_type(gw_addr);
232378e8 2681 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
9fbb704c 2682 const struct net_device *dev = *_dev;
232378e8 2683 bool need_addr_check = !dev;
9fbb704c
DA
2684 int err = -EINVAL;
2685
2686 /* if gw_addr is local we will fail to detect this in case
2687 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2688 * will return already-added prefix route via interface that
2689 * prefix route was assigned to, which might be non-loopback.
2690 */
232378e8
DA
2691 if (dev &&
2692 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2693 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
9fbb704c
DA
2694 goto out;
2695 }
2696
2697 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2698 /* IPv6 strictly inhibits using not link-local
2699 * addresses as nexthop address.
2700 * Otherwise, router will not able to send redirects.
2701 * It is very good, but in some (rare!) circumstances
2702 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2703 * some exceptions. --ANK
2704 * We allow IPv4-mapped nexthops to support RFC4798-type
2705 * addressing
2706 */
2707 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2708 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2709 goto out;
2710 }
2711
2712 if (cfg->fc_flags & RTNH_F_ONLINK)
2713 err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2714 else
2715 err = ip6_route_check_nh(net, cfg, _dev, idev);
2716
2717 if (err)
2718 goto out;
2719 }
2720
2721 /* reload in case device was changed */
2722 dev = *_dev;
2723
2724 err = -EINVAL;
2725 if (!dev) {
2726 NL_SET_ERR_MSG(extack, "Egress device not specified");
2727 goto out;
2728 } else if (dev->flags & IFF_LOOPBACK) {
2729 NL_SET_ERR_MSG(extack,
2730 "Egress device can not be loopback device for this route");
2731 goto out;
2732 }
232378e8
DA
2733
2734 /* if we did not check gw_addr above, do so now that the
2735 * egress device has been resolved.
2736 */
2737 if (need_addr_check &&
2738 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2739 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2740 goto out;
2741 }
2742
9fbb704c
DA
2743 err = 0;
2744out:
2745 return err;
2746}
2747
333c4301
DA
2748static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
2749 struct netlink_ext_ack *extack)
1da177e4 2750{
5578689a 2751 struct net *net = cfg->fc_nlinfo.nl_net;
1da177e4
LT
2752 struct rt6_info *rt = NULL;
2753 struct net_device *dev = NULL;
2754 struct inet6_dev *idev = NULL;
c71099ac 2755 struct fib6_table *table;
1da177e4 2756 int addr_type;
8c5b83f0 2757 int err = -EINVAL;
1da177e4 2758
557c44be 2759 /* RTF_PCPU is an internal flag; can not be set by userspace */
d5d531cb
DA
2760 if (cfg->fc_flags & RTF_PCPU) {
2761 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
557c44be 2762 goto out;
d5d531cb 2763 }
557c44be 2764
2ea2352e
WW
2765 /* RTF_CACHE is an internal flag; can not be set by userspace */
2766 if (cfg->fc_flags & RTF_CACHE) {
2767 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
2768 goto out;
2769 }
2770
d5d531cb
DA
2771 if (cfg->fc_dst_len > 128) {
2772 NL_SET_ERR_MSG(extack, "Invalid prefix length");
2773 goto out;
2774 }
2775 if (cfg->fc_src_len > 128) {
2776 NL_SET_ERR_MSG(extack, "Invalid source address length");
8c5b83f0 2777 goto out;
d5d531cb 2778 }
1da177e4 2779#ifndef CONFIG_IPV6_SUBTREES
d5d531cb
DA
2780 if (cfg->fc_src_len) {
2781 NL_SET_ERR_MSG(extack,
2782 "Specifying source address requires IPV6_SUBTREES to be enabled");
8c5b83f0 2783 goto out;
d5d531cb 2784 }
1da177e4 2785#endif
86872cb5 2786 if (cfg->fc_ifindex) {
1da177e4 2787 err = -ENODEV;
5578689a 2788 dev = dev_get_by_index(net, cfg->fc_ifindex);
1da177e4
LT
2789 if (!dev)
2790 goto out;
2791 idev = in6_dev_get(dev);
2792 if (!idev)
2793 goto out;
2794 }
2795
86872cb5
TG
2796 if (cfg->fc_metric == 0)
2797 cfg->fc_metric = IP6_RT_PRIO_USER;
1da177e4 2798
fc1e64e1
DA
2799 if (cfg->fc_flags & RTNH_F_ONLINK) {
2800 if (!dev) {
2801 NL_SET_ERR_MSG(extack,
2802 "Nexthop device required for onlink");
2803 err = -ENODEV;
2804 goto out;
2805 }
2806
2807 if (!(dev->flags & IFF_UP)) {
2808 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2809 err = -ENETDOWN;
2810 goto out;
2811 }
2812 }
2813
d71314b4 2814 err = -ENOBUFS;
38308473
DM
2815 if (cfg->fc_nlinfo.nlh &&
2816 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
d71314b4 2817 table = fib6_get_table(net, cfg->fc_table);
38308473 2818 if (!table) {
f3213831 2819 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
d71314b4
MV
2820 table = fib6_new_table(net, cfg->fc_table);
2821 }
2822 } else {
2823 table = fib6_new_table(net, cfg->fc_table);
2824 }
38308473
DM
2825
2826 if (!table)
c71099ac 2827 goto out;
c71099ac 2828
ad706862
MKL
2829 rt = ip6_dst_alloc(net, NULL,
2830 (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
1da177e4 2831
38308473 2832 if (!rt) {
1da177e4
LT
2833 err = -ENOMEM;
2834 goto out;
2835 }
2836
1716a961
G
2837 if (cfg->fc_flags & RTF_EXPIRES)
2838 rt6_set_expires(rt, jiffies +
2839 clock_t_to_jiffies(cfg->fc_expires));
2840 else
2841 rt6_clean_expires(rt);
1da177e4 2842
86872cb5
TG
2843 if (cfg->fc_protocol == RTPROT_UNSPEC)
2844 cfg->fc_protocol = RTPROT_BOOT;
2845 rt->rt6i_protocol = cfg->fc_protocol;
2846
2847 addr_type = ipv6_addr_type(&cfg->fc_dst);
1da177e4
LT
2848
2849 if (addr_type & IPV6_ADDR_MULTICAST)
d8d1f30b 2850 rt->dst.input = ip6_mc_input;
ab79ad14
2851 else if (cfg->fc_flags & RTF_LOCAL)
2852 rt->dst.input = ip6_input;
1da177e4 2853 else
d8d1f30b 2854 rt->dst.input = ip6_forward;
1da177e4 2855
d8d1f30b 2856 rt->dst.output = ip6_output;
1da177e4 2857
19e42e45
RP
2858 if (cfg->fc_encap) {
2859 struct lwtunnel_state *lwtstate;
2860
30357d7d 2861 err = lwtunnel_build_state(cfg->fc_encap_type,
127eb7cd 2862 cfg->fc_encap, AF_INET6, cfg,
9ae28727 2863 &lwtstate, extack);
19e42e45
RP
2864 if (err)
2865 goto out;
61adedf3 2866 rt->dst.lwtstate = lwtstate_get(lwtstate);
9942895b 2867 lwtunnel_set_redirect(&rt->dst);
19e42e45
RP
2868 }
2869
86872cb5
TG
2870 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2871 rt->rt6i_dst.plen = cfg->fc_dst_len;
afc4eef8 2872 if (rt->rt6i_dst.plen == 128)
e5fd387a 2873 rt->dst.flags |= DST_HOST;
e5fd387a 2874
1da177e4 2875#ifdef CONFIG_IPV6_SUBTREES
86872cb5
TG
2876 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2877 rt->rt6i_src.plen = cfg->fc_src_len;
1da177e4
LT
2878#endif
2879
86872cb5 2880 rt->rt6i_metric = cfg->fc_metric;
398958ae 2881 rt->rt6i_nh_weight = 1;
1da177e4
LT
2882
2883 /* We cannot add true routes via loopback here,
2884 they would result in kernel looping; promote them to reject routes
2885 */
86872cb5 2886 if ((cfg->fc_flags & RTF_REJECT) ||
38308473
DM
2887 (dev && (dev->flags & IFF_LOOPBACK) &&
2888 !(addr_type & IPV6_ADDR_LOOPBACK) &&
2889 !(cfg->fc_flags & RTF_LOCAL))) {
1da177e4 2890 /* hold loopback dev/idev if we haven't done so. */
5578689a 2891 if (dev != net->loopback_dev) {
1da177e4
LT
2892 if (dev) {
2893 dev_put(dev);
2894 in6_dev_put(idev);
2895 }
5578689a 2896 dev = net->loopback_dev;
1da177e4
LT
2897 dev_hold(dev);
2898 idev = in6_dev_get(dev);
2899 if (!idev) {
2900 err = -ENODEV;
2901 goto out;
2902 }
2903 }
1da177e4 2904 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
ef2c7d7b
ND
2905 switch (cfg->fc_type) {
2906 case RTN_BLACKHOLE:
2907 rt->dst.error = -EINVAL;
ede2059d 2908 rt->dst.output = dst_discard_out;
7150aede 2909 rt->dst.input = dst_discard;
ef2c7d7b
ND
2910 break;
2911 case RTN_PROHIBIT:
2912 rt->dst.error = -EACCES;
7150aede
K
2913 rt->dst.output = ip6_pkt_prohibit_out;
2914 rt->dst.input = ip6_pkt_prohibit;
ef2c7d7b 2915 break;
b4949ab2 2916 case RTN_THROW:
0315e382 2917 case RTN_UNREACHABLE:
ef2c7d7b 2918 default:
7150aede 2919 rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
0315e382
NF
2920 : (cfg->fc_type == RTN_UNREACHABLE)
2921 ? -EHOSTUNREACH : -ENETUNREACH;
7150aede
K
2922 rt->dst.output = ip6_pkt_discard_out;
2923 rt->dst.input = ip6_pkt_discard;
ef2c7d7b
ND
2924 break;
2925 }
1da177e4
LT
2926 goto install_route;
2927 }
2928
86872cb5 2929 if (cfg->fc_flags & RTF_GATEWAY) {
9fbb704c
DA
2930 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
2931 if (err)
48ed7b26 2932 goto out;
1da177e4 2933
9fbb704c 2934 rt->rt6i_gateway = cfg->fc_gateway;
1da177e4
LT
2935 }
2936
2937 err = -ENODEV;
38308473 2938 if (!dev)
1da177e4
LT
2939 goto out;
2940
428604fb
LB
2941 if (idev->cnf.disable_ipv6) {
2942 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
2943 err = -EACCES;
2944 goto out;
2945 }
2946
955ec4cb
DA
2947 if (!(dev->flags & IFF_UP)) {
2948 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
2949 err = -ENETDOWN;
2950 goto out;
2951 }
2952
c3968a85
DW
2953 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2954 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
d5d531cb 2955 NL_SET_ERR_MSG(extack, "Invalid source address");
c3968a85
DW
2956 err = -EINVAL;
2957 goto out;
2958 }
4e3fd7a0 2959 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
c3968a85
DW
2960 rt->rt6i_prefsrc.plen = 128;
2961 } else
2962 rt->rt6i_prefsrc.plen = 0;
2963
86872cb5 2964 rt->rt6i_flags = cfg->fc_flags;
1da177e4
LT
2965
2966install_route:
5609b80a
IS
2967 if (!(rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
2968 !netif_carrier_ok(dev))
2969 rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
fc1e64e1 2970 rt->rt6i_nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
d8d1f30b 2971 rt->dst.dev = dev;
1da177e4 2972 rt->rt6i_idev = idev;
c71099ac 2973 rt->rt6i_table = table;
63152fc0 2974
c346dca1 2975 cfg->fc_nlinfo.nl_net = dev_net(dev);
63152fc0 2976
8c5b83f0 2977 return rt;
6b9ea5a6
RP
2978out:
2979 if (dev)
2980 dev_put(dev);
2981 if (idev)
2982 in6_dev_put(idev);
587fea74
WW
2983 if (rt)
2984 dst_release_immediate(&rt->dst);
6b9ea5a6 2985
8c5b83f0 2986 return ERR_PTR(err);
6b9ea5a6
RP
2987}
2988
333c4301
DA
2989int ip6_route_add(struct fib6_config *cfg,
2990 struct netlink_ext_ack *extack)
6b9ea5a6
RP
2991{
2992 struct mx6_config mxc = { .mx = NULL, };
8c5b83f0 2993 struct rt6_info *rt;
6b9ea5a6
RP
2994 int err;
2995
333c4301 2996 rt = ip6_route_info_create(cfg, extack);
8c5b83f0
RP
2997 if (IS_ERR(rt)) {
2998 err = PTR_ERR(rt);
2999 rt = NULL;
6b9ea5a6 3000 goto out;
8c5b83f0 3001 }
6b9ea5a6 3002
e715b6d3
FW
3003 err = ip6_convert_metrics(&mxc, cfg);
3004 if (err)
3005 goto out;
1da177e4 3006
333c4301 3007 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
e715b6d3
FW
3008
3009 kfree(mxc.mx);
6b9ea5a6 3010
e715b6d3 3011 return err;
1da177e4 3012out:
587fea74
WW
3013 if (rt)
3014 dst_release_immediate(&rt->dst);
6b9ea5a6 3015
1da177e4
LT
3016 return err;
3017}
3018
86872cb5 3019static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
1da177e4
LT
3020{
3021 int err;
c71099ac 3022 struct fib6_table *table;
d1918542 3023 struct net *net = dev_net(rt->dst.dev);
1da177e4 3024
a4c2fd7f 3025 if (rt == net->ipv6.ip6_null_entry) {
6825a26c
G
3026 err = -ENOENT;
3027 goto out;
3028 }
6c813a72 3029
c71099ac 3030 table = rt->rt6i_table;
66f5d6ce 3031 spin_lock_bh(&table->tb6_lock);
86872cb5 3032 err = fib6_del(rt, info);
66f5d6ce 3033 spin_unlock_bh(&table->tb6_lock);
1da177e4 3034
6825a26c 3035out:
94e187c0 3036 ip6_rt_put(rt);
1da177e4
LT
3037 return err;
3038}
3039
e0a1ad73
TG
3040int ip6_del_rt(struct rt6_info *rt)
3041{
4d1169c1 3042 struct nl_info info = {
d1918542 3043 .nl_net = dev_net(rt->dst.dev),
4d1169c1 3044 };
528c4ceb 3045 return __ip6_del_rt(rt, &info);
e0a1ad73
TG
3046}
3047
0ae81335
DA
3048static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
3049{
3050 struct nl_info *info = &cfg->fc_nlinfo;
e3330039 3051 struct net *net = info->nl_net;
16a16cd3 3052 struct sk_buff *skb = NULL;
0ae81335 3053 struct fib6_table *table;
e3330039 3054 int err = -ENOENT;
0ae81335 3055
e3330039
WC
3056 if (rt == net->ipv6.ip6_null_entry)
3057 goto out_put;
0ae81335 3058 table = rt->rt6i_table;
66f5d6ce 3059 spin_lock_bh(&table->tb6_lock);
0ae81335
DA
3060
3061 if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
3062 struct rt6_info *sibling, *next_sibling;
3063
16a16cd3
DA
3064 /* prefer to send a single notification with all hops */
3065 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3066 if (skb) {
3067 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3068
e3330039 3069 if (rt6_fill_node(net, skb, rt,
16a16cd3
DA
3070 NULL, NULL, 0, RTM_DELROUTE,
3071 info->portid, seq, 0) < 0) {
3072 kfree_skb(skb);
3073 skb = NULL;
3074 } else
3075 info->skip_notify = 1;
3076 }
3077
0ae81335
DA
3078 list_for_each_entry_safe(sibling, next_sibling,
3079 &rt->rt6i_siblings,
3080 rt6i_siblings) {
3081 err = fib6_del(sibling, info);
3082 if (err)
e3330039 3083 goto out_unlock;
0ae81335
DA
3084 }
3085 }
3086
3087 err = fib6_del(rt, info);
e3330039 3088out_unlock:
66f5d6ce 3089 spin_unlock_bh(&table->tb6_lock);
e3330039 3090out_put:
0ae81335 3091 ip6_rt_put(rt);
16a16cd3
DA
3092
3093 if (skb) {
e3330039 3094 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
16a16cd3
DA
3095 info->nlh, gfp_any());
3096 }
0ae81335
DA
3097 return err;
3098}
3099
333c4301
DA
3100static int ip6_route_del(struct fib6_config *cfg,
3101 struct netlink_ext_ack *extack)
1da177e4 3102{
2b760fcf 3103 struct rt6_info *rt, *rt_cache;
c71099ac 3104 struct fib6_table *table;
1da177e4 3105 struct fib6_node *fn;
1da177e4
LT
3106 int err = -ESRCH;
3107
5578689a 3108 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
d5d531cb
DA
3109 if (!table) {
3110 NL_SET_ERR_MSG(extack, "FIB table does not exist");
c71099ac 3111 return err;
d5d531cb 3112 }
c71099ac 3113
66f5d6ce 3114 rcu_read_lock();
1da177e4 3115
c71099ac 3116 fn = fib6_locate(&table->tb6_root,
86872cb5 3117 &cfg->fc_dst, cfg->fc_dst_len,
38fbeeee 3118 &cfg->fc_src, cfg->fc_src_len,
2b760fcf 3119 !(cfg->fc_flags & RTF_CACHE));
1ab1457c 3120
1da177e4 3121 if (fn) {
66f5d6ce 3122 for_each_fib6_node_rt_rcu(fn) {
2b760fcf
WW
3123 if (cfg->fc_flags & RTF_CACHE) {
3124 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3125 &cfg->fc_src);
3126 if (!rt_cache)
3127 continue;
3128 rt = rt_cache;
3129 }
86872cb5 3130 if (cfg->fc_ifindex &&
d1918542
DM
3131 (!rt->dst.dev ||
3132 rt->dst.dev->ifindex != cfg->fc_ifindex))
1da177e4 3133 continue;
86872cb5
TG
3134 if (cfg->fc_flags & RTF_GATEWAY &&
3135 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
1da177e4 3136 continue;
86872cb5 3137 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
1da177e4 3138 continue;
c2ed1880
M
3139 if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
3140 continue;
d3843fe5
WW
3141 if (!dst_hold_safe(&rt->dst))
3142 break;
66f5d6ce 3143 rcu_read_unlock();
1da177e4 3144
0ae81335
DA
3145 /* if gateway was specified only delete the one hop */
3146 if (cfg->fc_flags & RTF_GATEWAY)
3147 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3148
3149 return __ip6_del_rt_siblings(rt, cfg);
1da177e4
LT
3150 }
3151 }
66f5d6ce 3152 rcu_read_unlock();
1da177e4
LT
3153
3154 return err;
3155}
3156
6700c270 3157static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
a6279458 3158{
a6279458 3159 struct netevent_redirect netevent;
e8599ff4 3160 struct rt6_info *rt, *nrt = NULL;
e8599ff4
DM
3161 struct ndisc_options ndopts;
3162 struct inet6_dev *in6_dev;
3163 struct neighbour *neigh;
71bcdba0 3164 struct rd_msg *msg;
6e157b6a
DM
3165 int optlen, on_link;
3166 u8 *lladdr;
e8599ff4 3167
29a3cad5 3168 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
71bcdba0 3169 optlen -= sizeof(*msg);
e8599ff4
DM
3170
3171 if (optlen < 0) {
6e157b6a 3172 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
e8599ff4
DM
3173 return;
3174 }
3175
71bcdba0 3176 msg = (struct rd_msg *)icmp6_hdr(skb);
e8599ff4 3177
71bcdba0 3178 if (ipv6_addr_is_multicast(&msg->dest)) {
6e157b6a 3179 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
e8599ff4
DM
3180 return;
3181 }
3182
6e157b6a 3183 on_link = 0;
71bcdba0 3184 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
e8599ff4 3185 on_link = 1;
71bcdba0 3186 } else if (ipv6_addr_type(&msg->target) !=
e8599ff4 3187 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
6e157b6a 3188 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
e8599ff4
DM
3189 return;
3190 }
3191
3192 in6_dev = __in6_dev_get(skb->dev);
3193 if (!in6_dev)
3194 return;
3195 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3196 return;
3197
3198 /* RFC2461 8.1:
3199 * The IP source address of the Redirect MUST be the same as the current
3200 * first-hop router for the specified ICMP Destination Address.
3201 */
3202
f997c55c 3203 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
e8599ff4
DM
3204 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3205 return;
3206 }
6e157b6a
DM
3207
3208 lladdr = NULL;
e8599ff4
DM
3209 if (ndopts.nd_opts_tgt_lladdr) {
3210 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3211 skb->dev);
3212 if (!lladdr) {
3213 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3214 return;
3215 }
3216 }
3217
6e157b6a 3218 rt = (struct rt6_info *) dst;
ec13ad1d 3219 if (rt->rt6i_flags & RTF_REJECT) {
6e157b6a 3220 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
e8599ff4 3221 return;
6e157b6a 3222 }
e8599ff4 3223
6e157b6a
DM
3224 /* Redirect received -> path was valid.
3225 * Look, redirects are sent only in response to data packets,
3226 * so that this nexthop apparently is reachable. --ANK
3227 */
0dec879f 3228 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
a6279458 3229
71bcdba0 3230 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
6e157b6a
DM
3231 if (!neigh)
3232 return;
a6279458 3233
1da177e4
LT
3234 /*
3235 * We have finally decided to accept it.
3236 */
3237
f997c55c 3238 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
1da177e4
LT
3239 NEIGH_UPDATE_F_WEAK_OVERRIDE|
3240 NEIGH_UPDATE_F_OVERRIDE|
3241 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
f997c55c
AA
3242 NEIGH_UPDATE_F_ISROUTER)),
3243 NDISC_REDIRECT, &ndopts);
1da177e4 3244
83a09abd 3245 nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
38308473 3246 if (!nrt)
1da177e4
LT
3247 goto out;
3248
3249 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3250 if (on_link)
3251 nrt->rt6i_flags &= ~RTF_GATEWAY;
3252
b91d5329 3253 nrt->rt6i_protocol = RTPROT_REDIRECT;
4e3fd7a0 3254 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
1da177e4 3255
2b760fcf
WW
3256 /* No need to remove rt from the exception table if rt is
3257 * a cached route because rt6_insert_exception() will
3258 * takes care of it
3259 */
3260 if (rt6_insert_exception(nrt, rt)) {
3261 dst_release_immediate(&nrt->dst);
3262 goto out;
3263 }
1da177e4 3264
d8d1f30b
CG
3265 netevent.old = &rt->dst;
3266 netevent.new = &nrt->dst;
71bcdba0 3267 netevent.daddr = &msg->dest;
60592833 3268 netevent.neigh = neigh;
8d71740c
TT
3269 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3270
1da177e4 3271out:
e8599ff4 3272 neigh_release(neigh);
6e157b6a
DM
3273}
3274
1da177e4
LT
3275/*
3276 * Misc support functions
3277 */
3278
4b32b5ad
MKL
3279static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
3280{
3a2232e9 3281 BUG_ON(from->from);
4b32b5ad
MKL
3282
3283 rt->rt6i_flags &= ~RTF_EXPIRES;
3284 dst_hold(&from->dst);
3a2232e9 3285 rt->from = from;
4b32b5ad
MKL
3286 dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
3287}
3288
83a09abd
MKL
3289static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
3290{
3291 rt->dst.input = ort->dst.input;
3292 rt->dst.output = ort->dst.output;
3293 rt->rt6i_dst = ort->rt6i_dst;
3294 rt->dst.error = ort->dst.error;
3295 rt->rt6i_idev = ort->rt6i_idev;
3296 if (rt->rt6i_idev)
3297 in6_dev_hold(rt->rt6i_idev);
3298 rt->dst.lastuse = jiffies;
3299 rt->rt6i_gateway = ort->rt6i_gateway;
3300 rt->rt6i_flags = ort->rt6i_flags;
3301 rt6_set_from(rt, ort);
3302 rt->rt6i_metric = ort->rt6i_metric;
1da177e4 3303#ifdef CONFIG_IPV6_SUBTREES
83a09abd 3304 rt->rt6i_src = ort->rt6i_src;
1da177e4 3305#endif
83a09abd
MKL
3306 rt->rt6i_prefsrc = ort->rt6i_prefsrc;
3307 rt->rt6i_table = ort->rt6i_table;
61adedf3 3308 rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
1da177e4
LT
3309}
3310
70ceb4f5 3311#ifdef CONFIG_IPV6_ROUTE_INFO
efa2cea0 3312static struct rt6_info *rt6_get_route_info(struct net *net,
b71d1d42 3313 const struct in6_addr *prefix, int prefixlen,
830218c1
DA
3314 const struct in6_addr *gwaddr,
3315 struct net_device *dev)
70ceb4f5 3316{
830218c1
DA
3317 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3318 int ifindex = dev->ifindex;
70ceb4f5
YH
3319 struct fib6_node *fn;
3320 struct rt6_info *rt = NULL;
c71099ac
TG
3321 struct fib6_table *table;
3322
830218c1 3323 table = fib6_get_table(net, tb_id);
38308473 3324 if (!table)
c71099ac 3325 return NULL;
70ceb4f5 3326
66f5d6ce 3327 rcu_read_lock();
38fbeeee 3328 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
70ceb4f5
YH
3329 if (!fn)
3330 goto out;
3331
66f5d6ce 3332 for_each_fib6_node_rt_rcu(fn) {
d1918542 3333 if (rt->dst.dev->ifindex != ifindex)
70ceb4f5
YH
3334 continue;
3335 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3336 continue;
3337 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
3338 continue;
d3843fe5 3339 ip6_hold_safe(NULL, &rt, false);
70ceb4f5
YH
3340 break;
3341 }
3342out:
66f5d6ce 3343 rcu_read_unlock();
70ceb4f5
YH
3344 return rt;
3345}
3346
efa2cea0 3347static struct rt6_info *rt6_add_route_info(struct net *net,
b71d1d42 3348 const struct in6_addr *prefix, int prefixlen,
830218c1
DA
3349 const struct in6_addr *gwaddr,
3350 struct net_device *dev,
95c96174 3351 unsigned int pref)
70ceb4f5 3352{
86872cb5 3353 struct fib6_config cfg = {
238fc7ea 3354 .fc_metric = IP6_RT_PRIO_USER,
830218c1 3355 .fc_ifindex = dev->ifindex,
86872cb5
TG
3356 .fc_dst_len = prefixlen,
3357 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3358 RTF_UP | RTF_PREF(pref),
b91d5329 3359 .fc_protocol = RTPROT_RA,
15e47304 3360 .fc_nlinfo.portid = 0,
efa2cea0
DL
3361 .fc_nlinfo.nlh = NULL,
3362 .fc_nlinfo.nl_net = net,
86872cb5
TG
3363 };
3364
830218c1 3365 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
4e3fd7a0
AD
3366 cfg.fc_dst = *prefix;
3367 cfg.fc_gateway = *gwaddr;
70ceb4f5 3368
e317da96
YH
3369 /* We should treat it as a default route if prefix length is 0. */
3370 if (!prefixlen)
86872cb5 3371 cfg.fc_flags |= RTF_DEFAULT;
70ceb4f5 3372
333c4301 3373 ip6_route_add(&cfg, NULL);
70ceb4f5 3374
830218c1 3375 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
70ceb4f5
YH
3376}
3377#endif
3378
b71d1d42 3379struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
1ab1457c 3380{
830218c1 3381 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
1da177e4 3382 struct rt6_info *rt;
c71099ac 3383 struct fib6_table *table;
1da177e4 3384
830218c1 3385 table = fib6_get_table(dev_net(dev), tb_id);
38308473 3386 if (!table)
c71099ac 3387 return NULL;
1da177e4 3388
66f5d6ce
WW
3389 rcu_read_lock();
3390 for_each_fib6_node_rt_rcu(&table->tb6_root) {
d1918542 3391 if (dev == rt->dst.dev &&
045927ff 3392 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
1da177e4
LT
3393 ipv6_addr_equal(&rt->rt6i_gateway, addr))
3394 break;
3395 }
3396 if (rt)
d3843fe5 3397 ip6_hold_safe(NULL, &rt, false);
66f5d6ce 3398 rcu_read_unlock();
1da177e4
LT
3399 return rt;
3400}
3401
b71d1d42 3402struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
ebacaaa0
YH
3403 struct net_device *dev,
3404 unsigned int pref)
1da177e4 3405{
86872cb5 3406 struct fib6_config cfg = {
ca254490 3407 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
238fc7ea 3408 .fc_metric = IP6_RT_PRIO_USER,
86872cb5
TG
3409 .fc_ifindex = dev->ifindex,
3410 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3411 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
b91d5329 3412 .fc_protocol = RTPROT_RA,
15e47304 3413 .fc_nlinfo.portid = 0,
5578689a 3414 .fc_nlinfo.nlh = NULL,
c346dca1 3415 .fc_nlinfo.nl_net = dev_net(dev),
86872cb5 3416 };
1da177e4 3417
4e3fd7a0 3418 cfg.fc_gateway = *gwaddr;
1da177e4 3419
333c4301 3420 if (!ip6_route_add(&cfg, NULL)) {
830218c1
DA
3421 struct fib6_table *table;
3422
3423 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3424 if (table)
3425 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3426 }
1da177e4 3427
1da177e4
LT
3428 return rt6_get_dflt_router(gwaddr, dev);
3429}
3430
830218c1 3431static void __rt6_purge_dflt_routers(struct fib6_table *table)
1da177e4
LT
3432{
3433 struct rt6_info *rt;
3434
3435restart:
66f5d6ce
WW
3436 rcu_read_lock();
3437 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3e8b0ac3
LC
3438 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3439 (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
d3843fe5 3440 if (dst_hold_safe(&rt->dst)) {
66f5d6ce 3441 rcu_read_unlock();
d3843fe5
WW
3442 ip6_del_rt(rt);
3443 } else {
66f5d6ce 3444 rcu_read_unlock();
d3843fe5 3445 }
1da177e4
LT
3446 goto restart;
3447 }
3448 }
66f5d6ce 3449 rcu_read_unlock();
830218c1
DA
3450
3451 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3452}
3453
3454void rt6_purge_dflt_routers(struct net *net)
3455{
3456 struct fib6_table *table;
3457 struct hlist_head *head;
3458 unsigned int h;
3459
3460 rcu_read_lock();
3461
3462 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3463 head = &net->ipv6.fib_table_hash[h];
3464 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3465 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3466 __rt6_purge_dflt_routers(table);
3467 }
3468 }
3469
3470 rcu_read_unlock();
1da177e4
LT
3471}
3472
5578689a
DL
3473static void rtmsg_to_fib6_config(struct net *net,
3474 struct in6_rtmsg *rtmsg,
86872cb5
TG
3475 struct fib6_config *cfg)
3476{
3477 memset(cfg, 0, sizeof(*cfg));
3478
ca254490
DA
3479 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3480 : RT6_TABLE_MAIN;
86872cb5
TG
3481 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3482 cfg->fc_metric = rtmsg->rtmsg_metric;
3483 cfg->fc_expires = rtmsg->rtmsg_info;
3484 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3485 cfg->fc_src_len = rtmsg->rtmsg_src_len;
3486 cfg->fc_flags = rtmsg->rtmsg_flags;
3487
5578689a 3488 cfg->fc_nlinfo.nl_net = net;
f1243c2d 3489
4e3fd7a0
AD
3490 cfg->fc_dst = rtmsg->rtmsg_dst;
3491 cfg->fc_src = rtmsg->rtmsg_src;
3492 cfg->fc_gateway = rtmsg->rtmsg_gateway;
86872cb5
TG
3493}
3494
5578689a 3495int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
1da177e4 3496{
86872cb5 3497 struct fib6_config cfg;
1da177e4
LT
3498 struct in6_rtmsg rtmsg;
3499 int err;
3500
67ba4152 3501 switch (cmd) {
1da177e4
LT
3502 case SIOCADDRT: /* Add a route */
3503 case SIOCDELRT: /* Delete a route */
af31f412 3504 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
1da177e4
LT
3505 return -EPERM;
3506 err = copy_from_user(&rtmsg, arg,
3507 sizeof(struct in6_rtmsg));
3508 if (err)
3509 return -EFAULT;
86872cb5 3510
5578689a 3511 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
86872cb5 3512
1da177e4
LT
3513 rtnl_lock();
3514 switch (cmd) {
3515 case SIOCADDRT:
333c4301 3516 err = ip6_route_add(&cfg, NULL);
1da177e4
LT
3517 break;
3518 case SIOCDELRT:
333c4301 3519 err = ip6_route_del(&cfg, NULL);
1da177e4
LT
3520 break;
3521 default:
3522 err = -EINVAL;
3523 }
3524 rtnl_unlock();
3525
3526 return err;
3ff50b79 3527 }
1da177e4
LT
3528
3529 return -EINVAL;
3530}
3531
3532/*
3533 * Drop the packet on the floor
3534 */
3535
d5fdd6ba 3536static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
1da177e4 3537{
612f09e8 3538 int type;
adf30907 3539 struct dst_entry *dst = skb_dst(skb);
612f09e8
YH
3540 switch (ipstats_mib_noroutes) {
3541 case IPSTATS_MIB_INNOROUTES:
0660e03f 3542 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
45bb0060 3543 if (type == IPV6_ADDR_ANY) {
bdb7cc64
SS
3544 IP6_INC_STATS(dev_net(dst->dev),
3545 __in6_dev_get_safely(skb->dev),
3bd653c8 3546 IPSTATS_MIB_INADDRERRORS);
612f09e8
YH
3547 break;
3548 }
3549 /* FALLTHROUGH */
3550 case IPSTATS_MIB_OUTNOROUTES:
3bd653c8
DL
3551 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3552 ipstats_mib_noroutes);
612f09e8
YH
3553 break;
3554 }
3ffe533c 3555 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
1da177e4
LT
3556 kfree_skb(skb);
3557 return 0;
3558}
3559
9ce8ade0
TG
3560static int ip6_pkt_discard(struct sk_buff *skb)
3561{
612f09e8 3562 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
9ce8ade0
TG
3563}
3564
ede2059d 3565static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
1da177e4 3566{
adf30907 3567 skb->dev = skb_dst(skb)->dev;
612f09e8 3568 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
1da177e4
LT
3569}
3570
9ce8ade0
TG
3571static int ip6_pkt_prohibit(struct sk_buff *skb)
3572{
612f09e8 3573 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
9ce8ade0
TG
3574}
3575
ede2059d 3576static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
9ce8ade0 3577{
adf30907 3578 skb->dev = skb_dst(skb)->dev;
612f09e8 3579 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
9ce8ade0
TG
3580}
3581
1da177e4
LT
3582/*
3583 * Allocate a dst for local (unicast / anycast) address.
3584 */
3585
3586struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
3587 const struct in6_addr *addr,
8f031519 3588 bool anycast)
1da177e4 3589{
ca254490 3590 u32 tb_id;
c346dca1 3591 struct net *net = dev_net(idev->dev);
4832c30d 3592 struct net_device *dev = idev->dev;
5f02ce24
DA
3593 struct rt6_info *rt;
3594
5f02ce24 3595 rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
a3300ef4 3596 if (!rt)
1da177e4
LT
3597 return ERR_PTR(-ENOMEM);
3598
1da177e4
LT
3599 in6_dev_hold(idev);
3600
11d53b49 3601 rt->dst.flags |= DST_HOST;
d8d1f30b
CG
3602 rt->dst.input = ip6_input;
3603 rt->dst.output = ip6_output;
1da177e4 3604 rt->rt6i_idev = idev;
1da177e4 3605
94b5e0f9 3606 rt->rt6i_protocol = RTPROT_KERNEL;
1da177e4 3607 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
58c4fb86
YH
3608 if (anycast)
3609 rt->rt6i_flags |= RTF_ANYCAST;
3610 else
1da177e4 3611 rt->rt6i_flags |= RTF_LOCAL;
1da177e4 3612
550bab42 3613 rt->rt6i_gateway = *addr;
4e3fd7a0 3614 rt->rt6i_dst.addr = *addr;
1da177e4 3615 rt->rt6i_dst.plen = 128;
ca254490
DA
3616 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3617 rt->rt6i_table = fib6_get_table(net, tb_id);
1da177e4 3618
1da177e4
LT
3619 return rt;
3620}
3621
c3968a85
DW
3622/* remove deleted ip from prefsrc entries */
/* Walker argument for fib6_remove_prefsrc(). */
struct arg_dev_net_ip {
	struct net_device *dev;	/* only routes on this device; NULL matches any */
	struct net *net;	/* namespace whose null entry is skipped */
	struct in6_addr *addr;	/* preferred source address being removed */
};
3628
3629static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
3630{
3631 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3632 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3633 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3634
d1918542 3635 if (((void *)rt->dst.dev == dev || !dev) &&
c3968a85
DW
3636 rt != net->ipv6.ip6_null_entry &&
3637 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
60006a48 3638 spin_lock_bh(&rt6_exception_lock);
c3968a85
DW
3639 /* remove prefsrc entry */
3640 rt->rt6i_prefsrc.plen = 0;
60006a48
WW
3641 /* need to update cache as well */
3642 rt6_exceptions_remove_prefsrc(rt);
3643 spin_unlock_bh(&rt6_exception_lock);
c3968a85
DW
3644 }
3645 return 0;
3646}
3647
3648void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3649{
3650 struct net *net = dev_net(ifp->idev->dev);
3651 struct arg_dev_net_ip adni = {
3652 .dev = ifp->idev->dev,
3653 .net = net,
3654 .addr = &ifp->addr,
3655 };
0c3584d5 3656 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
c3968a85
DW
3657}
3658
be7a010d 3659#define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
be7a010d
DJ
3660
3661/* Remove routers and update dst entries when gateway turn into host. */
3662static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
3663{
3664 struct in6_addr *gateway = (struct in6_addr *)arg;
3665
2b760fcf
WW
3666 if (((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3667 ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
be7a010d
DJ
3668 return -1;
3669 }
b16cb459
WW
3670
3671 /* Further clean up cached routes in exception table.
3672 * This is needed because cached route may have a different
3673 * gateway than its 'parent' in the case of an ip redirect.
3674 */
3675 rt6_exceptions_clean_tohost(rt, gateway);
3676
be7a010d
DJ
3677 return 0;
3678}
3679
3680void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3681{
3682 fib6_clean_all(net, fib6_clean_tohost, gateway);
3683}
3684
2127d95a
IS
/*
 * Walker argument shared by fib6_ifup() and fib6_ifdown().  Only one
 * member of the union is meaningful for a given walk.
 */
struct arg_netdev_event {
	const struct net_device *dev;	/* device the event refers to */
	union {
		unsigned int nh_flags;	/* fib6_ifup(): nexthop flags to clear */
		unsigned long event;	/* fib6_ifdown(): NETDEV_* event */
	};
};
3692
d7dedee1
IS
3693static struct rt6_info *rt6_multipath_first_sibling(const struct rt6_info *rt)
3694{
3695 struct rt6_info *iter;
3696 struct fib6_node *fn;
3697
3698 fn = rcu_dereference_protected(rt->rt6i_node,
3699 lockdep_is_held(&rt->rt6i_table->tb6_lock));
3700 iter = rcu_dereference_protected(fn->leaf,
3701 lockdep_is_held(&rt->rt6i_table->tb6_lock));
3702 while (iter) {
3703 if (iter->rt6i_metric == rt->rt6i_metric &&
3704 rt6_qualify_for_ecmp(iter))
3705 return iter;
3706 iter = rcu_dereference_protected(iter->rt6_next,
3707 lockdep_is_held(&rt->rt6i_table->tb6_lock));
3708 }
3709
3710 return NULL;
3711}
3712
3713static bool rt6_is_dead(const struct rt6_info *rt)
3714{
3715 if (rt->rt6i_nh_flags & RTNH_F_DEAD ||
3716 (rt->rt6i_nh_flags & RTNH_F_LINKDOWN &&
3717 rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
3718 return true;
3719
3720 return false;
3721}
3722
3723static int rt6_multipath_total_weight(const struct rt6_info *rt)
3724{
3725 struct rt6_info *iter;
3726 int total = 0;
3727
3728 if (!rt6_is_dead(rt))
398958ae 3729 total += rt->rt6i_nh_weight;
d7dedee1
IS
3730
3731 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings) {
3732 if (!rt6_is_dead(iter))
398958ae 3733 total += iter->rt6i_nh_weight;
d7dedee1
IS
3734 }
3735
3736 return total;
3737}
3738
3739static void rt6_upper_bound_set(struct rt6_info *rt, int *weight, int total)
3740{
3741 int upper_bound = -1;
3742
3743 if (!rt6_is_dead(rt)) {
398958ae 3744 *weight += rt->rt6i_nh_weight;
d7dedee1
IS
3745 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3746 total) - 1;
3747 }
3748 atomic_set(&rt->rt6i_nh_upper_bound, upper_bound);
3749}
3750
3751static void rt6_multipath_upper_bound_set(struct rt6_info *rt, int total)
3752{
3753 struct rt6_info *iter;
3754 int weight = 0;
3755
3756 rt6_upper_bound_set(rt, &weight, total);
3757
3758 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3759 rt6_upper_bound_set(iter, &weight, total);
3760}
3761
3762void rt6_multipath_rebalance(struct rt6_info *rt)
3763{
3764 struct rt6_info *first;
3765 int total;
3766
3767 /* In case the entire multipath route was marked for flushing,
3768 * then there is no need to rebalance upon the removal of every
3769 * sibling route.
3770 */
3771 if (!rt->rt6i_nsiblings || rt->should_flush)
3772 return;
3773
3774 /* During lookup routes are evaluated in order, so we need to
3775 * make sure upper bounds are assigned from the first sibling
3776 * onwards.
3777 */
3778 first = rt6_multipath_first_sibling(rt);
3779 if (WARN_ON_ONCE(!first))
3780 return;
3781
3782 total = rt6_multipath_total_weight(first);
3783 rt6_multipath_upper_bound_set(first, total);
3784}
3785
2127d95a
IS
3786static int fib6_ifup(struct rt6_info *rt, void *p_arg)
3787{
3788 const struct arg_netdev_event *arg = p_arg;
3789 const struct net *net = dev_net(arg->dev);
3790
1de178ed 3791 if (rt != net->ipv6.ip6_null_entry && rt->dst.dev == arg->dev) {
2127d95a 3792 rt->rt6i_nh_flags &= ~arg->nh_flags;
1de178ed 3793 fib6_update_sernum_upto_root(dev_net(rt->dst.dev), rt);
d7dedee1 3794 rt6_multipath_rebalance(rt);
1de178ed 3795 }
2127d95a
IS
3796
3797 return 0;
3798}
3799
3800void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
3801{
3802 struct arg_netdev_event arg = {
3803 .dev = dev,
6802f3ad
IS
3804 {
3805 .nh_flags = nh_flags,
3806 },
2127d95a
IS
3807 };
3808
3809 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
3810 arg.nh_flags |= RTNH_F_LINKDOWN;
3811
3812 fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
3813}
3814
1de178ed
IS
3815static bool rt6_multipath_uses_dev(const struct rt6_info *rt,
3816 const struct net_device *dev)
3817{
3818 struct rt6_info *iter;
3819
3820 if (rt->dst.dev == dev)
3821 return true;
3822 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3823 if (iter->dst.dev == dev)
3824 return true;
3825
3826 return false;
3827}
3828
3829static void rt6_multipath_flush(struct rt6_info *rt)
3830{
3831 struct rt6_info *iter;
3832
3833 rt->should_flush = 1;
3834 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3835 iter->should_flush = 1;
3836}
3837
3838static unsigned int rt6_multipath_dead_count(const struct rt6_info *rt,
3839 const struct net_device *down_dev)
3840{
3841 struct rt6_info *iter;
3842 unsigned int dead = 0;
3843
3844 if (rt->dst.dev == down_dev || rt->rt6i_nh_flags & RTNH_F_DEAD)
3845 dead++;
3846 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3847 if (iter->dst.dev == down_dev ||
3848 iter->rt6i_nh_flags & RTNH_F_DEAD)
3849 dead++;
3850
3851 return dead;
3852}
3853
3854static void rt6_multipath_nh_flags_set(struct rt6_info *rt,
3855 const struct net_device *dev,
3856 unsigned int nh_flags)
3857{
3858 struct rt6_info *iter;
3859
3860 if (rt->dst.dev == dev)
3861 rt->rt6i_nh_flags |= nh_flags;
3862 list_for_each_entry(iter, &rt->rt6i_siblings, rt6i_siblings)
3863 if (iter->dst.dev == dev)
3864 iter->rt6i_nh_flags |= nh_flags;
3865}
3866
a1a22c12 3867/* called with write lock held for table with rt */
4c981e28 3868static int fib6_ifdown(struct rt6_info *rt, void *p_arg)
1da177e4 3869{
4c981e28
IS
3870 const struct arg_netdev_event *arg = p_arg;
3871 const struct net_device *dev = arg->dev;
3872 const struct net *net = dev_net(dev);
8ed67789 3873
1de178ed 3874 if (rt == net->ipv6.ip6_null_entry)
27c6fa73
IS
3875 return 0;
3876
3877 switch (arg->event) {
3878 case NETDEV_UNREGISTER:
1de178ed 3879 return rt->dst.dev == dev ? -1 : 0;
27c6fa73 3880 case NETDEV_DOWN:
1de178ed 3881 if (rt->should_flush)
27c6fa73 3882 return -1;
1de178ed
IS
3883 if (!rt->rt6i_nsiblings)
3884 return rt->dst.dev == dev ? -1 : 0;
3885 if (rt6_multipath_uses_dev(rt, dev)) {
3886 unsigned int count;
3887
3888 count = rt6_multipath_dead_count(rt, dev);
3889 if (rt->rt6i_nsiblings + 1 == count) {
3890 rt6_multipath_flush(rt);
3891 return -1;
3892 }
3893 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
3894 RTNH_F_LINKDOWN);
3895 fib6_update_sernum(rt);
d7dedee1 3896 rt6_multipath_rebalance(rt);
1de178ed
IS
3897 }
3898 return -2;
27c6fa73 3899 case NETDEV_CHANGE:
1de178ed
IS
3900 if (rt->dst.dev != dev ||
3901 rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST))
27c6fa73
IS
3902 break;
3903 rt->rt6i_nh_flags |= RTNH_F_LINKDOWN;
d7dedee1 3904 rt6_multipath_rebalance(rt);
27c6fa73 3905 break;
2b241361 3906 }
c159d30c 3907
1da177e4
LT
3908 return 0;
3909}
3910
27c6fa73 3911void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
1da177e4 3912{
4c981e28 3913 struct arg_netdev_event arg = {
8ed67789 3914 .dev = dev,
6802f3ad
IS
3915 {
3916 .event = event,
3917 },
8ed67789
DL
3918 };
3919
4c981e28
IS
3920 fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
3921}
3922
3923void rt6_disable_ip(struct net_device *dev, unsigned long event)
3924{
3925 rt6_sync_down_dev(dev, event);
3926 rt6_uncached_list_flush_dev(dev_net(dev), dev);
3927 neigh_ifdown(&nd_tbl, dev);
1da177e4
LT
3928}
3929
/* Walker argument for rt6_mtu_change_route(). */
struct rt6_mtu_change_arg {
	struct net_device *dev;	/* device whose MTU changed */
	unsigned int mtu;	/* the new MTU */
};
3934
3935static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
3936{
3937 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
3938 struct inet6_dev *idev;
3939
3940 /* In IPv6 pmtu discovery is not optional,
3941 so that RTAX_MTU lock cannot disable it.
3942 We still use this lock to block changes
3943 caused by addrconf/ndisc.
3944 */
3945
3946 idev = __in6_dev_get(arg->dev);
38308473 3947 if (!idev)
1da177e4
LT
3948 return 0;
3949
3950 /* For administrative MTU increase, there is no way to discover
3951 IPv6 PMTU increase, so PMTU increase should be updated here.
3952 Since RFC 1981 doesn't include administrative MTU increase
3953 update PMTU increase is a MUST. (i.e. jumbo frame)
3954 */
d1918542 3955 if (rt->dst.dev == arg->dev &&
4b32b5ad 3956 !dst_metric_locked(&rt->dst, RTAX_MTU)) {
f5bbe7ee 3957 spin_lock_bh(&rt6_exception_lock);
e9fa1495
SB
3958 if (dst_metric_raw(&rt->dst, RTAX_MTU) &&
3959 rt6_mtu_change_route_allowed(idev, rt, arg->mtu))
4b32b5ad 3960 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
e9fa1495 3961 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
f5bbe7ee 3962 spin_unlock_bh(&rt6_exception_lock);
566cfd8f 3963 }
1da177e4
LT
3964 return 0;
3965}
3966
95c96174 3967void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
1da177e4 3968{
c71099ac
TG
3969 struct rt6_mtu_change_arg arg = {
3970 .dev = dev,
3971 .mtu = mtu,
3972 };
1da177e4 3973
0c3584d5 3974 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
1da177e4
LT
3975}
3976
ef7c79ed 3977static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
5176f91e 3978 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
86872cb5 3979 [RTA_OIF] = { .type = NLA_U32 },
ab364a6f 3980 [RTA_IIF] = { .type = NLA_U32 },
86872cb5
TG
3981 [RTA_PRIORITY] = { .type = NLA_U32 },
3982 [RTA_METRICS] = { .type = NLA_NESTED },
51ebd318 3983 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
c78ba6d6 3984 [RTA_PREF] = { .type = NLA_U8 },
19e42e45
RP
3985 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
3986 [RTA_ENCAP] = { .type = NLA_NESTED },
32bc201e 3987 [RTA_EXPIRES] = { .type = NLA_U32 },
622ec2c9 3988 [RTA_UID] = { .type = NLA_U32 },
3b45a410 3989 [RTA_MARK] = { .type = NLA_U32 },
86872cb5
TG
3990};
3991
3992static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
333c4301
DA
3993 struct fib6_config *cfg,
3994 struct netlink_ext_ack *extack)
1da177e4 3995{
86872cb5
TG
3996 struct rtmsg *rtm;
3997 struct nlattr *tb[RTA_MAX+1];
c78ba6d6 3998 unsigned int pref;
86872cb5 3999 int err;
1da177e4 4000
fceb6435
JB
4001 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4002 NULL);
86872cb5
TG
4003 if (err < 0)
4004 goto errout;
1da177e4 4005
86872cb5
TG
4006 err = -EINVAL;
4007 rtm = nlmsg_data(nlh);
4008 memset(cfg, 0, sizeof(*cfg));
4009
4010 cfg->fc_table = rtm->rtm_table;
4011 cfg->fc_dst_len = rtm->rtm_dst_len;
4012 cfg->fc_src_len = rtm->rtm_src_len;
4013 cfg->fc_flags = RTF_UP;
4014 cfg->fc_protocol = rtm->rtm_protocol;
ef2c7d7b 4015 cfg->fc_type = rtm->rtm_type;
86872cb5 4016
ef2c7d7b
ND
4017 if (rtm->rtm_type == RTN_UNREACHABLE ||
4018 rtm->rtm_type == RTN_BLACKHOLE ||
b4949ab2
ND
4019 rtm->rtm_type == RTN_PROHIBIT ||
4020 rtm->rtm_type == RTN_THROW)
86872cb5
TG
4021 cfg->fc_flags |= RTF_REJECT;
4022
ab79ad14
4023 if (rtm->rtm_type == RTN_LOCAL)
4024 cfg->fc_flags |= RTF_LOCAL;
4025
1f56a01f
MKL
4026 if (rtm->rtm_flags & RTM_F_CLONED)
4027 cfg->fc_flags |= RTF_CACHE;
4028
fc1e64e1
DA
4029 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4030
15e47304 4031 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
86872cb5 4032 cfg->fc_nlinfo.nlh = nlh;
3b1e0a65 4033 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
86872cb5
TG
4034
4035 if (tb[RTA_GATEWAY]) {
67b61f6c 4036 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
86872cb5 4037 cfg->fc_flags |= RTF_GATEWAY;
1da177e4 4038 }
86872cb5
TG
4039
4040 if (tb[RTA_DST]) {
4041 int plen = (rtm->rtm_dst_len + 7) >> 3;
4042
4043 if (nla_len(tb[RTA_DST]) < plen)
4044 goto errout;
4045
4046 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
1da177e4 4047 }
86872cb5
TG
4048
4049 if (tb[RTA_SRC]) {
4050 int plen = (rtm->rtm_src_len + 7) >> 3;
4051
4052 if (nla_len(tb[RTA_SRC]) < plen)
4053 goto errout;
4054
4055 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
1da177e4 4056 }
86872cb5 4057
c3968a85 4058 if (tb[RTA_PREFSRC])
67b61f6c 4059 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
c3968a85 4060
86872cb5
TG
4061 if (tb[RTA_OIF])
4062 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4063
4064 if (tb[RTA_PRIORITY])
4065 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4066
4067 if (tb[RTA_METRICS]) {
4068 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4069 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
1da177e4 4070 }
86872cb5
TG
4071
4072 if (tb[RTA_TABLE])
4073 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4074
51ebd318
ND
4075 if (tb[RTA_MULTIPATH]) {
4076 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4077 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
9ed59592
DA
4078
4079 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
c255bd68 4080 cfg->fc_mp_len, extack);
9ed59592
DA
4081 if (err < 0)
4082 goto errout;
51ebd318
ND
4083 }
4084
c78ba6d6
LR
4085 if (tb[RTA_PREF]) {
4086 pref = nla_get_u8(tb[RTA_PREF]);
4087 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4088 pref != ICMPV6_ROUTER_PREF_HIGH)
4089 pref = ICMPV6_ROUTER_PREF_MEDIUM;
4090 cfg->fc_flags |= RTF_PREF(pref);
4091 }
4092
19e42e45
RP
4093 if (tb[RTA_ENCAP])
4094 cfg->fc_encap = tb[RTA_ENCAP];
4095
9ed59592 4096 if (tb[RTA_ENCAP_TYPE]) {
19e42e45
RP
4097 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4098
c255bd68 4099 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
9ed59592
DA
4100 if (err < 0)
4101 goto errout;
4102 }
4103
32bc201e
XL
4104 if (tb[RTA_EXPIRES]) {
4105 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4106
4107 if (addrconf_finite_timeout(timeout)) {
4108 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4109 cfg->fc_flags |= RTF_EXPIRES;
4110 }
4111 }
4112
86872cb5
TG
4113 err = 0;
4114errout:
4115 return err;
1da177e4
LT
4116}
4117
6b9ea5a6
RP
/* One parsed nexthop of a multipath request, queued until insertion. */
struct rt6_nh {
	struct rt6_info *rt6_info;	/* route built for this nexthop; NULL once handed to the FIB */
	struct fib6_config r_cfg;	/* per-nexthop copy of the request config */
	struct mx6_config mxc;		/* metrics converted from r_cfg */
	struct list_head next;		/* linkage in the caller's nexthop list */
};
4124
4125static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4126{
4127 struct rt6_nh *nh;
4128
4129 list_for_each_entry(nh, rt6_nh_list, next) {
7d4d5065 4130 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
6b9ea5a6
RP
4131 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4132 nh->r_cfg.fc_ifindex);
4133 }
4134}
4135
4136static int ip6_route_info_append(struct list_head *rt6_nh_list,
4137 struct rt6_info *rt, struct fib6_config *r_cfg)
4138{
4139 struct rt6_nh *nh;
6b9ea5a6
RP
4140 int err = -EEXIST;
4141
4142 list_for_each_entry(nh, rt6_nh_list, next) {
4143 /* check if rt6_info already exists */
f06b7549 4144 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
6b9ea5a6
RP
4145 return err;
4146 }
4147
4148 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4149 if (!nh)
4150 return -ENOMEM;
4151 nh->rt6_info = rt;
4152 err = ip6_convert_metrics(&nh->mxc, r_cfg);
4153 if (err) {
4154 kfree(nh);
4155 return err;
4156 }
4157 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4158 list_add_tail(&nh->next, rt6_nh_list);
4159
4160 return 0;
4161}
4162
3b1137fe
DA
4163static void ip6_route_mpath_notify(struct rt6_info *rt,
4164 struct rt6_info *rt_last,
4165 struct nl_info *info,
4166 __u16 nlflags)
4167{
4168 /* if this is an APPEND route, then rt points to the first route
4169 * inserted and rt_last points to last route inserted. Userspace
4170 * wants a consistent dump of the route which starts at the first
4171 * nexthop. Since sibling routes are always added at the end of
4172 * the list, find the first sibling of the last route appended
4173 */
4174 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
4175 rt = list_first_entry(&rt_last->rt6i_siblings,
4176 struct rt6_info,
4177 rt6i_siblings);
4178 }
4179
4180 if (rt)
4181 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4182}
4183
333c4301
DA
4184static int ip6_route_multipath_add(struct fib6_config *cfg,
4185 struct netlink_ext_ack *extack)
51ebd318 4186{
3b1137fe
DA
4187 struct rt6_info *rt_notif = NULL, *rt_last = NULL;
4188 struct nl_info *info = &cfg->fc_nlinfo;
51ebd318
ND
4189 struct fib6_config r_cfg;
4190 struct rtnexthop *rtnh;
6b9ea5a6
RP
4191 struct rt6_info *rt;
4192 struct rt6_nh *err_nh;
4193 struct rt6_nh *nh, *nh_safe;
3b1137fe 4194 __u16 nlflags;
51ebd318
ND
4195 int remaining;
4196 int attrlen;
6b9ea5a6
RP
4197 int err = 1;
4198 int nhn = 0;
4199 int replace = (cfg->fc_nlinfo.nlh &&
4200 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4201 LIST_HEAD(rt6_nh_list);
51ebd318 4202
3b1137fe
DA
4203 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4204 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4205 nlflags |= NLM_F_APPEND;
4206
35f1b4e9 4207 remaining = cfg->fc_mp_len;
51ebd318 4208 rtnh = (struct rtnexthop *)cfg->fc_mp;
51ebd318 4209
6b9ea5a6
RP
4210 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4211 * rt6_info structs per nexthop
4212 */
51ebd318
ND
4213 while (rtnh_ok(rtnh, remaining)) {
4214 memcpy(&r_cfg, cfg, sizeof(*cfg));
4215 if (rtnh->rtnh_ifindex)
4216 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4217
4218 attrlen = rtnh_attrlen(rtnh);
4219 if (attrlen > 0) {
4220 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4221
4222 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4223 if (nla) {
67b61f6c 4224 r_cfg.fc_gateway = nla_get_in6_addr(nla);
51ebd318
ND
4225 r_cfg.fc_flags |= RTF_GATEWAY;
4226 }
19e42e45
RP
4227 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4228 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4229 if (nla)
4230 r_cfg.fc_encap_type = nla_get_u16(nla);
51ebd318 4231 }
6b9ea5a6 4232
68e2ffde 4233 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
333c4301 4234 rt = ip6_route_info_create(&r_cfg, extack);
8c5b83f0
RP
4235 if (IS_ERR(rt)) {
4236 err = PTR_ERR(rt);
4237 rt = NULL;
6b9ea5a6 4238 goto cleanup;
8c5b83f0 4239 }
6b9ea5a6 4240
398958ae
IS
4241 rt->rt6i_nh_weight = rtnh->rtnh_hops + 1;
4242
6b9ea5a6 4243 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
51ebd318 4244 if (err) {
587fea74 4245 dst_release_immediate(&rt->dst);
6b9ea5a6
RP
4246 goto cleanup;
4247 }
4248
4249 rtnh = rtnh_next(rtnh, &remaining);
4250 }
4251
3b1137fe
DA
4252 /* for add and replace send one notification with all nexthops.
4253 * Skip the notification in fib6_add_rt2node and send one with
4254 * the full route when done
4255 */
4256 info->skip_notify = 1;
4257
6b9ea5a6
RP
4258 err_nh = NULL;
4259 list_for_each_entry(nh, &rt6_nh_list, next) {
3b1137fe 4260 rt_last = nh->rt6_info;
333c4301 4261 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
3b1137fe
DA
4262 /* save reference to first route for notification */
4263 if (!rt_notif && !err)
4264 rt_notif = nh->rt6_info;
4265
6b9ea5a6
RP
4266 /* nh->rt6_info is used or freed at this point, reset to NULL*/
4267 nh->rt6_info = NULL;
4268 if (err) {
4269 if (replace && nhn)
4270 ip6_print_replace_route_err(&rt6_nh_list);
4271 err_nh = nh;
4272 goto add_errout;
51ebd318 4273 }
6b9ea5a6 4274
1a72418b 4275 /* Because each route is added like a single route we remove
27596472
MK
4276 * these flags after the first nexthop: if there is a collision,
4277 * we have already failed to add the first nexthop:
4278 * fib6_add_rt2node() has rejected it; when replacing, old
4279 * nexthops have been replaced by first new, the rest should
4280 * be added to it.
1a72418b 4281 */
27596472
MK
4282 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4283 NLM_F_REPLACE);
6b9ea5a6
RP
4284 nhn++;
4285 }
4286
3b1137fe
DA
4287 /* success ... tell user about new route */
4288 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
6b9ea5a6
RP
4289 goto cleanup;
4290
4291add_errout:
3b1137fe
DA
4292 /* send notification for routes that were added so that
4293 * the delete notifications sent by ip6_route_del are
4294 * coherent
4295 */
4296 if (rt_notif)
4297 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4298
6b9ea5a6
RP
4299 /* Delete routes that were already added */
4300 list_for_each_entry(nh, &rt6_nh_list, next) {
4301 if (err_nh == nh)
4302 break;
333c4301 4303 ip6_route_del(&nh->r_cfg, extack);
6b9ea5a6
RP
4304 }
4305
4306cleanup:
4307 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
587fea74
WW
4308 if (nh->rt6_info)
4309 dst_release_immediate(&nh->rt6_info->dst);
52fe51f8 4310 kfree(nh->mxc.mx);
6b9ea5a6
RP
4311 list_del(&nh->next);
4312 kfree(nh);
4313 }
4314
4315 return err;
4316}
4317
333c4301
DA
4318static int ip6_route_multipath_del(struct fib6_config *cfg,
4319 struct netlink_ext_ack *extack)
6b9ea5a6
RP
4320{
4321 struct fib6_config r_cfg;
4322 struct rtnexthop *rtnh;
4323 int remaining;
4324 int attrlen;
4325 int err = 1, last_err = 0;
4326
4327 remaining = cfg->fc_mp_len;
4328 rtnh = (struct rtnexthop *)cfg->fc_mp;
4329
4330 /* Parse a Multipath Entry */
4331 while (rtnh_ok(rtnh, remaining)) {
4332 memcpy(&r_cfg, cfg, sizeof(*cfg));
4333 if (rtnh->rtnh_ifindex)
4334 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4335
4336 attrlen = rtnh_attrlen(rtnh);
4337 if (attrlen > 0) {
4338 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4339
4340 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4341 if (nla) {
4342 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
4343 r_cfg.fc_flags |= RTF_GATEWAY;
4344 }
4345 }
333c4301 4346 err = ip6_route_del(&r_cfg, extack);
6b9ea5a6
RP
4347 if (err)
4348 last_err = err;
4349
51ebd318
ND
4350 rtnh = rtnh_next(rtnh, &remaining);
4351 }
4352
4353 return last_err;
4354}
4355
c21ef3e3
DA
4356static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4357 struct netlink_ext_ack *extack)
1da177e4 4358{
86872cb5
TG
4359 struct fib6_config cfg;
4360 int err;
1da177e4 4361
333c4301 4362 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
86872cb5
TG
4363 if (err < 0)
4364 return err;
4365
51ebd318 4366 if (cfg.fc_mp)
333c4301 4367 return ip6_route_multipath_del(&cfg, extack);
0ae81335
DA
4368 else {
4369 cfg.fc_delete_all_nh = 1;
333c4301 4370 return ip6_route_del(&cfg, extack);
0ae81335 4371 }
1da177e4
LT
4372}
4373
c21ef3e3
DA
4374static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4375 struct netlink_ext_ack *extack)
1da177e4 4376{
86872cb5
TG
4377 struct fib6_config cfg;
4378 int err;
1da177e4 4379
333c4301 4380 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
86872cb5
TG
4381 if (err < 0)
4382 return err;
4383
51ebd318 4384 if (cfg.fc_mp)
333c4301 4385 return ip6_route_multipath_add(&cfg, extack);
51ebd318 4386 else
333c4301 4387 return ip6_route_add(&cfg, extack);
1da177e4
LT
4388}
4389
beb1afac 4390static size_t rt6_nlmsg_size(struct rt6_info *rt)
339bf98f 4391{
beb1afac
DA
4392 int nexthop_len = 0;
4393
4394 if (rt->rt6i_nsiblings) {
4395 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
4396 + NLA_ALIGN(sizeof(struct rtnexthop))
4397 + nla_total_size(16) /* RTA_GATEWAY */
beb1afac
DA
4398 + lwtunnel_get_encap_size(rt->dst.lwtstate);
4399
4400 nexthop_len *= rt->rt6i_nsiblings;
4401 }
4402
339bf98f
TG
4403 return NLMSG_ALIGN(sizeof(struct rtmsg))
4404 + nla_total_size(16) /* RTA_SRC */
4405 + nla_total_size(16) /* RTA_DST */
4406 + nla_total_size(16) /* RTA_GATEWAY */
4407 + nla_total_size(16) /* RTA_PREFSRC */
4408 + nla_total_size(4) /* RTA_TABLE */
4409 + nla_total_size(4) /* RTA_IIF */
4410 + nla_total_size(4) /* RTA_OIF */
4411 + nla_total_size(4) /* RTA_PRIORITY */
6a2b9ce0 4412 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
ea697639 4413 + nla_total_size(sizeof(struct rta_cacheinfo))
c78ba6d6 4414 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
19e42e45 4415 + nla_total_size(1) /* RTA_PREF */
beb1afac
DA
4416 + lwtunnel_get_encap_size(rt->dst.lwtstate)
4417 + nexthop_len;
4418}
4419
4420static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
5be083ce 4421 unsigned int *flags, bool skip_oif)
beb1afac 4422{
f9d882ea
IS
4423 if (rt->rt6i_nh_flags & RTNH_F_DEAD)
4424 *flags |= RTNH_F_DEAD;
4425
44c9f2f2 4426 if (rt->rt6i_nh_flags & RTNH_F_LINKDOWN) {
beb1afac
DA
4427 *flags |= RTNH_F_LINKDOWN;
4428 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
4429 *flags |= RTNH_F_DEAD;
4430 }
4431
4432 if (rt->rt6i_flags & RTF_GATEWAY) {
4433 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
4434 goto nla_put_failure;
4435 }
4436
fc1e64e1 4437 *flags |= (rt->rt6i_nh_flags & RTNH_F_ONLINK);
fe400799 4438 if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
61e4d01e
IS
4439 *flags |= RTNH_F_OFFLOAD;
4440
5be083ce
DA
4441 /* not needed for multipath encoding b/c it has a rtnexthop struct */
4442 if (!skip_oif && rt->dst.dev &&
beb1afac
DA
4443 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
4444 goto nla_put_failure;
4445
4446 if (rt->dst.lwtstate &&
4447 lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
4448 goto nla_put_failure;
4449
4450 return 0;
4451
4452nla_put_failure:
4453 return -EMSGSIZE;
4454}
4455
5be083ce 4456/* add multipath next hop */
beb1afac
DA
4457static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
4458{
4459 struct rtnexthop *rtnh;
4460 unsigned int flags = 0;
4461
4462 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4463 if (!rtnh)
4464 goto nla_put_failure;
4465
398958ae 4466 rtnh->rtnh_hops = rt->rt6i_nh_weight - 1;
beb1afac
DA
4467 rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
4468
5be083ce 4469 if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
beb1afac
DA
4470 goto nla_put_failure;
4471
4472 rtnh->rtnh_flags = flags;
4473
4474 /* length of rtnetlink header + attributes */
4475 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4476
4477 return 0;
4478
4479nla_put_failure:
4480 return -EMSGSIZE;
339bf98f
TG
4481}
4482
191cd582
BH
4483static int rt6_fill_node(struct net *net,
4484 struct sk_buff *skb, struct rt6_info *rt,
0d51aa80 4485 struct in6_addr *dst, struct in6_addr *src,
15e47304 4486 int iif, int type, u32 portid, u32 seq,
f8cfe2ce 4487 unsigned int flags)
1da177e4 4488{
4b32b5ad 4489 u32 metrics[RTAX_MAX];
1da177e4 4490 struct rtmsg *rtm;
2d7202bf 4491 struct nlmsghdr *nlh;
e3703b3d 4492 long expires;
9e762a4a 4493 u32 table;
1da177e4 4494
15e47304 4495 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
38308473 4496 if (!nlh)
26932566 4497 return -EMSGSIZE;
2d7202bf
TG
4498
4499 rtm = nlmsg_data(nlh);
1da177e4
LT
4500 rtm->rtm_family = AF_INET6;
4501 rtm->rtm_dst_len = rt->rt6i_dst.plen;
4502 rtm->rtm_src_len = rt->rt6i_src.plen;
4503 rtm->rtm_tos = 0;
c71099ac 4504 if (rt->rt6i_table)
9e762a4a 4505 table = rt->rt6i_table->tb6_id;
c71099ac 4506 else
9e762a4a
PM
4507 table = RT6_TABLE_UNSPEC;
4508 rtm->rtm_table = table;
c78679e8
DM
4509 if (nla_put_u32(skb, RTA_TABLE, table))
4510 goto nla_put_failure;
ef2c7d7b
ND
4511 if (rt->rt6i_flags & RTF_REJECT) {
4512 switch (rt->dst.error) {
4513 case -EINVAL:
4514 rtm->rtm_type = RTN_BLACKHOLE;
4515 break;
4516 case -EACCES:
4517 rtm->rtm_type = RTN_PROHIBIT;
4518 break;
b4949ab2
ND
4519 case -EAGAIN:
4520 rtm->rtm_type = RTN_THROW;
4521 break;
ef2c7d7b
ND
4522 default:
4523 rtm->rtm_type = RTN_UNREACHABLE;
4524 break;
4525 }
4526 }
38308473 4527 else if (rt->rt6i_flags & RTF_LOCAL)
ab79ad14 4528 rtm->rtm_type = RTN_LOCAL;
4ee39733
DA
4529 else if (rt->rt6i_flags & RTF_ANYCAST)
4530 rtm->rtm_type = RTN_ANYCAST;
d1918542 4531 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
1da177e4
LT
4532 rtm->rtm_type = RTN_LOCAL;
4533 else
4534 rtm->rtm_type = RTN_UNICAST;
4535 rtm->rtm_flags = 0;
4536 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4537 rtm->rtm_protocol = rt->rt6i_protocol;
1da177e4 4538
38308473 4539 if (rt->rt6i_flags & RTF_CACHE)
1da177e4
LT
4540 rtm->rtm_flags |= RTM_F_CLONED;
4541
4542 if (dst) {
930345ea 4543 if (nla_put_in6_addr(skb, RTA_DST, dst))
c78679e8 4544 goto nla_put_failure;
1ab1457c 4545 rtm->rtm_dst_len = 128;
1da177e4 4546 } else if (rtm->rtm_dst_len)
930345ea 4547 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
c78679e8 4548 goto nla_put_failure;
1da177e4
LT
4549#ifdef CONFIG_IPV6_SUBTREES
4550 if (src) {
930345ea 4551 if (nla_put_in6_addr(skb, RTA_SRC, src))
c78679e8 4552 goto nla_put_failure;
1ab1457c 4553 rtm->rtm_src_len = 128;
c78679e8 4554 } else if (rtm->rtm_src_len &&
930345ea 4555 nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
c78679e8 4556 goto nla_put_failure;
1da177e4 4557#endif
7bc570c8
YH
4558 if (iif) {
4559#ifdef CONFIG_IPV6_MROUTE
4560 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
fd61c6ba
DA
4561 int err = ip6mr_get_route(net, skb, rtm, portid);
4562
4563 if (err == 0)
4564 return 0;
4565 if (err < 0)
4566 goto nla_put_failure;
7bc570c8
YH
4567 } else
4568#endif
c78679e8
DM
4569 if (nla_put_u32(skb, RTA_IIF, iif))
4570 goto nla_put_failure;
7bc570c8 4571 } else if (dst) {
1da177e4 4572 struct in6_addr saddr_buf;
c78679e8 4573 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
930345ea 4574 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
c78679e8 4575 goto nla_put_failure;
1da177e4 4576 }
2d7202bf 4577
c3968a85
DW
4578 if (rt->rt6i_prefsrc.plen) {
4579 struct in6_addr saddr_buf;
4e3fd7a0 4580 saddr_buf = rt->rt6i_prefsrc.addr;
930345ea 4581 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
c78679e8 4582 goto nla_put_failure;
c3968a85
DW
4583 }
4584
4b32b5ad
MKL
4585 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
4586 if (rt->rt6i_pmtu)
4587 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
4588 if (rtnetlink_put_metrics(skb, metrics) < 0)
2d7202bf
TG
4589 goto nla_put_failure;
4590
c78679e8
DM
4591 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
4592 goto nla_put_failure;
8253947e 4593
beb1afac
DA
4594 /* For multipath routes, walk the siblings list and add
4595 * each as a nexthop within RTA_MULTIPATH.
4596 */
4597 if (rt->rt6i_nsiblings) {
4598 struct rt6_info *sibling, *next_sibling;
4599 struct nlattr *mp;
4600
4601 mp = nla_nest_start(skb, RTA_MULTIPATH);
4602 if (!mp)
4603 goto nla_put_failure;
4604
4605 if (rt6_add_nexthop(skb, rt) < 0)
4606 goto nla_put_failure;
4607
4608 list_for_each_entry_safe(sibling, next_sibling,
4609 &rt->rt6i_siblings, rt6i_siblings) {
4610 if (rt6_add_nexthop(skb, sibling) < 0)
4611 goto nla_put_failure;
4612 }
4613
4614 nla_nest_end(skb, mp);
4615 } else {
5be083ce 4616 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
beb1afac
DA
4617 goto nla_put_failure;
4618 }
4619
8253947e 4620 expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
69cdf8f9 4621
87a50699 4622 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
e3703b3d 4623 goto nla_put_failure;
2d7202bf 4624
c78ba6d6
LR
4625 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
4626 goto nla_put_failure;
4627
19e42e45 4628
053c095a
JB
4629 nlmsg_end(skb, nlh);
4630 return 0;
2d7202bf
TG
4631
4632nla_put_failure:
26932566
PM
4633 nlmsg_cancel(skb, nlh);
4634 return -EMSGSIZE;
1da177e4
LT
4635}
4636
1b43af54 4637int rt6_dump_route(struct rt6_info *rt, void *p_arg)
1da177e4
LT
4638{
4639 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
1f17e2f2
DA
4640 struct net *net = arg->net;
4641
4642 if (rt == net->ipv6.ip6_null_entry)
4643 return 0;
1da177e4 4644
2d7202bf
TG
4645 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4646 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
f8cfe2ce
DA
4647
4648 /* user wants prefix routes only */
4649 if (rtm->rtm_flags & RTM_F_PREFIX &&
4650 !(rt->rt6i_flags & RTF_PREFIX_RT)) {
4651 /* success since this is not a prefix route */
4652 return 1;
4653 }
4654 }
1da177e4 4655
1f17e2f2 4656 return rt6_fill_node(net,
191cd582 4657 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
15e47304 4658 NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
f8cfe2ce 4659 NLM_F_MULTI);
1da177e4
LT
4660}
4661
c21ef3e3
DA
4662static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4663 struct netlink_ext_ack *extack)
1da177e4 4664{
3b1e0a65 4665 struct net *net = sock_net(in_skb->sk);
ab364a6f 4666 struct nlattr *tb[RTA_MAX+1];
18c3a61c
RP
4667 int err, iif = 0, oif = 0;
4668 struct dst_entry *dst;
ab364a6f 4669 struct rt6_info *rt;
1da177e4 4670 struct sk_buff *skb;
ab364a6f 4671 struct rtmsg *rtm;
4c9483b2 4672 struct flowi6 fl6;
18c3a61c 4673 bool fibmatch;
1da177e4 4674
fceb6435 4675 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
c21ef3e3 4676 extack);
ab364a6f
TG
4677 if (err < 0)
4678 goto errout;
1da177e4 4679
ab364a6f 4680 err = -EINVAL;
4c9483b2 4681 memset(&fl6, 0, sizeof(fl6));
38b7097b
HFS
4682 rtm = nlmsg_data(nlh);
4683 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
18c3a61c 4684 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
1da177e4 4685
ab364a6f
TG
4686 if (tb[RTA_SRC]) {
4687 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4688 goto errout;
4689
4e3fd7a0 4690 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
ab364a6f
TG
4691 }
4692
4693 if (tb[RTA_DST]) {
4694 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4695 goto errout;
4696
4e3fd7a0 4697 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
ab364a6f
TG
4698 }
4699
4700 if (tb[RTA_IIF])
4701 iif = nla_get_u32(tb[RTA_IIF]);
4702
4703 if (tb[RTA_OIF])
72331bc0 4704 oif = nla_get_u32(tb[RTA_OIF]);
1da177e4 4705
2e47b291
LC
4706 if (tb[RTA_MARK])
4707 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4708
622ec2c9
LC
4709 if (tb[RTA_UID])
4710 fl6.flowi6_uid = make_kuid(current_user_ns(),
4711 nla_get_u32(tb[RTA_UID]));
4712 else
4713 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4714
1da177e4
LT
4715 if (iif) {
4716 struct net_device *dev;
72331bc0
SL
4717 int flags = 0;
4718
121622db
FW
4719 rcu_read_lock();
4720
4721 dev = dev_get_by_index_rcu(net, iif);
1da177e4 4722 if (!dev) {
121622db 4723 rcu_read_unlock();
1da177e4 4724 err = -ENODEV;
ab364a6f 4725 goto errout;
1da177e4 4726 }
72331bc0
SL
4727
4728 fl6.flowi6_iif = iif;
4729
4730 if (!ipv6_addr_any(&fl6.saddr))
4731 flags |= RT6_LOOKUP_F_HAS_SADDR;
4732
b75cc8f9 4733 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
121622db
FW
4734
4735 rcu_read_unlock();
72331bc0
SL
4736 } else {
4737 fl6.flowi6_oif = oif;
4738
58acfd71 4739 dst = ip6_route_output(net, NULL, &fl6);
18c3a61c
RP
4740 }
4741
18c3a61c
RP
4742
4743 rt = container_of(dst, struct rt6_info, dst);
4744 if (rt->dst.error) {
4745 err = rt->dst.error;
4746 ip6_rt_put(rt);
4747 goto errout;
1da177e4
LT
4748 }
4749
9d6acb3b
WC
4750 if (rt == net->ipv6.ip6_null_entry) {
4751 err = rt->dst.error;
4752 ip6_rt_put(rt);
4753 goto errout;
4754 }
4755
fba961ab
DM
4756 if (fibmatch && rt->from) {
4757 struct rt6_info *ort = rt->from;
58acfd71
IS
4758
4759 dst_hold(&ort->dst);
4760 ip6_rt_put(rt);
4761 rt = ort;
4762 }
4763
ab364a6f 4764 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
38308473 4765 if (!skb) {
94e187c0 4766 ip6_rt_put(rt);
ab364a6f
TG
4767 err = -ENOBUFS;
4768 goto errout;
4769 }
1da177e4 4770
d8d1f30b 4771 skb_dst_set(skb, &rt->dst);
18c3a61c
RP
4772 if (fibmatch)
4773 err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
4774 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4775 nlh->nlmsg_seq, 0);
4776 else
4777 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
4778 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
4779 nlh->nlmsg_seq, 0);
1da177e4 4780 if (err < 0) {
ab364a6f
TG
4781 kfree_skb(skb);
4782 goto errout;
1da177e4
LT
4783 }
4784
15e47304 4785 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
ab364a6f 4786errout:
1da177e4 4787 return err;
1da177e4
LT
4788}
4789
37a1d361
RP
4790void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
4791 unsigned int nlm_flags)
1da177e4
LT
4792{
4793 struct sk_buff *skb;
5578689a 4794 struct net *net = info->nl_net;
528c4ceb
DL
4795 u32 seq;
4796 int err;
4797
4798 err = -ENOBUFS;
38308473 4799 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
86872cb5 4800
19e42e45 4801 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
38308473 4802 if (!skb)
21713ebc
TG
4803 goto errout;
4804
191cd582 4805 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
f8cfe2ce 4806 event, info->portid, seq, nlm_flags);
26932566
PM
4807 if (err < 0) {
4808 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
4809 WARN_ON(err == -EMSGSIZE);
4810 kfree_skb(skb);
4811 goto errout;
4812 }
15e47304 4813 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
1ce85fe4
PNA
4814 info->nlh, gfp_any());
4815 return;
21713ebc
TG
4816errout:
4817 if (err < 0)
5578689a 4818 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
1da177e4
LT
4819}
4820
8ed67789 4821static int ip6_route_dev_notify(struct notifier_block *this,
351638e7 4822 unsigned long event, void *ptr)
8ed67789 4823{
351638e7 4824 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
c346dca1 4825 struct net *net = dev_net(dev);
8ed67789 4826
242d3a49
WC
4827 if (!(dev->flags & IFF_LOOPBACK))
4828 return NOTIFY_OK;
4829
4830 if (event == NETDEV_REGISTER) {
d8d1f30b 4831 net->ipv6.ip6_null_entry->dst.dev = dev;
8ed67789
DL
4832 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
4833#ifdef CONFIG_IPV6_MULTIPLE_TABLES
d8d1f30b 4834 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
8ed67789 4835 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
d8d1f30b 4836 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
8ed67789 4837 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
242d3a49 4838#endif
76da0704
WC
4839 } else if (event == NETDEV_UNREGISTER &&
4840 dev->reg_state != NETREG_UNREGISTERED) {
4841 /* NETDEV_UNREGISTER could be fired for multiple times by
4842 * netdev_wait_allrefs(). Make sure we only call this once.
4843 */
12d94a80 4844 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
242d3a49 4845#ifdef CONFIG_IPV6_MULTIPLE_TABLES
12d94a80
ED
4846 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
4847 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
8ed67789
DL
4848#endif
4849 }
4850
4851 return NOTIFY_OK;
4852}
4853
1da177e4
LT
4854/*
4855 * /proc
4856 */
4857
4858#ifdef CONFIG_PROC_FS
4859
/* File operations for the IPv6 route table proc file (seq_file based). */
static const struct file_operations ipv6_route_proc_fops = {
	.open		= ipv6_route_open,
	.read		= seq_read,
	.llseek		= seq_lseek,
	.release	= seq_release_net,
};
4866
1da177e4
LT
4867static int rt6_stats_seq_show(struct seq_file *seq, void *v)
4868{
69ddb805 4869 struct net *net = (struct net *)seq->private;
1da177e4 4870 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
69ddb805
DL
4871 net->ipv6.rt6_stats->fib_nodes,
4872 net->ipv6.rt6_stats->fib_route_nodes,
81eb8447 4873 atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
69ddb805
DL
4874 net->ipv6.rt6_stats->fib_rt_entries,
4875 net->ipv6.rt6_stats->fib_rt_cache,
fc66f95c 4876 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
69ddb805 4877 net->ipv6.rt6_stats->fib_discarded_routes);
1da177e4
LT
4878
4879 return 0;
4880}
4881
4882static int rt6_stats_seq_open(struct inode *inode, struct file *file)
4883{
de05c557 4884 return single_open_net(inode, file, rt6_stats_seq_show);
69ddb805
DL
4885}
4886
/* File operations for the IPv6 routing statistics proc file. */
static const struct file_operations rt6_stats_seq_fops = {
	.open	 = rt6_stats_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = single_release_net,
};
4893#endif /* CONFIG_PROC_FS */
4894
4895#ifdef CONFIG_SYSCTL
4896
1da177e4 4897static
fe2c6338 4898int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
1da177e4
LT
4899 void __user *buffer, size_t *lenp, loff_t *ppos)
4900{
c486da34
LAG
4901 struct net *net;
4902 int delay;
4903 if (!write)
1da177e4 4904 return -EINVAL;
c486da34
LAG
4905
4906 net = (struct net *)ctl->extra1;
4907 delay = net->ipv6.sysctl.flush_delay;
4908 proc_dointvec(ctl, write, buffer, lenp, ppos);
2ac3ac8f 4909 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
c486da34 4910 return 0;
1da177e4
LT
4911}
4912
/*
 * Template for the per-netns IPv6 route sysctl table.  The .data pointers
 * here refer to init_net (or the dst_ops template); ipv6_route_sysctl_init()
 * copies the table and re-points entries by index, so the order of the
 * entries below must stay in sync with that function.
 */
struct ctl_table ipv6_route_table_template[] = {
	{	/* [0] write-only trigger, see ipv6_sysctl_rtcache_flush() */
		.procname	=	"flush",
		.data		=	&init_net.ipv6.sysctl.flush_delay,
		.maxlen		=	sizeof(int),
		.mode		=	0200,
		.proc_handler	=	ipv6_sysctl_rtcache_flush
	},
	{	/* [1] */
		.procname	=	"gc_thresh",
		.data		=	&ip6_dst_ops_template.gc_thresh,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{	/* [2] */
		.procname	=	"max_size",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{	/* [3] */
		.procname	=	"gc_min_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{	/* [4] */
		.procname	=	"gc_timeout",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{	/* [5] */
		.procname	=	"gc_interval",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{	/* [6] */
		.procname	=	"gc_elasticity",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{	/* [7] */
		.procname	=	"mtu_expires",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_jiffies,
	},
	{	/* [8] */
		.procname	=	"min_adv_mss",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec,
	},
	{	/* [9] same variable as gc_min_interval, different handler */
		.procname	=	"gc_min_interval_ms",
		.data		=	&init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		=	sizeof(int),
		.mode		=	0644,
		.proc_handler	=	proc_dointvec_ms_jiffies,
	},
	{ }	/* terminator */
};
4986
2c8c1e72 4987struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
760f2d01
DL
4988{
4989 struct ctl_table *table;
4990
4991 table = kmemdup(ipv6_route_table_template,
4992 sizeof(ipv6_route_table_template),
4993 GFP_KERNEL);
5ee09105
YH
4994
4995 if (table) {
4996 table[0].data = &net->ipv6.sysctl.flush_delay;
c486da34 4997 table[0].extra1 = net;
86393e52 4998 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
5ee09105
YH
4999 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
5000 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
5001 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
5002 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
5003 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
5004 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
5005 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
9c69fabe 5006 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
464dc801
EB
5007
5008 /* Don't export sysctls to unprivileged users */
5009 if (net->user_ns != &init_user_ns)
5010 table[0].procname = NULL;
5ee09105
YH
5011 }
5012
760f2d01
DL
5013 return table;
5014}
1da177e4
LT
5015#endif
5016
2c8c1e72 5017static int __net_init ip6_route_net_init(struct net *net)
cdb18761 5018{
633d424b 5019 int ret = -ENOMEM;
8ed67789 5020
86393e52
AD
5021 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
5022 sizeof(net->ipv6.ip6_dst_ops));
f2fc6a54 5023
fc66f95c
ED
5024 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
5025 goto out_ip6_dst_ops;
5026
8ed67789
DL
5027 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
5028 sizeof(*net->ipv6.ip6_null_entry),
5029 GFP_KERNEL);
5030 if (!net->ipv6.ip6_null_entry)
fc66f95c 5031 goto out_ip6_dst_entries;
d8d1f30b 5032 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
62fa8a84
DM
5033 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
5034 ip6_template_metrics, true);
8ed67789
DL
5035
5036#ifdef CONFIG_IPV6_MULTIPLE_TABLES
feca7d8c 5037 net->ipv6.fib6_has_custom_rules = false;
8ed67789
DL
5038 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
5039 sizeof(*net->ipv6.ip6_prohibit_entry),
5040 GFP_KERNEL);
68fffc67
PZ
5041 if (!net->ipv6.ip6_prohibit_entry)
5042 goto out_ip6_null_entry;
d8d1f30b 5043 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
62fa8a84
DM
5044 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
5045 ip6_template_metrics, true);
8ed67789
DL
5046
5047 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
5048 sizeof(*net->ipv6.ip6_blk_hole_entry),
5049 GFP_KERNEL);
68fffc67
PZ
5050 if (!net->ipv6.ip6_blk_hole_entry)
5051 goto out_ip6_prohibit_entry;
d8d1f30b 5052 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
62fa8a84
DM
5053 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
5054 ip6_template_metrics, true);
8ed67789
DL
5055#endif
5056
b339a47c
PZ
5057 net->ipv6.sysctl.flush_delay = 0;
5058 net->ipv6.sysctl.ip6_rt_max_size = 4096;
5059 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
5060 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
5061 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
5062 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
5063 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
5064 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
5065
6891a346
BT
5066 net->ipv6.ip6_rt_gc_expire = 30*HZ;
5067
8ed67789
DL
5068 ret = 0;
5069out:
5070 return ret;
f2fc6a54 5071
68fffc67
PZ
5072#ifdef CONFIG_IPV6_MULTIPLE_TABLES
5073out_ip6_prohibit_entry:
5074 kfree(net->ipv6.ip6_prohibit_entry);
5075out_ip6_null_entry:
5076 kfree(net->ipv6.ip6_null_entry);
5077#endif
fc66f95c
ED
5078out_ip6_dst_entries:
5079 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
f2fc6a54 5080out_ip6_dst_ops:
f2fc6a54 5081 goto out;
cdb18761
DL
5082}
5083
2c8c1e72 5084static void __net_exit ip6_route_net_exit(struct net *net)
cdb18761 5085{
8ed67789
DL
5086 kfree(net->ipv6.ip6_null_entry);
5087#ifdef CONFIG_IPV6_MULTIPLE_TABLES
5088 kfree(net->ipv6.ip6_prohibit_entry);
5089 kfree(net->ipv6.ip6_blk_hole_entry);
5090#endif
41bb78b4 5091 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
cdb18761
DL
5092}
5093
d189634e
TG
5094static int __net_init ip6_route_net_init_late(struct net *net)
5095{
5096#ifdef CONFIG_PROC_FS
d4beaa66 5097 proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
d6444062 5098 proc_create("rt6_stats", 0444, net->proc_net, &rt6_stats_seq_fops);
d189634e
TG
5099#endif
5100 return 0;
5101}
5102
5103static void __net_exit ip6_route_net_exit_late(struct net *net)
5104{
5105#ifdef CONFIG_PROC_FS
ece31ffd
G
5106 remove_proc_entry("ipv6_route", net->proc_net);
5107 remove_proc_entry("rt6_stats", net->proc_net);
d189634e
TG
5108#endif
5109}
5110
cdb18761
DL
5111static struct pernet_operations ip6_route_net_ops = {
5112 .init = ip6_route_net_init,
5113 .exit = ip6_route_net_exit,
5114};
5115
c3426b47
DM
5116static int __net_init ipv6_inetpeer_init(struct net *net)
5117{
5118 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
5119
5120 if (!bp)
5121 return -ENOMEM;
5122 inet_peer_base_init(bp);
5123 net->ipv6.peers = bp;
5124 return 0;
5125}
5126
5127static void __net_exit ipv6_inetpeer_exit(struct net *net)
5128{
5129 struct inet_peer_base *bp = net->ipv6.peers;
5130
5131 net->ipv6.peers = NULL;
56a6b248 5132 inetpeer_invalidate_tree(bp);
c3426b47
DM
5133 kfree(bp);
5134}
5135
2b823f72 5136static struct pernet_operations ipv6_inetpeer_ops = {
c3426b47
DM
5137 .init = ipv6_inetpeer_init,
5138 .exit = ipv6_inetpeer_exit,
5139};
5140
d189634e
TG
5141static struct pernet_operations ip6_route_net_late_ops = {
5142 .init = ip6_route_net_init_late,
5143 .exit = ip6_route_net_exit_late,
5144};
5145
8ed67789
DL
5146static struct notifier_block ip6_route_dev_notifier = {
5147 .notifier_call = ip6_route_dev_notify,
242d3a49 5148 .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
8ed67789
DL
5149};
5150
2f460933
WC
5151void __init ip6_route_init_special_entries(void)
5152{
5153 /* Registering of the loopback is done before this portion of code,
5154 * the loopback reference in rt6_info will not be taken, do it
5155 * manually for init_net */
5156 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
5157 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5158 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
5159 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
5160 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5161 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
5162 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
5163 #endif
5164}
5165
433d49c3 5166int __init ip6_route_init(void)
1da177e4 5167{
433d49c3 5168 int ret;
8d0b94af 5169 int cpu;
433d49c3 5170
9a7ec3a9
DL
5171 ret = -ENOMEM;
5172 ip6_dst_ops_template.kmem_cachep =
e5d679f3 5173 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
f845ab6b 5174 SLAB_HWCACHE_ALIGN, NULL);
9a7ec3a9 5175 if (!ip6_dst_ops_template.kmem_cachep)
c19a28e1 5176 goto out;
14e50e57 5177
fc66f95c 5178 ret = dst_entries_init(&ip6_dst_blackhole_ops);
8ed67789 5179 if (ret)
bdb3289f 5180 goto out_kmem_cache;
bdb3289f 5181
c3426b47
DM
5182 ret = register_pernet_subsys(&ipv6_inetpeer_ops);
5183 if (ret)
e8803b6c 5184 goto out_dst_entries;
2a0c451a 5185
7e52b33b
DM
5186 ret = register_pernet_subsys(&ip6_route_net_ops);
5187 if (ret)
5188 goto out_register_inetpeer;
c3426b47 5189
5dc121e9
AE
5190 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
5191
e8803b6c 5192 ret = fib6_init();
433d49c3 5193 if (ret)
8ed67789 5194 goto out_register_subsys;
433d49c3 5195
433d49c3
DL
5196 ret = xfrm6_init();
5197 if (ret)
e8803b6c 5198 goto out_fib6_init;
c35b7e72 5199
433d49c3
DL
5200 ret = fib6_rules_init();
5201 if (ret)
5202 goto xfrm6_init;
7e5449c2 5203
d189634e
TG
5204 ret = register_pernet_subsys(&ip6_route_net_late_ops);
5205 if (ret)
5206 goto fib6_rules_init;
5207
16feebcf
FW
5208 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
5209 inet6_rtm_newroute, NULL, 0);
5210 if (ret < 0)
5211 goto out_register_late_subsys;
5212
5213 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
5214 inet6_rtm_delroute, NULL, 0);
5215 if (ret < 0)
5216 goto out_register_late_subsys;
5217
5218 ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
5219 inet6_rtm_getroute, NULL,
5220 RTNL_FLAG_DOIT_UNLOCKED);
5221 if (ret < 0)
d189634e 5222 goto out_register_late_subsys;
c127ea2c 5223
8ed67789 5224 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
cdb18761 5225 if (ret)
d189634e 5226 goto out_register_late_subsys;
8ed67789 5227
8d0b94af
MKL
5228 for_each_possible_cpu(cpu) {
5229 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
5230
5231 INIT_LIST_HEAD(&ul->head);
5232 spin_lock_init(&ul->lock);
5233 }
5234
433d49c3
DL
5235out:
5236 return ret;
5237
d189634e 5238out_register_late_subsys:
16feebcf 5239 rtnl_unregister_all(PF_INET6);
d189634e 5240 unregister_pernet_subsys(&ip6_route_net_late_ops);
433d49c3 5241fib6_rules_init:
433d49c3
DL
5242 fib6_rules_cleanup();
5243xfrm6_init:
433d49c3 5244 xfrm6_fini();
2a0c451a
TG
5245out_fib6_init:
5246 fib6_gc_cleanup();
8ed67789
DL
5247out_register_subsys:
5248 unregister_pernet_subsys(&ip6_route_net_ops);
7e52b33b
DM
5249out_register_inetpeer:
5250 unregister_pernet_subsys(&ipv6_inetpeer_ops);
fc66f95c
ED
5251out_dst_entries:
5252 dst_entries_destroy(&ip6_dst_blackhole_ops);
433d49c3 5253out_kmem_cache:
f2fc6a54 5254 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
433d49c3 5255 goto out;
1da177e4
LT
5256}
5257
5258void ip6_route_cleanup(void)
5259{
8ed67789 5260 unregister_netdevice_notifier(&ip6_route_dev_notifier);
d189634e 5261 unregister_pernet_subsys(&ip6_route_net_late_ops);
101367c2 5262 fib6_rules_cleanup();
1da177e4 5263 xfrm6_fini();
1da177e4 5264 fib6_gc_cleanup();
c3426b47 5265 unregister_pernet_subsys(&ipv6_inetpeer_ops);
8ed67789 5266 unregister_pernet_subsys(&ip6_route_net_ops);
41bb78b4 5267 dst_entries_destroy(&ip6_dst_blackhole_ops);
f2fc6a54 5268 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
1da177e4 5269}