net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #define pr_fmt(fmt) "IPv4: " fmt
  66
  67 #include <linux/module.h>
  68 #include <linux/uaccess.h>
  69 #include <linux/bitops.h>
  70 #include <linux/types.h>
  71 #include <linux/kernel.h>
  72 #include <linux/mm.h>
  73 #include <linux/string.h>
  74 #include <linux/socket.h>
  75 #include <linux/sockios.h>
  76 #include <linux/errno.h>
  77 #include <linux/in.h>
  78 #include <linux/inet.h>
  79 #include <linux/netdevice.h>
  80 #include <linux/proc_fs.h>
  81 #include <linux/init.h>
  82 #include <linux/skbuff.h>
  83 #include <linux/inetdevice.h>
  84 #include <linux/igmp.h>
  85 #include <linux/pkt_sched.h>
  86 #include <linux/mroute.h>
  87 #include <linux/netfilter_ipv4.h>
  88 #include <linux/random.h>
  89 #include <linux/rcupdate.h>
  90 #include <linux/times.h>
  91 #include <linux/slab.h>
  92 #include <linux/jhash.h>
  93 #include <net/dst.h>
  94 #include <net/dst_metadata.h>
  95 #include <net/net_namespace.h>
  96 #include <net/protocol.h>
  97 #include <net/ip.h>
  98 #include <net/route.h>
  99 #include <net/inetpeer.h>
 100 #include <net/sock.h>
 101 #include <net/ip_fib.h>
 102 #include <net/arp.h>
 103 #include <net/tcp.h>
 104 #include <net/icmp.h>
 105 #include <net/xfrm.h>
 106 #include <net/lwtunnel.h>
 107 #include <net/netevent.h>
 108 #include <net/rtnetlink.h>
 109 #ifdef CONFIG_SYSCTL
 110 #include <linux/sysctl.h>
 111 #endif
 112 #include <net/secure_seq.h>
 113 #include <net/ip_tunnels.h>
 114 #include <net/l3mdev.h>
 115
 116 #include "fib_lookup.h"
 117
 118 #define RT_FL_TOS(oldflp4) \
 119         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 120
 121 #define RT_GC_TIMEOUT (300*HZ)
 122
 123 static int ip_rt_max_size;
 124 static int ip_rt_redirect_number __read_mostly  = 9;
 125 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 126 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 127 static int ip_rt_error_cost __read_mostly       = HZ;
 128 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 129 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 130 static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 131 static int ip_rt_min_advmss __read_mostly       = 256;
 132
 133 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 134
 135 /*
 136  *      Interface to generic destination cache.
 137  */
 138
 139 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 140 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 141 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 142 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 143 static void              ipv4_link_failure(struct sk_buff *skb);
 144 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 145                                            struct sk_buff *skb, u32 mtu);
 146 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 147                                         struct sk_buff *skb);
 148 static void             ipv4_dst_destroy(struct dst_entry *dst);
 149
 150 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 151 {
 152         WARN_ON(1);
 153         return NULL;
 154 }
 155
 156 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 157                                            struct sk_buff *skb,
 158                                            const void *daddr);
 159 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
 160
 161 static struct dst_ops ipv4_dst_ops = {
 162         .family =               AF_INET,
 163         .check =                ipv4_dst_check,
 164         .default_advmss =       ipv4_default_advmss,
 165         .mtu =                  ipv4_mtu,
 166         .cow_metrics =          ipv4_cow_metrics,
 167         .destroy =              ipv4_dst_destroy,
 168         .negative_advice =      ipv4_negative_advice,
 169         .link_failure =         ipv4_link_failure,
 170         .update_pmtu =          ip_rt_update_pmtu,
 171         .redirect =             ip_do_redirect,
 172         .local_out =            __ip_local_out,
 173         .neigh_lookup =         ipv4_neigh_lookup,
 174         .confirm_neigh =        ipv4_confirm_neigh,
 175 };
 176
 177 #define ECN_OR_COST(class)      TC_PRIO_##class
 178
 179 const __u8 ip_tos2prio[16] = {
 180         TC_PRIO_BESTEFFORT,
 181         ECN_OR_COST(BESTEFFORT),
 182         TC_PRIO_BESTEFFORT,
 183         ECN_OR_COST(BESTEFFORT),
 184         TC_PRIO_BULK,
 185         ECN_OR_COST(BULK),
 186         TC_PRIO_BULK,
 187         ECN_OR_COST(BULK),
 188         TC_PRIO_INTERACTIVE,
 189         ECN_OR_COST(INTERACTIVE),
 190         TC_PRIO_INTERACTIVE,
 191         ECN_OR_COST(INTERACTIVE),
 192         TC_PRIO_INTERACTIVE_BULK,
 193         ECN_OR_COST(INTERACTIVE_BULK),
 194         TC_PRIO_INTERACTIVE_BULK,
 195         ECN_OR_COST(INTERACTIVE_BULK)
 196 };
 197 EXPORT_SYMBOL(ip_tos2prio);
 198
 199 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 200 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 201
 202 #ifdef CONFIG_PROC_FS
 203 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 204 {
 205         if (*pos)
 206                 return NULL;
 207         return SEQ_START_TOKEN;
 208 }
 209
 210 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 211 {
 212         ++*pos;
 213         return NULL;
 214 }
 215
 216 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 217 {
 218 }
 219
 220 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 221 {
 222         if (v == SEQ_START_TOKEN)
 223                 seq_printf(seq, "%-127s\n",
 224                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 225                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 226                            "HHUptod\tSpecDst");
 227         return 0;
 228 }
 229
 230 static const struct seq_operations rt_cache_seq_ops = {
 231         .start  = rt_cache_seq_start,
 232         .next   = rt_cache_seq_next,
 233         .stop   = rt_cache_seq_stop,
 234         .show   = rt_cache_seq_show,
 235 };
 236
 237 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 238 {
 239         return seq_open(file, &rt_cache_seq_ops);
 240 }
 241
 242 static const struct file_operations rt_cache_seq_fops = {
 243         .open    = rt_cache_seq_open,
 244         .read    = seq_read,
 245         .llseek  = seq_lseek,
 246         .release = seq_release,
 247 };
 248
 249
 250 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 251 {
 252         int cpu;
 253
 254         if (*pos == 0)
 255                 return SEQ_START_TOKEN;
 256
 257         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 258                 if (!cpu_possible(cpu))
 259                         continue;
 260                 *pos = cpu+1;
 261                 return &per_cpu(rt_cache_stat, cpu);
 262         }
 263         return NULL;
 264 }
 265
 266 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 267 {
 268         int cpu;
 269
 270         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 271                 if (!cpu_possible(cpu))
 272                         continue;
 273                 *pos = cpu+1;
 274                 return &per_cpu(rt_cache_stat, cpu);
 275         }
 276         return NULL;
 277
 278 }
 279
 280 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 281 {
 282
 283 }
 284
 285 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 286 {
 287         struct rt_cache_stat *st = v;
 288
 289         if (v == SEQ_START_TOKEN) {
 290                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 291                 return 0;
 292         }
 293
 294         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 295                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 296                    dst_entries_get_slow(&ipv4_dst_ops),
 297                    0, /* st->in_hit */
 298                    st->in_slow_tot,
 299                    st->in_slow_mc,
 300                    st->in_no_route,
 301                    st->in_brd,
 302                    st->in_martian_dst,
 303                    st->in_martian_src,
 304
 305                    0, /* st->out_hit */
 306                    st->out_slow_tot,
 307                    st->out_slow_mc,
 308
 309                    0, /* st->gc_total */
 310                    0, /* st->gc_ignored */
 311                    0, /* st->gc_goal_miss */
 312                    0, /* st->gc_dst_overflow */
 313                    0, /* st->in_hlist_search */
 314                    0  /* st->out_hlist_search */
 315                 );
 316         return 0;
 317 }
 318
 319 static const struct seq_operations rt_cpu_seq_ops = {
 320         .start  = rt_cpu_seq_start,
 321         .next   = rt_cpu_seq_next,
 322         .stop   = rt_cpu_seq_stop,
 323         .show   = rt_cpu_seq_show,
 324 };
 325
 326
 327 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 328 {
 329         return seq_open(file, &rt_cpu_seq_ops);
 330 }
 331
 332 static const struct file_operations rt_cpu_seq_fops = {
 333         .open    = rt_cpu_seq_open,
 334         .read    = seq_read,
 335         .llseek  = seq_lseek,
 336         .release = seq_release,
 337 };
 338
 339 #ifdef CONFIG_IP_ROUTE_CLASSID
 340 static int rt_acct_proc_show(struct seq_file *m, void *v)
 341 {
 342         struct ip_rt_acct *dst, *src;
 343         unsigned int i, j;
 344
 345         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 346         if (!dst)
 347                 return -ENOMEM;
 348
 349         for_each_possible_cpu(i) {
 350                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 351                 for (j = 0; j < 256; j++) {
 352                         dst[j].o_bytes   += src[j].o_bytes;
 353                         dst[j].o_packets += src[j].o_packets;
 354                         dst[j].i_bytes   += src[j].i_bytes;
 355                         dst[j].i_packets += src[j].i_packets;
 356                 }
 357         }
 358
 359         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 360         kfree(dst);
 361         return 0;
 362 }
 363 #endif
 364
 365 static int __net_init ip_rt_do_proc_init(struct net *net)
 366 {
 367         struct proc_dir_entry *pde;
 368
 369         pde = proc_create("rt_cache", 0444, net->proc_net,
 370                           &rt_cache_seq_fops);
 371         if (!pde)
 372                 goto err1;
 373
 374         pde = proc_create("rt_cache", 0444,
 375                           net->proc_net_stat, &rt_cpu_seq_fops);
 376         if (!pde)
 377                 goto err2;
 378
 379 #ifdef CONFIG_IP_ROUTE_CLASSID
 380         pde = proc_create_single("rt_acct", 0, net->proc_net,
 381                         rt_acct_proc_show);
 382         if (!pde)
 383                 goto err3;
 384 #endif
 385         return 0;
 386
 387 #ifdef CONFIG_IP_ROUTE_CLASSID
 388 err3:
 389         remove_proc_entry("rt_cache", net->proc_net_stat);
 390 #endif
 391 err2:
 392         remove_proc_entry("rt_cache", net->proc_net);
 393 err1:
 394         return -ENOMEM;
 395 }
 396
 397 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 398 {
 399         remove_proc_entry("rt_cache", net->proc_net_stat);
 400         remove_proc_entry("rt_cache", net->proc_net);
 401 #ifdef CONFIG_IP_ROUTE_CLASSID
 402         remove_proc_entry("rt_acct", net->proc_net);
 403 #endif
 404 }
 405
 406 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 407         .init = ip_rt_do_proc_init,
 408         .exit = ip_rt_do_proc_exit,
 409 };
 410
 411 static int __init ip_rt_proc_init(void)
 412 {
 413         return register_pernet_subsys(&ip_rt_proc_ops);
 414 }
 415
 416 #else
 417 static inline int ip_rt_proc_init(void)
 418 {
 419         return 0;
 420 }
 421 #endif /* CONFIG_PROC_FS */
 422
 423 static inline bool rt_is_expired(const struct rtable *rth)
 424 {
 425         return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 426 }
 427
 428 void rt_cache_flush(struct net *net)
 429 {
 430         rt_genid_bump_ipv4(net);
 431 }
 432
 433 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 434                                            struct sk_buff *skb,
 435                                            const void *daddr)
 436 {
 437         struct net_device *dev = dst->dev;
 438         const __be32 *pkey = daddr;
 439         const struct rtable *rt;
 440         struct neighbour *n;
 441
 442         rt = (const struct rtable *) dst;
 443         if (rt->rt_gateway)
 444                 pkey = (const __be32 *) &rt->rt_gateway;
 445         else if (skb)
 446                 pkey = &ip_hdr(skb)->daddr;
 447
 448         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
 449         if (n)
 450                 return n;
 451         return neigh_create(&arp_tbl, pkey, dev);
 452 }
 453
 454 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 455 {
 456         struct net_device *dev = dst->dev;
 457         const __be32 *pkey = daddr;
 458         const struct rtable *rt;
 459
 460         rt = (const struct rtable *)dst;
 461         if (rt->rt_gateway)
 462                 pkey = (const __be32 *)&rt->rt_gateway;
 463         else if (!daddr ||
 464                  (rt->rt_flags &
 465                   (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
 466                 return;
 467
 468         __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
 469 }
 470
 471 #define IP_IDENTS_SZ 2048u
 472
 473 static atomic_t *ip_idents __read_mostly;
 474 static u32 *ip_tstamps __read_mostly;
 475
 476 /* In order to protect privacy, we add a perturbation to identifiers
 477  * if one generator is seldom used. This makes hard for an attacker
 478  * to infer how many packets were sent between two points in time.
 479  */
 480 u32 ip_idents_reserve(u32 hash, int segs)
 481 {
 482         u32 *p_tstamp = ip_tstamps + hash % IP_IDENTS_SZ;
 483         atomic_t *p_id = ip_idents + hash % IP_IDENTS_SZ;
 484         u32 old = READ_ONCE(*p_tstamp);
 485         u32 now = (u32)jiffies;
 486         u32 new, delta = 0;
 487
 488         if (old != now && cmpxchg(p_tstamp, old, now) == old)
 489                 delta = prandom_u32_max(now - old);
 490
 491         /* Do not use atomic_add_return() as it makes UBSAN unhappy */
 492         do {
 493                 old = (u32)atomic_read(p_id);
 494                 new = old + delta + segs;
 495         } while (atomic_cmpxchg(p_id, old, new) != old);
 496
 497         return new - segs;
 498 }
 499 EXPORT_SYMBOL(ip_idents_reserve);
 500
 501 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 502 {
 503         static u32 ip_idents_hashrnd __read_mostly;
 504         u32 hash, id;
 505
 506         net_get_random_once(&ip_idents_hashrnd, sizeof(ip_idents_hashrnd));
 507
 508         hash = jhash_3words((__force u32)iph->daddr,
 509                             (__force u32)iph->saddr,
 510                             iph->protocol ^ net_hash_mix(net),
 511                             ip_idents_hashrnd);
 512         id = ip_idents_reserve(hash, segs);
 513         iph->id = htons(id);
 514 }
 515 EXPORT_SYMBOL(__ip_select_ident);
 516
 517 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
 518                              const struct sock *sk,
 519                              const struct iphdr *iph,
 520                              int oif, u8 tos,
 521                              u8 prot, u32 mark, int flow_flags)
 522 {
 523         if (sk) {
 524                 const struct inet_sock *inet = inet_sk(sk);
 525
 526                 oif = sk->sk_bound_dev_if;
 527                 mark = sk->sk_mark;
 528                 tos = RT_CONN_FLAGS(sk);
 529                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 530         }
 531         flowi4_init_output(fl4, oif, mark, tos,
 532                            RT_SCOPE_UNIVERSE, prot,
 533                            flow_flags,
 534                            iph->daddr, iph->saddr, 0, 0,
 535                            sock_net_uid(net, sk));
 536 }
 537
 538 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 539                                const struct sock *sk)
 540 {
 541         const struct net *net = dev_net(skb->dev);
 542         const struct iphdr *iph = ip_hdr(skb);
 543         int oif = skb->dev->ifindex;
 544         u8 tos = RT_TOS(iph->tos);
 545         u8 prot = iph->protocol;
 546         u32 mark = skb->mark;
 547
 548         __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
 549 }
 550
 551 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 552 {
 553         const struct inet_sock *inet = inet_sk(sk);
 554         const struct ip_options_rcu *inet_opt;
 555         __be32 daddr = inet->inet_daddr;
 556
 557         rcu_read_lock();
 558         inet_opt = rcu_dereference(inet->inet_opt);
 559         if (inet_opt && inet_opt->opt.srr)
 560                 daddr = inet_opt->opt.faddr;
 561         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 562                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 563                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 564                            inet_sk_flowi_flags(sk),
 565                            daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
 566         rcu_read_unlock();
 567 }
 568
 569 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 570                                  const struct sk_buff *skb)
 571 {
 572         if (skb)
 573                 build_skb_flow_key(fl4, skb, sk);
 574         else
 575                 build_sk_flow_key(fl4, sk);
 576 }
 577
 578 static DEFINE_SPINLOCK(fnhe_lock);
 579
 580 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 581 {
 582         struct rtable *rt;
 583
 584         rt = rcu_dereference(fnhe->fnhe_rth_input);
 585         if (rt) {
 586                 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 587                 dst_dev_put(&rt->dst);
 588                 dst_release(&rt->dst);
 589         }
 590         rt = rcu_dereference(fnhe->fnhe_rth_output);
 591         if (rt) {
 592                 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 593                 dst_dev_put(&rt->dst);
 594                 dst_release(&rt->dst);
 595         }
 596 }
 597
 598 static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
 599 {
 600         struct fib_nh_exception *fnhe, *oldest;
 601
 602         oldest = rcu_dereference(hash->chain);
 603         for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
 604              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 605                 if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
 606                         oldest = fnhe;
 607         }
 608         fnhe_flush_routes(oldest);
 609         return oldest;
 610 }
 611
 612 static inline u32 fnhe_hashfun(__be32 daddr)
 613 {
 614         static u32 fnhe_hashrnd __read_mostly;
 615         u32 hval;
 616
 617         net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
 618         hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
 619         return hash_32(hval, FNHE_HASH_SHIFT);
 620 }
 621
 622 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 623 {
 624         rt->rt_pmtu = fnhe->fnhe_pmtu;
 625         rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
 626         rt->dst.expires = fnhe->fnhe_expires;
 627
 628         if (fnhe->fnhe_gw) {
 629                 rt->rt_flags |= RTCF_REDIRECTED;
 630                 rt->rt_gateway = fnhe->fnhe_gw;
 631                 rt->rt_uses_gateway = 1;
 632         }
 633 }
 634
 635 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 636                                   u32 pmtu, bool lock, unsigned long expires)
 637 {
 638         struct fnhe_hash_bucket *hash;
 639         struct fib_nh_exception *fnhe;
 640         struct rtable *rt;
 641         u32 genid, hval;
 642         unsigned int i;
 643         int depth;
 644
 645         genid = fnhe_genid(dev_net(nh->nh_dev));
 646         hval = fnhe_hashfun(daddr);
 647
 648         spin_lock_bh(&fnhe_lock);
 649
 650         hash = rcu_dereference(nh->nh_exceptions);
 651         if (!hash) {
 652                 hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
 653                 if (!hash)
 654                         goto out_unlock;
 655                 rcu_assign_pointer(nh->nh_exceptions, hash);
 656         }
 657
 658         hash += hval;
 659
 660         depth = 0;
 661         for (fnhe = rcu_dereference(hash->chain); fnhe;
 662              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 663                 if (fnhe->fnhe_daddr == daddr)
 664                         break;
 665                 depth++;
 666         }
 667
 668         if (fnhe) {
 669                 if (fnhe->fnhe_genid != genid)
 670                         fnhe->fnhe_genid = genid;
 671                 if (gw)
 672                         fnhe->fnhe_gw = gw;
 673                 if (pmtu) {
 674                         fnhe->fnhe_pmtu = pmtu;
 675                         fnhe->fnhe_mtu_locked = lock;
 676                 }
 677                 fnhe->fnhe_expires = max(1UL, expires);
 678                 /* Update all cached dsts too */
 679                 rt = rcu_dereference(fnhe->fnhe_rth_input);
 680                 if (rt)
 681                         fill_route_from_fnhe(rt, fnhe);
 682                 rt = rcu_dereference(fnhe->fnhe_rth_output);
 683                 if (rt)
 684                         fill_route_from_fnhe(rt, fnhe);
 685         } else {
 686                 if (depth > FNHE_RECLAIM_DEPTH)
 687                         fnhe = fnhe_oldest(hash);
 688                 else {
 689                         fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 690                         if (!fnhe)
 691                                 goto out_unlock;
 692
 693                         fnhe->fnhe_next = hash->chain;
 694                         rcu_assign_pointer(hash->chain, fnhe);
 695                 }
 696                 fnhe->fnhe_genid = genid;
 697                 fnhe->fnhe_daddr = daddr;
 698                 fnhe->fnhe_gw = gw;
 699                 fnhe->fnhe_pmtu = pmtu;
 700                 fnhe->fnhe_mtu_locked = lock;
 701                 fnhe->fnhe_expires = max(1UL, expires);
 702
 703                 /* Exception created; mark the cached routes for the nexthop
 704                  * stale, so anyone caching it rechecks if this exception
 705                  * applies to them.
 706                  */
 707                 rt = rcu_dereference(nh->nh_rth_input);
 708                 if (rt)
 709                         rt->dst.obsolete = DST_OBSOLETE_KILL;
 710
 711                 for_each_possible_cpu(i) {
 712                         struct rtable __rcu **prt;
 713                         prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
 714                         rt = rcu_dereference(*prt);
 715                         if (rt)
 716                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 717                 }
 718         }
 719
 720         fnhe->fnhe_stamp = jiffies;
 721
 722 out_unlock:
 723         spin_unlock_bh(&fnhe_lock);
 724 }
 725
 726 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 727                              bool kill_route)
 728 {
 729         __be32 new_gw = icmp_hdr(skb)->un.gateway;
 730         __be32 old_gw = ip_hdr(skb)->saddr;
 731         struct net_device *dev = skb->dev;
 732         struct in_device *in_dev;
 733         struct fib_result res;
 734         struct neighbour *n;
 735         struct net *net;
 736
 737         switch (icmp_hdr(skb)->code & 7) {
 738         case ICMP_REDIR_NET:
 739         case ICMP_REDIR_NETTOS:
 740         case ICMP_REDIR_HOST:
 741         case ICMP_REDIR_HOSTTOS:
 742                 break;
 743
 744         default:
 745                 return;
 746         }
 747
 748         if (rt->rt_gateway != old_gw)
 749                 return;
 750
 751         in_dev = __in_dev_get_rcu(dev);
 752         if (!in_dev)
 753                 return;
 754
 755         net = dev_net(dev);
 756         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 757             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 758             ipv4_is_zeronet(new_gw))
 759                 goto reject_redirect;
 760
 761         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 762                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 763                         goto reject_redirect;
 764                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 765                         goto reject_redirect;
 766         } else {
 767                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 768                         goto reject_redirect;
 769         }
 770
 771         n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
 772         if (!n)
 773                 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
 774         if (!IS_ERR(n)) {
 775                 if (!(n->nud_state & NUD_VALID)) {
 776                         neigh_event_send(n, NULL);
 777                 } else {
 778                         if (fib_lookup(net, fl4, &res, 0) == 0) {
 779                                 struct fib_nh *nh = &FIB_RES_NH(res);
 780
 781                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
 782                                                 0, false,
 783                                                 jiffies + ip_rt_gc_timeout);
 784                         }
 785                         if (kill_route)
 786                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 787                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 788                 }
 789                 neigh_release(n);
 790         }
 791         return;
 792
 793 reject_redirect:
 794 #ifdef CONFIG_IP_ROUTE_VERBOSE
 795         if (IN_DEV_LOG_MARTIANS(in_dev)) {
 796                 const struct iphdr *iph = (const struct iphdr *) skb->data;
 797                 __be32 daddr = iph->daddr;
 798                 __be32 saddr = iph->saddr;
 799
 800                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 801                                      "  Advised path = %pI4 -> %pI4\n",
 802                                      &old_gw, dev->name, &new_gw,
 803                                      &saddr, &daddr);
 804         }
 805 #endif
 806         ;
 807 }
 808
 809 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 810 {
 811         struct rtable *rt;
 812         struct flowi4 fl4;
 813         const struct iphdr *iph = (const struct iphdr *) skb->data;
 814         struct net *net = dev_net(skb->dev);
 815         int oif = skb->dev->ifindex;
 816         u8 tos = RT_TOS(iph->tos);
 817         u8 prot = iph->protocol;
 818         u32 mark = skb->mark;
 819
 820         rt = (struct rtable *) dst;
 821
 822         __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
 823         __ip_do_redirect(rt, skb, &fl4, true);
 824 }
 825
 826 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 827 {
 828         struct rtable *rt = (struct rtable *)dst;
 829         struct dst_entry *ret = dst;
 830
 831         if (rt) {
 832                 if (dst->obsolete > 0) {
 833                         ip_rt_put(rt);
 834                         ret = NULL;
 835                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 836                            rt->dst.expires) {
 837                         ip_rt_put(rt);
 838                         ret = NULL;
 839                 }
 840         }
 841         return ret;
 842 }
 843
 844 /*
 845  * Algorithm:
 846  *      1. The first ip_rt_redirect_number redirects are sent
 847  *         with exponential backoff, then we stop sending them at all,
 848  *         assuming that the host ignores our redirects.
 849  *      2. If we did not see packets requiring redirects
 850  *         during ip_rt_redirect_silence, we assume that the host
 851  *         forgot redirected route and start to send redirects again.
 852  *
 853  * This algorithm is much cheaper and more intelligent than dumb load limiting
 854  * in icmp.c.
 855  *
 856  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 857  * and "frag. need" (breaks PMTU discovery) in icmp.c.
 858  */
 859
 860 void ip_rt_send_redirect(struct sk_buff *skb)
 861 {
 862         struct rtable *rt = skb_rtable(skb);
 863         struct in_device *in_dev;
 864         struct inet_peer *peer;
 865         struct net *net;
 866         int log_martians;
 867         int vif;
 868
 869         rcu_read_lock();
 870         in_dev = __in_dev_get_rcu(rt->dst.dev);
 871         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 872                 rcu_read_unlock();
 873                 return;
 874         }
 875         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 876         vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
 877         rcu_read_unlock();
 878
 879         net = dev_net(rt->dst.dev);
 880         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
 881         if (!peer) {
 882                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 883                           rt_nexthop(rt, ip_hdr(skb)->daddr));
 884                 return;
 885         }
 886
 887         /* No redirected packets during ip_rt_redirect_silence;
 888          * reset the algorithm.
 889          */
 890         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence))
 891                 peer->rate_tokens = 0;
 892
 893         /* Too many ignored redirects; do not send anything
 894          * set dst.rate_last to the last seen redirected packet.
 895          */
 896         if (peer->rate_tokens >= ip_rt_redirect_number) {
 897                 peer->rate_last = jiffies;
 898                 goto out_put_peer;
 899         }
 900
 901         /* Check for load limit; set rate_last to the latest sent
 902          * redirect.
 903          */
 904         if (peer->rate_tokens == 0 ||
 905             time_after(jiffies,
 906                        (peer->rate_last +
 907                         (ip_rt_redirect_load << peer->rate_tokens)))) {
 908                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 909
 910                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 911                 peer->rate_last = jiffies;
 912                 ++peer->rate_tokens;
 913 #ifdef CONFIG_IP_ROUTE_VERBOSE
 914                 if (log_martians &&
 915                     peer->rate_tokens == ip_rt_redirect_number)
 916                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 917                                              &ip_hdr(skb)->saddr, inet_iif(skb),
 918                                              &ip_hdr(skb)->daddr, &gw);
 919 #endif
 920         }
 921 out_put_peer:
 922         inet_putpeer(peer);
 923 }
 924
 925 static int ip_error(struct sk_buff *skb)
 926 {
 927         struct rtable *rt = skb_rtable(skb);
 928         struct net_device *dev = skb->dev;
 929         struct in_device *in_dev;
 930         struct inet_peer *peer;
 931         unsigned long now;
 932         struct net *net;
 933         bool send;
 934         int code;
 935
 936         if (netif_is_l3_master(skb->dev)) {
 937                 dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
 938                 if (!dev)
 939                         goto out;
 940         }
 941
 942         in_dev = __in_dev_get_rcu(dev);
 943
 944         /* IP on this device is disabled. */
 945         if (!in_dev)
 946                 goto out;
 947
 948         net = dev_net(rt->dst.dev);
 949         if (!IN_DEV_FORWARD(in_dev)) {
 950                 switch (rt->dst.error) {
 951                 case EHOSTUNREACH:
 952                         __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
 953                         break;
 954
 955                 case ENETUNREACH:
 956                         __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 957                         break;
 958                 }
 959                 goto out;
 960         }
 961
 962         switch (rt->dst.error) {
 963         case EINVAL:
 964         default:
 965                 goto out;
 966         case EHOSTUNREACH:
 967                 code = ICMP_HOST_UNREACH;
 968                 break;
 969         case ENETUNREACH:
 970                 code = ICMP_NET_UNREACH;
 971                 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 972                 break;
 973         case EACCES:
 974                 code = ICMP_PKT_FILTERED;
 975                 break;
 976         }
 977
 978         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
 979                                l3mdev_master_ifindex(skb->dev), 1);
 980
 981         send = true;
 982         if (peer) {
 983                 now = jiffies;
 984                 peer->rate_tokens += now - peer->rate_last;
 985                 if (peer->rate_tokens > ip_rt_error_burst)
 986                         peer->rate_tokens = ip_rt_error_burst;
 987                 peer->rate_last = now;
 988                 if (peer->rate_tokens >= ip_rt_error_cost)
 989                         peer->rate_tokens -= ip_rt_error_cost;
 990                 else
 991                         send = false;
 992                 inet_putpeer(peer);
 993         }
 994         if (send)
 995                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
 996
 997 out:    kfree_skb(skb);
 998         return 0;
 999 }
1000
1001 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1002 {
1003         struct dst_entry *dst = &rt->dst;
1004         u32 old_mtu = ipv4_mtu(dst);
1005         struct fib_result res;
1006         bool lock = false;
1007
1008         if (ip_mtu_locked(dst))
1009                 return;
1010
1011         if (old_mtu < mtu)
1012                 return;
1013
1014         if (mtu < ip_rt_min_pmtu) {
1015                 lock = true;
1016                 mtu = min(old_mtu, ip_rt_min_pmtu);
1017         }
1018
1019         if (rt->rt_pmtu == mtu && !lock &&
1020             time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1021                 return;
1022
1023         rcu_read_lock();
1024         if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1025                 struct fib_nh *nh = &FIB_RES_NH(res);
1026
1027                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
1028                                       jiffies + ip_rt_mtu_expires);
1029         }
1030         rcu_read_unlock();
1031 }
1032
1033 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1034                               struct sk_buff *skb, u32 mtu)
1035 {
1036         struct rtable *rt = (struct rtable *) dst;
1037         struct flowi4 fl4;
1038
1039         ip_rt_build_flow_key(&fl4, sk, skb);
1040         __ip_rt_update_pmtu(rt, &fl4, mtu);
1041 }
1042
1043 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1044                       int oif, u8 protocol)
1045 {
1046         const struct iphdr *iph = (const struct iphdr *) skb->data;
1047         struct flowi4 fl4;
1048         struct rtable *rt;
1049         u32 mark = IP4_REPLY_MARK(net, skb->mark);
1050
1051         __build_flow_key(net, &fl4, NULL, iph, oif,
1052                          RT_TOS(iph->tos), protocol, mark, 0);
1053         rt = __ip_route_output_key(net, &fl4);
1054         if (!IS_ERR(rt)) {
1055                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1056                 ip_rt_put(rt);
1057         }
1058 }
1059 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1060
1061 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1062 {
1063         const struct iphdr *iph = (const struct iphdr *) skb->data;
1064         struct flowi4 fl4;
1065         struct rtable *rt;
1066
1067         __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1068
1069         if (!fl4.flowi4_mark)
1070                 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1071
1072         rt = __ip_route_output_key(sock_net(sk), &fl4);
1073         if (!IS_ERR(rt)) {
1074                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1075                 ip_rt_put(rt);
1076         }
1077 }
1078
1079 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1080 {
1081         const struct iphdr *iph = (const struct iphdr *) skb->data;
1082         struct flowi4 fl4;
1083         struct rtable *rt;
1084         struct dst_entry *odst = NULL;
1085         bool new = false;
1086         struct net *net = sock_net(sk);
1087
1088         bh_lock_sock(sk);
1089
1090         if (!ip_sk_accept_pmtu(sk))
1091                 goto out;
1092
1093         odst = sk_dst_get(sk);
1094
1095         if (sock_owned_by_user(sk) || !odst) {
1096                 __ipv4_sk_update_pmtu(skb, sk, mtu);
1097                 goto out;
1098         }
1099
1100         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1101
1102         rt = (struct rtable *)odst;
1103         if (odst->obsolete && !odst->ops->check(odst, 0)) {
1104                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1105                 if (IS_ERR(rt))
1106                         goto out;
1107
1108                 new = true;
1109         }
1110
1111         __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
1112
1113         if (!dst_check(&rt->dst, 0)) {
1114                 if (new)
1115                         dst_release(&rt->dst);
1116
1117                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1118                 if (IS_ERR(rt))
1119                         goto out;
1120
1121                 new = true;
1122         }
1123
1124         if (new)
1125                 sk_dst_set(sk, &rt->dst);
1126
1127 out:
1128         bh_unlock_sock(sk);
1129         dst_release(odst);
1130 }
1131 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1132
1133 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1134                    int oif, u8 protocol)
1135 {
1136         const struct iphdr *iph = (const struct iphdr *) skb->data;
1137         struct flowi4 fl4;
1138         struct rtable *rt;
1139
1140         __build_flow_key(net, &fl4, NULL, iph, oif,
1141                          RT_TOS(iph->tos), protocol, 0, 0);
1142         rt = __ip_route_output_key(net, &fl4);
1143         if (!IS_ERR(rt)) {
1144                 __ip_do_redirect(rt, skb, &fl4, false);
1145                 ip_rt_put(rt);
1146         }
1147 }
1148 EXPORT_SYMBOL_GPL(ipv4_redirect);
1149
1150 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1151 {
1152         const struct iphdr *iph = (const struct iphdr *) skb->data;
1153         struct flowi4 fl4;
1154         struct rtable *rt;
1155         struct net *net = sock_net(sk);
1156
1157         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1158         rt = __ip_route_output_key(net, &fl4);
1159         if (!IS_ERR(rt)) {
1160                 __ip_do_redirect(rt, skb, &fl4, false);
1161                 ip_rt_put(rt);
1162         }
1163 }
1164 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1165
1166 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1167 {
1168         struct rtable *rt = (struct rtable *) dst;
1169
1170         /* All IPV4 dsts are created with ->obsolete set to the value
1171          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1172          * into this function always.
1173          *
1174          * When a PMTU/redirect information update invalidates a route,
1175          * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1176          * DST_OBSOLETE_DEAD by dst_free().
1177          */
1178         if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1179                 return NULL;
1180         return dst;
1181 }
1182
1183 static void ipv4_link_failure(struct sk_buff *skb)
1184 {
1185         struct rtable *rt;
1186
1187         icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0);
1188
1189         rt = skb_rtable(skb);
1190         if (rt)
1191                 dst_set_expires(&rt->dst, 0);
1192 }
1193
1194 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1195 {
1196         pr_debug("%s: %pI4 -> %pI4, %s\n",
1197                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1198                  skb->dev ? skb->dev->name : "?");
1199         kfree_skb(skb);
1200         WARN_ON(1);
1201         return 0;
1202 }
1203
1204 /*
1205    We do not cache source address of outgoing interface,
1206    because it is used only by IP RR, TS and SRR options,
1207    so that it out of fast path.
1208
1209    BTW remember: "addr" is allowed to be not aligned
1210    in IP options!
1211  */
1212
1213 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1214 {
1215         __be32 src;
1216
1217         if (rt_is_output_route(rt))
1218                 src = ip_hdr(skb)->saddr;
1219         else {
1220                 struct fib_result res;
1221                 struct iphdr *iph = ip_hdr(skb);
1222                 struct flowi4 fl4 = {
1223                         .daddr = iph->daddr,
1224                         .saddr = iph->saddr,
1225                         .flowi4_tos = RT_TOS(iph->tos),
1226                         .flowi4_oif = rt->dst.dev->ifindex,
1227                         .flowi4_iif = skb->dev->ifindex,
1228                         .flowi4_mark = skb->mark,
1229                 };
1230
1231                 rcu_read_lock();
1232                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1233                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1234                 else
1235                         src = inet_select_addr(rt->dst.dev,
1236                                                rt_nexthop(rt, iph->daddr),
1237                                                RT_SCOPE_UNIVERSE);
1238                 rcu_read_unlock();
1239         }
1240         memcpy(addr, &src, 4);
1241 }
1242
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Fill in the halves of rt->dst.tclassid that are still zero from @tag.
 * The low and the high 16 bits are handled independently; a half that is
 * already non-zero is left untouched.
 */
static void set_class_tag(struct rtable *rt, u32 tag)
{
        const u32 low_half = 0xFFFF;
        const u32 high_half = 0xFFFF0000;

        if ((rt->dst.tclassid & low_half) == 0)
                rt->dst.tclassid |= tag & low_half;

        if ((rt->dst.tclassid & high_half) == 0)
                rt->dst.tclassid |= tag & high_half;
}
#endif
1252
1253 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1254 {
1255         unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1256         unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1257                                     ip_rt_min_advmss);
1258
1259         return min(advmss, IPV4_MAX_PMTU - header_size);
1260 }
1261
1262 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1263 {
1264         const struct rtable *rt = (const struct rtable *) dst;
1265         unsigned int mtu = rt->rt_pmtu;
1266
1267         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1268                 mtu = dst_metric_raw(dst, RTAX_MTU);
1269
1270         if (mtu)
1271                 return mtu;
1272
1273         mtu = READ_ONCE(dst->dev->mtu);
1274
1275         if (unlikely(ip_mtu_locked(dst))) {
1276                 if (rt->rt_uses_gateway && mtu > 576)
1277                         mtu = 576;
1278         }
1279
1280         mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1281
1282         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1283 }
1284
1285 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1286 {
1287         struct fnhe_hash_bucket *hash;
1288         struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1289         u32 hval = fnhe_hashfun(daddr);
1290
1291         spin_lock_bh(&fnhe_lock);
1292
1293         hash = rcu_dereference_protected(nh->nh_exceptions,
1294                                          lockdep_is_held(&fnhe_lock));
1295         hash += hval;
1296
1297         fnhe_p = &hash->chain;
1298         fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1299         while (fnhe) {
1300                 if (fnhe->fnhe_daddr == daddr) {
1301                         rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1302                                 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1303                         fnhe_flush_routes(fnhe);
1304                         kfree_rcu(fnhe, rcu);
1305                         break;
1306                 }
1307                 fnhe_p = &fnhe->fnhe_next;
1308                 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1309                                                  lockdep_is_held(&fnhe_lock));
1310         }
1311
1312         spin_unlock_bh(&fnhe_lock);
1313 }
1314
1315 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1316 {
1317         struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1318         struct fib_nh_exception *fnhe;
1319         u32 hval;
1320
1321         if (!hash)
1322                 return NULL;
1323
1324         hval = fnhe_hashfun(daddr);
1325
1326         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1327              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1328                 if (fnhe->fnhe_daddr == daddr) {
1329                         if (fnhe->fnhe_expires &&
1330                             time_after(jiffies, fnhe->fnhe_expires)) {
1331                                 ip_del_fnhe(nh, daddr);
1332                                 break;
1333                         }
1334                         return fnhe;
1335                 }
1336         }
1337         return NULL;
1338 }
1339
1340 /* MTU selection:
1341  * 1. mtu on route is locked - use it
1342  * 2. mtu from nexthop exception
1343  * 3. mtu from egress device
1344  */
1345
1346 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1347 {
1348         struct fib_info *fi = res->fi;
1349         struct fib_nh *nh = &fi->fib_nh[res->nh_sel];
1350         struct net_device *dev = nh->nh_dev;
1351         u32 mtu = 0;
1352
1353         if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
1354             fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1355                 mtu = fi->fib_mtu;
1356
1357         if (likely(!mtu)) {
1358                 struct fib_nh_exception *fnhe;
1359
1360                 fnhe = find_exception(nh, daddr);
1361                 if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1362                         mtu = fnhe->fnhe_pmtu;
1363         }
1364
1365         if (likely(!mtu))
1366                 mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1367
1368         return mtu - lwtunnel_headroom(nh->nh_lwtstate, mtu);
1369 }
1370
1371 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1372                               __be32 daddr, const bool do_cache)
1373 {
1374         bool ret = false;
1375
1376         spin_lock_bh(&fnhe_lock);
1377
1378         if (daddr == fnhe->fnhe_daddr) {
1379                 struct rtable __rcu **porig;
1380                 struct rtable *orig;
1381                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1382
1383                 if (rt_is_input_route(rt))
1384                         porig = &fnhe->fnhe_rth_input;
1385                 else
1386                         porig = &fnhe->fnhe_rth_output;
1387                 orig = rcu_dereference(*porig);
1388
1389                 if (fnhe->fnhe_genid != genid) {
1390                         fnhe->fnhe_genid = genid;
1391                         fnhe->fnhe_gw = 0;
1392                         fnhe->fnhe_pmtu = 0;
1393                         fnhe->fnhe_expires = 0;
1394                         fnhe->fnhe_mtu_locked = false;
1395                         fnhe_flush_routes(fnhe);
1396                         orig = NULL;
1397                 }
1398                 fill_route_from_fnhe(rt, fnhe);
1399                 if (!rt->rt_gateway)
1400                         rt->rt_gateway = daddr;
1401
1402                 if (do_cache) {
1403                         dst_hold(&rt->dst);
1404                         rcu_assign_pointer(*porig, rt);
1405                         if (orig) {
1406                                 dst_dev_put(&orig->dst);
1407                                 dst_release(&orig->dst);
1408                         }
1409                         ret = true;
1410                 }
1411
1412                 fnhe->fnhe_stamp = jiffies;
1413         }
1414         spin_unlock_bh(&fnhe_lock);
1415
1416         return ret;
1417 }
1418
1419 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1420 {
1421         struct rtable *orig, *prev, **p;
1422         bool ret = true;
1423
1424         if (rt_is_input_route(rt)) {
1425                 p = (struct rtable **)&nh->nh_rth_input;
1426         } else {
1427                 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1428         }
1429         orig = *p;
1430
1431         /* hold dst before doing cmpxchg() to avoid race condition
1432          * on this dst
1433          */
1434         dst_hold(&rt->dst);
1435         prev = cmpxchg(p, orig, rt);
1436         if (prev == orig) {
1437                 if (orig) {
1438                         dst_dev_put(&orig->dst);
1439                         dst_release(&orig->dst);
1440                 }
1441         } else {
1442                 dst_release(&rt->dst);
1443                 ret = false;
1444         }
1445
1446         return ret;
1447 }
1448
1449 struct uncached_list {    /* a lock-protected list of rtables; one instance per CPU (see below) */
1450         spinlock_t              lock;    /* taken with spin_lock_bh() by the add/del/flush helpers in this file */
1451         struct list_head        head;    /* entries are linked through rtable->rt_uncached */
1452 };
1453
1454 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);    /* per-CPU instances; rt_flush_dev() walks all of them */
1455
1456 void rt_add_uncached_list(struct rtable *rt)
1457 {
1458         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1459
1460         rt->rt_uncached_list = ul;
1461
1462         spin_lock_bh(&ul->lock);
1463         list_add_tail(&rt->rt_uncached, &ul->head);
1464         spin_unlock_bh(&ul->lock);
1465 }
1466
1467 void rt_del_uncached_list(struct rtable *rt)
1468 {
1469         if (!list_empty(&rt->rt_uncached)) {
1470                 struct uncached_list *ul = rt->rt_uncached_list;
1471
1472                 spin_lock_bh(&ul->lock);
1473                 list_del(&rt->rt_uncached);
1474                 spin_unlock_bh(&ul->lock);
1475         }
1476 }
1477
/* Destroy hook for an IPv4 dst: release its metrics and take the route
 * off the uncached list if it is on one.
 */
static void ipv4_dst_destroy(struct dst_entry *dst)
{
        ip_dst_metrics_put(dst);
        rt_del_uncached_list((struct rtable *)dst);
}
1485
1486 void rt_flush_dev(struct net_device *dev)
1487 {
1488         struct net *net = dev_net(dev);
1489         struct rtable *rt;
1490         int cpu;
1491
1492         for_each_possible_cpu(cpu) {
1493                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1494
1495                 spin_lock_bh(&ul->lock);
1496                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1497                         if (rt->dst.dev != dev)
1498                                 continue;
1499                         rt->dst.dev = net->loopback_dev;
1500                         dev_hold(rt->dst.dev);
1501                         dev_put(dev);
1502                 }
1503                 spin_unlock_bh(&ul->lock);
1504         }
1505 }
1506
1507 static bool rt_cache_valid(const struct rtable *rt)
1508 {
1509         return  rt &&
1510                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1511                 !rt_is_expired(rt);
1512 }
1513
1514 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1515                            const struct fib_result *res,
1516                            struct fib_nh_exception *fnhe,
1517                            struct fib_info *fi, u16 type, u32 itag,
1518                            const bool do_cache)
1519 {
1520         bool cached = false;
1521
1522         if (fi) {
1523                 struct fib_nh *nh = &FIB_RES_NH(*res);
1524
1525                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1526                         rt->rt_gateway = nh->nh_gw;
1527                         rt->rt_uses_gateway = 1;
1528                 }
1529                 ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1530
1531 #ifdef CONFIG_IP_ROUTE_CLASSID
1532                 rt->dst.tclassid = nh->nh_tclassid;
1533 #endif
1534                 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1535                 if (unlikely(fnhe))
1536                         cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1537                 else if (do_cache)
1538                         cached = rt_cache_route(nh, rt);
1539                 if (unlikely(!cached)) {
1540                         /* Routes we intend to cache in nexthop exception or
1541                          * FIB nexthop have the DST_NOCACHE bit clear.
1542                          * However, if we are unsuccessful at storing this
1543                          * route into the cache we really need to set it.
1544                          */
1545                         if (!rt->rt_gateway)
1546                                 rt->rt_gateway = daddr;
1547                         rt_add_uncached_list(rt);
1548                 }
1549         } else
1550                 rt_add_uncached_list(rt);
1551
1552 #ifdef CONFIG_IP_ROUTE_CLASSID
1553 #ifdef CONFIG_IP_MULTIPLE_TABLES
1554         set_class_tag(rt, res->tclassid);
1555 #endif
1556         set_class_tag(rt, itag);
1557 #endif
1558 }
1559
1560 struct rtable *rt_dst_alloc(struct net_device *dev,
1561                             unsigned int flags, u16 type,
1562                             bool nopolicy, bool noxfrm, bool will_cache)
1563 {
1564         struct rtable *rt;
1565
1566         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1567                        (will_cache ? 0 : DST_HOST) |
1568                        (nopolicy ? DST_NOPOLICY : 0) |
1569                        (noxfrm ? DST_NOXFRM : 0));
1570
1571         if (rt) {
1572                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1573                 rt->rt_flags = flags;
1574                 rt->rt_type = type;
1575                 rt->rt_is_input = 0;
1576                 rt->rt_iif = 0;
1577                 rt->rt_pmtu = 0;
1578                 rt->rt_mtu_locked = 0;
1579                 rt->rt_gateway = 0;
1580                 rt->rt_uses_gateway = 0;
1581                 INIT_LIST_HEAD(&rt->rt_uncached);
1582
1583                 rt->dst.output = ip_output;
1584                 if (flags & RTCF_LOCAL)
1585                         rt->dst.input = ip_local_deliver;
1586         }
1587
1588         return rt;
1589 }
1590 EXPORT_SYMBOL(rt_dst_alloc);
1591
1592 /* called in rcu_read_lock() section */
1593 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1594                           u8 tos, struct net_device *dev,
1595                           struct in_device *in_dev, u32 *itag)
1596 {
1597         int err;
1598
1599         /* Primary sanity checks. */
1600         if (!in_dev)
1601                 return -EINVAL;
1602
1603         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1604             skb->protocol != htons(ETH_P_IP))
1605                 return -EINVAL;
1606
1607         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1608                 return -EINVAL;
1609
1610         if (ipv4_is_zeronet(saddr)) {
1611                 if (!ipv4_is_local_multicast(daddr))
1612                         return -EINVAL;
1613         } else {
1614                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1615                                           in_dev, itag);
1616                 if (err < 0)
1617                         return err;
1618         }
1619         return 0;
1620 }
1621
1622 /* called in rcu_read_lock() section */
1623 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1624                              u8 tos, struct net_device *dev, int our)
1625 {
1626         struct in_device *in_dev = __in_dev_get_rcu(dev);
1627         unsigned int flags = RTCF_MULTICAST;
1628         struct rtable *rth;
1629         u32 itag = 0;
1630         int err;
1631
1632         err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1633         if (err)
1634                 return err;
1635
1636         if (our)
1637                 flags |= RTCF_LOCAL;
1638
1639         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1640                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1641         if (!rth)
1642                 return -ENOBUFS;
1643
1644 #ifdef CONFIG_IP_ROUTE_CLASSID
1645         rth->dst.tclassid = itag;
1646 #endif
1647         rth->dst.output = ip_rt_bug;
1648         rth->rt_is_input= 1;
1649
1650 #ifdef CONFIG_IP_MROUTE
1651         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1652                 rth->dst.input = ip_mr_input;
1653 #endif
1654         RT_CACHE_STAT_INC(in_slow_mc);
1655
1656         skb_dst_set(skb, &rth->dst);
1657         return 0;
1658 }
1659
1660
1661 static void ip_handle_martian_source(struct net_device *dev,
1662                                      struct in_device *in_dev,
1663                                      struct sk_buff *skb,
1664                                      __be32 daddr,
1665                                      __be32 saddr)
1666 {
1667         RT_CACHE_STAT_INC(in_martian_src);
1668 #ifdef CONFIG_IP_ROUTE_VERBOSE
1669         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1670                 /*
1671                  *      RFC1812 recommendation, if source is martian,
1672                  *      the only hint is MAC header.
1673                  */
1674                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1675                         &daddr, &saddr, dev->name);
1676                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1677                         print_hex_dump(KERN_WARNING, "ll header: ",
1678                                        DUMP_PREFIX_OFFSET, 16, 1,
1679                                        skb_mac_header(skb),
1680                                        dev->hard_header_len, false);
1681                 }
1682         }
1683 #endif
1684 }
1685
1686 /* called in rcu_read_lock() section */
1687 static int __mkroute_input(struct sk_buff *skb,
1688                            const struct fib_result *res,
1689                            struct in_device *in_dev,
1690                            __be32 daddr, __be32 saddr, u32 tos)
1691 {
1692         struct fib_nh_exception *fnhe;
1693         struct rtable *rth;
1694         int err;
1695         struct in_device *out_dev;
1696         bool do_cache;
1697         u32 itag = 0;
1698
1699         /* get a working reference to the output device */
1700         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1701         if (!out_dev) {
1702                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1703                 return -EINVAL;
1704         }
1705
1706         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1707                                   in_dev->dev, in_dev, &itag);
1708         if (err < 0) {
1709                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1710                                          saddr);
1711
1712                 goto cleanup;
1713         }
1714
1715         do_cache = res->fi && !itag;
1716         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1717             skb->protocol == htons(ETH_P_IP) &&
1718             (IN_DEV_SHARED_MEDIA(out_dev) ||
1719              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1720                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1721
1722         if (skb->protocol != htons(ETH_P_IP)) {
1723                 /* Not IP (i.e. ARP). Do not create route, if it is
1724                  * invalid for proxy arp. DNAT routes are always valid.
1725                  *
1726                  * Proxy arp feature have been extended to allow, ARP
1727                  * replies back to the same interface, to support
1728                  * Private VLAN switch technologies. See arp.c.
1729                  */
1730                 if (out_dev == in_dev &&
1731                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1732                         err = -EINVAL;
1733                         goto cleanup;
1734                 }
1735         }
1736
1737         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1738         if (do_cache) {
1739                 if (fnhe)
1740                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1741                 else
1742                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1743                 if (rt_cache_valid(rth)) {
1744                         skb_dst_set_noref(skb, &rth->dst);
1745                         goto out;
1746                 }
1747         }
1748
1749         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1750                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1751                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1752         if (!rth) {
1753                 err = -ENOBUFS;
1754                 goto cleanup;
1755         }
1756
1757         rth->rt_is_input = 1;
1758         RT_CACHE_STAT_INC(in_slow_tot);
1759
1760         rth->dst.input = ip_forward;
1761
1762         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1763                        do_cache);
1764         lwtunnel_set_redirect(&rth->dst);
1765         skb_dst_set(skb, &rth->dst);
1766 out:
1767         err = 0;
1768  cleanup:
1769         return err;
1770 }
1771
1772 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1773 /* To make ICMP packets follow the right flow, the multipath hash is
1774  * calculated from the inner IP addresses.
1775  */
1776 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1777                                  struct flow_keys *hash_keys)
1778 {
1779         const struct iphdr *outer_iph = ip_hdr(skb);
1780         const struct iphdr *key_iph = outer_iph;
1781         const struct iphdr *inner_iph;
1782         const struct icmphdr *icmph;
1783         struct iphdr _inner_iph;
1784         struct icmphdr _icmph;
1785
1786         if (likely(outer_iph->protocol != IPPROTO_ICMP))
1787                 goto out;
1788
1789         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1790                 goto out;
1791
1792         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1793                                    &_icmph);
1794         if (!icmph)
1795                 goto out;
1796
1797         if (icmph->type != ICMP_DEST_UNREACH &&
1798             icmph->type != ICMP_REDIRECT &&
1799             icmph->type != ICMP_TIME_EXCEEDED &&
1800             icmph->type != ICMP_PARAMETERPROB)
1801                 goto out;
1802
1803         inner_iph = skb_header_pointer(skb,
1804                                        outer_iph->ihl * 4 + sizeof(_icmph),
1805                                        sizeof(_inner_iph), &_inner_iph);
1806         if (!inner_iph)
1807                 goto out;
1808
1809         key_iph = inner_iph;
1810 out:
1811         hash_keys->addrs.v4addrs.src = key_iph->saddr;
1812         hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1813 }
1814
1815 /* if skb is set it will be used and fl4 can be NULL */
1816 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1817                        const struct sk_buff *skb, struct flow_keys *flkeys)
1818 {
1819         struct flow_keys hash_keys;
1820         u32 mhash;
1821
1822         switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1823         case 0:
1824                 memset(&hash_keys, 0, sizeof(hash_keys));
1825                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1826                 if (skb) {
1827                         ip_multipath_l3_keys(skb, &hash_keys);
1828                 } else {
1829                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1830                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1831                 }
1832                 break;
1833         case 1:
1834                 /* skb is currently provided only when forwarding */
1835                 if (skb) {
1836                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1837                         struct flow_keys keys;
1838
1839                         /* short-circuit if we already have L4 hash present */
1840                         if (skb->l4_hash)
1841                                 return skb_get_hash_raw(skb) >> 1;
1842
1843                         memset(&hash_keys, 0, sizeof(hash_keys));
1844
1845                         if (!flkeys) {
1846                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
1847                                 flkeys = &keys;
1848                         }
1849
1850                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1851                         hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1852                         hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1853                         hash_keys.ports.src = flkeys->ports.src;
1854                         hash_keys.ports.dst = flkeys->ports.dst;
1855                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1856                 } else {
1857                         memset(&hash_keys, 0, sizeof(hash_keys));
1858                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1859                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1860                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1861                         hash_keys.ports.src = fl4->fl4_sport;
1862                         hash_keys.ports.dst = fl4->fl4_dport;
1863                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
1864                 }
1865                 break;
1866         }
1867         mhash = flow_hash_from_keys(&hash_keys);
1868
1869         return mhash >> 1;
1870 }
1871 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1872
1873 static int ip_mkroute_input(struct sk_buff *skb,
1874                             struct fib_result *res,
1875                             struct in_device *in_dev,
1876                             __be32 daddr, __be32 saddr, u32 tos,
1877                             struct flow_keys *hkeys)
1878 {
1879 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1880         if (res->fi && res->fi->fib_nhs > 1) {
1881                 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
1882
1883                 fib_select_multipath(res, h);
1884         }
1885 #endif
1886
1887         /* create a routing cache entry */
1888         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1889 }
1890
1891 /*
1892  *      NOTE. We drop all the packets that has local source
1893  *      addresses, because every properly looped back packet
1894  *      must have correct destination already attached by output routine.
1895  *
1896  *      Such approach solves two big problems:
1897  *      1. Not simplex devices are handled properly.
1898  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1899  *      called with rcu_read_lock()
1900  */
1901
1902 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1903                                u8 tos, struct net_device *dev,
1904                                struct fib_result *res)
1905 {
1906         struct in_device *in_dev = __in_dev_get_rcu(dev);
1907         struct flow_keys *flkeys = NULL, _flkeys;
1908         struct net    *net = dev_net(dev);
1909         struct ip_tunnel_info *tun_info;
1910         int             err = -EINVAL;
1911         unsigned int    flags = 0;
1912         u32             itag = 0;
1913         struct rtable   *rth;
1914         struct flowi4   fl4;
1915         bool do_cache;
1916
1917         /* IP on this device is disabled. */
1918
1919         if (!in_dev)
1920                 goto out;
1921
1922         /* Check for the most weird martians, which can be not detected
1923            by fib_lookup.
1924          */
1925
1926         tun_info = skb_tunnel_info(skb);
1927         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1928                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1929         else
1930                 fl4.flowi4_tun_key.tun_id = 0;
1931         skb_dst_drop(skb);
1932
1933         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1934                 goto martian_source;
1935
1936         res->fi = NULL;
1937         res->table = NULL;
1938         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1939                 goto brd_input;
1940
1941         /* Accept zero addresses only to limited broadcast;
1942          * I even do not know to fix it or not. Waiting for complains :-)
1943          */
1944         if (ipv4_is_zeronet(saddr))
1945                 goto martian_source;
1946
1947         if (ipv4_is_zeronet(daddr))
1948                 goto martian_destination;
1949
1950         /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1951          * and call it once if daddr or/and saddr are loopback addresses
1952          */
1953         if (ipv4_is_loopback(daddr)) {
1954                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1955                         goto martian_destination;
1956         } else if (ipv4_is_loopback(saddr)) {
1957                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1958                         goto martian_source;
1959         }
1960
1961         /*
1962          *      Now we are ready to route packet.
1963          */
1964         fl4.flowi4_oif = 0;
1965         fl4.flowi4_iif = dev->ifindex;
1966         fl4.flowi4_mark = skb->mark;
1967         fl4.flowi4_tos = tos;
1968         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1969         fl4.flowi4_flags = 0;
1970         fl4.daddr = daddr;
1971         fl4.saddr = saddr;
1972         fl4.flowi4_uid = sock_net_uid(net, NULL);
1973
1974         if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
1975                 flkeys = &_flkeys;
1976         } else {
1977                 fl4.flowi4_proto = 0;
1978                 fl4.fl4_sport = 0;
1979                 fl4.fl4_dport = 0;
1980         }
1981
1982         err = fib_lookup(net, &fl4, res, 0);
1983         if (err != 0) {
1984                 if (!IN_DEV_FORWARD(in_dev))
1985                         err = -EHOSTUNREACH;
1986                 goto no_route;
1987         }
1988
1989         if (res->type == RTN_BROADCAST) {
1990                 if (IN_DEV_BFORWARD(in_dev))
1991                         goto make_route;
1992                 goto brd_input;
1993         }
1994
1995         if (res->type == RTN_LOCAL) {
1996                 err = fib_validate_source(skb, saddr, daddr, tos,
1997                                           0, dev, in_dev, &itag);
1998                 if (err < 0)
1999                         goto martian_source;
2000                 goto local_input;
2001         }
2002
2003         if (!IN_DEV_FORWARD(in_dev)) {
2004                 err = -EHOSTUNREACH;
2005                 goto no_route;
2006         }
2007         if (res->type != RTN_UNICAST)
2008                 goto martian_destination;
2009
2010 make_route:
2011         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2012 out:    return err;
2013
2014 brd_input:
2015         if (skb->protocol != htons(ETH_P_IP))
2016                 goto e_inval;
2017
2018         if (!ipv4_is_zeronet(saddr)) {
2019                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2020                                           in_dev, &itag);
2021                 if (err < 0)
2022                         goto martian_source;
2023         }
2024         flags |= RTCF_BROADCAST;
2025         res->type = RTN_BROADCAST;
2026         RT_CACHE_STAT_INC(in_brd);
2027
2028 local_input:
2029         do_cache = false;
2030         if (res->fi) {
2031                 if (!itag) {
2032                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
2033                         if (rt_cache_valid(rth)) {
2034                                 skb_dst_set_noref(skb, &rth->dst);
2035                                 err = 0;
2036                                 goto out;
2037                         }
2038                         do_cache = true;
2039                 }
2040         }
2041
2042         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2043                            flags | RTCF_LOCAL, res->type,
2044                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2045         if (!rth)
2046                 goto e_nobufs;
2047
2048         rth->dst.output= ip_rt_bug;
2049 #ifdef CONFIG_IP_ROUTE_CLASSID
2050         rth->dst.tclassid = itag;
2051 #endif
2052         rth->rt_is_input = 1;
2053
2054         RT_CACHE_STAT_INC(in_slow_tot);
2055         if (res->type == RTN_UNREACHABLE) {
2056                 rth->dst.input= ip_error;
2057                 rth->dst.error= -err;
2058                 rth->rt_flags   &= ~RTCF_LOCAL;
2059         }
2060
2061         if (do_cache) {
2062                 struct fib_nh *nh = &FIB_RES_NH(*res);
2063
2064                 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2065                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2066                         WARN_ON(rth->dst.input == lwtunnel_input);
2067                         rth->dst.lwtstate->orig_input = rth->dst.input;
2068                         rth->dst.input = lwtunnel_input;
2069                 }
2070
2071                 if (unlikely(!rt_cache_route(nh, rth)))
2072                         rt_add_uncached_list(rth);
2073         }
2074         skb_dst_set(skb, &rth->dst);
2075         err = 0;
2076         goto out;
2077
2078 no_route:
2079         RT_CACHE_STAT_INC(in_no_route);
2080         res->type = RTN_UNREACHABLE;
2081         res->fi = NULL;
2082         res->table = NULL;
2083         goto local_input;
2084
2085         /*
2086          *      Do not cache martian addresses: they should be logged (RFC1812)
2087          */
2088 martian_destination:
2089         RT_CACHE_STAT_INC(in_martian_dst);
2090 #ifdef CONFIG_IP_ROUTE_VERBOSE
2091         if (IN_DEV_LOG_MARTIANS(in_dev))
2092                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2093                                      &daddr, &saddr, dev->name);
2094 #endif
2095
2096 e_inval:
2097         err = -EINVAL;
2098         goto out;
2099
2100 e_nobufs:
2101         err = -ENOBUFS;
2102         goto out;
2103
2104 martian_source:
2105         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2106         goto out;
2107 }
2108
2109 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2110                          u8 tos, struct net_device *dev)
2111 {
2112         struct fib_result res;
2113         int err;
2114
2115         tos &= IPTOS_RT_MASK;
2116         rcu_read_lock();
2117         err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2118         rcu_read_unlock();
2119
2120         return err;
2121 }
2122 EXPORT_SYMBOL(ip_route_input_noref);
2123
2124 /* called with rcu_read_lock held */
2125 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2126                        u8 tos, struct net_device *dev, struct fib_result *res)
2127 {
2128         /* Multicast recognition logic is moved from route cache to here.
2129            The problem was that too many Ethernet cards have broken/missing
2130            hardware multicast filters :-( As result the host on multicasting
2131            network acquires a lot of useless route cache entries, sort of
2132            SDR messages from all the world. Now we try to get rid of them.
2133            Really, provided software IP multicast filter is organized
2134            reasonably (at least, hashed), it does not result in a slowdown
2135            comparing with route cache reject entries.
2136            Note, that multicast routers are not affected, because
2137            route cache entry is created eventually.
2138          */
2139         if (ipv4_is_multicast(daddr)) {
2140                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2141                 int our = 0;
2142                 int err = -EINVAL;
2143
2144                 if (in_dev)
2145                         our = ip_check_mc_rcu(in_dev, daddr, saddr,
2146                                               ip_hdr(skb)->protocol);
2147
2148                 /* check l3 master if no match yet */
2149                 if ((!in_dev || !our) && netif_is_l3_slave(dev)) {
2150                         struct in_device *l3_in_dev;
2151
2152                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2153                         if (l3_in_dev)
2154                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2155                                                       ip_hdr(skb)->protocol);
2156                 }
2157
2158                 if (our
2159 #ifdef CONFIG_IP_MROUTE
2160                         ||
2161                     (!ipv4_is_local_multicast(daddr) &&
2162                      IN_DEV_MFORWARD(in_dev))
2163 #endif
2164                    ) {
2165                         err = ip_route_input_mc(skb, daddr, saddr,
2166                                                 tos, dev, our);
2167                 }
2168                 return err;
2169         }
2170
2171         return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2172 }
2173
2174 /* called with rcu_read_lock() */
2175 static struct rtable *__mkroute_output(const struct fib_result *res,
2176                                        const struct flowi4 *fl4, int orig_oif,
2177                                        struct net_device *dev_out,
2178                                        unsigned int flags)
2179 {
2180         struct fib_info *fi = res->fi;
2181         struct fib_nh_exception *fnhe;
2182         struct in_device *in_dev;
2183         u16 type = res->type;
2184         struct rtable *rth;
2185         bool do_cache;
2186
2187         in_dev = __in_dev_get_rcu(dev_out);
2188         if (!in_dev)
2189                 return ERR_PTR(-EINVAL);
2190
2191         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2192                 if (ipv4_is_loopback(fl4->saddr) &&
2193                     !(dev_out->flags & IFF_LOOPBACK) &&
2194                     !netif_is_l3_master(dev_out))
2195                         return ERR_PTR(-EINVAL);
2196
2197         if (ipv4_is_lbcast(fl4->daddr))
2198                 type = RTN_BROADCAST;
2199         else if (ipv4_is_multicast(fl4->daddr))
2200                 type = RTN_MULTICAST;
2201         else if (ipv4_is_zeronet(fl4->daddr))
2202                 return ERR_PTR(-EINVAL);
2203
2204         if (dev_out->flags & IFF_LOOPBACK)
2205                 flags |= RTCF_LOCAL;
2206
2207         do_cache = true;
2208         if (type == RTN_BROADCAST) {
2209                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2210                 fi = NULL;
2211         } else if (type == RTN_MULTICAST) {
2212                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2213                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2214                                      fl4->flowi4_proto))
2215                         flags &= ~RTCF_LOCAL;
2216                 else
2217                         do_cache = false;
2218                 /* If multicast route do not exist use
2219                  * default one, but do not gateway in this case.
2220                  * Yes, it is hack.
2221                  */
2222                 if (fi && res->prefixlen < 4)
2223                         fi = NULL;
2224         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2225                    (orig_oif != dev_out->ifindex)) {
2226                 /* For local routes that require a particular output interface
2227                  * we do not want to cache the result.  Caching the result
2228                  * causes incorrect behaviour when there are multiple source
2229                  * addresses on the interface, the end result being that if the
2230                  * intended recipient is waiting on that interface for the
2231                  * packet he won't receive it because it will be delivered on
2232                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2233                  * be set to the loopback interface as well.
2234                  */
2235                 do_cache = false;
2236         }
2237
2238         fnhe = NULL;
2239         do_cache &= fi != NULL;
2240         if (fi) {
2241                 struct rtable __rcu **prth;
2242                 struct fib_nh *nh = &FIB_RES_NH(*res);
2243
2244                 fnhe = find_exception(nh, fl4->daddr);
2245                 if (!do_cache)
2246                         goto add;
2247                 if (fnhe) {
2248                         prth = &fnhe->fnhe_rth_output;
2249                 } else {
2250                         if (unlikely(fl4->flowi4_flags &
2251                                      FLOWI_FLAG_KNOWN_NH &&
2252                                      !(nh->nh_gw &&
2253                                        nh->nh_scope == RT_SCOPE_LINK))) {
2254                                 do_cache = false;
2255                                 goto add;
2256                         }
2257                         prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2258                 }
2259                 rth = rcu_dereference(*prth);
2260                 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2261                         return rth;
2262         }
2263
2264 add:
2265         rth = rt_dst_alloc(dev_out, flags, type,
2266                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2267                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2268                            do_cache);
2269         if (!rth)
2270                 return ERR_PTR(-ENOBUFS);
2271
2272         rth->rt_iif = orig_oif;
2273
2274         RT_CACHE_STAT_INC(out_slow_tot);
2275
2276         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2277                 if (flags & RTCF_LOCAL &&
2278                     !(dev_out->flags & IFF_LOOPBACK)) {
2279                         rth->dst.output = ip_mc_output;
2280                         RT_CACHE_STAT_INC(out_slow_mc);
2281                 }
2282 #ifdef CONFIG_IP_MROUTE
2283                 if (type == RTN_MULTICAST) {
2284                         if (IN_DEV_MFORWARD(in_dev) &&
2285                             !ipv4_is_local_multicast(fl4->daddr)) {
2286                                 rth->dst.input = ip_mr_input;
2287                                 rth->dst.output = ip_mc_output;
2288                         }
2289                 }
2290 #endif
2291         }
2292
2293         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2294         lwtunnel_set_redirect(&rth->dst);
2295
2296         return rth;
2297 }
2298
2299 /*
2300  * Major route resolver routine.
2301  */
2302
2303 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2304                                         const struct sk_buff *skb)
2305 {
2306         __u8 tos = RT_FL_TOS(fl4);
2307         struct fib_result res = {
2308                 .type           = RTN_UNSPEC,
2309                 .fi             = NULL,
2310                 .table          = NULL,
2311                 .tclassid       = 0,
2312         };
2313         struct rtable *rth;
2314
2315         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2316         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2317         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2318                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2319
2320         rcu_read_lock();
2321         rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2322         rcu_read_unlock();
2323
2324         return rth;
2325 }
2326 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2327
2328 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2329                                             struct fib_result *res,
2330                                             const struct sk_buff *skb)
2331 {
2332         struct net_device *dev_out = NULL;
2333         int orig_oif = fl4->flowi4_oif;
2334         unsigned int flags = 0;
2335         struct rtable *rth;
2336         int err = -ENETUNREACH;
2337
2338         if (fl4->saddr) {
2339                 rth = ERR_PTR(-EINVAL);
2340                 if (ipv4_is_multicast(fl4->saddr) ||
2341                     ipv4_is_lbcast(fl4->saddr) ||
2342                     ipv4_is_zeronet(fl4->saddr))
2343                         goto out;
2344
2345                 /* I removed check for oif == dev_out->oif here.
2346                    It was wrong for two reasons:
2347                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2348                       is assigned to multiple interfaces.
2349                    2. Moreover, we are allowed to send packets with saddr
2350                       of another iface. --ANK
2351                  */
2352
2353                 if (fl4->flowi4_oif == 0 &&
2354                     (ipv4_is_multicast(fl4->daddr) ||
2355                      ipv4_is_lbcast(fl4->daddr))) {
2356                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2357                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2358                         if (!dev_out)
2359                                 goto out;
2360
2361                         /* Special hack: user can direct multicasts
2362                            and limited broadcast via necessary interface
2363                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2364                            This hack is not just for fun, it allows
2365                            vic,vat and friends to work.
2366                            They bind socket to loopback, set ttl to zero
2367                            and expect that it will work.
2368                            From the viewpoint of routing cache they are broken,
2369                            because we are not allowed to build multicast path
2370                            with loopback source addr (look, routing cache
2371                            cannot know, that ttl is zero, so that packet
2372                            will not leave this host and route is valid).
2373                            Luckily, this hack is good workaround.
2374                          */
2375
2376                         fl4->flowi4_oif = dev_out->ifindex;
2377                         goto make_route;
2378                 }
2379
2380                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2381                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2382                         if (!__ip_dev_find(net, fl4->saddr, false))
2383                                 goto out;
2384                 }
2385         }
2386
2387
2388         if (fl4->flowi4_oif) {
2389                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2390                 rth = ERR_PTR(-ENODEV);
2391                 if (!dev_out)
2392                         goto out;
2393
2394                 /* RACE: Check return value of inet_select_addr instead. */
2395                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2396                         rth = ERR_PTR(-ENETUNREACH);
2397                         goto out;
2398                 }
2399                 if (ipv4_is_local_multicast(fl4->daddr) ||
2400                     ipv4_is_lbcast(fl4->daddr) ||
2401                     fl4->flowi4_proto == IPPROTO_IGMP) {
2402                         if (!fl4->saddr)
2403                                 fl4->saddr = inet_select_addr(dev_out, 0,
2404                                                               RT_SCOPE_LINK);
2405                         goto make_route;
2406                 }
2407                 if (!fl4->saddr) {
2408                         if (ipv4_is_multicast(fl4->daddr))
2409                                 fl4->saddr = inet_select_addr(dev_out, 0,
2410                                                               fl4->flowi4_scope);
2411                         else if (!fl4->daddr)
2412                                 fl4->saddr = inet_select_addr(dev_out, 0,
2413                                                               RT_SCOPE_HOST);
2414                 }
2415         }
2416
2417         if (!fl4->daddr) {
2418                 fl4->daddr = fl4->saddr;
2419                 if (!fl4->daddr)
2420                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2421                 dev_out = net->loopback_dev;
2422                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2423                 res->type = RTN_LOCAL;
2424                 flags |= RTCF_LOCAL;
2425                 goto make_route;
2426         }
2427
2428         err = fib_lookup(net, fl4, res, 0);
2429         if (err) {
2430                 res->fi = NULL;
2431                 res->table = NULL;
2432                 if (fl4->flowi4_oif &&
2433                     (ipv4_is_multicast(fl4->daddr) ||
2434                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2435                         /* Apparently, routing tables are wrong. Assume,
2436                            that the destination is on link.
2437
2438                            WHY? DW.
2439                            Because we are allowed to send to iface
2440                            even if it has NO routes and NO assigned
2441                            addresses. When oif is specified, routing
2442                            tables are looked up with only one purpose:
2443                            to catch if destination is gatewayed, rather than
2444                            direct. Moreover, if MSG_DONTROUTE is set,
2445                            we send packet, ignoring both routing tables
2446                            and ifaddr state. --ANK
2447
2448
2449                            We could make it even if oif is unknown,
2450                            likely IPv6, but we do not.
2451                          */
2452
2453                         if (fl4->saddr == 0)
2454                                 fl4->saddr = inet_select_addr(dev_out, 0,
2455                                                               RT_SCOPE_LINK);
2456                         res->type = RTN_UNICAST;
2457                         goto make_route;
2458                 }
2459                 rth = ERR_PTR(err);
2460                 goto out;
2461         }
2462
2463         if (res->type == RTN_LOCAL) {
2464                 if (!fl4->saddr) {
2465                         if (res->fi->fib_prefsrc)
2466                                 fl4->saddr = res->fi->fib_prefsrc;
2467                         else
2468                                 fl4->saddr = fl4->daddr;
2469                 }
2470
2471                 /* L3 master device is the loopback for that domain */
2472                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2473                         net->loopback_dev;
2474
2475                 /* make sure orig_oif points to fib result device even
2476                  * though packet rx/tx happens over loopback or l3mdev
2477                  */
2478                 orig_oif = FIB_RES_OIF(*res);
2479
2480                 fl4->flowi4_oif = dev_out->ifindex;
2481                 flags |= RTCF_LOCAL;
2482                 goto make_route;
2483         }
2484
2485         fib_select_path(net, res, fl4, skb);
2486
2487         dev_out = FIB_RES_DEV(*res);
2488         fl4->flowi4_oif = dev_out->ifindex;
2489
2490
2491 make_route:
2492         rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2493
2494 out:
2495         return rth;
2496 }
2497
2498 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2499 {
2500         return NULL;
2501 }
2502
2503 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2504 {
2505         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2506
2507         return mtu ? : dst->dev->mtu;
2508 }
2509
2510 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2511                                           struct sk_buff *skb, u32 mtu)
2512 {
2513 }
2514
/*
 * ->redirect() hook of ipv4_dst_blackhole_ops.
 *
 * Intentionally empty: the arguments are ignored and nothing is changed.
 */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst,
				       struct sock *sk,
				       struct sk_buff *skb)
{
}
2519
2520 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2521                                           unsigned long old)
2522 {
2523         return NULL;
2524 }
2525
2526 static struct dst_ops ipv4_dst_blackhole_ops = {
2527         .family                 =       AF_INET,
2528         .check                  =       ipv4_blackhole_dst_check,
2529         .mtu                    =       ipv4_blackhole_mtu,
2530         .default_advmss         =       ipv4_default_advmss,
2531         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2532         .redirect               =       ipv4_rt_blackhole_redirect,
2533         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2534         .neigh_lookup           =       ipv4_neigh_lookup,
2535 };
2536
2537 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2538 {
2539         struct rtable *ort = (struct rtable *) dst_orig;
2540         struct rtable *rt;
2541
2542         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2543         if (rt) {
2544                 struct dst_entry *new = &rt->dst;
2545
2546                 new->__use = 1;
2547                 new->input = dst_discard;
2548                 new->output = dst_discard_out;
2549
2550                 new->dev = net->loopback_dev;
2551                 if (new->dev)
2552                         dev_hold(new->dev);
2553
2554                 rt->rt_is_input = ort->rt_is_input;
2555                 rt->rt_iif = ort->rt_iif;
2556                 rt->rt_pmtu = ort->rt_pmtu;
2557                 rt->rt_mtu_locked = ort->rt_mtu_locked;
2558
2559                 rt->rt_genid = rt_genid_ipv4(net);
2560                 rt->rt_flags = ort->rt_flags;
2561                 rt->rt_type = ort->rt_type;
2562                 rt->rt_gateway = ort->rt_gateway;
2563                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2564
2565                 INIT_LIST_HEAD(&rt->rt_uncached);
2566         }
2567
2568         dst_release(dst_orig);
2569
2570         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2571 }
2572
2573 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2574                                     const struct sock *sk)
2575 {
2576         struct rtable *rt = __ip_route_output_key(net, flp4);
2577
2578         if (IS_ERR(rt))
2579                 return rt;
2580
2581         if (flp4->flowi4_proto)
2582                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2583                                                         flowi4_to_flowi(flp4),
2584                                                         sk, 0);
2585
2586         return rt;
2587 }
2588 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2589
2590 /* called with rcu_read_lock held */
2591 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2592                         struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2593                         struct sk_buff *skb, u32 portid, u32 seq)
2594 {
2595         struct rtmsg *r;
2596         struct nlmsghdr *nlh;
2597         unsigned long expires = 0;
2598         u32 error;
2599         u32 metrics[RTAX_MAX];
2600
2601         nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2602         if (!nlh)
2603                 return -EMSGSIZE;
2604
2605         r = nlmsg_data(nlh);
2606         r->rtm_family    = AF_INET;
2607         r->rtm_dst_len  = 32;
2608         r->rtm_src_len  = 0;
2609         r->rtm_tos      = fl4->flowi4_tos;
2610         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2611         if (nla_put_u32(skb, RTA_TABLE, table_id))
2612                 goto nla_put_failure;
2613         r->rtm_type     = rt->rt_type;
2614         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2615         r->rtm_protocol = RTPROT_UNSPEC;
2616         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2617         if (rt->rt_flags & RTCF_NOTIFY)
2618                 r->rtm_flags |= RTM_F_NOTIFY;
2619         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2620                 r->rtm_flags |= RTCF_DOREDIRECT;
2621
2622         if (nla_put_in_addr(skb, RTA_DST, dst))
2623                 goto nla_put_failure;
2624         if (src) {
2625                 r->rtm_src_len = 32;
2626                 if (nla_put_in_addr(skb, RTA_SRC, src))
2627                         goto nla_put_failure;
2628         }
2629         if (rt->dst.dev &&
2630             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2631                 goto nla_put_failure;
2632 #ifdef CONFIG_IP_ROUTE_CLASSID
2633         if (rt->dst.tclassid &&
2634             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2635                 goto nla_put_failure;
2636 #endif
2637         if (!rt_is_input_route(rt) &&
2638             fl4->saddr != src) {
2639                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2640                         goto nla_put_failure;
2641         }
2642         if (rt->rt_uses_gateway &&
2643             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2644                 goto nla_put_failure;
2645
2646         expires = rt->dst.expires;
2647         if (expires) {
2648                 unsigned long now = jiffies;
2649
2650                 if (time_before(now, expires))
2651                         expires -= now;
2652                 else
2653                         expires = 0;
2654         }
2655
2656         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2657         if (rt->rt_pmtu && expires)
2658                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2659         if (rt->rt_mtu_locked && expires)
2660                 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2661         if (rtnetlink_put_metrics(skb, metrics) < 0)
2662                 goto nla_put_failure;
2663
2664         if (fl4->flowi4_mark &&
2665             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2666                 goto nla_put_failure;
2667
2668         if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2669             nla_put_u32(skb, RTA_UID,
2670                         from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2671                 goto nla_put_failure;
2672
2673         error = rt->dst.error;
2674
2675         if (rt_is_input_route(rt)) {
2676 #ifdef CONFIG_IP_MROUTE
2677                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2678                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2679                         int err = ipmr_get_route(net, skb,
2680                                                  fl4->saddr, fl4->daddr,
2681                                                  r, portid);
2682
2683                         if (err <= 0) {
2684                                 if (err == 0)
2685                                         return 0;
2686                                 goto nla_put_failure;
2687                         }
2688                 } else
2689 #endif
2690                         if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2691                                 goto nla_put_failure;
2692         }
2693
2694         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2695                 goto nla_put_failure;
2696
2697         nlmsg_end(skb, nlh);
2698         return 0;
2699
2700 nla_put_failure:
2701         nlmsg_cancel(skb, nlh);
2702         return -EMSGSIZE;
2703 }
2704
2705 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
2706                                                    u8 ip_proto, __be16 sport,
2707                                                    __be16 dport)
2708 {
2709         struct sk_buff *skb;
2710         struct iphdr *iph;
2711
2712         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2713         if (!skb)
2714                 return NULL;
2715
2716         /* Reserve room for dummy headers, this skb can pass
2717          * through good chunk of routing engine.
2718          */
2719         skb_reset_mac_header(skb);
2720         skb_reset_network_header(skb);
2721         skb->protocol = htons(ETH_P_IP);
2722         iph = skb_put(skb, sizeof(struct iphdr));
2723         iph->protocol = ip_proto;
2724         iph->saddr = src;
2725         iph->daddr = dst;
2726         iph->version = 0x4;
2727         iph->frag_off = 0;
2728         iph->ihl = 0x5;
2729         skb_set_transport_header(skb, skb->len);
2730
2731         switch (iph->protocol) {
2732         case IPPROTO_UDP: {
2733                 struct udphdr *udph;
2734
2735                 udph = skb_put_zero(skb, sizeof(struct udphdr));
2736                 udph->source = sport;
2737                 udph->dest = dport;
2738                 udph->len = sizeof(struct udphdr);
2739                 udph->check = 0;
2740                 break;
2741         }
2742         case IPPROTO_TCP: {
2743                 struct tcphdr *tcph;
2744
2745                 tcph = skb_put_zero(skb, sizeof(struct tcphdr));
2746                 tcph->source    = sport;
2747                 tcph->dest      = dport;
2748                 tcph->doff      = sizeof(struct tcphdr) / 4;
2749                 tcph->rst = 1;
2750                 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
2751                                             src, dst, 0);
2752                 break;
2753         }
2754         case IPPROTO_ICMP: {
2755                 struct icmphdr *icmph;
2756
2757                 icmph = skb_put_zero(skb, sizeof(struct icmphdr));
2758                 icmph->type = ICMP_ECHO;
2759                 icmph->code = 0;
2760         }
2761         }
2762
2763         return skb;
2764 }
2765
2766 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2767                              struct netlink_ext_ack *extack)
2768 {
2769         struct net *net = sock_net(in_skb->sk);
2770         struct nlattr *tb[RTA_MAX+1];
2771         u32 table_id = RT_TABLE_MAIN;
2772         __be16 sport = 0, dport = 0;
2773         struct fib_result res = {};
2774         u8 ip_proto = IPPROTO_UDP;
2775         struct rtable *rt = NULL;
2776         struct sk_buff *skb;
2777         struct rtmsg *rtm;
2778         struct flowi4 fl4 = {};
2779         __be32 dst = 0;
2780         __be32 src = 0;
2781         kuid_t uid;
2782         u32 iif;
2783         int err;
2784         int mark;
2785
2786         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2787                           extack);
2788         if (err < 0)
2789                 return err;
2790
2791         rtm = nlmsg_data(nlh);
2792         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2793         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2794         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2795         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2796         if (tb[RTA_UID])
2797                 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2798         else
2799                 uid = (iif ? INVALID_UID : current_uid());
2800
2801         if (tb[RTA_IP_PROTO]) {
2802                 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
2803                                                   &ip_proto, extack);
2804                 if (err)
2805                         return err;
2806         }
2807
2808         if (tb[RTA_SPORT])
2809                 sport = nla_get_be16(tb[RTA_SPORT]);
2810
2811         if (tb[RTA_DPORT])
2812                 dport = nla_get_be16(tb[RTA_DPORT]);
2813
2814         skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
2815         if (!skb)
2816                 return -ENOBUFS;
2817
2818         fl4.daddr = dst;
2819         fl4.saddr = src;
2820         fl4.flowi4_tos = rtm->rtm_tos;
2821         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2822         fl4.flowi4_mark = mark;
2823         fl4.flowi4_uid = uid;
2824         if (sport)
2825                 fl4.fl4_sport = sport;
2826         if (dport)
2827                 fl4.fl4_dport = dport;
2828         fl4.flowi4_proto = ip_proto;
2829
2830         rcu_read_lock();
2831
2832         if (iif) {
2833                 struct net_device *dev;
2834
2835                 dev = dev_get_by_index_rcu(net, iif);
2836                 if (!dev) {
2837                         err = -ENODEV;
2838                         goto errout_rcu;
2839                 }
2840
2841                 fl4.flowi4_iif = iif; /* for rt_fill_info */
2842                 skb->dev        = dev;
2843                 skb->mark       = mark;
2844                 err = ip_route_input_rcu(skb, dst, src, rtm->rtm_tos,
2845                                          dev, &res);
2846
2847                 rt = skb_rtable(skb);
2848                 if (err == 0 && rt->dst.error)
2849                         err = -rt->dst.error;
2850         } else {
2851                 fl4.flowi4_iif = LOOPBACK_IFINDEX;
2852                 skb->dev = net->loopback_dev;
2853                 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
2854                 err = 0;
2855                 if (IS_ERR(rt))
2856                         err = PTR_ERR(rt);
2857                 else
2858                         skb_dst_set(skb, &rt->dst);
2859         }
2860
2861         if (err)
2862                 goto errout_rcu;
2863
2864         if (rtm->rtm_flags & RTM_F_NOTIFY)
2865                 rt->rt_flags |= RTCF_NOTIFY;
2866
2867         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2868                 table_id = res.table ? res.table->tb_id : 0;
2869
2870         /* reset skb for netlink reply msg */
2871         skb_trim(skb, 0);
2872         skb_reset_network_header(skb);
2873         skb_reset_transport_header(skb);
2874         skb_reset_mac_header(skb);
2875
2876         if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2877                 if (!res.fi) {
2878                         err = fib_props[res.type].error;
2879                         if (!err)
2880                                 err = -EHOSTUNREACH;
2881                         goto errout_rcu;
2882                 }
2883                 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2884                                     nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2885                                     rt->rt_type, res.prefix, res.prefixlen,
2886                                     fl4.flowi4_tos, res.fi, 0);
2887         } else {
2888                 err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
2889                                    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
2890         }
2891         if (err < 0)
2892                 goto errout_rcu;
2893
2894         rcu_read_unlock();
2895
2896         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2897
2898 errout_free:
2899         return err;
2900 errout_rcu:
2901         rcu_read_unlock();
2902         kfree_skb(skb);
2903         goto errout_free;
2904 }
2905
2906 void ip_rt_multicast_event(struct in_device *in_dev)
2907 {
2908         rt_cache_flush(dev_net(in_dev->dev));
2909 }
2910
2911 #ifdef CONFIG_SYSCTL
2912 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2913 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2914 static int ip_rt_gc_elasticity __read_mostly    = 8;
2915 static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;
2916
2917 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2918                                         void __user *buffer,
2919                                         size_t *lenp, loff_t *ppos)
2920 {
2921         struct net *net = (struct net *)__ctl->extra1;
2922
2923         if (write) {
2924                 rt_cache_flush(net);
2925                 fnhe_genid_bump(net);
2926                 return 0;
2927         }
2928
2929         return -EINVAL;
2930 }
2931
2932 static struct ctl_table ipv4_route_table[] = {
2933         {
2934                 .procname       = "gc_thresh",
2935                 .data           = &ipv4_dst_ops.gc_thresh,
2936                 .maxlen         = sizeof(int),
2937                 .mode           = 0644,
2938                 .proc_handler   = proc_dointvec,
2939         },
2940         {
2941                 .procname       = "max_size",
2942                 .data           = &ip_rt_max_size,
2943                 .maxlen         = sizeof(int),
2944                 .mode           = 0644,
2945                 .proc_handler   = proc_dointvec,
2946         },
2947         {
2948                 /*  Deprecated. Use gc_min_interval_ms */
2949
2950                 .procname       = "gc_min_interval",
2951                 .data           = &ip_rt_gc_min_interval,
2952                 .maxlen         = sizeof(int),
2953                 .mode           = 0644,
2954                 .proc_handler   = proc_dointvec_jiffies,
2955         },
2956         {
2957                 .procname       = "gc_min_interval_ms",
2958                 .data           = &ip_rt_gc_min_interval,
2959                 .maxlen         = sizeof(int),
2960                 .mode           = 0644,
2961                 .proc_handler   = proc_dointvec_ms_jiffies,
2962         },
2963         {
2964                 .procname       = "gc_timeout",
2965                 .data           = &ip_rt_gc_timeout,
2966                 .maxlen         = sizeof(int),
2967                 .mode           = 0644,
2968                 .proc_handler   = proc_dointvec_jiffies,
2969         },
2970         {
2971                 .procname       = "gc_interval",
2972                 .data           = &ip_rt_gc_interval,
2973                 .maxlen         = sizeof(int),
2974                 .mode           = 0644,
2975                 .proc_handler   = proc_dointvec_jiffies,
2976         },
2977         {
2978                 .procname       = "redirect_load",
2979                 .data           = &ip_rt_redirect_load,
2980                 .maxlen         = sizeof(int),
2981                 .mode           = 0644,
2982                 .proc_handler   = proc_dointvec,
2983         },
2984         {
2985                 .procname       = "redirect_number",
2986                 .data           = &ip_rt_redirect_number,
2987                 .maxlen         = sizeof(int),
2988                 .mode           = 0644,
2989                 .proc_handler   = proc_dointvec,
2990         },
2991         {
2992                 .procname       = "redirect_silence",
2993                 .data           = &ip_rt_redirect_silence,
2994                 .maxlen         = sizeof(int),
2995                 .mode           = 0644,
2996                 .proc_handler   = proc_dointvec,
2997         },
2998         {
2999                 .procname       = "error_cost",
3000                 .data           = &ip_rt_error_cost,
3001                 .maxlen         = sizeof(int),
3002                 .mode           = 0644,
3003                 .proc_handler   = proc_dointvec,
3004         },
3005         {
3006                 .procname       = "error_burst",
3007                 .data           = &ip_rt_error_burst,
3008                 .maxlen         = sizeof(int),
3009                 .mode           = 0644,
3010                 .proc_handler   = proc_dointvec,
3011         },
3012         {
3013                 .procname       = "gc_elasticity",
3014                 .data           = &ip_rt_gc_elasticity,
3015                 .maxlen         = sizeof(int),
3016                 .mode           = 0644,
3017                 .proc_handler   = proc_dointvec,
3018         },
3019         {
3020                 .procname       = "mtu_expires",
3021                 .data           = &ip_rt_mtu_expires,
3022                 .maxlen         = sizeof(int),
3023                 .mode           = 0644,
3024                 .proc_handler   = proc_dointvec_jiffies,
3025         },
3026         {
3027                 .procname       = "min_pmtu",
3028                 .data           = &ip_rt_min_pmtu,
3029                 .maxlen         = sizeof(int),
3030                 .mode           = 0644,
3031                 .proc_handler   = proc_dointvec_minmax,
3032                 .extra1         = &ip_min_valid_pmtu,
3033         },
3034         {
3035                 .procname       = "min_adv_mss",
3036                 .data           = &ip_rt_min_advmss,
3037                 .maxlen         = sizeof(int),
3038                 .mode           = 0644,
3039                 .proc_handler   = proc_dointvec,
3040         },
3041         { }
3042 };
3043
3044 static struct ctl_table ipv4_route_flush_table[] = {
3045         {
3046                 .procname       = "flush",
3047                 .maxlen         = sizeof(int),
3048                 .mode           = 0200,
3049                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3050         },
3051         { },
3052 };
3053
3054 static __net_init int sysctl_route_net_init(struct net *net)
3055 {
3056         struct ctl_table *tbl;
3057
3058         tbl = ipv4_route_flush_table;
3059         if (!net_eq(net, &init_net)) {
3060                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3061                 if (!tbl)
3062                         goto err_dup;
3063
3064                 /* Don't export sysctls to unprivileged users */
3065                 if (net->user_ns != &init_user_ns)
3066                         tbl[0].procname = NULL;
3067         }
3068         tbl[0].extra1 = net;
3069
3070         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3071         if (!net->ipv4.route_hdr)
3072                 goto err_reg;
3073         return 0;
3074
3075 err_reg:
3076         if (tbl != ipv4_route_flush_table)
3077                 kfree(tbl);
3078 err_dup:
3079         return -ENOMEM;
3080 }
3081
3082 static __net_exit void sysctl_route_net_exit(struct net *net)
3083 {
3084         struct ctl_table *tbl;
3085
3086         tbl = net->ipv4.route_hdr->ctl_table_arg;
3087         unregister_net_sysctl_table(net->ipv4.route_hdr);
3088         BUG_ON(tbl == ipv4_route_flush_table);
3089         kfree(tbl);
3090 }
3091
3092 static __net_initdata struct pernet_operations sysctl_route_ops = {
3093         .init = sysctl_route_net_init,
3094         .exit = sysctl_route_net_exit,
3095 };
3096 #endif
3097
3098 static __net_init int rt_genid_init(struct net *net)
3099 {
3100         atomic_set(&net->ipv4.rt_genid, 0);
3101         atomic_set(&net->fnhe_genid, 0);
3102         atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3103         return 0;
3104 }
3105
3106 static __net_initdata struct pernet_operations rt_genid_ops = {
3107         .init = rt_genid_init,
3108 };
3109
3110 static int __net_init ipv4_inetpeer_init(struct net *net)
3111 {
3112         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3113
3114         if (!bp)
3115                 return -ENOMEM;
3116         inet_peer_base_init(bp);
3117         net->ipv4.peers = bp;
3118         return 0;
3119 }
3120
3121 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3122 {
3123         struct inet_peer_base *bp = net->ipv4.peers;
3124
3125         net->ipv4.peers = NULL;
3126         inetpeer_invalidate_tree(bp);
3127         kfree(bp);
3128 }
3129
3130 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3131         .init   =       ipv4_inetpeer_init,
3132         .exit   =       ipv4_inetpeer_exit,
3133 };
3134
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-CPU array of 256 ip_rt_acct entries, allocated in ip_rt_init(). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
3138
3139 int __init ip_rt_init(void)
3140 {
3141         int cpu;
3142
3143         ip_idents = kmalloc_array(IP_IDENTS_SZ, sizeof(*ip_idents),
3144                                   GFP_KERNEL);
3145         if (!ip_idents)
3146                 panic("IP: failed to allocate ip_idents\n");
3147
3148         prandom_bytes(ip_idents, IP_IDENTS_SZ * sizeof(*ip_idents));
3149
3150         ip_tstamps = kcalloc(IP_IDENTS_SZ, sizeof(*ip_tstamps), GFP_KERNEL);
3151         if (!ip_tstamps)
3152                 panic("IP: failed to allocate ip_tstamps\n");
3153
3154         for_each_possible_cpu(cpu) {
3155                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3156
3157                 INIT_LIST_HEAD(&ul->head);
3158                 spin_lock_init(&ul->lock);
3159         }
3160 #ifdef CONFIG_IP_ROUTE_CLASSID
3161         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3162         if (!ip_rt_acct)
3163                 panic("IP: failed to allocate ip_rt_acct\n");
3164 #endif
3165
3166         ipv4_dst_ops.kmem_cachep =
3167                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3168                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3169
3170         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3171
3172         if (dst_entries_init(&ipv4_dst_ops) < 0)
3173                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3174
3175         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3176                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3177
3178         ipv4_dst_ops.gc_thresh = ~0;
3179         ip_rt_max_size = INT_MAX;
3180
3181         devinet_init();
3182         ip_fib_init();
3183
3184         if (ip_rt_proc_init())
3185                 pr_err("Unable to create route proc files\n");
3186 #ifdef CONFIG_XFRM
3187         xfrm_init();
3188         xfrm4_init();
3189 #endif
3190         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3191                       RTNL_FLAG_DOIT_UNLOCKED);
3192
3193 #ifdef CONFIG_SYSCTL
3194         register_pernet_subsys(&sysctl_route_ops);
3195 #endif
3196         register_pernet_subsys(&rt_genid_ops);
3197         register_pernet_subsys(&ipv4_inetpeer_ops);
3198         return 0;
3199 }
3200
3201 #ifdef CONFIG_SYSCTL
3202 /*
3203  * We really need to sanitize the damn ipv4 init order, then all
3204  * this nonsense will go away.
3205  */
3206 void __init ip_static_sysctl_init(void)
3207 {
3208         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3209 }
3210 #endif