net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #define pr_fmt(fmt) "IPv4: " fmt
  66
  67 #include <linux/module.h>
  68 #include <asm/uaccess.h>
  69 #include <linux/bitops.h>
  70 #include <linux/types.h>
  71 #include <linux/kernel.h>
  72 #include <linux/mm.h>
  73 #include <linux/bootmem.h>
  74 #include <linux/string.h>
  75 #include <linux/socket.h>
  76 #include <linux/sockios.h>
  77 #include <linux/errno.h>
  78 #include <linux/in.h>
  79 #include <linux/inet.h>
  80 #include <linux/netdevice.h>
  81 #include <linux/proc_fs.h>
  82 #include <linux/init.h>
  83 #include <linux/skbuff.h>
  84 #include <linux/inetdevice.h>
  85 #include <linux/igmp.h>
  86 #include <linux/pkt_sched.h>
  87 #include <linux/mroute.h>
  88 #include <linux/netfilter_ipv4.h>
  89 #include <linux/random.h>
  90 #include <linux/rcupdate.h>
  91 #include <linux/times.h>
  92 #include <linux/slab.h>
  93 #include <linux/jhash.h>
  94 #include <net/dst.h>
  95 #include <net/dst_metadata.h>
  96 #include <net/net_namespace.h>
  97 #include <net/protocol.h>
  98 #include <net/ip.h>
  99 #include <net/route.h>
 100 #include <net/inetpeer.h>
 101 #include <net/sock.h>
 102 #include <net/ip_fib.h>
 103 #include <net/arp.h>
 104 #include <net/tcp.h>
 105 #include <net/icmp.h>
 106 #include <net/xfrm.h>
 107 #include <net/lwtunnel.h>
 108 #include <net/netevent.h>
 109 #include <net/rtnetlink.h>
 110 #ifdef CONFIG_SYSCTL
 111 #include <linux/sysctl.h>
 112 #include <linux/kmemleak.h>
 113 #endif
 114 #include <net/secure_seq.h>
 115 #include <net/ip_tunnels.h>
 116 #include <net/l3mdev.h>
 117
 118 #define RT_FL_TOS(oldflp4) \
 119         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 120
 121 #define RT_GC_TIMEOUT (300*HZ)
 122
 123 static int ip_rt_max_size;
 124 static int ip_rt_redirect_number __read_mostly  = 9;
 125 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 126 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 127 static int ip_rt_error_cost __read_mostly       = HZ;
 128 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 129 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 130 static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 131 static int ip_rt_min_advmss __read_mostly       = 256;
 132
 133 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 134
 135 /*
 136  *      Interface to generic destination cache.
 137  */
 138
 139 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 140 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 141 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 142 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 143 static void              ipv4_link_failure(struct sk_buff *skb);
 144 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 145                                            struct sk_buff *skb, u32 mtu);
 146 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 147                                         struct sk_buff *skb);
 148 static void             ipv4_dst_destroy(struct dst_entry *dst);
 149
 150 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 151 {
 152         WARN_ON(1);
 153         return NULL;
 154 }
 155
 156 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 157                                            struct sk_buff *skb,
 158                                            const void *daddr);
 159
 160 static struct dst_ops ipv4_dst_ops = {
 161         .family =               AF_INET,
 162         .check =                ipv4_dst_check,
 163         .default_advmss =       ipv4_default_advmss,
 164         .mtu =                  ipv4_mtu,
 165         .cow_metrics =          ipv4_cow_metrics,
 166         .destroy =              ipv4_dst_destroy,
 167         .negative_advice =      ipv4_negative_advice,
 168         .link_failure =         ipv4_link_failure,
 169         .update_pmtu =          ip_rt_update_pmtu,
 170         .redirect =             ip_do_redirect,
 171         .local_out =            __ip_local_out,
 172         .neigh_lookup =         ipv4_neigh_lookup,
 173 };
 174
 175 #define ECN_OR_COST(class)      TC_PRIO_##class
 176
 177 const __u8 ip_tos2prio[16] = {
 178         TC_PRIO_BESTEFFORT,
 179         ECN_OR_COST(BESTEFFORT),
 180         TC_PRIO_BESTEFFORT,
 181         ECN_OR_COST(BESTEFFORT),
 182         TC_PRIO_BULK,
 183         ECN_OR_COST(BULK),
 184         TC_PRIO_BULK,
 185         ECN_OR_COST(BULK),
 186         TC_PRIO_INTERACTIVE,
 187         ECN_OR_COST(INTERACTIVE),
 188         TC_PRIO_INTERACTIVE,
 189         ECN_OR_COST(INTERACTIVE),
 190         TC_PRIO_INTERACTIVE_BULK,
 191         ECN_OR_COST(INTERACTIVE_BULK),
 192         TC_PRIO_INTERACTIVE_BULK,
 193         ECN_OR_COST(INTERACTIVE_BULK)
 194 };
 195 EXPORT_SYMBOL(ip_tos2prio);
 196
 197 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 198 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 199
 200 #ifdef CONFIG_PROC_FS
 201 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 202 {
 203         if (*pos)
 204                 return NULL;
 205         return SEQ_START_TOKEN;
 206 }
 207
 208 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 209 {
 210         ++*pos;
 211         return NULL;
 212 }
 213
 214 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 215 {
 216 }
 217
 218 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 219 {
 220         if (v == SEQ_START_TOKEN)
 221                 seq_printf(seq, "%-127s\n",
 222                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 223                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 224                            "HHUptod\tSpecDst");
 225         return 0;
 226 }
 227
 228 static const struct seq_operations rt_cache_seq_ops = {
 229         .start  = rt_cache_seq_start,
 230         .next   = rt_cache_seq_next,
 231         .stop   = rt_cache_seq_stop,
 232         .show   = rt_cache_seq_show,
 233 };
 234
 235 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 236 {
 237         return seq_open(file, &rt_cache_seq_ops);
 238 }
 239
 240 static const struct file_operations rt_cache_seq_fops = {
 241         .owner   = THIS_MODULE,
 242         .open    = rt_cache_seq_open,
 243         .read    = seq_read,
 244         .llseek  = seq_lseek,
 245         .release = seq_release,
 246 };
 247
 248
 249 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 250 {
 251         int cpu;
 252
 253         if (*pos == 0)
 254                 return SEQ_START_TOKEN;
 255
 256         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 257                 if (!cpu_possible(cpu))
 258                         continue;
 259                 *pos = cpu+1;
 260                 return &per_cpu(rt_cache_stat, cpu);
 261         }
 262         return NULL;
 263 }
 264
 265 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 266 {
 267         int cpu;
 268
 269         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 270                 if (!cpu_possible(cpu))
 271                         continue;
 272                 *pos = cpu+1;
 273                 return &per_cpu(rt_cache_stat, cpu);
 274         }
 275         (*pos)++;
 276         return NULL;
 277
 278 }
 279
 280 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 281 {
 282
 283 }
 284
 285 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 286 {
 287         struct rt_cache_stat *st = v;
 288
 289         if (v == SEQ_START_TOKEN) {
 290                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 291                 return 0;
 292         }
 293
 294         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 295                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 296                    dst_entries_get_slow(&ipv4_dst_ops),
 297                    0, /* st->in_hit */
 298                    st->in_slow_tot,
 299                    st->in_slow_mc,
 300                    st->in_no_route,
 301                    st->in_brd,
 302                    st->in_martian_dst,
 303                    st->in_martian_src,
 304
 305                    0, /* st->out_hit */
 306                    st->out_slow_tot,
 307                    st->out_slow_mc,
 308
 309                    0, /* st->gc_total */
 310                    0, /* st->gc_ignored */
 311                    0, /* st->gc_goal_miss */
 312                    0, /* st->gc_dst_overflow */
 313                    0, /* st->in_hlist_search */
 314                    0  /* st->out_hlist_search */
 315                 );
 316         return 0;
 317 }
 318
 319 static const struct seq_operations rt_cpu_seq_ops = {
 320         .start  = rt_cpu_seq_start,
 321         .next   = rt_cpu_seq_next,
 322         .stop   = rt_cpu_seq_stop,
 323         .show   = rt_cpu_seq_show,
 324 };
 325
 326
 327 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 328 {
 329         return seq_open(file, &rt_cpu_seq_ops);
 330 }
 331
 332 static const struct file_operations rt_cpu_seq_fops = {
 333         .owner   = THIS_MODULE,
 334         .open    = rt_cpu_seq_open,
 335         .read    = seq_read,
 336         .llseek  = seq_lseek,
 337         .release = seq_release,
 338 };
 339
 340 #ifdef CONFIG_IP_ROUTE_CLASSID
 341 static int rt_acct_proc_show(struct seq_file *m, void *v)
 342 {
 343         struct ip_rt_acct *dst, *src;
 344         unsigned int i, j;
 345
 346         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 347         if (!dst)
 348                 return -ENOMEM;
 349
 350         for_each_possible_cpu(i) {
 351                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 352                 for (j = 0; j < 256; j++) {
 353                         dst[j].o_bytes   += src[j].o_bytes;
 354                         dst[j].o_packets += src[j].o_packets;
 355                         dst[j].i_bytes   += src[j].i_bytes;
 356                         dst[j].i_packets += src[j].i_packets;
 357                 }
 358         }
 359
 360         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 361         kfree(dst);
 362         return 0;
 363 }
 364
 365 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 366 {
 367         return single_open(file, rt_acct_proc_show, NULL);
 368 }
 369
 370 static const struct file_operations rt_acct_proc_fops = {
 371         .owner          = THIS_MODULE,
 372         .open           = rt_acct_proc_open,
 373         .read           = seq_read,
 374         .llseek         = seq_lseek,
 375         .release        = single_release,
 376 };
 377 #endif
 378
 379 static int __net_init ip_rt_do_proc_init(struct net *net)
 380 {
 381         struct proc_dir_entry *pde;
 382
 383         pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
 384                           &rt_cache_seq_fops);
 385         if (!pde)
 386                 goto err1;
 387
 388         pde = proc_create("rt_cache", S_IRUGO,
 389                           net->proc_net_stat, &rt_cpu_seq_fops);
 390         if (!pde)
 391                 goto err2;
 392
 393 #ifdef CONFIG_IP_ROUTE_CLASSID
 394         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 395         if (!pde)
 396                 goto err3;
 397 #endif
 398         return 0;
 399
 400 #ifdef CONFIG_IP_ROUTE_CLASSID
 401 err3:
 402         remove_proc_entry("rt_cache", net->proc_net_stat);
 403 #endif
 404 err2:
 405         remove_proc_entry("rt_cache", net->proc_net);
 406 err1:
 407         return -ENOMEM;
 408 }
 409
 410 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 411 {
 412         remove_proc_entry("rt_cache", net->proc_net_stat);
 413         remove_proc_entry("rt_cache", net->proc_net);
 414 #ifdef CONFIG_IP_ROUTE_CLASSID
 415         remove_proc_entry("rt_acct", net->proc_net);
 416 #endif
 417 }
 418
 419 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 420         .init = ip_rt_do_proc_init,
 421         .exit = ip_rt_do_proc_exit,
 422 };
 423
 424 static int __init ip_rt_proc_init(void)
 425 {
 426         return register_pernet_subsys(&ip_rt_proc_ops);
 427 }
 428
 429 #else
 430 static inline int ip_rt_proc_init(void)
 431 {
 432         return 0;
 433 }
 434 #endif /* CONFIG_PROC_FS */
 435
 436 static inline bool rt_is_expired(const struct rtable *rth)
 437 {
 438         return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 439 }
 440
 441 void rt_cache_flush(struct net *net)
 442 {
 443         rt_genid_bump_ipv4(net);
 444 }
 445
 446 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 447                                            struct sk_buff *skb,
 448                                            const void *daddr)
 449 {
 450         struct net_device *dev = dst->dev;
 451         const __be32 *pkey = daddr;
 452         const struct rtable *rt;
 453         struct neighbour *n;
 454
 455         rt = (const struct rtable *) dst;
 456         if (rt->rt_gateway)
 457                 pkey = (const __be32 *) &rt->rt_gateway;
 458         else if (skb)
 459                 pkey = &ip_hdr(skb)->daddr;
 460
 461         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
 462         if (n)
 463                 return n;
 464         return neigh_create(&arp_tbl, pkey, dev);
 465 }
 466
 467 /* Hash tables of size 2048..262144 depending on RAM size.
 468  * Each bucket uses 8 bytes.
 469  */
 470 static u32 ip_idents_mask __read_mostly;
 471 static atomic_t *ip_idents __read_mostly;
 472 static u32 *ip_tstamps __read_mostly;
 473
 474 /* In order to protect privacy, we add a perturbation to identifiers
 475  * if one generator is seldom used. This makes hard for an attacker
 476  * to infer how many packets were sent between two points in time.
 477  */
 478 u32 ip_idents_reserve(u32 hash, int segs)
 479 {
 480         u32 bucket, old, now = (u32)jiffies;
 481         atomic_t *p_id;
 482         u32 *p_tstamp;
 483         u32 delta = 0;
 484
 485         bucket = hash & ip_idents_mask;
 486         p_tstamp = ip_tstamps + bucket;
 487         p_id = ip_idents + bucket;
 488         old = ACCESS_ONCE(*p_tstamp);
 489
 490         if (old != now && cmpxchg(p_tstamp, old, now) == old)
 491                 delta = prandom_u32_max(now - old);
 492
 493         /* If UBSAN reports an error there, please make sure your compiler
 494          * supports -fno-strict-overflow before reporting it that was a bug
 495          * in UBSAN, and it has been fixed in GCC-8.
 496          */
 497         return atomic_add_return(segs + delta, p_id) - segs;
 498 }
 499 EXPORT_SYMBOL(ip_idents_reserve);
 500
 501 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 502 {
 503         u32 hash, id;
 504
 505         /* Note the following code is not safe, but this is okay. */
 506         if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
 507                 get_random_bytes(&net->ipv4.ip_id_key,
 508                                  sizeof(net->ipv4.ip_id_key));
 509
 510         hash = siphash_3u32((__force u32)iph->daddr,
 511                             (__force u32)iph->saddr,
 512                             iph->protocol,
 513                             &net->ipv4.ip_id_key);
 514         id = ip_idents_reserve(hash, segs);
 515         iph->id = htons(id);
 516 }
 517 EXPORT_SYMBOL(__ip_select_ident);
 518
 519 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 520                              const struct iphdr *iph,
 521                              int oif, u8 tos,
 522                              u8 prot, u32 mark, int flow_flags)
 523 {
 524         if (sk) {
 525                 const struct inet_sock *inet = inet_sk(sk);
 526
 527                 oif = sk->sk_bound_dev_if;
 528                 mark = sk->sk_mark;
 529                 tos = RT_CONN_FLAGS(sk);
 530                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 531         }
 532         flowi4_init_output(fl4, oif, mark, tos,
 533                            RT_SCOPE_UNIVERSE, prot,
 534                            flow_flags,
 535                            iph->daddr, iph->saddr, 0, 0);
 536 }
 537
 538 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 539                                const struct sock *sk)
 540 {
 541         const struct iphdr *iph = ip_hdr(skb);
 542         int oif = skb->dev->ifindex;
 543         u8 tos = RT_TOS(iph->tos);
 544         u8 prot = iph->protocol;
 545         u32 mark = skb->mark;
 546
 547         __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
 548 }
 549
 550 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 551 {
 552         const struct inet_sock *inet = inet_sk(sk);
 553         const struct ip_options_rcu *inet_opt;
 554         __be32 daddr = inet->inet_daddr;
 555
 556         rcu_read_lock();
 557         inet_opt = rcu_dereference(inet->inet_opt);
 558         if (inet_opt && inet_opt->opt.srr)
 559                 daddr = inet_opt->opt.faddr;
 560         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 561                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 562                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 563                            inet_sk_flowi_flags(sk),
 564                            daddr, inet->inet_saddr, 0, 0);
 565         rcu_read_unlock();
 566 }
 567
 568 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 569                                  const struct sk_buff *skb)
 570 {
 571         if (skb)
 572                 build_skb_flow_key(fl4, skb, sk);
 573         else
 574                 build_sk_flow_key(fl4, sk);
 575 }
 576
 577 static inline void rt_free(struct rtable *rt)
 578 {
 579         call_rcu(&rt->dst.rcu_head, dst_rcu_free);
 580 }
 581
 582 static DEFINE_SPINLOCK(fnhe_lock);
 583
 584 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 585 {
 586         struct rtable *rt;
 587
 588         rt = rcu_dereference(fnhe->fnhe_rth_input);
 589         if (rt) {
 590                 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 591                 rt_free(rt);
 592         }
 593         rt = rcu_dereference(fnhe->fnhe_rth_output);
 594         if (rt) {
 595                 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 596                 rt_free(rt);
 597         }
 598 }
 599
 600 static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
 601 {
 602         struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
 603         struct fib_nh_exception *fnhe, *oldest = NULL;
 604
 605         for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
 606                 fnhe = rcu_dereference_protected(*fnhe_p,
 607                                                  lockdep_is_held(&fnhe_lock));
 608                 if (!fnhe)
 609                         break;
 610                 if (!oldest ||
 611                     time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
 612                         oldest = fnhe;
 613                         oldest_p = fnhe_p;
 614                 }
 615         }
 616         fnhe_flush_routes(oldest);
 617         *oldest_p = oldest->fnhe_next;
 618         kfree_rcu(oldest, rcu);
 619 }
 620
 621 static inline u32 fnhe_hashfun(__be32 daddr)
 622 {
 623         static u32 fnhe_hashrnd __read_mostly;
 624         u32 hval;
 625
 626         net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
 627         hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
 628         return hash_32(hval, FNHE_HASH_SHIFT);
 629 }
 630
 631 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 632 {
 633         rt->rt_pmtu = fnhe->fnhe_pmtu;
 634         rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
 635         rt->dst.expires = fnhe->fnhe_expires;
 636
 637         if (fnhe->fnhe_gw) {
 638                 rt->rt_flags |= RTCF_REDIRECTED;
 639                 rt->rt_gateway = fnhe->fnhe_gw;
 640                 rt->rt_uses_gateway = 1;
 641         }
 642 }
 643
 644 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 645                                   u32 pmtu, bool lock, unsigned long expires)
 646 {
 647         struct fnhe_hash_bucket *hash;
 648         struct fib_nh_exception *fnhe;
 649         struct rtable *rt;
 650         u32 genid, hval;
 651         unsigned int i;
 652         int depth;
 653
 654         genid = fnhe_genid(dev_net(nh->nh_dev));
 655         hval = fnhe_hashfun(daddr);
 656
 657         spin_lock_bh(&fnhe_lock);
 658
 659         hash = rcu_dereference(nh->nh_exceptions);
 660         if (!hash) {
 661                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
 662                 if (!hash)
 663                         goto out_unlock;
 664                 rcu_assign_pointer(nh->nh_exceptions, hash);
 665         }
 666
 667         hash += hval;
 668
 669         depth = 0;
 670         for (fnhe = rcu_dereference(hash->chain); fnhe;
 671              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 672                 if (fnhe->fnhe_daddr == daddr)
 673                         break;
 674                 depth++;
 675         }
 676
 677         if (fnhe) {
 678                 if (fnhe->fnhe_genid != genid)
 679                         fnhe->fnhe_genid = genid;
 680                 if (gw)
 681                         fnhe->fnhe_gw = gw;
 682                 if (pmtu) {
 683                         fnhe->fnhe_pmtu = pmtu;
 684                         fnhe->fnhe_mtu_locked = lock;
 685                 }
 686                 fnhe->fnhe_expires = max(1UL, expires);
 687                 /* Update all cached dsts too */
 688                 rt = rcu_dereference(fnhe->fnhe_rth_input);
 689                 if (rt)
 690                         fill_route_from_fnhe(rt, fnhe);
 691                 rt = rcu_dereference(fnhe->fnhe_rth_output);
 692                 if (rt)
 693                         fill_route_from_fnhe(rt, fnhe);
 694         } else {
 695                 /* Randomize max depth to avoid some side channels attacks. */
 696                 int max_depth = FNHE_RECLAIM_DEPTH +
 697                                 prandom_u32_max(FNHE_RECLAIM_DEPTH);
 698
 699                 while (depth > max_depth) {
 700                         fnhe_remove_oldest(hash);
 701                         depth--;
 702                 }
 703
 704                 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 705                 if (!fnhe)
 706                         goto out_unlock;
 707
 708                 fnhe->fnhe_next = hash->chain;
 709
 710                 fnhe->fnhe_genid = genid;
 711                 fnhe->fnhe_daddr = daddr;
 712                 fnhe->fnhe_gw = gw;
 713                 fnhe->fnhe_pmtu = pmtu;
 714                 fnhe->fnhe_mtu_locked = lock;
 715                 fnhe->fnhe_expires = expires;
 716
 717                 rcu_assign_pointer(hash->chain, fnhe);
 718
 719                 /* Exception created; mark the cached routes for the nexthop
 720                  * stale, so anyone caching it rechecks if this exception
 721                  * applies to them.
 722                  */
 723                 rt = rcu_dereference(nh->nh_rth_input);
 724                 if (rt)
 725                         rt->dst.obsolete = DST_OBSOLETE_KILL;
 726
 727                 for_each_possible_cpu(i) {
 728                         struct rtable __rcu **prt;
 729                         prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
 730                         rt = rcu_dereference(*prt);
 731                         if (rt)
 732                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 733                 }
 734         }
 735
 736         fnhe->fnhe_stamp = jiffies;
 737
 738 out_unlock:
 739         spin_unlock_bh(&fnhe_lock);
 740 }
 741
 742 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 743                              bool kill_route)
 744 {
 745         __be32 new_gw = icmp_hdr(skb)->un.gateway;
 746         __be32 old_gw = ip_hdr(skb)->saddr;
 747         struct net_device *dev = skb->dev;
 748         struct in_device *in_dev;
 749         struct fib_result res;
 750         struct neighbour *n;
 751         struct net *net;
 752
 753         switch (icmp_hdr(skb)->code & 7) {
 754         case ICMP_REDIR_NET:
 755         case ICMP_REDIR_NETTOS:
 756         case ICMP_REDIR_HOST:
 757         case ICMP_REDIR_HOSTTOS:
 758                 break;
 759
 760         default:
 761                 return;
 762         }
 763
 764         if (rt->rt_gateway != old_gw)
 765                 return;
 766
 767         in_dev = __in_dev_get_rcu(dev);
 768         if (!in_dev)
 769                 return;
 770
 771         net = dev_net(dev);
 772         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 773             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 774             ipv4_is_zeronet(new_gw))
 775                 goto reject_redirect;
 776
 777         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 778                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 779                         goto reject_redirect;
 780                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 781                         goto reject_redirect;
 782         } else {
 783                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 784                         goto reject_redirect;
 785         }
 786
 787         n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
 788         if (!n)
 789                 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
 790         if (!IS_ERR(n)) {
 791                 if (!(n->nud_state & NUD_VALID)) {
 792                         neigh_event_send(n, NULL);
 793                 } else {
 794                         if (fib_lookup(net, fl4, &res, 0) == 0) {
 795                                 struct fib_nh *nh = &FIB_RES_NH(res);
 796
 797                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
 798                                                 0, false,
 799                                                 jiffies + ip_rt_gc_timeout);
 800                         }
 801                         if (kill_route)
 802                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 803                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 804                 }
 805                 neigh_release(n);
 806         }
 807         return;
 808
 809 reject_redirect:
 810 #ifdef CONFIG_IP_ROUTE_VERBOSE
 811         if (IN_DEV_LOG_MARTIANS(in_dev)) {
 812                 const struct iphdr *iph = (const struct iphdr *) skb->data;
 813                 __be32 daddr = iph->daddr;
 814                 __be32 saddr = iph->saddr;
 815
 816                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 817                                      "  Advised path = %pI4 -> %pI4\n",
 818                                      &old_gw, dev->name, &new_gw,
 819                                      &saddr, &daddr);
 820         }
 821 #endif
 822         ;
 823 }
 824
 825 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 826 {
 827         struct rtable *rt;
 828         struct flowi4 fl4;
 829         const struct iphdr *iph = (const struct iphdr *) skb->data;
 830         int oif = skb->dev->ifindex;
 831         u8 tos = RT_TOS(iph->tos);
 832         u8 prot = iph->protocol;
 833         u32 mark = skb->mark;
 834
 835         rt = (struct rtable *) dst;
 836
 837         __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
 838         __ip_do_redirect(rt, skb, &fl4, true);
 839 }
 840
 841 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 842 {
 843         struct rtable *rt = (struct rtable *)dst;
 844         struct dst_entry *ret = dst;
 845
 846         if (rt) {
 847                 if (dst->obsolete > 0) {
 848                         ip_rt_put(rt);
 849                         ret = NULL;
 850                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 851                            rt->dst.expires) {
 852                         ip_rt_put(rt);
 853                         ret = NULL;
 854                 }
 855         }
 856         return ret;
 857 }
 858
 859 /*
 860  * Algorithm:
 861  *      1. The first ip_rt_redirect_number redirects are sent
 862  *         with exponential backoff, then we stop sending them at all,
 863  *         assuming that the host ignores our redirects.
 864  *      2. If we did not see packets requiring redirects
 865  *         during ip_rt_redirect_silence, we assume that the host
 866  *         forgot redirected route and start to send redirects again.
 867  *
 868  * This algorithm is much cheaper and more intelligent than dumb load limiting
 869  * in icmp.c.
 870  *
 871  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 872  * and "frag. need" (breaks PMTU discovery) in icmp.c.
 873  */
 874
 875 void ip_rt_send_redirect(struct sk_buff *skb)
 876 {
 877         struct rtable *rt = skb_rtable(skb);
 878         struct in_device *in_dev;
 879         struct inet_peer *peer;
 880         struct net *net;
 881         int log_martians;
 882         int vif;
 883
 884         rcu_read_lock();
 885         in_dev = __in_dev_get_rcu(rt->dst.dev);
 886         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 887                 rcu_read_unlock();
 888                 return;
 889         }
 890         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 891         vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
 892         rcu_read_unlock();
 893
 894         net = dev_net(rt->dst.dev);
 895         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
 896         if (!peer) {
 897                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 898                           rt_nexthop(rt, ip_hdr(skb)->daddr));
 899                 return;
 900         }
 901
 902         /* No redirected packets during ip_rt_redirect_silence;
 903          * reset the algorithm.
 904          */
 905         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
 906                 peer->rate_tokens = 0;
 907                 peer->n_redirects = 0;
 908         }
 909
 910         /* Too many ignored redirects; do not send anything
 911          * set dst.rate_last to the last seen redirected packet.
 912          */
 913         if (peer->n_redirects >= ip_rt_redirect_number) {
 914                 peer->rate_last = jiffies;
 915                 goto out_put_peer;
 916         }
 917
 918         /* Check for load limit; set rate_last to the latest sent
 919          * redirect.
 920          */
 921         if (peer->n_redirects == 0 ||
 922             time_after(jiffies,
 923                        (peer->rate_last +
 924                         (ip_rt_redirect_load << peer->n_redirects)))) {
 925                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 926
 927                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 928                 peer->rate_last = jiffies;
 929                 ++peer->n_redirects;
 930 #ifdef CONFIG_IP_ROUTE_VERBOSE
 931                 if (log_martians &&
 932                     peer->n_redirects == ip_rt_redirect_number)
 933                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 934                                              &ip_hdr(skb)->saddr, inet_iif(skb),
 935                                              &ip_hdr(skb)->daddr, &gw);
 936 #endif
 937         }
 938 out_put_peer:
 939         inet_putpeer(peer);
 940 }
 941
 942 static int ip_error(struct sk_buff *skb)
 943 {
 944         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 945         struct rtable *rt = skb_rtable(skb);
 946         struct inet_peer *peer;
 947         unsigned long now;
 948         struct net *net;
 949         bool send;
 950         int code;
 951
 952         /* IP on this device is disabled. */
 953         if (!in_dev)
 954                 goto out;
 955
 956         net = dev_net(rt->dst.dev);
 957         if (!IN_DEV_FORWARD(in_dev)) {
 958                 switch (rt->dst.error) {
 959                 case EHOSTUNREACH:
 960                         IP_INC_STATS_BH(net, IPSTATS_MIB_INADDRERRORS);
 961                         break;
 962
 963                 case ENETUNREACH:
 964                         IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
 965                         break;
 966                 }
 967                 goto out;
 968         }
 969
 970         switch (rt->dst.error) {
 971         case EINVAL:
 972         default:
 973                 goto out;
 974         case EHOSTUNREACH:
 975                 code = ICMP_HOST_UNREACH;
 976                 break;
 977         case ENETUNREACH:
 978                 code = ICMP_NET_UNREACH;
 979                 IP_INC_STATS_BH(net, IPSTATS_MIB_INNOROUTES);
 980                 break;
 981         case EACCES:
 982                 code = ICMP_PKT_FILTERED;
 983                 break;
 984         }
 985
 986         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
 987                                l3mdev_master_ifindex(skb->dev), 1);
 988
 989         send = true;
 990         if (peer) {
 991                 now = jiffies;
 992                 peer->rate_tokens += now - peer->rate_last;
 993                 if (peer->rate_tokens > ip_rt_error_burst)
 994                         peer->rate_tokens = ip_rt_error_burst;
 995                 peer->rate_last = now;
 996                 if (peer->rate_tokens >= ip_rt_error_cost)
 997                         peer->rate_tokens -= ip_rt_error_cost;
 998                 else
 999                         send = false;
1000                 inet_putpeer(peer);
1001         }
1002         if (send)
1003                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1004
1005 out:    kfree_skb(skb);
1006         return 0;
1007 }
1008
1009 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1010 {
1011         struct dst_entry *dst = &rt->dst;
1012         u32 old_mtu = ipv4_mtu(dst);
1013         struct fib_result res;
1014         bool lock = false;
1015
1016         if (ip_mtu_locked(dst))
1017                 return;
1018
1019         if (old_mtu < mtu)
1020                 return;
1021
1022         if (mtu < ip_rt_min_pmtu) {
1023                 lock = true;
1024                 mtu = min(old_mtu, ip_rt_min_pmtu);
1025         }
1026
1027         if (rt->rt_pmtu == mtu && !lock &&
1028             time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1029                 return;
1030
1031         rcu_read_lock();
1032         if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1033                 struct fib_nh *nh = &FIB_RES_NH(res);
1034
1035                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
1036                                       jiffies + ip_rt_mtu_expires);
1037         }
1038         rcu_read_unlock();
1039 }
1040
1041 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1042                               struct sk_buff *skb, u32 mtu)
1043 {
1044         struct rtable *rt = (struct rtable *) dst;
1045         struct flowi4 fl4;
1046
1047         ip_rt_build_flow_key(&fl4, sk, skb);
1048         __ip_rt_update_pmtu(rt, &fl4, mtu);
1049 }
1050
1051 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1052                       int oif, u32 mark, u8 protocol, int flow_flags)
1053 {
1054         const struct iphdr *iph = (const struct iphdr *) skb->data;
1055         struct flowi4 fl4;
1056         struct rtable *rt;
1057
1058         if (!mark)
1059                 mark = IP4_REPLY_MARK(net, skb->mark);
1060
1061         __build_flow_key(&fl4, NULL, iph, oif,
1062                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1063         rt = __ip_route_output_key(net, &fl4);
1064         if (!IS_ERR(rt)) {
1065                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1066                 ip_rt_put(rt);
1067         }
1068 }
1069 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1070
1071 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1072 {
1073         const struct iphdr *iph = (const struct iphdr *) skb->data;
1074         struct flowi4 fl4;
1075         struct rtable *rt;
1076
1077         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1078
1079         if (!fl4.flowi4_mark)
1080                 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1081
1082         rt = __ip_route_output_key(sock_net(sk), &fl4);
1083         if (!IS_ERR(rt)) {
1084                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1085                 ip_rt_put(rt);
1086         }
1087 }
1088
1089 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1090 {
1091         const struct iphdr *iph = (const struct iphdr *) skb->data;
1092         struct flowi4 fl4;
1093         struct rtable *rt;
1094         struct dst_entry *odst = NULL;
1095         bool new = false;
1096
1097         bh_lock_sock(sk);
1098
1099         if (!ip_sk_accept_pmtu(sk))
1100                 goto out;
1101
1102         odst = sk_dst_get(sk);
1103
1104         if (sock_owned_by_user(sk) || !odst) {
1105                 __ipv4_sk_update_pmtu(skb, sk, mtu);
1106                 goto out;
1107         }
1108
1109         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1110
1111         rt = (struct rtable *)odst;
1112         if (odst->obsolete && !odst->ops->check(odst, 0)) {
1113                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1114                 if (IS_ERR(rt))
1115                         goto out;
1116
1117                 new = true;
1118         }
1119
1120         __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1121
1122         if (!dst_check(&rt->dst, 0)) {
1123                 if (new)
1124                         dst_release(&rt->dst);
1125
1126                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1127                 if (IS_ERR(rt))
1128                         goto out;
1129
1130                 new = true;
1131         }
1132
1133         if (new)
1134                 sk_dst_set(sk, &rt->dst);
1135
1136 out:
1137         bh_unlock_sock(sk);
1138         dst_release(odst);
1139 }
1140 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1141
1142 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1143                    int oif, u32 mark, u8 protocol, int flow_flags)
1144 {
1145         const struct iphdr *iph = (const struct iphdr *) skb->data;
1146         struct flowi4 fl4;
1147         struct rtable *rt;
1148
1149         __build_flow_key(&fl4, NULL, iph, oif,
1150                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1151         rt = __ip_route_output_key(net, &fl4);
1152         if (!IS_ERR(rt)) {
1153                 __ip_do_redirect(rt, skb, &fl4, false);
1154                 ip_rt_put(rt);
1155         }
1156 }
1157 EXPORT_SYMBOL_GPL(ipv4_redirect);
1158
1159 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1160 {
1161         const struct iphdr *iph = (const struct iphdr *) skb->data;
1162         struct flowi4 fl4;
1163         struct rtable *rt;
1164
1165         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1166         rt = __ip_route_output_key(sock_net(sk), &fl4);
1167         if (!IS_ERR(rt)) {
1168                 __ip_do_redirect(rt, skb, &fl4, false);
1169                 ip_rt_put(rt);
1170         }
1171 }
1172 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1173
1174 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1175 {
1176         struct rtable *rt = (struct rtable *) dst;
1177
1178         /* All IPV4 dsts are created with ->obsolete set to the value
1179          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1180          * into this function always.
1181          *
1182          * When a PMTU/redirect information update invalidates a route,
1183          * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1184          * DST_OBSOLETE_DEAD by dst_free().
1185          */
1186         if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1187                 return NULL;
1188         return dst;
1189 }
1190
1191 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1192 {
1193         struct ip_options opt;
1194         int res;
1195
1196         /* Recompile ip options since IPCB may not be valid anymore.
1197          * Also check we have a reasonable ipv4 header.
1198          */
1199         if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1200             ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1201                 return;
1202
1203         memset(&opt, 0, sizeof(opt));
1204         if (ip_hdr(skb)->ihl > 5) {
1205                 if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1206                         return;
1207                 opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1208
1209                 rcu_read_lock();
1210                 res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1211                 rcu_read_unlock();
1212
1213                 if (res)
1214                         return;
1215         }
1216         __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1217 }
1218
1219 static void ipv4_link_failure(struct sk_buff *skb)
1220 {
1221         struct rtable *rt;
1222
1223         ipv4_send_dest_unreach(skb);
1224
1225         rt = skb_rtable(skb);
1226         if (rt)
1227                 dst_set_expires(&rt->dst, 0);
1228 }
1229
1230 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1231 {
1232         pr_debug("%s: %pI4 -> %pI4, %s\n",
1233                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1234                  skb->dev ? skb->dev->name : "?");
1235         kfree_skb(skb);
1236         WARN_ON(1);
1237         return 0;
1238 }
1239
1240 /*
1241    We do not cache source address of outgoing interface,
1242    because it is used only by IP RR, TS and SRR options,
1243    so that it out of fast path.
1244
1245    BTW remember: "addr" is allowed to be not aligned
1246    in IP options!
1247  */
1248
1249 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1250 {
1251         __be32 src;
1252
1253         if (rt_is_output_route(rt))
1254                 src = ip_hdr(skb)->saddr;
1255         else {
1256                 struct fib_result res;
1257                 struct flowi4 fl4;
1258                 struct iphdr *iph;
1259
1260                 iph = ip_hdr(skb);
1261
1262                 memset(&fl4, 0, sizeof(fl4));
1263                 fl4.daddr = iph->daddr;
1264                 fl4.saddr = iph->saddr;
1265                 fl4.flowi4_tos = RT_TOS(iph->tos);
1266                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1267                 fl4.flowi4_iif = skb->dev->ifindex;
1268                 fl4.flowi4_mark = skb->mark;
1269
1270                 rcu_read_lock();
1271                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1272                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1273                 else
1274                         src = inet_select_addr(rt->dst.dev,
1275                                                rt_nexthop(rt, iph->daddr),
1276                                                RT_SCOPE_UNIVERSE);
1277                 rcu_read_unlock();
1278         }
1279         memcpy(addr, &src, 4);
1280 }
1281
1282 #ifdef CONFIG_IP_ROUTE_CLASSID
1283 static void set_class_tag(struct rtable *rt, u32 tag)
1284 {
1285         if (!(rt->dst.tclassid & 0xFFFF))
1286                 rt->dst.tclassid |= tag & 0xFFFF;
1287         if (!(rt->dst.tclassid & 0xFFFF0000))
1288                 rt->dst.tclassid |= tag & 0xFFFF0000;
1289 }
1290 #endif
1291
1292 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1293 {
1294         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1295
1296         if (advmss == 0) {
1297                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1298                                ip_rt_min_advmss);
1299                 if (advmss > 65535 - 40)
1300                         advmss = 65535 - 40;
1301         }
1302         return advmss;
1303 }
1304
1305 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1306 {
1307         const struct rtable *rt = (const struct rtable *) dst;
1308         unsigned int mtu = rt->rt_pmtu;
1309
1310         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1311                 mtu = dst_metric_raw(dst, RTAX_MTU);
1312
1313         if (mtu)
1314                 return mtu;
1315
1316         mtu = READ_ONCE(dst->dev->mtu);
1317
1318         if (unlikely(ip_mtu_locked(dst))) {
1319                 if (rt->rt_uses_gateway && mtu > 576)
1320                         mtu = 576;
1321         }
1322
1323         return min_t(unsigned int, mtu, IP_MAX_MTU);
1324 }
1325
1326 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1327 {
1328         struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1329         struct fib_nh_exception *fnhe;
1330         u32 hval;
1331
1332         if (!hash)
1333                 return NULL;
1334
1335         hval = fnhe_hashfun(daddr);
1336
1337         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1338              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1339                 if (fnhe->fnhe_daddr == daddr)
1340                         return fnhe;
1341         }
1342         return NULL;
1343 }
1344
1345 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1346                               __be32 daddr)
1347 {
1348         bool ret = false;
1349
1350         spin_lock_bh(&fnhe_lock);
1351
1352         if (daddr == fnhe->fnhe_daddr) {
1353                 struct rtable __rcu **porig;
1354                 struct rtable *orig;
1355                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1356
1357                 if (rt_is_input_route(rt))
1358                         porig = &fnhe->fnhe_rth_input;
1359                 else
1360                         porig = &fnhe->fnhe_rth_output;
1361                 orig = rcu_dereference(*porig);
1362
1363                 if (fnhe->fnhe_genid != genid) {
1364                         fnhe->fnhe_genid = genid;
1365                         fnhe->fnhe_gw = 0;
1366                         fnhe->fnhe_pmtu = 0;
1367                         fnhe->fnhe_expires = 0;
1368                         fnhe_flush_routes(fnhe);
1369                         orig = NULL;
1370                 }
1371                 fill_route_from_fnhe(rt, fnhe);
1372                 if (!rt->rt_gateway)
1373                         rt->rt_gateway = daddr;
1374
1375                 if (!(rt->dst.flags & DST_NOCACHE)) {
1376                         rcu_assign_pointer(*porig, rt);
1377                         if (orig)
1378                                 rt_free(orig);
1379                         ret = true;
1380                 }
1381
1382                 fnhe->fnhe_stamp = jiffies;
1383         }
1384         spin_unlock_bh(&fnhe_lock);
1385
1386         return ret;
1387 }
1388
1389 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1390 {
1391         struct rtable *orig, *prev, **p;
1392         bool ret = true;
1393
1394         if (rt_is_input_route(rt)) {
1395                 p = (struct rtable **)&nh->nh_rth_input;
1396         } else {
1397                 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1398         }
1399         orig = *p;
1400
1401         prev = cmpxchg(p, orig, rt);
1402         if (prev == orig) {
1403                 if (orig)
1404                         rt_free(orig);
1405         } else
1406                 ret = false;
1407
1408         return ret;
1409 }
1410
1411 struct uncached_list {
1412         spinlock_t              lock;
1413         struct list_head        head;
1414 };
1415
1416 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1417
1418 static void rt_add_uncached_list(struct rtable *rt)
1419 {
1420         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1421
1422         rt->rt_uncached_list = ul;
1423
1424         spin_lock_bh(&ul->lock);
1425         list_add_tail(&rt->rt_uncached, &ul->head);
1426         spin_unlock_bh(&ul->lock);
1427 }
1428
1429 static void ipv4_dst_destroy(struct dst_entry *dst)
1430 {
1431         struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1432         struct rtable *rt = (struct rtable *) dst;
1433
1434         if (p != &dst_default_metrics && atomic_dec_and_test(&p->refcnt))
1435                 kfree(p);
1436
1437         if (!list_empty(&rt->rt_uncached)) {
1438                 struct uncached_list *ul = rt->rt_uncached_list;
1439
1440                 spin_lock_bh(&ul->lock);
1441                 list_del(&rt->rt_uncached);
1442                 spin_unlock_bh(&ul->lock);
1443         }
1444 }
1445
1446 void rt_flush_dev(struct net_device *dev)
1447 {
1448         struct net *net = dev_net(dev);
1449         struct rtable *rt;
1450         int cpu;
1451
1452         for_each_possible_cpu(cpu) {
1453                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1454
1455                 spin_lock_bh(&ul->lock);
1456                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1457                         if (rt->dst.dev != dev)
1458                                 continue;
1459                         rt->dst.dev = net->loopback_dev;
1460                         dev_hold(rt->dst.dev);
1461                         dev_put(dev);
1462                 }
1463                 spin_unlock_bh(&ul->lock);
1464         }
1465 }
1466
1467 static bool rt_cache_valid(const struct rtable *rt)
1468 {
1469         return  rt &&
1470                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1471                 !rt_is_expired(rt);
1472 }
1473
1474 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1475                            const struct fib_result *res,
1476                            struct fib_nh_exception *fnhe,
1477                            struct fib_info *fi, u16 type, u32 itag)
1478 {
1479         bool cached = false;
1480
1481         if (fi) {
1482                 struct fib_nh *nh = &FIB_RES_NH(*res);
1483
1484                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1485                         rt->rt_gateway = nh->nh_gw;
1486                         rt->rt_uses_gateway = 1;
1487                 }
1488                 dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1489                 if (fi->fib_metrics != &dst_default_metrics) {
1490                         rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1491                         atomic_inc(&fi->fib_metrics->refcnt);
1492                 }
1493 #ifdef CONFIG_IP_ROUTE_CLASSID
1494                 rt->dst.tclassid = nh->nh_tclassid;
1495 #endif
1496                 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1497                 if (unlikely(fnhe))
1498                         cached = rt_bind_exception(rt, fnhe, daddr);
1499                 else if (!(rt->dst.flags & DST_NOCACHE))
1500                         cached = rt_cache_route(nh, rt);
1501                 if (unlikely(!cached)) {
1502                         /* Routes we intend to cache in nexthop exception or
1503                          * FIB nexthop have the DST_NOCACHE bit clear.
1504                          * However, if we are unsuccessful at storing this
1505                          * route into the cache we really need to set it.
1506                          */
1507                         rt->dst.flags |= DST_NOCACHE;
1508                         if (!rt->rt_gateway)
1509                                 rt->rt_gateway = daddr;
1510                         rt_add_uncached_list(rt);
1511                 }
1512         } else
1513                 rt_add_uncached_list(rt);
1514
1515 #ifdef CONFIG_IP_ROUTE_CLASSID
1516 #ifdef CONFIG_IP_MULTIPLE_TABLES
1517         set_class_tag(rt, res->tclassid);
1518 #endif
1519         set_class_tag(rt, itag);
1520 #endif
1521 }
1522
1523 struct rtable *rt_dst_alloc(struct net_device *dev,
1524                             unsigned int flags, u16 type,
1525                             bool nopolicy, bool noxfrm, bool will_cache)
1526 {
1527         struct rtable *rt;
1528
1529         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1530                        (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1531                        (nopolicy ? DST_NOPOLICY : 0) |
1532                        (noxfrm ? DST_NOXFRM : 0));
1533
1534         if (rt) {
1535                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1536                 rt->rt_flags = flags;
1537                 rt->rt_type = type;
1538                 rt->rt_is_input = 0;
1539                 rt->rt_iif = 0;
1540                 rt->rt_pmtu = 0;
1541                 rt->rt_mtu_locked = 0;
1542                 rt->rt_gateway = 0;
1543                 rt->rt_uses_gateway = 0;
1544                 rt->rt_table_id = 0;
1545                 INIT_LIST_HEAD(&rt->rt_uncached);
1546
1547                 rt->dst.output = ip_output;
1548                 if (flags & RTCF_LOCAL)
1549                         rt->dst.input = ip_local_deliver;
1550         }
1551
1552         return rt;
1553 }
1554 EXPORT_SYMBOL(rt_dst_alloc);
1555
1556 /* called in rcu_read_lock() section */
1557 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1558                                 u8 tos, struct net_device *dev, int our)
1559 {
1560         struct rtable *rth;
1561         struct in_device *in_dev = __in_dev_get_rcu(dev);
1562         unsigned int flags = RTCF_MULTICAST;
1563         u32 itag = 0;
1564         int err;
1565
1566         /* Primary sanity checks. */
1567
1568         if (!in_dev)
1569                 return -EINVAL;
1570
1571         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1572             skb->protocol != htons(ETH_P_IP))
1573                 goto e_inval;
1574
1575         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1576                 goto e_inval;
1577
1578         if (ipv4_is_zeronet(saddr)) {
1579                 if (!ipv4_is_local_multicast(daddr))
1580                         goto e_inval;
1581         } else {
1582                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1583                                           in_dev, &itag);
1584                 if (err < 0)
1585                         goto e_err;
1586         }
1587         if (our)
1588                 flags |= RTCF_LOCAL;
1589
1590         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1591                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1592         if (!rth)
1593                 goto e_nobufs;
1594
1595 #ifdef CONFIG_IP_ROUTE_CLASSID
1596         rth->dst.tclassid = itag;
1597 #endif
1598         rth->dst.output = ip_rt_bug;
1599         rth->rt_is_input= 1;
1600
1601 #ifdef CONFIG_IP_MROUTE
1602         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1603                 rth->dst.input = ip_mr_input;
1604 #endif
1605         RT_CACHE_STAT_INC(in_slow_mc);
1606
1607         skb_dst_set(skb, &rth->dst);
1608         return 0;
1609
1610 e_nobufs:
1611         return -ENOBUFS;
1612 e_inval:
1613         return -EINVAL;
1614 e_err:
1615         return err;
1616 }
1617
1618
1619 static void ip_handle_martian_source(struct net_device *dev,
1620                                      struct in_device *in_dev,
1621                                      struct sk_buff *skb,
1622                                      __be32 daddr,
1623                                      __be32 saddr)
1624 {
1625         RT_CACHE_STAT_INC(in_martian_src);
1626 #ifdef CONFIG_IP_ROUTE_VERBOSE
1627         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1628                 /*
1629                  *      RFC1812 recommendation, if source is martian,
1630                  *      the only hint is MAC header.
1631                  */
1632                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1633                         &daddr, &saddr, dev->name);
1634                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1635                         print_hex_dump(KERN_WARNING, "ll header: ",
1636                                        DUMP_PREFIX_OFFSET, 16, 1,
1637                                        skb_mac_header(skb),
1638                                        dev->hard_header_len, true);
1639                 }
1640         }
1641 #endif
1642 }
1643
1644 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1645 {
1646         struct fnhe_hash_bucket *hash;
1647         struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1648         u32 hval = fnhe_hashfun(daddr);
1649
1650         spin_lock_bh(&fnhe_lock);
1651
1652         hash = rcu_dereference_protected(nh->nh_exceptions,
1653                                          lockdep_is_held(&fnhe_lock));
1654         hash += hval;
1655
1656         fnhe_p = &hash->chain;
1657         fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1658         while (fnhe) {
1659                 if (fnhe->fnhe_daddr == daddr) {
1660                         rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1661                                 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1662                         /* set fnhe_daddr to 0 to ensure it won't bind with
1663                          * new dsts in rt_bind_exception().
1664                          */
1665                         fnhe->fnhe_daddr = 0;
1666                         fnhe_flush_routes(fnhe);
1667                         kfree_rcu(fnhe, rcu);
1668                         break;
1669                 }
1670                 fnhe_p = &fnhe->fnhe_next;
1671                 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1672                                                  lockdep_is_held(&fnhe_lock));
1673         }
1674
1675         spin_unlock_bh(&fnhe_lock);
1676 }
1677
1678 /* called in rcu_read_lock() section */
1679 static int __mkroute_input(struct sk_buff *skb,
1680                            const struct fib_result *res,
1681                            struct in_device *in_dev,
1682                            __be32 daddr, __be32 saddr, u32 tos)
1683 {
1684         struct fib_nh_exception *fnhe;
1685         struct rtable *rth;
1686         int err;
1687         struct in_device *out_dev;
1688         bool do_cache;
1689         u32 itag = 0;
1690
1691         /* get a working reference to the output device */
1692         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1693         if (!out_dev) {
1694                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1695                 return -EINVAL;
1696         }
1697
1698         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1699                                   in_dev->dev, in_dev, &itag);
1700         if (err < 0) {
1701                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1702                                          saddr);
1703
1704                 goto cleanup;
1705         }
1706
1707         do_cache = res->fi && !itag;
1708         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1709             skb->protocol == htons(ETH_P_IP) &&
1710             (IN_DEV_SHARED_MEDIA(out_dev) ||
1711              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1712                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1713
1714         if (skb->protocol != htons(ETH_P_IP)) {
1715                 /* Not IP (i.e. ARP). Do not create route, if it is
1716                  * invalid for proxy arp. DNAT routes are always valid.
1717                  *
1718                  * Proxy arp feature have been extended to allow, ARP
1719                  * replies back to the same interface, to support
1720                  * Private VLAN switch technologies. See arp.c.
1721                  */
1722                 if (out_dev == in_dev &&
1723                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1724                         err = -EINVAL;
1725                         goto cleanup;
1726                 }
1727         }
1728
1729         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1730         if (do_cache) {
1731                 if (fnhe) {
1732                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1733                         if (rth && rth->dst.expires &&
1734                             time_after(jiffies, rth->dst.expires)) {
1735                                 ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1736                                 fnhe = NULL;
1737                         } else {
1738                                 goto rt_cache;
1739                         }
1740                 }
1741
1742                 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1743
1744 rt_cache:
1745                 if (rt_cache_valid(rth)) {
1746                         skb_dst_set_noref(skb, &rth->dst);
1747                         goto out;
1748                 }
1749         }
1750
1751         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1752                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1753                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1754         if (!rth) {
1755                 err = -ENOBUFS;
1756                 goto cleanup;
1757         }
1758
1759         rth->rt_is_input = 1;
1760         if (res->table)
1761                 rth->rt_table_id = res->table->tb_id;
1762         RT_CACHE_STAT_INC(in_slow_tot);
1763
1764         rth->dst.input = ip_forward;
1765
1766         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1767         if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1768                 rth->dst.lwtstate->orig_output = rth->dst.output;
1769                 rth->dst.output = lwtunnel_output;
1770         }
1771         if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1772                 rth->dst.lwtstate->orig_input = rth->dst.input;
1773                 rth->dst.input = lwtunnel_input;
1774         }
1775         skb_dst_set(skb, &rth->dst);
1776 out:
1777         err = 0;
1778  cleanup:
1779         return err;
1780 }
1781
1782 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1783
1784 /* To make ICMP packets follow the right flow, the multipath hash is
1785  * calculated from the inner IP addresses in reverse order.
1786  */
1787 static int ip_multipath_icmp_hash(struct sk_buff *skb)
1788 {
1789         const struct iphdr *outer_iph = ip_hdr(skb);
1790         struct icmphdr _icmph;
1791         const struct icmphdr *icmph;
1792         struct iphdr _inner_iph;
1793         const struct iphdr *inner_iph;
1794
1795         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1796                 goto standard_hash;
1797
1798         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1799                                    &_icmph);
1800         if (!icmph)
1801                 goto standard_hash;
1802
1803         if (icmph->type != ICMP_DEST_UNREACH &&
1804             icmph->type != ICMP_REDIRECT &&
1805             icmph->type != ICMP_TIME_EXCEEDED &&
1806             icmph->type != ICMP_PARAMETERPROB) {
1807                 goto standard_hash;
1808         }
1809
1810         inner_iph = skb_header_pointer(skb,
1811                                        outer_iph->ihl * 4 + sizeof(_icmph),
1812                                        sizeof(_inner_iph), &_inner_iph);
1813         if (!inner_iph)
1814                 goto standard_hash;
1815
1816         return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
1817
1818 standard_hash:
1819         return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
1820 }
1821
1822 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1823
1824 static int ip_mkroute_input(struct sk_buff *skb,
1825                             struct fib_result *res,
1826                             const struct flowi4 *fl4,
1827                             struct in_device *in_dev,
1828                             __be32 daddr, __be32 saddr, u32 tos)
1829 {
1830 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1831         if (res->fi && res->fi->fib_nhs > 1) {
1832                 int h;
1833
1834                 if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
1835                         h = ip_multipath_icmp_hash(skb);
1836                 else
1837                         h = fib_multipath_hash(saddr, daddr);
1838                 fib_select_multipath(res, h);
1839         }
1840 #endif
1841
1842         /* create a routing cache entry */
1843         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1844 }
1845
1846 /*
1847  *      NOTE. We drop all the packets that has local source
1848  *      addresses, because every properly looped back packet
1849  *      must have correct destination already attached by output routine.
1850  *
1851  *      Such approach solves two big problems:
1852  *      1. Not simplex devices are handled properly.
1853  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1854  *      called with rcu_read_lock()
1855  */
1856
1857 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1858                                u8 tos, struct net_device *dev)
1859 {
1860         struct fib_result res;
1861         struct in_device *in_dev = __in_dev_get_rcu(dev);
1862         struct ip_tunnel_info *tun_info;
1863         struct flowi4   fl4;
1864         unsigned int    flags = 0;
1865         u32             itag = 0;
1866         struct rtable   *rth;
1867         int             err = -EINVAL;
1868         struct net    *net = dev_net(dev);
1869         bool do_cache;
1870
1871         /* IP on this device is disabled. */
1872
1873         if (!in_dev)
1874                 goto out;
1875
1876         /* Check for the most weird martians, which can be not detected
1877            by fib_lookup.
1878          */
1879
1880         tun_info = skb_tunnel_info(skb);
1881         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1882                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1883         else
1884                 fl4.flowi4_tun_key.tun_id = 0;
1885         skb_dst_drop(skb);
1886
1887         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1888                 goto martian_source;
1889
1890         res.fi = NULL;
1891         res.table = NULL;
1892         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1893                 goto brd_input;
1894
1895         /* Accept zero addresses only to limited broadcast;
1896          * I even do not know to fix it or not. Waiting for complains :-)
1897          */
1898         if (ipv4_is_zeronet(saddr))
1899                 goto martian_source;
1900
1901         if (ipv4_is_zeronet(daddr))
1902                 goto martian_destination;
1903
1904         /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1905          * and call it once if daddr or/and saddr are loopback addresses
1906          */
1907         if (ipv4_is_loopback(daddr)) {
1908                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1909                         goto martian_destination;
1910         } else if (ipv4_is_loopback(saddr)) {
1911                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1912                         goto martian_source;
1913         }
1914
1915         /*
1916          *      Now we are ready to route packet.
1917          */
1918         fl4.flowi4_oif = 0;
1919         fl4.flowi4_iif = l3mdev_fib_oif_rcu(dev);
1920         fl4.flowi4_mark = skb->mark;
1921         fl4.flowi4_tos = tos;
1922         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1923         fl4.flowi4_flags = 0;
1924         fl4.daddr = daddr;
1925         fl4.saddr = saddr;
1926         err = fib_lookup(net, &fl4, &res, 0);
1927         if (err != 0) {
1928                 if (!IN_DEV_FORWARD(in_dev))
1929                         err = -EHOSTUNREACH;
1930                 goto no_route;
1931         }
1932
1933         if (res.type == RTN_BROADCAST)
1934                 goto brd_input;
1935
1936         if (res.type == RTN_LOCAL) {
1937                 err = fib_validate_source(skb, saddr, daddr, tos,
1938                                           0, dev, in_dev, &itag);
1939                 if (err < 0)
1940                         goto martian_source;
1941                 goto local_input;
1942         }
1943
1944         if (!IN_DEV_FORWARD(in_dev)) {
1945                 err = -EHOSTUNREACH;
1946                 goto no_route;
1947         }
1948         if (res.type != RTN_UNICAST)
1949                 goto martian_destination;
1950
1951         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1952 out:    return err;
1953
1954 brd_input:
1955         if (skb->protocol != htons(ETH_P_IP))
1956                 goto e_inval;
1957
1958         if (!ipv4_is_zeronet(saddr)) {
1959                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1960                                           in_dev, &itag);
1961                 if (err < 0)
1962                         goto martian_source;
1963         }
1964         flags |= RTCF_BROADCAST;
1965         res.type = RTN_BROADCAST;
1966         RT_CACHE_STAT_INC(in_brd);
1967
1968 local_input:
1969         do_cache = false;
1970         if (res.fi) {
1971                 if (!itag) {
1972                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1973                         if (rt_cache_valid(rth)) {
1974                                 skb_dst_set_noref(skb, &rth->dst);
1975                                 err = 0;
1976                                 goto out;
1977                         }
1978                         do_cache = true;
1979                 }
1980         }
1981
1982         rth = rt_dst_alloc(net->loopback_dev, flags | RTCF_LOCAL, res.type,
1983                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1984         if (!rth)
1985                 goto e_nobufs;
1986
1987         rth->dst.output= ip_rt_bug;
1988 #ifdef CONFIG_IP_ROUTE_CLASSID
1989         rth->dst.tclassid = itag;
1990 #endif
1991         rth->rt_is_input = 1;
1992         if (res.table)
1993                 rth->rt_table_id = res.table->tb_id;
1994
1995         RT_CACHE_STAT_INC(in_slow_tot);
1996         if (res.type == RTN_UNREACHABLE) {
1997                 rth->dst.input= ip_error;
1998                 rth->dst.error= -err;
1999                 rth->rt_flags   &= ~RTCF_LOCAL;
2000         }
2001         if (do_cache) {
2002                 if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
2003                         rth->dst.flags |= DST_NOCACHE;
2004                         rt_add_uncached_list(rth);
2005                 }
2006         }
2007         skb_dst_set(skb, &rth->dst);
2008         err = 0;
2009         goto out;
2010
2011 no_route:
2012         RT_CACHE_STAT_INC(in_no_route);
2013         res.type = RTN_UNREACHABLE;
2014         res.fi = NULL;
2015         res.table = NULL;
2016         goto local_input;
2017
2018         /*
2019          *      Do not cache martian addresses: they should be logged (RFC1812)
2020          */
2021 martian_destination:
2022         RT_CACHE_STAT_INC(in_martian_dst);
2023 #ifdef CONFIG_IP_ROUTE_VERBOSE
2024         if (IN_DEV_LOG_MARTIANS(in_dev))
2025                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2026                                      &daddr, &saddr, dev->name);
2027 #endif
2028
2029 e_inval:
2030         err = -EINVAL;
2031         goto out;
2032
2033 e_nobufs:
2034         err = -ENOBUFS;
2035         goto out;
2036
2037 martian_source:
2038         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2039         goto out;
2040 }
2041
2042 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2043                          u8 tos, struct net_device *dev)
2044 {
2045         int res;
2046
2047         tos &= IPTOS_RT_MASK;
2048         rcu_read_lock();
2049
2050         /* Multicast recognition logic is moved from route cache to here.
2051            The problem was that too many Ethernet cards have broken/missing
2052            hardware multicast filters :-( As result the host on multicasting
2053            network acquires a lot of useless route cache entries, sort of
2054            SDR messages from all the world. Now we try to get rid of them.
2055            Really, provided software IP multicast filter is organized
2056            reasonably (at least, hashed), it does not result in a slowdown
2057            comparing with route cache reject entries.
2058            Note, that multicast routers are not affected, because
2059            route cache entry is created eventually.
2060          */
2061         if (ipv4_is_multicast(daddr)) {
2062                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2063
2064                 if (in_dev) {
2065                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2066                                                   ip_hdr(skb)->protocol);
2067                         if (our
2068 #ifdef CONFIG_IP_MROUTE
2069                                 ||
2070                             (!ipv4_is_local_multicast(daddr) &&
2071                              IN_DEV_MFORWARD(in_dev))
2072 #endif
2073                            ) {
2074                                 int res = ip_route_input_mc(skb, daddr, saddr,
2075                                                             tos, dev, our);
2076                                 rcu_read_unlock();
2077                                 return res;
2078                         }
2079                 }
2080                 rcu_read_unlock();
2081                 return -EINVAL;
2082         }
2083         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2084         rcu_read_unlock();
2085         return res;
2086 }
2087 EXPORT_SYMBOL(ip_route_input_noref);
2088
2089 /* called with rcu_read_lock() */
2090 static struct rtable *__mkroute_output(const struct fib_result *res,
2091                                        const struct flowi4 *fl4, int orig_oif,
2092                                        struct net_device *dev_out,
2093                                        unsigned int flags)
2094 {
2095         struct fib_info *fi = res->fi;
2096         struct fib_nh_exception *fnhe;
2097         struct in_device *in_dev;
2098         u16 type = res->type;
2099         struct rtable *rth;
2100         bool do_cache;
2101
2102         in_dev = __in_dev_get_rcu(dev_out);
2103         if (!in_dev)
2104                 return ERR_PTR(-EINVAL);
2105
2106         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2107                 if (ipv4_is_loopback(fl4->saddr) && !(dev_out->flags & IFF_LOOPBACK))
2108                         return ERR_PTR(-EINVAL);
2109
2110         if (ipv4_is_lbcast(fl4->daddr))
2111                 type = RTN_BROADCAST;
2112         else if (ipv4_is_multicast(fl4->daddr))
2113                 type = RTN_MULTICAST;
2114         else if (ipv4_is_zeronet(fl4->daddr))
2115                 return ERR_PTR(-EINVAL);
2116
2117         if (dev_out->flags & IFF_LOOPBACK)
2118                 flags |= RTCF_LOCAL;
2119
2120         do_cache = true;
2121         if (type == RTN_BROADCAST) {
2122                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2123                 fi = NULL;
2124         } else if (type == RTN_MULTICAST) {
2125                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2126                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2127                                      fl4->flowi4_proto))
2128                         flags &= ~RTCF_LOCAL;
2129                 else
2130                         do_cache = false;
2131                 /* If multicast route do not exist use
2132                  * default one, but do not gateway in this case.
2133                  * Yes, it is hack.
2134                  */
2135                 if (fi && res->prefixlen < 4)
2136                         fi = NULL;
2137         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2138                    (orig_oif != dev_out->ifindex)) {
2139                 /* For local routes that require a particular output interface
2140                  * we do not want to cache the result.  Caching the result
2141                  * causes incorrect behaviour when there are multiple source
2142                  * addresses on the interface, the end result being that if the
2143                  * intended recipient is waiting on that interface for the
2144                  * packet he won't receive it because it will be delivered on
2145                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2146                  * be set to the loopback interface as well.
2147                  */
2148                 fi = NULL;
2149         }
2150
2151         fnhe = NULL;
2152         do_cache &= fi != NULL;
2153         if (do_cache) {
2154                 struct rtable __rcu **prth;
2155                 struct fib_nh *nh = &FIB_RES_NH(*res);
2156
2157                 fnhe = find_exception(nh, fl4->daddr);
2158                 if (fnhe) {
2159                         prth = &fnhe->fnhe_rth_output;
2160                         rth = rcu_dereference(*prth);
2161                         if (rth && rth->dst.expires &&
2162                             time_after(jiffies, rth->dst.expires)) {
2163                                 ip_del_fnhe(nh, fl4->daddr);
2164                                 fnhe = NULL;
2165                         } else {
2166                                 goto rt_cache;
2167                         }
2168                 }
2169
2170                 if (unlikely(fl4->flowi4_flags &
2171                              FLOWI_FLAG_KNOWN_NH &&
2172                              !(nh->nh_gw &&
2173                                nh->nh_scope == RT_SCOPE_LINK))) {
2174                         do_cache = false;
2175                         goto add;
2176                 }
2177                 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2178                 rth = rcu_dereference(*prth);
2179
2180 rt_cache:
2181                 if (rt_cache_valid(rth)) {
2182                         dst_hold(&rth->dst);
2183                         return rth;
2184                 }
2185         }
2186
2187 add:
2188         rth = rt_dst_alloc(dev_out, flags, type,
2189                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2190                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2191                            do_cache);
2192         if (!rth)
2193                 return ERR_PTR(-ENOBUFS);
2194
2195         rth->rt_iif     = orig_oif ? : 0;
2196         if (res->table)
2197                 rth->rt_table_id = res->table->tb_id;
2198
2199         RT_CACHE_STAT_INC(out_slow_tot);
2200
2201         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2202                 if (flags & RTCF_LOCAL &&
2203                     !(dev_out->flags & IFF_LOOPBACK)) {
2204                         rth->dst.output = ip_mc_output;
2205                         RT_CACHE_STAT_INC(out_slow_mc);
2206                 }
2207 #ifdef CONFIG_IP_MROUTE
2208                 if (type == RTN_MULTICAST) {
2209                         if (IN_DEV_MFORWARD(in_dev) &&
2210                             !ipv4_is_local_multicast(fl4->daddr)) {
2211                                 rth->dst.input = ip_mr_input;
2212                                 rth->dst.output = ip_mc_output;
2213                         }
2214                 }
2215 #endif
2216         }
2217
2218         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
2219         if (lwtunnel_output_redirect(rth->dst.lwtstate))
2220                 rth->dst.output = lwtunnel_output;
2221
2222         return rth;
2223 }
2224
2225 /*
2226  * Major route resolver routine.
2227  */
2228
2229 struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2230                                           int mp_hash)
2231 {
2232         struct net_device *dev_out = NULL;
2233         __u8 tos = RT_FL_TOS(fl4);
2234         unsigned int flags = 0;
2235         struct fib_result res;
2236         struct rtable *rth;
2237         int orig_oif;
2238         int err;
2239
2240         res.tclassid    = 0;
2241         res.fi          = NULL;
2242         res.table       = NULL;
2243
2244         orig_oif = fl4->flowi4_oif;
2245
2246         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2247         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2248         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2249                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2250
2251         rcu_read_lock();
2252         if (fl4->saddr) {
2253                 if (ipv4_is_multicast(fl4->saddr) ||
2254                     ipv4_is_lbcast(fl4->saddr) ||
2255                     ipv4_is_zeronet(fl4->saddr)) {
2256                         rth = ERR_PTR(-EINVAL);
2257                         goto out;
2258                 }
2259
2260                 rth = ERR_PTR(-ENETUNREACH);
2261
2262                 /* I removed check for oif == dev_out->oif here.
2263                    It was wrong for two reasons:
2264                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2265                       is assigned to multiple interfaces.
2266                    2. Moreover, we are allowed to send packets with saddr
2267                       of another iface. --ANK
2268                  */
2269
2270                 if (fl4->flowi4_oif == 0 &&
2271                     (ipv4_is_multicast(fl4->daddr) ||
2272                      ipv4_is_lbcast(fl4->daddr))) {
2273                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2274                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2275                         if (!dev_out)
2276                                 goto out;
2277
2278                         /* Special hack: user can direct multicasts
2279                            and limited broadcast via necessary interface
2280                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2281                            This hack is not just for fun, it allows
2282                            vic,vat and friends to work.
2283                            They bind socket to loopback, set ttl to zero
2284                            and expect that it will work.
2285                            From the viewpoint of routing cache they are broken,
2286                            because we are not allowed to build multicast path
2287                            with loopback source addr (look, routing cache
2288                            cannot know, that ttl is zero, so that packet
2289                            will not leave this host and route is valid).
2290                            Luckily, this hack is good workaround.
2291                          */
2292
2293                         fl4->flowi4_oif = dev_out->ifindex;
2294                         goto make_route;
2295                 }
2296
2297                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2298                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2299                         if (!__ip_dev_find(net, fl4->saddr, false))
2300                                 goto out;
2301                 }
2302         }
2303
2304
2305         if (fl4->flowi4_oif) {
2306                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2307                 rth = ERR_PTR(-ENODEV);
2308                 if (!dev_out)
2309                         goto out;
2310
2311                 /* RACE: Check return value of inet_select_addr instead. */
2312                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2313                         rth = ERR_PTR(-ENETUNREACH);
2314                         goto out;
2315                 }
2316                 if (ipv4_is_local_multicast(fl4->daddr) ||
2317                     ipv4_is_lbcast(fl4->daddr) ||
2318                     fl4->flowi4_proto == IPPROTO_IGMP) {
2319                         if (!fl4->saddr)
2320                                 fl4->saddr = inet_select_addr(dev_out, 0,
2321                                                               RT_SCOPE_LINK);
2322                         goto make_route;
2323                 }
2324                 if (!fl4->saddr) {
2325                         if (ipv4_is_multicast(fl4->daddr))
2326                                 fl4->saddr = inet_select_addr(dev_out, 0,
2327                                                               fl4->flowi4_scope);
2328                         else if (!fl4->daddr)
2329                                 fl4->saddr = inet_select_addr(dev_out, 0,
2330                                                               RT_SCOPE_HOST);
2331                 }
2332
2333                 rth = l3mdev_get_rtable(dev_out, fl4);
2334                 if (rth)
2335                         goto out;
2336         }
2337
2338         if (!fl4->daddr) {
2339                 fl4->daddr = fl4->saddr;
2340                 if (!fl4->daddr)
2341                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2342                 dev_out = net->loopback_dev;
2343                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2344                 res.type = RTN_LOCAL;
2345                 flags |= RTCF_LOCAL;
2346                 goto make_route;
2347         }
2348
2349         err = fib_lookup(net, fl4, &res, 0);
2350         if (err) {
2351                 res.fi = NULL;
2352                 res.table = NULL;
2353                 if (fl4->flowi4_oif &&
2354                     !netif_index_is_l3_master(net, fl4->flowi4_oif)) {
2355                         /* Apparently, routing tables are wrong. Assume,
2356                            that the destination is on link.
2357
2358                            WHY? DW.
2359                            Because we are allowed to send to iface
2360                            even if it has NO routes and NO assigned
2361                            addresses. When oif is specified, routing
2362                            tables are looked up with only one purpose:
2363                            to catch if destination is gatewayed, rather than
2364                            direct. Moreover, if MSG_DONTROUTE is set,
2365                            we send packet, ignoring both routing tables
2366                            and ifaddr state. --ANK
2367
2368
2369                            We could make it even if oif is unknown,
2370                            likely IPv6, but we do not.
2371                          */
2372
2373                         if (fl4->saddr == 0)
2374                                 fl4->saddr = inet_select_addr(dev_out, 0,
2375                                                               RT_SCOPE_LINK);
2376                         res.type = RTN_UNICAST;
2377                         goto make_route;
2378                 }
2379                 rth = ERR_PTR(err);
2380                 goto out;
2381         }
2382
2383         if (res.type == RTN_LOCAL) {
2384                 if (!fl4->saddr) {
2385                         if (res.fi->fib_prefsrc)
2386                                 fl4->saddr = res.fi->fib_prefsrc;
2387                         else
2388                                 fl4->saddr = fl4->daddr;
2389                 }
2390                 dev_out = net->loopback_dev;
2391                 fl4->flowi4_oif = dev_out->ifindex;
2392                 flags |= RTCF_LOCAL;
2393                 goto make_route;
2394         }
2395
2396         fib_select_path(net, &res, fl4, mp_hash);
2397
2398         dev_out = FIB_RES_DEV(res);
2399         fl4->flowi4_oif = dev_out->ifindex;
2400
2401
2402 make_route:
2403         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2404
2405 out:
2406         rcu_read_unlock();
2407         return rth;
2408 }
2409 EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
2410
2411 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2412 {
2413         return NULL;
2414 }
2415
2416 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2417 {
2418         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2419
2420         return mtu ? : dst->dev->mtu;
2421 }
2422
2423 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2424                                           struct sk_buff *skb, u32 mtu)
2425 {
2426 }
2427
2428 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2429                                        struct sk_buff *skb)
2430 {
2431 }
2432
2433 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2434                                           unsigned long old)
2435 {
2436         return NULL;
2437 }
2438
2439 static struct dst_ops ipv4_dst_blackhole_ops = {
2440         .family                 =       AF_INET,
2441         .check                  =       ipv4_blackhole_dst_check,
2442         .mtu                    =       ipv4_blackhole_mtu,
2443         .default_advmss         =       ipv4_default_advmss,
2444         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2445         .redirect               =       ipv4_rt_blackhole_redirect,
2446         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2447         .neigh_lookup           =       ipv4_neigh_lookup,
2448 };
2449
2450 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2451 {
2452         struct rtable *ort = (struct rtable *) dst_orig;
2453         struct rtable *rt;
2454
2455         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2456         if (rt) {
2457                 struct dst_entry *new = &rt->dst;
2458
2459                 new->__use = 1;
2460                 new->input = dst_discard;
2461                 new->output = dst_discard_out;
2462
2463                 new->dev = ort->dst.dev;
2464                 if (new->dev)
2465                         dev_hold(new->dev);
2466
2467                 rt->rt_is_input = ort->rt_is_input;
2468                 rt->rt_iif = ort->rt_iif;
2469                 rt->rt_pmtu = ort->rt_pmtu;
2470                 rt->rt_mtu_locked = ort->rt_mtu_locked;
2471
2472                 rt->rt_genid = rt_genid_ipv4(net);
2473                 rt->rt_flags = ort->rt_flags;
2474                 rt->rt_type = ort->rt_type;
2475                 rt->rt_gateway = ort->rt_gateway;
2476                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2477
2478                 INIT_LIST_HEAD(&rt->rt_uncached);
2479                 dst_free(new);
2480         }
2481
2482         dst_release(dst_orig);
2483
2484         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2485 }
2486
2487 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2488                                     const struct sock *sk)
2489 {
2490         struct rtable *rt = __ip_route_output_key(net, flp4);
2491
2492         if (IS_ERR(rt))
2493                 return rt;
2494
2495         if (flp4->flowi4_proto)
2496                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2497                                                         flowi4_to_flowi(flp4),
2498                                                         sk, 0);
2499
2500         return rt;
2501 }
2502 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2503
2504 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2505                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2506                         u32 seq, int event, int nowait, unsigned int flags)
2507 {
2508         struct rtable *rt = skb_rtable(skb);
2509         struct rtmsg *r;
2510         struct nlmsghdr *nlh;
2511         unsigned long expires = 0;
2512         u32 error;
2513         u32 metrics[RTAX_MAX];
2514
2515         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2516         if (!nlh)
2517                 return -EMSGSIZE;
2518
2519         r = nlmsg_data(nlh);
2520         r->rtm_family    = AF_INET;
2521         r->rtm_dst_len  = 32;
2522         r->rtm_src_len  = 0;
2523         r->rtm_tos      = fl4->flowi4_tos;
2524         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2525         if (nla_put_u32(skb, RTA_TABLE, table_id))
2526                 goto nla_put_failure;
2527         r->rtm_type     = rt->rt_type;
2528         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2529         r->rtm_protocol = RTPROT_UNSPEC;
2530         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2531         if (rt->rt_flags & RTCF_NOTIFY)
2532                 r->rtm_flags |= RTM_F_NOTIFY;
2533         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2534                 r->rtm_flags |= RTCF_DOREDIRECT;
2535
2536         if (nla_put_in_addr(skb, RTA_DST, dst))
2537                 goto nla_put_failure;
2538         if (src) {
2539                 r->rtm_src_len = 32;
2540                 if (nla_put_in_addr(skb, RTA_SRC, src))
2541                         goto nla_put_failure;
2542         }
2543         if (rt->dst.dev &&
2544             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2545                 goto nla_put_failure;
2546 #ifdef CONFIG_IP_ROUTE_CLASSID
2547         if (rt->dst.tclassid &&
2548             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2549                 goto nla_put_failure;
2550 #endif
2551         if (!rt_is_input_route(rt) &&
2552             fl4->saddr != src) {
2553                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2554                         goto nla_put_failure;
2555         }
2556         if (rt->rt_uses_gateway &&
2557             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2558                 goto nla_put_failure;
2559
2560         expires = rt->dst.expires;
2561         if (expires) {
2562                 unsigned long now = jiffies;
2563
2564                 if (time_before(now, expires))
2565                         expires -= now;
2566                 else
2567                         expires = 0;
2568         }
2569
2570         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2571         if (rt->rt_pmtu && expires)
2572                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2573         if (rt->rt_mtu_locked && expires)
2574                 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2575         if (rtnetlink_put_metrics(skb, metrics) < 0)
2576                 goto nla_put_failure;
2577
2578         if (fl4->flowi4_mark &&
2579             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2580                 goto nla_put_failure;
2581
2582         error = rt->dst.error;
2583
2584         if (rt_is_input_route(rt)) {
2585 #ifdef CONFIG_IP_MROUTE
2586                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2587                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2588                         int err = ipmr_get_route(net, skb,
2589                                                  fl4->saddr, fl4->daddr,
2590                                                  r, nowait, portid);
2591
2592                         if (err <= 0) {
2593                                 if (!nowait) {
2594                                         if (err == 0)
2595                                                 return 0;
2596                                         goto nla_put_failure;
2597                                 } else {
2598                                         if (err == -EMSGSIZE)
2599                                                 goto nla_put_failure;
2600                                         error = err;
2601                                 }
2602                         }
2603                 } else
2604 #endif
2605                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2606                                 goto nla_put_failure;
2607         }
2608
2609         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2610                 goto nla_put_failure;
2611
2612         nlmsg_end(skb, nlh);
2613         return 0;
2614
2615 nla_put_failure:
2616         nlmsg_cancel(skb, nlh);
2617         return -EMSGSIZE;
2618 }
2619
2620 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2621 {
2622         struct net *net = sock_net(in_skb->sk);
2623         struct rtmsg *rtm;
2624         struct nlattr *tb[RTA_MAX+1];
2625         struct rtable *rt = NULL;
2626         struct flowi4 fl4;
2627         __be32 dst = 0;
2628         __be32 src = 0;
2629         u32 iif;
2630         int err;
2631         int mark;
2632         struct sk_buff *skb;
2633         u32 table_id = RT_TABLE_MAIN;
2634
2635         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2636         if (err < 0)
2637                 goto errout;
2638
2639         rtm = nlmsg_data(nlh);
2640
2641         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2642         if (!skb) {
2643                 err = -ENOBUFS;
2644                 goto errout;
2645         }
2646
2647         /* Reserve room for dummy headers, this skb can pass
2648            through good chunk of routing engine.
2649          */
2650         skb_reset_mac_header(skb);
2651         skb_reset_network_header(skb);
2652
2653         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2654         ip_hdr(skb)->protocol = IPPROTO_UDP;
2655         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2656
2657         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2658         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2659         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2660         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2661
2662         memset(&fl4, 0, sizeof(fl4));
2663         fl4.daddr = dst;
2664         fl4.saddr = src;
2665         fl4.flowi4_tos = rtm->rtm_tos;
2666         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2667         fl4.flowi4_mark = mark;
2668
2669         if (netif_index_is_l3_master(net, fl4.flowi4_oif))
2670                 fl4.flowi4_flags = FLOWI_FLAG_L3MDEV_SRC | FLOWI_FLAG_SKIP_NH_OIF;
2671
2672         if (iif) {
2673                 struct net_device *dev;
2674
2675                 dev = __dev_get_by_index(net, iif);
2676                 if (!dev) {
2677                         err = -ENODEV;
2678                         goto errout_free;
2679                 }
2680
2681                 skb->protocol   = htons(ETH_P_IP);
2682                 skb->dev        = dev;
2683                 skb->mark       = mark;
2684                 local_bh_disable();
2685                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2686                 local_bh_enable();
2687
2688                 rt = skb_rtable(skb);
2689                 if (err == 0 && rt->dst.error)
2690                         err = -rt->dst.error;
2691         } else {
2692                 rt = ip_route_output_key(net, &fl4);
2693
2694                 err = 0;
2695                 if (IS_ERR(rt))
2696                         err = PTR_ERR(rt);
2697         }
2698
2699         if (err)
2700                 goto errout_free;
2701
2702         skb_dst_set(skb, &rt->dst);
2703         if (rtm->rtm_flags & RTM_F_NOTIFY)
2704                 rt->rt_flags |= RTCF_NOTIFY;
2705
2706         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2707                 table_id = rt->rt_table_id;
2708
2709         err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2710                            NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2711                            RTM_NEWROUTE, 0, 0);
2712         if (err < 0)
2713                 goto errout_free;
2714
2715         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2716 errout:
2717         return err;
2718
2719 errout_free:
2720         kfree_skb(skb);
2721         goto errout;
2722 }
2723
2724 void ip_rt_multicast_event(struct in_device *in_dev)
2725 {
2726         rt_cache_flush(dev_net(in_dev->dev));
2727 }
2728
2729 #ifdef CONFIG_SYSCTL
2730 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2731 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2732 static int ip_rt_gc_elasticity __read_mostly    = 8;
2733 static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;
2734
2735 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2736                                         void __user *buffer,
2737                                         size_t *lenp, loff_t *ppos)
2738 {
2739         struct net *net = (struct net *)__ctl->extra1;
2740
2741         if (write) {
2742                 rt_cache_flush(net);
2743                 fnhe_genid_bump(net);
2744                 return 0;
2745         }
2746
2747         return -EINVAL;
2748 }
2749
2750 static struct ctl_table ipv4_route_table[] = {
2751         {
2752                 .procname       = "gc_thresh",
2753                 .data           = &ipv4_dst_ops.gc_thresh,
2754                 .maxlen         = sizeof(int),
2755                 .mode           = 0644,
2756                 .proc_handler   = proc_dointvec,
2757         },
2758         {
2759                 .procname       = "max_size",
2760                 .data           = &ip_rt_max_size,
2761                 .maxlen         = sizeof(int),
2762                 .mode           = 0644,
2763                 .proc_handler   = proc_dointvec,
2764         },
2765         {
2766                 /*  Deprecated. Use gc_min_interval_ms */
2767
2768                 .procname       = "gc_min_interval",
2769                 .data           = &ip_rt_gc_min_interval,
2770                 .maxlen         = sizeof(int),
2771                 .mode           = 0644,
2772                 .proc_handler   = proc_dointvec_jiffies,
2773         },
2774         {
2775                 .procname       = "gc_min_interval_ms",
2776                 .data           = &ip_rt_gc_min_interval,
2777                 .maxlen         = sizeof(int),
2778                 .mode           = 0644,
2779                 .proc_handler   = proc_dointvec_ms_jiffies,
2780         },
2781         {
2782                 .procname       = "gc_timeout",
2783                 .data           = &ip_rt_gc_timeout,
2784                 .maxlen         = sizeof(int),
2785                 .mode           = 0644,
2786                 .proc_handler   = proc_dointvec_jiffies,
2787         },
2788         {
2789                 .procname       = "gc_interval",
2790                 .data           = &ip_rt_gc_interval,
2791                 .maxlen         = sizeof(int),
2792                 .mode           = 0644,
2793                 .proc_handler   = proc_dointvec_jiffies,
2794         },
2795         {
2796                 .procname       = "redirect_load",
2797                 .data           = &ip_rt_redirect_load,
2798                 .maxlen         = sizeof(int),
2799                 .mode           = 0644,
2800                 .proc_handler   = proc_dointvec,
2801         },
2802         {
2803                 .procname       = "redirect_number",
2804                 .data           = &ip_rt_redirect_number,
2805                 .maxlen         = sizeof(int),
2806                 .mode           = 0644,
2807                 .proc_handler   = proc_dointvec,
2808         },
2809         {
2810                 .procname       = "redirect_silence",
2811                 .data           = &ip_rt_redirect_silence,
2812                 .maxlen         = sizeof(int),
2813                 .mode           = 0644,
2814                 .proc_handler   = proc_dointvec,
2815         },
2816         {
2817                 .procname       = "error_cost",
2818                 .data           = &ip_rt_error_cost,
2819                 .maxlen         = sizeof(int),
2820                 .mode           = 0644,
2821                 .proc_handler   = proc_dointvec,
2822         },
2823         {
2824                 .procname       = "error_burst",
2825                 .data           = &ip_rt_error_burst,
2826                 .maxlen         = sizeof(int),
2827                 .mode           = 0644,
2828                 .proc_handler   = proc_dointvec,
2829         },
2830         {
2831                 .procname       = "gc_elasticity",
2832                 .data           = &ip_rt_gc_elasticity,
2833                 .maxlen         = sizeof(int),
2834                 .mode           = 0644,
2835                 .proc_handler   = proc_dointvec,
2836         },
2837         {
2838                 .procname       = "mtu_expires",
2839                 .data           = &ip_rt_mtu_expires,
2840                 .maxlen         = sizeof(int),
2841                 .mode           = 0644,
2842                 .proc_handler   = proc_dointvec_jiffies,
2843         },
2844         {
2845                 .procname       = "min_pmtu",
2846                 .data           = &ip_rt_min_pmtu,
2847                 .maxlen         = sizeof(int),
2848                 .mode           = 0644,
2849                 .proc_handler   = proc_dointvec_minmax,
2850                 .extra1         = &ip_min_valid_pmtu,
2851         },
2852         {
2853                 .procname       = "min_adv_mss",
2854                 .data           = &ip_rt_min_advmss,
2855                 .maxlen         = sizeof(int),
2856                 .mode           = 0644,
2857                 .proc_handler   = proc_dointvec,
2858         },
2859         { }
2860 };
2861
2862 static struct ctl_table ipv4_route_flush_table[] = {
2863         {
2864                 .procname       = "flush",
2865                 .maxlen         = sizeof(int),
2866                 .mode           = 0200,
2867                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2868         },
2869         { },
2870 };
2871
2872 static __net_init int sysctl_route_net_init(struct net *net)
2873 {
2874         struct ctl_table *tbl;
2875
2876         tbl = ipv4_route_flush_table;
2877         if (!net_eq(net, &init_net)) {
2878                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2879                 if (!tbl)
2880                         goto err_dup;
2881
2882                 /* Don't export sysctls to unprivileged users */
2883                 if (net->user_ns != &init_user_ns)
2884                         tbl[0].procname = NULL;
2885         }
2886         tbl[0].extra1 = net;
2887
2888         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2889         if (!net->ipv4.route_hdr)
2890                 goto err_reg;
2891         return 0;
2892
2893 err_reg:
2894         if (tbl != ipv4_route_flush_table)
2895                 kfree(tbl);
2896 err_dup:
2897         return -ENOMEM;
2898 }
2899
2900 static __net_exit void sysctl_route_net_exit(struct net *net)
2901 {
2902         struct ctl_table *tbl;
2903
2904         tbl = net->ipv4.route_hdr->ctl_table_arg;
2905         unregister_net_sysctl_table(net->ipv4.route_hdr);
2906         BUG_ON(tbl == ipv4_route_flush_table);
2907         kfree(tbl);
2908 }
2909
2910 static __net_initdata struct pernet_operations sysctl_route_ops = {
2911         .init = sysctl_route_net_init,
2912         .exit = sysctl_route_net_exit,
2913 };
2914 #endif
2915
2916 static __net_init int rt_genid_init(struct net *net)
2917 {
2918         atomic_set(&net->ipv4.rt_genid, 0);
2919         atomic_set(&net->fnhe_genid, 0);
2920         get_random_bytes(&net->ipv4.dev_addr_genid,
2921                          sizeof(net->ipv4.dev_addr_genid));
2922         return 0;
2923 }
2924
2925 static __net_initdata struct pernet_operations rt_genid_ops = {
2926         .init = rt_genid_init,
2927 };
2928
2929 static int __net_init ipv4_inetpeer_init(struct net *net)
2930 {
2931         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2932
2933         if (!bp)
2934                 return -ENOMEM;
2935         inet_peer_base_init(bp);
2936         net->ipv4.peers = bp;
2937         return 0;
2938 }
2939
2940 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2941 {
2942         struct inet_peer_base *bp = net->ipv4.peers;
2943
2944         net->ipv4.peers = NULL;
2945         inetpeer_invalidate_tree(bp);
2946         kfree(bp);
2947 }
2948
2949 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2950         .init   =       ipv4_inetpeer_init,
2951         .exit   =       ipv4_inetpeer_exit,
2952 };
2953
2954 #ifdef CONFIG_IP_ROUTE_CLASSID
2955 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2956 #endif /* CONFIG_IP_ROUTE_CLASSID */
2957
2958 int __init ip_rt_init(void)
2959 {
2960         void *idents_hash;
2961         int rc = 0;
2962         int cpu;
2963
2964         /* For modern hosts, this will use 2 MB of memory */
2965         idents_hash = alloc_large_system_hash("IP idents",
2966                                               sizeof(*ip_idents) + sizeof(*ip_tstamps),
2967                                               0,
2968                                               16, /* one bucket per 64 KB */
2969                                               0,
2970                                               NULL,
2971                                               &ip_idents_mask,
2972                                               2048,
2973                                               256*1024);
2974
2975         ip_idents = idents_hash;
2976
2977         prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
2978
2979         ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
2980         memset(ip_tstamps, 0, (ip_idents_mask + 1) * sizeof(*ip_tstamps));
2981
2982         for_each_possible_cpu(cpu) {
2983                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
2984
2985                 INIT_LIST_HEAD(&ul->head);
2986                 spin_lock_init(&ul->lock);
2987         }
2988 #ifdef CONFIG_IP_ROUTE_CLASSID
2989         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2990         if (!ip_rt_acct)
2991                 panic("IP: failed to allocate ip_rt_acct\n");
2992 #endif
2993
2994         ipv4_dst_ops.kmem_cachep =
2995                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2996                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2997
2998         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2999
3000         if (dst_entries_init(&ipv4_dst_ops) < 0)
3001                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3002
3003         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3004                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3005
3006         ipv4_dst_ops.gc_thresh = ~0;
3007         ip_rt_max_size = INT_MAX;
3008
3009         devinet_init();
3010         ip_fib_init();
3011
3012         if (ip_rt_proc_init())
3013                 pr_err("Unable to create route proc files\n");
3014 #ifdef CONFIG_XFRM
3015         xfrm_init();
3016         xfrm4_init();
3017 #endif
3018         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3019
3020 #ifdef CONFIG_SYSCTL
3021         register_pernet_subsys(&sysctl_route_ops);
3022 #endif
3023         register_pernet_subsys(&rt_genid_ops);
3024         register_pernet_subsys(&ipv4_inetpeer_ops);
3025         return rc;
3026 }
3027
3028 #ifdef CONFIG_SYSCTL
3029 /*
3030  * We really need to sanitize the damn ipv4 init order, then all
3031  * this nonsense will go away.
3032  */
3033 void __init ip_static_sysctl_init(void)
3034 {
3035         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3036 }
3037 #endif