net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #define pr_fmt(fmt) "IPv4: " fmt
  66
  67 #include <linux/module.h>
  68 #include <asm/uaccess.h>
  69 #include <linux/bitops.h>
  70 #include <linux/types.h>
  71 #include <linux/kernel.h>
  72 #include <linux/mm.h>
  73 #include <linux/bootmem.h>
  74 #include <linux/string.h>
  75 #include <linux/socket.h>
  76 #include <linux/sockios.h>
  77 #include <linux/errno.h>
  78 #include <linux/in.h>
  79 #include <linux/inet.h>
  80 #include <linux/netdevice.h>
  81 #include <linux/proc_fs.h>
  82 #include <linux/init.h>
  83 #include <linux/skbuff.h>
  84 #include <linux/inetdevice.h>
  85 #include <linux/igmp.h>
  86 #include <linux/pkt_sched.h>
  87 #include <linux/mroute.h>
  88 #include <linux/netfilter_ipv4.h>
  89 #include <linux/random.h>
  90 #include <linux/rcupdate.h>
  91 #include <linux/times.h>
  92 #include <linux/slab.h>
  93 #include <linux/jhash.h>
  94 #include <net/dst.h>
  95 #include <net/dst_metadata.h>
  96 #include <net/net_namespace.h>
  97 #include <net/protocol.h>
  98 #include <net/ip.h>
  99 #include <net/route.h>
 100 #include <net/inetpeer.h>
 101 #include <net/sock.h>
 102 #include <net/ip_fib.h>
 103 #include <net/arp.h>
 104 #include <net/tcp.h>
 105 #include <net/icmp.h>
 106 #include <net/xfrm.h>
 107 #include <net/lwtunnel.h>
 108 #include <net/netevent.h>
 109 #include <net/rtnetlink.h>
 110 #ifdef CONFIG_SYSCTL
 111 #include <linux/sysctl.h>
 112 #include <linux/kmemleak.h>
 113 #endif
 114 #include <net/secure_seq.h>
 115 #include <net/ip_tunnels.h>
 116 #include <net/l3mdev.h>
 117
 118 #define RT_FL_TOS(oldflp4) \
 119         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 120
 121 #define RT_GC_TIMEOUT (300*HZ)
 122
 123 static int ip_rt_max_size;
 124 static int ip_rt_redirect_number __read_mostly  = 9;
 125 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 126 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 127 static int ip_rt_error_cost __read_mostly       = HZ;
 128 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 129 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 130 static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 131 static int ip_rt_min_advmss __read_mostly       = 256;
 132
 133 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 134
 135 /*
 136  *      Interface to generic destination cache.
 137  */
 138
 139 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 140 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 141 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 142 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 143 static void              ipv4_link_failure(struct sk_buff *skb);
 144 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 145                                            struct sk_buff *skb, u32 mtu);
 146 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 147                                         struct sk_buff *skb);
 148 static void             ipv4_dst_destroy(struct dst_entry *dst);
 149
 150 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 151 {
 152         WARN_ON(1);
 153         return NULL;
 154 }
 155
 156 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 157                                            struct sk_buff *skb,
 158                                            const void *daddr);
 159
 160 static struct dst_ops ipv4_dst_ops = {
 161         .family =               AF_INET,
 162         .check =                ipv4_dst_check,
 163         .default_advmss =       ipv4_default_advmss,
 164         .mtu =                  ipv4_mtu,
 165         .cow_metrics =          ipv4_cow_metrics,
 166         .destroy =              ipv4_dst_destroy,
 167         .negative_advice =      ipv4_negative_advice,
 168         .link_failure =         ipv4_link_failure,
 169         .update_pmtu =          ip_rt_update_pmtu,
 170         .redirect =             ip_do_redirect,
 171         .local_out =            __ip_local_out,
 172         .neigh_lookup =         ipv4_neigh_lookup,
 173 };
 174
 175 #define ECN_OR_COST(class)      TC_PRIO_##class
 176
 177 const __u8 ip_tos2prio[16] = {
 178         TC_PRIO_BESTEFFORT,
 179         ECN_OR_COST(BESTEFFORT),
 180         TC_PRIO_BESTEFFORT,
 181         ECN_OR_COST(BESTEFFORT),
 182         TC_PRIO_BULK,
 183         ECN_OR_COST(BULK),
 184         TC_PRIO_BULK,
 185         ECN_OR_COST(BULK),
 186         TC_PRIO_INTERACTIVE,
 187         ECN_OR_COST(INTERACTIVE),
 188         TC_PRIO_INTERACTIVE,
 189         ECN_OR_COST(INTERACTIVE),
 190         TC_PRIO_INTERACTIVE_BULK,
 191         ECN_OR_COST(INTERACTIVE_BULK),
 192         TC_PRIO_INTERACTIVE_BULK,
 193         ECN_OR_COST(INTERACTIVE_BULK)
 194 };
 195 EXPORT_SYMBOL(ip_tos2prio);
 196
 197 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 198 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 199
 200 #ifdef CONFIG_PROC_FS
 201 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 202 {
 203         if (*pos)
 204                 return NULL;
 205         return SEQ_START_TOKEN;
 206 }
 207
 208 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 209 {
 210         ++*pos;
 211         return NULL;
 212 }
 213
 214 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 215 {
 216 }
 217
 218 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 219 {
 220         if (v == SEQ_START_TOKEN)
 221                 seq_printf(seq, "%-127s\n",
 222                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 223                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 224                            "HHUptod\tSpecDst");
 225         return 0;
 226 }
 227
 228 static const struct seq_operations rt_cache_seq_ops = {
 229         .start  = rt_cache_seq_start,
 230         .next   = rt_cache_seq_next,
 231         .stop   = rt_cache_seq_stop,
 232         .show   = rt_cache_seq_show,
 233 };
 234
 235 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 236 {
 237         return seq_open(file, &rt_cache_seq_ops);
 238 }
 239
 240 static const struct file_operations rt_cache_seq_fops = {
 241         .owner   = THIS_MODULE,
 242         .open    = rt_cache_seq_open,
 243         .read    = seq_read,
 244         .llseek  = seq_lseek,
 245         .release = seq_release,
 246 };
 247
 248
 249 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 250 {
 251         int cpu;
 252
 253         if (*pos == 0)
 254                 return SEQ_START_TOKEN;
 255
 256         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 257                 if (!cpu_possible(cpu))
 258                         continue;
 259                 *pos = cpu+1;
 260                 return &per_cpu(rt_cache_stat, cpu);
 261         }
 262         return NULL;
 263 }
 264
 265 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 266 {
 267         int cpu;
 268
 269         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 270                 if (!cpu_possible(cpu))
 271                         continue;
 272                 *pos = cpu+1;
 273                 return &per_cpu(rt_cache_stat, cpu);
 274         }
 275         (*pos)++;
 276         return NULL;
 277
 278 }
 279
 280 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 281 {
 282
 283 }
 284
 285 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 286 {
 287         struct rt_cache_stat *st = v;
 288
 289         if (v == SEQ_START_TOKEN) {
 290                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 291                 return 0;
 292         }
 293
 294         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 295                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 296                    dst_entries_get_slow(&ipv4_dst_ops),
 297                    0, /* st->in_hit */
 298                    st->in_slow_tot,
 299                    st->in_slow_mc,
 300                    st->in_no_route,
 301                    st->in_brd,
 302                    st->in_martian_dst,
 303                    st->in_martian_src,
 304
 305                    0, /* st->out_hit */
 306                    st->out_slow_tot,
 307                    st->out_slow_mc,
 308
 309                    0, /* st->gc_total */
 310                    0, /* st->gc_ignored */
 311                    0, /* st->gc_goal_miss */
 312                    0, /* st->gc_dst_overflow */
 313                    0, /* st->in_hlist_search */
 314                    0  /* st->out_hlist_search */
 315                 );
 316         return 0;
 317 }
 318
 319 static const struct seq_operations rt_cpu_seq_ops = {
 320         .start  = rt_cpu_seq_start,
 321         .next   = rt_cpu_seq_next,
 322         .stop   = rt_cpu_seq_stop,
 323         .show   = rt_cpu_seq_show,
 324 };
 325
 326
 327 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 328 {
 329         return seq_open(file, &rt_cpu_seq_ops);
 330 }
 331
 332 static const struct file_operations rt_cpu_seq_fops = {
 333         .owner   = THIS_MODULE,
 334         .open    = rt_cpu_seq_open,
 335         .read    = seq_read,
 336         .llseek  = seq_lseek,
 337         .release = seq_release,
 338 };
 339
 340 #ifdef CONFIG_IP_ROUTE_CLASSID
 341 static int rt_acct_proc_show(struct seq_file *m, void *v)
 342 {
 343         struct ip_rt_acct *dst, *src;
 344         unsigned int i, j;
 345
 346         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 347         if (!dst)
 348                 return -ENOMEM;
 349
 350         for_each_possible_cpu(i) {
 351                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 352                 for (j = 0; j < 256; j++) {
 353                         dst[j].o_bytes   += src[j].o_bytes;
 354                         dst[j].o_packets += src[j].o_packets;
 355                         dst[j].i_bytes   += src[j].i_bytes;
 356                         dst[j].i_packets += src[j].i_packets;
 357                 }
 358         }
 359
 360         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 361         kfree(dst);
 362         return 0;
 363 }
 364
 365 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 366 {
 367         return single_open(file, rt_acct_proc_show, NULL);
 368 }
 369
 370 static const struct file_operations rt_acct_proc_fops = {
 371         .owner          = THIS_MODULE,
 372         .open           = rt_acct_proc_open,
 373         .read           = seq_read,
 374         .llseek         = seq_lseek,
 375         .release        = single_release,
 376 };
 377 #endif
 378
 379 static int __net_init ip_rt_do_proc_init(struct net *net)
 380 {
 381         struct proc_dir_entry *pde;
 382
 383         pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
 384                           &rt_cache_seq_fops);
 385         if (!pde)
 386                 goto err1;
 387
 388         pde = proc_create("rt_cache", S_IRUGO,
 389                           net->proc_net_stat, &rt_cpu_seq_fops);
 390         if (!pde)
 391                 goto err2;
 392
 393 #ifdef CONFIG_IP_ROUTE_CLASSID
 394         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 395         if (!pde)
 396                 goto err3;
 397 #endif
 398         return 0;
 399
 400 #ifdef CONFIG_IP_ROUTE_CLASSID
 401 err3:
 402         remove_proc_entry("rt_cache", net->proc_net_stat);
 403 #endif
 404 err2:
 405         remove_proc_entry("rt_cache", net->proc_net);
 406 err1:
 407         return -ENOMEM;
 408 }
 409
 410 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 411 {
 412         remove_proc_entry("rt_cache", net->proc_net_stat);
 413         remove_proc_entry("rt_cache", net->proc_net);
 414 #ifdef CONFIG_IP_ROUTE_CLASSID
 415         remove_proc_entry("rt_acct", net->proc_net);
 416 #endif
 417 }
 418
 419 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 420         .init = ip_rt_do_proc_init,
 421         .exit = ip_rt_do_proc_exit,
 422 };
 423
 424 static int __init ip_rt_proc_init(void)
 425 {
 426         return register_pernet_subsys(&ip_rt_proc_ops);
 427 }
 428
 429 #else
 430 static inline int ip_rt_proc_init(void)
 431 {
 432         return 0;
 433 }
 434 #endif /* CONFIG_PROC_FS */
 435
 436 static inline bool rt_is_expired(const struct rtable *rth)
 437 {
 438         return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 439 }
 440
 441 void rt_cache_flush(struct net *net)
 442 {
 443         rt_genid_bump_ipv4(net);
 444 }
 445
 446 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 447                                            struct sk_buff *skb,
 448                                            const void *daddr)
 449 {
 450         struct net_device *dev = dst->dev;
 451         const __be32 *pkey = daddr;
 452         const struct rtable *rt;
 453         struct neighbour *n;
 454
 455         rt = (const struct rtable *) dst;
 456         if (rt->rt_gateway)
 457                 pkey = (const __be32 *) &rt->rt_gateway;
 458         else if (skb)
 459                 pkey = &ip_hdr(skb)->daddr;
 460
 461         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
 462         if (n)
 463                 return n;
 464         return neigh_create(&arp_tbl, pkey, dev);
 465 }
 466
 467 /* Hash tables of size 2048..262144 depending on RAM size.
 468  * Each bucket uses 8 bytes.
 469  */
 470 static u32 ip_idents_mask __read_mostly;
 471 static atomic_t *ip_idents __read_mostly;
 472 static u32 *ip_tstamps __read_mostly;
 473
 474 /* In order to protect privacy, we add a perturbation to identifiers
 475  * if one generator is seldom used. This makes hard for an attacker
 476  * to infer how many packets were sent between two points in time.
 477  */
 478 u32 ip_idents_reserve(u32 hash, int segs)
 479 {
 480         u32 bucket, old, now = (u32)jiffies;
 481         atomic_t *p_id;
 482         u32 *p_tstamp;
 483         u32 delta = 0;
 484
 485         bucket = hash & ip_idents_mask;
 486         p_tstamp = ip_tstamps + bucket;
 487         p_id = ip_idents + bucket;
 488         old = ACCESS_ONCE(*p_tstamp);
 489
 490         if (old != now && cmpxchg(p_tstamp, old, now) == old)
 491                 delta = prandom_u32_max(now - old);
 492
 493         /* If UBSAN reports an error there, please make sure your compiler
 494          * supports -fno-strict-overflow before reporting it that was a bug
 495          * in UBSAN, and it has been fixed in GCC-8.
 496          */
 497         return atomic_add_return(segs + delta, p_id) - segs;
 498 }
 499 EXPORT_SYMBOL(ip_idents_reserve);
 500
 501 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 502 {
 503         u32 hash, id;
 504
 505         /* Note the following code is not safe, but this is okay. */
 506         if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
 507                 get_random_bytes(&net->ipv4.ip_id_key,
 508                                  sizeof(net->ipv4.ip_id_key));
 509
 510         hash = siphash_3u32((__force u32)iph->daddr,
 511                             (__force u32)iph->saddr,
 512                             iph->protocol,
 513                             &net->ipv4.ip_id_key);
 514         id = ip_idents_reserve(hash, segs);
 515         iph->id = htons(id);
 516 }
 517 EXPORT_SYMBOL(__ip_select_ident);
 518
 519 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 520                              const struct iphdr *iph,
 521                              int oif, u8 tos,
 522                              u8 prot, u32 mark, int flow_flags)
 523 {
 524         if (sk) {
 525                 const struct inet_sock *inet = inet_sk(sk);
 526
 527                 oif = sk->sk_bound_dev_if;
 528                 mark = sk->sk_mark;
 529                 tos = RT_CONN_FLAGS(sk);
 530                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 531         }
 532         flowi4_init_output(fl4, oif, mark, tos,
 533                            RT_SCOPE_UNIVERSE, prot,
 534                            flow_flags,
 535                            iph->daddr, iph->saddr, 0, 0);
 536 }
 537
 538 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 539                                const struct sock *sk)
 540 {
 541         const struct iphdr *iph = ip_hdr(skb);
 542         int oif = skb->dev->ifindex;
 543         u8 tos = RT_TOS(iph->tos);
 544         u8 prot = iph->protocol;
 545         u32 mark = skb->mark;
 546
 547         __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
 548 }
 549
 550 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 551 {
 552         const struct inet_sock *inet = inet_sk(sk);
 553         const struct ip_options_rcu *inet_opt;
 554         __be32 daddr = inet->inet_daddr;
 555
 556         rcu_read_lock();
 557         inet_opt = rcu_dereference(inet->inet_opt);
 558         if (inet_opt && inet_opt->opt.srr)
 559                 daddr = inet_opt->opt.faddr;
 560         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 561                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 562                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 563                            inet_sk_flowi_flags(sk),
 564                            daddr, inet->inet_saddr, 0, 0);
 565         rcu_read_unlock();
 566 }
 567
 568 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 569                                  const struct sk_buff *skb)
 570 {
 571         if (skb)
 572                 build_skb_flow_key(fl4, skb, sk);
 573         else
 574                 build_sk_flow_key(fl4, sk);
 575 }
 576
 577 static inline void rt_free(struct rtable *rt)
 578 {
 579         call_rcu(&rt->dst.rcu_head, dst_rcu_free);
 580 }
 581
 582 static DEFINE_SPINLOCK(fnhe_lock);
 583
 584 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 585 {
 586         struct rtable *rt;
 587
 588         rt = rcu_dereference(fnhe->fnhe_rth_input);
 589         if (rt) {
 590                 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 591                 rt_free(rt);
 592         }
 593         rt = rcu_dereference(fnhe->fnhe_rth_output);
 594         if (rt) {
 595                 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 596                 rt_free(rt);
 597         }
 598 }
 599
 600 static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
 601 {
 602         struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
 603         struct fib_nh_exception *fnhe, *oldest = NULL;
 604
 605         for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
 606                 fnhe = rcu_dereference_protected(*fnhe_p,
 607                                                  lockdep_is_held(&fnhe_lock));
 608                 if (!fnhe)
 609                         break;
 610                 if (!oldest ||
 611                     time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
 612                         oldest = fnhe;
 613                         oldest_p = fnhe_p;
 614                 }
 615         }
 616         fnhe_flush_routes(oldest);
 617         *oldest_p = oldest->fnhe_next;
 618         kfree_rcu(oldest, rcu);
 619 }
 620
 621 static inline u32 fnhe_hashfun(__be32 daddr)
 622 {
 623         static u32 fnhe_hashrnd __read_mostly;
 624         u32 hval;
 625
 626         net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
 627         hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
 628         return hash_32(hval, FNHE_HASH_SHIFT);
 629 }
 630
 631 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 632 {
 633         rt->rt_pmtu = fnhe->fnhe_pmtu;
 634         rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
 635         rt->dst.expires = fnhe->fnhe_expires;
 636
 637         if (fnhe->fnhe_gw) {
 638                 rt->rt_flags |= RTCF_REDIRECTED;
 639                 rt->rt_gateway = fnhe->fnhe_gw;
 640                 rt->rt_uses_gateway = 1;
 641         }
 642 }
 643
 644 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 645                                   u32 pmtu, bool lock, unsigned long expires)
 646 {
 647         struct fnhe_hash_bucket *hash;
 648         struct fib_nh_exception *fnhe;
 649         struct rtable *rt;
 650         u32 genid, hval;
 651         unsigned int i;
 652         int depth;
 653
 654         genid = fnhe_genid(dev_net(nh->nh_dev));
 655         hval = fnhe_hashfun(daddr);
 656
 657         spin_lock_bh(&fnhe_lock);
 658
 659         hash = rcu_dereference(nh->nh_exceptions);
 660         if (!hash) {
 661                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
 662                 if (!hash)
 663                         goto out_unlock;
 664                 rcu_assign_pointer(nh->nh_exceptions, hash);
 665         }
 666
 667         hash += hval;
 668
 669         depth = 0;
 670         for (fnhe = rcu_dereference(hash->chain); fnhe;
 671              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 672                 if (fnhe->fnhe_daddr == daddr)
 673                         break;
 674                 depth++;
 675         }
 676
 677         if (fnhe) {
 678                 if (fnhe->fnhe_genid != genid)
 679                         fnhe->fnhe_genid = genid;
 680                 if (gw)
 681                         fnhe->fnhe_gw = gw;
 682                 if (pmtu) {
 683                         fnhe->fnhe_pmtu = pmtu;
 684                         fnhe->fnhe_mtu_locked = lock;
 685                 }
 686                 fnhe->fnhe_expires = max(1UL, expires);
 687                 /* Update all cached dsts too */
 688                 rt = rcu_dereference(fnhe->fnhe_rth_input);
 689                 if (rt)
 690                         fill_route_from_fnhe(rt, fnhe);
 691                 rt = rcu_dereference(fnhe->fnhe_rth_output);
 692                 if (rt)
 693                         fill_route_from_fnhe(rt, fnhe);
 694         } else {
 695                 /* Randomize max depth to avoid some side channels attacks. */
 696                 int max_depth = FNHE_RECLAIM_DEPTH +
 697                                 prandom_u32_max(FNHE_RECLAIM_DEPTH);
 698
 699                 while (depth > max_depth) {
 700                         fnhe_remove_oldest(hash);
 701                         depth--;
 702                 }
 703
 704                 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 705                 if (!fnhe)
 706                         goto out_unlock;
 707
 708                 fnhe->fnhe_next = hash->chain;
 709
 710                 fnhe->fnhe_genid = genid;
 711                 fnhe->fnhe_daddr = daddr;
 712                 fnhe->fnhe_gw = gw;
 713                 fnhe->fnhe_pmtu = pmtu;
 714                 fnhe->fnhe_mtu_locked = lock;
 715                 fnhe->fnhe_expires = expires;
 716
 717                 rcu_assign_pointer(hash->chain, fnhe);
 718
 719                 /* Exception created; mark the cached routes for the nexthop
 720                  * stale, so anyone caching it rechecks if this exception
 721                  * applies to them.
 722                  */
 723                 rt = rcu_dereference(nh->nh_rth_input);
 724                 if (rt)
 725                         rt->dst.obsolete = DST_OBSOLETE_KILL;
 726
 727                 for_each_possible_cpu(i) {
 728                         struct rtable __rcu **prt;
 729                         prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
 730                         rt = rcu_dereference(*prt);
 731                         if (rt)
 732                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 733                 }
 734         }
 735
 736         fnhe->fnhe_stamp = jiffies;
 737
 738 out_unlock:
 739         spin_unlock_bh(&fnhe_lock);
 740 }
 741
 742 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 743                              bool kill_route)
 744 {
 745         __be32 new_gw = icmp_hdr(skb)->un.gateway;
 746         __be32 old_gw = ip_hdr(skb)->saddr;
 747         struct net_device *dev = skb->dev;
 748         struct in_device *in_dev;
 749         struct fib_result res;
 750         struct neighbour *n;
 751         struct net *net;
 752
 753         switch (icmp_hdr(skb)->code & 7) {
 754         case ICMP_REDIR_NET:
 755         case ICMP_REDIR_NETTOS:
 756         case ICMP_REDIR_HOST:
 757         case ICMP_REDIR_HOSTTOS:
 758                 break;
 759
 760         default:
 761                 return;
 762         }
 763
 764         if (rt->rt_gateway != old_gw)
 765                 return;
 766
 767         in_dev = __in_dev_get_rcu(dev);
 768         if (!in_dev)
 769                 return;
 770
 771         net = dev_net(dev);
 772         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 773             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 774             ipv4_is_zeronet(new_gw))
 775                 goto reject_redirect;
 776
 777         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 778                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 779                         goto reject_redirect;
 780                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 781                         goto reject_redirect;
 782         } else {
 783                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 784                         goto reject_redirect;
 785         }
 786
 787         n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
 788         if (!n)
 789                 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
 790         if (!IS_ERR(n)) {
 791                 if (!(n->nud_state & NUD_VALID)) {
 792                         neigh_event_send(n, NULL);
 793                 } else {
 794                         if (fib_lookup(net, fl4, &res, 0) == 0) {
 795                                 struct fib_nh *nh = &FIB_RES_NH(res);
 796
 797                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
 798                                                 0, false,
 799                                                 jiffies + ip_rt_gc_timeout);
 800                         }
 801                         if (kill_route)
 802                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 803                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 804                 }
 805                 neigh_release(n);
 806         }
 807         return;
 808
 809 reject_redirect:
 810 #ifdef CONFIG_IP_ROUTE_VERBOSE
 811         if (IN_DEV_LOG_MARTIANS(in_dev)) {
 812                 const struct iphdr *iph = (const struct iphdr *) skb->data;
 813                 __be32 daddr = iph->daddr;
 814                 __be32 saddr = iph->saddr;
 815
 816                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 817                                      "  Advised path = %pI4 -> %pI4\n",
 818                                      &old_gw, dev->name, &new_gw,
 819                                      &saddr, &daddr);
 820         }
 821 #endif
 822         ;
 823 }
 824
 825 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 826 {
 827         struct rtable *rt;
 828         struct flowi4 fl4;
 829         const struct iphdr *iph = (const struct iphdr *) skb->data;
 830         int oif = skb->dev->ifindex;
 831         u8 tos = RT_TOS(iph->tos);
 832         u8 prot = iph->protocol;
 833         u32 mark = skb->mark;
 834
 835         rt = (struct rtable *) dst;
 836
 837         __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
 838         __ip_do_redirect(rt, skb, &fl4, true);
 839 }
 840
 841 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 842 {
 843         struct rtable *rt = (struct rtable *)dst;
 844         struct dst_entry *ret = dst;
 845
 846         if (rt) {
 847                 if (dst->obsolete > 0) {
 848                         ip_rt_put(rt);
 849                         ret = NULL;
 850                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 851                            rt->dst.expires) {
 852                         ip_rt_put(rt);
 853                         ret = NULL;
 854                 }
 855         }
 856         return ret;
 857 }
 858
 859 /*
 860  * Algorithm:
 861  *      1. The first ip_rt_redirect_number redirects are sent
 862  *         with exponential backoff, then we stop sending them at all,
 863  *         assuming that the host ignores our redirects.
 864  *      2. If we did not see packets requiring redirects
 865  *         during ip_rt_redirect_silence, we assume that the host
 866  *         forgot redirected route and start to send redirects again.
 867  *
 868  * This algorithm is much cheaper and more intelligent than dumb load limiting
 869  * in icmp.c.
 870  *
 871  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 872  * and "frag. need" (breaks PMTU discovery) in icmp.c.
 873  */
 874
 875 void ip_rt_send_redirect(struct sk_buff *skb)
 876 {
 877         struct rtable *rt = skb_rtable(skb);
 878         struct in_device *in_dev;
 879         struct inet_peer *peer;
 880         struct net *net;
 881         int log_martians;
 882         int vif;
 883
 884         rcu_read_lock();
 885         in_dev = __in_dev_get_rcu(rt->dst.dev);
 886         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 887                 rcu_read_unlock();
 888                 return;
 889         }
 890         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 891         vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
 892         rcu_read_unlock();
 893
 894         net = dev_net(rt->dst.dev);
 895         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
 896         if (!peer) {
 897                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 898                           rt_nexthop(rt, ip_hdr(skb)->daddr));
 899                 return;
 900         }
 901
 902         /* No redirected packets during ip_rt_redirect_silence;
 903          * reset the algorithm.
 904          */
 905         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
 906                 peer->rate_tokens = 0;
 907                 peer->n_redirects = 0;
 908         }
 909
 910         /* Too many ignored redirects; do not send anything
 911          * set dst.rate_last to the last seen redirected packet.
 912          */
 913         if (peer->n_redirects >= ip_rt_redirect_number) {
 914                 peer->rate_last = jiffies;
 915                 goto out_put_peer;
 916         }
 917
 918         /* Check for load limit; set rate_last to the latest sent
 919          * redirect.
 920          */
 921         if (peer->n_redirects == 0 ||
 922             time_after(jiffies,
 923                        (peer->rate_last +
 924                         (ip_rt_redirect_load << peer->n_redirects)))) {
 925                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 926
 927                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 928                 peer->rate_last = jiffies;
 929                 ++peer->n_redirects;
 930 #ifdef CONFIG_IP_ROUTE_VERBOSE
 931                 if (log_martians &&
 932                     peer->n_redirects == ip_rt_redirect_number)
 933                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 934                                              &ip_hdr(skb)->saddr, inet_iif(skb),
 935                                              &ip_hdr(skb)->daddr, &gw);
 936 #endif
 937         }
 938 out_put_peer:
 939         inet_putpeer(peer);
 940 }
 941
 942 static int ip_error(struct sk_buff *skb)
 943 {
 944         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 945         struct rtable *rt = skb_rtable(skb);
 946         struct inet_peer *peer;
 947         unsigned long now;
 948         struct net *net;
 949         bool send;
 950         int code;
 951
 952         /* IP on this device is disabled. */
 953         if (!in_dev)
 954                 goto out;
 955
 956         net = dev_net(rt->dst.dev);
 957         if (!IN_DEV_FORWARD(in_dev)) {
 958                 switch (rt->dst.error) {
 959                 case EHOSTUNREACH:
 960                         __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
 961                         break;
 962
 963                 case ENETUNREACH:
 964                         __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 965                         break;
 966                 }
 967                 goto out;
 968         }
 969
 970         switch (rt->dst.error) {
 971         case EINVAL:
 972         default:
 973                 goto out;
 974         case EHOSTUNREACH:
 975                 code = ICMP_HOST_UNREACH;
 976                 break;
 977         case ENETUNREACH:
 978                 code = ICMP_NET_UNREACH;
 979                 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 980                 break;
 981         case EACCES:
 982                 code = ICMP_PKT_FILTERED;
 983                 break;
 984         }
 985
 986         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
 987                                l3mdev_master_ifindex(skb->dev), 1);
 988
 989         send = true;
 990         if (peer) {
 991                 now = jiffies;
 992                 peer->rate_tokens += now - peer->rate_last;
 993                 if (peer->rate_tokens > ip_rt_error_burst)
 994                         peer->rate_tokens = ip_rt_error_burst;
 995                 peer->rate_last = now;
 996                 if (peer->rate_tokens >= ip_rt_error_cost)
 997                         peer->rate_tokens -= ip_rt_error_cost;
 998                 else
 999                         send = false;
1000                 inet_putpeer(peer);
1001         }
1002         if (send)
1003                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1004
1005 out:    kfree_skb(skb);
1006         return 0;
1007 }
1008
1009 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1010 {
1011         struct dst_entry *dst = &rt->dst;
1012         u32 old_mtu = ipv4_mtu(dst);
1013         struct fib_result res;
1014         bool lock = false;
1015
1016         if (ip_mtu_locked(dst))
1017                 return;
1018
1019         if (old_mtu < mtu)
1020                 return;
1021
1022         if (mtu < ip_rt_min_pmtu) {
1023                 lock = true;
1024                 mtu = min(old_mtu, ip_rt_min_pmtu);
1025         }
1026
1027         if (rt->rt_pmtu == mtu && !lock &&
1028             time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1029                 return;
1030
1031         rcu_read_lock();
1032         if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1033                 struct fib_nh *nh = &FIB_RES_NH(res);
1034
1035                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
1036                                       jiffies + ip_rt_mtu_expires);
1037         }
1038         rcu_read_unlock();
1039 }
1040
1041 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1042                               struct sk_buff *skb, u32 mtu)
1043 {
1044         struct rtable *rt = (struct rtable *) dst;
1045         struct flowi4 fl4;
1046
1047         ip_rt_build_flow_key(&fl4, sk, skb);
1048         __ip_rt_update_pmtu(rt, &fl4, mtu);
1049 }
1050
1051 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1052                       int oif, u32 mark, u8 protocol, int flow_flags)
1053 {
1054         const struct iphdr *iph = (const struct iphdr *) skb->data;
1055         struct flowi4 fl4;
1056         struct rtable *rt;
1057
1058         if (!mark)
1059                 mark = IP4_REPLY_MARK(net, skb->mark);
1060
1061         __build_flow_key(&fl4, NULL, iph, oif,
1062                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1063         rt = __ip_route_output_key(net, &fl4);
1064         if (!IS_ERR(rt)) {
1065                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1066                 ip_rt_put(rt);
1067         }
1068 }
1069 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1070
1071 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1072 {
1073         const struct iphdr *iph = (const struct iphdr *) skb->data;
1074         struct flowi4 fl4;
1075         struct rtable *rt;
1076
1077         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1078
1079         if (!fl4.flowi4_mark)
1080                 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1081
1082         rt = __ip_route_output_key(sock_net(sk), &fl4);
1083         if (!IS_ERR(rt)) {
1084                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1085                 ip_rt_put(rt);
1086         }
1087 }
1088
1089 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1090 {
1091         const struct iphdr *iph = (const struct iphdr *) skb->data;
1092         struct flowi4 fl4;
1093         struct rtable *rt;
1094         struct dst_entry *odst = NULL;
1095         bool new = false;
1096
1097         bh_lock_sock(sk);
1098
1099         if (!ip_sk_accept_pmtu(sk))
1100                 goto out;
1101
1102         odst = sk_dst_get(sk);
1103
1104         if (sock_owned_by_user(sk) || !odst) {
1105                 __ipv4_sk_update_pmtu(skb, sk, mtu);
1106                 goto out;
1107         }
1108
1109         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1110
1111         rt = (struct rtable *)odst;
1112         if (odst->obsolete && !odst->ops->check(odst, 0)) {
1113                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1114                 if (IS_ERR(rt))
1115                         goto out;
1116
1117                 new = true;
1118         }
1119
1120         __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1121
1122         if (!dst_check(&rt->dst, 0)) {
1123                 if (new)
1124                         dst_release(&rt->dst);
1125
1126                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1127                 if (IS_ERR(rt))
1128                         goto out;
1129
1130                 new = true;
1131         }
1132
1133         if (new)
1134                 sk_dst_set(sk, &rt->dst);
1135
1136 out:
1137         bh_unlock_sock(sk);
1138         dst_release(odst);
1139 }
1140 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1141
1142 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1143                    int oif, u32 mark, u8 protocol, int flow_flags)
1144 {
1145         const struct iphdr *iph = (const struct iphdr *) skb->data;
1146         struct flowi4 fl4;
1147         struct rtable *rt;
1148
1149         __build_flow_key(&fl4, NULL, iph, oif,
1150                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1151         rt = __ip_route_output_key(net, &fl4);
1152         if (!IS_ERR(rt)) {
1153                 __ip_do_redirect(rt, skb, &fl4, false);
1154                 ip_rt_put(rt);
1155         }
1156 }
1157 EXPORT_SYMBOL_GPL(ipv4_redirect);
1158
1159 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1160 {
1161         const struct iphdr *iph = (const struct iphdr *) skb->data;
1162         struct flowi4 fl4;
1163         struct rtable *rt;
1164
1165         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1166         rt = __ip_route_output_key(sock_net(sk), &fl4);
1167         if (!IS_ERR(rt)) {
1168                 __ip_do_redirect(rt, skb, &fl4, false);
1169                 ip_rt_put(rt);
1170         }
1171 }
1172 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1173
1174 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1175 {
1176         struct rtable *rt = (struct rtable *) dst;
1177
1178         /* All IPV4 dsts are created with ->obsolete set to the value
1179          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1180          * into this function always.
1181          *
1182          * When a PMTU/redirect information update invalidates a route,
1183          * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1184          * DST_OBSOLETE_DEAD by dst_free().
1185          */
1186         if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1187                 return NULL;
1188         return dst;
1189 }
1190
1191 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1192 {
1193         struct ip_options opt;
1194         int res;
1195
1196         /* Recompile ip options since IPCB may not be valid anymore.
1197          * Also check we have a reasonable ipv4 header.
1198          */
1199         if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1200             ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1201                 return;
1202
1203         memset(&opt, 0, sizeof(opt));
1204         if (ip_hdr(skb)->ihl > 5) {
1205                 if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1206                         return;
1207                 opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1208
1209                 rcu_read_lock();
1210                 res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1211                 rcu_read_unlock();
1212
1213                 if (res)
1214                         return;
1215         }
1216         __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1217 }
1218
1219 static void ipv4_link_failure(struct sk_buff *skb)
1220 {
1221         struct rtable *rt;
1222
1223         ipv4_send_dest_unreach(skb);
1224
1225         rt = skb_rtable(skb);
1226         if (rt)
1227                 dst_set_expires(&rt->dst, 0);
1228 }
1229
1230 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1231 {
1232         pr_debug("%s: %pI4 -> %pI4, %s\n",
1233                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1234                  skb->dev ? skb->dev->name : "?");
1235         kfree_skb(skb);
1236         WARN_ON(1);
1237         return 0;
1238 }
1239
1240 /*
1241    We do not cache source address of outgoing interface,
1242    because it is used only by IP RR, TS and SRR options,
1243    so that it out of fast path.
1244
1245    BTW remember: "addr" is allowed to be not aligned
1246    in IP options!
1247  */
1248
1249 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1250 {
1251         __be32 src;
1252
1253         if (rt_is_output_route(rt))
1254                 src = ip_hdr(skb)->saddr;
1255         else {
1256                 struct fib_result res;
1257                 struct flowi4 fl4;
1258                 struct iphdr *iph;
1259
1260                 iph = ip_hdr(skb);
1261
1262                 memset(&fl4, 0, sizeof(fl4));
1263                 fl4.daddr = iph->daddr;
1264                 fl4.saddr = iph->saddr;
1265                 fl4.flowi4_tos = RT_TOS(iph->tos);
1266                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1267                 fl4.flowi4_iif = skb->dev->ifindex;
1268                 fl4.flowi4_mark = skb->mark;
1269
1270                 rcu_read_lock();
1271                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1272                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1273                 else
1274                         src = inet_select_addr(rt->dst.dev,
1275                                                rt_nexthop(rt, iph->daddr),
1276                                                RT_SCOPE_UNIVERSE);
1277                 rcu_read_unlock();
1278         }
1279         memcpy(addr, &src, 4);
1280 }
1281
1282 #ifdef CONFIG_IP_ROUTE_CLASSID
1283 static void set_class_tag(struct rtable *rt, u32 tag)
1284 {
1285         if (!(rt->dst.tclassid & 0xFFFF))
1286                 rt->dst.tclassid |= tag & 0xFFFF;
1287         if (!(rt->dst.tclassid & 0xFFFF0000))
1288                 rt->dst.tclassid |= tag & 0xFFFF0000;
1289 }
1290 #endif
1291
1292 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1293 {
1294         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1295
1296         if (advmss == 0) {
1297                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1298                                ip_rt_min_advmss);
1299                 if (advmss > 65535 - 40)
1300                         advmss = 65535 - 40;
1301         }
1302         return advmss;
1303 }
1304
1305 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1306 {
1307         const struct rtable *rt = (const struct rtable *) dst;
1308         unsigned int mtu = rt->rt_pmtu;
1309
1310         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1311                 mtu = dst_metric_raw(dst, RTAX_MTU);
1312
1313         if (mtu)
1314                 return mtu;
1315
1316         mtu = READ_ONCE(dst->dev->mtu);
1317
1318         if (unlikely(ip_mtu_locked(dst))) {
1319                 if (rt->rt_uses_gateway && mtu > 576)
1320                         mtu = 576;
1321         }
1322
1323         mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1324
1325         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1326 }
1327
1328 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1329 {
1330         struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1331         struct fib_nh_exception *fnhe;
1332         u32 hval;
1333
1334         if (!hash)
1335                 return NULL;
1336
1337         hval = fnhe_hashfun(daddr);
1338
1339         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1340              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1341                 if (fnhe->fnhe_daddr == daddr)
1342                         return fnhe;
1343         }
1344         return NULL;
1345 }
1346
1347 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1348                               __be32 daddr)
1349 {
1350         bool ret = false;
1351
1352         spin_lock_bh(&fnhe_lock);
1353
1354         if (daddr == fnhe->fnhe_daddr) {
1355                 struct rtable __rcu **porig;
1356                 struct rtable *orig;
1357                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1358
1359                 if (rt_is_input_route(rt))
1360                         porig = &fnhe->fnhe_rth_input;
1361                 else
1362                         porig = &fnhe->fnhe_rth_output;
1363                 orig = rcu_dereference(*porig);
1364
1365                 if (fnhe->fnhe_genid != genid) {
1366                         fnhe->fnhe_genid = genid;
1367                         fnhe->fnhe_gw = 0;
1368                         fnhe->fnhe_pmtu = 0;
1369                         fnhe->fnhe_expires = 0;
1370                         fnhe_flush_routes(fnhe);
1371                         orig = NULL;
1372                 }
1373                 fill_route_from_fnhe(rt, fnhe);
1374                 if (!rt->rt_gateway)
1375                         rt->rt_gateway = daddr;
1376
1377                 if (!(rt->dst.flags & DST_NOCACHE)) {
1378                         rcu_assign_pointer(*porig, rt);
1379                         if (orig)
1380                                 rt_free(orig);
1381                         ret = true;
1382                 }
1383
1384                 fnhe->fnhe_stamp = jiffies;
1385         }
1386         spin_unlock_bh(&fnhe_lock);
1387
1388         return ret;
1389 }
1390
1391 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1392 {
1393         struct rtable *orig, *prev, **p;
1394         bool ret = true;
1395
1396         if (rt_is_input_route(rt)) {
1397                 p = (struct rtable **)&nh->nh_rth_input;
1398         } else {
1399                 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1400         }
1401         orig = *p;
1402
1403         prev = cmpxchg(p, orig, rt);
1404         if (prev == orig) {
1405                 if (orig)
1406                         rt_free(orig);
1407         } else
1408                 ret = false;
1409
1410         return ret;
1411 }
1412
1413 struct uncached_list {
1414         spinlock_t              lock;
1415         struct list_head        head;
1416 };
1417
1418 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1419
1420 static void rt_add_uncached_list(struct rtable *rt)
1421 {
1422         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1423
1424         rt->rt_uncached_list = ul;
1425
1426         spin_lock_bh(&ul->lock);
1427         list_add_tail(&rt->rt_uncached, &ul->head);
1428         spin_unlock_bh(&ul->lock);
1429 }
1430
1431 static void ipv4_dst_destroy(struct dst_entry *dst)
1432 {
1433         struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1434         struct rtable *rt = (struct rtable *) dst;
1435
1436         if (p != &dst_default_metrics && atomic_dec_and_test(&p->refcnt))
1437                 kfree(p);
1438
1439         if (!list_empty(&rt->rt_uncached)) {
1440                 struct uncached_list *ul = rt->rt_uncached_list;
1441
1442                 spin_lock_bh(&ul->lock);
1443                 list_del(&rt->rt_uncached);
1444                 spin_unlock_bh(&ul->lock);
1445         }
1446 }
1447
1448 void rt_flush_dev(struct net_device *dev)
1449 {
1450         struct net *net = dev_net(dev);
1451         struct rtable *rt;
1452         int cpu;
1453
1454         for_each_possible_cpu(cpu) {
1455                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1456
1457                 spin_lock_bh(&ul->lock);
1458                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1459                         if (rt->dst.dev != dev)
1460                                 continue;
1461                         rt->dst.dev = net->loopback_dev;
1462                         dev_hold(rt->dst.dev);
1463                         dev_put(dev);
1464                 }
1465                 spin_unlock_bh(&ul->lock);
1466         }
1467 }
1468
1469 static bool rt_cache_valid(const struct rtable *rt)
1470 {
1471         return  rt &&
1472                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1473                 !rt_is_expired(rt);
1474 }
1475
1476 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1477                            const struct fib_result *res,
1478                            struct fib_nh_exception *fnhe,
1479                            struct fib_info *fi, u16 type, u32 itag)
1480 {
1481         bool cached = false;
1482
1483         if (fi) {
1484                 struct fib_nh *nh = &FIB_RES_NH(*res);
1485
1486                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1487                         rt->rt_gateway = nh->nh_gw;
1488                         rt->rt_uses_gateway = 1;
1489                 }
1490                 dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1491                 if (fi->fib_metrics != &dst_default_metrics) {
1492                         rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1493                         atomic_inc(&fi->fib_metrics->refcnt);
1494                 }
1495 #ifdef CONFIG_IP_ROUTE_CLASSID
1496                 rt->dst.tclassid = nh->nh_tclassid;
1497 #endif
1498                 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1499                 if (unlikely(fnhe))
1500                         cached = rt_bind_exception(rt, fnhe, daddr);
1501                 else if (!(rt->dst.flags & DST_NOCACHE))
1502                         cached = rt_cache_route(nh, rt);
1503                 if (unlikely(!cached)) {
1504                         /* Routes we intend to cache in nexthop exception or
1505                          * FIB nexthop have the DST_NOCACHE bit clear.
1506                          * However, if we are unsuccessful at storing this
1507                          * route into the cache we really need to set it.
1508                          */
1509                         rt->dst.flags |= DST_NOCACHE;
1510                         if (!rt->rt_gateway)
1511                                 rt->rt_gateway = daddr;
1512                         rt_add_uncached_list(rt);
1513                 }
1514         } else
1515                 rt_add_uncached_list(rt);
1516
1517 #ifdef CONFIG_IP_ROUTE_CLASSID
1518 #ifdef CONFIG_IP_MULTIPLE_TABLES
1519         set_class_tag(rt, res->tclassid);
1520 #endif
1521         set_class_tag(rt, itag);
1522 #endif
1523 }
1524
1525 struct rtable *rt_dst_alloc(struct net_device *dev,
1526                             unsigned int flags, u16 type,
1527                             bool nopolicy, bool noxfrm, bool will_cache)
1528 {
1529         struct rtable *rt;
1530
1531         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1532                        (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1533                        (nopolicy ? DST_NOPOLICY : 0) |
1534                        (noxfrm ? DST_NOXFRM : 0));
1535
1536         if (rt) {
1537                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1538                 rt->rt_flags = flags;
1539                 rt->rt_type = type;
1540                 rt->rt_is_input = 0;
1541                 rt->rt_iif = 0;
1542                 rt->rt_pmtu = 0;
1543                 rt->rt_mtu_locked = 0;
1544                 rt->rt_gateway = 0;
1545                 rt->rt_uses_gateway = 0;
1546                 rt->rt_table_id = 0;
1547                 INIT_LIST_HEAD(&rt->rt_uncached);
1548
1549                 rt->dst.output = ip_output;
1550                 if (flags & RTCF_LOCAL)
1551                         rt->dst.input = ip_local_deliver;
1552         }
1553
1554         return rt;
1555 }
1556 EXPORT_SYMBOL(rt_dst_alloc);
1557
1558 /* called in rcu_read_lock() section */
1559 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1560                                 u8 tos, struct net_device *dev, int our)
1561 {
1562         struct rtable *rth;
1563         struct in_device *in_dev = __in_dev_get_rcu(dev);
1564         unsigned int flags = RTCF_MULTICAST;
1565         u32 itag = 0;
1566         int err;
1567
1568         /* Primary sanity checks. */
1569
1570         if (!in_dev)
1571                 return -EINVAL;
1572
1573         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1574             skb->protocol != htons(ETH_P_IP))
1575                 goto e_inval;
1576
1577         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1578                 goto e_inval;
1579
1580         if (ipv4_is_zeronet(saddr)) {
1581                 if (!ipv4_is_local_multicast(daddr))
1582                         goto e_inval;
1583         } else {
1584                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1585                                           in_dev, &itag);
1586                 if (err < 0)
1587                         goto e_err;
1588         }
1589         if (our)
1590                 flags |= RTCF_LOCAL;
1591
1592         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1593                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1594         if (!rth)
1595                 goto e_nobufs;
1596
1597 #ifdef CONFIG_IP_ROUTE_CLASSID
1598         rth->dst.tclassid = itag;
1599 #endif
1600         rth->dst.output = ip_rt_bug;
1601         rth->rt_is_input= 1;
1602
1603 #ifdef CONFIG_IP_MROUTE
1604         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1605                 rth->dst.input = ip_mr_input;
1606 #endif
1607         RT_CACHE_STAT_INC(in_slow_mc);
1608
1609         skb_dst_drop(skb);
1610         skb_dst_set(skb, &rth->dst);
1611         return 0;
1612
1613 e_nobufs:
1614         return -ENOBUFS;
1615 e_inval:
1616         return -EINVAL;
1617 e_err:
1618         return err;
1619 }
1620
1621
1622 static void ip_handle_martian_source(struct net_device *dev,
1623                                      struct in_device *in_dev,
1624                                      struct sk_buff *skb,
1625                                      __be32 daddr,
1626                                      __be32 saddr)
1627 {
1628         RT_CACHE_STAT_INC(in_martian_src);
1629 #ifdef CONFIG_IP_ROUTE_VERBOSE
1630         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1631                 /*
1632                  *      RFC1812 recommendation, if source is martian,
1633                  *      the only hint is MAC header.
1634                  */
1635                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1636                         &daddr, &saddr, dev->name);
1637                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1638                         print_hex_dump(KERN_WARNING, "ll header: ",
1639                                        DUMP_PREFIX_OFFSET, 16, 1,
1640                                        skb_mac_header(skb),
1641                                        dev->hard_header_len, true);
1642                 }
1643         }
1644 #endif
1645 }
1646
1647 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1648 {
1649         struct fnhe_hash_bucket *hash;
1650         struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1651         u32 hval = fnhe_hashfun(daddr);
1652
1653         spin_lock_bh(&fnhe_lock);
1654
1655         hash = rcu_dereference_protected(nh->nh_exceptions,
1656                                          lockdep_is_held(&fnhe_lock));
1657         hash += hval;
1658
1659         fnhe_p = &hash->chain;
1660         fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1661         while (fnhe) {
1662                 if (fnhe->fnhe_daddr == daddr) {
1663                         rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1664                                 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1665                         /* set fnhe_daddr to 0 to ensure it won't bind with
1666                          * new dsts in rt_bind_exception().
1667                          */
1668                         fnhe->fnhe_daddr = 0;
1669                         fnhe_flush_routes(fnhe);
1670                         kfree_rcu(fnhe, rcu);
1671                         break;
1672                 }
1673                 fnhe_p = &fnhe->fnhe_next;
1674                 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1675                                                  lockdep_is_held(&fnhe_lock));
1676         }
1677
1678         spin_unlock_bh(&fnhe_lock);
1679 }
1680
1681 /* called in rcu_read_lock() section */
1682 static int __mkroute_input(struct sk_buff *skb,
1683                            const struct fib_result *res,
1684                            struct in_device *in_dev,
1685                            __be32 daddr, __be32 saddr, u32 tos)
1686 {
1687         struct fib_nh_exception *fnhe;
1688         struct rtable *rth;
1689         int err;
1690         struct in_device *out_dev;
1691         bool do_cache;
1692         u32 itag = 0;
1693
1694         /* get a working reference to the output device */
1695         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1696         if (!out_dev) {
1697                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1698                 return -EINVAL;
1699         }
1700
1701         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1702                                   in_dev->dev, in_dev, &itag);
1703         if (err < 0) {
1704                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1705                                          saddr);
1706
1707                 goto cleanup;
1708         }
1709
1710         do_cache = res->fi && !itag;
1711         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1712             skb->protocol == htons(ETH_P_IP) &&
1713             (IN_DEV_SHARED_MEDIA(out_dev) ||
1714              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1715                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1716
1717         if (skb->protocol != htons(ETH_P_IP)) {
1718                 /* Not IP (i.e. ARP). Do not create route, if it is
1719                  * invalid for proxy arp. DNAT routes are always valid.
1720                  *
1721                  * Proxy arp feature have been extended to allow, ARP
1722                  * replies back to the same interface, to support
1723                  * Private VLAN switch technologies. See arp.c.
1724                  */
1725                 if (out_dev == in_dev &&
1726                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1727                         err = -EINVAL;
1728                         goto cleanup;
1729                 }
1730         }
1731
1732         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1733         if (do_cache) {
1734                 if (fnhe) {
1735                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1736                         if (rth && rth->dst.expires &&
1737                             time_after(jiffies, rth->dst.expires)) {
1738                                 ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1739                                 fnhe = NULL;
1740                         } else {
1741                                 goto rt_cache;
1742                         }
1743                 }
1744
1745                 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1746
1747 rt_cache:
1748                 if (rt_cache_valid(rth)) {
1749                         skb_dst_set_noref(skb, &rth->dst);
1750                         goto out;
1751                 }
1752         }
1753
1754         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1755                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1756                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1757         if (!rth) {
1758                 err = -ENOBUFS;
1759                 goto cleanup;
1760         }
1761
1762         rth->rt_is_input = 1;
1763         if (res->table)
1764                 rth->rt_table_id = res->table->tb_id;
1765         RT_CACHE_STAT_INC(in_slow_tot);
1766
1767         rth->dst.input = ip_forward;
1768
1769         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1770         if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1771                 rth->dst.lwtstate->orig_output = rth->dst.output;
1772                 rth->dst.output = lwtunnel_output;
1773         }
1774         if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1775                 rth->dst.lwtstate->orig_input = rth->dst.input;
1776                 rth->dst.input = lwtunnel_input;
1777         }
1778         skb_dst_set(skb, &rth->dst);
1779 out:
1780         err = 0;
1781  cleanup:
1782         return err;
1783 }
1784
1785 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1786
1787 /* To make ICMP packets follow the right flow, the multipath hash is
1788  * calculated from the inner IP addresses in reverse order.
1789  */
1790 static int ip_multipath_icmp_hash(struct sk_buff *skb)
1791 {
1792         const struct iphdr *outer_iph = ip_hdr(skb);
1793         struct icmphdr _icmph;
1794         const struct icmphdr *icmph;
1795         struct iphdr _inner_iph;
1796         const struct iphdr *inner_iph;
1797
1798         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1799                 goto standard_hash;
1800
1801         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1802                                    &_icmph);
1803         if (!icmph)
1804                 goto standard_hash;
1805
1806         if (icmph->type != ICMP_DEST_UNREACH &&
1807             icmph->type != ICMP_REDIRECT &&
1808             icmph->type != ICMP_TIME_EXCEEDED &&
1809             icmph->type != ICMP_PARAMETERPROB) {
1810                 goto standard_hash;
1811         }
1812
1813         inner_iph = skb_header_pointer(skb,
1814                                        outer_iph->ihl * 4 + sizeof(_icmph),
1815                                        sizeof(_inner_iph), &_inner_iph);
1816         if (!inner_iph)
1817                 goto standard_hash;
1818
1819         return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
1820
1821 standard_hash:
1822         return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
1823 }
1824
1825 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1826
1827 static int ip_mkroute_input(struct sk_buff *skb,
1828                             struct fib_result *res,
1829                             const struct flowi4 *fl4,
1830                             struct in_device *in_dev,
1831                             __be32 daddr, __be32 saddr, u32 tos)
1832 {
1833 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1834         if (res->fi && res->fi->fib_nhs > 1) {
1835                 int h;
1836
1837                 if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
1838                         h = ip_multipath_icmp_hash(skb);
1839                 else
1840                         h = fib_multipath_hash(saddr, daddr);
1841                 fib_select_multipath(res, h);
1842         }
1843 #endif
1844
1845         /* create a routing cache entry */
1846         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1847 }
1848
1849 /*
1850  *      NOTE. We drop all the packets that has local source
1851  *      addresses, because every properly looped back packet
1852  *      must have correct destination already attached by output routine.
1853  *
1854  *      Such approach solves two big problems:
1855  *      1. Not simplex devices are handled properly.
1856  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1857  *      called with rcu_read_lock()
1858  */
1859
1860 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1861                                u8 tos, struct net_device *dev)
1862 {
1863         struct fib_result res;
1864         struct in_device *in_dev = __in_dev_get_rcu(dev);
1865         struct ip_tunnel_info *tun_info;
1866         struct flowi4   fl4;
1867         unsigned int    flags = 0;
1868         u32             itag = 0;
1869         struct rtable   *rth;
1870         int             err = -EINVAL;
1871         struct net    *net = dev_net(dev);
1872         bool do_cache;
1873
1874         /* IP on this device is disabled. */
1875
1876         if (!in_dev)
1877                 goto out;
1878
1879         /* Check for the most weird martians, which can be not detected
1880            by fib_lookup.
1881          */
1882
1883         tun_info = skb_tunnel_info(skb);
1884         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1885                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1886         else
1887                 fl4.flowi4_tun_key.tun_id = 0;
1888         skb_dst_drop(skb);
1889
1890         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1891                 goto martian_source;
1892
1893         res.fi = NULL;
1894         res.table = NULL;
1895         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1896                 goto brd_input;
1897
1898         /* Accept zero addresses only to limited broadcast;
1899          * I even do not know to fix it or not. Waiting for complains :-)
1900          */
1901         if (ipv4_is_zeronet(saddr))
1902                 goto martian_source;
1903
1904         if (ipv4_is_zeronet(daddr))
1905                 goto martian_destination;
1906
1907         /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1908          * and call it once if daddr or/and saddr are loopback addresses
1909          */
1910         if (ipv4_is_loopback(daddr)) {
1911                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1912                         goto martian_destination;
1913         } else if (ipv4_is_loopback(saddr)) {
1914                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1915                         goto martian_source;
1916         }
1917
1918         /*
1919          *      Now we are ready to route packet.
1920          */
1921         fl4.flowi4_oif = 0;
1922         fl4.flowi4_iif = dev->ifindex;
1923         fl4.flowi4_mark = skb->mark;
1924         fl4.flowi4_tos = tos;
1925         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1926         fl4.flowi4_flags = 0;
1927         fl4.daddr = daddr;
1928         fl4.saddr = saddr;
1929         err = fib_lookup(net, &fl4, &res, 0);
1930         if (err != 0) {
1931                 if (!IN_DEV_FORWARD(in_dev))
1932                         err = -EHOSTUNREACH;
1933                 goto no_route;
1934         }
1935
1936         if (res.type == RTN_BROADCAST)
1937                 goto brd_input;
1938
1939         if (res.type == RTN_LOCAL) {
1940                 err = fib_validate_source(skb, saddr, daddr, tos,
1941                                           0, dev, in_dev, &itag);
1942                 if (err < 0)
1943                         goto martian_source;
1944                 goto local_input;
1945         }
1946
1947         if (!IN_DEV_FORWARD(in_dev)) {
1948                 err = -EHOSTUNREACH;
1949                 goto no_route;
1950         }
1951         if (res.type != RTN_UNICAST)
1952                 goto martian_destination;
1953
1954         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1955 out:    return err;
1956
1957 brd_input:
1958         if (skb->protocol != htons(ETH_P_IP))
1959                 goto e_inval;
1960
1961         if (!ipv4_is_zeronet(saddr)) {
1962                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1963                                           in_dev, &itag);
1964                 if (err < 0)
1965                         goto martian_source;
1966         }
1967         flags |= RTCF_BROADCAST;
1968         res.type = RTN_BROADCAST;
1969         RT_CACHE_STAT_INC(in_brd);
1970
1971 local_input:
1972         do_cache = false;
1973         if (res.fi) {
1974                 if (!itag) {
1975                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1976                         if (rt_cache_valid(rth)) {
1977                                 skb_dst_set_noref(skb, &rth->dst);
1978                                 err = 0;
1979                                 goto out;
1980                         }
1981                         do_cache = true;
1982                 }
1983         }
1984
1985         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
1986                            flags | RTCF_LOCAL, res.type,
1987                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1988         if (!rth)
1989                 goto e_nobufs;
1990
1991         rth->dst.output= ip_rt_bug;
1992 #ifdef CONFIG_IP_ROUTE_CLASSID
1993         rth->dst.tclassid = itag;
1994 #endif
1995         rth->rt_is_input = 1;
1996         if (res.table)
1997                 rth->rt_table_id = res.table->tb_id;
1998
1999         RT_CACHE_STAT_INC(in_slow_tot);
2000         if (res.type == RTN_UNREACHABLE) {
2001                 rth->dst.input= ip_error;
2002                 rth->dst.error= -err;
2003                 rth->rt_flags   &= ~RTCF_LOCAL;
2004         }
2005         if (do_cache) {
2006                 if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
2007                         rth->dst.flags |= DST_NOCACHE;
2008                         rt_add_uncached_list(rth);
2009                 }
2010         }
2011         skb_dst_set(skb, &rth->dst);
2012         err = 0;
2013         goto out;
2014
2015 no_route:
2016         RT_CACHE_STAT_INC(in_no_route);
2017         res.type = RTN_UNREACHABLE;
2018         res.fi = NULL;
2019         res.table = NULL;
2020         goto local_input;
2021
2022         /*
2023          *      Do not cache martian addresses: they should be logged (RFC1812)
2024          */
2025 martian_destination:
2026         RT_CACHE_STAT_INC(in_martian_dst);
2027 #ifdef CONFIG_IP_ROUTE_VERBOSE
2028         if (IN_DEV_LOG_MARTIANS(in_dev))
2029                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2030                                      &daddr, &saddr, dev->name);
2031 #endif
2032
2033 e_inval:
2034         err = -EINVAL;
2035         goto out;
2036
2037 e_nobufs:
2038         err = -ENOBUFS;
2039         goto out;
2040
2041 martian_source:
2042         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2043         goto out;
2044 }
2045
2046 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2047                          u8 tos, struct net_device *dev)
2048 {
2049         int res;
2050
2051         tos &= IPTOS_RT_MASK;
2052         rcu_read_lock();
2053
2054         /* Multicast recognition logic is moved from route cache to here.
2055            The problem was that too many Ethernet cards have broken/missing
2056            hardware multicast filters :-( As result the host on multicasting
2057            network acquires a lot of useless route cache entries, sort of
2058            SDR messages from all the world. Now we try to get rid of them.
2059            Really, provided software IP multicast filter is organized
2060            reasonably (at least, hashed), it does not result in a slowdown
2061            comparing with route cache reject entries.
2062            Note, that multicast routers are not affected, because
2063            route cache entry is created eventually.
2064          */
2065         if (ipv4_is_multicast(daddr)) {
2066                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2067
2068                 if (in_dev) {
2069                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2070                                                   ip_hdr(skb)->protocol);
2071                         if (our
2072 #ifdef CONFIG_IP_MROUTE
2073                                 ||
2074                             (!ipv4_is_local_multicast(daddr) &&
2075                              IN_DEV_MFORWARD(in_dev))
2076 #endif
2077                            ) {
2078                                 int res = ip_route_input_mc(skb, daddr, saddr,
2079                                                             tos, dev, our);
2080                                 rcu_read_unlock();
2081                                 return res;
2082                         }
2083                 }
2084                 rcu_read_unlock();
2085                 return -EINVAL;
2086         }
2087         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2088         rcu_read_unlock();
2089         return res;
2090 }
2091 EXPORT_SYMBOL(ip_route_input_noref);
2092
2093 /* called with rcu_read_lock() */
2094 static struct rtable *__mkroute_output(const struct fib_result *res,
2095                                        const struct flowi4 *fl4, int orig_oif,
2096                                        struct net_device *dev_out,
2097                                        unsigned int flags)
2098 {
2099         struct fib_info *fi = res->fi;
2100         struct fib_nh_exception *fnhe;
2101         struct in_device *in_dev;
2102         u16 type = res->type;
2103         struct rtable *rth;
2104         bool do_cache;
2105
2106         in_dev = __in_dev_get_rcu(dev_out);
2107         if (!in_dev)
2108                 return ERR_PTR(-EINVAL);
2109
2110         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2111                 if (ipv4_is_loopback(fl4->saddr) &&
2112                     !(dev_out->flags & IFF_LOOPBACK) &&
2113                     !netif_is_l3_master(dev_out))
2114                         return ERR_PTR(-EINVAL);
2115
2116         if (ipv4_is_lbcast(fl4->daddr))
2117                 type = RTN_BROADCAST;
2118         else if (ipv4_is_multicast(fl4->daddr))
2119                 type = RTN_MULTICAST;
2120         else if (ipv4_is_zeronet(fl4->daddr))
2121                 return ERR_PTR(-EINVAL);
2122
2123         if (dev_out->flags & IFF_LOOPBACK)
2124                 flags |= RTCF_LOCAL;
2125
2126         do_cache = true;
2127         if (type == RTN_BROADCAST) {
2128                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2129                 fi = NULL;
2130         } else if (type == RTN_MULTICAST) {
2131                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2132                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2133                                      fl4->flowi4_proto))
2134                         flags &= ~RTCF_LOCAL;
2135                 else
2136                         do_cache = false;
2137                 /* If multicast route do not exist use
2138                  * default one, but do not gateway in this case.
2139                  * Yes, it is hack.
2140                  */
2141                 if (fi && res->prefixlen < 4)
2142                         fi = NULL;
2143         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2144                    (orig_oif != dev_out->ifindex)) {
2145                 /* For local routes that require a particular output interface
2146                  * we do not want to cache the result.  Caching the result
2147                  * causes incorrect behaviour when there are multiple source
2148                  * addresses on the interface, the end result being that if the
2149                  * intended recipient is waiting on that interface for the
2150                  * packet he won't receive it because it will be delivered on
2151                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2152                  * be set to the loopback interface as well.
2153                  */
2154                 fi = NULL;
2155         }
2156
2157         fnhe = NULL;
2158         do_cache &= fi != NULL;
2159         if (do_cache) {
2160                 struct rtable __rcu **prth;
2161                 struct fib_nh *nh = &FIB_RES_NH(*res);
2162
2163                 fnhe = find_exception(nh, fl4->daddr);
2164                 if (fnhe) {
2165                         prth = &fnhe->fnhe_rth_output;
2166                         rth = rcu_dereference(*prth);
2167                         if (rth && rth->dst.expires &&
2168                             time_after(jiffies, rth->dst.expires)) {
2169                                 ip_del_fnhe(nh, fl4->daddr);
2170                                 fnhe = NULL;
2171                         } else {
2172                                 goto rt_cache;
2173                         }
2174                 }
2175
2176                 if (unlikely(fl4->flowi4_flags &
2177                              FLOWI_FLAG_KNOWN_NH &&
2178                              !(nh->nh_gw &&
2179                                nh->nh_scope == RT_SCOPE_LINK))) {
2180                         do_cache = false;
2181                         goto add;
2182                 }
2183                 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2184                 rth = rcu_dereference(*prth);
2185
2186 rt_cache:
2187                 if (rt_cache_valid(rth)) {
2188                         dst_hold(&rth->dst);
2189                         return rth;
2190                 }
2191         }
2192
2193 add:
2194         rth = rt_dst_alloc(dev_out, flags, type,
2195                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2196                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2197                            do_cache);
2198         if (!rth)
2199                 return ERR_PTR(-ENOBUFS);
2200
2201         rth->rt_iif     = orig_oif ? : 0;
2202         if (res->table)
2203                 rth->rt_table_id = res->table->tb_id;
2204
2205         RT_CACHE_STAT_INC(out_slow_tot);
2206
2207         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2208                 if (flags & RTCF_LOCAL &&
2209                     !(dev_out->flags & IFF_LOOPBACK)) {
2210                         rth->dst.output = ip_mc_output;
2211                         RT_CACHE_STAT_INC(out_slow_mc);
2212                 }
2213 #ifdef CONFIG_IP_MROUTE
2214                 if (type == RTN_MULTICAST) {
2215                         if (IN_DEV_MFORWARD(in_dev) &&
2216                             !ipv4_is_local_multicast(fl4->daddr)) {
2217                                 rth->dst.input = ip_mr_input;
2218                                 rth->dst.output = ip_mc_output;
2219                         }
2220                 }
2221 #endif
2222         }
2223
2224         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
2225         if (lwtunnel_output_redirect(rth->dst.lwtstate))
2226                 rth->dst.output = lwtunnel_output;
2227
2228         return rth;
2229 }
2230
2231 /*
2232  * Major route resolver routine.
2233  */
2234
2235 struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2236                                           int mp_hash)
2237 {
2238         struct net_device *dev_out = NULL;
2239         __u8 tos = RT_FL_TOS(fl4);
2240         unsigned int flags = 0;
2241         struct fib_result res;
2242         struct rtable *rth;
2243         int orig_oif;
2244         int err;
2245
2246         res.tclassid    = 0;
2247         res.fi          = NULL;
2248         res.table       = NULL;
2249
2250         orig_oif = fl4->flowi4_oif;
2251
2252         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2253         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2254         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2255                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2256
2257         rcu_read_lock();
2258         if (fl4->saddr) {
2259                 if (ipv4_is_multicast(fl4->saddr) ||
2260                     ipv4_is_lbcast(fl4->saddr) ||
2261                     ipv4_is_zeronet(fl4->saddr)) {
2262                         rth = ERR_PTR(-EINVAL);
2263                         goto out;
2264                 }
2265
2266                 rth = ERR_PTR(-ENETUNREACH);
2267
2268                 /* I removed check for oif == dev_out->oif here.
2269                    It was wrong for two reasons:
2270                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2271                       is assigned to multiple interfaces.
2272                    2. Moreover, we are allowed to send packets with saddr
2273                       of another iface. --ANK
2274                  */
2275
2276                 if (fl4->flowi4_oif == 0 &&
2277                     (ipv4_is_multicast(fl4->daddr) ||
2278                      ipv4_is_lbcast(fl4->daddr))) {
2279                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2280                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2281                         if (!dev_out)
2282                                 goto out;
2283
2284                         /* Special hack: user can direct multicasts
2285                            and limited broadcast via necessary interface
2286                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2287                            This hack is not just for fun, it allows
2288                            vic,vat and friends to work.
2289                            They bind socket to loopback, set ttl to zero
2290                            and expect that it will work.
2291                            From the viewpoint of routing cache they are broken,
2292                            because we are not allowed to build multicast path
2293                            with loopback source addr (look, routing cache
2294                            cannot know, that ttl is zero, so that packet
2295                            will not leave this host and route is valid).
2296                            Luckily, this hack is good workaround.
2297                          */
2298
2299                         fl4->flowi4_oif = dev_out->ifindex;
2300                         goto make_route;
2301                 }
2302
2303                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2304                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2305                         if (!__ip_dev_find(net, fl4->saddr, false))
2306                                 goto out;
2307                 }
2308         }
2309
2310
2311         if (fl4->flowi4_oif) {
2312                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2313                 rth = ERR_PTR(-ENODEV);
2314                 if (!dev_out)
2315                         goto out;
2316
2317                 /* RACE: Check return value of inet_select_addr instead. */
2318                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2319                         rth = ERR_PTR(-ENETUNREACH);
2320                         goto out;
2321                 }
2322                 if (ipv4_is_local_multicast(fl4->daddr) ||
2323                     ipv4_is_lbcast(fl4->daddr) ||
2324                     fl4->flowi4_proto == IPPROTO_IGMP) {
2325                         if (!fl4->saddr)
2326                                 fl4->saddr = inet_select_addr(dev_out, 0,
2327                                                               RT_SCOPE_LINK);
2328                         goto make_route;
2329                 }
2330                 if (!fl4->saddr) {
2331                         if (ipv4_is_multicast(fl4->daddr))
2332                                 fl4->saddr = inet_select_addr(dev_out, 0,
2333                                                               fl4->flowi4_scope);
2334                         else if (!fl4->daddr)
2335                                 fl4->saddr = inet_select_addr(dev_out, 0,
2336                                                               RT_SCOPE_HOST);
2337                 }
2338         }
2339
2340         if (!fl4->daddr) {
2341                 fl4->daddr = fl4->saddr;
2342                 if (!fl4->daddr)
2343                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2344                 dev_out = net->loopback_dev;
2345                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2346                 res.type = RTN_LOCAL;
2347                 flags |= RTCF_LOCAL;
2348                 goto make_route;
2349         }
2350
2351         err = fib_lookup(net, fl4, &res, 0);
2352         if (err) {
2353                 res.fi = NULL;
2354                 res.table = NULL;
2355                 if (fl4->flowi4_oif &&
2356                     !netif_index_is_l3_master(net, fl4->flowi4_oif)) {
2357                         /* Apparently, routing tables are wrong. Assume,
2358                            that the destination is on link.
2359
2360                            WHY? DW.
2361                            Because we are allowed to send to iface
2362                            even if it has NO routes and NO assigned
2363                            addresses. When oif is specified, routing
2364                            tables are looked up with only one purpose:
2365                            to catch if destination is gatewayed, rather than
2366                            direct. Moreover, if MSG_DONTROUTE is set,
2367                            we send packet, ignoring both routing tables
2368                            and ifaddr state. --ANK
2369
2370
2371                            We could make it even if oif is unknown,
2372                            likely IPv6, but we do not.
2373                          */
2374
2375                         if (fl4->saddr == 0)
2376                                 fl4->saddr = inet_select_addr(dev_out, 0,
2377                                                               RT_SCOPE_LINK);
2378                         res.type = RTN_UNICAST;
2379                         goto make_route;
2380                 }
2381                 rth = ERR_PTR(err);
2382                 goto out;
2383         }
2384
2385         if (res.type == RTN_LOCAL) {
2386                 if (!fl4->saddr) {
2387                         if (res.fi->fib_prefsrc)
2388                                 fl4->saddr = res.fi->fib_prefsrc;
2389                         else
2390                                 fl4->saddr = fl4->daddr;
2391                 }
2392
2393                 /* L3 master device is the loopback for that domain */
2394                 dev_out = l3mdev_master_dev_rcu(dev_out) ? : net->loopback_dev;
2395                 fl4->flowi4_oif = dev_out->ifindex;
2396                 flags |= RTCF_LOCAL;
2397                 goto make_route;
2398         }
2399
2400         fib_select_path(net, &res, fl4, mp_hash);
2401
2402         dev_out = FIB_RES_DEV(res);
2403         fl4->flowi4_oif = dev_out->ifindex;
2404
2405
2406 make_route:
2407         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2408
2409 out:
2410         rcu_read_unlock();
2411         return rth;
2412 }
2413 EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
2414
2415 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2416 {
2417         return NULL;
2418 }
2419
2420 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2421 {
2422         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2423
2424         return mtu ? : dst->dev->mtu;
2425 }
2426
2427 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2428                                           struct sk_buff *skb, u32 mtu)
2429 {
2430 }
2431
2432 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2433                                        struct sk_buff *skb)
2434 {
2435 }
2436
2437 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2438                                           unsigned long old)
2439 {
2440         return NULL;
2441 }
2442
2443 static struct dst_ops ipv4_dst_blackhole_ops = {
2444         .family                 =       AF_INET,
2445         .check                  =       ipv4_blackhole_dst_check,
2446         .mtu                    =       ipv4_blackhole_mtu,
2447         .default_advmss         =       ipv4_default_advmss,
2448         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2449         .redirect               =       ipv4_rt_blackhole_redirect,
2450         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2451         .neigh_lookup           =       ipv4_neigh_lookup,
2452 };
2453
2454 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2455 {
2456         struct rtable *ort = (struct rtable *) dst_orig;
2457         struct rtable *rt;
2458
2459         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2460         if (rt) {
2461                 struct dst_entry *new = &rt->dst;
2462
2463                 new->__use = 1;
2464                 new->input = dst_discard;
2465                 new->output = dst_discard_out;
2466
2467                 new->dev = ort->dst.dev;
2468                 if (new->dev)
2469                         dev_hold(new->dev);
2470
2471                 rt->rt_is_input = ort->rt_is_input;
2472                 rt->rt_iif = ort->rt_iif;
2473                 rt->rt_pmtu = ort->rt_pmtu;
2474                 rt->rt_mtu_locked = ort->rt_mtu_locked;
2475
2476                 rt->rt_genid = rt_genid_ipv4(net);
2477                 rt->rt_flags = ort->rt_flags;
2478                 rt->rt_type = ort->rt_type;
2479                 rt->rt_gateway = ort->rt_gateway;
2480                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2481
2482                 INIT_LIST_HEAD(&rt->rt_uncached);
2483                 dst_free(new);
2484         }
2485
2486         dst_release(dst_orig);
2487
2488         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2489 }
2490
2491 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2492                                     const struct sock *sk)
2493 {
2494         struct rtable *rt = __ip_route_output_key(net, flp4);
2495
2496         if (IS_ERR(rt))
2497                 return rt;
2498
2499         if (flp4->flowi4_proto)
2500                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2501                                                         flowi4_to_flowi(flp4),
2502                                                         sk, 0);
2503
2504         return rt;
2505 }
2506 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2507
2508 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2509                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2510                         u32 seq, int event, int nowait, unsigned int flags)
2511 {
2512         struct rtable *rt = skb_rtable(skb);
2513         struct rtmsg *r;
2514         struct nlmsghdr *nlh;
2515         unsigned long expires = 0;
2516         u32 error;
2517         u32 metrics[RTAX_MAX];
2518
2519         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2520         if (!nlh)
2521                 return -EMSGSIZE;
2522
2523         r = nlmsg_data(nlh);
2524         r->rtm_family    = AF_INET;
2525         r->rtm_dst_len  = 32;
2526         r->rtm_src_len  = 0;
2527         r->rtm_tos      = fl4->flowi4_tos;
2528         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2529         if (nla_put_u32(skb, RTA_TABLE, table_id))
2530                 goto nla_put_failure;
2531         r->rtm_type     = rt->rt_type;
2532         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2533         r->rtm_protocol = RTPROT_UNSPEC;
2534         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2535         if (rt->rt_flags & RTCF_NOTIFY)
2536                 r->rtm_flags |= RTM_F_NOTIFY;
2537         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2538                 r->rtm_flags |= RTCF_DOREDIRECT;
2539
2540         if (nla_put_in_addr(skb, RTA_DST, dst))
2541                 goto nla_put_failure;
2542         if (src) {
2543                 r->rtm_src_len = 32;
2544                 if (nla_put_in_addr(skb, RTA_SRC, src))
2545                         goto nla_put_failure;
2546         }
2547         if (rt->dst.dev &&
2548             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2549                 goto nla_put_failure;
2550 #ifdef CONFIG_IP_ROUTE_CLASSID
2551         if (rt->dst.tclassid &&
2552             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2553                 goto nla_put_failure;
2554 #endif
2555         if (!rt_is_input_route(rt) &&
2556             fl4->saddr != src) {
2557                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2558                         goto nla_put_failure;
2559         }
2560         if (rt->rt_uses_gateway &&
2561             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2562                 goto nla_put_failure;
2563
2564         expires = rt->dst.expires;
2565         if (expires) {
2566                 unsigned long now = jiffies;
2567
2568                 if (time_before(now, expires))
2569                         expires -= now;
2570                 else
2571                         expires = 0;
2572         }
2573
2574         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2575         if (rt->rt_pmtu && expires)
2576                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2577         if (rt->rt_mtu_locked && expires)
2578                 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2579         if (rtnetlink_put_metrics(skb, metrics) < 0)
2580                 goto nla_put_failure;
2581
2582         if (fl4->flowi4_mark &&
2583             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2584                 goto nla_put_failure;
2585
2586         error = rt->dst.error;
2587
2588         if (rt_is_input_route(rt)) {
2589 #ifdef CONFIG_IP_MROUTE
2590                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2591                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2592                         int err = ipmr_get_route(net, skb,
2593                                                  fl4->saddr, fl4->daddr,
2594                                                  r, nowait, portid);
2595
2596                         if (err <= 0) {
2597                                 if (!nowait) {
2598                                         if (err == 0)
2599                                                 return 0;
2600                                         goto nla_put_failure;
2601                                 } else {
2602                                         if (err == -EMSGSIZE)
2603                                                 goto nla_put_failure;
2604                                         error = err;
2605                                 }
2606                         }
2607                 } else
2608 #endif
2609                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2610                                 goto nla_put_failure;
2611         }
2612
2613         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2614                 goto nla_put_failure;
2615
2616         nlmsg_end(skb, nlh);
2617         return 0;
2618
2619 nla_put_failure:
2620         nlmsg_cancel(skb, nlh);
2621         return -EMSGSIZE;
2622 }
2623
2624 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2625 {
2626         struct net *net = sock_net(in_skb->sk);
2627         struct rtmsg *rtm;
2628         struct nlattr *tb[RTA_MAX+1];
2629         struct rtable *rt = NULL;
2630         struct flowi4 fl4;
2631         __be32 dst = 0;
2632         __be32 src = 0;
2633         u32 iif;
2634         int err;
2635         int mark;
2636         struct sk_buff *skb;
2637         u32 table_id = RT_TABLE_MAIN;
2638
2639         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2640         if (err < 0)
2641                 goto errout;
2642
2643         rtm = nlmsg_data(nlh);
2644
2645         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2646         if (!skb) {
2647                 err = -ENOBUFS;
2648                 goto errout;
2649         }
2650
2651         /* Reserve room for dummy headers, this skb can pass
2652            through good chunk of routing engine.
2653          */
2654         skb_reset_mac_header(skb);
2655         skb_reset_network_header(skb);
2656
2657         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2658         ip_hdr(skb)->protocol = IPPROTO_UDP;
2659         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2660
2661         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2662         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2663         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2664         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2665
2666         memset(&fl4, 0, sizeof(fl4));
2667         fl4.daddr = dst;
2668         fl4.saddr = src;
2669         fl4.flowi4_tos = rtm->rtm_tos;
2670         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2671         fl4.flowi4_mark = mark;
2672
2673         if (iif) {
2674                 struct net_device *dev;
2675
2676                 dev = __dev_get_by_index(net, iif);
2677                 if (!dev) {
2678                         err = -ENODEV;
2679                         goto errout_free;
2680                 }
2681
2682                 skb->protocol   = htons(ETH_P_IP);
2683                 skb->dev        = dev;
2684                 skb->mark       = mark;
2685                 local_bh_disable();
2686                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2687                 local_bh_enable();
2688
2689                 rt = skb_rtable(skb);
2690                 if (err == 0 && rt->dst.error)
2691                         err = -rt->dst.error;
2692         } else {
2693                 rt = ip_route_output_key(net, &fl4);
2694
2695                 err = 0;
2696                 if (IS_ERR(rt))
2697                         err = PTR_ERR(rt);
2698         }
2699
2700         if (err)
2701                 goto errout_free;
2702
2703         skb_dst_set(skb, &rt->dst);
2704         if (rtm->rtm_flags & RTM_F_NOTIFY)
2705                 rt->rt_flags |= RTCF_NOTIFY;
2706
2707         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2708                 table_id = rt->rt_table_id;
2709
2710         err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2711                            NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2712                            RTM_NEWROUTE, 0, 0);
2713         if (err < 0)
2714                 goto errout_free;
2715
2716         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2717 errout:
2718         return err;
2719
2720 errout_free:
2721         kfree_skb(skb);
2722         goto errout;
2723 }
2724
2725 void ip_rt_multicast_event(struct in_device *in_dev)
2726 {
2727         rt_cache_flush(dev_net(in_dev->dev));
2728 }
2729
2730 #ifdef CONFIG_SYSCTL
2731 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2732 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2733 static int ip_rt_gc_elasticity __read_mostly    = 8;
2734 static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;
2735
2736 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2737                                         void __user *buffer,
2738                                         size_t *lenp, loff_t *ppos)
2739 {
2740         struct net *net = (struct net *)__ctl->extra1;
2741
2742         if (write) {
2743                 rt_cache_flush(net);
2744                 fnhe_genid_bump(net);
2745                 return 0;
2746         }
2747
2748         return -EINVAL;
2749 }
2750
2751 static struct ctl_table ipv4_route_table[] = {
2752         {
2753                 .procname       = "gc_thresh",
2754                 .data           = &ipv4_dst_ops.gc_thresh,
2755                 .maxlen         = sizeof(int),
2756                 .mode           = 0644,
2757                 .proc_handler   = proc_dointvec,
2758         },
2759         {
2760                 .procname       = "max_size",
2761                 .data           = &ip_rt_max_size,
2762                 .maxlen         = sizeof(int),
2763                 .mode           = 0644,
2764                 .proc_handler   = proc_dointvec,
2765         },
2766         {
2767                 /*  Deprecated. Use gc_min_interval_ms */
2768
2769                 .procname       = "gc_min_interval",
2770                 .data           = &ip_rt_gc_min_interval,
2771                 .maxlen         = sizeof(int),
2772                 .mode           = 0644,
2773                 .proc_handler   = proc_dointvec_jiffies,
2774         },
2775         {
2776                 .procname       = "gc_min_interval_ms",
2777                 .data           = &ip_rt_gc_min_interval,
2778                 .maxlen         = sizeof(int),
2779                 .mode           = 0644,
2780                 .proc_handler   = proc_dointvec_ms_jiffies,
2781         },
2782         {
2783                 .procname       = "gc_timeout",
2784                 .data           = &ip_rt_gc_timeout,
2785                 .maxlen         = sizeof(int),
2786                 .mode           = 0644,
2787                 .proc_handler   = proc_dointvec_jiffies,
2788         },
2789         {
2790                 .procname       = "gc_interval",
2791                 .data           = &ip_rt_gc_interval,
2792                 .maxlen         = sizeof(int),
2793                 .mode           = 0644,
2794                 .proc_handler   = proc_dointvec_jiffies,
2795         },
2796         {
2797                 .procname       = "redirect_load",
2798                 .data           = &ip_rt_redirect_load,
2799                 .maxlen         = sizeof(int),
2800                 .mode           = 0644,
2801                 .proc_handler   = proc_dointvec,
2802         },
2803         {
2804                 .procname       = "redirect_number",
2805                 .data           = &ip_rt_redirect_number,
2806                 .maxlen         = sizeof(int),
2807                 .mode           = 0644,
2808                 .proc_handler   = proc_dointvec,
2809         },
2810         {
2811                 .procname       = "redirect_silence",
2812                 .data           = &ip_rt_redirect_silence,
2813                 .maxlen         = sizeof(int),
2814                 .mode           = 0644,
2815                 .proc_handler   = proc_dointvec,
2816         },
2817         {
2818                 .procname       = "error_cost",
2819                 .data           = &ip_rt_error_cost,
2820                 .maxlen         = sizeof(int),
2821                 .mode           = 0644,
2822                 .proc_handler   = proc_dointvec,
2823         },
2824         {
2825                 .procname       = "error_burst",
2826                 .data           = &ip_rt_error_burst,
2827                 .maxlen         = sizeof(int),
2828                 .mode           = 0644,
2829                 .proc_handler   = proc_dointvec,
2830         },
2831         {
2832                 .procname       = "gc_elasticity",
2833                 .data           = &ip_rt_gc_elasticity,
2834                 .maxlen         = sizeof(int),
2835                 .mode           = 0644,
2836                 .proc_handler   = proc_dointvec,
2837         },
2838         {
2839                 .procname       = "mtu_expires",
2840                 .data           = &ip_rt_mtu_expires,
2841                 .maxlen         = sizeof(int),
2842                 .mode           = 0644,
2843                 .proc_handler   = proc_dointvec_jiffies,
2844         },
2845         {
2846                 .procname       = "min_pmtu",
2847                 .data           = &ip_rt_min_pmtu,
2848                 .maxlen         = sizeof(int),
2849                 .mode           = 0644,
2850                 .proc_handler   = proc_dointvec_minmax,
2851                 .extra1         = &ip_min_valid_pmtu,
2852         },
2853         {
2854                 .procname       = "min_adv_mss",
2855                 .data           = &ip_rt_min_advmss,
2856                 .maxlen         = sizeof(int),
2857                 .mode           = 0644,
2858                 .proc_handler   = proc_dointvec,
2859         },
2860         { }
2861 };
2862
2863 static struct ctl_table ipv4_route_flush_table[] = {
2864         {
2865                 .procname       = "flush",
2866                 .maxlen         = sizeof(int),
2867                 .mode           = 0200,
2868                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2869         },
2870         { },
2871 };
2872
2873 static __net_init int sysctl_route_net_init(struct net *net)
2874 {
2875         struct ctl_table *tbl;
2876
2877         tbl = ipv4_route_flush_table;
2878         if (!net_eq(net, &init_net)) {
2879                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2880                 if (!tbl)
2881                         goto err_dup;
2882
2883                 /* Don't export sysctls to unprivileged users */
2884                 if (net->user_ns != &init_user_ns)
2885                         tbl[0].procname = NULL;
2886         }
2887         tbl[0].extra1 = net;
2888
2889         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2890         if (!net->ipv4.route_hdr)
2891                 goto err_reg;
2892         return 0;
2893
2894 err_reg:
2895         if (tbl != ipv4_route_flush_table)
2896                 kfree(tbl);
2897 err_dup:
2898         return -ENOMEM;
2899 }
2900
2901 static __net_exit void sysctl_route_net_exit(struct net *net)
2902 {
2903         struct ctl_table *tbl;
2904
2905         tbl = net->ipv4.route_hdr->ctl_table_arg;
2906         unregister_net_sysctl_table(net->ipv4.route_hdr);
2907         BUG_ON(tbl == ipv4_route_flush_table);
2908         kfree(tbl);
2909 }
2910
2911 static __net_initdata struct pernet_operations sysctl_route_ops = {
2912         .init = sysctl_route_net_init,
2913         .exit = sysctl_route_net_exit,
2914 };
2915 #endif
2916
2917 static __net_init int rt_genid_init(struct net *net)
2918 {
2919         atomic_set(&net->ipv4.rt_genid, 0);
2920         atomic_set(&net->fnhe_genid, 0);
2921         get_random_bytes(&net->ipv4.dev_addr_genid,
2922                          sizeof(net->ipv4.dev_addr_genid));
2923         return 0;
2924 }
2925
2926 static __net_initdata struct pernet_operations rt_genid_ops = {
2927         .init = rt_genid_init,
2928 };
2929
2930 static int __net_init ipv4_inetpeer_init(struct net *net)
2931 {
2932         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2933
2934         if (!bp)
2935                 return -ENOMEM;
2936         inet_peer_base_init(bp);
2937         net->ipv4.peers = bp;
2938         return 0;
2939 }
2940
2941 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2942 {
2943         struct inet_peer_base *bp = net->ipv4.peers;
2944
2945         net->ipv4.peers = NULL;
2946         inetpeer_invalidate_tree(bp);
2947         kfree(bp);
2948 }
2949
2950 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2951         .init   =       ipv4_inetpeer_init,
2952         .exit   =       ipv4_inetpeer_exit,
2953 };
2954
2955 #ifdef CONFIG_IP_ROUTE_CLASSID
2956 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2957 #endif /* CONFIG_IP_ROUTE_CLASSID */
2958
2959 int __init ip_rt_init(void)
2960 {
2961         void *idents_hash;
2962         int rc = 0;
2963         int cpu;
2964
2965         /* For modern hosts, this will use 2 MB of memory */
2966         idents_hash = alloc_large_system_hash("IP idents",
2967                                               sizeof(*ip_idents) + sizeof(*ip_tstamps),
2968                                               0,
2969                                               16, /* one bucket per 64 KB */
2970                                               0,
2971                                               NULL,
2972                                               &ip_idents_mask,
2973                                               2048,
2974                                               256*1024);
2975
2976         ip_idents = idents_hash;
2977
2978         prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
2979
2980         ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
2981         memset(ip_tstamps, 0, (ip_idents_mask + 1) * sizeof(*ip_tstamps));
2982
2983         for_each_possible_cpu(cpu) {
2984                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
2985
2986                 INIT_LIST_HEAD(&ul->head);
2987                 spin_lock_init(&ul->lock);
2988         }
2989 #ifdef CONFIG_IP_ROUTE_CLASSID
2990         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2991         if (!ip_rt_acct)
2992                 panic("IP: failed to allocate ip_rt_acct\n");
2993 #endif
2994
2995         ipv4_dst_ops.kmem_cachep =
2996                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2997                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2998
2999         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3000
3001         if (dst_entries_init(&ipv4_dst_ops) < 0)
3002                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3003
3004         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3005                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3006
3007         ipv4_dst_ops.gc_thresh = ~0;
3008         ip_rt_max_size = INT_MAX;
3009
3010         devinet_init();
3011         ip_fib_init();
3012
3013         if (ip_rt_proc_init())
3014                 pr_err("Unable to create route proc files\n");
3015 #ifdef CONFIG_XFRM
3016         xfrm_init();
3017         xfrm4_init();
3018 #endif
3019         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3020
3021 #ifdef CONFIG_SYSCTL
3022         register_pernet_subsys(&sysctl_route_ops);
3023 #endif
3024         register_pernet_subsys(&rt_genid_ops);
3025         register_pernet_subsys(&ipv4_inetpeer_ops);
3026         return rc;
3027 }
3028
3029 #ifdef CONFIG_SYSCTL
3030 /*
3031  * We really need to sanitize the damn ipv4 init order, then all
3032  * this nonsense will go away.
3033  */
3034 void __init ip_static_sysctl_init(void)
3035 {
3036         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3037 }
3038 #endif