net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #define pr_fmt(fmt) "IPv4: " fmt
  66
  67 #include <linux/module.h>
  68 #include <asm/uaccess.h>
  69 #include <linux/bitops.h>
  70 #include <linux/types.h>
  71 #include <linux/kernel.h>
  72 #include <linux/mm.h>
  73 #include <linux/bootmem.h>
  74 #include <linux/string.h>
  75 #include <linux/socket.h>
  76 #include <linux/sockios.h>
  77 #include <linux/errno.h>
  78 #include <linux/in.h>
  79 #include <linux/inet.h>
  80 #include <linux/netdevice.h>
  81 #include <linux/proc_fs.h>
  82 #include <linux/init.h>
  83 #include <linux/skbuff.h>
  84 #include <linux/inetdevice.h>
  85 #include <linux/igmp.h>
  86 #include <linux/pkt_sched.h>
  87 #include <linux/mroute.h>
  88 #include <linux/netfilter_ipv4.h>
  89 #include <linux/random.h>
  90 #include <linux/rcupdate.h>
  91 #include <linux/times.h>
  92 #include <linux/slab.h>
  93 #include <linux/jhash.h>
  94 #include <net/dst.h>
  95 #include <net/dst_metadata.h>
  96 #include <net/net_namespace.h>
  97 #include <net/protocol.h>
  98 #include <net/ip.h>
  99 #include <net/route.h>
 100 #include <net/inetpeer.h>
 101 #include <net/sock.h>
 102 #include <net/ip_fib.h>
 103 #include <net/arp.h>
 104 #include <net/tcp.h>
 105 #include <net/icmp.h>
 106 #include <net/xfrm.h>
 107 #include <net/lwtunnel.h>
 108 #include <net/netevent.h>
 109 #include <net/rtnetlink.h>
 110 #ifdef CONFIG_SYSCTL
 111 #include <linux/sysctl.h>
 112 #include <linux/kmemleak.h>
 113 #endif
 114 #include <net/secure_seq.h>
 115 #include <net/ip_tunnels.h>
 116 #include <net/l3mdev.h>
 117
 118 #define RT_FL_TOS(oldflp4) \
 119         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 120
 121 #define RT_GC_TIMEOUT (300*HZ)
 122
 123 static int ip_rt_max_size;
 124 static int ip_rt_redirect_number __read_mostly  = 9;
 125 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 126 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 127 static int ip_rt_error_cost __read_mostly       = HZ;
 128 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 129 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 130 static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 131 static int ip_rt_min_advmss __read_mostly       = 256;
 132
 133 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 134
 135 /*
 136  *      Interface to generic destination cache.
 137  */
 138
 139 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 140 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 141 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 142 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 143 static void              ipv4_link_failure(struct sk_buff *skb);
 144 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 145                                            struct sk_buff *skb, u32 mtu);
 146 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 147                                         struct sk_buff *skb);
 148 static void             ipv4_dst_destroy(struct dst_entry *dst);
 149
 150 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 151 {
 152         WARN_ON(1);
 153         return NULL;
 154 }
 155
 156 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 157                                            struct sk_buff *skb,
 158                                            const void *daddr);
 159
 160 static struct dst_ops ipv4_dst_ops = {
 161         .family =               AF_INET,
 162         .check =                ipv4_dst_check,
 163         .default_advmss =       ipv4_default_advmss,
 164         .mtu =                  ipv4_mtu,
 165         .cow_metrics =          ipv4_cow_metrics,
 166         .destroy =              ipv4_dst_destroy,
 167         .negative_advice =      ipv4_negative_advice,
 168         .link_failure =         ipv4_link_failure,
 169         .update_pmtu =          ip_rt_update_pmtu,
 170         .redirect =             ip_do_redirect,
 171         .local_out =            __ip_local_out,
 172         .neigh_lookup =         ipv4_neigh_lookup,
 173 };
 174
 175 #define ECN_OR_COST(class)      TC_PRIO_##class
 176
 177 const __u8 ip_tos2prio[16] = {
 178         TC_PRIO_BESTEFFORT,
 179         ECN_OR_COST(BESTEFFORT),
 180         TC_PRIO_BESTEFFORT,
 181         ECN_OR_COST(BESTEFFORT),
 182         TC_PRIO_BULK,
 183         ECN_OR_COST(BULK),
 184         TC_PRIO_BULK,
 185         ECN_OR_COST(BULK),
 186         TC_PRIO_INTERACTIVE,
 187         ECN_OR_COST(INTERACTIVE),
 188         TC_PRIO_INTERACTIVE,
 189         ECN_OR_COST(INTERACTIVE),
 190         TC_PRIO_INTERACTIVE_BULK,
 191         ECN_OR_COST(INTERACTIVE_BULK),
 192         TC_PRIO_INTERACTIVE_BULK,
 193         ECN_OR_COST(INTERACTIVE_BULK)
 194 };
 195 EXPORT_SYMBOL(ip_tos2prio);
 196
 197 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 198 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 199
 200 #ifdef CONFIG_PROC_FS
 201 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 202 {
 203         if (*pos)
 204                 return NULL;
 205         return SEQ_START_TOKEN;
 206 }
 207
 208 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 209 {
 210         ++*pos;
 211         return NULL;
 212 }
 213
 214 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 215 {
 216 }
 217
 218 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 219 {
 220         if (v == SEQ_START_TOKEN)
 221                 seq_printf(seq, "%-127s\n",
 222                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 223                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 224                            "HHUptod\tSpecDst");
 225         return 0;
 226 }
 227
 228 static const struct seq_operations rt_cache_seq_ops = {
 229         .start  = rt_cache_seq_start,
 230         .next   = rt_cache_seq_next,
 231         .stop   = rt_cache_seq_stop,
 232         .show   = rt_cache_seq_show,
 233 };
 234
 235 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 236 {
 237         return seq_open(file, &rt_cache_seq_ops);
 238 }
 239
 240 static const struct file_operations rt_cache_seq_fops = {
 241         .owner   = THIS_MODULE,
 242         .open    = rt_cache_seq_open,
 243         .read    = seq_read,
 244         .llseek  = seq_lseek,
 245         .release = seq_release,
 246 };
 247
 248
 249 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 250 {
 251         int cpu;
 252
 253         if (*pos == 0)
 254                 return SEQ_START_TOKEN;
 255
 256         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 257                 if (!cpu_possible(cpu))
 258                         continue;
 259                 *pos = cpu+1;
 260                 return &per_cpu(rt_cache_stat, cpu);
 261         }
 262         return NULL;
 263 }
 264
 265 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 266 {
 267         int cpu;
 268
 269         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 270                 if (!cpu_possible(cpu))
 271                         continue;
 272                 *pos = cpu+1;
 273                 return &per_cpu(rt_cache_stat, cpu);
 274         }
 275         (*pos)++;
 276         return NULL;
 277
 278 }
 279
 280 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 281 {
 282
 283 }
 284
 285 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 286 {
 287         struct rt_cache_stat *st = v;
 288
 289         if (v == SEQ_START_TOKEN) {
 290                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 291                 return 0;
 292         }
 293
 294         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 295                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 296                    dst_entries_get_slow(&ipv4_dst_ops),
 297                    0, /* st->in_hit */
 298                    st->in_slow_tot,
 299                    st->in_slow_mc,
 300                    st->in_no_route,
 301                    st->in_brd,
 302                    st->in_martian_dst,
 303                    st->in_martian_src,
 304
 305                    0, /* st->out_hit */
 306                    st->out_slow_tot,
 307                    st->out_slow_mc,
 308
 309                    0, /* st->gc_total */
 310                    0, /* st->gc_ignored */
 311                    0, /* st->gc_goal_miss */
 312                    0, /* st->gc_dst_overflow */
 313                    0, /* st->in_hlist_search */
 314                    0  /* st->out_hlist_search */
 315                 );
 316         return 0;
 317 }
 318
 319 static const struct seq_operations rt_cpu_seq_ops = {
 320         .start  = rt_cpu_seq_start,
 321         .next   = rt_cpu_seq_next,
 322         .stop   = rt_cpu_seq_stop,
 323         .show   = rt_cpu_seq_show,
 324 };
 325
 326
 327 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 328 {
 329         return seq_open(file, &rt_cpu_seq_ops);
 330 }
 331
 332 static const struct file_operations rt_cpu_seq_fops = {
 333         .owner   = THIS_MODULE,
 334         .open    = rt_cpu_seq_open,
 335         .read    = seq_read,
 336         .llseek  = seq_lseek,
 337         .release = seq_release,
 338 };
 339
 340 #ifdef CONFIG_IP_ROUTE_CLASSID
 341 static int rt_acct_proc_show(struct seq_file *m, void *v)
 342 {
 343         struct ip_rt_acct *dst, *src;
 344         unsigned int i, j;
 345
 346         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 347         if (!dst)
 348                 return -ENOMEM;
 349
 350         for_each_possible_cpu(i) {
 351                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 352                 for (j = 0; j < 256; j++) {
 353                         dst[j].o_bytes   += src[j].o_bytes;
 354                         dst[j].o_packets += src[j].o_packets;
 355                         dst[j].i_bytes   += src[j].i_bytes;
 356                         dst[j].i_packets += src[j].i_packets;
 357                 }
 358         }
 359
 360         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 361         kfree(dst);
 362         return 0;
 363 }
 364
 365 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 366 {
 367         return single_open(file, rt_acct_proc_show, NULL);
 368 }
 369
 370 static const struct file_operations rt_acct_proc_fops = {
 371         .owner          = THIS_MODULE,
 372         .open           = rt_acct_proc_open,
 373         .read           = seq_read,
 374         .llseek         = seq_lseek,
 375         .release        = single_release,
 376 };
 377 #endif
 378
 379 static int __net_init ip_rt_do_proc_init(struct net *net)
 380 {
 381         struct proc_dir_entry *pde;
 382
 383         pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
 384                           &rt_cache_seq_fops);
 385         if (!pde)
 386                 goto err1;
 387
 388         pde = proc_create("rt_cache", S_IRUGO,
 389                           net->proc_net_stat, &rt_cpu_seq_fops);
 390         if (!pde)
 391                 goto err2;
 392
 393 #ifdef CONFIG_IP_ROUTE_CLASSID
 394         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 395         if (!pde)
 396                 goto err3;
 397 #endif
 398         return 0;
 399
 400 #ifdef CONFIG_IP_ROUTE_CLASSID
 401 err3:
 402         remove_proc_entry("rt_cache", net->proc_net_stat);
 403 #endif
 404 err2:
 405         remove_proc_entry("rt_cache", net->proc_net);
 406 err1:
 407         return -ENOMEM;
 408 }
 409
 410 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 411 {
 412         remove_proc_entry("rt_cache", net->proc_net_stat);
 413         remove_proc_entry("rt_cache", net->proc_net);
 414 #ifdef CONFIG_IP_ROUTE_CLASSID
 415         remove_proc_entry("rt_acct", net->proc_net);
 416 #endif
 417 }
 418
 419 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 420         .init = ip_rt_do_proc_init,
 421         .exit = ip_rt_do_proc_exit,
 422 };
 423
 424 static int __init ip_rt_proc_init(void)
 425 {
 426         return register_pernet_subsys(&ip_rt_proc_ops);
 427 }
 428
 429 #else
 430 static inline int ip_rt_proc_init(void)
 431 {
 432         return 0;
 433 }
 434 #endif /* CONFIG_PROC_FS */
 435
 436 static inline bool rt_is_expired(const struct rtable *rth)
 437 {
 438         return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 439 }
 440
 441 void rt_cache_flush(struct net *net)
 442 {
 443         rt_genid_bump_ipv4(net);
 444 }
 445
 446 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 447                                            struct sk_buff *skb,
 448                                            const void *daddr)
 449 {
 450         struct net_device *dev = dst->dev;
 451         const __be32 *pkey = daddr;
 452         const struct rtable *rt;
 453         struct neighbour *n;
 454
 455         rt = (const struct rtable *) dst;
 456         if (rt->rt_gateway)
 457                 pkey = (const __be32 *) &rt->rt_gateway;
 458         else if (skb)
 459                 pkey = &ip_hdr(skb)->daddr;
 460
 461         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
 462         if (n)
 463                 return n;
 464         return neigh_create(&arp_tbl, pkey, dev);
 465 }
 466
 467 /* Hash tables of size 2048..262144 depending on RAM size.
 468  * Each bucket uses 8 bytes.
 469  */
 470 static u32 ip_idents_mask __read_mostly;
 471 static atomic_t *ip_idents __read_mostly;
 472 static u32 *ip_tstamps __read_mostly;
 473
 474 /* In order to protect privacy, we add a perturbation to identifiers
 475  * if one generator is seldom used. This makes hard for an attacker
 476  * to infer how many packets were sent between two points in time.
 477  */
 478 u32 ip_idents_reserve(u32 hash, int segs)
 479 {
 480         u32 bucket, old, now = (u32)jiffies;
 481         atomic_t *p_id;
 482         u32 *p_tstamp;
 483         u32 delta = 0;
 484
 485         bucket = hash & ip_idents_mask;
 486         p_tstamp = ip_tstamps + bucket;
 487         p_id = ip_idents + bucket;
 488         old = ACCESS_ONCE(*p_tstamp);
 489
 490         if (old != now && cmpxchg(p_tstamp, old, now) == old)
 491                 delta = prandom_u32_max(now - old);
 492
 493         /* If UBSAN reports an error there, please make sure your compiler
 494          * supports -fno-strict-overflow before reporting it that was a bug
 495          * in UBSAN, and it has been fixed in GCC-8.
 496          */
 497         return atomic_add_return(segs + delta, p_id) - segs;
 498 }
 499 EXPORT_SYMBOL(ip_idents_reserve);
 500
 501 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 502 {
 503         u32 hash, id;
 504
 505         /* Note the following code is not safe, but this is okay. */
 506         if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
 507                 get_random_bytes(&net->ipv4.ip_id_key,
 508                                  sizeof(net->ipv4.ip_id_key));
 509
 510         hash = siphash_3u32((__force u32)iph->daddr,
 511                             (__force u32)iph->saddr,
 512                             iph->protocol,
 513                             &net->ipv4.ip_id_key);
 514         id = ip_idents_reserve(hash, segs);
 515         iph->id = htons(id);
 516 }
 517 EXPORT_SYMBOL(__ip_select_ident);
 518
 519 static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 520                              const struct iphdr *iph,
 521                              int oif, u8 tos,
 522                              u8 prot, u32 mark, int flow_flags)
 523 {
 524         if (sk) {
 525                 const struct inet_sock *inet = inet_sk(sk);
 526
 527                 oif = sk->sk_bound_dev_if;
 528                 mark = sk->sk_mark;
 529                 tos = RT_CONN_FLAGS(sk);
 530                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 531         }
 532         flowi4_init_output(fl4, oif, mark, tos,
 533                            RT_SCOPE_UNIVERSE, prot,
 534                            flow_flags,
 535                            iph->daddr, iph->saddr, 0, 0);
 536 }
 537
 538 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 539                                const struct sock *sk)
 540 {
 541         const struct iphdr *iph = ip_hdr(skb);
 542         int oif = skb->dev->ifindex;
 543         u8 tos = RT_TOS(iph->tos);
 544         u8 prot = iph->protocol;
 545         u32 mark = skb->mark;
 546
 547         __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
 548 }
 549
 550 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 551 {
 552         const struct inet_sock *inet = inet_sk(sk);
 553         const struct ip_options_rcu *inet_opt;
 554         __be32 daddr = inet->inet_daddr;
 555
 556         rcu_read_lock();
 557         inet_opt = rcu_dereference(inet->inet_opt);
 558         if (inet_opt && inet_opt->opt.srr)
 559                 daddr = inet_opt->opt.faddr;
 560         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 561                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 562                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 563                            inet_sk_flowi_flags(sk),
 564                            daddr, inet->inet_saddr, 0, 0);
 565         rcu_read_unlock();
 566 }
 567
 568 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 569                                  const struct sk_buff *skb)
 570 {
 571         if (skb)
 572                 build_skb_flow_key(fl4, skb, sk);
 573         else
 574                 build_sk_flow_key(fl4, sk);
 575 }
 576
 577 static inline void rt_free(struct rtable *rt)
 578 {
 579         call_rcu(&rt->dst.rcu_head, dst_rcu_free);
 580 }
 581
 582 static DEFINE_SPINLOCK(fnhe_lock);
 583
 584 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 585 {
 586         struct rtable *rt;
 587
 588         rt = rcu_dereference(fnhe->fnhe_rth_input);
 589         if (rt) {
 590                 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 591                 rt_free(rt);
 592         }
 593         rt = rcu_dereference(fnhe->fnhe_rth_output);
 594         if (rt) {
 595                 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 596                 rt_free(rt);
 597         }
 598 }
 599
 600 static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
 601 {
 602         struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
 603         struct fib_nh_exception *fnhe, *oldest = NULL;
 604
 605         for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
 606                 fnhe = rcu_dereference_protected(*fnhe_p,
 607                                                  lockdep_is_held(&fnhe_lock));
 608                 if (!fnhe)
 609                         break;
 610                 if (!oldest ||
 611                     time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
 612                         oldest = fnhe;
 613                         oldest_p = fnhe_p;
 614                 }
 615         }
 616         fnhe_flush_routes(oldest);
 617         *oldest_p = oldest->fnhe_next;
 618         kfree_rcu(oldest, rcu);
 619 }
 620
 621 static inline u32 fnhe_hashfun(__be32 daddr)
 622 {
 623         static u32 fnhe_hashrnd __read_mostly;
 624         u32 hval;
 625
 626         net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
 627         hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
 628         return hash_32(hval, FNHE_HASH_SHIFT);
 629 }
 630
 631 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 632 {
 633         rt->rt_pmtu = fnhe->fnhe_pmtu;
 634         rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
 635         rt->dst.expires = fnhe->fnhe_expires;
 636
 637         if (fnhe->fnhe_gw) {
 638                 rt->rt_flags |= RTCF_REDIRECTED;
 639                 rt->rt_gateway = fnhe->fnhe_gw;
 640                 rt->rt_uses_gateway = 1;
 641         }
 642 }
 643
 644 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 645                                   u32 pmtu, bool lock, unsigned long expires)
 646 {
 647         struct fnhe_hash_bucket *hash;
 648         struct fib_nh_exception *fnhe;
 649         struct rtable *rt;
 650         u32 genid, hval;
 651         unsigned int i;
 652         int depth;
 653
 654         genid = fnhe_genid(dev_net(nh->nh_dev));
 655         hval = fnhe_hashfun(daddr);
 656
 657         spin_lock_bh(&fnhe_lock);
 658
 659         hash = rcu_dereference(nh->nh_exceptions);
 660         if (!hash) {
 661                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
 662                 if (!hash)
 663                         goto out_unlock;
 664                 rcu_assign_pointer(nh->nh_exceptions, hash);
 665         }
 666
 667         hash += hval;
 668
 669         depth = 0;
 670         for (fnhe = rcu_dereference(hash->chain); fnhe;
 671              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 672                 if (fnhe->fnhe_daddr == daddr)
 673                         break;
 674                 depth++;
 675         }
 676
 677         if (fnhe) {
 678                 if (fnhe->fnhe_genid != genid)
 679                         fnhe->fnhe_genid = genid;
 680                 if (gw)
 681                         fnhe->fnhe_gw = gw;
 682                 if (pmtu) {
 683                         fnhe->fnhe_pmtu = pmtu;
 684                         fnhe->fnhe_mtu_locked = lock;
 685                 }
 686                 fnhe->fnhe_expires = max(1UL, expires);
 687                 /* Update all cached dsts too */
 688                 rt = rcu_dereference(fnhe->fnhe_rth_input);
 689                 if (rt)
 690                         fill_route_from_fnhe(rt, fnhe);
 691                 rt = rcu_dereference(fnhe->fnhe_rth_output);
 692                 if (rt)
 693                         fill_route_from_fnhe(rt, fnhe);
 694         } else {
 695                 /* Randomize max depth to avoid some side channels attacks. */
 696                 int max_depth = FNHE_RECLAIM_DEPTH +
 697                                 prandom_u32_max(FNHE_RECLAIM_DEPTH);
 698
 699                 while (depth > max_depth) {
 700                         fnhe_remove_oldest(hash);
 701                         depth--;
 702                 }
 703
 704                 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 705                 if (!fnhe)
 706                         goto out_unlock;
 707
 708                 fnhe->fnhe_next = hash->chain;
 709
 710                 fnhe->fnhe_genid = genid;
 711                 fnhe->fnhe_daddr = daddr;
 712                 fnhe->fnhe_gw = gw;
 713                 fnhe->fnhe_pmtu = pmtu;
 714                 fnhe->fnhe_mtu_locked = lock;
 715                 fnhe->fnhe_expires = expires;
 716
 717                 rcu_assign_pointer(hash->chain, fnhe);
 718
 719                 /* Exception created; mark the cached routes for the nexthop
 720                  * stale, so anyone caching it rechecks if this exception
 721                  * applies to them.
 722                  */
 723                 rt = rcu_dereference(nh->nh_rth_input);
 724                 if (rt)
 725                         rt->dst.obsolete = DST_OBSOLETE_KILL;
 726
 727                 for_each_possible_cpu(i) {
 728                         struct rtable __rcu **prt;
 729                         prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
 730                         rt = rcu_dereference(*prt);
 731                         if (rt)
 732                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 733                 }
 734         }
 735
 736         fnhe->fnhe_stamp = jiffies;
 737
 738 out_unlock:
 739         spin_unlock_bh(&fnhe_lock);
 740 }
 741
 742 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 743                              bool kill_route)
 744 {
 745         __be32 new_gw = icmp_hdr(skb)->un.gateway;
 746         __be32 old_gw = ip_hdr(skb)->saddr;
 747         struct net_device *dev = skb->dev;
 748         struct in_device *in_dev;
 749         struct fib_result res;
 750         struct neighbour *n;
 751         struct net *net;
 752
 753         switch (icmp_hdr(skb)->code & 7) {
 754         case ICMP_REDIR_NET:
 755         case ICMP_REDIR_NETTOS:
 756         case ICMP_REDIR_HOST:
 757         case ICMP_REDIR_HOSTTOS:
 758                 break;
 759
 760         default:
 761                 return;
 762         }
 763
 764         if (rt->rt_gateway != old_gw)
 765                 return;
 766
 767         in_dev = __in_dev_get_rcu(dev);
 768         if (!in_dev)
 769                 return;
 770
 771         net = dev_net(dev);
 772         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 773             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 774             ipv4_is_zeronet(new_gw))
 775                 goto reject_redirect;
 776
 777         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 778                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 779                         goto reject_redirect;
 780                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 781                         goto reject_redirect;
 782         } else {
 783                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 784                         goto reject_redirect;
 785         }
 786
 787         n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
 788         if (!n)
 789                 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
 790         if (!IS_ERR(n)) {
 791                 if (!(n->nud_state & NUD_VALID)) {
 792                         neigh_event_send(n, NULL);
 793                 } else {
 794                         if (fib_lookup(net, fl4, &res, 0) == 0) {
 795                                 struct fib_nh *nh = &FIB_RES_NH(res);
 796
 797                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
 798                                                 0, false,
 799                                                 jiffies + ip_rt_gc_timeout);
 800                         }
 801                         if (kill_route)
 802                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 803                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 804                 }
 805                 neigh_release(n);
 806         }
 807         return;
 808
 809 reject_redirect:
 810 #ifdef CONFIG_IP_ROUTE_VERBOSE
 811         if (IN_DEV_LOG_MARTIANS(in_dev)) {
 812                 const struct iphdr *iph = (const struct iphdr *) skb->data;
 813                 __be32 daddr = iph->daddr;
 814                 __be32 saddr = iph->saddr;
 815
 816                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 817                                      "  Advised path = %pI4 -> %pI4\n",
 818                                      &old_gw, dev->name, &new_gw,
 819                                      &saddr, &daddr);
 820         }
 821 #endif
 822         ;
 823 }
 824
 825 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 826 {
 827         struct rtable *rt;
 828         struct flowi4 fl4;
 829         const struct iphdr *iph = (const struct iphdr *) skb->data;
 830         int oif = skb->dev->ifindex;
 831         u8 tos = RT_TOS(iph->tos);
 832         u8 prot = iph->protocol;
 833         u32 mark = skb->mark;
 834
 835         rt = (struct rtable *) dst;
 836
 837         __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
 838         __ip_do_redirect(rt, skb, &fl4, true);
 839 }
 840
 841 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 842 {
 843         struct rtable *rt = (struct rtable *)dst;
 844         struct dst_entry *ret = dst;
 845
 846         if (rt) {
 847                 if (dst->obsolete > 0) {
 848                         ip_rt_put(rt);
 849                         ret = NULL;
 850                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 851                            rt->dst.expires) {
 852                         ip_rt_put(rt);
 853                         ret = NULL;
 854                 }
 855         }
 856         return ret;
 857 }
 858
 859 /*
 860  * Algorithm:
 861  *      1. The first ip_rt_redirect_number redirects are sent
 862  *         with exponential backoff, then we stop sending them at all,
 863  *         assuming that the host ignores our redirects.
 864  *      2. If we did not see packets requiring redirects
 865  *         during ip_rt_redirect_silence, we assume that the host
 866  *         forgot redirected route and start to send redirects again.
 867  *
 868  * This algorithm is much cheaper and more intelligent than dumb load limiting
 869  * in icmp.c.
 870  *
 871  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 872  * and "frag. need" (breaks PMTU discovery) in icmp.c.
 873  */
 874
 875 void ip_rt_send_redirect(struct sk_buff *skb)
 876 {
 877         struct rtable *rt = skb_rtable(skb);
 878         struct in_device *in_dev;
 879         struct inet_peer *peer;
 880         struct net *net;
 881         int log_martians;
 882         int vif;
 883
 884         rcu_read_lock();
 885         in_dev = __in_dev_get_rcu(rt->dst.dev);
 886         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 887                 rcu_read_unlock();
 888                 return;
 889         }
 890         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 891         vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
 892         rcu_read_unlock();
 893
 894         net = dev_net(rt->dst.dev);
 895         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
 896         if (!peer) {
 897                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 898                           rt_nexthop(rt, ip_hdr(skb)->daddr));
 899                 return;
 900         }
 901
 902         /* No redirected packets during ip_rt_redirect_silence;
 903          * reset the algorithm.
 904          */
 905         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
 906                 peer->rate_tokens = 0;
 907                 peer->n_redirects = 0;
 908         }
 909
 910         /* Too many ignored redirects; do not send anything
 911          * set dst.rate_last to the last seen redirected packet.
 912          */
 913         if (peer->n_redirects >= ip_rt_redirect_number) {
 914                 peer->rate_last = jiffies;
 915                 goto out_put_peer;
 916         }
 917
 918         /* Check for load limit; set rate_last to the latest sent
 919          * redirect.
 920          */
 921         if (peer->n_redirects == 0 ||
 922             time_after(jiffies,
 923                        (peer->rate_last +
 924                         (ip_rt_redirect_load << peer->n_redirects)))) {
 925                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 926
 927                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 928                 peer->rate_last = jiffies;
 929                 ++peer->n_redirects;
 930 #ifdef CONFIG_IP_ROUTE_VERBOSE
 931                 if (log_martians &&
 932                     peer->n_redirects == ip_rt_redirect_number)
 933                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 934                                              &ip_hdr(skb)->saddr, inet_iif(skb),
 935                                              &ip_hdr(skb)->daddr, &gw);
 936 #endif
 937         }
 938 out_put_peer:
 939         inet_putpeer(peer);
 940 }
 941
 942 static int ip_error(struct sk_buff *skb)
 943 {
 944         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 945         struct rtable *rt = skb_rtable(skb);
 946         struct inet_peer *peer;
 947         unsigned long now;
 948         struct net *net;
 949         bool send;
 950         int code;
 951
 952         /* IP on this device is disabled. */
 953         if (!in_dev)
 954                 goto out;
 955
 956         net = dev_net(rt->dst.dev);
 957         if (!IN_DEV_FORWARD(in_dev)) {
 958                 switch (rt->dst.error) {
 959                 case EHOSTUNREACH:
 960                         __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
 961                         break;
 962
 963                 case ENETUNREACH:
 964                         __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 965                         break;
 966                 }
 967                 goto out;
 968         }
 969
 970         switch (rt->dst.error) {
 971         case EINVAL:
 972         default:
 973                 goto out;
 974         case EHOSTUNREACH:
 975                 code = ICMP_HOST_UNREACH;
 976                 break;
 977         case ENETUNREACH:
 978                 code = ICMP_NET_UNREACH;
 979                 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 980                 break;
 981         case EACCES:
 982                 code = ICMP_PKT_FILTERED;
 983                 break;
 984         }
 985
 986         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
 987                                l3mdev_master_ifindex(skb->dev), 1);
 988
 989         send = true;
 990         if (peer) {
 991                 now = jiffies;
 992                 peer->rate_tokens += now - peer->rate_last;
 993                 if (peer->rate_tokens > ip_rt_error_burst)
 994                         peer->rate_tokens = ip_rt_error_burst;
 995                 peer->rate_last = now;
 996                 if (peer->rate_tokens >= ip_rt_error_cost)
 997                         peer->rate_tokens -= ip_rt_error_cost;
 998                 else
 999                         send = false;
1000                 inet_putpeer(peer);
1001         }
1002         if (send)
1003                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1004
1005 out:    kfree_skb(skb);
1006         return 0;
1007 }
1008
1009 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1010 {
1011         struct dst_entry *dst = &rt->dst;
1012         u32 old_mtu = ipv4_mtu(dst);
1013         struct fib_result res;
1014         bool lock = false;
1015
1016         if (ip_mtu_locked(dst))
1017                 return;
1018
1019         if (old_mtu < mtu)
1020                 return;
1021
1022         if (mtu < ip_rt_min_pmtu) {
1023                 lock = true;
1024                 mtu = min(old_mtu, ip_rt_min_pmtu);
1025         }
1026
1027         if (rt->rt_pmtu == mtu && !lock &&
1028             time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1029                 return;
1030
1031         rcu_read_lock();
1032         if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
1033                 struct fib_nh *nh = &FIB_RES_NH(res);
1034
1035                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
1036                                       jiffies + ip_rt_mtu_expires);
1037         }
1038         rcu_read_unlock();
1039 }
1040
1041 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1042                               struct sk_buff *skb, u32 mtu)
1043 {
1044         struct rtable *rt = (struct rtable *) dst;
1045         struct flowi4 fl4;
1046
1047         ip_rt_build_flow_key(&fl4, sk, skb);
1048         __ip_rt_update_pmtu(rt, &fl4, mtu);
1049 }
1050
1051 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1052                       int oif, u32 mark, u8 protocol, int flow_flags)
1053 {
1054         const struct iphdr *iph = (const struct iphdr *) skb->data;
1055         struct flowi4 fl4;
1056         struct rtable *rt;
1057
1058         if (!mark)
1059                 mark = IP4_REPLY_MARK(net, skb->mark);
1060
1061         __build_flow_key(&fl4, NULL, iph, oif,
1062                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1063         rt = __ip_route_output_key(net, &fl4);
1064         if (!IS_ERR(rt)) {
1065                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1066                 ip_rt_put(rt);
1067         }
1068 }
1069 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1070
1071 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1072 {
1073         const struct iphdr *iph = (const struct iphdr *) skb->data;
1074         struct flowi4 fl4;
1075         struct rtable *rt;
1076
1077         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1078
1079         if (!fl4.flowi4_mark)
1080                 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1081
1082         rt = __ip_route_output_key(sock_net(sk), &fl4);
1083         if (!IS_ERR(rt)) {
1084                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1085                 ip_rt_put(rt);
1086         }
1087 }
1088
1089 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1090 {
1091         const struct iphdr *iph = (const struct iphdr *) skb->data;
1092         struct flowi4 fl4;
1093         struct rtable *rt;
1094         struct dst_entry *odst = NULL;
1095         bool new = false;
1096
1097         bh_lock_sock(sk);
1098
1099         if (!ip_sk_accept_pmtu(sk))
1100                 goto out;
1101
1102         odst = sk_dst_get(sk);
1103
1104         if (sock_owned_by_user(sk) || !odst) {
1105                 __ipv4_sk_update_pmtu(skb, sk, mtu);
1106                 goto out;
1107         }
1108
1109         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1110
1111         rt = (struct rtable *)odst;
1112         if (odst->obsolete && !odst->ops->check(odst, 0)) {
1113                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1114                 if (IS_ERR(rt))
1115                         goto out;
1116
1117                 new = true;
1118         }
1119
1120         __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1121
1122         if (!dst_check(&rt->dst, 0)) {
1123                 if (new)
1124                         dst_release(&rt->dst);
1125
1126                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1127                 if (IS_ERR(rt))
1128                         goto out;
1129
1130                 new = true;
1131         }
1132
1133         if (new)
1134                 sk_dst_set(sk, &rt->dst);
1135
1136 out:
1137         bh_unlock_sock(sk);
1138         dst_release(odst);
1139 }
1140 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1141
1142 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1143                    int oif, u32 mark, u8 protocol, int flow_flags)
1144 {
1145         const struct iphdr *iph = (const struct iphdr *) skb->data;
1146         struct flowi4 fl4;
1147         struct rtable *rt;
1148
1149         __build_flow_key(&fl4, NULL, iph, oif,
1150                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1151         rt = __ip_route_output_key(net, &fl4);
1152         if (!IS_ERR(rt)) {
1153                 __ip_do_redirect(rt, skb, &fl4, false);
1154                 ip_rt_put(rt);
1155         }
1156 }
1157 EXPORT_SYMBOL_GPL(ipv4_redirect);
1158
1159 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1160 {
1161         const struct iphdr *iph = (const struct iphdr *) skb->data;
1162         struct flowi4 fl4;
1163         struct rtable *rt;
1164
1165         __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
1166         rt = __ip_route_output_key(sock_net(sk), &fl4);
1167         if (!IS_ERR(rt)) {
1168                 __ip_do_redirect(rt, skb, &fl4, false);
1169                 ip_rt_put(rt);
1170         }
1171 }
1172 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1173
1174 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1175 {
1176         struct rtable *rt = (struct rtable *) dst;
1177
1178         /* All IPV4 dsts are created with ->obsolete set to the value
1179          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1180          * into this function always.
1181          *
1182          * When a PMTU/redirect information update invalidates a route,
1183          * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1184          * DST_OBSOLETE_DEAD by dst_free().
1185          */
1186         if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1187                 return NULL;
1188         return dst;
1189 }
1190
1191 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1192 {
1193         struct ip_options opt;
1194         int res;
1195
1196         /* Recompile ip options since IPCB may not be valid anymore.
1197          * Also check we have a reasonable ipv4 header.
1198          */
1199         if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1200             ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1201                 return;
1202
1203         memset(&opt, 0, sizeof(opt));
1204         if (ip_hdr(skb)->ihl > 5) {
1205                 if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1206                         return;
1207                 opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1208
1209                 rcu_read_lock();
1210                 res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1211                 rcu_read_unlock();
1212
1213                 if (res)
1214                         return;
1215         }
1216         __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1217 }
1218
1219 static void ipv4_link_failure(struct sk_buff *skb)
1220 {
1221         struct rtable *rt;
1222
1223         ipv4_send_dest_unreach(skb);
1224
1225         rt = skb_rtable(skb);
1226         if (rt)
1227                 dst_set_expires(&rt->dst, 0);
1228 }
1229
1230 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1231 {
1232         pr_debug("%s: %pI4 -> %pI4, %s\n",
1233                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1234                  skb->dev ? skb->dev->name : "?");
1235         kfree_skb(skb);
1236         WARN_ON(1);
1237         return 0;
1238 }
1239
1240 /*
1241    We do not cache source address of outgoing interface,
1242    because it is used only by IP RR, TS and SRR options,
1243    so that it out of fast path.
1244
1245    BTW remember: "addr" is allowed to be not aligned
1246    in IP options!
1247  */
1248
1249 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1250 {
1251         __be32 src;
1252
1253         if (rt_is_output_route(rt))
1254                 src = ip_hdr(skb)->saddr;
1255         else {
1256                 struct fib_result res;
1257                 struct flowi4 fl4;
1258                 struct iphdr *iph;
1259
1260                 iph = ip_hdr(skb);
1261
1262                 memset(&fl4, 0, sizeof(fl4));
1263                 fl4.daddr = iph->daddr;
1264                 fl4.saddr = iph->saddr;
1265                 fl4.flowi4_tos = RT_TOS(iph->tos);
1266                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1267                 fl4.flowi4_iif = skb->dev->ifindex;
1268                 fl4.flowi4_mark = skb->mark;
1269
1270                 rcu_read_lock();
1271                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1272                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1273                 else
1274                         src = inet_select_addr(rt->dst.dev,
1275                                                rt_nexthop(rt, iph->daddr),
1276                                                RT_SCOPE_UNIVERSE);
1277                 rcu_read_unlock();
1278         }
1279         memcpy(addr, &src, 4);
1280 }
1281
1282 #ifdef CONFIG_IP_ROUTE_CLASSID
1283 static void set_class_tag(struct rtable *rt, u32 tag)
1284 {
1285         if (!(rt->dst.tclassid & 0xFFFF))
1286                 rt->dst.tclassid |= tag & 0xFFFF;
1287         if (!(rt->dst.tclassid & 0xFFFF0000))
1288                 rt->dst.tclassid |= tag & 0xFFFF0000;
1289 }
1290 #endif
1291
1292 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1293 {
1294         unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);
1295
1296         if (advmss == 0) {
1297                 advmss = max_t(unsigned int, dst->dev->mtu - 40,
1298                                ip_rt_min_advmss);
1299                 if (advmss > 65535 - 40)
1300                         advmss = 65535 - 40;
1301         }
1302         return advmss;
1303 }
1304
1305 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1306 {
1307         const struct rtable *rt = (const struct rtable *) dst;
1308         unsigned int mtu = rt->rt_pmtu;
1309
1310         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1311                 mtu = dst_metric_raw(dst, RTAX_MTU);
1312
1313         if (mtu)
1314                 return mtu;
1315
1316         mtu = READ_ONCE(dst->dev->mtu);
1317
1318         if (unlikely(ip_mtu_locked(dst))) {
1319                 if (rt->rt_uses_gateway && mtu > 576)
1320                         mtu = 576;
1321         }
1322
1323         mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1324
1325         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1326 }
1327
1328 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1329 {
1330         struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1331         struct fib_nh_exception *fnhe;
1332         u32 hval;
1333
1334         if (!hash)
1335                 return NULL;
1336
1337         hval = fnhe_hashfun(daddr);
1338
1339         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1340              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1341                 if (fnhe->fnhe_daddr == daddr)
1342                         return fnhe;
1343         }
1344         return NULL;
1345 }
1346
1347 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1348                               __be32 daddr)
1349 {
1350         bool ret = false;
1351
1352         spin_lock_bh(&fnhe_lock);
1353
1354         if (daddr == fnhe->fnhe_daddr) {
1355                 struct rtable __rcu **porig;
1356                 struct rtable *orig;
1357                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1358
1359                 if (rt_is_input_route(rt))
1360                         porig = &fnhe->fnhe_rth_input;
1361                 else
1362                         porig = &fnhe->fnhe_rth_output;
1363                 orig = rcu_dereference(*porig);
1364
1365                 if (fnhe->fnhe_genid != genid) {
1366                         fnhe->fnhe_genid = genid;
1367                         fnhe->fnhe_gw = 0;
1368                         fnhe->fnhe_pmtu = 0;
1369                         fnhe->fnhe_expires = 0;
1370                         fnhe_flush_routes(fnhe);
1371                         orig = NULL;
1372                 }
1373                 fill_route_from_fnhe(rt, fnhe);
1374                 if (!rt->rt_gateway)
1375                         rt->rt_gateway = daddr;
1376
1377                 if (!(rt->dst.flags & DST_NOCACHE)) {
1378                         rcu_assign_pointer(*porig, rt);
1379                         if (orig)
1380                                 rt_free(orig);
1381                         ret = true;
1382                 }
1383
1384                 fnhe->fnhe_stamp = jiffies;
1385         }
1386         spin_unlock_bh(&fnhe_lock);
1387
1388         return ret;
1389 }
1390
1391 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1392 {
1393         struct rtable *orig, *prev, **p;
1394         bool ret = true;
1395
1396         if (rt_is_input_route(rt)) {
1397                 p = (struct rtable **)&nh->nh_rth_input;
1398         } else {
1399                 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1400         }
1401         orig = *p;
1402
1403         prev = cmpxchg(p, orig, rt);
1404         if (prev == orig) {
1405                 if (orig)
1406                         rt_free(orig);
1407         } else
1408                 ret = false;
1409
1410         return ret;
1411 }
1412
1413 struct uncached_list {
1414         spinlock_t              lock;
1415         struct list_head        head;
1416 };
1417
1418 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1419
1420 static void rt_add_uncached_list(struct rtable *rt)
1421 {
1422         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1423
1424         rt->rt_uncached_list = ul;
1425
1426         spin_lock_bh(&ul->lock);
1427         list_add_tail(&rt->rt_uncached, &ul->head);
1428         spin_unlock_bh(&ul->lock);
1429 }
1430
1431 static void ipv4_dst_destroy(struct dst_entry *dst)
1432 {
1433         struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1434         struct rtable *rt = (struct rtable *) dst;
1435
1436         if (p != &dst_default_metrics && atomic_dec_and_test(&p->refcnt))
1437                 kfree(p);
1438
1439         if (!list_empty(&rt->rt_uncached)) {
1440                 struct uncached_list *ul = rt->rt_uncached_list;
1441
1442                 spin_lock_bh(&ul->lock);
1443                 list_del(&rt->rt_uncached);
1444                 spin_unlock_bh(&ul->lock);
1445         }
1446 }
1447
1448 void rt_flush_dev(struct net_device *dev)
1449 {
1450         struct net *net = dev_net(dev);
1451         struct rtable *rt;
1452         int cpu;
1453
1454         for_each_possible_cpu(cpu) {
1455                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1456
1457                 spin_lock_bh(&ul->lock);
1458                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1459                         if (rt->dst.dev != dev)
1460                                 continue;
1461                         rt->dst.dev = net->loopback_dev;
1462                         dev_hold(rt->dst.dev);
1463                         dev_put(dev);
1464                 }
1465                 spin_unlock_bh(&ul->lock);
1466         }
1467 }
1468
1469 static bool rt_cache_valid(const struct rtable *rt)
1470 {
1471         return  rt &&
1472                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1473                 !rt_is_expired(rt);
1474 }
1475
1476 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1477                            const struct fib_result *res,
1478                            struct fib_nh_exception *fnhe,
1479                            struct fib_info *fi, u16 type, u32 itag)
1480 {
1481         bool cached = false;
1482
1483         if (fi) {
1484                 struct fib_nh *nh = &FIB_RES_NH(*res);
1485
1486                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1487                         rt->rt_gateway = nh->nh_gw;
1488                         rt->rt_uses_gateway = 1;
1489                 }
1490                 dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1491                 if (fi->fib_metrics != &dst_default_metrics) {
1492                         rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1493                         atomic_inc(&fi->fib_metrics->refcnt);
1494                 }
1495 #ifdef CONFIG_IP_ROUTE_CLASSID
1496                 rt->dst.tclassid = nh->nh_tclassid;
1497 #endif
1498                 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1499                 if (unlikely(fnhe))
1500                         cached = rt_bind_exception(rt, fnhe, daddr);
1501                 else if (!(rt->dst.flags & DST_NOCACHE))
1502                         cached = rt_cache_route(nh, rt);
1503                 if (unlikely(!cached)) {
1504                         /* Routes we intend to cache in nexthop exception or
1505                          * FIB nexthop have the DST_NOCACHE bit clear.
1506                          * However, if we are unsuccessful at storing this
1507                          * route into the cache we really need to set it.
1508                          */
1509                         rt->dst.flags |= DST_NOCACHE;
1510                         if (!rt->rt_gateway)
1511                                 rt->rt_gateway = daddr;
1512                         rt_add_uncached_list(rt);
1513                 }
1514         } else
1515                 rt_add_uncached_list(rt);
1516
1517 #ifdef CONFIG_IP_ROUTE_CLASSID
1518 #ifdef CONFIG_IP_MULTIPLE_TABLES
1519         set_class_tag(rt, res->tclassid);
1520 #endif
1521         set_class_tag(rt, itag);
1522 #endif
1523 }
1524
1525 struct rtable *rt_dst_alloc(struct net_device *dev,
1526                             unsigned int flags, u16 type,
1527                             bool nopolicy, bool noxfrm, bool will_cache)
1528 {
1529         struct rtable *rt;
1530
1531         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1532                        (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1533                        (nopolicy ? DST_NOPOLICY : 0) |
1534                        (noxfrm ? DST_NOXFRM : 0));
1535
1536         if (rt) {
1537                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1538                 rt->rt_flags = flags;
1539                 rt->rt_type = type;
1540                 rt->rt_is_input = 0;
1541                 rt->rt_iif = 0;
1542                 rt->rt_pmtu = 0;
1543                 rt->rt_mtu_locked = 0;
1544                 rt->rt_gateway = 0;
1545                 rt->rt_uses_gateway = 0;
1546                 rt->rt_table_id = 0;
1547                 INIT_LIST_HEAD(&rt->rt_uncached);
1548
1549                 rt->dst.output = ip_output;
1550                 if (flags & RTCF_LOCAL)
1551                         rt->dst.input = ip_local_deliver;
1552         }
1553
1554         return rt;
1555 }
1556 EXPORT_SYMBOL(rt_dst_alloc);
1557
1558 /* called in rcu_read_lock() section */
1559 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1560                                 u8 tos, struct net_device *dev, int our)
1561 {
1562         struct rtable *rth;
1563         struct in_device *in_dev = __in_dev_get_rcu(dev);
1564         unsigned int flags = RTCF_MULTICAST;
1565         u32 itag = 0;
1566         int err;
1567
1568         /* Primary sanity checks. */
1569
1570         if (!in_dev)
1571                 return -EINVAL;
1572
1573         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1574             skb->protocol != htons(ETH_P_IP))
1575                 goto e_inval;
1576
1577         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1578                 goto e_inval;
1579
1580         if (ipv4_is_zeronet(saddr)) {
1581                 if (!ipv4_is_local_multicast(daddr))
1582                         goto e_inval;
1583         } else {
1584                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1585                                           in_dev, &itag);
1586                 if (err < 0)
1587                         goto e_err;
1588         }
1589         if (our)
1590                 flags |= RTCF_LOCAL;
1591
1592         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1593                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1594         if (!rth)
1595                 goto e_nobufs;
1596
1597 #ifdef CONFIG_IP_ROUTE_CLASSID
1598         rth->dst.tclassid = itag;
1599 #endif
1600         rth->dst.output = ip_rt_bug;
1601         rth->rt_is_input= 1;
1602
1603 #ifdef CONFIG_IP_MROUTE
1604         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1605                 rth->dst.input = ip_mr_input;
1606 #endif
1607         RT_CACHE_STAT_INC(in_slow_mc);
1608
1609         skb_dst_set(skb, &rth->dst);
1610         return 0;
1611
1612 e_nobufs:
1613         return -ENOBUFS;
1614 e_inval:
1615         return -EINVAL;
1616 e_err:
1617         return err;
1618 }
1619
1620
1621 static void ip_handle_martian_source(struct net_device *dev,
1622                                      struct in_device *in_dev,
1623                                      struct sk_buff *skb,
1624                                      __be32 daddr,
1625                                      __be32 saddr)
1626 {
1627         RT_CACHE_STAT_INC(in_martian_src);
1628 #ifdef CONFIG_IP_ROUTE_VERBOSE
1629         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1630                 /*
1631                  *      RFC1812 recommendation, if source is martian,
1632                  *      the only hint is MAC header.
1633                  */
1634                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1635                         &daddr, &saddr, dev->name);
1636                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1637                         print_hex_dump(KERN_WARNING, "ll header: ",
1638                                        DUMP_PREFIX_OFFSET, 16, 1,
1639                                        skb_mac_header(skb),
1640                                        dev->hard_header_len, true);
1641                 }
1642         }
1643 #endif
1644 }
1645
1646 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1647 {
1648         struct fnhe_hash_bucket *hash;
1649         struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1650         u32 hval = fnhe_hashfun(daddr);
1651
1652         spin_lock_bh(&fnhe_lock);
1653
1654         hash = rcu_dereference_protected(nh->nh_exceptions,
1655                                          lockdep_is_held(&fnhe_lock));
1656         hash += hval;
1657
1658         fnhe_p = &hash->chain;
1659         fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1660         while (fnhe) {
1661                 if (fnhe->fnhe_daddr == daddr) {
1662                         rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1663                                 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1664                         /* set fnhe_daddr to 0 to ensure it won't bind with
1665                          * new dsts in rt_bind_exception().
1666                          */
1667                         fnhe->fnhe_daddr = 0;
1668                         fnhe_flush_routes(fnhe);
1669                         kfree_rcu(fnhe, rcu);
1670                         break;
1671                 }
1672                 fnhe_p = &fnhe->fnhe_next;
1673                 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1674                                                  lockdep_is_held(&fnhe_lock));
1675         }
1676
1677         spin_unlock_bh(&fnhe_lock);
1678 }
1679
1680 /* called in rcu_read_lock() section */
1681 static int __mkroute_input(struct sk_buff *skb,
1682                            const struct fib_result *res,
1683                            struct in_device *in_dev,
1684                            __be32 daddr, __be32 saddr, u32 tos)
1685 {
1686         struct fib_nh_exception *fnhe;
1687         struct rtable *rth;
1688         int err;
1689         struct in_device *out_dev;
1690         bool do_cache;
1691         u32 itag = 0;
1692
1693         /* get a working reference to the output device */
1694         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1695         if (!out_dev) {
1696                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1697                 return -EINVAL;
1698         }
1699
1700         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1701                                   in_dev->dev, in_dev, &itag);
1702         if (err < 0) {
1703                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1704                                          saddr);
1705
1706                 goto cleanup;
1707         }
1708
1709         do_cache = res->fi && !itag;
1710         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1711             skb->protocol == htons(ETH_P_IP) &&
1712             (IN_DEV_SHARED_MEDIA(out_dev) ||
1713              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1714                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1715
1716         if (skb->protocol != htons(ETH_P_IP)) {
1717                 /* Not IP (i.e. ARP). Do not create route, if it is
1718                  * invalid for proxy arp. DNAT routes are always valid.
1719                  *
1720                  * Proxy arp feature have been extended to allow, ARP
1721                  * replies back to the same interface, to support
1722                  * Private VLAN switch technologies. See arp.c.
1723                  */
1724                 if (out_dev == in_dev &&
1725                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1726                         err = -EINVAL;
1727                         goto cleanup;
1728                 }
1729         }
1730
1731         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1732         if (do_cache) {
1733                 if (fnhe) {
1734                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1735                         if (rth && rth->dst.expires &&
1736                             time_after(jiffies, rth->dst.expires)) {
1737                                 ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1738                                 fnhe = NULL;
1739                         } else {
1740                                 goto rt_cache;
1741                         }
1742                 }
1743
1744                 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1745
1746 rt_cache:
1747                 if (rt_cache_valid(rth)) {
1748                         skb_dst_set_noref(skb, &rth->dst);
1749                         goto out;
1750                 }
1751         }
1752
1753         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1754                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1755                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1756         if (!rth) {
1757                 err = -ENOBUFS;
1758                 goto cleanup;
1759         }
1760
1761         rth->rt_is_input = 1;
1762         if (res->table)
1763                 rth->rt_table_id = res->table->tb_id;
1764         RT_CACHE_STAT_INC(in_slow_tot);
1765
1766         rth->dst.input = ip_forward;
1767
1768         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1769         if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1770                 rth->dst.lwtstate->orig_output = rth->dst.output;
1771                 rth->dst.output = lwtunnel_output;
1772         }
1773         if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1774                 rth->dst.lwtstate->orig_input = rth->dst.input;
1775                 rth->dst.input = lwtunnel_input;
1776         }
1777         skb_dst_set(skb, &rth->dst);
1778 out:
1779         err = 0;
1780  cleanup:
1781         return err;
1782 }
1783
1784 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1785
1786 /* To make ICMP packets follow the right flow, the multipath hash is
1787  * calculated from the inner IP addresses in reverse order.
1788  */
1789 static int ip_multipath_icmp_hash(struct sk_buff *skb)
1790 {
1791         const struct iphdr *outer_iph = ip_hdr(skb);
1792         struct icmphdr _icmph;
1793         const struct icmphdr *icmph;
1794         struct iphdr _inner_iph;
1795         const struct iphdr *inner_iph;
1796
1797         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1798                 goto standard_hash;
1799
1800         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1801                                    &_icmph);
1802         if (!icmph)
1803                 goto standard_hash;
1804
1805         if (icmph->type != ICMP_DEST_UNREACH &&
1806             icmph->type != ICMP_REDIRECT &&
1807             icmph->type != ICMP_TIME_EXCEEDED &&
1808             icmph->type != ICMP_PARAMETERPROB) {
1809                 goto standard_hash;
1810         }
1811
1812         inner_iph = skb_header_pointer(skb,
1813                                        outer_iph->ihl * 4 + sizeof(_icmph),
1814                                        sizeof(_inner_iph), &_inner_iph);
1815         if (!inner_iph)
1816                 goto standard_hash;
1817
1818         return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
1819
1820 standard_hash:
1821         return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
1822 }
1823
1824 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1825
1826 static int ip_mkroute_input(struct sk_buff *skb,
1827                             struct fib_result *res,
1828                             const struct flowi4 *fl4,
1829                             struct in_device *in_dev,
1830                             __be32 daddr, __be32 saddr, u32 tos)
1831 {
1832 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1833         if (res->fi && res->fi->fib_nhs > 1) {
1834                 int h;
1835
1836                 if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
1837                         h = ip_multipath_icmp_hash(skb);
1838                 else
1839                         h = fib_multipath_hash(saddr, daddr);
1840                 fib_select_multipath(res, h);
1841         }
1842 #endif
1843
1844         /* create a routing cache entry */
1845         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1846 }
1847
1848 /*
1849  *      NOTE. We drop all the packets that has local source
1850  *      addresses, because every properly looped back packet
1851  *      must have correct destination already attached by output routine.
1852  *
1853  *      Such approach solves two big problems:
1854  *      1. Not simplex devices are handled properly.
1855  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1856  *      called with rcu_read_lock()
1857  */
1858
1859 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1860                                u8 tos, struct net_device *dev)
1861 {
1862         struct fib_result res;
1863         struct in_device *in_dev = __in_dev_get_rcu(dev);
1864         struct ip_tunnel_info *tun_info;
1865         struct flowi4   fl4;
1866         unsigned int    flags = 0;
1867         u32             itag = 0;
1868         struct rtable   *rth;
1869         int             err = -EINVAL;
1870         struct net    *net = dev_net(dev);
1871         bool do_cache;
1872
1873         /* IP on this device is disabled. */
1874
1875         if (!in_dev)
1876                 goto out;
1877
1878         /* Check for the most weird martians, which can be not detected
1879            by fib_lookup.
1880          */
1881
1882         tun_info = skb_tunnel_info(skb);
1883         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1884                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1885         else
1886                 fl4.flowi4_tun_key.tun_id = 0;
1887         skb_dst_drop(skb);
1888
1889         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1890                 goto martian_source;
1891
1892         res.fi = NULL;
1893         res.table = NULL;
1894         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1895                 goto brd_input;
1896
1897         /* Accept zero addresses only to limited broadcast;
1898          * I even do not know to fix it or not. Waiting for complains :-)
1899          */
1900         if (ipv4_is_zeronet(saddr))
1901                 goto martian_source;
1902
1903         if (ipv4_is_zeronet(daddr))
1904                 goto martian_destination;
1905
1906         /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1907          * and call it once if daddr or/and saddr are loopback addresses
1908          */
1909         if (ipv4_is_loopback(daddr)) {
1910                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1911                         goto martian_destination;
1912         } else if (ipv4_is_loopback(saddr)) {
1913                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1914                         goto martian_source;
1915         }
1916
1917         /*
1918          *      Now we are ready to route packet.
1919          */
1920         fl4.flowi4_oif = 0;
1921         fl4.flowi4_iif = dev->ifindex;
1922         fl4.flowi4_mark = skb->mark;
1923         fl4.flowi4_tos = tos;
1924         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1925         fl4.flowi4_flags = 0;
1926         fl4.daddr = daddr;
1927         fl4.saddr = saddr;
1928         err = fib_lookup(net, &fl4, &res, 0);
1929         if (err != 0) {
1930                 if (!IN_DEV_FORWARD(in_dev))
1931                         err = -EHOSTUNREACH;
1932                 goto no_route;
1933         }
1934
1935         if (res.type == RTN_BROADCAST)
1936                 goto brd_input;
1937
1938         if (res.type == RTN_LOCAL) {
1939                 err = fib_validate_source(skb, saddr, daddr, tos,
1940                                           0, dev, in_dev, &itag);
1941                 if (err < 0)
1942                         goto martian_source;
1943                 goto local_input;
1944         }
1945
1946         if (!IN_DEV_FORWARD(in_dev)) {
1947                 err = -EHOSTUNREACH;
1948                 goto no_route;
1949         }
1950         if (res.type != RTN_UNICAST)
1951                 goto martian_destination;
1952
1953         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1954 out:    return err;
1955
1956 brd_input:
1957         if (skb->protocol != htons(ETH_P_IP))
1958                 goto e_inval;
1959
1960         if (!ipv4_is_zeronet(saddr)) {
1961                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1962                                           in_dev, &itag);
1963                 if (err < 0)
1964                         goto martian_source;
1965         }
1966         flags |= RTCF_BROADCAST;
1967         res.type = RTN_BROADCAST;
1968         RT_CACHE_STAT_INC(in_brd);
1969
1970 local_input:
1971         do_cache = false;
1972         if (res.fi) {
1973                 if (!itag) {
1974                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1975                         if (rt_cache_valid(rth)) {
1976                                 skb_dst_set_noref(skb, &rth->dst);
1977                                 err = 0;
1978                                 goto out;
1979                         }
1980                         do_cache = true;
1981                 }
1982         }
1983
1984         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
1985                            flags | RTCF_LOCAL, res.type,
1986                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1987         if (!rth)
1988                 goto e_nobufs;
1989
1990         rth->dst.output= ip_rt_bug;
1991 #ifdef CONFIG_IP_ROUTE_CLASSID
1992         rth->dst.tclassid = itag;
1993 #endif
1994         rth->rt_is_input = 1;
1995         if (res.table)
1996                 rth->rt_table_id = res.table->tb_id;
1997
1998         RT_CACHE_STAT_INC(in_slow_tot);
1999         if (res.type == RTN_UNREACHABLE) {
2000                 rth->dst.input= ip_error;
2001                 rth->dst.error= -err;
2002                 rth->rt_flags   &= ~RTCF_LOCAL;
2003         }
2004         if (do_cache) {
2005                 if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
2006                         rth->dst.flags |= DST_NOCACHE;
2007                         rt_add_uncached_list(rth);
2008                 }
2009         }
2010         skb_dst_set(skb, &rth->dst);
2011         err = 0;
2012         goto out;
2013
2014 no_route:
2015         RT_CACHE_STAT_INC(in_no_route);
2016         res.type = RTN_UNREACHABLE;
2017         res.fi = NULL;
2018         res.table = NULL;
2019         goto local_input;
2020
2021         /*
2022          *      Do not cache martian addresses: they should be logged (RFC1812)
2023          */
2024 martian_destination:
2025         RT_CACHE_STAT_INC(in_martian_dst);
2026 #ifdef CONFIG_IP_ROUTE_VERBOSE
2027         if (IN_DEV_LOG_MARTIANS(in_dev))
2028                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2029                                      &daddr, &saddr, dev->name);
2030 #endif
2031
2032 e_inval:
2033         err = -EINVAL;
2034         goto out;
2035
2036 e_nobufs:
2037         err = -ENOBUFS;
2038         goto out;
2039
2040 martian_source:
2041         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2042         goto out;
2043 }
2044
2045 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2046                          u8 tos, struct net_device *dev)
2047 {
2048         int res;
2049
2050         tos &= IPTOS_RT_MASK;
2051         rcu_read_lock();
2052
2053         /* Multicast recognition logic is moved from route cache to here.
2054            The problem was that too many Ethernet cards have broken/missing
2055            hardware multicast filters :-( As result the host on multicasting
2056            network acquires a lot of useless route cache entries, sort of
2057            SDR messages from all the world. Now we try to get rid of them.
2058            Really, provided software IP multicast filter is organized
2059            reasonably (at least, hashed), it does not result in a slowdown
2060            comparing with route cache reject entries.
2061            Note, that multicast routers are not affected, because
2062            route cache entry is created eventually.
2063          */
2064         if (ipv4_is_multicast(daddr)) {
2065                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2066
2067                 if (in_dev) {
2068                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2069                                                   ip_hdr(skb)->protocol);
2070                         if (our
2071 #ifdef CONFIG_IP_MROUTE
2072                                 ||
2073                             (!ipv4_is_local_multicast(daddr) &&
2074                              IN_DEV_MFORWARD(in_dev))
2075 #endif
2076                            ) {
2077                                 int res = ip_route_input_mc(skb, daddr, saddr,
2078                                                             tos, dev, our);
2079                                 rcu_read_unlock();
2080                                 return res;
2081                         }
2082                 }
2083                 rcu_read_unlock();
2084                 return -EINVAL;
2085         }
2086         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2087         rcu_read_unlock();
2088         return res;
2089 }
2090 EXPORT_SYMBOL(ip_route_input_noref);
2091
2092 /* called with rcu_read_lock() */
2093 static struct rtable *__mkroute_output(const struct fib_result *res,
2094                                        const struct flowi4 *fl4, int orig_oif,
2095                                        struct net_device *dev_out,
2096                                        unsigned int flags)
2097 {
2098         struct fib_info *fi = res->fi;
2099         struct fib_nh_exception *fnhe;
2100         struct in_device *in_dev;
2101         u16 type = res->type;
2102         struct rtable *rth;
2103         bool do_cache;
2104
2105         in_dev = __in_dev_get_rcu(dev_out);
2106         if (!in_dev)
2107                 return ERR_PTR(-EINVAL);
2108
2109         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2110                 if (ipv4_is_loopback(fl4->saddr) &&
2111                     !(dev_out->flags & IFF_LOOPBACK) &&
2112                     !netif_is_l3_master(dev_out))
2113                         return ERR_PTR(-EINVAL);
2114
2115         if (ipv4_is_lbcast(fl4->daddr))
2116                 type = RTN_BROADCAST;
2117         else if (ipv4_is_multicast(fl4->daddr))
2118                 type = RTN_MULTICAST;
2119         else if (ipv4_is_zeronet(fl4->daddr))
2120                 return ERR_PTR(-EINVAL);
2121
2122         if (dev_out->flags & IFF_LOOPBACK)
2123                 flags |= RTCF_LOCAL;
2124
2125         do_cache = true;
2126         if (type == RTN_BROADCAST) {
2127                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2128                 fi = NULL;
2129         } else if (type == RTN_MULTICAST) {
2130                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2131                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2132                                      fl4->flowi4_proto))
2133                         flags &= ~RTCF_LOCAL;
2134                 else
2135                         do_cache = false;
2136                 /* If multicast route do not exist use
2137                  * default one, but do not gateway in this case.
2138                  * Yes, it is hack.
2139                  */
2140                 if (fi && res->prefixlen < 4)
2141                         fi = NULL;
2142         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2143                    (orig_oif != dev_out->ifindex)) {
2144                 /* For local routes that require a particular output interface
2145                  * we do not want to cache the result.  Caching the result
2146                  * causes incorrect behaviour when there are multiple source
2147                  * addresses on the interface, the end result being that if the
2148                  * intended recipient is waiting on that interface for the
2149                  * packet he won't receive it because it will be delivered on
2150                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2151                  * be set to the loopback interface as well.
2152                  */
2153                 fi = NULL;
2154         }
2155
2156         fnhe = NULL;
2157         do_cache &= fi != NULL;
2158         if (do_cache) {
2159                 struct rtable __rcu **prth;
2160                 struct fib_nh *nh = &FIB_RES_NH(*res);
2161
2162                 fnhe = find_exception(nh, fl4->daddr);
2163                 if (fnhe) {
2164                         prth = &fnhe->fnhe_rth_output;
2165                         rth = rcu_dereference(*prth);
2166                         if (rth && rth->dst.expires &&
2167                             time_after(jiffies, rth->dst.expires)) {
2168                                 ip_del_fnhe(nh, fl4->daddr);
2169                                 fnhe = NULL;
2170                         } else {
2171                                 goto rt_cache;
2172                         }
2173                 }
2174
2175                 if (unlikely(fl4->flowi4_flags &
2176                              FLOWI_FLAG_KNOWN_NH &&
2177                              !(nh->nh_gw &&
2178                                nh->nh_scope == RT_SCOPE_LINK))) {
2179                         do_cache = false;
2180                         goto add;
2181                 }
2182                 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2183                 rth = rcu_dereference(*prth);
2184
2185 rt_cache:
2186                 if (rt_cache_valid(rth)) {
2187                         dst_hold(&rth->dst);
2188                         return rth;
2189                 }
2190         }
2191
2192 add:
2193         rth = rt_dst_alloc(dev_out, flags, type,
2194                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2195                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2196                            do_cache);
2197         if (!rth)
2198                 return ERR_PTR(-ENOBUFS);
2199
2200         rth->rt_iif     = orig_oif ? : 0;
2201         if (res->table)
2202                 rth->rt_table_id = res->table->tb_id;
2203
2204         RT_CACHE_STAT_INC(out_slow_tot);
2205
2206         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2207                 if (flags & RTCF_LOCAL &&
2208                     !(dev_out->flags & IFF_LOOPBACK)) {
2209                         rth->dst.output = ip_mc_output;
2210                         RT_CACHE_STAT_INC(out_slow_mc);
2211                 }
2212 #ifdef CONFIG_IP_MROUTE
2213                 if (type == RTN_MULTICAST) {
2214                         if (IN_DEV_MFORWARD(in_dev) &&
2215                             !ipv4_is_local_multicast(fl4->daddr)) {
2216                                 rth->dst.input = ip_mr_input;
2217                                 rth->dst.output = ip_mc_output;
2218                         }
2219                 }
2220 #endif
2221         }
2222
2223         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
2224         if (lwtunnel_output_redirect(rth->dst.lwtstate))
2225                 rth->dst.output = lwtunnel_output;
2226
2227         return rth;
2228 }
2229
2230 /*
2231  * Major route resolver routine.
2232  */
2233
2234 struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2235                                           int mp_hash)
2236 {
2237         struct net_device *dev_out = NULL;
2238         __u8 tos = RT_FL_TOS(fl4);
2239         unsigned int flags = 0;
2240         struct fib_result res;
2241         struct rtable *rth;
2242         int orig_oif;
2243         int err;
2244
2245         res.tclassid    = 0;
2246         res.fi          = NULL;
2247         res.table       = NULL;
2248
2249         orig_oif = fl4->flowi4_oif;
2250
2251         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2252         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2253         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2254                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2255
2256         rcu_read_lock();
2257         if (fl4->saddr) {
2258                 if (ipv4_is_multicast(fl4->saddr) ||
2259                     ipv4_is_lbcast(fl4->saddr) ||
2260                     ipv4_is_zeronet(fl4->saddr)) {
2261                         rth = ERR_PTR(-EINVAL);
2262                         goto out;
2263                 }
2264
2265                 rth = ERR_PTR(-ENETUNREACH);
2266
2267                 /* I removed check for oif == dev_out->oif here.
2268                    It was wrong for two reasons:
2269                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2270                       is assigned to multiple interfaces.
2271                    2. Moreover, we are allowed to send packets with saddr
2272                       of another iface. --ANK
2273                  */
2274
2275                 if (fl4->flowi4_oif == 0 &&
2276                     (ipv4_is_multicast(fl4->daddr) ||
2277                      ipv4_is_lbcast(fl4->daddr))) {
2278                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2279                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2280                         if (!dev_out)
2281                                 goto out;
2282
2283                         /* Special hack: user can direct multicasts
2284                            and limited broadcast via necessary interface
2285                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2286                            This hack is not just for fun, it allows
2287                            vic,vat and friends to work.
2288                            They bind socket to loopback, set ttl to zero
2289                            and expect that it will work.
2290                            From the viewpoint of routing cache they are broken,
2291                            because we are not allowed to build multicast path
2292                            with loopback source addr (look, routing cache
2293                            cannot know, that ttl is zero, so that packet
2294                            will not leave this host and route is valid).
2295                            Luckily, this hack is good workaround.
2296                          */
2297
2298                         fl4->flowi4_oif = dev_out->ifindex;
2299                         goto make_route;
2300                 }
2301
2302                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2303                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2304                         if (!__ip_dev_find(net, fl4->saddr, false))
2305                                 goto out;
2306                 }
2307         }
2308
2309
2310         if (fl4->flowi4_oif) {
2311                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2312                 rth = ERR_PTR(-ENODEV);
2313                 if (!dev_out)
2314                         goto out;
2315
2316                 /* RACE: Check return value of inet_select_addr instead. */
2317                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2318                         rth = ERR_PTR(-ENETUNREACH);
2319                         goto out;
2320                 }
2321                 if (ipv4_is_local_multicast(fl4->daddr) ||
2322                     ipv4_is_lbcast(fl4->daddr) ||
2323                     fl4->flowi4_proto == IPPROTO_IGMP) {
2324                         if (!fl4->saddr)
2325                                 fl4->saddr = inet_select_addr(dev_out, 0,
2326                                                               RT_SCOPE_LINK);
2327                         goto make_route;
2328                 }
2329                 if (!fl4->saddr) {
2330                         if (ipv4_is_multicast(fl4->daddr))
2331                                 fl4->saddr = inet_select_addr(dev_out, 0,
2332                                                               fl4->flowi4_scope);
2333                         else if (!fl4->daddr)
2334                                 fl4->saddr = inet_select_addr(dev_out, 0,
2335                                                               RT_SCOPE_HOST);
2336                 }
2337         }
2338
2339         if (!fl4->daddr) {
2340                 fl4->daddr = fl4->saddr;
2341                 if (!fl4->daddr)
2342                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2343                 dev_out = net->loopback_dev;
2344                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2345                 res.type = RTN_LOCAL;
2346                 flags |= RTCF_LOCAL;
2347                 goto make_route;
2348         }
2349
2350         err = fib_lookup(net, fl4, &res, 0);
2351         if (err) {
2352                 res.fi = NULL;
2353                 res.table = NULL;
2354                 if (fl4->flowi4_oif &&
2355                     !netif_index_is_l3_master(net, fl4->flowi4_oif)) {
2356                         /* Apparently, routing tables are wrong. Assume,
2357                            that the destination is on link.
2358
2359                            WHY? DW.
2360                            Because we are allowed to send to iface
2361                            even if it has NO routes and NO assigned
2362                            addresses. When oif is specified, routing
2363                            tables are looked up with only one purpose:
2364                            to catch if destination is gatewayed, rather than
2365                            direct. Moreover, if MSG_DONTROUTE is set,
2366                            we send packet, ignoring both routing tables
2367                            and ifaddr state. --ANK
2368
2369
2370                            We could make it even if oif is unknown,
2371                            likely IPv6, but we do not.
2372                          */
2373
2374                         if (fl4->saddr == 0)
2375                                 fl4->saddr = inet_select_addr(dev_out, 0,
2376                                                               RT_SCOPE_LINK);
2377                         res.type = RTN_UNICAST;
2378                         goto make_route;
2379                 }
2380                 rth = ERR_PTR(err);
2381                 goto out;
2382         }
2383
2384         if (res.type == RTN_LOCAL) {
2385                 if (!fl4->saddr) {
2386                         if (res.fi->fib_prefsrc)
2387                                 fl4->saddr = res.fi->fib_prefsrc;
2388                         else
2389                                 fl4->saddr = fl4->daddr;
2390                 }
2391
2392                 /* L3 master device is the loopback for that domain */
2393                 dev_out = l3mdev_master_dev_rcu(dev_out) ? : net->loopback_dev;
2394                 fl4->flowi4_oif = dev_out->ifindex;
2395                 flags |= RTCF_LOCAL;
2396                 goto make_route;
2397         }
2398
2399         fib_select_path(net, &res, fl4, mp_hash);
2400
2401         dev_out = FIB_RES_DEV(res);
2402         fl4->flowi4_oif = dev_out->ifindex;
2403
2404
2405 make_route:
2406         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2407
2408 out:
2409         rcu_read_unlock();
2410         return rth;
2411 }
2412 EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
2413
2414 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2415 {
2416         return NULL;
2417 }
2418
2419 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2420 {
2421         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2422
2423         return mtu ? : dst->dev->mtu;
2424 }
2425
2426 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2427                                           struct sk_buff *skb, u32 mtu)
2428 {
2429 }
2430
2431 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2432                                        struct sk_buff *skb)
2433 {
2434 }
2435
2436 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2437                                           unsigned long old)
2438 {
2439         return NULL;
2440 }
2441
2442 static struct dst_ops ipv4_dst_blackhole_ops = {
2443         .family                 =       AF_INET,
2444         .check                  =       ipv4_blackhole_dst_check,
2445         .mtu                    =       ipv4_blackhole_mtu,
2446         .default_advmss         =       ipv4_default_advmss,
2447         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2448         .redirect               =       ipv4_rt_blackhole_redirect,
2449         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2450         .neigh_lookup           =       ipv4_neigh_lookup,
2451 };
2452
2453 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2454 {
2455         struct rtable *ort = (struct rtable *) dst_orig;
2456         struct rtable *rt;
2457
2458         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2459         if (rt) {
2460                 struct dst_entry *new = &rt->dst;
2461
2462                 new->__use = 1;
2463                 new->input = dst_discard;
2464                 new->output = dst_discard_out;
2465
2466                 new->dev = ort->dst.dev;
2467                 if (new->dev)
2468                         dev_hold(new->dev);
2469
2470                 rt->rt_is_input = ort->rt_is_input;
2471                 rt->rt_iif = ort->rt_iif;
2472                 rt->rt_pmtu = ort->rt_pmtu;
2473                 rt->rt_mtu_locked = ort->rt_mtu_locked;
2474
2475                 rt->rt_genid = rt_genid_ipv4(net);
2476                 rt->rt_flags = ort->rt_flags;
2477                 rt->rt_type = ort->rt_type;
2478                 rt->rt_gateway = ort->rt_gateway;
2479                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2480
2481                 INIT_LIST_HEAD(&rt->rt_uncached);
2482                 dst_free(new);
2483         }
2484
2485         dst_release(dst_orig);
2486
2487         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2488 }
2489
2490 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2491                                     const struct sock *sk)
2492 {
2493         struct rtable *rt = __ip_route_output_key(net, flp4);
2494
2495         if (IS_ERR(rt))
2496                 return rt;
2497
2498         if (flp4->flowi4_proto)
2499                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2500                                                         flowi4_to_flowi(flp4),
2501                                                         sk, 0);
2502
2503         return rt;
2504 }
2505 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2506
2507 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2508                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2509                         u32 seq, int event, int nowait, unsigned int flags)
2510 {
2511         struct rtable *rt = skb_rtable(skb);
2512         struct rtmsg *r;
2513         struct nlmsghdr *nlh;
2514         unsigned long expires = 0;
2515         u32 error;
2516         u32 metrics[RTAX_MAX];
2517
2518         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2519         if (!nlh)
2520                 return -EMSGSIZE;
2521
2522         r = nlmsg_data(nlh);
2523         r->rtm_family    = AF_INET;
2524         r->rtm_dst_len  = 32;
2525         r->rtm_src_len  = 0;
2526         r->rtm_tos      = fl4->flowi4_tos;
2527         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2528         if (nla_put_u32(skb, RTA_TABLE, table_id))
2529                 goto nla_put_failure;
2530         r->rtm_type     = rt->rt_type;
2531         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2532         r->rtm_protocol = RTPROT_UNSPEC;
2533         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2534         if (rt->rt_flags & RTCF_NOTIFY)
2535                 r->rtm_flags |= RTM_F_NOTIFY;
2536         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2537                 r->rtm_flags |= RTCF_DOREDIRECT;
2538
2539         if (nla_put_in_addr(skb, RTA_DST, dst))
2540                 goto nla_put_failure;
2541         if (src) {
2542                 r->rtm_src_len = 32;
2543                 if (nla_put_in_addr(skb, RTA_SRC, src))
2544                         goto nla_put_failure;
2545         }
2546         if (rt->dst.dev &&
2547             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2548                 goto nla_put_failure;
2549 #ifdef CONFIG_IP_ROUTE_CLASSID
2550         if (rt->dst.tclassid &&
2551             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2552                 goto nla_put_failure;
2553 #endif
2554         if (!rt_is_input_route(rt) &&
2555             fl4->saddr != src) {
2556                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2557                         goto nla_put_failure;
2558         }
2559         if (rt->rt_uses_gateway &&
2560             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2561                 goto nla_put_failure;
2562
2563         expires = rt->dst.expires;
2564         if (expires) {
2565                 unsigned long now = jiffies;
2566
2567                 if (time_before(now, expires))
2568                         expires -= now;
2569                 else
2570                         expires = 0;
2571         }
2572
2573         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2574         if (rt->rt_pmtu && expires)
2575                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2576         if (rt->rt_mtu_locked && expires)
2577                 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2578         if (rtnetlink_put_metrics(skb, metrics) < 0)
2579                 goto nla_put_failure;
2580
2581         if (fl4->flowi4_mark &&
2582             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2583                 goto nla_put_failure;
2584
2585         error = rt->dst.error;
2586
2587         if (rt_is_input_route(rt)) {
2588 #ifdef CONFIG_IP_MROUTE
2589                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2590                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2591                         int err = ipmr_get_route(net, skb,
2592                                                  fl4->saddr, fl4->daddr,
2593                                                  r, nowait, portid);
2594
2595                         if (err <= 0) {
2596                                 if (!nowait) {
2597                                         if (err == 0)
2598                                                 return 0;
2599                                         goto nla_put_failure;
2600                                 } else {
2601                                         if (err == -EMSGSIZE)
2602                                                 goto nla_put_failure;
2603                                         error = err;
2604                                 }
2605                         }
2606                 } else
2607 #endif
2608                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2609                                 goto nla_put_failure;
2610         }
2611
2612         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2613                 goto nla_put_failure;
2614
2615         nlmsg_end(skb, nlh);
2616         return 0;
2617
2618 nla_put_failure:
2619         nlmsg_cancel(skb, nlh);
2620         return -EMSGSIZE;
2621 }
2622
2623 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2624 {
2625         struct net *net = sock_net(in_skb->sk);
2626         struct rtmsg *rtm;
2627         struct nlattr *tb[RTA_MAX+1];
2628         struct rtable *rt = NULL;
2629         struct flowi4 fl4;
2630         __be32 dst = 0;
2631         __be32 src = 0;
2632         u32 iif;
2633         int err;
2634         int mark;
2635         struct sk_buff *skb;
2636         u32 table_id = RT_TABLE_MAIN;
2637
2638         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2639         if (err < 0)
2640                 goto errout;
2641
2642         rtm = nlmsg_data(nlh);
2643
2644         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2645         if (!skb) {
2646                 err = -ENOBUFS;
2647                 goto errout;
2648         }
2649
2650         /* Reserve room for dummy headers, this skb can pass
2651            through good chunk of routing engine.
2652          */
2653         skb_reset_mac_header(skb);
2654         skb_reset_network_header(skb);
2655
2656         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2657         ip_hdr(skb)->protocol = IPPROTO_UDP;
2658         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2659
2660         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2661         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2662         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2663         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2664
2665         memset(&fl4, 0, sizeof(fl4));
2666         fl4.daddr = dst;
2667         fl4.saddr = src;
2668         fl4.flowi4_tos = rtm->rtm_tos;
2669         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2670         fl4.flowi4_mark = mark;
2671
2672         if (iif) {
2673                 struct net_device *dev;
2674
2675                 dev = __dev_get_by_index(net, iif);
2676                 if (!dev) {
2677                         err = -ENODEV;
2678                         goto errout_free;
2679                 }
2680
2681                 skb->protocol   = htons(ETH_P_IP);
2682                 skb->dev        = dev;
2683                 skb->mark       = mark;
2684                 local_bh_disable();
2685                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2686                 local_bh_enable();
2687
2688                 rt = skb_rtable(skb);
2689                 if (err == 0 && rt->dst.error)
2690                         err = -rt->dst.error;
2691         } else {
2692                 rt = ip_route_output_key(net, &fl4);
2693
2694                 err = 0;
2695                 if (IS_ERR(rt))
2696                         err = PTR_ERR(rt);
2697         }
2698
2699         if (err)
2700                 goto errout_free;
2701
2702         skb_dst_set(skb, &rt->dst);
2703         if (rtm->rtm_flags & RTM_F_NOTIFY)
2704                 rt->rt_flags |= RTCF_NOTIFY;
2705
2706         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2707                 table_id = rt->rt_table_id;
2708
2709         err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2710                            NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2711                            RTM_NEWROUTE, 0, 0);
2712         if (err < 0)
2713                 goto errout_free;
2714
2715         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2716 errout:
2717         return err;
2718
2719 errout_free:
2720         kfree_skb(skb);
2721         goto errout;
2722 }
2723
2724 void ip_rt_multicast_event(struct in_device *in_dev)
2725 {
2726         rt_cache_flush(dev_net(in_dev->dev));
2727 }
2728
2729 #ifdef CONFIG_SYSCTL
2730 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2731 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2732 static int ip_rt_gc_elasticity __read_mostly    = 8;
2733 static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;
2734
2735 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2736                                         void __user *buffer,
2737                                         size_t *lenp, loff_t *ppos)
2738 {
2739         struct net *net = (struct net *)__ctl->extra1;
2740
2741         if (write) {
2742                 rt_cache_flush(net);
2743                 fnhe_genid_bump(net);
2744                 return 0;
2745         }
2746
2747         return -EINVAL;
2748 }
2749
2750 static struct ctl_table ipv4_route_table[] = {
2751         {
2752                 .procname       = "gc_thresh",
2753                 .data           = &ipv4_dst_ops.gc_thresh,
2754                 .maxlen         = sizeof(int),
2755                 .mode           = 0644,
2756                 .proc_handler   = proc_dointvec,
2757         },
2758         {
2759                 .procname       = "max_size",
2760                 .data           = &ip_rt_max_size,
2761                 .maxlen         = sizeof(int),
2762                 .mode           = 0644,
2763                 .proc_handler   = proc_dointvec,
2764         },
2765         {
2766                 /*  Deprecated. Use gc_min_interval_ms */
2767
2768                 .procname       = "gc_min_interval",
2769                 .data           = &ip_rt_gc_min_interval,
2770                 .maxlen         = sizeof(int),
2771                 .mode           = 0644,
2772                 .proc_handler   = proc_dointvec_jiffies,
2773         },
2774         {
2775                 .procname       = "gc_min_interval_ms",
2776                 .data           = &ip_rt_gc_min_interval,
2777                 .maxlen         = sizeof(int),
2778                 .mode           = 0644,
2779                 .proc_handler   = proc_dointvec_ms_jiffies,
2780         },
2781         {
2782                 .procname       = "gc_timeout",
2783                 .data           = &ip_rt_gc_timeout,
2784                 .maxlen         = sizeof(int),
2785                 .mode           = 0644,
2786                 .proc_handler   = proc_dointvec_jiffies,
2787         },
2788         {
2789                 .procname       = "gc_interval",
2790                 .data           = &ip_rt_gc_interval,
2791                 .maxlen         = sizeof(int),
2792                 .mode           = 0644,
2793                 .proc_handler   = proc_dointvec_jiffies,
2794         },
2795         {
2796                 .procname       = "redirect_load",
2797                 .data           = &ip_rt_redirect_load,
2798                 .maxlen         = sizeof(int),
2799                 .mode           = 0644,
2800                 .proc_handler   = proc_dointvec,
2801         },
2802         {
2803                 .procname       = "redirect_number",
2804                 .data           = &ip_rt_redirect_number,
2805                 .maxlen         = sizeof(int),
2806                 .mode           = 0644,
2807                 .proc_handler   = proc_dointvec,
2808         },
2809         {
2810                 .procname       = "redirect_silence",
2811                 .data           = &ip_rt_redirect_silence,
2812                 .maxlen         = sizeof(int),
2813                 .mode           = 0644,
2814                 .proc_handler   = proc_dointvec,
2815         },
2816         {
2817                 .procname       = "error_cost",
2818                 .data           = &ip_rt_error_cost,
2819                 .maxlen         = sizeof(int),
2820                 .mode           = 0644,
2821                 .proc_handler   = proc_dointvec,
2822         },
2823         {
2824                 .procname       = "error_burst",
2825                 .data           = &ip_rt_error_burst,
2826                 .maxlen         = sizeof(int),
2827                 .mode           = 0644,
2828                 .proc_handler   = proc_dointvec,
2829         },
2830         {
2831                 .procname       = "gc_elasticity",
2832                 .data           = &ip_rt_gc_elasticity,
2833                 .maxlen         = sizeof(int),
2834                 .mode           = 0644,
2835                 .proc_handler   = proc_dointvec,
2836         },
2837         {
2838                 .procname       = "mtu_expires",
2839                 .data           = &ip_rt_mtu_expires,
2840                 .maxlen         = sizeof(int),
2841                 .mode           = 0644,
2842                 .proc_handler   = proc_dointvec_jiffies,
2843         },
2844         {
2845                 .procname       = "min_pmtu",
2846                 .data           = &ip_rt_min_pmtu,
2847                 .maxlen         = sizeof(int),
2848                 .mode           = 0644,
2849                 .proc_handler   = proc_dointvec_minmax,
2850                 .extra1         = &ip_min_valid_pmtu,
2851         },
2852         {
2853                 .procname       = "min_adv_mss",
2854                 .data           = &ip_rt_min_advmss,
2855                 .maxlen         = sizeof(int),
2856                 .mode           = 0644,
2857                 .proc_handler   = proc_dointvec,
2858         },
2859         { }
2860 };
2861
2862 static struct ctl_table ipv4_route_flush_table[] = {
2863         {
2864                 .procname       = "flush",
2865                 .maxlen         = sizeof(int),
2866                 .mode           = 0200,
2867                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2868         },
2869         { },
2870 };
2871
2872 static __net_init int sysctl_route_net_init(struct net *net)
2873 {
2874         struct ctl_table *tbl;
2875
2876         tbl = ipv4_route_flush_table;
2877         if (!net_eq(net, &init_net)) {
2878                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2879                 if (!tbl)
2880                         goto err_dup;
2881
2882                 /* Don't export sysctls to unprivileged users */
2883                 if (net->user_ns != &init_user_ns)
2884                         tbl[0].procname = NULL;
2885         }
2886         tbl[0].extra1 = net;
2887
2888         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2889         if (!net->ipv4.route_hdr)
2890                 goto err_reg;
2891         return 0;
2892
2893 err_reg:
2894         if (tbl != ipv4_route_flush_table)
2895                 kfree(tbl);
2896 err_dup:
2897         return -ENOMEM;
2898 }
2899
2900 static __net_exit void sysctl_route_net_exit(struct net *net)
2901 {
2902         struct ctl_table *tbl;
2903
2904         tbl = net->ipv4.route_hdr->ctl_table_arg;
2905         unregister_net_sysctl_table(net->ipv4.route_hdr);
2906         BUG_ON(tbl == ipv4_route_flush_table);
2907         kfree(tbl);
2908 }
2909
2910 static __net_initdata struct pernet_operations sysctl_route_ops = {
2911         .init = sysctl_route_net_init,
2912         .exit = sysctl_route_net_exit,
2913 };
2914 #endif
2915
2916 static __net_init int rt_genid_init(struct net *net)
2917 {
2918         atomic_set(&net->ipv4.rt_genid, 0);
2919         atomic_set(&net->fnhe_genid, 0);
2920         get_random_bytes(&net->ipv4.dev_addr_genid,
2921                          sizeof(net->ipv4.dev_addr_genid));
2922         return 0;
2923 }
2924
2925 static __net_initdata struct pernet_operations rt_genid_ops = {
2926         .init = rt_genid_init,
2927 };
2928
2929 static int __net_init ipv4_inetpeer_init(struct net *net)
2930 {
2931         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2932
2933         if (!bp)
2934                 return -ENOMEM;
2935         inet_peer_base_init(bp);
2936         net->ipv4.peers = bp;
2937         return 0;
2938 }
2939
2940 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2941 {
2942         struct inet_peer_base *bp = net->ipv4.peers;
2943
2944         net->ipv4.peers = NULL;
2945         inetpeer_invalidate_tree(bp);
2946         kfree(bp);
2947 }
2948
2949 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2950         .init   =       ipv4_inetpeer_init,
2951         .exit   =       ipv4_inetpeer_exit,
2952 };
2953
2954 #ifdef CONFIG_IP_ROUTE_CLASSID
2955 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2956 #endif /* CONFIG_IP_ROUTE_CLASSID */
2957
2958 int __init ip_rt_init(void)
2959 {
2960         void *idents_hash;
2961         int rc = 0;
2962         int cpu;
2963
2964         /* For modern hosts, this will use 2 MB of memory */
2965         idents_hash = alloc_large_system_hash("IP idents",
2966                                               sizeof(*ip_idents) + sizeof(*ip_tstamps),
2967                                               0,
2968                                               16, /* one bucket per 64 KB */
2969                                               0,
2970                                               NULL,
2971                                               &ip_idents_mask,
2972                                               2048,
2973                                               256*1024);
2974
2975         ip_idents = idents_hash;
2976
2977         prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
2978
2979         ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
2980         memset(ip_tstamps, 0, (ip_idents_mask + 1) * sizeof(*ip_tstamps));
2981
2982         for_each_possible_cpu(cpu) {
2983                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
2984
2985                 INIT_LIST_HEAD(&ul->head);
2986                 spin_lock_init(&ul->lock);
2987         }
2988 #ifdef CONFIG_IP_ROUTE_CLASSID
2989         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2990         if (!ip_rt_acct)
2991                 panic("IP: failed to allocate ip_rt_acct\n");
2992 #endif
2993
2994         ipv4_dst_ops.kmem_cachep =
2995                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2996                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2997
2998         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2999
3000         if (dst_entries_init(&ipv4_dst_ops) < 0)
3001                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3002
3003         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3004                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3005
3006         ipv4_dst_ops.gc_thresh = ~0;
3007         ip_rt_max_size = INT_MAX;
3008
3009         devinet_init();
3010         ip_fib_init();
3011
3012         if (ip_rt_proc_init())
3013                 pr_err("Unable to create route proc files\n");
3014 #ifdef CONFIG_XFRM
3015         xfrm_init();
3016         xfrm4_init();
3017 #endif
3018         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3019
3020 #ifdef CONFIG_SYSCTL
3021         register_pernet_subsys(&sysctl_route_ops);
3022 #endif
3023         register_pernet_subsys(&rt_genid_ops);
3024         register_pernet_subsys(&ipv4_inetpeer_ops);
3025         return rc;
3026 }
3027
3028 #ifdef CONFIG_SYSCTL
3029 /*
3030  * We really need to sanitize the damn ipv4 init order, then all
3031  * this nonsense will go away.
3032  */
3033 void __init ip_static_sysctl_init(void)
3034 {
3035         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3036 }
3037 #endif