net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #define pr_fmt(fmt) "IPv4: " fmt
  66
  67 #include <linux/module.h>
  68 #include <linux/uaccess.h>
  69 #include <linux/bitops.h>
  70 #include <linux/types.h>
  71 #include <linux/kernel.h>
  72 #include <linux/mm.h>
  73 #include <linux/bootmem.h>
  74 #include <linux/string.h>
  75 #include <linux/socket.h>
  76 #include <linux/sockios.h>
  77 #include <linux/errno.h>
  78 #include <linux/in.h>
  79 #include <linux/inet.h>
  80 #include <linux/netdevice.h>
  81 #include <linux/proc_fs.h>
  82 #include <linux/init.h>
  83 #include <linux/skbuff.h>
  84 #include <linux/inetdevice.h>
  85 #include <linux/igmp.h>
  86 #include <linux/pkt_sched.h>
  87 #include <linux/mroute.h>
  88 #include <linux/netfilter_ipv4.h>
  89 #include <linux/random.h>
  90 #include <linux/rcupdate.h>
  91 #include <linux/times.h>
  92 #include <linux/slab.h>
  93 #include <linux/jhash.h>
  94 #include <net/dst.h>
  95 #include <net/dst_metadata.h>
  96 #include <net/net_namespace.h>
  97 #include <net/protocol.h>
  98 #include <net/ip.h>
  99 #include <net/route.h>
 100 #include <net/inetpeer.h>
 101 #include <net/sock.h>
 102 #include <net/ip_fib.h>
 103 #include <net/arp.h>
 104 #include <net/tcp.h>
 105 #include <net/icmp.h>
 106 #include <net/xfrm.h>
 107 #include <net/lwtunnel.h>
 108 #include <net/netevent.h>
 109 #include <net/rtnetlink.h>
 110 #ifdef CONFIG_SYSCTL
 111 #include <linux/sysctl.h>
 112 #include <linux/kmemleak.h>
 113 #endif
 114 #include <net/secure_seq.h>
 115 #include <net/ip_tunnels.h>
 116 #include <net/l3mdev.h>
 117
 118 #include "fib_lookup.h"
 119
 120 #define RT_FL_TOS(oldflp4) \
 121         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 122
 123 #define RT_GC_TIMEOUT (300*HZ)
 124
 125 static int ip_rt_max_size;
 126 static int ip_rt_redirect_number __read_mostly  = 9;
 127 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 128 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 129 static int ip_rt_error_cost __read_mostly       = HZ;
 130 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 131 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 132 static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 133 static int ip_rt_min_advmss __read_mostly       = 256;
 134
 135 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 136
 137 /*
 138  *      Interface to generic destination cache.
 139  */
 140
 141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 142 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 143 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 144 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 145 static void              ipv4_link_failure(struct sk_buff *skb);
 146 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 147                                            struct sk_buff *skb, u32 mtu,
 148                                            bool confirm_neigh);
 149 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 150                                         struct sk_buff *skb);
 151 static void             ipv4_dst_destroy(struct dst_entry *dst);
 152
 153 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 154 {
 155         WARN_ON(1);
 156         return NULL;
 157 }
 158
 159 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 160                                            struct sk_buff *skb,
 161                                            const void *daddr);
 162 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
 163
 164 static struct dst_ops ipv4_dst_ops = {
 165         .family =               AF_INET,
 166         .check =                ipv4_dst_check,
 167         .default_advmss =       ipv4_default_advmss,
 168         .mtu =                  ipv4_mtu,
 169         .cow_metrics =          ipv4_cow_metrics,
 170         .destroy =              ipv4_dst_destroy,
 171         .negative_advice =      ipv4_negative_advice,
 172         .link_failure =         ipv4_link_failure,
 173         .update_pmtu =          ip_rt_update_pmtu,
 174         .redirect =             ip_do_redirect,
 175         .local_out =            __ip_local_out,
 176         .neigh_lookup =         ipv4_neigh_lookup,
 177         .confirm_neigh =        ipv4_confirm_neigh,
 178 };
 179
 180 #define ECN_OR_COST(class)      TC_PRIO_##class
 181
 182 const __u8 ip_tos2prio[16] = {
 183         TC_PRIO_BESTEFFORT,
 184         ECN_OR_COST(BESTEFFORT),
 185         TC_PRIO_BESTEFFORT,
 186         ECN_OR_COST(BESTEFFORT),
 187         TC_PRIO_BULK,
 188         ECN_OR_COST(BULK),
 189         TC_PRIO_BULK,
 190         ECN_OR_COST(BULK),
 191         TC_PRIO_INTERACTIVE,
 192         ECN_OR_COST(INTERACTIVE),
 193         TC_PRIO_INTERACTIVE,
 194         ECN_OR_COST(INTERACTIVE),
 195         TC_PRIO_INTERACTIVE_BULK,
 196         ECN_OR_COST(INTERACTIVE_BULK),
 197         TC_PRIO_INTERACTIVE_BULK,
 198         ECN_OR_COST(INTERACTIVE_BULK)
 199 };
 200 EXPORT_SYMBOL(ip_tos2prio);
 201
 202 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 203 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 204
 205 #ifdef CONFIG_PROC_FS
 206 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 207 {
 208         if (*pos)
 209                 return NULL;
 210         return SEQ_START_TOKEN;
 211 }
 212
 213 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 214 {
 215         ++*pos;
 216         return NULL;
 217 }
 218
 219 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 220 {
 221 }
 222
 223 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 224 {
 225         if (v == SEQ_START_TOKEN)
 226                 seq_printf(seq, "%-127s\n",
 227                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 228                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 229                            "HHUptod\tSpecDst");
 230         return 0;
 231 }
 232
 233 static const struct seq_operations rt_cache_seq_ops = {
 234         .start  = rt_cache_seq_start,
 235         .next   = rt_cache_seq_next,
 236         .stop   = rt_cache_seq_stop,
 237         .show   = rt_cache_seq_show,
 238 };
 239
 240 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 241 {
 242         return seq_open(file, &rt_cache_seq_ops);
 243 }
 244
 245 static const struct file_operations rt_cache_seq_fops = {
 246         .owner   = THIS_MODULE,
 247         .open    = rt_cache_seq_open,
 248         .read    = seq_read,
 249         .llseek  = seq_lseek,
 250         .release = seq_release,
 251 };
 252
 253
 254 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 255 {
 256         int cpu;
 257
 258         if (*pos == 0)
 259                 return SEQ_START_TOKEN;
 260
 261         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 262                 if (!cpu_possible(cpu))
 263                         continue;
 264                 *pos = cpu+1;
 265                 return &per_cpu(rt_cache_stat, cpu);
 266         }
 267         return NULL;
 268 }
 269
 270 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 271 {
 272         int cpu;
 273
 274         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 275                 if (!cpu_possible(cpu))
 276                         continue;
 277                 *pos = cpu+1;
 278                 return &per_cpu(rt_cache_stat, cpu);
 279         }
 280         (*pos)++;
 281         return NULL;
 282
 283 }
 284
 285 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 286 {
 287
 288 }
 289
 290 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 291 {
 292         struct rt_cache_stat *st = v;
 293
 294         if (v == SEQ_START_TOKEN) {
 295                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 296                 return 0;
 297         }
 298
 299         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 300                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 301                    dst_entries_get_slow(&ipv4_dst_ops),
 302                    0, /* st->in_hit */
 303                    st->in_slow_tot,
 304                    st->in_slow_mc,
 305                    st->in_no_route,
 306                    st->in_brd,
 307                    st->in_martian_dst,
 308                    st->in_martian_src,
 309
 310                    0, /* st->out_hit */
 311                    st->out_slow_tot,
 312                    st->out_slow_mc,
 313
 314                    0, /* st->gc_total */
 315                    0, /* st->gc_ignored */
 316                    0, /* st->gc_goal_miss */
 317                    0, /* st->gc_dst_overflow */
 318                    0, /* st->in_hlist_search */
 319                    0  /* st->out_hlist_search */
 320                 );
 321         return 0;
 322 }
 323
 324 static const struct seq_operations rt_cpu_seq_ops = {
 325         .start  = rt_cpu_seq_start,
 326         .next   = rt_cpu_seq_next,
 327         .stop   = rt_cpu_seq_stop,
 328         .show   = rt_cpu_seq_show,
 329 };
 330
 331
 332 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 333 {
 334         return seq_open(file, &rt_cpu_seq_ops);
 335 }
 336
 337 static const struct file_operations rt_cpu_seq_fops = {
 338         .owner   = THIS_MODULE,
 339         .open    = rt_cpu_seq_open,
 340         .read    = seq_read,
 341         .llseek  = seq_lseek,
 342         .release = seq_release,
 343 };
 344
 345 #ifdef CONFIG_IP_ROUTE_CLASSID
 346 static int rt_acct_proc_show(struct seq_file *m, void *v)
 347 {
 348         struct ip_rt_acct *dst, *src;
 349         unsigned int i, j;
 350
 351         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 352         if (!dst)
 353                 return -ENOMEM;
 354
 355         for_each_possible_cpu(i) {
 356                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 357                 for (j = 0; j < 256; j++) {
 358                         dst[j].o_bytes   += src[j].o_bytes;
 359                         dst[j].o_packets += src[j].o_packets;
 360                         dst[j].i_bytes   += src[j].i_bytes;
 361                         dst[j].i_packets += src[j].i_packets;
 362                 }
 363         }
 364
 365         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 366         kfree(dst);
 367         return 0;
 368 }
 369
 370 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 371 {
 372         return single_open(file, rt_acct_proc_show, NULL);
 373 }
 374
 375 static const struct file_operations rt_acct_proc_fops = {
 376         .owner          = THIS_MODULE,
 377         .open           = rt_acct_proc_open,
 378         .read           = seq_read,
 379         .llseek         = seq_lseek,
 380         .release        = single_release,
 381 };
 382 #endif
 383
 384 static int __net_init ip_rt_do_proc_init(struct net *net)
 385 {
 386         struct proc_dir_entry *pde;
 387
 388         pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
 389                           &rt_cache_seq_fops);
 390         if (!pde)
 391                 goto err1;
 392
 393         pde = proc_create("rt_cache", S_IRUGO,
 394                           net->proc_net_stat, &rt_cpu_seq_fops);
 395         if (!pde)
 396                 goto err2;
 397
 398 #ifdef CONFIG_IP_ROUTE_CLASSID
 399         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 400         if (!pde)
 401                 goto err3;
 402 #endif
 403         return 0;
 404
 405 #ifdef CONFIG_IP_ROUTE_CLASSID
 406 err3:
 407         remove_proc_entry("rt_cache", net->proc_net_stat);
 408 #endif
 409 err2:
 410         remove_proc_entry("rt_cache", net->proc_net);
 411 err1:
 412         return -ENOMEM;
 413 }
 414
 415 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 416 {
 417         remove_proc_entry("rt_cache", net->proc_net_stat);
 418         remove_proc_entry("rt_cache", net->proc_net);
 419 #ifdef CONFIG_IP_ROUTE_CLASSID
 420         remove_proc_entry("rt_acct", net->proc_net);
 421 #endif
 422 }
 423
 424 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 425         .init = ip_rt_do_proc_init,
 426         .exit = ip_rt_do_proc_exit,
 427 };
 428
 429 static int __init ip_rt_proc_init(void)
 430 {
 431         return register_pernet_subsys(&ip_rt_proc_ops);
 432 }
 433
 434 #else
 435 static inline int ip_rt_proc_init(void)
 436 {
 437         return 0;
 438 }
 439 #endif /* CONFIG_PROC_FS */
 440
 441 static inline bool rt_is_expired(const struct rtable *rth)
 442 {
 443         return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 444 }
 445
 446 void rt_cache_flush(struct net *net)
 447 {
 448         rt_genid_bump_ipv4(net);
 449 }
 450
 451 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 452                                            struct sk_buff *skb,
 453                                            const void *daddr)
 454 {
 455         struct net_device *dev = dst->dev;
 456         const __be32 *pkey = daddr;
 457         const struct rtable *rt;
 458         struct neighbour *n;
 459
 460         rt = (const struct rtable *) dst;
 461         if (rt->rt_gateway)
 462                 pkey = (const __be32 *) &rt->rt_gateway;
 463         else if (skb)
 464                 pkey = &ip_hdr(skb)->daddr;
 465
 466         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
 467         if (n)
 468                 return n;
 469         return neigh_create(&arp_tbl, pkey, dev);
 470 }
 471
 472 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 473 {
 474         struct net_device *dev = dst->dev;
 475         const __be32 *pkey = daddr;
 476         const struct rtable *rt;
 477
 478         rt = (const struct rtable *)dst;
 479         if (rt->rt_gateway)
 480                 pkey = (const __be32 *)&rt->rt_gateway;
 481         else if (!daddr ||
 482                  (rt->rt_flags &
 483                   (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
 484                 return;
 485
 486         __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
 487 }
 488
 489 /* Hash tables of size 2048..262144 depending on RAM size.
 490  * Each bucket uses 8 bytes.
 491  */
 492 static u32 ip_idents_mask __read_mostly;
 493 static atomic_t *ip_idents __read_mostly;
 494 static u32 *ip_tstamps __read_mostly;
 495
 496 /* In order to protect privacy, we add a perturbation to identifiers
 497  * if one generator is seldom used. This makes hard for an attacker
 498  * to infer how many packets were sent between two points in time.
 499  */
 500 u32 ip_idents_reserve(u32 hash, int segs)
 501 {
 502         u32 bucket, old, now = (u32)jiffies;
 503         atomic_t *p_id;
 504         u32 *p_tstamp;
 505         u32 delta = 0;
 506
 507         bucket = hash & ip_idents_mask;
 508         p_tstamp = ip_tstamps + bucket;
 509         p_id = ip_idents + bucket;
 510         old = ACCESS_ONCE(*p_tstamp);
 511
 512         if (old != now && cmpxchg(p_tstamp, old, now) == old)
 513                 delta = prandom_u32_max(now - old);
 514
 515         /* If UBSAN reports an error there, please make sure your compiler
 516          * supports -fno-strict-overflow before reporting it that was a bug
 517          * in UBSAN, and it has been fixed in GCC-8.
 518          */
 519         return atomic_add_return(segs + delta, p_id) - segs;
 520 }
 521 EXPORT_SYMBOL(ip_idents_reserve);
 522
 523 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 524 {
 525         u32 hash, id;
 526
 527         /* Note the following code is not safe, but this is okay. */
 528         if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
 529                 get_random_bytes(&net->ipv4.ip_id_key,
 530                                  sizeof(net->ipv4.ip_id_key));
 531
 532         hash = siphash_3u32((__force u32)iph->daddr,
 533                             (__force u32)iph->saddr,
 534                             iph->protocol,
 535                             &net->ipv4.ip_id_key);
 536         id = ip_idents_reserve(hash, segs);
 537         iph->id = htons(id);
 538 }
 539 EXPORT_SYMBOL(__ip_select_ident);
 540
 541 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
 542                              const struct sock *sk,
 543                              const struct iphdr *iph,
 544                              int oif, u8 tos,
 545                              u8 prot, u32 mark, int flow_flags)
 546 {
 547         if (sk) {
 548                 const struct inet_sock *inet = inet_sk(sk);
 549
 550                 oif = sk->sk_bound_dev_if;
 551                 mark = sk->sk_mark;
 552                 tos = RT_CONN_FLAGS(sk);
 553                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 554         }
 555         flowi4_init_output(fl4, oif, mark, tos,
 556                            RT_SCOPE_UNIVERSE, prot,
 557                            flow_flags,
 558                            iph->daddr, iph->saddr, 0, 0,
 559                            sock_net_uid(net, sk));
 560 }
 561
 562 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 563                                const struct sock *sk)
 564 {
 565         const struct net *net = dev_net(skb->dev);
 566         const struct iphdr *iph = ip_hdr(skb);
 567         int oif = skb->dev->ifindex;
 568         u8 tos = RT_TOS(iph->tos);
 569         u8 prot = iph->protocol;
 570         u32 mark = skb->mark;
 571
 572         __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
 573 }
 574
 575 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 576 {
 577         const struct inet_sock *inet = inet_sk(sk);
 578         const struct ip_options_rcu *inet_opt;
 579         __be32 daddr = inet->inet_daddr;
 580
 581         rcu_read_lock();
 582         inet_opt = rcu_dereference(inet->inet_opt);
 583         if (inet_opt && inet_opt->opt.srr)
 584                 daddr = inet_opt->opt.faddr;
 585         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 586                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 587                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 588                            inet_sk_flowi_flags(sk),
 589                            daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
 590         rcu_read_unlock();
 591 }
 592
 593 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 594                                  const struct sk_buff *skb)
 595 {
 596         if (skb)
 597                 build_skb_flow_key(fl4, skb, sk);
 598         else
 599                 build_sk_flow_key(fl4, sk);
 600 }
 601
 602 static DEFINE_SPINLOCK(fnhe_lock);
 603
 604 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 605 {
 606         struct rtable *rt;
 607
 608         rt = rcu_dereference(fnhe->fnhe_rth_input);
 609         if (rt) {
 610                 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 611                 dst_dev_put(&rt->dst);
 612                 dst_release(&rt->dst);
 613         }
 614         rt = rcu_dereference(fnhe->fnhe_rth_output);
 615         if (rt) {
 616                 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 617                 dst_dev_put(&rt->dst);
 618                 dst_release(&rt->dst);
 619         }
 620 }
 621
 622 static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
 623 {
 624         struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
 625         struct fib_nh_exception *fnhe, *oldest = NULL;
 626
 627         for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
 628                 fnhe = rcu_dereference_protected(*fnhe_p,
 629                                                  lockdep_is_held(&fnhe_lock));
 630                 if (!fnhe)
 631                         break;
 632                 if (!oldest ||
 633                     time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
 634                         oldest = fnhe;
 635                         oldest_p = fnhe_p;
 636                 }
 637         }
 638         fnhe_flush_routes(oldest);
 639         *oldest_p = oldest->fnhe_next;
 640         kfree_rcu(oldest, rcu);
 641 }
 642
 643 static u32 fnhe_hashfun(__be32 daddr)
 644 {
 645         static siphash_key_t fnhe_hash_key __read_mostly;
 646         u64 hval;
 647
 648         net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
 649         hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
 650         return hash_64(hval, FNHE_HASH_SHIFT);
 651 }
 652
 653 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 654 {
 655         rt->rt_pmtu = fnhe->fnhe_pmtu;
 656         rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
 657         rt->dst.expires = fnhe->fnhe_expires;
 658
 659         if (fnhe->fnhe_gw) {
 660                 rt->rt_flags |= RTCF_REDIRECTED;
 661                 rt->rt_gateway = fnhe->fnhe_gw;
 662                 rt->rt_uses_gateway = 1;
 663         }
 664 }
 665
 666 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 667                                   u32 pmtu, bool lock, unsigned long expires)
 668 {
 669         struct fnhe_hash_bucket *hash;
 670         struct fib_nh_exception *fnhe;
 671         struct rtable *rt;
 672         u32 genid, hval;
 673         unsigned int i;
 674         int depth;
 675
 676         genid = fnhe_genid(dev_net(nh->nh_dev));
 677         hval = fnhe_hashfun(daddr);
 678
 679         spin_lock_bh(&fnhe_lock);
 680
 681         hash = rcu_dereference(nh->nh_exceptions);
 682         if (!hash) {
 683                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
 684                 if (!hash)
 685                         goto out_unlock;
 686                 rcu_assign_pointer(nh->nh_exceptions, hash);
 687         }
 688
 689         hash += hval;
 690
 691         depth = 0;
 692         for (fnhe = rcu_dereference(hash->chain); fnhe;
 693              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 694                 if (fnhe->fnhe_daddr == daddr)
 695                         break;
 696                 depth++;
 697         }
 698
 699         if (fnhe) {
 700                 if (fnhe->fnhe_genid != genid)
 701                         fnhe->fnhe_genid = genid;
 702                 if (gw)
 703                         fnhe->fnhe_gw = gw;
 704                 if (pmtu) {
 705                         fnhe->fnhe_pmtu = pmtu;
 706                         fnhe->fnhe_mtu_locked = lock;
 707                 }
 708                 fnhe->fnhe_expires = max(1UL, expires);
 709                 /* Update all cached dsts too */
 710                 rt = rcu_dereference(fnhe->fnhe_rth_input);
 711                 if (rt)
 712                         fill_route_from_fnhe(rt, fnhe);
 713                 rt = rcu_dereference(fnhe->fnhe_rth_output);
 714                 if (rt)
 715                         fill_route_from_fnhe(rt, fnhe);
 716         } else {
 717                 /* Randomize max depth to avoid some side channels attacks. */
 718                 int max_depth = FNHE_RECLAIM_DEPTH +
 719                                 prandom_u32_max(FNHE_RECLAIM_DEPTH);
 720
 721                 while (depth > max_depth) {
 722                         fnhe_remove_oldest(hash);
 723                         depth--;
 724                 }
 725
 726                 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 727                 if (!fnhe)
 728                         goto out_unlock;
 729
 730                 fnhe->fnhe_next = hash->chain;
 731
 732                 fnhe->fnhe_genid = genid;
 733                 fnhe->fnhe_daddr = daddr;
 734                 fnhe->fnhe_gw = gw;
 735                 fnhe->fnhe_pmtu = pmtu;
 736                 fnhe->fnhe_mtu_locked = lock;
 737                 fnhe->fnhe_expires = max(1UL, expires);
 738
 739                 rcu_assign_pointer(hash->chain, fnhe);
 740
 741                 /* Exception created; mark the cached routes for the nexthop
 742                  * stale, so anyone caching it rechecks if this exception
 743                  * applies to them.
 744                  */
 745                 rt = rcu_dereference(nh->nh_rth_input);
 746                 if (rt)
 747                         rt->dst.obsolete = DST_OBSOLETE_KILL;
 748
 749                 for_each_possible_cpu(i) {
 750                         struct rtable __rcu **prt;
 751                         prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
 752                         rt = rcu_dereference(*prt);
 753                         if (rt)
 754                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 755                 }
 756         }
 757
 758         fnhe->fnhe_stamp = jiffies;
 759
 760 out_unlock:
 761         spin_unlock_bh(&fnhe_lock);
 762 }
 763
 764 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 765                              bool kill_route)
 766 {
 767         __be32 new_gw = icmp_hdr(skb)->un.gateway;
 768         __be32 old_gw = ip_hdr(skb)->saddr;
 769         struct net_device *dev = skb->dev;
 770         struct in_device *in_dev;
 771         struct fib_result res;
 772         struct neighbour *n;
 773         struct net *net;
 774
 775         switch (icmp_hdr(skb)->code & 7) {
 776         case ICMP_REDIR_NET:
 777         case ICMP_REDIR_NETTOS:
 778         case ICMP_REDIR_HOST:
 779         case ICMP_REDIR_HOSTTOS:
 780                 break;
 781
 782         default:
 783                 return;
 784         }
 785
 786         if (rt->rt_gateway != old_gw)
 787                 return;
 788
 789         in_dev = __in_dev_get_rcu(dev);
 790         if (!in_dev)
 791                 return;
 792
 793         net = dev_net(dev);
 794         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 795             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 796             ipv4_is_zeronet(new_gw))
 797                 goto reject_redirect;
 798
 799         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 800                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 801                         goto reject_redirect;
 802                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 803                         goto reject_redirect;
 804         } else {
 805                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 806                         goto reject_redirect;
 807         }
 808
 809         n = __ipv4_neigh_lookup(rt->dst.dev, (__force u32)new_gw);
 810         if (!n)
 811                 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
 812         if (!IS_ERR(n)) {
 813                 if (!(n->nud_state & NUD_VALID)) {
 814                         neigh_event_send(n, NULL);
 815                 } else {
 816                         if (fib_lookup(net, fl4, &res, 0) == 0) {
 817                                 struct fib_nh *nh = &FIB_RES_NH(res);
 818
 819                                 fib_select_path(net, &res, fl4, skb);
 820                                 nh = &FIB_RES_NH(res);
 821                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
 822                                                 0, false,
 823                                                 jiffies + ip_rt_gc_timeout);
 824                         }
 825                         if (kill_route)
 826                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 827                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 828                 }
 829                 neigh_release(n);
 830         }
 831         return;
 832
 833 reject_redirect:
 834 #ifdef CONFIG_IP_ROUTE_VERBOSE
 835         if (IN_DEV_LOG_MARTIANS(in_dev)) {
 836                 const struct iphdr *iph = (const struct iphdr *) skb->data;
 837                 __be32 daddr = iph->daddr;
 838                 __be32 saddr = iph->saddr;
 839
 840                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 841                                      "  Advised path = %pI4 -> %pI4\n",
 842                                      &old_gw, dev->name, &new_gw,
 843                                      &saddr, &daddr);
 844         }
 845 #endif
 846         ;
 847 }
 848
 849 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 850 {
 851         struct rtable *rt;
 852         struct flowi4 fl4;
 853         const struct iphdr *iph = (const struct iphdr *) skb->data;
 854         struct net *net = dev_net(skb->dev);
 855         int oif = skb->dev->ifindex;
 856         u8 tos = RT_TOS(iph->tos);
 857         u8 prot = iph->protocol;
 858         u32 mark = skb->mark;
 859
 860         rt = (struct rtable *) dst;
 861
 862         __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
 863         __ip_do_redirect(rt, skb, &fl4, true);
 864 }
 865
 866 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 867 {
 868         struct rtable *rt = (struct rtable *)dst;
 869         struct dst_entry *ret = dst;
 870
 871         if (rt) {
 872                 if (dst->obsolete > 0) {
 873                         ip_rt_put(rt);
 874                         ret = NULL;
 875                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 876                            rt->dst.expires) {
 877                         ip_rt_put(rt);
 878                         ret = NULL;
 879                 }
 880         }
 881         return ret;
 882 }
 883
/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see any packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
 899
 900 void ip_rt_send_redirect(struct sk_buff *skb)
 901 {
 902         struct rtable *rt = skb_rtable(skb);
 903         struct in_device *in_dev;
 904         struct inet_peer *peer;
 905         struct net *net;
 906         int log_martians;
 907         int vif;
 908
 909         rcu_read_lock();
 910         in_dev = __in_dev_get_rcu(rt->dst.dev);
 911         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 912                 rcu_read_unlock();
 913                 return;
 914         }
 915         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 916         vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
 917         rcu_read_unlock();
 918
 919         net = dev_net(rt->dst.dev);
 920         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
 921         if (!peer) {
 922                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 923                           rt_nexthop(rt, ip_hdr(skb)->daddr));
 924                 return;
 925         }
 926
 927         /* No redirected packets during ip_rt_redirect_silence;
 928          * reset the algorithm.
 929          */
 930         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
 931                 peer->rate_tokens = 0;
 932                 peer->n_redirects = 0;
 933         }
 934
 935         /* Too many ignored redirects; do not send anything
 936          * set dst.rate_last to the last seen redirected packet.
 937          */
 938         if (peer->n_redirects >= ip_rt_redirect_number) {
 939                 peer->rate_last = jiffies;
 940                 goto out_put_peer;
 941         }
 942
 943         /* Check for load limit; set rate_last to the latest sent
 944          * redirect.
 945          */
 946         if (peer->n_redirects == 0 ||
 947             time_after(jiffies,
 948                        (peer->rate_last +
 949                         (ip_rt_redirect_load << peer->n_redirects)))) {
 950                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 951
 952                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 953                 peer->rate_last = jiffies;
 954                 ++peer->n_redirects;
 955 #ifdef CONFIG_IP_ROUTE_VERBOSE
 956                 if (log_martians &&
 957                     peer->n_redirects == ip_rt_redirect_number)
 958                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 959                                              &ip_hdr(skb)->saddr, inet_iif(skb),
 960                                              &ip_hdr(skb)->daddr, &gw);
 961 #endif
 962         }
 963 out_put_peer:
 964         inet_putpeer(peer);
 965 }
 966
 967 static int ip_error(struct sk_buff *skb)
 968 {
 969         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 970         struct rtable *rt = skb_rtable(skb);
 971         struct inet_peer *peer;
 972         unsigned long now;
 973         struct net *net;
 974         bool send;
 975         int code;
 976
 977         /* IP on this device is disabled. */
 978         if (!in_dev)
 979                 goto out;
 980
 981         net = dev_net(rt->dst.dev);
 982         if (!IN_DEV_FORWARD(in_dev)) {
 983                 switch (rt->dst.error) {
 984                 case EHOSTUNREACH:
 985                         __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
 986                         break;
 987
 988                 case ENETUNREACH:
 989                         __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 990                         break;
 991                 }
 992                 goto out;
 993         }
 994
 995         switch (rt->dst.error) {
 996         case EINVAL:
 997         default:
 998                 goto out;
 999         case EHOSTUNREACH:
1000                 code = ICMP_HOST_UNREACH;
1001                 break;
1002         case ENETUNREACH:
1003                 code = ICMP_NET_UNREACH;
1004                 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
1005                 break;
1006         case EACCES:
1007                 code = ICMP_PKT_FILTERED;
1008                 break;
1009         }
1010
1011         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
1012                                l3mdev_master_ifindex(skb->dev), 1);
1013
1014         send = true;
1015         if (peer) {
1016                 now = jiffies;
1017                 peer->rate_tokens += now - peer->rate_last;
1018                 if (peer->rate_tokens > ip_rt_error_burst)
1019                         peer->rate_tokens = ip_rt_error_burst;
1020                 peer->rate_last = now;
1021                 if (peer->rate_tokens >= ip_rt_error_cost)
1022                         peer->rate_tokens -= ip_rt_error_cost;
1023                 else
1024                         send = false;
1025                 inet_putpeer(peer);
1026         }
1027         if (send)
1028                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1029
1030 out:    kfree_skb(skb);
1031         return 0;
1032 }
1033
1034 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1035 {
1036         struct dst_entry *dst = &rt->dst;
1037         struct net *net = dev_net(dst->dev);
1038         u32 old_mtu = ipv4_mtu(dst);
1039         struct fib_result res;
1040         bool lock = false;
1041
1042         if (ip_mtu_locked(dst))
1043                 return;
1044
1045         if (old_mtu < mtu)
1046                 return;
1047
1048         if (mtu < ip_rt_min_pmtu) {
1049                 lock = true;
1050                 mtu = min(old_mtu, ip_rt_min_pmtu);
1051         }
1052
1053         if (rt->rt_pmtu == mtu && !lock &&
1054             time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1055                 return;
1056
1057         rcu_read_lock();
1058         if (fib_lookup(net, fl4, &res, 0) == 0) {
1059                 struct fib_nh *nh;
1060
1061                 fib_select_path(net, &res, fl4, NULL);
1062                 nh = &FIB_RES_NH(res);
1063                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
1064                                       jiffies + ip_rt_mtu_expires);
1065         }
1066         rcu_read_unlock();
1067 }
1068
1069 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1070                               struct sk_buff *skb, u32 mtu,
1071                               bool confirm_neigh)
1072 {
1073         struct rtable *rt = (struct rtable *) dst;
1074         struct flowi4 fl4;
1075
1076         ip_rt_build_flow_key(&fl4, sk, skb);
1077         __ip_rt_update_pmtu(rt, &fl4, mtu);
1078 }
1079
1080 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1081                       int oif, u32 mark, u8 protocol, int flow_flags)
1082 {
1083         const struct iphdr *iph = (const struct iphdr *) skb->data;
1084         struct flowi4 fl4;
1085         struct rtable *rt;
1086
1087         if (!mark)
1088                 mark = IP4_REPLY_MARK(net, skb->mark);
1089
1090         __build_flow_key(net, &fl4, NULL, iph, oif,
1091                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1092         rt = __ip_route_output_key(net, &fl4);
1093         if (!IS_ERR(rt)) {
1094                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1095                 ip_rt_put(rt);
1096         }
1097 }
1098 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1099
1100 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1101 {
1102         const struct iphdr *iph = (const struct iphdr *) skb->data;
1103         struct flowi4 fl4;
1104         struct rtable *rt;
1105
1106         __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1107
1108         if (!fl4.flowi4_mark)
1109                 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1110
1111         rt = __ip_route_output_key(sock_net(sk), &fl4);
1112         if (!IS_ERR(rt)) {
1113                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1114                 ip_rt_put(rt);
1115         }
1116 }
1117
1118 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1119 {
1120         const struct iphdr *iph = (const struct iphdr *) skb->data;
1121         struct flowi4 fl4;
1122         struct rtable *rt;
1123         struct dst_entry *odst = NULL;
1124         bool new = false;
1125         struct net *net = sock_net(sk);
1126
1127         bh_lock_sock(sk);
1128
1129         if (!ip_sk_accept_pmtu(sk))
1130                 goto out;
1131
1132         odst = sk_dst_get(sk);
1133
1134         if (sock_owned_by_user(sk) || !odst) {
1135                 __ipv4_sk_update_pmtu(skb, sk, mtu);
1136                 goto out;
1137         }
1138
1139         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1140
1141         rt = (struct rtable *)odst;
1142         if (odst->obsolete && !odst->ops->check(odst, 0)) {
1143                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1144                 if (IS_ERR(rt))
1145                         goto out;
1146
1147                 new = true;
1148         }
1149
1150         __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1151
1152         if (!dst_check(&rt->dst, 0)) {
1153                 if (new)
1154                         dst_release(&rt->dst);
1155
1156                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1157                 if (IS_ERR(rt))
1158                         goto out;
1159
1160                 new = true;
1161         }
1162
1163         if (new)
1164                 sk_dst_set(sk, &rt->dst);
1165
1166 out:
1167         bh_unlock_sock(sk);
1168         dst_release(odst);
1169 }
1170 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1171
1172 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1173                    int oif, u32 mark, u8 protocol, int flow_flags)
1174 {
1175         const struct iphdr *iph = (const struct iphdr *) skb->data;
1176         struct flowi4 fl4;
1177         struct rtable *rt;
1178
1179         __build_flow_key(net, &fl4, NULL, iph, oif,
1180                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1181         rt = __ip_route_output_key(net, &fl4);
1182         if (!IS_ERR(rt)) {
1183                 __ip_do_redirect(rt, skb, &fl4, false);
1184                 ip_rt_put(rt);
1185         }
1186 }
1187 EXPORT_SYMBOL_GPL(ipv4_redirect);
1188
1189 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1190 {
1191         const struct iphdr *iph = (const struct iphdr *) skb->data;
1192         struct flowi4 fl4;
1193         struct rtable *rt;
1194         struct net *net = sock_net(sk);
1195
1196         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1197         rt = __ip_route_output_key(net, &fl4);
1198         if (!IS_ERR(rt)) {
1199                 __ip_do_redirect(rt, skb, &fl4, false);
1200                 ip_rt_put(rt);
1201         }
1202 }
1203 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1204
1205 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1206 {
1207         struct rtable *rt = (struct rtable *) dst;
1208
1209         /* All IPV4 dsts are created with ->obsolete set to the value
1210          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1211          * into this function always.
1212          *
1213          * When a PMTU/redirect information update invalidates a route,
1214          * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1215          * DST_OBSOLETE_DEAD by dst_free().
1216          */
1217         if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1218                 return NULL;
1219         return dst;
1220 }
1221
1222 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1223 {
1224         struct net_device *dev;
1225         struct ip_options opt;
1226         int res;
1227
1228         /* Recompile ip options since IPCB may not be valid anymore.
1229          * Also check we have a reasonable ipv4 header.
1230          */
1231         if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1232             ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1233                 return;
1234
1235         memset(&opt, 0, sizeof(opt));
1236         if (ip_hdr(skb)->ihl > 5) {
1237                 if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1238                         return;
1239                 opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1240
1241                 rcu_read_lock();
1242                 dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev;
1243                 res = __ip_options_compile(dev_net(dev), &opt, skb, NULL);
1244                 rcu_read_unlock();
1245
1246                 if (res)
1247                         return;
1248         }
1249         __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1250 }
1251
1252 static void ipv4_link_failure(struct sk_buff *skb)
1253 {
1254         struct rtable *rt;
1255
1256         ipv4_send_dest_unreach(skb);
1257
1258         rt = skb_rtable(skb);
1259         if (rt)
1260                 dst_set_expires(&rt->dst, 0);
1261 }
1262
1263 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1264 {
1265         pr_debug("%s: %pI4 -> %pI4, %s\n",
1266                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1267                  skb->dev ? skb->dev->name : "?");
1268         kfree_skb(skb);
1269         WARN_ON(1);
1270         return 0;
1271 }
1272
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is off the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */
1281
1282 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1283 {
1284         __be32 src;
1285
1286         if (rt_is_output_route(rt))
1287                 src = ip_hdr(skb)->saddr;
1288         else {
1289                 struct fib_result res;
1290                 struct flowi4 fl4;
1291                 struct iphdr *iph;
1292
1293                 iph = ip_hdr(skb);
1294
1295                 memset(&fl4, 0, sizeof(fl4));
1296                 fl4.daddr = iph->daddr;
1297                 fl4.saddr = iph->saddr;
1298                 fl4.flowi4_tos = RT_TOS(iph->tos);
1299                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1300                 fl4.flowi4_iif = skb->dev->ifindex;
1301                 fl4.flowi4_mark = skb->mark;
1302
1303                 rcu_read_lock();
1304                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1305                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1306                 else
1307                         src = inet_select_addr(rt->dst.dev,
1308                                                rt_nexthop(rt, iph->daddr),
1309                                                RT_SCOPE_UNIVERSE);
1310                 rcu_read_unlock();
1311         }
1312         memcpy(addr, &src, 4);
1313 }
1314
1315 #ifdef CONFIG_IP_ROUTE_CLASSID
1316 static void set_class_tag(struct rtable *rt, u32 tag)
1317 {
1318         if (!(rt->dst.tclassid & 0xFFFF))
1319                 rt->dst.tclassid |= tag & 0xFFFF;
1320         if (!(rt->dst.tclassid & 0xFFFF0000))
1321                 rt->dst.tclassid |= tag & 0xFFFF0000;
1322 }
1323 #endif
1324
1325 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1326 {
1327         unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1328         unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1329                                     ip_rt_min_advmss);
1330
1331         return min(advmss, IPV4_MAX_PMTU - header_size);
1332 }
1333
1334 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1335 {
1336         const struct rtable *rt = (const struct rtable *) dst;
1337         unsigned int mtu = rt->rt_pmtu;
1338
1339         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1340                 mtu = dst_metric_raw(dst, RTAX_MTU);
1341
1342         if (mtu)
1343                 return mtu;
1344
1345         mtu = READ_ONCE(dst->dev->mtu);
1346
1347         if (unlikely(ip_mtu_locked(dst))) {
1348                 if (rt->rt_uses_gateway && mtu > 576)
1349                         mtu = 576;
1350         }
1351
1352         mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1353
1354         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1355 }
1356
1357 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1358 {
1359         struct fnhe_hash_bucket *hash;
1360         struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1361         u32 hval = fnhe_hashfun(daddr);
1362
1363         spin_lock_bh(&fnhe_lock);
1364
1365         hash = rcu_dereference_protected(nh->nh_exceptions,
1366                                          lockdep_is_held(&fnhe_lock));
1367         hash += hval;
1368
1369         fnhe_p = &hash->chain;
1370         fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1371         while (fnhe) {
1372                 if (fnhe->fnhe_daddr == daddr) {
1373                         rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1374                                 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1375                         /* set fnhe_daddr to 0 to ensure it won't bind with
1376                          * new dsts in rt_bind_exception().
1377                          */
1378                         fnhe->fnhe_daddr = 0;
1379                         fnhe_flush_routes(fnhe);
1380                         kfree_rcu(fnhe, rcu);
1381                         break;
1382                 }
1383                 fnhe_p = &fnhe->fnhe_next;
1384                 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1385                                                  lockdep_is_held(&fnhe_lock));
1386         }
1387
1388         spin_unlock_bh(&fnhe_lock);
1389 }
1390
1391 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1392 {
1393         struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1394         struct fib_nh_exception *fnhe;
1395         u32 hval;
1396
1397         if (!hash)
1398                 return NULL;
1399
1400         hval = fnhe_hashfun(daddr);
1401
1402         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1403              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1404                 if (fnhe->fnhe_daddr == daddr) {
1405                         if (fnhe->fnhe_expires &&
1406                             time_after(jiffies, fnhe->fnhe_expires)) {
1407                                 ip_del_fnhe(nh, daddr);
1408                                 break;
1409                         }
1410                         return fnhe;
1411                 }
1412         }
1413         return NULL;
1414 }
1415
1416 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1417                               __be32 daddr, const bool do_cache)
1418 {
1419         bool ret = false;
1420
1421         spin_lock_bh(&fnhe_lock);
1422
1423         if (daddr == fnhe->fnhe_daddr) {
1424                 struct rtable __rcu **porig;
1425                 struct rtable *orig;
1426                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1427
1428                 if (rt_is_input_route(rt))
1429                         porig = &fnhe->fnhe_rth_input;
1430                 else
1431                         porig = &fnhe->fnhe_rth_output;
1432                 orig = rcu_dereference(*porig);
1433
1434                 if (fnhe->fnhe_genid != genid) {
1435                         fnhe->fnhe_genid = genid;
1436                         fnhe->fnhe_gw = 0;
1437                         fnhe->fnhe_pmtu = 0;
1438                         fnhe->fnhe_expires = 0;
1439                         fnhe_flush_routes(fnhe);
1440                         orig = NULL;
1441                 }
1442                 fill_route_from_fnhe(rt, fnhe);
1443                 if (!rt->rt_gateway)
1444                         rt->rt_gateway = daddr;
1445
1446                 if (do_cache) {
1447                         dst_hold(&rt->dst);
1448                         rcu_assign_pointer(*porig, rt);
1449                         if (orig) {
1450                                 dst_dev_put(&orig->dst);
1451                                 dst_release(&orig->dst);
1452                         }
1453                         ret = true;
1454                 }
1455
1456                 fnhe->fnhe_stamp = jiffies;
1457         }
1458         spin_unlock_bh(&fnhe_lock);
1459
1460         return ret;
1461 }
1462
1463 struct uncached_list {
1464         spinlock_t              lock;
1465         struct list_head        head;
1466 };
1467
1468 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1469
1470 static void rt_add_uncached_list(struct rtable *rt)
1471 {
1472         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1473
1474         rt->rt_uncached_list = ul;
1475
1476         spin_lock_bh(&ul->lock);
1477         list_add_tail(&rt->rt_uncached, &ul->head);
1478         spin_unlock_bh(&ul->lock);
1479 }
1480
1481 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1482 {
1483         struct rtable *orig, *prev, **p;
1484         bool ret = true;
1485
1486         if (rt_is_input_route(rt)) {
1487                 p = (struct rtable **)&nh->nh_rth_input;
1488         } else {
1489                 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1490         }
1491         orig = *p;
1492
1493         /* hold dst before doing cmpxchg() to avoid race condition
1494          * on this dst
1495          */
1496         dst_hold(&rt->dst);
1497         prev = cmpxchg(p, orig, rt);
1498         if (prev == orig) {
1499                 if (orig) {
1500                         rt_add_uncached_list(orig);
1501                         dst_release(&orig->dst);
1502                 }
1503         } else {
1504                 dst_release(&rt->dst);
1505                 ret = false;
1506         }
1507
1508         return ret;
1509 }
1510
1511 static void ipv4_dst_destroy(struct dst_entry *dst)
1512 {
1513         struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1514         struct rtable *rt = (struct rtable *) dst;
1515
1516         if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
1517                 kfree(p);
1518
1519         if (!list_empty(&rt->rt_uncached)) {
1520                 struct uncached_list *ul = rt->rt_uncached_list;
1521
1522                 spin_lock_bh(&ul->lock);
1523                 list_del(&rt->rt_uncached);
1524                 spin_unlock_bh(&ul->lock);
1525         }
1526 }
1527
1528 void rt_flush_dev(struct net_device *dev)
1529 {
1530         struct net *net = dev_net(dev);
1531         struct rtable *rt;
1532         int cpu;
1533
1534         for_each_possible_cpu(cpu) {
1535                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1536
1537                 spin_lock_bh(&ul->lock);
1538                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1539                         if (rt->dst.dev != dev)
1540                                 continue;
1541                         rt->dst.dev = net->loopback_dev;
1542                         dev_hold(rt->dst.dev);
1543                         dev_put(dev);
1544                 }
1545                 spin_unlock_bh(&ul->lock);
1546         }
1547 }
1548
1549 static bool rt_cache_valid(const struct rtable *rt)
1550 {
1551         return  rt &&
1552                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1553                 !rt_is_expired(rt);
1554 }
1555
1556 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1557                            const struct fib_result *res,
1558                            struct fib_nh_exception *fnhe,
1559                            struct fib_info *fi, u16 type, u32 itag,
1560                            const bool do_cache)
1561 {
1562         bool cached = false;
1563
1564         if (fi) {
1565                 struct fib_nh *nh = &FIB_RES_NH(*res);
1566
1567                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1568                         rt->rt_gateway = nh->nh_gw;
1569                         rt->rt_uses_gateway = 1;
1570                 }
1571                 dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1572                 if (fi->fib_metrics != &dst_default_metrics) {
1573                         rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1574                         refcount_inc(&fi->fib_metrics->refcnt);
1575                 }
1576 #ifdef CONFIG_IP_ROUTE_CLASSID
1577                 rt->dst.tclassid = nh->nh_tclassid;
1578 #endif
1579                 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1580                 if (unlikely(fnhe))
1581                         cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1582                 else if (do_cache)
1583                         cached = rt_cache_route(nh, rt);
1584                 if (unlikely(!cached)) {
1585                         /* Routes we intend to cache in nexthop exception or
1586                          * FIB nexthop have the DST_NOCACHE bit clear.
1587                          * However, if we are unsuccessful at storing this
1588                          * route into the cache we really need to set it.
1589                          */
1590                         if (!rt->rt_gateway)
1591                                 rt->rt_gateway = daddr;
1592                         rt_add_uncached_list(rt);
1593                 }
1594         } else
1595                 rt_add_uncached_list(rt);
1596
1597 #ifdef CONFIG_IP_ROUTE_CLASSID
1598 #ifdef CONFIG_IP_MULTIPLE_TABLES
1599         set_class_tag(rt, res->tclassid);
1600 #endif
1601         set_class_tag(rt, itag);
1602 #endif
1603 }
1604
1605 struct rtable *rt_dst_alloc(struct net_device *dev,
1606                             unsigned int flags, u16 type,
1607                             bool nopolicy, bool noxfrm, bool will_cache)
1608 {
1609         struct rtable *rt;
1610
1611         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1612                        (will_cache ? 0 : DST_HOST) |
1613                        (nopolicy ? DST_NOPOLICY : 0) |
1614                        (noxfrm ? DST_NOXFRM : 0));
1615
1616         if (rt) {
1617                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1618                 rt->rt_flags = flags;
1619                 rt->rt_type = type;
1620                 rt->rt_is_input = 0;
1621                 rt->rt_iif = 0;
1622                 rt->rt_pmtu = 0;
1623                 rt->rt_mtu_locked = 0;
1624                 rt->rt_gateway = 0;
1625                 rt->rt_uses_gateway = 0;
1626                 rt->rt_table_id = 0;
1627                 INIT_LIST_HEAD(&rt->rt_uncached);
1628
1629                 rt->dst.output = ip_output;
1630                 if (flags & RTCF_LOCAL)
1631                         rt->dst.input = ip_local_deliver;
1632         }
1633
1634         return rt;
1635 }
1636 EXPORT_SYMBOL(rt_dst_alloc);
1637
1638 /* called in rcu_read_lock() section */
1639 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1640                           u8 tos, struct net_device *dev,
1641                           struct in_device *in_dev, u32 *itag)
1642 {
1643         int err;
1644
1645         /* Primary sanity checks. */
1646         if (!in_dev)
1647                 return -EINVAL;
1648
1649         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1650             skb->protocol != htons(ETH_P_IP))
1651                 return -EINVAL;
1652
1653         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1654                 return -EINVAL;
1655
1656         if (ipv4_is_zeronet(saddr)) {
1657                 if (!ipv4_is_local_multicast(daddr))
1658                         return -EINVAL;
1659         } else {
1660                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1661                                           in_dev, itag);
1662                 if (err < 0)
1663                         return err;
1664         }
1665         return 0;
1666 }
1667
1668 /* Attach a freshly allocated input route to a packet whose destination
      * is multicast.
      *
      * A non-zero @our adds RTCF_LOCAL to the route flags.  Returns 0 on
      * success, the error from ip_mc_validate_source(), or -ENOBUFS when the
      * route cannot be allocated.
      *
      * called in rcu_read_lock() section
      */
1669 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1670                              u8 tos, struct net_device *dev, int our)
1671 {
1672         struct in_device *in_dev = __in_dev_get_rcu(dev);
1673         unsigned int flags = RTCF_MULTICAST;
1674         struct rtable *rth;
1675         u32 itag = 0;
1676         int err;
1677
             /* ip_mc_validate_source() fails on a NULL in_dev, so in_dev can
              * be dereferenced once this call has succeeded.
              */
1678         err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1679         if (err)
1680                 return err;
1681
1682         if (our)
1683                 flags |= RTCF_LOCAL;
1684
             /* The route is allocated against the namespace's loopback
              * device, not against the receiving device.
              */
1685         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1686                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1687         if (!rth)
1688                 return -ENOBUFS;
1689
1690 #ifdef CONFIG_IP_ROUTE_CLASSID
1691         rth->dst.tclassid = itag;
1692 #endif
             /* NOTE(review): output is pointed at ip_rt_bug - presumably an
              * input route is never expected to be used for transmission.
              */
1693         rth->dst.output = ip_rt_bug;
1694         rth->rt_is_input= 1;
1695
1696 #ifdef CONFIG_IP_MROUTE
             /* With multicast forwarding enabled on the device, destinations
              * that are not local multicast are handed to ip_mr_input.
              */
1697         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1698                 rth->dst.input = ip_mr_input;
1699 #endif
1700         RT_CACHE_STAT_INC(in_slow_mc);
1701
             /* Replace whatever dst the skb carried with the new route. */
1702         skb_dst_drop(skb);
1703         skb_dst_set(skb, &rth->dst);
1704         return 0;
1705 }
1706
1707
     /* Account for a packet whose source address was rejected.
      *
      * The in_martian_src counter is always incremented.  When
      * CONFIG_IP_ROUTE_VERBOSE is set and martian logging is enabled for
      * in_dev, a rate-limited warning is printed, followed by a hex dump of
      * the link-layer header if one is available.
      */
1708 static void ip_handle_martian_source(struct net_device *dev,
1709                                      struct in_device *in_dev,
1710                                      struct sk_buff *skb,
1711                                      __be32 daddr,
1712                                      __be32 saddr)
1713 {
1714         RT_CACHE_STAT_INC(in_martian_src);
1715 #ifdef CONFIG_IP_ROUTE_VERBOSE
1716         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1717                 /*
1718                  *      RFC1812 recommendation, if source is martian,
1719                  *      the only hint is MAC header.
1720                  */
                     /* NOTE(review): the first %pI4 is fed daddr and the
                      * second one saddr.
                      */
1721                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1722                         &daddr, &saddr, dev->name);
                     /* Dump only when the device has a link-layer header
                      * length and the skb's MAC header has been set.
                      */
1723                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1724                         print_hex_dump(KERN_WARNING, "ll header: ",
1725                                        DUMP_PREFIX_OFFSET, 16, 1,
1726                                        skb_mac_header(skb),
1727                                        dev->hard_header_len, true);
1728                 }
1729         }
1730 #endif
1731 }
1732
     /* Interpose the lightweight-tunnel handlers on a route.
      *
      * For each direction that the route's lwtstate asks to redirect, the
      * current dst handler is saved in the lwtstate (orig_output /
      * orig_input) and replaced by lwtunnel_output / lwtunnel_input.
      */
1733 static void set_lwt_redirect(struct rtable *rth)
1734 {
1735         if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1736                 rth->dst.lwtstate->orig_output = rth->dst.output;
1737                 rth->dst.output = lwtunnel_output;
1738         }
1739
1740         if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1741                 rth->dst.lwtstate->orig_input = rth->dst.input;
1742                 rth->dst.input = lwtunnel_input;
1743         }
1744 }
1745
1746 /* Create, or reuse from the nexthop cache, the forwarding route for an
      * input packet and attach it to the skb.
      *
      * Returns 0 on success.  Failures are: -EINVAL when the output device
      * has no in_device or when a non-IP packet would go back out of the
      * receiving interface without proxy_arp_pvlan; -ENOBUFS when the route
      * cannot be allocated; or the negative result of fib_validate_source().
      *
      * called in rcu_read_lock() section
      */
1747 static int __mkroute_input(struct sk_buff *skb,
1748                            const struct fib_result *res,
1749                            struct in_device *in_dev,
1750                            __be32 daddr, __be32 saddr, u32 tos)
1751 {
1752         struct fib_nh_exception *fnhe;
1753         struct rtable *rth;
1754         int err;
1755         struct in_device *out_dev;
1756         bool do_cache;
1757         u32 itag = 0;
1758
1759         /* get a working reference to the output device */
1760         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1761         if (!out_dev) {
1762                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1763                 return -EINVAL;
1764         }
1765
1766         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1767                                   in_dev->dev, in_dev, &itag);
1768         if (err < 0) {
1769                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1770                                          saddr);
1771
1772                 goto cleanup;
1773         }
1774
             /* Only results that carry fib_info and produced no itag are
              * eligible for the nexthop route cache.
              */
1775         do_cache = res->fi && !itag;
             /* Mark the skb with IPSKB_DOREDIRECT when the packet leaves
              * through the interface it arrived on, fib_validate_source()
              * returned a non-zero value, redirects are enabled on that
              * device, the packet is IP, and either the medium is shared or
              * inet_addr_onlink() holds for saddr and the route's gateway.
              */
1776         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1777             skb->protocol == htons(ETH_P_IP) &&
1778             (IN_DEV_SHARED_MEDIA(out_dev) ||
1779              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1780                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1781
1782         if (skb->protocol != htons(ETH_P_IP)) {
1783                 /* Not IP (i.e. ARP). Do not create a route if it is
1784                  * invalid for proxy arp. DNAT routes are always valid.
1785                  *
1786                  * The proxy arp feature has been extended to allow ARP
1787                  * replies back to the same interface, to support
1788                  * Private VLAN switch technologies. See arp.c.
1789                  */
1790                 if (out_dev == in_dev &&
1791                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1792                         err = -EINVAL;
1793                         goto cleanup;
1794                 }
1795         }
1796
             /* Prefer the input route cached on a nexthop exception for
              * daddr, otherwise the one cached on the nexthop itself.  A
              * valid cached route is attached with skb_dst_set_noref().
              */
1797         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1798         if (do_cache) {
1799                 if (fnhe)
1800                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1801                 else
1802                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1803                 if (rt_cache_valid(rth)) {
1804                         skb_dst_set_noref(skb, &rth->dst);
1805                         goto out;
1806                 }
1807         }
1808
             /* No usable cached route: build a new one on the output device.
              * NOPOLICY is read from the input device's configuration and
              * NOXFRM from the output device's.
              */
1809         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1810                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1811                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1812         if (!rth) {
1813                 err = -ENOBUFS;
1814                 goto cleanup;
1815         }
1816
1817         rth->rt_is_input = 1;
1818         if (res->table)
1819                 rth->rt_table_id = res->table->tb_id;
1820         RT_CACHE_STAT_INC(in_slow_tot);
1821
1822         rth->dst.input = ip_forward;
1823
1824         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1825                        do_cache);
1826         set_lwt_redirect(rth);
1827         skb_dst_set(skb, &rth->dst);
1828 out:
             /* Success: discard any positive value left in err by
              * fib_validate_source().
              */
1829         err = 0;
1830  cleanup:
1831         return err;
1832 }
1833
1834 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1835 /* To make ICMP packets follow the right flow, the multipath hash is
1836  * calculated from the inner IP addresses.
      *
      * Fills the IPv4 source/destination of @hash_keys from the skb's IP
      * header.  For the ICMP message types listed below, the addresses are
      * replaced by those of the IP header embedded after the ICMP header,
      * provided both headers can be read from the skb.
1837  */
1838 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1839                                  struct flow_keys *hash_keys)
1840 {
1841         const struct iphdr *outer_iph = ip_hdr(skb);
1842         const struct iphdr *inner_iph;
1843         const struct icmphdr *icmph;
1844         struct iphdr _inner_iph;
1845         struct icmphdr _icmph;
1846
             /* Default: hash on the outer addresses. */
1847         hash_keys->addrs.v4addrs.src = outer_iph->saddr;
1848         hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
1849         if (likely(outer_iph->protocol != IPPROTO_ICMP))
1850                 return;
1851
             /* A non-zero fragment offset means this is not the first
              * fragment, so no ICMP header follows the IP header.
              */
1852         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1853                 return;
1854
             /* _icmph is scratch space for skb_header_pointer(); a NULL
              * result leaves the outer addresses in place.
              */
1855         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1856                                    &_icmph);
1857         if (!icmph)
1858                 return;
1859
             /* Only these ICMP types are treated as carrying the IP header
              * of another packet right after the ICMP header.
              */
1860         if (icmph->type != ICMP_DEST_UNREACH &&
1861             icmph->type != ICMP_REDIRECT &&
1862             icmph->type != ICMP_TIME_EXCEEDED &&
1863             icmph->type != ICMP_PARAMETERPROB)
1864                 return;
1865
1866         inner_iph = skb_header_pointer(skb,
1867                                        outer_iph->ihl * 4 + sizeof(_icmph),
1868                                        sizeof(_inner_iph), &_inner_iph);
1869         if (!inner_iph)
1870                 return;
1871         hash_keys->addrs.v4addrs.src = inner_iph->saddr;
1872         hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
1873 }
1874
1875 /* Compute the multipath hash for a flow according to the namespace's
      * sysctl_fib_multipath_hash_policy.
      *
      * Policy 0 hashes the IPv4 source and destination addresses only.
      * Policy 1 additionally hashes the ports and the IP protocol.
      *
      * if skb is set it will be used and fl4 can be NULL
      *
      * The 32-bit hash is shifted right by one before being returned, so
      * the returned int is never negative.
      *
      * NOTE(review): the switch has no default case, so hash_keys is only
      * initialised for policy values 0 and 1 - presumably the sysctl is
      * restricted to that range; confirm.
      */
1876 int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
1877                        const struct sk_buff *skb)
1878 {
1879         struct net *net = fi->fib_net;
1880         struct flow_keys hash_keys;
1881         u32 mhash;
1882
1883         switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1884         case 0:
1885                 memset(&hash_keys, 0, sizeof(hash_keys));
1886                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1887                 if (skb) {
1888                         ip_multipath_l3_keys(skb, &hash_keys);
1889                 } else {
1890                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1891                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1892                 }
1893                 break;
1894         case 1:
1895                 /* skb is currently provided only when forwarding */
1896                 if (skb) {
1897                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1898                         struct flow_keys keys;
1899
1900                         /* short-circuit if we already have L4 hash present */
1901                         if (skb->l4_hash)
1902                                 return skb_get_hash_raw(skb) >> 1;
1903                         memset(&hash_keys, 0, sizeof(hash_keys));
1904                         skb_flow_dissect_flow_keys(skb, &keys, flag);
1905
                             /* Copy only the fields that take part in the
                              * hash from the dissected keys.
                              */
1906                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1907                         hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1908                         hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1909                         hash_keys.ports.src = keys.ports.src;
1910                         hash_keys.ports.dst = keys.ports.dst;
1911                         hash_keys.basic.ip_proto = keys.basic.ip_proto;
1912                 } else {
                             /* No skb: take the same fields from the flow. */
1913                         memset(&hash_keys, 0, sizeof(hash_keys));
1914                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1915                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1916                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1917                         hash_keys.ports.src = fl4->fl4_sport;
1918                         hash_keys.ports.dst = fl4->fl4_dport;
1919                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
1920                 }
1921                 break;
1922         }
1923         mhash = flow_hash_from_keys(&hash_keys);
1924
1925         return mhash >> 1;
1926 }
1927 EXPORT_SYMBOL_GPL(fib_multipath_hash);
1928 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1929
     /* Pick the nexthop for a multipath result (when CONFIG_IP_ROUTE_MULTIPATH
      * is enabled and the fib_info has more than one nexthop) and then build
      * the input route with __mkroute_input(), whose return value is passed
      * back unchanged.
      */
1930 static int ip_mkroute_input(struct sk_buff *skb,
1931                             struct fib_result *res,
1932                             struct in_device *in_dev,
1933                             __be32 daddr, __be32 saddr, u32 tos)
1934 {
1935 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1936         if (res->fi && res->fi->fib_nhs > 1) {
                     /* The hash is derived from the skb; no flowi4 is passed. */
1937                 int h = fib_multipath_hash(res->fi, NULL, skb);
1938
1939                 fib_select_multipath(res, h);
1940         }
1941 #endif
1942
1943         /* create a routing cache entry */
1944         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1945 }
1946
1947 /*
1948  *      NOTE. We drop all the packets that have local source
1949  *      addresses, because every properly looped back packet
1950  *      must have correct destination already attached by output routine.
1951  *
1952  *      Such approach solves two big problems:
1953  *      1. Not simplex devices are handled properly.
1954  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1955  *      called with rcu_read_lock()
      *
      *      Resolves the input route for a non-multicast destination and
      *      attaches it to the skb.  Returns 0 once a dst has been attached
      *      (including the RTN_UNREACHABLE route built on the no_route
      *      path), or a negative errno: -EINVAL for a disabled device or
      *      martian addresses, -ENOBUFS on allocation failure, or the error
      *      from fib_validate_source() / ip_mkroute_input().
1956  */
1957
1958 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1959                                u8 tos, struct net_device *dev,
1960                                struct fib_result *res)
1961 {
1962         struct in_device *in_dev = __in_dev_get_rcu(dev);
1963         struct ip_tunnel_info *tun_info;
1964         struct flowi4   fl4;
1965         unsigned int    flags = 0;
1966         u32             itag = 0;
1967         struct rtable   *rth;
1968         int             err = -EINVAL;
1969         struct net    *net = dev_net(dev);
1970         bool do_cache;
1971
1972         /* IP on this device is disabled. */
1973
1974         if (!in_dev)
1975                 goto out;
1976
1977         /* Check for the weirdest martians, which cannot be detected
1978            by fib_lookup.
1979          */
1980
             /* The tunnel id enters the flow key only when the skb's tunnel
              * info is not marked IP_TUNNEL_INFO_TX; otherwise it is 0.
              */
1981         tun_info = skb_tunnel_info(skb);
1982         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1983                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1984         else
1985                 fl4.flowi4_tun_key.tun_id = 0;
1986         skb_dst_drop(skb);
1987
             /* err is still -EINVAL on this jump. */
1988         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1989                 goto martian_source;
1990
             /* Cleared before the jump because local_input reads res->fi and
              * res->table without a fib_lookup() having filled them in.
              */
1991         res->fi = NULL;
1992         res->table = NULL;
1993         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1994                 goto brd_input;
1995
1996         /* Accept zero addresses only to limited broadcast;
1997          * I do not even know whether to fix it or not. Waiting for complaints :-)
1998          */
1999         if (ipv4_is_zeronet(saddr))
2000                 goto martian_source;
2001
2002         if (ipv4_is_zeronet(daddr))
2003                 goto martian_destination;
2004
2005         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2006          * and calls it once if daddr and/or saddr are loopback addresses
2007          */
2008         if (ipv4_is_loopback(daddr)) {
2009                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2010                         goto martian_destination;
2011         } else if (ipv4_is_loopback(saddr)) {
2012                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2013                         goto martian_source;
2014         }
2015
2016         /*
2017          *      Now we are ready to route packet.
2018          */
2019         fl4.flowi4_oif = 0;
2020         fl4.flowi4_iif = dev->ifindex;
2021         fl4.flowi4_mark = skb->mark;
2022         fl4.flowi4_tos = tos;
2023         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2024         fl4.flowi4_flags = 0;
2025         fl4.daddr = daddr;
2026         fl4.saddr = saddr;
2027         fl4.flowi4_uid = sock_net_uid(net, NULL);
2028         err = fib_lookup(net, &fl4, res, 0);
2029         if (err != 0) {
                     /* The lookup error is kept (it later becomes dst.error
                      * at no_route) unless forwarding is disabled on the
                      * device, in which case -EHOSTUNREACH is used.
                      */
2030                 if (!IN_DEV_FORWARD(in_dev))
2031                         err = -EHOSTUNREACH;
2032                 goto no_route;
2033         }
2034
2035         if (res->type == RTN_BROADCAST)
2036                 goto brd_input;
2037
2038         if (res->type == RTN_LOCAL) {
2039                 err = fib_validate_source(skb, saddr, daddr, tos,
2040                                           0, dev, in_dev, &itag);
2041                 if (err < 0)
2042                         goto martian_source;
2043                 goto local_input;
2044         }
2045
             /* Everything below is forwarding: it needs forwarding enabled
              * on the input device and a unicast route type.
              */
2046         if (!IN_DEV_FORWARD(in_dev)) {
2047                 err = -EHOSTUNREACH;
2048                 goto no_route;
2049         }
2050         if (res->type != RTN_UNICAST)
2051                 goto martian_destination;
2052
2053         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2054 out:    return err;
2055
2056 brd_input:
             /* Broadcast input is accepted for IP packets only. */
2057         if (skb->protocol != htons(ETH_P_IP))
2058                 goto e_inval;
2059
             /* A zeronet source skips source validation here. */
2060         if (!ipv4_is_zeronet(saddr)) {
2061                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2062                                           in_dev, &itag);
2063                 if (err < 0)
2064                         goto martian_source;
2065         }
2066         flags |= RTCF_BROADCAST;
2067         res->type = RTN_BROADCAST;
2068         RT_CACHE_STAT_INC(in_brd);
2069
2070 local_input:
             /* The nexthop's cached input route is used, or a new route is
              * cached there, only when the result has fib_info and no itag
              * was produced.
              */
2071         do_cache = false;
2072         if (res->fi) {
2073                 if (!itag) {
2074                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
2075                         if (rt_cache_valid(rth)) {
2076                                 skb_dst_set_noref(skb, &rth->dst);
2077                                 err = 0;
2078                                 goto out;
2079                         }
2080                         do_cache = true;
2081                 }
2082         }
2083
             /* The route is allocated on the L3 master device when
              * l3mdev_master_dev_rcu() returns one, else on the loopback
              * device of the namespace.
              */
2084         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2085                            flags | RTCF_LOCAL, res->type,
2086                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2087         if (!rth)
2088                 goto e_nobufs;
2089
2090         rth->dst.output= ip_rt_bug;
2091 #ifdef CONFIG_IP_ROUTE_CLASSID
2092         rth->dst.tclassid = itag;
2093 #endif
2094         rth->rt_is_input = 1;
2095         if (res->table)
2096                 rth->rt_table_id = res->table->tb_id;
2097
2098         RT_CACHE_STAT_INC(in_slow_tot);
             /* Reached via no_route: the route delivers to ip_error, records
              * the positive form of the pending error, and loses RTCF_LOCAL.
              */
2099         if (res->type == RTN_UNREACHABLE) {
2100                 rth->dst.input= ip_error;
2101                 rth->dst.error= -err;
2102                 rth->rt_flags   &= ~RTCF_LOCAL;
2103         }
2104
2105         if (do_cache) {
2106                 struct fib_nh *nh = &FIB_RES_NH(*res);
2107
                     /* Take the nexthop's lwtstate and, if it redirects
                      * input, interpose lwtunnel_input while remembering the
                      * previous handler.
                      */
2108                 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2109                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2110                         WARN_ON(rth->dst.input == lwtunnel_input);
2111                         rth->dst.lwtstate->orig_input = rth->dst.input;
2112                         rth->dst.input = lwtunnel_input;
2113                 }
2114
                     /* If the route could not be stored in the nexthop
                      * cache, put it on the uncached list instead.
                      */
2115                 if (unlikely(!rt_cache_route(nh, rth)))
2116                         rt_add_uncached_list(rth);
2117         }
2118         skb_dst_set(skb, &rth->dst);
2119         err = 0;
2120         goto out;
2121
2122 no_route:
             /* err is left untouched here; local_input stores -err in the
              * RTN_UNREACHABLE route it builds.
              */
2123         RT_CACHE_STAT_INC(in_no_route);
2124         res->type = RTN_UNREACHABLE;
2125         res->fi = NULL;
2126         res->table = NULL;
2127         goto local_input;
2128
2129         /*
2130          *      Do not cache martian addresses: they should be logged (RFC1812)
2131          */
2132 martian_destination:
2133         RT_CACHE_STAT_INC(in_martian_dst);
2134 #ifdef CONFIG_IP_ROUTE_VERBOSE
2135         if (IN_DEV_LOG_MARTIANS(in_dev))
2136                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2137                                      &daddr, &saddr, dev->name);
2138 #endif
2139
             /* martian_destination falls through to -EINVAL. */
2140 e_inval:
2141         err = -EINVAL;
2142         goto out;
2143
2144 e_nobufs:
2145         err = -ENOBUFS;
2146         goto out;
2147
2148 martian_source:
             /* err is returned as set by whoever jumped here: the initial
              * -EINVAL or a negative fib_validate_source() result.
              */
2149         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2150         goto out;
2151 }
2152
     /* Resolve the input route for an skb and attach it as the skb's dst.
      *
      * @tos is masked with IPTOS_RT_MASK, then ip_route_input_rcu() is run
      * inside an RCU read-side critical section.  The fib_result it fills
      * in is local and discarded.  Returns the value of
      * ip_route_input_rcu(): 0 on success or a negative errno.
      *
      * Some paths below attach the dst with skb_dst_set_noref().
      */
2153 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2154                          u8 tos, struct net_device *dev)
2155 {
2156         struct fib_result res;
2157         int err;
2158
2159         tos &= IPTOS_RT_MASK;
2160         rcu_read_lock();
2161         err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2162         rcu_read_unlock();
2163
2164         return err;
2165 }
2166 EXPORT_SYMBOL(ip_route_input_noref);
2167
2168 /* Input route resolution entry point.
      *
      * Multicast destinations are handled here and routed through
      * ip_route_input_mc(); everything else goes to ip_route_input_slow().
      * @res is only passed on to ip_route_input_slow().  Returns 0 on
      * success or a negative errno.
      *
      * called with rcu_read_lock held
      */
2169 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2170                        u8 tos, struct net_device *dev, struct fib_result *res)
2171 {
2172         /* Multicast recognition logic is moved from route cache to here.
2173            The problem was that too many Ethernet cards have broken/missing
2174            hardware multicast filters :-( As result the host on multicasting
2175            network acquires a lot of useless route cache entries, sort of
2176            SDR messages from all the world. Now we try to get rid of them.
2177            Really, provided software IP multicast filter is organized
2178            reasonably (at least, hashed), it does not result in a slowdown
2179            comparing with route cache reject entries.
2180            Note, that multicast routers are not affected, because
2181            route cache entry is created eventually.
2182          */
2183         if (ipv4_is_multicast(daddr)) {
2184                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2185                 int our = 0;
2186                 int err = -EINVAL;
2187
                     /* No in_device on the receiving device: -EINVAL. */
2188                 if (!in_dev)
2189                         return err;
2190                 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2191                                       ip_hdr(skb)->protocol);
2192
2193                 /* check l3 master if no match yet */
                     /* NOTE(review): this looks up the in_device of skb->dev,
                      * presumably the L3 master device at this point -
                      * confirm.
                      */
2194                 if (!our && netif_is_l3_slave(dev)) {
2195                         struct in_device *l3_in_dev;
2196
2197                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2198                         if (l3_in_dev)
2199                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2200                                                       ip_hdr(skb)->protocol);
2201                 }
2202
                     /* A route is built only if the group matched, or - with
                      * CONFIG_IP_MROUTE - if the destination is not local
                      * multicast and multicast forwarding is enabled on the
                      * device.  Otherwise err stays -EINVAL.
                      */
2203                 if (our
2204 #ifdef CONFIG_IP_MROUTE
2205                         ||
2206                     (!ipv4_is_local_multicast(daddr) &&
2207                      IN_DEV_MFORWARD(in_dev))
2208 #endif
2209                    ) {
2210                         err = ip_route_input_mc(skb, daddr, saddr,
2211                                                 tos, dev, our);
2212                 }
2213                 return err;
2214         }
2215
2216         return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2217 }
2218
2219 /* Build, or fetch from the nexthop cache, the output route for a flow
      * that has already been resolved to @res and @dev_out.
      *
      * Returns the route, or ERR_PTR(-EINVAL) when dev_out has no in_device,
      * when a loopback source address is not permitted on dev_out, or when
      * the destination is a zeronet address; ERR_PTR(-ENOBUFS) when the
      * route cannot be allocated.
      *
      * called with rcu_read_lock()
      */
2220 static struct rtable *__mkroute_output(const struct fib_result *res,
2221                                        const struct flowi4 *fl4, int orig_oif,
2222                                        struct net_device *dev_out,
2223                                        unsigned int flags)
2224 {
2225         struct fib_info *fi = res->fi;
2226         struct fib_nh_exception *fnhe;
2227         struct in_device *in_dev;
2228         u16 type = res->type;
2229         struct rtable *rth;
2230         bool do_cache;
2231
2232         in_dev = __in_dev_get_rcu(dev_out);
2233         if (!in_dev)
2234                 return ERR_PTR(-EINVAL);
2235
             /* Unless IN_DEV_ROUTE_LOCALNET() is set, a loopback source
              * address may only leave through a loopback device or an L3
              * master device.
              */
2236         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2237                 if (ipv4_is_loopback(fl4->saddr) &&
2238                     !(dev_out->flags & IFF_LOOPBACK) &&
2239                     !netif_is_l3_master(dev_out))
2240                         return ERR_PTR(-EINVAL);
2241
             /* The destination address class overrides the looked-up type. */
2242         if (ipv4_is_lbcast(fl4->daddr))
2243                 type = RTN_BROADCAST;
2244         else if (ipv4_is_multicast(fl4->daddr))
2245                 type = RTN_MULTICAST;
2246         else if (ipv4_is_zeronet(fl4->daddr))
2247                 return ERR_PTR(-EINVAL);
2248
2249         if (dev_out->flags & IFF_LOOPBACK)
2250                 flags |= RTCF_LOCAL;
2251
2252         do_cache = true;
2253         if (type == RTN_BROADCAST) {
                     /* Dropping fi also disables caching further down. */
2254                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2255                 fi = NULL;
2256         } else if (type == RTN_MULTICAST) {
                     /* RTCF_LOCAL is kept only when ip_check_mc_rcu()
                      * matches on dev_out; such routes are not cached.
                      */
2257                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2258                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2259                                      fl4->flowi4_proto))
2260                         flags &= ~RTCF_LOCAL;
2261                 else
2262                         do_cache = false;
2263                 /* If multicast route do not exist use
2264                  * default one, but do not gateway in this case.
2265                  * Yes, it is hack.
2266                  */
2267                 if (fi && res->prefixlen < 4)
2268                         fi = NULL;
2269         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2270                    (orig_oif != dev_out->ifindex)) {
2271                 /* For local routes that require a particular output interface
2272                  * we do not want to cache the result.  Caching the result
2273                  * causes incorrect behaviour when there are multiple source
2274                  * addresses on the interface, the end result being that if the
2275                  * intended recipient is waiting on that interface for the
2276                  * packet he won't receive it because it will be delivered on
2277                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2278                  * be set to the loopback interface as well.
2279                  */
2280                 do_cache = false;
2281         }
2282
2283         fnhe = NULL;
             /* Caching needs fib_info: the cache slots live on the nexthop. */
2284         do_cache &= fi != NULL;
2285         if (fi) {
2286                 struct rtable __rcu **prth;
2287                 struct fib_nh *nh = &FIB_RES_NH(*res);
2288
                     /* The exception is looked up even when caching is off,
                      * because it is passed to rt_set_nexthop() below.
                      */
2289                 fnhe = find_exception(nh, fl4->daddr);
2290                 if (!do_cache)
2291                         goto add;
2292                 if (fnhe) {
2293                         prth = &fnhe->fnhe_rth_output;
2294                 } else {
                             /* With FLOWI_FLAG_KNOWN_NH the route is left
                              * uncached unless the nexthop has a gateway
                              * and link scope.
                              */
2295                         if (unlikely(fl4->flowi4_flags &
2296                                      FLOWI_FLAG_KNOWN_NH &&
2297                                      !(nh->nh_gw &&
2298                                        nh->nh_scope == RT_SCOPE_LINK))) {
2299                                 do_cache = false;
2300                                 goto add;
2301                         }
                             /* Otherwise use this CPU's slot of the
                              * nexthop's per-cpu output route cache.
                              */
2302                         prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2303                 }
                     /* Reuse the cached route only if it is still valid and
                      * dst_hold_safe() succeeds on it.
                      */
2304                 rth = rcu_dereference(*prth);
2305                 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2306                         return rth;
2307         }
2308
2309 add:
2310         rth = rt_dst_alloc(dev_out, flags, type,
2311                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2312                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2313                            do_cache);
2314         if (!rth)
2315                 return ERR_PTR(-ENOBUFS);
2316
2317         rth->rt_iif = orig_oif;
2318         if (res->table)
2319                 rth->rt_table_id = res->table->tb_id;
2320
2321         RT_CACHE_STAT_INC(out_slow_tot);
2322
2323         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
                     /* Broadcast/multicast flagged RTCF_LOCAL on a
                      * non-loopback device is sent through ip_mc_output.
                      */
2324                 if (flags & RTCF_LOCAL &&
2325                     !(dev_out->flags & IFF_LOOPBACK)) {
2326                         rth->dst.output = ip_mc_output;
2327                         RT_CACHE_STAT_INC(out_slow_mc);
2328                 }
2329 #ifdef CONFIG_IP_MROUTE
2330                 if (type == RTN_MULTICAST) {
                             /* With multicast forwarding enabled and a
                              * destination that is not local multicast,
                              * use the multicast input/output handlers.
                              */
2331                         if (IN_DEV_MFORWARD(in_dev) &&
2332                             !ipv4_is_local_multicast(fl4->daddr)) {
2333                                 rth->dst.input = ip_mr_input;
2334                                 rth->dst.output = ip_mc_output;
2335                         }
2336                 }
2337 #endif
2338         }
2339
2340         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2341         set_lwt_redirect(rth);
2342
2343         return rth;
2344 }
2345
2346 /*
2347  * Major route resolver routine.
      *
      * Normalises the flow key and resolves the output route under
      * rcu_read_lock():
      *  - flowi4_iif is set to LOOPBACK_IFINDEX;
      *  - flowi4_tos is reduced to its IPTOS_RT_MASK bits;
      *  - flowi4_scope becomes RT_SCOPE_LINK when RTO_ONLINK is set in the
      *    original tos value, RT_SCOPE_UNIVERSE otherwise.
      *
      * Returns whatever ip_route_output_key_hash_rcu() returns: a route or
      * an ERR_PTR()-encoded error.  The fib_result is local and discarded.
2348  */
2349
2350 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2351                                         const struct sk_buff *skb)
2352 {
2353         __u8 tos = RT_FL_TOS(fl4);
2354         struct fib_result res = {
2355                 .type           = RTN_UNSPEC,
2356                 .fi             = NULL,
2357                 .table          = NULL,
2358                 .tclassid       = 0,
2359         };
2360         struct rtable *rth;
2361
2362         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2363         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2364         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2365                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2366
2367         rcu_read_lock();
2368         rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2369         rcu_read_unlock();
2370
2371         return rth;
2372 }
2373 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2374
2375 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2376                                             struct fib_result *res,
2377                                             const struct sk_buff *skb)
2378 {
2379         struct net_device *dev_out = NULL;
2380         int orig_oif = fl4->flowi4_oif;
2381         unsigned int flags = 0;
2382         struct rtable *rth;
2383         int err;
2384
2385         if (fl4->saddr) {
2386                 if (ipv4_is_multicast(fl4->saddr) ||
2387                     ipv4_is_lbcast(fl4->saddr) ||
2388                     ipv4_is_zeronet(fl4->saddr)) {
2389                         rth = ERR_PTR(-EINVAL);
2390                         goto out;
2391                 }
2392
2393                 rth = ERR_PTR(-ENETUNREACH);
2394
2395                 /* I removed check for oif == dev_out->oif here.
2396                    It was wrong for two reasons:
2397                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2398                       is assigned to multiple interfaces.
2399                    2. Moreover, we are allowed to send packets with saddr
2400                       of another iface. --ANK
2401                  */
2402
2403                 if (fl4->flowi4_oif == 0 &&
2404                     (ipv4_is_multicast(fl4->daddr) ||
2405                      ipv4_is_lbcast(fl4->daddr))) {
2406                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2407                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2408                         if (!dev_out)
2409                                 goto out;
2410
2411                         /* Special hack: user can direct multicasts
2412                            and limited broadcast via necessary interface
2413                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2414                            This hack is not just for fun, it allows
2415                            vic,vat and friends to work.
2416                            They bind socket to loopback, set ttl to zero
2417                            and expect that it will work.
2418                            From the viewpoint of routing cache they are broken,
2419                            because we are not allowed to build multicast path
2420                            with loopback source addr (look, routing cache
2421                            cannot know, that ttl is zero, so that packet
2422                            will not leave this host and route is valid).
2423                            Luckily, this hack is good workaround.
2424                          */
2425
2426                         fl4->flowi4_oif = dev_out->ifindex;
2427                         goto make_route;
2428                 }
2429
2430                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2431                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2432                         if (!__ip_dev_find(net, fl4->saddr, false))
2433                                 goto out;
2434                 }
2435         }
2436
2437
2438         if (fl4->flowi4_oif) {
2439                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2440                 rth = ERR_PTR(-ENODEV);
2441                 if (!dev_out)
2442                         goto out;
2443
2444                 /* RACE: Check return value of inet_select_addr instead. */
2445                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2446                         rth = ERR_PTR(-ENETUNREACH);
2447                         goto out;
2448                 }
2449                 if (ipv4_is_local_multicast(fl4->daddr) ||
2450                     ipv4_is_lbcast(fl4->daddr) ||
2451                     fl4->flowi4_proto == IPPROTO_IGMP) {
2452                         if (!fl4->saddr)
2453                                 fl4->saddr = inet_select_addr(dev_out, 0,
2454                                                               RT_SCOPE_LINK);
2455                         goto make_route;
2456                 }
2457                 if (!fl4->saddr) {
2458                         if (ipv4_is_multicast(fl4->daddr))
2459                                 fl4->saddr = inet_select_addr(dev_out, 0,
2460                                                               fl4->flowi4_scope);
2461                         else if (!fl4->daddr)
2462                                 fl4->saddr = inet_select_addr(dev_out, 0,
2463                                                               RT_SCOPE_HOST);
2464                 }
2465         }
2466
2467         if (!fl4->daddr) {
2468                 fl4->daddr = fl4->saddr;
2469                 if (!fl4->daddr)
2470                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2471                 dev_out = net->loopback_dev;
2472                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2473                 res->type = RTN_LOCAL;
2474                 flags |= RTCF_LOCAL;
2475                 goto make_route;
2476         }
2477
2478         err = fib_lookup(net, fl4, res, 0);
2479         if (err) {
2480                 res->fi = NULL;
2481                 res->table = NULL;
2482                 if (fl4->flowi4_oif &&
2483                     (ipv4_is_multicast(fl4->daddr) ||
2484                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2485                         /* Apparently, routing tables are wrong. Assume,
2486                            that the destination is on link.
2487
2488                            WHY? DW.
2489                            Because we are allowed to send to iface
2490                            even if it has NO routes and NO assigned
2491                            addresses. When oif is specified, routing
2492                            tables are looked up with only one purpose:
2493                            to catch if destination is gatewayed, rather than
2494                            direct. Moreover, if MSG_DONTROUTE is set,
2495                            we send packet, ignoring both routing tables
2496                            and ifaddr state. --ANK
2497
2498
2499                            We could make it even if oif is unknown,
2500                            likely IPv6, but we do not.
2501                          */
2502
2503                         if (fl4->saddr == 0)
2504                                 fl4->saddr = inet_select_addr(dev_out, 0,
2505                                                               RT_SCOPE_LINK);
2506                         res->type = RTN_UNICAST;
2507                         goto make_route;
2508                 }
2509                 rth = ERR_PTR(err);
2510                 goto out;
2511         }
2512
2513         if (res->type == RTN_LOCAL) {
2514                 if (!fl4->saddr) {
2515                         if (res->fi->fib_prefsrc)
2516                                 fl4->saddr = res->fi->fib_prefsrc;
2517                         else
2518                                 fl4->saddr = fl4->daddr;
2519                 }
2520
2521                 /* L3 master device is the loopback for that domain */
2522                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2523                         net->loopback_dev;
2524
2525                 /* make sure orig_oif points to fib result device even
2526                  * though packet rx/tx happens over loopback or l3mdev
2527                  */
2528                 orig_oif = FIB_RES_OIF(*res);
2529
2530                 fl4->flowi4_oif = dev_out->ifindex;
2531                 flags |= RTCF_LOCAL;
2532                 goto make_route;
2533         }
2534
2535         fib_select_path(net, res, fl4, skb);
2536
2537         dev_out = FIB_RES_DEV(*res);
2538
2539 make_route:
2540         rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2541
2542 out:
2543         return rth;
2544 }
2545
2546 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2547 {
2548         return NULL;
2549 }
2550
2551 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2552 {
2553         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2554
2555         return mtu ? : dst->dev->mtu;
2556 }
2557
2558 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2559                                           struct sk_buff *skb, u32 mtu,
2560                                           bool confirm_neigh)
2561 {
2562 }
2563
/* .redirect hook of ipv4_dst_blackhole_ops: a no-op. */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst,
				       struct sock *sk,
				       struct sk_buff *skb)
{
	/* nothing to do */
}
2568
2569 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2570                                           unsigned long old)
2571 {
2572         return NULL;
2573 }
2574
2575 static struct dst_ops ipv4_dst_blackhole_ops = {
2576         .family                 =       AF_INET,
2577         .check                  =       ipv4_blackhole_dst_check,
2578         .mtu                    =       ipv4_blackhole_mtu,
2579         .default_advmss         =       ipv4_default_advmss,
2580         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2581         .redirect               =       ipv4_rt_blackhole_redirect,
2582         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2583         .neigh_lookup           =       ipv4_neigh_lookup,
2584 };
2585
2586 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2587 {
2588         struct rtable *ort = (struct rtable *) dst_orig;
2589         struct rtable *rt;
2590
2591         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2592         if (rt) {
2593                 struct dst_entry *new = &rt->dst;
2594
2595                 new->__use = 1;
2596                 new->input = dst_discard;
2597                 new->output = dst_discard_out;
2598
2599                 new->dev = net->loopback_dev;
2600                 if (new->dev)
2601                         dev_hold(new->dev);
2602
2603                 rt->rt_is_input = ort->rt_is_input;
2604                 rt->rt_iif = ort->rt_iif;
2605                 rt->rt_pmtu = ort->rt_pmtu;
2606                 rt->rt_mtu_locked = ort->rt_mtu_locked;
2607
2608                 rt->rt_genid = rt_genid_ipv4(net);
2609                 rt->rt_flags = ort->rt_flags;
2610                 rt->rt_type = ort->rt_type;
2611                 rt->rt_gateway = ort->rt_gateway;
2612                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2613
2614                 INIT_LIST_HEAD(&rt->rt_uncached);
2615         }
2616
2617         dst_release(dst_orig);
2618
2619         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2620 }
2621
2622 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2623                                     const struct sock *sk)
2624 {
2625         struct rtable *rt = __ip_route_output_key(net, flp4);
2626
2627         if (IS_ERR(rt))
2628                 return rt;
2629
2630         if (flp4->flowi4_proto) {
2631                 flp4->flowi4_oif = rt->dst.dev->ifindex;
2632                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2633                                                         flowi4_to_flowi(flp4),
2634                                                         sk, 0);
2635         }
2636
2637         return rt;
2638 }
2639 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2640
2641 /* called with rcu_read_lock held */
2642 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2643                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2644                         u32 seq)
2645 {
2646         struct rtable *rt = skb_rtable(skb);
2647         struct rtmsg *r;
2648         struct nlmsghdr *nlh;
2649         unsigned long expires = 0;
2650         u32 error;
2651         u32 metrics[RTAX_MAX];
2652
2653         nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2654         if (!nlh)
2655                 return -EMSGSIZE;
2656
2657         r = nlmsg_data(nlh);
2658         r->rtm_family    = AF_INET;
2659         r->rtm_dst_len  = 32;
2660         r->rtm_src_len  = 0;
2661         r->rtm_tos      = fl4->flowi4_tos;
2662         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2663         if (nla_put_u32(skb, RTA_TABLE, table_id))
2664                 goto nla_put_failure;
2665         r->rtm_type     = rt->rt_type;
2666         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2667         r->rtm_protocol = RTPROT_UNSPEC;
2668         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2669         if (rt->rt_flags & RTCF_NOTIFY)
2670                 r->rtm_flags |= RTM_F_NOTIFY;
2671         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2672                 r->rtm_flags |= RTCF_DOREDIRECT;
2673
2674         if (nla_put_in_addr(skb, RTA_DST, dst))
2675                 goto nla_put_failure;
2676         if (src) {
2677                 r->rtm_src_len = 32;
2678                 if (nla_put_in_addr(skb, RTA_SRC, src))
2679                         goto nla_put_failure;
2680         }
2681         if (rt->dst.dev &&
2682             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2683                 goto nla_put_failure;
2684 #ifdef CONFIG_IP_ROUTE_CLASSID
2685         if (rt->dst.tclassid &&
2686             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2687                 goto nla_put_failure;
2688 #endif
2689         if (!rt_is_input_route(rt) &&
2690             fl4->saddr != src) {
2691                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2692                         goto nla_put_failure;
2693         }
2694         if (rt->rt_uses_gateway &&
2695             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2696                 goto nla_put_failure;
2697
2698         expires = rt->dst.expires;
2699         if (expires) {
2700                 unsigned long now = jiffies;
2701
2702                 if (time_before(now, expires))
2703                         expires -= now;
2704                 else
2705                         expires = 0;
2706         }
2707
2708         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2709         if (rt->rt_pmtu && expires)
2710                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2711         if (rt->rt_mtu_locked && expires)
2712                 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2713         if (rtnetlink_put_metrics(skb, metrics) < 0)
2714                 goto nla_put_failure;
2715
2716         if (fl4->flowi4_mark &&
2717             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2718                 goto nla_put_failure;
2719
2720         if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2721             nla_put_u32(skb, RTA_UID,
2722                         from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2723                 goto nla_put_failure;
2724
2725         error = rt->dst.error;
2726
2727         if (rt_is_input_route(rt)) {
2728 #ifdef CONFIG_IP_MROUTE
2729                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2730                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2731                         int err = ipmr_get_route(net, skb,
2732                                                  fl4->saddr, fl4->daddr,
2733                                                  r, portid);
2734
2735                         if (err <= 0) {
2736                                 if (err == 0)
2737                                         return 0;
2738                                 goto nla_put_failure;
2739                         }
2740                 } else
2741 #endif
2742                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2743                                 goto nla_put_failure;
2744         }
2745
2746         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2747                 goto nla_put_failure;
2748
2749         nlmsg_end(skb, nlh);
2750         return 0;
2751
2752 nla_put_failure:
2753         nlmsg_cancel(skb, nlh);
2754         return -EMSGSIZE;
2755 }
2756
2757 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2758                              struct netlink_ext_ack *extack)
2759 {
2760         struct net *net = sock_net(in_skb->sk);
2761         struct rtmsg *rtm;
2762         struct nlattr *tb[RTA_MAX+1];
2763         struct fib_result res = {};
2764         struct rtable *rt = NULL;
2765         struct flowi4 fl4;
2766         __be32 dst = 0;
2767         __be32 src = 0;
2768         u32 iif;
2769         int err;
2770         int mark;
2771         struct sk_buff *skb;
2772         u32 table_id = RT_TABLE_MAIN;
2773         kuid_t uid;
2774
2775         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2776                           extack);
2777         if (err < 0)
2778                 goto errout;
2779
2780         rtm = nlmsg_data(nlh);
2781
2782         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2783         if (!skb) {
2784                 err = -ENOBUFS;
2785                 goto errout;
2786         }
2787
2788         /* Reserve room for dummy headers, this skb can pass
2789            through good chunk of routing engine.
2790          */
2791         skb_reset_mac_header(skb);
2792         skb_reset_network_header(skb);
2793
2794         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2795         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2796         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2797         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2798         if (tb[RTA_UID])
2799                 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2800         else
2801                 uid = (iif ? INVALID_UID : current_uid());
2802
2803         /* Bugfix: need to give ip_route_input enough of an IP header to
2804          * not gag.
2805          */
2806         ip_hdr(skb)->protocol = IPPROTO_UDP;
2807         ip_hdr(skb)->saddr = src;
2808         ip_hdr(skb)->daddr = dst;
2809
2810         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2811
2812         memset(&fl4, 0, sizeof(fl4));
2813         fl4.daddr = dst;
2814         fl4.saddr = src;
2815         fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
2816         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2817         fl4.flowi4_mark = mark;
2818         fl4.flowi4_uid = uid;
2819
2820         rcu_read_lock();
2821
2822         if (iif) {
2823                 struct net_device *dev;
2824
2825                 dev = dev_get_by_index_rcu(net, iif);
2826                 if (!dev) {
2827                         err = -ENODEV;
2828                         goto errout_free;
2829                 }
2830
2831                 skb->protocol   = htons(ETH_P_IP);
2832                 skb->dev        = dev;
2833                 skb->mark       = mark;
2834                 err = ip_route_input_rcu(skb, dst, src,
2835                                          rtm->rtm_tos & IPTOS_RT_MASK, dev,
2836                                          &res);
2837
2838                 rt = skb_rtable(skb);
2839                 if (err == 0 && rt->dst.error)
2840                         err = -rt->dst.error;
2841         } else {
2842                 fl4.flowi4_iif = LOOPBACK_IFINDEX;
2843                 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
2844                 err = 0;
2845                 if (IS_ERR(rt))
2846                         err = PTR_ERR(rt);
2847                 else
2848                         skb_dst_set(skb, &rt->dst);
2849         }
2850
2851         if (err)
2852                 goto errout_free;
2853
2854         if (rtm->rtm_flags & RTM_F_NOTIFY)
2855                 rt->rt_flags |= RTCF_NOTIFY;
2856
2857         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2858                 table_id = rt->rt_table_id;
2859
2860         if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2861                 if (!res.fi) {
2862                         err = fib_props[res.type].error;
2863                         if (!err)
2864                                 err = -EHOSTUNREACH;
2865                         goto errout_free;
2866                 }
2867                 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2868                                     nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2869                                     rt->rt_type, res.prefix, res.prefixlen,
2870                                     fl4.flowi4_tos, res.fi, 0);
2871         } else {
2872                 err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2873                                    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
2874         }
2875         if (err < 0)
2876                 goto errout_free;
2877
2878         rcu_read_unlock();
2879
2880         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2881 errout:
2882         return err;
2883
2884 errout_free:
2885         rcu_read_unlock();
2886         kfree_skb(skb);
2887         goto errout;
2888 }
2889
2890 void ip_rt_multicast_event(struct in_device *in_dev)
2891 {
2892         rt_cache_flush(dev_net(in_dev->dev));
2893 }
2894
2895 #ifdef CONFIG_SYSCTL
2896 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2897 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2898 static int ip_rt_gc_elasticity __read_mostly    = 8;
2899 static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;
2900
2901 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2902                                         void __user *buffer,
2903                                         size_t *lenp, loff_t *ppos)
2904 {
2905         struct net *net = (struct net *)__ctl->extra1;
2906
2907         if (write) {
2908                 rt_cache_flush(net);
2909                 fnhe_genid_bump(net);
2910                 return 0;
2911         }
2912
2913         return -EINVAL;
2914 }
2915
2916 static struct ctl_table ipv4_route_table[] = {
2917         {
2918                 .procname       = "gc_thresh",
2919                 .data           = &ipv4_dst_ops.gc_thresh,
2920                 .maxlen         = sizeof(int),
2921                 .mode           = 0644,
2922                 .proc_handler   = proc_dointvec,
2923         },
2924         {
2925                 .procname       = "max_size",
2926                 .data           = &ip_rt_max_size,
2927                 .maxlen         = sizeof(int),
2928                 .mode           = 0644,
2929                 .proc_handler   = proc_dointvec,
2930         },
2931         {
2932                 /*  Deprecated. Use gc_min_interval_ms */
2933
2934                 .procname       = "gc_min_interval",
2935                 .data           = &ip_rt_gc_min_interval,
2936                 .maxlen         = sizeof(int),
2937                 .mode           = 0644,
2938                 .proc_handler   = proc_dointvec_jiffies,
2939         },
2940         {
2941                 .procname       = "gc_min_interval_ms",
2942                 .data           = &ip_rt_gc_min_interval,
2943                 .maxlen         = sizeof(int),
2944                 .mode           = 0644,
2945                 .proc_handler   = proc_dointvec_ms_jiffies,
2946         },
2947         {
2948                 .procname       = "gc_timeout",
2949                 .data           = &ip_rt_gc_timeout,
2950                 .maxlen         = sizeof(int),
2951                 .mode           = 0644,
2952                 .proc_handler   = proc_dointvec_jiffies,
2953         },
2954         {
2955                 .procname       = "gc_interval",
2956                 .data           = &ip_rt_gc_interval,
2957                 .maxlen         = sizeof(int),
2958                 .mode           = 0644,
2959                 .proc_handler   = proc_dointvec_jiffies,
2960         },
2961         {
2962                 .procname       = "redirect_load",
2963                 .data           = &ip_rt_redirect_load,
2964                 .maxlen         = sizeof(int),
2965                 .mode           = 0644,
2966                 .proc_handler   = proc_dointvec,
2967         },
2968         {
2969                 .procname       = "redirect_number",
2970                 .data           = &ip_rt_redirect_number,
2971                 .maxlen         = sizeof(int),
2972                 .mode           = 0644,
2973                 .proc_handler   = proc_dointvec,
2974         },
2975         {
2976                 .procname       = "redirect_silence",
2977                 .data           = &ip_rt_redirect_silence,
2978                 .maxlen         = sizeof(int),
2979                 .mode           = 0644,
2980                 .proc_handler   = proc_dointvec,
2981         },
2982         {
2983                 .procname       = "error_cost",
2984                 .data           = &ip_rt_error_cost,
2985                 .maxlen         = sizeof(int),
2986                 .mode           = 0644,
2987                 .proc_handler   = proc_dointvec,
2988         },
2989         {
2990                 .procname       = "error_burst",
2991                 .data           = &ip_rt_error_burst,
2992                 .maxlen         = sizeof(int),
2993                 .mode           = 0644,
2994                 .proc_handler   = proc_dointvec,
2995         },
2996         {
2997                 .procname       = "gc_elasticity",
2998                 .data           = &ip_rt_gc_elasticity,
2999                 .maxlen         = sizeof(int),
3000                 .mode           = 0644,
3001                 .proc_handler   = proc_dointvec,
3002         },
3003         {
3004                 .procname       = "mtu_expires",
3005                 .data           = &ip_rt_mtu_expires,
3006                 .maxlen         = sizeof(int),
3007                 .mode           = 0644,
3008                 .proc_handler   = proc_dointvec_jiffies,
3009         },
3010         {
3011                 .procname       = "min_pmtu",
3012                 .data           = &ip_rt_min_pmtu,
3013                 .maxlen         = sizeof(int),
3014                 .mode           = 0644,
3015                 .proc_handler   = proc_dointvec_minmax,
3016                 .extra1         = &ip_min_valid_pmtu,
3017         },
3018         {
3019                 .procname       = "min_adv_mss",
3020                 .data           = &ip_rt_min_advmss,
3021                 .maxlen         = sizeof(int),
3022                 .mode           = 0644,
3023                 .proc_handler   = proc_dointvec,
3024         },
3025         { }
3026 };
3027
3028 static struct ctl_table ipv4_route_flush_table[] = {
3029         {
3030                 .procname       = "flush",
3031                 .maxlen         = sizeof(int),
3032                 .mode           = 0200,
3033                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3034         },
3035         { },
3036 };
3037
3038 static __net_init int sysctl_route_net_init(struct net *net)
3039 {
3040         struct ctl_table *tbl;
3041
3042         tbl = ipv4_route_flush_table;
3043         if (!net_eq(net, &init_net)) {
3044                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3045                 if (!tbl)
3046                         goto err_dup;
3047
3048                 /* Don't export sysctls to unprivileged users */
3049                 if (net->user_ns != &init_user_ns)
3050                         tbl[0].procname = NULL;
3051         }
3052         tbl[0].extra1 = net;
3053
3054         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3055         if (!net->ipv4.route_hdr)
3056                 goto err_reg;
3057         return 0;
3058
3059 err_reg:
3060         if (tbl != ipv4_route_flush_table)
3061                 kfree(tbl);
3062 err_dup:
3063         return -ENOMEM;
3064 }
3065
3066 static __net_exit void sysctl_route_net_exit(struct net *net)
3067 {
3068         struct ctl_table *tbl;
3069
3070         tbl = net->ipv4.route_hdr->ctl_table_arg;
3071         unregister_net_sysctl_table(net->ipv4.route_hdr);
3072         BUG_ON(tbl == ipv4_route_flush_table);
3073         kfree(tbl);
3074 }
3075
3076 static __net_initdata struct pernet_operations sysctl_route_ops = {
3077         .init = sysctl_route_net_init,
3078         .exit = sysctl_route_net_exit,
3079 };
3080 #endif
3081
3082 static __net_init int rt_genid_init(struct net *net)
3083 {
3084         atomic_set(&net->ipv4.rt_genid, 0);
3085         atomic_set(&net->fnhe_genid, 0);
3086         atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3087         return 0;
3088 }
3089
3090 static __net_initdata struct pernet_operations rt_genid_ops = {
3091         .init = rt_genid_init,
3092 };
3093
3094 static int __net_init ipv4_inetpeer_init(struct net *net)
3095 {
3096         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3097
3098         if (!bp)
3099                 return -ENOMEM;
3100         inet_peer_base_init(bp);
3101         net->ipv4.peers = bp;
3102         return 0;
3103 }
3104
3105 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3106 {
3107         struct inet_peer_base *bp = net->ipv4.peers;
3108
3109         net->ipv4.peers = NULL;
3110         inetpeer_invalidate_tree(bp);
3111         kfree(bp);
3112 }
3113
3114 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3115         .init   =       ipv4_inetpeer_init,
3116         .exit   =       ipv4_inetpeer_exit,
3117 };
3118
#ifdef CONFIG_IP_ROUTE_CLASSID
/* Per-cpu accounting area; allocated in ip_rt_init(). */
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
3122
3123 int __init ip_rt_init(void)
3124 {
3125         void *idents_hash;
3126         int rc = 0;
3127         int cpu;
3128
3129         /* For modern hosts, this will use 2 MB of memory */
3130         idents_hash = alloc_large_system_hash("IP idents",
3131                                               sizeof(*ip_idents) + sizeof(*ip_tstamps),
3132                                               0,
3133                                               16, /* one bucket per 64 KB */
3134                                               HASH_ZERO,
3135                                               NULL,
3136                                               &ip_idents_mask,
3137                                               2048,
3138                                               256*1024);
3139
3140         ip_idents = idents_hash;
3141
3142         prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
3143
3144         ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
3145
3146         for_each_possible_cpu(cpu) {
3147                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3148
3149                 INIT_LIST_HEAD(&ul->head);
3150                 spin_lock_init(&ul->lock);
3151         }
3152 #ifdef CONFIG_IP_ROUTE_CLASSID
3153         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3154         if (!ip_rt_acct)
3155                 panic("IP: failed to allocate ip_rt_acct\n");
3156 #endif
3157
3158         ipv4_dst_ops.kmem_cachep =
3159                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3160                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3161
3162         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3163
3164         if (dst_entries_init(&ipv4_dst_ops) < 0)
3165                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3166
3167         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3168                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3169
3170         ipv4_dst_ops.gc_thresh = ~0;
3171         ip_rt_max_size = INT_MAX;
3172
3173         devinet_init();
3174         ip_fib_init();
3175
3176         if (ip_rt_proc_init())
3177                 pr_err("Unable to create route proc files\n");
3178 #ifdef CONFIG_XFRM
3179         xfrm_init();
3180         xfrm4_init();
3181 #endif
3182         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3183                       RTNL_FLAG_DOIT_UNLOCKED);
3184
3185 #ifdef CONFIG_SYSCTL
3186         register_pernet_subsys(&sysctl_route_ops);
3187 #endif
3188         register_pernet_subsys(&rt_genid_ops);
3189         register_pernet_subsys(&ipv4_inetpeer_ops);
3190         return rc;
3191 }
3192
3193 #ifdef CONFIG_SYSCTL
3194 /*
3195  * We really need to sanitize the damn ipv4 init order, then all
3196  * this nonsense will go away.
3197  */
3198 void __init ip_static_sysctl_init(void)
3199 {
3200         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3201 }
3202 #endif