net/ipv4/route.c

   1 /*
   2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
   3  *              operating system.  INET is implemented using the  BSD Socket
   4  *              interface as the means of communication with the user level.
   5  *
   6  *              ROUTE - implementation of the IP router.
   7  *
   8  * Authors:     Ross Biro
   9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
  10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
  11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
  12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
  13  *
  14  * Fixes:
  15  *              Alan Cox        :       Verify area fixes.
  16  *              Alan Cox        :       cli() protects routing changes
  17  *              Rui Oliveira    :       ICMP routing table updates
  18  *              (rco@di.uminho.pt)      Routing table insertion and update
  19  *              Linus Torvalds  :       Rewrote bits to be sensible
  20  *              Alan Cox        :       Added BSD route gw semantics
  21  *              Alan Cox        :       Super /proc >4K
  22  *              Alan Cox        :       MTU in route table
  23  *              Alan Cox        :       MSS actually. Also added the window
  24  *                                      clamper.
  25  *              Sam Lantinga    :       Fixed route matching in rt_del()
  26  *              Alan Cox        :       Routing cache support.
  27  *              Alan Cox        :       Removed compatibility cruft.
  28  *              Alan Cox        :       RTF_REJECT support.
  29  *              Alan Cox        :       TCP irtt support.
  30  *              Jonathan Naylor :       Added Metric support.
  31  *      Miquel van Smoorenburg  :       BSD API fixes.
  32  *      Miquel van Smoorenburg  :       Metrics.
  33  *              Alan Cox        :       Use __u32 properly
  34  *              Alan Cox        :       Aligned routing errors more closely with BSD
  35  *                                      our system is still very different.
  36  *              Alan Cox        :       Faster /proc handling
  37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
  38  *                                      routing caches and better behaviour.
  39  *
  40  *              Olaf Erb        :       irtt wasn't being copied right.
  41  *              Bjorn Ekwall    :       Kerneld route support.
  42  *              Alan Cox        :       Multicast fixed (I hope)
  43  *              Pavel Krauz     :       Limited broadcast fixed
  44  *              Mike McLagan    :       Routing by source
  45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
  46  *                                      route.c and rewritten from scratch.
  47  *              Andi Kleen      :       Load-limit warning messages.
  48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
  49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
  50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
  51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
  52  *              Marc Boucher    :       routing by fwmark
  53  *      Robert Olsson           :       Added rt_cache statistics
  54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
  55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
  56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
  57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
  58  *
  59  *              This program is free software; you can redistribute it and/or
  60  *              modify it under the terms of the GNU General Public License
  61  *              as published by the Free Software Foundation; either version
  62  *              2 of the License, or (at your option) any later version.
  63  */
  64
  65 #define pr_fmt(fmt) "IPv4: " fmt
  66
  67 #include <linux/module.h>
  68 #include <linux/uaccess.h>
  69 #include <linux/bitops.h>
  70 #include <linux/types.h>
  71 #include <linux/kernel.h>
  72 #include <linux/mm.h>
  73 #include <linux/bootmem.h>
  74 #include <linux/string.h>
  75 #include <linux/socket.h>
  76 #include <linux/sockios.h>
  77 #include <linux/errno.h>
  78 #include <linux/in.h>
  79 #include <linux/inet.h>
  80 #include <linux/netdevice.h>
  81 #include <linux/proc_fs.h>
  82 #include <linux/init.h>
  83 #include <linux/skbuff.h>
  84 #include <linux/inetdevice.h>
  85 #include <linux/igmp.h>
  86 #include <linux/pkt_sched.h>
  87 #include <linux/mroute.h>
  88 #include <linux/netfilter_ipv4.h>
  89 #include <linux/random.h>
  90 #include <linux/rcupdate.h>
  91 #include <linux/times.h>
  92 #include <linux/slab.h>
  93 #include <linux/jhash.h>
  94 #include <net/dst.h>
  95 #include <net/dst_metadata.h>
  96 #include <net/net_namespace.h>
  97 #include <net/protocol.h>
  98 #include <net/ip.h>
  99 #include <net/route.h>
 100 #include <net/inetpeer.h>
 101 #include <net/sock.h>
 102 #include <net/ip_fib.h>
 103 #include <net/arp.h>
 104 #include <net/tcp.h>
 105 #include <net/icmp.h>
 106 #include <net/xfrm.h>
 107 #include <net/lwtunnel.h>
 108 #include <net/netevent.h>
 109 #include <net/rtnetlink.h>
 110 #ifdef CONFIG_SYSCTL
 111 #include <linux/sysctl.h>
 112 #include <linux/kmemleak.h>
 113 #endif
 114 #include <net/secure_seq.h>
 115 #include <net/ip_tunnels.h>
 116 #include <net/l3mdev.h>
 117
 118 #include "fib_lookup.h"
 119
 120 #define RT_FL_TOS(oldflp4) \
 121         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
 122
 123 #define RT_GC_TIMEOUT (300*HZ)
 124
 125 static int ip_rt_max_size;
 126 static int ip_rt_redirect_number __read_mostly  = 9;
 127 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
 128 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
 129 static int ip_rt_error_cost __read_mostly       = HZ;
 130 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
 131 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
 132 static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
 133 static int ip_rt_min_advmss __read_mostly       = 256;
 134
 135 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
 136
 137 /*
 138  *      Interface to generic destination cache.
 139  */
 140
 141 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
 142 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
 143 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
 144 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
 145 static void              ipv4_link_failure(struct sk_buff *skb);
 146 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
 147                                            struct sk_buff *skb, u32 mtu,
 148                                            bool confirm_neigh);
 149 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
 150                                         struct sk_buff *skb);
 151 static void             ipv4_dst_destroy(struct dst_entry *dst);
 152
 153 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
 154 {
 155         WARN_ON(1);
 156         return NULL;
 157 }
 158
 159 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 160                                            struct sk_buff *skb,
 161                                            const void *daddr);
 162 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
 163
 164 static struct dst_ops ipv4_dst_ops = {
 165         .family =               AF_INET,
 166         .check =                ipv4_dst_check,
 167         .default_advmss =       ipv4_default_advmss,
 168         .mtu =                  ipv4_mtu,
 169         .cow_metrics =          ipv4_cow_metrics,
 170         .destroy =              ipv4_dst_destroy,
 171         .negative_advice =      ipv4_negative_advice,
 172         .link_failure =         ipv4_link_failure,
 173         .update_pmtu =          ip_rt_update_pmtu,
 174         .redirect =             ip_do_redirect,
 175         .local_out =            __ip_local_out,
 176         .neigh_lookup =         ipv4_neigh_lookup,
 177         .confirm_neigh =        ipv4_confirm_neigh,
 178 };
 179
 180 #define ECN_OR_COST(class)      TC_PRIO_##class
 181
 182 const __u8 ip_tos2prio[16] = {
 183         TC_PRIO_BESTEFFORT,
 184         ECN_OR_COST(BESTEFFORT),
 185         TC_PRIO_BESTEFFORT,
 186         ECN_OR_COST(BESTEFFORT),
 187         TC_PRIO_BULK,
 188         ECN_OR_COST(BULK),
 189         TC_PRIO_BULK,
 190         ECN_OR_COST(BULK),
 191         TC_PRIO_INTERACTIVE,
 192         ECN_OR_COST(INTERACTIVE),
 193         TC_PRIO_INTERACTIVE,
 194         ECN_OR_COST(INTERACTIVE),
 195         TC_PRIO_INTERACTIVE_BULK,
 196         ECN_OR_COST(INTERACTIVE_BULK),
 197         TC_PRIO_INTERACTIVE_BULK,
 198         ECN_OR_COST(INTERACTIVE_BULK)
 199 };
 200 EXPORT_SYMBOL(ip_tos2prio);
 201
 202 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
 203 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
 204
 205 #ifdef CONFIG_PROC_FS
 206 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
 207 {
 208         if (*pos)
 209                 return NULL;
 210         return SEQ_START_TOKEN;
 211 }
 212
 213 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 214 {
 215         ++*pos;
 216         return NULL;
 217 }
 218
 219 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
 220 {
 221 }
 222
 223 static int rt_cache_seq_show(struct seq_file *seq, void *v)
 224 {
 225         if (v == SEQ_START_TOKEN)
 226                 seq_printf(seq, "%-127s\n",
 227                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
 228                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
 229                            "HHUptod\tSpecDst");
 230         return 0;
 231 }
 232
 233 static const struct seq_operations rt_cache_seq_ops = {
 234         .start  = rt_cache_seq_start,
 235         .next   = rt_cache_seq_next,
 236         .stop   = rt_cache_seq_stop,
 237         .show   = rt_cache_seq_show,
 238 };
 239
 240 static int rt_cache_seq_open(struct inode *inode, struct file *file)
 241 {
 242         return seq_open(file, &rt_cache_seq_ops);
 243 }
 244
 245 static const struct file_operations rt_cache_seq_fops = {
 246         .owner   = THIS_MODULE,
 247         .open    = rt_cache_seq_open,
 248         .read    = seq_read,
 249         .llseek  = seq_lseek,
 250         .release = seq_release,
 251 };
 252
 253
 254 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
 255 {
 256         int cpu;
 257
 258         if (*pos == 0)
 259                 return SEQ_START_TOKEN;
 260
 261         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
 262                 if (!cpu_possible(cpu))
 263                         continue;
 264                 *pos = cpu+1;
 265                 return &per_cpu(rt_cache_stat, cpu);
 266         }
 267         return NULL;
 268 }
 269
 270 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
 271 {
 272         int cpu;
 273
 274         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
 275                 if (!cpu_possible(cpu))
 276                         continue;
 277                 *pos = cpu+1;
 278                 return &per_cpu(rt_cache_stat, cpu);
 279         }
 280         (*pos)++;
 281         return NULL;
 282
 283 }
 284
 285 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
 286 {
 287
 288 }
 289
 290 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
 291 {
 292         struct rt_cache_stat *st = v;
 293
 294         if (v == SEQ_START_TOKEN) {
 295                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
 296                 return 0;
 297         }
 298
 299         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
 300                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
 301                    dst_entries_get_slow(&ipv4_dst_ops),
 302                    0, /* st->in_hit */
 303                    st->in_slow_tot,
 304                    st->in_slow_mc,
 305                    st->in_no_route,
 306                    st->in_brd,
 307                    st->in_martian_dst,
 308                    st->in_martian_src,
 309
 310                    0, /* st->out_hit */
 311                    st->out_slow_tot,
 312                    st->out_slow_mc,
 313
 314                    0, /* st->gc_total */
 315                    0, /* st->gc_ignored */
 316                    0, /* st->gc_goal_miss */
 317                    0, /* st->gc_dst_overflow */
 318                    0, /* st->in_hlist_search */
 319                    0  /* st->out_hlist_search */
 320                 );
 321         return 0;
 322 }
 323
 324 static const struct seq_operations rt_cpu_seq_ops = {
 325         .start  = rt_cpu_seq_start,
 326         .next   = rt_cpu_seq_next,
 327         .stop   = rt_cpu_seq_stop,
 328         .show   = rt_cpu_seq_show,
 329 };
 330
 331
 332 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
 333 {
 334         return seq_open(file, &rt_cpu_seq_ops);
 335 }
 336
 337 static const struct file_operations rt_cpu_seq_fops = {
 338         .owner   = THIS_MODULE,
 339         .open    = rt_cpu_seq_open,
 340         .read    = seq_read,
 341         .llseek  = seq_lseek,
 342         .release = seq_release,
 343 };
 344
 345 #ifdef CONFIG_IP_ROUTE_CLASSID
 346 static int rt_acct_proc_show(struct seq_file *m, void *v)
 347 {
 348         struct ip_rt_acct *dst, *src;
 349         unsigned int i, j;
 350
 351         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
 352         if (!dst)
 353                 return -ENOMEM;
 354
 355         for_each_possible_cpu(i) {
 356                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
 357                 for (j = 0; j < 256; j++) {
 358                         dst[j].o_bytes   += src[j].o_bytes;
 359                         dst[j].o_packets += src[j].o_packets;
 360                         dst[j].i_bytes   += src[j].i_bytes;
 361                         dst[j].i_packets += src[j].i_packets;
 362                 }
 363         }
 364
 365         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
 366         kfree(dst);
 367         return 0;
 368 }
 369
 370 static int rt_acct_proc_open(struct inode *inode, struct file *file)
 371 {
 372         return single_open(file, rt_acct_proc_show, NULL);
 373 }
 374
 375 static const struct file_operations rt_acct_proc_fops = {
 376         .owner          = THIS_MODULE,
 377         .open           = rt_acct_proc_open,
 378         .read           = seq_read,
 379         .llseek         = seq_lseek,
 380         .release        = single_release,
 381 };
 382 #endif
 383
 384 static int __net_init ip_rt_do_proc_init(struct net *net)
 385 {
 386         struct proc_dir_entry *pde;
 387
 388         pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
 389                           &rt_cache_seq_fops);
 390         if (!pde)
 391                 goto err1;
 392
 393         pde = proc_create("rt_cache", S_IRUGO,
 394                           net->proc_net_stat, &rt_cpu_seq_fops);
 395         if (!pde)
 396                 goto err2;
 397
 398 #ifdef CONFIG_IP_ROUTE_CLASSID
 399         pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
 400         if (!pde)
 401                 goto err3;
 402 #endif
 403         return 0;
 404
 405 #ifdef CONFIG_IP_ROUTE_CLASSID
 406 err3:
 407         remove_proc_entry("rt_cache", net->proc_net_stat);
 408 #endif
 409 err2:
 410         remove_proc_entry("rt_cache", net->proc_net);
 411 err1:
 412         return -ENOMEM;
 413 }
 414
 415 static void __net_exit ip_rt_do_proc_exit(struct net *net)
 416 {
 417         remove_proc_entry("rt_cache", net->proc_net_stat);
 418         remove_proc_entry("rt_cache", net->proc_net);
 419 #ifdef CONFIG_IP_ROUTE_CLASSID
 420         remove_proc_entry("rt_acct", net->proc_net);
 421 #endif
 422 }
 423
 424 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
 425         .init = ip_rt_do_proc_init,
 426         .exit = ip_rt_do_proc_exit,
 427 };
 428
 429 static int __init ip_rt_proc_init(void)
 430 {
 431         return register_pernet_subsys(&ip_rt_proc_ops);
 432 }
 433
 434 #else
 435 static inline int ip_rt_proc_init(void)
 436 {
 437         return 0;
 438 }
 439 #endif /* CONFIG_PROC_FS */
 440
 441 static inline bool rt_is_expired(const struct rtable *rth)
 442 {
 443         return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
 444 }
 445
 446 void rt_cache_flush(struct net *net)
 447 {
 448         rt_genid_bump_ipv4(net);
 449 }
 450
 451 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
 452                                            struct sk_buff *skb,
 453                                            const void *daddr)
 454 {
 455         struct net_device *dev = dst->dev;
 456         const __be32 *pkey = daddr;
 457         const struct rtable *rt;
 458         struct neighbour *n;
 459
 460         rt = (const struct rtable *) dst;
 461         if (rt->rt_gateway)
 462                 pkey = (const __be32 *) &rt->rt_gateway;
 463         else if (skb)
 464                 pkey = &ip_hdr(skb)->daddr;
 465
 466         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
 467         if (n)
 468                 return n;
 469         return neigh_create(&arp_tbl, pkey, dev);
 470 }
 471
 472 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
 473 {
 474         struct net_device *dev = dst->dev;
 475         const __be32 *pkey = daddr;
 476         const struct rtable *rt;
 477
 478         rt = (const struct rtable *)dst;
 479         if (rt->rt_gateway)
 480                 pkey = (const __be32 *)&rt->rt_gateway;
 481         else if (!daddr ||
 482                  (rt->rt_flags &
 483                   (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
 484                 return;
 485
 486         __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
 487 }
 488
 489 /* Hash tables of size 2048..262144 depending on RAM size.
 490  * Each bucket uses 8 bytes.
 491  */
 492 static u32 ip_idents_mask __read_mostly;
 493 static atomic_t *ip_idents __read_mostly;
 494 static u32 *ip_tstamps __read_mostly;
 495
 496 /* In order to protect privacy, we add a perturbation to identifiers
 497  * if one generator is seldom used. This makes hard for an attacker
 498  * to infer how many packets were sent between two points in time.
 499  */
 500 u32 ip_idents_reserve(u32 hash, int segs)
 501 {
 502         u32 bucket, old, now = (u32)jiffies;
 503         atomic_t *p_id;
 504         u32 *p_tstamp;
 505         u32 delta = 0;
 506
 507         bucket = hash & ip_idents_mask;
 508         p_tstamp = ip_tstamps + bucket;
 509         p_id = ip_idents + bucket;
 510         old = ACCESS_ONCE(*p_tstamp);
 511
 512         if (old != now && cmpxchg(p_tstamp, old, now) == old)
 513                 delta = prandom_u32_max(now - old);
 514
 515         /* If UBSAN reports an error there, please make sure your compiler
 516          * supports -fno-strict-overflow before reporting it that was a bug
 517          * in UBSAN, and it has been fixed in GCC-8.
 518          */
 519         return atomic_add_return(segs + delta, p_id) - segs;
 520 }
 521 EXPORT_SYMBOL(ip_idents_reserve);
 522
 523 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
 524 {
 525         u32 hash, id;
 526
 527         /* Note the following code is not safe, but this is okay. */
 528         if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
 529                 get_random_bytes(&net->ipv4.ip_id_key,
 530                                  sizeof(net->ipv4.ip_id_key));
 531
 532         hash = siphash_3u32((__force u32)iph->daddr,
 533                             (__force u32)iph->saddr,
 534                             iph->protocol,
 535                             &net->ipv4.ip_id_key);
 536         id = ip_idents_reserve(hash, segs);
 537         iph->id = htons(id);
 538 }
 539 EXPORT_SYMBOL(__ip_select_ident);
 540
 541 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
 542                              const struct sock *sk,
 543                              const struct iphdr *iph,
 544                              int oif, u8 tos,
 545                              u8 prot, u32 mark, int flow_flags)
 546 {
 547         if (sk) {
 548                 const struct inet_sock *inet = inet_sk(sk);
 549
 550                 oif = sk->sk_bound_dev_if;
 551                 mark = sk->sk_mark;
 552                 tos = RT_CONN_FLAGS(sk);
 553                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
 554         }
 555         flowi4_init_output(fl4, oif, mark, tos,
 556                            RT_SCOPE_UNIVERSE, prot,
 557                            flow_flags,
 558                            iph->daddr, iph->saddr, 0, 0,
 559                            sock_net_uid(net, sk));
 560 }
 561
 562 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
 563                                const struct sock *sk)
 564 {
 565         const struct net *net = dev_net(skb->dev);
 566         const struct iphdr *iph = ip_hdr(skb);
 567         int oif = skb->dev->ifindex;
 568         u8 tos = RT_TOS(iph->tos);
 569         u8 prot = iph->protocol;
 570         u32 mark = skb->mark;
 571
 572         __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
 573 }
 574
 575 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
 576 {
 577         const struct inet_sock *inet = inet_sk(sk);
 578         const struct ip_options_rcu *inet_opt;
 579         __be32 daddr = inet->inet_daddr;
 580
 581         rcu_read_lock();
 582         inet_opt = rcu_dereference(inet->inet_opt);
 583         if (inet_opt && inet_opt->opt.srr)
 584                 daddr = inet_opt->opt.faddr;
 585         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
 586                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
 587                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
 588                            inet_sk_flowi_flags(sk),
 589                            daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
 590         rcu_read_unlock();
 591 }
 592
 593 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
 594                                  const struct sk_buff *skb)
 595 {
 596         if (skb)
 597                 build_skb_flow_key(fl4, skb, sk);
 598         else
 599                 build_sk_flow_key(fl4, sk);
 600 }
 601
 602 static DEFINE_SPINLOCK(fnhe_lock);
 603
 604 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
 605 {
 606         struct rtable *rt;
 607
 608         rt = rcu_dereference(fnhe->fnhe_rth_input);
 609         if (rt) {
 610                 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
 611                 dst_dev_put(&rt->dst);
 612                 dst_release(&rt->dst);
 613         }
 614         rt = rcu_dereference(fnhe->fnhe_rth_output);
 615         if (rt) {
 616                 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
 617                 dst_dev_put(&rt->dst);
 618                 dst_release(&rt->dst);
 619         }
 620 }
 621
 622 static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
 623 {
 624         struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
 625         struct fib_nh_exception *fnhe, *oldest = NULL;
 626
 627         for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
 628                 fnhe = rcu_dereference_protected(*fnhe_p,
 629                                                  lockdep_is_held(&fnhe_lock));
 630                 if (!fnhe)
 631                         break;
 632                 if (!oldest ||
 633                     time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
 634                         oldest = fnhe;
 635                         oldest_p = fnhe_p;
 636                 }
 637         }
 638         fnhe_flush_routes(oldest);
 639         *oldest_p = oldest->fnhe_next;
 640         kfree_rcu(oldest, rcu);
 641 }
 642
 643 static u32 fnhe_hashfun(__be32 daddr)
 644 {
 645         static siphash_key_t fnhe_hash_key __read_mostly;
 646         u64 hval;
 647
 648         net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
 649         hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
 650         return hash_64(hval, FNHE_HASH_SHIFT);
 651 }
 652
 653 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
 654 {
 655         rt->rt_pmtu = fnhe->fnhe_pmtu;
 656         rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
 657         rt->dst.expires = fnhe->fnhe_expires;
 658
 659         if (fnhe->fnhe_gw) {
 660                 rt->rt_flags |= RTCF_REDIRECTED;
 661                 rt->rt_gateway = fnhe->fnhe_gw;
 662                 rt->rt_uses_gateway = 1;
 663         }
 664 }
 665
 666 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
 667                                   u32 pmtu, bool lock, unsigned long expires)
 668 {
 669         struct fnhe_hash_bucket *hash;
 670         struct fib_nh_exception *fnhe;
 671         struct rtable *rt;
 672         u32 genid, hval;
 673         unsigned int i;
 674         int depth;
 675
 676         genid = fnhe_genid(dev_net(nh->nh_dev));
 677         hval = fnhe_hashfun(daddr);
 678
 679         spin_lock_bh(&fnhe_lock);
 680
 681         hash = rcu_dereference(nh->nh_exceptions);
 682         if (!hash) {
 683                 hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
 684                 if (!hash)
 685                         goto out_unlock;
 686                 rcu_assign_pointer(nh->nh_exceptions, hash);
 687         }
 688
 689         hash += hval;
 690
 691         depth = 0;
 692         for (fnhe = rcu_dereference(hash->chain); fnhe;
 693              fnhe = rcu_dereference(fnhe->fnhe_next)) {
 694                 if (fnhe->fnhe_daddr == daddr)
 695                         break;
 696                 depth++;
 697         }
 698
 699         if (fnhe) {
 700                 if (fnhe->fnhe_genid != genid)
 701                         fnhe->fnhe_genid = genid;
 702                 if (gw)
 703                         fnhe->fnhe_gw = gw;
 704                 if (pmtu) {
 705                         fnhe->fnhe_pmtu = pmtu;
 706                         fnhe->fnhe_mtu_locked = lock;
 707                 }
 708                 fnhe->fnhe_expires = max(1UL, expires);
 709                 /* Update all cached dsts too */
 710                 rt = rcu_dereference(fnhe->fnhe_rth_input);
 711                 if (rt)
 712                         fill_route_from_fnhe(rt, fnhe);
 713                 rt = rcu_dereference(fnhe->fnhe_rth_output);
 714                 if (rt)
 715                         fill_route_from_fnhe(rt, fnhe);
 716         } else {
 717                 /* Randomize max depth to avoid some side channels attacks. */
 718                 int max_depth = FNHE_RECLAIM_DEPTH +
 719                                 prandom_u32_max(FNHE_RECLAIM_DEPTH);
 720
 721                 while (depth > max_depth) {
 722                         fnhe_remove_oldest(hash);
 723                         depth--;
 724                 }
 725
 726                 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
 727                 if (!fnhe)
 728                         goto out_unlock;
 729
 730                 fnhe->fnhe_next = hash->chain;
 731
 732                 fnhe->fnhe_genid = genid;
 733                 fnhe->fnhe_daddr = daddr;
 734                 fnhe->fnhe_gw = gw;
 735                 fnhe->fnhe_pmtu = pmtu;
 736                 fnhe->fnhe_mtu_locked = lock;
 737                 fnhe->fnhe_expires = max(1UL, expires);
 738
 739                 rcu_assign_pointer(hash->chain, fnhe);
 740
 741                 /* Exception created; mark the cached routes for the nexthop
 742                  * stale, so anyone caching it rechecks if this exception
 743                  * applies to them.
 744                  */
 745                 rt = rcu_dereference(nh->nh_rth_input);
 746                 if (rt)
 747                         rt->dst.obsolete = DST_OBSOLETE_KILL;
 748
 749                 for_each_possible_cpu(i) {
 750                         struct rtable __rcu **prt;
 751                         prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
 752                         rt = rcu_dereference(*prt);
 753                         if (rt)
 754                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 755                 }
 756         }
 757
 758         fnhe->fnhe_stamp = jiffies;
 759
 760 out_unlock:
 761         spin_unlock_bh(&fnhe_lock);
 762 }
 763
 764 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
 765                              bool kill_route)
 766 {
 767         __be32 new_gw = icmp_hdr(skb)->un.gateway;
 768         __be32 old_gw = ip_hdr(skb)->saddr;
 769         struct net_device *dev = skb->dev;
 770         struct in_device *in_dev;
 771         struct fib_result res;
 772         struct neighbour *n;
 773         struct net *net;
 774
 775         switch (icmp_hdr(skb)->code & 7) {
 776         case ICMP_REDIR_NET:
 777         case ICMP_REDIR_NETTOS:
 778         case ICMP_REDIR_HOST:
 779         case ICMP_REDIR_HOSTTOS:
 780                 break;
 781
 782         default:
 783                 return;
 784         }
 785
 786         if (rt->rt_gateway != old_gw)
 787                 return;
 788
 789         in_dev = __in_dev_get_rcu(dev);
 790         if (!in_dev)
 791                 return;
 792
 793         net = dev_net(dev);
 794         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
 795             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
 796             ipv4_is_zeronet(new_gw))
 797                 goto reject_redirect;
 798
 799         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
 800                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
 801                         goto reject_redirect;
 802                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
 803                         goto reject_redirect;
 804         } else {
 805                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
 806                         goto reject_redirect;
 807         }
 808
 809         n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
 810         if (!n)
 811                 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
 812         if (!IS_ERR(n)) {
 813                 if (!(n->nud_state & NUD_VALID)) {
 814                         neigh_event_send(n, NULL);
 815                 } else {
 816                         if (fib_lookup(net, fl4, &res, 0) == 0) {
 817                                 struct fib_nh *nh = &FIB_RES_NH(res);
 818
 819                                 fib_select_path(net, &res, fl4, skb);
 820                                 nh = &FIB_RES_NH(res);
 821                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
 822                                                 0, false,
 823                                                 jiffies + ip_rt_gc_timeout);
 824                         }
 825                         if (kill_route)
 826                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
 827                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
 828                 }
 829                 neigh_release(n);
 830         }
 831         return;
 832
 833 reject_redirect:
 834 #ifdef CONFIG_IP_ROUTE_VERBOSE
 835         if (IN_DEV_LOG_MARTIANS(in_dev)) {
 836                 const struct iphdr *iph = (const struct iphdr *) skb->data;
 837                 __be32 daddr = iph->daddr;
 838                 __be32 saddr = iph->saddr;
 839
 840                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
 841                                      "  Advised path = %pI4 -> %pI4\n",
 842                                      &old_gw, dev->name, &new_gw,
 843                                      &saddr, &daddr);
 844         }
 845 #endif
 846         ;
 847 }
 848
 849 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
 850 {
 851         struct rtable *rt;
 852         struct flowi4 fl4;
 853         const struct iphdr *iph = (const struct iphdr *) skb->data;
 854         struct net *net = dev_net(skb->dev);
 855         int oif = skb->dev->ifindex;
 856         u8 tos = RT_TOS(iph->tos);
 857         u8 prot = iph->protocol;
 858         u32 mark = skb->mark;
 859
 860         rt = (struct rtable *) dst;
 861
 862         __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
 863         __ip_do_redirect(rt, skb, &fl4, true);
 864 }
 865
 866 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
 867 {
 868         struct rtable *rt = (struct rtable *)dst;
 869         struct dst_entry *ret = dst;
 870
 871         if (rt) {
 872                 if (dst->obsolete > 0) {
 873                         ip_rt_put(rt);
 874                         ret = NULL;
 875                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
 876                            rt->dst.expires) {
 877                         ip_rt_put(rt);
 878                         ret = NULL;
 879                 }
 880         }
 881         return ret;
 882 }
 883
 884 /*
 885  * Algorithm:
 886  *      1. The first ip_rt_redirect_number redirects are sent
 887  *         with exponential backoff, then we stop sending them at all,
 888  *         assuming that the host ignores our redirects.
 889  *      2. If we did not see packets requiring redirects
 890  *         during ip_rt_redirect_silence, we assume that the host
 891  *         forgot redirected route and start to send redirects again.
 892  *
 893  * This algorithm is much cheaper and more intelligent than dumb load limiting
 894  * in icmp.c.
 895  *
 896  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 897  * and "frag. need" (breaks PMTU discovery) in icmp.c.
 898  */
 899
 900 void ip_rt_send_redirect(struct sk_buff *skb)
 901 {
 902         struct rtable *rt = skb_rtable(skb);
 903         struct in_device *in_dev;
 904         struct inet_peer *peer;
 905         struct net *net;
 906         int log_martians;
 907         int vif;
 908
 909         rcu_read_lock();
 910         in_dev = __in_dev_get_rcu(rt->dst.dev);
 911         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
 912                 rcu_read_unlock();
 913                 return;
 914         }
 915         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
 916         vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
 917         rcu_read_unlock();
 918
 919         net = dev_net(rt->dst.dev);
 920         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
 921         if (!peer) {
 922                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
 923                           rt_nexthop(rt, ip_hdr(skb)->daddr));
 924                 return;
 925         }
 926
 927         /* No redirected packets during ip_rt_redirect_silence;
 928          * reset the algorithm.
 929          */
 930         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
 931                 peer->rate_tokens = 0;
 932                 peer->n_redirects = 0;
 933         }
 934
 935         /* Too many ignored redirects; do not send anything
 936          * set dst.rate_last to the last seen redirected packet.
 937          */
 938         if (peer->n_redirects >= ip_rt_redirect_number) {
 939                 peer->rate_last = jiffies;
 940                 goto out_put_peer;
 941         }
 942
 943         /* Check for load limit; set rate_last to the latest sent
 944          * redirect.
 945          */
 946         if (peer->n_redirects == 0 ||
 947             time_after(jiffies,
 948                        (peer->rate_last +
 949                         (ip_rt_redirect_load << peer->n_redirects)))) {
 950                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
 951
 952                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
 953                 peer->rate_last = jiffies;
 954                 ++peer->n_redirects;
 955 #ifdef CONFIG_IP_ROUTE_VERBOSE
 956                 if (log_martians &&
 957                     peer->n_redirects == ip_rt_redirect_number)
 958                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
 959                                              &ip_hdr(skb)->saddr, inet_iif(skb),
 960                                              &ip_hdr(skb)->daddr, &gw);
 961 #endif
 962         }
 963 out_put_peer:
 964         inet_putpeer(peer);
 965 }
 966
 967 static int ip_error(struct sk_buff *skb)
 968 {
 969         struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
 970         struct rtable *rt = skb_rtable(skb);
 971         struct inet_peer *peer;
 972         unsigned long now;
 973         struct net *net;
 974         bool send;
 975         int code;
 976
 977         /* IP on this device is disabled. */
 978         if (!in_dev)
 979                 goto out;
 980
 981         net = dev_net(rt->dst.dev);
 982         if (!IN_DEV_FORWARD(in_dev)) {
 983                 switch (rt->dst.error) {
 984                 case EHOSTUNREACH:
 985                         __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
 986                         break;
 987
 988                 case ENETUNREACH:
 989                         __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
 990                         break;
 991                 }
 992                 goto out;
 993         }
 994
 995         switch (rt->dst.error) {
 996         case EINVAL:
 997         default:
 998                 goto out;
 999         case EHOSTUNREACH:
1000                 code = ICMP_HOST_UNREACH;
1001                 break;
1002         case ENETUNREACH:
1003                 code = ICMP_NET_UNREACH;
1004                 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
1005                 break;
1006         case EACCES:
1007                 code = ICMP_PKT_FILTERED;
1008                 break;
1009         }
1010
1011         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
1012                                l3mdev_master_ifindex(skb->dev), 1);
1013
1014         send = true;
1015         if (peer) {
1016                 now = jiffies;
1017                 peer->rate_tokens += now - peer->rate_last;
1018                 if (peer->rate_tokens > ip_rt_error_burst)
1019                         peer->rate_tokens = ip_rt_error_burst;
1020                 peer->rate_last = now;
1021                 if (peer->rate_tokens >= ip_rt_error_cost)
1022                         peer->rate_tokens -= ip_rt_error_cost;
1023                 else
1024                         send = false;
1025                 inet_putpeer(peer);
1026         }
1027         if (send)
1028                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1029
1030 out:    kfree_skb(skb);
1031         return 0;
1032 }
1033
1034 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1035 {
1036         struct dst_entry *dst = &rt->dst;
1037         struct net *net = dev_net(dst->dev);
1038         u32 old_mtu = ipv4_mtu(dst);
1039         struct fib_result res;
1040         bool lock = false;
1041
1042         if (ip_mtu_locked(dst))
1043                 return;
1044
1045         if (old_mtu < mtu)
1046                 return;
1047
1048         if (mtu < ip_rt_min_pmtu) {
1049                 lock = true;
1050                 mtu = min(old_mtu, ip_rt_min_pmtu);
1051         }
1052
1053         if (rt->rt_pmtu == mtu && !lock &&
1054             time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1055                 return;
1056
1057         rcu_read_lock();
1058         if (fib_lookup(net, fl4, &res, 0) == 0) {
1059                 struct fib_nh *nh;
1060
1061                 fib_select_path(net, &res, fl4, NULL);
1062                 nh = &FIB_RES_NH(res);
1063                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
1064                                       jiffies + ip_rt_mtu_expires);
1065         }
1066         rcu_read_unlock();
1067 }
1068
1069 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1070                               struct sk_buff *skb, u32 mtu,
1071                               bool confirm_neigh)
1072 {
1073         struct rtable *rt = (struct rtable *) dst;
1074         struct flowi4 fl4;
1075
1076         ip_rt_build_flow_key(&fl4, sk, skb);
1077         __ip_rt_update_pmtu(rt, &fl4, mtu);
1078 }
1079
1080 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1081                       int oif, u32 mark, u8 protocol, int flow_flags)
1082 {
1083         const struct iphdr *iph = (const struct iphdr *) skb->data;
1084         struct flowi4 fl4;
1085         struct rtable *rt;
1086
1087         if (!mark)
1088                 mark = IP4_REPLY_MARK(net, skb->mark);
1089
1090         __build_flow_key(net, &fl4, NULL, iph, oif,
1091                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1092         rt = __ip_route_output_key(net, &fl4);
1093         if (!IS_ERR(rt)) {
1094                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1095                 ip_rt_put(rt);
1096         }
1097 }
1098 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1099
1100 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1101 {
1102         const struct iphdr *iph = (const struct iphdr *) skb->data;
1103         struct flowi4 fl4;
1104         struct rtable *rt;
1105
1106         __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1107
1108         if (!fl4.flowi4_mark)
1109                 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1110
1111         rt = __ip_route_output_key(sock_net(sk), &fl4);
1112         if (!IS_ERR(rt)) {
1113                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1114                 ip_rt_put(rt);
1115         }
1116 }
1117
1118 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1119 {
1120         const struct iphdr *iph = (const struct iphdr *) skb->data;
1121         struct flowi4 fl4;
1122         struct rtable *rt;
1123         struct dst_entry *odst = NULL;
1124         bool new = false;
1125         struct net *net = sock_net(sk);
1126
1127         bh_lock_sock(sk);
1128
1129         if (!ip_sk_accept_pmtu(sk))
1130                 goto out;
1131
1132         odst = sk_dst_get(sk);
1133
1134         if (sock_owned_by_user(sk) || !odst) {
1135                 __ipv4_sk_update_pmtu(skb, sk, mtu);
1136                 goto out;
1137         }
1138
1139         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1140
1141         rt = (struct rtable *)odst;
1142         if (odst->obsolete && !odst->ops->check(odst, 0)) {
1143                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1144                 if (IS_ERR(rt))
1145                         goto out;
1146
1147                 new = true;
1148         }
1149
1150         __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);
1151
1152         if (!dst_check(&rt->dst, 0)) {
1153                 if (new)
1154                         dst_release(&rt->dst);
1155
1156                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1157                 if (IS_ERR(rt))
1158                         goto out;
1159
1160                 new = true;
1161         }
1162
1163         if (new)
1164                 sk_dst_set(sk, &rt->dst);
1165
1166 out:
1167         bh_unlock_sock(sk);
1168         dst_release(odst);
1169 }
1170 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1171
1172 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1173                    int oif, u32 mark, u8 protocol, int flow_flags)
1174 {
1175         const struct iphdr *iph = (const struct iphdr *) skb->data;
1176         struct flowi4 fl4;
1177         struct rtable *rt;
1178
1179         __build_flow_key(net, &fl4, NULL, iph, oif,
1180                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1181         rt = __ip_route_output_key(net, &fl4);
1182         if (!IS_ERR(rt)) {
1183                 __ip_do_redirect(rt, skb, &fl4, false);
1184                 ip_rt_put(rt);
1185         }
1186 }
1187 EXPORT_SYMBOL_GPL(ipv4_redirect);
1188
1189 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1190 {
1191         const struct iphdr *iph = (const struct iphdr *) skb->data;
1192         struct flowi4 fl4;
1193         struct rtable *rt;
1194         struct net *net = sock_net(sk);
1195
1196         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1197         rt = __ip_route_output_key(net, &fl4);
1198         if (!IS_ERR(rt)) {
1199                 __ip_do_redirect(rt, skb, &fl4, false);
1200                 ip_rt_put(rt);
1201         }
1202 }
1203 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1204
1205 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1206 {
1207         struct rtable *rt = (struct rtable *) dst;
1208
1209         /* All IPV4 dsts are created with ->obsolete set to the value
1210          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1211          * into this function always.
1212          *
1213          * When a PMTU/redirect information update invalidates a route,
1214          * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1215          * DST_OBSOLETE_DEAD by dst_free().
1216          */
1217         if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1218                 return NULL;
1219         return dst;
1220 }
1221
1222 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1223 {
1224         struct ip_options opt;
1225         int res;
1226
1227         /* Recompile ip options since IPCB may not be valid anymore.
1228          * Also check we have a reasonable ipv4 header.
1229          */
1230         if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1231             ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1232                 return;
1233
1234         memset(&opt, 0, sizeof(opt));
1235         if (ip_hdr(skb)->ihl > 5) {
1236                 if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1237                         return;
1238                 opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1239
1240                 rcu_read_lock();
1241                 res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1242                 rcu_read_unlock();
1243
1244                 if (res)
1245                         return;
1246         }
1247         __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1248 }
1249
1250 static void ipv4_link_failure(struct sk_buff *skb)
1251 {
1252         struct rtable *rt;
1253
1254         ipv4_send_dest_unreach(skb);
1255
1256         rt = skb_rtable(skb);
1257         if (rt)
1258                 dst_set_expires(&rt->dst, 0);
1259 }
1260
1261 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1262 {
1263         pr_debug("%s: %pI4 -> %pI4, %s\n",
1264                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1265                  skb->dev ? skb->dev->name : "?");
1266         kfree_skb(skb);
1267         WARN_ON(1);
1268         return 0;
1269 }
1270
/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */
1279
1280 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1281 {
1282         __be32 src;
1283
1284         if (rt_is_output_route(rt))
1285                 src = ip_hdr(skb)->saddr;
1286         else {
1287                 struct fib_result res;
1288                 struct flowi4 fl4;
1289                 struct iphdr *iph;
1290
1291                 iph = ip_hdr(skb);
1292
1293                 memset(&fl4, 0, sizeof(fl4));
1294                 fl4.daddr = iph->daddr;
1295                 fl4.saddr = iph->saddr;
1296                 fl4.flowi4_tos = RT_TOS(iph->tos);
1297                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1298                 fl4.flowi4_iif = skb->dev->ifindex;
1299                 fl4.flowi4_mark = skb->mark;
1300
1301                 rcu_read_lock();
1302                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1303                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1304                 else
1305                         src = inet_select_addr(rt->dst.dev,
1306                                                rt_nexthop(rt, iph->daddr),
1307                                                RT_SCOPE_UNIVERSE);
1308                 rcu_read_unlock();
1309         }
1310         memcpy(addr, &src, 4);
1311 }
1312
1313 #ifdef CONFIG_IP_ROUTE_CLASSID
1314 static void set_class_tag(struct rtable *rt, u32 tag)
1315 {
1316         if (!(rt->dst.tclassid & 0xFFFF))
1317                 rt->dst.tclassid |= tag & 0xFFFF;
1318         if (!(rt->dst.tclassid & 0xFFFF0000))
1319                 rt->dst.tclassid |= tag & 0xFFFF0000;
1320 }
1321 #endif
1322
1323 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1324 {
1325         unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1326         unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1327                                     ip_rt_min_advmss);
1328
1329         return min(advmss, IPV4_MAX_PMTU - header_size);
1330 }
1331
1332 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1333 {
1334         const struct rtable *rt = (const struct rtable *) dst;
1335         unsigned int mtu = rt->rt_pmtu;
1336
1337         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1338                 mtu = dst_metric_raw(dst, RTAX_MTU);
1339
1340         if (mtu)
1341                 return mtu;
1342
1343         mtu = READ_ONCE(dst->dev->mtu);
1344
1345         if (unlikely(ip_mtu_locked(dst))) {
1346                 if (rt->rt_uses_gateway && mtu > 576)
1347                         mtu = 576;
1348         }
1349
1350         mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1351
1352         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1353 }
1354
1355 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1356 {
1357         struct fnhe_hash_bucket *hash;
1358         struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1359         u32 hval = fnhe_hashfun(daddr);
1360
1361         spin_lock_bh(&fnhe_lock);
1362
1363         hash = rcu_dereference_protected(nh->nh_exceptions,
1364                                          lockdep_is_held(&fnhe_lock));
1365         hash += hval;
1366
1367         fnhe_p = &hash->chain;
1368         fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1369         while (fnhe) {
1370                 if (fnhe->fnhe_daddr == daddr) {
1371                         rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1372                                 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1373                         /* set fnhe_daddr to 0 to ensure it won't bind with
1374                          * new dsts in rt_bind_exception().
1375                          */
1376                         fnhe->fnhe_daddr = 0;
1377                         fnhe_flush_routes(fnhe);
1378                         kfree_rcu(fnhe, rcu);
1379                         break;
1380                 }
1381                 fnhe_p = &fnhe->fnhe_next;
1382                 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1383                                                  lockdep_is_held(&fnhe_lock));
1384         }
1385
1386         spin_unlock_bh(&fnhe_lock);
1387 }
1388
1389 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1390 {
1391         struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1392         struct fib_nh_exception *fnhe;
1393         u32 hval;
1394
1395         if (!hash)
1396                 return NULL;
1397
1398         hval = fnhe_hashfun(daddr);
1399
1400         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1401              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1402                 if (fnhe->fnhe_daddr == daddr) {
1403                         if (fnhe->fnhe_expires &&
1404                             time_after(jiffies, fnhe->fnhe_expires)) {
1405                                 ip_del_fnhe(nh, daddr);
1406                                 break;
1407                         }
1408                         return fnhe;
1409                 }
1410         }
1411         return NULL;
1412 }
1413
1414 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1415                               __be32 daddr, const bool do_cache)
1416 {
1417         bool ret = false;
1418
1419         spin_lock_bh(&fnhe_lock);
1420
1421         if (daddr == fnhe->fnhe_daddr) {
1422                 struct rtable __rcu **porig;
1423                 struct rtable *orig;
1424                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1425
1426                 if (rt_is_input_route(rt))
1427                         porig = &fnhe->fnhe_rth_input;
1428                 else
1429                         porig = &fnhe->fnhe_rth_output;
1430                 orig = rcu_dereference(*porig);
1431
1432                 if (fnhe->fnhe_genid != genid) {
1433                         fnhe->fnhe_genid = genid;
1434                         fnhe->fnhe_gw = 0;
1435                         fnhe->fnhe_pmtu = 0;
1436                         fnhe->fnhe_expires = 0;
1437                         fnhe_flush_routes(fnhe);
1438                         orig = NULL;
1439                 }
1440                 fill_route_from_fnhe(rt, fnhe);
1441                 if (!rt->rt_gateway)
1442                         rt->rt_gateway = daddr;
1443
1444                 if (do_cache) {
1445                         dst_hold(&rt->dst);
1446                         rcu_assign_pointer(*porig, rt);
1447                         if (orig) {
1448                                 dst_dev_put(&orig->dst);
1449                                 dst_release(&orig->dst);
1450                         }
1451                         ret = true;
1452                 }
1453
1454                 fnhe->fnhe_stamp = jiffies;
1455         }
1456         spin_unlock_bh(&fnhe_lock);
1457
1458         return ret;
1459 }
1460
1461 struct uncached_list {
1462         spinlock_t              lock;
1463         struct list_head        head;
1464 };
1465
1466 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1467
1468 static void rt_add_uncached_list(struct rtable *rt)
1469 {
1470         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1471
1472         rt->rt_uncached_list = ul;
1473
1474         spin_lock_bh(&ul->lock);
1475         list_add_tail(&rt->rt_uncached, &ul->head);
1476         spin_unlock_bh(&ul->lock);
1477 }
1478
1479 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1480 {
1481         struct rtable *orig, *prev, **p;
1482         bool ret = true;
1483
1484         if (rt_is_input_route(rt)) {
1485                 p = (struct rtable **)&nh->nh_rth_input;
1486         } else {
1487                 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1488         }
1489         orig = *p;
1490
1491         /* hold dst before doing cmpxchg() to avoid race condition
1492          * on this dst
1493          */
1494         dst_hold(&rt->dst);
1495         prev = cmpxchg(p, orig, rt);
1496         if (prev == orig) {
1497                 if (orig) {
1498                         rt_add_uncached_list(orig);
1499                         dst_release(&orig->dst);
1500                 }
1501         } else {
1502                 dst_release(&rt->dst);
1503                 ret = false;
1504         }
1505
1506         return ret;
1507 }
1508
1509 static void ipv4_dst_destroy(struct dst_entry *dst)
1510 {
1511         struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1512         struct rtable *rt = (struct rtable *) dst;
1513
1514         if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
1515                 kfree(p);
1516
1517         if (!list_empty(&rt->rt_uncached)) {
1518                 struct uncached_list *ul = rt->rt_uncached_list;
1519
1520                 spin_lock_bh(&ul->lock);
1521                 list_del(&rt->rt_uncached);
1522                 spin_unlock_bh(&ul->lock);
1523         }
1524 }
1525
1526 void rt_flush_dev(struct net_device *dev)
1527 {
1528         struct net *net = dev_net(dev);
1529         struct rtable *rt;
1530         int cpu;
1531
1532         for_each_possible_cpu(cpu) {
1533                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1534
1535                 spin_lock_bh(&ul->lock);
1536                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1537                         if (rt->dst.dev != dev)
1538                                 continue;
1539                         rt->dst.dev = net->loopback_dev;
1540                         dev_hold(rt->dst.dev);
1541                         dev_put(dev);
1542                 }
1543                 spin_unlock_bh(&ul->lock);
1544         }
1545 }
1546
1547 static bool rt_cache_valid(const struct rtable *rt)
1548 {
1549         return  rt &&
1550                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1551                 !rt_is_expired(rt);
1552 }
1553
1554 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1555                            const struct fib_result *res,
1556                            struct fib_nh_exception *fnhe,
1557                            struct fib_info *fi, u16 type, u32 itag,
1558                            const bool do_cache)
1559 {
1560         bool cached = false;
1561
1562         if (fi) {
1563                 struct fib_nh *nh = &FIB_RES_NH(*res);
1564
1565                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1566                         rt->rt_gateway = nh->nh_gw;
1567                         rt->rt_uses_gateway = 1;
1568                 }
1569                 dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1570                 if (fi->fib_metrics != &dst_default_metrics) {
1571                         rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1572                         refcount_inc(&fi->fib_metrics->refcnt);
1573                 }
1574 #ifdef CONFIG_IP_ROUTE_CLASSID
1575                 rt->dst.tclassid = nh->nh_tclassid;
1576 #endif
1577                 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1578                 if (unlikely(fnhe))
1579                         cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1580                 else if (do_cache)
1581                         cached = rt_cache_route(nh, rt);
1582                 if (unlikely(!cached)) {
1583                         /* Routes we intend to cache in nexthop exception or
1584                          * FIB nexthop have the DST_NOCACHE bit clear.
1585                          * However, if we are unsuccessful at storing this
1586                          * route into the cache we really need to set it.
1587                          */
1588                         if (!rt->rt_gateway)
1589                                 rt->rt_gateway = daddr;
1590                         rt_add_uncached_list(rt);
1591                 }
1592         } else
1593                 rt_add_uncached_list(rt);
1594
1595 #ifdef CONFIG_IP_ROUTE_CLASSID
1596 #ifdef CONFIG_IP_MULTIPLE_TABLES
1597         set_class_tag(rt, res->tclassid);
1598 #endif
1599         set_class_tag(rt, itag);
1600 #endif
1601 }
1602
1603 struct rtable *rt_dst_alloc(struct net_device *dev,
1604                             unsigned int flags, u16 type,
1605                             bool nopolicy, bool noxfrm, bool will_cache)
1606 {
1607         struct rtable *rt;
1608
1609         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1610                        (will_cache ? 0 : DST_HOST) |
1611                        (nopolicy ? DST_NOPOLICY : 0) |
1612                        (noxfrm ? DST_NOXFRM : 0));
1613
1614         if (rt) {
1615                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1616                 rt->rt_flags = flags;
1617                 rt->rt_type = type;
1618                 rt->rt_is_input = 0;
1619                 rt->rt_iif = 0;
1620                 rt->rt_pmtu = 0;
1621                 rt->rt_mtu_locked = 0;
1622                 rt->rt_gateway = 0;
1623                 rt->rt_uses_gateway = 0;
1624                 rt->rt_table_id = 0;
1625                 INIT_LIST_HEAD(&rt->rt_uncached);
1626
1627                 rt->dst.output = ip_output;
1628                 if (flags & RTCF_LOCAL)
1629                         rt->dst.input = ip_local_deliver;
1630         }
1631
1632         return rt;
1633 }
1634 EXPORT_SYMBOL(rt_dst_alloc);
1635
1636 /* called in rcu_read_lock() section */
1637 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1638                           u8 tos, struct net_device *dev,
1639                           struct in_device *in_dev, u32 *itag)
1640 {
1641         int err;
1642
1643         /* Primary sanity checks. */
1644         if (!in_dev)
1645                 return -EINVAL;
1646
1647         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1648             skb->protocol != htons(ETH_P_IP))
1649                 return -EINVAL;
1650
1651         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1652                 return -EINVAL;
1653
1654         if (ipv4_is_zeronet(saddr)) {
1655                 if (!ipv4_is_local_multicast(daddr))
1656                         return -EINVAL;
1657         } else {
1658                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1659                                           in_dev, itag);
1660                 if (err < 0)
1661                         return err;
1662         }
1663         return 0;
1664 }
1665
1666 /* called in rcu_read_lock() section */
1667 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1668                              u8 tos, struct net_device *dev, int our)
1669 {
1670         struct in_device *in_dev = __in_dev_get_rcu(dev);
1671         unsigned int flags = RTCF_MULTICAST;
1672         struct rtable *rth;
1673         u32 itag = 0;
1674         int err;
1675
1676         err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1677         if (err)
1678                 return err;
1679
1680         if (our)
1681                 flags |= RTCF_LOCAL;
1682
1683         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1684                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1685         if (!rth)
1686                 return -ENOBUFS;
1687
1688 #ifdef CONFIG_IP_ROUTE_CLASSID
1689         rth->dst.tclassid = itag;
1690 #endif
1691         rth->dst.output = ip_rt_bug;
1692         rth->rt_is_input= 1;
1693
1694 #ifdef CONFIG_IP_MROUTE
1695         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1696                 rth->dst.input = ip_mr_input;
1697 #endif
1698         RT_CACHE_STAT_INC(in_slow_mc);
1699
1700         skb_dst_drop(skb);
1701         skb_dst_set(skb, &rth->dst);
1702         return 0;
1703 }
1704
1705
1706 static void ip_handle_martian_source(struct net_device *dev,
1707                                      struct in_device *in_dev,
1708                                      struct sk_buff *skb,
1709                                      __be32 daddr,
1710                                      __be32 saddr)
1711 {
1712         RT_CACHE_STAT_INC(in_martian_src);
1713 #ifdef CONFIG_IP_ROUTE_VERBOSE
1714         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1715                 /*
1716                  *      RFC1812 recommendation, if source is martian,
1717                  *      the only hint is MAC header.
1718                  */
1719                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1720                         &daddr, &saddr, dev->name);
1721                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1722                         print_hex_dump(KERN_WARNING, "ll header: ",
1723                                        DUMP_PREFIX_OFFSET, 16, 1,
1724                                        skb_mac_header(skb),
1725                                        dev->hard_header_len, true);
1726                 }
1727         }
1728 #endif
1729 }
1730
1731 static void set_lwt_redirect(struct rtable *rth)
1732 {
1733         if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1734                 rth->dst.lwtstate->orig_output = rth->dst.output;
1735                 rth->dst.output = lwtunnel_output;
1736         }
1737
1738         if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1739                 rth->dst.lwtstate->orig_input = rth->dst.input;
1740                 rth->dst.input = lwtunnel_input;
1741         }
1742 }
1743
1744 /* called in rcu_read_lock() section */
1745 static int __mkroute_input(struct sk_buff *skb,
1746                            const struct fib_result *res,
1747                            struct in_device *in_dev,
1748                            __be32 daddr, __be32 saddr, u32 tos)
1749 {
1750         struct fib_nh_exception *fnhe;
1751         struct rtable *rth;
1752         int err;
1753         struct in_device *out_dev;
1754         bool do_cache;
1755         u32 itag = 0;
1756
1757         /* get a working reference to the output device */
1758         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1759         if (!out_dev) {
1760                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1761                 return -EINVAL;
1762         }
1763
1764         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1765                                   in_dev->dev, in_dev, &itag);
1766         if (err < 0) {
1767                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1768                                          saddr);
1769
1770                 goto cleanup;
1771         }
1772
1773         do_cache = res->fi && !itag;
1774         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1775             skb->protocol == htons(ETH_P_IP) &&
1776             (IN_DEV_SHARED_MEDIA(out_dev) ||
1777              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1778                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1779
1780         if (skb->protocol != htons(ETH_P_IP)) {
1781                 /* Not IP (i.e. ARP). Do not create route, if it is
1782                  * invalid for proxy arp. DNAT routes are always valid.
1783                  *
1784                  * Proxy arp feature have been extended to allow, ARP
1785                  * replies back to the same interface, to support
1786                  * Private VLAN switch technologies. See arp.c.
1787                  */
1788                 if (out_dev == in_dev &&
1789                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1790                         err = -EINVAL;
1791                         goto cleanup;
1792                 }
1793         }
1794
1795         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1796         if (do_cache) {
1797                 if (fnhe)
1798                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1799                 else
1800                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1801                 if (rt_cache_valid(rth)) {
1802                         skb_dst_set_noref(skb, &rth->dst);
1803                         goto out;
1804                 }
1805         }
1806
1807         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1808                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1809                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1810         if (!rth) {
1811                 err = -ENOBUFS;
1812                 goto cleanup;
1813         }
1814
1815         rth->rt_is_input = 1;
1816         if (res->table)
1817                 rth->rt_table_id = res->table->tb_id;
1818         RT_CACHE_STAT_INC(in_slow_tot);
1819
1820         rth->dst.input = ip_forward;
1821
1822         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1823                        do_cache);
1824         set_lwt_redirect(rth);
1825         skb_dst_set(skb, &rth->dst);
1826 out:
1827         err = 0;
1828  cleanup:
1829         return err;
1830 }
1831
1832 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1833 /* To make ICMP packets follow the right flow, the multipath hash is
1834  * calculated from the inner IP addresses.
1835  */
1836 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1837                                  struct flow_keys *hash_keys)
1838 {
1839         const struct iphdr *outer_iph = ip_hdr(skb);
1840         const struct iphdr *inner_iph;
1841         const struct icmphdr *icmph;
1842         struct iphdr _inner_iph;
1843         struct icmphdr _icmph;
1844
1845         hash_keys->addrs.v4addrs.src = outer_iph->saddr;
1846         hash_keys->addrs.v4addrs.dst = outer_iph->daddr;
1847         if (likely(outer_iph->protocol != IPPROTO_ICMP))
1848                 return;
1849
1850         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1851                 return;
1852
1853         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1854                                    &_icmph);
1855         if (!icmph)
1856                 return;
1857
1858         if (icmph->type != ICMP_DEST_UNREACH &&
1859             icmph->type != ICMP_REDIRECT &&
1860             icmph->type != ICMP_TIME_EXCEEDED &&
1861             icmph->type != ICMP_PARAMETERPROB)
1862                 return;
1863
1864         inner_iph = skb_header_pointer(skb,
1865                                        outer_iph->ihl * 4 + sizeof(_icmph),
1866                                        sizeof(_inner_iph), &_inner_iph);
1867         if (!inner_iph)
1868                 return;
1869         hash_keys->addrs.v4addrs.src = inner_iph->saddr;
1870         hash_keys->addrs.v4addrs.dst = inner_iph->daddr;
1871 }
1872
1873 /* if skb is set it will be used and fl4 can be NULL */
1874 int fib_multipath_hash(const struct fib_info *fi, const struct flowi4 *fl4,
1875                        const struct sk_buff *skb)
1876 {
1877         struct net *net = fi->fib_net;
1878         struct flow_keys hash_keys;
1879         u32 mhash;
1880
1881         switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1882         case 0:
1883                 memset(&hash_keys, 0, sizeof(hash_keys));
1884                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1885                 if (skb) {
1886                         ip_multipath_l3_keys(skb, &hash_keys);
1887                 } else {
1888                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1889                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1890                 }
1891                 break;
1892         case 1:
1893                 /* skb is currently provided only when forwarding */
1894                 if (skb) {
1895                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1896                         struct flow_keys keys;
1897
1898                         /* short-circuit if we already have L4 hash present */
1899                         if (skb->l4_hash)
1900                                 return skb_get_hash_raw(skb) >> 1;
1901                         memset(&hash_keys, 0, sizeof(hash_keys));
1902                         skb_flow_dissect_flow_keys(skb, &keys, flag);
1903
1904                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1905                         hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
1906                         hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
1907                         hash_keys.ports.src = keys.ports.src;
1908                         hash_keys.ports.dst = keys.ports.dst;
1909                         hash_keys.basic.ip_proto = keys.basic.ip_proto;
1910                 } else {
1911                         memset(&hash_keys, 0, sizeof(hash_keys));
1912                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1913                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1914                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1915                         hash_keys.ports.src = fl4->fl4_sport;
1916                         hash_keys.ports.dst = fl4->fl4_dport;
1917                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
1918                 }
1919                 break;
1920         }
1921         mhash = flow_hash_from_keys(&hash_keys);
1922
1923         return mhash >> 1;
1924 }
1925 EXPORT_SYMBOL_GPL(fib_multipath_hash);
1926 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1927
1928 static int ip_mkroute_input(struct sk_buff *skb,
1929                             struct fib_result *res,
1930                             struct in_device *in_dev,
1931                             __be32 daddr, __be32 saddr, u32 tos)
1932 {
1933 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1934         if (res->fi && res->fi->fib_nhs > 1) {
1935                 int h = fib_multipath_hash(res->fi, NULL, skb);
1936
1937                 fib_select_multipath(res, h);
1938         }
1939 #endif
1940
1941         /* create a routing cache entry */
1942         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1943 }
1944
1945 /*
1946  *      NOTE. We drop all the packets that has local source
1947  *      addresses, because every properly looped back packet
1948  *      must have correct destination already attached by output routine.
1949  *
1950  *      Such approach solves two big problems:
1951  *      1. Not simplex devices are handled properly.
1952  *      2. IP spoofing attempts are filtered with 100% of guarantee.
1953  *      called with rcu_read_lock()
      *
      *      Resolves the input route for @skb and attaches it to the skb.
      *      @res is written: fi and table are reset before the FIB lookup,
      *      the lookup fills it in, and type is forced to RTN_BROADCAST or
      *      RTN_UNREACHABLE on the corresponding paths.
      *
      *      Returns 0 when a dst has been attached to the skb.  That
      *      includes a failed FIB lookup: the attached route then has
      *      ip_error as input handler and carries the error in dst.error.
      *      Returns -EINVAL when the device has no in_device, for a martian
      *      destination, for a non-IP broadcast and for the early martian
      *      source checks; the negative result of fib_validate_source() for
      *      a source it rejects; -ENOBUFS when the route cannot be
      *      allocated; otherwise whatever ip_mkroute_input() returns.
1954  */
1955
1956 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1957                                u8 tos, struct net_device *dev,
1958                                struct fib_result *res)
1959 {
1960         struct in_device *in_dev = __in_dev_get_rcu(dev);
1961         struct ip_tunnel_info *tun_info;
1962         struct flowi4   fl4;
1963         unsigned int    flags = 0;
1964         u32             itag = 0;
1965         struct rtable   *rth;
1966         int             err = -EINVAL;
1967         struct net    *net = dev_net(dev);
1968         bool do_cache;
1969
1970         /* IP on this device is disabled. */
1971
1972         if (!in_dev)
1973                 goto out;
1974
1975         /* Check for the most weird martians, which can be not detected
1976            by fib_lookup.
1977          */
1978
             /* The flow key takes the tunnel id from the skb's tunnel info
              * unless that info is marked IP_TUNNEL_INFO_TX.  This is read
              * before the skb's current dst is dropped just below.
              */
1979         tun_info = skb_tunnel_info(skb);
1980         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1981                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1982         else
1983                 fl4.flowi4_tun_key.tun_id = 0;
1984         skb_dst_drop(skb);
1985
1986         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1987                 goto martian_source;
1988
             /* brd_input/local_input test res->fi, so clear it before any
              * jump that bypasses fib_lookup().
              */
1989         res->fi = NULL;
1990         res->table = NULL;
             /* Limited broadcast, or both addresses zero: handled as
              * broadcast input without a FIB lookup.
              */
1991         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1992                 goto brd_input;
1993
1994         /* Accept zero addresses only to limited broadcast;
1995          * I even do not know to fix it or not. Waiting for complains :-)
1996          */
1997         if (ipv4_is_zeronet(saddr))
1998                 goto martian_source;
1999
2000         if (ipv4_is_zeronet(daddr))
2001                 goto martian_destination;
2002
2003         /* Following code try to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2004          * and call it once if daddr or/and saddr are loopback addresses
2005          */
2006         if (ipv4_is_loopback(daddr)) {
2007                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2008                         goto martian_destination;
2009         } else if (ipv4_is_loopback(saddr)) {
2010                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2011                         goto martian_source;
2012         }
2013
2014         /*
2015          *      Now we are ready to route packet.
2016          */
2017         fl4.flowi4_oif = 0;
2018         fl4.flowi4_iif = dev->ifindex;
2019         fl4.flowi4_mark = skb->mark;
2020         fl4.flowi4_tos = tos;
2021         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2022         fl4.flowi4_flags = 0;
2023         fl4.daddr = daddr;
2024         fl4.saddr = saddr;
2025         fl4.flowi4_uid = sock_net_uid(net, NULL);
2026         err = fib_lookup(net, &fl4, res, 0);
2027         if (err != 0) {
                     /* err is later stored (negated) in the unreachable
                      * route built via no_route; with forwarding disabled
                      * it is replaced by -EHOSTUNREACH.
                      */
2028                 if (!IN_DEV_FORWARD(in_dev))
2029                         err = -EHOSTUNREACH;
2030                 goto no_route;
2031         }
2032
2033         if (res->type == RTN_BROADCAST)
2034                 goto brd_input;
2035
2036         if (res->type == RTN_LOCAL) {
2037                 err = fib_validate_source(skb, saddr, daddr, tos,
2038                                           0, dev, in_dev, &itag);
2039                 if (err < 0)
2040                         goto martian_source;
2041                 goto local_input;
2042         }
2043
             /* Everything below is forwarding; it needs forwarding enabled
              * on the input device and a unicast result.
              */
2044         if (!IN_DEV_FORWARD(in_dev)) {
2045                 err = -EHOSTUNREACH;
2046                 goto no_route;
2047         }
2048         if (res->type != RTN_UNICAST)
2049                 goto martian_destination;
2050
2051         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2052 out:    return err;
2053
2054 brd_input:
             /* Broadcast input is only accepted for IP packets. */
2055         if (skb->protocol != htons(ETH_P_IP))
2056                 goto e_inval;
2057
             /* A zeronet source skips source validation here. */
2058         if (!ipv4_is_zeronet(saddr)) {
2059                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2060                                           in_dev, &itag);
2061                 if (err < 0)
2062                         goto martian_source;
2063         }
2064         flags |= RTCF_BROADCAST;
2065         res->type = RTN_BROADCAST;
2066         RT_CACHE_STAT_INC(in_brd);
2067
2068 local_input:
             /* Reached for local and broadcast results and, via no_route,
              * for unreachable ones.  The nexthop's cached input route is
              * used (and a new route marked cacheable) only when the result
              * has FIB info and source validation produced no itag.
              */
2069         do_cache = false;
2070         if (res->fi) {
2071                 if (!itag) {
2072                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
2073                         if (rt_cache_valid(rth)) {
2074                                 skb_dst_set_noref(skb, &rth->dst);
2075                                 err = 0;
2076                                 goto out;
2077                         }
2078                         do_cache = true;
2079                 }
2080         }
2081
             /* The route is bound to the l3 master device of @dev when
              * l3mdev_master_dev_rcu() returns one, else to the namespace's
              * loopback device.
              */
2082         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2083                            flags | RTCF_LOCAL, res->type,
2084                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2085         if (!rth)
2086                 goto e_nobufs;
2087
2088         rth->dst.output= ip_rt_bug;
2089 #ifdef CONFIG_IP_ROUTE_CLASSID
2090         rth->dst.tclassid = itag;
2091 #endif
2092         rth->rt_is_input = 1;
2093         if (res->table)
2094                 rth->rt_table_id = res->table->tb_id;
2095
2096         RT_CACHE_STAT_INC(in_slow_tot);
2097         if (res->type == RTN_UNREACHABLE) {
                     /* On the no_route path err still holds the negative
                      * lookup error (or -EHOSTUNREACH), so dst.error gets
                      * its negation; the route is not local.
                      */
2098                 rth->dst.input= ip_error;
2099                 rth->dst.error= -err;
2100                 rth->rt_flags   &= ~RTCF_LOCAL;
2101         }
2102
2103         if (do_cache) {
2104                 struct fib_nh *nh = &FIB_RES_NH(*res);
2105
2106                 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2107                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2108                         WARN_ON(rth->dst.input == lwtunnel_input);
2109                         rth->dst.lwtstate->orig_input = rth->dst.input;
2110                         rth->dst.input = lwtunnel_input;
2111                 }
2112
                     /* If the route cannot be installed in the nexthop
                      * cache, put it on the uncached list instead.
                      */
2113                 if (unlikely(!rt_cache_route(nh, rth)))
2114                         rt_add_uncached_list(rth);
2115         }
2116         skb_dst_set(skb, &rth->dst);
2117         err = 0;
2118         goto out;
2119
2120 no_route:
             /* Build an unreachable route through local_input; clearing
              * res->fi keeps that route out of the nexthop cache.
              */
2121         RT_CACHE_STAT_INC(in_no_route);
2122         res->type = RTN_UNREACHABLE;
2123         res->fi = NULL;
2124         res->table = NULL;
2125         goto local_input;
2126
2127         /*
2128          *      Do not cache martian addresses: they should be logged (RFC1812)
2129          */
2130 martian_destination:
2131         RT_CACHE_STAT_INC(in_martian_dst);
2132 #ifdef CONFIG_IP_ROUTE_VERBOSE
2133         if (IN_DEV_LOG_MARTIANS(in_dev))
2134                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2135                                      &daddr, &saddr, dev->name);
2136 #endif
2137
             /* martian_destination falls through to here. */
2138 e_inval:
2139         err = -EINVAL;
2140         goto out;
2141
2142 e_nobufs:
2143         err = -ENOBUFS;
2144         goto out;
2145
2146 martian_source:
             /* err is either the initial -EINVAL or the negative result of
              * fib_validate_source(); it is returned unchanged.
              */
2147         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2148         goto out;
2149 }
2150
2151 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2152                          u8 tos, struct net_device *dev)
2153 {
2154         struct fib_result res;
2155         int err;
2156
2157         tos &= IPTOS_RT_MASK;
2158         rcu_read_lock();
2159         err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2160         rcu_read_unlock();
2161
2162         return err;
2163 }
2164 EXPORT_SYMBOL(ip_route_input_noref);
2165
2166 /* called with rcu_read_lock held */
2167 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2168                        u8 tos, struct net_device *dev, struct fib_result *res)
2169 {
2170         /* Multicast recognition logic is moved from route cache to here.
2171            The problem was that too many Ethernet cards have broken/missing
2172            hardware multicast filters :-( As result the host on multicasting
2173            network acquires a lot of useless route cache entries, sort of
2174            SDR messages from all the world. Now we try to get rid of them.
2175            Really, provided software IP multicast filter is organized
2176            reasonably (at least, hashed), it does not result in a slowdown
2177            comparing with route cache reject entries.
2178            Note, that multicast routers are not affected, because
2179            route cache entry is created eventually.
2180          */
2181         if (ipv4_is_multicast(daddr)) {
2182                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2183                 int our = 0;
2184                 int err = -EINVAL;
2185
2186                 if (!in_dev)
2187                         return err;
2188                 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2189                                       ip_hdr(skb)->protocol);
2190
2191                 /* check l3 master if no match yet */
2192                 if (!our && netif_is_l3_slave(dev)) {
2193                         struct in_device *l3_in_dev;
2194
2195                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2196                         if (l3_in_dev)
2197                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2198                                                       ip_hdr(skb)->protocol);
2199                 }
2200
2201                 if (our
2202 #ifdef CONFIG_IP_MROUTE
2203                         ||
2204                     (!ipv4_is_local_multicast(daddr) &&
2205                      IN_DEV_MFORWARD(in_dev))
2206 #endif
2207                    ) {
2208                         err = ip_route_input_mc(skb, daddr, saddr,
2209                                                 tos, dev, our);
2210                 }
2211                 return err;
2212         }
2213
2214         return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2215 }
2216
2217 /* called with rcu_read_lock()
      *
      * Produce the output route for flow @fl4 through @dev_out, given the
      * FIB result @res.  @orig_oif is recorded as rt_iif of a new route and
      * @flags seeds its RTCF_* flags.
      *
      * Returns either a valid cached route on which dst_hold_safe()
      * succeeded, or a newly allocated route.  Returns ERR_PTR(-EINVAL)
      * when @dev_out has no in_device, when the source is a loopback address
      * on a device that is neither loopback nor an l3 master (and
      * ROUTE_LOCALNET is off), or when the destination is a zeronet
      * address; ERR_PTR(-ENOBUFS) when rt_dst_alloc() fails.
      */
2218 static struct rtable *__mkroute_output(const struct fib_result *res,
2219                                        const struct flowi4 *fl4, int orig_oif,
2220                                        struct net_device *dev_out,
2221                                        unsigned int flags)
2222 {
2223         struct fib_info *fi = res->fi;
2224         struct fib_nh_exception *fnhe;
2225         struct in_device *in_dev;
2226         u16 type = res->type;
2227         struct rtable *rth;
2228         bool do_cache;
2229
2230         in_dev = __in_dev_get_rcu(dev_out);
2231         if (!in_dev)
2232                 return ERR_PTR(-EINVAL);
2233
             /* Without ROUTE_LOCALNET a loopback source may only leave via
              * a loopback device or an l3 master device.
              */
2234         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2235                 if (ipv4_is_loopback(fl4->saddr) &&
2236                     !(dev_out->flags & IFF_LOOPBACK) &&
2237                     !netif_is_l3_master(dev_out))
2238                         return ERR_PTR(-EINVAL);
2239
             /* The destination address overrides the type from the FIB
              * result for limited broadcast and multicast.
              */
2240         if (ipv4_is_lbcast(fl4->daddr))
2241                 type = RTN_BROADCAST;
2242         else if (ipv4_is_multicast(fl4->daddr))
2243                 type = RTN_MULTICAST;
2244         else if (ipv4_is_zeronet(fl4->daddr))
2245                 return ERR_PTR(-EINVAL);
2246
2247         if (dev_out->flags & IFF_LOOPBACK)
2248                 flags |= RTCF_LOCAL;
2249
2250         do_cache = true;
2251         if (type == RTN_BROADCAST) {
                     /* Broadcast routes use no FIB info, hence are never cached. */
2252                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2253                 fi = NULL;
2254         } else if (type == RTN_MULTICAST) {
                     /* RTCF_LOCAL stays only if ip_check_mc_rcu() reports a
                      * match on the output device; such a route is not cached.
                      */
2255                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2256                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2257                                      fl4->flowi4_proto))
2258                         flags &= ~RTCF_LOCAL;
2259                 else
2260                         do_cache = false;
2261                 /* If multicast route do not exist use
2262                  * default one, but do not gateway in this case.
2263                  * Yes, it is hack.
2264                  */
2265                 if (fi && res->prefixlen < 4)
2266                         fi = NULL;
2267         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2268                    (orig_oif != dev_out->ifindex)) {
2269                 /* For local routes that require a particular output interface
2270                  * we do not want to cache the result.  Caching the result
2271                  * causes incorrect behaviour when there are multiple source
2272                  * addresses on the interface, the end result being that if the
2273                  * intended recipient is waiting on that interface for the
2274                  * packet he won't receive it because it will be delivered on
2275                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2276                  * be set to the loopback interface as well.
2277                  */
2278                 do_cache = false;
2279         }
2280
2281         fnhe = NULL;
             /* Caching is impossible without FIB info. */
2282         do_cache &= fi != NULL;
2283         if (fi) {
2284                 struct rtable __rcu **prth;
2285                 struct fib_nh *nh = &FIB_RES_NH(*res);
2286
                     /* The exception is looked up even for uncached routes;
                      * it is passed to rt_set_nexthop() below.
                      */
2287                 fnhe = find_exception(nh, fl4->daddr);
2288                 if (!do_cache)
2289                         goto add;
                     /* Pick the cache slot: the exception's own output
                      * route, or this CPU's slot of the nexthop.
                      */
2290                 if (fnhe) {
2291                         prth = &fnhe->fnhe_rth_output;
2292                 } else {
                             /* With FLOWI_FLAG_KNOWN_NH the nexthop cache is
                              * used only for a nexthop that has a gateway
                              * and link scope; otherwise the route is built
                              * uncached.
                              */
2293                         if (unlikely(fl4->flowi4_flags &
2294                                      FLOWI_FLAG_KNOWN_NH &&
2295                                      !(nh->nh_gw &&
2296                                        nh->nh_scope == RT_SCOPE_LINK))) {
2297                                 do_cache = false;
2298                                 goto add;
2299                         }
2300                         prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2301                 }
                     /* Reuse the cached route only if it is still valid and
                      * a reference could be taken on it.
                      */
2302                 rth = rcu_dereference(*prth);
2303                 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2304                         return rth;
2305         }
2306
2307 add:
2308         rth = rt_dst_alloc(dev_out, flags, type,
2309                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2310                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2311                            do_cache);
2312         if (!rth)
2313                 return ERR_PTR(-ENOBUFS);
2314
2315         rth->rt_iif = orig_oif;
2316         if (res->table)
2317                 rth->rt_table_id = res->table->tb_id;
2318
2319         RT_CACHE_STAT_INC(out_slow_tot);
2320
2321         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
                     /* Local broadcast/multicast on a non-loopback device is
                      * sent through ip_mc_output.
                      */
2322                 if (flags & RTCF_LOCAL &&
2323                     !(dev_out->flags & IFF_LOOPBACK)) {
2324                         rth->dst.output = ip_mc_output;
2325                         RT_CACHE_STAT_INC(out_slow_mc);
2326                 }
2327 #ifdef CONFIG_IP_MROUTE
2328                 if (type == RTN_MULTICAST) {
                             /* Non-local groups on a device with MFORWARD
                              * set get the multicast routing input handler.
                              */
2329                         if (IN_DEV_MFORWARD(in_dev) &&
2330                             !ipv4_is_local_multicast(fl4->daddr)) {
2331                                 rth->dst.input = ip_mr_input;
2332                                 rth->dst.output = ip_mc_output;
2333                         }
2334                 }
2335 #endif
2336         }
2337
2338         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2339         set_lwt_redirect(rth);
2340
2341         return rth;
2342 }
2343
2344 /*
2345  * Major route resolver routine.
2346  */
2347
2348 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2349                                         const struct sk_buff *skb)
2350 {
2351         __u8 tos = RT_FL_TOS(fl4);
2352         struct fib_result res = {
2353                 .type           = RTN_UNSPEC,
2354                 .fi             = NULL,
2355                 .table          = NULL,
2356                 .tclassid       = 0,
2357         };
2358         struct rtable *rth;
2359
2360         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2361         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2362         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2363                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2364
2365         rcu_read_lock();
2366         rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2367         rcu_read_unlock();
2368
2369         return rth;
2370 }
2371 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2372
2373 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2374                                             struct fib_result *res,
2375                                             const struct sk_buff *skb)
2376 {
2377         struct net_device *dev_out = NULL;
2378         int orig_oif = fl4->flowi4_oif;
2379         unsigned int flags = 0;
2380         struct rtable *rth;
2381         int err;
2382
2383         if (fl4->saddr) {
2384                 if (ipv4_is_multicast(fl4->saddr) ||
2385                     ipv4_is_lbcast(fl4->saddr) ||
2386                     ipv4_is_zeronet(fl4->saddr)) {
2387                         rth = ERR_PTR(-EINVAL);
2388                         goto out;
2389                 }
2390
2391                 rth = ERR_PTR(-ENETUNREACH);
2392
2393                 /* I removed check for oif == dev_out->oif here.
2394                    It was wrong for two reasons:
2395                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2396                       is assigned to multiple interfaces.
2397                    2. Moreover, we are allowed to send packets with saddr
2398                       of another iface. --ANK
2399                  */
2400
2401                 if (fl4->flowi4_oif == 0 &&
2402                     (ipv4_is_multicast(fl4->daddr) ||
2403                      ipv4_is_lbcast(fl4->daddr))) {
2404                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2405                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2406                         if (!dev_out)
2407                                 goto out;
2408
2409                         /* Special hack: user can direct multicasts
2410                            and limited broadcast via necessary interface
2411                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2412                            This hack is not just for fun, it allows
2413                            vic,vat and friends to work.
2414                            They bind socket to loopback, set ttl to zero
2415                            and expect that it will work.
2416                            From the viewpoint of routing cache they are broken,
2417                            because we are not allowed to build multicast path
2418                            with loopback source addr (look, routing cache
2419                            cannot know, that ttl is zero, so that packet
2420                            will not leave this host and route is valid).
2421                            Luckily, this hack is good workaround.
2422                          */
2423
2424                         fl4->flowi4_oif = dev_out->ifindex;
2425                         goto make_route;
2426                 }
2427
2428                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2429                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2430                         if (!__ip_dev_find(net, fl4->saddr, false))
2431                                 goto out;
2432                 }
2433         }
2434
2435
2436         if (fl4->flowi4_oif) {
2437                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2438                 rth = ERR_PTR(-ENODEV);
2439                 if (!dev_out)
2440                         goto out;
2441
2442                 /* RACE: Check return value of inet_select_addr instead. */
2443                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2444                         rth = ERR_PTR(-ENETUNREACH);
2445                         goto out;
2446                 }
2447                 if (ipv4_is_local_multicast(fl4->daddr) ||
2448                     ipv4_is_lbcast(fl4->daddr) ||
2449                     fl4->flowi4_proto == IPPROTO_IGMP) {
2450                         if (!fl4->saddr)
2451                                 fl4->saddr = inet_select_addr(dev_out, 0,
2452                                                               RT_SCOPE_LINK);
2453                         goto make_route;
2454                 }
2455                 if (!fl4->saddr) {
2456                         if (ipv4_is_multicast(fl4->daddr))
2457                                 fl4->saddr = inet_select_addr(dev_out, 0,
2458                                                               fl4->flowi4_scope);
2459                         else if (!fl4->daddr)
2460                                 fl4->saddr = inet_select_addr(dev_out, 0,
2461                                                               RT_SCOPE_HOST);
2462                 }
2463         }
2464
2465         if (!fl4->daddr) {
2466                 fl4->daddr = fl4->saddr;
2467                 if (!fl4->daddr)
2468                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2469                 dev_out = net->loopback_dev;
2470                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2471                 res->type = RTN_LOCAL;
2472                 flags |= RTCF_LOCAL;
2473                 goto make_route;
2474         }
2475
2476         err = fib_lookup(net, fl4, res, 0);
2477         if (err) {
2478                 res->fi = NULL;
2479                 res->table = NULL;
2480                 if (fl4->flowi4_oif &&
2481                     (ipv4_is_multicast(fl4->daddr) ||
2482                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2483                         /* Apparently, routing tables are wrong. Assume,
2484                            that the destination is on link.
2485
2486                            WHY? DW.
2487                            Because we are allowed to send to iface
2488                            even if it has NO routes and NO assigned
2489                            addresses. When oif is specified, routing
2490                            tables are looked up with only one purpose:
2491                            to catch if destination is gatewayed, rather than
2492                            direct. Moreover, if MSG_DONTROUTE is set,
2493                            we send packet, ignoring both routing tables
2494                            and ifaddr state. --ANK
2495
2496
2497                            We could make it even if oif is unknown,
2498                            likely IPv6, but we do not.
2499                          */
2500
2501                         if (fl4->saddr == 0)
2502                                 fl4->saddr = inet_select_addr(dev_out, 0,
2503                                                               RT_SCOPE_LINK);
2504                         res->type = RTN_UNICAST;
2505                         goto make_route;
2506                 }
2507                 rth = ERR_PTR(err);
2508                 goto out;
2509         }
2510
2511         if (res->type == RTN_LOCAL) {
2512                 if (!fl4->saddr) {
2513                         if (res->fi->fib_prefsrc)
2514                                 fl4->saddr = res->fi->fib_prefsrc;
2515                         else
2516                                 fl4->saddr = fl4->daddr;
2517                 }
2518
2519                 /* L3 master device is the loopback for that domain */
2520                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2521                         net->loopback_dev;
2522
2523                 /* make sure orig_oif points to fib result device even
2524                  * though packet rx/tx happens over loopback or l3mdev
2525                  */
2526                 orig_oif = FIB_RES_OIF(*res);
2527
2528                 fl4->flowi4_oif = dev_out->ifindex;
2529                 flags |= RTCF_LOCAL;
2530                 goto make_route;
2531         }
2532
2533         fib_select_path(net, res, fl4, skb);
2534
2535         dev_out = FIB_RES_DEV(*res);
2536
2537 make_route:
2538         rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2539
2540 out:
2541         return rth;
2542 }
2543
2544 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2545 {
2546         return NULL;
2547 }
2548
2549 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2550 {
2551         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2552
2553         return mtu ? : dst->dev->mtu;
2554 }
2555
2556 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2557                                           struct sk_buff *skb, u32 mtu,
2558                                           bool confirm_neigh)
2559 {
2560 }
2561
/*
 * .redirect callback of ipv4_dst_blackhole_ops.
 *
 * No-op: all arguments are ignored and nothing is modified.
 */
static void ipv4_rt_blackhole_redirect(struct dst_entry *dst,
				       struct sock *sk,
				       struct sk_buff *skb)
{
}
2566
2567 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2568                                           unsigned long old)
2569 {
2570         return NULL;
2571 }
2572
2573 static struct dst_ops ipv4_dst_blackhole_ops = {
2574         .family                 =       AF_INET,
2575         .check                  =       ipv4_blackhole_dst_check,
2576         .mtu                    =       ipv4_blackhole_mtu,
2577         .default_advmss         =       ipv4_default_advmss,
2578         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2579         .redirect               =       ipv4_rt_blackhole_redirect,
2580         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2581         .neigh_lookup           =       ipv4_neigh_lookup,
2582 };
2583
2584 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2585 {
2586         struct rtable *ort = (struct rtable *) dst_orig;
2587         struct rtable *rt;
2588
2589         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2590         if (rt) {
2591                 struct dst_entry *new = &rt->dst;
2592
2593                 new->__use = 1;
2594                 new->input = dst_discard;
2595                 new->output = dst_discard_out;
2596
2597                 new->dev = net->loopback_dev;
2598                 if (new->dev)
2599                         dev_hold(new->dev);
2600
2601                 rt->rt_is_input = ort->rt_is_input;
2602                 rt->rt_iif = ort->rt_iif;
2603                 rt->rt_pmtu = ort->rt_pmtu;
2604                 rt->rt_mtu_locked = ort->rt_mtu_locked;
2605
2606                 rt->rt_genid = rt_genid_ipv4(net);
2607                 rt->rt_flags = ort->rt_flags;
2608                 rt->rt_type = ort->rt_type;
2609                 rt->rt_gateway = ort->rt_gateway;
2610                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2611
2612                 INIT_LIST_HEAD(&rt->rt_uncached);
2613         }
2614
2615         dst_release(dst_orig);
2616
2617         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2618 }
2619
2620 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2621                                     const struct sock *sk)
2622 {
2623         struct rtable *rt = __ip_route_output_key(net, flp4);
2624
2625         if (IS_ERR(rt))
2626                 return rt;
2627
2628         if (flp4->flowi4_proto) {
2629                 flp4->flowi4_oif = rt->dst.dev->ifindex;
2630                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2631                                                         flowi4_to_flowi(flp4),
2632                                                         sk, 0);
2633         }
2634
2635         return rt;
2636 }
2637 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2638
2639 /* called with rcu_read_lock held */
2640 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2641                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2642                         u32 seq)
2643 {
2644         struct rtable *rt = skb_rtable(skb);
2645         struct rtmsg *r;
2646         struct nlmsghdr *nlh;
2647         unsigned long expires = 0;
2648         u32 error;
2649         u32 metrics[RTAX_MAX];
2650
2651         nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2652         if (!nlh)
2653                 return -EMSGSIZE;
2654
2655         r = nlmsg_data(nlh);
2656         r->rtm_family    = AF_INET;
2657         r->rtm_dst_len  = 32;
2658         r->rtm_src_len  = 0;
2659         r->rtm_tos      = fl4->flowi4_tos;
2660         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2661         if (nla_put_u32(skb, RTA_TABLE, table_id))
2662                 goto nla_put_failure;
2663         r->rtm_type     = rt->rt_type;
2664         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2665         r->rtm_protocol = RTPROT_UNSPEC;
2666         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2667         if (rt->rt_flags & RTCF_NOTIFY)
2668                 r->rtm_flags |= RTM_F_NOTIFY;
2669         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2670                 r->rtm_flags |= RTCF_DOREDIRECT;
2671
2672         if (nla_put_in_addr(skb, RTA_DST, dst))
2673                 goto nla_put_failure;
2674         if (src) {
2675                 r->rtm_src_len = 32;
2676                 if (nla_put_in_addr(skb, RTA_SRC, src))
2677                         goto nla_put_failure;
2678         }
2679         if (rt->dst.dev &&
2680             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2681                 goto nla_put_failure;
2682 #ifdef CONFIG_IP_ROUTE_CLASSID
2683         if (rt->dst.tclassid &&
2684             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2685                 goto nla_put_failure;
2686 #endif
2687         if (!rt_is_input_route(rt) &&
2688             fl4->saddr != src) {
2689                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2690                         goto nla_put_failure;
2691         }
2692         if (rt->rt_uses_gateway &&
2693             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2694                 goto nla_put_failure;
2695
2696         expires = rt->dst.expires;
2697         if (expires) {
2698                 unsigned long now = jiffies;
2699
2700                 if (time_before(now, expires))
2701                         expires -= now;
2702                 else
2703                         expires = 0;
2704         }
2705
2706         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2707         if (rt->rt_pmtu && expires)
2708                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2709         if (rt->rt_mtu_locked && expires)
2710                 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2711         if (rtnetlink_put_metrics(skb, metrics) < 0)
2712                 goto nla_put_failure;
2713
2714         if (fl4->flowi4_mark &&
2715             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2716                 goto nla_put_failure;
2717
2718         if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2719             nla_put_u32(skb, RTA_UID,
2720                         from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2721                 goto nla_put_failure;
2722
2723         error = rt->dst.error;
2724
2725         if (rt_is_input_route(rt)) {
2726 #ifdef CONFIG_IP_MROUTE
2727                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2728                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2729                         int err = ipmr_get_route(net, skb,
2730                                                  fl4->saddr, fl4->daddr,
2731                                                  r, portid);
2732
2733                         if (err <= 0) {
2734                                 if (err == 0)
2735                                         return 0;
2736                                 goto nla_put_failure;
2737                         }
2738                 } else
2739 #endif
2740                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2741                                 goto nla_put_failure;
2742         }
2743
2744         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2745                 goto nla_put_failure;
2746
2747         nlmsg_end(skb, nlh);
2748         return 0;
2749
2750 nla_put_failure:
2751         nlmsg_cancel(skb, nlh);
2752         return -EMSGSIZE;
2753 }
2754
2755 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2756                              struct netlink_ext_ack *extack)
2757 {
2758         struct net *net = sock_net(in_skb->sk);
2759         struct rtmsg *rtm;
2760         struct nlattr *tb[RTA_MAX+1];
2761         struct fib_result res = {};
2762         struct rtable *rt = NULL;
2763         struct flowi4 fl4;
2764         __be32 dst = 0;
2765         __be32 src = 0;
2766         u32 iif;
2767         int err;
2768         int mark;
2769         struct sk_buff *skb;
2770         u32 table_id = RT_TABLE_MAIN;
2771         kuid_t uid;
2772
2773         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2774                           extack);
2775         if (err < 0)
2776                 goto errout;
2777
2778         rtm = nlmsg_data(nlh);
2779
2780         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2781         if (!skb) {
2782                 err = -ENOBUFS;
2783                 goto errout;
2784         }
2785
2786         /* Reserve room for dummy headers, this skb can pass
2787            through good chunk of routing engine.
2788          */
2789         skb_reset_mac_header(skb);
2790         skb_reset_network_header(skb);
2791
2792         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2793         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2794         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2795         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2796         if (tb[RTA_UID])
2797                 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2798         else
2799                 uid = (iif ? INVALID_UID : current_uid());
2800
2801         /* Bugfix: need to give ip_route_input enough of an IP header to
2802          * not gag.
2803          */
2804         ip_hdr(skb)->protocol = IPPROTO_UDP;
2805         ip_hdr(skb)->saddr = src;
2806         ip_hdr(skb)->daddr = dst;
2807
2808         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2809
2810         memset(&fl4, 0, sizeof(fl4));
2811         fl4.daddr = dst;
2812         fl4.saddr = src;
2813         fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
2814         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2815         fl4.flowi4_mark = mark;
2816         fl4.flowi4_uid = uid;
2817
2818         rcu_read_lock();
2819
2820         if (iif) {
2821                 struct net_device *dev;
2822
2823                 dev = dev_get_by_index_rcu(net, iif);
2824                 if (!dev) {
2825                         err = -ENODEV;
2826                         goto errout_free;
2827                 }
2828
2829                 skb->protocol   = htons(ETH_P_IP);
2830                 skb->dev        = dev;
2831                 skb->mark       = mark;
2832                 err = ip_route_input_rcu(skb, dst, src,
2833                                          rtm->rtm_tos & IPTOS_RT_MASK, dev,
2834                                          &res);
2835
2836                 rt = skb_rtable(skb);
2837                 if (err == 0 && rt->dst.error)
2838                         err = -rt->dst.error;
2839         } else {
2840                 fl4.flowi4_iif = LOOPBACK_IFINDEX;
2841                 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
2842                 err = 0;
2843                 if (IS_ERR(rt))
2844                         err = PTR_ERR(rt);
2845                 else
2846                         skb_dst_set(skb, &rt->dst);
2847         }
2848
2849         if (err)
2850                 goto errout_free;
2851
2852         if (rtm->rtm_flags & RTM_F_NOTIFY)
2853                 rt->rt_flags |= RTCF_NOTIFY;
2854
2855         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2856                 table_id = rt->rt_table_id;
2857
2858         if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2859                 if (!res.fi) {
2860                         err = fib_props[res.type].error;
2861                         if (!err)
2862                                 err = -EHOSTUNREACH;
2863                         goto errout_free;
2864                 }
2865                 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2866                                     nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2867                                     rt->rt_type, res.prefix, res.prefixlen,
2868                                     fl4.flowi4_tos, res.fi, 0);
2869         } else {
2870                 err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2871                                    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
2872         }
2873         if (err < 0)
2874                 goto errout_free;
2875
2876         rcu_read_unlock();
2877
2878         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2879 errout:
2880         return err;
2881
2882 errout_free:
2883         rcu_read_unlock();
2884         kfree_skb(skb);
2885         goto errout;
2886 }
2887
2888 void ip_rt_multicast_event(struct in_device *in_dev)
2889 {
2890         rt_cache_flush(dev_net(in_dev->dev));
2891 }
2892
2893 #ifdef CONFIG_SYSCTL
2894 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2895 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2896 static int ip_rt_gc_elasticity __read_mostly    = 8;
2897 static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;
2898
2899 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2900                                         void __user *buffer,
2901                                         size_t *lenp, loff_t *ppos)
2902 {
2903         struct net *net = (struct net *)__ctl->extra1;
2904
2905         if (write) {
2906                 rt_cache_flush(net);
2907                 fnhe_genid_bump(net);
2908                 return 0;
2909         }
2910
2911         return -EINVAL;
2912 }
2913
2914 static struct ctl_table ipv4_route_table[] = {
2915         {
2916                 .procname       = "gc_thresh",
2917                 .data           = &ipv4_dst_ops.gc_thresh,
2918                 .maxlen         = sizeof(int),
2919                 .mode           = 0644,
2920                 .proc_handler   = proc_dointvec,
2921         },
2922         {
2923                 .procname       = "max_size",
2924                 .data           = &ip_rt_max_size,
2925                 .maxlen         = sizeof(int),
2926                 .mode           = 0644,
2927                 .proc_handler   = proc_dointvec,
2928         },
2929         {
2930                 /*  Deprecated. Use gc_min_interval_ms */
2931
2932                 .procname       = "gc_min_interval",
2933                 .data           = &ip_rt_gc_min_interval,
2934                 .maxlen         = sizeof(int),
2935                 .mode           = 0644,
2936                 .proc_handler   = proc_dointvec_jiffies,
2937         },
2938         {
2939                 .procname       = "gc_min_interval_ms",
2940                 .data           = &ip_rt_gc_min_interval,
2941                 .maxlen         = sizeof(int),
2942                 .mode           = 0644,
2943                 .proc_handler   = proc_dointvec_ms_jiffies,
2944         },
2945         {
2946                 .procname       = "gc_timeout",
2947                 .data           = &ip_rt_gc_timeout,
2948                 .maxlen         = sizeof(int),
2949                 .mode           = 0644,
2950                 .proc_handler   = proc_dointvec_jiffies,
2951         },
2952         {
2953                 .procname       = "gc_interval",
2954                 .data           = &ip_rt_gc_interval,
2955                 .maxlen         = sizeof(int),
2956                 .mode           = 0644,
2957                 .proc_handler   = proc_dointvec_jiffies,
2958         },
2959         {
2960                 .procname       = "redirect_load",
2961                 .data           = &ip_rt_redirect_load,
2962                 .maxlen         = sizeof(int),
2963                 .mode           = 0644,
2964                 .proc_handler   = proc_dointvec,
2965         },
2966         {
2967                 .procname       = "redirect_number",
2968                 .data           = &ip_rt_redirect_number,
2969                 .maxlen         = sizeof(int),
2970                 .mode           = 0644,
2971                 .proc_handler   = proc_dointvec,
2972         },
2973         {
2974                 .procname       = "redirect_silence",
2975                 .data           = &ip_rt_redirect_silence,
2976                 .maxlen         = sizeof(int),
2977                 .mode           = 0644,
2978                 .proc_handler   = proc_dointvec,
2979         },
2980         {
2981                 .procname       = "error_cost",
2982                 .data           = &ip_rt_error_cost,
2983                 .maxlen         = sizeof(int),
2984                 .mode           = 0644,
2985                 .proc_handler   = proc_dointvec,
2986         },
2987         {
2988                 .procname       = "error_burst",
2989                 .data           = &ip_rt_error_burst,
2990                 .maxlen         = sizeof(int),
2991                 .mode           = 0644,
2992                 .proc_handler   = proc_dointvec,
2993         },
2994         {
2995                 .procname       = "gc_elasticity",
2996                 .data           = &ip_rt_gc_elasticity,
2997                 .maxlen         = sizeof(int),
2998                 .mode           = 0644,
2999                 .proc_handler   = proc_dointvec,
3000         },
3001         {
3002                 .procname       = "mtu_expires",
3003                 .data           = &ip_rt_mtu_expires,
3004                 .maxlen         = sizeof(int),
3005                 .mode           = 0644,
3006                 .proc_handler   = proc_dointvec_jiffies,
3007         },
3008         {
3009                 .procname       = "min_pmtu",
3010                 .data           = &ip_rt_min_pmtu,
3011                 .maxlen         = sizeof(int),
3012                 .mode           = 0644,
3013                 .proc_handler   = proc_dointvec_minmax,
3014                 .extra1         = &ip_min_valid_pmtu,
3015         },
3016         {
3017                 .procname       = "min_adv_mss",
3018                 .data           = &ip_rt_min_advmss,
3019                 .maxlen         = sizeof(int),
3020                 .mode           = 0644,
3021                 .proc_handler   = proc_dointvec,
3022         },
3023         { }
3024 };
3025
3026 static struct ctl_table ipv4_route_flush_table[] = {
3027         {
3028                 .procname       = "flush",
3029                 .maxlen         = sizeof(int),
3030                 .mode           = 0200,
3031                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3032         },
3033         { },
3034 };
3035
3036 static __net_init int sysctl_route_net_init(struct net *net)
3037 {
3038         struct ctl_table *tbl;
3039
3040         tbl = ipv4_route_flush_table;
3041         if (!net_eq(net, &init_net)) {
3042                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3043                 if (!tbl)
3044                         goto err_dup;
3045
3046                 /* Don't export sysctls to unprivileged users */
3047                 if (net->user_ns != &init_user_ns)
3048                         tbl[0].procname = NULL;
3049         }
3050         tbl[0].extra1 = net;
3051
3052         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3053         if (!net->ipv4.route_hdr)
3054                 goto err_reg;
3055         return 0;
3056
3057 err_reg:
3058         if (tbl != ipv4_route_flush_table)
3059                 kfree(tbl);
3060 err_dup:
3061         return -ENOMEM;
3062 }
3063
3064 static __net_exit void sysctl_route_net_exit(struct net *net)
3065 {
3066         struct ctl_table *tbl;
3067
3068         tbl = net->ipv4.route_hdr->ctl_table_arg;
3069         unregister_net_sysctl_table(net->ipv4.route_hdr);
3070         BUG_ON(tbl == ipv4_route_flush_table);
3071         kfree(tbl);
3072 }
3073
3074 static __net_initdata struct pernet_operations sysctl_route_ops = {
3075         .init = sysctl_route_net_init,
3076         .exit = sysctl_route_net_exit,
3077 };
3078 #endif
3079
3080 static __net_init int rt_genid_init(struct net *net)
3081 {
3082         atomic_set(&net->ipv4.rt_genid, 0);
3083         atomic_set(&net->fnhe_genid, 0);
3084         atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3085         return 0;
3086 }
3087
3088 static __net_initdata struct pernet_operations rt_genid_ops = {
3089         .init = rt_genid_init,
3090 };
3091
3092 static int __net_init ipv4_inetpeer_init(struct net *net)
3093 {
3094         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3095
3096         if (!bp)
3097                 return -ENOMEM;
3098         inet_peer_base_init(bp);
3099         net->ipv4.peers = bp;
3100         return 0;
3101 }
3102
3103 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3104 {
3105         struct inet_peer_base *bp = net->ipv4.peers;
3106
3107         net->ipv4.peers = NULL;
3108         inetpeer_invalidate_tree(bp);
3109         kfree(bp);
3110 }
3111
3112 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3113         .init   =       ipv4_inetpeer_init,
3114         .exit   =       ipv4_inetpeer_exit,
3115 };
3116
3117 #ifdef CONFIG_IP_ROUTE_CLASSID
3118 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3119 #endif /* CONFIG_IP_ROUTE_CLASSID */
3120
3121 int __init ip_rt_init(void)
3122 {
3123         void *idents_hash;
3124         int rc = 0;
3125         int cpu;
3126
3127         /* For modern hosts, this will use 2 MB of memory */
3128         idents_hash = alloc_large_system_hash("IP idents",
3129                                               sizeof(*ip_idents) + sizeof(*ip_tstamps),
3130                                               0,
3131                                               16, /* one bucket per 64 KB */
3132                                               HASH_ZERO,
3133                                               NULL,
3134                                               &ip_idents_mask,
3135                                               2048,
3136                                               256*1024);
3137
3138         ip_idents = idents_hash;
3139
3140         prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
3141
3142         ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
3143
3144         for_each_possible_cpu(cpu) {
3145                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3146
3147                 INIT_LIST_HEAD(&ul->head);
3148                 spin_lock_init(&ul->lock);
3149         }
3150 #ifdef CONFIG_IP_ROUTE_CLASSID
3151         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3152         if (!ip_rt_acct)
3153                 panic("IP: failed to allocate ip_rt_acct\n");
3154 #endif
3155
3156         ipv4_dst_ops.kmem_cachep =
3157                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3158                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3159
3160         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3161
3162         if (dst_entries_init(&ipv4_dst_ops) < 0)
3163                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3164
3165         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3166                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3167
3168         ipv4_dst_ops.gc_thresh = ~0;
3169         ip_rt_max_size = INT_MAX;
3170
3171         devinet_init();
3172         ip_fib_init();
3173
3174         if (ip_rt_proc_init())
3175                 pr_err("Unable to create route proc files\n");
3176 #ifdef CONFIG_XFRM
3177         xfrm_init();
3178         xfrm4_init();
3179 #endif
3180         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3181                       RTNL_FLAG_DOIT_UNLOCKED);
3182
3183 #ifdef CONFIG_SYSCTL
3184         register_pernet_subsys(&sysctl_route_ops);
3185 #endif
3186         register_pernet_subsys(&rt_genid_ops);
3187         register_pernet_subsys(&ipv4_inetpeer_ops);
3188         return rc;
3189 }
3190
3191 #ifdef CONFIG_SYSCTL
3192 /*
3193  * We really need to sanitize the damn ipv4 init order, then all
3194  * this nonsense will go away.
3195  */
3196 void __init ip_static_sysctl_init(void)
3197 {
3198         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3199 }
3200 #endif