GNU Linux-libre 4.19.211-gnu1
[releases.git] / net/ipv4/route.c
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#include "fib_lookup.h"

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;

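/* Editorial note, not part of the original file: most of the tunables
 * above are exposed through sysctl (see ipv4_route_table later in this
 * file), so they can be inspected and adjusted at runtime, e.g.:
 *
 *      # sysctl net.ipv4.route.min_pmtu
 *      net.ipv4.route.min_pmtu = 552
 *      # sysctl -w net.ipv4.route.mtu_expires=300
 *
 * The "512 + 20 + 20" initializer spells out where the 552-byte minimum
 * PMTU comes from: 512 bytes of payload plus 20 bytes each for the IP
 * and TCP headers.
 */
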
/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu,
                                           bool confirm_neigh);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void              ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
        .confirm_neigh =        ipv4_confirm_neigh,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
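
/* Editorial note, not part of the original file: ip_tos2prio is indexed
 * with the TOS byte shifted right by one, which is why each TC_PRIO_*
 * value appears twice (once plain, once via ECN_OR_COST()). A minimal
 * sketch of the lookup, modelled on the rt_tos2priority() helper in
 * <net/route.h>:
 *
 *      static inline char example_tos2priority(u8 tos)
 *      {
 *              return ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *      }
 *
 * For instance, IPTOS_LOWDELAY (0x10) indexes entry 8 and yields
 * TC_PRIO_INTERACTIVE.
 */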

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};
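
/* Editorial note, not part of the original file: the IPv4 routing cache
 * was removed in Linux 3.6, so rt_cache_seq_show() only ever prints the
 * header line; /proc/net/rt_cache is kept as a permanently empty table
 * for userspace compatibility:
 *
 *      $ cat /proc/net/rt_cache
 *      Iface   Destination     Gateway ...     (header only, no entries)
 */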


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        (*pos)++;
        return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq, "%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", 0444, net->proc_net,
                          &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", 0444,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create_single("rt_acct", 0, net->proc_net,
                        rt_acct_proc_show);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}
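
/* Editorial note, not part of the original file: a flush is O(1). Bumping
 * the per-namespace generation id does not walk or free anything; each
 * cached rtable recorded the genid it was created under, so after a bump
 * rt_is_expired() returns true and ipv4_dst_check() (further down) rejects
 * the entry the next time it is validated, letting stale routes age out
 * lazily.
 */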

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;
        struct neighbour *n;

        rt = (const struct rtable *) dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *) &rt->rt_gateway;
        else if (skb)
                pkey = &ip_hdr(skb)->daddr;

        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
        if (n)
                return n;
        return neigh_create(&arp_tbl, pkey, dev);
}

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;

        rt = (const struct rtable *)dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *)&rt->rt_gateway;
        else if (!daddr ||
                 (rt->rt_flags &
                  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
                return;

        __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

/* Hash tables of size 2048..262144 depending on RAM size.
 * Each bucket uses 8 bytes.
 */
static u32 ip_idents_mask __read_mostly;
static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
        u32 bucket, old, now = (u32)jiffies;
        atomic_t *p_id;
        u32 *p_tstamp;
        u32 delta = 0;

        bucket = hash & ip_idents_mask;
        p_tstamp = ip_tstamps + bucket;
        p_id = ip_idents + bucket;
        old = READ_ONCE(*p_tstamp);

        if (old != now && cmpxchg(p_tstamp, old, now) == old)
                delta = prandom_u32_max(now - old);

        /* If UBSAN reports an error here, make sure your compiler
         * supports -fno-strict-overflow before reporting it: that was
         * a bug in UBSAN, and it has been fixed in GCC 8.
         */
        return atomic_add_return(segs + delta, p_id) - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
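
/* Editorial note, not part of the original file: a worked example of the
 * perturbation. Suppose a bucket was last touched 250 jiffies ago and a
 * GSO packet now needs segs = 4 ids from it. The cmpxchg() publishes the
 * new timestamp, delta becomes prandom_u32_max(250), i.e. a uniform value
 * in [0, 250), and the block of ids handed out starts delta past the old
 * counter value. An observer sampling IP ids therefore cannot count
 * exactly how many packets were sent between two probes.
 */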

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
        u32 hash, id;

        /* Note the following code is not safe, but this is okay. */
        if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
                get_random_bytes(&net->ipv4.ip_id_key,
                                 sizeof(net->ipv4.ip_id_key));

        hash = siphash_3u32((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol,
                            &net->ipv4.ip_id_key);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
                             const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0,
                           sock_net_uid(net, sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct net *net = dev_net(skb->dev);
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
}

static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
        struct fib_nh_exception *fnhe, *oldest = NULL;

        for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
                fnhe = rcu_dereference_protected(*fnhe_p,
                                                 lockdep_is_held(&fnhe_lock));
                if (!fnhe)
                        break;
                if (!oldest ||
                    time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
                        oldest = fnhe;
                        oldest_p = fnhe_p;
                }
        }
        fnhe_flush_routes(oldest);
        *oldest_p = oldest->fnhe_next;
        kfree_rcu(oldest, rcu);
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
        static u32 fnhe_hashrnd __read_mostly;
        u32 hval;

        net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
        hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
        return hash_32(hval, FNHE_HASH_SHIFT);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_gateway = fnhe->fnhe_gw;
                rt->rt_uses_gateway = 1;
        }
}

static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                                  u32 pmtu, bool lock, unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        u32 genid, hval;
        unsigned int i;
        int depth;

        genid = fnhe_genid(dev_net(nh->nh_dev));
        hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference(nh->nh_exceptions);
        if (!hash) {
                hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                rcu_assign_pointer(nh->nh_exceptions, hash);
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (fnhe->fnhe_genid != genid)
                        fnhe->fnhe_genid = genid;
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_mtu_locked = lock;
                }
                fnhe->fnhe_expires = max(1UL, expires);
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                /* Randomize the max depth to mitigate side-channel attacks. */
                int max_depth = FNHE_RECLAIM_DEPTH +
                                prandom_u32_max(FNHE_RECLAIM_DEPTH);

                while (depth > max_depth) {
                        fnhe_remove_oldest(hash);
                        depth--;
                }

                fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                if (!fnhe)
                        goto out_unlock;

                fnhe->fnhe_next = hash->chain;

                fnhe->fnhe_genid = genid;
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_mtu_locked = lock;
                fnhe->fnhe_expires = max(1UL, expires);

                rcu_assign_pointer(hash->chain, fnhe);

                /* Exception created; mark the cached routes for the nexthop
                 * stale, so anyone caching it rechecks if this exception
                 * applies to them.
                 */
                rt = rcu_dereference(nh->nh_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;

                        prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gateway != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
        if (!n)
                n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
        if (!IS_ERR(n)) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res, 0) == 0) {
                                struct fib_nh *nh;

                                fib_select_path(net, &res, fl4, skb);
                                nh = &FIB_RES_NH(res);
                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                                0, false,
                                                jiffies + ip_rt_gc_timeout);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct net *net = dev_net(skb->dev);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

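/* Editorial note, not part of the original file: with the defaults above,
 * ip_rt_redirect_load is HZ/50 (20 ms) and the n-th redirect is only sent
 * once jiffies pass rate_last + (ip_rt_redirect_load << n_redirects), so
 * successive gaps double: 40 ms, 80 ms, 160 ms, and so on. After
 * ip_rt_redirect_number (9) redirects the host is presumed deaf and
 * nothing more is sent until ip_rt_redirect_silence, (HZ/50) << 10, about
 * 20.5 seconds, elapses without a redirect-worthy packet.
 */
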
void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;
        int vif;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
                peer->rate_tokens = 0;
                peer->n_redirects = 0;
        }

        /* Too many ignored redirects; do not send anything.
         * Set peer->rate_last to the last seen redirected packet.
         */
        if (peer->n_redirects >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->n_redirects == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->n_redirects)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->n_redirects;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->n_redirects == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        if (netif_is_l3_master(skb->dev)) {
                dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
                if (!dev)
                        goto out;
        }

        in_dev = __in_dev_get_rcu(dev);

        /* IP on this device is disabled. */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
                               l3mdev_master_ifindex(skb->dev), 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}
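
/* Editorial note, not part of the original file: peer->rate_tokens above
 * implements a token bucket with time itself as the token source. Tokens
 * accrue at one per jiffy, are capped at ip_rt_error_burst (5 * HZ), and
 * each ICMP error costs ip_rt_error_cost (HZ) tokens, so a peer gets a
 * sustained rate of one ICMP error per second with bursts of up to five
 * after an idle period.
 */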

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct net *net = dev_net(dst->dev);
        u32 old_mtu = ipv4_mtu(dst);
        struct fib_result res;
        bool lock = false;

        if (ip_mtu_locked(dst))
                return;

        if (old_mtu < mtu)
                return;

        if (mtu < ip_rt_min_pmtu) {
                lock = true;
                mtu = min(old_mtu, ip_rt_min_pmtu);
        }

        if (rt->rt_pmtu == mtu && !lock &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(net, fl4, &res, 0) == 0) {
                struct fib_nh *nh;

                fib_select_path(net, &res, fl4, NULL);
                nh = &FIB_RES_NH(res);
                update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu,
                              bool confirm_neigh)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        if (!mark)
                mark = IP4_REPLY_MARK(net, skb->mark);

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
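
/* Editorial note, not part of the original file: a hypothetical caller-side
 * sketch. Handlers of ICMP "fragmentation needed" errors for flows without
 * a socket feed the advertised next-hop MTU back through this helper (the
 * real callers, e.g. the tunnel error paths, work on the embedded inner IP
 * header at skb->data):
 *
 *      const struct iphdr *iph = (const struct iphdr *)skb->data;
 *      const struct icmphdr *icmph = icmp_hdr(skb);
 *
 *      if (icmph->type == ICMP_DEST_UNREACH &&
 *          icmph->code == ICMP_FRAG_NEEDED)
 *              ipv4_update_pmtu(skb, dev_net(skb->dev),
 *                               ntohs(icmph->un.frag.mtu),
 *                               0, 0, iph->protocol, 0);
 */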

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!fl4.flowi4_mark)
                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;
        struct net *net = sock_net(sk);

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        if (odst->obsolete && !odst->ops->check(odst, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);

        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct net *net = sock_net(sk);

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPV4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * DST_OBSOLETE_DEAD by dst_free().
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
        struct ip_options opt;
        int res;

        /* Recompile ip options since IPCB may not be valid anymore.
         * Also check we have a reasonable ipv4 header.
         */
        if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
            ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
                return;

        memset(&opt, 0, sizeof(opt));
        if (ip_hdr(skb)->ihl > 5) {
                if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
                        return;
                opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);

                rcu_read_lock();
                res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
                rcu_read_unlock();

                if (res)
                        return;
        }
        __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        ipv4_send_dest_unreach(skb);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct flowi4 fl4;
                struct iphdr *iph;

                iph = ip_hdr(skb);

                memset(&fl4, 0, sizeof(fl4));
                fl4.daddr = iph->daddr;
                fl4.saddr = iph->saddr;
                fl4.flowi4_tos = RT_TOS(iph->tos);
                fl4.flowi4_oif = rt->dst.dev->ifindex;
                fl4.flowi4_iif = skb->dev->ifindex;
                fl4.flowi4_mark = skb->mark;

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
        unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
                                    ip_rt_min_advmss);

        return min(advmss, IPV4_MAX_PMTU - header_size);
}

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                goto out;

        mtu = READ_ONCE(dst->dev->mtu);

        if (unlikely(ip_mtu_locked(dst))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

out:
        mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

        return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe, __rcu **fnhe_p;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference_protected(nh->nh_exceptions,
                                         lockdep_is_held(&fnhe_lock));
        hash += hval;

        fnhe_p = &hash->chain;
        fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
        while (fnhe) {
                if (fnhe->fnhe_daddr == daddr) {
                        rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
                                fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
                        /* set fnhe_daddr to 0 to ensure it won't bind with
                         * new dsts in rt_bind_exception().
                         */
                        fnhe->fnhe_daddr = 0;
                        fnhe_flush_routes(fnhe);
                        kfree_rcu(fnhe, rcu);
                        break;
                }
                fnhe_p = &fnhe->fnhe_next;
                fnhe = rcu_dereference_protected(fnhe->fnhe_next,
                                                 lockdep_is_held(&fnhe_lock));
        }

        spin_unlock_bh(&fnhe_lock);
}

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
        struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr) {
                        if (fnhe->fnhe_expires &&
                            time_after(jiffies, fnhe->fnhe_expires)) {
                                ip_del_fnhe(nh, daddr);
                                break;
                        }
                        return fnhe;
                }
        }
        return NULL;
}

/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 */

u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
{
        struct fib_info *fi = res->fi;
        struct fib_nh *nh = &fi->fib_nh[res->nh_sel];
        struct net_device *dev = nh->nh_dev;
        u32 mtu = 0;

        if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
            fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
                mtu = fi->fib_mtu;

        if (likely(!mtu)) {
                struct fib_nh_exception *fnhe;

                fnhe = find_exception(nh, daddr);
                if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
                        mtu = fnhe->fnhe_pmtu;
        }

        if (likely(!mtu))
                mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);

        return mtu - lwtunnel_headroom(nh->nh_lwtstate, mtu);
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr, const bool do_cache)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe->fnhe_mtu_locked = false;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                if (!rt->rt_gateway)
                        rt->rt_gateway = daddr;

                if (do_cache) {
                        dst_hold(&rt->dst);
                        rcu_assign_pointer(*porig, rt);
                        if (orig) {
                                dst_dev_put(&orig->dst);
                                dst_release(&orig->dst);
                        }
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nh->nh_rth_input;
        } else {
                p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
        }
        orig = *p;

        /* hold dst before doing cmpxchg() to avoid race condition
         * on this dst
         */
        dst_hold(&rt->dst);
        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                if (orig) {
                        rt_add_uncached_list(orig);
                        dst_release(&orig->dst);
                }
        } else {
                dst_release(&rt->dst);
                ret = false;
        }

        return ret;
}
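
/* Editorial note, not part of the original file: rt_cache_route() above is
 * the usual lock-free publish pattern. The reference is taken *before* the
 * cmpxchg() so the route can never become visible to readers without a
 * hold. On success the displaced entry is parked on the uncached list
 * (readers may still hold it) and released; on failure (a concurrent
 * writer won the slot) the speculative hold is simply dropped and the
 * caller falls back to an uncached route.
 */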
1517
1518 struct uncached_list {
1519         spinlock_t              lock;
1520         struct list_head        head;
1521 };
1522
1523 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1524
1525 void rt_add_uncached_list(struct rtable *rt)
1526 {
1527         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1528
1529         rt->rt_uncached_list = ul;
1530
1531         spin_lock_bh(&ul->lock);
1532         list_add_tail(&rt->rt_uncached, &ul->head);
1533         spin_unlock_bh(&ul->lock);
1534 }
1535
1536 void rt_del_uncached_list(struct rtable *rt)
1537 {
1538         if (!list_empty(&rt->rt_uncached)) {
1539                 struct uncached_list *ul = rt->rt_uncached_list;
1540
1541                 spin_lock_bh(&ul->lock);
1542                 list_del(&rt->rt_uncached);
1543                 spin_unlock_bh(&ul->lock);
1544         }
1545 }
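
/* A route that could not be published in a per-nexthop cache slot is
 * tracked on this per-cpu list so rt_flush_dev() can still find it.
 * The usual pairing (see rt_set_nexthop() and ip_route_input_slow()):
 *
 *	if (unlikely(!rt_cache_route(nh, rt)))
 *		rt_add_uncached_list(rt);
 */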
1546
1547 static void ipv4_dst_destroy(struct dst_entry *dst)
1548 {
1549         struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1550         struct rtable *rt = (struct rtable *)dst;
1551
1552         if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
1553                 kfree(p);
1554
1555         rt_del_uncached_list(rt);
1556 }
1557
1558 void rt_flush_dev(struct net_device *dev)
1559 {
1560         struct net *net = dev_net(dev);
1561         struct rtable *rt;
1562         int cpu;
1563
1564         for_each_possible_cpu(cpu) {
1565                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1566
1567                 spin_lock_bh(&ul->lock);
1568                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1569                         if (rt->dst.dev != dev)
1570                                 continue;
1571                         rt->dst.dev = net->loopback_dev;
1572                         dev_hold(rt->dst.dev);
1573                         dev_put(dev);
1574                 }
1575                 spin_unlock_bh(&ul->lock);
1576         }
1577 }
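
/* rt_flush_dev() retargets uncached dsts at the loopback device so a
 * disappearing netdev can drop its references. The assumed call site
 * is the IPv4 netdevice notifier on NETDEV_UNREGISTER, roughly:
 *
 *	case NETDEV_UNREGISTER:
 *		fib_disable_ip(dev, event, true);
 *		rt_flush_dev(dev);
 */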
1578
1579 static bool rt_cache_valid(const struct rtable *rt)
1580 {
1581         return  rt &&
1582                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1583                 !rt_is_expired(rt);
1584 }
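
/* A cached dst stays valid only while its generation id matches the
 * per-netns value; rt_cache_flush() bumps that id, invalidating every
 * earlier entry in one step without walking them.
 */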
1585
1586 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1587                            const struct fib_result *res,
1588                            struct fib_nh_exception *fnhe,
1589                            struct fib_info *fi, u16 type, u32 itag,
1590                            const bool do_cache)
1591 {
1592         bool cached = false;
1593
1594         if (fi) {
1595                 struct fib_nh *nh = &FIB_RES_NH(*res);
1596
1597                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1598                         rt->rt_gateway = nh->nh_gw;
1599                         rt->rt_uses_gateway = 1;
1600                 }
1601                 dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1602                 if (fi->fib_metrics != &dst_default_metrics) {
1603                         rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1604                         refcount_inc(&fi->fib_metrics->refcnt);
1605                 }
1606 #ifdef CONFIG_IP_ROUTE_CLASSID
1607                 rt->dst.tclassid = nh->nh_tclassid;
1608 #endif
1609                 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1610                 if (unlikely(fnhe))
1611                         cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1612                 else if (do_cache)
1613                         cached = rt_cache_route(nh, rt);
1614                 if (unlikely(!cached)) {
1615                         /* Routes stored in a nexthop exception or
1616                          * FIB nexthop are tracked by the cache itself.
1617                          * If we failed to store this route there, set
1618                          * its gateway and track it on the uncached
1619                          * list instead.
1620                          */
1620                         if (!rt->rt_gateway)
1621                                 rt->rt_gateway = daddr;
1622                         rt_add_uncached_list(rt);
1623                 }
1624         } else
1625                 rt_add_uncached_list(rt);
1626
1627 #ifdef CONFIG_IP_ROUTE_CLASSID
1628 #ifdef CONFIG_IP_MULTIPLE_TABLES
1629         set_class_tag(rt, res->tclassid);
1630 #endif
1631         set_class_tag(rt, itag);
1632 #endif
1633 }
1634
1635 struct rtable *rt_dst_alloc(struct net_device *dev,
1636                             unsigned int flags, u16 type,
1637                             bool nopolicy, bool noxfrm, bool will_cache)
1638 {
1639         struct rtable *rt;
1640
1641         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1642                        (will_cache ? 0 : DST_HOST) |
1643                        (nopolicy ? DST_NOPOLICY : 0) |
1644                        (noxfrm ? DST_NOXFRM : 0));
1645
1646         if (rt) {
1647                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1648                 rt->rt_flags = flags;
1649                 rt->rt_type = type;
1650                 rt->rt_is_input = 0;
1651                 rt->rt_iif = 0;
1652                 rt->rt_pmtu = 0;
1653                 rt->rt_mtu_locked = 0;
1654                 rt->rt_gateway = 0;
1655                 rt->rt_uses_gateway = 0;
1656                 INIT_LIST_HEAD(&rt->rt_uncached);
1657
1658                 rt->dst.output = ip_output;
1659                 if (flags & RTCF_LOCAL)
1660                         rt->dst.input = ip_local_deliver;
1661         }
1662
1663         return rt;
1664 }
1665 EXPORT_SYMBOL(rt_dst_alloc);
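
/* Minimal rt_dst_alloc() usage sketch (illustrative; real callers pick
 * flags, type and the caching hints per path, see ip_route_input_mc()
 * just below):
 *
 *	rt = rt_dst_alloc(dev, RTCF_LOCAL, RTN_LOCAL, false, false, false);
 *	if (!rt)
 *		return -ENOBUFS;
 */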
1666
1667 /* called in rcu_read_lock() section */
1668 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1669                           u8 tos, struct net_device *dev,
1670                           struct in_device *in_dev, u32 *itag)
1671 {
1672         int err;
1673
1674         /* Primary sanity checks. */
1675         if (!in_dev)
1676                 return -EINVAL;
1677
1678         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1679             skb->protocol != htons(ETH_P_IP))
1680                 return -EINVAL;
1681
1682         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1683                 return -EINVAL;
1684
1685         if (ipv4_is_zeronet(saddr)) {
1686                 if (!ipv4_is_local_multicast(daddr))
1687                         return -EINVAL;
1688         } else {
1689                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1690                                           in_dev, itag);
1691                 if (err < 0)
1692                         return err;
1693         }
1694         return 0;
1695 }
1696
1697 /* called in rcu_read_lock() section */
1698 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1699                              u8 tos, struct net_device *dev, int our)
1700 {
1701         struct in_device *in_dev = __in_dev_get_rcu(dev);
1702         unsigned int flags = RTCF_MULTICAST;
1703         struct rtable *rth;
1704         u32 itag = 0;
1705         int err;
1706
1707         err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1708         if (err)
1709                 return err;
1710
1711         if (our)
1712                 flags |= RTCF_LOCAL;
1713
1714         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1715                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1716         if (!rth)
1717                 return -ENOBUFS;
1718
1719 #ifdef CONFIG_IP_ROUTE_CLASSID
1720         rth->dst.tclassid = itag;
1721 #endif
1722         rth->dst.output = ip_rt_bug;
1723         rth->rt_is_input = 1;
1724
1725 #ifdef CONFIG_IP_MROUTE
1726         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1727                 rth->dst.input = ip_mr_input;
1728 #endif
1729         RT_CACHE_STAT_INC(in_slow_mc);
1730
1731         skb_dst_set(skb, &rth->dst);
1732         return 0;
1733 }
1734
1735
1736 static void ip_handle_martian_source(struct net_device *dev,
1737                                      struct in_device *in_dev,
1738                                      struct sk_buff *skb,
1739                                      __be32 daddr,
1740                                      __be32 saddr)
1741 {
1742         RT_CACHE_STAT_INC(in_martian_src);
1743 #ifdef CONFIG_IP_ROUTE_VERBOSE
1744         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1745                 /*
1746                  *      RFC 1812 recommendation: if the source is martian,
1747                  *      the only hint we can log is the MAC header.
1748                  */
1749                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1750                         &daddr, &saddr, dev->name);
1751                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1752                         print_hex_dump(KERN_WARNING, "ll header: ",
1753                                        DUMP_PREFIX_OFFSET, 16, 1,
1754                                        skb_mac_header(skb),
1755                                        dev->hard_header_len, true);
1756                 }
1757         }
1758 #endif
1759 }
1760
1761 /* called in rcu_read_lock() section */
1762 static int __mkroute_input(struct sk_buff *skb,
1763                            const struct fib_result *res,
1764                            struct in_device *in_dev,
1765                            __be32 daddr, __be32 saddr, u32 tos)
1766 {
1767         struct fib_nh_exception *fnhe;
1768         struct rtable *rth;
1769         int err;
1770         struct in_device *out_dev;
1771         bool do_cache;
1772         u32 itag = 0;
1773
1774         /* get a working reference to the output device */
1775         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1776         if (!out_dev) {
1777                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1778                 return -EINVAL;
1779         }
1780
1781         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1782                                   in_dev->dev, in_dev, &itag);
1783         if (err < 0) {
1784                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1785                                          saddr);
1786
1787                 goto cleanup;
1788         }
1789
1790         do_cache = res->fi && !itag;
1791         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1792             skb->protocol == htons(ETH_P_IP) &&
1793             (IN_DEV_SHARED_MEDIA(out_dev) ||
1794              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1795                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1796
1797         if (skb->protocol != htons(ETH_P_IP)) {
1798                 /* Not IP (i.e. ARP). Do not create a route if it is
1799                  * invalid for proxy ARP. DNAT routes are always valid.
1800                  *
1801                  * The proxy ARP feature has been extended to allow ARP
1802                  * replies back out the same interface, to support
1803                  * private VLAN switch technologies. See arp.c.
1804                  */
1805                 if (out_dev == in_dev &&
1806                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1807                         err = -EINVAL;
1808                         goto cleanup;
1809                 }
1810         }
1811
1812         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1813         if (do_cache) {
1814                 if (fnhe)
1815                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1816                 else
1817                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1818                 if (rt_cache_valid(rth)) {
1819                         skb_dst_set_noref(skb, &rth->dst);
1820                         goto out;
1821                 }
1822         }
1823
1824         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1825                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1826                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1827         if (!rth) {
1828                 err = -ENOBUFS;
1829                 goto cleanup;
1830         }
1831
1832         rth->rt_is_input = 1;
1833         RT_CACHE_STAT_INC(in_slow_tot);
1834
1835         rth->dst.input = ip_forward;
1836
1837         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1838                        do_cache);
1839         lwtunnel_set_redirect(&rth->dst);
1840         skb_dst_set(skb, &rth->dst);
1841 out:
1842         err = 0;
1843  cleanup:
1844         return err;
1845 }
1846
1847 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1848 /* To make ICMP error packets follow the same path as the flow they
1849  * refer to, the multipath hash is calculated from the inner IP addresses.
1850  */
1851 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1852                                  struct flow_keys *hash_keys)
1853 {
1854         const struct iphdr *outer_iph = ip_hdr(skb);
1855         const struct iphdr *key_iph = outer_iph;
1856         const struct iphdr *inner_iph;
1857         const struct icmphdr *icmph;
1858         struct iphdr _inner_iph;
1859         struct icmphdr _icmph;
1860
1861         if (likely(outer_iph->protocol != IPPROTO_ICMP))
1862                 goto out;
1863
1864         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1865                 goto out;
1866
1867         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1868                                    &_icmph);
1869         if (!icmph)
1870                 goto out;
1871
1872         if (icmph->type != ICMP_DEST_UNREACH &&
1873             icmph->type != ICMP_REDIRECT &&
1874             icmph->type != ICMP_TIME_EXCEEDED &&
1875             icmph->type != ICMP_PARAMETERPROB)
1876                 goto out;
1877
1878         inner_iph = skb_header_pointer(skb,
1879                                        outer_iph->ihl * 4 + sizeof(_icmph),
1880                                        sizeof(_inner_iph), &_inner_iph);
1881         if (!inner_iph)
1882                 goto out;
1883
1884         key_iph = inner_iph;
1885 out:
1886         hash_keys->addrs.v4addrs.src = key_iph->saddr;
1887         hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1888 }
1889
1890 /* If skb is set it will be used and fl4 can be NULL */
1891 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1892                        const struct sk_buff *skb, struct flow_keys *flkeys)
1893 {
1894         struct flow_keys hash_keys;
1895         u32 mhash;
1896
1897         switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1898         case 0:
1899                 memset(&hash_keys, 0, sizeof(hash_keys));
1900                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1901                 if (skb) {
1902                         ip_multipath_l3_keys(skb, &hash_keys);
1903                 } else {
1904                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1905                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1906                 }
1907                 break;
1908         case 1:
1909                 /* skb is currently provided only when forwarding */
1910                 if (skb) {
1911                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1912                         struct flow_keys keys;
1913
1914                         /* short-circuit if we already have L4 hash present */
1915                         if (skb->l4_hash)
1916                                 return skb_get_hash_raw(skb) >> 1;
1917
1918                         memset(&hash_keys, 0, sizeof(hash_keys));
1919
1920                         if (!flkeys) {
1921                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
1922                                 flkeys = &keys;
1923                         }
1924
1925                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1926                         hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1927                         hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1928                         hash_keys.ports.src = flkeys->ports.src;
1929                         hash_keys.ports.dst = flkeys->ports.dst;
1930                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1931                 } else {
1932                         memset(&hash_keys, 0, sizeof(hash_keys));
1933                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1934                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1935                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1936                         hash_keys.ports.src = fl4->fl4_sport;
1937                         hash_keys.ports.dst = fl4->fl4_dport;
1938                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
1939                 }
1940                 break;
1941         }
1942         mhash = flow_hash_from_keys(&hash_keys);
1943
1944         return mhash >> 1;
1945 }
1946 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
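
/* The multipath hash feeds nexthop selection. A consumer sketch (see
 * ip_mkroute_input() below and fib_select_path() in fib_semantics.c):
 *
 *	int h = fib_multipath_hash(net, fl4, skb, NULL);
 *
 *	fib_select_multipath(res, h);
 */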
1947
1948 static int ip_mkroute_input(struct sk_buff *skb,
1949                             struct fib_result *res,
1950                             struct in_device *in_dev,
1951                             __be32 daddr, __be32 saddr, u32 tos,
1952                             struct flow_keys *hkeys)
1953 {
1954 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1955         if (res->fi && res->fi->fib_nhs > 1) {
1956                 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
1957
1958                 fib_select_multipath(res, h);
1959         }
1960 #endif
1961
1962         /* create a routing cache entry */
1963         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1964 }
1965
1966 /*
1967  *      NOTE. We drop all packets that have a local source address,
1968  *      because every properly looped-back packet must already have
1969  *      the correct destination attached by the output routine.
1970  *
1971  *      This approach solves two big problems:
1972  *      1. Non-simplex devices are handled properly.
1973  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1974  *      Called with rcu_read_lock().
1975  */
1976
1977 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1978                                u8 tos, struct net_device *dev,
1979                                struct fib_result *res)
1980 {
1981         struct in_device *in_dev = __in_dev_get_rcu(dev);
1982         struct flow_keys *flkeys = NULL, _flkeys;
1983         struct net    *net = dev_net(dev);
1984         struct ip_tunnel_info *tun_info;
1985         int             err = -EINVAL;
1986         unsigned int    flags = 0;
1987         u32             itag = 0;
1988         struct rtable   *rth;
1989         struct flowi4   fl4;
1990         bool do_cache = true;
1991
1992         /* IP on this device is disabled. */
1993
1994         if (!in_dev)
1995                 goto out;
1996
1997         /* Check for the weirdest martians, which cannot be detected
1998            by fib_lookup.
1999          */
2000
2001         tun_info = skb_tunnel_info(skb);
2002         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2003                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2004         else
2005                 fl4.flowi4_tun_key.tun_id = 0;
2006         skb_dst_drop(skb);
2007
2008         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2009                 goto martian_source;
2010
2011         res->fi = NULL;
2012         res->table = NULL;
2013         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2014                 goto brd_input;
2015
2016         /* Accept zero addresses only for limited broadcast;
2017          * whether this needs fixing is unclear. Waiting for complaints :-)
2018          */
2019         if (ipv4_is_zeronet(saddr))
2020                 goto martian_source;
2021
2022         if (ipv4_is_zeronet(daddr))
2023                 goto martian_destination;
2024
2025         /* The following code avoids calling IN_DEV_NET_ROUTE_LOCALNET()
2026          * more than once when daddr and/or saddr is a loopback address.
2027          */
2028         if (ipv4_is_loopback(daddr)) {
2029                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2030                         goto martian_destination;
2031         } else if (ipv4_is_loopback(saddr)) {
2032                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2033                         goto martian_source;
2034         }
2035
2036         /*
2037          *      Now we are ready to route the packet.
2038          */
2039         fl4.flowi4_oif = 0;
2040         fl4.flowi4_iif = dev->ifindex;
2041         fl4.flowi4_mark = skb->mark;
2042         fl4.flowi4_tos = tos;
2043         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2044         fl4.flowi4_flags = 0;
2045         fl4.daddr = daddr;
2046         fl4.saddr = saddr;
2047         fl4.flowi4_uid = sock_net_uid(net, NULL);
2048
2049         if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2050                 flkeys = &_flkeys;
2051         } else {
2052                 fl4.flowi4_proto = 0;
2053                 fl4.fl4_sport = 0;
2054                 fl4.fl4_dport = 0;
2055         }
2056
2057         err = fib_lookup(net, &fl4, res, 0);
2058         if (err != 0) {
2059                 if (!IN_DEV_FORWARD(in_dev))
2060                         err = -EHOSTUNREACH;
2061                 goto no_route;
2062         }
2063
2064         if (res->type == RTN_BROADCAST) {
2065                 if (IN_DEV_BFORWARD(in_dev))
2066                         goto make_route;
2067                 /* do not cache if bc_forwarding is enabled */
2068                 if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2069                         do_cache = false;
2070                 goto brd_input;
2071         }
2072
2073         if (res->type == RTN_LOCAL) {
2074                 err = fib_validate_source(skb, saddr, daddr, tos,
2075                                           0, dev, in_dev, &itag);
2076                 if (err < 0)
2077                         goto martian_source;
2078                 goto local_input;
2079         }
2080
2081         if (!IN_DEV_FORWARD(in_dev)) {
2082                 err = -EHOSTUNREACH;
2083                 goto no_route;
2084         }
2085         if (res->type != RTN_UNICAST)
2086                 goto martian_destination;
2087
2088 make_route:
2089         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2090 out:    return err;
2091
2092 brd_input:
2093         if (skb->protocol != htons(ETH_P_IP))
2094                 goto e_inval;
2095
2096         if (!ipv4_is_zeronet(saddr)) {
2097                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2098                                           in_dev, &itag);
2099                 if (err < 0)
2100                         goto martian_source;
2101         }
2102         flags |= RTCF_BROADCAST;
2103         res->type = RTN_BROADCAST;
2104         RT_CACHE_STAT_INC(in_brd);
2105
2106 local_input:
2107         do_cache &= res->fi && !itag;
2108         if (do_cache) {
2109                 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
2110                 if (rt_cache_valid(rth)) {
2111                         skb_dst_set_noref(skb, &rth->dst);
2112                         err = 0;
2113                         goto out;
2114                 }
2115         }
2116
2117         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2118                            flags | RTCF_LOCAL, res->type,
2119                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2120         if (!rth)
2121                 goto e_nobufs;
2122
2123         rth->dst.output = ip_rt_bug;
2124 #ifdef CONFIG_IP_ROUTE_CLASSID
2125         rth->dst.tclassid = itag;
2126 #endif
2127         rth->rt_is_input = 1;
2128
2129         RT_CACHE_STAT_INC(in_slow_tot);
2130         if (res->type == RTN_UNREACHABLE) {
2131                 rth->dst.input = ip_error;
2132                 rth->dst.error = -err;
2133                 rth->rt_flags &= ~RTCF_LOCAL;
2134         }
2135
2136         if (do_cache) {
2137                 struct fib_nh *nh = &FIB_RES_NH(*res);
2138
2139                 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2140                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2141                         WARN_ON(rth->dst.input == lwtunnel_input);
2142                         rth->dst.lwtstate->orig_input = rth->dst.input;
2143                         rth->dst.input = lwtunnel_input;
2144                 }
2145
2146                 if (unlikely(!rt_cache_route(nh, rth)))
2147                         rt_add_uncached_list(rth);
2148         }
2149         skb_dst_set(skb, &rth->dst);
2150         err = 0;
2151         goto out;
2152
2153 no_route:
2154         RT_CACHE_STAT_INC(in_no_route);
2155         res->type = RTN_UNREACHABLE;
2156         res->fi = NULL;
2157         res->table = NULL;
2158         goto local_input;
2159
2160         /*
2161          *      Do not cache martian addresses: they should be logged (RFC 1812)
2162          */
2163 martian_destination:
2164         RT_CACHE_STAT_INC(in_martian_dst);
2165 #ifdef CONFIG_IP_ROUTE_VERBOSE
2166         if (IN_DEV_LOG_MARTIANS(in_dev))
2167                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2168                                      &daddr, &saddr, dev->name);
2169 #endif
2170
2171 e_inval:
2172         err = -EINVAL;
2173         goto out;
2174
2175 e_nobufs:
2176         err = -ENOBUFS;
2177         goto out;
2178
2179 martian_source:
2180         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2181         goto out;
2182 }
2183
2184 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2185                          u8 tos, struct net_device *dev)
2186 {
2187         struct fib_result res;
2188         int err;
2189
2190         tos &= IPTOS_RT_MASK;
2191         rcu_read_lock();
2192         err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2193         rcu_read_unlock();
2194
2195         return err;
2196 }
2197 EXPORT_SYMBOL(ip_route_input_noref);
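
/* Typical ingress usage, a sketch along the lines of ip_rcv_finish():
 *
 *	err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				   iph->tos, skb->dev);
 *	if (unlikely(err))
 *		goto drop;
 *	return dst_input(skb);
 */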
2198
2199 /* called with rcu_read_lock held */
2200 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2201                        u8 tos, struct net_device *dev, struct fib_result *res)
2202 {
2203         /* Multicast recognition logic was moved from the route cache
2204            to here. The problem was that too many Ethernet cards have
2205            broken/missing hardware multicast filters :-( As a result, a
2206            host on a multicast network acquires a lot of useless route
2207            cache entries, e.g. from SDR messages from all over the world.
2208            Now we try to get rid of them. Provided the software IP
2209            multicast filter is organized reasonably (at least, hashed),
2210            this is no slower than route cache reject entries.
2211            Note that multicast routers are not affected, because a
2212            route cache entry is created eventually.
2213          */
2214         if (ipv4_is_multicast(daddr)) {
2215                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2216                 int our = 0;
2217                 int err = -EINVAL;
2218
2219                 if (!in_dev)
2220                         return err;
2221                 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2222                                       ip_hdr(skb)->protocol);
2223
2224                 /* check l3 master if no match yet */
2225                 if (!our && netif_is_l3_slave(dev)) {
2226                         struct in_device *l3_in_dev;
2227
2228                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2229                         if (l3_in_dev)
2230                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2231                                                       ip_hdr(skb)->protocol);
2232                 }
2233
2234                 if (our
2235 #ifdef CONFIG_IP_MROUTE
2236                         ||
2237                     (!ipv4_is_local_multicast(daddr) &&
2238                      IN_DEV_MFORWARD(in_dev))
2239 #endif
2240                    ) {
2241                         err = ip_route_input_mc(skb, daddr, saddr,
2242                                                 tos, dev, our);
2243                 }
2244                 return err;
2245         }
2246
2247         return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2248 }
2249
2250 /* called with rcu_read_lock() */
2251 static struct rtable *__mkroute_output(const struct fib_result *res,
2252                                        const struct flowi4 *fl4, int orig_oif,
2253                                        struct net_device *dev_out,
2254                                        unsigned int flags)
2255 {
2256         struct fib_info *fi = res->fi;
2257         struct fib_nh_exception *fnhe;
2258         struct in_device *in_dev;
2259         u16 type = res->type;
2260         struct rtable *rth;
2261         bool do_cache;
2262
2263         in_dev = __in_dev_get_rcu(dev_out);
2264         if (!in_dev)
2265                 return ERR_PTR(-EINVAL);
2266
2267         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2268                 if (ipv4_is_loopback(fl4->saddr) &&
2269                     !(dev_out->flags & IFF_LOOPBACK) &&
2270                     !netif_is_l3_master(dev_out))
2271                         return ERR_PTR(-EINVAL);
2272
2273         if (ipv4_is_lbcast(fl4->daddr))
2274                 type = RTN_BROADCAST;
2275         else if (ipv4_is_multicast(fl4->daddr))
2276                 type = RTN_MULTICAST;
2277         else if (ipv4_is_zeronet(fl4->daddr))
2278                 return ERR_PTR(-EINVAL);
2279
2280         if (dev_out->flags & IFF_LOOPBACK)
2281                 flags |= RTCF_LOCAL;
2282
2283         do_cache = true;
2284         if (type == RTN_BROADCAST) {
2285                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2286                 fi = NULL;
2287         } else if (type == RTN_MULTICAST) {
2288                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2289                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2290                                      fl4->flowi4_proto))
2291                         flags &= ~RTCF_LOCAL;
2292                 else
2293                         do_cache = false;
2294                 /* If no multicast route exists, use the
2295                  * default one, but do not use a gateway in this case.
2296                  * Yes, it is a hack.
2297                  */
2298                 if (fi && res->prefixlen < 4)
2299                         fi = NULL;
2300         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2301                    (orig_oif != dev_out->ifindex)) {
2302                 /* For local routes that require a particular output interface
2303                  * we do not want to cache the result.  Caching the result
2304                  * causes incorrect behaviour when there are multiple source
2305                  * addresses on the interface, the end result being that if the
2306                  * intended recipient is waiting on that interface for the
2307                  * packet he won't receive it because it will be delivered on
2308                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2309                  * be set to the loopback interface as well.
2310                  */
2311                 do_cache = false;
2312         }
2313
2314         fnhe = NULL;
2315         do_cache &= fi != NULL;
2316         if (fi) {
2317                 struct rtable __rcu **prth;
2318                 struct fib_nh *nh = &FIB_RES_NH(*res);
2319
2320                 fnhe = find_exception(nh, fl4->daddr);
2321                 if (!do_cache)
2322                         goto add;
2323                 if (fnhe) {
2324                         prth = &fnhe->fnhe_rth_output;
2325                 } else {
2326                         if (unlikely(fl4->flowi4_flags &
2327                                      FLOWI_FLAG_KNOWN_NH &&
2328                                      !(nh->nh_gw &&
2329                                        nh->nh_scope == RT_SCOPE_LINK))) {
2330                                 do_cache = false;
2331                                 goto add;
2332                         }
2333                         prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2334                 }
2335                 rth = rcu_dereference(*prth);
2336                 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2337                         return rth;
2338         }
2339
2340 add:
2341         rth = rt_dst_alloc(dev_out, flags, type,
2342                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2343                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2344                            do_cache);
2345         if (!rth)
2346                 return ERR_PTR(-ENOBUFS);
2347
2348         rth->rt_iif = orig_oif;
2349
2350         RT_CACHE_STAT_INC(out_slow_tot);
2351
2352         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2353                 if (flags & RTCF_LOCAL &&
2354                     !(dev_out->flags & IFF_LOOPBACK)) {
2355                         rth->dst.output = ip_mc_output;
2356                         RT_CACHE_STAT_INC(out_slow_mc);
2357                 }
2358 #ifdef CONFIG_IP_MROUTE
2359                 if (type == RTN_MULTICAST) {
2360                         if (IN_DEV_MFORWARD(in_dev) &&
2361                             !ipv4_is_local_multicast(fl4->daddr)) {
2362                                 rth->dst.input = ip_mr_input;
2363                                 rth->dst.output = ip_mc_output;
2364                         }
2365                 }
2366 #endif
2367         }
2368
2369         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2370         lwtunnel_set_redirect(&rth->dst);
2371
2372         return rth;
2373 }
2374
2375 /*
2376  * Major route resolver routine.
2377  */
2378
2379 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2380                                         const struct sk_buff *skb)
2381 {
2382         __u8 tos = RT_FL_TOS(fl4);
2383         struct fib_result res = {
2384                 .type           = RTN_UNSPEC,
2385                 .fi             = NULL,
2386                 .table          = NULL,
2387                 .tclassid       = 0,
2388         };
2389         struct rtable *rth;
2390
2391         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2392         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2393         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2394                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2395
2396         rcu_read_lock();
2397         rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2398         rcu_read_unlock();
2399
2400         return rth;
2401 }
2402 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
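
/* Output lookup sketch (illustrative; most callers use the
 * ip_route_output_key() or ip_route_output_flow() wrappers):
 *
 *	struct flowi4 fl4 = {
 *		.daddr		= daddr,
 *		.flowi4_oif	= oif,
 *		.flowi4_tos	= RT_TOS(tos),
 *	};
 *	struct rtable *rt = ip_route_output_key_hash(net, &fl4, NULL);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 */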
2403
2404 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2405                                             struct fib_result *res,
2406                                             const struct sk_buff *skb)
2407 {
2408         struct net_device *dev_out = NULL;
2409         int orig_oif = fl4->flowi4_oif;
2410         unsigned int flags = 0;
2411         struct rtable *rth;
2412         int err;
2413
2414         if (fl4->saddr) {
2415                 if (ipv4_is_multicast(fl4->saddr) ||
2416                     ipv4_is_lbcast(fl4->saddr) ||
2417                     ipv4_is_zeronet(fl4->saddr)) {
2418                         rth = ERR_PTR(-EINVAL);
2419                         goto out;
2420                 }
2421
2422                 rth = ERR_PTR(-ENETUNREACH);
2423
2424                 /* I removed the check for oif == dev_out->oif here.
2425                    It was wrong for two reasons:
2426                    1. ip_dev_find(net, saddr) can return the wrong iface
2427                       if saddr is assigned to multiple interfaces.
2428                    2. Moreover, we are allowed to send packets with the
2429                       saddr of another iface. --ANK
2430                  */
2431
2432                 if (fl4->flowi4_oif == 0 &&
2433                     (ipv4_is_multicast(fl4->daddr) ||
2434                      ipv4_is_lbcast(fl4->daddr))) {
2435                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2436                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2437                         if (!dev_out)
2438                                 goto out;
2439
2440                         /* Special hack: the user can direct multicasts
2441                            and limited broadcast via the necessary interface
2442                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2443                            This hack is not just for fun, it allows
2444                            vic, vat and friends to work.
2445                            They bind a socket to loopback, set ttl to zero
2446                            and expect that it will work.
2447                            From the viewpoint of the routing cache they are
2448                            broken, because we are not allowed to build a
2449                            multicast path with a loopback source addr (the
2450                            routing cache cannot know that ttl is zero, so the
2451                            packet will not leave this host and the route is
2452                            valid). Luckily, this hack is a good workaround.
2453                          */
2454
2455                         fl4->flowi4_oif = dev_out->ifindex;
2456                         goto make_route;
2457                 }
2458
2459                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2460                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2461                         if (!__ip_dev_find(net, fl4->saddr, false))
2462                                 goto out;
2463                 }
2464         }
2465
2466
2467         if (fl4->flowi4_oif) {
2468                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2469                 rth = ERR_PTR(-ENODEV);
2470                 if (!dev_out)
2471                         goto out;
2472
2473                 /* RACE: Check return value of inet_select_addr instead. */
2474                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2475                         rth = ERR_PTR(-ENETUNREACH);
2476                         goto out;
2477                 }
2478                 if (ipv4_is_local_multicast(fl4->daddr) ||
2479                     ipv4_is_lbcast(fl4->daddr) ||
2480                     fl4->flowi4_proto == IPPROTO_IGMP) {
2481                         if (!fl4->saddr)
2482                                 fl4->saddr = inet_select_addr(dev_out, 0,
2483                                                               RT_SCOPE_LINK);
2484                         goto make_route;
2485                 }
2486                 if (!fl4->saddr) {
2487                         if (ipv4_is_multicast(fl4->daddr))
2488                                 fl4->saddr = inet_select_addr(dev_out, 0,
2489                                                               fl4->flowi4_scope);
2490                         else if (!fl4->daddr)
2491                                 fl4->saddr = inet_select_addr(dev_out, 0,
2492                                                               RT_SCOPE_HOST);
2493                 }
2494         }
2495
2496         if (!fl4->daddr) {
2497                 fl4->daddr = fl4->saddr;
2498                 if (!fl4->daddr)
2499                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2500                 dev_out = net->loopback_dev;
2501                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2502                 res->type = RTN_LOCAL;
2503                 flags |= RTCF_LOCAL;
2504                 goto make_route;
2505         }
2506
2507         err = fib_lookup(net, fl4, res, 0);
2508         if (err) {
2509                 res->fi = NULL;
2510                 res->table = NULL;
2511                 if (fl4->flowi4_oif &&
2512                     (ipv4_is_multicast(fl4->daddr) ||
2513                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2514                         /* Apparently, the routing tables are wrong. Assume
2515                            that the destination is on-link.
2516
2517                            WHY? DW.
2518                            Because we are allowed to send to an iface
2519                            even if it has NO routes and NO assigned
2520                            addresses. When oif is specified, the routing
2521                            tables are looked up with only one purpose:
2522                            to determine whether the destination is gatewayed
2523                            rather than direct. Moreover, if MSG_DONTROUTE is
2524                            set, we send the packet ignoring both the routing
2525                            tables and the ifaddr state. --ANK
2526
2527
2528                            We could do this even if oif is unknown,
2529                            as is likely with IPv6, but we do not.
2530                          */
2531
2532                         if (fl4->saddr == 0)
2533                                 fl4->saddr = inet_select_addr(dev_out, 0,
2534                                                               RT_SCOPE_LINK);
2535                         res->type = RTN_UNICAST;
2536                         goto make_route;
2537                 }
2538                 rth = ERR_PTR(err);
2539                 goto out;
2540         }
2541
2542         if (res->type == RTN_LOCAL) {
2543                 if (!fl4->saddr) {
2544                         if (res->fi->fib_prefsrc)
2545                                 fl4->saddr = res->fi->fib_prefsrc;
2546                         else
2547                                 fl4->saddr = fl4->daddr;
2548                 }
2549
2550                 /* L3 master device is the loopback for that domain */
2551                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2552                         net->loopback_dev;
2553
2554                 /* make sure orig_oif points to fib result device even
2555                  * though packet rx/tx happens over loopback or l3mdev
2556                  */
2557                 orig_oif = FIB_RES_OIF(*res);
2558
2559                 fl4->flowi4_oif = dev_out->ifindex;
2560                 flags |= RTCF_LOCAL;
2561                 goto make_route;
2562         }
2563
2564         fib_select_path(net, res, fl4, skb);
2565
2566         dev_out = FIB_RES_DEV(*res);
2567
2568 make_route:
2569         rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2570
2571 out:
2572         return rth;
2573 }
2574
2575 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2576 {
2577         return NULL;
2578 }
2579
2580 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2581 {
2582         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2583
2584         return mtu ? : dst->dev->mtu;
2585 }
2586
2587 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2588                                           struct sk_buff *skb, u32 mtu,
2589                                           bool confirm_neigh)
2590 {
2591 }
2592
2593 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2594                                        struct sk_buff *skb)
2595 {
2596 }
2597
2598 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2599                                           unsigned long old)
2600 {
2601         return NULL;
2602 }
2603
2604 static struct dst_ops ipv4_dst_blackhole_ops = {
2605         .family                 =       AF_INET,
2606         .check                  =       ipv4_blackhole_dst_check,
2607         .mtu                    =       ipv4_blackhole_mtu,
2608         .default_advmss         =       ipv4_default_advmss,
2609         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2610         .redirect               =       ipv4_rt_blackhole_redirect,
2611         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2612         .neigh_lookup           =       ipv4_neigh_lookup,
2613 };
2614
2615 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2616 {
2617         struct rtable *ort = (struct rtable *) dst_orig;
2618         struct rtable *rt;
2619
2620         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2621         if (rt) {
2622                 struct dst_entry *new = &rt->dst;
2623
2624                 new->__use = 1;
2625                 new->input = dst_discard;
2626                 new->output = dst_discard_out;
2627
2628                 new->dev = net->loopback_dev;
2629                 if (new->dev)
2630                         dev_hold(new->dev);
2631
2632                 rt->rt_is_input = ort->rt_is_input;
2633                 rt->rt_iif = ort->rt_iif;
2634                 rt->rt_pmtu = ort->rt_pmtu;
2635                 rt->rt_mtu_locked = ort->rt_mtu_locked;
2636
2637                 rt->rt_genid = rt_genid_ipv4(net);
2638                 rt->rt_flags = ort->rt_flags;
2639                 rt->rt_type = ort->rt_type;
2640                 rt->rt_gateway = ort->rt_gateway;
2641                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2642
2643                 INIT_LIST_HEAD(&rt->rt_uncached);
2644         }
2645
2646         dst_release(dst_orig);
2647
2648         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2649 }
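
/* The blackhole copy is used e.g. by the xfrm lookup path when it
 * cannot resolve a policy without blocking; traffic on the returned
 * dst is silently discarded. Assumed calling shape:
 *
 *	dst = ipv4_blackhole_route(net, dst_orig);
 *	if (IS_ERR(dst))
 *		return PTR_ERR(dst);
 */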
2650
2651 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2652                                     const struct sock *sk)
2653 {
2654         struct rtable *rt = __ip_route_output_key(net, flp4);
2655
2656         if (IS_ERR(rt))
2657                 return rt;
2658
2659         if (flp4->flowi4_proto) {
2660                 flp4->flowi4_oif = rt->dst.dev->ifindex;
2661                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2662                                                         flowi4_to_flowi(flp4),
2663                                                         sk, 0);
2664         }
2665
2666         return rt;
2667 }
2668 EXPORT_SYMBOL_GPL(ip_route_output_flow);
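
/* When flowi4_proto is set, the result is additionally passed through
 * xfrm_lookup_route(); a connect()-style caller sketch (field names
 * assumed, after the shape of tcp_v4_connect()):
 *
 *	fl4.flowi4_proto = IPPROTO_TCP;
 *	fl4.fl4_sport = inet->inet_sport;
 *	fl4.fl4_dport = usin->sin_port;
 *	rt = ip_route_output_flow(net, &fl4, sk);
 */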
2669
2670 /* called with rcu_read_lock held */
2671 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2672                         struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2673                         struct sk_buff *skb, u32 portid, u32 seq)
2674 {
2675         struct rtmsg *r;
2676         struct nlmsghdr *nlh;
2677         unsigned long expires = 0;
2678         u32 error;
2679         u32 metrics[RTAX_MAX];
2680
2681         nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2682         if (!nlh)
2683                 return -EMSGSIZE;
2684
2685         r = nlmsg_data(nlh);
2686         r->rtm_family    = AF_INET;
2687         r->rtm_dst_len  = 32;
2688         r->rtm_src_len  = 0;
2689         r->rtm_tos      = fl4->flowi4_tos;
2690         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2691         if (nla_put_u32(skb, RTA_TABLE, table_id))
2692                 goto nla_put_failure;
2693         r->rtm_type     = rt->rt_type;
2694         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2695         r->rtm_protocol = RTPROT_UNSPEC;
2696         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2697         if (rt->rt_flags & RTCF_NOTIFY)
2698                 r->rtm_flags |= RTM_F_NOTIFY;
2699         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2700                 r->rtm_flags |= RTCF_DOREDIRECT;
2701
2702         if (nla_put_in_addr(skb, RTA_DST, dst))
2703                 goto nla_put_failure;
2704         if (src) {
2705                 r->rtm_src_len = 32;
2706                 if (nla_put_in_addr(skb, RTA_SRC, src))
2707                         goto nla_put_failure;
2708         }
2709         if (rt->dst.dev &&
2710             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2711                 goto nla_put_failure;
2712 #ifdef CONFIG_IP_ROUTE_CLASSID
2713         if (rt->dst.tclassid &&
2714             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2715                 goto nla_put_failure;
2716 #endif
2717         if (!rt_is_input_route(rt) &&
2718             fl4->saddr != src) {
2719                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2720                         goto nla_put_failure;
2721         }
2722         if (rt->rt_uses_gateway &&
2723             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2724                 goto nla_put_failure;
2725
2726         expires = rt->dst.expires;
2727         if (expires) {
2728                 unsigned long now = jiffies;
2729
2730                 if (time_before(now, expires))
2731                         expires -= now;
2732                 else
2733                         expires = 0;
2734         }
2735
2736         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2737         if (rt->rt_pmtu && expires)
2738                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2739         if (rt->rt_mtu_locked && expires)
2740                 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2741         if (rtnetlink_put_metrics(skb, metrics) < 0)
2742                 goto nla_put_failure;
2743
2744         if (fl4->flowi4_mark &&
2745             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2746                 goto nla_put_failure;
2747
2748         if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2749             nla_put_u32(skb, RTA_UID,
2750                         from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2751                 goto nla_put_failure;
2752
2753         error = rt->dst.error;
2754
2755         if (rt_is_input_route(rt)) {
2756 #ifdef CONFIG_IP_MROUTE
2757                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2758                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2759                         int err = ipmr_get_route(net, skb,
2760                                                  fl4->saddr, fl4->daddr,
2761                                                  r, portid);
2762
2763                         if (err <= 0) {
2764                                 if (err == 0)
2765                                         return 0;
2766                                 goto nla_put_failure;
2767                         }
2768                 } else
2769 #endif
2770                         if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2771                                 goto nla_put_failure;
2772         }
2773
2774         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2775                 goto nla_put_failure;
2776
2777         nlmsg_end(skb, nlh);
2778         return 0;
2779
2780 nla_put_failure:
2781         nlmsg_cancel(skb, nlh);
2782         return -EMSGSIZE;
2783 }
2784
2785 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
2786                                                    u8 ip_proto, __be16 sport,
2787                                                    __be16 dport)
2788 {
2789         struct sk_buff *skb;
2790         struct iphdr *iph;
2791
2792         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2793         if (!skb)
2794                 return NULL;
2795
2796         /* Reserve room for dummy headers; this skb can pass
2797          * through a good chunk of the routing engine.
2798          */
2799         skb_reset_mac_header(skb);
2800         skb_reset_network_header(skb);
2801         skb->protocol = htons(ETH_P_IP);
2802         iph = skb_put(skb, sizeof(struct iphdr));
2803         iph->protocol = ip_proto;
2804         iph->saddr = src;
2805         iph->daddr = dst;
2806         iph->version = 0x4;
2807         iph->frag_off = 0;
2808         iph->ihl = 0x5;
2809         skb_set_transport_header(skb, skb->len);
2810
2811         switch (iph->protocol) {
2812         case IPPROTO_UDP: {
2813                 struct udphdr *udph;
2814
2815                 udph = skb_put_zero(skb, sizeof(struct udphdr));
2816                 udph->source = sport;
2817                 udph->dest = dport;
2818                 udph->len = htons(sizeof(struct udphdr));
2819                 udph->check = 0;
2820                 break;
2821         }
2822         case IPPROTO_TCP: {
2823                 struct tcphdr *tcph;
2824
2825                 tcph = skb_put_zero(skb, sizeof(struct tcphdr));
2826                 tcph->source    = sport;
2827                 tcph->dest      = dport;
2828                 tcph->doff      = sizeof(struct tcphdr) / 4;
2829                 tcph->rst = 1;
2830                 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
2831                                             src, dst, 0);
2832                 break;
2833         }
2834         case IPPROTO_ICMP: {
2835                 struct icmphdr *icmph;
2836
2837                 icmph = skb_put_zero(skb, sizeof(struct icmphdr));
2838                 icmph->type = ICMP_ECHO;
2839                 icmph->code = 0;
2840         }
2841         }
2842
2843         return skb;
2844 }
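
/* The dummy skb built above backs RTM_GETROUTE requests, e.g. from
 * iproute2 (addresses illustrative):
 *
 *	ip route get 8.8.8.8 from 192.0.2.1 iif eth0
 *
 * The fabricated L3/L4 headers let the request traverse the same
 * input/output paths as real traffic would.
 */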
2845
2846 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2847                              struct netlink_ext_ack *extack)
2848 {
2849         struct net *net = sock_net(in_skb->sk);
2850         struct nlattr *tb[RTA_MAX+1];
2851         u32 table_id = RT_TABLE_MAIN;
2852         __be16 sport = 0, dport = 0;
2853         struct fib_result res = {};
2854         u8 ip_proto = IPPROTO_UDP;
2855         struct rtable *rt = NULL;
2856         struct sk_buff *skb;
2857         struct rtmsg *rtm;
2858         struct flowi4 fl4;
2859         __be32 dst = 0;
2860         __be32 src = 0;
2861         kuid_t uid;
2862         u32 iif;
2863         int err;
2864         int mark;
2865
2866         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2867                           extack);
2868         if (err < 0)
2869                 return err;
2870
2871         rtm = nlmsg_data(nlh);
2872         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2873         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2874         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2875         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2876         if (tb[RTA_UID])
2877                 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2878         else
2879                 uid = (iif ? INVALID_UID : current_uid());
2880
2881         if (tb[RTA_IP_PROTO]) {
2882                 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
2883                                                   &ip_proto, AF_INET, extack);
2884                 if (err)
2885                         return err;
2886         }
2887
2888         if (tb[RTA_SPORT])
2889                 sport = nla_get_be16(tb[RTA_SPORT]);
2890
2891         if (tb[RTA_DPORT])
2892                 dport = nla_get_be16(tb[RTA_DPORT]);
2893
2894         skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
2895         if (!skb)
2896                 return -ENOBUFS;
2897
2898         memset(&fl4, 0, sizeof(fl4));
2899         fl4.daddr = dst;
2900         fl4.saddr = src;
2901         fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
2902         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2903         fl4.flowi4_mark = mark;
2904         fl4.flowi4_uid = uid;
2905         if (sport)
2906                 fl4.fl4_sport = sport;
2907         if (dport)
2908                 fl4.fl4_dport = dport;
2909         fl4.flowi4_proto = ip_proto;
2910
2911         rcu_read_lock();
2912
2913         if (iif) {
2914                 struct net_device *dev;
2915
2916                 dev = dev_get_by_index_rcu(net, iif);
2917                 if (!dev) {
2918                         err = -ENODEV;
2919                         goto errout_rcu;
2920                 }
2921
2922                 fl4.flowi4_iif = iif; /* for rt_fill_info */
2923                 skb->dev        = dev;
2924                 skb->mark       = mark;
2925                 err = ip_route_input_rcu(skb, dst, src,
2926                                          rtm->rtm_tos & IPTOS_RT_MASK, dev,
2927                                          &res);
2928
2929                 rt = skb_rtable(skb);
2930                 if (err == 0 && rt->dst.error)
2931                         err = -rt->dst.error;
2932         } else {
2933                 fl4.flowi4_iif = LOOPBACK_IFINDEX;
2934                 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
2935                 err = 0;
2936                 if (IS_ERR(rt))
2937                         err = PTR_ERR(rt);
2938                 else
2939                         skb_dst_set(skb, &rt->dst);
2940         }
2941
2942         if (err)
2943                 goto errout_rcu;
2944
2945         if (rtm->rtm_flags & RTM_F_NOTIFY)
2946                 rt->rt_flags |= RTCF_NOTIFY;
2947
2948         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2949                 table_id = res.table ? res.table->tb_id : 0;
2950
2951         /* Reset the skb so it can be reused as the netlink reply */
2952         skb_trim(skb, 0);
2953         skb_reset_network_header(skb);
2954         skb_reset_transport_header(skb);
2955         skb_reset_mac_header(skb);
2956
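             /* RTM_F_FIB_MATCH requests the FIB entry that matched
              * rather than the fully resolved route.  Illustrative
              * iproute2 usage:
              *
              *     ip route get 203.0.113.1 fibmatch
              */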
2957         if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2958                 if (!res.fi) {
2959                         err = fib_props[res.type].error;
2960                         if (!err)
2961                                 err = -EHOSTUNREACH;
2962                         goto errout_rcu;
2963                 }
2964                 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2965                                     nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2966                                     rt->rt_type, res.prefix, res.prefixlen,
2967                                     fl4.flowi4_tos, res.fi, 0);
2968         } else {
2969                 err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
2970                                    NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
2971         }
2972         if (err < 0)
2973                 goto errout_rcu;
2974
2975         rcu_read_unlock();
2976
2977         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2978
2979 errout_free:
2980         return err;
2981 errout_rcu:
2982         rcu_read_unlock();
2983         kfree_skb(skb);
2984         goto errout_free;
2985 }
2986
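     /* Multicast configuration changes can invalidate cached routing
      * decisions, so flush the cache: rt_cache_flush() bumps rt_genid,
      * which lazily invalidates every cached dst in the namespace.
      */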
2987 void ip_rt_multicast_event(struct in_device *in_dev)
2988 {
2989         rt_cache_flush(dev_net(in_dev->dev));
2990 }
2991
2992 #ifdef CONFIG_SYSCTL
2993 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2994 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2995 static int ip_rt_gc_elasticity __read_mostly    = 8;
2996 static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;
2997
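     /* Write-only handler behind /proc/sys/net/ipv4/route/flush.  Any
      * write invalidates the namespace's cached routes and next-hop
      * exceptions; reads fail with -EINVAL.  Illustrative usage:
      *
      *     sysctl -w net.ipv4.route.flush=1
      */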
2998 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2999                                         void __user *buffer,
3000                                         size_t *lenp, loff_t *ppos)
3001 {
3002         struct net *net = (struct net *)__ctl->extra1;
3003
3004         if (write) {
3005                 rt_cache_flush(net);
3006                 fnhe_genid_bump(net);
3007                 return 0;
3008         }
3009
3010         return -EINVAL;
3011 }
3012
3013 static struct ctl_table ipv4_route_table[] = {
3014         {
3015                 .procname       = "gc_thresh",
3016                 .data           = &ipv4_dst_ops.gc_thresh,
3017                 .maxlen         = sizeof(int),
3018                 .mode           = 0644,
3019                 .proc_handler   = proc_dointvec,
3020         },
3021         {
3022                 .procname       = "max_size",
3023                 .data           = &ip_rt_max_size,
3024                 .maxlen         = sizeof(int),
3025                 .mode           = 0644,
3026                 .proc_handler   = proc_dointvec,
3027         },
3028         {
3029                 /* Deprecated. Use gc_min_interval_ms */
3031                 .procname       = "gc_min_interval",
3032                 .data           = &ip_rt_gc_min_interval,
3033                 .maxlen         = sizeof(int),
3034                 .mode           = 0644,
3035                 .proc_handler   = proc_dointvec_jiffies,
3036         },
3037         {
3038                 .procname       = "gc_min_interval_ms",
3039                 .data           = &ip_rt_gc_min_interval,
3040                 .maxlen         = sizeof(int),
3041                 .mode           = 0644,
3042                 .proc_handler   = proc_dointvec_ms_jiffies,
3043         },
3044         {
3045                 .procname       = "gc_timeout",
3046                 .data           = &ip_rt_gc_timeout,
3047                 .maxlen         = sizeof(int),
3048                 .mode           = 0644,
3049                 .proc_handler   = proc_dointvec_jiffies,
3050         },
3051         {
3052                 .procname       = "gc_interval",
3053                 .data           = &ip_rt_gc_interval,
3054                 .maxlen         = sizeof(int),
3055                 .mode           = 0644,
3056                 .proc_handler   = proc_dointvec_jiffies,
3057         },
3058         {
3059                 .procname       = "redirect_load",
3060                 .data           = &ip_rt_redirect_load,
3061                 .maxlen         = sizeof(int),
3062                 .mode           = 0644,
3063                 .proc_handler   = proc_dointvec,
3064         },
3065         {
3066                 .procname       = "redirect_number",
3067                 .data           = &ip_rt_redirect_number,
3068                 .maxlen         = sizeof(int),
3069                 .mode           = 0644,
3070                 .proc_handler   = proc_dointvec,
3071         },
3072         {
3073                 .procname       = "redirect_silence",
3074                 .data           = &ip_rt_redirect_silence,
3075                 .maxlen         = sizeof(int),
3076                 .mode           = 0644,
3077                 .proc_handler   = proc_dointvec,
3078         },
3079         {
3080                 .procname       = "error_cost",
3081                 .data           = &ip_rt_error_cost,
3082                 .maxlen         = sizeof(int),
3083                 .mode           = 0644,
3084                 .proc_handler   = proc_dointvec,
3085         },
3086         {
3087                 .procname       = "error_burst",
3088                 .data           = &ip_rt_error_burst,
3089                 .maxlen         = sizeof(int),
3090                 .mode           = 0644,
3091                 .proc_handler   = proc_dointvec,
3092         },
3093         {
3094                 .procname       = "gc_elasticity",
3095                 .data           = &ip_rt_gc_elasticity,
3096                 .maxlen         = sizeof(int),
3097                 .mode           = 0644,
3098                 .proc_handler   = proc_dointvec,
3099         },
3100         {
3101                 .procname       = "mtu_expires",
3102                 .data           = &ip_rt_mtu_expires,
3103                 .maxlen         = sizeof(int),
3104                 .mode           = 0644,
3105                 .proc_handler   = proc_dointvec_jiffies,
3106         },
3107         {
3108                 .procname       = "min_pmtu",
3109                 .data           = &ip_rt_min_pmtu,
3110                 .maxlen         = sizeof(int),
3111                 .mode           = 0644,
3112                 .proc_handler   = proc_dointvec_minmax,
3113                 .extra1         = &ip_min_valid_pmtu,
3114         },
3115         {
3116                 .procname       = "min_adv_mss",
3117                 .data           = &ip_rt_min_advmss,
3118                 .maxlen         = sizeof(int),
3119                 .mode           = 0644,
3120                 .proc_handler   = proc_dointvec,
3121         },
3122         { }
3123 };
3124
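     /* "flush" sits in its own table because it is instantiated per
      * network namespace (see sysctl_route_net_init() below), while the
      * knobs above are global and registered once for init_net.
      */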
3125 static struct ctl_table ipv4_route_flush_table[] = {
3126         {
3127                 .procname       = "flush",
3128                 .maxlen         = sizeof(int),
3129                 .mode           = 0200,
3130                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3131         },
3132         { }
3133 };
3134
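     /* Per-netns registration of the "flush" sysctl.  The template is
      * kmemdup()ed for non-initial namespaces so each copy can carry
      * its own struct net in ->extra1, and the entry is hidden from
      * namespaces not owned by init_user_ns.
      */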
3135 static __net_init int sysctl_route_net_init(struct net *net)
3136 {
3137         struct ctl_table *tbl;
3138
3139         tbl = ipv4_route_flush_table;
3140         if (!net_eq(net, &init_net)) {
3141                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3142                 if (!tbl)
3143                         goto err_dup;
3144
3145                 /* Don't export sysctls to unprivileged users */
3146                 if (net->user_ns != &init_user_ns)
3147                         tbl[0].procname = NULL;
3148         }
3149         tbl[0].extra1 = net;
3150
3151         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3152         if (!net->ipv4.route_hdr)
3153                 goto err_reg;
3154         return 0;
3155
3156 err_reg:
3157         if (tbl != ipv4_route_flush_table)
3158                 kfree(tbl);
3159 err_dup:
3160         return -ENOMEM;
3161 }
3162
3163 static __net_exit void sysctl_route_net_exit(struct net *net)
3164 {
3165         struct ctl_table *tbl;
3166
3167         tbl = net->ipv4.route_hdr->ctl_table_arg;
3168         unregister_net_sysctl_table(net->ipv4.route_hdr);
3169         BUG_ON(tbl == ipv4_route_flush_table);
3170         kfree(tbl);
3171 }
3172
3173 static __net_initdata struct pernet_operations sysctl_route_ops = {
3174         .init = sysctl_route_net_init,
3175         .exit = sysctl_route_net_exit,
3176 };
3177 #endif
3178
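     /* Seed the per-netns generation counters.  Bumping rt_genid or
      * fnhe_genid later invalidates all cached dsts or next-hop
      * exceptions in one step, without walking them.
      */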
3179 static __net_init int rt_genid_init(struct net *net)
3180 {
3181         atomic_set(&net->ipv4.rt_genid, 0);
3182         atomic_set(&net->fnhe_genid, 0);
3183         atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3184         return 0;
3185 }
3186
3187 static __net_initdata struct pernet_operations rt_genid_ops = {
3188         .init = rt_genid_init,
3189 };
3190
3191 static int __net_init ipv4_inetpeer_init(struct net *net)
3192 {
3193         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3194
3195         if (!bp)
3196                 return -ENOMEM;
3197         inet_peer_base_init(bp);
3198         net->ipv4.peers = bp;
3199         return 0;
3200 }
3201
3202 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3203 {
3204         struct inet_peer_base *bp = net->ipv4.peers;
3205
3206         net->ipv4.peers = NULL;
3207         inetpeer_invalidate_tree(bp);
3208         kfree(bp);
3209 }
3210
3211 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3212         .init   =       ipv4_inetpeer_init,
3213         .exit   =       ipv4_inetpeer_exit,
3214 };
3215
3216 #ifdef CONFIG_IP_ROUTE_CLASSID
3217 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3218 #endif /* CONFIG_IP_ROUTE_CLASSID */
3219
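     /* One-time boot initialisation of the IPv4 routing layer: the IP
      * ID hash, per-cpu uncached-dst lists, dst slab caches, devinet
      * and FIB setup, /proc and sysctl entries, and the RTM_GETROUTE
      * handler.
      */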
3220 int __init ip_rt_init(void)
3221 {
3222         void *idents_hash;
3223         int cpu;
3224
3225         /* For modern hosts, this will use 2 MB of memory */
3226         idents_hash = alloc_large_system_hash("IP idents",
3227                                               sizeof(*ip_idents) + sizeof(*ip_tstamps),
3228                                               0,
3229                                               16, /* one bucket per 64 KB */
3230                                               HASH_ZERO,
3231                                               NULL,
3232                                               &ip_idents_mask,
3233                                               2048,
3234                                               256*1024);
3235
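             /* idents_hash is one allocation holding both arrays back to
              * back: (ip_idents_mask + 1) IP ID counters first, followed
              * by the matching timestamp slots used for reuse decisions.
              */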
3236         ip_idents = idents_hash;
3237
3238         prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
3239
3240         ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
3241
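             /* Routes that could not be cached in their nexthop are
              * linked on these per-cpu lists so rt_flush_dev() can find
              * them and detach them from a device being unregistered.
              */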
3242         for_each_possible_cpu(cpu) {
3243                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3244
3245                 INIT_LIST_HEAD(&ul->head);
3246                 spin_lock_init(&ul->lock);
3247         }
3248 #ifdef CONFIG_IP_ROUTE_CLASSID
3249         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3250         if (!ip_rt_acct)
3251                 panic("IP: failed to allocate ip_rt_acct\n");
3252 #endif
3253
3254         ipv4_dst_ops.kmem_cachep =
3255                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3256                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3257
3258         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3259
3260         if (dst_entries_init(&ipv4_dst_ops) < 0)
3261                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3262
3263         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3264                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3265
3266         ipv4_dst_ops.gc_thresh = ~0;
3267         ip_rt_max_size = INT_MAX;
3268
3269         devinet_init();
3270         ip_fib_init();
3271
3272         if (ip_rt_proc_init())
3273                 pr_err("Unable to create route proc files\n");
3274 #ifdef CONFIG_XFRM
3275         xfrm_init();
3276         xfrm4_init();
3277 #endif
3278         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3279                       RTNL_FLAG_DOIT_UNLOCKED);
3280
3281 #ifdef CONFIG_SYSCTL
3282         register_pernet_subsys(&sysctl_route_ops);
3283 #endif
3284         register_pernet_subsys(&rt_genid_ops);
3285         register_pernet_subsys(&ipv4_inetpeer_ops);
3286         return 0;
3287 }
3288
3289 #ifdef CONFIG_SYSCTL
3290 /*
3291  * We really need to sanitize the damn ipv4 init order, then all
3292  * this nonsense will go away.
3293  */
3294 void __init ip_static_sysctl_init(void)
3295 {
3296         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3297 }
3298 #endif