GNU Linux-libre 4.19.245-gnu1
net/ipv4/route.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              ROUTE - implementation of the IP router.
7  *
8  * Authors:     Ross Biro
9  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
11  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13  *
14  * Fixes:
15  *              Alan Cox        :       Verify area fixes.
16  *              Alan Cox        :       cli() protects routing changes
17  *              Rui Oliveira    :       ICMP routing table updates
18  *              (rco@di.uminho.pt)      Routing table insertion and update
19  *              Linus Torvalds  :       Rewrote bits to be sensible
20  *              Alan Cox        :       Added BSD route gw semantics
21  *              Alan Cox        :       Super /proc >4K
22  *              Alan Cox        :       MTU in route table
23  *              Alan Cox        :       MSS actually. Also added the window
24  *                                      clamper.
25  *              Sam Lantinga    :       Fixed route matching in rt_del()
26  *              Alan Cox        :       Routing cache support.
27  *              Alan Cox        :       Removed compatibility cruft.
28  *              Alan Cox        :       RTF_REJECT support.
29  *              Alan Cox        :       TCP irtt support.
30  *              Jonathan Naylor :       Added Metric support.
31  *      Miquel van Smoorenburg  :       BSD API fixes.
32  *      Miquel van Smoorenburg  :       Metrics.
33  *              Alan Cox        :       Use __u32 properly
34  *              Alan Cox        :       Aligned routing errors more closely with BSD;
35  *                                      our system is still very different.
36  *              Alan Cox        :       Faster /proc handling
37  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
38  *                                      routing caches and better behaviour.
39  *
40  *              Olaf Erb        :       irtt wasn't being copied right.
41  *              Bjorn Ekwall    :       Kerneld route support.
42  *              Alan Cox        :       Multicast fixed (I hope)
43  *              Pavel Krauz     :       Limited broadcast fixed
44  *              Mike McLagan    :       Routing by source
45  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
46  *                                      route.c and rewritten from scratch.
47  *              Andi Kleen      :       Load-limit warning messages.
48  *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
49  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
50  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
51  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
52  *              Marc Boucher    :       routing by fwmark
53  *      Robert Olsson           :       Added rt_cache statistics
54  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
55  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
56  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
57  *      Ilia Sotnikov           :       Removed TOS from hash calculations
58  *
59  *              This program is free software; you can redistribute it and/or
60  *              modify it under the terms of the GNU General Public License
61  *              as published by the Free Software Foundation; either version
62  *              2 of the License, or (at your option) any later version.
63  */
64
65 #define pr_fmt(fmt) "IPv4: " fmt
66
67 #include <linux/module.h>
68 #include <linux/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
72 #include <linux/mm.h>
73 #include <linux/bootmem.h>
74 #include <linux/string.h>
75 #include <linux/socket.h>
76 #include <linux/sockios.h>
77 #include <linux/errno.h>
78 #include <linux/in.h>
79 #include <linux/inet.h>
80 #include <linux/netdevice.h>
81 #include <linux/proc_fs.h>
82 #include <linux/init.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/rcupdate.h>
91 #include <linux/times.h>
92 #include <linux/slab.h>
93 #include <linux/jhash.h>
94 #include <net/dst.h>
95 #include <net/dst_metadata.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
98 #include <net/ip.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
103 #include <net/arp.h>
104 #include <net/tcp.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/lwtunnel.h>
108 #include <net/netevent.h>
109 #include <net/rtnetlink.h>
110 #ifdef CONFIG_SYSCTL
111 #include <linux/sysctl.h>
112 #endif
113 #include <net/secure_seq.h>
114 #include <net/ip_tunnels.h>
115 #include <net/l3mdev.h>
116
117 #include "fib_lookup.h"
118
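/* RT_FL_TOS() below reduces a flow's TOS byte to the bits that actually
 * influence route lookup: the IPTOS_RT_MASK portion of the TOS field plus
 * the RTO_ONLINK flag that callers may fold into flowi4_tos. (Illustrative
 * note, assuming the 4.19 definitions in include/net/route.h and
 * include/uapi/linux/ip.h.)
 */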
119 #define RT_FL_TOS(oldflp4) \
120         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
121
122 #define RT_GC_TIMEOUT (300*HZ)
123
124 static int ip_rt_max_size;
125 static int ip_rt_redirect_number __read_mostly  = 9;
126 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
127 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
128 static int ip_rt_error_cost __read_mostly       = HZ;
129 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
130 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
131 static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
132 static int ip_rt_min_advmss __read_mostly       = 256;
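/* These knobs are exposed through the ipv4_route_table sysctls under
 * /proc/sys/net/ipv4/route/ (redirect_number, min_pmtu, mtu_expires, ...).
 * The 512 + 20 + 20 above spells out ip_rt_min_pmtu: 512 bytes of payload
 * plus a 20-byte IP header and a 20-byte TCP header, i.e. 552 bytes.
 * (Sysctl names assume the standard 4.19 table, which lies outside this
 * excerpt.)
 */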
133
134 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
135
136 /*
137  *      Interface to generic destination cache.
138  */
139
140 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
141 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
142 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
143 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
144 static void              ipv4_link_failure(struct sk_buff *skb);
145 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
146                                            struct sk_buff *skb, u32 mtu,
147                                            bool confirm_neigh);
148 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
149                                         struct sk_buff *skb);
150 static void             ipv4_dst_destroy(struct dst_entry *dst);
151
152 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
153 {
154         WARN_ON(1);
155         return NULL;
156 }
157
158 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
159                                            struct sk_buff *skb,
160                                            const void *daddr);
161 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
162
163 static struct dst_ops ipv4_dst_ops = {
164         .family =               AF_INET,
165         .check =                ipv4_dst_check,
166         .default_advmss =       ipv4_default_advmss,
167         .mtu =                  ipv4_mtu,
168         .cow_metrics =          ipv4_cow_metrics,
169         .destroy =              ipv4_dst_destroy,
170         .negative_advice =      ipv4_negative_advice,
171         .link_failure =         ipv4_link_failure,
172         .update_pmtu =          ip_rt_update_pmtu,
173         .redirect =             ip_do_redirect,
174         .local_out =            __ip_local_out,
175         .neigh_lookup =         ipv4_neigh_lookup,
176         .confirm_neigh =        ipv4_confirm_neigh,
177 };
178
179 #define ECN_OR_COST(class)      TC_PRIO_##class
180
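/* ip_tos2prio[] below maps the 4-bit IPv4 TOS field to a pfifo_fast-style
 * priority band; rt_tos2priority() in include/net/route.h indexes it as
 * ip_tos2prio[IPTOS_TOS(tos) >> 1]. Because ECN_OR_COST(class) simply
 * expands to TC_PRIO_##class here, each odd entry mirrors the even entry
 * before it. (Indexing detail assumes the 4.19 helper; verify against
 * include/net/route.h.)
 */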
181 const __u8 ip_tos2prio[16] = {
182         TC_PRIO_BESTEFFORT,
183         ECN_OR_COST(BESTEFFORT),
184         TC_PRIO_BESTEFFORT,
185         ECN_OR_COST(BESTEFFORT),
186         TC_PRIO_BULK,
187         ECN_OR_COST(BULK),
188         TC_PRIO_BULK,
189         ECN_OR_COST(BULK),
190         TC_PRIO_INTERACTIVE,
191         ECN_OR_COST(INTERACTIVE),
192         TC_PRIO_INTERACTIVE,
193         ECN_OR_COST(INTERACTIVE),
194         TC_PRIO_INTERACTIVE_BULK,
195         ECN_OR_COST(INTERACTIVE_BULK),
196         TC_PRIO_INTERACTIVE_BULK,
197         ECN_OR_COST(INTERACTIVE_BULK)
198 };
199 EXPORT_SYMBOL(ip_tos2prio);
200
201 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
202 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
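/* The per-cpu counters above are bumped with raw_cpu_inc(), i.e. without
 * preemption protection; an occasional lost increment is acceptable for
 * statistics. They are read back per CPU by rt_cpu_seq_show() below.
 */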
203
204 #ifdef CONFIG_PROC_FS
205 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
206 {
207         if (*pos)
208                 return NULL;
209         return SEQ_START_TOKEN;
210 }
211
212 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
213 {
214         ++*pos;
215         return NULL;
216 }
217
218 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
219 {
220 }
221
222 static int rt_cache_seq_show(struct seq_file *seq, void *v)
223 {
224         if (v == SEQ_START_TOKEN)
225                 seq_printf(seq, "%-127s\n",
226                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
227                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
228                            "HHUptod\tSpecDst");
229         return 0;
230 }
231
232 static const struct seq_operations rt_cache_seq_ops = {
233         .start  = rt_cache_seq_start,
234         .next   = rt_cache_seq_next,
235         .stop   = rt_cache_seq_stop,
236         .show   = rt_cache_seq_show,
237 };
238
239 static int rt_cache_seq_open(struct inode *inode, struct file *file)
240 {
241         return seq_open(file, &rt_cache_seq_ops);
242 }
243
244 static const struct file_operations rt_cache_seq_fops = {
245         .open    = rt_cache_seq_open,
246         .read    = seq_read,
247         .llseek  = seq_lseek,
248         .release = seq_release,
249 };
250
251
252 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
253 {
254         int cpu;
255
256         if (*pos == 0)
257                 return SEQ_START_TOKEN;
258
259         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
260                 if (!cpu_possible(cpu))
261                         continue;
262                 *pos = cpu+1;
263                 return &per_cpu(rt_cache_stat, cpu);
264         }
265         return NULL;
266 }
267
268 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
269 {
270         int cpu;
271
272         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
273                 if (!cpu_possible(cpu))
274                         continue;
275                 *pos = cpu+1;
276                 return &per_cpu(rt_cache_stat, cpu);
277         }
278         (*pos)++;
279         return NULL;
280
281 }
282
283 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
284 {
285
286 }
287
288 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
289 {
290         struct rt_cache_stat *st = v;
291
292         if (v == SEQ_START_TOKEN) {
293                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
294                 return 0;
295         }
296
297         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
298                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
299                    dst_entries_get_slow(&ipv4_dst_ops),
300                    0, /* st->in_hit */
301                    st->in_slow_tot,
302                    st->in_slow_mc,
303                    st->in_no_route,
304                    st->in_brd,
305                    st->in_martian_dst,
306                    st->in_martian_src,
307
308                    0, /* st->out_hit */
309                    st->out_slow_tot,
310                    st->out_slow_mc,
311
312                    0, /* st->gc_total */
313                    0, /* st->gc_ignored */
314                    0, /* st->gc_goal_miss */
315                    0, /* st->gc_dst_overflow */
316                    0, /* st->in_hlist_search */
317                    0  /* st->out_hlist_search */
318                 );
319         return 0;
320 }
321
322 static const struct seq_operations rt_cpu_seq_ops = {
323         .start  = rt_cpu_seq_start,
324         .next   = rt_cpu_seq_next,
325         .stop   = rt_cpu_seq_stop,
326         .show   = rt_cpu_seq_show,
327 };
328
329
330 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
331 {
332         return seq_open(file, &rt_cpu_seq_ops);
333 }
334
335 static const struct file_operations rt_cpu_seq_fops = {
336         .open    = rt_cpu_seq_open,
337         .read    = seq_read,
338         .llseek  = seq_lseek,
339         .release = seq_release,
340 };
341
342 #ifdef CONFIG_IP_ROUTE_CLASSID
343 static int rt_acct_proc_show(struct seq_file *m, void *v)
344 {
345         struct ip_rt_acct *dst, *src;
346         unsigned int i, j;
347
348         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
349         if (!dst)
350                 return -ENOMEM;
351
352         for_each_possible_cpu(i) {
353                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
354                 for (j = 0; j < 256; j++) {
355                         dst[j].o_bytes   += src[j].o_bytes;
356                         dst[j].o_packets += src[j].o_packets;
357                         dst[j].i_bytes   += src[j].i_bytes;
358                         dst[j].i_packets += src[j].i_packets;
359                 }
360         }
361
362         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
363         kfree(dst);
364         return 0;
365 }
366 #endif
367
368 static int __net_init ip_rt_do_proc_init(struct net *net)
369 {
370         struct proc_dir_entry *pde;
371
372         pde = proc_create("rt_cache", 0444, net->proc_net,
373                           &rt_cache_seq_fops);
374         if (!pde)
375                 goto err1;
376
377         pde = proc_create("rt_cache", 0444,
378                           net->proc_net_stat, &rt_cpu_seq_fops);
379         if (!pde)
380                 goto err2;
381
382 #ifdef CONFIG_IP_ROUTE_CLASSID
383         pde = proc_create_single("rt_acct", 0, net->proc_net,
384                         rt_acct_proc_show);
385         if (!pde)
386                 goto err3;
387 #endif
388         return 0;
389
390 #ifdef CONFIG_IP_ROUTE_CLASSID
391 err3:
392         remove_proc_entry("rt_cache", net->proc_net_stat);
393 #endif
394 err2:
395         remove_proc_entry("rt_cache", net->proc_net);
396 err1:
397         return -ENOMEM;
398 }
399
400 static void __net_exit ip_rt_do_proc_exit(struct net *net)
401 {
402         remove_proc_entry("rt_cache", net->proc_net_stat);
403         remove_proc_entry("rt_cache", net->proc_net);
404 #ifdef CONFIG_IP_ROUTE_CLASSID
405         remove_proc_entry("rt_acct", net->proc_net);
406 #endif
407 }
408
409 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
410         .init = ip_rt_do_proc_init,
411         .exit = ip_rt_do_proc_exit,
412 };
413
414 static int __init ip_rt_proc_init(void)
415 {
416         return register_pernet_subsys(&ip_rt_proc_ops);
417 }
418
419 #else
420 static inline int ip_rt_proc_init(void)
421 {
422         return 0;
423 }
424 #endif /* CONFIG_PROC_FS */
425
426 static inline bool rt_is_expired(const struct rtable *rth)
427 {
428         return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
429 }
430
431 void rt_cache_flush(struct net *net)
432 {
433         rt_genid_bump_ipv4(net);
434 }
435
436 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
437                                            struct sk_buff *skb,
438                                            const void *daddr)
439 {
440         struct net_device *dev = dst->dev;
441         const __be32 *pkey = daddr;
442         const struct rtable *rt;
443         struct neighbour *n;
444
445         rt = (const struct rtable *) dst;
446         if (rt->rt_gateway)
447                 pkey = (const __be32 *) &rt->rt_gateway;
448         else if (skb)
449                 pkey = &ip_hdr(skb)->daddr;
450
451         n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
452         if (n)
453                 return n;
454         return neigh_create(&arp_tbl, pkey, dev);
455 }
456
457 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
458 {
459         struct net_device *dev = dst->dev;
460         const __be32 *pkey = daddr;
461         const struct rtable *rt;
462
463         rt = (const struct rtable *)dst;
464         if (rt->rt_gateway)
465                 pkey = (const __be32 *)&rt->rt_gateway;
466         else if (!daddr ||
467                  (rt->rt_flags &
468                   (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
469                 return;
470
471         __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
472 }
473
474 /* Hash tables of size 2048..262144 depending on RAM size.
475  * Each bucket uses 8 bytes.
476  */
477 static u32 ip_idents_mask __read_mostly;
478 static atomic_t *ip_idents __read_mostly;
479 static u32 *ip_tstamps __read_mostly;
480
481 /* In order to protect privacy, we add a perturbation to identifiers
482  * if one generator is seldom used. This makes it hard for an attacker
483  * to infer how many packets were sent between two points in time.
484  */
485 u32 ip_idents_reserve(u32 hash, int segs)
486 {
487         u32 bucket, old, now = (u32)jiffies;
488         atomic_t *p_id;
489         u32 *p_tstamp;
490         u32 delta = 0;
491
492         bucket = hash & ip_idents_mask;
493         p_tstamp = ip_tstamps + bucket;
494         p_id = ip_idents + bucket;
495         old = READ_ONCE(*p_tstamp);
496
497         if (old != now && cmpxchg(p_tstamp, old, now) == old)
498                 delta = prandom_u32_max(now - old);
499
500         /* If UBSAN reports an error here, please make sure your compiler
501          * supports -fno-strict-overflow before reporting it; that was a bug
502          * in UBSAN, and it has been fixed in GCC 8.
503          */
504         return atomic_add_return(segs + delta, p_id) - segs;
505 }
506 EXPORT_SYMBOL(ip_idents_reserve);
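/* Usage note (derived from the code above): the return value is the first
 * of `segs` consecutive IDs, since atomic_add_return(segs + delta, p_id)
 * advances the generator past the whole block, so a GSO caller gets a
 * contiguous ID range. `delta` randomly skips ahead whenever the bucket
 * sat idle for at least one jiffy, implementing the perturbation described
 * above.
 */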
507
508 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
509 {
510         u32 hash, id;
511
512         /* Note the following init is racy: two CPUs may both seed ip_id_key,
513            but a duplicate get_random_bytes() here is harmless. */
513         if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
514                 get_random_bytes(&net->ipv4.ip_id_key,
515                                  sizeof(net->ipv4.ip_id_key));
516
517         hash = siphash_3u32((__force u32)iph->daddr,
518                             (__force u32)iph->saddr,
519                             iph->protocol,
520                             &net->ipv4.ip_id_key);
521         id = ip_idents_reserve(hash, segs);
522         iph->id = htons(id);
523 }
524 EXPORT_SYMBOL(__ip_select_ident);
525
526 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
527                              const struct sock *sk,
528                              const struct iphdr *iph,
529                              int oif, u8 tos,
530                              u8 prot, u32 mark, int flow_flags)
531 {
532         if (sk) {
533                 const struct inet_sock *inet = inet_sk(sk);
534
535                 oif = sk->sk_bound_dev_if;
536                 mark = sk->sk_mark;
537                 tos = RT_CONN_FLAGS(sk);
538                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
539         }
540         flowi4_init_output(fl4, oif, mark, tos,
541                            RT_SCOPE_UNIVERSE, prot,
542                            flow_flags,
543                            iph->daddr, iph->saddr, 0, 0,
544                            sock_net_uid(net, sk));
545 }
546
547 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
548                                const struct sock *sk)
549 {
550         const struct net *net = dev_net(skb->dev);
551         const struct iphdr *iph = ip_hdr(skb);
552         int oif = skb->dev->ifindex;
553         u8 tos = RT_TOS(iph->tos);
554         u8 prot = iph->protocol;
555         u32 mark = skb->mark;
556
557         __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
558 }
559
560 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
561 {
562         const struct inet_sock *inet = inet_sk(sk);
563         const struct ip_options_rcu *inet_opt;
564         __be32 daddr = inet->inet_daddr;
565
566         rcu_read_lock();
567         inet_opt = rcu_dereference(inet->inet_opt);
568         if (inet_opt && inet_opt->opt.srr)
569                 daddr = inet_opt->opt.faddr;
570         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
571                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
572                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
573                            inet_sk_flowi_flags(sk),
574                            daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
575         rcu_read_unlock();
576 }
577
578 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
579                                  const struct sk_buff *skb)
580 {
581         if (skb)
582                 build_skb_flow_key(fl4, skb, sk);
583         else
584                 build_sk_flow_key(fl4, sk);
585 }
586
587 static DEFINE_SPINLOCK(fnhe_lock);
588
589 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
590 {
591         struct rtable *rt;
592
593         rt = rcu_dereference(fnhe->fnhe_rth_input);
594         if (rt) {
595                 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
596                 dst_dev_put(&rt->dst);
597                 dst_release(&rt->dst);
598         }
599         rt = rcu_dereference(fnhe->fnhe_rth_output);
600         if (rt) {
601                 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
602                 dst_dev_put(&rt->dst);
603                 dst_release(&rt->dst);
604         }
605 }
606
607 static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
608 {
609         struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
610         struct fib_nh_exception *fnhe, *oldest = NULL;
611
612         for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
613                 fnhe = rcu_dereference_protected(*fnhe_p,
614                                                  lockdep_is_held(&fnhe_lock));
615                 if (!fnhe)
616                         break;
617                 if (!oldest ||
618                     time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
619                         oldest = fnhe;
620                         oldest_p = fnhe_p;
621                 }
622         }
623         fnhe_flush_routes(oldest);
624         *oldest_p = oldest->fnhe_next;
625         kfree_rcu(oldest, rcu);
626 }
627
628 static u32 fnhe_hashfun(__be32 daddr)
629 {
630         static siphash_key_t fnhe_hash_key __read_mostly;
631         u64 hval;
632
633         net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
634         hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
635         return hash_64(hval, FNHE_HASH_SHIFT);
636 }
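/* fnhe_hashfun() above keys siphash with boot-time randomness via
 * net_get_random_once(), so an off-path sender cannot precompute
 * destination addresses that collide in one exception bucket; hash_64()
 * then folds the 64-bit value down to FNHE_HASH_SHIFT bits.
 */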
637
638 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
639 {
640         rt->rt_pmtu = fnhe->fnhe_pmtu;
641         rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
642         rt->dst.expires = fnhe->fnhe_expires;
643
644         if (fnhe->fnhe_gw) {
645                 rt->rt_flags |= RTCF_REDIRECTED;
646                 rt->rt_gateway = fnhe->fnhe_gw;
647                 rt->rt_uses_gateway = 1;
648         }
649 }
650
651 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
652                                   u32 pmtu, bool lock, unsigned long expires)
653 {
654         struct fnhe_hash_bucket *hash;
655         struct fib_nh_exception *fnhe;
656         struct rtable *rt;
657         u32 genid, hval;
658         unsigned int i;
659         int depth;
660
661         genid = fnhe_genid(dev_net(nh->nh_dev));
662         hval = fnhe_hashfun(daddr);
663
664         spin_lock_bh(&fnhe_lock);
665
666         hash = rcu_dereference(nh->nh_exceptions);
667         if (!hash) {
668                 hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
669                 if (!hash)
670                         goto out_unlock;
671                 rcu_assign_pointer(nh->nh_exceptions, hash);
672         }
673
674         hash += hval;
675
676         depth = 0;
677         for (fnhe = rcu_dereference(hash->chain); fnhe;
678              fnhe = rcu_dereference(fnhe->fnhe_next)) {
679                 if (fnhe->fnhe_daddr == daddr)
680                         break;
681                 depth++;
682         }
683
684         if (fnhe) {
685                 if (fnhe->fnhe_genid != genid)
686                         fnhe->fnhe_genid = genid;
687                 if (gw)
688                         fnhe->fnhe_gw = gw;
689                 if (pmtu) {
690                         fnhe->fnhe_pmtu = pmtu;
691                         fnhe->fnhe_mtu_locked = lock;
692                 }
693                 fnhe->fnhe_expires = max(1UL, expires);
694                 /* Update all cached dsts too */
695                 rt = rcu_dereference(fnhe->fnhe_rth_input);
696                 if (rt)
697                         fill_route_from_fnhe(rt, fnhe);
698                 rt = rcu_dereference(fnhe->fnhe_rth_output);
699                 if (rt)
700                         fill_route_from_fnhe(rt, fnhe);
701         } else {
702                 /* Randomize max depth to avoid some side-channel attacks. */
703                 int max_depth = FNHE_RECLAIM_DEPTH +
704                                 prandom_u32_max(FNHE_RECLAIM_DEPTH);
705
706                 while (depth > max_depth) {
707                         fnhe_remove_oldest(hash);
708                         depth--;
709                 }
710
711                 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
712                 if (!fnhe)
713                         goto out_unlock;
714
715                 fnhe->fnhe_next = hash->chain;
716
717                 fnhe->fnhe_genid = genid;
718                 fnhe->fnhe_daddr = daddr;
719                 fnhe->fnhe_gw = gw;
720                 fnhe->fnhe_pmtu = pmtu;
721                 fnhe->fnhe_mtu_locked = lock;
722                 fnhe->fnhe_expires = max(1UL, expires);
723
724                 rcu_assign_pointer(hash->chain, fnhe);
725
726                 /* Exception created; mark the cached routes for the nexthop
727                  * stale, so anyone caching it rechecks if this exception
728                  * applies to them.
729                  */
730                 rt = rcu_dereference(nh->nh_rth_input);
731                 if (rt)
732                         rt->dst.obsolete = DST_OBSOLETE_KILL;
733
734                 for_each_possible_cpu(i) {
735                         struct rtable __rcu **prt;
736                         prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
737                         rt = rcu_dereference(*prt);
738                         if (rt)
739                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
740                 }
741         }
742
743         fnhe->fnhe_stamp = jiffies;
744
745 out_unlock:
746         spin_unlock_bh(&fnhe_lock);
747 }
748
749 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
750                              bool kill_route)
751 {
752         __be32 new_gw = icmp_hdr(skb)->un.gateway;
753         __be32 old_gw = ip_hdr(skb)->saddr;
754         struct net_device *dev = skb->dev;
755         struct in_device *in_dev;
756         struct fib_result res;
757         struct neighbour *n;
758         struct net *net;
759
760         switch (icmp_hdr(skb)->code & 7) {
761         case ICMP_REDIR_NET:
762         case ICMP_REDIR_NETTOS:
763         case ICMP_REDIR_HOST:
764         case ICMP_REDIR_HOSTTOS:
765                 break;
766
767         default:
768                 return;
769         }
770
771         if (rt->rt_gateway != old_gw)
772                 return;
773
774         in_dev = __in_dev_get_rcu(dev);
775         if (!in_dev)
776                 return;
777
778         net = dev_net(dev);
779         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
780             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
781             ipv4_is_zeronet(new_gw))
782                 goto reject_redirect;
783
784         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
785                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
786                         goto reject_redirect;
787                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
788                         goto reject_redirect;
789         } else {
790                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
791                         goto reject_redirect;
792         }
793
794         n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
795         if (!n)
796                 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
797         if (!IS_ERR(n)) {
798                 if (!(n->nud_state & NUD_VALID)) {
799                         neigh_event_send(n, NULL);
800                 } else {
801                         if (fib_lookup(net, fl4, &res, 0) == 0) {
802                                 struct fib_nh *nh;
803
804                                 fib_select_path(net, &res, fl4, skb);
805                                 nh = &FIB_RES_NH(res);
806                                 update_or_create_fnhe(nh, fl4->daddr, new_gw,
807                                                 0, false,
808                                                 jiffies + ip_rt_gc_timeout);
809                         }
810                         if (kill_route)
811                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
812                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
813                 }
814                 neigh_release(n);
815         }
816         return;
817
818 reject_redirect:
819 #ifdef CONFIG_IP_ROUTE_VERBOSE
820         if (IN_DEV_LOG_MARTIANS(in_dev)) {
821                 const struct iphdr *iph = (const struct iphdr *) skb->data;
822                 __be32 daddr = iph->daddr;
823                 __be32 saddr = iph->saddr;
824
825                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
826                                      "  Advised path = %pI4 -> %pI4\n",
827                                      &old_gw, dev->name, &new_gw,
828                                      &saddr, &daddr);
829         }
830 #endif
831         ;       /* a label needs a trailing statement when the #ifdef is compiled out */
832 }
833
834 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
835 {
836         struct rtable *rt;
837         struct flowi4 fl4;
838         const struct iphdr *iph = (const struct iphdr *) skb->data;
839         struct net *net = dev_net(skb->dev);
840         int oif = skb->dev->ifindex;
841         u8 tos = RT_TOS(iph->tos);
842         u8 prot = iph->protocol;
843         u32 mark = skb->mark;
844
845         rt = (struct rtable *) dst;
846
847         __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
848         __ip_do_redirect(rt, skb, &fl4, true);
849 }
850
851 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
852 {
853         struct rtable *rt = (struct rtable *)dst;
854         struct dst_entry *ret = dst;
855
856         if (rt) {
857                 if (dst->obsolete > 0) {
858                         ip_rt_put(rt);
859                         ret = NULL;
860                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
861                            rt->dst.expires) {
862                         ip_rt_put(rt);
863                         ret = NULL;
864                 }
865         }
866         return ret;
867 }
868
869 /*
870  * Algorithm:
871  *      1. The first ip_rt_redirect_number redirects are sent
872  *         with exponential backoff, then we stop sending them at all,
873  *         assuming that the host ignores our redirects.
874  *      2. If we did not see packets requiring redirects
875  *         during ip_rt_redirect_silence, we assume that the host
876  *         forgot the redirected route, and we start sending redirects again.
877  *
878  * This algorithm is much cheaper and more intelligent than dumb load limiting
879  * in icmp.c.
880  *
881  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
882  * and "frag. need" (breaks PMTU discovery) in icmp.c.
883  */
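/* Worked example with the defaults above (illustrative arithmetic only):
 * ip_rt_send_redirect() below sends the k-th redirect no sooner than
 * rate_last + (ip_rt_redirect_load << k), so with HZ/50 the back-off gap
 * doubles from 40 ms up to (HZ/50) << 8, roughly 5.1 seconds, before the
 * ip_rt_redirect_number limit of 9 silences it; ip_rt_redirect_silence is
 * (HZ/50) << 10, about 20.5 seconds of quiet before the counters reset.
 */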
884
885 void ip_rt_send_redirect(struct sk_buff *skb)
886 {
887         struct rtable *rt = skb_rtable(skb);
888         struct in_device *in_dev;
889         struct inet_peer *peer;
890         struct net *net;
891         int log_martians;
892         int vif;
893
894         rcu_read_lock();
895         in_dev = __in_dev_get_rcu(rt->dst.dev);
896         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
897                 rcu_read_unlock();
898                 return;
899         }
900         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
901         vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
902         rcu_read_unlock();
903
904         net = dev_net(rt->dst.dev);
905         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
906         if (!peer) {
907                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
908                           rt_nexthop(rt, ip_hdr(skb)->daddr));
909                 return;
910         }
911
912         /* No redirected packets during ip_rt_redirect_silence;
913          * reset the algorithm.
914          */
915         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
916                 peer->rate_tokens = 0;
917                 peer->n_redirects = 0;
918         }
919
920         /* Too many ignored redirects; do not send anything and
921          * set peer->rate_last to the time of the last seen redirected packet.
922          */
923         if (peer->n_redirects >= ip_rt_redirect_number) {
924                 peer->rate_last = jiffies;
925                 goto out_put_peer;
926         }
927
928         /* Check for load limit; set rate_last to the latest sent
929          * redirect.
930          */
931         if (peer->n_redirects == 0 ||
932             time_after(jiffies,
933                        (peer->rate_last +
934                         (ip_rt_redirect_load << peer->n_redirects)))) {
935                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
936
937                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
938                 peer->rate_last = jiffies;
939                 ++peer->n_redirects;
940 #ifdef CONFIG_IP_ROUTE_VERBOSE
941                 if (log_martians &&
942                     peer->n_redirects == ip_rt_redirect_number)
943                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
944                                              &ip_hdr(skb)->saddr, inet_iif(skb),
945                                              &ip_hdr(skb)->daddr, &gw);
946 #endif
947         }
948 out_put_peer:
949         inet_putpeer(peer);
950 }
951
952 static int ip_error(struct sk_buff *skb)
953 {
954         struct rtable *rt = skb_rtable(skb);
955         struct net_device *dev = skb->dev;
956         struct in_device *in_dev;
957         struct inet_peer *peer;
958         unsigned long now;
959         struct net *net;
960         bool send;
961         int code;
962
963         if (netif_is_l3_master(skb->dev)) {
964                 dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
965                 if (!dev)
966                         goto out;
967         }
968
969         in_dev = __in_dev_get_rcu(dev);
970
971         /* IP on this device is disabled. */
972         if (!in_dev)
973                 goto out;
974
975         net = dev_net(rt->dst.dev);
976         if (!IN_DEV_FORWARD(in_dev)) {
977                 switch (rt->dst.error) {
978                 case EHOSTUNREACH:
979                         __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
980                         break;
981
982                 case ENETUNREACH:
983                         __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
984                         break;
985                 }
986                 goto out;
987         }
988
989         switch (rt->dst.error) {
990         case EINVAL:
991         default:
992                 goto out;
993         case EHOSTUNREACH:
994                 code = ICMP_HOST_UNREACH;
995                 break;
996         case ENETUNREACH:
997                 code = ICMP_NET_UNREACH;
998                 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
999                 break;
1000         case EACCES:
1001                 code = ICMP_PKT_FILTERED;
1002                 break;
1003         }
1004
1005         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
1006                                l3mdev_master_ifindex(skb->dev), 1);
1007
1008         send = true;
1009         if (peer) {
1010                 now = jiffies;
1011                 peer->rate_tokens += now - peer->rate_last;
1012                 if (peer->rate_tokens > ip_rt_error_burst)
1013                         peer->rate_tokens = ip_rt_error_burst;
1014                 peer->rate_last = now;
1015                 if (peer->rate_tokens >= ip_rt_error_cost)
1016                         peer->rate_tokens -= ip_rt_error_cost;
1017                 else
1018                         send = false;
1019                 inet_putpeer(peer);
1020         }
1021         if (send)
1022                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1023
1024 out:    kfree_skb(skb);
1025         return 0;
1026 }
1027
1028 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1029 {
1030         struct dst_entry *dst = &rt->dst;
1031         struct net *net = dev_net(dst->dev);
1032         u32 old_mtu = ipv4_mtu(dst);
1033         struct fib_result res;
1034         bool lock = false;
1035
1036         if (ip_mtu_locked(dst))
1037                 return;
1038
1039         if (old_mtu < mtu)
1040                 return;
1041
1042         if (mtu < ip_rt_min_pmtu) {
1043                 lock = true;
1044                 mtu = min(old_mtu, ip_rt_min_pmtu);
1045         }
1046
1047         if (rt->rt_pmtu == mtu && !lock &&
1048             time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1049                 return;
1050
1051         rcu_read_lock();
1052         if (fib_lookup(net, fl4, &res, 0) == 0) {
1053                 struct fib_nh *nh;
1054
1055                 fib_select_path(net, &res, fl4, NULL);
1056                 nh = &FIB_RES_NH(res);
1057                 update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
1058                                       jiffies + ip_rt_mtu_expires);
1059         }
1060         rcu_read_unlock();
1061 }
1062
1063 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1064                               struct sk_buff *skb, u32 mtu,
1065                               bool confirm_neigh)
1066 {
1067         struct rtable *rt = (struct rtable *) dst;
1068         struct flowi4 fl4;
1069
1070         ip_rt_build_flow_key(&fl4, sk, skb);
1071         __ip_rt_update_pmtu(rt, &fl4, mtu);
1072 }
1073
1074 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1075                       int oif, u32 mark, u8 protocol, int flow_flags)
1076 {
1077         const struct iphdr *iph = (const struct iphdr *) skb->data;
1078         struct flowi4 fl4;
1079         struct rtable *rt;
1080
1081         if (!mark)
1082                 mark = IP4_REPLY_MARK(net, skb->mark);
1083
1084         __build_flow_key(net, &fl4, NULL, iph, oif,
1085                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1086         rt = __ip_route_output_key(net, &fl4);
1087         if (!IS_ERR(rt)) {
1088                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1089                 ip_rt_put(rt);
1090         }
1091 }
1092 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1093
1094 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1095 {
1096         const struct iphdr *iph = (const struct iphdr *) skb->data;
1097         struct flowi4 fl4;
1098         struct rtable *rt;
1099
1100         __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1101
1102         if (!fl4.flowi4_mark)
1103                 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1104
1105         rt = __ip_route_output_key(sock_net(sk), &fl4);
1106         if (!IS_ERR(rt)) {
1107                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1108                 ip_rt_put(rt);
1109         }
1110 }
1111
1112 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1113 {
1114         const struct iphdr *iph = (const struct iphdr *) skb->data;
1115         struct flowi4 fl4;
1116         struct rtable *rt;
1117         struct dst_entry *odst = NULL;
1118         bool new = false;
1119         struct net *net = sock_net(sk);
1120
1121         bh_lock_sock(sk);
1122
1123         if (!ip_sk_accept_pmtu(sk))
1124                 goto out;
1125
1126         odst = sk_dst_get(sk);
1127
1128         if (sock_owned_by_user(sk) || !odst) {
1129                 __ipv4_sk_update_pmtu(skb, sk, mtu);
1130                 goto out;
1131         }
1132
1133         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1134
1135         rt = (struct rtable *)odst;
1136         if (odst->obsolete && !odst->ops->check(odst, 0)) {
1137                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1138                 if (IS_ERR(rt))
1139                         goto out;
1140
1141                 new = true;
1142         }
1143
1144         __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
1145
1146         if (!dst_check(&rt->dst, 0)) {
1147                 if (new)
1148                         dst_release(&rt->dst);
1149
1150                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1151                 if (IS_ERR(rt))
1152                         goto out;
1153
1154                 new = true;
1155         }
1156
1157         if (new)
1158                 sk_dst_set(sk, &rt->dst);
1159
1160 out:
1161         bh_unlock_sock(sk);
1162         dst_release(odst);
1163 }
1164 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1165
1166 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1167                    int oif, u32 mark, u8 protocol, int flow_flags)
1168 {
1169         const struct iphdr *iph = (const struct iphdr *) skb->data;
1170         struct flowi4 fl4;
1171         struct rtable *rt;
1172
1173         __build_flow_key(net, &fl4, NULL, iph, oif,
1174                          RT_TOS(iph->tos), protocol, mark, flow_flags);
1175         rt = __ip_route_output_key(net, &fl4);
1176         if (!IS_ERR(rt)) {
1177                 __ip_do_redirect(rt, skb, &fl4, false);
1178                 ip_rt_put(rt);
1179         }
1180 }
1181 EXPORT_SYMBOL_GPL(ipv4_redirect);
1182
1183 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1184 {
1185         const struct iphdr *iph = (const struct iphdr *) skb->data;
1186         struct flowi4 fl4;
1187         struct rtable *rt;
1188         struct net *net = sock_net(sk);
1189
1190         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1191         rt = __ip_route_output_key(net, &fl4);
1192         if (!IS_ERR(rt)) {
1193                 __ip_do_redirect(rt, skb, &fl4, false);
1194                 ip_rt_put(rt);
1195         }
1196 }
1197 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1198
1199 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1200 {
1201         struct rtable *rt = (struct rtable *) dst;
1202
1203         /* All IPv4 dsts are created with ->obsolete set to the value
1204          * DST_OBSOLETE_FORCE_CHK, which always forces validation calls down
1205          * into this function.
1206          *
1207          * When a PMTU/redirect information update invalidates a route,
1208          * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1209          * DST_OBSOLETE_DEAD by dst_free().
1210          */
1211         if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1212                 return NULL;
1213         return dst;
1214 }
1215
1216 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1217 {
1218         struct ip_options opt;
1219         int res;
1220
1221         /* Recompile ip options since IPCB may not be valid anymore.
1222          * Also check we have a reasonable ipv4 header.
1223          */
1224         if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1225             ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1226                 return;
1227
1228         memset(&opt, 0, sizeof(opt));
1229         if (ip_hdr(skb)->ihl > 5) {
1230                 if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1231                         return;
1232                 opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1233
1234                 rcu_read_lock();
1235                 res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1236                 rcu_read_unlock();
1237
1238                 if (res)
1239                         return;
1240         }
1241         __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1242 }
1243
1244 static void ipv4_link_failure(struct sk_buff *skb)
1245 {
1246         struct rtable *rt;
1247
1248         ipv4_send_dest_unreach(skb);
1249
1250         rt = skb_rtable(skb);
1251         if (rt)
1252                 dst_set_expires(&rt->dst, 0);
1253 }
1254
1255 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1256 {
1257         pr_debug("%s: %pI4 -> %pI4, %s\n",
1258                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1259                  skb->dev ? skb->dev->name : "?");
1260         kfree_skb(skb);
1261         WARN_ON(1);
1262         return 0;
1263 }
1264
1265 /*
1266    We do not cache the source address of the outgoing interface,
1267    because it is used only by the IP RR, TS and SRR options,
1268    and is therefore out of the fast path.
1269
1270    BTW remember: "addr" is allowed to be unaligned
1271    in IP options!
1272  */
1273
1274 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1275 {
1276         __be32 src;
1277
1278         if (rt_is_output_route(rt))
1279                 src = ip_hdr(skb)->saddr;
1280         else {
1281                 struct fib_result res;
1282                 struct flowi4 fl4;
1283                 struct iphdr *iph;
1284
1285                 iph = ip_hdr(skb);
1286
1287                 memset(&fl4, 0, sizeof(fl4));
1288                 fl4.daddr = iph->daddr;
1289                 fl4.saddr = iph->saddr;
1290                 fl4.flowi4_tos = RT_TOS(iph->tos);
1291                 fl4.flowi4_oif = rt->dst.dev->ifindex;
1292                 fl4.flowi4_iif = skb->dev->ifindex;
1293                 fl4.flowi4_mark = skb->mark;
1294
1295                 rcu_read_lock();
1296                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1297                         src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1298                 else
1299                         src = inet_select_addr(rt->dst.dev,
1300                                                rt_nexthop(rt, iph->daddr),
1301                                                RT_SCOPE_UNIVERSE);
1302                 rcu_read_unlock();
1303         }
1304         memcpy(addr, &src, 4);
1305 }
1306
1307 #ifdef CONFIG_IP_ROUTE_CLASSID
1308 static void set_class_tag(struct rtable *rt, u32 tag)
1309 {
1310         if (!(rt->dst.tclassid & 0xFFFF))
1311                 rt->dst.tclassid |= tag & 0xFFFF;
1312         if (!(rt->dst.tclassid & 0xFFFF0000))
1313                 rt->dst.tclassid |= tag & 0xFFFF0000;
1314 }
1315 #endif
1316
1317 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1318 {
1319         unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1320         unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1321                                     ip_rt_min_advmss);
1322
1323         return min(advmss, IPV4_MAX_PMTU - header_size);
1324 }
1325
1326 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1327 {
1328         const struct rtable *rt = (const struct rtable *) dst;
1329         unsigned int mtu = rt->rt_pmtu;
1330
1331         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1332                 mtu = dst_metric_raw(dst, RTAX_MTU);
1333
1334         if (mtu)
1335                 goto out;
1336
1337         mtu = READ_ONCE(dst->dev->mtu);
1338
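        /* When the MTU metric is locked (PMTU discovery disabled) and the
         * route goes via a gateway, fall back to 576 bytes, the minimum
         * datagram size every IPv4 host must accept (RFC 791).
         */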
1339         if (unlikely(ip_mtu_locked(dst))) {
1340                 if (rt->rt_uses_gateway && mtu > 576)
1341                         mtu = 576;
1342         }
1343
1344 out:
1345         mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1346
1347         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1348 }
1349
1350 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1351 {
1352         struct fnhe_hash_bucket *hash;
1353         struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1354         u32 hval = fnhe_hashfun(daddr);
1355
1356         spin_lock_bh(&fnhe_lock);
1357
1358         hash = rcu_dereference_protected(nh->nh_exceptions,
1359                                          lockdep_is_held(&fnhe_lock));
1360         hash += hval;
1361
1362         fnhe_p = &hash->chain;
1363         fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1364         while (fnhe) {
1365                 if (fnhe->fnhe_daddr == daddr) {
1366                         rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1367                                 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1368                         /* set fnhe_daddr to 0 to ensure it won't bind with
1369                          * new dsts in rt_bind_exception().
1370                          */
1371                         fnhe->fnhe_daddr = 0;
1372                         fnhe_flush_routes(fnhe);
1373                         kfree_rcu(fnhe, rcu);
1374                         break;
1375                 }
1376                 fnhe_p = &fnhe->fnhe_next;
1377                 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1378                                                  lockdep_is_held(&fnhe_lock));
1379         }
1380
1381         spin_unlock_bh(&fnhe_lock);
1382 }
1383
1384 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1385 {
1386         struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1387         struct fib_nh_exception *fnhe;
1388         u32 hval;
1389
1390         if (!hash)
1391                 return NULL;
1392
1393         hval = fnhe_hashfun(daddr);
1394
1395         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1396              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1397                 if (fnhe->fnhe_daddr == daddr) {
1398                         if (fnhe->fnhe_expires &&
1399                             time_after(jiffies, fnhe->fnhe_expires)) {
1400                                 ip_del_fnhe(nh, daddr);
1401                                 break;
1402                         }
1403                         return fnhe;
1404                 }
1405         }
1406         return NULL;
1407 }
1408
1409 /* MTU selection:
1410  * 1. mtu on route is locked - use it
1411  * 2. mtu from nexthop exception
1412  * 3. mtu from egress device
1413  */
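/* For instance (illustrative): an ICMP "fragmentation needed" handled by
 * __ip_rt_update_pmtu() above installs a nexthop exception, so step 2
 * returns that fnhe_pmtu for the affected destination until fnhe_expires,
 * while other destinations keep using the egress device MTU from step 3.
 */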
1414
1415 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1416 {
1417         struct fib_info *fi = res->fi;
1418         struct fib_nh *nh = &fi->fib_nh[res->nh_sel];
1419         struct net_device *dev = nh->nh_dev;
1420         u32 mtu = 0;
1421
1422         if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
1423             fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1424                 mtu = fi->fib_mtu;
1425
1426         if (likely(!mtu)) {
1427                 struct fib_nh_exception *fnhe;
1428
1429                 fnhe = find_exception(nh, daddr);
1430                 if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1431                         mtu = fnhe->fnhe_pmtu;
1432         }
1433
1434         if (likely(!mtu))
1435                 mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1436
1437         return mtu - lwtunnel_headroom(nh->nh_lwtstate, mtu);
1438 }
1439
1440 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1441                               __be32 daddr, const bool do_cache)
1442 {
1443         bool ret = false;
1444
1445         spin_lock_bh(&fnhe_lock);
1446
1447         if (daddr == fnhe->fnhe_daddr) {
1448                 struct rtable __rcu **porig;
1449                 struct rtable *orig;
1450                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1451
1452                 if (rt_is_input_route(rt))
1453                         porig = &fnhe->fnhe_rth_input;
1454                 else
1455                         porig = &fnhe->fnhe_rth_output;
1456                 orig = rcu_dereference(*porig);
1457
1458                 if (fnhe->fnhe_genid != genid) {
1459                         fnhe->fnhe_genid = genid;
1460                         fnhe->fnhe_gw = 0;
1461                         fnhe->fnhe_pmtu = 0;
1462                         fnhe->fnhe_expires = 0;
1463                         fnhe->fnhe_mtu_locked = false;
1464                         fnhe_flush_routes(fnhe);
1465                         orig = NULL;
1466                 }
1467                 fill_route_from_fnhe(rt, fnhe);
1468                 if (!rt->rt_gateway)
1469                         rt->rt_gateway = daddr;
1470
1471                 if (do_cache) {
1472                         dst_hold(&rt->dst);
1473                         rcu_assign_pointer(*porig, rt);
1474                         if (orig) {
1475                                 dst_dev_put(&orig->dst);
1476                                 dst_release(&orig->dst);
1477                         }
1478                         ret = true;
1479                 }
1480
1481                 fnhe->fnhe_stamp = jiffies;
1482         }
1483         spin_unlock_bh(&fnhe_lock);
1484
1485         return ret;
1486 }
1487
1488 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1489 {
1490         struct rtable *orig, *prev, **p;
1491         bool ret = true;
1492
1493         if (rt_is_input_route(rt)) {
1494                 p = (struct rtable **)&nh->nh_rth_input;
1495         } else {
1496                 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1497         }
1498         orig = *p;
1499
1500         /* hold dst before doing cmpxchg() to avoid a race condition
1501          * on this dst
1502          */
1503         dst_hold(&rt->dst);
1504         prev = cmpxchg(p, orig, rt);
1505         if (prev == orig) {
1506                 if (orig) {
1507                         rt_add_uncached_list(orig);
1508                         dst_release(&orig->dst);
1509                 }
1510         } else {
1511                 dst_release(&rt->dst);
1512                 ret = false;
1513         }
1514
1515         return ret;
1516 }
1517
1518 struct uncached_list {
1519         spinlock_t              lock;
1520         struct list_head        head;
1521 };
1522
1523 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
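/* Routes that could not be (or must not be) cached in the FIB nexthop are
 * chained on these per-cpu lists so that rt_flush_dev() below can still
 * find them and retarget rt->dst.dev at the loopback device when their
 * device disappears.
 */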
1524
1525 void rt_add_uncached_list(struct rtable *rt)
1526 {
1527         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1528
1529         rt->rt_uncached_list = ul;
1530
1531         spin_lock_bh(&ul->lock);
1532         list_add_tail(&rt->rt_uncached, &ul->head);
1533         spin_unlock_bh(&ul->lock);
1534 }
1535
1536 void rt_del_uncached_list(struct rtable *rt)
1537 {
1538         if (!list_empty(&rt->rt_uncached)) {
1539                 struct uncached_list *ul = rt->rt_uncached_list;
1540
1541                 spin_lock_bh(&ul->lock);
1542                 list_del(&rt->rt_uncached);
1543                 spin_unlock_bh(&ul->lock);
1544         }
1545 }
1546
1547 static void ipv4_dst_destroy(struct dst_entry *dst)
1548 {
1549         struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1550         struct rtable *rt = (struct rtable *)dst;
1551
1552         if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
1553                 kfree(p);
1554
1555         rt_del_uncached_list(rt);
1556 }
1557
1558 void rt_flush_dev(struct net_device *dev)
1559 {
1560         struct net *net = dev_net(dev);
1561         struct rtable *rt;
1562         int cpu;
1563
1564         for_each_possible_cpu(cpu) {
1565                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1566
1567                 spin_lock_bh(&ul->lock);
1568                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1569                         if (rt->dst.dev != dev)
1570                                 continue;
1571                         rt->dst.dev = net->loopback_dev;
1572                         dev_hold(rt->dst.dev);
1573                         dev_put(dev);
1574                 }
1575                 spin_unlock_bh(&ul->lock);
1576         }
1577 }
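/* When a device goes away, any uncached rtable still pointing at it
 * is retargeted at the netns loopback device above; this drops the
 * uncached dst's reference on the vanishing device without having to
 * hunt down and free every outstanding dst.
 */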
1578
1579 static bool rt_cache_valid(const struct rtable *rt)
1580 {
1581         return  rt &&
1582                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1583                 !rt_is_expired(rt);
1584 }
1585
1586 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1587                            const struct fib_result *res,
1588                            struct fib_nh_exception *fnhe,
1589                            struct fib_info *fi, u16 type, u32 itag,
1590                            const bool do_cache)
1591 {
1592         bool cached = false;
1593
1594         if (fi) {
1595                 struct fib_nh *nh = &FIB_RES_NH(*res);
1596
1597                 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1598                         rt->rt_gateway = nh->nh_gw;
1599                         rt->rt_uses_gateway = 1;
1600                 }
1601                 dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1602                 if (fi->fib_metrics != &dst_default_metrics) {
1603                         rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1604                         refcount_inc(&fi->fib_metrics->refcnt);
1605                 }
1606 #ifdef CONFIG_IP_ROUTE_CLASSID
1607                 rt->dst.tclassid = nh->nh_tclassid;
1608 #endif
1609                 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1610                 if (unlikely(fnhe))
1611                         cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1612                 else if (do_cache)
1613                         cached = rt_cache_route(nh, rt);
1614                 if (unlikely(!cached)) {
1615                         /* Routes stored in a nexthop exception or FIB
1616                          * nexthop need no extra tracking; however, if
1617                          * storing this route in the cache failed, it
1618                          * must be tracked on the uncached list.
1619                          */
1620                         if (!rt->rt_gateway)
1621                                 rt->rt_gateway = daddr;
1622                         rt_add_uncached_list(rt);
1623                 }
1624         } else
1625                 rt_add_uncached_list(rt);
1626
1627 #ifdef CONFIG_IP_ROUTE_CLASSID
1628 #ifdef CONFIG_IP_MULTIPLE_TABLES
1629         set_class_tag(rt, res->tclassid);
1630 #endif
1631         set_class_tag(rt, itag);
1632 #endif
1633 }
1634
1635 struct rtable *rt_dst_alloc(struct net_device *dev,
1636                             unsigned int flags, u16 type,
1637                             bool nopolicy, bool noxfrm, bool will_cache)
1638 {
1639         struct rtable *rt;
1640
1641         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1642                        (will_cache ? 0 : DST_HOST) |
1643                        (nopolicy ? DST_NOPOLICY : 0) |
1644                        (noxfrm ? DST_NOXFRM : 0));
1645
1646         if (rt) {
1647                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1648                 rt->rt_flags = flags;
1649                 rt->rt_type = type;
1650                 rt->rt_is_input = 0;
1651                 rt->rt_iif = 0;
1652                 rt->rt_pmtu = 0;
1653                 rt->rt_mtu_locked = 0;
1654                 rt->rt_gateway = 0;
1655                 rt->rt_uses_gateway = 0;
1656                 INIT_LIST_HEAD(&rt->rt_uncached);
1657
1658                 rt->dst.output = ip_output;
1659                 if (flags & RTCF_LOCAL)
1660                         rt->dst.input = ip_local_deliver;
1661         }
1662
1663         return rt;
1664 }
1665 EXPORT_SYMBOL(rt_dst_alloc);
1666
1667 /* called in rcu_read_lock() section */
1668 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1669                           u8 tos, struct net_device *dev,
1670                           struct in_device *in_dev, u32 *itag)
1671 {
1672         int err;
1673
1674         /* Primary sanity checks. */
1675         if (!in_dev)
1676                 return -EINVAL;
1677
1678         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1679             skb->protocol != htons(ETH_P_IP))
1680                 return -EINVAL;
1681
1682         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1683                 return -EINVAL;
1684
1685         if (ipv4_is_zeronet(saddr)) {
1686                 if (!ipv4_is_local_multicast(daddr))
1687                         return -EINVAL;
1688         } else {
1689                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1690                                           in_dev, itag);
1691                 if (err < 0)
1692                         return err;
1693         }
1694         return 0;
1695 }
1696
1697 /* called in rcu_read_lock() section */
1698 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1699                              u8 tos, struct net_device *dev, int our)
1700 {
1701         struct in_device *in_dev = __in_dev_get_rcu(dev);
1702         unsigned int flags = RTCF_MULTICAST;
1703         struct rtable *rth;
1704         u32 itag = 0;
1705         int err;
1706
1707         err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1708         if (err)
1709                 return err;
1710
1711         if (our)
1712                 flags |= RTCF_LOCAL;
1713
1714         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1715                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1716         if (!rth)
1717                 return -ENOBUFS;
1718
1719 #ifdef CONFIG_IP_ROUTE_CLASSID
1720         rth->dst.tclassid = itag;
1721 #endif
1722         rth->dst.output = ip_rt_bug;
1723         rth->rt_is_input = 1;
1724
1725 #ifdef CONFIG_IP_MROUTE
1726         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1727                 rth->dst.input = ip_mr_input;
1728 #endif
1729         RT_CACHE_STAT_INC(in_slow_mc);
1730
1731         skb_dst_drop(skb);
1732         skb_dst_set(skb, &rth->dst);
1733         return 0;
1734 }
1735
1736
1737 static void ip_handle_martian_source(struct net_device *dev,
1738                                      struct in_device *in_dev,
1739                                      struct sk_buff *skb,
1740                                      __be32 daddr,
1741                                      __be32 saddr)
1742 {
1743         RT_CACHE_STAT_INC(in_martian_src);
1744 #ifdef CONFIG_IP_ROUTE_VERBOSE
1745         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1746                 /*
1747                  *      RFC 1812 recommendation: if the source is martian,
1748                  *      the only hint we can give is the MAC header.
1749                  */
1750                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1751                         &daddr, &saddr, dev->name);
1752                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1753                         print_hex_dump(KERN_WARNING, "ll header: ",
1754                                        DUMP_PREFIX_OFFSET, 16, 1,
1755                                        skb_mac_header(skb),
1756                                        dev->hard_header_len, true);
1757                 }
1758         }
1759 #endif
1760 }
1761
1762 /* called in rcu_read_lock() section */
1763 static int __mkroute_input(struct sk_buff *skb,
1764                            const struct fib_result *res,
1765                            struct in_device *in_dev,
1766                            __be32 daddr, __be32 saddr, u32 tos)
1767 {
1768         struct fib_nh_exception *fnhe;
1769         struct rtable *rth;
1770         int err;
1771         struct in_device *out_dev;
1772         bool do_cache;
1773         u32 itag = 0;
1774
1775         /* get a working reference to the output device */
1776         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1777         if (!out_dev) {
1778                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1779                 return -EINVAL;
1780         }
1781
1782         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1783                                   in_dev->dev, in_dev, &itag);
1784         if (err < 0) {
1785                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1786                                          saddr);
1787
1788                 goto cleanup;
1789         }
1790
1791         do_cache = res->fi && !itag;
1792         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1793             skb->protocol == htons(ETH_P_IP) &&
1794             (IN_DEV_SHARED_MEDIA(out_dev) ||
1795              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1796                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1797
1798         if (skb->protocol != htons(ETH_P_IP)) {
1799                 /* Not IP (i.e. ARP). Do not create a route if it is
1800                  * invalid for proxy ARP. DNAT routes are always valid.
1801                  *
1802                  * The proxy ARP feature has been extended to allow ARP
1803                  * replies back out the same interface, to support
1804                  * private VLAN switch technologies. See arp.c.
1805                  */
1806                 if (out_dev == in_dev &&
1807                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1808                         err = -EINVAL;
1809                         goto cleanup;
1810                 }
1811         }
1812
1813         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1814         if (do_cache) {
1815                 if (fnhe)
1816                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1817                 else
1818                         rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1819                 if (rt_cache_valid(rth)) {
1820                         skb_dst_set_noref(skb, &rth->dst);
1821                         goto out;
1822                 }
1823         }
1824
1825         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1826                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1827                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1828         if (!rth) {
1829                 err = -ENOBUFS;
1830                 goto cleanup;
1831         }
1832
1833         rth->rt_is_input = 1;
1834         RT_CACHE_STAT_INC(in_slow_tot);
1835
1836         rth->dst.input = ip_forward;
1837
1838         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1839                        do_cache);
1840         lwtunnel_set_redirect(&rth->dst);
1841         skb_dst_set(skb, &rth->dst);
1842 out:
1843         err = 0;
1844  cleanup:
1845         return err;
1846 }
1847
1848 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1849 /* To make ICMP packets follow the right flow, the multipath hash is
1850  * calculated from the inner IP addresses.
1851  */
1852 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1853                                  struct flow_keys *hash_keys)
1854 {
1855         const struct iphdr *outer_iph = ip_hdr(skb);
1856         const struct iphdr *key_iph = outer_iph;
1857         const struct iphdr *inner_iph;
1858         const struct icmphdr *icmph;
1859         struct iphdr _inner_iph;
1860         struct icmphdr _icmph;
1861
1862         if (likely(outer_iph->protocol != IPPROTO_ICMP))
1863                 goto out;
1864
1865         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1866                 goto out;
1867
1868         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1869                                    &_icmph);
1870         if (!icmph)
1871                 goto out;
1872
1873         if (icmph->type != ICMP_DEST_UNREACH &&
1874             icmph->type != ICMP_REDIRECT &&
1875             icmph->type != ICMP_TIME_EXCEEDED &&
1876             icmph->type != ICMP_PARAMETERPROB)
1877                 goto out;
1878
1879         inner_iph = skb_header_pointer(skb,
1880                                        outer_iph->ihl * 4 + sizeof(_icmph),
1881                                        sizeof(_inner_iph), &_inner_iph);
1882         if (!inner_iph)
1883                 goto out;
1884
1885         key_iph = inner_iph;
1886 out:
1887         hash_keys->addrs.v4addrs.src = key_iph->saddr;
1888         hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1889 }
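/* Worked example (addresses illustrative): a flow A -> B picks an
 * ECMP leg by hash(A, B).  An ICMP error generated for that flow has
 * an outer header of <router> -> A, which would hash differently;
 * its payload, however, embeds the original A -> B header.  Keying
 * on that inner header makes the error hash to the same leg as the
 * flow it refers to.
 */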
1890
1891 /* If skb is set it will be used, and fl4 can then be NULL. */
1892 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1893                        const struct sk_buff *skb, struct flow_keys *flkeys)
1894 {
1895         struct flow_keys hash_keys;
1896         u32 mhash;
1897
1898         switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1899         case 0:
1900                 memset(&hash_keys, 0, sizeof(hash_keys));
1901                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1902                 if (skb) {
1903                         ip_multipath_l3_keys(skb, &hash_keys);
1904                 } else {
1905                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1906                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1907                 }
1908                 break;
1909         case 1:
1910                 /* skb is currently provided only when forwarding */
1911                 if (skb) {
1912                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1913                         struct flow_keys keys;
1914
1915                         /* short-circuit if we already have L4 hash present */
1916                         if (skb->l4_hash)
1917                                 return skb_get_hash_raw(skb) >> 1;
1918
1919                         memset(&hash_keys, 0, sizeof(hash_keys));
1920
1921                         if (!flkeys) {
1922                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
1923                                 flkeys = &keys;
1924                         }
1925
1926                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1927                         hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1928                         hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1929                         hash_keys.ports.src = flkeys->ports.src;
1930                         hash_keys.ports.dst = flkeys->ports.dst;
1931                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1932                 } else {
1933                         memset(&hash_keys, 0, sizeof(hash_keys));
1934                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1935                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1936                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1937                         hash_keys.ports.src = fl4->fl4_sport;
1938                         hash_keys.ports.dst = fl4->fl4_dport;
1939                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
1940                 }
1941                 break;
1942         }
1943         mhash = flow_hash_from_keys(&hash_keys);
1944
1945         return mhash >> 1;
1946 }
1947 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
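/* The policy values above map to the net.ipv4.fib_multipath_hash_policy
 * sysctl: 0 selects L3 hashing (addresses only, plus the ICMP
 * inner-header rule above) and 1 selects L4 hashing (five-tuple).
 * E.g., from a shell:
 *
 *	sysctl -w net.ipv4.fib_multipath_hash_policy=1
 */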
1948
1949 static int ip_mkroute_input(struct sk_buff *skb,
1950                             struct fib_result *res,
1951                             struct in_device *in_dev,
1952                             __be32 daddr, __be32 saddr, u32 tos,
1953                             struct flow_keys *hkeys)
1954 {
1955 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1956         if (res->fi && res->fi->fib_nhs > 1) {
1957                 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
1958
1959                 fib_select_multipath(res, h);
1960         }
1961 #endif
1962
1963         /* create a routing cache entry */
1964         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1965 }
1966
1967 /*
1968  *      NOTE. We drop all packets that have a local source
1969  *      address, because every properly looped-back packet must
1970  *      already have the correct destination attached by the output routine.
1971  *
1972  *      This approach solves two big problems:
1973  *      1. Non-simplex devices are handled properly.
1974  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1975  *      Called with rcu_read_lock().
1976  */
1977
1978 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1979                                u8 tos, struct net_device *dev,
1980                                struct fib_result *res)
1981 {
1982         struct in_device *in_dev = __in_dev_get_rcu(dev);
1983         struct flow_keys *flkeys = NULL, _flkeys;
1984         struct net    *net = dev_net(dev);
1985         struct ip_tunnel_info *tun_info;
1986         int             err = -EINVAL;
1987         unsigned int    flags = 0;
1988         u32             itag = 0;
1989         struct rtable   *rth;
1990         struct flowi4   fl4;
1991         bool do_cache = true;
1992
1993         /* IP on this device is disabled. */
1994
1995         if (!in_dev)
1996                 goto out;
1997
1998         /* Check for the weirdest martians, which cannot be detected
1999            by fib_lookup.
2000          */
2001
2002         tun_info = skb_tunnel_info(skb);
2003         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2004                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2005         else
2006                 fl4.flowi4_tun_key.tun_id = 0;
2007         skb_dst_drop(skb);
2008
2009         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2010                 goto martian_source;
2011
2012         res->fi = NULL;
2013         res->table = NULL;
2014         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2015                 goto brd_input;
2016
2017         /* Accept zero addresses only for limited broadcast;
2018          * I do not even know whether to fix this or not. Waiting for complaints :-)
2019          */
2020         if (ipv4_is_zeronet(saddr))
2021                 goto martian_source;
2022
2023         if (ipv4_is_zeronet(daddr))
2024                 goto martian_destination;
2025
2026         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
2027          * more than once when daddr and/or saddr is a loopback address.
2028          */
2029         if (ipv4_is_loopback(daddr)) {
2030                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2031                         goto martian_destination;
2032         } else if (ipv4_is_loopback(saddr)) {
2033                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2034                         goto martian_source;
2035         }
2036
2037         /*
2038          *      Now we are ready to route packet.
2039          */
2040         fl4.flowi4_oif = 0;
2041         fl4.flowi4_iif = dev->ifindex;
2042         fl4.flowi4_mark = skb->mark;
2043         fl4.flowi4_tos = tos;
2044         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2045         fl4.flowi4_flags = 0;
2046         fl4.daddr = daddr;
2047         fl4.saddr = saddr;
2048         fl4.flowi4_uid = sock_net_uid(net, NULL);
2049
2050         if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2051                 flkeys = &_flkeys;
2052         } else {
2053                 fl4.flowi4_proto = 0;
2054                 fl4.fl4_sport = 0;
2055                 fl4.fl4_dport = 0;
2056         }
2057
2058         err = fib_lookup(net, &fl4, res, 0);
2059         if (err != 0) {
2060                 if (!IN_DEV_FORWARD(in_dev))
2061                         err = -EHOSTUNREACH;
2062                 goto no_route;
2063         }
2064
2065         if (res->type == RTN_BROADCAST) {
2066                 if (IN_DEV_BFORWARD(in_dev))
2067                         goto make_route;
2068                 /* do not cache if bc_forwarding is enabled */
2069                 if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2070                         do_cache = false;
2071                 goto brd_input;
2072         }
2073
2074         if (res->type == RTN_LOCAL) {
2075                 err = fib_validate_source(skb, saddr, daddr, tos,
2076                                           0, dev, in_dev, &itag);
2077                 if (err < 0)
2078                         goto martian_source;
2079                 goto local_input;
2080         }
2081
2082         if (!IN_DEV_FORWARD(in_dev)) {
2083                 err = -EHOSTUNREACH;
2084                 goto no_route;
2085         }
2086         if (res->type != RTN_UNICAST)
2087                 goto martian_destination;
2088
2089 make_route:
2090         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2091 out:    return err;
2092
2093 brd_input:
2094         if (skb->protocol != htons(ETH_P_IP))
2095                 goto e_inval;
2096
2097         if (!ipv4_is_zeronet(saddr)) {
2098                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2099                                           in_dev, &itag);
2100                 if (err < 0)
2101                         goto martian_source;
2102         }
2103         flags |= RTCF_BROADCAST;
2104         res->type = RTN_BROADCAST;
2105         RT_CACHE_STAT_INC(in_brd);
2106
2107 local_input:
2108         do_cache &= res->fi && !itag;
2109         if (do_cache) {
2110                 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
2111                 if (rt_cache_valid(rth)) {
2112                         skb_dst_set_noref(skb, &rth->dst);
2113                         err = 0;
2114                         goto out;
2115                 }
2116         }
2117
2118         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2119                            flags | RTCF_LOCAL, res->type,
2120                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2121         if (!rth)
2122                 goto e_nobufs;
2123
2124         rth->dst.output = ip_rt_bug;
2125 #ifdef CONFIG_IP_ROUTE_CLASSID
2126         rth->dst.tclassid = itag;
2127 #endif
2128         rth->rt_is_input = 1;
2129
2130         RT_CACHE_STAT_INC(in_slow_tot);
2131         if (res->type == RTN_UNREACHABLE) {
2132                 rth->dst.input = ip_error;
2133                 rth->dst.error = -err;
2134                 rth->rt_flags   &= ~RTCF_LOCAL;
2135         }
2136
2137         if (do_cache) {
2138                 struct fib_nh *nh = &FIB_RES_NH(*res);
2139
2140                 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2141                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2142                         WARN_ON(rth->dst.input == lwtunnel_input);
2143                         rth->dst.lwtstate->orig_input = rth->dst.input;
2144                         rth->dst.input = lwtunnel_input;
2145                 }
2146
2147                 if (unlikely(!rt_cache_route(nh, rth)))
2148                         rt_add_uncached_list(rth);
2149         }
2150         skb_dst_set(skb, &rth->dst);
2151         err = 0;
2152         goto out;
2153
2154 no_route:
2155         RT_CACHE_STAT_INC(in_no_route);
2156         res->type = RTN_UNREACHABLE;
2157         res->fi = NULL;
2158         res->table = NULL;
2159         goto local_input;
2160
2161         /*
2162          *      Do not cache martian addresses: they should be logged (RFC 1812).
2163          */
2164 martian_destination:
2165         RT_CACHE_STAT_INC(in_martian_dst);
2166 #ifdef CONFIG_IP_ROUTE_VERBOSE
2167         if (IN_DEV_LOG_MARTIANS(in_dev))
2168                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2169                                      &daddr, &saddr, dev->name);
2170 #endif
2171
2172 e_inval:
2173         err = -EINVAL;
2174         goto out;
2175
2176 e_nobufs:
2177         err = -ENOBUFS;
2178         goto out;
2179
2180 martian_source:
2181         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2182         goto out;
2183 }
2184
2185 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2186                          u8 tos, struct net_device *dev)
2187 {
2188         struct fib_result res;
2189         int err;
2190
2191         tos &= IPTOS_RT_MASK;
2192         rcu_read_lock();
2193         err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2194         rcu_read_unlock();
2195
2196         return err;
2197 }
2198 EXPORT_SYMBOL(ip_route_input_noref);
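/* A minimal caller sketch (illustrative; the receive path does the
 * equivalent of this):
 *
 *	const struct iphdr *iph = ip_hdr(skb);
 *	int err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				       iph->tos, skb->dev);
 *	if (err)
 *		goto drop;
 *
 * On success a routing decision is attached to the skb (via
 * skb_dst_set() or skb_dst_set_noref()) and later drives
 * skb_dst(skb)->input().
 */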
2199
2200 /* called with rcu_read_lock held */
2201 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2202                        u8 tos, struct net_device *dev, struct fib_result *res)
2203 {
2204         /* Multicast recognition logic was moved from the route cache to here.
2205            The problem was that too many Ethernet cards have broken/missing
2206            hardware multicast filters :-( As a result, a host on a multicast
2207            network acquired a lot of useless route cache entries, e.g. for
2208            SDR messages from all over the world.  Now we try to get rid of them.
2209            Provided the software IP multicast filter is organized
2210            reasonably (at least, hashed), this does not cause a slowdown
2211            compared with route cache reject entries.
2212            Note that multicast routers are not affected, because a
2213            route cache entry is created eventually.
2214          */
2215         if (ipv4_is_multicast(daddr)) {
2216                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2217                 int our = 0;
2218                 int err = -EINVAL;
2219
2220                 if (!in_dev)
2221                         return err;
2222                 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2223                                       ip_hdr(skb)->protocol);
2224
2225                 /* check l3 master if no match yet */
2226                 if (!our && netif_is_l3_slave(dev)) {
2227                         struct in_device *l3_in_dev;
2228
2229                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2230                         if (l3_in_dev)
2231                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2232                                                       ip_hdr(skb)->protocol);
2233                 }
2234
2235                 if (our
2236 #ifdef CONFIG_IP_MROUTE
2237                         ||
2238                     (!ipv4_is_local_multicast(daddr) &&
2239                      IN_DEV_MFORWARD(in_dev))
2240 #endif
2241                    ) {
2242                         err = ip_route_input_mc(skb, daddr, saddr,
2243                                                 tos, dev, our);
2244                 }
2245                 return err;
2246         }
2247
2248         return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2249 }
2250
2251 /* called with rcu_read_lock() */
2252 static struct rtable *__mkroute_output(const struct fib_result *res,
2253                                        const struct flowi4 *fl4, int orig_oif,
2254                                        struct net_device *dev_out,
2255                                        unsigned int flags)
2256 {
2257         struct fib_info *fi = res->fi;
2258         struct fib_nh_exception *fnhe;
2259         struct in_device *in_dev;
2260         u16 type = res->type;
2261         struct rtable *rth;
2262         bool do_cache;
2263
2264         in_dev = __in_dev_get_rcu(dev_out);
2265         if (!in_dev)
2266                 return ERR_PTR(-EINVAL);
2267
2268         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2269                 if (ipv4_is_loopback(fl4->saddr) &&
2270                     !(dev_out->flags & IFF_LOOPBACK) &&
2271                     !netif_is_l3_master(dev_out))
2272                         return ERR_PTR(-EINVAL);
2273
2274         if (ipv4_is_lbcast(fl4->daddr))
2275                 type = RTN_BROADCAST;
2276         else if (ipv4_is_multicast(fl4->daddr))
2277                 type = RTN_MULTICAST;
2278         else if (ipv4_is_zeronet(fl4->daddr))
2279                 return ERR_PTR(-EINVAL);
2280
2281         if (dev_out->flags & IFF_LOOPBACK)
2282                 flags |= RTCF_LOCAL;
2283
2284         do_cache = true;
2285         if (type == RTN_BROADCAST) {
2286                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2287                 fi = NULL;
2288         } else if (type == RTN_MULTICAST) {
2289                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2290                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2291                                      fl4->flowi4_proto))
2292                         flags &= ~RTCF_LOCAL;
2293                 else
2294                         do_cache = false;
2295                 /* If the multicast route does not exist, use the
2296                  * default one, but do not use a gateway in this case.
2297                  * Yes, it is a hack.
2298                  */
2299                 if (fi && res->prefixlen < 4)
2300                         fi = NULL;
2301         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2302                    (orig_oif != dev_out->ifindex)) {
2303                 /* For local routes that require a particular output interface
2304                  * we do not want to cache the result.  Caching the result
2305                  * causes incorrect behaviour when there are multiple source
2306                  * addresses on the interface, the end result being that if the
2307                  * intended recipient is waiting on that interface for the
2308                  * packet he won't receive it because it will be delivered on
2309                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2310                  * be set to the loopback interface as well.
2311                  */
2312                 do_cache = false;
2313         }
2314
2315         fnhe = NULL;
2316         do_cache &= fi != NULL;
2317         if (fi) {
2318                 struct rtable __rcu **prth;
2319                 struct fib_nh *nh = &FIB_RES_NH(*res);
2320
2321                 fnhe = find_exception(nh, fl4->daddr);
2322                 if (!do_cache)
2323                         goto add;
2324                 if (fnhe) {
2325                         prth = &fnhe->fnhe_rth_output;
2326                 } else {
2327                         if (unlikely(fl4->flowi4_flags &
2328                                      FLOWI_FLAG_KNOWN_NH &&
2329                                      !(nh->nh_gw &&
2330                                        nh->nh_scope == RT_SCOPE_LINK))) {
2331                                 do_cache = false;
2332                                 goto add;
2333                         }
2334                         prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2335                 }
2336                 rth = rcu_dereference(*prth);
2337                 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2338                         return rth;
2339         }
2340
2341 add:
2342         rth = rt_dst_alloc(dev_out, flags, type,
2343                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2344                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2345                            do_cache);
2346         if (!rth)
2347                 return ERR_PTR(-ENOBUFS);
2348
2349         rth->rt_iif = orig_oif;
2350
2351         RT_CACHE_STAT_INC(out_slow_tot);
2352
2353         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2354                 if (flags & RTCF_LOCAL &&
2355                     !(dev_out->flags & IFF_LOOPBACK)) {
2356                         rth->dst.output = ip_mc_output;
2357                         RT_CACHE_STAT_INC(out_slow_mc);
2358                 }
2359 #ifdef CONFIG_IP_MROUTE
2360                 if (type == RTN_MULTICAST) {
2361                         if (IN_DEV_MFORWARD(in_dev) &&
2362                             !ipv4_is_local_multicast(fl4->daddr)) {
2363                                 rth->dst.input = ip_mr_input;
2364                                 rth->dst.output = ip_mc_output;
2365                         }
2366                 }
2367 #endif
2368         }
2369
2370         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2371         lwtunnel_set_redirect(&rth->dst);
2372
2373         return rth;
2374 }
2375
2376 /*
2377  * Major route resolver routine.
2378  */
2379
2380 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2381                                         const struct sk_buff *skb)
2382 {
2383         __u8 tos = RT_FL_TOS(fl4);
2384         struct fib_result res = {
2385                 .type           = RTN_UNSPEC,
2386                 .fi             = NULL,
2387                 .table          = NULL,
2388                 .tclassid       = 0,
2389         };
2390         struct rtable *rth;
2391
2392         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2393         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2394         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2395                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2396
2397         rcu_read_lock();
2398         rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2399         rcu_read_unlock();
2400
2401         return rth;
2402 }
2403 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
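/* A minimal output-lookup sketch, using only what this file shows
 * (local names are illustrative):
 *
 *	struct flowi4 fl4 = {
 *		.daddr = daddr,		// destination to route to
 *		.saddr = 0,		// 0 = let the resolver choose
 *		.flowi4_oif = 0,	// 0 = any interface
 *	};
 *	struct rtable *rth = ip_route_output_key_hash(net, &fl4, NULL);
 *
 *	if (!IS_ERR(rth)) {
 *		// ... use rth->dst ...
 *		ip_rt_put(rth);
 *	}
 *
 * Note that fl4 is also an output: saddr and flowi4_oif may be
 * filled in during the lookup, which is why it is passed writable.
 */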
2404
2405 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2406                                             struct fib_result *res,
2407                                             const struct sk_buff *skb)
2408 {
2409         struct net_device *dev_out = NULL;
2410         int orig_oif = fl4->flowi4_oif;
2411         unsigned int flags = 0;
2412         struct rtable *rth;
2413         int err;
2414
2415         if (fl4->saddr) {
2416                 if (ipv4_is_multicast(fl4->saddr) ||
2417                     ipv4_is_lbcast(fl4->saddr) ||
2418                     ipv4_is_zeronet(fl4->saddr)) {
2419                         rth = ERR_PTR(-EINVAL);
2420                         goto out;
2421                 }
2422
2423                 rth = ERR_PTR(-ENETUNREACH);
2424
2425                 /* I removed the check for oif == dev_out->oif here.
2426                    It was wrong for two reasons:
2427                    1. ip_dev_find(net, saddr) can return the wrong iface if
2428                       saddr is assigned to multiple interfaces.
2429                    2. Moreover, we are allowed to send packets with the saddr
2430                       of another iface. --ANK
2431                  */
2432
2433                 if (fl4->flowi4_oif == 0 &&
2434                     (ipv4_is_multicast(fl4->daddr) ||
2435                      ipv4_is_lbcast(fl4->daddr))) {
2436                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2437                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2438                         if (!dev_out)
2439                                 goto out;
2440
2441                         /* Special hack: the user can direct multicasts
2442                            and limited broadcast via the necessary interface
2443                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2444                            This hack is not just for fun, it allows
2445                            vic, vat and friends to work.
2446                            They bind a socket to loopback, set the ttl to zero
2447                            and expect that it will work.
2448                            From the viewpoint of the routing cache they are
2449                            broken, because we are not allowed to build a
2450                            multicast path with a loopback source address (the
2451                            routing cache cannot know that the ttl is zero, so
2452                            the packet will not leave this host and the route
2453                            is valid).  Luckily, this hack is a good workaround.
2454                          */
2455
2456                         fl4->flowi4_oif = dev_out->ifindex;
2457                         goto make_route;
2458                 }
2459
2460                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2461                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2462                         if (!__ip_dev_find(net, fl4->saddr, false))
2463                                 goto out;
2464                 }
2465         }
2466
2467
2468         if (fl4->flowi4_oif) {
2469                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2470                 rth = ERR_PTR(-ENODEV);
2471                 if (!dev_out)
2472                         goto out;
2473
2474                 /* RACE: Check return value of inet_select_addr instead. */
2475                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2476                         rth = ERR_PTR(-ENETUNREACH);
2477                         goto out;
2478                 }
2479                 if (ipv4_is_local_multicast(fl4->daddr) ||
2480                     ipv4_is_lbcast(fl4->daddr) ||
2481                     fl4->flowi4_proto == IPPROTO_IGMP) {
2482                         if (!fl4->saddr)
2483                                 fl4->saddr = inet_select_addr(dev_out, 0,
2484                                                               RT_SCOPE_LINK);
2485                         goto make_route;
2486                 }
2487                 if (!fl4->saddr) {
2488                         if (ipv4_is_multicast(fl4->daddr))
2489                                 fl4->saddr = inet_select_addr(dev_out, 0,
2490                                                               fl4->flowi4_scope);
2491                         else if (!fl4->daddr)
2492                                 fl4->saddr = inet_select_addr(dev_out, 0,
2493                                                               RT_SCOPE_HOST);
2494                 }
2495         }
2496
2497         if (!fl4->daddr) {
2498                 fl4->daddr = fl4->saddr;
2499                 if (!fl4->daddr)
2500                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2501                 dev_out = net->loopback_dev;
2502                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2503                 res->type = RTN_LOCAL;
2504                 flags |= RTCF_LOCAL;
2505                 goto make_route;
2506         }
2507
2508         err = fib_lookup(net, fl4, res, 0);
2509         if (err) {
2510                 res->fi = NULL;
2511                 res->table = NULL;
2512                 if (fl4->flowi4_oif &&
2513                     (ipv4_is_multicast(fl4->daddr) ||
2514                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2515                         /* Apparently, the routing tables are wrong.
2516                            Assume that the destination is on-link.
2517
2518                            WHY? DW.
2519                            Because we are allowed to send to an iface
2520                            even if it has NO routes and NO assigned
2521                            addresses. When oif is specified, the routing
2522                            tables are looked up with only one purpose:
2523                            to catch whether the destination is gatewayed
2524                            rather than direct. Moreover, if MSG_DONTROUTE
2525                            is set, we send the packet, ignoring both the
2526                            routing tables and the ifaddr state. --ANK
2527
2528
2529                            We could do this even if oif is unknown,
2530                            as IPv6 likely does, but we do not.
2531                          */
2532
2533                         if (fl4->saddr == 0)
2534                                 fl4->saddr = inet_select_addr(dev_out, 0,
2535                                                               RT_SCOPE_LINK);
2536                         res->type = RTN_UNICAST;
2537                         goto make_route;
2538                 }
2539                 rth = ERR_PTR(err);
2540                 goto out;
2541         }
2542
2543         if (res->type == RTN_LOCAL) {
2544                 if (!fl4->saddr) {
2545                         if (res->fi->fib_prefsrc)
2546                                 fl4->saddr = res->fi->fib_prefsrc;
2547                         else
2548                                 fl4->saddr = fl4->daddr;
2549                 }
2550
2551                 /* L3 master device is the loopback for that domain */
2552                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2553                         net->loopback_dev;
2554
2555                 /* make sure orig_oif points to fib result device even
2556                  * though packet rx/tx happens over loopback or l3mdev
2557                  */
2558                 orig_oif = FIB_RES_OIF(*res);
2559
2560                 fl4->flowi4_oif = dev_out->ifindex;
2561                 flags |= RTCF_LOCAL;
2562                 goto make_route;
2563         }
2564
2565         fib_select_path(net, res, fl4, skb);
2566
2567         dev_out = FIB_RES_DEV(*res);
2568
2569 make_route:
2570         rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2571
2572 out:
2573         return rth;
2574 }
2575
2576 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2577 {
2578         return NULL;
2579 }
2580
2581 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2582 {
2583         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2584
2585         return mtu ? : dst->dev->mtu;
2586 }
2587
2588 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2589                                           struct sk_buff *skb, u32 mtu,
2590                                           bool confirm_neigh)
2591 {
2592 }
2593
2594 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2595                                        struct sk_buff *skb)
2596 {
2597 }
2598
2599 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2600                                           unsigned long old)
2601 {
2602         return NULL;
2603 }
2604
2605 static struct dst_ops ipv4_dst_blackhole_ops = {
2606         .family                 =       AF_INET,
2607         .check                  =       ipv4_blackhole_dst_check,
2608         .mtu                    =       ipv4_blackhole_mtu,
2609         .default_advmss         =       ipv4_default_advmss,
2610         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2611         .redirect               =       ipv4_rt_blackhole_redirect,
2612         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2613         .neigh_lookup           =       ipv4_neigh_lookup,
2614 };
2615
2616 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2617 {
2618         struct rtable *ort = (struct rtable *) dst_orig;
2619         struct rtable *rt;
2620
2621         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2622         if (rt) {
2623                 struct dst_entry *new = &rt->dst;
2624
2625                 new->__use = 1;
2626                 new->input = dst_discard;
2627                 new->output = dst_discard_out;
2628
2629                 new->dev = net->loopback_dev;
2630                 if (new->dev)
2631                         dev_hold(new->dev);
2632
2633                 rt->rt_is_input = ort->rt_is_input;
2634                 rt->rt_iif = ort->rt_iif;
2635                 rt->rt_pmtu = ort->rt_pmtu;
2636                 rt->rt_mtu_locked = ort->rt_mtu_locked;
2637
2638                 rt->rt_genid = rt_genid_ipv4(net);
2639                 rt->rt_flags = ort->rt_flags;
2640                 rt->rt_type = ort->rt_type;
2641                 rt->rt_gateway = ort->rt_gateway;
2642                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2643
2644                 INIT_LIST_HEAD(&rt->rt_uncached);
2645         }
2646
2647         dst_release(dst_orig);
2648
2649         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2650 }
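/* Note: ipv4_blackhole_route() consumes the caller's reference on
 * dst_orig (the dst_release() above) and returns a dst whose input
 * and output handlers discard every packet.  It is used, e.g., as
 * the xfrm blackhole hook, letting a socket keep a valid dst while
 * IPsec state for the flow is still being resolved.
 */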
2651
2652 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2653                                     const struct sock *sk)
2654 {
2655         struct rtable *rt = __ip_route_output_key(net, flp4);
2656
2657         if (IS_ERR(rt))
2658                 return rt;
2659
2660         if (flp4->flowi4_proto) {
2661                 flp4->flowi4_oif = rt->dst.dev->ifindex;
2662                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2663                                                         flowi4_to_flowi(flp4),
2664                                                         sk, 0);
2665         }
2666
2667         return rt;
2668 }
2669 EXPORT_SYMBOL_GPL(ip_route_output_flow);
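/* A connect()-style usage sketch (illustrative): setting
 * flowi4_proto is what lets the xfrm rerouting above kick in, so a
 * protocol socket would typically do something like
 *
 *	fl4.flowi4_proto = IPPROTO_UDP;
 *	fl4.fl4_sport = sport;
 *	fl4.fl4_dport = dport;
 *	rt = ip_route_output_flow(net, &fl4, sk);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *
 * where sport/dport stand in for the socket's local and remote
 * ports.
 */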
2670
2671 /* called with rcu_read_lock held */
2672 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2673                         struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2674                         struct sk_buff *skb, u32 portid, u32 seq)
2675 {
2676         struct rtmsg *r;
2677         struct nlmsghdr *nlh;
2678         unsigned long expires = 0;
2679         u32 error;
2680         u32 metrics[RTAX_MAX];
2681
2682         nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2683         if (!nlh)
2684                 return -EMSGSIZE;
2685
2686         r = nlmsg_data(nlh);
2687         r->rtm_family    = AF_INET;
2688         r->rtm_dst_len  = 32;
2689         r->rtm_src_len  = 0;
2690         r->rtm_tos      = fl4->flowi4_tos;
2691         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2692         if (nla_put_u32(skb, RTA_TABLE, table_id))
2693                 goto nla_put_failure;
2694         r->rtm_type     = rt->rt_type;
2695         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2696         r->rtm_protocol = RTPROT_UNSPEC;
2697         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2698         if (rt->rt_flags & RTCF_NOTIFY)
2699                 r->rtm_flags |= RTM_F_NOTIFY;
2700         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2701                 r->rtm_flags |= RTCF_DOREDIRECT;
2702
2703         if (nla_put_in_addr(skb, RTA_DST, dst))
2704                 goto nla_put_failure;
2705         if (src) {
2706                 r->rtm_src_len = 32;
2707                 if (nla_put_in_addr(skb, RTA_SRC, src))
2708                         goto nla_put_failure;
2709         }
2710         if (rt->dst.dev &&
2711             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2712                 goto nla_put_failure;
2713 #ifdef CONFIG_IP_ROUTE_CLASSID
2714         if (rt->dst.tclassid &&
2715             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2716                 goto nla_put_failure;
2717 #endif
2718         if (!rt_is_input_route(rt) &&
2719             fl4->saddr != src) {
2720                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2721                         goto nla_put_failure;
2722         }
2723         if (rt->rt_uses_gateway &&
2724             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2725                 goto nla_put_failure;
2726
2727         expires = rt->dst.expires;
2728         if (expires) {
2729                 unsigned long now = jiffies;
2730
2731                 if (time_before(now, expires))
2732                         expires -= now;
2733                 else
2734                         expires = 0;
2735         }
2736
2737         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2738         if (rt->rt_pmtu && expires)
2739                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2740         if (rt->rt_mtu_locked && expires)
2741                 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2742         if (rtnetlink_put_metrics(skb, metrics) < 0)
2743                 goto nla_put_failure;
2744
2745         if (fl4->flowi4_mark &&
2746             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2747                 goto nla_put_failure;
2748
2749         if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2750             nla_put_u32(skb, RTA_UID,
2751                         from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2752                 goto nla_put_failure;
2753
2754         error = rt->dst.error;
2755
2756         if (rt_is_input_route(rt)) {
2757 #ifdef CONFIG_IP_MROUTE
2758                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2759                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2760                         int err = ipmr_get_route(net, skb,
2761                                                  fl4->saddr, fl4->daddr,
2762                                                  r, portid);
2763
2764                         if (err <= 0) {
2765                                 if (err == 0)
2766                                         return 0;
2767                                 goto nla_put_failure;
2768                         }
2769                 } else
2770 #endif
2771                         if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2772                                 goto nla_put_failure;
2773         }
2774
2775         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2776                 goto nla_put_failure;
2777
2778         nlmsg_end(skb, nlh);
2779         return 0;
2780
2781 nla_put_failure:
2782         nlmsg_cancel(skb, nlh);
2783         return -EMSGSIZE;
2784 }
2785
2786 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
2787                                                    u8 ip_proto, __be16 sport,
2788                                                    __be16 dport)
2789 {
2790         struct sk_buff *skb;
2791         struct iphdr *iph;
2792
2793         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2794         if (!skb)
2795                 return NULL;
2796
2797         /* Reserve room for dummy headers; this skb can pass
2798          * through a good chunk of the routing engine.
2799          */
2800         skb_reset_mac_header(skb);
2801         skb_reset_network_header(skb);
2802         skb->protocol = htons(ETH_P_IP);
2803         iph = skb_put(skb, sizeof(struct iphdr));
2804         iph->protocol = ip_proto;
2805         iph->saddr = src;
2806         iph->daddr = dst;
2807         iph->version = 0x4;
2808         iph->frag_off = 0;
2809         iph->ihl = 0x5;
2810         skb_set_transport_header(skb, skb->len);
2811
2812         switch (iph->protocol) {
2813         case IPPROTO_UDP: {
2814                 struct udphdr *udph;
2815
2816                 udph = skb_put_zero(skb, sizeof(struct udphdr));
2817                 udph->source = sport;
2818                 udph->dest = dport;
2819                 udph->len = htons(sizeof(struct udphdr));
2820                 udph->check = 0;
2821                 break;
2822         }
2823         case IPPROTO_TCP: {
2824                 struct tcphdr *tcph;
2825
2826                 tcph = skb_put_zero(skb, sizeof(struct tcphdr));
2827                 tcph->source    = sport;
2828                 tcph->dest      = dport;
2829                 tcph->doff      = sizeof(struct tcphdr) / 4;
2830                 tcph->rst = 1;
2831                 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
2832                                             src, dst, 0);
2833                 break;
2834         }
2835         case IPPROTO_ICMP: {
2836                 struct icmphdr *icmph;
2837
2838                 icmph = skb_put_zero(skb, sizeof(struct icmphdr));
2839                 icmph->type = ICMP_ECHO;
2840                 icmph->code = 0;
2841         }
2842         }
2843
2844         return skb;
2845 }
2846
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
                             struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(in_skb->sk);
        struct nlattr *tb[RTA_MAX+1];
        u32 table_id = RT_TABLE_MAIN;
        __be16 sport = 0, dport = 0;
        struct fib_result res = {};
        u8 ip_proto = IPPROTO_UDP;
        struct rtable *rt = NULL;
        struct sk_buff *skb;
        struct rtmsg *rtm;
        struct flowi4 fl4;
        __be32 dst = 0;
        __be32 src = 0;
        kuid_t uid;
        u32 iif;
        int err;
        int mark;

        err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
                          extack);
        if (err < 0)
                return err;

        rtm = nlmsg_data(nlh);
        src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
        dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
        iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
        mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
        if (tb[RTA_UID])
                uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
        else
                uid = (iif ? INVALID_UID : current_uid());

        if (tb[RTA_IP_PROTO]) {
                err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
                                                  &ip_proto, AF_INET, extack);
                if (err)
                        return err;
        }

        if (tb[RTA_SPORT])
                sport = nla_get_be16(tb[RTA_SPORT]);

        if (tb[RTA_DPORT])
                dport = nla_get_be16(tb[RTA_DPORT]);

        skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
        if (!skb)
                return -ENOBUFS;

        memset(&fl4, 0, sizeof(fl4));
        fl4.daddr = dst;
        fl4.saddr = src;
        fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
        fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
        fl4.flowi4_mark = mark;
        fl4.flowi4_uid = uid;
        if (sport)
                fl4.fl4_sport = sport;
        if (dport)
                fl4.fl4_dport = dport;
        fl4.flowi4_proto = ip_proto;

        rcu_read_lock();

        if (iif) {
                struct net_device *dev;

                dev = dev_get_by_index_rcu(net, iif);
                if (!dev) {
                        err = -ENODEV;
                        goto errout_rcu;
                }

                fl4.flowi4_iif = iif; /* for rt_fill_info */
                skb->dev        = dev;
                skb->mark       = mark;
                err = ip_route_input_rcu(skb, dst, src,
                                         rtm->rtm_tos & IPTOS_RT_MASK, dev,
                                         &res);

                rt = skb_rtable(skb);
                if (err == 0 && rt->dst.error)
                        err = -rt->dst.error;
        } else {
                fl4.flowi4_iif = LOOPBACK_IFINDEX;
                rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
                err = 0;
                if (IS_ERR(rt))
                        err = PTR_ERR(rt);
                else
                        skb_dst_set(skb, &rt->dst);
        }

        if (err)
                goto errout_rcu;

        if (rtm->rtm_flags & RTM_F_NOTIFY)
                rt->rt_flags |= RTCF_NOTIFY;

        if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
                table_id = res.table ? res.table->tb_id : 0;

        /* reset skb for netlink reply msg */
        skb_trim(skb, 0);
        skb_reset_network_header(skb);
        skb_reset_transport_header(skb);
        skb_reset_mac_header(skb);

        if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
                if (!res.fi) {
                        err = fib_props[res.type].error;
                        if (!err)
                                err = -EHOSTUNREACH;
                        goto errout_rcu;
                }
                err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
                                    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
                                    rt->rt_type, res.prefix, res.prefixlen,
                                    fl4.flowi4_tos, res.fi, 0);
        } else {
                err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
                                   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
        }
        if (err < 0)
                goto errout_rcu;

        rcu_read_unlock();

        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);

errout_free:
        return err;
errout_rcu:
        rcu_read_unlock();
        kfree_skb(skb);
        goto errout_free;
}

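/* A multicast configuration change on @in_dev invalidates cached
 * routes in that device's namespace.
 */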
void ip_rt_multicast_event(struct in_device *in_dev)
{
        rt_cache_flush(dev_net(in_dev->dev));
}

#ifdef CONFIG_SYSCTL
static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
static int ip_rt_gc_elasticity __read_mostly    = 8;
static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;

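/* Handler for the write-only "flush" sysctl below.  Any write flushes
 * the routing cache and invalidates cached PMTU/redirect exceptions by
 * bumping the per-netns rt_genid and fnhe_genid; reads fail with
 * -EINVAL.
 */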
static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
                                        void __user *buffer,
                                        size_t *lenp, loff_t *ppos)
{
        struct net *net = (struct net *)__ctl->extra1;

        if (write) {
                rt_cache_flush(net);
                fnhe_genid_bump(net);
                return 0;
        }

        return -EINVAL;
}

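/* Tunables exported under /proc/sys/net/ipv4/route/.  Entries kept in
 * jiffies use the *_jiffies proc handlers, so userspace reads and
 * writes them in seconds (milliseconds for gc_min_interval_ms).
 */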
static struct ctl_table ipv4_route_table[] = {
        {
                .procname       = "gc_thresh",
                .data           = &ipv4_dst_ops.gc_thresh,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "max_size",
                .data           = &ip_rt_max_size,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                /*  Deprecated. Use gc_min_interval_ms */

                .procname       = "gc_min_interval",
                .data           = &ip_rt_gc_min_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "gc_min_interval_ms",
                .data           = &ip_rt_gc_min_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_ms_jiffies,
        },
        {
                .procname       = "gc_timeout",
                .data           = &ip_rt_gc_timeout,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "gc_interval",
                .data           = &ip_rt_gc_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "redirect_load",
                .data           = &ip_rt_redirect_load,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "redirect_number",
                .data           = &ip_rt_redirect_number,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "redirect_silence",
                .data           = &ip_rt_redirect_silence,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "error_cost",
                .data           = &ip_rt_error_cost,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "error_burst",
                .data           = &ip_rt_error_burst,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "gc_elasticity",
                .data           = &ip_rt_gc_elasticity,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "mtu_expires",
                .data           = &ip_rt_mtu_expires,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "min_pmtu",
                .data           = &ip_rt_min_pmtu,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &ip_min_valid_pmtu,
        },
        {
                .procname       = "min_adv_mss",
                .data           = &ip_rt_min_advmss,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        { }
};

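/* The flush trigger itself is write-only, e.g.:
 *
 *      echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * It is registered per netns in sysctl_route_net_init() below, which
 * points ->extra1 at the owning struct net.
 */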
static struct ctl_table ipv4_route_flush_table[] = {
        {
                .procname       = "flush",
                .maxlen         = sizeof(int),
                .mode           = 0200,
                .proc_handler   = ipv4_sysctl_rtcache_flush,
        },
        { },
};

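/* Non-init namespaces get their own copy of the flush table so that
 * ->extra1 can point at the right netns; for unprivileged user
 * namespaces the entry is hidden by clearing its procname.
 */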
static __net_init int sysctl_route_net_init(struct net *net)
{
        struct ctl_table *tbl;

        tbl = ipv4_route_flush_table;
        if (!net_eq(net, &init_net)) {
                tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
                if (!tbl)
                        goto err_dup;

                /* Don't export sysctls to unprivileged users */
                if (net->user_ns != &init_user_ns)
                        tbl[0].procname = NULL;
        }
        tbl[0].extra1 = net;

        net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
        if (!net->ipv4.route_hdr)
                goto err_reg;
        return 0;

err_reg:
        if (tbl != ipv4_route_flush_table)
                kfree(tbl);
err_dup:
        return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
        struct ctl_table *tbl;

        tbl = net->ipv4.route_hdr->ctl_table_arg;
        unregister_net_sysctl_table(net->ipv4.route_hdr);
        BUG_ON(tbl == ipv4_route_flush_table);
        kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
        .init = sysctl_route_net_init,
        .exit = sysctl_route_net_exit,
};
#endif

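/* Per-netns generation counters.  Bumping rt_genid or fnhe_genid
 * invalidates every cached route or exception at once, rather than
 * walking and freeing them individually.
 */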
static __net_init int rt_genid_init(struct net *net)
{
        atomic_set(&net->ipv4.rt_genid, 0);
        atomic_set(&net->fnhe_genid, 0);
        atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
        return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
        .init = rt_genid_init,
};

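/* Per-netns inetpeer base, holding long-lived per-destination state
 * such as ICMP rate-limiting data.
 */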
static int __net_init ipv4_inetpeer_init(struct net *net)
{
        struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

        if (!bp)
                return -ENOMEM;
        inet_peer_base_init(bp);
        net->ipv4.peers = bp;
        return 0;
}

static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
        struct inet_peer_base *bp = net->ipv4.peers;

        net->ipv4.peers = NULL;
        inetpeer_invalidate_tree(bp);
        kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
        .init   =       ipv4_inetpeer_init,
        .exit   =       ipv4_inetpeer_exit,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

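/* Boot-time initialization: allocate the IP ID generation hash, set up
 * the per-cpu uncached route lists and dst slab caches, and register
 * the RTM_GETROUTE handler plus the pernet subsystems defined above.
 */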
int __init ip_rt_init(void)
{
        void *idents_hash;
        int cpu;

        /* For modern hosts, this will use 2 MB of memory */
        idents_hash = alloc_large_system_hash("IP idents",
                                              sizeof(*ip_idents) + sizeof(*ip_tstamps),
                                              0,
                                              16, /* one bucket per 64 KB */
                                              HASH_ZERO,
                                              NULL,
                                              &ip_idents_mask,
                                              2048,
                                              256*1024);

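        /* ip_idents and ip_tstamps share the allocation above: the ident
         * counters come first, followed by one timestamp slot per bucket.
         */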
        ip_idents = idents_hash;

        prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));

        ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);

        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

                INIT_LIST_HEAD(&ul->head);
                spin_lock_init(&ul->lock);
        }
#ifdef CONFIG_IP_ROUTE_CLASSID
        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
        if (!ip_rt_acct)
                panic("IP: failed to allocate ip_rt_acct\n");
#endif

        ipv4_dst_ops.kmem_cachep =
                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

        if (dst_entries_init(&ipv4_dst_ops) < 0)
                panic("IP: failed to allocate ipv4_dst_ops counter\n");

        if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
                panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

        ipv4_dst_ops.gc_thresh = ~0;
        ip_rt_max_size = INT_MAX;

        devinet_init();
        ip_fib_init();

        if (ip_rt_proc_init())
                pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
        xfrm_init();
        xfrm4_init();
#endif
        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
                      RTNL_FLAG_DOIT_UNLOCKED);

#ifdef CONFIG_SYSCTL
        register_pernet_subsys(&sysctl_route_ops);
#endif
        register_pernet_subsys(&rt_genid_ops);
        register_pernet_subsys(&ipv4_inetpeer_ops);
        return 0;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
        register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif