GNU Linux-libre 5.4.257-gnu1
net/ipv4/route.c
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD;
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after a year in a coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/memblock.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/nexthop.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#include "fib_lookup.h"

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
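
/* Illustrative note (not in the original source): these defaults are
 * exposed as writable sysctls by the ctl_table registered later in this
 * file, so they can be inspected and tuned at runtime, e.g.:
 *
 *	cat /proc/sys/net/ipv4/route/min_pmtu		-> 552 (512 + 20 + 20)
 *	cat /proc/sys/net/ipv4/route/mtu_expires	-> 600 (seconds)
 */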

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu,
                                           bool confirm_neigh);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void             ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
        .confirm_neigh =        ipv4_confirm_neigh,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
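
/* Illustrative note (not in the original source): ip_tos2prio is indexed
 * with the TOS field shifted right by one, which is what the helper in
 * include/net/route.h does:
 *
 *	static inline char rt_tos2priority(u8 tos)
 *	{
 *		return ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *	}
 *
 * IPTOS_TOS() masks with 0x1E, so the shifted value is always 0..15 and
 * cannot overrun the 16-entry table.
 */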

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        (*pos)++;
        return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", 0444, net->proc_net,
                          &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", 0444,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create_single("rt_acct", 0, net->proc_net,
                        rt_acct_proc_show);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */
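
/* Illustrative note (not in the original source): with CONFIG_PROC_FS the
 * code above creates, per network namespace:
 *
 *	/proc/net/rt_cache	- header line only; the IPv4 route cache itself
 *				  was removed long ago, the file remains for ABI
 *	/proc/net/stat/rt_cache	- one hex-formatted stats line per possible CPU
 *	/proc/net/rt_acct	- raw ip_rt_acct counters (CONFIG_IP_ROUTE_CLASSID)
 */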

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        const struct rtable *rt = container_of(dst, struct rtable, dst);
        struct net_device *dev = dst->dev;
        struct neighbour *n;

        rcu_read_lock_bh();

        if (likely(rt->rt_gw_family == AF_INET)) {
                n = ip_neigh_gw4(dev, rt->rt_gw4);
        } else if (rt->rt_gw_family == AF_INET6) {
                n = ip_neigh_gw6(dev, &rt->rt_gw6);
        } else {
                __be32 pkey;

                pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
                n = ip_neigh_gw4(dev, pkey);
        }

        if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
                n = NULL;

        rcu_read_unlock_bh();

        return n;
}

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
        const struct rtable *rt = container_of(dst, struct rtable, dst);
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;

        if (rt->rt_gw_family == AF_INET) {
                pkey = (const __be32 *)&rt->rt_gw4;
        } else if (rt->rt_gw_family == AF_INET6) {
                return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
        } else if (!daddr ||
                 (rt->rt_flags &
                  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
                return;
        }
        __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

/* Hash tables of size 2048..262144 depending on RAM size.
 * Each bucket uses 8 bytes.
 */
static u32 ip_idents_mask __read_mostly;
static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
        u32 bucket, old, now = (u32)jiffies;
        atomic_t *p_id;
        u32 *p_tstamp;
        u32 delta = 0;

        bucket = hash & ip_idents_mask;
        p_tstamp = ip_tstamps + bucket;
        p_id = ip_idents + bucket;
        old = READ_ONCE(*p_tstamp);

        if (old != now && cmpxchg(p_tstamp, old, now) == old)
                delta = prandom_u32_max(now - old);

        /* If UBSAN reports an error here, please make sure your compiler
         * supports -fno-strict-overflow before reporting it; that was a
         * bug in UBSAN, and it has been fixed in GCC-8.
         */
        return atomic_add_return(segs + delta, p_id) - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
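
/* Worked example (not in the original source): suppose a bucket was last
 * used 100 jiffies ago and a GSO packet needs 10 IDs.  ip_idents_reserve()
 * then adds 10 + prandom_u32_max(100) to the bucket's counter, so an
 * observer comparing two consecutive IDs from this host cannot tell how
 * many packets were really sent in between.
 */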

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
        u32 hash, id;

        /* Note: the lazy keying below is racy, but that is acceptable;
         * at worst two CPUs race and the key is written twice with
         * random bytes.
         */
        if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
                get_random_bytes(&net->ipv4.ip_id_key,
                                 sizeof(net->ipv4.ip_id_key));

        hash = siphash_3u32((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol,
                            &net->ipv4.ip_id_key);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
                             const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0,
                           sock_net_uid(net, sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct net *net = dev_net(skb->dev);
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
}

static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
        struct fib_nh_exception *fnhe, *oldest = NULL;

        for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
                fnhe = rcu_dereference_protected(*fnhe_p,
                                                 lockdep_is_held(&fnhe_lock));
                if (!fnhe)
                        break;
                if (!oldest ||
                    time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
                        oldest = fnhe;
                        oldest_p = fnhe_p;
                }
        }
        fnhe_flush_routes(oldest);
        *oldest_p = oldest->fnhe_next;
        kfree_rcu(oldest, rcu);
}

static u32 fnhe_hashfun(__be32 daddr)
{
        static siphash_key_t fnhe_hash_key __read_mostly;
        u64 hval;

        net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
        hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
        return hash_64(hval, FNHE_HASH_SHIFT);
}
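
/* Illustrative note (not in the original source): siphash_1u32() gives a
 * keyed 64-bit hash so off-path attackers cannot predict bucket placement,
 * and hash_64() folds it down to FNHE_HASH_SHIFT bits to index the
 * per-nexthop exception table of FNHE_HASH_SIZE buckets.
 */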

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_uses_gateway = 1;
                rt->rt_gw_family = AF_INET;
                rt->rt_gw4 = fnhe->fnhe_gw;
        }
}

static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
                                  __be32 gw, u32 pmtu, bool lock,
                                  unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        u32 genid, hval;
        unsigned int i;
        int depth;

        genid = fnhe_genid(dev_net(nhc->nhc_dev));
        hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference(nhc->nhc_exceptions);
        if (!hash) {
                hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                rcu_assign_pointer(nhc->nhc_exceptions, hash);
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (fnhe->fnhe_genid != genid)
                        fnhe->fnhe_genid = genid;
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_mtu_locked = lock;
                }
                fnhe->fnhe_expires = max(1UL, expires);
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                /* Randomize the max depth to avoid some side-channel attacks. */
                int max_depth = FNHE_RECLAIM_DEPTH +
                                prandom_u32_max(FNHE_RECLAIM_DEPTH);

                while (depth > max_depth) {
                        fnhe_remove_oldest(hash);
                        depth--;
                }

                fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                if (!fnhe)
                        goto out_unlock;

                fnhe->fnhe_next = hash->chain;

                fnhe->fnhe_genid = genid;
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_mtu_locked = lock;
                fnhe->fnhe_expires = max(1UL, expires);

                rcu_assign_pointer(hash->chain, fnhe);

                /* Exception created; mark the cached routes for the nexthop
                 * stale, so anyone caching it rechecks if this exception
                 * applies to them.
                 */
                rt = rcu_dereference(nhc->nhc_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;
                        prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}
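
/* Worked example (not in the original source): with FNHE_RECLAIM_DEPTH == 5
 * the randomized limit above is max_depth = 5 + prandom_u32_max(5), i.e.
 * somewhere in [5, 9].  A chain that has grown to depth 12 would therefore
 * evict its three to seven oldest exceptions before the new one is linked
 * in, and the random bound keeps an attacker from learning the exact
 * eviction threshold.
 */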

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
        if (!n)
                n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
        if (!IS_ERR(n)) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res, 0) == 0) {
                                struct fib_nh_common *nhc;

                                fib_select_path(net, &res, fl4, skb);
                                nhc = FIB_RES_NHC(res);
                                update_or_create_fnhe(nhc, fl4->daddr, new_gw,
                                                0, false,
                                                jiffies + ip_rt_gc_timeout);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct net *net = dev_net(skb->dev);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;
        int vif;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
                peer->rate_tokens = 0;
                peer->n_redirects = 0;
        }

        /* Too many ignored redirects; do not send anything and
         * set peer->rate_last to the last seen redirected packet.
         */
        if (peer->n_redirects >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->n_redirects == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->n_redirects)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->n_redirects;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->n_redirects == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}
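
/* Worked example (not in the original source): with the defaults above and
 * HZ == 1000, ip_rt_redirect_load is 20ms, so redirects to one peer are
 * spaced at 40ms, 80ms, 160ms, ... (20ms << n_redirects) until the ninth
 * has been sent, after which the peer is assumed deaf.  ip_rt_redirect_silence
 * is 20ms << 10 ~= 20.5s of quiet before the counters reset and the cycle
 * may start again.
 */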

static int ip_error(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        if (netif_is_l3_master(skb->dev)) {
                dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
                if (!dev)
                        goto out;
        }

        in_dev = __in_dev_get_rcu(dev);

        /* IP on this device is disabled. */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
                               l3mdev_master_ifindex(skb->dev), 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct net *net = dev_net(dst->dev);
        u32 old_mtu = ipv4_mtu(dst);
        struct fib_result res;
        bool lock = false;

        if (ip_mtu_locked(dst))
                return;

        if (old_mtu < mtu)
                return;

        if (mtu < ip_rt_min_pmtu) {
                lock = true;
                mtu = min(old_mtu, ip_rt_min_pmtu);
        }

        if (rt->rt_pmtu == mtu && !lock &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(net, fl4, &res, 0) == 0) {
                struct fib_nh_common *nhc;

                fib_select_path(net, &res, fl4, NULL);
                nhc = FIB_RES_NHC(res);
                update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu,
                              bool confirm_neigh)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u8 protocol)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        u32 mark = IP4_REPLY_MARK(net, skb->mark);

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
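
/* Usage sketch (hypothetical, not in the original source): a protocol or
 * tunnel ICMP error handler that has just received a "fragmentation
 * needed" message could feed the advertised MTU back into the routing
 * layer roughly like this:
 *
 *	void example_icmp_err(struct sk_buff *skb, u32 info)
 *	{
 *		if (icmp_hdr(skb)->type == ICMP_DEST_UNREACH &&
 *		    icmp_hdr(skb)->code == ICMP_FRAG_NEEDED)
 *			ipv4_update_pmtu(skb, dev_net(skb->dev), info,
 *					 0, IPPROTO_IPIP);
 *	}
 */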

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!fl4.flowi4_mark)
                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;
        struct net *net = sock_net(sk);

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        if (odst->obsolete && !odst->ops->check(odst, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);

        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u8 protocol)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct net *net = sock_net(sk);

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPV4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * DST_OBSOLETE_DEAD.
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
        struct ip_options opt;
        int res;

        /* Recompile ip options since IPCB may not be valid anymore.
         * Also check we have a reasonable ipv4 header.
         */
        if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
            ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
                return;

        memset(&opt, 0, sizeof(opt));
        if (ip_hdr(skb)->ihl > 5) {
                if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
                        return;
                opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);

                rcu_read_lock();
                res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
                rcu_read_unlock();

                if (res)
                        return;
        }
        __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        ipv4_send_dest_unreach(skb);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by the IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct iphdr *iph = ip_hdr(skb);
                struct flowi4 fl4 = {
                        .daddr = iph->daddr,
                        .saddr = iph->saddr,
                        .flowi4_tos = RT_TOS(iph->tos),
                        .flowi4_oif = rt->dst.dev->ifindex,
                        .flowi4_iif = skb->dev->ifindex,
                        .flowi4_mark = skb->mark,
                };

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
                        src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
        unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
                                    ip_rt_min_advmss);

        return min(advmss, IPV4_MAX_PMTU - header_size);
}
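
/* Worked example (not in the original source): for a 1500-byte MTU path,
 * ipv4_default_advmss() yields 1500 - 40 = 1460 bytes, the classic
 * Ethernet TCP MSS; the max_t/min pair clamps the result between
 * ip_rt_min_advmss (256) and IPV4_MAX_PMTU - 40 (65495).
 */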

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                goto out;

        mtu = READ_ONCE(dst->dev->mtu);

        if (unlikely(ip_mtu_locked(dst))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

out:
        mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

        return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}

static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe, __rcu **fnhe_p;
        u32 hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference_protected(nhc->nhc_exceptions,
                                         lockdep_is_held(&fnhe_lock));
        hash += hval;

        fnhe_p = &hash->chain;
        fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
        while (fnhe) {
                if (fnhe->fnhe_daddr == daddr) {
                        rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
                                fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
                        /* set fnhe_daddr to 0 to ensure it won't bind with
                         * new dsts in rt_bind_exception().
                         */
                        fnhe->fnhe_daddr = 0;
                        fnhe_flush_routes(fnhe);
                        kfree_rcu(fnhe, rcu);
                        break;
                }
                fnhe_p = &fnhe->fnhe_next;
                fnhe = rcu_dereference_protected(fnhe->fnhe_next,
                                                 lockdep_is_held(&fnhe_lock));
        }

        spin_unlock_bh(&fnhe_lock);
}

static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
                                               __be32 daddr)
{
        struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr) {
                        if (fnhe->fnhe_expires &&
                            time_after(jiffies, fnhe->fnhe_expires)) {
                                ip_del_fnhe(nhc, daddr);
                                break;
                        }
                        return fnhe;
                }
        }
        return NULL;
}

/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 */

u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
{
        struct fib_nh_common *nhc = res->nhc;
        struct net_device *dev = nhc->nhc_dev;
        struct fib_info *fi = res->fi;
        u32 mtu = 0;

        if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) ||
            fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
                mtu = fi->fib_mtu;

        if (likely(!mtu)) {
                struct fib_nh_exception *fnhe;

                fnhe = find_exception(nhc, daddr);
                if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
                        mtu = fnhe->fnhe_pmtu;
        }

        if (likely(!mtu))
                mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);

        return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
}
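
/* Worked example (not in the original source): if a forwarded flow has a
 * locked route metric of 1400, a nexthop exception recording a 1300-byte
 * PMTU and a 1500-byte egress device, the precedence above selects 1400;
 * without the locked metric the live exception would win with 1300, and
 * with neither the device MTU of 1500 is used (capped at IP_MAX_MTU).
 */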

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr, const bool do_cache)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe->fnhe_mtu_locked = false;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                if (!rt->rt_gw4) {
                        rt->rt_gw4 = daddr;
                        rt->rt_gw_family = AF_INET;
                }

                if (do_cache) {
                        dst_hold(&rt->dst);
                        rcu_assign_pointer(*porig, rt);
                        if (orig) {
                                dst_dev_put(&orig->dst);
                                dst_release(&orig->dst);
                        }
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nhc->nhc_rth_input;
        } else {
                p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
        }
        orig = *p;

        /* hold dst before doing cmpxchg() to avoid a race condition
         * on this dst
         */
        dst_hold(&rt->dst);
        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                if (orig) {
                        rt_add_uncached_list(orig);
                        dst_release(&orig->dst);
                }
        } else {
                dst_release(&rt->dst);
                ret = false;
        }

        return ret;
}
1523
1524 struct uncached_list {
1525         spinlock_t              lock;
1526         struct list_head        head;
1527 };
1528
1529 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1530
1531 void rt_add_uncached_list(struct rtable *rt)
1532 {
1533         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1534
1535         rt->rt_uncached_list = ul;
1536
1537         spin_lock_bh(&ul->lock);
1538         list_add_tail(&rt->rt_uncached, &ul->head);
1539         spin_unlock_bh(&ul->lock);
1540 }
1541
1542 void rt_del_uncached_list(struct rtable *rt)
1543 {
1544         if (!list_empty(&rt->rt_uncached)) {
1545                 struct uncached_list *ul = rt->rt_uncached_list;
1546
1547                 spin_lock_bh(&ul->lock);
1548                 list_del(&rt->rt_uncached);
1549                 spin_unlock_bh(&ul->lock);
1550         }
1551 }
1552
1553 static void ipv4_dst_destroy(struct dst_entry *dst)
1554 {
1555         struct rtable *rt = (struct rtable *)dst;
1556
1557         ip_dst_metrics_put(dst);
1558         rt_del_uncached_list(rt);
1559 }
1560
1561 void rt_flush_dev(struct net_device *dev)
1562 {
1563         struct rtable *rt;
1564         int cpu;
1565
1566         for_each_possible_cpu(cpu) {
1567                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1568
1569                 spin_lock_bh(&ul->lock);
1570                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1571                         if (rt->dst.dev != dev)
1572                                 continue;
1573                         rt->dst.dev = blackhole_netdev;
1574                         dev_hold(rt->dst.dev);
1575                         dev_put(dev);
1576                 }
1577                 spin_unlock_bh(&ul->lock);
1578         }
1579 }
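/* Routes on the per-cpu uncached lists still pin their output device.
 * When that device is going away, rt_flush_dev() retargets them at
 * blackhole_netdev (a dummy device that is never unregistered) rather
 * than waiting for every stray dst reference to be released, so device
 * unregistration cannot stall on cached routes.
 */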
1580
1581 static bool rt_cache_valid(const struct rtable *rt)
1582 {
1583         return  rt &&
1584                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1585                 !rt_is_expired(rt);
1586 }
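/* A cached route is reusable only while its rt_genid matches the current
 * namespace generation (rt_is_expired()); rt_cache_flush() simply bumps
 * that generation, invalidating every cached entry at once without
 * walking them.
 */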
1587
1588 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1589                            const struct fib_result *res,
1590                            struct fib_nh_exception *fnhe,
1591                            struct fib_info *fi, u16 type, u32 itag,
1592                            const bool do_cache)
1593 {
1594         bool cached = false;
1595
1596         if (fi) {
1597                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1598
1599                 if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1600                         rt->rt_uses_gateway = 1;
1601                         rt->rt_gw_family = nhc->nhc_gw_family;
1602                         /* only INET and INET6 are supported */
1603                         if (likely(nhc->nhc_gw_family == AF_INET))
1604                                 rt->rt_gw4 = nhc->nhc_gw.ipv4;
1605                         else
1606                                 rt->rt_gw6 = nhc->nhc_gw.ipv6;
1607                 }
1608
1609                 ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1610
1611 #ifdef CONFIG_IP_ROUTE_CLASSID
1612                 if (nhc->nhc_family == AF_INET) {
1613                         struct fib_nh *nh;
1614
1615                         nh = container_of(nhc, struct fib_nh, nh_common);
1616                         rt->dst.tclassid = nh->nh_tclassid;
1617                 }
1618 #endif
1619                 rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1620                 if (unlikely(fnhe))
1621                         cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1622                 else if (do_cache)
1623                         cached = rt_cache_route(nhc, rt);
1624                 if (unlikely(!cached)) {
1625                         /* Routes we intend to cache in the nexthop exception or
1626                          * FIB nexthop are normally tracked by that cache.
1627                          * However, if we are unsuccessful at storing this
1628                          * route there, we must track it on the uncached list.
1629                          */
1630                         if (!rt->rt_gw4) {
1631                                 rt->rt_gw_family = AF_INET;
1632                                 rt->rt_gw4 = daddr;
1633                         }
1634                         rt_add_uncached_list(rt);
1635                 }
1636         } else
1637                 rt_add_uncached_list(rt);
1638
1639 #ifdef CONFIG_IP_ROUTE_CLASSID
1640 #ifdef CONFIG_IP_MULTIPLE_TABLES
1641         set_class_tag(rt, res->tclassid);
1642 #endif
1643         set_class_tag(rt, itag);
1644 #endif
1645 }
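/* rt_set_nexthop() is the one place where a freshly built rtable inherits
 * nexthop state: gateway, metrics, classid and lwtunnel state are copied
 * from the FIB nexthop, and the route is then parked in the exception
 * entry, in the nexthop cache, or on the uncached list as a last resort.
 */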
1646
1647 struct rtable *rt_dst_alloc(struct net_device *dev,
1648                             unsigned int flags, u16 type,
1649                             bool nopolicy, bool noxfrm, bool will_cache)
1650 {
1651         struct rtable *rt;
1652
1653         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1654                        (will_cache ? 0 : DST_HOST) |
1655                        (nopolicy ? DST_NOPOLICY : 0) |
1656                        (noxfrm ? DST_NOXFRM : 0));
1657
1658         if (rt) {
1659                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1660                 rt->rt_flags = flags;
1661                 rt->rt_type = type;
1662                 rt->rt_is_input = 0;
1663                 rt->rt_iif = 0;
1664                 rt->rt_pmtu = 0;
1665                 rt->rt_mtu_locked = 0;
1666                 rt->rt_uses_gateway = 0;
1667                 rt->rt_gw_family = 0;
1668                 rt->rt_gw4 = 0;
1669                 INIT_LIST_HEAD(&rt->rt_uncached);
1670
1671                 rt->dst.output = ip_output;
1672                 if (flags & RTCF_LOCAL)
1673                         rt->dst.input = ip_local_deliver;
1674         }
1675
1676         return rt;
1677 }
1678 EXPORT_SYMBOL(rt_dst_alloc);
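/* Minimal usage sketch (mirroring the callers in this file): allocate,
 * then fill in the input/output handlers before publishing the dst:
 *
 *	rth = rt_dst_alloc(dev, flags, type, nopolicy, noxfrm, do_cache);
 *	if (!rth)
 *		return -ENOBUFS;
 *	...
 *	skb_dst_set(skb, &rth->dst);
 */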
1679
1680 struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1681 {
1682         struct rtable *new_rt;
1683
1684         new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1685                            rt->dst.flags);
1686
1687         if (new_rt) {
1688                 new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1689                 new_rt->rt_flags = rt->rt_flags;
1690                 new_rt->rt_type = rt->rt_type;
1691                 new_rt->rt_is_input = rt->rt_is_input;
1692                 new_rt->rt_iif = rt->rt_iif;
1693                 new_rt->rt_pmtu = rt->rt_pmtu;
1694                 new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1695                 new_rt->rt_gw_family = rt->rt_gw_family;
1696                 if (rt->rt_gw_family == AF_INET)
1697                         new_rt->rt_gw4 = rt->rt_gw4;
1698                 else if (rt->rt_gw_family == AF_INET6)
1699                         new_rt->rt_gw6 = rt->rt_gw6;
1700                 INIT_LIST_HEAD(&new_rt->rt_uncached);
1701
1702                 new_rt->dst.flags |= DST_HOST;
1703                 new_rt->dst.input = rt->dst.input;
1704                 new_rt->dst.output = rt->dst.output;
1705                 new_rt->dst.error = rt->dst.error;
1706                 new_rt->dst.lastuse = jiffies;
1707                 new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1708         }
1709         return new_rt;
1710 }
1711 EXPORT_SYMBOL(rt_dst_clone);
1712
1713 /* called in rcu_read_lock() section */
1714 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1715                           u8 tos, struct net_device *dev,
1716                           struct in_device *in_dev, u32 *itag)
1717 {
1718         int err;
1719
1720         /* Primary sanity checks. */
1721         if (!in_dev)
1722                 return -EINVAL;
1723
1724         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1725             skb->protocol != htons(ETH_P_IP))
1726                 return -EINVAL;
1727
1728         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1729                 return -EINVAL;
1730
1731         if (ipv4_is_zeronet(saddr)) {
1732                 if (!ipv4_is_local_multicast(daddr) &&
1733                     ip_hdr(skb)->protocol != IPPROTO_IGMP)
1734                         return -EINVAL;
1735         } else {
1736                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1737                                           in_dev, itag);
1738                 if (err < 0)
1739                         return err;
1740         }
1741         return 0;
1742 }
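/* Summary of the checks above: a multicast or limited-broadcast source is
 * never valid; a loopback source requires route_localnet; a zero source
 * is tolerated only for IGMP or link-local groups (hosts with no address
 * yet); everything else goes through the reverse-path check in
 * fib_validate_source().
 */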
1743
1744 /* called in rcu_read_lock() section */
1745 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1746                              u8 tos, struct net_device *dev, int our)
1747 {
1748         struct in_device *in_dev = __in_dev_get_rcu(dev);
1749         unsigned int flags = RTCF_MULTICAST;
1750         struct rtable *rth;
1751         u32 itag = 0;
1752         int err;
1753
1754         err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1755         if (err)
1756                 return err;
1757
1758         if (our)
1759                 flags |= RTCF_LOCAL;
1760
1761         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1762                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1763         if (!rth)
1764                 return -ENOBUFS;
1765
1766 #ifdef CONFIG_IP_ROUTE_CLASSID
1767         rth->dst.tclassid = itag;
1768 #endif
1769         rth->dst.output = ip_rt_bug;
1770         rth->rt_is_input = 1;
1771
1772 #ifdef CONFIG_IP_MROUTE
1773         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1774                 rth->dst.input = ip_mr_input;
1775 #endif
1776         RT_CACHE_STAT_INC(in_slow_mc);
1777
1778         skb_dst_drop(skb);
1779         skb_dst_set(skb, &rth->dst);
1780         return 0;
1781 }
1782
1783
1784 static void ip_handle_martian_source(struct net_device *dev,
1785                                      struct in_device *in_dev,
1786                                      struct sk_buff *skb,
1787                                      __be32 daddr,
1788                                      __be32 saddr)
1789 {
1790         RT_CACHE_STAT_INC(in_martian_src);
1791 #ifdef CONFIG_IP_ROUTE_VERBOSE
1792         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1793                 /*
1794                  *      RFC 1812 recommendation: if the source is martian,
1795                  *      the only hint is the MAC header.
1796                  */
1797                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1798                         &daddr, &saddr, dev->name);
1799                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1800                         print_hex_dump(KERN_WARNING, "ll header: ",
1801                                        DUMP_PREFIX_OFFSET, 16, 1,
1802                                        skb_mac_header(skb),
1803                                        dev->hard_header_len, false);
1804                 }
1805         }
1806 #endif
1807 }
1808
1809 /* called in rcu_read_lock() section */
1810 static int __mkroute_input(struct sk_buff *skb,
1811                            const struct fib_result *res,
1812                            struct in_device *in_dev,
1813                            __be32 daddr, __be32 saddr, u32 tos)
1814 {
1815         struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1816         struct net_device *dev = nhc->nhc_dev;
1817         struct fib_nh_exception *fnhe;
1818         struct rtable *rth;
1819         int err;
1820         struct in_device *out_dev;
1821         bool do_cache;
1822         u32 itag = 0;
1823
1824         /* get a working reference to the output device */
1825         out_dev = __in_dev_get_rcu(dev);
1826         if (!out_dev) {
1827                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1828                 return -EINVAL;
1829         }
1830
1831         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1832                                   in_dev->dev, in_dev, &itag);
1833         if (err < 0) {
1834                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1835                                          saddr);
1836
1837                 goto cleanup;
1838         }
1839
1840         do_cache = res->fi && !itag;
1841         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1842             skb->protocol == htons(ETH_P_IP)) {
1843                 __be32 gw;
1844
1845                 gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1846                 if (IN_DEV_SHARED_MEDIA(out_dev) ||
1847                     inet_addr_onlink(out_dev, saddr, gw))
1848                         IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1849         }
1850
1851         if (skb->protocol != htons(ETH_P_IP)) {
1852                 /* Not IP (i.e. ARP). Do not create a route if it is
1853                  * invalid for proxy arp. DNAT routes are always valid.
1854                  *
1855                  * The proxy arp feature has been extended to allow ARP
1856                  * replies back out the same interface, to support
1857                  * Private VLAN switch technologies. See arp.c.
1858                  */
1859                 if (out_dev == in_dev &&
1860                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1861                         err = -EINVAL;
1862                         goto cleanup;
1863                 }
1864         }
1865
1866         fnhe = find_exception(nhc, daddr);
1867         if (do_cache) {
1868                 if (fnhe)
1869                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1870                 else
1871                         rth = rcu_dereference(nhc->nhc_rth_input);
1872                 if (rt_cache_valid(rth)) {
1873                         skb_dst_set_noref(skb, &rth->dst);
1874                         goto out;
1875                 }
1876         }
1877
1878         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1879                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1880                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1881         if (!rth) {
1882                 err = -ENOBUFS;
1883                 goto cleanup;
1884         }
1885
1886         rth->rt_is_input = 1;
1887         RT_CACHE_STAT_INC(in_slow_tot);
1888
1889         rth->dst.input = ip_forward;
1890
1891         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1892                        do_cache);
1893         lwtunnel_set_redirect(&rth->dst);
1894         skb_dst_set(skb, &rth->dst);
1895 out:
1896         err = 0;
1897  cleanup:
1898         return err;
1899 }
1900
1901 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1902 /* To make ICMP packets follow the right flow, the multipath hash is
1903  * calculated from the inner IP addresses.
1904  */
1905 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1906                                  struct flow_keys *hash_keys)
1907 {
1908         const struct iphdr *outer_iph = ip_hdr(skb);
1909         const struct iphdr *key_iph = outer_iph;
1910         const struct iphdr *inner_iph;
1911         const struct icmphdr *icmph;
1912         struct iphdr _inner_iph;
1913         struct icmphdr _icmph;
1914
1915         if (likely(outer_iph->protocol != IPPROTO_ICMP))
1916                 goto out;
1917
1918         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1919                 goto out;
1920
1921         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1922                                    &_icmph);
1923         if (!icmph)
1924                 goto out;
1925
1926         if (icmph->type != ICMP_DEST_UNREACH &&
1927             icmph->type != ICMP_REDIRECT &&
1928             icmph->type != ICMP_TIME_EXCEEDED &&
1929             icmph->type != ICMP_PARAMETERPROB)
1930                 goto out;
1931
1932         inner_iph = skb_header_pointer(skb,
1933                                        outer_iph->ihl * 4 + sizeof(_icmph),
1934                                        sizeof(_inner_iph), &_inner_iph);
1935         if (!inner_iph)
1936                 goto out;
1937
1938         key_iph = inner_iph;
1939 out:
1940         hash_keys->addrs.v4addrs.src = key_iph->saddr;
1941         hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1942 }
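/* For ICMP errors the flow that matters is the one described by the
 * quoted (inner) header, so the hash keys are taken from there. Only
 * initial fragments (offset 0) carrying one of the error/redirect types
 * are unwrapped; everything else hashes on the outer addresses.
 */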
1943
1944 /* if skb is set it will be used and fl4 can be NULL */
1945 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1946                        const struct sk_buff *skb, struct flow_keys *flkeys)
1947 {
1948         u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1949         struct flow_keys hash_keys;
1950         u32 mhash;
1951
1952         switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1953         case 0:
1954                 memset(&hash_keys, 0, sizeof(hash_keys));
1955                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1956                 if (skb) {
1957                         ip_multipath_l3_keys(skb, &hash_keys);
1958                 } else {
1959                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1960                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1961                 }
1962                 break;
1963         case 1:
1964                 /* skb is currently provided only when forwarding */
1965                 if (skb) {
1966                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1967                         struct flow_keys keys;
1968
1969                         /* short-circuit if we already have L4 hash present */
1970                         if (skb->l4_hash)
1971                                 return skb_get_hash_raw(skb) >> 1;
1972
1973                         memset(&hash_keys, 0, sizeof(hash_keys));
1974
1975                         if (!flkeys) {
1976                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
1977                                 flkeys = &keys;
1978                         }
1979
1980                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1981                         hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1982                         hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1983                         hash_keys.ports.src = flkeys->ports.src;
1984                         hash_keys.ports.dst = flkeys->ports.dst;
1985                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1986                 } else {
1987                         memset(&hash_keys, 0, sizeof(hash_keys));
1988                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1989                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1990                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1991                         hash_keys.ports.src = fl4->fl4_sport;
1992                         hash_keys.ports.dst = fl4->fl4_dport;
1993                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
1994                 }
1995                 break;
1996         case 2:
1997                 memset(&hash_keys, 0, sizeof(hash_keys));
1998                 /* skb is currently provided only when forwarding */
1999                 if (skb) {
2000                         struct flow_keys keys;
2001
2002                         skb_flow_dissect_flow_keys(skb, &keys, 0);
2003                         /* Inner can be v4 or v6 */
2004                         if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2005                                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2006                                 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
2007                                 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
2008                         } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2009                                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2010                                 hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
2011                                 hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
2012                                 hash_keys.tags.flow_label = keys.tags.flow_label;
2013                                 hash_keys.basic.ip_proto = keys.basic.ip_proto;
2014                         } else {
2015                                 /* Same as case 0 */
2016                                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2017                                 ip_multipath_l3_keys(skb, &hash_keys);
2018                         }
2019                 } else {
2020                         /* Same as case 0 */
2021                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2022                         hash_keys.addrs.v4addrs.src = fl4->saddr;
2023                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
2024                 }
2025                 break;
2026         }
2027         mhash = flow_hash_from_keys(&hash_keys);
2028
2029         if (multipath_hash)
2030                 mhash = jhash_2words(mhash, multipath_hash, 0);
2031
2032         return mhash >> 1;
2033 }
2034 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
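/* The switch cases above correspond to the values of the
 * net.ipv4.fib_multipath_hash_policy sysctl:
 *
 *	0 - L3 (source and destination addresses)
 *	1 - L4 (five-tuple)
 *	2 - L3, or inner L3 for encapsulated traffic
 *
 * e.g. "sysctl -w net.ipv4.fib_multipath_hash_policy=1" selects
 * five-tuple hashing. The final ">> 1" keeps the hash to 31 bits so it
 * compares safely against the signed per-nexthop upper bounds.
 */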
2035
2036 static int ip_mkroute_input(struct sk_buff *skb,
2037                             struct fib_result *res,
2038                             struct in_device *in_dev,
2039                             __be32 daddr, __be32 saddr, u32 tos,
2040                             struct flow_keys *hkeys)
2041 {
2042 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2043         if (res->fi && fib_info_num_path(res->fi) > 1) {
2044                 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2045
2046                 fib_select_multipath(res, h);
2047         }
2048 #endif
2049
2050         /* create a routing cache entry */
2051         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2052 }
2053
2054 /*
2055  *      NOTE: we drop all packets that have a local source
2056  *      address, because every properly looped-back packet
2057  *      must already have the correct destination attached by the output routine.
2058  *
2059  *      This approach solves two big problems:
2060  *      1. Non-simplex devices are handled properly.
2061  *      2. IP spoofing attempts are filtered with a 100% guarantee.
2062  *      Called with rcu_read_lock().
2063  */
2064
2065 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2066                                u8 tos, struct net_device *dev,
2067                                struct fib_result *res)
2068 {
2069         struct in_device *in_dev = __in_dev_get_rcu(dev);
2070         struct flow_keys *flkeys = NULL, _flkeys;
2071         struct net    *net = dev_net(dev);
2072         struct ip_tunnel_info *tun_info;
2073         int             err = -EINVAL;
2074         unsigned int    flags = 0;
2075         u32             itag = 0;
2076         struct rtable   *rth;
2077         struct flowi4   fl4;
2078         bool do_cache = true;
2079
2080         /* IP on this device is disabled. */
2081
2082         if (!in_dev)
2083                 goto out;
2084
2085         /* Check for the weirdest martians, which cannot be detected
2086            by fib_lookup.
2087          */
2088
2089         tun_info = skb_tunnel_info(skb);
2090         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2091                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2092         else
2093                 fl4.flowi4_tun_key.tun_id = 0;
2094         skb_dst_drop(skb);
2095
2096         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2097                 goto martian_source;
2098
2099         res->fi = NULL;
2100         res->table = NULL;
2101         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2102                 goto brd_input;
2103
2104         /* Accept zero addresses only for limited broadcast;
2105          * it is unclear whether this should be fixed. Waiting for complaints :-)
2106          */
2107         if (ipv4_is_zeronet(saddr))
2108                 goto martian_source;
2109
2110         if (ipv4_is_zeronet(daddr))
2111                 goto martian_destination;
2112
2113         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
2114          * twice, calling it only once when daddr and/or saddr is loopback.
2115          */
2116         if (ipv4_is_loopback(daddr)) {
2117                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2118                         goto martian_destination;
2119         } else if (ipv4_is_loopback(saddr)) {
2120                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2121                         goto martian_source;
2122         }
2123
2124         /*
2125          *      Now we are ready to route packet.
2126          */
2127         fl4.flowi4_oif = 0;
2128         fl4.flowi4_iif = dev->ifindex;
2129         fl4.flowi4_mark = skb->mark;
2130         fl4.flowi4_tos = tos;
2131         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2132         fl4.flowi4_flags = 0;
2133         fl4.daddr = daddr;
2134         fl4.saddr = saddr;
2135         fl4.flowi4_uid = sock_net_uid(net, NULL);
2136         fl4.flowi4_multipath_hash = 0;
2137
2138         if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2139                 flkeys = &_flkeys;
2140         } else {
2141                 fl4.flowi4_proto = 0;
2142                 fl4.fl4_sport = 0;
2143                 fl4.fl4_dport = 0;
2144         }
2145
2146         err = fib_lookup(net, &fl4, res, 0);
2147         if (err != 0) {
2148                 if (!IN_DEV_FORWARD(in_dev))
2149                         err = -EHOSTUNREACH;
2150                 goto no_route;
2151         }
2152
2153         if (res->type == RTN_BROADCAST) {
2154                 if (IN_DEV_BFORWARD(in_dev))
2155                         goto make_route;
2156                 /* do not cache if bc_forwarding is enabled */
2157                 if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2158                         do_cache = false;
2159                 goto brd_input;
2160         }
2161
2162         if (res->type == RTN_LOCAL) {
2163                 err = fib_validate_source(skb, saddr, daddr, tos,
2164                                           0, dev, in_dev, &itag);
2165                 if (err < 0)
2166                         goto martian_source;
2167                 goto local_input;
2168         }
2169
2170         if (!IN_DEV_FORWARD(in_dev)) {
2171                 err = -EHOSTUNREACH;
2172                 goto no_route;
2173         }
2174         if (res->type != RTN_UNICAST)
2175                 goto martian_destination;
2176
2177 make_route:
2178         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2179 out:    return err;
2180
2181 brd_input:
2182         if (skb->protocol != htons(ETH_P_IP))
2183                 goto e_inval;
2184
2185         if (!ipv4_is_zeronet(saddr)) {
2186                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2187                                           in_dev, &itag);
2188                 if (err < 0)
2189                         goto martian_source;
2190         }
2191         flags |= RTCF_BROADCAST;
2192         res->type = RTN_BROADCAST;
2193         RT_CACHE_STAT_INC(in_brd);
2194
2195 local_input:
2196         do_cache &= res->fi && !itag;
2197         if (do_cache) {
2198                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2199
2200                 rth = rcu_dereference(nhc->nhc_rth_input);
2201                 if (rt_cache_valid(rth)) {
2202                         skb_dst_set_noref(skb, &rth->dst);
2203                         err = 0;
2204                         goto out;
2205                 }
2206         }
2207
2208         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2209                            flags | RTCF_LOCAL, res->type,
2210                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2211         if (!rth)
2212                 goto e_nobufs;
2213
2214         rth->dst.output = ip_rt_bug;
2215 #ifdef CONFIG_IP_ROUTE_CLASSID
2216         rth->dst.tclassid = itag;
2217 #endif
2218         rth->rt_is_input = 1;
2219
2220         RT_CACHE_STAT_INC(in_slow_tot);
2221         if (res->type == RTN_UNREACHABLE) {
2222                 rth->dst.input = ip_error;
2223                 rth->dst.error = -err;
2224                 rth->rt_flags &= ~RTCF_LOCAL;
2225         }
2226
2227         if (do_cache) {
2228                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2229
2230                 rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2231                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2232                         WARN_ON(rth->dst.input == lwtunnel_input);
2233                         rth->dst.lwtstate->orig_input = rth->dst.input;
2234                         rth->dst.input = lwtunnel_input;
2235                 }
2236
2237                 if (unlikely(!rt_cache_route(nhc, rth)))
2238                         rt_add_uncached_list(rth);
2239         }
2240         skb_dst_set(skb, &rth->dst);
2241         err = 0;
2242         goto out;
2243
2244 no_route:
2245         RT_CACHE_STAT_INC(in_no_route);
2246         res->type = RTN_UNREACHABLE;
2247         res->fi = NULL;
2248         res->table = NULL;
2249         goto local_input;
2250
2251         /*
2252          *      Do not cache martian addresses: they should be logged (RFC1812)
2253          */
2254 martian_destination:
2255         RT_CACHE_STAT_INC(in_martian_dst);
2256 #ifdef CONFIG_IP_ROUTE_VERBOSE
2257         if (IN_DEV_LOG_MARTIANS(in_dev))
2258                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2259                                      &daddr, &saddr, dev->name);
2260 #endif
2261
2262 e_inval:
2263         err = -EINVAL;
2264         goto out;
2265
2266 e_nobufs:
2267         err = -ENOBUFS;
2268         goto out;
2269
2270 martian_source:
2271         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2272         goto out;
2273 }
2274
2275 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2276                          u8 tos, struct net_device *dev)
2277 {
2278         struct fib_result res;
2279         int err;
2280
2281         tos &= IPTOS_RT_MASK;
2282         rcu_read_lock();
2283         err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2284         rcu_read_unlock();
2285
2286         return err;
2287 }
2288 EXPORT_SYMBOL(ip_route_input_noref);
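/* Usage sketch (cf. ip_rcv_finish_core() in ip_input.c): the receive
 * path resolves each input packet's route roughly as
 *
 *	err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				   iph->tos, dev);
 *	if (unlikely(err))
 *		goto drop_error;
 */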
2289
2290 /* called with rcu_read_lock held */
2291 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2292                        u8 tos, struct net_device *dev, struct fib_result *res)
2293 {
2294         /* Multicast recognition logic was moved from the route cache to here.
2295            The problem was that too many Ethernet cards have broken/missing
2296            hardware multicast filters :-( As a result, a host on a multicast
2297            network acquires a lot of useless route cache entries, e.g. for
2298            SDR messages from all over the world. Now we try to get rid of them.
2299            Really, provided the software IP multicast filter is organized
2300            reasonably (at least, hashed), this does not result in a slowdown
2301            compared with route cache reject entries.
2302            Note that multicast routers are not affected, because a
2303            route cache entry is created eventually.
2304          */
2305         if (ipv4_is_multicast(daddr)) {
2306                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2307                 int our = 0;
2308                 int err = -EINVAL;
2309
2310                 if (!in_dev)
2311                         return err;
2312                 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2313                                       ip_hdr(skb)->protocol);
2314
2315                 /* check l3 master if no match yet */
2316                 if (!our && netif_is_l3_slave(dev)) {
2317                         struct in_device *l3_in_dev;
2318
2319                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2320                         if (l3_in_dev)
2321                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2322                                                       ip_hdr(skb)->protocol);
2323                 }
2324
2325                 if (our
2326 #ifdef CONFIG_IP_MROUTE
2327                         ||
2328                     (!ipv4_is_local_multicast(daddr) &&
2329                      IN_DEV_MFORWARD(in_dev))
2330 #endif
2331                    ) {
2332                         err = ip_route_input_mc(skb, daddr, saddr,
2333                                                 tos, dev, our);
2334                 }
2335                 return err;
2336         }
2337
2338         return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2339 }
2340
2341 /* called with rcu_read_lock() */
2342 static struct rtable *__mkroute_output(const struct fib_result *res,
2343                                        const struct flowi4 *fl4, int orig_oif,
2344                                        struct net_device *dev_out,
2345                                        unsigned int flags)
2346 {
2347         struct fib_info *fi = res->fi;
2348         struct fib_nh_exception *fnhe;
2349         struct in_device *in_dev;
2350         u16 type = res->type;
2351         struct rtable *rth;
2352         bool do_cache;
2353
2354         in_dev = __in_dev_get_rcu(dev_out);
2355         if (!in_dev)
2356                 return ERR_PTR(-EINVAL);
2357
2358         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2359                 if (ipv4_is_loopback(fl4->saddr) &&
2360                     !(dev_out->flags & IFF_LOOPBACK) &&
2361                     !netif_is_l3_master(dev_out))
2362                         return ERR_PTR(-EINVAL);
2363
2364         if (ipv4_is_lbcast(fl4->daddr))
2365                 type = RTN_BROADCAST;
2366         else if (ipv4_is_multicast(fl4->daddr))
2367                 type = RTN_MULTICAST;
2368         else if (ipv4_is_zeronet(fl4->daddr))
2369                 return ERR_PTR(-EINVAL);
2370
2371         if (dev_out->flags & IFF_LOOPBACK)
2372                 flags |= RTCF_LOCAL;
2373
2374         do_cache = true;
2375         if (type == RTN_BROADCAST) {
2376                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2377                 fi = NULL;
2378         } else if (type == RTN_MULTICAST) {
2379                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2380                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2381                                      fl4->flowi4_proto))
2382                         flags &= ~RTCF_LOCAL;
2383                 else
2384                         do_cache = false;
2385                 /* If no multicast route exists, use the
2386                  * default one, but do not use a gateway in this case.
2387                  * Yes, it is a hack.
2388                  */
2389                 if (fi && res->prefixlen < 4)
2390                         fi = NULL;
2391         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2392                    (orig_oif != dev_out->ifindex)) {
2393                 /* For local routes that require a particular output interface
2394                  * we do not want to cache the result.  Caching the result
2395                  * causes incorrect behaviour when there are multiple source
2396                  * addresses on the interface, the end result being that if the
2397                  * intended recipient is waiting on that interface for the
2398                  * packet he won't receive it because it will be delivered on
2399                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2400                  * be set to the loopback interface as well.
2401                  */
2402                 do_cache = false;
2403         }
2404
2405         fnhe = NULL;
2406         do_cache &= fi != NULL;
2407         if (fi) {
2408                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2409                 struct rtable __rcu **prth;
2410
2411                 fnhe = find_exception(nhc, fl4->daddr);
2412                 if (!do_cache)
2413                         goto add;
2414                 if (fnhe) {
2415                         prth = &fnhe->fnhe_rth_output;
2416                 } else {
2417                         if (unlikely(fl4->flowi4_flags &
2418                                      FLOWI_FLAG_KNOWN_NH &&
2419                                      !(nhc->nhc_gw_family &&
2420                                        nhc->nhc_scope == RT_SCOPE_LINK))) {
2421                                 do_cache = false;
2422                                 goto add;
2423                         }
2424                         prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2425                 }
2426                 rth = rcu_dereference(*prth);
2427                 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2428                         return rth;
2429         }
2430
2431 add:
2432         rth = rt_dst_alloc(dev_out, flags, type,
2433                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2434                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2435                            do_cache);
2436         if (!rth)
2437                 return ERR_PTR(-ENOBUFS);
2438
2439         rth->rt_iif = orig_oif;
2440
2441         RT_CACHE_STAT_INC(out_slow_tot);
2442
2443         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2444                 if (flags & RTCF_LOCAL &&
2445                     !(dev_out->flags & IFF_LOOPBACK)) {
2446                         rth->dst.output = ip_mc_output;
2447                         RT_CACHE_STAT_INC(out_slow_mc);
2448                 }
2449 #ifdef CONFIG_IP_MROUTE
2450                 if (type == RTN_MULTICAST) {
2451                         if (IN_DEV_MFORWARD(in_dev) &&
2452                             !ipv4_is_local_multicast(fl4->daddr)) {
2453                                 rth->dst.input = ip_mr_input;
2454                                 rth->dst.output = ip_mc_output;
2455                         }
2456                 }
2457 #endif
2458         }
2459
2460         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2461         lwtunnel_set_redirect(&rth->dst);
2462
2463         return rth;
2464 }
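/* Output routes are cached per nexthop (or per exception) much like
 * input routes, but broadcast routes, locally delivered multicast, and
 * local routes pinned to a particular output interface are deliberately
 * left uncached; the comment above explains how caching the last case
 * would misdeliver packets.
 */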
2465
2466 /*
2467  * Major route resolver routine.
2468  */
2469
2470 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2471                                         const struct sk_buff *skb)
2472 {
2473         __u8 tos = RT_FL_TOS(fl4);
2474         struct fib_result res = {
2475                 .type           = RTN_UNSPEC,
2476                 .fi             = NULL,
2477                 .table          = NULL,
2478                 .tclassid       = 0,
2479         };
2480         struct rtable *rth;
2481
2482         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2483         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2484         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2485                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2486
2487         rcu_read_lock();
2488         rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2489         rcu_read_unlock();
2490
2491         return rth;
2492 }
2493 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2494
2495 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2496                                             struct fib_result *res,
2497                                             const struct sk_buff *skb)
2498 {
2499         struct net_device *dev_out = NULL;
2500         int orig_oif = fl4->flowi4_oif;
2501         unsigned int flags = 0;
2502         struct rtable *rth;
2503         int err;
2504
2505         if (fl4->saddr) {
2506                 if (ipv4_is_multicast(fl4->saddr) ||
2507                     ipv4_is_lbcast(fl4->saddr) ||
2508                     ipv4_is_zeronet(fl4->saddr)) {
2509                         rth = ERR_PTR(-EINVAL);
2510                         goto out;
2511                 }
2512
2513                 rth = ERR_PTR(-ENETUNREACH);
2514
2515                 /* I removed the check for oif == dev_out->oif here.
2516                    It was wrong for two reasons:
2517                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2518                       is assigned to multiple interfaces.
2519                    2. Moreover, we are allowed to send packets with the saddr
2520                       of another iface. --ANK
2521                  */
2522
2523                 if (fl4->flowi4_oif == 0 &&
2524                     (ipv4_is_multicast(fl4->daddr) ||
2525                      ipv4_is_lbcast(fl4->daddr))) {
2526                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2527                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2528                         if (!dev_out)
2529                                 goto out;
2530
2531                         /* Special hack: the user can direct multicasts
2532                            and limited broadcast via the necessary interface
2533                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2534                            This hack is not just for fun, it allows
2535                            vic, vat and friends to work.
2536                            They bind the socket to loopback, set ttl to zero
2537                            and expect that it will work.
2538                            From the viewpoint of the routing cache they are broken,
2539                            because we are not allowed to build a multicast path
2540                            with a loopback source addr (look, the routing cache
2541                            cannot know that ttl is zero, so that the packet
2542                            will not leave this host and the route is valid).
2543                            Luckily, this hack is a good workaround.
2544                          */
2545
2546                         fl4->flowi4_oif = dev_out->ifindex;
2547                         goto make_route;
2548                 }
2549
2550                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2551                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2552                         if (!__ip_dev_find(net, fl4->saddr, false))
2553                                 goto out;
2554                 }
2555         }
2556
2557
2558         if (fl4->flowi4_oif) {
2559                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2560                 rth = ERR_PTR(-ENODEV);
2561                 if (!dev_out)
2562                         goto out;
2563
2564                 /* RACE: Check return value of inet_select_addr instead. */
2565                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2566                         rth = ERR_PTR(-ENETUNREACH);
2567                         goto out;
2568                 }
2569                 if (ipv4_is_local_multicast(fl4->daddr) ||
2570                     ipv4_is_lbcast(fl4->daddr) ||
2571                     fl4->flowi4_proto == IPPROTO_IGMP) {
2572                         if (!fl4->saddr)
2573                                 fl4->saddr = inet_select_addr(dev_out, 0,
2574                                                               RT_SCOPE_LINK);
2575                         goto make_route;
2576                 }
2577                 if (!fl4->saddr) {
2578                         if (ipv4_is_multicast(fl4->daddr))
2579                                 fl4->saddr = inet_select_addr(dev_out, 0,
2580                                                               fl4->flowi4_scope);
2581                         else if (!fl4->daddr)
2582                                 fl4->saddr = inet_select_addr(dev_out, 0,
2583                                                               RT_SCOPE_HOST);
2584                 }
2585         }
2586
2587         if (!fl4->daddr) {
2588                 fl4->daddr = fl4->saddr;
2589                 if (!fl4->daddr)
2590                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2591                 dev_out = net->loopback_dev;
2592                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2593                 res->type = RTN_LOCAL;
2594                 flags |= RTCF_LOCAL;
2595                 goto make_route;
2596         }
2597
2598         err = fib_lookup(net, fl4, res, 0);
2599         if (err) {
2600                 res->fi = NULL;
2601                 res->table = NULL;
2602                 if (fl4->flowi4_oif &&
2603                     (ipv4_is_multicast(fl4->daddr) ||
2604                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2605                         /* Apparently, the routing tables are wrong. Assume
2606                            that the destination is on-link.
2607
2608                            WHY? DW.
2609                            Because we are allowed to send to an iface
2610                            even if it has NO routes and NO assigned
2611                            addresses. When oif is specified, routing
2612                            tables are looked up with only one purpose:
2613                            to catch whether the destination is gatewayed rather
2614                            than direct. Moreover, if MSG_DONTROUTE is set,
2615                            we send the packet, ignoring both the routing tables
2616                            and the ifaddr state. --ANK
2617
2618
2619                            We could do the same even if oif is unknown,
2620                            as IPv6 likely does, but we do not.
2621                          */
2622
2623                         if (fl4->saddr == 0)
2624                                 fl4->saddr = inet_select_addr(dev_out, 0,
2625                                                               RT_SCOPE_LINK);
2626                         res->type = RTN_UNICAST;
2627                         goto make_route;
2628                 }
2629                 rth = ERR_PTR(err);
2630                 goto out;
2631         }
2632
2633         if (res->type == RTN_LOCAL) {
2634                 if (!fl4->saddr) {
2635                         if (res->fi->fib_prefsrc)
2636                                 fl4->saddr = res->fi->fib_prefsrc;
2637                         else
2638                                 fl4->saddr = fl4->daddr;
2639                 }
2640
2641                 /* L3 master device is the loopback for that domain */
2642                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2643                         net->loopback_dev;
2644
2645                 /* make sure orig_oif points to fib result device even
2646                  * though packet rx/tx happens over loopback or l3mdev
2647                  */
2648                 orig_oif = FIB_RES_OIF(*res);
2649
2650                 fl4->flowi4_oif = dev_out->ifindex;
2651                 flags |= RTCF_LOCAL;
2652                 goto make_route;
2653         }
2654
2655         fib_select_path(net, res, fl4, skb);
2656
2657         dev_out = FIB_RES_DEV(*res);
2658
2659 make_route:
2660         rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2661
2662 out:
2663         return rth;
2664 }
2665
2666 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2667 {
2668         return NULL;
2669 }
2670
2671 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2672 {
2673         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2674
2675         return mtu ? : dst->dev->mtu;
2676 }
2677
2678 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2679                                           struct sk_buff *skb, u32 mtu,
2680                                           bool confirm_neigh)
2681 {
2682 }
2683
2684 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2685                                        struct sk_buff *skb)
2686 {
2687 }
2688
2689 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2690                                           unsigned long old)
2691 {
2692         return NULL;
2693 }
2694
2695 static struct dst_ops ipv4_dst_blackhole_ops = {
2696         .family                 =       AF_INET,
2697         .check                  =       ipv4_blackhole_dst_check,
2698         .mtu                    =       ipv4_blackhole_mtu,
2699         .default_advmss         =       ipv4_default_advmss,
2700         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2701         .redirect               =       ipv4_rt_blackhole_redirect,
2702         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2703         .neigh_lookup           =       ipv4_neigh_lookup,
2704 };
2705
2706 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2707 {
2708         struct rtable *ort = (struct rtable *) dst_orig;
2709         struct rtable *rt;
2710
2711         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2712         if (rt) {
2713                 struct dst_entry *new = &rt->dst;
2714
2715                 new->__use = 1;
2716                 new->input = dst_discard;
2717                 new->output = dst_discard_out;
2718
2719                 new->dev = net->loopback_dev;
2720                 if (new->dev)
2721                         dev_hold(new->dev);
2722
2723                 rt->rt_is_input = ort->rt_is_input;
2724                 rt->rt_iif = ort->rt_iif;
2725                 rt->rt_pmtu = ort->rt_pmtu;
2726                 rt->rt_mtu_locked = ort->rt_mtu_locked;
2727
2728                 rt->rt_genid = rt_genid_ipv4(net);
2729                 rt->rt_flags = ort->rt_flags;
2730                 rt->rt_type = ort->rt_type;
2731                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2732                 rt->rt_gw_family = ort->rt_gw_family;
2733                 if (rt->rt_gw_family == AF_INET)
2734                         rt->rt_gw4 = ort->rt_gw4;
2735                 else if (rt->rt_gw_family == AF_INET6)
2736                         rt->rt_gw6 = ort->rt_gw6;
2737
2738                 INIT_LIST_HEAD(&rt->rt_uncached);
2739         }
2740
2741         dst_release(dst_orig);
2742
2743         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2744 }
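/* The blackhole dst is a stand-in that silently discards traffic: its
 * ->check() always returns NULL so it is never revalidated as usable,
 * and both input and output are wired to dst_discard. xfrm hands it out
 * (via ip_route_output_flow() below) when a usable route cannot be
 * returned yet, e.g. while IPsec state is still being negotiated.
 */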
2745
2746 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2747                                     const struct sock *sk)
2748 {
2749         struct rtable *rt = __ip_route_output_key(net, flp4);
2750
2751         if (IS_ERR(rt))
2752                 return rt;
2753
2754         if (flp4->flowi4_proto) {
2755                 flp4->flowi4_oif = rt->dst.dev->ifindex;
2756                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2757                                                         flowi4_to_flowi(flp4),
2758                                                         sk, 0);
2759         }
2760
2761         return rt;
2762 }
2763 EXPORT_SYMBOL_GPL(ip_route_output_flow);
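/* Usage sketch: connection-oriented callers typically do
 *
 *	rt = ip_route_output_flow(net, &fl4, sk);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 *
 * Note that because of the xfrm hook above, the returned dst may be an
 * IPsec bundle (or the blackhole route) rather than a plain rtable.
 */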
2764
2765 /* called with rcu_read_lock held */
2766 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2767                         struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2768                         struct sk_buff *skb, u32 portid, u32 seq,
2769                         unsigned int flags)
2770 {
2771         struct rtmsg *r;
2772         struct nlmsghdr *nlh;
2773         unsigned long expires = 0;
2774         u32 error;
2775         u32 metrics[RTAX_MAX];
2776
2777         nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2778         if (!nlh)
2779                 return -EMSGSIZE;
2780
2781         r = nlmsg_data(nlh);
2782         r->rtm_family    = AF_INET;
2783         r->rtm_dst_len  = 32;
2784         r->rtm_src_len  = 0;
2785         r->rtm_tos      = fl4 ? fl4->flowi4_tos : 0;
2786         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2787         if (nla_put_u32(skb, RTA_TABLE, table_id))
2788                 goto nla_put_failure;
2789         r->rtm_type     = rt->rt_type;
2790         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2791         r->rtm_protocol = RTPROT_UNSPEC;
2792         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2793         if (rt->rt_flags & RTCF_NOTIFY)
2794                 r->rtm_flags |= RTM_F_NOTIFY;
2795         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2796                 r->rtm_flags |= RTCF_DOREDIRECT;
2797
2798         if (nla_put_in_addr(skb, RTA_DST, dst))
2799                 goto nla_put_failure;
2800         if (src) {
2801                 r->rtm_src_len = 32;
2802                 if (nla_put_in_addr(skb, RTA_SRC, src))
2803                         goto nla_put_failure;
2804         }
2805         if (rt->dst.dev &&
2806             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2807                 goto nla_put_failure;
2808 #ifdef CONFIG_IP_ROUTE_CLASSID
2809         if (rt->dst.tclassid &&
2810             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2811                 goto nla_put_failure;
2812 #endif
2813         if (fl4 && !rt_is_input_route(rt) &&
2814             fl4->saddr != src) {
2815                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2816                         goto nla_put_failure;
2817         }
2818         if (rt->rt_uses_gateway) {
2819                 if (rt->rt_gw_family == AF_INET &&
2820                     nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2821                         goto nla_put_failure;
2822                 } else if (rt->rt_gw_family == AF_INET6) {
2823                         int alen = sizeof(struct in6_addr);
2824                         struct nlattr *nla;
2825                         struct rtvia *via;
2826
2827                         nla = nla_reserve(skb, RTA_VIA, alen + 2);
2828                         if (!nla)
2829                                 goto nla_put_failure;
2830
2831                         via = nla_data(nla);
2832                         via->rtvia_family = AF_INET6;
2833                         memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2834                 }
2835         }
2836
2837         expires = rt->dst.expires;
2838         if (expires) {
2839                 unsigned long now = jiffies;
2840
2841                 if (time_before(now, expires))
2842                         expires -= now;
2843                 else
2844                         expires = 0;
2845         }
2846
2847         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2848         if (rt->rt_pmtu && expires)
2849                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2850         if (rt->rt_mtu_locked && expires)
2851                 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2852         if (rtnetlink_put_metrics(skb, metrics) < 0)
2853                 goto nla_put_failure;
2854
2855         if (fl4) {
2856                 if (fl4->flowi4_mark &&
2857                     nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2858                         goto nla_put_failure;
2859
2860                 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2861                     nla_put_u32(skb, RTA_UID,
2862                                 from_kuid_munged(current_user_ns(),
2863                                                  fl4->flowi4_uid)))
2864                         goto nla_put_failure;
2865
2866                 if (rt_is_input_route(rt)) {
2867 #ifdef CONFIG_IP_MROUTE
2868                         if (ipv4_is_multicast(dst) &&
2869                             !ipv4_is_local_multicast(dst) &&
2870                             IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2871                                 int err = ipmr_get_route(net, skb,
2872                                                          fl4->saddr, fl4->daddr,
2873                                                          r, portid);
2874
2875                                 if (err <= 0) {
2876                                         if (err == 0)
2877                                                 return 0;
2878                                         goto nla_put_failure;
2879                                 }
2880                         } else
2881 #endif
2882                                 if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2883                                         goto nla_put_failure;
2884                 }
2885         }
2886
2887         error = rt->dst.error;
2888
2889         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2890                 goto nla_put_failure;
2891
2892         nlmsg_end(skb, nlh);
2893         return 0;
2894
2895 nla_put_failure:
2896         nlmsg_cancel(skb, nlh);
2897         return -EMSGSIZE;
2898 }
2899
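/* Walk one nexthop-exception hash table under RCU (the caller holds
 * rcu_read_lock()) and emit each live entry as a cached route. Entries
 * before the dump-resume offset fa_start, entries stamped with a stale
 * genid, and entries already past fnhe_expires are skipped, but
 * *fa_index still counts them so an interrupted dump can resume at the
 * right position.
 */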
2900 static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
2901                             struct netlink_callback *cb, u32 table_id,
2902                             struct fnhe_hash_bucket *bucket, int genid,
2903                             int *fa_index, int fa_start, unsigned int flags)
2904 {
2905         int i;
2906
2907         for (i = 0; i < FNHE_HASH_SIZE; i++) {
2908                 struct fib_nh_exception *fnhe;
2909
2910                 for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
2911                      fnhe = rcu_dereference(fnhe->fnhe_next)) {
2912                         struct rtable *rt;
2913                         int err;
2914
2915                         if (*fa_index < fa_start)
2916                                 goto next;
2917
2918                         if (fnhe->fnhe_genid != genid)
2919                                 goto next;
2920
2921                         if (fnhe->fnhe_expires &&
2922                             time_after(jiffies, fnhe->fnhe_expires))
2923                                 goto next;
2924
2925                         rt = rcu_dereference(fnhe->fnhe_rth_input);
2926                         if (!rt)
2927                                 rt = rcu_dereference(fnhe->fnhe_rth_output);
2928                         if (!rt)
2929                                 goto next;
2930
2931                         err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
2932                                            table_id, NULL, skb,
2933                                            NETLINK_CB(cb->skb).portid,
2934                                            cb->nlh->nlmsg_seq, flags);
2935                         if (err)
2936                                 return err;
2937 next:
2938                         (*fa_index)++;
2939                 }
2940         }
2941
2942         return 0;
2943 }
2944
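/* Dump the nexthop exceptions (PMTU and redirect state learned at run
 * time) attached to each live path of @fi. Dead nexthops are skipped;
 * any error while filling the skb aborts the dump so the caller can
 * hand it back to the netlink core.
 */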
2945 int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
2946                        u32 table_id, struct fib_info *fi,
2947                        int *fa_index, int fa_start, unsigned int flags)
2948 {
2949         struct net *net = sock_net(cb->skb->sk);
2950         int nhsel, genid = fnhe_genid(net);
2951
2952         for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
2953                 struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
2954                 struct fnhe_hash_bucket *bucket;
2955                 int err;
2956
2957                 if (nhc->nhc_flags & RTNH_F_DEAD)
2958                         continue;
2959
2960                 rcu_read_lock();
2961                 bucket = rcu_dereference(nhc->nhc_exceptions);
2962                 err = 0;
2963                 if (bucket)
2964                         err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
2965                                                genid, fa_index, fa_start,
2966                                                flags);
2967                 rcu_read_unlock();
2968                 if (err)
2969                         return err;
2970         }
2971
2972         return 0;
2973 }
2974
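/* Build the dummy packet fed through the lookup for RTM_GETROUTE: a
 * bare IPv4 header plus a stub UDP, TCP or ICMP header, so code that
 * looks beyond the network header (e.g. L4-port-based multipath
 * hashing) sees the protocol and ports the request specified.
 */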
2975 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
2976                                                    u8 ip_proto, __be16 sport,
2977                                                    __be16 dport)
2978 {
2979         struct sk_buff *skb;
2980         struct iphdr *iph;
2981
2982         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2983         if (!skb)
2984                 return NULL;
2985
2986         /* Reserve room for dummy headers; this skb can pass
2987          * through a good chunk of the routing engine.
2988          */
2989         skb_reset_mac_header(skb);
2990         skb_reset_network_header(skb);
2991         skb->protocol = htons(ETH_P_IP);
2992         iph = skb_put(skb, sizeof(struct iphdr));
2993         iph->protocol = ip_proto;
2994         iph->saddr = src;
2995         iph->daddr = dst;
2996         iph->version = 0x4;
2997         iph->frag_off = 0;
2998         iph->ihl = 0x5;
2999         skb_set_transport_header(skb, skb->len);
3000
3001         switch (iph->protocol) {
3002         case IPPROTO_UDP: {
3003                 struct udphdr *udph;
3004
3005                 udph = skb_put_zero(skb, sizeof(struct udphdr));
3006                 udph->source = sport;
3007                 udph->dest = dport;
3008                 udph->len = htons(sizeof(struct udphdr));
3009                 udph->check = 0;
3010                 break;
3011         }
3012         case IPPROTO_TCP: {
3013                 struct tcphdr *tcph;
3014
3015                 tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3016                 tcph->source    = sport;
3017                 tcph->dest      = dport;
3018                 tcph->doff      = sizeof(struct tcphdr) / 4;
3019                 tcph->rst = 1;
3020                 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3021                                             src, dst, 0);
3022                 break;
3023         }
3024         case IPPROTO_ICMP: {
3025                 struct icmphdr *icmph;
3026
3027                 icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3028                 icmph->type = ICMP_ECHO;
3029                 icmph->code = 0;
3030         }
3031         }
3032
3033         return skb;
3034 }
3035
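/* Validate an RTM_GETROUTE request. Sockets that opted in to strict
 * checking get full validation: header fields with no meaning for a
 * lookup must be zero, only the three supported rtm_flags may be set,
 * and only attributes that actually parameterize the lookup are
 * accepted. Legacy requesters keep the forgiving deprecated parser
 * for compatibility.
 */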
3036 static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3037                                        const struct nlmsghdr *nlh,
3038                                        struct nlattr **tb,
3039                                        struct netlink_ext_ack *extack)
3040 {
3041         struct rtmsg *rtm;
3042         int i, err;
3043
3044         if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3045                 NL_SET_ERR_MSG(extack,
3046                                "ipv4: Invalid header for route get request");
3047                 return -EINVAL;
3048         }
3049
3050         if (!netlink_strict_get_check(skb))
3051                 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3052                                               rtm_ipv4_policy, extack);
3053
3054         rtm = nlmsg_data(nlh);
3055         if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3056             (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3057             rtm->rtm_table || rtm->rtm_protocol ||
3058             rtm->rtm_scope || rtm->rtm_type) {
3059                 NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3060                 return -EINVAL;
3061         }
3062
3063         if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3064                                RTM_F_LOOKUP_TABLE |
3065                                RTM_F_FIB_MATCH)) {
3066                 NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3067                 return -EINVAL;
3068         }
3069
3070         err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3071                                             rtm_ipv4_policy, extack);
3072         if (err)
3073                 return err;
3074
3075         if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3076             (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3077                 NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3078                 return -EINVAL;
3079         }
3080
3081         for (i = 0; i <= RTA_MAX; i++) {
3082                 if (!tb[i])
3083                         continue;
3084
3085                 switch (i) {
3086                 case RTA_IIF:
3087                 case RTA_OIF:
3088                 case RTA_SRC:
3089                 case RTA_DST:
3090                 case RTA_IP_PROTO:
3091                 case RTA_SPORT:
3092                 case RTA_DPORT:
3093                 case RTA_MARK:
3094                 case RTA_UID:
3095                         break;
3096                 default:
3097                         NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3098                         return -EINVAL;
3099                 }
3100         }
3101
3102         return 0;
3103 }
3104
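/* doit handler for RTM_GETROUTE, the request behind "ip route get".
 * An illustrative invocation (addresses, mark and device name are
 * made up):
 *
 *   ip route get 192.0.2.1 from 198.51.100.2 iif eth0 mark 7
 *
 * If RTA_IIF is present the input path is replayed as though the
 * dummy packet arrived on that device; otherwise a plain output
 * lookup is done. RTM_F_FIB_MATCH returns the matching FIB entry
 * instead of the resolved route.
 */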
3105 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3106                              struct netlink_ext_ack *extack)
3107 {
3108         struct net *net = sock_net(in_skb->sk);
3109         struct nlattr *tb[RTA_MAX+1];
3110         u32 table_id = RT_TABLE_MAIN;
3111         __be16 sport = 0, dport = 0;
3112         struct fib_result res = {};
3113         u8 ip_proto = IPPROTO_UDP;
3114         struct rtable *rt = NULL;
3115         struct sk_buff *skb;
3116         struct rtmsg *rtm;
3117         struct flowi4 fl4 = {};
3118         __be32 dst = 0;
3119         __be32 src = 0;
3120         kuid_t uid;
3121         u32 iif;
3122         int err;
3123         int mark;
3124
3125         err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3126         if (err < 0)
3127                 return err;
3128
3129         rtm = nlmsg_data(nlh);
3130         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3131         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3132         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3133         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3134         if (tb[RTA_UID])
3135                 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3136         else
3137                 uid = (iif ? INVALID_UID : current_uid());
3138
3139         if (tb[RTA_IP_PROTO]) {
3140                 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3141                                                   &ip_proto, AF_INET, extack);
3142                 if (err)
3143                         return err;
3144         }
3145
3146         if (tb[RTA_SPORT])
3147                 sport = nla_get_be16(tb[RTA_SPORT]);
3148
3149         if (tb[RTA_DPORT])
3150                 dport = nla_get_be16(tb[RTA_DPORT]);
3151
3152         skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3153         if (!skb)
3154                 return -ENOBUFS;
3155
3156         fl4.daddr = dst;
3157         fl4.saddr = src;
3158         fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
3159         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3160         fl4.flowi4_mark = mark;
3161         fl4.flowi4_uid = uid;
3162         if (sport)
3163                 fl4.fl4_sport = sport;
3164         if (dport)
3165                 fl4.fl4_dport = dport;
3166         fl4.flowi4_proto = ip_proto;
3167
3168         rcu_read_lock();
3169
3170         if (iif) {
3171                 struct net_device *dev;
3172
3173                 dev = dev_get_by_index_rcu(net, iif);
3174                 if (!dev) {
3175                         err = -ENODEV;
3176                         goto errout_rcu;
3177                 }
3178
3179                 fl4.flowi4_iif = iif; /* for rt_fill_info */
3180                 skb->dev        = dev;
3181                 skb->mark       = mark;
3182                 err = ip_route_input_rcu(skb, dst, src,
3183                                          rtm->rtm_tos & IPTOS_RT_MASK, dev,
3184                                          &res);
3185
3186                 rt = skb_rtable(skb);
3187                 if (err == 0 && rt->dst.error)
3188                         err = -rt->dst.error;
3189         } else {
3190                 fl4.flowi4_iif = LOOPBACK_IFINDEX;
3191                 skb->dev = net->loopback_dev;
3192                 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3193                 err = 0;
3194                 if (IS_ERR(rt))
3195                         err = PTR_ERR(rt);
3196                 else
3197                         skb_dst_set(skb, &rt->dst);
3198         }
3199
3200         if (err)
3201                 goto errout_rcu;
3202
3203         if (rtm->rtm_flags & RTM_F_NOTIFY)
3204                 rt->rt_flags |= RTCF_NOTIFY;
3205
3206         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3207                 table_id = res.table ? res.table->tb_id : 0;
3208
3209         /* reset skb for netlink reply msg */
3210         skb_trim(skb, 0);
3211         skb_reset_network_header(skb);
3212         skb_reset_transport_header(skb);
3213         skb_reset_mac_header(skb);
3214
3215         if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3216                 if (!res.fi) {
3217                         err = fib_props[res.type].error;
3218                         if (!err)
3219                                 err = -EHOSTUNREACH;
3220                         goto errout_rcu;
3221                 }
3222                 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3223                                     nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
3224                                     rt->rt_type, res.prefix, res.prefixlen,
3225                                     fl4.flowi4_tos, res.fi, 0);
3226         } else {
3227                 err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3228                                    NETLINK_CB(in_skb).portid,
3229                                    nlh->nlmsg_seq, 0);
3230         }
3231         if (err < 0)
3232                 goto errout_rcu;
3233
3234         rcu_read_unlock();
3235
3236         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3237
3238 errout_free:
3239         return err;
3240 errout_rcu:
3241         rcu_read_unlock();
3242         kfree_skb(skb);
3243         goto errout_free;
3244 }
3245
3246 void ip_rt_multicast_event(struct in_device *in_dev)
3247 {
3248         rt_cache_flush(dev_net(in_dev->dev));
3249 }
3250
3251 #ifdef CONFIG_SYSCTL
3252 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
3253 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
3254 static int ip_rt_gc_elasticity __read_mostly    = 8;
3255 static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;
3256
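/* Write-only handler behind /proc/sys/net/ipv4/route/flush. Any
 * write, for example:
 *
 *   echo 1 > /proc/sys/net/ipv4/route/flush
 *
 * bumps the route and nexthop-exception generation counters so every
 * cached entry is lazily invalidated; non-write access is rejected
 * with -EINVAL.
 */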
3257 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3258                                         void __user *buffer,
3259                                         size_t *lenp, loff_t *ppos)
3260 {
3261         struct net *net = (struct net *)__ctl->extra1;
3262
3263         if (write) {
3264                 rt_cache_flush(net);
3265                 fnhe_genid_bump(net);
3266                 return 0;
3267         }
3268
3269         return -EINVAL;
3270 }
3271
3272 static struct ctl_table ipv4_route_table[] = {
3273         {
3274                 .procname       = "gc_thresh",
3275                 .data           = &ipv4_dst_ops.gc_thresh,
3276                 .maxlen         = sizeof(int),
3277                 .mode           = 0644,
3278                 .proc_handler   = proc_dointvec,
3279         },
3280         {
3281                 .procname       = "max_size",
3282                 .data           = &ip_rt_max_size,
3283                 .maxlen         = sizeof(int),
3284                 .mode           = 0644,
3285                 .proc_handler   = proc_dointvec,
3286         },
3287         {
3288                 /*  Deprecated. Use gc_min_interval_ms */
3289
3290                 .procname       = "gc_min_interval",
3291                 .data           = &ip_rt_gc_min_interval,
3292                 .maxlen         = sizeof(int),
3293                 .mode           = 0644,
3294                 .proc_handler   = proc_dointvec_jiffies,
3295         },
3296         {
3297                 .procname       = "gc_min_interval_ms",
3298                 .data           = &ip_rt_gc_min_interval,
3299                 .maxlen         = sizeof(int),
3300                 .mode           = 0644,
3301                 .proc_handler   = proc_dointvec_ms_jiffies,
3302         },
3303         {
3304                 .procname       = "gc_timeout",
3305                 .data           = &ip_rt_gc_timeout,
3306                 .maxlen         = sizeof(int),
3307                 .mode           = 0644,
3308                 .proc_handler   = proc_dointvec_jiffies,
3309         },
3310         {
3311                 .procname       = "gc_interval",
3312                 .data           = &ip_rt_gc_interval,
3313                 .maxlen         = sizeof(int),
3314                 .mode           = 0644,
3315                 .proc_handler   = proc_dointvec_jiffies,
3316         },
3317         {
3318                 .procname       = "redirect_load",
3319                 .data           = &ip_rt_redirect_load,
3320                 .maxlen         = sizeof(int),
3321                 .mode           = 0644,
3322                 .proc_handler   = proc_dointvec,
3323         },
3324         {
3325                 .procname       = "redirect_number",
3326                 .data           = &ip_rt_redirect_number,
3327                 .maxlen         = sizeof(int),
3328                 .mode           = 0644,
3329                 .proc_handler   = proc_dointvec,
3330         },
3331         {
3332                 .procname       = "redirect_silence",
3333                 .data           = &ip_rt_redirect_silence,
3334                 .maxlen         = sizeof(int),
3335                 .mode           = 0644,
3336                 .proc_handler   = proc_dointvec,
3337         },
3338         {
3339                 .procname       = "error_cost",
3340                 .data           = &ip_rt_error_cost,
3341                 .maxlen         = sizeof(int),
3342                 .mode           = 0644,
3343                 .proc_handler   = proc_dointvec,
3344         },
3345         {
3346                 .procname       = "error_burst",
3347                 .data           = &ip_rt_error_burst,
3348                 .maxlen         = sizeof(int),
3349                 .mode           = 0644,
3350                 .proc_handler   = proc_dointvec,
3351         },
3352         {
3353                 .procname       = "gc_elasticity",
3354                 .data           = &ip_rt_gc_elasticity,
3355                 .maxlen         = sizeof(int),
3356                 .mode           = 0644,
3357                 .proc_handler   = proc_dointvec,
3358         },
3359         {
3360                 .procname       = "mtu_expires",
3361                 .data           = &ip_rt_mtu_expires,
3362                 .maxlen         = sizeof(int),
3363                 .mode           = 0644,
3364                 .proc_handler   = proc_dointvec_jiffies,
3365         },
3366         {
3367                 .procname       = "min_pmtu",
3368                 .data           = &ip_rt_min_pmtu,
3369                 .maxlen         = sizeof(int),
3370                 .mode           = 0644,
3371                 .proc_handler   = proc_dointvec_minmax,
3372                 .extra1         = &ip_min_valid_pmtu,
3373         },
3374         {
3375                 .procname       = "min_adv_mss",
3376                 .data           = &ip_rt_min_advmss,
3377                 .maxlen         = sizeof(int),
3378                 .mode           = 0644,
3379                 .proc_handler   = proc_dointvec,
3380         },
3381         { }
3382 };
3383
3384 static const char ipv4_route_flush_procname[] = "flush";
3385
3386 static struct ctl_table ipv4_route_flush_table[] = {
3387         {
3388                 .procname       = ipv4_route_flush_procname,
3389                 .maxlen         = sizeof(int),
3390                 .mode           = 0200,
3391                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3392         },
3393         { },
3394 };
3395
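/* Register the per-netns "flush" sysctl. Namespaces other than
 * init_net get a kmemdup'd copy of the table so that extra1 can point
 * at their own struct net; in a non-initial user namespace only
 * whitelisted entries (currently just "flush" itself) remain visible.
 */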
3396 static __net_init int sysctl_route_net_init(struct net *net)
3397 {
3398         struct ctl_table *tbl;
3399
3400         tbl = ipv4_route_flush_table;
3401         if (!net_eq(net, &init_net)) {
3402                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3403                 if (!tbl)
3404                         goto err_dup;
3405
3406                 /* Don't export non-whitelisted sysctls to unprivileged users */
3407                 if (net->user_ns != &init_user_ns) {
3408                         if (tbl[0].procname != ipv4_route_flush_procname)
3409                                 tbl[0].procname = NULL;
3410                 }
3411         }
3412         tbl[0].extra1 = net;
3413
3414         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3415         if (!net->ipv4.route_hdr)
3416                 goto err_reg;
3417         return 0;
3418
3419 err_reg:
3420         if (tbl != ipv4_route_flush_table)
3421                 kfree(tbl);
3422 err_dup:
3423         return -ENOMEM;
3424 }
3425
3426 static __net_exit void sysctl_route_net_exit(struct net *net)
3427 {
3428         struct ctl_table *tbl;
3429
3430         tbl = net->ipv4.route_hdr->ctl_table_arg;
3431         unregister_net_sysctl_table(net->ipv4.route_hdr);
3432         BUG_ON(tbl == ipv4_route_flush_table);
3433         kfree(tbl);
3434 }
3435
3436 static __net_initdata struct pernet_operations sysctl_route_ops = {
3437         .init = sysctl_route_net_init,
3438         .exit = sysctl_route_net_exit,
3439 };
3440 #endif
3441
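/* Per-netns generation counters. Bumping rt_genid invalidates all
 * cached IPv4 dsts and fnhe_genid does the same for nexthop
 * exceptions; dev_addr_genid simply starts from a random value here.
 */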
3442 static __net_init int rt_genid_init(struct net *net)
3443 {
3444         atomic_set(&net->ipv4.rt_genid, 0);
3445         atomic_set(&net->fnhe_genid, 0);
3446         atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3447         return 0;
3448 }
3449
3450 static __net_initdata struct pernet_operations rt_genid_ops = {
3451         .init = rt_genid_init,
3452 };
3453
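/* Per-netns inetpeer base: long-lived state about remote hosts (for
 * example ICMP rate-limiting counters) lives here and is torn down
 * together with the namespace.
 */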
3454 static int __net_init ipv4_inetpeer_init(struct net *net)
3455 {
3456         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3457
3458         if (!bp)
3459                 return -ENOMEM;
3460         inet_peer_base_init(bp);
3461         net->ipv4.peers = bp;
3462         return 0;
3463 }
3464
3465 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3466 {
3467         struct inet_peer_base *bp = net->ipv4.peers;
3468
3469         net->ipv4.peers = NULL;
3470         inetpeer_invalidate_tree(bp);
3471         kfree(bp);
3472 }
3473
3474 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3475         .init   =       ipv4_inetpeer_init,
3476         .exit   =       ipv4_inetpeer_exit,
3477 };
3478
3479 #ifdef CONFIG_IP_ROUTE_CLASSID
3480 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3481 #endif /* CONFIG_IP_ROUTE_CLASSID */
3482
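/* Boot-time initialization of the IPv4 routing layer: the IP ident
 * hash, the per-cpu uncached-route lists, the dst slab caches, the
 * proc interface, the RTM_GETROUTE handler and the pernet subsystems
 * are all wired up here.
 */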
3483 int __init ip_rt_init(void)
3484 {
3485         void *idents_hash;
3486         int cpu;
3487
3488         /* For modern hosts, this will use 2 MB of memory */
3489         idents_hash = alloc_large_system_hash("IP idents",
3490                                               sizeof(*ip_idents) + sizeof(*ip_tstamps),
3491                                               0,
3492                                               16, /* one bucket per 64 KB */
3493                                               HASH_ZERO,
3494                                               NULL,
3495                                               &ip_idents_mask,
3496                                               2048,
3497                                               256*1024);
3498
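        /* A single block backs both tables: each slot pairs an atomic_t
         * ident with a u32 timestamp (8 bytes), scaled at one slot per
         * 64 KB of memory and clamped to [2048, 256K] slots, so the
         * upper bound is the 2 MB mentioned above. ip_idents takes the
         * front of the block and ip_tstamps follows it.
         */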
3499         ip_idents = idents_hash;
3500
3501         prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
3502
3503         ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
3504
3505         for_each_possible_cpu(cpu) {
3506                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3507
3508                 INIT_LIST_HEAD(&ul->head);
3509                 spin_lock_init(&ul->lock);
3510         }
3511 #ifdef CONFIG_IP_ROUTE_CLASSID
3512         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3513         if (!ip_rt_acct)
3514                 panic("IP: failed to allocate ip_rt_acct\n");
3515 #endif
3516
3517         ipv4_dst_ops.kmem_cachep =
3518                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3519                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3520
3521         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3522
3523         if (dst_entries_init(&ipv4_dst_ops) < 0)
3524                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3525
3526         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3527                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3528
3529         ipv4_dst_ops.gc_thresh = ~0;
3530         ip_rt_max_size = INT_MAX;
3531
3532         devinet_init();
3533         ip_fib_init();
3534
3535         if (ip_rt_proc_init())
3536                 pr_err("Unable to create route proc files\n");
3537 #ifdef CONFIG_XFRM
3538         xfrm_init();
3539         xfrm4_init();
3540 #endif
3541         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3542                       RTNL_FLAG_DOIT_UNLOCKED);
3543
3544 #ifdef CONFIG_SYSCTL
3545         register_pernet_subsys(&sysctl_route_ops);
3546 #endif
3547         register_pernet_subsys(&rt_genid_ops);
3548         register_pernet_subsys(&ipv4_inetpeer_ops);
3549         return 0;
3550 }
3551
3552 #ifdef CONFIG_SYSCTL
3553 /*
3554  * We really need to sanitize the damn ipv4 init order; then all
3555  * this nonsense will go away.
3556  */
3557 void __init ip_static_sysctl_init(void)
3558 {
3559         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3560 }
3561 #endif