/* GNU Linux-libre 5.10.153-gnu1: net/ipv4/route.c */
// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD;
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/memblock.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/nexthop.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#include "fib_lookup.h"

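/* RT_FL_TOS() keeps the TOS bits used for route lookup (IPTOS_RT_MASK)
 * together with RTO_ONLINK, a legacy "on-link" flag carried in the same
 * byte; ip_rt_fix_tos() below turns that flag into link scope and strips
 * it from flowi4_tos.
 */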
#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
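
/* With these defaults: redirect_silence = (HZ/50) << 10, i.e. roughly 20
 * seconds; min_pmtu = 552 (512 bytes of payload plus 20-byte IP and TCP
 * headers); mtu_expires = 10 minutes. These are tunable at runtime via
 * the net.ipv4.route.* sysctls.
 */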

/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu,
                                           bool confirm_neigh);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void             ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
        .confirm_neigh =        ipv4_confirm_neigh,
};
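
/* ipv4_dst_ops is the operations vector through which the protocol-
 * independent dst cache calls back into IPv4: validity checks, MTU and
 * advmss computation, PMTU updates and ICMP redirect handling all enter
 * through these hooks. ipv4_cow_metrics() warns because IPv4 routes get
 * their metrics attached at creation time and are never expected to
 * copy-on-write them.
 */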

#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
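
/* ip_tos2prio maps the four RFC 1349 TOS bits to a packet-scheduler
 * priority band. ECN_OR_COST() expands to the same TC_PRIO_* value as
 * the plain class: the odd entries correspond to the old "minimize
 * monetary cost" bit (since reclaimed by ECN) and intentionally share
 * the priority of the class beside them, so rt_tos2priority() can be a
 * plain table lookup.
 */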

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct proc_ops rt_cache_proc_ops = {
        .proc_open      = rt_cache_seq_open,
        .proc_read      = seq_read,
        .proc_lseek     = seq_lseek,
        .proc_release   = seq_release,
};
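
/* Note that /proc/net/rt_cache prints only the header line these days:
 * the IPv4 routing cache itself was removed in 3.6, and the file is
 * kept so that tools which still open it keep working.
 */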


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        (*pos)++;
        return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct proc_ops rt_cpu_proc_ops = {
        .proc_open      = rt_cpu_seq_open,
        .proc_read      = seq_read,
        .proc_lseek     = seq_lseek,
        .proc_release   = seq_release,
};
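
/* /proc/net/stat/rt_cache emits one line per possible CPU. The seq
 * iterator encodes "CPU index plus one" in *pos so that position 0 can
 * produce the header; counters belonging to the removed cache/GC code
 * are printed as literal zeros to preserve the column layout.
 */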

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", 0444, net->proc_net,
                          &rt_cache_proc_ops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", 0444,
                          net->proc_net_stat, &rt_cpu_proc_ops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create_single("rt_acct", 0, net->proc_net,
                        rt_acct_proc_show);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        const struct rtable *rt = container_of(dst, struct rtable, dst);
        struct net_device *dev = dst->dev;
        struct neighbour *n;

        rcu_read_lock_bh();

        if (likely(rt->rt_gw_family == AF_INET)) {
                n = ip_neigh_gw4(dev, rt->rt_gw4);
        } else if (rt->rt_gw_family == AF_INET6) {
                n = ip_neigh_gw6(dev, &rt->rt_gw6);
        } else {
                __be32 pkey;

                pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
                n = ip_neigh_gw4(dev, pkey);
        }

        if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
                n = NULL;

        rcu_read_unlock_bh();

        return n;
}

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
        const struct rtable *rt = container_of(dst, struct rtable, dst);
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;

        if (rt->rt_gw_family == AF_INET) {
                pkey = (const __be32 *)&rt->rt_gw4;
        } else if (rt->rt_gw_family == AF_INET6) {
                return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
        } else if (!daddr ||
                 (rt->rt_flags &
                  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
                return;
        }
        __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

/* Hash tables of size 2048..262144 depending on RAM size.
 * Each bucket uses 8 bytes.
 */
static u32 ip_idents_mask __read_mostly;
static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
        u32 bucket, old, now = (u32)jiffies;
        atomic_t *p_id;
        u32 *p_tstamp;
        u32 delta = 0;

        bucket = hash & ip_idents_mask;
        p_tstamp = ip_tstamps + bucket;
        p_id = ip_idents + bucket;
        old = READ_ONCE(*p_tstamp);

        if (old != now && cmpxchg(p_tstamp, old, now) == old)
                delta = prandom_u32_max(now - old);

        /* If UBSAN reports an error here, first make sure your compiler
         * supports -fno-strict-overflow before filing a bug: the false
         * positive was a bug in UBSAN itself, fixed in GCC 8.
         */
        return atomic_add_return(segs + delta, p_id) - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
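
/* In other words: each flow hashes to a bucket holding an atomic ID
 * counter and the jiffy it was last used. A caller reserving "segs" IDs
 * gets a contiguous range back, and if the bucket sat idle for N jiffies
 * a random extra increment in [0, N) is mixed in first, so an observer
 * cannot count packets sent in between by sampling IDs before and after
 * the gap.
 */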

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
        u32 hash, id;

        /* Note the following code is not safe, but this is okay. */
        if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
                get_random_bytes(&net->ipv4.ip_id_key,
                                 sizeof(net->ipv4.ip_id_key));

        hash = siphash_3u32((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol,
                            &net->ipv4.ip_id_key);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

static void ip_rt_fix_tos(struct flowi4 *fl4)
{
        __u8 tos = RT_FL_TOS(fl4);

        fl4->flowi4_tos = tos & IPTOS_RT_MASK;
        fl4->flowi4_scope = tos & RTO_ONLINK ?
                            RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
}
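
/* ip_rt_fix_tos() is the other half of RT_FL_TOS(): the RTO_ONLINK bit
 * smuggled through flowi4_tos is translated into RT_SCOPE_LINK and then
 * masked out, leaving only real TOS bits in the flow key.
 */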

static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
                             const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0,
                           sock_net_uid(net, sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct net *net = dev_net(skb->dev);
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                dst_dev_put(&rt->dst);
                dst_release(&rt->dst);
        }
}

static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
        struct fib_nh_exception *fnhe, *oldest = NULL;

        for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
                fnhe = rcu_dereference_protected(*fnhe_p,
                                                 lockdep_is_held(&fnhe_lock));
                if (!fnhe)
                        break;
                if (!oldest ||
                    time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
                        oldest = fnhe;
                        oldest_p = fnhe_p;
                }
        }
        fnhe_flush_routes(oldest);
        *oldest_p = oldest->fnhe_next;
        kfree_rcu(oldest, rcu);
}

static u32 fnhe_hashfun(__be32 daddr)
{
        static siphash_key_t fnhe_hash_key __read_mostly;
        u64 hval;

        net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
        hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
        return hash_64(hval, FNHE_HASH_SHIFT);
}
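
/* The exception hash is keyed with a boot-time random siphash key so an
 * off-path attacker cannot choose destination addresses that all land
 * in one bucket and force fnhe_remove_oldest() to evict a victim's
 * entry.
 */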

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_uses_gateway = 1;
                rt->rt_gw_family = AF_INET;
                rt->rt_gw4 = fnhe->fnhe_gw;
        }
}

static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
                                  __be32 gw, u32 pmtu, bool lock,
                                  unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        u32 genid, hval;
        unsigned int i;
        int depth;

        genid = fnhe_genid(dev_net(nhc->nhc_dev));
        hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference(nhc->nhc_exceptions);
        if (!hash) {
                hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                rcu_assign_pointer(nhc->nhc_exceptions, hash);
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (fnhe->fnhe_genid != genid)
                        fnhe->fnhe_genid = genid;
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_mtu_locked = lock;
                }
                fnhe->fnhe_expires = max(1UL, expires);
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                /* Randomize the max chain depth to make side-channel attacks harder. */
                int max_depth = FNHE_RECLAIM_DEPTH +
                                prandom_u32_max(FNHE_RECLAIM_DEPTH);

                while (depth > max_depth) {
                        fnhe_remove_oldest(hash);
                        depth--;
                }

                fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                if (!fnhe)
                        goto out_unlock;

                fnhe->fnhe_next = hash->chain;

                fnhe->fnhe_genid = genid;
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_mtu_locked = lock;
                fnhe->fnhe_expires = max(1UL, expires);

                rcu_assign_pointer(hash->chain, fnhe);

                /* Exception created; mark the cached routes for the nexthop
                 * stale, so anyone caching it rechecks if this exception
                 * applies to them.
                 */
                rt = rcu_dereference(nhc->nhc_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;
                        prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
        if (!n)
                n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
        if (!IS_ERR(n)) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res, 0) == 0) {
                                struct fib_nh_common *nhc;

                                fib_select_path(net, &res, fl4, skb);
                                nhc = FIB_RES_NHC(res);
                                update_or_create_fnhe(nhc, fl4->daddr, new_gw,
                                                0, false,
                                                jiffies + ip_rt_gc_timeout);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct net *net = dev_net(skb->dev);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
        ip_rt_fix_tos(&fl4);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
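
/* With the defaults above that works out to: successive redirects to a
 * peer back off exponentially (ip_rt_redirect_load << n_redirects
 * jiffies since the last one); after ip_rt_redirect_number (9) ignored
 * redirects we go silent, and roughly 20 seconds (ip_rt_redirect_silence)
 * without redirect-worthy traffic from that peer resets the counters.
 */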

void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;
        int vif;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
                peer->rate_tokens = 0;
                peer->n_redirects = 0;
        }

        /* Too many ignored redirects; do not send anything.
         * Set peer->rate_last to the time of the last seen redirected packet.
         */
        if (peer->n_redirects >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->n_redirects == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->n_redirects)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->n_redirects;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->n_redirects == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        if (netif_is_l3_master(skb->dev)) {
                dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
                if (!dev)
                        goto out;
        }

        in_dev = __in_dev_get_rcu(dev);

        /* IP on this device is disabled. */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
                               l3mdev_master_ifindex(skb->dev), 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        struct net *net = dev_net(dst->dev);
        struct fib_result res;
        bool lock = false;
        u32 old_mtu;

        if (ip_mtu_locked(dst))
                return;

        old_mtu = ipv4_mtu(dst);
        if (old_mtu < mtu)
                return;

        if (mtu < ip_rt_min_pmtu) {
                lock = true;
                mtu = min(old_mtu, ip_rt_min_pmtu);
        }

        if (rt->rt_pmtu == mtu && !lock &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(net, fl4, &res, 0) == 0) {
                struct fib_nh_common *nhc;

                fib_select_path(net, &res, fl4, NULL);
                nhc = FIB_RES_NHC(res);
                update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}
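
/* PMTU learning in a nutshell: a new estimate is only ever accepted
 * downwards; anything below ip_rt_min_pmtu (552 by default) is clamped
 * to that floor and the entry is "locked" so it cannot shrink further.
 * The result is stored as a nexthop exception that expires after
 * ip_rt_mtu_expires (10 minutes by default).
 */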

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu,
                              bool confirm_neigh)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        ip_rt_fix_tos(&fl4);

        /* Don't make lookup fail for bridged encapsulations */
        if (skb && netif_is_any_bridge_port(skb->dev))
                fl4.flowi4_oif = 0;

        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u8 protocol)
{
        const struct iphdr *iph = (const struct iphdr *)skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        u32 mark = IP4_REPLY_MARK(net, skb->mark);

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *)skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!fl4.flowi4_mark)
                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *)skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;
        struct net *net = sock_net(sk);

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        if (odst->obsolete && !odst->ops->check(odst, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        } else {
                ip_rt_fix_tos(&fl4);
        }

        __ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);

        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u8 protocol)
{
        const struct iphdr *iph = (const struct iphdr *)skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(net, &fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *)skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct net *net = sock_net(sk);

        __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPV4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK which forces validation calls down
         * into this function always.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * DST_OBSOLETE_DEAD.
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
        struct ip_options opt;
        int res;

        /* Recompile ip options since IPCB may not be valid anymore.
         * Also check we have a reasonable ipv4 header.
         */
        if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
            ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
                return;

        memset(&opt, 0, sizeof(opt));
        if (ip_hdr(skb)->ihl > 5) {
                if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
                        return;
                opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);

                rcu_read_lock();
                res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
                rcu_read_unlock();

                if (res)
                        return;
        }
        __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        ipv4_send_dest_unreach(skb);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be not aligned
   in IP options!
 */
1298
1299 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1300 {
1301         __be32 src;
1302
1303         if (rt_is_output_route(rt))
1304                 src = ip_hdr(skb)->saddr;
1305         else {
1306                 struct fib_result res;
1307                 struct iphdr *iph = ip_hdr(skb);
1308                 struct flowi4 fl4 = {
1309                         .daddr = iph->daddr,
1310                         .saddr = iph->saddr,
1311                         .flowi4_tos = RT_TOS(iph->tos),
1312                         .flowi4_oif = rt->dst.dev->ifindex,
1313                         .flowi4_iif = skb->dev->ifindex,
1314                         .flowi4_mark = skb->mark,
1315                 };
1316
1317                 rcu_read_lock();
1318                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1319                         src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1320                 else
1321                         src = inet_select_addr(rt->dst.dev,
1322                                                rt_nexthop(rt, iph->daddr),
1323                                                RT_SCOPE_UNIVERSE);
1324                 rcu_read_unlock();
1325         }
1326         memcpy(addr, &src, 4);
1327 }
1328
1329 #ifdef CONFIG_IP_ROUTE_CLASSID
1330 static void set_class_tag(struct rtable *rt, u32 tag)
1331 {
1332         if (!(rt->dst.tclassid & 0xFFFF))
1333                 rt->dst.tclassid |= tag & 0xFFFF;
1334         if (!(rt->dst.tclassid & 0xFFFF0000))
1335                 rt->dst.tclassid |= tag & 0xFFFF0000;
1336 }
1337 #endif
1338
1339 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1340 {
1341         unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1342         unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1343                                     ip_rt_min_advmss);
1344
1345         return min(advmss, IPV4_MAX_PMTU - header_size);
1346 }
1347
1348 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1349 {
1350         const struct rtable *rt = (const struct rtable *)dst;
1351         unsigned int mtu = rt->rt_pmtu;
1352
1353         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1354                 mtu = dst_metric_raw(dst, RTAX_MTU);
1355
1356         if (mtu)
1357                 goto out;
1358
1359         mtu = READ_ONCE(dst->dev->mtu);
1360
1361         if (unlikely(ip_mtu_locked(dst))) {
1362                 if (rt->rt_uses_gateway && mtu > 576)
1363                         mtu = 576;
1364         }
1365
1366 out:
1367         mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1368
1369         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1370 }
1371
1372 static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1373 {
1374         struct fnhe_hash_bucket *hash;
1375         struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1376         u32 hval = fnhe_hashfun(daddr);
1377
1378         spin_lock_bh(&fnhe_lock);
1379
1380         hash = rcu_dereference_protected(nhc->nhc_exceptions,
1381                                          lockdep_is_held(&fnhe_lock));
1382         hash += hval;
1383
1384         fnhe_p = &hash->chain;
1385         fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1386         while (fnhe) {
1387                 if (fnhe->fnhe_daddr == daddr) {
1388                         rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1389                                 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1390                         /* set fnhe_daddr to 0 to ensure it won't bind with
1391                          * new dsts in rt_bind_exception().
1392                          */
1393                         fnhe->fnhe_daddr = 0;
1394                         fnhe_flush_routes(fnhe);
1395                         kfree_rcu(fnhe, rcu);
1396                         break;
1397                 }
1398                 fnhe_p = &fnhe->fnhe_next;
1399                 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1400                                                  lockdep_is_held(&fnhe_lock));
1401         }
1402
1403         spin_unlock_bh(&fnhe_lock);
1404 }
1405
1406 static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1407                                                __be32 daddr)
1408 {
1409         struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1410         struct fib_nh_exception *fnhe;
1411         u32 hval;
1412
1413         if (!hash)
1414                 return NULL;
1415
1416         hval = fnhe_hashfun(daddr);
1417
1418         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1419              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1420                 if (fnhe->fnhe_daddr == daddr) {
1421                         if (fnhe->fnhe_expires &&
1422                             time_after(jiffies, fnhe->fnhe_expires)) {
1423                                 ip_del_fnhe(nhc, daddr);
1424                                 break;
1425                         }
1426                         return fnhe;
1427                 }
1428         }
1429         return NULL;
1430 }
1431
1432 /* MTU selection:
1433  * 1. mtu on route is locked - use it
1434  * 2. mtu from nexthop exception
1435  * 3. mtu from egress device
1436  */
1437
1438 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1439 {
1440         struct fib_nh_common *nhc = res->nhc;
1441         struct net_device *dev = nhc->nhc_dev;
1442         struct fib_info *fi = res->fi;
1443         u32 mtu = 0;
1444
1445         if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) ||
1446             fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1447                 mtu = fi->fib_mtu;
1448
1449         if (likely(!mtu)) {
1450                 struct fib_nh_exception *fnhe;
1451
1452                 fnhe = find_exception(nhc, daddr);
1453                 if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1454                         mtu = fnhe->fnhe_pmtu;
1455         }
1456
1457         if (likely(!mtu))
1458                 mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1459
1460         return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1461 }
1462
1463 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1464                               __be32 daddr, const bool do_cache)
1465 {
1466         bool ret = false;
1467
1468         spin_lock_bh(&fnhe_lock);
1469
1470         if (daddr == fnhe->fnhe_daddr) {
1471                 struct rtable __rcu **porig;
1472                 struct rtable *orig;
1473                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1474
1475                 if (rt_is_input_route(rt))
1476                         porig = &fnhe->fnhe_rth_input;
1477                 else
1478                         porig = &fnhe->fnhe_rth_output;
1479                 orig = rcu_dereference(*porig);
1480
1481                 if (fnhe->fnhe_genid != genid) {
1482                         fnhe->fnhe_genid = genid;
1483                         fnhe->fnhe_gw = 0;
1484                         fnhe->fnhe_pmtu = 0;
1485                         fnhe->fnhe_expires = 0;
1486                         fnhe->fnhe_mtu_locked = false;
1487                         fnhe_flush_routes(fnhe);
1488                         orig = NULL;
1489                 }
1490                 fill_route_from_fnhe(rt, fnhe);
1491                 if (!rt->rt_gw4) {
1492                         rt->rt_gw4 = daddr;
1493                         rt->rt_gw_family = AF_INET;
1494                 }
1495
1496                 if (do_cache) {
1497                         dst_hold(&rt->dst);
1498                         rcu_assign_pointer(*porig, rt);
1499                         if (orig) {
1500                                 dst_dev_put(&orig->dst);
1501                                 dst_release(&orig->dst);
1502                         }
1503                         ret = true;
1504                 }
1505
1506                 fnhe->fnhe_stamp = jiffies;
1507         }
1508         spin_unlock_bh(&fnhe_lock);
1509
1510         return ret;
1511 }
1512
1513 static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1514 {
1515         struct rtable *orig, *prev, **p;
1516         bool ret = true;
1517
1518         if (rt_is_input_route(rt)) {
1519                 p = (struct rtable **)&nhc->nhc_rth_input;
1520         } else {
1521                 p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1522         }
1523         orig = *p;
1524
1525         /* hold dst before doing cmpxchg() to avoid race condition
1526          * on this dst
1527          */
1528         dst_hold(&rt->dst);
1529         prev = cmpxchg(p, orig, rt);
1530         if (prev == orig) {
1531                 if (orig) {
1532                         rt_add_uncached_list(orig);
1533                         dst_release(&orig->dst);
1534                 }
1535         } else {
1536                 dst_release(&rt->dst);
1537                 ret = false;
1538         }
1539
1540         return ret;
1541 }
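
/* A minimal sketch (assuming hypothetical 'slot', 'expected' and 'new'
 * variables) of the lock-free publish pattern rt_cache_route() uses above.
 * The reference is taken *before* the cmpxchg(): on success the cache slot
 * owns it and the displaced entry is released; on failure another CPU won
 * the race and the speculative reference is dropped again.
 *
 *	dst_hold(&new->dst);			// ref for the cache slot
 *	old = cmpxchg(slot, expected, new);
 *	if (old == expected) {			// 'new' is published
 *		if (expected)
 *			dst_release(&expected->dst);
 *	} else {				// lost the race
 *		dst_release(&new->dst);
 *	}
 */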
1542
1543 struct uncached_list {
1544         spinlock_t              lock;
1545         struct list_head        head;
1546 };
1547
1548 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1549
1550 void rt_add_uncached_list(struct rtable *rt)
1551 {
1552         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1553
1554         rt->rt_uncached_list = ul;
1555
1556         spin_lock_bh(&ul->lock);
1557         list_add_tail(&rt->rt_uncached, &ul->head);
1558         spin_unlock_bh(&ul->lock);
1559 }
1560
1561 void rt_del_uncached_list(struct rtable *rt)
1562 {
1563         if (!list_empty(&rt->rt_uncached)) {
1564                 struct uncached_list *ul = rt->rt_uncached_list;
1565
1566                 spin_lock_bh(&ul->lock);
1567                 list_del(&rt->rt_uncached);
1568                 spin_unlock_bh(&ul->lock);
1569         }
1570 }
1571
1572 static void ipv4_dst_destroy(struct dst_entry *dst)
1573 {
1574         struct rtable *rt = (struct rtable *)dst;
1575
1576         ip_dst_metrics_put(dst);
1577         rt_del_uncached_list(rt);
1578 }
1579
1580 void rt_flush_dev(struct net_device *dev)
1581 {
1582         struct rtable *rt;
1583         int cpu;
1584
1585         for_each_possible_cpu(cpu) {
1586                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1587
1588                 spin_lock_bh(&ul->lock);
1589                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1590                         if (rt->dst.dev != dev)
1591                                 continue;
1592                         rt->dst.dev = blackhole_netdev;
1593                         dev_hold(rt->dst.dev);
1594                         dev_put(dev);
1595                 }
1596                 spin_unlock_bh(&ul->lock);
1597         }
1598 }
1599
1600 static bool rt_cache_valid(const struct rtable *rt)
1601 {
1602         return  rt &&
1603                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1604                 !rt_is_expired(rt);
1605 }
1606
1607 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1608                            const struct fib_result *res,
1609                            struct fib_nh_exception *fnhe,
1610                            struct fib_info *fi, u16 type, u32 itag,
1611                            const bool do_cache)
1612 {
1613         bool cached = false;
1614
1615         if (fi) {
1616                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1617
1618                 if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1619                         rt->rt_uses_gateway = 1;
1620                         rt->rt_gw_family = nhc->nhc_gw_family;
1621                         /* only INET and INET6 are supported */
1622                         if (likely(nhc->nhc_gw_family == AF_INET))
1623                                 rt->rt_gw4 = nhc->nhc_gw.ipv4;
1624                         else
1625                                 rt->rt_gw6 = nhc->nhc_gw.ipv6;
1626                 }
1627
1628                 ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1629
1630 #ifdef CONFIG_IP_ROUTE_CLASSID
1631                 if (nhc->nhc_family == AF_INET) {
1632                         struct fib_nh *nh;
1633
1634                         nh = container_of(nhc, struct fib_nh, nh_common);
1635                         rt->dst.tclassid = nh->nh_tclassid;
1636                 }
1637 #endif
1638                 rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1639                 if (unlikely(fnhe))
1640                         cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1641                 else if (do_cache)
1642                         cached = rt_cache_route(nhc, rt);
1643                 if (unlikely(!cached)) {
1644                         /* Routes we intend to cache in nexthop exception or
1645                          * FIB nexthop have the DST_NOCACHE bit clear.
1646                          * However, if we are unsuccessful at storing this
1647                          * route into the cache we really need to set it.
1648                          */
1649                         if (!rt->rt_gw4) {
1650                                 rt->rt_gw_family = AF_INET;
1651                                 rt->rt_gw4 = daddr;
1652                         }
1653                         rt_add_uncached_list(rt);
1654                 }
1655         } else
1656                 rt_add_uncached_list(rt);
1657
1658 #ifdef CONFIG_IP_ROUTE_CLASSID
1659 #ifdef CONFIG_IP_MULTIPLE_TABLES
1660         set_class_tag(rt, res->tclassid);
1661 #endif
1662         set_class_tag(rt, itag);
1663 #endif
1664 }
1665
1666 struct rtable *rt_dst_alloc(struct net_device *dev,
1667                             unsigned int flags, u16 type,
1668                             bool nopolicy, bool noxfrm)
1669 {
1670         struct rtable *rt;
1671
1672         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1673                        (nopolicy ? DST_NOPOLICY : 0) |
1674                        (noxfrm ? DST_NOXFRM : 0));
1675
1676         if (rt) {
1677                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1678                 rt->rt_flags = flags;
1679                 rt->rt_type = type;
1680                 rt->rt_is_input = 0;
1681                 rt->rt_iif = 0;
1682                 rt->rt_pmtu = 0;
1683                 rt->rt_mtu_locked = 0;
1684                 rt->rt_uses_gateway = 0;
1685                 rt->rt_gw_family = 0;
1686                 rt->rt_gw4 = 0;
1687                 INIT_LIST_HEAD(&rt->rt_uncached);
1688
1689                 rt->dst.output = ip_output;
1690                 if (flags & RTCF_LOCAL)
1691                         rt->dst.input = ip_local_deliver;
1692         }
1693
1694         return rt;
1695 }
1696 EXPORT_SYMBOL(rt_dst_alloc);
1697
1698 struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1699 {
1700         struct rtable *new_rt;
1701
1702         new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1703                            rt->dst.flags);
1704
1705         if (new_rt) {
1706                 new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1707                 new_rt->rt_flags = rt->rt_flags;
1708                 new_rt->rt_type = rt->rt_type;
1709                 new_rt->rt_is_input = rt->rt_is_input;
1710                 new_rt->rt_iif = rt->rt_iif;
1711                 new_rt->rt_pmtu = rt->rt_pmtu;
1712                 new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1713                 new_rt->rt_gw_family = rt->rt_gw_family;
1714                 if (rt->rt_gw_family == AF_INET)
1715                         new_rt->rt_gw4 = rt->rt_gw4;
1716                 else if (rt->rt_gw_family == AF_INET6)
1717                         new_rt->rt_gw6 = rt->rt_gw6;
1718                 INIT_LIST_HEAD(&new_rt->rt_uncached);
1719
1720                 new_rt->dst.input = rt->dst.input;
1721                 new_rt->dst.output = rt->dst.output;
1722                 new_rt->dst.error = rt->dst.error;
1723                 new_rt->dst.lastuse = jiffies;
1724                 new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1725         }
1726         return new_rt;
1727 }
1728 EXPORT_SYMBOL(rt_dst_clone);
1729
1730 /* called in rcu_read_lock() section */
1731 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1732                           u8 tos, struct net_device *dev,
1733                           struct in_device *in_dev, u32 *itag)
1734 {
1735         int err;
1736
1737         /* Primary sanity checks. */
1738         if (!in_dev)
1739                 return -EINVAL;
1740
1741         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1742             skb->protocol != htons(ETH_P_IP))
1743                 return -EINVAL;
1744
1745         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1746                 return -EINVAL;
1747
1748         if (ipv4_is_zeronet(saddr)) {
1749                 if (!ipv4_is_local_multicast(daddr) &&
1750                     ip_hdr(skb)->protocol != IPPROTO_IGMP)
1751                         return -EINVAL;
1752         } else {
1753                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1754                                           in_dev, itag);
1755                 if (err < 0)
1756                         return err;
1757         }
1758         return 0;
1759 }
1760
1761 /* called in rcu_read_lock() section */
1762 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1763                              u8 tos, struct net_device *dev, int our)
1764 {
1765         struct in_device *in_dev = __in_dev_get_rcu(dev);
1766         unsigned int flags = RTCF_MULTICAST;
1767         struct rtable *rth;
1768         bool no_policy;
1769         u32 itag = 0;
1770         int err;
1771
1772         err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1773         if (err)
1774                 return err;
1775
1776         if (our)
1777                 flags |= RTCF_LOCAL;
1778
1779         no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY);
1780         if (no_policy)
1781                 IPCB(skb)->flags |= IPSKB_NOPOLICY;
1782
1783         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1784                            no_policy, false);
1785         if (!rth)
1786                 return -ENOBUFS;
1787
1788 #ifdef CONFIG_IP_ROUTE_CLASSID
1789         rth->dst.tclassid = itag;
1790 #endif
1791         rth->dst.output = ip_rt_bug;
1792         rth->rt_is_input = 1;
1793
1794 #ifdef CONFIG_IP_MROUTE
1795         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1796                 rth->dst.input = ip_mr_input;
1797 #endif
1798         RT_CACHE_STAT_INC(in_slow_mc);
1799
1800         skb_dst_drop(skb);
1801         skb_dst_set(skb, &rth->dst);
1802         return 0;
1803 }
1804
1805
1806 static void ip_handle_martian_source(struct net_device *dev,
1807                                      struct in_device *in_dev,
1808                                      struct sk_buff *skb,
1809                                      __be32 daddr,
1810                                      __be32 saddr)
1811 {
1812         RT_CACHE_STAT_INC(in_martian_src);
1813 #ifdef CONFIG_IP_ROUTE_VERBOSE
1814         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1815                 /*
1816                  *      RFC1812 recommendation: if the source is martian,
1817                  *      the only hint is the MAC header.
1818                  */
1819                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1820                         &daddr, &saddr, dev->name);
1821                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1822                         print_hex_dump(KERN_WARNING, "ll header: ",
1823                                        DUMP_PREFIX_OFFSET, 16, 1,
1824                                        skb_mac_header(skb),
1825                                        dev->hard_header_len, false);
1826                 }
1827         }
1828 #endif
1829 }
1830
1831 /* called in rcu_read_lock() section */
1832 static int __mkroute_input(struct sk_buff *skb,
1833                            const struct fib_result *res,
1834                            struct in_device *in_dev,
1835                            __be32 daddr, __be32 saddr, u32 tos)
1836 {
1837         struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1838         struct net_device *dev = nhc->nhc_dev;
1839         struct fib_nh_exception *fnhe;
1840         struct rtable *rth;
1841         int err;
1842         struct in_device *out_dev;
1843         bool do_cache, no_policy;
1844         u32 itag = 0;
1845
1846         /* get a working reference to the output device */
1847         out_dev = __in_dev_get_rcu(dev);
1848         if (!out_dev) {
1849                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1850                 return -EINVAL;
1851         }
1852
1853         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1854                                   in_dev->dev, in_dev, &itag);
1855         if (err < 0) {
1856                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1857                                          saddr);
1858
1859                 goto cleanup;
1860         }
1861
1862         do_cache = res->fi && !itag;
1863         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1864             skb->protocol == htons(ETH_P_IP)) {
1865                 __be32 gw;
1866
1867                 gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1868                 if (IN_DEV_SHARED_MEDIA(out_dev) ||
1869                     inet_addr_onlink(out_dev, saddr, gw))
1870                         IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1871         }
1872
1873         if (skb->protocol != htons(ETH_P_IP)) {
1874                 /* Not IP (i.e. ARP). Do not create a route if it is
1875                  * invalid for proxy arp. DNAT routes are always valid.
1876                  *
1877                  * The proxy arp feature has been extended to allow ARP
1878                  * replies back out the same interface, to support
1879                  * Private VLAN switch technologies. See arp.c.
1880                  */
1881                 if (out_dev == in_dev &&
1882                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1883                         err = -EINVAL;
1884                         goto cleanup;
1885                 }
1886         }
1887
1888         no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY);
1889         if (no_policy)
1890                 IPCB(skb)->flags |= IPSKB_NOPOLICY;
1891
1892         fnhe = find_exception(nhc, daddr);
1893         if (do_cache) {
1894                 if (fnhe)
1895                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1896                 else
1897                         rth = rcu_dereference(nhc->nhc_rth_input);
1898                 if (rt_cache_valid(rth)) {
1899                         skb_dst_set_noref(skb, &rth->dst);
1900                         goto out;
1901                 }
1902         }
1903
1904         rth = rt_dst_alloc(out_dev->dev, 0, res->type, no_policy,
1905                            IN_DEV_ORCONF(out_dev, NOXFRM));
1906         if (!rth) {
1907                 err = -ENOBUFS;
1908                 goto cleanup;
1909         }
1910
1911         rth->rt_is_input = 1;
1912         RT_CACHE_STAT_INC(in_slow_tot);
1913
1914         rth->dst.input = ip_forward;
1915
1916         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1917                        do_cache);
1918         lwtunnel_set_redirect(&rth->dst);
1919         skb_dst_set(skb, &rth->dst);
1920 out:
1921         err = 0;
1922  cleanup:
1923         return err;
1924 }
1925
1926 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1927 /* To make ICMP packets follow the right flow, the multipath hash is
1928  * calculated from the inner IP addresses.
1929  */
1930 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1931                                  struct flow_keys *hash_keys)
1932 {
1933         const struct iphdr *outer_iph = ip_hdr(skb);
1934         const struct iphdr *key_iph = outer_iph;
1935         const struct iphdr *inner_iph;
1936         const struct icmphdr *icmph;
1937         struct iphdr _inner_iph;
1938         struct icmphdr _icmph;
1939
1940         if (likely(outer_iph->protocol != IPPROTO_ICMP))
1941                 goto out;
1942
1943         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1944                 goto out;
1945
1946         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1947                                    &_icmph);
1948         if (!icmph)
1949                 goto out;
1950
1951         if (!icmp_is_err(icmph->type))
1952                 goto out;
1953
1954         inner_iph = skb_header_pointer(skb,
1955                                        outer_iph->ihl * 4 + sizeof(_icmph),
1956                                        sizeof(_inner_iph), &_inner_iph);
1957         if (!inner_iph)
1958                 goto out;
1959
1960         key_iph = inner_iph;
1961 out:
1962         hash_keys->addrs.v4addrs.src = key_iph->saddr;
1963         hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1964 }
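
/* Worked example (added commentary): a router balancing two paths forwards
 * a TCP flow 10.0.0.1 -> 10.0.1.1 via path A.  If a downstream hop returns
 * an ICMP error (e.g. "fragmentation needed"), the error's outer addresses
 * are reversed and would hash onto a different path.  The helper above
 * therefore pulls the inner IP header quoted in the ICMP payload and hashes
 * on those addresses, so the error follows the same nexthop as the flow
 * that triggered it.
 */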
1965
1966 /* If skb is set, it will be used and fl4 can be NULL. */
1967 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1968                        const struct sk_buff *skb, struct flow_keys *flkeys)
1969 {
1970         u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1971         struct flow_keys hash_keys;
1972         u32 mhash;
1973
1974         switch (READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_policy)) {
1975         case 0:
1976                 memset(&hash_keys, 0, sizeof(hash_keys));
1977                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1978                 if (skb) {
1979                         ip_multipath_l3_keys(skb, &hash_keys);
1980                 } else {
1981                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1982                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1983                 }
1984                 break;
1985         case 1:
1986                 /* skb is currently provided only when forwarding */
1987                 if (skb) {
1988                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1989                         struct flow_keys keys;
1990
1991                         /* short-circuit if we already have L4 hash present */
1992                         if (skb->l4_hash)
1993                                 return skb_get_hash_raw(skb) >> 1;
1994
1995                         memset(&hash_keys, 0, sizeof(hash_keys));
1996
1997                         if (!flkeys) {
1998                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
1999                                 flkeys = &keys;
2000                         }
2001
2002                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2003                         hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
2004                         hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
2005                         hash_keys.ports.src = flkeys->ports.src;
2006                         hash_keys.ports.dst = flkeys->ports.dst;
2007                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2008                 } else {
2009                         memset(&hash_keys, 0, sizeof(hash_keys));
2010                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2011                         hash_keys.addrs.v4addrs.src = fl4->saddr;
2012                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
2013                         hash_keys.ports.src = fl4->fl4_sport;
2014                         hash_keys.ports.dst = fl4->fl4_dport;
2015                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
2016                 }
2017                 break;
2018         case 2:
2019                 memset(&hash_keys, 0, sizeof(hash_keys));
2020                 /* skb is currently provided only when forwarding */
2021                 if (skb) {
2022                         struct flow_keys keys;
2023
2024                         skb_flow_dissect_flow_keys(skb, &keys, 0);
2025                         /* Inner can be v4 or v6 */
2026                         if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2027                                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2028                                 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
2029                                 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
2030                         } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2031                                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2032                                 hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
2033                                 hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
2034                                 hash_keys.tags.flow_label = keys.tags.flow_label;
2035                                 hash_keys.basic.ip_proto = keys.basic.ip_proto;
2036                         } else {
2037                                 /* Same as case 0 */
2038                                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2039                                 ip_multipath_l3_keys(skb, &hash_keys);
2040                         }
2041                 } else {
2042                         /* Same as case 0 */
2043                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2044                         hash_keys.addrs.v4addrs.src = fl4->saddr;
2045                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
2046                 }
2047                 break;
2048         }
2049         mhash = flow_hash_from_keys(&hash_keys);
2050
2051         if (multipath_hash)
2052                 mhash = jhash_2words(mhash, multipath_hash, 0);
2053
2054         return mhash >> 1;
2055 }
2056 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
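
/* Note (added commentary): the policy selector above corresponds to the
 * real sysctl net.ipv4.fib_multipath_hash_policy:
 *
 *	# sysctl -w net.ipv4.fib_multipath_hash_policy=0   (L3: src/dst address)
 *	# sysctl -w net.ipv4.fib_multipath_hash_policy=1   (L4: 5-tuple)
 *	# sysctl -w net.ipv4.fib_multipath_hash_policy=2   (L3, or inner L3 if present)
 *
 * With policy 1, two TCP connections between the same host pair can take
 * different paths because the ports enter the hash; with policy 0 they
 * always hash identically.
 */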
2057
2058 static int ip_mkroute_input(struct sk_buff *skb,
2059                             struct fib_result *res,
2060                             struct in_device *in_dev,
2061                             __be32 daddr, __be32 saddr, u32 tos,
2062                             struct flow_keys *hkeys)
2063 {
2064 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2065         if (res->fi && fib_info_num_path(res->fi) > 1) {
2066                 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2067
2068                 fib_select_multipath(res, h);
2069         }
2070 #endif
2071
2072         /* create a routing cache entry */
2073         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2074 }
2075
2076 /* Implements all the saddr-related checks as ip_route_input_slow(),
2077  * assuming daddr is valid and the destination is not a local broadcast one.
2078  * Uses the provided hint instead of performing a route lookup.
2079  */
2080 int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2081                       u8 tos, struct net_device *dev,
2082                       const struct sk_buff *hint)
2083 {
2084         struct in_device *in_dev = __in_dev_get_rcu(dev);
2085         struct rtable *rt = skb_rtable(hint);
2086         struct net *net = dev_net(dev);
2087         int err = -EINVAL;
2088         u32 tag = 0;
2089
2090         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2091                 goto martian_source;
2092
2093         if (ipv4_is_zeronet(saddr))
2094                 goto martian_source;
2095
2096         if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2097                 goto martian_source;
2098
2099         if (rt->rt_type != RTN_LOCAL)
2100                 goto skip_validate_source;
2101
2102         tos &= IPTOS_RT_MASK;
2103         err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2104         if (err < 0)
2105                 goto martian_source;
2106
2107 skip_validate_source:
2108         skb_dst_copy(skb, hint);
2109         return 0;
2110
2111 martian_source:
2112         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2113         return err;
2114 }
2115
2116 /* get device for dst_alloc with local routes */
2117 static struct net_device *ip_rt_get_dev(struct net *net,
2118                                         const struct fib_result *res)
2119 {
2120         struct fib_nh_common *nhc = res->fi ? res->nhc : NULL;
2121         struct net_device *dev = NULL;
2122
2123         if (nhc)
2124                 dev = l3mdev_master_dev_rcu(nhc->nhc_dev);
2125
2126         return dev ? : net->loopback_dev;
2127 }
2128
2129 /*
2130  *      NOTE. We drop all packets that have a local source
2131  *      address, because every properly looped-back packet
2132  *      must already have the correct destination attached by the output routine.
2133  *      Changes in the enforced policies must also be applied to
2134  *      ip_route_use_hint().
2135  *
2136  *      This approach solves two big problems:
2137  *      1. Non-simplex devices are handled properly.
2138  *      2. IP spoofing attempts are filtered with a 100% guarantee.
2139  *      Called with rcu_read_lock().
2140  */
2141
2142 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2143                                u8 tos, struct net_device *dev,
2144                                struct fib_result *res)
2145 {
2146         struct in_device *in_dev = __in_dev_get_rcu(dev);
2147         struct flow_keys *flkeys = NULL, _flkeys;
2148         struct net    *net = dev_net(dev);
2149         struct ip_tunnel_info *tun_info;
2150         int             err = -EINVAL;
2151         unsigned int    flags = 0;
2152         u32             itag = 0;
2153         struct rtable   *rth;
2154         struct flowi4   fl4;
2155         bool do_cache = true;
2156         bool no_policy;
2157
2158         /* IP on this device is disabled. */
2159
2160         if (!in_dev)
2161                 goto out;
2162
2163         /* Check for the most weird martians, which cannot be detected
2164            by fib_lookup.
2165          */
2166
2167         tun_info = skb_tunnel_info(skb);
2168         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2169                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2170         else
2171                 fl4.flowi4_tun_key.tun_id = 0;
2172         skb_dst_drop(skb);
2173
2174         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2175                 goto martian_source;
2176
2177         res->fi = NULL;
2178         res->table = NULL;
2179         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2180                 goto brd_input;
2181
2182         /* Accept zero addresses only to limited broadcast;
2183          * I do not even know whether to fix it or not. Waiting for complaints :-)
2184          */
2185         if (ipv4_is_zeronet(saddr))
2186                 goto martian_source;
2187
2188         if (ipv4_is_zeronet(daddr))
2189                 goto martian_destination;
2190
2191         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
2192          * more than once, calling it only if daddr and/or saddr is a loopback address.
2193          */
2194         if (ipv4_is_loopback(daddr)) {
2195                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2196                         goto martian_destination;
2197         } else if (ipv4_is_loopback(saddr)) {
2198                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2199                         goto martian_source;
2200         }
2201
2202         /*
2203          *      Now we are ready to route the packet.
2204          */
2205         fl4.flowi4_oif = 0;
2206         fl4.flowi4_iif = dev->ifindex;
2207         fl4.flowi4_mark = skb->mark;
2208         fl4.flowi4_tos = tos;
2209         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2210         fl4.flowi4_flags = 0;
2211         fl4.daddr = daddr;
2212         fl4.saddr = saddr;
2213         fl4.flowi4_uid = sock_net_uid(net, NULL);
2214         fl4.flowi4_multipath_hash = 0;
2215
2216         if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2217                 flkeys = &_flkeys;
2218         } else {
2219                 fl4.flowi4_proto = 0;
2220                 fl4.fl4_sport = 0;
2221                 fl4.fl4_dport = 0;
2222         }
2223
2224         err = fib_lookup(net, &fl4, res, 0);
2225         if (err != 0) {
2226                 if (!IN_DEV_FORWARD(in_dev))
2227                         err = -EHOSTUNREACH;
2228                 goto no_route;
2229         }
2230
2231         if (res->type == RTN_BROADCAST) {
2232                 if (IN_DEV_BFORWARD(in_dev))
2233                         goto make_route;
2234                 /* do not cache if bc_forwarding is enabled */
2235                 if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2236                         do_cache = false;
2237                 goto brd_input;
2238         }
2239
2240         if (res->type == RTN_LOCAL) {
2241                 err = fib_validate_source(skb, saddr, daddr, tos,
2242                                           0, dev, in_dev, &itag);
2243                 if (err < 0)
2244                         goto martian_source;
2245                 goto local_input;
2246         }
2247
2248         if (!IN_DEV_FORWARD(in_dev)) {
2249                 err = -EHOSTUNREACH;
2250                 goto no_route;
2251         }
2252         if (res->type != RTN_UNICAST)
2253                 goto martian_destination;
2254
2255 make_route:
2256         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2257 out:    return err;
2258
2259 brd_input:
2260         if (skb->protocol != htons(ETH_P_IP))
2261                 goto e_inval;
2262
2263         if (!ipv4_is_zeronet(saddr)) {
2264                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2265                                           in_dev, &itag);
2266                 if (err < 0)
2267                         goto martian_source;
2268         }
2269         flags |= RTCF_BROADCAST;
2270         res->type = RTN_BROADCAST;
2271         RT_CACHE_STAT_INC(in_brd);
2272
2273 local_input:
2274         no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY);
2275         if (no_policy)
2276                 IPCB(skb)->flags |= IPSKB_NOPOLICY;
2277
2278         do_cache &= res->fi && !itag;
2279         if (do_cache) {
2280                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2281
2282                 rth = rcu_dereference(nhc->nhc_rth_input);
2283                 if (rt_cache_valid(rth)) {
2284                         skb_dst_set_noref(skb, &rth->dst);
2285                         err = 0;
2286                         goto out;
2287                 }
2288         }
2289
2290         rth = rt_dst_alloc(ip_rt_get_dev(net, res),
2291                            flags | RTCF_LOCAL, res->type,
2292                            no_policy, false);
2293         if (!rth)
2294                 goto e_nobufs;
2295
2296         rth->dst.output = ip_rt_bug;
2297 #ifdef CONFIG_IP_ROUTE_CLASSID
2298         rth->dst.tclassid = itag;
2299 #endif
2300         rth->rt_is_input = 1;
2301
2302         RT_CACHE_STAT_INC(in_slow_tot);
2303         if (res->type == RTN_UNREACHABLE) {
2304                 rth->dst.input = ip_error;
2305                 rth->dst.error = -err;
2306                 rth->rt_flags   &= ~RTCF_LOCAL;
2307         }
2308
2309         if (do_cache) {
2310                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2311
2312                 rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2313                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2314                         WARN_ON(rth->dst.input == lwtunnel_input);
2315                         rth->dst.lwtstate->orig_input = rth->dst.input;
2316                         rth->dst.input = lwtunnel_input;
2317                 }
2318
2319                 if (unlikely(!rt_cache_route(nhc, rth)))
2320                         rt_add_uncached_list(rth);
2321         }
2322         skb_dst_set(skb, &rth->dst);
2323         err = 0;
2324         goto out;
2325
2326 no_route:
2327         RT_CACHE_STAT_INC(in_no_route);
2328         res->type = RTN_UNREACHABLE;
2329         res->fi = NULL;
2330         res->table = NULL;
2331         goto local_input;
2332
2333         /*
2334          *      Do not cache martian addresses: they should be logged (RFC1812)
2335          */
2336 martian_destination:
2337         RT_CACHE_STAT_INC(in_martian_dst);
2338 #ifdef CONFIG_IP_ROUTE_VERBOSE
2339         if (IN_DEV_LOG_MARTIANS(in_dev))
2340                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2341                                      &daddr, &saddr, dev->name);
2342 #endif
2343
2344 e_inval:
2345         err = -EINVAL;
2346         goto out;
2347
2348 e_nobufs:
2349         err = -ENOBUFS;
2350         goto out;
2351
2352 martian_source:
2353         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2354         goto out;
2355 }
2356
2357 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2358                          u8 tos, struct net_device *dev)
2359 {
2360         struct fib_result res;
2361         int err;
2362
2363         tos &= IPTOS_RT_MASK;
2364         rcu_read_lock();
2365         err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2366         rcu_read_unlock();
2367
2368         return err;
2369 }
2370 EXPORT_SYMBOL(ip_route_input_noref);
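
/* Usage sketch (illustrative; the call site named here is an assumption):
 * the receive path, e.g. ip_rcv_finish_core() in net/ipv4/ip_input.c,
 * resolves a dst for each incoming skb roughly as:
 *
 *	err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				   iph->tos, dev);
 *	if (unlikely(err))
 *		goto drop_error;
 *	// skb now carries a noref dst, valid within the RCU section
 */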
2371
2372 /* called with rcu_read_lock held */
2373 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2374                        u8 tos, struct net_device *dev, struct fib_result *res)
2375 {
2376         /* Multicast recognition logic was moved from the route cache to here.
2377            The problem was that too many Ethernet cards have broken/missing
2378            hardware multicast filters :-( As a result, a host on a multicast
2379            network acquires a lot of useless route cache entries, e.g. for
2380            SDR messages from all over the world. Now we try to get rid of them.
2381            Really, provided the software IP multicast filter is organized
2382            reasonably (at least, hashed), this does not result in a slowdown
2383            compared with route cache reject entries.
2384            Note that multicast routers are not affected, because a
2385            route cache entry is created eventually.
2386          */
2387         if (ipv4_is_multicast(daddr)) {
2388                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2389                 int our = 0;
2390                 int err = -EINVAL;
2391
2392                 if (!in_dev)
2393                         return err;
2394                 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2395                                       ip_hdr(skb)->protocol);
2396
2397                 /* check l3 master if no match yet */
2398                 if (!our && netif_is_l3_slave(dev)) {
2399                         struct in_device *l3_in_dev;
2400
2401                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2402                         if (l3_in_dev)
2403                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2404                                                       ip_hdr(skb)->protocol);
2405                 }
2406
2407                 if (our
2408 #ifdef CONFIG_IP_MROUTE
2409                         ||
2410                     (!ipv4_is_local_multicast(daddr) &&
2411                      IN_DEV_MFORWARD(in_dev))
2412 #endif
2413                    ) {
2414                         err = ip_route_input_mc(skb, daddr, saddr,
2415                                                 tos, dev, our);
2416                 }
2417                 return err;
2418         }
2419
2420         return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2421 }
2422
2423 /* called with rcu_read_lock() */
2424 static struct rtable *__mkroute_output(const struct fib_result *res,
2425                                        const struct flowi4 *fl4, int orig_oif,
2426                                        struct net_device *dev_out,
2427                                        unsigned int flags)
2428 {
2429         struct fib_info *fi = res->fi;
2430         struct fib_nh_exception *fnhe;
2431         struct in_device *in_dev;
2432         u16 type = res->type;
2433         struct rtable *rth;
2434         bool do_cache;
2435
2436         in_dev = __in_dev_get_rcu(dev_out);
2437         if (!in_dev)
2438                 return ERR_PTR(-EINVAL);
2439
2440         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2441                 if (ipv4_is_loopback(fl4->saddr) &&
2442                     !(dev_out->flags & IFF_LOOPBACK) &&
2443                     !netif_is_l3_master(dev_out))
2444                         return ERR_PTR(-EINVAL);
2445
2446         if (ipv4_is_lbcast(fl4->daddr))
2447                 type = RTN_BROADCAST;
2448         else if (ipv4_is_multicast(fl4->daddr))
2449                 type = RTN_MULTICAST;
2450         else if (ipv4_is_zeronet(fl4->daddr))
2451                 return ERR_PTR(-EINVAL);
2452
2453         if (dev_out->flags & IFF_LOOPBACK)
2454                 flags |= RTCF_LOCAL;
2455
2456         do_cache = true;
2457         if (type == RTN_BROADCAST) {
2458                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2459                 fi = NULL;
2460         } else if (type == RTN_MULTICAST) {
2461                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2462                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2463                                      fl4->flowi4_proto))
2464                         flags &= ~RTCF_LOCAL;
2465                 else
2466                         do_cache = false;
2467                 /* If a multicast route does not exist, use
2468                  * the default one, but do not use a gateway in this case.
2469                  * Yes, it is a hack.
2470                  */
2471                 if (fi && res->prefixlen < 4)
2472                         fi = NULL;
2473         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2474                    (orig_oif != dev_out->ifindex)) {
2475                 /* For local routes that require a particular output interface
2476                  * we do not want to cache the result.  Caching the result
2477                  * causes incorrect behaviour when there are multiple source
2478                  * addresses on the interface, the end result being that if the
2479                  * intended recipient is waiting on that interface for the
2480                  * packet he won't receive it because it will be delivered on
2481                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2482                  * be set to the loopback interface as well.
2483                  */
2484                 do_cache = false;
2485         }
2486
2487         fnhe = NULL;
2488         do_cache &= fi != NULL;
2489         if (fi) {
2490                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2491                 struct rtable __rcu **prth;
2492
2493                 fnhe = find_exception(nhc, fl4->daddr);
2494                 if (!do_cache)
2495                         goto add;
2496                 if (fnhe) {
2497                         prth = &fnhe->fnhe_rth_output;
2498                 } else {
2499                         if (unlikely(fl4->flowi4_flags &
2500                                      FLOWI_FLAG_KNOWN_NH &&
2501                                      !(nhc->nhc_gw_family &&
2502                                        nhc->nhc_scope == RT_SCOPE_LINK))) {
2503                                 do_cache = false;
2504                                 goto add;
2505                         }
2506                         prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2507                 }
2508                 rth = rcu_dereference(*prth);
2509                 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2510                         return rth;
2511         }
2512
2513 add:
2514         rth = rt_dst_alloc(dev_out, flags, type,
2515                            IN_DEV_ORCONF(in_dev, NOPOLICY),
2516                            IN_DEV_ORCONF(in_dev, NOXFRM));
2517         if (!rth)
2518                 return ERR_PTR(-ENOBUFS);
2519
2520         rth->rt_iif = orig_oif;
2521
2522         RT_CACHE_STAT_INC(out_slow_tot);
2523
2524         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2525                 if (flags & RTCF_LOCAL &&
2526                     !(dev_out->flags & IFF_LOOPBACK)) {
2527                         rth->dst.output = ip_mc_output;
2528                         RT_CACHE_STAT_INC(out_slow_mc);
2529                 }
2530 #ifdef CONFIG_IP_MROUTE
2531                 if (type == RTN_MULTICAST) {
2532                         if (IN_DEV_MFORWARD(in_dev) &&
2533                             !ipv4_is_local_multicast(fl4->daddr)) {
2534                                 rth->dst.input = ip_mr_input;
2535                                 rth->dst.output = ip_mc_output;
2536                         }
2537                 }
2538 #endif
2539         }
2540
2541         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2542         lwtunnel_set_redirect(&rth->dst);
2543
2544         return rth;
2545 }
2546
2547 /*
2548  * Major route resolver routine.
2549  */
2550
2551 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2552                                         const struct sk_buff *skb)
2553 {
2554         struct fib_result res = {
2555                 .type           = RTN_UNSPEC,
2556                 .fi             = NULL,
2557                 .table          = NULL,
2558                 .tclassid       = 0,
2559         };
2560         struct rtable *rth;
2561
2562         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2563         ip_rt_fix_tos(fl4);
2564
2565         rcu_read_lock();
2566         rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2567         rcu_read_unlock();
2568
2569         return rth;
2570 }
2571 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2572
2573 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2574                                             struct fib_result *res,
2575                                             const struct sk_buff *skb)
2576 {
2577         struct net_device *dev_out = NULL;
2578         int orig_oif = fl4->flowi4_oif;
2579         unsigned int flags = 0;
2580         struct rtable *rth;
2581         int err;
2582
2583         if (fl4->saddr) {
2584                 if (ipv4_is_multicast(fl4->saddr) ||
2585                     ipv4_is_lbcast(fl4->saddr) ||
2586                     ipv4_is_zeronet(fl4->saddr)) {
2587                         rth = ERR_PTR(-EINVAL);
2588                         goto out;
2589                 }
2590
2591                 rth = ERR_PTR(-ENETUNREACH);
2592
2593                 /* I removed check for oif == dev_out->oif here.
2594                    It was wrong for two reasons:
2595                    1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2596                       is assigned to multiple interfaces.
2597                    2. Moreover, we are allowed to send packets with saddr
2598                       of another iface. --ANK
2599                  */
2600
2601                 if (fl4->flowi4_oif == 0 &&
2602                     (ipv4_is_multicast(fl4->daddr) ||
2603                      ipv4_is_lbcast(fl4->daddr))) {
2604                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2605                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2606                         if (!dev_out)
2607                                 goto out;
2608
2609                         /* Special hack: the user can direct multicasts
2610                            and limited broadcast via the necessary interface
2611                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2612                            This hack is not just for fun, it allows
2613                            vic, vat and friends to work.
2614                            They bind a socket to loopback, set the ttl to zero
2615                            and expect that it will work.
2616                            From the viewpoint of the routing cache they are broken,
2617                            because we are not allowed to build a multicast path
2618                            with a loopback source addr (look, the routing cache
2619                            cannot know that the ttl is zero, so the packet
2620                            will not leave this host and the route is valid).
2621                            Luckily, this hack is a good workaround.
2622                          */
2623
2624                         fl4->flowi4_oif = dev_out->ifindex;
2625                         goto make_route;
2626                 }
2627
2628                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2629                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2630                         if (!__ip_dev_find(net, fl4->saddr, false))
2631                                 goto out;
2632                 }
2633         }
2634
2635
2636         if (fl4->flowi4_oif) {
2637                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2638                 rth = ERR_PTR(-ENODEV);
2639                 if (!dev_out)
2640                         goto out;
2641
2642                 /* RACE: Check return value of inet_select_addr instead. */
2643                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2644                         rth = ERR_PTR(-ENETUNREACH);
2645                         goto out;
2646                 }
2647                 if (ipv4_is_local_multicast(fl4->daddr) ||
2648                     ipv4_is_lbcast(fl4->daddr) ||
2649                     fl4->flowi4_proto == IPPROTO_IGMP) {
2650                         if (!fl4->saddr)
2651                                 fl4->saddr = inet_select_addr(dev_out, 0,
2652                                                               RT_SCOPE_LINK);
2653                         goto make_route;
2654                 }
2655                 if (!fl4->saddr) {
2656                         if (ipv4_is_multicast(fl4->daddr))
2657                                 fl4->saddr = inet_select_addr(dev_out, 0,
2658                                                               fl4->flowi4_scope);
2659                         else if (!fl4->daddr)
2660                                 fl4->saddr = inet_select_addr(dev_out, 0,
2661                                                               RT_SCOPE_HOST);
2662                 }
2663         }
2664
2665         if (!fl4->daddr) {
2666                 fl4->daddr = fl4->saddr;
2667                 if (!fl4->daddr)
2668                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2669                 dev_out = net->loopback_dev;
2670                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2671                 res->type = RTN_LOCAL;
2672                 flags |= RTCF_LOCAL;
2673                 goto make_route;
2674         }
2675
2676         err = fib_lookup(net, fl4, res, 0);
2677         if (err) {
2678                 res->fi = NULL;
2679                 res->table = NULL;
2680                 if (fl4->flowi4_oif &&
2681                     (ipv4_is_multicast(fl4->daddr) ||
2682                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2683                         /* Apparently, the routing tables are wrong. Assume
2684                            that the destination is on-link.
2685
2686                            WHY? DW.
2687                            Because we are allowed to send to an iface
2688                            even if it has NO routes and NO assigned
2689                            addresses. When oif is specified, the routing
2690                            tables are looked up with only one purpose:
2691                            to catch if the destination is gatewayed, rather than
2692                            direct. Moreover, if MSG_DONTROUTE is set,
2693                            we send the packet, ignoring both routing tables
2694                            and ifaddr state. --ANK
2695
2696
2697                            We could do this even if oif is unknown,
2698                            likely IPv6, but we do not.
2699                          */
2700
2701                         if (fl4->saddr == 0)
2702                                 fl4->saddr = inet_select_addr(dev_out, 0,
2703                                                               RT_SCOPE_LINK);
2704                         res->type = RTN_UNICAST;
2705                         goto make_route;
2706                 }
2707                 rth = ERR_PTR(err);
2708                 goto out;
2709         }
2710
2711         if (res->type == RTN_LOCAL) {
2712                 if (!fl4->saddr) {
2713                         if (res->fi->fib_prefsrc)
2714                                 fl4->saddr = res->fi->fib_prefsrc;
2715                         else
2716                                 fl4->saddr = fl4->daddr;
2717                 }
2718
2719                 /* L3 master device is the loopback for that domain */
2720                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2721                         net->loopback_dev;
2722
2723                 /* make sure orig_oif points to fib result device even
2724                  * though packet rx/tx happens over loopback or l3mdev
2725                  */
2726                 orig_oif = FIB_RES_OIF(*res);
2727
2728                 fl4->flowi4_oif = dev_out->ifindex;
2729                 flags |= RTCF_LOCAL;
2730                 goto make_route;
2731         }
2732
2733         fib_select_path(net, res, fl4, skb);
2734
2735         dev_out = FIB_RES_DEV(*res);
2736
2737 make_route:
2738         rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2739
2740 out:
2741         return rth;
2742 }
2743
2744 static struct dst_ops ipv4_dst_blackhole_ops = {
2745         .family                 = AF_INET,
2746         .default_advmss         = ipv4_default_advmss,
2747         .neigh_lookup           = ipv4_neigh_lookup,
2748         .check                  = dst_blackhole_check,
2749         .cow_metrics            = dst_blackhole_cow_metrics,
2750         .update_pmtu            = dst_blackhole_update_pmtu,
2751         .redirect               = dst_blackhole_redirect,
2752         .mtu                    = dst_blackhole_mtu,
2753 };
2754
2755 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2756 {
2757         struct rtable *ort = (struct rtable *) dst_orig;
2758         struct rtable *rt;
2759
2760         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2761         if (rt) {
2762                 struct dst_entry *new = &rt->dst;
2763
2764                 new->__use = 1;
2765                 new->input = dst_discard;
2766                 new->output = dst_discard_out;
2767
2768                 new->dev = net->loopback_dev;
2769                 if (new->dev)
2770                         dev_hold(new->dev);
2771
2772                 rt->rt_is_input = ort->rt_is_input;
2773                 rt->rt_iif = ort->rt_iif;
2774                 rt->rt_pmtu = ort->rt_pmtu;
2775                 rt->rt_mtu_locked = ort->rt_mtu_locked;
2776
2777                 rt->rt_genid = rt_genid_ipv4(net);
2778                 rt->rt_flags = ort->rt_flags;
2779                 rt->rt_type = ort->rt_type;
2780                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2781                 rt->rt_gw_family = ort->rt_gw_family;
2782                 if (rt->rt_gw_family == AF_INET)
2783                         rt->rt_gw4 = ort->rt_gw4;
2784                 else if (rt->rt_gw_family == AF_INET6)
2785                         rt->rt_gw6 = ort->rt_gw6;
2786
2787                 INIT_LIST_HEAD(&rt->rt_uncached);
2788         }
2789
2790         dst_release(dst_orig);
2791
2792         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2793 }
2794
2795 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2796                                     const struct sock *sk)
2797 {
2798         struct rtable *rt = __ip_route_output_key(net, flp4);
2799
2800         if (IS_ERR(rt))
2801                 return rt;
2802
2803         if (flp4->flowi4_proto) {
2804                 flp4->flowi4_oif = rt->dst.dev->ifindex;
2805                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2806                                                         flowi4_to_flowi(flp4),
2807                                                         sk, 0);
2808         }
2809
2810         return rt;
2811 }
2812 EXPORT_SYMBOL_GPL(ip_route_output_flow);
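
/* Usage sketch (hypothetical caller): output-path users build a flowi4 and
 * let this helper stack the xfrm policy lookup on top of the key lookup:
 *
 *	struct flowi4 fl4 = {
 *		.daddr		= daddr,
 *		.saddr		= saddr,
 *		.flowi4_proto	= IPPROTO_UDP,
 *	};
 *	struct rtable *rt = ip_route_output_flow(net, &fl4, sk);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 */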
2813
2814 struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
2815                                       struct net_device *dev,
2816                                       struct net *net, __be32 *saddr,
2817                                       const struct ip_tunnel_info *info,
2818                                       u8 protocol, bool use_cache)
2819 {
2820 #ifdef CONFIG_DST_CACHE
2821         struct dst_cache *dst_cache;
2822 #endif
2823         struct rtable *rt = NULL;
2824         struct flowi4 fl4;
2825         __u8 tos;
2826
2827 #ifdef CONFIG_DST_CACHE
2828         dst_cache = (struct dst_cache *)&info->dst_cache;
2829         if (use_cache) {
2830                 rt = dst_cache_get_ip4(dst_cache, saddr);
2831                 if (rt)
2832                         return rt;
2833         }
2834 #endif
2835         memset(&fl4, 0, sizeof(fl4));
2836         fl4.flowi4_mark = skb->mark;
2837         fl4.flowi4_proto = protocol;
2838         fl4.daddr = info->key.u.ipv4.dst;
2839         fl4.saddr = info->key.u.ipv4.src;
2840         tos = info->key.tos;
2841         fl4.flowi4_tos = RT_TOS(tos);
2842
2843         rt = ip_route_output_key(net, &fl4);
2844         if (IS_ERR(rt)) {
2845                 netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
2846                 return ERR_PTR(-ENETUNREACH);
2847         }
2848         if (rt->dst.dev == dev) { /* is this necessary? */
2849                 netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
2850                 ip_rt_put(rt);
2851                 return ERR_PTR(-ELOOP);
2852         }
2853 #ifdef CONFIG_DST_CACHE
2854         if (use_cache)
2855                 dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
2856 #endif
2857         *saddr = fl4.saddr;
2858         return rt;
2859 }
2860 EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
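
/* Usage sketch (illustrative; naming bareudp as an in-tree caller is an
 * assumption): collect-metadata tunnel drivers call this on transmit,
 * passing use_cache when the tunnel key is stable enough for the per-info
 * dst_cache to be safe:
 *
 *	rt = ip_route_output_tunnel(skb, dev, net, &saddr, info,
 *				    IPPROTO_UDP, use_cache);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *
 * On a cache hit the fib lookup is skipped entirely; on a miss the resolved
 * route and chosen source address are stored for the next packet.
 */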
2861
2862 /* called with rcu_read_lock held */
2863 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2864                         struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2865                         struct sk_buff *skb, u32 portid, u32 seq,
2866                         unsigned int flags)
2867 {
2868         struct rtmsg *r;
2869         struct nlmsghdr *nlh;
2870         unsigned long expires = 0;
2871         u32 error;
2872         u32 metrics[RTAX_MAX];
2873
2874         nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2875         if (!nlh)
2876                 return -EMSGSIZE;
2877
2878         r = nlmsg_data(nlh);
2879         r->rtm_family    = AF_INET;
2880         r->rtm_dst_len  = 32;
2881         r->rtm_src_len  = 0;
2882         r->rtm_tos      = fl4 ? fl4->flowi4_tos : 0;
2883         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2884         if (nla_put_u32(skb, RTA_TABLE, table_id))
2885                 goto nla_put_failure;
2886         r->rtm_type     = rt->rt_type;
2887         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2888         r->rtm_protocol = RTPROT_UNSPEC;
2889         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2890         if (rt->rt_flags & RTCF_NOTIFY)
2891                 r->rtm_flags |= RTM_F_NOTIFY;
2892         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2893                 r->rtm_flags |= RTCF_DOREDIRECT;
2894
2895         if (nla_put_in_addr(skb, RTA_DST, dst))
2896                 goto nla_put_failure;
2897         if (src) {
2898                 r->rtm_src_len = 32;
2899                 if (nla_put_in_addr(skb, RTA_SRC, src))
2900                         goto nla_put_failure;
2901         }
2902         if (rt->dst.dev &&
2903             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2904                 goto nla_put_failure;
2905 #ifdef CONFIG_IP_ROUTE_CLASSID
2906         if (rt->dst.tclassid &&
2907             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2908                 goto nla_put_failure;
2909 #endif
2910         if (fl4 && !rt_is_input_route(rt) &&
2911             fl4->saddr != src) {
2912                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2913                         goto nla_put_failure;
2914         }
2915         if (rt->rt_uses_gateway) {
2916                 if (rt->rt_gw_family == AF_INET &&
2917                     nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2918                         goto nla_put_failure;
2919                 } else if (rt->rt_gw_family == AF_INET6) {
2920                         int alen = sizeof(struct in6_addr);
2921                         struct nlattr *nla;
2922                         struct rtvia *via;
2923
2924                         nla = nla_reserve(skb, RTA_VIA, alen + 2);
2925                         if (!nla)
2926                                 goto nla_put_failure;
2927
2928                         via = nla_data(nla);
2929                         via->rtvia_family = AF_INET6;
2930                         memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2931                 }
2932         }
2933
2934         expires = rt->dst.expires;
2935         if (expires) {
2936                 unsigned long now = jiffies;
2937
2938                 if (time_before(now, expires))
2939                         expires -= now;
2940                 else
2941                         expires = 0;
2942         }
2943
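        /* RTAX_* metric ids are 1-based, hence the "- 1" when indexing
         * the metrics[] array below.
         */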
2944         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2945         if (rt->rt_pmtu && expires)
2946                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2947         if (rt->rt_mtu_locked && expires)
2948                 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2949         if (rtnetlink_put_metrics(skb, metrics) < 0)
2950                 goto nla_put_failure;
2951
2952         if (fl4) {
2953                 if (fl4->flowi4_mark &&
2954                     nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2955                         goto nla_put_failure;
2956
2957                 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2958                     nla_put_u32(skb, RTA_UID,
2959                                 from_kuid_munged(current_user_ns(),
2960                                                  fl4->flowi4_uid)))
2961                         goto nla_put_failure;
2962
2963                 if (rt_is_input_route(rt)) {
2964 #ifdef CONFIG_IP_MROUTE
2965                         if (ipv4_is_multicast(dst) &&
2966                             !ipv4_is_local_multicast(dst) &&
2967                             IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2968                                 int err = ipmr_get_route(net, skb,
2969                                                          fl4->saddr, fl4->daddr,
2970                                                          r, portid);
2971
2972                                 if (err <= 0) {
2973                                         if (err == 0)
2974                                                 return 0;
2975                                         goto nla_put_failure;
2976                                 }
2977                         } else
2978 #endif
2979                                 if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2980                                         goto nla_put_failure;
2981                 }
2982         }
2983
2984         error = rt->dst.error;
2985
2986         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2987                 goto nla_put_failure;
2988
2989         nlmsg_end(skb, nlh);
2990         return 0;
2991
2992 nla_put_failure:
2993         nlmsg_cancel(skb, nlh);
2994         return -EMSGSIZE;
2995 }
2996
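/* Walk one next-hop exception hash bucket and emit an RTM_NEWROUTE message
 * per live exception, skipping entries with a stale genid or an elapsed
 * expiry, and resuming at *fa_index for interrupted netlink dumps.
 */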
2997 static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
2998                             struct netlink_callback *cb, u32 table_id,
2999                             struct fnhe_hash_bucket *bucket, int genid,
3000                             int *fa_index, int fa_start, unsigned int flags)
3001 {
3002         int i;
3003
3004         for (i = 0; i < FNHE_HASH_SIZE; i++) {
3005                 struct fib_nh_exception *fnhe;
3006
3007                 for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
3008                      fnhe = rcu_dereference(fnhe->fnhe_next)) {
3009                         struct rtable *rt;
3010                         int err;
3011
3012                         if (*fa_index < fa_start)
3013                                 goto next;
3014
3015                         if (fnhe->fnhe_genid != genid)
3016                                 goto next;
3017
3018                         if (fnhe->fnhe_expires &&
3019                             time_after(jiffies, fnhe->fnhe_expires))
3020                                 goto next;
3021
3022                         rt = rcu_dereference(fnhe->fnhe_rth_input);
3023                         if (!rt)
3024                                 rt = rcu_dereference(fnhe->fnhe_rth_output);
3025                         if (!rt)
3026                                 goto next;
3027
3028                         err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
3029                                            table_id, NULL, skb,
3030                                            NETLINK_CB(cb->skb).portid,
3031                                            cb->nlh->nlmsg_seq, flags);
3032                         if (err)
3033                                 return err;
3034 next:
3035                         (*fa_index)++;
3036                 }
3037         }
3038
3039         return 0;
3040 }
3041
3042 int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
3043                        u32 table_id, struct fib_info *fi,
3044                        int *fa_index, int fa_start, unsigned int flags)
3045 {
3046         struct net *net = sock_net(cb->skb->sk);
3047         int nhsel, genid = fnhe_genid(net);
3048
3049         for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
3050                 struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
3051                 struct fnhe_hash_bucket *bucket;
3052                 int err;
3053
3054                 if (nhc->nhc_flags & RTNH_F_DEAD)
3055                         continue;
3056
3057                 rcu_read_lock();
3058                 bucket = rcu_dereference(nhc->nhc_exceptions);
3059                 err = 0;
3060                 if (bucket)
3061                         err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
3062                                                genid, fa_index, fa_start,
3063                                                flags);
3064                 rcu_read_unlock();
3065                 if (err)
3066                         return err;
3067         }
3068
3069         return 0;
3070 }
3071
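/* RTM_GETROUTE lookups run the real input/output routing paths, which
 * expect a packet.  Build a minimal dummy skb carrying just an IP header
 * plus a UDP/TCP/ICMP header so those paths have something to inspect.
 */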
3072 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
3073                                                    u8 ip_proto, __be16 sport,
3074                                                    __be16 dport)
3075 {
3076         struct sk_buff *skb;
3077         struct iphdr *iph;
3078
3079         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3080         if (!skb)
3081                 return NULL;
3082
3083         /* Reserve room for dummy headers; this skb can pass
3084          * through a good chunk of the routing engine.
3085          */
3086         skb_reset_mac_header(skb);
3087         skb_reset_network_header(skb);
3088         skb->protocol = htons(ETH_P_IP);
3089         iph = skb_put(skb, sizeof(struct iphdr));
3090         iph->protocol = ip_proto;
3091         iph->saddr = src;
3092         iph->daddr = dst;
3093         iph->version = 0x4;
3094         iph->frag_off = 0;
3095         iph->ihl = 0x5;
3096         skb_set_transport_header(skb, skb->len);
3097
3098         switch (iph->protocol) {
3099         case IPPROTO_UDP: {
3100                 struct udphdr *udph;
3101
3102                 udph = skb_put_zero(skb, sizeof(struct udphdr));
3103                 udph->source = sport;
3104                 udph->dest = dport;
3105                 udph->len = htons(sizeof(struct udphdr));
3106                 udph->check = 0;
3107                 break;
3108         }
3109         case IPPROTO_TCP: {
3110                 struct tcphdr *tcph;
3111
3112                 tcph = skb_put_zero(skb, sizeof(struct tcphdr));
3113                 tcph->source    = sport;
3114                 tcph->dest      = dport;
3115                 tcph->doff      = sizeof(struct tcphdr) / 4;
3116                 tcph->rst = 1;
3117                 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
3118                                             src, dst, 0);
3119                 break;
3120         }
3121         case IPPROTO_ICMP: {
3122                 struct icmphdr *icmph;
3123
3124                 icmph = skb_put_zero(skb, sizeof(struct icmphdr));
3125                 icmph->type = ICMP_ECHO;
3126                 icmph->code = 0;
3127         }
3128         }
3129
3130         return skb;
3131 }
3132
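/* Validate an RTM_GETROUTE request.  Legacy callers get the permissive
 * deprecated parse; sockets that opted into strict checking must pass a
 * fully sane header and only the attributes this handler understands.
 */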
3133 static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
3134                                        const struct nlmsghdr *nlh,
3135                                        struct nlattr **tb,
3136                                        struct netlink_ext_ack *extack)
3137 {
3138         struct rtmsg *rtm;
3139         int i, err;
3140
3141         if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
3142                 NL_SET_ERR_MSG(extack,
3143                                "ipv4: Invalid header for route get request");
3144                 return -EINVAL;
3145         }
3146
3147         if (!netlink_strict_get_check(skb))
3148                 return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
3149                                               rtm_ipv4_policy, extack);
3150
3151         rtm = nlmsg_data(nlh);
3152         if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
3153             (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
3154             rtm->rtm_table || rtm->rtm_protocol ||
3155             rtm->rtm_scope || rtm->rtm_type) {
3156                 NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
3157                 return -EINVAL;
3158         }
3159
3160         if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
3161                                RTM_F_LOOKUP_TABLE |
3162                                RTM_F_FIB_MATCH)) {
3163                 NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
3164                 return -EINVAL;
3165         }
3166
3167         err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
3168                                             rtm_ipv4_policy, extack);
3169         if (err)
3170                 return err;
3171
3172         if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
3173             (tb[RTA_DST] && !rtm->rtm_dst_len)) {
3174                 NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
3175                 return -EINVAL;
3176         }
3177
3178         for (i = 0; i <= RTA_MAX; i++) {
3179                 if (!tb[i])
3180                         continue;
3181
3182                 switch (i) {
3183                 case RTA_IIF:
3184                 case RTA_OIF:
3185                 case RTA_SRC:
3186                 case RTA_DST:
3187                 case RTA_IP_PROTO:
3188                 case RTA_SPORT:
3189                 case RTA_DPORT:
3190                 case RTA_MARK:
3191                 case RTA_UID:
3192                         break;
3193                 default:
3194                         NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
3195                         return -EINVAL;
3196                 }
3197         }
3198
3199         return 0;
3200 }
3201
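/* Doit handler for RTM_GETROUTE.  This is what iproute2's "ip route get"
 * ends up invoking; for example (addresses and ifname are illustrative)
 *
 *	ip route get 192.0.2.1 from 198.51.100.1 iif eth0
 *
 * maps to the RTA_DST, RTA_SRC and RTA_IIF attributes parsed below.
 */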
3202 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3203                              struct netlink_ext_ack *extack)
3204 {
3205         struct net *net = sock_net(in_skb->sk);
3206         struct nlattr *tb[RTA_MAX+1];
3207         u32 table_id = RT_TABLE_MAIN;
3208         __be16 sport = 0, dport = 0;
3209         struct fib_result res = {};
3210         u8 ip_proto = IPPROTO_UDP;
3211         struct rtable *rt = NULL;
3212         struct sk_buff *skb;
3213         struct rtmsg *rtm;
3214         struct flowi4 fl4 = {};
3215         __be32 dst = 0;
3216         __be32 src = 0;
3217         kuid_t uid;
3218         u32 iif;
3219         int err;
3220         int mark;
3221
3222         err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
3223         if (err < 0)
3224                 return err;
3225
3226         rtm = nlmsg_data(nlh);
3227         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
3228         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
3229         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
3230         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
3231         if (tb[RTA_UID])
3232                 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
3233         else
3234                 uid = (iif ? INVALID_UID : current_uid());
3235
3236         if (tb[RTA_IP_PROTO]) {
3237                 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
3238                                                   &ip_proto, AF_INET, extack);
3239                 if (err)
3240                         return err;
3241         }
3242
3243         if (tb[RTA_SPORT])
3244                 sport = nla_get_be16(tb[RTA_SPORT]);
3245
3246         if (tb[RTA_DPORT])
3247                 dport = nla_get_be16(tb[RTA_DPORT]);
3248
3249         skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
3250         if (!skb)
3251                 return -ENOBUFS;
3252
3253         fl4.daddr = dst;
3254         fl4.saddr = src;
3255         fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
3256         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
3257         fl4.flowi4_mark = mark;
3258         fl4.flowi4_uid = uid;
3259         if (sport)
3260                 fl4.fl4_sport = sport;
3261         if (dport)
3262                 fl4.fl4_dport = dport;
3263         fl4.flowi4_proto = ip_proto;
3264
3265         rcu_read_lock();
3266
3267         if (iif) {
3268                 struct net_device *dev;
3269
3270                 dev = dev_get_by_index_rcu(net, iif);
3271                 if (!dev) {
3272                         err = -ENODEV;
3273                         goto errout_rcu;
3274                 }
3275
3276                 fl4.flowi4_iif = iif; /* for rt_fill_info */
3277                 skb->dev        = dev;
3278                 skb->mark       = mark;
3279                 err = ip_route_input_rcu(skb, dst, src,
3280                                          rtm->rtm_tos & IPTOS_RT_MASK, dev,
3281                                          &res);
3282
3283                 rt = skb_rtable(skb);
3284                 if (err == 0 && rt->dst.error)
3285                         err = -rt->dst.error;
3286         } else {
3287                 fl4.flowi4_iif = LOOPBACK_IFINDEX;
3288                 skb->dev = net->loopback_dev;
3289                 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
3290                 err = 0;
3291                 if (IS_ERR(rt))
3292                         err = PTR_ERR(rt);
3293                 else
3294                         skb_dst_set(skb, &rt->dst);
3295         }
3296
3297         if (err)
3298                 goto errout_rcu;
3299
3300         if (rtm->rtm_flags & RTM_F_NOTIFY)
3301                 rt->rt_flags |= RTCF_NOTIFY;
3302
3303         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
3304                 table_id = res.table ? res.table->tb_id : 0;
3305
3306         /* reset skb for netlink reply msg */
3307         skb_trim(skb, 0);
3308         skb_reset_network_header(skb);
3309         skb_reset_transport_header(skb);
3310         skb_reset_mac_header(skb);
3311
3312         if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
3313                 struct fib_rt_info fri;
3314
3315                 if (!res.fi) {
3316                         err = fib_props[res.type].error;
3317                         if (!err)
3318                                 err = -EHOSTUNREACH;
3319                         goto errout_rcu;
3320                 }
3321                 fri.fi = res.fi;
3322                 fri.tb_id = table_id;
3323                 fri.dst = res.prefix;
3324                 fri.dst_len = res.prefixlen;
3325                 fri.tos = fl4.flowi4_tos;
3326                 fri.type = rt->rt_type;
3327                 fri.offload = 0;
3328                 fri.trap = 0;
3329                 if (res.fa_head) {
3330                         struct fib_alias *fa;
3331
3332                         hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
3333                                 u8 slen = 32 - fri.dst_len;
3334
3335                                 if (fa->fa_slen == slen &&
3336                                     fa->tb_id == fri.tb_id &&
3337                                     fa->fa_tos == fri.tos &&
3338                                     fa->fa_info == res.fi &&
3339                                     fa->fa_type == fri.type) {
3340                                         fri.offload = fa->offload;
3341                                         fri.trap = fa->trap;
3342                                         break;
3343                                 }
3344                         }
3345                 }
3346                 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
3347                                     nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
3348         } else {
3349                 err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
3350                                    NETLINK_CB(in_skb).portid,
3351                                    nlh->nlmsg_seq, 0);
3352         }
3353         if (err < 0)
3354                 goto errout_rcu;
3355
3356         rcu_read_unlock();
3357
3358         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3359
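        /* The success path falls through errout_free (a bare return);
         * errout_rcu is for failures that still hold rcu_read_lock and
         * still own the reply skb.
         */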
3360 errout_free:
3361         return err;
3362 errout_rcu:
3363         rcu_read_unlock();
3364         kfree_skb(skb);
3365         goto errout_free;
3366 }
3367
3368 void ip_rt_multicast_event(struct in_device *in_dev)
3369 {
3370         rt_cache_flush(dev_net(in_dev->dev));
3371 }
3372
3373 #ifdef CONFIG_SYSCTL
3374 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
3375 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
3376 static int ip_rt_gc_elasticity __read_mostly    = 8;
3377 static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;
3378
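/* Handler for /proc/sys/net/ipv4/route/flush: any write (for example
 * "echo 1 > /proc/sys/net/ipv4/route/flush") flushes the routing cache
 * and bumps the fnhe genid so cached exceptions are invalidated too;
 * reads are rejected with -EINVAL.
 */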
3379 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3380                 void *buffer, size_t *lenp, loff_t *ppos)
3381 {
3382         struct net *net = (struct net *)__ctl->extra1;
3383
3384         if (write) {
3385                 rt_cache_flush(net);
3386                 fnhe_genid_bump(net);
3387                 return 0;
3388         }
3389
3390         return -EINVAL;
3391 }
3392
3393 static struct ctl_table ipv4_route_table[] = {
3394         {
3395                 .procname       = "gc_thresh",
3396                 .data           = &ipv4_dst_ops.gc_thresh,
3397                 .maxlen         = sizeof(int),
3398                 .mode           = 0644,
3399                 .proc_handler   = proc_dointvec,
3400         },
3401         {
3402                 .procname       = "max_size",
3403                 .data           = &ip_rt_max_size,
3404                 .maxlen         = sizeof(int),
3405                 .mode           = 0644,
3406                 .proc_handler   = proc_dointvec,
3407         },
3408         {
3409                 /* Deprecated. Use gc_min_interval_ms */
3411                 .procname       = "gc_min_interval",
3412                 .data           = &ip_rt_gc_min_interval,
3413                 .maxlen         = sizeof(int),
3414                 .mode           = 0644,
3415                 .proc_handler   = proc_dointvec_jiffies,
3416         },
3417         {
3418                 .procname       = "gc_min_interval_ms",
3419                 .data           = &ip_rt_gc_min_interval,
3420                 .maxlen         = sizeof(int),
3421                 .mode           = 0644,
3422                 .proc_handler   = proc_dointvec_ms_jiffies,
3423         },
3424         {
3425                 .procname       = "gc_timeout",
3426                 .data           = &ip_rt_gc_timeout,
3427                 .maxlen         = sizeof(int),
3428                 .mode           = 0644,
3429                 .proc_handler   = proc_dointvec_jiffies,
3430         },
3431         {
3432                 .procname       = "gc_interval",
3433                 .data           = &ip_rt_gc_interval,
3434                 .maxlen         = sizeof(int),
3435                 .mode           = 0644,
3436                 .proc_handler   = proc_dointvec_jiffies,
3437         },
3438         {
3439                 .procname       = "redirect_load",
3440                 .data           = &ip_rt_redirect_load,
3441                 .maxlen         = sizeof(int),
3442                 .mode           = 0644,
3443                 .proc_handler   = proc_dointvec,
3444         },
3445         {
3446                 .procname       = "redirect_number",
3447                 .data           = &ip_rt_redirect_number,
3448                 .maxlen         = sizeof(int),
3449                 .mode           = 0644,
3450                 .proc_handler   = proc_dointvec,
3451         },
3452         {
3453                 .procname       = "redirect_silence",
3454                 .data           = &ip_rt_redirect_silence,
3455                 .maxlen         = sizeof(int),
3456                 .mode           = 0644,
3457                 .proc_handler   = proc_dointvec,
3458         },
3459         {
3460                 .procname       = "error_cost",
3461                 .data           = &ip_rt_error_cost,
3462                 .maxlen         = sizeof(int),
3463                 .mode           = 0644,
3464                 .proc_handler   = proc_dointvec,
3465         },
3466         {
3467                 .procname       = "error_burst",
3468                 .data           = &ip_rt_error_burst,
3469                 .maxlen         = sizeof(int),
3470                 .mode           = 0644,
3471                 .proc_handler   = proc_dointvec,
3472         },
3473         {
3474                 .procname       = "gc_elasticity",
3475                 .data           = &ip_rt_gc_elasticity,
3476                 .maxlen         = sizeof(int),
3477                 .mode           = 0644,
3478                 .proc_handler   = proc_dointvec,
3479         },
3480         {
3481                 .procname       = "mtu_expires",
3482                 .data           = &ip_rt_mtu_expires,
3483                 .maxlen         = sizeof(int),
3484                 .mode           = 0644,
3485                 .proc_handler   = proc_dointvec_jiffies,
3486         },
3487         {
3488                 .procname       = "min_pmtu",
3489                 .data           = &ip_rt_min_pmtu,
3490                 .maxlen         = sizeof(int),
3491                 .mode           = 0644,
3492                 .proc_handler   = proc_dointvec_minmax,
3493                 .extra1         = &ip_min_valid_pmtu,
3494         },
3495         {
3496                 .procname       = "min_adv_mss",
3497                 .data           = &ip_rt_min_advmss,
3498                 .maxlen         = sizeof(int),
3499                 .mode           = 0644,
3500                 .proc_handler   = proc_dointvec,
3501         },
3502         { }
3503 };
3504
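/* The table above appears as /proc/sys/net/ipv4/route/<procname>, also
 * reachable via sysctl(8), e.g.:
 *
 *	sysctl -w net.ipv4.route.min_pmtu=552
 */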
3505 static const char ipv4_route_flush_procname[] = "flush";
3506
3507 static struct ctl_table ipv4_route_flush_table[] = {
3508         {
3509                 .procname       = ipv4_route_flush_procname,
3510                 .maxlen         = sizeof(int),
3511                 .mode           = 0200,
3512                 .proc_handler   = ipv4_sysctl_rtcache_flush,
3513         },
3514         { },
3515 };
3516
3517 static __net_init int sysctl_route_net_init(struct net *net)
3518 {
3519         struct ctl_table *tbl;
3520
3521         tbl = ipv4_route_flush_table;
3522         if (!net_eq(net, &init_net)) {
3523                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3524                 if (!tbl)
3525                         goto err_dup;
3526
3527                 /* Don't export non-whitelisted sysctls to unprivileged users */
3528                 if (net->user_ns != &init_user_ns) {
3529                         if (tbl[0].procname != ipv4_route_flush_procname)
3530                                 tbl[0].procname = NULL;
3531                 }
3532         }
3533         tbl[0].extra1 = net;
3534
3535         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3536         if (!net->ipv4.route_hdr)
3537                 goto err_reg;
3538         return 0;
3539
3540 err_reg:
3541         if (tbl != ipv4_route_flush_table)
3542                 kfree(tbl);
3543 err_dup:
3544         return -ENOMEM;
3545 }
3546
3547 static __net_exit void sysctl_route_net_exit(struct net *net)
3548 {
3549         struct ctl_table *tbl;
3550
3551         tbl = net->ipv4.route_hdr->ctl_table_arg;
3552         unregister_net_sysctl_table(net->ipv4.route_hdr);
3553         BUG_ON(tbl == ipv4_route_flush_table);
3554         kfree(tbl);
3555 }
3556
3557 static __net_initdata struct pernet_operations sysctl_route_ops = {
3558         .init = sysctl_route_net_init,
3559         .exit = sysctl_route_net_exit,
3560 };
3561 #endif
3562
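/* Per-netns generation counters: bumping rt_genid (see rt_cache_flush())
 * invalidates every cached dst in the netns without walking them, and
 * fnhe_genid does the same for next-hop exceptions.
 */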
3563 static __net_init int rt_genid_init(struct net *net)
3564 {
3565         atomic_set(&net->ipv4.rt_genid, 0);
3566         atomic_set(&net->fnhe_genid, 0);
3567         atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3568         return 0;
3569 }
3570
3571 static __net_initdata struct pernet_operations rt_genid_ops = {
3572         .init = rt_genid_init,
3573 };
3574
3575 static int __net_init ipv4_inetpeer_init(struct net *net)
3576 {
3577         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3578
3579         if (!bp)
3580                 return -ENOMEM;
3581         inet_peer_base_init(bp);
3582         net->ipv4.peers = bp;
3583         return 0;
3584 }
3585
3586 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3587 {
3588         struct inet_peer_base *bp = net->ipv4.peers;
3589
3590         net->ipv4.peers = NULL;
3591         inetpeer_invalidate_tree(bp);
3592         kfree(bp);
3593 }
3594
3595 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3596         .init   =       ipv4_inetpeer_init,
3597         .exit   =       ipv4_inetpeer_exit,
3598 };
3599
3600 #ifdef CONFIG_IP_ROUTE_CLASSID
3601 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3602 #endif /* CONFIG_IP_ROUTE_CLASSID */
3603
3604 int __init ip_rt_init(void)
3605 {
3606         void *idents_hash;
3607         int cpu;
3608
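        /* ip_idents/ip_tstamps back IP ID generation (__ip_select_ident()):
         * a hashed array of counters plus last-use timestamps, seeded with
         * random bytes below so IDs are not trivially predictable.
         */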
3609         /* For modern hosts, this will use 2 MB of memory */
3610         idents_hash = alloc_large_system_hash("IP idents",
3611                                               sizeof(*ip_idents) + sizeof(*ip_tstamps),
3612                                               0,
3613                                               16, /* one bucket per 64 KB */
3614                                               HASH_ZERO,
3615                                               NULL,
3616                                               &ip_idents_mask,
3617                                               2048,
3618                                               256*1024);
3619
3620         ip_idents = idents_hash;
3621
3622         prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
3623
3624         ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
3625
3626         for_each_possible_cpu(cpu) {
3627                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3628
3629                 INIT_LIST_HEAD(&ul->head);
3630                 spin_lock_init(&ul->lock);
3631         }
3632 #ifdef CONFIG_IP_ROUTE_CLASSID
3633         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3634         if (!ip_rt_acct)
3635                 panic("IP: failed to allocate ip_rt_acct\n");
3636 #endif
3637
3638         ipv4_dst_ops.kmem_cachep =
3639                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3640                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3641
3642         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3643
3644         if (dst_entries_init(&ipv4_dst_ops) < 0)
3645                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3646
3647         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3648                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3649
3650         ipv4_dst_ops.gc_thresh = ~0;
3651         ip_rt_max_size = INT_MAX;
3652
3653         devinet_init();
3654         ip_fib_init();
3655
3656         if (ip_rt_proc_init())
3657                 pr_err("Unable to create route proc files\n");
3658 #ifdef CONFIG_XFRM
3659         xfrm_init();
3660         xfrm4_init();
3661 #endif
3662         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3663                       RTNL_FLAG_DOIT_UNLOCKED);
3664
3665 #ifdef CONFIG_SYSCTL
3666         register_pernet_subsys(&sysctl_route_ops);
3667 #endif
3668         register_pernet_subsys(&rt_genid_ops);
3669         register_pernet_subsys(&ipv4_inetpeer_ops);
3670         return 0;
3671 }
3672
3673 #ifdef CONFIG_SYSCTL
3674 /*
3675  * We really need to sanitize the damn ipv4 init order, then all
3676  * this nonsense will go away.
3677  */
3678 void __init ip_static_sysctl_init(void)
3679 {
3680         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3681 }
3682 #endif