GNU Linux-libre 5.10.217-gnu1
net/ipv4/route.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * INET         An implementation of the TCP/IP protocol suite for the LINUX
4  *              operating system.  INET is implemented using the  BSD Socket
5  *              interface as the means of communication with the user level.
6  *
7  *              ROUTE - implementation of the IP router.
8  *
9  * Authors:     Ross Biro
10  *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11  *              Alan Cox, <gw4pts@gw4pts.ampr.org>
12  *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
13  *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
14  *
15  * Fixes:
16  *              Alan Cox        :       Verify area fixes.
17  *              Alan Cox        :       cli() protects routing changes
18  *              Rui Oliveira    :       ICMP routing table updates
19  *              (rco@di.uminho.pt)      Routing table insertion and update
20  *              Linus Torvalds  :       Rewrote bits to be sensible
21  *              Alan Cox        :       Added BSD route gw semantics
22  *              Alan Cox        :       Super /proc >4K
23  *              Alan Cox        :       MTU in route table
24  *              Alan Cox        :       MSS actually. Also added the window
25  *                                      clamper.
26  *              Sam Lantinga    :       Fixed route matching in rt_del()
27  *              Alan Cox        :       Routing cache support.
28  *              Alan Cox        :       Removed compatibility cruft.
29  *              Alan Cox        :       RTF_REJECT support.
30  *              Alan Cox        :       TCP irtt support.
31  *              Jonathan Naylor :       Added Metric support.
32  *      Miquel van Smoorenburg  :       BSD API fixes.
33  *      Miquel van Smoorenburg  :       Metrics.
34  *              Alan Cox        :       Use __u32 properly
35  *              Alan Cox        :       Aligned routing errors more closely with BSD;
36  *                                      our system is still very different.
37  *              Alan Cox        :       Faster /proc handling
38  *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
39  *                                      routing caches and better behaviour.
40  *
41  *              Olaf Erb        :       irtt wasn't being copied right.
42  *              Bjorn Ekwall    :       Kerneld route support.
43  *              Alan Cox        :       Multicast fixed (I hope)
44  *              Pavel Krauz     :       Limited broadcast fixed
45  *              Mike McLagan    :       Routing by source
46  *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
47  *                                      route.c and rewritten from scratch.
48  *              Andi Kleen      :       Load-limit warning messages.
49  *      Vitaly E. Lavrov        :       Transparent proxy revived after a year in a coma.
50  *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
51  *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
52  *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
53  *              Marc Boucher    :       routing by fwmark
54  *      Robert Olsson           :       Added rt_cache statistics
55  *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
56  *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
57  *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
58  *      Ilia Sotnikov           :       Removed TOS from hash calculations
59  */
60
61 #define pr_fmt(fmt) "IPv4: " fmt
62
63 #include <linux/module.h>
64 #include <linux/uaccess.h>
65 #include <linux/bitops.h>
66 #include <linux/types.h>
67 #include <linux/kernel.h>
68 #include <linux/mm.h>
69 #include <linux/memblock.h>
70 #include <linux/string.h>
71 #include <linux/socket.h>
72 #include <linux/sockios.h>
73 #include <linux/errno.h>
74 #include <linux/in.h>
75 #include <linux/inet.h>
76 #include <linux/netdevice.h>
77 #include <linux/proc_fs.h>
78 #include <linux/init.h>
79 #include <linux/skbuff.h>
80 #include <linux/inetdevice.h>
81 #include <linux/igmp.h>
82 #include <linux/pkt_sched.h>
83 #include <linux/mroute.h>
84 #include <linux/netfilter_ipv4.h>
85 #include <linux/random.h>
86 #include <linux/rcupdate.h>
87 #include <linux/times.h>
88 #include <linux/slab.h>
89 #include <linux/jhash.h>
90 #include <net/dst.h>
91 #include <net/dst_metadata.h>
92 #include <net/net_namespace.h>
93 #include <net/protocol.h>
94 #include <net/ip.h>
95 #include <net/route.h>
96 #include <net/inetpeer.h>
97 #include <net/sock.h>
98 #include <net/ip_fib.h>
99 #include <net/nexthop.h>
100 #include <net/arp.h>
101 #include <net/tcp.h>
102 #include <net/icmp.h>
103 #include <net/xfrm.h>
104 #include <net/lwtunnel.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
107 #ifdef CONFIG_SYSCTL
108 #include <linux/sysctl.h>
109 #endif
110 #include <net/secure_seq.h>
111 #include <net/ip_tunnels.h>
112 #include <net/l3mdev.h>
113
114 #include "fib_lookup.h"
115
116 #define RT_FL_TOS(oldflp4) \
117         ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
118
119 #define RT_GC_TIMEOUT (300*HZ)
120
121 static int ip_rt_max_size;
122 static int ip_rt_redirect_number __read_mostly  = 9;
123 static int ip_rt_redirect_load __read_mostly    = HZ / 50;
124 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
125 static int ip_rt_error_cost __read_mostly       = HZ;
126 static int ip_rt_error_burst __read_mostly      = 5 * HZ;
127 static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
128 static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
129 static int ip_rt_min_advmss __read_mostly       = 256;
130
131 static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;
132
133 /*
134  *      Interface to generic destination cache.
135  */
136
137 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
138 static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
139 static unsigned int      ipv4_mtu(const struct dst_entry *dst);
140 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
141 static void              ipv4_link_failure(struct sk_buff *skb);
142 static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
143                                            struct sk_buff *skb, u32 mtu,
144                                            bool confirm_neigh);
145 static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
146                                         struct sk_buff *skb);
147 static void             ipv4_dst_destroy(struct dst_entry *dst);
148
149 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
150 {
151         WARN_ON(1);
152         return NULL;
153 }
154
155 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
156                                            struct sk_buff *skb,
157                                            const void *daddr);
158 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
159
160 static struct dst_ops ipv4_dst_ops = {
161         .family =               AF_INET,
162         .check =                ipv4_dst_check,
163         .default_advmss =       ipv4_default_advmss,
164         .mtu =                  ipv4_mtu,
165         .cow_metrics =          ipv4_cow_metrics,
166         .destroy =              ipv4_dst_destroy,
167         .negative_advice =      ipv4_negative_advice,
168         .link_failure =         ipv4_link_failure,
169         .update_pmtu =          ip_rt_update_pmtu,
170         .redirect =             ip_do_redirect,
171         .local_out =            __ip_local_out,
172         .neigh_lookup =         ipv4_neigh_lookup,
173         .confirm_neigh =        ipv4_confirm_neigh,
174 };
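/* These handlers are rarely called directly; generic wrappers in
 * <net/dst.h> dispatch through dst->ops.  A simplified sketch of one such
 * wrapper:
 *
 *	static inline u32 dst_mtu(const struct dst_entry *dst)
 *	{
 *		return dst->ops->mtu(dst);
 *	}
 *
 * so dst_mtu() on an IPv4 rtable ends up in ipv4_mtu() below.
 */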
175
176 #define ECN_OR_COST(class)      TC_PRIO_##class
177
178 const __u8 ip_tos2prio[16] = {
179         TC_PRIO_BESTEFFORT,
180         ECN_OR_COST(BESTEFFORT),
181         TC_PRIO_BESTEFFORT,
182         ECN_OR_COST(BESTEFFORT),
183         TC_PRIO_BULK,
184         ECN_OR_COST(BULK),
185         TC_PRIO_BULK,
186         ECN_OR_COST(BULK),
187         TC_PRIO_INTERACTIVE,
188         ECN_OR_COST(INTERACTIVE),
189         TC_PRIO_INTERACTIVE,
190         ECN_OR_COST(INTERACTIVE),
191         TC_PRIO_INTERACTIVE_BULK,
192         ECN_OR_COST(INTERACTIVE_BULK),
193         TC_PRIO_INTERACTIVE_BULK,
194         ECN_OR_COST(INTERACTIVE_BULK)
195 };
196 EXPORT_SYMBOL(ip_tos2prio);
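/* A sketch of how this table is consumed: rt_tos2priority() in
 * <net/route.h> masks off the low (ECN) bit and indexes the table with the
 * remaining TOS bits, roughly:
 *
 *	static inline char rt_tos2priority(u8 tos)
 *	{
 *		return ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *	}
 *
 * e.g. a TOS of 0x10 (IPTOS_LOWDELAY) maps to TC_PRIO_INTERACTIVE.
 */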
197
198 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
199 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
200
201 #ifdef CONFIG_PROC_FS
202 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
203 {
204         if (*pos)
205                 return NULL;
206         return SEQ_START_TOKEN;
207 }
208
209 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
210 {
211         ++*pos;
212         return NULL;
213 }
214
215 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
216 {
217 }
218
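/* The IPv4 routing cache itself was removed in kernel 3.6, so this show
 * handler emits only the historical header line; the file is kept so that
 * userspace tools parsing /proc/net/rt_cache keep working.
 */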
219 static int rt_cache_seq_show(struct seq_file *seq, void *v)
220 {
221         if (v == SEQ_START_TOKEN)
222                 seq_printf(seq, "%-127s\n",
223                            "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
224                            "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
225                            "HHUptod\tSpecDst");
226         return 0;
227 }
228
229 static const struct seq_operations rt_cache_seq_ops = {
230         .start  = rt_cache_seq_start,
231         .next   = rt_cache_seq_next,
232         .stop   = rt_cache_seq_stop,
233         .show   = rt_cache_seq_show,
234 };
235
236 static int rt_cache_seq_open(struct inode *inode, struct file *file)
237 {
238         return seq_open(file, &rt_cache_seq_ops);
239 }
240
241 static const struct proc_ops rt_cache_proc_ops = {
242         .proc_open      = rt_cache_seq_open,
243         .proc_read      = seq_read,
244         .proc_lseek     = seq_lseek,
245         .proc_release   = seq_release,
246 };
247
248
249 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
250 {
251         int cpu;
252
253         if (*pos == 0)
254                 return SEQ_START_TOKEN;
255
256         for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
257                 if (!cpu_possible(cpu))
258                         continue;
259                 *pos = cpu+1;
260                 return &per_cpu(rt_cache_stat, cpu);
261         }
262         return NULL;
263 }
264
265 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
266 {
267         int cpu;
268
269         for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
270                 if (!cpu_possible(cpu))
271                         continue;
272                 *pos = cpu+1;
273                 return &per_cpu(rt_cache_stat, cpu);
274         }
275         (*pos)++;
276         return NULL;
277
278 }
279
280 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
281 {
282
283 }
284
285 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
286 {
287         struct rt_cache_stat *st = v;
288
289         if (v == SEQ_START_TOKEN) {
290                 seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
291                 return 0;
292         }
293
294         seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
295                    " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
296                    dst_entries_get_slow(&ipv4_dst_ops),
297                    0, /* st->in_hit */
298                    st->in_slow_tot,
299                    st->in_slow_mc,
300                    st->in_no_route,
301                    st->in_brd,
302                    st->in_martian_dst,
303                    st->in_martian_src,
304
305                    0, /* st->out_hit */
306                    st->out_slow_tot,
307                    st->out_slow_mc,
308
309                    0, /* st->gc_total */
310                    0, /* st->gc_ignored */
311                    0, /* st->gc_goal_miss */
312                    0, /* st->gc_dst_overflow */
313                    0, /* st->in_hlist_search */
314                    0  /* st->out_hlist_search */
315                 );
316         return 0;
317 }
318
319 static const struct seq_operations rt_cpu_seq_ops = {
320         .start  = rt_cpu_seq_start,
321         .next   = rt_cpu_seq_next,
322         .stop   = rt_cpu_seq_stop,
323         .show   = rt_cpu_seq_show,
324 };
325
326
327 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
328 {
329         return seq_open(file, &rt_cpu_seq_ops);
330 }
331
332 static const struct proc_ops rt_cpu_proc_ops = {
333         .proc_open      = rt_cpu_seq_open,
334         .proc_read      = seq_read,
335         .proc_lseek     = seq_lseek,
336         .proc_release   = seq_release,
337 };
338
339 #ifdef CONFIG_IP_ROUTE_CLASSID
340 static int rt_acct_proc_show(struct seq_file *m, void *v)
341 {
342         struct ip_rt_acct *dst, *src;
343         unsigned int i, j;
344
345         dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
346         if (!dst)
347                 return -ENOMEM;
348
349         for_each_possible_cpu(i) {
350                 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
351                 for (j = 0; j < 256; j++) {
352                         dst[j].o_bytes   += src[j].o_bytes;
353                         dst[j].o_packets += src[j].o_packets;
354                         dst[j].i_bytes   += src[j].i_bytes;
355                         dst[j].i_packets += src[j].i_packets;
356                 }
357         }
358
359         seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
360         kfree(dst);
361         return 0;
362 }
363 #endif
364
365 static int __net_init ip_rt_do_proc_init(struct net *net)
366 {
367         struct proc_dir_entry *pde;
368
369         pde = proc_create("rt_cache", 0444, net->proc_net,
370                           &rt_cache_proc_ops);
371         if (!pde)
372                 goto err1;
373
374         pde = proc_create("rt_cache", 0444,
375                           net->proc_net_stat, &rt_cpu_proc_ops);
376         if (!pde)
377                 goto err2;
378
379 #ifdef CONFIG_IP_ROUTE_CLASSID
380         pde = proc_create_single("rt_acct", 0, net->proc_net,
381                         rt_acct_proc_show);
382         if (!pde)
383                 goto err3;
384 #endif
385         return 0;
386
387 #ifdef CONFIG_IP_ROUTE_CLASSID
388 err3:
389         remove_proc_entry("rt_cache", net->proc_net_stat);
390 #endif
391 err2:
392         remove_proc_entry("rt_cache", net->proc_net);
393 err1:
394         return -ENOMEM;
395 }
396
397 static void __net_exit ip_rt_do_proc_exit(struct net *net)
398 {
399         remove_proc_entry("rt_cache", net->proc_net_stat);
400         remove_proc_entry("rt_cache", net->proc_net);
401 #ifdef CONFIG_IP_ROUTE_CLASSID
402         remove_proc_entry("rt_acct", net->proc_net);
403 #endif
404 }
405
406 static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
407         .init = ip_rt_do_proc_init,
408         .exit = ip_rt_do_proc_exit,
409 };
410
411 static int __init ip_rt_proc_init(void)
412 {
413         return register_pernet_subsys(&ip_rt_proc_ops);
414 }
415
416 #else
417 static inline int ip_rt_proc_init(void)
418 {
419         return 0;
420 }
421 #endif /* CONFIG_PROC_FS */
422
423 static inline bool rt_is_expired(const struct rtable *rth)
424 {
425         return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
426 }
427
428 void rt_cache_flush(struct net *net)
429 {
430         rt_genid_bump_ipv4(net);
431 }
432
433 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
434                                            struct sk_buff *skb,
435                                            const void *daddr)
436 {
437         const struct rtable *rt = container_of(dst, struct rtable, dst);
438         struct net_device *dev = dst->dev;
439         struct neighbour *n;
440
441         rcu_read_lock_bh();
442
443         if (likely(rt->rt_gw_family == AF_INET)) {
444                 n = ip_neigh_gw4(dev, rt->rt_gw4);
445         } else if (rt->rt_gw_family == AF_INET6) {
446                 n = ip_neigh_gw6(dev, &rt->rt_gw6);
447         } else {
448                 __be32 pkey;
449
450                 pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
451                 n = ip_neigh_gw4(dev, pkey);
452         }
453
454         if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
455                 n = NULL;
456
457         rcu_read_unlock_bh();
458
459         return n;
460 }
461
462 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
463 {
464         const struct rtable *rt = container_of(dst, struct rtable, dst);
465         struct net_device *dev = dst->dev;
466         const __be32 *pkey = daddr;
467
468         if (rt->rt_gw_family == AF_INET) {
469                 pkey = (const __be32 *)&rt->rt_gw4;
470         } else if (rt->rt_gw_family == AF_INET6) {
471                 return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
472         } else if (!daddr ||
473                  (rt->rt_flags &
474                   (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
475                 return;
476         }
477         __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
478 }
479
480 /* Hash tables of size 2048..262144 depending on RAM size.
481  * Each bucket uses 8 bytes.
482  */
483 static u32 ip_idents_mask __read_mostly;
484 static atomic_t *ip_idents __read_mostly;
485 static u32 *ip_tstamps __read_mostly;
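/* Worked sizing example: at the minimum of 2048 buckets the ident and
 * timestamp arrays together cost 2048 * 8 = 16 KiB; at the maximum of
 * 262144 buckets they cost 2 MiB.
 */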
486
487 /* In order to protect privacy, we add a perturbation to identifiers
488  * if one generator is seldom used. This makes it hard for an attacker
489  * to infer how many packets were sent between two points in time.
490  */
491 u32 ip_idents_reserve(u32 hash, int segs)
492 {
493         u32 bucket, old, now = (u32)jiffies;
494         atomic_t *p_id;
495         u32 *p_tstamp;
496         u32 delta = 0;
497
498         bucket = hash & ip_idents_mask;
499         p_tstamp = ip_tstamps + bucket;
500         p_id = ip_idents + bucket;
501         old = READ_ONCE(*p_tstamp);
502
503         if (old != now && cmpxchg(p_tstamp, old, now) == old)
504                 delta = prandom_u32_max(now - old);
505
506         /* If UBSAN reports an error here, first make sure your compiler
507          * supports -fno-strict-overflow; the report is a false positive
508          * caused by a bug in UBSAN that was fixed in GCC 8.
509          */
510         return atomic_add_return(segs + delta, p_id) - segs;
511 }
512 EXPORT_SYMBOL(ip_idents_reserve);
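/* A minimal model of the scheme above (hypothetical helper, not part of
 * the kernel API), showing why sampling IDs from a seldom-used bucket does
 * not reveal the true packet count:
 *
 *	static u32 model_reserve(u32 *id, u32 *tstamp, u32 now, int segs)
 *	{
 *		u32 delta = 0;
 *
 *		if (*tstamp != now) {
 *			delta = prandom_u32_max(now - *tstamp);
 *			*tstamp = now;
 *		}
 *		*id += segs + delta;
 *		return *id - segs;
 *	}
 *
 * The longer a bucket sits idle, the larger the random jump an observer
 * sees between two consecutive sampled IDs.
 */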
513
514 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
515 {
516         u32 hash, id;
517
518         /* Racy lazy init of ip_id_key: a race merely generates the key twice, which is harmless. */
519         if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
520                 get_random_bytes(&net->ipv4.ip_id_key,
521                                  sizeof(net->ipv4.ip_id_key));
522
523         hash = siphash_3u32((__force u32)iph->daddr,
524                             (__force u32)iph->saddr,
525                             iph->protocol,
526                             &net->ipv4.ip_id_key);
527         id = ip_idents_reserve(hash, segs);
528         iph->id = htons(id);
529 }
530 EXPORT_SYMBOL(__ip_select_ident);
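/* For reference, the usual entry point is ip_select_ident_segs() in
 * <net/ip.h>: DF packets on connected sockets bump inet->inet_id directly,
 * while everything else falls back to this helper, roughly:
 *
 *	__ip_select_ident(net, ip_hdr(skb), skb_shinfo(skb)->gso_segs ?: 1);
 *
 * passing the GSO segment count so a single call reserves the whole run.
 */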
531
532 static void ip_rt_fix_tos(struct flowi4 *fl4)
533 {
534         __u8 tos = RT_FL_TOS(fl4);
535
536         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
537         fl4->flowi4_scope = tos & RTO_ONLINK ?
538                             RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
539 }
540
541 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
542                              const struct sock *sk,
543                              const struct iphdr *iph,
544                              int oif, u8 tos,
545                              u8 prot, u32 mark, int flow_flags)
546 {
547         if (sk) {
548                 const struct inet_sock *inet = inet_sk(sk);
549
550                 oif = sk->sk_bound_dev_if;
551                 mark = sk->sk_mark;
552                 tos = RT_CONN_FLAGS(sk);
553                 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
554         }
555         flowi4_init_output(fl4, oif, mark, tos,
556                            RT_SCOPE_UNIVERSE, prot,
557                            flow_flags,
558                            iph->daddr, iph->saddr, 0, 0,
559                            sock_net_uid(net, sk));
560 }
561
562 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
563                                const struct sock *sk)
564 {
565         const struct net *net = dev_net(skb->dev);
566         const struct iphdr *iph = ip_hdr(skb);
567         int oif = skb->dev->ifindex;
568         u8 tos = RT_TOS(iph->tos);
569         u8 prot = iph->protocol;
570         u32 mark = skb->mark;
571
572         __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
573 }
574
575 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
576 {
577         const struct inet_sock *inet = inet_sk(sk);
578         const struct ip_options_rcu *inet_opt;
579         __be32 daddr = inet->inet_daddr;
580
581         rcu_read_lock();
582         inet_opt = rcu_dereference(inet->inet_opt);
583         if (inet_opt && inet_opt->opt.srr)
584                 daddr = inet_opt->opt.faddr;
585         flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
586                            RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
587                            inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
588                            inet_sk_flowi_flags(sk),
589                            daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
590         rcu_read_unlock();
591 }
592
593 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
594                                  const struct sk_buff *skb)
595 {
596         if (skb)
597                 build_skb_flow_key(fl4, skb, sk);
598         else
599                 build_sk_flow_key(fl4, sk);
600 }
601
602 static DEFINE_SPINLOCK(fnhe_lock);
603
604 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
605 {
606         struct rtable *rt;
607
608         rt = rcu_dereference(fnhe->fnhe_rth_input);
609         if (rt) {
610                 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
611                 dst_dev_put(&rt->dst);
612                 dst_release(&rt->dst);
613         }
614         rt = rcu_dereference(fnhe->fnhe_rth_output);
615         if (rt) {
616                 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
617                 dst_dev_put(&rt->dst);
618                 dst_release(&rt->dst);
619         }
620 }
621
622 static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
623 {
624         struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
625         struct fib_nh_exception *fnhe, *oldest = NULL;
626
627         for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
628                 fnhe = rcu_dereference_protected(*fnhe_p,
629                                                  lockdep_is_held(&fnhe_lock));
630                 if (!fnhe)
631                         break;
632                 if (!oldest ||
633                     time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
634                         oldest = fnhe;
635                         oldest_p = fnhe_p;
636                 }
637         }
638         fnhe_flush_routes(oldest);
639         *oldest_p = oldest->fnhe_next;
640         kfree_rcu(oldest, rcu);
641 }
642
643 static u32 fnhe_hashfun(__be32 daddr)
644 {
645         static siphash_key_t fnhe_hash_key __read_mostly;
646         u64 hval;
647
648         net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
649         hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
650         return hash_64(hval, FNHE_HASH_SHIFT);
651 }
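/* The hash is keyed with boot-time randomness, so a remote sender cannot
 * predict which bucket a destination lands in and deliberately chain many
 * exceptions into one bucket (a hash-flooding DoS).
 */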
652
653 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
654 {
655         rt->rt_pmtu = fnhe->fnhe_pmtu;
656         rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
657         rt->dst.expires = fnhe->fnhe_expires;
658
659         if (fnhe->fnhe_gw) {
660                 rt->rt_flags |= RTCF_REDIRECTED;
661                 rt->rt_uses_gateway = 1;
662                 rt->rt_gw_family = AF_INET;
663                 rt->rt_gw4 = fnhe->fnhe_gw;
664         }
665 }
666
667 static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
668                                   __be32 gw, u32 pmtu, bool lock,
669                                   unsigned long expires)
670 {
671         struct fnhe_hash_bucket *hash;
672         struct fib_nh_exception *fnhe;
673         struct rtable *rt;
674         u32 genid, hval;
675         unsigned int i;
676         int depth;
677
678         genid = fnhe_genid(dev_net(nhc->nhc_dev));
679         hval = fnhe_hashfun(daddr);
680
681         spin_lock_bh(&fnhe_lock);
682
683         hash = rcu_dereference(nhc->nhc_exceptions);
684         if (!hash) {
685                 hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
686                 if (!hash)
687                         goto out_unlock;
688                 rcu_assign_pointer(nhc->nhc_exceptions, hash);
689         }
690
691         hash += hval;
692
693         depth = 0;
694         for (fnhe = rcu_dereference(hash->chain); fnhe;
695              fnhe = rcu_dereference(fnhe->fnhe_next)) {
696                 if (fnhe->fnhe_daddr == daddr)
697                         break;
698                 depth++;
699         }
700
701         if (fnhe) {
702                 if (fnhe->fnhe_genid != genid)
703                         fnhe->fnhe_genid = genid;
704                 if (gw)
705                         fnhe->fnhe_gw = gw;
706                 if (pmtu) {
707                         fnhe->fnhe_pmtu = pmtu;
708                         fnhe->fnhe_mtu_locked = lock;
709                 }
710                 fnhe->fnhe_expires = max(1UL, expires);
711                 /* Update all cached dsts too */
712                 rt = rcu_dereference(fnhe->fnhe_rth_input);
713                 if (rt)
714                         fill_route_from_fnhe(rt, fnhe);
715                 rt = rcu_dereference(fnhe->fnhe_rth_output);
716                 if (rt)
717                         fill_route_from_fnhe(rt, fnhe);
718         } else {
719                 /* Randomize max depth to mitigate side-channel attacks. */
720                 int max_depth = FNHE_RECLAIM_DEPTH +
721                                 prandom_u32_max(FNHE_RECLAIM_DEPTH);
722
723                 while (depth > max_depth) {
724                         fnhe_remove_oldest(hash);
725                         depth--;
726                 }
727
728                 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
729                 if (!fnhe)
730                         goto out_unlock;
731
732                 fnhe->fnhe_next = hash->chain;
733
734                 fnhe->fnhe_genid = genid;
735                 fnhe->fnhe_daddr = daddr;
736                 fnhe->fnhe_gw = gw;
737                 fnhe->fnhe_pmtu = pmtu;
738                 fnhe->fnhe_mtu_locked = lock;
739                 fnhe->fnhe_expires = max(1UL, expires);
740
741                 rcu_assign_pointer(hash->chain, fnhe);
742
743                 /* Exception created; mark the cached routes for the nexthop
744                  * stale, so anyone caching it rechecks if this exception
745                  * applies to them.
746                  */
747                 rt = rcu_dereference(nhc->nhc_rth_input);
748                 if (rt)
749                         rt->dst.obsolete = DST_OBSOLETE_KILL;
750
751                 for_each_possible_cpu(i) {
752                         struct rtable __rcu **prt;
753                         prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
754                         rt = rcu_dereference(*prt);
755                         if (rt)
756                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
757                 }
758         }
759
760         fnhe->fnhe_stamp = jiffies;
761
762 out_unlock:
763         spin_unlock_bh(&fnhe_lock);
764 }
765
766 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
767                              bool kill_route)
768 {
769         __be32 new_gw = icmp_hdr(skb)->un.gateway;
770         __be32 old_gw = ip_hdr(skb)->saddr;
771         struct net_device *dev = skb->dev;
772         struct in_device *in_dev;
773         struct fib_result res;
774         struct neighbour *n;
775         struct net *net;
776
777         switch (icmp_hdr(skb)->code & 7) {
778         case ICMP_REDIR_NET:
779         case ICMP_REDIR_NETTOS:
780         case ICMP_REDIR_HOST:
781         case ICMP_REDIR_HOSTTOS:
782                 break;
783
784         default:
785                 return;
786         }
787
788         if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
789                 return;
790
791         in_dev = __in_dev_get_rcu(dev);
792         if (!in_dev)
793                 return;
794
795         net = dev_net(dev);
796         if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
797             ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
798             ipv4_is_zeronet(new_gw))
799                 goto reject_redirect;
800
801         if (!IN_DEV_SHARED_MEDIA(in_dev)) {
802                 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
803                         goto reject_redirect;
804                 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
805                         goto reject_redirect;
806         } else {
807                 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
808                         goto reject_redirect;
809         }
810
811         n = __ipv4_neigh_lookup(rt->dst.dev, (__force u32)new_gw);
812         if (!n)
813                 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
814         if (!IS_ERR(n)) {
815                 if (!(n->nud_state & NUD_VALID)) {
816                         neigh_event_send(n, NULL);
817                 } else {
818                         if (fib_lookup(net, fl4, &res, 0) == 0) {
819                                 struct fib_nh_common *nhc;
820
821                                 fib_select_path(net, &res, fl4, skb);
822                                 nhc = FIB_RES_NHC(res);
823                                 update_or_create_fnhe(nhc, fl4->daddr, new_gw,
824                                                 0, false,
825                                                 jiffies + ip_rt_gc_timeout);
826                         }
827                         if (kill_route)
828                                 rt->dst.obsolete = DST_OBSOLETE_KILL;
829                         call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
830                 }
831                 neigh_release(n);
832         }
833         return;
834
835 reject_redirect:
836 #ifdef CONFIG_IP_ROUTE_VERBOSE
837         if (IN_DEV_LOG_MARTIANS(in_dev)) {
838                 const struct iphdr *iph = (const struct iphdr *) skb->data;
839                 __be32 daddr = iph->daddr;
840                 __be32 saddr = iph->saddr;
841
842                 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
843                                      "  Advised path = %pI4 -> %pI4\n",
844                                      &old_gw, dev->name, &new_gw,
845                                      &saddr, &daddr);
846         }
847 #endif
848         ;
849 }
850
851 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
852 {
853         struct rtable *rt;
854         struct flowi4 fl4;
855         const struct iphdr *iph = (const struct iphdr *) skb->data;
856         struct net *net = dev_net(skb->dev);
857         int oif = skb->dev->ifindex;
858         u8 tos = RT_TOS(iph->tos);
859         u8 prot = iph->protocol;
860         u32 mark = skb->mark;
861
862         rt = (struct rtable *) dst;
863
864         __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
865         ip_rt_fix_tos(&fl4);
866         __ip_do_redirect(rt, skb, &fl4, true);
867 }
868
869 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
870 {
871         struct rtable *rt = (struct rtable *)dst;
872         struct dst_entry *ret = dst;
873
874         if (rt) {
875                 if (dst->obsolete > 0) {
876                         ip_rt_put(rt);
877                         ret = NULL;
878                 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
879                            rt->dst.expires) {
880                         ip_rt_put(rt);
881                         ret = NULL;
882                 }
883         }
884         return ret;
885 }
886
887 /*
888  * Algorithm:
889  *      1. The first ip_rt_redirect_number redirects are sent
890  *         with exponential backoff, then we stop sending them altogether,
891  *         assuming that the host ignores our redirects.
892  *      2. If we did not see packets requiring redirects
893  *         during ip_rt_redirect_silence, we assume that the host
894  *         has forgotten the redirected route, and we start sending redirects again.
895  *
896  * This algorithm is much cheaper and more intelligent than dumb load limiting
897  * in icmp.c.
898  *
899  * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
900  * and "frag. need" (breaks PMTU discovery) in icmp.c.
901  */
902
903 void ip_rt_send_redirect(struct sk_buff *skb)
904 {
905         struct rtable *rt = skb_rtable(skb);
906         struct in_device *in_dev;
907         struct inet_peer *peer;
908         struct net *net;
909         int log_martians;
910         int vif;
911
912         rcu_read_lock();
913         in_dev = __in_dev_get_rcu(rt->dst.dev);
914         if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
915                 rcu_read_unlock();
916                 return;
917         }
918         log_martians = IN_DEV_LOG_MARTIANS(in_dev);
919         vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
920         rcu_read_unlock();
921
922         net = dev_net(rt->dst.dev);
923         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
924         if (!peer) {
925                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
926                           rt_nexthop(rt, ip_hdr(skb)->daddr));
927                 return;
928         }
929
930         /* No redirected packets during ip_rt_redirect_silence;
931          * reset the algorithm.
932          */
933         if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
934                 peer->rate_tokens = 0;
935                 peer->n_redirects = 0;
936         }
937
938         /* Too many ignored redirects; do not send anything and
939          * set peer->rate_last to the time of the last seen redirected packet.
940          */
941         if (peer->n_redirects >= ip_rt_redirect_number) {
942                 peer->rate_last = jiffies;
943                 goto out_put_peer;
944         }
945
946         /* Check for load limit; set rate_last to the latest sent
947          * redirect.
948          */
949         if (peer->n_redirects == 0 ||
950             time_after(jiffies,
951                        (peer->rate_last +
952                         (ip_rt_redirect_load << peer->n_redirects)))) {
953                 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
954
955                 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
956                 peer->rate_last = jiffies;
957                 ++peer->n_redirects;
958                 if (IS_ENABLED(CONFIG_IP_ROUTE_VERBOSE) && log_martians &&
959                     peer->n_redirects == ip_rt_redirect_number)
960                         net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
961                                              &ip_hdr(skb)->saddr, inet_iif(skb),
962                                              &ip_hdr(skb)->daddr, &gw);
963         }
964 out_put_peer:
965         inet_putpeer(peer);
966 }
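/* Worked example of the backoff above, assuming HZ = 1000 and the default
 * tunables (ip_rt_redirect_load = HZ / 50 = 20 jiffies,
 * ip_rt_redirect_number = 9): after the n-th redirect the next one is held
 * back for 20 << n jiffies, i.e. 40 ms, 80 ms, ... up to ~5.1 s.  After
 * the 9th, redirects stop entirely until the peer triggers none for
 * ip_rt_redirect_silence = 20 << 10 jiffies (about 20 s), which resets
 * both counters.
 */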
967
968 static int ip_error(struct sk_buff *skb)
969 {
970         struct rtable *rt = skb_rtable(skb);
971         struct net_device *dev = skb->dev;
972         struct in_device *in_dev;
973         struct inet_peer *peer;
974         unsigned long now;
975         struct net *net;
976         bool send;
977         int code;
978
979         if (netif_is_l3_master(skb->dev)) {
980                 dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
981                 if (!dev)
982                         goto out;
983         }
984
985         in_dev = __in_dev_get_rcu(dev);
986
987         /* IP on this device is disabled. */
988         if (!in_dev)
989                 goto out;
990
991         net = dev_net(rt->dst.dev);
992         if (!IN_DEV_FORWARD(in_dev)) {
993                 switch (rt->dst.error) {
994                 case EHOSTUNREACH:
995                         __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
996                         break;
997
998                 case ENETUNREACH:
999                         __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
1000                         break;
1001                 }
1002                 goto out;
1003         }
1004
1005         switch (rt->dst.error) {
1006         case EINVAL:
1007         default:
1008                 goto out;
1009         case EHOSTUNREACH:
1010                 code = ICMP_HOST_UNREACH;
1011                 break;
1012         case ENETUNREACH:
1013                 code = ICMP_NET_UNREACH;
1014                 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
1015                 break;
1016         case EACCES:
1017                 code = ICMP_PKT_FILTERED;
1018                 break;
1019         }
1020
1021         peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
1022                                l3mdev_master_ifindex(skb->dev), 1);
1023
1024         send = true;
1025         if (peer) {
1026                 now = jiffies;
1027                 peer->rate_tokens += now - peer->rate_last;
1028                 if (peer->rate_tokens > ip_rt_error_burst)
1029                         peer->rate_tokens = ip_rt_error_burst;
1030                 peer->rate_last = now;
1031                 if (peer->rate_tokens >= ip_rt_error_cost)
1032                         peer->rate_tokens -= ip_rt_error_cost;
1033                 else
1034                         send = false;
1035                 inet_putpeer(peer);
1036         }
1037         if (send)
1038                 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1039
1040 out:    kfree_skb(skb);
1041         return 0;
1042 }
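/* The inet_peer throttle above is a token bucket: tokens accrue at one per
 * jiffy since rate_last, are capped at ip_rt_error_burst (5 * HZ), and
 * each ICMP error spends ip_rt_error_cost (HZ) tokens.  With the defaults
 * that allows a burst of five errors per peer, then at most one per second.
 */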
1043
1044 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1045 {
1046         struct dst_entry *dst = &rt->dst;
1047         struct net *net = dev_net(dst->dev);
1048         struct fib_result res;
1049         bool lock = false;
1050         u32 old_mtu;
1051
1052         if (ip_mtu_locked(dst))
1053                 return;
1054
1055         old_mtu = ipv4_mtu(dst);
1056         if (old_mtu < mtu)
1057                 return;
1058
1059         if (mtu < ip_rt_min_pmtu) {
1060                 lock = true;
1061                 mtu = min(old_mtu, ip_rt_min_pmtu);
1062         }
1063
1064         if (rt->rt_pmtu == mtu && !lock &&
1065             time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1066                 return;
1067
1068         rcu_read_lock();
1069         if (fib_lookup(net, fl4, &res, 0) == 0) {
1070                 struct fib_nh_common *nhc;
1071
1072                 fib_select_path(net, &res, fl4, NULL);
1073                 nhc = FIB_RES_NHC(res);
1074                 update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1075                                       jiffies + ip_rt_mtu_expires);
1076         }
1077         rcu_read_unlock();
1078 }
1079
1080 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1081                               struct sk_buff *skb, u32 mtu,
1082                               bool confirm_neigh)
1083 {
1084         struct rtable *rt = (struct rtable *) dst;
1085         struct flowi4 fl4;
1086
1087         ip_rt_build_flow_key(&fl4, sk, skb);
1088         ip_rt_fix_tos(&fl4);
1089
1090         /* Don't make lookup fail for bridged encapsulations */
1091         if (skb && netif_is_any_bridge_port(skb->dev))
1092                 fl4.flowi4_oif = 0;
1093
1094         __ip_rt_update_pmtu(rt, &fl4, mtu);
1095 }
1096
1097 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1098                       int oif, u8 protocol)
1099 {
1100         const struct iphdr *iph = (const struct iphdr *)skb->data;
1101         struct flowi4 fl4;
1102         struct rtable *rt;
1103         u32 mark = IP4_REPLY_MARK(net, skb->mark);
1104
1105         __build_flow_key(net, &fl4, NULL, iph, oif,
1106                          RT_TOS(iph->tos), protocol, mark, 0);
1107         rt = __ip_route_output_key(net, &fl4);
1108         if (!IS_ERR(rt)) {
1109                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1110                 ip_rt_put(rt);
1111         }
1112 }
1113 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1114
1115 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1116 {
1117         const struct iphdr *iph = (const struct iphdr *)skb->data;
1118         struct flowi4 fl4;
1119         struct rtable *rt;
1120
1121         __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1122
1123         if (!fl4.flowi4_mark)
1124                 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1125
1126         rt = __ip_route_output_key(sock_net(sk), &fl4);
1127         if (!IS_ERR(rt)) {
1128                 __ip_rt_update_pmtu(rt, &fl4, mtu);
1129                 ip_rt_put(rt);
1130         }
1131 }
1132
1133 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1134 {
1135         const struct iphdr *iph = (const struct iphdr *)skb->data;
1136         struct flowi4 fl4;
1137         struct rtable *rt;
1138         struct dst_entry *odst = NULL;
1139         bool new = false;
1140         struct net *net = sock_net(sk);
1141
1142         bh_lock_sock(sk);
1143
1144         if (!ip_sk_accept_pmtu(sk))
1145                 goto out;
1146
1147         odst = sk_dst_get(sk);
1148
1149         if (sock_owned_by_user(sk) || !odst) {
1150                 __ipv4_sk_update_pmtu(skb, sk, mtu);
1151                 goto out;
1152         }
1153
1154         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1155
1156         rt = (struct rtable *)odst;
1157         if (odst->obsolete && !odst->ops->check(odst, 0)) {
1158                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1159                 if (IS_ERR(rt))
1160                         goto out;
1161
1162                 new = true;
1163         } else {
1164                 ip_rt_fix_tos(&fl4);
1165         }
1166
1167         __ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);
1168
1169         if (!dst_check(&rt->dst, 0)) {
1170                 if (new)
1171                         dst_release(&rt->dst);
1172
1173                 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1174                 if (IS_ERR(rt))
1175                         goto out;
1176
1177                 new = true;
1178         }
1179
1180         if (new)
1181                 sk_dst_set(sk, &rt->dst);
1182
1183 out:
1184         bh_unlock_sock(sk);
1185         dst_release(odst);
1186 }
1187 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1188
1189 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1190                    int oif, u8 protocol)
1191 {
1192         const struct iphdr *iph = (const struct iphdr *)skb->data;
1193         struct flowi4 fl4;
1194         struct rtable *rt;
1195
1196         __build_flow_key(net, &fl4, NULL, iph, oif,
1197                          RT_TOS(iph->tos), protocol, 0, 0);
1198         rt = __ip_route_output_key(net, &fl4);
1199         if (!IS_ERR(rt)) {
1200                 __ip_do_redirect(rt, skb, &fl4, false);
1201                 ip_rt_put(rt);
1202         }
1203 }
1204 EXPORT_SYMBOL_GPL(ipv4_redirect);
1205
1206 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1207 {
1208         const struct iphdr *iph = (const struct iphdr *)skb->data;
1209         struct flowi4 fl4;
1210         struct rtable *rt;
1211         struct net *net = sock_net(sk);
1212
1213         __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1214         rt = __ip_route_output_key(net, &fl4);
1215         if (!IS_ERR(rt)) {
1216                 __ip_do_redirect(rt, skb, &fl4, false);
1217                 ip_rt_put(rt);
1218         }
1219 }
1220 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1221
1222 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1223 {
1224         struct rtable *rt = (struct rtable *) dst;
1225
1226         /* All IPV4 dsts are created with ->obsolete set to the value
1227          * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1228          * into this function always.
1229          *
1230          * When a PMTU/redirect information update invalidates a route,
1231          * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1232          * DST_OBSOLETE_DEAD.
1233          */
1234         if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1235                 return NULL;
1236         return dst;
1237 }
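/* Callers reach this via dst_check(); a simplified sketch of the wrapper
 * from <net/dst.h>:
 *
 *	static inline struct dst_entry *dst_check(struct dst_entry *dst,
 *						  u32 cookie)
 *	{
 *		if (dst->obsolete)
 *			dst = dst->ops->check(dst, cookie);
 *		return dst;
 *	}
 *
 * Because IPv4 dsts always carry DST_OBSOLETE_FORCE_CHK (non-zero), every
 * cached dst is re-validated here before use.
 */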
1238
1239 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1240 {
1241         struct net_device *dev;
1242         struct ip_options opt;
1243         int res;
1244
1245         /* Recompile ip options since IPCB may not be valid anymore.
1246          * Also check we have a reasonable ipv4 header.
1247          */
1248         if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1249             ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1250                 return;
1251
1252         memset(&opt, 0, sizeof(opt));
1253         if (ip_hdr(skb)->ihl > 5) {
1254                 if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1255                         return;
1256                 opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1257
1258                 rcu_read_lock();
1259                 dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev;
1260                 res = __ip_options_compile(dev_net(dev), &opt, skb, NULL);
1261                 rcu_read_unlock();
1262
1263                 if (res)
1264                         return;
1265         }
1266         __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1267 }
1268
1269 static void ipv4_link_failure(struct sk_buff *skb)
1270 {
1271         struct rtable *rt;
1272
1273         ipv4_send_dest_unreach(skb);
1274
1275         rt = skb_rtable(skb);
1276         if (rt)
1277                 dst_set_expires(&rt->dst, 0);
1278 }
1279
1280 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1281 {
1282         pr_debug("%s: %pI4 -> %pI4, %s\n",
1283                  __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1284                  skb->dev ? skb->dev->name : "?");
1285         kfree_skb(skb);
1286         WARN_ON(1);
1287         return 0;
1288 }
1289
1290 /*
1291    We do not cache the source address of the outgoing interface,
1292    because it is used only by the IP RR, TS and SRR options,
1293    so it is out of the fast path.
1294
1295    BTW remember: "addr" may be unaligned when it points
1296    into IP options!
1297  */
1298
1299 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1300 {
1301         __be32 src;
1302
1303         if (rt_is_output_route(rt))
1304                 src = ip_hdr(skb)->saddr;
1305         else {
1306                 struct fib_result res;
1307                 struct iphdr *iph = ip_hdr(skb);
1308                 struct flowi4 fl4 = {
1309                         .daddr = iph->daddr,
1310                         .saddr = iph->saddr,
1311                         .flowi4_tos = RT_TOS(iph->tos),
1312                         .flowi4_oif = rt->dst.dev->ifindex,
1313                         .flowi4_iif = skb->dev->ifindex,
1314                         .flowi4_mark = skb->mark,
1315                 };
1316
1317                 rcu_read_lock();
1318                 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1319                         src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1320                 else
1321                         src = inet_select_addr(rt->dst.dev,
1322                                                rt_nexthop(rt, iph->daddr),
1323                                                RT_SCOPE_UNIVERSE);
1324                 rcu_read_unlock();
1325         }
1326         memcpy(addr, &src, 4);
1327 }
1328
1329 #ifdef CONFIG_IP_ROUTE_CLASSID
1330 static void set_class_tag(struct rtable *rt, u32 tag)
1331 {
1332         if (!(rt->dst.tclassid & 0xFFFF))
1333                 rt->dst.tclassid |= tag & 0xFFFF;
1334         if (!(rt->dst.tclassid & 0xFFFF0000))
1335                 rt->dst.tclassid |= tag & 0xFFFF0000;
1336 }
1337 #endif
1338
1339 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1340 {
1341         unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1342         unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1343                                     ip_rt_min_advmss);
1344
1345         return min(advmss, IPV4_MAX_PMTU - header_size);
1346 }
1347
1348 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1349 {
1350         const struct rtable *rt = (const struct rtable *)dst;
1351         unsigned int mtu = rt->rt_pmtu;
1352
1353         if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1354                 mtu = dst_metric_raw(dst, RTAX_MTU);
1355
1356         if (mtu)
1357                 goto out;
1358
1359         mtu = READ_ONCE(dst->dev->mtu);
1360
1361         if (unlikely(ip_mtu_locked(dst))) {
1362                 if (rt->rt_uses_gateway && mtu > 576)
1363                         mtu = 576;
1364         }
1365
1366 out:
1367         mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1368
1369         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1370 }
1371
1372 static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1373 {
1374         struct fnhe_hash_bucket *hash;
1375         struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1376         u32 hval = fnhe_hashfun(daddr);
1377
1378         spin_lock_bh(&fnhe_lock);
1379
1380         hash = rcu_dereference_protected(nhc->nhc_exceptions,
1381                                          lockdep_is_held(&fnhe_lock));
1382         hash += hval;
1383
1384         fnhe_p = &hash->chain;
1385         fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1386         while (fnhe) {
1387                 if (fnhe->fnhe_daddr == daddr) {
1388                         rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1389                                 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1390                         /* set fnhe_daddr to 0 to ensure it won't bind with
1391                          * new dsts in rt_bind_exception().
1392                          */
1393                         fnhe->fnhe_daddr = 0;
1394                         fnhe_flush_routes(fnhe);
1395                         kfree_rcu(fnhe, rcu);
1396                         break;
1397                 }
1398                 fnhe_p = &fnhe->fnhe_next;
1399                 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1400                                                  lockdep_is_held(&fnhe_lock));
1401         }
1402
1403         spin_unlock_bh(&fnhe_lock);
1404 }
1405
1406 static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1407                                                __be32 daddr)
1408 {
1409         struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1410         struct fib_nh_exception *fnhe;
1411         u32 hval;
1412
1413         if (!hash)
1414                 return NULL;
1415
1416         hval = fnhe_hashfun(daddr);
1417
1418         for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1419              fnhe = rcu_dereference(fnhe->fnhe_next)) {
1420                 if (fnhe->fnhe_daddr == daddr) {
1421                         if (fnhe->fnhe_expires &&
1422                             time_after(jiffies, fnhe->fnhe_expires)) {
1423                                 ip_del_fnhe(nhc, daddr);
1424                                 break;
1425                         }
1426                         return fnhe;
1427                 }
1428         }
1429         return NULL;
1430 }
1431
1432 /* MTU selection:
1433  * 1. mtu on route is locked - use it
1434  * 2. mtu from nexthop exception
1435  * 3. mtu from egress device
1436  */
1437
1438 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1439 {
1440         struct fib_nh_common *nhc = res->nhc;
1441         struct net_device *dev = nhc->nhc_dev;
1442         struct fib_info *fi = res->fi;
1443         u32 mtu = 0;
1444
1445         if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) ||
1446             fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1447                 mtu = fi->fib_mtu;
1448
1449         if (likely(!mtu)) {
1450                 struct fib_nh_exception *fnhe;
1451
1452                 fnhe = find_exception(nhc, daddr);
1453                 if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1454                         mtu = fnhe->fnhe_pmtu;
1455         }
1456
1457         if (likely(!mtu))
1458                 mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1459
1460         return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1461 }
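/* Example: with ip_fwd_use_pmtu disabled, no locked metric, a 1500-byte
 * device MTU and a live exception carrying fnhe_pmtu = 1400, this returns
 * 1400 (minus any lwtunnel headroom); once the exception expires, it
 * returns 1500 again.
 */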
1462
1463 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1464                               __be32 daddr, const bool do_cache)
1465 {
1466         bool ret = false;
1467
1468         spin_lock_bh(&fnhe_lock);
1469
1470         if (daddr == fnhe->fnhe_daddr) {
1471                 struct rtable __rcu **porig;
1472                 struct rtable *orig;
1473                 int genid = fnhe_genid(dev_net(rt->dst.dev));
1474
1475                 if (rt_is_input_route(rt))
1476                         porig = &fnhe->fnhe_rth_input;
1477                 else
1478                         porig = &fnhe->fnhe_rth_output;
1479                 orig = rcu_dereference(*porig);
1480
1481                 if (fnhe->fnhe_genid != genid) {
1482                         fnhe->fnhe_genid = genid;
1483                         fnhe->fnhe_gw = 0;
1484                         fnhe->fnhe_pmtu = 0;
1485                         fnhe->fnhe_expires = 0;
1486                         fnhe->fnhe_mtu_locked = false;
1487                         fnhe_flush_routes(fnhe);
1488                         orig = NULL;
1489                 }
1490                 fill_route_from_fnhe(rt, fnhe);
1491                 if (!rt->rt_gw4) {
1492                         rt->rt_gw4 = daddr;
1493                         rt->rt_gw_family = AF_INET;
1494                 }
1495
1496                 if (do_cache) {
1497                         dst_hold(&rt->dst);
1498                         rcu_assign_pointer(*porig, rt);
1499                         if (orig) {
1500                                 dst_dev_put(&orig->dst);
1501                                 dst_release(&orig->dst);
1502                         }
1503                         ret = true;
1504                 }
1505
1506                 fnhe->fnhe_stamp = jiffies;
1507         }
1508         spin_unlock_bh(&fnhe_lock);
1509
1510         return ret;
1511 }
1512
1513 static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1514 {
1515         struct rtable *orig, *prev, **p;
1516         bool ret = true;
1517
1518         if (rt_is_input_route(rt)) {
1519                 p = (struct rtable **)&nhc->nhc_rth_input;
1520         } else {
1521                 p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1522         }
1523         orig = *p;
1524
1525         /* hold dst before doing cmpxchg() to avoid race condition
1526          * on this dst
1527          */
1528         dst_hold(&rt->dst);
1529         prev = cmpxchg(p, orig, rt);
1530         if (prev == orig) {
1531                 if (orig) {
1532                         rt_add_uncached_list(orig);
1533                         dst_release(&orig->dst);
1534                 }
1535         } else {
1536                 dst_release(&rt->dst);
1537                 ret = false;
1538         }
1539
1540         return ret;
1541 }
1542
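/* Routes that could not be cached in a nexthop are kept on per-CPU
 * "uncached" lists so that rt_flush_dev() can still find them and detach
 * them from a network device that is going away.
 */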
1543 struct uncached_list {
1544         spinlock_t              lock;
1545         struct list_head        head;
1546 };
1547
1548 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1549
1550 void rt_add_uncached_list(struct rtable *rt)
1551 {
1552         struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1553
1554         rt->rt_uncached_list = ul;
1555
1556         spin_lock_bh(&ul->lock);
1557         list_add_tail(&rt->rt_uncached, &ul->head);
1558         spin_unlock_bh(&ul->lock);
1559 }
1560
1561 void rt_del_uncached_list(struct rtable *rt)
1562 {
1563         if (!list_empty(&rt->rt_uncached)) {
1564                 struct uncached_list *ul = rt->rt_uncached_list;
1565
1566                 spin_lock_bh(&ul->lock);
1567                 list_del(&rt->rt_uncached);
1568                 spin_unlock_bh(&ul->lock);
1569         }
1570 }
1571
1572 static void ipv4_dst_destroy(struct dst_entry *dst)
1573 {
1574         struct rtable *rt = (struct rtable *)dst;
1575
1576         ip_dst_metrics_put(dst);
1577         rt_del_uncached_list(rt);
1578 }
1579
1580 void rt_flush_dev(struct net_device *dev)
1581 {
1582         struct rtable *rt;
1583         int cpu;
1584
1585         for_each_possible_cpu(cpu) {
1586                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1587
1588                 spin_lock_bh(&ul->lock);
1589                 list_for_each_entry(rt, &ul->head, rt_uncached) {
1590                         if (rt->dst.dev != dev)
1591                                 continue;
1592                         rt->dst.dev = blackhole_netdev;
1593                         dev_hold(rt->dst.dev);
1594                         dev_put(dev);
1595                 }
1596                 spin_unlock_bh(&ul->lock);
1597         }
1598 }
1599
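/* A cached route is only reusable while it still carries
 * DST_OBSOLETE_FORCE_CHK (set by rt_dst_alloc()) and has not been
 * invalidated by a generation change (rt_is_expired()).
 */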
1600 static bool rt_cache_valid(const struct rtable *rt)
1601 {
1602         return  rt &&
1603                 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1604                 !rt_is_expired(rt);
1605 }
1606
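/* rt_set_nexthop - finish @rt with nexthop data from the FIB result:
 * gateway, metrics, classid and lwtunnel state.  The route is then cached
 * in @fnhe when one is given, or in the nexthop itself when @do_cache is
 * set; anything that could not be cached goes on the uncached list.
 */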
1607 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1608                            const struct fib_result *res,
1609                            struct fib_nh_exception *fnhe,
1610                            struct fib_info *fi, u16 type, u32 itag,
1611                            const bool do_cache)
1612 {
1613         bool cached = false;
1614
1615         if (fi) {
1616                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1617
1618                 if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1619                         rt->rt_uses_gateway = 1;
1620                         rt->rt_gw_family = nhc->nhc_gw_family;
1621                         /* only INET and INET6 are supported */
1622                         if (likely(nhc->nhc_gw_family == AF_INET))
1623                                 rt->rt_gw4 = nhc->nhc_gw.ipv4;
1624                         else
1625                                 rt->rt_gw6 = nhc->nhc_gw.ipv6;
1626                 }
1627
1628                 ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1629
1630 #ifdef CONFIG_IP_ROUTE_CLASSID
1631                 if (nhc->nhc_family == AF_INET) {
1632                         struct fib_nh *nh;
1633
1634                         nh = container_of(nhc, struct fib_nh, nh_common);
1635                         rt->dst.tclassid = nh->nh_tclassid;
1636                 }
1637 #endif
1638                 rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1639                 if (unlikely(fnhe))
1640                         cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1641                 else if (do_cache)
1642                         cached = rt_cache_route(nhc, rt);
1643                 if (unlikely(!cached)) {
1644                         /* Routes we intend to cache in nexthop exception or
1645                          * FIB nexthop have the DST_NOCACHE bit clear.
1646                          * However, if we are unsuccessful at storing this
1647                          * route into the cache we really need to set it.
1648                          */
1649                         if (!rt->rt_gw4) {
1650                                 rt->rt_gw_family = AF_INET;
1651                                 rt->rt_gw4 = daddr;
1652                         }
1653                         rt_add_uncached_list(rt);
1654                 }
1655         } else
1656                 rt_add_uncached_list(rt);
1657
1658 #ifdef CONFIG_IP_ROUTE_CLASSID
1659 #ifdef CONFIG_IP_MULTIPLE_TABLES
1660         set_class_tag(rt, res->tclassid);
1661 #endif
1662         set_class_tag(rt, itag);
1663 #endif
1664 }
1665
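/* rt_dst_alloc - allocate an IPv4 rtable with the defaults: output via
 * ip_output() (ip_local_deliver() for RTCF_LOCAL), the current generation
 * id, and DST_OBSOLETE_FORCE_CHK so rt_cache_valid() can vet cached uses.
 */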
1666 struct rtable *rt_dst_alloc(struct net_device *dev,
1667                             unsigned int flags, u16 type,
1668                             bool nopolicy, bool noxfrm)
1669 {
1670         struct rtable *rt;
1671
1672         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1673                        (nopolicy ? DST_NOPOLICY : 0) |
1674                        (noxfrm ? DST_NOXFRM : 0));
1675
1676         if (rt) {
1677                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1678                 rt->rt_flags = flags;
1679                 rt->rt_type = type;
1680                 rt->rt_is_input = 0;
1681                 rt->rt_iif = 0;
1682                 rt->rt_pmtu = 0;
1683                 rt->rt_mtu_locked = 0;
1684                 rt->rt_uses_gateway = 0;
1685                 rt->rt_gw_family = 0;
1686                 rt->rt_gw4 = 0;
1687                 INIT_LIST_HEAD(&rt->rt_uncached);
1688
1689                 rt->dst.output = ip_output;
1690                 if (flags & RTCF_LOCAL)
1691                         rt->dst.input = ip_local_deliver;
1692         }
1693
1694         return rt;
1695 }
1696 EXPORT_SYMBOL(rt_dst_alloc);
1697
1698 struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1699 {
1700         struct rtable *new_rt;
1701
1702         new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1703                            rt->dst.flags);
1704
1705         if (new_rt) {
1706                 new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1707                 new_rt->rt_flags = rt->rt_flags;
1708                 new_rt->rt_type = rt->rt_type;
1709                 new_rt->rt_is_input = rt->rt_is_input;
1710                 new_rt->rt_iif = rt->rt_iif;
1711                 new_rt->rt_pmtu = rt->rt_pmtu;
1712                 new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1713                 new_rt->rt_gw_family = rt->rt_gw_family;
1714                 if (rt->rt_gw_family == AF_INET)
1715                         new_rt->rt_gw4 = rt->rt_gw4;
1716                 else if (rt->rt_gw_family == AF_INET6)
1717                         new_rt->rt_gw6 = rt->rt_gw6;
1718                 INIT_LIST_HEAD(&new_rt->rt_uncached);
1719
1720                 new_rt->dst.input = rt->dst.input;
1721                 new_rt->dst.output = rt->dst.output;
1722                 new_rt->dst.error = rt->dst.error;
1723                 new_rt->dst.lastuse = jiffies;
1724                 new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1725         }
1726         return new_rt;
1727 }
1728 EXPORT_SYMBOL(rt_dst_clone);
1729
1730 /* called in rcu_read_lock() section */
1731 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1732                           u8 tos, struct net_device *dev,
1733                           struct in_device *in_dev, u32 *itag)
1734 {
1735         int err;
1736
1737         /* Primary sanity checks. */
1738         if (!in_dev)
1739                 return -EINVAL;
1740
1741         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1742             skb->protocol != htons(ETH_P_IP))
1743                 return -EINVAL;
1744
1745         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1746                 return -EINVAL;
1747
1748         if (ipv4_is_zeronet(saddr)) {
1749                 if (!ipv4_is_local_multicast(daddr) &&
1750                     ip_hdr(skb)->protocol != IPPROTO_IGMP)
1751                         return -EINVAL;
1752         } else {
1753                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1754                                           in_dev, itag);
1755                 if (err < 0)
1756                         return err;
1757         }
1758         return 0;
1759 }
1760
1761 /* called in rcu_read_lock() section */
1762 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1763                              u8 tos, struct net_device *dev, int our)
1764 {
1765         struct in_device *in_dev = __in_dev_get_rcu(dev);
1766         unsigned int flags = RTCF_MULTICAST;
1767         struct rtable *rth;
1768         bool no_policy;
1769         u32 itag = 0;
1770         int err;
1771
1772         err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1773         if (err)
1774                 return err;
1775
1776         if (our)
1777                 flags |= RTCF_LOCAL;
1778
1779         no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY);
1780         if (no_policy)
1781                 IPCB(skb)->flags |= IPSKB_NOPOLICY;
1782
1783         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1784                            no_policy, false);
1785         if (!rth)
1786                 return -ENOBUFS;
1787
1788 #ifdef CONFIG_IP_ROUTE_CLASSID
1789         rth->dst.tclassid = itag;
1790 #endif
1791         rth->dst.output = ip_rt_bug;
1792         rth->rt_is_input = 1;
1793
1794 #ifdef CONFIG_IP_MROUTE
1795         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1796                 rth->dst.input = ip_mr_input;
1797 #endif
1798         RT_CACHE_STAT_INC(in_slow_mc);
1799
1800         skb_dst_drop(skb);
1801         skb_dst_set(skb, &rth->dst);
1802         return 0;
1803 }
1804
1805
1806 static void ip_handle_martian_source(struct net_device *dev,
1807                                      struct in_device *in_dev,
1808                                      struct sk_buff *skb,
1809                                      __be32 daddr,
1810                                      __be32 saddr)
1811 {
1812         RT_CACHE_STAT_INC(in_martian_src);
1813 #ifdef CONFIG_IP_ROUTE_VERBOSE
1814         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1815                 /*
1816                  *      RFC1812 recommendation: if the source is martian,
1817                  *      the only hint is the MAC header.
1818                  */
1819                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1820                         &daddr, &saddr, dev->name);
1821                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1822                         print_hex_dump(KERN_WARNING, "ll header: ",
1823                                        DUMP_PREFIX_OFFSET, 16, 1,
1824                                        skb_mac_header(skb),
1825                                        dev->hard_header_len, false);
1826                 }
1827         }
1828 #endif
1829 }
1830
1831 /* called in rcu_read_lock() section */
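/* __mkroute_input - create (or reuse from the nexthop cache) the dst for
 * a forwarded packet: validate the source, decide whether an ICMP
 * redirect is warranted (IPSKB_DOREDIRECT), and point the new route's
 * input at ip_forward().
 */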
1832 static int __mkroute_input(struct sk_buff *skb,
1833                            const struct fib_result *res,
1834                            struct in_device *in_dev,
1835                            __be32 daddr, __be32 saddr, u32 tos)
1836 {
1837         struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1838         struct net_device *dev = nhc->nhc_dev;
1839         struct fib_nh_exception *fnhe;
1840         struct rtable *rth;
1841         int err;
1842         struct in_device *out_dev;
1843         bool do_cache, no_policy;
1844         u32 itag = 0;
1845
1846         /* get a working reference to the output device */
1847         out_dev = __in_dev_get_rcu(dev);
1848         if (!out_dev) {
1849                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1850                 return -EINVAL;
1851         }
1852
1853         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1854                                   in_dev->dev, in_dev, &itag);
1855         if (err < 0) {
1856                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1857                                          saddr);
1858
1859                 goto cleanup;
1860         }
1861
1862         do_cache = res->fi && !itag;
1863         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1864             skb->protocol == htons(ETH_P_IP)) {
1865                 __be32 gw;
1866
1867                 gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1868                 if (IN_DEV_SHARED_MEDIA(out_dev) ||
1869                     inet_addr_onlink(out_dev, saddr, gw))
1870                         IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1871         }
1872
1873         if (skb->protocol != htons(ETH_P_IP)) {
1874                 /* Not IP (i.e. ARP). Do not create a route if it is
1875                  * invalid for proxy arp. DNAT routes are always valid.
1876                  *
1877                  * The proxy arp feature has been extended to allow ARP
1878                  * replies back on the same interface, to support
1879                  * private VLAN switch technologies. See arp.c.
1880                  */
1881                 if (out_dev == in_dev &&
1882                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1883                         err = -EINVAL;
1884                         goto cleanup;
1885                 }
1886         }
1887
1888         no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY);
1889         if (no_policy)
1890                 IPCB(skb)->flags |= IPSKB_NOPOLICY;
1891
1892         fnhe = find_exception(nhc, daddr);
1893         if (do_cache) {
1894                 if (fnhe)
1895                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1896                 else
1897                         rth = rcu_dereference(nhc->nhc_rth_input);
1898                 if (rt_cache_valid(rth)) {
1899                         skb_dst_set_noref(skb, &rth->dst);
1900                         goto out;
1901                 }
1902         }
1903
1904         rth = rt_dst_alloc(out_dev->dev, 0, res->type, no_policy,
1905                            IN_DEV_ORCONF(out_dev, NOXFRM));
1906         if (!rth) {
1907                 err = -ENOBUFS;
1908                 goto cleanup;
1909         }
1910
1911         rth->rt_is_input = 1;
1912         RT_CACHE_STAT_INC(in_slow_tot);
1913
1914         rth->dst.input = ip_forward;
1915
1916         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1917                        do_cache);
1918         lwtunnel_set_redirect(&rth->dst);
1919         skb_dst_set(skb, &rth->dst);
1920 out:
1921         err = 0;
1922  cleanup:
1923         return err;
1924 }
1925
1926 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1927 /* To make ICMP packets follow the right flow, the multipath hash is
1928  * calculated from the inner IP addresses.
1929  */
1930 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1931                                  struct flow_keys *hash_keys)
1932 {
1933         const struct iphdr *outer_iph = ip_hdr(skb);
1934         const struct iphdr *key_iph = outer_iph;
1935         const struct iphdr *inner_iph;
1936         const struct icmphdr *icmph;
1937         struct iphdr _inner_iph;
1938         struct icmphdr _icmph;
1939
1940         if (likely(outer_iph->protocol != IPPROTO_ICMP))
1941                 goto out;
1942
1943         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1944                 goto out;
1945
1946         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1947                                    &_icmph);
1948         if (!icmph)
1949                 goto out;
1950
1951         if (!icmp_is_err(icmph->type))
1952                 goto out;
1953
1954         inner_iph = skb_header_pointer(skb,
1955                                        outer_iph->ihl * 4 + sizeof(_icmph),
1956                                        sizeof(_inner_iph), &_inner_iph);
1957         if (!inner_iph)
1958                 goto out;
1959
1960         key_iph = inner_iph;
1961 out:
1962         hash_keys->addrs.v4addrs.src = key_iph->saddr;
1963         hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1964 }
1965
1966 /* if skb is set it will be used and fl4 can be NULL */
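/* Hash policy selected by the fib_multipath_hash_policy sysctl:
 * 0 - L3 addresses (the inner packet's addresses for ICMP errors)
 * 1 - L4 five-tuple (short-circuited by a precomputed L4 skb hash)
 * 2 - L3 addresses from full flow dissection, so encapsulated traffic
 *     hashes on the inner (IPv4 or IPv6) header when one is found
 */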
1967 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1968                        const struct sk_buff *skb, struct flow_keys *flkeys)
1969 {
1970         u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1971         struct flow_keys hash_keys;
1972         u32 mhash;
1973
1974         switch (READ_ONCE(net->ipv4.sysctl_fib_multipath_hash_policy)) {
1975         case 0:
1976                 memset(&hash_keys, 0, sizeof(hash_keys));
1977                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1978                 if (skb) {
1979                         ip_multipath_l3_keys(skb, &hash_keys);
1980                 } else {
1981                         hash_keys.addrs.v4addrs.src = fl4->saddr;
1982                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
1983                 }
1984                 break;
1985         case 1:
1986                 /* skb is currently provided only when forwarding */
1987                 if (skb) {
1988                         unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1989                         struct flow_keys keys;
1990
1991                         /* short-circuit if we already have L4 hash present */
1992                         if (skb->l4_hash)
1993                                 return skb_get_hash_raw(skb) >> 1;
1994
1995                         memset(&hash_keys, 0, sizeof(hash_keys));
1996
1997                         if (!flkeys) {
1998                                 skb_flow_dissect_flow_keys(skb, &keys, flag);
1999                                 flkeys = &keys;
2000                         }
2001
2002                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2003                         hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
2004                         hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
2005                         hash_keys.ports.src = flkeys->ports.src;
2006                         hash_keys.ports.dst = flkeys->ports.dst;
2007                         hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2008                 } else {
2009                         memset(&hash_keys, 0, sizeof(hash_keys));
2010                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2011                         hash_keys.addrs.v4addrs.src = fl4->saddr;
2012                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
2013                         hash_keys.ports.src = fl4->fl4_sport;
2014                         hash_keys.ports.dst = fl4->fl4_dport;
2015                         hash_keys.basic.ip_proto = fl4->flowi4_proto;
2016                 }
2017                 break;
2018         case 2:
2019                 memset(&hash_keys, 0, sizeof(hash_keys));
2020                 /* skb is currently provided only when forwarding */
2021                 if (skb) {
2022                         struct flow_keys keys;
2023
2024                         skb_flow_dissect_flow_keys(skb, &keys, 0);
2025                         /* Inner can be v4 or v6 */
2026                         if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2027                                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2028                                 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
2029                                 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
2030                         } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2031                                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2032                                 hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
2033                                 hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
2034                                 hash_keys.tags.flow_label = keys.tags.flow_label;
2035                                 hash_keys.basic.ip_proto = keys.basic.ip_proto;
2036                         } else {
2037                                 /* Same as case 0 */
2038                                 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2039                                 ip_multipath_l3_keys(skb, &hash_keys);
2040                         }
2041                 } else {
2042                         /* Same as case 0 */
2043                         hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2044                         hash_keys.addrs.v4addrs.src = fl4->saddr;
2045                         hash_keys.addrs.v4addrs.dst = fl4->daddr;
2046                 }
2047                 break;
2048         }
2049         mhash = flow_hash_from_keys(&hash_keys);
2050
2051         if (multipath_hash)
2052                 mhash = jhash_2words(mhash, multipath_hash, 0);
2053
2054         return mhash >> 1;
2055 }
2056 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
2057
2058 static int ip_mkroute_input(struct sk_buff *skb,
2059                             struct fib_result *res,
2060                             struct in_device *in_dev,
2061                             __be32 daddr, __be32 saddr, u32 tos,
2062                             struct flow_keys *hkeys)
2063 {
2064 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2065         if (res->fi && fib_info_num_path(res->fi) > 1) {
2066                 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2067
2068                 fib_select_multipath(res, h);
2069                 IPCB(skb)->flags |= IPSKB_MULTIPATH;
2070         }
2071 #endif
2072
2073         /* create a routing cache entry */
2074         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2075 }
2076
2077 /* Implements the same saddr-related checks as ip_route_input_slow(),
2078  * assuming daddr is valid and the destination is not a local broadcast one.
2079  * Uses the provided hint instead of performing a route lookup.
2080  */
2081 int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2082                       u8 tos, struct net_device *dev,
2083                       const struct sk_buff *hint)
2084 {
2085         struct in_device *in_dev = __in_dev_get_rcu(dev);
2086         struct rtable *rt = skb_rtable(hint);
2087         struct net *net = dev_net(dev);
2088         int err = -EINVAL;
2089         u32 tag = 0;
2090
2091         if (!in_dev)
2092                 return -EINVAL;
2093
2094         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2095                 goto martian_source;
2096
2097         if (ipv4_is_zeronet(saddr))
2098                 goto martian_source;
2099
2100         if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2101                 goto martian_source;
2102
2103         if (rt->rt_type != RTN_LOCAL)
2104                 goto skip_validate_source;
2105
2106         tos &= IPTOS_RT_MASK;
2107         err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2108         if (err < 0)
2109                 goto martian_source;
2110
2111 skip_validate_source:
2112         skb_dst_copy(skb, hint);
2113         return 0;
2114
2115 martian_source:
2116         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2117         return err;
2118 }
2119
2120 /* get device for dst_alloc with local routes */
2121 static struct net_device *ip_rt_get_dev(struct net *net,
2122                                         const struct fib_result *res)
2123 {
2124         struct fib_nh_common *nhc = res->fi ? res->nhc : NULL;
2125         struct net_device *dev = NULL;
2126
2127         if (nhc)
2128                 dev = l3mdev_master_dev_rcu(nhc->nhc_dev);
2129
2130         return dev ? : net->loopback_dev;
2131 }
2132
2133 /*
2134  *      NOTE. We drop all packets that have a local source
2135  *      address, because every properly looped back packet
2136  *      must have the correct destination already attached by the output routine.
2137  *      Changes in the enforced policies must also be applied to
2138  *      ip_route_use_hint().
2139  *
2140  *      This approach solves two big problems:
2141  *      1. Non-simplex devices are handled properly.
2142  *      2. IP spoofing attempts are filtered with a 100% guarantee.
2143  *      called with rcu_read_lock()
2144  */
2145
2146 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2147                                u8 tos, struct net_device *dev,
2148                                struct fib_result *res)
2149 {
2150         struct in_device *in_dev = __in_dev_get_rcu(dev);
2151         struct flow_keys *flkeys = NULL, _flkeys;
2152         struct net    *net = dev_net(dev);
2153         struct ip_tunnel_info *tun_info;
2154         int             err = -EINVAL;
2155         unsigned int    flags = 0;
2156         u32             itag = 0;
2157         struct rtable   *rth;
2158         struct flowi4   fl4;
2159         bool do_cache = true;
2160         bool no_policy;
2161
2162         /* IP on this device is disabled. */
2163
2164         if (!in_dev)
2165                 goto out;
2166
2167         /* Check for the weirdest martians, which cannot be detected
2168            by fib_lookup.
2169          */
2170
2171         tun_info = skb_tunnel_info(skb);
2172         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2173                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2174         else
2175                 fl4.flowi4_tun_key.tun_id = 0;
2176         skb_dst_drop(skb);
2177
2178         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2179                 goto martian_source;
2180
2181         res->fi = NULL;
2182         res->table = NULL;
2183         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2184                 goto brd_input;
2185
2186         /* Accept zero addresses only to limited broadcast;
2187          * I do not even know whether to fix it or not. Waiting for complaints :-)
2188          */
2189         if (ipv4_is_zeronet(saddr))
2190                 goto martian_source;
2191
2192         if (ipv4_is_zeronet(daddr))
2193                 goto martian_destination;
2194
2195         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
2196          * more than once, calling it only if daddr and/or saddr are loopback addresses
2197          */
2198         if (ipv4_is_loopback(daddr)) {
2199                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2200                         goto martian_destination;
2201         } else if (ipv4_is_loopback(saddr)) {
2202                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2203                         goto martian_source;
2204         }
2205
2206         /*
2207          *      Now we are ready to route packet.
2208          */
2209         fl4.flowi4_oif = 0;
2210         fl4.flowi4_iif = dev->ifindex;
2211         fl4.flowi4_mark = skb->mark;
2212         fl4.flowi4_tos = tos;
2213         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2214         fl4.flowi4_flags = 0;
2215         fl4.daddr = daddr;
2216         fl4.saddr = saddr;
2217         fl4.flowi4_uid = sock_net_uid(net, NULL);
2218         fl4.flowi4_multipath_hash = 0;
2219
2220         if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2221                 flkeys = &_flkeys;
2222         } else {
2223                 fl4.flowi4_proto = 0;
2224                 fl4.fl4_sport = 0;
2225                 fl4.fl4_dport = 0;
2226         }
2227
2228         err = fib_lookup(net, &fl4, res, 0);
2229         if (err != 0) {
2230                 if (!IN_DEV_FORWARD(in_dev))
2231                         err = -EHOSTUNREACH;
2232                 goto no_route;
2233         }
2234
2235         if (res->type == RTN_BROADCAST) {
2236                 if (IN_DEV_BFORWARD(in_dev))
2237                         goto make_route;
2238                 /* do not cache if bc_forwarding is enabled */
2239                 if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2240                         do_cache = false;
2241                 goto brd_input;
2242         }
2243
2244         if (res->type == RTN_LOCAL) {
2245                 err = fib_validate_source(skb, saddr, daddr, tos,
2246                                           0, dev, in_dev, &itag);
2247                 if (err < 0)
2248                         goto martian_source;
2249                 goto local_input;
2250         }
2251
2252         if (!IN_DEV_FORWARD(in_dev)) {
2253                 err = -EHOSTUNREACH;
2254                 goto no_route;
2255         }
2256         if (res->type != RTN_UNICAST)
2257                 goto martian_destination;
2258
2259 make_route:
2260         err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2261 out:    return err;
2262
2263 brd_input:
2264         if (skb->protocol != htons(ETH_P_IP))
2265                 goto e_inval;
2266
2267         if (!ipv4_is_zeronet(saddr)) {
2268                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2269                                           in_dev, &itag);
2270                 if (err < 0)
2271                         goto martian_source;
2272         }
2273         flags |= RTCF_BROADCAST;
2274         res->type = RTN_BROADCAST;
2275         RT_CACHE_STAT_INC(in_brd);
2276
2277 local_input:
2278         no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY);
2279         if (no_policy)
2280                 IPCB(skb)->flags |= IPSKB_NOPOLICY;
2281
2282         do_cache &= res->fi && !itag;
2283         if (do_cache) {
2284                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2285
2286                 rth = rcu_dereference(nhc->nhc_rth_input);
2287                 if (rt_cache_valid(rth)) {
2288                         skb_dst_set_noref(skb, &rth->dst);
2289                         err = 0;
2290                         goto out;
2291                 }
2292         }
2293
2294         rth = rt_dst_alloc(ip_rt_get_dev(net, res),
2295                            flags | RTCF_LOCAL, res->type,
2296                            no_policy, false);
2297         if (!rth)
2298                 goto e_nobufs;
2299
2300         rth->dst.output = ip_rt_bug;
2301 #ifdef CONFIG_IP_ROUTE_CLASSID
2302         rth->dst.tclassid = itag;
2303 #endif
2304         rth->rt_is_input = 1;
2305
2306         RT_CACHE_STAT_INC(in_slow_tot);
2307         if (res->type == RTN_UNREACHABLE) {
2308                 rth->dst.input = ip_error;
2309                 rth->dst.error = -err;
2310                 rth->rt_flags   &= ~RTCF_LOCAL;
2311         }
2312
2313         if (do_cache) {
2314                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2315
2316                 rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2317                 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2318                         WARN_ON(rth->dst.input == lwtunnel_input);
2319                         rth->dst.lwtstate->orig_input = rth->dst.input;
2320                         rth->dst.input = lwtunnel_input;
2321                 }
2322
2323                 if (unlikely(!rt_cache_route(nhc, rth)))
2324                         rt_add_uncached_list(rth);
2325         }
2326         skb_dst_set(skb, &rth->dst);
2327         err = 0;
2328         goto out;
2329
2330 no_route:
2331         RT_CACHE_STAT_INC(in_no_route);
2332         res->type = RTN_UNREACHABLE;
2333         res->fi = NULL;
2334         res->table = NULL;
2335         goto local_input;
2336
2337         /*
2338          *      Do not cache martian addresses: they should be logged (RFC1812)
2339          */
2340 martian_destination:
2341         RT_CACHE_STAT_INC(in_martian_dst);
2342 #ifdef CONFIG_IP_ROUTE_VERBOSE
2343         if (IN_DEV_LOG_MARTIANS(in_dev))
2344                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2345                                      &daddr, &saddr, dev->name);
2346 #endif
2347
2348 e_inval:
2349         err = -EINVAL;
2350         goto out;
2351
2352 e_nobufs:
2353         err = -ENOBUFS;
2354         goto out;
2355
2356 martian_source:
2357         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2358         goto out;
2359 }
2360
2361 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2362                          u8 tos, struct net_device *dev)
2363 {
2364         struct fib_result res;
2365         int err;
2366
2367         tos &= IPTOS_RT_MASK;
2368         rcu_read_lock();
2369         err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2370         rcu_read_unlock();
2371
2372         return err;
2373 }
2374 EXPORT_SYMBOL(ip_route_input_noref);
2375
2376 /* called with rcu_read_lock held */
2377 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2378                        u8 tos, struct net_device *dev, struct fib_result *res)
2379 {
2380         /* Multicast recognition logic is moved from route cache to here.
2381            The problem was that too many Ethernet cards have broken/missing
2382            hardware multicast filters :-( As a result, a host on a multicast
2383            network acquires a lot of useless route cache entries, such as
2384            SDR messages from all over the world. Now we try to get rid of them.
2385            Really, provided the software IP multicast filter is organized
2386            reasonably (at least, hashed), it does not result in a slowdown
2387            compared with route cache reject entries.
2388            Note that multicast routers are not affected, because
2389            a route cache entry is created eventually.
2390          */
2391         if (ipv4_is_multicast(daddr)) {
2392                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2393                 int our = 0;
2394                 int err = -EINVAL;
2395
2396                 if (!in_dev)
2397                         return err;
2398                 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2399                                       ip_hdr(skb)->protocol);
2400
2401                 /* check l3 master if no match yet */
2402                 if (!our && netif_is_l3_slave(dev)) {
2403                         struct in_device *l3_in_dev;
2404
2405                         l3_in_dev = __in_dev_get_rcu(skb->dev);
2406                         if (l3_in_dev)
2407                                 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2408                                                       ip_hdr(skb)->protocol);
2409                 }
2410
2411                 if (our
2412 #ifdef CONFIG_IP_MROUTE
2413                         ||
2414                     (!ipv4_is_local_multicast(daddr) &&
2415                      IN_DEV_MFORWARD(in_dev))
2416 #endif
2417                    ) {
2418                         err = ip_route_input_mc(skb, daddr, saddr,
2419                                                 tos, dev, our);
2420                 }
2421                 return err;
2422         }
2423
2424         return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2425 }
2426
2427 /* called with rcu_read_lock() */
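/* __mkroute_output - build the dst for an output route, reusing a valid
 * cached copy from a nexthop exception or the per-CPU output slot when
 * possible; broadcast routes and local routes pinned to a particular oif
 * are never cached.
 */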
2428 static struct rtable *__mkroute_output(const struct fib_result *res,
2429                                        const struct flowi4 *fl4, int orig_oif,
2430                                        struct net_device *dev_out,
2431                                        unsigned int flags)
2432 {
2433         struct fib_info *fi = res->fi;
2434         struct fib_nh_exception *fnhe;
2435         struct in_device *in_dev;
2436         u16 type = res->type;
2437         struct rtable *rth;
2438         bool do_cache;
2439
2440         in_dev = __in_dev_get_rcu(dev_out);
2441         if (!in_dev)
2442                 return ERR_PTR(-EINVAL);
2443
2444         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2445                 if (ipv4_is_loopback(fl4->saddr) &&
2446                     !(dev_out->flags & IFF_LOOPBACK) &&
2447                     !netif_is_l3_master(dev_out))
2448                         return ERR_PTR(-EINVAL);
2449
2450         if (ipv4_is_lbcast(fl4->daddr))
2451                 type = RTN_BROADCAST;
2452         else if (ipv4_is_multicast(fl4->daddr))
2453                 type = RTN_MULTICAST;
2454         else if (ipv4_is_zeronet(fl4->daddr))
2455                 return ERR_PTR(-EINVAL);
2456
2457         if (dev_out->flags & IFF_LOOPBACK)
2458                 flags |= RTCF_LOCAL;
2459
2460         do_cache = true;
2461         if (type == RTN_BROADCAST) {
2462                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2463                 fi = NULL;
2464         } else if (type == RTN_MULTICAST) {
2465                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2466                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2467                                      fl4->flowi4_proto))
2468                         flags &= ~RTCF_LOCAL;
2469                 else
2470                         do_cache = false;
2471                 /* If a multicast route does not exist, use
2472                  * the default one, but do not gateway in this case.
2473                  * Yes, it is a hack.
2474                  */
2475                 if (fi && res->prefixlen < 4)
2476                         fi = NULL;
2477         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2478                    (orig_oif != dev_out->ifindex)) {
2479                 /* For local routes that require a particular output interface
2480                  * we do not want to cache the result.  Caching the result
2481                  * causes incorrect behaviour when there are multiple source
2482                  * addresses on the interface, the end result being that if the
2483                  * intended recipient is waiting on that interface for the
2484                  * packet, they won't receive it because it will be delivered on
2485                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2486                  * be set to the loopback interface as well.
2487                  */
2488                 do_cache = false;
2489         }
2490
2491         fnhe = NULL;
2492         do_cache &= fi != NULL;
2493         if (fi) {
2494                 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2495                 struct rtable __rcu **prth;
2496
2497                 fnhe = find_exception(nhc, fl4->daddr);
2498                 if (!do_cache)
2499                         goto add;
2500                 if (fnhe) {
2501                         prth = &fnhe->fnhe_rth_output;
2502                 } else {
2503                         if (unlikely(fl4->flowi4_flags &
2504                                      FLOWI_FLAG_KNOWN_NH &&
2505                                      !(nhc->nhc_gw_family &&
2506                                        nhc->nhc_scope == RT_SCOPE_LINK))) {
2507                                 do_cache = false;
2508                                 goto add;
2509                         }
2510                         prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2511                 }
2512                 rth = rcu_dereference(*prth);
2513                 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2514                         return rth;
2515         }
2516
2517 add:
2518         rth = rt_dst_alloc(dev_out, flags, type,
2519                            IN_DEV_ORCONF(in_dev, NOPOLICY),
2520                            IN_DEV_ORCONF(in_dev, NOXFRM));
2521         if (!rth)
2522                 return ERR_PTR(-ENOBUFS);
2523
2524         rth->rt_iif = orig_oif;
2525
2526         RT_CACHE_STAT_INC(out_slow_tot);
2527
2528         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2529                 if (flags & RTCF_LOCAL &&
2530                     !(dev_out->flags & IFF_LOOPBACK)) {
2531                         rth->dst.output = ip_mc_output;
2532                         RT_CACHE_STAT_INC(out_slow_mc);
2533                 }
2534 #ifdef CONFIG_IP_MROUTE
2535                 if (type == RTN_MULTICAST) {
2536                         if (IN_DEV_MFORWARD(in_dev) &&
2537                             !ipv4_is_local_multicast(fl4->daddr)) {
2538                                 rth->dst.input = ip_mr_input;
2539                                 rth->dst.output = ip_mc_output;
2540                         }
2541                 }
2542 #endif
2543         }
2544
2545         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2546         lwtunnel_set_redirect(&rth->dst);
2547
2548         return rth;
2549 }
2550
2551 /*
2552  * Major route resolver routine.
2553  */
2554
2555 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2556                                         const struct sk_buff *skb)
2557 {
2558         struct fib_result res = {
2559                 .type           = RTN_UNSPEC,
2560                 .fi             = NULL,
2561                 .table          = NULL,
2562                 .tclassid       = 0,
2563         };
2564         struct rtable *rth;
2565
2566         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2567         ip_rt_fix_tos(fl4);
2568
2569         rcu_read_lock();
2570         rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2571         rcu_read_unlock();
2572
2573         return rth;
2574 }
2575 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
2576
2577 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2578                                             struct fib_result *res,
2579                                             const struct sk_buff *skb)
2580 {
2581         struct net_device *dev_out = NULL;
2582         int orig_oif = fl4->flowi4_oif;
2583         unsigned int flags = 0;
2584         struct rtable *rth;
2585         int err;
2586
2587         if (fl4->saddr) {
2588                 if (ipv4_is_multicast(fl4->saddr) ||
2589                     ipv4_is_lbcast(fl4->saddr) ||
2590                     ipv4_is_zeronet(fl4->saddr)) {
2591                         rth = ERR_PTR(-EINVAL);
2592                         goto out;
2593                 }
2594
2595                 rth = ERR_PTR(-ENETUNREACH);
2596
2597                 /* I removed the check for oif == dev_out->oif here.
2598                    It was wrong for two reasons:
2599                    1. ip_dev_find(net, saddr) can return the wrong iface, if saddr
2600                       is assigned to multiple interfaces.
2601                    2. Moreover, we are allowed to send packets with saddr
2602                       of another iface. --ANK
2603                  */
2604
2605                 if (fl4->flowi4_oif == 0 &&
2606                     (ipv4_is_multicast(fl4->daddr) ||
2607                      ipv4_is_lbcast(fl4->daddr))) {
2608                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2609                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2610                         if (!dev_out)
2611                                 goto out;
2612
2613                         /* Special hack: the user can direct multicasts
2614                            and limited broadcast via the necessary interface
2615                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2616                            This hack is not just for fun, it allows
2617                            vic, vat and friends to work.
2618                            They bind the socket to loopback, set ttl to zero
2619                            and expect that it will work.
2620                            From the viewpoint of the routing cache they are broken,
2621                            because we are not allowed to build a multicast path
2622                            with a loopback source addr (look, the routing cache
2623                            cannot know that ttl is zero, so the packet
2624                            will not leave this host and the route is valid).
2625                            Luckily, this hack is a good workaround.
2626                          */
2627
2628                         fl4->flowi4_oif = dev_out->ifindex;
2629                         goto make_route;
2630                 }
2631
2632                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2633                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2634                         if (!__ip_dev_find(net, fl4->saddr, false))
2635                                 goto out;
2636                 }
2637         }
2638
2639
2640         if (fl4->flowi4_oif) {
2641                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2642                 rth = ERR_PTR(-ENODEV);
2643                 if (!dev_out)
2644                         goto out;
2645
2646                 /* RACE: Check return value of inet_select_addr instead. */
2647                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2648                         rth = ERR_PTR(-ENETUNREACH);
2649                         goto out;
2650                 }
2651                 if (ipv4_is_local_multicast(fl4->daddr) ||
2652                     ipv4_is_lbcast(fl4->daddr) ||
2653                     fl4->flowi4_proto == IPPROTO_IGMP) {
2654                         if (!fl4->saddr)
2655                                 fl4->saddr = inet_select_addr(dev_out, 0,
2656                                                               RT_SCOPE_LINK);
2657                         goto make_route;
2658                 }
2659                 if (!fl4->saddr) {
2660                         if (ipv4_is_multicast(fl4->daddr))
2661                                 fl4->saddr = inet_select_addr(dev_out, 0,
2662                                                               fl4->flowi4_scope);
2663                         else if (!fl4->daddr)
2664                                 fl4->saddr = inet_select_addr(dev_out, 0,
2665                                                               RT_SCOPE_HOST);
2666                 }
2667         }
2668
2669         if (!fl4->daddr) {
2670                 fl4->daddr = fl4->saddr;
2671                 if (!fl4->daddr)
2672                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2673                 dev_out = net->loopback_dev;
2674                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2675                 res->type = RTN_LOCAL;
2676                 flags |= RTCF_LOCAL;
2677                 goto make_route;
2678         }
2679
2680         err = fib_lookup(net, fl4, res, 0);
2681         if (err) {
2682                 res->fi = NULL;
2683                 res->table = NULL;
2684                 if (fl4->flowi4_oif &&
2685                     (ipv4_is_multicast(fl4->daddr) ||
2686                     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2687                         /* Apparently, routing tables are wrong. Assume
2688                            that the destination is on-link.
2689
2690                            WHY? DW.
2691                            Because we are allowed to send to an iface
2692                            even if it has NO routes and NO assigned
2693                            addresses. When oif is specified, routing
2694                            tables are looked up with only one purpose:
2695                            to catch if the destination is gatewayed, rather than
2696                            direct. Moreover, if MSG_DONTROUTE is set,
2697                            we send the packet, ignoring both routing tables
2698                            and ifaddr state. --ANK
2699
2700
2701                            We could do this even if oif is unknown,
2702                            likely as IPv6 does, but we do not.
2703                          */
2704
2705                         if (fl4->saddr == 0)
2706                                 fl4->saddr = inet_select_addr(dev_out, 0,
2707                                                               RT_SCOPE_LINK);
2708                         res->type = RTN_UNICAST;
2709                         goto make_route;
2710                 }
2711                 rth = ERR_PTR(err);
2712                 goto out;
2713         }
2714
2715         if (res->type == RTN_LOCAL) {
2716                 if (!fl4->saddr) {
2717                         if (res->fi->fib_prefsrc)
2718                                 fl4->saddr = res->fi->fib_prefsrc;
2719                         else
2720                                 fl4->saddr = fl4->daddr;
2721                 }
2722
2723                 /* L3 master device is the loopback for that domain */
2724                 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2725                         net->loopback_dev;
2726
2727                 /* make sure orig_oif points to fib result device even
2728                  * though packet rx/tx happens over loopback or l3mdev
2729                  */
2730                 orig_oif = FIB_RES_OIF(*res);
2731
2732                 fl4->flowi4_oif = dev_out->ifindex;
2733                 flags |= RTCF_LOCAL;
2734                 goto make_route;
2735         }
2736
2737         fib_select_path(net, res, fl4, skb);
2738
2739         dev_out = FIB_RES_DEV(*res);
2740
2741 make_route:
2742         rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2743
2744 out:
2745         return rth;
2746 }
2747
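/* dst_ops for the dummy routes made by ipv4_blackhole_route() below: all
 * callbacks are the dst_blackhole_* stubs, and the routes themselves
 * discard packets via dst_discard()/dst_discard_out().
 */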
2748 static struct dst_ops ipv4_dst_blackhole_ops = {
2749         .family                 = AF_INET,
2750         .default_advmss         = ipv4_default_advmss,
2751         .neigh_lookup           = ipv4_neigh_lookup,
2752         .check                  = dst_blackhole_check,
2753         .cow_metrics            = dst_blackhole_cow_metrics,
2754         .update_pmtu            = dst_blackhole_update_pmtu,
2755         .redirect               = dst_blackhole_redirect,
2756         .mtu                    = dst_blackhole_mtu,
2757 };
2758
2759 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2760 {
2761         struct rtable *ort = (struct rtable *) dst_orig;
2762         struct rtable *rt;
2763
2764         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2765         if (rt) {
2766                 struct dst_entry *new = &rt->dst;
2767
2768                 new->__use = 1;
2769                 new->input = dst_discard;
2770                 new->output = dst_discard_out;
2771
2772                 new->dev = net->loopback_dev;
2773                 if (new->dev)
2774                         dev_hold(new->dev);
2775
2776                 rt->rt_is_input = ort->rt_is_input;
2777                 rt->rt_iif = ort->rt_iif;
2778                 rt->rt_pmtu = ort->rt_pmtu;
2779                 rt->rt_mtu_locked = ort->rt_mtu_locked;
2780
2781                 rt->rt_genid = rt_genid_ipv4(net);
2782                 rt->rt_flags = ort->rt_flags;
2783                 rt->rt_type = ort->rt_type;
2784                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2785                 rt->rt_gw_family = ort->rt_gw_family;
2786                 if (rt->rt_gw_family == AF_INET)
2787                         rt->rt_gw4 = ort->rt_gw4;
2788                 else if (rt->rt_gw_family == AF_INET6)
2789                         rt->rt_gw6 = ort->rt_gw6;
2790
2791                 INIT_LIST_HEAD(&rt->rt_uncached);
2792         }
2793
2794         dst_release(dst_orig);
2795
2796         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2797 }
2798
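/* ip_route_output_flow - resolve an output route for @flp4 and, when a
 * protocol is given, pass the result through xfrm_lookup_route() so that
 * IPsec policy can transform or reject it on behalf of @sk.
 */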
2799 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2800                                     const struct sock *sk)
2801 {
2802         struct rtable *rt = __ip_route_output_key(net, flp4);
2803
2804         if (IS_ERR(rt))
2805                 return rt;
2806
2807         if (flp4->flowi4_proto) {
2808                 flp4->flowi4_oif = rt->dst.dev->ifindex;
2809                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2810                                                         flowi4_to_flowi(flp4),
2811                                                         sk, 0);
2812         }
2813
2814         return rt;
2815 }
2816 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2817
2818 struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
2819                                       struct net_device *dev,
2820                                       struct net *net, __be32 *saddr,
2821                                       const struct ip_tunnel_info *info,
2822                                       u8 protocol, bool use_cache)
2823 {
2824 #ifdef CONFIG_DST_CACHE
2825         struct dst_cache *dst_cache;
2826 #endif
2827         struct rtable *rt = NULL;
2828         struct flowi4 fl4;
2829         __u8 tos;
2830
2831 #ifdef CONFIG_DST_CACHE
2832         dst_cache = (struct dst_cache *)&info->dst_cache;
2833         if (use_cache) {
2834                 rt = dst_cache_get_ip4(dst_cache, saddr);
2835                 if (rt)
2836                         return rt;
2837         }
2838 #endif
2839         memset(&fl4, 0, sizeof(fl4));
2840         fl4.flowi4_mark = skb->mark;
2841         fl4.flowi4_proto = protocol;
2842         fl4.daddr = info->key.u.ipv4.dst;
2843         fl4.saddr = info->key.u.ipv4.src;
2844         tos = info->key.tos;
2845         fl4.flowi4_tos = RT_TOS(tos);
2846
2847         rt = ip_route_output_key(net, &fl4);
2848         if (IS_ERR(rt)) {
2849                 netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
2850                 return ERR_PTR(-ENETUNREACH);
2851         }
2852         if (rt->dst.dev == dev) { /* is this necessary? */
2853                 netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
2854                 ip_rt_put(rt);
2855                 return ERR_PTR(-ELOOP);
2856         }
2857 #ifdef CONFIG_DST_CACHE
2858         if (use_cache)
2859                 dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
2860 #endif
2861         *saddr = fl4.saddr;
2862         return rt;
2863 }
2864 EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
2865
2866 /* called with rcu_read_lock held */
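/* rt_fill_info - dump @rt as an RTM_NEWROUTE message: type, table,
 * dst/src, oif, gateway (RTA_GATEWAY, or RTA_VIA for an IPv6 nexthop),
 * metrics with any PMTU clamp applied, plus the mark/uid and the
 * multicast or iif details for input routes.
 */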
2867 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2868                         struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2869                         struct sk_buff *skb, u32 portid, u32 seq,
2870                         unsigned int flags)
2871 {
2872         struct rtmsg *r;
2873         struct nlmsghdr *nlh;
2874         unsigned long expires = 0;
2875         u32 error;
2876         u32 metrics[RTAX_MAX];
2877
2878         nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
2879         if (!nlh)
2880                 return -EMSGSIZE;
2881
2882         r = nlmsg_data(nlh);
2883         r->rtm_family    = AF_INET;
2884         r->rtm_dst_len  = 32;
2885         r->rtm_src_len  = 0;
2886         r->rtm_tos      = fl4 ? fl4->flowi4_tos : 0;
2887         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2888         if (nla_put_u32(skb, RTA_TABLE, table_id))
2889                 goto nla_put_failure;
2890         r->rtm_type     = rt->rt_type;
2891         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2892         r->rtm_protocol = RTPROT_UNSPEC;
2893         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2894         if (rt->rt_flags & RTCF_NOTIFY)
2895                 r->rtm_flags |= RTM_F_NOTIFY;
2896         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2897                 r->rtm_flags |= RTCF_DOREDIRECT;
2898
2899         if (nla_put_in_addr(skb, RTA_DST, dst))
2900                 goto nla_put_failure;
2901         if (src) {
2902                 r->rtm_src_len = 32;
2903                 if (nla_put_in_addr(skb, RTA_SRC, src))
2904                         goto nla_put_failure;
2905         }
2906         if (rt->dst.dev &&
2907             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2908                 goto nla_put_failure;
2909 #ifdef CONFIG_IP_ROUTE_CLASSID
2910         if (rt->dst.tclassid &&
2911             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2912                 goto nla_put_failure;
2913 #endif
2914         if (fl4 && !rt_is_input_route(rt) &&
2915             fl4->saddr != src) {
2916                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2917                         goto nla_put_failure;
2918         }
2919         if (rt->rt_uses_gateway) {
2920                 if (rt->rt_gw_family == AF_INET &&
2921                     nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
2922                         goto nla_put_failure;
2923                 } else if (rt->rt_gw_family == AF_INET6) {
2924                         int alen = sizeof(struct in6_addr);
2925                         struct nlattr *nla;
2926                         struct rtvia *via;
2927
2928                         nla = nla_reserve(skb, RTA_VIA, alen + 2);
2929                         if (!nla)
2930                                 goto nla_put_failure;
2931
2932                         via = nla_data(nla);
2933                         via->rtvia_family = AF_INET6;
2934                         memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
2935                 }
2936         }
2937
2938         expires = rt->dst.expires;
2939         if (expires) {
2940                 unsigned long now = jiffies;
2941
2942                 if (time_before(now, expires))
2943                         expires -= now;
2944                 else
2945                         expires = 0;
2946         }
2947
2948         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2949         if (rt->rt_pmtu && expires)
2950                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2951         if (rt->rt_mtu_locked && expires)
2952                 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2953         if (rtnetlink_put_metrics(skb, metrics) < 0)
2954                 goto nla_put_failure;
2955
2956         if (fl4) {
2957                 if (fl4->flowi4_mark &&
2958                     nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2959                         goto nla_put_failure;
2960
2961                 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2962                     nla_put_u32(skb, RTA_UID,
2963                                 from_kuid_munged(current_user_ns(),
2964                                                  fl4->flowi4_uid)))
2965                         goto nla_put_failure;
2966
2967                 if (rt_is_input_route(rt)) {
2968 #ifdef CONFIG_IP_MROUTE
2969                         if (ipv4_is_multicast(dst) &&
2970                             !ipv4_is_local_multicast(dst) &&
2971                             IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2972                                 int err = ipmr_get_route(net, skb,
2973                                                          fl4->saddr, fl4->daddr,
2974                                                          r, portid);
2975
2976                                 if (err <= 0) {
2977                                         if (err == 0)
2978                                                 return 0;
2979                                         goto nla_put_failure;
2980                                 }
2981                         } else
2982 #endif
2983                                 if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2984                                         goto nla_put_failure;
2985                 }
2986         }
2987
2988         error = rt->dst.error;
2989
2990         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2991                 goto nla_put_failure;
2992
2993         nlmsg_end(skb, nlh);
2994         return 0;
2995
2996 nla_put_failure:
2997         nlmsg_cancel(skb, nlh);
2998         return -EMSGSIZE;
2999 }
3000
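/* Walk one nexthop-exception hash bucket and emit a route message per
 * live exception via rt_fill_info().  Entries carrying a stale genid
 * or whose fnhe_expires has passed are skipped, as are entries below
 * fa_start, the resume point of an interrupted dump.  Runs under
 * rcu_read_lock (taken by the caller below).
 */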
static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
                            struct netlink_callback *cb, u32 table_id,
                            struct fnhe_hash_bucket *bucket, int genid,
                            int *fa_index, int fa_start, unsigned int flags)
{
        int i;

        for (i = 0; i < FNHE_HASH_SIZE; i++) {
                struct fib_nh_exception *fnhe;

                for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
                     fnhe = rcu_dereference(fnhe->fnhe_next)) {
                        struct rtable *rt;
                        int err;

                        if (*fa_index < fa_start)
                                goto next;

                        if (fnhe->fnhe_genid != genid)
                                goto next;

                        if (fnhe->fnhe_expires &&
                            time_after(jiffies, fnhe->fnhe_expires))
                                goto next;

                        rt = rcu_dereference(fnhe->fnhe_rth_input);
                        if (!rt)
                                rt = rcu_dereference(fnhe->fnhe_rth_output);
                        if (!rt)
                                goto next;

                        err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
                                           table_id, NULL, skb,
                                           NETLINK_CB(cb->skb).portid,
                                           cb->nlh->nlmsg_seq, flags);
                        if (err)
                                return err;
next:
                        (*fa_index)++;
                }
        }

        return 0;
}

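/* Dump the exception entries hanging off each live nexthop of a
 * fib_info, one bucket walk per nexthop, under the RCU read lock.
 */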
int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
                       u32 table_id, struct fib_info *fi,
                       int *fa_index, int fa_start, unsigned int flags)
{
        struct net *net = sock_net(cb->skb->sk);
        int nhsel, genid = fnhe_genid(net);

        for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
                struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
                struct fnhe_hash_bucket *bucket;
                int err;

                if (nhc->nhc_flags & RTNH_F_DEAD)
                        continue;

                rcu_read_lock();
                bucket = rcu_dereference(nhc->nhc_exceptions);
                err = 0;
                if (bucket)
                        err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
                                               genid, fa_index, fa_start,
                                               flags);
                rcu_read_unlock();
                if (err)
                        return err;
        }

        return 0;
}

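/* Build the dummy skb used to answer an RTM_GETROUTE request: a
 * minimal IPv4 header plus a stub UDP, TCP or ICMP header, just
 * enough for the packet to be pushed through the real routing
 * input/output path as if it had arrived from the wire.
 */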
static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
                                                   u8 ip_proto, __be16 sport,
                                                   __be16 dport)
{
        struct sk_buff *skb;
        struct iphdr *iph;

        skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
        if (!skb)
                return NULL;

        /* Reserve room for dummy headers; this skb can pass
         * through a good chunk of the routing engine.
         */
        skb_reset_mac_header(skb);
        skb_reset_network_header(skb);
        skb->protocol = htons(ETH_P_IP);
        iph = skb_put(skb, sizeof(struct iphdr));
        iph->protocol = ip_proto;
        iph->saddr = src;
        iph->daddr = dst;
        iph->version = 0x4;
        iph->frag_off = 0;
        iph->ihl = 0x5;
        skb_set_transport_header(skb, skb->len);

        switch (iph->protocol) {
        case IPPROTO_UDP: {
                struct udphdr *udph;

                udph = skb_put_zero(skb, sizeof(struct udphdr));
                udph->source = sport;
                udph->dest = dport;
                udph->len = htons(sizeof(struct udphdr));
                udph->check = 0;
                break;
        }
        case IPPROTO_TCP: {
                struct tcphdr *tcph;

                tcph = skb_put_zero(skb, sizeof(struct tcphdr));
                tcph->source    = sport;
                tcph->dest      = dport;
                tcph->doff      = sizeof(struct tcphdr) / 4;
                tcph->rst = 1;
                tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
                                            src, dst, 0);
                break;
        }
        case IPPROTO_ICMP: {
                struct icmphdr *icmph;

                icmph = skb_put_zero(skb, sizeof(struct icmphdr));
                icmph->type = ICMP_ECHO;
                icmph->code = 0;
        }
        }

        return skb;
}

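/* Validate an RTM_GETROUTE request.  Callers that did not opt in to
 * strict checking get the legacy (deprecated) attribute parse only;
 * strict callers additionally have the header fields, flags and
 * attribute set checked against what a route get request may carry.
 */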
static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
                                       const struct nlmsghdr *nlh,
                                       struct nlattr **tb,
                                       struct netlink_ext_ack *extack)
{
        struct rtmsg *rtm;
        int i, err;

        if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
                NL_SET_ERR_MSG(extack,
                               "ipv4: Invalid header for route get request");
                return -EINVAL;
        }

        if (!netlink_strict_get_check(skb))
                return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
                                              rtm_ipv4_policy, extack);

        rtm = nlmsg_data(nlh);
        if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
            (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
            rtm->rtm_table || rtm->rtm_protocol ||
            rtm->rtm_scope || rtm->rtm_type) {
                NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
                return -EINVAL;
        }

        if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
                               RTM_F_LOOKUP_TABLE |
                               RTM_F_FIB_MATCH)) {
                NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
                return -EINVAL;
        }

        err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
                                            rtm_ipv4_policy, extack);
        if (err)
                return err;

        if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
            (tb[RTA_DST] && !rtm->rtm_dst_len)) {
                NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
                return -EINVAL;
        }

        for (i = 0; i <= RTA_MAX; i++) {
                if (!tb[i])
                        continue;

                switch (i) {
                case RTA_IIF:
                case RTA_OIF:
                case RTA_SRC:
                case RTA_DST:
                case RTA_IP_PROTO:
                case RTA_SPORT:
                case RTA_DPORT:
                case RTA_MARK:
                case RTA_UID:
                        break;
                default:
                        NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
                        return -EINVAL;
                }
        }

        return 0;
}

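/* The RTM_GETROUTE doit handler.  With RTA_IIF set the lookup runs
 * through the input path on a dummy skb, otherwise through the output
 * path, and the reply carries either the matching FIB entry (when
 * RTM_F_FIB_MATCH is requested) or the resulting cached route.
 */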
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
                             struct netlink_ext_ack *extack)
{
        struct net *net = sock_net(in_skb->sk);
        struct nlattr *tb[RTA_MAX+1];
        u32 table_id = RT_TABLE_MAIN;
        __be16 sport = 0, dport = 0;
        struct fib_result res = {};
        u8 ip_proto = IPPROTO_UDP;
        struct rtable *rt = NULL;
        struct sk_buff *skb;
        struct rtmsg *rtm;
        struct flowi4 fl4 = {};
        __be32 dst = 0;
        __be32 src = 0;
        kuid_t uid;
        u32 iif;
        int err;
        int mark;

        err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
        if (err < 0)
                return err;

        rtm = nlmsg_data(nlh);
        src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
        dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
        iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
        mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
        if (tb[RTA_UID])
                uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
        else
                uid = (iif ? INVALID_UID : current_uid());

        if (tb[RTA_IP_PROTO]) {
                err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
                                                  &ip_proto, AF_INET, extack);
                if (err)
                        return err;
        }

        if (tb[RTA_SPORT])
                sport = nla_get_be16(tb[RTA_SPORT]);

        if (tb[RTA_DPORT])
                dport = nla_get_be16(tb[RTA_DPORT]);

        skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
        if (!skb)
                return -ENOBUFS;

        fl4.daddr = dst;
        fl4.saddr = src;
        fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
        fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
        fl4.flowi4_mark = mark;
        fl4.flowi4_uid = uid;
        if (sport)
                fl4.fl4_sport = sport;
        if (dport)
                fl4.fl4_dport = dport;
        fl4.flowi4_proto = ip_proto;

        rcu_read_lock();

        if (iif) {
                struct net_device *dev;

                dev = dev_get_by_index_rcu(net, iif);
                if (!dev) {
                        err = -ENODEV;
                        goto errout_rcu;
                }

                fl4.flowi4_iif = iif; /* for rt_fill_info */
                skb->dev        = dev;
                skb->mark       = mark;
                err = ip_route_input_rcu(skb, dst, src,
                                         rtm->rtm_tos & IPTOS_RT_MASK, dev,
                                         &res);

                rt = skb_rtable(skb);
                if (err == 0 && rt->dst.error)
                        err = -rt->dst.error;
        } else {
                fl4.flowi4_iif = LOOPBACK_IFINDEX;
                skb->dev = net->loopback_dev;
                rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
                err = 0;
                if (IS_ERR(rt))
                        err = PTR_ERR(rt);
                else
                        skb_dst_set(skb, &rt->dst);
        }

        if (err)
                goto errout_rcu;

        if (rtm->rtm_flags & RTM_F_NOTIFY)
                rt->rt_flags |= RTCF_NOTIFY;

        if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
                table_id = res.table ? res.table->tb_id : 0;

        /* reset skb for netlink reply msg */
        skb_trim(skb, 0);
        skb_reset_network_header(skb);
        skb_reset_transport_header(skb);
        skb_reset_mac_header(skb);

        if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
                struct fib_rt_info fri;

                if (!res.fi) {
                        err = fib_props[res.type].error;
                        if (!err)
                                err = -EHOSTUNREACH;
                        goto errout_rcu;
                }
                fri.fi = res.fi;
                fri.tb_id = table_id;
                fri.dst = res.prefix;
                fri.dst_len = res.prefixlen;
                fri.tos = fl4.flowi4_tos;
                fri.type = rt->rt_type;
                fri.offload = 0;
                fri.trap = 0;
                if (res.fa_head) {
                        struct fib_alias *fa;

                        hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
                                u8 slen = 32 - fri.dst_len;

                                if (fa->fa_slen == slen &&
                                    fa->tb_id == fri.tb_id &&
                                    fa->fa_tos == fri.tos &&
                                    fa->fa_info == res.fi &&
                                    fa->fa_type == fri.type) {
                                        fri.offload = fa->offload;
                                        fri.trap = fa->trap;
                                        break;
                                }
                        }
                }
                err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
                                    nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
        } else {
                err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
                                   NETLINK_CB(in_skb).portid,
                                   nlh->nlmsg_seq, 0);
        }
        if (err < 0)
                goto errout_rcu;

        rcu_read_unlock();

        err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);

errout_free:
        return err;
errout_rcu:
        rcu_read_unlock();
        kfree_skb(skb);
        goto errout_free;
}

void ip_rt_multicast_event(struct in_device *in_dev)
{
        rt_cache_flush(dev_net(in_dev->dev));
}

#ifdef CONFIG_SYSCTL
static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
static int ip_rt_gc_elasticity __read_mostly    = 8;
static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;

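/* Write-only handler behind the per-netns "flush" file: any write
 * flushes the namespace's route cache and bumps its fnhe genid,
 * invalidating all nexthop exceptions.  Reads fail with -EINVAL.
 */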
static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
                void *buffer, size_t *lenp, loff_t *ppos)
{
        struct net *net = (struct net *)__ctl->extra1;

        if (write) {
                rt_cache_flush(net);
                fnhe_genid_bump(net);
                return 0;
        }

        return -EINVAL;
}

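/* These knobs appear under /proc/sys/net/ipv4/route/ (that is, the
 * net.ipv4.route.* sysctl namespace).  For instance, the PMTU floor
 * could be raised with something like
 *
 *      sysctl -w net.ipv4.route.min_pmtu=1280
 *
 * (an illustrative value, not a recommendation); proc_dointvec_minmax
 * rejects writes below ip_min_valid_pmtu.
 */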
static struct ctl_table ipv4_route_table[] = {
        {
                .procname       = "gc_thresh",
                .data           = &ipv4_dst_ops.gc_thresh,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "max_size",
                .data           = &ip_rt_max_size,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                /* Deprecated. Use gc_min_interval_ms */
                .procname       = "gc_min_interval",
                .data           = &ip_rt_gc_min_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "gc_min_interval_ms",
                .data           = &ip_rt_gc_min_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_ms_jiffies,
        },
        {
                .procname       = "gc_timeout",
                .data           = &ip_rt_gc_timeout,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "gc_interval",
                .data           = &ip_rt_gc_interval,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "redirect_load",
                .data           = &ip_rt_redirect_load,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "redirect_number",
                .data           = &ip_rt_redirect_number,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "redirect_silence",
                .data           = &ip_rt_redirect_silence,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "error_cost",
                .data           = &ip_rt_error_cost,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "error_burst",
                .data           = &ip_rt_error_burst,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "gc_elasticity",
                .data           = &ip_rt_gc_elasticity,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        {
                .procname       = "mtu_expires",
                .data           = &ip_rt_mtu_expires,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_jiffies,
        },
        {
                .procname       = "min_pmtu",
                .data           = &ip_rt_min_pmtu,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec_minmax,
                .extra1         = &ip_min_valid_pmtu,
        },
        {
                .procname       = "min_adv_mss",
                .data           = &ip_rt_min_advmss,
                .maxlen         = sizeof(int),
                .mode           = 0644,
                .proc_handler   = proc_dointvec,
        },
        { }
};

static const char ipv4_route_flush_procname[] = "flush";

static struct ctl_table ipv4_route_flush_table[] = {
        {
                .procname       = ipv4_route_flush_procname,
                .maxlen         = sizeof(int),
                .mode           = 0200,
                .proc_handler   = ipv4_sysctl_rtcache_flush,
        },
        { },
};

static __net_init int sysctl_route_net_init(struct net *net)
{
        struct ctl_table *tbl;

        tbl = ipv4_route_flush_table;
        if (!net_eq(net, &init_net)) {
                tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
                if (!tbl)
                        goto err_dup;

                /* Don't export non-whitelisted sysctls to unprivileged users */
                if (net->user_ns != &init_user_ns) {
                        if (tbl[0].procname != ipv4_route_flush_procname)
                                tbl[0].procname = NULL;
                }
        }
        tbl[0].extra1 = net;

        net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
        if (!net->ipv4.route_hdr)
                goto err_reg;
        return 0;

err_reg:
        if (tbl != ipv4_route_flush_table)
                kfree(tbl);
err_dup:
        return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
        struct ctl_table *tbl;

        tbl = net->ipv4.route_hdr->ctl_table_arg;
        unregister_net_sysctl_table(net->ipv4.route_hdr);
        BUG_ON(tbl == ipv4_route_flush_table);
        kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
        .init = sysctl_route_net_init,
        .exit = sysctl_route_net_exit,
};
#endif

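/* Per-netns generation counters: rt_genid invalidates cached routes,
 * fnhe_genid invalidates nexthop exceptions, and dev_addr_genid is
 * seeded randomly, presumably so address-derived hash state is not
 * predictable across namespaces.
 */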
static __net_init int rt_genid_init(struct net *net)
{
        atomic_set(&net->ipv4.rt_genid, 0);
        atomic_set(&net->fnhe_genid, 0);
        atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
        return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
        .init = rt_genid_init,
};

static int __net_init ipv4_inetpeer_init(struct net *net)
{
        struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

        if (!bp)
                return -ENOMEM;
        inet_peer_base_init(bp);
        net->ipv4.peers = bp;
        return 0;
}

static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
        struct inet_peer_base *bp = net->ipv4.peers;

        net->ipv4.peers = NULL;
        inetpeer_invalidate_tree(bp);
        kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
        .init   =       ipv4_inetpeer_init,
        .exit   =       ipv4_inetpeer_exit,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */

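/* Boot-time setup of the IPv4 routing layer: the shared IP-ID hash
 * (idents and timestamps live in one allocation), the per-CPU
 * uncached route lists, the dst slab cache and entry counters,
 * devinet/FIB initialization, /proc files, optional XFRM hooks, the
 * RTM_GETROUTE handler, and the pernet subsystems defined above.
 */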
int __init ip_rt_init(void)
{
        void *idents_hash;
        int cpu;

        /* For modern hosts, this will use 2 MB of memory */
        idents_hash = alloc_large_system_hash("IP idents",
                                              sizeof(*ip_idents) + sizeof(*ip_tstamps),
                                              0,
                                              16, /* one bucket per 64 KB */
                                              HASH_ZERO,
                                              NULL,
                                              &ip_idents_mask,
                                              2048,
                                              256*1024);

        ip_idents = idents_hash;

        prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));

        ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);

        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

                INIT_LIST_HEAD(&ul->head);
                spin_lock_init(&ul->lock);
        }
#ifdef CONFIG_IP_ROUTE_CLASSID
        ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
        if (!ip_rt_acct)
                panic("IP: failed to allocate ip_rt_acct\n");
#endif

        ipv4_dst_ops.kmem_cachep =
                kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
                                  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

        ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

        if (dst_entries_init(&ipv4_dst_ops) < 0)
                panic("IP: failed to allocate ipv4_dst_ops counter\n");

        if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
                panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

        ipv4_dst_ops.gc_thresh = ~0;
        ip_rt_max_size = INT_MAX;

        devinet_init();
        ip_fib_init();

        if (ip_rt_proc_init())
                pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
        xfrm_init();
        xfrm4_init();
#endif
        rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
                      RTNL_FLAG_DOIT_UNLOCKED);

#ifdef CONFIG_SYSCTL
        register_pernet_subsys(&sysctl_route_ops);
#endif
        register_pernet_subsys(&rt_genid_ops);
        register_pernet_subsys(&ipv4_inetpeer_ops);
        return 0;
}

#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order; once we do,
 * all this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
        register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif