/* GNU Linux-libre 4.9.282-gnu1 - net/ipv4/route.c */
/*
 * INET         An implementation of the TCP/IP protocol suite for the LINUX
 *              operating system.  INET is implemented using the  BSD Socket
 *              interface as the means of communication with the user level.
 *
 *              ROUTE - implementation of the IP router.
 *
 * Authors:     Ross Biro
 *              Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *              Alan Cox, <gw4pts@gw4pts.ampr.org>
 *              Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *              Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *              Alan Cox        :       Verify area fixes.
 *              Alan Cox        :       cli() protects routing changes
 *              Rui Oliveira    :       ICMP routing table updates
 *              (rco@di.uminho.pt)      Routing table insertion and update
 *              Linus Torvalds  :       Rewrote bits to be sensible
 *              Alan Cox        :       Added BSD route gw semantics
 *              Alan Cox        :       Super /proc >4K
 *              Alan Cox        :       MTU in route table
 *              Alan Cox        :       MSS actually. Also added the window
 *                                      clamper.
 *              Sam Lantinga    :       Fixed route matching in rt_del()
 *              Alan Cox        :       Routing cache support.
 *              Alan Cox        :       Removed compatibility cruft.
 *              Alan Cox        :       RTF_REJECT support.
 *              Alan Cox        :       TCP irtt support.
 *              Jonathan Naylor :       Added Metric support.
 *      Miquel van Smoorenburg  :       BSD API fixes.
 *      Miquel van Smoorenburg  :       Metrics.
 *              Alan Cox        :       Use __u32 properly
 *              Alan Cox        :       Aligned routing errors more closely with BSD;
 *                                      our system is still very different.
 *              Alan Cox        :       Faster /proc handling
 *      Alexey Kuznetsov        :       Massive rework to support tree based routing,
 *                                      routing caches and better behaviour.
 *
 *              Olaf Erb        :       irtt wasn't being copied right.
 *              Bjorn Ekwall    :       Kerneld route support.
 *              Alan Cox        :       Multicast fixed (I hope)
 *              Pavel Krauz     :       Limited broadcast fixed
 *              Mike McLagan    :       Routing by source
 *      Alexey Kuznetsov        :       End of old history. Split to fib.c and
 *                                      route.c and rewritten from scratch.
 *              Andi Kleen      :       Load-limit warning messages.
 *      Vitaly E. Lavrov        :       Transparent proxy revived after year coma.
 *      Vitaly E. Lavrov        :       Race condition in ip_route_input_slow.
 *      Tobias Ringstrom        :       Uninitialized res.type in ip_route_output_slow.
 *      Vladimir V. Ivanov      :       IP rule info (flowid) is really useful.
 *              Marc Boucher    :       routing by fwmark
 *      Robert Olsson           :       Added rt_cache statistics
 *      Arnaldo C. Melo         :       Convert proc stuff to seq_file
 *      Eric Dumazet            :       hashed spinlocks and rt_check_expire() fixes.
 *      Ilia Sotnikov           :       Ignore TOS on PMTUD and Redirect
 *      Ilia Sotnikov           :       Removed TOS from hash calculations
 *
 *              This program is free software; you can redistribute it and/or
 *              modify it under the terms of the GNU General Public License
 *              as published by the Free Software Foundation; either version
 *              2 of the License, or (at your option) any later version.
 */

#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <asm/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#include <linux/kmemleak.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#define RT_FL_TOS(oldflp4) \
        ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly  = 9;
static int ip_rt_redirect_load __read_mostly    = HZ / 50;
static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly       = HZ;
static int ip_rt_error_burst __read_mostly      = 5 * HZ;
static int ip_rt_mtu_expires __read_mostly      = 10 * 60 * HZ;
static u32 ip_rt_min_pmtu __read_mostly         = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly       = 256;

static int ip_rt_gc_timeout __read_mostly       = RT_GC_TIMEOUT;

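/*
 * For reference: ip_rt_min_pmtu is 512 + 20 + 20 = 552 bytes, i.e. 512
 * bytes of payload plus the 20-byte IPv4 and TCP headers.
 * ip_rt_redirect_silence is (HZ / 50) << (9 + 1): the base redirect
 * interval (ip_rt_redirect_load) shifted by one more than the redirect
 * limit (ip_rt_redirect_number).  Under CONFIG_SYSCTL these knobs are
 * exposed via /proc/sys/net/ipv4/route/ by the sysctl table registered
 * later in this file.
 */
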
/*
 *      Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int      ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int      ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void              ipv4_link_failure(struct sk_buff *skb);
static void              ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                                           struct sk_buff *skb, u32 mtu);
static void              ip_do_redirect(struct dst_entry *dst, struct sock *sk,
                                        struct sk_buff *skb);
static void             ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
        WARN_ON(1);
        return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr);

static struct dst_ops ipv4_dst_ops = {
        .family =               AF_INET,
        .check =                ipv4_dst_check,
        .default_advmss =       ipv4_default_advmss,
        .mtu =                  ipv4_mtu,
        .cow_metrics =          ipv4_cow_metrics,
        .destroy =              ipv4_dst_destroy,
        .negative_advice =      ipv4_negative_advice,
        .link_failure =         ipv4_link_failure,
        .update_pmtu =          ip_rt_update_pmtu,
        .redirect =             ip_do_redirect,
        .local_out =            __ip_local_out,
        .neigh_lookup =         ipv4_neigh_lookup,
};

#define ECN_OR_COST(class)      TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BESTEFFORT,
        ECN_OR_COST(BESTEFFORT),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_BULK,
        ECN_OR_COST(BULK),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE,
        ECN_OR_COST(INTERACTIVE),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK),
        TC_PRIO_INTERACTIVE_BULK,
        ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
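
/*
 * A sketch of how this table is consumed (see rt_tos2priority() in
 * <net/route.h>):
 *
 *	priority = ip_tos2prio[IPTOS_TOS(tos) >> 1];
 *
 * The three precedence-free TOS bits select one of the eight class
 * pairs; the lowest bit kept by IPTOS_TOS() (historically "minimize
 * monetary cost", nowadays part of the ECN field) picks the
 * ECN_OR_COST() variant within the pair.
 */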

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)

#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
        if (*pos)
                return NULL;
        return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        ++*pos;
        return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
        if (v == SEQ_START_TOKEN)
                seq_printf(seq, "%-127s\n",
                           "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
                           "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
                           "HHUptod\tSpecDst");
        return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
        .start  = rt_cache_seq_start,
        .next   = rt_cache_seq_next,
        .stop   = rt_cache_seq_stop,
        .show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cache_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};


static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
        int cpu;

        if (*pos == 0)
                return SEQ_START_TOKEN;

        for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
        int cpu;

        for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
                if (!cpu_possible(cpu))
                        continue;
                *pos = cpu+1;
                return &per_cpu(rt_cache_stat, cpu);
        }
        (*pos)++;
        return NULL;

}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{

}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
        struct rt_cache_stat *st = v;

        if (v == SEQ_START_TOKEN) {
                seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
                return 0;
        }

        seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
                   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
                   dst_entries_get_slow(&ipv4_dst_ops),
                   0, /* st->in_hit */
                   st->in_slow_tot,
                   st->in_slow_mc,
                   st->in_no_route,
                   st->in_brd,
                   st->in_martian_dst,
                   st->in_martian_src,

                   0, /* st->out_hit */
                   st->out_slow_tot,
                   st->out_slow_mc,

                   0, /* st->gc_total */
                   0, /* st->gc_ignored */
                   0, /* st->gc_goal_miss */
                   0, /* st->gc_dst_overflow */
                   0, /* st->in_hlist_search */
                   0  /* st->out_hlist_search */
                );
        return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
        .start  = rt_cpu_seq_start,
        .next   = rt_cpu_seq_next,
        .stop   = rt_cpu_seq_stop,
        .show   = rt_cpu_seq_show,
};


static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
        return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
        .owner   = THIS_MODULE,
        .open    = rt_cpu_seq_open,
        .read    = seq_read,
        .llseek  = seq_lseek,
        .release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
        struct ip_rt_acct *dst, *src;
        unsigned int i, j;

        dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
        if (!dst)
                return -ENOMEM;

        for_each_possible_cpu(i) {
                src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
                for (j = 0; j < 256; j++) {
                        dst[j].o_bytes   += src[j].o_bytes;
                        dst[j].o_packets += src[j].o_packets;
                        dst[j].i_bytes   += src[j].i_bytes;
                        dst[j].i_packets += src[j].i_packets;
                }
        }

        seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
        kfree(dst);
        return 0;
}

static int rt_acct_proc_open(struct inode *inode, struct file *file)
{
        return single_open(file, rt_acct_proc_show, NULL);
}

static const struct file_operations rt_acct_proc_fops = {
        .owner          = THIS_MODULE,
        .open           = rt_acct_proc_open,
        .read           = seq_read,
        .llseek         = seq_lseek,
        .release        = single_release,
};
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
        struct proc_dir_entry *pde;

        pde = proc_create("rt_cache", S_IRUGO, net->proc_net,
                          &rt_cache_seq_fops);
        if (!pde)
                goto err1;

        pde = proc_create("rt_cache", S_IRUGO,
                          net->proc_net_stat, &rt_cpu_seq_fops);
        if (!pde)
                goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
        pde = proc_create("rt_acct", 0, net->proc_net, &rt_acct_proc_fops);
        if (!pde)
                goto err3;
#endif
        return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
        remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
        remove_proc_entry("rt_cache", net->proc_net);
err1:
        return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
        remove_proc_entry("rt_cache", net->proc_net_stat);
        remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
        remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =  {
        .init = ip_rt_do_proc_init,
        .exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
        return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
        return 0;
}
#endif /* CONFIG_PROC_FS */

static inline bool rt_is_expired(const struct rtable *rth)
{
        return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
        rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
                                           struct sk_buff *skb,
                                           const void *daddr)
{
        struct net_device *dev = dst->dev;
        const __be32 *pkey = daddr;
        const struct rtable *rt;
        struct neighbour *n;

        rt = (const struct rtable *) dst;
        if (rt->rt_gateway)
                pkey = (const __be32 *) &rt->rt_gateway;
        else if (skb)
                pkey = &ip_hdr(skb)->daddr;

        n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
        if (n)
                return n;
        return neigh_create(&arp_tbl, pkey, dev);
}
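
/*
 * Summary of the lookup order above: prefer the route's cached gateway,
 * then the destination address taken from the packet itself, and only
 * then the daddr the caller passed in; if no neighbour entry exists yet,
 * one is created on the fly via neigh_create() against the ARP table.
 */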

/* Hash tables of size 2048..262144 depending on RAM size.
 * Each bucket uses 8 bytes.
 */
static u32 ip_idents_mask __read_mostly;
static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
        u32 bucket, old, now = (u32)jiffies;
        atomic_t *p_id;
        u32 *p_tstamp;
        u32 delta = 0;

        bucket = hash & ip_idents_mask;
        p_tstamp = ip_tstamps + bucket;
        p_id = ip_idents + bucket;
        old = ACCESS_ONCE(*p_tstamp);

        if (old != now && cmpxchg(p_tstamp, old, now) == old)
                delta = prandom_u32_max(now - old);

        /* If UBSAN reports an error here, please make sure your compiler
         * supports -fno-strict-overflow before reporting it; that was a
         * bug in UBSAN, and it has been fixed in GCC 8.
         */
        return atomic_add_return(segs + delta, p_id) - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
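
/*
 * Worked example (illustrative): if a bucket was last used D jiffies ago,
 * the cmpxchg() above succeeds for exactly one CPU, which then advances
 * the counter by segs plus a random delta in [0, D).  An observer probing
 * the IP-ID sequence therefore cannot distinguish idle time from real
 * traffic on that bucket.
 */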

void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
        u32 hash, id;

        /* Note: the following code is racy, but that is okay; a lost
         * race only means the key is initialized with fresh random
         * bytes more than once.
         */
        if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
                get_random_bytes(&net->ipv4.ip_id_key,
                                 sizeof(net->ipv4.ip_id_key));

        hash = siphash_3u32((__force u32)iph->daddr,
                            (__force u32)iph->saddr,
                            iph->protocol,
                            &net->ipv4.ip_id_key);
        id = ip_idents_reserve(hash, segs);
        iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);

static void __build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                             const struct iphdr *iph,
                             int oif, u8 tos,
                             u8 prot, u32 mark, int flow_flags)
{
        if (sk) {
                const struct inet_sock *inet = inet_sk(sk);

                oif = sk->sk_bound_dev_if;
                mark = sk->sk_mark;
                tos = RT_CONN_FLAGS(sk);
                prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
        }
        flowi4_init_output(fl4, oif, mark, tos,
                           RT_SCOPE_UNIVERSE, prot,
                           flow_flags,
                           iph->daddr, iph->saddr, 0, 0);
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
                               const struct sock *sk)
{
        const struct iphdr *iph = ip_hdr(skb);
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        __build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
        const struct inet_sock *inet = inet_sk(sk);
        const struct ip_options_rcu *inet_opt;
        __be32 daddr = inet->inet_daddr;

        rcu_read_lock();
        inet_opt = rcu_dereference(inet->inet_opt);
        if (inet_opt && inet_opt->opt.srr)
                daddr = inet_opt->opt.faddr;
        flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
                           RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
                           inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
                           inet_sk_flowi_flags(sk),
                           daddr, inet->inet_saddr, 0, 0);
        rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
                                 const struct sk_buff *skb)
{
        if (skb)
                build_skb_flow_key(fl4, skb, sk);
        else
                build_sk_flow_key(fl4, sk);
}

static inline void rt_free(struct rtable *rt)
{
        call_rcu(&rt->dst.rcu_head, dst_rcu_free);
}

static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
        struct rtable *rt;

        rt = rcu_dereference(fnhe->fnhe_rth_input);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
                rt_free(rt);
        }
        rt = rcu_dereference(fnhe->fnhe_rth_output);
        if (rt) {
                RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
                rt_free(rt);
        }
}

static struct fib_nh_exception *fnhe_oldest(struct fnhe_hash_bucket *hash)
{
        struct fib_nh_exception *fnhe, *oldest;

        oldest = rcu_dereference(hash->chain);
        for (fnhe = rcu_dereference(oldest->fnhe_next); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp))
                        oldest = fnhe;
        }
        fnhe_flush_routes(oldest);
        return oldest;
}

static inline u32 fnhe_hashfun(__be32 daddr)
{
        static u32 fnhe_hashrnd __read_mostly;
        u32 hval;

        net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
        hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
        return hash_32(hval, FNHE_HASH_SHIFT);
}

static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
        rt->rt_pmtu = fnhe->fnhe_pmtu;
        rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
        rt->dst.expires = fnhe->fnhe_expires;

        if (fnhe->fnhe_gw) {
                rt->rt_flags |= RTCF_REDIRECTED;
                rt->rt_gateway = fnhe->fnhe_gw;
                rt->rt_uses_gateway = 1;
        }
}

static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
                                  u32 pmtu, bool lock, unsigned long expires)
{
        struct fnhe_hash_bucket *hash;
        struct fib_nh_exception *fnhe;
        struct rtable *rt;
        u32 genid, hval;
        unsigned int i;
        int depth;

        genid = fnhe_genid(dev_net(nh->nh_dev));
        hval = fnhe_hashfun(daddr);

        spin_lock_bh(&fnhe_lock);

        hash = rcu_dereference(nh->nh_exceptions);
        if (!hash) {
                hash = kzalloc(FNHE_HASH_SIZE * sizeof(*hash), GFP_ATOMIC);
                if (!hash)
                        goto out_unlock;
                rcu_assign_pointer(nh->nh_exceptions, hash);
        }

        hash += hval;

        depth = 0;
        for (fnhe = rcu_dereference(hash->chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        break;
                depth++;
        }

        if (fnhe) {
                if (fnhe->fnhe_genid != genid)
                        fnhe->fnhe_genid = genid;
                if (gw)
                        fnhe->fnhe_gw = gw;
                if (pmtu) {
                        fnhe->fnhe_pmtu = pmtu;
                        fnhe->fnhe_mtu_locked = lock;
                }
                fnhe->fnhe_expires = max(1UL, expires);
                /* Update all cached dsts too */
                rt = rcu_dereference(fnhe->fnhe_rth_input);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
                rt = rcu_dereference(fnhe->fnhe_rth_output);
                if (rt)
                        fill_route_from_fnhe(rt, fnhe);
        } else {
                if (depth > FNHE_RECLAIM_DEPTH)
                        fnhe = fnhe_oldest(hash);
                else {
                        fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
                        if (!fnhe)
                                goto out_unlock;

                        fnhe->fnhe_next = hash->chain;
                        rcu_assign_pointer(hash->chain, fnhe);
                }
                fnhe->fnhe_genid = genid;
                fnhe->fnhe_daddr = daddr;
                fnhe->fnhe_gw = gw;
                fnhe->fnhe_pmtu = pmtu;
                fnhe->fnhe_mtu_locked = lock;
                fnhe->fnhe_expires = expires;

                /* Exception created; mark the cached routes for the nexthop
                 * stale, so anyone caching it rechecks if this exception
                 * applies to them.
                 */
                rt = rcu_dereference(nh->nh_rth_input);
                if (rt)
                        rt->dst.obsolete = DST_OBSOLETE_KILL;

                for_each_possible_cpu(i) {
                        struct rtable __rcu **prt;
                        prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
                        rt = rcu_dereference(*prt);
                        if (rt)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                }
        }

        fnhe->fnhe_stamp = jiffies;

out_unlock:
        spin_unlock_bh(&fnhe_lock);
}

static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
                             bool kill_route)
{
        __be32 new_gw = icmp_hdr(skb)->un.gateway;
        __be32 old_gw = ip_hdr(skb)->saddr;
        struct net_device *dev = skb->dev;
        struct in_device *in_dev;
        struct fib_result res;
        struct neighbour *n;
        struct net *net;

        switch (icmp_hdr(skb)->code & 7) {
        case ICMP_REDIR_NET:
        case ICMP_REDIR_NETTOS:
        case ICMP_REDIR_HOST:
        case ICMP_REDIR_HOSTTOS:
                break;

        default:
                return;
        }

        if (rt->rt_gateway != old_gw)
                return;

        in_dev = __in_dev_get_rcu(dev);
        if (!in_dev)
                return;

        net = dev_net(dev);
        if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
            ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
            ipv4_is_zeronet(new_gw))
                goto reject_redirect;

        if (!IN_DEV_SHARED_MEDIA(in_dev)) {
                if (!inet_addr_onlink(in_dev, new_gw, old_gw))
                        goto reject_redirect;
                if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
                        goto reject_redirect;
        } else {
                if (inet_addr_type(net, new_gw) != RTN_UNICAST)
                        goto reject_redirect;
        }

        n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
        if (!n)
                n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
        if (!IS_ERR(n)) {
                if (!(n->nud_state & NUD_VALID)) {
                        neigh_event_send(n, NULL);
                } else {
                        if (fib_lookup(net, fl4, &res, 0) == 0) {
                                struct fib_nh *nh = &FIB_RES_NH(res);

                                update_or_create_fnhe(nh, fl4->daddr, new_gw,
                                                0, false,
                                                jiffies + ip_rt_gc_timeout);
                        }
                        if (kill_route)
                                rt->dst.obsolete = DST_OBSOLETE_KILL;
                        call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
                }
                neigh_release(n);
        }
        return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
        if (IN_DEV_LOG_MARTIANS(in_dev)) {
                const struct iphdr *iph = (const struct iphdr *) skb->data;
                __be32 daddr = iph->daddr;
                __be32 saddr = iph->saddr;

                net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
                                     "  Advised path = %pI4 -> %pI4\n",
                                     &old_gw, dev->name, &new_gw,
                                     &saddr, &daddr);
        }
#endif
        ;
}

static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
        struct rtable *rt;
        struct flowi4 fl4;
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        int oif = skb->dev->ifindex;
        u8 tos = RT_TOS(iph->tos);
        u8 prot = iph->protocol;
        u32 mark = skb->mark;

        rt = (struct rtable *) dst;

        __build_flow_key(&fl4, sk, iph, oif, tos, prot, mark, 0);
        __ip_do_redirect(rt, skb, &fl4, true);
}

static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
        struct rtable *rt = (struct rtable *)dst;
        struct dst_entry *ret = dst;

        if (rt) {
                if (dst->obsolete > 0) {
                        ip_rt_put(rt);
                        ret = NULL;
                } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
                           rt->dst.expires) {
                        ip_rt_put(rt);
                        ret = NULL;
                }
        }
        return ret;
}

/*
 * Algorithm:
 *      1. The first ip_rt_redirect_number redirects are sent
 *         with exponential backoff, then we stop sending them at all,
 *         assuming that the host ignores our redirects.
 *      2. If we did not see packets requiring redirects
 *         during ip_rt_redirect_silence, we assume that the host
 *         has forgotten the redirected route and start sending
 *         redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */

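/*
 * Rough worked example with HZ == 1000 and the defaults above: the first
 * redirect to a peer goes out immediately; subsequent ones are spaced at
 * least ip_rt_redirect_load << n_redirects apart (40 ms, 80 ms, 160 ms,
 * ...).  After ip_rt_redirect_number (9) redirects we go silent, and only
 * a quiet period of ip_rt_redirect_silence (20 ms << 10, roughly 20 s)
 * resets the counters.
 */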
void ip_rt_send_redirect(struct sk_buff *skb)
{
        struct rtable *rt = skb_rtable(skb);
        struct in_device *in_dev;
        struct inet_peer *peer;
        struct net *net;
        int log_martians;
        int vif;

        rcu_read_lock();
        in_dev = __in_dev_get_rcu(rt->dst.dev);
        if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
                rcu_read_unlock();
                return;
        }
        log_martians = IN_DEV_LOG_MARTIANS(in_dev);
        vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
        rcu_read_unlock();

        net = dev_net(rt->dst.dev);
        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
        if (!peer) {
                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
                          rt_nexthop(rt, ip_hdr(skb)->daddr));
                return;
        }

        /* No redirected packets during ip_rt_redirect_silence;
         * reset the algorithm.
         */
        if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
                peer->rate_tokens = 0;
                peer->n_redirects = 0;
        }

        /* Too many ignored redirects; do not send anything.
         * Set peer->rate_last to the time of the last seen
         * redirected packet.
         */
        if (peer->n_redirects >= ip_rt_redirect_number) {
                peer->rate_last = jiffies;
                goto out_put_peer;
        }

        /* Check for load limit; set rate_last to the latest sent
         * redirect.
         */
        if (peer->n_redirects == 0 ||
            time_after(jiffies,
                       (peer->rate_last +
                        (ip_rt_redirect_load << peer->n_redirects)))) {
                __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

                icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
                peer->rate_last = jiffies;
                ++peer->n_redirects;
#ifdef CONFIG_IP_ROUTE_VERBOSE
                if (log_martians &&
                    peer->n_redirects == ip_rt_redirect_number)
                        net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
                                             &ip_hdr(skb)->saddr, inet_iif(skb),
                                             &ip_hdr(skb)->daddr, &gw);
#endif
        }
out_put_peer:
        inet_putpeer(peer);
}

static int ip_error(struct sk_buff *skb)
{
        struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
        struct rtable *rt = skb_rtable(skb);
        struct inet_peer *peer;
        unsigned long now;
        struct net *net;
        bool send;
        int code;

        /* IP on this device is disabled. */
        if (!in_dev)
                goto out;

        net = dev_net(rt->dst.dev);
        if (!IN_DEV_FORWARD(in_dev)) {
                switch (rt->dst.error) {
                case EHOSTUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
                        break;

                case ENETUNREACH:
                        __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                        break;
                }
                goto out;
        }

        switch (rt->dst.error) {
        case EINVAL:
        default:
                goto out;
        case EHOSTUNREACH:
                code = ICMP_HOST_UNREACH;
                break;
        case ENETUNREACH:
                code = ICMP_NET_UNREACH;
                __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
                break;
        case EACCES:
                code = ICMP_PKT_FILTERED;
                break;
        }

        peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
                               l3mdev_master_ifindex(skb->dev), 1);

        send = true;
        if (peer) {
                now = jiffies;
                peer->rate_tokens += now - peer->rate_last;
                if (peer->rate_tokens > ip_rt_error_burst)
                        peer->rate_tokens = ip_rt_error_burst;
                peer->rate_last = now;
                if (peer->rate_tokens >= ip_rt_error_cost)
                        peer->rate_tokens -= ip_rt_error_cost;
                else
                        send = false;
                inet_putpeer(peer);
        }
        if (send)
                icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:    kfree_skb(skb);
        return 0;
}

static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
        struct dst_entry *dst = &rt->dst;
        u32 old_mtu = ipv4_mtu(dst);
        struct fib_result res;
        bool lock = false;

        if (ip_mtu_locked(dst))
                return;

        if (old_mtu < mtu)
                return;

        if (mtu < ip_rt_min_pmtu) {
                lock = true;
                mtu = min(old_mtu, ip_rt_min_pmtu);
        }

        if (rt->rt_pmtu == mtu && !lock &&
            time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
                return;

        rcu_read_lock();
        if (fib_lookup(dev_net(dst->dev), fl4, &res, 0) == 0) {
                struct fib_nh *nh = &FIB_RES_NH(res);

                update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
                                      jiffies + ip_rt_mtu_expires);
        }
        rcu_read_unlock();
}

static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
                              struct sk_buff *skb, u32 mtu)
{
        struct rtable *rt = (struct rtable *) dst;
        struct flowi4 fl4;

        ip_rt_build_flow_key(&fl4, sk, skb);
        __ip_rt_update_pmtu(rt, &fl4, mtu);
}

void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
                      int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        if (!mark)
                mark = IP4_REPLY_MARK(net, skb->mark);

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);

static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

        if (!fl4.flowi4_mark)
                fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_rt_update_pmtu(rt, &fl4, mtu);
                ip_rt_put(rt);
        }
}

void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;
        struct dst_entry *odst = NULL;
        bool new = false;

        bh_lock_sock(sk);

        if (!ip_sk_accept_pmtu(sk))
                goto out;

        odst = sk_dst_get(sk);

        if (sock_owned_by_user(sk) || !odst) {
                __ipv4_sk_update_pmtu(skb, sk, mtu);
                goto out;
        }

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);

        rt = (struct rtable *)odst;
        if (odst->obsolete && !odst->ops->check(odst, 0)) {
                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        __ip_rt_update_pmtu((struct rtable *) rt->dst.path, &fl4, mtu);

        if (!dst_check(&rt->dst, 0)) {
                if (new)
                        dst_release(&rt->dst);

                rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
                if (IS_ERR(rt))
                        goto out;

                new = true;
        }

        if (new)
                sk_dst_set(sk, &rt->dst);

out:
        bh_unlock_sock(sk);
        dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);

void ipv4_redirect(struct sk_buff *skb, struct net *net,
                   int oif, u32 mark, u8 protocol, int flow_flags)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, NULL, iph, oif,
                         RT_TOS(iph->tos), protocol, mark, flow_flags);
        rt = __ip_route_output_key(net, &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_redirect);

void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
        const struct iphdr *iph = (const struct iphdr *) skb->data;
        struct flowi4 fl4;
        struct rtable *rt;

        __build_flow_key(&fl4, sk, iph, 0, 0, 0, 0, 0);
        rt = __ip_route_output_key(sock_net(sk), &fl4);
        if (!IS_ERR(rt)) {
                __ip_do_redirect(rt, skb, &fl4, false);
                ip_rt_put(rt);
        }
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
        struct rtable *rt = (struct rtable *) dst;

        /* All IPv4 dsts are created with ->obsolete set to the value
         * DST_OBSOLETE_FORCE_CHK, which forces all validation calls
         * down into this function.
         *
         * When a PMTU/redirect information update invalidates a route,
         * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
         * to DST_OBSOLETE_DEAD by dst_free().
         */
        if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
                return NULL;
        return dst;
}

static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
        struct ip_options opt;
        int res;

        /* Recompile ip options since IPCB may not be valid anymore.
         * Also check we have a reasonable ipv4 header.
         */
        if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
            ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
                return;

        memset(&opt, 0, sizeof(opt));
        if (ip_hdr(skb)->ihl > 5) {
                if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
                        return;
                opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);

                rcu_read_lock();
                res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
                rcu_read_unlock();

                if (res)
                        return;
        }
        __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
}

static void ipv4_link_failure(struct sk_buff *skb)
{
        struct rtable *rt;

        ipv4_send_dest_unreach(skb);

        rt = skb_rtable(skb);
        if (rt)
                dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
        pr_debug("%s: %pI4 -> %pI4, %s\n",
                 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
                 skb->dev ? skb->dev->name : "?");
        kfree_skb(skb);
        WARN_ON(1);
        return 0;
}

/*
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
 */

void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
        __be32 src;

        if (rt_is_output_route(rt))
                src = ip_hdr(skb)->saddr;
        else {
                struct fib_result res;
                struct flowi4 fl4;
                struct iphdr *iph;

                iph = ip_hdr(skb);

                memset(&fl4, 0, sizeof(fl4));
                fl4.daddr = iph->daddr;
                fl4.saddr = iph->saddr;
                fl4.flowi4_tos = RT_TOS(iph->tos);
                fl4.flowi4_oif = rt->dst.dev->ifindex;
                fl4.flowi4_iif = skb->dev->ifindex;
                fl4.flowi4_mark = skb->mark;

                rcu_read_lock();
                if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
                        src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
                else
                        src = inet_select_addr(rt->dst.dev,
                                               rt_nexthop(rt, iph->daddr),
                                               RT_SCOPE_UNIVERSE);
                rcu_read_unlock();
        }
        memcpy(addr, &src, 4);
}

#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
        if (!(rt->dst.tclassid & 0xFFFF))
                rt->dst.tclassid |= tag & 0xFFFF;
        if (!(rt->dst.tclassid & 0xFFFF0000))
                rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif

static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
        unsigned int advmss = dst_metric_raw(dst, RTAX_ADVMSS);

        if (advmss == 0) {
                advmss = max_t(unsigned int, dst->dev->mtu - 40,
                               ip_rt_min_advmss);
                if (advmss > 65535 - 40)
                        advmss = 65535 - 40;
        }
        return advmss;
}
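
/*
 * Illustrative arithmetic for the function above: advmss defaults to the
 * device MTU minus 40 bytes of IPv4 + TCP headers, e.g. 1500 - 40 = 1460
 * on Ethernet, floored at ip_rt_min_advmss (256) and capped at
 * 65535 - 40 = 65495.
 */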

static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
        const struct rtable *rt = (const struct rtable *) dst;
        unsigned int mtu = rt->rt_pmtu;

        if (!mtu || time_after_eq(jiffies, rt->dst.expires))
                mtu = dst_metric_raw(dst, RTAX_MTU);

        if (mtu)
                return mtu;

        mtu = READ_ONCE(dst->dev->mtu);

        if (unlikely(ip_mtu_locked(dst))) {
                if (rt->rt_uses_gateway && mtu > 576)
                        mtu = 576;
        }

        mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

        return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
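
/*
 * Selection order above, summarized: a valid (unexpired) learned PMTU
 * wins; otherwise an explicit RTAX_MTU metric; otherwise the device MTU,
 * clamped to 576 for locked routes via a gateway (the classical IPv4
 * default), capped at IP_MAX_MTU, and reduced by any lwtunnel
 * encapsulation headroom.
 */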

static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
        struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
        struct fib_nh_exception *fnhe;
        u32 hval;

        if (!hash)
                return NULL;

        hval = fnhe_hashfun(daddr);

        for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
             fnhe = rcu_dereference(fnhe->fnhe_next)) {
                if (fnhe->fnhe_daddr == daddr)
                        return fnhe;
        }
        return NULL;
}

static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
                              __be32 daddr)
{
        bool ret = false;

        spin_lock_bh(&fnhe_lock);

        if (daddr == fnhe->fnhe_daddr) {
                struct rtable __rcu **porig;
                struct rtable *orig;
                int genid = fnhe_genid(dev_net(rt->dst.dev));

                if (rt_is_input_route(rt))
                        porig = &fnhe->fnhe_rth_input;
                else
                        porig = &fnhe->fnhe_rth_output;
                orig = rcu_dereference(*porig);

                if (fnhe->fnhe_genid != genid) {
                        fnhe->fnhe_genid = genid;
                        fnhe->fnhe_gw = 0;
                        fnhe->fnhe_pmtu = 0;
                        fnhe->fnhe_expires = 0;
                        fnhe_flush_routes(fnhe);
                        orig = NULL;
                }
                fill_route_from_fnhe(rt, fnhe);
                if (!rt->rt_gateway)
                        rt->rt_gateway = daddr;

                if (!(rt->dst.flags & DST_NOCACHE)) {
                        rcu_assign_pointer(*porig, rt);
                        if (orig)
                                rt_free(orig);
                        ret = true;
                }

                fnhe->fnhe_stamp = jiffies;
        }
        spin_unlock_bh(&fnhe_lock);

        return ret;
}

static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
        struct rtable *orig, *prev, **p;
        bool ret = true;

        if (rt_is_input_route(rt)) {
                p = (struct rtable **)&nh->nh_rth_input;
        } else {
                p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
        }
        orig = *p;

        prev = cmpxchg(p, orig, rt);
        if (prev == orig) {
                if (orig)
                        rt_free(orig);
        } else
                ret = false;

        return ret;
}
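
/*
 * Design note: the cmpxchg() above publishes the new route only if the
 * per-nexthop (or per-CPU output) slot still holds the value read just
 * before; on success the displaced route is freed after an RCU grace
 * period via rt_free(), and on a lost race the caller falls back to
 * treating the route as uncached (DST_NOCACHE).
 */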

struct uncached_list {
        spinlock_t              lock;
        struct list_head        head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);

static void rt_add_uncached_list(struct rtable *rt)
{
        struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);

        rt->rt_uncached_list = ul;

        spin_lock_bh(&ul->lock);
        list_add_tail(&rt->rt_uncached, &ul->head);
        spin_unlock_bh(&ul->lock);
}

static void ipv4_dst_destroy(struct dst_entry *dst)
{
        struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
        struct rtable *rt = (struct rtable *) dst;

        if (p != &dst_default_metrics && atomic_dec_and_test(&p->refcnt))
                kfree(p);

        if (!list_empty(&rt->rt_uncached)) {
                struct uncached_list *ul = rt->rt_uncached_list;

                spin_lock_bh(&ul->lock);
                list_del(&rt->rt_uncached);
                spin_unlock_bh(&ul->lock);
        }
}

void rt_flush_dev(struct net_device *dev)
{
        struct net *net = dev_net(dev);
        struct rtable *rt;
        int cpu;

        for_each_possible_cpu(cpu) {
                struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

                spin_lock_bh(&ul->lock);
                list_for_each_entry(rt, &ul->head, rt_uncached) {
                        if (rt->dst.dev != dev)
                                continue;
                        rt->dst.dev = net->loopback_dev;
                        dev_hold(rt->dst.dev);
                        dev_put(dev);
                }
                spin_unlock_bh(&ul->lock);
        }
}

static bool rt_cache_valid(const struct rtable *rt)
{
        return  rt &&
                rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
                !rt_is_expired(rt);
}

static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
                           const struct fib_result *res,
                           struct fib_nh_exception *fnhe,
                           struct fib_info *fi, u16 type, u32 itag)
{
        bool cached = false;

        if (fi) {
                struct fib_nh *nh = &FIB_RES_NH(*res);

                if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
                        rt->rt_gateway = nh->nh_gw;
                        rt->rt_uses_gateway = 1;
                }
                dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
                if (fi->fib_metrics != &dst_default_metrics) {
                        rt->dst._metrics |= DST_METRICS_REFCOUNTED;
                        atomic_inc(&fi->fib_metrics->refcnt);
                }
#ifdef CONFIG_IP_ROUTE_CLASSID
                rt->dst.tclassid = nh->nh_tclassid;
#endif
                rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
                if (unlikely(fnhe))
                        cached = rt_bind_exception(rt, fnhe, daddr);
                else if (!(rt->dst.flags & DST_NOCACHE))
                        cached = rt_cache_route(nh, rt);
                if (unlikely(!cached)) {
                        /* Routes we intend to cache in a nexthop exception
                         * or FIB nexthop have the DST_NOCACHE bit clear.
                         * However, if we are unsuccessful at storing this
                         * route into the cache, we really need to set it.
                         */
1495                         rt->dst.flags |= DST_NOCACHE;
1496                         if (!rt->rt_gateway)
1497                                 rt->rt_gateway = daddr;
1498                         rt_add_uncached_list(rt);
1499                 }
1500         } else
1501                 rt_add_uncached_list(rt);
1502
1503 #ifdef CONFIG_IP_ROUTE_CLASSID
1504 #ifdef CONFIG_IP_MULTIPLE_TABLES
1505         set_class_tag(rt, res->tclassid);
1506 #endif
1507         set_class_tag(rt, itag);
1508 #endif
1509 }
1510
1511 struct rtable *rt_dst_alloc(struct net_device *dev,
1512                             unsigned int flags, u16 type,
1513                             bool nopolicy, bool noxfrm, bool will_cache)
1514 {
1515         struct rtable *rt;
1516
1517         rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1518                        (will_cache ? 0 : (DST_HOST | DST_NOCACHE)) |
1519                        (nopolicy ? DST_NOPOLICY : 0) |
1520                        (noxfrm ? DST_NOXFRM : 0));
1521
1522         if (rt) {
1523                 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1524                 rt->rt_flags = flags;
1525                 rt->rt_type = type;
1526                 rt->rt_is_input = 0;
1527                 rt->rt_iif = 0;
1528                 rt->rt_pmtu = 0;
1529                 rt->rt_mtu_locked = 0;
1530                 rt->rt_gateway = 0;
1531                 rt->rt_uses_gateway = 0;
1532                 rt->rt_table_id = 0;
1533                 INIT_LIST_HEAD(&rt->rt_uncached);
1534
1535                 rt->dst.output = ip_output;
1536                 if (flags & RTCF_LOCAL)
1537                         rt->dst.input = ip_local_deliver;
1538         }
1539
1540         return rt;
1541 }
1542 EXPORT_SYMBOL(rt_dst_alloc);
1543
1544 /* called in rcu_read_lock() section */
1545 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1546                                 u8 tos, struct net_device *dev, int our)
1547 {
1548         struct rtable *rth;
1549         struct in_device *in_dev = __in_dev_get_rcu(dev);
1550         unsigned int flags = RTCF_MULTICAST;
1551         u32 itag = 0;
1552         int err;
1553
1554         /* Primary sanity checks. */
1555
1556         if (!in_dev)
1557                 return -EINVAL;
1558
1559         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1560             skb->protocol != htons(ETH_P_IP))
1561                 goto e_inval;
1562
1563         if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1564                 goto e_inval;
1565
1566         if (ipv4_is_zeronet(saddr)) {
1567                 if (!ipv4_is_local_multicast(daddr))
1568                         goto e_inval;
1569         } else {
1570                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1571                                           in_dev, &itag);
1572                 if (err < 0)
1573                         goto e_err;
1574         }
1575         if (our)
1576                 flags |= RTCF_LOCAL;
1577
1578         rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1579                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1580         if (!rth)
1581                 goto e_nobufs;
1582
1583 #ifdef CONFIG_IP_ROUTE_CLASSID
1584         rth->dst.tclassid = itag;
1585 #endif
1586         rth->dst.output = ip_rt_bug;
1587         rth->rt_is_input = 1;
1588
1589 #ifdef CONFIG_IP_MROUTE
1590         if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1591                 rth->dst.input = ip_mr_input;
1592 #endif
1593         RT_CACHE_STAT_INC(in_slow_mc);
1594
1595         skb_dst_set(skb, &rth->dst);
1596         return 0;
1597
1598 e_nobufs:
1599         return -ENOBUFS;
1600 e_inval:
1601         return -EINVAL;
1602 e_err:
1603         return err;
1604 }
1605
1606
1607 static void ip_handle_martian_source(struct net_device *dev,
1608                                      struct in_device *in_dev,
1609                                      struct sk_buff *skb,
1610                                      __be32 daddr,
1611                                      __be32 saddr)
1612 {
1613         RT_CACHE_STAT_INC(in_martian_src);
1614 #ifdef CONFIG_IP_ROUTE_VERBOSE
1615         if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1616                 /*
1617                  *      RFC 1812 recommendation: if the source is martian,
1618                  *      the only hint we can log is the MAC header.
1619                  */
1620                 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1621                         &daddr, &saddr, dev->name);
1622                 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1623                         print_hex_dump(KERN_WARNING, "ll header: ",
1624                                        DUMP_PREFIX_OFFSET, 16, 1,
1625                                        skb_mac_header(skb),
1626                                        dev->hard_header_len, true);
1627                 }
1628         }
1629 #endif
1630 }
1631
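/* ip_del_fnhe - remove the exception entry for @daddr from a FIB nexthop.
 *
 * Runs under fnhe_lock; unlinks the entry from its hash chain, flushes
 * the routes cached in it, and frees it after an RCU grace period.
 */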
1632 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1633 {
1634         struct fnhe_hash_bucket *hash;
1635         struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1636         u32 hval = fnhe_hashfun(daddr);
1637
1638         spin_lock_bh(&fnhe_lock);
1639
1640         hash = rcu_dereference_protected(nh->nh_exceptions,
1641                                          lockdep_is_held(&fnhe_lock));
1642         hash += hval;
1643
1644         fnhe_p = &hash->chain;
1645         fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1646         while (fnhe) {
1647                 if (fnhe->fnhe_daddr == daddr) {
1648                         rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1649                                 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1650                         /* set fnhe_daddr to 0 to ensure it won't bind with
1651                          * new dsts in rt_bind_exception().
1652                          */
1653                         fnhe->fnhe_daddr = 0;
1654                         fnhe_flush_routes(fnhe);
1655                         kfree_rcu(fnhe, rcu);
1656                         break;
1657                 }
1658                 fnhe_p = &fnhe->fnhe_next;
1659                 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1660                                                  lockdep_is_held(&fnhe_lock));
1661         }
1662
1663         spin_unlock_bh(&fnhe_lock);
1664 }
1665
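/* __mkroute_input - build the dst for a forwarded packet.
 *
 * Validates the source address, decides whether an ICMP redirect should
 * be sent (same input/output device on shared media), and either reuses
 * a cached route from the nexthop exception or per-nexthop input cache,
 * or allocates a new rtable with ip_forward() as its input handler.
 */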
1666 /* called in rcu_read_lock() section */
1667 static int __mkroute_input(struct sk_buff *skb,
1668                            const struct fib_result *res,
1669                            struct in_device *in_dev,
1670                            __be32 daddr, __be32 saddr, u32 tos)
1671 {
1672         struct fib_nh_exception *fnhe;
1673         struct rtable *rth;
1674         int err;
1675         struct in_device *out_dev;
1676         bool do_cache;
1677         u32 itag = 0;
1678
1679         /* get a working reference to the output device */
1680         out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1681         if (!out_dev) {
1682                 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1683                 return -EINVAL;
1684         }
1685
1686         err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1687                                   in_dev->dev, in_dev, &itag);
1688         if (err < 0) {
1689                 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1690                                          saddr);
1691
1692                 goto cleanup;
1693         }
1694
1695         do_cache = res->fi && !itag;
1696         if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1697             skb->protocol == htons(ETH_P_IP) &&
1698             (IN_DEV_SHARED_MEDIA(out_dev) ||
1699              inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1700                 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1701
1702         if (skb->protocol != htons(ETH_P_IP)) {
1703                 /* Not IP (i.e. ARP). Do not create a route if it is
1704                  * invalid for proxy arp. DNAT routes are always valid.
1705                  *
1706                  * The proxy arp feature has been extended to allow ARP
1707                  * replies back out the same interface, to support
1708                  * Private VLAN switch technologies. See arp.c.
1709                  */
1710                 if (out_dev == in_dev &&
1711                     IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1712                         err = -EINVAL;
1713                         goto cleanup;
1714                 }
1715         }
1716
1717         fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1718         if (do_cache) {
1719                 if (fnhe) {
1720                         rth = rcu_dereference(fnhe->fnhe_rth_input);
1721                         if (rth && rth->dst.expires &&
1722                             time_after(jiffies, rth->dst.expires)) {
1723                                 ip_del_fnhe(&FIB_RES_NH(*res), daddr);
1724                                 fnhe = NULL;
1725                         } else {
1726                                 goto rt_cache;
1727                         }
1728                 }
1729
1730                 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1731
1732 rt_cache:
1733                 if (rt_cache_valid(rth)) {
1734                         skb_dst_set_noref(skb, &rth->dst);
1735                         goto out;
1736                 }
1737         }
1738
1739         rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1740                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
1741                            IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1742         if (!rth) {
1743                 err = -ENOBUFS;
1744                 goto cleanup;
1745         }
1746
1747         rth->rt_is_input = 1;
1748         if (res->table)
1749                 rth->rt_table_id = res->table->tb_id;
1750         RT_CACHE_STAT_INC(in_slow_tot);
1751
1752         rth->dst.input = ip_forward;
1753
1754         rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag);
1755         if (lwtunnel_output_redirect(rth->dst.lwtstate)) {
1756                 rth->dst.lwtstate->orig_output = rth->dst.output;
1757                 rth->dst.output = lwtunnel_output;
1758         }
1759         if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
1760                 rth->dst.lwtstate->orig_input = rth->dst.input;
1761                 rth->dst.input = lwtunnel_input;
1762         }
1763         skb_dst_set(skb, &rth->dst);
1764 out:
1765         err = 0;
1766  cleanup:
1767         return err;
1768 }
1769
1770 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1771
1772 /* To make ICMP packets follow the right flow, the multipath hash is
1773  * calculated from the inner IP addresses in reverse order.
1774  */
1775 static int ip_multipath_icmp_hash(struct sk_buff *skb)
1776 {
1777         const struct iphdr *outer_iph = ip_hdr(skb);
1778         struct icmphdr _icmph;
1779         const struct icmphdr *icmph;
1780         struct iphdr _inner_iph;
1781         const struct iphdr *inner_iph;
1782
1783         if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1784                 goto standard_hash;
1785
1786         icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1787                                    &_icmph);
1788         if (!icmph)
1789                 goto standard_hash;
1790
1791         if (icmph->type != ICMP_DEST_UNREACH &&
1792             icmph->type != ICMP_REDIRECT &&
1793             icmph->type != ICMP_TIME_EXCEEDED &&
1794             icmph->type != ICMP_PARAMETERPROB) {
1795                 goto standard_hash;
1796         }
1797
1798         inner_iph = skb_header_pointer(skb,
1799                                        outer_iph->ihl * 4 + sizeof(_icmph),
1800                                        sizeof(_inner_iph), &_inner_iph);
1801         if (!inner_iph)
1802                 goto standard_hash;
1803
1804         return fib_multipath_hash(inner_iph->daddr, inner_iph->saddr);
1805
1806 standard_hash:
1807         return fib_multipath_hash(outer_iph->saddr, outer_iph->daddr);
1808 }
1809
1810 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1811
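/* ip_mkroute_input - select a nexthop and build the input route.
 *
 * With CONFIG_IP_ROUTE_MULTIPATH, a multipath route is resolved here by
 * hashing the addresses (or, for ICMP errors, the embedded inner header
 * via ip_multipath_icmp_hash()) before the dst is created.
 */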
1812 static int ip_mkroute_input(struct sk_buff *skb,
1813                             struct fib_result *res,
1814                             const struct flowi4 *fl4,
1815                             struct in_device *in_dev,
1816                             __be32 daddr, __be32 saddr, u32 tos)
1817 {
1818 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1819         if (res->fi && res->fi->fib_nhs > 1) {
1820                 int h;
1821
1822                 if (unlikely(ip_hdr(skb)->protocol == IPPROTO_ICMP))
1823                         h = ip_multipath_icmp_hash(skb);
1824                 else
1825                         h = fib_multipath_hash(saddr, daddr);
1826                 fib_select_multipath(res, h);
1827         }
1828 #endif
1829
1830         /* create a routing cache entry */
1831         return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1832 }
1833
1834 /*
1835  *      NOTE. We drop all packets that have a local source
1836  *      address, because every properly looped-back packet
1837  *      must already have the correct destination attached by the output routine.
1838  *
1839  *      This approach solves two big problems:
1840  *      1. Non-simplex devices are handled properly.
1841  *      2. IP spoofing attempts are filtered with a 100% guarantee.
1842  *      called with rcu_read_lock()
1843  */
1844
1845 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1846                                u8 tos, struct net_device *dev)
1847 {
1848         struct fib_result res;
1849         struct in_device *in_dev = __in_dev_get_rcu(dev);
1850         struct ip_tunnel_info *tun_info;
1851         struct flowi4   fl4;
1852         unsigned int    flags = 0;
1853         u32             itag = 0;
1854         struct rtable   *rth;
1855         int             err = -EINVAL;
1856         struct net    *net = dev_net(dev);
1857         bool do_cache;
1858
1859         /* IP on this device is disabled. */
1860
1861         if (!in_dev)
1862                 goto out;
1863
1864         /* Check for the weirdest martians, which cannot be detected
1865            by fib_lookup.
1866          */
1867
1868         tun_info = skb_tunnel_info(skb);
1869         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1870                 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
1871         else
1872                 fl4.flowi4_tun_key.tun_id = 0;
1873         skb_dst_drop(skb);
1874
1875         if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
1876                 goto martian_source;
1877
1878         res.fi = NULL;
1879         res.table = NULL;
1880         if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
1881                 goto brd_input;
1882
1883         /* Accept zero addresses only for limited broadcast;
1884          * I do not even know whether to fix it or not. Waiting for complaints :-)
1885          */
1886         if (ipv4_is_zeronet(saddr))
1887                 goto martian_source;
1888
1889         if (ipv4_is_zeronet(daddr))
1890                 goto martian_destination;
1891
1892         /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
1893          * calling it at most once if daddr and/or saddr are loopback addresses
1894          */
1895         if (ipv4_is_loopback(daddr)) {
1896                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1897                         goto martian_destination;
1898         } else if (ipv4_is_loopback(saddr)) {
1899                 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
1900                         goto martian_source;
1901         }
1902
1903         /*
1904          *      Now we are ready to route the packet.
1905          */
1906         fl4.flowi4_oif = 0;
1907         fl4.flowi4_iif = dev->ifindex;
1908         fl4.flowi4_mark = skb->mark;
1909         fl4.flowi4_tos = tos;
1910         fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
1911         fl4.flowi4_flags = 0;
1912         fl4.daddr = daddr;
1913         fl4.saddr = saddr;
1914         err = fib_lookup(net, &fl4, &res, 0);
1915         if (err != 0) {
1916                 if (!IN_DEV_FORWARD(in_dev))
1917                         err = -EHOSTUNREACH;
1918                 goto no_route;
1919         }
1920
1921         if (res.type == RTN_BROADCAST)
1922                 goto brd_input;
1923
1924         if (res.type == RTN_LOCAL) {
1925                 err = fib_validate_source(skb, saddr, daddr, tos,
1926                                           0, dev, in_dev, &itag);
1927                 if (err < 0)
1928                         goto martian_source;
1929                 goto local_input;
1930         }
1931
1932         if (!IN_DEV_FORWARD(in_dev)) {
1933                 err = -EHOSTUNREACH;
1934                 goto no_route;
1935         }
1936         if (res.type != RTN_UNICAST)
1937                 goto martian_destination;
1938
1939         err = ip_mkroute_input(skb, &res, &fl4, in_dev, daddr, saddr, tos);
1940 out:    return err;
1941
1942 brd_input:
1943         if (skb->protocol != htons(ETH_P_IP))
1944                 goto e_inval;
1945
1946         if (!ipv4_is_zeronet(saddr)) {
1947                 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1948                                           in_dev, &itag);
1949                 if (err < 0)
1950                         goto martian_source;
1951         }
1952         flags |= RTCF_BROADCAST;
1953         res.type = RTN_BROADCAST;
1954         RT_CACHE_STAT_INC(in_brd);
1955
1956 local_input:
1957         do_cache = false;
1958         if (res.fi) {
1959                 if (!itag) {
1960                         rth = rcu_dereference(FIB_RES_NH(res).nh_rth_input);
1961                         if (rt_cache_valid(rth)) {
1962                                 skb_dst_set_noref(skb, &rth->dst);
1963                                 err = 0;
1964                                 goto out;
1965                         }
1966                         do_cache = true;
1967                 }
1968         }
1969
1970         rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
1971                            flags | RTCF_LOCAL, res.type,
1972                            IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
1973         if (!rth)
1974                 goto e_nobufs;
1975
1976         rth->dst.output = ip_rt_bug;
1977 #ifdef CONFIG_IP_ROUTE_CLASSID
1978         rth->dst.tclassid = itag;
1979 #endif
1980         rth->rt_is_input = 1;
1981         if (res.table)
1982                 rth->rt_table_id = res.table->tb_id;
1983
1984         RT_CACHE_STAT_INC(in_slow_tot);
1985         if (res.type == RTN_UNREACHABLE) {
1986                 rth->dst.input = ip_error;
1987                 rth->dst.error = -err;
1988                 rth->rt_flags   &= ~RTCF_LOCAL;
1989         }
1990         if (do_cache) {
1991                 if (unlikely(!rt_cache_route(&FIB_RES_NH(res), rth))) {
1992                         rth->dst.flags |= DST_NOCACHE;
1993                         rt_add_uncached_list(rth);
1994                 }
1995         }
1996         skb_dst_set(skb, &rth->dst);
1997         err = 0;
1998         goto out;
1999
2000 no_route:
2001         RT_CACHE_STAT_INC(in_no_route);
2002         res.type = RTN_UNREACHABLE;
2003         res.fi = NULL;
2004         res.table = NULL;
2005         goto local_input;
2006
2007         /*
2008          *      Do not cache martian addresses: they should be logged (RFC1812)
2009          */
2010 martian_destination:
2011         RT_CACHE_STAT_INC(in_martian_dst);
2012 #ifdef CONFIG_IP_ROUTE_VERBOSE
2013         if (IN_DEV_LOG_MARTIANS(in_dev))
2014                 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2015                                      &daddr, &saddr, dev->name);
2016 #endif
2017
2018 e_inval:
2019         err = -EINVAL;
2020         goto out;
2021
2022 e_nobufs:
2023         err = -ENOBUFS;
2024         goto out;
2025
2026 martian_source:
2027         ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2028         goto out;
2029 }
2030
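/* ip_route_input_noref - input route lookup without taking a dst reference.
 *
 * Runs entirely under rcu_read_lock().  Multicast destinations are
 * recognized here (see the comment below); everything else goes through
 * ip_route_input_slow().
 */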
2031 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2032                          u8 tos, struct net_device *dev)
2033 {
2034         int res;
2035
2036         tos &= IPTOS_RT_MASK;
2037         rcu_read_lock();
2038
2039         /* Multicast recognition logic was moved from the route cache to here.
2040            The problem was that too many Ethernet cards have broken/missing
2041            hardware multicast filters :-( As a result, a host on a multicast
2042            network acquires a lot of useless route cache entries, e.g. for
2043            SDR messages from all over the world. Now we try to get rid of them.
2044            Really, provided the software IP multicast filter is organized
2045            reasonably (at least, hashed), this does not result in a slowdown
2046            compared with route cache reject entries.
2047            Note that multicast routers are not affected, because a
2048            route cache entry is created eventually.
2049          */
2050         if (ipv4_is_multicast(daddr)) {
2051                 struct in_device *in_dev = __in_dev_get_rcu(dev);
2052
2053                 if (in_dev) {
2054                         int our = ip_check_mc_rcu(in_dev, daddr, saddr,
2055                                                   ip_hdr(skb)->protocol);
2056                         if (our
2057 #ifdef CONFIG_IP_MROUTE
2058                                 ||
2059                             (!ipv4_is_local_multicast(daddr) &&
2060                              IN_DEV_MFORWARD(in_dev))
2061 #endif
2062                            ) {
2063                                 int res = ip_route_input_mc(skb, daddr, saddr,
2064                                                             tos, dev, our);
2065                                 rcu_read_unlock();
2066                                 return res;
2067                         }
2068                 }
2069                 rcu_read_unlock();
2070                 return -EINVAL;
2071         }
2072         res = ip_route_input_slow(skb, daddr, saddr, tos, dev);
2073         rcu_read_unlock();
2074         return res;
2075 }
2076 EXPORT_SYMBOL(ip_route_input_noref);
2077
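/* __mkroute_output - build the dst for a locally generated packet.
 *
 * Classifies the destination (broadcast/multicast/local), then tries to
 * reuse a cached route from the nexthop exception or the per-cpu output
 * cache before falling back to allocating a fresh rtable.
 */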
2078 /* called with rcu_read_lock() */
2079 static struct rtable *__mkroute_output(const struct fib_result *res,
2080                                        const struct flowi4 *fl4, int orig_oif,
2081                                        struct net_device *dev_out,
2082                                        unsigned int flags)
2083 {
2084         struct fib_info *fi = res->fi;
2085         struct fib_nh_exception *fnhe;
2086         struct in_device *in_dev;
2087         u16 type = res->type;
2088         struct rtable *rth;
2089         bool do_cache;
2090
2091         in_dev = __in_dev_get_rcu(dev_out);
2092         if (!in_dev)
2093                 return ERR_PTR(-EINVAL);
2094
2095         if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2096                 if (ipv4_is_loopback(fl4->saddr) &&
2097                     !(dev_out->flags & IFF_LOOPBACK) &&
2098                     !netif_is_l3_master(dev_out))
2099                         return ERR_PTR(-EINVAL);
2100
2101         if (ipv4_is_lbcast(fl4->daddr))
2102                 type = RTN_BROADCAST;
2103         else if (ipv4_is_multicast(fl4->daddr))
2104                 type = RTN_MULTICAST;
2105         else if (ipv4_is_zeronet(fl4->daddr))
2106                 return ERR_PTR(-EINVAL);
2107
2108         if (dev_out->flags & IFF_LOOPBACK)
2109                 flags |= RTCF_LOCAL;
2110
2111         do_cache = true;
2112         if (type == RTN_BROADCAST) {
2113                 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2114                 fi = NULL;
2115         } else if (type == RTN_MULTICAST) {
2116                 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2117                 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2118                                      fl4->flowi4_proto))
2119                         flags &= ~RTCF_LOCAL;
2120                 else
2121                         do_cache = false;
2122                 /* If a multicast route does not exist, use the
2123                  * default one, but do not use a gateway in this case.
2124                  * Yes, it is a hack.
2125                  */
2126                 if (fi && res->prefixlen < 4)
2127                         fi = NULL;
2128         } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2129                    (orig_oif != dev_out->ifindex)) {
2130                 /* For local routes that require a particular output interface
2131                  * we do not want to cache the result.  Caching the result
2132                  * causes incorrect behaviour when there are multiple source
2133                  * addresses on the interface, the end result being that if the
2134                  * intended recipient is waiting on that interface for the
2135                  * packet, it will not be received because it will be delivered on
2136                  * the loopback interface and the IP_PKTINFO ipi_ifindex will
2137                  * be set to the loopback interface as well.
2138                  */
2139                 fi = NULL;
2140         }
2141
2142         fnhe = NULL;
2143         do_cache &= fi != NULL;
2144         if (do_cache) {
2145                 struct rtable __rcu **prth;
2146                 struct fib_nh *nh = &FIB_RES_NH(*res);
2147
2148                 fnhe = find_exception(nh, fl4->daddr);
2149                 if (fnhe) {
2150                         prth = &fnhe->fnhe_rth_output;
2151                         rth = rcu_dereference(*prth);
2152                         if (rth && rth->dst.expires &&
2153                             time_after(jiffies, rth->dst.expires)) {
2154                                 ip_del_fnhe(nh, fl4->daddr);
2155                                 fnhe = NULL;
2156                         } else {
2157                                 goto rt_cache;
2158                         }
2159                 }
2160
2161                 if (unlikely(fl4->flowi4_flags &
2162                              FLOWI_FLAG_KNOWN_NH &&
2163                              !(nh->nh_gw &&
2164                                nh->nh_scope == RT_SCOPE_LINK))) {
2165                         do_cache = false;
2166                         goto add;
2167                 }
2168                 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2169                 rth = rcu_dereference(*prth);
2170
2171 rt_cache:
2172                 if (rt_cache_valid(rth)) {
2173                         dst_hold(&rth->dst);
2174                         return rth;
2175                 }
2176         }
2177
2178 add:
2179         rth = rt_dst_alloc(dev_out, flags, type,
2180                            IN_DEV_CONF_GET(in_dev, NOPOLICY),
2181                            IN_DEV_CONF_GET(in_dev, NOXFRM),
2182                            do_cache);
2183         if (!rth)
2184                 return ERR_PTR(-ENOBUFS);
2185
2186         rth->rt_iif     = orig_oif ? : 0;
2187         if (res->table)
2188                 rth->rt_table_id = res->table->tb_id;
2189
2190         RT_CACHE_STAT_INC(out_slow_tot);
2191
2192         if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2193                 if (flags & RTCF_LOCAL &&
2194                     !(dev_out->flags & IFF_LOOPBACK)) {
2195                         rth->dst.output = ip_mc_output;
2196                         RT_CACHE_STAT_INC(out_slow_mc);
2197                 }
2198 #ifdef CONFIG_IP_MROUTE
2199                 if (type == RTN_MULTICAST) {
2200                         if (IN_DEV_MFORWARD(in_dev) &&
2201                             !ipv4_is_local_multicast(fl4->daddr)) {
2202                                 rth->dst.input = ip_mr_input;
2203                                 rth->dst.output = ip_mc_output;
2204                         }
2205                 }
2206 #endif
2207         }
2208
2209         rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0);
2210         if (lwtunnel_output_redirect(rth->dst.lwtstate))
2211                 rth->dst.output = lwtunnel_output;
2212
2213         return rth;
2214 }
2215
2216 /*
2217  * Major route resolver routine.
2218  */
2219
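/* Resolves an output route for @fl4: validates/derives the source address
 * and output device, consults the FIB, and finally calls
 * __mkroute_output() under rcu_read_lock().
 */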
2220 struct rtable *__ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2221                                           int mp_hash)
2222 {
2223         struct net_device *dev_out = NULL;
2224         __u8 tos = RT_FL_TOS(fl4);
2225         unsigned int flags = 0;
2226         struct fib_result res;
2227         struct rtable *rth;
2228         int orig_oif;
2229         int err;
2230
2231         res.tclassid    = 0;
2232         res.fi          = NULL;
2233         res.table       = NULL;
2234
2235         orig_oif = fl4->flowi4_oif;
2236
2237         fl4->flowi4_iif = LOOPBACK_IFINDEX;
2238         fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2239         fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2240                          RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2241
2242         rcu_read_lock();
2243         if (fl4->saddr) {
2244                 if (ipv4_is_multicast(fl4->saddr) ||
2245                     ipv4_is_lbcast(fl4->saddr) ||
2246                     ipv4_is_zeronet(fl4->saddr)) {
2247                         rth = ERR_PTR(-EINVAL);
2248                         goto out;
2249                 }
2250
2251                 rth = ERR_PTR(-ENETUNREACH);
2252
2253                 /* I removed the check for oif == dev_out->oif here.
2254                    It was wrong for two reasons:
2255                    1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2256                       is assigned to multiple interfaces.
2257                    2. Moreover, we are allowed to send packets with the saddr
2258                       of another iface. --ANK
2259                  */
2260
2261                 if (fl4->flowi4_oif == 0 &&
2262                     (ipv4_is_multicast(fl4->daddr) ||
2263                      ipv4_is_lbcast(fl4->daddr))) {
2264                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2265                         dev_out = __ip_dev_find(net, fl4->saddr, false);
2266                         if (!dev_out)
2267                                 goto out;
2268
2269                         /* Special hack: the user can direct multicasts
2270                            and limited broadcasts via the necessary interface
2271                            without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2272                            This hack is not just for fun, it allows
2273                            vic, vat and friends to work.
2274                            They bind a socket to loopback, set ttl to zero
2275                            and expect that it will work.
2276                            From the viewpoint of the routing cache they are broken,
2277                            because we are not allowed to build a multicast path
2278                            with a loopback source addr (look, the routing cache
2279                            cannot know that ttl is zero, so the packet
2280                            will not leave this host and the route looks valid).
2281                            Luckily, this hack is a good workaround.
2282                          */
2283
2284                         fl4->flowi4_oif = dev_out->ifindex;
2285                         goto make_route;
2286                 }
2287
2288                 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2289                         /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2290                         if (!__ip_dev_find(net, fl4->saddr, false))
2291                                 goto out;
2292                 }
2293         }
2294
2295
2296         if (fl4->flowi4_oif) {
2297                 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2298                 rth = ERR_PTR(-ENODEV);
2299                 if (!dev_out)
2300                         goto out;
2301
2302                 /* RACE: Check return value of inet_select_addr instead. */
2303                 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2304                         rth = ERR_PTR(-ENETUNREACH);
2305                         goto out;
2306                 }
2307                 if (ipv4_is_local_multicast(fl4->daddr) ||
2308                     ipv4_is_lbcast(fl4->daddr) ||
2309                     fl4->flowi4_proto == IPPROTO_IGMP) {
2310                         if (!fl4->saddr)
2311                                 fl4->saddr = inet_select_addr(dev_out, 0,
2312                                                               RT_SCOPE_LINK);
2313                         goto make_route;
2314                 }
2315                 if (!fl4->saddr) {
2316                         if (ipv4_is_multicast(fl4->daddr))
2317                                 fl4->saddr = inet_select_addr(dev_out, 0,
2318                                                               fl4->flowi4_scope);
2319                         else if (!fl4->daddr)
2320                                 fl4->saddr = inet_select_addr(dev_out, 0,
2321                                                               RT_SCOPE_HOST);
2322                 }
2323         }
2324
2325         if (!fl4->daddr) {
2326                 fl4->daddr = fl4->saddr;
2327                 if (!fl4->daddr)
2328                         fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2329                 dev_out = net->loopback_dev;
2330                 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2331                 res.type = RTN_LOCAL;
2332                 flags |= RTCF_LOCAL;
2333                 goto make_route;
2334         }
2335
2336         err = fib_lookup(net, fl4, &res, 0);
2337         if (err) {
2338                 res.fi = NULL;
2339                 res.table = NULL;
2340                 if (fl4->flowi4_oif &&
2341                     !netif_index_is_l3_master(net, fl4->flowi4_oif)) {
2342                         /* Apparently, the routing tables are wrong. Assume
2343                            that the destination is on-link.
2344
2345                            WHY? DW.
2346                            Because we are allowed to send to an iface
2347                            even if it has NO routes and NO assigned
2348                            addresses. When oif is specified, the routing
2349                            tables are looked up with only one purpose:
2350                            to catch whether the destination is gatewayed, rather
2351                            than direct. Moreover, if MSG_DONTROUTE is set,
2352                            we send the packet, ignoring both the routing tables
2353                            and ifaddr state. --ANK
2354
2355
2356                            We could do this even if oif is unknown,
2357                            as IPv6 likely does, but we do not.
2358                          */
2359
2360                         if (fl4->saddr == 0)
2361                                 fl4->saddr = inet_select_addr(dev_out, 0,
2362                                                               RT_SCOPE_LINK);
2363                         res.type = RTN_UNICAST;
2364                         goto make_route;
2365                 }
2366                 rth = ERR_PTR(err);
2367                 goto out;
2368         }
2369
2370         if (res.type == RTN_LOCAL) {
2371                 if (!fl4->saddr) {
2372                         if (res.fi->fib_prefsrc)
2373                                 fl4->saddr = res.fi->fib_prefsrc;
2374                         else
2375                                 fl4->saddr = fl4->daddr;
2376                 }
2377
2378                 /* L3 master device is the loopback for that domain */
2379                 dev_out = l3mdev_master_dev_rcu(dev_out) ? : net->loopback_dev;
2380                 fl4->flowi4_oif = dev_out->ifindex;
2381                 flags |= RTCF_LOCAL;
2382                 goto make_route;
2383         }
2384
2385         fib_select_path(net, &res, fl4, mp_hash);
2386
2387         dev_out = FIB_RES_DEV(res);
2388         fl4->flowi4_oif = dev_out->ifindex;
2389
2390
2391 make_route:
2392         rth = __mkroute_output(&res, fl4, orig_oif, dev_out, flags);
2393
2394 out:
2395         rcu_read_unlock();
2396         return rth;
2397 }
2398 EXPORT_SYMBOL_GPL(__ip_route_output_key_hash);
2399
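/* The dst_ops below back "blackhole" routes: dsts that silently discard
 * traffic.  check() returns NULL so a blackhole dst is never considered
 * valid on revalidation, and update_pmtu()/redirect()/cow_metrics() are
 * deliberate no-ops.
 */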
2400 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2401 {
2402         return NULL;
2403 }
2404
2405 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2406 {
2407         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2408
2409         return mtu ? : dst->dev->mtu;
2410 }
2411
2412 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2413                                           struct sk_buff *skb, u32 mtu)
2414 {
2415 }
2416
2417 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2418                                        struct sk_buff *skb)
2419 {
2420 }
2421
2422 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2423                                           unsigned long old)
2424 {
2425         return NULL;
2426 }
2427
2428 static struct dst_ops ipv4_dst_blackhole_ops = {
2429         .family                 =       AF_INET,
2430         .check                  =       ipv4_blackhole_dst_check,
2431         .mtu                    =       ipv4_blackhole_mtu,
2432         .default_advmss         =       ipv4_default_advmss,
2433         .update_pmtu            =       ipv4_rt_blackhole_update_pmtu,
2434         .redirect               =       ipv4_rt_blackhole_redirect,
2435         .cow_metrics            =       ipv4_rt_blackhole_cow_metrics,
2436         .neigh_lookup           =       ipv4_neigh_lookup,
2437 };
2438
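/* Clone @dst_orig into a blackhole dst whose input/output handlers
 * discard all packets, preserving the original's routing fields.  The
 * original dst's reference is dropped.  Used, e.g., by xfrm when packets
 * must be held back while security associations are being negotiated.
 */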
2439 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2440 {
2441         struct rtable *ort = (struct rtable *) dst_orig;
2442         struct rtable *rt;
2443
2444         rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_NONE, 0);
2445         if (rt) {
2446                 struct dst_entry *new = &rt->dst;
2447
2448                 new->__use = 1;
2449                 new->input = dst_discard;
2450                 new->output = dst_discard_out;
2451
2452                 new->dev = ort->dst.dev;
2453                 if (new->dev)
2454                         dev_hold(new->dev);
2455
2456                 rt->rt_is_input = ort->rt_is_input;
2457                 rt->rt_iif = ort->rt_iif;
2458                 rt->rt_pmtu = ort->rt_pmtu;
2459                 rt->rt_mtu_locked = ort->rt_mtu_locked;
2460
2461                 rt->rt_genid = rt_genid_ipv4(net);
2462                 rt->rt_flags = ort->rt_flags;
2463                 rt->rt_type = ort->rt_type;
2464                 rt->rt_gateway = ort->rt_gateway;
2465                 rt->rt_uses_gateway = ort->rt_uses_gateway;
2466
2467                 INIT_LIST_HEAD(&rt->rt_uncached);
2468                 dst_free(new);
2469         }
2470
2471         dst_release(dst_orig);
2472
2473         return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2474 }
2475
2476 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2477                                     const struct sock *sk)
2478 {
2479         struct rtable *rt = __ip_route_output_key(net, flp4);
2480
2481         if (IS_ERR(rt))
2482                 return rt;
2483
2484         if (flp4->flowi4_proto)
2485                 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2486                                                         flowi4_to_flowi(flp4),
2487                                                         sk, 0);
2488
2489         return rt;
2490 }
2491 EXPORT_SYMBOL_GPL(ip_route_output_flow);
2492
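/* rt_fill_info - dump one route into an RTM_NEWROUTE netlink message.
 *
 * Emits the rtmsg header plus the RTA_DST/RTA_SRC/RTA_OIF/RTA_GATEWAY,
 * metrics and cacheinfo attributes; returns -EMSGSIZE if the skb runs
 * out of room.
 */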
2493 static int rt_fill_info(struct net *net,  __be32 dst, __be32 src, u32 table_id,
2494                         struct flowi4 *fl4, struct sk_buff *skb, u32 portid,
2495                         u32 seq, int event, int nowait, unsigned int flags)
2496 {
2497         struct rtable *rt = skb_rtable(skb);
2498         struct rtmsg *r;
2499         struct nlmsghdr *nlh;
2500         unsigned long expires = 0;
2501         u32 error;
2502         u32 metrics[RTAX_MAX];
2503
2504         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*r), flags);
2505         if (!nlh)
2506                 return -EMSGSIZE;
2507
2508         r = nlmsg_data(nlh);
2509         r->rtm_family    = AF_INET;
2510         r->rtm_dst_len  = 32;
2511         r->rtm_src_len  = 0;
2512         r->rtm_tos      = fl4->flowi4_tos;
2513         r->rtm_table    = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2514         if (nla_put_u32(skb, RTA_TABLE, table_id))
2515                 goto nla_put_failure;
2516         r->rtm_type     = rt->rt_type;
2517         r->rtm_scope    = RT_SCOPE_UNIVERSE;
2518         r->rtm_protocol = RTPROT_UNSPEC;
2519         r->rtm_flags    = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2520         if (rt->rt_flags & RTCF_NOTIFY)
2521                 r->rtm_flags |= RTM_F_NOTIFY;
2522         if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2523                 r->rtm_flags |= RTCF_DOREDIRECT;
2524
2525         if (nla_put_in_addr(skb, RTA_DST, dst))
2526                 goto nla_put_failure;
2527         if (src) {
2528                 r->rtm_src_len = 32;
2529                 if (nla_put_in_addr(skb, RTA_SRC, src))
2530                         goto nla_put_failure;
2531         }
2532         if (rt->dst.dev &&
2533             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2534                 goto nla_put_failure;
2535 #ifdef CONFIG_IP_ROUTE_CLASSID
2536         if (rt->dst.tclassid &&
2537             nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2538                 goto nla_put_failure;
2539 #endif
2540         if (!rt_is_input_route(rt) &&
2541             fl4->saddr != src) {
2542                 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2543                         goto nla_put_failure;
2544         }
2545         if (rt->rt_uses_gateway &&
2546             nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2547                 goto nla_put_failure;
2548
2549         expires = rt->dst.expires;
2550         if (expires) {
2551                 unsigned long now = jiffies;
2552
2553                 if (time_before(now, expires))
2554                         expires -= now;
2555                 else
2556                         expires = 0;
2557         }
2558
2559         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2560         if (rt->rt_pmtu && expires)
2561                 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2562         if (rt->rt_mtu_locked && expires)
2563                 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2564         if (rtnetlink_put_metrics(skb, metrics) < 0)
2565                 goto nla_put_failure;
2566
2567         if (fl4->flowi4_mark &&
2568             nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2569                 goto nla_put_failure;
2570
2571         error = rt->dst.error;
2572
2573         if (rt_is_input_route(rt)) {
2574 #ifdef CONFIG_IP_MROUTE
2575                 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2576                     IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2577                         int err = ipmr_get_route(net, skb,
2578                                                  fl4->saddr, fl4->daddr,
2579                                                  r, nowait, portid);
2580
2581                         if (err <= 0) {
2582                                 if (!nowait) {
2583                                         if (err == 0)
2584                                                 return 0;
2585                                         goto nla_put_failure;
2586                                 } else {
2587                                         if (err == -EMSGSIZE)
2588                                                 goto nla_put_failure;
2589                                         error = err;
2590                                 }
2591                         }
2592                 } else
2593 #endif
2594                         if (nla_put_u32(skb, RTA_IIF, skb->dev->ifindex))
2595                                 goto nla_put_failure;
2596         }
2597
2598         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2599                 goto nla_put_failure;
2600
2601         nlmsg_end(skb, nlh);
2602         return 0;
2603
2604 nla_put_failure:
2605         nlmsg_cancel(skb, nlh);
2606         return -EMSGSIZE;
2607 }
2608
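/* inet_rtm_getroute - handle RTM_GETROUTE requests.
 *
 * Builds a dummy skb, performs either an input-path lookup (if RTA_IIF
 * is given) or an output lookup, and replies to the requester with the
 * result via rt_fill_info().
 */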
2609 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
2610 {
2611         struct net *net = sock_net(in_skb->sk);
2612         struct rtmsg *rtm;
2613         struct nlattr *tb[RTA_MAX+1];
2614         struct rtable *rt = NULL;
2615         struct flowi4 fl4;
2616         __be32 dst = 0;
2617         __be32 src = 0;
2618         u32 iif;
2619         int err;
2620         int mark;
2621         struct sk_buff *skb;
2622         u32 table_id = RT_TABLE_MAIN;
2623
2624         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy);
2625         if (err < 0)
2626                 goto errout;
2627
2628         rtm = nlmsg_data(nlh);
2629
2630         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2631         if (!skb) {
2632                 err = -ENOBUFS;
2633                 goto errout;
2634         }
2635
2636         /* Reserve room for dummy headers; this skb can pass
2637            through a good chunk of the routing engine.
2638          */
2639         skb_reset_mac_header(skb);
2640         skb_reset_network_header(skb);
2641
2642         /* Bugfix: need to give ip_route_input enough of an IP header to not gag. */
2643         ip_hdr(skb)->protocol = IPPROTO_UDP;
2644         skb_reserve(skb, MAX_HEADER + sizeof(struct iphdr));
2645
2646         src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2647         dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2648         iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2649         mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2650
2651         memset(&fl4, 0, sizeof(fl4));
2652         fl4.daddr = dst;
2653         fl4.saddr = src;
2654         fl4.flowi4_tos = rtm->rtm_tos;
2655         fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2656         fl4.flowi4_mark = mark;
2657
2658         if (iif) {
2659                 struct net_device *dev;
2660
2661                 dev = __dev_get_by_index(net, iif);
2662                 if (!dev) {
2663                         err = -ENODEV;
2664                         goto errout_free;
2665                 }
2666
2667                 skb->protocol   = htons(ETH_P_IP);
2668                 skb->dev        = dev;
2669                 skb->mark       = mark;
2670                 local_bh_disable();
2671                 err = ip_route_input(skb, dst, src, rtm->rtm_tos, dev);
2672                 local_bh_enable();
2673
2674                 rt = skb_rtable(skb);
2675                 if (err == 0 && rt->dst.error)
2676                         err = -rt->dst.error;
2677         } else {
2678                 rt = ip_route_output_key(net, &fl4);
2679
2680                 err = 0;
2681                 if (IS_ERR(rt))
2682                         err = PTR_ERR(rt);
2683         }
2684
2685         if (err)
2686                 goto errout_free;
2687
2688         skb_dst_set(skb, &rt->dst);
2689         if (rtm->rtm_flags & RTM_F_NOTIFY)
2690                 rt->rt_flags |= RTCF_NOTIFY;
2691
2692         if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2693                 table_id = rt->rt_table_id;
2694
2695         err = rt_fill_info(net, dst, src, table_id, &fl4, skb,
2696                            NETLINK_CB(in_skb).portid, nlh->nlmsg_seq,
2697                            RTM_NEWROUTE, 0, 0);
2698         if (err < 0)
2699                 goto errout_free;
2700
2701         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2702 errout:
2703         return err;
2704
2705 errout_free:
2706         kfree_skb(skb);
2707         goto errout;
2708 }
2709
2710 void ip_rt_multicast_event(struct in_device *in_dev)
2711 {
2712         rt_cache_flush(dev_net(in_dev->dev));
2713 }
2714
2715 #ifdef CONFIG_SYSCTL
2716 static int ip_rt_gc_interval __read_mostly  = 60 * HZ;
2717 static int ip_rt_gc_min_interval __read_mostly  = HZ / 2;
2718 static int ip_rt_gc_elasticity __read_mostly    = 8;
2719 static int ip_min_valid_pmtu __read_mostly      = IPV4_MIN_MTU;
2720
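/* Writing anything to /proc/sys/net/ipv4/route/flush invalidates the
 * per-namespace routing cache by bumping both generation counters;
 * reads are rejected.
 */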
2721 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2722                                         void __user *buffer,
2723                                         size_t *lenp, loff_t *ppos)
2724 {
2725         struct net *net = (struct net *)__ctl->extra1;
2726
2727         if (write) {
2728                 rt_cache_flush(net);
2729                 fnhe_genid_bump(net);
2730                 return 0;
2731         }
2732
2733         return -EINVAL;
2734 }
2735
2736 static struct ctl_table ipv4_route_table[] = {
2737         {
2738                 .procname       = "gc_thresh",
2739                 .data           = &ipv4_dst_ops.gc_thresh,
2740                 .maxlen         = sizeof(int),
2741                 .mode           = 0644,
2742                 .proc_handler   = proc_dointvec,
2743         },
2744         {
2745                 .procname       = "max_size",
2746                 .data           = &ip_rt_max_size,
2747                 .maxlen         = sizeof(int),
2748                 .mode           = 0644,
2749                 .proc_handler   = proc_dointvec,
2750         },
2751         {
2752                 /*  Deprecated. Use gc_min_interval_ms */
2753
2754                 .procname       = "gc_min_interval",
2755                 .data           = &ip_rt_gc_min_interval,
2756                 .maxlen         = sizeof(int),
2757                 .mode           = 0644,
2758                 .proc_handler   = proc_dointvec_jiffies,
2759         },
2760         {
2761                 .procname       = "gc_min_interval_ms",
2762                 .data           = &ip_rt_gc_min_interval,
2763                 .maxlen         = sizeof(int),
2764                 .mode           = 0644,
2765                 .proc_handler   = proc_dointvec_ms_jiffies,
2766         },
2767         {
2768                 .procname       = "gc_timeout",
2769                 .data           = &ip_rt_gc_timeout,
2770                 .maxlen         = sizeof(int),
2771                 .mode           = 0644,
2772                 .proc_handler   = proc_dointvec_jiffies,
2773         },
2774         {
2775                 .procname       = "gc_interval",
2776                 .data           = &ip_rt_gc_interval,
2777                 .maxlen         = sizeof(int),
2778                 .mode           = 0644,
2779                 .proc_handler   = proc_dointvec_jiffies,
2780         },
2781         {
2782                 .procname       = "redirect_load",
2783                 .data           = &ip_rt_redirect_load,
2784                 .maxlen         = sizeof(int),
2785                 .mode           = 0644,
2786                 .proc_handler   = proc_dointvec,
2787         },
2788         {
2789                 .procname       = "redirect_number",
2790                 .data           = &ip_rt_redirect_number,
2791                 .maxlen         = sizeof(int),
2792                 .mode           = 0644,
2793                 .proc_handler   = proc_dointvec,
2794         },
2795         {
2796                 .procname       = "redirect_silence",
2797                 .data           = &ip_rt_redirect_silence,
2798                 .maxlen         = sizeof(int),
2799                 .mode           = 0644,
2800                 .proc_handler   = proc_dointvec,
2801         },
2802         {
2803                 .procname       = "error_cost",
2804                 .data           = &ip_rt_error_cost,
2805                 .maxlen         = sizeof(int),
2806                 .mode           = 0644,
2807                 .proc_handler   = proc_dointvec,
2808         },
2809         {
2810                 .procname       = "error_burst",
2811                 .data           = &ip_rt_error_burst,
2812                 .maxlen         = sizeof(int),
2813                 .mode           = 0644,
2814                 .proc_handler   = proc_dointvec,
2815         },
2816         {
2817                 .procname       = "gc_elasticity",
2818                 .data           = &ip_rt_gc_elasticity,
2819                 .maxlen         = sizeof(int),
2820                 .mode           = 0644,
2821                 .proc_handler   = proc_dointvec,
2822         },
2823         {
2824                 .procname       = "mtu_expires",
2825                 .data           = &ip_rt_mtu_expires,
2826                 .maxlen         = sizeof(int),
2827                 .mode           = 0644,
2828                 .proc_handler   = proc_dointvec_jiffies,
2829         },
2830         {
2831                 .procname       = "min_pmtu",
2832                 .data           = &ip_rt_min_pmtu,
2833                 .maxlen         = sizeof(int),
2834                 .mode           = 0644,
2835                 .proc_handler   = proc_dointvec_minmax,
2836                 .extra1         = &ip_min_valid_pmtu,
2837         },
2838         {
2839                 .procname       = "min_adv_mss",
2840                 .data           = &ip_rt_min_advmss,
2841                 .maxlen         = sizeof(int),
2842                 .mode           = 0644,
2843                 .proc_handler   = proc_dointvec,
2844         },
2845         { }
2846 };
2847
2848 static struct ctl_table ipv4_route_flush_table[] = {
2849         {
2850                 .procname       = "flush",
2851                 .maxlen         = sizeof(int),
2852                 .mode           = 0200,
2853                 .proc_handler   = ipv4_sysctl_rtcache_flush,
2854         },
2855         { },
2856 };
2857
2858 static __net_init int sysctl_route_net_init(struct net *net)
2859 {
2860         struct ctl_table *tbl;
2861
2862         tbl = ipv4_route_flush_table;
2863         if (!net_eq(net, &init_net)) {
2864                 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
2865                 if (!tbl)
2866                         goto err_dup;
2867
2868                 /* Don't export sysctls to unprivileged users */
2869                 if (net->user_ns != &init_user_ns)
2870                         tbl[0].procname = NULL;
2871         }
2872         tbl[0].extra1 = net;
2873
2874         net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
2875         if (!net->ipv4.route_hdr)
2876                 goto err_reg;
2877         return 0;
2878
2879 err_reg:
2880         if (tbl != ipv4_route_flush_table)
2881                 kfree(tbl);
2882 err_dup:
2883         return -ENOMEM;
2884 }
2885
2886 static __net_exit void sysctl_route_net_exit(struct net *net)
2887 {
2888         struct ctl_table *tbl;
2889
2890         tbl = net->ipv4.route_hdr->ctl_table_arg;
2891         unregister_net_sysctl_table(net->ipv4.route_hdr);
2892         BUG_ON(tbl == ipv4_route_flush_table);
2893         kfree(tbl);
2894 }
2895
2896 static __net_initdata struct pernet_operations sysctl_route_ops = {
2897         .init = sysctl_route_net_init,
2898         .exit = sysctl_route_net_exit,
2899 };
2900 #endif
2901
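/* Per-namespace init: reset the route and fnhe generation counters and
 * randomize dev_addr_genid.
 */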
2902 static __net_init int rt_genid_init(struct net *net)
2903 {
2904         atomic_set(&net->ipv4.rt_genid, 0);
2905         atomic_set(&net->fnhe_genid, 0);
2906         get_random_bytes(&net->ipv4.dev_addr_genid,
2907                          sizeof(net->ipv4.dev_addr_genid));
2908         return 0;
2909 }
2910
2911 static __net_initdata struct pernet_operations rt_genid_ops = {
2912         .init = rt_genid_init,
2913 };
2914
2915 static int __net_init ipv4_inetpeer_init(struct net *net)
2916 {
2917         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
2918
2919         if (!bp)
2920                 return -ENOMEM;
2921         inet_peer_base_init(bp);
2922         net->ipv4.peers = bp;
2923         return 0;
2924 }
2925
2926 static void __net_exit ipv4_inetpeer_exit(struct net *net)
2927 {
2928         struct inet_peer_base *bp = net->ipv4.peers;
2929
2930         net->ipv4.peers = NULL;
2931         inetpeer_invalidate_tree(bp);
2932         kfree(bp);
2933 }
2934
2935 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
2936         .init   =       ipv4_inetpeer_init,
2937         .exit   =       ipv4_inetpeer_exit,
2938 };
2939
2940 #ifdef CONFIG_IP_ROUTE_CLASSID
2941 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
2942 #endif /* CONFIG_IP_ROUTE_CLASSID */
2943
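/* Boot-time init: sets up the IP identifier hash, the per-cpu uncached
 * lists, the dst slab caches and counters, proc files, the RTM_GETROUTE
 * handler and the pernet subsystems.  Route-cache GC is effectively
 * disabled (gc_thresh = ~0, ip_rt_max_size = INT_MAX).
 */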
2944 int __init ip_rt_init(void)
2945 {
2946         void *idents_hash;
2947         int rc = 0;
2948         int cpu;
2949
2950         /* For modern hosts, this will use 2 MB of memory */
2951         idents_hash = alloc_large_system_hash("IP idents",
2952                                               sizeof(*ip_idents) + sizeof(*ip_tstamps),
2953                                               0,
2954                                               16, /* one bucket per 64 KB */
2955                                               0,
2956                                               NULL,
2957                                               &ip_idents_mask,
2958                                               2048,
2959                                               256*1024);
2960
2961         ip_idents = idents_hash;
2962
2963         prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
2964
2965         ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
2966         memset(ip_tstamps, 0, (ip_idents_mask + 1) * sizeof(*ip_tstamps));
2967
2968         for_each_possible_cpu(cpu) {
2969                 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
2970
2971                 INIT_LIST_HEAD(&ul->head);
2972                 spin_lock_init(&ul->lock);
2973         }
2974 #ifdef CONFIG_IP_ROUTE_CLASSID
2975         ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
2976         if (!ip_rt_acct)
2977                 panic("IP: failed to allocate ip_rt_acct\n");
2978 #endif
2979
2980         ipv4_dst_ops.kmem_cachep =
2981                 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
2982                                   SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
2983
2984         ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
2985
2986         if (dst_entries_init(&ipv4_dst_ops) < 0)
2987                 panic("IP: failed to allocate ipv4_dst_ops counter\n");
2988
2989         if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
2990                 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
2991
2992         ipv4_dst_ops.gc_thresh = ~0;
2993         ip_rt_max_size = INT_MAX;
2994
2995         devinet_init();
2996         ip_fib_init();
2997
2998         if (ip_rt_proc_init())
2999                 pr_err("Unable to create route proc files\n");
3000 #ifdef CONFIG_XFRM
3001         xfrm_init();
3002         xfrm4_init();
3003 #endif
3004         rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, NULL);
3005
3006 #ifdef CONFIG_SYSCTL
3007         register_pernet_subsys(&sysctl_route_ops);
3008 #endif
3009         register_pernet_subsys(&rt_genid_ops);
3010         register_pernet_subsys(&ipv4_inetpeer_ops);
3011         return rc;
3012 }
3013
3014 #ifdef CONFIG_SYSCTL
3015 /*
3016  * We really need to sanitize the damn ipv4 init order, then all
3017  * this nonsense will go away.
3018  */
3019 void __init ip_static_sysctl_init(void)
3020 {
3021         register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3022 }
3023 #endif