2 * INET An implementation of the TCP/IP protocol suite for the LINUX
3 * operating system. INET is implemented using the BSD Socket
4 * interface as the means of communication with the user level.
6 * ROUTE - implementation of the IP router.
9 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
10 * Alan Cox, <gw4pts@gw4pts.ampr.org>
11 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
12 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
15 * Alan Cox : Verify area fixes.
16 * Alan Cox : cli() protects routing changes
17 * Rui Oliveira : ICMP routing table updates
18 * (rco@di.uminho.pt) Routing table insertion and update
19 * Linus Torvalds : Rewrote bits to be sensible
20 * Alan Cox : Added BSD route gw semantics
21 * Alan Cox : Super /proc >4K
22 * Alan Cox : MTU in route table
23 * Alan Cox : MSS actually. Also added the window clamping.
25 * Sam Lantinga : Fixed route matching in rt_del()
26 * Alan Cox : Routing cache support.
27 * Alan Cox : Removed compatibility cruft.
28 * Alan Cox : RTF_REJECT support.
29 * Alan Cox : TCP irtt support.
30 * Jonathan Naylor : Added Metric support.
31 * Miquel van Smoorenburg : BSD API fixes.
32 * Miquel van Smoorenburg : Metrics.
33 * Alan Cox : Use __u32 properly
34 * Alan Cox : Aligned routing errors more closely with BSD
35 * our system is still very different.
36 * Alan Cox : Faster /proc handling
37 * Alexey Kuznetsov : Massive rework to support tree based routing,
38 * routing caches and better behaviour.
40 * Olaf Erb : irtt wasn't being copied right.
41 * Bjorn Ekwall : Kerneld route support.
42 * Alan Cox : Multicast fixed (I hope)
43 * Pavel Krauz : Limited broadcast fixed
44 * Mike McLagan : Routing by source
45 * Alexey Kuznetsov : End of old history. Split to fib.c and
46 * route.c and rewritten from scratch.
47 * Andi Kleen : Load-limit warning messages.
48 * Vitaly E. Lavrov : Transparent proxy revived after year coma.
49 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
50 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
51 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
52 * Marc Boucher : routing by fwmark
53 * Robert Olsson : Added rt_cache statistics
54 * Arnaldo C. Melo : Convert proc stuff to seq_file
55 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
56 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
57 * Ilia Sotnikov : Removed TOS from hash calculations
59 * This program is free software; you can redistribute it and/or
60 * modify it under the terms of the GNU General Public License
61 * as published by the Free Software Foundation; either version
62 * 2 of the License, or (at your option) any later version.
65 #define pr_fmt(fmt) "IPv4: " fmt
67 #include <linux/module.h>
68 #include <linux/uaccess.h>
69 #include <linux/bitops.h>
70 #include <linux/types.h>
71 #include <linux/kernel.h>
73 #include <linux/bootmem.h>
74 #include <linux/string.h>
75 #include <linux/socket.h>
76 #include <linux/sockios.h>
77 #include <linux/errno.h>
79 #include <linux/inet.h>
80 #include <linux/netdevice.h>
81 #include <linux/proc_fs.h>
82 #include <linux/init.h>
83 #include <linux/skbuff.h>
84 #include <linux/inetdevice.h>
85 #include <linux/igmp.h>
86 #include <linux/pkt_sched.h>
87 #include <linux/mroute.h>
88 #include <linux/netfilter_ipv4.h>
89 #include <linux/random.h>
90 #include <linux/rcupdate.h>
91 #include <linux/times.h>
92 #include <linux/slab.h>
93 #include <linux/jhash.h>
95 #include <net/dst_metadata.h>
96 #include <net/net_namespace.h>
97 #include <net/protocol.h>
99 #include <net/route.h>
100 #include <net/inetpeer.h>
101 #include <net/sock.h>
102 #include <net/ip_fib.h>
105 #include <net/icmp.h>
106 #include <net/xfrm.h>
107 #include <net/lwtunnel.h>
108 #include <net/netevent.h>
109 #include <net/rtnetlink.h>
111 #include <linux/sysctl.h>
113 #include <net/secure_seq.h>
114 #include <net/ip_tunnels.h>
115 #include <net/l3mdev.h>
117 #include "fib_lookup.h"
119 #define RT_FL_TOS(oldflp4) \
120 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
122 #define RT_GC_TIMEOUT (300*HZ)
124 static int ip_rt_max_size;
125 static int ip_rt_redirect_number __read_mostly = 9;
126 static int ip_rt_redirect_load __read_mostly = HZ / 50;
127 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
128 static int ip_rt_error_cost __read_mostly = HZ;
129 static int ip_rt_error_burst __read_mostly = 5 * HZ;
130 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
131 static u32 ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
132 static int ip_rt_min_advmss __read_mostly = 256;
134 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
137 * Interface to generic destination cache.
140 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
141 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
142 static unsigned int ipv4_mtu(const struct dst_entry *dst);
143 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
144 static void ipv4_link_failure(struct sk_buff *skb);
145 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
146 struct sk_buff *skb, u32 mtu,
148 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
149 struct sk_buff *skb);
150 static void ipv4_dst_destroy(struct dst_entry *dst);
152 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
158 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
161 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
163 static struct dst_ops ipv4_dst_ops = {
165 .check = ipv4_dst_check,
166 .default_advmss = ipv4_default_advmss,
168 .cow_metrics = ipv4_cow_metrics,
169 .destroy = ipv4_dst_destroy,
170 .negative_advice = ipv4_negative_advice,
171 .link_failure = ipv4_link_failure,
172 .update_pmtu = ip_rt_update_pmtu,
173 .redirect = ip_do_redirect,
174 .local_out = __ip_local_out,
175 .neigh_lookup = ipv4_neigh_lookup,
176 .confirm_neigh = ipv4_confirm_neigh,
179 #define ECN_OR_COST(class) TC_PRIO_##class
181 const __u8 ip_tos2prio[16] = {
183 ECN_OR_COST(BESTEFFORT),
185 ECN_OR_COST(BESTEFFORT),
191 ECN_OR_COST(INTERACTIVE),
193 ECN_OR_COST(INTERACTIVE),
194 TC_PRIO_INTERACTIVE_BULK,
195 ECN_OR_COST(INTERACTIVE_BULK),
196 TC_PRIO_INTERACTIVE_BULK,
197 ECN_OR_COST(INTERACTIVE_BULK)
199 EXPORT_SYMBOL(ip_tos2prio);
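/* Illustrative sketch (not part of the original file): callers map the
 * IPv4 TOS byte to a queueing priority by indexing ip_tos2prio[] with
 * the four TOS bits, mirroring rt_tos2priority() in include/net/route.h.
 */
static inline char example_tos2priority(u8 tos)
{
	/* IPTOS_TOS() masks off the precedence bits; >> 1 drops the
	 * low bit, leaving a 0..15 index into the table above.
	 */
	return ip_tos2prio[IPTOS_TOS(tos) >> 1];
}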
201 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
202 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
204 #ifdef CONFIG_PROC_FS
205 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
209 return SEQ_START_TOKEN;
212 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
218 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
222 static int rt_cache_seq_show(struct seq_file *seq, void *v)
224 if (v == SEQ_START_TOKEN)
225 seq_printf(seq, "%-127s\n",
226 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
227 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
228 "HHUptod\tSpecDst");
232 static const struct seq_operations rt_cache_seq_ops = {
233 .start = rt_cache_seq_start,
234 .next = rt_cache_seq_next,
235 .stop = rt_cache_seq_stop,
236 .show = rt_cache_seq_show,
239 static int rt_cache_seq_open(struct inode *inode, struct file *file)
241 return seq_open(file, &rt_cache_seq_ops);
244 static const struct file_operations rt_cache_seq_fops = {
245 .open = rt_cache_seq_open,
248 .release = seq_release,
252 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
257 return SEQ_START_TOKEN;
259 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
260 if (!cpu_possible(cpu))
263 return &per_cpu(rt_cache_stat, cpu);
268 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
272 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
273 if (!cpu_possible(cpu))
276 return &per_cpu(rt_cache_stat, cpu);
283 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
288 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
290 struct rt_cache_stat *st = v;
292 if (v == SEQ_START_TOKEN) {
293 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
297 seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
298 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
299 dst_entries_get_slow(&ipv4_dst_ops),
312 0, /* st->gc_total */
313 0, /* st->gc_ignored */
314 0, /* st->gc_goal_miss */
315 0, /* st->gc_dst_overflow */
316 0, /* st->in_hlist_search */
317 0 /* st->out_hlist_search */
322 static const struct seq_operations rt_cpu_seq_ops = {
323 .start = rt_cpu_seq_start,
324 .next = rt_cpu_seq_next,
325 .stop = rt_cpu_seq_stop,
326 .show = rt_cpu_seq_show,
330 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
332 return seq_open(file, &rt_cpu_seq_ops);
335 static const struct file_operations rt_cpu_seq_fops = {
336 .open = rt_cpu_seq_open,
339 .release = seq_release,
342 #ifdef CONFIG_IP_ROUTE_CLASSID
343 static int rt_acct_proc_show(struct seq_file *m, void *v)
345 struct ip_rt_acct *dst, *src;
348 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
352 for_each_possible_cpu(i) {
353 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
354 for (j = 0; j < 256; j++) {
355 dst[j].o_bytes += src[j].o_bytes;
356 dst[j].o_packets += src[j].o_packets;
357 dst[j].i_bytes += src[j].i_bytes;
358 dst[j].i_packets += src[j].i_packets;
362 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
368 static int __net_init ip_rt_do_proc_init(struct net *net)
370 struct proc_dir_entry *pde;
372 pde = proc_create("rt_cache", 0444, net->proc_net,
377 pde = proc_create("rt_cache", 0444,
378 net->proc_net_stat, &rt_cpu_seq_fops);
382 #ifdef CONFIG_IP_ROUTE_CLASSID
383 pde = proc_create_single("rt_acct", 0, net->proc_net,
390 #ifdef CONFIG_IP_ROUTE_CLASSID
392 remove_proc_entry("rt_cache", net->proc_net_stat);
395 remove_proc_entry("rt_cache", net->proc_net);
400 static void __net_exit ip_rt_do_proc_exit(struct net *net)
402 remove_proc_entry("rt_cache", net->proc_net_stat);
403 remove_proc_entry("rt_cache", net->proc_net);
404 #ifdef CONFIG_IP_ROUTE_CLASSID
405 remove_proc_entry("rt_acct", net->proc_net);
409 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
410 .init = ip_rt_do_proc_init,
411 .exit = ip_rt_do_proc_exit,
414 static int __init ip_rt_proc_init(void)
416 return register_pernet_subsys(&ip_rt_proc_ops);
420 static inline int ip_rt_proc_init(void)
424 #endif /* CONFIG_PROC_FS */
426 static inline bool rt_is_expired(const struct rtable *rth)
428 return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
431 void rt_cache_flush(struct net *net)
433 rt_genid_bump_ipv4(net);
436 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
440 struct net_device *dev = dst->dev;
441 const __be32 *pkey = daddr;
442 const struct rtable *rt;
445 rt = (const struct rtable *) dst;
447 pkey = (const __be32 *) &rt->rt_gateway;
449 pkey = &ip_hdr(skb)->daddr;
451 n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
454 return neigh_create(&arp_tbl, pkey, dev);
457 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
459 struct net_device *dev = dst->dev;
460 const __be32 *pkey = daddr;
461 const struct rtable *rt;
463 rt = (const struct rtable *)dst;
465 pkey = (const __be32 *)&rt->rt_gateway;
468 (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
471 __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
474 /* Hash tables of size 2048..262144 depending on RAM size.
475 * Each bucket uses 8 bytes.
477 static u32 ip_idents_mask __read_mostly;
478 static atomic_t *ip_idents __read_mostly;
479 static u32 *ip_tstamps __read_mostly;
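/* Sizing sketch (illustration): each bucket pairs a 4-byte atomic_t id
 * with a 4-byte u32 timestamp, i.e. the 8 bytes noted above, so the
 * table costs 16 KB at 2048 buckets and 2 MB at 262144.  The table
 * size is a power of two and ip_idents_mask is size - 1, so a bucket
 * is chosen as "hash & ip_idents_mask" in ip_idents_reserve() below.
 */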
481 /* In order to protect privacy, we add a perturbation to identifiers
482 * if one generator is seldom used. This makes it hard for an attacker
483 * to infer how many packets were sent between two points in time.
485 u32 ip_idents_reserve(u32 hash, int segs)
487 u32 bucket, old, now = (u32)jiffies;
492 bucket = hash & ip_idents_mask;
493 p_tstamp = ip_tstamps + bucket;
494 p_id = ip_idents + bucket;
495 old = READ_ONCE(*p_tstamp);
497 if (old != now && cmpxchg(p_tstamp, old, now) == old)
498 delta = prandom_u32_max(now - old);
500 /* If UBSAN reports an error there, please make sure your compiler
501 * supports -fno-strict-overflow before reporting it; that was a bug
502 * in UBSAN, and it has been fixed in GCC 8.
504 return atomic_add_return(segs + delta, p_id) - segs;
506 EXPORT_SYMBOL(ip_idents_reserve);
508 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
512 /* Note the following code is racy, but the race is harmless (at worst the key is seeded more than once). */
513 if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
514 get_random_bytes(&net->ipv4.ip_id_key,
515 sizeof(net->ipv4.ip_id_key));
517 hash = siphash_3u32((__force u32)iph->daddr,
518 (__force u32)iph->saddr,
520 &net->ipv4.ip_id_key);
521 id = ip_idents_reserve(hash, segs);
524 EXPORT_SYMBOL(__ip_select_ident);
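/* Usage sketch (hypothetical caller, for illustration): reserving IDs
 * for a packet that GSO will split into three on-the-wire segments, so
 * that the segments get consecutive identifiers:
 *
 *	__ip_select_ident(net, ip_hdr(skb), 3);
 */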
526 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
527 const struct sock *sk,
528 const struct iphdr *iph,
530 u8 prot, u32 mark, int flow_flags)
533 const struct inet_sock *inet = inet_sk(sk);
535 oif = sk->sk_bound_dev_if;
537 tos = RT_CONN_FLAGS(sk);
538 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
540 flowi4_init_output(fl4, oif, mark, tos,
541 RT_SCOPE_UNIVERSE, prot,
543 iph->daddr, iph->saddr, 0, 0,
544 sock_net_uid(net, sk));
547 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
548 const struct sock *sk)
550 const struct net *net = dev_net(skb->dev);
551 const struct iphdr *iph = ip_hdr(skb);
552 int oif = skb->dev->ifindex;
553 u8 tos = RT_TOS(iph->tos);
554 u8 prot = iph->protocol;
555 u32 mark = skb->mark;
557 __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
560 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
562 const struct inet_sock *inet = inet_sk(sk);
563 const struct ip_options_rcu *inet_opt;
564 __be32 daddr = inet->inet_daddr;
567 inet_opt = rcu_dereference(inet->inet_opt);
568 if (inet_opt && inet_opt->opt.srr)
569 daddr = inet_opt->opt.faddr;
570 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
571 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
572 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
573 inet_sk_flowi_flags(sk),
574 daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
578 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
579 const struct sk_buff *skb)
582 build_skb_flow_key(fl4, skb, sk);
584 build_sk_flow_key(fl4, sk);
587 static DEFINE_SPINLOCK(fnhe_lock);
589 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
593 rt = rcu_dereference(fnhe->fnhe_rth_input);
595 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
596 dst_dev_put(&rt->dst);
597 dst_release(&rt->dst);
599 rt = rcu_dereference(fnhe->fnhe_rth_output);
601 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
602 dst_dev_put(&rt->dst);
603 dst_release(&rt->dst);
607 static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
609 struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
610 struct fib_nh_exception *fnhe, *oldest = NULL;
612 for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
613 fnhe = rcu_dereference_protected(*fnhe_p,
614 lockdep_is_held(&fnhe_lock));
618 time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
623 fnhe_flush_routes(oldest);
624 *oldest_p = oldest->fnhe_next;
625 kfree_rcu(oldest, rcu);
628 static u32 fnhe_hashfun(__be32 daddr)
630 static siphash_key_t fnhe_hash_key __read_mostly;
633 net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
634 hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
635 return hash_64(hval, FNHE_HASH_SHIFT);
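/* Sketch of the intent here (an assumption, not stated above): keying
 * the hash with a boot-time random siphash key means a remote host
 * cannot choose daddr values that all collide into one exception
 * chain.  update_or_create_fnhe() below uses this as
 * "hval = fnhe_hashfun(daddr)" to pick a bucket.
 */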
638 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
640 rt->rt_pmtu = fnhe->fnhe_pmtu;
641 rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
642 rt->dst.expires = fnhe->fnhe_expires;
645 rt->rt_flags |= RTCF_REDIRECTED;
646 rt->rt_gateway = fnhe->fnhe_gw;
647 rt->rt_uses_gateway = 1;
651 static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
652 u32 pmtu, bool lock, unsigned long expires)
654 struct fnhe_hash_bucket *hash;
655 struct fib_nh_exception *fnhe;
661 genid = fnhe_genid(dev_net(nh->nh_dev));
662 hval = fnhe_hashfun(daddr);
664 spin_lock_bh(&fnhe_lock);
666 hash = rcu_dereference(nh->nh_exceptions);
668 hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
671 rcu_assign_pointer(nh->nh_exceptions, hash);
677 for (fnhe = rcu_dereference(hash->chain); fnhe;
678 fnhe = rcu_dereference(fnhe->fnhe_next)) {
679 if (fnhe->fnhe_daddr == daddr)
685 if (fnhe->fnhe_genid != genid)
686 fnhe->fnhe_genid = genid;
690 fnhe->fnhe_pmtu = pmtu;
691 fnhe->fnhe_mtu_locked = lock;
693 fnhe->fnhe_expires = max(1UL, expires);
694 /* Update all cached dsts too */
695 rt = rcu_dereference(fnhe->fnhe_rth_input);
697 fill_route_from_fnhe(rt, fnhe);
698 rt = rcu_dereference(fnhe->fnhe_rth_output);
700 fill_route_from_fnhe(rt, fnhe);
702 /* Randomize max depth to avoid some side-channel attacks. */
703 int max_depth = FNHE_RECLAIM_DEPTH +
704 prandom_u32_max(FNHE_RECLAIM_DEPTH);
706 while (depth > max_depth) {
707 fnhe_remove_oldest(hash);
711 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
715 fnhe->fnhe_next = hash->chain;
717 fnhe->fnhe_genid = genid;
718 fnhe->fnhe_daddr = daddr;
720 fnhe->fnhe_pmtu = pmtu;
721 fnhe->fnhe_mtu_locked = lock;
722 fnhe->fnhe_expires = max(1UL, expires);
724 rcu_assign_pointer(hash->chain, fnhe);
726 /* Exception created; mark the cached routes for the nexthop
727 * stale, so anyone caching it rechecks if this exception applies.
730 rt = rcu_dereference(nh->nh_rth_input);
732 rt->dst.obsolete = DST_OBSOLETE_KILL;
734 for_each_possible_cpu(i) {
735 struct rtable __rcu **prt;
736 prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
737 rt = rcu_dereference(*prt);
739 rt->dst.obsolete = DST_OBSOLETE_KILL;
743 fnhe->fnhe_stamp = jiffies;
746 spin_unlock_bh(&fnhe_lock);
749 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
752 __be32 new_gw = icmp_hdr(skb)->un.gateway;
753 __be32 old_gw = ip_hdr(skb)->saddr;
754 struct net_device *dev = skb->dev;
755 struct in_device *in_dev;
756 struct fib_result res;
760 switch (icmp_hdr(skb)->code & 7) {
762 case ICMP_REDIR_NETTOS:
763 case ICMP_REDIR_HOST:
764 case ICMP_REDIR_HOSTTOS:
771 if (rt->rt_gateway != old_gw)
774 in_dev = __in_dev_get_rcu(dev);
779 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
780 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
781 ipv4_is_zeronet(new_gw))
782 goto reject_redirect;
784 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
785 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
786 goto reject_redirect;
787 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
788 goto reject_redirect;
790 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
791 goto reject_redirect;
794 n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
796 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
798 if (!(n->nud_state & NUD_VALID)) {
799 neigh_event_send(n, NULL);
801 if (fib_lookup(net, fl4, &res, 0) == 0) {
804 fib_select_path(net, &res, fl4, skb);
805 nh = &FIB_RES_NH(res);
806 update_or_create_fnhe(nh, fl4->daddr, new_gw,
808 jiffies + ip_rt_gc_timeout);
811 rt->dst.obsolete = DST_OBSOLETE_KILL;
812 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
819 #ifdef CONFIG_IP_ROUTE_VERBOSE
820 if (IN_DEV_LOG_MARTIANS(in_dev)) {
821 const struct iphdr *iph = (const struct iphdr *) skb->data;
822 __be32 daddr = iph->daddr;
823 __be32 saddr = iph->saddr;
825 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
826 " Advised path = %pI4 -> %pI4\n",
827 &old_gw, dev->name, &new_gw,
834 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
838 const struct iphdr *iph = (const struct iphdr *) skb->data;
839 struct net *net = dev_net(skb->dev);
840 int oif = skb->dev->ifindex;
841 u8 tos = RT_TOS(iph->tos);
842 u8 prot = iph->protocol;
843 u32 mark = skb->mark;
845 rt = (struct rtable *) dst;
847 __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
848 __ip_do_redirect(rt, skb, &fl4, true);
851 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
853 struct rtable *rt = (struct rtable *)dst;
854 struct dst_entry *ret = dst;
857 if (dst->obsolete > 0) {
860 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
871 * 1. The first ip_rt_redirect_number redirects are sent
872 * with exponential backoff, then we stop sending them altogether,
873 * assuming that the host ignores our redirects.
874 * 2. If we did not see packets requiring redirects
875 * during ip_rt_redirect_silence, we assume that the host
876 * forgot the redirected route and start sending redirects again.
878 * This algorithm is much cheaper and more intelligent than dumb load limiting
879 * in icmp.c.
881 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
882 * and "frag. need" (breaks PMTU discovery) in icmp.c.
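/* Worked example with the defaults above (a sketch, not normative):
 * with ip_rt_redirect_load = HZ/50 (20 ms at HZ=1000) and
 * ip_rt_redirect_number = 9, successive redirects are spaced
 * ip_rt_redirect_load << n jiffies apart, n being the count already
 * sent: 40 ms, 80 ms, ... doubling each time; after 9 unheeded
 * redirects we stop.  ip_rt_redirect_silence = (HZ/50) << 10 (~20.5 s)
 * of quiet re-arms the algorithm (see ip_rt_send_redirect() below).
 */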
885 void ip_rt_send_redirect(struct sk_buff *skb)
887 struct rtable *rt = skb_rtable(skb);
888 struct in_device *in_dev;
889 struct inet_peer *peer;
895 in_dev = __in_dev_get_rcu(rt->dst.dev);
896 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
900 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
901 vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
904 net = dev_net(rt->dst.dev);
905 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
907 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
908 rt_nexthop(rt, ip_hdr(skb)->daddr));
912 /* No redirected packets during ip_rt_redirect_silence;
913 * reset the algorithm.
915 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
916 peer->rate_tokens = 0;
917 peer->n_redirects = 0;
920 /* Too many ignored redirects; do not send anything and
921 * set dst.rate_last to the last seen redirected packet.
923 if (peer->n_redirects >= ip_rt_redirect_number) {
924 peer->rate_last = jiffies;
928 /* Check for load limit; set rate_last to the latest sent redirect. */
931 if (peer->n_redirects == 0 ||
934 (ip_rt_redirect_load << peer->n_redirects)))) {
935 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
937 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
938 peer->rate_last = jiffies;
940 #ifdef CONFIG_IP_ROUTE_VERBOSE
942 peer->n_redirects == ip_rt_redirect_number)
943 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
944 &ip_hdr(skb)->saddr, inet_iif(skb),
945 &ip_hdr(skb)->daddr, &gw);
952 static int ip_error(struct sk_buff *skb)
954 struct rtable *rt = skb_rtable(skb);
955 struct net_device *dev = skb->dev;
956 struct in_device *in_dev;
957 struct inet_peer *peer;
963 if (netif_is_l3_master(skb->dev)) {
964 dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
969 in_dev = __in_dev_get_rcu(dev);
971 /* IP on this device is disabled. */
975 net = dev_net(rt->dst.dev);
976 if (!IN_DEV_FORWARD(in_dev)) {
977 switch (rt->dst.error) {
979 __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
983 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
989 switch (rt->dst.error) {
994 code = ICMP_HOST_UNREACH;
997 code = ICMP_NET_UNREACH;
998 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
1001 code = ICMP_PKT_FILTERED;
1005 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
1006 l3mdev_master_ifindex(skb->dev), 1);
1011 peer->rate_tokens += now - peer->rate_last;
1012 if (peer->rate_tokens > ip_rt_error_burst)
1013 peer->rate_tokens = ip_rt_error_burst;
1014 peer->rate_last = now;
1015 if (peer->rate_tokens >= ip_rt_error_cost)
1016 peer->rate_tokens -= ip_rt_error_cost;
1022 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1024 out: kfree_skb(skb);
1028 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1030 struct dst_entry *dst = &rt->dst;
1031 struct net *net = dev_net(dst->dev);
1032 u32 old_mtu = ipv4_mtu(dst);
1033 struct fib_result res;
1036 if (ip_mtu_locked(dst))
1042 if (mtu < ip_rt_min_pmtu) {
1044 mtu = min(old_mtu, ip_rt_min_pmtu);
1047 if (rt->rt_pmtu == mtu && !lock &&
1048 time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1052 if (fib_lookup(net, fl4, &res, 0) == 0) {
1055 fib_select_path(net, &res, fl4, NULL);
1056 nh = &FIB_RES_NH(res);
1057 update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
1058 jiffies + ip_rt_mtu_expires);
1063 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1064 struct sk_buff *skb, u32 mtu,
1067 struct rtable *rt = (struct rtable *) dst;
1070 ip_rt_build_flow_key(&fl4, sk, skb);
1071 __ip_rt_update_pmtu(rt, &fl4, mtu);
1074 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1075 int oif, u32 mark, u8 protocol, int flow_flags)
1077 const struct iphdr *iph = (const struct iphdr *) skb->data;
1082 mark = IP4_REPLY_MARK(net, skb->mark);
1084 __build_flow_key(net, &fl4, NULL, iph, oif,
1085 RT_TOS(iph->tos), protocol, mark, flow_flags);
1086 rt = __ip_route_output_key(net, &fl4);
1088 __ip_rt_update_pmtu(rt, &fl4, mtu);
1092 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1094 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1096 const struct iphdr *iph = (const struct iphdr *) skb->data;
1100 __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1102 if (!fl4.flowi4_mark)
1103 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1105 rt = __ip_route_output_key(sock_net(sk), &fl4);
1107 __ip_rt_update_pmtu(rt, &fl4, mtu);
1112 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1114 const struct iphdr *iph = (const struct iphdr *) skb->data;
1117 struct dst_entry *odst = NULL;
1119 struct net *net = sock_net(sk);
1123 if (!ip_sk_accept_pmtu(sk))
1126 odst = sk_dst_get(sk);
1128 if (sock_owned_by_user(sk) || !odst) {
1129 __ipv4_sk_update_pmtu(skb, sk, mtu);
1133 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1135 rt = (struct rtable *)odst;
1136 if (odst->obsolete && !odst->ops->check(odst, 0)) {
1137 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1144 __ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);
1146 if (!dst_check(&rt->dst, 0)) {
1148 dst_release(&rt->dst);
1150 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1158 sk_dst_set(sk, &rt->dst);
1164 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1166 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1167 int oif, u32 mark, u8 protocol, int flow_flags)
1169 const struct iphdr *iph = (const struct iphdr *) skb->data;
1173 __build_flow_key(net, &fl4, NULL, iph, oif,
1174 RT_TOS(iph->tos), protocol, mark, flow_flags);
1175 rt = __ip_route_output_key(net, &fl4);
1177 __ip_do_redirect(rt, skb, &fl4, false);
1181 EXPORT_SYMBOL_GPL(ipv4_redirect);
1183 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1185 const struct iphdr *iph = (const struct iphdr *) skb->data;
1188 struct net *net = sock_net(sk);
1190 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1191 rt = __ip_route_output_key(net, &fl4);
1193 __ip_do_redirect(rt, skb, &fl4, false);
1197 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1199 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1201 struct rtable *rt = (struct rtable *) dst;
1203 /* All IPV4 dsts are created with ->obsolete set to the value
1204 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1205 * into this function always.
1207 * When a PMTU/redirect information update invalidates a route,
1208 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1209 * DST_OBSOLETE_DEAD by dst_free().
1211 if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1216 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1218 struct ip_options opt;
1221 /* Recompile ip options since IPCB may not be valid anymore.
1222 * Also check we have a reasonable ipv4 header.
1224 if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1225 ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1228 memset(&opt, 0, sizeof(opt));
1229 if (ip_hdr(skb)->ihl > 5) {
1230 if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1232 opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1235 res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
1241 __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1244 static void ipv4_link_failure(struct sk_buff *skb)
1248 ipv4_send_dest_unreach(skb);
1250 rt = skb_rtable(skb);
1252 dst_set_expires(&rt->dst, 0);
1255 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1257 pr_debug("%s: %pI4 -> %pI4, %s\n",
1258 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1259 skb->dev ? skb->dev->name : "?");
1266 We do not cache the source address of the outgoing interface,
1267 because it is used only by IP RR, TS and SRR options,
1268 so that it is out of the fast path.
1270 BTW remember: "addr" is allowed to be unaligned.
1274 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1278 if (rt_is_output_route(rt))
1279 src = ip_hdr(skb)->saddr;
1281 struct fib_result res;
1287 memset(&fl4, 0, sizeof(fl4));
1288 fl4.daddr = iph->daddr;
1289 fl4.saddr = iph->saddr;
1290 fl4.flowi4_tos = RT_TOS(iph->tos);
1291 fl4.flowi4_oif = rt->dst.dev->ifindex;
1292 fl4.flowi4_iif = skb->dev->ifindex;
1293 fl4.flowi4_mark = skb->mark;
1296 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1297 src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
1299 src = inet_select_addr(rt->dst.dev,
1300 rt_nexthop(rt, iph->daddr),
1304 memcpy(addr, &src, 4);
1307 #ifdef CONFIG_IP_ROUTE_CLASSID
1308 static void set_class_tag(struct rtable *rt, u32 tag)
1310 if (!(rt->dst.tclassid & 0xFFFF))
1311 rt->dst.tclassid |= tag & 0xFFFF;
1312 if (!(rt->dst.tclassid & 0xFFFF0000))
1313 rt->dst.tclassid |= tag & 0xFFFF0000;
1317 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1319 unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1320 unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1323 return min(advmss, IPV4_MAX_PMTU - header_size);
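/* Worked example (illustration only): for a plain route over a
 * 1500-byte-MTU device with no metric override, header_size is
 * 40 (20 IP + 20 TCP), so the advertised MSS is 1500 - 40 = 1460,
 * clamped to at least ip_rt_min_advmss (256) and at most
 * IPV4_MAX_PMTU - 40.
 */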
1326 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1328 const struct rtable *rt = (const struct rtable *) dst;
1329 unsigned int mtu = rt->rt_pmtu;
1331 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1332 mtu = dst_metric_raw(dst, RTAX_MTU);
1337 mtu = READ_ONCE(dst->dev->mtu);
1339 if (unlikely(ip_mtu_locked(dst))) {
1340 if (rt->rt_uses_gateway && mtu > 576)
1345 mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1347 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1350 static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
1352 struct fnhe_hash_bucket *hash;
1353 struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1354 u32 hval = fnhe_hashfun(daddr);
1356 spin_lock_bh(&fnhe_lock);
1358 hash = rcu_dereference_protected(nh->nh_exceptions,
1359 lockdep_is_held(&fnhe_lock));
1362 fnhe_p = &hash->chain;
1363 fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1365 if (fnhe->fnhe_daddr == daddr) {
1366 rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1367 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1368 /* set fnhe_daddr to 0 to ensure it won't bind with
1369 * new dsts in rt_bind_exception().
1371 fnhe->fnhe_daddr = 0;
1372 fnhe_flush_routes(fnhe);
1373 kfree_rcu(fnhe, rcu);
1376 fnhe_p = &fnhe->fnhe_next;
1377 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1378 lockdep_is_held(&fnhe_lock));
1381 spin_unlock_bh(&fnhe_lock);
1384 static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
1386 struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
1387 struct fib_nh_exception *fnhe;
1393 hval = fnhe_hashfun(daddr);
1395 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1396 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1397 if (fnhe->fnhe_daddr == daddr) {
1398 if (fnhe->fnhe_expires &&
1399 time_after(jiffies, fnhe->fnhe_expires)) {
1400 ip_del_fnhe(nh, daddr);
1410 * 1. mtu on route is locked - use it
1411 * 2. mtu from nexthop exception
1412 * 3. mtu from egress device
1415 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1417 struct fib_info *fi = res->fi;
1418 struct fib_nh *nh = &fi->fib_nh[res->nh_sel];
1419 struct net_device *dev = nh->nh_dev;
1422 if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
1423 fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1427 struct fib_nh_exception *fnhe;
1429 fnhe = find_exception(nh, daddr);
1430 if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1431 mtu = fnhe->fnhe_pmtu;
1435 mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1437 return mtu - lwtunnel_headroom(nh->nh_lwtstate, mtu);
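/* Usage sketch (hypothetical caller, for illustration): after a
 * successful lookup, the effective MTU towards a destination honours
 * the lock/exception/device precedence documented above:
 *
 *	struct fib_result res;
 *
 *	if (fib_lookup(net, &fl4, &res, 0) == 0)
 *		mtu = ip_mtu_from_fib_result(&res, fl4.daddr);
 */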
1440 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1441 __be32 daddr, const bool do_cache)
1445 spin_lock_bh(&fnhe_lock);
1447 if (daddr == fnhe->fnhe_daddr) {
1448 struct rtable __rcu **porig;
1449 struct rtable *orig;
1450 int genid = fnhe_genid(dev_net(rt->dst.dev));
1452 if (rt_is_input_route(rt))
1453 porig = &fnhe->fnhe_rth_input;
1455 porig = &fnhe->fnhe_rth_output;
1456 orig = rcu_dereference(*porig);
1458 if (fnhe->fnhe_genid != genid) {
1459 fnhe->fnhe_genid = genid;
1461 fnhe->fnhe_pmtu = 0;
1462 fnhe->fnhe_expires = 0;
1463 fnhe->fnhe_mtu_locked = false;
1464 fnhe_flush_routes(fnhe);
1467 fill_route_from_fnhe(rt, fnhe);
1468 if (!rt->rt_gateway)
1469 rt->rt_gateway = daddr;
1473 rcu_assign_pointer(*porig, rt);
1475 dst_dev_put(&orig->dst);
1476 dst_release(&orig->dst);
1481 fnhe->fnhe_stamp = jiffies;
1483 spin_unlock_bh(&fnhe_lock);
1488 static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
1490 struct rtable *orig, *prev, **p;
1493 if (rt_is_input_route(rt)) {
1494 p = (struct rtable **)&nh->nh_rth_input;
1496 p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
1500 /* hold dst before doing cmpxchg() to avoid a race condition on this dst */
1504 prev = cmpxchg(p, orig, rt);
1507 rt_add_uncached_list(orig);
1508 dst_release(&orig->dst);
1511 dst_release(&rt->dst);
1518 struct uncached_list {
1520 struct list_head head;
1523 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1525 void rt_add_uncached_list(struct rtable *rt)
1527 struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1529 rt->rt_uncached_list = ul;
1531 spin_lock_bh(&ul->lock);
1532 list_add_tail(&rt->rt_uncached, &ul->head);
1533 spin_unlock_bh(&ul->lock);
1536 void rt_del_uncached_list(struct rtable *rt)
1538 if (!list_empty(&rt->rt_uncached)) {
1539 struct uncached_list *ul = rt->rt_uncached_list;
1541 spin_lock_bh(&ul->lock);
1542 list_del(&rt->rt_uncached);
1543 spin_unlock_bh(&ul->lock);
1547 static void ipv4_dst_destroy(struct dst_entry *dst)
1549 struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
1550 struct rtable *rt = (struct rtable *)dst;
1552 if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
1555 rt_del_uncached_list(rt);
1558 void rt_flush_dev(struct net_device *dev)
1560 struct net *net = dev_net(dev);
1564 for_each_possible_cpu(cpu) {
1565 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1567 spin_lock_bh(&ul->lock);
1568 list_for_each_entry(rt, &ul->head, rt_uncached) {
1569 if (rt->dst.dev != dev)
1571 rt->dst.dev = net->loopback_dev;
1572 dev_hold(rt->dst.dev);
1575 spin_unlock_bh(&ul->lock);
1579 static bool rt_cache_valid(const struct rtable *rt)
1582 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1586 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1587 const struct fib_result *res,
1588 struct fib_nh_exception *fnhe,
1589 struct fib_info *fi, u16 type, u32 itag,
1590 const bool do_cache)
1592 bool cached = false;
1595 struct fib_nh *nh = &FIB_RES_NH(*res);
1597 if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
1598 rt->rt_gateway = nh->nh_gw;
1599 rt->rt_uses_gateway = 1;
1601 dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
1602 if (fi->fib_metrics != &dst_default_metrics) {
1603 rt->dst._metrics |= DST_METRICS_REFCOUNTED;
1604 refcount_inc(&fi->fib_metrics->refcnt);
1606 #ifdef CONFIG_IP_ROUTE_CLASSID
1607 rt->dst.tclassid = nh->nh_tclassid;
1609 rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
1611 cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1613 cached = rt_cache_route(nh, rt);
1614 if (unlikely(!cached)) {
1615 /* Routes we intend to cache in nexthop exception or
1616 * FIB nexthop have the DST_NOCACHE bit clear.
1617 * However, if we are unsuccessful at storing this
1618 * route into the cache we really need to set it.
1620 if (!rt->rt_gateway)
1621 rt->rt_gateway = daddr;
1622 rt_add_uncached_list(rt);
1625 rt_add_uncached_list(rt);
1627 #ifdef CONFIG_IP_ROUTE_CLASSID
1628 #ifdef CONFIG_IP_MULTIPLE_TABLES
1629 set_class_tag(rt, res->tclassid);
1631 set_class_tag(rt, itag);
1635 struct rtable *rt_dst_alloc(struct net_device *dev,
1636 unsigned int flags, u16 type,
1637 bool nopolicy, bool noxfrm, bool will_cache)
1641 rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1642 (will_cache ? 0 : DST_HOST) |
1643 (nopolicy ? DST_NOPOLICY : 0) |
1644 (noxfrm ? DST_NOXFRM : 0));
1647 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1648 rt->rt_flags = flags;
1650 rt->rt_is_input = 0;
1653 rt->rt_mtu_locked = 0;
1655 rt->rt_uses_gateway = 0;
1656 INIT_LIST_HEAD(&rt->rt_uncached);
1658 rt->dst.output = ip_output;
1659 if (flags & RTCF_LOCAL)
1660 rt->dst.input = ip_local_deliver;
1665 EXPORT_SYMBOL(rt_dst_alloc);
1667 /* called in rcu_read_lock() section */
1668 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1669 u8 tos, struct net_device *dev,
1670 struct in_device *in_dev, u32 *itag)
1674 /* Primary sanity checks. */
1678 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1679 skb->protocol != htons(ETH_P_IP))
1682 if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1685 if (ipv4_is_zeronet(saddr)) {
1686 if (!ipv4_is_local_multicast(daddr))
1689 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1697 /* called in rcu_read_lock() section */
1698 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1699 u8 tos, struct net_device *dev, int our)
1701 struct in_device *in_dev = __in_dev_get_rcu(dev);
1702 unsigned int flags = RTCF_MULTICAST;
1707 err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1712 flags |= RTCF_LOCAL;
1714 rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1715 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1719 #ifdef CONFIG_IP_ROUTE_CLASSID
1720 rth->dst.tclassid = itag;
1722 rth->dst.output = ip_rt_bug;
1723 rth->rt_is_input = 1;
1725 #ifdef CONFIG_IP_MROUTE
1726 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1727 rth->dst.input = ip_mr_input;
1729 RT_CACHE_STAT_INC(in_slow_mc);
1732 skb_dst_set(skb, &rth->dst);
1737 static void ip_handle_martian_source(struct net_device *dev,
1738 struct in_device *in_dev,
1739 struct sk_buff *skb,
1743 RT_CACHE_STAT_INC(in_martian_src);
1744 #ifdef CONFIG_IP_ROUTE_VERBOSE
1745 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
1747 * Per the RFC1812 recommendation: if the source is martian,
1748 * the only hint we can log is the MAC header.
1750 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1751 &daddr, &saddr, dev->name);
1752 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1753 print_hex_dump(KERN_WARNING, "ll header: ",
1754 DUMP_PREFIX_OFFSET, 16, 1,
1755 skb_mac_header(skb),
1756 dev->hard_header_len, true);
1762 /* called in rcu_read_lock() section */
1763 static int __mkroute_input(struct sk_buff *skb,
1764 const struct fib_result *res,
1765 struct in_device *in_dev,
1766 __be32 daddr, __be32 saddr, u32 tos)
1768 struct fib_nh_exception *fnhe;
1771 struct in_device *out_dev;
1775 /* get a working reference to the output device */
1776 out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
1778 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1782 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1783 in_dev->dev, in_dev, &itag);
1785 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1791 do_cache = res->fi && !itag;
1792 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1793 skb->protocol == htons(ETH_P_IP) &&
1794 (IN_DEV_SHARED_MEDIA(out_dev) ||
1795 inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
1796 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1798 if (skb->protocol != htons(ETH_P_IP)) {
1799 /* Not IP (i.e. ARP). Do not create a route if it is
1800 * invalid for proxy arp. DNAT routes are always valid.
1802 * The proxy arp feature has been extended to allow ARP
1803 * replies back on the same interface, to support
1804 * Private VLAN switch technologies. See arp.c.
1806 if (out_dev == in_dev &&
1807 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1813 fnhe = find_exception(&FIB_RES_NH(*res), daddr);
1816 rth = rcu_dereference(fnhe->fnhe_rth_input);
1818 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
1819 if (rt_cache_valid(rth)) {
1820 skb_dst_set_noref(skb, &rth->dst);
1825 rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1826 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1827 IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1833 rth->rt_is_input = 1;
1834 RT_CACHE_STAT_INC(in_slow_tot);
1836 rth->dst.input = ip_forward;
1838 rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1840 lwtunnel_set_redirect(&rth->dst);
1841 skb_dst_set(skb, &rth->dst);
1848 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1849 /* To make ICMP packets follow the right flow, the multipath hash is
1850 * calculated from the inner IP addresses.
1852 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1853 struct flow_keys *hash_keys)
1855 const struct iphdr *outer_iph = ip_hdr(skb);
1856 const struct iphdr *key_iph = outer_iph;
1857 const struct iphdr *inner_iph;
1858 const struct icmphdr *icmph;
1859 struct iphdr _inner_iph;
1860 struct icmphdr _icmph;
1862 if (likely(outer_iph->protocol != IPPROTO_ICMP))
1865 if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1868 icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1873 if (icmph->type != ICMP_DEST_UNREACH &&
1874 icmph->type != ICMP_REDIRECT &&
1875 icmph->type != ICMP_TIME_EXCEEDED &&
1876 icmph->type != ICMP_PARAMETERPROB)
1879 inner_iph = skb_header_pointer(skb,
1880 outer_iph->ihl * 4 + sizeof(_icmph),
1881 sizeof(_inner_iph), &_inner_iph);
1885 key_iph = inner_iph;
1887 hash_keys->addrs.v4addrs.src = key_iph->saddr;
1888 hash_keys->addrs.v4addrs.dst = key_iph->daddr;
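/* Rationale sketch (an assumption drawn from the comment above): an
 * ICMP error quotes the packet that triggered it, so the inner header
 * holds the original flow's saddr/daddr.  Hashing those instead of the
 * outer ICMP addresses makes the error hash onto the same nexthop as
 * the flow it describes, so e.g. a "fragmentation needed" error
 * traverses the same path as the TCP segments it concerns.
 */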
1891 /* if skb is set it will be used and fl4 can be NULL */
1892 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1893 const struct sk_buff *skb, struct flow_keys *flkeys)
1895 struct flow_keys hash_keys;
1898 switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1900 memset(&hash_keys, 0, sizeof(hash_keys));
1901 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1903 ip_multipath_l3_keys(skb, &hash_keys);
1905 hash_keys.addrs.v4addrs.src = fl4->saddr;
1906 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1910 /* skb is currently provided only when forwarding */
1912 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1913 struct flow_keys keys;
1915 /* short-circuit if we already have L4 hash present */
1917 return skb_get_hash_raw(skb) >> 1;
1919 memset(&hash_keys, 0, sizeof(hash_keys));
1922 skb_flow_dissect_flow_keys(skb, &keys, flag);
1926 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1927 hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1928 hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1929 hash_keys.ports.src = flkeys->ports.src;
1930 hash_keys.ports.dst = flkeys->ports.dst;
1931 hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1933 memset(&hash_keys, 0, sizeof(hash_keys));
1934 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1935 hash_keys.addrs.v4addrs.src = fl4->saddr;
1936 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1937 hash_keys.ports.src = fl4->fl4_sport;
1938 hash_keys.ports.dst = fl4->fl4_dport;
1939 hash_keys.basic.ip_proto = fl4->flowi4_proto;
1943 mhash = flow_hash_from_keys(&hash_keys);
1947 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
1949 static int ip_mkroute_input(struct sk_buff *skb,
1950 struct fib_result *res,
1951 struct in_device *in_dev,
1952 __be32 daddr, __be32 saddr, u32 tos,
1953 struct flow_keys *hkeys)
1955 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1956 if (res->fi && res->fi->fib_nhs > 1) {
1957 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
1959 fib_select_multipath(res, h);
1963 /* create a routing cache entry */
1964 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
1968 * NOTE. We drop all packets that have local source
1969 * addresses, because every properly looped-back packet
1970 * must have the correct destination already attached by the output routine.
1972 * Such an approach solves two big problems:
1973 * 1. Non-simplex devices are handled properly.
1974 * 2. IP spoofing attempts are filtered with a 100% guarantee.
1975 * called with rcu_read_lock()
1978 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1979 u8 tos, struct net_device *dev,
1980 struct fib_result *res)
1982 struct in_device *in_dev = __in_dev_get_rcu(dev);
1983 struct flow_keys *flkeys = NULL, _flkeys;
1984 struct net *net = dev_net(dev);
1985 struct ip_tunnel_info *tun_info;
1987 unsigned int flags = 0;
1991 bool do_cache = true;
1993 /* IP on this device is disabled. */
1998 /* Check for the most weird martians, which may not be detected by fib_lookup. */
2002 tun_info = skb_tunnel_info(skb);
2003 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2004 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2006 fl4.flowi4_tun_key.tun_id = 0;
2009 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2010 goto martian_source;
2014 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2017 /* Accept zero addresses only to limited broadcast;
2018 * I do not even know whether to fix it or not. Waiting for complaints :-)
2020 if (ipv4_is_zeronet(saddr))
2021 goto martian_source;
2023 if (ipv4_is_zeronet(daddr))
2024 goto martian_destination;
2026 /* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
2027 * calling it at most once when daddr and/or saddr are loopback addresses.
2029 if (ipv4_is_loopback(daddr)) {
2030 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2031 goto martian_destination;
2032 } else if (ipv4_is_loopback(saddr)) {
2033 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2034 goto martian_source;
2038 * Now we are ready to route the packet.
2041 fl4.flowi4_iif = dev->ifindex;
2042 fl4.flowi4_mark = skb->mark;
2043 fl4.flowi4_tos = tos;
2044 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2045 fl4.flowi4_flags = 0;
2048 fl4.flowi4_uid = sock_net_uid(net, NULL);
2050 if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2053 fl4.flowi4_proto = 0;
2058 err = fib_lookup(net, &fl4, res, 0);
2060 if (!IN_DEV_FORWARD(in_dev))
2061 err = -EHOSTUNREACH;
2065 if (res->type == RTN_BROADCAST) {
2066 if (IN_DEV_BFORWARD(in_dev))
2068 /* do not cache if bc_forwarding is enabled */
2069 if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2074 if (res->type == RTN_LOCAL) {
2075 err = fib_validate_source(skb, saddr, daddr, tos,
2076 0, dev, in_dev, &itag);
2078 goto martian_source;
2082 if (!IN_DEV_FORWARD(in_dev)) {
2083 err = -EHOSTUNREACH;
2086 if (res->type != RTN_UNICAST)
2087 goto martian_destination;
2090 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2094 if (skb->protocol != htons(ETH_P_IP))
2097 if (!ipv4_is_zeronet(saddr)) {
2098 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2101 goto martian_source;
2103 flags |= RTCF_BROADCAST;
2104 res->type = RTN_BROADCAST;
2105 RT_CACHE_STAT_INC(in_brd);
2108 do_cache &= res->fi && !itag;
2110 rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
2111 if (rt_cache_valid(rth)) {
2112 skb_dst_set_noref(skb, &rth->dst);
2118 rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2119 flags | RTCF_LOCAL, res->type,
2120 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
2124 rth->dst.output = ip_rt_bug;
2125 #ifdef CONFIG_IP_ROUTE_CLASSID
2126 rth->dst.tclassid = itag;
2128 rth->rt_is_input = 1;
2130 RT_CACHE_STAT_INC(in_slow_tot);
2131 if (res->type == RTN_UNREACHABLE) {
2132 rth->dst.input = ip_error;
2133 rth->dst.error = -err;
2134 rth->rt_flags &= ~RTCF_LOCAL;
2138 struct fib_nh *nh = &FIB_RES_NH(*res);
2140 rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
2141 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2142 WARN_ON(rth->dst.input == lwtunnel_input);
2143 rth->dst.lwtstate->orig_input = rth->dst.input;
2144 rth->dst.input = lwtunnel_input;
2147 if (unlikely(!rt_cache_route(nh, rth)))
2148 rt_add_uncached_list(rth);
2150 skb_dst_set(skb, &rth->dst);
2155 RT_CACHE_STAT_INC(in_no_route);
2156 res->type = RTN_UNREACHABLE;
2162 * Do not cache martian addresses: they should be logged (RFC1812)
2164 martian_destination:
2165 RT_CACHE_STAT_INC(in_martian_dst);
2166 #ifdef CONFIG_IP_ROUTE_VERBOSE
2167 if (IN_DEV_LOG_MARTIANS(in_dev))
2168 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2169 &daddr, &saddr, dev->name);
2181 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2185 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2186 u8 tos, struct net_device *dev)
2188 struct fib_result res;
2191 tos &= IPTOS_RT_MASK;
2193 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2198 EXPORT_SYMBOL(ip_route_input_noref);
2200 /* called with rcu_read_lock held */
2201 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2202 u8 tos, struct net_device *dev, struct fib_result *res)
2204 /* Multicast recognition logic was moved from the route cache to here.
2205 The problem was that too many Ethernet cards have broken/missing
2206 hardware multicast filters :-( As a result, a host on a multicast
2207 network acquires a lot of useless route cache entries, e.g. for
2208 SDR messages from all over the world. Now we try to get rid of them.
2209 Really, provided the software IP multicast filter is organized
2210 reasonably (at least, hashed), it does not result in a slowdown
2211 compared with route cache reject entries.
2212 Note that multicast routers are not affected, because
2213 a route cache entry is created eventually.
2215 if (ipv4_is_multicast(daddr)) {
2216 struct in_device *in_dev = __in_dev_get_rcu(dev);
2222 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2223 ip_hdr(skb)->protocol);
2225 /* check l3 master if no match yet */
2226 if (!our && netif_is_l3_slave(dev)) {
2227 struct in_device *l3_in_dev;
2229 l3_in_dev = __in_dev_get_rcu(skb->dev);
2231 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2232 ip_hdr(skb)->protocol);
2236 #ifdef CONFIG_IP_MROUTE
2238 (!ipv4_is_local_multicast(daddr) &&
2239 IN_DEV_MFORWARD(in_dev))
2242 err = ip_route_input_mc(skb, daddr, saddr,
2248 return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2251 /* called with rcu_read_lock() */
2252 static struct rtable *__mkroute_output(const struct fib_result *res,
2253 const struct flowi4 *fl4, int orig_oif,
2254 struct net_device *dev_out,
2257 struct fib_info *fi = res->fi;
2258 struct fib_nh_exception *fnhe;
2259 struct in_device *in_dev;
2260 u16 type = res->type;
2264 in_dev = __in_dev_get_rcu(dev_out);
2266 return ERR_PTR(-EINVAL);
2268 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2269 if (ipv4_is_loopback(fl4->saddr) &&
2270 !(dev_out->flags & IFF_LOOPBACK) &&
2271 !netif_is_l3_master(dev_out))
2272 return ERR_PTR(-EINVAL);
2274 if (ipv4_is_lbcast(fl4->daddr))
2275 type = RTN_BROADCAST;
2276 else if (ipv4_is_multicast(fl4->daddr))
2277 type = RTN_MULTICAST;
2278 else if (ipv4_is_zeronet(fl4->daddr))
2279 return ERR_PTR(-EINVAL);
2281 if (dev_out->flags & IFF_LOOPBACK)
2282 flags |= RTCF_LOCAL;
2285 if (type == RTN_BROADCAST) {
2286 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2288 } else if (type == RTN_MULTICAST) {
2289 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2290 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2292 flags &= ~RTCF_LOCAL;
2295 /* If a multicast route does not exist, use the
2296 * default one, but do not gateway in this case.
2299 if (fi && res->prefixlen < 4)
2301 } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2302 (orig_oif != dev_out->ifindex)) {
2303 /* For local routes that require a particular output interface
2304 * we do not want to cache the result. Caching the result
2305 * causes incorrect behaviour when there are multiple source
2306 * addresses on the interface, the end result being that if the
2307 * intended recipient is waiting on that interface for the
2308 * packet, they won't receive it because it will be delivered on
2309 * the loopback interface and the IP_PKTINFO ipi_ifindex will
2310 * be set to the loopback interface as well.
2316 do_cache &= fi != NULL;
2318 struct rtable __rcu **prth;
2319 struct fib_nh *nh = &FIB_RES_NH(*res);
2321 fnhe = find_exception(nh, fl4->daddr);
2325 prth = &fnhe->fnhe_rth_output;
2327 if (unlikely(fl4->flowi4_flags &
2328 FLOWI_FLAG_KNOWN_NH &&
2330 nh->nh_scope == RT_SCOPE_LINK))) {
2334 prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
2336 rth = rcu_dereference(*prth);
2337 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2342 rth = rt_dst_alloc(dev_out, flags, type,
2343 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2344 IN_DEV_CONF_GET(in_dev, NOXFRM),
2347 return ERR_PTR(-ENOBUFS);
2349 rth->rt_iif = orig_oif;
2351 RT_CACHE_STAT_INC(out_slow_tot);
2353 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2354 if (flags & RTCF_LOCAL &&
2355 !(dev_out->flags & IFF_LOOPBACK)) {
2356 rth->dst.output = ip_mc_output;
2357 RT_CACHE_STAT_INC(out_slow_mc);
2359 #ifdef CONFIG_IP_MROUTE
2360 if (type == RTN_MULTICAST) {
2361 if (IN_DEV_MFORWARD(in_dev) &&
2362 !ipv4_is_local_multicast(fl4->daddr)) {
2363 rth->dst.input = ip_mr_input;
2364 rth->dst.output = ip_mc_output;
2370 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2371 lwtunnel_set_redirect(&rth->dst);
2377 * Major route resolver routine.
2380 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2381 const struct sk_buff *skb)
2383 __u8 tos = RT_FL_TOS(fl4);
2384 struct fib_result res = {
2392 fl4->flowi4_iif = LOOPBACK_IFINDEX;
2393 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2394 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2395 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2398 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2403 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
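/* Usage sketch (illustration only): a typical unicast output lookup
 * reaches this resolver through the ip_route_output_key() wrapper in
 * include/net/route.h:
 *
 *	struct flowi4 fl4 = {
 *		.daddr = daddr,
 *		.flowi4_proto = IPPROTO_UDP,
 *	};
 *	struct rtable *rt = ip_route_output_key(net, &fl4);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);
 */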
2405 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2406 struct fib_result *res,
2407 const struct sk_buff *skb)
2409 struct net_device *dev_out = NULL;
2410 int orig_oif = fl4->flowi4_oif;
2411 unsigned int flags = 0;
2416 if (ipv4_is_multicast(fl4->saddr) ||
2417 ipv4_is_lbcast(fl4->saddr) ||
2418 ipv4_is_zeronet(fl4->saddr)) {
2419 rth = ERR_PTR(-EINVAL);
2423 rth = ERR_PTR(-ENETUNREACH);
2425 /* I removed the check for oif == dev_out->oif here.
2426 It was wrong for two reasons:
2427 1. ip_dev_find(net, saddr) can return the wrong iface if saddr
2428 is assigned to multiple interfaces.
2429 2. Moreover, we are allowed to send packets with the saddr
2430 of another iface. --ANK
2433 if (fl4->flowi4_oif == 0 &&
2434 (ipv4_is_multicast(fl4->daddr) ||
2435 ipv4_is_lbcast(fl4->daddr))) {
2436 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2437 dev_out = __ip_dev_find(net, fl4->saddr, false);
2441 /* Special hack: the user can direct multicasts
2442 and limited broadcast via the necessary interface
2443 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2444 This hack is not just for fun, it allows
2445 vic, vat and friends to work.
2446 They bind a socket to loopback, set ttl to zero
2447 and expect that it will work.
2448 From the viewpoint of the routing cache they are broken,
2449 because we are not allowed to build a multicast path
2450 with a loopback source addr (look, the routing cache
2451 cannot know that ttl is zero, so the packet
2452 will not leave this host and the route looks valid).
2453 Luckily, this hack is a good workaround.
2456 fl4->flowi4_oif = dev_out->ifindex;
2460 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2461 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2462 if (!__ip_dev_find(net, fl4->saddr, false))
2468 if (fl4->flowi4_oif) {
2469 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2470 rth = ERR_PTR(-ENODEV);
2474 /* RACE: Check return value of inet_select_addr instead. */
2475 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2476 rth = ERR_PTR(-ENETUNREACH);
2479 if (ipv4_is_local_multicast(fl4->daddr) ||
2480 ipv4_is_lbcast(fl4->daddr) ||
2481 fl4->flowi4_proto == IPPROTO_IGMP) {
2483 fl4->saddr = inet_select_addr(dev_out, 0,
2488 if (ipv4_is_multicast(fl4->daddr))
2489 fl4->saddr = inet_select_addr(dev_out, 0,
2491 else if (!fl4->daddr)
2492 fl4->saddr = inet_select_addr(dev_out, 0,
2498 fl4->daddr = fl4->saddr;
2500 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2501 dev_out = net->loopback_dev;
2502 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2503 res->type = RTN_LOCAL;
2504 flags |= RTCF_LOCAL;
2508 err = fib_lookup(net, fl4, res, 0);
2509 if (err) {
2510 res->fi = NULL;
2511 res->table = NULL;
2512 if (fl4->flowi4_oif &&
2513 (ipv4_is_multicast(fl4->daddr) ||
2514 !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2515 /* Apparently, the routing tables are wrong. Assume
2516 that the destination is on-link.
2519 Why? Because we are allowed to send to an iface
2520 even if it has NO routes and NO assigned
2521 addresses. When oif is specified, the routing
2522 tables are looked up with only one purpose:
2523 to catch whether the destination is gatewayed rather than
2524 direct. Moreover, if MSG_DONTROUTE is set,
2525 we send the packet, ignoring both the routing tables
2526 and the ifaddr state. --ANK
2529 We could make it even if oif is unknown,
2530 likely IPv6, but we do not.
2531 */
2533 if (fl4->saddr == 0)
2534 fl4->saddr = inet_select_addr(dev_out, 0,
2535 RT_SCOPE_LINK);
2536 res->type = RTN_UNICAST;
2537 goto make_route;
2538 }
2539 rth = ERR_PTR(err);
2540 goto out;
2541 }
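/*
 * Example (userspace illustration, not part of this file's build): the
 * on-link assumption above is what makes a device-bound send work even
 * when no route covers the destination; "eth0" and 203.0.113.7 are
 * placeholders.
 */
#if 0
#include <arpa/inet.h>
#include <netinet/in.h>
#include <sys/socket.h>

static int send_onlink(void)
{
	struct sockaddr_in dst = { .sin_family = AF_INET,
				   .sin_port = htons(9) };
	int fd = socket(AF_INET, SOCK_DGRAM, 0);

	if (fd < 0)
		return -1;
	/* Pin the output interface (oif), as discussed above. */
	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", 4);
	inet_pton(AF_INET, "203.0.113.7", &dst.sin_addr);
	/* MSG_DONTROUTE additionally bypasses any gateway selection. */
	return sendto(fd, "x", 1, MSG_DONTROUTE,
		      (struct sockaddr *)&dst, sizeof(dst));
}
#endif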
2543 if (res->type == RTN_LOCAL) {
2544 if (!fl4->saddr) {
2545 if (res->fi->fib_prefsrc)
2546 fl4->saddr = res->fi->fib_prefsrc;
2547 else
2548 fl4->saddr = fl4->daddr;
2549 }
2551 /* L3 master device is the loopback for that domain */
2552 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2553 net->loopback_dev;
2555 /* make sure orig_oif points to the fib result device even
2556 * though packet rx/tx happens over loopback or the l3mdev
2557 */
2558 orig_oif = FIB_RES_OIF(*res);
2560 fl4->flowi4_oif = dev_out->ifindex;
2561 flags |= RTCF_LOCAL;
2562 goto make_route;
2563 }
2565 fib_select_path(net, res, fl4, skb);
2567 dev_out = FIB_RES_DEV(*res);
2568 fl4->flowi4_oif = dev_out->ifindex;
2569 make_route:
2570 rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2572 out:
2573 return rth;
2574 }
2576 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2577 {
2578 return NULL;
2579 }
2581 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2582 {
2583 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2585 return mtu ? : dst->dev->mtu;
2586 }
2588 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2589 struct sk_buff *skb, u32 mtu,
2590 bool confirm_neigh)
2591 {
2592 }
2594 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2595 struct sk_buff *skb)
2596 {
2597 }
2599 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2600 unsigned long old)
2601 {
2602 return NULL;
2603 }
2605 static struct dst_ops ipv4_dst_blackhole_ops = {
2606 .family = AF_INET,
2607 .check = ipv4_blackhole_dst_check,
2608 .mtu = ipv4_blackhole_mtu,
2609 .default_advmss = ipv4_default_advmss,
2610 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2611 .redirect = ipv4_rt_blackhole_redirect,
2612 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2613 .neigh_lookup = ipv4_neigh_lookup,
2614 };
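/* Note: a blackhole route is an inert stand-in for a real route: its
 * input/output hooks discard packets, PMTU updates and redirects are
 * ignored, and the dst is allocated DST_OBSOLETE_DEAD so nothing caches
 * it. One user is the xfrm layer, which can hand this back, e.g. while
 * an IPsec state is still being resolved.
 */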
2616 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2617 {
2618 struct rtable *ort = (struct rtable *) dst_orig;
2619 struct rtable *rt;
2621 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2622 if (rt) {
2623 struct dst_entry *new = &rt->dst;
2625 new->__use = 1;
2626 new->input = dst_discard;
2627 new->output = dst_discard_out;
2629 new->dev = net->loopback_dev;
2630 if (new->dev)
2631 dev_hold(new->dev);
2633 rt->rt_is_input = ort->rt_is_input;
2634 rt->rt_iif = ort->rt_iif;
2635 rt->rt_pmtu = ort->rt_pmtu;
2636 rt->rt_mtu_locked = ort->rt_mtu_locked;
2638 rt->rt_genid = rt_genid_ipv4(net);
2639 rt->rt_flags = ort->rt_flags;
2640 rt->rt_type = ort->rt_type;
2641 rt->rt_gateway = ort->rt_gateway;
2642 rt->rt_uses_gateway = ort->rt_uses_gateway;
2644 INIT_LIST_HEAD(&rt->rt_uncached);
2645 }
2647 dst_release(dst_orig);
2649 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2650 }
2652 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2653 const struct sock *sk)
2654 {
2655 struct rtable *rt = __ip_route_output_key(net, flp4);
2657 if (IS_ERR(rt))
2658 return rt;
2660 if (flp4->flowi4_proto) {
2661 flp4->flowi4_oif = rt->dst.dev->ifindex;
2662 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2663 flowi4_to_flowi(flp4),
2664 sk, 0);
2665 }
2667 return rt;
2668 }
2669 EXPORT_SYMBOL_GPL(ip_route_output_flow);
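/*
 * Example (in-kernel usage sketch, not compiled here): the usual pattern
 * for an output-route lookup. The function and field choices below are
 * illustrative; error handling is abbreviated.
 */
#if 0
static int example_output_lookup(struct net *net, __be32 daddr, __be32 saddr)
{
	struct flowi4 fl4 = {
		.daddr = daddr,
		.saddr = saddr,
		.flowi4_proto = IPPROTO_UDP,
	};
	struct rtable *rt;

	rt = ip_route_output_flow(net, &fl4, NULL);
	if (IS_ERR(rt))
		return PTR_ERR(rt);
	/* ... transmit via rt->dst ... */
	ip_rt_put(rt);	/* release the reference the lookup took */
	return 0;
}
#endif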
2671 /* called with rcu_read_lock held */
2672 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2673 struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2674 struct sk_buff *skb, u32 portid, u32 seq)
2675 {
2676 struct rtmsg *r;
2677 struct nlmsghdr *nlh;
2678 unsigned long expires = 0;
2679 u32 error;
2680 u32 metrics[RTAX_MAX];
2682 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2683 if (!nlh)
2684 return -EMSGSIZE;
2686 r = nlmsg_data(nlh);
2687 r->rtm_family = AF_INET;
2688 r->rtm_dst_len = 32;
2689 r->rtm_src_len = 0;
2690 r->rtm_tos = fl4->flowi4_tos;
2691 r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2692 if (nla_put_u32(skb, RTA_TABLE, table_id))
2693 goto nla_put_failure;
2694 r->rtm_type = rt->rt_type;
2695 r->rtm_scope = RT_SCOPE_UNIVERSE;
2696 r->rtm_protocol = RTPROT_UNSPEC;
2697 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2698 if (rt->rt_flags & RTCF_NOTIFY)
2699 r->rtm_flags |= RTM_F_NOTIFY;
2700 if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2701 r->rtm_flags |= RTCF_DOREDIRECT;
2703 if (nla_put_in_addr(skb, RTA_DST, dst))
2704 goto nla_put_failure;
2705 if (src) {
2706 r->rtm_src_len = 32;
2707 if (nla_put_in_addr(skb, RTA_SRC, src))
2708 goto nla_put_failure;
2709 }
2710 if (rt->dst.dev &&
2711 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2712 goto nla_put_failure;
2713 #ifdef CONFIG_IP_ROUTE_CLASSID
2714 if (rt->dst.tclassid &&
2715 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2716 goto nla_put_failure;
2717 #endif
2718 if (!rt_is_input_route(rt) &&
2719 fl4->saddr != src) {
2720 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2721 goto nla_put_failure;
2722 }
2723 if (rt->rt_uses_gateway &&
2724 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2725 goto nla_put_failure;
2727 expires = rt->dst.expires;
2728 if (expires) {
2729 unsigned long now = jiffies;
2731 if (time_before(now, expires))
2732 expires -= now;
2733 else
2734 expires = 0;
2735 }
2737 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2738 if (rt->rt_pmtu && expires)
2739 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2740 if (rt->rt_mtu_locked && expires)
2741 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2742 if (rtnetlink_put_metrics(skb, metrics) < 0)
2743 goto nla_put_failure;
2745 if (fl4->flowi4_mark &&
2746 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2747 goto nla_put_failure;
2749 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2750 nla_put_u32(skb, RTA_UID,
2751 from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2752 goto nla_put_failure;
2754 error = rt->dst.error;
2756 if (rt_is_input_route(rt)) {
2757 #ifdef CONFIG_IP_MROUTE
2758 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2759 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2760 int err = ipmr_get_route(net, skb,
2761 fl4->saddr, fl4->daddr,
2762 r, portid);
2764 if (err <= 0) {
2765 if (err == 0)
2766 return 0;
2767 goto nla_put_failure;
2768 }
2769 } else
2770 #endif
2771 if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2772 goto nla_put_failure;
2773 }
2775 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2776 goto nla_put_failure;
2778 nlmsg_end(skb, nlh);
2779 return 0;
2781 nla_put_failure:
2782 nlmsg_cancel(skb, nlh);
2783 return -EMSGSIZE;
2784 }
2786 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
2787 u8 ip_proto, __be16 sport,
2788 __be16 dport)
2789 {
2790 struct sk_buff *skb;
2791 struct iphdr *iph;
2793 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2794 if (!skb)
2795 return NULL;
2797 /* Reserve room for dummy headers; this skb can pass
2798 * through a good chunk of the routing engine.
2799 */
2800 skb_reset_mac_header(skb);
2801 skb_reset_network_header(skb);
2802 skb->protocol = htons(ETH_P_IP);
2803 iph = skb_put(skb, sizeof(struct iphdr));
2804 iph->protocol = ip_proto;
2805 iph->saddr = src;
2806 iph->daddr = dst;
2807 iph->version = 0x4;
2808 iph->frag_off = 0;
2809 iph->ihl = 0x5;
2810 skb_set_transport_header(skb, skb->len);
2812 switch (iph->protocol) {
2813 case IPPROTO_UDP: {
2814 struct udphdr *udph;
2816 udph = skb_put_zero(skb, sizeof(struct udphdr));
2817 udph->source = sport;
2818 udph->dest = dport;
2819 udph->len = htons(sizeof(struct udphdr));
2820 udph->check = 0;
2821 break;
2822 }
2823 case IPPROTO_TCP: {
2824 struct tcphdr *tcph;
2826 tcph = skb_put_zero(skb, sizeof(struct tcphdr));
2827 tcph->source = sport;
2828 tcph->dest = dport;
2829 tcph->doff = sizeof(struct tcphdr) / 4;
2830 tcph->rst = 1;
2831 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
2832 src, dst, 0);
2833 break;
2834 }
2835 case IPPROTO_ICMP: {
2836 struct icmphdr *icmph;
2838 icmph = skb_put_zero(skb, sizeof(struct icmphdr));
2839 icmph->type = ICMP_ECHO;
2840 icmph->code = 0;
2841 }
2842 }
2844 return skb;
2845 }
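/* Note: the dummy IP and transport headers built above are never sent;
 * they exist so a getroute request traverses the stack the same way real
 * traffic would, e.g. so multipath hashing that keys on L4 ports sees
 * the RTA_SPORT/RTA_DPORT values supplied in the request.
 */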
2847 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2848 struct netlink_ext_ack *extack)
2849 {
2850 struct net *net = sock_net(in_skb->sk);
2851 struct nlattr *tb[RTA_MAX+1];
2852 u32 table_id = RT_TABLE_MAIN;
2853 __be16 sport = 0, dport = 0;
2854 struct fib_result res = {};
2855 u8 ip_proto = IPPROTO_UDP;
2856 struct rtable *rt = NULL;
2857 struct sk_buff *skb;
2858 struct rtmsg *rtm;
2859 struct flowi4 fl4;
2860 __be32 dst = 0;
2861 __be32 src = 0;
2862 kuid_t uid;
2863 u32 iif;
2864 int err;
2865 int mark;
2867 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2868 extack);
2869 if (err < 0)
2870 return err;
2872 rtm = nlmsg_data(nlh);
2873 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2874 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2875 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2876 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2877 if (tb[RTA_UID])
2878 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2879 else
2880 uid = (iif ? INVALID_UID : current_uid());
2882 if (tb[RTA_IP_PROTO]) {
2883 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
2884 &ip_proto, AF_INET, extack);
2885 if (err)
2886 return err;
2887 }
2889 if (tb[RTA_SPORT])
2890 sport = nla_get_be16(tb[RTA_SPORT]);
2892 if (tb[RTA_DPORT])
2893 dport = nla_get_be16(tb[RTA_DPORT]);
2895 skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
2896 if (!skb)
2897 return -ENOBUFS;
2899 memset(&fl4, 0, sizeof(fl4));
2900 fl4.daddr = dst;
2901 fl4.saddr = src;
2902 fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
2903 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2904 fl4.flowi4_mark = mark;
2905 fl4.flowi4_uid = uid;
2906 if (sport)
2907 fl4.fl4_sport = sport;
2908 if (dport)
2909 fl4.fl4_dport = dport;
2910 fl4.flowi4_proto = ip_proto;
2912 rcu_read_lock();
2914 if (iif) {
2915 struct net_device *dev;
2917 dev = dev_get_by_index_rcu(net, iif);
2918 if (!dev) {
2919 err = -ENODEV;
2920 goto errout_rcu;
2921 }
2923 fl4.flowi4_iif = iif; /* for rt_fill_info */
2924 skb->dev = dev;
2925 skb->mark = mark;
2926 err = ip_route_input_rcu(skb, dst, src,
2927 rtm->rtm_tos & IPTOS_RT_MASK, dev,
2928 &res);
2930 rt = skb_rtable(skb);
2931 if (err == 0 && rt->dst.error)
2932 err = -rt->dst.error;
2933 } else {
2934 fl4.flowi4_iif = LOOPBACK_IFINDEX;
2935 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
2936 err = 0;
2937 if (IS_ERR(rt))
2938 err = PTR_ERR(rt);
2939 else
2940 skb_dst_set(skb, &rt->dst);
2941 }
2943 if (err)
2944 goto errout_rcu;
2946 if (rtm->rtm_flags & RTM_F_NOTIFY)
2947 rt->rt_flags |= RTCF_NOTIFY;
2949 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2950 table_id = res.table ? res.table->tb_id : 0;
2952 /* reset skb for netlink reply msg */
2953 skb_trim(skb, 0);
2954 skb_reset_network_header(skb);
2955 skb_reset_transport_header(skb);
2956 skb_reset_mac_header(skb);
2958 if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2959 if (!res.fi) {
2960 err = fib_props[res.type].error;
2961 if (!err)
2962 err = -EHOSTUNREACH;
2963 goto errout_rcu;
2964 }
2965 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2966 nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2967 rt->rt_type, res.prefix, res.prefixlen,
2968 fl4.flowi4_tos, res.fi, 0);
2969 } else {
2970 err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
2971 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
2972 }
2973 if (err < 0)
2974 goto errout_rcu;
2976 rcu_read_unlock();
2978 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2980 errout_free:
2981 return err;
2983 errout_rcu:
2984 rcu_read_unlock();
2985 kfree_skb(skb);
2986 goto errout_free;
2987 }
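/*
 * Example (userspace illustration, not part of this file's build): the
 * handler above is what services "ip route get". A minimal raw rtnetlink
 * query for 203.0.113.7; reply parsing and error handling are omitted.
 */
#if 0
#include <arpa/inet.h>
#include <linux/rtnetlink.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static int route_get(void)
{
	struct {
		struct nlmsghdr nlh;
		struct rtmsg rtm;
		struct rtattr rta;
		struct in_addr dst;
	} req;
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	if (fd < 0)
		return -1;
	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = sizeof(req);
	req.nlh.nlmsg_type = RTM_GETROUTE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST;
	req.rtm.rtm_family = AF_INET;
	req.rta.rta_type = RTA_DST;
	req.rta.rta_len = RTA_LENGTH(sizeof(req.dst));
	inet_pton(AF_INET, "203.0.113.7", &req.dst);
	send(fd, &req, sizeof(req), 0);
	/* ... recv() the RTM_NEWROUTE reply built by rt_fill_info() ... */
	close(fd);
	return 0;
}
#endif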
2988 void ip_rt_multicast_event(struct in_device *in_dev)
2989 {
2990 rt_cache_flush(dev_net(in_dev->dev));
2991 }
2993 #ifdef CONFIG_SYSCTL
2994 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
2995 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
2996 static int ip_rt_gc_elasticity __read_mostly = 8;
2997 static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
2999 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
3000 void __user *buffer,
3001 size_t *lenp, loff_t *ppos)
3002 {
3003 struct net *net = (struct net *)__ctl->extra1;
3005 if (write) {
3006 rt_cache_flush(net);
3007 fnhe_genid_bump(net);
3008 return 0;
3009 }
3011 return -EINVAL;
3012 }
3014 static struct ctl_table ipv4_route_table[] = {
3015 {
3016 .procname = "gc_thresh",
3017 .data = &ipv4_dst_ops.gc_thresh,
3018 .maxlen = sizeof(int),
3019 .mode = 0644,
3020 .proc_handler = proc_dointvec,
3021 },
3022 {
3023 .procname = "max_size",
3024 .data = &ip_rt_max_size,
3025 .maxlen = sizeof(int),
3026 .mode = 0644,
3027 .proc_handler = proc_dointvec,
3028 },
3029 {
3030 /* Deprecated. Use gc_min_interval_ms */
3032 .procname = "gc_min_interval",
3033 .data = &ip_rt_gc_min_interval,
3034 .maxlen = sizeof(int),
3035 .mode = 0644,
3036 .proc_handler = proc_dointvec_jiffies,
3037 },
3038 {
3039 .procname = "gc_min_interval_ms",
3040 .data = &ip_rt_gc_min_interval,
3041 .maxlen = sizeof(int),
3042 .mode = 0644,
3043 .proc_handler = proc_dointvec_ms_jiffies,
3044 },
3045 {
3046 .procname = "gc_timeout",
3047 .data = &ip_rt_gc_timeout,
3048 .maxlen = sizeof(int),
3049 .mode = 0644,
3050 .proc_handler = proc_dointvec_jiffies,
3051 },
3052 {
3053 .procname = "gc_interval",
3054 .data = &ip_rt_gc_interval,
3055 .maxlen = sizeof(int),
3056 .mode = 0644,
3057 .proc_handler = proc_dointvec_jiffies,
3058 },
3059 {
3060 .procname = "redirect_load",
3061 .data = &ip_rt_redirect_load,
3062 .maxlen = sizeof(int),
3063 .mode = 0644,
3064 .proc_handler = proc_dointvec,
3065 },
3066 {
3067 .procname = "redirect_number",
3068 .data = &ip_rt_redirect_number,
3069 .maxlen = sizeof(int),
3070 .mode = 0644,
3071 .proc_handler = proc_dointvec,
3072 },
3073 {
3074 .procname = "redirect_silence",
3075 .data = &ip_rt_redirect_silence,
3076 .maxlen = sizeof(int),
3077 .mode = 0644,
3078 .proc_handler = proc_dointvec,
3079 },
3080 {
3081 .procname = "error_cost",
3082 .data = &ip_rt_error_cost,
3083 .maxlen = sizeof(int),
3084 .mode = 0644,
3085 .proc_handler = proc_dointvec,
3086 },
3087 {
3088 .procname = "error_burst",
3089 .data = &ip_rt_error_burst,
3090 .maxlen = sizeof(int),
3091 .mode = 0644,
3092 .proc_handler = proc_dointvec,
3093 },
3094 {
3095 .procname = "gc_elasticity",
3096 .data = &ip_rt_gc_elasticity,
3097 .maxlen = sizeof(int),
3098 .mode = 0644,
3099 .proc_handler = proc_dointvec,
3100 },
3101 {
3102 .procname = "mtu_expires",
3103 .data = &ip_rt_mtu_expires,
3104 .maxlen = sizeof(int),
3105 .mode = 0644,
3106 .proc_handler = proc_dointvec_jiffies,
3107 },
3108 {
3109 .procname = "min_pmtu",
3110 .data = &ip_rt_min_pmtu,
3111 .maxlen = sizeof(int),
3112 .mode = 0644,
3113 .proc_handler = proc_dointvec_minmax,
3114 .extra1 = &ip_min_valid_pmtu,
3115 },
3116 {
3117 .procname = "min_adv_mss",
3118 .data = &ip_rt_min_advmss,
3119 .maxlen = sizeof(int),
3120 .mode = 0644,
3121 .proc_handler = proc_dointvec,
3122 },
3123 { }
3124 };
3126 static struct ctl_table ipv4_route_flush_table[] = {
3127 {
3128 .procname = "flush",
3129 .maxlen = sizeof(int),
3130 .mode = 0200,
3131 .proc_handler = ipv4_sysctl_rtcache_flush,
3132 },
3133 { },
3134 };
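/*
 * Example (userspace illustration, not part of this file's build): the
 * tables above surface as /proc/sys/net/ipv4/route/*. Any write to the
 * write-only "flush" entry invokes ipv4_sysctl_rtcache_flush() above:
 */
#if 0
#include <fcntl.h>
#include <unistd.h>

static int flush_route_cache(void)
{
	int fd = open("/proc/sys/net/ipv4/route/flush", O_WRONLY);

	if (fd < 0)
		return -1;
	write(fd, "1", 1);	/* the written value is ignored */
	close(fd);
	return 0;
}
#endif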
3136 static __net_init int sysctl_route_net_init(struct net *net)
3137 {
3138 struct ctl_table *tbl;
3140 tbl = ipv4_route_flush_table;
3141 if (!net_eq(net, &init_net)) {
3142 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3143 if (!tbl)
3144 goto err_dup;
3146 /* Don't export sysctls to unprivileged users */
3147 if (net->user_ns != &init_user_ns)
3148 tbl[0].procname = NULL;
3149 }
3150 tbl[0].extra1 = net;
3152 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3153 if (!net->ipv4.route_hdr)
3154 goto err_reg;
3155 return 0;
3157 err_reg:
3158 if (tbl != ipv4_route_flush_table)
3159 kfree(tbl);
3160 err_dup:
3161 return -ENOMEM;
3162 }
3164 static __net_exit void sysctl_route_net_exit(struct net *net)
3165 {
3166 struct ctl_table *tbl;
3168 tbl = net->ipv4.route_hdr->ctl_table_arg;
3169 unregister_net_sysctl_table(net->ipv4.route_hdr);
3170 BUG_ON(tbl == ipv4_route_flush_table);
3171 kfree(tbl);
3172 }
3174 static __net_initdata struct pernet_operations sysctl_route_ops = {
3175 .init = sysctl_route_net_init,
3176 .exit = sysctl_route_net_exit,
3177 };
3178 #endif
3180 static __net_init int rt_genid_init(struct net *net)
3181 {
3182 atomic_set(&net->ipv4.rt_genid, 0);
3183 atomic_set(&net->fnhe_genid, 0);
3184 atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3185 return 0;
3186 }
3188 static __net_initdata struct pernet_operations rt_genid_ops = {
3189 .init = rt_genid_init,
3190 };
3192 static int __net_init ipv4_inetpeer_init(struct net *net)
3193 {
3194 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3196 if (!bp)
3197 return -ENOMEM;
3198 inet_peer_base_init(bp);
3199 net->ipv4.peers = bp;
3200 return 0;
3201 }
3203 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3204 {
3205 struct inet_peer_base *bp = net->ipv4.peers;
3207 net->ipv4.peers = NULL;
3208 inetpeer_invalidate_tree(bp);
3209 kfree(bp);
3210 }
3212 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3213 .init = ipv4_inetpeer_init,
3214 .exit = ipv4_inetpeer_exit,
3215 };
3217 #ifdef CONFIG_IP_ROUTE_CLASSID
3218 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3219 #endif /* CONFIG_IP_ROUTE_CLASSID */
3221 int __init ip_rt_init(void)
3222 {
3223 void *idents_hash;
3224 int cpu;
3226 /* For modern hosts, this will use 2 MB of memory */
3227 idents_hash = alloc_large_system_hash("IP idents",
3228 sizeof(*ip_idents) + sizeof(*ip_tstamps),
3229 0,
3230 16, /* one bucket per 64 KB */
3231 HASH_ZERO,
3232 NULL,
3233 &ip_idents_mask,
3234 2048,
3235 256*1024);
3237 ip_idents = idents_hash;
3239 prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
3241 ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
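/* Sizing note for the comment above: each bucket pairs one ip_idents
 * entry (atomic_t, 4 bytes) with one ip_tstamps entry (u32, 4 bytes),
 * and the allocation is capped at 256K buckets, i.e. 256K * 8 bytes =
 * 2 MB on large machines; smaller systems get a smaller power-of-two
 * table, reflected in ip_idents_mask.
 */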
3243 for_each_possible_cpu(cpu) {
3244 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3246 INIT_LIST_HEAD(&ul->head);
3247 spin_lock_init(&ul->lock);
3248 }
3249 #ifdef CONFIG_IP_ROUTE_CLASSID
3250 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3251 if (!ip_rt_acct)
3252 panic("IP: failed to allocate ip_rt_acct\n");
3253 #endif
3255 ipv4_dst_ops.kmem_cachep =
3256 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3257 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3259 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3261 if (dst_entries_init(&ipv4_dst_ops) < 0)
3262 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3264 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3265 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3267 ipv4_dst_ops.gc_thresh = ~0;
3268 ip_rt_max_size = INT_MAX;
3270 devinet_init();
3271 ip_fib_init();
3273 if (ip_rt_proc_init())
3274 pr_err("Unable to create route proc files\n");
3275 #ifdef CONFIG_XFRM
3276 xfrm_init();
3277 xfrm4_init();
3278 #endif
3279 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3280 RTNL_FLAG_DOIT_UNLOCKED);
3282 #ifdef CONFIG_SYSCTL
3283 register_pernet_subsys(&sysctl_route_ops);
3284 #endif
3285 register_pernet_subsys(&rt_genid_ops);
3286 register_pernet_subsys(&ipv4_inetpeer_ops);
3287 return 0;
3288 }
3290 #ifdef CONFIG_SYSCTL
3291 /*
3292 * We really need to sanitize the damn IPv4 init order; then all
3293 * this nonsense will go away.
3294 */
3295 void __init ip_static_sysctl_init(void)
3296 {
3297 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3298 }
3299 #endif