/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
 *		as published by the Free Software Foundation; either version
 *		2 of the License, or (at your option) any later version.
 */
#define pr_fmt(fmt) "IPv4: " fmt
#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/bootmem.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
#include "fib_lookup.h"

#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))

#define RT_GC_TIMEOUT (300*HZ)
static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static u32 ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;

static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
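
/* These knobs are exported under /proc/sys/net/ipv4/route/ via the
 * ipv4_route_table sysctl table later in this file.  For example, the
 * floor that a "fragmentation needed" message may force on a path
 * corresponds to ip_rt_min_pmtu above:
 *
 *	# sysctl net.ipv4.route.min_pmtu
 *	net.ipv4.route.min_pmtu = 552
 *
 * 552 = 512 + 20 + 20, i.e. 512 bytes of payload plus one IP and one
 * TCP header.
 */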
/*
 *	Interface to generic destination cache.
 */
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu,
					   bool confirm_neigh);
static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static void		 ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr);
static void		 ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
static struct dst_ops ipv4_dst_ops = {
	.family			= AF_INET,
	.check			= ipv4_dst_check,
	.default_advmss		= ipv4_default_advmss,
	.mtu			= ipv4_mtu,
	.cow_metrics		= ipv4_cow_metrics,
	.destroy		= ipv4_dst_destroy,
	.negative_advice	= ipv4_negative_advice,
	.link_failure		= ipv4_link_failure,
	.update_pmtu		= ip_rt_update_pmtu,
	.redirect		= ip_do_redirect,
	.local_out		= __ip_local_out,
	.neigh_lookup		= ipv4_neigh_lookup,
	.confirm_neigh		= ipv4_confirm_neigh,
};
#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);
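
/* Consumers index this table with the four RFC 1349 TOS bits.  The
 * helper in include/net/route.h does (at the time of writing):
 *
 *	static inline char rt_tos2priority(u8 tos)
 *	{
 *		return ip_tos2prio[IPTOS_TOS(tos)>>1];
 *	}
 *
 * so e.g. a TOS of 0x10 ("minimize delay") yields TC_PRIO_INTERACTIVE.
 */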
static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (*pos)
		return NULL;
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	return 0;
}
static const struct seq_operations rt_cache_seq_ops = {
	.start	= rt_cache_seq_start,
	.next	= rt_cache_seq_next,
	.stop	= rt_cache_seq_stop,
	.show	= rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}
static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries  in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src  out_hit out_slow_tot out_slow_mc  gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x  %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   0, /* st->in_hit */
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,
		   0, /* st->out_hit */
		   st->out_slow_tot,
		   st->out_slow_mc,
		   0, /* st->gc_total */
		   0, /* st->gc_ignored */
		   0, /* st->gc_goal_miss */
		   0, /* st->gc_dst_overflow */
		   0, /* st->in_hlist_search */
		   0  /* st->out_hlist_search */
		);
	return 0;
}
static const struct seq_operations rt_cpu_seq_ops = {
	.start	= rt_cpu_seq_start,
	.next	= rt_cpu_seq_next,
	.stop	= rt_cpu_seq_stop,
	.show	= rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};
#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}
#endif
static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_create("rt_cache", 0444, net->proc_net,
			  &rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", 0444,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create_single("rt_acct", 0, net->proc_net,
				 rt_acct_proc_show);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}
static struct pernet_operations ip_rt_proc_ops __net_initdata = {
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */
static inline bool rt_is_expired(const struct rtable *rth)
{
	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}
static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr)
{
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;
	struct neighbour *n;

	rt = (const struct rtable *) dst;
	if (rt->rt_gateway)
		pkey = (const __be32 *) &rt->rt_gateway;
	else if (skb)
		pkey = &ip_hdr(skb)->daddr;

	n = __ipv4_neigh_lookup(dev, *(__force u32 *)pkey);
	if (n)
		return n;
	return neigh_create(&arp_tbl, pkey, dev);
}
static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;
	const struct rtable *rt;

	rt = (const struct rtable *)dst;
	if (rt->rt_gateway)
		pkey = (const __be32 *)&rt->rt_gateway;
	else if (!daddr ||
		 (rt->rt_flags &
		  (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL)))
		return;

	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}
/* Hash tables of size 2048..262144 depending on RAM size.
 * Each bucket uses 8 bytes.
 */
static u32 ip_idents_mask __read_mostly;
static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;
/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
	u32 bucket, old, now = (u32)jiffies;
	atomic_t *p_id;
	u32 *p_tstamp;
	u32 delta = 0;

	bucket = hash & ip_idents_mask;
	p_tstamp = ip_tstamps + bucket;
	p_id = ip_idents + bucket;
	old = READ_ONCE(*p_tstamp);

	if (old != now && cmpxchg(p_tstamp, old, now) == old)
		delta = prandom_u32_max(now - old);

	/* If UBSAN reports an error here, please make sure your compiler
	 * supports -fno-strict-overflow before reporting it; that was a bug
	 * in UBSAN, and it has been fixed in GCC-8.
	 */
	return atomic_add_return(segs + delta, p_id) - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
	u32 hash, id;

	/* Note the following code is not safe, but this is okay. */
	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
		get_random_bytes(&net->ipv4.ip_id_key,
				 sizeof(net->ipv4.ip_id_key));

	hash = siphash_3u32((__force u32)iph->daddr,
			    (__force u32)iph->saddr,
			    iph->protocol,
			    &net->ipv4.ip_id_key);
	id = ip_idents_reserve(hash, segs);
	iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);
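
/* Example: for a GSO packet that will be split into ten segments,
 * ip_select_ident_segs() ends up here with segs == 10; the single
 * atomic_add_return() in ip_idents_reserve() then reserves ten
 * consecutive IDs, the first of which is stamped into iph->id, leaving
 * the rest for the segmentation code to assign per segment (a sketch
 * of the caller's view, not an additional code path).
 */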
static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
			     const struct sock *sk,
			     const struct iphdr *iph,
			     int oif, u8 tos,
			     u8 prot, u32 mark, int flow_flags)
{
	if (sk) {
		const struct inet_sock *inet = inet_sk(sk);

		oif = sk->sk_bound_dev_if;
		mark = sk->sk_mark;
		tos = RT_CONN_FLAGS(sk);
		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
	}
	flowi4_init_output(fl4, oif, mark, tos,
			   RT_SCOPE_UNIVERSE, prot,
			   flow_flags,
			   iph->daddr, iph->saddr, 0, 0,
			   sock_net_uid(net, sk));
}
static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
			       const struct sock *sk)
{
	const struct net *net = dev_net(skb->dev);
	const struct iphdr *iph = ip_hdr(skb);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}
static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;
	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
			   inet_sk_flowi_flags(sk),
			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
	rcu_read_unlock();
}
static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (skb)
		build_skb_flow_key(fl4, skb, sk);
	else
		build_sk_flow_key(fl4, sk);
}

static DEFINE_SPINLOCK(fnhe_lock);
static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
	struct rtable *rt;

	rt = rcu_dereference(fnhe->fnhe_rth_input);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
	rt = rcu_dereference(fnhe->fnhe_rth_output);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
}
static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
{
	struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
	struct fib_nh_exception *fnhe, *oldest = NULL;

	for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
		fnhe = rcu_dereference_protected(*fnhe_p,
						 lockdep_is_held(&fnhe_lock));
		if (!fnhe)
			break;
		if (!oldest ||
		    time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
			oldest = fnhe;
			oldest_p = fnhe_p;
		}
	}
	fnhe_flush_routes(oldest);
	*oldest_p = oldest->fnhe_next;
	kfree_rcu(oldest, rcu);
}
static inline u32 fnhe_hashfun(__be32 daddr)
{
	static u32 fnhe_hashrnd __read_mostly;
	u32 hval;

	net_get_random_once(&fnhe_hashrnd, sizeof(fnhe_hashrnd));
	hval = jhash_1word((__force u32) daddr, fnhe_hashrnd);
	return hash_32(hval, FNHE_HASH_SHIFT);
}
static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
	rt->rt_pmtu = fnhe->fnhe_pmtu;
	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
	rt->dst.expires = fnhe->fnhe_expires;

	if (fnhe->fnhe_gw) {
		rt->rt_flags |= RTCF_REDIRECTED;
		rt->rt_gateway = fnhe->fnhe_gw;
		rt->rt_uses_gateway = 1;
	}
}
static void update_or_create_fnhe(struct fib_nh *nh, __be32 daddr, __be32 gw,
				  u32 pmtu, bool lock, unsigned long expires)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe;
	struct rtable *rt;
	u32 genid, hval;
	unsigned int i;
	int depth;

	genid = fnhe_genid(dev_net(nh->nh_dev));
	hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference(nh->nh_exceptions);
	if (!hash) {
		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
		if (!hash)
			goto out_unlock;
		rcu_assign_pointer(nh->nh_exceptions, hash);
	}

	hash += hval;

	depth = 0;
	for (fnhe = rcu_dereference(hash->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			break;
		depth++;
	}

	if (fnhe) {
		if (fnhe->fnhe_genid != genid)
			fnhe->fnhe_genid = genid;
		if (gw)
			fnhe->fnhe_gw = gw;
		if (pmtu) {
			fnhe->fnhe_pmtu = pmtu;
			fnhe->fnhe_mtu_locked = lock;
		}
		fnhe->fnhe_expires = max(1UL, expires);
		/* Update all cached dsts too */
		rt = rcu_dereference(fnhe->fnhe_rth_input);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
		rt = rcu_dereference(fnhe->fnhe_rth_output);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
	} else {
		/* Randomize max depth to avoid some side channel attacks. */
		int max_depth = FNHE_RECLAIM_DEPTH +
				prandom_u32_max(FNHE_RECLAIM_DEPTH);

		while (depth > max_depth) {
			fnhe_remove_oldest(hash);
			depth--;
		}

		fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
		if (!fnhe)
			goto out_unlock;

		fnhe->fnhe_next = hash->chain;

		fnhe->fnhe_genid = genid;
		fnhe->fnhe_daddr = daddr;
		fnhe->fnhe_gw = gw;
		fnhe->fnhe_pmtu = pmtu;
		fnhe->fnhe_mtu_locked = lock;
		fnhe->fnhe_expires = max(1UL, expires);

		rcu_assign_pointer(hash->chain, fnhe);

		/* Exception created; mark the cached routes for the nexthop
		 * stale, so anyone caching it rechecks if this exception
		 * applies to them.
		 */
		rt = rcu_dereference(nh->nh_rth_input);
		if (rt)
			rt->dst.obsolete = DST_OBSOLETE_KILL;

		for_each_possible_cpu(i) {
			struct rtable __rcu **prt;

			prt = per_cpu_ptr(nh->nh_pcpu_rth_output, i);
			rt = rcu_dereference(*prt);
			if (rt)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
		}
	}

	fnhe->fnhe_stamp = jiffies;

out_unlock:
	spin_unlock_bh(&fnhe_lock);
}
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
			     bool kill_route)
{
	__be32 new_gw = icmp_hdr(skb)->un.gateway;
	__be32 old_gw = ip_hdr(skb)->saddr;
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct fib_result res;
	struct neighbour *n;
	struct net *net;

	switch (icmp_hdr(skb)->code & 7) {
	case ICMP_REDIR_NET:
	case ICMP_REDIR_NETTOS:
	case ICMP_REDIR_HOST:
	case ICMP_REDIR_HOSTTOS:
		break;

	default:
		return;
	}

	if (rt->rt_gateway != old_gw)
		return;

	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev)
		return;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
	if (!n)
		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
	if (!IS_ERR(n)) {
		if (!(n->nud_state & NUD_VALID)) {
			neigh_event_send(n, NULL);
		} else {
			if (fib_lookup(net, fl4, &res, 0) == 0) {
				struct fib_nh *nh;

				fib_select_path(net, &res, fl4, skb);
				nh = &FIB_RES_NH(res);
				update_or_create_fnhe(nh, fl4->daddr, new_gw,
						      0, false,
						      jiffies + ip_rt_gc_timeout);
			}
			if (kill_route)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
		}
		neigh_release(n);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev)) {
		const struct iphdr *iph = (const struct iphdr *) skb->data;
		__be32 daddr = iph->daddr;
		__be32 saddr = iph->saddr;

		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     "  Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
	}
#endif
	;
}
static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct rtable *rt;
	struct flowi4 fl4;
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct net *net = dev_net(skb->dev);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	rt = (struct rtable *) dst;

	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
	__ip_do_redirect(rt, skb, &fl4, true);
}
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->dst.expires) {
			ip_rt_put(rt);
			ret = NULL;
		}
	}
	return ret;
}
/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
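
/* Worked example with the defaults above and HZ=1000:
 * ip_rt_redirect_load is HZ/50 = 20 jiffies, so after the first
 * redirect the next one is only sent once 20 << n_redirects jiffies
 * have passed (40ms, 80ms, 160ms, ...); after ip_rt_redirect_number
 * (9) redirects we go quiet entirely, and only a gap of
 * ip_rt_redirect_silence = (HZ/50) << 10 = 20480 jiffies (~20s)
 * without redirect-worthy traffic resets the counters below.
 */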
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	struct net *net;
	int log_martians;
	int vif;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
	rcu_read_unlock();

	net = dev_net(rt->dst.dev);
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
			  rt_nexthop(rt, ip_hdr(skb)->daddr));
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
		peer->rate_tokens = 0;
		peer->n_redirects = 0;
	}

	/* Too many ignored redirects; do not send anything;
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->n_redirects >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		goto out_put_peer;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->n_redirects == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->n_redirects)))) {
		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
		peer->rate_last = jiffies;
		++peer->n_redirects;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->n_redirects == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, inet_iif(skb),
					     &ip_hdr(skb)->daddr, &gw);
#endif
	}
out_put_peer:
	inet_putpeer(peer);
}
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct inet_peer *peer;
	unsigned long now;
	struct net *net;
	bool send;
	int code;

	if (netif_is_l3_master(skb->dev)) {
		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
		if (!dev)
			goto out;
	}

	in_dev = __in_dev_get_rcu(dev);

	/* IP on this device is disabled. */
	if (!in_dev)
		goto out;

	net = dev_net(rt->dst.dev);
	if (!IN_DEV_FORWARD(in_dev)) {
		switch (rt->dst.error) {
		case EHOSTUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
			break;

		case ENETUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
			break;
		}
		goto out;
	}

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
			       l3mdev_master_ifindex(skb->dev), 1);

	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
		inet_putpeer(peer);
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct dst_entry *dst = &rt->dst;
	struct net *net = dev_net(dst->dev);
	u32 old_mtu = ipv4_mtu(dst);
	struct fib_result res;
	bool lock = false;

	if (ip_mtu_locked(dst))
		return;

	if (old_mtu < mtu)
		return;

	if (mtu < ip_rt_min_pmtu) {
		lock = true;
		mtu = min(old_mtu, ip_rt_min_pmtu);
	}

	if (rt->rt_pmtu == mtu && !lock &&
	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
		return;

	rcu_read_lock();
	if (fib_lookup(net, fl4, &res, 0) == 0) {
		struct fib_nh *nh;

		fib_select_path(net, &res, fl4, NULL);
		nh = &FIB_RES_NH(res);
		update_or_create_fnhe(nh, fl4->daddr, 0, mtu, lock,
				      jiffies + ip_rt_mtu_expires);
	}
	rcu_read_unlock();
}
static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			      struct sk_buff *skb, u32 mtu,
			      bool confirm_neigh)
{
	struct rtable *rt = (struct rtable *) dst;
	struct flowi4 fl4;

	ip_rt_build_flow_key(&fl4, sk, skb);
	__ip_rt_update_pmtu(rt, &fl4, mtu);
}
void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
		      int oif, u32 mark, u8 protocol, int flow_flags)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	if (!mark)
		mark = IP4_REPLY_MARK(net, skb->mark);

	__build_flow_key(net, &fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, flow_flags);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
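
/* A typical caller is a tunnel's ICMP error handler reacting to a
 * "fragmentation needed" message for an encapsulated packet; a sketch
 * modelled on net/ipv4/ipip.c (the exact arguments vary per caller):
 *
 *	if (type == ICMP_DEST_UNREACH && code == ICMP_FRAG_NEEDED) {
 *		ipv4_update_pmtu(skb, net, info, t->parms.link, 0,
 *				 iph->protocol, 0);
 *		goto out;
 *	}
 */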
static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

	if (!fl4.flowi4_mark)
		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

	rt = __ip_route_output_key(sock_net(sk), &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct dst_entry *odst = NULL;
	bool new = false;
	struct net *net = sock_net(sk);

	bh_lock_sock(sk);

	if (!ip_sk_accept_pmtu(sk))
		goto out;

	odst = sk_dst_get(sk);

	if (sock_owned_by_user(sk) || !odst) {
		__ipv4_sk_update_pmtu(skb, sk, mtu);
		goto out;
	}

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

	rt = (struct rtable *)odst;
	if (odst->obsolete && !odst->ops->check(odst, 0)) {
		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	__ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);

	if (!dst_check(&rt->dst, 0)) {
		if (new)
			dst_release(&rt->dst);

		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	if (new)
		sk_dst_set(sk, &rt->dst);

out:
	bh_unlock_sock(sk);
	dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
void ipv4_redirect(struct sk_buff *skb, struct net *net,
		   int oif, u32 mark, u8 protocol, int flow_flags)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(net, &fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, flow_flags);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_redirect);
void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct net *net = sock_net(sk);

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	/* All IPV4 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 *
	 * When a PMTU/redirect information update invalidates a route,
	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
	 * DST_OBSOLETE_DEAD by dst_free().
	 */
	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
		return NULL;
	return dst;
}
static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
	struct ip_options opt;
	int res;

	/* Recompile ip options since IPCB may not be valid anymore.
	 * Also check we have a reasonable ipv4 header.
	 */
	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
	    ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
		return;

	memset(&opt, 0, sizeof(opt));
	if (ip_hdr(skb)->ihl > 5) {
		if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
			return;
		opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);

		rcu_read_lock();
		res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
		rcu_read_unlock();

		if (res)
			return;
	}
	__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
}
static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	ipv4_send_dest_unreach(skb);

	rt = skb_rtable(skb);
	if (rt)
		dst_set_expires(&rt->dst, 0);
}

static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	pr_debug("%s: %pI4 -> %pI4, %s\n",
		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		 skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}
/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by IP RR, TS and SRR options,
 * so it is out of the fast path.
 *
 * BTW remember: "addr" is allowed to be not aligned
 * in IP options!
 */
void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct flowi4 fl4;
		struct iphdr *iph;

		iph = ip_hdr(skb);

		memset(&fl4, 0, sizeof(fl4));
		fl4.daddr = iph->daddr;
		fl4.saddr = iph->saddr;
		fl4.flowi4_tos = RT_TOS(iph->tos);
		fl4.flowi4_oif = rt->dst.dev->ifindex;
		fl4.flowi4_iif = skb->dev->ifindex;
		fl4.flowi4_mark = skb->mark;

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
			src = FIB_RES_PREFSRC(dev_net(rt->dst.dev), res);
		else
			src = inet_select_addr(rt->dst.dev,
					       rt_nexthop(rt, iph->daddr),
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}
#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif
static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
				    ip_rt_min_advmss);

	return min(advmss, IPV4_MAX_PMTU - header_size);
}
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = rt->rt_pmtu;

	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
		mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu)
		return mtu;

	mtu = READ_ONCE(dst->dev->mtu);

	if (unlikely(ip_mtu_locked(dst))) {
		if (rt->rt_uses_gateway && mtu > 576)
			mtu = 576;
	}

	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference_protected(nh->nh_exceptions,
					 lockdep_is_held(&fnhe_lock));
	hash += hval;

	fnhe_p = &hash->chain;
	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
	while (fnhe) {
		if (fnhe->fnhe_daddr == daddr) {
			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
			/* set fnhe_daddr to 0 to ensure it won't bind with
			 * new dsts in rt_bind_exception().
			 */
			fnhe->fnhe_daddr = 0;
			fnhe_flush_routes(fnhe);
			kfree_rcu(fnhe, rcu);
			break;
		}
		fnhe_p = &fnhe->fnhe_next;
		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
						 lockdep_is_held(&fnhe_lock));
	}

	spin_unlock_bh(&fnhe_lock);
}
static struct fib_nh_exception *find_exception(struct fib_nh *nh, __be32 daddr)
{
	struct fnhe_hash_bucket *hash = rcu_dereference(nh->nh_exceptions);
	struct fib_nh_exception *fnhe;
	u32 hval;

	if (!hash)
		return NULL;

	hval = fnhe_hashfun(daddr);

	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr) {
			if (fnhe->fnhe_expires &&
			    time_after(jiffies, fnhe->fnhe_expires)) {
				ip_del_fnhe(nh, daddr);
				break;
			}
			return fnhe;
		}
	}
	return NULL;
}
/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 */
u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
{
	struct fib_info *fi = res->fi;
	struct fib_nh *nh = &fi->fib_nh[res->nh_sel];
	struct net_device *dev = nh->nh_dev;
	u32 mtu = 0;

	if (dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu ||
	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
		mtu = fi->fib_mtu;

	if (likely(!mtu)) {
		struct fib_nh_exception *fnhe;

		fnhe = find_exception(nh, daddr);
		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
			mtu = fnhe->fnhe_pmtu;
	}

	if (likely(!mtu))
		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);

	return mtu - lwtunnel_headroom(nh->nh_lwtstate, mtu);
}
static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
			      __be32 daddr, const bool do_cache)
{
	bool ret = false;

	spin_lock_bh(&fnhe_lock);

	if (daddr == fnhe->fnhe_daddr) {
		struct rtable __rcu **porig;
		struct rtable *orig;
		int genid = fnhe_genid(dev_net(rt->dst.dev));

		if (rt_is_input_route(rt))
			porig = &fnhe->fnhe_rth_input;
		else
			porig = &fnhe->fnhe_rth_output;
		orig = rcu_dereference(*porig);

		if (fnhe->fnhe_genid != genid) {
			fnhe->fnhe_genid = genid;
			fnhe->fnhe_gw = 0;
			fnhe->fnhe_pmtu = 0;
			fnhe->fnhe_expires = 0;
			fnhe->fnhe_mtu_locked = false;
			fnhe_flush_routes(fnhe);
			orig = NULL;
		}
		fill_route_from_fnhe(rt, fnhe);
		if (!rt->rt_gateway)
			rt->rt_gateway = daddr;

		if (do_cache) {
			dst_hold(&rt->dst);
			rcu_assign_pointer(*porig, rt);
			if (orig) {
				dst_dev_put(&orig->dst);
				dst_release(&orig->dst);
			}
			ret = true;
		}

		fnhe->fnhe_stamp = jiffies;
	}
	spin_unlock_bh(&fnhe_lock);

	return ret;
}
static bool rt_cache_route(struct fib_nh *nh, struct rtable *rt)
{
	struct rtable *orig, *prev, **p;
	bool ret = true;

	if (rt_is_input_route(rt)) {
		p = (struct rtable **)&nh->nh_rth_input;
	} else {
		p = (struct rtable **)raw_cpu_ptr(nh->nh_pcpu_rth_output);
	}
	orig = *p;

	/* hold dst before doing cmpxchg() to avoid race condition
	 * on this dst
	 */
	dst_hold(&rt->dst);
	prev = cmpxchg(p, orig, rt);
	if (prev == orig) {
		if (orig) {
			rt_add_uncached_list(orig);
			dst_release(&orig->dst);
		}
	} else {
		dst_release(&rt->dst);
		ret = false;
	}

	return ret;
}
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};

static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);

void rt_add_uncached_list(struct rtable *rt)
{
	struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);

	rt->rt_uncached_list = ul;

	spin_lock_bh(&ul->lock);
	list_add_tail(&rt->rt_uncached, &ul->head);
	spin_unlock_bh(&ul->lock);
}

void rt_del_uncached_list(struct rtable *rt)
{
	if (!list_empty(&rt->rt_uncached)) {
		struct uncached_list *ul = rt->rt_uncached_list;

		spin_lock_bh(&ul->lock);
		list_del(&rt->rt_uncached);
		spin_unlock_bh(&ul->lock);
	}
}
static void ipv4_dst_destroy(struct dst_entry *dst)
{
	struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
	struct rtable *rt = (struct rtable *)dst;

	if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
		kfree(p);

	rt_del_uncached_list(rt);
}
void rt_flush_dev(struct net_device *dev)
{
	struct net *net = dev_net(dev);
	struct rtable *rt;
	int cpu;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		spin_lock_bh(&ul->lock);
		list_for_each_entry(rt, &ul->head, rt_uncached) {
			if (rt->dst.dev != dev)
				continue;
			rt->dst.dev = net->loopback_dev;
			dev_hold(rt->dst.dev);
			dev_put(dev);
		}
		spin_unlock_bh(&ul->lock);
	}
}
static bool rt_cache_valid(const struct rtable *rt)
{
	return	rt &&
		rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
		!rt_is_expired(rt);
}
static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
			   const struct fib_result *res,
			   struct fib_nh_exception *fnhe,
			   struct fib_info *fi, u16 type, u32 itag,
			   const bool do_cache)
{
	bool cached = false;

	if (fi) {
		struct fib_nh *nh = &FIB_RES_NH(*res);

		if (nh->nh_gw && nh->nh_scope == RT_SCOPE_LINK) {
			rt->rt_gateway = nh->nh_gw;
			rt->rt_uses_gateway = 1;
		}
		dst_init_metrics(&rt->dst, fi->fib_metrics->metrics, true);
		if (fi->fib_metrics != &dst_default_metrics) {
			rt->dst._metrics |= DST_METRICS_REFCOUNTED;
			refcount_inc(&fi->fib_metrics->refcnt);
		}
#ifdef CONFIG_IP_ROUTE_CLASSID
		rt->dst.tclassid = nh->nh_tclassid;
#endif
		rt->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
		if (unlikely(fnhe))
			cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
		else if (do_cache)
			cached = rt_cache_route(nh, rt);
		if (unlikely(!cached)) {
			/* Routes we intend to cache in nexthop exception or
			 * FIB nexthop have the DST_NOCACHE bit clear.
			 * However, if we are unsuccessful at storing this
			 * route into the cache we really need to set it.
			 */
			if (!rt->rt_gateway)
				rt->rt_gateway = daddr;
			rt_add_uncached_list(rt);
		}
	} else
		rt_add_uncached_list(rt);

#ifdef CONFIG_IP_ROUTE_CLASSID
#ifdef CONFIG_IP_MULTIPLE_TABLES
	set_class_tag(rt, res->tclassid);
#endif
	set_class_tag(rt, itag);
#endif
}
struct rtable *rt_dst_alloc(struct net_device *dev,
			    unsigned int flags, u16 type,
			    bool nopolicy, bool noxfrm, bool will_cache)
{
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
		       (will_cache ? 0 : DST_HOST) |
		       (nopolicy ? DST_NOPOLICY : 0) |
		       (noxfrm ? DST_NOXFRM : 0));

	if (rt) {
		rt->rt_genid = rt_genid_ipv4(dev_net(dev));
		rt->rt_flags = flags;
		rt->rt_type = type;
		rt->rt_is_input = 0;
		rt->rt_iif = 0;
		rt->rt_pmtu = 0;
		rt->rt_mtu_locked = 0;
		rt->rt_gateway = 0;
		rt->rt_uses_gateway = 0;
		INIT_LIST_HEAD(&rt->rt_uncached);

		rt->dst.output = ip_output;
		if (flags & RTCF_LOCAL)
			rt->dst.input = ip_local_deliver;
	}

	return rt;
}
EXPORT_SYMBOL(rt_dst_alloc);
/* called in rcu_read_lock() section */
int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			  u8 tos, struct net_device *dev,
			  struct in_device *in_dev, u32 *itag)
{
	int err;

	/* Primary sanity checks. */
	if (!in_dev)
		return -EINVAL;

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
	    skb->protocol != htons(ETH_P_IP))
		return -EINVAL;

	if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
		return -EINVAL;

	if (ipv4_is_zeronet(saddr)) {
		if (!ipv4_is_local_multicast(daddr))
			return -EINVAL;
	} else {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, itag);
		if (err < 0)
			return err;
	}
	return 0;
}
/* called in rcu_read_lock() section */
static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			     u8 tos, struct net_device *dev, int our)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	unsigned int flags = RTCF_MULTICAST;
	struct rtable *rth;
	u32 itag = 0;
	int err;

	err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
	if (err)
		return err;

	if (our)
		flags |= RTCF_LOCAL;

	rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
	if (!rth)
		return -ENOBUFS;

#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->dst.output = ip_rt_bug;
	rth->rt_is_input = 1;

#ifdef CONFIG_IP_MROUTE
	if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
		rth->dst.input = ip_mr_input;
#endif
	RT_CACHE_STAT_INC(in_slow_mc);

	skb_dst_set(skb, &rth->dst);
	return 0;
}
static void ip_handle_martian_source(struct net_device *dev,
				     struct in_device *in_dev,
				     struct sk_buff *skb,
				     __be32 daddr,
				     __be32 saddr)
{
	RT_CACHE_STAT_INC(in_martian_src);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/*
		 *	Per the RFC1812 recommendation: if the source is
		 *	martian, the only hint is the MAC header.
		 */
		pr_warn("martian source %pI4 from %pI4, on dev %s\n",
			&daddr, &saddr, dev->name);
		if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
			print_hex_dump(KERN_WARNING, "ll header: ",
				       DUMP_PREFIX_OFFSET, 16, 1,
				       skb_mac_header(skb),
				       dev->hard_header_len, true);
		}
	}
#endif
}
/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
			   const struct fib_result *res,
			   struct in_device *in_dev,
			   __be32 daddr, __be32 saddr, u32 tos)
{
	struct fib_nh_exception *fnhe;
	struct rtable *rth;
	int err;
	struct in_device *out_dev;
	bool do_cache;
	u32 itag = 0;

	/* get a working reference to the output device */
	out_dev = __in_dev_get_rcu(FIB_RES_DEV(*res));
	if (!out_dev) {
		net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
		return -EINVAL;
	}

	err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
				  in_dev->dev, in_dev, &itag);
	if (err < 0) {
		ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
					 saddr);

		goto cleanup;
	}

	do_cache = res->fi && !itag;
	if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
	    skb->protocol == htons(ETH_P_IP) &&
	    (IN_DEV_SHARED_MEDIA(out_dev) ||
	     inet_addr_onlink(out_dev, saddr, FIB_RES_GW(*res))))
		IPCB(skb)->flags |= IPSKB_DOREDIRECT;

	if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create a route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back on the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
		if (out_dev == in_dev &&
		    IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
			err = -EINVAL;
			goto cleanup;
		}
	}

	fnhe = find_exception(&FIB_RES_NH(*res), daddr);
	if (do_cache) {
		if (fnhe)
			rth = rcu_dereference(fnhe->fnhe_rth_input);
		else
			rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			goto out;
		}
	}

	rth = rt_dst_alloc(out_dev->dev, 0, res->type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
	if (!rth) {
		err = -ENOBUFS;
		goto cleanup;
	}

	rth->rt_is_input = 1;
	RT_CACHE_STAT_INC(in_slow_tot);

	rth->dst.input = ip_forward;

	rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
		       do_cache);
	lwtunnel_set_redirect(&rth->dst);
	skb_dst_set(skb, &rth->dst);
out:
	err = 0;
cleanup:
	return err;
}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
/* To make ICMP packets follow the right flow, the multipath hash is
 * calculated from the inner IP addresses.
 */
static void ip_multipath_l3_keys(const struct sk_buff *skb,
				 struct flow_keys *hash_keys)
{
	const struct iphdr *outer_iph = ip_hdr(skb);
	const struct iphdr *key_iph = outer_iph;
	const struct iphdr *inner_iph;
	const struct icmphdr *icmph;
	struct iphdr _inner_iph;
	struct icmphdr _icmph;

	if (likely(outer_iph->protocol != IPPROTO_ICMP))
		goto out;

	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
		goto out;

	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
				   &_icmph);
	if (!icmph)
		goto out;

	if (icmph->type != ICMP_DEST_UNREACH &&
	    icmph->type != ICMP_REDIRECT &&
	    icmph->type != ICMP_TIME_EXCEEDED &&
	    icmph->type != ICMP_PARAMETERPROB)
		goto out;

	inner_iph = skb_header_pointer(skb,
				       outer_iph->ihl * 4 + sizeof(_icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
out:
	hash_keys->addrs.v4addrs.src = key_iph->saddr;
	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
}

/* if skb is set it will be used and fl4 can be NULL */
int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
		       const struct sk_buff *skb, struct flow_keys *flkeys)
{
	struct flow_keys hash_keys;
	u32 mhash;

	switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
	case 0:
		memset(&hash_keys, 0, sizeof(hash_keys));
		hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
		if (skb) {
			ip_multipath_l3_keys(skb, &hash_keys);
		} else {
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
		}
		break;
	case 1:
		/* skb is currently provided only when forwarding */
		if (skb) {
			unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
			struct flow_keys keys;

			/* short-circuit if we already have L4 hash present */
			if (skb->l4_hash)
				return skb_get_hash_raw(skb) >> 1;

			memset(&hash_keys, 0, sizeof(hash_keys));

			if (!flkeys) {
				skb_flow_dissect_flow_keys(skb, &keys, flag);
				flkeys = &keys;
			}

			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
			hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
			hash_keys.ports.src = flkeys->ports.src;
			hash_keys.ports.dst = flkeys->ports.dst;
			hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
		} else {
			memset(&hash_keys, 0, sizeof(hash_keys));
			hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
			hash_keys.addrs.v4addrs.src = fl4->saddr;
			hash_keys.addrs.v4addrs.dst = fl4->daddr;
			hash_keys.ports.src = fl4->fl4_sport;
			hash_keys.ports.dst = fl4->fl4_dport;
			hash_keys.basic.ip_proto = fl4->flowi4_proto;
		}
		break;
	}
	mhash = flow_hash_from_keys(&hash_keys);

	return mhash >> 1;
}
#endif /* CONFIG_IP_ROUTE_MULTIPATH */
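
/* The policy above is selected per namespace with the
 * net.ipv4.fib_multipath_hash_policy sysctl: 0 hashes on the L3
 * addresses only (with the ICMP inner-header fixup from
 * ip_multipath_l3_keys()), 1 also mixes in the L4 ports and protocol:
 *
 *	# sysctl -w net.ipv4.fib_multipath_hash_policy=1
 */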
static int ip_mkroute_input(struct sk_buff *skb,
			    struct fib_result *res,
			    struct in_device *in_dev,
			    __be32 daddr, __be32 saddr, u32 tos,
			    struct flow_keys *hkeys)
{
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi && res->fi->fib_nhs > 1) {
		int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);

		fib_select_multipath(res, h);
	}
#endif

	/* create a routing cache entry */
	return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
}
/*
 *	NOTE. We drop all the packets that have local source
 *	addresses, because every properly looped back packet
 *	must have the correct destination already attached by the
 *	output routine.
 *
 *	Such an approach solves two big problems:
 *	1. Non-simplex devices are handled properly.
 *	2. IP spoofing attempts are filtered with a 100% guarantee.
 *
 *	called with rcu_read_lock()
 */
static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			       u8 tos, struct net_device *dev,
			       struct fib_result *res)
{
	struct in_device *in_dev = __in_dev_get_rcu(dev);
	struct flow_keys *flkeys = NULL, _flkeys;
	struct net *net = dev_net(dev);
	struct ip_tunnel_info *tun_info;
	int err = -EINVAL;
	unsigned int flags = 0;
	u32 itag = 0;
	struct rtable *rth;
	struct flowi4 fl4;
	bool do_cache = true;

	/* IP on this device is disabled. */
	if (!in_dev)
		goto out;

	/* Check for the most weird martians, which can be not detected
	 * by fib_lookup.
	 */
	tun_info = skb_tunnel_info(skb);
	if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
		fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
	else
		fl4.flowi4_tun_key.tun_id = 0;
	skb_dst_drop(skb);

	if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
		goto martian_source;

	res->fi = NULL;
	res->table = NULL;
	if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
		goto brd_input;

	/* Accept zero addresses only to limited broadcast;
	 * I even do not know whether to fix it or not. Waiting for complaints :-)
	 */
	if (ipv4_is_zeronet(saddr))
		goto martian_source;

	if (ipv4_is_zeronet(daddr))
		goto martian_destination;

	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
	 * calling it at most once if daddr and/or saddr are loopback addresses.
	 */
	if (ipv4_is_loopback(daddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_destination;
	} else if (ipv4_is_loopback(saddr)) {
		if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
			goto martian_source;
	}

	/*
	 *	Now we are ready to route packet.
	 */
	fl4.flowi4_oif = 0;
	fl4.flowi4_iif = dev->ifindex;
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_tos = tos;
	fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
	fl4.flowi4_flags = 0;
	fl4.daddr = daddr;
	fl4.saddr = saddr;
	fl4.flowi4_uid = sock_net_uid(net, NULL);

	if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
		flkeys = &_flkeys;
	} else {
		fl4.flowi4_proto = 0;
		fl4.fl4_sport = 0;
		fl4.fl4_dport = 0;
	}

	err = fib_lookup(net, &fl4, res, 0);
	if (err != 0) {
		if (!IN_DEV_FORWARD(in_dev))
			err = -EHOSTUNREACH;
		goto no_route;
	}

	if (res->type == RTN_BROADCAST) {
		if (IN_DEV_BFORWARD(in_dev))
			goto make_route;
		/* do not cache if bc_forwarding is enabled */
		if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
			do_cache = false;
		goto brd_input;
	}

	if (res->type == RTN_LOCAL) {
		err = fib_validate_source(skb, saddr, daddr, tos,
					  0, dev, in_dev, &itag);
		if (err < 0)
			goto martian_source;
		goto local_input;
	}

	if (!IN_DEV_FORWARD(in_dev)) {
		err = -EHOSTUNREACH;
		goto no_route;
	}
	if (res->type != RTN_UNICAST)
		goto martian_destination;

make_route:
	err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
out:	return err;

brd_input:
	if (skb->protocol != htons(ETH_P_IP))
		goto e_inval;

	if (!ipv4_is_zeronet(saddr)) {
		err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
					  in_dev, &itag);
		if (err < 0)
			goto martian_source;
	}
	flags |= RTCF_BROADCAST;
	res->type = RTN_BROADCAST;
	RT_CACHE_STAT_INC(in_brd);

local_input:
	do_cache &= res->fi && !itag;
	if (do_cache) {
		rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
		if (rt_cache_valid(rth)) {
			skb_dst_set_noref(skb, &rth->dst);
			err = 0;
			goto out;
		}
	}

	rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
			   flags | RTCF_LOCAL, res->type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
	if (!rth)
		goto e_nobufs;

	rth->dst.output = ip_rt_bug;
#ifdef CONFIG_IP_ROUTE_CLASSID
	rth->dst.tclassid = itag;
#endif
	rth->rt_is_input = 1;

	RT_CACHE_STAT_INC(in_slow_tot);
	if (res->type == RTN_UNREACHABLE) {
		rth->dst.input = ip_error;
		rth->dst.error = -err;
		rth->rt_flags &= ~RTCF_LOCAL;
	}

	if (do_cache) {
		struct fib_nh *nh = &FIB_RES_NH(*res);

		rth->dst.lwtstate = lwtstate_get(nh->nh_lwtstate);
		if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
			WARN_ON(rth->dst.input == lwtunnel_input);
			rth->dst.lwtstate->orig_input = rth->dst.input;
			rth->dst.input = lwtunnel_input;
		}

		if (unlikely(!rt_cache_route(nh, rth)))
			rt_add_uncached_list(rth);
	}
	skb_dst_set(skb, &rth->dst);
	err = 0;
	goto out;

no_route:
	RT_CACHE_STAT_INC(in_no_route);
	res->type = RTN_UNREACHABLE;
	res->fi = NULL;
	res->table = NULL;
	goto local_input;

	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
martian_destination:
	RT_CACHE_STAT_INC(in_martian_dst);
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev))
		net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
				     &daddr, &saddr, dev->name);
#endif

e_inval:
	err = -EINVAL;
	goto out;

e_nobufs:
	err = -ENOBUFS;
	goto out;

martian_source:
	ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
	goto out;
}
int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
			 u8 tos, struct net_device *dev)
{
	struct fib_result res;
	int err;

	tos &= IPTOS_RT_MASK;
	rcu_read_lock();
	err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
	rcu_read_unlock();

	return err;
}
EXPORT_SYMBOL(ip_route_input_noref);
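
/* This is the entry point used by the receive path; ip_rcv_finish()
 * in net/ipv4/ip_input.c calls it roughly as:
 *
 *	err = ip_route_input_noref(skb, iph->daddr, iph->saddr,
 *				   iph->tos, dev);
 *
 * and drops the packet on error.
 */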
/* called with rcu_read_lock held */
int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
		       u8 tos, struct net_device *dev, struct fib_result *res)
{
	/* Multicast recognition logic is moved from the route cache to here.
	 * The problem was that too many Ethernet cards have broken/missing
	 * hardware multicast filters :-( As a result, a host on a multicasting
	 * network acquires a lot of useless route cache entries, sort of
	 * SDR messages from all over the world. Now we try to get rid of them.
	 * Really, provided the software IP multicast filter is organized
	 * reasonably (at least, hashed), it does not result in a slowdown
	 * compared with route cache reject entries.
	 * Note that multicast routers are not affected, because a
	 * route cache entry is created eventually.
	 */
	if (ipv4_is_multicast(daddr)) {
		struct in_device *in_dev = __in_dev_get_rcu(dev);
		int our = 0;
		int err = -EINVAL;

		if (!in_dev)
			return err;
		our = ip_check_mc_rcu(in_dev, daddr, saddr,
				      ip_hdr(skb)->protocol);

		/* check l3 master if no match yet */
		if (!our && netif_is_l3_slave(dev)) {
			struct in_device *l3_in_dev;

			l3_in_dev = __in_dev_get_rcu(skb->dev);
			if (l3_in_dev)
				our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
						      ip_hdr(skb)->protocol);
		}

		if (our
#ifdef CONFIG_IP_MROUTE
			||
		    (!ipv4_is_local_multicast(daddr) &&
		     IN_DEV_MFORWARD(in_dev))
#endif
		   ) {
			err = ip_route_input_mc(skb, daddr, saddr,
						tos, dev, our);
		}
		return err;
	}

	return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
}
/* called with rcu_read_lock() */
static struct rtable *__mkroute_output(const struct fib_result *res,
				       const struct flowi4 *fl4, int orig_oif,
				       struct net_device *dev_out,
				       unsigned int flags)
{
	struct fib_info *fi = res->fi;
	struct fib_nh_exception *fnhe;
	struct in_device *in_dev;
	u16 type = res->type;
	struct rtable *rth;
	bool do_cache;

	in_dev = __in_dev_get_rcu(dev_out);
	if (!in_dev)
		return ERR_PTR(-EINVAL);

	if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
		if (ipv4_is_loopback(fl4->saddr) &&
		    !(dev_out->flags & IFF_LOOPBACK) &&
		    !netif_is_l3_master(dev_out))
			return ERR_PTR(-EINVAL);

	if (ipv4_is_lbcast(fl4->daddr))
		type = RTN_BROADCAST;
	else if (ipv4_is_multicast(fl4->daddr))
		type = RTN_MULTICAST;
	else if (ipv4_is_zeronet(fl4->daddr))
		return ERR_PTR(-EINVAL);

	if (dev_out->flags & IFF_LOOPBACK)
		flags |= RTCF_LOCAL;

	do_cache = true;
	if (type == RTN_BROADCAST) {
		flags |= RTCF_BROADCAST | RTCF_LOCAL;
		fi = NULL;
	} else if (type == RTN_MULTICAST) {
		flags |= RTCF_MULTICAST | RTCF_LOCAL;
		if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
				     fl4->flowi4_proto))
			flags &= ~RTCF_LOCAL;
		else
			do_cache = false;
		/* If the multicast route does not exist, use the
		 * default one, but do not gateway in this case.
		 * Yes, it is a hack.
		 */
		if (fi && res->prefixlen < 4)
			fi = NULL;
	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
		   (orig_oif != dev_out->ifindex)) {
		/* For local routes that require a particular output interface
		 * we do not want to cache the result.  Caching the result
		 * causes incorrect behaviour when there are multiple source
		 * addresses on the interface, the end result being that if the
		 * intended recipient is waiting on that interface for the
		 * packet he won't receive it because it will be delivered on
		 * the loopback interface and the IP_PKTINFO ipi_ifindex will
		 * be set to the loopback interface as well.
		 */
		do_cache = false;
	}

	fnhe = NULL;
	do_cache &= fi != NULL;
	if (fi) {
		struct rtable __rcu **prth;
		struct fib_nh *nh = &FIB_RES_NH(*res);

		fnhe = find_exception(nh, fl4->daddr);
		if (!do_cache)
			goto add;
		if (fnhe) {
			prth = &fnhe->fnhe_rth_output;
		} else {
			if (unlikely(fl4->flowi4_flags &
				     FLOWI_FLAG_KNOWN_NH &&
				     !(nh->nh_gw &&
				       nh->nh_scope == RT_SCOPE_LINK))) {
				do_cache = false;
				goto add;
			}
			prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
		}
		rth = rcu_dereference(*prth);
		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
			return rth;
	}

add:
	rth = rt_dst_alloc(dev_out, flags, type,
			   IN_DEV_CONF_GET(in_dev, NOPOLICY),
			   IN_DEV_CONF_GET(in_dev, NOXFRM),
			   do_cache);
	if (!rth)
		return ERR_PTR(-ENOBUFS);

	rth->rt_iif = orig_oif;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
		}
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;
			}
		}
#endif
	}

	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
	lwtunnel_set_redirect(&rth->dst);

	return rth;
}
/*
 * Major route resolver routine.
 */

struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
					const struct sk_buff *skb)
{
	__u8 tos = RT_FL_TOS(fl4);
	struct fib_result res = {
		.type		= RTN_UNSPEC,
		.fi		= NULL,
		.table		= NULL,
		.tclassid	= 0,
	};
	struct rtable *rth;

	fl4->flowi4_iif = LOOPBACK_IFINDEX;
	fl4->flowi4_tos = tos & IPTOS_RT_MASK;
	fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
			     RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);

	rcu_read_lock();
	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
	rcu_read_unlock();

	return rth;
}
EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
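
/* A minimal output-lookup sketch, assuming the caller only cares about
 * the destination (ip_route_output_key() is the include/net/route.h
 * wrapper that reaches this resolver with a NULL skb):
 *
 *	struct flowi4 fl4 = { .daddr = dst_ip };
 *	struct rtable *rt = ip_route_output_key(net, &fl4);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	... use rt->dst.dev, rt_nexthop(rt, dst_ip), ... then:
 *	ip_rt_put(rt);
 */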
2404 struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
2405 struct fib_result *res,
2406 const struct sk_buff *skb)
2408 struct net_device *dev_out = NULL;
2409 int orig_oif = fl4->flowi4_oif;
2410 unsigned int flags = 0;
2415 if (ipv4_is_multicast(fl4->saddr) ||
2416 ipv4_is_lbcast(fl4->saddr) ||
2417 ipv4_is_zeronet(fl4->saddr)) {
2418 rth = ERR_PTR(-EINVAL);
2422 rth = ERR_PTR(-ENETUNREACH);
2424 /* I removed check for oif == dev_out->oif here.
2425 It was wrong for two reasons:
2426 1. ip_dev_find(net, saddr) can return wrong iface, if saddr
2427 is assigned to multiple interfaces.
2428 2. Moreover, we are allowed to send packets with saddr
2429 of another iface. --ANK
2432 if (fl4->flowi4_oif == 0 &&
2433 (ipv4_is_multicast(fl4->daddr) ||
2434 ipv4_is_lbcast(fl4->daddr))) {
2435 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2436 dev_out = __ip_dev_find(net, fl4->saddr, false);
2440 /* Special hack: user can direct multicasts
2441 and limited broadcast via necessary interface
2442 without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
2443 This hack is not just for fun, it allows
2444 vic,vat and friends to work.
2445 They bind socket to loopback, set ttl to zero
2446 and expect that it will work.
2447 From the viewpoint of routing cache they are broken,
2448 because we are not allowed to build multicast path
2449 with loopback source addr (look, routing cache
2450 cannot know, that ttl is zero, so that packet
2451 will not leave this host and route is valid).
2452 Luckily, this hack is good workaround.
2455 fl4->flowi4_oif = dev_out->ifindex;
2459 if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
2460 /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
2461 if (!__ip_dev_find(net, fl4->saddr, false))
2467 if (fl4->flowi4_oif) {
2468 dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
2469 rth = ERR_PTR(-ENODEV);
2473 /* RACE: Check return value of inet_select_addr instead. */
2474 if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
2475 rth = ERR_PTR(-ENETUNREACH);
2478 if (ipv4_is_local_multicast(fl4->daddr) ||
2479 ipv4_is_lbcast(fl4->daddr) ||
2480 fl4->flowi4_proto == IPPROTO_IGMP) {
2482 fl4->saddr = inet_select_addr(dev_out, 0,
2487 if (ipv4_is_multicast(fl4->daddr))
2488 fl4->saddr = inet_select_addr(dev_out, 0,
2490 else if (!fl4->daddr)
2491 fl4->saddr = inet_select_addr(dev_out, 0,
2497 fl4->daddr = fl4->saddr;
2499 fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
2500 dev_out = net->loopback_dev;
2501 fl4->flowi4_oif = LOOPBACK_IFINDEX;
2502 res->type = RTN_LOCAL;
2503 flags |= RTCF_LOCAL;
2507 err = fib_lookup(net, fl4, res, 0);
2508 if (err) {
2509 res->fi = NULL;
2510 res->table = NULL;
2511 if (fl4->flowi4_oif &&
2512 (ipv4_is_multicast(fl4->daddr) ||
2513 !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
2514 /* Apparently, routing tables are wrong. Assume
2515 that the destination is on link.
2516
2517 WHY? DW.
2518 Because we are allowed to send to an iface
2519 even if it has NO routes and NO assigned
2520 addresses. When oif is specified, routing
2521 tables are looked up with only one purpose:
2522 to catch if the destination is gatewayed rather than
2523 direct. Moreover, if MSG_DONTROUTE is set,
2524 we send the packet, ignoring both routing tables
2525 and ifaddr state. --ANK
2526
2527 We could make this work even if oif is unknown,
2528 as is likely with IPv6, but we do not
2529 (a userspace sketch of MSG_DONTROUTE follows below).
2530 */
2532 if (fl4->saddr == 0)
2533 fl4->saddr = inet_select_addr(dev_out, 0,
2534 RT_SCOPE_LINK);
2535 res->type = RTN_UNICAST;
2536 goto make_route;
2537 }
2538 rth = ERR_PTR(err);
2539 goto out;
2540 }
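/*
 * Example (userspace sketch): the MSG_DONTROUTE behaviour the comment
 * above refers to; the send bypasses the routing tables and goes out
 * the chosen interface as if the destination were on link:
 *
 *	setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, "eth0", sizeof("eth0"));
 *	sendto(fd, buf, len, MSG_DONTROUTE,
 *	       (struct sockaddr *)&dst, sizeof(dst));
 */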
2542 if (res->type == RTN_LOCAL) {
2543 if (!fl4->saddr) {
2544 if (res->fi->fib_prefsrc)
2545 fl4->saddr = res->fi->fib_prefsrc;
2546 else
2547 fl4->saddr = fl4->daddr;
2548 }
2549
2550 /* L3 master device is the loopback for that domain */
2551 dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
2552 net->loopback_dev;
2553
2554 /* make sure orig_oif points to fib result device even
2555 * though packet rx/tx happens over loopback or l3mdev
2556 */
2557 orig_oif = FIB_RES_OIF(*res);
2558
2559 fl4->flowi4_oif = dev_out->ifindex;
2560 flags |= RTCF_LOCAL;
2561 goto make_route;
2562 }
2564 fib_select_path(net, res, fl4, skb);
2566 dev_out = FIB_RES_DEV(*res);
2567 fl4->flowi4_oif = dev_out->ifindex;
2568 make_route:
2569 rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
2570
2571 out:
2572 return rth;
2573 }
2575 static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
2576 {
2577 return NULL;
2578 }
2580 static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
2581 {
2582 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
2583
2584 return mtu ? : dst->dev->mtu;
2585 }
2587 static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
2588 struct sk_buff *skb, u32 mtu,
2589 bool confirm_neigh)
2590 {
2591 }
2593 static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
2594 struct sk_buff *skb)
2595 {
2596 }
2598 static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
2599 unsigned long old)
2600 {
2601 return NULL;
2602 }
2604 static struct dst_ops ipv4_dst_blackhole_ops = {
2605 .family = AF_INET,
2606 .check = ipv4_blackhole_dst_check,
2607 .mtu = ipv4_blackhole_mtu,
2608 .default_advmss = ipv4_default_advmss,
2609 .update_pmtu = ipv4_rt_blackhole_update_pmtu,
2610 .redirect = ipv4_rt_blackhole_redirect,
2611 .cow_metrics = ipv4_rt_blackhole_cow_metrics,
2612 .neigh_lookup = ipv4_neigh_lookup,
2613 };
2615 struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2616 {
2617 struct rtable *ort = (struct rtable *) dst_orig;
2618 struct rtable *rt;
2619
2620 rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
2621 if (rt) {
2622 struct dst_entry *new = &rt->dst;
2623
2624 new->__use = 1;
2625 new->input = dst_discard;
2626 new->output = dst_discard_out;
2627
2628 new->dev = net->loopback_dev;
2629 if (new->dev)
2630 dev_hold(new->dev);
2631
2632 rt->rt_is_input = ort->rt_is_input;
2633 rt->rt_iif = ort->rt_iif;
2634 rt->rt_pmtu = ort->rt_pmtu;
2635 rt->rt_mtu_locked = ort->rt_mtu_locked;
2636
2637 rt->rt_genid = rt_genid_ipv4(net);
2638 rt->rt_flags = ort->rt_flags;
2639 rt->rt_type = ort->rt_type;
2640 rt->rt_gateway = ort->rt_gateway;
2641 rt->rt_uses_gateway = ort->rt_uses_gateway;
2642
2643 INIT_LIST_HEAD(&rt->rt_uncached);
2644 }
2645
2646 dst_release(dst_orig);
2647
2648 return rt ? &rt->dst : ERR_PTR(-ENOMEM);
2649 }
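/*
 * Example (sketch): how a caller such as the xfrm layer can swap a
 * packet's real route for a blackhole copy. The flow metadata is
 * preserved, but everything sent over the new dst is silently discarded
 * by dst_discard()/dst_discard_out() above. The helper name
 * example_blackhole() is illustrative only.
 */
static int __maybe_unused example_blackhole(struct net *net,
					    struct sk_buff *skb)
{
	struct dst_entry *dst = skb_dst(skb);

	dst_hold(dst);	/* ipv4_blackhole_route() consumes one reference */
	dst = ipv4_blackhole_route(net, dst);
	if (IS_ERR(dst))
		return PTR_ERR(dst);

	skb_dst_drop(skb);	/* release the original route */
	skb_dst_set(skb, dst);	/* further output on this skb is dropped */
	return 0;
}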
2651 struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
2652 const struct sock *sk)
2653 {
2654 struct rtable *rt = __ip_route_output_key(net, flp4);
2655
2656 if (IS_ERR(rt))
2657 return rt;
2658
2659 if (flp4->flowi4_proto) {
2660 flp4->flowi4_oif = rt->dst.dev->ifindex;
2661 rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
2662 flowi4_to_flowi(flp4),
2663 sk, 0);
2664 }
2665
2666 return rt;
2667 }
2668 EXPORT_SYMBOL_GPL(ip_route_output_flow);
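/*
 * Example (sketch): the typical socket-level call pattern, roughly as a
 * datagram protocol uses it. flowi4_init_output() fills the flow key
 * from socket state before the lookup, and the xfrm hook above applies
 * because flowi4_proto is set. example_sk_route() is an illustrative
 * name, not an existing helper.
 */
static struct rtable * __maybe_unused example_sk_route(struct net *net,
						       struct sock *sk,
						       __be32 daddr,
						       __be16 dport)
{
	struct inet_sock *inet = inet_sk(sk);
	struct flowi4 fl4;

	flowi4_init_output(&fl4, sk->sk_bound_dev_if, sk->sk_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   sk->sk_protocol, inet_sk_flowi_flags(sk),
			   daddr, inet->inet_saddr, dport,
			   inet->inet_sport, sk->sk_uid);
	return ip_route_output_flow(net, &fl4, sk);
}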
2670 /* called with rcu_read_lock held */
2671 static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
2672 struct rtable *rt, u32 table_id, struct flowi4 *fl4,
2673 struct sk_buff *skb, u32 portid, u32 seq)
2674 {
2675 struct rtmsg *r;
2676 struct nlmsghdr *nlh;
2677 unsigned long expires = 0;
2678 u32 error;
2679 u32 metrics[RTAX_MAX];
2680
2681 nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), 0);
2682 if (!nlh)
2683 return -EMSGSIZE;
2685 r = nlmsg_data(nlh);
2686 r->rtm_family = AF_INET;
2687 r->rtm_dst_len = 32;
2688 r->rtm_src_len = 0;
2689 r->rtm_tos = fl4->flowi4_tos;
2690 r->rtm_table = table_id < 256 ? table_id : RT_TABLE_COMPAT;
2691 if (nla_put_u32(skb, RTA_TABLE, table_id))
2692 goto nla_put_failure;
2693 r->rtm_type = rt->rt_type;
2694 r->rtm_scope = RT_SCOPE_UNIVERSE;
2695 r->rtm_protocol = RTPROT_UNSPEC;
2696 r->rtm_flags = (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
2697 if (rt->rt_flags & RTCF_NOTIFY)
2698 r->rtm_flags |= RTM_F_NOTIFY;
2699 if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
2700 r->rtm_flags |= RTCF_DOREDIRECT;
2702 if (nla_put_in_addr(skb, RTA_DST, dst))
2703 goto nla_put_failure;
2704 if (src) {
2705 r->rtm_src_len = 32;
2706 if (nla_put_in_addr(skb, RTA_SRC, src))
2707 goto nla_put_failure;
2708 }
2709 if (rt->dst.dev &&
2710 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
2711 goto nla_put_failure;
2712 #ifdef CONFIG_IP_ROUTE_CLASSID
2713 if (rt->dst.tclassid &&
2714 nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
2715 goto nla_put_failure;
2716 #endif
2717 if (!rt_is_input_route(rt) &&
2718 fl4->saddr != src) {
2719 if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
2720 goto nla_put_failure;
2721 }
2722 if (rt->rt_uses_gateway &&
2723 nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gateway))
2724 goto nla_put_failure;
2726 expires = rt->dst.expires;
2727 if (expires) {
2728 unsigned long now = jiffies;
2729
2730 if (time_before(now, expires))
2731 expires -= now;
2732 else
2733 expires = 0;
2734 }
2736 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
2737 if (rt->rt_pmtu && expires)
2738 metrics[RTAX_MTU - 1] = rt->rt_pmtu;
2739 if (rt->rt_mtu_locked && expires)
2740 metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
2741 if (rtnetlink_put_metrics(skb, metrics) < 0)
2742 goto nla_put_failure;
2744 if (fl4->flowi4_mark &&
2745 nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
2746 goto nla_put_failure;
2748 if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
2749 nla_put_u32(skb, RTA_UID,
2750 from_kuid_munged(current_user_ns(), fl4->flowi4_uid)))
2751 goto nla_put_failure;
2753 error = rt->dst.error;
2755 if (rt_is_input_route(rt)) {
2756 #ifdef CONFIG_IP_MROUTE
2757 if (ipv4_is_multicast(dst) && !ipv4_is_local_multicast(dst) &&
2758 IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
2759 int err = ipmr_get_route(net, skb,
2760 fl4->saddr, fl4->daddr,
2761 r, portid);
2762
2763 if (err <= 0) {
2764 if (err == 0)
2765 return 0;
2766 goto nla_put_failure;
2767 }
2768 } else
2769 #endif
2770 if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
2771 goto nla_put_failure;
2772 }
2774 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
2775 goto nla_put_failure;
2777 nlmsg_end(skb, nlh);
2778 return 0;
2779
2780 nla_put_failure:
2781 nlmsg_cancel(skb, nlh);
2782 return -EMSGSIZE;
2783 }
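/*
 * Example (userspace sketch): the RTM_GETROUTE request that the reply
 * built above answers, i.e. what "ip route get" sends over a
 * NETLINK_ROUTE socket (nl_fd and the address are illustrative):
 *
 *	struct {
 *		struct nlmsghdr nlh;
 *		struct rtmsg rtm;
 *		struct rtattr rta;
 *		__be32 dst;
 *	} req = {
 *		.nlh.nlmsg_len   = sizeof(req),
 *		.nlh.nlmsg_type  = RTM_GETROUTE,
 *		.nlh.nlmsg_flags = NLM_F_REQUEST,
 *		.rtm.rtm_family  = AF_INET,
 *		.rta.rta_type    = RTA_DST,
 *		.rta.rta_len     = RTA_LENGTH(sizeof(__be32)),
 *	};
 *	inet_pton(AF_INET, "192.0.2.1", &req.dst);
 *	send(nl_fd, &req, sizeof(req), 0);
 *
 * The kernel unicasts back an RTM_NEWROUTE message laid out exactly as
 * rt_fill_info() writes it: a struct rtmsg followed by RTA_TABLE,
 * RTA_DST, RTA_OIF, metrics and cache-info attributes.
 */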
2785 static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
2786 u8 ip_proto, __be16 sport,
2787 __be16 dport)
2788 {
2789 struct sk_buff *skb;
2790 struct iphdr *iph;
2791
2792 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
2793 if (!skb)
2794 return NULL;
2796 /* Reserve room for dummy headers; this skb can pass
2797 * through a good chunk of the routing engine.
2798 */
2799 skb_reset_mac_header(skb);
2800 skb_reset_network_header(skb);
2801 skb->protocol = htons(ETH_P_IP);
2802 iph = skb_put(skb, sizeof(struct iphdr));
2803 iph->protocol = ip_proto;
2804 iph->saddr = src;
2805 iph->daddr = dst;
2806 iph->version = 0x4;
2807 iph->frag_off = 0;
2808 iph->ihl = 0x5;
2809 skb_set_transport_header(skb, skb->len);
2811 switch (iph->protocol) {
2812 case IPPROTO_UDP: {
2813 struct udphdr *udph;
2814
2815 udph = skb_put_zero(skb, sizeof(struct udphdr));
2816 udph->source = sport;
2817 udph->dest = dport;
2818 udph->len = htons(sizeof(struct udphdr));
2819 udph->check = 0;
2820 break;
2821 }
2822 case IPPROTO_TCP: {
2823 struct tcphdr *tcph;
2824
2825 tcph = skb_put_zero(skb, sizeof(struct tcphdr));
2826 tcph->source = sport;
2827 tcph->dest = dport;
2828 tcph->doff = sizeof(struct tcphdr) / 4;
2829 tcph->rst = 1;
2830 tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
2831 src, dst, 0);
2832 break;
2833 }
2834 case IPPROTO_ICMP: {
2835 struct icmphdr *icmph;
2837 icmph = skb_put_zero(skb, sizeof(struct icmphdr));
2838 icmph->type = ICMP_ECHO;
2839 icmph->code = 0;
2840 }
2841 }
2842
2843 return skb;
2844 }
2846 static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
2847 struct netlink_ext_ack *extack)
2848 {
2849 struct net *net = sock_net(in_skb->sk);
2850 struct nlattr *tb[RTA_MAX+1];
2851 u32 table_id = RT_TABLE_MAIN;
2852 __be16 sport = 0, dport = 0;
2853 struct fib_result res = {};
2854 u8 ip_proto = IPPROTO_UDP;
2855 struct rtable *rt = NULL;
2856 struct sk_buff *skb;
2857 struct rtmsg *rtm;
2858 struct flowi4 fl4;
2859 __be32 dst = 0;
2860 __be32 src = 0;
2861 kuid_t uid;
2862 u32 iif;
2863 int err;
2864 int mark;
2865
2866 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv4_policy,
2867 extack);
2868 if (err < 0)
2869 return err;
2870
2871 rtm = nlmsg_data(nlh);
2872 src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
2873 dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
2874 iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
2875 mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
2876 if (tb[RTA_UID])
2877 uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
2878 else
2879 uid = (iif ? INVALID_UID : current_uid());
2881 if (tb[RTA_IP_PROTO]) {
2882 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
2883 &ip_proto, AF_INET, extack);
2884 if (err)
2885 return err;
2886 }
2887
2888 if (tb[RTA_SPORT])
2889 sport = nla_get_be16(tb[RTA_SPORT]);
2890
2891 if (tb[RTA_DPORT])
2892 dport = nla_get_be16(tb[RTA_DPORT]);
2893
2894 skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
2895 if (!skb)
2896 return -ENOBUFS;
2897
2898 memset(&fl4, 0, sizeof(fl4));
2899 fl4.daddr = dst;
2900 fl4.saddr = src;
2901 fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
2902 fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
2903 fl4.flowi4_mark = mark;
2904 fl4.flowi4_uid = uid;
2905 if (sport)
2906 fl4.fl4_sport = sport;
2907 if (dport)
2908 fl4.fl4_dport = dport;
2909 fl4.flowi4_proto = ip_proto;
2910
2911 rcu_read_lock();
2912
2913 if (iif) {
2914 struct net_device *dev;
2915
2916 dev = dev_get_by_index_rcu(net, iif);
2917 if (!dev) {
2918 err = -ENODEV;
2919 goto errout_rcu;
2920 }
2921
2922 fl4.flowi4_iif = iif; /* for rt_fill_info */
2923 skb->dev = dev;
2924 skb->mark = mark;
2925 err = ip_route_input_rcu(skb, dst, src,
2926 rtm->rtm_tos & IPTOS_RT_MASK, dev,
2927 &res);
2928
2929 rt = skb_rtable(skb);
2930 if (err == 0 && rt->dst.error)
2931 err = -rt->dst.error;
2932 } else {
2933 fl4.flowi4_iif = LOOPBACK_IFINDEX;
2934 rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
2935 err = 0;
2936 if (IS_ERR(rt))
2937 err = PTR_ERR(rt);
2938 else
2939 skb_dst_set(skb, &rt->dst);
2940 }
2941
2942 if (err)
2943 goto errout_rcu;
2944
2945 if (rtm->rtm_flags & RTM_F_NOTIFY)
2946 rt->rt_flags |= RTCF_NOTIFY;
2948 if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
2949 table_id = res.table ? res.table->tb_id : 0;
2951 /* reset skb for netlink reply msg */
2952 skb_trim(skb, 0);
2953 skb_reset_network_header(skb);
2954 skb_reset_transport_header(skb);
2955 skb_reset_mac_header(skb);
2957 if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
2958 if (!res.fi) {
2959 err = fib_props[res.type].error;
2960 if (!err)
2961 err = -EHOSTUNREACH;
2962 goto errout_rcu;
2963 }
2964 err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
2965 nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
2966 rt->rt_type, res.prefix, res.prefixlen,
2967 fl4.flowi4_tos, res.fi, 0);
2968 } else {
2969 err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
2970 NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);
2971 }
2972 if (err < 0)
2973 goto errout_rcu;
2974
2975 rcu_read_unlock();
2976
2977 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
2978
2979 errout_free:
2980 return err;
2981
2982 errout_rcu:
2983 rcu_read_unlock();
2984 kfree_skb(skb);
2985 goto errout_free;
2986 }
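/*
 * Example (sketch): the iproute2 front end for this handler is
 * "ip route get", e.g.:
 *
 *	ip route get 192.0.2.1 from 198.51.100.7 iif eth0 mark 0x1
 *	ip route get 192.0.2.1 ipproto tcp sport 1234 dport 80 fibmatch
 *
 * "fibmatch" sets RTM_F_FIB_MATCH, so the reply carries the matching
 * FIB entry via fib_dump_info() rather than the resolved route from
 * rt_fill_info().
 */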
2987 void ip_rt_multicast_event(struct in_device *in_dev)
2988 {
2989 rt_cache_flush(dev_net(in_dev->dev));
2990 }
2992 #ifdef CONFIG_SYSCTL
2993 static int ip_rt_gc_interval __read_mostly = 60 * HZ;
2994 static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
2995 static int ip_rt_gc_elasticity __read_mostly = 8;
2996 static int ip_min_valid_pmtu __read_mostly = IPV4_MIN_MTU;
2998 static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
2999 void __user *buffer,
3000 size_t *lenp, loff_t *ppos)
3001 {
3002 struct net *net = (struct net *)__ctl->extra1;
3003
3004 if (write) {
3005 rt_cache_flush(net);
3006 fnhe_genid_bump(net);
3007 return 0;
3008 }
3009
3010 return -EINVAL;
3011 }
3013 static struct ctl_table ipv4_route_table[] = {
3014 {
3015 .procname = "gc_thresh",
3016 .data = &ipv4_dst_ops.gc_thresh,
3017 .maxlen = sizeof(int),
3018 .mode = 0644,
3019 .proc_handler = proc_dointvec,
3020 },
3021 {
3022 .procname = "max_size",
3023 .data = &ip_rt_max_size,
3024 .maxlen = sizeof(int),
3025 .mode = 0644,
3026 .proc_handler = proc_dointvec,
3027 },
3028 {
3029 /* Deprecated. Use gc_min_interval_ms */
3030
3031 .procname = "gc_min_interval",
3032 .data = &ip_rt_gc_min_interval,
3033 .maxlen = sizeof(int),
3034 .mode = 0644,
3035 .proc_handler = proc_dointvec_jiffies,
3036 },
3037 {
3038 .procname = "gc_min_interval_ms",
3039 .data = &ip_rt_gc_min_interval,
3040 .maxlen = sizeof(int),
3041 .mode = 0644,
3042 .proc_handler = proc_dointvec_ms_jiffies,
3043 },
3044 {
3045 .procname = "gc_timeout",
3046 .data = &ip_rt_gc_timeout,
3047 .maxlen = sizeof(int),
3048 .mode = 0644,
3049 .proc_handler = proc_dointvec_jiffies,
3050 },
3051 {
3052 .procname = "gc_interval",
3053 .data = &ip_rt_gc_interval,
3054 .maxlen = sizeof(int),
3055 .mode = 0644,
3056 .proc_handler = proc_dointvec_jiffies,
3057 },
3058 {
3059 .procname = "redirect_load",
3060 .data = &ip_rt_redirect_load,
3061 .maxlen = sizeof(int),
3062 .mode = 0644,
3063 .proc_handler = proc_dointvec,
3064 },
3065 {
3066 .procname = "redirect_number",
3067 .data = &ip_rt_redirect_number,
3068 .maxlen = sizeof(int),
3069 .mode = 0644,
3070 .proc_handler = proc_dointvec,
3071 },
3072 {
3073 .procname = "redirect_silence",
3074 .data = &ip_rt_redirect_silence,
3075 .maxlen = sizeof(int),
3076 .mode = 0644,
3077 .proc_handler = proc_dointvec,
3078 },
3079 {
3080 .procname = "error_cost",
3081 .data = &ip_rt_error_cost,
3082 .maxlen = sizeof(int),
3083 .mode = 0644,
3084 .proc_handler = proc_dointvec,
3085 },
3086 {
3087 .procname = "error_burst",
3088 .data = &ip_rt_error_burst,
3089 .maxlen = sizeof(int),
3090 .mode = 0644,
3091 .proc_handler = proc_dointvec,
3092 },
3093 {
3094 .procname = "gc_elasticity",
3095 .data = &ip_rt_gc_elasticity,
3096 .maxlen = sizeof(int),
3097 .mode = 0644,
3098 .proc_handler = proc_dointvec,
3099 },
3100 {
3101 .procname = "mtu_expires",
3102 .data = &ip_rt_mtu_expires,
3103 .maxlen = sizeof(int),
3104 .mode = 0644,
3105 .proc_handler = proc_dointvec_jiffies,
3106 },
3107 {
3108 .procname = "min_pmtu",
3109 .data = &ip_rt_min_pmtu,
3110 .maxlen = sizeof(int),
3111 .mode = 0644,
3112 .proc_handler = proc_dointvec_minmax,
3113 .extra1 = &ip_min_valid_pmtu,
3114 },
3115 {
3116 .procname = "min_adv_mss",
3117 .data = &ip_rt_min_advmss,
3118 .maxlen = sizeof(int),
3119 .mode = 0644,
3120 .proc_handler = proc_dointvec,
3121 },
3122 { }
3123 };
3125 static struct ctl_table ipv4_route_flush_table[] = {
3126 {
3127 .procname = "flush",
3128 .maxlen = sizeof(int),
3129 .mode = 0200,
3130 .proc_handler = ipv4_sysctl_rtcache_flush,
3131 },
3132 { },
3133 };
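/*
 * Example (userspace sketch): any write to the per-namespace file this
 * table registers triggers ipv4_sysctl_rtcache_flush(), which flushes
 * the cache and bumps the fnhe genid; the written value itself is not
 * parsed:
 *
 *	int fd = open("/proc/sys/net/ipv4/route/flush", O_WRONLY);
 *	if (fd >= 0) {
 *		write(fd, "1", 1);
 *		close(fd);
 *	}
 */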
3135 static __net_init int sysctl_route_net_init(struct net *net)
3136 {
3137 struct ctl_table *tbl;
3138
3139 tbl = ipv4_route_flush_table;
3140 if (!net_eq(net, &init_net)) {
3141 tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
3142 if (!tbl)
3143 goto err_dup;
3144
3145 /* Don't export sysctls to unprivileged users */
3146 if (net->user_ns != &init_user_ns)
3147 tbl[0].procname = NULL;
3148 }
3149 tbl[0].extra1 = net;
3150
3151 net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
3152 if (!net->ipv4.route_hdr)
3153 goto err_reg;
3154 return 0;
3155
3156 err_reg:
3157 if (tbl != ipv4_route_flush_table)
3158 kfree(tbl);
3159 err_dup:
3160 return -ENOMEM;
3161 }
3163 static __net_exit void sysctl_route_net_exit(struct net *net)
3164 {
3165 struct ctl_table *tbl;
3166
3167 tbl = net->ipv4.route_hdr->ctl_table_arg;
3168 unregister_net_sysctl_table(net->ipv4.route_hdr);
3169 BUG_ON(tbl == ipv4_route_flush_table);
3170 kfree(tbl);
3171 }
3173 static __net_initdata struct pernet_operations sysctl_route_ops = {
3174 .init = sysctl_route_net_init,
3175 .exit = sysctl_route_net_exit,
3176 };
3177 #endif
3179 static __net_init int rt_genid_init(struct net *net)
3180 {
3181 atomic_set(&net->ipv4.rt_genid, 0);
3182 atomic_set(&net->fnhe_genid, 0);
3183 atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
3184 return 0;
3185 }
3187 static __net_initdata struct pernet_operations rt_genid_ops = {
3188 .init = rt_genid_init,
3189 };
3191 static int __net_init ipv4_inetpeer_init(struct net *net)
3192 {
3193 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3194
3195 if (!bp)
3196 return -ENOMEM;
3197 inet_peer_base_init(bp);
3198 net->ipv4.peers = bp;
3199 return 0;
3200 }
3202 static void __net_exit ipv4_inetpeer_exit(struct net *net)
3203 {
3204 struct inet_peer_base *bp = net->ipv4.peers;
3205
3206 net->ipv4.peers = NULL;
3207 inetpeer_invalidate_tree(bp);
3208 kfree(bp);
3209 }
3211 static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
3212 .init = ipv4_inetpeer_init,
3213 .exit = ipv4_inetpeer_exit,
3214 };
3216 #ifdef CONFIG_IP_ROUTE_CLASSID
3217 struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
3218 #endif /* CONFIG_IP_ROUTE_CLASSID */
3220 int __init ip_rt_init(void)
3221 {
3222 void *idents_hash;
3223 int cpu;
3224
3225 /* For modern hosts, this will use 2 MB of memory */
3226 idents_hash = alloc_large_system_hash("IP idents",
3227 sizeof(*ip_idents) + sizeof(*ip_tstamps),
3228 0,
3229 16, /* one bucket per 64 KB */
3230 HASH_ZERO,
3231 NULL,
3232 &ip_idents_mask,
3233 2048,
3234 256*1024);
3235
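/*
 * Sizing arithmetic (illustrative): each bucket stores one ip_idents
 * counter plus one ip_tstamps stamp (4 + 4 bytes on typical configs),
 * so the 256*1024-bucket upper limit works out to
 * 262144 * 8 bytes = 2 MB, matching the comment above. The scale of 16
 * asks for roughly one bucket per 64 KB (2^16 bytes) of system RAM,
 * clamped between the 2048 and 256*1024 limits.
 */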
3236 ip_idents = idents_hash;
3238 prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));
3240 ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);
3242 for_each_possible_cpu(cpu) {
3243 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
3245 INIT_LIST_HEAD(&ul->head);
3246 spin_lock_init(&ul->lock);
3247 }
3248 #ifdef CONFIG_IP_ROUTE_CLASSID
3249 ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
3250 if (!ip_rt_acct)
3251 panic("IP: failed to allocate ip_rt_acct\n");
3252 #endif
3254 ipv4_dst_ops.kmem_cachep =
3255 kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
3256 SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);
3258 ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;
3260 if (dst_entries_init(&ipv4_dst_ops) < 0)
3261 panic("IP: failed to allocate ipv4_dst_ops counter\n");
3263 if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
3264 panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");
3266 ipv4_dst_ops.gc_thresh = ~0;
3267 ip_rt_max_size = INT_MAX;
3268
3269 devinet_init();
3270 ip_fib_init();
3271
3272 if (ip_rt_proc_init())
3273 pr_err("Unable to create route proc files\n");
3274 #ifdef CONFIG_XFRM
3275 xfrm_init();
3276 xfrm4_init();
3277 #endif
3278 rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
3279 RTNL_FLAG_DOIT_UNLOCKED);
3281 #ifdef CONFIG_SYSCTL
3282 register_pernet_subsys(&sysctl_route_ops);
3283 #endif
3284 register_pernet_subsys(&rt_genid_ops);
3285 register_pernet_subsys(&ipv4_inetpeer_ops);
3286 return 0;
3287 }
3289 #ifdef CONFIG_SYSCTL
3290 /*
3291 * We really need to sanitize the damn ipv4 init order, then all
3292 * this nonsense will go away.
3293 */
3294 void __init ip_static_sysctl_init(void)
3295 {
3296 register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
3297 }
3298 #endif