// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * INET		An implementation of the TCP/IP protocol suite for the LINUX
 *		operating system.  INET is implemented using the  BSD Socket
 *		interface as the means of communication with the user level.
 *
 *		ROUTE - implementation of the IP router.
 *
 * Authors:	Ross Biro
 *		Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
 *		Alan Cox, <gw4pts@gw4pts.ampr.org>
 *		Linus Torvalds, <Linus.Torvalds@helsinki.fi>
 *		Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 *
 * Fixes:
 *		Alan Cox	:	Verify area fixes.
 *		Alan Cox	:	cli() protects routing changes
 *		Rui Oliveira	:	ICMP routing table updates
 *		(rco@di.uminho.pt)	Routing table insertion and update
 *		Linus Torvalds	:	Rewrote bits to be sensible
 *		Alan Cox	:	Added BSD route gw semantics
 *		Alan Cox	:	Super /proc >4K
 *		Alan Cox	:	MTU in route table
 *		Alan Cox	:	MSS actually. Also added the window
 *					clamper.
 *		Sam Lantinga	:	Fixed route matching in rt_del()
 *		Alan Cox	:	Routing cache support.
 *		Alan Cox	:	Removed compatibility cruft.
 *		Alan Cox	:	RTF_REJECT support.
 *		Alan Cox	:	TCP irtt support.
 *		Jonathan Naylor	:	Added Metric support.
 *	Miquel van Smoorenburg	:	BSD API fixes.
 *	Miquel van Smoorenburg	:	Metrics.
 *		Alan Cox	:	Use __u32 properly
 *		Alan Cox	:	Aligned routing errors more closely with BSD
 *					our system is still very different.
 *		Alan Cox	:	Faster /proc handling
 *	Alexey Kuznetsov	:	Massive rework to support tree based routing,
 *					routing caches and better behaviour.
 *		Olaf Erb	:	irtt wasn't being copied right.
 *		Bjorn Ekwall	:	Kerneld route support.
 *		Alan Cox	:	Multicast fixed (I hope)
 *		Pavel Krauz	:	Limited broadcast fixed
 *		Mike McLagan	:	Routing by source
 *	Alexey Kuznetsov	:	End of old history. Split to fib.c and
 *					route.c and rewritten from scratch.
 *		Andi Kleen	:	Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after year coma.
 *	Vitaly E. Lavrov	:	Race condition in ip_route_input_slow.
 *	Tobias Ringstrom	:	Uninitialized res.type in ip_route_output_slow.
 *	Vladimir V. Ivanov	:	IP rule info (flowid) is really useful.
 *		Marc Boucher	:	routing by fwmark
 *	Robert Olsson		:	Added rt_cache statistics
 *	Arnaldo C. Melo		:	Convert proc stuff to seq_file
 *	Eric Dumazet		:	hashed spinlocks and rt_check_expire() fixes.
 *	Ilia Sotnikov		:	Ignore TOS on PMTUD and Redirect
 *	Ilia Sotnikov		:	Removed TOS from hash calculations
 */
#define pr_fmt(fmt) "IPv4: " fmt

#include <linux/module.h>
#include <linux/uaccess.h>
#include <linux/bitops.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/mm.h>
#include <linux/memblock.h>
#include <linux/string.h>
#include <linux/socket.h>
#include <linux/sockios.h>
#include <linux/errno.h>
#include <linux/in.h>
#include <linux/inet.h>
#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/init.h>
#include <linux/skbuff.h>
#include <linux/inetdevice.h>
#include <linux/igmp.h>
#include <linux/pkt_sched.h>
#include <linux/mroute.h>
#include <linux/netfilter_ipv4.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <linux/times.h>
#include <linux/slab.h>
#include <linux/jhash.h>
#include <net/dst.h>
#include <net/dst_metadata.h>
#include <net/net_namespace.h>
#include <net/protocol.h>
#include <net/ip.h>
#include <net/route.h>
#include <net/inetpeer.h>
#include <net/sock.h>
#include <net/ip_fib.h>
#include <net/nexthop.h>
#include <net/arp.h>
#include <net/tcp.h>
#include <net/icmp.h>
#include <net/xfrm.h>
#include <net/lwtunnel.h>
#include <net/netevent.h>
#include <net/rtnetlink.h>
#ifdef CONFIG_SYSCTL
#include <linux/sysctl.h>
#endif
#include <net/secure_seq.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>

#include "fib_lookup.h"

#define RT_FL_TOS(oldflp4) \
	((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
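
/* RT_FL_TOS() keeps only the bits of flowi4_tos that can influence routing:
 * the IPTOS_RT_MASK bits plus the legacy RTO_ONLINK flag. Illustrative
 * sketch (assuming IPTOS_RT_MASK == 0x1c): a tos of 0x1f reduces to 0x1c,
 * since the low ECN bits never select a route.
 */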
#define RT_GC_TIMEOUT (300*HZ)

static int ip_rt_max_size;
static int ip_rt_redirect_number __read_mostly	= 9;
static int ip_rt_redirect_load __read_mostly	= HZ / 50;
static int ip_rt_redirect_silence __read_mostly	= ((HZ / 50) << (9 + 1));
static int ip_rt_error_cost __read_mostly	= HZ;
static int ip_rt_error_burst __read_mostly	= 5 * HZ;
static int ip_rt_mtu_expires __read_mostly	= 10 * 60 * HZ;
static u32 ip_rt_min_pmtu __read_mostly		= 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly	= 256;

static int ip_rt_gc_timeout __read_mostly	= RT_GC_TIMEOUT;
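
/* A worked example of the redirect tunables above, assuming HZ == 1000:
 * ip_rt_redirect_load is 20 jiffies (20ms) and is left-shifted once per
 * redirect already sent, ip_rt_redirect_number caps the count at 9, and
 * ip_rt_redirect_silence (20 << 10 = 20480 jiffies, ~20s) of quiet resets
 * the backoff state kept per peer.
 */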
/*
 * Interface to generic destination cache.
 */

static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
static unsigned int	 ipv4_default_advmss(const struct dst_entry *dst);
static unsigned int	 ipv4_mtu(const struct dst_entry *dst);
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
static void		 ipv4_link_failure(struct sk_buff *skb);
static void		 ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
					   struct sk_buff *skb, u32 mtu,
					   bool confirm_neigh);
static void		 ip_do_redirect(struct dst_entry *dst, struct sock *sk,
					struct sk_buff *skb);
static void		 ipv4_dst_destroy(struct dst_entry *dst);

static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
{
	WARN_ON(1);
	return NULL;
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr);
static void		 ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);

static struct dst_ops ipv4_dst_ops = {
	.family =		AF_INET,
	.check =		ipv4_dst_check,
	.default_advmss =	ipv4_default_advmss,
	.mtu =			ipv4_mtu,
	.cow_metrics =		ipv4_cow_metrics,
	.destroy =		ipv4_dst_destroy,
	.negative_advice =	ipv4_negative_advice,
	.link_failure =		ipv4_link_failure,
	.update_pmtu =		ip_rt_update_pmtu,
	.redirect =		ip_do_redirect,
	.local_out =		__ip_local_out,
	.neigh_lookup =		ipv4_neigh_lookup,
	.confirm_neigh =	ipv4_confirm_neigh,
};

#define ECN_OR_COST(class)	TC_PRIO_##class

const __u8 ip_tos2prio[16] = {
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BESTEFFORT,
	ECN_OR_COST(BESTEFFORT),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_BULK,
	ECN_OR_COST(BULK),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE,
	ECN_OR_COST(INTERACTIVE),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK),
	TC_PRIO_INTERACTIVE_BULK,
	ECN_OR_COST(INTERACTIVE_BULK)
};
EXPORT_SYMBOL(ip_tos2prio);

static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
#define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
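
/* RT_CACHE_STAT_INC() uses raw_cpu_inc(): each CPU bumps its own counter
 * with no locking, and the rare lost increment under preemption/migration
 * is an accepted trade-off for statistics-only data.
 */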
#ifdef CONFIG_PROC_FS
static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (*pos)
		return NULL;
	return SEQ_START_TOKEN;
}

static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;
	return NULL;
}

static void rt_cache_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cache_seq_show(struct seq_file *seq, void *v)
{
	if (v == SEQ_START_TOKEN)
		seq_printf(seq, "%-127s\n",
			   "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
			   "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
			   "HHUptod\tSpecDst");
	return 0;
}

static const struct seq_operations rt_cache_seq_ops = {
	.start  = rt_cache_seq_start,
	.next   = rt_cache_seq_next,
	.stop   = rt_cache_seq_stop,
	.show   = rt_cache_seq_show,
};

static int rt_cache_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cache_seq_ops);
}

static const struct file_operations rt_cache_seq_fops = {
	.open	 = rt_cache_seq_open,
	.read	 = seq_read,
	.llseek	 = seq_lseek,
	.release = seq_release,
};

static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
{
	int cpu;

	if (*pos == 0)
		return SEQ_START_TOKEN;

	for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	return NULL;
}

static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	int cpu;

	for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
		if (!cpu_possible(cpu))
			continue;
		*pos = cpu+1;
		return &per_cpu(rt_cache_stat, cpu);
	}
	(*pos)++;
	return NULL;
}

static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
{
}

static int rt_cpu_seq_show(struct seq_file *seq, void *v)
{
	struct rt_cache_stat *st = v;

	if (v == SEQ_START_TOKEN) {
		seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
		return 0;
	}

	seq_printf(seq,"%08x %08x %08x %08x %08x %08x %08x %08x "
		   " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
		   dst_entries_get_slow(&ipv4_dst_ops),
		   0, /* st->in_hit */
		   st->in_slow_tot,
		   st->in_slow_mc,
		   st->in_no_route,
		   st->in_brd,
		   st->in_martian_dst,
		   st->in_martian_src,
		   0, /* st->out_hit */
		   st->out_slow_tot,
		   st->out_slow_mc,
		   0, /* st->gc_total */
		   0, /* st->gc_ignored */
		   0, /* st->gc_goal_miss */
		   0, /* st->gc_dst_overflow */
		   0, /* st->in_hlist_search */
		   0  /* st->out_hlist_search */
		);
	return 0;
}

static const struct seq_operations rt_cpu_seq_ops = {
	.start  = rt_cpu_seq_start,
	.next   = rt_cpu_seq_next,
	.stop   = rt_cpu_seq_stop,
	.show   = rt_cpu_seq_show,
};

static int rt_cpu_seq_open(struct inode *inode, struct file *file)
{
	return seq_open(file, &rt_cpu_seq_ops);
}

static const struct file_operations rt_cpu_seq_fops = {
	.open	 = rt_cpu_seq_open,
	.read	 = seq_read,
	.llseek  = seq_lseek,
	.release = seq_release,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
static int rt_acct_proc_show(struct seq_file *m, void *v)
{
	struct ip_rt_acct *dst, *src;
	unsigned int i, j;

	dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
	if (!dst)
		return -ENOMEM;

	for_each_possible_cpu(i) {
		src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
		for (j = 0; j < 256; j++) {
			dst[j].o_bytes   += src[j].o_bytes;
			dst[j].o_packets += src[j].o_packets;
			dst[j].i_bytes   += src[j].i_bytes;
			dst[j].i_packets += src[j].i_packets;
		}
	}

	seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
	kfree(dst);
	return 0;
}
#endif

static int __net_init ip_rt_do_proc_init(struct net *net)
{
	struct proc_dir_entry *pde;

	pde = proc_create("rt_cache", 0444, net->proc_net,
			  &rt_cache_seq_fops);
	if (!pde)
		goto err1;

	pde = proc_create("rt_cache", 0444,
			  net->proc_net_stat, &rt_cpu_seq_fops);
	if (!pde)
		goto err2;

#ifdef CONFIG_IP_ROUTE_CLASSID
	pde = proc_create_single("rt_acct", 0, net->proc_net,
			rt_acct_proc_show);
	if (!pde)
		goto err3;
#endif
	return 0;

#ifdef CONFIG_IP_ROUTE_CLASSID
err3:
	remove_proc_entry("rt_cache", net->proc_net_stat);
#endif
err2:
	remove_proc_entry("rt_cache", net->proc_net);
err1:
	return -ENOMEM;
}

static void __net_exit ip_rt_do_proc_exit(struct net *net)
{
	remove_proc_entry("rt_cache", net->proc_net_stat);
	remove_proc_entry("rt_cache", net->proc_net);
#ifdef CONFIG_IP_ROUTE_CLASSID
	remove_proc_entry("rt_acct", net->proc_net);
#endif
}

static struct pernet_operations ip_rt_proc_ops __net_initdata =	{
	.init = ip_rt_do_proc_init,
	.exit = ip_rt_do_proc_exit,
};

static int __init ip_rt_proc_init(void)
{
	return register_pernet_subsys(&ip_rt_proc_ops);
}

#else
static inline int ip_rt_proc_init(void)
{
	return 0;
}
#endif /* CONFIG_PROC_FS */
static inline bool rt_is_expired(const struct rtable *rth)
{
	return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
}

void rt_cache_flush(struct net *net)
{
	rt_genid_bump_ipv4(net);
}

static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
					   struct sk_buff *skb,
					   const void *daddr)
{
	const struct rtable *rt = container_of(dst, struct rtable, dst);
	struct net_device *dev = dst->dev;
	struct neighbour *n;

	rcu_read_lock_bh();

	if (likely(rt->rt_gw_family == AF_INET)) {
		n = ip_neigh_gw4(dev, rt->rt_gw4);
	} else if (rt->rt_gw_family == AF_INET6) {
		n = ip_neigh_gw6(dev, &rt->rt_gw6);
	} else {
		__be32 pkey;

		pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
		n = ip_neigh_gw4(dev, pkey);
	}

	if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
		n = NULL;

	rcu_read_unlock_bh();

	return n;
}

static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
{
	const struct rtable *rt = container_of(dst, struct rtable, dst);
	struct net_device *dev = dst->dev;
	const __be32 *pkey = daddr;

	if (rt->rt_gw_family == AF_INET) {
		pkey = (const __be32 *)&rt->rt_gw4;
	} else if (rt->rt_gw_family == AF_INET6) {
		return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
	} else if (!daddr ||
		   (rt->rt_flags &
		    (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
		return;
	}
	__ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
}

/* Hash tables of size 2048..262144 depending on RAM size.
 * Each bucket uses 8 bytes.
 */
static u32 ip_idents_mask __read_mostly;
static atomic_t *ip_idents __read_mostly;
static u32 *ip_tstamps __read_mostly;

/* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
 * to infer how many packets were sent between two points in time.
 */
u32 ip_idents_reserve(u32 hash, int segs)
{
	u32 bucket, old, now = (u32)jiffies;
	atomic_t *p_id;
	u32 *p_tstamp;
	u32 delta = 0;

	bucket = hash & ip_idents_mask;
	p_tstamp = ip_tstamps + bucket;
	p_id = ip_idents + bucket;
	old = READ_ONCE(*p_tstamp);

	if (old != now && cmpxchg(p_tstamp, old, now) == old)
		delta = prandom_u32_max(now - old);

	/* If UBSAN reports an error here, please make sure your compiler
	 * supports -fno-strict-overflow before reporting it; that was a bug
	 * in UBSAN, and it has been fixed in GCC-8.
	 */
	return atomic_add_return(segs + delta, p_id) - segs;
}
EXPORT_SYMBOL(ip_idents_reserve);
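
/* Illustrative walk-through of ip_idents_reserve() (not normative): with
 * ip_idents_mask == 2047, hash 0x12345678 selects bucket 0x678. If that
 * bucket was last used 100 jiffies ago, delta is drawn from [0, 100), so
 * the IDs handed out leak little about how many packets were really sent;
 * atomic_add_return() then reserves `segs` consecutive IDs.
 */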
void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
{
	u32 hash, id;

	/* Note the following code is not safe, but this is okay. */
	if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
		get_random_bytes(&net->ipv4.ip_id_key,
				 sizeof(net->ipv4.ip_id_key));

	hash = siphash_3u32((__force u32)iph->daddr,
			    (__force u32)iph->saddr,
			    iph->protocol,
			    &net->ipv4.ip_id_key);
	id = ip_idents_reserve(hash, segs);
	iph->id = htons(id);
}
EXPORT_SYMBOL(__ip_select_ident);
static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
			     const struct sock *sk,
			     const struct iphdr *iph,
			     int oif, u8 tos,
			     u8 prot, u32 mark, int flow_flags)
{
	if (sk) {
		const struct inet_sock *inet = inet_sk(sk);

		oif = sk->sk_bound_dev_if;
		mark = sk->sk_mark;
		tos = RT_CONN_FLAGS(sk);
		prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
	}
	flowi4_init_output(fl4, oif, mark, tos,
			   RT_SCOPE_UNIVERSE, prot,
			   flow_flags,
			   iph->daddr, iph->saddr, 0, 0,
			   sock_net_uid(net, sk));
}

static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
			       const struct sock *sk)
{
	const struct net *net = dev_net(skb->dev);
	const struct iphdr *iph = ip_hdr(skb);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	__build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
}

static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
{
	const struct inet_sock *inet = inet_sk(sk);
	const struct ip_options_rcu *inet_opt;
	__be32 daddr = inet->inet_daddr;

	rcu_read_lock();
	inet_opt = rcu_dereference(inet->inet_opt);
	if (inet_opt && inet_opt->opt.srr)
		daddr = inet_opt->opt.faddr;
	flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
			   RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
			   inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
			   inet_sk_flowi_flags(sk),
			   daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
	rcu_read_unlock();
}

static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
				 const struct sk_buff *skb)
{
	if (skb)
		build_skb_flow_key(fl4, skb, sk);
	else
		build_sk_flow_key(fl4, sk);
}
static DEFINE_SPINLOCK(fnhe_lock);

static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
{
	struct rtable *rt;

	rt = rcu_dereference(fnhe->fnhe_rth_input);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
	rt = rcu_dereference(fnhe->fnhe_rth_output);
	if (rt) {
		RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
		dst_dev_put(&rt->dst);
		dst_release(&rt->dst);
	}
}

static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
{
	struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
	struct fib_nh_exception *fnhe, *oldest = NULL;

	for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
		fnhe = rcu_dereference_protected(*fnhe_p,
						 lockdep_is_held(&fnhe_lock));
		if (!fnhe)
			break;
		if (!oldest ||
		    time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
			oldest = fnhe;
			oldest_p = fnhe_p;
		}
	}
	fnhe_flush_routes(oldest);
	*oldest_p = oldest->fnhe_next;
	kfree_rcu(oldest, rcu);
}

static u32 fnhe_hashfun(__be32 daddr)
{
	static siphash_key_t fnhe_hash_key __read_mostly;
	u64 hval;

	net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
	hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
	return hash_64(hval, FNHE_HASH_SHIFT);
}
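
/* fnhe_hashfun() is deliberately keyed: the boot-time random fnhe_hash_key
 * keeps a remote sender from precomputing daddr values that all collide in
 * one bucket and churning the exception table via fnhe_remove_oldest().
 */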
static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
{
	rt->rt_pmtu = fnhe->fnhe_pmtu;
	rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
	rt->dst.expires = fnhe->fnhe_expires;

	if (fnhe->fnhe_gw) {
		rt->rt_flags |= RTCF_REDIRECTED;
		rt->rt_uses_gateway = 1;
		rt->rt_gw_family = AF_INET;
		rt->rt_gw4 = fnhe->fnhe_gw;
	}
}
static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
				  __be32 gw, u32 pmtu, bool lock,
				  unsigned long expires)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe;
	struct rtable *rt;
	u32 genid, hval;
	unsigned int i;
	int depth;

	genid = fnhe_genid(dev_net(nhc->nhc_dev));
	hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference(nhc->nhc_exceptions);
	if (!hash) {
		hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
		if (!hash)
			goto out_unlock;
		rcu_assign_pointer(nhc->nhc_exceptions, hash);
	}

	hash += hval;

	depth = 0;
	for (fnhe = rcu_dereference(hash->chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr)
			break;
		depth++;
	}

	if (fnhe) {
		if (fnhe->fnhe_genid != genid)
			fnhe->fnhe_genid = genid;
		if (gw)
			fnhe->fnhe_gw = gw;
		if (pmtu) {
			fnhe->fnhe_pmtu = pmtu;
			fnhe->fnhe_mtu_locked = lock;
		}
		fnhe->fnhe_expires = max(1UL, expires);
		/* Update all cached dsts too */
		rt = rcu_dereference(fnhe->fnhe_rth_input);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
		rt = rcu_dereference(fnhe->fnhe_rth_output);
		if (rt)
			fill_route_from_fnhe(rt, fnhe);
	} else {
		/* Randomize max depth to avoid some side channel attacks. */
		int max_depth = FNHE_RECLAIM_DEPTH +
				prandom_u32_max(FNHE_RECLAIM_DEPTH);

		while (depth > max_depth) {
			fnhe_remove_oldest(hash);
			depth--;
		}

		fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
		if (!fnhe)
			goto out_unlock;

		fnhe->fnhe_next = hash->chain;

		fnhe->fnhe_genid = genid;
		fnhe->fnhe_daddr = daddr;
		fnhe->fnhe_gw = gw;
		fnhe->fnhe_pmtu = pmtu;
		fnhe->fnhe_mtu_locked = lock;
		fnhe->fnhe_expires = max(1UL, expires);

		rcu_assign_pointer(hash->chain, fnhe);

		/* Exception created; mark the cached routes for the nexthop
		 * stale, so anyone caching it rechecks if this exception
		 * applies to them.
		 */
		rt = rcu_dereference(nhc->nhc_rth_input);
		if (rt)
			rt->dst.obsolete = DST_OBSOLETE_KILL;

		for_each_possible_cpu(i) {
			struct rtable __rcu **prt;

			prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
			rt = rcu_dereference(*prt);
			if (rt)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
		}
	}

	fnhe->fnhe_stamp = jiffies;

out_unlock:
	spin_unlock_bh(&fnhe_lock);
}
static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
			     bool kill_route)
{
	__be32 new_gw = icmp_hdr(skb)->un.gateway;
	__be32 old_gw = ip_hdr(skb)->saddr;
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct fib_result res;
	struct neighbour *n;
	struct net *net;

	switch (icmp_hdr(skb)->code & 7) {
	case ICMP_REDIR_NET:
	case ICMP_REDIR_NETTOS:
	case ICMP_REDIR_HOST:
	case ICMP_REDIR_HOSTTOS:
		break;

	default:
		goto reject_redirect;
	}

	if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
		goto reject_redirect;

	in_dev = __in_dev_get_rcu(dev);
	if (!in_dev)
		goto reject_redirect;

	net = dev_net(dev);
	if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
	    ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
	    ipv4_is_zeronet(new_gw))
		goto reject_redirect;

	if (!IN_DEV_SHARED_MEDIA(in_dev)) {
		if (!inet_addr_onlink(in_dev, new_gw, old_gw))
			goto reject_redirect;
		if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
			goto reject_redirect;
	} else {
		if (inet_addr_type(net, new_gw) != RTN_UNICAST)
			goto reject_redirect;
	}

	n = __ipv4_neigh_lookup(rt->dst.dev, new_gw);
	if (!n)
		n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
	if (!IS_ERR(n)) {
		if (!(n->nud_state & NUD_VALID)) {
			neigh_event_send(n, NULL);
		} else {
			if (fib_lookup(net, fl4, &res, 0) == 0) {
				struct fib_nh_common *nhc;

				fib_select_path(net, &res, fl4, skb);
				nhc = FIB_RES_NHC(res);
				update_or_create_fnhe(nhc, fl4->daddr, new_gw,
						0, false,
						jiffies + ip_rt_gc_timeout);
			}
			if (kill_route)
				rt->dst.obsolete = DST_OBSOLETE_KILL;
			call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
		}
		neigh_release(n);
	}
	return;

reject_redirect:
#ifdef CONFIG_IP_ROUTE_VERBOSE
	if (IN_DEV_LOG_MARTIANS(in_dev)) {
		const struct iphdr *iph = (const struct iphdr *) skb->data;
		__be32 daddr = iph->daddr;
		__be32 saddr = iph->saddr;

		net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
				     "  Advised path = %pI4 -> %pI4\n",
				     &old_gw, dev->name, &new_gw,
				     &saddr, &daddr);
	}
#endif
	;
}
static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
{
	struct rtable *rt;
	struct flowi4 fl4;
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct net *net = dev_net(skb->dev);
	int oif = skb->dev->ifindex;
	u8 tos = RT_TOS(iph->tos);
	u8 prot = iph->protocol;
	u32 mark = skb->mark;

	rt = (struct rtable *) dst;

	__build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
	__ip_do_redirect(rt, skb, &fl4, true);
}
static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
{
	struct rtable *rt = (struct rtable *)dst;
	struct dst_entry *ret = dst;

	if (rt) {
		if (dst->obsolete > 0) {
			ip_rt_put(rt);
			ret = NULL;
		} else if ((rt->rt_flags & RTCF_REDIRECTED) ||
			   rt->dst.expires) {
			ip_rt_put(rt);
			ret = NULL;
		}
	}
	return ret;
}
/*
 * Algorithm:
 *	1. The first ip_rt_redirect_number redirects are sent
 *	   with exponential backoff, then we stop sending them at all,
 *	   assuming that the host ignores our redirects.
 *	2. If we did not see packets requiring redirects
 *	   during ip_rt_redirect_silence, we assume that the host
 *	   forgot the redirected route and start to send redirects again.
 *
 * This algorithm is much cheaper and more intelligent than dumb load limiting
 * in icmp.c.
 *
 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
 * and "frag. need" (breaks PMTU discovery) in icmp.c.
 */
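
/* Sketch of the resulting schedule with default tunables and HZ == 1000:
 * a redirect may be sent again after ip_rt_redirect_load << n_redirects
 * jiffies of that peer's rate_last (20ms, 40ms, 80ms, ...); after the 9th
 * ignored redirect we go silent until ~20s without redirect-worthy traffic
 * resets the peer state below.
 */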
void ip_rt_send_redirect(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct in_device *in_dev;
	struct inet_peer *peer;
	struct net *net;
	int log_martians;
	int vif;

	rcu_read_lock();
	in_dev = __in_dev_get_rcu(rt->dst.dev);
	if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
		rcu_read_unlock();
		return;
	}
	log_martians = IN_DEV_LOG_MARTIANS(in_dev);
	vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
	rcu_read_unlock();

	net = dev_net(rt->dst.dev);
	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
	if (!peer) {
		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
			  rt_nexthop(rt, ip_hdr(skb)->daddr));
		return;
	}

	/* No redirected packets during ip_rt_redirect_silence;
	 * reset the algorithm.
	 */
	if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
		peer->rate_tokens = 0;
		peer->n_redirects = 0;
	}

	/* Too many ignored redirects; do not send anything
	 * set dst.rate_last to the last seen redirected packet.
	 */
	if (peer->n_redirects >= ip_rt_redirect_number) {
		peer->rate_last = jiffies;
		goto out_put_peer;
	}

	/* Check for load limit; set rate_last to the latest sent
	 * redirect.
	 */
	if (peer->n_redirects == 0 ||
	    time_after(jiffies,
		       (peer->rate_last +
			(ip_rt_redirect_load << peer->n_redirects)))) {
		__be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);

		icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
		peer->rate_last = jiffies;
		++peer->n_redirects;
#ifdef CONFIG_IP_ROUTE_VERBOSE
		if (log_martians &&
		    peer->n_redirects == ip_rt_redirect_number)
			net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
					     &ip_hdr(skb)->saddr, inet_iif(skb),
					     &ip_hdr(skb)->daddr, &gw);
#endif
	}
out_put_peer:
	inet_putpeer(peer);
}
static int ip_error(struct sk_buff *skb)
{
	struct rtable *rt = skb_rtable(skb);
	struct net_device *dev = skb->dev;
	struct in_device *in_dev;
	struct inet_peer *peer;
	unsigned long now;
	struct net *net;
	bool send;
	int code;

	if (netif_is_l3_master(skb->dev)) {
		dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
		if (!dev)
			goto out;
	}

	in_dev = __in_dev_get_rcu(dev);

	/* IP on this device is disabled. */
	if (!in_dev)
		goto out;

	net = dev_net(rt->dst.dev);
	if (!IN_DEV_FORWARD(in_dev)) {
		switch (rt->dst.error) {
		case EHOSTUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
			break;

		case ENETUNREACH:
			__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
			break;
		}
		goto out;
	}

	switch (rt->dst.error) {
	case EINVAL:
	default:
		goto out;
	case EHOSTUNREACH:
		code = ICMP_HOST_UNREACH;
		break;
	case ENETUNREACH:
		code = ICMP_NET_UNREACH;
		__IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
		break;
	case EACCES:
		code = ICMP_PKT_FILTERED;
		break;
	}

	peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
			       l3mdev_master_ifindex(skb->dev), 1);
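
	/* Token-bucket rate limit; as a worked example with HZ == 1000,
	 * tokens accrue one per jiffy up to ip_rt_error_burst (5 * HZ) and
	 * each ICMP error costs ip_rt_error_cost (HZ), so steady state is
	 * at most ~1 error/sec with bursts of up to 5.
	 */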
	send = true;
	if (peer) {
		now = jiffies;
		peer->rate_tokens += now - peer->rate_last;
		if (peer->rate_tokens > ip_rt_error_burst)
			peer->rate_tokens = ip_rt_error_burst;
		peer->rate_last = now;
		if (peer->rate_tokens >= ip_rt_error_cost)
			peer->rate_tokens -= ip_rt_error_cost;
		else
			send = false;
		inet_putpeer(peer);
	}
	if (send)
		icmp_send(skb, ICMP_DEST_UNREACH, code, 0);

out:	kfree_skb(skb);
	return 0;
}
static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
{
	struct dst_entry *dst = &rt->dst;
	struct net *net = dev_net(dst->dev);
	u32 old_mtu = ipv4_mtu(dst);
	struct fib_result res;
	bool lock = false;

	if (ip_mtu_locked(dst))
		return;

	if (old_mtu < mtu)
		return;

	if (mtu < ip_rt_min_pmtu) {
		lock = true;
		mtu = min(old_mtu, ip_rt_min_pmtu);
	}

	if (rt->rt_pmtu == mtu && !lock &&
	    time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
		return;

	rcu_read_lock();
	if (fib_lookup(net, fl4, &res, 0) == 0) {
		struct fib_nh_common *nhc;

		fib_select_path(net, &res, fl4, NULL);
		nhc = FIB_RES_NHC(res);
		update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
				      jiffies + ip_rt_mtu_expires);
	}
	rcu_read_unlock();
}
static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
			      struct sk_buff *skb, u32 mtu,
			      bool confirm_neigh)
{
	struct rtable *rt = (struct rtable *) dst;
	struct flowi4 fl4;

	ip_rt_build_flow_key(&fl4, sk, skb);
	__ip_rt_update_pmtu(rt, &fl4, mtu);
}
void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
		      int oif, u8 protocol)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	u32 mark = IP4_REPLY_MARK(net, skb->mark);

	__build_flow_key(net, &fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, mark, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);

	if (!fl4.flowi4_mark)
		fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);

	rt = __ip_route_output_key(sock_net(sk), &fl4);
	if (!IS_ERR(rt)) {
		__ip_rt_update_pmtu(rt, &fl4, mtu);
		ip_rt_put(rt);
	}
}
void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct dst_entry *odst = NULL;
	bool new = false;
	struct net *net = sock_net(sk);

	bh_lock_sock(sk);

	if (!ip_sk_accept_pmtu(sk))
		goto out;

	odst = sk_dst_get(sk);

	if (sock_owned_by_user(sk) || !odst) {
		__ipv4_sk_update_pmtu(skb, sk, mtu);
		goto out;
	}

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);

	rt = (struct rtable *)odst;
	if (odst->obsolete && !odst->ops->check(odst, 0)) {
		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	__ip_rt_update_pmtu((struct rtable *) xfrm_dst_path(&rt->dst), &fl4, mtu);

	if (!dst_check(&rt->dst, 0)) {
		if (new)
			dst_release(&rt->dst);

		rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
		if (IS_ERR(rt))
			goto out;

		new = true;
	}

	if (new)
		sk_dst_set(sk, &rt->dst);

out:
	bh_unlock_sock(sk);
	dst_release(odst);
}
EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
void ipv4_redirect(struct sk_buff *skb, struct net *net,
		   int oif, u8 protocol)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;

	__build_flow_key(net, &fl4, NULL, iph, oif,
			 RT_TOS(iph->tos), protocol, 0, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_redirect);
void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
{
	const struct iphdr *iph = (const struct iphdr *) skb->data;
	struct flowi4 fl4;
	struct rtable *rt;
	struct net *net = sock_net(sk);

	__build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
	rt = __ip_route_output_key(net, &fl4);
	if (!IS_ERR(rt)) {
		__ip_do_redirect(rt, skb, &fl4, false);
		ip_rt_put(rt);
	}
}
EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
{
	struct rtable *rt = (struct rtable *) dst;

	/* All IPV4 dsts are created with ->obsolete set to the value
	 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
	 * into this function always.
	 *
	 * When a PMTU/redirect information update invalidates a route,
	 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
	 * DST_OBSOLETE_DEAD.
	 */
	if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
		return NULL;
	return dst;
}
static void ipv4_send_dest_unreach(struct sk_buff *skb)
{
	struct ip_options opt;
	int res;

	/* Recompile ip options since IPCB may not be valid anymore.
	 * Also check we have a reasonable ipv4 header.
	 */
	if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
	    ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
		return;

	memset(&opt, 0, sizeof(opt));
	if (ip_hdr(skb)->ihl > 5) {
		if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
			return;
		opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);

		rcu_read_lock();
		res = __ip_options_compile(dev_net(skb->dev), &opt, skb, NULL);
		rcu_read_unlock();

		if (res)
			return;
	}
	__icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
}
static void ipv4_link_failure(struct sk_buff *skb)
{
	struct rtable *rt;

	ipv4_send_dest_unreach(skb);

	rt = skb_rtable(skb);
	if (rt)
		dst_set_expires(&rt->dst, 0);
}
static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
{
	pr_debug("%s: %pI4 -> %pI4, %s\n",
		 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
		 skb->dev ? skb->dev->name : "?");
	kfree_skb(skb);
	WARN_ON(1);
	return 0;
}
/*
 * We do not cache the source address of the outgoing interface,
 * because it is used only by IP RR, TS and SRR options,
 * so that it is out of the fast path.
 *
 * BTW remember: "addr" is allowed to be not aligned
 * in IP options!
 */
void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
{
	__be32 src;

	if (rt_is_output_route(rt))
		src = ip_hdr(skb)->saddr;
	else {
		struct fib_result res;
		struct iphdr *iph = ip_hdr(skb);
		struct flowi4 fl4 = {
			.daddr = iph->daddr,
			.saddr = iph->saddr,
			.flowi4_tos = RT_TOS(iph->tos),
			.flowi4_oif = rt->dst.dev->ifindex,
			.flowi4_iif = skb->dev->ifindex,
			.flowi4_mark = skb->mark,
		};

		rcu_read_lock();
		if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
			src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
		else
			src = inet_select_addr(rt->dst.dev,
					       rt_nexthop(rt, iph->daddr),
					       RT_SCOPE_UNIVERSE);
		rcu_read_unlock();
	}
	memcpy(addr, &src, 4);
}
#ifdef CONFIG_IP_ROUTE_CLASSID
static void set_class_tag(struct rtable *rt, u32 tag)
{
	if (!(rt->dst.tclassid & 0xFFFF))
		rt->dst.tclassid |= tag & 0xFFFF;
	if (!(rt->dst.tclassid & 0xFFFF0000))
		rt->dst.tclassid |= tag & 0xFFFF0000;
}
#endif
static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
{
	unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
	unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
				    ip_rt_min_advmss);

	return min(advmss, IPV4_MAX_PMTU - header_size);
}
static unsigned int ipv4_mtu(const struct dst_entry *dst)
{
	const struct rtable *rt = (const struct rtable *) dst;
	unsigned int mtu = rt->rt_pmtu;

	if (!mtu || time_after_eq(jiffies, rt->dst.expires))
		mtu = dst_metric_raw(dst, RTAX_MTU);

	if (mtu)
		return mtu;

	mtu = READ_ONCE(dst->dev->mtu);

	if (unlikely(ip_mtu_locked(dst))) {
		if (rt->rt_uses_gateway && mtu > 576)
			mtu = 576;
	}

	mtu = min_t(unsigned int, mtu, IP_MAX_MTU);

	return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
}
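
/* Resolution order in ipv4_mtu() above, as a quick reference: a cached,
 * unexpired rt_pmtu wins; otherwise the RTAX_MTU route metric; otherwise
 * the egress device MTU, clamped to IP_MAX_MTU and reduced by any
 * lwtunnel encapsulation headroom.
 */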
static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
{
	struct fnhe_hash_bucket *hash;
	struct fib_nh_exception *fnhe, __rcu **fnhe_p;
	u32 hval = fnhe_hashfun(daddr);

	spin_lock_bh(&fnhe_lock);

	hash = rcu_dereference_protected(nhc->nhc_exceptions,
					 lockdep_is_held(&fnhe_lock));
	hash += hval;

	fnhe_p = &hash->chain;
	fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
	while (fnhe) {
		if (fnhe->fnhe_daddr == daddr) {
			rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
				fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
			/* set fnhe_daddr to 0 to ensure it won't bind with
			 * new dsts in rt_bind_exception().
			 */
			fnhe->fnhe_daddr = 0;
			fnhe_flush_routes(fnhe);
			kfree_rcu(fnhe, rcu);
			break;
		}
		fnhe_p = &fnhe->fnhe_next;
		fnhe = rcu_dereference_protected(fnhe->fnhe_next,
						 lockdep_is_held(&fnhe_lock));
	}

	spin_unlock_bh(&fnhe_lock);
}
static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
					       __be32 daddr)
{
	struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
	struct fib_nh_exception *fnhe;
	u32 hval;

	if (!hash)
		return NULL;

	hval = fnhe_hashfun(daddr);

	for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
	     fnhe = rcu_dereference(fnhe->fnhe_next)) {
		if (fnhe->fnhe_daddr == daddr) {
			if (fnhe->fnhe_expires &&
			    time_after(jiffies, fnhe->fnhe_expires)) {
				ip_del_fnhe(nhc, daddr);
				break;
			}
			return fnhe;
		}
	}
	return NULL;
}
/* MTU selection:
 * 1. mtu on route is locked - use it
 * 2. mtu from nexthop exception
 * 3. mtu from egress device
 */
u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
{
	struct fib_nh_common *nhc = res->nhc;
	struct net_device *dev = nhc->nhc_dev;
	struct fib_info *fi = res->fi;
	u32 mtu = 0;

	if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) ||
	    fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
		mtu = fi->fib_mtu;

	if (likely(!mtu)) {
		struct fib_nh_exception *fnhe;

		fnhe = find_exception(nhc, daddr);
		if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
			mtu = fnhe->fnhe_pmtu;
	}

	if (likely(!mtu))
		mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);

	return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
}
1444 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1445 __be32 daddr, const bool do_cache)
1449 spin_lock_bh(&fnhe_lock);
1451 if (daddr == fnhe->fnhe_daddr) {
1452 struct rtable __rcu **porig;
1453 struct rtable *orig;
1454 int genid = fnhe_genid(dev_net(rt->dst.dev));
1456 if (rt_is_input_route(rt))
1457 porig = &fnhe->fnhe_rth_input;
1459 porig = &fnhe->fnhe_rth_output;
1460 orig = rcu_dereference(*porig);
1462 if (fnhe->fnhe_genid != genid) {
1463 fnhe->fnhe_genid = genid;
1465 fnhe->fnhe_pmtu = 0;
1466 fnhe->fnhe_expires = 0;
1467 fnhe->fnhe_mtu_locked = false;
1468 fnhe_flush_routes(fnhe);
1471 fill_route_from_fnhe(rt, fnhe);
1474 rt->rt_gw_family = AF_INET;
1479 rcu_assign_pointer(*porig, rt);
1481 dst_dev_put(&orig->dst);
1482 dst_release(&orig->dst);
1487 fnhe->fnhe_stamp = jiffies;
1489 spin_unlock_bh(&fnhe_lock);
1494 static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1496 struct rtable *orig, *prev, **p;
1499 if (rt_is_input_route(rt)) {
1500 p = (struct rtable **)&nhc->nhc_rth_input;
1502 p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
1506 /* hold dst before doing cmpxchg() to avoid race condition
1510 prev = cmpxchg(p, orig, rt);
1513 rt_add_uncached_list(orig);
1514 dst_release(&orig->dst);
1517 dst_release(&rt->dst);
struct uncached_list {
	spinlock_t		lock;
	struct list_head	head;
};
1529 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1531 void rt_add_uncached_list(struct rtable *rt)
1533 struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1535 rt->rt_uncached_list = ul;
1537 spin_lock_bh(&ul->lock);
1538 list_add_tail(&rt->rt_uncached, &ul->head);
1539 spin_unlock_bh(&ul->lock);
1542 void rt_del_uncached_list(struct rtable *rt)
1544 if (!list_empty(&rt->rt_uncached)) {
1545 struct uncached_list *ul = rt->rt_uncached_list;
1547 spin_lock_bh(&ul->lock);
1548 list_del(&rt->rt_uncached);
1549 spin_unlock_bh(&ul->lock);
1553 static void ipv4_dst_destroy(struct dst_entry *dst)
1555 struct rtable *rt = (struct rtable *)dst;
1557 ip_dst_metrics_put(dst);
1558 rt_del_uncached_list(rt);
1561 void rt_flush_dev(struct net_device *dev)
1566 for_each_possible_cpu(cpu) {
1567 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1569 spin_lock_bh(&ul->lock);
1570 list_for_each_entry(rt, &ul->head, rt_uncached) {
1571 if (rt->dst.dev != dev)
1573 rt->dst.dev = blackhole_netdev;
1574 dev_hold(rt->dst.dev);
1577 spin_unlock_bh(&ul->lock);
1581 static bool rt_cache_valid(const struct rtable *rt)
1584 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1588 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1589 const struct fib_result *res,
1590 struct fib_nh_exception *fnhe,
1591 struct fib_info *fi, u16 type, u32 itag,
1592 const bool do_cache)
1594 bool cached = false;
1597 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1599 if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1600 rt->rt_uses_gateway = 1;
1601 rt->rt_gw_family = nhc->nhc_gw_family;
1602 /* only INET and INET6 are supported */
1603 if (likely(nhc->nhc_gw_family == AF_INET))
1604 rt->rt_gw4 = nhc->nhc_gw.ipv4;
1606 rt->rt_gw6 = nhc->nhc_gw.ipv6;
1609 ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1611 #ifdef CONFIG_IP_ROUTE_CLASSID
1612 if (nhc->nhc_family == AF_INET) {
1615 nh = container_of(nhc, struct fib_nh, nh_common);
1616 rt->dst.tclassid = nh->nh_tclassid;
1619 rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1621 cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1623 cached = rt_cache_route(nhc, rt);
1624 if (unlikely(!cached)) {
1625 /* Routes we intend to cache in nexthop exception or
1626 * FIB nexthop have the DST_NOCACHE bit clear.
1627 * However, if we are unsuccessful at storing this
1628 * route into the cache we really need to set it.
1631 rt->rt_gw_family = AF_INET;
1634 rt_add_uncached_list(rt);
1637 rt_add_uncached_list(rt);
1639 #ifdef CONFIG_IP_ROUTE_CLASSID
1640 #ifdef CONFIG_IP_MULTIPLE_TABLES
1641 set_class_tag(rt, res->tclassid);
1643 set_class_tag(rt, itag);
1647 struct rtable *rt_dst_alloc(struct net_device *dev,
1648 unsigned int flags, u16 type,
1649 bool nopolicy, bool noxfrm, bool will_cache)
1653 rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1654 (will_cache ? 0 : DST_HOST) |
1655 (nopolicy ? DST_NOPOLICY : 0) |
1656 (noxfrm ? DST_NOXFRM : 0));
1659 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1660 rt->rt_flags = flags;
1662 rt->rt_is_input = 0;
1665 rt->rt_mtu_locked = 0;
1666 rt->rt_uses_gateway = 0;
1667 rt->rt_gw_family = 0;
1669 INIT_LIST_HEAD(&rt->rt_uncached);
1671 rt->dst.output = ip_output;
1672 if (flags & RTCF_LOCAL)
1673 rt->dst.input = ip_local_deliver;
1678 EXPORT_SYMBOL(rt_dst_alloc);
1680 struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1682 struct rtable *new_rt;
1684 new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1688 new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1689 new_rt->rt_flags = rt->rt_flags;
1690 new_rt->rt_type = rt->rt_type;
1691 new_rt->rt_is_input = rt->rt_is_input;
1692 new_rt->rt_iif = rt->rt_iif;
1693 new_rt->rt_pmtu = rt->rt_pmtu;
1694 new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1695 new_rt->rt_gw_family = rt->rt_gw_family;
1696 if (rt->rt_gw_family == AF_INET)
1697 new_rt->rt_gw4 = rt->rt_gw4;
1698 else if (rt->rt_gw_family == AF_INET6)
1699 new_rt->rt_gw6 = rt->rt_gw6;
1700 INIT_LIST_HEAD(&new_rt->rt_uncached);
1702 new_rt->dst.flags |= DST_HOST;
1703 new_rt->dst.input = rt->dst.input;
1704 new_rt->dst.output = rt->dst.output;
1705 new_rt->dst.error = rt->dst.error;
1706 new_rt->dst.lastuse = jiffies;
1707 new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1711 EXPORT_SYMBOL(rt_dst_clone);
1713 /* called in rcu_read_lock() section */
1714 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1715 u8 tos, struct net_device *dev,
1716 struct in_device *in_dev, u32 *itag)
1720 /* Primary sanity checks. */
1724 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1725 skb->protocol != htons(ETH_P_IP))
1728 if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1731 if (ipv4_is_zeronet(saddr)) {
1732 if (!ipv4_is_local_multicast(daddr) &&
1733 ip_hdr(skb)->protocol != IPPROTO_IGMP)
1736 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1744 /* called in rcu_read_lock() section */
1745 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1746 u8 tos, struct net_device *dev, int our)
1748 struct in_device *in_dev = __in_dev_get_rcu(dev);
1749 unsigned int flags = RTCF_MULTICAST;
1754 err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1759 flags |= RTCF_LOCAL;
1761 rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1762 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
1766 #ifdef CONFIG_IP_ROUTE_CLASSID
1767 rth->dst.tclassid = itag;
1769 rth->dst.output = ip_rt_bug;
	rth->rt_is_input = 1;
1772 #ifdef CONFIG_IP_MROUTE
1773 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1774 rth->dst.input = ip_mr_input;
1776 RT_CACHE_STAT_INC(in_slow_mc);
1779 skb_dst_set(skb, &rth->dst);
1784 static void ip_handle_martian_source(struct net_device *dev,
1785 struct in_device *in_dev,
1786 struct sk_buff *skb,
1790 RT_CACHE_STAT_INC(in_martian_src);
1791 #ifdef CONFIG_IP_ROUTE_VERBOSE
1792 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
		/* RFC1812 recommendation, if source is martian,
		 * the only hint is MAC header.
		 */
1797 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1798 &daddr, &saddr, dev->name);
1799 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1800 print_hex_dump(KERN_WARNING, "ll header: ",
1801 DUMP_PREFIX_OFFSET, 16, 1,
1802 skb_mac_header(skb),
1803 dev->hard_header_len, false);
1809 /* called in rcu_read_lock() section */
1810 static int __mkroute_input(struct sk_buff *skb,
1811 const struct fib_result *res,
1812 struct in_device *in_dev,
1813 __be32 daddr, __be32 saddr, u32 tos)
1815 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1816 struct net_device *dev = nhc->nhc_dev;
1817 struct fib_nh_exception *fnhe;
1820 struct in_device *out_dev;
1824 /* get a working reference to the output device */
1825 out_dev = __in_dev_get_rcu(dev);
1827 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1831 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1832 in_dev->dev, in_dev, &itag);
1834 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1840 do_cache = res->fi && !itag;
1841 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1842 skb->protocol == htons(ETH_P_IP)) {
1845 gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1846 if (IN_DEV_SHARED_MEDIA(out_dev) ||
1847 inet_addr_onlink(out_dev, saddr, gw))
1848 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1851 if (skb->protocol != htons(ETH_P_IP)) {
		/* Not IP (i.e. ARP). Do not create route, if it is
		 * invalid for proxy arp. DNAT routes are always valid.
		 *
		 * The proxy arp feature has been extended to allow ARP
		 * replies back on the same interface, to support
		 * Private VLAN switch technologies. See arp.c.
		 */
1859 if (out_dev == in_dev &&
1860 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1866 fnhe = find_exception(nhc, daddr);
1869 rth = rcu_dereference(fnhe->fnhe_rth_input);
1871 rth = rcu_dereference(nhc->nhc_rth_input);
1872 if (rt_cache_valid(rth)) {
1873 skb_dst_set_noref(skb, &rth->dst);
1878 rth = rt_dst_alloc(out_dev->dev, 0, res->type,
1879 IN_DEV_CONF_GET(in_dev, NOPOLICY),
1880 IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
1886 rth->rt_is_input = 1;
1887 RT_CACHE_STAT_INC(in_slow_tot);
1889 rth->dst.input = ip_forward;
1891 rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1893 lwtunnel_set_redirect(&rth->dst);
1894 skb_dst_set(skb, &rth->dst);
1901 #ifdef CONFIG_IP_ROUTE_MULTIPATH
/* To make ICMP packets follow the right flow, the multipath hash is
 * calculated from the inner IP addresses.
 */
1905 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1906 struct flow_keys *hash_keys)
1908 const struct iphdr *outer_iph = ip_hdr(skb);
1909 const struct iphdr *key_iph = outer_iph;
1910 const struct iphdr *inner_iph;
1911 const struct icmphdr *icmph;
1912 struct iphdr _inner_iph;
1913 struct icmphdr _icmph;
	if (likely(outer_iph->protocol != IPPROTO_ICMP))
		goto out;

	if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
		goto out;
	icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
				   &_icmph);
	if (!icmph)
		goto out;
1926 if (icmph->type != ICMP_DEST_UNREACH &&
1927 icmph->type != ICMP_REDIRECT &&
1928 icmph->type != ICMP_TIME_EXCEEDED &&
	    icmph->type != ICMP_PARAMETERPROB)
		goto out;
	inner_iph = skb_header_pointer(skb,
				       outer_iph->ihl * 4 + sizeof(_icmph),
				       sizeof(_inner_iph), &_inner_iph);
	if (!inner_iph)
		goto out;

	key_iph = inner_iph;
out:
	hash_keys->addrs.v4addrs.src = key_iph->saddr;
	hash_keys->addrs.v4addrs.dst = key_iph->daddr;
}
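
/* Illustrative effect: an ICMP_DEST_UNREACH generated for a forwarded
 * TCP flow A -> B hashes on A/B taken from the quoted inner header, so
 * the error follows the same multipath leg as the flow it refers to.
 */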
1944 /* if skb is set it will be used and fl4 can be NULL */
1945 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1946 const struct sk_buff *skb, struct flow_keys *flkeys)
1948 u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1949 struct flow_keys hash_keys;
1952 switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1954 memset(&hash_keys, 0, sizeof(hash_keys));
1955 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1957 ip_multipath_l3_keys(skb, &hash_keys);
1959 hash_keys.addrs.v4addrs.src = fl4->saddr;
1960 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1964 /* skb is currently provided only when forwarding */
1966 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1967 struct flow_keys keys;
1969 /* short-circuit if we already have L4 hash present */
1971 return skb_get_hash_raw(skb) >> 1;
1973 memset(&hash_keys, 0, sizeof(hash_keys));
1976 skb_flow_dissect_flow_keys(skb, &keys, flag);
1980 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1981 hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
1982 hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
1983 hash_keys.ports.src = flkeys->ports.src;
1984 hash_keys.ports.dst = flkeys->ports.dst;
1985 hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
1987 memset(&hash_keys, 0, sizeof(hash_keys));
1988 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1989 hash_keys.addrs.v4addrs.src = fl4->saddr;
1990 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1991 hash_keys.ports.src = fl4->fl4_sport;
1992 hash_keys.ports.dst = fl4->fl4_dport;
1993 hash_keys.basic.ip_proto = fl4->flowi4_proto;
1997 memset(&hash_keys, 0, sizeof(hash_keys));
1998 /* skb is currently provided only when forwarding */
2000 struct flow_keys keys;
2002 skb_flow_dissect_flow_keys(skb, &keys, 0);
2003 /* Inner can be v4 or v6 */
2004 if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2005 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2006 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
2007 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
2008 } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2009 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2010 hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
2011 hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
2012 hash_keys.tags.flow_label = keys.tags.flow_label;
2013 hash_keys.basic.ip_proto = keys.basic.ip_proto;
2015 /* Same as case 0 */
2016 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2017 ip_multipath_l3_keys(skb, &hash_keys);
2020 /* Same as case 0 */
2021 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2022 hash_keys.addrs.v4addrs.src = fl4->saddr;
2023 hash_keys.addrs.v4addrs.dst = fl4->daddr;
2027 mhash = flow_hash_from_keys(&hash_keys);
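	/* When fl4 carries an inner multipath hash from a tunnel
	 * (flowi4_multipath_hash), fold it into the outer hash below with
	 * jhash_2words() so both headers influence path selection.
	 */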
	if (multipath_hash)
		mhash = jhash_2words(mhash, multipath_hash, 0);

	return mhash >> 1;
}
2034 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
2036 static int ip_mkroute_input(struct sk_buff *skb,
2037 struct fib_result *res,
2038 struct in_device *in_dev,
2039 __be32 daddr, __be32 saddr, u32 tos,
2040 struct flow_keys *hkeys)
2042 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2043 if (res->fi && fib_info_num_path(res->fi) > 1) {
2044 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2046 fib_select_multipath(res, h);
2050 /* create a routing cache entry */
2051 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
/* NOTE. We drop all the packets that have a local source
 * address, because every properly looped back packet
 * must have the correct destination already attached by the output routine.
 *
 * Such an approach solves two big problems:
 * 1. Non-simplex devices are handled properly.
 * 2. IP spoofing attempts are filtered with 100% guarantee.
 *
 * Called with rcu_read_lock().
 */
2065 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2066 u8 tos, struct net_device *dev,
2067 struct fib_result *res)
2069 struct in_device *in_dev = __in_dev_get_rcu(dev);
2070 struct flow_keys *flkeys = NULL, _flkeys;
2071 struct net *net = dev_net(dev);
2072 struct ip_tunnel_info *tun_info;
2074 unsigned int flags = 0;
2078 bool do_cache = true;
2080 /* IP on this device is disabled. */
	/* Check for the most weird martians, which may not be detected
	 * by fib_lookup.
	 */
2089 tun_info = skb_tunnel_info(skb);
2090 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2091 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2093 fl4.flowi4_tun_key.tun_id = 0;
2096 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2097 goto martian_source;
2101 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
	/* Accept zero addresses only to limited broadcast;
	 * I am not even sure whether to fix this or not. Waiting for complaints :-)
	 */
2107 if (ipv4_is_zeronet(saddr))
2108 goto martian_source;
2110 if (ipv4_is_zeronet(daddr))
2111 goto martian_destination;
	/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET()
	 * twice, calling it only once when daddr and/or saddr are loopback
	 * addresses.
	 */
2116 if (ipv4_is_loopback(daddr)) {
2117 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2118 goto martian_destination;
2119 } else if (ipv4_is_loopback(saddr)) {
2120 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2121 goto martian_source;
	/*
	 *	Now we are ready to route packet.
	 */
2128 fl4.flowi4_iif = dev->ifindex;
2129 fl4.flowi4_mark = skb->mark;
2130 fl4.flowi4_tos = tos;
2131 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2132 fl4.flowi4_flags = 0;
2135 fl4.flowi4_uid = sock_net_uid(net, NULL);
2136 fl4.flowi4_multipath_hash = 0;
2138 if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2141 fl4.flowi4_proto = 0;
2146 err = fib_lookup(net, &fl4, res, 0);
2148 if (!IN_DEV_FORWARD(in_dev))
2149 err = -EHOSTUNREACH;
2153 if (res->type == RTN_BROADCAST) {
2154 if (IN_DEV_BFORWARD(in_dev))
		/* do not cache if bc_forwarding is enabled */
2157 if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2162 if (res->type == RTN_LOCAL) {
2163 err = fib_validate_source(skb, saddr, daddr, tos,
2164 0, dev, in_dev, &itag);
2166 goto martian_source;
2170 if (!IN_DEV_FORWARD(in_dev)) {
2171 err = -EHOSTUNREACH;
2174 if (res->type != RTN_UNICAST)
2175 goto martian_destination;
2178 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2182 if (skb->protocol != htons(ETH_P_IP))
2185 if (!ipv4_is_zeronet(saddr)) {
2186 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2189 goto martian_source;
2191 flags |= RTCF_BROADCAST;
2192 res->type = RTN_BROADCAST;
2193 RT_CACHE_STAT_INC(in_brd);
2196 do_cache &= res->fi && !itag;
2198 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2200 rth = rcu_dereference(nhc->nhc_rth_input);
2201 if (rt_cache_valid(rth)) {
2202 skb_dst_set_noref(skb, &rth->dst);
2208 rth = rt_dst_alloc(l3mdev_master_dev_rcu(dev) ? : net->loopback_dev,
2209 flags | RTCF_LOCAL, res->type,
2210 IN_DEV_CONF_GET(in_dev, NOPOLICY), false, do_cache);
	rth->dst.output = ip_rt_bug;
2215 #ifdef CONFIG_IP_ROUTE_CLASSID
2216 rth->dst.tclassid = itag;
2218 rth->rt_is_input = 1;
2220 RT_CACHE_STAT_INC(in_slow_tot);
2221 if (res->type == RTN_UNREACHABLE) {
		rth->dst.input = ip_error;
		rth->dst.error = -err;
2224 rth->rt_flags &= ~RTCF_LOCAL;
2228 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2230 rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2231 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2232 WARN_ON(rth->dst.input == lwtunnel_input);
2233 rth->dst.lwtstate->orig_input = rth->dst.input;
2234 rth->dst.input = lwtunnel_input;
2237 if (unlikely(!rt_cache_route(nhc, rth)))
2238 rt_add_uncached_list(rth);
2240 skb_dst_set(skb, &rth->dst);
2245 RT_CACHE_STAT_INC(in_no_route);
2246 res->type = RTN_UNREACHABLE;
	/*
	 *	Do not cache martian addresses: they should be logged (RFC1812)
	 */
2254 martian_destination:
2255 RT_CACHE_STAT_INC(in_martian_dst);
2256 #ifdef CONFIG_IP_ROUTE_VERBOSE
2257 if (IN_DEV_LOG_MARTIANS(in_dev))
2258 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2259 &daddr, &saddr, dev->name);
2271 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2275 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2276 u8 tos, struct net_device *dev)
2278 struct fib_result res;
2281 tos &= IPTOS_RT_MASK;
2283 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2288 EXPORT_SYMBOL(ip_route_input_noref);
2290 /* called with rcu_read_lock held */
2291 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2292 u8 tos, struct net_device *dev, struct fib_result *res)
{
	/* Multicast recognition logic is moved from route cache to here.
	 * The problem was that too many Ethernet cards have broken/missing
	 * hardware multicast filters :-( As a result the host on a multicasting
	 * network acquires a lot of useless route cache entries, a sort of
	 * SDR messages from all the world. Now we try to get rid of them.
	 * Really, provided software IP multicast filter is organized
	 * reasonably (at least, hashed), it does not result in a slowdown
	 * compared with route cache reject entries.
	 * Note that multicast routers are not affected, because
	 * a route cache entry is created eventually.
	 */
2305 if (ipv4_is_multicast(daddr)) {
2306 struct in_device *in_dev = __in_dev_get_rcu(dev);
2312 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2313 ip_hdr(skb)->protocol);
2315 /* check l3 master if no match yet */
2316 if (!our && netif_is_l3_slave(dev)) {
2317 struct in_device *l3_in_dev;
2319 l3_in_dev = __in_dev_get_rcu(skb->dev);
2321 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2322 ip_hdr(skb)->protocol);
2326 #ifdef CONFIG_IP_MROUTE
2328 (!ipv4_is_local_multicast(daddr) &&
2329 IN_DEV_MFORWARD(in_dev))
2332 err = ip_route_input_mc(skb, daddr, saddr,
2338 return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2341 /* called with rcu_read_lock() */
2342 static struct rtable *__mkroute_output(const struct fib_result *res,
2343 const struct flowi4 *fl4, int orig_oif,
2344 struct net_device *dev_out,
2347 struct fib_info *fi = res->fi;
2348 struct fib_nh_exception *fnhe;
2349 struct in_device *in_dev;
2350 u16 type = res->type;
2354 in_dev = __in_dev_get_rcu(dev_out);
2356 return ERR_PTR(-EINVAL);
2358 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2359 if (ipv4_is_loopback(fl4->saddr) &&
2360 !(dev_out->flags & IFF_LOOPBACK) &&
2361 !netif_is_l3_master(dev_out))
2362 return ERR_PTR(-EINVAL);
2364 if (ipv4_is_lbcast(fl4->daddr))
2365 type = RTN_BROADCAST;
2366 else if (ipv4_is_multicast(fl4->daddr))
2367 type = RTN_MULTICAST;
2368 else if (ipv4_is_zeronet(fl4->daddr))
2369 return ERR_PTR(-EINVAL);
2371 if (dev_out->flags & IFF_LOOPBACK)
2372 flags |= RTCF_LOCAL;
2375 if (type == RTN_BROADCAST) {
2376 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2378 } else if (type == RTN_MULTICAST) {
2379 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2380 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2382 flags &= ~RTCF_LOCAL;
2385 /* If multicast route do not exist use
2386 * default one, but do not gateway in this case.
2389 if (fi && res->prefixlen < 4)
2391 } else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
2392 (orig_oif != dev_out->ifindex)) {
2393 /* For local routes that require a particular output interface
2394 * we do not want to cache the result. Caching the result
2395 * causes incorrect behaviour when there are multiple source
2396 * addresses on the interface, the end result being that if the
2397 * intended recipient is waiting on that interface for the
2398 * packet he won't receive it because it will be delivered on
2399 * the loopback interface and the IP_PKTINFO ipi_ifindex will
		 * be set to the loopback interface as well.
		 */
2406 do_cache &= fi != NULL;
2408 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2409 struct rtable __rcu **prth;
2411 fnhe = find_exception(nhc, fl4->daddr);
2415 prth = &fnhe->fnhe_rth_output;
2417 if (unlikely(fl4->flowi4_flags &
2418 FLOWI_FLAG_KNOWN_NH &&
2419 !(nhc->nhc_gw_family &&
2420 nhc->nhc_scope == RT_SCOPE_LINK))) {
2424 prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
2426 rth = rcu_dereference(*prth);
2427 if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))
2432 rth = rt_dst_alloc(dev_out, flags, type,
2433 IN_DEV_CONF_GET(in_dev, NOPOLICY),
2434 IN_DEV_CONF_GET(in_dev, NOXFRM),
2437 return ERR_PTR(-ENOBUFS);
2439 rth->rt_iif = orig_oif;
2441 RT_CACHE_STAT_INC(out_slow_tot);
2443 if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
2444 if (flags & RTCF_LOCAL &&
2445 !(dev_out->flags & IFF_LOOPBACK)) {
2446 rth->dst.output = ip_mc_output;
2447 RT_CACHE_STAT_INC(out_slow_mc);
2449 #ifdef CONFIG_IP_MROUTE
2450 if (type == RTN_MULTICAST) {
2451 if (IN_DEV_MFORWARD(in_dev) &&
2452 !ipv4_is_local_multicast(fl4->daddr)) {
2453 rth->dst.input = ip_mr_input;
2454 rth->dst.output = ip_mc_output;
2460 rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
2461 lwtunnel_set_redirect(&rth->dst);
/*
 * Major route resolver routine.
 */
2470 struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
2471 const struct sk_buff *skb)
2473 __u8 tos = RT_FL_TOS(fl4);
2474 struct fib_result res = {
2482 fl4->flowi4_iif = LOOPBACK_IFINDEX;
2483 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
2484 fl4->flowi4_scope = ((tos & RTO_ONLINK) ?
2485 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE);
2488 rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
2493 EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
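/* Example (sketch): a minimal output-route lookup through the resolver
 * above. The helper name and the error handling are illustrative only;
 * real callers normally go through ip_route_output_key() or
 * ip_route_output_flow(), both of which end up here.
 */
static inline int example_output_lookup(struct net *net, __be32 daddr)
{
	struct flowi4 fl4 = {
		.daddr = daddr,
		.flowi4_proto = IPPROTO_UDP,
	};
	struct rtable *rt;

	rt = ip_route_output_key(net, &fl4);	/* wraps the hash resolver */
	if (IS_ERR(rt))
		return PTR_ERR(rt);
	ip_rt_put(rt);				/* drop the dst reference */
	return 0;
}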
struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
					    struct fib_result *res,
					    const struct sk_buff *skb)
{
	struct net_device *dev_out = NULL;
	int orig_oif = fl4->flowi4_oif;
	unsigned int flags = 0;
	struct rtable *rth;
	int err;

	if (fl4->saddr) {
		if (ipv4_is_multicast(fl4->saddr) ||
		    ipv4_is_lbcast(fl4->saddr) ||
		    ipv4_is_zeronet(fl4->saddr)) {
			rth = ERR_PTR(-EINVAL);
			goto out;
		}

		rth = ERR_PTR(-ENETUNREACH);

		/* I removed the check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return the wrong iface if
		      saddr is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with the saddr
		      of another iface. --ANK
		 */

		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			if (!dev_out)
				goto out;

			/* Special hack: the user can direct multicasts
			   and limited broadcast via the necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic, vat and friends to work.
			   They bind a socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of the routing cache they are
			   broken, because we are not allowed to build a
			   multicast path with a loopback source addr (look,
			   the routing cache cannot know that ttl is zero, so
			   the packet will not leave this host and the route
			   is valid). Luckily, this hack is a good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
			goto make_route;
		}

		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
				goto out;
		}
	}


	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);
		if (!dev_out)
			goto out;

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
			goto out;
		}
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr) ||
		    fl4->flowi4_proto == IPPROTO_IGMP) {
			if (!fl4->saddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			goto make_route;
		}
		if (!fl4->saddr) {
			if (ipv4_is_multicast(fl4->daddr))
				fl4->saddr = inet_select_addr(dev_out, 0,
							      fl4->flowi4_scope);
			else if (!fl4->daddr)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_HOST);
		}
	}

	if (!fl4->daddr) {
		fl4->daddr = fl4->saddr;
		if (!fl4->daddr)
			fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = LOOPBACK_IFINDEX;
		res->type = RTN_LOCAL;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	err = fib_lookup(net, fl4, res, 0);
	if (err) {
		res->fi = NULL;
		res->table = NULL;
		if (fl4->flowi4_oif &&
		    (ipv4_is_multicast(fl4->daddr) ||
		    !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
			/* Apparently, routing tables are wrong. Assume
			   that the destination is on link.

			   Why? Because we are allowed to send to an iface
			   even if it has NO routes and NO assigned
			   addresses. When oif is specified, routing
			   tables are looked up with only one purpose:
			   to catch if the destination is gatewayed, rather
			   than direct. Moreover, if MSG_DONTROUTE is set,
			   we send a packet, ignoring both routing tables
			   and ifaddr state. --ANK

			   We could make it even if oif is unknown,
			   likely IPv6, but we do not.
			 */

			if (fl4->saddr == 0)
				fl4->saddr = inet_select_addr(dev_out, 0,
							      RT_SCOPE_LINK);
			res->type = RTN_UNICAST;
			goto make_route;
		}
		rth = ERR_PTR(err);
		goto out;
	}

	if (res->type == RTN_LOCAL) {
		if (!fl4->saddr) {
			if (res->fi->fib_prefsrc)
				fl4->saddr = res->fi->fib_prefsrc;
			else
				fl4->saddr = fl4->daddr;
		}

		/* L3 master device is the loopback for that domain */
		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :
			net->loopback_dev;

		/* make sure orig_oif points to fib result device even
		 * though packet rx/tx happens over loopback or l3mdev
		 */
		orig_oif = FIB_RES_OIF(*res);

		fl4->flowi4_oif = dev_out->ifindex;
		flags |= RTCF_LOCAL;
		goto make_route;
	}

	fib_select_path(net, res, fl4, skb);

	dev_out = FIB_RES_DEV(*res);

make_route:
	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);

out:
	return rth;
}
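/* Example: the "special hack" above is what makes the classic userspace
 * pattern work -- bind to a local source address, then send to a
 * multicast group without ever setting IP_MULTICAST_IF:
 *
 *	bind(fd, (struct sockaddr *)&local_addr, sizeof(local_addr));
 *	sendto(fd, buf, len, 0, (struct sockaddr *)&group, sizeof(group));
 *
 * With no oif in the flow, the resolver picks the device that owns
 * local_addr as the output interface.
 */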
static struct dst_entry *ipv4_blackhole_dst_check(struct dst_entry *dst, u32 cookie)
{
	return NULL;
}

static unsigned int ipv4_blackhole_mtu(const struct dst_entry *dst)
{
	unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);

	return mtu ? : dst->dev->mtu;
}

static void ipv4_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
					  struct sk_buff *skb, u32 mtu,
					  bool confirm_neigh)
{
}

static void ipv4_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
				       struct sk_buff *skb)
{
}

static u32 *ipv4_rt_blackhole_cow_metrics(struct dst_entry *dst,
					  unsigned long old)
{
	return NULL;
}

static struct dst_ops ipv4_dst_blackhole_ops = {
	.family			= AF_INET,
	.check			= ipv4_blackhole_dst_check,
	.mtu			= ipv4_blackhole_mtu,
	.default_advmss		= ipv4_default_advmss,
	.update_pmtu		= ipv4_rt_blackhole_update_pmtu,
	.redirect		= ipv4_rt_blackhole_redirect,
	.cow_metrics		= ipv4_rt_blackhole_cow_metrics,
	.neigh_lookup		= ipv4_neigh_lookup,
};

struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
	struct rtable *ort = (struct rtable *) dst_orig;
	struct rtable *rt;

	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
	if (rt) {
		struct dst_entry *new = &rt->dst;

		new->__use = 1;
		new->input = dst_discard;
		new->output = dst_discard_out;

		new->dev = net->loopback_dev;
		if (new->dev)
			dev_hold(new->dev);

		rt->rt_is_input = ort->rt_is_input;
		rt->rt_iif = ort->rt_iif;
		rt->rt_pmtu = ort->rt_pmtu;
		rt->rt_mtu_locked = ort->rt_mtu_locked;

		rt->rt_genid = rt_genid_ipv4(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_uses_gateway = ort->rt_uses_gateway;
		rt->rt_gw_family = ort->rt_gw_family;
		if (rt->rt_gw_family == AF_INET)
			rt->rt_gw4 = ort->rt_gw4;
		else if (rt->rt_gw_family == AF_INET6)
			rt->rt_gw6 = ort->rt_gw6;

		INIT_LIST_HEAD(&rt->rt_uncached);
	}

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
}
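/* Example (sketch): how a caller might park a socket on a blackhole
 * route, roughly what the XFRM layer does while IPsec state is still
 * being resolved. The helper name is illustrative; the real call sites
 * live in xfrm_lookup() and friends.
 */
static inline void example_blackhole_park(struct net *net, struct sock *sk,
					  struct dst_entry *dst)
{
	dst = ipv4_blackhole_route(net, dst);	/* consumes @dst */
	if (!IS_ERR(dst))
		sk_dst_set(sk, dst);		/* packets are now discarded */
}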
struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
				    const struct sock *sk)
{
	struct rtable *rt = __ip_route_output_key(net, flp4);

	if (IS_ERR(rt))
		return rt;

	if (flp4->flowi4_proto) {
		flp4->flowi4_oif = rt->dst.dev->ifindex;
		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
							flowi4_to_flowi(flp4),
							sk, 0);
	}

	return rt;
}
EXPORT_SYMBOL_GPL(ip_route_output_flow);
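/* Example (sketch): a typical connected-socket lookup through
 * ip_route_output_flow(). The helper name and addresses are
 * placeholders; real callers (e.g. udp_sendmsg()) fill the flow from
 * the socket and the send request.
 */
static inline struct rtable *example_connect_route(struct net *net,
						   struct sock *sk,
						   __be32 daddr, __be16 dport)
{
	struct inet_sock *inet = inet_sk(sk);
	struct flowi4 fl4;

	flowi4_init_output(&fl4, sk->sk_bound_dev_if, sk->sk_mark,
			   RT_TOS(inet->tos), RT_SCOPE_UNIVERSE,
			   sk->sk_protocol, 0, daddr, inet->inet_saddr,
			   dport, inet->inet_sport, sk->sk_uid);
	return ip_route_output_flow(net, &fl4, sk);
}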
/* called with rcu_read_lock held */
static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
			struct rtable *rt, u32 table_id, struct flowi4 *fl4,
			struct sk_buff *skb, u32 portid, u32 seq,
			unsigned int flags)
{
	struct rtmsg *r;
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	u32 error;
	u32 metrics[RTAX_MAX];

	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);
	if (!nlh)
		return -EMSGSIZE;

	r = nlmsg_data(nlh);
	r->rtm_family	 = AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_src_len	= 0;
	r->rtm_tos	= fl4 ? fl4->flowi4_tos : 0;
	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table_id))
		goto nla_put_failure;
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;
	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
		r->rtm_flags |= RTCF_DOREDIRECT;

	if (nla_put_in_addr(skb, RTA_DST, dst))
		goto nla_put_failure;
	if (src) {
		r->rtm_src_len = 32;
		if (nla_put_in_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
	}
	if (rt->dst.dev &&
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid &&
	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
		goto nla_put_failure;
#endif
	if (fl4 && !rt_is_input_route(rt) &&
	    fl4->saddr != src) {
		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
			goto nla_put_failure;
	}
	if (rt->rt_uses_gateway) {
		if (rt->rt_gw_family == AF_INET &&
		    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
			goto nla_put_failure;
		} else if (rt->rt_gw_family == AF_INET6) {
			int alen = sizeof(struct in6_addr);
			struct nlattr *nla;
			struct rtvia *via;

			nla = nla_reserve(skb, RTA_VIA, alen + 2);
			if (!nla)
				goto nla_put_failure;

			via = nla_data(nla);
			via->rtvia_family = AF_INET6;
			memcpy(via->rtvia_addr, &rt->rt_gw6, alen);
		}
	}

	expires = rt->dst.expires;
	if (expires) {
		unsigned long now = jiffies;

		if (time_before(now, expires))
			expires -= now;
		else
			expires = 0;
	}

	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt_pmtu && expires)
		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
	if (rt->rt_mtu_locked && expires)
		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

	if (fl4) {
		if (fl4->flowi4_mark &&
		    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
			goto nla_put_failure;

		if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
		    nla_put_u32(skb, RTA_UID,
				from_kuid_munged(current_user_ns(),
						 fl4->flowi4_uid)))
			goto nla_put_failure;

		if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
			if (ipv4_is_multicast(dst) &&
			    !ipv4_is_local_multicast(dst) &&
			    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
				int err = ipmr_get_route(net, skb,
							 fl4->saddr, fl4->daddr,
							 r, portid);

				if (err <= 0) {
					if (err == 0)
						return 0;
					goto nla_put_failure;
				}
			} else
#endif
				if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
					goto nla_put_failure;
		}
	}

	error = rt->dst.error;

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
		goto nla_put_failure;

	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
			    struct netlink_callback *cb, u32 table_id,
			    struct fnhe_hash_bucket *bucket, int genid,
			    int *fa_index, int fa_start, unsigned int flags)
{
	int i;

	for (i = 0; i < FNHE_HASH_SIZE; i++) {
		struct fib_nh_exception *fnhe;

		for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
		     fnhe = rcu_dereference(fnhe->fnhe_next)) {
			struct rtable *rt;
			int err;

			if (*fa_index < fa_start)
				goto next;

			if (fnhe->fnhe_genid != genid)
				goto next;

			if (fnhe->fnhe_expires &&
			    time_after(jiffies, fnhe->fnhe_expires))
				goto next;

			rt = rcu_dereference(fnhe->fnhe_rth_input);
			if (!rt)
				rt = rcu_dereference(fnhe->fnhe_rth_output);
			if (!rt)
				goto next;

			err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
					   table_id, NULL, skb,
					   NETLINK_CB(cb->skb).portid,
					   cb->nlh->nlmsg_seq, flags);
			if (err)
				return err;
next:
			(*fa_index)++;
		}
	}

	return 0;
}

int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
		       u32 table_id, struct fib_info *fi,
		       int *fa_index, int fa_start, unsigned int flags)
{
	struct net *net = sock_net(cb->skb->sk);
	int nhsel, genid = fnhe_genid(net);

	for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
		struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
		struct fnhe_hash_bucket *bucket;
		int err;

		if (nhc->nhc_flags & RTNH_F_DEAD)
			continue;

		rcu_read_lock();
		bucket = rcu_dereference(nhc->nhc_exceptions);
		err = 0;
		if (bucket)
			err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
					       genid, fa_index, fa_start,
					       flags);
		rcu_read_unlock();
		if (err)
			return err;
	}

	return 0;
}
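/* Example: these two helpers are what let userspace list cached route
 * exceptions (PMTU values learned from ICMP, redirects), e.g.:
 *
 *	ip route show cache
 *
 * Each exception is reported through rt_fill_info() with RTM_F_CLONED
 * set, so it is distinguishable from real FIB entries.
 */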
static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
						   u8 ip_proto, __be16 sport,
						   __be16 dport)
{
	struct sk_buff *skb;
	struct iphdr *iph;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return NULL;

	/* Reserve room for dummy headers, this skb can pass
	 * through a good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);
	skb->protocol = htons(ETH_P_IP);
	iph = skb_put(skb, sizeof(struct iphdr));
	iph->protocol = ip_proto;
	iph->saddr = src;
	iph->daddr = dst;
	iph->version = 0x4;
	iph->frag_off = 0;
	iph->ihl = 0x5;
	skb_set_transport_header(skb, skb->len);

	switch (iph->protocol) {
	case IPPROTO_UDP: {
		struct udphdr *udph;

		udph = skb_put_zero(skb, sizeof(struct udphdr));
		udph->source = sport;
		udph->dest = dport;
		udph->len = htons(sizeof(struct udphdr));
		udph->check = 0;
		break;
	}
	case IPPROTO_TCP: {
		struct tcphdr *tcph;

		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
		tcph->source	= sport;
		tcph->dest	= dport;
		tcph->doff	= sizeof(struct tcphdr) / 4;
		tcph->rst = 1;
		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
					    src, dst, 0);
		break;
	}
	case IPPROTO_ICMP: {
		struct icmphdr *icmph;

		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
		icmph->type = ICMP_ECHO;
		icmph->code = 0;
	}
	}

	return skb;
}
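/* Example: the dummy L3/L4 headers built here let "ip route get" with
 * transport selectors exercise the same paths as real traffic, e.g.:
 *
 *	ip route get 192.0.2.1 ipproto udp sport 12345 dport 53
 *
 * which matters for fib rules and multipath hashing that key on ports.
 */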
static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
				       const struct nlmsghdr *nlh,
				       struct nlattr **tb,
				       struct netlink_ext_ack *extack)
{
	struct rtmsg *rtm;
	int i, err;

	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
		NL_SET_ERR_MSG(extack,
			       "ipv4: Invalid header for route get request");
		return -EINVAL;
	}

	if (!netlink_strict_get_check(skb))
		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
					      rtm_ipv4_policy, extack);

	rtm = nlmsg_data(nlh);
	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
	    rtm->rtm_table || rtm->rtm_protocol ||
	    rtm->rtm_scope || rtm->rtm_type) {
		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");
		return -EINVAL;
	}

	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
			       RTM_F_LOOKUP_TABLE |
			       RTM_F_FIB_MATCH)) {
		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");
		return -EINVAL;
	}

	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
					    rtm_ipv4_policy, extack);
	if (err)
		return err;

	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");
		return -EINVAL;
	}

	for (i = 0; i <= RTA_MAX; i++) {
		if (!tb[i])
			continue;

		switch (i) {
		case RTA_IIF:
		case RTA_OIF:
		case RTA_SRC:
		case RTA_DST:
		case RTA_IP_PROTO:
		case RTA_SPORT:
		case RTA_DPORT:
		case RTA_MARK:
		case RTA_UID:
			break;
		default:
			NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
			return -EINVAL;
		}
	}

	return 0;
}
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			     struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX+1];
	u32 table_id = RT_TABLE_MAIN;
	__be16 sport = 0, dport = 0;
	struct fib_result res = {};
	u8 ip_proto = IPPROTO_UDP;
	struct rtable *rt = NULL;
	struct sk_buff *skb;
	struct rtmsg *rtm;
	struct flowi4 fl4 = {};
	__be32 dst = 0;
	__be32 src = 0;
	kuid_t uid;
	u32 iif;
	int err;
	int mark;

	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);
	if (err < 0)
		return err;

	rtm = nlmsg_data(nlh);
	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
	if (tb[RTA_UID])
		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
	else
		uid = (iif ? INVALID_UID : current_uid());

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &ip_proto, AF_INET, extack);
		if (err)
			return err;
	}

	if (tb[RTA_SPORT])
		sport = nla_get_be16(tb[RTA_SPORT]);

	if (tb[RTA_DPORT])
		dport = nla_get_be16(tb[RTA_DPORT]);

	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);
	if (!skb)
		return -ENOBUFS;

	fl4.daddr = dst;
	fl4.saddr = src;
	fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
	fl4.flowi4_mark = mark;
	fl4.flowi4_uid = uid;
	if (sport)
		fl4.fl4_sport = sport;
	if (dport)
		fl4.fl4_dport = dport;
	fl4.flowi4_proto = ip_proto;

	rcu_read_lock();

	if (iif) {
		struct net_device *dev;

		dev = dev_get_by_index_rcu(net, iif);
		if (!dev) {
			err = -ENODEV;
			goto errout_rcu;
		}

		fl4.flowi4_iif = iif; /* for rt_fill_info */
		skb->dev	= dev;
		skb->mark	= mark;
		err = ip_route_input_rcu(skb, dst, src,
					 rtm->rtm_tos & IPTOS_RT_MASK, dev,
					 &res);

		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
	} else {
		fl4.flowi4_iif = LOOPBACK_IFINDEX;
		skb->dev = net->loopback_dev;
		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
		err = 0;
		if (IS_ERR(rt))
			err = PTR_ERR(rt);
		else
			skb_dst_set(skb, &rt->dst);
	}

	if (err)
		goto errout_rcu;

	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
		table_id = res.table ? res.table->tb_id : 0;

	/* reset skb for netlink reply msg */
	skb_trim(skb, 0);
	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb_reset_mac_header(skb);

	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
		if (!res.fi) {
			err = fib_props[res.type].error;
			if (!err)
				err = -EHOSTUNREACH;
			goto errout_rcu;
		}
		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
				    rt->rt_type, res.prefix, res.prefixlen,
				    fl4.flowi4_tos, res.fi, 0);
	} else {
		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
				   NETLINK_CB(in_skb).portid,
				   nlh->nlmsg_seq, 0);
	}
	if (err < 0)
		goto errout_rcu;

	rcu_read_unlock();

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);

errout_free:
	return err;

errout_rcu:
	rcu_read_unlock();
	kfree_skb(skb);
	goto errout_free;
}
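/* Example: this doit handler answers RTM_GETROUTE requests such as the
 * ones iproute2 issues for:
 *
 *	ip route get 192.0.2.1
 *	ip route get 192.0.2.1 from 198.51.100.1 iif eth0
 *	ip route get 192.0.2.1 fibmatch
 *
 * "fibmatch" sets RTM_F_FIB_MATCH, returning the matching FIB entry
 * rather than the resolved (possibly cloned) route.
 */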
void ip_rt_multicast_event(struct in_device *in_dev)
{
	rt_cache_flush(dev_net(in_dev->dev));
}
#ifdef CONFIG_SYSCTL
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;

static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
				     void __user *buffer,
				     size_t *lenp, loff_t *ppos)
{
	struct net *net = (struct net *)__ctl->extra1;

	if (write) {
		rt_cache_flush(net);
		fnhe_genid_bump(net);
		return 0;
	}

	return -EINVAL;
}
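/* Example: any write to the (write-only) flush file invalidates both
 * the cached routes and the exception entries of this netns:
 *
 *	echo 1 > /proc/sys/net/ipv4/route/flush
 */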
static struct ctl_table ipv4_route_table[] = {
	{
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		/* Deprecated. Use gc_min_interval_ms */
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &ip_min_valid_pmtu,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{ }
};
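/* Example: the table above appears under /proc/sys/net/ipv4/route/,
 * so the tunables can be adjusted at runtime, e.g.:
 *
 *	sysctl -w net.ipv4.route.min_pmtu=552
 *	sysctl -w net.ipv4.route.mtu_expires=300
 *
 * mtu_expires is given in seconds (converted via proc_dointvec_jiffies);
 * min_pmtu is clamped below by ip_min_valid_pmtu (IPV4_MIN_MTU).
 */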
static const char ipv4_route_flush_procname[] = "flush";

static struct ctl_table ipv4_route_flush_table[] = {
	{
		.procname	= ipv4_route_flush_procname,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv4_sysctl_rtcache_flush,
	},
	{ },
};

static __net_init int sysctl_route_net_init(struct net *net)
{
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);
		if (!tbl)
			goto err_dup;

		/* Don't export non-whitelisted sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns) {
			if (tbl[0].procname != ipv4_route_flush_procname)
				tbl[0].procname = NULL;
		}
	}
	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
	if (!net->ipv4.route_hdr)
		goto err_reg;
	return 0;

err_reg:
	if (tbl != ipv4_route_flush_table)
		kfree(tbl);
err_dup:
	return -ENOMEM;
}

static __net_exit void sysctl_route_net_exit(struct net *net)
{
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);
	kfree(tbl);
}

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
};
#endif
static __net_init int rt_genid_init(struct net *net)
{
	atomic_set(&net->ipv4.rt_genid, 0);
	atomic_set(&net->fnhe_genid, 0);
	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());
	return 0;
}

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,
};

static int __net_init ipv4_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv4.peers = bp;
	return 0;
}

static void __net_exit ipv4_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init	= ipv4_inetpeer_init,
	.exit	= ipv4_inetpeer_exit,
};

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
int __init ip_rt_init(void)
{
	void *idents_hash;
	int cpu;

	/* For modern hosts, this will use 2 MB of memory */
	idents_hash = alloc_large_system_hash("IP idents",
					      sizeof(*ip_idents) + sizeof(*ip_tstamps),
					      0,
					      16, /* one bucket per 64 KB */
					      HASH_ZERO,
					      NULL,
					      &ip_idents_mask,
					      2048,
					      256*1024);

	ip_idents = idents_hash;

	prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));

	ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}
#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
	if (!ip_rt_acct)
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;

	devinet_init();
	ip_fib_init();

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");
#ifdef CONFIG_XFRM
	xfrm_init();
	xfrm4_init();
#endif
	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
		      RTNL_FLAG_DOIT_UNLOCKED);

#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
	return 0;
}
#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
{
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);
}
#endif