1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
7 * ROUTE - implementation of the IP router.
10 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
11 * Alan Cox, <gw4pts@gw4pts.ampr.org>
12 * Linus Torvalds, <Linus.Torvalds@helsinki.fi>
13 * Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
16 * Alan Cox : Verify area fixes.
17 * Alan Cox : cli() protects routing changes
18 * Rui Oliveira : ICMP routing table updates
19 * (rco@di.uminho.pt) Routing table insertion and update
20 * Linus Torvalds : Rewrote bits to be sensible
21 * Alan Cox : Added BSD route gw semantics
22 * Alan Cox : Super /proc >4K
23 * Alan Cox : MTU in route table
24 * Alan Cox : MSS actually. Also added the window
26 * Sam Lantinga : Fixed route matching in rt_del()
27 * Alan Cox : Routing cache support.
28 * Alan Cox : Removed compatibility cruft.
29 * Alan Cox : RTF_REJECT support.
30 * Alan Cox : TCP irtt support.
31 * Jonathan Naylor : Added Metric support.
32 * Miquel van Smoorenburg : BSD API fixes.
33 * Miquel van Smoorenburg : Metrics.
34 * Alan Cox : Use __u32 properly
35 * Alan Cox : Aligned routing errors more closely with BSD
 *					though our system is still very different.
37 * Alan Cox : Faster /proc handling
38 * Alexey Kuznetsov : Massive rework to support tree based routing,
39 * routing caches and better behaviour.
41 * Olaf Erb : irtt wasn't being copied right.
42 * Bjorn Ekwall : Kerneld route support.
43 * Alan Cox : Multicast fixed (I hope)
44 * Pavel Krauz : Limited broadcast fixed
45 * Mike McLagan : Routing by source
46 * Alexey Kuznetsov : End of old history. Split to fib.c and
47 * route.c and rewritten from scratch.
48 * Andi Kleen : Load-limit warning messages.
 *	Vitaly E. Lavrov	:	Transparent proxy revived after a year in a coma.
50 * Vitaly E. Lavrov : Race condition in ip_route_input_slow.
51 * Tobias Ringstrom : Uninitialized res.type in ip_route_output_slow.
52 * Vladimir V. Ivanov : IP rule info (flowid) is really useful.
53 * Marc Boucher : routing by fwmark
54 * Robert Olsson : Added rt_cache statistics
55 * Arnaldo C. Melo : Convert proc stuff to seq_file
56 * Eric Dumazet : hashed spinlocks and rt_check_expire() fixes.
57 * Ilia Sotnikov : Ignore TOS on PMTUD and Redirect
58 * Ilia Sotnikov : Removed TOS from hash calculations
61 #define pr_fmt(fmt) "IPv4: " fmt
63 #include <linux/module.h>
64 #include <linux/uaccess.h>
65 #include <linux/bitops.h>
66 #include <linux/types.h>
67 #include <linux/kernel.h>
69 #include <linux/memblock.h>
70 #include <linux/string.h>
71 #include <linux/socket.h>
72 #include <linux/sockios.h>
73 #include <linux/errno.h>
75 #include <linux/inet.h>
76 #include <linux/netdevice.h>
77 #include <linux/proc_fs.h>
78 #include <linux/init.h>
79 #include <linux/skbuff.h>
80 #include <linux/inetdevice.h>
81 #include <linux/igmp.h>
82 #include <linux/pkt_sched.h>
83 #include <linux/mroute.h>
84 #include <linux/netfilter_ipv4.h>
85 #include <linux/random.h>
86 #include <linux/rcupdate.h>
87 #include <linux/times.h>
88 #include <linux/slab.h>
89 #include <linux/jhash.h>
91 #include <net/dst_metadata.h>
92 #include <net/net_namespace.h>
93 #include <net/protocol.h>
95 #include <net/route.h>
96 #include <net/inetpeer.h>
98 #include <net/ip_fib.h>
99 #include <net/nexthop.h>
102 #include <net/icmp.h>
103 #include <net/xfrm.h>
104 #include <net/lwtunnel.h>
105 #include <net/netevent.h>
106 #include <net/rtnetlink.h>
108 #include <linux/sysctl.h>
110 #include <net/secure_seq.h>
111 #include <net/ip_tunnels.h>
112 #include <net/l3mdev.h>
114 #include "fib_lookup.h"
116 #define RT_FL_TOS(oldflp4) \
117 ((oldflp4)->flowi4_tos & (IPTOS_RT_MASK | RTO_ONLINK))
119 #define RT_GC_TIMEOUT (300*HZ)
121 static int ip_rt_max_size;
122 static int ip_rt_redirect_number __read_mostly = 9;
123 static int ip_rt_redirect_load __read_mostly = HZ / 50;
124 static int ip_rt_redirect_silence __read_mostly = ((HZ / 50) << (9 + 1));
125 static int ip_rt_error_cost __read_mostly = HZ;
126 static int ip_rt_error_burst __read_mostly = 5 * HZ;
127 static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
128 static u32 ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
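/* Presumably 512 bytes of payload plus a 20-byte IP header and a
 * 20-byte TCP header, i.e. the classic conservative 552-byte floor;
 * __ip_rt_update_pmtu() below refuses to learn a path MTU smaller
 * than this.
 */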
129 static int ip_rt_min_advmss __read_mostly = 256;
131 static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
134 * Interface to generic destination cache.
137 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie);
138 static unsigned int ipv4_default_advmss(const struct dst_entry *dst);
139 static unsigned int ipv4_mtu(const struct dst_entry *dst);
140 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst);
141 static void ipv4_link_failure(struct sk_buff *skb);
142 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
143 struct sk_buff *skb, u32 mtu,
145 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk,
146 struct sk_buff *skb);
147 static void ipv4_dst_destroy(struct dst_entry *dst);
149 static u32 *ipv4_cow_metrics(struct dst_entry *dst, unsigned long old)
155 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
158 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr);
160 static struct dst_ops ipv4_dst_ops = {
162 .check = ipv4_dst_check,
163 .default_advmss = ipv4_default_advmss,
165 .cow_metrics = ipv4_cow_metrics,
166 .destroy = ipv4_dst_destroy,
167 .negative_advice = ipv4_negative_advice,
168 .link_failure = ipv4_link_failure,
169 .update_pmtu = ip_rt_update_pmtu,
170 .redirect = ip_do_redirect,
171 .local_out = __ip_local_out,
172 .neigh_lookup = ipv4_neigh_lookup,
173 .confirm_neigh = ipv4_confirm_neigh,
176 #define ECN_OR_COST(class) TC_PRIO_##class
178 const __u8 ip_tos2prio[16] = {
180 ECN_OR_COST(BESTEFFORT),
182 ECN_OR_COST(BESTEFFORT),
188 ECN_OR_COST(INTERACTIVE),
190 ECN_OR_COST(INTERACTIVE),
191 TC_PRIO_INTERACTIVE_BULK,
192 ECN_OR_COST(INTERACTIVE_BULK),
193 TC_PRIO_INTERACTIVE_BULK,
194 ECN_OR_COST(INTERACTIVE_BULK)
196 EXPORT_SYMBOL(ip_tos2prio);
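/* A usage sketch (assuming the rt_tos2priority() helper from
 * <net/route.h>, which indexes this table with the TOS bits shifted
 * down by one):
 *
 *	skb->priority = rt_tos2priority(ip_hdr(skb)->tos);
 *
 * e.g. IPTOS_LOWDELAY (0x10) indexes entry 8 and therefore yields
 * TC_PRIO_INTERACTIVE.
 */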
198 static DEFINE_PER_CPU(struct rt_cache_stat, rt_cache_stat);
199 #define RT_CACHE_STAT_INC(field) raw_cpu_inc(rt_cache_stat.field)
201 #ifdef CONFIG_PROC_FS
202 static void *rt_cache_seq_start(struct seq_file *seq, loff_t *pos)
206 return SEQ_START_TOKEN;
209 static void *rt_cache_seq_next(struct seq_file *seq, void *v, loff_t *pos)
215 static void rt_cache_seq_stop(struct seq_file *seq, void *v)
219 static int rt_cache_seq_show(struct seq_file *seq, void *v)
221 if (v == SEQ_START_TOKEN)
222 seq_printf(seq, "%-127s\n",
223 "Iface\tDestination\tGateway \tFlags\t\tRefCnt\tUse\t"
224 "Metric\tSource\t\tMTU\tWindow\tIRTT\tTOS\tHHRef\t"
229 static const struct seq_operations rt_cache_seq_ops = {
230 .start = rt_cache_seq_start,
231 .next = rt_cache_seq_next,
232 .stop = rt_cache_seq_stop,
233 .show = rt_cache_seq_show,
236 static int rt_cache_seq_open(struct inode *inode, struct file *file)
238 return seq_open(file, &rt_cache_seq_ops);
241 static const struct proc_ops rt_cache_proc_ops = {
242 .proc_open = rt_cache_seq_open,
243 .proc_read = seq_read,
244 .proc_lseek = seq_lseek,
245 .proc_release = seq_release,
249 static void *rt_cpu_seq_start(struct seq_file *seq, loff_t *pos)
254 return SEQ_START_TOKEN;
256 for (cpu = *pos-1; cpu < nr_cpu_ids; ++cpu) {
257 if (!cpu_possible(cpu))
260 return &per_cpu(rt_cache_stat, cpu);
265 static void *rt_cpu_seq_next(struct seq_file *seq, void *v, loff_t *pos)
269 for (cpu = *pos; cpu < nr_cpu_ids; ++cpu) {
270 if (!cpu_possible(cpu))
273 return &per_cpu(rt_cache_stat, cpu);
280 static void rt_cpu_seq_stop(struct seq_file *seq, void *v)
285 static int rt_cpu_seq_show(struct seq_file *seq, void *v)
287 struct rt_cache_stat *st = v;
289 if (v == SEQ_START_TOKEN) {
290 seq_printf(seq, "entries in_hit in_slow_tot in_slow_mc in_no_route in_brd in_martian_dst in_martian_src out_hit out_slow_tot out_slow_mc gc_total gc_ignored gc_goal_miss gc_dst_overflow in_hlist_search out_hlist_search\n");
seq_printf(seq, "%08x %08x %08x %08x %08x %08x %08x %08x "
295 " %08x %08x %08x %08x %08x %08x %08x %08x %08x \n",
296 dst_entries_get_slow(&ipv4_dst_ops),
309 0, /* st->gc_total */
310 0, /* st->gc_ignored */
311 0, /* st->gc_goal_miss */
312 0, /* st->gc_dst_overflow */
313 0, /* st->in_hlist_search */
314 0 /* st->out_hlist_search */
319 static const struct seq_operations rt_cpu_seq_ops = {
320 .start = rt_cpu_seq_start,
321 .next = rt_cpu_seq_next,
322 .stop = rt_cpu_seq_stop,
323 .show = rt_cpu_seq_show,
327 static int rt_cpu_seq_open(struct inode *inode, struct file *file)
329 return seq_open(file, &rt_cpu_seq_ops);
332 static const struct proc_ops rt_cpu_proc_ops = {
333 .proc_open = rt_cpu_seq_open,
334 .proc_read = seq_read,
335 .proc_lseek = seq_lseek,
336 .proc_release = seq_release,
339 #ifdef CONFIG_IP_ROUTE_CLASSID
340 static int rt_acct_proc_show(struct seq_file *m, void *v)
342 struct ip_rt_acct *dst, *src;
345 dst = kcalloc(256, sizeof(struct ip_rt_acct), GFP_KERNEL);
349 for_each_possible_cpu(i) {
350 src = (struct ip_rt_acct *)per_cpu_ptr(ip_rt_acct, i);
351 for (j = 0; j < 256; j++) {
352 dst[j].o_bytes += src[j].o_bytes;
353 dst[j].o_packets += src[j].o_packets;
354 dst[j].i_bytes += src[j].i_bytes;
355 dst[j].i_packets += src[j].i_packets;
359 seq_write(m, dst, 256 * sizeof(struct ip_rt_acct));
365 static int __net_init ip_rt_do_proc_init(struct net *net)
367 struct proc_dir_entry *pde;
369 pde = proc_create("rt_cache", 0444, net->proc_net,
374 pde = proc_create("rt_cache", 0444,
375 net->proc_net_stat, &rt_cpu_proc_ops);
379 #ifdef CONFIG_IP_ROUTE_CLASSID
380 pde = proc_create_single("rt_acct", 0, net->proc_net,
387 #ifdef CONFIG_IP_ROUTE_CLASSID
389 remove_proc_entry("rt_cache", net->proc_net_stat);
392 remove_proc_entry("rt_cache", net->proc_net);
397 static void __net_exit ip_rt_do_proc_exit(struct net *net)
399 remove_proc_entry("rt_cache", net->proc_net_stat);
400 remove_proc_entry("rt_cache", net->proc_net);
401 #ifdef CONFIG_IP_ROUTE_CLASSID
402 remove_proc_entry("rt_acct", net->proc_net);
406 static struct pernet_operations ip_rt_proc_ops __net_initdata = {
407 .init = ip_rt_do_proc_init,
408 .exit = ip_rt_do_proc_exit,
411 static int __init ip_rt_proc_init(void)
413 return register_pernet_subsys(&ip_rt_proc_ops);
417 static inline int ip_rt_proc_init(void)
421 #endif /* CONFIG_PROC_FS */
423 static inline bool rt_is_expired(const struct rtable *rth)
425 return rth->rt_genid != rt_genid_ipv4(dev_net(rth->dst.dev));
428 void rt_cache_flush(struct net *net)
430 rt_genid_bump_ipv4(net);
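/* Note that flushing is O(1) by design: bumping the per-netns
 * generation id does not walk or free anything, it simply makes
 * rt_is_expired() above return true for every rtable created under
 * the old generation, so stale entries are dropped lazily on their
 * next validation.
 */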
433 static struct neighbour *ipv4_neigh_lookup(const struct dst_entry *dst,
437 const struct rtable *rt = container_of(dst, struct rtable, dst);
438 struct net_device *dev = dst->dev;
443 if (likely(rt->rt_gw_family == AF_INET)) {
444 n = ip_neigh_gw4(dev, rt->rt_gw4);
445 } else if (rt->rt_gw_family == AF_INET6) {
446 n = ip_neigh_gw6(dev, &rt->rt_gw6);
450 pkey = skb ? ip_hdr(skb)->daddr : *((__be32 *) daddr);
451 n = ip_neigh_gw4(dev, pkey);
454 if (!IS_ERR(n) && !refcount_inc_not_zero(&n->refcnt))
457 rcu_read_unlock_bh();
462 static void ipv4_confirm_neigh(const struct dst_entry *dst, const void *daddr)
464 const struct rtable *rt = container_of(dst, struct rtable, dst);
465 struct net_device *dev = dst->dev;
466 const __be32 *pkey = daddr;
468 if (rt->rt_gw_family == AF_INET) {
469 pkey = (const __be32 *)&rt->rt_gw4;
470 } else if (rt->rt_gw_family == AF_INET6) {
471 return __ipv6_confirm_neigh_stub(dev, &rt->rt_gw6);
474 (RTCF_MULTICAST | RTCF_BROADCAST | RTCF_LOCAL))) {
477 __ipv4_confirm_neigh(dev, *(__force u32 *)pkey);
480 /* Hash tables of size 2048..262144 depending on RAM size.
481 * Each bucket uses 8 bytes.
483 static u32 ip_idents_mask __read_mostly;
484 static atomic_t *ip_idents __read_mostly;
485 static u32 *ip_tstamps __read_mostly;
487 /* In order to protect privacy, we add a perturbation to identifiers
 * if one generator is seldom used. This makes it hard for an attacker
489 * to infer how many packets were sent between two points in time.
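/* A worked example of the perturbation (a reading of the code below,
 * not normative): if a bucket's timestamp shows it was last used 100
 * jiffies ago, ip_idents_reserve() adds a random delta in [0, 100) on
 * top of the segment count, so consecutive observed IDs no longer
 * reveal exactly how many packets were sent in between.
 */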
491 u32 ip_idents_reserve(u32 hash, int segs)
493 u32 bucket, old, now = (u32)jiffies;
498 bucket = hash & ip_idents_mask;
499 p_tstamp = ip_tstamps + bucket;
500 p_id = ip_idents + bucket;
501 old = READ_ONCE(*p_tstamp);
503 if (old != now && cmpxchg(p_tstamp, old, now) == old)
504 delta = prandom_u32_max(now - old);
506 /* If UBSAN reports an error there, please make sure your compiler
 * supports -fno-strict-overflow before reporting it; that was a bug
 * in UBSAN, and it has been fixed in GCC-8.
510 return atomic_add_return(segs + delta, p_id) - segs;
512 EXPORT_SYMBOL(ip_idents_reserve);
514 void __ip_select_ident(struct net *net, struct iphdr *iph, int segs)
518 /* Note the following code is not safe, but this is okay. */
519 if (unlikely(siphash_key_is_zero(&net->ipv4.ip_id_key)))
520 get_random_bytes(&net->ipv4.ip_id_key,
521 sizeof(net->ipv4.ip_id_key));
523 hash = siphash_3u32((__force u32)iph->daddr,
524 (__force u32)iph->saddr,
526 &net->ipv4.ip_id_key);
527 id = ip_idents_reserve(hash, segs);
530 EXPORT_SYMBOL(__ip_select_ident);
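/* Why the unlocked key initialization above is "not safe, but okay"
 * (our reading, not authoritative): two CPUs may both observe a zero
 * ip_id_key and both call get_random_bytes(), so a concurrent reader
 * can briefly hash with a torn key. The worst case is an oddly
 * distributed IP ID on a few early packets, which is harmless for
 * correctness.
 */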
532 static void ip_rt_fix_tos(struct flowi4 *fl4)
534 __u8 tos = RT_FL_TOS(fl4);
536 fl4->flowi4_tos = tos & IPTOS_RT_MASK;
537 fl4->flowi4_scope = tos & RTO_ONLINK ?
538 RT_SCOPE_LINK : RT_SCOPE_UNIVERSE;
541 static void __build_flow_key(const struct net *net, struct flowi4 *fl4,
542 const struct sock *sk,
543 const struct iphdr *iph,
545 u8 prot, u32 mark, int flow_flags)
548 const struct inet_sock *inet = inet_sk(sk);
550 oif = sk->sk_bound_dev_if;
552 tos = RT_CONN_FLAGS(sk);
553 prot = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol;
555 flowi4_init_output(fl4, oif, mark, tos,
556 RT_SCOPE_UNIVERSE, prot,
558 iph->daddr, iph->saddr, 0, 0,
559 sock_net_uid(net, sk));
562 static void build_skb_flow_key(struct flowi4 *fl4, const struct sk_buff *skb,
563 const struct sock *sk)
565 const struct net *net = dev_net(skb->dev);
566 const struct iphdr *iph = ip_hdr(skb);
567 int oif = skb->dev->ifindex;
568 u8 tos = RT_TOS(iph->tos);
569 u8 prot = iph->protocol;
570 u32 mark = skb->mark;
572 __build_flow_key(net, fl4, sk, iph, oif, tos, prot, mark, 0);
575 static void build_sk_flow_key(struct flowi4 *fl4, const struct sock *sk)
577 const struct inet_sock *inet = inet_sk(sk);
578 const struct ip_options_rcu *inet_opt;
579 __be32 daddr = inet->inet_daddr;
582 inet_opt = rcu_dereference(inet->inet_opt);
583 if (inet_opt && inet_opt->opt.srr)
584 daddr = inet_opt->opt.faddr;
585 flowi4_init_output(fl4, sk->sk_bound_dev_if, sk->sk_mark,
586 RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE,
587 inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol,
588 inet_sk_flowi_flags(sk),
589 daddr, inet->inet_saddr, 0, 0, sk->sk_uid);
593 static void ip_rt_build_flow_key(struct flowi4 *fl4, const struct sock *sk,
594 const struct sk_buff *skb)
597 build_skb_flow_key(fl4, skb, sk);
599 build_sk_flow_key(fl4, sk);
602 static DEFINE_SPINLOCK(fnhe_lock);
604 static void fnhe_flush_routes(struct fib_nh_exception *fnhe)
608 rt = rcu_dereference(fnhe->fnhe_rth_input);
610 RCU_INIT_POINTER(fnhe->fnhe_rth_input, NULL);
611 dst_dev_put(&rt->dst);
612 dst_release(&rt->dst);
614 rt = rcu_dereference(fnhe->fnhe_rth_output);
616 RCU_INIT_POINTER(fnhe->fnhe_rth_output, NULL);
617 dst_dev_put(&rt->dst);
618 dst_release(&rt->dst);
622 static void fnhe_remove_oldest(struct fnhe_hash_bucket *hash)
624 struct fib_nh_exception __rcu **fnhe_p, **oldest_p;
625 struct fib_nh_exception *fnhe, *oldest = NULL;
627 for (fnhe_p = &hash->chain; ; fnhe_p = &fnhe->fnhe_next) {
628 fnhe = rcu_dereference_protected(*fnhe_p,
629 lockdep_is_held(&fnhe_lock));
633 time_before(fnhe->fnhe_stamp, oldest->fnhe_stamp)) {
638 fnhe_flush_routes(oldest);
639 *oldest_p = oldest->fnhe_next;
640 kfree_rcu(oldest, rcu);
643 static u32 fnhe_hashfun(__be32 daddr)
645 static siphash_key_t fnhe_hash_key __read_mostly;
648 net_get_random_once(&fnhe_hash_key, sizeof(fnhe_hash_key));
649 hval = siphash_1u32((__force u32)daddr, &fnhe_hash_key);
650 return hash_64(hval, FNHE_HASH_SHIFT);
653 static void fill_route_from_fnhe(struct rtable *rt, struct fib_nh_exception *fnhe)
655 rt->rt_pmtu = fnhe->fnhe_pmtu;
656 rt->rt_mtu_locked = fnhe->fnhe_mtu_locked;
657 rt->dst.expires = fnhe->fnhe_expires;
660 rt->rt_flags |= RTCF_REDIRECTED;
661 rt->rt_uses_gateway = 1;
662 rt->rt_gw_family = AF_INET;
663 rt->rt_gw4 = fnhe->fnhe_gw;
667 static void update_or_create_fnhe(struct fib_nh_common *nhc, __be32 daddr,
668 __be32 gw, u32 pmtu, bool lock,
669 unsigned long expires)
671 struct fnhe_hash_bucket *hash;
672 struct fib_nh_exception *fnhe;
678 genid = fnhe_genid(dev_net(nhc->nhc_dev));
679 hval = fnhe_hashfun(daddr);
681 spin_lock_bh(&fnhe_lock);
683 hash = rcu_dereference(nhc->nhc_exceptions);
685 hash = kcalloc(FNHE_HASH_SIZE, sizeof(*hash), GFP_ATOMIC);
688 rcu_assign_pointer(nhc->nhc_exceptions, hash);
694 for (fnhe = rcu_dereference(hash->chain); fnhe;
695 fnhe = rcu_dereference(fnhe->fnhe_next)) {
696 if (fnhe->fnhe_daddr == daddr)
702 if (fnhe->fnhe_genid != genid)
703 fnhe->fnhe_genid = genid;
707 fnhe->fnhe_pmtu = pmtu;
708 fnhe->fnhe_mtu_locked = lock;
710 fnhe->fnhe_expires = max(1UL, expires);
711 /* Update all cached dsts too */
712 rt = rcu_dereference(fnhe->fnhe_rth_input);
714 fill_route_from_fnhe(rt, fnhe);
715 rt = rcu_dereference(fnhe->fnhe_rth_output);
717 fill_route_from_fnhe(rt, fnhe);
/* Randomize max depth to avoid some side channel attacks. */
720 int max_depth = FNHE_RECLAIM_DEPTH +
721 prandom_u32_max(FNHE_RECLAIM_DEPTH);
723 while (depth > max_depth) {
724 fnhe_remove_oldest(hash);
728 fnhe = kzalloc(sizeof(*fnhe), GFP_ATOMIC);
732 fnhe->fnhe_next = hash->chain;
734 fnhe->fnhe_genid = genid;
735 fnhe->fnhe_daddr = daddr;
737 fnhe->fnhe_pmtu = pmtu;
738 fnhe->fnhe_mtu_locked = lock;
739 fnhe->fnhe_expires = max(1UL, expires);
741 rcu_assign_pointer(hash->chain, fnhe);
743 /* Exception created; mark the cached routes for the nexthop
 * stale, so anyone caching it rechecks if this exception
 * applies to them.
 */
747 rt = rcu_dereference(nhc->nhc_rth_input);
749 rt->dst.obsolete = DST_OBSOLETE_KILL;
751 for_each_possible_cpu(i) {
752 struct rtable __rcu **prt;
753 prt = per_cpu_ptr(nhc->nhc_pcpu_rth_output, i);
754 rt = rcu_dereference(*prt);
756 rt->dst.obsolete = DST_OBSOLETE_KILL;
760 fnhe->fnhe_stamp = jiffies;
763 spin_unlock_bh(&fnhe_lock);
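/* Within this file, update_or_create_fnhe() is reached from two
 * learning paths: __ip_do_redirect() records a per-destination
 * gateway when an ICMP redirect is accepted, and __ip_rt_update_pmtu()
 * records a per-destination path MTU. Both store the result as a
 * nexthop exception instead of mutating the shared FIB entry.
 */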
766 static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flowi4 *fl4,
769 __be32 new_gw = icmp_hdr(skb)->un.gateway;
770 __be32 old_gw = ip_hdr(skb)->saddr;
771 struct net_device *dev = skb->dev;
772 struct in_device *in_dev;
773 struct fib_result res;
777 switch (icmp_hdr(skb)->code & 7) {
779 case ICMP_REDIR_NETTOS:
780 case ICMP_REDIR_HOST:
781 case ICMP_REDIR_HOSTTOS:
788 if (rt->rt_gw_family != AF_INET || rt->rt_gw4 != old_gw)
791 in_dev = __in_dev_get_rcu(dev);
796 if (new_gw == old_gw || !IN_DEV_RX_REDIRECTS(in_dev) ||
797 ipv4_is_multicast(new_gw) || ipv4_is_lbcast(new_gw) ||
798 ipv4_is_zeronet(new_gw))
799 goto reject_redirect;
801 if (!IN_DEV_SHARED_MEDIA(in_dev)) {
802 if (!inet_addr_onlink(in_dev, new_gw, old_gw))
803 goto reject_redirect;
804 if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev))
805 goto reject_redirect;
807 if (inet_addr_type(net, new_gw) != RTN_UNICAST)
808 goto reject_redirect;
811 n = __ipv4_neigh_lookup(rt->dst.dev, (__force u32)new_gw);
813 n = neigh_create(&arp_tbl, &new_gw, rt->dst.dev);
815 if (!(n->nud_state & NUD_VALID)) {
816 neigh_event_send(n, NULL);
818 if (fib_lookup(net, fl4, &res, 0) == 0) {
819 struct fib_nh_common *nhc;
821 fib_select_path(net, &res, fl4, skb);
822 nhc = FIB_RES_NHC(res);
823 update_or_create_fnhe(nhc, fl4->daddr, new_gw,
825 jiffies + ip_rt_gc_timeout);
828 rt->dst.obsolete = DST_OBSOLETE_KILL;
829 call_netevent_notifiers(NETEVENT_NEIGH_UPDATE, n);
836 #ifdef CONFIG_IP_ROUTE_VERBOSE
837 if (IN_DEV_LOG_MARTIANS(in_dev)) {
838 const struct iphdr *iph = (const struct iphdr *) skb->data;
839 __be32 daddr = iph->daddr;
840 __be32 saddr = iph->saddr;
842 net_info_ratelimited("Redirect from %pI4 on %s about %pI4 ignored\n"
843 " Advised path = %pI4 -> %pI4\n",
844 &old_gw, dev->name, &new_gw,
851 static void ip_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
855 const struct iphdr *iph = (const struct iphdr *) skb->data;
856 struct net *net = dev_net(skb->dev);
857 int oif = skb->dev->ifindex;
858 u8 tos = RT_TOS(iph->tos);
859 u8 prot = iph->protocol;
860 u32 mark = skb->mark;
862 rt = (struct rtable *) dst;
864 __build_flow_key(net, &fl4, sk, iph, oif, tos, prot, mark, 0);
866 __ip_do_redirect(rt, skb, &fl4, true);
869 static struct dst_entry *ipv4_negative_advice(struct dst_entry *dst)
871 struct rtable *rt = (struct rtable *)dst;
872 struct dst_entry *ret = dst;
875 if (dst->obsolete > 0) {
878 } else if ((rt->rt_flags & RTCF_REDIRECTED) ||
889 * 1. The first ip_rt_redirect_number redirects are sent
890 * with exponential backoff, then we stop sending them at all,
891 * assuming that the host ignores our redirects.
892 * 2. If we did not see packets requiring redirects
893 * during ip_rt_redirect_silence, we assume that the host
 * forgot the redirected route and start sending redirects again.
896 * This algorithm is much cheaper and more intelligent than dumb load limiting
899 * NOTE. Do not forget to inhibit load limiting for redirects (redundant)
900 * and "frag. need" (breaks PMTU discovery) in icmp.c.
903 void ip_rt_send_redirect(struct sk_buff *skb)
905 struct rtable *rt = skb_rtable(skb);
906 struct in_device *in_dev;
907 struct inet_peer *peer;
913 in_dev = __in_dev_get_rcu(rt->dst.dev);
914 if (!in_dev || !IN_DEV_TX_REDIRECTS(in_dev)) {
918 log_martians = IN_DEV_LOG_MARTIANS(in_dev);
919 vif = l3mdev_master_ifindex_rcu(rt->dst.dev);
922 net = dev_net(rt->dst.dev);
923 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr, vif, 1);
925 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST,
926 rt_nexthop(rt, ip_hdr(skb)->daddr));
930 /* No redirected packets during ip_rt_redirect_silence;
931 * reset the algorithm.
933 if (time_after(jiffies, peer->rate_last + ip_rt_redirect_silence)) {
934 peer->rate_tokens = 0;
935 peer->n_redirects = 0;
/* Too many ignored redirects; do not send anything and
 * set dst.rate_last to the last seen redirected packet.
941 if (peer->n_redirects >= ip_rt_redirect_number) {
942 peer->rate_last = jiffies;
/* Check for load limit; set rate_last to the latest sent
 * redirect packet time.
949 if (peer->n_redirects == 0 ||
952 (ip_rt_redirect_load << peer->n_redirects)))) {
953 __be32 gw = rt_nexthop(rt, ip_hdr(skb)->daddr);
955 icmp_send(skb, ICMP_REDIRECT, ICMP_REDIR_HOST, gw);
956 peer->rate_last = jiffies;
958 if (IS_ENABLED(CONFIG_IP_ROUTE_VERBOSE) && log_martians &&
959 peer->n_redirects == ip_rt_redirect_number)
960 net_warn_ratelimited("host %pI4/if%d ignores redirects for %pI4 to %pI4\n",
961 &ip_hdr(skb)->saddr, inet_iif(skb),
962 &ip_hdr(skb)->daddr, &gw);
968 static int ip_error(struct sk_buff *skb)
970 struct rtable *rt = skb_rtable(skb);
971 struct net_device *dev = skb->dev;
972 struct in_device *in_dev;
973 struct inet_peer *peer;
979 if (netif_is_l3_master(skb->dev)) {
980 dev = __dev_get_by_index(dev_net(skb->dev), IPCB(skb)->iif);
985 in_dev = __in_dev_get_rcu(dev);
987 /* IP on this device is disabled. */
991 net = dev_net(rt->dst.dev);
992 if (!IN_DEV_FORWARD(in_dev)) {
993 switch (rt->dst.error) {
995 __IP_INC_STATS(net, IPSTATS_MIB_INADDRERRORS);
999 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
1005 switch (rt->dst.error) {
1010 code = ICMP_HOST_UNREACH;
1013 code = ICMP_NET_UNREACH;
1014 __IP_INC_STATS(net, IPSTATS_MIB_INNOROUTES);
1017 code = ICMP_PKT_FILTERED;
1021 peer = inet_getpeer_v4(net->ipv4.peers, ip_hdr(skb)->saddr,
1022 l3mdev_master_ifindex(skb->dev), 1);
1027 peer->rate_tokens += now - peer->rate_last;
1028 if (peer->rate_tokens > ip_rt_error_burst)
1029 peer->rate_tokens = ip_rt_error_burst;
1030 peer->rate_last = now;
1031 if (peer->rate_tokens >= ip_rt_error_cost)
1032 peer->rate_tokens -= ip_rt_error_cost;
1038 icmp_send(skb, ICMP_DEST_UNREACH, code, 0);
1040 out: kfree_skb(skb);
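/* The rate limiting above is a small token bucket (a reading of the
 * code, using the defaults from the top of this file): rate_tokens
 * accrues one token per jiffy since rate_last, capped at
 * ip_rt_error_burst (5 * HZ), and each ICMP error spends
 * ip_rt_error_cost (HZ) tokens, so at most a burst of five errors and
 * then roughly one per second per peer.
 */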
1044 static void __ip_rt_update_pmtu(struct rtable *rt, struct flowi4 *fl4, u32 mtu)
1046 struct dst_entry *dst = &rt->dst;
1047 struct net *net = dev_net(dst->dev);
1048 struct fib_result res;
1052 if (ip_mtu_locked(dst))
1055 old_mtu = ipv4_mtu(dst);
1059 if (mtu < ip_rt_min_pmtu) {
1061 mtu = min(old_mtu, ip_rt_min_pmtu);
1064 if (rt->rt_pmtu == mtu && !lock &&
1065 time_before(jiffies, dst->expires - ip_rt_mtu_expires / 2))
1069 if (fib_lookup(net, fl4, &res, 0) == 0) {
1070 struct fib_nh_common *nhc;
1072 fib_select_path(net, &res, fl4, NULL);
1073 nhc = FIB_RES_NHC(res);
1074 update_or_create_fnhe(nhc, fl4->daddr, 0, mtu, lock,
1075 jiffies + ip_rt_mtu_expires);
1080 static void ip_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1081 struct sk_buff *skb, u32 mtu,
1084 struct rtable *rt = (struct rtable *) dst;
1087 ip_rt_build_flow_key(&fl4, sk, skb);
1088 ip_rt_fix_tos(&fl4);
1090 /* Don't make lookup fail for bridged encapsulations */
1091 if (skb && netif_is_any_bridge_port(skb->dev))
1094 __ip_rt_update_pmtu(rt, &fl4, mtu);
1097 void ipv4_update_pmtu(struct sk_buff *skb, struct net *net, u32 mtu,
1098 int oif, u8 protocol)
1100 const struct iphdr *iph = (const struct iphdr *)skb->data;
1103 u32 mark = IP4_REPLY_MARK(net, skb->mark);
1105 __build_flow_key(net, &fl4, NULL, iph, oif,
1106 RT_TOS(iph->tos), protocol, mark, 0);
1107 rt = __ip_route_output_key(net, &fl4);
1109 __ip_rt_update_pmtu(rt, &fl4, mtu);
1113 EXPORT_SYMBOL_GPL(ipv4_update_pmtu);
1115 static void __ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1117 const struct iphdr *iph = (const struct iphdr *)skb->data;
1121 __build_flow_key(sock_net(sk), &fl4, sk, iph, 0, 0, 0, 0, 0);
1123 if (!fl4.flowi4_mark)
1124 fl4.flowi4_mark = IP4_REPLY_MARK(sock_net(sk), skb->mark);
1126 rt = __ip_route_output_key(sock_net(sk), &fl4);
1128 __ip_rt_update_pmtu(rt, &fl4, mtu);
1133 void ipv4_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, u32 mtu)
1135 const struct iphdr *iph = (const struct iphdr *)skb->data;
1138 struct dst_entry *odst = NULL;
1140 struct net *net = sock_net(sk);
1144 if (!ip_sk_accept_pmtu(sk))
1147 odst = sk_dst_get(sk);
1149 if (sock_owned_by_user(sk) || !odst) {
1150 __ipv4_sk_update_pmtu(skb, sk, mtu);
1154 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1156 rt = (struct rtable *)odst;
1157 if (odst->obsolete && !odst->ops->check(odst, 0)) {
1158 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1164 ip_rt_fix_tos(&fl4);
1167 __ip_rt_update_pmtu((struct rtable *)xfrm_dst_path(&rt->dst), &fl4, mtu);
1169 if (!dst_check(&rt->dst, 0)) {
1171 dst_release(&rt->dst);
1173 rt = ip_route_output_flow(sock_net(sk), &fl4, sk);
1181 sk_dst_set(sk, &rt->dst);
1187 EXPORT_SYMBOL_GPL(ipv4_sk_update_pmtu);
1189 void ipv4_redirect(struct sk_buff *skb, struct net *net,
1190 int oif, u8 protocol)
1192 const struct iphdr *iph = (const struct iphdr *)skb->data;
1196 __build_flow_key(net, &fl4, NULL, iph, oif,
1197 RT_TOS(iph->tos), protocol, 0, 0);
1198 rt = __ip_route_output_key(net, &fl4);
1200 __ip_do_redirect(rt, skb, &fl4, false);
1204 EXPORT_SYMBOL_GPL(ipv4_redirect);
1206 void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk)
1208 const struct iphdr *iph = (const struct iphdr *)skb->data;
1211 struct net *net = sock_net(sk);
1213 __build_flow_key(net, &fl4, sk, iph, 0, 0, 0, 0, 0);
1214 rt = __ip_route_output_key(net, &fl4);
1216 __ip_do_redirect(rt, skb, &fl4, false);
1220 EXPORT_SYMBOL_GPL(ipv4_sk_redirect);
1222 static struct dst_entry *ipv4_dst_check(struct dst_entry *dst, u32 cookie)
1224 struct rtable *rt = (struct rtable *) dst;
1226 /* All IPV4 dsts are created with ->obsolete set to the value
1227 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1228 * into this function always.
1230 * When a PMTU/redirect information update invalidates a route,
1231 * this is indicated by setting obsolete to DST_OBSOLETE_KILL or
1232 * DST_OBSOLETE_DEAD.
1234 if (dst->obsolete != DST_OBSOLETE_FORCE_CHK || rt_is_expired(rt))
1239 static void ipv4_send_dest_unreach(struct sk_buff *skb)
1241 struct net_device *dev;
1242 struct ip_options opt;
1245 /* Recompile ip options since IPCB may not be valid anymore.
1246 * Also check we have a reasonable ipv4 header.
1248 if (!pskb_network_may_pull(skb, sizeof(struct iphdr)) ||
1249 ip_hdr(skb)->version != 4 || ip_hdr(skb)->ihl < 5)
1252 memset(&opt, 0, sizeof(opt));
1253 if (ip_hdr(skb)->ihl > 5) {
1254 if (!pskb_network_may_pull(skb, ip_hdr(skb)->ihl * 4))
1256 opt.optlen = ip_hdr(skb)->ihl * 4 - sizeof(struct iphdr);
1259 dev = skb->dev ? skb->dev : skb_rtable(skb)->dst.dev;
1260 res = __ip_options_compile(dev_net(dev), &opt, skb, NULL);
1266 __icmp_send(skb, ICMP_DEST_UNREACH, ICMP_HOST_UNREACH, 0, &opt);
1269 static void ipv4_link_failure(struct sk_buff *skb)
1273 ipv4_send_dest_unreach(skb);
1275 rt = skb_rtable(skb);
1277 dst_set_expires(&rt->dst, 0);
1280 static int ip_rt_bug(struct net *net, struct sock *sk, struct sk_buff *skb)
1282 pr_debug("%s: %pI4 -> %pI4, %s\n",
1283 __func__, &ip_hdr(skb)->saddr, &ip_hdr(skb)->daddr,
1284 skb->dev ? skb->dev->name : "?");
   We do not cache the source address of the outgoing interface,
   because it is used only by IP RR, TS and SRR options,
   so that it is out of the fast path.

   BTW remember: "addr" is allowed to be unaligned
   in IP options!
1299 void ip_rt_get_source(u8 *addr, struct sk_buff *skb, struct rtable *rt)
1303 if (rt_is_output_route(rt))
1304 src = ip_hdr(skb)->saddr;
1306 struct fib_result res;
1307 struct iphdr *iph = ip_hdr(skb);
1308 struct flowi4 fl4 = {
1309 .daddr = iph->daddr,
1310 .saddr = iph->saddr,
1311 .flowi4_tos = RT_TOS(iph->tos),
1312 .flowi4_oif = rt->dst.dev->ifindex,
1313 .flowi4_iif = skb->dev->ifindex,
1314 .flowi4_mark = skb->mark,
1318 if (fib_lookup(dev_net(rt->dst.dev), &fl4, &res, 0) == 0)
1319 src = fib_result_prefsrc(dev_net(rt->dst.dev), &res);
1321 src = inet_select_addr(rt->dst.dev,
1322 rt_nexthop(rt, iph->daddr),
1326 memcpy(addr, &src, 4);
1329 #ifdef CONFIG_IP_ROUTE_CLASSID
1330 static void set_class_tag(struct rtable *rt, u32 tag)
1332 if (!(rt->dst.tclassid & 0xFFFF))
1333 rt->dst.tclassid |= tag & 0xFFFF;
1334 if (!(rt->dst.tclassid & 0xFFFF0000))
1335 rt->dst.tclassid |= tag & 0xFFFF0000;
1339 static unsigned int ipv4_default_advmss(const struct dst_entry *dst)
1341 unsigned int header_size = sizeof(struct tcphdr) + sizeof(struct iphdr);
1342 unsigned int advmss = max_t(unsigned int, ipv4_mtu(dst) - header_size,
1345 return min(advmss, IPV4_MAX_PMTU - header_size);
1348 static unsigned int ipv4_mtu(const struct dst_entry *dst)
1350 const struct rtable *rt = (const struct rtable *)dst;
1351 unsigned int mtu = rt->rt_pmtu;
1353 if (!mtu || time_after_eq(jiffies, rt->dst.expires))
1354 mtu = dst_metric_raw(dst, RTAX_MTU);
1359 mtu = READ_ONCE(dst->dev->mtu);
1361 if (unlikely(ip_mtu_locked(dst))) {
1362 if (rt->rt_uses_gateway && mtu > 576)
1367 mtu = min_t(unsigned int, mtu, IP_MAX_MTU);
1369 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1372 static void ip_del_fnhe(struct fib_nh_common *nhc, __be32 daddr)
1374 struct fnhe_hash_bucket *hash;
1375 struct fib_nh_exception *fnhe, __rcu **fnhe_p;
1376 u32 hval = fnhe_hashfun(daddr);
1378 spin_lock_bh(&fnhe_lock);
1380 hash = rcu_dereference_protected(nhc->nhc_exceptions,
1381 lockdep_is_held(&fnhe_lock));
1384 fnhe_p = &hash->chain;
1385 fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
1387 if (fnhe->fnhe_daddr == daddr) {
1388 rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
1389 fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
1390 /* set fnhe_daddr to 0 to ensure it won't bind with
1391 * new dsts in rt_bind_exception().
1393 fnhe->fnhe_daddr = 0;
1394 fnhe_flush_routes(fnhe);
1395 kfree_rcu(fnhe, rcu);
1398 fnhe_p = &fnhe->fnhe_next;
1399 fnhe = rcu_dereference_protected(fnhe->fnhe_next,
1400 lockdep_is_held(&fnhe_lock));
1403 spin_unlock_bh(&fnhe_lock);
1406 static struct fib_nh_exception *find_exception(struct fib_nh_common *nhc,
1409 struct fnhe_hash_bucket *hash = rcu_dereference(nhc->nhc_exceptions);
1410 struct fib_nh_exception *fnhe;
1416 hval = fnhe_hashfun(daddr);
1418 for (fnhe = rcu_dereference(hash[hval].chain); fnhe;
1419 fnhe = rcu_dereference(fnhe->fnhe_next)) {
1420 if (fnhe->fnhe_daddr == daddr) {
1421 if (fnhe->fnhe_expires &&
1422 time_after(jiffies, fnhe->fnhe_expires)) {
1423 ip_del_fnhe(nhc, daddr);
1433 * 1. mtu on route is locked - use it
1434 * 2. mtu from nexthop exception
1435 * 3. mtu from egress device
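/* For example (a sketch of the logic below): a nexthop carrying an
 * unexpired exception with fnhe_pmtu = 1400 reports an MTU of 1400;
 * once fnhe_expires passes, the same lookup falls back to the egress
 * device MTU, clamped to IP_MAX_MTU and reduced by any lwtunnel
 * encapsulation headroom.
 */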
1438 u32 ip_mtu_from_fib_result(struct fib_result *res, __be32 daddr)
1440 struct fib_nh_common *nhc = res->nhc;
1441 struct net_device *dev = nhc->nhc_dev;
1442 struct fib_info *fi = res->fi;
1445 if (READ_ONCE(dev_net(dev)->ipv4.sysctl_ip_fwd_use_pmtu) ||
1446 fi->fib_metrics->metrics[RTAX_LOCK - 1] & (1 << RTAX_MTU))
1450 struct fib_nh_exception *fnhe;
1452 fnhe = find_exception(nhc, daddr);
1453 if (fnhe && !time_after_eq(jiffies, fnhe->fnhe_expires))
1454 mtu = fnhe->fnhe_pmtu;
1458 mtu = min(READ_ONCE(dev->mtu), IP_MAX_MTU);
1460 return mtu - lwtunnel_headroom(nhc->nhc_lwtstate, mtu);
1463 static bool rt_bind_exception(struct rtable *rt, struct fib_nh_exception *fnhe,
1464 __be32 daddr, const bool do_cache)
1468 spin_lock_bh(&fnhe_lock);
1470 if (daddr == fnhe->fnhe_daddr) {
1471 struct rtable __rcu **porig;
1472 struct rtable *orig;
1473 int genid = fnhe_genid(dev_net(rt->dst.dev));
1475 if (rt_is_input_route(rt))
1476 porig = &fnhe->fnhe_rth_input;
1478 porig = &fnhe->fnhe_rth_output;
1479 orig = rcu_dereference(*porig);
1481 if (fnhe->fnhe_genid != genid) {
1482 fnhe->fnhe_genid = genid;
1484 fnhe->fnhe_pmtu = 0;
1485 fnhe->fnhe_expires = 0;
1486 fnhe->fnhe_mtu_locked = false;
1487 fnhe_flush_routes(fnhe);
1490 fill_route_from_fnhe(rt, fnhe);
1493 rt->rt_gw_family = AF_INET;
1498 rcu_assign_pointer(*porig, rt);
1500 dst_dev_put(&orig->dst);
1501 dst_release(&orig->dst);
1506 fnhe->fnhe_stamp = jiffies;
1508 spin_unlock_bh(&fnhe_lock);
1513 static bool rt_cache_route(struct fib_nh_common *nhc, struct rtable *rt)
1515 struct rtable *orig, *prev, **p;
1518 if (rt_is_input_route(rt)) {
1519 p = (struct rtable **)&nhc->nhc_rth_input;
1521 p = (struct rtable **)raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
/* hold dst before doing cmpxchg() to avoid a race condition
 * on this dst.
 */
1529 prev = cmpxchg(p, orig, rt);
1532 rt_add_uncached_list(orig);
1533 dst_release(&orig->dst);
1536 dst_release(&rt->dst);
1543 struct uncached_list {
1545 struct list_head head;
1548 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt_uncached_list);
1550 void rt_add_uncached_list(struct rtable *rt)
1552 struct uncached_list *ul = raw_cpu_ptr(&rt_uncached_list);
1554 rt->rt_uncached_list = ul;
1556 spin_lock_bh(&ul->lock);
1557 list_add_tail(&rt->rt_uncached, &ul->head);
1558 spin_unlock_bh(&ul->lock);
1561 void rt_del_uncached_list(struct rtable *rt)
1563 if (!list_empty(&rt->rt_uncached)) {
1564 struct uncached_list *ul = rt->rt_uncached_list;
1566 spin_lock_bh(&ul->lock);
1567 list_del(&rt->rt_uncached);
1568 spin_unlock_bh(&ul->lock);
1572 static void ipv4_dst_destroy(struct dst_entry *dst)
1574 struct rtable *rt = (struct rtable *)dst;
1576 ip_dst_metrics_put(dst);
1577 rt_del_uncached_list(rt);
1580 void rt_flush_dev(struct net_device *dev)
1585 for_each_possible_cpu(cpu) {
1586 struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);
1588 spin_lock_bh(&ul->lock);
1589 list_for_each_entry(rt, &ul->head, rt_uncached) {
1590 if (rt->dst.dev != dev)
1592 rt->dst.dev = blackhole_netdev;
1593 dev_hold(rt->dst.dev);
1596 spin_unlock_bh(&ul->lock);
1600 static bool rt_cache_valid(const struct rtable *rt)
1603 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1607 static void rt_set_nexthop(struct rtable *rt, __be32 daddr,
1608 const struct fib_result *res,
1609 struct fib_nh_exception *fnhe,
1610 struct fib_info *fi, u16 type, u32 itag,
1611 const bool do_cache)
1613 bool cached = false;
1616 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1618 if (nhc->nhc_gw_family && nhc->nhc_scope == RT_SCOPE_LINK) {
1619 rt->rt_uses_gateway = 1;
1620 rt->rt_gw_family = nhc->nhc_gw_family;
1621 /* only INET and INET6 are supported */
1622 if (likely(nhc->nhc_gw_family == AF_INET))
1623 rt->rt_gw4 = nhc->nhc_gw.ipv4;
1625 rt->rt_gw6 = nhc->nhc_gw.ipv6;
1628 ip_dst_init_metrics(&rt->dst, fi->fib_metrics);
1630 #ifdef CONFIG_IP_ROUTE_CLASSID
1631 if (nhc->nhc_family == AF_INET) {
1634 nh = container_of(nhc, struct fib_nh, nh_common);
1635 rt->dst.tclassid = nh->nh_tclassid;
1638 rt->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
1640 cached = rt_bind_exception(rt, fnhe, daddr, do_cache);
1642 cached = rt_cache_route(nhc, rt);
1643 if (unlikely(!cached)) {
1644 /* Routes we intend to cache in nexthop exception or
1645 * FIB nexthop have the DST_NOCACHE bit clear.
1646 * However, if we are unsuccessful at storing this
1647 * route into the cache we really need to set it.
1650 rt->rt_gw_family = AF_INET;
1653 rt_add_uncached_list(rt);
1656 rt_add_uncached_list(rt);
1658 #ifdef CONFIG_IP_ROUTE_CLASSID
1659 #ifdef CONFIG_IP_MULTIPLE_TABLES
1660 set_class_tag(rt, res->tclassid);
1662 set_class_tag(rt, itag);
1666 struct rtable *rt_dst_alloc(struct net_device *dev,
1667 unsigned int flags, u16 type,
1668 bool nopolicy, bool noxfrm)
1672 rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1673 (nopolicy ? DST_NOPOLICY : 0) |
1674 (noxfrm ? DST_NOXFRM : 0));
1677 rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1678 rt->rt_flags = flags;
1680 rt->rt_is_input = 0;
1683 rt->rt_mtu_locked = 0;
1684 rt->rt_uses_gateway = 0;
1685 rt->rt_gw_family = 0;
1687 INIT_LIST_HEAD(&rt->rt_uncached);
1689 rt->dst.output = ip_output;
1690 if (flags & RTCF_LOCAL)
1691 rt->dst.input = ip_local_deliver;
1696 EXPORT_SYMBOL(rt_dst_alloc);
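/* A minimal usage sketch, mirroring the callers later in this file:
 * flags is an RTCF_* mask and type an RTN_* route type, normally taken
 * from a fib_result, e.g.
 *
 *	rth = rt_dst_alloc(dev, flags | RTCF_LOCAL, res->type,
 *			   no_policy, false);
 *
 * after which the caller fills in rt_is_input, dst.input/dst.output
 * and the nexthop state (see rt_set_nexthop() above).
 */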
1698 struct rtable *rt_dst_clone(struct net_device *dev, struct rtable *rt)
1700 struct rtable *new_rt;
1702 new_rt = dst_alloc(&ipv4_dst_ops, dev, 1, DST_OBSOLETE_FORCE_CHK,
1706 new_rt->rt_genid = rt_genid_ipv4(dev_net(dev));
1707 new_rt->rt_flags = rt->rt_flags;
1708 new_rt->rt_type = rt->rt_type;
1709 new_rt->rt_is_input = rt->rt_is_input;
1710 new_rt->rt_iif = rt->rt_iif;
1711 new_rt->rt_pmtu = rt->rt_pmtu;
1712 new_rt->rt_mtu_locked = rt->rt_mtu_locked;
1713 new_rt->rt_gw_family = rt->rt_gw_family;
1714 if (rt->rt_gw_family == AF_INET)
1715 new_rt->rt_gw4 = rt->rt_gw4;
1716 else if (rt->rt_gw_family == AF_INET6)
1717 new_rt->rt_gw6 = rt->rt_gw6;
1718 INIT_LIST_HEAD(&new_rt->rt_uncached);
1720 new_rt->dst.input = rt->dst.input;
1721 new_rt->dst.output = rt->dst.output;
1722 new_rt->dst.error = rt->dst.error;
1723 new_rt->dst.lastuse = jiffies;
1724 new_rt->dst.lwtstate = lwtstate_get(rt->dst.lwtstate);
1728 EXPORT_SYMBOL(rt_dst_clone);
1730 /* called in rcu_read_lock() section */
1731 int ip_mc_validate_source(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1732 u8 tos, struct net_device *dev,
1733 struct in_device *in_dev, u32 *itag)
1737 /* Primary sanity checks. */
1741 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr) ||
1742 skb->protocol != htons(ETH_P_IP))
1745 if (ipv4_is_loopback(saddr) && !IN_DEV_ROUTE_LOCALNET(in_dev))
1748 if (ipv4_is_zeronet(saddr)) {
1749 if (!ipv4_is_local_multicast(daddr) &&
1750 ip_hdr(skb)->protocol != IPPROTO_IGMP)
1753 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
1761 /* called in rcu_read_lock() section */
1762 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
1763 u8 tos, struct net_device *dev, int our)
1765 struct in_device *in_dev = __in_dev_get_rcu(dev);
1766 unsigned int flags = RTCF_MULTICAST;
1772 err = ip_mc_validate_source(skb, daddr, saddr, tos, dev, in_dev, &itag);
1777 flags |= RTCF_LOCAL;
1779 no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY);
1781 IPCB(skb)->flags |= IPSKB_NOPOLICY;
1783 rth = rt_dst_alloc(dev_net(dev)->loopback_dev, flags, RTN_MULTICAST,
1788 #ifdef CONFIG_IP_ROUTE_CLASSID
1789 rth->dst.tclassid = itag;
1791 rth->dst.output = ip_rt_bug;
rth->rt_is_input = 1;
1794 #ifdef CONFIG_IP_MROUTE
1795 if (!ipv4_is_local_multicast(daddr) && IN_DEV_MFORWARD(in_dev))
1796 rth->dst.input = ip_mr_input;
1798 RT_CACHE_STAT_INC(in_slow_mc);
1801 skb_dst_set(skb, &rth->dst);
1806 static void ip_handle_martian_source(struct net_device *dev,
1807 struct in_device *in_dev,
1808 struct sk_buff *skb,
1812 RT_CACHE_STAT_INC(in_martian_src);
1813 #ifdef CONFIG_IP_ROUTE_VERBOSE
1814 if (IN_DEV_LOG_MARTIANS(in_dev) && net_ratelimit()) {
 * RFC1812 recommendation: if the source is martian,
 * the only hint is the MAC header.
1819 pr_warn("martian source %pI4 from %pI4, on dev %s\n",
1820 &daddr, &saddr, dev->name);
1821 if (dev->hard_header_len && skb_mac_header_was_set(skb)) {
1822 print_hex_dump(KERN_WARNING, "ll header: ",
1823 DUMP_PREFIX_OFFSET, 16, 1,
1824 skb_mac_header(skb),
1825 dev->hard_header_len, false);
1831 /* called in rcu_read_lock() section */
1832 static int __mkroute_input(struct sk_buff *skb,
1833 const struct fib_result *res,
1834 struct in_device *in_dev,
1835 __be32 daddr, __be32 saddr, u32 tos)
1837 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
1838 struct net_device *dev = nhc->nhc_dev;
1839 struct fib_nh_exception *fnhe;
1842 struct in_device *out_dev;
1843 bool do_cache, no_policy;
1846 /* get a working reference to the output device */
1847 out_dev = __in_dev_get_rcu(dev);
1849 net_crit_ratelimited("Bug in ip_route_input_slow(). Please report.\n");
1853 err = fib_validate_source(skb, saddr, daddr, tos, FIB_RES_OIF(*res),
1854 in_dev->dev, in_dev, &itag);
1856 ip_handle_martian_source(in_dev->dev, in_dev, skb, daddr,
1862 do_cache = res->fi && !itag;
1863 if (out_dev == in_dev && err && IN_DEV_TX_REDIRECTS(out_dev) &&
1864 skb->protocol == htons(ETH_P_IP)) {
1867 gw = nhc->nhc_gw_family == AF_INET ? nhc->nhc_gw.ipv4 : 0;
1868 if (IN_DEV_SHARED_MEDIA(out_dev) ||
1869 inet_addr_onlink(out_dev, saddr, gw))
1870 IPCB(skb)->flags |= IPSKB_DOREDIRECT;
1873 if (skb->protocol != htons(ETH_P_IP)) {
/* Not IP (i.e. ARP). Do not create a route if it is
 * invalid for proxy arp. DNAT routes are always valid.
 *
 * The proxy arp feature has been extended to allow ARP
 * replies back out the same interface, to support
1879 * Private VLAN switch technologies. See arp.c.
1881 if (out_dev == in_dev &&
1882 IN_DEV_PROXY_ARP_PVLAN(in_dev) == 0) {
1888 no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY);
1890 IPCB(skb)->flags |= IPSKB_NOPOLICY;
1892 fnhe = find_exception(nhc, daddr);
1895 rth = rcu_dereference(fnhe->fnhe_rth_input);
1897 rth = rcu_dereference(nhc->nhc_rth_input);
1898 if (rt_cache_valid(rth)) {
1899 skb_dst_set_noref(skb, &rth->dst);
1904 rth = rt_dst_alloc(out_dev->dev, 0, res->type, no_policy,
1905 IN_DEV_ORCONF(out_dev, NOXFRM));
1911 rth->rt_is_input = 1;
1912 RT_CACHE_STAT_INC(in_slow_tot);
1914 rth->dst.input = ip_forward;
1916 rt_set_nexthop(rth, daddr, res, fnhe, res->fi, res->type, itag,
1918 lwtunnel_set_redirect(&rth->dst);
1919 skb_dst_set(skb, &rth->dst);
1926 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1927 /* To make ICMP packets follow the right flow, the multipath hash is
1928 * calculated from the inner IP addresses.
1930 static void ip_multipath_l3_keys(const struct sk_buff *skb,
1931 struct flow_keys *hash_keys)
1933 const struct iphdr *outer_iph = ip_hdr(skb);
1934 const struct iphdr *key_iph = outer_iph;
1935 const struct iphdr *inner_iph;
1936 const struct icmphdr *icmph;
1937 struct iphdr _inner_iph;
1938 struct icmphdr _icmph;
1940 if (likely(outer_iph->protocol != IPPROTO_ICMP))
1943 if (unlikely((outer_iph->frag_off & htons(IP_OFFSET)) != 0))
1946 icmph = skb_header_pointer(skb, outer_iph->ihl * 4, sizeof(_icmph),
1951 if (!icmp_is_err(icmph->type))
1954 inner_iph = skb_header_pointer(skb,
1955 outer_iph->ihl * 4 + sizeof(_icmph),
1956 sizeof(_inner_iph), &_inner_iph);
1960 key_iph = inner_iph;
1962 hash_keys->addrs.v4addrs.src = key_iph->saddr;
1963 hash_keys->addrs.v4addrs.dst = key_iph->daddr;
1966 /* if skb is set it will be used and fl4 can be NULL */
1967 int fib_multipath_hash(const struct net *net, const struct flowi4 *fl4,
1968 const struct sk_buff *skb, struct flow_keys *flkeys)
1970 u32 multipath_hash = fl4 ? fl4->flowi4_multipath_hash : 0;
1971 struct flow_keys hash_keys;
1974 switch (net->ipv4.sysctl_fib_multipath_hash_policy) {
1976 memset(&hash_keys, 0, sizeof(hash_keys));
1977 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
1979 ip_multipath_l3_keys(skb, &hash_keys);
1981 hash_keys.addrs.v4addrs.src = fl4->saddr;
1982 hash_keys.addrs.v4addrs.dst = fl4->daddr;
1986 /* skb is currently provided only when forwarding */
1988 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
1989 struct flow_keys keys;
1991 /* short-circuit if we already have L4 hash present */
1993 return skb_get_hash_raw(skb) >> 1;
1995 memset(&hash_keys, 0, sizeof(hash_keys));
1998 skb_flow_dissect_flow_keys(skb, &keys, flag);
2002 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2003 hash_keys.addrs.v4addrs.src = flkeys->addrs.v4addrs.src;
2004 hash_keys.addrs.v4addrs.dst = flkeys->addrs.v4addrs.dst;
2005 hash_keys.ports.src = flkeys->ports.src;
2006 hash_keys.ports.dst = flkeys->ports.dst;
2007 hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2009 memset(&hash_keys, 0, sizeof(hash_keys));
2010 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2011 hash_keys.addrs.v4addrs.src = fl4->saddr;
2012 hash_keys.addrs.v4addrs.dst = fl4->daddr;
2013 hash_keys.ports.src = fl4->fl4_sport;
2014 hash_keys.ports.dst = fl4->fl4_dport;
2015 hash_keys.basic.ip_proto = fl4->flowi4_proto;
2019 memset(&hash_keys, 0, sizeof(hash_keys));
2020 /* skb is currently provided only when forwarding */
2022 struct flow_keys keys;
2024 skb_flow_dissect_flow_keys(skb, &keys, 0);
2025 /* Inner can be v4 or v6 */
2026 if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
2027 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2028 hash_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
2029 hash_keys.addrs.v4addrs.dst = keys.addrs.v4addrs.dst;
2030 } else if (keys.control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
2031 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2032 hash_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
2033 hash_keys.addrs.v6addrs.dst = keys.addrs.v6addrs.dst;
2034 hash_keys.tags.flow_label = keys.tags.flow_label;
2035 hash_keys.basic.ip_proto = keys.basic.ip_proto;
2037 /* Same as case 0 */
2038 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2039 ip_multipath_l3_keys(skb, &hash_keys);
2042 /* Same as case 0 */
2043 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
2044 hash_keys.addrs.v4addrs.src = fl4->saddr;
2045 hash_keys.addrs.v4addrs.dst = fl4->daddr;
2049 mhash = flow_hash_from_keys(&hash_keys);
2052 mhash = jhash_2words(mhash, multipath_hash, 0);
2056 #endif /* CONFIG_IP_ROUTE_MULTIPATH */
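/* Summarizing the (elided) case labels of the switch above, which
 * follow the net.ipv4.fib_multipath_hash_policy sysctl in kernels of
 * this vintage: policy 0 hashes on the L3 source/destination
 * addresses, policy 1 on the L4 five-tuple, and policy 2 on the inner
 * L3 addresses of encapsulated packets, falling back to the outer
 * header when no inner one can be dissected.
 */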
2058 static int ip_mkroute_input(struct sk_buff *skb,
2059 struct fib_result *res,
2060 struct in_device *in_dev,
2061 __be32 daddr, __be32 saddr, u32 tos,
2062 struct flow_keys *hkeys)
2064 #ifdef CONFIG_IP_ROUTE_MULTIPATH
2065 if (res->fi && fib_info_num_path(res->fi) > 1) {
2066 int h = fib_multipath_hash(res->fi->fib_net, NULL, skb, hkeys);
2068 fib_select_multipath(res, h);
2069 IPCB(skb)->flags |= IPSKB_MULTIPATH;
2073 /* create a routing cache entry */
2074 return __mkroute_input(skb, res, in_dev, daddr, saddr, tos);
2077 /* Implements all the saddr-related checks as ip_route_input_slow(),
2078 * assuming daddr is valid and the destination is not a local broadcast one.
2079 * Uses the provided hint instead of performing a route lookup.
2081 int ip_route_use_hint(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2082 u8 tos, struct net_device *dev,
2083 const struct sk_buff *hint)
2085 struct in_device *in_dev = __in_dev_get_rcu(dev);
2086 struct rtable *rt = skb_rtable(hint);
2087 struct net *net = dev_net(dev);
2094 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2095 goto martian_source;
2097 if (ipv4_is_zeronet(saddr))
2098 goto martian_source;
2100 if (ipv4_is_loopback(saddr) && !IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2101 goto martian_source;
2103 if (rt->rt_type != RTN_LOCAL)
2104 goto skip_validate_source;
2106 tos &= IPTOS_RT_MASK;
2107 err = fib_validate_source(skb, saddr, daddr, tos, 0, dev, in_dev, &tag);
2109 goto martian_source;
2111 skip_validate_source:
2112 skb_dst_copy(skb, hint);
2116 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2120 /* get device for dst_alloc with local routes */
2121 static struct net_device *ip_rt_get_dev(struct net *net,
2122 const struct fib_result *res)
2124 struct fib_nh_common *nhc = res->fi ? res->nhc : NULL;
2125 struct net_device *dev = NULL;
2128 dev = l3mdev_master_dev_rcu(nhc->nhc_dev);
2130 return dev ? : net->loopback_dev;
 * NOTE. We drop all packets that have a local source
 * address, because every properly looped-back packet
 * must already have the correct destination attached by the output routine.
 * Changes in the enforced policies must also be applied to
 * ip_route_use_hint().
 *
 * Such an approach solves two big problems:
 * 1. Non-simplex devices are handled properly.
 * 2. IP spoofing attempts are filtered with a 100% guarantee.
2143 * called with rcu_read_lock()
2146 static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2147 u8 tos, struct net_device *dev,
2148 struct fib_result *res)
2150 struct in_device *in_dev = __in_dev_get_rcu(dev);
2151 struct flow_keys *flkeys = NULL, _flkeys;
2152 struct net *net = dev_net(dev);
2153 struct ip_tunnel_info *tun_info;
2155 unsigned int flags = 0;
2159 bool do_cache = true;
2162 /* IP on this device is disabled. */
/* Check for the most weird martians, which cannot be detected
 * by fib_lookup.
2171 tun_info = skb_tunnel_info(skb);
2172 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2173 fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
2175 fl4.flowi4_tun_key.tun_id = 0;
2178 if (ipv4_is_multicast(saddr) || ipv4_is_lbcast(saddr))
2179 goto martian_source;
2183 if (ipv4_is_lbcast(daddr) || (saddr == 0 && daddr == 0))
2186 /* Accept zero addresses only to limited broadcast;
 * I do not even know whether to fix it or not. Waiting for complaints :-)
2189 if (ipv4_is_zeronet(saddr))
2190 goto martian_source;
2192 if (ipv4_is_zeronet(daddr))
2193 goto martian_destination;
/* The following code tries to avoid calling IN_DEV_NET_ROUTE_LOCALNET(),
 * calling it at most once when daddr and/or saddr are loopback addresses.
2198 if (ipv4_is_loopback(daddr)) {
2199 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2200 goto martian_destination;
2201 } else if (ipv4_is_loopback(saddr)) {
2202 if (!IN_DEV_NET_ROUTE_LOCALNET(in_dev, net))
2203 goto martian_source;
 *	Now we are ready to route the packet.
2210 fl4.flowi4_iif = dev->ifindex;
2211 fl4.flowi4_mark = skb->mark;
2212 fl4.flowi4_tos = tos;
2213 fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
2214 fl4.flowi4_flags = 0;
2217 fl4.flowi4_uid = sock_net_uid(net, NULL);
2218 fl4.flowi4_multipath_hash = 0;
2220 if (fib4_rules_early_flow_dissect(net, skb, &fl4, &_flkeys)) {
2223 fl4.flowi4_proto = 0;
2228 err = fib_lookup(net, &fl4, res, 0);
2230 if (!IN_DEV_FORWARD(in_dev))
2231 err = -EHOSTUNREACH;
2235 if (res->type == RTN_BROADCAST) {
2236 if (IN_DEV_BFORWARD(in_dev))
/* do not cache if bc_forwarding is enabled */
2239 if (IPV4_DEVCONF_ALL(net, BC_FORWARDING))
2244 if (res->type == RTN_LOCAL) {
2245 err = fib_validate_source(skb, saddr, daddr, tos,
2246 0, dev, in_dev, &itag);
2248 goto martian_source;
2252 if (!IN_DEV_FORWARD(in_dev)) {
2253 err = -EHOSTUNREACH;
2256 if (res->type != RTN_UNICAST)
2257 goto martian_destination;
2260 err = ip_mkroute_input(skb, res, in_dev, daddr, saddr, tos, flkeys);
2264 if (skb->protocol != htons(ETH_P_IP))
2267 if (!ipv4_is_zeronet(saddr)) {
2268 err = fib_validate_source(skb, saddr, 0, tos, 0, dev,
2271 goto martian_source;
2273 flags |= RTCF_BROADCAST;
2274 res->type = RTN_BROADCAST;
2275 RT_CACHE_STAT_INC(in_brd);
2278 no_policy = IN_DEV_ORCONF(in_dev, NOPOLICY);
2280 IPCB(skb)->flags |= IPSKB_NOPOLICY;
2282 do_cache &= res->fi && !itag;
2284 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2286 rth = rcu_dereference(nhc->nhc_rth_input);
2287 if (rt_cache_valid(rth)) {
2288 skb_dst_set_noref(skb, &rth->dst);
2294 rth = rt_dst_alloc(ip_rt_get_dev(net, res),
2295 flags | RTCF_LOCAL, res->type,
rth->dst.output = ip_rt_bug;
2301 #ifdef CONFIG_IP_ROUTE_CLASSID
2302 rth->dst.tclassid = itag;
2304 rth->rt_is_input = 1;
2306 RT_CACHE_STAT_INC(in_slow_tot);
2307 if (res->type == RTN_UNREACHABLE) {
rth->dst.input = ip_error;
rth->dst.error = -err;
2310 rth->rt_flags &= ~RTCF_LOCAL;
2314 struct fib_nh_common *nhc = FIB_RES_NHC(*res);
2316 rth->dst.lwtstate = lwtstate_get(nhc->nhc_lwtstate);
2317 if (lwtunnel_input_redirect(rth->dst.lwtstate)) {
2318 WARN_ON(rth->dst.input == lwtunnel_input);
2319 rth->dst.lwtstate->orig_input = rth->dst.input;
2320 rth->dst.input = lwtunnel_input;
2323 if (unlikely(!rt_cache_route(nhc, rth)))
2324 rt_add_uncached_list(rth);
2326 skb_dst_set(skb, &rth->dst);
2331 RT_CACHE_STAT_INC(in_no_route);
2332 res->type = RTN_UNREACHABLE;
2338 * Do not cache martian addresses: they should be logged (RFC1812)
2340 martian_destination:
2341 RT_CACHE_STAT_INC(in_martian_dst);
2342 #ifdef CONFIG_IP_ROUTE_VERBOSE
2343 if (IN_DEV_LOG_MARTIANS(in_dev))
2344 net_warn_ratelimited("martian destination %pI4 from %pI4, dev %s\n",
2345 &daddr, &saddr, dev->name);
2357 ip_handle_martian_source(dev, in_dev, skb, daddr, saddr);
2361 int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2362 u8 tos, struct net_device *dev)
2364 struct fib_result res;
2367 tos &= IPTOS_RT_MASK;
2369 err = ip_route_input_rcu(skb, daddr, saddr, tos, dev, &res);
2374 EXPORT_SYMBOL(ip_route_input_noref);
2376 /* called with rcu_read_lock held */
2377 int ip_route_input_rcu(struct sk_buff *skb, __be32 daddr, __be32 saddr,
2378 u8 tos, struct net_device *dev, struct fib_result *res)
/* Multicast recognition logic is moved from route cache to here.
   The problem was that too many Ethernet cards have broken/missing
   hardware multicast filters :-( As a result, a host on a multicast
   network acquires a lot of useless route cache entries, sort of
   SDR messages from all the world. Now we try to get rid of them.
   Really, provided the software IP multicast filter is organized
   reasonably (at least, hashed), it does not result in a slowdown
   compared with route cache reject entries.
   Note that multicast routers are not affected, because a
   route cache entry is created eventually.
2391 if (ipv4_is_multicast(daddr)) {
2392 struct in_device *in_dev = __in_dev_get_rcu(dev);
2398 our = ip_check_mc_rcu(in_dev, daddr, saddr,
2399 ip_hdr(skb)->protocol);
2401 /* check l3 master if no match yet */
2402 if (!our && netif_is_l3_slave(dev)) {
2403 struct in_device *l3_in_dev;
2405 l3_in_dev = __in_dev_get_rcu(skb->dev);
2407 our = ip_check_mc_rcu(l3_in_dev, daddr, saddr,
2408 ip_hdr(skb)->protocol);
2412 #ifdef CONFIG_IP_MROUTE
2414 (!ipv4_is_local_multicast(daddr) &&
2415 IN_DEV_MFORWARD(in_dev))
2418 err = ip_route_input_mc(skb, daddr, saddr,
2424 return ip_route_input_slow(skb, daddr, saddr, tos, dev, res);
2427 /* called with rcu_read_lock() */
2428 static struct rtable *__mkroute_output(const struct fib_result *res,
2429 const struct flowi4 *fl4, int orig_oif,
2430 struct net_device *dev_out,
2433 struct fib_info *fi = res->fi;
2434 struct fib_nh_exception *fnhe;
2435 struct in_device *in_dev;
2436 u16 type = res->type;
2440 in_dev = __in_dev_get_rcu(dev_out);
2442 return ERR_PTR(-EINVAL);
2444 if (likely(!IN_DEV_ROUTE_LOCALNET(in_dev)))
2445 if (ipv4_is_loopback(fl4->saddr) &&
2446 !(dev_out->flags & IFF_LOOPBACK) &&
2447 !netif_is_l3_master(dev_out))
2448 return ERR_PTR(-EINVAL);
2450 if (ipv4_is_lbcast(fl4->daddr))
2451 type = RTN_BROADCAST;
2452 else if (ipv4_is_multicast(fl4->daddr))
2453 type = RTN_MULTICAST;
2454 else if (ipv4_is_zeronet(fl4->daddr))
2455 return ERR_PTR(-EINVAL);
2457 if (dev_out->flags & IFF_LOOPBACK)
2458 flags |= RTCF_LOCAL;
2461 if (type == RTN_BROADCAST) {
2462 flags |= RTCF_BROADCAST | RTCF_LOCAL;
2464 } else if (type == RTN_MULTICAST) {
2465 flags |= RTCF_MULTICAST | RTCF_LOCAL;
2466 if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr,
2468 flags &= ~RTCF_LOCAL;
/* If the multicast route does not exist, use the
 * default one, but do not use a gateway in this case.
2475 if (fi && res->prefixlen < 4)
	} else if ((type == RTN_LOCAL) && (orig_oif != 0) &&
		   (orig_oif != dev_out->ifindex)) {
		/* For local routes that require a particular output interface
		 * we do not want to cache the result.  Caching the result
		 * causes incorrect behaviour when there are multiple source
		 * addresses on the interface: if the intended recipient is
		 * waiting for the packet on that interface, it will never
		 * receive it, because the packet will be delivered on the
		 * loopback interface and the IP_PKTINFO ipi_ifindex will be
		 * set to the loopback interface as well.
		 */
	do_cache &= fi != NULL;
		struct fib_nh_common *nhc = FIB_RES_NHC(*res);
		struct rtable __rcu **prth;

		fnhe = find_exception(nhc, fl4->daddr);
			prth = &fnhe->fnhe_rth_output;
			if (unlikely(fl4->flowi4_flags &
				     FLOWI_FLAG_KNOWN_NH &&
				     !(nhc->nhc_gw_family &&
				       nhc->nhc_scope == RT_SCOPE_LINK))) {
			prth = raw_cpu_ptr(nhc->nhc_pcpu_rth_output);
		rth = rcu_dereference(*prth);
		if (rt_cache_valid(rth) && dst_hold_safe(&rth->dst))

	rth = rt_dst_alloc(dev_out, flags, type,
			   IN_DEV_ORCONF(in_dev, NOPOLICY),
			   IN_DEV_ORCONF(in_dev, NOXFRM));
		return ERR_PTR(-ENOBUFS);

	rth->rt_iif = orig_oif;

	RT_CACHE_STAT_INC(out_slow_tot);

	if (flags & (RTCF_BROADCAST | RTCF_MULTICAST)) {
		if (flags & RTCF_LOCAL &&
		    !(dev_out->flags & IFF_LOOPBACK)) {
			rth->dst.output = ip_mc_output;
			RT_CACHE_STAT_INC(out_slow_mc);
#ifdef CONFIG_IP_MROUTE
		if (type == RTN_MULTICAST) {
			if (IN_DEV_MFORWARD(in_dev) &&
			    !ipv4_is_local_multicast(fl4->daddr)) {
				rth->dst.input = ip_mr_input;
				rth->dst.output = ip_mc_output;

	rt_set_nexthop(rth, fl4->daddr, res, fnhe, fi, type, 0, do_cache);
	lwtunnel_set_redirect(&rth->dst);
/*
 * Major route resolver routine.
 */
struct rtable *ip_route_output_key_hash(struct net *net, struct flowi4 *fl4,
					const struct sk_buff *skb)
	struct fib_result res = {
	fl4->flowi4_iif = LOOPBACK_IFINDEX;

	rth = ip_route_output_key_hash_rcu(net, fl4, &res, skb);
EXPORT_SYMBOL_GPL(ip_route_output_key_hash);
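
/* Illustrative usage sketch (not part of the original file): most callers
 * reach this resolver through the ip_route_output_key() wrapper; the
 * field values below are example assumptions:
 *
 *	struct flowi4 fl4 = {
 *		.daddr		= daddr,
 *		.flowi4_tos	= RT_TOS(tos),
 *	};
 *	struct rtable *rt = ip_route_output_key(net, &fl4);
 *
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 *	...
 *	ip_rt_put(rt);	// drop the dst reference when done
 */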
struct rtable *ip_route_output_key_hash_rcu(struct net *net, struct flowi4 *fl4,
					    struct fib_result *res,
					    const struct sk_buff *skb)
	struct net_device *dev_out = NULL;
	int orig_oif = fl4->flowi4_oif;
	unsigned int flags = 0;

	if (ipv4_is_multicast(fl4->saddr) ||
	    ipv4_is_lbcast(fl4->saddr) ||
	    ipv4_is_zeronet(fl4->saddr)) {
		rth = ERR_PTR(-EINVAL);

	rth = ERR_PTR(-ENETUNREACH);
		/* I removed the check for oif == dev_out->oif here.
		   It was wrong for two reasons:
		   1. ip_dev_find(net, saddr) can return the wrong iface
		      if saddr is assigned to multiple interfaces.
		   2. Moreover, we are allowed to send packets with the
		      saddr of another iface. --ANK
		 */
		if (fl4->flowi4_oif == 0 &&
		    (ipv4_is_multicast(fl4->daddr) ||
		     ipv4_is_lbcast(fl4->daddr))) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			dev_out = __ip_dev_find(net, fl4->saddr, false);
			/* Special hack: the user can direct multicasts
			   and limited broadcasts via the necessary interface
			   without fiddling with IP_MULTICAST_IF or IP_PKTINFO.
			   This hack is not just for fun, it allows
			   vic, vat and friends to work.
			   They bind a socket to loopback, set ttl to zero
			   and expect that it will work.
			   From the viewpoint of the routing cache they are
			   broken, because we are not allowed to build a
			   multicast path with a loopback source address
			   (look: the routing cache cannot know that the ttl
			   is zero, so the packet will not leave this host
			   and the route is valid).
			   Luckily, this hack is a good workaround.
			 */

			fl4->flowi4_oif = dev_out->ifindex;
		if (!(fl4->flowi4_flags & FLOWI_FLAG_ANYSRC)) {
			/* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */
			if (!__ip_dev_find(net, fl4->saddr, false))
	if (fl4->flowi4_oif) {
		dev_out = dev_get_by_index_rcu(net, fl4->flowi4_oif);
		rth = ERR_PTR(-ENODEV);

		/* RACE: Check return value of inet_select_addr instead. */
		if (!(dev_out->flags & IFF_UP) || !__in_dev_get_rcu(dev_out)) {
			rth = ERR_PTR(-ENETUNREACH);
		if (ipv4_is_local_multicast(fl4->daddr) ||
		    ipv4_is_lbcast(fl4->daddr) ||
		    fl4->flowi4_proto == IPPROTO_IGMP) {
			fl4->saddr = inet_select_addr(dev_out, 0,
		if (ipv4_is_multicast(fl4->daddr))
			fl4->saddr = inet_select_addr(dev_out, 0,
		else if (!fl4->daddr)
			fl4->saddr = inet_select_addr(dev_out, 0,

		fl4->daddr = fl4->saddr;
		fl4->daddr = fl4->saddr = htonl(INADDR_LOOPBACK);
		dev_out = net->loopback_dev;
		fl4->flowi4_oif = LOOPBACK_IFINDEX;
		res->type = RTN_LOCAL;
		flags |= RTCF_LOCAL;

	err = fib_lookup(net, fl4, res, 0);

	if (fl4->flowi4_oif &&
	    (ipv4_is_multicast(fl4->daddr) ||
	     !netif_index_is_l3_master(net, fl4->flowi4_oif))) {
		/* Apparently, the routing tables are wrong. Assume
		   that the destination is on-link.

		   Why? Because we are allowed to send to an iface
		   even if it has NO routes and NO assigned
		   addresses. When oif is specified, the routing
		   tables are looked up with only one purpose:
		   to catch whether the destination is gatewayed,
		   rather than direct. Moreover, if MSG_DONTROUTE
		   is set, we send a packet ignoring both the routing
		   tables and the ifaddr state. --ANK

		   We could do the same even if oif is unknown,
		   likely IPv6, but we do not.
		 */
		if (fl4->saddr == 0)
			fl4->saddr = inet_select_addr(dev_out, 0,
		res->type = RTN_UNICAST;

	if (res->type == RTN_LOCAL) {
		if (res->fi->fib_prefsrc)
			fl4->saddr = res->fi->fib_prefsrc;
			fl4->saddr = fl4->daddr;

		/* L3 master device is the loopback for that domain */
		dev_out = l3mdev_master_dev_rcu(FIB_RES_DEV(*res)) ? :

		/* make sure orig_oif points to fib result device even
		 * though packet rx/tx happens over loopback or l3mdev
		 */
		orig_oif = FIB_RES_OIF(*res);

		fl4->flowi4_oif = dev_out->ifindex;
		flags |= RTCF_LOCAL;

	fib_select_path(net, res, fl4, skb);

	dev_out = FIB_RES_DEV(*res);

	rth = __mkroute_output(res, fl4, orig_oif, dev_out, flags);
static struct dst_ops ipv4_dst_blackhole_ops = {
	.default_advmss	= ipv4_default_advmss,
	.neigh_lookup	= ipv4_neigh_lookup,
	.check		= dst_blackhole_check,
	.cow_metrics	= dst_blackhole_cow_metrics,
	.update_pmtu	= dst_blackhole_update_pmtu,
	.redirect	= dst_blackhole_redirect,
	.mtu		= dst_blackhole_mtu,

struct dst_entry *ipv4_blackhole_route(struct net *net, struct dst_entry *dst_orig)
	struct rtable *ort = (struct rtable *) dst_orig;

	rt = dst_alloc(&ipv4_dst_blackhole_ops, NULL, 1, DST_OBSOLETE_DEAD, 0);
		struct dst_entry *new = &rt->dst;

		new->input = dst_discard;
		new->output = dst_discard_out;

		new->dev = net->loopback_dev;

		rt->rt_is_input = ort->rt_is_input;
		rt->rt_iif = ort->rt_iif;
		rt->rt_pmtu = ort->rt_pmtu;
		rt->rt_mtu_locked = ort->rt_mtu_locked;

		rt->rt_genid = rt_genid_ipv4(net);
		rt->rt_flags = ort->rt_flags;
		rt->rt_type = ort->rt_type;
		rt->rt_uses_gateway = ort->rt_uses_gateway;
		rt->rt_gw_family = ort->rt_gw_family;
		if (rt->rt_gw_family == AF_INET)
			rt->rt_gw4 = ort->rt_gw4;
		else if (rt->rt_gw_family == AF_INET6)
			rt->rt_gw6 = ort->rt_gw6;

		INIT_LIST_HEAD(&rt->rt_uncached);

	dst_release(dst_orig);

	return rt ? &rt->dst : ERR_PTR(-ENOMEM);
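
/* The blackhole dst built above is used by the xfrm (IPsec) layer: when a
 * lookup cannot complete yet (e.g. an SA is still being negotiated),
 * xfrm_lookup() may substitute it for the real route, so that packets are
 * silently discarded via dst_discard{,_out} instead of leaving the host
 * unprotected. A hedged sketch of the substitution:
 *
 *	dst = ipv4_blackhole_route(net, dst_orig);
 */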
struct rtable *ip_route_output_flow(struct net *net, struct flowi4 *flp4,
				    const struct sock *sk)
	struct rtable *rt = __ip_route_output_key(net, flp4);

	if (flp4->flowi4_proto) {
		flp4->flowi4_oif = rt->dst.dev->ifindex;
		rt = (struct rtable *)xfrm_lookup_route(net, &rt->dst,
							flowi4_to_flowi(flp4),
EXPORT_SYMBOL_GPL(ip_route_output_flow);
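
/* Illustrative usage sketch (not part of the original file): connection
 * oriented code, e.g. ip_route_connect(), ends up here so that the route
 * is passed through the xfrm layer; a minimal direct call could look like
 * this, with oif/tos/addresses/ports as assumed example values:
 *
 *	flowi4_init_output(&fl4, oif, sk->sk_mark, tos, RT_SCOPE_UNIVERSE,
 *			   IPPROTO_TCP, 0, daddr, saddr, dport, sport,
 *			   sk->sk_uid);
 *	rt = ip_route_output_flow(net, &fl4, sk);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 */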
struct rtable *ip_route_output_tunnel(struct sk_buff *skb,
				      struct net_device *dev,
				      struct net *net, __be32 *saddr,
				      const struct ip_tunnel_info *info,
				      u8 protocol, bool use_cache)
#ifdef CONFIG_DST_CACHE
	struct dst_cache *dst_cache;
#endif
	struct rtable *rt = NULL;

#ifdef CONFIG_DST_CACHE
	dst_cache = (struct dst_cache *)&info->dst_cache;
		rt = dst_cache_get_ip4(dst_cache, saddr);
#endif

	memset(&fl4, 0, sizeof(fl4));
	fl4.flowi4_mark = skb->mark;
	fl4.flowi4_proto = protocol;
	fl4.daddr = info->key.u.ipv4.dst;
	fl4.saddr = info->key.u.ipv4.src;
	tos = info->key.tos;
	fl4.flowi4_tos = RT_TOS(tos);

	rt = ip_route_output_key(net, &fl4);
		netdev_dbg(dev, "no route to %pI4\n", &fl4.daddr);
		return ERR_PTR(-ENETUNREACH);
	if (rt->dst.dev == dev) { /* is this necessary? */
		netdev_dbg(dev, "circular route to %pI4\n", &fl4.daddr);
		return ERR_PTR(-ELOOP);
#ifdef CONFIG_DST_CACHE
		dst_cache_set_ip4(dst_cache, &rt->dst, fl4.saddr);
#endif
EXPORT_SYMBOL_GPL(ip_route_output_tunnel);
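
/* Illustrative usage sketch (not part of the original file): UDP tunnel
 * drivers such as bareudp call this helper on transmit with the metadata
 * dst attached to the skb; the variable names are assumptions:
 *
 *	struct ip_tunnel_info *info = skb_tunnel_info(skb);
 *	__be32 saddr;
 *	struct rtable *rt;
 *
 *	rt = ip_route_output_tunnel(skb, dev, net, &saddr, info,
 *				    IPPROTO_UDP, use_cache);
 *	if (IS_ERR(rt))
 *		return PTR_ERR(rt);
 */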
/* called with rcu_read_lock held */
static int rt_fill_info(struct net *net, __be32 dst, __be32 src,
			struct rtable *rt, u32 table_id, struct flowi4 *fl4,
			struct sk_buff *skb, u32 portid, u32 seq,
	struct nlmsghdr *nlh;
	unsigned long expires = 0;
	u32 metrics[RTAX_MAX];

	nlh = nlmsg_put(skb, portid, seq, RTM_NEWROUTE, sizeof(*r), flags);

	r = nlmsg_data(nlh);
	r->rtm_family	= AF_INET;
	r->rtm_dst_len	= 32;
	r->rtm_tos	= fl4 ? fl4->flowi4_tos : 0;
	r->rtm_table	= table_id < 256 ? table_id : RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, table_id))
		goto nla_put_failure;
	r->rtm_type	= rt->rt_type;
	r->rtm_scope	= RT_SCOPE_UNIVERSE;
	r->rtm_protocol = RTPROT_UNSPEC;
	r->rtm_flags	= (rt->rt_flags & ~0xFFFF) | RTM_F_CLONED;
	if (rt->rt_flags & RTCF_NOTIFY)
		r->rtm_flags |= RTM_F_NOTIFY;
	if (IPCB(skb)->flags & IPSKB_DOREDIRECT)
		r->rtm_flags |= RTCF_DOREDIRECT;

	if (nla_put_in_addr(skb, RTA_DST, dst))
		goto nla_put_failure;
		r->rtm_src_len = 32;
		if (nla_put_in_addr(skb, RTA_SRC, src))
			goto nla_put_failure;
	    nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
		goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
	if (rt->dst.tclassid &&
	    nla_put_u32(skb, RTA_FLOW, rt->dst.tclassid))
		goto nla_put_failure;
#endif
	if (fl4 && !rt_is_input_route(rt) &&
	    fl4->saddr != src) {
		if (nla_put_in_addr(skb, RTA_PREFSRC, fl4->saddr))
			goto nla_put_failure;
	if (rt->rt_uses_gateway) {
		if (rt->rt_gw_family == AF_INET &&
		    nla_put_in_addr(skb, RTA_GATEWAY, rt->rt_gw4)) {
			goto nla_put_failure;
		} else if (rt->rt_gw_family == AF_INET6) {
			int alen = sizeof(struct in6_addr);

			nla = nla_reserve(skb, RTA_VIA, alen + 2);
				goto nla_put_failure;

			via = nla_data(nla);
			via->rtvia_family = AF_INET6;
			memcpy(via->rtvia_addr, &rt->rt_gw6, alen);

	expires = rt->dst.expires;
		unsigned long now = jiffies;

		if (time_before(now, expires))

	memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
	if (rt->rt_pmtu && expires)
		metrics[RTAX_MTU - 1] = rt->rt_pmtu;
	if (rt->rt_mtu_locked && expires)
		metrics[RTAX_LOCK - 1] |= BIT(RTAX_MTU);
	if (rtnetlink_put_metrics(skb, metrics) < 0)
		goto nla_put_failure;

		if (fl4->flowi4_mark &&
		    nla_put_u32(skb, RTA_MARK, fl4->flowi4_mark))
			goto nla_put_failure;

		if (!uid_eq(fl4->flowi4_uid, INVALID_UID) &&
		    nla_put_u32(skb, RTA_UID,
				from_kuid_munged(current_user_ns(),
			goto nla_put_failure;

		if (rt_is_input_route(rt)) {
#ifdef CONFIG_IP_MROUTE
			if (ipv4_is_multicast(dst) &&
			    !ipv4_is_local_multicast(dst) &&
			    IPV4_DEVCONF_ALL(net, MC_FORWARDING)) {
				int err = ipmr_get_route(net, skb,
							 fl4->saddr, fl4->daddr,
					goto nla_put_failure;
#endif
			if (nla_put_u32(skb, RTA_IIF, fl4->flowi4_iif))
				goto nla_put_failure;

	error = rt->dst.error;

	if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, error) < 0)
		goto nla_put_failure;

	nlmsg_end(skb, nlh);

	nlmsg_cancel(skb, nlh);
static int fnhe_dump_bucket(struct net *net, struct sk_buff *skb,
			    struct netlink_callback *cb, u32 table_id,
			    struct fnhe_hash_bucket *bucket, int genid,
			    int *fa_index, int fa_start, unsigned int flags)
	for (i = 0; i < FNHE_HASH_SIZE; i++) {
		struct fib_nh_exception *fnhe;

		for (fnhe = rcu_dereference(bucket[i].chain); fnhe;
		     fnhe = rcu_dereference(fnhe->fnhe_next)) {
			if (*fa_index < fa_start)
			if (fnhe->fnhe_genid != genid)
			if (fnhe->fnhe_expires &&
			    time_after(jiffies, fnhe->fnhe_expires))
			rt = rcu_dereference(fnhe->fnhe_rth_input);
				rt = rcu_dereference(fnhe->fnhe_rth_output);
			err = rt_fill_info(net, fnhe->fnhe_daddr, 0, rt,
					   table_id, NULL, skb,
					   NETLINK_CB(cb->skb).portid,
					   cb->nlh->nlmsg_seq, flags);
int fib_dump_info_fnhe(struct sk_buff *skb, struct netlink_callback *cb,
		       u32 table_id, struct fib_info *fi,
		       int *fa_index, int fa_start, unsigned int flags)
	struct net *net = sock_net(cb->skb->sk);
	int nhsel, genid = fnhe_genid(net);

	for (nhsel = 0; nhsel < fib_info_num_path(fi); nhsel++) {
		struct fib_nh_common *nhc = fib_info_nhc(fi, nhsel);
		struct fnhe_hash_bucket *bucket;

		if (nhc->nhc_flags & RTNH_F_DEAD)
		bucket = rcu_dereference(nhc->nhc_exceptions);
		err = fnhe_dump_bucket(net, skb, cb, table_id, bucket,
				       genid, fa_index, fa_start,
static struct sk_buff *inet_rtm_getroute_build_skb(__be32 src, __be32 dst,
						   u8 ip_proto, __be16 sport,
	struct sk_buff *skb;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);

	/* Reserve room for dummy headers; this skb can pass through a
	 * good chunk of the routing engine.
	 */
	skb_reset_mac_header(skb);
	skb_reset_network_header(skb);
	skb->protocol = htons(ETH_P_IP);
	iph = skb_put(skb, sizeof(struct iphdr));
	iph->protocol = ip_proto;
	skb_set_transport_header(skb, skb->len);

	switch (iph->protocol) {
		struct udphdr *udph;

		udph = skb_put_zero(skb, sizeof(struct udphdr));
		udph->source = sport;
		udph->len = htons(sizeof(struct udphdr));
		struct tcphdr *tcph;

		tcph = skb_put_zero(skb, sizeof(struct tcphdr));
		tcph->source	= sport;
		tcph->doff	= sizeof(struct tcphdr) / 4;
		tcph->check = ~tcp_v4_check(sizeof(struct tcphdr),
	case IPPROTO_ICMP: {
		struct icmphdr *icmph;

		icmph = skb_put_zero(skb, sizeof(struct icmphdr));
		icmph->type = ICMP_ECHO;
static int inet_rtm_valid_getroute_req(struct sk_buff *skb,
				       const struct nlmsghdr *nlh,
				       struct netlink_ext_ack *extack)
	if (nlh->nlmsg_len < nlmsg_msg_size(sizeof(*rtm))) {
		NL_SET_ERR_MSG(extack,
			       "ipv4: Invalid header for route get request");

	if (!netlink_strict_get_check(skb))
		return nlmsg_parse_deprecated(nlh, sizeof(*rtm), tb, RTA_MAX,
					      rtm_ipv4_policy, extack);

	rtm = nlmsg_data(nlh);
	if ((rtm->rtm_src_len && rtm->rtm_src_len != 32) ||
	    (rtm->rtm_dst_len && rtm->rtm_dst_len != 32) ||
	    rtm->rtm_table || rtm->rtm_protocol ||
	    rtm->rtm_scope || rtm->rtm_type) {
		NL_SET_ERR_MSG(extack, "ipv4: Invalid values in header for route get request");

	if (rtm->rtm_flags & ~(RTM_F_NOTIFY |
			       RTM_F_LOOKUP_TABLE |
		NL_SET_ERR_MSG(extack, "ipv4: Unsupported rtm_flags for route get request");

	err = nlmsg_parse_deprecated_strict(nlh, sizeof(*rtm), tb, RTA_MAX,
					    rtm_ipv4_policy, extack);

	if ((tb[RTA_SRC] && !rtm->rtm_src_len) ||
	    (tb[RTA_DST] && !rtm->rtm_dst_len)) {
		NL_SET_ERR_MSG(extack, "ipv4: rtm_src_len and rtm_dst_len must be 32 for IPv4");

	for (i = 0; i <= RTA_MAX; i++) {
		NL_SET_ERR_MSG(extack, "ipv4: Unsupported attribute in route get request");
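
/* Doit handler for RTM_GETROUTE. This is what services a userspace
 * "ip route get" request, e.g. (illustrative):
 *
 *	$ ip route get 192.0.2.1 from 198.51.100.2 iif eth0
 *
 * The request is validated, a dummy skb is built so that the query can be
 * run through the real input/output lookup paths, and the result is
 * returned to the caller as an RTM_NEWROUTE message.
 */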
static int inet_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
			     struct netlink_ext_ack *extack)
	struct net *net = sock_net(in_skb->sk);
	struct nlattr *tb[RTA_MAX + 1];
	u32 table_id = RT_TABLE_MAIN;
	__be16 sport = 0, dport = 0;
	struct fib_result res = {};
	u8 ip_proto = IPPROTO_UDP;
	struct rtable *rt = NULL;
	struct sk_buff *skb;
	struct flowi4 fl4 = {};

	err = inet_rtm_valid_getroute_req(in_skb, nlh, tb, extack);

	rtm = nlmsg_data(nlh);
	src = tb[RTA_SRC] ? nla_get_in_addr(tb[RTA_SRC]) : 0;
	dst = tb[RTA_DST] ? nla_get_in_addr(tb[RTA_DST]) : 0;
	iif = tb[RTA_IIF] ? nla_get_u32(tb[RTA_IIF]) : 0;
	mark = tb[RTA_MARK] ? nla_get_u32(tb[RTA_MARK]) : 0;
		uid = make_kuid(current_user_ns(), nla_get_u32(tb[RTA_UID]));
		uid = (iif ? INVALID_UID : current_uid());

	if (tb[RTA_IP_PROTO]) {
		err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
						  &ip_proto, AF_INET, extack);
		sport = nla_get_be16(tb[RTA_SPORT]);
		dport = nla_get_be16(tb[RTA_DPORT]);

	skb = inet_rtm_getroute_build_skb(src, dst, ip_proto, sport, dport);

	fl4.flowi4_tos = rtm->rtm_tos & IPTOS_RT_MASK;
	fl4.flowi4_oif = tb[RTA_OIF] ? nla_get_u32(tb[RTA_OIF]) : 0;
	fl4.flowi4_mark = mark;
	fl4.flowi4_uid = uid;
	fl4.fl4_sport = sport;
	fl4.fl4_dport = dport;
	fl4.flowi4_proto = ip_proto;

		struct net_device *dev;

		dev = dev_get_by_index_rcu(net, iif);
		fl4.flowi4_iif = iif; /* for rt_fill_info */
		err = ip_route_input_rcu(skb, dst, src,
					 rtm->rtm_tos & IPTOS_RT_MASK, dev,
		rt = skb_rtable(skb);
		if (err == 0 && rt->dst.error)
			err = -rt->dst.error;
		fl4.flowi4_iif = LOOPBACK_IFINDEX;
		skb->dev = net->loopback_dev;
		rt = ip_route_output_key_hash_rcu(net, &fl4, &res, skb);
			skb_dst_set(skb, &rt->dst);

	if (rtm->rtm_flags & RTM_F_NOTIFY)
		rt->rt_flags |= RTCF_NOTIFY;

	if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
		table_id = res.table ? res.table->tb_id : 0;

	/* reset skb for netlink reply msg */
	skb_reset_network_header(skb);
	skb_reset_transport_header(skb);
	skb_reset_mac_header(skb);

	if (rtm->rtm_flags & RTM_F_FIB_MATCH) {
		struct fib_rt_info fri;

			err = fib_props[res.type].error;
				err = -EHOSTUNREACH;

		fri.tb_id = table_id;
		fri.dst = res.prefix;
		fri.dst_len = res.prefixlen;
		fri.tos = fl4.flowi4_tos;
		fri.type = rt->rt_type;
			struct fib_alias *fa;

			hlist_for_each_entry_rcu(fa, res.fa_head, fa_list) {
				u8 slen = 32 - fri.dst_len;

				if (fa->fa_slen == slen &&
				    fa->tb_id == fri.tb_id &&
				    fa->fa_tos == fri.tos &&
				    fa->fa_info == res.fi &&
				    fa->fa_type == fri.type) {
					fri.offload = fa->offload;
					fri.trap = fa->trap;
		err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
				    nlh->nlmsg_seq, RTM_NEWROUTE, &fri, 0);
		err = rt_fill_info(net, dst, src, rt, table_id, &fl4, skb,
				   NETLINK_CB(in_skb).portid,

	err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
void ip_rt_multicast_event(struct in_device *in_dev)
	rt_cache_flush(dev_net(in_dev->dev));

#ifdef CONFIG_SYSCTL
static int ip_rt_gc_interval __read_mostly	= 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly	= HZ / 2;
static int ip_rt_gc_elasticity __read_mostly	= 8;
static int ip_min_valid_pmtu __read_mostly	= IPV4_MIN_MTU;

static int ipv4_sysctl_rtcache_flush(struct ctl_table *__ctl, int write,
				     void *buffer, size_t *lenp, loff_t *ppos)
	struct net *net = (struct net *)__ctl->extra1;

		rt_cache_flush(net);
		fnhe_genid_bump(net);
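
/* The knobs below appear under /proc/sys/net/ipv4/route/. Unlike the
 * per-netns "flush" entry further down, this table is registered once for
 * init_net (see ip_static_sysctl_init() at the end of this file), so the
 * values are global.
 */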
static struct ctl_table ipv4_route_table[] = {
		.procname	= "gc_thresh",
		.data		= &ipv4_dst_ops.gc_thresh,
		.maxlen		= sizeof(int),
		.proc_handler	= proc_dointvec,
		.procname	= "max_size",
		.data		= &ip_rt_max_size,
		.maxlen		= sizeof(int),
		.proc_handler	= proc_dointvec,
	/* Deprecated. Use gc_min_interval_ms */
		.procname	= "gc_min_interval",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.proc_handler	= proc_dointvec_jiffies,
		.procname	= "gc_min_interval_ms",
		.data		= &ip_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.proc_handler	= proc_dointvec_ms_jiffies,
		.procname	= "gc_timeout",
		.data		= &ip_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.proc_handler	= proc_dointvec_jiffies,
		.procname	= "gc_interval",
		.data		= &ip_rt_gc_interval,
		.maxlen		= sizeof(int),
		.proc_handler	= proc_dointvec_jiffies,
		.procname	= "redirect_load",
		.data		= &ip_rt_redirect_load,
		.maxlen		= sizeof(int),
		.proc_handler	= proc_dointvec,
		.procname	= "redirect_number",
		.data		= &ip_rt_redirect_number,
		.maxlen		= sizeof(int),
		.proc_handler	= proc_dointvec,
		.procname	= "redirect_silence",
		.data		= &ip_rt_redirect_silence,
		.maxlen		= sizeof(int),
		.proc_handler	= proc_dointvec,
		.procname	= "error_cost",
		.data		= &ip_rt_error_cost,
		.maxlen		= sizeof(int),
		.proc_handler	= proc_dointvec,
		.procname	= "error_burst",
		.data		= &ip_rt_error_burst,
		.maxlen		= sizeof(int),
		.proc_handler	= proc_dointvec,
		.procname	= "gc_elasticity",
		.data		= &ip_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.proc_handler	= proc_dointvec,
		.procname	= "mtu_expires",
		.data		= &ip_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.proc_handler	= proc_dointvec_jiffies,
		.procname	= "min_pmtu",
		.data		= &ip_rt_min_pmtu,
		.maxlen		= sizeof(int),
		.proc_handler	= proc_dointvec_minmax,
		.extra1		= &ip_min_valid_pmtu,
		.procname	= "min_adv_mss",
		.data		= &ip_rt_min_advmss,
		.maxlen		= sizeof(int),
		.proc_handler	= proc_dointvec,
static const char ipv4_route_flush_procname[] = "flush";
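
/* Writing any value to /proc/sys/net/ipv4/route/flush invalidates the
 * cached routes of the netns via ipv4_sysctl_rtcache_flush() above, e.g.
 * (illustrative):
 *
 *	# echo 1 > /proc/sys/net/ipv4/route/flush
 */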
static struct ctl_table ipv4_route_flush_table[] = {
		.procname	= ipv4_route_flush_procname,
		.maxlen		= sizeof(int),
		.proc_handler	= ipv4_sysctl_rtcache_flush,
static __net_init int sysctl_route_net_init(struct net *net)
	struct ctl_table *tbl;

	tbl = ipv4_route_flush_table;
	if (!net_eq(net, &init_net)) {
		tbl = kmemdup(tbl, sizeof(ipv4_route_flush_table), GFP_KERNEL);

		/* Don't export non-whitelisted sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns) {
			if (tbl[0].procname != ipv4_route_flush_procname)
				tbl[0].procname = NULL;

	tbl[0].extra1 = net;

	net->ipv4.route_hdr = register_net_sysctl(net, "net/ipv4/route", tbl);
	if (!net->ipv4.route_hdr)

	if (tbl != ipv4_route_flush_table)

static __net_exit void sysctl_route_net_exit(struct net *net)
	struct ctl_table *tbl;

	tbl = net->ipv4.route_hdr->ctl_table_arg;
	unregister_net_sysctl_table(net->ipv4.route_hdr);
	BUG_ON(tbl == ipv4_route_flush_table);

static __net_initdata struct pernet_operations sysctl_route_ops = {
	.init = sysctl_route_net_init,
	.exit = sysctl_route_net_exit,
static __net_init int rt_genid_init(struct net *net)
	atomic_set(&net->ipv4.rt_genid, 0);
	atomic_set(&net->fnhe_genid, 0);
	atomic_set(&net->ipv4.dev_addr_genid, get_random_int());

static __net_initdata struct pernet_operations rt_genid_ops = {
	.init = rt_genid_init,

static int __net_init ipv4_inetpeer_init(struct net *net)
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	inet_peer_base_init(bp);
	net->ipv4.peers = bp;

static void __net_exit ipv4_inetpeer_exit(struct net *net)
	struct inet_peer_base *bp = net->ipv4.peers;

	net->ipv4.peers = NULL;
	inetpeer_invalidate_tree(bp);

static __net_initdata struct pernet_operations ipv4_inetpeer_ops = {
	.init	= ipv4_inetpeer_init,
	.exit	= ipv4_inetpeer_exit,

#ifdef CONFIG_IP_ROUTE_CLASSID
struct ip_rt_acct __percpu *ip_rt_acct __read_mostly;
#endif /* CONFIG_IP_ROUTE_CLASSID */
int __init ip_rt_init(void)
	/* For modern hosts, this will use 2 MB of memory */
	idents_hash = alloc_large_system_hash("IP idents",
					      sizeof(*ip_idents) + sizeof(*ip_tstamps),
					      16, /* one bucket per 64 KB */

	ip_idents = idents_hash;

	prandom_bytes(ip_idents, (ip_idents_mask + 1) * sizeof(*ip_idents));

	ip_tstamps = idents_hash + (ip_idents_mask + 1) * sizeof(*ip_idents);

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = &per_cpu(rt_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
#ifdef CONFIG_IP_ROUTE_CLASSID
	ip_rt_acct = __alloc_percpu(256 * sizeof(struct ip_rt_acct), __alignof__(struct ip_rt_acct));
		panic("IP: failed to allocate ip_rt_acct\n");
#endif

	ipv4_dst_ops.kmem_cachep =
		kmem_cache_create("ip_dst_cache", sizeof(struct rtable), 0,
				  SLAB_HWCACHE_ALIGN|SLAB_PANIC, NULL);

	ipv4_dst_blackhole_ops.kmem_cachep = ipv4_dst_ops.kmem_cachep;

	if (dst_entries_init(&ipv4_dst_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_ops counter\n");

	if (dst_entries_init(&ipv4_dst_blackhole_ops) < 0)
		panic("IP: failed to allocate ipv4_dst_blackhole_ops counter\n");

	ipv4_dst_ops.gc_thresh = ~0;
	ip_rt_max_size = INT_MAX;

	if (ip_rt_proc_init())
		pr_err("Unable to create route proc files\n");

	rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
		      RTNL_FLAG_DOIT_UNLOCKED);
#ifdef CONFIG_SYSCTL
	register_pernet_subsys(&sysctl_route_ops);
#endif
	register_pernet_subsys(&rt_genid_ops);
	register_pernet_subsys(&ipv4_inetpeer_ops);
#ifdef CONFIG_SYSCTL
/*
 * We really need to sanitize the damn ipv4 init order, then all
 * this nonsense will go away.
 */
void __init ip_static_sysctl_init(void)
	register_net_sysctl(&init_net, "net/ipv4/route", ipv4_route_table);