2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #define pr_fmt(fmt) "IPv6: " fmt
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
55 #include <linux/rtnetlink.h>
57 #include <net/dst_metadata.h>
59 #include <net/netevent.h>
60 #include <net/netlink.h>
61 #include <net/nexthop.h>
62 #include <net/lwtunnel.h>
63 #include <net/ip_tunnels.h>
64 #include <net/l3mdev.h>
65 #include <trace/events/fib6.h>
67 #include <linux/uaccess.h>
70 #include <linux/sysctl.h>
/* Neighbour-unreachability scores returned by rt6_check_neigh()/
 * rt6_score_route(); negative values are failure modes (enum header
 * elided in this excerpt — presumably enum rt6_nud_state).
 */
74 RT6_NUD_FAIL_HARD = -3,
75 RT6_NUD_FAIL_PROBE = -2,
76 RT6_NUD_FAIL_DO_RR = -1,
/* Forward declarations for the dst_ops callbacks and helpers defined
 * later in this file.
 */
80 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
81 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
82 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
83 static unsigned int ip6_mtu(const struct dst_entry *dst);
84 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
85 static void ip6_dst_destroy(struct dst_entry *);
86 static void ip6_dst_ifdown(struct dst_entry *,
87 struct net_device *dev, int how);
88 static int ip6_dst_gc(struct dst_ops *ops);
/* Packet handlers installed on the REJECT/PROHIBIT template routes. */
90 static int ip6_pkt_discard(struct sk_buff *skb);
91 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
92 static int ip6_pkt_prohibit(struct sk_buff *skb);
93 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
94 static void ip6_link_failure(struct sk_buff *skb);
95 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
96 struct sk_buff *skb, u32 mtu,
98 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
100 static void rt6_dst_from_metrics_check(struct rt6_info *rt);
101 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
102 static size_t rt6_nlmsg_size(struct rt6_info *rt);
103 static int rt6_fill_node(struct net *net,
104 struct sk_buff *skb, struct rt6_info *rt,
105 struct in6_addr *dst, struct in6_addr *src,
106 int iif, int type, u32 portid, u32 seq,
/* RFC 4191 Route Information option support (see rt6_route_rcv()). */
109 #ifdef CONFIG_IPV6_ROUTE_INFO
110 static struct rt6_info *rt6_add_route_info(struct net *net,
111 const struct in6_addr *prefix, int prefixlen,
112 const struct in6_addr *gwaddr,
113 struct net_device *dev,
115 static struct rt6_info *rt6_get_route_info(struct net *net,
116 const struct in6_addr *prefix, int prefixlen,
117 const struct in6_addr *gwaddr,
118 struct net_device *dev);
/* Per-CPU list of RTF_CACHE routes that live outside the fib6 tree;
 * entries are flushed when their device goes away
 * (rt6_uncached_list_flush_dev()).  Lock field elided in this excerpt.
 */
121 struct uncached_list {
123 struct list_head head;
126 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
/* Add @rt to this CPU's uncached list and remember which list it went
 * on, so rt6_uncached_list_del() can find it from any CPU later.
 */
128 static void rt6_uncached_list_add(struct rt6_info *rt)
130 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
132 rt->rt6i_uncached_list = ul;
/* _bh: the list is also touched from softirq context. */
134 spin_lock_bh(&ul->lock);
135 list_add_tail(&rt->rt6i_uncached, &ul->head);
136 spin_unlock_bh(&ul->lock);
/* Unlink @rt from the uncached list it was added to, if any.
 * list_empty() makes this safe to call on routes that were never added.
 */
139 static void rt6_uncached_list_del(struct rt6_info *rt)
141 if (!list_empty(&rt->rt6i_uncached)) {
142 struct uncached_list *ul = rt->rt6i_uncached_list;
144 spin_lock_bh(&ul->lock);
145 list_del(&rt->rt6i_uncached);
146 spin_unlock_bh(&ul->lock);
/* Device teardown: walk every CPU's uncached list and re-point routes
 * that still reference @dev at the netns loopback device, so the device
 * refcount can drop to zero and the device can be unregistered.
 */
150 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
152 struct net_device *loopback_dev = net->loopback_dev;
/* Nothing to migrate when the dying device is loopback itself. */
155 if (dev == loopback_dev)
158 for_each_possible_cpu(cpu) {
159 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
162 spin_lock_bh(&ul->lock);
163 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
164 struct inet6_dev *rt_idev = rt->rt6i_idev;
165 struct net_device *rt_dev = rt->dst.dev;
/* Swap the inet6_dev reference over to loopback. */
167 if (rt_idev->dev == dev) {
168 rt->rt6i_idev = in6_dev_get(loopback_dev);
169 in6_dev_put(rt_idev);
/* Swap the dst's device reference as well (old-dev put
 * presumably elided in this excerpt).
 */
173 rt->dst.dev = loopback_dev;
174 dev_hold(rt->dst.dev);
178 spin_unlock_bh(&ul->lock);
/* Per-CPU clones share metrics with the route they were cloned from:
 * copy-on-write against dst.from's metrics rather than our own.
 */
182 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
184 return dst_metrics_write_ptr(rt->dst.from);
/* dst_ops->cow_metrics: pick the metrics CoW strategy by route type.
 * RTF_PCPU clones write through to their parent; everything else falls
 * back to the generic dst CoW (RTF_CACHE branch body elided here).
 */
187 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
189 struct rt6_info *rt = (struct rt6_info *)dst;
191 if (rt->rt6i_flags & RTF_PCPU)
192 return rt6_pcpu_cow_metrics(rt);
193 else if (rt->rt6i_flags & RTF_CACHE)
196 return dst_cow_metrics_generic(dst, old);
/* Pick the address to resolve for this route: the gateway when one is
 * set, otherwise the packet's own destination address.
 */
199 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
203 struct in6_addr *p = &rt->rt6i_gateway;
205 if (!ipv6_addr_any(p))
206 return (const void *) p;
208 return &ipv6_hdr(skb)->daddr;
/* dst_ops->neigh_lookup: find (or create) the ND neighbour entry for the
 * next hop chosen by choose_neigh_daddr().
 */
212 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
216 struct rt6_info *rt = (struct rt6_info *) dst;
219 daddr = choose_neigh_daddr(rt, skb, daddr);
220 n = __ipv6_neigh_lookup(dst->dev, daddr);
223 return neigh_create(&nd_tbl, daddr, dst->dev);
/* dst_ops->confirm_neigh: mark the next-hop neighbour as recently
 * confirmed.  Skipped for devices with no neighbour resolution and for
 * multicast destinations, which have no unicast neighbour entry.
 */
226 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
228 struct net_device *dev = dst->dev;
229 struct rt6_info *rt = (struct rt6_info *)dst;
231 daddr = choose_neigh_daddr(rt, NULL, daddr);
234 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
236 if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
238 __ipv6_confirm_neigh(dev, daddr);
/* dst_ops used for normal IPv6 routes; copied into each netns
 * (net->ipv6.ip6_dst_ops).  Wires the callbacks defined in this file.
 */
241 static struct dst_ops ip6_dst_ops_template = {
245 .check = ip6_dst_check,
246 .default_advmss = ip6_default_advmss,
248 .cow_metrics = ipv6_cow_metrics,
249 .destroy = ip6_dst_destroy,
250 .ifdown = ip6_dst_ifdown,
251 .negative_advice = ip6_negative_advice,
252 .link_failure = ip6_link_failure,
253 .update_pmtu = ip6_rt_update_pmtu,
254 .redirect = rt6_do_redirect,
255 .local_out = __ip6_local_out,
256 .neigh_lookup = ip6_neigh_lookup,
257 .confirm_neigh = ip6_confirm_neigh,
/* Blackhole dsts: report the stored RTAX_MTU metric, falling back to the
 * device MTU when no metric is set.
 */
260 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
262 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
264 return mtu ? : dst->dev->mtu;
/* Blackhole routes ignore PMTU updates and redirects (empty bodies
 * elided in this excerpt).
 */
267 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
268 struct sk_buff *skb, u32 mtu,
273 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
/* dst_ops for blackhole dsts created by ip6_blackhole_route(); mostly
 * shares the normal callbacks but uses the no-op pmtu/redirect handlers.
 */
278 static struct dst_ops ip6_dst_blackhole_ops = {
280 .destroy = ip6_dst_destroy,
281 .check = ip6_dst_check,
282 .mtu = ip6_blackhole_mtu,
283 .default_advmss = ip6_default_advmss,
284 .update_pmtu = ip6_rt_blackhole_update_pmtu,
285 .redirect = ip6_rt_blackhole_redirect,
286 .cow_metrics = dst_cow_metrics_generic,
287 .neigh_lookup = ip6_neigh_lookup,
/* Shared metrics array for the template routes below. */
290 static const u32 ip6_template_metrics[RTAX_MAX] = {
291 [RTAX_HOPLIMIT - 1] = 0,
/* Template for the per-netns null route: lookups that match nothing
 * return a copy of this entry, which drops packets and reports
 * -ENETUNREACH.  Never freed (refcounts start at 1).
 */
294 static const struct rt6_info ip6_null_entry_template = {
296 .__refcnt = ATOMIC_INIT(1),
298 .obsolete = DST_OBSOLETE_FORCE_CHK,
299 .error = -ENETUNREACH,
300 .input = ip6_pkt_discard,
301 .output = ip6_pkt_discard_out,
303 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
304 .rt6i_protocol = RTPROT_KERNEL,
/* Worst possible metric so any real route wins. */
305 .rt6i_metric = ~(u32) 0,
306 .rt6i_ref = ATOMIC_INIT(1),
309 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
/* Policy-routing "prohibit" action: like the null entry but sends an
 * administratively-prohibited error via ip6_pkt_prohibit*().
 */
311 static const struct rt6_info ip6_prohibit_entry_template = {
313 .__refcnt = ATOMIC_INIT(1),
315 .obsolete = DST_OBSOLETE_FORCE_CHK,
317 .input = ip6_pkt_prohibit,
318 .output = ip6_pkt_prohibit_out,
320 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
321 .rt6i_protocol = RTPROT_KERNEL,
322 .rt6i_metric = ~(u32) 0,
323 .rt6i_ref = ATOMIC_INIT(1),
/* Policy-routing "blackhole" action: silently discard in both
 * directions, no error generated.
 */
326 static const struct rt6_info ip6_blk_hole_entry_template = {
328 .__refcnt = ATOMIC_INIT(1),
330 .obsolete = DST_OBSOLETE_FORCE_CHK,
332 .input = dst_discard,
333 .output = dst_discard_out,
335 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
336 .rt6i_protocol = RTPROT_KERNEL,
337 .rt6i_metric = ~(u32) 0,
338 .rt6i_ref = ATOMIC_INIT(1),
/* Zero the rt6_info-specific tail of a freshly allocated dst (dst_alloc
 * only initialises the embedded struct dst_entry) and set up the list
 * heads so list_empty() checks work before the route is linked anywhere.
 */
343 static void rt6_info_init(struct rt6_info *rt)
345 struct dst_entry *dst = &rt->dst;
/* dst + 1 == first byte after the embedded dst_entry. */
347 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
348 INIT_LIST_HEAD(&rt->rt6i_siblings);
349 INIT_LIST_HEAD(&rt->rt6i_uncached);
352 /* allocate dst with ip6_dst_ops */
353 static struct rt6_info *__ip6_dst_alloc(struct net *net,
354 struct net_device *dev,
357 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
358 1, DST_OBSOLETE_FORCE_CHK, flags);
/* Public allocator: __ip6_dst_alloc() plus a per-CPU clone-pointer
 * array used by rt6_get_pcpu_route()/rt6_make_pcpu_route().  On percpu
 * allocation failure the dst is released (error path partly elided).
 */
366 struct rt6_info *ip6_dst_alloc(struct net *net,
367 struct net_device *dev,
370 struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
/* GFP_ATOMIC: may be called from softirq/route-resolution context. */
373 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
377 for_each_possible_cpu(cpu) {
380 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
381 /* no one shares rt */
385 dst_release_immediate(&rt->dst);
392 EXPORT_SYMBOL(ip6_dst_alloc);
/* dst_ops->destroy: final teardown of a rt6_info.  Frees the metrics and
 * per-CPU clone array, unlinks from the uncached list, and drops the
 * inet6_dev / dst.from references (tail of function elided here).
 */
394 static void ip6_dst_destroy(struct dst_entry *dst)
396 struct rt6_info *rt = (struct rt6_info *)dst;
397 struct dst_entry *from = dst->from;
398 struct inet6_dev *idev;
400 dst_destroy_metrics_generic(dst);
401 free_percpu(rt->rt6i_pcpu);
402 rt6_uncached_list_del(rt);
404 idev = rt->rt6i_idev;
406 rt->rt6i_idev = NULL;
/* dst_ops->ifdown: the route's device is going away; retarget the
 * inet6_dev reference at the netns loopback device so the original
 * device can be released.
 */
414 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
417 struct rt6_info *rt = (struct rt6_info *)dst;
418 struct inet6_dev *idev = rt->rt6i_idev;
419 struct net_device *loopback_dev =
420 dev_net(dev)->loopback_dev;
422 if (idev && idev->dev != loopback_dev) {
423 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
425 rt->rt6i_idev = loopback_idev;
/* True when this route itself carries an expiry and it has passed;
 * does NOT follow dst.from (compare rt6_check_expired() below).
 */
431 static bool __rt6_check_expired(const struct rt6_info *rt)
433 if (rt->rt6i_flags & RTF_EXPIRES)
434 return time_after(jiffies, rt->dst.expires)
/* Full expiry check: own RTF_EXPIRES timer first, otherwise a clone is
 * considered expired when its parent (dst.from) is expired or has been
 * invalidated (obsolete no longer FORCE_CHK).
 */
439 static bool rt6_check_expired(const struct rt6_info *rt)
441 if (rt->rt6i_flags & RTF_EXPIRES) {
442 if (time_after(jiffies, rt->dst.expires))
444 } else if (rt->dst.from) {
445 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
446 rt6_check_expired((struct rt6_info *)rt->dst.from);
/* ECMP sibling selection: hash the flow and use the hash to pick one of
 * match's rt6i_siblings.  Siblings failing rt6_score_route() are skipped
 * (loop tail elided in this excerpt).
 */
451 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
452 struct flowi6 *fl6, int oif,
455 struct rt6_info *sibling, *next_sibling;
458 /* We might have already computed the hash for ICMPv6 errors. In such
459 * case it will always be non-zero. Otherwise now is the time to do it.
462 fl6->mp_hash = rt6_multipath_hash(fl6, NULL);
/* +1: the siblings list does not include @match itself. */
464 route_choosen = fl6->mp_hash % (match->rt6i_nsiblings + 1);
465 /* Don't change the route, if route_choosen == 0
466 * (siblings does not include ourself)
469 list_for_each_entry_safe(sibling, next_sibling,
470 &match->rt6i_siblings, rt6i_siblings) {
472 if (route_choosen == 0) {
473 if (rt6_score_route(sibling, oif, strict) < 0)
483 * Route lookup. Any table->tb6_lock is implied.
/* Walk the fn->leaf route chain and pick the entry matching the
 * requested outgoing interface (@oif) and/or source address.  Returns
 * the null entry when RT6_LOOKUP_F_IFACE is strict and nothing matched.
 */
486 static inline struct rt6_info *rt6_device_match(struct net *net,
488 const struct in6_addr *saddr,
492 struct rt6_info *local = NULL;
493 struct rt6_info *sprt;
/* No constraints at all: first route wins (return elided). */
495 if (!oif && ipv6_addr_any(saddr))
498 for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
499 struct net_device *dev = sprt->dst.dev;
502 if (dev->ifindex == oif)
/* Loopback routes may still satisfy @oif via their idev. */
504 if (dev->flags & IFF_LOOPBACK) {
505 if (!sprt->rt6i_idev ||
506 sprt->rt6i_idev->dev->ifindex != oif) {
507 if (flags & RT6_LOOKUP_F_IFACE)
510 local->rt6i_idev->dev->ifindex == oif)
/* Source-address match when no oif constraint applied. */
516 if (ipv6_chk_addr(net, saddr, dev,
517 flags & RT6_LOOKUP_F_IFACE))
526 if (flags & RT6_LOOKUP_F_IFACE)
527 return net->ipv6.ip6_null_entry;
533 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Deferred-work payload for a router reachability probe: the gateway
 * address to probe and the device to send on (held by the submitter).
 */
534 struct __rt6_probe_work {
535 struct work_struct work;
536 struct in6_addr target;
537 struct net_device *dev;
/* Workqueue handler: send a Neighbour Solicitation to the target's
 * solicited-node multicast address, then drop the device reference
 * taken when the work was queued (put/free elided in this excerpt).
 */
540 static void rt6_probe_deferred(struct work_struct *w)
542 struct in6_addr mcaddr;
543 struct __rt6_probe_work *work =
544 container_of(w, struct __rt6_probe_work, work);
546 addrconf_addr_solict_mult(&work->target, &mcaddr);
547 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
/* Router Reachability Probing (RFC 4191 §3.5): if the gateway's
 * neighbour entry is not VALID and the per-idev probe interval has
 * elapsed, schedule an NS probe from process context.
 */
552 static void rt6_probe(struct rt6_info *rt)
554 struct __rt6_probe_work *work;
555 struct neighbour *neigh;
557 * Okay, this does not seem to be appropriate
558 * for now, however, we need to check if it
559 * is really so; aka Router Reachability Probing.
561 * Router Reachability Probe MUST be rate-limited
562 * to no more than one per minute.
/* Only gateway routes have a router to probe. */
564 if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
567 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
569 if (neigh->nud_state & NUD_VALID)
573 write_lock(&neigh->lock);
574 if (!(neigh->nud_state & NUD_VALID) &&
577 rt->rt6i_idev->cnf.rtr_probe_interval)) {
578 work = kmalloc(sizeof(*work), GFP_ATOMIC);
/* Rate-limit: marks the neigh so only one probe is pending. */
580 __neigh_set_probe_once(neigh);
582 write_unlock(&neigh->lock);
/* No neighbour entry yet: probe unconditionally. */
584 work = kmalloc(sizeof(*work), GFP_ATOMIC);
588 INIT_WORK(&work->work, rt6_probe_deferred);
589 work->target = rt->rt6i_gateway;
/* Hold the device until rt6_probe_deferred() runs. */
590 dev_hold(rt->dst.dev);
591 work->dev = rt->dst.dev;
592 schedule_work(&work->work);
596 rcu_read_unlock_bh();
/* !CONFIG_IPV6_ROUTER_PREF: probing compiled out. */
599 static inline void rt6_probe(struct rt6_info *rt)
605 * Default Router Selection (RFC 2461 6.3.6)
/* Device component of the route score: non-zero when the route's device
 * (or, for loopback routes, its idev) satisfies the requested @oif.
 * Exact return values elided in this excerpt.
 */
607 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
609 struct net_device *dev = rt->dst.dev;
610 if (!oif || dev->ifindex == oif)
612 if ((dev->flags & IFF_LOOPBACK) &&
613 rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
/* Neighbour component of the route score: classify the gateway's NUD
 * state into the RT6_NUD_* values.  Non-gateway routes always succeed.
 * With ROUTER_PREF, a not-yet-FAILED neighbour still counts as success
 * (it can be probed); without it, a missing neighbour triggers
 * round-robin (RT6_NUD_FAIL_DO_RR).
 */
618 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
620 struct neighbour *neigh;
621 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
623 if (rt->rt6i_flags & RTF_NONEXTHOP ||
624 !(rt->rt6i_flags & RTF_GATEWAY))
625 return RT6_NUD_SUCCEED;
628 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
630 read_lock(&neigh->lock);
631 if (neigh->nud_state & NUD_VALID)
632 ret = RT6_NUD_SUCCEED;
633 #ifdef CONFIG_IPV6_ROUTER_PREF
634 else if (!(neigh->nud_state & NUD_FAILED))
635 ret = RT6_NUD_SUCCEED;
637 ret = RT6_NUD_FAIL_PROBE;
639 read_unlock(&neigh->lock);
641 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
642 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
644 rcu_read_unlock_bh();
/* Combine the device match, the RFC 4191 router preference bits and
 * (under RT6_LOOKUP_F_REACHABLE) the neighbour state into a single
 * score; negative RT6_NUD_* values signal hard/soft failure.
 */
649 static int rt6_score_route(struct rt6_info *rt, int oif,
654 m = rt6_check_dev(rt, oif);
655 if (!m && (strict & RT6_LOOKUP_F_IFACE))
656 return RT6_NUD_FAIL_HARD;
657 #ifdef CONFIG_IPV6_ROUTER_PREF
/* Preference occupies bits above the device-match bits. */
658 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
660 if (strict & RT6_LOOKUP_F_REACHABLE) {
661 int n = rt6_check_neigh(rt);
/* Compare @rt against the best candidate so far (@match/@mpri) and
 * return the better of the two.  Sets *do_rr when a candidate scored
 * RT6_NUD_FAIL_DO_RR, asking the caller to rotate the rr pointer.
 */
668 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
669 int *mpri, struct rt6_info *match,
673 bool match_do_rr = false;
674 struct inet6_dev *idev = rt->rt6i_idev;
675 struct net_device *dev = rt->dst.dev;
/* Optionally skip routes whose device has no carrier. */
677 if (dev && !netif_carrier_ok(dev) &&
678 idev->cnf.ignore_routes_with_linkdown &&
679 !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
682 if (rt6_check_expired(rt))
685 m = rt6_score_route(rt, oif, strict);
686 if (m == RT6_NUD_FAIL_DO_RR) {
688 m = 0; /* lowest valid score */
689 } else if (m == RT6_NUD_FAIL_HARD) {
/* Probe the gateway of otherwise-eligible routes. */
693 if (strict & RT6_LOOKUP_F_REACHABLE)
696 /* note that m can be RT6_NUD_FAIL_PROBE at this point */
698 *do_rr = match_do_rr;
/* Scan all routes of the given @metric, starting at the round-robin
 * head @rr_head and wrapping from fn->leaf; routes of a different
 * metric are collected for a final pass (via @cont).
 */
706 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
707 struct rt6_info *rr_head,
708 u32 metric, int oif, int strict,
711 struct rt6_info *rt, *match, *cont;
716 for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
717 if (rt->rt6i_metric != metric) {
722 match = find_match(rt, oif, strict, &mpri, match, do_rr);
/* Second half of the wrap-around scan: leaf up to rr_head. */
725 for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
726 if (rt->rt6i_metric != metric) {
731 match = find_match(rt, oif, strict, &mpri, match, do_rr);
737 for (rt = cont; rt; rt = rt->dst.rt6_next)
738 match = find_match(rt, oif, strict, &mpri, match, do_rr);
/* Select the best route in fib6 node @fn, honouring round-robin state
 * (fn->rr_ptr) among equal-metric default routers.  Falls back to the
 * netns null entry when nothing matches.
 */
743 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
745 struct rt6_info *match, *rt0;
751 fn->rr_ptr = rt0 = fn->leaf;
753 match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
757 struct rt6_info *next = rt0->dst.rt6_next;
759 /* no entries matched; do round-robin */
760 if (!next || next->rt6i_metric != rt0->rt6i_metric)
767 net = dev_net(rt0->dst.dev);
768 return match ? match : net->ipv6.ip6_null_entry;
/* True when the route has an explicit next hop or none at all. */
771 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
773 return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
776 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Process an RFC 4191 Route Information option received in a Router
 * Advertisement from @gwaddr on @dev: validate the option, then add,
 * refresh, or (lifetime == 0) remove the corresponding RTF_ROUTEINFO
 * route.
 */
777 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
778 const struct in6_addr *gwaddr)
780 struct net *net = dev_net(dev);
781 struct route_info *rinfo = (struct route_info *) opt;
782 struct in6_addr prefix_buf, *prefix;
784 unsigned long lifetime;
787 if (len < sizeof(struct route_info)) {
791 /* Sanity check for prefix_len and length */
/* length is in units of 8 octets; 3 is the maximum defined. */
792 if (rinfo->length > 3) {
794 } else if (rinfo->prefix_len > 128) {
796 } else if (rinfo->prefix_len > 64) {
797 if (rinfo->length < 2) {
800 } else if (rinfo->prefix_len > 0) {
801 if (rinfo->length < 1) {
806 pref = rinfo->route_pref;
/* RFC 4191: reserved preference value must be ignored. */
807 if (pref == ICMPV6_ROUTER_PREF_INVALID)
810 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
/* length == 3 carries the full 128-bit prefix in-line; shorter
 * options carry a truncated prefix that must be zero-extended.
 */
812 if (rinfo->length == 3)
813 prefix = (struct in6_addr *)rinfo->prefix;
815 /* this function is safe */
816 ipv6_addr_prefix(&prefix_buf,
817 (struct in6_addr *)rinfo->prefix,
819 prefix = &prefix_buf;
/* ::/0 means this router advertises itself as a default router. */
822 if (rinfo->prefix_len == 0)
823 rt = rt6_get_dflt_router(gwaddr, dev);
825 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
/* Zero lifetime withdraws an existing route. */
828 if (rt && !lifetime) {
834 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
837 rt->rt6i_flags = RTF_ROUTEINFO |
838 (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
841 if (!addrconf_finite_timeout(lifetime))
842 rt6_clean_expires(rt);
844 rt6_set_expires(rt, jiffies + HZ * lifetime);
/* Walk back up the fib6 tree from @fn, descending into source-routed
 * subtrees (CONFIG_IPV6_SUBTREES) along the way, until a node carrying
 * route info (RTN_RTINFO) or the tree root is reached.
 */
852 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
853 struct in6_addr *saddr)
855 struct fib6_node *pn;
857 if (fn->fn_flags & RTN_TL_ROOT)
860 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
861 fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
864 if (fn->fn_flags & RTN_RTINFO)
/* Policy-rule callback: simple (no clone/pcpu) route lookup in @table.
 * Walks the tree under tb6_lock, applies device/ECMP selection and
 * backtracks on a null-entry result.
 */
869 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
870 struct fib6_table *table,
871 struct flowi6 *fl6, int flags)
873 struct fib6_node *fn;
/* Caller asked to ignore the nexthop device constraint. */
876 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
877 flags &= ~RT6_LOOKUP_F_IFACE;
879 read_lock_bh(&table->tb6_lock);
880 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
883 rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
/* ECMP only when no specific oif was requested. */
884 if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
885 rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
886 if (rt == net->ipv6.ip6_null_entry) {
887 fn = fib6_backtrack(fn, &fl6->saddr);
/* dst_use() takes the reference returned to the caller. */
891 dst_use(&rt->dst, jiffies);
892 read_unlock_bh(&table->tb6_lock);
894 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
/* Public wrapper: run ip6_pol_route_lookup() through the policy-routing
 * rule engine.
 */
900 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
903 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
905 EXPORT_SYMBOL_GPL(ip6_route_lookup);
/* Convenience lookup by bare (daddr, saddr, oif): builds the flowi6 and
 * sets RT6_LOOKUP_F_HAS_SADDR when a source address is supplied.
 */
907 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
908 const struct in6_addr *saddr, int oif, int strict)
910 struct flowi6 fl6 = {
914 struct dst_entry *dst;
915 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
918 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
919 flags |= RT6_LOOKUP_F_HAS_SADDR;
922 dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
924 return (struct rt6_info *) dst;
930 EXPORT_SYMBOL(rt6_lookup);
932 /* ip6_ins_rt is called with FREE table->tb6_lock.
933 * It takes new route entry, the addition fails by any reason the
935 * Caller must hold dst before calling it.
/* Insert @rt into its fib6 table under the table write lock. */
938 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
939 struct mx6_config *mxc,
940 struct netlink_ext_ack *extack)
943 struct fib6_table *table;
945 table = rt->rt6i_table;
946 write_lock_bh(&table->tb6_lock);
947 err = fib6_add(&table->tb6_root, rt, info, mxc, extack);
948 write_unlock_bh(&table->tb6_lock);
/* Kernel-internal insertion: no netlink attributes, no extack; the dst
 * hold accounting for the tree reference is taken here (hold call
 * elided in this excerpt).
 */
953 int ip6_ins_rt(struct rt6_info *rt)
955 struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
956 struct mx6_config mxc = { .mx = NULL, };
958 /* Hold dst to account for the reference from the fib6 tree */
960 return __ip6_ins_rt(rt, &info, &mxc, NULL);
963 /* called with rcu_lock held */
/* Resolve the device a clone of a local/anycast route should use:
 * the L3 master for enslaved devices, loopback otherwise.
 */
964 static struct net_device *ip6_rt_get_dev_rcu(struct rt6_info *rt)
966 struct net_device *dev = rt->dst.dev;
968 if (rt->rt6i_flags & (RTF_LOCAL | RTF_ANYCAST)) {
969 /* for copies of local routes, dst->dev needs to be the
970 * device if it is a master device, the master device if
971 * device is enslaved, and the loopback as the default
973 if (netif_is_l3_slave(dev) &&
974 !rt6_need_strict(&rt->rt6i_dst.addr))
975 dev = l3mdev_master_dev_rcu(dev);
976 else if (!netif_is_l3_master(dev))
977 dev = dev_net(dev)->loopback_dev;
978 /* last case is netif_is_l3_master(dev) is true in which
979 * case we want dev returned to be dev
/* Create an RTF_CACHE clone of @ort for a specific (daddr[, saddr]):
 * a /128 host route carrying per-destination state (e.g. PMTU) that is
 * not linked into the fib6 tree.
 */
986 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
987 const struct in6_addr *daddr,
988 const struct in6_addr *saddr)
990 struct net_device *dev;
/* Never clone a clone: chase back to the tree-resident parent. */
997 if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
998 ort = (struct rt6_info *)ort->dst.from;
1001 dev = ip6_rt_get_dev_rcu(ort);
1002 rt = __ip6_dst_alloc(dev_net(dev), dev, 0);
1007 ip6_rt_copy_init(rt, ort);
1008 rt->rt6i_flags |= RTF_CACHE;
1009 rt->rt6i_metric = 0;
1010 rt->dst.flags |= DST_HOST;
1011 rt->rt6i_dst.addr = *daddr;
1012 rt->rt6i_dst.plen = 128;
1014 if (!rt6_is_gw_or_nonexthop(ort)) {
/* daddr equal to an on-link subnet-router style address is
 * treated as anycast per RFC 4291.
 */
1015 if (ort->rt6i_dst.plen != 128 &&
1016 ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
1017 rt->rt6i_flags |= RTF_ANYCAST;
1018 #ifdef CONFIG_IPV6_SUBTREES
1019 if (rt->rt6i_src.plen && saddr) {
1020 rt->rt6i_src.addr = *saddr;
1021 rt->rt6i_src.plen = 128;
/* Allocate a per-CPU RTF_PCPU clone of @rt (used by ip6_pol_route()'s
 * fast path).
 */
1029 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
1031 struct net_device *dev;
1032 struct rt6_info *pcpu_rt;
1035 dev = ip6_rt_get_dev_rcu(rt);
1036 pcpu_rt = __ip6_dst_alloc(dev_net(dev), dev, rt->dst.flags);
1040 ip6_rt_copy_init(pcpu_rt, rt);
1041 pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
1042 pcpu_rt->rt6i_flags |= RTF_PCPU;
1046 /* It should be called with read_lock_bh(&tb6_lock) acquired */
/* Return this CPU's existing pcpu clone of @rt with a reference taken,
 * or (presumably) NULL when none has been created yet.
 */
1047 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1049 struct rt6_info *pcpu_rt, **p;
1051 p = this_cpu_ptr(rt->rt6i_pcpu);
1055 dst_hold(&pcpu_rt->dst);
1056 rt6_dst_from_metrics_check(pcpu_rt);
/* Slow path: allocate a pcpu clone and publish it into this CPU's slot
 * with cmpxchg; a concurrent winner's clone is used instead.  Falls back
 * to the null entry on allocation failure, and discards the clone if
 * @rt was removed from the tree in the meantime.
 */
1061 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1063 struct fib6_table *table = rt->rt6i_table;
1064 struct rt6_info *pcpu_rt, *prev, **p;
1066 pcpu_rt = ip6_rt_pcpu_alloc(rt);
1068 struct net *net = dev_net(rt->dst.dev);
1070 dst_hold(&net->ipv6.ip6_null_entry->dst);
1071 return net->ipv6.ip6_null_entry;
1074 read_lock_bh(&table->tb6_lock);
1075 if (rt->rt6i_pcpu) {
1076 p = this_cpu_ptr(rt->rt6i_pcpu);
/* Atomic publish; only the first writer wins the slot. */
1077 prev = cmpxchg(p, NULL, pcpu_rt);
1079 /* If someone did it before us, return prev instead */
1080 dst_release_immediate(&pcpu_rt->dst);
1084 /* rt has been removed from the fib6 tree
1085 * before we have a chance to acquire the read_lock.
1086 * In this case, don't brother to create a pcpu rt
1087 * since rt is going away anyway. The next
1088 * dst_check() will trigger a re-lookup.
1090 dst_release_immediate(&pcpu_rt->dst);
1093 dst_hold(&pcpu_rt->dst);
1094 rt6_dst_from_metrics_check(pcpu_rt);
1095 read_unlock_bh(&table->tb6_lock);
/* Core policy route lookup used by both the input and output paths.
 * Selects a route in @table (with ECMP and reachability handling plus
 * backtracking), then returns one of three things: the route itself for
 * null/RTF_CACHE entries, an uncached RTF_CACHE clone for the
 * FLOWI_FLAG_KNOWN_NH case, or a per-CPU clone otherwise.
 */
1099 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1100 int oif, struct flowi6 *fl6, int flags)
1102 struct fib6_node *fn, *saved_fn;
1103 struct rt6_info *rt;
1106 strict |= flags & RT6_LOOKUP_F_IFACE;
1107 strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
/* Only require reachable routers when not forwarding (host role). */
1108 if (net->ipv6.devconf_all->forwarding == 0)
1109 strict |= RT6_LOOKUP_F_REACHABLE;
1111 read_lock_bh(&table->tb6_lock);
1113 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1116 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1120 rt = rt6_select(fn, oif, strict);
1121 if (rt->rt6i_nsiblings)
1122 rt = rt6_multipath_select(rt, fl6, oif, strict);
1123 if (rt == net->ipv6.ip6_null_entry) {
1124 fn = fib6_backtrack(fn, &fl6->saddr);
1126 goto redo_rt6_select;
/* Retry once without the reachability requirement. */
1127 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1128 /* also consider unreachable route */
1129 strict &= ~RT6_LOOKUP_F_REACHABLE;
1131 goto redo_rt6_select;
/* Null entry and cache clones are returned directly. */
1136 if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
1137 dst_use(&rt->dst, jiffies);
1138 read_unlock_bh(&table->tb6_lock);
1140 rt6_dst_from_metrics_check(rt);
1142 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1144 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1145 !(rt->rt6i_flags & RTF_GATEWAY))) {
1146 /* Create a RTF_CACHE clone which will not be
1147 * owned by the fib6 tree. It is for the special case where
1148 * the daddr in the skb during the neighbor look-up is different
1149 * from the fl6->daddr used to look-up route here.
1152 struct rt6_info *uncached_rt;
1154 dst_use(&rt->dst, jiffies);
1155 read_unlock_bh(&table->tb6_lock);
1157 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1158 dst_release(&rt->dst);
1161 /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1162 * No need for another dst_hold()
1164 rt6_uncached_list_add(uncached_rt);
1166 uncached_rt = net->ipv6.ip6_null_entry;
1167 dst_hold(&uncached_rt->dst);
1170 trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
1174 /* Get a percpu copy */
1176 struct rt6_info *pcpu_rt;
1178 rt->dst.lastuse = jiffies;
1180 pcpu_rt = rt6_get_pcpu_route(rt);
1183 read_unlock_bh(&table->tb6_lock);
1185 /* We have to do the read_unlock first
1186 * because rt6_make_pcpu_route() may trigger
1187 * ip6_dst_gc() which will take the write_lock.
1190 read_unlock_bh(&table->tb6_lock);
1191 pcpu_rt = rt6_make_pcpu_route(rt);
1192 dst_release(&rt->dst);
1195 trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
1200 EXPORT_SYMBOL_GPL(ip6_pol_route);
/* Policy-rule callback for the input path: ip6_pol_route() keyed on the
 * incoming interface.
 */
1202 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1203 struct flowi6 *fl6, int flags)
1205 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
/* Input-path lookup entry point: enforce strict interface matching for
 * link-local/multicast destinations (PIM register devices excepted).
 */
1208 struct dst_entry *ip6_route_input_lookup(struct net *net,
1209 struct net_device *dev,
1210 struct flowi6 *fl6, int flags)
1212 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1213 flags |= RT6_LOOKUP_F_IFACE;
1215 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1217 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
/* Extract the L3 flow key for ECMP hashing.  For ICMPv6 errors the key
 * is taken from the embedded (offending) inner header so error packets
 * hash onto the same path as the flow they describe.
 */
1219 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
1220 struct flow_keys *keys)
1222 const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
1223 const struct ipv6hdr *key_iph = outer_iph;
1224 const struct ipv6hdr *inner_iph;
1225 const struct icmp6hdr *icmph;
1226 struct ipv6hdr _inner_iph;
1227 struct icmp6hdr _icmph;
1229 if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
/* skb_header_pointer copes with non-linear skbs. */
1232 icmph = skb_header_pointer(skb, skb_transport_offset(skb),
1233 sizeof(_icmph), &_icmph);
/* Only error messages embed the original header. */
1237 if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
1238 icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
1239 icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
1240 icmph->icmp6_type != ICMPV6_PARAMPROB)
1243 inner_iph = skb_header_pointer(skb,
1244 skb_transport_offset(skb) + sizeof(*icmph),
1245 sizeof(_inner_iph), &_inner_iph);
1249 key_iph = inner_iph;
1251 memset(keys, 0, sizeof(*keys));
1252 keys->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
1253 keys->addrs.v6addrs.src = key_iph->saddr;
1254 keys->addrs.v6addrs.dst = key_iph->daddr;
1255 keys->tags.flow_label = ip6_flowlabel(key_iph);
1256 keys->basic.ip_proto = key_iph->nexthdr;
1259 /* if skb is set it will be used and fl6 can be NULL */
/* ECMP hash: skb-based (inner-header aware) when an skb is available,
 * otherwise hashed from the flowi6 directly.
 */
1260 u32 rt6_multipath_hash(const struct flowi6 *fl6, const struct sk_buff *skb)
1262 struct flow_keys hash_keys;
1265 ip6_multipath_l3_keys(skb, &hash_keys);
1266 return flow_hash_from_keys(&hash_keys);
1269 return get_hash_from_flowi6(fl6);
/* Receive-path routing: build a flowi6 from the packet headers (plus
 * any collected tunnel metadata), pre-compute the ECMP hash for ICMPv6
 * so errors follow their flow, and attach the resulting dst to the skb.
 */
1272 void ip6_route_input(struct sk_buff *skb)
1274 const struct ipv6hdr *iph = ipv6_hdr(skb);
1275 struct net *net = dev_net(skb->dev);
1276 int flags = RT6_LOOKUP_F_HAS_SADDR;
1277 struct ip_tunnel_info *tun_info;
1278 struct flowi6 fl6 = {
1279 .flowi6_iif = skb->dev->ifindex,
1280 .daddr = iph->daddr,
1281 .saddr = iph->saddr,
1282 .flowlabel = ip6_flowinfo(iph),
1283 .flowi6_mark = skb->mark,
1284 .flowi6_proto = iph->nexthdr,
/* Collected (RX) tunnel metadata participates in the lookup key. */
1287 tun_info = skb_tunnel_info(skb);
1288 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1289 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1290 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
1291 fl6.mp_hash = rt6_multipath_hash(&fl6, skb);
1293 skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
/* Policy-rule callback for the output path: keyed on the outgoing
 * interface.
 */
1296 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1297 struct flowi6 *fl6, int flags)
1299 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
/* Output-path lookup entry point.  Link-local/multicast destinations go
 * through the l3mdev link-scope lookup; strict interface matching is
 * forced for bound sockets and oif-with-unspecified-source lookups.
 */
1302 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1303 struct flowi6 *fl6, int flags)
1307 if (rt6_need_strict(&fl6->daddr)) {
1308 struct dst_entry *dst;
1310 dst = l3mdev_link_scope_lookup(net, fl6);
/* Locally generated traffic looks like it arrived on loopback. */
1315 fl6->flowi6_iif = LOOPBACK_IFINDEX;
1317 any_src = ipv6_addr_any(&fl6->saddr);
1318 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1319 (fl6->flowi6_oif && any_src))
1320 flags |= RT6_LOOKUP_F_IFACE;
1323 flags |= RT6_LOOKUP_F_HAS_SADDR;
1325 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1327 return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1329 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
/* Replace @dst_orig with a blackhole copy (used e.g. by IPsec when a
 * dst must stop transmitting): same addressing/metrics, but input and
 * output discard.  Consumes the reference on @dst_orig.
 */
1331 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1333 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1334 struct net_device *loopback_dev = net->loopback_dev;
1335 struct dst_entry *new = NULL;
/* DST_OBSOLETE_DEAD: the blackhole dst is never revalidated. */
1337 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
1338 DST_OBSOLETE_DEAD, 0);
1344 new->input = dst_discard;
1345 new->output = dst_discard_out;
1347 dst_copy_metrics(new, &ort->dst);
1349 rt->rt6i_idev = in6_dev_get(loopback_dev);
1350 rt->rt6i_gateway = ort->rt6i_gateway;
/* Clear RTF_PCPU: this copy owns no per-CPU clone array. */
1351 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1352 rt->rt6i_metric = 0;
1354 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1355 #ifdef CONFIG_IPV6_SUBTREES
1356 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1360 dst_release(dst_orig);
1361 return new ? new : ERR_PTR(-ENOMEM);
1365 * Destination cache support functions
/* Re-sync a clone's metrics pointer with its parent's (dst.from) when
 * the parent's metrics have been replaced.
 */
1368 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1371 dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1372 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
/* Validate a tree-resident route against the caller's cookie and its
 * expiry (returns NULL when stale — return lines elided here).
 */
1375 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1379 if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
1382 if (rt6_check_expired(rt))
/* Validate a clone by checking both itself and its parent (dst.from). */
1388 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1390 if (!__rt6_check_expired(rt) &&
1391 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1392 rt6_check((struct rt6_info *)(rt->dst.from), cookie))
/* dst_ops->check: entry point for dst cache validation. */
1398 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1400 struct rt6_info *rt;
1402 rt = (struct rt6_info *) dst;
1404 /* All IPV6 dsts are created with ->obsolete set to the value
1405 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
1406 * into this function always.
1409 rt6_dst_from_metrics_check(rt);
/* pcpu clones and from-linked uncached clones validate via their
 * parent; everything else validates directly.
 */
1411 if (rt->rt6i_flags & RTF_PCPU ||
1412 (unlikely(!list_empty(&rt->rt6i_uncached)) && rt->dst.from))
1413 return rt6_dst_from_check(rt, cookie);
1415 return rt6_check(rt, cookie);
/* dst_ops->negative_advice: the visible part drops an expired RTF_CACHE
 * entry (the action taken on expiry is elided in this extract).
 */
1418 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1420 struct rt6_info *rt = (struct rt6_info *) dst;
1423 if (rt->rt6i_flags & RTF_CACHE) {
1424 if (rt6_check_expired(rt)) {
/* dst_ops->link_failure: report address unreachability to the sender,
 * then expire the cached route; for an RTF_DEFAULT route the fib node is
 * inspected under RCU (the action on fn is elided here).
 */
1436 static void ip6_link_failure(struct sk_buff *skb)
1438 struct rt6_info *rt;
1440 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1442 rt = (struct rt6_info *) skb_dst(skb);
1444 if (rt->rt6i_flags & RTF_CACHE) {
1445 if (dst_hold_safe(&rt->dst))
1448 struct fib6_node *fn;
1451 fn = rcu_dereference(rt->rt6i_node);
1452 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
/* Record a learned PMTU on rt and arm its expiry timer
 * (net.ipv6.route.mtu_expires).
 */
1459 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1461 struct net *net = dev_net(rt->dst.dev);
1463 rt->rt6i_flags |= RTF_MODIFIED;
1464 rt->rt6i_pmtu = mtu;
1465 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
/* true when rt is a fib route (not already an RTF_CACHE clone) that may
 * spawn a cached exception entry to hold the new PMTU.
 */
1468 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
1470 return !(rt->rt6i_flags & RTF_CACHE) &&
1471 (rt->rt6i_flags & RTF_PCPU ||
1472 rcu_access_pointer(rt->rt6i_node));
/* Core PMTU update: clamp to IPV6_MIN_MTU, ignore non-shrinking updates,
 * then either store the PMTU on the route directly or allocate a cached
 * clone carrying it.
 */
1475 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1476 const struct ipv6hdr *iph, u32 mtu,
1479 const struct in6_addr *daddr, *saddr;
1480 struct rt6_info *rt6 = (struct rt6_info *)dst;
1482 /* Note: do *NOT* check dst_metric_locked(dst, RTAX_MTU)
1483 * IPv6 pmtu discovery isn't optional, so 'mtu lock' cannot disable it.
1484 * [see also comment in rt6_mtu_change_route()]
/* addresses come from the packet header when available, else the socket */
1488 daddr = &iph->daddr;
1489 saddr = &iph->saddr;
1491 daddr = &sk->sk_v6_daddr;
1492 saddr = &inet6_sk(sk)->saddr;
1499 dst_confirm_neigh(dst, daddr);
1501 mtu = max_t(u32, mtu, IPV6_MIN_MTU);
1502 if (mtu >= dst_mtu(dst))
1505 if (!rt6_cache_allowed_for_pmtu(rt6)) {
1506 rt6_do_update_pmtu(rt6, mtu);
1508 struct rt6_info *nrt6;
1510 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1512 rt6_do_update_pmtu(nrt6, mtu);
1514 /* ip6_ins_rt(nrt6) will bump the
1515 * rt6->rt6i_node->fn_sernum
1516 * which will fail the next rt6_check() and
1517 * invalidate the sk->sk_dst_cache.
1520 /* Release the reference taken in
1521 * ip6_rt_cache_alloc()
1523 dst_release(&nrt6->dst);
/* dst_ops->update_pmtu hook: thin wrapper passing the skb's IPv6 header. */
1528 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1529 struct sk_buff *skb, u32 mtu,
1532 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu,
/* Apply a PMTU update for the flow described by skb's embedded IPv6
 * header (e.g. from an ICMPv6 Packet Too Big): route the flow, then run
 * __ip6_rt_update_pmtu() on the result.
 */
1536 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1537 int oif, u32 mark, kuid_t uid)
1539 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1540 struct dst_entry *dst;
1543 memset(&fl6, 0, sizeof(fl6));
1544 fl6.flowi6_oif = oif;
/* fall back to the netns reply-mark policy when no mark was given */
1545 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1546 fl6.daddr = iph->daddr;
1547 fl6.saddr = iph->saddr;
1548 fl6.flowlabel = ip6_flowinfo(iph);
1549 fl6.flowi6_uid = uid;
1551 dst = ip6_route_output(net, NULL, &fl6);
1553 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu), true);
1556 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
/* Socket-scoped PMTU update; afterwards, if the socket's cached dst is
 * now stale, refresh it (skipped while the socket is user-locked or for
 * v4-mapped peers).
 */
1558 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1560 int oif = sk->sk_bound_dev_if;
1561 struct dst_entry *dst;
1563 if (!oif && skb->dev)
1564 oif = l3mdev_master_ifindex(skb->dev);
1566 ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);
1568 dst = __sk_dst_get(sk);
1569 if (!dst || !dst->obsolete ||
1570 dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
1574 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
1575 ip6_datagram_dst_update(sk, false);
1578 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1580 /* Handle redirects */
/* flowi6 extended with the redirecting router's address. */
1581 struct ip6rd_flowi {
1583 struct in6_addr gateway;
/* Table lookup used to validate a redirect: find the current route for
 * the destination and accept only if it goes via the announcing router
 * (RFC 4861: redirects must come from the current next hop).
 */
1586 static struct rt6_info *__ip6_route_redirect(struct net *net,
1587 struct fib6_table *table,
1591 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1592 struct rt6_info *rt;
1593 struct fib6_node *fn;
1595 /* Get the "current" route for this destination and
1596 * check if the redirect has come from appropriate router.
1598 * RFC 4861 specifies that redirects should only be
1599 * accepted if they come from the nexthop to the target.
1600 * Due to the way the routes are chosen, this notion
1601 * is a bit fuzzy and one might need to check all possible
1605 read_lock_bh(&table->tb6_lock);
1606 fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1608 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1609 if (rt6_check_expired(rt))
1613 if (!(rt->rt6i_flags & RTF_GATEWAY))
1615 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
/* route must point at the router that sent the redirect */
1617 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1623 rt = net->ipv6.ip6_null_entry;
1624 else if (rt->dst.error) {
1625 rt = net->ipv6.ip6_null_entry;
/* no match at this node: backtrack up the trie and retry */
1629 if (rt == net->ipv6.ip6_null_entry) {
1630 fn = fib6_backtrack(fn, &fl6->saddr);
1638 read_unlock_bh(&table->tb6_lock);
1640 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
/* Wrap the flow in an ip6rd_flowi and dispatch through fib rules. */
1644 static struct dst_entry *ip6_route_redirect(struct net *net,
1645 const struct flowi6 *fl6,
1646 const struct in6_addr *gateway)
1648 int flags = RT6_LOOKUP_F_HAS_SADDR;
1649 struct ip6rd_flowi rdfl;
1652 rdfl.gateway = *gateway;
1654 return fib6_rule_lookup(net, &rdfl.fl6,
1655 flags, __ip6_route_redirect);
/* Process an ICMPv6 redirect for the flow in skb's embedded IPv6 header. */
1658 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
1661 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1662 struct dst_entry *dst;
1665 memset(&fl6, 0, sizeof(fl6));
1666 fl6.flowi6_iif = LOOPBACK_IFINDEX;
1667 fl6.flowi6_oif = oif;
1668 fl6.flowi6_mark = mark;
1669 fl6.daddr = iph->daddr;
1670 fl6.saddr = iph->saddr;
1671 fl6.flowlabel = ip6_flowinfo(iph);
1672 fl6.flowi6_uid = uid;
/* the redirecting router is the outer header's source address */
1674 dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1675 rt6_do_redirect(dst, NULL, skb);
1678 EXPORT_SYMBOL_GPL(ip6_redirect);
/* Variant used when no embedded header is available: the destination is
 * taken from the redirect message (rd_msg) itself.
 */
1680 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1683 const struct ipv6hdr *iph = ipv6_hdr(skb);
1684 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1685 struct dst_entry *dst;
1688 memset(&fl6, 0, sizeof(fl6));
1689 fl6.flowi6_iif = LOOPBACK_IFINDEX;
1690 fl6.flowi6_oif = oif;
1691 fl6.flowi6_mark = mark;
1692 fl6.daddr = msg->dest;
1693 fl6.saddr = iph->daddr;
1694 fl6.flowi6_uid = sock_net_uid(net, NULL);
1696 dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1697 rt6_do_redirect(dst, NULL, skb);
/* Socket convenience wrapper around ip6_redirect(). */
1701 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1703 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
1706 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
/* dst_ops->default_advmss: advertised MSS derived from the path MTU,
 * clamped between ip6_rt_min_advmss and the non-jumbo maximum.
 */
1708 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1710 struct net_device *dev = dst->dev;
1711 unsigned int mtu = dst_mtu(dst);
1712 struct net *net = dev_net(dev);
1714 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1716 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1717 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1720 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1721 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1722 * IPV6_MAXPLEN is also valid and means: "any MSS,
1723 * rely only on pmtu discovery"
1725 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
/* dst_ops->mtu: prefer the learned rt6i_pmtu, then the RTAX_MTU metric,
 * then the device's IPv6 MTU; subtract any lwtunnel encap headroom.
 */
1730 static unsigned int ip6_mtu(const struct dst_entry *dst)
1732 const struct rt6_info *rt = (const struct rt6_info *)dst;
1733 unsigned int mtu = rt->rt6i_pmtu;
1734 struct inet6_dev *idev;
1739 mtu = dst_metric_raw(dst, RTAX_MTU);
1746 idev = __in6_dev_get(dst->dev);
1748 mtu = idev->cnf.mtu6;
1752 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1754 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
/* Allocate a host dst for an outgoing ICMPv6 packet. The entry is never
 * inserted in the fib; it goes on the uncached list so device teardown
 * can release it, and is finally passed through xfrm_lookup().
 */
1757 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1760 struct dst_entry *dst;
1761 struct rt6_info *rt;
1762 struct inet6_dev *idev = in6_dev_get(dev);
1763 struct net *net = dev_net(dev);
1765 if (unlikely(!idev))
1766 return ERR_PTR(-ENODEV);
1768 rt = ip6_dst_alloc(net, dev, 0);
1769 if (unlikely(!rt)) {
1771 dst = ERR_PTR(-ENOMEM);
1775 rt->dst.flags |= DST_HOST;
1776 rt->dst.input = ip6_input;
1777 rt->dst.output = ip6_output;
1778 rt->rt6i_gateway = fl6->daddr;
1779 rt->rt6i_dst.addr = fl6->daddr;
1780 rt->rt6i_dst.plen = 128;
1781 rt->rt6i_idev = idev;
/* hop limit metric 0 — presumably "use default"; TODO confirm semantics */
1782 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1784 /* Add this dst into uncached_list so that rt6_ifdown() can
1785 * do proper release of the net_device
1787 rt6_uncached_list_add(rt);
1789 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
/* dst_ops->gc: garbage-collect routes when the dst count runs high.
 * Skips work inside the min interval unless over rt_max_size; grows the
 * GC "expire" aggressiveness each pass and decays it by rt_elasticity.
 * Returns nonzero when still over rt_max_size (new allocation should
 * fail).
 */
1795 static int ip6_dst_gc(struct dst_ops *ops)
1797 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1798 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1799 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1800 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1801 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1802 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1805 entries = dst_entries_get_fast(ops);
1806 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1807 entries <= rt_max_size)
1810 net->ipv6.ip6_rt_gc_expire++;
1811 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
1812 entries = dst_entries_get_slow(ops);
/* back under the threshold: reset aggressiveness to half the timeout */
1813 if (entries < ops->gc_thresh)
1814 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1816 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1817 return entries > rt_max_size;
/* Translate RTA_METRICS netlink attributes from cfg->fc_mx into an
 * RTAX_MAX-sized u32 array in mxc, validating each entry (congestion
 * control names are resolved to keys; HOPLIMIT and FEATURES are
 * range-checked).
 */
1820 static int ip6_convert_metrics(struct mx6_config *mxc,
1821 const struct fib6_config *cfg)
1823 bool ecn_ca = false;
1831 mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1835 nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1836 int type = nla_type(nla);
1841 if (unlikely(type > RTAX_MAX))
1844 if (type == RTAX_CC_ALGO) {
1845 char tmp[TCP_CA_NAME_MAX];
1847 nla_strlcpy(tmp, nla, sizeof(tmp));
1848 val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
1849 if (val == TCP_CA_UNSPEC)
1852 val = nla_get_u32(nla);
1854 if (type == RTAX_HOPLIMIT && val > 255)
1856 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
1860 __set_bit(type - 1, mxc->mx_valid);
/* an ECN-capable CC algorithm forces the ECN_CA feature bit on */
1864 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
1865 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
/* Resolve a new route's gateway via the specific fib table named in
 * cfg->fc_table. Returning the null entry signals the caller to fall
 * back to a full lookup.
 */
1875 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
1876 struct fib6_config *cfg,
1877 const struct in6_addr *gw_addr)
1879 struct flowi6 fl6 = {
1880 .flowi6_oif = cfg->fc_ifindex,
1882 .saddr = cfg->fc_prefsrc,
1884 struct fib6_table *table;
1885 struct rt6_info *rt;
1886 int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
1888 table = fib6_get_table(net, cfg->fc_table);
1892 if (!ipv6_addr_any(&cfg->fc_prefsrc))
1893 flags |= RT6_LOOKUP_F_HAS_SADDR;
1895 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
1897 /* if table lookup failed, fall back to full lookup */
1898 if (rt == net->ipv6.ip6_null_entry) {
/* Build (but do not insert) a rt6_info from a fib6_config, validating
 * prefix lengths, table selection, egress device, gateway and preferred
 * source address. On success the caller owns the returned route; on
 * error the partially built route is released and ERR_PTR(err) returned.
 */
1906 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg,
1907 struct netlink_ext_ack *extack)
1909 struct net *net = cfg->fc_nlinfo.nl_net;
1910 struct rt6_info *rt = NULL;
1911 struct net_device *dev = NULL;
1912 struct inet6_dev *idev = NULL;
1913 struct fib6_table *table;
1917 /* RTF_PCPU is an internal flag; can not be set by userspace */
1918 if (cfg->fc_flags & RTF_PCPU) {
1919 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
1923 if (cfg->fc_dst_len > 128) {
1924 NL_SET_ERR_MSG(extack, "Invalid prefix length");
1927 if (cfg->fc_src_len > 128) {
1928 NL_SET_ERR_MSG(extack, "Invalid source address length");
1931 #ifndef CONFIG_IPV6_SUBTREES
1932 if (cfg->fc_src_len) {
1933 NL_SET_ERR_MSG(extack,
1934 "Specifying source address requires IPV6_SUBTREES to be enabled");
1938 if (cfg->fc_ifindex) {
1940 dev = dev_get_by_index(net, cfg->fc_ifindex);
1943 idev = in6_dev_get(dev);
1948 if (cfg->fc_metric == 0)
1949 cfg->fc_metric = IP6_RT_PRIO_USER;
/* resolve the fib table; NLM_F_CREATE allows creating a new table */
1952 if (cfg->fc_nlinfo.nlh &&
1953 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1954 table = fib6_get_table(net, cfg->fc_table);
1956 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1957 table = fib6_new_table(net, cfg->fc_table);
1960 table = fib6_new_table(net, cfg->fc_table);
1966 rt = ip6_dst_alloc(net, NULL,
1967 (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
1974 if (cfg->fc_flags & RTF_EXPIRES)
1975 rt6_set_expires(rt, jiffies +
1976 clock_t_to_jiffies(cfg->fc_expires));
1978 rt6_clean_expires(rt);
1980 if (cfg->fc_protocol == RTPROT_UNSPEC)
1981 cfg->fc_protocol = RTPROT_BOOT;
1982 rt->rt6i_protocol = cfg->fc_protocol;
1984 addr_type = ipv6_addr_type(&cfg->fc_dst);
/* pick the input handler from the destination's address type */
1986 if (addr_type & IPV6_ADDR_MULTICAST)
1987 rt->dst.input = ip6_mc_input;
1988 else if (cfg->fc_flags & RTF_LOCAL)
1989 rt->dst.input = ip6_input;
1991 rt->dst.input = ip6_forward;
1993 rt->dst.output = ip6_output;
/* optional lightweight-tunnel encapsulation state */
1995 if (cfg->fc_encap) {
1996 struct lwtunnel_state *lwtstate;
1998 err = lwtunnel_build_state(cfg->fc_encap_type,
1999 cfg->fc_encap, AF_INET6, cfg,
2003 rt->dst.lwtstate = lwtstate_get(lwtstate);
2004 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
2005 rt->dst.lwtstate->orig_output = rt->dst.output;
2006 rt->dst.output = lwtunnel_output;
2008 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
2009 rt->dst.lwtstate->orig_input = rt->dst.input;
2010 rt->dst.input = lwtunnel_input;
2014 ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
2015 rt->rt6i_dst.plen = cfg->fc_dst_len;
2016 if (rt->rt6i_dst.plen == 128)
2017 rt->dst.flags |= DST_HOST;
2019 #ifdef CONFIG_IPV6_SUBTREES
2020 ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
2021 rt->rt6i_src.plen = cfg->fc_src_len;
2024 rt->rt6i_metric = cfg->fc_metric;
2026 /* We cannot add true routes via loopback here,
2027 they would result in kernel looping; promote them to reject routes
2029 if ((cfg->fc_flags & RTF_REJECT) ||
2030 (dev && (dev->flags & IFF_LOOPBACK) &&
2031 !(addr_type & IPV6_ADDR_LOOPBACK) &&
2032 !(cfg->fc_flags & RTF_LOCAL))) {
2033 /* hold loopback dev/idev if we haven't done so. */
2034 if (dev != net->loopback_dev) {
2039 dev = net->loopback_dev;
2041 idev = in6_dev_get(dev);
2047 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
/* per-type error code and packet handlers for reject routes */
2048 switch (cfg->fc_type) {
2050 rt->dst.error = -EINVAL;
2051 rt->dst.output = dst_discard_out;
2052 rt->dst.input = dst_discard;
2055 rt->dst.error = -EACCES;
2056 rt->dst.output = ip6_pkt_prohibit_out;
2057 rt->dst.input = ip6_pkt_prohibit;
2060 case RTN_UNREACHABLE:
2062 rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
2063 : (cfg->fc_type == RTN_UNREACHABLE)
2064 ? -EHOSTUNREACH : -ENETUNREACH;
2065 rt->dst.output = ip6_pkt_discard_out;
2066 rt->dst.input = ip6_pkt_discard;
/* gateway validation: must be a usable next hop, not a local address */
2072 if (cfg->fc_flags & RTF_GATEWAY) {
2073 const struct in6_addr *gw_addr;
2076 gw_addr = &cfg->fc_gateway;
2077 gwa_type = ipv6_addr_type(gw_addr);
2079 /* if gw_addr is local we will fail to detect this in case
2080 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2081 * will return already-added prefix route via interface that
2082 * prefix route was assigned to, which might be non-loopback.
2085 if (ipv6_chk_addr_and_flags(net, gw_addr,
2086 gwa_type & IPV6_ADDR_LINKLOCAL ?
2087 dev : NULL, 0, 0)) {
2088 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2091 rt->rt6i_gateway = *gw_addr;
2093 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2094 struct rt6_info *grt = NULL;
2096 /* IPv6 strictly inhibits using not link-local
2097 addresses as nexthop address.
2098 Otherwise, router will not able to send redirects.
2099 It is very good, but in some (rare!) circumstances
2100 (SIT, PtP, NBMA NOARP links) it is handy to allow
2101 some exceptions. --ANK
2102 We allow IPv4-mapped nexthops to support RFC4798-type
2105 if (!(gwa_type & (IPV6_ADDR_UNICAST |
2106 IPV6_ADDR_MAPPED))) {
2107 NL_SET_ERR_MSG(extack,
2108 "Invalid gateway address");
/* resolve the egress device by routing to the gateway itself */
2112 if (cfg->fc_table) {
2113 grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2116 if (grt->rt6i_flags & RTF_GATEWAY ||
2117 (dev && dev != grt->dst.dev)) {
2125 grt = rt6_lookup(net, gw_addr, NULL,
2126 cfg->fc_ifindex, 1);
2128 err = -EHOSTUNREACH;
2132 if (dev != grt->dst.dev) {
2138 idev = grt->rt6i_idev;
2140 in6_dev_hold(grt->rt6i_idev);
/* a recursive (gateway-behind-gateway) next hop is rejected */
2142 if (!(grt->rt6i_flags & RTF_GATEWAY))
2151 NL_SET_ERR_MSG(extack, "Egress device not specified");
2153 } else if (dev->flags & IFF_LOOPBACK) {
2154 NL_SET_ERR_MSG(extack,
2155 "Egress device can not be loopback device for this route");
2164 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2165 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2166 NL_SET_ERR_MSG(extack, "Invalid source address");
2170 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2171 rt->rt6i_prefsrc.plen = 128;
2173 rt->rt6i_prefsrc.plen = 0;
2175 rt->rt6i_flags = cfg->fc_flags;
2179 rt->rt6i_idev = idev;
2180 rt->rt6i_table = table;
2182 cfg->fc_nlinfo.nl_net = dev_net(dev);
2191 dst_release_immediate(&rt->dst);
2193 return ERR_PTR(err);
/* Create a route from cfg and insert it into the fib, converting any
 * netlink metrics first. On failure the half-built route is released.
 */
2196 int ip6_route_add(struct fib6_config *cfg,
2197 struct netlink_ext_ack *extack)
2199 struct mx6_config mxc = { .mx = NULL, };
2200 struct rt6_info *rt;
2203 rt = ip6_route_info_create(cfg, extack);
2210 err = ip6_convert_metrics(&mxc, cfg);
2214 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc, extack);
2221 dst_release_immediate(&rt->dst);
/* Delete one route from its table under the table write lock; refuses to
 * delete the netns null entry.
 */
2226 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2229 struct fib6_table *table;
2230 struct net *net = dev_net(rt->dst.dev);
2232 if (rt == net->ipv6.ip6_null_entry) {
2237 table = rt->rt6i_table;
2238 write_lock_bh(&table->tb6_lock);
2239 err = fib6_del(rt, info);
2240 write_unlock_bh(&table->tb6_lock);
/* Public single-route delete using default netlink info. */
2247 int ip6_del_rt(struct rt6_info *rt)
2249 struct nl_info info = {
2250 .nl_net = dev_net(rt->dst.dev),
2252 return __ip6_del_rt(rt, &info);
/* Delete a multipath route and all its siblings, preferring one combined
 * RTM_DELROUTE notification for the whole set (skip_notify suppresses
 * the per-leg notifications when the combined skb was built).
 */
2255 static int __ip6_del_rt_siblings(struct rt6_info *rt, struct fib6_config *cfg)
2257 struct nl_info *info = &cfg->fc_nlinfo;
2258 struct net *net = info->nl_net;
2259 struct sk_buff *skb = NULL;
2260 struct fib6_table *table;
2263 if (rt == net->ipv6.ip6_null_entry)
2265 table = rt->rt6i_table;
2266 write_lock_bh(&table->tb6_lock);
2268 if (rt->rt6i_nsiblings && cfg->fc_delete_all_nh) {
2269 struct rt6_info *sibling, *next_sibling;
2271 /* prefer to send a single notification with all hops */
2272 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
2274 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
2276 if (rt6_fill_node(net, skb, rt,
2277 NULL, NULL, 0, RTM_DELROUTE,
2278 info->portid, seq, 0) < 0) {
2282 info->skip_notify = 1;
2285 list_for_each_entry_safe(sibling, next_sibling,
2288 err = fib6_del(sibling, info);
2294 err = fib6_del(rt, info);
2296 write_unlock_bh(&table->tb6_lock);
2301 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
2302 info->nlh, gfp_any());
/* RTM_DELROUTE handler: locate the matching route in the table (by
 * prefix, ifindex, gateway, metric, protocol) and delete it — one hop
 * only when a gateway was specified, otherwise the whole sibling set.
 */
2307 static int ip6_route_del(struct fib6_config *cfg,
2308 struct netlink_ext_ack *extack)
2310 struct fib6_table *table;
2311 struct fib6_node *fn;
2312 struct rt6_info *rt;
2315 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2317 NL_SET_ERR_MSG(extack, "FIB table does not exist");
2321 read_lock_bh(&table->tb6_lock);
2323 fn = fib6_locate(&table->tb6_root,
2324 &cfg->fc_dst, cfg->fc_dst_len,
2325 &cfg->fc_src, cfg->fc_src_len);
2328 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
/* only delete RTF_CACHE clones when explicitly requested */
2329 if ((rt->rt6i_flags & RTF_CACHE) &&
2330 !(cfg->fc_flags & RTF_CACHE))
2332 if (cfg->fc_ifindex &&
2334 rt->dst.dev->ifindex != cfg->fc_ifindex))
2336 if (cfg->fc_flags & RTF_GATEWAY &&
2337 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2339 if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2341 if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
2344 read_unlock_bh(&table->tb6_lock);
2346 /* if gateway was specified only delete the one hop */
2347 if (cfg->fc_flags & RTF_GATEWAY)
2348 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2350 return __ip6_del_rt_siblings(rt, cfg);
2353 read_unlock_bh(&table->tb6_lock);
/* Core handler for a received ICMPv6 Redirect (RFC 4861): validate the
 * message and its ND options, update the neighbour cache for the new
 * next hop, install an RTF_CACHE route through it, and fire a
 * NETEVENT_REDIRECT notification.
 */
2358 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2360 struct netevent_redirect netevent;
2361 struct rt6_info *rt, *nrt = NULL;
2362 struct ndisc_options ndopts;
2363 struct inet6_dev *in6_dev;
2364 struct neighbour *neigh;
2366 int optlen, on_link;
2369 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2370 optlen -= sizeof(*msg);
2373 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2377 msg = (struct rd_msg *)icmp6_hdr(skb);
2379 if (ipv6_addr_is_multicast(&msg->dest)) {
2380 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
/* dest == target means the destination is on-link */
2385 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2387 } else if (ipv6_addr_type(&msg->target) !=
2388 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2389 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2393 in6_dev = __in6_dev_get(skb->dev);
/* forwarding hosts and accept_redirects=0 hosts ignore redirects */
2396 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2400 * The IP source address of the Redirect MUST be the same as the current
2401 * first-hop router for the specified ICMP Destination Address.
2404 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
2405 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2410 if (ndopts.nd_opts_tgt_lladdr) {
2411 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2414 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2419 rt = (struct rt6_info *) dst;
2420 if (rt->rt6i_flags & RTF_REJECT) {
2421 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2425 /* Redirect received -> path was valid.
2426 * Look, redirects are sent only in response to data packets,
2427 * so that this nexthop apparently is reachable. --ANK
2429 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
2431 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2436 * We have finally decided to accept it.
2439 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
2440 NEIGH_UPDATE_F_WEAK_OVERRIDE|
2441 NEIGH_UPDATE_F_OVERRIDE|
2442 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2443 NEIGH_UPDATE_F_ISROUTER)),
2444 NDISC_REDIRECT, &ndopts);
2446 nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2450 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
/* on-link targets get no gateway bit on the new cached route */
2452 nrt->rt6i_flags &= ~RTF_GATEWAY;
2454 nrt->rt6i_protocol = RTPROT_REDIRECT;
2455 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
2457 if (ip6_ins_rt(nrt))
2460 netevent.old = &rt->dst;
2461 netevent.new = &nrt->dst;
2462 netevent.daddr = &msg->dest;
2463 netevent.neigh = neigh;
2464 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
2466 if (rt->rt6i_flags & RTF_CACHE) {
2467 rt = (struct rt6_info *) dst_clone(&rt->dst);
2472 /* Release the reference taken in
2473 * ip6_rt_cache_alloc()
2475 dst_release(&nrt->dst);
2478 neigh_release(neigh);
2482 * Misc support functions
/* Make rt a child of "from": clear its own expiry, hold a reference on
 * the parent and share the parent's metrics. BUGs if "from" is itself a
 * child (chaining is not allowed).
 */
2485 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2487 BUG_ON(from->dst.from);
2489 rt->rt6i_flags &= ~RTF_EXPIRES;
2490 dst_hold(&from->dst);
2491 rt->dst.from = &from->dst;
2492 dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
/* Initialize a clone (cache/pcpu copy) from the original route ort:
 * copy handlers, keys and flags, take idev and lwtstate references, and
 * link the clone to its parent via rt6_set_from().
 */
2495 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2497 rt->dst.input = ort->dst.input;
2498 rt->dst.output = ort->dst.output;
2499 rt->rt6i_dst = ort->rt6i_dst;
2500 rt->dst.error = ort->dst.error;
2501 rt->rt6i_idev = ort->rt6i_idev;
2503 in6_dev_hold(rt->rt6i_idev);
2504 rt->dst.lastuse = jiffies;
2505 rt->rt6i_gateway = ort->rt6i_gateway;
2506 rt->rt6i_flags = ort->rt6i_flags;
2507 rt6_set_from(rt, ort);
2508 rt->rt6i_metric = ort->rt6i_metric;
2509 #ifdef CONFIG_IPV6_SUBTREES
2510 rt->rt6i_src = ort->rt6i_src;
2512 rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2513 rt->rt6i_table = ort->rt6i_table;
2514 rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
2517 #ifdef CONFIG_IPV6_ROUTE_INFO
/* Find an RA-learned (RTF_ROUTEINFO) route for prefix via gwaddr on dev. */
2518 static struct rt6_info *rt6_get_route_info(struct net *net,
2519 const struct in6_addr *prefix, int prefixlen,
2520 const struct in6_addr *gwaddr,
2521 struct net_device *dev)
2523 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
2524 int ifindex = dev->ifindex;
2525 struct fib6_node *fn;
2526 struct rt6_info *rt = NULL;
2527 struct fib6_table *table;
2529 table = fib6_get_table(net, tb_id);
2533 read_lock_bh(&table->tb6_lock);
2534 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2538 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2539 if (rt->dst.dev->ifindex != ifindex)
2541 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2543 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2549 read_unlock_bh(&table->tb6_lock);
/* Add a route learned from an RA Route Information option, then return
 * the freshly inserted entry via rt6_get_route_info().
 */
2553 static struct rt6_info *rt6_add_route_info(struct net *net,
2554 const struct in6_addr *prefix, int prefixlen,
2555 const struct in6_addr *gwaddr,
2556 struct net_device *dev,
2559 struct fib6_config cfg = {
2560 .fc_metric = IP6_RT_PRIO_USER,
2561 .fc_ifindex = dev->ifindex,
2562 .fc_dst_len = prefixlen,
2563 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2564 RTF_UP | RTF_PREF(pref),
2565 .fc_protocol = RTPROT_RA,
2566 .fc_nlinfo.portid = 0,
2567 .fc_nlinfo.nlh = NULL,
2568 .fc_nlinfo.nl_net = net,
2571 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
2572 cfg.fc_dst = *prefix;
2573 cfg.fc_gateway = *gwaddr;
2575 /* We should treat it as a default route if prefix length is 0. */
2577 cfg.fc_flags |= RTF_DEFAULT;
2579 ip6_route_add(&cfg, NULL);
2581 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
/* Find the RA-learned default route via addr on dev, if one exists. */
2585 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2587 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
2588 struct rt6_info *rt;
2589 struct fib6_table *table;
2591 table = fib6_get_table(dev_net(dev), tb_id);
2595 read_lock_bh(&table->tb6_lock);
2596 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2597 if (dev == rt->dst.dev &&
2598 ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2599 ipv6_addr_equal(&rt->rt6i_gateway, addr))
2604 read_unlock_bh(&table->tb6_lock);
/* Install a default route learned from a Router Advertisement, mark its
 * table as holding an RA default router, and return the new entry.
 */
2608 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2609 struct net_device *dev,
2612 struct fib6_config cfg = {
2613 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
2614 .fc_metric = IP6_RT_PRIO_USER,
2615 .fc_ifindex = dev->ifindex,
2616 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2617 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2618 .fc_protocol = RTPROT_RA,
2619 .fc_nlinfo.portid = 0,
2620 .fc_nlinfo.nlh = NULL,
2621 .fc_nlinfo.nl_net = dev_net(dev),
2624 cfg.fc_gateway = *gwaddr;
2626 if (!ip6_route_add(&cfg, NULL)) {
2627 struct fib6_table *table;
2629 table = fib6_get_table(dev_net(dev), cfg.fc_table);
2631 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
2634 return rt6_get_dflt_router(gwaddr, dev);
/* Remove RA-learned default routers from one table. The read lock is
 * dropped around each deletion, so the scan restarts from the head
 * after every removed entry; accept_ra == 2 routers are kept.
 */
2637 static void __rt6_purge_dflt_routers(struct fib6_table *table)
2639 struct rt6_info *rt;
2642 read_lock_bh(&table->tb6_lock);
2643 for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2644 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2645 (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
2647 read_unlock_bh(&table->tb6_lock);
2652 read_unlock_bh(&table->tb6_lock);
2654 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
/* Walk every fib table in the netns and purge RA default routers from
 * those flagged as containing one.
 */
2657 void rt6_purge_dflt_routers(struct net *net)
2659 struct fib6_table *table;
2660 struct hlist_head *head;
2665 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
2666 head = &net->ipv6.fib_table_hash[h];
2667 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
2668 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
2669 __rt6_purge_dflt_routers(table);
/* Convert the legacy ioctl in6_rtmsg structure into a fib6_config. */
2676 static void rtmsg_to_fib6_config(struct net *net,
2677 struct in6_rtmsg *rtmsg,
2678 struct fib6_config *cfg)
2680 memset(cfg, 0, sizeof(*cfg));
/* honour an l3mdev (VRF) table bound to the interface, if any */
2682 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
2684 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2685 cfg->fc_metric = rtmsg->rtmsg_metric;
2686 cfg->fc_expires = rtmsg->rtmsg_info;
2687 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2688 cfg->fc_src_len = rtmsg->rtmsg_src_len;
2689 cfg->fc_flags = rtmsg->rtmsg_flags;
2691 cfg->fc_nlinfo.nl_net = net;
2693 cfg->fc_dst = rtmsg->rtmsg_dst;
2694 cfg->fc_src = rtmsg->rtmsg_src;
2695 cfg->fc_gateway = rtmsg->rtmsg_gateway;
/* Legacy SIOCADDRT/SIOCDELRT ioctl entry point: requires CAP_NET_ADMIN
 * in the netns, copies the in6_rtmsg from userspace, then adds or
 * deletes the route.
 */
2698 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2700 struct fib6_config cfg;
2701 struct in6_rtmsg rtmsg;
2705 case SIOCADDRT: /* Add a route */
2706 case SIOCDELRT: /* Delete a route */
2707 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2709 err = copy_from_user(&rtmsg, arg,
2710 sizeof(struct in6_rtmsg))
2714 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2719 err = ip6_route_add(&cfg, NULL);
2722 err = ip6_route_del(&cfg, NULL);
2736 * Drop the packet on the floor
/* Common drop path for reject routes: bump the appropriate MIB counter
 * (INADDRERRORS for unspecified input destinations) and send an ICMPv6
 * Destination Unreachable with the given code.
 */
2739 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2742 struct dst_entry *dst = skb_dst(skb);
2743 switch (ipstats_mib_noroutes) {
2744 case IPSTATS_MIB_INNOROUTES:
2745 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2746 if (type == IPV6_ADDR_ANY) {
2747 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2748 IPSTATS_MIB_INADDRERRORS);
/* non-ANY input destinations presumably fall through to the case below
 * (the break/fallthrough lines are elided) — TODO confirm
 */
2752 case IPSTATS_MIB_OUTNOROUTES:
2753 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2754 ipstats_mib_noroutes);
2757 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
/* dst.input for discard (no-route) reject routes */
2762 static int ip6_pkt_discard(struct sk_buff *skb)
2764 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
/* dst.output counterpart of ip6_pkt_discard */
2767 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2769 skb->dev = skb_dst(skb)->dev;
2770 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
/* dst.input for RTN_PROHIBIT routes (administratively prohibited) */
2773 static int ip6_pkt_prohibit(struct sk_buff *skb)
2775 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
/* dst.output counterpart of ip6_pkt_prohibit */
2778 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2780 skb->dev = skb_dst(skb)->dev;
2781 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2785 * Allocate a dst for local (unicast / anycast) address.
/* Build the local/anycast host route for an address owned by idev; the
 * caller inserts it into the fib. DST_NOCOUNT keeps it out of dst
 * accounting.
 */
2788 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2789 const struct in6_addr *addr,
2793 struct net *net = dev_net(idev->dev);
2794 struct net_device *dev = idev->dev;
2795 struct rt6_info *rt;
2797 rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
2799 return ERR_PTR(-ENOMEM);
2803 rt->dst.flags |= DST_HOST;
2804 rt->dst.input = ip6_input;
2805 rt->dst.output = ip6_output;
2806 rt->rt6i_idev = idev;
2808 rt->rt6i_protocol = RTPROT_KERNEL;
2809 rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
/* ANYCAST vs LOCAL chosen by the (elided) anycast parameter branch */
2811 rt->rt6i_flags |= RTF_ANYCAST;
2813 rt->rt6i_flags |= RTF_LOCAL;
2815 rt->rt6i_gateway = *addr;
2816 rt->rt6i_dst.addr = *addr;
2817 rt->rt6i_dst.plen = 128;
2818 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
2819 rt->rt6i_table = fib6_get_table(net, tb_id);
2824 /* remove deleted ip from prefsrc entries */
/* Argument bundle for the fib6_clean_all walk below. */
2825 struct arg_dev_net_ip {
2826 struct net_device *dev;
2828 struct in6_addr *addr;
/* fib6_clean_all callback: clear rt6i_prefsrc on routes whose preferred
 * source equals the address being removed (dev == NULL matches all
 * devices).
 */
2831 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2833 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2834 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2835 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2837 if (((void *)rt->dst.dev == dev || !dev) &&
2838 rt != net->ipv6.ip6_null_entry &&
2839 ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2840 /* remove prefsrc entry */
2841 rt->rt6i_prefsrc.plen = 0;
/* Called when the address ifp is deleted: scrub it from all routes'
 * preferred-source fields in the netns.
 */
2846 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2848 struct net *net = dev_net(ifp->idev->dev);
2849 struct arg_dev_net_ip adni = {
2850 .dev = ifp->idev->dev,
2854 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2857 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2858 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
2860 /* Remove routers and update dst entries when gateway turn into host. */
/* fib6_clean_all callback: match RA router routes or cached gateway
 * routes whose gateway equals the address that just became local.
 */
2861 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2863 struct in6_addr *gateway = (struct in6_addr *)arg;
2865 if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2866 ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2867 ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
/* Entry point: walk all fib tables with the callback above. */
2873 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
2875 fib6_clean_all(net, fib6_clean_tohost, gateway);
/* Argument bundle for the ifdown walk below. */
2878 struct arg_dev_net {
2879 struct net_device *dev;
2883 /* called with write lock held for table with rt */
/* fib6_clean_all callback: select routes to drop when dev goes down.
 * dev == NULL means "all devices"; multipath legs survive unless the
 * device is unregistering or linkdown routes are not being ignored.
 */
2884 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2886 const struct arg_dev_net *adn = arg;
2887 const struct net_device *dev = adn->dev;
2889 if ((rt->dst.dev == dev || !dev) &&
2890 rt != adn->net->ipv6.ip6_null_entry &&
2891 (rt->rt6i_nsiblings == 0 ||
2892 (dev && netdev_unregistering(dev)) ||
2893 !rt->rt6i_idev->cnf.ignore_routes_with_linkdown))
/* Purge fib routes for dev, then flush its uncached dst entries. */
2899 void rt6_ifdown(struct net *net, struct net_device *dev)
2901 struct arg_dev_net adn = {
2906 fib6_clean_all(net, fib6_ifdown, &adn);
2908 rt6_uncached_list_flush_dev(net, dev);
/* Argument bundle for rt6_mtu_change_route().  NOTE(review): the 'mtu'
 * member read by the callback is elided from this excerpt. */
2911 struct rt6_mtu_change_arg {
2912 struct net_device *dev;
/* fib6_clean_all() callback: propagate a device MTU change into route
 * metrics.  Updates RTF_CACHE clones via rt6i_pmtu and other routes via
 * the RTAX_MTU metric, per the RFC 1981 reasoning in the comments below. */
2916 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2918 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2919 struct inet6_dev *idev;
2921 /* In IPv6 pmtu discovery is not optional,
2922 so that RTAX_MTU lock cannot disable it.
2923 We still use this lock to block changes
2924 caused by addrconf/ndisc.
/* NOTE(review): NULL check on idev appears elided in this excerpt. */
2927 idev = __in6_dev_get(arg->dev);
2931 /* For administrative MTU increase, there is no way to discover
2932 IPv6 PMTU increase, so PMTU increase should be updated here.
2933 Since RFC 1981 doesn't include administrative MTU increase
2934 update PMTU increase is a MUST. (i.e. jumbo frame)
2937 If new MTU is less than route PMTU, this new MTU will be the
2938 lowest MTU in the path, update the route PMTU to reflect PMTU
2939 decreases; if new MTU is greater than route PMTU, and the
2940 old MTU is the lowest MTU in the path, update the route PMTU
2941 to reflect the increase. In this case if the other nodes' MTU
2942 also have the lowest MTU, TOO BIG MESSAGE will be lead to
/* Only touch routes on the changed device with an unlocked MTU metric. */
2945 if (rt->dst.dev == arg->dev &&
2946 dst_metric_raw(&rt->dst, RTAX_MTU) &&
2947 !dst_metric_locked(&rt->dst, RTAX_MTU)) {
2948 if (rt->rt6i_flags & RTF_CACHE) {
2949 /* For RTF_CACHE with rt6i_pmtu == 0
2950 * (i.e. a redirected route),
2951 * the metrics of its rt->dst.from has already
/* Shrink a cached PMTU only; never grow it past the path-learned value. */
2954 if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
2955 rt->rt6i_pmtu = arg->mtu;
2956 } else if (dst_mtu(&rt->dst) >= arg->mtu ||
2957 (dst_mtu(&rt->dst) < arg->mtu &&
2958 dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
2959 dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
/* Public entry: apply a new device MTU to all routes in @dev's netns.
 * NOTE(review): the .dev/.mtu initializers of 'arg' are elided here. */
2965 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2967 struct rt6_mtu_change_arg arg = {
2972 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
/* Netlink attribute validation policy for RTM_{NEW,DEL,GET}ROUTE on
 * AF_INET6: minimum lengths for address-carrying attributes, exact types
 * for the rest.  Consumed by nlmsg_parse() in rtm_to_fib6_config() and
 * inet6_rtm_getroute(). */
2975 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2976 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
2977 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) },
2978 [RTA_OIF] = { .type = NLA_U32 },
2979 [RTA_IIF] = { .type = NLA_U32 },
2980 [RTA_PRIORITY] = { .type = NLA_U32 },
2981 [RTA_METRICS] = { .type = NLA_NESTED },
2982 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
2983 [RTA_PREF] = { .type = NLA_U8 },
2984 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
2985 [RTA_ENCAP] = { .type = NLA_NESTED },
2986 [RTA_EXPIRES] = { .type = NLA_U32 },
2987 [RTA_UID] = { .type = NLA_U32 },
2988 [RTA_MARK] = { .type = NLA_U32 },
2989 [RTA_TABLE] = { .type = NLA_U32 },
/* Parse an RTM_NEWROUTE/RTM_DELROUTE netlink message into @cfg.
 * Validates attributes against rtm_ipv6_policy, maps rtm_type/rtm_flags
 * to RTF_* flags, and copies per-attribute fields.  Returns 0 or a
 * negative errno (error paths partly elided in this excerpt). */
2992 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2993 struct fib6_config *cfg,
2994 struct netlink_ext_ack *extack)
2997 struct nlattr *tb[RTA_MAX+1];
3001 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
3007 rtm = nlmsg_data(nlh);
3008 memset(cfg, 0, sizeof(*cfg));
3010 cfg->fc_table = rtm->rtm_table;
3011 cfg->fc_dst_len = rtm->rtm_dst_len;
3012 cfg->fc_src_len = rtm->rtm_src_len;
3013 cfg->fc_flags = RTF_UP;
3014 cfg->fc_protocol = rtm->rtm_protocol;
3015 cfg->fc_type = rtm->rtm_type;
/* These route types never forward packets -> mark as reject routes. */
3017 if (rtm->rtm_type == RTN_UNREACHABLE ||
3018 rtm->rtm_type == RTN_BLACKHOLE ||
3019 rtm->rtm_type == RTN_PROHIBIT ||
3020 rtm->rtm_type == RTN_THROW)
3021 cfg->fc_flags |= RTF_REJECT;
3023 if (rtm->rtm_type == RTN_LOCAL)
3024 cfg->fc_flags |= RTF_LOCAL;
3026 if (rtm->rtm_flags & RTM_F_CLONED)
3027 cfg->fc_flags |= RTF_CACHE;
/* Record requester identity/netns so notifications go to the right place. */
3029 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
3030 cfg->fc_nlinfo.nlh = nlh;
3031 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
3033 if (tb[RTA_GATEWAY]) {
3034 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
3035 cfg->fc_flags |= RTF_GATEWAY;
3038 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
/* Prefix bytes needed for rtm_dst_len bits, rounded up. */
3043 int plen = (rtm->rtm_dst_len + 7) >> 3;
3045 if (nla_len(tb[RTA_DST]) < plen)
3048 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
3052 int plen = (rtm->rtm_src_len + 7) >> 3;
3054 if (nla_len(tb[RTA_SRC]) < plen)
3057 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
3060 if (tb[RTA_PREFSRC])
3061 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
3064 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
3066 if (tb[RTA_PRIORITY])
3067 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
/* Metrics/multipath are kept as pointers into the skb, not copied. */
3069 if (tb[RTA_METRICS]) {
3070 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
3071 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
3075 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
3077 if (tb[RTA_MULTIPATH]) {
3078 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
3079 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
3081 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
3082 cfg->fc_mp_len, extack);
/* Unknown router-preference values fall back to medium per RFC 4191. */
3088 pref = nla_get_u8(tb[RTA_PREF]);
3089 if (pref != ICMPV6_ROUTER_PREF_LOW &&
3090 pref != ICMPV6_ROUTER_PREF_HIGH)
3091 pref = ICMPV6_ROUTER_PREF_MEDIUM;
3092 cfg->fc_flags |= RTF_PREF(pref);
3096 cfg->fc_encap = tb[RTA_ENCAP];
3098 if (tb[RTA_ENCAP_TYPE]) {
3099 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
3101 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
3106 if (tb[RTA_EXPIRES]) {
3107 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
3109 if (addrconf_finite_timeout(timeout)) {
3110 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
3111 cfg->fc_flags |= RTF_EXPIRES;
/* Per-nexthop bookkeeping node used while building a multipath route:
 * the created rt6_info, its originating config, converted metrics, and
 * the list linkage.  NOTE(review): the 'struct rt6_nh {' opener is
 * elided from this excerpt. */
3121 struct rt6_info *rt6_info;
3122 struct fib6_config r_cfg;
3123 struct mx6_config mxc;
3124 struct list_head next;
/* Log every nexthop of a failed multipath replace so the admin can audit
 * which routes may now be inconsistently installed. */
3127 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
3131 list_for_each_entry(nh, rt6_nh_list, next) {
3132 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
3133 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
3134 nh->r_cfg.fc_ifindex);
/* Append (@rt, @r_cfg) to @rt6_nh_list unless an equivalent nexthop is
 * already queued.  Allocates the list node and converts metrics.
 * NOTE(review): duplicate-/alloc-failure return paths and the nh->rt6_info
 * assignment are elided in this excerpt. */
3138 static int ip6_route_info_append(struct list_head *rt6_nh_list,
3139 struct rt6_info *rt, struct fib6_config *r_cfg)
3144 list_for_each_entry(nh, rt6_nh_list, next) {
3145 /* check if rt6_info already exists */
3146 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
3150 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
3154 err = ip6_convert_metrics(&nh->mxc, r_cfg);
3159 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
3160 list_add_tail(&nh->next, rt6_nh_list);
/* Send one RTM_NEWROUTE notification for a multipath add/replace/append.
 * For NLM_F_APPEND, rewinds from the last inserted sibling to the first
 * so userspace sees a consistent dump starting at the first nexthop. */
3165 static void ip6_route_mpath_notify(struct rt6_info *rt,
3166 struct rt6_info *rt_last,
3167 struct nl_info *info,
3170 /* if this is an APPEND route, then rt points to the first route
3171 * inserted and rt_last points to last route inserted. Userspace
3172 * wants a consistent dump of the route which starts at the first
3173 * nexthop. Since sibling routes are always added at the end of
3174 * the list, find the first sibling of the last route appended
3176 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->rt6i_nsiblings) {
3177 rt = list_first_entry(&rt_last->rt6i_siblings,
3183 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
/* Extract an IPv6 gateway from an RTA_GATEWAY attribute after verifying
 * the attribute is large enough; sets extack on short input.
 * NOTE(review): the error-return statement is elided in this excerpt. */
3186 static int fib6_gw_from_attr(struct in6_addr *gw, struct nlattr *nla,
3187 struct netlink_ext_ack *extack)
3189 if (nla_len(nla) < sizeof(*gw)) {
3190 NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_GATEWAY");
3194 *gw = nla_get_in6_addr(nla);
/* Install a multipath route: parse each rtnexthop in cfg->fc_mp into an
 * rt6_info, queue them on rt6_nh_list, then insert them one by one with a
 * single combined notification at the end.  On partial failure, already-
 * inserted routes are deleted again (cleanup paths partly elided). */
3199 static int ip6_route_multipath_add(struct fib6_config *cfg,
3200 struct netlink_ext_ack *extack)
3202 struct rt6_info *rt_notif = NULL, *rt_last = NULL;
3203 struct nl_info *info = &cfg->fc_nlinfo;
3204 struct fib6_config r_cfg;
3205 struct rtnexthop *rtnh;
3206 struct rt6_info *rt;
3207 struct rt6_nh *err_nh;
3208 struct rt6_nh *nh, *nh_safe;
3214 int replace = (cfg->fc_nlinfo.nlh &&
3215 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
3216 LIST_HEAD(rt6_nh_list);
3218 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
3219 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
3220 nlflags |= NLM_F_APPEND;
3222 remaining = cfg->fc_mp_len;
3223 rtnh = (struct rtnexthop *)cfg->fc_mp;
3225 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
3226 * rt6_info structs per nexthop
3228 while (rtnh_ok(rtnh, remaining)) {
/* Each nexthop starts from the base config, then overrides ifindex,
 * gateway and encap from its own embedded attributes. */
3229 memcpy(&r_cfg, cfg, sizeof(*cfg));
3230 if (rtnh->rtnh_ifindex)
3231 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3233 attrlen = rtnh_attrlen(rtnh);
3235 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3237 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3239 err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla,
3244 r_cfg.fc_flags |= RTF_GATEWAY;
3246 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
3247 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
3249 r_cfg.fc_encap_type = nla_get_u16(nla);
3252 rt = ip6_route_info_create(&r_cfg, extack);
3259 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
/* Append failed: drop the rt we just created, then bail to cleanup. */
3261 dst_release_immediate(&rt->dst);
3265 rtnh = rtnh_next(rtnh, &remaining);
3268 /* for add and replace send one notification with all nexthops.
3269 * Skip the notification in fib6_add_rt2node and send one with
3270 * the full route when done
3272 info->skip_notify = 1;
3275 list_for_each_entry(nh, &rt6_nh_list, next) {
3276 err = __ip6_ins_rt(nh->rt6_info, info, &nh->mxc, extack);
3279 /* save reference to last route successfully inserted */
3280 rt_last = nh->rt6_info;
3282 /* save reference to first route for notification */
3284 rt_notif = nh->rt6_info;
3287 /* nh->rt6_info is used or freed at this point, reset to NULL*/
3288 nh->rt6_info = NULL;
3291 ip6_print_replace_route_err(&rt6_nh_list);
3296 /* Because each route is added like a single route we remove
3297 * these flags after the first nexthop: if there is a collision,
3298 * we have already failed to add the first nexthop:
3299 * fib6_add_rt2node() has rejected it; when replacing, old
3300 * nexthops have been replaced by first new, the rest should
3303 if (cfg->fc_nlinfo.nlh) {
3304 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
3306 cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_CREATE;
3311 /* success ... tell user about new route */
3312 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3316 /* send notification for routes that were added so that
3317 * the delete notifications sent by ip6_route_del are
3321 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
3323 /* Delete routes that were already added */
3324 list_for_each_entry(nh, &rt6_nh_list, next) {
3327 ip6_route_del(&nh->r_cfg, extack);
/* Final cleanup: release any un-inserted rt6_infos and free list nodes. */
3331 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
3333 dst_release_immediate(&nh->rt6_info->dst);
3335 list_del(&nh->next);
/* Delete a multipath route: walk each rtnexthop in cfg->fc_mp, derive a
 * per-nexthop config (ifindex/gateway overrides) and call ip6_route_del()
 * for it.  NOTE(review): how last_err aggregates per-nexthop failures is
 * elided in this excerpt. */
3342 static int ip6_route_multipath_del(struct fib6_config *cfg,
3343 struct netlink_ext_ack *extack)
3345 struct fib6_config r_cfg;
3346 struct rtnexthop *rtnh;
3349 int err = 1, last_err = 0;
3351 remaining = cfg->fc_mp_len;
3352 rtnh = (struct rtnexthop *)cfg->fc_mp;
3354 /* Parse a Multipath Entry */
3355 while (rtnh_ok(rtnh, remaining)) {
3356 memcpy(&r_cfg, cfg, sizeof(*cfg));
3357 if (rtnh->rtnh_ifindex)
3358 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3360 attrlen = rtnh_attrlen(rtnh);
3362 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3364 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3366 err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla,
3373 r_cfg.fc_flags |= RTF_GATEWAY;
3376 err = ip6_route_del(&r_cfg, extack);
3381 rtnh = rtnh_next(rtnh, &remaining);
/* RTM_DELROUTE doit handler: parse the request, then dispatch to the
 * multipath or single-route delete path. */
3387 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3388 struct netlink_ext_ack *extack)
3390 struct fib6_config cfg;
3393 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3398 return ip6_route_multipath_del(&cfg, extack);
/* Single-route delete: remove all nexthops matching the config. */
3400 cfg.fc_delete_all_nh = 1;
3401 return ip6_route_del(&cfg, extack);
/* RTM_NEWROUTE doit handler: parse the request, then dispatch to the
 * multipath or single-route add path. */
3405 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
3406 struct netlink_ext_ack *extack)
3408 struct fib6_config cfg;
3411 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
3416 return ip6_route_multipath_add(&cfg, extack);
3418 return ip6_route_add(&cfg, extack);
/* Upper bound on the netlink message size rt6_fill_node() can emit for
 * @rt, including one RTA_MULTIPATH entry per sibling.  Used to size the
 * skb in inet6_rt_notify(); an undersized result trips the -EMSGSIZE
 * WARN_ON there. */
3421 static size_t rt6_nlmsg_size(struct rt6_info *rt)
3423 int nexthop_len = 0;
3425 if (rt->rt6i_nsiblings) {
3426 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
3427 + NLA_ALIGN(sizeof(struct rtnexthop))
3428 + nla_total_size(16) /* RTA_GATEWAY */
3429 + lwtunnel_get_encap_size(rt->dst.lwtstate);
3431 nexthop_len *= rt->rt6i_nsiblings;
3434 return NLMSG_ALIGN(sizeof(struct rtmsg))
3435 + nla_total_size(16) /* RTA_SRC */
3436 + nla_total_size(16) /* RTA_DST */
3437 + nla_total_size(16) /* RTA_GATEWAY */
3438 + nla_total_size(16) /* RTA_PREFSRC */
3439 + nla_total_size(4) /* RTA_TABLE */
3440 + nla_total_size(4) /* RTA_IIF */
3441 + nla_total_size(4) /* RTA_OIF */
3442 + nla_total_size(4) /* RTA_PRIORITY */
3443 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3444 + nla_total_size(sizeof(struct rta_cacheinfo))
3445 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3446 + nla_total_size(1) /* RTA_PREF */
3447 + lwtunnel_get_encap_size(rt->dst.lwtstate)
/* Emit the per-nexthop attributes (gateway, oif, lwtunnel encap) for @rt
 * into @skb and accumulate RTNH_F_* state into *flags.  @skip_oif is set
 * when the caller encodes the ifindex in a rtnexthop header instead.
 * Returns 0 or jumps to nla_put_failure on a full skb (paths elided). */
3451 static int rt6_nexthop_info(struct sk_buff *skb, struct rt6_info *rt,
3452 unsigned int *flags, bool skip_oif)
3454 if (!netif_running(rt->dst.dev) || !netif_carrier_ok(rt->dst.dev)) {
3455 *flags |= RTNH_F_LINKDOWN;
3456 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3457 *flags |= RTNH_F_DEAD;
3460 if (rt->rt6i_flags & RTF_GATEWAY) {
3461 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
3462 goto nla_put_failure;
3465 if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
3466 *flags |= RTNH_F_OFFLOAD;
3468 /* not needed for multipath encoding b/c it has a rtnexthop struct */
3469 if (!skip_oif && rt->dst.dev &&
3470 nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
3471 goto nla_put_failure;
3473 if (rt->dst.lwtstate &&
3474 lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
3475 goto nla_put_failure;
3483 /* add multipath next hop */
/* Append one rtnexthop record for @rt inside an open RTA_MULTIPATH nest:
 * reserve the header, emit nexthop attributes via rt6_nexthop_info()
 * (skip_oif=true since the header carries the ifindex), then backfill the
 * flags and total length. */
3484 static int rt6_add_nexthop(struct sk_buff *skb, struct rt6_info *rt)
3486 struct rtnexthop *rtnh;
3487 unsigned int flags = 0;
3489 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
3491 goto nla_put_failure;
3493 rtnh->rtnh_hops = 0;
3494 rtnh->rtnh_ifindex = rt->dst.dev ? rt->dst.dev->ifindex : 0;
3496 if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
3497 goto nla_put_failure;
3499 rtnh->rtnh_flags = flags;
3501 /* length of rtnetlink header + attributes */
3502 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
/* Serialize @rt into an RTM message in @skb: rtmsg header, addresses,
 * table, metrics, multipath nexthops, cacheinfo and preference.  @dst/@src
 * non-NULL means this answers a specific RTM_GETROUTE query (full /128
 * addresses are reported).  On overflow the message is cancelled via
 * nla_put_failure (return statements elided in this excerpt). */
3510 static int rt6_fill_node(struct net *net,
3511 struct sk_buff *skb, struct rt6_info *rt,
3512 struct in6_addr *dst, struct in6_addr *src,
3513 int iif, int type, u32 portid, u32 seq,
3516 u32 metrics[RTAX_MAX];
3518 struct nlmsghdr *nlh;
3522 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
3526 rtm = nlmsg_data(nlh);
3527 rtm->rtm_family = AF_INET6;
3528 rtm->rtm_dst_len = rt->rt6i_dst.plen;
3529 rtm->rtm_src_len = rt->rt6i_src.plen;
3532 table = rt->rt6i_table->tb6_id;
3534 table = RT6_TABLE_UNSPEC;
/* Legacy 8-bit field: large table ids are reported as RT_TABLE_COMPAT,
 * with the real id always present in RTA_TABLE. */
3535 rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
3536 if (nla_put_u32(skb, RTA_TABLE, table))
3537 goto nla_put_failure;
/* Map the reject route's dst.error back to the RTN_* type userspace set. */
3538 if (rt->rt6i_flags & RTF_REJECT) {
3539 switch (rt->dst.error) {
3541 rtm->rtm_type = RTN_BLACKHOLE;
3544 rtm->rtm_type = RTN_PROHIBIT;
3547 rtm->rtm_type = RTN_THROW;
3550 rtm->rtm_type = RTN_UNREACHABLE;
3554 else if (rt->rt6i_flags & RTF_LOCAL)
3555 rtm->rtm_type = RTN_LOCAL;
3556 else if (rt->rt6i_flags & RTF_ANYCAST)
3557 rtm->rtm_type = RTN_ANYCAST;
3558 else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
3559 rtm->rtm_type = RTN_LOCAL;
3561 rtm->rtm_type = RTN_UNICAST;
3563 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
3564 rtm->rtm_protocol = rt->rt6i_protocol;
3566 if (rt->rt6i_flags & RTF_CACHE)
3567 rtm->rtm_flags |= RTM_F_CLONED;
3570 if (nla_put_in6_addr(skb, RTA_DST, dst))
3571 goto nla_put_failure;
3572 rtm->rtm_dst_len = 128;
3573 } else if (rtm->rtm_dst_len)
3574 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
3575 goto nla_put_failure;
3576 #ifdef CONFIG_IPV6_SUBTREES
3578 if (nla_put_in6_addr(skb, RTA_SRC, src))
3579 goto nla_put_failure;
3580 rtm->rtm_src_len = 128;
3581 } else if (rtm->rtm_src_len &&
3582 nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
3583 goto nla_put_failure;
3586 #ifdef CONFIG_IPV6_MROUTE
3587 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
3588 int err = ip6mr_get_route(net, skb, rtm, portid);
3593 goto nla_put_failure;
3596 if (nla_put_u32(skb, RTA_IIF, iif))
3597 goto nla_put_failure;
/* For a query with a concrete destination, report the source address
 * the kernel would select for it. */
3599 struct in6_addr saddr_buf;
3600 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
3601 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3602 goto nla_put_failure;
3605 if (rt->rt6i_prefsrc.plen) {
3606 struct in6_addr saddr_buf;
3607 saddr_buf = rt->rt6i_prefsrc.addr;
3608 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3609 goto nla_put_failure;
/* Metrics are copied so a cached PMTU can override RTAX_MTU locally. */
3612 memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
3614 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
3615 if (rtnetlink_put_metrics(skb, metrics) < 0)
3616 goto nla_put_failure;
3618 if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
3619 goto nla_put_failure;
3621 /* For multipath routes, walk the siblings list and add
3622 * each as a nexthop within RTA_MULTIPATH.
3624 if (rt->rt6i_nsiblings) {
3625 struct rt6_info *sibling, *next_sibling;
3628 mp = nla_nest_start(skb, RTA_MULTIPATH);
3630 goto nla_put_failure;
3632 if (rt6_add_nexthop(skb, rt) < 0)
3633 goto nla_put_failure;
3635 list_for_each_entry_safe(sibling, next_sibling,
3636 &rt->rt6i_siblings, rt6i_siblings) {
3637 if (rt6_add_nexthop(skb, sibling) < 0)
3638 goto nla_put_failure;
3641 nla_nest_end(skb, mp);
3643 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
3644 goto nla_put_failure;
3647 expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
3649 if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
3650 goto nla_put_failure;
3652 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
3653 goto nla_put_failure;
3656 nlmsg_end(skb, nlh);
3660 nlmsg_cancel(skb, nlh);
/* fib6 dump callback: emit one route into the dump skb via rt6_fill_node(),
 * skipping the null entry and honoring the RTM_F_PREFIX filter.
 * NOTE(review): the skip/return statements inside the filters are elided. */
3664 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
3666 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
3667 struct net *net = arg->net;
3669 if (rt == net->ipv6.ip6_null_entry)
3672 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
3673 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
3675 /* user wants prefix routes only */
3676 if (rtm->rtm_flags & RTM_F_PREFIX &&
3677 !(rt->rt6i_flags & RTF_PREFIX_RT)) {
3678 /* success since this is not a prefix route */
3683 return rt6_fill_node(net,
3684 arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3685 NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
/* RTM_GETROUTE doit handler: build a flowi6 from the request attributes,
 * perform an input- or output-path route lookup (input when RTA_IIF is
 * given), and unicast the result back via rt6_fill_node().  With
 * RTM_F_FIB_MATCH set, reports the matched FIB entry rather than the
 * resolved dst.  Error paths are partly elided in this excerpt. */
3689 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
3690 struct netlink_ext_ack *extack)
3692 struct net *net = sock_net(in_skb->sk);
3693 struct nlattr *tb[RTA_MAX+1];
3694 int err, iif = 0, oif = 0;
3695 struct dst_entry *dst;
3696 struct rt6_info *rt;
3697 struct sk_buff *skb;
3702 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
3708 memset(&fl6, 0, sizeof(fl6));
3709 rtm = nlmsg_data(nlh);
3710 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
3711 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
3714 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
3717 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
3721 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
3724 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3728 iif = nla_get_u32(tb[RTA_IIF]);
3731 oif = nla_get_u32(tb[RTA_OIF]);
3734 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
3737 fl6.flowi6_uid = make_kuid(current_user_ns(),
3738 nla_get_u32(tb[RTA_UID]));
3740 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
/* Input-path lookup: resolve the incoming device under RCU, as if the
 * packet had arrived on it. */
3743 struct net_device *dev;
3748 dev = dev_get_by_index_rcu(net, iif);
3755 fl6.flowi6_iif = iif;
3757 if (!ipv6_addr_any(&fl6.saddr))
3758 flags |= RT6_LOOKUP_F_HAS_SADDR;
3760 dst = ip6_route_input_lookup(net, dev, &fl6, flags);
3764 fl6.flowi6_oif = oif;
3766 dst = ip6_route_output(net, NULL, &fl6);
3770 rt = container_of(dst, struct rt6_info, dst);
3771 if (rt->dst.error) {
3772 err = rt->dst.error;
3777 if (rt == net->ipv6.ip6_null_entry) {
3778 err = rt->dst.error;
/* fibmatch: swap a cached clone for the FIB entry it was derived from. */
3783 if (fibmatch && rt->dst.from) {
3784 struct rt6_info *ort = container_of(rt->dst.from,
3785 struct rt6_info, dst);
3787 dst_hold(&ort->dst);
3792 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3799 skb_dst_set(skb, &rt->dst);
3801 err = rt6_fill_node(net, skb, rt, NULL, NULL, iif,
3802 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3805 err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3806 RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3813 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
/* Broadcast an RTM_NEWROUTE/RTM_DELROUTE notification for @rt to the
 * RTNLGRP_IPV6_ROUTE multicast group; on failure, record the error on
 * the group's listeners via rtnl_set_sk_err(). */
3818 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
3819 unsigned int nlm_flags)
3821 struct sk_buff *skb;
3822 struct net *net = info->nl_net;
3827 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3829 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3833 err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3834 event, info->portid, seq, nlm_flags);
3836 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3837 WARN_ON(err == -EMSGSIZE);
3841 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3842 info->nlh, gfp_any());
3846 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
/* Netdevice notifier (loopback only): bind the per-netns special route
 * entries (null/prohibit/blackhole) to the loopback device on REGISTER
 * and release their inet6_dev references on the first UNREGISTER. */
3849 static int ip6_route_dev_notify(struct notifier_block *this,
3850 unsigned long event, void *ptr)
3852 struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3853 struct net *net = dev_net(dev);
3855 if (!(dev->flags & IFF_LOOPBACK))
3858 if (event == NETDEV_REGISTER) {
3859 net->ipv6.ip6_null_entry->dst.dev = dev;
3860 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3861 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3862 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3863 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3864 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3865 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3867 } else if (event == NETDEV_UNREGISTER &&
3868 dev->reg_state != NETREG_UNREGISTERED) {
3869 /* NETDEV_UNREGISTER could be fired for multiple times by
3870 * netdev_wait_allrefs(). Make sure we only call this once.
3872 in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
3873 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3874 in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
3875 in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
3886 #ifdef CONFIG_PROC_FS
/* /proc/net/ipv6_route file operations (seq_file based).
 * NOTE(review): the .read member appears elided in this excerpt. */
3888 static const struct file_operations ipv6_route_proc_fops = {
3889 .owner = THIS_MODULE,
3890 .open = ipv6_route_open,
3892 .llseek = seq_lseek,
3893 .release = seq_release_net,
/* /proc/net/rt6_stats: one line of hex FIB statistics for this netns
 * (nodes, route nodes, allocations, entries, cache, dst count, discards). */
3896 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3898 struct net *net = (struct net *)seq->private;
3899 seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3900 net->ipv6.rt6_stats->fib_nodes,
3901 net->ipv6.rt6_stats->fib_route_nodes,
3902 net->ipv6.rt6_stats->fib_rt_alloc,
3903 net->ipv6.rt6_stats->fib_rt_entries,
3904 net->ipv6.rt6_stats->fib_rt_cache,
3905 dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3906 net->ipv6.rt6_stats->fib_discarded_routes,
/* open() for /proc/net/rt6_stats: single-shot, netns-aware seq file. */
3911 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
3913 return single_open_net(inode, file, rt6_stats_seq_show);
/* /proc/net/rt6_stats file operations.
 * NOTE(review): the .read member appears elided in this excerpt. */
3916 static const struct file_operations rt6_stats_seq_fops = {
3917 .owner = THIS_MODULE,
3918 .open = rt6_stats_seq_open,
3920 .llseek = seq_lseek,
3921 .release = single_release_net,
3923 #endif /* CONFIG_PROC_FS */
3925 #ifdef CONFIG_SYSCTL
/* sysctl handler for net.ipv6.route.flush: writing a delay triggers an
 * immediate FIB6 garbage-collection run for the owning netns.
 * NOTE(review): the write-only guard and return are elided here. */
3928 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3929 void __user *buffer, size_t *lenp, loff_t *ppos)
3936 net = (struct net *)ctl->extra1;
3937 delay = net->ipv6.sysctl.flush_delay;
3938 proc_dointvec(ctl, write, buffer, lenp, ppos);
/* delay <= 0 means flush everything now; positive delays expire entries. */
3939 fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
/* Template for the per-netns net.ipv6.route.* sysctl table; the .data
 * pointers are rewritten per namespace in ipv6_route_sysctl_init().
 * NOTE(review): .mode fields appear elided in this excerpt. */
3943 struct ctl_table ipv6_route_table_template[] = {
3945 .procname = "flush",
3946 .data = &init_net.ipv6.sysctl.flush_delay,
3947 .maxlen = sizeof(int),
3949 .proc_handler = ipv6_sysctl_rtcache_flush
3952 .procname = "gc_thresh",
3953 .data = &ip6_dst_ops_template.gc_thresh,
3954 .maxlen = sizeof(int),
3956 .proc_handler = proc_dointvec,
3959 .procname = "max_size",
3960 .data = &init_net.ipv6.sysctl.ip6_rt_max_size,
3961 .maxlen = sizeof(int),
3963 .proc_handler = proc_dointvec,
3966 .procname = "gc_min_interval",
3967 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3968 .maxlen = sizeof(int),
3970 .proc_handler = proc_dointvec_jiffies,
3973 .procname = "gc_timeout",
3974 .data = &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
3975 .maxlen = sizeof(int),
3977 .proc_handler = proc_dointvec_jiffies,
3980 .procname = "gc_interval",
3981 .data = &init_net.ipv6.sysctl.ip6_rt_gc_interval,
3982 .maxlen = sizeof(int),
3984 .proc_handler = proc_dointvec_jiffies,
3987 .procname = "gc_elasticity",
3988 .data = &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
3989 .maxlen = sizeof(int),
3991 .proc_handler = proc_dointvec,
3994 .procname = "mtu_expires",
3995 .data = &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
3996 .maxlen = sizeof(int),
3998 .proc_handler = proc_dointvec_jiffies,
4001 .procname = "min_adv_mss",
4002 .data = &init_net.ipv6.sysctl.ip6_rt_min_advmss,
4003 .maxlen = sizeof(int),
4005 .proc_handler = proc_dointvec,
4008 .procname = "gc_min_interval_ms",
4009 .data = &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
4010 .maxlen = sizeof(int),
4012 .proc_handler = proc_dointvec_ms_jiffies,
/* Duplicate the sysctl template for @net and point each entry's .data at
 * the namespace's own fields.  The index-based table[i] assignments must
 * stay in sync with ipv6_route_table_template's entry order above.
 * NOTE(review): kmemdup NULL check and return appear elided. */
4017 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
4019 struct ctl_table *table;
4021 table = kmemdup(ipv6_route_table_template,
4022 sizeof(ipv6_route_table_template),
4026 table[0].data = &net->ipv6.sysctl.flush_delay;
4027 table[0].extra1 = net;
4028 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
4029 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
4030 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4031 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
4032 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
4033 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
4034 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
4035 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
4036 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
4038 /* Don't export sysctls to unprivileged users */
4039 if (net->user_ns != &init_user_ns)
4040 table[0].procname = NULL;
/* Per-netns init: set up dst ops and counters, clone the special route
 * templates (null, and with multiple tables also prohibit/blackhole),
 * wire their dst.path/ops/metrics, and seed the GC sysctl defaults.
 * Unwinds partially-created state via the labels at the bottom. */
4047 static int __net_init ip6_route_net_init(struct net *net)
4051 memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
4052 sizeof(net->ipv6.ip6_dst_ops));
4054 if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
4055 goto out_ip6_dst_ops;
4057 net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
4058 sizeof(*net->ipv6.ip6_null_entry),
4060 if (!net->ipv6.ip6_null_entry)
4061 goto out_ip6_dst_entries;
4062 net->ipv6.ip6_null_entry->dst.path =
4063 (struct dst_entry *)net->ipv6.ip6_null_entry;
4064 net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4065 dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
4066 ip6_template_metrics, true);
4068 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4069 net->ipv6.fib6_has_custom_rules = false;
4070 net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
4071 sizeof(*net->ipv6.ip6_prohibit_entry),
4073 if (!net->ipv6.ip6_prohibit_entry)
4074 goto out_ip6_null_entry;
4075 net->ipv6.ip6_prohibit_entry->dst.path =
4076 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
4077 net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4078 dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
4079 ip6_template_metrics, true);
4081 net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
4082 sizeof(*net->ipv6.ip6_blk_hole_entry),
4084 if (!net->ipv6.ip6_blk_hole_entry)
4085 goto out_ip6_prohibit_entry;
4086 net->ipv6.ip6_blk_hole_entry->dst.path =
4087 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
4088 net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
4089 dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
4090 ip6_template_metrics, true);
/* Default routing-cache/GC tunables for a fresh namespace. */
4093 net->ipv6.sysctl.flush_delay = 0;
4094 net->ipv6.sysctl.ip6_rt_max_size = 4096;
4095 net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
4096 net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
4097 net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
4098 net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
4099 net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
4100 net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
4102 net->ipv6.ip6_rt_gc_expire = 30*HZ;
/* Error unwind: free in reverse order of allocation. */
4108 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4109 out_ip6_prohibit_entry:
4110 kfree(net->ipv6.ip6_prohibit_entry);
4112 kfree(net->ipv6.ip6_null_entry);
4114 out_ip6_dst_entries:
4115 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* Per-netns teardown: free the special route entries and destroy the
 * dst entry counter (mirrors ip6_route_net_init). */
4120 static void __net_exit ip6_route_net_exit(struct net *net)
4122 kfree(net->ipv6.ip6_null_entry);
4123 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4124 kfree(net->ipv6.ip6_prohibit_entry);
4125 kfree(net->ipv6.ip6_blk_hole_entry);
4127 dst_entries_destroy(&net->ipv6.ip6_dst_ops);
/* Late per-netns init: create the /proc/net entries once the rest of the
 * routing state exists. */
4130 static int __net_init ip6_route_net_init_late(struct net *net)
4132 #ifdef CONFIG_PROC_FS
4133 proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
4134 proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
/* Late per-netns teardown: remove the /proc/net entries created by
 * ip6_route_net_init_late(). */
4139 static void __net_exit ip6_route_net_exit_late(struct net *net)
4141 #ifdef CONFIG_PROC_FS
4142 remove_proc_entry("ipv6_route", net->proc_net);
4143 remove_proc_entry("rt6_stats", net->proc_net);
/* Pernet registration for the core route state (init/exit above). */
4147 static struct pernet_operations ip6_route_net_ops = {
4148 .init = ip6_route_net_init,
4149 .exit = ip6_route_net_exit,
/* Per-netns inetpeer base allocation for IPv6.
 * NOTE(review): the NULL-check/return on 'bp' is elided in this excerpt. */
4152 static int __net_init ipv6_inetpeer_init(struct net *net)
4154 struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
4158 inet_peer_base_init(bp);
4159 net->ipv6.peers = bp;
/* Per-netns inetpeer teardown: detach the base, invalidate its tree
 * (NOTE(review): the kfree of 'bp' is elided in this excerpt). */
4163 static void __net_exit ipv6_inetpeer_exit(struct net *net)
4165 struct inet_peer_base *bp = net->ipv6.peers;
4167 net->ipv6.peers = NULL;
4168 inetpeer_invalidate_tree(bp);
/* Pernet registration for the IPv6 inetpeer base. */
4172 static struct pernet_operations ipv6_inetpeer_ops = {
4173 .init = ipv6_inetpeer_init,
4174 .exit = ipv6_inetpeer_exit,
/* Pernet registration for the late (procfs) route state. */
4177 static struct pernet_operations ip6_route_net_late_ops = {
4178 .init = ip6_route_net_init_late,
4179 .exit = ip6_route_net_exit_late,
/* Netdevice notifier; priority below addrconf so addrconf's idev exists
 * before ip6_route_dev_notify() takes references to it. */
4182 static struct notifier_block ip6_route_dev_notifier = {
4183 .notifier_call = ip6_route_dev_notify,
4184 .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
/* Boot-time fixup for init_net only: the loopback device registered
 * before the route notifier existed, so bind the special route entries
 * to it by hand (mirrors the NETDEV_REGISTER branch of
 * ip6_route_dev_notify()). */
4187 void __init ip6_route_init_special_entries(void)
4189 /* Registering of the loopback is done before this portion of code,
4190 * the loopback reference in rt6_info will not be taken, do it
4191 * manually for init_net */
4192 init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
4193 init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4194 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
4195 init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
4196 init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
4197 init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
4198 init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
/* Module init: create the dst slab cache, register pernet subsystems,
 * FIB rules, rtnetlink handlers and the device notifier, and initialize
 * the per-cpu uncached route lists.  Unwinds in reverse order via the
 * labels at the bottom on any failure. */
4202 int __init ip6_route_init(void)
4208 ip6_dst_ops_template.kmem_cachep =
4209 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
4210 SLAB_HWCACHE_ALIGN, NULL);
4211 if (!ip6_dst_ops_template.kmem_cachep)
4214 ret = dst_entries_init(&ip6_dst_blackhole_ops);
4216 goto out_kmem_cache;
4218 ret = register_pernet_subsys(&ipv6_inetpeer_ops);
4220 goto out_dst_entries;
4222 ret = register_pernet_subsys(&ip6_route_net_ops);
4224 goto out_register_inetpeer;
/* Blackhole dsts share the same slab as regular rt6_infos. */
4226 ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
4230 goto out_register_subsys;
4236 ret = fib6_rules_init();
4240 ret = register_pernet_subsys(&ip6_route_net_late_ops);
4242 goto fib6_rules_init;
/* GETROUTE may run without the rtnl lock (DOIT_UNLOCKED). */
4245 if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 0) ||
4246 __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 0) ||
4247 __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL,
4248 RTNL_FLAG_DOIT_UNLOCKED))
4249 goto out_register_late_subsys;
4251 ret = register_netdevice_notifier(&ip6_route_dev_notifier);
4253 goto out_register_late_subsys;
4255 for_each_possible_cpu(cpu) {
4256 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
4258 INIT_LIST_HEAD(&ul->head);
4259 spin_lock_init(&ul->lock);
/* Error unwind: undo registrations in reverse order. */
4265 out_register_late_subsys:
4266 unregister_pernet_subsys(&ip6_route_net_late_ops);
4268 fib6_rules_cleanup();
4273 out_register_subsys:
4274 unregister_pernet_subsys(&ip6_route_net_ops);
4275 out_register_inetpeer:
4276 unregister_pernet_subsys(&ipv6_inetpeer_ops);
4278 dst_entries_destroy(&ip6_dst_blackhole_ops);
4280 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
4284 void ip6_route_cleanup(void)
4286 unregister_netdevice_notifier(&ip6_route_dev_notifier);
4287 unregister_pernet_subsys(&ip6_route_net_late_ops);
4288 fib6_rules_cleanup();
4291 unregister_pernet_subsys(&ipv6_inetpeer_ops);
4292 unregister_pernet_subsys(&ip6_route_net_ops);
4293 dst_entries_destroy(&ip6_dst_blackhole_ops);
4294 kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);