2 * Linux INET6 implementation
6 * Pedro Roque <roque@di.fc.ul.pt>
8 * This program is free software; you can redistribute it and/or
9 * modify it under the terms of the GNU General Public License
10 * as published by the Free Software Foundation; either version
11 * 2 of the License, or (at your option) any later version.
16 * YOSHIFUJI Hideaki @USAGI
17 * reworked default router selection.
18 * - respect outgoing interface
19 * - select from (probably) reachable routers (i.e.
20 * routers in REACHABLE, STALE, DELAY or PROBE states).
21 * - always select the same router if it is (probably)
22 * reachable. Otherwise, round-robin the list.
24 * Fixed routing subtrees.
27 #define pr_fmt(fmt) "IPv6: " fmt
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <linux/jhash.h>
48 #include <linux/siphash.h>
49 #include <net/net_namespace.h>
52 #include <net/ip6_fib.h>
53 #include <net/ip6_route.h>
54 #include <net/ndisc.h>
55 #include <net/addrconf.h>
57 #include <linux/rtnetlink.h>
59 #include <net/dst_metadata.h>
61 #include <net/netevent.h>
62 #include <net/netlink.h>
63 #include <net/nexthop.h>
64 #include <net/lwtunnel.h>
65 #include <net/ip_tunnels.h>
66 #include <net/l3mdev.h>
68 #include <linux/uaccess.h>
71 #include <linux/sysctl.h>
74 static int ip6_rt_type_to_error(u8 fib6_type);
76 #define CREATE_TRACE_POINTS
77 #include <trace/events/fib6.h>
78 EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
79 #undef CREATE_TRACE_POINTS
82 RT6_NUD_FAIL_HARD = -3,
83 RT6_NUD_FAIL_PROBE = -2,
84 RT6_NUD_FAIL_DO_RR = -1,
88 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
89 static unsigned int ip6_default_advmss(const struct dst_entry *dst);
90 static unsigned int ip6_mtu(const struct dst_entry *dst);
91 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
92 static void ip6_dst_destroy(struct dst_entry *);
93 static void ip6_dst_ifdown(struct dst_entry *,
94 struct net_device *dev, int how);
95 static int ip6_dst_gc(struct dst_ops *ops);
97 static int ip6_pkt_discard(struct sk_buff *skb);
98 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
99 static int ip6_pkt_prohibit(struct sk_buff *skb);
100 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
101 static void ip6_link_failure(struct sk_buff *skb);
102 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
103 struct sk_buff *skb, u32 mtu,
105 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
106 struct sk_buff *skb);
107 static int rt6_score_route(struct fib6_info *rt, int oif, int strict);
108 static size_t rt6_nlmsg_size(struct fib6_info *rt);
109 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
110 struct fib6_info *rt, struct dst_entry *dst,
111 struct in6_addr *dest, struct in6_addr *src,
112 int iif, int type, u32 portid, u32 seq,
114 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
115 const struct in6_addr *daddr,
116 const struct in6_addr *saddr);
118 #ifdef CONFIG_IPV6_ROUTE_INFO
119 static struct fib6_info *rt6_add_route_info(struct net *net,
120 const struct in6_addr *prefix, int prefixlen,
121 const struct in6_addr *gwaddr,
122 struct net_device *dev,
124 static struct fib6_info *rt6_get_route_info(struct net *net,
125 const struct in6_addr *prefix, int prefixlen,
126 const struct in6_addr *gwaddr,
127 struct net_device *dev);
130 struct uncached_list {
132 struct list_head head;
135 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
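/* Per-cpu list of dst entries that are not stored in the fib tree, so that
 * rt6_uncached_list_flush_dev() can re-target them at the loopback device
 * when their device is unregistered.
 */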
137 void rt6_uncached_list_add(struct rt6_info *rt)
139 struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
141 rt->rt6i_uncached_list = ul;
143 spin_lock_bh(&ul->lock);
144 list_add_tail(&rt->rt6i_uncached, &ul->head);
145 spin_unlock_bh(&ul->lock);
148 void rt6_uncached_list_del(struct rt6_info *rt)
150 if (!list_empty(&rt->rt6i_uncached)) {
151 struct uncached_list *ul = rt->rt6i_uncached_list;
152 struct net *net = dev_net(rt->dst.dev);
154 spin_lock_bh(&ul->lock);
155 list_del(&rt->rt6i_uncached);
156 atomic_dec(&net->ipv6.rt6_stats->fib_rt_uncache);
157 spin_unlock_bh(&ul->lock);
161 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
163 struct net_device *loopback_dev = net->loopback_dev;
166 if (dev == loopback_dev)
169 for_each_possible_cpu(cpu) {
170 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
173 spin_lock_bh(&ul->lock);
174 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
175 struct inet6_dev *rt_idev = rt->rt6i_idev;
176 struct net_device *rt_dev = rt->dst.dev;
178 if (rt_idev->dev == dev) {
179 rt->rt6i_idev = in6_dev_get(loopback_dev);
180 in6_dev_put(rt_idev);
184 rt->dst.dev = loopback_dev;
185 dev_hold(rt->dst.dev);
189 spin_unlock_bh(&ul->lock);
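/* Key used for neighbour lookups: the gateway when one is configured,
 * otherwise the destination address.
 */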
193 static inline const void *choose_neigh_daddr(const struct in6_addr *p,
197 if (!ipv6_addr_any(p))
198 return (const void *) p;
200 return &ipv6_hdr(skb)->daddr;
204 struct neighbour *ip6_neigh_lookup(const struct in6_addr *gw,
205 struct net_device *dev,
211 daddr = choose_neigh_daddr(gw, skb, daddr);
212 n = __ipv6_neigh_lookup(dev, daddr);
216 n = neigh_create(&nd_tbl, daddr, dev);
217 return IS_ERR(n) ? NULL : n;
220 static struct neighbour *ip6_dst_neigh_lookup(const struct dst_entry *dst,
224 const struct rt6_info *rt = container_of(dst, struct rt6_info, dst);
226 return ip6_neigh_lookup(&rt->rt6i_gateway, dst->dev, skb, daddr);
229 static void ip6_confirm_neigh(const struct dst_entry *dst, const void *daddr)
231 struct net_device *dev = dst->dev;
232 struct rt6_info *rt = (struct rt6_info *)dst;
234 daddr = choose_neigh_daddr(&rt->rt6i_gateway, NULL, daddr);
237 if (dev->flags & (IFF_NOARP | IFF_LOOPBACK))
239 if (ipv6_addr_is_multicast((const struct in6_addr *)daddr))
241 __ipv6_confirm_neigh(dev, daddr);
244 static struct dst_ops ip6_dst_ops_template = {
248 .check = ip6_dst_check,
249 .default_advmss = ip6_default_advmss,
251 .cow_metrics = dst_cow_metrics_generic,
252 .destroy = ip6_dst_destroy,
253 .ifdown = ip6_dst_ifdown,
254 .negative_advice = ip6_negative_advice,
255 .link_failure = ip6_link_failure,
256 .update_pmtu = ip6_rt_update_pmtu,
257 .redirect = rt6_do_redirect,
258 .local_out = __ip6_local_out,
259 .neigh_lookup = ip6_dst_neigh_lookup,
260 .confirm_neigh = ip6_confirm_neigh,
263 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
265 unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
267 return mtu ? : dst->dev->mtu;
270 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
271 struct sk_buff *skb, u32 mtu,
276 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
281 static struct dst_ops ip6_dst_blackhole_ops = {
283 .destroy = ip6_dst_destroy,
284 .check = ip6_dst_check,
285 .mtu = ip6_blackhole_mtu,
286 .default_advmss = ip6_default_advmss,
287 .update_pmtu = ip6_rt_blackhole_update_pmtu,
288 .redirect = ip6_rt_blackhole_redirect,
289 .cow_metrics = dst_cow_metrics_generic,
290 .neigh_lookup = ip6_dst_neigh_lookup,
293 static const u32 ip6_template_metrics[RTAX_MAX] = {
294 [RTAX_HOPLIMIT - 1] = 0,
297 static const struct fib6_info fib6_null_entry_template = {
298 .fib6_flags = (RTF_REJECT | RTF_NONEXTHOP),
299 .fib6_protocol = RTPROT_KERNEL,
300 .fib6_metric = ~(u32)0,
301 .fib6_ref = ATOMIC_INIT(1),
302 .fib6_type = RTN_UNREACHABLE,
303 .fib6_metrics = (struct dst_metrics *)&dst_default_metrics,
306 static const struct rt6_info ip6_null_entry_template = {
308 .__refcnt = ATOMIC_INIT(1),
310 .obsolete = DST_OBSOLETE_FORCE_CHK,
311 .error = -ENETUNREACH,
312 .input = ip6_pkt_discard,
313 .output = ip6_pkt_discard_out,
315 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
318 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
320 static const struct rt6_info ip6_prohibit_entry_template = {
322 .__refcnt = ATOMIC_INIT(1),
324 .obsolete = DST_OBSOLETE_FORCE_CHK,
326 .input = ip6_pkt_prohibit,
327 .output = ip6_pkt_prohibit_out,
329 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
332 static const struct rt6_info ip6_blk_hole_entry_template = {
334 .__refcnt = ATOMIC_INIT(1),
336 .obsolete = DST_OBSOLETE_FORCE_CHK,
338 .input = dst_discard,
339 .output = dst_discard_out,
341 .rt6i_flags = (RTF_REJECT | RTF_NONEXTHOP),
346 static void rt6_info_init(struct rt6_info *rt)
348 struct dst_entry *dst = &rt->dst;
350 memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
351 INIT_LIST_HEAD(&rt->rt6i_uncached);
354 /* allocate dst with ip6_dst_ops */
355 struct rt6_info *ip6_dst_alloc(struct net *net, struct net_device *dev,
358 struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
359 1, DST_OBSOLETE_FORCE_CHK, flags);
363 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
368 EXPORT_SYMBOL(ip6_dst_alloc);
370 static void ip6_dst_destroy(struct dst_entry *dst)
372 struct dst_metrics *p = (struct dst_metrics *)DST_METRICS_PTR(dst);
373 struct rt6_info *rt = (struct rt6_info *)dst;
374 struct fib6_info *from;
375 struct inet6_dev *idev;
377 if (p != &dst_default_metrics && refcount_dec_and_test(&p->refcnt))
380 rt6_uncached_list_del(rt);
382 idev = rt->rt6i_idev;
384 rt->rt6i_idev = NULL;
388 from = xchg((__force struct fib6_info **)&rt->from, NULL);
389 fib6_info_release(from);
392 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
395 struct rt6_info *rt = (struct rt6_info *)dst;
396 struct inet6_dev *idev = rt->rt6i_idev;
397 struct net_device *loopback_dev =
398 dev_net(dev)->loopback_dev;
400 if (idev && idev->dev != loopback_dev) {
401 struct inet6_dev *loopback_idev = in6_dev_get(loopback_dev);
403 rt->rt6i_idev = loopback_idev;
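/* Expiry check on the dst itself; rt6_check_expired() below also considers
 * the fib6_info the dst was created from.
 */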
409 static bool __rt6_check_expired(const struct rt6_info *rt)
411 if (rt->rt6i_flags & RTF_EXPIRES)
412 return time_after(jiffies, rt->dst.expires);
417 static bool rt6_check_expired(const struct rt6_info *rt)
419 struct fib6_info *from;
421 from = rcu_dereference(rt->from);
423 if (rt->rt6i_flags & RTF_EXPIRES) {
424 if (time_after(jiffies, rt->dst.expires))
427 return rt->dst.obsolete != DST_OBSOLETE_FORCE_CHK ||
428 fib6_check_expired(from);
433 struct fib6_info *fib6_multipath_select(const struct net *net,
434 struct fib6_info *match,
435 struct flowi6 *fl6, int oif,
436 const struct sk_buff *skb,
439 struct fib6_info *sibling, *next_sibling;
441 /* We might have already computed the hash for ICMPv6 errors. In such
442 * a case it will always be non-zero. Otherwise now is the time to do it.
445 fl6->mp_hash = rt6_multipath_hash(net, fl6, skb, NULL);
447 if (fl6->mp_hash <= atomic_read(&match->fib6_nh.nh_upper_bound))
450 list_for_each_entry_safe(sibling, next_sibling, &match->fib6_siblings,
454 nh_upper_bound = atomic_read(&sibling->fib6_nh.nh_upper_bound);
455 if (fl6->mp_hash > nh_upper_bound)
457 if (rt6_score_route(sibling, oif, strict) < 0)
467 * Route lookup. rcu_read_lock() should be held.
470 static inline struct fib6_info *rt6_device_match(struct net *net,
471 struct fib6_info *rt,
472 const struct in6_addr *saddr,
476 struct fib6_info *sprt;
478 if (!oif && ipv6_addr_any(saddr) &&
479 !(rt->fib6_nh.nh_flags & RTNH_F_DEAD))
482 for (sprt = rt; sprt; sprt = rcu_dereference(sprt->fib6_next)) {
483 const struct net_device *dev = sprt->fib6_nh.nh_dev;
485 if (sprt->fib6_nh.nh_flags & RTNH_F_DEAD)
489 if (dev->ifindex == oif)
492 if (ipv6_chk_addr(net, saddr, dev,
493 flags & RT6_LOOKUP_F_IFACE))
498 if (oif && flags & RT6_LOOKUP_F_IFACE)
499 return net->ipv6.fib6_null_entry;
501 return rt->fib6_nh.nh_flags & RTNH_F_DEAD ? net->ipv6.fib6_null_entry : rt;
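/* Router reachability probing: rt6_probe() queues a work item that sends
 * the neighbour solicitation, see rt6_probe_deferred().
 */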
504 #ifdef CONFIG_IPV6_ROUTER_PREF
505 struct __rt6_probe_work {
506 struct work_struct work;
507 struct in6_addr target;
508 struct net_device *dev;
511 static void rt6_probe_deferred(struct work_struct *w)
513 struct in6_addr mcaddr;
514 struct __rt6_probe_work *work =
515 container_of(w, struct __rt6_probe_work, work);
517 addrconf_addr_solict_mult(&work->target, &mcaddr);
518 ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, 0);
523 static void rt6_probe(struct fib6_info *rt)
525 struct __rt6_probe_work *work = NULL;
526 const struct in6_addr *nh_gw;
527 unsigned long last_probe;
528 struct neighbour *neigh;
529 struct net_device *dev;
530 struct inet6_dev *idev;
533 * Okay, this does not seem to be appropriate
534 * for now, however, we need to check if it
535 * is really so; aka Router Reachability Probing.
537 * Router Reachability Probe MUST be rate-limited
538 * to no more than one per minute.
540 if (!rt || !(rt->fib6_flags & RTF_GATEWAY))
543 nh_gw = &rt->fib6_nh.nh_gw;
544 dev = rt->fib6_nh.nh_dev;
546 last_probe = READ_ONCE(rt->last_probe);
547 idev = __in6_dev_get(dev);
548 neigh = __ipv6_neigh_lookup_noref(dev, nh_gw);
550 if (neigh->nud_state & NUD_VALID)
553 write_lock(&neigh->lock);
554 if (!(neigh->nud_state & NUD_VALID) &&
556 neigh->updated + idev->cnf.rtr_probe_interval)) {
557 work = kmalloc(sizeof(*work), GFP_ATOMIC);
559 __neigh_set_probe_once(neigh);
561 write_unlock(&neigh->lock);
562 } else if (time_after(jiffies, last_probe +
563 idev->cnf.rtr_probe_interval)) {
564 work = kmalloc(sizeof(*work), GFP_ATOMIC);
567 if (!work || cmpxchg(&rt->last_probe,
568 last_probe, jiffies) != last_probe) {
571 INIT_WORK(&work->work, rt6_probe_deferred);
572 work->target = *nh_gw;
575 schedule_work(&work->work);
579 rcu_read_unlock_bh();
582 static inline void rt6_probe(struct fib6_info *rt)
588 * Default Router Selection (RFC 2461 6.3.6)
590 static inline int rt6_check_dev(struct fib6_info *rt, int oif)
592 const struct net_device *dev = rt->fib6_nh.nh_dev;
594 if (!oif || dev->ifindex == oif)
599 static inline enum rt6_nud_state rt6_check_neigh(struct fib6_info *rt)
601 enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
602 struct neighbour *neigh;
604 if (rt->fib6_flags & RTF_NONEXTHOP ||
605 !(rt->fib6_flags & RTF_GATEWAY))
606 return RT6_NUD_SUCCEED;
609 neigh = __ipv6_neigh_lookup_noref(rt->fib6_nh.nh_dev,
612 read_lock(&neigh->lock);
613 if (neigh->nud_state & NUD_VALID)
614 ret = RT6_NUD_SUCCEED;
615 #ifdef CONFIG_IPV6_ROUTER_PREF
616 else if (!(neigh->nud_state & NUD_FAILED))
617 ret = RT6_NUD_SUCCEED;
619 ret = RT6_NUD_FAIL_PROBE;
621 read_unlock(&neigh->lock);
623 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
624 RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
626 rcu_read_unlock_bh();
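/* Score a route for default router selection: outgoing interface match
 * plus (when configured) router preference and neighbour reachability.
 * Negative RT6_NUD_* values flag routes that should be skipped or
 * round-robined.
 */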
631 static int rt6_score_route(struct fib6_info *rt, int oif, int strict)
635 m = rt6_check_dev(rt, oif);
636 if (!m && (strict & RT6_LOOKUP_F_IFACE))
637 return RT6_NUD_FAIL_HARD;
638 #ifdef CONFIG_IPV6_ROUTER_PREF
639 m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->fib6_flags)) << 2;
641 if (strict & RT6_LOOKUP_F_REACHABLE) {
642 int n = rt6_check_neigh(rt);
649 /* called with rcu_read_lock held */
650 static inline bool fib6_ignore_linkdown(const struct fib6_info *f6i)
652 const struct net_device *dev = fib6_info_nh_dev(f6i);
656 const struct inet6_dev *idev = __in6_dev_get(dev);
658 rc = !!idev->cnf.ignore_routes_with_linkdown;
664 static struct fib6_info *find_match(struct fib6_info *rt, int oif, int strict,
665 int *mpri, struct fib6_info *match,
669 bool match_do_rr = false;
671 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
674 if (fib6_ignore_linkdown(rt) &&
675 rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
676 !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
679 if (fib6_check_expired(rt))
682 m = rt6_score_route(rt, oif, strict);
683 if (m == RT6_NUD_FAIL_DO_RR) {
685 m = 0; /* lowest valid score */
686 } else if (m == RT6_NUD_FAIL_HARD) {
690 if (strict & RT6_LOOKUP_F_REACHABLE)
693 /* note that m can be RT6_NUD_FAIL_PROBE at this point */
695 *do_rr = match_do_rr;
703 static struct fib6_info *find_rr_leaf(struct fib6_node *fn,
704 struct fib6_info *leaf,
705 struct fib6_info *rr_head,
706 u32 metric, int oif, int strict,
709 struct fib6_info *rt, *match, *cont;
714 for (rt = rr_head; rt; rt = rcu_dereference(rt->fib6_next)) {
715 if (rt->fib6_metric != metric) {
720 match = find_match(rt, oif, strict, &mpri, match, do_rr);
723 for (rt = leaf; rt && rt != rr_head;
724 rt = rcu_dereference(rt->fib6_next)) {
725 if (rt->fib6_metric != metric) {
730 match = find_match(rt, oif, strict, &mpri, match, do_rr);
736 for (rt = cont; rt; rt = rcu_dereference(rt->fib6_next))
737 match = find_match(rt, oif, strict, &mpri, match, do_rr);
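/* Choose the best route at this fib6 node.  When the entry at fn->rr_ptr
 * is no longer preferred, advance rr_ptr to round-robin among routes of
 * equal metric.
 */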
742 static struct fib6_info *rt6_select(struct net *net, struct fib6_node *fn,
745 struct fib6_info *leaf = rcu_dereference(fn->leaf);
746 struct fib6_info *match, *rt0;
750 if (!leaf || leaf == net->ipv6.fib6_null_entry)
751 return net->ipv6.fib6_null_entry;
753 rt0 = rcu_dereference(fn->rr_ptr);
757 /* Double check to make sure fn is not an intermediate node
758 * and fn->leaf does not point to its child's leaf
759 * (This might happen if all routes under fn are deleted from
760 * the tree and fib6_repair_tree() is called on the node.)
762 key_plen = rt0->fib6_dst.plen;
763 #ifdef CONFIG_IPV6_SUBTREES
764 if (rt0->fib6_src.plen)
765 key_plen = rt0->fib6_src.plen;
767 if (fn->fn_bit != key_plen)
768 return net->ipv6.fib6_null_entry;
770 match = find_rr_leaf(fn, leaf, rt0, rt0->fib6_metric, oif, strict,
774 struct fib6_info *next = rcu_dereference(rt0->fib6_next);
776 /* no entries matched; do round-robin */
777 if (!next || next->fib6_metric != rt0->fib6_metric)
781 spin_lock_bh(&leaf->fib6_table->tb6_lock);
782 /* make sure next is not being deleted from the tree */
784 rcu_assign_pointer(fn->rr_ptr, next);
785 spin_unlock_bh(&leaf->fib6_table->tb6_lock);
789 return match ? match : net->ipv6.fib6_null_entry;
792 static bool rt6_is_gw_or_nonexthop(const struct fib6_info *rt)
794 return (rt->fib6_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
797 #ifdef CONFIG_IPV6_ROUTE_INFO
798 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
799 const struct in6_addr *gwaddr)
801 struct net *net = dev_net(dev);
802 struct route_info *rinfo = (struct route_info *) opt;
803 struct in6_addr prefix_buf, *prefix;
805 unsigned long lifetime;
806 struct fib6_info *rt;
808 if (len < sizeof(struct route_info)) {
812 /* Sanity check for prefix_len and length */
813 if (rinfo->length > 3) {
815 } else if (rinfo->prefix_len > 128) {
817 } else if (rinfo->prefix_len > 64) {
818 if (rinfo->length < 2) {
821 } else if (rinfo->prefix_len > 0) {
822 if (rinfo->length < 1) {
827 pref = rinfo->route_pref;
828 if (pref == ICMPV6_ROUTER_PREF_INVALID)
831 lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
833 if (rinfo->length == 3)
834 prefix = (struct in6_addr *)rinfo->prefix;
836 /* this function is safe */
837 ipv6_addr_prefix(&prefix_buf,
838 (struct in6_addr *)rinfo->prefix,
840 prefix = &prefix_buf;
843 if (rinfo->prefix_len == 0)
844 rt = rt6_get_dflt_router(net, gwaddr, dev);
846 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
849 if (rt && !lifetime) {
855 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
858 rt->fib6_flags = RTF_ROUTEINFO |
859 (rt->fib6_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
862 if (!addrconf_finite_timeout(lifetime))
863 fib6_clean_expires(rt);
865 fib6_set_expires(rt, jiffies + HZ * lifetime);
867 fib6_info_release(rt);
874 * Misc support functions
877 /* called with rcu_lock held */
878 static struct net_device *ip6_rt_get_dev_rcu(struct fib6_info *rt)
880 struct net_device *dev = rt->fib6_nh.nh_dev;
882 if (rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) {
883 /* for copies of local routes, dst->dev needs to be the
884 * device itself if it is a master device, the master device if the
885 * device is enslaved, and the loopback device by default
887 if (netif_is_l3_slave(dev) &&
888 !rt6_need_strict(&rt->fib6_dst.addr))
889 dev = l3mdev_master_dev_rcu(dev);
890 else if (!netif_is_l3_master(dev))
891 dev = dev_net(dev)->loopback_dev;
892 /* the last case is netif_is_l3_master(dev) being true, in which
893 * case we want the returned dev to be dev itself
900 static const int fib6_prop[RTN_MAX + 1] = {
907 [RTN_BLACKHOLE] = -EINVAL,
908 [RTN_UNREACHABLE] = -EHOSTUNREACH,
909 [RTN_PROHIBIT] = -EACCES,
910 [RTN_THROW] = -EAGAIN,
912 [RTN_XRESOLVE] = -EINVAL,
915 static int ip6_rt_type_to_error(u8 fib6_type)
917 return fib6_prop[fib6_type];
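/* Map fib6_info properties to dst allocation flags (DST_NOCOUNT,
 * DST_NOPOLICY).
 */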
920 static unsigned short fib6_info_dst_flags(struct fib6_info *rt)
922 unsigned short flags = 0;
925 flags |= DST_NOCOUNT;
926 if (rt->dst_nopolicy)
927 flags |= DST_NOPOLICY;
934 static void ip6_rt_init_dst_reject(struct rt6_info *rt, struct fib6_info *ort)
936 rt->dst.error = ip6_rt_type_to_error(ort->fib6_type);
938 switch (ort->fib6_type) {
940 rt->dst.output = dst_discard_out;
941 rt->dst.input = dst_discard;
944 rt->dst.output = ip6_pkt_prohibit_out;
945 rt->dst.input = ip6_pkt_prohibit;
948 case RTN_UNREACHABLE:
950 rt->dst.output = ip6_pkt_discard_out;
951 rt->dst.input = ip6_pkt_discard;
956 static void ip6_rt_init_dst(struct rt6_info *rt, struct fib6_info *ort)
958 if (ort->fib6_flags & RTF_REJECT) {
959 ip6_rt_init_dst_reject(rt, ort);
964 rt->dst.output = ip6_output;
966 if (ort->fib6_type == RTN_LOCAL || ort->fib6_type == RTN_ANYCAST) {
967 rt->dst.input = ip6_input;
968 } else if (ipv6_addr_type(&ort->fib6_dst.addr) & IPV6_ADDR_MULTICAST) {
969 rt->dst.input = ip6_mc_input;
971 rt->dst.input = ip6_forward;
974 if (ort->fib6_nh.nh_lwtstate) {
975 rt->dst.lwtstate = lwtstate_get(ort->fib6_nh.nh_lwtstate);
976 lwtunnel_set_redirect(&rt->dst);
979 rt->dst.lastuse = jiffies;
982 /* Caller must already hold reference to @from */
983 static void rt6_set_from(struct rt6_info *rt, struct fib6_info *from)
985 rt->rt6i_flags &= ~RTF_EXPIRES;
986 rcu_assign_pointer(rt->from, from);
987 dst_init_metrics(&rt->dst, from->fib6_metrics->metrics, true);
988 if (from->fib6_metrics != &dst_default_metrics) {
989 rt->dst._metrics |= DST_METRICS_REFCOUNTED;
990 refcount_inc(&from->fib6_metrics->refcnt);
994 /* Caller must already hold reference to @ort */
995 static void ip6_rt_copy_init(struct rt6_info *rt, struct fib6_info *ort)
997 struct net_device *dev = fib6_info_nh_dev(ort);
999 ip6_rt_init_dst(rt, ort);
1001 rt->rt6i_dst = ort->fib6_dst;
1002 rt->rt6i_idev = dev ? in6_dev_get(dev) : NULL;
1003 rt->rt6i_gateway = ort->fib6_nh.nh_gw;
1004 rt->rt6i_flags = ort->fib6_flags;
1005 rt6_set_from(rt, ort);
1006 #ifdef CONFIG_IPV6_SUBTREES
1007 rt->rt6i_src = ort->fib6_src;
1009 rt->rt6i_prefsrc = ort->fib6_prefsrc;
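/* Walk back towards the tree root (descending into a source-address
 * subtree where one exists) until a node carrying route info is found.
 */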
1012 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
1013 struct in6_addr *saddr)
1015 struct fib6_node *pn, *sn;
1017 if (fn->fn_flags & RTN_TL_ROOT)
1019 pn = rcu_dereference(fn->parent);
1020 sn = FIB6_SUBTREE(pn);
1022 fn = fib6_node_lookup(sn, NULL, saddr);
1025 if (fn->fn_flags & RTN_RTINFO)
1030 static bool ip6_hold_safe(struct net *net, struct rt6_info **prt,
1033 struct rt6_info *rt = *prt;
1035 if (dst_hold_safe(&rt->dst))
1037 if (null_fallback) {
1038 rt = net->ipv6.ip6_null_entry;
1047 /* called with rcu_lock held */
1048 static struct rt6_info *ip6_create_rt_rcu(struct fib6_info *rt)
1050 unsigned short flags = fib6_info_dst_flags(rt);
1051 struct net_device *dev = rt->fib6_nh.nh_dev;
1052 struct rt6_info *nrt;
1054 if (!fib6_info_hold_safe(rt))
1057 nrt = ip6_dst_alloc(dev_net(dev), dev, flags);
1059 fib6_info_release(rt);
1063 ip6_rt_copy_init(nrt, rt);
1067 nrt = dev_net(dev)->ipv6.ip6_null_entry;
1068 dst_hold(&nrt->dst);
1072 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
1073 struct fib6_table *table,
1075 const struct sk_buff *skb,
1078 struct fib6_info *f6i;
1079 struct fib6_node *fn;
1080 struct rt6_info *rt;
1082 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1083 flags &= ~RT6_LOOKUP_F_IFACE;
1086 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1088 f6i = rcu_dereference(fn->leaf);
1090 f6i = net->ipv6.fib6_null_entry;
1092 f6i = rt6_device_match(net, f6i, &fl6->saddr,
1093 fl6->flowi6_oif, flags);
1094 if (f6i->fib6_nsiblings && fl6->flowi6_oif == 0)
1095 f6i = fib6_multipath_select(net, f6i, fl6,
1096 fl6->flowi6_oif, skb,
1099 if (f6i == net->ipv6.fib6_null_entry) {
1100 fn = fib6_backtrack(fn, &fl6->saddr);
1105 trace_fib6_table_lookup(net, f6i, table, fl6);
1107 /* Search through exception table */
1108 rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1110 if (ip6_hold_safe(net, &rt, true))
1111 dst_use_noref(&rt->dst, jiffies);
1112 } else if (f6i == net->ipv6.fib6_null_entry) {
1113 rt = net->ipv6.ip6_null_entry;
1116 rt = ip6_create_rt_rcu(f6i);
1124 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
1125 const struct sk_buff *skb, int flags)
1127 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_lookup);
1129 EXPORT_SYMBOL_GPL(ip6_route_lookup);
1131 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
1132 const struct in6_addr *saddr, int oif,
1133 const struct sk_buff *skb, int strict)
1135 struct flowi6 fl6 = {
1139 struct dst_entry *dst;
1140 int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
1143 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
1144 flags |= RT6_LOOKUP_F_HAS_SADDR;
1147 dst = fib6_rule_lookup(net, &fl6, skb, flags, ip6_pol_route_lookup);
1148 if (dst->error == 0)
1149 return (struct rt6_info *) dst;
1155 EXPORT_SYMBOL(rt6_lookup);
1157 /* ip6_ins_rt is called with FREE table->tb6_lock.
1158 * It takes a new route entry; if the addition fails for any reason, the
1159 * route is released.
1160 * Caller must hold dst before calling it.
1163 static int __ip6_ins_rt(struct fib6_info *rt, struct nl_info *info,
1164 struct netlink_ext_ack *extack)
1167 struct fib6_table *table;
1169 table = rt->fib6_table;
1170 spin_lock_bh(&table->tb6_lock);
1171 err = fib6_add(&table->tb6_root, rt, info, extack);
1172 spin_unlock_bh(&table->tb6_lock);
1177 int ip6_ins_rt(struct net *net, struct fib6_info *rt)
1179 struct nl_info info = { .nl_net = net, };
1181 return __ip6_ins_rt(rt, &info, NULL);
1184 static struct rt6_info *ip6_rt_cache_alloc(struct fib6_info *ort,
1185 const struct in6_addr *daddr,
1186 const struct in6_addr *saddr)
1188 struct net_device *dev;
1189 struct rt6_info *rt;
1195 if (!fib6_info_hold_safe(ort))
1198 dev = ip6_rt_get_dev_rcu(ort);
1199 rt = ip6_dst_alloc(dev_net(dev), dev, 0);
1201 fib6_info_release(ort);
1205 ip6_rt_copy_init(rt, ort);
1206 rt->rt6i_flags |= RTF_CACHE;
1207 rt->dst.flags |= DST_HOST;
1208 rt->rt6i_dst.addr = *daddr;
1209 rt->rt6i_dst.plen = 128;
1211 if (!rt6_is_gw_or_nonexthop(ort)) {
1212 if (ort->fib6_dst.plen != 128 &&
1213 ipv6_addr_equal(&ort->fib6_dst.addr, daddr))
1214 rt->rt6i_flags |= RTF_ANYCAST;
1215 #ifdef CONFIG_IPV6_SUBTREES
1216 if (rt->rt6i_src.plen && saddr) {
1217 rt->rt6i_src.addr = *saddr;
1218 rt->rt6i_src.plen = 128;
1226 static struct rt6_info *ip6_rt_pcpu_alloc(struct fib6_info *rt)
1228 unsigned short flags = fib6_info_dst_flags(rt);
1229 struct net_device *dev;
1230 struct rt6_info *pcpu_rt;
1232 if (!fib6_info_hold_safe(rt))
1236 dev = ip6_rt_get_dev_rcu(rt);
1237 pcpu_rt = ip6_dst_alloc(dev_net(dev), dev, flags);
1240 fib6_info_release(rt);
1243 ip6_rt_copy_init(pcpu_rt, rt);
1244 pcpu_rt->rt6i_flags |= RTF_PCPU;
1248 /* It should be called with rcu_read_lock() acquired */
1249 static struct rt6_info *rt6_get_pcpu_route(struct fib6_info *rt)
1251 struct rt6_info *pcpu_rt, **p;
1253 p = this_cpu_ptr(rt->rt6i_pcpu);
1257 ip6_hold_safe(NULL, &pcpu_rt, false);
1262 static struct rt6_info *rt6_make_pcpu_route(struct net *net,
1263 struct fib6_info *rt)
1265 struct rt6_info *pcpu_rt, *prev, **p;
1267 pcpu_rt = ip6_rt_pcpu_alloc(rt);
1269 dst_hold(&net->ipv6.ip6_null_entry->dst);
1270 return net->ipv6.ip6_null_entry;
1273 dst_hold(&pcpu_rt->dst);
1274 p = this_cpu_ptr(rt->rt6i_pcpu);
1275 prev = cmpxchg(p, NULL, pcpu_rt);
1278 if (rt->fib6_destroying) {
1279 struct fib6_info *from;
1281 from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
1282 fib6_info_release(from);
1288 /* exception hash table implementation */
1290 static DEFINE_SPINLOCK(rt6_exception_lock);
1292 /* Remove rt6_ex from hash table and free the memory
1293 * Caller must hold rt6_exception_lock
1295 static void rt6_remove_exception(struct rt6_exception_bucket *bucket,
1296 struct rt6_exception *rt6_ex)
1298 struct fib6_info *from;
1301 if (!bucket || !rt6_ex)
1304 net = dev_net(rt6_ex->rt6i->dst.dev);
1305 net->ipv6.rt6_stats->fib_rt_cache--;
1307 /* completely purge the exception to allow releasing the held resources:
1308 * some [sk] cache may keep the dst around for an unlimited time
1310 from = xchg((__force struct fib6_info **)&rt6_ex->rt6i->from, NULL);
1311 fib6_info_release(from);
1312 dst_dev_put(&rt6_ex->rt6i->dst);
1314 hlist_del_rcu(&rt6_ex->hlist);
1315 dst_release(&rt6_ex->rt6i->dst);
1316 kfree_rcu(rt6_ex, rcu);
1317 WARN_ON_ONCE(!bucket->depth);
1321 /* Remove oldest rt6_ex in bucket and free the memory
1322 * Caller must hold rt6_exception_lock
1324 static void rt6_exception_remove_oldest(struct rt6_exception_bucket *bucket)
1326 struct rt6_exception *rt6_ex, *oldest = NULL;
1331 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1332 if (!oldest || time_before(rt6_ex->stamp, oldest->stamp))
1335 rt6_remove_exception(bucket, oldest);
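/* Bucket index for a (daddr[, saddr]) exception route: siphash with a
 * once-initialized random key, folded to FIB6_EXCEPTION_BUCKET_SIZE_SHIFT
 * bits.
 */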
1338 static u32 rt6_exception_hash(const struct in6_addr *dst,
1339 const struct in6_addr *src)
1341 static siphash_key_t rt6_exception_key __read_mostly;
1343 struct in6_addr dst;
1344 struct in6_addr src;
1345 } __aligned(SIPHASH_ALIGNMENT) combined = {
1350 net_get_random_once(&rt6_exception_key, sizeof(rt6_exception_key));
1352 #ifdef CONFIG_IPV6_SUBTREES
1354 combined.src = *src;
1356 val = siphash(&combined, sizeof(combined), &rt6_exception_key);
1358 return hash_64(val, FIB6_EXCEPTION_BUCKET_SIZE_SHIFT);
1361 /* Helper function to find the cached rt in the hash table
1362 * and update bucket pointer to point to the bucket for this
1363 * (daddr, saddr) pair
1364 * Caller must hold rt6_exception_lock
1366 static struct rt6_exception *
1367 __rt6_find_exception_spinlock(struct rt6_exception_bucket **bucket,
1368 const struct in6_addr *daddr,
1369 const struct in6_addr *saddr)
1371 struct rt6_exception *rt6_ex;
1374 if (!(*bucket) || !daddr)
1377 hval = rt6_exception_hash(daddr, saddr);
1380 hlist_for_each_entry(rt6_ex, &(*bucket)->chain, hlist) {
1381 struct rt6_info *rt6 = rt6_ex->rt6i;
1382 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1384 #ifdef CONFIG_IPV6_SUBTREES
1385 if (matched && saddr)
1386 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
1394 /* Helper function to find the cached rt in the hash table
1395 * and update bucket pointer to point to the bucket for this
1396 * (daddr, saddr) pair
1397 * Caller must hold rcu_read_lock()
1399 static struct rt6_exception *
1400 __rt6_find_exception_rcu(struct rt6_exception_bucket **bucket,
1401 const struct in6_addr *daddr,
1402 const struct in6_addr *saddr)
1404 struct rt6_exception *rt6_ex;
1407 WARN_ON_ONCE(!rcu_read_lock_held());
1409 if (!(*bucket) || !daddr)
1412 hval = rt6_exception_hash(daddr, saddr);
1415 hlist_for_each_entry_rcu(rt6_ex, &(*bucket)->chain, hlist) {
1416 struct rt6_info *rt6 = rt6_ex->rt6i;
1417 bool matched = ipv6_addr_equal(daddr, &rt6->rt6i_dst.addr);
1419 #ifdef CONFIG_IPV6_SUBTREES
1420 if (matched && saddr)
1421 matched = ipv6_addr_equal(saddr, &rt6->rt6i_src.addr);
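/* Effective MTU of a fib6 route: the stored PMTU if set, otherwise the
 * device MTU, clamped to IP6_MAX_MTU and reduced by any lwtunnel encap
 * headroom.
 */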
1429 static unsigned int fib6_mtu(const struct fib6_info *rt)
1433 if (rt->fib6_pmtu) {
1434 mtu = rt->fib6_pmtu;
1436 struct net_device *dev = fib6_info_nh_dev(rt);
1437 struct inet6_dev *idev;
1440 idev = __in6_dev_get(dev);
1441 mtu = idev->cnf.mtu6;
1445 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1447 return mtu - lwtunnel_headroom(rt->fib6_nh.nh_lwtstate, mtu);
1450 static int rt6_insert_exception(struct rt6_info *nrt,
1451 struct fib6_info *ort)
1453 struct net *net = dev_net(nrt->dst.dev);
1454 struct rt6_exception_bucket *bucket;
1455 struct in6_addr *src_key = NULL;
1456 struct rt6_exception *rt6_ex;
1460 spin_lock_bh(&rt6_exception_lock);
1462 if (ort->exception_bucket_flushed) {
1467 bucket = rcu_dereference_protected(ort->rt6i_exception_bucket,
1468 lockdep_is_held(&rt6_exception_lock));
1470 bucket = kcalloc(FIB6_EXCEPTION_BUCKET_SIZE, sizeof(*bucket),
1476 rcu_assign_pointer(ort->rt6i_exception_bucket, bucket);
1479 #ifdef CONFIG_IPV6_SUBTREES
1480 /* rt6i_src.plen != 0 indicates ort is in subtree
1481 * and exception table is indexed by a hash of
1482 * both rt6i_dst and rt6i_src.
1483 * Otherwise, the exception table is indexed by
1484 * a hash of only rt6i_dst.
1486 if (ort->fib6_src.plen)
1487 src_key = &nrt->rt6i_src.addr;
1490 /* Update rt6i_prefsrc as it could be changed
1491 * in rt6_remove_prefsrc()
1493 nrt->rt6i_prefsrc = ort->fib6_prefsrc;
1494 /* rt6_mtu_change() might lower mtu on ort.
1495 * Only insert this exception route if its mtu
1496 * is less than ort's mtu value.
1498 if (dst_metric_raw(&nrt->dst, RTAX_MTU) >= fib6_mtu(ort)) {
1503 rt6_ex = __rt6_find_exception_spinlock(&bucket, &nrt->rt6i_dst.addr,
1506 rt6_remove_exception(bucket, rt6_ex);
1508 rt6_ex = kzalloc(sizeof(*rt6_ex), GFP_ATOMIC);
1514 rt6_ex->stamp = jiffies;
1515 hlist_add_head_rcu(&rt6_ex->hlist, &bucket->chain);
1517 net->ipv6.rt6_stats->fib_rt_cache++;
1519 /* Randomize max depth to avoid some side channel attacks. */
1520 max_depth = FIB6_MAX_DEPTH + prandom_u32_max(FIB6_MAX_DEPTH);
1521 while (bucket->depth > max_depth)
1522 rt6_exception_remove_oldest(bucket);
1525 spin_unlock_bh(&rt6_exception_lock);
1527 /* Update fn->fn_sernum to invalidate all cached dst */
1529 spin_lock_bh(&ort->fib6_table->tb6_lock);
1530 fib6_update_sernum(net, ort);
1531 spin_unlock_bh(&ort->fib6_table->tb6_lock);
1532 fib6_force_start_gc(net);
1538 void rt6_flush_exceptions(struct fib6_info *rt)
1540 struct rt6_exception_bucket *bucket;
1541 struct rt6_exception *rt6_ex;
1542 struct hlist_node *tmp;
1545 spin_lock_bh(&rt6_exception_lock);
1546 /* Prevent rt6_insert_exception() from recreating the bucket list */
1547 rt->exception_bucket_flushed = 1;
1549 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1550 lockdep_is_held(&rt6_exception_lock));
1554 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1555 hlist_for_each_entry_safe(rt6_ex, tmp, &bucket->chain, hlist)
1556 rt6_remove_exception(bucket, rt6_ex);
1557 WARN_ON_ONCE(bucket->depth);
1562 spin_unlock_bh(&rt6_exception_lock);
1565 /* Find cached rt in the hash table inside the passed-in rt
1566 * Caller has to hold rcu_read_lock()
1568 static struct rt6_info *rt6_find_cached_rt(struct fib6_info *rt,
1569 const struct in6_addr *daddr,
1570 const struct in6_addr *saddr)
1572 const struct in6_addr *src_key = NULL;
1573 struct rt6_exception_bucket *bucket;
1574 struct rt6_exception *rt6_ex;
1575 struct rt6_info *res = NULL;
1577 #ifdef CONFIG_IPV6_SUBTREES
1578 /* rt6i_src.plen != 0 indicates rt is in subtree
1579 * and exception table is indexed by a hash of
1580 * both rt6i_dst and rt6i_src.
1581 * However, the src addr used to create the hash
1582 * might not be exactly the passed in saddr which
1583 * is a /128 addr from the flow.
1584 * So we need to use f6i->fib6_src to redo lookup
1585 * if the passed in saddr does not find anything.
1586 * (See the logic in ip6_rt_cache_alloc() on how
1587 * rt->rt6i_src is updated.)
1589 if (rt->fib6_src.plen)
1593 bucket = rcu_dereference(rt->rt6i_exception_bucket);
1594 rt6_ex = __rt6_find_exception_rcu(&bucket, daddr, src_key);
1596 if (rt6_ex && !rt6_check_expired(rt6_ex->rt6i))
1599 #ifdef CONFIG_IPV6_SUBTREES
1600 /* Use fib6_src as src_key and redo lookup */
1601 if (!res && src_key && src_key != &rt->fib6_src.addr) {
1602 src_key = &rt->fib6_src.addr;
1610 /* Remove the passed in cached rt from the hash table that contains it */
1611 static int rt6_remove_exception_rt(struct rt6_info *rt)
1613 struct rt6_exception_bucket *bucket;
1614 struct in6_addr *src_key = NULL;
1615 struct rt6_exception *rt6_ex;
1616 struct fib6_info *from;
1619 from = rcu_dereference(rt->from);
1621 !(rt->rt6i_flags & RTF_CACHE))
1624 if (!rcu_access_pointer(from->rt6i_exception_bucket))
1627 spin_lock_bh(&rt6_exception_lock);
1628 bucket = rcu_dereference_protected(from->rt6i_exception_bucket,
1629 lockdep_is_held(&rt6_exception_lock));
1630 #ifdef CONFIG_IPV6_SUBTREES
1631 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1632 * and exception table is indexed by a hash of
1633 * both rt6i_dst and rt6i_src.
1634 * Otherwise, the exception table is indexed by
1635 * a hash of only rt6i_dst.
1637 if (from->fib6_src.plen)
1638 src_key = &rt->rt6i_src.addr;
1640 rt6_ex = __rt6_find_exception_spinlock(&bucket,
1644 rt6_remove_exception(bucket, rt6_ex);
1650 spin_unlock_bh(&rt6_exception_lock);
1654 /* Find rt6_ex which contains the passed-in rt cache and refresh its stamp */
1657 static void rt6_update_exception_stamp_rt(struct rt6_info *rt)
1659 struct rt6_exception_bucket *bucket;
1660 struct in6_addr *src_key = NULL;
1661 struct rt6_exception *rt6_ex;
1662 struct fib6_info *from;
1665 from = rcu_dereference(rt->from);
1666 if (!from || !(rt->rt6i_flags & RTF_CACHE))
1669 bucket = rcu_dereference(from->rt6i_exception_bucket);
1671 #ifdef CONFIG_IPV6_SUBTREES
1672 /* rt6i_src.plen != 0 indicates 'from' is in subtree
1673 * and exception table is indexed by a hash of
1674 * both rt6i_dst and rt6i_src.
1675 * Otherwise, the exception table is indexed by
1676 * a hash of only rt6i_dst.
1678 if (from->fib6_src.plen)
1679 src_key = &rt->rt6i_src.addr;
1681 rt6_ex = __rt6_find_exception_rcu(&bucket,
1685 rt6_ex->stamp = jiffies;
1691 static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
1693 struct rt6_exception_bucket *bucket;
1694 struct rt6_exception *rt6_ex;
1697 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1698 lockdep_is_held(&rt6_exception_lock));
1701 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1702 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1703 rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
1710 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
1711 struct rt6_info *rt, int mtu)
1713 /* If the new MTU is lower than the route PMTU, this new MTU will be the
1714 * lowest MTU in the path: always allow updating the route PMTU to
1715 * reflect PMTU decreases.
1717 * If the new MTU is higher, and the route PMTU is equal to the local
1718 * MTU, this means the old MTU is the lowest in the path, so allow
1719 * updating it: if other nodes now have lower MTUs, PMTU discovery will handle this. */
1723 if (dst_mtu(&rt->dst) >= mtu)
1726 if (dst_mtu(&rt->dst) == idev->cnf.mtu6)
1732 static void rt6_exceptions_update_pmtu(struct inet6_dev *idev,
1733 struct fib6_info *rt, int mtu)
1735 struct rt6_exception_bucket *bucket;
1736 struct rt6_exception *rt6_ex;
1739 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1740 lockdep_is_held(&rt6_exception_lock));
1745 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1746 hlist_for_each_entry(rt6_ex, &bucket->chain, hlist) {
1747 struct rt6_info *entry = rt6_ex->rt6i;
1749 /* For RTF_CACHE with rt6i_pmtu == 0 (i.e. a redirected
1750 * route), the metrics of its rt->from have already been updated. */
1753 if (dst_metric_raw(&entry->dst, RTAX_MTU) &&
1754 rt6_mtu_change_route_allowed(idev, entry, mtu))
1755 dst_metric_set(&entry->dst, RTAX_MTU, mtu);
1761 #define RTF_CACHE_GATEWAY (RTF_GATEWAY | RTF_CACHE)
1763 static void rt6_exceptions_clean_tohost(struct fib6_info *rt,
1764 struct in6_addr *gateway)
1766 struct rt6_exception_bucket *bucket;
1767 struct rt6_exception *rt6_ex;
1768 struct hlist_node *tmp;
1771 if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1774 spin_lock_bh(&rt6_exception_lock);
1775 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1776 lockdep_is_held(&rt6_exception_lock));
1779 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1780 hlist_for_each_entry_safe(rt6_ex, tmp,
1781 &bucket->chain, hlist) {
1782 struct rt6_info *entry = rt6_ex->rt6i;
1784 if ((entry->rt6i_flags & RTF_CACHE_GATEWAY) ==
1785 RTF_CACHE_GATEWAY &&
1786 ipv6_addr_equal(gateway,
1787 &entry->rt6i_gateway)) {
1788 rt6_remove_exception(bucket, rt6_ex);
1795 spin_unlock_bh(&rt6_exception_lock);
1798 static void rt6_age_examine_exception(struct rt6_exception_bucket *bucket,
1799 struct rt6_exception *rt6_ex,
1800 struct fib6_gc_args *gc_args,
1803 struct rt6_info *rt = rt6_ex->rt6i;
1805 /* we are pruning and obsoleting aged-out and non-gateway exceptions
1806 * even if others still hold references to them, so that on next
1807 * dst_check() such references can be dropped.
1808 * EXPIRES exceptions - e.g. pmtu-generated ones are pruned when
1809 * expired, independently from their aging, as per RFC 8201 section 4
1811 if (!(rt->rt6i_flags & RTF_EXPIRES)) {
1812 if (time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
1813 RT6_TRACE("aging clone %p\n", rt);
1814 rt6_remove_exception(bucket, rt6_ex);
1817 } else if (time_after(jiffies, rt->dst.expires)) {
1818 RT6_TRACE("purging expired route %p\n", rt);
1819 rt6_remove_exception(bucket, rt6_ex);
1823 if (rt->rt6i_flags & RTF_GATEWAY) {
1824 struct neighbour *neigh;
1825 __u8 neigh_flags = 0;
1827 neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
1829 neigh_flags = neigh->flags;
1831 if (!(neigh_flags & NTF_ROUTER)) {
1832 RT6_TRACE("purging route %p via non-router but gateway\n",
1834 rt6_remove_exception(bucket, rt6_ex);
1842 void rt6_age_exceptions(struct fib6_info *rt,
1843 struct fib6_gc_args *gc_args,
1846 struct rt6_exception_bucket *bucket;
1847 struct rt6_exception *rt6_ex;
1848 struct hlist_node *tmp;
1851 if (!rcu_access_pointer(rt->rt6i_exception_bucket))
1855 spin_lock(&rt6_exception_lock);
1856 bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
1857 lockdep_is_held(&rt6_exception_lock));
1860 for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
1861 hlist_for_each_entry_safe(rt6_ex, tmp,
1862 &bucket->chain, hlist) {
1863 rt6_age_examine_exception(bucket, rt6_ex,
1869 spin_unlock(&rt6_exception_lock);
1870 rcu_read_unlock_bh();
1873 /* must be called with rcu lock held */
1874 struct fib6_info *fib6_table_lookup(struct net *net, struct fib6_table *table,
1875 int oif, struct flowi6 *fl6, int strict)
1877 struct fib6_node *fn, *saved_fn;
1878 struct fib6_info *f6i;
1880 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1883 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1887 f6i = rt6_select(net, fn, oif, strict);
1888 if (f6i == net->ipv6.fib6_null_entry) {
1889 fn = fib6_backtrack(fn, &fl6->saddr);
1891 goto redo_rt6_select;
1892 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1893 /* also consider unreachable route */
1894 strict &= ~RT6_LOOKUP_F_REACHABLE;
1896 goto redo_rt6_select;
1900 trace_fib6_table_lookup(net, f6i, table, fl6);
1905 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1906 int oif, struct flowi6 *fl6,
1907 const struct sk_buff *skb, int flags)
1909 struct fib6_info *f6i;
1910 struct rt6_info *rt;
1913 strict |= flags & RT6_LOOKUP_F_IFACE;
1914 strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1915 if (net->ipv6.devconf_all->forwarding == 0)
1916 strict |= RT6_LOOKUP_F_REACHABLE;
1920 f6i = fib6_table_lookup(net, table, oif, fl6, strict);
1921 if (f6i->fib6_nsiblings)
1922 f6i = fib6_multipath_select(net, f6i, fl6, oif, skb, strict);
1924 if (f6i == net->ipv6.fib6_null_entry) {
1925 rt = net->ipv6.ip6_null_entry;
1931 /*Search through exception table */
1932 rt = rt6_find_cached_rt(f6i, &fl6->daddr, &fl6->saddr);
1934 if (ip6_hold_safe(net, &rt, true))
1935 dst_use_noref(&rt->dst, jiffies);
1939 } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1940 !(f6i->fib6_flags & RTF_GATEWAY))) {
1941 /* Create a RTF_CACHE clone which will not be
1942 * owned by the fib6 tree. It is for the special case where
1943 * the daddr in the skb during the neighbor look-up is different
1944 * from the fl6->daddr used to look-up route here.
1946 struct rt6_info *uncached_rt;
1948 uncached_rt = ip6_rt_cache_alloc(f6i, &fl6->daddr, NULL);
1953 /* Uncached_rt's refcnt is taken during ip6_rt_cache_alloc()
1954 * No need for another dst_hold()
1956 rt6_uncached_list_add(uncached_rt);
1957 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
1959 uncached_rt = net->ipv6.ip6_null_entry;
1960 dst_hold(&uncached_rt->dst);
1965 /* Get a percpu copy */
1967 struct rt6_info *pcpu_rt;
1970 pcpu_rt = rt6_get_pcpu_route(f6i);
1973 pcpu_rt = rt6_make_pcpu_route(net, f6i);
1981 EXPORT_SYMBOL_GPL(ip6_pol_route);
1983 static struct rt6_info *ip6_pol_route_input(struct net *net,
1984 struct fib6_table *table,
1986 const struct sk_buff *skb,
1989 return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, skb, flags);
1992 struct dst_entry *ip6_route_input_lookup(struct net *net,
1993 struct net_device *dev,
1995 const struct sk_buff *skb,
1998 if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1999 flags |= RT6_LOOKUP_F_IFACE;
2001 return fib6_rule_lookup(net, fl6, skb, flags, ip6_pol_route_input);
2003 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
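/* L3 keys for multipath hashing.  For ICMPv6 errors, hash the embedded
 * (inner) packet so the error follows the same path as the flow it
 * reports on.
 */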
2005 static void ip6_multipath_l3_keys(const struct sk_buff *skb,
2006 struct flow_keys *keys,
2007 struct flow_keys *flkeys)
2009 const struct ipv6hdr *outer_iph = ipv6_hdr(skb);
2010 const struct ipv6hdr *key_iph = outer_iph;
2011 struct flow_keys *_flkeys = flkeys;
2012 const struct ipv6hdr *inner_iph;
2013 const struct icmp6hdr *icmph;
2014 struct ipv6hdr _inner_iph;
2015 struct icmp6hdr _icmph;
2017 if (likely(outer_iph->nexthdr != IPPROTO_ICMPV6))
2020 icmph = skb_header_pointer(skb, skb_transport_offset(skb),
2021 sizeof(_icmph), &_icmph);
2025 if (icmph->icmp6_type != ICMPV6_DEST_UNREACH &&
2026 icmph->icmp6_type != ICMPV6_PKT_TOOBIG &&
2027 icmph->icmp6_type != ICMPV6_TIME_EXCEED &&
2028 icmph->icmp6_type != ICMPV6_PARAMPROB)
2031 inner_iph = skb_header_pointer(skb,
2032 skb_transport_offset(skb) + sizeof(*icmph),
2033 sizeof(_inner_iph), &_inner_iph);
2037 key_iph = inner_iph;
2041 keys->addrs.v6addrs.src = _flkeys->addrs.v6addrs.src;
2042 keys->addrs.v6addrs.dst = _flkeys->addrs.v6addrs.dst;
2043 keys->tags.flow_label = _flkeys->tags.flow_label;
2044 keys->basic.ip_proto = _flkeys->basic.ip_proto;
2046 keys->addrs.v6addrs.src = key_iph->saddr;
2047 keys->addrs.v6addrs.dst = key_iph->daddr;
2048 keys->tags.flow_label = ip6_flowlabel(key_iph);
2049 keys->basic.ip_proto = key_iph->nexthdr;
2053 /* if skb is set it will be used and fl6 can be NULL */
2054 u32 rt6_multipath_hash(const struct net *net, const struct flowi6 *fl6,
2055 const struct sk_buff *skb, struct flow_keys *flkeys)
2057 struct flow_keys hash_keys;
2060 switch (ip6_multipath_hash_policy(net)) {
2062 memset(&hash_keys, 0, sizeof(hash_keys));
2063 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2065 ip6_multipath_l3_keys(skb, &hash_keys, flkeys);
2067 hash_keys.addrs.v6addrs.src = fl6->saddr;
2068 hash_keys.addrs.v6addrs.dst = fl6->daddr;
2069 hash_keys.tags.flow_label = (__force u32)flowi6_get_flowlabel(fl6);
2070 hash_keys.basic.ip_proto = fl6->flowi6_proto;
2075 unsigned int flag = FLOW_DISSECTOR_F_STOP_AT_ENCAP;
2076 struct flow_keys keys;
2078 /* short-circuit if we already have L4 hash present */
2080 return skb_get_hash_raw(skb) >> 1;
2082 memset(&hash_keys, 0, sizeof(hash_keys));
2085 skb_flow_dissect_flow_keys(skb, &keys, flag);
2088 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2089 hash_keys.addrs.v6addrs.src = flkeys->addrs.v6addrs.src;
2090 hash_keys.addrs.v6addrs.dst = flkeys->addrs.v6addrs.dst;
2091 hash_keys.ports.src = flkeys->ports.src;
2092 hash_keys.ports.dst = flkeys->ports.dst;
2093 hash_keys.basic.ip_proto = flkeys->basic.ip_proto;
2095 memset(&hash_keys, 0, sizeof(hash_keys));
2096 hash_keys.control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
2097 hash_keys.addrs.v6addrs.src = fl6->saddr;
2098 hash_keys.addrs.v6addrs.dst = fl6->daddr;
2099 hash_keys.ports.src = fl6->fl6_sport;
2100 hash_keys.ports.dst = fl6->fl6_dport;
2101 hash_keys.basic.ip_proto = fl6->flowi6_proto;
2105 mhash = flow_hash_from_keys(&hash_keys);
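/* Route an incoming packet: build a flowi6 from the IPv6 header (plus any
 * tunnel metadata and dissected flow keys) and attach the resulting dst
 * to the skb.
 */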
2110 void ip6_route_input(struct sk_buff *skb)
2112 const struct ipv6hdr *iph = ipv6_hdr(skb);
2113 struct net *net = dev_net(skb->dev);
2114 int flags = RT6_LOOKUP_F_HAS_SADDR;
2115 struct ip_tunnel_info *tun_info;
2116 struct flowi6 fl6 = {
2117 .flowi6_iif = skb->dev->ifindex,
2118 .daddr = iph->daddr,
2119 .saddr = iph->saddr,
2120 .flowlabel = ip6_flowinfo(iph),
2121 .flowi6_mark = skb->mark,
2122 .flowi6_proto = iph->nexthdr,
2124 struct flow_keys *flkeys = NULL, _flkeys;
2126 tun_info = skb_tunnel_info(skb);
2127 if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
2128 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
2130 if (fib6_rules_early_flow_dissect(net, skb, &fl6, &_flkeys))
2133 if (unlikely(fl6.flowi6_proto == IPPROTO_ICMPV6))
2134 fl6.mp_hash = rt6_multipath_hash(net, &fl6, skb, flkeys);
2137 ip6_route_input_lookup(net, skb->dev, &fl6, skb, flags));
2140 static struct rt6_info *ip6_pol_route_output(struct net *net,
2141 struct fib6_table *table,
2143 const struct sk_buff *skb,
2146 return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, skb, flags);
2149 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
2150 struct flowi6 *fl6, int flags)
2154 if (rt6_need_strict(&fl6->daddr)) {
2155 struct dst_entry *dst;
2157 dst = l3mdev_link_scope_lookup(net, fl6);
2162 fl6->flowi6_iif = LOOPBACK_IFINDEX;
2164 any_src = ipv6_addr_any(&fl6->saddr);
2165 if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
2166 (fl6->flowi6_oif && any_src))
2167 flags |= RT6_LOOKUP_F_IFACE;
2170 flags |= RT6_LOOKUP_F_HAS_SADDR;
2172 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
2174 return fib6_rule_lookup(net, fl6, NULL, flags, ip6_pol_route_output);
2176 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
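/* Clone @dst_orig into a blackhole entry: same addresses and metrics, but
 * input/output handlers that simply discard packets.
 */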
2178 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
2180 struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
2181 struct net_device *loopback_dev = net->loopback_dev;
2182 struct dst_entry *new = NULL;
2184 rt = dst_alloc(&ip6_dst_blackhole_ops, loopback_dev, 1,
2185 DST_OBSOLETE_DEAD, 0);
2188 atomic_inc(&net->ipv6.rt6_stats->fib_rt_alloc);
2192 new->input = dst_discard;
2193 new->output = dst_discard_out;
2195 dst_copy_metrics(new, &ort->dst);
2197 rt->rt6i_idev = in6_dev_get(loopback_dev);
2198 rt->rt6i_gateway = ort->rt6i_gateway;
2199 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
2201 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
2202 #ifdef CONFIG_IPV6_SUBTREES
2203 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
2207 dst_release(dst_orig);
2208 return new ? new : ERR_PTR(-ENOMEM);
2212 * Destination cache support functions
2215 static bool fib6_check(struct fib6_info *f6i, u32 cookie)
2219 if (!fib6_get_cookie_safe(f6i, &rt_cookie) || rt_cookie != cookie)
2222 if (fib6_check_expired(f6i))
2228 static struct dst_entry *rt6_check(struct rt6_info *rt,
2229 struct fib6_info *from,
2234 if (!from || !fib6_get_cookie_safe(from, &rt_cookie) ||
2235 rt_cookie != cookie)
2238 if (rt6_check_expired(rt))
2244 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt,
2245 struct fib6_info *from,
2248 if (!__rt6_check_expired(rt) &&
2249 rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
2250 fib6_check(from, cookie))
2256 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
2258 struct dst_entry *dst_ret;
2259 struct fib6_info *from;
2260 struct rt6_info *rt;
2262 rt = container_of(dst, struct rt6_info, dst);
2266 /* All IPV6 dsts are created with ->obsolete set to the value
2267 * DST_OBSOLETE_FORCE_CHK which forces validation calls down
2268 * into this function always.
2271 from = rcu_dereference(rt->from);
2273 if (from && (rt->rt6i_flags & RTF_PCPU ||
2274 unlikely(!list_empty(&rt->rt6i_uncached))))
2275 dst_ret = rt6_dst_from_check(rt, from, cookie);
2277 dst_ret = rt6_check(rt, from, cookie);
2284 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
2286 struct rt6_info *rt = (struct rt6_info *) dst;
2289 if (rt->rt6i_flags & RTF_CACHE) {
2291 if (rt6_check_expired(rt)) {
2292 rt6_remove_exception_rt(rt);
2304 static void ip6_link_failure(struct sk_buff *skb)
2306 struct rt6_info *rt;
2308 icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
2310 rt = (struct rt6_info *) skb_dst(skb);
2313 if (rt->rt6i_flags & RTF_CACHE) {
2314 rt6_remove_exception_rt(rt);
2316 struct fib6_info *from;
2317 struct fib6_node *fn;
2319 from = rcu_dereference(rt->from);
2321 fn = rcu_dereference(from->fib6_node);
2322 if (fn && (rt->rt6i_flags & RTF_DEFAULT))
2323 WRITE_ONCE(fn->fn_sernum, -1);
2330 static void rt6_update_expires(struct rt6_info *rt0, int timeout)
2332 if (!(rt0->rt6i_flags & RTF_EXPIRES)) {
2333 struct fib6_info *from;
2336 from = rcu_dereference(rt0->from);
2338 rt0->dst.expires = from->expires;
2342 dst_set_expires(&rt0->dst, timeout);
2343 rt0->rt6i_flags |= RTF_EXPIRES;
2346 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
2348 struct net *net = dev_net(rt->dst.dev);
2350 dst_metric_set(&rt->dst, RTAX_MTU, mtu);
2351 rt->rt6i_flags |= RTF_MODIFIED;
2352 rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
2355 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
2360 from_set = !!rcu_dereference(rt->from);
2363 return !(rt->rt6i_flags & RTF_CACHE) &&
2364 (rt->rt6i_flags & RTF_PCPU || from_set);
2367 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
2368 const struct ipv6hdr *iph, u32 mtu,
2371 const struct in6_addr *daddr, *saddr;
2372 struct rt6_info *rt6 = (struct rt6_info *)dst;
2374 /* Note: do *NOT* check dst_metric_locked(dst, RTAX_MTU)
2375 * IPv6 pmtu discovery isn't optional, so 'mtu lock' cannot disable it.
2376 * [see also comment in rt6_mtu_change_route()]
2380 daddr = &iph->daddr;
2381 saddr = &iph->saddr;
2383 daddr = &sk->sk_v6_daddr;
2384 saddr = &inet6_sk(sk)->saddr;
2391 dst_confirm_neigh(dst, daddr);
2393 mtu = max_t(u32, mtu, IPV6_MIN_MTU);
2394 if (mtu >= dst_mtu(dst))
2397 if (!rt6_cache_allowed_for_pmtu(rt6)) {
2398 rt6_do_update_pmtu(rt6, mtu);
2399 /* update rt6_ex->stamp for cache */
2400 if (rt6->rt6i_flags & RTF_CACHE)
2401 rt6_update_exception_stamp_rt(rt6);
2403 struct fib6_info *from;
2404 struct rt6_info *nrt6;
2407 from = rcu_dereference(rt6->from);
2412 nrt6 = ip6_rt_cache_alloc(from, daddr, saddr);
2414 rt6_do_update_pmtu(nrt6, mtu);
2415 if (rt6_insert_exception(nrt6, from))
2416 dst_release_immediate(&nrt6->dst);
2422 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
2423 struct sk_buff *skb, u32 mtu,
2426 __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu,
2430 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
2431 int oif, u32 mark, kuid_t uid)
2433 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2434 struct dst_entry *dst;
2437 memset(&fl6, 0, sizeof(fl6));
2438 fl6.flowi6_oif = oif;
2439 fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
2440 fl6.daddr = iph->daddr;
2441 fl6.saddr = iph->saddr;
2442 fl6.flowlabel = ip6_flowinfo(iph);
2443 fl6.flowi6_uid = uid;
2445 dst = ip6_route_output(net, NULL, &fl6);
2447 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu), true);
2450 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
2452 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
2454 int oif = sk->sk_bound_dev_if;
2455 struct dst_entry *dst;
2457 if (!oif && skb->dev)
2458 oif = l3mdev_master_ifindex(skb->dev);
2460 ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark, sk->sk_uid);
2462 dst = __sk_dst_get(sk);
2463 if (!dst || !dst->obsolete ||
2464 dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
2468 if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
2469 ip6_datagram_dst_update(sk, false);
2472 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
2474 void ip6_sk_dst_store_flow(struct sock *sk, struct dst_entry *dst,
2475 const struct flowi6 *fl6)
2477 #ifdef CONFIG_IPV6_SUBTREES
2478 struct ipv6_pinfo *np = inet6_sk(sk);
2481 ip6_dst_store(sk, dst,
2482 ipv6_addr_equal(&fl6->daddr, &sk->sk_v6_daddr) ?
2483 &sk->sk_v6_daddr : NULL,
2484 #ifdef CONFIG_IPV6_SUBTREES
2485 ipv6_addr_equal(&fl6->saddr, &np->saddr) ?
2491 /* Handle redirects */
2492 struct ip6rd_flowi {
2494 struct in6_addr gateway;
2497 static struct rt6_info *__ip6_route_redirect(struct net *net,
2498 struct fib6_table *table,
2500 const struct sk_buff *skb,
2503 struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
2504 struct rt6_info *ret = NULL, *rt_cache;
2505 struct fib6_info *rt;
2506 struct fib6_node *fn;
2508 /* l3mdev_update_flow overrides oif if the device is enslaved; in
2509 * this case we must match on the real ingress device, so reset it
2511 if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
2512 fl6->flowi6_oif = skb->dev->ifindex;
2514 /* Get the "current" route for this destination and
2515 * check if the redirect has come from an appropriate router.
2517 * RFC 4861 specifies that redirects should only be
2518 * accepted if they come from the nexthop to the target.
2519 * Due to the way the routes are chosen, this notion
2520 * is a bit fuzzy and one might need to check all possible
2525 fn = fib6_node_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
2527 for_each_fib6_node_rt_rcu(fn) {
2528 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
2530 if (fib6_check_expired(rt))
2532 if (rt->fib6_flags & RTF_REJECT)
2534 if (!(rt->fib6_flags & RTF_GATEWAY))
2536 if (fl6->flowi6_oif != rt->fib6_nh.nh_dev->ifindex)
2538 /* rt_cache's gateway might be different from its 'parent'
2539 * in the case of an ip redirect.
2540 * So we keep searching in the exception table if the gateway
2543 if (!ipv6_addr_equal(&rdfl->gateway, &rt->fib6_nh.nh_gw)) {
2544 rt_cache = rt6_find_cached_rt(rt,
2548 ipv6_addr_equal(&rdfl->gateway,
2549 &rt_cache->rt6i_gateway)) {
2559 rt = net->ipv6.fib6_null_entry;
2560 else if (rt->fib6_flags & RTF_REJECT) {
2561 ret = net->ipv6.ip6_null_entry;
2565 if (rt == net->ipv6.fib6_null_entry) {
2566 fn = fib6_backtrack(fn, &fl6->saddr);
2573 ip6_hold_safe(net, &ret, true);
2575 ret = ip6_create_rt_rcu(rt);
2579 trace_fib6_table_lookup(net, rt, table, fl6);
2583 static struct dst_entry *ip6_route_redirect(struct net *net,
2584 const struct flowi6 *fl6,
2585 const struct sk_buff *skb,
2586 const struct in6_addr *gateway)
2588 int flags = RT6_LOOKUP_F_HAS_SADDR;
2589 struct ip6rd_flowi rdfl;
2592 rdfl.gateway = *gateway;
2594 return fib6_rule_lookup(net, &rdfl.fl6, skb,
2595 flags, __ip6_route_redirect);
2598 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark,
2601 const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
2602 struct dst_entry *dst;
2605 memset(&fl6, 0, sizeof(fl6));
2606 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2607 fl6.flowi6_oif = oif;
2608 fl6.flowi6_mark = mark;
2609 fl6.daddr = iph->daddr;
2610 fl6.saddr = iph->saddr;
2611 fl6.flowlabel = ip6_flowinfo(iph);
2612 fl6.flowi6_uid = uid;
2614 dst = ip6_route_redirect(net, &fl6, skb, &ipv6_hdr(skb)->saddr);
2615 rt6_do_redirect(dst, NULL, skb);
2618 EXPORT_SYMBOL_GPL(ip6_redirect);
2620 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
2623 const struct ipv6hdr *iph = ipv6_hdr(skb);
2624 const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
2625 struct dst_entry *dst;
2628 memset(&fl6, 0, sizeof(fl6));
2629 fl6.flowi6_iif = LOOPBACK_IFINDEX;
2630 fl6.flowi6_oif = oif;
2631 fl6.flowi6_mark = mark;
2632 fl6.daddr = msg->dest;
2633 fl6.saddr = iph->daddr;
2634 fl6.flowi6_uid = sock_net_uid(net, NULL);
2636 dst = ip6_route_redirect(net, &fl6, skb, &iph->saddr);
2637 rt6_do_redirect(dst, NULL, skb);
2641 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
2643 ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark,
2646 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
2648 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
2650 struct net_device *dev = dst->dev;
2651 unsigned int mtu = dst_mtu(dst);
2652 struct net *net = dev_net(dev);
2654 mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
2656 if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
2657 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
2660 * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
2661 * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
2662 * IPV6_MAXPLEN is also valid and means: "any MSS,
2663 * rely only on pmtu discovery"
2665 if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
2670 static unsigned int ip6_mtu(const struct dst_entry *dst)
2672 struct inet6_dev *idev;
2675 mtu = dst_metric_raw(dst, RTAX_MTU);
2682 idev = __in6_dev_get(dst->dev);
2684 mtu = idev->cnf.mtu6;
2688 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2690 return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
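/*
 * Editor's note: a tiny stand-alone model (an assumption, not kernel code) of
 * the advertised-MSS arithmetic in ip6_default_advmss() above: subtract the
 * fixed 40-byte IPv6 header and 20-byte TCP header, enforce the
 * ip6_rt_min_advmss floor, and never advertise more than the largest
 * non-jumbogram payload allows.
 */
#include <stdio.h>

#define EX_IPV6_HDRLEN	40	/* sizeof(struct ipv6hdr) */
#define EX_TCP_HDRLEN	20	/* sizeof(struct tcphdr)  */
#define EX_IPV6_MAXPLEN	65535	/* IPV6_MAXPLEN           */

static unsigned int example_advmss(unsigned int mtu, unsigned int min_advmss)
{
	unsigned int mss = mtu - EX_IPV6_HDRLEN - EX_TCP_HDRLEN;

	if (mss < min_advmss)
		mss = min_advmss;
	if (mss > EX_IPV6_MAXPLEN - EX_TCP_HDRLEN)
		mss = EX_IPV6_MAXPLEN - EX_TCP_HDRLEN;
	return mss;
}

int main(void)
{
	/* A standard 1500-byte Ethernet MTU advertises a 1440-byte MSS;
	 * 1220 is used here as an example ip6_rt_min_advmss floor.
	 */
	printf("%u\n", example_advmss(1500, 1220));
	return 0;
}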
2694 * 1. mtu on route is locked - use it
2695 * 2. mtu from nexthop exception
2696 * 3. mtu from egress device
2698 * based on ip6_dst_mtu_forward and exception logic of
2699 * rt6_find_cached_rt; called with rcu_read_lock
2701 u32 ip6_mtu_from_fib6(struct fib6_info *f6i, struct in6_addr *daddr,
2702 struct in6_addr *saddr)
2704 struct inet6_dev *idev;
2705 struct rt6_info *rt;
2708 if (unlikely(fib6_metric_locked(f6i, RTAX_MTU))) {
2709 mtu = f6i->fib6_pmtu;
2714 rt = rt6_find_cached_rt(f6i, daddr, saddr);
2716 mtu = dst_metric_raw(&rt->dst, RTAX_MTU);
2718 struct net_device *dev = fib6_info_nh_dev(f6i);
2721 idev = __in6_dev_get(dev);
2722 if (idev && idev->cnf.mtu6 > mtu)
2723 mtu = idev->cnf.mtu6;
2726 mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
2728 return mtu - lwtunnel_headroom(fib6_info_nh_lwt(f6i), mtu);
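/*
 * Editor's note: a compact user-space model (an assumption for illustration)
 * of the precedence implemented by ip6_mtu_from_fib6() above: a locked route
 * MTU always wins, then an MTU learned on a cached exception, then the egress
 * device MTU, clamped to IP6_MAX_MTU and reduced by any lwtunnel headroom.
 */
#include <stdio.h>

#define EX_IP6_MAX_MTU	(0xFFFF + 40)	/* 64 KiB payload + IPv6 header */

static unsigned int example_mtu_from_fib6(unsigned int locked_mtu,
					  unsigned int exception_mtu,
					  unsigned int device_mtu,
					  unsigned int lwt_headroom)
{
	unsigned int mtu;

	if (locked_mtu)			/* 1. metric locked on the route */
		mtu = locked_mtu;
	else if (exception_mtu)		/* 2. nexthop exception (PMTU)   */
		mtu = exception_mtu;
	else				/* 3. egress device              */
		mtu = device_mtu;

	if (mtu > EX_IP6_MAX_MTU)
		mtu = EX_IP6_MAX_MTU;
	return mtu - lwt_headroom;
}

int main(void)
{
	/* A 1400-byte PMTU exception overrides a 9000-byte device MTU. */
	printf("%u\n", example_mtu_from_fib6(0, 1400, 9000, 0));
	return 0;
}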
2731 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
2734 struct dst_entry *dst;
2735 struct rt6_info *rt;
2736 struct inet6_dev *idev = in6_dev_get(dev);
2737 struct net *net = dev_net(dev);
2739 if (unlikely(!idev))
2740 return ERR_PTR(-ENODEV);
2742 rt = ip6_dst_alloc(net, dev, 0);
2743 if (unlikely(!rt)) {
2745 dst = ERR_PTR(-ENOMEM);
2749 rt->dst.flags |= DST_HOST;
2750 rt->dst.input = ip6_input;
2751 rt->dst.output = ip6_output;
2752 rt->rt6i_gateway = fl6->daddr;
2753 rt->rt6i_dst.addr = fl6->daddr;
2754 rt->rt6i_dst.plen = 128;
2755 rt->rt6i_idev = idev;
2756 dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
2758 /* Add this dst into uncached_list so that rt6_disable_ip() can
2759 * do proper release of the net_device
2761 rt6_uncached_list_add(rt);
2762 atomic_inc(&net->ipv6.rt6_stats->fib_rt_uncache);
2764 dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
2770 static int ip6_dst_gc(struct dst_ops *ops)
2772 struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
2773 int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
2774 int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
2775 int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
2776 int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
2777 unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
2780 entries = dst_entries_get_fast(ops);
2781 if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
2782 entries <= rt_max_size)
2785 net->ipv6.ip6_rt_gc_expire++;
2786 fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
2787 entries = dst_entries_get_slow(ops);
2788 if (entries < ops->gc_thresh)
2789 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
2791 net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
2792 return entries > rt_max_size;
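/*
 * Editor's note: a small user-space model (an assumption) of the gc_expire
 * feedback loop in ip6_dst_gc() above. Every forced pass raises the expiry
 * budget by one; when the table drops back under gc_thresh the budget is
 * reset to half of gc_timeout, and in either case it then decays by
 * "expire -= expire >> elasticity", so sustained pressure keeps GC aggressive
 * without letting the budget grow without bound. The numbers are examples.
 */
#include <stdio.h>

int main(void)
{
	unsigned long expire = 0;
	const int gc_timeout = 60;	/* seconds, example value       */
	const int elasticity = 9;	/* default ip6_rt_gc_elasticity */
	int entries = 3500, gc_thresh = 1024, round;

	for (round = 0; round < 3; round++) {
		expire++;					/* one more forced pass      */
		entries = entries > 1500 ? entries - 1500 : 400; /* pretend GC freed entries */
		if (entries < gc_thresh)
			expire = gc_timeout >> 1;		/* back off to half timeout  */
		expire -= expire >> elasticity;			/* geometric decay each pass */
		printf("round %d: entries=%d expire=%lu\n", round, entries, expire);
	}
	return 0;
}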
2795 static int ip6_convert_metrics(struct net *net, struct fib6_info *rt,
2796 struct fib6_config *cfg)
2798 struct dst_metrics *p;
2803 p = kzalloc(sizeof(*rt->fib6_metrics), GFP_KERNEL);
2807 refcount_set(&p->refcnt, 1);
2808 rt->fib6_metrics = p;
2810 return ip_metrics_convert(net, cfg->fc_mx, cfg->fc_mx_len, p->metrics);
2813 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
2814 struct fib6_config *cfg,
2815 const struct in6_addr *gw_addr,
2816 u32 tbid, int flags)
2818 struct flowi6 fl6 = {
2819 .flowi6_oif = cfg->fc_ifindex,
2821 .saddr = cfg->fc_prefsrc,
2823 struct fib6_table *table;
2824 struct rt6_info *rt;
2826 table = fib6_get_table(net, tbid);
2830 if (!ipv6_addr_any(&cfg->fc_prefsrc))
2831 flags |= RT6_LOOKUP_F_HAS_SADDR;
2833 flags |= RT6_LOOKUP_F_IGNORE_LINKSTATE;
2834 rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, NULL, flags);
2836 /* if table lookup failed, fall back to full lookup */
2837 if (rt == net->ipv6.ip6_null_entry) {
2845 static int ip6_route_check_nh_onlink(struct net *net,
2846 struct fib6_config *cfg,
2847 const struct net_device *dev,
2848 struct netlink_ext_ack *extack)
2850 u32 tbid = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
2851 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2852 u32 flags = RTF_LOCAL | RTF_ANYCAST | RTF_REJECT;
2853 struct fib6_info *from;
2854 struct rt6_info *grt;
2858 grt = ip6_nh_lookup_table(net, cfg, gw_addr, tbid, 0);
2861 from = rcu_dereference(grt->from);
2862 if (!grt->dst.error &&
2863 /* ignore match if it is the default route */
2864 from && !ipv6_addr_any(&from->fib6_dst.addr) &&
2865 (grt->rt6i_flags & flags || dev != grt->dst.dev)) {
2866 NL_SET_ERR_MSG(extack,
2867 "Nexthop has invalid gateway or device mismatch");
2878 static int ip6_route_check_nh(struct net *net,
2879 struct fib6_config *cfg,
2880 struct net_device **_dev,
2881 struct inet6_dev **idev)
2883 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2884 struct net_device *dev = _dev ? *_dev : NULL;
2885 struct rt6_info *grt = NULL;
2886 int err = -EHOSTUNREACH;
2888 if (cfg->fc_table) {
2889 int flags = RT6_LOOKUP_F_IFACE;
2891 grt = ip6_nh_lookup_table(net, cfg, gw_addr,
2892 cfg->fc_table, flags);
2894 if (grt->rt6i_flags & RTF_GATEWAY ||
2895 (dev && dev != grt->dst.dev)) {
2903 grt = rt6_lookup(net, gw_addr, NULL, cfg->fc_ifindex, NULL, 1);
2909 if (dev != grt->dst.dev) {
2914 *_dev = dev = grt->dst.dev;
2915 *idev = grt->rt6i_idev;
2917 in6_dev_hold(grt->rt6i_idev);
2920 if (!(grt->rt6i_flags & RTF_GATEWAY))
2929 static int ip6_validate_gw(struct net *net, struct fib6_config *cfg,
2930 struct net_device **_dev, struct inet6_dev **idev,
2931 struct netlink_ext_ack *extack)
2933 const struct in6_addr *gw_addr = &cfg->fc_gateway;
2934 int gwa_type = ipv6_addr_type(gw_addr);
2935 bool skip_dev = gwa_type & IPV6_ADDR_LINKLOCAL ? false : true;
2936 const struct net_device *dev = *_dev;
2937 bool need_addr_check = !dev;
2940 /* if gw_addr is local we will fail to detect this in case
2941 * address is still TENTATIVE (DAD in progress). rt6_lookup()
2942 * will return already-added prefix route via interface that
2943 * prefix route was assigned to, which might be non-loopback.
2946 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2947 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
2951 if (gwa_type != (IPV6_ADDR_LINKLOCAL | IPV6_ADDR_UNICAST)) {
2952 /* IPv6 strictly forbids using non-link-local
2953 * addresses as the nexthop address.
2954 * Otherwise, the router will not be able to send redirects.
2955 * It is very good, but in some (rare!) circumstances
2956 * (SIT, PtP, NBMA NOARP links) it is handy to allow
2957 * some exceptions. --ANK
2958 * We allow IPv4-mapped nexthops to support RFC4798-type
2961 if (!(gwa_type & (IPV6_ADDR_UNICAST | IPV6_ADDR_MAPPED))) {
2962 NL_SET_ERR_MSG(extack, "Invalid gateway address");
2966 if (cfg->fc_flags & RTNH_F_ONLINK)
2967 err = ip6_route_check_nh_onlink(net, cfg, dev, extack);
2969 err = ip6_route_check_nh(net, cfg, _dev, idev);
2975 /* reload in case device was changed */
2980 NL_SET_ERR_MSG(extack, "Egress device not specified");
2982 } else if (dev->flags & IFF_LOOPBACK) {
2983 NL_SET_ERR_MSG(extack,
2984 "Egress device can not be loopback device for this route");
2988 /* if we did not check gw_addr above, do so now that the
2989 * egress device has been resolved.
2991 if (need_addr_check &&
2992 ipv6_chk_addr_and_flags(net, gw_addr, dev, skip_dev, 0, 0)) {
2993 NL_SET_ERR_MSG(extack, "Gateway can not be a local address");
3002 static struct fib6_info *ip6_route_info_create(struct fib6_config *cfg,
3004 struct netlink_ext_ack *extack)
3006 struct net *net = cfg->fc_nlinfo.nl_net;
3007 struct fib6_info *rt = NULL;
3008 struct net_device *dev = NULL;
3009 struct inet6_dev *idev = NULL;
3010 struct fib6_table *table;
3014 /* RTF_PCPU is an internal flag; cannot be set by userspace */
3015 if (cfg->fc_flags & RTF_PCPU) {
3016 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_PCPU");
3020 /* RTF_CACHE is an internal flag; cannot be set by userspace */
3021 if (cfg->fc_flags & RTF_CACHE) {
3022 NL_SET_ERR_MSG(extack, "Userspace can not set RTF_CACHE");
3026 if (cfg->fc_type > RTN_MAX) {
3027 NL_SET_ERR_MSG(extack, "Invalid route type");
3031 if (cfg->fc_dst_len > 128) {
3032 NL_SET_ERR_MSG(extack, "Invalid prefix length");
3035 if (cfg->fc_src_len > 128) {
3036 NL_SET_ERR_MSG(extack, "Invalid source address length");
3039 #ifndef CONFIG_IPV6_SUBTREES
3040 if (cfg->fc_src_len) {
3041 NL_SET_ERR_MSG(extack,
3042 "Specifying source address requires IPV6_SUBTREES to be enabled");
3046 if (cfg->fc_ifindex) {
3048 dev = dev_get_by_index(net, cfg->fc_ifindex);
3051 idev = in6_dev_get(dev);
3056 if (cfg->fc_metric == 0)
3057 cfg->fc_metric = IP6_RT_PRIO_USER;
3059 if (cfg->fc_flags & RTNH_F_ONLINK) {
3061 NL_SET_ERR_MSG(extack,
3062 "Nexthop device required for onlink");
3067 if (!(dev->flags & IFF_UP)) {
3068 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3075 if (cfg->fc_nlinfo.nlh &&
3076 !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
3077 table = fib6_get_table(net, cfg->fc_table);
3079 pr_warn("NLM_F_CREATE should be specified when creating new route\n");
3080 table = fib6_new_table(net, cfg->fc_table);
3083 table = fib6_new_table(net, cfg->fc_table);
3090 rt = fib6_info_alloc(gfp_flags);
3094 #ifdef CONFIG_IPV6_ROUTER_PREF
3095 rt->last_probe = jiffies;
3097 if (cfg->fc_flags & RTF_ADDRCONF)
3098 rt->dst_nocount = true;
3100 err = ip6_convert_metrics(net, rt, cfg);
3104 if (cfg->fc_flags & RTF_EXPIRES)
3105 fib6_set_expires(rt, jiffies +
3106 clock_t_to_jiffies(cfg->fc_expires));
3108 fib6_clean_expires(rt);
3110 if (cfg->fc_protocol == RTPROT_UNSPEC)
3111 cfg->fc_protocol = RTPROT_BOOT;
3112 rt->fib6_protocol = cfg->fc_protocol;
3114 addr_type = ipv6_addr_type(&cfg->fc_dst);
3116 if (cfg->fc_encap) {
3117 struct lwtunnel_state *lwtstate;
3119 err = lwtunnel_build_state(cfg->fc_encap_type,
3120 cfg->fc_encap, AF_INET6, cfg,
3124 rt->fib6_nh.nh_lwtstate = lwtstate_get(lwtstate);
3127 ipv6_addr_prefix(&rt->fib6_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
3128 rt->fib6_dst.plen = cfg->fc_dst_len;
3129 if (rt->fib6_dst.plen == 128)
3130 rt->dst_host = true;
3132 #ifdef CONFIG_IPV6_SUBTREES
3133 ipv6_addr_prefix(&rt->fib6_src.addr, &cfg->fc_src, cfg->fc_src_len);
3134 rt->fib6_src.plen = cfg->fc_src_len;
3137 rt->fib6_metric = cfg->fc_metric;
3138 rt->fib6_nh.nh_weight = 1;
3140 rt->fib6_type = cfg->fc_type ? : RTN_UNICAST;
3142 /* We cannot add true routes via loopback here,
3143 as they would result in kernel looping; promote them to reject routes
3145 if ((cfg->fc_flags & RTF_REJECT) ||
3146 (dev && (dev->flags & IFF_LOOPBACK) &&
3147 !(addr_type & IPV6_ADDR_LOOPBACK) &&
3148 !(cfg->fc_flags & RTF_LOCAL))) {
3149 /* hold loopback dev/idev if we haven't done so. */
3150 if (dev != net->loopback_dev) {
3155 dev = net->loopback_dev;
3157 idev = in6_dev_get(dev);
3163 rt->fib6_flags = RTF_REJECT|RTF_NONEXTHOP;
3167 if (cfg->fc_flags & RTF_GATEWAY) {
3168 err = ip6_validate_gw(net, cfg, &dev, &idev, extack);
3172 rt->fib6_nh.nh_gw = cfg->fc_gateway;
3179 if (idev->cnf.disable_ipv6) {
3180 NL_SET_ERR_MSG(extack, "IPv6 is disabled on nexthop device");
3185 if (!(dev->flags & IFF_UP)) {
3186 NL_SET_ERR_MSG(extack, "Nexthop device is not up");
3191 if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
3192 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
3193 NL_SET_ERR_MSG(extack, "Invalid source address");
3197 rt->fib6_prefsrc.addr = cfg->fc_prefsrc;
3198 rt->fib6_prefsrc.plen = 128;
3200 rt->fib6_prefsrc.plen = 0;
3202 rt->fib6_flags = cfg->fc_flags;
3205 if (!(rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST)) &&
3206 !netif_carrier_ok(dev))
3207 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
3208 rt->fib6_nh.nh_flags |= (cfg->fc_flags & RTNH_F_ONLINK);
3209 rt->fib6_nh.nh_dev = dev;
3210 rt->fib6_table = table;
3212 cfg->fc_nlinfo.nl_net = dev_net(dev);
3224 fib6_info_release(rt);
3225 return ERR_PTR(err);
3228 int ip6_route_add(struct fib6_config *cfg, gfp_t gfp_flags,
3229 struct netlink_ext_ack *extack)
3231 struct fib6_info *rt;
3234 rt = ip6_route_info_create(cfg, gfp_flags, extack);
3238 err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, extack);
3239 fib6_info_release(rt);
3244 static int __ip6_del_rt(struct fib6_info *rt, struct nl_info *info)
3246 struct net *net = info->nl_net;
3247 struct fib6_table *table;
3250 if (rt == net->ipv6.fib6_null_entry) {
3255 table = rt->fib6_table;
3256 spin_lock_bh(&table->tb6_lock);
3257 err = fib6_del(rt, info);
3258 spin_unlock_bh(&table->tb6_lock);
3261 fib6_info_release(rt);
3265 int ip6_del_rt(struct net *net, struct fib6_info *rt)
3267 struct nl_info info = { .nl_net = net };
3269 return __ip6_del_rt(rt, &info);
3272 static int __ip6_del_rt_siblings(struct fib6_info *rt, struct fib6_config *cfg)
3274 struct nl_info *info = &cfg->fc_nlinfo;
3275 struct net *net = info->nl_net;
3276 struct sk_buff *skb = NULL;
3277 struct fib6_table *table;
3280 if (rt == net->ipv6.fib6_null_entry)
3282 table = rt->fib6_table;
3283 spin_lock_bh(&table->tb6_lock);
3285 if (rt->fib6_nsiblings && cfg->fc_delete_all_nh) {
3286 struct fib6_info *sibling, *next_sibling;
3288 /* prefer to send a single notification with all hops */
3289 skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3291 u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3293 if (rt6_fill_node(net, skb, rt, NULL,
3294 NULL, NULL, 0, RTM_DELROUTE,
3295 info->portid, seq, 0) < 0) {
3299 info->skip_notify = 1;
3302 list_for_each_entry_safe(sibling, next_sibling,
3305 err = fib6_del(sibling, info);
3311 err = fib6_del(rt, info);
3313 spin_unlock_bh(&table->tb6_lock);
3315 fib6_info_release(rt);
3318 rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3319 info->nlh, gfp_any());
3324 static int ip6_del_cached_rt(struct rt6_info *rt, struct fib6_config *cfg)
3328 if (cfg->fc_ifindex && rt->dst.dev->ifindex != cfg->fc_ifindex)
3331 if (cfg->fc_flags & RTF_GATEWAY &&
3332 !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
3335 rc = rt6_remove_exception_rt(rt);
3340 static int ip6_route_del(struct fib6_config *cfg,
3341 struct netlink_ext_ack *extack)
3343 struct rt6_info *rt_cache;
3344 struct fib6_table *table;
3345 struct fib6_info *rt;
3346 struct fib6_node *fn;
3349 table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
3351 NL_SET_ERR_MSG(extack, "FIB table does not exist");
3357 fn = fib6_locate(&table->tb6_root,
3358 &cfg->fc_dst, cfg->fc_dst_len,
3359 &cfg->fc_src, cfg->fc_src_len,
3360 !(cfg->fc_flags & RTF_CACHE));
3363 for_each_fib6_node_rt_rcu(fn) {
3364 if (cfg->fc_flags & RTF_CACHE) {
3367 rt_cache = rt6_find_cached_rt(rt, &cfg->fc_dst,
3370 rc = ip6_del_cached_rt(rt_cache, cfg);
3378 if (cfg->fc_ifindex &&
3379 (!rt->fib6_nh.nh_dev ||
3380 rt->fib6_nh.nh_dev->ifindex != cfg->fc_ifindex))
3382 if (cfg->fc_flags & RTF_GATEWAY &&
3383 !ipv6_addr_equal(&cfg->fc_gateway, &rt->fib6_nh.nh_gw))
3385 if (cfg->fc_metric && cfg->fc_metric != rt->fib6_metric)
3387 if (cfg->fc_protocol && cfg->fc_protocol != rt->fib6_protocol)
3389 if (!fib6_info_hold_safe(rt))
3393 /* if a gateway was specified, only delete the one hop */
3394 if (cfg->fc_flags & RTF_GATEWAY)
3395 return __ip6_del_rt(rt, &cfg->fc_nlinfo);
3397 return __ip6_del_rt_siblings(rt, cfg);
3405 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
3407 struct netevent_redirect netevent;
3408 struct rt6_info *rt, *nrt = NULL;
3409 struct ndisc_options ndopts;
3410 struct inet6_dev *in6_dev;
3411 struct neighbour *neigh;
3412 struct fib6_info *from;
3414 int optlen, on_link;
3417 optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
3418 optlen -= sizeof(*msg);
3421 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
3425 msg = (struct rd_msg *)icmp6_hdr(skb);
3427 if (ipv6_addr_is_multicast(&msg->dest)) {
3428 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
3433 if (ipv6_addr_equal(&msg->dest, &msg->target)) {
3435 } else if (ipv6_addr_type(&msg->target) !=
3436 (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
3437 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
3441 in6_dev = __in6_dev_get(skb->dev);
3444 if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
3448 * The IP source address of the Redirect MUST be the same as the current
3449 * first-hop router for the specified ICMP Destination Address.
3452 if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
3453 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
3458 if (ndopts.nd_opts_tgt_lladdr) {
3459 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
3462 net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
3467 rt = (struct rt6_info *) dst;
3468 if (rt->rt6i_flags & RTF_REJECT) {
3469 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
3473 /* Redirect received -> path was valid.
3474 * Look, redirects are sent only in response to data packets,
3475 * so this nexthop is apparently reachable. --ANK
3477 dst_confirm_neigh(&rt->dst, &ipv6_hdr(skb)->saddr);
3479 neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
3484 * We have finally decided to accept it.
3487 ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
3488 NEIGH_UPDATE_F_WEAK_OVERRIDE|
3489 NEIGH_UPDATE_F_OVERRIDE|
3490 (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
3491 NEIGH_UPDATE_F_ISROUTER)),
3492 NDISC_REDIRECT, &ndopts);
3495 from = rcu_dereference(rt->from);
3499 nrt = ip6_rt_cache_alloc(from, &msg->dest, NULL);
3503 nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
3505 nrt->rt6i_flags &= ~RTF_GATEWAY;
3507 nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
3509 /* rt6_insert_exception() will take care of duplicated exceptions */
3510 if (rt6_insert_exception(nrt, from)) {
3511 dst_release_immediate(&nrt->dst);
3515 netevent.old = &rt->dst;
3516 netevent.new = &nrt->dst;
3517 netevent.daddr = &msg->dest;
3518 netevent.neigh = neigh;
3519 call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
3523 neigh_release(neigh);
3526 #ifdef CONFIG_IPV6_ROUTE_INFO
3527 static struct fib6_info *rt6_get_route_info(struct net *net,
3528 const struct in6_addr *prefix, int prefixlen,
3529 const struct in6_addr *gwaddr,
3530 struct net_device *dev)
3532 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3533 int ifindex = dev->ifindex;
3534 struct fib6_node *fn;
3535 struct fib6_info *rt = NULL;
3536 struct fib6_table *table;
3538 table = fib6_get_table(net, tb_id);
3543 fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0, true);
3547 for_each_fib6_node_rt_rcu(fn) {
3548 if (rt->fib6_nh.nh_dev->ifindex != ifindex)
3550 if ((rt->fib6_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
3552 if (!ipv6_addr_equal(&rt->fib6_nh.nh_gw, gwaddr))
3554 if (!fib6_info_hold_safe(rt))
3563 static struct fib6_info *rt6_add_route_info(struct net *net,
3564 const struct in6_addr *prefix, int prefixlen,
3565 const struct in6_addr *gwaddr,
3566 struct net_device *dev,
3569 struct fib6_config cfg = {
3570 .fc_metric = IP6_RT_PRIO_USER,
3571 .fc_ifindex = dev->ifindex,
3572 .fc_dst_len = prefixlen,
3573 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
3574 RTF_UP | RTF_PREF(pref),
3575 .fc_protocol = RTPROT_RA,
3576 .fc_type = RTN_UNICAST,
3577 .fc_nlinfo.portid = 0,
3578 .fc_nlinfo.nlh = NULL,
3579 .fc_nlinfo.nl_net = net,
3582 cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
3583 cfg.fc_dst = *prefix;
3584 cfg.fc_gateway = *gwaddr;
3586 /* We should treat it as a default route if prefix length is 0. */
3588 cfg.fc_flags |= RTF_DEFAULT;
3590 ip6_route_add(&cfg, GFP_ATOMIC, NULL);
3592 return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
3596 struct fib6_info *rt6_get_dflt_router(struct net *net,
3597 const struct in6_addr *addr,
3598 struct net_device *dev)
3600 u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
3601 struct fib6_info *rt;
3602 struct fib6_table *table;
3604 table = fib6_get_table(net, tb_id);
3609 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3610 if (dev == rt->fib6_nh.nh_dev &&
3611 ((rt->fib6_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
3612 ipv6_addr_equal(&rt->fib6_nh.nh_gw, addr))
3615 if (rt && !fib6_info_hold_safe(rt))
3621 struct fib6_info *rt6_add_dflt_router(struct net *net,
3622 const struct in6_addr *gwaddr,
3623 struct net_device *dev,
3626 struct fib6_config cfg = {
3627 .fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
3628 .fc_metric = IP6_RT_PRIO_USER,
3629 .fc_ifindex = dev->ifindex,
3630 .fc_flags = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
3631 RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
3632 .fc_protocol = RTPROT_RA,
3633 .fc_type = RTN_UNICAST,
3634 .fc_nlinfo.portid = 0,
3635 .fc_nlinfo.nlh = NULL,
3636 .fc_nlinfo.nl_net = net,
3639 cfg.fc_gateway = *gwaddr;
3641 if (!ip6_route_add(&cfg, GFP_ATOMIC, NULL)) {
3642 struct fib6_table *table;
3644 table = fib6_get_table(dev_net(dev), cfg.fc_table);
3646 table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
3649 return rt6_get_dflt_router(net, gwaddr, dev);
3652 static void __rt6_purge_dflt_routers(struct net *net,
3653 struct fib6_table *table)
3655 struct fib6_info *rt;
3659 for_each_fib6_node_rt_rcu(&table->tb6_root) {
3660 struct net_device *dev = fib6_info_nh_dev(rt);
3661 struct inet6_dev *idev = dev ? __in6_dev_get(dev) : NULL;
3663 if (rt->fib6_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
3664 (!idev || idev->cnf.accept_ra != 2) &&
3665 fib6_info_hold_safe(rt)) {
3667 ip6_del_rt(net, rt);
3673 table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
3676 void rt6_purge_dflt_routers(struct net *net)
3678 struct fib6_table *table;
3679 struct hlist_head *head;
3684 for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
3685 head = &net->ipv6.fib_table_hash[h];
3686 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
3687 if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
3688 __rt6_purge_dflt_routers(net, table);
3695 static void rtmsg_to_fib6_config(struct net *net,
3696 struct in6_rtmsg *rtmsg,
3697 struct fib6_config *cfg)
3699 memset(cfg, 0, sizeof(*cfg));
3701 cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
3703 cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
3704 cfg->fc_metric = rtmsg->rtmsg_metric;
3705 cfg->fc_expires = rtmsg->rtmsg_info;
3706 cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
3707 cfg->fc_src_len = rtmsg->rtmsg_src_len;
3708 cfg->fc_flags = rtmsg->rtmsg_flags;
3709 cfg->fc_type = rtmsg->rtmsg_type;
3711 cfg->fc_nlinfo.nl_net = net;
3713 cfg->fc_dst = rtmsg->rtmsg_dst;
3714 cfg->fc_src = rtmsg->rtmsg_src;
3715 cfg->fc_gateway = rtmsg->rtmsg_gateway;
3718 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
3720 struct fib6_config cfg;
3721 struct in6_rtmsg rtmsg;
3725 case SIOCADDRT: /* Add a route */
3726 case SIOCDELRT: /* Delete a route */
3727 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
3729 err = copy_from_user(&rtmsg, arg,
3730 sizeof(struct in6_rtmsg));
3734 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
3739 err = ip6_route_add(&cfg, GFP_KERNEL, NULL);
3742 err = ip6_route_del(&cfg, NULL);
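/*
 * Editor's note: a minimal user-space sketch (an assumption, not part of this
 * file) of the legacy SIOCADDRT/SIOCDELRT ioctl interface handled above. The
 * prefix, gateway and device name are placeholders; header locations are
 * shown for glibc (struct in6_rtmsg and the RTF_* flags via <net/route.h>)
 * and may vary on other libcs. Error handling is kept minimal.
 */
#include <arpa/inet.h>
#include <net/if.h>
#include <net/route.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/socket.h>
#include <unistd.h>

int main(void)
{
	struct in6_rtmsg rt;
	int fd = socket(AF_INET6, SOCK_DGRAM, 0);

	memset(&rt, 0, sizeof(rt));
	inet_pton(AF_INET6, "2001:db8::", &rt.rtmsg_dst);	/* example prefix  */
	inet_pton(AF_INET6, "fe80::1", &rt.rtmsg_gateway);	/* example gateway */
	rt.rtmsg_dst_len = 64;
	rt.rtmsg_metric = 1;
	rt.rtmsg_flags = RTF_UP | RTF_GATEWAY;
	rt.rtmsg_ifindex = if_nametoindex("eth0");		/* hypothetical device */

	if (ioctl(fd, SIOCADDRT, &rt) < 0)			/* SIOCDELRT deletes */
		return 1;
	close(fd);
	return 0;
}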
3756 * Drop the packet on the floor
3759 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
3762 struct dst_entry *dst = skb_dst(skb);
3763 switch (ipstats_mib_noroutes) {
3764 case IPSTATS_MIB_INNOROUTES:
3765 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
3766 if (type == IPV6_ADDR_ANY) {
3767 IP6_INC_STATS(dev_net(dst->dev),
3768 __in6_dev_get_safely(skb->dev),
3769 IPSTATS_MIB_INADDRERRORS);
3773 case IPSTATS_MIB_OUTNOROUTES:
3774 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
3775 ipstats_mib_noroutes);
3778 icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
3783 static int ip6_pkt_discard(struct sk_buff *skb)
3785 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
3788 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3790 skb->dev = skb_dst(skb)->dev;
3791 return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
3794 static int ip6_pkt_prohibit(struct sk_buff *skb)
3796 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
3799 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
3801 skb->dev = skb_dst(skb)->dev;
3802 return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
3806 * Allocate a dst for local (unicast / anycast) address.
3809 struct fib6_info *addrconf_f6i_alloc(struct net *net,
3810 struct inet6_dev *idev,
3811 const struct in6_addr *addr,
3812 bool anycast, gfp_t gfp_flags)
3815 struct net_device *dev = idev->dev;
3816 struct fib6_info *f6i;
3818 f6i = fib6_info_alloc(gfp_flags);
3820 return ERR_PTR(-ENOMEM);
3822 f6i->dst_nocount = true;
3823 f6i->dst_host = true;
3824 f6i->fib6_protocol = RTPROT_KERNEL;
3825 f6i->fib6_flags = RTF_UP | RTF_NONEXTHOP;
3827 f6i->fib6_type = RTN_ANYCAST;
3828 f6i->fib6_flags |= RTF_ANYCAST;
3830 f6i->fib6_type = RTN_LOCAL;
3831 f6i->fib6_flags |= RTF_LOCAL;
3834 f6i->fib6_nh.nh_gw = *addr;
3836 f6i->fib6_nh.nh_dev = dev;
3837 f6i->fib6_dst.addr = *addr;
3838 f6i->fib6_dst.plen = 128;
3839 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
3840 f6i->fib6_table = fib6_get_table(net, tb_id);
3845 /* remove deleted ip from prefsrc entries */
3846 struct arg_dev_net_ip {
3847 struct net_device *dev;
3849 struct in6_addr *addr;
3852 static int fib6_remove_prefsrc(struct fib6_info *rt, void *arg)
3854 struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
3855 struct net *net = ((struct arg_dev_net_ip *)arg)->net;
3856 struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
3858 if (((void *)rt->fib6_nh.nh_dev == dev || !dev) &&
3859 rt != net->ipv6.fib6_null_entry &&
3860 ipv6_addr_equal(addr, &rt->fib6_prefsrc.addr)) {
3861 spin_lock_bh(&rt6_exception_lock);
3862 /* remove prefsrc entry */
3863 rt->fib6_prefsrc.plen = 0;
3864 /* need to update cache as well */
3865 rt6_exceptions_remove_prefsrc(rt);
3866 spin_unlock_bh(&rt6_exception_lock);
3871 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
3873 struct net *net = dev_net(ifp->idev->dev);
3874 struct arg_dev_net_ip adni = {
3875 .dev = ifp->idev->dev,
3879 fib6_clean_all(net, fib6_remove_prefsrc, &adni);
3882 #define RTF_RA_ROUTER (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
3884 /* Remove routers and update dst entries when a gateway turns into a host. */
3885 static int fib6_clean_tohost(struct fib6_info *rt, void *arg)
3887 struct in6_addr *gateway = (struct in6_addr *)arg;
3889 if (((rt->fib6_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) &&
3890 ipv6_addr_equal(gateway, &rt->fib6_nh.nh_gw)) {
3894 /* Further clean up cached routes in the exception table.
3895 * This is needed because a cached route may have a different
3896 * gateway from its 'parent' in the case of an IP redirect.
3898 rt6_exceptions_clean_tohost(rt, gateway);
3903 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
3905 fib6_clean_all(net, fib6_clean_tohost, gateway);
3908 struct arg_netdev_event {
3909 const struct net_device *dev;
3911 unsigned int nh_flags;
3912 unsigned long event;
3916 static struct fib6_info *rt6_multipath_first_sibling(const struct fib6_info *rt)
3918 struct fib6_info *iter;
3919 struct fib6_node *fn;
3921 fn = rcu_dereference_protected(rt->fib6_node,
3922 lockdep_is_held(&rt->fib6_table->tb6_lock));
3923 iter = rcu_dereference_protected(fn->leaf,
3924 lockdep_is_held(&rt->fib6_table->tb6_lock));
3926 if (iter->fib6_metric == rt->fib6_metric &&
3927 rt6_qualify_for_ecmp(iter))
3929 iter = rcu_dereference_protected(iter->fib6_next,
3930 lockdep_is_held(&rt->fib6_table->tb6_lock));
3936 static bool rt6_is_dead(const struct fib6_info *rt)
3938 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD ||
3939 (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN &&
3940 fib6_ignore_linkdown(rt)))
3946 static int rt6_multipath_total_weight(const struct fib6_info *rt)
3948 struct fib6_info *iter;
3951 if (!rt6_is_dead(rt))
3952 total += rt->fib6_nh.nh_weight;
3954 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings) {
3955 if (!rt6_is_dead(iter))
3956 total += iter->fib6_nh.nh_weight;
3962 static void rt6_upper_bound_set(struct fib6_info *rt, int *weight, int total)
3964 int upper_bound = -1;
3966 if (!rt6_is_dead(rt)) {
3967 *weight += rt->fib6_nh.nh_weight;
3968 upper_bound = DIV_ROUND_CLOSEST_ULL((u64) (*weight) << 31,
3971 atomic_set(&rt->fib6_nh.nh_upper_bound, upper_bound);
3974 static void rt6_multipath_upper_bound_set(struct fib6_info *rt, int total)
3976 struct fib6_info *iter;
3979 rt6_upper_bound_set(rt, &weight, total);
3981 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
3982 rt6_upper_bound_set(iter, &weight, total);
3985 void rt6_multipath_rebalance(struct fib6_info *rt)
3987 struct fib6_info *first;
3990 /* If the entire multipath route was marked for flushing,
3991 * there is no need to rebalance upon the removal of every
3994 if (!rt->fib6_nsiblings || rt->should_flush)
3997 /* During lookup routes are evaluated in order, so we need to
3998 * make sure upper bounds are assigned from the first sibling
4001 first = rt6_multipath_first_sibling(rt);
4002 if (WARN_ON_ONCE(!first))
4005 total = rt6_multipath_total_weight(first);
4006 rt6_multipath_upper_bound_set(first, total);
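/*
 * Editor's note: a self-contained user-space model (an assumption, not the
 * kernel's actual selection code) of how the 31-bit upper bounds computed by
 * rt6_upper_bound_set() above partition the hash space among sibling
 * nexthops: each sibling covers a slice proportional to its weight, and a
 * flow hash picks the first sibling whose bound covers it. Rounding here is
 * by truncation for brevity, where the kernel rounds to nearest.
 */
#include <stdint.h>
#include <stdio.h>

struct nh { int weight; uint32_t upper_bound; };

/* Cumulative weight scaled into the 31-bit hash space. */
static void set_upper_bounds(struct nh *nhs, int n)
{
	int i, total = 0, weight = 0;

	for (i = 0; i < n; i++)
		total += nhs[i].weight;
	for (i = 0; i < n; i++) {
		weight += nhs[i].weight;
		nhs[i].upper_bound = (uint32_t)(((uint64_t)weight << 31) / total);
	}
}

/* A flow hash in [0, 2^31) selects the first nexthop whose bound covers it. */
static int select_nexthop(const struct nh *nhs, int n, uint32_t hash)
{
	int i;

	for (i = 0; i < n; i++)
		if (hash <= nhs[i].upper_bound)
			return i;
	return n - 1;
}

int main(void)
{
	struct nh nhs[] = { { .weight = 1 }, { .weight = 2 }, { .weight = 1 } };

	set_upper_bounds(nhs, 3);
	/* A hash just below 2^31/4 lands on the first (weight-1) nexthop. */
	printf("hash 0x1fffffff -> nexthop %d\n",
	       select_nexthop(nhs, 3, 0x1fffffffu));
	return 0;
}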
4009 static int fib6_ifup(struct fib6_info *rt, void *p_arg)
4011 const struct arg_netdev_event *arg = p_arg;
4012 struct net *net = dev_net(arg->dev);
4014 if (rt != net->ipv6.fib6_null_entry && rt->fib6_nh.nh_dev == arg->dev) {
4015 rt->fib6_nh.nh_flags &= ~arg->nh_flags;
4016 fib6_update_sernum_upto_root(net, rt);
4017 rt6_multipath_rebalance(rt);
4023 void rt6_sync_up(struct net_device *dev, unsigned int nh_flags)
4025 struct arg_netdev_event arg = {
4028 .nh_flags = nh_flags,
4032 if (nh_flags & RTNH_F_DEAD && netif_carrier_ok(dev))
4033 arg.nh_flags |= RTNH_F_LINKDOWN;
4035 fib6_clean_all(dev_net(dev), fib6_ifup, &arg);
4038 static bool rt6_multipath_uses_dev(const struct fib6_info *rt,
4039 const struct net_device *dev)
4041 struct fib6_info *iter;
4043 if (rt->fib6_nh.nh_dev == dev)
4045 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4046 if (iter->fib6_nh.nh_dev == dev)
4052 static void rt6_multipath_flush(struct fib6_info *rt)
4054 struct fib6_info *iter;
4056 rt->should_flush = 1;
4057 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4058 iter->should_flush = 1;
4061 static unsigned int rt6_multipath_dead_count(const struct fib6_info *rt,
4062 const struct net_device *down_dev)
4064 struct fib6_info *iter;
4065 unsigned int dead = 0;
4067 if (rt->fib6_nh.nh_dev == down_dev ||
4068 rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4070 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4071 if (iter->fib6_nh.nh_dev == down_dev ||
4072 iter->fib6_nh.nh_flags & RTNH_F_DEAD)
4078 static void rt6_multipath_nh_flags_set(struct fib6_info *rt,
4079 const struct net_device *dev,
4080 unsigned int nh_flags)
4082 struct fib6_info *iter;
4084 if (rt->fib6_nh.nh_dev == dev)
4085 rt->fib6_nh.nh_flags |= nh_flags;
4086 list_for_each_entry(iter, &rt->fib6_siblings, fib6_siblings)
4087 if (iter->fib6_nh.nh_dev == dev)
4088 iter->fib6_nh.nh_flags |= nh_flags;
4091 /* called with write lock held for table with rt */
4092 static int fib6_ifdown(struct fib6_info *rt, void *p_arg)
4094 const struct arg_netdev_event *arg = p_arg;
4095 const struct net_device *dev = arg->dev;
4096 struct net *net = dev_net(dev);
4098 if (rt == net->ipv6.fib6_null_entry)
4101 switch (arg->event) {
4102 case NETDEV_UNREGISTER:
4103 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4105 if (rt->should_flush)
4107 if (!rt->fib6_nsiblings)
4108 return rt->fib6_nh.nh_dev == dev ? -1 : 0;
4109 if (rt6_multipath_uses_dev(rt, dev)) {
4112 count = rt6_multipath_dead_count(rt, dev);
4113 if (rt->fib6_nsiblings + 1 == count) {
4114 rt6_multipath_flush(rt);
4117 rt6_multipath_nh_flags_set(rt, dev, RTNH_F_DEAD |
4119 fib6_update_sernum(net, rt);
4120 rt6_multipath_rebalance(rt);
4124 if (rt->fib6_nh.nh_dev != dev ||
4125 rt->fib6_flags & (RTF_LOCAL | RTF_ANYCAST))
4127 rt->fib6_nh.nh_flags |= RTNH_F_LINKDOWN;
4128 rt6_multipath_rebalance(rt);
4135 void rt6_sync_down_dev(struct net_device *dev, unsigned long event)
4137 struct arg_netdev_event arg = {
4144 fib6_clean_all(dev_net(dev), fib6_ifdown, &arg);
4147 void rt6_disable_ip(struct net_device *dev, unsigned long event)
4149 rt6_sync_down_dev(dev, event);
4150 rt6_uncached_list_flush_dev(dev_net(dev), dev);
4151 neigh_ifdown(&nd_tbl, dev);
4154 struct rt6_mtu_change_arg {
4155 struct net_device *dev;
4159 static int rt6_mtu_change_route(struct fib6_info *rt, void *p_arg)
4161 struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
4162 struct inet6_dev *idev;
4164 /* In IPv6, PMTU discovery is not optional,
4165 so the RTAX_MTU lock cannot disable it.
4166 We still use this lock to block changes
4167 caused by addrconf/ndisc.
4170 idev = __in6_dev_get(arg->dev);
4174 /* For an administrative MTU increase, there is no way to discover
4175 an IPv6 PMTU increase, so the PMTU should be updated here.
4176 Since RFC 1981 doesn't cover administrative MTU increases,
4177 updating the PMTU on such an increase is a MUST. (i.e. jumbo frames)
4179 if (rt->fib6_nh.nh_dev == arg->dev &&
4180 !fib6_metric_locked(rt, RTAX_MTU)) {
4181 u32 mtu = rt->fib6_pmtu;
4183 if (mtu >= arg->mtu ||
4184 (mtu < arg->mtu && mtu == idev->cnf.mtu6))
4185 fib6_metric_set(rt, RTAX_MTU, arg->mtu);
4187 spin_lock_bh(&rt6_exception_lock);
4188 rt6_exceptions_update_pmtu(idev, rt, arg->mtu);
4189 spin_unlock_bh(&rt6_exception_lock);
4194 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
4196 struct rt6_mtu_change_arg arg = {
4201 fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
4204 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
4205 [RTA_GATEWAY] = { .len = sizeof(struct in6_addr) },
4206 [RTA_PREFSRC] = { .len = sizeof(struct in6_addr) },
4207 [RTA_OIF] = { .type = NLA_U32 },
4208 [RTA_IIF] = { .type = NLA_U32 },
4209 [RTA_PRIORITY] = { .type = NLA_U32 },
4210 [RTA_METRICS] = { .type = NLA_NESTED },
4211 [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
4212 [RTA_PREF] = { .type = NLA_U8 },
4213 [RTA_ENCAP_TYPE] = { .type = NLA_U16 },
4214 [RTA_ENCAP] = { .type = NLA_NESTED },
4215 [RTA_EXPIRES] = { .type = NLA_U32 },
4216 [RTA_UID] = { .type = NLA_U32 },
4217 [RTA_MARK] = { .type = NLA_U32 },
4218 [RTA_TABLE] = { .type = NLA_U32 },
4219 [RTA_IP_PROTO] = { .type = NLA_U8 },
4220 [RTA_SPORT] = { .type = NLA_U16 },
4221 [RTA_DPORT] = { .type = NLA_U16 },
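/*
 * Editor's note: the policy table above validates attributes arriving over
 * rtnetlink. Below is a minimal user-space sketch (an assumption added for
 * illustration) that builds an RTM_NEWROUTE request carrying RTA_DST,
 * RTA_GATEWAY and RTA_OIF over a raw netlink socket. The prefix, gateway and
 * device name are placeholders, and reading the acknowledgement is omitted.
 */
#include <arpa/inet.h>
#include <linux/netlink.h>
#include <linux/rtnetlink.h>
#include <net/if.h>
#include <string.h>
#include <sys/socket.h>
#include <unistd.h>

static void add_rta(struct nlmsghdr *nlh, int type, const void *data, int len)
{
	struct rtattr *rta = (struct rtattr *)((char *)nlh +
					       NLMSG_ALIGN(nlh->nlmsg_len));

	rta->rta_type = type;
	rta->rta_len = RTA_LENGTH(len);
	memcpy(RTA_DATA(rta), data, len);
	nlh->nlmsg_len = NLMSG_ALIGN(nlh->nlmsg_len) + RTA_ALIGN(rta->rta_len);
}

int main(void)
{
	struct {
		struct nlmsghdr nlh;
		struct rtmsg rtm;
		char attrs[256];
	} req;
	struct sockaddr_nl sa = { .nl_family = AF_NETLINK };
	struct in6_addr dst, gw;
	unsigned int oif = if_nametoindex("eth0");	/* hypothetical device */
	int fd = socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE);

	inet_pton(AF_INET6, "2001:db8::", &dst);	/* example prefix  */
	inet_pton(AF_INET6, "fe80::1", &gw);		/* example gateway */

	memset(&req, 0, sizeof(req));
	req.nlh.nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg));
	req.nlh.nlmsg_type = RTM_NEWROUTE;
	req.nlh.nlmsg_flags = NLM_F_REQUEST | NLM_F_CREATE | NLM_F_EXCL | NLM_F_ACK;

	req.rtm.rtm_family = AF_INET6;
	req.rtm.rtm_dst_len = 64;
	req.rtm.rtm_table = RT_TABLE_MAIN;
	req.rtm.rtm_protocol = RTPROT_STATIC;
	req.rtm.rtm_scope = RT_SCOPE_UNIVERSE;
	req.rtm.rtm_type = RTN_UNICAST;

	add_rta(&req.nlh, RTA_DST, &dst, sizeof(dst));
	add_rta(&req.nlh, RTA_GATEWAY, &gw, sizeof(gw));
	add_rta(&req.nlh, RTA_OIF, &oif, sizeof(oif));

	sendto(fd, &req, req.nlh.nlmsg_len, 0, (struct sockaddr *)&sa, sizeof(sa));
	close(fd);
	return 0;
}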
4224 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
4225 struct fib6_config *cfg,
4226 struct netlink_ext_ack *extack)
4229 struct nlattr *tb[RTA_MAX+1];
4233 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4239 rtm = nlmsg_data(nlh);
4240 memset(cfg, 0, sizeof(*cfg));
4242 cfg->fc_table = rtm->rtm_table;
4243 cfg->fc_dst_len = rtm->rtm_dst_len;
4244 cfg->fc_src_len = rtm->rtm_src_len;
4245 cfg->fc_flags = RTF_UP;
4246 cfg->fc_protocol = rtm->rtm_protocol;
4247 cfg->fc_type = rtm->rtm_type;
4249 if (rtm->rtm_type == RTN_UNREACHABLE ||
4250 rtm->rtm_type == RTN_BLACKHOLE ||
4251 rtm->rtm_type == RTN_PROHIBIT ||
4252 rtm->rtm_type == RTN_THROW)
4253 cfg->fc_flags |= RTF_REJECT;
4255 if (rtm->rtm_type == RTN_LOCAL)
4256 cfg->fc_flags |= RTF_LOCAL;
4258 if (rtm->rtm_flags & RTM_F_CLONED)
4259 cfg->fc_flags |= RTF_CACHE;
4261 cfg->fc_flags |= (rtm->rtm_flags & RTNH_F_ONLINK);
4263 cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
4264 cfg->fc_nlinfo.nlh = nlh;
4265 cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
4267 if (tb[RTA_GATEWAY]) {
4268 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
4269 cfg->fc_flags |= RTF_GATEWAY;
4272 NL_SET_ERR_MSG(extack, "IPv6 does not support RTA_VIA attribute");
4277 int plen = (rtm->rtm_dst_len + 7) >> 3;
4279 if (nla_len(tb[RTA_DST]) < plen)
4282 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
4286 int plen = (rtm->rtm_src_len + 7) >> 3;
4288 if (nla_len(tb[RTA_SRC]) < plen)
4291 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
4294 if (tb[RTA_PREFSRC])
4295 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
4298 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
4300 if (tb[RTA_PRIORITY])
4301 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
4303 if (tb[RTA_METRICS]) {
4304 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
4305 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
4309 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
4311 if (tb[RTA_MULTIPATH]) {
4312 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
4313 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
4315 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
4316 cfg->fc_mp_len, extack);
4322 pref = nla_get_u8(tb[RTA_PREF]);
4323 if (pref != ICMPV6_ROUTER_PREF_LOW &&
4324 pref != ICMPV6_ROUTER_PREF_HIGH)
4325 pref = ICMPV6_ROUTER_PREF_MEDIUM;
4326 cfg->fc_flags |= RTF_PREF(pref);
4330 cfg->fc_encap = tb[RTA_ENCAP];
4332 if (tb[RTA_ENCAP_TYPE]) {
4333 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
4335 err = lwtunnel_valid_encap_type(cfg->fc_encap_type, extack);
4340 if (tb[RTA_EXPIRES]) {
4341 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
4343 if (addrconf_finite_timeout(timeout)) {
4344 cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
4345 cfg->fc_flags |= RTF_EXPIRES;
4355 struct fib6_info *fib6_info;
4356 struct fib6_config r_cfg;
4357 struct list_head next;
4360 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
4364 list_for_each_entry(nh, rt6_nh_list, next) {
4365 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6c nexthop %pI6c ifi %d\n",
4366 &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
4367 nh->r_cfg.fc_ifindex);
4371 static int ip6_route_info_append(struct net *net,
4372 struct list_head *rt6_nh_list,
4373 struct fib6_info *rt,
4374 struct fib6_config *r_cfg)
4379 list_for_each_entry(nh, rt6_nh_list, next) {
4380 /* check if fib6_info already exists */
4381 if (rt6_duplicate_nexthop(nh->fib6_info, rt))
4385 nh = kzalloc(sizeof(*nh), GFP_KERNEL);
4389 memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
4390 list_add_tail(&nh->next, rt6_nh_list);
4395 static void ip6_route_mpath_notify(struct fib6_info *rt,
4396 struct fib6_info *rt_last,
4397 struct nl_info *info,
4400 /* if this is an APPEND route, then rt points to the first route
4401 * inserted and rt_last points to last route inserted. Userspace
4402 * wants a consistent dump of the route which starts at the first
4403 * nexthop. Since sibling routes are always added at the end of
4404 * the list, find the first sibling of the last route appended
4406 if ((nlflags & NLM_F_APPEND) && rt_last && rt_last->fib6_nsiblings) {
4407 rt = list_first_entry(&rt_last->fib6_siblings,
4413 inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
4416 static int fib6_gw_from_attr(struct in6_addr *gw, struct nlattr *nla,
4417 struct netlink_ext_ack *extack)
4419 if (nla_len(nla) < sizeof(*gw)) {
4420 NL_SET_ERR_MSG(extack, "Invalid IPv6 address in RTA_GATEWAY");
4424 *gw = nla_get_in6_addr(nla);
4429 static int ip6_route_multipath_add(struct fib6_config *cfg,
4430 struct netlink_ext_ack *extack)
4432 struct fib6_info *rt_notif = NULL, *rt_last = NULL;
4433 struct nl_info *info = &cfg->fc_nlinfo;
4434 struct fib6_config r_cfg;
4435 struct rtnexthop *rtnh;
4436 struct fib6_info *rt;
4437 struct rt6_nh *err_nh;
4438 struct rt6_nh *nh, *nh_safe;
4444 int replace = (cfg->fc_nlinfo.nlh &&
4445 (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
4446 LIST_HEAD(rt6_nh_list);
4448 nlflags = replace ? NLM_F_REPLACE : NLM_F_CREATE;
4449 if (info->nlh && info->nlh->nlmsg_flags & NLM_F_APPEND)
4450 nlflags |= NLM_F_APPEND;
4452 remaining = cfg->fc_mp_len;
4453 rtnh = (struct rtnexthop *)cfg->fc_mp;
4455 /* Parse a Multipath Entry and build a list (rt6_nh_list) of
4456 * fib6_info structs per nexthop
4458 while (rtnh_ok(rtnh, remaining)) {
4459 memcpy(&r_cfg, cfg, sizeof(*cfg));
4460 if (rtnh->rtnh_ifindex)
4461 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4463 attrlen = rtnh_attrlen(rtnh);
4465 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4467 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4469 err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla,
4474 r_cfg.fc_flags |= RTF_GATEWAY;
4476 r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
4477 nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
4479 r_cfg.fc_encap_type = nla_get_u16(nla);
4482 r_cfg.fc_flags |= (rtnh->rtnh_flags & RTNH_F_ONLINK);
4483 rt = ip6_route_info_create(&r_cfg, GFP_KERNEL, extack);
4489 if (!rt6_qualify_for_ecmp(rt)) {
4491 NL_SET_ERR_MSG(extack,
4492 "Device only routes can not be added for IPv6 using the multipath API.");
4493 fib6_info_release(rt);
4497 rt->fib6_nh.nh_weight = rtnh->rtnh_hops + 1;
4499 err = ip6_route_info_append(info->nl_net, &rt6_nh_list,
4502 fib6_info_release(rt);
4506 rtnh = rtnh_next(rtnh, &remaining);
4509 /* for add and replace send one notification with all nexthops.
4510 * Skip the notification in fib6_add_rt2node and send one with
4511 * the full route when done
4513 info->skip_notify = 1;
4516 list_for_each_entry(nh, &rt6_nh_list, next) {
4517 err = __ip6_ins_rt(nh->fib6_info, info, extack);
4518 fib6_info_release(nh->fib6_info);
4521 /* save reference to last route successfully inserted */
4522 rt_last = nh->fib6_info;
4524 /* save reference to first route for notification */
4526 rt_notif = nh->fib6_info;
4529 /* nh->fib6_info is used or freed at this point, reset to NULL */
4530 nh->fib6_info = NULL;
4533 ip6_print_replace_route_err(&rt6_nh_list);
4538 /* Because each route is added as a single route, we remove
4539 * these flags after the first nexthop: if there is a collision,
4540 * we have already failed to add the first nexthop:
4541 * fib6_add_rt2node() has rejected it; when replacing, old
4542 * nexthops have been replaced by the first new one, the rest should
4545 if (cfg->fc_nlinfo.nlh) {
4546 cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
4548 cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_CREATE;
4553 /* success ... tell user about new route */
4554 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4558 /* send notification for routes that were added so that
4559 * the delete notifications sent by ip6_route_del are
4563 ip6_route_mpath_notify(rt_notif, rt_last, info, nlflags);
4565 /* Delete routes that were already added */
4566 list_for_each_entry(nh, &rt6_nh_list, next) {
4569 ip6_route_del(&nh->r_cfg, extack);
4573 list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
4575 fib6_info_release(nh->fib6_info);
4576 list_del(&nh->next);
4583 static int ip6_route_multipath_del(struct fib6_config *cfg,
4584 struct netlink_ext_ack *extack)
4586 struct fib6_config r_cfg;
4587 struct rtnexthop *rtnh;
4590 int err = 1, last_err = 0;
4592 remaining = cfg->fc_mp_len;
4593 rtnh = (struct rtnexthop *)cfg->fc_mp;
4595 /* Parse a Multipath Entry */
4596 while (rtnh_ok(rtnh, remaining)) {
4597 memcpy(&r_cfg, cfg, sizeof(*cfg));
4598 if (rtnh->rtnh_ifindex)
4599 r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
4601 attrlen = rtnh_attrlen(rtnh);
4603 struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
4605 nla = nla_find(attrs, attrlen, RTA_GATEWAY);
4607 err = fib6_gw_from_attr(&r_cfg.fc_gateway, nla,
4614 r_cfg.fc_flags |= RTF_GATEWAY;
4617 err = ip6_route_del(&r_cfg, extack);
4622 rtnh = rtnh_next(rtnh, &remaining);
4628 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4629 struct netlink_ext_ack *extack)
4631 struct fib6_config cfg;
4634 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4639 return ip6_route_multipath_del(&cfg, extack);
4641 cfg.fc_delete_all_nh = 1;
4642 return ip6_route_del(&cfg, extack);
4646 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh,
4647 struct netlink_ext_ack *extack)
4649 struct fib6_config cfg;
4652 err = rtm_to_fib6_config(skb, nlh, &cfg, extack);
4657 return ip6_route_multipath_add(&cfg, extack);
4659 return ip6_route_add(&cfg, GFP_KERNEL, extack);
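/*
 * Editor's note: for reference, the add/delete handlers above are what the
 * iproute2 utility exercises over rtnetlink. The commands below are hedged
 * usage examples (addresses and device names are placeholders):
 *
 *   ip -6 route add 2001:db8::/64 via fe80::1 dev eth0
 *   ip -6 route del 2001:db8::/64
 *
 * and a weighted multipath route, which ends up in ip6_route_multipath_add():
 *
 *   ip -6 route add 2001:db8:1::/64 \
 *           nexthop via fe80::1 dev eth0 weight 1 \
 *           nexthop via fe80::2 dev eth1 weight 2
 */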
4662 static size_t rt6_nlmsg_size(struct fib6_info *rt)
4664 int nexthop_len = 0;
4666 if (rt->fib6_nsiblings) {
4667 nexthop_len = nla_total_size(0) /* RTA_MULTIPATH */
4668 + NLA_ALIGN(sizeof(struct rtnexthop))
4669 + nla_total_size(16) /* RTA_GATEWAY */
4670 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate);
4672 nexthop_len *= rt->fib6_nsiblings;
4675 return NLMSG_ALIGN(sizeof(struct rtmsg))
4676 + nla_total_size(16) /* RTA_SRC */
4677 + nla_total_size(16) /* RTA_DST */
4678 + nla_total_size(16) /* RTA_GATEWAY */
4679 + nla_total_size(16) /* RTA_PREFSRC */
4680 + nla_total_size(4) /* RTA_TABLE */
4681 + nla_total_size(4) /* RTA_IIF */
4682 + nla_total_size(4) /* RTA_OIF */
4683 + nla_total_size(4) /* RTA_PRIORITY */
4684 + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
4685 + nla_total_size(sizeof(struct rta_cacheinfo))
4686 + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
4687 + nla_total_size(1) /* RTA_PREF */
4688 + lwtunnel_get_encap_size(rt->fib6_nh.nh_lwtstate)
4692 static int rt6_nexthop_info(struct sk_buff *skb, struct fib6_info *rt,
4693 unsigned int *flags, bool skip_oif)
4695 if (rt->fib6_nh.nh_flags & RTNH_F_DEAD)
4696 *flags |= RTNH_F_DEAD;
4698 if (rt->fib6_nh.nh_flags & RTNH_F_LINKDOWN) {
4699 *flags |= RTNH_F_LINKDOWN;
4702 if (fib6_ignore_linkdown(rt))
4703 *flags |= RTNH_F_DEAD;
4707 if (rt->fib6_flags & RTF_GATEWAY) {
4708 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->fib6_nh.nh_gw) < 0)
4709 goto nla_put_failure;
4712 *flags |= (rt->fib6_nh.nh_flags & RTNH_F_ONLINK);
4713 if (rt->fib6_nh.nh_flags & RTNH_F_OFFLOAD)
4714 *flags |= RTNH_F_OFFLOAD;
4716 /* not needed for multipath encoding because it has an rtnexthop struct */
4717 if (!skip_oif && rt->fib6_nh.nh_dev &&
4718 nla_put_u32(skb, RTA_OIF, rt->fib6_nh.nh_dev->ifindex))
4719 goto nla_put_failure;
4721 if (rt->fib6_nh.nh_lwtstate &&
4722 lwtunnel_fill_encap(skb, rt->fib6_nh.nh_lwtstate) < 0)
4723 goto nla_put_failure;
4731 /* add multipath next hop */
4732 static int rt6_add_nexthop(struct sk_buff *skb, struct fib6_info *rt)
4734 const struct net_device *dev = rt->fib6_nh.nh_dev;
4735 struct rtnexthop *rtnh;
4736 unsigned int flags = 0;
4738 rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
4740 goto nla_put_failure;
4742 rtnh->rtnh_hops = rt->fib6_nh.nh_weight - 1;
4743 rtnh->rtnh_ifindex = dev ? dev->ifindex : 0;
4745 if (rt6_nexthop_info(skb, rt, &flags, true) < 0)
4746 goto nla_put_failure;
4748 rtnh->rtnh_flags = flags;
4750 /* length of rtnetlink header + attributes */
4751 rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
4759 static int rt6_fill_node(struct net *net, struct sk_buff *skb,
4760 struct fib6_info *rt, struct dst_entry *dst,
4761 struct in6_addr *dest, struct in6_addr *src,
4762 int iif, int type, u32 portid, u32 seq,
4765 struct rt6_info *rt6 = (struct rt6_info *)dst;
4766 struct rt6key *rt6_dst, *rt6_src;
4767 u32 *pmetrics, table, rt6_flags;
4768 struct nlmsghdr *nlh;
4772 nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
4777 rt6_dst = &rt6->rt6i_dst;
4778 rt6_src = &rt6->rt6i_src;
4779 rt6_flags = rt6->rt6i_flags;
4781 rt6_dst = &rt->fib6_dst;
4782 rt6_src = &rt->fib6_src;
4783 rt6_flags = rt->fib6_flags;
4786 rtm = nlmsg_data(nlh);
4787 rtm->rtm_family = AF_INET6;
4788 rtm->rtm_dst_len = rt6_dst->plen;
4789 rtm->rtm_src_len = rt6_src->plen;
4792 table = rt->fib6_table->tb6_id;
4794 table = RT6_TABLE_UNSPEC;
4795 rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
4796 if (nla_put_u32(skb, RTA_TABLE, table))
4797 goto nla_put_failure;
4799 rtm->rtm_type = rt->fib6_type;
4801 rtm->rtm_scope = RT_SCOPE_UNIVERSE;
4802 rtm->rtm_protocol = rt->fib6_protocol;
4804 if (rt6_flags & RTF_CACHE)
4805 rtm->rtm_flags |= RTM_F_CLONED;
4808 if (nla_put_in6_addr(skb, RTA_DST, dest))
4809 goto nla_put_failure;
4810 rtm->rtm_dst_len = 128;
4811 } else if (rtm->rtm_dst_len)
4812 if (nla_put_in6_addr(skb, RTA_DST, &rt6_dst->addr))
4813 goto nla_put_failure;
4814 #ifdef CONFIG_IPV6_SUBTREES
4816 if (nla_put_in6_addr(skb, RTA_SRC, src))
4817 goto nla_put_failure;
4818 rtm->rtm_src_len = 128;
4819 } else if (rtm->rtm_src_len &&
4820 nla_put_in6_addr(skb, RTA_SRC, &rt6_src->addr))
4821 goto nla_put_failure;
4824 #ifdef CONFIG_IPV6_MROUTE
4825 if (ipv6_addr_is_multicast(&rt6_dst->addr)) {
4826 int err = ip6mr_get_route(net, skb, rtm, portid);
4831 goto nla_put_failure;
4834 if (nla_put_u32(skb, RTA_IIF, iif))
4835 goto nla_put_failure;
4837 struct in6_addr saddr_buf;
4838 if (ip6_route_get_saddr(net, rt, dest, 0, &saddr_buf) == 0 &&
4839 nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4840 goto nla_put_failure;
4843 if (rt->fib6_prefsrc.plen) {
4844 struct in6_addr saddr_buf;
4845 saddr_buf = rt->fib6_prefsrc.addr;
4846 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
4847 goto nla_put_failure;
4850 pmetrics = dst ? dst_metrics_ptr(dst) : rt->fib6_metrics->metrics;
4851 if (rtnetlink_put_metrics(skb, pmetrics) < 0)
4852 goto nla_put_failure;
4854 if (nla_put_u32(skb, RTA_PRIORITY, rt->fib6_metric))
4855 goto nla_put_failure;
4857 /* For multipath routes, walk the siblings list and add
4858 * each as a nexthop within RTA_MULTIPATH.
4861 if (rt6_flags & RTF_GATEWAY &&
4862 nla_put_in6_addr(skb, RTA_GATEWAY, &rt6->rt6i_gateway))
4863 goto nla_put_failure;
4865 if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
4866 goto nla_put_failure;
4867 } else if (rt->fib6_nsiblings) {
4868 struct fib6_info *sibling, *next_sibling;
4871 mp = nla_nest_start(skb, RTA_MULTIPATH);
4873 goto nla_put_failure;
4875 if (rt6_add_nexthop(skb, rt) < 0)
4876 goto nla_put_failure;
4878 list_for_each_entry_safe(sibling, next_sibling,
4879 &rt->fib6_siblings, fib6_siblings) {
4880 if (rt6_add_nexthop(skb, sibling) < 0)
4881 goto nla_put_failure;
4884 nla_nest_end(skb, mp);
4886 if (rt6_nexthop_info(skb, rt, &rtm->rtm_flags, false) < 0)
4887 goto nla_put_failure;
4890 if (rt6_flags & RTF_EXPIRES) {
4891 expires = dst ? dst->expires : rt->expires;
4895 if (rtnl_put_cacheinfo(skb, dst, 0, expires, dst ? dst->error : 0) < 0)
4896 goto nla_put_failure;
4898 if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt6_flags)))
4899 goto nla_put_failure;
4902 nlmsg_end(skb, nlh);
4906 nlmsg_cancel(skb, nlh);
4910 int rt6_dump_route(struct fib6_info *rt, void *p_arg)
4912 struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
4913 struct net *net = arg->net;
4915 if (rt == net->ipv6.fib6_null_entry)
4918 if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
4919 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
4921 /* user wants prefix routes only */
4922 if (rtm->rtm_flags & RTM_F_PREFIX &&
4923 !(rt->fib6_flags & RTF_PREFIX_RT)) {
4924 /* success since this is not a prefix route */
4929 return rt6_fill_node(net, arg->skb, rt, NULL, NULL, NULL, 0,
4930 RTM_NEWROUTE, NETLINK_CB(arg->cb->skb).portid,
4931 arg->cb->nlh->nlmsg_seq, NLM_F_MULTI);
4934 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh,
4935 struct netlink_ext_ack *extack)
4937 struct net *net = sock_net(in_skb->sk);
4938 struct nlattr *tb[RTA_MAX+1];
4939 int err, iif = 0, oif = 0;
4940 struct fib6_info *from;
4941 struct dst_entry *dst;
4942 struct rt6_info *rt;
4943 struct sk_buff *skb;
4948 err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy,
4954 memset(&fl6, 0, sizeof(fl6));
4955 rtm = nlmsg_data(nlh);
4956 fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
4957 fibmatch = !!(rtm->rtm_flags & RTM_F_FIB_MATCH);
4960 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
4963 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
4967 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
4970 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
4974 iif = nla_get_u32(tb[RTA_IIF]);
4977 oif = nla_get_u32(tb[RTA_OIF]);
4980 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
4983 fl6.flowi6_uid = make_kuid(current_user_ns(),
4984 nla_get_u32(tb[RTA_UID]));
4986 fl6.flowi6_uid = iif ? INVALID_UID : current_uid();
4989 fl6.fl6_sport = nla_get_be16(tb[RTA_SPORT]);
4992 fl6.fl6_dport = nla_get_be16(tb[RTA_DPORT]);
4994 if (tb[RTA_IP_PROTO]) {
4995 err = rtm_getroute_parse_ip_proto(tb[RTA_IP_PROTO],
4996 &fl6.flowi6_proto, AF_INET6,
5003 struct net_device *dev;
5008 dev = dev_get_by_index_rcu(net, iif);
5015 fl6.flowi6_iif = iif;
5017 if (!ipv6_addr_any(&fl6.saddr))
5018 flags |= RT6_LOOKUP_F_HAS_SADDR;
5020 dst = ip6_route_input_lookup(net, dev, &fl6, NULL, flags);
5024 fl6.flowi6_oif = oif;
5026 dst = ip6_route_output(net, NULL, &fl6);
5030 rt = container_of(dst, struct rt6_info, dst);
5031 if (rt->dst.error) {
5032 err = rt->dst.error;
5037 if (rt == net->ipv6.ip6_null_entry) {
5038 err = rt->dst.error;
5043 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
5050 skb_dst_set(skb, &rt->dst);
5053 from = rcu_dereference(rt->from);
5056 err = rt6_fill_node(net, skb, from, NULL, NULL, NULL,
5058 NETLINK_CB(in_skb).portid,
5061 err = rt6_fill_node(net, skb, from, dst, &fl6.daddr,
5062 &fl6.saddr, iif, RTM_NEWROUTE,
5063 NETLINK_CB(in_skb).portid,
5075 err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
void inet6_rt_notify(int event, struct fib6_info *rt, struct nl_info *info,
		     unsigned int nlm_flags)
{
	struct sk_buff *skb;
	struct net *net = info->nl_net;
	u32 seq;
	int err;

	err = -ENOBUFS;
	seq = info->nlh ? info->nlh->nlmsg_seq : 0;

	skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
	if (!skb)
		goto errout;

	err = rt6_fill_node(net, skb, rt, NULL, NULL, NULL, 0,
			    event, info->portid, seq, nlm_flags);
	if (err < 0) {
		/* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
		WARN_ON(err == -EMSGSIZE);
		kfree_skb(skb);
		goto errout;
	}
	rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
		    info->nlh, gfp_any());
	return;
errout:
	if (err < 0)
		rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
}
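/* Loopback device notifier: on NETDEV_REGISTER point the special null,
 * prohibit and blackhole entries at the loopback device; on the final
 * NETDEV_UNREGISTER drop the inet6_dev references again.
 */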
static int ip6_route_dev_notify(struct notifier_block *this,
				unsigned long event, void *ptr)
{
	struct net_device *dev = netdev_notifier_info_to_dev(ptr);
	struct net *net = dev_net(dev);

	if (!(dev->flags & IFF_LOOPBACK))
		return NOTIFY_OK;

	if (event == NETDEV_REGISTER) {
		net->ipv6.fib6_null_entry->fib6_nh.nh_dev = dev;
		net->ipv6.ip6_null_entry->dst.dev = dev;
		net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		net->ipv6.ip6_prohibit_entry->dst.dev = dev;
		net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
		net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
		net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
#endif
	} else if (event == NETDEV_UNREGISTER &&
		   dev->reg_state != NETREG_UNREGISTERED) {
		/* NETDEV_UNREGISTER could be fired for multiple times by
		 * netdev_wait_allrefs(). Make sure we only call this once.
		 */
		in6_dev_put_clear(&net->ipv6.ip6_null_entry->rt6i_idev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
		in6_dev_put_clear(&net->ipv6.ip6_prohibit_entry->rt6i_idev);
		in6_dev_put_clear(&net->ipv6.ip6_blk_hole_entry->rt6i_idev);
#endif
	}

	return NOTIFY_OK;
}
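/* /proc/net/rt6_stats: a single line of seven hex fields (fib nodes, route
 * nodes, route allocations, route entries, cached routes, dst entries in use,
 * discarded routes), produced by rt6_stats_seq_show() below.
 */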
#ifdef CONFIG_PROC_FS
static int rt6_stats_seq_show(struct seq_file *seq, void *v)
{
	struct net *net = (struct net *)seq->private;

	seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
		   net->ipv6.rt6_stats->fib_nodes,
		   net->ipv6.rt6_stats->fib_route_nodes,
		   atomic_read(&net->ipv6.rt6_stats->fib_rt_alloc),
		   net->ipv6.rt6_stats->fib_rt_entries,
		   net->ipv6.rt6_stats->fib_rt_cache,
		   dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
		   net->ipv6.rt6_stats->fib_discarded_routes);

	return 0;
}
#endif	/* CONFIG_PROC_FS */
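/* sysctl knobs for the IPv6 routing cache; once the template below is
 * registered for a namespace they appear under net.ipv6.route.*
 * (e.g. /proc/sys/net/ipv6/route/flush).  Writing to "flush" triggers an
 * immediate fib6_run_gc() pass via ipv6_sysctl_rtcache_flush().
 */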
#ifdef CONFIG_SYSCTL

static
int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
			      void __user *buffer, size_t *lenp, loff_t *ppos)
{
	struct net *net;
	int delay;
	if (!write)
		return -EINVAL;

	net = (struct net *)ctl->extra1;
	delay = net->ipv6.sysctl.flush_delay;
	proc_dointvec(ctl, write, buffer, lenp, ppos);
	fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
	return 0;
}

struct ctl_table ipv6_route_table_template[] = {
	{
		.procname	= "flush",
		.data		= &init_net.ipv6.sysctl.flush_delay,
		.maxlen		= sizeof(int),
		.mode		= 0200,
		.proc_handler	= ipv6_sysctl_rtcache_flush
	},
	{
		.procname	= "gc_thresh",
		.data		= &ip6_dst_ops_template.gc_thresh,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "max_size",
		.data		= &init_net.ipv6.sysctl.ip6_rt_max_size,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_timeout",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_interval",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "gc_elasticity",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "mtu_expires",
		.data		= &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_jiffies,
	},
	{
		.procname	= "min_adv_mss",
		.data		= &init_net.ipv6.sysctl.ip6_rt_min_advmss,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec,
	},
	{
		.procname	= "gc_min_interval_ms",
		.data		= &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
		.maxlen		= sizeof(int),
		.mode		= 0644,
		.proc_handler	= proc_dointvec_ms_jiffies,
	},
	{ }
};
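/* Clone the template for a namespace and point each entry's .data at the
 * per-netns values; the "flush" entry is hidden from namespaces that are not
 * owned by the initial user namespace.
 */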
struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
{
	struct ctl_table *table;

	table = kmemdup(ipv6_route_table_template,
			sizeof(ipv6_route_table_template),
			GFP_KERNEL);

	if (table) {
		table[0].data = &net->ipv6.sysctl.flush_delay;
		table[0].extra1 = net;
		table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
		table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
		table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
		table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
		table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
		table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
		table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
		table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
		table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;

		/* Don't export sysctls to unprivileged users */
		if (net->user_ns != &init_user_ns)
			table[0].procname = NULL;
	}

	return table;
}
#endif
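/* Per-namespace setup: copy the dst_ops template, allocate the null (and,
 * with multiple tables, prohibit/blackhole) routes, and seed the default
 * garbage-collection and PMTU sysctl values.
 */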
static int __net_init ip6_route_net_init(struct net *net)
{
	int ret = -ENOMEM;

	memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
	       sizeof(net->ipv6.ip6_dst_ops));

	if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
		goto out_ip6_dst_ops;

	net->ipv6.fib6_null_entry = kmemdup(&fib6_null_entry_template,
					    sizeof(*net->ipv6.fib6_null_entry),
					    GFP_KERNEL);
	if (!net->ipv6.fib6_null_entry)
		goto out_ip6_dst_entries;

	net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
					   sizeof(*net->ipv6.ip6_null_entry),
					   GFP_KERNEL);
	if (!net->ipv6.ip6_null_entry)
		goto out_fib6_null_entry;
	net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
			 ip6_template_metrics, true);

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	net->ipv6.fib6_has_custom_rules = false;
	net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
					       sizeof(*net->ipv6.ip6_prohibit_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_prohibit_entry)
		goto out_ip6_null_entry;
	net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
			 ip6_template_metrics, true);

	net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
					       sizeof(*net->ipv6.ip6_blk_hole_entry),
					       GFP_KERNEL);
	if (!net->ipv6.ip6_blk_hole_entry)
		goto out_ip6_prohibit_entry;
	net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
	dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
			 ip6_template_metrics, true);
#endif

	net->ipv6.sysctl.flush_delay = 0;
	net->ipv6.sysctl.ip6_rt_max_size = 4096;
	net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
	net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
	net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
	net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
	net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
	net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;

	net->ipv6.ip6_rt_gc_expire = 30*HZ;

	ret = 0;
out:
	return ret;

#ifdef CONFIG_IPV6_MULTIPLE_TABLES
out_ip6_prohibit_entry:
	kfree(net->ipv6.ip6_prohibit_entry);
out_ip6_null_entry:
	kfree(net->ipv6.ip6_null_entry);
#endif
out_fib6_null_entry:
	kfree(net->ipv6.fib6_null_entry);
out_ip6_dst_entries:
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
out_ip6_dst_ops:
	goto out;
}
static void __net_exit ip6_route_net_exit(struct net *net)
{
	kfree(net->ipv6.fib6_null_entry);
	kfree(net->ipv6.ip6_null_entry);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	kfree(net->ipv6.ip6_prohibit_entry);
	kfree(net->ipv6.ip6_blk_hole_entry);
#endif
	dst_entries_destroy(&net->ipv6.ip6_dst_ops);
}
static int __net_init ip6_route_net_init_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	proc_create_net("ipv6_route", 0, net->proc_net, &ipv6_route_seq_ops,
			sizeof(struct ipv6_route_iter));
	proc_create_net_single("rt6_stats", 0444, net->proc_net,
			rt6_stats_seq_show, NULL);
#endif
	return 0;
}

static void __net_exit ip6_route_net_exit_late(struct net *net)
{
#ifdef CONFIG_PROC_FS
	remove_proc_entry("ipv6_route", net->proc_net);
	remove_proc_entry("rt6_stats", net->proc_net);
#endif
}
static struct pernet_operations ip6_route_net_ops = {
	.init = ip6_route_net_init,
	.exit = ip6_route_net_exit,
};
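/* Each namespace gets its own inet_peer base for IPv6 peer information. */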
static int __net_init ipv6_inetpeer_init(struct net *net)
{
	struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);

	if (!bp)
		return -ENOMEM;
	inet_peer_base_init(bp);
	net->ipv6.peers = bp;
	return 0;
}

static void __net_exit ipv6_inetpeer_exit(struct net *net)
{
	struct inet_peer_base *bp = net->ipv6.peers;

	net->ipv6.peers = NULL;
	inetpeer_invalidate_tree(bp);
	kfree(bp);
}
static struct pernet_operations ipv6_inetpeer_ops = {
	.init = ipv6_inetpeer_init,
	.exit = ipv6_inetpeer_exit,
};

static struct pernet_operations ip6_route_net_late_ops = {
	.init = ip6_route_net_init_late,
	.exit = ip6_route_net_exit_late,
};

static struct notifier_block ip6_route_dev_notifier = {
	.notifier_call = ip6_route_dev_notify,
	.priority = ADDRCONF_NOTIFY_PRIORITY - 10,
};
void __init ip6_route_init_special_entries(void)
{
	/* Registering of the loopback is done before this portion of code,
	 * the loopback reference in rt6_info will not be taken, do it
	 * manually for init_net */
	init_net.ipv6.fib6_null_entry->fib6_nh.nh_dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#ifdef CONFIG_IPV6_MULTIPLE_TABLES
	init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
	init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
	init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
#endif
}
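/* Module init: create the rt6_info slab cache, register the per-netns
 * subsystems, bring up the FIB, xfrm6 and policy-rule layers, hook up the
 * RTM_NEWROUTE/RTM_DELROUTE/RTM_GETROUTE handlers and the device notifier,
 * and initialise the per-CPU uncached route lists.  Errors unwind in reverse
 * order through the labels at the end of the function.
 */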
int __init ip6_route_init(void)
{
	int ret;
	int cpu;

	ret = -ENOMEM;
	ip6_dst_ops_template.kmem_cachep =
		kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
				  SLAB_HWCACHE_ALIGN, NULL);
	if (!ip6_dst_ops_template.kmem_cachep)
		goto out;

	ret = dst_entries_init(&ip6_dst_blackhole_ops);
	if (ret)
		goto out_kmem_cache;

	ret = register_pernet_subsys(&ipv6_inetpeer_ops);
	if (ret)
		goto out_dst_entries;

	ret = register_pernet_subsys(&ip6_route_net_ops);
	if (ret)
		goto out_register_inetpeer;

	ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;

	ret = fib6_init();
	if (ret)
		goto out_register_subsys;

	ret = xfrm6_init();
	if (ret)
		goto out_fib6_init;

	ret = fib6_rules_init();
	if (ret)
		goto xfrm6_init;

	ret = register_pernet_subsys(&ip6_route_net_late_ops);
	if (ret)
		goto fib6_rules_init;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_NEWROUTE,
				   inet6_rtm_newroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_DELROUTE,
				   inet6_rtm_delroute, NULL, 0);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE,
				   inet6_rtm_getroute, NULL,
				   RTNL_FLAG_DOIT_UNLOCKED);
	if (ret < 0)
		goto out_register_late_subsys;

	ret = register_netdevice_notifier(&ip6_route_dev_notifier);
	if (ret)
		goto out_register_late_subsys;

	for_each_possible_cpu(cpu) {
		struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);

		INIT_LIST_HEAD(&ul->head);
		spin_lock_init(&ul->lock);
	}

out:
	return ret;

out_register_late_subsys:
	rtnl_unregister_all(PF_INET6);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
fib6_rules_init:
	fib6_rules_cleanup();
xfrm6_init:
	xfrm6_fini();
out_fib6_init:
	fib6_gc_cleanup();
out_register_subsys:
	unregister_pernet_subsys(&ip6_route_net_ops);
out_register_inetpeer:
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
out_dst_entries:
	dst_entries_destroy(&ip6_dst_blackhole_ops);
out_kmem_cache:
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
	goto out;
}
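/* Tear down everything ip6_route_init() set up, in reverse order. */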
void ip6_route_cleanup(void)
{
	unregister_netdevice_notifier(&ip6_route_dev_notifier);
	unregister_pernet_subsys(&ip6_route_net_late_ops);
	fib6_rules_cleanup();
	xfrm6_fini();
	fib6_gc_cleanup();
	unregister_pernet_subsys(&ipv6_inetpeer_ops);
	unregister_pernet_subsys(&ip6_route_net_ops);
	dst_entries_destroy(&ip6_dst_blackhole_ops);
	kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
}