1 /*
2  *      Linux INET6 implementation
3  *      FIB front-end.
4  *
5  *      Authors:
6  *      Pedro Roque             <roque@di.fc.ul.pt>
7  *
8  *      This program is free software; you can redistribute it and/or
9  *      modify it under the terms of the GNU General Public License
10  *      as published by the Free Software Foundation; either version
11  *      2 of the License, or (at your option) any later version.
12  */
13
14 /*      Changes:
15  *
16  *      YOSHIFUJI Hideaki @USAGI
17  *              reworked default router selection.
18  *              - respect outgoing interface
19  *              - select from (probably) reachable routers (i.e.
20  *              routers in REACHABLE, STALE, DELAY or PROBE states).
21  *              - always select the same router if it is (probably)
22  *              reachable.  otherwise, round-robin the list.
23  *      Ville Nuorvala
24  *              Fixed routing subtrees.
25  */
26
27 #define pr_fmt(fmt) "IPv6: " fmt
28
29 #include <linux/capability.h>
30 #include <linux/errno.h>
31 #include <linux/export.h>
32 #include <linux/types.h>
33 #include <linux/times.h>
34 #include <linux/socket.h>
35 #include <linux/sockios.h>
36 #include <linux/net.h>
37 #include <linux/route.h>
38 #include <linux/netdevice.h>
39 #include <linux/in6.h>
40 #include <linux/mroute6.h>
41 #include <linux/init.h>
42 #include <linux/if_arp.h>
43 #include <linux/proc_fs.h>
44 #include <linux/seq_file.h>
45 #include <linux/nsproxy.h>
46 #include <linux/slab.h>
47 #include <net/net_namespace.h>
48 #include <net/snmp.h>
49 #include <net/ipv6.h>
50 #include <net/ip6_fib.h>
51 #include <net/ip6_route.h>
52 #include <net/ndisc.h>
53 #include <net/addrconf.h>
54 #include <net/tcp.h>
55 #include <linux/rtnetlink.h>
56 #include <net/dst.h>
57 #include <net/dst_metadata.h>
58 #include <net/xfrm.h>
59 #include <net/netevent.h>
60 #include <net/netlink.h>
61 #include <net/nexthop.h>
62 #include <net/lwtunnel.h>
63 #include <net/ip_tunnels.h>
64 #include <net/l3mdev.h>
65 #include <trace/events/fib6.h>
66
67 #include <asm/uaccess.h>
68
69 #ifdef CONFIG_SYSCTL
70 #include <linux/sysctl.h>
71 #endif
72
73 enum rt6_nud_state {
74         RT6_NUD_FAIL_HARD = -3,
75         RT6_NUD_FAIL_PROBE = -2,
76         RT6_NUD_FAIL_DO_RR = -1,
77         RT6_NUD_SUCCEED = 1
78 };
79
80 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort);
81 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie);
82 static unsigned int      ip6_default_advmss(const struct dst_entry *dst);
83 static unsigned int      ip6_mtu(const struct dst_entry *dst);
84 static struct dst_entry *ip6_negative_advice(struct dst_entry *);
85 static void             ip6_dst_destroy(struct dst_entry *);
86 static void             ip6_dst_ifdown(struct dst_entry *,
87                                        struct net_device *dev, int how);
88 static int               ip6_dst_gc(struct dst_ops *ops);
89
90 static int              ip6_pkt_discard(struct sk_buff *skb);
91 static int              ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb);
92 static int              ip6_pkt_prohibit(struct sk_buff *skb);
93 static int              ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb);
94 static void             ip6_link_failure(struct sk_buff *skb);
95 static void             ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
96                                            struct sk_buff *skb, u32 mtu);
97 static void             rt6_do_redirect(struct dst_entry *dst, struct sock *sk,
98                                         struct sk_buff *skb);
99 static void             rt6_dst_from_metrics_check(struct rt6_info *rt);
100 static int rt6_score_route(struct rt6_info *rt, int oif, int strict);
101
102 #ifdef CONFIG_IPV6_ROUTE_INFO
103 static struct rt6_info *rt6_add_route_info(struct net *net,
104                                            const struct in6_addr *prefix, int prefixlen,
105                                            const struct in6_addr *gwaddr,
106                                            struct net_device *dev,
107                                            unsigned int pref);
108 static struct rt6_info *rt6_get_route_info(struct net *net,
109                                            const struct in6_addr *prefix, int prefixlen,
110                                            const struct in6_addr *gwaddr,
111                                            struct net_device *dev);
112 #endif
113
114 struct uncached_list {
115         spinlock_t              lock;
116         struct list_head        head;
117 };
118
119 static DEFINE_PER_CPU_ALIGNED(struct uncached_list, rt6_uncached_list);
120
121 static void rt6_uncached_list_add(struct rt6_info *rt)
122 {
123         struct uncached_list *ul = raw_cpu_ptr(&rt6_uncached_list);
124
125         rt->dst.flags |= DST_NOCACHE;
126         rt->rt6i_uncached_list = ul;
127
128         spin_lock_bh(&ul->lock);
129         list_add_tail(&rt->rt6i_uncached, &ul->head);
130         spin_unlock_bh(&ul->lock);
131 }
132
133 static void rt6_uncached_list_del(struct rt6_info *rt)
134 {
135         if (!list_empty(&rt->rt6i_uncached)) {
136                 struct uncached_list *ul = rt->rt6i_uncached_list;
137
138                 spin_lock_bh(&ul->lock);
139                 list_del(&rt->rt6i_uncached);
140                 spin_unlock_bh(&ul->lock);
141         }
142 }
143
144 static void rt6_uncached_list_flush_dev(struct net *net, struct net_device *dev)
145 {
146         struct net_device *loopback_dev = net->loopback_dev;
147         int cpu;
148
149         if (dev == loopback_dev)
150                 return;
151
152         for_each_possible_cpu(cpu) {
153                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
154                 struct rt6_info *rt;
155
156                 spin_lock_bh(&ul->lock);
157                 list_for_each_entry(rt, &ul->head, rt6i_uncached) {
158                         struct inet6_dev *rt_idev = rt->rt6i_idev;
159                         struct net_device *rt_dev = rt->dst.dev;
160
161                         if (rt_idev->dev == dev) {
162                                 rt->rt6i_idev = in6_dev_get(loopback_dev);
163                                 in6_dev_put(rt_idev);
164                         }
165
166                         if (rt_dev == dev) {
167                                 rt->dst.dev = loopback_dev;
168                                 dev_hold(rt->dst.dev);
169                                 dev_put(rt_dev);
170                         }
171                 }
172                 spin_unlock_bh(&ul->lock);
173         }
174 }
175
176 static u32 *rt6_pcpu_cow_metrics(struct rt6_info *rt)
177 {
178         return dst_metrics_write_ptr(rt->dst.from);
179 }
180
181 static u32 *ipv6_cow_metrics(struct dst_entry *dst, unsigned long old)
182 {
183         struct rt6_info *rt = (struct rt6_info *)dst;
184
185         if (rt->rt6i_flags & RTF_PCPU)
186                 return rt6_pcpu_cow_metrics(rt);
187         else if (rt->rt6i_flags & RTF_CACHE)
188                 return NULL;
189         else
190                 return dst_cow_metrics_generic(dst, old);
191 }
192
193 static inline const void *choose_neigh_daddr(struct rt6_info *rt,
194                                              struct sk_buff *skb,
195                                              const void *daddr)
196 {
197         struct in6_addr *p = &rt->rt6i_gateway;
198
199         if (!ipv6_addr_any(p))
200                 return (const void *) p;
201         else if (skb)
202                 return &ipv6_hdr(skb)->daddr;
203         return daddr;
204 }
205
206 static struct neighbour *ip6_neigh_lookup(const struct dst_entry *dst,
207                                           struct sk_buff *skb,
208                                           const void *daddr)
209 {
210         struct rt6_info *rt = (struct rt6_info *) dst;
211         struct neighbour *n;
212
213         daddr = choose_neigh_daddr(rt, skb, daddr);
214         n = __ipv6_neigh_lookup(dst->dev, daddr);
215         if (n)
216                 return n;
217         return neigh_create(&nd_tbl, daddr, dst->dev);
218 }
219
220 static struct dst_ops ip6_dst_ops_template = {
221         .family                 =       AF_INET6,
222         .gc                     =       ip6_dst_gc,
223         .gc_thresh              =       1024,
224         .check                  =       ip6_dst_check,
225         .default_advmss         =       ip6_default_advmss,
226         .mtu                    =       ip6_mtu,
227         .cow_metrics            =       ipv6_cow_metrics,
228         .destroy                =       ip6_dst_destroy,
229         .ifdown                 =       ip6_dst_ifdown,
230         .negative_advice        =       ip6_negative_advice,
231         .link_failure           =       ip6_link_failure,
232         .update_pmtu            =       ip6_rt_update_pmtu,
233         .redirect               =       rt6_do_redirect,
234         .local_out              =       __ip6_local_out,
235         .neigh_lookup           =       ip6_neigh_lookup,
236 };
237
238 static unsigned int ip6_blackhole_mtu(const struct dst_entry *dst)
239 {
240         unsigned int mtu = dst_metric_raw(dst, RTAX_MTU);
241
242         return mtu ? : dst->dev->mtu;
243 }
244
245 static void ip6_rt_blackhole_update_pmtu(struct dst_entry *dst, struct sock *sk,
246                                          struct sk_buff *skb, u32 mtu)
247 {
248 }
249
250 static void ip6_rt_blackhole_redirect(struct dst_entry *dst, struct sock *sk,
251                                       struct sk_buff *skb)
252 {
253 }
254
255 static struct dst_ops ip6_dst_blackhole_ops = {
256         .family                 =       AF_INET6,
257         .destroy                =       ip6_dst_destroy,
258         .check                  =       ip6_dst_check,
259         .mtu                    =       ip6_blackhole_mtu,
260         .default_advmss         =       ip6_default_advmss,
261         .update_pmtu            =       ip6_rt_blackhole_update_pmtu,
262         .redirect               =       ip6_rt_blackhole_redirect,
263         .cow_metrics            =       dst_cow_metrics_generic,
264         .neigh_lookup           =       ip6_neigh_lookup,
265 };
266
267 static const u32 ip6_template_metrics[RTAX_MAX] = {
268         [RTAX_HOPLIMIT - 1] = 0,
269 };
270
271 static const struct rt6_info ip6_null_entry_template = {
272         .dst = {
273                 .__refcnt       = ATOMIC_INIT(1),
274                 .__use          = 1,
275                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
276                 .error          = -ENETUNREACH,
277                 .input          = ip6_pkt_discard,
278                 .output         = ip6_pkt_discard_out,
279         },
280         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
281         .rt6i_protocol  = RTPROT_KERNEL,
282         .rt6i_metric    = ~(u32) 0,
283         .rt6i_ref       = ATOMIC_INIT(1),
284 };
285
286 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
287
288 static const struct rt6_info ip6_prohibit_entry_template = {
289         .dst = {
290                 .__refcnt       = ATOMIC_INIT(1),
291                 .__use          = 1,
292                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
293                 .error          = -EACCES,
294                 .input          = ip6_pkt_prohibit,
295                 .output         = ip6_pkt_prohibit_out,
296         },
297         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
298         .rt6i_protocol  = RTPROT_KERNEL,
299         .rt6i_metric    = ~(u32) 0,
300         .rt6i_ref       = ATOMIC_INIT(1),
301 };
302
303 static const struct rt6_info ip6_blk_hole_entry_template = {
304         .dst = {
305                 .__refcnt       = ATOMIC_INIT(1),
306                 .__use          = 1,
307                 .obsolete       = DST_OBSOLETE_FORCE_CHK,
308                 .error          = -EINVAL,
309                 .input          = dst_discard,
310                 .output         = dst_discard_out,
311         },
312         .rt6i_flags     = (RTF_REJECT | RTF_NONEXTHOP),
313         .rt6i_protocol  = RTPROT_KERNEL,
314         .rt6i_metric    = ~(u32) 0,
315         .rt6i_ref       = ATOMIC_INIT(1),
316 };
317
318 #endif
319
320 static void rt6_info_init(struct rt6_info *rt)
321 {
322         struct dst_entry *dst = &rt->dst;
323
324         memset(dst + 1, 0, sizeof(*rt) - sizeof(*dst));
325         INIT_LIST_HEAD(&rt->rt6i_siblings);
326         INIT_LIST_HEAD(&rt->rt6i_uncached);
327 }
328
329 /* allocate dst with ip6_dst_ops */
330 static struct rt6_info *__ip6_dst_alloc(struct net *net,
331                                         struct net_device *dev,
332                                         int flags)
333 {
334         struct rt6_info *rt = dst_alloc(&net->ipv6.ip6_dst_ops, dev,
335                                         0, DST_OBSOLETE_FORCE_CHK, flags);
336
337         if (rt)
338                 rt6_info_init(rt);
339
340         return rt;
341 }
342
343 struct rt6_info *ip6_dst_alloc(struct net *net,
344                                struct net_device *dev,
345                                int flags)
346 {
347         struct rt6_info *rt = __ip6_dst_alloc(net, dev, flags);
348
349         if (rt) {
350                 rt->rt6i_pcpu = alloc_percpu_gfp(struct rt6_info *, GFP_ATOMIC);
351                 if (rt->rt6i_pcpu) {
352                         int cpu;
353
354                         for_each_possible_cpu(cpu) {
355                                 struct rt6_info **p;
356
357                                 p = per_cpu_ptr(rt->rt6i_pcpu, cpu);
358                                 /* no one shares rt */
359                                 *p =  NULL;
360                         }
361                 } else {
362                         dst_destroy((struct dst_entry *)rt);
363                         return NULL;
364                 }
365         }
366
367         return rt;
368 }
369 EXPORT_SYMBOL(ip6_dst_alloc);
370
371 static void ip6_dst_destroy(struct dst_entry *dst)
372 {
373         struct rt6_info *rt = (struct rt6_info *)dst;
374         struct dst_entry *from = dst->from;
375         struct inet6_dev *idev;
376
377         dst_destroy_metrics_generic(dst);
378         free_percpu(rt->rt6i_pcpu);
379         rt6_uncached_list_del(rt);
380
381         idev = rt->rt6i_idev;
382         if (idev) {
383                 rt->rt6i_idev = NULL;
384                 in6_dev_put(idev);
385         }
386
387         dst->from = NULL;
388         dst_release(from);
389 }
390
391 static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
392                            int how)
393 {
394         struct rt6_info *rt = (struct rt6_info *)dst;
395         struct inet6_dev *idev = rt->rt6i_idev;
396         struct net_device *loopback_dev =
397                 dev_net(dev)->loopback_dev;
398
399         if (dev != loopback_dev) {
400                 if (idev && idev->dev == dev) {
401                         struct inet6_dev *loopback_idev =
402                                 in6_dev_get(loopback_dev);
403                         if (loopback_idev) {
404                                 rt->rt6i_idev = loopback_idev;
405                                 in6_dev_put(idev);
406                         }
407                 }
408         }
409 }
410
411 static bool __rt6_check_expired(const struct rt6_info *rt)
412 {
413         if (rt->rt6i_flags & RTF_EXPIRES)
414                 return time_after(jiffies, rt->dst.expires);
415         else
416                 return false;
417 }
418
419 static bool rt6_check_expired(const struct rt6_info *rt)
420 {
421         if (rt->rt6i_flags & RTF_EXPIRES) {
422                 if (time_after(jiffies, rt->dst.expires))
423                         return true;
424         } else if (rt->dst.from) {
425                 return rt6_check_expired((struct rt6_info *) rt->dst.from);
426         }
427         return false;
428 }
429
430 /* Multipath route selection:
431  *   Hash-based function using the packet header and flow label.
432  * Adapted from fib_info_hashfn()
433  */
434 static int rt6_info_hash_nhsfn(unsigned int candidate_count,
435                                const struct flowi6 *fl6)
436 {
437         return get_hash_from_flowi6(fl6) % candidate_count;
438 }
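
/*
 * Illustrative sketch only (hypothetical helper, not used by this file):
 * the selection above boils down to "flow hash modulo candidate count",
 * so every packet of a given flow keeps mapping to the same sibling route.
 */
static inline unsigned int example_multipath_index(u32 flow_hash,
                                                   unsigned int candidate_count)
{
        /* candidate_count is the matched route plus its siblings */
        return flow_hash % candidate_count;
}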
439
440 static struct rt6_info *rt6_multipath_select(struct rt6_info *match,
441                                              struct flowi6 *fl6, int oif,
442                                              int strict)
443 {
444         struct rt6_info *sibling, *next_sibling;
445         int route_choosen;
446
447         route_choosen = rt6_info_hash_nhsfn(match->rt6i_nsiblings + 1, fl6);
448         /* Don't change the route if route_choosen == 0
449          * (the siblings list does not include ourselves)
450          */
451         if (route_choosen)
452                 list_for_each_entry_safe(sibling, next_sibling,
453                                 &match->rt6i_siblings, rt6i_siblings) {
454                         route_choosen--;
455                         if (route_choosen == 0) {
456                                 if (rt6_score_route(sibling, oif, strict) < 0)
457                                         break;
458                                 match = sibling;
459                                 break;
460                         }
461                 }
462         return match;
463 }
464
465 /*
466  *      Route lookup. Any table->tb6_lock is implied.
467  */
468
469 static inline struct rt6_info *rt6_device_match(struct net *net,
470                                                     struct rt6_info *rt,
471                                                     const struct in6_addr *saddr,
472                                                     int oif,
473                                                     int flags)
474 {
475         struct rt6_info *local = NULL;
476         struct rt6_info *sprt;
477
478         if (!oif && ipv6_addr_any(saddr))
479                 goto out;
480
481         for (sprt = rt; sprt; sprt = sprt->dst.rt6_next) {
482                 struct net_device *dev = sprt->dst.dev;
483
484                 if (oif) {
485                         if (dev->ifindex == oif)
486                                 return sprt;
487                         if (dev->flags & IFF_LOOPBACK) {
488                                 if (!sprt->rt6i_idev ||
489                                     sprt->rt6i_idev->dev->ifindex != oif) {
490                                         if (flags & RT6_LOOKUP_F_IFACE)
491                                                 continue;
492                                         if (local &&
493                                             local->rt6i_idev->dev->ifindex == oif)
494                                                 continue;
495                                 }
496                                 local = sprt;
497                         }
498                 } else {
499                         if (ipv6_chk_addr(net, saddr, dev,
500                                           flags & RT6_LOOKUP_F_IFACE))
501                                 return sprt;
502                 }
503         }
504
505         if (oif) {
506                 if (local)
507                         return local;
508
509                 if (flags & RT6_LOOKUP_F_IFACE)
510                         return net->ipv6.ip6_null_entry;
511         }
512 out:
513         return rt;
514 }
515
516 #ifdef CONFIG_IPV6_ROUTER_PREF
517 struct __rt6_probe_work {
518         struct work_struct work;
519         struct in6_addr target;
520         struct net_device *dev;
521 };
522
523 static void rt6_probe_deferred(struct work_struct *w)
524 {
525         struct in6_addr mcaddr;
526         struct __rt6_probe_work *work =
527                 container_of(w, struct __rt6_probe_work, work);
528
529         addrconf_addr_solict_mult(&work->target, &mcaddr);
530         ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL);
531         dev_put(work->dev);
532         kfree(work);
533 }
534
535 static void rt6_probe(struct rt6_info *rt)
536 {
537         struct __rt6_probe_work *work;
538         struct neighbour *neigh;
539         /*
540          * Okay, this does not seem to be appropriate
541          * for now; however, we need to check if it
542          * is really so, aka Router Reachability Probing.
543          *
544          * Router Reachability Probe MUST be rate-limited
545          * to no more than one per minute.
546          */
547         if (!rt || !(rt->rt6i_flags & RTF_GATEWAY))
548                 return;
549         rcu_read_lock_bh();
550         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
551         if (neigh) {
552                 if (neigh->nud_state & NUD_VALID)
553                         goto out;
554
555                 work = NULL;
556                 write_lock(&neigh->lock);
557                 if (!(neigh->nud_state & NUD_VALID) &&
558                     time_after(jiffies,
559                                neigh->updated +
560                                rt->rt6i_idev->cnf.rtr_probe_interval)) {
561                         work = kmalloc(sizeof(*work), GFP_ATOMIC);
562                         if (work)
563                                 __neigh_set_probe_once(neigh);
564                 }
565                 write_unlock(&neigh->lock);
566         } else {
567                 work = kmalloc(sizeof(*work), GFP_ATOMIC);
568         }
569
570         if (work) {
571                 INIT_WORK(&work->work, rt6_probe_deferred);
572                 work->target = rt->rt6i_gateway;
573                 dev_hold(rt->dst.dev);
574                 work->dev = rt->dst.dev;
575                 schedule_work(&work->work);
576         }
577
578 out:
579         rcu_read_unlock_bh();
580 }
581 #else
582 static inline void rt6_probe(struct rt6_info *rt)
583 {
584 }
585 #endif
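
/*
 * Illustrative sketch (hypothetical helper, not part of the kernel API):
 * the rate limiting in rt6_probe() is a plain "not before the last
 * neighbour confirmation plus rtr_probe_interval" test, roughly:
 */
static inline bool example_probe_due(unsigned long last_confirmed,
                                     unsigned long interval)
{
        /* time_after() copes with jiffies wrap-around */
        return time_after(jiffies, last_confirmed + interval);
}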
586
587 /*
588  * Default Router Selection (RFC 2461 6.3.6)
589  */
590 static inline int rt6_check_dev(struct rt6_info *rt, int oif)
591 {
592         struct net_device *dev = rt->dst.dev;
593         if (!oif || dev->ifindex == oif)
594                 return 2;
595         if ((dev->flags & IFF_LOOPBACK) &&
596             rt->rt6i_idev && rt->rt6i_idev->dev->ifindex == oif)
597                 return 1;
598         return 0;
599 }
600
601 static inline enum rt6_nud_state rt6_check_neigh(struct rt6_info *rt)
602 {
603         struct neighbour *neigh;
604         enum rt6_nud_state ret = RT6_NUD_FAIL_HARD;
605
606         if (rt->rt6i_flags & RTF_NONEXTHOP ||
607             !(rt->rt6i_flags & RTF_GATEWAY))
608                 return RT6_NUD_SUCCEED;
609
610         rcu_read_lock_bh();
611         neigh = __ipv6_neigh_lookup_noref(rt->dst.dev, &rt->rt6i_gateway);
612         if (neigh) {
613                 read_lock(&neigh->lock);
614                 if (neigh->nud_state & NUD_VALID)
615                         ret = RT6_NUD_SUCCEED;
616 #ifdef CONFIG_IPV6_ROUTER_PREF
617                 else if (!(neigh->nud_state & NUD_FAILED))
618                         ret = RT6_NUD_SUCCEED;
619                 else
620                         ret = RT6_NUD_FAIL_PROBE;
621 #endif
622                 read_unlock(&neigh->lock);
623         } else {
624                 ret = IS_ENABLED(CONFIG_IPV6_ROUTER_PREF) ?
625                       RT6_NUD_SUCCEED : RT6_NUD_FAIL_DO_RR;
626         }
627         rcu_read_unlock_bh();
628
629         return ret;
630 }
631
632 static int rt6_score_route(struct rt6_info *rt, int oif,
633                            int strict)
634 {
635         int m;
636
637         m = rt6_check_dev(rt, oif);
638         if (!m && (strict & RT6_LOOKUP_F_IFACE))
639                 return RT6_NUD_FAIL_HARD;
640 #ifdef CONFIG_IPV6_ROUTER_PREF
641         m |= IPV6_DECODE_PREF(IPV6_EXTRACT_PREF(rt->rt6i_flags)) << 2;
642 #endif
643         if (strict & RT6_LOOKUP_F_REACHABLE) {
644                 int n = rt6_check_neigh(rt);
645                 if (n < 0)
646                         return n;
647         }
648         return m;
649 }
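
/*
 * Illustrative sketch (hypothetical helper): the score computed above packs
 * the interface match (0, 1 or 2) into the low two bits and, when
 * CONFIG_IPV6_ROUTER_PREF is enabled, the decoded RA preference above them,
 * so a preferred router on the requested interface wins a numeric compare.
 */
static inline int example_route_score(int dev_match, int decoded_pref)
{
        return dev_match | (decoded_pref << 2);
}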
650
651 static struct rt6_info *find_match(struct rt6_info *rt, int oif, int strict,
652                                    int *mpri, struct rt6_info *match,
653                                    bool *do_rr)
654 {
655         int m;
656         bool match_do_rr = false;
657         struct inet6_dev *idev = rt->rt6i_idev;
658         struct net_device *dev = rt->dst.dev;
659
660         if (dev && !netif_carrier_ok(dev) &&
661             idev->cnf.ignore_routes_with_linkdown &&
662             !(strict & RT6_LOOKUP_F_IGNORE_LINKSTATE))
663                 goto out;
664
665         if (rt6_check_expired(rt))
666                 goto out;
667
668         m = rt6_score_route(rt, oif, strict);
669         if (m == RT6_NUD_FAIL_DO_RR) {
670                 match_do_rr = true;
671                 m = 0; /* lowest valid score */
672         } else if (m == RT6_NUD_FAIL_HARD) {
673                 goto out;
674         }
675
676         if (strict & RT6_LOOKUP_F_REACHABLE)
677                 rt6_probe(rt);
678
679         /* note that m can be RT6_NUD_FAIL_PROBE at this point */
680         if (m > *mpri) {
681                 *do_rr = match_do_rr;
682                 *mpri = m;
683                 match = rt;
684         }
685 out:
686         return match;
687 }
688
689 static struct rt6_info *find_rr_leaf(struct fib6_node *fn,
690                                      struct rt6_info *rr_head,
691                                      u32 metric, int oif, int strict,
692                                      bool *do_rr)
693 {
694         struct rt6_info *rt, *match, *cont;
695         int mpri = -1;
696
697         match = NULL;
698         cont = NULL;
699         for (rt = rr_head; rt; rt = rt->dst.rt6_next) {
700                 if (rt->rt6i_metric != metric) {
701                         cont = rt;
702                         break;
703                 }
704
705                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
706         }
707
708         for (rt = fn->leaf; rt && rt != rr_head; rt = rt->dst.rt6_next) {
709                 if (rt->rt6i_metric != metric) {
710                         cont = rt;
711                         break;
712                 }
713
714                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
715         }
716
717         if (match || !cont)
718                 return match;
719
720         for (rt = cont; rt; rt = rt->dst.rt6_next)
721                 match = find_match(rt, oif, strict, &mpri, match, do_rr);
722
723         return match;
724 }
725
726 static struct rt6_info *rt6_select(struct fib6_node *fn, int oif, int strict)
727 {
728         struct rt6_info *match, *rt0;
729         struct net *net;
730         bool do_rr = false;
731
732         rt0 = fn->rr_ptr;
733         if (!rt0)
734                 fn->rr_ptr = rt0 = fn->leaf;
735
736         match = find_rr_leaf(fn, rt0, rt0->rt6i_metric, oif, strict,
737                              &do_rr);
738
739         if (do_rr) {
740                 struct rt6_info *next = rt0->dst.rt6_next;
741
742                 /* no entries matched; do round-robin */
743                 if (!next || next->rt6i_metric != rt0->rt6i_metric)
744                         next = fn->leaf;
745
746                 if (next != rt0)
747                         fn->rr_ptr = next;
748         }
749
750         net = dev_net(rt0->dst.dev);
751         return match ? match : net->ipv6.ip6_null_entry;
752 }
753
754 static bool rt6_is_gw_or_nonexthop(const struct rt6_info *rt)
755 {
756         return (rt->rt6i_flags & (RTF_NONEXTHOP | RTF_GATEWAY));
757 }
758
759 #ifdef CONFIG_IPV6_ROUTE_INFO
760 int rt6_route_rcv(struct net_device *dev, u8 *opt, int len,
761                   const struct in6_addr *gwaddr)
762 {
763         struct net *net = dev_net(dev);
764         struct route_info *rinfo = (struct route_info *) opt;
765         struct in6_addr prefix_buf, *prefix;
766         unsigned int pref;
767         unsigned long lifetime;
768         struct rt6_info *rt;
769
770         if (len < sizeof(struct route_info)) {
771                 return -EINVAL;
772         }
773
774         /* Sanity check for prefix_len and length */
775         if (rinfo->length > 3) {
776                 return -EINVAL;
777         } else if (rinfo->prefix_len > 128) {
778                 return -EINVAL;
779         } else if (rinfo->prefix_len > 64) {
780                 if (rinfo->length < 2) {
781                         return -EINVAL;
782                 }
783         } else if (rinfo->prefix_len > 0) {
784                 if (rinfo->length < 1) {
785                         return -EINVAL;
786                 }
787         }
788
789         pref = rinfo->route_pref;
790         if (pref == ICMPV6_ROUTER_PREF_INVALID)
791                 return -EINVAL;
792
793         lifetime = addrconf_timeout_fixup(ntohl(rinfo->lifetime), HZ);
794
795         if (rinfo->length == 3)
796                 prefix = (struct in6_addr *)rinfo->prefix;
797         else {
798                 /* this function is safe */
799                 ipv6_addr_prefix(&prefix_buf,
800                                  (struct in6_addr *)rinfo->prefix,
801                                  rinfo->prefix_len);
802                 prefix = &prefix_buf;
803         }
804
805         if (rinfo->prefix_len == 0)
806                 rt = rt6_get_dflt_router(gwaddr, dev);
807         else
808                 rt = rt6_get_route_info(net, prefix, rinfo->prefix_len,
809                                         gwaddr, dev);
810
811         if (rt && !lifetime) {
812                 ip6_del_rt(rt);
813                 rt = NULL;
814         }
815
816         if (!rt && lifetime)
817                 rt = rt6_add_route_info(net, prefix, rinfo->prefix_len, gwaddr,
818                                         dev, pref);
819         else if (rt)
820                 rt->rt6i_flags = RTF_ROUTEINFO |
821                                  (rt->rt6i_flags & ~RTF_PREF_MASK) | RTF_PREF(pref);
822
823         if (rt) {
824                 if (!addrconf_finite_timeout(lifetime))
825                         rt6_clean_expires(rt);
826                 else
827                         rt6_set_expires(rt, jiffies + HZ * lifetime);
828
829                 ip6_rt_put(rt);
830         }
831         return 0;
832 }
833 #endif
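
/*
 * Worked example for the sanity checks above (RFC 4191 route information
 * option): the length field is in units of 8 octets, so length 1 carries
 * no prefix bits, length 2 carries up to 64 prefix bits, and length 3 the
 * full 128-bit prefix, which is exactly what rt6_route_rcv() enforces.
 */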
834
835 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
836                                         struct in6_addr *saddr)
837 {
838         struct fib6_node *pn;
839         while (1) {
840                 if (fn->fn_flags & RTN_TL_ROOT)
841                         return NULL;
842                 pn = fn->parent;
843                 if (FIB6_SUBTREE(pn) && FIB6_SUBTREE(pn) != fn)
844                         fn = fib6_lookup(FIB6_SUBTREE(pn), NULL, saddr);
845                 else
846                         fn = pn;
847                 if (fn->fn_flags & RTN_RTINFO)
848                         return fn;
849         }
850 }
851
852 static struct rt6_info *ip6_pol_route_lookup(struct net *net,
853                                              struct fib6_table *table,
854                                              struct flowi6 *fl6, int flags)
855 {
856         struct fib6_node *fn;
857         struct rt6_info *rt;
858
859         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
860                 flags &= ~RT6_LOOKUP_F_IFACE;
861
862         read_lock_bh(&table->tb6_lock);
863         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
864 restart:
865         rt = fn->leaf;
866         rt = rt6_device_match(net, rt, &fl6->saddr, fl6->flowi6_oif, flags);
867         if (rt->rt6i_nsiblings && fl6->flowi6_oif == 0)
868                 rt = rt6_multipath_select(rt, fl6, fl6->flowi6_oif, flags);
869         if (rt == net->ipv6.ip6_null_entry) {
870                 fn = fib6_backtrack(fn, &fl6->saddr);
871                 if (fn)
872                         goto restart;
873         }
874         dst_use(&rt->dst, jiffies);
875         read_unlock_bh(&table->tb6_lock);
876
877         trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
878
879         return rt;
880
881 }
882
883 struct dst_entry *ip6_route_lookup(struct net *net, struct flowi6 *fl6,
884                                     int flags)
885 {
886         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_lookup);
887 }
888 EXPORT_SYMBOL_GPL(ip6_route_lookup);
889
890 struct rt6_info *rt6_lookup(struct net *net, const struct in6_addr *daddr,
891                             const struct in6_addr *saddr, int oif, int strict)
892 {
893         struct flowi6 fl6 = {
894                 .flowi6_oif = oif,
895                 .daddr = *daddr,
896         };
897         struct dst_entry *dst;
898         int flags = strict ? RT6_LOOKUP_F_IFACE : 0;
899
900         if (saddr) {
901                 memcpy(&fl6.saddr, saddr, sizeof(*saddr));
902                 flags |= RT6_LOOKUP_F_HAS_SADDR;
903         }
904
905         dst = fib6_rule_lookup(net, &fl6, flags, ip6_pol_route_lookup);
906         if (dst->error == 0)
907                 return (struct rt6_info *) dst;
908
909         dst_release(dst);
910
911         return NULL;
912 }
913 EXPORT_SYMBOL(rt6_lookup);
914
915 /* ip6_ins_rt is called with FREE table->tb6_lock.
916    It takes the new route entry; if the addition fails for any reason,
917    the route is freed. In any case, if the caller does not hold a
918    reference, it may be destroyed.
919  */
920
921 static int __ip6_ins_rt(struct rt6_info *rt, struct nl_info *info,
922                         struct mx6_config *mxc)
923 {
924         int err;
925         struct fib6_table *table;
926
927         table = rt->rt6i_table;
928         write_lock_bh(&table->tb6_lock);
929         err = fib6_add(&table->tb6_root, rt, info, mxc);
930         write_unlock_bh(&table->tb6_lock);
931
932         return err;
933 }
934
935 int ip6_ins_rt(struct rt6_info *rt)
936 {
937         struct nl_info info = { .nl_net = dev_net(rt->dst.dev), };
938         struct mx6_config mxc = { .mx = NULL, };
939
940         return __ip6_ins_rt(rt, &info, &mxc);
941 }
942
943 static struct rt6_info *ip6_rt_cache_alloc(struct rt6_info *ort,
944                                            const struct in6_addr *daddr,
945                                            const struct in6_addr *saddr)
946 {
947         struct rt6_info *rt;
948
949         /*
950          *      Clone the route.
951          */
952
953         if (ort->rt6i_flags & (RTF_CACHE | RTF_PCPU))
954                 ort = (struct rt6_info *)ort->dst.from;
955
956         rt = __ip6_dst_alloc(dev_net(ort->dst.dev), ort->dst.dev, 0);
957
958         if (!rt)
959                 return NULL;
960
961         ip6_rt_copy_init(rt, ort);
962         rt->rt6i_flags |= RTF_CACHE;
963         rt->rt6i_metric = 0;
964         rt->dst.flags |= DST_HOST;
965         rt->rt6i_dst.addr = *daddr;
966         rt->rt6i_dst.plen = 128;
967
968         if (!rt6_is_gw_or_nonexthop(ort)) {
969                 if (ort->rt6i_dst.plen != 128 &&
970                     ipv6_addr_equal(&ort->rt6i_dst.addr, daddr))
971                         rt->rt6i_flags |= RTF_ANYCAST;
972 #ifdef CONFIG_IPV6_SUBTREES
973                 if (rt->rt6i_src.plen && saddr) {
974                         rt->rt6i_src.addr = *saddr;
975                         rt->rt6i_src.plen = 128;
976                 }
977 #endif
978         }
979
980         return rt;
981 }
982
983 static struct rt6_info *ip6_rt_pcpu_alloc(struct rt6_info *rt)
984 {
985         struct rt6_info *pcpu_rt;
986
987         pcpu_rt = __ip6_dst_alloc(dev_net(rt->dst.dev),
988                                   rt->dst.dev, rt->dst.flags);
989
990         if (!pcpu_rt)
991                 return NULL;
992         ip6_rt_copy_init(pcpu_rt, rt);
993         pcpu_rt->rt6i_protocol = rt->rt6i_protocol;
994         pcpu_rt->rt6i_flags |= RTF_PCPU;
995         return pcpu_rt;
996 }
997
998 /* It should be called with read_lock_bh(&tb6_lock) acquired */
999 static struct rt6_info *rt6_get_pcpu_route(struct rt6_info *rt)
1000 {
1001         struct rt6_info *pcpu_rt, **p;
1002
1003         p = this_cpu_ptr(rt->rt6i_pcpu);
1004         pcpu_rt = *p;
1005
1006         if (pcpu_rt) {
1007                 dst_hold(&pcpu_rt->dst);
1008                 rt6_dst_from_metrics_check(pcpu_rt);
1009         }
1010         return pcpu_rt;
1011 }
1012
1013 static struct rt6_info *rt6_make_pcpu_route(struct rt6_info *rt)
1014 {
1015         struct fib6_table *table = rt->rt6i_table;
1016         struct rt6_info *pcpu_rt, *prev, **p;
1017
1018         pcpu_rt = ip6_rt_pcpu_alloc(rt);
1019         if (!pcpu_rt) {
1020                 struct net *net = dev_net(rt->dst.dev);
1021
1022                 dst_hold(&net->ipv6.ip6_null_entry->dst);
1023                 return net->ipv6.ip6_null_entry;
1024         }
1025
1026         read_lock_bh(&table->tb6_lock);
1027         if (rt->rt6i_pcpu) {
1028                 p = this_cpu_ptr(rt->rt6i_pcpu);
1029                 prev = cmpxchg(p, NULL, pcpu_rt);
1030                 if (prev) {
1031                         /* If someone did it before us, return prev instead */
1032                         dst_destroy(&pcpu_rt->dst);
1033                         pcpu_rt = prev;
1034                 }
1035         } else {
1036                 /* rt has been removed from the fib6 tree
1037                  * before we have a chance to acquire the read_lock.
1038                  * In this case, don't bother to create a pcpu rt
1039                  * since rt is going away anyway.  The next
1040                  * dst_check() will trigger a re-lookup.
1041                  */
1042                 dst_destroy(&pcpu_rt->dst);
1043                 pcpu_rt = rt;
1044         }
1045         dst_hold(&pcpu_rt->dst);
1046         rt6_dst_from_metrics_check(pcpu_rt);
1047         read_unlock_bh(&table->tb6_lock);
1048         return pcpu_rt;
1049 }
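
/*
 * Illustrative sketch (hypothetical helper): the per-cpu install above is
 * the usual lock-free "publish once" idiom: only the first cmpxchg() wins;
 * any later writer sees the winner's pointer and drops its own copy.
 */
static inline struct rt6_info *example_publish_once(struct rt6_info **slot,
                                                    struct rt6_info *candidate)
{
        struct rt6_info *prev = cmpxchg(slot, NULL, candidate);

        /* NULL means our candidate was installed; otherwise prev won */
        return prev ? prev : candidate;
}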
1050
1051 struct rt6_info *ip6_pol_route(struct net *net, struct fib6_table *table,
1052                                int oif, struct flowi6 *fl6, int flags)
1053 {
1054         struct fib6_node *fn, *saved_fn;
1055         struct rt6_info *rt;
1056         int strict = 0;
1057
1058         strict |= flags & RT6_LOOKUP_F_IFACE;
1059         strict |= flags & RT6_LOOKUP_F_IGNORE_LINKSTATE;
1060         if (net->ipv6.devconf_all->forwarding == 0)
1061                 strict |= RT6_LOOKUP_F_REACHABLE;
1062
1063         read_lock_bh(&table->tb6_lock);
1064
1065         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1066         saved_fn = fn;
1067
1068         if (fl6->flowi6_flags & FLOWI_FLAG_SKIP_NH_OIF)
1069                 oif = 0;
1070
1071 redo_rt6_select:
1072         rt = rt6_select(fn, oif, strict);
1073         if (rt->rt6i_nsiblings)
1074                 rt = rt6_multipath_select(rt, fl6, oif, strict);
1075         if (rt == net->ipv6.ip6_null_entry) {
1076                 fn = fib6_backtrack(fn, &fl6->saddr);
1077                 if (fn)
1078                         goto redo_rt6_select;
1079                 else if (strict & RT6_LOOKUP_F_REACHABLE) {
1080                         /* also consider unreachable route */
1081                         strict &= ~RT6_LOOKUP_F_REACHABLE;
1082                         fn = saved_fn;
1083                         goto redo_rt6_select;
1084                 }
1085         }
1086
1087
1088         if (rt == net->ipv6.ip6_null_entry || (rt->rt6i_flags & RTF_CACHE)) {
1089                 dst_use(&rt->dst, jiffies);
1090                 read_unlock_bh(&table->tb6_lock);
1091
1092                 rt6_dst_from_metrics_check(rt);
1093
1094                 trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1095                 return rt;
1096         } else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
1097                             !(rt->rt6i_flags & RTF_GATEWAY))) {
1098                 /* Create a RTF_CACHE clone which will not be
1099                  * owned by the fib6 tree.  It is for the special case where
1100                  * the daddr in the skb during the neighbor look-up is different
1101          * from the fl6->daddr used to look up the route here.
1102                  */
1103
1104                 struct rt6_info *uncached_rt;
1105
1106                 dst_use(&rt->dst, jiffies);
1107                 read_unlock_bh(&table->tb6_lock);
1108
1109                 uncached_rt = ip6_rt_cache_alloc(rt, &fl6->daddr, NULL);
1110                 dst_release(&rt->dst);
1111
1112                 if (uncached_rt)
1113                         rt6_uncached_list_add(uncached_rt);
1114                 else
1115                         uncached_rt = net->ipv6.ip6_null_entry;
1116
1117                 dst_hold(&uncached_rt->dst);
1118
1119                 trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
1120                 return uncached_rt;
1121
1122         } else {
1123                 /* Get a percpu copy */
1124
1125                 struct rt6_info *pcpu_rt;
1126
1127                 rt->dst.lastuse = jiffies;
1128                 rt->dst.__use++;
1129                 pcpu_rt = rt6_get_pcpu_route(rt);
1130
1131                 if (pcpu_rt) {
1132                         read_unlock_bh(&table->tb6_lock);
1133                 } else {
1134                         /* We have to do the read_unlock first
1135                          * because rt6_make_pcpu_route() may trigger
1136                          * ip6_dst_gc() which will take the write_lock.
1137                          */
1138                         dst_hold(&rt->dst);
1139                         read_unlock_bh(&table->tb6_lock);
1140                         pcpu_rt = rt6_make_pcpu_route(rt);
1141                         dst_release(&rt->dst);
1142                 }
1143
1144                 trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
1145                 return pcpu_rt;
1146
1147         }
1148 }
1149 EXPORT_SYMBOL_GPL(ip6_pol_route);
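
/*
 * Illustrative sketch (hypothetical helper): ip6_pol_route() above ends in
 * one of three ways, summarised as:
 */
static inline int example_pol_route_outcome(u32 rt6i_flags, u32 flowi6_flags,
                                            bool is_null_entry)
{
        if (is_null_entry || (rt6i_flags & RTF_CACHE))
                return 0;       /* use the fib6 tree entry directly */
        if ((flowi6_flags & FLOWI_FLAG_KNOWN_NH) && !(rt6i_flags & RTF_GATEWAY))
                return 1;       /* uncached RTF_CACHE clone, skb daddr may differ */
        return 2;               /* per-cpu copy of the tree entry */
}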
1150
1151 static struct rt6_info *ip6_pol_route_input(struct net *net, struct fib6_table *table,
1152                                             struct flowi6 *fl6, int flags)
1153 {
1154         return ip6_pol_route(net, table, fl6->flowi6_iif, fl6, flags);
1155 }
1156
1157 struct dst_entry *ip6_route_input_lookup(struct net *net,
1158                                          struct net_device *dev,
1159                                          struct flowi6 *fl6, int flags)
1160 {
1161         if (rt6_need_strict(&fl6->daddr) && dev->type != ARPHRD_PIMREG)
1162                 flags |= RT6_LOOKUP_F_IFACE;
1163
1164         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_input);
1165 }
1166 EXPORT_SYMBOL_GPL(ip6_route_input_lookup);
1167
1168 void ip6_route_input(struct sk_buff *skb)
1169 {
1170         const struct ipv6hdr *iph = ipv6_hdr(skb);
1171         struct net *net = dev_net(skb->dev);
1172         int flags = RT6_LOOKUP_F_HAS_SADDR;
1173         struct ip_tunnel_info *tun_info;
1174         struct flowi6 fl6 = {
1175                 .flowi6_iif = skb->dev->ifindex,
1176                 .daddr = iph->daddr,
1177                 .saddr = iph->saddr,
1178                 .flowlabel = ip6_flowinfo(iph),
1179                 .flowi6_mark = skb->mark,
1180                 .flowi6_proto = iph->nexthdr,
1181         };
1182
1183         tun_info = skb_tunnel_info(skb);
1184         if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
1185                 fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
1186         skb_dst_drop(skb);
1187         skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
1188 }
1189
1190 static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table *table,
1191                                              struct flowi6 *fl6, int flags)
1192 {
1193         return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
1194 }
1195
1196 struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
1197                                          struct flowi6 *fl6, int flags)
1198 {
1199         bool any_src;
1200
1201         if (rt6_need_strict(&fl6->daddr)) {
1202                 struct dst_entry *dst;
1203
1204                 dst = l3mdev_link_scope_lookup(net, fl6);
1205                 if (dst)
1206                         return dst;
1207         }
1208
1209         fl6->flowi6_iif = LOOPBACK_IFINDEX;
1210
1211         any_src = ipv6_addr_any(&fl6->saddr);
1212         if ((sk && sk->sk_bound_dev_if) || rt6_need_strict(&fl6->daddr) ||
1213             (fl6->flowi6_oif && any_src))
1214                 flags |= RT6_LOOKUP_F_IFACE;
1215
1216         if (!any_src)
1217                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1218         else if (sk)
1219                 flags |= rt6_srcprefs2flags(inet6_sk(sk)->srcprefs);
1220
1221         return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
1222 }
1223 EXPORT_SYMBOL_GPL(ip6_route_output_flags);
1224
1225 struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
1226 {
1227         struct rt6_info *rt, *ort = (struct rt6_info *) dst_orig;
1228         struct dst_entry *new = NULL;
1229
1230         rt = dst_alloc(&ip6_dst_blackhole_ops, ort->dst.dev, 1, DST_OBSOLETE_NONE, 0);
1231         if (rt) {
1232                 rt6_info_init(rt);
1233
1234                 new = &rt->dst;
1235                 new->__use = 1;
1236                 new->input = dst_discard;
1237                 new->output = dst_discard_out;
1238
1239                 dst_copy_metrics(new, &ort->dst);
1240                 rt->rt6i_idev = ort->rt6i_idev;
1241                 if (rt->rt6i_idev)
1242                         in6_dev_hold(rt->rt6i_idev);
1243
1244                 rt->rt6i_gateway = ort->rt6i_gateway;
1245                 rt->rt6i_flags = ort->rt6i_flags & ~RTF_PCPU;
1246                 rt->rt6i_metric = 0;
1247
1248                 memcpy(&rt->rt6i_dst, &ort->rt6i_dst, sizeof(struct rt6key));
1249 #ifdef CONFIG_IPV6_SUBTREES
1250                 memcpy(&rt->rt6i_src, &ort->rt6i_src, sizeof(struct rt6key));
1251 #endif
1252
1253                 dst_free(new);
1254         }
1255
1256         dst_release(dst_orig);
1257         return new ? new : ERR_PTR(-ENOMEM);
1258 }
1259
1260 /*
1261  *      Destination cache support functions
1262  */
1263
1264 static void rt6_dst_from_metrics_check(struct rt6_info *rt)
1265 {
1266         if (rt->dst.from &&
1267             dst_metrics_ptr(&rt->dst) != dst_metrics_ptr(rt->dst.from))
1268                 dst_init_metrics(&rt->dst, dst_metrics_ptr(rt->dst.from), true);
1269 }
1270
1271 static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
1272 {
1273         u32 rt_cookie = 0;
1274
1275         if (!rt6_get_cookie_safe(rt, &rt_cookie) || rt_cookie != cookie)
1276                 return NULL;
1277
1278         if (rt6_check_expired(rt))
1279                 return NULL;
1280
1281         return &rt->dst;
1282 }
1283
1284 static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
1285 {
1286         if (!__rt6_check_expired(rt) &&
1287             rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
1288             rt6_check((struct rt6_info *)(rt->dst.from), cookie))
1289                 return &rt->dst;
1290         else
1291                 return NULL;
1292 }
1293
1294 static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
1295 {
1296         struct rt6_info *rt;
1297
1298         rt = (struct rt6_info *) dst;
1299
1300         /* All IPV6 dsts are created with ->obsolete set to the value
1301          * DST_OBSOLETE_FORCE_CHK, which always forces validation calls
1302          * down into this function.
1303          */
1304
1305         rt6_dst_from_metrics_check(rt);
1306
1307         if (rt->rt6i_flags & RTF_PCPU ||
1308             (unlikely(dst->flags & DST_NOCACHE) && rt->dst.from))
1309                 return rt6_dst_from_check(rt, cookie);
1310         else
1311                 return rt6_check(rt, cookie);
1312 }
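
/*
 * Illustrative sketch (hypothetical helper): the cookie checks above reduce
 * to comparing the serial number snapshot stored with the dst against the
 * current fib6 tree serial; any tree change bumps the serial and forces the
 * caller into a fresh route lookup.
 */
static inline bool example_dst_cookie_valid(u32 stored_cookie, u32 tree_cookie)
{
        return stored_cookie == tree_cookie;
}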
1313
1314 static struct dst_entry *ip6_negative_advice(struct dst_entry *dst)
1315 {
1316         struct rt6_info *rt = (struct rt6_info *) dst;
1317
1318         if (rt) {
1319                 if (rt->rt6i_flags & RTF_CACHE) {
1320                         if (rt6_check_expired(rt)) {
1321                                 ip6_del_rt(rt);
1322                                 dst = NULL;
1323                         }
1324                 } else {
1325                         dst_release(dst);
1326                         dst = NULL;
1327                 }
1328         }
1329         return dst;
1330 }
1331
1332 static void ip6_link_failure(struct sk_buff *skb)
1333 {
1334         struct rt6_info *rt;
1335
1336         icmpv6_send(skb, ICMPV6_DEST_UNREACH, ICMPV6_ADDR_UNREACH, 0);
1337
1338         rt = (struct rt6_info *) skb_dst(skb);
1339         if (rt) {
1340                 if (rt->rt6i_flags & RTF_CACHE) {
1341                         dst_hold(&rt->dst);
1342                         ip6_del_rt(rt);
1343                 } else {
1344                         struct fib6_node *fn;
1345
1346                         rcu_read_lock();
1347                         fn = rcu_dereference(rt->rt6i_node);
1348                         if (fn && (rt->rt6i_flags & RTF_DEFAULT))
1349                                 fn->fn_sernum = -1;
1350                         rcu_read_unlock();
1351                 }
1352         }
1353 }
1354
1355 static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
1356 {
1357         struct net *net = dev_net(rt->dst.dev);
1358
1359         rt->rt6i_flags |= RTF_MODIFIED;
1360         rt->rt6i_pmtu = mtu;
1361         rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
1362 }
1363
1364 static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
1365 {
1366         return !(rt->rt6i_flags & RTF_CACHE) &&
1367                 (rt->rt6i_flags & RTF_PCPU ||
1368                  rcu_access_pointer(rt->rt6i_node));
1369 }
1370
1371 static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
1372                                  const struct ipv6hdr *iph, u32 mtu)
1373 {
1374         struct rt6_info *rt6 = (struct rt6_info *)dst;
1375
1376         /* Note: do *NOT* check dst_metric_locked(dst, RTAX_MTU);
1377          * IPv6 pmtu discovery isn't optional, so 'mtu lock' cannot disable it.
1378          * [see also comment in rt6_mtu_change_route()]
1379          */
1380
1381         dst_confirm(dst);
1382         mtu = max_t(u32, mtu, IPV6_MIN_MTU);
1383         if (mtu >= dst_mtu(dst))
1384                 return;
1385
1386         if (!rt6_cache_allowed_for_pmtu(rt6)) {
1387                 rt6_do_update_pmtu(rt6, mtu);
1388         } else {
1389                 const struct in6_addr *daddr, *saddr;
1390                 struct rt6_info *nrt6;
1391
1392                 if (iph) {
1393                         daddr = &iph->daddr;
1394                         saddr = &iph->saddr;
1395                 } else if (sk) {
1396                         daddr = &sk->sk_v6_daddr;
1397                         saddr = &inet6_sk(sk)->saddr;
1398                 } else {
1399                         return;
1400                 }
1401                 nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
1402                 if (nrt6) {
1403                         rt6_do_update_pmtu(nrt6, mtu);
1404
1405                         /* ip6_ins_rt(nrt6) will bump the
1406                          * rt6->rt6i_node->fn_sernum, which will make
1407                          * the next rt6_check() fail and invalidate
1408                          * the sk->sk_dst_cache.
1409                          */
1410                         ip6_ins_rt(nrt6);
1411                 }
1412         }
1413 }
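
/*
 * Illustrative sketch (hypothetical helper): the update above clamps the
 * reported MTU to at least IPV6_MIN_MTU (1280) and only ever lowers the
 * current path MTU estimate, never raises it.
 */
static inline u32 example_clamped_pmtu(u32 reported_mtu, u32 current_mtu)
{
        u32 mtu = max_t(u32, reported_mtu, IPV6_MIN_MTU);

        return mtu < current_mtu ? mtu : current_mtu;
}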
1414
1415 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
1416                                struct sk_buff *skb, u32 mtu)
1417 {
1418         __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
1419 }
1420
1421 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
1422                      int oif, u32 mark)
1423 {
1424         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1425         struct dst_entry *dst;
1426         struct flowi6 fl6;
1427
1428         memset(&fl6, 0, sizeof(fl6));
1429         fl6.flowi6_oif = oif;
1430         fl6.flowi6_mark = mark ? mark : IP6_REPLY_MARK(net, skb->mark);
1431         fl6.daddr = iph->daddr;
1432         fl6.saddr = iph->saddr;
1433         fl6.flowlabel = ip6_flowinfo(iph);
1434
1435         dst = ip6_route_output(net, NULL, &fl6);
1436         if (!dst->error)
1437                 __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
1438         dst_release(dst);
1439 }
1440 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
1441
1442 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
1443 {
1444         int oif = sk->sk_bound_dev_if;
1445         struct dst_entry *dst;
1446
1447         if (!oif && skb->dev)
1448                 oif = l3mdev_master_ifindex(skb->dev);
1449
1450         ip6_update_pmtu(skb, sock_net(sk), mtu, oif, sk->sk_mark);
1451
1452         dst = __sk_dst_get(sk);
1453         if (!dst || !dst->obsolete ||
1454             dst->ops->check(dst, inet6_sk(sk)->dst_cookie))
1455                 return;
1456
1457         bh_lock_sock(sk);
1458         if (!sock_owned_by_user(sk) && !ipv6_addr_v4mapped(&sk->sk_v6_daddr))
1459                 ip6_datagram_dst_update(sk, false);
1460         bh_unlock_sock(sk);
1461 }
1462 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
1463
1464 /* Handle redirects */
1465 struct ip6rd_flowi {
1466         struct flowi6 fl6;
1467         struct in6_addr gateway;
1468 };
1469
1470 static struct rt6_info *__ip6_route_redirect(struct net *net,
1471                                              struct fib6_table *table,
1472                                              struct flowi6 *fl6,
1473                                              int flags)
1474 {
1475         struct ip6rd_flowi *rdfl = (struct ip6rd_flowi *)fl6;
1476         struct rt6_info *rt;
1477         struct fib6_node *fn;
1478
1479         /* Get the "current" route for this destination and
1480          * check if the redirect has come from the appropriate router.
1481          *
1482          * RFC 4861 specifies that redirects should only be
1483          * accepted if they come from the nexthop to the target.
1484          * Due to the way the routes are chosen, this notion
1485          * is a bit fuzzy and one might need to check all possible
1486          * routes.
1487          */
1488
1489         read_lock_bh(&table->tb6_lock);
1490         fn = fib6_lookup(&table->tb6_root, &fl6->daddr, &fl6->saddr);
1491 restart:
1492         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
1493                 if (rt6_check_expired(rt))
1494                         continue;
1495                 if (rt->dst.error)
1496                         break;
1497                 if (!(rt->rt6i_flags & RTF_GATEWAY))
1498                         continue;
1499                 if (fl6->flowi6_oif != rt->dst.dev->ifindex)
1500                         continue;
1501                 if (!ipv6_addr_equal(&rdfl->gateway, &rt->rt6i_gateway))
1502                         continue;
1503                 break;
1504         }
1505
1506         if (!rt)
1507                 rt = net->ipv6.ip6_null_entry;
1508         else if (rt->dst.error) {
1509                 rt = net->ipv6.ip6_null_entry;
1510                 goto out;
1511         }
1512
1513         if (rt == net->ipv6.ip6_null_entry) {
1514                 fn = fib6_backtrack(fn, &fl6->saddr);
1515                 if (fn)
1516                         goto restart;
1517         }
1518
1519 out:
1520         dst_hold(&rt->dst);
1521
1522         read_unlock_bh(&table->tb6_lock);
1523
1524         trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
1525         return rt;
1526 };
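
/*
 * Illustrative sketch (hypothetical helper) of the RFC 4861 acceptance test
 * applied above: a redirect is only trusted if it comes from the gateway
 * currently used for the destination, on the interface used to reach it.
 */
static inline bool example_redirect_source_ok(const struct rt6_info *rt,
                                              const struct in6_addr *from,
                                              int ifindex)
{
        return (rt->rt6i_flags & RTF_GATEWAY) &&
               rt->dst.dev->ifindex == ifindex &&
               ipv6_addr_equal(&rt->rt6i_gateway, from);
}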
1527
1528 static struct dst_entry *ip6_route_redirect(struct net *net,
1529                                         const struct flowi6 *fl6,
1530                                         const struct in6_addr *gateway)
1531 {
1532         int flags = RT6_LOOKUP_F_HAS_SADDR;
1533         struct ip6rd_flowi rdfl;
1534
1535         rdfl.fl6 = *fl6;
1536         rdfl.gateway = *gateway;
1537
1538         return fib6_rule_lookup(net, &rdfl.fl6,
1539                                 flags, __ip6_route_redirect);
1540 }
1541
1542 void ip6_redirect(struct sk_buff *skb, struct net *net, int oif, u32 mark)
1543 {
1544         const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
1545         struct dst_entry *dst;
1546         struct flowi6 fl6;
1547
1548         memset(&fl6, 0, sizeof(fl6));
1549         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1550         fl6.flowi6_oif = oif;
1551         fl6.flowi6_mark = mark;
1552         fl6.daddr = iph->daddr;
1553         fl6.saddr = iph->saddr;
1554         fl6.flowlabel = ip6_flowinfo(iph);
1555
1556         dst = ip6_route_redirect(net, &fl6, &ipv6_hdr(skb)->saddr);
1557         rt6_do_redirect(dst, NULL, skb);
1558         dst_release(dst);
1559 }
1560 EXPORT_SYMBOL_GPL(ip6_redirect);
1561
1562 void ip6_redirect_no_header(struct sk_buff *skb, struct net *net, int oif,
1563                             u32 mark)
1564 {
1565         const struct ipv6hdr *iph = ipv6_hdr(skb);
1566         const struct rd_msg *msg = (struct rd_msg *)icmp6_hdr(skb);
1567         struct dst_entry *dst;
1568         struct flowi6 fl6;
1569
1570         memset(&fl6, 0, sizeof(fl6));
1571         fl6.flowi6_iif = LOOPBACK_IFINDEX;
1572         fl6.flowi6_oif = oif;
1573         fl6.flowi6_mark = mark;
1574         fl6.daddr = msg->dest;
1575         fl6.saddr = iph->daddr;
1576
1577         dst = ip6_route_redirect(net, &fl6, &iph->saddr);
1578         rt6_do_redirect(dst, NULL, skb);
1579         dst_release(dst);
1580 }
1581
1582 void ip6_sk_redirect(struct sk_buff *skb, struct sock *sk)
1583 {
1584         ip6_redirect(skb, sock_net(sk), sk->sk_bound_dev_if, sk->sk_mark);
1585 }
1586 EXPORT_SYMBOL_GPL(ip6_sk_redirect);
1587
1588 static unsigned int ip6_default_advmss(const struct dst_entry *dst)
1589 {
1590         struct net_device *dev = dst->dev;
1591         unsigned int mtu = dst_mtu(dst);
1592         struct net *net = dev_net(dev);
1593
1594         mtu -= sizeof(struct ipv6hdr) + sizeof(struct tcphdr);
1595
1596         if (mtu < net->ipv6.sysctl.ip6_rt_min_advmss)
1597                 mtu = net->ipv6.sysctl.ip6_rt_min_advmss;
1598
1599         /*
1600          * Maximal non-jumbo IPv6 payload is IPV6_MAXPLEN and
1601          * corresponding MSS is IPV6_MAXPLEN - tcp_header_size.
1602          * IPV6_MAXPLEN is also valid and means: "any MSS,
1603          * rely only on pmtu discovery"
1604          */
1605         if (mtu > IPV6_MAXPLEN - sizeof(struct tcphdr))
1606                 mtu = IPV6_MAXPLEN;
1607         return mtu;
1608 }
1609
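/* Report the MTU for this dst: prefer a PMTU learned on the route itself,
 * then an explicit RTAX_MTU metric, and finally the device MTU; the result
 * is clamped to IP6_MAX_MTU and reduced by any lwtunnel encap headroom.
 */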
1610 static unsigned int ip6_mtu(const struct dst_entry *dst)
1611 {
1612         const struct rt6_info *rt = (const struct rt6_info *)dst;
1613         unsigned int mtu = rt->rt6i_pmtu;
1614         struct inet6_dev *idev;
1615
1616         if (mtu)
1617                 goto out;
1618
1619         mtu = dst_metric_raw(dst, RTAX_MTU);
1620         if (mtu)
1621                 goto out;
1622
1623         mtu = IPV6_MIN_MTU;
1624
1625         rcu_read_lock();
1626         idev = __in6_dev_get(dst->dev);
1627         if (idev)
1628                 mtu = idev->cnf.mtu6;
1629         rcu_read_unlock();
1630
1631 out:
1632         mtu = min_t(unsigned int, mtu, IP6_MAX_MTU);
1633
1634         return mtu - lwtunnel_headroom(dst->lwtstate, mtu);
1635 }
1636
1637 static struct dst_entry *icmp6_dst_gc_list;
1638 static DEFINE_SPINLOCK(icmp6_dst_lock);
1639
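/* Allocate a standalone dst for an outgoing ICMPv6 message.  The entry is
 * not inserted into the FIB; it is chained onto icmp6_dst_gc_list and
 * reclaimed by icmp6_dst_gc() once its refcount drops to zero.
 */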
1640 struct dst_entry *icmp6_dst_alloc(struct net_device *dev,
1641                                   struct flowi6 *fl6)
1642 {
1643         struct dst_entry *dst;
1644         struct rt6_info *rt;
1645         struct inet6_dev *idev = in6_dev_get(dev);
1646         struct net *net = dev_net(dev);
1647
1648         if (unlikely(!idev))
1649                 return ERR_PTR(-ENODEV);
1650
1651         rt = ip6_dst_alloc(net, dev, 0);
1652         if (unlikely(!rt)) {
1653                 in6_dev_put(idev);
1654                 dst = ERR_PTR(-ENOMEM);
1655                 goto out;
1656         }
1657
1658         rt->dst.flags |= DST_HOST;
1659         rt->dst.input = ip6_input;
1660         rt->dst.output  = ip6_output;
1661         atomic_set(&rt->dst.__refcnt, 1);
1662         rt->rt6i_gateway  = fl6->daddr;
1663         rt->rt6i_dst.addr = fl6->daddr;
1664         rt->rt6i_dst.plen = 128;
1665         rt->rt6i_idev     = idev;
1666         dst_metric_set(&rt->dst, RTAX_HOPLIMIT, 0);
1667
1668         spin_lock_bh(&icmp6_dst_lock);
1669         rt->dst.next = icmp6_dst_gc_list;
1670         icmp6_dst_gc_list = &rt->dst;
1671         spin_unlock_bh(&icmp6_dst_lock);
1672
1673         fib6_force_start_gc(net);
1674
1675         dst = xfrm_lookup(net, &rt->dst, flowi6_to_flowi(fl6), NULL, 0);
1676
1677 out:
1678         return dst;
1679 }
1680
1681 int icmp6_dst_gc(void)
1682 {
1683         struct dst_entry *dst, **pprev;
1684         int more = 0;
1685
1686         spin_lock_bh(&icmp6_dst_lock);
1687         pprev = &icmp6_dst_gc_list;
1688
1689         while ((dst = *pprev) != NULL) {
1690                 if (!atomic_read(&dst->__refcnt)) {
1691                         *pprev = dst->next;
1692                         dst_free(dst);
1693                 } else {
1694                         pprev = &dst->next;
1695                         ++more;
1696                 }
1697         }
1698
1699         spin_unlock_bh(&icmp6_dst_lock);
1700
1701         return more;
1702 }
1703
1704 static void icmp6_clean_all(int (*func)(struct rt6_info *rt, void *arg),
1705                             void *arg)
1706 {
1707         struct dst_entry *dst, **pprev;
1708
1709         spin_lock_bh(&icmp6_dst_lock);
1710         pprev = &icmp6_dst_gc_list;
1711         while ((dst = *pprev) != NULL) {
1712                 struct rt6_info *rt = (struct rt6_info *) dst;
1713                 if (func(rt, arg)) {
1714                         *pprev = dst->next;
1715                         dst_free(dst);
1716                 } else {
1717                         pprev = &dst->next;
1718                 }
1719         }
1720         spin_unlock_bh(&icmp6_dst_lock);
1721 }
1722
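/* dst_ops garbage collector.  Skip the expensive fib6_run_gc() pass while we
 * are still within ip6_rt_gc_min_interval and below ip6_rt_max_size;
 * otherwise run it with a growing "expire" argument, which is decayed again
 * by ip6_rt_gc_elasticity on every call.
 */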
1723 static int ip6_dst_gc(struct dst_ops *ops)
1724 {
1725         struct net *net = container_of(ops, struct net, ipv6.ip6_dst_ops);
1726         int rt_min_interval = net->ipv6.sysctl.ip6_rt_gc_min_interval;
1727         int rt_max_size = net->ipv6.sysctl.ip6_rt_max_size;
1728         int rt_elasticity = net->ipv6.sysctl.ip6_rt_gc_elasticity;
1729         int rt_gc_timeout = net->ipv6.sysctl.ip6_rt_gc_timeout;
1730         unsigned long rt_last_gc = net->ipv6.ip6_rt_last_gc;
1731         int entries;
1732
1733         entries = dst_entries_get_fast(ops);
1734         if (time_after(rt_last_gc + rt_min_interval, jiffies) &&
1735             entries <= rt_max_size)
1736                 goto out;
1737
1738         net->ipv6.ip6_rt_gc_expire++;
1739         fib6_run_gc(net->ipv6.ip6_rt_gc_expire, net, true);
1740         entries = dst_entries_get_slow(ops);
1741         if (entries < ops->gc_thresh)
1742                 net->ipv6.ip6_rt_gc_expire = rt_gc_timeout>>1;
1743 out:
1744         net->ipv6.ip6_rt_gc_expire -= net->ipv6.ip6_rt_gc_expire>>rt_elasticity;
1745         return entries > rt_max_size;
1746 }
1747
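/* Translate the RTA_METRICS netlink attributes in cfg->fc_mx into the flat
 * u32 metrics array carried by mx6_config (indexed by RTAX_* - 1), resolving
 * RTAX_CC_ALGO names to keys and clamping RTAX_HOPLIMIT to 255.
 */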
1748 static int ip6_convert_metrics(struct mx6_config *mxc,
1749                                const struct fib6_config *cfg)
1750 {
1751         bool ecn_ca = false;
1752         struct nlattr *nla;
1753         int remaining;
1754         u32 *mp;
1755
1756         if (!cfg->fc_mx)
1757                 return 0;
1758
1759         mp = kzalloc(sizeof(u32) * RTAX_MAX, GFP_KERNEL);
1760         if (unlikely(!mp))
1761                 return -ENOMEM;
1762
1763         nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
1764                 int type = nla_type(nla);
1765                 u32 val;
1766
1767                 if (!type)
1768                         continue;
1769                 if (unlikely(type > RTAX_MAX))
1770                         goto err;
1771
1772                 if (type == RTAX_CC_ALGO) {
1773                         char tmp[TCP_CA_NAME_MAX];
1774
1775                         nla_strlcpy(tmp, nla, sizeof(tmp));
1776                         val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
1777                         if (val == TCP_CA_UNSPEC)
1778                                 goto err;
1779                 } else {
1780                         val = nla_get_u32(nla);
1781                 }
1782                 if (type == RTAX_HOPLIMIT && val > 255)
1783                         val = 255;
1784                 if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
1785                         goto err;
1786
1787                 mp[type - 1] = val;
1788                 __set_bit(type - 1, mxc->mx_valid);
1789         }
1790
1791         if (ecn_ca) {
1792                 __set_bit(RTAX_FEATURES - 1, mxc->mx_valid);
1793                 mp[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;
1794         }
1795
1796         mxc->mx = mp;
1797         return 0;
1798  err:
1799         kfree(mp);
1800         return -EINVAL;
1801 }
1802
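/* Resolve a configured gateway strictly within the table named by
 * cfg->fc_table.  Returns NULL if the table does not exist or the lookup
 * only hits the null entry, letting the caller fall back to rt6_lookup().
 */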
1803 static struct rt6_info *ip6_nh_lookup_table(struct net *net,
1804                                             struct fib6_config *cfg,
1805                                             const struct in6_addr *gw_addr)
1806 {
1807         struct flowi6 fl6 = {
1808                 .flowi6_oif = cfg->fc_ifindex,
1809                 .daddr = *gw_addr,
1810                 .saddr = cfg->fc_prefsrc,
1811         };
1812         struct fib6_table *table;
1813         struct rt6_info *rt;
1814         int flags = RT6_LOOKUP_F_IFACE | RT6_LOOKUP_F_IGNORE_LINKSTATE;
1815
1816         table = fib6_get_table(net, cfg->fc_table);
1817         if (!table)
1818                 return NULL;
1819
1820         if (!ipv6_addr_any(&cfg->fc_prefsrc))
1821                 flags |= RT6_LOOKUP_F_HAS_SADDR;
1822
1823         rt = ip6_pol_route(net, table, cfg->fc_ifindex, &fl6, flags);
1824
1825         /* if table lookup failed, fall back to full lookup */
1826         if (rt == net->ipv6.ip6_null_entry) {
1827                 ip6_rt_put(rt);
1828                 rt = NULL;
1829         }
1830
1831         return rt;
1832 }
1833
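/* Build (but do not insert) an rt6_info from a fib6_config: validate prefix
 * lengths and flags, resolve the output device and gateway, set up lwtunnel
 * state, and turn loopback/RTF_REJECT requests into reject routes.  Returns
 * the new entry or an ERR_PTR(); insertion is left to the caller.
 */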
1834 static struct rt6_info *ip6_route_info_create(struct fib6_config *cfg)
1835 {
1836         struct net *net = cfg->fc_nlinfo.nl_net;
1837         struct rt6_info *rt = NULL;
1838         struct net_device *dev = NULL;
1839         struct inet6_dev *idev = NULL;
1840         struct fib6_table *table;
1841         int addr_type;
1842         int err = -EINVAL;
1843
1844         /* RTF_PCPU is an internal flag; it cannot be set by userspace */
1845         if (cfg->fc_flags & RTF_PCPU)
1846                 goto out;
1847
1848         if (cfg->fc_dst_len > 128 || cfg->fc_src_len > 128)
1849                 goto out;
1850 #ifndef CONFIG_IPV6_SUBTREES
1851         if (cfg->fc_src_len)
1852                 goto out;
1853 #endif
1854         if (cfg->fc_ifindex) {
1855                 err = -ENODEV;
1856                 dev = dev_get_by_index(net, cfg->fc_ifindex);
1857                 if (!dev)
1858                         goto out;
1859                 idev = in6_dev_get(dev);
1860                 if (!idev)
1861                         goto out;
1862         }
1863
1864         if (cfg->fc_metric == 0)
1865                 cfg->fc_metric = IP6_RT_PRIO_USER;
1866
1867         err = -ENOBUFS;
1868         if (cfg->fc_nlinfo.nlh &&
1869             !(cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_CREATE)) {
1870                 table = fib6_get_table(net, cfg->fc_table);
1871                 if (!table) {
1872                         pr_warn("NLM_F_CREATE should be specified when creating new route\n");
1873                         table = fib6_new_table(net, cfg->fc_table);
1874                 }
1875         } else {
1876                 table = fib6_new_table(net, cfg->fc_table);
1877         }
1878
1879         if (!table)
1880                 goto out;
1881
1882         rt = ip6_dst_alloc(net, NULL,
1883                            (cfg->fc_flags & RTF_ADDRCONF) ? 0 : DST_NOCOUNT);
1884
1885         if (!rt) {
1886                 err = -ENOMEM;
1887                 goto out;
1888         }
1889
1890         if (cfg->fc_flags & RTF_EXPIRES)
1891                 rt6_set_expires(rt, jiffies +
1892                                 clock_t_to_jiffies(cfg->fc_expires));
1893         else
1894                 rt6_clean_expires(rt);
1895
1896         if (cfg->fc_protocol == RTPROT_UNSPEC)
1897                 cfg->fc_protocol = RTPROT_BOOT;
1898         rt->rt6i_protocol = cfg->fc_protocol;
1899
1900         addr_type = ipv6_addr_type(&cfg->fc_dst);
1901
1902         if (addr_type & IPV6_ADDR_MULTICAST)
1903                 rt->dst.input = ip6_mc_input;
1904         else if (cfg->fc_flags & RTF_LOCAL)
1905                 rt->dst.input = ip6_input;
1906         else
1907                 rt->dst.input = ip6_forward;
1908
1909         rt->dst.output = ip6_output;
1910
1911         if (cfg->fc_encap) {
1912                 struct lwtunnel_state *lwtstate;
1913
1914                 err = lwtunnel_build_state(dev, cfg->fc_encap_type,
1915                                            cfg->fc_encap, AF_INET6, cfg,
1916                                            &lwtstate);
1917                 if (err)
1918                         goto out;
1919                 rt->dst.lwtstate = lwtstate_get(lwtstate);
1920                 if (lwtunnel_output_redirect(rt->dst.lwtstate)) {
1921                         rt->dst.lwtstate->orig_output = rt->dst.output;
1922                         rt->dst.output = lwtunnel_output;
1923                 }
1924                 if (lwtunnel_input_redirect(rt->dst.lwtstate)) {
1925                         rt->dst.lwtstate->orig_input = rt->dst.input;
1926                         rt->dst.input = lwtunnel_input;
1927                 }
1928         }
1929
1930         ipv6_addr_prefix(&rt->rt6i_dst.addr, &cfg->fc_dst, cfg->fc_dst_len);
1931         rt->rt6i_dst.plen = cfg->fc_dst_len;
1932         if (rt->rt6i_dst.plen == 128)
1933                 rt->dst.flags |= DST_HOST;
1934
1935 #ifdef CONFIG_IPV6_SUBTREES
1936         ipv6_addr_prefix(&rt->rt6i_src.addr, &cfg->fc_src, cfg->fc_src_len);
1937         rt->rt6i_src.plen = cfg->fc_src_len;
1938 #endif
1939
1940         rt->rt6i_metric = cfg->fc_metric;
1941
1942         /* We cannot add true routes via loopback here;
1943            they would result in kernel looping.  Promote them to reject routes
1944          */
1945         if ((cfg->fc_flags & RTF_REJECT) ||
1946             (dev && (dev->flags & IFF_LOOPBACK) &&
1947              !(addr_type & IPV6_ADDR_LOOPBACK) &&
1948              !(cfg->fc_flags & RTF_LOCAL))) {
1949                 /* hold loopback dev/idev if we haven't done so. */
1950                 if (dev != net->loopback_dev) {
1951                         if (dev) {
1952                                 dev_put(dev);
1953                                 in6_dev_put(idev);
1954                         }
1955                         dev = net->loopback_dev;
1956                         dev_hold(dev);
1957                         idev = in6_dev_get(dev);
1958                         if (!idev) {
1959                                 err = -ENODEV;
1960                                 goto out;
1961                         }
1962                 }
1963                 rt->rt6i_flags = RTF_REJECT|RTF_NONEXTHOP;
1964                 switch (cfg->fc_type) {
1965                 case RTN_BLACKHOLE:
1966                         rt->dst.error = -EINVAL;
1967                         rt->dst.output = dst_discard_out;
1968                         rt->dst.input = dst_discard;
1969                         break;
1970                 case RTN_PROHIBIT:
1971                         rt->dst.error = -EACCES;
1972                         rt->dst.output = ip6_pkt_prohibit_out;
1973                         rt->dst.input = ip6_pkt_prohibit;
1974                         break;
1975                 case RTN_THROW:
1976                 case RTN_UNREACHABLE:
1977                 default:
1978                         rt->dst.error = (cfg->fc_type == RTN_THROW) ? -EAGAIN
1979                                         : (cfg->fc_type == RTN_UNREACHABLE)
1980                                         ? -EHOSTUNREACH : -ENETUNREACH;
1981                         rt->dst.output = ip6_pkt_discard_out;
1982                         rt->dst.input = ip6_pkt_discard;
1983                         break;
1984                 }
1985                 goto install_route;
1986         }
1987
1988         if (cfg->fc_flags & RTF_GATEWAY) {
1989                 const struct in6_addr *gw_addr;
1990                 int gwa_type;
1991
1992                 gw_addr = &cfg->fc_gateway;
1993                 gwa_type = ipv6_addr_type(gw_addr);
1994
1995                 /* if gw_addr is local we will fail to detect this in case
1996                  * the address is still TENTATIVE (DAD in progress). rt6_lookup()
1997                  * will return the already-added prefix route via the interface
1998                  * that the prefix route was assigned to, which might be non-loopback.
1999                  */
2000                 err = -EINVAL;
2001                 if (ipv6_chk_addr_and_flags(net, gw_addr,
2002                                             gwa_type & IPV6_ADDR_LINKLOCAL ?
2003                                             dev : NULL, 0, 0))
2004                         goto out;
2005
2006                 rt->rt6i_gateway = *gw_addr;
2007
2008                 if (gwa_type != (IPV6_ADDR_LINKLOCAL|IPV6_ADDR_UNICAST)) {
2009                         struct rt6_info *grt = NULL;
2010
2011                         /* IPv6 strictly forbids using non-link-local
2012                            addresses as a nexthop address.
2013                            Otherwise, the router will not be able to send
2014                            redirects.  That is very good, but in some (rare!)
2015                            circumstances (SIT, PtP, NBMA NOARP links) it is
2016                            handy to allow some exceptions. --ANK
2017                          */
2018                         if (!(gwa_type & IPV6_ADDR_UNICAST))
2019                                 goto out;
2020
2021                         if (cfg->fc_table) {
2022                                 grt = ip6_nh_lookup_table(net, cfg, gw_addr);
2023
2024                                 if (grt) {
2025                                         if (grt->rt6i_flags & RTF_GATEWAY ||
2026                                             (dev && dev != grt->dst.dev)) {
2027                                                 ip6_rt_put(grt);
2028                                                 grt = NULL;
2029                                         }
2030                                 }
2031                         }
2032
2033                         if (!grt)
2034                                 grt = rt6_lookup(net, gw_addr, NULL,
2035                                                  cfg->fc_ifindex, 1);
2036
2037                         err = -EHOSTUNREACH;
2038                         if (!grt)
2039                                 goto out;
2040                         if (dev) {
2041                                 if (dev != grt->dst.dev) {
2042                                         ip6_rt_put(grt);
2043                                         goto out;
2044                                 }
2045                         } else {
2046                                 dev = grt->dst.dev;
2047                                 idev = grt->rt6i_idev;
2048                                 dev_hold(dev);
2049                                 in6_dev_hold(grt->rt6i_idev);
2050                         }
2051                         if (!(grt->rt6i_flags & RTF_GATEWAY))
2052                                 err = 0;
2053                         ip6_rt_put(grt);
2054
2055                         if (err)
2056                                 goto out;
2057                 }
2058                 err = -EINVAL;
2059                 if (!dev || (dev->flags & IFF_LOOPBACK))
2060                         goto out;
2061         }
2062
2063         err = -ENODEV;
2064         if (!dev)
2065                 goto out;
2066
2067         if (!ipv6_addr_any(&cfg->fc_prefsrc)) {
2068                 if (!ipv6_chk_addr(net, &cfg->fc_prefsrc, dev, 0)) {
2069                         err = -EINVAL;
2070                         goto out;
2071                 }
2072                 rt->rt6i_prefsrc.addr = cfg->fc_prefsrc;
2073                 rt->rt6i_prefsrc.plen = 128;
2074         } else
2075                 rt->rt6i_prefsrc.plen = 0;
2076
2077         rt->rt6i_flags = cfg->fc_flags;
2078
2079 install_route:
2080         rt->dst.dev = dev;
2081         rt->rt6i_idev = idev;
2082         rt->rt6i_table = table;
2083
2084         cfg->fc_nlinfo.nl_net = dev_net(dev);
2085
2086         return rt;
2087 out:
2088         if (dev)
2089                 dev_put(dev);
2090         if (idev)
2091                 in6_dev_put(idev);
2092         if (rt)
2093                 dst_free(&rt->dst);
2094
2095         return ERR_PTR(err);
2096 }
2097
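/* ip6_route_add() creates a route from a fib6_config and inserts it via
 * __ip6_ins_rt().  Illustrative sketch of a caller (not taken from this
 * file; the field values, and the dev/prefix/net variables, are only an
 * example of what a caller might pass):
 *
 *	struct fib6_config cfg = {
 *		.fc_table	  = RT6_TABLE_MAIN,
 *		.fc_metric	  = IP6_RT_PRIO_USER,
 *		.fc_ifindex	  = dev->ifindex,
 *		.fc_dst		  = prefix,
 *		.fc_dst_len	  = 64,
 *		.fc_flags	  = RTF_UP,
 *		.fc_nlinfo.nl_net = net,
 *	};
 *	err = ip6_route_add(&cfg);
 */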
2098 int ip6_route_add(struct fib6_config *cfg)
2099 {
2100         struct mx6_config mxc = { .mx = NULL, };
2101         struct rt6_info *rt;
2102         int err;
2103
2104         rt = ip6_route_info_create(cfg);
2105         if (IS_ERR(rt)) {
2106                 err = PTR_ERR(rt);
2107                 rt = NULL;
2108                 goto out;
2109         }
2110
2111         err = ip6_convert_metrics(&mxc, cfg);
2112         if (err)
2113                 goto out;
2114
2115         err = __ip6_ins_rt(rt, &cfg->fc_nlinfo, &mxc);
2116
2117         kfree(mxc.mx);
2118
2119         return err;
2120 out:
2121         if (rt)
2122                 dst_free(&rt->dst);
2123
2124         return err;
2125 }
2126
2127 static int __ip6_del_rt(struct rt6_info *rt, struct nl_info *info)
2128 {
2129         int err;
2130         struct fib6_table *table;
2131         struct net *net = dev_net(rt->dst.dev);
2132
2133         if (rt == net->ipv6.ip6_null_entry ||
2134             rt->dst.flags & DST_NOCACHE) {
2135                 err = -ENOENT;
2136                 goto out;
2137         }
2138
2139         table = rt->rt6i_table;
2140         write_lock_bh(&table->tb6_lock);
2141         err = fib6_del(rt, info);
2142         write_unlock_bh(&table->tb6_lock);
2143
2144 out:
2145         ip6_rt_put(rt);
2146         return err;
2147 }
2148
2149 int ip6_del_rt(struct rt6_info *rt)
2150 {
2151         struct nl_info info = {
2152                 .nl_net = dev_net(rt->dst.dev),
2153         };
2154         return __ip6_del_rt(rt, &info);
2155 }
2156
2157 static int ip6_route_del(struct fib6_config *cfg)
2158 {
2159         struct fib6_table *table;
2160         struct fib6_node *fn;
2161         struct rt6_info *rt;
2162         int err = -ESRCH;
2163
2164         table = fib6_get_table(cfg->fc_nlinfo.nl_net, cfg->fc_table);
2165         if (!table)
2166                 return err;
2167
2168         read_lock_bh(&table->tb6_lock);
2169
2170         fn = fib6_locate(&table->tb6_root,
2171                          &cfg->fc_dst, cfg->fc_dst_len,
2172                          &cfg->fc_src, cfg->fc_src_len);
2173
2174         if (fn) {
2175                 for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2176                         if ((rt->rt6i_flags & RTF_CACHE) &&
2177                             !(cfg->fc_flags & RTF_CACHE))
2178                                 continue;
2179                         if (cfg->fc_ifindex &&
2180                             (!rt->dst.dev ||
2181                              rt->dst.dev->ifindex != cfg->fc_ifindex))
2182                                 continue;
2183                         if (cfg->fc_flags & RTF_GATEWAY &&
2184                             !ipv6_addr_equal(&cfg->fc_gateway, &rt->rt6i_gateway))
2185                                 continue;
2186                         if (cfg->fc_metric && cfg->fc_metric != rt->rt6i_metric)
2187                                 continue;
2188                         if (cfg->fc_protocol && cfg->fc_protocol != rt->rt6i_protocol)
2189                                 continue;
2190                         dst_hold(&rt->dst);
2191                         read_unlock_bh(&table->tb6_lock);
2192
2193                         return __ip6_del_rt(rt, &cfg->fc_nlinfo);
2194                 }
2195         }
2196         read_unlock_bh(&table->tb6_lock);
2197
2198         return err;
2199 }
2200
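/* Process a received ICMPv6 Redirect (RFC 4861): validate the message and
 * its ND options, update the neighbour cache for the new first hop, and
 * install an RTF_CACHE clone of the current route pointing at it, removing
 * any previously cached entry.
 */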
2201 static void rt6_do_redirect(struct dst_entry *dst, struct sock *sk, struct sk_buff *skb)
2202 {
2203         struct netevent_redirect netevent;
2204         struct rt6_info *rt, *nrt = NULL;
2205         struct ndisc_options ndopts;
2206         struct inet6_dev *in6_dev;
2207         struct neighbour *neigh;
2208         struct rd_msg *msg;
2209         int optlen, on_link;
2210         u8 *lladdr;
2211
2212         optlen = skb_tail_pointer(skb) - skb_transport_header(skb);
2213         optlen -= sizeof(*msg);
2214
2215         if (optlen < 0) {
2216                 net_dbg_ratelimited("rt6_do_redirect: packet too short\n");
2217                 return;
2218         }
2219
2220         msg = (struct rd_msg *)icmp6_hdr(skb);
2221
2222         if (ipv6_addr_is_multicast(&msg->dest)) {
2223                 net_dbg_ratelimited("rt6_do_redirect: destination address is multicast\n");
2224                 return;
2225         }
2226
2227         on_link = 0;
2228         if (ipv6_addr_equal(&msg->dest, &msg->target)) {
2229                 on_link = 1;
2230         } else if (ipv6_addr_type(&msg->target) !=
2231                    (IPV6_ADDR_UNICAST|IPV6_ADDR_LINKLOCAL)) {
2232                 net_dbg_ratelimited("rt6_do_redirect: target address is not link-local unicast\n");
2233                 return;
2234         }
2235
2236         in6_dev = __in6_dev_get(skb->dev);
2237         if (!in6_dev)
2238                 return;
2239         if (in6_dev->cnf.forwarding || !in6_dev->cnf.accept_redirects)
2240                 return;
2241
2242         /* RFC2461 8.1:
2243          *      The IP source address of the Redirect MUST be the same as the current
2244          *      first-hop router for the specified ICMP Destination Address.
2245          */
2246
2247         if (!ndisc_parse_options(skb->dev, msg->opt, optlen, &ndopts)) {
2248                 net_dbg_ratelimited("rt6_redirect: invalid ND options\n");
2249                 return;
2250         }
2251
2252         lladdr = NULL;
2253         if (ndopts.nd_opts_tgt_lladdr) {
2254                 lladdr = ndisc_opt_addr_data(ndopts.nd_opts_tgt_lladdr,
2255                                              skb->dev);
2256                 if (!lladdr) {
2257                         net_dbg_ratelimited("rt6_redirect: invalid link-layer address length\n");
2258                         return;
2259                 }
2260         }
2261
2262         rt = (struct rt6_info *) dst;
2263         if (rt->rt6i_flags & RTF_REJECT) {
2264                 net_dbg_ratelimited("rt6_redirect: source isn't a valid nexthop for redirect target\n");
2265                 return;
2266         }
2267
2268         /* Redirect received -> path was valid.
2269          * Redirects are sent only in response to data packets,
2270          * so this nexthop is apparently reachable. --ANK
2271          */
2272         dst_confirm(&rt->dst);
2273
2274         neigh = __neigh_lookup(&nd_tbl, &msg->target, skb->dev, 1);
2275         if (!neigh)
2276                 return;
2277
2278         /*
2279          *      We have finally decided to accept it.
2280          */
2281
2282         ndisc_update(skb->dev, neigh, lladdr, NUD_STALE,
2283                      NEIGH_UPDATE_F_WEAK_OVERRIDE|
2284                      NEIGH_UPDATE_F_OVERRIDE|
2285                      (on_link ? 0 : (NEIGH_UPDATE_F_OVERRIDE_ISROUTER|
2286                                      NEIGH_UPDATE_F_ISROUTER)),
2287                      NDISC_REDIRECT, &ndopts);
2288
2289         nrt = ip6_rt_cache_alloc(rt, &msg->dest, NULL);
2290         if (!nrt)
2291                 goto out;
2292
2293         nrt->rt6i_flags = RTF_GATEWAY|RTF_UP|RTF_DYNAMIC|RTF_CACHE;
2294         if (on_link)
2295                 nrt->rt6i_flags &= ~RTF_GATEWAY;
2296
2297         nrt->rt6i_gateway = *(struct in6_addr *)neigh->primary_key;
2298
2299         if (ip6_ins_rt(nrt))
2300                 goto out;
2301
2302         netevent.old = &rt->dst;
2303         netevent.new = &nrt->dst;
2304         netevent.daddr = &msg->dest;
2305         netevent.neigh = neigh;
2306         call_netevent_notifiers(NETEVENT_REDIRECT, &netevent);
2307
2308         if (rt->rt6i_flags & RTF_CACHE) {
2309                 rt = (struct rt6_info *) dst_clone(&rt->dst);
2310                 ip6_del_rt(rt);
2311         }
2312
2313 out:
2314         neigh_release(neigh);
2315 }
2316
2317 /*
2318  *      Misc support functions
2319  */
2320
2321 static void rt6_set_from(struct rt6_info *rt, struct rt6_info *from)
2322 {
2323         BUG_ON(from->dst.from);
2324
2325         rt->rt6i_flags &= ~RTF_EXPIRES;
2326         dst_hold(&from->dst);
2327         rt->dst.from = &from->dst;
2328         dst_init_metrics(&rt->dst, dst_metrics_ptr(&from->dst), true);
2329 }
2330
2331 static void ip6_rt_copy_init(struct rt6_info *rt, struct rt6_info *ort)
2332 {
2333         rt->dst.input = ort->dst.input;
2334         rt->dst.output = ort->dst.output;
2335         rt->rt6i_dst = ort->rt6i_dst;
2336         rt->dst.error = ort->dst.error;
2337         rt->rt6i_idev = ort->rt6i_idev;
2338         if (rt->rt6i_idev)
2339                 in6_dev_hold(rt->rt6i_idev);
2340         rt->dst.lastuse = jiffies;
2341         rt->rt6i_gateway = ort->rt6i_gateway;
2342         rt->rt6i_flags = ort->rt6i_flags;
2343         rt6_set_from(rt, ort);
2344         rt->rt6i_metric = ort->rt6i_metric;
2345 #ifdef CONFIG_IPV6_SUBTREES
2346         rt->rt6i_src = ort->rt6i_src;
2347 #endif
2348         rt->rt6i_prefsrc = ort->rt6i_prefsrc;
2349         rt->rt6i_table = ort->rt6i_table;
2350         rt->dst.lwtstate = lwtstate_get(ort->dst.lwtstate);
2351 }
2352
2353 #ifdef CONFIG_IPV6_ROUTE_INFO
2354 static struct rt6_info *rt6_get_route_info(struct net *net,
2355                                            const struct in6_addr *prefix, int prefixlen,
2356                                            const struct in6_addr *gwaddr,
2357                                            struct net_device *dev)
2358 {
2359         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO;
2360         int ifindex = dev->ifindex;
2361         struct fib6_node *fn;
2362         struct rt6_info *rt = NULL;
2363         struct fib6_table *table;
2364
2365         table = fib6_get_table(net, tb_id);
2366         if (!table)
2367                 return NULL;
2368
2369         read_lock_bh(&table->tb6_lock);
2370         fn = fib6_locate(&table->tb6_root, prefix, prefixlen, NULL, 0);
2371         if (!fn)
2372                 goto out;
2373
2374         for (rt = fn->leaf; rt; rt = rt->dst.rt6_next) {
2375                 if (rt->dst.dev->ifindex != ifindex)
2376                         continue;
2377                 if ((rt->rt6i_flags & (RTF_ROUTEINFO|RTF_GATEWAY)) != (RTF_ROUTEINFO|RTF_GATEWAY))
2378                         continue;
2379                 if (!ipv6_addr_equal(&rt->rt6i_gateway, gwaddr))
2380                         continue;
2381                 dst_hold(&rt->dst);
2382                 break;
2383         }
2384 out:
2385         read_unlock_bh(&table->tb6_lock);
2386         return rt;
2387 }
2388
2389 static struct rt6_info *rt6_add_route_info(struct net *net,
2390                                            const struct in6_addr *prefix, int prefixlen,
2391                                            const struct in6_addr *gwaddr,
2392                                            struct net_device *dev,
2393                                            unsigned int pref)
2394 {
2395         struct fib6_config cfg = {
2396                 .fc_metric      = IP6_RT_PRIO_USER,
2397                 .fc_ifindex     = dev->ifindex,
2398                 .fc_dst_len     = prefixlen,
2399                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_ROUTEINFO |
2400                                   RTF_UP | RTF_PREF(pref),
2401                 .fc_nlinfo.portid = 0,
2402                 .fc_nlinfo.nlh = NULL,
2403                 .fc_nlinfo.nl_net = net,
2404         };
2405
2406         cfg.fc_table = l3mdev_fib_table(dev) ? : RT6_TABLE_INFO,
2407         cfg.fc_dst = *prefix;
2408         cfg.fc_gateway = *gwaddr;
2409
2410         /* We should treat it as a default route if prefix length is 0. */
2411         if (!prefixlen)
2412                 cfg.fc_flags |= RTF_DEFAULT;
2413
2414         ip6_route_add(&cfg);
2415
2416         return rt6_get_route_info(net, prefix, prefixlen, gwaddr, dev);
2417 }
2418 #endif
2419
2420 struct rt6_info *rt6_get_dflt_router(const struct in6_addr *addr, struct net_device *dev)
2421 {
2422         u32 tb_id = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT;
2423         struct rt6_info *rt;
2424         struct fib6_table *table;
2425
2426         table = fib6_get_table(dev_net(dev), tb_id);
2427         if (!table)
2428                 return NULL;
2429
2430         read_lock_bh(&table->tb6_lock);
2431         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2432                 if (dev == rt->dst.dev &&
2433                     ((rt->rt6i_flags & (RTF_ADDRCONF | RTF_DEFAULT)) == (RTF_ADDRCONF | RTF_DEFAULT)) &&
2434                     ipv6_addr_equal(&rt->rt6i_gateway, addr))
2435                         break;
2436         }
2437         if (rt)
2438                 dst_hold(&rt->dst);
2439         read_unlock_bh(&table->tb6_lock);
2440         return rt;
2441 }
2442
2443 struct rt6_info *rt6_add_dflt_router(const struct in6_addr *gwaddr,
2444                                      struct net_device *dev,
2445                                      unsigned int pref)
2446 {
2447         struct fib6_config cfg = {
2448                 .fc_table       = l3mdev_fib_table(dev) ? : RT6_TABLE_DFLT,
2449                 .fc_metric      = IP6_RT_PRIO_USER,
2450                 .fc_ifindex     = dev->ifindex,
2451                 .fc_flags       = RTF_GATEWAY | RTF_ADDRCONF | RTF_DEFAULT |
2452                                   RTF_UP | RTF_EXPIRES | RTF_PREF(pref),
2453                 .fc_nlinfo.portid = 0,
2454                 .fc_nlinfo.nlh = NULL,
2455                 .fc_nlinfo.nl_net = dev_net(dev),
2456         };
2457
2458         cfg.fc_gateway = *gwaddr;
2459
2460         if (!ip6_route_add(&cfg)) {
2461                 struct fib6_table *table;
2462
2463                 table = fib6_get_table(dev_net(dev), cfg.fc_table);
2464                 if (table)
2465                         table->flags |= RT6_TABLE_HAS_DFLT_ROUTER;
2466         }
2467
2468         return rt6_get_dflt_router(gwaddr, dev);
2469 }
2470
2471 static void __rt6_purge_dflt_routers(struct fib6_table *table)
2472 {
2473         struct rt6_info *rt;
2474
2475 restart:
2476         read_lock_bh(&table->tb6_lock);
2477         for (rt = table->tb6_root.leaf; rt; rt = rt->dst.rt6_next) {
2478                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ADDRCONF) &&
2479                     (!rt->rt6i_idev || rt->rt6i_idev->cnf.accept_ra != 2)) {
2480                         dst_hold(&rt->dst);
2481                         read_unlock_bh(&table->tb6_lock);
2482                         ip6_del_rt(rt);
2483                         goto restart;
2484                 }
2485         }
2486         read_unlock_bh(&table->tb6_lock);
2487
2488         table->flags &= ~RT6_TABLE_HAS_DFLT_ROUTER;
2489 }
2490
2491 void rt6_purge_dflt_routers(struct net *net)
2492 {
2493         struct fib6_table *table;
2494         struct hlist_head *head;
2495         unsigned int h;
2496
2497         rcu_read_lock();
2498
2499         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
2500                 head = &net->ipv6.fib_table_hash[h];
2501                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
2502                         if (table->flags & RT6_TABLE_HAS_DFLT_ROUTER)
2503                                 __rt6_purge_dflt_routers(table);
2504                 }
2505         }
2506
2507         rcu_read_unlock();
2508 }
2509
2510 static void rtmsg_to_fib6_config(struct net *net,
2511                                  struct in6_rtmsg *rtmsg,
2512                                  struct fib6_config *cfg)
2513 {
2514         memset(cfg, 0, sizeof(*cfg));
2515
2516         cfg->fc_table = l3mdev_fib_table_by_index(net, rtmsg->rtmsg_ifindex) ?
2517                          : RT6_TABLE_MAIN;
2518         cfg->fc_ifindex = rtmsg->rtmsg_ifindex;
2519         cfg->fc_metric = rtmsg->rtmsg_metric;
2520         cfg->fc_expires = rtmsg->rtmsg_info;
2521         cfg->fc_dst_len = rtmsg->rtmsg_dst_len;
2522         cfg->fc_src_len = rtmsg->rtmsg_src_len;
2523         cfg->fc_flags = rtmsg->rtmsg_flags;
2524
2525         cfg->fc_nlinfo.nl_net = net;
2526
2527         cfg->fc_dst = rtmsg->rtmsg_dst;
2528         cfg->fc_src = rtmsg->rtmsg_src;
2529         cfg->fc_gateway = rtmsg->rtmsg_gateway;
2530 }
2531
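/* Legacy SIOCADDRT/SIOCDELRT ioctl entry point (the pre-netlink interface):
 * the in6_rtmsg supplied by userspace is converted to a fib6_config and
 * handed to ip6_route_add()/ip6_route_del() under rtnl_lock.
 */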
2532 int ipv6_route_ioctl(struct net *net, unsigned int cmd, void __user *arg)
2533 {
2534         struct fib6_config cfg;
2535         struct in6_rtmsg rtmsg;
2536         int err;
2537
2538         switch (cmd) {
2539         case SIOCADDRT:         /* Add a route */
2540         case SIOCDELRT:         /* Delete a route */
2541                 if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
2542                         return -EPERM;
2543                 err = copy_from_user(&rtmsg, arg,
2544                                      sizeof(struct in6_rtmsg));
2545                 if (err)
2546                         return -EFAULT;
2547
2548                 rtmsg_to_fib6_config(net, &rtmsg, &cfg);
2549
2550                 rtnl_lock();
2551                 switch (cmd) {
2552                 case SIOCADDRT:
2553                         err = ip6_route_add(&cfg);
2554                         break;
2555                 case SIOCDELRT:
2556                         err = ip6_route_del(&cfg);
2557                         break;
2558                 default:
2559                         err = -EINVAL;
2560                 }
2561                 rtnl_unlock();
2562
2563                 return err;
2564         }
2565
2566         return -EINVAL;
2567 }
2568
2569 /*
2570  *      Drop the packet on the floor
2571  */
2572
2573 static int ip6_pkt_drop(struct sk_buff *skb, u8 code, int ipstats_mib_noroutes)
2574 {
2575         int type;
2576         struct dst_entry *dst = skb_dst(skb);
2577         switch (ipstats_mib_noroutes) {
2578         case IPSTATS_MIB_INNOROUTES:
2579                 type = ipv6_addr_type(&ipv6_hdr(skb)->daddr);
2580                 if (type == IPV6_ADDR_ANY) {
2581                         IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2582                                       IPSTATS_MIB_INADDRERRORS);
2583                         break;
2584                 }
2585                 /* FALLTHROUGH */
2586         case IPSTATS_MIB_OUTNOROUTES:
2587                 IP6_INC_STATS(dev_net(dst->dev), ip6_dst_idev(dst),
2588                               ipstats_mib_noroutes);
2589                 break;
2590         }
2591         icmpv6_send(skb, ICMPV6_DEST_UNREACH, code, 0);
2592         kfree_skb(skb);
2593         return 0;
2594 }
2595
2596 static int ip6_pkt_discard(struct sk_buff *skb)
2597 {
2598         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_INNOROUTES);
2599 }
2600
2601 static int ip6_pkt_discard_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2602 {
2603         skb->dev = skb_dst(skb)->dev;
2604         return ip6_pkt_drop(skb, ICMPV6_NOROUTE, IPSTATS_MIB_OUTNOROUTES);
2605 }
2606
2607 static int ip6_pkt_prohibit(struct sk_buff *skb)
2608 {
2609         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_INNOROUTES);
2610 }
2611
2612 static int ip6_pkt_prohibit_out(struct net *net, struct sock *sk, struct sk_buff *skb)
2613 {
2614         skb->dev = skb_dst(skb)->dev;
2615         return ip6_pkt_drop(skb, ICMPV6_ADM_PROHIBITED, IPSTATS_MIB_OUTNOROUTES);
2616 }
2617
2618 /*
2619  *      Allocate a dst for local (unicast / anycast) address.
2620  */
2621
2622 struct rt6_info *addrconf_dst_alloc(struct inet6_dev *idev,
2623                                     const struct in6_addr *addr,
2624                                     bool anycast)
2625 {
2626         u32 tb_id;
2627         struct net *net = dev_net(idev->dev);
2628         struct net_device *dev = net->loopback_dev;
2629         struct rt6_info *rt;
2630
2631         /* use the L3 master device as loopback for host routes if the device
2632          * is enslaved and the address is not link-local or multicast
2633          */
2634         if (!rt6_need_strict(addr))
2635                 dev = l3mdev_master_dev_rcu(idev->dev) ? : dev;
2636
2637         rt = ip6_dst_alloc(net, dev, DST_NOCOUNT);
2638         if (!rt)
2639                 return ERR_PTR(-ENOMEM);
2640
2641         in6_dev_hold(idev);
2642
2643         rt->dst.flags |= DST_HOST;
2644         rt->dst.input = ip6_input;
2645         rt->dst.output = ip6_output;
2646         rt->rt6i_idev = idev;
2647
2648         rt->rt6i_flags = RTF_UP | RTF_NONEXTHOP;
2649         if (anycast)
2650                 rt->rt6i_flags |= RTF_ANYCAST;
2651         else
2652                 rt->rt6i_flags |= RTF_LOCAL;
2653
2654         rt->rt6i_gateway  = *addr;
2655         rt->rt6i_dst.addr = *addr;
2656         rt->rt6i_dst.plen = 128;
2657         tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
2658         rt->rt6i_table = fib6_get_table(net, tb_id);
2659         rt->dst.flags |= DST_NOCACHE;
2660
2661         atomic_set(&rt->dst.__refcnt, 1);
2662
2663         return rt;
2664 }
2665
2666 /* remove deleted ip from prefsrc entries */
2667 struct arg_dev_net_ip {
2668         struct net_device *dev;
2669         struct net *net;
2670         struct in6_addr *addr;
2671 };
2672
2673 static int fib6_remove_prefsrc(struct rt6_info *rt, void *arg)
2674 {
2675         struct net_device *dev = ((struct arg_dev_net_ip *)arg)->dev;
2676         struct net *net = ((struct arg_dev_net_ip *)arg)->net;
2677         struct in6_addr *addr = ((struct arg_dev_net_ip *)arg)->addr;
2678
2679         if (((void *)rt->dst.dev == dev || !dev) &&
2680             rt != net->ipv6.ip6_null_entry &&
2681             ipv6_addr_equal(addr, &rt->rt6i_prefsrc.addr)) {
2682                 /* remove prefsrc entry */
2683                 rt->rt6i_prefsrc.plen = 0;
2684         }
2685         return 0;
2686 }
2687
2688 void rt6_remove_prefsrc(struct inet6_ifaddr *ifp)
2689 {
2690         struct net *net = dev_net(ifp->idev->dev);
2691         struct arg_dev_net_ip adni = {
2692                 .dev = ifp->idev->dev,
2693                 .net = net,
2694                 .addr = &ifp->addr,
2695         };
2696         fib6_clean_all(net, fib6_remove_prefsrc, &adni);
2697 }
2698
2699 #define RTF_RA_ROUTER           (RTF_ADDRCONF | RTF_DEFAULT | RTF_GATEWAY)
2700 #define RTF_CACHE_GATEWAY       (RTF_GATEWAY | RTF_CACHE)
2701
2702 /* Remove routers and update dst entries when a gateway turns into a host. */
2703 static int fib6_clean_tohost(struct rt6_info *rt, void *arg)
2704 {
2705         struct in6_addr *gateway = (struct in6_addr *)arg;
2706
2707         if ((((rt->rt6i_flags & RTF_RA_ROUTER) == RTF_RA_ROUTER) ||
2708              ((rt->rt6i_flags & RTF_CACHE_GATEWAY) == RTF_CACHE_GATEWAY)) &&
2709              ipv6_addr_equal(gateway, &rt->rt6i_gateway)) {
2710                 return -1;
2711         }
2712         return 0;
2713 }
2714
2715 void rt6_clean_tohost(struct net *net, struct in6_addr *gateway)
2716 {
2717         fib6_clean_all(net, fib6_clean_tohost, gateway);
2718 }
2719
2720 struct arg_dev_net {
2721         struct net_device *dev;
2722         struct net *net;
2723 };
2724
2725 static int fib6_ifdown(struct rt6_info *rt, void *arg)
2726 {
2727         const struct arg_dev_net *adn = arg;
2728         const struct net_device *dev = adn->dev;
2729
2730         if ((rt->dst.dev == dev || !dev) &&
2731             rt != adn->net->ipv6.ip6_null_entry)
2732                 return -1;
2733
2734         return 0;
2735 }
2736
2737 void rt6_ifdown(struct net *net, struct net_device *dev)
2738 {
2739         struct arg_dev_net adn = {
2740                 .dev = dev,
2741                 .net = net,
2742         };
2743
2744         fib6_clean_all(net, fib6_ifdown, &adn);
2745         icmp6_clean_all(fib6_ifdown, &adn);
2746         if (dev)
2747                 rt6_uncached_list_flush_dev(net, dev);
2748 }
2749
2750 struct rt6_mtu_change_arg {
2751         struct net_device *dev;
2752         unsigned int mtu;
2753 };
2754
2755 static int rt6_mtu_change_route(struct rt6_info *rt, void *p_arg)
2756 {
2757         struct rt6_mtu_change_arg *arg = (struct rt6_mtu_change_arg *) p_arg;
2758         struct inet6_dev *idev;
2759
2760         /* In IPv6, PMTU discovery is not optional,
2761            so the RTAX_MTU lock cannot disable it.
2762            We still use this lock to block changes
2763            caused by addrconf/ndisc.
2764         */
2765
2766         idev = __in6_dev_get(arg->dev);
2767         if (!idev)
2768                 return 0;
2769
2770         /* For an administrative MTU increase, there is no way to discover
2771            an IPv6 PMTU increase, so the PMTU increase should be applied here.
2772            Since RFC 1981 doesn't cover administrative MTU increases,
2773            updating the PMTU on increase is a MUST. (i.e. jumbo frame)
2774          */
2775         /*
2776            If the new MTU is less than the route PMTU, this new MTU will be the
2777            lowest MTU in the path; update the route PMTU to reflect the PMTU
2778            decrease.  If the new MTU is greater than the route PMTU, and the
2779            old MTU was the lowest MTU in the path, update the route PMTU to
2780            reflect the increase.  In that case, if another node's MTU is also
2781            the lowest in the path, a Packet Too Big message will trigger
2782            PMTU discovery again.
2783          */
2784         if (rt->dst.dev == arg->dev &&
2785             dst_metric_raw(&rt->dst, RTAX_MTU) &&
2786             !dst_metric_locked(&rt->dst, RTAX_MTU)) {
2787                 if (rt->rt6i_flags & RTF_CACHE) {
2788                         /* For RTF_CACHE with rt6i_pmtu == 0
2789                          * (i.e. a redirected route),
2790                          * the metrics of its rt->dst.from have already
2791                          * been updated.
2792                          */
2793                         if (rt->rt6i_pmtu && rt->rt6i_pmtu > arg->mtu)
2794                                 rt->rt6i_pmtu = arg->mtu;
2795                 } else if (dst_mtu(&rt->dst) >= arg->mtu ||
2796                            (dst_mtu(&rt->dst) < arg->mtu &&
2797                             dst_mtu(&rt->dst) == idev->cnf.mtu6)) {
2798                         dst_metric_set(&rt->dst, RTAX_MTU, arg->mtu);
2799                 }
2800         }
2801         return 0;
2802 }
2803
2804 void rt6_mtu_change(struct net_device *dev, unsigned int mtu)
2805 {
2806         struct rt6_mtu_change_arg arg = {
2807                 .dev = dev,
2808                 .mtu = mtu,
2809         };
2810
2811         fib6_clean_all(dev_net(dev), rt6_mtu_change_route, &arg);
2812 }
2813
2814 static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
2815         [RTA_GATEWAY]           = { .len = sizeof(struct in6_addr) },
2816         [RTA_PREFSRC]           = { .len = sizeof(struct in6_addr) },
2817         [RTA_OIF]               = { .type = NLA_U32 },
2818         [RTA_IIF]               = { .type = NLA_U32 },
2819         [RTA_PRIORITY]          = { .type = NLA_U32 },
2820         [RTA_METRICS]           = { .type = NLA_NESTED },
2821         [RTA_MULTIPATH]         = { .len = sizeof(struct rtnexthop) },
2822         [RTA_PREF]              = { .type = NLA_U8 },
2823         [RTA_ENCAP_TYPE]        = { .type = NLA_U16 },
2824         [RTA_ENCAP]             = { .type = NLA_NESTED },
2825         [RTA_EXPIRES]           = { .type = NLA_U32 },
2826         [RTA_TABLE]             = { .type = NLA_U32 },
2827 };
2828
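/* Parse an RTM_NEWROUTE/RTM_DELROUTE request: validate its attributes
 * against rtm_ipv6_policy and copy the rtmsg header plus RTA_* attributes
 * into a fib6_config for the add/del/multipath handlers below.
 */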
2829 static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
2830                               struct fib6_config *cfg)
2831 {
2832         struct rtmsg *rtm;
2833         struct nlattr *tb[RTA_MAX+1];
2834         unsigned int pref;
2835         int err;
2836
2837         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
2838         if (err < 0)
2839                 goto errout;
2840
2841         err = -EINVAL;
2842         rtm = nlmsg_data(nlh);
2843         memset(cfg, 0, sizeof(*cfg));
2844
2845         cfg->fc_table = rtm->rtm_table;
2846         cfg->fc_dst_len = rtm->rtm_dst_len;
2847         cfg->fc_src_len = rtm->rtm_src_len;
2848         cfg->fc_flags = RTF_UP;
2849         cfg->fc_protocol = rtm->rtm_protocol;
2850         cfg->fc_type = rtm->rtm_type;
2851
2852         if (rtm->rtm_type == RTN_UNREACHABLE ||
2853             rtm->rtm_type == RTN_BLACKHOLE ||
2854             rtm->rtm_type == RTN_PROHIBIT ||
2855             rtm->rtm_type == RTN_THROW)
2856                 cfg->fc_flags |= RTF_REJECT;
2857
2858         if (rtm->rtm_type == RTN_LOCAL)
2859                 cfg->fc_flags |= RTF_LOCAL;
2860
2861         if (rtm->rtm_flags & RTM_F_CLONED)
2862                 cfg->fc_flags |= RTF_CACHE;
2863
2864         cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
2865         cfg->fc_nlinfo.nlh = nlh;
2866         cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
2867
2868         if (tb[RTA_GATEWAY]) {
2869                 cfg->fc_gateway = nla_get_in6_addr(tb[RTA_GATEWAY]);
2870                 cfg->fc_flags |= RTF_GATEWAY;
2871         }
2872
2873         if (tb[RTA_DST]) {
2874                 int plen = (rtm->rtm_dst_len + 7) >> 3;
2875
2876                 if (nla_len(tb[RTA_DST]) < plen)
2877                         goto errout;
2878
2879                 nla_memcpy(&cfg->fc_dst, tb[RTA_DST], plen);
2880         }
2881
2882         if (tb[RTA_SRC]) {
2883                 int plen = (rtm->rtm_src_len + 7) >> 3;
2884
2885                 if (nla_len(tb[RTA_SRC]) < plen)
2886                         goto errout;
2887
2888                 nla_memcpy(&cfg->fc_src, tb[RTA_SRC], plen);
2889         }
2890
2891         if (tb[RTA_PREFSRC])
2892                 cfg->fc_prefsrc = nla_get_in6_addr(tb[RTA_PREFSRC]);
2893
2894         if (tb[RTA_OIF])
2895                 cfg->fc_ifindex = nla_get_u32(tb[RTA_OIF]);
2896
2897         if (tb[RTA_PRIORITY])
2898                 cfg->fc_metric = nla_get_u32(tb[RTA_PRIORITY]);
2899
2900         if (tb[RTA_METRICS]) {
2901                 cfg->fc_mx = nla_data(tb[RTA_METRICS]);
2902                 cfg->fc_mx_len = nla_len(tb[RTA_METRICS]);
2903         }
2904
2905         if (tb[RTA_TABLE])
2906                 cfg->fc_table = nla_get_u32(tb[RTA_TABLE]);
2907
2908         if (tb[RTA_MULTIPATH]) {
2909                 cfg->fc_mp = nla_data(tb[RTA_MULTIPATH]);
2910                 cfg->fc_mp_len = nla_len(tb[RTA_MULTIPATH]);
2911
2912                 err = lwtunnel_valid_encap_type_attr(cfg->fc_mp,
2913                                                      cfg->fc_mp_len);
2914                 if (err < 0)
2915                         goto errout;
2916         }
2917
2918         if (tb[RTA_PREF]) {
2919                 pref = nla_get_u8(tb[RTA_PREF]);
2920                 if (pref != ICMPV6_ROUTER_PREF_LOW &&
2921                     pref != ICMPV6_ROUTER_PREF_HIGH)
2922                         pref = ICMPV6_ROUTER_PREF_MEDIUM;
2923                 cfg->fc_flags |= RTF_PREF(pref);
2924         }
2925
2926         if (tb[RTA_ENCAP])
2927                 cfg->fc_encap = tb[RTA_ENCAP];
2928
2929         if (tb[RTA_ENCAP_TYPE]) {
2930                 cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
2931
2932                 err = lwtunnel_valid_encap_type(cfg->fc_encap_type);
2933                 if (err < 0)
2934                         goto errout;
2935         }
2936
2937         if (tb[RTA_EXPIRES]) {
2938                 unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
2939
2940                 if (addrconf_finite_timeout(timeout)) {
2941                         cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
2942                         cfg->fc_flags |= RTF_EXPIRES;
2943                 }
2944         }
2945
2946         err = 0;
2947 errout:
2948         return err;
2949 }
2950
2951 struct rt6_nh {
2952         struct rt6_info *rt6_info;
2953         struct fib6_config r_cfg;
2954         struct mx6_config mxc;
2955         struct list_head next;
2956 };
2957
2958 static void ip6_print_replace_route_err(struct list_head *rt6_nh_list)
2959 {
2960         struct rt6_nh *nh;
2961
2962         list_for_each_entry(nh, rt6_nh_list, next) {
2963                 pr_warn("IPV6: multipath route replace failed (check consistency of installed routes): %pI6 nexthop %pI6 ifi %d\n",
2964                         &nh->r_cfg.fc_dst, &nh->r_cfg.fc_gateway,
2965                         nh->r_cfg.fc_ifindex);
2966         }
2967 }
2968
2969 static int ip6_route_info_append(struct list_head *rt6_nh_list,
2970                                  struct rt6_info *rt, struct fib6_config *r_cfg)
2971 {
2972         struct rt6_nh *nh;
2973         int err = -EEXIST;
2974
2975         list_for_each_entry(nh, rt6_nh_list, next) {
2976                 /* check if rt6_info already exists */
2977                 if (rt6_duplicate_nexthop(nh->rt6_info, rt))
2978                         return err;
2979         }
2980
2981         nh = kzalloc(sizeof(*nh), GFP_KERNEL);
2982         if (!nh)
2983                 return -ENOMEM;
2984         nh->rt6_info = rt;
2985         err = ip6_convert_metrics(&nh->mxc, r_cfg);
2986         if (err) {
2987                 kfree(nh);
2988                 return err;
2989         }
2990         memcpy(&nh->r_cfg, r_cfg, sizeof(*r_cfg));
2991         list_add_tail(&nh->next, rt6_nh_list);
2992
2993         return 0;
2994 }
2995
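/* Add an RTA_MULTIPATH route.  Each rtnexthop entry is turned into its own
 * rt6_info (collected on rt6_nh_list) and inserted one by one; if any
 * insertion fails, the nexthops added so far are deleted again so the
 * operation is effectively all-or-nothing.
 */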
2996 static int ip6_route_multipath_add(struct fib6_config *cfg)
2997 {
2998         struct fib6_config r_cfg;
2999         struct rtnexthop *rtnh;
3000         struct rt6_info *rt;
3001         struct rt6_nh *err_nh;
3002         struct rt6_nh *nh, *nh_safe;
3003         int remaining;
3004         int attrlen;
3005         int err = 1;
3006         int nhn = 0;
3007         int replace = (cfg->fc_nlinfo.nlh &&
3008                        (cfg->fc_nlinfo.nlh->nlmsg_flags & NLM_F_REPLACE));
3009         LIST_HEAD(rt6_nh_list);
3010
3011         remaining = cfg->fc_mp_len;
3012         rtnh = (struct rtnexthop *)cfg->fc_mp;
3013
3014         /* Parse a Multipath Entry and build a list (rt6_nh_list) of
3015          * rt6_info structs, one per nexthop
3016          */
3017         while (rtnh_ok(rtnh, remaining)) {
3018                 memcpy(&r_cfg, cfg, sizeof(*cfg));
3019                 if (rtnh->rtnh_ifindex)
3020                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3021
3022                 attrlen = rtnh_attrlen(rtnh);
3023                 if (attrlen > 0) {
3024                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3025
3026                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3027                         if (nla) {
3028                                 r_cfg.fc_gateway = nla_get_in6_addr(nla);
3029                                 r_cfg.fc_flags |= RTF_GATEWAY;
3030                         }
3031                         r_cfg.fc_encap = nla_find(attrs, attrlen, RTA_ENCAP);
3032                         nla = nla_find(attrs, attrlen, RTA_ENCAP_TYPE);
3033                         if (nla)
3034                                 r_cfg.fc_encap_type = nla_get_u16(nla);
3035                 }
3036
3037                 rt = ip6_route_info_create(&r_cfg);
3038                 if (IS_ERR(rt)) {
3039                         err = PTR_ERR(rt);
3040                         rt = NULL;
3041                         goto cleanup;
3042                 }
3043
3044                 err = ip6_route_info_append(&rt6_nh_list, rt, &r_cfg);
3045                 if (err) {
3046                         dst_free(&rt->dst);
3047                         goto cleanup;
3048                 }
3049
3050                 rtnh = rtnh_next(rtnh, &remaining);
3051         }
3052
3053         err_nh = NULL;
3054         list_for_each_entry(nh, &rt6_nh_list, next) {
3055                 err = __ip6_ins_rt(nh->rt6_info, &cfg->fc_nlinfo, &nh->mxc);
3056                 /* nh->rt6_info is consumed or freed at this point; reset it to NULL */
3057                 nh->rt6_info = NULL;
3058                 if (err) {
3059                         if (replace && nhn)
3060                                 ip6_print_replace_route_err(&rt6_nh_list);
3061                         err_nh = nh;
3062                         goto add_errout;
3063                 }
3064
3065                 /* Because each nexthop is added as an individual route, we
3066                  * clear these flags after the first one: if there is a
3067                  * collision, adding the first nexthop has already failed
3068                  * (fib6_add_rt2node() rejected it); when replacing, the old
3069                  * nexthops have been replaced by the first new one and the
3070                  * remaining nexthops should simply be appended to it.
3071                  */
3072                 if (cfg->fc_nlinfo.nlh) {
3073                         cfg->fc_nlinfo.nlh->nlmsg_flags &= ~(NLM_F_EXCL |
3074                                                              NLM_F_REPLACE);
3075                         cfg->fc_nlinfo.nlh->nlmsg_flags |= NLM_F_CREATE;
3076                 }
3077                 nhn++;
3078         }
3079
3080         goto cleanup;
3081
3082 add_errout:
3083         /* Delete routes that were already added */
3084         list_for_each_entry(nh, &rt6_nh_list, next) {
3085                 if (err_nh == nh)
3086                         break;
3087                 ip6_route_del(&nh->r_cfg);
3088         }
3089
3090 cleanup:
3091         list_for_each_entry_safe(nh, nh_safe, &rt6_nh_list, next) {
3092                 if (nh->rt6_info)
3093                         dst_free(&nh->rt6_info->dst);
3094                 kfree(nh->mxc.mx);
3095                 list_del(&nh->next);
3096                 kfree(nh);
3097         }
3098
3099         return err;
3100 }
3101
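/* Walk the rtnexthop entries and delete the corresponding route for each
 * one; the last non-zero ip6_route_del() error is returned.
 */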
3102 static int ip6_route_multipath_del(struct fib6_config *cfg)
3103 {
3104         struct fib6_config r_cfg;
3105         struct rtnexthop *rtnh;
3106         int remaining;
3107         int attrlen;
3108         int err = 1, last_err = 0;
3109
3110         remaining = cfg->fc_mp_len;
3111         rtnh = (struct rtnexthop *)cfg->fc_mp;
3112
3113         /* Parse the multipath attribute, one nexthop at a time */
3114         while (rtnh_ok(rtnh, remaining)) {
3115                 memcpy(&r_cfg, cfg, sizeof(*cfg));
3116                 if (rtnh->rtnh_ifindex)
3117                         r_cfg.fc_ifindex = rtnh->rtnh_ifindex;
3118
3119                 attrlen = rtnh_attrlen(rtnh);
3120                 if (attrlen > 0) {
3121                         struct nlattr *nla, *attrs = rtnh_attrs(rtnh);
3122
3123                         nla = nla_find(attrs, attrlen, RTA_GATEWAY);
3124                         if (nla) {
3125                                 nla_memcpy(&r_cfg.fc_gateway, nla, 16);
3126                                 r_cfg.fc_flags |= RTF_GATEWAY;
3127                         }
3128                 }
3129                 err = ip6_route_del(&r_cfg);
3130                 if (err)
3131                         last_err = err;
3132
3133                 rtnh = rtnh_next(rtnh, &remaining);
3134         }
3135
3136         return last_err;
3137 }
3138
3139 static int inet6_rtm_delroute(struct sk_buff *skb, struct nlmsghdr *nlh)
3140 {
3141         struct fib6_config cfg;
3142         int err;
3143
3144         err = rtm_to_fib6_config(skb, nlh, &cfg);
3145         if (err < 0)
3146                 return err;
3147
3148         if (cfg.fc_mp)
3149                 return ip6_route_multipath_del(&cfg);
3150         else
3151                 return ip6_route_del(&cfg);
3152 }
3153
3154 static int inet6_rtm_newroute(struct sk_buff *skb, struct nlmsghdr *nlh)
3155 {
3156         struct fib6_config cfg;
3157         int err;
3158
3159         err = rtm_to_fib6_config(skb, nlh, &cfg);
3160         if (err < 0)
3161                 return err;
3162
3163         if (cfg.fc_mp)
3164                 return ip6_route_multipath_add(&cfg);
3165         else
3166                 return ip6_route_add(&cfg);
3167 }
3168
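/* Upper-bound estimate of the netlink message size that rt6_fill_node()
 * may emit for @rt; used to size the skb in inet6_rt_notify().
 */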
3169 static inline size_t rt6_nlmsg_size(struct rt6_info *rt)
3170 {
3171         return NLMSG_ALIGN(sizeof(struct rtmsg))
3172                + nla_total_size(16) /* RTA_SRC */
3173                + nla_total_size(16) /* RTA_DST */
3174                + nla_total_size(16) /* RTA_GATEWAY */
3175                + nla_total_size(16) /* RTA_PREFSRC */
3176                + nla_total_size(4) /* RTA_TABLE */
3177                + nla_total_size(4) /* RTA_IIF */
3178                + nla_total_size(4) /* RTA_OIF */
3179                + nla_total_size(4) /* RTA_PRIORITY */
3180                + RTAX_MAX * nla_total_size(4) /* RTA_METRICS */
3181                + nla_total_size(sizeof(struct rta_cacheinfo))
3182                + nla_total_size(TCP_CA_NAME_MAX) /* RTAX_CC_ALGO */
3183                + nla_total_size(1) /* RTA_PREF */
3184                + lwtunnel_get_encap_size(rt->dst.lwtstate);
3185 }
3186
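/* Encode @rt as an rtmsg plus attributes on @skb. Returns 0 on success,
 * 1 when the route is intentionally skipped (prefix-only dumps), or
 * -EMSGSIZE when the skb runs out of room.
 */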
3187 static int rt6_fill_node(struct net *net,
3188                          struct sk_buff *skb, struct rt6_info *rt,
3189                          struct in6_addr *dst, struct in6_addr *src,
3190                          int iif, int type, u32 portid, u32 seq,
3191                          int prefix, int nowait, unsigned int flags)
3192 {
3193         u32 metrics[RTAX_MAX];
3194         struct rtmsg *rtm;
3195         struct nlmsghdr *nlh;
3196         long expires;
3197         u32 table;
3198
3199         if (prefix) {   /* user wants prefix routes only */
3200                 if (!(rt->rt6i_flags & RTF_PREFIX_RT)) {
3201                         /* success since this is not a prefix route */
3202                         return 1;
3203                 }
3204         }
3205
3206         nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
3207         if (!nlh)
3208                 return -EMSGSIZE;
3209
3210         rtm = nlmsg_data(nlh);
3211         rtm->rtm_family = AF_INET6;
3212         rtm->rtm_dst_len = rt->rt6i_dst.plen;
3213         rtm->rtm_src_len = rt->rt6i_src.plen;
3214         rtm->rtm_tos = 0;
3215         if (rt->rt6i_table)
3216                 table = rt->rt6i_table->tb6_id;
3217         else
3218                 table = RT6_TABLE_UNSPEC;
3219         rtm->rtm_table = table < 256 ? table : RT_TABLE_COMPAT;
3220         if (nla_put_u32(skb, RTA_TABLE, table))
3221                 goto nla_put_failure;
3222         if (rt->rt6i_flags & RTF_REJECT) {
3223                 switch (rt->dst.error) {
3224                 case -EINVAL:
3225                         rtm->rtm_type = RTN_BLACKHOLE;
3226                         break;
3227                 case -EACCES:
3228                         rtm->rtm_type = RTN_PROHIBIT;
3229                         break;
3230                 case -EAGAIN:
3231                         rtm->rtm_type = RTN_THROW;
3232                         break;
3233                 default:
3234                         rtm->rtm_type = RTN_UNREACHABLE;
3235                         break;
3236                 }
3237         }
3238         else if (rt->rt6i_flags & RTF_LOCAL)
3239                 rtm->rtm_type = RTN_LOCAL;
3240         else if (rt->dst.dev && (rt->dst.dev->flags & IFF_LOOPBACK))
3241                 rtm->rtm_type = RTN_LOCAL;
3242         else
3243                 rtm->rtm_type = RTN_UNICAST;
3244         rtm->rtm_flags = 0;
3245         if (!netif_carrier_ok(rt->dst.dev)) {
3246                 rtm->rtm_flags |= RTNH_F_LINKDOWN;
3247                 if (rt->rt6i_idev->cnf.ignore_routes_with_linkdown)
3248                         rtm->rtm_flags |= RTNH_F_DEAD;
3249         }
3250         rtm->rtm_scope = RT_SCOPE_UNIVERSE;
3251         rtm->rtm_protocol = rt->rt6i_protocol;
3252         if (rt->rt6i_flags & RTF_DYNAMIC)
3253                 rtm->rtm_protocol = RTPROT_REDIRECT;
3254         else if (rt->rt6i_flags & RTF_ADDRCONF) {
3255                 if (rt->rt6i_flags & (RTF_DEFAULT | RTF_ROUTEINFO))
3256                         rtm->rtm_protocol = RTPROT_RA;
3257                 else
3258                         rtm->rtm_protocol = RTPROT_KERNEL;
3259         }
3260
3261         if (rt->rt6i_flags & RTF_CACHE)
3262                 rtm->rtm_flags |= RTM_F_CLONED;
3263
3264         if (dst) {
3265                 if (nla_put_in6_addr(skb, RTA_DST, dst))
3266                         goto nla_put_failure;
3267                 rtm->rtm_dst_len = 128;
3268         } else if (rtm->rtm_dst_len)
3269                 if (nla_put_in6_addr(skb, RTA_DST, &rt->rt6i_dst.addr))
3270                         goto nla_put_failure;
3271 #ifdef CONFIG_IPV6_SUBTREES
3272         if (src) {
3273                 if (nla_put_in6_addr(skb, RTA_SRC, src))
3274                         goto nla_put_failure;
3275                 rtm->rtm_src_len = 128;
3276         } else if (rtm->rtm_src_len &&
3277                    nla_put_in6_addr(skb, RTA_SRC, &rt->rt6i_src.addr))
3278                 goto nla_put_failure;
3279 #endif
3280         if (iif) {
3281 #ifdef CONFIG_IPV6_MROUTE
3282                 if (ipv6_addr_is_multicast(&rt->rt6i_dst.addr)) {
3283                         int err = ip6mr_get_route(net, skb, rtm, nowait,
3284                                                   portid);
3285
3286                         if (err <= 0) {
3287                                 if (!nowait) {
3288                                         if (err == 0)
3289                                                 return 0;
3290                                         goto nla_put_failure;
3291                                 } else {
3292                                         if (err == -EMSGSIZE)
3293                                                 goto nla_put_failure;
3294                                 }
3295                         }
3296                 } else
3297 #endif
3298                         if (nla_put_u32(skb, RTA_IIF, iif))
3299                                 goto nla_put_failure;
3300         } else if (dst) {
3301                 struct in6_addr saddr_buf;
3302                 if (ip6_route_get_saddr(net, rt, dst, 0, &saddr_buf) == 0 &&
3303                     nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3304                         goto nla_put_failure;
3305         }
3306
3307         if (rt->rt6i_prefsrc.plen) {
3308                 struct in6_addr saddr_buf;
3309                 saddr_buf = rt->rt6i_prefsrc.addr;
3310                 if (nla_put_in6_addr(skb, RTA_PREFSRC, &saddr_buf))
3311                         goto nla_put_failure;
3312         }
3313
3314         memcpy(metrics, dst_metrics_ptr(&rt->dst), sizeof(metrics));
3315         if (rt->rt6i_pmtu)
3316                 metrics[RTAX_MTU - 1] = rt->rt6i_pmtu;
3317         if (rtnetlink_put_metrics(skb, metrics) < 0)
3318                 goto nla_put_failure;
3319
3320         if (rt->rt6i_flags & RTF_GATEWAY) {
3321                 if (nla_put_in6_addr(skb, RTA_GATEWAY, &rt->rt6i_gateway) < 0)
3322                         goto nla_put_failure;
3323         }
3324
3325         if (rt->dst.dev &&
3326             nla_put_u32(skb, RTA_OIF, rt->dst.dev->ifindex))
3327                 goto nla_put_failure;
3328         if (nla_put_u32(skb, RTA_PRIORITY, rt->rt6i_metric))
3329                 goto nla_put_failure;
3330
3331         expires = (rt->rt6i_flags & RTF_EXPIRES) ? rt->dst.expires - jiffies : 0;
3332
3333         if (rtnl_put_cacheinfo(skb, &rt->dst, 0, expires, rt->dst.error) < 0)
3334                 goto nla_put_failure;
3335
3336         if (nla_put_u8(skb, RTA_PREF, IPV6_EXTRACT_PREF(rt->rt6i_flags)))
3337                 goto nla_put_failure;
3338
3339         if (lwtunnel_fill_encap(skb, rt->dst.lwtstate) < 0)
3340                 goto nla_put_failure;
3341
3342         nlmsg_end(skb, nlh);
3343         return 0;
3344
3345 nla_put_failure:
3346         nlmsg_cancel(skb, nlh);
3347         return -EMSGSIZE;
3348 }
3349
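/* fib6 tree-walk callback used for route dumps; honours the RTM_F_PREFIX
 * request flag by letting rt6_fill_node() skip non-prefix routes.
 */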
3350 int rt6_dump_route(struct rt6_info *rt, void *p_arg)
3351 {
3352         struct rt6_rtnl_dump_arg *arg = (struct rt6_rtnl_dump_arg *) p_arg;
3353         int prefix;
3354
3355         if (nlmsg_len(arg->cb->nlh) >= sizeof(struct rtmsg)) {
3356                 struct rtmsg *rtm = nlmsg_data(arg->cb->nlh);
3357                 prefix = (rtm->rtm_flags & RTM_F_PREFIX) != 0;
3358         } else
3359                 prefix = 0;
3360
3361         return rt6_fill_node(arg->net,
3362                      arg->skb, rt, NULL, NULL, 0, RTM_NEWROUTE,
3363                      NETLINK_CB(arg->cb->skb).portid, arg->cb->nlh->nlmsg_seq,
3364                      prefix, 0, NLM_F_MULTI);
3365 }
3366
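/* RTM_GETROUTE (single route) handler: build a flow from RTA_SRC, RTA_DST,
 * RTA_IIF, RTA_OIF and RTA_MARK, look the route up and unicast the result
 * back to the requesting socket.
 */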
3367 static int inet6_rtm_getroute(struct sk_buff *in_skb, struct nlmsghdr *nlh)
3368 {
3369         struct net *net = sock_net(in_skb->sk);
3370         struct nlattr *tb[RTA_MAX+1];
3371         struct rt6_info *rt;
3372         struct sk_buff *skb;
3373         struct rtmsg *rtm;
3374         struct flowi6 fl6;
3375         int err, iif = 0, oif = 0;
3376
3377         err = nlmsg_parse(nlh, sizeof(*rtm), tb, RTA_MAX, rtm_ipv6_policy);
3378         if (err < 0)
3379                 goto errout;
3380
3381         err = -EINVAL;
3382         memset(&fl6, 0, sizeof(fl6));
3383         rtm = nlmsg_data(nlh);
3384         fl6.flowlabel = ip6_make_flowinfo(rtm->rtm_tos, 0);
3385
3386         if (tb[RTA_SRC]) {
3387                 if (nla_len(tb[RTA_SRC]) < sizeof(struct in6_addr))
3388                         goto errout;
3389
3390                 fl6.saddr = *(struct in6_addr *)nla_data(tb[RTA_SRC]);
3391         }
3392
3393         if (tb[RTA_DST]) {
3394                 if (nla_len(tb[RTA_DST]) < sizeof(struct in6_addr))
3395                         goto errout;
3396
3397                 fl6.daddr = *(struct in6_addr *)nla_data(tb[RTA_DST]);
3398         }
3399
3400         if (tb[RTA_IIF])
3401                 iif = nla_get_u32(tb[RTA_IIF]);
3402
3403         if (tb[RTA_OIF])
3404                 oif = nla_get_u32(tb[RTA_OIF]);
3405
3406         if (tb[RTA_MARK])
3407                 fl6.flowi6_mark = nla_get_u32(tb[RTA_MARK]);
3408
3409         if (iif) {
3410                 struct net_device *dev;
3411                 int flags = 0;
3412
3413                 dev = __dev_get_by_index(net, iif);
3414                 if (!dev) {
3415                         err = -ENODEV;
3416                         goto errout;
3417                 }
3418
3419                 fl6.flowi6_iif = iif;
3420
3421                 if (!ipv6_addr_any(&fl6.saddr))
3422                         flags |= RT6_LOOKUP_F_HAS_SADDR;
3423
3424                 rt = (struct rt6_info *)ip6_route_input_lookup(net, dev, &fl6,
3425                                                                flags);
3426         } else {
3427                 fl6.flowi6_oif = oif;
3428
3429                 rt = (struct rt6_info *)ip6_route_output(net, NULL, &fl6);
3430         }
3431
3432         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
3433         if (!skb) {
3434                 ip6_rt_put(rt);
3435                 err = -ENOBUFS;
3436                 goto errout;
3437         }
3438
3439         /* Reserve room for dummy headers; this skb can pass
3440          * through a good chunk of the routing engine.
3441          */
3442         skb_reset_mac_header(skb);
3443         skb_reserve(skb, MAX_HEADER + sizeof(struct ipv6hdr));
3444
3445         skb_dst_set(skb, &rt->dst);
3446
3447         err = rt6_fill_node(net, skb, rt, &fl6.daddr, &fl6.saddr, iif,
3448                             RTM_NEWROUTE, NETLINK_CB(in_skb).portid,
3449                             nlh->nlmsg_seq, 0, 0, 0);
3450         if (err < 0) {
3451                 kfree_skb(skb);
3452                 goto errout;
3453         }
3454
3455         err = rtnl_unicast(skb, net, NETLINK_CB(in_skb).portid);
3456 errout:
3457         return err;
3458 }
3459
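/* Notify RTNLGRP_IPV6_ROUTE listeners about a route change; on allocation
 * or fill failure the error is recorded with rtnl_set_sk_err() so listeners
 * can detect the lost notification and resynchronize.
 */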
3460 void inet6_rt_notify(int event, struct rt6_info *rt, struct nl_info *info,
3461                      unsigned int nlm_flags)
3462 {
3463         struct sk_buff *skb;
3464         struct net *net = info->nl_net;
3465         u32 seq;
3466         int err;
3467
3468         err = -ENOBUFS;
3469         seq = info->nlh ? info->nlh->nlmsg_seq : 0;
3470
3471         skb = nlmsg_new(rt6_nlmsg_size(rt), gfp_any());
3472         if (!skb)
3473                 goto errout;
3474
3475         err = rt6_fill_node(net, skb, rt, NULL, NULL, 0,
3476                                 event, info->portid, seq, 0, 0, nlm_flags);
3477         if (err < 0) {
3478                 /* -EMSGSIZE implies BUG in rt6_nlmsg_size() */
3479                 WARN_ON(err == -EMSGSIZE);
3480                 kfree_skb(skb);
3481                 goto errout;
3482         }
3483         rtnl_notify(skb, net, info->portid, RTNLGRP_IPV6_ROUTE,
3484                     info->nlh, gfp_any());
3485         return;
3486 errout:
3487         if (err < 0)
3488                 rtnl_set_sk_err(net, RTNLGRP_IPV6_ROUTE, err);
3489 }
3490
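/* Attach the per-netns null/prohibit/blackhole routes to the loopback
 * device when it registers, and drop those idev references again on
 * unregister.
 */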
3491 static int ip6_route_dev_notify(struct notifier_block *this,
3492                                 unsigned long event, void *ptr)
3493 {
3494         struct net_device *dev = netdev_notifier_info_to_dev(ptr);
3495         struct net *net = dev_net(dev);
3496
3497         if (!(dev->flags & IFF_LOOPBACK))
3498                 return NOTIFY_OK;
3499
3500         if (event == NETDEV_REGISTER) {
3501                 net->ipv6.ip6_null_entry->dst.dev = dev;
3502                 net->ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(dev);
3503 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3504                 net->ipv6.ip6_prohibit_entry->dst.dev = dev;
3505                 net->ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(dev);
3506                 net->ipv6.ip6_blk_hole_entry->dst.dev = dev;
3507                 net->ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(dev);
3508 #endif
3509         } else if (event == NETDEV_UNREGISTER &&
3510                    dev->reg_state != NETREG_UNREGISTERED) {
3511                 /* NETDEV_UNREGISTER can be fired multiple times by
3512                  * netdev_wait_allrefs(); make sure we drop the refs once.
3513                  */
3514                 in6_dev_put(net->ipv6.ip6_null_entry->rt6i_idev);
3515 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3516                 in6_dev_put(net->ipv6.ip6_prohibit_entry->rt6i_idev);
3517                 in6_dev_put(net->ipv6.ip6_blk_hole_entry->rt6i_idev);
3518 #endif
3519         }
3520
3521         return NOTIFY_OK;
3522 }
3523
3524 /*
3525  *      /proc
3526  */
3527
3528 #ifdef CONFIG_PROC_FS
3529
3530 static const struct file_operations ipv6_route_proc_fops = {
3531         .owner          = THIS_MODULE,
3532         .open           = ipv6_route_open,
3533         .read           = seq_read,
3534         .llseek         = seq_lseek,
3535         .release        = seq_release_net,
3536 };
3537
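/* /proc/net/rt6_stats: seven space-separated hex fields, e.g. a line such
 * as "0040 0044 0000 004c 0000 0000 0000" (illustrative values only).
 */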
3538 static int rt6_stats_seq_show(struct seq_file *seq, void *v)
3539 {
3540         struct net *net = (struct net *)seq->private;
3541         seq_printf(seq, "%04x %04x %04x %04x %04x %04x %04x\n",
3542                    net->ipv6.rt6_stats->fib_nodes,
3543                    net->ipv6.rt6_stats->fib_route_nodes,
3544                    net->ipv6.rt6_stats->fib_rt_alloc,
3545                    net->ipv6.rt6_stats->fib_rt_entries,
3546                    net->ipv6.rt6_stats->fib_rt_cache,
3547                    dst_entries_get_slow(&net->ipv6.ip6_dst_ops),
3548                    net->ipv6.rt6_stats->fib_discarded_routes);
3549
3550         return 0;
3551 }
3552
3553 static int rt6_stats_seq_open(struct inode *inode, struct file *file)
3554 {
3555         return single_open_net(inode, file, rt6_stats_seq_show);
3556 }
3557
3558 static const struct file_operations rt6_stats_seq_fops = {
3559         .owner   = THIS_MODULE,
3560         .open    = rt6_stats_seq_open,
3561         .read    = seq_read,
3562         .llseek  = seq_lseek,
3563         .release = single_release_net,
3564 };
3565 #endif  /* CONFIG_PROC_FS */
3566
3567 #ifdef CONFIG_SYSCTL
3568
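/* Write-only handler behind /proc/sys/net/ipv6/route/flush: a write kicks
 * fib6_run_gc() for the netns carried in the ctl_table's extra1, e.g.
 * (illustrative invocation, not from this file):
 *   sysctl -w net.ipv6.route.flush=1
 */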
3569 static
3570 int ipv6_sysctl_rtcache_flush(struct ctl_table *ctl, int write,
3571                               void __user *buffer, size_t *lenp, loff_t *ppos)
3572 {
3573         struct net *net;
3574         int delay;
3575         if (!write)
3576                 return -EINVAL;
3577
3578         net = (struct net *)ctl->extra1;
3579         delay = net->ipv6.sysctl.flush_delay;
3580         proc_dointvec(ctl, write, buffer, lenp, ppos);
3581         fib6_run_gc(delay <= 0 ? 0 : (unsigned long)delay, net, delay > 0);
3582         return 0;
3583 }
3584
3585 struct ctl_table ipv6_route_table_template[] = {
3586         {
3587                 .procname       =       "flush",
3588                 .data           =       &init_net.ipv6.sysctl.flush_delay,
3589                 .maxlen         =       sizeof(int),
3590                 .mode           =       0200,
3591                 .proc_handler   =       ipv6_sysctl_rtcache_flush
3592         },
3593         {
3594                 .procname       =       "gc_thresh",
3595                 .data           =       &ip6_dst_ops_template.gc_thresh,
3596                 .maxlen         =       sizeof(int),
3597                 .mode           =       0644,
3598                 .proc_handler   =       proc_dointvec,
3599         },
3600         {
3601                 .procname       =       "max_size",
3602                 .data           =       &init_net.ipv6.sysctl.ip6_rt_max_size,
3603                 .maxlen         =       sizeof(int),
3604                 .mode           =       0644,
3605                 .proc_handler   =       proc_dointvec,
3606         },
3607         {
3608                 .procname       =       "gc_min_interval",
3609                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3610                 .maxlen         =       sizeof(int),
3611                 .mode           =       0644,
3612                 .proc_handler   =       proc_dointvec_jiffies,
3613         },
3614         {
3615                 .procname       =       "gc_timeout",
3616                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_timeout,
3617                 .maxlen         =       sizeof(int),
3618                 .mode           =       0644,
3619                 .proc_handler   =       proc_dointvec_jiffies,
3620         },
3621         {
3622                 .procname       =       "gc_interval",
3623                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_interval,
3624                 .maxlen         =       sizeof(int),
3625                 .mode           =       0644,
3626                 .proc_handler   =       proc_dointvec_jiffies,
3627         },
3628         {
3629                 .procname       =       "gc_elasticity",
3630                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_elasticity,
3631                 .maxlen         =       sizeof(int),
3632                 .mode           =       0644,
3633                 .proc_handler   =       proc_dointvec,
3634         },
3635         {
3636                 .procname       =       "mtu_expires",
3637                 .data           =       &init_net.ipv6.sysctl.ip6_rt_mtu_expires,
3638                 .maxlen         =       sizeof(int),
3639                 .mode           =       0644,
3640                 .proc_handler   =       proc_dointvec_jiffies,
3641         },
3642         {
3643                 .procname       =       "min_adv_mss",
3644                 .data           =       &init_net.ipv6.sysctl.ip6_rt_min_advmss,
3645                 .maxlen         =       sizeof(int),
3646                 .mode           =       0644,
3647                 .proc_handler   =       proc_dointvec,
3648         },
3649         {
3650                 .procname       =       "gc_min_interval_ms",
3651                 .data           =       &init_net.ipv6.sysctl.ip6_rt_gc_min_interval,
3652                 .maxlen         =       sizeof(int),
3653                 .mode           =       0644,
3654                 .proc_handler   =       proc_dointvec_ms_jiffies,
3655         },
3656         { }
3657 };
3658
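/* Duplicate the template table for @net and repoint each entry at the
 * per-netns data; the indices below must match the template order above.
 */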
3659 struct ctl_table * __net_init ipv6_route_sysctl_init(struct net *net)
3660 {
3661         struct ctl_table *table;
3662
3663         table = kmemdup(ipv6_route_table_template,
3664                         sizeof(ipv6_route_table_template),
3665                         GFP_KERNEL);
3666
3667         if (table) {
3668                 table[0].data = &net->ipv6.sysctl.flush_delay;
3669                 table[0].extra1 = net;
3670                 table[1].data = &net->ipv6.ip6_dst_ops.gc_thresh;
3671                 table[2].data = &net->ipv6.sysctl.ip6_rt_max_size;
3672                 table[3].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3673                 table[4].data = &net->ipv6.sysctl.ip6_rt_gc_timeout;
3674                 table[5].data = &net->ipv6.sysctl.ip6_rt_gc_interval;
3675                 table[6].data = &net->ipv6.sysctl.ip6_rt_gc_elasticity;
3676                 table[7].data = &net->ipv6.sysctl.ip6_rt_mtu_expires;
3677                 table[8].data = &net->ipv6.sysctl.ip6_rt_min_advmss;
3678                 table[9].data = &net->ipv6.sysctl.ip6_rt_gc_min_interval;
3679
3680                 /* Don't export sysctls to unprivileged users */
3681                 if (net->user_ns != &init_user_ns)
3682                         table[0].procname = NULL;
3683         }
3684
3685         return table;
3686 }
3687 #endif
3688
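/* Per-netns initialisation: clone the dst_ops template, allocate the null
 * (and, with multiple tables, prohibit/blackhole) template routes and seed
 * the routing sysctl defaults.
 */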
3689 static int __net_init ip6_route_net_init(struct net *net)
3690 {
3691         int ret = -ENOMEM;
3692
3693         memcpy(&net->ipv6.ip6_dst_ops, &ip6_dst_ops_template,
3694                sizeof(net->ipv6.ip6_dst_ops));
3695
3696         if (dst_entries_init(&net->ipv6.ip6_dst_ops) < 0)
3697                 goto out_ip6_dst_ops;
3698
3699         net->ipv6.ip6_null_entry = kmemdup(&ip6_null_entry_template,
3700                                            sizeof(*net->ipv6.ip6_null_entry),
3701                                            GFP_KERNEL);
3702         if (!net->ipv6.ip6_null_entry)
3703                 goto out_ip6_dst_entries;
3704         net->ipv6.ip6_null_entry->dst.path =
3705                 (struct dst_entry *)net->ipv6.ip6_null_entry;
3706         net->ipv6.ip6_null_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3707         dst_init_metrics(&net->ipv6.ip6_null_entry->dst,
3708                          ip6_template_metrics, true);
3709
3710 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3711         net->ipv6.ip6_prohibit_entry = kmemdup(&ip6_prohibit_entry_template,
3712                                                sizeof(*net->ipv6.ip6_prohibit_entry),
3713                                                GFP_KERNEL);
3714         if (!net->ipv6.ip6_prohibit_entry)
3715                 goto out_ip6_null_entry;
3716         net->ipv6.ip6_prohibit_entry->dst.path =
3717                 (struct dst_entry *)net->ipv6.ip6_prohibit_entry;
3718         net->ipv6.ip6_prohibit_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3719         dst_init_metrics(&net->ipv6.ip6_prohibit_entry->dst,
3720                          ip6_template_metrics, true);
3721
3722         net->ipv6.ip6_blk_hole_entry = kmemdup(&ip6_blk_hole_entry_template,
3723                                                sizeof(*net->ipv6.ip6_blk_hole_entry),
3724                                                GFP_KERNEL);
3725         if (!net->ipv6.ip6_blk_hole_entry)
3726                 goto out_ip6_prohibit_entry;
3727         net->ipv6.ip6_blk_hole_entry->dst.path =
3728                 (struct dst_entry *)net->ipv6.ip6_blk_hole_entry;
3729         net->ipv6.ip6_blk_hole_entry->dst.ops = &net->ipv6.ip6_dst_ops;
3730         dst_init_metrics(&net->ipv6.ip6_blk_hole_entry->dst,
3731                          ip6_template_metrics, true);
3732 #endif
3733
3734         net->ipv6.sysctl.flush_delay = 0;
3735         net->ipv6.sysctl.ip6_rt_max_size = 4096;
3736         net->ipv6.sysctl.ip6_rt_gc_min_interval = HZ / 2;
3737         net->ipv6.sysctl.ip6_rt_gc_timeout = 60*HZ;
3738         net->ipv6.sysctl.ip6_rt_gc_interval = 30*HZ;
3739         net->ipv6.sysctl.ip6_rt_gc_elasticity = 9;
3740         net->ipv6.sysctl.ip6_rt_mtu_expires = 10*60*HZ;
3741         net->ipv6.sysctl.ip6_rt_min_advmss = IPV6_MIN_MTU - 20 - 40;
3742
3743         net->ipv6.ip6_rt_gc_expire = 30*HZ;
3744
3745         ret = 0;
3746 out:
3747         return ret;
3748
3749 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3750 out_ip6_prohibit_entry:
3751         kfree(net->ipv6.ip6_prohibit_entry);
3752 out_ip6_null_entry:
3753         kfree(net->ipv6.ip6_null_entry);
3754 #endif
3755 out_ip6_dst_entries:
3756         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3757 out_ip6_dst_ops:
3758         goto out;
3759 }
3760
3761 static void __net_exit ip6_route_net_exit(struct net *net)
3762 {
3763         kfree(net->ipv6.ip6_null_entry);
3764 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3765         kfree(net->ipv6.ip6_prohibit_entry);
3766         kfree(net->ipv6.ip6_blk_hole_entry);
3767 #endif
3768         dst_entries_destroy(&net->ipv6.ip6_dst_ops);
3769 }
3770
3771 static int __net_init ip6_route_net_init_late(struct net *net)
3772 {
3773 #ifdef CONFIG_PROC_FS
3774         proc_create("ipv6_route", 0, net->proc_net, &ipv6_route_proc_fops);
3775         proc_create("rt6_stats", S_IRUGO, net->proc_net, &rt6_stats_seq_fops);
3776 #endif
3777         return 0;
3778 }
3779
3780 static void __net_exit ip6_route_net_exit_late(struct net *net)
3781 {
3782 #ifdef CONFIG_PROC_FS
3783         remove_proc_entry("ipv6_route", net->proc_net);
3784         remove_proc_entry("rt6_stats", net->proc_net);
3785 #endif
3786 }
3787
3788 static struct pernet_operations ip6_route_net_ops = {
3789         .init = ip6_route_net_init,
3790         .exit = ip6_route_net_exit,
3791 };
3792
3793 static int __net_init ipv6_inetpeer_init(struct net *net)
3794 {
3795         struct inet_peer_base *bp = kmalloc(sizeof(*bp), GFP_KERNEL);
3796
3797         if (!bp)
3798                 return -ENOMEM;
3799         inet_peer_base_init(bp);
3800         net->ipv6.peers = bp;
3801         return 0;
3802 }
3803
3804 static void __net_exit ipv6_inetpeer_exit(struct net *net)
3805 {
3806         struct inet_peer_base *bp = net->ipv6.peers;
3807
3808         net->ipv6.peers = NULL;
3809         inetpeer_invalidate_tree(bp);
3810         kfree(bp);
3811 }
3812
3813 static struct pernet_operations ipv6_inetpeer_ops = {
3814         .init   =       ipv6_inetpeer_init,
3815         .exit   =       ipv6_inetpeer_exit,
3816 };
3817
3818 static struct pernet_operations ip6_route_net_late_ops = {
3819         .init = ip6_route_net_init_late,
3820         .exit = ip6_route_net_exit_late,
3821 };
3822
3823 static struct notifier_block ip6_route_dev_notifier = {
3824         .notifier_call = ip6_route_dev_notify,
3825         .priority = ADDRCONF_NOTIFY_PRIORITY - 10,
3826 };
3827
3828 void __init ip6_route_init_special_entries(void)
3829 {
3830         /* The loopback device is registered before this code runs, so the
3831          * loopback reference in rt6_info is not taken automatically; take
3832          * it manually for init_net. */
3833         init_net.ipv6.ip6_null_entry->dst.dev = init_net.loopback_dev;
3834         init_net.ipv6.ip6_null_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3835 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
3836         init_net.ipv6.ip6_prohibit_entry->dst.dev = init_net.loopback_dev;
3837         init_net.ipv6.ip6_prohibit_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3838         init_net.ipv6.ip6_blk_hole_entry->dst.dev = init_net.loopback_dev;
3839         init_net.ipv6.ip6_blk_hole_entry->rt6i_idev = in6_dev_get(init_net.loopback_dev);
3840 #endif
3841 }
3842
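/* Boot-time initialisation: set up the dst kmem cache, pernet subsystems,
 * FIB core, xfrm and policy rules, rtnetlink handlers, the netdevice
 * notifier and the per-cpu uncached route lists. Error paths unwind in
 * reverse order of registration.
 */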
3843 int __init ip6_route_init(void)
3844 {
3845         int ret;
3846         int cpu;
3847
3848         ret = -ENOMEM;
3849         ip6_dst_ops_template.kmem_cachep =
3850                 kmem_cache_create("ip6_dst_cache", sizeof(struct rt6_info), 0,
3851                                   SLAB_HWCACHE_ALIGN, NULL);
3852         if (!ip6_dst_ops_template.kmem_cachep)
3853                 goto out;
3854
3855         ret = dst_entries_init(&ip6_dst_blackhole_ops);
3856         if (ret)
3857                 goto out_kmem_cache;
3858
3859         ret = register_pernet_subsys(&ipv6_inetpeer_ops);
3860         if (ret)
3861                 goto out_dst_entries;
3862
3863         ret = register_pernet_subsys(&ip6_route_net_ops);
3864         if (ret)
3865                 goto out_register_inetpeer;
3866
3867         ip6_dst_blackhole_ops.kmem_cachep = ip6_dst_ops_template.kmem_cachep;
3868
3869         ret = fib6_init();
3870         if (ret)
3871                 goto out_register_subsys;
3872
3873         ret = xfrm6_init();
3874         if (ret)
3875                 goto out_fib6_init;
3876
3877         ret = fib6_rules_init();
3878         if (ret)
3879                 goto xfrm6_init;
3880
3881         ret = register_pernet_subsys(&ip6_route_net_late_ops);
3882         if (ret)
3883                 goto fib6_rules_init;
3884
3885         ret = -ENOBUFS;
3886         if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, NULL) ||
3887             __rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, NULL) ||
3888             __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, NULL))
3889                 goto out_register_late_subsys;
3890
3891         ret = register_netdevice_notifier(&ip6_route_dev_notifier);
3892         if (ret)
3893                 goto out_register_late_subsys;
3894
3895         for_each_possible_cpu(cpu) {
3896                 struct uncached_list *ul = per_cpu_ptr(&rt6_uncached_list, cpu);
3897
3898                 INIT_LIST_HEAD(&ul->head);
3899                 spin_lock_init(&ul->lock);
3900         }
3901
3902 out:
3903         return ret;
3904
3905 out_register_late_subsys:
3906         unregister_pernet_subsys(&ip6_route_net_late_ops);
3907 fib6_rules_init:
3908         fib6_rules_cleanup();
3909 xfrm6_init:
3910         xfrm6_fini();
3911 out_fib6_init:
3912         fib6_gc_cleanup();
3913 out_register_subsys:
3914         unregister_pernet_subsys(&ip6_route_net_ops);
3915 out_register_inetpeer:
3916         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3917 out_dst_entries:
3918         dst_entries_destroy(&ip6_dst_blackhole_ops);
3919 out_kmem_cache:
3920         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3921         goto out;
3922 }
3923
3924 void ip6_route_cleanup(void)
3925 {
3926         unregister_netdevice_notifier(&ip6_route_dev_notifier);
3927         unregister_pernet_subsys(&ip6_route_net_late_ops);
3928         fib6_rules_cleanup();
3929         xfrm6_fini();
3930         fib6_gc_cleanup();
3931         unregister_pernet_subsys(&ipv6_inetpeer_ops);
3932         unregister_pernet_subsys(&ip6_route_net_ops);
3933         dst_entries_destroy(&ip6_dst_blackhole_ops);
3934         kmem_cache_destroy(ip6_dst_ops_template.kmem_cachep);
3935 }