GNU Linux-libre 5.4.207-gnu1
[releases.git] / net / ipv6 / ip6_fib.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  *      Linux INET6 implementation
4  *      Forwarding Information Database
5  *
6  *      Authors:
7  *      Pedro Roque             <roque@di.fc.ul.pt>
8  *
9  *      Changes:
10  *      Yuji SEKIYA @USAGI:     Support default route on router node;
11  *                              remove ip6_null_entry from the top of
12  *                              routing table.
13  *      Ville Nuorvala:         Fixed routing subtrees.
14  */
15
16 #define pr_fmt(fmt) "IPv6: " fmt
17
18 #include <linux/errno.h>
19 #include <linux/types.h>
20 #include <linux/net.h>
21 #include <linux/route.h>
22 #include <linux/netdevice.h>
23 #include <linux/in6.h>
24 #include <linux/init.h>
25 #include <linux/list.h>
26 #include <linux/slab.h>
27
28 #include <net/ip.h>
29 #include <net/ipv6.h>
30 #include <net/ndisc.h>
31 #include <net/addrconf.h>
32 #include <net/lwtunnel.h>
33 #include <net/fib_notifier.h>
34
35 #include <net/ip6_fib.h>
36 #include <net/ip6_route.h>
37
38 static struct kmem_cache *fib6_node_kmem __read_mostly;
39
40 struct fib6_cleaner {
41         struct fib6_walker w;
42         struct net *net;
43         int (*func)(struct fib6_info *, void *arg);
44         int sernum;
45         void *arg;
46         bool skip_notify;
47 };
48
49 #ifdef CONFIG_IPV6_SUBTREES
50 #define FWS_INIT FWS_S
51 #else
52 #define FWS_INIT FWS_L
53 #endif
54
55 static struct fib6_info *fib6_find_prefix(struct net *net,
56                                          struct fib6_table *table,
57                                          struct fib6_node *fn);
58 static struct fib6_node *fib6_repair_tree(struct net *net,
59                                           struct fib6_table *table,
60                                           struct fib6_node *fn);
61 static int fib6_walk(struct net *net, struct fib6_walker *w);
62 static int fib6_walk_continue(struct fib6_walker *w);
63
64 /*
65  *      A routing update causes an increase of the serial number on the
66  *      affected subtree. This allows for cached routes to be asynchronously
67  *      tested when modifications are made to the destination cache as a
68  *      result of redirects, path MTU changes, etc.
69  */
70
71 static void fib6_gc_timer_cb(struct timer_list *t);
72
73 #define FOR_WALKERS(net, w) \
74         list_for_each_entry(w, &(net)->ipv6.fib6_walkers, lh)
75
76 static void fib6_walker_link(struct net *net, struct fib6_walker *w)
77 {
78         write_lock_bh(&net->ipv6.fib6_walker_lock);
79         list_add(&w->lh, &net->ipv6.fib6_walkers);
80         write_unlock_bh(&net->ipv6.fib6_walker_lock);
81 }
82
83 static void fib6_walker_unlink(struct net *net, struct fib6_walker *w)
84 {
85         write_lock_bh(&net->ipv6.fib6_walker_lock);
86         list_del(&w->lh);
87         write_unlock_bh(&net->ipv6.fib6_walker_lock);
88 }
89
90 static int fib6_new_sernum(struct net *net)
91 {
92         int new, old;
93
94         do {
95                 old = atomic_read(&net->ipv6.fib6_sernum);
96                 new = old < INT_MAX ? old + 1 : 1;
97         } while (atomic_cmpxchg(&net->ipv6.fib6_sernum,
98                                 old, new) != old);
99         return new;
100 }
101
102 enum {
103         FIB6_NO_SERNUM_CHANGE = 0,
104 };
105
106 void fib6_update_sernum(struct net *net, struct fib6_info *f6i)
107 {
108         struct fib6_node *fn;
109
110         fn = rcu_dereference_protected(f6i->fib6_node,
111                         lockdep_is_held(&f6i->fib6_table->tb6_lock));
112         if (fn)
113                 WRITE_ONCE(fn->fn_sernum, fib6_new_sernum(net));
114 }
115
116 /*
117  *      Auxiliary address test functions for the radix tree.
118  *
119  *      These assume a 32bit processor (although it will work on
120  *      64bit processors)
121  */
122
123 /*
124  *      test bit
125  */
126 #if defined(__LITTLE_ENDIAN)
127 # define BITOP_BE32_SWIZZLE     (0x1F & ~7)
128 #else
129 # define BITOP_BE32_SWIZZLE     0
130 #endif
131
132 static __be32 addr_bit_set(const void *token, int fn_bit)
133 {
134         const __be32 *addr = token;
135         /*
136          * Here,
137          *      1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)
138          * is optimized version of
139          *      htonl(1 << ((~fn_bit)&0x1F))
140          * See include/asm-generic/bitops/le.h.
141          */
142         return (__force __be32)(1 << ((~fn_bit ^ BITOP_BE32_SWIZZLE) & 0x1f)) &
143                addr[fn_bit >> 5];
144 }
145
146 struct fib6_info *fib6_info_alloc(gfp_t gfp_flags, bool with_fib6_nh)
147 {
148         struct fib6_info *f6i;
149         size_t sz = sizeof(*f6i);
150
151         if (with_fib6_nh)
152                 sz += sizeof(struct fib6_nh);
153
154         f6i = kzalloc(sz, gfp_flags);
155         if (!f6i)
156                 return NULL;
157
158         /* fib6_siblings is a union with nh_list, so this initializes both */
159         INIT_LIST_HEAD(&f6i->fib6_siblings);
160         refcount_set(&f6i->fib6_ref, 1);
161
162         return f6i;
163 }
164
165 void fib6_info_destroy_rcu(struct rcu_head *head)
166 {
167         struct fib6_info *f6i = container_of(head, struct fib6_info, rcu);
168
169         WARN_ON(f6i->fib6_node);
170
171         if (f6i->nh)
172                 nexthop_put(f6i->nh);
173         else
174                 fib6_nh_release(f6i->fib6_nh);
175
176         ip_fib_metrics_put(f6i->fib6_metrics);
177         kfree(f6i);
178 }
179 EXPORT_SYMBOL_GPL(fib6_info_destroy_rcu);
180
181 static struct fib6_node *node_alloc(struct net *net)
182 {
183         struct fib6_node *fn;
184
185         fn = kmem_cache_zalloc(fib6_node_kmem, GFP_ATOMIC);
186         if (fn)
187                 net->ipv6.rt6_stats->fib_nodes++;
188
189         return fn;
190 }
191
192 static void node_free_immediate(struct net *net, struct fib6_node *fn)
193 {
194         kmem_cache_free(fib6_node_kmem, fn);
195         net->ipv6.rt6_stats->fib_nodes--;
196 }
197
198 static void node_free_rcu(struct rcu_head *head)
199 {
200         struct fib6_node *fn = container_of(head, struct fib6_node, rcu);
201
202         kmem_cache_free(fib6_node_kmem, fn);
203 }
204
205 static void node_free(struct net *net, struct fib6_node *fn)
206 {
207         call_rcu(&fn->rcu, node_free_rcu);
208         net->ipv6.rt6_stats->fib_nodes--;
209 }
210
211 static void fib6_free_table(struct fib6_table *table)
212 {
213         inetpeer_invalidate_tree(&table->tb6_peers);
214         kfree(table);
215 }
216
217 static void fib6_link_table(struct net *net, struct fib6_table *tb)
218 {
219         unsigned int h;
220
221         /*
222          * Initialize table lock at a single place to give lockdep a key,
223          * tables aren't visible prior to being linked to the list.
224          */
225         spin_lock_init(&tb->tb6_lock);
226         h = tb->tb6_id & (FIB6_TABLE_HASHSZ - 1);
227
228         /*
229          * No protection necessary, this is the only list mutatation
230          * operation, tables never disappear once they exist.
231          */
232         hlist_add_head_rcu(&tb->tb6_hlist, &net->ipv6.fib_table_hash[h]);
233 }
234
235 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
236
237 static struct fib6_table *fib6_alloc_table(struct net *net, u32 id)
238 {
239         struct fib6_table *table;
240
241         table = kzalloc(sizeof(*table), GFP_ATOMIC);
242         if (table) {
243                 table->tb6_id = id;
244                 rcu_assign_pointer(table->tb6_root.leaf,
245                                    net->ipv6.fib6_null_entry);
246                 table->tb6_root.fn_flags = RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
247                 inet_peer_base_init(&table->tb6_peers);
248         }
249
250         return table;
251 }
252
253 struct fib6_table *fib6_new_table(struct net *net, u32 id)
254 {
255         struct fib6_table *tb;
256
257         if (id == 0)
258                 id = RT6_TABLE_MAIN;
259         tb = fib6_get_table(net, id);
260         if (tb)
261                 return tb;
262
263         tb = fib6_alloc_table(net, id);
264         if (tb)
265                 fib6_link_table(net, tb);
266
267         return tb;
268 }
269 EXPORT_SYMBOL_GPL(fib6_new_table);
270
271 struct fib6_table *fib6_get_table(struct net *net, u32 id)
272 {
273         struct fib6_table *tb;
274         struct hlist_head *head;
275         unsigned int h;
276
277         if (id == 0)
278                 id = RT6_TABLE_MAIN;
279         h = id & (FIB6_TABLE_HASHSZ - 1);
280         rcu_read_lock();
281         head = &net->ipv6.fib_table_hash[h];
282         hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
283                 if (tb->tb6_id == id) {
284                         rcu_read_unlock();
285                         return tb;
286                 }
287         }
288         rcu_read_unlock();
289
290         return NULL;
291 }
292 EXPORT_SYMBOL_GPL(fib6_get_table);
293
294 static void __net_init fib6_tables_init(struct net *net)
295 {
296         fib6_link_table(net, net->ipv6.fib6_main_tbl);
297         fib6_link_table(net, net->ipv6.fib6_local_tbl);
298 }
299 #else
300
301 struct fib6_table *fib6_new_table(struct net *net, u32 id)
302 {
303         return fib6_get_table(net, id);
304 }
305
306 struct fib6_table *fib6_get_table(struct net *net, u32 id)
307 {
308           return net->ipv6.fib6_main_tbl;
309 }
310
311 struct dst_entry *fib6_rule_lookup(struct net *net, struct flowi6 *fl6,
312                                    const struct sk_buff *skb,
313                                    int flags, pol_lookup_t lookup)
314 {
315         struct rt6_info *rt;
316
317         rt = lookup(net, net->ipv6.fib6_main_tbl, fl6, skb, flags);
318         if (rt->dst.error == -EAGAIN) {
319                 ip6_rt_put_flags(rt, flags);
320                 rt = net->ipv6.ip6_null_entry;
321                 if (!(flags & RT6_LOOKUP_F_DST_NOREF))
322                         dst_hold(&rt->dst);
323         }
324
325         return &rt->dst;
326 }
327
328 /* called with rcu lock held; no reference taken on fib6_info */
329 int fib6_lookup(struct net *net, int oif, struct flowi6 *fl6,
330                 struct fib6_result *res, int flags)
331 {
332         return fib6_table_lookup(net, net->ipv6.fib6_main_tbl, oif, fl6,
333                                  res, flags);
334 }
335
336 static void __net_init fib6_tables_init(struct net *net)
337 {
338         fib6_link_table(net, net->ipv6.fib6_main_tbl);
339 }
340
341 #endif
342
343 unsigned int fib6_tables_seq_read(struct net *net)
344 {
345         unsigned int h, fib_seq = 0;
346
347         rcu_read_lock();
348         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
349                 struct hlist_head *head = &net->ipv6.fib_table_hash[h];
350                 struct fib6_table *tb;
351
352                 hlist_for_each_entry_rcu(tb, head, tb6_hlist)
353                         fib_seq += tb->fib_seq;
354         }
355         rcu_read_unlock();
356
357         return fib_seq;
358 }
359
360 static int call_fib6_entry_notifier(struct notifier_block *nb, struct net *net,
361                                     enum fib_event_type event_type,
362                                     struct fib6_info *rt)
363 {
364         struct fib6_entry_notifier_info info = {
365                 .rt = rt,
366         };
367
368         return call_fib6_notifier(nb, net, event_type, &info.info);
369 }
370
371 int call_fib6_entry_notifiers(struct net *net,
372                               enum fib_event_type event_type,
373                               struct fib6_info *rt,
374                               struct netlink_ext_ack *extack)
375 {
376         struct fib6_entry_notifier_info info = {
377                 .info.extack = extack,
378                 .rt = rt,
379         };
380
381         rt->fib6_table->fib_seq++;
382         return call_fib6_notifiers(net, event_type, &info.info);
383 }
384
385 int call_fib6_multipath_entry_notifiers(struct net *net,
386                                         enum fib_event_type event_type,
387                                         struct fib6_info *rt,
388                                         unsigned int nsiblings,
389                                         struct netlink_ext_ack *extack)
390 {
391         struct fib6_entry_notifier_info info = {
392                 .info.extack = extack,
393                 .rt = rt,
394                 .nsiblings = nsiblings,
395         };
396
397         rt->fib6_table->fib_seq++;
398         return call_fib6_notifiers(net, event_type, &info.info);
399 }
400
401 struct fib6_dump_arg {
402         struct net *net;
403         struct notifier_block *nb;
404 };
405
406 static void fib6_rt_dump(struct fib6_info *rt, struct fib6_dump_arg *arg)
407 {
408         if (rt == arg->net->ipv6.fib6_null_entry)
409                 return;
410         call_fib6_entry_notifier(arg->nb, arg->net, FIB_EVENT_ENTRY_ADD, rt);
411 }
412
413 static int fib6_node_dump(struct fib6_walker *w)
414 {
415         struct fib6_info *rt;
416
417         for_each_fib6_walker_rt(w)
418                 fib6_rt_dump(rt, w->args);
419         w->leaf = NULL;
420         return 0;
421 }
422
423 static void fib6_table_dump(struct net *net, struct fib6_table *tb,
424                             struct fib6_walker *w)
425 {
426         w->root = &tb->tb6_root;
427         spin_lock_bh(&tb->tb6_lock);
428         fib6_walk(net, w);
429         spin_unlock_bh(&tb->tb6_lock);
430 }
431
432 /* Called with rcu_read_lock() */
433 int fib6_tables_dump(struct net *net, struct notifier_block *nb)
434 {
435         struct fib6_dump_arg arg;
436         struct fib6_walker *w;
437         unsigned int h;
438
439         w = kzalloc(sizeof(*w), GFP_ATOMIC);
440         if (!w)
441                 return -ENOMEM;
442
443         w->func = fib6_node_dump;
444         arg.net = net;
445         arg.nb = nb;
446         w->args = &arg;
447
448         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
449                 struct hlist_head *head = &net->ipv6.fib_table_hash[h];
450                 struct fib6_table *tb;
451
452                 hlist_for_each_entry_rcu(tb, head, tb6_hlist)
453                         fib6_table_dump(net, tb, w);
454         }
455
456         kfree(w);
457
458         return 0;
459 }
460
461 static int fib6_dump_node(struct fib6_walker *w)
462 {
463         int res;
464         struct fib6_info *rt;
465
466         for_each_fib6_walker_rt(w) {
467                 res = rt6_dump_route(rt, w->args, w->skip_in_node);
468                 if (res >= 0) {
469                         /* Frame is full, suspend walking */
470                         w->leaf = rt;
471
472                         /* We'll restart from this node, so if some routes were
473                          * already dumped, skip them next time.
474                          */
475                         w->skip_in_node += res;
476
477                         return 1;
478                 }
479                 w->skip_in_node = 0;
480
481                 /* Multipath routes are dumped in one route with the
482                  * RTA_MULTIPATH attribute. Jump 'rt' to point to the
483                  * last sibling of this route (no need to dump the
484                  * sibling routes again)
485                  */
486                 if (rt->fib6_nsiblings)
487                         rt = list_last_entry(&rt->fib6_siblings,
488                                              struct fib6_info,
489                                              fib6_siblings);
490         }
491         w->leaf = NULL;
492         return 0;
493 }
494
495 static void fib6_dump_end(struct netlink_callback *cb)
496 {
497         struct net *net = sock_net(cb->skb->sk);
498         struct fib6_walker *w = (void *)cb->args[2];
499
500         if (w) {
501                 if (cb->args[4]) {
502                         cb->args[4] = 0;
503                         fib6_walker_unlink(net, w);
504                 }
505                 cb->args[2] = 0;
506                 kfree(w);
507         }
508         cb->done = (void *)cb->args[3];
509         cb->args[1] = 3;
510 }
511
512 static int fib6_dump_done(struct netlink_callback *cb)
513 {
514         fib6_dump_end(cb);
515         return cb->done ? cb->done(cb) : 0;
516 }
517
518 static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
519                            struct netlink_callback *cb)
520 {
521         struct net *net = sock_net(skb->sk);
522         struct fib6_walker *w;
523         int res;
524
525         w = (void *)cb->args[2];
526         w->root = &table->tb6_root;
527
528         if (cb->args[4] == 0) {
529                 w->count = 0;
530                 w->skip = 0;
531                 w->skip_in_node = 0;
532
533                 spin_lock_bh(&table->tb6_lock);
534                 res = fib6_walk(net, w);
535                 spin_unlock_bh(&table->tb6_lock);
536                 if (res > 0) {
537                         cb->args[4] = 1;
538                         cb->args[5] = READ_ONCE(w->root->fn_sernum);
539                 }
540         } else {
541                 int sernum = READ_ONCE(w->root->fn_sernum);
542                 if (cb->args[5] != sernum) {
543                         /* Begin at the root if the tree changed */
544                         cb->args[5] = sernum;
545                         w->state = FWS_INIT;
546                         w->node = w->root;
547                         w->skip = w->count;
548                         w->skip_in_node = 0;
549                 } else
550                         w->skip = 0;
551
552                 spin_lock_bh(&table->tb6_lock);
553                 res = fib6_walk_continue(w);
554                 spin_unlock_bh(&table->tb6_lock);
555                 if (res <= 0) {
556                         fib6_walker_unlink(net, w);
557                         cb->args[4] = 0;
558                 }
559         }
560
561         return res;
562 }
563
564 static int inet6_dump_fib(struct sk_buff *skb, struct netlink_callback *cb)
565 {
566         struct rt6_rtnl_dump_arg arg = { .filter.dump_exceptions = true,
567                                          .filter.dump_routes = true };
568         const struct nlmsghdr *nlh = cb->nlh;
569         struct net *net = sock_net(skb->sk);
570         unsigned int h, s_h;
571         unsigned int e = 0, s_e;
572         struct fib6_walker *w;
573         struct fib6_table *tb;
574         struct hlist_head *head;
575         int res = 0;
576
577         if (cb->strict_check) {
578                 int err;
579
580                 err = ip_valid_fib_dump_req(net, nlh, &arg.filter, cb);
581                 if (err < 0)
582                         return err;
583         } else if (nlmsg_len(nlh) >= sizeof(struct rtmsg)) {
584                 struct rtmsg *rtm = nlmsg_data(nlh);
585
586                 if (rtm->rtm_flags & RTM_F_PREFIX)
587                         arg.filter.flags = RTM_F_PREFIX;
588         }
589
590         w = (void *)cb->args[2];
591         if (!w) {
592                 /* New dump:
593                  *
594                  * 1. hook callback destructor.
595                  */
596                 cb->args[3] = (long)cb->done;
597                 cb->done = fib6_dump_done;
598
599                 /*
600                  * 2. allocate and initialize walker.
601                  */
602                 w = kzalloc(sizeof(*w), GFP_ATOMIC);
603                 if (!w)
604                         return -ENOMEM;
605                 w->func = fib6_dump_node;
606                 cb->args[2] = (long)w;
607         }
608
609         arg.skb = skb;
610         arg.cb = cb;
611         arg.net = net;
612         w->args = &arg;
613
614         if (arg.filter.table_id) {
615                 tb = fib6_get_table(net, arg.filter.table_id);
616                 if (!tb) {
617                         if (rtnl_msg_family(cb->nlh) != PF_INET6)
618                                 goto out;
619
620                         NL_SET_ERR_MSG_MOD(cb->extack, "FIB table does not exist");
621                         return -ENOENT;
622                 }
623
624                 if (!cb->args[0]) {
625                         res = fib6_dump_table(tb, skb, cb);
626                         if (!res)
627                                 cb->args[0] = 1;
628                 }
629                 goto out;
630         }
631
632         s_h = cb->args[0];
633         s_e = cb->args[1];
634
635         rcu_read_lock();
636         for (h = s_h; h < FIB6_TABLE_HASHSZ; h++, s_e = 0) {
637                 e = 0;
638                 head = &net->ipv6.fib_table_hash[h];
639                 hlist_for_each_entry_rcu(tb, head, tb6_hlist) {
640                         if (e < s_e)
641                                 goto next;
642                         res = fib6_dump_table(tb, skb, cb);
643                         if (res != 0)
644                                 goto out_unlock;
645 next:
646                         e++;
647                 }
648         }
649 out_unlock:
650         rcu_read_unlock();
651         cb->args[1] = e;
652         cb->args[0] = h;
653 out:
654         res = res < 0 ? res : skb->len;
655         if (res <= 0)
656                 fib6_dump_end(cb);
657         return res;
658 }
659
660 void fib6_metric_set(struct fib6_info *f6i, int metric, u32 val)
661 {
662         if (!f6i)
663                 return;
664
665         if (f6i->fib6_metrics == &dst_default_metrics) {
666                 struct dst_metrics *p = kzalloc(sizeof(*p), GFP_ATOMIC);
667
668                 if (!p)
669                         return;
670
671                 refcount_set(&p->refcnt, 1);
672                 f6i->fib6_metrics = p;
673         }
674
675         f6i->fib6_metrics->metrics[metric - 1] = val;
676 }
677
678 /*
679  *      Routing Table
680  *
681  *      return the appropriate node for a routing tree "add" operation
682  *      by either creating and inserting or by returning an existing
683  *      node.
684  */
685
686 static struct fib6_node *fib6_add_1(struct net *net,
687                                     struct fib6_table *table,
688                                     struct fib6_node *root,
689                                     struct in6_addr *addr, int plen,
690                                     int offset, int allow_create,
691                                     int replace_required,
692                                     struct netlink_ext_ack *extack)
693 {
694         struct fib6_node *fn, *in, *ln;
695         struct fib6_node *pn = NULL;
696         struct rt6key *key;
697         int     bit;
698         __be32  dir = 0;
699
700         RT6_TRACE("fib6_add_1\n");
701
702         /* insert node in tree */
703
704         fn = root;
705
706         do {
707                 struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
708                                             lockdep_is_held(&table->tb6_lock));
709                 key = (struct rt6key *)((u8 *)leaf + offset);
710
711                 /*
712                  *      Prefix match
713                  */
714                 if (plen < fn->fn_bit ||
715                     !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit)) {
716                         if (!allow_create) {
717                                 if (replace_required) {
718                                         NL_SET_ERR_MSG(extack,
719                                                        "Can not replace route - no match found");
720                                         pr_warn("Can't replace route, no match found\n");
721                                         return ERR_PTR(-ENOENT);
722                                 }
723                                 pr_warn("NLM_F_CREATE should be set when creating new route\n");
724                         }
725                         goto insert_above;
726                 }
727
728                 /*
729                  *      Exact match ?
730                  */
731
732                 if (plen == fn->fn_bit) {
733                         /* clean up an intermediate node */
734                         if (!(fn->fn_flags & RTN_RTINFO)) {
735                                 RCU_INIT_POINTER(fn->leaf, NULL);
736                                 fib6_info_release(leaf);
737                         /* remove null_entry in the root node */
738                         } else if (fn->fn_flags & RTN_TL_ROOT &&
739                                    rcu_access_pointer(fn->leaf) ==
740                                    net->ipv6.fib6_null_entry) {
741                                 RCU_INIT_POINTER(fn->leaf, NULL);
742                         }
743
744                         return fn;
745                 }
746
747                 /*
748                  *      We have more bits to go
749                  */
750
751                 /* Try to walk down on tree. */
752                 dir = addr_bit_set(addr, fn->fn_bit);
753                 pn = fn;
754                 fn = dir ?
755                      rcu_dereference_protected(fn->right,
756                                         lockdep_is_held(&table->tb6_lock)) :
757                      rcu_dereference_protected(fn->left,
758                                         lockdep_is_held(&table->tb6_lock));
759         } while (fn);
760
761         if (!allow_create) {
762                 /* We should not create new node because
763                  * NLM_F_REPLACE was specified without NLM_F_CREATE
764                  * I assume it is safe to require NLM_F_CREATE when
765                  * REPLACE flag is used! Later we may want to remove the
766                  * check for replace_required, because according
767                  * to netlink specification, NLM_F_CREATE
768                  * MUST be specified if new route is created.
769                  * That would keep IPv6 consistent with IPv4
770                  */
771                 if (replace_required) {
772                         NL_SET_ERR_MSG(extack,
773                                        "Can not replace route - no match found");
774                         pr_warn("Can't replace route, no match found\n");
775                         return ERR_PTR(-ENOENT);
776                 }
777                 pr_warn("NLM_F_CREATE should be set when creating new route\n");
778         }
779         /*
780          *      We walked to the bottom of tree.
781          *      Create new leaf node without children.
782          */
783
784         ln = node_alloc(net);
785
786         if (!ln)
787                 return ERR_PTR(-ENOMEM);
788         ln->fn_bit = plen;
789         RCU_INIT_POINTER(ln->parent, pn);
790
791         if (dir)
792                 rcu_assign_pointer(pn->right, ln);
793         else
794                 rcu_assign_pointer(pn->left, ln);
795
796         return ln;
797
798
799 insert_above:
800         /*
801          * split since we don't have a common prefix anymore or
802          * we have a less significant route.
803          * we've to insert an intermediate node on the list
804          * this new node will point to the one we need to create
805          * and the current
806          */
807
808         pn = rcu_dereference_protected(fn->parent,
809                                        lockdep_is_held(&table->tb6_lock));
810
811         /* find 1st bit in difference between the 2 addrs.
812
813            See comment in __ipv6_addr_diff: bit may be an invalid value,
814            but if it is >= plen, the value is ignored in any case.
815          */
816
817         bit = __ipv6_addr_diff(addr, &key->addr, sizeof(*addr));
818
819         /*
820          *              (intermediate)[in]
821          *                /        \
822          *      (new leaf node)[ln] (old node)[fn]
823          */
824         if (plen > bit) {
825                 in = node_alloc(net);
826                 ln = node_alloc(net);
827
828                 if (!in || !ln) {
829                         if (in)
830                                 node_free_immediate(net, in);
831                         if (ln)
832                                 node_free_immediate(net, ln);
833                         return ERR_PTR(-ENOMEM);
834                 }
835
836                 /*
837                  * new intermediate node.
838                  * RTN_RTINFO will
839                  * be off since that an address that chooses one of
840                  * the branches would not match less specific routes
841                  * in the other branch
842                  */
843
844                 in->fn_bit = bit;
845
846                 RCU_INIT_POINTER(in->parent, pn);
847                 in->leaf = fn->leaf;
848                 fib6_info_hold(rcu_dereference_protected(in->leaf,
849                                 lockdep_is_held(&table->tb6_lock)));
850
851                 /* update parent pointer */
852                 if (dir)
853                         rcu_assign_pointer(pn->right, in);
854                 else
855                         rcu_assign_pointer(pn->left, in);
856
857                 ln->fn_bit = plen;
858
859                 RCU_INIT_POINTER(ln->parent, in);
860                 rcu_assign_pointer(fn->parent, in);
861
862                 if (addr_bit_set(addr, bit)) {
863                         rcu_assign_pointer(in->right, ln);
864                         rcu_assign_pointer(in->left, fn);
865                 } else {
866                         rcu_assign_pointer(in->left, ln);
867                         rcu_assign_pointer(in->right, fn);
868                 }
869         } else { /* plen <= bit */
870
871                 /*
872                  *              (new leaf node)[ln]
873                  *                /        \
874                  *           (old node)[fn] NULL
875                  */
876
877                 ln = node_alloc(net);
878
879                 if (!ln)
880                         return ERR_PTR(-ENOMEM);
881
882                 ln->fn_bit = plen;
883
884                 RCU_INIT_POINTER(ln->parent, pn);
885
886                 if (addr_bit_set(&key->addr, plen))
887                         RCU_INIT_POINTER(ln->right, fn);
888                 else
889                         RCU_INIT_POINTER(ln->left, fn);
890
891                 rcu_assign_pointer(fn->parent, ln);
892
893                 if (dir)
894                         rcu_assign_pointer(pn->right, ln);
895                 else
896                         rcu_assign_pointer(pn->left, ln);
897         }
898         return ln;
899 }
900
901 static void __fib6_drop_pcpu_from(struct fib6_nh *fib6_nh,
902                                   const struct fib6_info *match,
903                                   const struct fib6_table *table)
904 {
905         int cpu;
906
907         if (!fib6_nh->rt6i_pcpu)
908                 return;
909
910         /* release the reference to this fib entry from
911          * all of its cached pcpu routes
912          */
913         for_each_possible_cpu(cpu) {
914                 struct rt6_info **ppcpu_rt;
915                 struct rt6_info *pcpu_rt;
916
917                 ppcpu_rt = per_cpu_ptr(fib6_nh->rt6i_pcpu, cpu);
918                 pcpu_rt = *ppcpu_rt;
919
920                 /* only dropping the 'from' reference if the cached route
921                  * is using 'match'. The cached pcpu_rt->from only changes
922                  * from a fib6_info to NULL (ip6_dst_destroy); it can never
923                  * change from one fib6_info reference to another
924                  */
925                 if (pcpu_rt && rcu_access_pointer(pcpu_rt->from) == match) {
926                         struct fib6_info *from;
927
928                         from = xchg((__force struct fib6_info **)&pcpu_rt->from, NULL);
929                         fib6_info_release(from);
930                 }
931         }
932 }
933
934 struct fib6_nh_pcpu_arg {
935         struct fib6_info        *from;
936         const struct fib6_table *table;
937 };
938
939 static int fib6_nh_drop_pcpu_from(struct fib6_nh *nh, void *_arg)
940 {
941         struct fib6_nh_pcpu_arg *arg = _arg;
942
943         __fib6_drop_pcpu_from(nh, arg->from, arg->table);
944         return 0;
945 }
946
947 static void fib6_drop_pcpu_from(struct fib6_info *f6i,
948                                 const struct fib6_table *table)
949 {
950         /* Make sure rt6_make_pcpu_route() wont add other percpu routes
951          * while we are cleaning them here.
952          */
953         f6i->fib6_destroying = 1;
954         mb(); /* paired with the cmpxchg() in rt6_make_pcpu_route() */
955
956         if (f6i->nh) {
957                 struct fib6_nh_pcpu_arg arg = {
958                         .from = f6i,
959                         .table = table
960                 };
961
962                 nexthop_for_each_fib6_nh(f6i->nh, fib6_nh_drop_pcpu_from,
963                                          &arg);
964         } else {
965                 struct fib6_nh *fib6_nh;
966
967                 fib6_nh = f6i->fib6_nh;
968                 __fib6_drop_pcpu_from(fib6_nh, f6i, table);
969         }
970 }
971
972 static void fib6_purge_rt(struct fib6_info *rt, struct fib6_node *fn,
973                           struct net *net)
974 {
975         struct fib6_table *table = rt->fib6_table;
976
977         /* Flush all cached dst in exception table */
978         rt6_flush_exceptions(rt);
979         fib6_drop_pcpu_from(rt, table);
980
981         if (rt->nh && !list_empty(&rt->nh_list))
982                 list_del_init(&rt->nh_list);
983
984         if (refcount_read(&rt->fib6_ref) != 1) {
985                 /* This route is used as dummy address holder in some split
986                  * nodes. It is not leaked, but it still holds other resources,
987                  * which must be released in time. So, scan ascendant nodes
988                  * and replace dummy references to this route with references
989                  * to still alive ones.
990                  */
991                 while (fn) {
992                         struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
993                                             lockdep_is_held(&table->tb6_lock));
994                         struct fib6_info *new_leaf;
995                         if (!(fn->fn_flags & RTN_RTINFO) && leaf == rt) {
996                                 new_leaf = fib6_find_prefix(net, table, fn);
997                                 fib6_info_hold(new_leaf);
998
999                                 rcu_assign_pointer(fn->leaf, new_leaf);
1000                                 fib6_info_release(rt);
1001                         }
1002                         fn = rcu_dereference_protected(fn->parent,
1003                                     lockdep_is_held(&table->tb6_lock));
1004                 }
1005         }
1006 }
1007
1008 /*
1009  *      Insert routing information in a node.
1010  */
1011
1012 static int fib6_add_rt2node(struct fib6_node *fn, struct fib6_info *rt,
1013                             struct nl_info *info,
1014                             struct netlink_ext_ack *extack)
1015 {
1016         struct fib6_info *leaf = rcu_dereference_protected(fn->leaf,
1017                                     lockdep_is_held(&rt->fib6_table->tb6_lock));
1018         struct fib6_info *iter = NULL;
1019         struct fib6_info __rcu **ins;
1020         struct fib6_info __rcu **fallback_ins = NULL;
1021         int replace = (info->nlh &&
1022                        (info->nlh->nlmsg_flags & NLM_F_REPLACE));
1023         int add = (!info->nlh ||
1024                    (info->nlh->nlmsg_flags & NLM_F_CREATE));
1025         int found = 0;
1026         bool rt_can_ecmp = rt6_qualify_for_ecmp(rt);
1027         u16 nlflags = NLM_F_EXCL;
1028         int err;
1029
1030         if (info->nlh && (info->nlh->nlmsg_flags & NLM_F_APPEND))
1031                 nlflags |= NLM_F_APPEND;
1032
1033         ins = &fn->leaf;
1034
1035         for (iter = leaf; iter;
1036              iter = rcu_dereference_protected(iter->fib6_next,
1037                                 lockdep_is_held(&rt->fib6_table->tb6_lock))) {
1038                 /*
1039                  *      Search for duplicates
1040                  */
1041
1042                 if (iter->fib6_metric == rt->fib6_metric) {
1043                         /*
1044                          *      Same priority level
1045                          */
1046                         if (info->nlh &&
1047                             (info->nlh->nlmsg_flags & NLM_F_EXCL))
1048                                 return -EEXIST;
1049
1050                         nlflags &= ~NLM_F_EXCL;
1051                         if (replace) {
1052                                 if (rt_can_ecmp == rt6_qualify_for_ecmp(iter)) {
1053                                         found++;
1054                                         break;
1055                                 }
1056                                 fallback_ins = fallback_ins ?: ins;
1057                                 goto next_iter;
1058                         }
1059
1060                         if (rt6_duplicate_nexthop(iter, rt)) {
1061                                 if (rt->fib6_nsiblings)
1062                                         rt->fib6_nsiblings = 0;
1063                                 if (!(iter->fib6_flags & RTF_EXPIRES))
1064                                         return -EEXIST;
1065                                 if (!(rt->fib6_flags & RTF_EXPIRES))
1066                                         fib6_clean_expires(iter);
1067                                 else
1068                                         fib6_set_expires(iter, rt->expires);
1069
1070                                 if (rt->fib6_pmtu)
1071                                         fib6_metric_set(iter, RTAX_MTU,
1072                                                         rt->fib6_pmtu);
1073                                 return -EEXIST;
1074                         }
1075                         /* If we have the same destination and the same metric,
1076                          * but not the same gateway, then the route we try to
1077                          * add is sibling to this route, increment our counter
1078                          * of siblings, and later we will add our route to the
1079                          * list.
1080                          * Only static routes (which don't have flag
1081                          * RTF_EXPIRES) are used for ECMPv6.
1082                          *
1083                          * To avoid long list, we only had siblings if the
1084                          * route have a gateway.
1085                          */
1086                         if (rt_can_ecmp &&
1087                             rt6_qualify_for_ecmp(iter))
1088                                 rt->fib6_nsiblings++;
1089                 }
1090
1091                 if (iter->fib6_metric > rt->fib6_metric)
1092                         break;
1093
1094 next_iter:
1095                 ins = &iter->fib6_next;
1096         }
1097
1098         if (fallback_ins && !found) {
1099                 /* No matching route with same ecmp-able-ness found, replace
1100                  * first matching route
1101                  */
1102                 ins = fallback_ins;
1103                 iter = rcu_dereference_protected(*ins,
1104                                     lockdep_is_held(&rt->fib6_table->tb6_lock));
1105                 found++;
1106         }
1107
1108         /* Reset round-robin state, if necessary */
1109         if (ins == &fn->leaf)
1110                 fn->rr_ptr = NULL;
1111
1112         /* Link this route to others same route. */
1113         if (rt->fib6_nsiblings) {
1114                 unsigned int fib6_nsiblings;
1115                 struct fib6_info *sibling, *temp_sibling;
1116
1117                 /* Find the first route that have the same metric */
1118                 sibling = leaf;
1119                 while (sibling) {
1120                         if (sibling->fib6_metric == rt->fib6_metric &&
1121                             rt6_qualify_for_ecmp(sibling)) {
1122                                 list_add_tail(&rt->fib6_siblings,
1123                                               &sibling->fib6_siblings);
1124                                 break;
1125                         }
1126                         sibling = rcu_dereference_protected(sibling->fib6_next,
1127                                     lockdep_is_held(&rt->fib6_table->tb6_lock));
1128                 }
1129                 /* For each sibling in the list, increment the counter of
1130                  * siblings. BUG() if counters does not match, list of siblings
1131                  * is broken!
1132                  */
1133                 fib6_nsiblings = 0;
1134                 list_for_each_entry_safe(sibling, temp_sibling,
1135                                          &rt->fib6_siblings, fib6_siblings) {
1136                         sibling->fib6_nsiblings++;
1137                         BUG_ON(sibling->fib6_nsiblings != rt->fib6_nsiblings);
1138                         fib6_nsiblings++;
1139                 }
1140                 BUG_ON(fib6_nsiblings != rt->fib6_nsiblings);
1141                 rt6_multipath_rebalance(temp_sibling);
1142         }
1143
1144         /*
1145          *      insert node
1146          */
1147         if (!replace) {
1148                 if (!add)
1149                         pr_warn("NLM_F_CREATE should be set when creating new route\n");
1150
1151 add:
1152                 nlflags |= NLM_F_CREATE;
1153
1154                 if (!info->skip_notify_kernel) {
1155                         err = call_fib6_entry_notifiers(info->nl_net,
1156                                                         FIB_EVENT_ENTRY_ADD,
1157                                                         rt, extack);
1158                         if (err) {
1159                                 struct fib6_info *sibling, *next_sibling;
1160
1161                                 /* If the route has siblings, then it first
1162                                  * needs to be unlinked from them.
1163                                  */
1164                                 if (!rt->fib6_nsiblings)
1165                                         return err;
1166
1167                                 list_for_each_entry_safe(sibling, next_sibling,
1168                                                          &rt->fib6_siblings,
1169                                                          fib6_siblings)
1170                                         sibling->fib6_nsiblings--;
1171                                 rt->fib6_nsiblings = 0;
1172                                 list_del_init(&rt->fib6_siblings);
1173                                 rt6_multipath_rebalance(next_sibling);
1174                                 return err;
1175                         }
1176                 }
1177
1178                 rcu_assign_pointer(rt->fib6_next, iter);
1179                 fib6_info_hold(rt);
1180                 rcu_assign_pointer(rt->fib6_node, fn);
1181                 rcu_assign_pointer(*ins, rt);
1182                 if (!info->skip_notify)
1183                         inet6_rt_notify(RTM_NEWROUTE, rt, info, nlflags);
1184                 info->nl_net->ipv6.rt6_stats->fib_rt_entries++;
1185
1186                 if (!(fn->fn_flags & RTN_RTINFO)) {
1187                         info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
1188                         fn->fn_flags |= RTN_RTINFO;
1189                 }
1190
1191         } else {
1192                 int nsiblings;
1193
1194                 if (!found) {
1195                         if (add)
1196                                 goto add;
1197                         pr_warn("NLM_F_REPLACE set, but no existing node found!\n");
1198                         return -ENOENT;
1199                 }
1200
1201                 if (!info->skip_notify_kernel) {
1202                         err = call_fib6_entry_notifiers(info->nl_net,
1203                                                         FIB_EVENT_ENTRY_REPLACE,
1204                                                         rt, extack);
1205                         if (err)
1206                                 return err;
1207                 }
1208
1209                 fib6_info_hold(rt);
1210                 rcu_assign_pointer(rt->fib6_node, fn);
1211                 rt->fib6_next = iter->fib6_next;
1212                 rcu_assign_pointer(*ins, rt);
1213                 if (!info->skip_notify)
1214                         inet6_rt_notify(RTM_NEWROUTE, rt, info, NLM_F_REPLACE);
1215                 if (!(fn->fn_flags & RTN_RTINFO)) {
1216                         info->nl_net->ipv6.rt6_stats->fib_route_nodes++;
1217                         fn->fn_flags |= RTN_RTINFO;
1218                 }
1219                 nsiblings = iter->fib6_nsiblings;
1220                 iter->fib6_node = NULL;
1221                 fib6_purge_rt(iter, fn, info->nl_net);
1222                 if (rcu_access_pointer(fn->rr_ptr) == iter)
1223                         fn->rr_ptr = NULL;
1224                 fib6_info_release(iter);
1225
1226                 if (nsiblings) {
1227                         /* Replacing an ECMP route, remove all siblings */
1228                         ins = &rt->fib6_next;
1229                         iter = rcu_dereference_protected(*ins,
1230                                     lockdep_is_held(&rt->fib6_table->tb6_lock));
1231                         while (iter) {
1232                                 if (iter->fib6_metric > rt->fib6_metric)
1233                                         break;
1234                                 if (rt6_qualify_for_ecmp(iter)) {
1235                                         *ins = iter->fib6_next;
1236                                         iter->fib6_node = NULL;
1237                                         fib6_purge_rt(iter, fn, info->nl_net);
1238                                         if (rcu_access_pointer(fn->rr_ptr) == iter)
1239                                                 fn->rr_ptr = NULL;
1240                                         fib6_info_release(iter);
1241                                         nsiblings--;
1242                                         info->nl_net->ipv6.rt6_stats->fib_rt_entries--;
1243                                 } else {
1244                                         ins = &iter->fib6_next;
1245                                 }
1246                                 iter = rcu_dereference_protected(*ins,
1247                                         lockdep_is_held(&rt->fib6_table->tb6_lock));
1248                         }
1249                         WARN_ON(nsiblings != 0);
1250                 }
1251         }
1252
1253         return 0;
1254 }
1255
1256 static void fib6_start_gc(struct net *net, struct fib6_info *rt)
1257 {
1258         if (!timer_pending(&net->ipv6.ip6_fib_timer) &&
1259             (rt->fib6_flags & RTF_EXPIRES))
1260                 mod_timer(&net->ipv6.ip6_fib_timer,
1261                           jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
1262 }
1263
1264 void fib6_force_start_gc(struct net *net)
1265 {
1266         if (!timer_pending(&net->ipv6.ip6_fib_timer))
1267                 mod_timer(&net->ipv6.ip6_fib_timer,
1268                           jiffies + net->ipv6.sysctl.ip6_rt_gc_interval);
1269 }
1270
1271 static void __fib6_update_sernum_upto_root(struct fib6_info *rt,
1272                                            int sernum)
1273 {
1274         struct fib6_node *fn = rcu_dereference_protected(rt->fib6_node,
1275                                 lockdep_is_held(&rt->fib6_table->tb6_lock));
1276
1277         /* paired with smp_rmb() in rt6_get_cookie_safe() */
1278         smp_wmb();
1279         while (fn) {
1280                 WRITE_ONCE(fn->fn_sernum, sernum);
1281                 fn = rcu_dereference_protected(fn->parent,
1282                                 lockdep_is_held(&rt->fib6_table->tb6_lock));
1283         }
1284 }
1285
1286 void fib6_update_sernum_upto_root(struct net *net, struct fib6_info *rt)
1287 {
1288         __fib6_update_sernum_upto_root(rt, fib6_new_sernum(net));
1289 }
1290
1291 /* allow ipv4 to update sernum via ipv6_stub */
1292 void fib6_update_sernum_stub(struct net *net, struct fib6_info *f6i)
1293 {
1294         spin_lock_bh(&f6i->fib6_table->tb6_lock);
1295         fib6_update_sernum_upto_root(net, f6i);
1296         spin_unlock_bh(&f6i->fib6_table->tb6_lock);
1297 }
1298
1299 /*
1300  *      Add routing information to the routing tree.
1301  *      <destination addr>/<source addr>
1302  *      with source addr info in sub-trees
1303  *      Need to own table->tb6_lock
1304  */
1305
1306 int fib6_add(struct fib6_node *root, struct fib6_info *rt,
1307              struct nl_info *info, struct netlink_ext_ack *extack)
1308 {
1309         struct fib6_table *table = rt->fib6_table;
1310         struct fib6_node *fn, *pn = NULL;
1311         int err = -ENOMEM;
1312         int allow_create = 1;
1313         int replace_required = 0;
1314
1315         if (info->nlh) {
1316                 if (!(info->nlh->nlmsg_flags & NLM_F_CREATE))
1317                         allow_create = 0;
1318                 if (info->nlh->nlmsg_flags & NLM_F_REPLACE)
1319                         replace_required = 1;
1320         }
1321         if (!allow_create && !replace_required)
1322                 pr_warn("RTM_NEWROUTE with no NLM_F_CREATE or NLM_F_REPLACE\n");
1323
1324         fn = fib6_add_1(info->nl_net, table, root,
1325                         &rt->fib6_dst.addr, rt->fib6_dst.plen,
1326                         offsetof(struct fib6_info, fib6_dst), allow_create,
1327                         replace_required, extack);
1328         if (IS_ERR(fn)) {
1329                 err = PTR_ERR(fn);
1330                 fn = NULL;
1331                 goto out;
1332         }
1333
1334         pn = fn;
1335
1336 #ifdef CONFIG_IPV6_SUBTREES
1337         if (rt->fib6_src.plen) {
1338                 struct fib6_node *sn;
1339
1340                 if (!rcu_access_pointer(fn->subtree)) {
1341                         struct fib6_node *sfn;
1342
1343                         /*
1344                          * Create subtree.
1345                          *
1346                          *              fn[main tree]
1347                          *              |
1348                          *              sfn[subtree root]
1349                          *                 \
1350                          *                  sn[new leaf node]
1351                          */
1352
1353                         /* Create subtree root node */
1354                         sfn = node_alloc(info->nl_net);
1355                         if (!sfn)
1356                                 goto failure;
1357
1358                         fib6_info_hold(info->nl_net->ipv6.fib6_null_entry);
1359                         rcu_assign_pointer(sfn->leaf,
1360                                            info->nl_net->ipv6.fib6_null_entry);
1361                         sfn->fn_flags = RTN_ROOT;
1362
1363                         /* Now add the first leaf node to new subtree */
1364
1365                         sn = fib6_add_1(info->nl_net, table, sfn,
1366                                         &rt->fib6_src.addr, rt->fib6_src.plen,
1367                                         offsetof(struct fib6_info, fib6_src),
1368                                         allow_create, replace_required, extack);
1369
1370                         if (IS_ERR(sn)) {
1371                                 /* If it is failed, discard just allocated
1372                                    root, and then (in failure) stale node
1373                                    in main tree.
1374                                  */
1375                                 node_free_immediate(info->nl_net, sfn);
1376                                 err = PTR_ERR(sn);
1377                                 goto failure;
1378                         }
1379
1380                         /* Now link new subtree to main tree */
1381                         rcu_assign_pointer(sfn->parent, fn);
1382                         rcu_assign_pointer(fn->subtree, sfn);
1383                 } else {
1384                         sn = fib6_add_1(info->nl_net, table, FIB6_SUBTREE(fn),
1385                                         &rt->fib6_src.addr, rt->fib6_src.plen,
1386                                         offsetof(struct fib6_info, fib6_src),
1387                                         allow_create, replace_required, extack);
1388
1389                         if (IS_ERR(sn)) {
1390                                 err = PTR_ERR(sn);
1391                                 goto failure;
1392                         }
1393                 }
1394
1395                 if (!rcu_access_pointer(fn->leaf)) {
1396                         if (fn->fn_flags & RTN_TL_ROOT) {
1397                                 /* put back null_entry for root node */
1398                                 rcu_assign_pointer(fn->leaf,
1399                                             info->nl_net->ipv6.fib6_null_entry);
1400                         } else {
1401                                 fib6_info_hold(rt);
1402                                 rcu_assign_pointer(fn->leaf, rt);
1403                         }
1404                 }
1405                 fn = sn;
1406         }
1407 #endif
1408
1409         err = fib6_add_rt2node(fn, rt, info, extack);
1410         if (!err) {
1411                 if (rt->nh)
1412                         list_add(&rt->nh_list, &rt->nh->f6i_list);
1413                 __fib6_update_sernum_upto_root(rt, fib6_new_sernum(info->nl_net));
1414                 fib6_start_gc(info->nl_net, rt);
1415         }
1416
1417 out:
1418         if (err) {
1419 #ifdef CONFIG_IPV6_SUBTREES
1420                 /*
1421                  * If fib6_add_1 has cleared the old leaf pointer in the
1422                  * super-tree leaf node we have to find a new one for it.
1423                  */
1424                 if (pn != fn) {
1425                         struct fib6_info *pn_leaf =
1426                                 rcu_dereference_protected(pn->leaf,
1427                                     lockdep_is_held(&table->tb6_lock));
1428                         if (pn_leaf == rt) {
1429                                 pn_leaf = NULL;
1430                                 RCU_INIT_POINTER(pn->leaf, NULL);
1431                                 fib6_info_release(rt);
1432                         }
1433                         if (!pn_leaf && !(pn->fn_flags & RTN_RTINFO)) {
1434                                 pn_leaf = fib6_find_prefix(info->nl_net, table,
1435                                                            pn);
1436 #if RT6_DEBUG >= 2
1437                                 if (!pn_leaf) {
1438                                         WARN_ON(!pn_leaf);
1439                                         pn_leaf =
1440                                             info->nl_net->ipv6.fib6_null_entry;
1441                                 }
1442 #endif
1443                                 fib6_info_hold(pn_leaf);
1444                                 rcu_assign_pointer(pn->leaf, pn_leaf);
1445                         }
1446                 }
1447 #endif
1448                 goto failure;
1449         }
1450         return err;
1451
1452 failure:
1453         /* fn->leaf could be NULL and fib6_repair_tree() needs to be called if:
1454          * 1. fn is an intermediate node and we failed to add the new
1455          * route to it in both subtree creation failure and fib6_add_rt2node()
1456          * failure case.
1457          * 2. fn is the root node in the table and we fail to add the first
1458          * default route to it.
1459          */
1460         if (fn &&
1461             (!(fn->fn_flags & (RTN_RTINFO|RTN_ROOT)) ||
1462              (fn->fn_flags & RTN_TL_ROOT &&
1463               !rcu_access_pointer(fn->leaf))))
1464                 fib6_repair_tree(info->nl_net, table, fn);
1465         return err;
1466 }
1467
1468 /*
1469  *      Routing tree lookup
1470  *
1471  */
1472
1473 struct lookup_args {
1474         int                     offset;         /* key offset on fib6_info */
1475         const struct in6_addr   *addr;          /* search key                   */
1476 };
1477
1478 static struct fib6_node *fib6_node_lookup_1(struct fib6_node *root,
1479                                             struct lookup_args *args)
1480 {
1481         struct fib6_node *fn;
1482         __be32 dir;
1483
1484         if (unlikely(args->offset == 0))
1485                 return NULL;
1486
1487         /*
1488          *      Descend on a tree
1489          */
1490
1491         fn = root;
1492
1493         for (;;) {
1494                 struct fib6_node *next;
1495
1496                 dir = addr_bit_set(args->addr, fn->fn_bit);
1497
1498                 next = dir ? rcu_dereference(fn->right) :
1499                              rcu_dereference(fn->left);
1500
1501                 if (next) {
1502                         fn = next;
1503                         continue;
1504                 }
1505                 break;
1506         }
1507
1508         while (fn) {
1509                 struct fib6_node *subtree = FIB6_SUBTREE(fn);
1510
1511                 if (subtree || fn->fn_flags & RTN_RTINFO) {
1512                         struct fib6_info *leaf = rcu_dereference(fn->leaf);
1513                         struct rt6key *key;
1514
1515                         if (!leaf)
1516                                 goto backtrack;
1517
1518                         key = (struct rt6key *) ((u8 *)leaf + args->offset);
1519
1520                         if (ipv6_prefix_equal(&key->addr, args->addr, key->plen)) {
1521 #ifdef CONFIG_IPV6_SUBTREES
1522                                 if (subtree) {
1523                                         struct fib6_node *sfn;
1524                                         sfn = fib6_node_lookup_1(subtree,
1525                                                                  args + 1);
1526                                         if (!sfn)
1527                                                 goto backtrack;
1528                                         fn = sfn;
1529                                 }
1530 #endif
1531                                 if (fn->fn_flags & RTN_RTINFO)
1532                                         return fn;
1533                         }
1534                 }
1535 backtrack:
1536                 if (fn->fn_flags & RTN_ROOT)
1537                         break;
1538
1539                 fn = rcu_dereference(fn->parent);
1540         }
1541
1542         return NULL;
1543 }
1544
1545 /* called with rcu_read_lock() held
1546  */
1547 struct fib6_node *fib6_node_lookup(struct fib6_node *root,
1548                                    const struct in6_addr *daddr,
1549                                    const struct in6_addr *saddr)
1550 {
1551         struct fib6_node *fn;
1552         struct lookup_args args[] = {
1553                 {
1554                         .offset = offsetof(struct fib6_info, fib6_dst),
1555                         .addr = daddr,
1556                 },
1557 #ifdef CONFIG_IPV6_SUBTREES
1558                 {
1559                         .offset = offsetof(struct fib6_info, fib6_src),
1560                         .addr = saddr,
1561                 },
1562 #endif
1563                 {
1564                         .offset = 0,    /* sentinel */
1565                 }
1566         };
1567
1568         fn = fib6_node_lookup_1(root, daddr ? args : args + 1);
1569         if (!fn || fn->fn_flags & RTN_TL_ROOT)
1570                 fn = root;
1571
1572         return fn;
1573 }
1574
1575 /*
1576  *      Get node with specified destination prefix (and source prefix,
1577  *      if subtrees are used)
1578  *      exact_match == true means we try to find fn with exact match of
1579  *      the passed in prefix addr
1580  *      exact_match == false means we try to find fn with longest prefix
1581  *      match of the passed in prefix addr. This is useful for finding fn
1582  *      for cached route as it will be stored in the exception table under
1583  *      the node with longest prefix length.
1584  */
1585
1586
1587 static struct fib6_node *fib6_locate_1(struct fib6_node *root,
1588                                        const struct in6_addr *addr,
1589                                        int plen, int offset,
1590                                        bool exact_match)
1591 {
1592         struct fib6_node *fn, *prev = NULL;
1593
1594         for (fn = root; fn ; ) {
1595                 struct fib6_info *leaf = rcu_dereference(fn->leaf);
1596                 struct rt6key *key;
1597
1598                 /* This node is being deleted */
1599                 if (!leaf) {
1600                         if (plen <= fn->fn_bit)
1601                                 goto out;
1602                         else
1603                                 goto next;
1604                 }
1605
1606                 key = (struct rt6key *)((u8 *)leaf + offset);
1607
1608                 /*
1609                  *      Prefix match
1610                  */
1611                 if (plen < fn->fn_bit ||
1612                     !ipv6_prefix_equal(&key->addr, addr, fn->fn_bit))
1613                         goto out;
1614
1615                 if (plen == fn->fn_bit)
1616                         return fn;
1617
1618                 if (fn->fn_flags & RTN_RTINFO)
1619                         prev = fn;
1620
1621 next:
1622                 /*
1623                  *      We have more bits to go
1624                  */
1625                 if (addr_bit_set(addr, fn->fn_bit))
1626                         fn = rcu_dereference(fn->right);
1627                 else
1628                         fn = rcu_dereference(fn->left);
1629         }
1630 out:
1631         if (exact_match)
1632                 return NULL;
1633         else
1634                 return prev;
1635 }
1636
1637 struct fib6_node *fib6_locate(struct fib6_node *root,
1638                               const struct in6_addr *daddr, int dst_len,
1639                               const struct in6_addr *saddr, int src_len,
1640                               bool exact_match)
1641 {
1642         struct fib6_node *fn;
1643
1644         fn = fib6_locate_1(root, daddr, dst_len,
1645                            offsetof(struct fib6_info, fib6_dst),
1646                            exact_match);
1647
1648 #ifdef CONFIG_IPV6_SUBTREES
1649         if (src_len) {
1650                 WARN_ON(saddr == NULL);
1651                 if (fn) {
1652                         struct fib6_node *subtree = FIB6_SUBTREE(fn);
1653
1654                         if (subtree) {
1655                                 fn = fib6_locate_1(subtree, saddr, src_len,
1656                                            offsetof(struct fib6_info, fib6_src),
1657                                            exact_match);
1658                         }
1659                 }
1660         }
1661 #endif
1662
1663         if (fn && fn->fn_flags & RTN_RTINFO)
1664                 return fn;
1665
1666         return NULL;
1667 }
1668
1669
1670 /*
1671  *      Deletion
1672  *
1673  */
1674
1675 static struct fib6_info *fib6_find_prefix(struct net *net,
1676                                          struct fib6_table *table,
1677                                          struct fib6_node *fn)
1678 {
1679         struct fib6_node *child_left, *child_right;
1680
1681         if (fn->fn_flags & RTN_ROOT)
1682                 return net->ipv6.fib6_null_entry;
1683
1684         while (fn) {
1685                 child_left = rcu_dereference_protected(fn->left,
1686                                     lockdep_is_held(&table->tb6_lock));
1687                 child_right = rcu_dereference_protected(fn->right,
1688                                     lockdep_is_held(&table->tb6_lock));
1689                 if (child_left)
1690                         return rcu_dereference_protected(child_left->leaf,
1691                                         lockdep_is_held(&table->tb6_lock));
1692                 if (child_right)
1693                         return rcu_dereference_protected(child_right->leaf,
1694                                         lockdep_is_held(&table->tb6_lock));
1695
1696                 fn = FIB6_SUBTREE(fn);
1697         }
1698         return NULL;
1699 }
1700
1701 /*
1702  *      Called to trim the tree of intermediate nodes when possible. "fn"
1703  *      is the node we want to try and remove.
1704  *      Need to own table->tb6_lock
1705  */
1706
1707 static struct fib6_node *fib6_repair_tree(struct net *net,
1708                                           struct fib6_table *table,
1709                                           struct fib6_node *fn)
1710 {
1711         int children;
1712         int nstate;
1713         struct fib6_node *child;
1714         struct fib6_walker *w;
1715         int iter = 0;
1716
1717         /* Set fn->leaf to null_entry for root node. */
1718         if (fn->fn_flags & RTN_TL_ROOT) {
1719                 rcu_assign_pointer(fn->leaf, net->ipv6.fib6_null_entry);
1720                 return fn;
1721         }
1722
1723         for (;;) {
1724                 struct fib6_node *fn_r = rcu_dereference_protected(fn->right,
1725                                             lockdep_is_held(&table->tb6_lock));
1726                 struct fib6_node *fn_l = rcu_dereference_protected(fn->left,
1727                                             lockdep_is_held(&table->tb6_lock));
1728                 struct fib6_node *pn = rcu_dereference_protected(fn->parent,
1729                                             lockdep_is_held(&table->tb6_lock));
1730                 struct fib6_node *pn_r = rcu_dereference_protected(pn->right,
1731                                             lockdep_is_held(&table->tb6_lock));
1732                 struct fib6_node *pn_l = rcu_dereference_protected(pn->left,
1733                                             lockdep_is_held(&table->tb6_lock));
1734                 struct fib6_info *fn_leaf = rcu_dereference_protected(fn->leaf,
1735                                             lockdep_is_held(&table->tb6_lock));
1736                 struct fib6_info *pn_leaf = rcu_dereference_protected(pn->leaf,
1737                                             lockdep_is_held(&table->tb6_lock));
1738                 struct fib6_info *new_fn_leaf;
1739
1740                 RT6_TRACE("fixing tree: plen=%d iter=%d\n", fn->fn_bit, iter);
1741                 iter++;
1742
1743                 WARN_ON(fn->fn_flags & RTN_RTINFO);
1744                 WARN_ON(fn->fn_flags & RTN_TL_ROOT);
1745                 WARN_ON(fn_leaf);
1746
1747                 children = 0;
1748                 child = NULL;
1749                 if (fn_r)
1750                         child = fn_r, children |= 1;
1751                 if (fn_l)
1752                         child = fn_l, children |= 2;
1753
1754                 if (children == 3 || FIB6_SUBTREE(fn)
1755 #ifdef CONFIG_IPV6_SUBTREES
1756                     /* Subtree root (i.e. fn) may have one child */
1757                     || (children && fn->fn_flags & RTN_ROOT)
1758 #endif
1759                     ) {
1760                         new_fn_leaf = fib6_find_prefix(net, table, fn);
1761 #if RT6_DEBUG >= 2
1762                         if (!new_fn_leaf) {
1763                                 WARN_ON(!new_fn_leaf);
1764                                 new_fn_leaf = net->ipv6.fib6_null_entry;
1765                         }
1766 #endif
1767                         fib6_info_hold(new_fn_leaf);
1768                         rcu_assign_pointer(fn->leaf, new_fn_leaf);
1769                         return pn;
1770                 }
1771
1772 #ifdef CONFIG_IPV6_SUBTREES
1773                 if (FIB6_SUBTREE(pn) == fn) {
1774                         WARN_ON(!(fn->fn_flags & RTN_ROOT));
1775                         RCU_INIT_POINTER(pn->subtree, NULL);
1776                         nstate = FWS_L;
1777                 } else {
1778                         WARN_ON(fn->fn_flags & RTN_ROOT);
1779 #endif
1780                         if (pn_r == fn)
1781                                 rcu_assign_pointer(pn->right, child);
1782                         else if (pn_l == fn)
1783                                 rcu_assign_pointer(pn->left, child);
1784 #if RT6_DEBUG >= 2
1785                         else
1786                                 WARN_ON(1);
1787 #endif
1788                         if (child)
1789                                 rcu_assign_pointer(child->parent, pn);
1790                         nstate = FWS_R;
1791 #ifdef CONFIG_IPV6_SUBTREES
1792                 }
1793 #endif
1794
1795                 read_lock(&net->ipv6.fib6_walker_lock);
1796                 FOR_WALKERS(net, w) {
1797                         if (!child) {
1798                                 if (w->node == fn) {
1799                                         RT6_TRACE("W %p adjusted by delnode 1, s=%d/%d\n", w, w->state, nstate);
1800                                         w->node = pn;
1801                                         w->state = nstate;
1802                                 }
1803                         } else {
1804                                 if (w->node == fn) {
1805                                         w->node = child;
1806                                         if (children&2) {
1807                                                 RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state);
1808                                                 w->state = w->state >= FWS_R ? FWS_U : FWS_INIT;
1809                                         } else {
1810                                                 RT6_TRACE("W %p adjusted by delnode 2, s=%d\n", w, w->state);
1811                                                 w->state = w->state >= FWS_C ? FWS_U : FWS_INIT;
1812                                         }
1813                                 }
1814                         }
1815                 }
1816                 read_unlock(&net->ipv6.fib6_walker_lock);
1817
1818                 node_free(net, fn);
1819                 if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn))
1820                         return pn;
1821
1822                 RCU_INIT_POINTER(pn->leaf, NULL);
1823                 fib6_info_release(pn_leaf);
1824                 fn = pn;
1825         }
1826 }
1827
1828 static void fib6_del_route(struct fib6_table *table, struct fib6_node *fn,
1829                            struct fib6_info __rcu **rtp, struct nl_info *info)
1830 {
1831         struct fib6_walker *w;
1832         struct fib6_info *rt = rcu_dereference_protected(*rtp,
1833                                     lockdep_is_held(&table->tb6_lock));
1834         struct net *net = info->nl_net;
1835
1836         RT6_TRACE("fib6_del_route\n");
1837
1838         /* Unlink it */
1839         *rtp = rt->fib6_next;
1840         rt->fib6_node = NULL;
1841         net->ipv6.rt6_stats->fib_rt_entries--;
1842         net->ipv6.rt6_stats->fib_discarded_routes++;
1843
1844         /* Reset round-robin state, if necessary */
1845         if (rcu_access_pointer(fn->rr_ptr) == rt)
1846                 fn->rr_ptr = NULL;
1847
1848         /* Remove this entry from other siblings */
1849         if (rt->fib6_nsiblings) {
1850                 struct fib6_info *sibling, *next_sibling;
1851
1852                 list_for_each_entry_safe(sibling, next_sibling,
1853                                          &rt->fib6_siblings, fib6_siblings)
1854                         sibling->fib6_nsiblings--;
1855                 rt->fib6_nsiblings = 0;
1856                 list_del_init(&rt->fib6_siblings);
1857                 rt6_multipath_rebalance(next_sibling);
1858         }
1859
1860         /* Adjust walkers */
1861         read_lock(&net->ipv6.fib6_walker_lock);
1862         FOR_WALKERS(net, w) {
1863                 if (w->state == FWS_C && w->leaf == rt) {
1864                         RT6_TRACE("walker %p adjusted by delroute\n", w);
1865                         w->leaf = rcu_dereference_protected(rt->fib6_next,
1866                                             lockdep_is_held(&table->tb6_lock));
1867                         if (!w->leaf)
1868                                 w->state = FWS_U;
1869                 }
1870         }
1871         read_unlock(&net->ipv6.fib6_walker_lock);
1872
1873         /* If it was last route, call fib6_repair_tree() to:
1874          * 1. For root node, put back null_entry as how the table was created.
1875          * 2. For other nodes, expunge its radix tree node.
1876          */
1877         if (!rcu_access_pointer(fn->leaf)) {
1878                 if (!(fn->fn_flags & RTN_TL_ROOT)) {
1879                         fn->fn_flags &= ~RTN_RTINFO;
1880                         net->ipv6.rt6_stats->fib_route_nodes--;
1881                 }
1882                 fn = fib6_repair_tree(net, table, fn);
1883         }
1884
1885         fib6_purge_rt(rt, fn, net);
1886
1887         if (!info->skip_notify_kernel)
1888                 call_fib6_entry_notifiers(net, FIB_EVENT_ENTRY_DEL, rt, NULL);
1889         if (!info->skip_notify)
1890                 inet6_rt_notify(RTM_DELROUTE, rt, info, 0);
1891
1892         fib6_info_release(rt);
1893 }
1894
1895 /* Need to own table->tb6_lock */
1896 int fib6_del(struct fib6_info *rt, struct nl_info *info)
1897 {
1898         struct net *net = info->nl_net;
1899         struct fib6_info __rcu **rtp;
1900         struct fib6_info __rcu **rtp_next;
1901         struct fib6_table *table;
1902         struct fib6_node *fn;
1903
1904         if (rt == net->ipv6.fib6_null_entry)
1905                 return -ENOENT;
1906
1907         table = rt->fib6_table;
1908         fn = rcu_dereference_protected(rt->fib6_node,
1909                                        lockdep_is_held(&table->tb6_lock));
1910         if (!fn)
1911                 return -ENOENT;
1912
1913         WARN_ON(!(fn->fn_flags & RTN_RTINFO));
1914
1915         /*
1916          *      Walk the leaf entries looking for ourself
1917          */
1918
1919         for (rtp = &fn->leaf; *rtp; rtp = rtp_next) {
1920                 struct fib6_info *cur = rcu_dereference_protected(*rtp,
1921                                         lockdep_is_held(&table->tb6_lock));
1922                 if (rt == cur) {
1923                         fib6_del_route(table, fn, rtp, info);
1924                         return 0;
1925                 }
1926                 rtp_next = &cur->fib6_next;
1927         }
1928         return -ENOENT;
1929 }
1930
1931 /*
1932  *      Tree traversal function.
1933  *
1934  *      Certainly, it is not interrupt safe.
1935  *      However, it is internally reenterable wrt itself and fib6_add/fib6_del.
1936  *      It means, that we can modify tree during walking
1937  *      and use this function for garbage collection, clone pruning,
1938  *      cleaning tree when a device goes down etc. etc.
1939  *
1940  *      It guarantees that every node will be traversed,
1941  *      and that it will be traversed only once.
1942  *
1943  *      Callback function w->func may return:
1944  *      0 -> continue walking.
1945  *      positive value -> walking is suspended (used by tree dumps,
1946  *      and probably by gc, if it will be split to several slices)
1947  *      negative value -> terminate walking.
1948  *
1949  *      The function itself returns:
1950  *      0   -> walk is complete.
1951  *      >0  -> walk is incomplete (i.e. suspended)
1952  *      <0  -> walk is terminated by an error.
1953  *
1954  *      This function is called with tb6_lock held.
1955  */
1956
1957 static int fib6_walk_continue(struct fib6_walker *w)
1958 {
1959         struct fib6_node *fn, *pn, *left, *right;
1960
1961         /* w->root should always be table->tb6_root */
1962         WARN_ON_ONCE(!(w->root->fn_flags & RTN_TL_ROOT));
1963
1964         for (;;) {
1965                 fn = w->node;
1966                 if (!fn)
1967                         return 0;
1968
1969                 switch (w->state) {
1970 #ifdef CONFIG_IPV6_SUBTREES
1971                 case FWS_S:
1972                         if (FIB6_SUBTREE(fn)) {
1973                                 w->node = FIB6_SUBTREE(fn);
1974                                 continue;
1975                         }
1976                         w->state = FWS_L;
1977 #endif
1978                         /* fall through */
1979                 case FWS_L:
1980                         left = rcu_dereference_protected(fn->left, 1);
1981                         if (left) {
1982                                 w->node = left;
1983                                 w->state = FWS_INIT;
1984                                 continue;
1985                         }
1986                         w->state = FWS_R;
1987                         /* fall through */
1988                 case FWS_R:
1989                         right = rcu_dereference_protected(fn->right, 1);
1990                         if (right) {
1991                                 w->node = right;
1992                                 w->state = FWS_INIT;
1993                                 continue;
1994                         }
1995                         w->state = FWS_C;
1996                         w->leaf = rcu_dereference_protected(fn->leaf, 1);
1997                         /* fall through */
1998                 case FWS_C:
1999                         if (w->leaf && fn->fn_flags & RTN_RTINFO) {
2000                                 int err;
2001
2002                                 if (w->skip) {
2003                                         w->skip--;
2004                                         goto skip;
2005                                 }
2006
2007                                 err = w->func(w);
2008                                 if (err)
2009                                         return err;
2010
2011                                 w->count++;
2012                                 continue;
2013                         }
2014 skip:
2015                         w->state = FWS_U;
2016                         /* fall through */
2017                 case FWS_U:
2018                         if (fn == w->root)
2019                                 return 0;
2020                         pn = rcu_dereference_protected(fn->parent, 1);
2021                         left = rcu_dereference_protected(pn->left, 1);
2022                         right = rcu_dereference_protected(pn->right, 1);
2023                         w->node = pn;
2024 #ifdef CONFIG_IPV6_SUBTREES
2025                         if (FIB6_SUBTREE(pn) == fn) {
2026                                 WARN_ON(!(fn->fn_flags & RTN_ROOT));
2027                                 w->state = FWS_L;
2028                                 continue;
2029                         }
2030 #endif
2031                         if (left == fn) {
2032                                 w->state = FWS_R;
2033                                 continue;
2034                         }
2035                         if (right == fn) {
2036                                 w->state = FWS_C;
2037                                 w->leaf = rcu_dereference_protected(w->node->leaf, 1);
2038                                 continue;
2039                         }
2040 #if RT6_DEBUG >= 2
2041                         WARN_ON(1);
2042 #endif
2043                 }
2044         }
2045 }
2046
2047 static int fib6_walk(struct net *net, struct fib6_walker *w)
2048 {
2049         int res;
2050
2051         w->state = FWS_INIT;
2052         w->node = w->root;
2053
2054         fib6_walker_link(net, w);
2055         res = fib6_walk_continue(w);
2056         if (res <= 0)
2057                 fib6_walker_unlink(net, w);
2058         return res;
2059 }
2060
2061 static int fib6_clean_node(struct fib6_walker *w)
2062 {
2063         int res;
2064         struct fib6_info *rt;
2065         struct fib6_cleaner *c = container_of(w, struct fib6_cleaner, w);
2066         struct nl_info info = {
2067                 .nl_net = c->net,
2068                 .skip_notify = c->skip_notify,
2069         };
2070
2071         if (c->sernum != FIB6_NO_SERNUM_CHANGE &&
2072             READ_ONCE(w->node->fn_sernum) != c->sernum)
2073                 WRITE_ONCE(w->node->fn_sernum, c->sernum);
2074
2075         if (!c->func) {
2076                 WARN_ON_ONCE(c->sernum == FIB6_NO_SERNUM_CHANGE);
2077                 w->leaf = NULL;
2078                 return 0;
2079         }
2080
2081         for_each_fib6_walker_rt(w) {
2082                 res = c->func(rt, c->arg);
2083                 if (res == -1) {
2084                         w->leaf = rt;
2085                         res = fib6_del(rt, &info);
2086                         if (res) {
2087 #if RT6_DEBUG >= 2
2088                                 pr_debug("%s: del failed: rt=%p@%p err=%d\n",
2089                                          __func__, rt,
2090                                          rcu_access_pointer(rt->fib6_node),
2091                                          res);
2092 #endif
2093                                 continue;
2094                         }
2095                         return 0;
2096                 } else if (res == -2) {
2097                         if (WARN_ON(!rt->fib6_nsiblings))
2098                                 continue;
2099                         rt = list_last_entry(&rt->fib6_siblings,
2100                                              struct fib6_info, fib6_siblings);
2101                         continue;
2102                 }
2103                 WARN_ON(res != 0);
2104         }
2105         w->leaf = rt;
2106         return 0;
2107 }
2108
2109 /*
2110  *      Convenient frontend to tree walker.
2111  *
2112  *      func is called on each route.
2113  *              It may return -2 -> skip multipath route.
2114  *                            -1 -> delete this route.
2115  *                            0  -> continue walking
2116  */
2117
2118 static void fib6_clean_tree(struct net *net, struct fib6_node *root,
2119                             int (*func)(struct fib6_info *, void *arg),
2120                             int sernum, void *arg, bool skip_notify)
2121 {
2122         struct fib6_cleaner c;
2123
2124         c.w.root = root;
2125         c.w.func = fib6_clean_node;
2126         c.w.count = 0;
2127         c.w.skip = 0;
2128         c.w.skip_in_node = 0;
2129         c.func = func;
2130         c.sernum = sernum;
2131         c.arg = arg;
2132         c.net = net;
2133         c.skip_notify = skip_notify;
2134
2135         fib6_walk(net, &c.w);
2136 }
2137
2138 static void __fib6_clean_all(struct net *net,
2139                              int (*func)(struct fib6_info *, void *),
2140                              int sernum, void *arg, bool skip_notify)
2141 {
2142         struct fib6_table *table;
2143         struct hlist_head *head;
2144         unsigned int h;
2145
2146         rcu_read_lock();
2147         for (h = 0; h < FIB6_TABLE_HASHSZ; h++) {
2148                 head = &net->ipv6.fib_table_hash[h];
2149                 hlist_for_each_entry_rcu(table, head, tb6_hlist) {
2150                         spin_lock_bh(&table->tb6_lock);
2151                         fib6_clean_tree(net, &table->tb6_root,
2152                                         func, sernum, arg, skip_notify);
2153                         spin_unlock_bh(&table->tb6_lock);
2154                 }
2155         }
2156         rcu_read_unlock();
2157 }
2158
2159 void fib6_clean_all(struct net *net, int (*func)(struct fib6_info *, void *),
2160                     void *arg)
2161 {
2162         __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, false);
2163 }
2164
2165 void fib6_clean_all_skip_notify(struct net *net,
2166                                 int (*func)(struct fib6_info *, void *),
2167                                 void *arg)
2168 {
2169         __fib6_clean_all(net, func, FIB6_NO_SERNUM_CHANGE, arg, true);
2170 }
2171
2172 static void fib6_flush_trees(struct net *net)
2173 {
2174         int new_sernum = fib6_new_sernum(net);
2175
2176         __fib6_clean_all(net, NULL, new_sernum, NULL, false);
2177 }
2178
2179 /*
2180  *      Garbage collection
2181  */
2182
2183 static int fib6_age(struct fib6_info *rt, void *arg)
2184 {
2185         struct fib6_gc_args *gc_args = arg;
2186         unsigned long now = jiffies;
2187
2188         /*
2189          *      check addrconf expiration here.
2190          *      Routes are expired even if they are in use.
2191          */
2192
2193         if (rt->fib6_flags & RTF_EXPIRES && rt->expires) {
2194                 if (time_after(now, rt->expires)) {
2195                         RT6_TRACE("expiring %p\n", rt);
2196                         return -1;
2197                 }
2198                 gc_args->more++;
2199         }
2200
2201         /*      Also age clones in the exception table.
2202          *      Note, that clones are aged out
2203          *      only if they are not in use now.
2204          */
2205         rt6_age_exceptions(rt, gc_args, now);
2206
2207         return 0;
2208 }
2209
2210 void fib6_run_gc(unsigned long expires, struct net *net, bool force)
2211 {
2212         struct fib6_gc_args gc_args;
2213         unsigned long now;
2214
2215         if (force) {
2216                 spin_lock_bh(&net->ipv6.fib6_gc_lock);
2217         } else if (!spin_trylock_bh(&net->ipv6.fib6_gc_lock)) {
2218                 mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ);
2219                 return;
2220         }
2221         gc_args.timeout = expires ? (int)expires :
2222                           net->ipv6.sysctl.ip6_rt_gc_interval;
2223         gc_args.more = 0;
2224
2225         fib6_clean_all(net, fib6_age, &gc_args);
2226         now = jiffies;
2227         net->ipv6.ip6_rt_last_gc = now;
2228
2229         if (gc_args.more)
2230                 mod_timer(&net->ipv6.ip6_fib_timer,
2231                           round_jiffies(now
2232                                         + net->ipv6.sysctl.ip6_rt_gc_interval));
2233         else
2234                 del_timer(&net->ipv6.ip6_fib_timer);
2235         spin_unlock_bh(&net->ipv6.fib6_gc_lock);
2236 }
2237
2238 static void fib6_gc_timer_cb(struct timer_list *t)
2239 {
2240         struct net *arg = from_timer(arg, t, ipv6.ip6_fib_timer);
2241
2242         fib6_run_gc(0, arg, true);
2243 }
2244
2245 static int __net_init fib6_net_init(struct net *net)
2246 {
2247         size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ;
2248         int err;
2249
2250         err = fib6_notifier_init(net);
2251         if (err)
2252                 return err;
2253
2254         spin_lock_init(&net->ipv6.fib6_gc_lock);
2255         rwlock_init(&net->ipv6.fib6_walker_lock);
2256         INIT_LIST_HEAD(&net->ipv6.fib6_walkers);
2257         timer_setup(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, 0);
2258
2259         net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL);
2260         if (!net->ipv6.rt6_stats)
2261                 goto out_timer;
2262
2263         /* Avoid false sharing : Use at least a full cache line */
2264         size = max_t(size_t, size, L1_CACHE_BYTES);
2265
2266         net->ipv6.fib_table_hash = kzalloc(size, GFP_KERNEL);
2267         if (!net->ipv6.fib_table_hash)
2268                 goto out_rt6_stats;
2269
2270         net->ipv6.fib6_main_tbl = kzalloc(sizeof(*net->ipv6.fib6_main_tbl),
2271                                           GFP_KERNEL);
2272         if (!net->ipv6.fib6_main_tbl)
2273                 goto out_fib_table_hash;
2274
2275         net->ipv6.fib6_main_tbl->tb6_id = RT6_TABLE_MAIN;
2276         rcu_assign_pointer(net->ipv6.fib6_main_tbl->tb6_root.leaf,
2277                            net->ipv6.fib6_null_entry);
2278         net->ipv6.fib6_main_tbl->tb6_root.fn_flags =
2279                 RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
2280         inet_peer_base_init(&net->ipv6.fib6_main_tbl->tb6_peers);
2281
2282 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2283         net->ipv6.fib6_local_tbl = kzalloc(sizeof(*net->ipv6.fib6_local_tbl),
2284                                            GFP_KERNEL);
2285         if (!net->ipv6.fib6_local_tbl)
2286                 goto out_fib6_main_tbl;
2287         net->ipv6.fib6_local_tbl->tb6_id = RT6_TABLE_LOCAL;
2288         rcu_assign_pointer(net->ipv6.fib6_local_tbl->tb6_root.leaf,
2289                            net->ipv6.fib6_null_entry);
2290         net->ipv6.fib6_local_tbl->tb6_root.fn_flags =
2291                 RTN_ROOT | RTN_TL_ROOT | RTN_RTINFO;
2292         inet_peer_base_init(&net->ipv6.fib6_local_tbl->tb6_peers);
2293 #endif
2294         fib6_tables_init(net);
2295
2296         return 0;
2297
2298 #ifdef CONFIG_IPV6_MULTIPLE_TABLES
2299 out_fib6_main_tbl:
2300         kfree(net->ipv6.fib6_main_tbl);
2301 #endif
2302 out_fib_table_hash:
2303         kfree(net->ipv6.fib_table_hash);
2304 out_rt6_stats:
2305         kfree(net->ipv6.rt6_stats);
2306 out_timer:
2307         fib6_notifier_exit(net);
2308         return -ENOMEM;
2309 }
2310
2311 static void fib6_net_exit(struct net *net)
2312 {
2313         unsigned int i;
2314
2315         del_timer_sync(&net->ipv6.ip6_fib_timer);
2316
2317         for (i = 0; i < FIB6_TABLE_HASHSZ; i++) {
2318                 struct hlist_head *head = &net->ipv6.fib_table_hash[i];
2319                 struct hlist_node *tmp;
2320                 struct fib6_table *tb;
2321
2322                 hlist_for_each_entry_safe(tb, tmp, head, tb6_hlist) {
2323                         hlist_del(&tb->tb6_hlist);
2324                         fib6_free_table(tb);
2325                 }
2326         }
2327
2328         kfree(net->ipv6.fib_table_hash);
2329         kfree(net->ipv6.rt6_stats);
2330         fib6_notifier_exit(net);
2331 }
2332
2333 static struct pernet_operations fib6_net_ops = {
2334         .init = fib6_net_init,
2335         .exit = fib6_net_exit,
2336 };
2337
2338 int __init fib6_init(void)
2339 {
2340         int ret = -ENOMEM;
2341
2342         fib6_node_kmem = kmem_cache_create("fib6_nodes",
2343                                            sizeof(struct fib6_node),
2344                                            0, SLAB_HWCACHE_ALIGN,
2345                                            NULL);
2346         if (!fib6_node_kmem)
2347                 goto out;
2348
2349         ret = register_pernet_subsys(&fib6_net_ops);
2350         if (ret)
2351                 goto out_kmem_cache_create;
2352
2353         ret = rtnl_register_module(THIS_MODULE, PF_INET6, RTM_GETROUTE, NULL,
2354                                    inet6_dump_fib, 0);
2355         if (ret)
2356                 goto out_unregister_subsys;
2357
2358         __fib6_flush_trees = fib6_flush_trees;
2359 out:
2360         return ret;
2361
2362 out_unregister_subsys:
2363         unregister_pernet_subsys(&fib6_net_ops);
2364 out_kmem_cache_create:
2365         kmem_cache_destroy(fib6_node_kmem);
2366         goto out;
2367 }
2368
2369 void fib6_gc_cleanup(void)
2370 {
2371         unregister_pernet_subsys(&fib6_net_ops);
2372         kmem_cache_destroy(fib6_node_kmem);
2373 }
2374
2375 #ifdef CONFIG_PROC_FS
2376 static int ipv6_route_seq_show(struct seq_file *seq, void *v)
2377 {
2378         struct fib6_info *rt = v;
2379         struct ipv6_route_iter *iter = seq->private;
2380         struct fib6_nh *fib6_nh = rt->fib6_nh;
2381         unsigned int flags = rt->fib6_flags;
2382         const struct net_device *dev;
2383
2384         if (rt->nh)
2385                 fib6_nh = nexthop_fib6_nh_bh(rt->nh);
2386
2387         seq_printf(seq, "%pi6 %02x ", &rt->fib6_dst.addr, rt->fib6_dst.plen);
2388
2389 #ifdef CONFIG_IPV6_SUBTREES
2390         seq_printf(seq, "%pi6 %02x ", &rt->fib6_src.addr, rt->fib6_src.plen);
2391 #else
2392         seq_puts(seq, "00000000000000000000000000000000 00 ");
2393 #endif
2394         if (fib6_nh->fib_nh_gw_family) {
2395                 flags |= RTF_GATEWAY;
2396                 seq_printf(seq, "%pi6", &fib6_nh->fib_nh_gw6);
2397         } else {
2398                 seq_puts(seq, "00000000000000000000000000000000");
2399         }
2400
2401         dev = fib6_nh->fib_nh_dev;
2402         seq_printf(seq, " %08x %08x %08x %08x %8s\n",
2403                    rt->fib6_metric, refcount_read(&rt->fib6_ref), 0,
2404                    flags, dev ? dev->name : "");
2405         iter->w.leaf = NULL;
2406         return 0;
2407 }
2408
2409 static int ipv6_route_yield(struct fib6_walker *w)
2410 {
2411         struct ipv6_route_iter *iter = w->args;
2412
2413         if (!iter->skip)
2414                 return 1;
2415
2416         do {
2417                 iter->w.leaf = rcu_dereference_protected(
2418                                 iter->w.leaf->fib6_next,
2419                                 lockdep_is_held(&iter->tbl->tb6_lock));
2420                 iter->skip--;
2421                 if (!iter->skip && iter->w.leaf)
2422                         return 1;
2423         } while (iter->w.leaf);
2424
2425         return 0;
2426 }
2427
2428 static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter,
2429                                       struct net *net)
2430 {
2431         memset(&iter->w, 0, sizeof(iter->w));
2432         iter->w.func = ipv6_route_yield;
2433         iter->w.root = &iter->tbl->tb6_root;
2434         iter->w.state = FWS_INIT;
2435         iter->w.node = iter->w.root;
2436         iter->w.args = iter;
2437         iter->sernum = READ_ONCE(iter->w.root->fn_sernum);
2438         INIT_LIST_HEAD(&iter->w.lh);
2439         fib6_walker_link(net, &iter->w);
2440 }
2441
2442 static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl,
2443                                                     struct net *net)
2444 {
2445         unsigned int h;
2446         struct hlist_node *node;
2447
2448         if (tbl) {
2449                 h = (tbl->tb6_id & (FIB6_TABLE_HASHSZ - 1)) + 1;
2450                 node = rcu_dereference_bh(hlist_next_rcu(&tbl->tb6_hlist));
2451         } else {
2452                 h = 0;
2453                 node = NULL;
2454         }
2455
2456         while (!node && h < FIB6_TABLE_HASHSZ) {
2457                 node = rcu_dereference_bh(
2458                         hlist_first_rcu(&net->ipv6.fib_table_hash[h++]));
2459         }
2460         return hlist_entry_safe(node, struct fib6_table, tb6_hlist);
2461 }
2462
2463 static void ipv6_route_check_sernum(struct ipv6_route_iter *iter)
2464 {
2465         int sernum = READ_ONCE(iter->w.root->fn_sernum);
2466
2467         if (iter->sernum != sernum) {
2468                 iter->sernum = sernum;
2469                 iter->w.state = FWS_INIT;
2470                 iter->w.node = iter->w.root;
2471                 WARN_ON(iter->w.skip);
2472                 iter->w.skip = iter->w.count;
2473         }
2474 }
2475
2476 static void *ipv6_route_seq_next(struct seq_file *seq, void *v, loff_t *pos)
2477 {
2478         int r;
2479         struct fib6_info *n;
2480         struct net *net = seq_file_net(seq);
2481         struct ipv6_route_iter *iter = seq->private;
2482
2483         ++(*pos);
2484         if (!v)
2485                 goto iter_table;
2486
2487         n = rcu_dereference_bh(((struct fib6_info *)v)->fib6_next);
2488         if (n)
2489                 return n;
2490
2491 iter_table:
2492         ipv6_route_check_sernum(iter);
2493         spin_lock_bh(&iter->tbl->tb6_lock);
2494         r = fib6_walk_continue(&iter->w);
2495         spin_unlock_bh(&iter->tbl->tb6_lock);
2496         if (r > 0) {
2497                 return iter->w.leaf;
2498         } else if (r < 0) {
2499                 fib6_walker_unlink(net, &iter->w);
2500                 return NULL;
2501         }
2502         fib6_walker_unlink(net, &iter->w);
2503
2504         iter->tbl = ipv6_route_seq_next_table(iter->tbl, net);
2505         if (!iter->tbl)
2506                 return NULL;
2507
2508         ipv6_route_seq_setup_walk(iter, net);
2509         goto iter_table;
2510 }
2511
2512 static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos)
2513         __acquires(RCU_BH)
2514 {
2515         struct net *net = seq_file_net(seq);
2516         struct ipv6_route_iter *iter = seq->private;
2517
2518         rcu_read_lock_bh();
2519         iter->tbl = ipv6_route_seq_next_table(NULL, net);
2520         iter->skip = *pos;
2521
2522         if (iter->tbl) {
2523                 loff_t p = 0;
2524
2525                 ipv6_route_seq_setup_walk(iter, net);
2526                 return ipv6_route_seq_next(seq, NULL, &p);
2527         } else {
2528                 return NULL;
2529         }
2530 }
2531
2532 static bool ipv6_route_iter_active(struct ipv6_route_iter *iter)
2533 {
2534         struct fib6_walker *w = &iter->w;
2535         return w->node && !(w->state == FWS_U && w->node == w->root);
2536 }
2537
2538 static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
2539         __releases(RCU_BH)
2540 {
2541         struct net *net = seq_file_net(seq);
2542         struct ipv6_route_iter *iter = seq->private;
2543
2544         if (ipv6_route_iter_active(iter))
2545                 fib6_walker_unlink(net, &iter->w);
2546
2547         rcu_read_unlock_bh();
2548 }
2549
2550 const struct seq_operations ipv6_route_seq_ops = {
2551         .start  = ipv6_route_seq_start,
2552         .next   = ipv6_route_seq_next,
2553         .stop   = ipv6_route_seq_stop,
2554         .show   = ipv6_route_seq_show
2555 };
2556 #endif /* CONFIG_PROC_FS */