GNU Linux-libre 4.9.318-gnu1
[releases.git] / net / ipv4 / fib_semantics.c
1 /*
2  * INET         An implementation of the TCP/IP protocol suite for the LINUX
3  *              operating system.  INET is implemented using the  BSD Socket
4  *              interface as the means of communication with the user level.
5  *
6  *              IPv4 Forwarding Information Base: semantics.
7  *
8  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
9  *
10  *              This program is free software; you can redistribute it and/or
11  *              modify it under the terms of the GNU General Public License
12  *              as published by the Free Software Foundation; either version
13  *              2 of the License, or (at your option) any later version.
14  */
15
16 #include <asm/uaccess.h>
17 #include <linux/bitops.h>
18 #include <linux/types.h>
19 #include <linux/kernel.h>
20 #include <linux/jiffies.h>
21 #include <linux/mm.h>
22 #include <linux/string.h>
23 #include <linux/socket.h>
24 #include <linux/sockios.h>
25 #include <linux/errno.h>
26 #include <linux/in.h>
27 #include <linux/inet.h>
28 #include <linux/inetdevice.h>
29 #include <linux/netdevice.h>
30 #include <linux/if_arp.h>
31 #include <linux/proc_fs.h>
32 #include <linux/skbuff.h>
33 #include <linux/init.h>
34 #include <linux/slab.h>
35
36 #include <net/arp.h>
37 #include <net/ip.h>
38 #include <net/protocol.h>
39 #include <net/route.h>
40 #include <net/tcp.h>
41 #include <net/sock.h>
42 #include <net/ip_fib.h>
43 #include <net/netlink.h>
44 #include <net/nexthop.h>
45 #include <net/lwtunnel.h>
46
47 #include "fib_lookup.h"
48
49 static DEFINE_SPINLOCK(fib_info_lock);
50 static struct hlist_head *fib_info_hash;
51 static struct hlist_head *fib_info_laddrhash;
52 static unsigned int fib_info_hash_size;
53 static unsigned int fib_info_cnt;
54
55 #define DEVINDEX_HASHBITS 8
56 #define DEVINDEX_HASHSIZE (1U << DEVINDEX_HASHBITS)
57 static struct hlist_head fib_info_devhash[DEVINDEX_HASHSIZE];
58
#ifdef CONFIG_IP_ROUTE_MULTIPATH
/* Random seed mixed into the multipath hash so flow-to-nexthop mapping
 * is not predictable across boots; lazily seeded in fib_rebalance().
 */
u32 fib_multipath_secret __read_mostly;

/* Iterate over every nexthop of a fib_info.  Exposes a const pointer
 * 'nh' and index 'nhsel' to the loop body.  Opens an extra scope that
 * MUST be closed with endfor_nexthops().
 */
#define for_nexthops(fi) {                                              \
        int nhsel; const struct fib_nh *nh;                             \
        for (nhsel = 0, nh = (fi)->fib_nh;                              \
             nhsel < (fi)->fib_nhs;                                     \
             nh++, nhsel++)

/* Same iteration, but 'nexthop_nh' is writable (constness cast away). */
#define change_nexthops(fi) {                                           \
        int nhsel; struct fib_nh *nexthop_nh;                           \
        for (nhsel = 0, nexthop_nh = (struct fib_nh *)((fi)->fib_nh);   \
             nhsel < (fi)->fib_nhs;                                     \
             nexthop_nh++, nhsel++)

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/* Hope, that gcc will optimize it to get rid of dummy loop */

/* Single-nexthop build: the "loop" runs exactly once over fib_nh[0]. */
#define for_nexthops(fi) {                                              \
        int nhsel; const struct fib_nh *nh = (fi)->fib_nh;              \
        for (nhsel = 0; nhsel < 1; nhsel++)

#define change_nexthops(fi) {                                           \
        int nhsel;                                                      \
        struct fib_nh *nexthop_nh = (struct fib_nh *)((fi)->fib_nh);    \
        for (nhsel = 0; nhsel < 1; nhsel++)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */

/* Closes the scope opened by for_nexthops()/change_nexthops(). */
#define endfor_nexthops(fi) }
90
91
/* Per route-type properties: .error is what a sender hitting a route of
 * this type gets back (0 = deliverable), .scope is the widest address
 * scope a route of this type may carry.
 */
const struct fib_prop fib_props[RTN_MAX + 1] = {
        [RTN_UNSPEC] = {
                .error  = 0,
                .scope  = RT_SCOPE_NOWHERE,
        },
        [RTN_UNICAST] = {
                .error  = 0,
                .scope  = RT_SCOPE_UNIVERSE,
        },
        [RTN_LOCAL] = {
                .error  = 0,
                .scope  = RT_SCOPE_HOST,
        },
        [RTN_BROADCAST] = {
                .error  = 0,
                .scope  = RT_SCOPE_LINK,
        },
        [RTN_ANYCAST] = {
                .error  = 0,
                .scope  = RT_SCOPE_LINK,
        },
        [RTN_MULTICAST] = {
                .error  = 0,
                .scope  = RT_SCOPE_UNIVERSE,
        },
        [RTN_BLACKHOLE] = {
                .error  = -EINVAL,
                .scope  = RT_SCOPE_UNIVERSE,
        },
        [RTN_UNREACHABLE] = {
                .error  = -EHOSTUNREACH,
                .scope  = RT_SCOPE_UNIVERSE,
        },
        [RTN_PROHIBIT] = {
                .error  = -EACCES,
                .scope  = RT_SCOPE_UNIVERSE,
        },
        [RTN_THROW] = {
                .error  = -EAGAIN,
                .scope  = RT_SCOPE_UNIVERSE,
        },
        [RTN_NAT] = {
                .error  = -EINVAL,
                .scope  = RT_SCOPE_NOWHERE,
        },
        [RTN_XRESOLVE] = {
                .error  = -EINVAL,
                .scope  = RT_SCOPE_NOWHERE,
        },
};
142
143 static void rt_fibinfo_free(struct rtable __rcu **rtp)
144 {
145         struct rtable *rt = rcu_dereference_protected(*rtp, 1);
146
147         if (!rt)
148                 return;
149
150         /* Not even needed : RCU_INIT_POINTER(*rtp, NULL);
151          * because we waited an RCU grace period before calling
152          * free_fib_info_rcu()
153          */
154
155         dst_free(&rt->dst);
156 }
157
158 static void free_nh_exceptions(struct fib_nh *nh)
159 {
160         struct fnhe_hash_bucket *hash;
161         int i;
162
163         hash = rcu_dereference_protected(nh->nh_exceptions, 1);
164         if (!hash)
165                 return;
166         for (i = 0; i < FNHE_HASH_SIZE; i++) {
167                 struct fib_nh_exception *fnhe;
168
169                 fnhe = rcu_dereference_protected(hash[i].chain, 1);
170                 while (fnhe) {
171                         struct fib_nh_exception *next;
172                         
173                         next = rcu_dereference_protected(fnhe->fnhe_next, 1);
174
175                         rt_fibinfo_free(&fnhe->fnhe_rth_input);
176                         rt_fibinfo_free(&fnhe->fnhe_rth_output);
177
178                         kfree(fnhe);
179
180                         fnhe = next;
181                 }
182         }
183         kfree(hash);
184 }
185
186 static void rt_fibinfo_free_cpus(struct rtable __rcu * __percpu *rtp)
187 {
188         int cpu;
189
190         if (!rtp)
191                 return;
192
193         for_each_possible_cpu(cpu) {
194                 struct rtable *rt;
195
196                 rt = rcu_dereference_protected(*per_cpu_ptr(rtp, cpu), 1);
197                 if (rt)
198                         dst_free(&rt->dst);
199         }
200         free_percpu(rtp);
201 }
202
/* Release a nexthop info record.  RCU callback: a grace period has
 * elapsed, so no reader can still see fi and its per-nexthop state can
 * be torn down without further protection.
 */
static void free_fib_info_rcu(struct rcu_head *head)
{
        struct fib_info *fi = container_of(head, struct fib_info, rcu);
        struct dst_metrics *m;

        change_nexthops(fi) {
                if (nexthop_nh->nh_dev)
                        dev_put(nexthop_nh->nh_dev);
                lwtstate_put(nexthop_nh->nh_lwtstate);
                free_nh_exceptions(nexthop_nh);
                rt_fibinfo_free_cpus(nexthop_nh->nh_pcpu_rth_output);
                rt_fibinfo_free(&nexthop_nh->nh_rth_input);
        } endfor_nexthops(fi);

        /* Metrics may be shared between fib_infos: free only on the last
         * reference, and never free the static default block.
         */
        m = fi->fib_metrics;
        if (m != &dst_default_metrics && atomic_dec_and_test(&m->refcnt))
                kfree(m);
        kfree(fi);
}
223
/* Schedule a dead fib_info for RCU freeing.  The caller must already
 * have unhashed it and set fib_dead (see fib_release_info()); freeing a
 * live entry would corrupt the hash tables, so it is refused loudly.
 */
void free_fib_info(struct fib_info *fi)
{
        if (fi->fib_dead == 0) {
                pr_warn("Freeing alive fib_info %p\n", fi);
                return;
        }
        fib_info_cnt--;
#ifdef CONFIG_IP_ROUTE_CLASSID
        /* drop this entry's contribution to the per-netns tclassid count */
        change_nexthops(fi) {
                if (nexthop_nh->nh_tclassid)
                        fi->fib_net->ipv4.fib_num_tclassid_users--;
        } endfor_nexthops(fi);
#endif
        /* defer the actual teardown until all RCU readers are done */
        call_rcu(&fi->rcu, free_fib_info_rcu);
}
239
/* Drop one tree reference on fi.  On the last reference, unhash it from
 * the info, prefsrc and per-device hash tables under fib_info_lock,
 * mark it dead and drop the final refcount (freeing happens via RCU).
 */
void fib_release_info(struct fib_info *fi)
{
        spin_lock_bh(&fib_info_lock);
        if (fi && --fi->fib_treeref == 0) {
                hlist_del(&fi->fib_hash);
                if (fi->fib_prefsrc)
                        hlist_del(&fi->fib_lhash);
                change_nexthops(fi) {
                        /* nexthops that never resolved a device were
                         * never hashed into fib_info_devhash
                         */
                        if (!nexthop_nh->nh_dev)
                                continue;
                        hlist_del(&nexthop_nh->nh_hash);
                } endfor_nexthops(fi)
                fi->fib_dead = 1;
                fib_info_put(fi);
        }
        spin_unlock_bh(&fib_info_lock);
}
257
/* Compare the nexthop arrays of two fib_infos, pairwise and in order.
 * Returns 0 when every nexthop matches (flags compared modulo
 * RTNH_COMPARE_MASK), -1 at the first difference.  Caller guarantees
 * both infos have the same fib_nhs.
 */
static inline int nh_comp(const struct fib_info *fi, const struct fib_info *ofi)
{
        const struct fib_nh *onh = ofi->fib_nh;

        for_nexthops(fi) {
                if (nh->nh_oif != onh->nh_oif ||
                    nh->nh_gw  != onh->nh_gw ||
                    nh->nh_scope != onh->nh_scope ||
#ifdef CONFIG_IP_ROUTE_MULTIPATH
                    nh->nh_weight != onh->nh_weight ||
#endif
#ifdef CONFIG_IP_ROUTE_CLASSID
                    nh->nh_tclassid != onh->nh_tclassid ||
#endif
                    lwtunnel_cmp_encap(nh->nh_lwtstate, onh->nh_lwtstate) ||
                    ((nh->nh_flags ^ onh->nh_flags) & ~RTNH_COMPARE_MASK))
                        return -1;
                onh++;
        } endfor_nexthops(fi);
        return 0;
}
279
280 static inline unsigned int fib_devindex_hashfn(unsigned int val)
281 {
282         unsigned int mask = DEVINDEX_HASHSIZE - 1;
283
284         return (val ^
285                 (val >> DEVINDEX_HASHBITS) ^
286                 (val >> (DEVINDEX_HASHBITS * 2))) & mask;
287 }
288
/* Hash a fib_info into fib_info_hash.  Mixes nexthop count, protocol,
 * scope, prefsrc, priority and each nexthop's oif; must hash equal for
 * any two infos that fib_find_info() would consider identical.
 */
static inline unsigned int fib_info_hashfn(const struct fib_info *fi)
{
        unsigned int mask = (fib_info_hash_size - 1);
        unsigned int val = fi->fib_nhs;

        val ^= (fi->fib_protocol << 8) | fi->fib_scope;
        val ^= (__force u32)fi->fib_prefsrc;
        val ^= fi->fib_priority;
        for_nexthops(fi) {
                val ^= fib_devindex_hashfn(nh->nh_oif);
        } endfor_nexthops(fi)

        /* final avalanche, then reduce to the table size (power of 2) */
        return (val ^ (val >> 7) ^ (val >> 12)) & mask;
}
303
/* Look for an existing fib_info identical to nfi so new routes can
 * share it instead of allocating a duplicate.  Returns the cached entry
 * or NULL.  Caller must keep the hash stable (RTNL / fib_info_lock).
 */
static struct fib_info *fib_find_info(const struct fib_info *nfi)
{
        struct hlist_head *head;
        struct fib_info *fi;
        unsigned int hash;

        hash = fib_info_hashfn(nfi);
        head = &fib_info_hash[hash];

        hlist_for_each_entry(fi, head, fib_hash) {
                /* cheap discriminators first: netns and nexthop count */
                if (!net_eq(fi->fib_net, nfi->fib_net))
                        continue;
                if (fi->fib_nhs != nfi->fib_nhs)
                        continue;
                if (nfi->fib_protocol == fi->fib_protocol &&
                    nfi->fib_scope == fi->fib_scope &&
                    nfi->fib_prefsrc == fi->fib_prefsrc &&
                    nfi->fib_priority == fi->fib_priority &&
                    nfi->fib_type == fi->fib_type &&
                    memcmp(nfi->fib_metrics, fi->fib_metrics,
                           sizeof(u32) * RTAX_MAX) == 0 &&
                    !((nfi->fib_flags ^ fi->fib_flags) & ~RTNH_COMPARE_MASK) &&
                    (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0))
                        return fi;
        }

        return NULL;
}
332
333 /* Check, that the gateway is already configured.
334  * Used only by redirect accept routine.
335  */
336 int ip_fib_check_default(__be32 gw, struct net_device *dev)
337 {
338         struct hlist_head *head;
339         struct fib_nh *nh;
340         unsigned int hash;
341
342         spin_lock(&fib_info_lock);
343
344         hash = fib_devindex_hashfn(dev->ifindex);
345         head = &fib_info_devhash[hash];
346         hlist_for_each_entry(nh, head, nh_hash) {
347                 if (nh->nh_dev == dev &&
348                     nh->nh_gw == gw &&
349                     !(nh->nh_flags & RTNH_F_DEAD)) {
350                         spin_unlock(&fib_info_lock);
351                         return 0;
352                 }
353         }
354
355         spin_unlock(&fib_info_lock);
356
357         return -1;
358 }
359
/* Upper-bound the netlink message size needed to dump fi, so the skb
 * allocated in rtmsg_fib() can never be too small (fib_dump_info()
 * returning -EMSGSIZE would indicate a bug here).
 */
static inline size_t fib_nlmsg_size(struct fib_info *fi)
{
        size_t payload = NLMSG_ALIGN(sizeof(struct rtmsg))
                         + nla_total_size(4) /* RTA_TABLE */
                         + nla_total_size(4) /* RTA_DST */
                         + nla_total_size(4) /* RTA_PRIORITY */
                         + nla_total_size(4) /* RTA_PREFSRC */
                         + nla_total_size(TCP_CA_NAME_MAX); /* RTAX_CC_ALGO */

        /* space for nested metrics */
        payload += nla_total_size((RTAX_MAX * nla_total_size(4)));

        if (fi->fib_nhs) {
                size_t nh_encapsize = 0;
                /* Also handles the special case fib_nhs == 1 */

                /* each nexthop is packed in an attribute */
                size_t nhsize = nla_total_size(sizeof(struct rtnexthop));

                /* may contain flow and gateway attribute */
                nhsize += 2 * nla_total_size(4);

                /* grab encap info */
                for_nexthops(fi) {
                        if (nh->nh_lwtstate) {
                                /* RTA_ENCAP_TYPE */
                                nh_encapsize += lwtunnel_get_encap_size(
                                                nh->nh_lwtstate);
                                /* RTA_ENCAP */
                                nh_encapsize +=  nla_total_size(2);
                        }
                } endfor_nexthops(fi);

                /* all nexthops are packed in a nested attribute */
                payload += nla_total_size((fi->fib_nhs * nhsize) +
                                          nh_encapsize);

        }

        return payload;
}
401
/* Build and multicast an rtnetlink notification (event, e.g.
 * RTM_NEWROUTE) for route key/dst_len in table tb_id, described by fa.
 * On allocation or fill failure the error is propagated to listeners
 * via rtnl_set_sk_err().
 */
void rtmsg_fib(int event, __be32 key, struct fib_alias *fa,
               int dst_len, u32 tb_id, const struct nl_info *info,
               unsigned int nlm_flags)
{
        struct sk_buff *skb;
        u32 seq = info->nlh ? info->nlh->nlmsg_seq : 0;
        int err = -ENOBUFS;

        skb = nlmsg_new(fib_nlmsg_size(fa->fa_info), GFP_KERNEL);
        if (!skb)
                goto errout;

        err = fib_dump_info(skb, info->portid, seq, event, tb_id,
                            fa->fa_type, key, dst_len,
                            fa->fa_tos, fa->fa_info, nlm_flags);
        if (err < 0) {
                /* -EMSGSIZE implies BUG in fib_nlmsg_size() */
                WARN_ON(err == -EMSGSIZE);
                kfree_skb(skb);
                goto errout;
        }
        rtnl_notify(skb, info->nl_net, info->portid, RTNLGRP_IPV4_ROUTE,
                    info->nlh, GFP_KERNEL);
        return;
errout:
        if (err < 0)
                rtnl_set_sk_err(info->nl_net, RTNLGRP_IPV4_ROUTE, err);
}
430
/* Heuristically decide whether default route fi (at position 'order'
 * among the candidates, with 'dflt' the currently preferred one) looks
 * dead by probing the ARP state of its first-hop gateway.  Returns 1
 * when the neighbour state suggests the gateway is unusable, 0
 * otherwise.  While rejecting, remembers in *last_resort/*last_idx the
 * best fallback seen so far.
 */
static int fib_detect_death(struct fib_info *fi, int order,
                            struct fib_info **last_resort, int *last_idx,
                            int dflt)
{
        struct neighbour *n;
        int state = NUD_NONE;

        n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev);
        if (n) {
                state = n->nud_state;
                neigh_release(n);
        } else {
                /* no neighbour entry at all: nothing indicates death */
                return 0;
        }
        if (state == NUD_REACHABLE)
                return 0;
        if ((state & NUD_VALID) && order != dflt)
                return 0;
        /* gateway not clearly alive: record fi as a fallback candidate */
        if ((state & NUD_VALID) ||
            (*last_idx < 0 && order > dflt && state != NUD_INCOMPLETE)) {
                *last_resort = fi;
                *last_idx = order;
        }
        return 1;
}
456
457 #ifdef CONFIG_IP_ROUTE_MULTIPATH
458
/* Count the rtnexthop entries in a 'remaining'-byte RTA_MULTIPATH
 * stream.  Returns 0 (invalid) when trailing bytes are left over.
 */
static int fib_count_nexthops(struct rtnexthop *rtnh, int remaining)
{
        int count;

        for (count = 0; rtnh_ok(rtnh, remaining); count++)
                rtnh = rtnh_next(rtnh, &remaining);

        /* leftover implies invalid nexthop configuration, discard it */
        if (remaining > 0)
                return 0;
        return count;
}
471
/* Fill fi's nexthop array from the RTA_MULTIPATH attribute stream
 * starting at rtnh ('remaining' bytes long).  Returns 0 on success or a
 * negative errno on malformed input / lwtunnel build failure.
 */
static int fib_get_nhs(struct fib_info *fi, struct rtnexthop *rtnh,
                       int remaining, struct fib_config *cfg)
{
        struct net *net = cfg->fc_nlinfo.nl_net;
        int ret;

        change_nexthops(fi) {
                int attrlen;

                if (!rtnh_ok(rtnh, remaining))
                        return -EINVAL;

                /* DEAD/LINKDOWN are kernel-owned status bits; userspace
                 * must not set them
                 */
                if (rtnh->rtnh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN))
                        return -EINVAL;

                /* route-level flags in the upper bits, per-nexthop flags
                 * from the request in the lower byte
                 */
                nexthop_nh->nh_flags =
                        (cfg->fc_flags & ~0xFF) | rtnh->rtnh_flags;
                nexthop_nh->nh_oif = rtnh->rtnh_ifindex;
                nexthop_nh->nh_weight = rtnh->rtnh_hops + 1;

                attrlen = rtnh_attrlen(rtnh);
                if (attrlen > 0) {
                        struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

                        nla = nla_find(attrs, attrlen, RTA_GATEWAY);
                        nexthop_nh->nh_gw = nla ? nla_get_in_addr(nla) : 0;
#ifdef CONFIG_IP_ROUTE_CLASSID
                        nla = nla_find(attrs, attrlen, RTA_FLOW);
                        nexthop_nh->nh_tclassid = nla ? nla_get_u32(nla) : 0;
                        if (nexthop_nh->nh_tclassid)
                                fi->fib_net->ipv4.fib_num_tclassid_users++;
#endif
                        nla = nla_find(attrs, attrlen, RTA_ENCAP);
                        if (nla) {
                                struct lwtunnel_state *lwtstate;
                                struct net_device *dev = NULL;
                                struct nlattr *nla_entype;

                                /* an encap without its type is invalid */
                                nla_entype = nla_find(attrs, attrlen,
                                                      RTA_ENCAP_TYPE);
                                if (!nla_entype)
                                        goto err_inval;
                                if (cfg->fc_oif)
                                        dev = __dev_get_by_index(net, cfg->fc_oif);
                                ret = lwtunnel_build_state(dev, nla_get_u16(
                                                           nla_entype),
                                                           nla,  AF_INET, cfg,
                                                           &lwtstate);
                                if (ret)
                                        goto errout;
                                nexthop_nh->nh_lwtstate =
                                        lwtstate_get(lwtstate);
                        }
                }

                rtnh = rtnh_next(rtnh, &remaining);
        } endfor_nexthops(fi);

        return 0;

err_inval:
        ret = -EINVAL;

errout:
        return ret;
}
538
/* Recompute the per-nexthop upper bounds used by multipath selection so
 * traffic is spread in proportion to the nexthop weights.  Dead (and,
 * when ignore_routes_with_linkdown is set, link-down) nexthops get an
 * upper bound of -1 and are never selected.  Called under RTNL.
 */
static void fib_rebalance(struct fib_info *fi)
{
        int total;
        int w;
        struct in_device *in_dev;

        if (fi->fib_nhs < 2)
                return;

        /* first pass: total weight of all usable nexthops */
        total = 0;
        for_nexthops(fi) {
                if (nh->nh_flags & RTNH_F_DEAD)
                        continue;

                in_dev = __in_dev_get_rtnl(nh->nh_dev);

                if (in_dev &&
                    IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
                    nh->nh_flags & RTNH_F_LINKDOWN)
                        continue;

                total += nh->nh_weight;
        } endfor_nexthops(fi);

        /* second pass: each usable nexthop owns the hash-value range
         * (previous bound, round(w << 31 / total) - 1]
         */
        w = 0;
        change_nexthops(fi) {
                int upper_bound;

                in_dev = __in_dev_get_rtnl(nexthop_nh->nh_dev);

                if (nexthop_nh->nh_flags & RTNH_F_DEAD) {
                        upper_bound = -1;
                } else if (in_dev &&
                           IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev) &&
                           nexthop_nh->nh_flags & RTNH_F_LINKDOWN) {
                        upper_bound = -1;
                } else {
                        w += nexthop_nh->nh_weight;
                        upper_bound = DIV_ROUND_CLOSEST_ULL((u64)w << 31,
                                                            total) - 1;
                }

                atomic_set(&nexthop_nh->nh_upper_bound, upper_bound);
        } endfor_nexthops(fi);

        /* lazily seed the multipath hash secret */
        net_get_random_once(&fib_multipath_secret,
                            sizeof(fib_multipath_secret));
}
587
/* Accumulate nh's weight into fi->fib_weight (multipath builds only). */
static inline void fib_add_weight(struct fib_info *fi,
                                  const struct fib_nh *nh)
{
        fi->fib_weight += nh->nh_weight;
}

#else /* CONFIG_IP_ROUTE_MULTIPATH */

/* single-nexthop build: rebalancing and weights are no-ops */
#define fib_rebalance(fi) do { } while (0)
#define fib_add_weight(fi, nh) do { } while (0)

#endif /* CONFIG_IP_ROUTE_MULTIPATH */
600
/* Compare the lwtunnel encap requested by (encap_type, encap) against
 * nh's current encap state.  Returns nonzero when they differ; 0 when
 * they match, when no encap was requested, or when a comparison state
 * could not be built.
 */
static int fib_encap_match(struct net *net, u16 encap_type,
                           struct nlattr *encap,
                           int oif, const struct fib_nh *nh,
                           const struct fib_config *cfg)
{
        struct lwtunnel_state *lwtstate;
        struct net_device *dev = NULL;
        int ret, result = 0;

        if (encap_type == LWTUNNEL_ENCAP_NONE)
                return 0;

        if (oif)
                dev = __dev_get_by_index(net, oif);
        /* build a throwaway state from the request purely to compare it */
        ret = lwtunnel_build_state(dev, encap_type, encap,
                                   AF_INET, cfg, &lwtstate);
        if (!ret) {
                result = lwtunnel_cmp_encap(lwtstate, nh->nh_lwtstate);
                lwtstate_free(lwtstate);
        }

        return result;
}
624
/* Check whether the nexthop specification in cfg matches fi's nexthops.
 * Returns 0 on match, 1 on mismatch, or a negative errno on malformed
 * multipath input.  Used when picking which route to delete/replace.
 */
int fib_nh_match(struct fib_config *cfg, struct fib_info *fi)
{
        struct net *net = cfg->fc_nlinfo.nl_net;
#ifdef CONFIG_IP_ROUTE_MULTIPATH
        struct rtnexthop *rtnh;
        int remaining;
#endif

        if (cfg->fc_priority && cfg->fc_priority != fi->fib_priority)
                return 1;

        /* single-nexthop spec: compare against fi's first nexthop;
         * an unspecified oif or gw acts as a wildcard
         */
        if (cfg->fc_oif || cfg->fc_gw) {
                if (cfg->fc_encap) {
                        if (fib_encap_match(net, cfg->fc_encap_type,
                                            cfg->fc_encap, cfg->fc_oif,
                                            fi->fib_nh, cfg))
                            return 1;
                }
#ifdef CONFIG_IP_ROUTE_CLASSID
                if (cfg->fc_flow &&
                    cfg->fc_flow != fi->fib_nh->nh_tclassid)
                        return 1;
#endif
                if ((!cfg->fc_oif || cfg->fc_oif == fi->fib_nh->nh_oif) &&
                    (!cfg->fc_gw  || cfg->fc_gw == fi->fib_nh->nh_gw))
                        return 0;
                return 1;
        }

#ifdef CONFIG_IP_ROUTE_MULTIPATH
        if (!cfg->fc_mp)
                return 0;

        rtnh = cfg->fc_mp;
        remaining = cfg->fc_mp_len;

        /* walk the request's nexthops and fi's nexthops in lock-step */
        for_nexthops(fi) {
                int attrlen;

                if (!rtnh_ok(rtnh, remaining))
                        return -EINVAL;

                if (rtnh->rtnh_ifindex && rtnh->rtnh_ifindex != nh->nh_oif)
                        return 1;

                attrlen = rtnh_attrlen(rtnh);
                if (attrlen > 0) {
                        struct nlattr *nla, *attrs = rtnh_attrs(rtnh);

                        nla = nla_find(attrs, attrlen, RTA_GATEWAY);
                        if (nla && nla_get_in_addr(nla) != nh->nh_gw)
                                return 1;
#ifdef CONFIG_IP_ROUTE_CLASSID
                        nla = nla_find(attrs, attrlen, RTA_FLOW);
                        if (nla && nla_get_u32(nla) != nh->nh_tclassid)
                                return 1;
#endif
                }

                rtnh = rtnh_next(rtnh, &remaining);
        } endfor_nexthops(fi);
#endif
        return 0;
}
689
690
691 /*
692  * Picture
693  * -------
694  *
695  * Semantics of nexthop is very messy by historical reasons.
696  * We have to take into account, that:
697  * a) gateway can be actually local interface address,
698  *    so that gatewayed route is direct.
699  * b) gateway must be on-link address, possibly
700  *    described not by an ifaddr, but also by a direct route.
701  * c) If both gateway and interface are specified, they should not
702  *    contradict.
703  * d) If we use tunnel routes, gateway could be not on-link.
704  *
705  * Attempt to reconcile all of these (alas, self-contradictory) conditions
706  * results in pretty ugly and hairy code with obscure logic.
707  *
708  * I chose to generalized it instead, so that the size
709  * of code does not increase practically, but it becomes
710  * much more general.
711  * Every prefix is assigned a "scope" value: "host" is local address,
712  * "link" is direct route,
713  * [ ... "site" ... "interior" ... ]
714  * and "universe" is true gateway route with global meaning.
715  *
716  * Every prefix refers to a set of "nexthop"s (gw, oif),
717  * where gw must have narrower scope. This recursion stops
718  * when gw has LOCAL scope or if "nexthop" is declared ONLINK,
719  * which means that gw is forced to be on link.
720  *
721  * Code is still hairy, but now it is apparently logically
722  * consistent and very flexible. F.e. as by-product it allows
723  * to co-exists in peace independent exterior and interior
724  * routing processes.
725  *
726  * Normally it looks as following.
727  *
728  * {universe prefix}  -> (gw, oif) [scope link]
729  *                |
730  *                |-> {link prefix} -> (gw, oif) [scope local]
731  *                                      |
732  *                                      |-> {local prefix} (terminal node)
733  */
/* Validate and resolve one nexthop of a route being added: find the
 * egress device, take a reference on it, and derive nh_scope (see the
 * "Picture" comment above for the scope recursion).  Returns 0 on
 * success or a negative errno.  Called under RTNL.
 */
static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi,
                        struct fib_nh *nh)
{
        int err = 0;
        struct net *net;
        struct net_device *dev;

        net = cfg->fc_nlinfo.nl_net;
        if (nh->nh_gw) {
                struct fib_result res;

                if (nh->nh_flags & RTNH_F_ONLINK) {
                        unsigned int addr_type;

                        /* ONLINK asserts the gateway is directly reachable
                         * on nh_oif, skipping the recursive route lookup;
                         * it only makes sense for gatewayed-scope routes.
                         */
                        if (cfg->fc_scope >= RT_SCOPE_LINK)
                                return -EINVAL;
                        dev = __dev_get_by_index(net, nh->nh_oif);
                        if (!dev)
                                return -ENODEV;
                        if (!(dev->flags & IFF_UP))
                                return -ENETDOWN;
                        addr_type = inet_addr_type_dev_table(net, dev, nh->nh_gw);
                        if (addr_type != RTN_UNICAST)
                                return -EINVAL;
                        if (!netif_carrier_ok(dev))
                                nh->nh_flags |= RTNH_F_LINKDOWN;
                        nh->nh_dev = dev;
                        dev_hold(dev);
                        nh->nh_scope = RT_SCOPE_LINK;
                        return 0;
                }
                rcu_read_lock();
                {
                        /* resolve the gateway via a route lookup at a
                         * strictly narrower scope than the route itself
                         */
                        struct fib_table *tbl = NULL;
                        struct flowi4 fl4 = {
                                .daddr = nh->nh_gw,
                                .flowi4_scope = cfg->fc_scope + 1,
                                .flowi4_oif = nh->nh_oif,
                                .flowi4_iif = LOOPBACK_IFINDEX,
                        };

                        /* It is not necessary, but requires a bit of thinking */
                        if (fl4.flowi4_scope < RT_SCOPE_LINK)
                                fl4.flowi4_scope = RT_SCOPE_LINK;

                        if (cfg->fc_table && cfg->fc_table != RT_TABLE_MAIN)
                                tbl = fib_get_table(net, cfg->fc_table);

                        if (tbl)
                                err = fib_table_lookup(tbl, &fl4, &res,
                                                       FIB_LOOKUP_IGNORE_LINKSTATE |
                                                       FIB_LOOKUP_NOREF);

                        /* on error or if no table given do full lookup. This
                         * is needed for example when nexthops are in the local
                         * table rather than the given table
                         */
                        if (!tbl || err) {
                                err = fib_lookup(net, &fl4, &res,
                                                 FIB_LOOKUP_IGNORE_LINKSTATE);
                        }

                        if (err) {
                                rcu_read_unlock();
                                return err;
                        }
                }
                /* gateway must resolve to a unicast or local destination */
                err = -EINVAL;
                if (res.type != RTN_UNICAST && res.type != RTN_LOCAL)
                        goto out;
                nh->nh_scope = res.scope;
                nh->nh_oif = FIB_RES_OIF(res);
                nh->nh_dev = dev = FIB_RES_DEV(res);
                if (!dev)
                        goto out;
                dev_hold(dev);
                if (!netif_carrier_ok(dev))
                        nh->nh_flags |= RTNH_F_LINKDOWN;
                err = (dev->flags & IFF_UP) ? 0 : -ENETDOWN;
        } else {
                struct in_device *in_dev;

                /* gateway-less route: these flags require a gateway */
                if (nh->nh_flags & (RTNH_F_PERVASIVE | RTNH_F_ONLINK))
                        return -EINVAL;

                rcu_read_lock();
                err = -ENODEV;
                in_dev = inetdev_by_index(net, nh->nh_oif);
                if (!in_dev)
                        goto out;
                err = -ENETDOWN;
                if (!(in_dev->dev->flags & IFF_UP))
                        goto out;
                nh->nh_dev = in_dev->dev;
                dev_hold(nh->nh_dev);
                nh->nh_scope = RT_SCOPE_HOST;
                if (!netif_carrier_ok(nh->nh_dev))
                        nh->nh_flags |= RTNH_F_LINKDOWN;
                err = 0;
        }
out:
        rcu_read_unlock();
        return err;
}
838
839 static inline unsigned int fib_laddr_hashfn(__be32 val)
840 {
841         unsigned int mask = (fib_info_hash_size - 1);
842
843         return ((__force u32)val ^
844                 ((__force u32)val >> 7) ^
845                 ((__force u32)val >> 14)) & mask;
846 }
847
848 static struct hlist_head *fib_info_hash_alloc(int bytes)
849 {
850         if (bytes <= PAGE_SIZE)
851                 return kzalloc(bytes, GFP_KERNEL);
852         else
853                 return (struct hlist_head *)
854                         __get_free_pages(GFP_KERNEL | __GFP_ZERO,
855                                          get_order(bytes));
856 }
857
858 static void fib_info_hash_free(struct hlist_head *hash, int bytes)
859 {
860         if (!hash)
861                 return;
862
863         if (bytes <= PAGE_SIZE)
864                 kfree(hash);
865         else
866                 free_pages((unsigned long) hash, get_order(bytes));
867 }
868
/* Rehash every fib_info from the current tables into @new_info_hash and
 * @new_laddrhash (each @new_size buckets), publish the new tables and
 * free the old ones.
 *
 * fib_info_hash_size is updated before rehashing because
 * fib_info_hashfn() and fib_laddr_hashfn() mask with it; the whole move
 * runs under fib_info_lock so no writer can insert into a table that is
 * being replaced.
 */
static void fib_info_hash_move(struct hlist_head *new_info_hash,
			       struct hlist_head *new_laddrhash,
			       unsigned int new_size)
{
	struct hlist_head *old_info_hash, *old_laddrhash;
	unsigned int old_size = fib_info_hash_size;
	unsigned int i, bytes;

	spin_lock_bh(&fib_info_lock);
	old_info_hash = fib_info_hash;
	old_laddrhash = fib_info_laddrhash;
	fib_info_hash_size = new_size;

	/* Move each fib_info to its bucket in the new main hash. */
	for (i = 0; i < old_size; i++) {
		struct hlist_head *head = &fib_info_hash[i];
		struct hlist_node *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, n, head, fib_hash) {
			struct hlist_head *dest;
			unsigned int new_hash;

			new_hash = fib_info_hashfn(fi);
			dest = &new_info_hash[new_hash];
			hlist_add_head(&fi->fib_hash, dest);
		}
	}
	fib_info_hash = new_info_hash;

	/* Same for the prefsrc (local address) hash. */
	for (i = 0; i < old_size; i++) {
		struct hlist_head *lhead = &fib_info_laddrhash[i];
		struct hlist_node *n;
		struct fib_info *fi;

		hlist_for_each_entry_safe(fi, n, lhead, fib_lhash) {
			struct hlist_head *ldest;
			unsigned int new_hash;

			new_hash = fib_laddr_hashfn(fi->fib_prefsrc);
			ldest = &new_laddrhash[new_hash];
			hlist_add_head(&fi->fib_lhash, ldest);
		}
	}
	fib_info_laddrhash = new_laddrhash;

	spin_unlock_bh(&fib_info_lock);

	/* Old tables were allocated with this same size formula (see
	 * fib_create_info()), so freeing matches the allocation.
	 */
	bytes = old_size * sizeof(struct hlist_head *);
	fib_info_hash_free(old_info_hash, bytes);
	fib_info_hash_free(old_laddrhash, bytes);
}
920
921 __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh)
922 {
923         nh->nh_saddr = inet_select_addr(nh->nh_dev,
924                                         nh->nh_gw,
925                                         nh->nh_parent->fib_scope);
926         nh->nh_saddr_genid = atomic_read(&net->ipv4.dev_addr_genid);
927
928         return nh->nh_saddr;
929 }
930
931 static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
932 {
933         if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
934             fib_prefsrc != cfg->fc_dst) {
935                 u32 tb_id = cfg->fc_table;
936                 int rc;
937
938                 if (tb_id == RT_TABLE_MAIN)
939                         tb_id = RT_TABLE_LOCAL;
940
941                 rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net,
942                                           fib_prefsrc, tb_id);
943
944                 if (rc != RTN_LOCAL && tb_id != RT_TABLE_LOCAL) {
945                         rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net,
946                                                   fib_prefsrc, RT_TABLE_LOCAL);
947                 }
948
949                 if (rc != RTN_LOCAL)
950                         return false;
951         }
952         return true;
953 }
954
/* Copy the RTA_METRICS attributes from @cfg into fi->fib_metrics,
 * validating attribute types/lengths and clamping values with hard
 * protocol limits.  RTAX_CC_ALGO carries a congestion-control name
 * rather than a u32 and is resolved to its key; when the chosen
 * algorithm requires ECN, the ECN feature bit is set at the end.
 * Returns 0 on success or -EINVAL on malformed input.
 */
static int
fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg)
{
	bool ecn_ca = false;
	struct nlattr *nla;
	int remaining;

	if (!cfg->fc_mx)
		return 0;

	nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
		int type = nla_type(nla);
		u32 val;

		if (!type)
			continue;
		if (type > RTAX_MAX)
			return -EINVAL;

		if (type == RTAX_CC_ALGO) {
			char tmp[TCP_CA_NAME_MAX];

			/* String-valued metric: map CC name to key. */
			nla_strlcpy(tmp, nla, sizeof(tmp));
			val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
			if (val == TCP_CA_UNSPEC)
				return -EINVAL;
		} else {
			if (nla_len(nla) != sizeof(u32))
				return -EINVAL;
			val = nla_get_u32(nla);
		}
		/* Clamp to protocol limits rather than rejecting. */
		if (type == RTAX_ADVMSS && val > 65535 - 40)
			val = 65535 - 40;
		if (type == RTAX_MTU && val > 65535 - 15)
			val = 65535 - 15;
		if (type == RTAX_HOPLIMIT && val > 255)
			val = 255;
		if (type == RTAX_FEATURES && (val & ~RTAX_FEATURE_MASK))
			return -EINVAL;
		fi->fib_metrics->metrics[type - 1] = val;
	}

	if (ecn_ca)
		fi->fib_metrics->metrics[RTAX_FEATURES - 1] |= DST_FEATURE_ECN_CA;

	return 0;
}
1002
/* Build a fib_info from a netlink route configuration.
 *
 * Validates the config, grows the fib_info hash tables when needed,
 * allocates and fills the fib_info plus its nexthop array, checks each
 * nexthop, and finally links the result into the global hashes.  If an
 * equivalent fib_info already exists, the new one is discarded and the
 * existing one is returned with its tree refcount bumped.
 *
 * Returns the (possibly shared) fib_info, or ERR_PTR(-EINVAL/-ENOBUFS/
 * -ENODEV/...) on failure.  All error paths free the partially built
 * fib_info via free_fib_info() with fib_dead set.
 */
struct fib_info *fib_create_info(struct fib_config *cfg)
{
	int err;
	struct fib_info *fi = NULL;
	struct fib_info *ofi;
	int nhs = 1;
	struct net *net = cfg->fc_nlinfo.nl_net;

	if (cfg->fc_type > RTN_MAX)
		goto err_inval;

	/* Fast check to catch the most weird cases */
	if (fib_props[cfg->fc_type].scope > cfg->fc_scope)
		goto err_inval;

	/* DEAD/LINKDOWN are kernel-managed status bits, not user input. */
	if (cfg->fc_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN))
		goto err_inval;

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (cfg->fc_mp) {
		nhs = fib_count_nexthops(cfg->fc_mp, cfg->fc_mp_len);
		if (nhs == 0)
			goto err_inval;
	}
#endif

	err = -ENOBUFS;
	/* Keep the hash tables at least as large as the fib_info count;
	 * double (or bootstrap to 16 buckets) when the limit is hit.
	 */
	if (fib_info_cnt >= fib_info_hash_size) {
		unsigned int new_size = fib_info_hash_size << 1;
		struct hlist_head *new_info_hash;
		struct hlist_head *new_laddrhash;
		unsigned int bytes;

		if (!new_size)
			new_size = 16;
		bytes = new_size * sizeof(struct hlist_head *);
		new_info_hash = fib_info_hash_alloc(bytes);
		new_laddrhash = fib_info_hash_alloc(bytes);
		if (!new_info_hash || !new_laddrhash) {
			fib_info_hash_free(new_info_hash, bytes);
			fib_info_hash_free(new_laddrhash, bytes);
		} else
			fib_info_hash_move(new_info_hash, new_laddrhash, new_size);

		/* Still zero only if the very first allocation failed. */
		if (!fib_info_hash_size)
			goto failure;
	}

	/* fib_nh array is allocated inline after the fib_info. */
	fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
	if (!fi)
		goto failure;
	if (cfg->fc_mx) {
		fi->fib_metrics = kzalloc(sizeof(*fi->fib_metrics), GFP_KERNEL);
		if (unlikely(!fi->fib_metrics)) {
			kfree(fi);
			return ERR_PTR(err);
		}
		atomic_set(&fi->fib_metrics->refcnt, 1);
	} else {
		/* No metrics given: share the global default block. */
		fi->fib_metrics = (struct dst_metrics *)&dst_default_metrics;
	}
	fib_info_cnt++;
	fi->fib_net = net;
	fi->fib_protocol = cfg->fc_protocol;
	fi->fib_scope = cfg->fc_scope;
	fi->fib_flags = cfg->fc_flags;
	fi->fib_priority = cfg->fc_priority;
	fi->fib_prefsrc = cfg->fc_prefsrc;
	fi->fib_type = cfg->fc_type;
	fi->fib_tb_id = cfg->fc_table;

	fi->fib_nhs = nhs;
	change_nexthops(fi) {
		nexthop_nh->nh_parent = fi;
		nexthop_nh->nh_pcpu_rth_output = alloc_percpu(struct rtable __rcu *);
		if (!nexthop_nh->nh_pcpu_rth_output)
			goto failure;
	} endfor_nexthops(fi)

	err = fib_convert_metrics(fi, cfg);
	if (err)
		goto failure;

	if (cfg->fc_mp) {
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		err = fib_get_nhs(fi, cfg->fc_mp, cfg->fc_mp_len, cfg);
		if (err != 0)
			goto failure;
		/* Top-level oif/gw/flow, if given, must agree with the
		 * first nexthop parsed from the multipath attribute.
		 */
		if (cfg->fc_oif && fi->fib_nh->nh_oif != cfg->fc_oif)
			goto err_inval;
		if (cfg->fc_gw && fi->fib_nh->nh_gw != cfg->fc_gw)
			goto err_inval;
#ifdef CONFIG_IP_ROUTE_CLASSID
		if (cfg->fc_flow && fi->fib_nh->nh_tclassid != cfg->fc_flow)
			goto err_inval;
#endif
#else
		goto err_inval;
#endif
	} else {
		struct fib_nh *nh = fi->fib_nh;

		if (cfg->fc_encap) {
			struct lwtunnel_state *lwtstate;
			struct net_device *dev = NULL;

			if (cfg->fc_encap_type == LWTUNNEL_ENCAP_NONE)
				goto err_inval;
			if (cfg->fc_oif)
				dev = __dev_get_by_index(net, cfg->fc_oif);
			err = lwtunnel_build_state(dev, cfg->fc_encap_type,
						   cfg->fc_encap, AF_INET, cfg,
						   &lwtstate);
			if (err)
				goto failure;

			nh->nh_lwtstate = lwtstate_get(lwtstate);
		}
		nh->nh_oif = cfg->fc_oif;
		nh->nh_gw = cfg->fc_gw;
		nh->nh_flags = cfg->fc_flags;
#ifdef CONFIG_IP_ROUTE_CLASSID
		nh->nh_tclassid = cfg->fc_flow;
		if (nh->nh_tclassid)
			fi->fib_net->ipv4.fib_num_tclassid_users++;
#endif
#ifdef CONFIG_IP_ROUTE_MULTIPATH
		nh->nh_weight = 1;
#endif
	}

	if (fib_props[cfg->fc_type].error) {
		/* Error routes (blackhole etc.) take no nexthop info. */
		if (cfg->fc_gw || cfg->fc_oif || cfg->fc_mp)
			goto err_inval;
		goto link_it;
	} else {
		switch (cfg->fc_type) {
		case RTN_UNICAST:
		case RTN_LOCAL:
		case RTN_BROADCAST:
		case RTN_ANYCAST:
		case RTN_MULTICAST:
			break;
		default:
			goto err_inval;
		}
	}

	if (cfg->fc_scope > RT_SCOPE_HOST)
		goto err_inval;

	if (cfg->fc_scope == RT_SCOPE_HOST) {
		struct fib_nh *nh = fi->fib_nh;

		/* Local address is added. */
		if (nhs != 1 || nh->nh_gw)
			goto err_inval;
		nh->nh_scope = RT_SCOPE_NOWHERE;
		nh->nh_dev = dev_get_by_index(net, fi->fib_nh->nh_oif);
		err = -ENODEV;
		if (!nh->nh_dev)
			goto failure;
	} else {
		int linkdown = 0;

		change_nexthops(fi) {
			err = fib_check_nh(cfg, fi, nexthop_nh);
			if (err != 0)
				goto failure;
			if (nexthop_nh->nh_flags & RTNH_F_LINKDOWN)
				linkdown++;
		} endfor_nexthops(fi)
		/* Route is only LINKDOWN if every nexthop is. */
		if (linkdown == fi->fib_nhs)
			fi->fib_flags |= RTNH_F_LINKDOWN;
	}

	if (fi->fib_prefsrc && !fib_valid_prefsrc(cfg, fi->fib_prefsrc))
		goto err_inval;

	change_nexthops(fi) {
		fib_info_update_nh_saddr(net, nexthop_nh);
		fib_add_weight(fi, nexthop_nh);
	} endfor_nexthops(fi)

	fib_rebalance(fi);

link_it:
	/* Share an identical pre-existing fib_info instead of this one. */
	ofi = fib_find_info(fi);
	if (ofi) {
		fi->fib_dead = 1;
		free_fib_info(fi);
		ofi->fib_treeref++;
		return ofi;
	}

	fi->fib_treeref++;
	atomic_inc(&fi->fib_clntref);
	spin_lock_bh(&fib_info_lock);
	hlist_add_head(&fi->fib_hash,
		       &fib_info_hash[fib_info_hashfn(fi)]);
	if (fi->fib_prefsrc) {
		struct hlist_head *head;

		head = &fib_info_laddrhash[fib_laddr_hashfn(fi->fib_prefsrc)];
		hlist_add_head(&fi->fib_lhash, head);
	}
	change_nexthops(fi) {
		struct hlist_head *head;
		unsigned int hash;

		if (!nexthop_nh->nh_dev)
			continue;
		hash = fib_devindex_hashfn(nexthop_nh->nh_dev->ifindex);
		head = &fib_info_devhash[hash];
		hlist_add_head(&nexthop_nh->nh_hash, head);
	} endfor_nexthops(fi)
	spin_unlock_bh(&fib_info_lock);
	return fi;

err_inval:
	err = -EINVAL;

failure:
	if (fi) {
		fi->fib_dead = 1;
		free_fib_info(fi);
	}

	return ERR_PTR(err);
}
1233
/* Fill a netlink RTM message describing one route into @skb.
 *
 * Emits the rtmsg header plus RTA_* attributes; single-nexthop routes
 * get flat gateway/oif/classid/encap attributes, multipath routes get
 * a nested RTA_MULTIPATH with one rtnexthop per hop.  Returns 0 on
 * success or -EMSGSIZE when the skb ran out of room (the partially
 * written message is cancelled).
 */
int fib_dump_info(struct sk_buff *skb, u32 portid, u32 seq, int event,
		  u32 tb_id, u8 type, __be32 dst, int dst_len, u8 tos,
		  struct fib_info *fi, unsigned int flags)
{
	struct nlmsghdr *nlh;
	struct rtmsg *rtm;

	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*rtm), flags);
	if (!nlh)
		return -EMSGSIZE;

	rtm = nlmsg_data(nlh);
	rtm->rtm_family = AF_INET;
	rtm->rtm_dst_len = dst_len;
	rtm->rtm_src_len = 0;
	rtm->rtm_tos = tos;
	/* rtm_table is a u8; larger table ids go in RTA_TABLE only. */
	if (tb_id < 256)
		rtm->rtm_table = tb_id;
	else
		rtm->rtm_table = RT_TABLE_COMPAT;
	if (nla_put_u32(skb, RTA_TABLE, tb_id))
		goto nla_put_failure;
	rtm->rtm_type = type;
	rtm->rtm_flags = fi->fib_flags;
	rtm->rtm_scope = fi->fib_scope;
	rtm->rtm_protocol = fi->fib_protocol;

	if (rtm->rtm_dst_len &&
	    nla_put_in_addr(skb, RTA_DST, dst))
		goto nla_put_failure;
	if (fi->fib_priority &&
	    nla_put_u32(skb, RTA_PRIORITY, fi->fib_priority))
		goto nla_put_failure;
	if (rtnetlink_put_metrics(skb, fi->fib_metrics->metrics) < 0)
		goto nla_put_failure;

	if (fi->fib_prefsrc &&
	    nla_put_in_addr(skb, RTA_PREFSRC, fi->fib_prefsrc))
		goto nla_put_failure;
	if (fi->fib_nhs == 1) {
		struct in_device *in_dev;

		if (fi->fib_nh->nh_gw &&
		    nla_put_in_addr(skb, RTA_GATEWAY, fi->fib_nh->nh_gw))
			goto nla_put_failure;
		if (fi->fib_nh->nh_oif &&
		    nla_put_u32(skb, RTA_OIF, fi->fib_nh->nh_oif))
			goto nla_put_failure;
		/* Report LINKDOWN routes as DEAD when the device is
		 * configured to ignore routes with link down.
		 */
		if (fi->fib_nh->nh_flags & RTNH_F_LINKDOWN) {
			in_dev = __in_dev_get_rtnl(fi->fib_nh->nh_dev);
			if (in_dev &&
			    IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev))
				rtm->rtm_flags |= RTNH_F_DEAD;
		}
#ifdef CONFIG_IP_ROUTE_CLASSID
		if (fi->fib_nh[0].nh_tclassid &&
		    nla_put_u32(skb, RTA_FLOW, fi->fib_nh[0].nh_tclassid))
			goto nla_put_failure;
#endif
		if (fi->fib_nh->nh_lwtstate &&
		    lwtunnel_fill_encap(skb, fi->fib_nh->nh_lwtstate) < 0)
			goto nla_put_failure;
	}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (fi->fib_nhs > 1) {
		struct rtnexthop *rtnh;
		struct nlattr *mp;

		mp = nla_nest_start(skb, RTA_MULTIPATH);
		if (!mp)
			goto nla_put_failure;

		for_nexthops(fi) {
			struct in_device *in_dev;

			rtnh = nla_reserve_nohdr(skb, sizeof(*rtnh));
			if (!rtnh)
				goto nla_put_failure;

			rtnh->rtnh_flags = nh->nh_flags & 0xFF;
			/* Same LINKDOWN->DEAD promotion as above, but
			 * per nexthop.
			 */
			if (nh->nh_flags & RTNH_F_LINKDOWN) {
				in_dev = __in_dev_get_rtnl(nh->nh_dev);
				if (in_dev &&
				    IN_DEV_IGNORE_ROUTES_WITH_LINKDOWN(in_dev))
					rtnh->rtnh_flags |= RTNH_F_DEAD;
			}
			rtnh->rtnh_hops = nh->nh_weight - 1;
			rtnh->rtnh_ifindex = nh->nh_oif;

			if (nh->nh_gw &&
			    nla_put_in_addr(skb, RTA_GATEWAY, nh->nh_gw))
				goto nla_put_failure;
#ifdef CONFIG_IP_ROUTE_CLASSID
			if (nh->nh_tclassid &&
			    nla_put_u32(skb, RTA_FLOW, nh->nh_tclassid))
				goto nla_put_failure;
#endif
			if (nh->nh_lwtstate &&
			    lwtunnel_fill_encap(skb, nh->nh_lwtstate) < 0)
				goto nla_put_failure;

			/* length of rtnetlink header + attributes */
			rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *) rtnh;
		} endfor_nexthops(fi);

		nla_nest_end(skb, mp);
	}
#endif
	nlmsg_end(skb, nlh);
	return 0;

nla_put_failure:
	nlmsg_cancel(skb, nlh);
	return -EMSGSIZE;
}
1349
1350 /*
1351  * Update FIB if:
1352  * - local address disappeared -> we must delete all the entries
1353  *   referring to it.
1354  * - device went down -> we must shutdown all nexthops going via it.
1355  */
1356 int fib_sync_down_addr(struct net_device *dev, __be32 local)
1357 {
1358         int ret = 0;
1359         unsigned int hash = fib_laddr_hashfn(local);
1360         struct hlist_head *head = &fib_info_laddrhash[hash];
1361         int tb_id = l3mdev_fib_table(dev) ? : RT_TABLE_MAIN;
1362         struct net *net = dev_net(dev);
1363         struct fib_info *fi;
1364
1365         if (!fib_info_laddrhash || local == 0)
1366                 return 0;
1367
1368         hlist_for_each_entry(fi, head, fib_lhash) {
1369                 if (!net_eq(fi->fib_net, net) ||
1370                     fi->fib_tb_id != tb_id)
1371                         continue;
1372                 if (fi->fib_prefsrc == local) {
1373                         fi->fib_flags |= RTNH_F_DEAD;
1374                         ret++;
1375                 }
1376         }
1377         return ret;
1378 }
1379
1380 /* Update the PMTU of exceptions when:
1381  * - the new MTU of the first hop becomes smaller than the PMTU
1382  * - the old MTU was the same as the PMTU, and it limited discovery of
1383  *   larger MTUs on the path. With that limit raised, we can now
1384  *   discover larger MTUs
1385  * A special case is locked exceptions, for which the PMTU is smaller
1386  * than the minimal accepted PMTU:
1387  * - if the new MTU is greater than the PMTU, don't make any change
1388  * - otherwise, unlock and set PMTU
1389  */
1390 static void nh_update_mtu(struct fib_nh *nh, u32 new, u32 orig)
1391 {
1392         struct fnhe_hash_bucket *bucket;
1393         int i;
1394
1395         bucket = rcu_dereference_protected(nh->nh_exceptions, 1);
1396         if (!bucket)
1397                 return;
1398
1399         for (i = 0; i < FNHE_HASH_SIZE; i++) {
1400                 struct fib_nh_exception *fnhe;
1401
1402                 for (fnhe = rcu_dereference_protected(bucket[i].chain, 1);
1403                      fnhe;
1404                      fnhe = rcu_dereference_protected(fnhe->fnhe_next, 1)) {
1405                         if (fnhe->fnhe_mtu_locked) {
1406                                 if (new <= fnhe->fnhe_pmtu) {
1407                                         fnhe->fnhe_pmtu = new;
1408                                         fnhe->fnhe_mtu_locked = false;
1409                                 }
1410                         } else if (new < fnhe->fnhe_pmtu ||
1411                                    orig == fnhe->fnhe_pmtu) {
1412                                 fnhe->fnhe_pmtu = new;
1413                         }
1414                 }
1415         }
1416 }
1417
1418 void fib_sync_mtu(struct net_device *dev, u32 orig_mtu)
1419 {
1420         unsigned int hash = fib_devindex_hashfn(dev->ifindex);
1421         struct hlist_head *head = &fib_info_devhash[hash];
1422         struct fib_nh *nh;
1423
1424         hlist_for_each_entry(nh, head, nh_hash) {
1425                 if (nh->nh_dev == dev)
1426                         nh_update_mtu(nh, dev->mtu, orig_mtu);
1427         }
1428 }
1429
1430 /* Event              force Flags           Description
1431  * NETDEV_CHANGE      0     LINKDOWN        Carrier OFF, not for scope host
1432  * NETDEV_DOWN        0     LINKDOWN|DEAD   Link down, not for scope host
1433  * NETDEV_DOWN        1     LINKDOWN|DEAD   Last address removed
1434  * NETDEV_UNREGISTER  1     LINKDOWN|DEAD   Device removed
1435  */
/* Mark nexthops (and routes whose every nexthop is affected) as
 * LINKDOWN and/or DEAD after a device event; see the table above for
 * the event/force/flag mapping.  Returns the number of routes whose
 * fib_flags were changed.
 */
int fib_sync_down_dev(struct net_device *dev, unsigned long event, bool force)
{
	int ret = 0;
	int scope = RT_SCOPE_NOWHERE;
	struct fib_info *prev_fi = NULL;
	unsigned int hash = fib_devindex_hashfn(dev->ifindex);
	struct hlist_head *head = &fib_info_devhash[hash];
	struct fib_nh *nh;

	/* force means even scope-host nexthops go down (scope of -1
	 * never matches nh_scope below).
	 */
	if (force)
		scope = -1;

	hlist_for_each_entry(nh, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int dead;

		BUG_ON(!fi->fib_nhs);
		/* Nexthops of one fib_info sit adjacently in the chain;
		 * prev_fi dedups so each route is processed once.
		 */
		if (nh->nh_dev != dev || fi == prev_fi)
			continue;
		prev_fi = fi;
		dead = 0;
		change_nexthops(fi) {
			if (nexthop_nh->nh_flags & RTNH_F_DEAD)
				dead++;
			else if (nexthop_nh->nh_dev == dev &&
				 nexthop_nh->nh_scope != scope) {
				switch (event) {
				case NETDEV_DOWN:
				case NETDEV_UNREGISTER:
					nexthop_nh->nh_flags |= RTNH_F_DEAD;
					/* fall through */
				case NETDEV_CHANGE:
					nexthop_nh->nh_flags |= RTNH_F_LINKDOWN;
					break;
				}
				dead++;
			}
#ifdef CONFIG_IP_ROUTE_MULTIPATH
			/* A vanishing device kills the whole route. */
			if (event == NETDEV_UNREGISTER &&
			    nexthop_nh->nh_dev == dev) {
				dead = fi->fib_nhs;
				break;
			}
#endif
		} endfor_nexthops(fi)
		/* Route-level flags change only when all hops are hit. */
		if (dead == fi->fib_nhs) {
			switch (event) {
			case NETDEV_DOWN:
			case NETDEV_UNREGISTER:
				fi->fib_flags |= RTNH_F_DEAD;
				/* fall through */
			case NETDEV_CHANGE:
				fi->fib_flags |= RTNH_F_LINKDOWN;
				break;
			}
			ret++;
		}

		fib_rebalance(fi);
	}

	return ret;
}
1499
1500 /* Must be invoked inside of an RCU protected region.  */
/* Pick a usable default route among equal aliases of res->fa_head,
 * preferring the one after the currently cached default (round-robin
 * via fa_default) and skipping gateways that fib_detect_death()
 * considers dead.  On success res is reassigned; otherwise the last
 * resort (or nothing) is used.  Caller must hold RCU.
 */
void fib_select_default(const struct flowi4 *flp, struct fib_result *res)
{
	struct fib_info *fi = NULL, *last_resort = NULL;
	struct hlist_head *fa_head = res->fa_head;
	struct fib_table *tb = res->table;
	u8 slen = 32 - res->prefixlen;
	int order = -1, last_idx = -1;
	struct fib_alias *fa, *fa1 = NULL;
	u32 last_prio = res->fi->fib_priority;
	u8 last_tos = 0;

	hlist_for_each_entry_rcu(fa, fa_head, fa_list) {
		struct fib_info *next_fi = fa->fa_info;

		/* Only aliases with the same prefix length, matching
		 * TOS and same table participate.
		 */
		if (fa->fa_slen != slen)
			continue;
		if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
			continue;
		if (fa->tb_id != tb->tb_id)
			continue;
		/* Aliases are priority-ordered; stop once we pass the
		 * current route's priority (unless still in a TOS run).
		 */
		if (next_fi->fib_priority > last_prio &&
		    fa->fa_tos == last_tos) {
			if (last_tos)
				continue;
			break;
		}
		if (next_fi->fib_flags & RTNH_F_DEAD)
			continue;
		last_tos = fa->fa_tos;
		last_prio = next_fi->fib_priority;

		if (next_fi->fib_scope != res->scope ||
		    fa->fa_type != RTN_UNICAST)
			continue;
		/* Only gatewayed, directly-connected nexthops qualify
		 * as alternative defaults.
		 */
		if (!next_fi->fib_nh[0].nh_gw ||
		    next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK)
			continue;

		fib_alias_accessed(fa);

		if (!fi) {
			if (next_fi != res->fi)
				break;
			fa1 = fa;
		} else if (!fib_detect_death(fi, order, &last_resort,
					     &last_idx, fa1->fa_default)) {
			fib_result_assign(res, fi);
			fa1->fa_default = order;
			goto out;
		}
		fi = next_fi;
		order++;
	}

	if (order <= 0 || !fi) {
		if (fa1)
			fa1->fa_default = -1;
		goto out;
	}

	if (!fib_detect_death(fi, order, &last_resort, &last_idx,
			      fa1->fa_default)) {
		fib_result_assign(res, fi);
		fa1->fa_default = order;
		goto out;
	}

	/* Everything looked dead: fall back to the last resort. */
	if (last_idx >= 0)
		fib_result_assign(res, last_resort);
	fa1->fa_default = last_idx;
out:
	return;
}
1574
1575 /*
1576  * Dead device goes up. We wake up dead nexthops.
1577  * It takes sense only on multipath routes.
1578  */
/* Clear @nh_flags (DEAD and/or LINKDOWN) on nexthops using @dev when
 * the device comes back, and on the parent route when at least one of
 * its nexthops became alive.  Returns the number of routes revived.
 */
int fib_sync_up(struct net_device *dev, unsigned int nh_flags)
{
	struct fib_info *prev_fi;
	unsigned int hash;
	struct hlist_head *head;
	struct fib_nh *nh;
	int ret;

	if (!(dev->flags & IFF_UP))
		return 0;

	/* When resurrecting DEAD nexthops on a device whose carrier is
	 * still down, also clear LINKDOWN so the state stays coherent.
	 */
	if (nh_flags & RTNH_F_DEAD) {
		unsigned int flags = dev_get_flags(dev);

		if (flags & (IFF_RUNNING | IFF_LOWER_UP))
			nh_flags |= RTNH_F_LINKDOWN;
	}

	prev_fi = NULL;
	hash = fib_devindex_hashfn(dev->ifindex);
	head = &fib_info_devhash[hash];
	ret = 0;

	hlist_for_each_entry(nh, head, nh_hash) {
		struct fib_info *fi = nh->nh_parent;
		int alive;

		BUG_ON(!fi->fib_nhs);
		/* prev_fi dedups: a route's nexthops are adjacent. */
		if (nh->nh_dev != dev || fi == prev_fi)
			continue;

		prev_fi = fi;
		alive = 0;
		change_nexthops(fi) {
			if (!(nexthop_nh->nh_flags & nh_flags)) {
				alive++;
				continue;
			}
			if (!nexthop_nh->nh_dev ||
			    !(nexthop_nh->nh_dev->flags & IFF_UP))
				continue;
			/* Only revive hops on this device, and only if
			 * it still has an IPv4 config.
			 */
			if (nexthop_nh->nh_dev != dev ||
			    !__in_dev_get_rtnl(dev))
				continue;
			alive++;
			nexthop_nh->nh_flags &= ~nh_flags;
		} endfor_nexthops(fi)

		if (alive > 0) {
			fi->fib_flags &= ~nh_flags;
			ret++;
		}

		fib_rebalance(fi);
	}

	return ret;
}
1637
1638 #ifdef CONFIG_IP_ROUTE_MULTIPATH
1639 static bool fib_good_nh(const struct fib_nh *nh)
1640 {
1641         int state = NUD_REACHABLE;
1642
1643         if (nh->nh_scope == RT_SCOPE_LINK) {
1644                 struct neighbour *n;
1645
1646                 rcu_read_lock_bh();
1647
1648                 n = __ipv4_neigh_lookup_noref(nh->nh_dev,
1649                                               (__force u32)nh->nh_gw);
1650                 if (n)
1651                         state = n->nud_state;
1652
1653                 rcu_read_unlock_bh();
1654         }
1655
1656         return !!(state & NUD_VALID);
1657 }
1658
/* Select one nexthop of a multipath route by @hash against the
 * precomputed per-hop upper bounds.  When the fib_multipath_use_neigh
 * sysctl is on, hops with invalid neighbour state are skipped; the
 * first good hop is remembered as a fallback in case @hash lands only
 * on skipped hops.
 */
void fib_select_multipath(struct fib_result *res, int hash)
{
	struct fib_info *fi = res->fi;
	struct net *net = fi->fib_net;
	bool first = false;

	for_nexthops(fi) {
		if (net->ipv4.sysctl_fib_multipath_use_neigh) {
			if (!fib_good_nh(nh))
				continue;
			if (!first) {
				/* Fallback selection if no hop matches
				 * the hash below.
				 */
				res->nh_sel = nhsel;
				first = true;
			}
		}

		if (hash > atomic_read(&nh->nh_upper_bound))
			continue;

		res->nh_sel = nhsel;
		return;
	} endfor_nexthops(fi);
}
1682 #endif
1683
/* Final route selection step: pick a multipath nexthop (or an
 * alternative default route for zero-length prefixes), then fill in
 * the flow's source address from the result if the caller left it
 * unset.  Selection only runs when the caller did not pin an output
 * interface (or asked to skip the nexthop oif check).
 */
void fib_select_path(struct net *net, struct fib_result *res,
		     struct flowi4 *fl4, int mp_hash)
{
	bool oif_check;

	oif_check = (fl4->flowi4_oif == 0 ||
		     fl4->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF);

#ifdef CONFIG_IP_ROUTE_MULTIPATH
	if (res->fi->fib_nhs > 1 && oif_check) {
		if (mp_hash < 0)
			mp_hash = get_hash_from_flowi4(fl4) >> 1;

		fib_select_multipath(res, mp_hash);
	}
	else
#endif
	if (!res->prefixlen &&
	    res->table->tb_num_default > 1 &&
	    res->type == RTN_UNICAST && oif_check)
		fib_select_default(fl4, res);

	if (!fl4->saddr)
		fl4->saddr = FIB_RES_PREFSRC(net, *res);
}
1709 EXPORT_SYMBOL_GPL(fib_select_path);