GNU Linux-libre 4.9.315-gnu1
[releases.git] / net / sched / sch_api.c
1 /*
2  * net/sched/sch_api.c  Packet scheduler API.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/lockdep.h>
31 #include <linux/slab.h>
32 #include <linux/hashtable.h>
33
34 #include <net/net_namespace.h>
35 #include <net/sock.h>
36 #include <net/netlink.h>
37 #include <net/pkt_sched.h>
38
39 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
40                         struct nlmsghdr *n, u32 clid,
41                         struct Qdisc *old, struct Qdisc *new);
42 static int tclass_notify(struct net *net, struct sk_buff *oskb,
43                          struct nlmsghdr *n, struct Qdisc *q,
44                          unsigned long cl, int event);
45
46 /*
47
48    Short review.
49    -------------
50
51    This file consists of two interrelated parts:
52
53    1. queueing disciplines manager frontend.
54    2. traffic classes manager frontend.
55
56    Generally, queueing discipline ("qdisc") is a black box,
57    which is able to enqueue packets and to dequeue them (when
58    device is ready to send something) in order and at times
59    determined by algorithm hidden in it.
60
   qdiscs are divided into two categories:
62    - "queues", which have no internal structure visible from outside.
63    - "schedulers", which split all the packets to "traffic classes",
64      using "packet classifiers" (look at cls_api.c)
65
   In turn, classes may have child qdiscs (as a rule, queues)
   attached to them etc. etc. etc.
68
   The goal of the routines in this file is to translate the
   handle-based information supplied by the user into a form the
   kernel can work with, to perform sanity checks and the parts of
   the work common to all qdiscs, and to provide rtnetlink
   notifications.
74
75    All real intelligent work is done inside qdisc modules.
76
77
78
79    Every discipline has two major routines: enqueue and dequeue.
80
81    ---dequeue
82
83    dequeue usually returns a skb to send. It is allowed to return NULL,
84    but it does not mean that queue is empty, it just means that
85    discipline does not want to send anything this time.
86    Queue is really empty if q->q.qlen == 0.
87    For complicated disciplines with multiple queues q->q is not
88    real packet queue, but however q->q.qlen must be valid.
89
90    ---enqueue
91
92    enqueue returns 0, if packet was enqueued successfully.
93    If packet (this one or another one) was dropped, it returns
94    not zero error code.
95    NET_XMIT_DROP        - this packet dropped
96      Expected action: do not backoff, but wait until queue will clear.
97    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
98      Expected action: backoff or ignore
99
100    Auxiliary routines:
101
102    ---peek
103
104    like dequeue but without removing a packet from the queue
105
106    ---reset
107
108    returns qdisc to initial state: purge all buffers, clear all
109    timers, counters (except for statistics) etc.
110
111    ---init
112
113    initializes newly created qdisc.
114
115    ---destroy
116
117    destroys resources allocated by init and during lifetime of qdisc.
118
119    ---change
120
121    changes qdisc parameters.
122  */
123
124 /* Protects list of registered TC modules. It is pure SMP lock. */
125 static DEFINE_RWLOCK(qdisc_mod_lock);
126
127
128 /************************************************
129  *      Queueing disciplines manipulation.      *
130  ************************************************/
131
132
133 /* The list of all installed queueing disciplines. */
134
135 static struct Qdisc_ops *qdisc_base;
136
137 /* Register/unregister queueing discipline */
138
139 int register_qdisc(struct Qdisc_ops *qops)
140 {
141         struct Qdisc_ops *q, **qp;
142         int rc = -EEXIST;
143
144         write_lock(&qdisc_mod_lock);
145         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
146                 if (!strcmp(qops->id, q->id))
147                         goto out;
148
149         if (qops->enqueue == NULL)
150                 qops->enqueue = noop_qdisc_ops.enqueue;
151         if (qops->peek == NULL) {
152                 if (qops->dequeue == NULL)
153                         qops->peek = noop_qdisc_ops.peek;
154                 else
155                         goto out_einval;
156         }
157         if (qops->dequeue == NULL)
158                 qops->dequeue = noop_qdisc_ops.dequeue;
159
160         if (qops->cl_ops) {
161                 const struct Qdisc_class_ops *cops = qops->cl_ops;
162
163                 if (!(cops->get && cops->put && cops->walk && cops->leaf))
164                         goto out_einval;
165
166                 if (cops->tcf_chain && !(cops->bind_tcf && cops->unbind_tcf))
167                         goto out_einval;
168         }
169
170         qops->next = NULL;
171         *qp = qops;
172         rc = 0;
173 out:
174         write_unlock(&qdisc_mod_lock);
175         return rc;
176
177 out_einval:
178         rc = -EINVAL;
179         goto out;
180 }
181 EXPORT_SYMBOL(register_qdisc);
182
183 int unregister_qdisc(struct Qdisc_ops *qops)
184 {
185         struct Qdisc_ops *q, **qp;
186         int err = -ENOENT;
187
188         write_lock(&qdisc_mod_lock);
189         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
190                 if (q == qops)
191                         break;
192         if (q) {
193                 *qp = q->next;
194                 q->next = NULL;
195                 err = 0;
196         }
197         write_unlock(&qdisc_mod_lock);
198         return err;
199 }
200 EXPORT_SYMBOL(unregister_qdisc);
201
202 /* Get default qdisc if not otherwise specified */
/* Copy the id of the current default qdisc into @name (at most @len
 * bytes, NUL-terminated by strlcpy).  The read lock keeps
 * default_qdisc_ops stable while we copy.
 */
void qdisc_get_default(char *name, size_t len)
{
	read_lock(&qdisc_mod_lock);
	strlcpy(name, default_qdisc_ops->id, len);
	read_unlock(&qdisc_mod_lock);
}
209
210 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
211 {
212         struct Qdisc_ops *q = NULL;
213
214         for (q = qdisc_base; q; q = q->next) {
215                 if (!strcmp(name, q->id)) {
216                         if (!try_module_get(q->owner))
217                                 q = NULL;
218                         break;
219                 }
220         }
221
222         return q;
223 }
224
225 /* Set new default qdisc to use */
int qdisc_set_default(const char *name)
{
	const struct Qdisc_ops *ops;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	write_lock(&qdisc_mod_lock);
	/* qdisc_lookup_default() takes a module reference on success. */
	ops = qdisc_lookup_default(name);
	if (!ops) {
		/* Not found, drop lock and try to load module */
		write_unlock(&qdisc_mod_lock);
		request_module("sch_%s", name);
		write_lock(&qdisc_mod_lock);

		ops = qdisc_lookup_default(name);
	}

	if (ops) {
		/* Set new default: drop the reference held on the old
		 * default, keep the one just taken on the new one.
		 */
		module_put(default_qdisc_ops->owner);
		default_qdisc_ops = ops;
	}
	write_unlock(&qdisc_mod_lock);

	return ops ? 0 : -ENOENT;
}
253
254 /* We know handle. Find qdisc among all qdisc's attached to device
255  * (root qdisc, all its children, children of children etc.)
256  * Note: caller either uses rtnl or rcu_read_lock()
257  */
258
static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
	struct Qdisc *q;

	/* A root without a backing device cannot use the per-device
	 * hash; only a direct handle match on the root can succeed.
	 */
	if (!qdisc_dev(root))
		return (root->handle == handle ? root : NULL);

	if (!(root->flags & TCQ_F_BUILTIN) &&
	    root->handle == handle)
		return root;

	/* Fall back to the per-device handle hash (RCU-safe walk). */
	hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}
276
277 void qdisc_hash_add(struct Qdisc *q)
278 {
279         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
280                 ASSERT_RTNL();
281                 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
282         }
283 }
284 EXPORT_SYMBOL(qdisc_hash_add);
285
286 void qdisc_hash_del(struct Qdisc *q)
287 {
288         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
289                 ASSERT_RTNL();
290                 hash_del_rcu(&q->hash);
291         }
292 }
293 EXPORT_SYMBOL(qdisc_hash_del);
294
295 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
296 {
297         struct Qdisc *q;
298
299         if (!handle)
300                 return NULL;
301         q = qdisc_match_from_root(dev->qdisc, handle);
302         if (q)
303                 goto out;
304
305         if (dev_ingress_queue(dev))
306                 q = qdisc_match_from_root(
307                         dev_ingress_queue(dev)->qdisc_sleeping,
308                         handle);
309 out:
310         return q;
311 }
312
313 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
314 {
315         unsigned long cl;
316         struct Qdisc *leaf;
317         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
318
319         if (cops == NULL)
320                 return NULL;
321         cl = cops->get(p, classid);
322
323         if (cl == 0)
324                 return NULL;
325         leaf = cops->leaf(p, cl);
326         cops->put(p, cl);
327         return leaf;
328 }
329
330 /* Find queueing discipline by name */
331
332 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
333 {
334         struct Qdisc_ops *q = NULL;
335
336         if (kind) {
337                 read_lock(&qdisc_mod_lock);
338                 for (q = qdisc_base; q; q = q->next) {
339                         if (nla_strcmp(kind, q->id) == 0) {
340                                 if (!try_module_get(q->owner))
341                                         q = NULL;
342                                 break;
343                         }
344                 }
345                 read_unlock(&qdisc_mod_lock);
346         }
347         return q;
348 }
349
350 /* The linklayer setting were not transferred from iproute2, in older
351  * versions, and the rate tables lookup systems have been dropped in
352  * the kernel. To keep backward compatible with older iproute2 tc
353  * utils, we detect the linklayer setting by detecting if the rate
354  * table were modified.
355  *
356  * For linklayer ATM table entries, the rate table will be aligned to
357  * 48 bytes, thus some table entries will contain the same value.  The
358  * mpu (min packet unit) is also encoded into the old rate table, thus
359  * starting from the mpu, we find low and high table entries for
360  * mapping this cell.  If these entries contain the same value, when
361  * the rate tables have been modified for linklayer ATM.
362  *
363  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
364  * and then roundup to the next cell, calc the table entry one below,
365  * and compare.
366  */
367 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
368 {
369         int low       = roundup(r->mpu, 48);
370         int high      = roundup(low+1, 48);
371         int cell_low  = low >> r->cell_log;
372         int cell_high = (high >> r->cell_log) - 1;
373
374         /* rtab is too inaccurate at rates > 100Mbit/s */
375         if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
376                 pr_debug("TC linklayer: Giving up ATM detection\n");
377                 return TC_LINKLAYER_ETHERNET;
378         }
379
380         if ((cell_high > cell_low) && (cell_high < 256)
381             && (rtab[cell_low] == rtab[cell_high])) {
382                 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
383                          cell_low, cell_high, rtab[cell_high]);
384                 return TC_LINKLAYER_ATM;
385         }
386         return TC_LINKLAYER_ETHERNET;
387 }
388
389 static struct qdisc_rate_table *qdisc_rtab_list;
390
391 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
392                                         struct nlattr *tab)
393 {
394         struct qdisc_rate_table *rtab;
395
396         if (tab == NULL || r->rate == 0 ||
397             r->cell_log == 0 || r->cell_log >= 32 ||
398             nla_len(tab) != TC_RTAB_SIZE)
399                 return NULL;
400
401         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
402                 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
403                     !memcmp(&rtab->data, nla_data(tab), 1024)) {
404                         rtab->refcnt++;
405                         return rtab;
406                 }
407         }
408
409         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
410         if (rtab) {
411                 rtab->rate = *r;
412                 rtab->refcnt = 1;
413                 memcpy(rtab->data, nla_data(tab), 1024);
414                 if (r->linklayer == TC_LINKLAYER_UNAWARE)
415                         r->linklayer = __detect_linklayer(r, rtab->data);
416                 rtab->next = qdisc_rtab_list;
417                 qdisc_rtab_list = rtab;
418         }
419         return rtab;
420 }
421 EXPORT_SYMBOL(qdisc_get_rtab);
422
423 void qdisc_put_rtab(struct qdisc_rate_table *tab)
424 {
425         struct qdisc_rate_table *rtab, **rtabp;
426
427         if (!tab || --tab->refcnt)
428                 return;
429
430         for (rtabp = &qdisc_rtab_list;
431              (rtab = *rtabp) != NULL;
432              rtabp = &rtab->next) {
433                 if (rtab == tab) {
434                         *rtabp = rtab->next;
435                         kfree(rtab);
436                         return;
437                 }
438         }
439 }
440 EXPORT_SYMBOL(qdisc_put_rtab);
441
/* All size tables currently in use; guarded by qdisc_stab_lock. */
static LIST_HEAD(qdisc_stab_list);
static DEFINE_SPINLOCK(qdisc_stab_lock);

/* Netlink policy for the nested TCA_STAB_* attributes. */
static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};
449
/* Look up or create a reference-counted size table described by the
 * TCA_STAB attribute @opt.  Identical tables are shared.
 * Returns the table, or an ERR_PTR on invalid input / OOM.
 */
static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt)
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

	err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy);
	if (err < 0)
		return ERR_PTR(err);
	if (!tb[TCA_STAB_BASE])
		return ERR_PTR(-EINVAL);

	s = nla_data(tb[TCA_STAB_BASE]);

	if (s->tsize > 0) {
		/* A non-empty table must carry its slot data. */
		if (!tb[TCA_STAB_DATA])
			return ERR_PTR(-EINVAL);
		tab = nla_data(tb[TCA_STAB_DATA]);
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

	/* Declared size must match the attribute payload. */
	if (tsize != s->tsize || (!tab && tsize > 0))
		return ERR_PTR(-EINVAL);

	spin_lock(&qdisc_stab_lock);

	/* Share an existing table when both spec and data match. */
	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
			continue;
		stab->refcnt++;
		spin_unlock(&qdisc_stab_lock);
		return stab;
	}

	spin_unlock(&qdisc_stab_lock);

	/* No match found.  The lock is dropped for the sleeping
	 * allocation and not retaken for a re-check, so a racing
	 * identical insert can create a duplicate list entry.
	 */
	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, tsize * sizeof(u16));

	spin_lock(&qdisc_stab_lock);
	list_add_tail(&stab->list, &qdisc_stab_list);
	spin_unlock(&qdisc_stab_lock);

	return stab;
}
506
507 static void stab_kfree_rcu(struct rcu_head *head)
508 {
509         kfree(container_of(head, struct qdisc_size_table, rcu));
510 }
511
512 void qdisc_put_stab(struct qdisc_size_table *tab)
513 {
514         if (!tab)
515                 return;
516
517         spin_lock(&qdisc_stab_lock);
518
519         if (--tab->refcnt == 0) {
520                 list_del(&tab->list);
521                 call_rcu_bh(&tab->rcu, stab_kfree_rcu);
522         }
523
524         spin_unlock(&qdisc_stab_lock);
525 }
526 EXPORT_SYMBOL(qdisc_put_stab);
527
528 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
529 {
530         struct nlattr *nest;
531
532         nest = nla_nest_start(skb, TCA_STAB);
533         if (nest == NULL)
534                 goto nla_put_failure;
535         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
536                 goto nla_put_failure;
537         nla_nest_end(skb, nest);
538
539         return skb->len;
540
541 nla_put_failure:
542         return -1;
543 }
544
/* Compute the size-table-adjusted packet length for @skb and store it
 * in the qdisc cb.  The result is never smaller than 1.
 */
void __qdisc_calculate_pkt_len(struct sk_buff *skb,
			       const struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	/* Real length plus configured per-packet overhead. */
	pkt_len = skb->len + stab->szopts.overhead;
	if (unlikely(!stab->szopts.tsize))
		goto out;

	/* Map the length to a table slot: align, clamp at 0, then
	 * divide by the cell size (2^cell_log).
	 */
	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		/* Beyond the table: extrapolate by repeating the last
		 * entry per full table span plus the remainder slot.
		 */
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
573
574 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
575 {
576         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
577                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
578                         txt, qdisc->ops->id, qdisc->handle >> 16);
579                 qdisc->flags |= TCQ_F_WARN_NONWC;
580         }
581 }
582 EXPORT_SYMBOL(qdisc_warn_nonwc);
583
/* hrtimer callback: reschedule the watchdog owner's root qdisc for
 * transmission.  One-shot — never rearms itself.
 */
static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	rcu_read_lock();
	__netif_schedule(qdisc_root(wd->qdisc));
	rcu_read_unlock();

	return HRTIMER_NORESTART;
}
595
596 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
597 {
598         hrtimer_init(&wd->timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
599         wd->timer.function = qdisc_watchdog;
600         wd->qdisc = qdisc;
601 }
602 EXPORT_SYMBOL(qdisc_watchdog_init);
603
/* Arm the watchdog to fire at absolute time @expires (nanoseconds). */
void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
{
	/* Never arm the timer while the qdisc tree is deactivated. */
	if (test_bit(__QDISC_STATE_DEACTIVATED,
		     &qdisc_root_sleeping(wd->qdisc)->state))
		return;

	/* Re-arming with an unchanged deadline would only churn the
	 * hrtimer; skip it.
	 */
	if (wd->last_expires == expires)
		return;

	wd->last_expires = expires;
	hrtimer_start(&wd->timer,
		      ns_to_ktime(expires),
		      HRTIMER_MODE_ABS_PINNED);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
619
/* Cancel a pending watchdog timer; hrtimer_cancel() also waits for a
 * concurrently running callback to complete.
 */
void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
625
626 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
627 {
628         unsigned int size = n * sizeof(struct hlist_head), i;
629         struct hlist_head *h;
630
631         if (size <= PAGE_SIZE)
632                 h = kmalloc(size, GFP_KERNEL);
633         else
634                 h = (struct hlist_head *)
635                         __get_free_pages(GFP_KERNEL, get_order(size));
636
637         if (h != NULL) {
638                 for (i = 0; i < n; i++)
639                         INIT_HLIST_HEAD(&h[i]);
640         }
641         return h;
642 }
643
644 static void qdisc_class_hash_free(struct hlist_head *h, unsigned int n)
645 {
646         unsigned int size = n * sizeof(struct hlist_head);
647
648         if (size <= PAGE_SIZE)
649                 kfree(h);
650         else
651                 free_pages((unsigned long)h, get_order(size));
652 }
653
/* Double the class hash and rehash all classes once the load factor
 * exceeds 3/4.  Allocation failure is silently tolerated — the old,
 * smaller table keeps working.
 */
void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	/* Move entries and swap the tables under the qdisc tree lock
	 * so lookups never observe a half-populated table.
	 */
	sch_tree_lock(sch);
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash     = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	qdisc_class_hash_free(ohash, osize);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);
689
690 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
691 {
692         unsigned int size = 4;
693
694         clhash->hash = qdisc_class_hash_alloc(size);
695         if (clhash->hash == NULL)
696                 return -ENOMEM;
697         clhash->hashsize  = size;
698         clhash->hashmask  = size - 1;
699         clhash->hashelems = 0;
700         return 0;
701 }
702 EXPORT_SYMBOL(qdisc_class_hash_init);
703
/* Release the bucket array created by qdisc_class_hash_init()/grow(). */
void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	qdisc_class_hash_free(clhash->hash, clhash->hashsize);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);
709
710 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
711                              struct Qdisc_class_common *cl)
712 {
713         unsigned int h;
714
715         INIT_HLIST_NODE(&cl->hnode);
716         h = qdisc_class_hash(cl->classid, clhash->hashmask);
717         hlist_add_head(&cl->hnode, &clhash->hash[h]);
718         clhash->hashelems++;
719 }
720 EXPORT_SYMBOL(qdisc_class_hash_insert);
721
/* Unlink class @cl from the hash and update the element count. */
void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
729
730 /* Allocate an unique handle from space managed by kernel
731  * Possible range is [8000-FFFF]:0000 (0x8000 values)
732  */
733 static u32 qdisc_alloc_handle(struct net_device *dev)
734 {
735         int i = 0x8000;
736         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
737
738         do {
739                 autohandle += TC_H_MAKE(0x10000U, 0);
740                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
741                         autohandle = TC_H_MAKE(0x80000000U, 0);
742                 if (!qdisc_lookup(dev, autohandle))
743                         return autohandle;
744                 cond_resched();
745         } while (--i > 0);
746
747         return 0;
748 }
749
/* Propagate a decrease of @n packets / @len bytes from @sch up the
 * qdisc hierarchy so every ancestor's qlen/backlog stays accurate,
 * and let classful parents react via their qlen_notify() callback.
 */
void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n,
			       unsigned int len)
{
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;
	int drops;

	if (n == 0 && len == 0)
		return;
	drops = max_t(int, n, 0);
	rcu_read_lock();
	while ((parentid = sch->parent)) {
		/* Ingress hierarchies are not accounted this way. */
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			break;

		if (sch->flags & TCQ_F_NOPARENT)
			break;
		/* TODO: perform the search on a per txq basis */
		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON_ONCE(parentid != TC_H_ROOT);
			break;
		}
		cops = sch->ops->cl_ops;
		if (cops->qlen_notify) {
			/* Tell the parent class its child shrank. */
			cl = cops->get(sch, parentid);
			cops->qlen_notify(sch, cl);
			cops->put(sch, cl);
		}
		sch->q.qlen -= n;
		sch->qstats.backlog -= len;
		__qdisc_qstats_drop(sch, drops);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
787
788 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
789                                struct nlmsghdr *n, u32 clid,
790                                struct Qdisc *old, struct Qdisc *new)
791 {
792         if (new || old)
793                 qdisc_notify(net, skb, n, clid, old, new);
794
795         if (old)
796                 qdisc_destroy(old);
797 }
798
799 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
800  * to device "dev".
801  *
802  * When appropriate send a netlink notification using 'skb'
803  * and "n".
804  *
805  * On success, destroy old qdisc.
806  */
807
static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);
	int err = 0;

	if (parent == NULL) {
		/* Grafting at the device root (egress or ingress). */
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
			if (!dev_ingress_queue(dev))
				return -ENOENT;
		}

		/* Quiesce the device while the qdisc is swapped. */
		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		/* Qdiscs with an attach callback wire themselves to
		 * the TX queues below, after dev->qdisc is updated.
		 */
		if (new && new->ops->attach)
			goto skip;

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = dev_ingress_queue(dev);

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			/* One reference per extra TX queue sharing "new". */
			if (new && i > 0)
				atomic_inc(&new->refcnt);

			if (!ingress)
				qdisc_destroy(old);
		}

skip:
		if (!ingress) {
			notify_and_destroy(net, skb, n, classid,
					   dev->qdisc, new);
			/* Reference held by dev->qdisc itself. */
			if (new && !new->ops->attach)
				atomic_inc(&new->refcnt);
			dev->qdisc = new ? : &noop_qdisc;

			if (new && new->ops->attach)
				new->ops->attach(new);
		} else {
			notify_and_destroy(net, skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		/* Grafting into a class of an existing classful qdisc. */
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		err = -EOPNOTSUPP;
		if (cops && cops->graft) {
			unsigned long cl = cops->get(parent, classid);
			if (cl) {
				err = cops->graft(parent, cl, new, &old);
				cops->put(parent, cl);
			} else
				err = -ENOENT;
		}
		if (!err)
			notify_and_destroy(net, skb, n, classid, old, new);
	}
	return err;
}
882
883 /* lockdep annotation is needed for ingress; egress gets it only for name */
884 static struct lock_class_key qdisc_tx_lock;
885 static struct lock_class_key qdisc_rx_lock;
886
/*
 * Allocate and initialize a new qdisc.
 *
 * Parameters are passed via opt (the netlink attribute table @tca).
 * @dev/@dev_queue: device and tx queue the qdisc is attached to.
 * @p: parent qdisc (may be NULL for a root/ingress attach).
 * @parent/@handle: TC handles; handle == 0 means "allocate one",
 *                  handle == TC_H_INGRESS marks the ingress qdisc.
 * @errp: error code out-parameter, set on the NULL-return paths.
 *
 * Returns the new qdisc on success, NULL (with *errp set) on failure.
 * May return NULL with *errp == -EAGAIN after a module autoload, in
 * which case the caller must replay the whole request (see below).
 */

static struct Qdisc *qdisc_create(struct net_device *dev,
				  struct netdev_queue *dev_queue,
				  struct Qdisc *p, u32 parent, u32 handle,
				  struct nlattr **tca, int *errp)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the mean time.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try again qdisc_lookup_ops,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (ops == NULL)
		goto err_out;

	/* Takes a reference on dev and on ops->owner on success. */
	sch = qdisc_alloc(dev_queue, ops);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		sch->flags |= TCQ_F_INGRESS;
		/* Ingress always uses the fixed ffff:0000 handle. */
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
		lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
	} else {
		if (handle == 0) {
			handle = qdisc_alloc_handle(dev);
			err = -ENOMEM;
			if (handle == 0)
				goto err_out3;
		}
		lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
		if (!netif_is_multiqueue(dev))
			sch->flags |= TCQ_F_ONETXQUEUE;
	}

	sch->handle = handle;

	if (!ops->init || (err = ops->init(sch, tca[TCA_OPTIONS])) == 0) {
		if (qdisc_is_percpu_stats(sch)) {
			sch->cpu_bstats =
				netdev_alloc_pcpu_stats(struct gnet_stats_basic_cpu);
			if (!sch->cpu_bstats)
				goto err_out4;

			sch->cpu_qstats = alloc_percpu(struct gnet_stats_queue);
			if (!sch->cpu_qstats)
				goto err_out4;
		}

		if (tca[TCA_STAB]) {
			stab = qdisc_get_stab(tca[TCA_STAB]);
			if (IS_ERR(stab)) {
				err = PTR_ERR(stab);
				goto err_out4;
			}
			rcu_assign_pointer(sch->stab, stab);
		}
		if (tca[TCA_RATE]) {
			seqcount_t *running;

			err = -EOPNOTSUPP;
			if (sch->flags & TCQ_F_MQROOT)
				goto err_out4;

			/* Pick the seqcount the estimator samples under:
			 * the root's for ordinary child qdiscs, our own
			 * for root/ingress/mq-child qdiscs.
			 */
			if ((sch->parent != TC_H_ROOT) &&
			    !(sch->flags & TCQ_F_INGRESS) &&
			    (!p || !(p->flags & TCQ_F_MQROOT)))
				running = qdisc_root_sleeping_running(sch);
			else
				running = &sch->running;

			err = gen_new_estimator(&sch->bstats,
						sch->cpu_bstats,
						&sch->rate_est,
						NULL,
						running,
						tca[TCA_RATE]);
			if (err)
				goto err_out4;
		}

		qdisc_hash_add(sch);

		return sch;
	}
	/* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
	if (ops->destroy)
		ops->destroy(sch);
err_out3:
	/* Undo the dev_hold() taken by qdisc_alloc(). */
	dev_put(dev);
	kfree((char *) sch - sch->padded);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;

err_out4:
	/* Post-init failure: release per-cpu stats, stab and the qdisc's
	 * own state before falling through to the common unwind above.
	 */
	free_percpu(sch->cpu_bstats);
	free_percpu(sch->cpu_qstats);
	/*
	 * Any broken qdiscs that would require a ops->reset() here?
	 * The qdisc was never in action so it shouldn't be necessary.
	 */
	qdisc_put_stab(rtnl_dereference(sch->stab));
	if (ops->destroy)
		ops->destroy(sch);
	goto err_out3;
}
1035
/* Apply a change request to an existing qdisc: forward TCA_OPTIONS to the
 * qdisc's ->change() op, swap in a new size table (TCA_STAB), and replace
 * the rate estimator (TCA_RATE).  Called under RTNL.
 */
static int qdisc_change(struct Qdisc *sch, struct nlattr **tca)
{
	struct qdisc_size_table *ostab, *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		/* Qdiscs without a ->change() op cannot be reconfigured. */
		if (sch->ops->change == NULL)
			return -EINVAL;
		err = sch->ops->change(sch, tca[TCA_OPTIONS]);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB]);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	/* Publish the new stab before dropping the old one; readers use RCU. */
	ostab = rtnl_dereference(sch->stab);
	rcu_assign_pointer(sch->stab, stab);
	qdisc_put_stab(ostab);

	if (tca[TCA_RATE]) {
		/* NB: ignores errors from replace_estimator
		   because change can't be undone. */
		if (sch->flags & TCQ_F_MQROOT)
			goto out;
		gen_replace_estimator(&sch->bstats,
				      sch->cpu_bstats,
				      &sch->rate_est,
				      NULL,
				      qdisc_root_sleeping_running(sch),
				      tca[TCA_RATE]);
	}
out:
	return 0;
}
1074
/* Walker state for the graft loop check: detects whether qdisc 'p' is
 * already reachable below the walked hierarchy, which would create a cycle.
 * The embedded walker 'w' must stay the first member: check_loop_fn()
 * recovers this struct by casting the qdisc_walker pointer.
 */
struct check_loop_arg {
	struct qdisc_walker	w;	/* embedded walker, must be first */
	struct Qdisc		*p;	/* qdisc about to be grafted */
	int			depth;	/* current recursion depth */
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl,
			 struct qdisc_walker *w);
1083
1084 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1085 {
1086         struct check_loop_arg   arg;
1087
1088         if (q->ops->cl_ops == NULL)
1089                 return 0;
1090
1091         arg.w.stop = arg.w.skip = arg.w.count = 0;
1092         arg.w.fn = check_loop_fn;
1093         arg.depth = depth;
1094         arg.p = p;
1095         q->ops->cl_ops->walk(q, &arg.w);
1096         return arg.w.stop ? -ELOOP : 0;
1097 }
1098
1099 static int
1100 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1101 {
1102         struct Qdisc *leaf;
1103         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1104         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1105
1106         leaf = cops->leaf(q, cl);
1107         if (leaf) {
1108                 if (leaf == arg->p || arg->depth > 7)
1109                         return -ELOOP;
1110                 return check_loop(leaf, arg->p, arg->depth + 1);
1111         }
1112         return 0;
1113 }
1114
/*
 * Delete/get qdisc.
 *
 * Handles RTM_DELQDISC and RTM_GETQDISC netlink requests: resolves the
 * target qdisc from tcm_parent (class id) or tcm_handle, then either
 * grafts NULL over it (delete) or sends a notification (get).
 */

static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	/* Deletion requires CAP_NET_ADMIN in the owning user namespace. */
	if ((n->nlmsg_type != RTM_GETQDISC) &&
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	clid = tcm->tcm_parent;
	if (clid) {
		/* Parent given: locate the child qdisc under it. */
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}
		if (!q)
			return -ENOENT;

		/* If a handle was also supplied, it must match. */
		if (tcm->tcm_handle && q->handle != tcm->tcm_handle)
			return -EINVAL;
	} else {
		/* No parent: look the qdisc up directly by handle. */
		q = qdisc_lookup(dev, tcm->tcm_handle);
		if (!q)
			return -ENOENT;
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid)
			return -EINVAL;
		/* handle == 0 means a builtin/default qdisc; not deletable. */
		if (q->handle == 0)
			return -ENOENT;
		err = qdisc_graft(dev, p, skb, n, clid, NULL, q);
		if (err != 0)
			return err;
	} else {
		qdisc_notify(net, skb, n, clid, NULL, q);
	}
	return 0;
}
1183
/*
 * Create/change qdisc.
 *
 * Handles RTM_NEWQDISC: depending on tcm_parent/tcm_handle and the
 * NLM_F_{CREATE,REPLACE,EXCL} flags this either changes an existing
 * qdisc in place, grafts an existing one under a new parent, or
 * creates a new one and grafts it.
 */

static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	/* We come back here when qdisc_create() returned -EAGAIN after a
	 * module autoload dropped RTNL; everything must be re-resolved.
	 */
replay:
	/* Reinit, just in case something touches this. */
	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	tcm = nlmsg_data(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;


	if (clid) {
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p)
					return -ENOENT;
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue_create(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}

		/* It may be default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE))
					return -EEXIST;
				/* A qdisc handle must have minor 0. */
				if (TC_H_MIN(tcm->tcm_handle))
					return -EINVAL;
				q = qdisc_lookup(dev, tcm->tcm_handle);
				if (!q)
					goto create_n_graft;
				if (n->nlmsg_flags & NLM_F_EXCL)
					return -EEXIST;
				if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
					return -EINVAL;
				/* Re-grafting an existing qdisc: refuse if it
				 * would become its own ancestor.
				 */
				if (q == p ||
				    (p && check_loop(q, p, 0)))
					return -ELOOP;
				atomic_inc(&q->refcnt);
				goto graft;
			} else {
				if (!q)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know, that some child q is already
				 *   attached to this parent and have choice:
				 *   either to change it or to create/graft new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, requestor wanted to say,
				 *   that qdisc tcm_handle is not expected
				 *   to exist, so that we choose create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, it is sort of hole in API, we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft, if
				 *   user gave KIND, which does not match existing.
				 */
				if ((n->nlmsg_flags & NLM_F_CREATE) &&
				    (n->nlmsg_flags & NLM_F_REPLACE) &&
				    ((n->nlmsg_flags & NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle)
			return -EINVAL;
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (q == NULL)
		return -ENOENT;
	if (n->nlmsg_flags & NLM_F_EXCL)
		return -EEXIST;
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id))
		return -EINVAL;
	err = qdisc_change(q, tca);
	if (err == 0)
		qdisc_notify(net, skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags & NLM_F_CREATE))
		return -ENOENT;
	if (clid == TC_H_INGRESS) {
		if (dev_ingress_queue(dev))
			q = qdisc_create(dev, dev_ingress_queue(dev), p,
					 tcm->tcm_parent, tcm->tcm_parent,
					 tca, &err);
		else
			err = -ENOENT;
	} else {
		struct netdev_queue *dev_queue;

		/* Let a classful parent pick the tx queue if it can. */
		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
		else if (p)
			dev_queue = p->dev_queue;
		else
			dev_queue = netdev_get_tx_queue(dev, 0);

		q = qdisc_create(dev, dev_queue, p,
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err);
	}
	if (q == NULL) {
		/* -EAGAIN: RTNL was dropped for module load, start over. */
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL);
	if (err) {
		if (q)
			qdisc_destroy(q);
		return err;
	}

	return 0;
}
1341
/* Fill one RTM_*QDISC netlink message for @q into @skb: tcmsg header,
 * TCA_KIND, qdisc-specific dump, size table and statistics.
 * Returns skb->len on success, -1 on failure (message trimmed back).
 */
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 portid, u32 seq, u16 flags, int event)
{
	struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
	struct gnet_stats_queue __percpu *cpu_qstats = NULL;
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	struct qdisc_size_table *stab;
	__u32 qlen;

	/* Dumps can iterate many qdiscs; give the scheduler a chance. */
	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	/* tcm_info carries the qdisc refcount towards user space. */
	tcm->tcm_info = atomic_read(&q->refcnt);
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	/* Snapshot qlen before copying stats so they stay consistent. */
	qlen = q->q.qlen;

	stab = rtnl_dereference(q->stab);
	if (stab && qdisc_dump_stab(skb, stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	if (qdisc_is_percpu_stats(q)) {
		cpu_bstats = q->cpu_bstats;
		cpu_qstats = q->cpu_qstats;
	}

	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
				  &d, cpu_bstats, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->bstats, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	/* Roll the skb back to where this message started. */
	nlmsg_trim(skb, b);
	return -1;
}
1405
1406 static bool tc_qdisc_dump_ignore(struct Qdisc *q)
1407 {
1408         return (q->flags & TCQ_F_BUILTIN) ? true : false;
1409 }
1410
/* Notify user space about a qdisc change: an RTM_DELQDISC message for
 * @old (if any) followed by an RTM_NEWQDISC for @new (if any), sent to
 * the RTNLGRP_TC multicast group.  Built-in qdiscs are skipped; if
 * nothing ends up in the message the skb is freed and -EINVAL returned.
 */
static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && !tc_qdisc_dump_ignore(old)) {
		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
				  0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new && !tc_qdisc_dump_ignore(new)) {
		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	if (skb->len)
		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
				      n->nlmsg_flags & NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}
1441
/* Dump @root and, when @recur is set, every qdisc hashed on its device.
 * *q_idx_p/s_q_idx implement netlink dump resumption: qdiscs with index
 * below s_q_idx were already sent in a previous callback and are skipped.
 * Returns 0 on completion, -1 when the skb filled up (dump to be resumed).
 */
static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx, bool recur)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

	/* If dumping singletons, there is no qdisc_dev(root) and the singleton
	 * itself has already been dumped.
	 *
	 * If we've already dumped the top-level (ingress) qdisc above and the global
	 * qdisc hashtable, we don't want to hit it again
	 */
	if (!qdisc_dev(root) || !recur)
		goto out;

	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	/* skb full: tell the caller to stop and resume later. */
	ret = -1;
	goto out;
}
1494
/* RTM_GETQDISC dump handler: walk all devices in the namespace and dump
 * their egress and ingress qdisc trees.  cb->args[0]/[1] store the device
 * and qdisc indices where a filled-up dump left off.
 */
static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];

	idx = 0;
	ASSERT_RTNL();
	for_each_netdev(net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		/* Past the resume device: restart qdisc index from 0. */
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
				       true) < 0)
			goto done;

		/* Ingress root is dumped without recursion: its children
		 * already appeared via the device hash above.
		 */
		dev_queue = dev_ingress_queue(dev);
		if (dev_queue &&
		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
				       &q_idx, s_q_idx, false) < 0)
			goto done;

cont:
		idx++;
	}

done:
	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}
1536
1537
1538
1539 /************************************************
1540  *      Traffic classes manipulation.           *
1541  ************************************************/
1542
1543
1544
/* RTM_{NEW,DEL,GET}TCLASS handler: resolve the owning qdisc from the
 * tcmsg handle algebra below, look up the class, and dispatch to the
 * qdisc's class ops (change/delete) or to tclass_notify (get).
 */
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 portid;
	u32 clid;
	u32 qid;
	int err;

	if ((n->nlmsg_type != RTM_GETTCLASS) &&
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, NULL);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0         - parent is root class.
	   parent == X:Y         - parent is a node in hierarchy.
	   parent == 0:Y         - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0         - generate handle from kernel pool.
	   handle == 0:Y         - class is X:Y, where X:0 is qdisc.
	   handle == X:Y         - clear.
	   handle == X:0         - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

	/* NB: 'portid' here is the tcm_parent handle, not a netlink port;
	 * the historical name is kept.
	 */
	portid = tcm->tcm_parent;
	clid = tcm->tcm_handle;
	qid = TC_H_MAJ(clid);

	if (portid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(portid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc->handle;

		/* Now qid is genuine qdisc handle consistent
		 * both with parent and child.
		 *
		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
		 */
		if (portid)
			portid = TC_H_MAKE(qid, portid);
	} else {
		if (qid == 0)
			qid = dev->qdisc->handle;
	}

	/* OK. Locate qdisc */
	q = qdisc_lookup(dev, qid);
	if (!q)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (portid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->get(q, clid);

	if (cl == 0) {
		/* Class not found: only NEWTCLASS with NLM_F_CREATE may
		 * proceed (to create it below).
		 */
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS ||
		    !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = -EOPNOTSUPP;
			if (cops->delete)
				err = cops->delete(q, cl);
			if (err == 0)
				tclass_notify(net, skb, n, q, cl,
					      RTM_DELTCLASS);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	/* Create or change the class via the qdisc's ->change() op. */
	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, portid, tca, &new_cl);
	if (err == 0)
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);

out:
	/* Drop the reference taken by cops->get() above. */
	if (cl)
		cops->put(q, cl);

	return err;
}
1677
1678
/* Fill one RTM_*TCLASS message for class @cl of qdisc @q into @skb.
 * The qdisc's class ops provide the class-specific attributes and stats.
 * Returns skb->len on success, -1 on failure (message trimmed back).
 */
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 portid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	/* Class dumps can be long-running; be nice to the scheduler. */
	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	/* ->dump() may overwrite tcm_parent/tcm_handle with the real
	 * class identifiers.
	 */
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
1724
1725 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1726                          struct nlmsghdr *n, struct Qdisc *q,
1727                          unsigned long cl, int event)
1728 {
1729         struct sk_buff *skb;
1730         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1731
1732         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1733         if (!skb)
1734                 return -ENOBUFS;
1735
1736         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1737                 kfree_skb(skb);
1738                 return -EINVAL;
1739         }
1740
1741         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1742                               n->nlmsg_flags & NLM_F_ECHO);
1743 }
1744
/* Walker state for dumping classes: carries the dump skb and netlink
 * callback alongside the embedded walker.  'w' must stay the first
 * member: qdisc_class_dump() recovers this struct by casting the
 * qdisc_walker pointer.
 */
struct qdisc_dump_args {
	struct qdisc_walker	w;	/* embedded walker, must be first */
	struct sk_buff		*skb;	/* dump message under construction */
	struct netlink_callback *cb;	/* netlink dump context */
};
1750
1751 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
1752                             struct qdisc_walker *arg)
1753 {
1754         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
1755
1756         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
1757                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
1758                               RTM_NEWTCLASS);
1759 }
1760
/* Dump all classes of one qdisc into a netlink dump message.
 *
 * @t_p counts qdiscs visited so far in this dump; @s_t is the count at
 * which the previous dump call stopped (cb->args[0]).  Qdiscs below
 * @s_t were already dumped and are skipped but still counted, so the
 * cursor stays stable across calls.
 *
 * Returns 0 to continue with the next qdisc, -1 when the skb is full
 * and the dump must resume later.
 */
static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	/* Skip (but count) qdiscs that are invisible to dumps, were
	 * fully dumped by a previous call, have no class operations, or
	 * don't match the parent filter requested by userspace.
	 */
	if (tc_qdisc_dump_ignore(q) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	/* Entering a qdisc beyond the resume point: clear all per-qdisc
	 * walk state (args[1..]); args[0] remains the qdisc cursor.
	 */
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop  = 0;
	arg.w.skip = cb->args[1];	/* resume after classes already sent */
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;	/* remember progress for next call */
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}
1789
1790 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
1791                                struct tcmsg *tcm, struct netlink_callback *cb,
1792                                int *t_p, int s_t, bool recur)
1793 {
1794         struct Qdisc *q;
1795         int b;
1796
1797         if (!root)
1798                 return 0;
1799
1800         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
1801                 return -1;
1802
1803         if (!qdisc_dev(root) || !recur)
1804                 return 0;
1805
1806         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1807                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
1808                         return -1;
1809         }
1810
1811         return 0;
1812 }
1813
/* RTM_GETTCLASS dump handler: emit one netlink record per traffic
 * class on the device named by tcm_ifindex, walking the egress qdisc
 * tree recursively and the ingress qdisc non-recursively.
 *
 * cb->args[0] holds the qdisc cursor between calls; deeper args[]
 * slots hold per-qdisc class-walk state.
 */
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = nlmsg_data(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
		return 0;
	dev = dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return 0;

	s_t = cb->args[0];	/* where the previous dump call stopped */
	t = 0;

	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t, true) < 0)
		goto done;

	dev_queue = dev_ingress_queue(dev);
	if (dev_queue &&
	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
				&t, s_t, false) < 0)
		goto done;

	/* Both success and failure land here: save the cursor and
	 * return the bytes written so far.
	 */
done:
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}
1846
/* Main classifier routine: scans classifier chain attached
 * to this qdisc, (optionally) tests for protocol and asks
 * specific classifiers.
 *
 * Returns the classifier verdict (>= 0), or TC_ACT_UNSPEC when no
 * classifier in the chain matched.  With CONFIG_NET_CLS_ACT, a
 * TC_ACT_RECLASSIFY verdict restarts the scan from the head of the
 * chain, bounded by MAX_REC_LOOP to break misconfigured loops.
 */
int tc_classify(struct sk_buff *skb, const struct tcf_proto *tp,
		struct tcf_result *res, bool compat_mode)
{
#ifdef CONFIG_NET_CLS_ACT
	const struct tcf_proto *old_tp = tp;	/* chain head, for restarts */
	int limit = 0;				/* reclassify restart count */

reclassify:
#endif
	for (; tp; tp = rcu_dereference_bh(tp->next)) {
		__be16 protocol = tc_skb_protocol(skb);
		int err;

		/* Only consult classifiers registered for this skb's
		 * protocol, or wildcard (ETH_P_ALL) ones.
		 */
		if (tp->protocol != protocol &&
		    tp->protocol != htons(ETH_P_ALL))
			continue;

		err = tp->classify(skb, tp, res);
#ifdef CONFIG_NET_CLS_ACT
		if (unlikely(err == TC_ACT_RECLASSIFY && !compat_mode))
			goto reset;
#endif
		/* Negative result means "no match, try next classifier". */
		if (err >= 0)
			return err;
	}

	return TC_ACT_UNSPEC; /* signal: continue lookup */
#ifdef CONFIG_NET_CLS_ACT
reset:
	/* Drop the packet if reclassification keeps looping. */
	if (unlikely(limit++ >= MAX_REC_LOOP)) {
		net_notice_ratelimited("%s: reclassify loop, rule prio %u, protocol %02x\n",
				       tp->q->ops->id, tp->prio & 0xffff,
				       ntohs(tp->protocol));
		return TC_ACT_SHOT;
	}

	tp = old_tp;
	goto reclassify;
#endif
}
EXPORT_SYMBOL(tc_classify);
1892
1893 bool tcf_destroy(struct tcf_proto *tp, bool force)
1894 {
1895         if (tp->ops->destroy(tp, force)) {
1896                 module_put(tp->ops->owner);
1897                 kfree_rcu(tp, rcu);
1898                 return true;
1899         }
1900
1901         return false;
1902 }
1903
1904 void tcf_destroy_chain(struct tcf_proto __rcu **fl)
1905 {
1906         struct tcf_proto *tp;
1907
1908         while ((tp = rtnl_dereference(*fl)) != NULL) {
1909                 RCU_INIT_POINTER(*fl, tp->next);
1910                 tcf_destroy(tp, true);
1911         }
1912 }
1913 EXPORT_SYMBOL(tcf_destroy_chain);
1914
#ifdef CONFIG_PROC_FS
/* /proc/net/psched: four hex words describing the scheduler clock:
 * ns per microsecond, ns per psched tick, a fixed 1000000 (historical
 * field — presumably kept for userspace ABI compatibility; verify
 * against tc's reader), and hrtimer resolution expressed in Hz.
 */
static int psched_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC / hrtimer_resolution);

	return 0;
}

static int psched_open(struct inode *inode, struct file *file)
{
	return single_open(file, psched_show, NULL);
}

static const struct file_operations psched_fops = {
	.owner = THIS_MODULE,
	.open = psched_open,
	.read  = seq_read,
	.llseek = seq_lseek,
	.release = single_release,
};

/* Create /proc/net/psched in each network namespace. */
static int __net_init psched_net_init(struct net *net)
{
	struct proc_dir_entry *e;

	e = proc_create("psched", 0, net->proc_net, &psched_fops);
	if (e == NULL)
		return -ENOMEM;

	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
	remove_proc_entry("psched", net->proc_net);
}
#else
/* Without procfs the per-netns hooks are no-ops. */
static int __net_init psched_net_init(struct net *net)
{
	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
}
#endif
1964
/* Per-network-namespace lifecycle for the /proc/net/psched entry. */
static struct pernet_operations psched_net_ops = {
	.init = psched_net_init,
	.exit = psched_net_exit,
};
1969
/* Boot-time initialization of the packet scheduler core: per-netns
 * procfs hooks, the built-in qdiscs, and the rtnetlink message
 * handlers for qdisc and traffic-class operations.
 */
static int __init pktsched_init(void)
{
	int err;

	err = register_pernet_subsys(&psched_net_ops);
	if (err) {
		pr_err("pktsched_init: "
		       "cannot initialize per netns operations\n");
		return err;
	}

	/* NOTE(review): register_qdisc() return values are ignored here —
	 * presumably these built-in registrations cannot fail at boot;
	 * confirm before relying on it.
	 */
	register_qdisc(&pfifo_fast_ops);
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	register_qdisc(&pfifo_head_drop_qdisc_ops);
	register_qdisc(&mq_qdisc_ops);
	register_qdisc(&noqueue_qdisc_ops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
		      NULL);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, NULL);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, NULL);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
		      NULL);

	return 0;
}

subsys_initcall(pktsched_init);