GNU Linux-libre 4.19.211-gnu1
[releases.git] / net / sched / sch_api.c
1 /*
2  * net/sched/sch_api.c  Packet scheduler API.
3  *
4  *              This program is free software; you can redistribute it and/or
5  *              modify it under the terms of the GNU General Public License
6  *              as published by the Free Software Foundation; either version
7  *              2 of the License, or (at your option) any later version.
8  *
9  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
10  *
11  * Fixes:
12  *
13  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
14  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
15  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
16  */
17
18 #include <linux/module.h>
19 #include <linux/types.h>
20 #include <linux/kernel.h>
21 #include <linux/string.h>
22 #include <linux/errno.h>
23 #include <linux/skbuff.h>
24 #include <linux/init.h>
25 #include <linux/proc_fs.h>
26 #include <linux/seq_file.h>
27 #include <linux/kmod.h>
28 #include <linux/list.h>
29 #include <linux/hrtimer.h>
30 #include <linux/lockdep.h>
31 #include <linux/slab.h>
32 #include <linux/hashtable.h>
33
34 #include <net/net_namespace.h>
35 #include <net/sock.h>
36 #include <net/netlink.h>
37 #include <net/pkt_sched.h>
38 #include <net/pkt_cls.h>
39
40 /*
41
42    Short review.
43    -------------
44
45    This file consists of two interrelated parts:
46
47    1. queueing disciplines manager frontend.
48    2. traffic classes manager frontend.
49
50    Generally, queueing discipline ("qdisc") is a black box,
51    which is able to enqueue packets and to dequeue them (when
52    device is ready to send something) in order and at times
53    determined by algorithm hidden in it.
54
55    qdiscs are divided into two categories:
56    - "queues", which have no internal structure visible from outside.
57    - "schedulers", which split all the packets into "traffic classes",
58      using "packet classifiers" (look at cls_api.c)
59
60    In turn, classes may have child qdiscs (as a rule, queues)
61    attached to them etc. etc. etc.
62
63    The goal of the routines in this file is to translate
64    the handle-based information supplied by the user into a form
65    more intelligible to the kernel, to perform some sanity checks
66    and the parts of the work that are common to all qdiscs,
67    and to provide rtnetlink notifications.
68
69    All real intelligent work is done inside qdisc modules.
70
71
72
73    Every discipline has two major routines: enqueue and dequeue.
74
75    ---dequeue
76
77    dequeue usually returns a skb to send. It is allowed to return NULL,
78    but it does not mean that queue is empty, it just means that
79    discipline does not want to send anything this time.
80    Queue is really empty if q->q.qlen == 0.
81    For complicated disciplines with multiple queues q->q is not
82    real packet queue, but however q->q.qlen must be valid.
83
84    ---enqueue
85
86    enqueue returns 0, if packet was enqueued successfully.
87    If a packet (this one or another one) was dropped, it returns
88    a non-zero error code.
89    NET_XMIT_DROP        - this packet dropped
90      Expected action: do not backoff, but wait until queue will clear.
91    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
92      Expected action: backoff or ignore
93
94    Auxiliary routines:
95
96    ---peek
97
98    like dequeue but without removing a packet from the queue
99
100    ---reset
101
102    returns qdisc to initial state: purge all buffers, clear all
103    timers, counters (except for statistics) etc.
104
105    ---init
106
107    initializes newly created qdisc.
108
109    ---destroy
110
111    destroys resources allocated by init and during lifetime of qdisc.
112
113    ---change
114
115    changes qdisc parameters.
116  */
117
118 /* Protects list of registered TC modules. It is pure SMP lock. */
/* Also taken by qdisc_get_default()/qdisc_set_default() to guard
 * default_qdisc_ops against concurrent replacement. */
119 static DEFINE_RWLOCK(qdisc_mod_lock);
120
121
122 /************************************************
123  *      Queueing disciplines manipulation.      *
124  ************************************************/
125
126
127 /* The list of all installed queueing disciplines. */
/* Singly linked via Qdisc_ops->next; protected by qdisc_mod_lock. */
128
129 static struct Qdisc_ops *qdisc_base;
130
131 /* Register/unregister queueing discipline */
132
/* Add @qops to the global list of queueing disciplines.
 * Returns 0 on success, -EEXIST if an ops with the same id is already
 * registered, or -EINVAL if mandatory callbacks are missing or
 * inconsistent.  Missing enqueue/peek/dequeue are filled in with the
 * noop defaults.
 */
133 int register_qdisc(struct Qdisc_ops *qops)
134 {
135         struct Qdisc_ops *q, **qp;
136         int rc = -EEXIST;
137
138         write_lock(&qdisc_mod_lock);
139         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
140                 if (!strcmp(qops->id, q->id))
141                         goto out;
142
            /* Fill in defaults for optional datapath callbacks. */
143         if (qops->enqueue == NULL)
144                 qops->enqueue = noop_qdisc_ops.enqueue;
145         if (qops->peek == NULL) {
                    /* A custom dequeue without a matching peek is invalid. */
146                 if (qops->dequeue == NULL)
147                         qops->peek = noop_qdisc_ops.peek;
148                 else
149                         goto out_einval;
150         }
151         if (qops->dequeue == NULL)
152                 qops->dequeue = noop_qdisc_ops.dequeue;
153
154         if (qops->cl_ops) {
155                 const struct Qdisc_class_ops *cops = qops->cl_ops;
156
                    /* Classful qdiscs must support class lookup/walk/leaf. */
157                 if (!(cops->find && cops->walk && cops->leaf))
158                         goto out_einval;
159
                    /* A filter block implies filter (un)binding support. */
160                 if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
161                         goto out_einval;
162         }
163
            /* Append at the tail of the list (qp points at the last link). */
164         qops->next = NULL;
165         *qp = qops;
166         rc = 0;
167 out:
168         write_unlock(&qdisc_mod_lock);
169         return rc;
170
171 out_einval:
172         rc = -EINVAL;
173         goto out;
174 }
175 EXPORT_SYMBOL(register_qdisc);
176
177 int unregister_qdisc(struct Qdisc_ops *qops)
178 {
179         struct Qdisc_ops *q, **qp;
180         int err = -ENOENT;
181
182         write_lock(&qdisc_mod_lock);
183         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
184                 if (q == qops)
185                         break;
186         if (q) {
187                 *qp = q->next;
188                 q->next = NULL;
189                 err = 0;
190         }
191         write_unlock(&qdisc_mod_lock);
192         return err;
193 }
194 EXPORT_SYMBOL(unregister_qdisc);
195
196 /* Get default qdisc if not otherwise specified */
/* Copies the id string of the current default qdisc ops into @name,
 * truncated to @len bytes.  qdisc_mod_lock serializes against
 * qdisc_set_default() swapping default_qdisc_ops underneath us.
 */
197 void qdisc_get_default(char *name, size_t len)
198 {
199         read_lock(&qdisc_mod_lock);
200         strlcpy(name, default_qdisc_ops->id, len);
201         read_unlock(&qdisc_mod_lock);
202 }
203
204 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
205 {
206         struct Qdisc_ops *q = NULL;
207
208         for (q = qdisc_base; q; q = q->next) {
209                 if (!strcmp(name, q->id)) {
210                         if (!try_module_get(q->owner))
211                                 q = NULL;
212                         break;
213                 }
214         }
215
216         return q;
217 }
218
219 /* Set new default qdisc to use */
/* Requires CAP_NET_ADMIN.  If @name is not registered yet, the module
 * lock is dropped so request_module() (which may sleep) can try to load
 * sch_<name>, then the lookup is retried under the re-taken lock.
 * Returns 0 on success or -ENOENT if no such scheduler exists.
 */
220 int qdisc_set_default(const char *name)
221 {
222         const struct Qdisc_ops *ops;
223
224         if (!capable(CAP_NET_ADMIN))
225                 return -EPERM;
226
227         write_lock(&qdisc_mod_lock);
228         ops = qdisc_lookup_default(name);
229         if (!ops) {
230                 /* Not found, drop lock and try to load module */
231                 write_unlock(&qdisc_mod_lock);
232                 request_module("sch_%s", name);
233                 write_lock(&qdisc_mod_lock);
234
235                 ops = qdisc_lookup_default(name);
236         }
237
238         if (ops) {
239                 /* Set new default */
                    /* qdisc_lookup_default() took a module ref on ops;
                     * release the ref held on the previous default. */
240                 module_put(default_qdisc_ops->owner);
241                 default_qdisc_ops = ops;
242         }
243         write_unlock(&qdisc_mod_lock);
244
245         return ops ? 0 : -ENOENT;
246 }
247
248 #ifdef CONFIG_NET_SCH_DEFAULT
249 /* Set default value from kernel config */
/* late_initcall so that (presumably) built-in schedulers have had a
 * chance to register themselves before the default is chosen. */
250 static int __init sch_default_qdisc(void)
251 {
252         return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
253 }
254 late_initcall(sch_default_qdisc);
255 #endif
256
257 /* We know handle. Find qdisc among all qdisc's attached to device
258  * (root qdisc, all its children, children of children etc.)
259  * Note: caller either uses rtnl or rcu_read_lock()
260  */
261
262 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
263 {
264         struct Qdisc *q;
265
            /* A root without a backing device can only match itself. */
266         if (!qdisc_dev(root))
267                 return (root->handle == handle ? root : NULL);
268
269         if (!(root->flags & TCQ_F_BUILTIN) &&
270             root->handle == handle)
271                 return root;
272
            /* Non-root qdiscs of the device live in its qdisc_hash,
             * keyed by handle (see qdisc_hash_add()). */
273         hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
274                 if (q->handle == handle)
275                         return q;
276         }
277         return NULL;
278 }
279
/* Insert @q into its device's qdisc hash so qdisc_lookup() can find it.
 * Root and ingress qdiscs are deliberately excluded.  @invisible sets
 * TCQ_F_INVISIBLE so that dumps skip the qdisc unless explicitly
 * requested (see tc_qdisc_dump_ignore()).
 */
280 void qdisc_hash_add(struct Qdisc *q, bool invisible)
281 {
282         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
283                 ASSERT_RTNL();
284                 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
285                 if (invisible)
286                         q->flags |= TCQ_F_INVISIBLE;
287         }
288 }
289 EXPORT_SYMBOL(qdisc_hash_add);
290
/* Remove @q from its device's qdisc hash.  Mirrors the filter used by
 * qdisc_hash_add(): root and ingress qdiscs were never hashed.
 */
291 void qdisc_hash_del(struct Qdisc *q)
292 {
293         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
294                 ASSERT_RTNL();
295                 hash_del_rcu(&q->hash);
296         }
297 }
298 EXPORT_SYMBOL(qdisc_hash_del);
299
300 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
301 {
302         struct Qdisc *q;
303
304         if (!handle)
305                 return NULL;
306         q = qdisc_match_from_root(dev->qdisc, handle);
307         if (q)
308                 goto out;
309
310         if (dev_ingress_queue(dev))
311                 q = qdisc_match_from_root(
312                         dev_ingress_queue(dev)->qdisc_sleeping,
313                         handle);
314 out:
315         return q;
316 }
317
318 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
319 {
320         unsigned long cl;
321         struct Qdisc *leaf;
322         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
323
324         if (cops == NULL)
325                 return NULL;
326         cl = cops->find(p, classid);
327
328         if (cl == 0)
329                 return NULL;
330         leaf = cops->leaf(p, cl);
331         return leaf;
332 }
333
334 /* Find queueing discipline by name */
335
336 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
337 {
338         struct Qdisc_ops *q = NULL;
339
340         if (kind) {
341                 read_lock(&qdisc_mod_lock);
342                 for (q = qdisc_base; q; q = q->next) {
343                         if (nla_strcmp(kind, q->id) == 0) {
344                                 if (!try_module_get(q->owner))
345                                         q = NULL;
346                                 break;
347                         }
348                 }
349                 read_unlock(&qdisc_mod_lock);
350         }
351         return q;
352 }
353
354 /* The linklayer setting was not transferred from iproute2, in older
355  * versions, and the rate tables lookup systems have been dropped in
356  * the kernel. To keep backward compatible with older iproute2 tc
357  * utils, we detect the linklayer setting by detecting if the rate
358  * table was modified.
359  *
360  * For linklayer ATM table entries, the rate table will be aligned to
361  * 48 bytes, thus some table entries will contain the same value.  The
362  * mpu (min packet unit) is also encoded into the old rate table, thus
363  * starting from the mpu, we find low and high table entries for
364  * mapping this cell.  If these entries contain the same value, then
365  * the rate table has been modified for linklayer ATM.
366  *
367  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
368  * and then roundup to the next cell, calc the table entry one below,
369  * and compare.
370  */
371 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
372 {
373         int low       = roundup(r->mpu, 48);
374         int high      = roundup(low+1, 48);
375         int cell_low  = low >> r->cell_log;
376         int cell_high = (high >> r->cell_log) - 1;
377
378         /* rtab is too inaccurate at rates > 100Mbit/s */
379         if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
380                 pr_debug("TC linklayer: Giving up ATM detection\n");
381                 return TC_LINKLAYER_ETHERNET;
382         }
383
            /* Identical entries across one 48-byte cell => ATM alignment. */
384         if ((cell_high > cell_low) && (cell_high < 256)
385             && (rtab[cell_low] == rtab[cell_high])) {
386                 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
387                          cell_low, cell_high, rtab[cell_high]);
388                 return TC_LINKLAYER_ATM;
389         }
390         return TC_LINKLAYER_ETHERNET;
391 }
392
/* Global list of shared, refcounted rate tables. */
393 static struct qdisc_rate_table *qdisc_rtab_list;
394
/* Find a rate table matching @r and the netlink table data @tab,
 * bumping its refcount, or allocate and register a new one.  Returns
 * NULL on invalid parameters or allocation failure (details via
 * @extack).  May also fix up r->linklayer for old iproute2 binaries.
 */
395 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
396                                         struct nlattr *tab,
397                                         struct netlink_ext_ack *extack)
398 {
399         struct qdisc_rate_table *rtab;
400
            /* cell_log is used as a shift count; table must be exactly
             * TC_RTAB_SIZE bytes. */
401         if (tab == NULL || r->rate == 0 ||
402             r->cell_log == 0 || r->cell_log >= 32 ||
403             nla_len(tab) != TC_RTAB_SIZE) {
404                 NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
405                 return NULL;
406         }
407
            /* Share an existing table when ratespec and the 1024 data
             * bytes match exactly. */
408         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
409                 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
410                     !memcmp(&rtab->data, nla_data(tab), 1024)) {
411                         rtab->refcnt++;
412                         return rtab;
413                 }
414         }
415
416         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
417         if (rtab) {
418                 rtab->rate = *r;
419                 rtab->refcnt = 1;
420                 memcpy(rtab->data, nla_data(tab), 1024);
                    /* Old iproute2 never set linklayer; infer it from the
                     * table contents (see __detect_linklayer()). */
421                 if (r->linklayer == TC_LINKLAYER_UNAWARE)
422                         r->linklayer = __detect_linklayer(r, rtab->data);
423                 rtab->next = qdisc_rtab_list;
424                 qdisc_rtab_list = rtab;
425         } else {
426                 NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
427         }
428         return rtab;
429 }
430 EXPORT_SYMBOL(qdisc_get_rtab);
431
/* Drop one reference on @tab; on the last reference unlink it from
 * qdisc_rtab_list and free it.  NULL is accepted and ignored.
 */
432 void qdisc_put_rtab(struct qdisc_rate_table *tab)
433 {
434         struct qdisc_rate_table *rtab, **rtabp;
435
436         if (!tab || --tab->refcnt)
437                 return;
438
439         for (rtabp = &qdisc_rtab_list;
440              (rtab = *rtabp) != NULL;
441              rtabp = &rtab->next) {
442                 if (rtab == tab) {
                            /* Splice out of the singly linked list. */
443                         *rtabp = rtab->next;
444                         kfree(rtab);
445                         return;
446                 }
447         }
448 }
449 EXPORT_SYMBOL(qdisc_put_rtab);
450
/* Global list of shared, refcounted size tables. */
451 static LIST_HEAD(qdisc_stab_list);
452
/* Netlink policy for TCA_STAB_* attributes parsed by qdisc_get_stab(). */
453 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
454         [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
455         [TCA_STAB_DATA] = { .type = NLA_BINARY },
456 };
457
458 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
459                                                struct netlink_ext_ack *extack)
460 {
461         struct nlattr *tb[TCA_STAB_MAX + 1];
462         struct qdisc_size_table *stab;
463         struct tc_sizespec *s;
464         unsigned int tsize = 0;
465         u16 *tab = NULL;
466         int err;
467
468         err = nla_parse_nested(tb, TCA_STAB_MAX, opt, stab_policy, extack);
469         if (err < 0)
470                 return ERR_PTR(err);
471         if (!tb[TCA_STAB_BASE]) {
472                 NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
473                 return ERR_PTR(-EINVAL);
474         }
475
476         s = nla_data(tb[TCA_STAB_BASE]);
477
478         if (s->tsize > 0) {
479                 if (!tb[TCA_STAB_DATA]) {
480                         NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
481                         return ERR_PTR(-EINVAL);
482                 }
483                 tab = nla_data(tb[TCA_STAB_DATA]);
484                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
485         }
486
487         if (tsize != s->tsize || (!tab && tsize > 0)) {
488                 NL_SET_ERR_MSG(extack, "Invalid size of size table");
489                 return ERR_PTR(-EINVAL);
490         }
491
492         list_for_each_entry(stab, &qdisc_stab_list, list) {
493                 if (memcmp(&stab->szopts, s, sizeof(*s)))
494                         continue;
495                 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
496                         continue;
497                 stab->refcnt++;
498                 return stab;
499         }
500
501         stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
502         if (!stab)
503                 return ERR_PTR(-ENOMEM);
504
505         stab->refcnt = 1;
506         stab->szopts = *s;
507         if (tsize > 0)
508                 memcpy(stab->data, tab, tsize * sizeof(u16));
509
510         list_add_tail(&stab->list, &qdisc_stab_list);
511
512         return stab;
513 }
514
515 static void stab_kfree_rcu(struct rcu_head *head)
516 {
517         kfree(container_of(head, struct qdisc_size_table, rcu));
518 }
519
/* Drop one reference on @tab; on the last reference unlink it and free
 * it after an RCU-bh grace period, so datapath readers in
 * __qdisc_calculate_pkt_len() can finish.  NULL is accepted.
 */
520 void qdisc_put_stab(struct qdisc_size_table *tab)
521 {
522         if (!tab)
523                 return;
524
525         if (--tab->refcnt == 0) {
526                 list_del(&tab->list);
527                 call_rcu_bh(&tab->rcu, stab_kfree_rcu);
528         }
529 }
530 EXPORT_SYMBOL(qdisc_put_stab);
531
532 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
533 {
534         struct nlattr *nest;
535
536         nest = nla_nest_start(skb, TCA_STAB);
537         if (nest == NULL)
538                 goto nla_put_failure;
539         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
540                 goto nla_put_failure;
541         nla_nest_end(skb, nest);
542
543         return skb->len;
544
545 nla_put_failure:
546         return -1;
547 }
548
/* Compute the "accounted" length of @skb according to size table @stab
 * and store it in qdisc_skb_cb(skb)->pkt_len.  The result is always at
 * least 1.
 */
549 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
550                                const struct qdisc_size_table *stab)
551 {
552         int pkt_len, slot;
553
            /* Start from the wire length plus configured overhead. */
554         pkt_len = skb->len + stab->szopts.overhead;
555         if (unlikely(!stab->szopts.tsize))
556                 goto out;
557
558         slot = pkt_len + stab->szopts.cell_align;
559         if (unlikely(slot < 0))
560                 slot = 0;
561
562         slot >>= stab->szopts.cell_log;
563         if (likely(slot < stab->szopts.tsize))
564                 pkt_len = stab->data[slot];
565         else
                    /* Past the end of the table: extrapolate linearly from
                     * the last entry plus the wrapped-around slot. */
566                 pkt_len = stab->data[stab->szopts.tsize - 1] *
567                                 (slot / stab->szopts.tsize) +
568                                 stab->data[slot % stab->szopts.tsize];
569
570         pkt_len <<= stab->szopts.size_log;
571 out:
572         if (unlikely(pkt_len < 1))
573                 pkt_len = 1;
574         qdisc_skb_cb(skb)->pkt_len = pkt_len;
575 }
576 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
577
578 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
579 {
580         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
581                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
582                         txt, qdisc->ops->id, qdisc->handle >> 16);
583                 qdisc->flags |= TCQ_F_WARN_NONWC;
584         }
585 }
586 EXPORT_SYMBOL(qdisc_warn_nonwc);
587
/* hrtimer callback: reschedule the root qdisc of the watchdog's qdisc
 * so the device gets another chance to transmit.  One-shot timer.
 */
588 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
589 {
590         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
591                                                  timer);
592
593         rcu_read_lock();
594         __netif_schedule(qdisc_root(wd->qdisc));
595         rcu_read_unlock();
596
597         return HRTIMER_NORESTART;
598 }
599
/* Initialize watchdog @wd for @qdisc using clock @clockid; the timer is
 * armed in absolute mode, pinned to the current CPU.
 */
600 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
601                                  clockid_t clockid)
602 {
603         hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
604         wd->timer.function = qdisc_watchdog;
605         wd->qdisc = qdisc;
606 }
607 EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
608
/* Convenience wrapper: initialize the watchdog on CLOCK_MONOTONIC. */
609 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
610 {
611         qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
612 }
613 EXPORT_SYMBOL(qdisc_watchdog_init);
614
/* (Re)arm the watchdog to fire at absolute time @expires (nanoseconds).
 * No-op when the root qdisc is deactivated, or when the timer is
 * already programmed for the same expiry.
 */
615 void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
616 {
617         if (test_bit(__QDISC_STATE_DEACTIVATED,
618                      &qdisc_root_sleeping(wd->qdisc)->state))
619                 return;
620
621         if (wd->last_expires == expires)
622                 return;
623
624         wd->last_expires = expires;
625         hrtimer_start(&wd->timer,
626                       ns_to_ktime(expires),
627                       HRTIMER_MODE_ABS_PINNED);
628 }
629 EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
630
/* Cancel the watchdog timer, waiting for a running callback to finish. */
631 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
632 {
633         hrtimer_cancel(&wd->timer);
634 }
635 EXPORT_SYMBOL(qdisc_watchdog_cancel);
636
637 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
638 {
639         struct hlist_head *h;
640         unsigned int i;
641
642         h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
643
644         if (h != NULL) {
645                 for (i = 0; i < n; i++)
646                         INIT_HLIST_HEAD(&h[i]);
647         }
648         return h;
649 }
650
/* Double @clhash's bucket array once its load factor exceeds 3/4.
 * The new table is allocated before taking the qdisc tree lock; the
 * rehash itself runs under sch_tree_lock(), and the old table is only
 * freed after the lock is dropped.  Allocation failure is silently
 * tolerated (we just keep the smaller table).
 */
651 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
652 {
653         struct Qdisc_class_common *cl;
654         struct hlist_node *next;
655         struct hlist_head *nhash, *ohash;
656         unsigned int nsize, nmask, osize;
657         unsigned int i, h;
658
659         /* Rehash when load factor exceeds 0.75 */
660         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
661                 return;
662         nsize = clhash->hashsize * 2;
663         nmask = nsize - 1;
664         nhash = qdisc_class_hash_alloc(nsize);
665         if (nhash == NULL)
666                 return;
667
668         ohash = clhash->hash;
669         osize = clhash->hashsize;
670
671         sch_tree_lock(sch);
672         for (i = 0; i < osize; i++) {
673                 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
674                         h = qdisc_class_hash(cl->classid, nmask);
675                         hlist_add_head(&cl->hnode, &nhash[h]);
676                 }
677         }
678         clhash->hash     = nhash;
679         clhash->hashsize = nsize;
680         clhash->hashmask = nmask;
681         sch_tree_unlock(sch);
682
683         kvfree(ohash);
684 }
685 EXPORT_SYMBOL(qdisc_class_hash_grow);
686
687 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
688 {
689         unsigned int size = 4;
690
691         clhash->hash = qdisc_class_hash_alloc(size);
692         if (!clhash->hash)
693                 return -ENOMEM;
694         clhash->hashsize  = size;
695         clhash->hashmask  = size - 1;
696         clhash->hashelems = 0;
697         return 0;
698 }
699 EXPORT_SYMBOL(qdisc_class_hash_init);
700
/* Free the bucket array of @clhash (counterpart of qdisc_class_hash_init). */
701 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
702 {
703         kvfree(clhash->hash);
704 }
705 EXPORT_SYMBOL(qdisc_class_hash_destroy);
706
707 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
708                              struct Qdisc_class_common *cl)
709 {
710         unsigned int h;
711
712         INIT_HLIST_NODE(&cl->hnode);
713         h = qdisc_class_hash(cl->classid, clhash->hashmask);
714         hlist_add_head(&cl->hnode, &clhash->hash[h]);
715         clhash->hashelems++;
716 }
717 EXPORT_SYMBOL(qdisc_class_hash_insert);
718
719 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
720                              struct Qdisc_class_common *cl)
721 {
722         hlist_del(&cl->hnode);
723         clhash->hashelems--;
724 }
725 EXPORT_SYMBOL(qdisc_class_hash_remove);
726
727 /* Allocate an unique handle from space managed by kernel
728  * Possible range is [8000-FFFF]:0000 (0x8000 values)
729  */
/* Returns 0 when every candidate handle is already in use on @dev. */
730 static u32 qdisc_alloc_handle(struct net_device *dev)
731 {
732         int i = 0x8000;
            /* Rotating cursor shared across all devices. */
733         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
734
735         do {
736                 autohandle += TC_H_MAKE(0x10000U, 0);
737                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
738                         autohandle = TC_H_MAKE(0x80000000U, 0);
                        /* Candidate is free if no qdisc on @dev uses it. */
739                 if (!qdisc_lookup(dev, autohandle))
740                         return autohandle;
741                 cond_resched();
742         } while (--i > 0);
743
744         return 0;
745 }
746
/* Propagate a decrease of @n packets / @len bytes from @sch up through
 * all ancestor qdiscs, fixing their qlen/backlog/drop counters and
 * notifying parent classes (qlen_notify) when the child became empty.
 * The walk stops at ingress or TCQ_F_NOPARENT boundaries.
 */
747 void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n,
748                                unsigned int len)
749 {
750         bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
751         const struct Qdisc_class_ops *cops;
752         unsigned long cl;
753         u32 parentid;
754         bool notify;
755         int drops;
756
757         if (n == 0 && len == 0)
758                 return;
759         drops = max_t(int, n, 0);
760         rcu_read_lock();
761         while ((parentid = sch->parent)) {
762                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
763                         break;
764
765                 if (sch->flags & TCQ_F_NOPARENT)
766                         break;
767                 /* Notify parent qdisc only if child qdisc becomes empty.
768                  *
769                  * If child was empty even before update then backlog
770                  * counter is screwed and we skip notification because
771                  * parent class is already passive.
772                  *
773                  * If the original child was offloaded then it is allowed
774                  * to be seem as empty, so the parent is notified anyway.
775                  */
776                 notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
777                                                        !qdisc_is_offloaded);
778                 /* TODO: perform the search on a per txq basis */
779                 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
780                 if (sch == NULL) {
781                         WARN_ON_ONCE(parentid != TC_H_ROOT);
782                         break;
783                 }
784                 cops = sch->ops->cl_ops;
785                 if (notify && cops->qlen_notify) {
786                         cl = cops->find(sch, parentid);
787                         cops->qlen_notify(sch, cl);
788                 }
                    /* Adjust the parent's accounting for the removed work. */
789                 sch->q.qlen -= n;
790                 sch->qstats.backlog -= len;
791                 __qdisc_qstats_drop(sch, drops);
792         }
793         rcu_read_unlock();
794 }
795 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
796
/* Fill an RTM_{NEW,DEL}QDISC netlink message describing @q into @skb.
 * Returns skb->len on success, or -1 after trimming the partial
 * message back to its starting point.
 */
797 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
798                          u32 portid, u32 seq, u16 flags, int event)
799 {
800         struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
801         struct gnet_stats_queue __percpu *cpu_qstats = NULL;
802         struct tcmsg *tcm;
803         struct nlmsghdr  *nlh;
804         unsigned char *b = skb_tail_pointer(skb);
805         struct gnet_dump d;
806         struct qdisc_size_table *stab;
807         u32 block_index;
808         __u32 qlen;
809
810         cond_resched();
811         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
812         if (!nlh)
813                 goto out_nlmsg_trim;
814         tcm = nlmsg_data(nlh);
815         tcm->tcm_family = AF_UNSPEC;
816         tcm->tcm__pad1 = 0;
817         tcm->tcm__pad2 = 0;
818         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
819         tcm->tcm_parent = clid;
820         tcm->tcm_handle = q->handle;
            /* tcm_info carries the current qdisc reference count. */
821         tcm->tcm_info = refcount_read(&q->refcnt);
822         if (nla_put_string(skb, TCA_KIND, q->ops->id))
823                 goto nla_put_failure;
            /* Report shared ingress/egress filter block ids, if any. */
824         if (q->ops->ingress_block_get) {
825                 block_index = q->ops->ingress_block_get(q);
826                 if (block_index &&
827                     nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
828                         goto nla_put_failure;
829         }
830         if (q->ops->egress_block_get) {
831                 block_index = q->ops->egress_block_get(q);
832                 if (block_index &&
833                     nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
834                         goto nla_put_failure;
835         }
836         if (q->ops->dump && q->ops->dump(q, skb) < 0)
837                 goto nla_put_failure;
838         if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
839                 goto nla_put_failure;
840         qlen = qdisc_qlen_sum(q);
841
842         stab = rtnl_dereference(q->stab);
843         if (stab && qdisc_dump_stab(skb, stab) < 0)
844                 goto nla_put_failure;
845
846         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
847                                          NULL, &d, TCA_PAD) < 0)
848                 goto nla_put_failure;
849
850         if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
851                 goto nla_put_failure;
852
853         if (qdisc_is_percpu_stats(q)) {
854                 cpu_bstats = q->cpu_bstats;
855                 cpu_qstats = q->cpu_qstats;
856         }
857
858         if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
859                                   &d, cpu_bstats, &q->bstats) < 0 ||
860             gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
861             gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
862                 goto nla_put_failure;
863
864         if (gnet_stats_finish_copy(&d) < 0)
865                 goto nla_put_failure;
866
            /* Message complete: patch in the final length. */
867         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
868         return skb->len;
869
870 out_nlmsg_trim:
871 nla_put_failure:
872         nlmsg_trim(skb, b);
873         return -1;
874 }
875
876 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
877 {
878         if (q->flags & TCQ_F_BUILTIN)
879                 return true;
880         if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
881                 return true;
882
883         return false;
884 }
885
/* Send RTM_DELQDISC for @old and/or RTM_NEWQDISC for @new to the
 * RTNLGRP_TC group (and echo to the requester per NLM_F_ECHO).
 * Qdiscs excluded from dumps (builtin/invisible) are skipped; if
 * nothing ends up in the message, -EINVAL is returned.
 */
886 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
887                         struct nlmsghdr *n, u32 clid,
888                         struct Qdisc *old, struct Qdisc *new)
889 {
890         struct sk_buff *skb;
891         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
892
893         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
894         if (!skb)
895                 return -ENOBUFS;
896
897         if (old && !tc_qdisc_dump_ignore(old, false)) {
898                 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
899                                   0, RTM_DELQDISC) < 0)
900                         goto err_out;
901         }
902         if (new && !tc_qdisc_dump_ignore(new, false)) {
903                 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
904                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
905                         goto err_out;
906         }
907
908         if (skb->len)
909                 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
910                                       n->nlmsg_flags & NLM_F_ECHO);
911
912 err_out:
913         kfree_skb(skb);
914         return -EINVAL;
915 }
916
917 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
918                                struct nlmsghdr *n, u32 clid,
919                                struct Qdisc *old, struct Qdisc *new)
920 {
921         if (new || old)
922                 qdisc_notify(net, skb, n, clid, old, new);
923
924         if (old)
925                 qdisc_destroy(old);
926 }
927
928 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
929  * to device "dev".
930  *
931  * When appropriate send a netlink notification using 'skb'
932  * and "n".
933  *
934  * On success, destroy old qdisc.
935  */
936
static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old,
		       struct netlink_ext_ack *extack)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);
	int err = 0;

	if (parent == NULL) {
		/* Grafting at the device root (or ingress): replace the
		 * qdisc on every relevant netdev queue.
		 */
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			/* Ingress has exactly one queue. */
			num_q = 1;
			ingress = 1;
			if (!dev_ingress_queue(dev)) {
				NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
				return -ENOENT;
			}
		}

		/* Quiesce the device while swapping qdiscs. */
		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		/* Qdiscs with an ->attach hook distribute themselves over
		 * the queues; skip the per-queue grafting loop.
		 */
		if (new && new->ops->attach)
			goto skip;

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = dev_ingress_queue(dev);

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			/* One reference per additional queue sharing "new". */
			if (new && i > 0)
				qdisc_refcount_inc(new);

			if (!ingress)
				qdisc_destroy(old);
		}

skip:
		if (!ingress) {
			notify_and_destroy(net, skb, n, classid,
					   dev->qdisc, new);
			if (new && !new->ops->attach)
				qdisc_refcount_inc(new);
			dev->qdisc = new ? : &noop_qdisc;

			if (new && new->ops->attach)
				new->ops->attach(new);
		} else {
			notify_and_destroy(net, skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		/* Grafting into a class of an existing qdisc. */
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;

		/* Only support running class lockless if parent is lockless */
		if (new && (new->flags & TCQ_F_NOLOCK) &&
		    parent && !(parent->flags & TCQ_F_NOLOCK))
			new->flags &= ~TCQ_F_NOLOCK;

		err = -EOPNOTSUPP;
		if (cops && cops->graft) {
			unsigned long cl = cops->find(parent, classid);

			if (cl) {
				err = cops->graft(parent, cl, new, &old,
						  extack);
			} else {
				NL_SET_ERR_MSG(extack, "Specified class not found");
				err = -ENOENT;
			}
		}
		if (!err)
			notify_and_destroy(net, skb, n, classid, old, new);
	}
	return err;
}
1022
1023 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1024                                    struct netlink_ext_ack *extack)
1025 {
1026         u32 block_index;
1027
1028         if (tca[TCA_INGRESS_BLOCK]) {
1029                 block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1030
1031                 if (!block_index) {
1032                         NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1033                         return -EINVAL;
1034                 }
1035                 if (!sch->ops->ingress_block_set) {
1036                         NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1037                         return -EOPNOTSUPP;
1038                 }
1039                 sch->ops->ingress_block_set(sch, block_index);
1040         }
1041         if (tca[TCA_EGRESS_BLOCK]) {
1042                 block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1043
1044                 if (!block_index) {
1045                         NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1046                         return -EINVAL;
1047                 }
1048                 if (!sch->ops->egress_block_set) {
1049                         NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1050                         return -EOPNOTSUPP;
1051                 }
1052                 sch->ops->egress_block_set(sch, block_index);
1053         }
1054         return 0;
1055 }
1056
1057 /* lockdep annotation is needed for ingress; egress gets it only for name */
1058 static struct lock_class_key qdisc_tx_lock;
1059 static struct lock_class_key qdisc_rx_lock;
1060
1061 /*
1062    Allocate and initialize new qdisc.
1063
1064    Parameters are passed via opt.
1065  */
1066
1067 static struct Qdisc *qdisc_create(struct net_device *dev,
1068                                   struct netdev_queue *dev_queue,
1069                                   struct Qdisc *p, u32 parent, u32 handle,
1070                                   struct nlattr **tca, int *errp,
1071                                   struct netlink_ext_ack *extack)
1072 {
1073         int err;
1074         struct nlattr *kind = tca[TCA_KIND];
1075         struct Qdisc *sch;
1076         struct Qdisc_ops *ops;
1077         struct qdisc_size_table *stab;
1078
1079         ops = qdisc_lookup_ops(kind);
1080 #ifdef CONFIG_MODULES
1081         if (ops == NULL && kind != NULL) {
1082                 char name[IFNAMSIZ];
1083                 if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
1084                         /* We dropped the RTNL semaphore in order to
1085                          * perform the module load.  So, even if we
1086                          * succeeded in loading the module we have to
1087                          * tell the caller to replay the request.  We
1088                          * indicate this using -EAGAIN.
1089                          * We replay the request because the device may
1090                          * go away in the mean time.
1091                          */
1092                         rtnl_unlock();
1093                         request_module("sch_%s", name);
1094                         rtnl_lock();
1095                         ops = qdisc_lookup_ops(kind);
1096                         if (ops != NULL) {
1097                                 /* We will try again qdisc_lookup_ops,
1098                                  * so don't keep a reference.
1099                                  */
1100                                 module_put(ops->owner);
1101                                 err = -EAGAIN;
1102                                 goto err_out;
1103                         }
1104                 }
1105         }
1106 #endif
1107
1108         err = -ENOENT;
1109         if (!ops) {
1110                 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1111                 goto err_out;
1112         }
1113
1114         sch = qdisc_alloc(dev_queue, ops, extack);
1115         if (IS_ERR(sch)) {
1116                 err = PTR_ERR(sch);
1117                 goto err_out2;
1118         }
1119
1120         sch->parent = parent;
1121
1122         if (handle == TC_H_INGRESS) {
1123                 sch->flags |= TCQ_F_INGRESS;
1124                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
1125                 lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
1126         } else {
1127                 if (handle == 0) {
1128                         handle = qdisc_alloc_handle(dev);
1129                         err = -ENOMEM;
1130                         if (handle == 0)
1131                                 goto err_out3;
1132                 }
1133                 lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
1134                 if (!netif_is_multiqueue(dev))
1135                         sch->flags |= TCQ_F_ONETXQUEUE;
1136         }
1137
1138         sch->handle = handle;
1139
1140         /* This exist to keep backward compatible with a userspace
1141          * loophole, what allowed userspace to get IFF_NO_QUEUE
1142          * facility on older kernels by setting tx_queue_len=0 (prior
1143          * to qdisc init), and then forgot to reinit tx_queue_len
1144          * before again attaching a qdisc.
1145          */
1146         if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
1147                 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
1148                 netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
1149         }
1150
1151         err = qdisc_block_indexes_set(sch, tca, extack);
1152         if (err)
1153                 goto err_out3;
1154
1155         if (ops->init) {
1156                 err = ops->init(sch, tca[TCA_OPTIONS], extack);
1157                 if (err != 0)
1158                         goto err_out5;
1159         }
1160
1161         if (tca[TCA_STAB]) {
1162                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1163                 if (IS_ERR(stab)) {
1164                         err = PTR_ERR(stab);
1165                         goto err_out4;
1166                 }
1167                 rcu_assign_pointer(sch->stab, stab);
1168         }
1169         if (tca[TCA_RATE]) {
1170                 seqcount_t *running;
1171
1172                 err = -EOPNOTSUPP;
1173                 if (sch->flags & TCQ_F_MQROOT) {
1174                         NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
1175                         goto err_out4;
1176                 }
1177
1178                 if (sch->parent != TC_H_ROOT &&
1179                     !(sch->flags & TCQ_F_INGRESS) &&
1180                     (!p || !(p->flags & TCQ_F_MQROOT)))
1181                         running = qdisc_root_sleeping_running(sch);
1182                 else
1183                         running = &sch->running;
1184
1185                 err = gen_new_estimator(&sch->bstats,
1186                                         sch->cpu_bstats,
1187                                         &sch->rate_est,
1188                                         NULL,
1189                                         running,
1190                                         tca[TCA_RATE]);
1191                 if (err) {
1192                         NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
1193                         goto err_out4;
1194                 }
1195         }
1196
1197         qdisc_hash_add(sch, false);
1198
1199         return sch;
1200
1201 err_out5:
1202         /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
1203         if (ops->destroy)
1204                 ops->destroy(sch);
1205 err_out3:
1206         dev_put(dev);
1207         qdisc_free(sch);
1208 err_out2:
1209         module_put(ops->owner);
1210 err_out:
1211         *errp = err;
1212         return NULL;
1213
1214 err_out4:
1215         /*
1216          * Any broken qdiscs that would require a ops->reset() here?
1217          * The qdisc was never in action so it shouldn't be necessary.
1218          */
1219         qdisc_put_stab(rtnl_dereference(sch->stab));
1220         if (ops->destroy)
1221                 ops->destroy(sch);
1222         goto err_out3;
1223 }
1224
/* Apply a change request to an existing qdisc: new options, a new size
 * table and/or a replacement rate estimator.
 */
static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
			struct netlink_ext_ack *extack)
{
	struct qdisc_size_table *ostab, *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (!sch->ops->change) {
			NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
			return -EINVAL;
		}
		if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
			NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
			return -EOPNOTSUPP;
		}
		err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	/* Publish the new size table (NULL when none was given) and
	 * release the previous one.
	 */
	ostab = rtnl_dereference(sch->stab);
	rcu_assign_pointer(sch->stab, stab);
	qdisc_put_stab(ostab);

	if (tca[TCA_RATE]) {
		/* NB: ignores errors from replace_estimator
		   because change can't be undone. */
		if (sch->flags & TCQ_F_MQROOT)
			goto out;
		gen_replace_estimator(&sch->bstats,
				      sch->cpu_bstats,
				      &sch->rate_est,
				      NULL,
				      qdisc_root_sleeping_running(sch),
				      tca[TCA_RATE]);
	}
out:
	return 0;
}
1270
/* Walker state shared between check_loop() and check_loop_fn(), used to
 * detect whether grafting qdisc "p" below some qdisc would form a cycle.
 */
struct check_loop_arg {
	struct qdisc_walker	w;
	struct Qdisc		*p;	/* qdisc being grafted; seeing it again means a loop */
	int			depth;	/* recursion depth, bounded in check_loop_fn() */
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl,
			 struct qdisc_walker *w);
1279
1280 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1281 {
1282         struct check_loop_arg   arg;
1283
1284         if (q->ops->cl_ops == NULL)
1285                 return 0;
1286
1287         arg.w.stop = arg.w.skip = arg.w.count = 0;
1288         arg.w.fn = check_loop_fn;
1289         arg.depth = depth;
1290         arg.p = p;
1291         q->ops->cl_ops->walk(q, &arg.w);
1292         return arg.w.stop ? -ELOOP : 0;
1293 }
1294
1295 static int
1296 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1297 {
1298         struct Qdisc *leaf;
1299         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1300         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1301
1302         leaf = cops->leaf(q, cl);
1303         if (leaf) {
1304                 if (leaf == arg->p || arg->depth > 7)
1305                         return -ELOOP;
1306                 return check_loop(leaf, arg->p, arg->depth + 1);
1307         }
1308         return 0;
1309 }
1310
/* Netlink attribute policy for RTM_*QDISC / RTM_*TCLASS requests. */
const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
	[TCA_KIND]		= { .type = NLA_NUL_STRING,
				    .len = IFNAMSIZ - 1 },
	[TCA_RATE]		= { .type = NLA_BINARY,
				    .len = sizeof(struct tc_estimator) },
	[TCA_STAB]		= { .type = NLA_NESTED },
	[TCA_DUMP_INVISIBLE]	= { .type = NLA_FLAG },
	[TCA_CHAIN]		= { .type = NLA_U32 },
	[TCA_INGRESS_BLOCK]	= { .type = NLA_U32 },
	[TCA_EGRESS_BLOCK]	= { .type = NLA_U32 },
};
1322
1323 /*
1324  * Delete/get qdisc.
1325  */
1326
/* Handle RTM_DELQDISC and RTM_GETQDISC requests. */
static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	/* Deletion requires CAP_NET_ADMIN; a plain GET does not. */
	if ((n->nlmsg_type != RTM_GETQDISC) &&
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
			  extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	clid = tcm->tcm_parent;
	if (clid) {
		/* Locate q via its parent classid (root/ingress/class). */
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}
		if (!q) {
			NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
			return -ENOENT;
		}

		if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Invalid handle");
			return -EINVAL;
		}
	} else {
		/* No parent given: locate q directly by handle. */
		q = qdisc_lookup(dev, tcm->tcm_handle);
		if (!q) {
			NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
			return -ENOENT;
		}
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
		return -EINVAL;
	}

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid) {
			NL_SET_ERR_MSG(extack, "Classid cannot be zero");
			return -EINVAL;
		}
		if (q->handle == 0) {
			NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
			return -ENOENT;
		}
		/* Graft NULL in q's place: notifies and destroys q. */
		err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
		if (err != 0)
			return err;
	} else {
		qdisc_notify(net, skb, n, clid, NULL, q);
	}
	return 0;
}
1407
1408 /*
1409  * Create/change qdisc.
1410  */
1411
/* Handle RTM_NEWQDISC: create, replace or change a qdisc depending on
 * the NLM_F_CREATE/REPLACE/EXCL flags and what already exists.
 */
static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			   struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm;
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q, *p;
	int err;

	if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

replay:
	/* Reinit, just in case something touches this. */
	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
			  extack);
	if (err < 0)
		return err;

	tcm = nlmsg_data(n);
	clid = tcm->tcm_parent;
	q = p = NULL;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;


	if (clid) {
		/* Resolve parent p and current child q from the classid. */
		if (clid != TC_H_ROOT) {
			if (clid != TC_H_INGRESS) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue_create(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = dev->qdisc;
		}

		/* It may be default qdisc, ignore it */
		if (q && q->handle == 0)
			q = NULL;

		if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
			if (tcm->tcm_handle) {
				if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
					NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
					return -EEXIST;
				}
				if (TC_H_MIN(tcm->tcm_handle)) {
					NL_SET_ERR_MSG(extack, "Invalid minor handle");
					return -EINVAL;
				}
				q = qdisc_lookup(dev, tcm->tcm_handle);
				if (!q)
					goto create_n_graft;
				if (n->nlmsg_flags & NLM_F_EXCL) {
					NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
					return -EEXIST;
				}
				if (tca[TCA_KIND] &&
				    nla_strcmp(tca[TCA_KIND], q->ops->id)) {
					NL_SET_ERR_MSG(extack, "Invalid qdisc name");
					return -EINVAL;
				}
				/* Grafting q under p must not create a cycle. */
				if (q == p ||
				    (p && check_loop(q, p, 0))) {
					NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
					return -ELOOP;
				}
				qdisc_refcount_inc(q);
				goto graft;
			} else {
				if (!q)
					goto create_n_graft;

				/* This magic test requires explanation.
				 *
				 *   We know, that some child q is already
				 *   attached to this parent and have choice:
				 *   either to change it or to create/graft new one.
				 *
				 *   1. We are allowed to create/graft only
				 *   if CREATE and REPLACE flags are set.
				 *
				 *   2. If EXCL is set, requestor wanted to say,
				 *   that qdisc tcm_handle is not expected
				 *   to exist, so that we choose create/graft too.
				 *
				 *   3. The last case is when no flags are set.
				 *   Alas, it is sort of hole in API, we
				 *   cannot decide what to do unambiguously.
				 *   For now we select create/graft, if
				 *   user gave KIND, which does not match existing.
				 */
				if ((n->nlmsg_flags & NLM_F_CREATE) &&
				    (n->nlmsg_flags & NLM_F_REPLACE) &&
				    ((n->nlmsg_flags & NLM_F_EXCL) ||
				     (tca[TCA_KIND] &&
				      nla_strcmp(tca[TCA_KIND], q->ops->id))))
					goto create_n_graft;
			}
		}
	} else {
		if (!tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Handle cannot be zero");
			return -EINVAL;
		}
		q = qdisc_lookup(dev, tcm->tcm_handle);
	}

	/* Change qdisc parameters */
	if (!q) {
		NL_SET_ERR_MSG(extack, "Specified qdisc not found");
		return -ENOENT;
	}
	if (n->nlmsg_flags & NLM_F_EXCL) {
		NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
		return -EEXIST;
	}
	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
		return -EINVAL;
	}
	err = qdisc_change(q, tca, extack);
	if (err == 0)
		qdisc_notify(net, skb, n, clid, NULL, q);
	return err;

create_n_graft:
	if (!(n->nlmsg_flags & NLM_F_CREATE)) {
		NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
		return -ENOENT;
	}
	if (clid == TC_H_INGRESS) {
		if (dev_ingress_queue(dev)) {
			q = qdisc_create(dev, dev_ingress_queue(dev), p,
					 tcm->tcm_parent, tcm->tcm_parent,
					 tca, &err, extack);
		} else {
			NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
			err = -ENOENT;
		}
	} else {
		struct netdev_queue *dev_queue;

		/* Let a classful parent pick the tx queue if it can. */
		if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
			dev_queue = p->ops->cl_ops->select_queue(p, tcm);
		else if (p)
			dev_queue = p->dev_queue;
		else
			dev_queue = netdev_get_tx_queue(dev, 0);

		q = qdisc_create(dev, dev_queue, p,
				 tcm->tcm_parent, tcm->tcm_handle,
				 tca, &err, extack);
	}
	if (q == NULL) {
		/* -EAGAIN: a module was loaded, replay under fresh RTNL. */
		if (err == -EAGAIN)
			goto replay;
		return err;
	}

graft:
	err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
	if (err) {
		if (q)
			qdisc_destroy(q);
		return err;
	}

	return 0;
}
1592
/* Dump "root" itself and, when "recur" is set, every qdisc in the
 * device's qdisc hashtable.  *q_idx_p is the running dump index used
 * for netlink dump resumption; s_q_idx is where to resume from.
 * Returns 0 on completion, -1 when the skb filled up.
 */
static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx, bool recur,
			      bool dump_invisible)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

	/* If dumping singletons, there is no qdisc_dev(root) and the singleton
	 * itself has already been dumped.
	 *
	 * If we've already dumped the top-level (ingress) qdisc above and the global
	 * qdisc hashtable, we don't want to hit it again
	 */
	if (!qdisc_dev(root) || !recur)
		goto out;

	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	ret = -1;
	goto out;
}
1646
/* Netlink dump handler for RTM_GETQDISC: walk every netdev in the
 * namespace, dumping its root hierarchy and its ingress qdisc.
 * cb->args[0]/args[1] store the device and qdisc resume indexes.
 */
static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;
	const struct nlmsghdr *nlh = cb->nlh;
	struct nlattr *tca[TCA_MAX + 1];
	int err;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];

	idx = 0;
	ASSERT_RTNL();

	err = nlmsg_parse(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
			  rtm_tca_policy, NULL);
	if (err < 0)
		return err;

	for_each_netdev(net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		if (tc_dump_qdisc_root(dev->qdisc, skb, cb, &q_idx, s_q_idx,
				       true, tca[TCA_DUMP_INVISIBLE]) < 0)
			goto done;

		dev_queue = dev_ingress_queue(dev);
		if (dev_queue &&
		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
				       &q_idx, s_q_idx, false,
				       tca[TCA_DUMP_INVISIBLE]) < 0)
			goto done;

cont:
		idx++;
	}

done:
	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}
1698
1699
1700
1701 /************************************************
1702  *      Traffic classes manipulation.           *
1703  ************************************************/
1704
/* Build one netlink message of type @event (RTM_NEWTCLASS/RTM_DELTCLASS)
 * describing class @cl of qdisc @q into @skb.  The class-specific
 * attributes and statistics are delegated to the qdisc's class ops.
 * Returns skb->len on success, or -1 if the message does not fit, in
 * which case the partially built message is trimmed away.
 */
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl,
			  u32 portid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);	/* rollback point */
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	/* Let the qdisc fill class-specific fields (incl. tcm_handle). */
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
1750
1751 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1752                          struct nlmsghdr *n, struct Qdisc *q,
1753                          unsigned long cl, int event)
1754 {
1755         struct sk_buff *skb;
1756         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1757
1758         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1759         if (!skb)
1760                 return -ENOBUFS;
1761
1762         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1763                 kfree_skb(skb);
1764                 return -EINVAL;
1765         }
1766
1767         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1768                               n->nlmsg_flags & NLM_F_ECHO);
1769 }
1770
/* Delete class @cl of qdisc @q via the class ops and notify userspace
 * with RTM_DELTCLASS.  The notification skb is filled *before*
 * ->delete() runs so the message can still dump the class's final
 * state; it is discarded again if the delete itself fails.
 */
static int tclass_del_notify(struct net *net,
			     const struct Qdisc_class_ops *cops,
			     struct sk_buff *oskb, struct nlmsghdr *n,
			     struct Qdisc *q, unsigned long cl)
{
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
	struct sk_buff *skb;
	int err = 0;

	if (!cops->delete)
		return -EOPNOTSUPP;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
			   RTM_DELTCLASS) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	err = cops->delete(q, cl);
	if (err) {
		kfree_skb(skb);
		return err;
	}

	return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
			      n->nlmsg_flags & NLM_F_ECHO);
}
1802
1803 #ifdef CONFIG_NET_CLS
1804
/* Arguments threaded through the tcf_node_bind() filter walk. */
struct tcf_bind_args {
	struct tcf_walker w;	/* must be first: cast back in the callback */
	unsigned long base;	/* internal handle of the walked class */
	unsigned long cl;	/* internal class to bind to (0 = unbind) */
	u32 classid;		/* user-visible classid being re-bound */
};
1811
/* tcf_walker callback for tc_bind_tclass(): re-bind one filter result
 * node to the class carried in the walk arguments.  Takes the qdisc
 * tree lock around ->bind_class() so the rebind does not race with the
 * datapath.  Always returns 0 so the walk continues over all nodes.
 */
static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
{
	struct tcf_bind_args *a = (void *)arg;

	if (tp->ops->bind_class) {
		struct Qdisc *q = tcf_block_q(tp->chain->block);

		sch_tree_lock(q);
		tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
		sch_tree_unlock(q);
	}
	return 0;
}
1825
/* Walk all filters attached to class @portid of qdisc @q and re-bind
 * results that select classid @clid to the internal class @new_cl
 * (new_cl == 0 unbinds; used when the class is being deleted).
 * Caller holds RTNL, which is what makes rtnl_dereference() safe here.
 */
static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
			   unsigned long new_cl)
{
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct tcf_block *block;
	struct tcf_chain *chain;
	unsigned long cl;

	cl = cops->find(q, portid);
	if (!cl)
		return;
	if (!cops->tcf_block)
		return;
	block = cops->tcf_block(q, cl, NULL);
	if (!block)
		return;
	/* Visit every filter on every chain of the class's block. */
	list_for_each_entry(chain, &block->chain_list, list) {
		struct tcf_proto *tp;

		for (tp = rtnl_dereference(chain->filter_chain);
		     tp; tp = rtnl_dereference(tp->next)) {
			struct tcf_bind_args arg = {};

			arg.w.fn = tcf_node_bind;
			arg.classid = clid;
			arg.base = cl;
			arg.cl = new_cl;
			tp->ops->walk(tp, &arg.w);
		}
	}
}
1857
1858 #else
1859
/* No-op stub when the kernel is built without CONFIG_NET_CLS. */
static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
			   unsigned long new_cl)
{
}
1864
1865 #endif
1866
/* Handle RTM_{NEW,DEL,GET}TCLASS requests: create/modify, delete, or
 * retrieve a single traffic class.  Resolves the owning qdisc from the
 * tcm_parent/tcm_handle majors (see table below), then dispatches on
 * the message type.  Called via rtnetlink with RTNL held.
 */
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
			 struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;		/* internal handle of existing class */
	unsigned long new_cl;
	u32 portid;			/* parent classid (tcm_parent) */
	u32 clid;			/* classid of the class itself */
	u32 qid;			/* major = handle of owning qdisc */
	int err;

	/* Only GET is allowed without CAP_NET_ADMIN. */
	if ((n->nlmsg_type != RTM_GETTCLASS) &&
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse(n, sizeof(*tcm), tca, TCA_MAX, rtm_tca_policy,
			  extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0         - parent is root class.
	   parent == X:Y         - parent is a node in hierarchy.
	   parent == 0:Y         - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0         - generate handle from kernel pool.
	   handle == 0:Y         - class is X:Y, where X:0 is qdisc.
	   handle == X:Y         - clear.
	   handle == X:0         - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

	portid = tcm->tcm_parent;
	clid = tcm->tcm_handle;
	qid = TC_H_MAJ(clid);

	if (portid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(portid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = dev->qdisc->handle;

		/* Now qid is genuine qdisc handle consistent
		 * both with parent and child.
		 *
		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
		 */
		if (portid)
			portid = TC_H_MAKE(qid, portid);
	} else {
		if (qid == 0)
			qid = dev->qdisc->handle;
	}

	/* OK. Locate qdisc */
	q = qdisc_lookup(dev, qid);
	if (!q)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (portid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->find(q, clid);

	if (cl == 0) {
		/* Class does not exist: only NEW with NLM_F_CREATE may
		 * proceed (to create it below).
		 */
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS ||
		    !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = tclass_del_notify(net, cops, skb, n, q, cl);
			/* Unbind the class from its filters (bind to 0) */
			tc_bind_tclass(q, portid, clid, 0);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
		NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
		return -EOPNOTSUPP;
	}

	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, portid, tca, &new_cl, extack);
	if (err == 0) {
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
		/* We just created a new class; do the reverse binding. */
		if (cl != new_cl)
			tc_bind_tclass(q, portid, clid, new_cl);
	}
out:
	return err;
}
2003
/* State threaded through qdisc_class_dump() via the class walker. */
struct qdisc_dump_args {
	struct qdisc_walker	w;	/* must be first: cast back in callback */
	struct sk_buff		*skb;	/* dump skb being filled */
	struct netlink_callback *cb;	/* netlink dump context */
};
2009
2010 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2011                             struct qdisc_walker *arg)
2012 {
2013         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2014
2015         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2016                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2017                               RTM_NEWTCLASS);
2018 }
2019
/* Dump all classes of one qdisc @q as RTM_NEWTCLASS messages.
 * @t_p counts qdiscs visited so far across the whole dump; qdiscs below
 * the resume point @s_t are skipped.  cb->args[1] carries the per-qdisc
 * class resume index.  Returns 0 to continue, -1 when the skb is full.
 */
static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	/* Skip invisible qdiscs, already-dumped ones, classless qdiscs,
	 * and qdiscs other than the explicitly requested parent, if any.
	 */
	if (tc_qdisc_dump_ignore(q, false) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	if (*t_p > s_t)
		/* Past the resume point: clear stale per-qdisc walk state. */
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop  = 0;
	arg.w.skip = cb->args[1];	/* classes already dumped last pass */
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}
2048
/* Dump classes of @root and, when @recur is set, of every qdisc hashed
 * under the same device.  When the request names a specific parent,
 * only that one qdisc is visited instead of the whole hash.
 * Returns 0 to continue the dump, -1 when the skb is full.
 */
static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t, bool recur)
{
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	if (!qdisc_dev(root) || !recur)
		return 0;

	if (tcm->tcm_parent) {
		/* A specific parent was requested: visit only that qdisc. */
		q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
		if (q && q != root &&
		    tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
		return 0;
	}
	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}
2079
/* Netlink dump handler for RTM_GETTCLASS: dump the classes of one
 * device's qdiscs (root hierarchy plus the ingress qdisc).  Resume
 * state is kept in cb->args[0] (qdisc index); cb->args[1] holds the
 * per-qdisc class index, managed by tc_dump_tclass_qdisc().
 */
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = nlmsg_data(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
		return 0;
	dev = dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return 0;

	s_t = cb->args[0];
	t = 0;

	if (tc_dump_tclass_root(dev->qdisc, skb, tcm, cb, &t, s_t, true) < 0)
		goto done;

	dev_queue = dev_ingress_queue(dev);
	if (dev_queue &&
	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
				&t, s_t, false) < 0)
		goto done;

done:
	cb->args[0] = t;

	/* Drop the reference taken by dev_get_by_index(). */
	dev_put(dev);
	return skb->len;
}
2112
2113 #ifdef CONFIG_PROC_FS
2114 static int psched_show(struct seq_file *seq, void *v)
2115 {
2116         seq_printf(seq, "%08x %08x %08x %08x\n",
2117                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2118                    1000000,
2119                    (u32)NSEC_PER_SEC / hrtimer_resolution);
2120
2121         return 0;
2122 }
2123
2124 static int __net_init psched_net_init(struct net *net)
2125 {
2126         struct proc_dir_entry *e;
2127
2128         e = proc_create_single("psched", 0, net->proc_net, psched_show);
2129         if (e == NULL)
2130                 return -ENOMEM;
2131
2132         return 0;
2133 }
2134
/* Remove the per-namespace /proc/net/psched entry. */
static void __net_exit psched_net_exit(struct net *net)
{
	remove_proc_entry("psched", net->proc_net);
}
2139 #else
/* Stubs when CONFIG_PROC_FS is disabled: no /proc/net/psched entry. */
static int __net_init psched_net_init(struct net *net)
{
	return 0;
}

static void __net_exit psched_net_exit(struct net *net)
{
}
2148 #endif
2149
/* Per-network-namespace setup/teardown of /proc/net/psched. */
static struct pernet_operations psched_net_ops = {
	.init = psched_net_init,
	.exit = psched_net_exit,
};
2154
/* Subsystem init: register the pernet /proc support first (so every
 * namespace gets /proc/net/psched), then the always-built qdiscs, then
 * the rtnetlink handlers for qdisc and class messages.
 */
static int __init pktsched_init(void)
{
	int err;

	err = register_pernet_subsys(&psched_net_ops);
	if (err) {
		pr_err("pktsched_init: "
		       "cannot initialize per netns operations\n");
		return err;
	}

	register_qdisc(&pfifo_fast_ops);
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	register_qdisc(&pfifo_head_drop_qdisc_ops);
	register_qdisc(&mq_qdisc_ops);
	register_qdisc(&noqueue_qdisc_ops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
		      0);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
		      0);

	return 0;
}
2184
2185 subsys_initcall(pktsched_init);