GNU Linux-libre 5.4.257-gnu1
[releases.git] / net / sched / sch_api.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_api.c  Packet scheduler API.
4  *
5  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6  *
7  * Fixes:
8  *
9  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
10  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
11  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
12  */
13
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <linux/init.h>
21 #include <linux/proc_fs.h>
22 #include <linux/seq_file.h>
23 #include <linux/kmod.h>
24 #include <linux/list.h>
25 #include <linux/hrtimer.h>
26 #include <linux/slab.h>
27 #include <linux/hashtable.h>
28
29 #include <net/net_namespace.h>
30 #include <net/sock.h>
31 #include <net/netlink.h>
32 #include <net/pkt_sched.h>
33 #include <net/pkt_cls.h>
34
35 /*
36
37    Short review.
38    -------------
39
40    This file consists of two interrelated parts:
41
42    1. queueing disciplines manager frontend.
43    2. traffic classes manager frontend.
44
45    Generally, queueing discipline ("qdisc") is a black box,
46    which is able to enqueue packets and to dequeue them (when
47    device is ready to send something) in order and at times
48    determined by algorithm hidden in it.
49
50    qdisc's are divided to two categories:
51    - "queues", which have no internal structure visible from outside.
52    - "schedulers", which split all the packets to "traffic classes",
53      using "packet classifiers" (look at cls_api.c)
54
55    In turn, classes may have child qdiscs (as rule, queues)
56    attached to them etc. etc. etc.
57
58    The goal of the routines in this file is to translate
59    information supplied by user in the form of handles
60    to more intelligible for kernel form, to make some sanity
61    checks and part of work, which is common to all qdiscs
62    and to provide rtnetlink notifications.
63
64    All real intelligent work is done inside qdisc modules.
65
66
67
68    Every discipline has two major routines: enqueue and dequeue.
69
70    ---dequeue
71
72    dequeue usually returns a skb to send. It is allowed to return NULL,
73    but it does not mean that queue is empty, it just means that
74    discipline does not want to send anything this time.
75    Queue is really empty if q->q.qlen == 0.
76    For complicated disciplines with multiple queues q->q is not
77    real packet queue, but however q->q.qlen must be valid.
78
79    ---enqueue
80
81    enqueue returns 0, if packet was enqueued successfully.
82    If packet (this one or another one) was dropped, it returns
83    not zero error code.
84    NET_XMIT_DROP        - this packet dropped
85      Expected action: do not backoff, but wait until queue will clear.
86    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
87      Expected action: backoff or ignore
88
89    Auxiliary routines:
90
91    ---peek
92
93    like dequeue but without removing a packet from the queue
94
95    ---reset
96
97    returns qdisc to initial state: purge all buffers, clear all
98    timers, counters (except for statistics) etc.
99
100    ---init
101
102    initializes newly created qdisc.
103
104    ---destroy
105
106    destroys resources allocated by init and during lifetime of qdisc.
107
108    ---change
109
110    changes qdisc parameters.
111  */
112
113 /* Protects list of registered TC modules. It is pure SMP lock. */
114 static DEFINE_RWLOCK(qdisc_mod_lock);
115
116
117 /************************************************
118  *      Queueing disciplines manipulation.      *
119  ************************************************/
120
121
122 /* The list of all installed queueing disciplines. */
123
124 static struct Qdisc_ops *qdisc_base;
125
126 /* Register/unregister queueing discipline */
127
/* Register a queueing discipline ops structure on the global list.
 *
 * Returns 0 on success, -EEXIST if an ops with the same id is already
 * registered, or -EINVAL if the ops structure is internally inconsistent.
 */
int register_qdisc(struct Qdisc_ops *qops)
{
	struct Qdisc_ops *q, **qp;
	int rc = -EEXIST;

	write_lock(&qdisc_mod_lock);
	/* Walk to the tail, rejecting duplicate ids; on normal exit *qp
	 * is the NULL tail link where the new ops will be appended.
	 */
	for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
		if (!strcmp(qops->id, q->id))
			goto out;

	/* Fill in noop defaults for optional datapath callbacks. */
	if (qops->enqueue == NULL)
		qops->enqueue = noop_qdisc_ops.enqueue;
	if (qops->peek == NULL) {
		/* A qdisc that implements dequeue must also implement peek. */
		if (qops->dequeue == NULL)
			qops->peek = noop_qdisc_ops.peek;
		else
			goto out_einval;
	}
	if (qops->dequeue == NULL)
		qops->dequeue = noop_qdisc_ops.dequeue;

	if (qops->cl_ops) {
		const struct Qdisc_class_ops *cops = qops->cl_ops;

		/* Classful qdiscs must provide the basic class operations. */
		if (!(cops->find && cops->walk && cops->leaf))
			goto out_einval;

		/* Exposing a tcf_block implies filter (un)bind support. */
		if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
			goto out_einval;
	}

	/* Append at the tail of the list. */
	qops->next = NULL;
	*qp = qops;
	rc = 0;
out:
	write_unlock(&qdisc_mod_lock);
	return rc;

out_einval:
	rc = -EINVAL;
	goto out;
}
EXPORT_SYMBOL(register_qdisc);
171
172 int unregister_qdisc(struct Qdisc_ops *qops)
173 {
174         struct Qdisc_ops *q, **qp;
175         int err = -ENOENT;
176
177         write_lock(&qdisc_mod_lock);
178         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
179                 if (q == qops)
180                         break;
181         if (q) {
182                 *qp = q->next;
183                 q->next = NULL;
184                 err = 0;
185         }
186         write_unlock(&qdisc_mod_lock);
187         return err;
188 }
189 EXPORT_SYMBOL(unregister_qdisc);
190
/* Get default qdisc if not otherwise specified.
 * Copies the current default qdisc id into @name (at most @len bytes,
 * always NUL-terminated by strlcpy).
 */
void qdisc_get_default(char *name, size_t len)
{
	/* Hold the mod lock so default_qdisc_ops cannot change mid-copy. */
	read_lock(&qdisc_mod_lock);
	strlcpy(name, default_qdisc_ops->id, len);
	read_unlock(&qdisc_mod_lock);
}
198
199 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
200 {
201         struct Qdisc_ops *q = NULL;
202
203         for (q = qdisc_base; q; q = q->next) {
204                 if (!strcmp(name, q->id)) {
205                         if (!try_module_get(q->owner))
206                                 q = NULL;
207                         break;
208                 }
209         }
210
211         return q;
212 }
213
/* Set new default qdisc to use.
 *
 * Returns 0 on success, -EPERM without CAP_NET_ADMIN, or -ENOENT if no
 * qdisc named @name can be found or loaded. On success the module
 * reference taken by qdisc_lookup_default() is kept by
 * default_qdisc_ops and the previous default's reference is dropped.
 */
int qdisc_set_default(const char *name)
{
	const struct Qdisc_ops *ops;

	if (!capable(CAP_NET_ADMIN))
		return -EPERM;

	write_lock(&qdisc_mod_lock);
	ops = qdisc_lookup_default(name);
	if (!ops) {
		/* Not found, drop lock and try to load module */
		write_unlock(&qdisc_mod_lock);
		request_module("sch_%s", name);
		write_lock(&qdisc_mod_lock);

		/* Retry: the module load may have registered the ops. */
		ops = qdisc_lookup_default(name);
	}

	if (ops) {
		/* Set new default */
		module_put(default_qdisc_ops->owner);
		default_qdisc_ops = ops;
	}
	write_unlock(&qdisc_mod_lock);

	return ops ? 0 : -ENOENT;
}
242
#ifdef CONFIG_NET_SCH_DEFAULT
/* Set default value from kernel config.
 * Runs at late_initcall time so built-in qdiscs are already registered.
 */
static int __init sch_default_qdisc(void)
{
	return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
}
late_initcall(sch_default_qdisc);
#endif
251
/* We know handle. Find qdisc among all qdisc's attached to device
 * (root qdisc, all its children, children of children etc.)
 * Note: caller either uses rtnl or rcu_read_lock()
 */

static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
{
	struct Qdisc *q;

	/* A root with no device (e.g. a builtin like noop) has no hash
	 * to search; it can only match itself.
	 */
	if (!qdisc_dev(root))
		return (root->handle == handle ? root : NULL);

	if (!(root->flags & TCQ_F_BUILTIN) &&
	    root->handle == handle)
		return root;

	/* Non-root qdiscs live in the per-device handle hash. */
	hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
		if (q->handle == handle)
			return q;
	}
	return NULL;
}
274
/* Insert @q into its device's handle hash so qdisc_lookup() can find it.
 * Root and ingress qdiscs are intentionally kept out of the hash.
 * Optionally marks the qdisc invisible to default dumps.
 */
void qdisc_hash_add(struct Qdisc *q, bool invisible)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
		ASSERT_RTNL();
		hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
		if (invisible)
			q->flags |= TCQ_F_INVISIBLE;
	}
}
EXPORT_SYMBOL(qdisc_hash_add);
285
/* Remove @q from its device's handle hash; mirror of qdisc_hash_add().
 * Root and ingress qdiscs were never hashed, so they are skipped here too.
 */
void qdisc_hash_del(struct Qdisc *q)
{
	if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
		ASSERT_RTNL();
		hash_del_rcu(&q->hash);
	}
}
EXPORT_SYMBOL(qdisc_hash_del);
294
/* Find a qdisc by handle on @dev, searching the egress tree first and
 * then the ingress queue. Uses rtnl_dereference, so callers hold RTNL.
 * Returns NULL for handle 0 or when nothing matches.
 */
struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
{
	struct Qdisc *q;

	if (!handle)
		return NULL;
	q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle);
	if (q)
		goto out;

	if (dev_ingress_queue(dev))
		q = qdisc_match_from_root(
			dev_ingress_queue(dev)->qdisc_sleeping,
			handle);
out:
	return q;
}
312
/* RCU-protected variant of qdisc_lookup(): same search order (egress
 * tree, then ingress queue) but safe under rcu_read_lock() instead of
 * RTNL.
 */
struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
{
	struct netdev_queue *nq;
	struct Qdisc *q;

	if (!handle)
		return NULL;
	q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle);
	if (q)
		goto out;

	nq = dev_ingress_queue_rcu(dev);
	if (nq)
		q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
out:
	return q;
}
330
331 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
332 {
333         unsigned long cl;
334         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
335
336         if (cops == NULL)
337                 return NULL;
338         cl = cops->find(p, classid);
339
340         if (cl == 0)
341                 return NULL;
342         return cops->leaf(p, cl);
343 }
344
345 /* Find queueing discipline by name */
346
347 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
348 {
349         struct Qdisc_ops *q = NULL;
350
351         if (kind) {
352                 read_lock(&qdisc_mod_lock);
353                 for (q = qdisc_base; q; q = q->next) {
354                         if (nla_strcmp(kind, q->id) == 0) {
355                                 if (!try_module_get(q->owner))
356                                         q = NULL;
357                                 break;
358                         }
359                 }
360                 read_unlock(&qdisc_mod_lock);
361         }
362         return q;
363 }
364
365 /* The linklayer setting were not transferred from iproute2, in older
366  * versions, and the rate tables lookup systems have been dropped in
367  * the kernel. To keep backward compatible with older iproute2 tc
368  * utils, we detect the linklayer setting by detecting if the rate
369  * table were modified.
370  *
371  * For linklayer ATM table entries, the rate table will be aligned to
372  * 48 bytes, thus some table entries will contain the same value.  The
373  * mpu (min packet unit) is also encoded into the old rate table, thus
374  * starting from the mpu, we find low and high table entries for
375  * mapping this cell.  If these entries contain the same value, when
376  * the rate tables have been modified for linklayer ATM.
377  *
378  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
379  * and then roundup to the next cell, calc the table entry one below,
380  * and compare.
381  */
/* Heuristically classify an old-style rate table as ATM or Ethernet.
 * See the block comment above for the full rationale: ATM-aligned
 * tables repeat values across adjacent 48-byte cells.
 */
static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
{
	/* Round mpu up to a 48-byte ATM cell boundary, then to the next
	 * cell, and map both byte sizes to rate-table slots.
	 */
	int low       = roundup(r->mpu, 48);
	int high      = roundup(low+1, 48);
	int cell_low  = low >> r->cell_log;
	int cell_high = (high >> r->cell_log) - 1;

	/* rtab is too inaccurate at rates > 100Mbit/s */
	if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
		pr_debug("TC linklayer: Giving up ATM detection\n");
		return TC_LINKLAYER_ETHERNET;
	}

	/* Equal entries for distinct cells indicate ATM alignment. */
	if ((cell_high > cell_low) && (cell_high < 256)
	    && (rtab[cell_low] == rtab[cell_high])) {
		pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
			 cell_low, cell_high, rtab[cell_high]);
		return TC_LINKLAYER_ATM;
	}
	return TC_LINKLAYER_ETHERNET;
}
403
/* Global cache of rate tables, shared via refcount across qdiscs. */
static struct qdisc_rate_table *qdisc_rtab_list;

/* Look up or create a refcounted rate table matching @r and the raw
 * table data in @tab. Returns NULL on invalid input or allocation
 * failure, with a message in @extack. May rewrite r->linklayer when it
 * was left TC_LINKLAYER_UNAWARE by an old iproute2.
 */
struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
					struct nlattr *tab,
					struct netlink_ext_ack *extack)
{
	struct qdisc_rate_table *rtab;

	/* cell_log >= 32 would shift out of range below; the attribute
	 * must carry exactly TC_RTAB_SIZE bytes of table data.
	 */
	if (tab == NULL || r->rate == 0 ||
	    r->cell_log == 0 || r->cell_log >= 32 ||
	    nla_len(tab) != TC_RTAB_SIZE) {
		NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
		return NULL;
	}

	/* Reuse an existing table if both spec and data match. */
	for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
		if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
		    !memcmp(&rtab->data, nla_data(tab), 1024)) {
			rtab->refcnt++;
			return rtab;
		}
	}

	rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
	if (rtab) {
		rtab->rate = *r;
		rtab->refcnt = 1;
		memcpy(rtab->data, nla_data(tab), 1024);
		if (r->linklayer == TC_LINKLAYER_UNAWARE)
			r->linklayer = __detect_linklayer(r, rtab->data);
		rtab->next = qdisc_rtab_list;
		qdisc_rtab_list = rtab;
	} else {
		NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
	}
	return rtab;
}
EXPORT_SYMBOL(qdisc_get_rtab);
442
443 void qdisc_put_rtab(struct qdisc_rate_table *tab)
444 {
445         struct qdisc_rate_table *rtab, **rtabp;
446
447         if (!tab || --tab->refcnt)
448                 return;
449
450         for (rtabp = &qdisc_rtab_list;
451              (rtab = *rtabp) != NULL;
452              rtabp = &rtab->next) {
453                 if (rtab == tab) {
454                         *rtabp = rtab->next;
455                         kfree(rtab);
456                         return;
457                 }
458         }
459 }
460 EXPORT_SYMBOL(qdisc_put_rtab);
461
/* Global list of shared, refcounted size tables. */
static LIST_HEAD(qdisc_stab_list);

/* Netlink policy for TCA_STAB nested attributes. */
static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE]	= { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};
468
/* Parse a TCA_STAB netlink attribute into a refcounted size table,
 * reusing an identical table from qdisc_stab_list when possible.
 * Returns an ERR_PTR on parse/validation/allocation failure.
 */
static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
					       struct netlink_ext_ack *extack)
{
	struct nlattr *tb[TCA_STAB_MAX + 1];
	struct qdisc_size_table *stab;
	struct tc_sizespec *s;
	unsigned int tsize = 0;
	u16 *tab = NULL;
	int err;

	err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
					  extack);
	if (err < 0)
		return ERR_PTR(err);
	if (!tb[TCA_STAB_BASE]) {
		NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
		return ERR_PTR(-EINVAL);
	}

	s = nla_data(tb[TCA_STAB_BASE]);

	/* A non-zero advertised table size requires the data attribute. */
	if (s->tsize > 0) {
		if (!tb[TCA_STAB_DATA]) {
			NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
			return ERR_PTR(-EINVAL);
		}
		tab = nla_data(tb[TCA_STAB_DATA]);
		tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
	}

	/* Advertised size must match the actual attribute payload. */
	if (tsize != s->tsize || (!tab && tsize > 0)) {
		NL_SET_ERR_MSG(extack, "Invalid size of size table");
		return ERR_PTR(-EINVAL);
	}

	/* Share an existing table if spec and data are both identical. */
	list_for_each_entry(stab, &qdisc_stab_list, list) {
		if (memcmp(&stab->szopts, s, sizeof(*s)))
			continue;
		if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
			continue;
		stab->refcnt++;
		return stab;
	}

	/* Bound the shift amounts used by __qdisc_calculate_pkt_len(). */
	if (s->size_log > STAB_SIZE_LOG_MAX ||
	    s->cell_log > STAB_SIZE_LOG_MAX) {
		NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table");
		return ERR_PTR(-EINVAL);
	}

	stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
	if (!stab)
		return ERR_PTR(-ENOMEM);

	stab->refcnt = 1;
	stab->szopts = *s;
	if (tsize > 0)
		memcpy(stab->data, tab, tsize * sizeof(u16));

	list_add_tail(&stab->list, &qdisc_stab_list);

	return stab;
}
532
533 void qdisc_put_stab(struct qdisc_size_table *tab)
534 {
535         if (!tab)
536                 return;
537
538         if (--tab->refcnt == 0) {
539                 list_del(&tab->list);
540                 kfree_rcu(tab, rcu);
541         }
542 }
543 EXPORT_SYMBOL(qdisc_put_stab);
544
/* Emit a nested TCA_STAB attribute describing @stab into @skb.
 * Only the base sizespec is dumped, not the raw table data.
 * Returns skb->len on success, -1 if the skb ran out of room.
 */
static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
{
	struct nlattr *nest;

	nest = nla_nest_start_noflag(skb, TCA_STAB);
	if (nest == NULL)
		goto nla_put_failure;
	if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
		goto nla_put_failure;
	nla_nest_end(skb, nest);

	return skb->len;

nla_put_failure:
	return -1;
}
561
/* Compute the size-table-adjusted packet length for @skb and store it
 * in the qdisc cb. Applies the overhead, rounds via the cell table,
 * and clamps the result to at least 1.
 */
void __qdisc_calculate_pkt_len(struct sk_buff *skb,
			       const struct qdisc_size_table *stab)
{
	int pkt_len, slot;

	pkt_len = skb->len + stab->szopts.overhead;
	/* No table data: only the overhead adjustment applies. */
	if (unlikely(!stab->szopts.tsize))
		goto out;

	slot = pkt_len + stab->szopts.cell_align;
	if (unlikely(slot < 0))
		slot = 0;

	slot >>= stab->szopts.cell_log;
	if (likely(slot < stab->szopts.tsize))
		pkt_len = stab->data[slot];
	else
		/* Beyond the table: extrapolate from the last entry plus
		 * the wrapped remainder slot.
		 */
		pkt_len = stab->data[stab->szopts.tsize - 1] *
				(slot / stab->szopts.tsize) +
				stab->data[slot % stab->szopts.tsize];

	pkt_len <<= stab->szopts.size_log;
out:
	if (unlikely(pkt_len < 1))
		pkt_len = 1;
	qdisc_skb_cb(skb)->pkt_len = pkt_len;
}
EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
590
591 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
592 {
593         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
594                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
595                         txt, qdisc->ops->id, qdisc->handle >> 16);
596                 qdisc->flags |= TCQ_F_WARN_NONWC;
597         }
598 }
599 EXPORT_SYMBOL(qdisc_warn_nonwc);
600
/* hrtimer callback: kick the root qdisc of the watchdog's qdisc so the
 * datapath is rescheduled when a timed qdisc becomes ready again.
 */
static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
{
	struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
						 timer);

	rcu_read_lock();
	__netif_schedule(qdisc_root(wd->qdisc));
	rcu_read_unlock();

	return HRTIMER_NORESTART;
}
612
/* Initialize a qdisc watchdog timer on the given clock, bound to @qdisc.
 * The timer is absolute and pinned to the CPU that arms it.
 */
void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
				 clockid_t clockid)
{
	hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
	wd->timer.function = qdisc_watchdog;
	wd->qdisc = qdisc;
}
EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
621
/* Convenience wrapper: initialize a watchdog on CLOCK_MONOTONIC. */
void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
}
EXPORT_SYMBOL(qdisc_watchdog_init);
627
/* Arm the watchdog to fire at absolute time @expires (nanoseconds).
 * Skipped when the root qdisc is deactivated, or when the requested
 * expiry equals the one already programmed.
 */
void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
{
	if (test_bit(__QDISC_STATE_DEACTIVATED,
		     &qdisc_root_sleeping(wd->qdisc)->state))
		return;

	if (wd->last_expires == expires)
		return;

	wd->last_expires = expires;
	hrtimer_start(&wd->timer,
		      ns_to_ktime(expires),
		      HRTIMER_MODE_ABS_PINNED);
}
EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
643
/* Cancel a pending watchdog timer, waiting if its callback is running. */
void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
649
650 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
651 {
652         struct hlist_head *h;
653         unsigned int i;
654
655         h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
656
657         if (h != NULL) {
658                 for (i = 0; i < n; i++)
659                         INIT_HLIST_HEAD(&h[i]);
660         }
661         return h;
662 }
663
/* Double the class hash table when its load factor exceeds 0.75.
 * The new table is allocated outside the qdisc lock; only the rehash
 * and pointer swap happen under sch_tree_lock. Allocation failure is
 * silently tolerated (the old table keeps working).
 */
void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
{
	struct Qdisc_class_common *cl;
	struct hlist_node *next;
	struct hlist_head *nhash, *ohash;
	unsigned int nsize, nmask, osize;
	unsigned int i, h;

	/* Rehash when load factor exceeds 0.75 */
	if (clhash->hashelems * 4 <= clhash->hashsize * 3)
		return;
	nsize = clhash->hashsize * 2;
	nmask = nsize - 1;
	nhash = qdisc_class_hash_alloc(nsize);
	if (nhash == NULL)
		return;

	ohash = clhash->hash;
	osize = clhash->hashsize;

	sch_tree_lock(sch);
	/* Move every class into its bucket under the new mask. */
	for (i = 0; i < osize; i++) {
		hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
			h = qdisc_class_hash(cl->classid, nmask);
			hlist_add_head(&cl->hnode, &nhash[h]);
		}
	}
	clhash->hash     = nhash;
	clhash->hashsize = nsize;
	clhash->hashmask = nmask;
	sch_tree_unlock(sch);

	/* Old table freed after the swap, outside the qdisc lock. */
	kvfree(ohash);
}
EXPORT_SYMBOL(qdisc_class_hash_grow);
699
700 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
701 {
702         unsigned int size = 4;
703
704         clhash->hash = qdisc_class_hash_alloc(size);
705         if (!clhash->hash)
706                 return -ENOMEM;
707         clhash->hashsize  = size;
708         clhash->hashmask  = size - 1;
709         clhash->hashelems = 0;
710         return 0;
711 }
712 EXPORT_SYMBOL(qdisc_class_hash_init);
713
/* Free the bucket array of a class hash. Classes themselves are owned
 * and freed by the qdisc, not here.
 */
void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	kvfree(clhash->hash);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);
719
/* Insert class @cl into @clhash, bucketed by classid.
 * NOTE(review): no locking here — callers appear to serialize
 * insertions themselves (e.g. under the qdisc tree lock); confirm at
 * call sites.
 */
void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	unsigned int h;

	INIT_HLIST_NODE(&cl->hnode);
	h = qdisc_class_hash(cl->classid, clhash->hashmask);
	hlist_add_head(&cl->hnode, &clhash->hash[h]);
	clhash->hashelems++;
}
EXPORT_SYMBOL(qdisc_class_hash_insert);
731
/* Remove class @cl from @clhash; mirror of qdisc_class_hash_insert(). */
void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
739
/* Allocate an unique handle from space managed by kernel
 * Possible range is [8000-FFFF]:0000 (0x8000 values)
 * Returns 0 when the whole range is exhausted on this device.
 */
static u32 qdisc_alloc_handle(struct net_device *dev)
{
	int i = 0x8000;
	/* Rotating cursor shared across devices; uniqueness is checked
	 * per device via qdisc_lookup() below.
	 */
	static u32 autohandle = TC_H_MAKE(0x80000000U, 0);

	do {
		autohandle += TC_H_MAKE(0x10000U, 0);
		/* Wrap around before colliding with the TC_H_ROOT major. */
		if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
			autohandle = TC_H_MAKE(0x80000000U, 0);
		if (!qdisc_lookup(dev, autohandle))
			return autohandle;
		cond_resched();
	} while	(--i > 0);

	return 0;
}
759
/* Propagate a decrease of @n packets / @len bytes up the qdisc tree,
 * adjusting qlen/backlog/drop counters of every ancestor and notifying
 * parent classes that became empty so they can be deactivated.
 */
void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
{
	bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
	const struct Qdisc_class_ops *cops;
	unsigned long cl;
	u32 parentid;
	bool notify;
	int drops;

	if (n == 0 && len == 0)
		return;
	drops = max_t(int, n, 0);
	rcu_read_lock();
	/* Walk from @sch towards the root, one parent per iteration. */
	while ((parentid = sch->parent)) {
		if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
			break;

		if (sch->flags & TCQ_F_NOPARENT)
			break;
		/* Notify parent qdisc only if child qdisc becomes empty.
		 *
		 * If child was empty even before update then backlog
		 * counter is screwed and we skip notification because
		 * parent class is already passive.
		 *
		 * If the original child was offloaded then it is allowed
		 * to be seem as empty, so the parent is notified anyway.
		 */
		notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
						       !qdisc_is_offloaded);
		/* TODO: perform the search on a per txq basis */
		sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
		if (sch == NULL) {
			WARN_ON_ONCE(parentid != TC_H_ROOT);
			break;
		}
		cops = sch->ops->cl_ops;
		if (notify && cops->qlen_notify) {
			/* Tell the parent class its child went empty. */
			cl = cops->find(sch, parentid);
			cops->qlen_notify(sch, cl);
		}
		sch->q.qlen -= n;
		sch->qstats.backlog -= len;
		__qdisc_qstats_drop(sch, drops);
	}
	rcu_read_unlock();
}
EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
808
/* Ask the driver for offload stats/state while dumping @sch and update
 * TCQ_F_OFFLOADED accordingly. Devices without offload support, and
 * drivers returning -EOPNOTSUPP, are treated as "not offloaded" rather
 * than as an error.
 */
int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
			      void *type_data)
{
	struct net_device *dev = qdisc_dev(sch);
	int err;

	sch->flags &= ~TCQ_F_OFFLOADED;
	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
		return 0;

	err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
	if (err == -EOPNOTSUPP)
		return 0;

	if (!err)
		sch->flags |= TCQ_F_OFFLOADED;

	return err;
}
EXPORT_SYMBOL(qdisc_offload_dump_helper);
829
/* Tell the driver about a qdisc graft (@old replaced by @new under
 * parent @sch). Failures are only surfaced via @extack, and only when
 * at least one of the involved qdiscs was actually offloaded; a graft
 * that is part of a destroy (no @new, or noop) never reports.
 */
void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
				struct Qdisc *new, struct Qdisc *old,
				enum tc_setup_type type, void *type_data,
				struct netlink_ext_ack *extack)
{
	bool any_qdisc_is_offloaded;
	int err;

	if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
		return;

	err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);

	/* Don't report error if the graft is part of destroy operation. */
	if (!err || !new || new == &noop_qdisc)
		return;

	/* Don't report error if the parent, the old child and the new
	 * one are not offloaded.
	 */
	any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
	any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
	any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;

	if (any_qdisc_is_offloaded)
		NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
}
EXPORT_SYMBOL(qdisc_offload_graft_helper);
858
/* Notify the driver that the device's root qdisc is being replaced.
 * The ingress flag is set if either the outgoing or incoming root is an
 * ingress qdisc.
 */
static void qdisc_offload_graft_root(struct net_device *dev,
				     struct Qdisc *new, struct Qdisc *old,
				     struct netlink_ext_ack *extack)
{
	struct tc_root_qopt_offload graft_offload = {
		.command	= TC_ROOT_GRAFT,
		.handle		= new ? new->handle : 0,
		.ingress	= (new && new->flags & TCQ_F_INGRESS) ||
				  (old && old->flags & TCQ_F_INGRESS),
	};

	qdisc_offload_graft_helper(dev, NULL, new, old,
				   TC_SETUP_ROOT_QDISC, &graft_offload, extack);
}
873
/* Fill one RTM qdisc message for @q into @skb: tcmsg header, kind,
 * block indices, qdisc-specific options, offload flag, size table and
 * statistics. Returns skb->len on success or -1 on overflow, trimming
 * the partially written message.
 */
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 portid, u32 seq, u16 flags, int event)
{
	struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
	struct gnet_stats_queue __percpu *cpu_qstats = NULL;
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	struct qdisc_size_table *stab;
	u32 block_index;
	__u32 qlen;

	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	/* tcm_info carries the qdisc refcount in qdisc dumps. */
	tcm->tcm_info = refcount_read(&q->refcnt);
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	/* Shared ingress/egress block indices, if the qdisc has them. */
	if (q->ops->ingress_block_get) {
		block_index = q->ops->ingress_block_get(q);
		if (block_index &&
		    nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
			goto nla_put_failure;
	}
	if (q->ops->egress_block_get) {
		block_index = q->ops->egress_block_get(q);
		if (block_index &&
		    nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
			goto nla_put_failure;
	}
	/* Qdisc-specific options (TCA_OPTIONS etc.) via its dump op. */
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
		goto nla_put_failure;
	qlen = qdisc_qlen_sum(q);

	stab = rtnl_dereference(q->stab);
	if (stab && qdisc_dump_stab(skb, stab) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	/* Per-CPU counters are used instead of q->bstats/qstats when the
	 * qdisc keeps per-CPU statistics.
	 */
	if (qdisc_is_percpu_stats(q)) {
		cpu_bstats = q->cpu_bstats;
		cpu_qstats = q->cpu_qstats;
	}

	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
				  &d, cpu_bstats, &q->bstats) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	/* Roll the skb back to where this message started. */
	nlmsg_trim(skb, b);
	return -1;
}
952
953 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
954 {
955         if (q->flags & TCQ_F_BUILTIN)
956                 return true;
957         if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
958                 return true;
959
960         return false;
961 }
962
/* Send the RTM_DELQDISC/RTM_NEWQDISC notification(s) for a qdisc change.
 *
 * @oskb: request skb being answered, or NULL (port id 0 is used then)
 * @n:    request header; its sequence number and NLM_F_ECHO are echoed
 * @clid: parent classid reported in the messages
 * @old:  qdisc removed, announced as RTM_DELQDISC (may be NULL)
 * @new:  qdisc installed, announced as RTM_NEWQDISC (may be NULL)
 *
 * Builtin/invisible qdiscs are silently skipped.  Returns the result
 * of rtnetlink_send(), or a negative errno when no message could be
 * built.
 */
static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && !tc_qdisc_dump_ignore(old, false)) {
		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
				  0, RTM_DELQDISC) < 0)
			goto err_out;
	}
	if (new && !tc_qdisc_dump_ignore(new, false)) {
		/* NLM_F_REPLACE signals that "new" supersedes "old". */
		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
			goto err_out;
	}

	/* Only transmit when at least one message was actually built. */
	if (skb->len)
		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
				      n->nlmsg_flags & NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}
993
994 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
995                                struct nlmsghdr *n, u32 clid,
996                                struct Qdisc *old, struct Qdisc *new)
997 {
998         if (new || old)
999                 qdisc_notify(net, skb, n, clid, old, new);
1000
1001         if (old)
1002                 qdisc_put(old);
1003 }
1004
1005 static void qdisc_clear_nolock(struct Qdisc *sch)
1006 {
1007         sch->flags &= ~TCQ_F_NOLOCK;
1008         if (!(sch->flags & TCQ_F_CPUSTATS))
1009                 return;
1010
1011         free_percpu(sch->cpu_bstats);
1012         free_percpu(sch->cpu_qstats);
1013         sch->cpu_bstats = NULL;
1014         sch->cpu_qstats = NULL;
1015         sch->flags &= ~TCQ_F_CPUSTATS;
1016 }
1017
/* Graft qdisc "new" to class "classid" of qdisc "parent" or
 * to device "dev" (when "parent" is NULL).
 *
 * When appropriate send a netlink notification using 'skb'
 * and "n".
 *
 * On success, the reference on the old qdisc is dropped via
 * notify_and_destroy().  Returns 0 or a negative errno.
 */

static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old,
		       struct netlink_ext_ack *extack)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);

	if (parent == NULL) {
		/* Root graft: swap the qdisc attached to the device's
		 * queues rather than to a class of a parent qdisc.
		 */
		unsigned int i, num_q, ingress;

		ingress = 0;
		num_q = dev->num_tx_queues;
		/* Ingress/clsact qdiscs live on the single ingress queue. */
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			num_q = 1;
			ingress = 1;
			if (!dev_ingress_queue(dev)) {
				NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
				return -ENOENT;
			}
		}

		/* Quiesce the device while qdiscs are being swapped. */
		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		qdisc_offload_graft_root(dev, new, old, extack);

		/* Qdiscs providing ->attach install themselves across the
		 * queues (->attach is invoked below); skip the per-queue
		 * graft loop for them.
		 */
		if (new && new->ops->attach)
			goto skip;

		for (i = 0; i < num_q; i++) {
			struct netdev_queue *dev_queue = dev_ingress_queue(dev);

			if (!ingress)
				dev_queue = netdev_get_tx_queue(dev, i);

			old = dev_graft_qdisc(dev_queue, new);
			/* One extra reference per additional TX queue that
			 * now points at "new".
			 */
			if (new && i > 0)
				qdisc_refcount_inc(new);

			if (!ingress)
				qdisc_put(old);
		}

skip:
		if (!ingress) {
			old = rtnl_dereference(dev->qdisc);
			if (new && !new->ops->attach)
				qdisc_refcount_inc(new);
			rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc);

			notify_and_destroy(net, skb, n, classid, old, new);

			if (new && new->ops->attach)
				new->ops->attach(new);
		} else {
			/* For ingress, "old" was not released in the loop
			 * above; notify_and_destroy() drops it here.
			 */
			notify_and_destroy(net, skb, n, classid, old, new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		/* Graft into a class of "parent" through its class ops. */
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
		unsigned long cl;
		int err;

		/* Only support running class lockless if parent is lockless */
		if (new && (new->flags & TCQ_F_NOLOCK) &&
		    parent && !(parent->flags & TCQ_F_NOLOCK))
			qdisc_clear_nolock(new);

		if (!cops || !cops->graft)
			return -EOPNOTSUPP;

		cl = cops->find(parent, classid);
		if (!cl) {
			NL_SET_ERR_MSG(extack, "Specified class not found");
			return -ENOENT;
		}

		if (new && new->ops == &noqueue_qdisc_ops) {
			NL_SET_ERR_MSG(extack, "Cannot assign noqueue to a class");
			return -EINVAL;
		}

		err = cops->graft(parent, cl, new, &old, extack);
		if (err)
			return err;
		notify_and_destroy(net, skb, n, classid, old, new);
	}
	return 0;
}
1120
1121 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1122                                    struct netlink_ext_ack *extack)
1123 {
1124         u32 block_index;
1125
1126         if (tca[TCA_INGRESS_BLOCK]) {
1127                 block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1128
1129                 if (!block_index) {
1130                         NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1131                         return -EINVAL;
1132                 }
1133                 if (!sch->ops->ingress_block_set) {
1134                         NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1135                         return -EOPNOTSUPP;
1136                 }
1137                 sch->ops->ingress_block_set(sch, block_index);
1138         }
1139         if (tca[TCA_EGRESS_BLOCK]) {
1140                 block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1141
1142                 if (!block_index) {
1143                         NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1144                         return -EINVAL;
1145                 }
1146                 if (!sch->ops->egress_block_set) {
1147                         NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1148                         return -EOPNOTSUPP;
1149                 }
1150                 sch->ops->egress_block_set(sch, block_index);
1151         }
1152         return 0;
1153 }
1154
/* Allocate and initialize a new qdisc.
 *
 * Parameters are passed via opt (tca[TCA_OPTIONS]).
 *
 * @dev:       device the qdisc will belong to
 * @dev_queue: queue (TX or ingress) backing the qdisc
 * @p:         parent qdisc, or NULL when grafting at the root
 * @parent:    parent classid stored in sch->parent
 * @handle:    requested handle; 0 means "allocate one",
 *             TC_H_INGRESS selects the ingress/clsact handle
 * @tca:       parsed TCA_* attributes from the request
 * @errp:      receives the error code when NULL is returned
 *
 * Returns the new qdisc, or NULL with *errp set.  *errp == -EAGAIN
 * asks the caller to replay the whole request, because module
 * autoloading had to drop the RTNL lock.
 */

static struct Qdisc *qdisc_create(struct net_device *dev,
				  struct netdev_queue *dev_queue,
				  struct Qdisc *p, u32 parent, u32 handle,
				  struct nlattr **tca, int *errp,
				  struct netlink_ext_ack *extack)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	/* Unknown kind: try to autoload the sch_<kind> module. */
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the mean time.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try again qdisc_lookup_ops,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (!ops) {
		NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown");
		goto err_out;
	}

	sch = qdisc_alloc(dev_queue, ops, extack);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		/* The reserved ingress handle is only valid for qdiscs
		 * that flagged themselves TCQ_F_INGRESS.
		 */
		if (!(sch->flags & TCQ_F_INGRESS)) {
			NL_SET_ERR_MSG(extack,
				       "Specified parent ID is reserved for ingress and clsact Qdiscs");
			err = -EINVAL;
			goto err_out3;
		}
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
	} else {
		if (handle == 0) {
			/* No handle requested: pick a free one. */
			handle = qdisc_alloc_handle(dev);
			if (handle == 0) {
				NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
				err = -ENOSPC;
				goto err_out3;
			}
		}
		if (!netif_is_multiqueue(dev))
			sch->flags |= TCQ_F_ONETXQUEUE;
	}

	sch->handle = handle;

	/* This exist to keep backward compatible with a userspace
	 * loophole, what allowed userspace to get IFF_NO_QUEUE
	 * facility on older kernels by setting tx_queue_len=0 (prior
	 * to qdisc init), and then forgot to reinit tx_queue_len
	 * before again attaching a qdisc.
	 */
	if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
		netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
	}

	err = qdisc_block_indexes_set(sch, tca, extack);
	if (err)
		goto err_out3;

	if (ops->init) {
		err = ops->init(sch, tca[TCA_OPTIONS], extack);
		if (err != 0)
			goto err_out5;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab)) {
			err = PTR_ERR(stab);
			goto err_out4;
		}
		rcu_assign_pointer(sch->stab, stab);
	}
	if (tca[TCA_RATE]) {
		seqcount_t *running;

		err = -EOPNOTSUPP;
		if (sch->flags & TCQ_F_MQROOT) {
			NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
			goto err_out4;
		}

		/* Use the root's running seqcount unless this qdisc is
		 * the root itself, an ingress qdisc, or a child of an
		 * mq-style root — those use their own.
		 */
		if (sch->parent != TC_H_ROOT &&
		    !(sch->flags & TCQ_F_INGRESS) &&
		    (!p || !(p->flags & TCQ_F_MQROOT)))
			running = qdisc_root_sleeping_running(sch);
		else
			running = &sch->running;

		err = gen_new_estimator(&sch->bstats,
					sch->cpu_bstats,
					&sch->rate_est,
					NULL,
					running,
					tca[TCA_RATE]);
		if (err) {
			NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
			goto err_out4;
		}
	}

	qdisc_hash_add(sch, false);

	return sch;

err_out5:
	/* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
	if (ops->destroy)
		ops->destroy(sch);
err_out3:
	dev_put(dev);
	qdisc_free(sch);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;

err_out4:
	/*
	 * Any broken qdiscs that would require a ops->reset() here?
	 * The qdisc was never in action so it shouldn't be necessary.
	 */
	qdisc_put_stab(rtnl_dereference(sch->stab));
	if (ops->destroy)
		ops->destroy(sch);
	goto err_out3;
}
1323
/* Change parameters of an existing qdisc from the TCA_* attributes.
 *
 * Updates, in order: the qdisc's own options (via ops->change), its
 * size table, and its rate estimator.  Returns 0 or a negative errno.
 */
static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
			struct netlink_ext_ack *extack)
{
	struct qdisc_size_table *ostab, *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (!sch->ops->change) {
			NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
			return -EINVAL;
		}
		/* Block indexes can only be set at creation time. */
		if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
			NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
			return -EOPNOTSUPP;
		}
		err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	/* Swap in the new size table; stab stays NULL (removing any old
	 * table) when no TCA_STAB attribute was supplied.
	 */
	ostab = rtnl_dereference(sch->stab);
	rcu_assign_pointer(sch->stab, stab);
	qdisc_put_stab(ostab);

	if (tca[TCA_RATE]) {
		/* NB: ignores errors from replace_estimator
		   because change can't be undone. */
		if (sch->flags & TCQ_F_MQROOT)
			goto out;
		gen_replace_estimator(&sch->bstats,
				      sch->cpu_bstats,
				      &sch->rate_est,
				      NULL,
				      qdisc_root_sleeping_running(sch),
				      tca[TCA_RATE]);
	}
out:
	return 0;
}
1369
/* State carried through a class walk while checking for graft loops. */
struct check_loop_arg {
	struct qdisc_walker	w;	/* embedded walker; must stay first, check_loop_fn() casts back */
	struct Qdisc		*p;	/* qdisc being grafted; seeing it again means a loop */
	int			depth;	/* current recursion depth */
};

static int check_loop_fn(struct Qdisc *q, unsigned long cl,
			 struct qdisc_walker *w);
1378
1379 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1380 {
1381         struct check_loop_arg   arg;
1382
1383         if (q->ops->cl_ops == NULL)
1384                 return 0;
1385
1386         arg.w.stop = arg.w.skip = arg.w.count = 0;
1387         arg.w.fn = check_loop_fn;
1388         arg.depth = depth;
1389         arg.p = p;
1390         q->ops->cl_ops->walk(q, &arg.w);
1391         return arg.w.stop ? -ELOOP : 0;
1392 }
1393
1394 static int
1395 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1396 {
1397         struct Qdisc *leaf;
1398         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1399         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1400
1401         leaf = cops->leaf(q, cl);
1402         if (leaf) {
1403                 if (leaf == arg->p || arg->depth > 7)
1404                         return -ELOOP;
1405                 return check_loop(leaf, arg->p, arg->depth + 1);
1406         }
1407         return 0;
1408 }
1409
/* Netlink attribute policy shared by the qdisc and class handlers. */
const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
	[TCA_KIND]		= { .type = NLA_STRING },
	[TCA_RATE]		= { .type = NLA_BINARY,
				    .len = sizeof(struct tc_estimator) },
	[TCA_STAB]		= { .type = NLA_NESTED },
	[TCA_DUMP_INVISIBLE]	= { .type = NLA_FLAG },
	[TCA_CHAIN]		= { .type = NLA_U32 },
	[TCA_INGRESS_BLOCK]	= { .type = NLA_U32 },
	[TCA_EGRESS_BLOCK]	= { .type = NLA_U32 },
};
1420
1421 /*
1422  * Delete/get qdisc.
1423  */
1424
/* Handle RTM_DELQDISC and RTM_GETQDISC requests.
 *
 * Locates the target qdisc either through its parent classid
 * (tcm_parent) or directly by handle (tcm_handle), then deletes it
 * via qdisc_graft() or answers with a notification.  Returns 0 or a
 * negative errno.
 */
static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	/* Deleting requires CAP_NET_ADMIN; plain GET does not. */
	if ((n->nlmsg_type != RTM_GETQDISC) &&
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	clid = tcm->tcm_parent;
	if (clid) {
		/* Resolve the qdisc through its parent classid. */
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue(dev)) {
				q = dev_ingress_queue(dev)->qdisc_sleeping;
			}
		} else {
			q = rtnl_dereference(dev->qdisc);
		}
		if (!q) {
			NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
			return -ENOENT;
		}

		/* A handle given alongside the parent must agree. */
		if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Invalid handle");
			return -EINVAL;
		}
	} else {
		/* No parent given: look the qdisc up by handle. */
		q = qdisc_lookup(dev, tcm->tcm_handle);
		if (!q) {
			NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
			return -ENOENT;
		}
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
		return -EINVAL;
	}

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid) {
			NL_SET_ERR_MSG(extack, "Classid cannot be zero");
			return -EINVAL;
		}
		if (q->handle == 0) {
			NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
			return -ENOENT;
		}
		/* Grafting NULL in place of q deletes it. */
		err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
		if (err != 0)
			return err;
	} else {
		qdisc_notify(net, skb, n, clid, NULL, q);
	}
	return 0;
}
1505
/* True when the request asks to create the qdisc or replace an
 * existing one (both NLM_F_CREATE and NLM_F_REPLACE set).
 */
static bool req_create_or_replace(struct nlmsghdr *n)
{
	const unsigned int flags = n->nlmsg_flags;

	return (flags & NLM_F_CREATE) && (flags & NLM_F_REPLACE);
}
1511
/* True when the request asks for exclusive creation (NLM_F_CREATE and
 * NLM_F_EXCL both set): the qdisc must not already exist.
 */
static bool req_create_exclusive(struct nlmsghdr *n)
{
	const unsigned int flags = n->nlmsg_flags;

	return (flags & NLM_F_CREATE) && (flags & NLM_F_EXCL);
}
1517
/* True when none of NLM_F_CREATE/NLM_F_REPLACE/NLM_F_EXCL is set —
 * i.e. a plain "change" request on an existing qdisc.
 */
static bool req_change(struct nlmsghdr *n)
{
	return !(n->nlmsg_flags & (NLM_F_CREATE | NLM_F_REPLACE | NLM_F_EXCL));
}
1524
1525 /*
1526  * Create/change qdisc.
1527  */
1528 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1529                            struct netlink_ext_ack *extack)
1530 {
1531         struct net *net = sock_net(skb->sk);
1532         struct tcmsg *tcm;
1533         struct nlattr *tca[TCA_MAX + 1];
1534         struct net_device *dev;
1535         u32 clid;
1536         struct Qdisc *q, *p;
1537         int err;
1538
1539         if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1540                 return -EPERM;
1541
1542 replay:
1543         /* Reinit, just in case something touches this. */
1544         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1545                                      rtm_tca_policy, extack);
1546         if (err < 0)
1547                 return err;
1548
1549         tcm = nlmsg_data(n);
1550         clid = tcm->tcm_parent;
1551         q = p = NULL;
1552
1553         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1554         if (!dev)
1555                 return -ENODEV;
1556
1557
1558         if (clid) {
1559                 if (clid != TC_H_ROOT) {
1560                         if (clid != TC_H_INGRESS) {
1561                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1562                                 if (!p) {
1563                                         NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1564                                         return -ENOENT;
1565                                 }
1566                                 q = qdisc_leaf(p, clid);
1567                         } else if (dev_ingress_queue_create(dev)) {
1568                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1569                         }
1570                 } else {
1571                         q = rtnl_dereference(dev->qdisc);
1572                 }
1573
1574                 /* It may be default qdisc, ignore it */
1575                 if (q && q->handle == 0)
1576                         q = NULL;
1577
1578                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1579                         if (tcm->tcm_handle) {
1580                                 if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1581                                         NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1582                                         return -EEXIST;
1583                                 }
1584                                 if (TC_H_MIN(tcm->tcm_handle)) {
1585                                         NL_SET_ERR_MSG(extack, "Invalid minor handle");
1586                                         return -EINVAL;
1587                                 }
1588                                 q = qdisc_lookup(dev, tcm->tcm_handle);
1589                                 if (!q)
1590                                         goto create_n_graft;
1591                                 if (n->nlmsg_flags & NLM_F_EXCL) {
1592                                         NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1593                                         return -EEXIST;
1594                                 }
1595                                 if (tca[TCA_KIND] &&
1596                                     nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1597                                         NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1598                                         return -EINVAL;
1599                                 }
1600                                 if (q->flags & TCQ_F_INGRESS) {
1601                                         NL_SET_ERR_MSG(extack,
1602                                                        "Cannot regraft ingress or clsact Qdiscs");
1603                                         return -EINVAL;
1604                                 }
1605                                 if (q == p ||
1606                                     (p && check_loop(q, p, 0))) {
1607                                         NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1608                                         return -ELOOP;
1609                                 }
1610                                 if (clid == TC_H_INGRESS) {
1611                                         NL_SET_ERR_MSG(extack, "Ingress cannot graft directly");
1612                                         return -EINVAL;
1613                                 }
1614                                 qdisc_refcount_inc(q);
1615                                 goto graft;
1616                         } else {
1617                                 if (!q)
1618                                         goto create_n_graft;
1619
1620                                 /* This magic test requires explanation.
1621                                  *
1622                                  *   We know, that some child q is already
1623                                  *   attached to this parent and have choice:
1624                                  *   1) change it or 2) create/graft new one.
1625                                  *   If the requested qdisc kind is different
1626                                  *   than the existing one, then we choose graft.
1627                                  *   If they are the same then this is "change"
1628                                  *   operation - just let it fallthrough..
1629                                  *
1630                                  *   1. We are allowed to create/graft only
1631                                  *   if the request is explicitly stating
1632                                  *   "please create if it doesn't exist".
1633                                  *
1634                                  *   2. If the request is to exclusive create
1635                                  *   then the qdisc tcm_handle is not expected
1636                                  *   to exist, so that we choose create/graft too.
1637                                  *
1638                                  *   3. The last case is when no flags are set.
1639                                  *   This will happen when for example tc
1640                                  *   utility issues a "change" command.
1641                                  *   Alas, it is sort of hole in API, we
1642                                  *   cannot decide what to do unambiguously.
1643                                  *   For now we select create/graft.
1644                                  */
1645                                 if (tca[TCA_KIND] &&
1646                                     nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1647                                         if (req_create_or_replace(n) ||
1648                                             req_create_exclusive(n))
1649                                                 goto create_n_graft;
1650                                         else if (req_change(n))
1651                                                 goto create_n_graft2;
1652                                 }
1653                         }
1654                 }
1655         } else {
1656                 if (!tcm->tcm_handle) {
1657                         NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1658                         return -EINVAL;
1659                 }
1660                 q = qdisc_lookup(dev, tcm->tcm_handle);
1661         }
1662
1663         /* Change qdisc parameters */
1664         if (!q) {
1665                 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1666                 return -ENOENT;
1667         }
1668         if (n->nlmsg_flags & NLM_F_EXCL) {
1669                 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1670                 return -EEXIST;
1671         }
1672         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1673                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1674                 return -EINVAL;
1675         }
1676         err = qdisc_change(q, tca, extack);
1677         if (err == 0)
1678                 qdisc_notify(net, skb, n, clid, NULL, q);
1679         return err;
1680
1681 create_n_graft:
1682         if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1683                 NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1684                 return -ENOENT;
1685         }
1686 create_n_graft2:
1687         if (clid == TC_H_INGRESS) {
1688                 if (dev_ingress_queue(dev)) {
1689                         q = qdisc_create(dev, dev_ingress_queue(dev), p,
1690                                          tcm->tcm_parent, tcm->tcm_parent,
1691                                          tca, &err, extack);
1692                 } else {
1693                         NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1694                         err = -ENOENT;
1695                 }
1696         } else {
1697                 struct netdev_queue *dev_queue;
1698
1699                 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1700                         dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1701                 else if (p)
1702                         dev_queue = p->dev_queue;
1703                 else
1704                         dev_queue = netdev_get_tx_queue(dev, 0);
1705
1706                 q = qdisc_create(dev, dev_queue, p,
1707                                  tcm->tcm_parent, tcm->tcm_handle,
1708                                  tca, &err, extack);
1709         }
1710         if (q == NULL) {
1711                 if (err == -EAGAIN)
1712                         goto replay;
1713                 return err;
1714         }
1715
1716 graft:
1717         err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1718         if (err) {
1719                 if (q)
1720                         qdisc_put(q);
1721                 return err;
1722         }
1723
1724         return 0;
1725 }
1726
/* Dump the qdisc hierarchy rooted at @root into a netlink dump reply.
 *
 * @q_idx_p: in/out running count of qdiscs visited on this device, used
 *	     to resume an interrupted (multi-skb) dump.
 * @s_q_idx: resume point; qdiscs with index below it are skipped.
 * @recur:   after dumping @root itself, also walk the per-device
 *	     qdisc_hash (false for the ingress root, which is not hashed
 *	     together with the egress hierarchy).
 * @dump_invisible: also report qdiscs normally ignored by dumps.
 *
 * Returns 0 on success, -1 when the skb ran out of room (the caller
 * stores *q_idx_p and retries in the next dump round).
 */
static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
                              struct netlink_callback *cb,
                              int *q_idx_p, int s_q_idx, bool recur,
                              bool dump_invisible)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	/* The root qdisc is dumped first; it is not in the hash table. */
	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

	/* If dumping singletons, there is no qdisc_dev(root) and the singleton
	 * itself has already been dumped.
	 *
	 * If we've already dumped the top-level (ingress) qdisc above and the global
	 * qdisc hashtable, we don't want to hit it again
	 */
	if (!qdisc_dev(root) || !recur)
		goto out;

	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	/* skb full: report -1 so the dump is resumed at q_idx later. */
	ret = -1;
	goto out;
}
1780
/* RTM_GETQDISC dump handler: walk every netdev in the namespace and dump
 * its egress hierarchy plus, if present, its ingress qdisc.
 *
 * Resume state lives in cb->args: args[0] is the device index reached,
 * args[1] the qdisc index within that device.  Returns skb->len so that
 * netlink keeps calling back until a round adds nothing.
 */
static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;
	const struct nlmsghdr *nlh = cb->nlh;
	struct nlattr *tca[TCA_MAX + 1];
	int err;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];

	idx = 0;
	ASSERT_RTNL();

	/* Parse request attributes (e.g. TCA_DUMP_INVISIBLE). */
	err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
				     rtm_tca_policy, cb->extack);
	if (err < 0)
		return err;

	for_each_netdev(net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		/* Past the resume device: restart its qdisc walk from 0. */
		if (idx > s_idx)
			s_q_idx = 0;
		q_idx = 0;

		if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc),
				       skb, cb, &q_idx, s_q_idx,
				       true, tca[TCA_DUMP_INVISIBLE]) < 0)
			goto done;

		/* Ingress root is dumped separately, without hash recursion. */
		dev_queue = dev_ingress_queue(dev);
		if (dev_queue &&
		    tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
				       &q_idx, s_q_idx, false,
				       tca[TCA_DUMP_INVISIBLE]) < 0)
			goto done;

cont:
		idx++;
	}

done:
	/* Save progress for the next dump invocation. */
	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}
1833
1834
1835
1836 /************************************************
1837  *      Traffic classes manipulation.           *
1838  ************************************************/
1839
/* Build one RTM_*TCLASS netlink message describing class @cl of qdisc @q.
 *
 * Fills the tcmsg header, TCA_KIND, the class-specific attributes via
 * cl_ops->dump(), and the statistics block via cl_ops->dump_stats().
 * NOTE(review): tcm_handle is pre-seeded with q->handle; the class
 * dump() callback presumably overwrites it with the real class handle —
 * confirm against individual qdisc implementations.
 *
 * Returns skb->len on success, -1 on failure (skb trimmed back to @b so
 * no partial message is left behind).
 */
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
                          unsigned long cl,
                          u32 portid, u32 seq, u16 flags, int event)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	/* May be called in long dump loops; give the scheduler a chance. */
	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	/* Finalize the message length now that all attributes are in. */
	nlh->nlmsg_len = skb_tail_pointer(skb) - b;
	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
1885
1886 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1887                          struct nlmsghdr *n, struct Qdisc *q,
1888                          unsigned long cl, int event)
1889 {
1890         struct sk_buff *skb;
1891         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1892         int err = 0;
1893
1894         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1895         if (!skb)
1896                 return -ENOBUFS;
1897
1898         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1899                 kfree_skb(skb);
1900                 return -EINVAL;
1901         }
1902
1903         err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1904                              n->nlmsg_flags & NLM_F_ECHO);
1905         if (err > 0)
1906                 err = 0;
1907         return err;
1908 }
1909
/* Delete class @cl from @q and, on success, broadcast RTM_DELTCLASS.
 *
 * The notification skb is filled in BEFORE cops->delete() runs, because
 * once the class is deleted it can no longer be dumped.  If the delete
 * fails, the pre-built skb is discarded and no notification is sent.
 *
 * Returns 0 on success or a negative errno.
 */
static int tclass_del_notify(struct net *net,
                             const struct Qdisc_class_ops *cops,
                             struct sk_buff *oskb, struct nlmsghdr *n,
                             struct Qdisc *q, unsigned long cl)
{
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
	struct sk_buff *skb;
	int err = 0;

	if (!cops->delete)
		return -EOPNOTSUPP;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	/* Snapshot the class state while it still exists. */
	if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
			   RTM_DELTCLASS) < 0) {
		kfree_skb(skb);
		return -EINVAL;
	}

	err = cops->delete(q, cl);
	if (err) {
		kfree_skb(skb);
		return err;
	}

	err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
			     n->nlmsg_flags & NLM_F_ECHO);
	if (err > 0)
		err = 0;
	return err;
}
1944
1945 #ifdef CONFIG_NET_CLS
1946
/* Arguments carried into tcf_node_bind() through the embedded walker. */
struct tcf_bind_args {
	struct tcf_walker w;	/* must be first: callback casts back to this */
	unsigned long base;	/* base (internal handle) of the walked class */
	unsigned long cl;	/* internal class handle to re-bind filters to */
	u32 classid;		/* user-visible classid the filters point at */
};
1953
/* tcf_walker callback: re-bind one filter node to a (possibly new) class.
 *
 * Calls the filter's bind_class() op, if provided, under sch_tree_lock()
 * so the class reference swap is atomic with respect to the data path.
 * Always returns 0 so the walk continues over all nodes.
 */
static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
{
	struct tcf_bind_args *a = (void *)arg;

	if (tp->ops->bind_class) {
		struct Qdisc *q = tcf_block_q(tp->chain->block);

		sch_tree_lock(q);
		tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
		sch_tree_unlock(q);
	}
	return 0;
}
1967
/* Arguments carried into tc_bind_class_walker() through the embedded walker. */
struct tc_bind_class_args {
	struct qdisc_walker w;	/* must be first: walker casts back to this */
	unsigned long new_cl;	/* class to re-point matching filters at */
	u32 portid;		/* set by caller; not consumed by the walker */
	u32 clid;		/* classid whose filters are being re-bound */
};
1974
/* qdisc_walker callback: for one class @cl, walk every filter attached to
 * its tcf_block and re-bind filters classifying to a->clid onto a->new_cl
 * (via tcf_node_bind).  Returns 0 so the class walk continues.
 *
 * NOTE(review): a->portid is populated by tc_bind_tclass() but never read
 * here — apparently historical.
 */
static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
                                struct qdisc_walker *w)
{
	struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
	const struct Qdisc_class_ops *cops = q->ops->cl_ops;
	struct tcf_block *block;
	struct tcf_chain *chain;

	block = cops->tcf_block(q, cl, NULL);
	if (!block)
		return 0;
	/* Iterate every chain, then every classifier proto in each chain. */
	for (chain = tcf_get_next_chain(block, NULL);
	     chain;
	     chain = tcf_get_next_chain(block, chain)) {
		struct tcf_proto *tp;

		for (tp = tcf_get_next_proto(chain, NULL, true);
		     tp; tp = tcf_get_next_proto(chain, tp, true)) {
			struct tcf_bind_args arg = {};

			arg.w.fn = tcf_node_bind;
			arg.classid = a->clid;
			arg.base = cl;
			arg.cl = a->new_cl;
			tp->ops->walk(tp, &arg.w, true);
		}
	}

	return 0;
}
2005
2006 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
2007                            unsigned long new_cl)
2008 {
2009         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
2010         struct tc_bind_class_args args = {};
2011
2012         if (!cops->tcf_block)
2013                 return;
2014         args.portid = portid;
2015         args.clid = clid;
2016         args.new_cl = new_cl;
2017         args.w.fn = tc_bind_class_walker;
2018         q->ops->cl_ops->walk(q, &args.w);
2019 }
2020
2021 #else
2022
/* CONFIG_NET_CLS=n: no classifiers exist, so re-binding is a no-op. */
static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
                           unsigned long new_cl)
{
}
2027
2028 #endif
2029
/* Handle RTM_NEWTCLASS / RTM_DELTCLASS / RTM_GETTCLASS requests.
 *
 * Resolves the owning qdisc from tcm_parent/tcm_handle, looks up the
 * target class, and dispatches to create/change, delete or get.  Returns
 * 0 on success or a negative errno.
 */
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
                         struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 portid;
	u32 clid;
	u32 qid;
	int err;

	/* Only GET is allowed without CAP_NET_ADMIN in the owning userns. */
	if ((n->nlmsg_type != RTM_GETTCLASS) &&
	    !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0         - parent is root class.
	   parent == X:Y         - parent is a node in hierarchy.
	   parent == 0:Y         - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0         - generate handle from kernel pool.
	   handle == 0:Y         - class is X:Y, where X:0 is qdisc.
	   handle == X:Y         - clear.
	   handle == X:0         - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

	/* NB: despite its name, "portid" holds the parent classid here. */
	portid = tcm->tcm_parent;
	clid = tcm->tcm_handle;
	qid = TC_H_MAJ(clid);

	if (portid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(portid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = rtnl_dereference(dev->qdisc)->handle;

		/* Now qid is genuine qdisc handle consistent
		 * both with parent and child.
		 *
		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
		 */
		if (portid)
			portid = TC_H_MAKE(qid, portid);
	} else {
		if (qid == 0)
			qid = rtnl_dereference(dev->qdisc)->handle;
	}

	/* OK. Locate qdisc */
	q = qdisc_lookup(dev, qid);
	if (!q)
		return -ENOENT;

	/* And check that it supports classes. */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (portid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->find(q, clid);

	if (cl == 0) {
		err = -ENOENT;
		/* Class not found: only NEW with NLM_F_CREATE may proceed. */
		if (n->nlmsg_type != RTM_NEWTCLASS ||
		    !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = tclass_del_notify(net, cops, skb, n, q, cl);
			/* Unbind the class's filters by re-binding to 0. */
			tc_bind_tclass(q, portid, clid, 0);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
		NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
		return -EOPNOTSUPP;
	}

	/* Create or change the class via the qdisc's change() op. */
	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, portid, tca, &new_cl, extack);
	if (err == 0) {
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
		/* We just create a new class, need to do reverse binding. */
		if (cl != new_cl)
			tc_bind_tclass(q, portid, clid, new_cl);
	}
out:
	return err;
}
2166
/* Arguments carried into qdisc_class_dump() through the embedded walker. */
struct qdisc_dump_args {
	struct qdisc_walker	w;	/* must be first: callback casts back */
	struct sk_buff		*skb;	/* dump reply being filled */
	struct netlink_callback *cb;	/* netlink dump context (portid/seq) */
};
2172
2173 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2174                             struct qdisc_walker *arg)
2175 {
2176         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2177
2178         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2179                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2180                               RTM_NEWTCLASS);
2181 }
2182
/* Dump all classes of one qdisc @q into a netlink class dump.
 *
 * @t_p/@s_t implement dump resume at qdisc granularity; cb->args[1]
 * resumes within a qdisc's class walk.  Returns 0 to continue with the
 * next qdisc, -1 when the skb filled up.
 */
static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
                                struct tcmsg *tcm, struct netlink_callback *cb,
                                int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	/* Skip: invisible qdiscs, not-yet-resumed ones, classless qdiscs,
	 * and qdiscs not matching an explicitly requested parent major.
	 */
	if (tc_qdisc_dump_ignore(q, false) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	/* Moving past the resume qdisc: clear per-qdisc walk state
	 * (cb->args[1..]) so the new qdisc's walk starts from scratch.
	 */
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop  = 0;
	arg.w.skip = cb->args[1];
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	/* Remember how many classes were emitted for the next resume. */
	cb->args[1] = arg.w.count;
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}
2211
/* Dump classes for the hierarchy rooted at @root.
 *
 * Dumps the root qdisc's classes first, then (when @recur and a device
 * is attached) either just the qdisc matching the requested parent, or
 * every qdisc in the per-device hash.  Returns 0 on success, -1 when
 * the skb filled up.
 */
static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
                               struct tcmsg *tcm, struct netlink_callback *cb,
                               int *t_p, int s_t, bool recur)
{
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	if (!qdisc_dev(root) || !recur)
		return 0;

	/* A specific parent was requested: dump only that qdisc's classes. */
	if (tcm->tcm_parent) {
		q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
		if (q && q != root &&
		    tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
		return 0;
	}
	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}
2242
/* RTM_GETTCLASS dump handler: dump classes of one device's egress and
 * ingress hierarchies.  Resume state (qdisc index) lives in cb->args[0].
 * Returns skb->len so netlink keeps calling back until nothing is added.
 */
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = nlmsg_data(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
		return 0;
	/* Takes a device reference; released via dev_put() below. */
	dev = dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return 0;

	s_t = cb->args[0];
	t = 0;

	if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc),
				skb, tcm, cb, &t, s_t, true) < 0)
		goto done;

	dev_queue = dev_ingress_queue(dev);
	if (dev_queue &&
	    tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
				&t, s_t, false) < 0)
		goto done;

	/* Fallthrough into done is intentional: save progress either way. */
done:
	cb->args[0] = t;

	dev_put(dev);
	return skb->len;
}
2276
2277 #ifdef CONFIG_PROC_FS
/* seq_file show for /proc/net/psched: four hex words describing the
 * packet-scheduler clock.  In order: nanoseconds per psched microsecond,
 * nanoseconds per psched tick, a fixed 1000000 (presumably a historical
 * ticks-per-second value kept for ABI compatibility — confirm against
 * userspace tc), and the hrtimer resolution expressed as a frequency.
 */
static int psched_show(struct seq_file *seq, void *v)
{
	seq_printf(seq, "%08x %08x %08x %08x\n",
		   (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
		   1000000,
		   (u32)NSEC_PER_SEC / hrtimer_resolution);

	return 0;
}
2287
2288 static int __net_init psched_net_init(struct net *net)
2289 {
2290         struct proc_dir_entry *e;
2291
2292         e = proc_create_single("psched", 0, net->proc_net, psched_show);
2293         if (e == NULL)
2294                 return -ENOMEM;
2295
2296         return 0;
2297 }
2298
/* Remove /proc/net/psched when the network namespace goes away. */
static void __net_exit psched_net_exit(struct net *net)
{
	remove_proc_entry("psched", net->proc_net);
}
2303 #else
/* CONFIG_PROC_FS=n: nothing to create; always succeed. */
static int __net_init psched_net_init(struct net *net)
{
	return 0;
}
2308
/* CONFIG_PROC_FS=n: nothing to tear down. */
static void __net_exit psched_net_exit(struct net *net)
{
}
2312 #endif
2313
/* Per-network-namespace setup/teardown of /proc/net/psched. */
static struct pernet_operations psched_net_ops = {
	.init = psched_net_init,
	.exit = psched_net_exit,
};
2318
/* Subsystem init: register per-netns proc ops, the built-in qdiscs, and
 * the rtnetlink message handlers for qdisc and class manipulation.
 *
 * NOTE(review): register_qdisc() return values are ignored — presumably
 * registration of these built-in ops cannot fail; confirm before relying
 * on it.
 */
static int __init pktsched_init(void)
{
	int err;

	err = register_pernet_subsys(&psched_net_ops);
	if (err) {
		pr_err("pktsched_init: "
		       "cannot initialize per netns operations\n");
		return err;
	}

	register_qdisc(&pfifo_fast_ops);
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	register_qdisc(&pfifo_head_drop_qdisc_ops);
	register_qdisc(&mq_qdisc_ops);
	register_qdisc(&noqueue_qdisc_ops);

	/* GET handlers also supply a dump callback for table dumps. */
	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
		      0);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
		      0);

	return 0;
}
2348
2349 subsys_initcall(pktsched_init);