GNU Linux-libre 6.1.24-gnu
[releases.git] / net / sched / sch_api.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_api.c  Packet scheduler API.
4  *
5  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6  *
7  * Fixes:
8  *
9  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
10  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
11  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
12  */
13
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <linux/init.h>
21 #include <linux/proc_fs.h>
22 #include <linux/seq_file.h>
23 #include <linux/kmod.h>
24 #include <linux/list.h>
25 #include <linux/hrtimer.h>
26 #include <linux/slab.h>
27 #include <linux/hashtable.h>
28
29 #include <net/net_namespace.h>
30 #include <net/sock.h>
31 #include <net/netlink.h>
32 #include <net/pkt_sched.h>
33 #include <net/pkt_cls.h>
34
35 #include <trace/events/qdisc.h>
36
37 /*
38
39    Short review.
40    -------------
41
42    This file consists of two interrelated parts:
43
44    1. queueing disciplines manager frontend.
45    2. traffic classes manager frontend.
46
47    Generally, queueing discipline ("qdisc") is a black box,
48    which is able to enqueue packets and to dequeue them (when
49    device is ready to send something) in order and at times
50    determined by algorithm hidden in it.
51
52    qdiscs are divided into two categories:
53    - "queues", which have no internal structure visible from outside.
54    - "schedulers", which split all the packets into "traffic classes",
55      using "packet classifiers" (look at cls_api.c)
56
57    In turn, classes may have child qdiscs (as rule, queues)
58    attached to them etc. etc. etc.
59
60    The goal of the routines in this file is to translate
61    information supplied by the user in the form of handles
62    into a form more intelligible to the kernel, to perform some sanity
63    checks and the part of the work that is common to all qdiscs,
64    and to provide rtnetlink notifications.
65
66    All real intelligent work is done inside qdisc modules.
67
68
69
70    Every discipline has two major routines: enqueue and dequeue.
71
72    ---dequeue
73
74    dequeue usually returns a skb to send. It is allowed to return NULL,
75    but it does not mean that queue is empty, it just means that
76    discipline does not want to send anything this time.
77    Queue is really empty if q->q.qlen == 0.
78    For complicated disciplines with multiple queues q->q is not
79    real packet queue, but however q->q.qlen must be valid.
80
81    ---enqueue
82
83    enqueue returns 0 if the packet was enqueued successfully.
84    If a packet (this one or another one) was dropped, it returns
85    a non-zero error code.
86    NET_XMIT_DROP        - this packet dropped
87      Expected action: do not backoff, but wait until queue will clear.
88    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
89      Expected action: backoff or ignore
90
91    Auxiliary routines:
92
93    ---peek
94
95    like dequeue but without removing a packet from the queue
96
97    ---reset
98
99    returns qdisc to initial state: purge all buffers, clear all
100    timers, counters (except for statistics) etc.
101
102    ---init
103
104    initializes newly created qdisc.
105
106    ---destroy
107
108    destroys resources allocated by init and during lifetime of qdisc.
109
110    ---change
111
112    changes qdisc parameters.
113  */
114
/* Protects list of registered TC modules. It is pure SMP lock.
 * Taken for writing by register_qdisc(), unregister_qdisc() and
 * qdisc_set_default(); taken for reading by qdisc_get_default() and
 * qdisc_lookup_ops().
 */
static DEFINE_RWLOCK(qdisc_mod_lock);
117
118
119 /************************************************
120  *      Queueing disciplines manipulation.      *
121  ************************************************/
122
123
/* The list of all installed queueing disciplines: head of a singly linked
 * list chained through Qdisc_ops::next. register_qdisc() appends to it and
 * unregister_qdisc() unlinks from it.
 */

static struct Qdisc_ops *qdisc_base;
127
128 /* Register/unregister queueing discipline */
129
130 int register_qdisc(struct Qdisc_ops *qops)
131 {
132         struct Qdisc_ops *q, **qp;
133         int rc = -EEXIST;
134
135         write_lock(&qdisc_mod_lock);
136         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
137                 if (!strcmp(qops->id, q->id))
138                         goto out;
139
140         if (qops->enqueue == NULL)
141                 qops->enqueue = noop_qdisc_ops.enqueue;
142         if (qops->peek == NULL) {
143                 if (qops->dequeue == NULL)
144                         qops->peek = noop_qdisc_ops.peek;
145                 else
146                         goto out_einval;
147         }
148         if (qops->dequeue == NULL)
149                 qops->dequeue = noop_qdisc_ops.dequeue;
150
151         if (qops->cl_ops) {
152                 const struct Qdisc_class_ops *cops = qops->cl_ops;
153
154                 if (!(cops->find && cops->walk && cops->leaf))
155                         goto out_einval;
156
157                 if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
158                         goto out_einval;
159         }
160
161         qops->next = NULL;
162         *qp = qops;
163         rc = 0;
164 out:
165         write_unlock(&qdisc_mod_lock);
166         return rc;
167
168 out_einval:
169         rc = -EINVAL;
170         goto out;
171 }
172 EXPORT_SYMBOL(register_qdisc);
173
174 void unregister_qdisc(struct Qdisc_ops *qops)
175 {
176         struct Qdisc_ops *q, **qp;
177         int err = -ENOENT;
178
179         write_lock(&qdisc_mod_lock);
180         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
181                 if (q == qops)
182                         break;
183         if (q) {
184                 *qp = q->next;
185                 q->next = NULL;
186                 err = 0;
187         }
188         write_unlock(&qdisc_mod_lock);
189
190         WARN(err, "unregister qdisc(%s) failed\n", qops->id);
191 }
192 EXPORT_SYMBOL(unregister_qdisc);
193
194 /* Get default qdisc if not otherwise specified */
195 void qdisc_get_default(char *name, size_t len)
196 {
197         read_lock(&qdisc_mod_lock);
198         strscpy(name, default_qdisc_ops->id, len);
199         read_unlock(&qdisc_mod_lock);
200 }
201
202 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
203 {
204         struct Qdisc_ops *q = NULL;
205
206         for (q = qdisc_base; q; q = q->next) {
207                 if (!strcmp(name, q->id)) {
208                         if (!try_module_get(q->owner))
209                                 q = NULL;
210                         break;
211                 }
212         }
213
214         return q;
215 }
216
217 /* Set new default qdisc to use */
218 int qdisc_set_default(const char *name)
219 {
220         const struct Qdisc_ops *ops;
221
222         if (!capable(CAP_NET_ADMIN))
223                 return -EPERM;
224
225         write_lock(&qdisc_mod_lock);
226         ops = qdisc_lookup_default(name);
227         if (!ops) {
228                 /* Not found, drop lock and try to load module */
229                 write_unlock(&qdisc_mod_lock);
230                 request_module("sch_%s", name);
231                 write_lock(&qdisc_mod_lock);
232
233                 ops = qdisc_lookup_default(name);
234         }
235
236         if (ops) {
237                 /* Set new default */
238                 module_put(default_qdisc_ops->owner);
239                 default_qdisc_ops = ops;
240         }
241         write_unlock(&qdisc_mod_lock);
242
243         return ops ? 0 : -ENOENT;
244 }
245
#ifdef CONFIG_NET_SCH_DEFAULT
/* Set default value from kernel config: registered as a late_initcall,
 * passes CONFIG_DEFAULT_NET_SCH to qdisc_set_default() and returns its
 * result.
 */
static int __init sch_default_qdisc(void)
{
	return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
}
late_initcall(sch_default_qdisc);
#endif
254
255 /* We know handle. Find qdisc among all qdisc's attached to device
256  * (root qdisc, all its children, children of children etc.)
257  * Note: caller either uses rtnl or rcu_read_lock()
258  */
259
260 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
261 {
262         struct Qdisc *q;
263
264         if (!qdisc_dev(root))
265                 return (root->handle == handle ? root : NULL);
266
267         if (!(root->flags & TCQ_F_BUILTIN) &&
268             root->handle == handle)
269                 return root;
270
271         hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle,
272                                    lockdep_rtnl_is_held()) {
273                 if (q->handle == handle)
274                         return q;
275         }
276         return NULL;
277 }
278
279 void qdisc_hash_add(struct Qdisc *q, bool invisible)
280 {
281         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
282                 ASSERT_RTNL();
283                 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
284                 if (invisible)
285                         q->flags |= TCQ_F_INVISIBLE;
286         }
287 }
288 EXPORT_SYMBOL(qdisc_hash_add);
289
290 void qdisc_hash_del(struct Qdisc *q)
291 {
292         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
293                 ASSERT_RTNL();
294                 hash_del_rcu(&q->hash);
295         }
296 }
297 EXPORT_SYMBOL(qdisc_hash_del);
298
299 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
300 {
301         struct Qdisc *q;
302
303         if (!handle)
304                 return NULL;
305         q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle);
306         if (q)
307                 goto out;
308
309         if (dev_ingress_queue(dev))
310                 q = qdisc_match_from_root(
311                         dev_ingress_queue(dev)->qdisc_sleeping,
312                         handle);
313 out:
314         return q;
315 }
316
317 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
318 {
319         struct netdev_queue *nq;
320         struct Qdisc *q;
321
322         if (!handle)
323                 return NULL;
324         q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle);
325         if (q)
326                 goto out;
327
328         nq = dev_ingress_queue_rcu(dev);
329         if (nq)
330                 q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
331 out:
332         return q;
333 }
334
335 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
336 {
337         unsigned long cl;
338         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
339
340         if (cops == NULL)
341                 return NULL;
342         cl = cops->find(p, classid);
343
344         if (cl == 0)
345                 return NULL;
346         return cops->leaf(p, cl);
347 }
348
349 /* Find queueing discipline by name */
350
351 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
352 {
353         struct Qdisc_ops *q = NULL;
354
355         if (kind) {
356                 read_lock(&qdisc_mod_lock);
357                 for (q = qdisc_base; q; q = q->next) {
358                         if (nla_strcmp(kind, q->id) == 0) {
359                                 if (!try_module_get(q->owner))
360                                         q = NULL;
361                                 break;
362                         }
363                 }
364                 read_unlock(&qdisc_mod_lock);
365         }
366         return q;
367 }
368
369 /* The linklayer setting was not transferred from iproute2 in older
370  * versions, and the rate table lookup systems have been dropped in
371  * the kernel. To stay backward compatible with older iproute2 tc
372  * utils, we detect the linklayer setting by detecting if the rate
373  * table was modified.
374  *
375  * For linklayer ATM table entries, the rate table will be aligned to
376  * 48 bytes, thus some table entries will contain the same value.  The
377  * mpu (min packet unit) is also encoded into the old rate table, thus
378  * starting from the mpu, we find low and high table entries for
379  * mapping this cell.  If these entries contain the same value, then
380  * the rate tables have been modified for linklayer ATM.
381  *
382  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
383  * and then roundup to the next cell, calc the table entry one below,
384  * and compare.
385  */
386 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
387 {
388         int low       = roundup(r->mpu, 48);
389         int high      = roundup(low+1, 48);
390         int cell_low  = low >> r->cell_log;
391         int cell_high = (high >> r->cell_log) - 1;
392
393         /* rtab is too inaccurate at rates > 100Mbit/s */
394         if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
395                 pr_debug("TC linklayer: Giving up ATM detection\n");
396                 return TC_LINKLAYER_ETHERNET;
397         }
398
399         if ((cell_high > cell_low) && (cell_high < 256)
400             && (rtab[cell_low] == rtab[cell_high])) {
401                 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
402                          cell_low, cell_high, rtab[cell_high]);
403                 return TC_LINKLAYER_ATM;
404         }
405         return TC_LINKLAYER_ETHERNET;
406 }
407
/* Head of the singly linked list (chained through ->next) of rate tables
 * handed out by qdisc_get_rtab() and released by qdisc_put_rtab().
 */
static struct qdisc_rate_table *qdisc_rtab_list;
409
410 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
411                                         struct nlattr *tab,
412                                         struct netlink_ext_ack *extack)
413 {
414         struct qdisc_rate_table *rtab;
415
416         if (tab == NULL || r->rate == 0 ||
417             r->cell_log == 0 || r->cell_log >= 32 ||
418             nla_len(tab) != TC_RTAB_SIZE) {
419                 NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
420                 return NULL;
421         }
422
423         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
424                 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
425                     !memcmp(&rtab->data, nla_data(tab), 1024)) {
426                         rtab->refcnt++;
427                         return rtab;
428                 }
429         }
430
431         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
432         if (rtab) {
433                 rtab->rate = *r;
434                 rtab->refcnt = 1;
435                 memcpy(rtab->data, nla_data(tab), 1024);
436                 if (r->linklayer == TC_LINKLAYER_UNAWARE)
437                         r->linklayer = __detect_linklayer(r, rtab->data);
438                 rtab->next = qdisc_rtab_list;
439                 qdisc_rtab_list = rtab;
440         } else {
441                 NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
442         }
443         return rtab;
444 }
445 EXPORT_SYMBOL(qdisc_get_rtab);
446
447 void qdisc_put_rtab(struct qdisc_rate_table *tab)
448 {
449         struct qdisc_rate_table *rtab, **rtabp;
450
451         if (!tab || --tab->refcnt)
452                 return;
453
454         for (rtabp = &qdisc_rtab_list;
455              (rtab = *rtabp) != NULL;
456              rtabp = &rtab->next) {
457                 if (rtab == tab) {
458                         *rtabp = rtab->next;
459                         kfree(rtab);
460                         return;
461                 }
462         }
463 }
464 EXPORT_SYMBOL(qdisc_put_rtab);
465
/* List of size tables created by qdisc_get_stab(); entries are removed
 * by qdisc_put_stab() when their refcnt drops to zero.
 */
static LIST_HEAD(qdisc_stab_list);

/* Netlink policy used by qdisc_get_stab() to parse the nested size table
 * attributes TCA_STAB_BASE and TCA_STAB_DATA.
 */
static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
	[TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
	[TCA_STAB_DATA] = { .type = NLA_BINARY },
};
472
473 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
474                                                struct netlink_ext_ack *extack)
475 {
476         struct nlattr *tb[TCA_STAB_MAX + 1];
477         struct qdisc_size_table *stab;
478         struct tc_sizespec *s;
479         unsigned int tsize = 0;
480         u16 *tab = NULL;
481         int err;
482
483         err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
484                                           extack);
485         if (err < 0)
486                 return ERR_PTR(err);
487         if (!tb[TCA_STAB_BASE]) {
488                 NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
489                 return ERR_PTR(-EINVAL);
490         }
491
492         s = nla_data(tb[TCA_STAB_BASE]);
493
494         if (s->tsize > 0) {
495                 if (!tb[TCA_STAB_DATA]) {
496                         NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
497                         return ERR_PTR(-EINVAL);
498                 }
499                 tab = nla_data(tb[TCA_STAB_DATA]);
500                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
501         }
502
503         if (tsize != s->tsize || (!tab && tsize > 0)) {
504                 NL_SET_ERR_MSG(extack, "Invalid size of size table");
505                 return ERR_PTR(-EINVAL);
506         }
507
508         list_for_each_entry(stab, &qdisc_stab_list, list) {
509                 if (memcmp(&stab->szopts, s, sizeof(*s)))
510                         continue;
511                 if (tsize > 0 &&
512                     memcmp(stab->data, tab, flex_array_size(stab, data, tsize)))
513                         continue;
514                 stab->refcnt++;
515                 return stab;
516         }
517
518         if (s->size_log > STAB_SIZE_LOG_MAX ||
519             s->cell_log > STAB_SIZE_LOG_MAX) {
520                 NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table");
521                 return ERR_PTR(-EINVAL);
522         }
523
524         stab = kmalloc(struct_size(stab, data, tsize), GFP_KERNEL);
525         if (!stab)
526                 return ERR_PTR(-ENOMEM);
527
528         stab->refcnt = 1;
529         stab->szopts = *s;
530         if (tsize > 0)
531                 memcpy(stab->data, tab, flex_array_size(stab, data, tsize));
532
533         list_add_tail(&stab->list, &qdisc_stab_list);
534
535         return stab;
536 }
537
538 void qdisc_put_stab(struct qdisc_size_table *tab)
539 {
540         if (!tab)
541                 return;
542
543         if (--tab->refcnt == 0) {
544                 list_del(&tab->list);
545                 kfree_rcu(tab, rcu);
546         }
547 }
548 EXPORT_SYMBOL(qdisc_put_stab);
549
550 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
551 {
552         struct nlattr *nest;
553
554         nest = nla_nest_start_noflag(skb, TCA_STAB);
555         if (nest == NULL)
556                 goto nla_put_failure;
557         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
558                 goto nla_put_failure;
559         nla_nest_end(skb, nest);
560
561         return skb->len;
562
563 nla_put_failure:
564         return -1;
565 }
566
567 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
568                                const struct qdisc_size_table *stab)
569 {
570         int pkt_len, slot;
571
572         pkt_len = skb->len + stab->szopts.overhead;
573         if (unlikely(!stab->szopts.tsize))
574                 goto out;
575
576         slot = pkt_len + stab->szopts.cell_align;
577         if (unlikely(slot < 0))
578                 slot = 0;
579
580         slot >>= stab->szopts.cell_log;
581         if (likely(slot < stab->szopts.tsize))
582                 pkt_len = stab->data[slot];
583         else
584                 pkt_len = stab->data[stab->szopts.tsize - 1] *
585                                 (slot / stab->szopts.tsize) +
586                                 stab->data[slot % stab->szopts.tsize];
587
588         pkt_len <<= stab->szopts.size_log;
589 out:
590         if (unlikely(pkt_len < 1))
591                 pkt_len = 1;
592         qdisc_skb_cb(skb)->pkt_len = pkt_len;
593 }
594 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
595
596 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
597 {
598         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
599                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
600                         txt, qdisc->ops->id, qdisc->handle >> 16);
601                 qdisc->flags |= TCQ_F_WARN_NONWC;
602         }
603 }
604 EXPORT_SYMBOL(qdisc_warn_nonwc);
605
606 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
607 {
608         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
609                                                  timer);
610
611         rcu_read_lock();
612         __netif_schedule(qdisc_root(wd->qdisc));
613         rcu_read_unlock();
614
615         return HRTIMER_NORESTART;
616 }
617
618 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
619                                  clockid_t clockid)
620 {
621         hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
622         wd->timer.function = qdisc_watchdog;
623         wd->qdisc = qdisc;
624 }
625 EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
626
/* Initialise watchdog @wd for @qdisc using CLOCK_MONOTONIC; see
 * qdisc_watchdog_init_clockid().
 */
void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
{
	qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
}
EXPORT_SYMBOL(qdisc_watchdog_init);
632
633 void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
634                                       u64 delta_ns)
635 {
636         if (test_bit(__QDISC_STATE_DEACTIVATED,
637                      &qdisc_root_sleeping(wd->qdisc)->state))
638                 return;
639
640         if (hrtimer_is_queued(&wd->timer)) {
641                 /* If timer is already set in [expires, expires + delta_ns],
642                  * do not reprogram it.
643                  */
644                 if (wd->last_expires - expires <= delta_ns)
645                         return;
646         }
647
648         wd->last_expires = expires;
649         hrtimer_start_range_ns(&wd->timer,
650                                ns_to_ktime(expires),
651                                delta_ns,
652                                HRTIMER_MODE_ABS_PINNED);
653 }
654 EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);
655
/* Cancel the watchdog's hrtimer via hrtimer_cancel(). */
void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
{
	hrtimer_cancel(&wd->timer);
}
EXPORT_SYMBOL(qdisc_watchdog_cancel);
661
662 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
663 {
664         struct hlist_head *h;
665         unsigned int i;
666
667         h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
668
669         if (h != NULL) {
670                 for (i = 0; i < n; i++)
671                         INIT_HLIST_HEAD(&h[i]);
672         }
673         return h;
674 }
675
676 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
677 {
678         struct Qdisc_class_common *cl;
679         struct hlist_node *next;
680         struct hlist_head *nhash, *ohash;
681         unsigned int nsize, nmask, osize;
682         unsigned int i, h;
683
684         /* Rehash when load factor exceeds 0.75 */
685         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
686                 return;
687         nsize = clhash->hashsize * 2;
688         nmask = nsize - 1;
689         nhash = qdisc_class_hash_alloc(nsize);
690         if (nhash == NULL)
691                 return;
692
693         ohash = clhash->hash;
694         osize = clhash->hashsize;
695
696         sch_tree_lock(sch);
697         for (i = 0; i < osize; i++) {
698                 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
699                         h = qdisc_class_hash(cl->classid, nmask);
700                         hlist_add_head(&cl->hnode, &nhash[h]);
701                 }
702         }
703         clhash->hash     = nhash;
704         clhash->hashsize = nsize;
705         clhash->hashmask = nmask;
706         sch_tree_unlock(sch);
707
708         kvfree(ohash);
709 }
710 EXPORT_SYMBOL(qdisc_class_hash_grow);
711
712 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
713 {
714         unsigned int size = 4;
715
716         clhash->hash = qdisc_class_hash_alloc(size);
717         if (!clhash->hash)
718                 return -ENOMEM;
719         clhash->hashsize  = size;
720         clhash->hashmask  = size - 1;
721         clhash->hashelems = 0;
722         return 0;
723 }
724 EXPORT_SYMBOL(qdisc_class_hash_init);
725
/* Free the bucket array of @clhash with kvfree(). */
void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
{
	kvfree(clhash->hash);
}
EXPORT_SYMBOL(qdisc_class_hash_destroy);
731
732 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
733                              struct Qdisc_class_common *cl)
734 {
735         unsigned int h;
736
737         INIT_HLIST_NODE(&cl->hnode);
738         h = qdisc_class_hash(cl->classid, clhash->hashmask);
739         hlist_add_head(&cl->hnode, &clhash->hash[h]);
740         clhash->hashelems++;
741 }
742 EXPORT_SYMBOL(qdisc_class_hash_insert);
743
/* Unlink class @cl from its hash bucket and decrement the element count
 * of @clhash.
 */
void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
			     struct Qdisc_class_common *cl)
{
	hlist_del(&cl->hnode);
	clhash->hashelems--;
}
EXPORT_SYMBOL(qdisc_class_hash_remove);
751
752 /* Allocate an unique handle from space managed by kernel
753  * Possible range is [8000-FFFF]:0000 (0x8000 values)
754  */
755 static u32 qdisc_alloc_handle(struct net_device *dev)
756 {
757         int i = 0x8000;
758         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
759
760         do {
761                 autohandle += TC_H_MAKE(0x10000U, 0);
762                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
763                         autohandle = TC_H_MAKE(0x80000000U, 0);
764                 if (!qdisc_lookup(dev, autohandle))
765                         return autohandle;
766                 cond_resched();
767         } while (--i > 0);
768
769         return 0;
770 }
771
772 void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
773 {
774         bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
775         const struct Qdisc_class_ops *cops;
776         unsigned long cl;
777         u32 parentid;
778         bool notify;
779         int drops;
780
781         if (n == 0 && len == 0)
782                 return;
783         drops = max_t(int, n, 0);
784         rcu_read_lock();
785         while ((parentid = sch->parent)) {
786                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
787                         break;
788
789                 if (sch->flags & TCQ_F_NOPARENT)
790                         break;
791                 /* Notify parent qdisc only if child qdisc becomes empty.
792                  *
793                  * If child was empty even before update then backlog
794                  * counter is screwed and we skip notification because
795                  * parent class is already passive.
796                  *
797                  * If the original child was offloaded then it is allowed
798                  * to be seem as empty, so the parent is notified anyway.
799                  */
800                 notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
801                                                        !qdisc_is_offloaded);
802                 /* TODO: perform the search on a per txq basis */
803                 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
804                 if (sch == NULL) {
805                         WARN_ON_ONCE(parentid != TC_H_ROOT);
806                         break;
807                 }
808                 cops = sch->ops->cl_ops;
809                 if (notify && cops->qlen_notify) {
810                         cl = cops->find(sch, parentid);
811                         cops->qlen_notify(sch, cl);
812                 }
813                 sch->q.qlen -= n;
814                 sch->qstats.backlog -= len;
815                 __qdisc_qstats_drop(sch, drops);
816         }
817         rcu_read_unlock();
818 }
819 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
820
821 int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
822                               void *type_data)
823 {
824         struct net_device *dev = qdisc_dev(sch);
825         int err;
826
827         sch->flags &= ~TCQ_F_OFFLOADED;
828         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
829                 return 0;
830
831         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
832         if (err == -EOPNOTSUPP)
833                 return 0;
834
835         if (!err)
836                 sch->flags |= TCQ_F_OFFLOADED;
837
838         return err;
839 }
840 EXPORT_SYMBOL(qdisc_offload_dump_helper);
841
842 void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
843                                 struct Qdisc *new, struct Qdisc *old,
844                                 enum tc_setup_type type, void *type_data,
845                                 struct netlink_ext_ack *extack)
846 {
847         bool any_qdisc_is_offloaded;
848         int err;
849
850         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
851                 return;
852
853         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
854
855         /* Don't report error if the graft is part of destroy operation. */
856         if (!err || !new || new == &noop_qdisc)
857                 return;
858
859         /* Don't report error if the parent, the old child and the new
860          * one are not offloaded.
861          */
862         any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
863         any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
864         any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
865
866         if (any_qdisc_is_offloaded)
867                 NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
868 }
869 EXPORT_SYMBOL(qdisc_offload_graft_helper);
870
871 void qdisc_offload_query_caps(struct net_device *dev,
872                               enum tc_setup_type type,
873                               void *caps, size_t caps_len)
874 {
875         const struct net_device_ops *ops = dev->netdev_ops;
876         struct tc_query_caps_base base = {
877                 .type = type,
878                 .caps = caps,
879         };
880
881         memset(caps, 0, caps_len);
882
883         if (ops->ndo_setup_tc)
884                 ops->ndo_setup_tc(dev, TC_QUERY_CAPS, &base);
885 }
886 EXPORT_SYMBOL(qdisc_offload_query_caps);
887
888 static void qdisc_offload_graft_root(struct net_device *dev,
889                                      struct Qdisc *new, struct Qdisc *old,
890                                      struct netlink_ext_ack *extack)
891 {
892         struct tc_root_qopt_offload graft_offload = {
893                 .command        = TC_ROOT_GRAFT,
894                 .handle         = new ? new->handle : 0,
895                 .ingress        = (new && new->flags & TCQ_F_INGRESS) ||
896                                   (old && old->flags & TCQ_F_INGRESS),
897         };
898
899         qdisc_offload_graft_helper(dev, NULL, new, old,
900                                    TC_SETUP_ROOT_QDISC, &graft_offload, extack);
901 }
902
903 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
904                          u32 portid, u32 seq, u16 flags, int event)
905 {
906         struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL;
907         struct gnet_stats_queue __percpu *cpu_qstats = NULL;
908         struct tcmsg *tcm;
909         struct nlmsghdr  *nlh;
910         unsigned char *b = skb_tail_pointer(skb);
911         struct gnet_dump d;
912         struct qdisc_size_table *stab;
913         u32 block_index;
914         __u32 qlen;
915
916         cond_resched();
917         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
918         if (!nlh)
919                 goto out_nlmsg_trim;
920         tcm = nlmsg_data(nlh);
921         tcm->tcm_family = AF_UNSPEC;
922         tcm->tcm__pad1 = 0;
923         tcm->tcm__pad2 = 0;
924         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
925         tcm->tcm_parent = clid;
926         tcm->tcm_handle = q->handle;
927         tcm->tcm_info = refcount_read(&q->refcnt);
928         if (nla_put_string(skb, TCA_KIND, q->ops->id))
929                 goto nla_put_failure;
930         if (q->ops->ingress_block_get) {
931                 block_index = q->ops->ingress_block_get(q);
932                 if (block_index &&
933                     nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
934                         goto nla_put_failure;
935         }
936         if (q->ops->egress_block_get) {
937                 block_index = q->ops->egress_block_get(q);
938                 if (block_index &&
939                     nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
940                         goto nla_put_failure;
941         }
942         if (q->ops->dump && q->ops->dump(q, skb) < 0)
943                 goto nla_put_failure;
944         if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
945                 goto nla_put_failure;
946         qlen = qdisc_qlen_sum(q);
947
948         stab = rtnl_dereference(q->stab);
949         if (stab && qdisc_dump_stab(skb, stab) < 0)
950                 goto nla_put_failure;
951
952         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
953                                          NULL, &d, TCA_PAD) < 0)
954                 goto nla_put_failure;
955
956         if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
957                 goto nla_put_failure;
958
959         if (qdisc_is_percpu_stats(q)) {
960                 cpu_bstats = q->cpu_bstats;
961                 cpu_qstats = q->cpu_qstats;
962         }
963
964         if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats, true) < 0 ||
965             gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
966             gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
967                 goto nla_put_failure;
968
969         if (gnet_stats_finish_copy(&d) < 0)
970                 goto nla_put_failure;
971
972         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
973         return skb->len;
974
975 out_nlmsg_trim:
976 nla_put_failure:
977         nlmsg_trim(skb, b);
978         return -1;
979 }
980
981 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
982 {
983         if (q->flags & TCQ_F_BUILTIN)
984                 return true;
985         if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
986                 return true;
987
988         return false;
989 }
990
991 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
992                         struct nlmsghdr *n, u32 clid,
993                         struct Qdisc *old, struct Qdisc *new)
994 {
995         struct sk_buff *skb;
996         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
997
998         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
999         if (!skb)
1000                 return -ENOBUFS;
1001
1002         if (old && !tc_qdisc_dump_ignore(old, false)) {
1003                 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
1004                                   0, RTM_DELQDISC) < 0)
1005                         goto err_out;
1006         }
1007         if (new && !tc_qdisc_dump_ignore(new, false)) {
1008                 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
1009                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
1010                         goto err_out;
1011         }
1012
1013         if (skb->len)
1014                 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1015                                       n->nlmsg_flags & NLM_F_ECHO);
1016
1017 err_out:
1018         kfree_skb(skb);
1019         return -EINVAL;
1020 }
1021
1022 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
1023                                struct nlmsghdr *n, u32 clid,
1024                                struct Qdisc *old, struct Qdisc *new)
1025 {
1026         if (new || old)
1027                 qdisc_notify(net, skb, n, clid, old, new);
1028
1029         if (old)
1030                 qdisc_put(old);
1031 }
1032
1033 static void qdisc_clear_nolock(struct Qdisc *sch)
1034 {
1035         sch->flags &= ~TCQ_F_NOLOCK;
1036         if (!(sch->flags & TCQ_F_CPUSTATS))
1037                 return;
1038
1039         free_percpu(sch->cpu_bstats);
1040         free_percpu(sch->cpu_qstats);
1041         sch->cpu_bstats = NULL;
1042         sch->cpu_qstats = NULL;
1043         sch->flags &= ~TCQ_F_CPUSTATS;
1044 }
1045
1046 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
1047  * to device "dev".
1048  *
1049  * When appropriate send a netlink notification using 'skb'
1050  * and "n".
1051  *
1052  * On success, destroy old qdisc.
1053  */
1054
1055 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
1056                        struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
1057                        struct Qdisc *new, struct Qdisc *old,
1058                        struct netlink_ext_ack *extack)
1059 {
1060         struct Qdisc *q = old;
1061         struct net *net = dev_net(dev);
1062
1063         if (parent == NULL) {
1064                 unsigned int i, num_q, ingress;
1065
1066                 ingress = 0;
1067                 num_q = dev->num_tx_queues;
1068                 if ((q && q->flags & TCQ_F_INGRESS) ||
1069                     (new && new->flags & TCQ_F_INGRESS)) {
1070                         num_q = 1;
1071                         ingress = 1;
1072                         if (!dev_ingress_queue(dev)) {
1073                                 NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
1074                                 return -ENOENT;
1075                         }
1076                 }
1077
1078                 if (dev->flags & IFF_UP)
1079                         dev_deactivate(dev);
1080
1081                 qdisc_offload_graft_root(dev, new, old, extack);
1082
1083                 if (new && new->ops->attach && !ingress)
1084                         goto skip;
1085
1086                 for (i = 0; i < num_q; i++) {
1087                         struct netdev_queue *dev_queue = dev_ingress_queue(dev);
1088
1089                         if (!ingress)
1090                                 dev_queue = netdev_get_tx_queue(dev, i);
1091
1092                         old = dev_graft_qdisc(dev_queue, new);
1093                         if (new && i > 0)
1094                                 qdisc_refcount_inc(new);
1095
1096                         if (!ingress)
1097                                 qdisc_put(old);
1098                 }
1099
1100 skip:
1101                 if (!ingress) {
1102                         old = rtnl_dereference(dev->qdisc);
1103                         if (new && !new->ops->attach)
1104                                 qdisc_refcount_inc(new);
1105                         rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc);
1106
1107                         notify_and_destroy(net, skb, n, classid, old, new);
1108
1109                         if (new && new->ops->attach)
1110                                 new->ops->attach(new);
1111                 } else {
1112                         notify_and_destroy(net, skb, n, classid, old, new);
1113                 }
1114
1115                 if (dev->flags & IFF_UP)
1116                         dev_activate(dev);
1117         } else {
1118                 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
1119                 unsigned long cl;
1120                 int err;
1121
1122                 /* Only support running class lockless if parent is lockless */
1123                 if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK))
1124                         qdisc_clear_nolock(new);
1125
1126                 if (!cops || !cops->graft)
1127                         return -EOPNOTSUPP;
1128
1129                 cl = cops->find(parent, classid);
1130                 if (!cl) {
1131                         NL_SET_ERR_MSG(extack, "Specified class not found");
1132                         return -ENOENT;
1133                 }
1134
1135                 if (new && new->ops == &noqueue_qdisc_ops) {
1136                         NL_SET_ERR_MSG(extack, "Cannot assign noqueue to a class");
1137                         return -EINVAL;
1138                 }
1139
1140                 err = cops->graft(parent, cl, new, &old, extack);
1141                 if (err)
1142                         return err;
1143                 notify_and_destroy(net, skb, n, classid, old, new);
1144         }
1145         return 0;
1146 }
1147
1148 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1149                                    struct netlink_ext_ack *extack)
1150 {
1151         u32 block_index;
1152
1153         if (tca[TCA_INGRESS_BLOCK]) {
1154                 block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1155
1156                 if (!block_index) {
1157                         NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1158                         return -EINVAL;
1159                 }
1160                 if (!sch->ops->ingress_block_set) {
1161                         NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1162                         return -EOPNOTSUPP;
1163                 }
1164                 sch->ops->ingress_block_set(sch, block_index);
1165         }
1166         if (tca[TCA_EGRESS_BLOCK]) {
1167                 block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1168
1169                 if (!block_index) {
1170                         NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1171                         return -EINVAL;
1172                 }
1173                 if (!sch->ops->egress_block_set) {
1174                         NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1175                         return -EOPNOTSUPP;
1176                 }
1177                 sch->ops->egress_block_set(sch, block_index);
1178         }
1179         return 0;
1180 }
1181
1182 /*
1183    Allocate and initialize new qdisc.
1184
1185    Parameters are passed via opt.
1186  */
1187
1188 static struct Qdisc *qdisc_create(struct net_device *dev,
1189                                   struct netdev_queue *dev_queue,
1190                                   u32 parent, u32 handle,
1191                                   struct nlattr **tca, int *errp,
1192                                   struct netlink_ext_ack *extack)
1193 {
1194         int err;
1195         struct nlattr *kind = tca[TCA_KIND];
1196         struct Qdisc *sch;
1197         struct Qdisc_ops *ops;
1198         struct qdisc_size_table *stab;
1199
1200         ops = qdisc_lookup_ops(kind);
1201 #ifdef CONFIG_MODULES
1202         if (ops == NULL && kind != NULL) {
1203                 char name[IFNAMSIZ];
1204                 if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) {
1205                         /* We dropped the RTNL semaphore in order to
1206                          * perform the module load.  So, even if we
1207                          * succeeded in loading the module we have to
1208                          * tell the caller to replay the request.  We
1209                          * indicate this using -EAGAIN.
1210                          * We replay the request because the device may
1211                          * go away in the mean time.
1212                          */
1213                         rtnl_unlock();
1214                         request_module("sch_%s", name);
1215                         rtnl_lock();
1216                         ops = qdisc_lookup_ops(kind);
1217                         if (ops != NULL) {
1218                                 /* We will try again qdisc_lookup_ops,
1219                                  * so don't keep a reference.
1220                                  */
1221                                 module_put(ops->owner);
1222                                 err = -EAGAIN;
1223                                 goto err_out;
1224                         }
1225                 }
1226         }
1227 #endif
1228
1229         err = -ENOENT;
1230         if (!ops) {
1231                 NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown");
1232                 goto err_out;
1233         }
1234
1235         sch = qdisc_alloc(dev_queue, ops, extack);
1236         if (IS_ERR(sch)) {
1237                 err = PTR_ERR(sch);
1238                 goto err_out2;
1239         }
1240
1241         sch->parent = parent;
1242
1243         if (handle == TC_H_INGRESS) {
1244                 sch->flags |= TCQ_F_INGRESS;
1245                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
1246         } else {
1247                 if (handle == 0) {
1248                         handle = qdisc_alloc_handle(dev);
1249                         if (handle == 0) {
1250                                 NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
1251                                 err = -ENOSPC;
1252                                 goto err_out3;
1253                         }
1254                 }
1255                 if (!netif_is_multiqueue(dev))
1256                         sch->flags |= TCQ_F_ONETXQUEUE;
1257         }
1258
1259         sch->handle = handle;
1260
1261         /* This exist to keep backward compatible with a userspace
1262          * loophole, what allowed userspace to get IFF_NO_QUEUE
1263          * facility on older kernels by setting tx_queue_len=0 (prior
1264          * to qdisc init), and then forgot to reinit tx_queue_len
1265          * before again attaching a qdisc.
1266          */
1267         if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
1268                 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
1269                 netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
1270         }
1271
1272         err = qdisc_block_indexes_set(sch, tca, extack);
1273         if (err)
1274                 goto err_out3;
1275
1276         if (ops->init) {
1277                 err = ops->init(sch, tca[TCA_OPTIONS], extack);
1278                 if (err != 0)
1279                         goto err_out5;
1280         }
1281
1282         if (tca[TCA_STAB]) {
1283                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1284                 if (IS_ERR(stab)) {
1285                         err = PTR_ERR(stab);
1286                         goto err_out4;
1287                 }
1288                 rcu_assign_pointer(sch->stab, stab);
1289         }
1290         if (tca[TCA_RATE]) {
1291                 err = -EOPNOTSUPP;
1292                 if (sch->flags & TCQ_F_MQROOT) {
1293                         NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
1294                         goto err_out4;
1295                 }
1296
1297                 err = gen_new_estimator(&sch->bstats,
1298                                         sch->cpu_bstats,
1299                                         &sch->rate_est,
1300                                         NULL,
1301                                         true,
1302                                         tca[TCA_RATE]);
1303                 if (err) {
1304                         NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
1305                         goto err_out4;
1306                 }
1307         }
1308
1309         qdisc_hash_add(sch, false);
1310         trace_qdisc_create(ops, dev, parent);
1311
1312         return sch;
1313
1314 err_out5:
1315         /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
1316         if (ops->destroy)
1317                 ops->destroy(sch);
1318 err_out3:
1319         netdev_put(dev, &sch->dev_tracker);
1320         qdisc_free(sch);
1321 err_out2:
1322         module_put(ops->owner);
1323 err_out:
1324         *errp = err;
1325         return NULL;
1326
1327 err_out4:
1328         /*
1329          * Any broken qdiscs that would require a ops->reset() here?
1330          * The qdisc was never in action so it shouldn't be necessary.
1331          */
1332         qdisc_put_stab(rtnl_dereference(sch->stab));
1333         if (ops->destroy)
1334                 ops->destroy(sch);
1335         goto err_out3;
1336 }
1337
1338 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1339                         struct netlink_ext_ack *extack)
1340 {
1341         struct qdisc_size_table *ostab, *stab = NULL;
1342         int err = 0;
1343
1344         if (tca[TCA_OPTIONS]) {
1345                 if (!sch->ops->change) {
1346                         NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1347                         return -EINVAL;
1348                 }
1349                 if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1350                         NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1351                         return -EOPNOTSUPP;
1352                 }
1353                 err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1354                 if (err)
1355                         return err;
1356         }
1357
1358         if (tca[TCA_STAB]) {
1359                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1360                 if (IS_ERR(stab))
1361                         return PTR_ERR(stab);
1362         }
1363
1364         ostab = rtnl_dereference(sch->stab);
1365         rcu_assign_pointer(sch->stab, stab);
1366         qdisc_put_stab(ostab);
1367
1368         if (tca[TCA_RATE]) {
1369                 /* NB: ignores errors from replace_estimator
1370                    because change can't be undone. */
1371                 if (sch->flags & TCQ_F_MQROOT)
1372                         goto out;
1373                 gen_replace_estimator(&sch->bstats,
1374                                       sch->cpu_bstats,
1375                                       &sch->rate_est,
1376                                       NULL,
1377                                       true,
1378                                       tca[TCA_RATE]);
1379         }
1380 out:
1381         return 0;
1382 }
1383
1384 struct check_loop_arg {
1385         struct qdisc_walker     w;
1386         struct Qdisc            *p;
1387         int                     depth;
1388 };
1389
1390 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1391                          struct qdisc_walker *w);
1392
1393 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1394 {
1395         struct check_loop_arg   arg;
1396
1397         if (q->ops->cl_ops == NULL)
1398                 return 0;
1399
1400         arg.w.stop = arg.w.skip = arg.w.count = 0;
1401         arg.w.fn = check_loop_fn;
1402         arg.depth = depth;
1403         arg.p = p;
1404         q->ops->cl_ops->walk(q, &arg.w);
1405         return arg.w.stop ? -ELOOP : 0;
1406 }
1407
1408 static int
1409 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1410 {
1411         struct Qdisc *leaf;
1412         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1413         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1414
1415         leaf = cops->leaf(q, cl);
1416         if (leaf) {
1417                 if (leaf == arg->p || arg->depth > 7)
1418                         return -ELOOP;
1419                 return check_loop(leaf, arg->p, arg->depth + 1);
1420         }
1421         return 0;
1422 }
1423
1424 const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
1425         [TCA_KIND]              = { .type = NLA_STRING },
1426         [TCA_RATE]              = { .type = NLA_BINARY,
1427                                     .len = sizeof(struct tc_estimator) },
1428         [TCA_STAB]              = { .type = NLA_NESTED },
1429         [TCA_DUMP_INVISIBLE]    = { .type = NLA_FLAG },
1430         [TCA_CHAIN]             = { .type = NLA_U32 },
1431         [TCA_INGRESS_BLOCK]     = { .type = NLA_U32 },
1432         [TCA_EGRESS_BLOCK]      = { .type = NLA_U32 },
1433 };
1434
1435 /*
1436  * Delete/get qdisc.
1437  */
1438
1439 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1440                         struct netlink_ext_ack *extack)
1441 {
1442         struct net *net = sock_net(skb->sk);
1443         struct tcmsg *tcm = nlmsg_data(n);
1444         struct nlattr *tca[TCA_MAX + 1];
1445         struct net_device *dev;
1446         u32 clid;
1447         struct Qdisc *q = NULL;
1448         struct Qdisc *p = NULL;
1449         int err;
1450
1451         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1452                                      rtm_tca_policy, extack);
1453         if (err < 0)
1454                 return err;
1455
1456         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1457         if (!dev)
1458                 return -ENODEV;
1459
1460         clid = tcm->tcm_parent;
1461         if (clid) {
1462                 if (clid != TC_H_ROOT) {
1463                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1464                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1465                                 if (!p) {
1466                                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1467                                         return -ENOENT;
1468                                 }
1469                                 q = qdisc_leaf(p, clid);
1470                         } else if (dev_ingress_queue(dev)) {
1471                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1472                         }
1473                 } else {
1474                         q = rtnl_dereference(dev->qdisc);
1475                 }
1476                 if (!q) {
1477                         NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1478                         return -ENOENT;
1479                 }
1480
1481                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1482                         NL_SET_ERR_MSG(extack, "Invalid handle");
1483                         return -EINVAL;
1484                 }
1485         } else {
1486                 q = qdisc_lookup(dev, tcm->tcm_handle);
1487                 if (!q) {
1488                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1489                         return -ENOENT;
1490                 }
1491         }
1492
1493         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1494                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1495                 return -EINVAL;
1496         }
1497
1498         if (n->nlmsg_type == RTM_DELQDISC) {
1499                 if (!clid) {
1500                         NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1501                         return -EINVAL;
1502                 }
1503                 if (q->handle == 0) {
1504                         NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1505                         return -ENOENT;
1506                 }
1507                 err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1508                 if (err != 0)
1509                         return err;
1510         } else {
1511                 qdisc_notify(net, skb, n, clid, NULL, q);
1512         }
1513         return 0;
1514 }
1515
1516 /*
1517  * Create/change qdisc.
1518  */
1519
1520 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1521                            struct netlink_ext_ack *extack)
1522 {
1523         struct net *net = sock_net(skb->sk);
1524         struct tcmsg *tcm;
1525         struct nlattr *tca[TCA_MAX + 1];
1526         struct net_device *dev;
1527         u32 clid;
1528         struct Qdisc *q, *p;
1529         int err;
1530
1531 replay:
1532         /* Reinit, just in case something touches this. */
1533         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1534                                      rtm_tca_policy, extack);
1535         if (err < 0)
1536                 return err;
1537
1538         tcm = nlmsg_data(n);
1539         clid = tcm->tcm_parent;
1540         q = p = NULL;
1541
1542         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1543         if (!dev)
1544                 return -ENODEV;
1545
1546
1547         if (clid) {
1548                 if (clid != TC_H_ROOT) {
1549                         if (clid != TC_H_INGRESS) {
1550                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1551                                 if (!p) {
1552                                         NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1553                                         return -ENOENT;
1554                                 }
1555                                 q = qdisc_leaf(p, clid);
1556                         } else if (dev_ingress_queue_create(dev)) {
1557                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1558                         }
1559                 } else {
1560                         q = rtnl_dereference(dev->qdisc);
1561                 }
1562
1563                 /* It may be default qdisc, ignore it */
1564                 if (q && q->handle == 0)
1565                         q = NULL;
1566
1567                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1568                         if (tcm->tcm_handle) {
1569                                 if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1570                                         NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1571                                         return -EEXIST;
1572                                 }
1573                                 if (TC_H_MIN(tcm->tcm_handle)) {
1574                                         NL_SET_ERR_MSG(extack, "Invalid minor handle");
1575                                         return -EINVAL;
1576                                 }
1577                                 q = qdisc_lookup(dev, tcm->tcm_handle);
1578                                 if (!q)
1579                                         goto create_n_graft;
1580                                 if (n->nlmsg_flags & NLM_F_EXCL) {
1581                                         NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1582                                         return -EEXIST;
1583                                 }
1584                                 if (tca[TCA_KIND] &&
1585                                     nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1586                                         NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1587                                         return -EINVAL;
1588                                 }
1589                                 if (q == p ||
1590                                     (p && check_loop(q, p, 0))) {
1591                                         NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1592                                         return -ELOOP;
1593                                 }
1594                                 qdisc_refcount_inc(q);
1595                                 goto graft;
1596                         } else {
1597                                 if (!q)
1598                                         goto create_n_graft;
1599
1600                                 /* This magic test requires explanation.
1601                                  *
1602                                  *   We know, that some child q is already
1603                                  *   attached to this parent and have choice:
1604                                  *   either to change it or to create/graft new one.
1605                                  *
1606                                  *   1. We are allowed to create/graft only
1607                                  *   if CREATE and REPLACE flags are set.
1608                                  *
1609                                  *   2. If EXCL is set, requestor wanted to say,
1610                                  *   that qdisc tcm_handle is not expected
1611                                  *   to exist, so that we choose create/graft too.
1612                                  *
1613                                  *   3. The last case is when no flags are set.
1614                                  *   Alas, it is sort of hole in API, we
1615                                  *   cannot decide what to do unambiguously.
1616                                  *   For now we select create/graft, if
1617                                  *   user gave KIND, which does not match existing.
1618                                  */
1619                                 if ((n->nlmsg_flags & NLM_F_CREATE) &&
1620                                     (n->nlmsg_flags & NLM_F_REPLACE) &&
1621                                     ((n->nlmsg_flags & NLM_F_EXCL) ||
1622                                      (tca[TCA_KIND] &&
1623                                       nla_strcmp(tca[TCA_KIND], q->ops->id))))
1624                                         goto create_n_graft;
1625                         }
1626                 }
1627         } else {
1628                 if (!tcm->tcm_handle) {
1629                         NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1630                         return -EINVAL;
1631                 }
1632                 q = qdisc_lookup(dev, tcm->tcm_handle);
1633         }
1634
1635         /* Change qdisc parameters */
1636         if (!q) {
1637                 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1638                 return -ENOENT;
1639         }
1640         if (n->nlmsg_flags & NLM_F_EXCL) {
1641                 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1642                 return -EEXIST;
1643         }
1644         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1645                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1646                 return -EINVAL;
1647         }
1648         err = qdisc_change(q, tca, extack);
1649         if (err == 0)
1650                 qdisc_notify(net, skb, n, clid, NULL, q);
1651         return err;
1652
1653 create_n_graft:
1654         if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1655                 NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1656                 return -ENOENT;
1657         }
1658         if (clid == TC_H_INGRESS) {
1659                 if (dev_ingress_queue(dev)) {
1660                         q = qdisc_create(dev, dev_ingress_queue(dev),
1661                                          tcm->tcm_parent, tcm->tcm_parent,
1662                                          tca, &err, extack);
1663                 } else {
1664                         NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1665                         err = -ENOENT;
1666                 }
1667         } else {
1668                 struct netdev_queue *dev_queue;
1669
1670                 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1671                         dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1672                 else if (p)
1673                         dev_queue = p->dev_queue;
1674                 else
1675                         dev_queue = netdev_get_tx_queue(dev, 0);
1676
1677                 q = qdisc_create(dev, dev_queue,
1678                                  tcm->tcm_parent, tcm->tcm_handle,
1679                                  tca, &err, extack);
1680         }
1681         if (q == NULL) {
1682                 if (err == -EAGAIN)
1683                         goto replay;
1684                 return err;
1685         }
1686
1687 graft:
1688         err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1689         if (err) {
1690                 if (q)
1691                         qdisc_put(q);
1692                 return err;
1693         }
1694
1695         return 0;
1696 }
1697
1698 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1699                               struct netlink_callback *cb,
1700                               int *q_idx_p, int s_q_idx, bool recur,
1701                               bool dump_invisible)
1702 {
1703         int ret = 0, q_idx = *q_idx_p;
1704         struct Qdisc *q;
1705         int b;
1706
1707         if (!root)
1708                 return 0;
1709
1710         q = root;
1711         if (q_idx < s_q_idx) {
1712                 q_idx++;
1713         } else {
1714                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1715                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1716                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1717                                   RTM_NEWQDISC) <= 0)
1718                         goto done;
1719                 q_idx++;
1720         }
1721
1722         /* If dumping singletons, there is no qdisc_dev(root) and the singleton
1723          * itself has already been dumped.
1724          *
1725          * If we've already dumped the top-level (ingress) qdisc above and the global
1726          * qdisc hashtable, we don't want to hit it again
1727          */
1728         if (!qdisc_dev(root) || !recur)
1729                 goto out;
1730
1731         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1732                 if (q_idx < s_q_idx) {
1733                         q_idx++;
1734                         continue;
1735                 }
1736                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1737                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1738                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1739                                   RTM_NEWQDISC) <= 0)
1740                         goto done;
1741                 q_idx++;
1742         }
1743
1744 out:
1745         *q_idx_p = q_idx;
1746         return ret;
1747 done:
1748         ret = -1;
1749         goto out;
1750 }
1751
1752 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1753 {
1754         struct net *net = sock_net(skb->sk);
1755         int idx, q_idx;
1756         int s_idx, s_q_idx;
1757         struct net_device *dev;
1758         const struct nlmsghdr *nlh = cb->nlh;
1759         struct nlattr *tca[TCA_MAX + 1];
1760         int err;
1761
1762         s_idx = cb->args[0];
1763         s_q_idx = q_idx = cb->args[1];
1764
1765         idx = 0;
1766         ASSERT_RTNL();
1767
1768         err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
1769                                      rtm_tca_policy, cb->extack);
1770         if (err < 0)
1771                 return err;
1772
1773         for_each_netdev(net, dev) {
1774                 struct netdev_queue *dev_queue;
1775
1776                 if (idx < s_idx)
1777                         goto cont;
1778                 if (idx > s_idx)
1779                         s_q_idx = 0;
1780                 q_idx = 0;
1781
1782                 if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc),
1783                                        skb, cb, &q_idx, s_q_idx,
1784                                        true, tca[TCA_DUMP_INVISIBLE]) < 0)
1785                         goto done;
1786
1787                 dev_queue = dev_ingress_queue(dev);
1788                 if (dev_queue &&
1789                     tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1790                                        &q_idx, s_q_idx, false,
1791                                        tca[TCA_DUMP_INVISIBLE]) < 0)
1792                         goto done;
1793
1794 cont:
1795                 idx++;
1796         }
1797
1798 done:
1799         cb->args[0] = idx;
1800         cb->args[1] = q_idx;
1801
1802         return skb->len;
1803 }
1804
1805
1806
1807 /************************************************
1808  *      Traffic classes manipulation.           *
1809  ************************************************/
1810
1811 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1812                           unsigned long cl,
1813                           u32 portid, u32 seq, u16 flags, int event)
1814 {
1815         struct tcmsg *tcm;
1816         struct nlmsghdr  *nlh;
1817         unsigned char *b = skb_tail_pointer(skb);
1818         struct gnet_dump d;
1819         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1820
1821         cond_resched();
1822         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1823         if (!nlh)
1824                 goto out_nlmsg_trim;
1825         tcm = nlmsg_data(nlh);
1826         tcm->tcm_family = AF_UNSPEC;
1827         tcm->tcm__pad1 = 0;
1828         tcm->tcm__pad2 = 0;
1829         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1830         tcm->tcm_parent = q->handle;
1831         tcm->tcm_handle = q->handle;
1832         tcm->tcm_info = 0;
1833         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1834                 goto nla_put_failure;
1835         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1836                 goto nla_put_failure;
1837
1838         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1839                                          NULL, &d, TCA_PAD) < 0)
1840                 goto nla_put_failure;
1841
1842         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1843                 goto nla_put_failure;
1844
1845         if (gnet_stats_finish_copy(&d) < 0)
1846                 goto nla_put_failure;
1847
1848         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1849         return skb->len;
1850
1851 out_nlmsg_trim:
1852 nla_put_failure:
1853         nlmsg_trim(skb, b);
1854         return -1;
1855 }
1856
1857 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1858                          struct nlmsghdr *n, struct Qdisc *q,
1859                          unsigned long cl, int event)
1860 {
1861         struct sk_buff *skb;
1862         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1863
1864         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1865         if (!skb)
1866                 return -ENOBUFS;
1867
1868         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1869                 kfree_skb(skb);
1870                 return -EINVAL;
1871         }
1872
1873         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1874                               n->nlmsg_flags & NLM_F_ECHO);
1875 }
1876
1877 static int tclass_del_notify(struct net *net,
1878                              const struct Qdisc_class_ops *cops,
1879                              struct sk_buff *oskb, struct nlmsghdr *n,
1880                              struct Qdisc *q, unsigned long cl,
1881                              struct netlink_ext_ack *extack)
1882 {
1883         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1884         struct sk_buff *skb;
1885         int err = 0;
1886
1887         if (!cops->delete)
1888                 return -EOPNOTSUPP;
1889
1890         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1891         if (!skb)
1892                 return -ENOBUFS;
1893
1894         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1895                            RTM_DELTCLASS) < 0) {
1896                 kfree_skb(skb);
1897                 return -EINVAL;
1898         }
1899
1900         err = cops->delete(q, cl, extack);
1901         if (err) {
1902                 kfree_skb(skb);
1903                 return err;
1904         }
1905
1906         err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1907                              n->nlmsg_flags & NLM_F_ECHO);
1908         return err;
1909 }
1910
1911 #ifdef CONFIG_NET_CLS
1912
1913 struct tcf_bind_args {
1914         struct tcf_walker w;
1915         unsigned long base;
1916         unsigned long cl;
1917         u32 classid;
1918 };
1919
1920 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1921 {
1922         struct tcf_bind_args *a = (void *)arg;
1923
1924         if (n && tp->ops->bind_class) {
1925                 struct Qdisc *q = tcf_block_q(tp->chain->block);
1926
1927                 sch_tree_lock(q);
1928                 tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
1929                 sch_tree_unlock(q);
1930         }
1931         return 0;
1932 }
1933
1934 struct tc_bind_class_args {
1935         struct qdisc_walker w;
1936         unsigned long new_cl;
1937         u32 portid;
1938         u32 clid;
1939 };
1940
1941 static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
1942                                 struct qdisc_walker *w)
1943 {
1944         struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
1945         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1946         struct tcf_block *block;
1947         struct tcf_chain *chain;
1948
1949         block = cops->tcf_block(q, cl, NULL);
1950         if (!block)
1951                 return 0;
1952         for (chain = tcf_get_next_chain(block, NULL);
1953              chain;
1954              chain = tcf_get_next_chain(block, chain)) {
1955                 struct tcf_proto *tp;
1956
1957                 for (tp = tcf_get_next_proto(chain, NULL);
1958                      tp; tp = tcf_get_next_proto(chain, tp)) {
1959                         struct tcf_bind_args arg = {};
1960
1961                         arg.w.fn = tcf_node_bind;
1962                         arg.classid = a->clid;
1963                         arg.base = cl;
1964                         arg.cl = a->new_cl;
1965                         tp->ops->walk(tp, &arg.w, true);
1966                 }
1967         }
1968
1969         return 0;
1970 }
1971
1972 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1973                            unsigned long new_cl)
1974 {
1975         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1976         struct tc_bind_class_args args = {};
1977
1978         if (!cops->tcf_block)
1979                 return;
1980         args.portid = portid;
1981         args.clid = clid;
1982         args.new_cl = new_cl;
1983         args.w.fn = tc_bind_class_walker;
1984         q->ops->cl_ops->walk(q, &args.w);
1985 }
1986
1987 #else
1988
1989 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
1990                            unsigned long new_cl)
1991 {
1992 }
1993
1994 #endif
1995
1996 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
1997                          struct netlink_ext_ack *extack)
1998 {
1999         struct net *net = sock_net(skb->sk);
2000         struct tcmsg *tcm = nlmsg_data(n);
2001         struct nlattr *tca[TCA_MAX + 1];
2002         struct net_device *dev;
2003         struct Qdisc *q = NULL;
2004         const struct Qdisc_class_ops *cops;
2005         unsigned long cl = 0;
2006         unsigned long new_cl;
2007         u32 portid;
2008         u32 clid;
2009         u32 qid;
2010         int err;
2011
2012         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
2013                                      rtm_tca_policy, extack);
2014         if (err < 0)
2015                 return err;
2016
2017         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
2018         if (!dev)
2019                 return -ENODEV;
2020
2021         /*
2022            parent == TC_H_UNSPEC - unspecified parent.
2023            parent == TC_H_ROOT   - class is root, which has no parent.
2024            parent == X:0         - parent is root class.
2025            parent == X:Y         - parent is a node in hierarchy.
2026            parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
2027
2028            handle == 0:0         - generate handle from kernel pool.
2029            handle == 0:Y         - class is X:Y, where X:0 is qdisc.
2030            handle == X:Y         - clear.
2031            handle == X:0         - root class.
2032          */
2033
2034         /* Step 1. Determine qdisc handle X:0 */
2035
2036         portid = tcm->tcm_parent;
2037         clid = tcm->tcm_handle;
2038         qid = TC_H_MAJ(clid);
2039
2040         if (portid != TC_H_ROOT) {
2041                 u32 qid1 = TC_H_MAJ(portid);
2042
2043                 if (qid && qid1) {
2044                         /* If both majors are known, they must be identical. */
2045                         if (qid != qid1)
2046                                 return -EINVAL;
2047                 } else if (qid1) {
2048                         qid = qid1;
2049                 } else if (qid == 0)
2050                         qid = rtnl_dereference(dev->qdisc)->handle;
2051
2052                 /* Now qid is genuine qdisc handle consistent
2053                  * both with parent and child.
2054                  *
2055                  * TC_H_MAJ(portid) still may be unspecified, complete it now.
2056                  */
2057                 if (portid)
2058                         portid = TC_H_MAKE(qid, portid);
2059         } else {
2060                 if (qid == 0)
2061                         qid = rtnl_dereference(dev->qdisc)->handle;
2062         }
2063
2064         /* OK. Locate qdisc */
2065         q = qdisc_lookup(dev, qid);
2066         if (!q)
2067                 return -ENOENT;
2068
2069         /* An check that it supports classes */
2070         cops = q->ops->cl_ops;
2071         if (cops == NULL)
2072                 return -EINVAL;
2073
2074         /* Now try to get class */
2075         if (clid == 0) {
2076                 if (portid == TC_H_ROOT)
2077                         clid = qid;
2078         } else
2079                 clid = TC_H_MAKE(qid, clid);
2080
2081         if (clid)
2082                 cl = cops->find(q, clid);
2083
2084         if (cl == 0) {
2085                 err = -ENOENT;
2086                 if (n->nlmsg_type != RTM_NEWTCLASS ||
2087                     !(n->nlmsg_flags & NLM_F_CREATE))
2088                         goto out;
2089         } else {
2090                 switch (n->nlmsg_type) {
2091                 case RTM_NEWTCLASS:
2092                         err = -EEXIST;
2093                         if (n->nlmsg_flags & NLM_F_EXCL)
2094                                 goto out;
2095                         break;
2096                 case RTM_DELTCLASS:
2097                         err = tclass_del_notify(net, cops, skb, n, q, cl, extack);
2098                         /* Unbind the class with flilters with 0 */
2099                         tc_bind_tclass(q, portid, clid, 0);
2100                         goto out;
2101                 case RTM_GETTCLASS:
2102                         err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
2103                         goto out;
2104                 default:
2105                         err = -EINVAL;
2106                         goto out;
2107                 }
2108         }
2109
2110         if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
2111                 NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
2112                 return -EOPNOTSUPP;
2113         }
2114
2115         new_cl = cl;
2116         err = -EOPNOTSUPP;
2117         if (cops->change)
2118                 err = cops->change(q, clid, portid, tca, &new_cl, extack);
2119         if (err == 0) {
2120                 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
2121                 /* We just create a new class, need to do reverse binding. */
2122                 if (cl != new_cl)
2123                         tc_bind_tclass(q, portid, clid, new_cl);
2124         }
2125 out:
2126         return err;
2127 }
2128
2129 struct qdisc_dump_args {
2130         struct qdisc_walker     w;
2131         struct sk_buff          *skb;
2132         struct netlink_callback *cb;
2133 };
2134
2135 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2136                             struct qdisc_walker *arg)
2137 {
2138         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2139
2140         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2141                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2142                               RTM_NEWTCLASS);
2143 }
2144
2145 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
2146                                 struct tcmsg *tcm, struct netlink_callback *cb,
2147                                 int *t_p, int s_t)
2148 {
2149         struct qdisc_dump_args arg;
2150
2151         if (tc_qdisc_dump_ignore(q, false) ||
2152             *t_p < s_t || !q->ops->cl_ops ||
2153             (tcm->tcm_parent &&
2154              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
2155                 (*t_p)++;
2156                 return 0;
2157         }
2158         if (*t_p > s_t)
2159                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
2160         arg.w.fn = qdisc_class_dump;
2161         arg.skb = skb;
2162         arg.cb = cb;
2163         arg.w.stop  = 0;
2164         arg.w.skip = cb->args[1];
2165         arg.w.count = 0;
2166         q->ops->cl_ops->walk(q, &arg.w);
2167         cb->args[1] = arg.w.count;
2168         if (arg.w.stop)
2169                 return -1;
2170         (*t_p)++;
2171         return 0;
2172 }
2173
2174 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2175                                struct tcmsg *tcm, struct netlink_callback *cb,
2176                                int *t_p, int s_t, bool recur)
2177 {
2178         struct Qdisc *q;
2179         int b;
2180
2181         if (!root)
2182                 return 0;
2183
2184         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2185                 return -1;
2186
2187         if (!qdisc_dev(root) || !recur)
2188                 return 0;
2189
2190         if (tcm->tcm_parent) {
2191                 q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2192                 if (q && q != root &&
2193                     tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2194                         return -1;
2195                 return 0;
2196         }
2197         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2198                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2199                         return -1;
2200         }
2201
2202         return 0;
2203 }
2204
2205 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2206 {
2207         struct tcmsg *tcm = nlmsg_data(cb->nlh);
2208         struct net *net = sock_net(skb->sk);
2209         struct netdev_queue *dev_queue;
2210         struct net_device *dev;
2211         int t, s_t;
2212
2213         if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2214                 return 0;
2215         dev = dev_get_by_index(net, tcm->tcm_ifindex);
2216         if (!dev)
2217                 return 0;
2218
2219         s_t = cb->args[0];
2220         t = 0;
2221
2222         if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc),
2223                                 skb, tcm, cb, &t, s_t, true) < 0)
2224                 goto done;
2225
2226         dev_queue = dev_ingress_queue(dev);
2227         if (dev_queue &&
2228             tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
2229                                 &t, s_t, false) < 0)
2230                 goto done;
2231
2232 done:
2233         cb->args[0] = t;
2234
2235         dev_put(dev);
2236         return skb->len;
2237 }
2238
2239 #ifdef CONFIG_PROC_FS
2240 static int psched_show(struct seq_file *seq, void *v)
2241 {
2242         seq_printf(seq, "%08x %08x %08x %08x\n",
2243                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2244                    1000000,
2245                    (u32)NSEC_PER_SEC / hrtimer_resolution);
2246
2247         return 0;
2248 }
2249
2250 static int __net_init psched_net_init(struct net *net)
2251 {
2252         struct proc_dir_entry *e;
2253
2254         e = proc_create_single("psched", 0, net->proc_net, psched_show);
2255         if (e == NULL)
2256                 return -ENOMEM;
2257
2258         return 0;
2259 }
2260
2261 static void __net_exit psched_net_exit(struct net *net)
2262 {
2263         remove_proc_entry("psched", net->proc_net);
2264 }
2265 #else
2266 static int __net_init psched_net_init(struct net *net)
2267 {
2268         return 0;
2269 }
2270
2271 static void __net_exit psched_net_exit(struct net *net)
2272 {
2273 }
2274 #endif
2275
2276 static struct pernet_operations psched_net_ops = {
2277         .init = psched_net_init,
2278         .exit = psched_net_exit,
2279 };
2280
2281 static int __init pktsched_init(void)
2282 {
2283         int err;
2284
2285         err = register_pernet_subsys(&psched_net_ops);
2286         if (err) {
2287                 pr_err("pktsched_init: "
2288                        "cannot initialize per netns operations\n");
2289                 return err;
2290         }
2291
2292         register_qdisc(&pfifo_fast_ops);
2293         register_qdisc(&pfifo_qdisc_ops);
2294         register_qdisc(&bfifo_qdisc_ops);
2295         register_qdisc(&pfifo_head_drop_qdisc_ops);
2296         register_qdisc(&mq_qdisc_ops);
2297         register_qdisc(&noqueue_qdisc_ops);
2298
2299         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2300         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2301         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2302                       0);
2303         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2304         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2305         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2306                       0);
2307
2308         return 0;
2309 }
2310
2311 subsys_initcall(pktsched_init);