GNU Linux-libre 5.10.217-gnu1
[releases.git] / net / sched / sch_api.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_api.c  Packet scheduler API.
4  *
5  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6  *
7  * Fixes:
8  *
9  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
10  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
11  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
12  */
13
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <linux/init.h>
21 #include <linux/proc_fs.h>
22 #include <linux/seq_file.h>
23 #include <linux/kmod.h>
24 #include <linux/list.h>
25 #include <linux/hrtimer.h>
26 #include <linux/slab.h>
27 #include <linux/hashtable.h>
28
29 #include <net/net_namespace.h>
30 #include <net/sock.h>
31 #include <net/netlink.h>
32 #include <net/pkt_sched.h>
33 #include <net/pkt_cls.h>
34
35 #include <trace/events/qdisc.h>
36
37 /*
38
39    Short review.
40    -------------
41
42    This file consists of two interrelated parts:
43
44    1. queueing disciplines manager frontend.
45    2. traffic classes manager frontend.
46
47    Generally, queueing discipline ("qdisc") is a black box,
48    which is able to enqueue packets and to dequeue them (when
49    device is ready to send something) in order and at times
50    determined by algorithm hidden in it.
51
52    Qdiscs are divided into two categories:
53    - "queues", which have no internal structure visible from outside.
54    - "schedulers", which split all the packets into "traffic classes",
55      using "packet classifiers" (see cls_api.c)
56
57    In turn, classes may have child qdiscs (as rule, queues)
58    attached to them etc. etc. etc.
59
60    The goal of the routines in this file is to translate
61    information supplied by the user in the form of handles
62    into a form more intelligible to the kernel, to perform sanity
63    checks and the part of the work that is common to all qdiscs,
64    and to provide rtnetlink notifications.
65
66    All real intelligent work is done inside qdisc modules.
67
68
69
70    Every discipline has two major routines: enqueue and dequeue.
71
72    ---dequeue
73
74    dequeue usually returns a skb to send. It is allowed to return NULL,
75    but it does not mean that queue is empty, it just means that
76    discipline does not want to send anything this time.
77    Queue is really empty if q->q.qlen == 0.
78    For complicated disciplines with multiple queues q->q is not
79    real packet queue, but however q->q.qlen must be valid.
80
81    ---enqueue
82
83    enqueue returns 0 if the packet was enqueued successfully.
84    If a packet (this one or another one) was dropped, it returns
85    a non-zero error code.
86    NET_XMIT_DROP        - this packet dropped
87      Expected action: do not backoff, but wait until queue will clear.
88    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
89      Expected action: backoff or ignore
90
91    Auxiliary routines:
92
93    ---peek
94
95    like dequeue but without removing a packet from the queue
96
97    ---reset
98
99    returns qdisc to initial state: purge all buffers, clear all
100    timers, counters (except for statistics) etc.
101
102    ---init
103
104    initializes newly created qdisc.
105
106    ---destroy
107
108    destroys resources allocated by init and during lifetime of qdisc.
109
110    ---change
111
112    changes qdisc parameters.
113  */
114
/* Reader/writer lock guarding the list of registered qdisc ops
 * (qdisc_base) and the selection of the default qdisc ops.
 */
static DEFINE_RWLOCK(qdisc_mod_lock);


/************************************************
 *      Queueing disciplines manipulation.      *
 ************************************************/


/* Head of the singly linked list (chained through ->next) of all
 * registered queueing discipline operations.
 */
static struct Qdisc_ops *qdisc_base;
127
128 /* Register/unregister queueing discipline */
129
130 int register_qdisc(struct Qdisc_ops *qops)
131 {
132         struct Qdisc_ops *q, **qp;
133         int rc = -EEXIST;
134
135         write_lock(&qdisc_mod_lock);
136         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
137                 if (!strcmp(qops->id, q->id))
138                         goto out;
139
140         if (qops->enqueue == NULL)
141                 qops->enqueue = noop_qdisc_ops.enqueue;
142         if (qops->peek == NULL) {
143                 if (qops->dequeue == NULL)
144                         qops->peek = noop_qdisc_ops.peek;
145                 else
146                         goto out_einval;
147         }
148         if (qops->dequeue == NULL)
149                 qops->dequeue = noop_qdisc_ops.dequeue;
150
151         if (qops->cl_ops) {
152                 const struct Qdisc_class_ops *cops = qops->cl_ops;
153
154                 if (!(cops->find && cops->walk && cops->leaf))
155                         goto out_einval;
156
157                 if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
158                         goto out_einval;
159         }
160
161         qops->next = NULL;
162         *qp = qops;
163         rc = 0;
164 out:
165         write_unlock(&qdisc_mod_lock);
166         return rc;
167
168 out_einval:
169         rc = -EINVAL;
170         goto out;
171 }
172 EXPORT_SYMBOL(register_qdisc);
173
174 int unregister_qdisc(struct Qdisc_ops *qops)
175 {
176         struct Qdisc_ops *q, **qp;
177         int err = -ENOENT;
178
179         write_lock(&qdisc_mod_lock);
180         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
181                 if (q == qops)
182                         break;
183         if (q) {
184                 *qp = q->next;
185                 q->next = NULL;
186                 err = 0;
187         }
188         write_unlock(&qdisc_mod_lock);
189         return err;
190 }
191 EXPORT_SYMBOL(unregister_qdisc);
192
193 /* Get default qdisc if not otherwise specified */
194 void qdisc_get_default(char *name, size_t len)
195 {
196         read_lock(&qdisc_mod_lock);
197         strlcpy(name, default_qdisc_ops->id, len);
198         read_unlock(&qdisc_mod_lock);
199 }
200
201 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
202 {
203         struct Qdisc_ops *q = NULL;
204
205         for (q = qdisc_base; q; q = q->next) {
206                 if (!strcmp(name, q->id)) {
207                         if (!try_module_get(q->owner))
208                                 q = NULL;
209                         break;
210                 }
211         }
212
213         return q;
214 }
215
216 /* Set new default qdisc to use */
217 int qdisc_set_default(const char *name)
218 {
219         const struct Qdisc_ops *ops;
220
221         if (!capable(CAP_NET_ADMIN))
222                 return -EPERM;
223
224         write_lock(&qdisc_mod_lock);
225         ops = qdisc_lookup_default(name);
226         if (!ops) {
227                 /* Not found, drop lock and try to load module */
228                 write_unlock(&qdisc_mod_lock);
229                 request_module("sch_%s", name);
230                 write_lock(&qdisc_mod_lock);
231
232                 ops = qdisc_lookup_default(name);
233         }
234
235         if (ops) {
236                 /* Set new default */
237                 module_put(default_qdisc_ops->owner);
238                 default_qdisc_ops = ops;
239         }
240         write_unlock(&qdisc_mod_lock);
241
242         return ops ? 0 : -ENOENT;
243 }
244
#ifdef CONFIG_NET_SCH_DEFAULT
/* Late initcall: apply the default qdisc chosen in the kernel config. */
static int __init sch_default_qdisc(void)
{
	const char *name = CONFIG_DEFAULT_NET_SCH;

	return qdisc_set_default(name);
}
late_initcall(sch_default_qdisc);
#endif
253
254 /* We know handle. Find qdisc among all qdisc's attached to device
255  * (root qdisc, all its children, children of children etc.)
256  * Note: caller either uses rtnl or rcu_read_lock()
257  */
258
259 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
260 {
261         struct Qdisc *q;
262
263         if (!qdisc_dev(root))
264                 return (root->handle == handle ? root : NULL);
265
266         if (!(root->flags & TCQ_F_BUILTIN) &&
267             root->handle == handle)
268                 return root;
269
270         hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle,
271                                    lockdep_rtnl_is_held()) {
272                 if (q->handle == handle)
273                         return q;
274         }
275         return NULL;
276 }
277
278 void qdisc_hash_add(struct Qdisc *q, bool invisible)
279 {
280         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
281                 ASSERT_RTNL();
282                 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
283                 if (invisible)
284                         q->flags |= TCQ_F_INVISIBLE;
285         }
286 }
287 EXPORT_SYMBOL(qdisc_hash_add);
288
289 void qdisc_hash_del(struct Qdisc *q)
290 {
291         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
292                 ASSERT_RTNL();
293                 hash_del_rcu(&q->hash);
294         }
295 }
296 EXPORT_SYMBOL(qdisc_hash_del);
297
298 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
299 {
300         struct Qdisc *q;
301
302         if (!handle)
303                 return NULL;
304         q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle);
305         if (q)
306                 goto out;
307
308         if (dev_ingress_queue(dev))
309                 q = qdisc_match_from_root(
310                         dev_ingress_queue(dev)->qdisc_sleeping,
311                         handle);
312 out:
313         return q;
314 }
315
316 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
317 {
318         struct netdev_queue *nq;
319         struct Qdisc *q;
320
321         if (!handle)
322                 return NULL;
323         q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle);
324         if (q)
325                 goto out;
326
327         nq = dev_ingress_queue_rcu(dev);
328         if (nq)
329                 q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
330 out:
331         return q;
332 }
333
334 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
335 {
336         unsigned long cl;
337         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
338
339         if (cops == NULL)
340                 return NULL;
341         cl = cops->find(p, classid);
342
343         if (cl == 0)
344                 return NULL;
345         return cops->leaf(p, cl);
346 }
347
348 /* Find queueing discipline by name */
349
350 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
351 {
352         struct Qdisc_ops *q = NULL;
353
354         if (kind) {
355                 read_lock(&qdisc_mod_lock);
356                 for (q = qdisc_base; q; q = q->next) {
357                         if (nla_strcmp(kind, q->id) == 0) {
358                                 if (!try_module_get(q->owner))
359                                         q = NULL;
360                                 break;
361                         }
362                 }
363                 read_unlock(&qdisc_mod_lock);
364         }
365         return q;
366 }
367
368 /* The linklayer setting were not transferred from iproute2, in older
369  * versions, and the rate tables lookup systems have been dropped in
370  * the kernel. To keep backward compatible with older iproute2 tc
371  * utils, we detect the linklayer setting by detecting if the rate
372  * table were modified.
373  *
374  * For linklayer ATM table entries, the rate table will be aligned to
375  * 48 bytes, thus some table entries will contain the same value.  The
376  * mpu (min packet unit) is also encoded into the old rate table, thus
377  * starting from the mpu, we find low and high table entries for
378  * mapping this cell.  If these entries contain the same value, when
379  * the rate tables have been modified for linklayer ATM.
380  *
381  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
382  * and then roundup to the next cell, calc the table entry one below,
383  * and compare.
384  */
385 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
386 {
387         int low       = roundup(r->mpu, 48);
388         int high      = roundup(low+1, 48);
389         int cell_low  = low >> r->cell_log;
390         int cell_high = (high >> r->cell_log) - 1;
391
392         /* rtab is too inaccurate at rates > 100Mbit/s */
393         if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
394                 pr_debug("TC linklayer: Giving up ATM detection\n");
395                 return TC_LINKLAYER_ETHERNET;
396         }
397
398         if ((cell_high > cell_low) && (cell_high < 256)
399             && (rtab[cell_low] == rtab[cell_high])) {
400                 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
401                          cell_low, cell_high, rtab[cell_high]);
402                 return TC_LINKLAYER_ATM;
403         }
404         return TC_LINKLAYER_ETHERNET;
405 }
406
407 static struct qdisc_rate_table *qdisc_rtab_list;
408
409 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
410                                         struct nlattr *tab,
411                                         struct netlink_ext_ack *extack)
412 {
413         struct qdisc_rate_table *rtab;
414
415         if (tab == NULL || r->rate == 0 ||
416             r->cell_log == 0 || r->cell_log >= 32 ||
417             nla_len(tab) != TC_RTAB_SIZE) {
418                 NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
419                 return NULL;
420         }
421
422         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
423                 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
424                     !memcmp(&rtab->data, nla_data(tab), 1024)) {
425                         rtab->refcnt++;
426                         return rtab;
427                 }
428         }
429
430         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
431         if (rtab) {
432                 rtab->rate = *r;
433                 rtab->refcnt = 1;
434                 memcpy(rtab->data, nla_data(tab), 1024);
435                 if (r->linklayer == TC_LINKLAYER_UNAWARE)
436                         r->linklayer = __detect_linklayer(r, rtab->data);
437                 rtab->next = qdisc_rtab_list;
438                 qdisc_rtab_list = rtab;
439         } else {
440                 NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
441         }
442         return rtab;
443 }
444 EXPORT_SYMBOL(qdisc_get_rtab);
445
446 void qdisc_put_rtab(struct qdisc_rate_table *tab)
447 {
448         struct qdisc_rate_table *rtab, **rtabp;
449
450         if (!tab || --tab->refcnt)
451                 return;
452
453         for (rtabp = &qdisc_rtab_list;
454              (rtab = *rtabp) != NULL;
455              rtabp = &rtab->next) {
456                 if (rtab == tab) {
457                         *rtabp = rtab->next;
458                         kfree(rtab);
459                         return;
460                 }
461         }
462 }
463 EXPORT_SYMBOL(qdisc_put_rtab);
464
465 static LIST_HEAD(qdisc_stab_list);
466
467 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
468         [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
469         [TCA_STAB_DATA] = { .type = NLA_BINARY },
470 };
471
472 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
473                                                struct netlink_ext_ack *extack)
474 {
475         struct nlattr *tb[TCA_STAB_MAX + 1];
476         struct qdisc_size_table *stab;
477         struct tc_sizespec *s;
478         unsigned int tsize = 0;
479         u16 *tab = NULL;
480         int err;
481
482         err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
483                                           extack);
484         if (err < 0)
485                 return ERR_PTR(err);
486         if (!tb[TCA_STAB_BASE]) {
487                 NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
488                 return ERR_PTR(-EINVAL);
489         }
490
491         s = nla_data(tb[TCA_STAB_BASE]);
492
493         if (s->tsize > 0) {
494                 if (!tb[TCA_STAB_DATA]) {
495                         NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
496                         return ERR_PTR(-EINVAL);
497                 }
498                 tab = nla_data(tb[TCA_STAB_DATA]);
499                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
500         }
501
502         if (tsize != s->tsize || (!tab && tsize > 0)) {
503                 NL_SET_ERR_MSG(extack, "Invalid size of size table");
504                 return ERR_PTR(-EINVAL);
505         }
506
507         list_for_each_entry(stab, &qdisc_stab_list, list) {
508                 if (memcmp(&stab->szopts, s, sizeof(*s)))
509                         continue;
510                 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
511                         continue;
512                 stab->refcnt++;
513                 return stab;
514         }
515
516         if (s->size_log > STAB_SIZE_LOG_MAX ||
517             s->cell_log > STAB_SIZE_LOG_MAX) {
518                 NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table");
519                 return ERR_PTR(-EINVAL);
520         }
521
522         stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
523         if (!stab)
524                 return ERR_PTR(-ENOMEM);
525
526         stab->refcnt = 1;
527         stab->szopts = *s;
528         if (tsize > 0)
529                 memcpy(stab->data, tab, tsize * sizeof(u16));
530
531         list_add_tail(&stab->list, &qdisc_stab_list);
532
533         return stab;
534 }
535
536 void qdisc_put_stab(struct qdisc_size_table *tab)
537 {
538         if (!tab)
539                 return;
540
541         if (--tab->refcnt == 0) {
542                 list_del(&tab->list);
543                 kfree_rcu(tab, rcu);
544         }
545 }
546 EXPORT_SYMBOL(qdisc_put_stab);
547
548 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
549 {
550         struct nlattr *nest;
551
552         nest = nla_nest_start_noflag(skb, TCA_STAB);
553         if (nest == NULL)
554                 goto nla_put_failure;
555         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
556                 goto nla_put_failure;
557         nla_nest_end(skb, nest);
558
559         return skb->len;
560
561 nla_put_failure:
562         return -1;
563 }
564
565 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
566                                const struct qdisc_size_table *stab)
567 {
568         int pkt_len, slot;
569
570         pkt_len = skb->len + stab->szopts.overhead;
571         if (unlikely(!stab->szopts.tsize))
572                 goto out;
573
574         slot = pkt_len + stab->szopts.cell_align;
575         if (unlikely(slot < 0))
576                 slot = 0;
577
578         slot >>= stab->szopts.cell_log;
579         if (likely(slot < stab->szopts.tsize))
580                 pkt_len = stab->data[slot];
581         else
582                 pkt_len = stab->data[stab->szopts.tsize - 1] *
583                                 (slot / stab->szopts.tsize) +
584                                 stab->data[slot % stab->szopts.tsize];
585
586         pkt_len <<= stab->szopts.size_log;
587 out:
588         if (unlikely(pkt_len < 1))
589                 pkt_len = 1;
590         qdisc_skb_cb(skb)->pkt_len = pkt_len;
591 }
592 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
593
594 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
595 {
596         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
597                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
598                         txt, qdisc->ops->id, qdisc->handle >> 16);
599                 qdisc->flags |= TCQ_F_WARN_NONWC;
600         }
601 }
602 EXPORT_SYMBOL(qdisc_warn_nonwc);
603
604 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
605 {
606         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
607                                                  timer);
608
609         rcu_read_lock();
610         __netif_schedule(qdisc_root(wd->qdisc));
611         rcu_read_unlock();
612
613         return HRTIMER_NORESTART;
614 }
615
616 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
617                                  clockid_t clockid)
618 {
619         hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
620         wd->timer.function = qdisc_watchdog;
621         wd->qdisc = qdisc;
622 }
623 EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
624
625 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
626 {
627         qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
628 }
629 EXPORT_SYMBOL(qdisc_watchdog_init);
630
631 void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
632                                       u64 delta_ns)
633 {
634         if (test_bit(__QDISC_STATE_DEACTIVATED,
635                      &qdisc_root_sleeping(wd->qdisc)->state))
636                 return;
637
638         if (hrtimer_is_queued(&wd->timer)) {
639                 /* If timer is already set in [expires, expires + delta_ns],
640                  * do not reprogram it.
641                  */
642                 if (wd->last_expires - expires <= delta_ns)
643                         return;
644         }
645
646         wd->last_expires = expires;
647         hrtimer_start_range_ns(&wd->timer,
648                                ns_to_ktime(expires),
649                                delta_ns,
650                                HRTIMER_MODE_ABS_PINNED);
651 }
652 EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);
653
654 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
655 {
656         hrtimer_cancel(&wd->timer);
657 }
658 EXPORT_SYMBOL(qdisc_watchdog_cancel);
659
660 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
661 {
662         struct hlist_head *h;
663         unsigned int i;
664
665         h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
666
667         if (h != NULL) {
668                 for (i = 0; i < n; i++)
669                         INIT_HLIST_HEAD(&h[i]);
670         }
671         return h;
672 }
673
674 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
675 {
676         struct Qdisc_class_common *cl;
677         struct hlist_node *next;
678         struct hlist_head *nhash, *ohash;
679         unsigned int nsize, nmask, osize;
680         unsigned int i, h;
681
682         /* Rehash when load factor exceeds 0.75 */
683         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
684                 return;
685         nsize = clhash->hashsize * 2;
686         nmask = nsize - 1;
687         nhash = qdisc_class_hash_alloc(nsize);
688         if (nhash == NULL)
689                 return;
690
691         ohash = clhash->hash;
692         osize = clhash->hashsize;
693
694         sch_tree_lock(sch);
695         for (i = 0; i < osize; i++) {
696                 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
697                         h = qdisc_class_hash(cl->classid, nmask);
698                         hlist_add_head(&cl->hnode, &nhash[h]);
699                 }
700         }
701         clhash->hash     = nhash;
702         clhash->hashsize = nsize;
703         clhash->hashmask = nmask;
704         sch_tree_unlock(sch);
705
706         kvfree(ohash);
707 }
708 EXPORT_SYMBOL(qdisc_class_hash_grow);
709
710 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
711 {
712         unsigned int size = 4;
713
714         clhash->hash = qdisc_class_hash_alloc(size);
715         if (!clhash->hash)
716                 return -ENOMEM;
717         clhash->hashsize  = size;
718         clhash->hashmask  = size - 1;
719         clhash->hashelems = 0;
720         return 0;
721 }
722 EXPORT_SYMBOL(qdisc_class_hash_init);
723
724 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
725 {
726         kvfree(clhash->hash);
727 }
728 EXPORT_SYMBOL(qdisc_class_hash_destroy);
729
730 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
731                              struct Qdisc_class_common *cl)
732 {
733         unsigned int h;
734
735         INIT_HLIST_NODE(&cl->hnode);
736         h = qdisc_class_hash(cl->classid, clhash->hashmask);
737         hlist_add_head(&cl->hnode, &clhash->hash[h]);
738         clhash->hashelems++;
739 }
740 EXPORT_SYMBOL(qdisc_class_hash_insert);
741
742 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
743                              struct Qdisc_class_common *cl)
744 {
745         hlist_del(&cl->hnode);
746         clhash->hashelems--;
747 }
748 EXPORT_SYMBOL(qdisc_class_hash_remove);
749
750 /* Allocate an unique handle from space managed by kernel
751  * Possible range is [8000-FFFF]:0000 (0x8000 values)
752  */
753 static u32 qdisc_alloc_handle(struct net_device *dev)
754 {
755         int i = 0x8000;
756         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
757
758         do {
759                 autohandle += TC_H_MAKE(0x10000U, 0);
760                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
761                         autohandle = TC_H_MAKE(0x80000000U, 0);
762                 if (!qdisc_lookup(dev, autohandle))
763                         return autohandle;
764                 cond_resched();
765         } while (--i > 0);
766
767         return 0;
768 }
769
770 void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
771 {
772         bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
773         const struct Qdisc_class_ops *cops;
774         unsigned long cl;
775         u32 parentid;
776         bool notify;
777         int drops;
778
779         if (n == 0 && len == 0)
780                 return;
781         drops = max_t(int, n, 0);
782         rcu_read_lock();
783         while ((parentid = sch->parent)) {
784                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
785                         break;
786
787                 if (sch->flags & TCQ_F_NOPARENT)
788                         break;
789                 /* Notify parent qdisc only if child qdisc becomes empty.
790                  *
791                  * If child was empty even before update then backlog
792                  * counter is screwed and we skip notification because
793                  * parent class is already passive.
794                  *
795                  * If the original child was offloaded then it is allowed
796                  * to be seem as empty, so the parent is notified anyway.
797                  */
798                 notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
799                                                        !qdisc_is_offloaded);
800                 /* TODO: perform the search on a per txq basis */
801                 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
802                 if (sch == NULL) {
803                         WARN_ON_ONCE(parentid != TC_H_ROOT);
804                         break;
805                 }
806                 cops = sch->ops->cl_ops;
807                 if (notify && cops->qlen_notify) {
808                         cl = cops->find(sch, parentid);
809                         cops->qlen_notify(sch, cl);
810                 }
811                 sch->q.qlen -= n;
812                 sch->qstats.backlog -= len;
813                 __qdisc_qstats_drop(sch, drops);
814         }
815         rcu_read_unlock();
816 }
817 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
818
819 int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
820                               void *type_data)
821 {
822         struct net_device *dev = qdisc_dev(sch);
823         int err;
824
825         sch->flags &= ~TCQ_F_OFFLOADED;
826         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
827                 return 0;
828
829         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
830         if (err == -EOPNOTSUPP)
831                 return 0;
832
833         if (!err)
834                 sch->flags |= TCQ_F_OFFLOADED;
835
836         return err;
837 }
838 EXPORT_SYMBOL(qdisc_offload_dump_helper);
839
840 void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
841                                 struct Qdisc *new, struct Qdisc *old,
842                                 enum tc_setup_type type, void *type_data,
843                                 struct netlink_ext_ack *extack)
844 {
845         bool any_qdisc_is_offloaded;
846         int err;
847
848         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
849                 return;
850
851         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
852
853         /* Don't report error if the graft is part of destroy operation. */
854         if (!err || !new || new == &noop_qdisc)
855                 return;
856
857         /* Don't report error if the parent, the old child and the new
858          * one are not offloaded.
859          */
860         any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
861         any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
862         any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
863
864         if (any_qdisc_is_offloaded)
865                 NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
866 }
867 EXPORT_SYMBOL(qdisc_offload_graft_helper);
868
869 static void qdisc_offload_graft_root(struct net_device *dev,
870                                      struct Qdisc *new, struct Qdisc *old,
871                                      struct netlink_ext_ack *extack)
872 {
873         struct tc_root_qopt_offload graft_offload = {
874                 .command        = TC_ROOT_GRAFT,
875                 .handle         = new ? new->handle : 0,
876                 .ingress        = (new && new->flags & TCQ_F_INGRESS) ||
877                                   (old && old->flags & TCQ_F_INGRESS),
878         };
879
880         qdisc_offload_graft_helper(dev, NULL, new, old,
881                                    TC_SETUP_ROOT_QDISC, &graft_offload, extack);
882 }
883
/*
 * Append one netlink message of type 'event' describing qdisc 'q' to 'skb'.
 *
 * The message consists of a struct tcmsg header followed by TCA_KIND, the
 * optional ingress/egress block indexes, whatever the qdisc's ->dump() adds,
 * TCA_HW_OFFLOAD, the optional size table and the statistics attributes.
 *
 * 'clid' is reported as tcm_parent; 'portid', 'seq' and 'flags' go into the
 * netlink header.
 *
 * Returns skb->len on success.  On any failure the skb is trimmed back to the
 * tail position recorded on entry and -1 is returned.
 */
884 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
885                          u32 portid, u32 seq, u16 flags, int event)
886 {
887         struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
888         struct gnet_stats_queue __percpu *cpu_qstats = NULL;
889         struct tcmsg *tcm;
890         struct nlmsghdr  *nlh;
        /* Remember where this message starts so it can be measured or undone. */
891         unsigned char *b = skb_tail_pointer(skb);
892         struct gnet_dump d;
893         struct qdisc_size_table *stab;
894         u32 block_index;
895         __u32 qlen;
896
897         cond_resched();
898         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
899         if (!nlh)
900                 goto out_nlmsg_trim;
901         tcm = nlmsg_data(nlh);
902         tcm->tcm_family = AF_UNSPEC;
903         tcm->tcm__pad1 = 0;
904         tcm->tcm__pad2 = 0;
905         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
906         tcm->tcm_parent = clid;
907         tcm->tcm_handle = q->handle;
        /* tcm_info carries the qdisc's current reference count. */
908         tcm->tcm_info = refcount_read(&q->refcnt);
909         if (nla_put_string(skb, TCA_KIND, q->ops->id))
910                 goto nla_put_failure;
        /* Block indexes are optional ops; an index of 0 is not reported. */
911         if (q->ops->ingress_block_get) {
912                 block_index = q->ops->ingress_block_get(q);
913                 if (block_index &&
914                     nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
915                         goto nla_put_failure;
916         }
917         if (q->ops->egress_block_get) {
918                 block_index = q->ops->egress_block_get(q);
919                 if (block_index &&
920                     nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
921                         goto nla_put_failure;
922         }
923         if (q->ops->dump && q->ops->dump(q, skb) < 0)
924                 goto nla_put_failure;
925         if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
926                 goto nla_put_failure;
927         qlen = qdisc_qlen_sum(q);
928
929         stab = rtnl_dereference(q->stab);
930         if (stab && qdisc_dump_stab(skb, stab) < 0)
931                 goto nla_put_failure;
932
933         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
934                                          NULL, &d, TCA_PAD) < 0)
935                 goto nla_put_failure;
936
937         if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
938                 goto nla_put_failure;
939
        /* Per-CPU counters are only handed to the copy helpers when the
         * qdisc reports using them; otherwise the pointers stay NULL.
         */
940         if (qdisc_is_percpu_stats(q)) {
941                 cpu_bstats = q->cpu_bstats;
942                 cpu_qstats = q->cpu_qstats;
943         }
944
945         if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
946                                   &d, cpu_bstats, &q->bstats) < 0 ||
947             gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
948             gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
949                 goto nla_put_failure;
950
951         if (gnet_stats_finish_copy(&d) < 0)
952                 goto nla_put_failure;
953
        /* The message length is everything appended since entry. */
954         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
955         return skb->len;
956
        /* Both labels unwind the same way: drop the partially built message. */
957 out_nlmsg_trim:
958 nla_put_failure:
959         nlmsg_trim(skb, b);
960         return -1;
961 }
962
963 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
964 {
965         if (q->flags & TCQ_F_BUILTIN)
966                 return true;
967         if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
968                 return true;
969
970         return false;
971 }
972
973 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
974                         struct nlmsghdr *n, u32 clid,
975                         struct Qdisc *old, struct Qdisc *new)
976 {
977         struct sk_buff *skb;
978         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
979
980         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
981         if (!skb)
982                 return -ENOBUFS;
983
984         if (old && !tc_qdisc_dump_ignore(old, false)) {
985                 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
986                                   0, RTM_DELQDISC) < 0)
987                         goto err_out;
988         }
989         if (new && !tc_qdisc_dump_ignore(new, false)) {
990                 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
991                                   old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
992                         goto err_out;
993         }
994
995         if (skb->len)
996                 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
997                                       n->nlmsg_flags & NLM_F_ECHO);
998
999 err_out:
1000         kfree_skb(skb);
1001         return -EINVAL;
1002 }
1003
1004 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
1005                                struct nlmsghdr *n, u32 clid,
1006                                struct Qdisc *old, struct Qdisc *new)
1007 {
1008         if (new || old)
1009                 qdisc_notify(net, skb, n, clid, old, new);
1010
1011         if (old)
1012                 qdisc_put(old);
1013 }
1014
1015 static void qdisc_clear_nolock(struct Qdisc *sch)
1016 {
1017         sch->flags &= ~TCQ_F_NOLOCK;
1018         if (!(sch->flags & TCQ_F_CPUSTATS))
1019                 return;
1020
1021         free_percpu(sch->cpu_bstats);
1022         free_percpu(sch->cpu_qstats);
1023         sch->cpu_bstats = NULL;
1024         sch->cpu_qstats = NULL;
1025         sch->flags &= ~TCQ_F_CPUSTATS;
1026 }
1027
1028 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
1029  * to device "dev".
1030  *
1031  * When appropriate send a netlink notification using 'skb'
1032  * and "n".
1033  *
1034  * On success, destroy old qdisc.
 *
 * "new" may be NULL, in which case the device-level path installs
 * noop_qdisc as dev->qdisc.
 *
 * Returns 0 on success, -ENOENT when the device has no ingress queue or
 * the class is not found, -EOPNOTSUPP when the parent has no class graft
 * op, -EINVAL when trying to attach noqueue to a class, or the error
 * returned by the parent's ->graft().
1035  */
1036
1037 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
1038                        struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
1039                        struct Qdisc *new, struct Qdisc *old,
1040                        struct netlink_ext_ack *extack)
1041 {
1042         struct Qdisc *q = old;
1043         struct net *net = dev_net(dev);
1044
        /* No parent qdisc: graft directly onto the device. */
1045         if (parent == NULL) {
1046                 unsigned int i, num_q, ingress;
1047                 struct netdev_queue *dev_queue;
1048
1049                 ingress = 0;
1050                 num_q = dev->num_tx_queues;
                /* Either side being an ingress qdisc makes this an ingress
                 * graft, which needs the device's ingress queue.
                 */
1051                 if ((q && q->flags & TCQ_F_INGRESS) ||
1052                     (new && new->flags & TCQ_F_INGRESS)) {
1053                         ingress = 1;
1054                         if (!dev_ingress_queue(dev)) {
1055                                 NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
1056                                 return -ENOENT;
1057                         }
1058                 }
1059
                /* A running device is deactivated while the graft happens
                 * and reactivated at the end of this branch.
                 */
1060                 if (dev->flags & IFF_UP)
1061                         dev_deactivate(dev);
1062
1063                 qdisc_offload_graft_root(dev, new, old, extack);
1064
                /* Qdiscs providing ->attach() are not installed on the
                 * queues here; ->attach() is called below once dev->qdisc
                 * has been updated.
                 */
1065                 if (new && new->ops->attach)
1066                         goto skip;
1067
1068                 if (!ingress) {
                        /* Install "new" on every TX queue, taking one extra
                         * reference per queue after the first and dropping
                         * the reference on each queue's previous qdisc.
                         */
1069                         for (i = 0; i < num_q; i++) {
1070                                 dev_queue = netdev_get_tx_queue(dev, i);
1071                                 old = dev_graft_qdisc(dev_queue, new);
1072
1073                                 if (new && i > 0)
1074                                         qdisc_refcount_inc(new);
1075                                 qdisc_put(old);
1076                         }
1077                 } else {
1078                         dev_queue = dev_ingress_queue(dev);
1079                         old = dev_graft_qdisc(dev_queue, new);
1080                 }
1081
1082 skip:
1083                 if (!ingress) {
                        /* The qdisc being replaced is the one currently
                         * published in dev->qdisc.  Without ->attach(),
                         * dev->qdisc takes its own reference on "new".
                         */
1084                         old = rtnl_dereference(dev->qdisc);
1085                         if (new && !new->ops->attach)
1086                                 qdisc_refcount_inc(new);
1087                         rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc);
1088
1089                         notify_and_destroy(net, skb, n, classid, old, new);
1090
1091                         if (new && new->ops->attach)
1092                                 new->ops->attach(new);
1093                 } else {
1094                         notify_and_destroy(net, skb, n, classid, old, new);
1095                 }
1096
1097                 if (dev->flags & IFF_UP)
1098                         dev_activate(dev);
1099         } else {
                /* Graft as the leaf of class "classid" of "parent". */
1100                 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
1101                 unsigned long cl;
1102                 int err;
1103
1104                 /* Only support running class lockless if parent is lockless */
1105                 if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK))
1106                         qdisc_clear_nolock(new);
1107
1108                 if (!cops || !cops->graft)
1109                         return -EOPNOTSUPP;
1110
1111                 cl = cops->find(parent, classid);
1112                 if (!cl) {
1113                         NL_SET_ERR_MSG(extack, "Specified class not found");
1114                         return -ENOENT;
1115                 }
1116
1117                 if (new && new->ops == &noqueue_qdisc_ops) {
1118                         NL_SET_ERR_MSG(extack, "Cannot assign noqueue to a class");
1119                         return -EINVAL;
1120                 }
1121
                /* ->graft() receives &old and may update it; whatever it
                 * leaves there is what gets notified and released.
                 */
1122                 err = cops->graft(parent, cl, new, &old, extack);
1123                 if (err)
1124                         return err;
1125                 notify_and_destroy(net, skb, n, classid, old, new);
1126         }
1127         return 0;
1128 }
1129
1130 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1131                                    struct netlink_ext_ack *extack)
1132 {
1133         u32 block_index;
1134
1135         if (tca[TCA_INGRESS_BLOCK]) {
1136                 block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1137
1138                 if (!block_index) {
1139                         NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1140                         return -EINVAL;
1141                 }
1142                 if (!sch->ops->ingress_block_set) {
1143                         NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1144                         return -EOPNOTSUPP;
1145                 }
1146                 sch->ops->ingress_block_set(sch, block_index);
1147         }
1148         if (tca[TCA_EGRESS_BLOCK]) {
1149                 block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1150
1151                 if (!block_index) {
1152                         NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1153                         return -EINVAL;
1154                 }
1155                 if (!sch->ops->egress_block_set) {
1156                         NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1157                         return -EOPNOTSUPP;
1158                 }
1159                 sch->ops->egress_block_set(sch, block_index);
1160         }
1161         return 0;
1162 }
1163
1164 /*
1165    Allocate and initialize new qdisc.
1166
1167    The netlink attributes of the request are passed via tca.
   The qdisc kind is taken from tca[TCA_KIND]; "parent" is stored in the
   new qdisc and "handle" selects its handle (0 asks for one to be
   allocated, TC_H_INGRESS is only accepted for TCQ_F_INGRESS qdiscs).

   Returns the new qdisc, or NULL with the error code stored in *errp.
   -EAGAIN in *errp means a module was loaded with RTNL dropped and the
   caller has to replay the request.
1168  */
1169
1170 static struct Qdisc *qdisc_create(struct net_device *dev,
1171                                   struct netdev_queue *dev_queue,
1172                                   struct Qdisc *p, u32 parent, u32 handle,
1173                                   struct nlattr **tca, int *errp,
1174                                   struct netlink_ext_ack *extack)
1175 {
1176         int err;
1177         struct nlattr *kind = tca[TCA_KIND];
1178         struct Qdisc *sch;
1179         struct Qdisc_ops *ops;
1180         struct qdisc_size_table *stab;
1181
1182         ops = qdisc_lookup_ops(kind);
1183 #ifdef CONFIG_MODULES
1184         if (ops == NULL && kind != NULL) {
1185                 char name[IFNAMSIZ];
                /* Only try a module load when the kind fits in name[]. */
1186                 if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
1187                         /* We dropped the RTNL semaphore in order to
1188                          * perform the module load.  So, even if we
1189                          * succeeded in loading the module we have to
1190                          * tell the caller to replay the request.  We
1191                          * indicate this using -EAGAIN.
1192                          * We replay the request because the device may
1193                          * go away in the mean time.
1194                          */
1195                         rtnl_unlock();
1196                         request_module("sch_%s", name);
1197                         rtnl_lock();
1198                         ops = qdisc_lookup_ops(kind);
1199                         if (ops != NULL) {
1200                                 /* We will try again qdisc_lookup_ops,
1201                                  * so don't keep a reference.
1202                                  */
1203                                 module_put(ops->owner);
1204                                 err = -EAGAIN;
1205                                 goto err_out;
1206                         }
1207                 }
1208         }
1209 #endif
1210
1211         err = -ENOENT;
1212         if (!ops) {
1213                 NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown");
1214                 goto err_out;
1215         }
1216
1217         sch = qdisc_alloc(dev_queue, ops, extack);
1218         if (IS_ERR(sch)) {
1219                 err = PTR_ERR(sch);
1220                 goto err_out2;
1221         }
1222
1223         sch->parent = parent;
1224
        /* TC_H_INGRESS is reserved: only qdiscs flagged TCQ_F_INGRESS may
         * use it.  Any other qdisc gets the requested handle, or a freshly
         * allocated one when the request passed 0.
         */
1225         if (handle == TC_H_INGRESS) {
1226                 if (!(sch->flags & TCQ_F_INGRESS)) {
1227                         NL_SET_ERR_MSG(extack,
1228                                        "Specified parent ID is reserved for ingress and clsact Qdiscs");
1229                         err = -EINVAL;
1230                         goto err_out3;
1231                 }
1232                 handle = TC_H_MAKE(TC_H_INGRESS, 0);
1233         } else {
1234                 if (handle == 0) {
1235                         handle = qdisc_alloc_handle(dev);
1236                         if (handle == 0) {
1237                                 NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
1238                                 err = -ENOSPC;
1239                                 goto err_out3;
1240                         }
1241                 }
1242                 if (!netif_is_multiqueue(dev))
1243                         sch->flags |= TCQ_F_ONETXQUEUE;
1244         }
1245
1246         sch->handle = handle;
1247
1248         /* This exists to stay backward compatible with a userspace
1249          * loophole that allowed userspace to get the IFF_NO_QUEUE
1250          * facility on older kernels by setting tx_queue_len=0 (prior
1251          * to qdisc init), and then forgot to reinit tx_queue_len
1252          * before attaching a qdisc again.
1253          */
1254         if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
1255                 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
1256                 netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
1257         }
1258
1259         err = qdisc_block_indexes_set(sch, tca, extack);
1260         if (err)
1261                 goto err_out3;
1262
1263         if (ops->init) {
1264                 err = ops->init(sch, tca[TCA_OPTIONS], extack);
1265                 if (err != 0)
1266                         goto err_out5;
1267         }
1268
        /* From here on ->init() has completed, so failures unwind through
         * err_out4.
         */
1269         if (tca[TCA_STAB]) {
1270                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1271                 if (IS_ERR(stab)) {
1272                         err = PTR_ERR(stab);
1273                         goto err_out4;
1274                 }
1275                 rcu_assign_pointer(sch->stab, stab);
1276         }
1277         if (tca[TCA_RATE]) {
1278                 seqcount_t *running;
1279
1280                 err = -EOPNOTSUPP;
1281                 if (sch->flags & TCQ_F_MQROOT) {
1282                         NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
1283                         goto err_out4;
1284                 }
1285
                /* Pick the seqcount handed to the estimator: the one from
                 * qdisc_root_sleeping_running() for a non-root, non-ingress
                 * qdisc whose parent "p" exists and is not TCQ_F_MQROOT,
                 * the qdisc's own otherwise.
                 */
1286                 if (sch->parent != TC_H_ROOT &&
1287                     !(sch->flags & TCQ_F_INGRESS) &&
1288                     (!p || !(p->flags & TCQ_F_MQROOT)))
1289                         running = qdisc_root_sleeping_running(sch);
1290                 else
1291                         running = &sch->running;
1292
1293                 err = gen_new_estimator(&sch->bstats,
1294                                         sch->cpu_bstats,
1295                                         &sch->rate_est,
1296                                         NULL,
1297                                         running,
1298                                         tca[TCA_RATE]);
1299                 if (err) {
1300                         NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
1301                         goto err_out4;
1302                 }
1303         }
1304
1305         qdisc_hash_add(sch, false);
1306         trace_qdisc_create(ops, dev, parent);
1307
1308         return sch;
1309
        /* Error unwinding.  The labels below fall through into each other:
         * err_out5 -> err_out3 -> err_out2 -> err_out.
         */
1310 err_out5:
1311         /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
1312         if (ops->destroy)
1313                 ops->destroy(sch);
1314 err_out3:
        /* NOTE(review): the dev_put() presumably balances a device
         * reference taken in qdisc_alloc() - confirm there.
         */
1315         dev_put(dev);
1316         qdisc_free(sch);
1317 err_out2:
1318         module_put(ops->owner);
1319 err_out:
1320         *errp = err;
1321         return NULL;
1322
        /* Failure after ->init(): release the size table (if any), destroy
         * the qdisc, then join the common unwinding at err_out3.
         */
1323 err_out4:
1324         /*
1325          * Any broken qdiscs that would require a ops->reset() here?
1326          * The qdisc was never in action so it shouldn't be necessary.
1327          */
1328         qdisc_put_stab(rtnl_dereference(sch->stab));
1329         if (ops->destroy)
1330                 ops->destroy(sch);
1331         goto err_out3;
1332 }
1333
1334 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1335                         struct netlink_ext_ack *extack)
1336 {
1337         struct qdisc_size_table *ostab, *stab = NULL;
1338         int err = 0;
1339
1340         if (tca[TCA_OPTIONS]) {
1341                 if (!sch->ops->change) {
1342                         NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
1343                         return -EINVAL;
1344                 }
1345                 if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1346                         NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1347                         return -EOPNOTSUPP;
1348                 }
1349                 err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1350                 if (err)
1351                         return err;
1352         }
1353
1354         if (tca[TCA_STAB]) {
1355                 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1356                 if (IS_ERR(stab))
1357                         return PTR_ERR(stab);
1358         }
1359
1360         ostab = rtnl_dereference(sch->stab);
1361         rcu_assign_pointer(sch->stab, stab);
1362         qdisc_put_stab(ostab);
1363
1364         if (tca[TCA_RATE]) {
1365                 /* NB: ignores errors from replace_estimator
1366                    because change can't be undone. */
1367                 if (sch->flags & TCQ_F_MQROOT)
1368                         goto out;
1369                 gen_replace_estimator(&sch->bstats,
1370                                       sch->cpu_bstats,
1371                                       &sch->rate_est,
1372                                       NULL,
1373                                       qdisc_root_sleeping_running(sch),
1374                                       tca[TCA_RATE]);
1375         }
1376 out:
1377         return 0;
1378 }
1379
1380 struct check_loop_arg {
1381         struct qdisc_walker     w;
1382         struct Qdisc            *p;
1383         int                     depth;
1384 };
1385
1386 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1387                          struct qdisc_walker *w);
1388
1389 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1390 {
1391         struct check_loop_arg   arg;
1392
1393         if (q->ops->cl_ops == NULL)
1394                 return 0;
1395
1396         arg.w.stop = arg.w.skip = arg.w.count = 0;
1397         arg.w.fn = check_loop_fn;
1398         arg.depth = depth;
1399         arg.p = p;
1400         q->ops->cl_ops->walk(q, &arg.w);
1401         return arg.w.stop ? -ELOOP : 0;
1402 }
1403
1404 static int
1405 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1406 {
1407         struct Qdisc *leaf;
1408         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1409         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1410
1411         leaf = cops->leaf(q, cl);
1412         if (leaf) {
1413                 if (leaf == arg->p || arg->depth > 7)
1414                         return -ELOOP;
1415                 return check_loop(leaf, arg->p, arg->depth + 1);
1416         }
1417         return 0;
1418 }
1419
1420 const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
1421         [TCA_KIND]              = { .type = NLA_STRING },
1422         [TCA_RATE]              = { .type = NLA_BINARY,
1423                                     .len = sizeof(struct tc_estimator) },
1424         [TCA_STAB]              = { .type = NLA_NESTED },
1425         [TCA_DUMP_INVISIBLE]    = { .type = NLA_FLAG },
1426         [TCA_CHAIN]             = { .type = NLA_U32 },
1427         [TCA_INGRESS_BLOCK]     = { .type = NLA_U32 },
1428         [TCA_EGRESS_BLOCK]      = { .type = NLA_U32 },
1429 };
1430
1431 /*
1432  * Delete/get qdisc.
 *
 * Any message type other than RTM_GETQDISC requires CAP_NET_ADMIN in the
 * netns' user namespace.  The qdisc is located either through tcm_parent
 * (root, ingress or a class of another qdisc) or, when tcm_parent is 0,
 * directly through tcm_handle.
 *
 * RTM_DELQDISC grafts NULL in place of the qdisc; any other type only
 * sends a notification describing it, whose result is not checked.
 *
 * Returns 0 on success or a negative errno.
1433  */
1434
1435 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1436                         struct netlink_ext_ack *extack)
1437 {
1438         struct net *net = sock_net(skb->sk);
1439         struct tcmsg *tcm = nlmsg_data(n);
1440         struct nlattr *tca[TCA_MAX + 1];
1441         struct net_device *dev;
1442         u32 clid;
1443         struct Qdisc *q = NULL;
1444         struct Qdisc *p = NULL;
1445         int err;
1446
1447         if ((n->nlmsg_type != RTM_GETQDISC) &&
1448             !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1449                 return -EPERM;
1450
1451         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1452                                      rtm_tca_policy, extack);
1453         if (err < 0)
1454                 return err;
1455
1456         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1457         if (!dev)
1458                 return -ENODEV;
1459
1460         clid = tcm->tcm_parent;
1461         if (clid) {
                /* Locate the qdisc through its attachment point: a class of
                 * parent qdisc p, the device's ingress queue, or the root.
                 */
1462                 if (clid != TC_H_ROOT) {
1463                         if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1464                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1465                                 if (!p) {
1466                                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1467                                         return -ENOENT;
1468                                 }
1469                                 q = qdisc_leaf(p, clid);
1470                         } else if (dev_ingress_queue(dev)) {
1471                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1472                         }
1473                 } else {
1474                         q = rtnl_dereference(dev->qdisc);
1475                 }
1476                 if (!q) {
1477                         NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1478                         return -ENOENT;
1479                 }
1480
                /* A non-zero handle in the request must match what was found. */
1481                 if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1482                         NL_SET_ERR_MSG(extack, "Invalid handle");
1483                         return -EINVAL;
1484                 }
1485         } else {
                /* No parent given: look the qdisc up by handle alone. */
1486                 q = qdisc_lookup(dev, tcm->tcm_handle);
1487                 if (!q) {
1488                         NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1489                         return -ENOENT;
1490                 }
1491         }
1492
        /* When a kind is supplied it has to match the qdisc's ops id. */
1493         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1494                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1495                 return -EINVAL;
1496         }
1497
1498         if (n->nlmsg_type == RTM_DELQDISC) {
                /* Deletion needs a parent, and refuses qdiscs with handle 0. */
1499                 if (!clid) {
1500                         NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1501                         return -EINVAL;
1502                 }
1503                 if (q->handle == 0) {
1504                         NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1505                         return -ENOENT;
1506                 }
1507                 err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1508                 if (err != 0)
1509                         return err;
1510         } else {
1511                 qdisc_notify(net, skb, n, clid, NULL, q);
1512         }
1513         return 0;
1514 }
1515
1516 static bool req_create_or_replace(struct nlmsghdr *n)
1517 {
1518         return (n->nlmsg_flags & NLM_F_CREATE &&
1519                 n->nlmsg_flags & NLM_F_REPLACE);
1520 }
1521
1522 static bool req_create_exclusive(struct nlmsghdr *n)
1523 {
1524         return (n->nlmsg_flags & NLM_F_CREATE &&
1525                 n->nlmsg_flags & NLM_F_EXCL);
1526 }
1527
1528 static bool req_change(struct nlmsghdr *n)
1529 {
1530         return (!(n->nlmsg_flags & NLM_F_CREATE) &&
1531                 !(n->nlmsg_flags & NLM_F_REPLACE) &&
1532                 !(n->nlmsg_flags & NLM_F_EXCL));
1533 }
1534
1535 /*
1536  * Create/change qdisc.
1537  */
1538 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1539                            struct netlink_ext_ack *extack)
1540 {
1541         struct net *net = sock_net(skb->sk);
1542         struct tcmsg *tcm;
1543         struct nlattr *tca[TCA_MAX + 1];
1544         struct net_device *dev;
1545         u32 clid;
1546         struct Qdisc *q, *p;
1547         int err;
1548
1549         if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1550                 return -EPERM;
1551
1552 replay:
1553         /* Reinit, just in case something touches this. */
1554         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1555                                      rtm_tca_policy, extack);
1556         if (err < 0)
1557                 return err;
1558
1559         tcm = nlmsg_data(n);
1560         clid = tcm->tcm_parent;
1561         q = p = NULL;
1562
1563         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1564         if (!dev)
1565                 return -ENODEV;
1566
1567
1568         if (clid) {
1569                 if (clid != TC_H_ROOT) {
1570                         if (clid != TC_H_INGRESS) {
1571                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1572                                 if (!p) {
1573                                         NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1574                                         return -ENOENT;
1575                                 }
1576                                 q = qdisc_leaf(p, clid);
1577                         } else if (dev_ingress_queue_create(dev)) {
1578                                 q = dev_ingress_queue(dev)->qdisc_sleeping;
1579                         }
1580                 } else {
1581                         q = rtnl_dereference(dev->qdisc);
1582                 }
1583
1584                 /* It may be default qdisc, ignore it */
1585                 if (q && q->handle == 0)
1586                         q = NULL;
1587
1588                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1589                         if (tcm->tcm_handle) {
1590                                 if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1591                                         NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1592                                         return -EEXIST;
1593                                 }
1594                                 if (TC_H_MIN(tcm->tcm_handle)) {
1595                                         NL_SET_ERR_MSG(extack, "Invalid minor handle");
1596                                         return -EINVAL;
1597                                 }
1598                                 q = qdisc_lookup(dev, tcm->tcm_handle);
1599                                 if (!q)
1600                                         goto create_n_graft;
1601                                 if (n->nlmsg_flags & NLM_F_EXCL) {
1602                                         NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1603                                         return -EEXIST;
1604                                 }
1605                                 if (tca[TCA_KIND] &&
1606                                     nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1607                                         NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1608                                         return -EINVAL;
1609                                 }
1610                                 if (q->flags & TCQ_F_INGRESS) {
1611                                         NL_SET_ERR_MSG(extack,
1612                                                        "Cannot regraft ingress or clsact Qdiscs");
1613                                         return -EINVAL;
1614                                 }
1615                                 if (q == p ||
1616                                     (p && check_loop(q, p, 0))) {
1617                                         NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1618                                         return -ELOOP;
1619                                 }
1620                                 if (clid == TC_H_INGRESS) {
1621                                         NL_SET_ERR_MSG(extack, "Ingress cannot graft directly");
1622                                         return -EINVAL;
1623                                 }
1624                                 qdisc_refcount_inc(q);
1625                                 goto graft;
1626                         } else {
1627                                 if (!q)
1628                                         goto create_n_graft;
1629
1630                                 /* This magic test requires explanation.
1631                                  *
1632                                  *   We know, that some child q is already
1633                                  *   attached to this parent and have choice:
1634                                  *   1) change it or 2) create/graft new one.
1635                                  *   If the requested qdisc kind is different
1636                                  *   than the existing one, then we choose graft.
1637                                  *   If they are the same then this is "change"
1638                                  *   operation - just let it fallthrough..
1639                                  *
1640                                  *   1. We are allowed to create/graft only
1641                                  *   if the request is explicitly stating
1642                                  *   "please create if it doesn't exist".
1643                                  *
1644                                  *   2. If the request is to exclusive create
1645                                  *   then the qdisc tcm_handle is not expected
1646                                  *   to exist, so that we choose create/graft too.
1647                                  *
1648                                  *   3. The last case is when no flags are set.
1649                                  *   This will happen when for example tc
1650                                  *   utility issues a "change" command.
1651                                  *   Alas, it is sort of hole in API, we
1652                                  *   cannot decide what to do unambiguously.
1653                                  *   For now we select create/graft.
1654                                  */
1655                                 if (tca[TCA_KIND] &&
1656                                     nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1657                                         if (req_create_or_replace(n) ||
1658                                             req_create_exclusive(n))
1659                                                 goto create_n_graft;
1660                                         else if (req_change(n))
1661                                                 goto create_n_graft2;
1662                                 }
1663                         }
1664                 }
1665         } else {
1666                 if (!tcm->tcm_handle) {
1667                         NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1668                         return -EINVAL;
1669                 }
1670                 q = qdisc_lookup(dev, tcm->tcm_handle);
1671         }
1672
1673         /* Change qdisc parameters */
1674         if (!q) {
1675                 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1676                 return -ENOENT;
1677         }
1678         if (n->nlmsg_flags & NLM_F_EXCL) {
1679                 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1680                 return -EEXIST;
1681         }
1682         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1683                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1684                 return -EINVAL;
1685         }
1686         err = qdisc_change(q, tca, extack);
1687         if (err == 0)
1688                 qdisc_notify(net, skb, n, clid, NULL, q);
1689         return err;
1690
1691 create_n_graft:
1692         if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1693                 NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1694                 return -ENOENT;
1695         }
1696 create_n_graft2:
1697         if (clid == TC_H_INGRESS) {
1698                 if (dev_ingress_queue(dev)) {
1699                         q = qdisc_create(dev, dev_ingress_queue(dev), p,
1700                                          tcm->tcm_parent, tcm->tcm_parent,
1701                                          tca, &err, extack);
1702                 } else {
1703                         NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1704                         err = -ENOENT;
1705                 }
1706         } else {
1707                 struct netdev_queue *dev_queue;
1708
1709                 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1710                         dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1711                 else if (p)
1712                         dev_queue = p->dev_queue;
1713                 else
1714                         dev_queue = netdev_get_tx_queue(dev, 0);
1715
1716                 q = qdisc_create(dev, dev_queue, p,
1717                                  tcm->tcm_parent, tcm->tcm_handle,
1718                                  tca, &err, extack);
1719         }
1720         if (q == NULL) {
1721                 if (err == -EAGAIN)
1722                         goto replay;
1723                 return err;
1724         }
1725
1726 graft:
1727         err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1728         if (err) {
1729                 if (q)
1730                         qdisc_put(q);
1731                 return err;
1732         }
1733
1734         return 0;
1735 }
1736
1737 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1738                               struct netlink_callback *cb,
1739                               int *q_idx_p, int s_q_idx, bool recur,
1740                               bool dump_invisible)
1741 {
1742         int ret = 0, q_idx = *q_idx_p;
1743         struct Qdisc *q;
1744         int b;
1745
1746         if (!root)
1747                 return 0;
1748
1749         q = root;
1750         if (q_idx < s_q_idx) {
1751                 q_idx++;
1752         } else {
1753                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1754                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1755                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1756                                   RTM_NEWQDISC) <= 0)
1757                         goto done;
1758                 q_idx++;
1759         }
1760
1761         /* If dumping singletons, there is no qdisc_dev(root) and the singleton
1762          * itself has already been dumped.
1763          *
1764          * If we've already dumped the top-level (ingress) qdisc above and the global
1765          * qdisc hashtable, we don't want to hit it again
1766          */
1767         if (!qdisc_dev(root) || !recur)
1768                 goto out;
1769
1770         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1771                 if (q_idx < s_q_idx) {
1772                         q_idx++;
1773                         continue;
1774                 }
1775                 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1776                     tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1777                                   cb->nlh->nlmsg_seq, NLM_F_MULTI,
1778                                   RTM_NEWQDISC) <= 0)
1779                         goto done;
1780                 q_idx++;
1781         }
1782
1783 out:
1784         *q_idx_p = q_idx;
1785         return ret;
1786 done:
1787         ret = -1;
1788         goto out;
1789 }
1790
1791 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1792 {
1793         struct net *net = sock_net(skb->sk);
1794         int idx, q_idx;
1795         int s_idx, s_q_idx;
1796         struct net_device *dev;
1797         const struct nlmsghdr *nlh = cb->nlh;
1798         struct nlattr *tca[TCA_MAX + 1];
1799         int err;
1800
1801         s_idx = cb->args[0];
1802         s_q_idx = q_idx = cb->args[1];
1803
1804         idx = 0;
1805         ASSERT_RTNL();
1806
1807         err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
1808                                      rtm_tca_policy, cb->extack);
1809         if (err < 0)
1810                 return err;
1811
1812         for_each_netdev(net, dev) {
1813                 struct netdev_queue *dev_queue;
1814
1815                 if (idx < s_idx)
1816                         goto cont;
1817                 if (idx > s_idx)
1818                         s_q_idx = 0;
1819                 q_idx = 0;
1820
1821                 if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc),
1822                                        skb, cb, &q_idx, s_q_idx,
1823                                        true, tca[TCA_DUMP_INVISIBLE]) < 0)
1824                         goto done;
1825
1826                 dev_queue = dev_ingress_queue(dev);
1827                 if (dev_queue &&
1828                     tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1829                                        &q_idx, s_q_idx, false,
1830                                        tca[TCA_DUMP_INVISIBLE]) < 0)
1831                         goto done;
1832
1833 cont:
1834                 idx++;
1835         }
1836
1837 done:
1838         cb->args[0] = idx;
1839         cb->args[1] = q_idx;
1840
1841         return skb->len;
1842 }
1843
1844
1845
1846 /************************************************
1847  *      Traffic classes manipulation.           *
1848  ************************************************/
1849
1850 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1851                           unsigned long cl,
1852                           u32 portid, u32 seq, u16 flags, int event)
1853 {
1854         struct tcmsg *tcm;
1855         struct nlmsghdr  *nlh;
1856         unsigned char *b = skb_tail_pointer(skb);
1857         struct gnet_dump d;
1858         const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1859
1860         cond_resched();
1861         nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1862         if (!nlh)
1863                 goto out_nlmsg_trim;
1864         tcm = nlmsg_data(nlh);
1865         tcm->tcm_family = AF_UNSPEC;
1866         tcm->tcm__pad1 = 0;
1867         tcm->tcm__pad2 = 0;
1868         tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1869         tcm->tcm_parent = q->handle;
1870         tcm->tcm_handle = q->handle;
1871         tcm->tcm_info = 0;
1872         if (nla_put_string(skb, TCA_KIND, q->ops->id))
1873                 goto nla_put_failure;
1874         if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1875                 goto nla_put_failure;
1876
1877         if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1878                                          NULL, &d, TCA_PAD) < 0)
1879                 goto nla_put_failure;
1880
1881         if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1882                 goto nla_put_failure;
1883
1884         if (gnet_stats_finish_copy(&d) < 0)
1885                 goto nla_put_failure;
1886
1887         nlh->nlmsg_len = skb_tail_pointer(skb) - b;
1888         return skb->len;
1889
1890 out_nlmsg_trim:
1891 nla_put_failure:
1892         nlmsg_trim(skb, b);
1893         return -1;
1894 }
1895
1896 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1897                          struct nlmsghdr *n, struct Qdisc *q,
1898                          unsigned long cl, int event)
1899 {
1900         struct sk_buff *skb;
1901         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1902         int err = 0;
1903
1904         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1905         if (!skb)
1906                 return -ENOBUFS;
1907
1908         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1909                 kfree_skb(skb);
1910                 return -EINVAL;
1911         }
1912
1913         err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1914                              n->nlmsg_flags & NLM_F_ECHO);
1915         if (err > 0)
1916                 err = 0;
1917         return err;
1918 }
1919
1920 static int tclass_del_notify(struct net *net,
1921                              const struct Qdisc_class_ops *cops,
1922                              struct sk_buff *oskb, struct nlmsghdr *n,
1923                              struct Qdisc *q, unsigned long cl)
1924 {
1925         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1926         struct sk_buff *skb;
1927         int err = 0;
1928
1929         if (!cops->delete)
1930                 return -EOPNOTSUPP;
1931
1932         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1933         if (!skb)
1934                 return -ENOBUFS;
1935
1936         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1937                            RTM_DELTCLASS) < 0) {
1938                 kfree_skb(skb);
1939                 return -EINVAL;
1940         }
1941
1942         err = cops->delete(q, cl);
1943         if (err) {
1944                 kfree_skb(skb);
1945                 return err;
1946         }
1947
1948         err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1949                              n->nlmsg_flags & NLM_F_ECHO);
1950         if (err > 0)
1951                 err = 0;
1952         return err;
1953 }
1954
1955 #ifdef CONFIG_NET_CLS
1956
1957 struct tcf_bind_args {
1958         struct tcf_walker w;
1959         unsigned long base;
1960         unsigned long cl;
1961         u32 classid;
1962 };
1963
1964 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1965 {
1966         struct tcf_bind_args *a = (void *)arg;
1967
1968         if (tp->ops->bind_class) {
1969                 struct Qdisc *q = tcf_block_q(tp->chain->block);
1970
1971                 sch_tree_lock(q);
1972                 tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
1973                 sch_tree_unlock(q);
1974         }
1975         return 0;
1976 }
1977
1978 struct tc_bind_class_args {
1979         struct qdisc_walker w;
1980         unsigned long new_cl;
1981         u32 portid;
1982         u32 clid;
1983 };
1984
1985 static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
1986                                 struct qdisc_walker *w)
1987 {
1988         struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
1989         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1990         struct tcf_block *block;
1991         struct tcf_chain *chain;
1992
1993         block = cops->tcf_block(q, cl, NULL);
1994         if (!block)
1995                 return 0;
1996         for (chain = tcf_get_next_chain(block, NULL);
1997              chain;
1998              chain = tcf_get_next_chain(block, chain)) {
1999                 struct tcf_proto *tp;
2000
2001                 for (tp = tcf_get_next_proto(chain, NULL, true);
2002                      tp; tp = tcf_get_next_proto(chain, tp, true)) {
2003                         struct tcf_bind_args arg = {};
2004
2005                         arg.w.fn = tcf_node_bind;
2006                         arg.classid = a->clid;
2007                         arg.base = cl;
2008                         arg.cl = a->new_cl;
2009                         tp->ops->walk(tp, &arg.w, true);
2010                 }
2011         }
2012
2013         return 0;
2014 }
2015
2016 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
2017                            unsigned long new_cl)
2018 {
2019         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
2020         struct tc_bind_class_args args = {};
2021
2022         if (!cops->tcf_block)
2023                 return;
2024         args.portid = portid;
2025         args.clid = clid;
2026         args.new_cl = new_cl;
2027         args.w.fn = tc_bind_class_walker;
2028         q->ops->cl_ops->walk(q, &args.w);
2029 }
2030
2031 #else
2032
2033 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
2034                            unsigned long new_cl)
2035 {
2036 }
2037
2038 #endif
2039
2040 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
2041                          struct netlink_ext_ack *extack)
2042 {
2043         struct net *net = sock_net(skb->sk);
2044         struct tcmsg *tcm = nlmsg_data(n);
2045         struct nlattr *tca[TCA_MAX + 1];
2046         struct net_device *dev;
2047         struct Qdisc *q = NULL;
2048         const struct Qdisc_class_ops *cops;
2049         unsigned long cl = 0;
2050         unsigned long new_cl;
2051         u32 portid;
2052         u32 clid;
2053         u32 qid;
2054         int err;
2055
2056         if ((n->nlmsg_type != RTM_GETTCLASS) &&
2057             !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
2058                 return -EPERM;
2059
2060         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
2061                                      rtm_tca_policy, extack);
2062         if (err < 0)
2063                 return err;
2064
2065         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
2066         if (!dev)
2067                 return -ENODEV;
2068
2069         /*
2070            parent == TC_H_UNSPEC - unspecified parent.
2071            parent == TC_H_ROOT   - class is root, which has no parent.
2072            parent == X:0         - parent is root class.
2073            parent == X:Y         - parent is a node in hierarchy.
2074            parent == 0:Y         - parent is X:Y, where X:0 is qdisc.
2075
2076            handle == 0:0         - generate handle from kernel pool.
2077            handle == 0:Y         - class is X:Y, where X:0 is qdisc.
2078            handle == X:Y         - clear.
2079            handle == X:0         - root class.
2080          */
2081
2082         /* Step 1. Determine qdisc handle X:0 */
2083
2084         portid = tcm->tcm_parent;
2085         clid = tcm->tcm_handle;
2086         qid = TC_H_MAJ(clid);
2087
2088         if (portid != TC_H_ROOT) {
2089                 u32 qid1 = TC_H_MAJ(portid);
2090
2091                 if (qid && qid1) {
2092                         /* If both majors are known, they must be identical. */
2093                         if (qid != qid1)
2094                                 return -EINVAL;
2095                 } else if (qid1) {
2096                         qid = qid1;
2097                 } else if (qid == 0)
2098                         qid = rtnl_dereference(dev->qdisc)->handle;
2099
2100                 /* Now qid is genuine qdisc handle consistent
2101                  * both with parent and child.
2102                  *
2103                  * TC_H_MAJ(portid) still may be unspecified, complete it now.
2104                  */
2105                 if (portid)
2106                         portid = TC_H_MAKE(qid, portid);
2107         } else {
2108                 if (qid == 0)
2109                         qid = rtnl_dereference(dev->qdisc)->handle;
2110         }
2111
2112         /* OK. Locate qdisc */
2113         q = qdisc_lookup(dev, qid);
2114         if (!q)
2115                 return -ENOENT;
2116
2117         /* An check that it supports classes */
2118         cops = q->ops->cl_ops;
2119         if (cops == NULL)
2120                 return -EINVAL;
2121
2122         /* Now try to get class */
2123         if (clid == 0) {
2124                 if (portid == TC_H_ROOT)
2125                         clid = qid;
2126         } else
2127                 clid = TC_H_MAKE(qid, clid);
2128
2129         if (clid)
2130                 cl = cops->find(q, clid);
2131
2132         if (cl == 0) {
2133                 err = -ENOENT;
2134                 if (n->nlmsg_type != RTM_NEWTCLASS ||
2135                     !(n->nlmsg_flags & NLM_F_CREATE))
2136                         goto out;
2137         } else {
2138                 switch (n->nlmsg_type) {
2139                 case RTM_NEWTCLASS:
2140                         err = -EEXIST;
2141                         if (n->nlmsg_flags & NLM_F_EXCL)
2142                                 goto out;
2143                         break;
2144                 case RTM_DELTCLASS:
2145                         err = tclass_del_notify(net, cops, skb, n, q, cl);
2146                         /* Unbind the class with flilters with 0 */
2147                         tc_bind_tclass(q, portid, clid, 0);
2148                         goto out;
2149                 case RTM_GETTCLASS:
2150                         err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
2151                         goto out;
2152                 default:
2153                         err = -EINVAL;
2154                         goto out;
2155                 }
2156         }
2157
2158         if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
2159                 NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
2160                 return -EOPNOTSUPP;
2161         }
2162
2163         new_cl = cl;
2164         err = -EOPNOTSUPP;
2165         if (cops->change)
2166                 err = cops->change(q, clid, portid, tca, &new_cl, extack);
2167         if (err == 0) {
2168                 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
2169                 /* We just create a new class, need to do reverse binding. */
2170                 if (cl != new_cl)
2171                         tc_bind_tclass(q, portid, clid, new_cl);
2172         }
2173 out:
2174         return err;
2175 }
2176
2177 struct qdisc_dump_args {
2178         struct qdisc_walker     w;
2179         struct sk_buff          *skb;
2180         struct netlink_callback *cb;
2181 };
2182
2183 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2184                             struct qdisc_walker *arg)
2185 {
2186         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2187
2188         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2189                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2190                               RTM_NEWTCLASS);
2191 }
2192
2193 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
2194                                 struct tcmsg *tcm, struct netlink_callback *cb,
2195                                 int *t_p, int s_t)
2196 {
2197         struct qdisc_dump_args arg;
2198
2199         if (tc_qdisc_dump_ignore(q, false) ||
2200             *t_p < s_t || !q->ops->cl_ops ||
2201             (tcm->tcm_parent &&
2202              TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
2203                 (*t_p)++;
2204                 return 0;
2205         }
2206         if (*t_p > s_t)
2207                 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
2208         arg.w.fn = qdisc_class_dump;
2209         arg.skb = skb;
2210         arg.cb = cb;
2211         arg.w.stop  = 0;
2212         arg.w.skip = cb->args[1];
2213         arg.w.count = 0;
2214         q->ops->cl_ops->walk(q, &arg.w);
2215         cb->args[1] = arg.w.count;
2216         if (arg.w.stop)
2217                 return -1;
2218         (*t_p)++;
2219         return 0;
2220 }
2221
2222 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2223                                struct tcmsg *tcm, struct netlink_callback *cb,
2224                                int *t_p, int s_t, bool recur)
2225 {
2226         struct Qdisc *q;
2227         int b;
2228
2229         if (!root)
2230                 return 0;
2231
2232         if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2233                 return -1;
2234
2235         if (!qdisc_dev(root) || !recur)
2236                 return 0;
2237
2238         if (tcm->tcm_parent) {
2239                 q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2240                 if (q && q != root &&
2241                     tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2242                         return -1;
2243                 return 0;
2244         }
2245         hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2246                 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2247                         return -1;
2248         }
2249
2250         return 0;
2251 }
2252
2253 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2254 {
2255         struct tcmsg *tcm = nlmsg_data(cb->nlh);
2256         struct net *net = sock_net(skb->sk);
2257         struct netdev_queue *dev_queue;
2258         struct net_device *dev;
2259         int t, s_t;
2260
2261         if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2262                 return 0;
2263         dev = dev_get_by_index(net, tcm->tcm_ifindex);
2264         if (!dev)
2265                 return 0;
2266
2267         s_t = cb->args[0];
2268         t = 0;
2269
2270         if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc),
2271                                 skb, tcm, cb, &t, s_t, true) < 0)
2272                 goto done;
2273
2274         dev_queue = dev_ingress_queue(dev);
2275         if (dev_queue &&
2276             tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
2277                                 &t, s_t, false) < 0)
2278                 goto done;
2279
2280 done:
2281         cb->args[0] = t;
2282
2283         dev_put(dev);
2284         return skb->len;
2285 }
2286
2287 #ifdef CONFIG_PROC_FS
2288 static int psched_show(struct seq_file *seq, void *v)
2289 {
2290         seq_printf(seq, "%08x %08x %08x %08x\n",
2291                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2292                    1000000,
2293                    (u32)NSEC_PER_SEC / hrtimer_resolution);
2294
2295         return 0;
2296 }
2297
2298 static int __net_init psched_net_init(struct net *net)
2299 {
2300         struct proc_dir_entry *e;
2301
2302         e = proc_create_single("psched", 0, net->proc_net, psched_show);
2303         if (e == NULL)
2304                 return -ENOMEM;
2305
2306         return 0;
2307 }
2308
2309 static void __net_exit psched_net_exit(struct net *net)
2310 {
2311         remove_proc_entry("psched", net->proc_net);
2312 }
2313 #else
2314 static int __net_init psched_net_init(struct net *net)
2315 {
2316         return 0;
2317 }
2318
2319 static void __net_exit psched_net_exit(struct net *net)
2320 {
2321 }
2322 #endif
2323
2324 static struct pernet_operations psched_net_ops = {
2325         .init = psched_net_init,
2326         .exit = psched_net_exit,
2327 };
2328
2329 static int __init pktsched_init(void)
2330 {
2331         int err;
2332
2333         err = register_pernet_subsys(&psched_net_ops);
2334         if (err) {
2335                 pr_err("pktsched_init: "
2336                        "cannot initialize per netns operations\n");
2337                 return err;
2338         }
2339
2340         register_qdisc(&pfifo_fast_ops);
2341         register_qdisc(&pfifo_qdisc_ops);
2342         register_qdisc(&bfifo_qdisc_ops);
2343         register_qdisc(&pfifo_head_drop_qdisc_ops);
2344         register_qdisc(&mq_qdisc_ops);
2345         register_qdisc(&noqueue_qdisc_ops);
2346
2347         rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2348         rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2349         rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2350                       0);
2351         rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2352         rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2353         rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2354                       0);
2355
2356         return 0;
2357 }
2358
2359 subsys_initcall(pktsched_init);