/* [releases.git] / sched / sch_api.c */
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_api.c  Packet scheduler API.
4  *
5  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6  *
7  * Fixes:
8  *
9  * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
10  * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
11  * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
12  */
13
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <linux/init.h>
21 #include <linux/proc_fs.h>
22 #include <linux/seq_file.h>
23 #include <linux/kmod.h>
24 #include <linux/list.h>
25 #include <linux/hrtimer.h>
26 #include <linux/slab.h>
27 #include <linux/hashtable.h>
28
29 #include <net/net_namespace.h>
30 #include <net/sock.h>
31 #include <net/netlink.h>
32 #include <net/pkt_sched.h>
33 #include <net/pkt_cls.h>
34
35 #include <trace/events/qdisc.h>
36
37 /*
38
39    Short review.
40    -------------
41
42    This file consists of two interrelated parts:
43
44    1. queueing disciplines manager frontend.
45    2. traffic classes manager frontend.
46
47    Generally, queueing discipline ("qdisc") is a black box,
48    which is able to enqueue packets and to dequeue them (when
49    device is ready to send something) in order and at times
50    determined by algorithm hidden in it.
51
52    qdisc's are divided into two categories:
53    - "queues", which have no internal structure visible from outside.
54    - "schedulers", which split all the packets to "traffic classes",
55      using "packet classifiers" (look at cls_api.c)
56
57    In turn, classes may have child qdiscs (as a rule, queues)
58    attached to them etc. etc. etc.
59
60    The goal of the routines in this file is to translate
61    the handle-based information supplied by the user into a
62    form more intelligible to the kernel, to perform sanity
63    checks and the parts of the work that are common to all
64    qdiscs, and to provide rtnetlink notifications.
65
66    All real intelligent work is done inside qdisc modules.
67
68
69
70    Every discipline has two major routines: enqueue and dequeue.
71
72    ---dequeue
73
74    dequeue usually returns a skb to send. It is allowed to return NULL,
75    but it does not mean that queue is empty, it just means that
76    discipline does not want to send anything this time.
77    Queue is really empty if q->q.qlen == 0.
78    For complicated disciplines with multiple queues q->q is not
79    real packet queue, but however q->q.qlen must be valid.
80
81    ---enqueue
82
83    enqueue returns 0, if packet was enqueued successfully.
84    If this packet or another one was dropped, it returns a
85    non-zero error code.
86    NET_XMIT_DROP        - this packet dropped
87      Expected action: do not backoff, but wait until queue will clear.
88    NET_XMIT_CN          - probably this packet enqueued, but another one dropped.
89      Expected action: backoff or ignore
90
91    Auxiliary routines:
92
93    ---peek
94
95    like dequeue but without removing a packet from the queue
96
97    ---reset
98
99    returns qdisc to initial state: purge all buffers, clear all
100    timers, counters (except for statistics) etc.
101
102    ---init
103
104    initializes newly created qdisc.
105
106    ---destroy
107
108    destroys resources allocated by init and during lifetime of qdisc.
109
110    ---change
111
112    changes qdisc parameters.
113  */
114
115 /* Protects list of registered TC modules. It is pure SMP lock. */
116 static DEFINE_RWLOCK(qdisc_mod_lock);
117
118
119 /************************************************
120  *      Queueing disciplines manipulation.      *
121  ************************************************/
122
123
124 /* The list of all installed queueing disciplines. */
125
126 static struct Qdisc_ops *qdisc_base;
127
128 /* Register/unregister queueing discipline */
129
130 int register_qdisc(struct Qdisc_ops *qops)
131 {
132         struct Qdisc_ops *q, **qp;
133         int rc = -EEXIST;
134
135         write_lock(&qdisc_mod_lock);
136         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
137                 if (!strcmp(qops->id, q->id))
138                         goto out;
139
140         if (qops->enqueue == NULL)
141                 qops->enqueue = noop_qdisc_ops.enqueue;
142         if (qops->peek == NULL) {
143                 if (qops->dequeue == NULL)
144                         qops->peek = noop_qdisc_ops.peek;
145                 else
146                         goto out_einval;
147         }
148         if (qops->dequeue == NULL)
149                 qops->dequeue = noop_qdisc_ops.dequeue;
150
151         if (qops->cl_ops) {
152                 const struct Qdisc_class_ops *cops = qops->cl_ops;
153
154                 if (!(cops->find && cops->walk && cops->leaf))
155                         goto out_einval;
156
157                 if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
158                         goto out_einval;
159         }
160
161         qops->next = NULL;
162         *qp = qops;
163         rc = 0;
164 out:
165         write_unlock(&qdisc_mod_lock);
166         return rc;
167
168 out_einval:
169         rc = -EINVAL;
170         goto out;
171 }
172 EXPORT_SYMBOL(register_qdisc);
173
174 void unregister_qdisc(struct Qdisc_ops *qops)
175 {
176         struct Qdisc_ops *q, **qp;
177         int err = -ENOENT;
178
179         write_lock(&qdisc_mod_lock);
180         for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
181                 if (q == qops)
182                         break;
183         if (q) {
184                 *qp = q->next;
185                 q->next = NULL;
186                 err = 0;
187         }
188         write_unlock(&qdisc_mod_lock);
189
190         WARN(err, "unregister qdisc(%s) failed\n", qops->id);
191 }
192 EXPORT_SYMBOL(unregister_qdisc);
193
194 /* Get default qdisc if not otherwise specified */
195 void qdisc_get_default(char *name, size_t len)
196 {
197         read_lock(&qdisc_mod_lock);
198         strscpy(name, default_qdisc_ops->id, len);
199         read_unlock(&qdisc_mod_lock);
200 }
201
202 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
203 {
204         struct Qdisc_ops *q = NULL;
205
206         for (q = qdisc_base; q; q = q->next) {
207                 if (!strcmp(name, q->id)) {
208                         if (!try_module_get(q->owner))
209                                 q = NULL;
210                         break;
211                 }
212         }
213
214         return q;
215 }
216
217 /* Set new default qdisc to use */
218 int qdisc_set_default(const char *name)
219 {
220         const struct Qdisc_ops *ops;
221
222         if (!capable(CAP_NET_ADMIN))
223                 return -EPERM;
224
225         write_lock(&qdisc_mod_lock);
226         ops = qdisc_lookup_default(name);
227         if (!ops) {
228                 /* Not found, drop lock and try to load module */
229                 write_unlock(&qdisc_mod_lock);
230                 request_module("sch_%s", name);
231                 write_lock(&qdisc_mod_lock);
232
233                 ops = qdisc_lookup_default(name);
234         }
235
236         if (ops) {
237                 /* Set new default */
238                 module_put(default_qdisc_ops->owner);
239                 default_qdisc_ops = ops;
240         }
241         write_unlock(&qdisc_mod_lock);
242
243         return ops ? 0 : -ENOENT;
244 }
245
#ifdef CONFIG_NET_SCH_DEFAULT
/* At late boot, install the Kconfig-selected default qdisc. */
static int __init sch_default_qdisc(void)
{
	return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
}
late_initcall(sch_default_qdisc);
#endif
254
255 /* We know handle. Find qdisc among all qdisc's attached to device
256  * (root qdisc, all its children, children of children etc.)
257  * Note: caller either uses rtnl or rcu_read_lock()
258  */
259
260 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
261 {
262         struct Qdisc *q;
263
264         if (!qdisc_dev(root))
265                 return (root->handle == handle ? root : NULL);
266
267         if (!(root->flags & TCQ_F_BUILTIN) &&
268             root->handle == handle)
269                 return root;
270
271         hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle,
272                                    lockdep_rtnl_is_held()) {
273                 if (q->handle == handle)
274                         return q;
275         }
276         return NULL;
277 }
278
279 void qdisc_hash_add(struct Qdisc *q, bool invisible)
280 {
281         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
282                 ASSERT_RTNL();
283                 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
284                 if (invisible)
285                         q->flags |= TCQ_F_INVISIBLE;
286         }
287 }
288 EXPORT_SYMBOL(qdisc_hash_add);
289
290 void qdisc_hash_del(struct Qdisc *q)
291 {
292         if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
293                 ASSERT_RTNL();
294                 hash_del_rcu(&q->hash);
295         }
296 }
297 EXPORT_SYMBOL(qdisc_hash_del);
298
299 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
300 {
301         struct Qdisc *q;
302
303         if (!handle)
304                 return NULL;
305         q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle);
306         if (q)
307                 goto out;
308
309         if (dev_ingress_queue(dev))
310                 q = qdisc_match_from_root(
311                         rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping),
312                         handle);
313 out:
314         return q;
315 }
316
317 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
318 {
319         struct netdev_queue *nq;
320         struct Qdisc *q;
321
322         if (!handle)
323                 return NULL;
324         q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle);
325         if (q)
326                 goto out;
327
328         nq = dev_ingress_queue_rcu(dev);
329         if (nq)
330                 q = qdisc_match_from_root(rcu_dereference(nq->qdisc_sleeping),
331                                           handle);
332 out:
333         return q;
334 }
335
336 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
337 {
338         unsigned long cl;
339         const struct Qdisc_class_ops *cops = p->ops->cl_ops;
340
341         if (cops == NULL)
342                 return NULL;
343         cl = cops->find(p, classid);
344
345         if (cl == 0)
346                 return NULL;
347         return cops->leaf(p, cl);
348 }
349
350 /* Find queueing discipline by name */
351
352 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
353 {
354         struct Qdisc_ops *q = NULL;
355
356         if (kind) {
357                 read_lock(&qdisc_mod_lock);
358                 for (q = qdisc_base; q; q = q->next) {
359                         if (nla_strcmp(kind, q->id) == 0) {
360                                 if (!try_module_get(q->owner))
361                                         q = NULL;
362                                 break;
363                         }
364                 }
365                 read_unlock(&qdisc_mod_lock);
366         }
367         return q;
368 }
369
370 /* The linklayer setting were not transferred from iproute2, in older
371  * versions, and the rate tables lookup systems have been dropped in
372  * the kernel. To keep backward compatible with older iproute2 tc
373  * utils, we detect the linklayer setting by detecting if the rate
374  * table were modified.
375  *
376  * For linklayer ATM table entries, the rate table will be aligned to
377  * 48 bytes, thus some table entries will contain the same value.  The
378  * mpu (min packet unit) is also encoded into the old rate table, thus
379  * starting from the mpu, we find low and high table entries for
380  * mapping this cell.  If these entries contain the same value, when
381  * the rate tables have been modified for linklayer ATM.
382  *
383  * This is done by rounding mpu to the nearest 48 bytes cell/entry,
384  * and then roundup to the next cell, calc the table entry one below,
385  * and compare.
386  */
387 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
388 {
389         int low       = roundup(r->mpu, 48);
390         int high      = roundup(low+1, 48);
391         int cell_low  = low >> r->cell_log;
392         int cell_high = (high >> r->cell_log) - 1;
393
394         /* rtab is too inaccurate at rates > 100Mbit/s */
395         if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
396                 pr_debug("TC linklayer: Giving up ATM detection\n");
397                 return TC_LINKLAYER_ETHERNET;
398         }
399
400         if ((cell_high > cell_low) && (cell_high < 256)
401             && (rtab[cell_low] == rtab[cell_high])) {
402                 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
403                          cell_low, cell_high, rtab[cell_high]);
404                 return TC_LINKLAYER_ATM;
405         }
406         return TC_LINKLAYER_ETHERNET;
407 }
408
/* Head of the global singly-linked list of refcounted rate tables,
 * shared via qdisc_get_rtab()/qdisc_put_rtab().
 */
409 static struct qdisc_rate_table *qdisc_rtab_list;
410
411 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
412                                         struct nlattr *tab,
413                                         struct netlink_ext_ack *extack)
414 {
415         struct qdisc_rate_table *rtab;
416
417         if (tab == NULL || r->rate == 0 ||
418             r->cell_log == 0 || r->cell_log >= 32 ||
419             nla_len(tab) != TC_RTAB_SIZE) {
420                 NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
421                 return NULL;
422         }
423
424         for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
425                 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
426                     !memcmp(&rtab->data, nla_data(tab), 1024)) {
427                         rtab->refcnt++;
428                         return rtab;
429                 }
430         }
431
432         rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
433         if (rtab) {
434                 rtab->rate = *r;
435                 rtab->refcnt = 1;
436                 memcpy(rtab->data, nla_data(tab), 1024);
437                 if (r->linklayer == TC_LINKLAYER_UNAWARE)
438                         r->linklayer = __detect_linklayer(r, rtab->data);
439                 rtab->next = qdisc_rtab_list;
440                 qdisc_rtab_list = rtab;
441         } else {
442                 NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
443         }
444         return rtab;
445 }
446 EXPORT_SYMBOL(qdisc_get_rtab);
447
448 void qdisc_put_rtab(struct qdisc_rate_table *tab)
449 {
450         struct qdisc_rate_table *rtab, **rtabp;
451
452         if (!tab || --tab->refcnt)
453                 return;
454
455         for (rtabp = &qdisc_rtab_list;
456              (rtab = *rtabp) != NULL;
457              rtabp = &rtab->next) {
458                 if (rtab == tab) {
459                         *rtabp = rtab->next;
460                         kfree(rtab);
461                         return;
462                 }
463         }
464 }
465 EXPORT_SYMBOL(qdisc_put_rtab);
466
/* All shared size tables, looked up/created by qdisc_get_stab() and
 * released by qdisc_put_stab().
 */
467 static LIST_HEAD(qdisc_stab_list);
468
/* Netlink policy for the TCA_STAB_* nested attributes. */
469 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
470         [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
471         [TCA_STAB_DATA] = { .type = NLA_BINARY },
472 };
473
474 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
475                                                struct netlink_ext_ack *extack)
476 {
477         struct nlattr *tb[TCA_STAB_MAX + 1];
478         struct qdisc_size_table *stab;
479         struct tc_sizespec *s;
480         unsigned int tsize = 0;
481         u16 *tab = NULL;
482         int err;
483
484         err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
485                                           extack);
486         if (err < 0)
487                 return ERR_PTR(err);
488         if (!tb[TCA_STAB_BASE]) {
489                 NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
490                 return ERR_PTR(-EINVAL);
491         }
492
493         s = nla_data(tb[TCA_STAB_BASE]);
494
495         if (s->tsize > 0) {
496                 if (!tb[TCA_STAB_DATA]) {
497                         NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
498                         return ERR_PTR(-EINVAL);
499                 }
500                 tab = nla_data(tb[TCA_STAB_DATA]);
501                 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
502         }
503
504         if (tsize != s->tsize || (!tab && tsize > 0)) {
505                 NL_SET_ERR_MSG(extack, "Invalid size of size table");
506                 return ERR_PTR(-EINVAL);
507         }
508
509         list_for_each_entry(stab, &qdisc_stab_list, list) {
510                 if (memcmp(&stab->szopts, s, sizeof(*s)))
511                         continue;
512                 if (tsize > 0 &&
513                     memcmp(stab->data, tab, flex_array_size(stab, data, tsize)))
514                         continue;
515                 stab->refcnt++;
516                 return stab;
517         }
518
519         if (s->size_log > STAB_SIZE_LOG_MAX ||
520             s->cell_log > STAB_SIZE_LOG_MAX) {
521                 NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table");
522                 return ERR_PTR(-EINVAL);
523         }
524
525         stab = kmalloc(struct_size(stab, data, tsize), GFP_KERNEL);
526         if (!stab)
527                 return ERR_PTR(-ENOMEM);
528
529         stab->refcnt = 1;
530         stab->szopts = *s;
531         if (tsize > 0)
532                 memcpy(stab->data, tab, flex_array_size(stab, data, tsize));
533
534         list_add_tail(&stab->list, &qdisc_stab_list);
535
536         return stab;
537 }
538
539 void qdisc_put_stab(struct qdisc_size_table *tab)
540 {
541         if (!tab)
542                 return;
543
544         if (--tab->refcnt == 0) {
545                 list_del(&tab->list);
546                 kfree_rcu(tab, rcu);
547         }
548 }
549 EXPORT_SYMBOL(qdisc_put_stab);
550
551 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
552 {
553         struct nlattr *nest;
554
555         nest = nla_nest_start_noflag(skb, TCA_STAB);
556         if (nest == NULL)
557                 goto nla_put_failure;
558         if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
559                 goto nla_put_failure;
560         nla_nest_end(skb, nest);
561
562         return skb->len;
563
564 nla_put_failure:
565         return -1;
566 }
567
568 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
569                                const struct qdisc_size_table *stab)
570 {
571         int pkt_len, slot;
572
573         pkt_len = skb->len + stab->szopts.overhead;
574         if (unlikely(!stab->szopts.tsize))
575                 goto out;
576
577         slot = pkt_len + stab->szopts.cell_align;
578         if (unlikely(slot < 0))
579                 slot = 0;
580
581         slot >>= stab->szopts.cell_log;
582         if (likely(slot < stab->szopts.tsize))
583                 pkt_len = stab->data[slot];
584         else
585                 pkt_len = stab->data[stab->szopts.tsize - 1] *
586                                 (slot / stab->szopts.tsize) +
587                                 stab->data[slot % stab->szopts.tsize];
588
589         pkt_len <<= stab->szopts.size_log;
590 out:
591         if (unlikely(pkt_len < 1))
592                 pkt_len = 1;
593         qdisc_skb_cb(skb)->pkt_len = pkt_len;
594 }
595 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
596
597 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
598 {
599         if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
600                 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
601                         txt, qdisc->ops->id, qdisc->handle >> 16);
602                 qdisc->flags |= TCQ_F_WARN_NONWC;
603         }
604 }
605 EXPORT_SYMBOL(qdisc_warn_nonwc);
606
607 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
608 {
609         struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
610                                                  timer);
611
612         rcu_read_lock();
613         __netif_schedule(qdisc_root(wd->qdisc));
614         rcu_read_unlock();
615
616         return HRTIMER_NORESTART;
617 }
618
619 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
620                                  clockid_t clockid)
621 {
622         hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
623         wd->timer.function = qdisc_watchdog;
624         wd->qdisc = qdisc;
625 }
626 EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
627
628 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
629 {
630         qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
631 }
632 EXPORT_SYMBOL(qdisc_watchdog_init);
633
634 void qdisc_watchdog_schedule_range_ns(struct qdisc_watchdog *wd, u64 expires,
635                                       u64 delta_ns)
636 {
637         bool deactivated;
638
639         rcu_read_lock();
640         deactivated = test_bit(__QDISC_STATE_DEACTIVATED,
641                                &qdisc_root_sleeping(wd->qdisc)->state);
642         rcu_read_unlock();
643         if (deactivated)
644                 return;
645
646         if (hrtimer_is_queued(&wd->timer)) {
647                 /* If timer is already set in [expires, expires + delta_ns],
648                  * do not reprogram it.
649                  */
650                 if (wd->last_expires - expires <= delta_ns)
651                         return;
652         }
653
654         wd->last_expires = expires;
655         hrtimer_start_range_ns(&wd->timer,
656                                ns_to_ktime(expires),
657                                delta_ns,
658                                HRTIMER_MODE_ABS_PINNED);
659 }
660 EXPORT_SYMBOL(qdisc_watchdog_schedule_range_ns);
661
662 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
663 {
664         hrtimer_cancel(&wd->timer);
665 }
666 EXPORT_SYMBOL(qdisc_watchdog_cancel);
667
668 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
669 {
670         struct hlist_head *h;
671         unsigned int i;
672
673         h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
674
675         if (h != NULL) {
676                 for (i = 0; i < n; i++)
677                         INIT_HLIST_HEAD(&h[i]);
678         }
679         return h;
680 }
681
682 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
683 {
684         struct Qdisc_class_common *cl;
685         struct hlist_node *next;
686         struct hlist_head *nhash, *ohash;
687         unsigned int nsize, nmask, osize;
688         unsigned int i, h;
689
690         /* Rehash when load factor exceeds 0.75 */
691         if (clhash->hashelems * 4 <= clhash->hashsize * 3)
692                 return;
693         nsize = clhash->hashsize * 2;
694         nmask = nsize - 1;
695         nhash = qdisc_class_hash_alloc(nsize);
696         if (nhash == NULL)
697                 return;
698
699         ohash = clhash->hash;
700         osize = clhash->hashsize;
701
702         sch_tree_lock(sch);
703         for (i = 0; i < osize; i++) {
704                 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
705                         h = qdisc_class_hash(cl->classid, nmask);
706                         hlist_add_head(&cl->hnode, &nhash[h]);
707                 }
708         }
709         clhash->hash     = nhash;
710         clhash->hashsize = nsize;
711         clhash->hashmask = nmask;
712         sch_tree_unlock(sch);
713
714         kvfree(ohash);
715 }
716 EXPORT_SYMBOL(qdisc_class_hash_grow);
717
718 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
719 {
720         unsigned int size = 4;
721
722         clhash->hash = qdisc_class_hash_alloc(size);
723         if (!clhash->hash)
724                 return -ENOMEM;
725         clhash->hashsize  = size;
726         clhash->hashmask  = size - 1;
727         clhash->hashelems = 0;
728         return 0;
729 }
730 EXPORT_SYMBOL(qdisc_class_hash_init);
731
732 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
733 {
734         kvfree(clhash->hash);
735 }
736 EXPORT_SYMBOL(qdisc_class_hash_destroy);
737
738 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
739                              struct Qdisc_class_common *cl)
740 {
741         unsigned int h;
742
743         INIT_HLIST_NODE(&cl->hnode);
744         h = qdisc_class_hash(cl->classid, clhash->hashmask);
745         hlist_add_head(&cl->hnode, &clhash->hash[h]);
746         clhash->hashelems++;
747 }
748 EXPORT_SYMBOL(qdisc_class_hash_insert);
749
750 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
751                              struct Qdisc_class_common *cl)
752 {
753         hlist_del(&cl->hnode);
754         clhash->hashelems--;
755 }
756 EXPORT_SYMBOL(qdisc_class_hash_remove);
757
758 /* Allocate an unique handle from space managed by kernel
759  * Possible range is [8000-FFFF]:0000 (0x8000 values)
760  */
761 static u32 qdisc_alloc_handle(struct net_device *dev)
762 {
763         int i = 0x8000;
764         static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
765
766         do {
767                 autohandle += TC_H_MAKE(0x10000U, 0);
768                 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
769                         autohandle = TC_H_MAKE(0x80000000U, 0);
770                 if (!qdisc_lookup(dev, autohandle))
771                         return autohandle;
772                 cond_resched();
773         } while (--i > 0);
774
775         return 0;
776 }
777
778 void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
779 {
780         bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
781         const struct Qdisc_class_ops *cops;
782         unsigned long cl;
783         u32 parentid;
784         bool notify;
785         int drops;
786
787         if (n == 0 && len == 0)
788                 return;
789         drops = max_t(int, n, 0);
790         rcu_read_lock();
791         while ((parentid = sch->parent)) {
792                 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
793                         break;
794
795                 if (sch->flags & TCQ_F_NOPARENT)
796                         break;
797                 /* Notify parent qdisc only if child qdisc becomes empty.
798                  *
799                  * If child was empty even before update then backlog
800                  * counter is screwed and we skip notification because
801                  * parent class is already passive.
802                  *
803                  * If the original child was offloaded then it is allowed
804                  * to be seem as empty, so the parent is notified anyway.
805                  */
806                 notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
807                                                        !qdisc_is_offloaded);
808                 /* TODO: perform the search on a per txq basis */
809                 sch = qdisc_lookup_rcu(qdisc_dev(sch), TC_H_MAJ(parentid));
810                 if (sch == NULL) {
811                         WARN_ON_ONCE(parentid != TC_H_ROOT);
812                         break;
813                 }
814                 cops = sch->ops->cl_ops;
815                 if (notify && cops->qlen_notify) {
816                         cl = cops->find(sch, parentid);
817                         cops->qlen_notify(sch, cl);
818                 }
819                 sch->q.qlen -= n;
820                 sch->qstats.backlog -= len;
821                 __qdisc_qstats_drop(sch, drops);
822         }
823         rcu_read_unlock();
824 }
825 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
826
827 int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
828                               void *type_data)
829 {
830         struct net_device *dev = qdisc_dev(sch);
831         int err;
832
833         sch->flags &= ~TCQ_F_OFFLOADED;
834         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
835                 return 0;
836
837         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
838         if (err == -EOPNOTSUPP)
839                 return 0;
840
841         if (!err)
842                 sch->flags |= TCQ_F_OFFLOADED;
843
844         return err;
845 }
846 EXPORT_SYMBOL(qdisc_offload_dump_helper);
847
848 void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
849                                 struct Qdisc *new, struct Qdisc *old,
850                                 enum tc_setup_type type, void *type_data,
851                                 struct netlink_ext_ack *extack)
852 {
853         bool any_qdisc_is_offloaded;
854         int err;
855
856         if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
857                 return;
858
859         err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
860
861         /* Don't report error if the graft is part of destroy operation. */
862         if (!err || !new || new == &noop_qdisc)
863                 return;
864
865         /* Don't report error if the parent, the old child and the new
866          * one are not offloaded.
867          */
868         any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
869         any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
870         any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
871
872         if (any_qdisc_is_offloaded)
873                 NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
874 }
875 EXPORT_SYMBOL(qdisc_offload_graft_helper);
876
877 void qdisc_offload_query_caps(struct net_device *dev,
878                               enum tc_setup_type type,
879                               void *caps, size_t caps_len)
880 {
881         const struct net_device_ops *ops = dev->netdev_ops;
882         struct tc_query_caps_base base = {
883                 .type = type,
884                 .caps = caps,
885         };
886
887         memset(caps, 0, caps_len);
888
889         if (ops->ndo_setup_tc)
890                 ops->ndo_setup_tc(dev, TC_QUERY_CAPS, &base);
891 }
892 EXPORT_SYMBOL(qdisc_offload_query_caps);
893
894 static void qdisc_offload_graft_root(struct net_device *dev,
895                                      struct Qdisc *new, struct Qdisc *old,
896                                      struct netlink_ext_ack *extack)
897 {
898         struct tc_root_qopt_offload graft_offload = {
899                 .command        = TC_ROOT_GRAFT,
900                 .handle         = new ? new->handle : 0,
901                 .ingress        = (new && new->flags & TCQ_F_INGRESS) ||
902                                   (old && old->flags & TCQ_F_INGRESS),
903         };
904
905         qdisc_offload_graft_helper(dev, NULL, new, old,
906                                    TC_SETUP_ROOT_QDISC, &graft_offload, extack);
907 }
908
/* Fill @skb with one netlink message of type @event describing qdisc @q.
 *
 * Emits a tcmsg header followed by TCA_* attributes: kind, optional
 * shared filter block indexes, qdisc-specific options (->dump), the
 * offload flag, the size table, and the gnet statistics TLVs.
 *
 * Returns the resulting skb length on success, or -1 when the message
 * does not fit; in that case the skb is trimmed back to its tail as it
 * was on entry.
 */
static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
			 u32 portid, u32 seq, u16 flags, int event,
			 struct netlink_ext_ack *extack)
{
	struct gnet_stats_basic_sync __percpu *cpu_bstats = NULL;
	struct gnet_stats_queue __percpu *cpu_qstats = NULL;
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);	/* rollback point on failure */
	struct gnet_dump d;
	struct qdisc_size_table *stab;
	u32 block_index;
	__u32 qlen;

	/* Dumps may iterate many qdiscs under RTNL; give others a chance. */
	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	tcm->tcm_parent = clid;
	tcm->tcm_handle = q->handle;
	/* tcm_info carries the current refcount for userspace diagnostics. */
	tcm->tcm_info = refcount_read(&q->refcnt);
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	/* Shared filter block indexes are only reported when non-zero. */
	if (q->ops->ingress_block_get) {
		block_index = q->ops->ingress_block_get(q);
		if (block_index &&
		    nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
			goto nla_put_failure;
	}
	if (q->ops->egress_block_get) {
		block_index = q->ops->egress_block_get(q);
		if (block_index &&
		    nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
			goto nla_put_failure;
	}
	/* Qdisc-specific configuration (typically nested under TCA_OPTIONS). */
	if (q->ops->dump && q->ops->dump(q, skb) < 0)
		goto nla_put_failure;
	if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
		goto nla_put_failure;
	qlen = qdisc_qlen_sum(q);

	stab = rtnl_dereference(q->stab);
	if (stab && qdisc_dump_stab(skb, stab) < 0)
		goto nla_put_failure;

	/* Statistics: TCA_STATS2 plus a TCA_STATS/TCA_XSTATS compat copy. */
	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
		goto nla_put_failure;

	/* Lockless qdiscs keep their counters in per-CPU variables. */
	if (qdisc_is_percpu_stats(q)) {
		cpu_bstats = q->cpu_bstats;
		cpu_qstats = q->cpu_qstats;
	}

	if (gnet_stats_copy_basic(&d, cpu_bstats, &q->bstats, true) < 0 ||
	    gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
	    gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	/* Relay any warning text collected in extack as TCA_EXT_WARN_MSG. */
	if (extack && extack->_msg &&
	    nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg))
		goto out_nlmsg_trim;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;

	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
992
993 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
994 {
995         if (q->flags & TCQ_F_BUILTIN)
996                 return true;
997         if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
998                 return true;
999
1000         return false;
1001 }
1002
/* Send an RTNLGRP_TC notification describing a qdisc replacement:
 * @old is reported with RTM_DELQDISC, @new with RTM_NEWQDISC (carrying
 * NLM_F_REPLACE when it supersedes @old).  Qdiscs that dumps would
 * ignore (built-in/invisible) are skipped.
 *
 * Returns the rtnetlink_send() result, or a negative errno when the
 * skb cannot be allocated, filled, or ends up empty.
 */
static int qdisc_notify(struct net *net, struct sk_buff *oskb,
			struct nlmsghdr *n, u32 clid,
			struct Qdisc *old, struct Qdisc *new,
			struct netlink_ext_ack *extack)
{
	struct sk_buff *skb;
	u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;

	skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
	if (!skb)
		return -ENOBUFS;

	if (old && !tc_qdisc_dump_ignore(old, false)) {
		if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
				  0, RTM_DELQDISC, extack) < 0)
			goto err_out;
	}
	if (new && !tc_qdisc_dump_ignore(new, false)) {
		if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
				  old ? NLM_F_REPLACE : 0, RTM_NEWQDISC, extack) < 0)
			goto err_out;
	}

	/* An empty skb (both qdiscs ignored) falls through to the error
	 * path; there is nothing worth broadcasting in that case.
	 */
	if (skb->len)
		return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
				      n->nlmsg_flags & NLM_F_ECHO);

err_out:
	kfree_skb(skb);
	return -EINVAL;
}
1034
1035 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
1036                                struct nlmsghdr *n, u32 clid,
1037                                struct Qdisc *old, struct Qdisc *new,
1038                                struct netlink_ext_ack *extack)
1039 {
1040         if (new || old)
1041                 qdisc_notify(net, skb, n, clid, old, new, extack);
1042
1043         if (old)
1044                 qdisc_put(old);
1045 }
1046
1047 static void qdisc_clear_nolock(struct Qdisc *sch)
1048 {
1049         sch->flags &= ~TCQ_F_NOLOCK;
1050         if (!(sch->flags & TCQ_F_CPUSTATS))
1051                 return;
1052
1053         free_percpu(sch->cpu_bstats);
1054         free_percpu(sch->cpu_qstats);
1055         sch->cpu_bstats = NULL;
1056         sch->cpu_qstats = NULL;
1057         sch->flags &= ~TCQ_F_CPUSTATS;
1058 }
1059
1060 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
1061  * to device "dev".
1062  *
 * When appropriate, send a netlink notification using "skb"
 * and "n".
1065  *
1066  * On success, destroy old qdisc.
1067  */
1068
static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
		       struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
		       struct Qdisc *new, struct Qdisc *old,
		       struct netlink_ext_ack *extack)
{
	struct Qdisc *q = old;
	struct net *net = dev_net(dev);

	if (parent == NULL) {
		/* Grafting at the device root: either the TX root qdisc
		 * (replicated across all TX queues) or the ingress/clsact
		 * qdisc attached to the special ingress queue.
		 */
		unsigned int i, num_q, ingress;
		struct netdev_queue *dev_queue;

		ingress = 0;
		num_q = dev->num_tx_queues;
		if ((q && q->flags & TCQ_F_INGRESS) ||
		    (new && new->flags & TCQ_F_INGRESS)) {
			ingress = 1;
			dev_queue = dev_ingress_queue(dev);
			if (!dev_queue) {
				NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
				return -ENOENT;
			}

			q = rtnl_dereference(dev_queue->qdisc_sleeping);

			/* This is the counterpart of that qdisc_refcount_inc_nz() call in
			 * __tcf_qdisc_find() for filter requests.
			 */
			if (!qdisc_refcount_dec_if_one(q)) {
				NL_SET_ERR_MSG(extack,
					       "Current ingress or clsact Qdisc has ongoing filter requests");
				return -EBUSY;
			}
		}

		/* Quiesce the device so qdiscs can be swapped safely. */
		if (dev->flags & IFF_UP)
			dev_deactivate(dev);

		qdisc_offload_graft_root(dev, new, old, extack);

		/* Qdiscs with an ->attach() callback (e.g. mq) install their
		 * per-queue children themselves once dev->qdisc is updated
		 * below, so skip the per-queue grafting loop.
		 */
		if (new && new->ops->attach && !ingress)
			goto skip;

		if (!ingress) {
			for (i = 0; i < num_q; i++) {
				dev_queue = netdev_get_tx_queue(dev, i);
				old = dev_graft_qdisc(dev_queue, new);

				/* Each TX queue past the first takes its own
				 * reference on the shared @new qdisc.
				 */
				if (new && i > 0)
					qdisc_refcount_inc(new);
				qdisc_put(old);
			}
		} else {
			old = dev_graft_qdisc(dev_queue, NULL);

			/* {ingress,clsact}_destroy() @old before grafting @new to avoid
			 * unprotected concurrent accesses to net_device::miniq_{in,e}gress
			 * pointer(s) in mini_qdisc_pair_swap().
			 */
			qdisc_notify(net, skb, n, classid, old, new, extack);
			qdisc_destroy(old);

			dev_graft_qdisc(dev_queue, new);
		}

skip:
		if (!ingress) {
			old = rtnl_dereference(dev->qdisc);
			/* ->attach() qdiscs manage their own root reference. */
			if (new && !new->ops->attach)
				qdisc_refcount_inc(new);
			rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc);

			notify_and_destroy(net, skb, n, classid, old, new, extack);

			if (new && new->ops->attach)
				new->ops->attach(new);
		}

		if (dev->flags & IFF_UP)
			dev_activate(dev);
	} else {
		/* Grafting under a class of @parent: delegate to the parent's
		 * class ops after validating the request.
		 */
		const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
		unsigned long cl;
		int err;

		/* Only support running class lockless if parent is lockless */
		if (new && (new->flags & TCQ_F_NOLOCK) && !(parent->flags & TCQ_F_NOLOCK))
			qdisc_clear_nolock(new);

		if (!cops || !cops->graft)
			return -EOPNOTSUPP;

		cl = cops->find(parent, classid);
		if (!cl) {
			NL_SET_ERR_MSG(extack, "Specified class not found");
			return -ENOENT;
		}

		if (new && new->ops == &noqueue_qdisc_ops) {
			NL_SET_ERR_MSG(extack, "Cannot assign noqueue to a class");
			return -EINVAL;
		}

		err = cops->graft(parent, cl, new, &old, extack);
		if (err)
			return err;
		notify_and_destroy(net, skb, n, classid, old, new, extack);
	}
	return 0;
}
1179
1180 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1181                                    struct netlink_ext_ack *extack)
1182 {
1183         u32 block_index;
1184
1185         if (tca[TCA_INGRESS_BLOCK]) {
1186                 block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1187
1188                 if (!block_index) {
1189                         NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1190                         return -EINVAL;
1191                 }
1192                 if (!sch->ops->ingress_block_set) {
1193                         NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1194                         return -EOPNOTSUPP;
1195                 }
1196                 sch->ops->ingress_block_set(sch, block_index);
1197         }
1198         if (tca[TCA_EGRESS_BLOCK]) {
1199                 block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1200
1201                 if (!block_index) {
1202                         NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1203                         return -EINVAL;
1204                 }
1205                 if (!sch->ops->egress_block_set) {
1206                         NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1207                         return -EOPNOTSUPP;
1208                 }
1209                 sch->ops->egress_block_set(sch, block_index);
1210         }
1211         return 0;
1212 }
1213
1214 /*
1215    Allocate and initialize new qdisc.
1216
1217    Parameters are passed via opt.
1218  */
1219
/* Allocate and initialize a new qdisc of the kind named in tca[TCA_KIND],
 * attached to @dev_queue with the given @parent and @handle.
 *
 * Returns the new qdisc on success.  On failure returns NULL and stores
 * a negative errno in *errp; -EAGAIN means a module was loaded and the
 * caller must replay the whole request.
 */
static struct Qdisc *qdisc_create(struct net_device *dev,
				  struct netdev_queue *dev_queue,
				  u32 parent, u32 handle,
				  struct nlattr **tca, int *errp,
				  struct netlink_ext_ack *extack)
{
	int err;
	struct nlattr *kind = tca[TCA_KIND];
	struct Qdisc *sch;
	struct Qdisc_ops *ops;
	struct qdisc_size_table *stab;

	ops = qdisc_lookup_ops(kind);
#ifdef CONFIG_MODULES
	if (ops == NULL && kind != NULL) {
		char name[IFNAMSIZ];
		if (nla_strscpy(name, kind, IFNAMSIZ) >= 0) {
			/* We dropped the RTNL semaphore in order to
			 * perform the module load.  So, even if we
			 * succeeded in loading the module we have to
			 * tell the caller to replay the request.  We
			 * indicate this using -EAGAIN.
			 * We replay the request because the device may
			 * go away in the mean time.
			 */
			rtnl_unlock();
			request_module("sch_%s", name);
			rtnl_lock();
			ops = qdisc_lookup_ops(kind);
			if (ops != NULL) {
				/* We will try again qdisc_lookup_ops,
				 * so don't keep a reference.
				 */
				module_put(ops->owner);
				err = -EAGAIN;
				goto err_out;
			}
		}
	}
#endif

	err = -ENOENT;
	if (!ops) {
		NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown");
		goto err_out;
	}

	/* qdisc_alloc() takes a module reference on ops->owner and a
	 * netdev tracker reference; both are released on the error paths.
	 */
	sch = qdisc_alloc(dev_queue, ops, extack);
	if (IS_ERR(sch)) {
		err = PTR_ERR(sch);
		goto err_out2;
	}

	sch->parent = parent;

	if (handle == TC_H_INGRESS) {
		/* The ingress parent ID may only be used by qdiscs that
		 * actually declare themselves ingress (ingress/clsact).
		 */
		if (!(sch->flags & TCQ_F_INGRESS)) {
			NL_SET_ERR_MSG(extack,
				       "Specified parent ID is reserved for ingress and clsact Qdiscs");
			err = -EINVAL;
			goto err_out3;
		}
		handle = TC_H_MAKE(TC_H_INGRESS, 0);
	} else {
		if (handle == 0) {
			/* No handle requested: auto-allocate a free one. */
			handle = qdisc_alloc_handle(dev);
			if (handle == 0) {
				NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
				err = -ENOSPC;
				goto err_out3;
			}
		}
		if (!netif_is_multiqueue(dev))
			sch->flags |= TCQ_F_ONETXQUEUE;
	}

	sch->handle = handle;

	/* This exist to keep backward compatible with a userspace
	 * loophole, what allowed userspace to get IFF_NO_QUEUE
	 * facility on older kernels by setting tx_queue_len=0 (prior
	 * to qdisc init), and then forgot to reinit tx_queue_len
	 * before again attaching a qdisc.
	 */
	if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
		dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
		netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
	}

	err = qdisc_block_indexes_set(sch, tca, extack);
	if (err)
		goto err_out3;

	if (ops->init) {
		err = ops->init(sch, tca[TCA_OPTIONS], extack);
		if (err != 0)
			goto err_out5;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab)) {
			err = PTR_ERR(stab);
			goto err_out4;
		}
		rcu_assign_pointer(sch->stab, stab);
	}
	if (tca[TCA_RATE]) {
		err = -EOPNOTSUPP;
		if (sch->flags & TCQ_F_MQROOT) {
			NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
			goto err_out4;
		}

		err = gen_new_estimator(&sch->bstats,
					sch->cpu_bstats,
					&sch->rate_est,
					NULL,
					true,
					tca[TCA_RATE]);
		if (err) {
			NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
			goto err_out4;
		}
	}

	qdisc_hash_add(sch, false);
	trace_qdisc_create(ops, dev, parent);

	return sch;

err_out5:
	/* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
	if (ops->destroy)
		ops->destroy(sch);
err_out3:
	netdev_put(dev, &sch->dev_tracker);
	qdisc_free(sch);
err_out2:
	module_put(ops->owner);
err_out:
	*errp = err;
	return NULL;

err_out4:
	/*
	 * Any broken qdiscs that would require a ops->reset() here?
	 * The qdisc was never in action so it shouldn't be necessary.
	 */
	qdisc_put_stab(rtnl_dereference(sch->stab));
	if (ops->destroy)
		ops->destroy(sch);
	goto err_out3;
}
1374
/* Change parameters of an existing qdisc @sch: options (via ->change()),
 * size table, and rate estimator.  Shared block indexes cannot be
 * changed after creation.  Returns 0 or a negative errno.
 */
static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
			struct netlink_ext_ack *extack)
{
	struct qdisc_size_table *ostab, *stab = NULL;
	int err = 0;

	if (tca[TCA_OPTIONS]) {
		if (!sch->ops->change) {
			NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
			return -EINVAL;
		}
		if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
			NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
			return -EOPNOTSUPP;
		}
		err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
		if (err)
			return err;
	}

	if (tca[TCA_STAB]) {
		stab = qdisc_get_stab(tca[TCA_STAB], extack);
		if (IS_ERR(stab))
			return PTR_ERR(stab);
	}

	/* Publish the new size table (NULL if none was requested) under RCU
	 * and drop the reference on the previous one.
	 */
	ostab = rtnl_dereference(sch->stab);
	rcu_assign_pointer(sch->stab, stab);
	qdisc_put_stab(ostab);

	if (tca[TCA_RATE]) {
		/* NB: ignores errors from replace_estimator
		   because change can't be undone. */
		if (sch->flags & TCQ_F_MQROOT)
			goto out;
		gen_replace_estimator(&sch->bstats,
				      sch->cpu_bstats,
				      &sch->rate_est,
				      NULL,
				      true,
				      tca[TCA_RATE]);
	}
out:
	return 0;
}
1420
/* Walker state for check_loop()/check_loop_fn(). */
struct check_loop_arg {
	struct qdisc_walker	w;	/* must be first: callback casts back to this */
	struct Qdisc		*p;	/* qdisc we intend to graft; must not be reachable */
	int			depth;	/* current recursion depth, bounded in check_loop_fn() */
};
1426
1427 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1428                          struct qdisc_walker *w);
1429
1430 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1431 {
1432         struct check_loop_arg   arg;
1433
1434         if (q->ops->cl_ops == NULL)
1435                 return 0;
1436
1437         arg.w.stop = arg.w.skip = arg.w.count = 0;
1438         arg.w.fn = check_loop_fn;
1439         arg.depth = depth;
1440         arg.p = p;
1441         q->ops->cl_ops->walk(q, &arg.w);
1442         return arg.w.stop ? -ELOOP : 0;
1443 }
1444
1445 static int
1446 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1447 {
1448         struct Qdisc *leaf;
1449         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1450         struct check_loop_arg *arg = (struct check_loop_arg *)w;
1451
1452         leaf = cops->leaf(q, cl);
1453         if (leaf) {
1454                 if (leaf == arg->p || arg->depth > 7)
1455                         return -ELOOP;
1456                 return check_loop(leaf, arg->p, arg->depth + 1);
1457         }
1458         return 0;
1459 }
1460
/* Netlink attribute policy shared by the RTM_{NEW,DEL,GET}QDISC and
 * traffic-class request handlers in this file.
 */
const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
	[TCA_KIND]		= { .type = NLA_STRING },
	[TCA_RATE]		= { .type = NLA_BINARY,
				    .len = sizeof(struct tc_estimator) },
	[TCA_STAB]		= { .type = NLA_NESTED },
	[TCA_DUMP_INVISIBLE]	= { .type = NLA_FLAG },
	[TCA_CHAIN]		= { .type = NLA_U32 },
	[TCA_INGRESS_BLOCK]	= { .type = NLA_U32 },
	[TCA_EGRESS_BLOCK]	= { .type = NLA_U32 },
};
1471
1472 /*
1473  * Delete/get qdisc.
1474  */
1475
/* RTM_DELQDISC / RTM_GETQDISC handler: locate the qdisc addressed by
 * the tcmsg (by parent classid, or directly by handle) and either
 * delete it (by grafting NULL in its place) or report it back.
 */
static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
			struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	u32 clid;
	struct Qdisc *q = NULL;
	struct Qdisc *p = NULL;
	int err;

	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	clid = tcm->tcm_parent;
	if (clid) {
		/* Addressed via parent: resolve the root qdisc, the ingress
		 * qdisc, or the leaf of the given class.
		 */
		if (clid != TC_H_ROOT) {
			if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
				p = qdisc_lookup(dev, TC_H_MAJ(clid));
				if (!p) {
					NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
					return -ENOENT;
				}
				q = qdisc_leaf(p, clid);
			} else if (dev_ingress_queue(dev)) {
				q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping);
			}
		} else {
			q = rtnl_dereference(dev->qdisc);
		}
		if (!q) {
			NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
			return -ENOENT;
		}

		/* If a handle was also supplied, it must agree with the qdisc
		 * found via the parent.
		 */
		if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
			NL_SET_ERR_MSG(extack, "Invalid handle");
			return -EINVAL;
		}
	} else {
		q = qdisc_lookup(dev, tcm->tcm_handle);
		if (!q) {
			NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
			return -ENOENT;
		}
	}

	if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
		NL_SET_ERR_MSG(extack, "Invalid qdisc name");
		return -EINVAL;
	}

	if (n->nlmsg_type == RTM_DELQDISC) {
		if (!clid) {
			NL_SET_ERR_MSG(extack, "Classid cannot be zero");
			return -EINVAL;
		}
		if (q->handle == 0) {
			NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
			return -ENOENT;
		}
		/* Grafting NULL in place of @q removes it from the tree. */
		err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
		if (err != 0)
			return err;
	} else {
		qdisc_notify(net, skb, n, clid, NULL, q, NULL);
	}
	return 0;
}
1552
/* True if the request asks for both NLM_F_CREATE and NLM_F_REPLACE. */
static bool req_create_or_replace(struct nlmsghdr *n)
{
	const unsigned int mask = NLM_F_CREATE | NLM_F_REPLACE;

	return (n->nlmsg_flags & mask) == mask;
}
1558
/* True if the request asks for both NLM_F_CREATE and NLM_F_EXCL. */
static bool req_create_exclusive(struct nlmsghdr *n)
{
	const unsigned int mask = NLM_F_CREATE | NLM_F_EXCL;

	return (n->nlmsg_flags & mask) == mask;
}
1564
/* True if none of NLM_F_CREATE, NLM_F_REPLACE, or NLM_F_EXCL is set,
 * i.e. the request is a plain "change" of an existing qdisc.
 */
static bool req_change(struct nlmsghdr *n)
{
	return !(n->nlmsg_flags & (NLM_F_CREATE | NLM_F_REPLACE | NLM_F_EXCL));
}
1571
1572 /*
1573  * Create/change qdisc.
1574  */
1575 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1576                            struct netlink_ext_ack *extack)
1577 {
1578         struct net *net = sock_net(skb->sk);
1579         struct tcmsg *tcm;
1580         struct nlattr *tca[TCA_MAX + 1];
1581         struct net_device *dev;
1582         u32 clid;
1583         struct Qdisc *q, *p;
1584         int err;
1585
1586 replay:
1587         /* Reinit, just in case something touches this. */
1588         err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1589                                      rtm_tca_policy, extack);
1590         if (err < 0)
1591                 return err;
1592
1593         tcm = nlmsg_data(n);
1594         clid = tcm->tcm_parent;
1595         q = p = NULL;
1596
1597         dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1598         if (!dev)
1599                 return -ENODEV;
1600
1601
1602         if (clid) {
1603                 if (clid != TC_H_ROOT) {
1604                         if (clid != TC_H_INGRESS) {
1605                                 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1606                                 if (!p) {
1607                                         NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1608                                         return -ENOENT;
1609                                 }
1610                                 q = qdisc_leaf(p, clid);
1611                         } else if (dev_ingress_queue_create(dev)) {
1612                                 q = rtnl_dereference(dev_ingress_queue(dev)->qdisc_sleeping);
1613                         }
1614                 } else {
1615                         q = rtnl_dereference(dev->qdisc);
1616                 }
1617
1618                 /* It may be default qdisc, ignore it */
1619                 if (q && q->handle == 0)
1620                         q = NULL;
1621
1622                 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1623                         if (tcm->tcm_handle) {
1624                                 if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1625                                         NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1626                                         return -EEXIST;
1627                                 }
1628                                 if (TC_H_MIN(tcm->tcm_handle)) {
1629                                         NL_SET_ERR_MSG(extack, "Invalid minor handle");
1630                                         return -EINVAL;
1631                                 }
1632                                 q = qdisc_lookup(dev, tcm->tcm_handle);
1633                                 if (!q)
1634                                         goto create_n_graft;
1635                                 if (n->nlmsg_flags & NLM_F_EXCL) {
1636                                         NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1637                                         return -EEXIST;
1638                                 }
1639                                 if (tca[TCA_KIND] &&
1640                                     nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1641                                         NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1642                                         return -EINVAL;
1643                                 }
1644                                 if (q->flags & TCQ_F_INGRESS) {
1645                                         NL_SET_ERR_MSG(extack,
1646                                                        "Cannot regraft ingress or clsact Qdiscs");
1647                                         return -EINVAL;
1648                                 }
1649                                 if (q == p ||
1650                                     (p && check_loop(q, p, 0))) {
1651                                         NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1652                                         return -ELOOP;
1653                                 }
1654                                 if (clid == TC_H_INGRESS) {
1655                                         NL_SET_ERR_MSG(extack, "Ingress cannot graft directly");
1656                                         return -EINVAL;
1657                                 }
1658                                 qdisc_refcount_inc(q);
1659                                 goto graft;
1660                         } else {
1661                                 if (!q)
1662                                         goto create_n_graft;
1663
1664                                 /* This magic test requires explanation.
1665                                  *
1666                                  *   We know, that some child q is already
1667                                  *   attached to this parent and have choice:
1668                                  *   1) change it or 2) create/graft new one.
1669                                  *   If the requested qdisc kind is different
1670                                  *   than the existing one, then we choose graft.
1671                                  *   If they are the same then this is "change"
1672                                  *   operation - just let it fallthrough..
1673                                  *
1674                                  *   1. We are allowed to create/graft only
1675                                  *   if the request is explicitly stating
1676                                  *   "please create if it doesn't exist".
1677                                  *
1678                                  *   2. If the request is to exclusive create
1679                                  *   then the qdisc tcm_handle is not expected
1680                                  *   to exist, so that we choose create/graft too.
1681                                  *
1682                                  *   3. The last case is when no flags are set.
1683                                  *   This will happen when for example tc
1684                                  *   utility issues a "change" command.
1685                                  *   Alas, it is sort of hole in API, we
1686                                  *   cannot decide what to do unambiguously.
1687                                  *   For now we select create/graft.
1688                                  */
1689                                 if (tca[TCA_KIND] &&
1690                                     nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1691                                         if (req_create_or_replace(n) ||
1692                                             req_create_exclusive(n))
1693                                                 goto create_n_graft;
1694                                         else if (req_change(n))
1695                                                 goto create_n_graft2;
1696                                 }
1697                         }
1698                 }
1699         } else {
1700                 if (!tcm->tcm_handle) {
1701                         NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1702                         return -EINVAL;
1703                 }
1704                 q = qdisc_lookup(dev, tcm->tcm_handle);
1705         }
1706
1707         /* Change qdisc parameters */
1708         if (!q) {
1709                 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1710                 return -ENOENT;
1711         }
1712         if (n->nlmsg_flags & NLM_F_EXCL) {
1713                 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1714                 return -EEXIST;
1715         }
1716         if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1717                 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1718                 return -EINVAL;
1719         }
1720         err = qdisc_change(q, tca, extack);
1721         if (err == 0)
1722                 qdisc_notify(net, skb, n, clid, NULL, q, extack);
1723         return err;
1724
1725 create_n_graft:
1726         if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1727                 NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1728                 return -ENOENT;
1729         }
1730 create_n_graft2:
1731         if (clid == TC_H_INGRESS) {
1732                 if (dev_ingress_queue(dev)) {
1733                         q = qdisc_create(dev, dev_ingress_queue(dev),
1734                                          tcm->tcm_parent, tcm->tcm_parent,
1735                                          tca, &err, extack);
1736                 } else {
1737                         NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1738                         err = -ENOENT;
1739                 }
1740         } else {
1741                 struct netdev_queue *dev_queue;
1742
1743                 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1744                         dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1745                 else if (p)
1746                         dev_queue = p->dev_queue;
1747                 else
1748                         dev_queue = netdev_get_tx_queue(dev, 0);
1749
1750                 q = qdisc_create(dev, dev_queue,
1751                                  tcm->tcm_parent, tcm->tcm_handle,
1752                                  tca, &err, extack);
1753         }
1754         if (q == NULL) {
1755                 if (err == -EAGAIN)
1756                         goto replay;
1757                 return err;
1758         }
1759
1760 graft:
1761         err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
1762         if (err) {
1763                 if (q)
1764                         qdisc_put(q);
1765                 return err;
1766         }
1767
1768         return 0;
1769 }
1770
/* Dump @root and (optionally) every qdisc hashed on its device into @skb.
 *
 * @q_idx_p:        in/out cursor: number of qdiscs emitted so far
 * @s_q_idx:        qdiscs already dumped by earlier callbacks (skip them)
 * @recur:          also walk the device-wide qdisc hash table
 * @dump_invisible: include qdiscs normally hidden from dumps
 *
 * Returns 0 on success, -1 when @skb ran out of room; the cursor is
 * written back either way so the dump can resume at the next callback.
 */
static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
			      struct netlink_callback *cb,
			      int *q_idx_p, int s_q_idx, bool recur,
			      bool dump_invisible)
{
	int ret = 0, q_idx = *q_idx_p;
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	/* The root itself first. */
	q = root;
	if (q_idx < s_q_idx) {
		q_idx++;
	} else {
		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC, NULL) <= 0)
			goto done;
		q_idx++;
	}

	/* If dumping singletons, there is no qdisc_dev(root) and the singleton
	 * itself has already been dumped.
	 *
	 * If we've already dumped the top-level (ingress) qdisc above and the
	 * global qdisc hashtable, we don't want to hit it again.
	 */
	if (!qdisc_dev(root) || !recur)
		goto out;

	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (q_idx < s_q_idx) {
			q_idx++;
			continue;
		}
		if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
		    tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
				  cb->nlh->nlmsg_seq, NLM_F_MULTI,
				  RTM_NEWQDISC, NULL) <= 0)
			goto done;
		q_idx++;
	}

out:
	*q_idx_p = q_idx;
	return ret;
done:
	ret = -1;
	goto out;
}
1824
/* Netlink dump handler for RTM_GETQDISC: walk every net_device in the
 * namespace and emit its root qdisc hierarchy plus its ingress qdisc.
 *
 * cb->args[0] is the device-index cursor and cb->args[1] the per-device
 * qdisc cursor, preserved between callbacks so the dump can resume.
 */
static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct net *net = sock_net(skb->sk);
	int idx, q_idx;
	int s_idx, s_q_idx;
	struct net_device *dev;
	const struct nlmsghdr *nlh = cb->nlh;
	struct nlattr *tca[TCA_MAX + 1];
	int err;

	s_idx = cb->args[0];
	s_q_idx = q_idx = cb->args[1];

	idx = 0;
	ASSERT_RTNL();

	/* Parse request attributes (e.g. TCA_DUMP_INVISIBLE). */
	err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
				     rtm_tca_policy, cb->extack);
	if (err < 0)
		return err;

	for_each_netdev(net, dev) {
		struct netdev_queue *dev_queue;

		if (idx < s_idx)
			goto cont;
		if (idx > s_idx)
			s_q_idx = 0;	/* new device: restart the qdisc cursor */
		q_idx = 0;

		/* Root hierarchy and the device-wide qdisc hash. */
		if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc),
				       skb, cb, &q_idx, s_q_idx,
				       true, tca[TCA_DUMP_INVISIBLE]) < 0)
			goto done;

		/* Ingress is a singleton; recur=false avoids re-walking
		 * the hash table.
		 */
		dev_queue = dev_ingress_queue(dev);
		if (dev_queue &&
		    tc_dump_qdisc_root(rtnl_dereference(dev_queue->qdisc_sleeping),
				       skb, cb, &q_idx, s_q_idx, false,
				       tca[TCA_DUMP_INVISIBLE]) < 0)
			goto done;

cont:
		idx++;
	}

done:
	cb->args[0] = idx;
	cb->args[1] = q_idx;

	return skb->len;
}
1877
1878
1879
1880 /************************************************
1881  *      Traffic classes manipulation.           *
1882  ************************************************/
1883
/* Fill one traffic-class message (tcmsg header, TCA_* attributes and
 * statistics) for class @cl of qdisc @q into @skb.
 *
 * @cl:    class token as used by the qdisc's class ops
 * @event: message type, e.g. RTM_NEWTCLASS or RTM_DELTCLASS
 *
 * Returns skb->len on success; -1 on failure, with @skb trimmed back to
 * its state on entry.
 */
static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
			  unsigned long cl, u32 portid, u32 seq, u16 flags,
			  int event, struct netlink_ext_ack *extack)
{
	struct tcmsg *tcm;
	struct nlmsghdr  *nlh;
	unsigned char *b = skb_tail_pointer(skb);	/* rollback point */
	struct gnet_dump d;
	const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;

	cond_resched();
	nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
	if (!nlh)
		goto out_nlmsg_trim;
	tcm = nlmsg_data(nlh);
	tcm->tcm_family = AF_UNSPEC;
	tcm->tcm__pad1 = 0;
	tcm->tcm__pad2 = 0;
	tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
	/* Pre-seed parent/handle with the qdisc handle; the class
	 * ->dump() callback below may overwrite them with class ids.
	 */
	tcm->tcm_parent = q->handle;
	tcm->tcm_handle = q->handle;
	tcm->tcm_info = 0;
	if (nla_put_string(skb, TCA_KIND, q->ops->id))
		goto nla_put_failure;
	if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
		goto nla_put_failure;

	if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
					 NULL, &d, TCA_PAD) < 0)
		goto nla_put_failure;

	if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
		goto nla_put_failure;

	if (gnet_stats_finish_copy(&d) < 0)
		goto nla_put_failure;

	/* Forward any extack warning text to userspace. */
	if (extack && extack->_msg &&
	    nla_put_string(skb, TCA_EXT_WARN_MSG, extack->_msg))
		goto out_nlmsg_trim;

	nlh->nlmsg_len = skb_tail_pointer(skb) - b;

	return skb->len;

out_nlmsg_trim:
nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
1934
1935 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1936                          struct nlmsghdr *n, struct Qdisc *q,
1937                          unsigned long cl, int event, struct netlink_ext_ack *extack)
1938 {
1939         struct sk_buff *skb;
1940         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1941
1942         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1943         if (!skb)
1944                 return -ENOBUFS;
1945
1946         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event, extack) < 0) {
1947                 kfree_skb(skb);
1948                 return -EINVAL;
1949         }
1950
1951         return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1952                               n->nlmsg_flags & NLM_F_ECHO);
1953 }
1954
1955 static int tclass_del_notify(struct net *net,
1956                              const struct Qdisc_class_ops *cops,
1957                              struct sk_buff *oskb, struct nlmsghdr *n,
1958                              struct Qdisc *q, unsigned long cl,
1959                              struct netlink_ext_ack *extack)
1960 {
1961         u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1962         struct sk_buff *skb;
1963         int err = 0;
1964
1965         if (!cops->delete)
1966                 return -EOPNOTSUPP;
1967
1968         skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1969         if (!skb)
1970                 return -ENOBUFS;
1971
1972         if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1973                            RTM_DELTCLASS, extack) < 0) {
1974                 kfree_skb(skb);
1975                 return -EINVAL;
1976         }
1977
1978         err = cops->delete(q, cl, extack);
1979         if (err) {
1980                 kfree_skb(skb);
1981                 return err;
1982         }
1983
1984         err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1985                              n->nlmsg_flags & NLM_F_ECHO);
1986         return err;
1987 }
1988
1989 #ifdef CONFIG_NET_CLS
1990
/* Arguments threaded through tcf_node_bind() via its tcf_walker. */
struct tcf_bind_args {
	struct tcf_walker w;	/* must be first: cast back in tcf_node_bind() */
	unsigned long base;	/* class base handed to ->bind_class() */
	unsigned long cl;	/* internal class handle to bind to */
	u32 classid;		/* user-visible class id being (re)bound */
};
1997
1998 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1999 {
2000         struct tcf_bind_args *a = (void *)arg;
2001
2002         if (n && tp->ops->bind_class) {
2003                 struct Qdisc *q = tcf_block_q(tp->chain->block);
2004
2005                 sch_tree_lock(q);
2006                 tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
2007                 sch_tree_unlock(q);
2008         }
2009         return 0;
2010 }
2011
/* Arguments for tc_bind_class_walker(), embedded in a qdisc_walker. */
struct tc_bind_class_args {
	struct qdisc_walker w;	/* must be first: cast back in the walker */
	unsigned long new_cl;	/* class handle filters get rebound to */
	u32 portid;		/* requester's netlink portid */
	u32 clid;		/* classid whose filters are being rebound */
};
2018
2019 static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
2020                                 struct qdisc_walker *w)
2021 {
2022         struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
2023         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
2024         struct tcf_block *block;
2025         struct tcf_chain *chain;
2026
2027         block = cops->tcf_block(q, cl, NULL);
2028         if (!block)
2029                 return 0;
2030         for (chain = tcf_get_next_chain(block, NULL);
2031              chain;
2032              chain = tcf_get_next_chain(block, chain)) {
2033                 struct tcf_proto *tp;
2034
2035                 for (tp = tcf_get_next_proto(chain, NULL);
2036                      tp; tp = tcf_get_next_proto(chain, tp)) {
2037                         struct tcf_bind_args arg = {};
2038
2039                         arg.w.fn = tcf_node_bind;
2040                         arg.classid = a->clid;
2041                         arg.base = cl;
2042                         arg.cl = a->new_cl;
2043                         tp->ops->walk(tp, &arg.w, true);
2044                 }
2045         }
2046
2047         return 0;
2048 }
2049
2050 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
2051                            unsigned long new_cl)
2052 {
2053         const struct Qdisc_class_ops *cops = q->ops->cl_ops;
2054         struct tc_bind_class_args args = {};
2055
2056         if (!cops->tcf_block)
2057                 return;
2058         args.portid = portid;
2059         args.clid = clid;
2060         args.new_cl = new_cl;
2061         args.w.fn = tc_bind_class_walker;
2062         q->ops->cl_ops->walk(q, &args.w);
2063 }
2064
2065 #else
2066
/* Without CONFIG_NET_CLS there are no filters to rebind: no-op stub. */
static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
			   unsigned long new_cl)
{
}
2071
2072 #endif
2073
/* RTM_{NEW,DEL,GET}TCLASS handler: resolve the owning qdisc from the
 * tcmsg parent/handle fields, then create, change, delete or report the
 * class through the qdisc's class ops.
 */
static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
			 struct netlink_ext_ack *extack)
{
	struct net *net = sock_net(skb->sk);
	struct tcmsg *tcm = nlmsg_data(n);
	struct nlattr *tca[TCA_MAX + 1];
	struct net_device *dev;
	struct Qdisc *q = NULL;
	const struct Qdisc_class_ops *cops;
	unsigned long cl = 0;
	unsigned long new_cl;
	u32 portid;	/* NB: despite the name, holds tcm_parent below */
	u32 clid;
	u32 qid;
	int err;

	err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
				     rtm_tca_policy, extack);
	if (err < 0)
		return err;

	dev = __dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return -ENODEV;

	/*
	   parent == TC_H_UNSPEC - unspecified parent.
	   parent == TC_H_ROOT   - class is root, which has no parent.
	   parent == X:0         - parent is root class.
	   parent == X:Y         - parent is a node in hierarchy.
	   parent == 0:Y         - parent is X:Y, where X:0 is qdisc.

	   handle == 0:0         - generate handle from kernel pool.
	   handle == 0:Y         - class is X:Y, where X:0 is qdisc.
	   handle == X:Y         - clear.
	   handle == X:0         - root class.
	 */

	/* Step 1. Determine qdisc handle X:0 */

	portid = tcm->tcm_parent;
	clid = tcm->tcm_handle;
	qid = TC_H_MAJ(clid);

	if (portid != TC_H_ROOT) {
		u32 qid1 = TC_H_MAJ(portid);

		if (qid && qid1) {
			/* If both majors are known, they must be identical. */
			if (qid != qid1)
				return -EINVAL;
		} else if (qid1) {
			qid = qid1;
		} else if (qid == 0)
			qid = rtnl_dereference(dev->qdisc)->handle;

		/* Now qid is genuine qdisc handle consistent
		 * both with parent and child.
		 *
		 * TC_H_MAJ(portid) still may be unspecified, complete it now.
		 */
		if (portid)
			portid = TC_H_MAKE(qid, portid);
	} else {
		if (qid == 0)
			qid = rtnl_dereference(dev->qdisc)->handle;
	}

	/* OK. Locate qdisc */
	q = qdisc_lookup(dev, qid);
	if (!q)
		return -ENOENT;

	/* And check that it supports classes */
	cops = q->ops->cl_ops;
	if (cops == NULL)
		return -EINVAL;

	/* Now try to get class */
	if (clid == 0) {
		if (portid == TC_H_ROOT)
			clid = qid;
	} else
		clid = TC_H_MAKE(qid, clid);

	if (clid)
		cl = cops->find(q, clid);

	if (cl == 0) {
		/* Unknown class: only RTM_NEWTCLASS + NLM_F_CREATE may
		 * proceed (to create it below).
		 */
		err = -ENOENT;
		if (n->nlmsg_type != RTM_NEWTCLASS ||
		    !(n->nlmsg_flags & NLM_F_CREATE))
			goto out;
	} else {
		switch (n->nlmsg_type) {
		case RTM_NEWTCLASS:
			err = -EEXIST;
			if (n->nlmsg_flags & NLM_F_EXCL)
				goto out;
			break;
		case RTM_DELTCLASS:
			err = tclass_del_notify(net, cops, skb, n, q, cl, extack);
			/* Unbind the class's filters by rebinding to 0 */
			tc_bind_tclass(q, portid, clid, 0);
			goto out;
		case RTM_GETTCLASS:
			err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS, extack);
			goto out;
		default:
			err = -EINVAL;
			goto out;
		}
	}

	if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
		NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
		return -EOPNOTSUPP;
	}

	/* Create or change the class via the qdisc's ->change(). */
	new_cl = cl;
	err = -EOPNOTSUPP;
	if (cops->change)
		err = cops->change(q, clid, portid, tca, &new_cl, extack);
	if (err == 0) {
		tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS, extack);
		/* We just create a new class, need to do reverse binding. */
		if (cl != new_cl)
			tc_bind_tclass(q, portid, clid, new_cl);
	}
out:
	return err;
}
2206
/* Arguments for qdisc_class_dump(), embedded in a qdisc_walker. */
struct qdisc_dump_args {
	struct qdisc_walker	w;	/* must be first: cast back in the walker */
	struct sk_buff		*skb;	/* dump skb being filled */
	struct netlink_callback	*cb;	/* dump context (portid, seq) */
};
2212
2213 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2214                             struct qdisc_walker *arg)
2215 {
2216         struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2217
2218         return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2219                               a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
2220                               RTM_NEWTCLASS, NULL);
2221 }
2222
/* Dump all classes of one qdisc @q, honouring the resume cursors.
 *
 * @t_p: in/out count of qdiscs visited so far; @s_t: qdiscs already
 * fully dumped in earlier callbacks.  cb->args[1] carries the per-qdisc
 * class cursor.  Returns 0 to continue, -1 when @skb is full.
 */
static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
				struct tcmsg *tcm, struct netlink_callback *cb,
				int *t_p, int s_t)
{
	struct qdisc_dump_args arg;

	/* Skip qdiscs that are invisible, already dumped, classless, or
	 * excluded by an explicit tcm_parent major.
	 */
	if (tc_qdisc_dump_ignore(q, false) ||
	    *t_p < s_t || !q->ops->cl_ops ||
	    (tcm->tcm_parent &&
	     TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
		(*t_p)++;
		return 0;
	}
	/* First qdisc beyond the resume point: clear stale sub-cursors
	 * (everything after cb->args[0]).
	 */
	if (*t_p > s_t)
		memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
	arg.w.fn = qdisc_class_dump;
	arg.skb = skb;
	arg.cb = cb;
	arg.w.stop  = 0;
	arg.w.skip = cb->args[1];	/* classes already dumped */
	arg.w.count = 0;
	q->ops->cl_ops->walk(q, &arg.w);
	cb->args[1] = arg.w.count;	/* resume point for next callback */
	if (arg.w.stop)
		return -1;
	(*t_p)++;
	return 0;
}
2251
/* Dump classes of @root and, when no specific parent was requested, of
 * every qdisc hashed on the device.  @recur is false for singleton
 * (ingress) roots.  Returns 0 on success, -1 when @skb filled up.
 */
static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
			       struct tcmsg *tcm, struct netlink_callback *cb,
			       int *t_p, int s_t, bool recur)
{
	struct Qdisc *q;
	int b;

	if (!root)
		return 0;

	if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
		return -1;

	/* Singleton root (no device) or explicit non-recursive dump. */
	if (!qdisc_dev(root) || !recur)
		return 0;

	if (tcm->tcm_parent) {
		/* Classes of a single parent qdisc were requested. */
		q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
		if (q && q != root &&
		    tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
		return 0;
	}
	hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
		if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
			return -1;
	}

	return 0;
}
2282
/* Netlink dump handler for RTM_GETTCLASS on one device.  cb->args[0]
 * stores the qdisc cursor between callbacks.
 */
static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
{
	struct tcmsg *tcm = nlmsg_data(cb->nlh);
	struct net *net = sock_net(skb->sk);
	struct netdev_queue *dev_queue;
	struct net_device *dev;
	int t, s_t;

	/* A header too short to be a tcmsg yields an empty dump. */
	if (nlmsg_len(cb->nlh) < sizeof(*tcm))
		return 0;
	dev = dev_get_by_index(net, tcm->tcm_ifindex);
	if (!dev)
		return 0;

	s_t = cb->args[0];
	t = 0;

	/* Root hierarchy first, then the singleton ingress qdisc. */
	if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc),
				skb, tcm, cb, &t, s_t, true) < 0)
		goto done;

	dev_queue = dev_ingress_queue(dev);
	if (dev_queue &&
	    tc_dump_tclass_root(rtnl_dereference(dev_queue->qdisc_sleeping),
				skb, tcm, cb, &t, s_t, false) < 0)
		goto done;

done:
	cb->args[0] = t;

	dev_put(dev);	/* pairs with dev_get_by_index() above */
	return skb->len;
}
2316
2317 #ifdef CONFIG_PROC_FS
2318 static int psched_show(struct seq_file *seq, void *v)
2319 {
2320         seq_printf(seq, "%08x %08x %08x %08x\n",
2321                    (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2322                    1000000,
2323                    (u32)NSEC_PER_SEC / hrtimer_resolution);
2324
2325         return 0;
2326 }
2327
2328 static int __net_init psched_net_init(struct net *net)
2329 {
2330         struct proc_dir_entry *e;
2331
2332         e = proc_create_single("psched", 0, net->proc_net, psched_show);
2333         if (e == NULL)
2334                 return -ENOMEM;
2335
2336         return 0;
2337 }
2338
/* Remove the per-namespace /proc/net/psched entry. */
static void __net_exit psched_net_exit(struct net *net)
{
	remove_proc_entry("psched", net->proc_net);
}
2343 #else
/* No procfs: nothing to create per namespace. */
static int __net_init psched_net_init(struct net *net)
{
	return 0;
}
2348
/* No procfs: nothing to tear down per namespace. */
static void __net_exit psched_net_exit(struct net *net)
{
}
2352 #endif
2353
/* Per-network-namespace hooks for the /proc/net/psched entry. */
static struct pernet_operations psched_net_ops = {
	.init = psched_net_init,
	.exit = psched_net_exit,
};
2358
/* Boot-time initialisation of the packet scheduler: per-netns proc
 * entry, built-in qdiscs, and the rtnetlink message handlers.
 */
static int __init pktsched_init(void)
{
	int err;

	err = register_pernet_subsys(&psched_net_ops);
	if (err) {
		pr_err("pktsched_init: "
		       "cannot initialize per netns operations\n");
		return err;
	}

	/* NOTE(review): the register_qdisc()/rtnl_register() return values
	 * are ignored here — presumably these cannot fail this early in
	 * boot; confirm before relying on that.
	 */
	register_qdisc(&pfifo_fast_ops);
	register_qdisc(&pfifo_qdisc_ops);
	register_qdisc(&bfifo_qdisc_ops);
	register_qdisc(&pfifo_head_drop_qdisc_ops);
	register_qdisc(&mq_qdisc_ops);
	register_qdisc(&noqueue_qdisc_ops);

	rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
		      0);
	rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
	rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
		      0);

	return 0;
}

subsys_initcall(pktsched_init);