1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * net/sched/sch_api.c Packet scheduler API.
5 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
9 * Rani Assaf <rani@magic.metawire.com> :980802: JIFFIES and CPU clock sources are repaired.
10 * Eduardo J. Blanco <ejbs@netlabs.com.uy> :990222: kmod support
11 * Jamal Hadi Salim <hadi@nortelnetworks.com>: 990601: ingress support
14 #include <linux/module.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <linux/init.h>
21 #include <linux/proc_fs.h>
22 #include <linux/seq_file.h>
23 #include <linux/kmod.h>
24 #include <linux/list.h>
25 #include <linux/hrtimer.h>
26 #include <linux/slab.h>
27 #include <linux/hashtable.h>
29 #include <net/net_namespace.h>
31 #include <net/netlink.h>
32 #include <net/pkt_sched.h>
33 #include <net/pkt_cls.h>
40 This file consists of two interrelated parts:
42 1. queueing disciplines manager frontend.
43 2. traffic classes manager frontend.
45 Generally, queueing discipline ("qdisc") is a black box,
46 which is able to enqueue packets and to dequeue them (when
47 device is ready to send something) in order and at times
48 determined by algorithm hidden in it.
50 qdisc's are divided to two categories:
51 - "queues", which have no internal structure visible from outside.
52 - "schedulers", which split all the packets to "traffic classes",
53 using "packet classifiers" (look at cls_api.c)
55 In turn, classes may have child qdiscs (as a rule, queues)
56 attached to them etc. etc. etc.
58 The goal of the routines in this file is to translate
59 information supplied by user in the form of handles
60 to more intelligible for kernel form, to make some sanity
61 checks and part of work, which is common to all qdiscs
62 and to provide rtnetlink notifications.
64 All real intelligent work is done inside qdisc modules.
68 Every discipline has two major routines: enqueue and dequeue.
72 dequeue usually returns a skb to send. It is allowed to return NULL,
73 but it does not mean that queue is empty, it just means that
74 discipline does not want to send anything this time.
75 Queue is really empty if q->q.qlen == 0.
76 For complicated disciplines with multiple queues q->q is not
77 real packet queue, but however q->q.qlen must be valid.
81 enqueue returns 0, if packet was enqueued successfully.
82 If packet (this one or another one) was dropped, it returns
84 NET_XMIT_DROP - this packet dropped
85 Expected action: do not backoff, but wait until queue will clear.
86 NET_XMIT_CN - probably this packet enqueued, but another one dropped.
87 Expected action: backoff or ignore
93 like dequeue but without removing a packet from the queue
97 returns qdisc to initial state: purge all buffers, clear all
98 timers, counters (except for statistics) etc.
102 initializes newly created qdisc.
106 destroys resources allocated by init and during lifetime of qdisc.
110 changes qdisc parameters.
113 /* Protects list of registered TC modules. It is pure SMP lock. */
114 static DEFINE_RWLOCK(qdisc_mod_lock);
117 /************************************************
118 * Queueing disciplines manipulation. *
119 ************************************************/
122 /* The list of all installed queueing disciplines. */
124 static struct Qdisc_ops *qdisc_base;
126 /* Register/unregister queueing discipline */
128 int register_qdisc(struct Qdisc_ops *qops)
130 struct Qdisc_ops *q, **qp;
133 write_lock(&qdisc_mod_lock);
134 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
135 if (!strcmp(qops->id, q->id))
138 if (qops->enqueue == NULL)
139 qops->enqueue = noop_qdisc_ops.enqueue;
140 if (qops->peek == NULL) {
141 if (qops->dequeue == NULL)
142 qops->peek = noop_qdisc_ops.peek;
146 if (qops->dequeue == NULL)
147 qops->dequeue = noop_qdisc_ops.dequeue;
150 const struct Qdisc_class_ops *cops = qops->cl_ops;
152 if (!(cops->find && cops->walk && cops->leaf))
155 if (cops->tcf_block && !(cops->bind_tcf && cops->unbind_tcf))
163 write_unlock(&qdisc_mod_lock);
170 EXPORT_SYMBOL(register_qdisc);
172 int unregister_qdisc(struct Qdisc_ops *qops)
174 struct Qdisc_ops *q, **qp;
177 write_lock(&qdisc_mod_lock);
178 for (qp = &qdisc_base; (q = *qp) != NULL; qp = &q->next)
186 write_unlock(&qdisc_mod_lock);
189 EXPORT_SYMBOL(unregister_qdisc);
191 /* Get default qdisc if not otherwise specified */
192 void qdisc_get_default(char *name, size_t len)
194 read_lock(&qdisc_mod_lock);
195 strlcpy(name, default_qdisc_ops->id, len);
196 read_unlock(&qdisc_mod_lock);
199 static struct Qdisc_ops *qdisc_lookup_default(const char *name)
201 struct Qdisc_ops *q = NULL;
203 for (q = qdisc_base; q; q = q->next) {
204 if (!strcmp(name, q->id)) {
205 if (!try_module_get(q->owner))
214 /* Set new default qdisc to use */
215 int qdisc_set_default(const char *name)
217 const struct Qdisc_ops *ops;
219 if (!capable(CAP_NET_ADMIN))
222 write_lock(&qdisc_mod_lock);
223 ops = qdisc_lookup_default(name);
225 /* Not found, drop lock and try to load module */
226 write_unlock(&qdisc_mod_lock);
227 request_module("sch_%s", name);
228 write_lock(&qdisc_mod_lock);
230 ops = qdisc_lookup_default(name);
234 /* Set new default */
235 module_put(default_qdisc_ops->owner);
236 default_qdisc_ops = ops;
238 write_unlock(&qdisc_mod_lock);
240 return ops ? 0 : -ENOENT;
#ifdef CONFIG_NET_SCH_DEFAULT
/* Seed the default qdisc from the kernel configuration at late init. */
static int __init sch_default_qdisc(void)
{
	return qdisc_set_default(CONFIG_DEFAULT_NET_SCH);
}
late_initcall(sch_default_qdisc);
#endif
252 /* We know handle. Find qdisc among all qdisc's attached to device
253 * (root qdisc, all its children, children of children etc.)
254 * Note: caller either uses rtnl or rcu_read_lock()
257 static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
261 if (!qdisc_dev(root))
262 return (root->handle == handle ? root : NULL);
264 if (!(root->flags & TCQ_F_BUILTIN) &&
265 root->handle == handle)
268 hash_for_each_possible_rcu(qdisc_dev(root)->qdisc_hash, q, hash, handle) {
269 if (q->handle == handle)
275 void qdisc_hash_add(struct Qdisc *q, bool invisible)
277 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
279 hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
281 q->flags |= TCQ_F_INVISIBLE;
284 EXPORT_SYMBOL(qdisc_hash_add);
286 void qdisc_hash_del(struct Qdisc *q)
288 if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
290 hash_del_rcu(&q->hash);
293 EXPORT_SYMBOL(qdisc_hash_del);
295 struct Qdisc *qdisc_lookup(struct net_device *dev, u32 handle)
301 q = qdisc_match_from_root(rtnl_dereference(dev->qdisc), handle);
305 if (dev_ingress_queue(dev))
306 q = qdisc_match_from_root(
307 dev_ingress_queue(dev)->qdisc_sleeping,
313 struct Qdisc *qdisc_lookup_rcu(struct net_device *dev, u32 handle)
315 struct netdev_queue *nq;
320 q = qdisc_match_from_root(rcu_dereference(dev->qdisc), handle);
324 nq = dev_ingress_queue_rcu(dev);
326 q = qdisc_match_from_root(nq->qdisc_sleeping, handle);
331 static struct Qdisc *qdisc_leaf(struct Qdisc *p, u32 classid)
334 const struct Qdisc_class_ops *cops = p->ops->cl_ops;
338 cl = cops->find(p, classid);
342 return cops->leaf(p, cl);
345 /* Find queueing discipline by name */
347 static struct Qdisc_ops *qdisc_lookup_ops(struct nlattr *kind)
349 struct Qdisc_ops *q = NULL;
352 read_lock(&qdisc_mod_lock);
353 for (q = qdisc_base; q; q = q->next) {
354 if (nla_strcmp(kind, q->id) == 0) {
355 if (!try_module_get(q->owner))
360 read_unlock(&qdisc_mod_lock);
365 /* The linklayer setting were not transferred from iproute2, in older
366 * versions, and the rate tables lookup systems have been dropped in
367 * the kernel. To keep backward compatible with older iproute2 tc
368 * utils, we detect the linklayer setting by detecting if the rate
369 * table were modified.
371 * For linklayer ATM table entries, the rate table will be aligned to
372 * 48 bytes, thus some table entries will contain the same value. The
373 * mpu (min packet unit) is also encoded into the old rate table, thus
374 * starting from the mpu, we find low and high table entries for
375 mapping this cell. If these entries contain the same value, then
376 the rate table has been modified for linklayer ATM.
378 * This is done by rounding mpu to the nearest 48 bytes cell/entry,
379 * and then roundup to the next cell, calc the table entry one below,
382 static __u8 __detect_linklayer(struct tc_ratespec *r, __u32 *rtab)
384 int low = roundup(r->mpu, 48);
385 int high = roundup(low+1, 48);
386 int cell_low = low >> r->cell_log;
387 int cell_high = (high >> r->cell_log) - 1;
389 /* rtab is too inaccurate at rates > 100Mbit/s */
390 if ((r->rate > (100000000/8)) || (rtab[0] == 0)) {
391 pr_debug("TC linklayer: Giving up ATM detection\n");
392 return TC_LINKLAYER_ETHERNET;
395 if ((cell_high > cell_low) && (cell_high < 256)
396 && (rtab[cell_low] == rtab[cell_high])) {
397 pr_debug("TC linklayer: Detected ATM, low(%d)=high(%d)=%u\n",
398 cell_low, cell_high, rtab[cell_high]);
399 return TC_LINKLAYER_ATM;
401 return TC_LINKLAYER_ETHERNET;
404 static struct qdisc_rate_table *qdisc_rtab_list;
406 struct qdisc_rate_table *qdisc_get_rtab(struct tc_ratespec *r,
408 struct netlink_ext_ack *extack)
410 struct qdisc_rate_table *rtab;
412 if (tab == NULL || r->rate == 0 ||
413 r->cell_log == 0 || r->cell_log >= 32 ||
414 nla_len(tab) != TC_RTAB_SIZE) {
415 NL_SET_ERR_MSG(extack, "Invalid rate table parameters for searching");
419 for (rtab = qdisc_rtab_list; rtab; rtab = rtab->next) {
420 if (!memcmp(&rtab->rate, r, sizeof(struct tc_ratespec)) &&
421 !memcmp(&rtab->data, nla_data(tab), 1024)) {
427 rtab = kmalloc(sizeof(*rtab), GFP_KERNEL);
431 memcpy(rtab->data, nla_data(tab), 1024);
432 if (r->linklayer == TC_LINKLAYER_UNAWARE)
433 r->linklayer = __detect_linklayer(r, rtab->data);
434 rtab->next = qdisc_rtab_list;
435 qdisc_rtab_list = rtab;
437 NL_SET_ERR_MSG(extack, "Failed to allocate new qdisc rate table");
441 EXPORT_SYMBOL(qdisc_get_rtab);
443 void qdisc_put_rtab(struct qdisc_rate_table *tab)
445 struct qdisc_rate_table *rtab, **rtabp;
447 if (!tab || --tab->refcnt)
450 for (rtabp = &qdisc_rtab_list;
451 (rtab = *rtabp) != NULL;
452 rtabp = &rtab->next) {
460 EXPORT_SYMBOL(qdisc_put_rtab);
462 static LIST_HEAD(qdisc_stab_list);
464 static const struct nla_policy stab_policy[TCA_STAB_MAX + 1] = {
465 [TCA_STAB_BASE] = { .len = sizeof(struct tc_sizespec) },
466 [TCA_STAB_DATA] = { .type = NLA_BINARY },
/* qdisc_get_stab - parse a TCA_STAB attribute into a shared, refcounted
 * size table, reusing an existing entry on qdisc_stab_list when the
 * sizespec and data match.  Returns ERR_PTR() on failure.
 * NOTE(review): this is a sampled view of the original file (embedded
 * line numbers, some lines missing); code text left untouched.
 */
469 static struct qdisc_size_table *qdisc_get_stab(struct nlattr *opt,
470 struct netlink_ext_ack *extack)
472 struct nlattr *tb[TCA_STAB_MAX + 1];
473 struct qdisc_size_table *stab;
474 struct tc_sizespec *s;
475 unsigned int tsize = 0;
479 err = nla_parse_nested_deprecated(tb, TCA_STAB_MAX, opt, stab_policy,
/* Both the base sizespec and (when tsize > 0) the data are mandatory. */
483 if (!tb[TCA_STAB_BASE]) {
484 NL_SET_ERR_MSG(extack, "Size table base attribute is missing");
485 return ERR_PTR(-EINVAL);
488 s = nla_data(tb[TCA_STAB_BASE]);
491 if (!tb[TCA_STAB_DATA]) {
492 NL_SET_ERR_MSG(extack, "Size table data attribute is missing");
493 return ERR_PTR(-EINVAL);
495 tab = nla_data(tb[TCA_STAB_DATA]);
496 tsize = nla_len(tb[TCA_STAB_DATA]) / sizeof(u16);
/* Declared tsize must agree with the actual attribute payload. */
499 if (tsize != s->tsize || (!tab && tsize > 0)) {
500 NL_SET_ERR_MSG(extack, "Invalid size of size table");
501 return ERR_PTR(-EINVAL);
/* Share an identical table if one already exists. */
504 list_for_each_entry(stab, &qdisc_stab_list, list) {
505 if (memcmp(&stab->szopts, s, sizeof(*s)))
507 if (tsize > 0 && memcmp(stab->data, tab, tsize * sizeof(u16)))
/* Bound the logarithms so later shifts cannot overflow. */
513 if (s->size_log > STAB_SIZE_LOG_MAX ||
514 s->cell_log > STAB_SIZE_LOG_MAX) {
515 NL_SET_ERR_MSG(extack, "Invalid logarithmic size of size table");
516 return ERR_PTR(-EINVAL);
519 stab = kmalloc(sizeof(*stab) + tsize * sizeof(u16), GFP_KERNEL);
521 return ERR_PTR(-ENOMEM);
526 memcpy(stab->data, tab, tsize * sizeof(u16));
528 list_add_tail(&stab->list, &qdisc_stab_list);
533 void qdisc_put_stab(struct qdisc_size_table *tab)
538 if (--tab->refcnt == 0) {
539 list_del(&tab->list);
543 EXPORT_SYMBOL(qdisc_put_stab);
545 static int qdisc_dump_stab(struct sk_buff *skb, struct qdisc_size_table *stab)
549 nest = nla_nest_start_noflag(skb, TCA_STAB);
551 goto nla_put_failure;
552 if (nla_put(skb, TCA_STAB_BASE, sizeof(stab->szopts), &stab->szopts))
553 goto nla_put_failure;
554 nla_nest_end(skb, nest);
562 void __qdisc_calculate_pkt_len(struct sk_buff *skb,
563 const struct qdisc_size_table *stab)
567 pkt_len = skb->len + stab->szopts.overhead;
568 if (unlikely(!stab->szopts.tsize))
571 slot = pkt_len + stab->szopts.cell_align;
572 if (unlikely(slot < 0))
575 slot >>= stab->szopts.cell_log;
576 if (likely(slot < stab->szopts.tsize))
577 pkt_len = stab->data[slot];
579 pkt_len = stab->data[stab->szopts.tsize - 1] *
580 (slot / stab->szopts.tsize) +
581 stab->data[slot % stab->szopts.tsize];
583 pkt_len <<= stab->szopts.size_log;
585 if (unlikely(pkt_len < 1))
587 qdisc_skb_cb(skb)->pkt_len = pkt_len;
589 EXPORT_SYMBOL(__qdisc_calculate_pkt_len);
591 void qdisc_warn_nonwc(const char *txt, struct Qdisc *qdisc)
593 if (!(qdisc->flags & TCQ_F_WARN_NONWC)) {
594 pr_warn("%s: %s qdisc %X: is non-work-conserving?\n",
595 txt, qdisc->ops->id, qdisc->handle >> 16);
596 qdisc->flags |= TCQ_F_WARN_NONWC;
599 EXPORT_SYMBOL(qdisc_warn_nonwc);
601 static enum hrtimer_restart qdisc_watchdog(struct hrtimer *timer)
603 struct qdisc_watchdog *wd = container_of(timer, struct qdisc_watchdog,
607 __netif_schedule(qdisc_root(wd->qdisc));
610 return HRTIMER_NORESTART;
613 void qdisc_watchdog_init_clockid(struct qdisc_watchdog *wd, struct Qdisc *qdisc,
616 hrtimer_init(&wd->timer, clockid, HRTIMER_MODE_ABS_PINNED);
617 wd->timer.function = qdisc_watchdog;
620 EXPORT_SYMBOL(qdisc_watchdog_init_clockid);
622 void qdisc_watchdog_init(struct qdisc_watchdog *wd, struct Qdisc *qdisc)
624 qdisc_watchdog_init_clockid(wd, qdisc, CLOCK_MONOTONIC);
626 EXPORT_SYMBOL(qdisc_watchdog_init);
628 void qdisc_watchdog_schedule_ns(struct qdisc_watchdog *wd, u64 expires)
630 if (test_bit(__QDISC_STATE_DEACTIVATED,
631 &qdisc_root_sleeping(wd->qdisc)->state))
634 if (wd->last_expires == expires)
637 wd->last_expires = expires;
638 hrtimer_start(&wd->timer,
639 ns_to_ktime(expires),
640 HRTIMER_MODE_ABS_PINNED);
642 EXPORT_SYMBOL(qdisc_watchdog_schedule_ns);
644 void qdisc_watchdog_cancel(struct qdisc_watchdog *wd)
646 hrtimer_cancel(&wd->timer);
648 EXPORT_SYMBOL(qdisc_watchdog_cancel);
650 static struct hlist_head *qdisc_class_hash_alloc(unsigned int n)
652 struct hlist_head *h;
655 h = kvmalloc_array(n, sizeof(struct hlist_head), GFP_KERNEL);
658 for (i = 0; i < n; i++)
659 INIT_HLIST_HEAD(&h[i]);
664 void qdisc_class_hash_grow(struct Qdisc *sch, struct Qdisc_class_hash *clhash)
666 struct Qdisc_class_common *cl;
667 struct hlist_node *next;
668 struct hlist_head *nhash, *ohash;
669 unsigned int nsize, nmask, osize;
672 /* Rehash when load factor exceeds 0.75 */
673 if (clhash->hashelems * 4 <= clhash->hashsize * 3)
675 nsize = clhash->hashsize * 2;
677 nhash = qdisc_class_hash_alloc(nsize);
681 ohash = clhash->hash;
682 osize = clhash->hashsize;
685 for (i = 0; i < osize; i++) {
686 hlist_for_each_entry_safe(cl, next, &ohash[i], hnode) {
687 h = qdisc_class_hash(cl->classid, nmask);
688 hlist_add_head(&cl->hnode, &nhash[h]);
691 clhash->hash = nhash;
692 clhash->hashsize = nsize;
693 clhash->hashmask = nmask;
694 sch_tree_unlock(sch);
698 EXPORT_SYMBOL(qdisc_class_hash_grow);
700 int qdisc_class_hash_init(struct Qdisc_class_hash *clhash)
702 unsigned int size = 4;
704 clhash->hash = qdisc_class_hash_alloc(size);
707 clhash->hashsize = size;
708 clhash->hashmask = size - 1;
709 clhash->hashelems = 0;
712 EXPORT_SYMBOL(qdisc_class_hash_init);
714 void qdisc_class_hash_destroy(struct Qdisc_class_hash *clhash)
716 kvfree(clhash->hash);
718 EXPORT_SYMBOL(qdisc_class_hash_destroy);
720 void qdisc_class_hash_insert(struct Qdisc_class_hash *clhash,
721 struct Qdisc_class_common *cl)
725 INIT_HLIST_NODE(&cl->hnode);
726 h = qdisc_class_hash(cl->classid, clhash->hashmask);
727 hlist_add_head(&cl->hnode, &clhash->hash[h]);
730 EXPORT_SYMBOL(qdisc_class_hash_insert);
732 void qdisc_class_hash_remove(struct Qdisc_class_hash *clhash,
733 struct Qdisc_class_common *cl)
735 hlist_del(&cl->hnode);
738 EXPORT_SYMBOL(qdisc_class_hash_remove);
740 /* Allocate an unique handle from space managed by kernel
741 * Possible range is [8000-FFFF]:0000 (0x8000 values)
743 static u32 qdisc_alloc_handle(struct net_device *dev)
746 static u32 autohandle = TC_H_MAKE(0x80000000U, 0);
749 autohandle += TC_H_MAKE(0x10000U, 0);
750 if (autohandle == TC_H_MAKE(TC_H_ROOT, 0))
751 autohandle = TC_H_MAKE(0x80000000U, 0);
752 if (!qdisc_lookup(dev, autohandle))
760 void qdisc_tree_reduce_backlog(struct Qdisc *sch, int n, int len)
762 bool qdisc_is_offloaded = sch->flags & TCQ_F_OFFLOADED;
763 const struct Qdisc_class_ops *cops;
769 if (n == 0 && len == 0)
771 drops = max_t(int, n, 0);
773 while ((parentid = sch->parent)) {
774 if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
777 if (sch->flags & TCQ_F_NOPARENT)
779 /* Notify parent qdisc only if child qdisc becomes empty.
781 * If child was empty even before update then backlog
782 * counter is screwed and we skip notification because
783 * parent class is already passive.
785 * If the original child was offloaded then it is allowed
786 * to be seem as empty, so the parent is notified anyway.
788 notify = !sch->q.qlen && !WARN_ON_ONCE(!n &&
789 !qdisc_is_offloaded);
790 /* TODO: perform the search on a per txq basis */
791 sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
793 WARN_ON_ONCE(parentid != TC_H_ROOT);
796 cops = sch->ops->cl_ops;
797 if (notify && cops->qlen_notify) {
798 cl = cops->find(sch, parentid);
799 cops->qlen_notify(sch, cl);
802 sch->qstats.backlog -= len;
803 __qdisc_qstats_drop(sch, drops);
807 EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
809 int qdisc_offload_dump_helper(struct Qdisc *sch, enum tc_setup_type type,
812 struct net_device *dev = qdisc_dev(sch);
815 sch->flags &= ~TCQ_F_OFFLOADED;
816 if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
819 err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
820 if (err == -EOPNOTSUPP)
824 sch->flags |= TCQ_F_OFFLOADED;
828 EXPORT_SYMBOL(qdisc_offload_dump_helper);
830 void qdisc_offload_graft_helper(struct net_device *dev, struct Qdisc *sch,
831 struct Qdisc *new, struct Qdisc *old,
832 enum tc_setup_type type, void *type_data,
833 struct netlink_ext_ack *extack)
835 bool any_qdisc_is_offloaded;
838 if (!tc_can_offload(dev) || !dev->netdev_ops->ndo_setup_tc)
841 err = dev->netdev_ops->ndo_setup_tc(dev, type, type_data);
843 /* Don't report error if the graft is part of destroy operation. */
844 if (!err || !new || new == &noop_qdisc)
847 /* Don't report error if the parent, the old child and the new
848 * one are not offloaded.
850 any_qdisc_is_offloaded = new->flags & TCQ_F_OFFLOADED;
851 any_qdisc_is_offloaded |= sch && sch->flags & TCQ_F_OFFLOADED;
852 any_qdisc_is_offloaded |= old && old->flags & TCQ_F_OFFLOADED;
854 if (any_qdisc_is_offloaded)
855 NL_SET_ERR_MSG(extack, "Offloading graft operation failed.");
857 EXPORT_SYMBOL(qdisc_offload_graft_helper);
859 static void qdisc_offload_graft_root(struct net_device *dev,
860 struct Qdisc *new, struct Qdisc *old,
861 struct netlink_ext_ack *extack)
863 struct tc_root_qopt_offload graft_offload = {
864 .command = TC_ROOT_GRAFT,
865 .handle = new ? new->handle : 0,
866 .ingress = (new && new->flags & TCQ_F_INGRESS) ||
867 (old && old->flags & TCQ_F_INGRESS),
870 qdisc_offload_graft_helper(dev, NULL, new, old,
871 TC_SETUP_ROOT_QDISC, &graft_offload, extack);
/* tc_fill_qdisc - serialize one qdisc into an RTM_*QDISC netlink message:
 * tcmsg header, kind, block indexes, qdisc options, offload flag, size
 * table and statistics.  Returns <0 when the skb runs out of room.
 * NOTE(review): sampled view of the original file (embedded line numbers,
 * some lines missing); code text left untouched.
 */
874 static int tc_fill_qdisc(struct sk_buff *skb, struct Qdisc *q, u32 clid,
875 u32 portid, u32 seq, u16 flags, int event)
877 struct gnet_stats_basic_cpu __percpu *cpu_bstats = NULL;
878 struct gnet_stats_queue __percpu *cpu_qstats = NULL;
880 struct nlmsghdr *nlh;
881 unsigned char *b = skb_tail_pointer(skb);
883 struct qdisc_size_table *stab;
/* Fixed-size tcmsg header first. */
888 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
891 tcm = nlmsg_data(nlh);
892 tcm->tcm_family = AF_UNSPEC;
895 tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
896 tcm->tcm_parent = clid;
897 tcm->tcm_handle = q->handle;
/* tcm_info carries the current refcount. */
898 tcm->tcm_info = refcount_read(&q->refcnt);
899 if (nla_put_string(skb, TCA_KIND, q->ops->id))
900 goto nla_put_failure;
/* Optional shared ingress/egress block indexes. */
901 if (q->ops->ingress_block_get) {
902 block_index = q->ops->ingress_block_get(q);
904 nla_put_u32(skb, TCA_INGRESS_BLOCK, block_index))
905 goto nla_put_failure;
907 if (q->ops->egress_block_get) {
908 block_index = q->ops->egress_block_get(q);
910 nla_put_u32(skb, TCA_EGRESS_BLOCK, block_index))
911 goto nla_put_failure;
/* Qdisc-specific options, hardware-offload flag and size table. */
913 if (q->ops->dump && q->ops->dump(q, skb) < 0)
914 goto nla_put_failure;
915 if (nla_put_u8(skb, TCA_HW_OFFLOAD, !!(q->flags & TCQ_F_OFFLOADED)))
916 goto nla_put_failure;
917 qlen = qdisc_qlen_sum(q);
919 stab = rtnl_dereference(q->stab);
920 if (stab && qdisc_dump_stab(skb, stab) < 0)
921 goto nla_put_failure;
/* Statistics block: basic, rate estimator and queue stats (per-CPU aware). */
923 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
924 NULL, &d, TCA_PAD) < 0)
925 goto nla_put_failure;
927 if (q->ops->dump_stats && q->ops->dump_stats(q, &d) < 0)
928 goto nla_put_failure;
930 if (qdisc_is_percpu_stats(q)) {
931 cpu_bstats = q->cpu_bstats;
932 cpu_qstats = q->cpu_qstats;
935 if (gnet_stats_copy_basic(qdisc_root_sleeping_running(q),
936 &d, cpu_bstats, &q->bstats) < 0 ||
937 gnet_stats_copy_rate_est(&d, &q->rate_est) < 0 ||
938 gnet_stats_copy_queue(&d, cpu_qstats, &q->qstats, qlen) < 0)
939 goto nla_put_failure;
941 if (gnet_stats_finish_copy(&d) < 0)
942 goto nla_put_failure;
/* Patch the final message length now that all attributes are in. */
944 nlh->nlmsg_len = skb_tail_pointer(skb) - b;
953 static bool tc_qdisc_dump_ignore(struct Qdisc *q, bool dump_invisible)
955 if (q->flags & TCQ_F_BUILTIN)
957 if ((q->flags & TCQ_F_INVISIBLE) && !dump_invisible)
963 static int qdisc_notify(struct net *net, struct sk_buff *oskb,
964 struct nlmsghdr *n, u32 clid,
965 struct Qdisc *old, struct Qdisc *new)
968 u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
970 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
974 if (old && !tc_qdisc_dump_ignore(old, false)) {
975 if (tc_fill_qdisc(skb, old, clid, portid, n->nlmsg_seq,
976 0, RTM_DELQDISC) < 0)
979 if (new && !tc_qdisc_dump_ignore(new, false)) {
980 if (tc_fill_qdisc(skb, new, clid, portid, n->nlmsg_seq,
981 old ? NLM_F_REPLACE : 0, RTM_NEWQDISC) < 0)
986 return rtnetlink_send(skb, net, portid, RTNLGRP_TC,
987 n->nlmsg_flags & NLM_F_ECHO);
994 static void notify_and_destroy(struct net *net, struct sk_buff *skb,
995 struct nlmsghdr *n, u32 clid,
996 struct Qdisc *old, struct Qdisc *new)
999 qdisc_notify(net, skb, n, clid, old, new);
1005 static void qdisc_clear_nolock(struct Qdisc *sch)
1007 sch->flags &= ~TCQ_F_NOLOCK;
1008 if (!(sch->flags & TCQ_F_CPUSTATS))
1011 free_percpu(sch->cpu_bstats);
1012 free_percpu(sch->cpu_qstats);
1013 sch->cpu_bstats = NULL;
1014 sch->cpu_qstats = NULL;
1015 sch->flags &= ~TCQ_F_CPUSTATS;
1018 /* Graft qdisc "new" to class "classid" of qdisc "parent" or
1021 * When appropriate send a netlink notification using 'skb'
1024 * On success, destroy old qdisc.
/* NOTE(review): sampled view of the original file (embedded line numbers,
 * some lines missing); code text left untouched.  parent == NULL means
 * the device root (or ingress) is being replaced; otherwise the graft is
 * delegated to the parent's class ops.
 */
1027 static int qdisc_graft(struct net_device *dev, struct Qdisc *parent,
1028 struct sk_buff *skb, struct nlmsghdr *n, u32 classid,
1029 struct Qdisc *new, struct Qdisc *old,
1030 struct netlink_ext_ack *extack)
1032 struct Qdisc *q = old;
1033 struct net *net = dev_net(dev);
/* Root/ingress graft path: replace the qdisc on every tx queue. */
1035 if (parent == NULL) {
1036 unsigned int i, num_q, ingress;
1039 num_q = dev->num_tx_queues;
1040 if ((q && q->flags & TCQ_F_INGRESS) ||
1041 (new && new->flags & TCQ_F_INGRESS)) {
1044 if (!dev_ingress_queue(dev)) {
1045 NL_SET_ERR_MSG(extack, "Device does not have an ingress queue");
/* Quiesce the device while swapping its root qdisc. */
1050 if (dev->flags & IFF_UP)
1051 dev_deactivate(dev);
1053 qdisc_offload_graft_root(dev, new, old, extack);
1055 if (new && new->ops->attach)
1058 for (i = 0; i < num_q; i++) {
1059 struct netdev_queue *dev_queue = dev_ingress_queue(dev);
1062 dev_queue = netdev_get_tx_queue(dev, i);
1064 old = dev_graft_qdisc(dev_queue, new);
/* One reference per tx queue that now points at "new". */
1066 qdisc_refcount_inc(new);
1074 old = rtnl_dereference(dev->qdisc);
1075 if (new && !new->ops->attach)
1076 qdisc_refcount_inc(new);
1077 rcu_assign_pointer(dev->qdisc, new ? : &noop_qdisc);
1079 notify_and_destroy(net, skb, n, classid, old, new);
1081 if (new && new->ops->attach)
1082 new->ops->attach(new);
1084 notify_and_destroy(net, skb, n, classid, old, new);
1087 if (dev->flags & IFF_UP)
/* Class graft path: let the parent's class ops do the work. */
1090 const struct Qdisc_class_ops *cops = parent->ops->cl_ops;
1094 /* Only support running class lockless if parent is lockless */
1095 if (new && (new->flags & TCQ_F_NOLOCK) &&
1096 parent && !(parent->flags & TCQ_F_NOLOCK))
1097 qdisc_clear_nolock(new);
1099 if (!cops || !cops->graft)
1102 cl = cops->find(parent, classid);
1104 NL_SET_ERR_MSG(extack, "Specified class not found");
1108 if (new && new->ops == &noqueue_qdisc_ops) {
1109 NL_SET_ERR_MSG(extack, "Cannot assign noqueue to a class");
1113 err = cops->graft(parent, cl, new, &old, extack);
1116 notify_and_destroy(net, skb, n, classid, old, new);
1121 static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca,
1122 struct netlink_ext_ack *extack)
1126 if (tca[TCA_INGRESS_BLOCK]) {
1127 block_index = nla_get_u32(tca[TCA_INGRESS_BLOCK]);
1130 NL_SET_ERR_MSG(extack, "Ingress block index cannot be 0");
1133 if (!sch->ops->ingress_block_set) {
1134 NL_SET_ERR_MSG(extack, "Ingress block sharing is not supported");
1137 sch->ops->ingress_block_set(sch, block_index);
1139 if (tca[TCA_EGRESS_BLOCK]) {
1140 block_index = nla_get_u32(tca[TCA_EGRESS_BLOCK]);
1143 NL_SET_ERR_MSG(extack, "Egress block index cannot be 0");
1146 if (!sch->ops->egress_block_set) {
1147 NL_SET_ERR_MSG(extack, "Egress block sharing is not supported");
1150 sch->ops->egress_block_set(sch, block_index);
1156 Allocate and initialize new qdisc.
1158 Parameters are passed via opt.
/* qdisc_create - allocate, configure and hash a new qdisc of the kind
 * given in tca[TCA_KIND], attached under @parent with @handle.
 * Returns the new qdisc or NULL with *errp set.
 * NOTE(review): sampled view of the original file (embedded line numbers,
 * some lines missing); code text left untouched.
 */
1161 static struct Qdisc *qdisc_create(struct net_device *dev,
1162 struct netdev_queue *dev_queue,
1163 struct Qdisc *p, u32 parent, u32 handle,
1164 struct nlattr **tca, int *errp,
1165 struct netlink_ext_ack *extack)
1168 struct nlattr *kind = tca[TCA_KIND];
1170 struct Qdisc_ops *ops;
1171 struct qdisc_size_table *stab;
1173 ops = qdisc_lookup_ops(kind);
/* Unknown kind: try to auto-load the matching sch_* module. */
1174 #ifdef CONFIG_MODULES
1175 if (ops == NULL && kind != NULL) {
1176 char name[IFNAMSIZ];
1177 if (nla_strlcpy(name, kind, IFNAMSIZ) < IFNAMSIZ) {
1178 /* We dropped the RTNL semaphore in order to
1179 * perform the module load. So, even if we
1180 * succeeded in loading the module we have to
1181 * tell the caller to replay the request. We
1182 * indicate this using -EAGAIN.
1183 * We replay the request because the device may
1184 * go away in the mean time.
1187 request_module("sch_%s", name);
1189 ops = qdisc_lookup_ops(kind);
1191 /* We will try again qdisc_lookup_ops,
1192 * so don't keep a reference.
1194 module_put(ops->owner);
1204 NL_SET_ERR_MSG(extack, "Specified qdisc kind is unknown");
1208 sch = qdisc_alloc(dev_queue, ops, extack);
1214 sch->parent = parent;
/* TC_H_INGRESS is reserved: only ingress/clsact may claim it. */
1216 if (handle == TC_H_INGRESS) {
1217 if (!(sch->flags & TCQ_F_INGRESS)) {
1218 NL_SET_ERR_MSG(extack,
1219 "Specified parent ID is reserved for ingress and clsact Qdiscs");
1223 handle = TC_H_MAKE(TC_H_INGRESS, 0);
1226 handle = qdisc_alloc_handle(dev);
1228 NL_SET_ERR_MSG(extack, "Maximum number of qdisc handles was exceeded");
1233 if (!netif_is_multiqueue(dev))
1234 sch->flags |= TCQ_F_ONETXQUEUE;
1237 sch->handle = handle;
1239 /* This exist to keep backward compatible with a userspace
1240 * loophole, what allowed userspace to get IFF_NO_QUEUE
1241 * facility on older kernels by setting tx_queue_len=0 (prior
1242 * to qdisc init), and then forgot to reinit tx_queue_len
1243 * before again attaching a qdisc.
1245 if ((dev->priv_flags & IFF_NO_QUEUE) && (dev->tx_queue_len == 0)) {
1246 dev->tx_queue_len = DEFAULT_TX_QUEUE_LEN;
1247 netdev_info(dev, "Caught tx_queue_len zero misconfig\n");
1250 err = qdisc_block_indexes_set(sch, tca, extack);
/* Qdisc-specific initialization from TCA_OPTIONS. */
1255 err = ops->init(sch, tca[TCA_OPTIONS], extack);
1260 if (tca[TCA_STAB]) {
1261 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1263 err = PTR_ERR(stab);
1266 rcu_assign_pointer(sch->stab, stab);
/* Optional rate estimator; forbidden on mq-root qdiscs. */
1268 if (tca[TCA_RATE]) {
1269 seqcount_t *running;
1272 if (sch->flags & TCQ_F_MQROOT) {
1273 NL_SET_ERR_MSG(extack, "Cannot attach rate estimator to a multi-queue root qdisc");
1277 if (sch->parent != TC_H_ROOT &&
1278 !(sch->flags & TCQ_F_INGRESS) &&
1279 (!p || !(p->flags & TCQ_F_MQROOT)))
1280 running = qdisc_root_sleeping_running(sch);
1282 running = &sch->running;
1284 err = gen_new_estimator(&sch->bstats,
1291 NL_SET_ERR_MSG(extack, "Failed to generate new estimator");
1296 qdisc_hash_add(sch, false);
/* Error unwind paths below. */
1301 /* ops->init() failed, we call ->destroy() like qdisc_create_dflt() */
1308 module_put(ops->owner);
1315 * Any broken qdiscs that would require a ops->reset() here?
1316 * The qdisc was never in action so it shouldn't be necessary.
1318 qdisc_put_stab(rtnl_dereference(sch->stab));
/* qdisc_change - apply TCA_OPTIONS / TCA_STAB / TCA_RATE updates to an
 * existing qdisc.  NOTE(review): sampled view of the original file
 * (embedded line numbers, some lines missing); code text left untouched.
 */
1324 static int qdisc_change(struct Qdisc *sch, struct nlattr **tca,
1325 struct netlink_ext_ack *extack)
1327 struct qdisc_size_table *ostab, *stab = NULL;
1330 if (tca[TCA_OPTIONS]) {
1331 if (!sch->ops->change) {
1332 NL_SET_ERR_MSG(extack, "Change operation not supported by specified qdisc");
/* Block indexes can only be set at creation time. */
1335 if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
1336 NL_SET_ERR_MSG(extack, "Change of blocks is not supported");
1339 err = sch->ops->change(sch, tca[TCA_OPTIONS], extack);
1344 if (tca[TCA_STAB]) {
1345 stab = qdisc_get_stab(tca[TCA_STAB], extack);
1347 return PTR_ERR(stab);
/* Swap in the new size table and drop the old reference. */
1350 ostab = rtnl_dereference(sch->stab);
1351 rcu_assign_pointer(sch->stab, stab);
1352 qdisc_put_stab(ostab);
1354 if (tca[TCA_RATE]) {
1355 /* NB: ignores errors from replace_estimator
1356 because change can't be undone. */
1357 if (sch->flags & TCQ_F_MQROOT)
1359 gen_replace_estimator(&sch->bstats,
1363 qdisc_root_sleeping_running(sch),
1370 struct check_loop_arg {
1371 struct qdisc_walker w;
1376 static int check_loop_fn(struct Qdisc *q, unsigned long cl,
1377 struct qdisc_walker *w);
1379 static int check_loop(struct Qdisc *q, struct Qdisc *p, int depth)
1381 struct check_loop_arg arg;
1383 if (q->ops->cl_ops == NULL)
1386 arg.w.stop = arg.w.skip = arg.w.count = 0;
1387 arg.w.fn = check_loop_fn;
1390 q->ops->cl_ops->walk(q, &arg.w);
1391 return arg.w.stop ? -ELOOP : 0;
1395 check_loop_fn(struct Qdisc *q, unsigned long cl, struct qdisc_walker *w)
1398 const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1399 struct check_loop_arg *arg = (struct check_loop_arg *)w;
1401 leaf = cops->leaf(q, cl);
1403 if (leaf == arg->p || arg->depth > 7)
1405 return check_loop(leaf, arg->p, arg->depth + 1);
1410 const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = {
1411 [TCA_KIND] = { .type = NLA_STRING },
1412 [TCA_RATE] = { .type = NLA_BINARY,
1413 .len = sizeof(struct tc_estimator) },
1414 [TCA_STAB] = { .type = NLA_NESTED },
1415 [TCA_DUMP_INVISIBLE] = { .type = NLA_FLAG },
1416 [TCA_CHAIN] = { .type = NLA_U32 },
1417 [TCA_INGRESS_BLOCK] = { .type = NLA_U32 },
1418 [TCA_EGRESS_BLOCK] = { .type = NLA_U32 },
/* tc_get_qdisc - handle RTM_GETQDISC / RTM_DELQDISC: resolve the target
 * qdisc from ifindex + parent/handle, then either delete it (via
 * qdisc_graft with new == NULL) or just notify the requester.
 * NOTE(review): sampled view of the original file (embedded line numbers,
 * some lines missing); code text left untouched.
 */
1425 static int tc_get_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1426 struct netlink_ext_ack *extack)
1428 struct net *net = sock_net(skb->sk);
1429 struct tcmsg *tcm = nlmsg_data(n);
1430 struct nlattr *tca[TCA_MAX + 1];
1431 struct net_device *dev;
1433 struct Qdisc *q = NULL;
1434 struct Qdisc *p = NULL;
/* Deleting requires CAP_NET_ADMIN; plain GET does not. */
1437 if ((n->nlmsg_type != RTM_GETQDISC) &&
1438 !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1441 err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1442 rtm_tca_policy, extack);
1446 dev = __dev_get_by_index(net, tcm->tcm_ifindex);
1450 clid = tcm->tcm_parent;
/* Resolve the qdisc from the parent classid (or the root/ingress). */
1452 if (clid != TC_H_ROOT) {
1453 if (TC_H_MAJ(clid) != TC_H_MAJ(TC_H_INGRESS)) {
1454 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1456 NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified classid");
1459 q = qdisc_leaf(p, clid);
1460 } else if (dev_ingress_queue(dev)) {
1461 q = dev_ingress_queue(dev)->qdisc_sleeping;
1464 q = rtnl_dereference(dev->qdisc);
1467 NL_SET_ERR_MSG(extack, "Cannot find specified qdisc on specified device");
1471 if (tcm->tcm_handle && q->handle != tcm->tcm_handle) {
1472 NL_SET_ERR_MSG(extack, "Invalid handle");
1476 q = qdisc_lookup(dev, tcm->tcm_handle);
1478 NL_SET_ERR_MSG(extack, "Failed to find qdisc with specified handle");
1483 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1484 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
/* DELQDISC path: detach and destroy; GET path: just notify. */
1488 if (n->nlmsg_type == RTM_DELQDISC) {
1490 NL_SET_ERR_MSG(extack, "Classid cannot be zero");
1493 if (q->handle == 0) {
1494 NL_SET_ERR_MSG(extack, "Cannot delete qdisc with handle of zero");
1497 err = qdisc_graft(dev, p, skb, n, clid, NULL, q, extack);
1501 qdisc_notify(net, skb, n, clid, NULL, q);
/* True if the request carries both NLM_F_CREATE and NLM_F_REPLACE. */
1506 static bool req_create_or_replace(struct nlmsghdr *n)
1508 return (n->nlmsg_flags & NLM_F_CREATE &&
1509 n->nlmsg_flags & NLM_F_REPLACE);
/* True if the request carries both NLM_F_CREATE and NLM_F_EXCL. */
1512 static bool req_create_exclusive(struct nlmsghdr *n)
1514 return (n->nlmsg_flags & NLM_F_CREATE &&
1515 n->nlmsg_flags & NLM_F_EXCL);
/* True if none of NLM_F_CREATE/NLM_F_REPLACE/NLM_F_EXCL are set,
 * i.e. a plain "change" request (as issued by `tc qdisc change`). */
1518 static bool req_change(struct nlmsghdr *n)
1520 return (!(n->nlmsg_flags & NLM_F_CREATE) &&
1521 !(n->nlmsg_flags & NLM_F_REPLACE) &&
1522 !(n->nlmsg_flags & NLM_F_EXCL));
1526 * Create/change qdisc.
/* Handler for RTM_NEWQDISC: create, replace, or change a qdisc depending
 * on the nlmsg flags and whether a matching qdisc already exists.
 * NOTE(review): original line numbering skips - this listing is elided;
 * several conditions/returns are not visible below. */
1528 static int tc_modify_qdisc(struct sk_buff *skb, struct nlmsghdr *n,
1529 struct netlink_ext_ack *extack)
1531 struct net *net = sock_net(skb->sk);
1533 struct nlattr *tca[TCA_MAX + 1];
1534 struct net_device *dev;
1536 struct Qdisc *q, *p;
1539 if (!netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
1543 /* Reinit, just in case something touches this. */
1544 err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
1545 rtm_tca_policy, extack);
1549 tcm = nlmsg_data(n);
1550 clid = tcm->tcm_parent;
1553 dev = __dev_get_by_index(net, tcm->tcm_ifindex);
/* Resolve the existing child q at the requested attachment point. */
1559 if (clid != TC_H_ROOT) {
1560 if (clid != TC_H_INGRESS) {
1561 p = qdisc_lookup(dev, TC_H_MAJ(clid));
1563 NL_SET_ERR_MSG(extack, "Failed to find specified qdisc");
1566 q = qdisc_leaf(p, clid);
1567 } else if (dev_ingress_queue_create(dev)) {
1568 q = dev_ingress_queue(dev)->qdisc_sleeping;
1571 q = rtnl_dereference(dev->qdisc);
1574 /* It may be default qdisc, ignore it */
1575 if (q && q->handle == 0)
1578 if (!q || !tcm->tcm_handle || q->handle != tcm->tcm_handle) {
1579 if (tcm->tcm_handle) {
/* Overriding an attached qdisc by handle requires NLM_F_REPLACE. */
1580 if (q && !(n->nlmsg_flags & NLM_F_REPLACE)) {
1581 NL_SET_ERR_MSG(extack, "NLM_F_REPLACE needed to override");
1584 if (TC_H_MIN(tcm->tcm_handle)) {
1585 NL_SET_ERR_MSG(extack, "Invalid minor handle");
1588 q = qdisc_lookup(dev, tcm->tcm_handle);
1590 goto create_n_graft;
1591 if (n->nlmsg_flags & NLM_F_EXCL) {
1592 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot override");
1595 if (tca[TCA_KIND] &&
1596 nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1597 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1600 if (q->flags & TCQ_F_INGRESS) {
1601 NL_SET_ERR_MSG(extack,
1602 "Cannot regraft ingress or clsact Qdiscs");
/* check_loop() guards against grafting q somewhere below itself. */
1606 (p && check_loop(q, p, 0))) {
1607 NL_SET_ERR_MSG(extack, "Qdisc parent/child loop detected");
1610 if (clid == TC_H_INGRESS) {
1611 NL_SET_ERR_MSG(extack, "Ingress cannot graft directly");
1614 qdisc_refcount_inc(q);
1618 goto create_n_graft;
1620 /* This magic test requires explanation.
1622 * We know, that some child q is already
1623 * attached to this parent and have choice:
1624 * 1) change it or 2) create/graft new one.
1625 * If the requested qdisc kind is different
1626 * than the existing one, then we choose graft.
1627 * If they are the same then this is "change"
1628 * operation - just let it fallthrough..
1630 * 1. We are allowed to create/graft only
1631 * if the request is explicitly stating
1632 * "please create if it doesn't exist".
1634 * 2. If the request is to exclusive create
1635 * then the qdisc tcm_handle is not expected
1636 * to exist, so that we choose create/graft too.
1638 * 3. The last case is when no flags are set.
1639 * This will happen when for example tc
1640 * utility issues a "change" command.
1641 * Alas, it is sort of hole in API, we
1642 * cannot decide what to do unambiguously.
1643 * For now we select create/graft.
1645 if (tca[TCA_KIND] &&
1646 nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1647 if (req_create_or_replace(n) ||
1648 req_create_exclusive(n))
1649 goto create_n_graft;
1650 else if (req_change(n))
1651 goto create_n_graft2;
1656 if (!tcm->tcm_handle) {
1657 NL_SET_ERR_MSG(extack, "Handle cannot be zero");
1660 q = qdisc_lookup(dev, tcm->tcm_handle);
1663 /* Change qdisc parameters */
1665 NL_SET_ERR_MSG(extack, "Specified qdisc not found");
1668 if (n->nlmsg_flags & NLM_F_EXCL) {
1669 NL_SET_ERR_MSG(extack, "Exclusivity flag on, cannot modify");
1672 if (tca[TCA_KIND] && nla_strcmp(tca[TCA_KIND], q->ops->id)) {
1673 NL_SET_ERR_MSG(extack, "Invalid qdisc name");
1676 err = qdisc_change(q, tca, extack);
1678 qdisc_notify(net, skb, n, clid, NULL, q);
/* create_n_graft: build a brand-new qdisc and attach it. */
1682 if (!(n->nlmsg_flags & NLM_F_CREATE)) {
1683 NL_SET_ERR_MSG(extack, "Qdisc not found. To create specify NLM_F_CREATE flag");
1687 if (clid == TC_H_INGRESS) {
1688 if (dev_ingress_queue(dev)) {
1689 q = qdisc_create(dev, dev_ingress_queue(dev), p,
1690 tcm->tcm_parent, tcm->tcm_parent,
1693 NL_SET_ERR_MSG(extack, "Cannot find ingress queue for specified device");
1697 struct netdev_queue *dev_queue;
/* Let a classful parent pick the tx queue, else use its own/queue 0. */
1699 if (p && p->ops->cl_ops && p->ops->cl_ops->select_queue)
1700 dev_queue = p->ops->cl_ops->select_queue(p, tcm);
1702 dev_queue = p->dev_queue;
1704 dev_queue = netdev_get_tx_queue(dev, 0);
1706 q = qdisc_create(dev, dev_queue, p,
1707 tcm->tcm_parent, tcm->tcm_handle,
1717 err = qdisc_graft(dev, p, skb, n, clid, q, NULL, extack);
/* Dump @root and, when @recur, every hashed qdisc of its device into the
 * netlink dump skb, skipping the first s_q_idx entries (resume support).
 * *q_idx_p is advanced so the caller can save the cursor in cb->args.
 * (Listing is elided; loop bodies are partially missing.) */
1727 static int tc_dump_qdisc_root(struct Qdisc *root, struct sk_buff *skb,
1728 struct netlink_callback *cb,
1729 int *q_idx_p, int s_q_idx, bool recur,
1730 bool dump_invisible)
1732 int ret = 0, q_idx = *q_idx_p;
1740 if (q_idx < s_q_idx) {
1743 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1744 tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1745 cb->nlh->nlmsg_seq, NLM_F_MULTI,
1751 /* If dumping singletons, there is no qdisc_dev(root) and the singleton
1752 * itself has already been dumped.
1754 * If we've already dumped the top-level (ingress) qdisc above and the global
1755 * qdisc hashtable, we don't want to hit it again
1757 if (!qdisc_dev(root) || !recur)
/* Walk the per-device qdisc hashtable for the remaining qdiscs. */
1760 hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
1761 if (q_idx < s_q_idx) {
1765 if (!tc_qdisc_dump_ignore(q, dump_invisible) &&
1766 tc_fill_qdisc(skb, q, q->parent, NETLINK_CB(cb->skb).portid,
1767 cb->nlh->nlmsg_seq, NLM_F_MULTI,
/* RTM_GETQDISC dump handler: iterate all netdevs in the namespace and
 * dump their root and ingress qdisc hierarchies, resuming from the
 * device/qdisc cursor stored in cb->args[0]/cb->args[1]. */
1781 static int tc_dump_qdisc(struct sk_buff *skb, struct netlink_callback *cb)
1783 struct net *net = sock_net(skb->sk);
1786 struct net_device *dev;
1787 const struct nlmsghdr *nlh = cb->nlh;
1788 struct nlattr *tca[TCA_MAX + 1];
1791 s_idx = cb->args[0];
1792 s_q_idx = q_idx = cb->args[1];
/* Parse once per dump call to honor TCA_DUMP_INVISIBLE. */
1797 err = nlmsg_parse_deprecated(nlh, sizeof(struct tcmsg), tca, TCA_MAX,
1798 rtm_tca_policy, cb->extack);
1802 for_each_netdev(net, dev) {
1803 struct netdev_queue *dev_queue;
1811 if (tc_dump_qdisc_root(rtnl_dereference(dev->qdisc),
1812 skb, cb, &q_idx, s_q_idx,
1813 true, tca[TCA_DUMP_INVISIBLE]) < 0)
/* Ingress qdisc is not in the hashtable walk; dump it separately. */
1816 dev_queue = dev_ingress_queue(dev);
1818 tc_dump_qdisc_root(dev_queue->qdisc_sleeping, skb, cb,
1819 &q_idx, s_q_idx, false,
1820 tca[TCA_DUMP_INVISIBLE]) < 0)
/* Save cursor so the next dump call resumes where we stopped. */
1829 cb->args[1] = q_idx;
1836 /************************************************
1837 * Traffic classes manipulation. *
1838 ************************************************/
/* Fill one RTM_*TCLASS netlink message describing class @cl of qdisc @q:
 * tcmsg header, TCA_KIND, class-specific dump, and TCA_STATS2 statistics.
 * Returns < 0 and trims the skb back to @b on failure. */
1840 static int tc_fill_tclass(struct sk_buff *skb, struct Qdisc *q,
1842 u32 portid, u32 seq, u16 flags, int event)
1845 struct nlmsghdr *nlh;
/* Remember the tail so a partial message can be trimmed on error. */
1846 unsigned char *b = skb_tail_pointer(skb);
1848 const struct Qdisc_class_ops *cl_ops = q->ops->cl_ops;
1851 nlh = nlmsg_put(skb, portid, seq, event, sizeof(*tcm), flags);
1853 goto out_nlmsg_trim;
1854 tcm = nlmsg_data(nlh);
1855 tcm->tcm_family = AF_UNSPEC;
1858 tcm->tcm_ifindex = qdisc_dev(q)->ifindex;
1859 tcm->tcm_parent = q->handle;
1860 tcm->tcm_handle = q->handle;
1862 if (nla_put_string(skb, TCA_KIND, q->ops->id))
1863 goto nla_put_failure;
/* Let the qdisc's class ops add class-specific attributes. */
1864 if (cl_ops->dump && cl_ops->dump(q, cl, skb, tcm) < 0)
1865 goto nla_put_failure;
1867 if (gnet_stats_start_copy_compat(skb, TCA_STATS2, TCA_STATS, TCA_XSTATS,
1868 NULL, &d, TCA_PAD) < 0)
1869 goto nla_put_failure;
1871 if (cl_ops->dump_stats && cl_ops->dump_stats(q, cl, &d) < 0)
1872 goto nla_put_failure;
1874 if (gnet_stats_finish_copy(&d) < 0)
1875 goto nla_put_failure;
/* Patch the final message length now that all attributes are in. */
1877 nlh->nlmsg_len = skb_tail_pointer(skb) - b;
/* Build an @event message for class @cl and multicast it to RTNLGRP_TC,
 * echoing to the requester when NLM_F_ECHO is set. */
1886 static int tclass_notify(struct net *net, struct sk_buff *oskb,
1887 struct nlmsghdr *n, struct Qdisc *q,
1888 unsigned long cl, int event)
1890 struct sk_buff *skb;
1891 u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1894 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1898 if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0, event) < 0) {
1903 err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1904 n->nlmsg_flags & NLM_F_ECHO);
/* Delete class @cl and notify listeners. The RTM_DELTCLASS message is
 * filled BEFORE cops->delete() runs, while the class still exists and can
 * be dumped; it is only sent if the deletion succeeds. */
1910 static int tclass_del_notify(struct net *net,
1911 const struct Qdisc_class_ops *cops,
1912 struct sk_buff *oskb, struct nlmsghdr *n,
1913 struct Qdisc *q, unsigned long cl)
1915 u32 portid = oskb ? NETLINK_CB(oskb).portid : 0;
1916 struct sk_buff *skb;
1922 skb = alloc_skb(NLMSG_GOODSIZE, GFP_KERNEL);
1926 if (tc_fill_tclass(skb, q, cl, portid, n->nlmsg_seq, 0,
1927 RTM_DELTCLASS) < 0) {
1932 err = cops->delete(q, cl);
1938 err = rtnetlink_send(skb, net, portid, RTNLGRP_TC,
1939 n->nlmsg_flags & NLM_F_ECHO);
1945 #ifdef CONFIG_NET_CLS
/* Walker argument for re-binding classifier filters to a class. */
1947 struct tcf_bind_args {
1948 struct tcf_walker w;
/* tcf_walker callback: ask each filter that supports bind_class to
 * re-point results matching a->classid at the new class a->cl. */
1954 static int tcf_node_bind(struct tcf_proto *tp, void *n, struct tcf_walker *arg)
1956 struct tcf_bind_args *a = (void *)arg;
1958 if (tp->ops->bind_class) {
1959 struct Qdisc *q = tcf_block_q(tp->chain->block);
1962 tp->ops->bind_class(n, a->classid, a->cl, q, a->base);
/* Walker argument for tc_bind_class_walker(): carries the target class
 * and the new class handle filters should be bound to. */
1968 struct tc_bind_class_args {
1969 struct qdisc_walker w;
1970 unsigned long new_cl;
/* qdisc_walker callback: for class @cl's tcf block, walk every chain and
 * every filter protocol, re-binding filter results via tcf_node_bind.
 * (Listing is elided; some assignments into @arg are missing here.) */
1975 static int tc_bind_class_walker(struct Qdisc *q, unsigned long cl,
1976 struct qdisc_walker *w)
1978 struct tc_bind_class_args *a = (struct tc_bind_class_args *)w;
1979 const struct Qdisc_class_ops *cops = q->ops->cl_ops;
1980 struct tcf_block *block;
1981 struct tcf_chain *chain;
1983 block = cops->tcf_block(q, cl, NULL);
1986 for (chain = tcf_get_next_chain(block, NULL);
1988 chain = tcf_get_next_chain(block, chain)) {
1989 struct tcf_proto *tp;
1991 for (tp = tcf_get_next_proto(chain, NULL, true);
1992 tp; tp = tcf_get_next_proto(chain, tp, true)) {
1993 struct tcf_bind_args arg = {};
1995 arg.w.fn = tcf_node_bind;
1996 arg.classid = a->clid;
1999 tp->ops->walk(tp, &arg.w, true);
/* Re-bind all filters that point at class @clid of @q to @new_cl
 * (new_cl == 0 means unbind). No-op if the qdisc has no tcf_block op. */
2006 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
2007 unsigned long new_cl)
2009 const struct Qdisc_class_ops *cops = q->ops->cl_ops;
2010 struct tc_bind_class_args args = {};
2012 if (!cops->tcf_block)
2014 args.portid = portid;
2016 args.new_cl = new_cl;
2017 args.w.fn = tc_bind_class_walker;
2018 q->ops->cl_ops->walk(q, &args.w);
/* !CONFIG_NET_CLS stub: filter re-binding is a no-op without classifiers. */
2023 static void tc_bind_tclass(struct Qdisc *q, u32 portid, u32 clid,
2024 unsigned long new_cl)
/* Handler for RTM_{NEW,DEL,GET}TCLASS: resolve the owning qdisc from the
 * parent/handle encoding described below, locate the class, then create,
 * change, delete, or report it.
 * NOTE(review): original line numbering skips - this listing is elided. */
2030 static int tc_ctl_tclass(struct sk_buff *skb, struct nlmsghdr *n,
2031 struct netlink_ext_ack *extack)
2033 struct net *net = sock_net(skb->sk);
2034 struct tcmsg *tcm = nlmsg_data(n);
2035 struct nlattr *tca[TCA_MAX + 1];
2036 struct net_device *dev;
2037 struct Qdisc *q = NULL;
2038 const struct Qdisc_class_ops *cops;
2039 unsigned long cl = 0;
2040 unsigned long new_cl;
/* Only RTM_GETTCLASS is allowed without CAP_NET_ADMIN. */
2046 if ((n->nlmsg_type != RTM_GETTCLASS) &&
2047 !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
2050 err = nlmsg_parse_deprecated(n, sizeof(*tcm), tca, TCA_MAX,
2051 rtm_tca_policy, extack);
2055 dev = __dev_get_by_index(net, tcm->tcm_ifindex);
2060 parent == TC_H_UNSPEC - unspecified parent.
2061 parent == TC_H_ROOT - class is root, which has no parent.
2062 parent == X:0 - parent is root class.
2063 parent == X:Y - parent is a node in hierarchy.
2064 parent == 0:Y - parent is X:Y, where X:0 is qdisc.
2066 handle == 0:0 - generate handle from kernel pool.
2067 handle == 0:Y - class is X:Y, where X:0 is qdisc.
2068 handle == X:Y - clear.
2069 handle == X:0 - root class.
2072 /* Step 1. Determine qdisc handle X:0 */
2074 portid = tcm->tcm_parent;
2075 clid = tcm->tcm_handle;
2076 qid = TC_H_MAJ(clid);
2078 if (portid != TC_H_ROOT) {
2079 u32 qid1 = TC_H_MAJ(portid);
2082 /* If both majors are known, they must be identical. */
2087 } else if (qid == 0)
2088 qid = rtnl_dereference(dev->qdisc)->handle;
2090 /* Now qid is genuine qdisc handle consistent
2091 * both with parent and child.
2093 * TC_H_MAJ(portid) still may be unspecified, complete it now.
2096 portid = TC_H_MAKE(qid, portid);
2099 qid = rtnl_dereference(dev->qdisc)->handle;
2102 /* OK. Locate qdisc */
2103 q = qdisc_lookup(dev, qid);
2107 /* And check that it supports classes */
2108 cops = q->ops->cl_ops;
2112 /* Now try to get class */
2114 if (portid == TC_H_ROOT)
2117 clid = TC_H_MAKE(qid, clid);
2120 cl = cops->find(q, clid);
/* Class not found: only NEW with NLM_F_CREATE may proceed (to create). */
2124 if (n->nlmsg_type != RTM_NEWTCLASS ||
2125 !(n->nlmsg_flags & NLM_F_CREATE))
2128 switch (n->nlmsg_type) {
2131 if (n->nlmsg_flags & NLM_F_EXCL)
2135 err = tclass_del_notify(net, cops, skb, n, q, cl);
2136 /* Unbind the class from filters (rebind to classid 0) */
2137 tc_bind_tclass(q, portid, clid, 0);
2140 err = tclass_notify(net, skb, n, q, cl, RTM_NEWTCLASS);
/* Create/change path. */
2148 if (tca[TCA_INGRESS_BLOCK] || tca[TCA_EGRESS_BLOCK]) {
2149 NL_SET_ERR_MSG(extack, "Shared blocks are not supported for classes");
2156 err = cops->change(q, clid, portid, tca, &new_cl, extack);
2158 tclass_notify(net, skb, n, q, new_cl, RTM_NEWTCLASS);
2159 /* We just create a new class, need to do reverse binding. */
2161 tc_bind_tclass(q, portid, clid, new_cl);
/* Walker argument for dumping classes: carries the dump skb and callback
 * so qdisc_class_dump() can fill netlink messages per class. */
2167 struct qdisc_dump_args {
2168 struct qdisc_walker w;
2169 struct sk_buff *skb;
2170 struct netlink_callback *cb;
/* qdisc_walker callback: emit one RTM_NEWTCLASS record for class @cl
 * into the dump skb via tc_fill_tclass(). */
2173 static int qdisc_class_dump(struct Qdisc *q, unsigned long cl,
2174 struct qdisc_walker *arg)
2176 struct qdisc_dump_args *a = (struct qdisc_dump_args *)arg;
2178 return tc_fill_tclass(a->skb, q, cl, NETLINK_CB(a->cb->skb).portid,
2179 a->cb->nlh->nlmsg_seq, NLM_F_MULTI,
/* Dump all classes of one qdisc @q, resuming from the per-qdisc class
 * cursor in cb->args[1]; *t_p counts qdiscs visited across the dump. */
2183 static int tc_dump_tclass_qdisc(struct Qdisc *q, struct sk_buff *skb,
2184 struct tcmsg *tcm, struct netlink_callback *cb,
2187 struct qdisc_dump_args arg;
/* Skip invisible qdiscs, already-dumped ones, classless qdiscs, and
 * qdiscs not matching an explicitly requested parent major. */
2189 if (tc_qdisc_dump_ignore(q, false) ||
2190 *t_p < s_t || !q->ops->cl_ops ||
2192 TC_H_MAJ(tcm->tcm_parent) != q->handle)) {
/* Fresh qdisc: clear the per-class resume state in cb->args[1..]. */
2197 memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0]));
2198 arg.w.fn = qdisc_class_dump;
2202 arg.w.skip = cb->args[1];
2204 q->ops->cl_ops->walk(q, &arg.w);
2205 cb->args[1] = arg.w.count;
/* Dump classes of @root and, when @recur, of every hashed qdisc on its
 * device. If a specific parent was requested, also dump that qdisc. */
2212 static int tc_dump_tclass_root(struct Qdisc *root, struct sk_buff *skb,
2213 struct tcmsg *tcm, struct netlink_callback *cb,
2214 int *t_p, int s_t, bool recur)
2222 if (tc_dump_tclass_qdisc(root, skb, tcm, cb, t_p, s_t) < 0)
2225 if (!qdisc_dev(root) || !recur)
2228 if (tcm->tcm_parent) {
2229 q = qdisc_match_from_root(root, TC_H_MAJ(tcm->tcm_parent));
2230 if (q && q != root &&
2231 tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
2235 hash_for_each(qdisc_dev(root)->qdisc_hash, b, q, hash) {
2236 if (tc_dump_tclass_qdisc(q, skb, tcm, cb, t_p, s_t) < 0)
/* RTM_GETTCLASS dump handler: dump classes of the named device's root
 * and ingress qdisc trees, resuming from the cursor in cb->args. */
2243 static int tc_dump_tclass(struct sk_buff *skb, struct netlink_callback *cb)
2245 struct tcmsg *tcm = nlmsg_data(cb->nlh);
2246 struct net *net = sock_net(skb->sk);
2247 struct netdev_queue *dev_queue;
2248 struct net_device *dev;
2251 if (nlmsg_len(cb->nlh) < sizeof(*tcm))
2253 dev = dev_get_by_index(net, tcm->tcm_ifindex);
2260 if (tc_dump_tclass_root(rtnl_dereference(dev->qdisc),
2261 skb, tcm, cb, &t, s_t, true) < 0)
2264 dev_queue = dev_ingress_queue(dev);
2266 tc_dump_tclass_root(dev_queue->qdisc_sleeping, skb, tcm, cb,
2267 &t, s_t, false) < 0)
2277 #ifdef CONFIG_PROC_FS
/* /proc/net/psched: four hex words describing the packet-scheduler
 * clock (microsecond/tick conversions and hrtimer resolution). */
2278 static int psched_show(struct seq_file *seq, void *v)
2280 seq_printf(seq, "%08x %08x %08x %08x\n",
2281 (u32)NSEC_PER_USEC, (u32)PSCHED_TICKS2NS(1),
2283 (u32)NSEC_PER_SEC / hrtimer_resolution);
/* Per-netns init: create the /proc/net/psched entry. */
2288 static int __net_init psched_net_init(struct net *net)
2290 struct proc_dir_entry *e;
2292 e = proc_create_single("psched", 0, net->proc_net, psched_show);
/* Per-netns teardown: remove the /proc/net/psched entry. */
2299 static void __net_exit psched_net_exit(struct net *net)
2301 remove_proc_entry("psched", net->proc_net);
/* !CONFIG_PROC_FS stubs: nothing to create or remove. */
2304 static int __net_init psched_net_init(struct net *net)
2309 static void __net_exit psched_net_exit(struct net *net)
/* Per-network-namespace hooks for the psched proc entry. */
2314 static struct pernet_operations psched_net_ops = {
2315 .init = psched_net_init,
2316 .exit = psched_net_exit,
/* Subsystem init: register pernet ops, the built-in qdiscs, and the
 * rtnetlink handlers for qdisc and class messages. */
2319 static int __init pktsched_init(void)
2323 err = register_pernet_subsys(&psched_net_ops);
2325 pr_err("pktsched_init: "
2326 "cannot initialize per netns operations\n");
/* Built-in qdiscs that must always be available (no module load). */
2330 register_qdisc(&pfifo_fast_ops);
2331 register_qdisc(&pfifo_qdisc_ops);
2332 register_qdisc(&bfifo_qdisc_ops);
2333 register_qdisc(&pfifo_head_drop_qdisc_ops);
2334 register_qdisc(&mq_qdisc_ops);
2335 register_qdisc(&noqueue_qdisc_ops);
/* doit handlers for qdisc/class control, dumpit for GET dumps. */
2337 rtnl_register(PF_UNSPEC, RTM_NEWQDISC, tc_modify_qdisc, NULL, 0);
2338 rtnl_register(PF_UNSPEC, RTM_DELQDISC, tc_get_qdisc, NULL, 0);
2339 rtnl_register(PF_UNSPEC, RTM_GETQDISC, tc_get_qdisc, tc_dump_qdisc,
2341 rtnl_register(PF_UNSPEC, RTM_NEWTCLASS, tc_ctl_tclass, NULL, 0);
2342 rtnl_register(PF_UNSPEC, RTM_DELTCLASS, tc_ctl_tclass, NULL, 0);
2343 rtnl_register(PF_UNSPEC, RTM_GETTCLASS, tc_ctl_tclass, tc_dump_tclass,
2349 subsys_initcall(pktsched_init);