// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * net/sched/sch_cbq.c	Class-Based Queueing discipline.
 *
 * Authors:	Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/string.h>
#include <linux/errno.h>
#include <linux/skbuff.h>
#include <net/netlink.h>
#include <net/pkt_sched.h>
#include <net/pkt_cls.h>

/*	Class-Based Queueing (CBQ) algorithm.
	=======================================

	Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
		 Management Models for Packet Networks",
		 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995

		 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995

		 [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
		 Parameters", 1996

		 [4] Sally Floyd and Michael Speer, "Experimental Results
		 for Class-Based Queueing", 1998, not published.

	-----------------------------------------------------------------------

	The algorithm skeleton was taken from the NS simulator cbq.cc.
	If someone wants to check this code against the LBL version,
	they should take into account that ONLY the skeleton was borrowed;
	the implementation is different. Particularly:

	--- The WRR algorithm is different. Our version looks more
	reasonable (I hope) and works when quanta are allowed to be
	less than MTU, which is always the case when real time classes
	have small rates. Note that the statement of [3] is
	incomplete: delay may actually be estimated even if class
	per-round allotment is less than MTU. Namely, if per-round
	allotment is W*r_i, and r_1+...+r_k = r < 1

	delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B

	In the worst case we have an IntServ estimate with D = W*r+k*MTU
	and C = MTU*r. The proof (if correct at all) is trivial.
56 --- It seems that cbq-2.0 is not very accurate. At least, I cannot
57 interpret some places, which look like wrong translations
58 from NS. Anyone is advised to find these differences
59 and explain to me, why I am wrong 8).
61 --- Linux has no EOI event, so that we cannot estimate true class
62 idle time. Workaround is to consider the next dequeue event
63 as sign that previous packet is finished. This is wrong because of
64 internal device queueing, but on a permanently loaded link it is true.
65 Moreover, combined with clock integrator, this scheme looks
66 very close to an ideal solution. */

struct cbq_sched_data;


struct cbq_class {
	struct Qdisc_class_common common;
	struct cbq_class	*next_alive;	/* next class with backlog in this priority band */

/* Parameters */
	unsigned char		priority;	/* class priority */
	unsigned char		priority2;	/* priority to be used after overlimit */
	unsigned char		ewma_log;	/* time constant for idle time calculation */

	u32			defmap;

	/* Link-sharing scheduler parameters */
	long			maxidle;	/* Class parameters: see below. */
	long			offtime;
	long			minidle;
	u32			avpkt;
	struct qdisc_rate_table	*R_tab;

	/* General scheduler (WRR) parameters */
	long			allot;
	long			quantum;	/* Allotment per WRR round */
	long			weight;		/* Relative allotment: see below */

	struct Qdisc		*qdisc;		/* Ptr to CBQ discipline */
	struct cbq_class	*split;		/* Ptr to split node */
	struct cbq_class	*share;		/* Ptr to LS parent in the class tree */
	struct cbq_class	*tparent;	/* Ptr to tree parent in the class tree */
	struct cbq_class	*borrow;	/* NULL if class is bandwidth limited;
						   parent otherwise */
	struct cbq_class	*sibling;	/* Sibling chain */
	struct cbq_class	*children;	/* Pointer to children chain */

	struct Qdisc		*q;		/* Elementary queueing discipline */

/* Variables */
	unsigned char		cpriority;	/* Effective priority */
	unsigned char		delayed;
	unsigned char		level;		/* level of the class in hierarchy:
						   0 for leaf classes, and maximal
						   level of children + 1 for nodes.
						 */

	psched_time_t		last;		/* Last end of service */
	psched_time_t		undertime;
	long			avgidle;
	long			deficit;	/* Saved deficit for WRR */
	psched_time_t		penalized;
	struct gnet_stats_basic_packed bstats;
	struct gnet_stats_queue qstats;
	struct net_rate_estimator __rcu *rate_est;
	struct tc_cbq_xstats	xstats;

	struct tcf_proto __rcu	*filter_list;
	struct tcf_block	*block;

	int			filters;

	struct cbq_class	*defaults[TC_PRIO_MAX + 1];
};

struct cbq_sched_data {
	struct Qdisc_class_hash	clhash;			/* Hash table of all classes */
	int			nclasses[TC_CBQ_MAXPRIO + 1];
	unsigned int		quanta[TC_CBQ_MAXPRIO + 1];

	struct cbq_class	link;

	unsigned int		activemask;
	struct cbq_class	*active[TC_CBQ_MAXPRIO + 1];	/* List of all classes
								   with backlog */

#ifdef CONFIG_NET_CLS_ACT
	struct cbq_class	*rx_class;
#endif
	struct cbq_class	*tx_class;
	struct cbq_class	*tx_borrowed;
	int			tx_len;
	psched_time_t		now;			/* Cached timestamp */
	unsigned int		pmask;

	struct hrtimer		delay_timer;
	struct qdisc_watchdog	watchdog;	/* Watchdog timer,
						   started when CBQ has
						   backlog, but cannot
						   transmit just now */
	psched_tdiff_t		wd_expires;
	int			toplevel;
	u32			hgenerator;
};

#define L2T(cl, len)	qdisc_l2t((cl)->R_tab, len)
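
/* For orientation (illustrative numbers, not derived from this file):
 * with a 10 Mbit/sec rate table, L2T(cl, 1500) is roughly the 1.2 msec
 * transmission time of a 1500 byte packet, expressed in psched ticks.
 */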

static inline struct cbq_class *
cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
{
	struct Qdisc_class_common *clc;

	clc = qdisc_class_find(&q->clhash, classid);
	if (clc == NULL)
		return NULL;
	return container_of(clc, struct cbq_class, common);
}

#ifdef CONFIG_NET_CLS_ACT

static struct cbq_class *
cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
{
	struct cbq_class *cl;

	for (cl = this->tparent; cl; cl = cl->tparent) {
		struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];

		if (new != NULL && new != this)
			return new;
	}
	return NULL;
}

#endif

/* Classify packet. The procedure is pretty complicated, but
 * it allows us to combine link sharing and priority scheduling
 * transparently.
 *
 * Namely, you can put link sharing rules (e.g. route based) at the root of CBQ,
 * so that it resolves to split nodes. Then packets are classified
 * by logical priority, or a more specific classifier may be attached
 * to the split node.
 */
static struct cbq_class *
cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *head = &q->link;
	struct cbq_class **defmap;
	struct cbq_class *cl = NULL;
	u32 prio = skb->priority;
	struct tcf_proto *fl;
	struct tcf_result res;

	/*
	 *  Step 1. If skb->priority points to one of our classes, use it.
	 */
	if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
	    (cl = cbq_class_lookup(q, prio)) != NULL)
		return cl;

	*qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
	for (;;) {
		int result = 0;

		defmap = head->defaults;

		fl = rcu_dereference_bh(head->filter_list);
		/*
		 * Step 2+n. Apply classifier.
		 */
		result = tcf_classify(skb, fl, &res, true);
		if (!fl || result < 0)
			goto fallback;
		if (result == TC_ACT_SHOT)
			return NULL;

		cl = (void *)res.class;
		if (!cl) {
			if (TC_H_MAJ(res.classid))
				cl = cbq_class_lookup(q, res.classid);
			else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
				cl = defmap[TC_PRIO_BESTEFFORT];

			if (cl == NULL)
				goto fallback;
		}
		if (cl->level >= head->level)
			goto fallback;
#ifdef CONFIG_NET_CLS_ACT
		switch (result) {
		case TC_ACT_QUEUED:
		case TC_ACT_STOLEN:
		case TC_ACT_TRAP:
			*qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
			fallthrough;
		case TC_ACT_SHOT:
			return NULL;
		case TC_ACT_RECLASSIFY:
			return cbq_reclassify(skb, cl);
		}
#endif
		if (cl->level == 0)
			return cl;

		/*
		 * Step 3+n. If classifier selected a link sharing class,
		 *	 apply agency specific classifier.
		 *	 Repeat this procedure until we hit a leaf node.
		 */
		head = cl;
	}

fallback:
	cl = head;

	/*
	 * Step 4. No success...
	 */
	if (TC_H_MAJ(prio) == 0 &&
	    !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
	    !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
		return head;

	return cl;
}
/*
 * A packet has just been enqueued on the empty class.
 * cbq_activate_class adds it to the tail of the active class list
 * of its priority band.
 */

static inline void cbq_activate_class(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	int prio = cl->cpriority;
	struct cbq_class *cl_tail;

	cl_tail = q->active[prio];
	q->active[prio] = cl;

	if (cl_tail != NULL) {
		cl->next_alive = cl_tail->next_alive;
		cl_tail->next_alive = cl;
	} else {
		cl->next_alive = cl;
		q->activemask |= (1<<prio);
	}
}
/*
 * Unlink class from active chain.
 * Note that this same procedure is done directly in cbq_dequeue*
 * during round-robin procedure.
 */

static void cbq_deactivate_class(struct cbq_class *this)
{
	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
	int prio = this->cpriority;
	struct cbq_class *cl;
	struct cbq_class *cl_prev = q->active[prio];

	do {
		cl = cl_prev->next_alive;
		if (cl == this) {
			cl_prev->next_alive = cl->next_alive;
			cl->next_alive = NULL;

			if (cl == q->active[prio]) {
				q->active[prio] = cl_prev;
				if (cl == q->active[prio]) {
					q->active[prio] = NULL;
					q->activemask &= ~(1<<prio);
				}
			}
			return;
		}
	} while ((cl_prev = cl) != q->active[prio]);
}
static void
cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
{
	int toplevel = q->toplevel;

	if (toplevel > cl->level) {
		psched_time_t now = psched_get_time();

		do {
			if (cl->undertime < now) {
				q->toplevel = cl->level;
				return;
			}
		} while ((cl = cl->borrow) != NULL && toplevel > cl->level);
	}
}
static int
cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
	    struct sk_buff **to_free)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	int ret;
	struct cbq_class *cl = cbq_classify(skb, sch, &ret);

#ifdef CONFIG_NET_CLS_ACT
	q->rx_class = cl;
#endif
	if (cl == NULL) {
		if (ret & __NET_XMIT_BYPASS)
			qdisc_qstats_drop(sch);
		__qdisc_drop(skb, to_free);
		return ret;
	}

	ret = qdisc_enqueue(skb, cl->q, to_free);
	if (ret == NET_XMIT_SUCCESS) {
		sch->q.qlen++;
		cbq_mark_toplevel(q, cl);
		if (!cl->next_alive)
			cbq_activate_class(cl);
		return ret;
	}

	if (net_xmit_drop_count(ret)) {
		qdisc_qstats_drop(sch);
		cbq_mark_toplevel(q, cl);
		cl->qstats.drops++;
	}
	return ret;
}
/* Overlimit action: penalize leaf class by adding offtime */
static void cbq_overlimit(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	psched_tdiff_t delay = cl->undertime - q->now;

	if (!cl->delayed) {
		delay += cl->offtime;

		/*
		 * Class goes to sleep, so that it will have no
		 * chance to work off avgidle. Let's forgive it 8)
		 *
		 * BTW cbq-2.0 has a bug in this
		 * place, apparently they forgot to shift it by cl->ewma_log.
		 */
		if (cl->avgidle < 0)
			delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
		if (cl->avgidle < cl->minidle)
			cl->avgidle = cl->minidle;
		if (delay <= 0)
			delay = 1;
		cl->undertime = q->now + delay;

		cl->xstats.overactions++;
		cl->delayed = 1;
	}
	if (q->wd_expires == 0 || q->wd_expires > delay)
		q->wd_expires = delay;

	/* Dirty work! We must schedule wakeups based on
	 * real available rate, rather than leaf rate,
	 * which may be tiny (even zero).
	 */
	if (q->toplevel == TC_CBQ_MAXLEVEL) {
		struct cbq_class *b;
		psched_tdiff_t base_delay = q->wd_expires;

		for (b = cl->borrow; b; b = b->borrow) {
			delay = b->undertime - q->now;
			if (delay < base_delay) {
				if (delay <= 0)
					delay = 1;
				base_delay = delay;
			}
		}

		q->wd_expires = base_delay;
	}
}
static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
				       psched_time_t now)
{
	struct cbq_class *cl;
	struct cbq_class *cl_prev = q->active[prio];
	psched_time_t sched = now;

	if (cl_prev == NULL)
		return 0;

	do {
		cl = cl_prev->next_alive;
		if (now - cl->penalized > 0) {
			cl_prev->next_alive = cl->next_alive;
			cl->next_alive = NULL;
			cl->cpriority = cl->priority;
			cl->delayed = 0;
			cbq_activate_class(cl);

			if (cl == q->active[prio]) {
				q->active[prio] = cl_prev;
				if (cl == q->active[prio]) {
					q->active[prio] = NULL;
					return 0;
				}
			}

			cl = cl_prev->next_alive;
		} else if (sched - cl->penalized > 0)
			sched = cl->penalized;
	} while ((cl_prev = cl) != q->active[prio]);

	return sched - now;
}
static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
{
	struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
						delay_timer);
	struct Qdisc *sch = q->watchdog.qdisc;
	psched_time_t now;
	psched_tdiff_t delay = 0;
	unsigned int pmask;

	now = psched_get_time();

	pmask = q->pmask;
	q->pmask = 0;

	while (pmask) {
		int prio = ffz(~pmask);
		psched_tdiff_t tmp;

		pmask &= ~(1<<prio);

		tmp = cbq_undelay_prio(q, prio, now);
		if (tmp > 0) {
			q->pmask |= 1<<prio;
			if (tmp < delay || delay == 0)
				delay = tmp;
		}
	}

	if (delay) {
		ktime_t time;

		time = 0;
		time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
		hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
	}

	__netif_schedule(qdisc_root(sch));
	return HRTIMER_NORESTART;
}
/*
 * This is a mission-critical procedure.
 *
 * We "regenerate" the toplevel cutoff if the transmitting class
 * has backlog and is not regulated. It is not part of the
 * original CBQ description, but looks more reasonable.
 * Probably, it is wrong. This question needs further investigation.
 */

static inline void
cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
		    struct cbq_class *borrowed)
{
	if (cl && q->toplevel >= borrowed->level) {
		if (cl->q->q.qlen > 1) {
			do {
				if (borrowed->undertime == PSCHED_PASTPERFECT) {
					q->toplevel = borrowed->level;
					return;
				}
			} while ((borrowed = borrowed->borrow) != NULL);
		}
#if 0
	/* It is not necessary now. Uncommenting it
	   will save CPU cycles, but decrease fairness.
	 */
		q->toplevel = TC_CBQ_MAXLEVEL;
#endif
	}
}
static void
cbq_update(struct cbq_sched_data *q)
{
	struct cbq_class *this = q->tx_class;
	struct cbq_class *cl = this;
	int len = q->tx_len;
	psched_time_t now;

	q->tx_class = NULL;
	/* Time integrator. We calculate EOS time
	 * by adding expected packet transmission time.
	 */
	now = q->now + L2T(&q->link, len);

	for ( ; cl; cl = cl->share) {
		long avgidle = cl->avgidle;
		long idle;

		cl->bstats.packets++;
		cl->bstats.bytes += len;

		/*
		 * (now - last) is total time between packet right edges.
		 * (last_pktlen/rate) is "virtual" busy time, so that
		 *
		 *	idle = (now - last) - last_pktlen/rate
		 */

		idle = now - cl->last;
		if ((unsigned long)idle > 128*1024*1024) {
			avgidle = cl->maxidle;
		} else {
			idle -= L2T(cl, len);

		/* true_avgidle := (1-W)*true_avgidle + W*idle,
		 * where W=2^{-ewma_log}. But cl->avgidle is scaled:
		 * cl->avgidle == true_avgidle/W,
		 * hence:
		 */
			avgidle += idle - (avgidle>>cl->ewma_log);
		}

		if (avgidle <= 0) {
			/* Overlimit or at-limit */

			if (avgidle < cl->minidle)
				avgidle = cl->minidle;

			cl->avgidle = avgidle;

			/* Calculate the expected time when this class
			 * will be allowed to send.
			 * It will occur when:
			 * (1-W)*true_avgidle + W*delay = 0, i.e.
			 * idle = (1/W - 1)*(-true_avgidle)
			 * or
			 * idle = (1 - W)*(-cl->avgidle);
			 */
			idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);

			/*
			 * That is not all.
			 * To maintain the rate allocated to the class,
			 * we add to undertime the virtual clock time
			 * necessary to complete the transmitted packet.
			 * (len/phys_bandwidth has already passed
			 * by the moment of cbq_update)
			 */

			idle -= L2T(&q->link, len);
			idle += L2T(cl, len);

			cl->undertime = now + idle;
		} else {
			/* Underlimit */

			cl->undertime = PSCHED_PASTPERFECT;
			if (avgidle > cl->maxidle)
				cl->avgidle = cl->maxidle;
			else
				cl->avgidle = avgidle;
		}
		if ((s64)(now - cl->last) > 0)
			cl->last = now;
	}

	cbq_update_toplevel(q, this, q->tx_borrowed);
}
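
/* Worked example for the undertime computation above (illustrative
 * numbers): with ewma_log = 5 (W = 1/32) and a scaled avgidle of -3200
 * ticks, idle = 3200 - (3200 >> 5) = 3100 ticks, i.e. the class may
 * send again roughly 3100 ticks after 'now', plus the rate correction
 * L2T(cl, len) - L2T(&q->link, len).
 */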
static inline struct cbq_class *
cbq_under_limit(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	struct cbq_class *this_cl = cl;

	if (cl->tparent == NULL)
		return cl;

	if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
		cl->delayed = 0;
		return cl;
	}

	do {
		/* This is a very suspicious place. Currently the overlimit
		 * action is generated for unbounded classes
		 * only if the link is completely congested.
		 * Though this agrees with the ancestor-only paradigm,
		 * it looks very stupid. In particular,
		 * it means that this chunk of code will either
		 * never be called or result in strong amplification
		 * of burstiness. Dangerous, silly, and, however,
		 * no other solution exists.
		 */
		cl = cl->borrow;
		if (!cl) {
			this_cl->qstats.overlimits++;
			cbq_overlimit(this_cl);
			return NULL;
		}
		if (cl->level > q->toplevel)
			return NULL;
	} while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);

	cl->delayed = 0;
	return cl;
}
static inline struct sk_buff *
cbq_dequeue_prio(struct Qdisc *sch, int prio)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl_tail, *cl_prev, *cl;
	struct sk_buff *skb;
	int deficit;

	cl_tail = cl_prev = q->active[prio];
	cl = cl_prev->next_alive;

	do {
		deficit = 0;

		/* Start round */
		do {
			struct cbq_class *borrow = cl;

			if (cl->q->q.qlen &&
			    (borrow = cbq_under_limit(cl)) == NULL)
				goto skip_class;

			if (cl->deficit <= 0) {
				/* Class exhausted its allotment per
				 * this round. Switch to the next one.
				 */
				deficit = 1;
				cl->deficit += cl->quantum;
				goto next_class;
			}

			skb = cl->q->dequeue(cl->q);

			/* Class did not give us any skb :-(
			 * It could occur even if cl->q->q.qlen != 0,
			 * e.g. if cl->q == "tbf"
			 */
			if (skb == NULL)
				goto skip_class;

			cl->deficit -= qdisc_pkt_len(skb);
			q->tx_class = cl;
			q->tx_borrowed = borrow;
			if (borrow != cl) {
#ifndef CBQ_XSTATS_BORROWS_BYTES
				borrow->xstats.borrows++;
				cl->xstats.borrows++;
#else
				borrow->xstats.borrows += qdisc_pkt_len(skb);
				cl->xstats.borrows += qdisc_pkt_len(skb);
#endif
			}
			q->tx_len = qdisc_pkt_len(skb);

			if (cl->deficit <= 0) {
				q->active[prio] = cl;
				cl = cl->next_alive;
				cl->deficit += cl->quantum;
			}
			return skb;

skip_class:
			if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
				/* Class is empty or penalized.
				 * Unlink it from active chain.
				 */
				cl_prev->next_alive = cl->next_alive;
				cl->next_alive = NULL;

				/* Did cl_tail point to it? */
				if (cl == cl_tail) {
					/* Repair it! */
					cl_tail = cl_prev;

					/* Was it the last class in this band? */
					if (cl == cl_tail) {
						/* Kill the band! */
						q->active[prio] = NULL;
						q->activemask &= ~(1<<prio);
						if (cl->q->q.qlen)
							cbq_activate_class(cl);
						return NULL;
					}

					q->active[prio] = cl_tail;
				}
				if (cl->q->q.qlen)
					cbq_activate_class(cl);

				cl = cl_prev;
			}

next_class:
			cl_prev = cl;
			cl = cl->next_alive;
		} while (cl_prev != cl_tail);
	} while (deficit);

	q->active[prio] = cl_prev;

	return NULL;
}
static inline struct sk_buff *
cbq_dequeue_1(struct Qdisc *sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct sk_buff *skb;
	unsigned int activemask;

	activemask = q->activemask & 0xFF;
	while (activemask) {
		int prio = ffz(~activemask);

		activemask &= ~(1<<prio);
		skb = cbq_dequeue_prio(sch, prio);
		if (skb)
			return skb;
	}
	return NULL;
}
static struct sk_buff *
cbq_dequeue(struct Qdisc *sch)
{
	struct sk_buff *skb;
	struct cbq_sched_data *q = qdisc_priv(sch);
	psched_time_t now;

	now = psched_get_time();

	if (q->tx_class)
		cbq_update(q);

	q->now = now;

	for (;;) {
		q->wd_expires = 0;

		skb = cbq_dequeue_1(sch);
		if (skb) {
			qdisc_bstats_update(sch, skb);
			sch->q.qlen--;
			return skb;
		}

		/* All the classes are overlimit.
		 *
		 * It is possible if:
		 * 1. Scheduler is empty.
		 * 2. Toplevel cutoff inhibited borrowing.
		 * 3. Root class is overlimit.
		 *
		 * Reset the 2nd and 3rd conditions and retry.
		 *
		 * Note that NS and cbq-2.0 are buggy here; peeking
		 * an arbitrary class is appropriate for ancestor-only
		 * sharing, but not for the toplevel algorithm.
		 *
		 * Our version is better but slower, because it requires
		 * two passes; that is unavoidable with top-level sharing.
		 */

		if (q->toplevel == TC_CBQ_MAXLEVEL &&
		    q->link.undertime == PSCHED_PASTPERFECT)
			break;

		q->toplevel = TC_CBQ_MAXLEVEL;
		q->link.undertime = PSCHED_PASTPERFECT;
	}

	/* No packets in scheduler or nobody wants to give them to us :-(
	 * Sigh... start watchdog timer in the last case.
	 */

	if (sch->q.qlen) {
		qdisc_qstats_overlimit(sch);
		if (q->wd_expires)
			qdisc_watchdog_schedule(&q->watchdog,
						now + q->wd_expires);
	}
	return NULL;
}
/* CBQ class maintenance routines */

static void cbq_adjust_levels(struct cbq_class *this)
{
	if (this == NULL)
		return;

	do {
		int level = 0;
		struct cbq_class *cl;

		cl = this->children;
		if (cl) {
			do {
				if (cl->level > level)
					level = cl->level;
			} while ((cl = cl->sibling) != this->children);
		}
		this->level = level + 1;
	} while ((this = this->tparent) != NULL);
}
static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
{
	struct cbq_class *cl;
	unsigned int h;

	if (q->quanta[prio] == 0)
		return;

	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
			/* BUGGGG... Beware! This expression suffers from
			 * arithmetic overflows!
			 */
			if (cl->priority == prio) {
				cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
					q->quanta[prio];
			}
			if (cl->quantum <= 0 ||
			    cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
				pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
					cl->common.classid, cl->quantum);
				cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
			}
		}
	}
}
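
/* A possible overflow-safe form of the quantum computation above (a
 * sketch, not wired in): do the multiply in 64 bits and divide with
 * div_u64(), so that weight*allot*nclasses cannot wrap a 32-bit long.
 */
static inline long cbq_quantum64_sketch(const struct cbq_class *cl,
					const struct cbq_sched_data *q,
					int prio)
{
	u64 tmp = (u64)cl->weight * cl->allot * q->nclasses[prio];

	return (long)div_u64(tmp, q->quanta[prio]);
}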

static void cbq_sync_defmap(struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	struct cbq_class *split = cl->split;
	unsigned int h;
	int i;

	if (split == NULL)
		return;

	for (i = 0; i <= TC_PRIO_MAX; i++) {
		if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
			split->defaults[i] = NULL;
	}

	for (i = 0; i <= TC_PRIO_MAX; i++) {
		int level = split->level;

		if (split->defaults[i])
			continue;

		for (h = 0; h < q->clhash.hashsize; h++) {
			struct cbq_class *c;

			hlist_for_each_entry(c, &q->clhash.hash[h],
					     common.hnode) {
				if (c->split == split && c->level < level &&
				    c->defmap & (1<<i)) {
					split->defaults[i] = c;
					level = c->level;
				}
			}
		}
	}
}
static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
	struct cbq_class *split = NULL;

	if (splitid == 0) {
		split = cl->split;
		if (!split)
			return;
		splitid = split->common.classid;
	}

	if (split == NULL || split->common.classid != splitid) {
		for (split = cl->tparent; split; split = split->tparent)
			if (split->common.classid == splitid)
				break;
	}

	if (split == NULL)
		return;

	if (cl->split != split) {
		cl->defmap = 0;
		cbq_sync_defmap(cl);
		cl->split = split;
		cl->defmap = def & mask;
	} else
		cl->defmap = (cl->defmap & ~mask) | (def & mask);

	cbq_sync_defmap(cl);
}
static void cbq_unlink_class(struct cbq_class *this)
{
	struct cbq_class *cl, **clp;
	struct cbq_sched_data *q = qdisc_priv(this->qdisc);

	qdisc_class_hash_remove(&q->clhash, &this->common);

	if (this->tparent) {
		clp = &this->sibling;
		cl = *clp;
		do {
			if (cl == this) {
				*clp = cl->sibling;
				break;
			}
			clp = &cl->sibling;
		} while ((cl = *clp) != this->sibling);

		if (this->tparent->children == this) {
			this->tparent->children = this->sibling;
			if (this->sibling == this)
				this->tparent->children = NULL;
		}
	} else {
		WARN_ON(this->sibling != this);
	}
}
static void cbq_link_class(struct cbq_class *this)
{
	struct cbq_sched_data *q = qdisc_priv(this->qdisc);
	struct cbq_class *parent = this->tparent;

	this->sibling = this;
	qdisc_class_hash_insert(&q->clhash, &this->common);

	if (parent == NULL)
		return;

	if (parent->children == NULL) {
		parent->children = this;
	} else {
		this->sibling = parent->children->sibling;
		parent->children->sibling = this;
	}
}
static void
cbq_reset(struct Qdisc *sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl;
	int prio;
	unsigned int h;

	q->activemask = 0;
	q->pmask = 0;
	q->tx_class = NULL;
	q->tx_borrowed = NULL;
	qdisc_watchdog_cancel(&q->watchdog);
	hrtimer_cancel(&q->delay_timer);
	q->toplevel = TC_CBQ_MAXLEVEL;
	q->now = psched_get_time();

	for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
		q->active[prio] = NULL;

	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
			qdisc_reset(cl->q);

			cl->next_alive = NULL;
			cl->undertime = PSCHED_PASTPERFECT;
			cl->avgidle = cl->maxidle;
			cl->deficit = cl->quantum;
			cl->cpriority = cl->priority;
		}
	}
}

static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
{
	if (lss->change & TCF_CBQ_LSS_FLAGS) {
		cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
		cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
	}
	if (lss->change & TCF_CBQ_LSS_EWMA)
		cl->ewma_log = lss->ewma_log;
	if (lss->change & TCF_CBQ_LSS_AVPKT)
		cl->avpkt = lss->avpkt;
	if (lss->change & TCF_CBQ_LSS_MINIDLE)
		cl->minidle = -(long)lss->minidle;
	if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
		cl->maxidle = lss->maxidle;
		cl->avgidle = lss->maxidle;
	}
	if (lss->change & TCF_CBQ_LSS_OFFTIME)
		cl->offtime = lss->offtime;
	return 0;
}
static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
{
	q->nclasses[cl->priority]--;
	q->quanta[cl->priority] -= cl->weight;
	cbq_normalize_quanta(q, cl->priority);
}

static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
{
	q->nclasses[cl->priority]++;
	q->quanta[cl->priority] += cl->weight;
	cbq_normalize_quanta(q, cl->priority);
}
static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
{
	struct cbq_sched_data *q = qdisc_priv(cl->qdisc);

	if (wrr->allot)
		cl->allot = wrr->allot;
	if (wrr->weight)
		cl->weight = wrr->weight;
	if (wrr->priority) {
		cl->priority = wrr->priority - 1;
		cl->cpriority = cl->priority;
		if (cl->priority >= cl->priority2)
			cl->priority2 = TC_CBQ_MAXPRIO - 1;
	}

	cbq_addprio(q, cl);
	return 0;
}
static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
{
	cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
	return 0;
}
static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
	[TCA_CBQ_LSSOPT]	= { .len = sizeof(struct tc_cbq_lssopt) },
	[TCA_CBQ_WRROPT]	= { .len = sizeof(struct tc_cbq_wrropt) },
	[TCA_CBQ_FOPT]		= { .len = sizeof(struct tc_cbq_fopt) },
	[TCA_CBQ_OVL_STRATEGY]	= { .len = sizeof(struct tc_cbq_ovl) },
	[TCA_CBQ_RATE]		= { .len = sizeof(struct tc_ratespec) },
	[TCA_CBQ_RTAB]		= { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
	[TCA_CBQ_POLICE]	= { .len = sizeof(struct tc_cbq_police) },
};
static int cbq_opt_parse(struct nlattr *tb[TCA_CBQ_MAX + 1],
			 struct nlattr *opt,
			 struct netlink_ext_ack *extack)
{
	int err;

	if (!opt) {
		NL_SET_ERR_MSG(extack, "CBQ options are required for this operation");
		return -EINVAL;
	}

	err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt,
					  cbq_policy, extack);
	if (err < 0)
		return err;

	if (tb[TCA_CBQ_WRROPT]) {
		const struct tc_cbq_wrropt *wrr = nla_data(tb[TCA_CBQ_WRROPT]);

		if (wrr->priority > TC_CBQ_MAXPRIO) {
			NL_SET_ERR_MSG(extack, "priority is bigger than TC_CBQ_MAXPRIO");
			err = -EINVAL;
		}
	}
	return err;
}
static int cbq_init(struct Qdisc *sch, struct nlattr *opt,
		    struct netlink_ext_ack *extack)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct nlattr *tb[TCA_CBQ_MAX + 1];
	struct tc_ratespec *r;
	int err;

	qdisc_watchdog_init(&q->watchdog, sch);
	hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
	q->delay_timer.function = cbq_undelay;

	err = cbq_opt_parse(tb, opt, extack);
	if (err < 0)
		return err;

	if (!tb[TCA_CBQ_RTAB] || !tb[TCA_CBQ_RATE]) {
		NL_SET_ERR_MSG(extack, "Rate specification missing or incomplete");
		return -EINVAL;
	}

	r = nla_data(tb[TCA_CBQ_RATE]);

	q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB], extack);
	if (!q->link.R_tab)
		return -EINVAL;

	err = tcf_block_get(&q->link.block, &q->link.filter_list, sch, extack);
	if (err)
		goto put_rtab;

	err = qdisc_class_hash_init(&q->clhash);
	if (err < 0)
		goto put_block;

	q->link.sibling = &q->link;
	q->link.common.classid = sch->handle;
	q->link.qdisc = sch;
	q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
				      sch->handle, NULL);
	if (!q->link.q)
		q->link.q = &noop_qdisc;
	else
		qdisc_hash_add(q->link.q, true);

	q->link.priority = TC_CBQ_MAXPRIO - 1;
	q->link.priority2 = TC_CBQ_MAXPRIO - 1;
	q->link.cpriority = TC_CBQ_MAXPRIO - 1;
	q->link.allot = psched_mtu(qdisc_dev(sch));
	q->link.quantum = q->link.allot;
	q->link.weight = q->link.R_tab->rate.rate;

	q->link.ewma_log = TC_CBQ_DEF_EWMA;
	q->link.avpkt = q->link.allot/2;
	q->link.minidle = -0x7FFFFFFF;

	q->toplevel = TC_CBQ_MAXLEVEL;
	q->now = psched_get_time();

	cbq_link_class(&q->link);

	if (tb[TCA_CBQ_LSSOPT])
		cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));

	cbq_addprio(q, &q->link);
	return 0;

put_block:
	tcf_block_put(q->link.block);

put_rtab:
	qdisc_put_rtab(q->link.R_tab);
	return err;
}
static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);

	if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
		goto nla_put_failure;
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_lssopt opt;

	opt.flags = 0;
	if (cl->borrow == NULL)
		opt.flags |= TCF_CBQ_LSS_BOUNDED;
	if (cl->share == NULL)
		opt.flags |= TCF_CBQ_LSS_ISOLATED;
	opt.ewma_log = cl->ewma_log;
	opt.level = cl->level;
	opt.avpkt = cl->avpkt;
	opt.maxidle = cl->maxidle;
	opt.minidle = (u32)(-cl->minidle);
	opt.offtime = cl->offtime;
	opt.change = ~0;
	if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
		goto nla_put_failure;
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_wrropt opt;

	memset(&opt, 0, sizeof(opt));
	opt.flags = 0;
	opt.allot = cl->allot;
	opt.priority = cl->priority + 1;
	opt.cpriority = cl->cpriority + 1;
	opt.weight = cl->weight;
	if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
		goto nla_put_failure;
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
{
	unsigned char *b = skb_tail_pointer(skb);
	struct tc_cbq_fopt opt;

	if (cl->split || cl->defmap) {
		opt.split = cl->split ? cl->split->common.classid : 0;
		opt.defmap = cl->defmap;
		opt.defchange = ~0;
		if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
			goto nla_put_failure;
	}
	return skb->len;

nla_put_failure:
	nlmsg_trim(skb, b);
	return -1;
}
static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
{
	if (cbq_dump_lss(skb, cl) < 0 ||
	    cbq_dump_rate(skb, cl) < 0 ||
	    cbq_dump_wrr(skb, cl) < 0 ||
	    cbq_dump_fopt(skb, cl) < 0)
		return -1;
	return 0;
}
static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct nlattr *nest;

	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;
	if (cbq_dump_attr(skb, &q->link) < 0)
		goto nla_put_failure;
	return nla_nest_end(skb, nest);

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}
static int
cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
{
	struct cbq_sched_data *q = qdisc_priv(sch);

	q->link.xstats.avgidle = q->link.avgidle;
	return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
}
static int
cbq_dump_class(struct Qdisc *sch, unsigned long arg,
	       struct sk_buff *skb, struct tcmsg *tcm)
{
	struct cbq_class *cl = (struct cbq_class *)arg;
	struct nlattr *nest;

	if (cl->tparent)
		tcm->tcm_parent = cl->tparent->common.classid;
	else
		tcm->tcm_parent = TC_H_ROOT;
	tcm->tcm_handle = cl->common.classid;
	tcm->tcm_info = cl->q->handle;

	nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
	if (nest == NULL)
		goto nla_put_failure;
	if (cbq_dump_attr(skb, cl) < 0)
		goto nla_put_failure;
	return nla_nest_end(skb, nest);

nla_put_failure:
	nla_nest_cancel(skb, nest);
	return -1;
}
static int
cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
		     struct gnet_dump *d)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = (struct cbq_class *)arg;
	__u32 qlen;

	cl->xstats.avgidle = cl->avgidle;
	cl->xstats.undertime = 0;
	qdisc_qstats_qlen_backlog(cl->q, &qlen, &cl->qstats.backlog);

	if (cl->undertime != PSCHED_PASTPERFECT)
		cl->xstats.undertime = cl->undertime - q->now;

	if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
				  d, NULL, &cl->bstats) < 0 ||
	    gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
	    gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0)
		return -1;

	return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
}
static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
		     struct Qdisc **old, struct netlink_ext_ack *extack)
{
	struct cbq_class *cl = (struct cbq_class *)arg;

	if (new == NULL) {
		new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
					cl->common.classid, extack);
		if (new == NULL)
			return -ENOBUFS;
	}

	*old = qdisc_replace(sch, new, &cl->q);
	return 0;
}
static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_class *cl = (struct cbq_class *)arg;

	return cl->q;
}

static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_class *cl = (struct cbq_class *)arg;

	cbq_deactivate_class(cl);
}
static unsigned long cbq_find(struct Qdisc *sch, u32 classid)
{
	struct cbq_sched_data *q = qdisc_priv(sch);

	return (unsigned long)cbq_class_lookup(q, classid);
}
static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
{
	struct cbq_sched_data *q = qdisc_priv(sch);

	WARN_ON(cl->filters);

	tcf_block_put(cl->block);
	qdisc_put(cl->q);
	qdisc_put_rtab(cl->R_tab);
	gen_kill_estimator(&cl->rate_est);
	if (cl != &q->link)
		kfree(cl);
}
static void cbq_destroy(struct Qdisc *sch)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct hlist_node *next;
	struct cbq_class *cl;
	unsigned int h;

#ifdef CONFIG_NET_CLS_ACT
	q->rx_class = NULL;
#endif
	/*
	 * Filters must be destroyed first because we don't destroy the
	 * classes from root to leafs which means that filters can still
	 * be bound to classes which have been destroyed already. --TGR '04
	 */
	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
			tcf_block_put(cl->block);
			cl->block = NULL;
		}
	}
	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
					  common.hnode)
			cbq_destroy_class(sch, cl);
	}
	qdisc_class_hash_destroy(&q->clhash);
}
static int
cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
		 unsigned long *arg, struct netlink_ext_ack *extack)
{
	int err;
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = (struct cbq_class *)*arg;
	struct nlattr *opt = tca[TCA_OPTIONS];
	struct nlattr *tb[TCA_CBQ_MAX + 1];
	struct cbq_class *parent;
	struct qdisc_rate_table *rtab = NULL;

	err = cbq_opt_parse(tb, opt, extack);
	if (err < 0)
		return err;

	if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE]) {
		NL_SET_ERR_MSG(extack, "Neither overlimit strategy nor policing attributes can be used for changing class params");
		return -EOPNOTSUPP;
	}

	if (cl) {
		/* Check parent */
		if (parentid) {
			if (cl->tparent &&
			    cl->tparent->common.classid != parentid) {
				NL_SET_ERR_MSG(extack, "Invalid parent id");
				return -EINVAL;
			}
			if (!cl->tparent && parentid != TC_H_ROOT) {
				NL_SET_ERR_MSG(extack, "Parent must be root");
				return -EINVAL;
			}
		}

		if (tb[TCA_CBQ_RATE]) {
			rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
					      tb[TCA_CBQ_RTAB], extack);
			if (rtab == NULL)
				return -EINVAL;
		}

		if (tca[TCA_RATE]) {
			err = gen_replace_estimator(&cl->bstats, NULL,
						    &cl->rate_est,
						    NULL,
						    qdisc_root_sleeping_running(sch),
						    tca[TCA_RATE]);
			if (err) {
				NL_SET_ERR_MSG(extack, "Failed to replace specified rate estimator");
				qdisc_put_rtab(rtab);
				return err;
			}
		}

		/* Change class parameters */
		sch_tree_lock(sch);

		if (cl->next_alive != NULL)
			cbq_deactivate_class(cl);

		if (rtab) {
			qdisc_put_rtab(cl->R_tab);
			cl->R_tab = rtab;
		}

		if (tb[TCA_CBQ_LSSOPT])
			cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));

		if (tb[TCA_CBQ_WRROPT]) {
			cbq_rmprio(q, cl);
			cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
		}

		if (tb[TCA_CBQ_FOPT])
			cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));

		if (cl->q->q.qlen)
			cbq_activate_class(cl);

		sch_tree_unlock(sch);

		return 0;
	}

	if (parentid == TC_H_ROOT)
		return -EINVAL;

	if (!tb[TCA_CBQ_WRROPT] || !tb[TCA_CBQ_RATE] || !tb[TCA_CBQ_LSSOPT]) {
		NL_SET_ERR_MSG(extack, "One of the following attributes MUST be specified: WRR, rate or link sharing");
		return -EINVAL;
	}

	rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB],
			      extack);
	if (rtab == NULL)
		return -EINVAL;

	if (classid) {
		err = -EINVAL;
		if (TC_H_MAJ(classid ^ sch->handle) ||
		    cbq_class_lookup(q, classid)) {
			NL_SET_ERR_MSG(extack, "Specified class not found");
			goto failure;
		}
	} else {
		int i;

		classid = TC_H_MAKE(sch->handle, 0x8000);

		for (i = 0; i < 0x8000; i++) {
			if (++q->hgenerator >= 0x8000)
				q->hgenerator = 1;
			if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
				break;
		}
		err = -ENOSR;
		if (i >= 0x8000) {
			NL_SET_ERR_MSG(extack, "Unable to generate classid");
			goto failure;
		}
		classid = classid|q->hgenerator;
	}

	parent = &q->link;
	if (parentid) {
		parent = cbq_class_lookup(q, parentid);
		err = -EINVAL;
		if (!parent) {
			NL_SET_ERR_MSG(extack, "Failed to find parentid");
			goto failure;
		}
	}

	err = -ENOBUFS;
	cl = kzalloc(sizeof(*cl), GFP_KERNEL);
	if (cl == NULL)
		goto failure;

	err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
	if (err) {
		kfree(cl);
		goto failure;
	}

	if (tca[TCA_RATE]) {
		err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
					NULL,
					qdisc_root_sleeping_running(sch),
					tca[TCA_RATE]);
		if (err) {
			NL_SET_ERR_MSG(extack, "Couldn't create new estimator");
			tcf_block_put(cl->block);
			kfree(cl);
			goto failure;
		}
	}

	cl->R_tab = rtab;
	rtab = NULL;
	cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid,
				  NULL);
	if (!cl->q)
		cl->q = &noop_qdisc;
	else
		qdisc_hash_add(cl->q, true);

	cl->common.classid = classid;
	cl->tparent = parent;
	cl->qdisc = sch;
	cl->allot = parent->allot;
	cl->quantum = cl->allot;
	cl->weight = cl->R_tab->rate.rate;

	sch_tree_lock(sch);
	cbq_link_class(cl);
	cl->borrow = cl->tparent;
	if (cl->tparent != &q->link)
		cl->share = cl->tparent;
	cbq_adjust_levels(parent);
	cl->minidle = -0x7FFFFFFF;
	cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
	cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
	if (cl->ewma_log == 0)
		cl->ewma_log = q->link.ewma_log;
	if (cl->maxidle == 0)
		cl->maxidle = q->link.maxidle;
	if (cl->avpkt == 0)
		cl->avpkt = q->link.avpkt;
	if (tb[TCA_CBQ_FOPT])
		cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
	sch_tree_unlock(sch);

	qdisc_class_hash_grow(sch, &q->clhash);

	*arg = (unsigned long)cl;
	return 0;

failure:
	qdisc_put_rtab(rtab);
	return err;
}
static int cbq_delete(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = (struct cbq_class *)arg;

	if (cl->filters || cl->children || cl == &q->link)
		return -EBUSY;

	sch_tree_lock(sch);

	qdisc_purge_queue(cl->q);

	if (cl->next_alive)
		cbq_deactivate_class(cl);

	if (q->tx_borrowed == cl)
		q->tx_borrowed = q->tx_class;
	if (q->tx_class == cl) {
		q->tx_class = NULL;
		q->tx_borrowed = NULL;
	}
#ifdef CONFIG_NET_CLS_ACT
	if (q->rx_class == cl)
		q->rx_class = NULL;
#endif

	cbq_unlink_class(cl);
	cbq_adjust_levels(cl->tparent);
	cl->defmap = 0;
	cbq_sync_defmap(cl);

	cbq_rmprio(q, cl);
	sch_tree_unlock(sch);

	cbq_destroy_class(sch, cl);
	return 0;
}
static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg,
				       struct netlink_ext_ack *extack)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl = (struct cbq_class *)arg;

	if (cl == NULL)
		cl = &q->link;

	return cl->block;
}

static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
				     u32 classid)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *p = (struct cbq_class *)parent;
	struct cbq_class *cl = cbq_class_lookup(q, classid);

	if (cl) {
		if (p && p->level <= cl->level)
			return 0;
		cl->filters++;
		return (unsigned long)cl;
	}
	return 0;
}
static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
{
	struct cbq_class *cl = (struct cbq_class *)arg;

	cl->filters--;
}
static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
{
	struct cbq_sched_data *q = qdisc_priv(sch);
	struct cbq_class *cl;
	unsigned int h;

	if (arg->stop)
		return;

	for (h = 0; h < q->clhash.hashsize; h++) {
		hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
			if (arg->count < arg->skip) {
				arg->count++;
				continue;
			}
			if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
				arg->stop = 1;
				return;
			}
			arg->count++;
		}
	}
}
static const struct Qdisc_class_ops cbq_class_ops = {
	.graft		=	cbq_graft,
	.leaf		=	cbq_leaf,
	.qlen_notify	=	cbq_qlen_notify,
	.find		=	cbq_find,
	.change		=	cbq_change_class,
	.delete		=	cbq_delete,
	.walk		=	cbq_walk,
	.tcf_block	=	cbq_tcf_block,
	.bind_tcf	=	cbq_bind_filter,
	.unbind_tcf	=	cbq_unbind_filter,
	.dump		=	cbq_dump_class,
	.dump_stats	=	cbq_dump_class_stats,
};
static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
	.next		=	NULL,
	.cl_ops		=	&cbq_class_ops,
	.id		=	"cbq",
	.priv_size	=	sizeof(struct cbq_sched_data),
	.enqueue	=	cbq_enqueue,
	.dequeue	=	cbq_dequeue,
	.peek		=	qdisc_peek_dequeued,
	.init		=	cbq_init,
	.reset		=	cbq_reset,
	.destroy	=	cbq_destroy,
	.change		=	NULL,
	.dump		=	cbq_dump,
	.dump_stats	=	cbq_dump_stats,
	.owner		=	THIS_MODULE,
};
static int __init cbq_module_init(void)
{
	return register_qdisc(&cbq_qdisc_ops);
}
static void __exit cbq_module_exit(void)
{
	unregister_qdisc(&cbq_qdisc_ops);
}
module_init(cbq_module_init)
module_exit(cbq_module_exit)
MODULE_LICENSE("GPL");