2 * net/sched/sch_cbq.c Class-Based Queueing discipline.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 #include <linux/module.h>
14 #include <linux/slab.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <net/netlink.h>
21 #include <net/pkt_sched.h>
22 #include <net/pkt_cls.h>
25 /* Class-Based Queueing (CBQ) algorithm.
26 =======================================
28 Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
29 Management Models for Packet Networks",
30 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
32 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
[3] Sally Floyd, "Notes on Class-Based Queueing: Setting Parameters", 1996
37 [4] Sally Floyd and Michael Speer, "Experimental Results
38 for Class-Based Queueing", 1998, not published.
40 -----------------------------------------------------------------------
42 Algorithm skeleton was taken from NS simulator cbq.cc.
If someone wants to check this code against the LBL version,
they should take into account that ONLY the skeleton was borrowed;
the implementation is different. In particular:
47 --- The WRR algorithm is different. Our version looks more
48 reasonable (I hope) and works when quanta are allowed to be
49 less than MTU, which is always the case when real time classes
have small rates. Note that the statement in [3] is
incomplete: delay may actually be estimated even if the class
per-round allotment is less than MTU. Namely, if the per-round
allotment is W*r_i, and r_1+...+r_k = r < 1, then
55 delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
57 In the worst case we have IntServ estimate with D = W*r+k*MTU
58 and C = MTU*r. The proof (if correct at all) is trivial.
--- It seems that cbq-2.0 is not very accurate. At least, I cannot
interpret some places, which look like wrong translations
from NS. Anyone is welcome to find these differences
and explain to me why I am wrong 8).
--- Linux has no EOI event, so we cannot estimate true class
idle time. The workaround is to consider the next dequeue event
as a sign that the previous packet has finished. This is wrong because of
internal device queueing, but on a permanently loaded link it is true.
Moreover, combined with the clock integrator, this scheme looks
very close to an ideal solution. */
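/* Illustrative sketch (added for exposition; the helper below is not
 * used by the scheduler itself): the scaled EWMA that cbq_update()
 * applies to per-class idle time.  With W = 2^(-ewma_log) the true
 * update is
 *
 *	true_avgidle := (1-W)*true_avgidle + W*idle
 *
 * but cl->avgidle stores true_avgidle/W, so the update reduces to a
 * shift and an add.
 */
static inline long cbq_ewma_idle_sketch(long scaled_avgidle, long idle,
					unsigned char ewma_log)
{
	/* equivalent to the update in cbq_update():
	 * avgidle += idle - (avgidle >> ewma_log)
	 */
	return scaled_avgidle + idle - (scaled_avgidle >> ewma_log);
}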
73 struct cbq_sched_data;
77 struct Qdisc_class_common common;
78 struct cbq_class *next_alive; /* next class with backlog in this priority band */
81 unsigned char priority; /* class priority */
82 unsigned char priority2; /* priority to be used after overlimit */
83 unsigned char ewma_log; /* time constant for idle time calculation */
87 /* Link-sharing scheduler parameters */
88 long maxidle; /* Class parameters: see below. */
92 struct qdisc_rate_table *R_tab;
94 /* General scheduler (WRR) parameters */
96 long quantum; /* Allotment per WRR round */
97 long weight; /* Relative allotment: see below */
99 struct Qdisc *qdisc; /* Ptr to CBQ discipline */
100 struct cbq_class *split; /* Ptr to split node */
101 struct cbq_class *share; /* Ptr to LS parent in the class tree */
102 struct cbq_class *tparent; /* Ptr to tree parent in the class tree */
struct cbq_class *borrow; /* NULL if class is bandwidth limited; parent otherwise */
105 struct cbq_class *sibling; /* Sibling chain */
106 struct cbq_class *children; /* Pointer to children chain */
108 struct Qdisc *q; /* Elementary queueing discipline */
112 unsigned char cpriority; /* Effective priority */
113 unsigned char delayed;
114 unsigned char level; /* level of the class in hierarchy:
115 0 for leaf classes, and maximal
116 level of children + 1 for nodes.
119 psched_time_t last; /* Last end of service */
120 psched_time_t undertime;
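/* undertime: time when the class will become underlimit again;
 * PSCHED_PASTPERFECT while the class is not being regulated
 * (see cbq_under_limit()).
 */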
122 long deficit; /* Saved deficit for WRR */
123 psched_time_t penalized;
124 struct gnet_stats_basic_packed bstats;
125 struct gnet_stats_queue qstats;
126 struct net_rate_estimator __rcu *rate_est;
127 struct tc_cbq_xstats xstats;
129 struct tcf_proto __rcu *filter_list;
130 struct tcf_block *block;
134 struct cbq_class *defaults[TC_PRIO_MAX + 1];
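/* defaults[]: for a split node, the default class to use for each
 * TC_PRIO_* value when classification yields no specific class;
 * kept in sync by cbq_sync_defmap().
 */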
137 struct cbq_sched_data {
138 struct Qdisc_class_hash clhash; /* Hash table of all classes */
139 int nclasses[TC_CBQ_MAXPRIO + 1];
140 unsigned int quanta[TC_CBQ_MAXPRIO + 1];
142 struct cbq_class link;
144 unsigned int activemask;
struct cbq_class *active[TC_CBQ_MAXPRIO + 1]; /* List of all classes with backlog */
148 #ifdef CONFIG_NET_CLS_ACT
149 struct cbq_class *rx_class;
151 struct cbq_class *tx_class;
152 struct cbq_class *tx_borrowed;
154 psched_time_t now; /* Cached timestamp */
157 struct hrtimer delay_timer;
struct qdisc_watchdog watchdog; /* Watchdog timer, started when CBQ has backlog but cannot transmit yet */
162 psched_tdiff_t wd_expires;
168 #define L2T(cl, len) qdisc_l2t((cl)->R_tab, len)
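/* L2T ("length to time") above converts a packet length into the
 * transmission time, in psched ticks, at the class's configured rate,
 * via the rate table attached to the class.
 */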
170 static inline struct cbq_class *
171 cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
173 struct Qdisc_class_common *clc;
175 clc = qdisc_class_find(&q->clhash, classid);
178 return container_of(clc, struct cbq_class, common);
181 #ifdef CONFIG_NET_CLS_ACT
183 static struct cbq_class *
184 cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
186 struct cbq_class *cl;
188 for (cl = this->tparent; cl; cl = cl->tparent) {
189 struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
191 if (new != NULL && new != this)
/* Classify the packet. The procedure is pretty complicated, but
 * it allows us to combine link sharing and priority scheduling
 * transparently.
 *
 * Namely, you can put link-sharing rules (e.g. route-based) at the root of CBQ,
 * so that it resolves to split nodes. Then packets are classified
 * by logical priority, or a more specific classifier may be attached
 * to the split node.
 */
209 static struct cbq_class *
210 cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
212 struct cbq_sched_data *q = qdisc_priv(sch);
213 struct cbq_class *head = &q->link;
214 struct cbq_class **defmap;
215 struct cbq_class *cl = NULL;
216 u32 prio = skb->priority;
217 struct tcf_proto *fl;
218 struct tcf_result res;
221 * Step 1. If skb->priority points to one of our classes, use it.
223 if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
224 (cl = cbq_class_lookup(q, prio)) != NULL)
227 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
230 defmap = head->defaults;
232 fl = rcu_dereference_bh(head->filter_list);
234 * Step 2+n. Apply classifier.
236 result = tcf_classify(skb, fl, &res, true);
237 if (!fl || result < 0)
240 cl = (void *)res.class;
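/* Resolve the filter verdict: a full classid selects a class
 * directly, otherwise the low bits index the split node's
 * defaults[] table, with TC_PRIO_BESTEFFORT as the fallback.
 */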
242 if (TC_H_MAJ(res.classid))
243 cl = cbq_class_lookup(q, res.classid);
244 else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
245 cl = defmap[TC_PRIO_BESTEFFORT];
250 if (cl->level >= head->level)
252 #ifdef CONFIG_NET_CLS_ACT
257 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
260 case TC_ACT_RECLASSIFY:
261 return cbq_reclassify(skb, cl);
 * Step 3+n. If the classifier selected a link-sharing class,
 * apply the agency-specific classifier.
 * Repeat this procedure until we hit a leaf node.
279 * Step 4. No success...
281 if (TC_H_MAJ(prio) == 0 &&
282 !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
283 !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
 * A packet has just been enqueued on an empty class.
 * cbq_activate_class adds it to the tail of the active class list
 * of its priority band.
295 static inline void cbq_activate_class(struct cbq_class *cl)
297 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
298 int prio = cl->cpriority;
299 struct cbq_class *cl_tail;
301 cl_tail = q->active[prio];
302 q->active[prio] = cl;
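/* q->active[prio] always points at the tail of a circular next_alive
 * ring; cbq_dequeue_prio() starts each round at
 * q->active[prio]->next_alive, i.e. at the head.
 */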
304 if (cl_tail != NULL) {
305 cl->next_alive = cl_tail->next_alive;
306 cl_tail->next_alive = cl;
309 q->activemask |= (1<<prio);
 * Unlink the class from the active chain.
 * Note that the same procedure is done directly in cbq_dequeue*
 * during the round-robin procedure.
319 static void cbq_deactivate_class(struct cbq_class *this)
321 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
322 int prio = this->cpriority;
323 struct cbq_class *cl;
324 struct cbq_class *cl_prev = q->active[prio];
327 cl = cl_prev->next_alive;
329 cl_prev->next_alive = cl->next_alive;
330 cl->next_alive = NULL;
332 if (cl == q->active[prio]) {
333 q->active[prio] = cl_prev;
334 if (cl == q->active[prio]) {
335 q->active[prio] = NULL;
336 q->activemask &= ~(1<<prio);
342 } while ((cl_prev = cl) != q->active[prio]);
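/* Top-level link sharing: q->toplevel is the lowest level at which some
 * class has recently been found underlimit.  cbq_under_limit() allows a
 * class to borrow only from ancestors whose level does not exceed
 * q->toplevel; enqueueing to an underlimit class lowers the cutoff in
 * cbq_mark_toplevel() below.
 */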
346 cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
348 int toplevel = q->toplevel;
350 if (toplevel > cl->level) {
351 psched_time_t now = psched_get_time();
354 if (cl->undertime < now) {
355 q->toplevel = cl->level;
358 } while ((cl = cl->borrow) != NULL && toplevel > cl->level);
363 cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
364 struct sk_buff **to_free)
366 struct cbq_sched_data *q = qdisc_priv(sch);
367 int uninitialized_var(ret);
368 struct cbq_class *cl = cbq_classify(skb, sch, &ret);
370 #ifdef CONFIG_NET_CLS_ACT
374 if (ret & __NET_XMIT_BYPASS)
375 qdisc_qstats_drop(sch);
376 __qdisc_drop(skb, to_free);
380 ret = qdisc_enqueue(skb, cl->q, to_free);
381 if (ret == NET_XMIT_SUCCESS) {
383 cbq_mark_toplevel(q, cl);
385 cbq_activate_class(cl);
389 if (net_xmit_drop_count(ret)) {
390 qdisc_qstats_drop(sch);
391 cbq_mark_toplevel(q, cl);
397 /* Overlimit action: penalize leaf class by adding offtime */
398 static void cbq_overlimit(struct cbq_class *cl)
400 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
401 psched_tdiff_t delay = cl->undertime - q->now;
404 delay += cl->offtime;
 * The class goes to sleep, so it will have no
 * chance to work off its avgidle. Let's forgive it 8)
 *
 * BTW cbq-2.0 has a bug in this
 * place; apparently they forgot to shift it by cl->ewma_log.
414 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
415 if (cl->avgidle < cl->minidle)
416 cl->avgidle = cl->minidle;
419 cl->undertime = q->now + delay;
421 cl->xstats.overactions++;
424 if (q->wd_expires == 0 || q->wd_expires > delay)
425 q->wd_expires = delay;
427 /* Dirty work! We must schedule wakeups based on
428 * real available rate, rather than leaf rate,
429 * which may be tiny (even zero).
431 if (q->toplevel == TC_CBQ_MAXLEVEL) {
433 psched_tdiff_t base_delay = q->wd_expires;
435 for (b = cl->borrow; b; b = b->borrow) {
436 delay = b->undertime - q->now;
437 if (delay < base_delay) {
444 q->wd_expires = base_delay;
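/* Classes that have been temporarily demoted to a penalty band are moved
 * back to their normal priority band by cbq_undelay_prio() once q->now
 * passes cl->penalized; the earliest remaining wakeup is used by
 * cbq_undelay() to re-arm the delay timer.
 */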
448 static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
451 struct cbq_class *cl;
452 struct cbq_class *cl_prev = q->active[prio];
453 psched_time_t sched = now;
459 cl = cl_prev->next_alive;
460 if (now - cl->penalized > 0) {
461 cl_prev->next_alive = cl->next_alive;
462 cl->next_alive = NULL;
463 cl->cpriority = cl->priority;
465 cbq_activate_class(cl);
467 if (cl == q->active[prio]) {
468 q->active[prio] = cl_prev;
469 if (cl == q->active[prio]) {
470 q->active[prio] = NULL;
475 cl = cl_prev->next_alive;
476 } else if (sched - cl->penalized > 0)
477 sched = cl->penalized;
478 } while ((cl_prev = cl) != q->active[prio]);
483 static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
485 struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
487 struct Qdisc *sch = q->watchdog.qdisc;
489 psched_tdiff_t delay = 0;
492 now = psched_get_time();
498 int prio = ffz(~pmask);
503 tmp = cbq_undelay_prio(q, prio, now);
506 if (tmp < delay || delay == 0)
515 time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
516 hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
519 __netif_schedule(qdisc_root(sch));
520 return HRTIMER_NORESTART;
 * This is a mission-critical procedure.
 *
 * We "regenerate" the toplevel cutoff if the transmitting class
 * has backlog and is not regulated. This is not part of the
 * original CBQ description, but it looks more reasonable.
 * It is probably wrong; this question needs further investigation.
533 cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
534 struct cbq_class *borrowed)
536 if (cl && q->toplevel >= borrowed->level) {
537 if (cl->q->q.qlen > 1) {
539 if (borrowed->undertime == PSCHED_PASTPERFECT) {
540 q->toplevel = borrowed->level;
543 } while ((borrowed = borrowed->borrow) != NULL);
546 /* It is not necessary now. Uncommenting it
547 will save CPU cycles, but decrease fairness.
549 q->toplevel = TC_CBQ_MAXLEVEL;
555 cbq_update(struct cbq_sched_data *q)
557 struct cbq_class *this = q->tx_class;
558 struct cbq_class *cl = this;
/* Time integrator. We calculate the EOS (end of service) time
 * by adding the expected packet transmission time.
566 now = q->now + L2T(&q->link, len);
568 for ( ; cl; cl = cl->share) {
569 long avgidle = cl->avgidle;
572 cl->bstats.packets++;
573 cl->bstats.bytes += len;
576 * (now - last) is total time between packet right edges.
577 * (last_pktlen/rate) is "virtual" busy time, so that
579 * idle = (now - last) - last_pktlen/rate
582 idle = now - cl->last;
583 if ((unsigned long)idle > 128*1024*1024) {
584 avgidle = cl->maxidle;
586 idle -= L2T(cl, len);
588 /* true_avgidle := (1-W)*true_avgidle + W*idle,
589 * where W=2^{-ewma_log}. But cl->avgidle is scaled:
590 * cl->avgidle == true_avgidle/W,
593 avgidle += idle - (avgidle>>cl->ewma_log);
597 /* Overlimit or at-limit */
599 if (avgidle < cl->minidle)
600 avgidle = cl->minidle;
602 cl->avgidle = avgidle;
/* Calculate the expected time when this class
 * will be allowed to send.
 * It will occur when:
607 * (1-W)*true_avgidle + W*delay = 0, i.e.
608 * idle = (1/W - 1)*(-true_avgidle)
610 * idle = (1 - W)*(-cl->avgidle);
612 idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
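/* i.e. idle = (1 - W) * (-avgidle), computed with a shift */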
 * To maintain the rate allocated to the class,
 * we add to undertime the virtual clock time
 * necessary to complete the transmitted packet.
 * (len/phys_bandwidth has already elapsed by
 * the time cbq_update is called.)
623 idle -= L2T(&q->link, len);
624 idle += L2T(cl, len);
626 cl->undertime = now + idle;
630 cl->undertime = PSCHED_PASTPERFECT;
631 if (avgidle > cl->maxidle)
632 cl->avgidle = cl->maxidle;
634 cl->avgidle = avgidle;
636 if ((s64)(now - cl->last) > 0)
640 cbq_update_toplevel(q, this, q->tx_borrowed);
643 static inline struct cbq_class *
644 cbq_under_limit(struct cbq_class *cl)
646 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
647 struct cbq_class *this_cl = cl;
649 if (cl->tparent == NULL)
652 if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
/* This is a very suspicious place. Currently the overlimit
 * action is generated for non-bounded classes
 * only if the link is completely congested.
 * Though this agrees with the ancestor-only paradigm,
 * it looks very unwise. In particular,
 * it means that this chunk of code will either
 * never be called or will strongly amplify
 * burstiness. Dangerous and silly, but
 * no other solution exists.
670 this_cl->qstats.overlimits++;
671 cbq_overlimit(this_cl);
674 if (cl->level > q->toplevel)
676 } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
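/* WRR dequeue within one priority band: the band's classes sit on a
 * circular next_alive ring and take turns; a class may keep sending
 * while its byte deficit stays positive, and the deficit is recharged
 * by cl->quantum when it runs out (see cbq_dequeue_prio() below).
 */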
682 static inline struct sk_buff *
683 cbq_dequeue_prio(struct Qdisc *sch, int prio)
685 struct cbq_sched_data *q = qdisc_priv(sch);
686 struct cbq_class *cl_tail, *cl_prev, *cl;
690 cl_tail = cl_prev = q->active[prio];
691 cl = cl_prev->next_alive;
698 struct cbq_class *borrow = cl;
701 (borrow = cbq_under_limit(cl)) == NULL)
704 if (cl->deficit <= 0) {
705 /* Class exhausted its allotment per
706 * this round. Switch to the next one.
709 cl->deficit += cl->quantum;
713 skb = cl->q->dequeue(cl->q);
/* The class did not give us any skb :-(
 * This can occur even if cl->q->q.qlen != 0,
 * e.g. if cl->q == "tbf"
722 cl->deficit -= qdisc_pkt_len(skb);
724 q->tx_borrowed = borrow;
726 #ifndef CBQ_XSTATS_BORROWS_BYTES
727 borrow->xstats.borrows++;
728 cl->xstats.borrows++;
730 borrow->xstats.borrows += qdisc_pkt_len(skb);
731 cl->xstats.borrows += qdisc_pkt_len(skb);
734 q->tx_len = qdisc_pkt_len(skb);
736 if (cl->deficit <= 0) {
737 q->active[prio] = cl;
739 cl->deficit += cl->quantum;
744 if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
745 /* Class is empty or penalized.
746 * Unlink it from active chain.
748 cl_prev->next_alive = cl->next_alive;
749 cl->next_alive = NULL;
751 /* Did cl_tail point to it? */
756 /* Was it the last class in this band? */
759 q->active[prio] = NULL;
760 q->activemask &= ~(1<<prio);
762 cbq_activate_class(cl);
766 q->active[prio] = cl_tail;
769 cbq_activate_class(cl);
777 } while (cl_prev != cl_tail);
780 q->active[prio] = cl_prev;
785 static inline struct sk_buff *
786 cbq_dequeue_1(struct Qdisc *sch)
788 struct cbq_sched_data *q = qdisc_priv(sch);
790 unsigned int activemask;
792 activemask = q->activemask & 0xFF;
794 int prio = ffz(~activemask);
795 activemask &= ~(1<<prio);
796 skb = cbq_dequeue_prio(sch, prio);
803 static struct sk_buff *
804 cbq_dequeue(struct Qdisc *sch)
807 struct cbq_sched_data *q = qdisc_priv(sch);
810 now = psched_get_time();
820 skb = cbq_dequeue_1(sch);
822 qdisc_bstats_update(sch, skb);
/* All the classes are overlimit.
 *
 * This can happen if:
 *
 * 1. The scheduler is empty.
 * 2. The toplevel cutoff inhibited borrowing.
 * 3. The root class is overlimit.
 *
 * Reset the 2nd and 3rd conditions and retry.
 *
 * Note that NS and cbq-2.0 are buggy: peeking at
 * an arbitrary class is appropriate for ancestor-only
 * sharing, but not for the toplevel algorithm.
 *
 * Our version is better but slower, because it requires
 * two passes; that is unavoidable with top-level sharing.
845 if (q->toplevel == TC_CBQ_MAXLEVEL &&
846 q->link.undertime == PSCHED_PASTPERFECT)
849 q->toplevel = TC_CBQ_MAXLEVEL;
850 q->link.undertime = PSCHED_PASTPERFECT;
/* No packets in the scheduler, or nobody wants to give them to us :-(
 * Sigh... start the watchdog timer in the latter case.
858 qdisc_qstats_overlimit(sch);
860 qdisc_watchdog_schedule(&q->watchdog,
861 now + q->wd_expires);
/* CBQ class maintenance routines */
868 static void cbq_adjust_levels(struct cbq_class *this)
875 struct cbq_class *cl;
880 if (cl->level > level)
882 } while ((cl = cl->sibling) != this->children);
884 this->level = level + 1;
885 } while ((this = this->tparent) != NULL);
888 static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
890 struct cbq_class *cl;
893 if (q->quanta[prio] == 0)
896 for (h = 0; h < q->clhash.hashsize; h++) {
897 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
/* BUGGGG... Beware! This expression suffers from
 * arithmetic overflow!
901 if (cl->priority == prio) {
cl->quantum = (cl->weight * cl->allot * q->nclasses[prio]) / q->quanta[prio];
905 if (cl->quantum <= 0 ||
906 cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
907 pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
908 cl->common.classid, cl->quantum);
909 cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
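/* Illustrative example for the normalization above: with allot = 1514
 * and three classes in the band with weights 1, 2 and 3 (so
 * nclasses[prio] = 3 and quanta[prio] = 6), the weight-2 class gets
 * quantum = 2*1514*3/6 = 1514 bytes per round, i.e. the allot scaled
 * by its weight relative to the band's average weight.
 */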
915 static void cbq_sync_defmap(struct cbq_class *cl)
917 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
918 struct cbq_class *split = cl->split;
925 for (i = 0; i <= TC_PRIO_MAX; i++) {
926 if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
927 split->defaults[i] = NULL;
930 for (i = 0; i <= TC_PRIO_MAX; i++) {
931 int level = split->level;
933 if (split->defaults[i])
936 for (h = 0; h < q->clhash.hashsize; h++) {
939 hlist_for_each_entry(c, &q->clhash.hash[h],
941 if (c->split == split && c->level < level &&
942 c->defmap & (1<<i)) {
943 split->defaults[i] = c;
951 static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
953 struct cbq_class *split = NULL;
959 splitid = split->common.classid;
962 if (split == NULL || split->common.classid != splitid) {
963 for (split = cl->tparent; split; split = split->tparent)
964 if (split->common.classid == splitid)
971 if (cl->split != split) {
975 cl->defmap = def & mask;
977 cl->defmap = (cl->defmap & ~mask) | (def & mask);
982 static void cbq_unlink_class(struct cbq_class *this)
984 struct cbq_class *cl, **clp;
985 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
987 qdisc_class_hash_remove(&q->clhash, &this->common);
990 clp = &this->sibling;
998 } while ((cl = *clp) != this->sibling);
1000 if (this->tparent->children == this) {
1001 this->tparent->children = this->sibling;
1002 if (this->sibling == this)
1003 this->tparent->children = NULL;
1006 WARN_ON(this->sibling != this);
1010 static void cbq_link_class(struct cbq_class *this)
1012 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1013 struct cbq_class *parent = this->tparent;
1015 this->sibling = this;
1016 qdisc_class_hash_insert(&q->clhash, &this->common);
1021 if (parent->children == NULL) {
1022 parent->children = this;
1024 this->sibling = parent->children->sibling;
1025 parent->children->sibling = this;
1030 cbq_reset(struct Qdisc *sch)
1032 struct cbq_sched_data *q = qdisc_priv(sch);
1033 struct cbq_class *cl;
1040 q->tx_borrowed = NULL;
1041 qdisc_watchdog_cancel(&q->watchdog);
1042 hrtimer_cancel(&q->delay_timer);
1043 q->toplevel = TC_CBQ_MAXLEVEL;
1044 q->now = psched_get_time();
1046 for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1047 q->active[prio] = NULL;
1049 for (h = 0; h < q->clhash.hashsize; h++) {
1050 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1053 cl->next_alive = NULL;
1054 cl->undertime = PSCHED_PASTPERFECT;
1055 cl->avgidle = cl->maxidle;
1056 cl->deficit = cl->quantum;
1057 cl->cpriority = cl->priority;
1064 static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1066 if (lss->change & TCF_CBQ_LSS_FLAGS) {
1067 cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1068 cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1070 if (lss->change & TCF_CBQ_LSS_EWMA)
1071 cl->ewma_log = lss->ewma_log;
1072 if (lss->change & TCF_CBQ_LSS_AVPKT)
1073 cl->avpkt = lss->avpkt;
1074 if (lss->change & TCF_CBQ_LSS_MINIDLE)
1075 cl->minidle = -(long)lss->minidle;
1076 if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
1077 cl->maxidle = lss->maxidle;
1078 cl->avgidle = lss->maxidle;
1080 if (lss->change & TCF_CBQ_LSS_OFFTIME)
1081 cl->offtime = lss->offtime;
1085 static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1087 q->nclasses[cl->priority]--;
1088 q->quanta[cl->priority] -= cl->weight;
1089 cbq_normalize_quanta(q, cl->priority);
1092 static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1094 q->nclasses[cl->priority]++;
1095 q->quanta[cl->priority] += cl->weight;
1096 cbq_normalize_quanta(q, cl->priority);
1099 static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1101 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1104 cl->allot = wrr->allot;
1106 cl->weight = wrr->weight;
1107 if (wrr->priority) {
1108 cl->priority = wrr->priority - 1;
1109 cl->cpriority = cl->priority;
1110 if (cl->priority >= cl->priority2)
1111 cl->priority2 = TC_CBQ_MAXPRIO - 1;
1118 static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1120 cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1124 static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1125 [TCA_CBQ_LSSOPT] = { .len = sizeof(struct tc_cbq_lssopt) },
1126 [TCA_CBQ_WRROPT] = { .len = sizeof(struct tc_cbq_wrropt) },
1127 [TCA_CBQ_FOPT] = { .len = sizeof(struct tc_cbq_fopt) },
1128 [TCA_CBQ_OVL_STRATEGY] = { .len = sizeof(struct tc_cbq_ovl) },
1129 [TCA_CBQ_RATE] = { .len = sizeof(struct tc_ratespec) },
1130 [TCA_CBQ_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1131 [TCA_CBQ_POLICE] = { .len = sizeof(struct tc_cbq_police) },
1134 static int cbq_opt_parse(struct nlattr *tb[TCA_CBQ_MAX + 1], struct nlattr *opt)
1141 err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, NULL);
1145 if (tb[TCA_CBQ_WRROPT]) {
1146 const struct tc_cbq_wrropt *wrr = nla_data(tb[TCA_CBQ_WRROPT]);
1148 if (wrr->priority > TC_CBQ_MAXPRIO)
1154 static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
1156 struct cbq_sched_data *q = qdisc_priv(sch);
1157 struct nlattr *tb[TCA_CBQ_MAX + 1];
1158 struct tc_ratespec *r;
1161 qdisc_watchdog_init(&q->watchdog, sch);
1162 hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1163 q->delay_timer.function = cbq_undelay;
1165 err = cbq_opt_parse(tb, opt);
1169 if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL)
1172 r = nla_data(tb[TCA_CBQ_RATE]);
1174 if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL)
1177 err = tcf_block_get(&q->link.block, &q->link.filter_list);
1181 err = qdisc_class_hash_init(&q->clhash);
1185 q->link.sibling = &q->link;
1186 q->link.common.classid = sch->handle;
1187 q->link.qdisc = sch;
1188 q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1191 q->link.q = &noop_qdisc;
1193 qdisc_hash_add(q->link.q, true);
1195 q->link.priority = TC_CBQ_MAXPRIO - 1;
1196 q->link.priority2 = TC_CBQ_MAXPRIO - 1;
1197 q->link.cpriority = TC_CBQ_MAXPRIO - 1;
1198 q->link.allot = psched_mtu(qdisc_dev(sch));
1199 q->link.quantum = q->link.allot;
1200 q->link.weight = q->link.R_tab->rate.rate;
1202 q->link.ewma_log = TC_CBQ_DEF_EWMA;
1203 q->link.avpkt = q->link.allot/2;
1204 q->link.minidle = -0x7FFFFFFF;
1206 q->toplevel = TC_CBQ_MAXLEVEL;
1207 q->now = psched_get_time();
1209 cbq_link_class(&q->link);
1211 if (tb[TCA_CBQ_LSSOPT])
1212 cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1214 cbq_addprio(q, &q->link);
1218 tcf_block_put(q->link.block);
1221 qdisc_put_rtab(q->link.R_tab);
1225 static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1227 unsigned char *b = skb_tail_pointer(skb);
1229 if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
1230 goto nla_put_failure;
1238 static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1240 unsigned char *b = skb_tail_pointer(skb);
1241 struct tc_cbq_lssopt opt;
1244 if (cl->borrow == NULL)
1245 opt.flags |= TCF_CBQ_LSS_BOUNDED;
1246 if (cl->share == NULL)
1247 opt.flags |= TCF_CBQ_LSS_ISOLATED;
1248 opt.ewma_log = cl->ewma_log;
1249 opt.level = cl->level;
1250 opt.avpkt = cl->avpkt;
1251 opt.maxidle = cl->maxidle;
1252 opt.minidle = (u32)(-cl->minidle);
1253 opt.offtime = cl->offtime;
1255 if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
1256 goto nla_put_failure;
1264 static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1266 unsigned char *b = skb_tail_pointer(skb);
1267 struct tc_cbq_wrropt opt;
1269 memset(&opt, 0, sizeof(opt));
1271 opt.allot = cl->allot;
1272 opt.priority = cl->priority + 1;
1273 opt.cpriority = cl->cpriority + 1;
1274 opt.weight = cl->weight;
1275 if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
1276 goto nla_put_failure;
1284 static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1286 unsigned char *b = skb_tail_pointer(skb);
1287 struct tc_cbq_fopt opt;
1289 if (cl->split || cl->defmap) {
1290 opt.split = cl->split ? cl->split->common.classid : 0;
1291 opt.defmap = cl->defmap;
1293 if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
1294 goto nla_put_failure;
1303 static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1305 if (cbq_dump_lss(skb, cl) < 0 ||
1306 cbq_dump_rate(skb, cl) < 0 ||
1307 cbq_dump_wrr(skb, cl) < 0 ||
1308 cbq_dump_fopt(skb, cl) < 0)
1313 static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1315 struct cbq_sched_data *q = qdisc_priv(sch);
1316 struct nlattr *nest;
1318 nest = nla_nest_start(skb, TCA_OPTIONS);
1320 goto nla_put_failure;
1321 if (cbq_dump_attr(skb, &q->link) < 0)
1322 goto nla_put_failure;
1323 return nla_nest_end(skb, nest);
1326 nla_nest_cancel(skb, nest);
1331 cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1333 struct cbq_sched_data *q = qdisc_priv(sch);
1335 q->link.xstats.avgidle = q->link.avgidle;
1336 return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1340 cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1341 struct sk_buff *skb, struct tcmsg *tcm)
1343 struct cbq_class *cl = (struct cbq_class *)arg;
1344 struct nlattr *nest;
1347 tcm->tcm_parent = cl->tparent->common.classid;
1349 tcm->tcm_parent = TC_H_ROOT;
1350 tcm->tcm_handle = cl->common.classid;
1351 tcm->tcm_info = cl->q->handle;
1353 nest = nla_nest_start(skb, TCA_OPTIONS);
1355 goto nla_put_failure;
1356 if (cbq_dump_attr(skb, cl) < 0)
1357 goto nla_put_failure;
1358 return nla_nest_end(skb, nest);
1361 nla_nest_cancel(skb, nest);
1366 cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1367 struct gnet_dump *d)
1369 struct cbq_sched_data *q = qdisc_priv(sch);
1370 struct cbq_class *cl = (struct cbq_class *)arg;
1372 cl->xstats.avgidle = cl->avgidle;
1373 cl->xstats.undertime = 0;
1375 if (cl->undertime != PSCHED_PASTPERFECT)
1376 cl->xstats.undertime = cl->undertime - q->now;
1378 if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
1379 d, NULL, &cl->bstats) < 0 ||
1380 gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1381 gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0)
1384 return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1387 static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1390 struct cbq_class *cl = (struct cbq_class *)arg;
1393 new = qdisc_create_dflt(sch->dev_queue,
1394 &pfifo_qdisc_ops, cl->common.classid);
1399 *old = qdisc_replace(sch, new, &cl->q);
1403 static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
1405 struct cbq_class *cl = (struct cbq_class *)arg;
1410 static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1412 struct cbq_class *cl = (struct cbq_class *)arg;
1414 cbq_deactivate_class(cl);
1417 static unsigned long cbq_find(struct Qdisc *sch, u32 classid)
1419 struct cbq_sched_data *q = qdisc_priv(sch);
1421 return (unsigned long)cbq_class_lookup(q, classid);
1424 static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1426 struct cbq_sched_data *q = qdisc_priv(sch);
1428 WARN_ON(cl->filters);
1430 tcf_block_put(cl->block);
1431 qdisc_destroy(cl->q);
1432 qdisc_put_rtab(cl->R_tab);
1433 gen_kill_estimator(&cl->rate_est);
1438 static void cbq_destroy(struct Qdisc *sch)
1440 struct cbq_sched_data *q = qdisc_priv(sch);
1441 struct hlist_node *next;
1442 struct cbq_class *cl;
1445 #ifdef CONFIG_NET_CLS_ACT
 * Filters must be destroyed first because we don't destroy the
 * classes from root to leaves, which means that filters can still
 * be bound to classes which have been destroyed already. --TGR '04
1453 for (h = 0; h < q->clhash.hashsize; h++) {
1454 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1455 tcf_block_put(cl->block);
1459 for (h = 0; h < q->clhash.hashsize; h++) {
1460 hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
1462 cbq_destroy_class(sch, cl);
1464 qdisc_class_hash_destroy(&q->clhash);
1468 cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1472 struct cbq_sched_data *q = qdisc_priv(sch);
1473 struct cbq_class *cl = (struct cbq_class *)*arg;
1474 struct nlattr *opt = tca[TCA_OPTIONS];
1475 struct nlattr *tb[TCA_CBQ_MAX + 1];
1476 struct cbq_class *parent;
1477 struct qdisc_rate_table *rtab = NULL;
1479 err = cbq_opt_parse(tb, opt);
1483 if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE])
1490 cl->tparent->common.classid != parentid)
1492 if (!cl->tparent && parentid != TC_H_ROOT)
1496 if (tb[TCA_CBQ_RATE]) {
1497 rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
1503 if (tca[TCA_RATE]) {
1504 err = gen_replace_estimator(&cl->bstats, NULL,
1507 qdisc_root_sleeping_running(sch),
1510 qdisc_put_rtab(rtab);
1515 /* Change class parameters */
1518 if (cl->next_alive != NULL)
1519 cbq_deactivate_class(cl);
1522 qdisc_put_rtab(cl->R_tab);
1526 if (tb[TCA_CBQ_LSSOPT])
1527 cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1529 if (tb[TCA_CBQ_WRROPT]) {
1531 cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1534 if (tb[TCA_CBQ_FOPT])
1535 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1538 cbq_activate_class(cl);
1540 sch_tree_unlock(sch);
1545 if (parentid == TC_H_ROOT)
1548 if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL ||
1549 tb[TCA_CBQ_LSSOPT] == NULL)
1552 rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
1558 if (TC_H_MAJ(classid ^ sch->handle) ||
1559 cbq_class_lookup(q, classid))
1563 classid = TC_H_MAKE(sch->handle, 0x8000);
1565 for (i = 0; i < 0x8000; i++) {
1566 if (++q->hgenerator >= 0x8000)
1568 if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1574 classid = classid|q->hgenerator;
1579 parent = cbq_class_lookup(q, parentid);
1586 cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1590 err = tcf_block_get(&cl->block, &cl->filter_list);
1596 if (tca[TCA_RATE]) {
1597 err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
1599 qdisc_root_sleeping_running(sch),
1602 tcf_block_put(cl->block);
1610 cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
1612 cl->q = &noop_qdisc;
1614 qdisc_hash_add(cl->q, true);
1616 cl->common.classid = classid;
1617 cl->tparent = parent;
1619 cl->allot = parent->allot;
1620 cl->quantum = cl->allot;
1621 cl->weight = cl->R_tab->rate.rate;
1625 cl->borrow = cl->tparent;
1626 if (cl->tparent != &q->link)
1627 cl->share = cl->tparent;
1628 cbq_adjust_levels(parent);
1629 cl->minidle = -0x7FFFFFFF;
1630 cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1631 cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1632 if (cl->ewma_log == 0)
1633 cl->ewma_log = q->link.ewma_log;
1634 if (cl->maxidle == 0)
1635 cl->maxidle = q->link.maxidle;
1637 cl->avpkt = q->link.avpkt;
1638 if (tb[TCA_CBQ_FOPT])
1639 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1640 sch_tree_unlock(sch);
1642 qdisc_class_hash_grow(sch, &q->clhash);
1644 *arg = (unsigned long)cl;
1648 qdisc_put_rtab(rtab);
1652 static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1654 struct cbq_sched_data *q = qdisc_priv(sch);
1655 struct cbq_class *cl = (struct cbq_class *)arg;
1656 unsigned int qlen, backlog;
1658 if (cl->filters || cl->children || cl == &q->link)
1663 qlen = cl->q->q.qlen;
1664 backlog = cl->q->qstats.backlog;
1666 qdisc_tree_reduce_backlog(cl->q, qlen, backlog);
1669 cbq_deactivate_class(cl);
1671 if (q->tx_borrowed == cl)
1672 q->tx_borrowed = q->tx_class;
1673 if (q->tx_class == cl) {
1675 q->tx_borrowed = NULL;
1677 #ifdef CONFIG_NET_CLS_ACT
1678 if (q->rx_class == cl)
1682 cbq_unlink_class(cl);
1683 cbq_adjust_levels(cl->tparent);
1685 cbq_sync_defmap(cl);
1688 sch_tree_unlock(sch);
1690 cbq_destroy_class(sch, cl);
1694 static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg)
1696 struct cbq_sched_data *q = qdisc_priv(sch);
1697 struct cbq_class *cl = (struct cbq_class *)arg;
1705 static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1708 struct cbq_sched_data *q = qdisc_priv(sch);
1709 struct cbq_class *p = (struct cbq_class *)parent;
1710 struct cbq_class *cl = cbq_class_lookup(q, classid);
1713 if (p && p->level <= cl->level)
1716 return (unsigned long)cl;
1721 static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
1723 struct cbq_class *cl = (struct cbq_class *)arg;
1728 static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1730 struct cbq_sched_data *q = qdisc_priv(sch);
1731 struct cbq_class *cl;
1737 for (h = 0; h < q->clhash.hashsize; h++) {
1738 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1739 if (arg->count < arg->skip) {
1743 if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1752 static const struct Qdisc_class_ops cbq_class_ops = {
1755 .qlen_notify = cbq_qlen_notify,
1757 .change = cbq_change_class,
1758 .delete = cbq_delete,
1760 .tcf_block = cbq_tcf_block,
1761 .bind_tcf = cbq_bind_filter,
1762 .unbind_tcf = cbq_unbind_filter,
1763 .dump = cbq_dump_class,
1764 .dump_stats = cbq_dump_class_stats,
1767 static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
1769 .cl_ops = &cbq_class_ops,
1771 .priv_size = sizeof(struct cbq_sched_data),
1772 .enqueue = cbq_enqueue,
1773 .dequeue = cbq_dequeue,
1774 .peek = qdisc_peek_dequeued,
1777 .destroy = cbq_destroy,
1780 .dump_stats = cbq_dump_stats,
1781 .owner = THIS_MODULE,
1784 static int __init cbq_module_init(void)
1786 return register_qdisc(&cbq_qdisc_ops);
1788 static void __exit cbq_module_exit(void)
1790 unregister_qdisc(&cbq_qdisc_ops);
1792 module_init(cbq_module_init)
1793 module_exit(cbq_module_exit)
1794 MODULE_LICENSE("GPL");