GNU Linux-libre 5.4.257-gnu1
[releases.git] / net / sched / sch_cbq.c
1 // SPDX-License-Identifier: GPL-2.0-or-later
2 /*
3  * net/sched/sch_cbq.c  Class-Based Queueing discipline.
4  *
5  * Authors:     Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
6  */
7
8 #include <linux/module.h>
9 #include <linux/slab.h>
10 #include <linux/types.h>
11 #include <linux/kernel.h>
12 #include <linux/string.h>
13 #include <linux/errno.h>
14 #include <linux/skbuff.h>
15 #include <net/netlink.h>
16 #include <net/pkt_sched.h>
17 #include <net/pkt_cls.h>
18
19
20 /*      Class-Based Queueing (CBQ) algorithm.
21         =======================================
22
23         Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
24                  Management Models for Packet Networks",
25                  IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
26
27                  [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
28
29                  [3] Sally Floyd, "Notes on Class-Based Queueing: Setting
30                  Parameters", 1996
31
32                  [4] Sally Floyd and Michael Speer, "Experimental Results
33                  for Class-Based Queueing", 1998, not published.
34
35         -----------------------------------------------------------------------
36
37         Algorithm skeleton was taken from NS simulator cbq.cc.
38         If someone wants to check this code against the LBL version,
39         he should take into account that ONLY the skeleton was borrowed,
40         the implementation is different. Particularly:
41
42         --- The WRR algorithm is different. Our version looks more
43         reasonable (I hope) and works when quanta are allowed to be
44         less than MTU, which is always the case when real time classes
45         have small rates. Note, that the statement of [3] is
46         incomplete, delay may actually be estimated even if class
47         per-round allotment is less than MTU. Namely, if per-round
48         allotment is W*r_i, and r_1+...+r_k = r < 1
49
50         delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
51
52         In the worst case we have IntServ estimate with D = W*r+k*MTU
53         and C = MTU*r. The proof (if correct at all) is trivial.
54
55
56         --- It seems that cbq-2.0 is not very accurate. At least, I cannot
57         interpret some places, which look like wrong translations
58         from NS. Anyone is advised to find these differences
59         and explain to me, why I am wrong 8).
60
61         --- Linux has no EOI event, so that we cannot estimate true class
62         idle time. Workaround is to consider the next dequeue event
63         as sign that previous packet is finished. This is wrong because of
64         internal device queueing, but on a permanently loaded link it is true.
65         Moreover, combined with clock integrator, this scheme looks
66         very close to an ideal solution.  */
67
68 struct cbq_sched_data;
69
70
71 struct cbq_class {
72         struct Qdisc_class_common common;
73         struct cbq_class        *next_alive;    /* next class with backlog in this priority band */
74
75 /* Parameters */
76         unsigned char           priority;       /* class priority */
77         unsigned char           priority2;      /* priority to be used after overlimit */
78         unsigned char           ewma_log;       /* time constant for idle time calculation */
79
80         u32                     defmap;
81
82         /* Link-sharing scheduler parameters */
83         long                    maxidle;        /* Class parameters: see below. */
84         long                    offtime;
85         long                    minidle;
86         u32                     avpkt;
87         struct qdisc_rate_table *R_tab;
88
89         /* General scheduler (WRR) parameters */
90         long                    allot;
91         long                    quantum;        /* Allotment per WRR round */
92         long                    weight;         /* Relative allotment: see below */
93
94         struct Qdisc            *qdisc;         /* Ptr to CBQ discipline */
95         struct cbq_class        *split;         /* Ptr to split node */
96         struct cbq_class        *share;         /* Ptr to LS parent in the class tree */
97         struct cbq_class        *tparent;       /* Ptr to tree parent in the class tree */
98         struct cbq_class        *borrow;        /* NULL if class is bandwidth limited;
99                                                    parent otherwise */
100         struct cbq_class        *sibling;       /* Sibling chain */
101         struct cbq_class        *children;      /* Pointer to children chain */
102
103         struct Qdisc            *q;             /* Elementary queueing discipline */
104
105
106 /* Variables */
107         unsigned char           cpriority;      /* Effective priority */
108         unsigned char           delayed;
109         unsigned char           level;          /* level of the class in hierarchy:
110                                                    0 for leaf classes, and maximal
111                                                    level of children + 1 for nodes.
112                                                  */
113
114         psched_time_t           last;           /* Last end of service */
115         psched_time_t           undertime;
116         long                    avgidle;
117         long                    deficit;        /* Saved deficit for WRR */
118         psched_time_t           penalized;
119         struct gnet_stats_basic_packed bstats;
120         struct gnet_stats_queue qstats;
121         struct net_rate_estimator __rcu *rate_est;
122         struct tc_cbq_xstats    xstats;
123
124         struct tcf_proto __rcu  *filter_list;
125         struct tcf_block        *block;
126
127         int                     filters;
128
129         struct cbq_class        *defaults[TC_PRIO_MAX + 1];
130 };
131
132 struct cbq_sched_data {
133         struct Qdisc_class_hash clhash;                 /* Hash table of all classes */
134         int                     nclasses[TC_CBQ_MAXPRIO + 1];
135         unsigned int            quanta[TC_CBQ_MAXPRIO + 1];
136
137         struct cbq_class        link;
138
139         unsigned int            activemask;
140         struct cbq_class        *active[TC_CBQ_MAXPRIO + 1];    /* List of all classes
141                                                                    with backlog */
142
143 #ifdef CONFIG_NET_CLS_ACT
144         struct cbq_class        *rx_class;
145 #endif
146         struct cbq_class        *tx_class;
147         struct cbq_class        *tx_borrowed;
148         int                     tx_len;
149         psched_time_t           now;            /* Cached timestamp */
150         unsigned int            pmask;
151
152         struct hrtimer          delay_timer;
153         struct qdisc_watchdog   watchdog;       /* Watchdog timer,
154                                                    started when CBQ has
155                                                    backlog, but cannot
156                                                    transmit just now */
157         psched_tdiff_t          wd_expires;
158         int                     toplevel;
159         u32                     hgenerator;
160 };
161
162
163 #define L2T(cl, len)    qdisc_l2t((cl)->R_tab, len)
164
165 static inline struct cbq_class *
166 cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
167 {
168         struct Qdisc_class_common *clc;
169
170         clc = qdisc_class_find(&q->clhash, classid);
171         if (clc == NULL)
172                 return NULL;
173         return container_of(clc, struct cbq_class, common);
174 }
175
176 #ifdef CONFIG_NET_CLS_ACT
177
178 static struct cbq_class *
179 cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
180 {
181         struct cbq_class *cl;
182
183         for (cl = this->tparent; cl; cl = cl->tparent) {
184                 struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
185
186                 if (new != NULL && new != this)
187                         return new;
188         }
189         return NULL;
190 }
191
192 #endif
193
194 /* Classify packet. The procedure is pretty complicated, but
195  * it allows us to combine link sharing and priority scheduling
196  * transparently.
197  *
198  * Namely, you can put link sharing rules (f.e. route based) at root of CBQ,
199  * so that it resolves to split nodes. Then packets are classified
200  * by logical priority, or a more specific classifier may be attached
201  * to the split node.
202  */
203
204 static struct cbq_class *
205 cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
206 {
207         struct cbq_sched_data *q = qdisc_priv(sch);
208         struct cbq_class *head = &q->link;
209         struct cbq_class **defmap;
210         struct cbq_class *cl = NULL;
211         u32 prio = skb->priority;
212         struct tcf_proto *fl;
213         struct tcf_result res;
214
215         /*
216          *  Step 1. If skb->priority points to one of our classes, use it.
217          */
218         if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
219             (cl = cbq_class_lookup(q, prio)) != NULL)
220                 return cl;
221
222         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
223         for (;;) {
224                 int result = 0;
225                 defmap = head->defaults;
226
227                 fl = rcu_dereference_bh(head->filter_list);
228                 /*
229                  * Step 2+n. Apply classifier.
230                  */
231                 result = tcf_classify(skb, fl, &res, true);
232                 if (!fl || result < 0)
233                         goto fallback;
234                 if (result == TC_ACT_SHOT)
235                         return NULL;
236
237                 cl = (void *)res.class;
238                 if (!cl) {
239                         if (TC_H_MAJ(res.classid))
240                                 cl = cbq_class_lookup(q, res.classid);
241                         else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
242                                 cl = defmap[TC_PRIO_BESTEFFORT];
243
244                         if (cl == NULL)
245                                 goto fallback;
246                 }
247                 if (cl->level >= head->level)
248                         goto fallback;
249 #ifdef CONFIG_NET_CLS_ACT
250                 switch (result) {
251                 case TC_ACT_QUEUED:
252                 case TC_ACT_STOLEN:
253                 case TC_ACT_TRAP:
254                         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
255                         /* fall through */
256                         fallthrough;
257                 case TC_ACT_RECLASSIFY:
258                         return cbq_reclassify(skb, cl);
259                 }
260 #endif
261                 if (cl->level == 0)
262                         return cl;
263
264                 /*
265                  * Step 3+n. If classifier selected a link sharing class,
266                  *         apply agency specific classifier.
267                  *         Repeat this procdure until we hit a leaf node.
268                  */
269                 head = cl;
270         }
271
272 fallback:
273         cl = head;
274
275         /*
276          * Step 4. No success...
277          */
278         if (TC_H_MAJ(prio) == 0 &&
279             !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
280             !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
281                 return head;
282
283         return cl;
284 }
285
286 /*
287  * A packet has just been enqueued on the empty class.
288  * cbq_activate_class adds it to the tail of active class list
289  * of its priority band.
290  */
291
292 static inline void cbq_activate_class(struct cbq_class *cl)
293 {
294         struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
295         int prio = cl->cpriority;
296         struct cbq_class *cl_tail;
297
298         cl_tail = q->active[prio];
299         q->active[prio] = cl;
300
301         if (cl_tail != NULL) {
302                 cl->next_alive = cl_tail->next_alive;
303                 cl_tail->next_alive = cl;
304         } else {
305                 cl->next_alive = cl;
306                 q->activemask |= (1<<prio);
307         }
308 }
309
310 /*
311  * Unlink class from active chain.
312  * Note that this same procedure is done directly in cbq_dequeue*
313  * during round-robin procedure.
314  */
315
316 static void cbq_deactivate_class(struct cbq_class *this)
317 {
318         struct cbq_sched_data *q = qdisc_priv(this->qdisc);
319         int prio = this->cpriority;
320         struct cbq_class *cl;
321         struct cbq_class *cl_prev = q->active[prio];
322
323         do {
324                 cl = cl_prev->next_alive;
325                 if (cl == this) {
326                         cl_prev->next_alive = cl->next_alive;
327                         cl->next_alive = NULL;
328
329                         if (cl == q->active[prio]) {
330                                 q->active[prio] = cl_prev;
331                                 if (cl == q->active[prio]) {
332                                         q->active[prio] = NULL;
333                                         q->activemask &= ~(1<<prio);
334                                         return;
335                                 }
336                         }
337                         return;
338                 }
339         } while ((cl_prev = cl) != q->active[prio]);
340 }
341
342 static void
343 cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
344 {
345         int toplevel = q->toplevel;
346
347         if (toplevel > cl->level) {
348                 psched_time_t now = psched_get_time();
349
350                 do {
351                         if (cl->undertime < now) {
352                                 q->toplevel = cl->level;
353                                 return;
354                         }
355                 } while ((cl = cl->borrow) != NULL && toplevel > cl->level);
356         }
357 }
358
359 static int
360 cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
361             struct sk_buff **to_free)
362 {
363         struct cbq_sched_data *q = qdisc_priv(sch);
364         int ret;
365         struct cbq_class *cl = cbq_classify(skb, sch, &ret);
366
367 #ifdef CONFIG_NET_CLS_ACT
368         q->rx_class = cl;
369 #endif
370         if (cl == NULL) {
371                 if (ret & __NET_XMIT_BYPASS)
372                         qdisc_qstats_drop(sch);
373                 __qdisc_drop(skb, to_free);
374                 return ret;
375         }
376
377         ret = qdisc_enqueue(skb, cl->q, to_free);
378         if (ret == NET_XMIT_SUCCESS) {
379                 sch->q.qlen++;
380                 cbq_mark_toplevel(q, cl);
381                 if (!cl->next_alive)
382                         cbq_activate_class(cl);
383                 return ret;
384         }
385
386         if (net_xmit_drop_count(ret)) {
387                 qdisc_qstats_drop(sch);
388                 cbq_mark_toplevel(q, cl);
389                 cl->qstats.drops++;
390         }
391         return ret;
392 }
393
394 /* Overlimit action: penalize leaf class by adding offtime */
395 static void cbq_overlimit(struct cbq_class *cl)
396 {
397         struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
398         psched_tdiff_t delay = cl->undertime - q->now;
399
400         if (!cl->delayed) {
401                 delay += cl->offtime;
402
403                 /*
404                  * Class goes to sleep, so that it will have no
405                  * chance to work avgidle. Let's forgive it 8)
406                  *
407                  * BTW cbq-2.0 has a crap in this
408                  * place, apparently they forgot to shift it by cl->ewma_log.
409                  */
410                 if (cl->avgidle < 0)
411                         delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
412                 if (cl->avgidle < cl->minidle)
413                         cl->avgidle = cl->minidle;
414                 if (delay <= 0)
415                         delay = 1;
416                 cl->undertime = q->now + delay;
417
418                 cl->xstats.overactions++;
419                 cl->delayed = 1;
420         }
421         if (q->wd_expires == 0 || q->wd_expires > delay)
422                 q->wd_expires = delay;
423
424         /* Dirty work! We must schedule wakeups based on
425          * real available rate, rather than leaf rate,
426          * which may be tiny (even zero).
427          */
428         if (q->toplevel == TC_CBQ_MAXLEVEL) {
429                 struct cbq_class *b;
430                 psched_tdiff_t base_delay = q->wd_expires;
431
432                 for (b = cl->borrow; b; b = b->borrow) {
433                         delay = b->undertime - q->now;
434                         if (delay < base_delay) {
435                                 if (delay <= 0)
436                                         delay = 1;
437                                 base_delay = delay;
438                         }
439                 }
440
441                 q->wd_expires = base_delay;
442         }
443 }
444
445 static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
446                                        psched_time_t now)
447 {
448         struct cbq_class *cl;
449         struct cbq_class *cl_prev = q->active[prio];
450         psched_time_t sched = now;
451
452         if (cl_prev == NULL)
453                 return 0;
454
455         do {
456                 cl = cl_prev->next_alive;
457                 if (now - cl->penalized > 0) {
458                         cl_prev->next_alive = cl->next_alive;
459                         cl->next_alive = NULL;
460                         cl->cpriority = cl->priority;
461                         cl->delayed = 0;
462                         cbq_activate_class(cl);
463
464                         if (cl == q->active[prio]) {
465                                 q->active[prio] = cl_prev;
466                                 if (cl == q->active[prio]) {
467                                         q->active[prio] = NULL;
468                                         return 0;
469                                 }
470                         }
471
472                         cl = cl_prev->next_alive;
473                 } else if (sched - cl->penalized > 0)
474                         sched = cl->penalized;
475         } while ((cl_prev = cl) != q->active[prio]);
476
477         return sched - now;
478 }
479
480 static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
481 {
482         struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
483                                                 delay_timer);
484         struct Qdisc *sch = q->watchdog.qdisc;
485         psched_time_t now;
486         psched_tdiff_t delay = 0;
487         unsigned int pmask;
488
489         now = psched_get_time();
490
491         pmask = q->pmask;
492         q->pmask = 0;
493
494         while (pmask) {
495                 int prio = ffz(~pmask);
496                 psched_tdiff_t tmp;
497
498                 pmask &= ~(1<<prio);
499
500                 tmp = cbq_undelay_prio(q, prio, now);
501                 if (tmp > 0) {
502                         q->pmask |= 1<<prio;
503                         if (tmp < delay || delay == 0)
504                                 delay = tmp;
505                 }
506         }
507
508         if (delay) {
509                 ktime_t time;
510
511                 time = 0;
512                 time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
513                 hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
514         }
515
516         __netif_schedule(qdisc_root(sch));
517         return HRTIMER_NORESTART;
518 }
519
520 /*
521  * It is mission critical procedure.
522  *
523  * We "regenerate" toplevel cutoff, if transmitting class
524  * has backlog and it is not regulated. It is not part of
525  * original CBQ description, but looks more reasonable.
526  * Probably, it is wrong. This question needs further investigation.
527  */
528
529 static inline void
530 cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
531                     struct cbq_class *borrowed)
532 {
533         if (cl && q->toplevel >= borrowed->level) {
534                 if (cl->q->q.qlen > 1) {
535                         do {
536                                 if (borrowed->undertime == PSCHED_PASTPERFECT) {
537                                         q->toplevel = borrowed->level;
538                                         return;
539                                 }
540                         } while ((borrowed = borrowed->borrow) != NULL);
541                 }
542 #if 0
543         /* It is not necessary now. Uncommenting it
544            will save CPU cycles, but decrease fairness.
545          */
546                 q->toplevel = TC_CBQ_MAXLEVEL;
547 #endif
548         }
549 }
550
551 static void
552 cbq_update(struct cbq_sched_data *q)
553 {
554         struct cbq_class *this = q->tx_class;
555         struct cbq_class *cl = this;
556         int len = q->tx_len;
557         psched_time_t now;
558
559         q->tx_class = NULL;
560         /* Time integrator. We calculate EOS time
561          * by adding expected packet transmission time.
562          */
563         now = q->now + L2T(&q->link, len);
564
565         for ( ; cl; cl = cl->share) {
566                 long avgidle = cl->avgidle;
567                 long idle;
568
569                 cl->bstats.packets++;
570                 cl->bstats.bytes += len;
571
572                 /*
573                  * (now - last) is total time between packet right edges.
574                  * (last_pktlen/rate) is "virtual" busy time, so that
575                  *
576                  *      idle = (now - last) - last_pktlen/rate
577                  */
578
579                 idle = now - cl->last;
580                 if ((unsigned long)idle > 128*1024*1024) {
581                         avgidle = cl->maxidle;
582                 } else {
583                         idle -= L2T(cl, len);
584
585                 /* true_avgidle := (1-W)*true_avgidle + W*idle,
586                  * where W=2^{-ewma_log}. But cl->avgidle is scaled:
587                  * cl->avgidle == true_avgidle/W,
588                  * hence:
589                  */
590                         avgidle += idle - (avgidle>>cl->ewma_log);
591                 }
592
593                 if (avgidle <= 0) {
594                         /* Overlimit or at-limit */
595
596                         if (avgidle < cl->minidle)
597                                 avgidle = cl->minidle;
598
599                         cl->avgidle = avgidle;
600
601                         /* Calculate expected time, when this class
602                          * will be allowed to send.
603                          * It will occur, when:
604                          * (1-W)*true_avgidle + W*delay = 0, i.e.
605                          * idle = (1/W - 1)*(-true_avgidle)
606                          * or
607                          * idle = (1 - W)*(-cl->avgidle);
608                          */
609                         idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
610
611                         /*
612                          * That is not all.
613                          * To maintain the rate allocated to the class,
614                          * we add to undertime virtual clock,
615                          * necessary to complete transmitted packet.
616                          * (len/phys_bandwidth has been already passed
617                          * to the moment of cbq_update)
618                          */
619
620                         idle -= L2T(&q->link, len);
621                         idle += L2T(cl, len);
622
623                         cl->undertime = now + idle;
624                 } else {
625                         /* Underlimit */
626
627                         cl->undertime = PSCHED_PASTPERFECT;
628                         if (avgidle > cl->maxidle)
629                                 cl->avgidle = cl->maxidle;
630                         else
631                                 cl->avgidle = avgidle;
632                 }
633                 if ((s64)(now - cl->last) > 0)
634                         cl->last = now;
635         }
636
637         cbq_update_toplevel(q, this, q->tx_borrowed);
638 }
639
640 static inline struct cbq_class *
641 cbq_under_limit(struct cbq_class *cl)
642 {
643         struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
644         struct cbq_class *this_cl = cl;
645
646         if (cl->tparent == NULL)
647                 return cl;
648
649         if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
650                 cl->delayed = 0;
651                 return cl;
652         }
653
654         do {
655                 /* It is very suspicious place. Now overlimit
656                  * action is generated for not bounded classes
657                  * only if link is completely congested.
658                  * Though it is in agree with ancestor-only paradigm,
659                  * it looks very stupid. Particularly,
660                  * it means that this chunk of code will either
661                  * never be called or result in strong amplification
662                  * of burstiness. Dangerous, silly, and, however,
663                  * no another solution exists.
664                  */
665                 cl = cl->borrow;
666                 if (!cl) {
667                         this_cl->qstats.overlimits++;
668                         cbq_overlimit(this_cl);
669                         return NULL;
670                 }
671                 if (cl->level > q->toplevel)
672                         return NULL;
673         } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
674
675         cl->delayed = 0;
676         return cl;
677 }
678
679 static inline struct sk_buff *
680 cbq_dequeue_prio(struct Qdisc *sch, int prio)
681 {
682         struct cbq_sched_data *q = qdisc_priv(sch);
683         struct cbq_class *cl_tail, *cl_prev, *cl;
684         struct sk_buff *skb;
685         int deficit;
686
687         cl_tail = cl_prev = q->active[prio];
688         cl = cl_prev->next_alive;
689
690         do {
691                 deficit = 0;
692
693                 /* Start round */
694                 do {
695                         struct cbq_class *borrow = cl;
696
697                         if (cl->q->q.qlen &&
698                             (borrow = cbq_under_limit(cl)) == NULL)
699                                 goto skip_class;
700
701                         if (cl->deficit <= 0) {
702                                 /* Class exhausted its allotment per
703                                  * this round. Switch to the next one.
704                                  */
705                                 deficit = 1;
706                                 cl->deficit += cl->quantum;
707                                 goto next_class;
708                         }
709
710                         skb = cl->q->dequeue(cl->q);
711
712                         /* Class did not give us any skb :-(
713                          * It could occur even if cl->q->q.qlen != 0
714                          * f.e. if cl->q == "tbf"
715                          */
716                         if (skb == NULL)
717                                 goto skip_class;
718
719                         cl->deficit -= qdisc_pkt_len(skb);
720                         q->tx_class = cl;
721                         q->tx_borrowed = borrow;
722                         if (borrow != cl) {
723 #ifndef CBQ_XSTATS_BORROWS_BYTES
724                                 borrow->xstats.borrows++;
725                                 cl->xstats.borrows++;
726 #else
727                                 borrow->xstats.borrows += qdisc_pkt_len(skb);
728                                 cl->xstats.borrows += qdisc_pkt_len(skb);
729 #endif
730                         }
731                         q->tx_len = qdisc_pkt_len(skb);
732
733                         if (cl->deficit <= 0) {
734                                 q->active[prio] = cl;
735                                 cl = cl->next_alive;
736                                 cl->deficit += cl->quantum;
737                         }
738                         return skb;
739
740 skip_class:
741                         if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
742                                 /* Class is empty or penalized.
743                                  * Unlink it from active chain.
744                                  */
745                                 cl_prev->next_alive = cl->next_alive;
746                                 cl->next_alive = NULL;
747
748                                 /* Did cl_tail point to it? */
749                                 if (cl == cl_tail) {
750                                         /* Repair it! */
751                                         cl_tail = cl_prev;
752
753                                         /* Was it the last class in this band? */
754                                         if (cl == cl_tail) {
755                                                 /* Kill the band! */
756                                                 q->active[prio] = NULL;
757                                                 q->activemask &= ~(1<<prio);
758                                                 if (cl->q->q.qlen)
759                                                         cbq_activate_class(cl);
760                                                 return NULL;
761                                         }
762
763                                         q->active[prio] = cl_tail;
764                                 }
765                                 if (cl->q->q.qlen)
766                                         cbq_activate_class(cl);
767
768                                 cl = cl_prev;
769                         }
770
771 next_class:
772                         cl_prev = cl;
773                         cl = cl->next_alive;
774                 } while (cl_prev != cl_tail);
775         } while (deficit);
776
777         q->active[prio] = cl_prev;
778
779         return NULL;
780 }
781
782 static inline struct sk_buff *
783 cbq_dequeue_1(struct Qdisc *sch)
784 {
785         struct cbq_sched_data *q = qdisc_priv(sch);
786         struct sk_buff *skb;
787         unsigned int activemask;
788
789         activemask = q->activemask & 0xFF;
790         while (activemask) {
791                 int prio = ffz(~activemask);
792                 activemask &= ~(1<<prio);
793                 skb = cbq_dequeue_prio(sch, prio);
794                 if (skb)
795                         return skb;
796         }
797         return NULL;
798 }
799
800 static struct sk_buff *
801 cbq_dequeue(struct Qdisc *sch)
802 {
803         struct sk_buff *skb;
804         struct cbq_sched_data *q = qdisc_priv(sch);
805         psched_time_t now;
806
807         now = psched_get_time();
808
809         if (q->tx_class)
810                 cbq_update(q);
811
812         q->now = now;
813
814         for (;;) {
815                 q->wd_expires = 0;
816
817                 skb = cbq_dequeue_1(sch);
818                 if (skb) {
819                         qdisc_bstats_update(sch, skb);
820                         sch->q.qlen--;
821                         return skb;
822                 }
823
824                 /* All the classes are overlimit.
825                  *
826                  * It is possible, if:
827                  *
828                  * 1. Scheduler is empty.
829                  * 2. Toplevel cutoff inhibited borrowing.
830                  * 3. Root class is overlimit.
831                  *
832                  * Reset 2d and 3d conditions and retry.
833                  *
834                  * Note, that NS and cbq-2.0 are buggy, peeking
835                  * an arbitrary class is appropriate for ancestor-only
836                  * sharing, but not for toplevel algorithm.
837                  *
838                  * Our version is better, but slower, because it requires
839                  * two passes, but it is unavoidable with top-level sharing.
840                  */
841
842                 if (q->toplevel == TC_CBQ_MAXLEVEL &&
843                     q->link.undertime == PSCHED_PASTPERFECT)
844                         break;
845
846                 q->toplevel = TC_CBQ_MAXLEVEL;
847                 q->link.undertime = PSCHED_PASTPERFECT;
848         }
849
850         /* No packets in scheduler or nobody wants to give them to us :-(
851          * Sigh... start watchdog timer in the last case.
852          */
853
854         if (sch->q.qlen) {
855                 qdisc_qstats_overlimit(sch);
856                 if (q->wd_expires)
857                         qdisc_watchdog_schedule(&q->watchdog,
858                                                 now + q->wd_expires);
859         }
860         return NULL;
861 }
862
863 /* CBQ class maintanance routines */
864
865 static void cbq_adjust_levels(struct cbq_class *this)
866 {
867         if (this == NULL)
868                 return;
869
870         do {
871                 int level = 0;
872                 struct cbq_class *cl;
873
874                 cl = this->children;
875                 if (cl) {
876                         do {
877                                 if (cl->level > level)
878                                         level = cl->level;
879                         } while ((cl = cl->sibling) != this->children);
880                 }
881                 this->level = level + 1;
882         } while ((this = this->tparent) != NULL);
883 }
884
885 static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
886 {
887         struct cbq_class *cl;
888         unsigned int h;
889
890         if (q->quanta[prio] == 0)
891                 return;
892
893         for (h = 0; h < q->clhash.hashsize; h++) {
894                 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
895                         /* BUGGGG... Beware! This expression suffer of
896                          * arithmetic overflows!
897                          */
898                         if (cl->priority == prio) {
899                                 cl->quantum = (cl->weight*cl->allot*q->nclasses[prio])/
900                                         q->quanta[prio];
901                         }
902                         if (cl->quantum <= 0 ||
903                             cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
904                                 pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
905                                         cl->common.classid, cl->quantum);
906                                 cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
907                         }
908                 }
909         }
910 }
911
912 static void cbq_sync_defmap(struct cbq_class *cl)
913 {
914         struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
915         struct cbq_class *split = cl->split;
916         unsigned int h;
917         int i;
918
919         if (split == NULL)
920                 return;
921
922         for (i = 0; i <= TC_PRIO_MAX; i++) {
923                 if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
924                         split->defaults[i] = NULL;
925         }
926
927         for (i = 0; i <= TC_PRIO_MAX; i++) {
928                 int level = split->level;
929
930                 if (split->defaults[i])
931                         continue;
932
933                 for (h = 0; h < q->clhash.hashsize; h++) {
934                         struct cbq_class *c;
935
936                         hlist_for_each_entry(c, &q->clhash.hash[h],
937                                              common.hnode) {
938                                 if (c->split == split && c->level < level &&
939                                     c->defmap & (1<<i)) {
940                                         split->defaults[i] = c;
941                                         level = c->level;
942                                 }
943                         }
944                 }
945         }
946 }
947
948 static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
949 {
950         struct cbq_class *split = NULL;
951
952         if (splitid == 0) {
953                 split = cl->split;
954                 if (!split)
955                         return;
956                 splitid = split->common.classid;
957         }
958
959         if (split == NULL || split->common.classid != splitid) {
960                 for (split = cl->tparent; split; split = split->tparent)
961                         if (split->common.classid == splitid)
962                                 break;
963         }
964
965         if (split == NULL)
966                 return;
967
968         if (cl->split != split) {
969                 cl->defmap = 0;
970                 cbq_sync_defmap(cl);
971                 cl->split = split;
972                 cl->defmap = def & mask;
973         } else
974                 cl->defmap = (cl->defmap & ~mask) | (def & mask);
975
976         cbq_sync_defmap(cl);
977 }
978
979 static void cbq_unlink_class(struct cbq_class *this)
980 {
981         struct cbq_class *cl, **clp;
982         struct cbq_sched_data *q = qdisc_priv(this->qdisc);
983
984         qdisc_class_hash_remove(&q->clhash, &this->common);
985
986         if (this->tparent) {
987                 clp = &this->sibling;
988                 cl = *clp;
989                 do {
990                         if (cl == this) {
991                                 *clp = cl->sibling;
992                                 break;
993                         }
994                         clp = &cl->sibling;
995                 } while ((cl = *clp) != this->sibling);
996
997                 if (this->tparent->children == this) {
998                         this->tparent->children = this->sibling;
999                         if (this->sibling == this)
1000                                 this->tparent->children = NULL;
1001                 }
1002         } else {
1003                 WARN_ON(this->sibling != this);
1004         }
1005 }
1006
1007 static void cbq_link_class(struct cbq_class *this)
1008 {
1009         struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1010         struct cbq_class *parent = this->tparent;
1011
1012         this->sibling = this;
1013         qdisc_class_hash_insert(&q->clhash, &this->common);
1014
1015         if (parent == NULL)
1016                 return;
1017
1018         if (parent->children == NULL) {
1019                 parent->children = this;
1020         } else {
1021                 this->sibling = parent->children->sibling;
1022                 parent->children->sibling = this;
1023         }
1024 }
1025
1026 static void
1027 cbq_reset(struct Qdisc *sch)
1028 {
1029         struct cbq_sched_data *q = qdisc_priv(sch);
1030         struct cbq_class *cl;
1031         int prio;
1032         unsigned int h;
1033
1034         q->activemask = 0;
1035         q->pmask = 0;
1036         q->tx_class = NULL;
1037         q->tx_borrowed = NULL;
1038         qdisc_watchdog_cancel(&q->watchdog);
1039         hrtimer_cancel(&q->delay_timer);
1040         q->toplevel = TC_CBQ_MAXLEVEL;
1041         q->now = psched_get_time();
1042
1043         for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1044                 q->active[prio] = NULL;
1045
1046         for (h = 0; h < q->clhash.hashsize; h++) {
1047                 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1048                         qdisc_reset(cl->q);
1049
1050                         cl->next_alive = NULL;
1051                         cl->undertime = PSCHED_PASTPERFECT;
1052                         cl->avgidle = cl->maxidle;
1053                         cl->deficit = cl->quantum;
1054                         cl->cpriority = cl->priority;
1055                 }
1056         }
1057         sch->q.qlen = 0;
1058 }
1059
1060
1061 static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1062 {
1063         if (lss->change & TCF_CBQ_LSS_FLAGS) {
1064                 cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1065                 cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1066         }
1067         if (lss->change & TCF_CBQ_LSS_EWMA)
1068                 cl->ewma_log = lss->ewma_log;
1069         if (lss->change & TCF_CBQ_LSS_AVPKT)
1070                 cl->avpkt = lss->avpkt;
1071         if (lss->change & TCF_CBQ_LSS_MINIDLE)
1072                 cl->minidle = -(long)lss->minidle;
1073         if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
1074                 cl->maxidle = lss->maxidle;
1075                 cl->avgidle = lss->maxidle;
1076         }
1077         if (lss->change & TCF_CBQ_LSS_OFFTIME)
1078                 cl->offtime = lss->offtime;
1079         return 0;
1080 }
1081
1082 static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1083 {
1084         q->nclasses[cl->priority]--;
1085         q->quanta[cl->priority] -= cl->weight;
1086         cbq_normalize_quanta(q, cl->priority);
1087 }
1088
1089 static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1090 {
1091         q->nclasses[cl->priority]++;
1092         q->quanta[cl->priority] += cl->weight;
1093         cbq_normalize_quanta(q, cl->priority);
1094 }
1095
1096 static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1097 {
1098         struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1099
1100         if (wrr->allot)
1101                 cl->allot = wrr->allot;
1102         if (wrr->weight)
1103                 cl->weight = wrr->weight;
1104         if (wrr->priority) {
1105                 cl->priority = wrr->priority - 1;
1106                 cl->cpriority = cl->priority;
1107                 if (cl->priority >= cl->priority2)
1108                         cl->priority2 = TC_CBQ_MAXPRIO - 1;
1109         }
1110
1111         cbq_addprio(q, cl);
1112         return 0;
1113 }
1114
1115 static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1116 {
1117         cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1118         return 0;
1119 }
1120
1121 static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1122         [TCA_CBQ_LSSOPT]        = { .len = sizeof(struct tc_cbq_lssopt) },
1123         [TCA_CBQ_WRROPT]        = { .len = sizeof(struct tc_cbq_wrropt) },
1124         [TCA_CBQ_FOPT]          = { .len = sizeof(struct tc_cbq_fopt) },
1125         [TCA_CBQ_OVL_STRATEGY]  = { .len = sizeof(struct tc_cbq_ovl) },
1126         [TCA_CBQ_RATE]          = { .len = sizeof(struct tc_ratespec) },
1127         [TCA_CBQ_RTAB]          = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1128         [TCA_CBQ_POLICE]        = { .len = sizeof(struct tc_cbq_police) },
1129 };
1130
1131 static int cbq_opt_parse(struct nlattr *tb[TCA_CBQ_MAX + 1],
1132                          struct nlattr *opt,
1133                          struct netlink_ext_ack *extack)
1134 {
1135         int err;
1136
1137         if (!opt) {
1138                 NL_SET_ERR_MSG(extack, "CBQ options are required for this operation");
1139                 return -EINVAL;
1140         }
1141
1142         err = nla_parse_nested_deprecated(tb, TCA_CBQ_MAX, opt,
1143                                           cbq_policy, extack);
1144         if (err < 0)
1145                 return err;
1146
1147         if (tb[TCA_CBQ_WRROPT]) {
1148                 const struct tc_cbq_wrropt *wrr = nla_data(tb[TCA_CBQ_WRROPT]);
1149
1150                 if (wrr->priority > TC_CBQ_MAXPRIO) {
1151                         NL_SET_ERR_MSG(extack, "priority is bigger than TC_CBQ_MAXPRIO");
1152                         err = -EINVAL;
1153                 }
1154         }
1155         return err;
1156 }
1157
1158 static int cbq_init(struct Qdisc *sch, struct nlattr *opt,
1159                     struct netlink_ext_ack *extack)
1160 {
1161         struct cbq_sched_data *q = qdisc_priv(sch);
1162         struct nlattr *tb[TCA_CBQ_MAX + 1];
1163         struct tc_ratespec *r;
1164         int err;
1165
1166         qdisc_watchdog_init(&q->watchdog, sch);
1167         hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1168         q->delay_timer.function = cbq_undelay;
1169
1170         err = cbq_opt_parse(tb, opt, extack);
1171         if (err < 0)
1172                 return err;
1173
1174         if (!tb[TCA_CBQ_RTAB] || !tb[TCA_CBQ_RATE]) {
1175                 NL_SET_ERR_MSG(extack, "Rate specification missing or incomplete");
1176                 return -EINVAL;
1177         }
1178
1179         r = nla_data(tb[TCA_CBQ_RATE]);
1180
1181         q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB], extack);
1182         if (!q->link.R_tab)
1183                 return -EINVAL;
1184
1185         err = tcf_block_get(&q->link.block, &q->link.filter_list, sch, extack);
1186         if (err)
1187                 goto put_rtab;
1188
1189         err = qdisc_class_hash_init(&q->clhash);
1190         if (err < 0)
1191                 goto put_block;
1192
1193         q->link.sibling = &q->link;
1194         q->link.common.classid = sch->handle;
1195         q->link.qdisc = sch;
1196         q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1197                                       sch->handle, NULL);
1198         if (!q->link.q)
1199                 q->link.q = &noop_qdisc;
1200         else
1201                 qdisc_hash_add(q->link.q, true);
1202
1203         q->link.priority = TC_CBQ_MAXPRIO - 1;
1204         q->link.priority2 = TC_CBQ_MAXPRIO - 1;
1205         q->link.cpriority = TC_CBQ_MAXPRIO - 1;
1206         q->link.allot = psched_mtu(qdisc_dev(sch));
1207         q->link.quantum = q->link.allot;
1208         q->link.weight = q->link.R_tab->rate.rate;
1209
1210         q->link.ewma_log = TC_CBQ_DEF_EWMA;
1211         q->link.avpkt = q->link.allot/2;
1212         q->link.minidle = -0x7FFFFFFF;
1213
1214         q->toplevel = TC_CBQ_MAXLEVEL;
1215         q->now = psched_get_time();
1216
1217         cbq_link_class(&q->link);
1218
1219         if (tb[TCA_CBQ_LSSOPT])
1220                 cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1221
1222         cbq_addprio(q, &q->link);
1223         return 0;
1224
1225 put_block:
1226         tcf_block_put(q->link.block);
1227
1228 put_rtab:
1229         qdisc_put_rtab(q->link.R_tab);
1230         return err;
1231 }
1232
1233 static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1234 {
1235         unsigned char *b = skb_tail_pointer(skb);
1236
1237         if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
1238                 goto nla_put_failure;
1239         return skb->len;
1240
1241 nla_put_failure:
1242         nlmsg_trim(skb, b);
1243         return -1;
1244 }
1245
1246 static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1247 {
1248         unsigned char *b = skb_tail_pointer(skb);
1249         struct tc_cbq_lssopt opt;
1250
1251         opt.flags = 0;
1252         if (cl->borrow == NULL)
1253                 opt.flags |= TCF_CBQ_LSS_BOUNDED;
1254         if (cl->share == NULL)
1255                 opt.flags |= TCF_CBQ_LSS_ISOLATED;
1256         opt.ewma_log = cl->ewma_log;
1257         opt.level = cl->level;
1258         opt.avpkt = cl->avpkt;
1259         opt.maxidle = cl->maxidle;
1260         opt.minidle = (u32)(-cl->minidle);
1261         opt.offtime = cl->offtime;
1262         opt.change = ~0;
1263         if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
1264                 goto nla_put_failure;
1265         return skb->len;
1266
1267 nla_put_failure:
1268         nlmsg_trim(skb, b);
1269         return -1;
1270 }
1271
1272 static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1273 {
1274         unsigned char *b = skb_tail_pointer(skb);
1275         struct tc_cbq_wrropt opt;
1276
1277         memset(&opt, 0, sizeof(opt));
1278         opt.flags = 0;
1279         opt.allot = cl->allot;
1280         opt.priority = cl->priority + 1;
1281         opt.cpriority = cl->cpriority + 1;
1282         opt.weight = cl->weight;
1283         if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
1284                 goto nla_put_failure;
1285         return skb->len;
1286
1287 nla_put_failure:
1288         nlmsg_trim(skb, b);
1289         return -1;
1290 }
1291
1292 static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1293 {
1294         unsigned char *b = skb_tail_pointer(skb);
1295         struct tc_cbq_fopt opt;
1296
1297         if (cl->split || cl->defmap) {
1298                 opt.split = cl->split ? cl->split->common.classid : 0;
1299                 opt.defmap = cl->defmap;
1300                 opt.defchange = ~0;
1301                 if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
1302                         goto nla_put_failure;
1303         }
1304         return skb->len;
1305
1306 nla_put_failure:
1307         nlmsg_trim(skb, b);
1308         return -1;
1309 }
1310
1311 static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1312 {
1313         if (cbq_dump_lss(skb, cl) < 0 ||
1314             cbq_dump_rate(skb, cl) < 0 ||
1315             cbq_dump_wrr(skb, cl) < 0 ||
1316             cbq_dump_fopt(skb, cl) < 0)
1317                 return -1;
1318         return 0;
1319 }
1320
1321 static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1322 {
1323         struct cbq_sched_data *q = qdisc_priv(sch);
1324         struct nlattr *nest;
1325
1326         nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1327         if (nest == NULL)
1328                 goto nla_put_failure;
1329         if (cbq_dump_attr(skb, &q->link) < 0)
1330                 goto nla_put_failure;
1331         return nla_nest_end(skb, nest);
1332
1333 nla_put_failure:
1334         nla_nest_cancel(skb, nest);
1335         return -1;
1336 }
1337
1338 static int
1339 cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1340 {
1341         struct cbq_sched_data *q = qdisc_priv(sch);
1342
1343         q->link.xstats.avgidle = q->link.avgidle;
1344         return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1345 }
1346
1347 static int
1348 cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1349                struct sk_buff *skb, struct tcmsg *tcm)
1350 {
1351         struct cbq_class *cl = (struct cbq_class *)arg;
1352         struct nlattr *nest;
1353
1354         if (cl->tparent)
1355                 tcm->tcm_parent = cl->tparent->common.classid;
1356         else
1357                 tcm->tcm_parent = TC_H_ROOT;
1358         tcm->tcm_handle = cl->common.classid;
1359         tcm->tcm_info = cl->q->handle;
1360
1361         nest = nla_nest_start_noflag(skb, TCA_OPTIONS);
1362         if (nest == NULL)
1363                 goto nla_put_failure;
1364         if (cbq_dump_attr(skb, cl) < 0)
1365                 goto nla_put_failure;
1366         return nla_nest_end(skb, nest);
1367
1368 nla_put_failure:
1369         nla_nest_cancel(skb, nest);
1370         return -1;
1371 }
1372
1373 static int
1374 cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1375         struct gnet_dump *d)
1376 {
1377         struct cbq_sched_data *q = qdisc_priv(sch);
1378         struct cbq_class *cl = (struct cbq_class *)arg;
1379         __u32 qlen;
1380
1381         cl->xstats.avgidle = cl->avgidle;
1382         cl->xstats.undertime = 0;
1383         qdisc_qstats_qlen_backlog(cl->q, &qlen, &cl->qstats.backlog);
1384
1385         if (cl->undertime != PSCHED_PASTPERFECT)
1386                 cl->xstats.undertime = cl->undertime - q->now;
1387
1388         if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
1389                                   d, NULL, &cl->bstats) < 0 ||
1390             gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1391             gnet_stats_copy_queue(d, NULL, &cl->qstats, qlen) < 0)
1392                 return -1;
1393
1394         return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1395 }
1396
1397 static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1398                      struct Qdisc **old, struct netlink_ext_ack *extack)
1399 {
1400         struct cbq_class *cl = (struct cbq_class *)arg;
1401
1402         if (new == NULL) {
1403                 new = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1404                                         cl->common.classid, extack);
1405                 if (new == NULL)
1406                         return -ENOBUFS;
1407         }
1408
1409         *old = qdisc_replace(sch, new, &cl->q);
1410         return 0;
1411 }
1412
1413 static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
1414 {
1415         struct cbq_class *cl = (struct cbq_class *)arg;
1416
1417         return cl->q;
1418 }
1419
1420 static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1421 {
1422         struct cbq_class *cl = (struct cbq_class *)arg;
1423
1424         cbq_deactivate_class(cl);
1425 }
1426
1427 static unsigned long cbq_find(struct Qdisc *sch, u32 classid)
1428 {
1429         struct cbq_sched_data *q = qdisc_priv(sch);
1430
1431         return (unsigned long)cbq_class_lookup(q, classid);
1432 }
1433
1434 static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1435 {
1436         struct cbq_sched_data *q = qdisc_priv(sch);
1437
1438         WARN_ON(cl->filters);
1439
1440         tcf_block_put(cl->block);
1441         qdisc_put(cl->q);
1442         qdisc_put_rtab(cl->R_tab);
1443         gen_kill_estimator(&cl->rate_est);
1444         if (cl != &q->link)
1445                 kfree(cl);
1446 }
1447
1448 static void cbq_destroy(struct Qdisc *sch)
1449 {
1450         struct cbq_sched_data *q = qdisc_priv(sch);
1451         struct hlist_node *next;
1452         struct cbq_class *cl;
1453         unsigned int h;
1454
1455 #ifdef CONFIG_NET_CLS_ACT
1456         q->rx_class = NULL;
1457 #endif
1458         /*
1459          * Filters must be destroyed first because we don't destroy the
1460          * classes from root to leafs which means that filters can still
1461          * be bound to classes which have been destroyed already. --TGR '04
1462          */
1463         for (h = 0; h < q->clhash.hashsize; h++) {
1464                 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1465                         tcf_block_put(cl->block);
1466                         cl->block = NULL;
1467                 }
1468         }
1469         for (h = 0; h < q->clhash.hashsize; h++) {
1470                 hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
1471                                           common.hnode)
1472                         cbq_destroy_class(sch, cl);
1473         }
1474         qdisc_class_hash_destroy(&q->clhash);
1475 }
1476
1477 static int
1478 cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1479                  unsigned long *arg, struct netlink_ext_ack *extack)
1480 {
1481         int err;
1482         struct cbq_sched_data *q = qdisc_priv(sch);
1483         struct cbq_class *cl = (struct cbq_class *)*arg;
1484         struct nlattr *opt = tca[TCA_OPTIONS];
1485         struct nlattr *tb[TCA_CBQ_MAX + 1];
1486         struct cbq_class *parent;
1487         struct qdisc_rate_table *rtab = NULL;
1488
1489         err = cbq_opt_parse(tb, opt, extack);
1490         if (err < 0)
1491                 return err;
1492
1493         if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE]) {
1494                 NL_SET_ERR_MSG(extack, "Neither overlimit strategy nor policing attributes can be used for changing class params");
1495                 return -EOPNOTSUPP;
1496         }
1497
1498         if (cl) {
1499                 /* Check parent */
1500                 if (parentid) {
1501                         if (cl->tparent &&
1502                             cl->tparent->common.classid != parentid) {
1503                                 NL_SET_ERR_MSG(extack, "Invalid parent id");
1504                                 return -EINVAL;
1505                         }
1506                         if (!cl->tparent && parentid != TC_H_ROOT) {
1507                                 NL_SET_ERR_MSG(extack, "Parent must be root");
1508                                 return -EINVAL;
1509                         }
1510                 }
1511
1512                 if (tb[TCA_CBQ_RATE]) {
1513                         rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
1514                                               tb[TCA_CBQ_RTAB], extack);
1515                         if (rtab == NULL)
1516                                 return -EINVAL;
1517                 }
1518
1519                 if (tca[TCA_RATE]) {
1520                         err = gen_replace_estimator(&cl->bstats, NULL,
1521                                                     &cl->rate_est,
1522                                                     NULL,
1523                                                     qdisc_root_sleeping_running(sch),
1524                                                     tca[TCA_RATE]);
1525                         if (err) {
1526                                 NL_SET_ERR_MSG(extack, "Failed to replace specified rate estimator");
1527                                 qdisc_put_rtab(rtab);
1528                                 return err;
1529                         }
1530                 }
1531
1532                 /* Change class parameters */
1533                 sch_tree_lock(sch);
1534
1535                 if (cl->next_alive != NULL)
1536                         cbq_deactivate_class(cl);
1537
1538                 if (rtab) {
1539                         qdisc_put_rtab(cl->R_tab);
1540                         cl->R_tab = rtab;
1541                 }
1542
1543                 if (tb[TCA_CBQ_LSSOPT])
1544                         cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1545
1546                 if (tb[TCA_CBQ_WRROPT]) {
1547                         cbq_rmprio(q, cl);
1548                         cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1549                 }
1550
1551                 if (tb[TCA_CBQ_FOPT])
1552                         cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1553
1554                 if (cl->q->q.qlen)
1555                         cbq_activate_class(cl);
1556
1557                 sch_tree_unlock(sch);
1558
1559                 return 0;
1560         }
1561
1562         if (parentid == TC_H_ROOT)
1563                 return -EINVAL;
1564
1565         if (!tb[TCA_CBQ_WRROPT] || !tb[TCA_CBQ_RATE] || !tb[TCA_CBQ_LSSOPT]) {
1566                 NL_SET_ERR_MSG(extack, "One of the following attributes MUST be specified: WRR, rate or link sharing");
1567                 return -EINVAL;
1568         }
1569
1570         rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB],
1571                               extack);
1572         if (rtab == NULL)
1573                 return -EINVAL;
1574
1575         if (classid) {
1576                 err = -EINVAL;
1577                 if (TC_H_MAJ(classid ^ sch->handle) ||
1578                     cbq_class_lookup(q, classid)) {
1579                         NL_SET_ERR_MSG(extack, "Specified class not found");
1580                         goto failure;
1581                 }
1582         } else {
1583                 int i;
1584                 classid = TC_H_MAKE(sch->handle, 0x8000);
1585
1586                 for (i = 0; i < 0x8000; i++) {
1587                         if (++q->hgenerator >= 0x8000)
1588                                 q->hgenerator = 1;
1589                         if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1590                                 break;
1591                 }
1592                 err = -ENOSR;
1593                 if (i >= 0x8000) {
1594                         NL_SET_ERR_MSG(extack, "Unable to generate classid");
1595                         goto failure;
1596                 }
1597                 classid = classid|q->hgenerator;
1598         }
1599
1600         parent = &q->link;
1601         if (parentid) {
1602                 parent = cbq_class_lookup(q, parentid);
1603                 err = -EINVAL;
1604                 if (!parent) {
1605                         NL_SET_ERR_MSG(extack, "Failed to find parentid");
1606                         goto failure;
1607                 }
1608         }
1609
1610         err = -ENOBUFS;
1611         cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1612         if (cl == NULL)
1613                 goto failure;
1614
1615         err = tcf_block_get(&cl->block, &cl->filter_list, sch, extack);
1616         if (err) {
1617                 kfree(cl);
1618                 goto failure;
1619         }
1620
1621         if (tca[TCA_RATE]) {
1622                 err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
1623                                         NULL,
1624                                         qdisc_root_sleeping_running(sch),
1625                                         tca[TCA_RATE]);
1626                 if (err) {
1627                         NL_SET_ERR_MSG(extack, "Couldn't create new estimator");
1628                         tcf_block_put(cl->block);
1629                         kfree(cl);
1630                         goto failure;
1631                 }
1632         }
1633
1634         cl->R_tab = rtab;
1635         rtab = NULL;
1636         cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid,
1637                                   NULL);
1638         if (!cl->q)
1639                 cl->q = &noop_qdisc;
1640         else
1641                 qdisc_hash_add(cl->q, true);
1642
1643         cl->common.classid = classid;
1644         cl->tparent = parent;
1645         cl->qdisc = sch;
1646         cl->allot = parent->allot;
1647         cl->quantum = cl->allot;
1648         cl->weight = cl->R_tab->rate.rate;
1649
1650         sch_tree_lock(sch);
1651         cbq_link_class(cl);
1652         cl->borrow = cl->tparent;
1653         if (cl->tparent != &q->link)
1654                 cl->share = cl->tparent;
1655         cbq_adjust_levels(parent);
1656         cl->minidle = -0x7FFFFFFF;
1657         cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1658         cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1659         if (cl->ewma_log == 0)
1660                 cl->ewma_log = q->link.ewma_log;
1661         if (cl->maxidle == 0)
1662                 cl->maxidle = q->link.maxidle;
1663         if (cl->avpkt == 0)
1664                 cl->avpkt = q->link.avpkt;
1665         if (tb[TCA_CBQ_FOPT])
1666                 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1667         sch_tree_unlock(sch);
1668
1669         qdisc_class_hash_grow(sch, &q->clhash);
1670
1671         *arg = (unsigned long)cl;
1672         return 0;
1673
1674 failure:
1675         qdisc_put_rtab(rtab);
1676         return err;
1677 }
1678
1679 static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1680 {
1681         struct cbq_sched_data *q = qdisc_priv(sch);
1682         struct cbq_class *cl = (struct cbq_class *)arg;
1683
1684         if (cl->filters || cl->children || cl == &q->link)
1685                 return -EBUSY;
1686
1687         sch_tree_lock(sch);
1688
1689         qdisc_purge_queue(cl->q);
1690
1691         if (cl->next_alive)
1692                 cbq_deactivate_class(cl);
1693
1694         if (q->tx_borrowed == cl)
1695                 q->tx_borrowed = q->tx_class;
1696         if (q->tx_class == cl) {
1697                 q->tx_class = NULL;
1698                 q->tx_borrowed = NULL;
1699         }
1700 #ifdef CONFIG_NET_CLS_ACT
1701         if (q->rx_class == cl)
1702                 q->rx_class = NULL;
1703 #endif
1704
1705         cbq_unlink_class(cl);
1706         cbq_adjust_levels(cl->tparent);
1707         cl->defmap = 0;
1708         cbq_sync_defmap(cl);
1709
1710         cbq_rmprio(q, cl);
1711         sch_tree_unlock(sch);
1712
1713         cbq_destroy_class(sch, cl);
1714         return 0;
1715 }
1716
1717 static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg,
1718                                        struct netlink_ext_ack *extack)
1719 {
1720         struct cbq_sched_data *q = qdisc_priv(sch);
1721         struct cbq_class *cl = (struct cbq_class *)arg;
1722
1723         if (cl == NULL)
1724                 cl = &q->link;
1725
1726         return cl->block;
1727 }
1728
1729 static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1730                                      u32 classid)
1731 {
1732         struct cbq_sched_data *q = qdisc_priv(sch);
1733         struct cbq_class *p = (struct cbq_class *)parent;
1734         struct cbq_class *cl = cbq_class_lookup(q, classid);
1735
1736         if (cl) {
1737                 if (p && p->level <= cl->level)
1738                         return 0;
1739                 cl->filters++;
1740                 return (unsigned long)cl;
1741         }
1742         return 0;
1743 }
1744
1745 static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
1746 {
1747         struct cbq_class *cl = (struct cbq_class *)arg;
1748
1749         cl->filters--;
1750 }
1751
1752 static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1753 {
1754         struct cbq_sched_data *q = qdisc_priv(sch);
1755         struct cbq_class *cl;
1756         unsigned int h;
1757
1758         if (arg->stop)
1759                 return;
1760
1761         for (h = 0; h < q->clhash.hashsize; h++) {
1762                 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1763                         if (arg->count < arg->skip) {
1764                                 arg->count++;
1765                                 continue;
1766                         }
1767                         if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1768                                 arg->stop = 1;
1769                                 return;
1770                         }
1771                         arg->count++;
1772                 }
1773         }
1774 }
1775
1776 static const struct Qdisc_class_ops cbq_class_ops = {
1777         .graft          =       cbq_graft,
1778         .leaf           =       cbq_leaf,
1779         .qlen_notify    =       cbq_qlen_notify,
1780         .find           =       cbq_find,
1781         .change         =       cbq_change_class,
1782         .delete         =       cbq_delete,
1783         .walk           =       cbq_walk,
1784         .tcf_block      =       cbq_tcf_block,
1785         .bind_tcf       =       cbq_bind_filter,
1786         .unbind_tcf     =       cbq_unbind_filter,
1787         .dump           =       cbq_dump_class,
1788         .dump_stats     =       cbq_dump_class_stats,
1789 };
1790
1791 static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
1792         .next           =       NULL,
1793         .cl_ops         =       &cbq_class_ops,
1794         .id             =       "cbq",
1795         .priv_size      =       sizeof(struct cbq_sched_data),
1796         .enqueue        =       cbq_enqueue,
1797         .dequeue        =       cbq_dequeue,
1798         .peek           =       qdisc_peek_dequeued,
1799         .init           =       cbq_init,
1800         .reset          =       cbq_reset,
1801         .destroy        =       cbq_destroy,
1802         .change         =       NULL,
1803         .dump           =       cbq_dump,
1804         .dump_stats     =       cbq_dump_stats,
1805         .owner          =       THIS_MODULE,
1806 };
1807
1808 static int __init cbq_module_init(void)
1809 {
1810         return register_qdisc(&cbq_qdisc_ops);
1811 }
1812 static void __exit cbq_module_exit(void)
1813 {
1814         unregister_qdisc(&cbq_qdisc_ops);
1815 }
1816 module_init(cbq_module_init)
1817 module_exit(cbq_module_exit)
1818 MODULE_LICENSE("GPL");