2 * net/sched/sch_cbq.c Class-Based Queueing discipline.
4 * This program is free software; you can redistribute it and/or
5 * modify it under the terms of the GNU General Public License
6 * as published by the Free Software Foundation; either version
7 * 2 of the License, or (at your option) any later version.
9 * Authors: Alexey Kuznetsov, <kuznet@ms2.inr.ac.ru>
13 #include <linux/module.h>
14 #include <linux/slab.h>
15 #include <linux/types.h>
16 #include <linux/kernel.h>
17 #include <linux/string.h>
18 #include <linux/errno.h>
19 #include <linux/skbuff.h>
20 #include <net/netlink.h>
21 #include <net/pkt_sched.h>
22 #include <net/pkt_cls.h>
25 /* Class-Based Queueing (CBQ) algorithm.
26 =======================================
28 Sources: [1] Sally Floyd and Van Jacobson, "Link-sharing and Resource
29 Management Models for Packet Networks",
30 IEEE/ACM Transactions on Networking, Vol.3, No.4, 1995
32 [2] Sally Floyd, "Notes on CBQ and Guaranteed Service", 1995
[3] Sally Floyd, "Notes on Class-Based Queueing: Setting Parameters", 1996
37 [4] Sally Floyd and Michael Speer, "Experimental Results
38 for Class-Based Queueing", 1998, not published.
40 -----------------------------------------------------------------------
42 Algorithm skeleton was taken from NS simulator cbq.cc.
If someone wants to check this code against the LBL version,
they should take into account that ONLY the skeleton was borrowed;
the implementation is different. In particular:
47 --- The WRR algorithm is different. Our version looks more
48 reasonable (I hope) and works when quanta are allowed to be
49 less than MTU, which is always the case when real time classes
have small rates. Note that the statement in [3] is
incomplete: delay may actually be estimated even if the class
per-round allotment is less than MTU. Namely, if the per-round
allotment is W*r_i, and r_1+...+r_k = r < 1, then
55 delay_i <= ([MTU/(W*r_i)]*W*r + W*r + k*MTU)/B
57 In the worst case we have IntServ estimate with D = W*r+k*MTU
58 and C = MTU*r. The proof (if correct at all) is trivial.
--- It seems that cbq-2.0 is not very accurate. At least, I cannot
interpret some places, which look like wrong translations
from NS. Anyone is welcome to find these differences
and explain to me why I am wrong 8).
--- Linux has no EOI event, so we cannot estimate true class
idle time. The workaround is to consider the next dequeue event
as a sign that the previous packet has finished. This is wrong because of
internal device queueing, but on a permanently loaded link it is true.
Moreover, combined with the clock integrator, this scheme looks
very close to an ideal solution. */
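/* Illustrative sketch (added for exposition; the helper below is not
 * used by the scheduler itself): the scaled EWMA that cbq_update()
 * applies to per-class idle time.  With W = 2^(-ewma_log) the true
 * update is
 *
 *	true_avgidle := (1-W)*true_avgidle + W*idle
 *
 * but cl->avgidle stores true_avgidle/W, so the update reduces to a
 * shift and an add.
 */
static inline long cbq_ewma_idle_sketch(long scaled_avgidle, long idle,
					unsigned char ewma_log)
{
	/* equivalent to the update in cbq_update():
	 * avgidle += idle - (avgidle >> ewma_log)
	 */
	return scaled_avgidle + idle - (scaled_avgidle >> ewma_log);
}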
73 struct cbq_sched_data;
77 struct Qdisc_class_common common;
78 struct cbq_class *next_alive; /* next class with backlog in this priority band */
81 unsigned char priority; /* class priority */
82 unsigned char priority2; /* priority to be used after overlimit */
83 unsigned char ewma_log; /* time constant for idle time calculation */
87 /* Link-sharing scheduler parameters */
88 long maxidle; /* Class parameters: see below. */
92 struct qdisc_rate_table *R_tab;
94 /* General scheduler (WRR) parameters */
96 long quantum; /* Allotment per WRR round */
97 long weight; /* Relative allotment: see below */
99 struct Qdisc *qdisc; /* Ptr to CBQ discipline */
100 struct cbq_class *split; /* Ptr to split node */
101 struct cbq_class *share; /* Ptr to LS parent in the class tree */
102 struct cbq_class *tparent; /* Ptr to tree parent in the class tree */
struct cbq_class *borrow; /* NULL if class is bandwidth limited; parent otherwise */
105 struct cbq_class *sibling; /* Sibling chain */
106 struct cbq_class *children; /* Pointer to children chain */
108 struct Qdisc *q; /* Elementary queueing discipline */
112 unsigned char cpriority; /* Effective priority */
113 unsigned char delayed;
114 unsigned char level; /* level of the class in hierarchy:
115 0 for leaf classes, and maximal
116 level of children + 1 for nodes.
119 psched_time_t last; /* Last end of service */
120 psched_time_t undertime;
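/* undertime: time when the class will become underlimit again;
 * PSCHED_PASTPERFECT while the class is not being regulated
 * (see cbq_under_limit()).
 */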
122 long deficit; /* Saved deficit for WRR */
123 psched_time_t penalized;
124 struct gnet_stats_basic_packed bstats;
125 struct gnet_stats_queue qstats;
126 struct net_rate_estimator __rcu *rate_est;
127 struct tc_cbq_xstats xstats;
129 struct tcf_proto __rcu *filter_list;
130 struct tcf_block *block;
134 struct cbq_class *defaults[TC_PRIO_MAX + 1];
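/* defaults[]: for a split node, the default class to use for each
 * TC_PRIO_* value when classification yields no specific class;
 * kept in sync by cbq_sync_defmap().
 */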
137 struct cbq_sched_data {
138 struct Qdisc_class_hash clhash; /* Hash table of all classes */
139 int nclasses[TC_CBQ_MAXPRIO + 1];
140 unsigned int quanta[TC_CBQ_MAXPRIO + 1];
142 struct cbq_class link;
144 unsigned int activemask;
struct cbq_class *active[TC_CBQ_MAXPRIO + 1]; /* List of all classes with backlog */
148 #ifdef CONFIG_NET_CLS_ACT
149 struct cbq_class *rx_class;
151 struct cbq_class *tx_class;
152 struct cbq_class *tx_borrowed;
154 psched_time_t now; /* Cached timestamp */
157 struct hrtimer delay_timer;
struct qdisc_watchdog watchdog; /* Watchdog timer, started when CBQ has backlog but cannot transmit yet */
162 psched_tdiff_t wd_expires;
168 #define L2T(cl, len) qdisc_l2t((cl)->R_tab, len)
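/* L2T ("length to time") above converts a packet length into the
 * transmission time, in psched ticks, at the class's configured rate,
 * via the rate table attached to the class.
 */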
170 static inline struct cbq_class *
171 cbq_class_lookup(struct cbq_sched_data *q, u32 classid)
173 struct Qdisc_class_common *clc;
175 clc = qdisc_class_find(&q->clhash, classid);
178 return container_of(clc, struct cbq_class, common);
181 #ifdef CONFIG_NET_CLS_ACT
183 static struct cbq_class *
184 cbq_reclassify(struct sk_buff *skb, struct cbq_class *this)
186 struct cbq_class *cl;
188 for (cl = this->tparent; cl; cl = cl->tparent) {
189 struct cbq_class *new = cl->defaults[TC_PRIO_BESTEFFORT];
191 if (new != NULL && new != this)
/* Classify the packet. The procedure is pretty complicated, but
 * it allows us to combine link sharing and priority scheduling
 * transparently.
 *
 * Namely, you can put link-sharing rules (e.g. route-based) at the root of CBQ,
 * so that it resolves to split nodes. Then packets are classified
 * by logical priority, or a more specific classifier may be attached
 * to the split node.
 */
209 static struct cbq_class *
210 cbq_classify(struct sk_buff *skb, struct Qdisc *sch, int *qerr)
212 struct cbq_sched_data *q = qdisc_priv(sch);
213 struct cbq_class *head = &q->link;
214 struct cbq_class **defmap;
215 struct cbq_class *cl = NULL;
216 u32 prio = skb->priority;
217 struct tcf_proto *fl;
218 struct tcf_result res;
221 * Step 1. If skb->priority points to one of our classes, use it.
223 if (TC_H_MAJ(prio ^ sch->handle) == 0 &&
224 (cl = cbq_class_lookup(q, prio)) != NULL)
227 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
230 defmap = head->defaults;
232 fl = rcu_dereference_bh(head->filter_list);
234 * Step 2+n. Apply classifier.
236 result = tcf_classify(skb, fl, &res, true);
237 if (!fl || result < 0)
240 cl = (void *)res.class;
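/* Resolve the filter verdict: a full classid selects a class
 * directly, otherwise the low bits index the split node's
 * defaults[] table, with TC_PRIO_BESTEFFORT as the fallback.
 */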
242 if (TC_H_MAJ(res.classid))
243 cl = cbq_class_lookup(q, res.classid);
244 else if ((cl = defmap[res.classid & TC_PRIO_MAX]) == NULL)
245 cl = defmap[TC_PRIO_BESTEFFORT];
250 if (cl->level >= head->level)
252 #ifdef CONFIG_NET_CLS_ACT
257 *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
260 case TC_ACT_RECLASSIFY:
261 return cbq_reclassify(skb, cl);
 * Step 3+n. If the classifier selected a link-sharing class,
 * apply the agency-specific classifier.
 * Repeat this procedure until we hit a leaf node.
279 * Step 4. No success...
281 if (TC_H_MAJ(prio) == 0 &&
282 !(cl = head->defaults[prio & TC_PRIO_MAX]) &&
283 !(cl = head->defaults[TC_PRIO_BESTEFFORT]))
 * A packet has just been enqueued on an empty class.
 * cbq_activate_class adds it to the tail of the active class list
 * of its priority band.
295 static inline void cbq_activate_class(struct cbq_class *cl)
297 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
298 int prio = cl->cpriority;
299 struct cbq_class *cl_tail;
301 cl_tail = q->active[prio];
302 q->active[prio] = cl;
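/* q->active[prio] always points at the tail of a circular next_alive
 * ring; cbq_dequeue_prio() starts each round at
 * q->active[prio]->next_alive, i.e. at the head.
 */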
304 if (cl_tail != NULL) {
305 cl->next_alive = cl_tail->next_alive;
306 cl_tail->next_alive = cl;
309 q->activemask |= (1<<prio);
 * Unlink the class from the active chain.
 * Note that the same procedure is done directly in cbq_dequeue*
 * during the round-robin procedure.
319 static void cbq_deactivate_class(struct cbq_class *this)
321 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
322 int prio = this->cpriority;
323 struct cbq_class *cl;
324 struct cbq_class *cl_prev = q->active[prio];
327 cl = cl_prev->next_alive;
329 cl_prev->next_alive = cl->next_alive;
330 cl->next_alive = NULL;
332 if (cl == q->active[prio]) {
333 q->active[prio] = cl_prev;
334 if (cl == q->active[prio]) {
335 q->active[prio] = NULL;
336 q->activemask &= ~(1<<prio);
342 } while ((cl_prev = cl) != q->active[prio]);
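/* Top-level link sharing: q->toplevel is the lowest level at which some
 * class has recently been found underlimit.  cbq_under_limit() allows a
 * class to borrow only from ancestors whose level does not exceed
 * q->toplevel; enqueueing to an underlimit class lowers the cutoff in
 * cbq_mark_toplevel() below.
 */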
346 cbq_mark_toplevel(struct cbq_sched_data *q, struct cbq_class *cl)
348 int toplevel = q->toplevel;
350 if (toplevel > cl->level) {
351 psched_time_t now = psched_get_time();
354 if (cl->undertime < now) {
355 q->toplevel = cl->level;
358 } while ((cl = cl->borrow) != NULL && toplevel > cl->level);
363 cbq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
364 struct sk_buff **to_free)
366 struct cbq_sched_data *q = qdisc_priv(sch);
367 int uninitialized_var(ret);
368 struct cbq_class *cl = cbq_classify(skb, sch, &ret);
370 #ifdef CONFIG_NET_CLS_ACT
374 if (ret & __NET_XMIT_BYPASS)
375 qdisc_qstats_drop(sch);
376 __qdisc_drop(skb, to_free);
380 ret = qdisc_enqueue(skb, cl->q, to_free);
381 if (ret == NET_XMIT_SUCCESS) {
383 cbq_mark_toplevel(q, cl);
385 cbq_activate_class(cl);
389 if (net_xmit_drop_count(ret)) {
390 qdisc_qstats_drop(sch);
391 cbq_mark_toplevel(q, cl);
397 /* Overlimit action: penalize leaf class by adding offtime */
398 static void cbq_overlimit(struct cbq_class *cl)
400 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
401 psched_tdiff_t delay = cl->undertime - q->now;
404 delay += cl->offtime;
 * The class goes to sleep, so it will have no
 * chance to work off its avgidle. Let's forgive it 8)
 *
 * BTW cbq-2.0 has a bug in this
 * place; apparently they forgot to shift it by cl->ewma_log.
414 delay -= (-cl->avgidle) - ((-cl->avgidle) >> cl->ewma_log);
415 if (cl->avgidle < cl->minidle)
416 cl->avgidle = cl->minidle;
419 cl->undertime = q->now + delay;
421 cl->xstats.overactions++;
424 if (q->wd_expires == 0 || q->wd_expires > delay)
425 q->wd_expires = delay;
427 /* Dirty work! We must schedule wakeups based on
428 * real available rate, rather than leaf rate,
429 * which may be tiny (even zero).
431 if (q->toplevel == TC_CBQ_MAXLEVEL) {
433 psched_tdiff_t base_delay = q->wd_expires;
435 for (b = cl->borrow; b; b = b->borrow) {
436 delay = b->undertime - q->now;
437 if (delay < base_delay) {
444 q->wd_expires = base_delay;
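/* Classes that have been temporarily demoted to a penalty band are moved
 * back to their normal priority band by cbq_undelay_prio() once q->now
 * passes cl->penalized; the earliest remaining wakeup is used by
 * cbq_undelay() to re-arm the delay timer.
 */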
448 static psched_tdiff_t cbq_undelay_prio(struct cbq_sched_data *q, int prio,
451 struct cbq_class *cl;
452 struct cbq_class *cl_prev = q->active[prio];
453 psched_time_t sched = now;
459 cl = cl_prev->next_alive;
460 if (now - cl->penalized > 0) {
461 cl_prev->next_alive = cl->next_alive;
462 cl->next_alive = NULL;
463 cl->cpriority = cl->priority;
465 cbq_activate_class(cl);
467 if (cl == q->active[prio]) {
468 q->active[prio] = cl_prev;
469 if (cl == q->active[prio]) {
470 q->active[prio] = NULL;
475 cl = cl_prev->next_alive;
476 } else if (sched - cl->penalized > 0)
477 sched = cl->penalized;
478 } while ((cl_prev = cl) != q->active[prio]);
483 static enum hrtimer_restart cbq_undelay(struct hrtimer *timer)
485 struct cbq_sched_data *q = container_of(timer, struct cbq_sched_data,
487 struct Qdisc *sch = q->watchdog.qdisc;
489 psched_tdiff_t delay = 0;
492 now = psched_get_time();
498 int prio = ffz(~pmask);
503 tmp = cbq_undelay_prio(q, prio, now);
506 if (tmp < delay || delay == 0)
515 time = ktime_add_ns(time, PSCHED_TICKS2NS(now + delay));
516 hrtimer_start(&q->delay_timer, time, HRTIMER_MODE_ABS_PINNED);
519 __netif_schedule(qdisc_root(sch));
520 return HRTIMER_NORESTART;
 * This is a mission-critical procedure.
 *
 * We "regenerate" the toplevel cutoff if the transmitting class
 * has backlog and is not regulated. This is not part of the
 * original CBQ description, but it looks more reasonable.
 * It is probably wrong; this question needs further investigation.
533 cbq_update_toplevel(struct cbq_sched_data *q, struct cbq_class *cl,
534 struct cbq_class *borrowed)
536 if (cl && q->toplevel >= borrowed->level) {
537 if (cl->q->q.qlen > 1) {
539 if (borrowed->undertime == PSCHED_PASTPERFECT) {
540 q->toplevel = borrowed->level;
543 } while ((borrowed = borrowed->borrow) != NULL);
546 /* It is not necessary now. Uncommenting it
547 will save CPU cycles, but decrease fairness.
549 q->toplevel = TC_CBQ_MAXLEVEL;
555 cbq_update(struct cbq_sched_data *q)
557 struct cbq_class *this = q->tx_class;
558 struct cbq_class *cl = this;
/* Time integrator. We calculate the EOS (end of service) time
 * by adding the expected packet transmission time.
566 now = q->now + L2T(&q->link, len);
568 for ( ; cl; cl = cl->share) {
569 long avgidle = cl->avgidle;
572 cl->bstats.packets++;
573 cl->bstats.bytes += len;
576 * (now - last) is total time between packet right edges.
577 * (last_pktlen/rate) is "virtual" busy time, so that
579 * idle = (now - last) - last_pktlen/rate
582 idle = now - cl->last;
583 if ((unsigned long)idle > 128*1024*1024) {
584 avgidle = cl->maxidle;
586 idle -= L2T(cl, len);
588 /* true_avgidle := (1-W)*true_avgidle + W*idle,
589 * where W=2^{-ewma_log}. But cl->avgidle is scaled:
590 * cl->avgidle == true_avgidle/W,
593 avgidle += idle - (avgidle>>cl->ewma_log);
597 /* Overlimit or at-limit */
599 if (avgidle < cl->minidle)
600 avgidle = cl->minidle;
602 cl->avgidle = avgidle;
/* Calculate the expected time when this class
 * will be allowed to send.
 * It will occur when:
607 * (1-W)*true_avgidle + W*delay = 0, i.e.
608 * idle = (1/W - 1)*(-true_avgidle)
610 * idle = (1 - W)*(-cl->avgidle);
612 idle = (-avgidle) - ((-avgidle) >> cl->ewma_log);
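/* i.e. idle = (1 - W) * (-avgidle), computed with a shift */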
 * To maintain the rate allocated to the class,
 * we add to undertime the virtual clock time
 * necessary to complete the transmitted packet.
 * (len/phys_bandwidth has already elapsed by
 * the time cbq_update is called.)
623 idle -= L2T(&q->link, len);
624 idle += L2T(cl, len);
626 cl->undertime = now + idle;
630 cl->undertime = PSCHED_PASTPERFECT;
631 if (avgidle > cl->maxidle)
632 cl->avgidle = cl->maxidle;
634 cl->avgidle = avgidle;
636 if ((s64)(now - cl->last) > 0)
640 cbq_update_toplevel(q, this, q->tx_borrowed);
643 static inline struct cbq_class *
644 cbq_under_limit(struct cbq_class *cl)
646 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
647 struct cbq_class *this_cl = cl;
649 if (cl->tparent == NULL)
652 if (cl->undertime == PSCHED_PASTPERFECT || q->now >= cl->undertime) {
/* This is a very suspicious place. Currently the overlimit
 * action is generated for non-bounded classes
 * only if the link is completely congested.
 * Though this agrees with the ancestor-only paradigm,
 * it looks very unwise. In particular,
 * it means that this chunk of code will either
 * never be called or will strongly amplify
 * burstiness. Dangerous and silly, but
 * no other solution exists.
670 this_cl->qstats.overlimits++;
671 cbq_overlimit(this_cl);
674 if (cl->level > q->toplevel)
676 } while (cl->undertime != PSCHED_PASTPERFECT && q->now < cl->undertime);
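/* WRR dequeue within one priority band: the band's classes sit on a
 * circular next_alive ring and take turns; a class may keep sending
 * while its byte deficit stays positive, and the deficit is recharged
 * by cl->quantum when it runs out (see cbq_dequeue_prio() below).
 */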
682 static inline struct sk_buff *
683 cbq_dequeue_prio(struct Qdisc *sch, int prio)
685 struct cbq_sched_data *q = qdisc_priv(sch);
686 struct cbq_class *cl_tail, *cl_prev, *cl;
690 cl_tail = cl_prev = q->active[prio];
691 cl = cl_prev->next_alive;
698 struct cbq_class *borrow = cl;
701 (borrow = cbq_under_limit(cl)) == NULL)
704 if (cl->deficit <= 0) {
705 /* Class exhausted its allotment per
706 * this round. Switch to the next one.
709 cl->deficit += cl->quantum;
713 skb = cl->q->dequeue(cl->q);
/* The class did not give us any skb :-(
 * This can occur even if cl->q->q.qlen != 0,
 * e.g. if cl->q == "tbf"
722 cl->deficit -= qdisc_pkt_len(skb);
724 q->tx_borrowed = borrow;
726 #ifndef CBQ_XSTATS_BORROWS_BYTES
727 borrow->xstats.borrows++;
728 cl->xstats.borrows++;
730 borrow->xstats.borrows += qdisc_pkt_len(skb);
731 cl->xstats.borrows += qdisc_pkt_len(skb);
734 q->tx_len = qdisc_pkt_len(skb);
736 if (cl->deficit <= 0) {
737 q->active[prio] = cl;
739 cl->deficit += cl->quantum;
744 if (cl->q->q.qlen == 0 || prio != cl->cpriority) {
745 /* Class is empty or penalized.
746 * Unlink it from active chain.
748 cl_prev->next_alive = cl->next_alive;
749 cl->next_alive = NULL;
751 /* Did cl_tail point to it? */
756 /* Was it the last class in this band? */
759 q->active[prio] = NULL;
760 q->activemask &= ~(1<<prio);
762 cbq_activate_class(cl);
766 q->active[prio] = cl_tail;
769 cbq_activate_class(cl);
777 } while (cl_prev != cl_tail);
780 q->active[prio] = cl_prev;
785 static inline struct sk_buff *
786 cbq_dequeue_1(struct Qdisc *sch)
788 struct cbq_sched_data *q = qdisc_priv(sch);
790 unsigned int activemask;
792 activemask = q->activemask & 0xFF;
794 int prio = ffz(~activemask);
795 activemask &= ~(1<<prio);
796 skb = cbq_dequeue_prio(sch, prio);
803 static struct sk_buff *
804 cbq_dequeue(struct Qdisc *sch)
807 struct cbq_sched_data *q = qdisc_priv(sch);
810 now = psched_get_time();
820 skb = cbq_dequeue_1(sch);
822 qdisc_bstats_update(sch, skb);
/* All the classes are overlimit.
 *
 * This can happen if:
 *
 * 1. The scheduler is empty.
 * 2. The toplevel cutoff inhibited borrowing.
 * 3. The root class is overlimit.
 *
 * Reset the 2nd and 3rd conditions and retry.
 *
 * Note that NS and cbq-2.0 are buggy: peeking at
 * an arbitrary class is appropriate for ancestor-only
 * sharing, but not for the toplevel algorithm.
 *
 * Our version is better but slower, because it requires
 * two passes; that is unavoidable with top-level sharing.
845 if (q->toplevel == TC_CBQ_MAXLEVEL &&
846 q->link.undertime == PSCHED_PASTPERFECT)
849 q->toplevel = TC_CBQ_MAXLEVEL;
850 q->link.undertime = PSCHED_PASTPERFECT;
/* No packets in the scheduler, or nobody wants to give them to us :-(
 * Sigh... start the watchdog timer in the latter case.
858 qdisc_qstats_overlimit(sch);
860 qdisc_watchdog_schedule(&q->watchdog,
861 now + q->wd_expires);
/* CBQ class maintenance routines */
868 static void cbq_adjust_levels(struct cbq_class *this)
875 struct cbq_class *cl;
880 if (cl->level > level)
882 } while ((cl = cl->sibling) != this->children);
884 this->level = level + 1;
885 } while ((this = this->tparent) != NULL);
888 static void cbq_normalize_quanta(struct cbq_sched_data *q, int prio)
890 struct cbq_class *cl;
893 if (q->quanta[prio] == 0)
896 for (h = 0; h < q->clhash.hashsize; h++) {
897 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
/* BUGGGG... Beware! This expression suffers from
 * arithmetic overflow!
901 if (cl->priority == prio) {
cl->quantum = (cl->weight * cl->allot * q->nclasses[prio]) / q->quanta[prio];
905 if (cl->quantum <= 0 ||
906 cl->quantum > 32*qdisc_dev(cl->qdisc)->mtu) {
907 pr_warn("CBQ: class %08x has bad quantum==%ld, repaired.\n",
908 cl->common.classid, cl->quantum);
909 cl->quantum = qdisc_dev(cl->qdisc)->mtu/2 + 1;
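/* Illustrative example for the normalization above: with allot = 1514
 * and three classes in the band with weights 1, 2 and 3 (so
 * nclasses[prio] = 3 and quanta[prio] = 6), the weight-2 class gets
 * quantum = 2*1514*3/6 = 1514 bytes per round, i.e. the allot scaled
 * by its weight relative to the band's average weight.
 */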
915 static void cbq_sync_defmap(struct cbq_class *cl)
917 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
918 struct cbq_class *split = cl->split;
925 for (i = 0; i <= TC_PRIO_MAX; i++) {
926 if (split->defaults[i] == cl && !(cl->defmap & (1<<i)))
927 split->defaults[i] = NULL;
930 for (i = 0; i <= TC_PRIO_MAX; i++) {
931 int level = split->level;
933 if (split->defaults[i])
936 for (h = 0; h < q->clhash.hashsize; h++) {
939 hlist_for_each_entry(c, &q->clhash.hash[h],
941 if (c->split == split && c->level < level &&
942 c->defmap & (1<<i)) {
943 split->defaults[i] = c;
951 static void cbq_change_defmap(struct cbq_class *cl, u32 splitid, u32 def, u32 mask)
953 struct cbq_class *split = NULL;
959 splitid = split->common.classid;
962 if (split == NULL || split->common.classid != splitid) {
963 for (split = cl->tparent; split; split = split->tparent)
964 if (split->common.classid == splitid)
971 if (cl->split != split) {
975 cl->defmap = def & mask;
977 cl->defmap = (cl->defmap & ~mask) | (def & mask);
982 static void cbq_unlink_class(struct cbq_class *this)
984 struct cbq_class *cl, **clp;
985 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
987 qdisc_class_hash_remove(&q->clhash, &this->common);
990 clp = &this->sibling;
998 } while ((cl = *clp) != this->sibling);
1000 if (this->tparent->children == this) {
1001 this->tparent->children = this->sibling;
1002 if (this->sibling == this)
1003 this->tparent->children = NULL;
1006 WARN_ON(this->sibling != this);
1010 static void cbq_link_class(struct cbq_class *this)
1012 struct cbq_sched_data *q = qdisc_priv(this->qdisc);
1013 struct cbq_class *parent = this->tparent;
1015 this->sibling = this;
1016 qdisc_class_hash_insert(&q->clhash, &this->common);
1021 if (parent->children == NULL) {
1022 parent->children = this;
1024 this->sibling = parent->children->sibling;
1025 parent->children->sibling = this;
1030 cbq_reset(struct Qdisc *sch)
1032 struct cbq_sched_data *q = qdisc_priv(sch);
1033 struct cbq_class *cl;
1040 q->tx_borrowed = NULL;
1041 qdisc_watchdog_cancel(&q->watchdog);
1042 hrtimer_cancel(&q->delay_timer);
1043 q->toplevel = TC_CBQ_MAXLEVEL;
1044 q->now = psched_get_time();
1046 for (prio = 0; prio <= TC_CBQ_MAXPRIO; prio++)
1047 q->active[prio] = NULL;
1049 for (h = 0; h < q->clhash.hashsize; h++) {
1050 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1053 cl->next_alive = NULL;
1054 cl->undertime = PSCHED_PASTPERFECT;
1055 cl->avgidle = cl->maxidle;
1056 cl->deficit = cl->quantum;
1057 cl->cpriority = cl->priority;
1064 static int cbq_set_lss(struct cbq_class *cl, struct tc_cbq_lssopt *lss)
1066 if (lss->change & TCF_CBQ_LSS_FLAGS) {
1067 cl->share = (lss->flags & TCF_CBQ_LSS_ISOLATED) ? NULL : cl->tparent;
1068 cl->borrow = (lss->flags & TCF_CBQ_LSS_BOUNDED) ? NULL : cl->tparent;
1070 if (lss->change & TCF_CBQ_LSS_EWMA)
1071 cl->ewma_log = lss->ewma_log;
1072 if (lss->change & TCF_CBQ_LSS_AVPKT)
1073 cl->avpkt = lss->avpkt;
1074 if (lss->change & TCF_CBQ_LSS_MINIDLE)
1075 cl->minidle = -(long)lss->minidle;
1076 if (lss->change & TCF_CBQ_LSS_MAXIDLE) {
1077 cl->maxidle = lss->maxidle;
1078 cl->avgidle = lss->maxidle;
1080 if (lss->change & TCF_CBQ_LSS_OFFTIME)
1081 cl->offtime = lss->offtime;
1085 static void cbq_rmprio(struct cbq_sched_data *q, struct cbq_class *cl)
1087 q->nclasses[cl->priority]--;
1088 q->quanta[cl->priority] -= cl->weight;
1089 cbq_normalize_quanta(q, cl->priority);
1092 static void cbq_addprio(struct cbq_sched_data *q, struct cbq_class *cl)
1094 q->nclasses[cl->priority]++;
1095 q->quanta[cl->priority] += cl->weight;
1096 cbq_normalize_quanta(q, cl->priority);
1099 static int cbq_set_wrr(struct cbq_class *cl, struct tc_cbq_wrropt *wrr)
1101 struct cbq_sched_data *q = qdisc_priv(cl->qdisc);
1104 cl->allot = wrr->allot;
1106 cl->weight = wrr->weight;
1107 if (wrr->priority) {
1108 cl->priority = wrr->priority - 1;
1109 cl->cpriority = cl->priority;
1110 if (cl->priority >= cl->priority2)
1111 cl->priority2 = TC_CBQ_MAXPRIO - 1;
1118 static int cbq_set_fopt(struct cbq_class *cl, struct tc_cbq_fopt *fopt)
1120 cbq_change_defmap(cl, fopt->split, fopt->defmap, fopt->defchange);
1124 static const struct nla_policy cbq_policy[TCA_CBQ_MAX + 1] = {
1125 [TCA_CBQ_LSSOPT] = { .len = sizeof(struct tc_cbq_lssopt) },
1126 [TCA_CBQ_WRROPT] = { .len = sizeof(struct tc_cbq_wrropt) },
1127 [TCA_CBQ_FOPT] = { .len = sizeof(struct tc_cbq_fopt) },
1128 [TCA_CBQ_OVL_STRATEGY] = { .len = sizeof(struct tc_cbq_ovl) },
1129 [TCA_CBQ_RATE] = { .len = sizeof(struct tc_ratespec) },
1130 [TCA_CBQ_RTAB] = { .type = NLA_BINARY, .len = TC_RTAB_SIZE },
1131 [TCA_CBQ_POLICE] = { .len = sizeof(struct tc_cbq_police) },
1134 static int cbq_opt_parse(struct nlattr *tb[TCA_CBQ_MAX + 1], struct nlattr *opt)
1141 err = nla_parse_nested(tb, TCA_CBQ_MAX, opt, cbq_policy, NULL);
1145 if (tb[TCA_CBQ_WRROPT]) {
1146 const struct tc_cbq_wrropt *wrr = nla_data(tb[TCA_CBQ_WRROPT]);
1148 if (wrr->priority > TC_CBQ_MAXPRIO)
1154 static int cbq_init(struct Qdisc *sch, struct nlattr *opt)
1156 struct cbq_sched_data *q = qdisc_priv(sch);
1157 struct nlattr *tb[TCA_CBQ_MAX + 1];
1158 struct tc_ratespec *r;
1161 qdisc_watchdog_init(&q->watchdog, sch);
1162 hrtimer_init(&q->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
1163 q->delay_timer.function = cbq_undelay;
1165 err = cbq_opt_parse(tb, opt);
1169 if (tb[TCA_CBQ_RTAB] == NULL || tb[TCA_CBQ_RATE] == NULL)
1172 r = nla_data(tb[TCA_CBQ_RATE]);
1174 if ((q->link.R_tab = qdisc_get_rtab(r, tb[TCA_CBQ_RTAB])) == NULL)
1177 err = tcf_block_get(&q->link.block, &q->link.filter_list);
1181 err = qdisc_class_hash_init(&q->clhash);
1185 q->link.sibling = &q->link;
1186 q->link.common.classid = sch->handle;
1187 q->link.qdisc = sch;
1188 q->link.q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops,
1191 q->link.q = &noop_qdisc;
1193 qdisc_hash_add(q->link.q, true);
1195 q->link.priority = TC_CBQ_MAXPRIO - 1;
1196 q->link.priority2 = TC_CBQ_MAXPRIO - 1;
1197 q->link.cpriority = TC_CBQ_MAXPRIO - 1;
1198 q->link.allot = psched_mtu(qdisc_dev(sch));
1199 q->link.quantum = q->link.allot;
1200 q->link.weight = q->link.R_tab->rate.rate;
1202 q->link.ewma_log = TC_CBQ_DEF_EWMA;
1203 q->link.avpkt = q->link.allot/2;
1204 q->link.minidle = -0x7FFFFFFF;
1206 q->toplevel = TC_CBQ_MAXLEVEL;
1207 q->now = psched_get_time();
1209 cbq_link_class(&q->link);
1211 if (tb[TCA_CBQ_LSSOPT])
1212 cbq_set_lss(&q->link, nla_data(tb[TCA_CBQ_LSSOPT]));
1214 cbq_addprio(q, &q->link);
1218 tcf_block_put(q->link.block);
1221 qdisc_put_rtab(q->link.R_tab);
1225 static int cbq_dump_rate(struct sk_buff *skb, struct cbq_class *cl)
1227 unsigned char *b = skb_tail_pointer(skb);
1229 if (nla_put(skb, TCA_CBQ_RATE, sizeof(cl->R_tab->rate), &cl->R_tab->rate))
1230 goto nla_put_failure;
1238 static int cbq_dump_lss(struct sk_buff *skb, struct cbq_class *cl)
1240 unsigned char *b = skb_tail_pointer(skb);
1241 struct tc_cbq_lssopt opt;
1244 if (cl->borrow == NULL)
1245 opt.flags |= TCF_CBQ_LSS_BOUNDED;
1246 if (cl->share == NULL)
1247 opt.flags |= TCF_CBQ_LSS_ISOLATED;
1248 opt.ewma_log = cl->ewma_log;
1249 opt.level = cl->level;
1250 opt.avpkt = cl->avpkt;
1251 opt.maxidle = cl->maxidle;
1252 opt.minidle = (u32)(-cl->minidle);
1253 opt.offtime = cl->offtime;
1255 if (nla_put(skb, TCA_CBQ_LSSOPT, sizeof(opt), &opt))
1256 goto nla_put_failure;
1264 static int cbq_dump_wrr(struct sk_buff *skb, struct cbq_class *cl)
1266 unsigned char *b = skb_tail_pointer(skb);
1267 struct tc_cbq_wrropt opt;
1269 memset(&opt, 0, sizeof(opt));
1271 opt.allot = cl->allot;
1272 opt.priority = cl->priority + 1;
1273 opt.cpriority = cl->cpriority + 1;
1274 opt.weight = cl->weight;
1275 if (nla_put(skb, TCA_CBQ_WRROPT, sizeof(opt), &opt))
1276 goto nla_put_failure;
1284 static int cbq_dump_fopt(struct sk_buff *skb, struct cbq_class *cl)
1286 unsigned char *b = skb_tail_pointer(skb);
1287 struct tc_cbq_fopt opt;
1289 if (cl->split || cl->defmap) {
1290 opt.split = cl->split ? cl->split->common.classid : 0;
1291 opt.defmap = cl->defmap;
1293 if (nla_put(skb, TCA_CBQ_FOPT, sizeof(opt), &opt))
1294 goto nla_put_failure;
1303 static int cbq_dump_attr(struct sk_buff *skb, struct cbq_class *cl)
1305 if (cbq_dump_lss(skb, cl) < 0 ||
1306 cbq_dump_rate(skb, cl) < 0 ||
1307 cbq_dump_wrr(skb, cl) < 0 ||
1308 cbq_dump_fopt(skb, cl) < 0)
1313 static int cbq_dump(struct Qdisc *sch, struct sk_buff *skb)
1315 struct cbq_sched_data *q = qdisc_priv(sch);
1316 struct nlattr *nest;
1318 nest = nla_nest_start(skb, TCA_OPTIONS);
1320 goto nla_put_failure;
1321 if (cbq_dump_attr(skb, &q->link) < 0)
1322 goto nla_put_failure;
1323 return nla_nest_end(skb, nest);
1326 nla_nest_cancel(skb, nest);
1331 cbq_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
1333 struct cbq_sched_data *q = qdisc_priv(sch);
1335 q->link.xstats.avgidle = q->link.avgidle;
1336 return gnet_stats_copy_app(d, &q->link.xstats, sizeof(q->link.xstats));
1340 cbq_dump_class(struct Qdisc *sch, unsigned long arg,
1341 struct sk_buff *skb, struct tcmsg *tcm)
1343 struct cbq_class *cl = (struct cbq_class *)arg;
1344 struct nlattr *nest;
1347 tcm->tcm_parent = cl->tparent->common.classid;
1349 tcm->tcm_parent = TC_H_ROOT;
1350 tcm->tcm_handle = cl->common.classid;
1351 tcm->tcm_info = cl->q->handle;
1353 nest = nla_nest_start(skb, TCA_OPTIONS);
1355 goto nla_put_failure;
1356 if (cbq_dump_attr(skb, cl) < 0)
1357 goto nla_put_failure;
1358 return nla_nest_end(skb, nest);
1361 nla_nest_cancel(skb, nest);
1366 cbq_dump_class_stats(struct Qdisc *sch, unsigned long arg,
1367 struct gnet_dump *d)
1369 struct cbq_sched_data *q = qdisc_priv(sch);
1370 struct cbq_class *cl = (struct cbq_class *)arg;
1372 cl->xstats.avgidle = cl->avgidle;
1373 cl->xstats.undertime = 0;
1375 if (cl->undertime != PSCHED_PASTPERFECT)
1376 cl->xstats.undertime = cl->undertime - q->now;
1378 if (gnet_stats_copy_basic(qdisc_root_sleeping_running(sch),
1379 d, NULL, &cl->bstats) < 0 ||
1380 gnet_stats_copy_rate_est(d, &cl->rate_est) < 0 ||
1381 gnet_stats_copy_queue(d, NULL, &cl->qstats, cl->q->q.qlen) < 0)
1384 return gnet_stats_copy_app(d, &cl->xstats, sizeof(cl->xstats));
1387 static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1390 struct cbq_class *cl = (struct cbq_class *)arg;
1393 new = qdisc_create_dflt(sch->dev_queue,
1394 &pfifo_qdisc_ops, cl->common.classid);
1399 *old = qdisc_replace(sch, new, &cl->q);
1403 static struct Qdisc *cbq_leaf(struct Qdisc *sch, unsigned long arg)
1405 struct cbq_class *cl = (struct cbq_class *)arg;
1410 static void cbq_qlen_notify(struct Qdisc *sch, unsigned long arg)
1412 struct cbq_class *cl = (struct cbq_class *)arg;
1414 cbq_deactivate_class(cl);
1417 static unsigned long cbq_find(struct Qdisc *sch, u32 classid)
1419 struct cbq_sched_data *q = qdisc_priv(sch);
1421 return (unsigned long)cbq_class_lookup(q, classid);
1424 static void cbq_destroy_class(struct Qdisc *sch, struct cbq_class *cl)
1426 struct cbq_sched_data *q = qdisc_priv(sch);
1428 WARN_ON(cl->filters);
1430 tcf_block_put(cl->block);
1431 qdisc_destroy(cl->q);
1432 qdisc_put_rtab(cl->R_tab);
1433 gen_kill_estimator(&cl->rate_est);
1438 static void cbq_destroy(struct Qdisc *sch)
1440 struct cbq_sched_data *q = qdisc_priv(sch);
1441 struct hlist_node *next;
1442 struct cbq_class *cl;
1445 #ifdef CONFIG_NET_CLS_ACT
 * Filters must be destroyed first because we don't destroy the
 * classes from root to leaves, which means that filters can still
 * be bound to classes which have been destroyed already. --TGR '04
1453 for (h = 0; h < q->clhash.hashsize; h++) {
1454 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1455 tcf_block_put(cl->block);
1459 for (h = 0; h < q->clhash.hashsize; h++) {
1460 hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
1462 cbq_destroy_class(sch, cl);
1464 qdisc_class_hash_destroy(&q->clhash);
1468 cbq_change_class(struct Qdisc *sch, u32 classid, u32 parentid, struct nlattr **tca,
1472 struct cbq_sched_data *q = qdisc_priv(sch);
1473 struct cbq_class *cl = (struct cbq_class *)*arg;
1474 struct nlattr *opt = tca[TCA_OPTIONS];
1475 struct nlattr *tb[TCA_CBQ_MAX + 1];
1476 struct cbq_class *parent;
1477 struct qdisc_rate_table *rtab = NULL;
1479 err = cbq_opt_parse(tb, opt);
1483 if (tb[TCA_CBQ_OVL_STRATEGY] || tb[TCA_CBQ_POLICE])
1490 cl->tparent->common.classid != parentid)
1492 if (!cl->tparent && parentid != TC_H_ROOT)
1496 if (tb[TCA_CBQ_RATE]) {
1497 rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]),
1503 if (tca[TCA_RATE]) {
1504 err = gen_replace_estimator(&cl->bstats, NULL,
1507 qdisc_root_sleeping_running(sch),
1510 qdisc_put_rtab(rtab);
1515 /* Change class parameters */
1518 if (cl->next_alive != NULL)
1519 cbq_deactivate_class(cl);
1522 qdisc_put_rtab(cl->R_tab);
1526 if (tb[TCA_CBQ_LSSOPT])
1527 cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1529 if (tb[TCA_CBQ_WRROPT]) {
1531 cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1534 if (tb[TCA_CBQ_FOPT])
1535 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1538 cbq_activate_class(cl);
1540 sch_tree_unlock(sch);
1545 if (parentid == TC_H_ROOT)
1548 if (tb[TCA_CBQ_WRROPT] == NULL || tb[TCA_CBQ_RATE] == NULL ||
1549 tb[TCA_CBQ_LSSOPT] == NULL)
1552 rtab = qdisc_get_rtab(nla_data(tb[TCA_CBQ_RATE]), tb[TCA_CBQ_RTAB]);
1558 if (TC_H_MAJ(classid ^ sch->handle) ||
1559 cbq_class_lookup(q, classid))
1563 classid = TC_H_MAKE(sch->handle, 0x8000);
1565 for (i = 0; i < 0x8000; i++) {
1566 if (++q->hgenerator >= 0x8000)
1568 if (cbq_class_lookup(q, classid|q->hgenerator) == NULL)
1574 classid = classid|q->hgenerator;
1579 parent = cbq_class_lookup(q, parentid);
1586 cl = kzalloc(sizeof(*cl), GFP_KERNEL);
1590 err = tcf_block_get(&cl->block, &cl->filter_list);
1596 if (tca[TCA_RATE]) {
1597 err = gen_new_estimator(&cl->bstats, NULL, &cl->rate_est,
1599 qdisc_root_sleeping_running(sch),
1602 tcf_block_put(cl->block);
1610 cl->q = qdisc_create_dflt(sch->dev_queue, &pfifo_qdisc_ops, classid);
1612 cl->q = &noop_qdisc;
1614 qdisc_hash_add(cl->q, true);
1616 cl->common.classid = classid;
1617 cl->tparent = parent;
1619 cl->allot = parent->allot;
1620 cl->quantum = cl->allot;
1621 cl->weight = cl->R_tab->rate.rate;
1625 cl->borrow = cl->tparent;
1626 if (cl->tparent != &q->link)
1627 cl->share = cl->tparent;
1628 cbq_adjust_levels(parent);
1629 cl->minidle = -0x7FFFFFFF;
1630 cbq_set_lss(cl, nla_data(tb[TCA_CBQ_LSSOPT]));
1631 cbq_set_wrr(cl, nla_data(tb[TCA_CBQ_WRROPT]));
1632 if (cl->ewma_log == 0)
1633 cl->ewma_log = q->link.ewma_log;
1634 if (cl->maxidle == 0)
1635 cl->maxidle = q->link.maxidle;
1637 cl->avpkt = q->link.avpkt;
1638 if (tb[TCA_CBQ_FOPT])
1639 cbq_set_fopt(cl, nla_data(tb[TCA_CBQ_FOPT]));
1640 sch_tree_unlock(sch);
1642 qdisc_class_hash_grow(sch, &q->clhash);
1644 *arg = (unsigned long)cl;
1648 qdisc_put_rtab(rtab);
1652 static int cbq_delete(struct Qdisc *sch, unsigned long arg)
1654 struct cbq_sched_data *q = qdisc_priv(sch);
1655 struct cbq_class *cl = (struct cbq_class *)arg;
1656 unsigned int qlen, backlog;
1658 if (cl->filters || cl->children || cl == &q->link)
1663 qlen = cl->q->q.qlen;
1664 backlog = cl->q->qstats.backlog;
1666 qdisc_tree_reduce_backlog(cl->q, qlen, backlog);
1669 cbq_deactivate_class(cl);
1671 if (q->tx_borrowed == cl)
1672 q->tx_borrowed = q->tx_class;
1673 if (q->tx_class == cl) {
1675 q->tx_borrowed = NULL;
1677 #ifdef CONFIG_NET_CLS_ACT
1678 if (q->rx_class == cl)
1682 cbq_unlink_class(cl);
1683 cbq_adjust_levels(cl->tparent);
1685 cbq_sync_defmap(cl);
1688 sch_tree_unlock(sch);
1690 cbq_destroy_class(sch, cl);
1694 static struct tcf_block *cbq_tcf_block(struct Qdisc *sch, unsigned long arg)
1696 struct cbq_sched_data *q = qdisc_priv(sch);
1697 struct cbq_class *cl = (struct cbq_class *)arg;
1705 static unsigned long cbq_bind_filter(struct Qdisc *sch, unsigned long parent,
1708 struct cbq_sched_data *q = qdisc_priv(sch);
1709 struct cbq_class *p = (struct cbq_class *)parent;
1710 struct cbq_class *cl = cbq_class_lookup(q, classid);
1713 if (p && p->level <= cl->level)
1716 return (unsigned long)cl;
1721 static void cbq_unbind_filter(struct Qdisc *sch, unsigned long arg)
1723 struct cbq_class *cl = (struct cbq_class *)arg;
1728 static void cbq_walk(struct Qdisc *sch, struct qdisc_walker *arg)
1730 struct cbq_sched_data *q = qdisc_priv(sch);
1731 struct cbq_class *cl;
1737 for (h = 0; h < q->clhash.hashsize; h++) {
1738 hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
1739 if (arg->count < arg->skip) {
1743 if (arg->fn(sch, (unsigned long)cl, arg) < 0) {
1752 static const struct Qdisc_class_ops cbq_class_ops = {
1755 .qlen_notify = cbq_qlen_notify,
1757 .change = cbq_change_class,
1758 .delete = cbq_delete,
1760 .tcf_block = cbq_tcf_block,
1761 .bind_tcf = cbq_bind_filter,
1762 .unbind_tcf = cbq_unbind_filter,
1763 .dump = cbq_dump_class,
1764 .dump_stats = cbq_dump_class_stats,
1767 static struct Qdisc_ops cbq_qdisc_ops __read_mostly = {
1769 .cl_ops = &cbq_class_ops,
1771 .priv_size = sizeof(struct cbq_sched_data),
1772 .enqueue = cbq_enqueue,
1773 .dequeue = cbq_dequeue,
1774 .peek = qdisc_peek_dequeued,
1777 .destroy = cbq_destroy,
1780 .dump_stats = cbq_dump_stats,
1781 .owner = THIS_MODULE,
1784 static int __init cbq_module_init(void)
1786 return register_qdisc(&cbq_qdisc_ops);
1788 static void __exit cbq_module_exit(void)
1790 unregister_qdisc(&cbq_qdisc_ops);
1792 module_init(cbq_module_init)
1793 module_exit(cbq_module_exit)
1794 MODULE_LICENSE("GPL");