sched/sch_cake.c
1 // SPDX-License-Identifier: GPL-2.0 OR BSD-3-Clause
2
3 /* COMMON Applications Kept Enhanced (CAKE) discipline
4  *
5  * Copyright (C) 2014-2018 Jonathan Morton <chromatix99@gmail.com>
6  * Copyright (C) 2015-2018 Toke Høiland-Jørgensen <toke@toke.dk>
7  * Copyright (C) 2014-2018 Dave Täht <dave.taht@gmail.com>
8  * Copyright (C) 2015-2018 Sebastian Moeller <moeller0@gmx.de>
9  * (C) 2015-2018 Kevin Darbyshire-Bryant <kevin@darbyshire-bryant.me.uk>
10  * Copyright (C) 2017-2018 Ryan Mounce <ryan@mounce.com.au>
11  *
12  * The CAKE Principles:
13  *                 (or, how to have your cake and eat it too)
14  *
15  * This is a combination of several shaping, AQM and FQ techniques into one
16  * easy-to-use package:
17  *
18  * - An overall bandwidth shaper, to move the bottleneck away from dumb CPE
19  *   equipment and bloated MACs.  This operates in deficit mode (as in sch_fq),
20  *   eliminating the need for any sort of burst parameter (eg. token bucket
21  *   depth).  Burst support is limited to that necessary to overcome scheduling
22  *   latency.
23  *
24  * - A Diffserv-aware priority queue, giving more priority to certain classes,
25  *   up to a specified fraction of bandwidth.  Above that bandwidth threshold,
26  *   the priority is reduced to avoid starving other tins.
27  *
28  * - Each priority tin has a separate Flow Queue system, to isolate traffic
29  *   flows from each other.  This prevents a burst on one flow from increasing
30  *   the delay to another.  Flows are distributed to queues using a
31  *   set-associative hash function.
32  *
33  * - Each queue is actively managed by Cobalt, which is a combination of the
34  *   Codel and Blue AQM algorithms.  This serves flows fairly, and signals
35  *   congestion early via ECN (if available) and/or packet drops, to keep
36  *   latency low.  The codel parameters are auto-tuned based on the bandwidth
37  *   setting, as is necessary at low bandwidths.
38  *
39  * The configuration parameters are kept deliberately simple for ease of use.
40  * Everything has sane defaults.  Complete generality of configuration is *not*
41  * a goal.
42  *
43  * The priority queue operates according to a weighted DRR scheme, combined with
44  * a bandwidth tracker which reuses the shaper logic to detect which side of the
45  * bandwidth sharing threshold the tin is operating.  This determines whether a
46  * priority-based weight (high) or a bandwidth-based weight (low) is used for
47  * that tin in the current pass.
48  *
49  * This qdisc was inspired by Eric Dumazet's fq_codel code, which he kindly
50  * granted us permission to leverage.
51  */
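/* Worked example of the deficit-mode shaper (illustrative rate, not a
 * default of this code): at a configured 100 Mbit/s, a 1514-byte packet
 * occupies the link for 1514 * 8 / 100e6 s ~= 121 us, so after releasing it
 * the shaper simply advances its next-transmission time by ~121 us.  No
 * token-bucket depth or burst size has to be configured; only enough burst
 * to cover scheduling latency is permitted.
 */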
52
53 #include <linux/module.h>
54 #include <linux/types.h>
55 #include <linux/kernel.h>
56 #include <linux/jiffies.h>
57 #include <linux/string.h>
58 #include <linux/in.h>
59 #include <linux/errno.h>
60 #include <linux/init.h>
61 #include <linux/skbuff.h>
62 #include <linux/jhash.h>
63 #include <linux/slab.h>
64 #include <linux/vmalloc.h>
65 #include <linux/reciprocal_div.h>
66 #include <net/netlink.h>
67 #include <linux/if_vlan.h>
68 #include <net/gso.h>
69 #include <net/pkt_sched.h>
70 #include <net/pkt_cls.h>
71 #include <net/tcp.h>
72 #include <net/flow_dissector.h>
73
74 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
75 #include <net/netfilter/nf_conntrack_core.h>
76 #endif
77
78 #define CAKE_SET_WAYS (8)
79 #define CAKE_MAX_TINS (8)
80 #define CAKE_QUEUES (1024)
81 #define CAKE_FLOW_MASK 63
82 #define CAKE_FLOW_NAT_FLAG 64
83
84 /* struct cobalt_params - contains codel and blue parameters
85  * @interval:   codel initial drop rate
86  * @target:     maximum persistent sojourn time & blue update rate
87  * @mtu_time:   serialisation delay of maximum-size packet
88  * @p_inc:      increment of blue drop probability (0.32 fxp)
89  * @p_dec:      decrement of blue drop probability (0.32 fxp)
90  */
91 struct cobalt_params {
92         u64     interval;
93         u64     target;
94         u64     mtu_time;
95         u32     p_inc;
96         u32     p_dec;
97 };
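/* Rough magnitudes, assuming CAKE's usual defaults rather than anything
 * defined in this section: interval is on the order of 100 ms and target on
 * the order of 5 ms.  At low shaped rates the target has to grow towards the
 * serialisation delay of a full-size packet (mtu_time), since e.g. a
 * 1514-byte packet alone occupies ~12 ms of wire time at 1 Mbit/s.
 */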
98
99 /* struct cobalt_vars - contains codel and blue variables
100  * @count:              codel dropping frequency
101  * @rec_inv_sqrt:       reciprocal value of sqrt(count) >> 1
102  * @drop_next:          time to drop next packet, or when we dropped last
103  * @blue_timer:         Blue time to next drop
104  * @p_drop:             BLUE drop probability (0.32 fxp)
105  * @dropping:           set if in dropping state
106  * @ecn_marked:         set if marked
107  */
108 struct cobalt_vars {
109         u32     count;
110         u32     rec_inv_sqrt;
111         ktime_t drop_next;
112         ktime_t blue_timer;
113         u32     p_drop;
114         bool    dropping;
115         bool    ecn_marked;
116 };
117
118 enum {
119         CAKE_SET_NONE = 0,
120         CAKE_SET_SPARSE,
121         CAKE_SET_SPARSE_WAIT, /* counted in SPARSE, actually in BULK */
122         CAKE_SET_BULK,
123         CAKE_SET_DECAYING
124 };
125
126 struct cake_flow {
127         /* this stuff is all needed per-flow at dequeue time */
128         struct sk_buff    *head;
129         struct sk_buff    *tail;
130         struct list_head  flowchain;
131         s32               deficit;
132         u32               dropped;
133         struct cobalt_vars cvars;
134         u16               srchost; /* index into cake_host table */
135         u16               dsthost;
136         u8                set;
137 }; /* please try to keep this structure <= 64 bytes */
138
139 struct cake_host {
140         u32 srchost_tag;
141         u32 dsthost_tag;
142         u16 srchost_bulk_flow_count;
143         u16 dsthost_bulk_flow_count;
144 };
145
146 struct cake_heap_entry {
147         u16 t:3, b:10;
148 };
149
150 struct cake_tin_data {
151         struct cake_flow flows[CAKE_QUEUES];
152         u32     backlogs[CAKE_QUEUES];
153         u32     tags[CAKE_QUEUES]; /* for set association */
154         u16     overflow_idx[CAKE_QUEUES];
155         struct cake_host hosts[CAKE_QUEUES]; /* for triple isolation */
156         u16     flow_quantum;
157
158         struct cobalt_params cparams;
159         u32     drop_overlimit;
160         u16     bulk_flow_count;
161         u16     sparse_flow_count;
162         u16     decaying_flow_count;
163         u16     unresponsive_flow_count;
164
165         u32     max_skblen;
166
167         struct list_head new_flows;
168         struct list_head old_flows;
169         struct list_head decaying_flows;
170
171         /* time_next = time_this + ((len * rate_ns) >> rate_shft) */
172         ktime_t time_next_packet;
173         u64     tin_rate_ns;
174         u64     tin_rate_bps;
175         u16     tin_rate_shft;
176
177         u16     tin_quantum;
178         s32     tin_deficit;
179         u32     tin_backlog;
180         u32     tin_dropped;
181         u32     tin_ecn_mark;
182
183         u32     packets;
184         u64     bytes;
185
186         u32     ack_drops;
187
188         /* moving averages */
189         u64 avge_delay;
190         u64 peak_delay;
191         u64 base_delay;
192
193         /* hash function stats */
194         u32     way_directs;
195         u32     way_hits;
196         u32     way_misses;
197         u32     way_collisions;
198 }; /* number of tins is small, so size of this struct doesn't matter much */
199
200 struct cake_sched_data {
201         struct tcf_proto __rcu *filter_list; /* optional external classifier */
202         struct tcf_block *block;
203         struct cake_tin_data *tins;
204
205         struct cake_heap_entry overflow_heap[CAKE_QUEUES * CAKE_MAX_TINS];
206         u16             overflow_timeout;
207
208         u16             tin_cnt;
209         u8              tin_mode;
210         u8              flow_mode;
211         u8              ack_filter;
212         u8              atm_mode;
213
214         u32             fwmark_mask;
215         u16             fwmark_shft;
216
217         /* time_next = time_this + ((len * rate_ns) >> rate_shft) */
218         u16             rate_shft;
219         ktime_t         time_next_packet;
220         ktime_t         failsafe_next_packet;
221         u64             rate_ns;
222         u64             rate_bps;
223         u16             rate_flags;
224         s16             rate_overhead;
225         u16             rate_mpu;
226         u64             interval;
227         u64             target;
228
229         /* resource tracking */
230         u32             buffer_used;
231         u32             buffer_max_used;
232         u32             buffer_limit;
233         u32             buffer_config_limit;
234
235         /* indices for dequeue */
236         u16             cur_tin;
237         u16             cur_flow;
238
239         struct qdisc_watchdog watchdog;
240         const u8        *tin_index;
241         const u8        *tin_order;
242
243         /* bandwidth capacity estimate */
244         ktime_t         last_packet_time;
245         ktime_t         avg_window_begin;
246         u64             avg_packet_interval;
247         u64             avg_window_bytes;
248         u64             avg_peak_bandwidth;
249         ktime_t         last_reconfig_time;
250
251         /* packet length stats */
252         u32             avg_netoff;
253         u16             max_netlen;
254         u16             max_adjlen;
255         u16             min_netlen;
256         u16             min_adjlen;
257 };
258
259 enum {
260         CAKE_FLAG_OVERHEAD         = BIT(0),
261         CAKE_FLAG_AUTORATE_INGRESS = BIT(1),
262         CAKE_FLAG_INGRESS          = BIT(2),
263         CAKE_FLAG_WASH             = BIT(3),
264         CAKE_FLAG_SPLIT_GSO        = BIT(4)
265 };
266
267 /* COBALT operates the Codel and BLUE algorithms in parallel, in order to
268  * obtain the best features of each.  Codel is excellent on flows which
269  * respond to congestion signals in a TCP-like way.  BLUE is more effective on
270  * unresponsive flows.
271  */
272
273 struct cobalt_skb_cb {
274         ktime_t enqueue_time;
275         u32     adjusted_len;
276 };
277
278 static u64 us_to_ns(u64 us)
279 {
280         return us * NSEC_PER_USEC;
281 }
282
283 static struct cobalt_skb_cb *get_cobalt_cb(const struct sk_buff *skb)
284 {
285         qdisc_cb_private_validate(skb, sizeof(struct cobalt_skb_cb));
286         return (struct cobalt_skb_cb *)qdisc_skb_cb(skb)->data;
287 }
288
289 static ktime_t cobalt_get_enqueue_time(const struct sk_buff *skb)
290 {
291         return get_cobalt_cb(skb)->enqueue_time;
292 }
293
294 static void cobalt_set_enqueue_time(struct sk_buff *skb,
295                                     ktime_t now)
296 {
297         get_cobalt_cb(skb)->enqueue_time = now;
298 }
299
300 static u16 quantum_div[CAKE_QUEUES + 1] = {0};
301
302 /* Diffserv lookup tables */
303
304 static const u8 precedence[] = {
305         0, 0, 0, 0, 0, 0, 0, 0,
306         1, 1, 1, 1, 1, 1, 1, 1,
307         2, 2, 2, 2, 2, 2, 2, 2,
308         3, 3, 3, 3, 3, 3, 3, 3,
309         4, 4, 4, 4, 4, 4, 4, 4,
310         5, 5, 5, 5, 5, 5, 5, 5,
311         6, 6, 6, 6, 6, 6, 6, 6,
312         7, 7, 7, 7, 7, 7, 7, 7,
313 };
314
315 static const u8 diffserv8[] = {
316         2, 0, 1, 2, 4, 2, 2, 2,
317         1, 2, 1, 2, 1, 2, 1, 2,
318         5, 2, 4, 2, 4, 2, 4, 2,
319         3, 2, 3, 2, 3, 2, 3, 2,
320         6, 2, 3, 2, 3, 2, 3, 2,
321         6, 2, 2, 2, 6, 2, 6, 2,
322         7, 2, 2, 2, 2, 2, 2, 2,
323         7, 2, 2, 2, 2, 2, 2, 2,
324 };
325
326 static const u8 diffserv4[] = {
327         0, 1, 0, 0, 2, 0, 0, 0,
328         1, 0, 0, 0, 0, 0, 0, 0,
329         2, 0, 2, 0, 2, 0, 2, 0,
330         2, 0, 2, 0, 2, 0, 2, 0,
331         3, 0, 2, 0, 2, 0, 2, 0,
332         3, 0, 0, 0, 3, 0, 3, 0,
333         3, 0, 0, 0, 0, 0, 0, 0,
334         3, 0, 0, 0, 0, 0, 0, 0,
335 };
336
337 static const u8 diffserv3[] = {
338         0, 1, 0, 0, 2, 0, 0, 0,
339         1, 0, 0, 0, 0, 0, 0, 0,
340         0, 0, 0, 0, 0, 0, 0, 0,
341         0, 0, 0, 0, 0, 0, 0, 0,
342         0, 0, 0, 0, 0, 0, 0, 0,
343         0, 0, 0, 0, 2, 0, 2, 0,
344         2, 0, 0, 0, 0, 0, 0, 0,
345         2, 0, 0, 0, 0, 0, 0, 0,
346 };
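/* Example lookups (the DSCP value is the table index): with diffserv3,
 * DSCP 1 (LE) and 8 (CS1) select tin 1 (Bulk); DSCP 4 (TOS4), 44 (VA),
 * 46 (EF), 48 (CS6) and 56 (CS7) select tin 2 (Voice); everything else
 * falls through to tin 0 (Best Effort).  Under diffserv4, EF maps to
 * tin 3 instead.
 */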
347
348 static const u8 besteffort[] = {
349         0, 0, 0, 0, 0, 0, 0, 0,
350         0, 0, 0, 0, 0, 0, 0, 0,
351         0, 0, 0, 0, 0, 0, 0, 0,
352         0, 0, 0, 0, 0, 0, 0, 0,
353         0, 0, 0, 0, 0, 0, 0, 0,
354         0, 0, 0, 0, 0, 0, 0, 0,
355         0, 0, 0, 0, 0, 0, 0, 0,
356         0, 0, 0, 0, 0, 0, 0, 0,
357 };
358
359 /* tin priority order for stats dumping */
360
361 static const u8 normal_order[] = {0, 1, 2, 3, 4, 5, 6, 7};
362 static const u8 bulk_order[] = {1, 0, 2, 3};
363
364 #define REC_INV_SQRT_CACHE (16)
365 static u32 cobalt_rec_inv_sqrt_cache[REC_INV_SQRT_CACHE] = {0};
366
367 /* http://en.wikipedia.org/wiki/Methods_of_computing_square_roots
368  * new_invsqrt = (invsqrt / 2) * (3 - count * invsqrt^2)
369  *
370  * Here, invsqrt is a fixed point number (< 1.0), 32bit mantissa, aka Q0.32
371  */
372
373 static void cobalt_newton_step(struct cobalt_vars *vars)
374 {
375         u32 invsqrt, invsqrt2;
376         u64 val;
377
378         invsqrt = vars->rec_inv_sqrt;
379         invsqrt2 = ((u64)invsqrt * invsqrt) >> 32;
380         val = (3LL << 32) - ((u64)vars->count * invsqrt2);
381
382         val >>= 2; /* avoid overflow in following multiply */
383         val = (val * invsqrt) >> (32 - 2 + 1);
384
385         vars->rec_inv_sqrt = val;
386 }
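/* Sanity check on the fixed-point step above: if rec_inv_sqrt is already
 * exactly 1/sqrt(count), then count * invsqrt^2 == 1.0, the bracket becomes
 * (3 - 1) = 2, and multiplying by invsqrt / 2 reproduces the same value.
 * The true reciprocal square root is thus a fixed point of the iteration;
 * each step merely refines an approximate estimate towards it.
 */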
387
388 static void cobalt_invsqrt(struct cobalt_vars *vars)
389 {
390         if (vars->count < REC_INV_SQRT_CACHE)
391                 vars->rec_inv_sqrt = cobalt_rec_inv_sqrt_cache[vars->count];
392         else
393                 cobalt_newton_step(vars);
394 }
395
396 /* There is a big difference in timing between the accurate values placed in
397  * the cache and the approximations given by a single Newton step for small
398  * count values, particularly when stepping from count 1 to 2 or vice versa.
399  * Above 16, a single Newton step gives sufficient accuracy in either
400  * direction, given the precision stored.
401  *
402  * The magnitude of the error when stepping up to count 2 is such as to give
403  * the value that *should* have been produced at count 4.
404  */
405
406 static void cobalt_cache_init(void)
407 {
408         struct cobalt_vars v;
409
410         memset(&v, 0, sizeof(v));
411         v.rec_inv_sqrt = ~0U;
412         cobalt_rec_inv_sqrt_cache[0] = v.rec_inv_sqrt;
413
414         for (v.count = 1; v.count < REC_INV_SQRT_CACHE; v.count++) {
415                 cobalt_newton_step(&v);
416                 cobalt_newton_step(&v);
417                 cobalt_newton_step(&v);
418                 cobalt_newton_step(&v);
419
420                 cobalt_rec_inv_sqrt_cache[v.count] = v.rec_inv_sqrt;
421         }
422 }
423
424 static void cobalt_vars_init(struct cobalt_vars *vars)
425 {
426         memset(vars, 0, sizeof(*vars));
427
428         if (!cobalt_rec_inv_sqrt_cache[0]) {
429                 cobalt_cache_init();
430                 cobalt_rec_inv_sqrt_cache[0] = ~0;
431         }
432 }
433
434 /* CoDel control_law is t + interval/sqrt(count)
435  * We maintain in rec_inv_sqrt the reciprocal value of sqrt(count) to avoid
436  * both sqrt() and divide operation.
437  */
438 static ktime_t cobalt_control(ktime_t t,
439                               u64 interval,
440                               u32 rec_inv_sqrt)
441 {
442         return ktime_add_ns(t, reciprocal_scale(interval,
443                                                 rec_inv_sqrt));
444 }
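/* Example: with interval = 100 ms and count = 4, rec_inv_sqrt holds ~0.5 in
 * Q0.32 (0x80000000), so reciprocal_scale() gives ~100 ms * 0.5 = 50 ms and
 * the next signalling event is scheduled 50 ms out; by count = 16 the
 * spacing has shrunk to ~25 ms.
 */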
445
446 /* Call this when a packet had to be dropped due to queue overflow.  Returns
447  * true if the BLUE state was quiescent before but active after this call.
448  */
449 static bool cobalt_queue_full(struct cobalt_vars *vars,
450                               struct cobalt_params *p,
451                               ktime_t now)
452 {
453         bool up = false;
454
455         if (ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
456                 up = !vars->p_drop;
457                 vars->p_drop += p->p_inc;
458                 if (vars->p_drop < p->p_inc)
459                         vars->p_drop = ~0;
460                 vars->blue_timer = now;
461         }
462         vars->dropping = true;
463         vars->drop_next = now;
464         if (!vars->count)
465                 vars->count = 1;
466
467         return up;
468 }
469
470 /* Call this when the queue was serviced but turned out to be empty.  Returns
471  * true if the BLUE state was active before but quiescent after this call.
472  */
473 static bool cobalt_queue_empty(struct cobalt_vars *vars,
474                                struct cobalt_params *p,
475                                ktime_t now)
476 {
477         bool down = false;
478
479         if (vars->p_drop &&
480             ktime_to_ns(ktime_sub(now, vars->blue_timer)) > p->target) {
481                 if (vars->p_drop < p->p_dec)
482                         vars->p_drop = 0;
483                 else
484                         vars->p_drop -= p->p_dec;
485                 vars->blue_timer = now;
486                 down = !vars->p_drop;
487         }
488         vars->dropping = false;
489
490         if (vars->count && ktime_to_ns(ktime_sub(now, vars->drop_next)) >= 0) {
491                 vars->count--;
492                 cobalt_invsqrt(vars);
493                 vars->drop_next = cobalt_control(vars->drop_next,
494                                                  p->interval,
495                                                  vars->rec_inv_sqrt);
496         }
497
498         return down;
499 }
500
501 /* Call this with a freshly dequeued packet for possible congestion marking.
502  * Returns true as an instruction to drop the packet, false for delivery.
503  */
504 static bool cobalt_should_drop(struct cobalt_vars *vars,
505                                struct cobalt_params *p,
506                                ktime_t now,
507                                struct sk_buff *skb,
508                                u32 bulk_flows)
509 {
510         bool next_due, over_target, drop = false;
511         ktime_t schedule;
512         u64 sojourn;
513
514 /* The 'schedule' variable records, in its sign, whether 'now' is before or
515  * after 'drop_next'.  This allows 'drop_next' to be updated before the next
516  * scheduling decision is actually branched, without destroying that
517  * information.  Similarly, the first 'schedule' value calculated is preserved
518  * in the boolean 'next_due'.
519  *
520  * As for 'drop_next', we take advantage of the fact that 'interval' is both
521  * the delay between first exceeding 'target' and the first signalling event,
522  * *and* the scaling factor for the signalling frequency.  It's therefore very
523  * natural to use a single mechanism for both purposes, and eliminates a
524  * significant amount of reference Codel's spaghetti code.  To help with this,
525  * both the '0' and '1' entries in the invsqrt cache are 0xFFFFFFFF, as close
526  * as possible to 1.0 in fixed-point.
527  */
528
529         sojourn = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
530         schedule = ktime_sub(now, vars->drop_next);
531         over_target = sojourn > p->target &&
532                       sojourn > p->mtu_time * bulk_flows * 2 &&
533                       sojourn > p->mtu_time * 4;
534         next_due = vars->count && ktime_to_ns(schedule) >= 0;
535
536         vars->ecn_marked = false;
537
538         if (over_target) {
539                 if (!vars->dropping) {
540                         vars->dropping = true;
541                         vars->drop_next = cobalt_control(now,
542                                                          p->interval,
543                                                          vars->rec_inv_sqrt);
544                 }
545                 if (!vars->count)
546                         vars->count = 1;
547         } else if (vars->dropping) {
548                 vars->dropping = false;
549         }
550
551         if (next_due && vars->dropping) {
552                 /* Use ECN mark if possible, otherwise drop */
553                 drop = !(vars->ecn_marked = INET_ECN_set_ce(skb));
554
555                 vars->count++;
556                 if (!vars->count)
557                         vars->count--;
558                 cobalt_invsqrt(vars);
559                 vars->drop_next = cobalt_control(vars->drop_next,
560                                                  p->interval,
561                                                  vars->rec_inv_sqrt);
562                 schedule = ktime_sub(now, vars->drop_next);
563         } else {
564                 while (next_due) {
565                         vars->count--;
566                         cobalt_invsqrt(vars);
567                         vars->drop_next = cobalt_control(vars->drop_next,
568                                                          p->interval,
569                                                          vars->rec_inv_sqrt);
570                         schedule = ktime_sub(now, vars->drop_next);
571                         next_due = vars->count && ktime_to_ns(schedule) >= 0;
572                 }
573         }
574
575         /* Simple BLUE implementation.  Lack of ECN is deliberate. */
576         if (vars->p_drop)
577                 drop |= (get_random_u32() < vars->p_drop);
578
579         /* Overload the drop_next field as an activity timeout */
580         if (!vars->count)
581                 vars->drop_next = ktime_add_ns(now, p->interval);
582         else if (ktime_to_ns(schedule) > 0 && !drop)
583                 vars->drop_next = now;
584
585         return drop;
586 }
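/* Illustration of the over_target test above, with made-up but plausible
 * numbers: at 10 Mbit/s the mtu_time of a 1514-byte packet is ~1.2 ms, so
 * with 8 bulk flows the sojourn time must exceed all of target,
 * 2 * 8 * 1.2 ms ~= 19.4 ms and 4 * 1.2 ms ~= 4.8 ms before the queue is
 * considered over target.  This stops the AQM reacting to delay that is an
 * unavoidable consequence of many flows sharing a slow link.
 */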
587
588 static bool cake_update_flowkeys(struct flow_keys *keys,
589                                  const struct sk_buff *skb)
590 {
591 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
592         struct nf_conntrack_tuple tuple = {};
593         bool rev = !skb->_nfct, upd = false;
594         __be32 ip;
595
596         if (skb_protocol(skb, true) != htons(ETH_P_IP))
597                 return false;
598
599         if (!nf_ct_get_tuple_skb(&tuple, skb))
600                 return false;
601
602         ip = rev ? tuple.dst.u3.ip : tuple.src.u3.ip;
603         if (ip != keys->addrs.v4addrs.src) {
604                 keys->addrs.v4addrs.src = ip;
605                 upd = true;
606         }
607         ip = rev ? tuple.src.u3.ip : tuple.dst.u3.ip;
608         if (ip != keys->addrs.v4addrs.dst) {
609                 keys->addrs.v4addrs.dst = ip;
610                 upd = true;
611         }
612
613         if (keys->ports.ports) {
614                 __be16 port;
615
616                 port = rev ? tuple.dst.u.all : tuple.src.u.all;
617                 if (port != keys->ports.src) {
618                         keys->ports.src = port;
619                         upd = true;
620                 }
621                 port = rev ? tuple.src.u.all : tuple.dst.u.all;
622                 if (port != keys->ports.dst) {
623                         keys->ports.dst = port;
624                         upd = true;
625                 }
626         }
627         return upd;
628 #else
629         return false;
630 #endif
631 }
632
633 /* Cake has several subtle multiple-bit flow_mode settings; the dual-host
634  * checks below deliberately also match triple-isolate mode.
635  */
636
637 static bool cake_dsrc(int flow_mode)
638 {
639         return (flow_mode & CAKE_FLOW_DUAL_SRC) == CAKE_FLOW_DUAL_SRC;
640 }
641
642 static bool cake_ddst(int flow_mode)
643 {
644         return (flow_mode & CAKE_FLOW_DUAL_DST) == CAKE_FLOW_DUAL_DST;
645 }
646
647 static u32 cake_hash(struct cake_tin_data *q, const struct sk_buff *skb,
648                      int flow_mode, u16 flow_override, u16 host_override)
649 {
650         bool hash_flows = (!flow_override && !!(flow_mode & CAKE_FLOW_FLOWS));
651         bool hash_hosts = (!host_override && !!(flow_mode & CAKE_FLOW_HOSTS));
652         bool nat_enabled = !!(flow_mode & CAKE_FLOW_NAT_FLAG);
653         u32 flow_hash = 0, srchost_hash = 0, dsthost_hash = 0;
654         u16 reduced_hash, srchost_idx, dsthost_idx;
655         struct flow_keys keys, host_keys;
656         bool use_skbhash = skb->l4_hash;
657
658         if (unlikely(flow_mode == CAKE_FLOW_NONE))
659                 return 0;
660
661         /* If both overrides are set, or we can use the SKB hash and nat mode is
662          * disabled, we can skip packet dissection entirely. If nat mode is
663          * enabled there's another check below after doing the conntrack lookup.
664          */
665         if ((!hash_flows || (use_skbhash && !nat_enabled)) && !hash_hosts)
666                 goto skip_hash;
667
668         skb_flow_dissect_flow_keys(skb, &keys,
669                                    FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL);
670
671         /* Don't use the SKB hash if we change the lookup keys from conntrack */
672         if (nat_enabled && cake_update_flowkeys(&keys, skb))
673                 use_skbhash = false;
674
675         /* If we can still use the SKB hash and don't need the host hash, we can
676          * skip the rest of the hashing procedure
677          */
678         if (use_skbhash && !hash_hosts)
679                 goto skip_hash;
680
681         /* flow_hash_from_keys() sorts the addresses by value, so we have
682          * to preserve their order in a separate data structure to treat
683          * src and dst host addresses as independently selectable.
684          */
685         host_keys = keys;
686         host_keys.ports.ports     = 0;
687         host_keys.basic.ip_proto  = 0;
688         host_keys.keyid.keyid     = 0;
689         host_keys.tags.flow_label = 0;
690
691         switch (host_keys.control.addr_type) {
692         case FLOW_DISSECTOR_KEY_IPV4_ADDRS:
693                 host_keys.addrs.v4addrs.src = 0;
694                 dsthost_hash = flow_hash_from_keys(&host_keys);
695                 host_keys.addrs.v4addrs.src = keys.addrs.v4addrs.src;
696                 host_keys.addrs.v4addrs.dst = 0;
697                 srchost_hash = flow_hash_from_keys(&host_keys);
698                 break;
699
700         case FLOW_DISSECTOR_KEY_IPV6_ADDRS:
701                 memset(&host_keys.addrs.v6addrs.src, 0,
702                        sizeof(host_keys.addrs.v6addrs.src));
703                 dsthost_hash = flow_hash_from_keys(&host_keys);
704                 host_keys.addrs.v6addrs.src = keys.addrs.v6addrs.src;
705                 memset(&host_keys.addrs.v6addrs.dst, 0,
706                        sizeof(host_keys.addrs.v6addrs.dst));
707                 srchost_hash = flow_hash_from_keys(&host_keys);
708                 break;
709
710         default:
711                 dsthost_hash = 0;
712                 srchost_hash = 0;
713         }
714
715         /* This *must* be after the above switch, since as a
716          * side-effect it sorts the src and dst addresses.
717          */
718         if (hash_flows && !use_skbhash)
719                 flow_hash = flow_hash_from_keys(&keys);
720
721 skip_hash:
722         if (flow_override)
723                 flow_hash = flow_override - 1;
724         else if (use_skbhash && (flow_mode & CAKE_FLOW_FLOWS))
725                 flow_hash = skb->hash;
726         if (host_override) {
727                 dsthost_hash = host_override - 1;
728                 srchost_hash = host_override - 1;
729         }
730
731         if (!(flow_mode & CAKE_FLOW_FLOWS)) {
732                 if (flow_mode & CAKE_FLOW_SRC_IP)
733                         flow_hash ^= srchost_hash;
734
735                 if (flow_mode & CAKE_FLOW_DST_IP)
736                         flow_hash ^= dsthost_hash;
737         }
738
739         reduced_hash = flow_hash % CAKE_QUEUES;
740
741         /* set-associative hashing */
742         /* fast path if no hash collision (direct lookup succeeds) */
743         if (likely(q->tags[reduced_hash] == flow_hash &&
744                    q->flows[reduced_hash].set)) {
745                 q->way_directs++;
746         } else {
747                 u32 inner_hash = reduced_hash % CAKE_SET_WAYS;
748                 u32 outer_hash = reduced_hash - inner_hash;
749                 bool allocate_src = false;
750                 bool allocate_dst = false;
751                 u32 i, k;
752
753                 /* check if any active queue in the set is reserved for
754                  * this flow.
755                  */
756                 for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
757                      i++, k = (k + 1) % CAKE_SET_WAYS) {
758                         if (q->tags[outer_hash + k] == flow_hash) {
759                                 if (i)
760                                         q->way_hits++;
761
762                                 if (!q->flows[outer_hash + k].set) {
763                                         /* need to increment host refcnts */
764                                         allocate_src = cake_dsrc(flow_mode);
765                                         allocate_dst = cake_ddst(flow_mode);
766                                 }
767
768                                 goto found;
769                         }
770                 }
771
772                 /* no queue is reserved for this flow, look for an
773                  * empty one.
774                  */
775                 for (i = 0; i < CAKE_SET_WAYS;
776                          i++, k = (k + 1) % CAKE_SET_WAYS) {
777                         if (!q->flows[outer_hash + k].set) {
778                                 q->way_misses++;
779                                 allocate_src = cake_dsrc(flow_mode);
780                                 allocate_dst = cake_ddst(flow_mode);
781                                 goto found;
782                         }
783                 }
784
785                 /* With no empty queues, default to the original
786                  * queue, accept the collision, update the host tags.
787                  */
788                 q->way_collisions++;
789                 if (q->flows[outer_hash + k].set == CAKE_SET_BULK) {
790                         q->hosts[q->flows[reduced_hash].srchost].srchost_bulk_flow_count--;
791                         q->hosts[q->flows[reduced_hash].dsthost].dsthost_bulk_flow_count--;
792                 }
793                 allocate_src = cake_dsrc(flow_mode);
794                 allocate_dst = cake_ddst(flow_mode);
795 found:
796                 /* reserve queue for future packets in same flow */
797                 reduced_hash = outer_hash + k;
798                 q->tags[reduced_hash] = flow_hash;
799
800                 if (allocate_src) {
801                         srchost_idx = srchost_hash % CAKE_QUEUES;
802                         inner_hash = srchost_idx % CAKE_SET_WAYS;
803                         outer_hash = srchost_idx - inner_hash;
804                         for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
805                                 i++, k = (k + 1) % CAKE_SET_WAYS) {
806                                 if (q->hosts[outer_hash + k].srchost_tag ==
807                                     srchost_hash)
808                                         goto found_src;
809                         }
810                         for (i = 0; i < CAKE_SET_WAYS;
811                                 i++, k = (k + 1) % CAKE_SET_WAYS) {
812                                 if (!q->hosts[outer_hash + k].srchost_bulk_flow_count)
813                                         break;
814                         }
815                         q->hosts[outer_hash + k].srchost_tag = srchost_hash;
816 found_src:
817                         srchost_idx = outer_hash + k;
818                         if (q->flows[reduced_hash].set == CAKE_SET_BULK)
819                                 q->hosts[srchost_idx].srchost_bulk_flow_count++;
820                         q->flows[reduced_hash].srchost = srchost_idx;
821                 }
822
823                 if (allocate_dst) {
824                         dsthost_idx = dsthost_hash % CAKE_QUEUES;
825                         inner_hash = dsthost_idx % CAKE_SET_WAYS;
826                         outer_hash = dsthost_idx - inner_hash;
827                         for (i = 0, k = inner_hash; i < CAKE_SET_WAYS;
828                              i++, k = (k + 1) % CAKE_SET_WAYS) {
829                                 if (q->hosts[outer_hash + k].dsthost_tag ==
830                                     dsthost_hash)
831                                         goto found_dst;
832                         }
833                         for (i = 0; i < CAKE_SET_WAYS;
834                              i++, k = (k + 1) % CAKE_SET_WAYS) {
835                                 if (!q->hosts[outer_hash + k].dsthost_bulk_flow_count)
836                                         break;
837                         }
838                         q->hosts[outer_hash + k].dsthost_tag = dsthost_hash;
839 found_dst:
840                         dsthost_idx = outer_hash + k;
841                         if (q->flows[reduced_hash].set == CAKE_SET_BULK)
842                                 q->hosts[dsthost_idx].dsthost_bulk_flow_count++;
843                         q->flows[reduced_hash].dsthost = dsthost_idx;
844                 }
845         }
846
847         return reduced_hash;
848 }
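/* Set-associative lookup example: with CAKE_QUEUES = 1024 and
 * CAKE_SET_WAYS = 8 there are 128 sets of eight ways.  A packet whose
 * reduced_hash is 27 lands in the set covering queues 24..31
 * (outer_hash = 24, inner_hash = 3), and the probes above visit
 * 27, 28, 29, 30, 31, 24, 25, 26 -- first looking for the flow's own tag,
 * then for an unused way, before finally accepting a collision.
 */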
849
850 /* helper functions: might be changed when/if skb uses a standard list_head */
851 /* remove one skb from head of slot queue */
852
853 static struct sk_buff *dequeue_head(struct cake_flow *flow)
854 {
855         struct sk_buff *skb = flow->head;
856
857         if (skb) {
858                 flow->head = skb->next;
859                 skb_mark_not_on_list(skb);
860         }
861
862         return skb;
863 }
864
865 /* add skb to flow queue (tail add) */
866
867 static void flow_queue_add(struct cake_flow *flow, struct sk_buff *skb)
868 {
869         if (!flow->head)
870                 flow->head = skb;
871         else
872                 flow->tail->next = skb;
873         flow->tail = skb;
874         skb->next = NULL;
875 }
876
877 static struct iphdr *cake_get_iphdr(const struct sk_buff *skb,
878                                     struct ipv6hdr *buf)
879 {
880         unsigned int offset = skb_network_offset(skb);
881         struct iphdr *iph;
882
883         iph = skb_header_pointer(skb, offset, sizeof(struct iphdr), buf);
884
885         if (!iph)
886                 return NULL;
887
888         if (iph->version == 4 && iph->protocol == IPPROTO_IPV6)
889                 return skb_header_pointer(skb, offset + iph->ihl * 4,
890                                           sizeof(struct ipv6hdr), buf);
891
892         else if (iph->version == 4)
893                 return iph;
894
895         else if (iph->version == 6)
896                 return skb_header_pointer(skb, offset, sizeof(struct ipv6hdr),
897                                           buf);
898
899         return NULL;
900 }
901
902 static struct tcphdr *cake_get_tcphdr(const struct sk_buff *skb,
903                                       void *buf, unsigned int bufsize)
904 {
905         unsigned int offset = skb_network_offset(skb);
906         const struct ipv6hdr *ipv6h;
907         const struct tcphdr *tcph;
908         const struct iphdr *iph;
909         struct ipv6hdr _ipv6h;
910         struct tcphdr _tcph;
911
912         ipv6h = skb_header_pointer(skb, offset, sizeof(_ipv6h), &_ipv6h);
913
914         if (!ipv6h)
915                 return NULL;
916
917         if (ipv6h->version == 4) {
918                 iph = (struct iphdr *)ipv6h;
919                 offset += iph->ihl * 4;
920
921                 /* special-case 6in4 tunnelling, as that is a common way to get
922                  * v6 connectivity in the home
923                  */
924                 if (iph->protocol == IPPROTO_IPV6) {
925                         ipv6h = skb_header_pointer(skb, offset,
926                                                    sizeof(_ipv6h), &_ipv6h);
927
928                         if (!ipv6h || ipv6h->nexthdr != IPPROTO_TCP)
929                                 return NULL;
930
931                         offset += sizeof(struct ipv6hdr);
932
933                 } else if (iph->protocol != IPPROTO_TCP) {
934                         return NULL;
935                 }
936
937         } else if (ipv6h->version == 6) {
938                 if (ipv6h->nexthdr != IPPROTO_TCP)
939                         return NULL;
940
941                 offset += sizeof(struct ipv6hdr);
942         } else {
943                 return NULL;
944         }
945
946         tcph = skb_header_pointer(skb, offset, sizeof(_tcph), &_tcph);
947         if (!tcph || tcph->doff < 5)
948                 return NULL;
949
950         return skb_header_pointer(skb, offset,
951                                   min(__tcp_hdrlen(tcph), bufsize), buf);
952 }
953
954 static const void *cake_get_tcpopt(const struct tcphdr *tcph,
955                                    int code, int *oplen)
956 {
957         /* inspired by tcp_parse_options in tcp_input.c */
958         int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
959         const u8 *ptr = (const u8 *)(tcph + 1);
960
961         while (length > 0) {
962                 int opcode = *ptr++;
963                 int opsize;
964
965                 if (opcode == TCPOPT_EOL)
966                         break;
967                 if (opcode == TCPOPT_NOP) {
968                         length--;
969                         continue;
970                 }
971                 if (length < 2)
972                         break;
973                 opsize = *ptr++;
974                 if (opsize < 2 || opsize > length)
975                         break;
976
977                 if (opcode == code) {
978                         *oplen = opsize;
979                         return ptr;
980                 }
981
982                 ptr += opsize - 2;
983                 length -= opsize;
984         }
985
986         return NULL;
987 }
988
989 /* Compare two SACK sequences. A sequence is considered greater if it SACKs more
990  * bytes than the other. In the case where both sequences ACK bytes that the
991  * other doesn't, A is considered greater. DSACKs in A also make A
992  * considered greater.
993  *
994  * @return -1, 0 or 1 as normal compare functions
995  */
996 static int cake_tcph_sack_compare(const struct tcphdr *tcph_a,
997                                   const struct tcphdr *tcph_b)
998 {
999         const struct tcp_sack_block_wire *sack_a, *sack_b;
1000         u32 ack_seq_a = ntohl(tcph_a->ack_seq);
1001         u32 bytes_a = 0, bytes_b = 0;
1002         int oplen_a, oplen_b;
1003         bool first = true;
1004
1005         sack_a = cake_get_tcpopt(tcph_a, TCPOPT_SACK, &oplen_a);
1006         sack_b = cake_get_tcpopt(tcph_b, TCPOPT_SACK, &oplen_b);
1007
1008         /* pointers point to option contents */
1009         oplen_a -= TCPOLEN_SACK_BASE;
1010         oplen_b -= TCPOLEN_SACK_BASE;
1011
1012         if (sack_a && oplen_a >= sizeof(*sack_a) &&
1013             (!sack_b || oplen_b < sizeof(*sack_b)))
1014                 return -1;
1015         else if (sack_b && oplen_b >= sizeof(*sack_b) &&
1016                  (!sack_a || oplen_a < sizeof(*sack_a)))
1017                 return 1;
1018         else if ((!sack_a || oplen_a < sizeof(*sack_a)) &&
1019                  (!sack_b || oplen_b < sizeof(*sack_b)))
1020                 return 0;
1021
1022         while (oplen_a >= sizeof(*sack_a)) {
1023                 const struct tcp_sack_block_wire *sack_tmp = sack_b;
1024                 u32 start_a = get_unaligned_be32(&sack_a->start_seq);
1025                 u32 end_a = get_unaligned_be32(&sack_a->end_seq);
1026                 int oplen_tmp = oplen_b;
1027                 bool found = false;
1028
1029                 /* DSACK; always considered greater to prevent dropping */
1030                 if (before(start_a, ack_seq_a))
1031                         return -1;
1032
1033                 bytes_a += end_a - start_a;
1034
1035                 while (oplen_tmp >= sizeof(*sack_tmp)) {
1036                         u32 start_b = get_unaligned_be32(&sack_tmp->start_seq);
1037                         u32 end_b = get_unaligned_be32(&sack_tmp->end_seq);
1038
1039                         /* first time through we count the total size */
1040                         if (first)
1041                                 bytes_b += end_b - start_b;
1042
1043                         if (!after(start_b, start_a) && !before(end_b, end_a)) {
1044                                 found = true;
1045                                 if (!first)
1046                                         break;
1047                         }
1048                         oplen_tmp -= sizeof(*sack_tmp);
1049                         sack_tmp++;
1050                 }
1051
1052                 if (!found)
1053                         return -1;
1054
1055                 oplen_a -= sizeof(*sack_a);
1056                 sack_a++;
1057                 first = false;
1058         }
1059
1060         /* If we made it this far, all ranges SACKed by A are covered by B, so
1061          * either the SACKs are equal, or B SACKs more bytes.
1062          */
1063         return bytes_b > bytes_a ? 1 : 0;
1064 }
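/* Example: if ACK A carries the SACK block 100-200 while ACK B carries
 * 100-300, every range SACKed by A is covered by B and B covers more bytes,
 * so the function returns 1 (A may be dropped in favour of B).  If A instead
 * carried 400-500 (and is not a DSACK), no block in B covers it and the
 * function returns -1.
 */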
1065
1066 static void cake_tcph_get_tstamp(const struct tcphdr *tcph,
1067                                  u32 *tsval, u32 *tsecr)
1068 {
1069         const u8 *ptr;
1070         int opsize;
1071
1072         ptr = cake_get_tcpopt(tcph, TCPOPT_TIMESTAMP, &opsize);
1073
1074         if (ptr && opsize == TCPOLEN_TIMESTAMP) {
1075                 *tsval = get_unaligned_be32(ptr);
1076                 *tsecr = get_unaligned_be32(ptr + 4);
1077         }
1078 }
1079
1080 static bool cake_tcph_may_drop(const struct tcphdr *tcph,
1081                                u32 tstamp_new, u32 tsecr_new)
1082 {
1083         /* inspired by tcp_parse_options in tcp_input.c */
1084         int length = __tcp_hdrlen(tcph) - sizeof(struct tcphdr);
1085         const u8 *ptr = (const u8 *)(tcph + 1);
1086         u32 tstamp, tsecr;
1087
1088         /* 3 reserved flags must be unset to avoid future breakage
1089          * ACK must be set
1090          * ECE/CWR are handled separately
1091          * All other flags URG/PSH/RST/SYN/FIN must be unset
1092          * 0x0FFF0000 = all TCP flags (confirm ACK=1, others zero)
1093          * 0x00C00000 = CWR/ECE (handled separately)
1094          * 0x0F3F0000 = 0x0FFF0000 & ~0x00C00000
1095          */
1096         if (((tcp_flag_word(tcph) &
1097               cpu_to_be32(0x0F3F0000)) != TCP_FLAG_ACK))
1098                 return false;
1099
1100         while (length > 0) {
1101                 int opcode = *ptr++;
1102                 int opsize;
1103
1104                 if (opcode == TCPOPT_EOL)
1105                         break;
1106                 if (opcode == TCPOPT_NOP) {
1107                         length--;
1108                         continue;
1109                 }
1110                 if (length < 2)
1111                         break;
1112                 opsize = *ptr++;
1113                 if (opsize < 2 || opsize > length)
1114                         break;
1115
1116                 switch (opcode) {
1117                 case TCPOPT_MD5SIG: /* doesn't influence state */
1118                         break;
1119
1120                 case TCPOPT_SACK: /* stricter checking performed later */
1121                         if (opsize % 8 != 2)
1122                                 return false;
1123                         break;
1124
1125                 case TCPOPT_TIMESTAMP:
1126                         /* only drop timestamps lower than new */
1127                         if (opsize != TCPOLEN_TIMESTAMP)
1128                                 return false;
1129                         tstamp = get_unaligned_be32(ptr);
1130                         tsecr = get_unaligned_be32(ptr + 4);
1131                         if (after(tstamp, tstamp_new) ||
1132                             after(tsecr, tsecr_new))
1133                                 return false;
1134                         break;
1135
1136                 case TCPOPT_MSS:  /* these should only be set on SYN */
1137                 case TCPOPT_WINDOW:
1138                 case TCPOPT_SACK_PERM:
1139                 case TCPOPT_FASTOPEN:
1140                 case TCPOPT_EXP:
1141                 default: /* don't drop if any unknown options are present */
1142                         return false;
1143                 }
1144
1145                 ptr += opsize - 2;
1146                 length -= opsize;
1147         }
1148
1149         return true;
1150 }
1151
1152 static struct sk_buff *cake_ack_filter(struct cake_sched_data *q,
1153                                        struct cake_flow *flow)
1154 {
1155         bool aggressive = q->ack_filter == CAKE_ACK_AGGRESSIVE;
1156         struct sk_buff *elig_ack = NULL, *elig_ack_prev = NULL;
1157         struct sk_buff *skb_check, *skb_prev = NULL;
1158         const struct ipv6hdr *ipv6h, *ipv6h_check;
1159         unsigned char _tcph[64], _tcph_check[64];
1160         const struct tcphdr *tcph, *tcph_check;
1161         const struct iphdr *iph, *iph_check;
1162         struct ipv6hdr _iph, _iph_check;
1163         const struct sk_buff *skb;
1164         int seglen, num_found = 0;
1165         u32 tstamp = 0, tsecr = 0;
1166         __be32 elig_flags = 0;
1167         int sack_comp;
1168
1169         /* no other possible ACKs to filter */
1170         if (flow->head == flow->tail)
1171                 return NULL;
1172
1173         skb = flow->tail;
1174         tcph = cake_get_tcphdr(skb, _tcph, sizeof(_tcph));
1175         iph = cake_get_iphdr(skb, &_iph);
1176         if (!tcph)
1177                 return NULL;
1178
1179         cake_tcph_get_tstamp(tcph, &tstamp, &tsecr);
1180
1181         /* the 'triggering' packet need only have the ACK flag set.
1182          * also check that SYN is not set, as there won't be any previous ACKs.
1183          */
1184         if ((tcp_flag_word(tcph) &
1185              (TCP_FLAG_ACK | TCP_FLAG_SYN)) != TCP_FLAG_ACK)
1186                 return NULL;
1187
1188         /* the 'triggering' ACK is at the tail of the queue, we have already
1189          * returned if it is the only packet in the flow. loop through the rest
1190          * of the queue looking for pure ACKs with the same 5-tuple as the
1191          * triggering one.
1192          */
1193         for (skb_check = flow->head;
1194              skb_check && skb_check != skb;
1195              skb_prev = skb_check, skb_check = skb_check->next) {
1196                 iph_check = cake_get_iphdr(skb_check, &_iph_check);
1197                 tcph_check = cake_get_tcphdr(skb_check, &_tcph_check,
1198                                              sizeof(_tcph_check));
1199
1200                 /* only TCP packets with matching 5-tuple are eligible, and only
1201                  * drop safe headers
1202                  */
1203                 if (!tcph_check || iph->version != iph_check->version ||
1204                     tcph_check->source != tcph->source ||
1205                     tcph_check->dest != tcph->dest)
1206                         continue;
1207
1208                 if (iph_check->version == 4) {
1209                         if (iph_check->saddr != iph->saddr ||
1210                             iph_check->daddr != iph->daddr)
1211                                 continue;
1212
1213                         seglen = iph_totlen(skb, iph_check) -
1214                                        (4 * iph_check->ihl);
1215                 } else if (iph_check->version == 6) {
1216                         ipv6h = (struct ipv6hdr *)iph;
1217                         ipv6h_check = (struct ipv6hdr *)iph_check;
1218
1219                         if (ipv6_addr_cmp(&ipv6h_check->saddr, &ipv6h->saddr) ||
1220                             ipv6_addr_cmp(&ipv6h_check->daddr, &ipv6h->daddr))
1221                                 continue;
1222
1223                         seglen = ntohs(ipv6h_check->payload_len);
1224                 } else {
1225                         WARN_ON(1);  /* shouldn't happen */
1226                         continue;
1227                 }
1228
1229                 /* If the ECE/CWR flags changed from the previous eligible
1230                  * packet in the same flow, we should no longer be dropping that
1231                  * previous packet as this would lose information.
1232                  */
1233                 if (elig_ack && (tcp_flag_word(tcph_check) &
1234                                  (TCP_FLAG_ECE | TCP_FLAG_CWR)) != elig_flags) {
1235                         elig_ack = NULL;
1236                         elig_ack_prev = NULL;
1237                         num_found--;
1238                 }
1239
1240                 /* Check TCP options and flags, don't drop ACKs with segment
1241                  * data, and don't drop ACKs with a higher cumulative ACK
1242                  * counter than the triggering packet. Check ACK seqno here to
1243                  * avoid parsing SACK options of packets we are going to exclude
1244                  * anyway.
1245                  */
1246                 if (!cake_tcph_may_drop(tcph_check, tstamp, tsecr) ||
1247                     (seglen - __tcp_hdrlen(tcph_check)) != 0 ||
1248                     after(ntohl(tcph_check->ack_seq), ntohl(tcph->ack_seq)))
1249                         continue;
1250
1251                 /* Check SACK options. The triggering packet must SACK more data
1252                  * than the ACK under consideration, or SACK the same range but
1253                  * have a larger cumulative ACK counter. The latter is a
1254                  * pathological case, but is contained in the following check
1255                  * anyway, just to be safe.
1256                  */
1257                 sack_comp = cake_tcph_sack_compare(tcph_check, tcph);
1258
1259                 if (sack_comp < 0 ||
1260                     (ntohl(tcph_check->ack_seq) == ntohl(tcph->ack_seq) &&
1261                      sack_comp == 0))
1262                         continue;
1263
1264                 /* At this point we have found an eligible pure ACK to drop; if
1265                  * we are in aggressive mode, we are done. Otherwise, keep
1266                  * searching unless this is the second eligible ACK we
1267                  * found.
1268                  *
1269                  * Since we want to drop ACK closest to the head of the queue,
1270                  * save the first eligible ACK we find, even if we need to loop
1271                  * again.
1272                  */
1273                 if (!elig_ack) {
1274                         elig_ack = skb_check;
1275                         elig_ack_prev = skb_prev;
1276                         elig_flags = (tcp_flag_word(tcph_check)
1277                                       & (TCP_FLAG_ECE | TCP_FLAG_CWR));
1278                 }
1279
1280                 if (num_found++ > 0)
1281                         goto found;
1282         }
1283
1284         /* We made it through the queue without finding two eligible ACKs. If
1285          * we found a single eligible ACK we can drop it in aggressive mode if
1286          * we can guarantee that this does not interfere with ECN flag
1287          * information. We ensure this by dropping it only if the enqueued
1288          * packet is consecutive with the eligible ACK, and their flags match.
1289          */
1290         if (elig_ack && aggressive && elig_ack->next == skb &&
1291             (elig_flags == (tcp_flag_word(tcph) &
1292                             (TCP_FLAG_ECE | TCP_FLAG_CWR))))
1293                 goto found;
1294
1295         return NULL;
1296
1297 found:
1298         if (elig_ack_prev)
1299                 elig_ack_prev->next = elig_ack->next;
1300         else
1301                 flow->head = elig_ack->next;
1302
1303         skb_mark_not_on_list(elig_ack);
1304
1305         return elig_ack;
1306 }
1307
1308 static u64 cake_ewma(u64 avg, u64 sample, u32 shift)
1309 {
1310         avg -= avg >> shift;
1311         avg += sample >> shift;
1312         return avg;
1313 }
1314
1315 static u32 cake_calc_overhead(struct cake_sched_data *q, u32 len, u32 off)
1316 {
1317         if (q->rate_flags & CAKE_FLAG_OVERHEAD)
1318                 len -= off;
1319
1320         if (q->max_netlen < len)
1321                 q->max_netlen = len;
1322         if (q->min_netlen > len)
1323                 q->min_netlen = len;
1324
1325         len += q->rate_overhead;
1326
1327         if (len < q->rate_mpu)
1328                 len = q->rate_mpu;
1329
1330         if (q->atm_mode == CAKE_ATM_ATM) {
1331                 len += 47;
1332                 len /= 48;
1333                 len *= 53;
1334         } else if (q->atm_mode == CAKE_ATM_PTM) {
1335                 /* Add one byte per 64 bytes or part thereof.
1336                  * This is conservative and easier to calculate than the
1337                  * precise value.
1338                  */
1339                 len += (len + 63) / 64;
1340         }
1341
1342         if (q->max_adjlen < len)
1343                 q->max_adjlen = len;
1344         if (q->min_adjlen > len)
1345                 q->min_adjlen = len;
1346
1347         return len;
1348 }
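/* Worked example of the framing compensation: a packet charged at 200 bytes
 * after the overhead adjustment needs ceil(200 / 48) = 5 ATM cells, and
 * (200 + 47) / 48 * 53 = 265 bytes is what the shaper accounts for in
 * CAKE_ATM_ATM mode.  In CAKE_ATM_PTM mode the same packet is charged
 * 200 + (200 + 63) / 64 = 204 bytes to cover 64b/65b encoding.
 */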
1349
1350 static u32 cake_overhead(struct cake_sched_data *q, const struct sk_buff *skb)
1351 {
1352         const struct skb_shared_info *shinfo = skb_shinfo(skb);
1353         unsigned int hdr_len, last_len = 0;
1354         u32 off = skb_network_offset(skb);
1355         u32 len = qdisc_pkt_len(skb);
1356         u16 segs = 1;
1357
1358         q->avg_netoff = cake_ewma(q->avg_netoff, off << 16, 8);
1359
1360         if (!shinfo->gso_size)
1361                 return cake_calc_overhead(q, len, off);
1362
1363         /* borrowed from qdisc_pkt_len_init() */
1364         hdr_len = skb_transport_offset(skb);
1365
1366         /* + transport layer */
1367         if (likely(shinfo->gso_type & (SKB_GSO_TCPV4 |
1368                                                 SKB_GSO_TCPV6))) {
1369                 const struct tcphdr *th;
1370                 struct tcphdr _tcphdr;
1371
1372                 th = skb_header_pointer(skb, hdr_len,
1373                                         sizeof(_tcphdr), &_tcphdr);
1374                 if (likely(th))
1375                         hdr_len += __tcp_hdrlen(th);
1376         } else {
1377                 struct udphdr _udphdr;
1378
1379                 if (skb_header_pointer(skb, hdr_len,
1380                                        sizeof(_udphdr), &_udphdr))
1381                         hdr_len += sizeof(struct udphdr);
1382         }
1383
1384         if (unlikely(shinfo->gso_type & SKB_GSO_DODGY))
1385                 segs = DIV_ROUND_UP(skb->len - hdr_len,
1386                                     shinfo->gso_size);
1387         else
1388                 segs = shinfo->gso_segs;
1389
1390         len = shinfo->gso_size + hdr_len;
1391         last_len = skb->len - shinfo->gso_size * (segs - 1);
1392
1393         return (cake_calc_overhead(q, len, off) * (segs - 1) +
1394                 cake_calc_overhead(q, last_len, off));
1395 }
1396
1397 static void cake_heap_swap(struct cake_sched_data *q, u16 i, u16 j)
1398 {
1399         struct cake_heap_entry ii = q->overflow_heap[i];
1400         struct cake_heap_entry jj = q->overflow_heap[j];
1401
1402         q->overflow_heap[i] = jj;
1403         q->overflow_heap[j] = ii;
1404
1405         q->tins[ii.t].overflow_idx[ii.b] = j;
1406         q->tins[jj.t].overflow_idx[jj.b] = i;
1407 }
1408
1409 static u32 cake_heap_get_backlog(const struct cake_sched_data *q, u16 i)
1410 {
1411         struct cake_heap_entry ii = q->overflow_heap[i];
1412
1413         return q->tins[ii.t].backlogs[ii.b];
1414 }
1415
1416 static void cake_heapify(struct cake_sched_data *q, u16 i)
1417 {
1418         static const u32 a = CAKE_MAX_TINS * CAKE_QUEUES;
1419         u32 mb = cake_heap_get_backlog(q, i);
1420         u32 m = i;
1421
1422         while (m < a) {
1423                 u32 l = m + m + 1;
1424                 u32 r = l + 1;
1425
1426                 if (l < a) {
1427                         u32 lb = cake_heap_get_backlog(q, l);
1428
1429                         if (lb > mb) {
1430                                 m  = l;
1431                                 mb = lb;
1432                         }
1433                 }
1434
1435                 if (r < a) {
1436                         u32 rb = cake_heap_get_backlog(q, r);
1437
1438                         if (rb > mb) {
1439                                 m  = r;
1440                                 mb = rb;
1441                         }
1442                 }
1443
1444                 if (m != i) {
1445                         cake_heap_swap(q, i, m);
1446                         i = m;
1447                 } else {
1448                         break;
1449                 }
1450         }
1451 }
1452
1453 static void cake_heapify_up(struct cake_sched_data *q, u16 i)
1454 {
1455         while (i > 0 && i < CAKE_MAX_TINS * CAKE_QUEUES) {
1456                 u16 p = (i - 1) >> 1;
1457                 u32 ib = cake_heap_get_backlog(q, i);
1458                 u32 pb = cake_heap_get_backlog(q, p);
1459
1460                 if (ib > pb) {
1461                         cake_heap_swap(q, i, p);
1462                         i = p;
1463                 } else {
1464                         break;
1465                 }
1466         }
1467 }
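/* Both helpers above rely on the usual array-heap index relations: node i has
 * children 2i + 1 and 2i + 2 (cake_heapify) and parent (i - 1) / 2
 * (cake_heapify_up); e.g. node 5 has parent 2 and children 11 and 12.
 */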
1468
1469 static int cake_advance_shaper(struct cake_sched_data *q,
1470                                struct cake_tin_data *b,
1471                                struct sk_buff *skb,
1472                                ktime_t now, bool drop)
1473 {
1474         u32 len = get_cobalt_cb(skb)->adjusted_len;
1475
1476         /* charge packet bandwidth to this tin
1477          * and to the global shaper.
1478          */
1479         if (q->rate_ns) {
1480                 u64 tin_dur = (len * b->tin_rate_ns) >> b->tin_rate_shft;
1481                 u64 global_dur = (len * q->rate_ns) >> q->rate_shft;
1482                 u64 failsafe_dur = global_dur + (global_dur >> 1);
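                /* Note: failsafe_dur is global_dur plus half of itself,
                 * i.e. 1.5x the shaped packet duration.
                 */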
1483
1484                 if (ktime_before(b->time_next_packet, now))
1485                         b->time_next_packet = ktime_add_ns(b->time_next_packet,
1486                                                            tin_dur);
1487
1488                 else if (ktime_before(b->time_next_packet,
1489                                       ktime_add_ns(now, tin_dur)))
1490                         b->time_next_packet = ktime_add_ns(now, tin_dur);
1491
1492                 q->time_next_packet = ktime_add_ns(q->time_next_packet,
1493                                                    global_dur);
1494                 if (!drop)
1495                         q->failsafe_next_packet = \
1496                                 ktime_add_ns(q->failsafe_next_packet,
1497                                              failsafe_dur);
1498         }
1499         return len;
1500 }
1501
1502 static unsigned int cake_drop(struct Qdisc *sch, struct sk_buff **to_free)
1503 {
1504         struct cake_sched_data *q = qdisc_priv(sch);
1505         ktime_t now = ktime_get();
1506         u32 idx = 0, tin = 0, len;
1507         struct cake_heap_entry qq;
1508         struct cake_tin_data *b;
1509         struct cake_flow *flow;
1510         struct sk_buff *skb;
1511
1512         if (!q->overflow_timeout) {
1513                 int i;
1514                 /* Build fresh max-heap */
1515                 for (i = CAKE_MAX_TINS * CAKE_QUEUES / 2; i >= 0; i--)
1516                         cake_heapify(q, i);
1517         }
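        /* Sifting down every index from n/2 to the root (the loop above) is
         * the standard O(n) heap build; deeper indices are all leaves and
         * need no sifting.
         */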
1518         q->overflow_timeout = 65535;
1519
1520         /* select longest queue for pruning */
1521         qq  = q->overflow_heap[0];
1522         tin = qq.t;
1523         idx = qq.b;
1524
1525         b = &q->tins[tin];
1526         flow = &b->flows[idx];
1527         skb = dequeue_head(flow);
1528         if (unlikely(!skb)) {
1529                 /* heap has gone wrong, rebuild it next time */
1530                 q->overflow_timeout = 0;
1531                 return idx + (tin << 16);
1532         }
1533
1534         if (cobalt_queue_full(&flow->cvars, &b->cparams, now))
1535                 b->unresponsive_flow_count++;
1536
1537         len = qdisc_pkt_len(skb);
1538         q->buffer_used      -= skb->truesize;
1539         b->backlogs[idx]    -= len;
1540         b->tin_backlog      -= len;
1541         sch->qstats.backlog -= len;
1542         qdisc_tree_reduce_backlog(sch, 1, len);
1543
1544         flow->dropped++;
1545         b->tin_dropped++;
1546         sch->qstats.drops++;
1547
1548         if (q->rate_flags & CAKE_FLAG_INGRESS)
1549                 cake_advance_shaper(q, b, skb, now, true);
1550
1551         __qdisc_drop(skb, to_free);
1552         sch->q.qlen--;
1553
1554         cake_heapify(q, 0);
1555
1556         return idx + (tin << 16);
1557 }
1558
1559 static u8 cake_handle_diffserv(struct sk_buff *skb, bool wash)
1560 {
1561         const int offset = skb_network_offset(skb);
1562         u16 *buf, buf_;
1563         u8 dscp;
1564
1565         switch (skb_protocol(skb, true)) {
1566         case htons(ETH_P_IP):
1567                 buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1568                 if (unlikely(!buf))
1569                         return 0;
1570
1571                 /* ToS is in the second byte of iphdr */
1572                 dscp = ipv4_get_dsfield((struct iphdr *)buf) >> 2;
1573
1574                 if (wash && dscp) {
1575                         const int wlen = offset + sizeof(struct iphdr);
1576
1577                         if (!pskb_may_pull(skb, wlen) ||
1578                             skb_try_make_writable(skb, wlen))
1579                                 return 0;
1580
1581                         ipv4_change_dsfield(ip_hdr(skb), INET_ECN_MASK, 0);
1582                 }
1583
1584                 return dscp;
1585
1586         case htons(ETH_P_IPV6):
1587                 buf = skb_header_pointer(skb, offset, sizeof(buf_), &buf_);
1588                 if (unlikely(!buf))
1589                         return 0;
1590
1591                 /* Traffic class is in the first and second bytes of ipv6hdr */
1592                 dscp = ipv6_get_dsfield((struct ipv6hdr *)buf) >> 2;
1593
1594                 if (wash && dscp) {
1595                         const int wlen = offset + sizeof(struct ipv6hdr);
1596
1597                         if (!pskb_may_pull(skb, wlen) ||
1598                             skb_try_make_writable(skb, wlen))
1599                                 return 0;
1600
1601                         ipv6_change_dsfield(ipv6_hdr(skb), INET_ECN_MASK, 0);
1602                 }
1603
1604                 return dscp;
1605
1606         case htons(ETH_P_ARP):
1607                 return 0x38;  /* CS7 - Net Control */
1608
1609         default:
1610                 /* If there is no Diffserv field, treat as best-effort */
1611                 return 0;
1612         }
1613 }
1614
1615 static struct cake_tin_data *cake_select_tin(struct Qdisc *sch,
1616                                              struct sk_buff *skb)
1617 {
1618         struct cake_sched_data *q = qdisc_priv(sch);
1619         u32 tin, mark;
1620         bool wash;
1621         u8 dscp;
1622
1623         /* Tin selection: default to Diffserv-based selection, but allow overrides
1624          * via firewall marks or skb->priority. Parse the DSCP early only if wash
1625          * is enabled; otherwise defer it until it is actually needed below.
1626          */
1627         mark = (skb->mark & q->fwmark_mask) >> q->fwmark_shft;
1628         wash = !!(q->rate_flags & CAKE_FLAG_WASH);
1629         if (wash)
1630                 dscp = cake_handle_diffserv(skb, wash);
1631
1632         if (q->tin_mode == CAKE_DIFFSERV_BESTEFFORT)
1633                 tin = 0;
1634
1635         else if (mark && mark <= q->tin_cnt)
1636                 tin = q->tin_order[mark - 1];
1637
1638         else if (TC_H_MAJ(skb->priority) == sch->handle &&
1639                  TC_H_MIN(skb->priority) > 0 &&
1640                  TC_H_MIN(skb->priority) <= q->tin_cnt)
1641                 tin = q->tin_order[TC_H_MIN(skb->priority) - 1];
1642
1643         else {
1644                 if (!wash)
1645                         dscp = cake_handle_diffserv(skb, wash);
1646                 tin = q->tin_index[dscp];
1647
1648                 if (unlikely(tin >= q->tin_cnt))
1649                         tin = 0;
1650         }
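        /* Illustrative example: with fwmark_mask 0xff00 (so fwmark_shft is 8,
         * per cake_change()), a packet carrying skb->mark 0x0200 yields
         * mark = 2 and is steered to q->tin_order[1].
         */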
1651
1652         return &q->tins[tin];
1653 }
1654
1655 static u32 cake_classify(struct Qdisc *sch, struct cake_tin_data **t,
1656                          struct sk_buff *skb, int flow_mode, int *qerr)
1657 {
1658         struct cake_sched_data *q = qdisc_priv(sch);
1659         struct tcf_proto *filter;
1660         struct tcf_result res;
1661         u16 flow = 0, host = 0;
1662         int result;
1663
1664         filter = rcu_dereference_bh(q->filter_list);
1665         if (!filter)
1666                 goto hash;
1667
1668         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
1669         result = tcf_classify(skb, NULL, filter, &res, false);
1670
1671         if (result >= 0) {
1672 #ifdef CONFIG_NET_CLS_ACT
1673                 switch (result) {
1674                 case TC_ACT_STOLEN:
1675                 case TC_ACT_QUEUED:
1676                 case TC_ACT_TRAP:
1677                         *qerr = NET_XMIT_SUCCESS | __NET_XMIT_STOLEN;
1678                         fallthrough;
1679                 case TC_ACT_SHOT:
1680                         return 0;
1681                 }
1682 #endif
1683                 if (TC_H_MIN(res.classid) <= CAKE_QUEUES)
1684                         flow = TC_H_MIN(res.classid);
1685                 if (TC_H_MAJ(res.classid) <= (CAKE_QUEUES << 16))
1686                         host = TC_H_MAJ(res.classid) >> 16;
1687         }
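        /* For example, a filter verdict of classid 5:3 sets flow = 3 and
         * host = 5 here, which are then handed to cake_hash(); minor/major
         * numbers above CAKE_QUEUES are ignored and flow/host stay 0.
         */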
1688 hash:
1689         *t = cake_select_tin(sch, skb);
1690         return cake_hash(*t, skb, flow_mode, flow, host) + 1;
1691 }
1692
1693 static void cake_reconfigure(struct Qdisc *sch);
1694
1695 static s32 cake_enqueue(struct sk_buff *skb, struct Qdisc *sch,
1696                         struct sk_buff **to_free)
1697 {
1698         struct cake_sched_data *q = qdisc_priv(sch);
1699         int len = qdisc_pkt_len(skb);
1700         int ret;
1701         struct sk_buff *ack = NULL;
1702         ktime_t now = ktime_get();
1703         struct cake_tin_data *b;
1704         struct cake_flow *flow;
1705         u32 idx;
1706
1707         /* choose flow to insert into */
1708         idx = cake_classify(sch, &b, skb, q->flow_mode, &ret);
1709         if (idx == 0) {
1710                 if (ret & __NET_XMIT_BYPASS)
1711                         qdisc_qstats_drop(sch);
1712                 __qdisc_drop(skb, to_free);
1713                 return ret;
1714         }
1715         idx--;
1716         flow = &b->flows[idx];
1717
1718         /* ensure shaper state isn't stale */
1719         if (!b->tin_backlog) {
1720                 if (ktime_before(b->time_next_packet, now))
1721                         b->time_next_packet = now;
1722
1723                 if (!sch->q.qlen) {
1724                         if (ktime_before(q->time_next_packet, now)) {
1725                                 q->failsafe_next_packet = now;
1726                                 q->time_next_packet = now;
1727                         } else if (ktime_after(q->time_next_packet, now) &&
1728                                    ktime_after(q->failsafe_next_packet, now)) {
1729                                 u64 next = \
1730                                         min(ktime_to_ns(q->time_next_packet),
1731                                             ktime_to_ns(
1732                                                    q->failsafe_next_packet));
1733                                 sch->qstats.overlimits++;
1734                                 qdisc_watchdog_schedule_ns(&q->watchdog, next);
1735                         }
1736                 }
1737         }
1738
1739         if (unlikely(len > b->max_skblen))
1740                 b->max_skblen = len;
1741
1742         if (skb_is_gso(skb) && q->rate_flags & CAKE_FLAG_SPLIT_GSO) {
1743                 struct sk_buff *segs, *nskb;
1744                 netdev_features_t features = netif_skb_features(skb);
1745                 unsigned int slen = 0, numsegs = 0;
1746
1747                 segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
1748                 if (IS_ERR_OR_NULL(segs))
1749                         return qdisc_drop(skb, sch, to_free);
1750
1751                 skb_list_walk_safe(segs, segs, nskb) {
1752                         skb_mark_not_on_list(segs);
1753                         qdisc_skb_cb(segs)->pkt_len = segs->len;
1754                         cobalt_set_enqueue_time(segs, now);
1755                         get_cobalt_cb(segs)->adjusted_len = cake_overhead(q,
1756                                                                           segs);
1757                         flow_queue_add(flow, segs);
1758
1759                         sch->q.qlen++;
1760                         numsegs++;
1761                         slen += segs->len;
1762                         q->buffer_used += segs->truesize;
1763                         b->packets++;
1764                 }
1765
1766                 /* stats */
1767                 b->bytes            += slen;
1768                 b->backlogs[idx]    += slen;
1769                 b->tin_backlog      += slen;
1770                 sch->qstats.backlog += slen;
1771                 q->avg_window_bytes += slen;
1772
1773                 qdisc_tree_reduce_backlog(sch, 1-numsegs, len-slen);
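                /* Accounting sketch (illustrative numbers): if one 4500-byte
                 * GSO skb became 3 segments totalling 4650 bytes, the call
                 * above passes (1 - 3, 4500 - 4650) = (-2, -150), telling
                 * parent qdiscs their qlen grew by 2 and backlog by 150 bytes.
                 */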
1774                 consume_skb(skb);
1775         } else {
1776                 /* not splitting */
1777                 cobalt_set_enqueue_time(skb, now);
1778                 get_cobalt_cb(skb)->adjusted_len = cake_overhead(q, skb);
1779                 flow_queue_add(flow, skb);
1780
1781                 if (q->ack_filter)
1782                         ack = cake_ack_filter(q, flow);
1783
1784                 if (ack) {
1785                         b->ack_drops++;
1786                         sch->qstats.drops++;
1787                         b->bytes += qdisc_pkt_len(ack);
1788                         len -= qdisc_pkt_len(ack);
1789                         q->buffer_used += skb->truesize - ack->truesize;
1790                         if (q->rate_flags & CAKE_FLAG_INGRESS)
1791                                 cake_advance_shaper(q, b, ack, now, true);
1792
1793                         qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(ack));
1794                         consume_skb(ack);
1795                 } else {
1796                         sch->q.qlen++;
1797                         q->buffer_used      += skb->truesize;
1798                 }
1799
1800                 /* stats */
1801                 b->packets++;
1802                 b->bytes            += len;
1803                 b->backlogs[idx]    += len;
1804                 b->tin_backlog      += len;
1805                 sch->qstats.backlog += len;
1806                 q->avg_window_bytes += len;
1807         }
1808
1809         if (q->overflow_timeout)
1810                 cake_heapify_up(q, b->overflow_idx[idx]);
1811
1812         /* incoming bandwidth capacity estimate */
1813         if (q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS) {
1814                 u64 packet_interval = \
1815                         ktime_to_ns(ktime_sub(now, q->last_packet_time));
1816
1817                 if (packet_interval > NSEC_PER_SEC)
1818                         packet_interval = NSEC_PER_SEC;
1819
1820                 /* filter out short-term bursts, eg. wifi aggregation */
1821                 q->avg_packet_interval = \
1822                         cake_ewma(q->avg_packet_interval,
1823                                   packet_interval,
1824                                   (packet_interval > q->avg_packet_interval ?
1825                                           2 : 8));
1826
1827                 q->last_packet_time = now;
1828
1829                 if (packet_interval > q->avg_packet_interval) {
1830                         u64 window_interval = \
1831                                 ktime_to_ns(ktime_sub(now,
1832                                                       q->avg_window_begin));
1833                         u64 b = q->avg_window_bytes * (u64)NSEC_PER_SEC;
1834
1835                         b = div64_u64(b, window_interval);
1836                         q->avg_peak_bandwidth =
1837                                 cake_ewma(q->avg_peak_bandwidth, b,
1838                                           b > q->avg_peak_bandwidth ? 2 : 8);
1839                         q->avg_window_bytes = 0;
1840                         q->avg_window_begin = now;
1841
1842                         if (ktime_after(now,
1843                                         ktime_add_ms(q->last_reconfig_time,
1844                                                      250))) {
1845                                 q->rate_bps = (q->avg_peak_bandwidth * 15) >> 4;
1846                                 cake_reconfigure(sch);
1847                         }
1848                 }
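                /* Illustrative numbers: 125,000 bytes seen over a 10 ms
                 * window gives b = 12,500,000 bytes/s; the EWMA uses the
                 * faster gain (shift 2) when the sample exceeds the average,
                 * and, at most every 250 ms, the shaper is reset to 15/16
                 * (~94%) of the estimated peak.
                 */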
1849         } else {
1850                 q->avg_window_bytes = 0;
1851                 q->last_packet_time = now;
1852         }
1853
1854         /* flowchain */
1855         if (!flow->set || flow->set == CAKE_SET_DECAYING) {
1856                 struct cake_host *srchost = &b->hosts[flow->srchost];
1857                 struct cake_host *dsthost = &b->hosts[flow->dsthost];
1858                 u16 host_load = 1;
1859
1860                 if (!flow->set) {
1861                         list_add_tail(&flow->flowchain, &b->new_flows);
1862                 } else {
1863                         b->decaying_flow_count--;
1864                         list_move_tail(&flow->flowchain, &b->new_flows);
1865                 }
1866                 flow->set = CAKE_SET_SPARSE;
1867                 b->sparse_flow_count++;
1868
1869                 if (cake_dsrc(q->flow_mode))
1870                         host_load = max(host_load, srchost->srchost_bulk_flow_count);
1871
1872                 if (cake_ddst(q->flow_mode))
1873                         host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
1874
1875                 flow->deficit = (b->flow_quantum *
1876                                  quantum_div[host_load]) >> 16;
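                /* quantum_div[] is filled with 65535 / i in cake_init(), so
                 * this is roughly flow_quantum / host_load; e.g. quantum 1514
                 * and host_load 4 gives (1514 * 16383) >> 16 = 378.
                 */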
1877         } else if (flow->set == CAKE_SET_SPARSE_WAIT) {
1878                 struct cake_host *srchost = &b->hosts[flow->srchost];
1879                 struct cake_host *dsthost = &b->hosts[flow->dsthost];
1880
1881                 /* This flow was empty and still counted as a sparse flow, but it is
1882                  * actually sitting in the bulk rotation; reclassify it as bulk.
1883                  */
1884                 flow->set = CAKE_SET_BULK;
1885                 b->sparse_flow_count--;
1886                 b->bulk_flow_count++;
1887
1888                 if (cake_dsrc(q->flow_mode))
1889                         srchost->srchost_bulk_flow_count++;
1890
1891                 if (cake_ddst(q->flow_mode))
1892                         dsthost->dsthost_bulk_flow_count++;
1893
1894         }
1895
1896         if (q->buffer_used > q->buffer_max_used)
1897                 q->buffer_max_used = q->buffer_used;
1898
1899         if (q->buffer_used > q->buffer_limit) {
1900                 u32 dropped = 0;
1901
1902                 while (q->buffer_used > q->buffer_limit) {
1903                         dropped++;
1904                         cake_drop(sch, to_free);
1905                 }
1906                 b->drop_overlimit += dropped;
1907         }
1908         return NET_XMIT_SUCCESS;
1909 }
1910
1911 static struct sk_buff *cake_dequeue_one(struct Qdisc *sch)
1912 {
1913         struct cake_sched_data *q = qdisc_priv(sch);
1914         struct cake_tin_data *b = &q->tins[q->cur_tin];
1915         struct cake_flow *flow = &b->flows[q->cur_flow];
1916         struct sk_buff *skb = NULL;
1917         u32 len;
1918
1919         if (flow->head) {
1920                 skb = dequeue_head(flow);
1921                 len = qdisc_pkt_len(skb);
1922                 b->backlogs[q->cur_flow] -= len;
1923                 b->tin_backlog           -= len;
1924                 sch->qstats.backlog      -= len;
1925                 q->buffer_used           -= skb->truesize;
1926                 sch->q.qlen--;
1927
1928                 if (q->overflow_timeout)
1929                         cake_heapify(q, b->overflow_idx[q->cur_flow]);
1930         }
1931         return skb;
1932 }
1933
1934 /* Discard leftover packets from a tin no longer in use. */
1935 static void cake_clear_tin(struct Qdisc *sch, u16 tin)
1936 {
1937         struct cake_sched_data *q = qdisc_priv(sch);
1938         struct sk_buff *skb;
1939
1940         q->cur_tin = tin;
1941         for (q->cur_flow = 0; q->cur_flow < CAKE_QUEUES; q->cur_flow++)
1942                 while (!!(skb = cake_dequeue_one(sch)))
1943                         kfree_skb(skb);
1944 }
1945
1946 static struct sk_buff *cake_dequeue(struct Qdisc *sch)
1947 {
1948         struct cake_sched_data *q = qdisc_priv(sch);
1949         struct cake_tin_data *b = &q->tins[q->cur_tin];
1950         struct cake_host *srchost, *dsthost;
1951         ktime_t now = ktime_get();
1952         struct cake_flow *flow;
1953         struct list_head *head;
1954         bool first_flow = true;
1955         struct sk_buff *skb;
1956         u16 host_load;
1957         u64 delay;
1958         u32 len;
1959
1960 begin:
1961         if (!sch->q.qlen)
1962                 return NULL;
1963
1964         /* global hard shaper */
1965         if (ktime_after(q->time_next_packet, now) &&
1966             ktime_after(q->failsafe_next_packet, now)) {
1967                 u64 next = min(ktime_to_ns(q->time_next_packet),
1968                                ktime_to_ns(q->failsafe_next_packet));
1969
1970                 sch->qstats.overlimits++;
1971                 qdisc_watchdog_schedule_ns(&q->watchdog, next);
1972                 return NULL;
1973         }
1974
1975         /* Choose a class to work on. */
1976         if (!q->rate_ns) {
1977                 /* In unlimited mode, can't rely on shaper timings, just balance
1978                  * with DRR
1979                  */
1980                 bool wrapped = false, empty = true;
1981
1982                 while (b->tin_deficit < 0 ||
1983                        !(b->sparse_flow_count + b->bulk_flow_count)) {
1984                         if (b->tin_deficit <= 0)
1985                                 b->tin_deficit += b->tin_quantum;
1986                         if (b->sparse_flow_count + b->bulk_flow_count)
1987                                 empty = false;
1988
1989                         q->cur_tin++;
1990                         b++;
1991                         if (q->cur_tin >= q->tin_cnt) {
1992                                 q->cur_tin = 0;
1993                                 b = q->tins;
1994
1995                                 if (wrapped) {
1996                                         /* It's possible for q->qlen to be
1997                                          * nonzero when we actually have no
1998                                          * packets anywhere.
1999                                          */
2000                                         if (empty)
2001                                                 return NULL;
2002                                 } else {
2003                                         wrapped = true;
2004                                 }
2005                         }
2006                 }
2007         } else {
2008                 /* In shaped mode, choose:
2009                  * - Highest-priority tin with queue and meeting schedule, or
2010                  * - The earliest-scheduled tin with queue.
2011                  */
2012                 ktime_t best_time = KTIME_MAX;
2013                 int tin, best_tin = 0;
2014
2015                 for (tin = 0; tin < q->tin_cnt; tin++) {
2016                         b = q->tins + tin;
2017                         if ((b->sparse_flow_count + b->bulk_flow_count) > 0) {
2018                                 ktime_t time_to_pkt = \
2019                                         ktime_sub(b->time_next_packet, now);
2020
2021                                 if (ktime_to_ns(time_to_pkt) <= 0 ||
2022                                     ktime_compare(time_to_pkt,
2023                                                   best_time) <= 0) {
2024                                         best_time = time_to_pkt;
2025                                         best_tin = tin;
2026                                 }
2027                         }
2028                 }
2029
2030                 q->cur_tin = best_tin;
2031                 b = q->tins + best_tin;
2032
2033                 /* No point in going further if no packets to deliver. */
2034                 if (unlikely(!(b->sparse_flow_count + b->bulk_flow_count)))
2035                         return NULL;
2036         }
2037
2038 retry:
2039         /* service this class */
2040         head = &b->decaying_flows;
2041         if (!first_flow || list_empty(head)) {
2042                 head = &b->new_flows;
2043                 if (list_empty(head)) {
2044                         head = &b->old_flows;
2045                         if (unlikely(list_empty(head))) {
2046                                 head = &b->decaying_flows;
2047                                 if (unlikely(list_empty(head)))
2048                                         goto begin;
2049                         }
2050                 }
2051         }
2052         flow = list_first_entry(head, struct cake_flow, flowchain);
2053         q->cur_flow = flow - b->flows;
2054         first_flow = false;
2055
2056         /* triple isolation (modified DRR++) */
2057         srchost = &b->hosts[flow->srchost];
2058         dsthost = &b->hosts[flow->dsthost];
2059         host_load = 1;
2060
2061         /* flow isolation (DRR++) */
2062         if (flow->deficit <= 0) {
2063                 /* Keep all flows whose deficit is exhausted out of the sparse and
2064                  * decaying rotations.  No non-empty flow can enter the decaying
2065                  * rotation, so flows there never run up a deficit.
2066                  */
2067                 if (flow->set == CAKE_SET_SPARSE) {
2068                         if (flow->head) {
2069                                 b->sparse_flow_count--;
2070                                 b->bulk_flow_count++;
2071
2072                                 if (cake_dsrc(q->flow_mode))
2073                                         srchost->srchost_bulk_flow_count++;
2074
2075                                 if (cake_ddst(q->flow_mode))
2076                                         dsthost->dsthost_bulk_flow_count++;
2077
2078                                 flow->set = CAKE_SET_BULK;
2079                         } else {
2080                                 /* we've moved it to the bulk rotation for
2081                                  * correct deficit accounting but we still want
2082                                  * to count it as a sparse flow, not a bulk one.
2083                                  */
2084                                 flow->set = CAKE_SET_SPARSE_WAIT;
2085                         }
2086                 }
2087
2088                 if (cake_dsrc(q->flow_mode))
2089                         host_load = max(host_load, srchost->srchost_bulk_flow_count);
2090
2091                 if (cake_ddst(q->flow_mode))
2092                         host_load = max(host_load, dsthost->dsthost_bulk_flow_count);
2093
2094                 WARN_ON(host_load > CAKE_QUEUES);
2095
2096                 /* The get_random_u16() applies dithering so that the truncation in
2097                  * the >> 16 below does not accumulate roundoff errors.
2098                  */
2099                 flow->deficit += (b->flow_quantum * quantum_div[host_load] +
2100                                   get_random_u16()) >> 16;
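                /* Dither example: with host_load 1, 1514 * 65535 = 99,219,990
                 * is just below 1514 << 16, so without the random term the
                 * refill would truncate to 1513 every round; adding a random
                 * u16 yields 1514 most rounds, keeping the average at ~1514.
                 */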
2101                 list_move_tail(&flow->flowchain, &b->old_flows);
2102
2103                 goto retry;
2104         }
2105
2106         /* Retrieve a packet via the AQM */
2107         while (1) {
2108                 skb = cake_dequeue_one(sch);
2109                 if (!skb) {
2110                         /* this queue was actually empty */
2111                         if (cobalt_queue_empty(&flow->cvars, &b->cparams, now))
2112                                 b->unresponsive_flow_count--;
2113
2114                         if (flow->cvars.p_drop || flow->cvars.count ||
2115                             ktime_before(now, flow->cvars.drop_next)) {
2116                                 /* keep in the flowchain until the state has
2117                                  * decayed to rest
2118                                  */
2119                                 list_move_tail(&flow->flowchain,
2120                                                &b->decaying_flows);
2121                                 if (flow->set == CAKE_SET_BULK) {
2122                                         b->bulk_flow_count--;
2123
2124                                         if (cake_dsrc(q->flow_mode))
2125                                                 srchost->srchost_bulk_flow_count--;
2126
2127                                         if (cake_ddst(q->flow_mode))
2128                                                 dsthost->dsthost_bulk_flow_count--;
2129
2130                                         b->decaying_flow_count++;
2131                                 } else if (flow->set == CAKE_SET_SPARSE ||
2132                                            flow->set == CAKE_SET_SPARSE_WAIT) {
2133                                         b->sparse_flow_count--;
2134                                         b->decaying_flow_count++;
2135                                 }
2136                                 flow->set = CAKE_SET_DECAYING;
2137                         } else {
2138                                 /* remove empty queue from the flowchain */
2139                                 list_del_init(&flow->flowchain);
2140                                 if (flow->set == CAKE_SET_SPARSE ||
2141                                     flow->set == CAKE_SET_SPARSE_WAIT)
2142                                         b->sparse_flow_count--;
2143                                 else if (flow->set == CAKE_SET_BULK) {
2144                                         b->bulk_flow_count--;
2145
2146                                         if (cake_dsrc(q->flow_mode))
2147                                                 srchost->srchost_bulk_flow_count--;
2148
2149                                         if (cake_ddst(q->flow_mode))
2150                                                 dsthost->dsthost_bulk_flow_count--;
2151
2152                                 } else
2153                                         b->decaying_flow_count--;
2154
2155                                 flow->set = CAKE_SET_NONE;
2156                         }
2157                         goto begin;
2158                 }
2159
2160                 /* Last packet in queue may be marked, shouldn't be dropped */
2161                 if (!cobalt_should_drop(&flow->cvars, &b->cparams, now, skb,
2162                                         (b->bulk_flow_count *
2163                                          !!(q->rate_flags &
2164                                             CAKE_FLAG_INGRESS))) ||
2165                     !flow->head)
2166                         break;
2167
2168                 /* drop this packet, get another one */
2169                 if (q->rate_flags & CAKE_FLAG_INGRESS) {
2170                         len = cake_advance_shaper(q, b, skb,
2171                                                   now, true);
2172                         flow->deficit -= len;
2173                         b->tin_deficit -= len;
2174                 }
2175                 flow->dropped++;
2176                 b->tin_dropped++;
2177                 qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
2178                 qdisc_qstats_drop(sch);
2179                 kfree_skb(skb);
2180                 if (q->rate_flags & CAKE_FLAG_INGRESS)
2181                         goto retry;
2182         }
2183
2184         b->tin_ecn_mark += !!flow->cvars.ecn_marked;
2185         qdisc_bstats_update(sch, skb);
2186
2187         /* collect delay stats */
2188         delay = ktime_to_ns(ktime_sub(now, cobalt_get_enqueue_time(skb)));
2189         b->avge_delay = cake_ewma(b->avge_delay, delay, 8);
2190         b->peak_delay = cake_ewma(b->peak_delay, delay,
2191                                   delay > b->peak_delay ? 2 : 8);
2192         b->base_delay = cake_ewma(b->base_delay, delay,
2193                                   delay < b->base_delay ? 2 : 8);
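        /* cake_ewma() (defined earlier in this file) weights each new sample
         * at roughly 1 / 2^shift, so peak_delay chases increases quickly
         * (shift 2) and decays slowly (shift 8), while base_delay mirrors it.
         */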
2194
2195         len = cake_advance_shaper(q, b, skb, now, false);
2196         flow->deficit -= len;
2197         b->tin_deficit -= len;
2198
2199         if (ktime_after(q->time_next_packet, now) && sch->q.qlen) {
2200                 u64 next = min(ktime_to_ns(q->time_next_packet),
2201                                ktime_to_ns(q->failsafe_next_packet));
2202
2203                 qdisc_watchdog_schedule_ns(&q->watchdog, next);
2204         } else if (!sch->q.qlen) {
2205                 int i;
2206
2207                 for (i = 0; i < q->tin_cnt; i++) {
2208                         if (q->tins[i].decaying_flow_count) {
2209                                 ktime_t next = \
2210                                         ktime_add_ns(now,
2211                                                      q->tins[i].cparams.target);
2212
2213                                 qdisc_watchdog_schedule_ns(&q->watchdog,
2214                                                            ktime_to_ns(next));
2215                                 break;
2216                         }
2217                 }
2218         }
2219
2220         if (q->overflow_timeout)
2221                 q->overflow_timeout--;
2222
2223         return skb;
2224 }
2225
2226 static void cake_reset(struct Qdisc *sch)
2227 {
2228         struct cake_sched_data *q = qdisc_priv(sch);
2229         u32 c;
2230
2231         if (!q->tins)
2232                 return;
2233
2234         for (c = 0; c < CAKE_MAX_TINS; c++)
2235                 cake_clear_tin(sch, c);
2236 }
2237
2238 static const struct nla_policy cake_policy[TCA_CAKE_MAX + 1] = {
2239         [TCA_CAKE_BASE_RATE64]   = { .type = NLA_U64 },
2240         [TCA_CAKE_DIFFSERV_MODE] = { .type = NLA_U32 },
2241         [TCA_CAKE_ATM]           = { .type = NLA_U32 },
2242         [TCA_CAKE_FLOW_MODE]     = { .type = NLA_U32 },
2243         [TCA_CAKE_OVERHEAD]      = { .type = NLA_S32 },
2244         [TCA_CAKE_RTT]           = { .type = NLA_U32 },
2245         [TCA_CAKE_TARGET]        = { .type = NLA_U32 },
2246         [TCA_CAKE_AUTORATE]      = { .type = NLA_U32 },
2247         [TCA_CAKE_MEMORY]        = { .type = NLA_U32 },
2248         [TCA_CAKE_NAT]           = { .type = NLA_U32 },
2249         [TCA_CAKE_RAW]           = { .type = NLA_U32 },
2250         [TCA_CAKE_WASH]          = { .type = NLA_U32 },
2251         [TCA_CAKE_MPU]           = { .type = NLA_U32 },
2252         [TCA_CAKE_INGRESS]       = { .type = NLA_U32 },
2253         [TCA_CAKE_ACK_FILTER]    = { .type = NLA_U32 },
2254         [TCA_CAKE_SPLIT_GSO]     = { .type = NLA_U32 },
2255         [TCA_CAKE_FWMARK]        = { .type = NLA_U32 },
2256 };
2257
2258 static void cake_set_rate(struct cake_tin_data *b, u64 rate, u32 mtu,
2259                           u64 target_ns, u64 rtt_est_ns)
2260 {
2261         /* Convert byte-rate into time-per-byte, with the rate floored at
2262          * MIN_RATE so the shaper always unwedges in reasonable time.
2263          */
2264         static const u64 MIN_RATE = 64;
2265         u32 byte_target = mtu;
2266         u64 byte_target_ns;
2267         u8  rate_shft = 0;
2268         u64 rate_ns = 0;
2269
2270         b->flow_quantum = 1514;
2271         if (rate) {
2272                 b->flow_quantum = max(min(rate >> 12, 1514ULL), 300ULL);
2273                 rate_shft = 34;
2274                 rate_ns = ((u64)NSEC_PER_SEC) << rate_shft;
2275                 rate_ns = div64_u64(rate_ns, max(MIN_RATE, rate));
2276                 while (!!(rate_ns >> 34)) {
2277                         rate_ns >>= 1;
2278                         rate_shft--;
2279                 }
2280         } /* else unlimited, ie. zero delay */
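        /* Worked example (illustrative; rate taken as bytes per second):
         * rate = 12,500,000 (~100 Mbit/s) gives rate_ns = 80 << 34, which the
         * loop shifts down to 80 << 27 with rate_shft = 27; a 1514-byte
         * packet then costs (1514 * rate_ns) >> rate_shft = 121,120 ns.
         */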
2281
2282         b->tin_rate_bps  = rate;
2283         b->tin_rate_ns   = rate_ns;
2284         b->tin_rate_shft = rate_shft;
2285
2286         byte_target_ns = (byte_target * rate_ns) >> rate_shft;
2287
2288         b->cparams.target = max((byte_target_ns * 3) / 2, target_ns);
2289         b->cparams.interval = max(rtt_est_ns +
2290                                      b->cparams.target - target_ns,
2291                                      b->cparams.target * 2);
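        /* Continuing the illustrative 100 Mbit/s example with a 1514-byte
         * MTU and the default 5 ms target / 100 ms interval:
         * byte_target_ns ~= 121 us, so target stays at 5 ms and interval at
         * max(100 ms, 10 ms) = 100 ms.
         */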
2292         b->cparams.mtu_time = byte_target_ns;
2293         b->cparams.p_inc = 1 << 24; /* 1/256 */
2294         b->cparams.p_dec = 1 << 20; /* 1/4096 */
2295 }
2296
2297 static int cake_config_besteffort(struct Qdisc *sch)
2298 {
2299         struct cake_sched_data *q = qdisc_priv(sch);
2300         struct cake_tin_data *b = &q->tins[0];
2301         u32 mtu = psched_mtu(qdisc_dev(sch));
2302         u64 rate = q->rate_bps;
2303
2304         q->tin_cnt = 1;
2305
2306         q->tin_index = besteffort;
2307         q->tin_order = normal_order;
2308
2309         cake_set_rate(b, rate, mtu,
2310                       us_to_ns(q->target), us_to_ns(q->interval));
2311         b->tin_quantum = 65535;
2312
2313         return 0;
2314 }
2315
2316 static int cake_config_precedence(struct Qdisc *sch)
2317 {
2318         /* convert high-level (user visible) parameters into internal format */
2319         struct cake_sched_data *q = qdisc_priv(sch);
2320         u32 mtu = psched_mtu(qdisc_dev(sch));
2321         u64 rate = q->rate_bps;
2322         u32 quantum = 256;
2323         u32 i;
2324
2325         q->tin_cnt = 8;
2326         q->tin_index = precedence;
2327         q->tin_order = normal_order;
2328
2329         for (i = 0; i < q->tin_cnt; i++) {
2330                 struct cake_tin_data *b = &q->tins[i];
2331
2332                 cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2333                               us_to_ns(q->interval));
2334
2335                 b->tin_quantum = max_t(u16, 1U, quantum);
2336
2337                 /* calculate next class's parameters */
2338                 rate  *= 7;
2339                 rate >>= 3;
2340
2341                 quantum  *= 7;
2342                 quantum >>= 3;
2343         }
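        /* Each pass scales rate and quantum by 7/8, so tin i is allotted
         * roughly (7/8)^i of the base rate; the last of the eight tins ends
         * up with about 39%.
         */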
2344
2345         return 0;
2346 }
2347
2348 /*      List of known Diffserv codepoints:
2349  *
2350  *      Default Forwarding (DF/CS0) - Best Effort
2351  *      Max Throughput (TOS2)
2352  *      Min Delay (TOS4)
2353  *      LLT "La" (TOS5)
2354  *      Assured Forwarding 1 (AF1x) - x3
2355  *      Assured Forwarding 2 (AF2x) - x3
2356  *      Assured Forwarding 3 (AF3x) - x3
2357  *      Assured Forwarding 4 (AF4x) - x3
2358  *      Precedence Class 1 (CS1)
2359  *      Precedence Class 2 (CS2)
2360  *      Precedence Class 3 (CS3)
2361  *      Precedence Class 4 (CS4)
2362  *      Precedence Class 5 (CS5)
2363  *      Precedence Class 6 (CS6)
2364  *      Precedence Class 7 (CS7)
2365  *      Voice Admit (VA)
2366  *      Expedited Forwarding (EF)
2367  *      Lower Effort (LE)
2368  *
2369  *      Total 26 codepoints.
2370  */
2371
2372 /*      List of traffic classes in RFC 4594, updated by RFC 8622:
2373  *              (roughly descending order of contended priority)
2374  *              (roughly ascending order of uncontended throughput)
2375  *
2376  *      Network Control (CS6,CS7)      - routing traffic
2377  *      Telephony (EF,VA)         - aka. VoIP streams
2378  *      Signalling (CS5)               - VoIP setup
2379  *      Multimedia Conferencing (AF4x) - aka. video calls
2380  *      Realtime Interactive (CS4)     - eg. games
2381  *      Multimedia Streaming (AF3x)    - eg. YouTube, NetFlix, Twitch
2382  *      Broadcast Video (CS3)
2383  *      Low-Latency Data (AF2x,TOS4)      - eg. database
2384  *      Ops, Admin, Management (CS2)      - eg. ssh
2385  *      Standard Service (DF & unrecognised codepoints)
2386  *      High-Throughput Data (AF1x,TOS2)  - eg. web traffic
2387  *      Low-Priority Data (LE,CS1)        - eg. BitTorrent
2388  *
2389  *      Total 12 traffic classes.
2390  */
2391
2392 static int cake_config_diffserv8(struct Qdisc *sch)
2393 {
2394 /*      Pruned list of traffic classes for typical applications:
2395  *
2396  *              Network Control          (CS6, CS7)
2397  *              Minimum Latency          (EF, VA, CS5, CS4)
2398  *              Interactive Shell        (CS2)
2399  *              Low Latency Transactions (AF2x, TOS4)
2400  *              Video Streaming          (AF4x, AF3x, CS3)
2401  *              Bog Standard             (DF etc.)
2402  *              High Throughput          (AF1x, TOS2, CS1)
2403  *              Background Traffic       (LE)
2404  *
2405  *              Total 8 traffic classes.
2406  */
2407
2408         struct cake_sched_data *q = qdisc_priv(sch);
2409         u32 mtu = psched_mtu(qdisc_dev(sch));
2410         u64 rate = q->rate_bps;
2411         u32 quantum = 256;
2412         u32 i;
2413
2414         q->tin_cnt = 8;
2415
2416         /* codepoint to class mapping */
2417         q->tin_index = diffserv8;
2418         q->tin_order = normal_order;
2419
2420         /* class characteristics */
2421         for (i = 0; i < q->tin_cnt; i++) {
2422                 struct cake_tin_data *b = &q->tins[i];
2423
2424                 cake_set_rate(b, rate, mtu, us_to_ns(q->target),
2425                               us_to_ns(q->interval));
2426
2427                 b->tin_quantum = max_t(u16, 1U, quantum);
2428
2429                 /* calculate next class's parameters */
2430                 rate  *= 7;
2431                 rate >>= 3;
2432
2433                 quantum  *= 7;
2434                 quantum >>= 3;
2435         }
2436
2437         return 0;
2438 }
2439
2440 static int cake_config_diffserv4(struct Qdisc *sch)
2441 {
2442 /*  Further pruned list of traffic classes for four-class system:
2443  *
2444  *          Latency Sensitive  (CS7, CS6, EF, VA, CS5, CS4)
2445  *          Streaming Media    (AF4x, AF3x, CS3, AF2x, TOS4, CS2)
2446  *          Best Effort        (DF, AF1x, TOS2, and those not specified)
2447  *          Background Traffic (LE, CS1)
2448  *
2449  *              Total 4 traffic classes.
2450  */
2451
2452         struct cake_sched_data *q = qdisc_priv(sch);
2453         u32 mtu = psched_mtu(qdisc_dev(sch));
2454         u64 rate = q->rate_bps;
2455         u32 quantum = 1024;
2456
2457         q->tin_cnt = 4;
2458
2459         /* codepoint to class mapping */
2460         q->tin_index = diffserv4;
2461         q->tin_order = bulk_order;
2462
2463         /* class characteristics */
2464         cake_set_rate(&q->tins[0], rate, mtu,
2465                       us_to_ns(q->target), us_to_ns(q->interval));
2466         cake_set_rate(&q->tins[1], rate >> 4, mtu,
2467                       us_to_ns(q->target), us_to_ns(q->interval));
2468         cake_set_rate(&q->tins[2], rate >> 1, mtu,
2469                       us_to_ns(q->target), us_to_ns(q->interval));
2470         cake_set_rate(&q->tins[3], rate >> 2, mtu,
2471                       us_to_ns(q->target), us_to_ns(q->interval));
2472
2473         /* bandwidth-sharing weights */
2474         q->tins[0].tin_quantum = quantum;
2475         q->tins[1].tin_quantum = quantum >> 4;
2476         q->tins[2].tin_quantum = quantum >> 1;
2477         q->tins[3].tin_quantum = quantum >> 2;
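        /* With the shifts above, the four tins get thresholds of 1, 1/16, 1/2
         * and 1/4 of the base rate, and bandwidth-sharing weights in the same
         * 16:1:8:4 proportion.
         */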
2478
2479         return 0;
2480 }
2481
2482 static int cake_config_diffserv3(struct Qdisc *sch)
2483 {
2484 /*  Simplified Diffserv structure with 3 tins.
2485  *              Latency Sensitive       (CS7, CS6, EF, VA, TOS4)
2486  *              Best Effort
2487  *              Low Priority            (LE, CS1)
2488  */
2489         struct cake_sched_data *q = qdisc_priv(sch);
2490         u32 mtu = psched_mtu(qdisc_dev(sch));
2491         u64 rate = q->rate_bps;
2492         u32 quantum = 1024;
2493
2494         q->tin_cnt = 3;
2495
2496         /* codepoint to class mapping */
2497         q->tin_index = diffserv3;
2498         q->tin_order = bulk_order;
2499
2500         /* class characteristics */
2501         cake_set_rate(&q->tins[0], rate, mtu,
2502                       us_to_ns(q->target), us_to_ns(q->interval));
2503         cake_set_rate(&q->tins[1], rate >> 4, mtu,
2504                       us_to_ns(q->target), us_to_ns(q->interval));
2505         cake_set_rate(&q->tins[2], rate >> 2, mtu,
2506                       us_to_ns(q->target), us_to_ns(q->interval));
2507
2508         /* bandwidth-sharing weights */
2509         q->tins[0].tin_quantum = quantum;
2510         q->tins[1].tin_quantum = quantum >> 4;
2511         q->tins[2].tin_quantum = quantum >> 2;
2512
2513         return 0;
2514 }
2515
2516 static void cake_reconfigure(struct Qdisc *sch)
2517 {
2518         struct cake_sched_data *q = qdisc_priv(sch);
2519         int c, ft;
2520
2521         switch (q->tin_mode) {
2522         case CAKE_DIFFSERV_BESTEFFORT:
2523                 ft = cake_config_besteffort(sch);
2524                 break;
2525
2526         case CAKE_DIFFSERV_PRECEDENCE:
2527                 ft = cake_config_precedence(sch);
2528                 break;
2529
2530         case CAKE_DIFFSERV_DIFFSERV8:
2531                 ft = cake_config_diffserv8(sch);
2532                 break;
2533
2534         case CAKE_DIFFSERV_DIFFSERV4:
2535                 ft = cake_config_diffserv4(sch);
2536                 break;
2537
2538         case CAKE_DIFFSERV_DIFFSERV3:
2539         default:
2540                 ft = cake_config_diffserv3(sch);
2541                 break;
2542         }
2543
2544         for (c = q->tin_cnt; c < CAKE_MAX_TINS; c++) {
2545                 cake_clear_tin(sch, c);
2546                 q->tins[c].cparams.mtu_time = q->tins[ft].cparams.mtu_time;
2547         }
2548
2549         q->rate_ns   = q->tins[ft].tin_rate_ns;
2550         q->rate_shft = q->tins[ft].tin_rate_shft;
2551
2552         if (q->buffer_config_limit) {
2553                 q->buffer_limit = q->buffer_config_limit;
2554         } else if (q->rate_bps) {
2555                 u64 t = q->rate_bps * q->interval;
2556
2557                 do_div(t, USEC_PER_SEC / 4);
2558                 q->buffer_limit = max_t(u32, t, 4U << 20);
2559         } else {
2560                 q->buffer_limit = ~0;
2561         }
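        /* Illustrative sizing: at 12,500,000 bytes/s with the default 100 ms
         * interval, t = 12.5e6 * 100000 / 250000 = 5,000,000 bytes, i.e. four
         * times the rate-delay product and above the 4 MiB floor; the clamp
         * below then caps it against sch->limit packets of device MTU.
         */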
2562
2563         sch->flags &= ~TCQ_F_CAN_BYPASS;
2564
2565         q->buffer_limit = min(q->buffer_limit,
2566                               max(sch->limit * psched_mtu(qdisc_dev(sch)),
2567                                   q->buffer_config_limit));
2568 }
2569
2570 static int cake_change(struct Qdisc *sch, struct nlattr *opt,
2571                        struct netlink_ext_ack *extack)
2572 {
2573         struct cake_sched_data *q = qdisc_priv(sch);
2574         struct nlattr *tb[TCA_CAKE_MAX + 1];
2575         int err;
2576
2577         err = nla_parse_nested_deprecated(tb, TCA_CAKE_MAX, opt, cake_policy,
2578                                           extack);
2579         if (err < 0)
2580                 return err;
2581
2582         if (tb[TCA_CAKE_NAT]) {
2583 #if IS_ENABLED(CONFIG_NF_CONNTRACK)
2584                 q->flow_mode &= ~CAKE_FLOW_NAT_FLAG;
2585                 q->flow_mode |= CAKE_FLOW_NAT_FLAG *
2586                         !!nla_get_u32(tb[TCA_CAKE_NAT]);
2587 #else
2588                 NL_SET_ERR_MSG_ATTR(extack, tb[TCA_CAKE_NAT],
2589                                     "No conntrack support in kernel");
2590                 return -EOPNOTSUPP;
2591 #endif
2592         }
2593
2594         if (tb[TCA_CAKE_BASE_RATE64])
2595                 q->rate_bps = nla_get_u64(tb[TCA_CAKE_BASE_RATE64]);
2596
2597         if (tb[TCA_CAKE_DIFFSERV_MODE])
2598                 q->tin_mode = nla_get_u32(tb[TCA_CAKE_DIFFSERV_MODE]);
2599
2600         if (tb[TCA_CAKE_WASH]) {
2601                 if (!!nla_get_u32(tb[TCA_CAKE_WASH]))
2602                         q->rate_flags |= CAKE_FLAG_WASH;
2603                 else
2604                         q->rate_flags &= ~CAKE_FLAG_WASH;
2605         }
2606
2607         if (tb[TCA_CAKE_FLOW_MODE])
2608                 q->flow_mode = ((q->flow_mode & CAKE_FLOW_NAT_FLAG) |
2609                                 (nla_get_u32(tb[TCA_CAKE_FLOW_MODE]) &
2610                                         CAKE_FLOW_MASK));
2611
2612         if (tb[TCA_CAKE_ATM])
2613                 q->atm_mode = nla_get_u32(tb[TCA_CAKE_ATM]);
2614
2615         if (tb[TCA_CAKE_OVERHEAD]) {
2616                 q->rate_overhead = nla_get_s32(tb[TCA_CAKE_OVERHEAD]);
2617                 q->rate_flags |= CAKE_FLAG_OVERHEAD;
2618
2619                 q->max_netlen = 0;
2620                 q->max_adjlen = 0;
2621                 q->min_netlen = ~0;
2622                 q->min_adjlen = ~0;
2623         }
2624
2625         if (tb[TCA_CAKE_RAW]) {
2626                 q->rate_flags &= ~CAKE_FLAG_OVERHEAD;
2627
2628                 q->max_netlen = 0;
2629                 q->max_adjlen = 0;
2630                 q->min_netlen = ~0;
2631                 q->min_adjlen = ~0;
2632         }
2633
2634         if (tb[TCA_CAKE_MPU])
2635                 q->rate_mpu = nla_get_u32(tb[TCA_CAKE_MPU]);
2636
2637         if (tb[TCA_CAKE_RTT]) {
2638                 q->interval = nla_get_u32(tb[TCA_CAKE_RTT]);
2639
2640                 if (!q->interval)
2641                         q->interval = 1;
2642         }
2643
2644         if (tb[TCA_CAKE_TARGET]) {
2645                 q->target = nla_get_u32(tb[TCA_CAKE_TARGET]);
2646
2647                 if (!q->target)
2648                         q->target = 1;
2649         }
2650
2651         if (tb[TCA_CAKE_AUTORATE]) {
2652                 if (!!nla_get_u32(tb[TCA_CAKE_AUTORATE]))
2653                         q->rate_flags |= CAKE_FLAG_AUTORATE_INGRESS;
2654                 else
2655                         q->rate_flags &= ~CAKE_FLAG_AUTORATE_INGRESS;
2656         }
2657
2658         if (tb[TCA_CAKE_INGRESS]) {
2659                 if (!!nla_get_u32(tb[TCA_CAKE_INGRESS]))
2660                         q->rate_flags |= CAKE_FLAG_INGRESS;
2661                 else
2662                         q->rate_flags &= ~CAKE_FLAG_INGRESS;
2663         }
2664
2665         if (tb[TCA_CAKE_ACK_FILTER])
2666                 q->ack_filter = nla_get_u32(tb[TCA_CAKE_ACK_FILTER]);
2667
2668         if (tb[TCA_CAKE_MEMORY])
2669                 q->buffer_config_limit = nla_get_u32(tb[TCA_CAKE_MEMORY]);
2670
2671         if (tb[TCA_CAKE_SPLIT_GSO]) {
2672                 if (!!nla_get_u32(tb[TCA_CAKE_SPLIT_GSO]))
2673                         q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2674                 else
2675                         q->rate_flags &= ~CAKE_FLAG_SPLIT_GSO;
2676         }
2677
2678         if (tb[TCA_CAKE_FWMARK]) {
2679                 q->fwmark_mask = nla_get_u32(tb[TCA_CAKE_FWMARK]);
2680                 q->fwmark_shft = q->fwmark_mask ? __ffs(q->fwmark_mask) : 0;
2681         }
2682
2683         if (q->tins) {
2684                 sch_tree_lock(sch);
2685                 cake_reconfigure(sch);
2686                 sch_tree_unlock(sch);
2687         }
2688
2689         return 0;
2690 }
2691
2692 static void cake_destroy(struct Qdisc *sch)
2693 {
2694         struct cake_sched_data *q = qdisc_priv(sch);
2695
2696         qdisc_watchdog_cancel(&q->watchdog);
2697         tcf_block_put(q->block);
2698         kvfree(q->tins);
2699 }
2700
2701 static int cake_init(struct Qdisc *sch, struct nlattr *opt,
2702                      struct netlink_ext_ack *extack)
2703 {
2704         struct cake_sched_data *q = qdisc_priv(sch);
2705         int i, j, err;
2706
2707         sch->limit = 10240;
2708         q->tin_mode = CAKE_DIFFSERV_DIFFSERV3;
2709         q->flow_mode  = CAKE_FLOW_TRIPLE;
2710
2711         q->rate_bps = 0; /* unlimited by default */
2712
2713         q->interval = 100000; /* 100ms default */
2714         q->target   =   5000; /* 5ms: codel RFC argues
2715                                * for 5 to 10% of interval
2716                                */
2717         q->rate_flags |= CAKE_FLAG_SPLIT_GSO;
2718         q->cur_tin = 0;
2719         q->cur_flow  = 0;
2720
2721         qdisc_watchdog_init(&q->watchdog, sch);
2722
2723         if (opt) {
2724                 err = cake_change(sch, opt, extack);
2725
2726                 if (err)
2727                         return err;
2728         }
2729
2730         err = tcf_block_get(&q->block, &q->filter_list, sch, extack);
2731         if (err)
2732                 return err;
2733
2734         quantum_div[0] = ~0;
2735         for (i = 1; i <= CAKE_QUEUES; i++)
2736                 quantum_div[i] = 65535 / i;
2737
2738         q->tins = kvcalloc(CAKE_MAX_TINS, sizeof(struct cake_tin_data),
2739                            GFP_KERNEL);
2740         if (!q->tins)
2741                 return -ENOMEM;
2742
2743         for (i = 0; i < CAKE_MAX_TINS; i++) {
2744                 struct cake_tin_data *b = q->tins + i;
2745
2746                 INIT_LIST_HEAD(&b->new_flows);
2747                 INIT_LIST_HEAD(&b->old_flows);
2748                 INIT_LIST_HEAD(&b->decaying_flows);
2749                 b->sparse_flow_count = 0;
2750                 b->bulk_flow_count = 0;
2751                 b->decaying_flow_count = 0;
2752
2753                 for (j = 0; j < CAKE_QUEUES; j++) {
2754                         struct cake_flow *flow = b->flows + j;
2755                         u32 k = j * CAKE_MAX_TINS + i;
2756
2757                         INIT_LIST_HEAD(&flow->flowchain);
2758                         cobalt_vars_init(&flow->cvars);
2759
2760                         q->overflow_heap[k].t = i;
2761                         q->overflow_heap[k].b = j;
2762                         b->overflow_idx[j] = k;
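                        /* The heap index interleaves tins: queue j of tin i
                         * lives at k = j * CAKE_MAX_TINS + i, e.g. tin 2,
                         * queue 3 -> k = 26.
                         */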
2763                 }
2764         }
2765
2766         cake_reconfigure(sch);
2767         q->avg_peak_bandwidth = q->rate_bps;
2768         q->min_netlen = ~0;
2769         q->min_adjlen = ~0;
2770         return 0;
2771 }
2772
2773 static int cake_dump(struct Qdisc *sch, struct sk_buff *skb)
2774 {
2775         struct cake_sched_data *q = qdisc_priv(sch);
2776         struct nlattr *opts;
2777
2778         opts = nla_nest_start_noflag(skb, TCA_OPTIONS);
2779         if (!opts)
2780                 goto nla_put_failure;
2781
2782         if (nla_put_u64_64bit(skb, TCA_CAKE_BASE_RATE64, q->rate_bps,
2783                               TCA_CAKE_PAD))
2784                 goto nla_put_failure;
2785
2786         if (nla_put_u32(skb, TCA_CAKE_FLOW_MODE,
2787                         q->flow_mode & CAKE_FLOW_MASK))
2788                 goto nla_put_failure;
2789
2790         if (nla_put_u32(skb, TCA_CAKE_RTT, q->interval))
2791                 goto nla_put_failure;
2792
2793         if (nla_put_u32(skb, TCA_CAKE_TARGET, q->target))
2794                 goto nla_put_failure;
2795
2796         if (nla_put_u32(skb, TCA_CAKE_MEMORY, q->buffer_config_limit))
2797                 goto nla_put_failure;
2798
2799         if (nla_put_u32(skb, TCA_CAKE_AUTORATE,
2800                         !!(q->rate_flags & CAKE_FLAG_AUTORATE_INGRESS)))
2801                 goto nla_put_failure;
2802
2803         if (nla_put_u32(skb, TCA_CAKE_INGRESS,
2804                         !!(q->rate_flags & CAKE_FLAG_INGRESS)))
2805                 goto nla_put_failure;
2806
2807         if (nla_put_u32(skb, TCA_CAKE_ACK_FILTER, q->ack_filter))
2808                 goto nla_put_failure;
2809
2810         if (nla_put_u32(skb, TCA_CAKE_NAT,
2811                         !!(q->flow_mode & CAKE_FLOW_NAT_FLAG)))
2812                 goto nla_put_failure;
2813
2814         if (nla_put_u32(skb, TCA_CAKE_DIFFSERV_MODE, q->tin_mode))
2815                 goto nla_put_failure;
2816
2817         if (nla_put_u32(skb, TCA_CAKE_WASH,
2818                         !!(q->rate_flags & CAKE_FLAG_WASH)))
2819                 goto nla_put_failure;
2820
2821         if (nla_put_u32(skb, TCA_CAKE_OVERHEAD, q->rate_overhead))
2822                 goto nla_put_failure;
2823
2824         if (!(q->rate_flags & CAKE_FLAG_OVERHEAD))
2825                 if (nla_put_u32(skb, TCA_CAKE_RAW, 0))
2826                         goto nla_put_failure;
2827
2828         if (nla_put_u32(skb, TCA_CAKE_ATM, q->atm_mode))
2829                 goto nla_put_failure;
2830
2831         if (nla_put_u32(skb, TCA_CAKE_MPU, q->rate_mpu))
2832                 goto nla_put_failure;
2833
2834         if (nla_put_u32(skb, TCA_CAKE_SPLIT_GSO,
2835                         !!(q->rate_flags & CAKE_FLAG_SPLIT_GSO)))
2836                 goto nla_put_failure;
2837
2838         if (nla_put_u32(skb, TCA_CAKE_FWMARK, q->fwmark_mask))
2839                 goto nla_put_failure;
2840
2841         return nla_nest_end(skb, opts);
2842
2843 nla_put_failure:
2844         return -1;
2845 }
2846
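/* Dump global shaper statistics plus one nested block of per-tin counters
 * under TCA_STATS_APP.  Tins are emitted in the order given by
 * q->tin_order and are numbered from 1 inside TCA_CAKE_STATS_TIN_STATS.
 */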
2847 static int cake_dump_stats(struct Qdisc *sch, struct gnet_dump *d)
2848 {
2849         struct nlattr *stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
2850         struct cake_sched_data *q = qdisc_priv(sch);
2851         struct nlattr *tstats, *ts;
2852         int i;
2853
2854         if (!stats)
2855                 return -1;
2856
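/* Each PUT_STAT_* macro below expands to the matching nla_put_*() call
 * for a TCA_CAKE_STATS_* attribute and jumps to nla_put_failure on
 * error; the macros are #undef'd once the global block has been written.
 */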
2857 #define PUT_STAT_U32(attr, data) do {                                  \
2858                 if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
2859                         goto nla_put_failure;                          \
2860         } while (0)
2861 #define PUT_STAT_U64(attr, data) do {                                  \
2862                 if (nla_put_u64_64bit(d->skb, TCA_CAKE_STATS_ ## attr, \
2863                                         data, TCA_CAKE_STATS_PAD)) \
2864                         goto nla_put_failure;                          \
2865         } while (0)
2866
2867         PUT_STAT_U64(CAPACITY_ESTIMATE64, q->avg_peak_bandwidth);
2868         PUT_STAT_U32(MEMORY_LIMIT, q->buffer_limit);
2869         PUT_STAT_U32(MEMORY_USED, q->buffer_max_used);
2870         PUT_STAT_U32(AVG_NETOFF, ((q->avg_netoff + 0x8000) >> 16));
2871         PUT_STAT_U32(MAX_NETLEN, q->max_netlen);
2872         PUT_STAT_U32(MAX_ADJLEN, q->max_adjlen);
2873         PUT_STAT_U32(MIN_NETLEN, q->min_netlen);
2874         PUT_STAT_U32(MIN_ADJLEN, q->min_adjlen);
2875
2876 #undef PUT_STAT_U32
2877 #undef PUT_STAT_U64
2878
2879         tstats = nla_nest_start_noflag(d->skb, TCA_CAKE_STATS_TIN_STATS);
2880         if (!tstats)
2881                 goto nla_put_failure;
2882
2883 #define PUT_TSTAT_U32(attr, data) do {                                  \
2884                 if (nla_put_u32(d->skb, TCA_CAKE_TIN_STATS_ ## attr, data)) \
2885                         goto nla_put_failure;                           \
2886         } while (0)
2887 #define PUT_TSTAT_U64(attr, data) do {                                  \
2888                 if (nla_put_u64_64bit(d->skb, TCA_CAKE_TIN_STATS_ ## attr, \
2889                                         data, TCA_CAKE_TIN_STATS_PAD))  \
2890                         goto nla_put_failure;                           \
2891         } while (0)
2892
2893         for (i = 0; i < q->tin_cnt; i++) {
2894                 struct cake_tin_data *b = &q->tins[q->tin_order[i]];
2895
2896                 ts = nla_nest_start_noflag(d->skb, i + 1);
2897                 if (!ts)
2898                         goto nla_put_failure;
2899
2900                 PUT_TSTAT_U64(THRESHOLD_RATE64, b->tin_rate_bps);
2901                 PUT_TSTAT_U64(SENT_BYTES64, b->bytes);
2902                 PUT_TSTAT_U32(BACKLOG_BYTES, b->tin_backlog);
2903
2904                 PUT_TSTAT_U32(TARGET_US,
2905                               ktime_to_us(ns_to_ktime(b->cparams.target)));
2906                 PUT_TSTAT_U32(INTERVAL_US,
2907                               ktime_to_us(ns_to_ktime(b->cparams.interval)));
2908
2909                 PUT_TSTAT_U32(SENT_PACKETS, b->packets);
2910                 PUT_TSTAT_U32(DROPPED_PACKETS, b->tin_dropped);
2911                 PUT_TSTAT_U32(ECN_MARKED_PACKETS, b->tin_ecn_mark);
2912                 PUT_TSTAT_U32(ACKS_DROPPED_PACKETS, b->ack_drops);
2913
2914                 PUT_TSTAT_U32(PEAK_DELAY_US,
2915                               ktime_to_us(ns_to_ktime(b->peak_delay)));
2916                 PUT_TSTAT_U32(AVG_DELAY_US,
2917                               ktime_to_us(ns_to_ktime(b->avge_delay)));
2918                 PUT_TSTAT_U32(BASE_DELAY_US,
2919                               ktime_to_us(ns_to_ktime(b->base_delay)));
2920
2921                 PUT_TSTAT_U32(WAY_INDIRECT_HITS, b->way_hits);
2922                 PUT_TSTAT_U32(WAY_MISSES, b->way_misses);
2923                 PUT_TSTAT_U32(WAY_COLLISIONS, b->way_collisions);
2924
2925                 PUT_TSTAT_U32(SPARSE_FLOWS, b->sparse_flow_count +
2926                                             b->decaying_flow_count);
2927                 PUT_TSTAT_U32(BULK_FLOWS, b->bulk_flow_count);
2928                 PUT_TSTAT_U32(UNRESPONSIVE_FLOWS, b->unresponsive_flow_count);
2929                 PUT_TSTAT_U32(MAX_SKBLEN, b->max_skblen);
2930
2931                 PUT_TSTAT_U32(FLOW_QUANTUM, b->flow_quantum);
2932                 nla_nest_end(d->skb, ts);
2933         }
2934
2935 #undef PUT_TSTAT_U32
2936 #undef PUT_TSTAT_U64
2937
2938         nla_nest_end(d->skb, tstats);
2939         return nla_nest_end(d->skb, stats);
2940
2941 nla_put_failure:
2942         nla_nest_cancel(d->skb, stats);
2943         return -1;
2944 }
2945
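/* Class operations.  CAKE has no attachable child qdiscs and no
 * user-configurable classes, so leaf/find/bind/unbind are no-op stubs.
 * The internal flow queues are still exposed read-only as pseudo-classes
 * through cake_dump_class(), cake_dump_class_stats() and cake_walk().
 */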
2946 static struct Qdisc *cake_leaf(struct Qdisc *sch, unsigned long arg)
2947 {
2948         return NULL;
2949 }
2950
2951 static unsigned long cake_find(struct Qdisc *sch, u32 classid)
2952 {
2953         return 0;
2954 }
2955
2956 static unsigned long cake_bind(struct Qdisc *sch, unsigned long parent,
2957                                u32 classid)
2958 {
2959         return 0;
2960 }
2961
2962 static void cake_unbind(struct Qdisc *q, unsigned long cl)
2963 {
2964 }
2965
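/* Return the qdisc-level filter block so tc filters can be attached to
 * the qdisc itself (they may override tin and flow selection); per-class
 * filter blocks are not supported, hence NULL for any non-zero cl.
 */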
2966 static struct tcf_block *cake_tcf_block(struct Qdisc *sch, unsigned long cl,
2967                                         struct netlink_ext_ack *extack)
2968 {
2969         struct cake_sched_data *q = qdisc_priv(sch);
2970
2971         if (cl)
2972                 return NULL;
2973         return q->block;
2974 }
2975
2976 static int cake_dump_class(struct Qdisc *sch, unsigned long cl,
2977                            struct sk_buff *skb, struct tcmsg *tcm)
2978 {
2979         tcm->tcm_handle |= TC_H_MIN(cl);
2980         return 0;
2981 }
2982
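/* Per-flow statistics for one pseudo-class.  Class IDs map linearly onto
 * (tin, queue): idx = cl - 1, tin = tin_order[idx / CAKE_QUEUES],
 * queue = idx % CAKE_QUEUES.  The queue length is counted by walking the
 * flow's skb list under the qdisc tree lock, after which the flow's
 * COBALT state (deficit, drop state, blue/codel timers) is dumped under
 * TCA_STATS_APP.
 */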
2983 static int cake_dump_class_stats(struct Qdisc *sch, unsigned long cl,
2984                                  struct gnet_dump *d)
2985 {
2986         struct cake_sched_data *q = qdisc_priv(sch);
2987         const struct cake_flow *flow = NULL;
2988         struct gnet_stats_queue qs = { 0 };
2989         struct nlattr *stats;
2990         u32 idx = cl - 1;
2991
2992         if (idx < CAKE_QUEUES * q->tin_cnt) {
2993                 const struct cake_tin_data *b =
2994                         &q->tins[q->tin_order[idx / CAKE_QUEUES]];
2995                 const struct sk_buff *skb;
2996
2997                 flow = &b->flows[idx % CAKE_QUEUES];
2998
2999                 if (flow->head) {
3000                         sch_tree_lock(sch);
3001                         skb = flow->head;
3002                         while (skb) {
3003                                 qs.qlen++;
3004                                 skb = skb->next;
3005                         }
3006                         sch_tree_unlock(sch);
3007                 }
3008                 qs.backlog = b->backlogs[idx % CAKE_QUEUES];
3009                 qs.drops = flow->dropped;
3010         }
3011         if (gnet_stats_copy_queue(d, NULL, &qs, qs.qlen) < 0)
3012                 return -1;
3013         if (flow) {
3014                 ktime_t now = ktime_get();
3015
3016                 stats = nla_nest_start_noflag(d->skb, TCA_STATS_APP);
3017                 if (!stats)
3018                         return -1;
3019
3020 #define PUT_STAT_U32(attr, data) do {                                  \
3021                 if (nla_put_u32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3022                         goto nla_put_failure;                          \
3023         } while (0)
3024 #define PUT_STAT_S32(attr, data) do {                                  \
3025                 if (nla_put_s32(d->skb, TCA_CAKE_STATS_ ## attr, data)) \
3026                         goto nla_put_failure;                          \
3027         } while (0)
3028
3029                 PUT_STAT_S32(DEFICIT, flow->deficit);
3030                 PUT_STAT_U32(DROPPING, flow->cvars.dropping);
3031                 PUT_STAT_U32(COBALT_COUNT, flow->cvars.count);
3032                 PUT_STAT_U32(P_DROP, flow->cvars.p_drop);
3033                 if (flow->cvars.p_drop) {
3034                         PUT_STAT_S32(BLUE_TIMER_US,
3035                                      ktime_to_us(
3036                                              ktime_sub(now,
3037                                                        flow->cvars.blue_timer)));
3038                 }
3039                 if (flow->cvars.dropping) {
3040                         PUT_STAT_S32(DROP_NEXT_US,
3041                                      ktime_to_us(
3042                                              ktime_sub(now,
3043                                                        flow->cvars.drop_next)));
3044                 }
3045
3046                 if (nla_nest_end(d->skb, stats) < 0)
3047                         return -1;
3048         }
3049
3050         return 0;
3051
3052 nla_put_failure:
3053         nla_nest_cancel(d->skb, stats);
3054         return -1;
3055 }
3056
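/* Enumerate all (tin, queue) slots for the class walker.  Empty flows are
 * only counted, while occupied ones are reported through
 * tc_qdisc_stats_dump(), which handles the skip/count bookkeeping and
 * returns false when the walk should stop.
 */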
3057 static void cake_walk(struct Qdisc *sch, struct qdisc_walker *arg)
3058 {
3059         struct cake_sched_data *q = qdisc_priv(sch);
3060         unsigned int i, j;
3061
3062         if (arg->stop)
3063                 return;
3064
3065         for (i = 0; i < q->tin_cnt; i++) {
3066                 struct cake_tin_data *b = &q->tins[q->tin_order[i]];
3067
3068                 for (j = 0; j < CAKE_QUEUES; j++) {
3069                         if (list_empty(&b->flows[j].flowchain)) {
3070                                 arg->count++;
3071                                 continue;
3072                         }
3073                         if (!tc_qdisc_stats_dump(sch, i * CAKE_QUEUES + j + 1,
3074                                                  arg))
3075                                 break;
3076                 }
3077         }
3078 }
3079
3080 static const struct Qdisc_class_ops cake_class_ops = {
3081         .leaf           =       cake_leaf,
3082         .find           =       cake_find,
3083         .tcf_block      =       cake_tcf_block,
3084         .bind_tcf       =       cake_bind,
3085         .unbind_tcf     =       cake_unbind,
3086         .dump           =       cake_dump_class,
3087         .dump_stats     =       cake_dump_class_stats,
3088         .walk           =       cake_walk,
3089 };
3090
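/* Qdisc operations table.  Note that .peek reuses the generic
 * qdisc_peek_dequeued() helper rather than a cake-specific hook.
 */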
3091 static struct Qdisc_ops cake_qdisc_ops __read_mostly = {
3092         .cl_ops         =       &cake_class_ops,
3093         .id             =       "cake",
3094         .priv_size      =       sizeof(struct cake_sched_data),
3095         .enqueue        =       cake_enqueue,
3096         .dequeue        =       cake_dequeue,
3097         .peek           =       qdisc_peek_dequeued,
3098         .init           =       cake_init,
3099         .reset          =       cake_reset,
3100         .destroy        =       cake_destroy,
3101         .change         =       cake_change,
3102         .dump           =       cake_dump,
3103         .dump_stats     =       cake_dump_stats,
3104         .owner          =       THIS_MODULE,
3105 };
3106
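/* Module registration makes the qdisc available under the "cake" id.
 * Illustrative usage once the module is loaded (option names as
 * documented in tc-cake(8)):
 *
 *   tc qdisc replace dev eth0 root cake bandwidth 20Mbit diffserv4 nat
 *   tc -s qdisc show dev eth0
 */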
3107 static int __init cake_module_init(void)
3108 {
3109         return register_qdisc(&cake_qdisc_ops);
3110 }
3111
3112 static void __exit cake_module_exit(void)
3113 {
3114         unregister_qdisc(&cake_qdisc_ops);
3115 }
3116
3117 module_init(cake_module_init)
3118 module_exit(cake_module_exit)
3119 MODULE_AUTHOR("Jonathan Morton");
3120 MODULE_LICENSE("Dual BSD/GPL");
3121 MODULE_DESCRIPTION("The CAKE shaper.");