net/sched/sch_netem.c

   1 /*
   2  * net/sched/sch_netem.c        Network emulator
   3  *
   4  *              This program is free software; you can redistribute it and/or
   5  *              modify it under the terms of the GNU General Public License
   6  *              as published by the Free Software Foundation; either version
   7  *              2 of the License.
   8  *
   9  *              Many of the algorithms and ideas for this came from
  10  *              NIST Net which is not copyrighted.
  11  *
  12  * Authors:     Stephen Hemminger <shemminger@osdl.org>
  13  *              Catalin(ux aka Dino) BOIE <catab at umbrella dot ro>
  14  */
  15
  16 #include <linux/mm.h>
  17 #include <linux/module.h>
  18 #include <linux/slab.h>
  19 #include <linux/types.h>
  20 #include <linux/kernel.h>
  21 #include <linux/errno.h>
  22 #include <linux/skbuff.h>
  23 #include <linux/vmalloc.h>
  24 #include <linux/rtnetlink.h>
  25 #include <linux/reciprocal_div.h>
  26 #include <linux/rbtree.h>
  27
  28 #include <net/netlink.h>
  29 #include <net/pkt_sched.h>
  30 #include <net/inet_ecn.h>
  31
  32 #define VERSION "1.3"
  33
  34 /*      Network Emulation Queuing algorithm.
  35         ====================================
  36
  37         Sources: [1] Mark Carson, Darrin Santay, "NIST Net - A Linux-based
  38                  Network Emulation Tool
  39                  [2] Luigi Rizzo, DummyNet for FreeBSD
  40
  41          ----------------------------------------------------------------
  42
  43          This started out as a simple way to delay outgoing packets to
  44          test TCP but has grown to include most of the functionality
  45          of a full blown network emulator like NISTnet. It can delay
  46          packets and add random jitter (and correlation). The random
  47          distribution can be loaded from a table as well to provide
  48          normal, Pareto, or experimental curves. Packet loss,
  49          duplication, and reordering can also be emulated.
  50
  51          This qdisc does not do classification that can be handled in
  52          layering other disciplines.  It does not need to do bandwidth
  53          control either since that can be handled by using token
  54          bucket or other rate control.
  55
  56      Correlated Loss Generator models
  57
  58         Added generation of correlated loss according to the
  59         "Gilbert-Elliot" model, a 4-state markov model.
  60
  61         References:
  62         [1] NetemCLG Home http://netgroup.uniroma2.it/NetemCLG
  63         [2] S. Salsano, F. Ludovici, A. Ordine, "Definition of a general
  64         and intuitive loss model for packet networks and its implementation
  65         in the Netem module in the Linux kernel", available in [1]
  66
  67         Authors: Stefano Salsano <stefano.salsano at uniroma2.it
  68                  Fabio Ludovici <fabio.ludovici at yahoo.it>
  69 */
  70
  71 struct disttable {
  72         u32  size;
  73         s16 table[0];
  74 };
  75
  76 struct netem_sched_data {
  77         /* internal t(ime)fifo qdisc uses t_root and sch->limit */
  78         struct rb_root t_root;
  79
  80         /* optional qdisc for classful handling (NULL at netem init) */
  81         struct Qdisc    *qdisc;
  82
  83         struct qdisc_watchdog watchdog;
  84
  85         s64 latency;
  86         s64 jitter;
  87
  88         u32 loss;
  89         u32 ecn;
  90         u32 limit;
  91         u32 counter;
  92         u32 gap;
  93         u32 duplicate;
  94         u32 reorder;
  95         u32 corrupt;
  96         u64 rate;
  97         s32 packet_overhead;
  98         u32 cell_size;
  99         struct reciprocal_value cell_size_reciprocal;
 100         s32 cell_overhead;
 101
 102         struct crndstate {
 103                 u32 last;
 104                 u32 rho;
 105         } delay_cor, loss_cor, dup_cor, reorder_cor, corrupt_cor;
 106
 107         struct disttable *delay_dist;
 108
 109         enum  {
 110                 CLG_RANDOM,
 111                 CLG_4_STATES,
 112                 CLG_GILB_ELL,
 113         } loss_model;
 114
 115         enum {
 116                 TX_IN_GAP_PERIOD = 1,
 117                 TX_IN_BURST_PERIOD,
 118                 LOST_IN_GAP_PERIOD,
 119                 LOST_IN_BURST_PERIOD,
 120         } _4_state_model;
 121
 122         enum {
 123                 GOOD_STATE = 1,
 124                 BAD_STATE,
 125         } GE_state_model;
 126
 127         /* Correlated Loss Generation models */
 128         struct clgstate {
 129                 /* state of the Markov chain */
 130                 u8 state;
 131
 132                 /* 4-states and Gilbert-Elliot models */
 133                 u32 a1; /* p13 for 4-states or p for GE */
 134                 u32 a2; /* p31 for 4-states or r for GE */
 135                 u32 a3; /* p32 for 4-states or h for GE */
 136                 u32 a4; /* p14 for 4-states or 1-k for GE */
 137                 u32 a5; /* p23 used only in 4-states */
 138         } clg;
 139
 140         struct tc_netem_slot slot_config;
 141         struct slotstate {
 142                 u64 slot_next;
 143                 s32 packets_left;
 144                 s32 bytes_left;
 145         } slot;
 146
 147         struct disttable *slot_dist;
 148 };
 149
 150 /* Time stamp put into socket buffer control block
 151  * Only valid when skbs are in our internal t(ime)fifo queue.
 152  *
 153  * As skb->rbnode uses same storage than skb->next, skb->prev and skb->tstamp,
 154  * and skb->next & skb->prev are scratch space for a qdisc,
 155  * we save skb->tstamp value in skb->cb[] before destroying it.
 156  */
 157 struct netem_skb_cb {
 158         u64             time_to_send;
 159 };
 160
 161 static inline struct netem_skb_cb *netem_skb_cb(struct sk_buff *skb)
 162 {
 163         /* we assume we can use skb next/prev/tstamp as storage for rb_node */
 164         qdisc_cb_private_validate(skb, sizeof(struct netem_skb_cb));
 165         return (struct netem_skb_cb *)qdisc_skb_cb(skb)->data;
 166 }
 167
 168 /* init_crandom - initialize correlated random number generator
 169  * Use entropy source for initial seed.
 170  */
 171 static void init_crandom(struct crndstate *state, unsigned long rho)
 172 {
 173         state->rho = rho;
 174         state->last = prandom_u32();
 175 }
 176
 177 /* get_crandom - correlated random number generator
 178  * Next number depends on last value.
 179  * rho is scaled to avoid floating point.
 180  */
 181 static u32 get_crandom(struct crndstate *state)
 182 {
 183         u64 value, rho;
 184         unsigned long answer;
 185
 186         if (!state || state->rho == 0)  /* no correlation */
 187                 return prandom_u32();
 188
 189         value = prandom_u32();
 190         rho = (u64)state->rho + 1;
 191         answer = (value * ((1ull<<32) - rho) + state->last * rho) >> 32;
 192         state->last = answer;
 193         return answer;
 194 }
 195
 196 /* loss_4state - 4-state model loss generator
 197  * Generates losses according to the 4-state Markov chain adopted in
 198  * the GI (General and Intuitive) loss model.
 199  */
 200 static bool loss_4state(struct netem_sched_data *q)
 201 {
 202         struct clgstate *clg = &q->clg;
 203         u32 rnd = prandom_u32();
 204
 205         /*
 206          * Makes a comparison between rnd and the transition
 207          * probabilities outgoing from the current state, then decides the
 208          * next state and if the next packet has to be transmitted or lost.
 209          * The four states correspond to:
 210          *   TX_IN_GAP_PERIOD => successfully transmitted packets within a gap period
 211          *   LOST_IN_BURST_PERIOD => isolated losses within a gap period
 212          *   LOST_IN_GAP_PERIOD => lost packets within a burst period
 213          *   TX_IN_GAP_PERIOD => successfully transmitted packets within a burst period
 214          */
 215         switch (clg->state) {
 216         case TX_IN_GAP_PERIOD:
 217                 if (rnd < clg->a4) {
 218                         clg->state = LOST_IN_BURST_PERIOD;
 219                         return true;
 220                 } else if (clg->a4 < rnd && rnd < clg->a1 + clg->a4) {
 221                         clg->state = LOST_IN_GAP_PERIOD;
 222                         return true;
 223                 } else if (clg->a1 + clg->a4 < rnd) {
 224                         clg->state = TX_IN_GAP_PERIOD;
 225                 }
 226
 227                 break;
 228         case TX_IN_BURST_PERIOD:
 229                 if (rnd < clg->a5) {
 230                         clg->state = LOST_IN_GAP_PERIOD;
 231                         return true;
 232                 } else {
 233                         clg->state = TX_IN_BURST_PERIOD;
 234                 }
 235
 236                 break;
 237         case LOST_IN_GAP_PERIOD:
 238                 if (rnd < clg->a3)
 239                         clg->state = TX_IN_BURST_PERIOD;
 240                 else if (clg->a3 < rnd && rnd < clg->a2 + clg->a3) {
 241                         clg->state = TX_IN_GAP_PERIOD;
 242                 } else if (clg->a2 + clg->a3 < rnd) {
 243                         clg->state = LOST_IN_GAP_PERIOD;
 244                         return true;
 245                 }
 246                 break;
 247         case LOST_IN_BURST_PERIOD:
 248                 clg->state = TX_IN_GAP_PERIOD;
 249                 break;
 250         }
 251
 252         return false;
 253 }
 254
 255 /* loss_gilb_ell - Gilbert-Elliot model loss generator
 256  * Generates losses according to the Gilbert-Elliot loss model or
 257  * its special cases  (Gilbert or Simple Gilbert)
 258  *
 259  * Makes a comparison between random number and the transition
 260  * probabilities outgoing from the current state, then decides the
 261  * next state. A second random number is extracted and the comparison
 262  * with the loss probability of the current state decides if the next
 263  * packet will be transmitted or lost.
 264  */
 265 static bool loss_gilb_ell(struct netem_sched_data *q)
 266 {
 267         struct clgstate *clg = &q->clg;
 268
 269         switch (clg->state) {
 270         case GOOD_STATE:
 271                 if (prandom_u32() < clg->a1)
 272                         clg->state = BAD_STATE;
 273                 if (prandom_u32() < clg->a4)
 274                         return true;
 275                 break;
 276         case BAD_STATE:
 277                 if (prandom_u32() < clg->a2)
 278                         clg->state = GOOD_STATE;
 279                 if (prandom_u32() > clg->a3)
 280                         return true;
 281         }
 282
 283         return false;
 284 }
 285
 286 static bool loss_event(struct netem_sched_data *q)
 287 {
 288         switch (q->loss_model) {
 289         case CLG_RANDOM:
 290                 /* Random packet drop 0 => none, ~0 => all */
 291                 return q->loss && q->loss >= get_crandom(&q->loss_cor);
 292
 293         case CLG_4_STATES:
 294                 /* 4state loss model algorithm (used also for GI model)
 295                 * Extracts a value from the markov 4 state loss generator,
 296                 * if it is 1 drops a packet and if needed writes the event in
 297                 * the kernel logs
 298                 */
 299                 return loss_4state(q);
 300
 301         case CLG_GILB_ELL:
 302                 /* Gilbert-Elliot loss model algorithm
 303                 * Extracts a value from the Gilbert-Elliot loss generator,
 304                 * if it is 1 drops a packet and if needed writes the event in
 305                 * the kernel logs
 306                 */
 307                 return loss_gilb_ell(q);
 308         }
 309
 310         return false;   /* not reached */
 311 }
 312
 313
 314 /* tabledist - return a pseudo-randomly distributed value with mean mu and
 315  * std deviation sigma.  Uses table lookup to approximate the desired
 316  * distribution, and a uniformly-distributed pseudo-random source.
 317  */
 318 static s64 tabledist(s64 mu, s32 sigma,
 319                      struct crndstate *state,
 320                      const struct disttable *dist)
 321 {
 322         s64 x;
 323         long t;
 324         u32 rnd;
 325
 326         if (sigma == 0)
 327                 return mu;
 328
 329         rnd = get_crandom(state);
 330
 331         /* default uniform distribution */
 332         if (dist == NULL)
 333                 return ((rnd % (2 * (u32)sigma)) + mu) - sigma;
 334
 335         t = dist->table[rnd % dist->size];
 336         x = (sigma % NETEM_DIST_SCALE) * t;
 337         if (x >= 0)
 338                 x += NETEM_DIST_SCALE/2;
 339         else
 340                 x -= NETEM_DIST_SCALE/2;
 341
 342         return  x / NETEM_DIST_SCALE + (sigma / NETEM_DIST_SCALE) * t + mu;
 343 }
 344
 345 static u64 packet_time_ns(u64 len, const struct netem_sched_data *q)
 346 {
 347         len += q->packet_overhead;
 348
 349         if (q->cell_size) {
 350                 u32 cells = reciprocal_divide(len, q->cell_size_reciprocal);
 351
 352                 if (len > cells * q->cell_size) /* extra cell needed for remainder */
 353                         cells++;
 354                 len = cells * (q->cell_size + q->cell_overhead);
 355         }
 356
 357         return div64_u64(len * NSEC_PER_SEC, q->rate);
 358 }
 359
 360 static void tfifo_reset(struct Qdisc *sch)
 361 {
 362         struct netem_sched_data *q = qdisc_priv(sch);
 363         struct rb_node *p = rb_first(&q->t_root);
 364
 365         while (p) {
 366                 struct sk_buff *skb = rb_to_skb(p);
 367
 368                 p = rb_next(p);
 369                 rb_erase(&skb->rbnode, &q->t_root);
 370                 rtnl_kfree_skbs(skb, skb);
 371         }
 372 }
 373
 374 static void tfifo_enqueue(struct sk_buff *nskb, struct Qdisc *sch)
 375 {
 376         struct netem_sched_data *q = qdisc_priv(sch);
 377         u64 tnext = netem_skb_cb(nskb)->time_to_send;
 378         struct rb_node **p = &q->t_root.rb_node, *parent = NULL;
 379
 380         while (*p) {
 381                 struct sk_buff *skb;
 382
 383                 parent = *p;
 384                 skb = rb_to_skb(parent);
 385                 if (tnext >= netem_skb_cb(skb)->time_to_send)
 386                         p = &parent->rb_right;
 387                 else
 388                         p = &parent->rb_left;
 389         }
 390         rb_link_node(&nskb->rbnode, parent, p);
 391         rb_insert_color(&nskb->rbnode, &q->t_root);
 392         sch->q.qlen++;
 393 }
 394
 395 /* netem can't properly corrupt a megapacket (like we get from GSO), so instead
 396  * when we statistically choose to corrupt one, we instead segment it, returning
 397  * the first packet to be corrupted, and re-enqueue the remaining frames
 398  */
 399 static struct sk_buff *netem_segment(struct sk_buff *skb, struct Qdisc *sch,
 400                                      struct sk_buff **to_free)
 401 {
 402         struct sk_buff *segs;
 403         netdev_features_t features = netif_skb_features(skb);
 404
 405         segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
 406
 407         if (IS_ERR_OR_NULL(segs)) {
 408                 qdisc_drop(skb, sch, to_free);
 409                 return NULL;
 410         }
 411         consume_skb(skb);
 412         return segs;
 413 }
 414
 415 static void netem_enqueue_skb_head(struct qdisc_skb_head *qh, struct sk_buff *skb)
 416 {
 417         skb->next = qh->head;
 418
 419         if (!qh->head)
 420                 qh->tail = skb;
 421         qh->head = skb;
 422         qh->qlen++;
 423 }
 424
 425 /*
 426  * Insert one skb into qdisc.
 427  * Note: parent depends on return value to account for queue length.
 428  *      NET_XMIT_DROP: queue length didn't change.
 429  *      NET_XMIT_SUCCESS: one skb was queued.
 430  */
 431 static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
 432                          struct sk_buff **to_free)
 433 {
 434         struct netem_sched_data *q = qdisc_priv(sch);
 435         /* We don't fill cb now as skb_unshare() may invalidate it */
 436         struct netem_skb_cb *cb;
 437         struct sk_buff *skb2;
 438         struct sk_buff *segs = NULL;
 439         unsigned int prev_len = qdisc_pkt_len(skb);
 440         int count = 1;
 441         int rc = NET_XMIT_SUCCESS;
 442         int rc_drop = NET_XMIT_DROP;
 443
 444         /* Do not fool qdisc_drop_all() */
 445         skb->prev = NULL;
 446
 447         /* Random duplication */
 448         if (q->duplicate && q->duplicate >= get_crandom(&q->dup_cor))
 449                 ++count;
 450
 451         /* Drop packet? */
 452         if (loss_event(q)) {
 453                 if (q->ecn && INET_ECN_set_ce(skb))
 454                         qdisc_qstats_drop(sch); /* mark packet */
 455                 else
 456                         --count;
 457         }
 458         if (count == 0) {
 459                 qdisc_qstats_drop(sch);
 460                 __qdisc_drop(skb, to_free);
 461                 return NET_XMIT_SUCCESS | __NET_XMIT_BYPASS;
 462         }
 463
 464         /* If a delay is expected, orphan the skb. (orphaning usually takes
 465          * place at TX completion time, so _before_ the link transit delay)
 466          */
 467         if (q->latency || q->jitter || q->rate)
 468                 skb_orphan_partial(skb);
 469
 470         /*
 471          * If we need to duplicate packet, then re-insert at top of the
 472          * qdisc tree, since parent queuer expects that only one
 473          * skb will be queued.
 474          */
 475         if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
 476                 struct Qdisc *rootq = qdisc_root_bh(sch);
 477                 u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
 478
 479                 q->duplicate = 0;
 480                 rootq->enqueue(skb2, rootq, to_free);
 481                 q->duplicate = dupsave;
 482                 rc_drop = NET_XMIT_SUCCESS;
 483         }
 484
 485         /*
 486          * Randomized packet corruption.
 487          * Make copy if needed since we are modifying
 488          * If packet is going to be hardware checksummed, then
 489          * do it now in software before we mangle it.
 490          */
 491         if (q->corrupt && q->corrupt >= get_crandom(&q->corrupt_cor)) {
 492                 if (skb_is_gso(skb)) {
 493                         segs = netem_segment(skb, sch, to_free);
 494                         if (!segs)
 495                                 return rc_drop;
 496                         qdisc_skb_cb(segs)->pkt_len = segs->len;
 497                 } else {
 498                         segs = skb;
 499                 }
 500
 501                 skb = segs;
 502                 segs = segs->next;
 503
 504                 skb = skb_unshare(skb, GFP_ATOMIC);
 505                 if (unlikely(!skb)) {
 506                         qdisc_qstats_drop(sch);
 507                         goto finish_segs;
 508                 }
 509                 if (skb->ip_summed == CHECKSUM_PARTIAL &&
 510                     skb_checksum_help(skb)) {
 511                         qdisc_drop(skb, sch, to_free);
 512                         skb = NULL;
 513                         goto finish_segs;
 514                 }
 515
 516                 skb->data[prandom_u32() % skb_headlen(skb)] ^=
 517                         1<<(prandom_u32() % 8);
 518         }
 519
 520         if (unlikely(sch->q.qlen >= sch->limit)) {
 521                 qdisc_drop_all(skb, sch, to_free);
 522                 return rc_drop;
 523         }
 524
 525         qdisc_qstats_backlog_inc(sch, skb);
 526
 527         cb = netem_skb_cb(skb);
 528         if (q->gap == 0 ||              /* not doing reordering */
 529             q->counter < q->gap - 1 ||  /* inside last reordering gap */
 530             q->reorder < get_crandom(&q->reorder_cor)) {
 531                 u64 now;
 532                 s64 delay;
 533
 534                 delay = tabledist(q->latency, q->jitter,
 535                                   &q->delay_cor, q->delay_dist);
 536
 537                 now = ktime_get_ns();
 538
 539                 if (q->rate) {
 540                         struct netem_skb_cb *last = NULL;
 541
 542                         if (sch->q.tail)
 543                                 last = netem_skb_cb(sch->q.tail);
 544                         if (q->t_root.rb_node) {
 545                                 struct sk_buff *t_skb;
 546                                 struct netem_skb_cb *t_last;
 547
 548                                 t_skb = skb_rb_last(&q->t_root);
 549                                 t_last = netem_skb_cb(t_skb);
 550                                 if (!last ||
 551                                     t_last->time_to_send > last->time_to_send) {
 552                                         last = t_last;
 553                                 }
 554                         }
 555
 556                         if (last) {
 557                                 /*
 558                                  * Last packet in queue is reference point (now),
 559                                  * calculate this time bonus and subtract
 560                                  * from delay.
 561                                  */
 562                                 delay -= last->time_to_send - now;
 563                                 delay = max_t(s64, 0, delay);
 564                                 now = last->time_to_send;
 565                         }
 566
 567                         delay += packet_time_ns(qdisc_pkt_len(skb), q);
 568                 }
 569
 570                 cb->time_to_send = now + delay;
 571                 ++q->counter;
 572                 tfifo_enqueue(skb, sch);
 573         } else {
 574                 /*
 575                  * Do re-ordering by putting one out of N packets at the front
 576                  * of the queue.
 577                  */
 578                 cb->time_to_send = ktime_get_ns();
 579                 q->counter = 0;
 580
 581                 netem_enqueue_skb_head(&sch->q, skb);
 582                 sch->qstats.requeues++;
 583         }
 584
 585 finish_segs:
 586         if (segs) {
 587                 unsigned int len, last_len;
 588                 int nb;
 589
 590                 len = skb ? skb->len : 0;
 591                 nb = skb ? 1 : 0;
 592
 593                 while (segs) {
 594                         skb2 = segs->next;
 595                         segs->next = NULL;
 596                         qdisc_skb_cb(segs)->pkt_len = segs->len;
 597                         last_len = segs->len;
 598                         rc = qdisc_enqueue(segs, sch, to_free);
 599                         if (rc != NET_XMIT_SUCCESS) {
 600                                 if (net_xmit_drop_count(rc))
 601                                         qdisc_qstats_drop(sch);
 602                         } else {
 603                                 nb++;
 604                                 len += last_len;
 605                         }
 606                         segs = skb2;
 607                 }
 608                 /* Parent qdiscs accounted for 1 skb of size @prev_len */
 609                 qdisc_tree_reduce_backlog(sch, -(nb - 1), -(len - prev_len));
 610         } else if (!skb) {
 611                 return NET_XMIT_DROP;
 612         }
 613         return NET_XMIT_SUCCESS;
 614 }
 615
 616 /* Delay the next round with a new future slot with a
 617  * correct number of bytes and packets.
 618  */
 619
 620 static void get_slot_next(struct netem_sched_data *q, u64 now)
 621 {
 622         s64 next_delay;
 623
 624         if (!q->slot_dist)
 625                 next_delay = q->slot_config.min_delay +
 626                                 (prandom_u32() *
 627                                  (q->slot_config.max_delay -
 628                                   q->slot_config.min_delay) >> 32);
 629         else
 630                 next_delay = tabledist(q->slot_config.dist_delay,
 631                                        (s32)(q->slot_config.dist_jitter),
 632                                        NULL, q->slot_dist);
 633
 634         q->slot.slot_next = now + next_delay;
 635         q->slot.packets_left = q->slot_config.max_packets;
 636         q->slot.bytes_left = q->slot_config.max_bytes;
 637 }
 638
 639 static struct sk_buff *netem_dequeue(struct Qdisc *sch)
 640 {
 641         struct netem_sched_data *q = qdisc_priv(sch);
 642         struct sk_buff *skb;
 643         struct rb_node *p;
 644
 645 tfifo_dequeue:
 646         skb = __qdisc_dequeue_head(&sch->q);
 647         if (skb) {
 648                 qdisc_qstats_backlog_dec(sch, skb);
 649 deliver:
 650                 qdisc_bstats_update(sch, skb);
 651                 return skb;
 652         }
 653         p = rb_first(&q->t_root);
 654         if (p) {
 655                 u64 time_to_send;
 656                 u64 now = ktime_get_ns();
 657
 658                 skb = rb_to_skb(p);
 659
 660                 /* if more time remaining? */
 661                 time_to_send = netem_skb_cb(skb)->time_to_send;
 662                 if (q->slot.slot_next && q->slot.slot_next < time_to_send)
 663                         get_slot_next(q, now);
 664
 665                 if (time_to_send <= now &&  q->slot.slot_next <= now) {
 666                         rb_erase(p, &q->t_root);
 667                         sch->q.qlen--;
 668                         qdisc_qstats_backlog_dec(sch, skb);
 669                         skb->next = NULL;
 670                         skb->prev = NULL;
 671                         /* skb->dev shares skb->rbnode area,
 672                          * we need to restore its value.
 673                          */
 674                         skb->dev = qdisc_dev(sch);
 675
 676 #ifdef CONFIG_NET_CLS_ACT
 677                         /*
 678                          * If it's at ingress let's pretend the delay is
 679                          * from the network (tstamp will be updated).
 680                          */
 681                         if (skb->tc_redirected && skb->tc_from_ingress)
 682                                 skb->tstamp = 0;
 683 #endif
 684
 685                         if (q->slot.slot_next) {
 686                                 q->slot.packets_left--;
 687                                 q->slot.bytes_left -= qdisc_pkt_len(skb);
 688                                 if (q->slot.packets_left <= 0 ||
 689                                     q->slot.bytes_left <= 0)
 690                                         get_slot_next(q, now);
 691                         }
 692
 693                         if (q->qdisc) {
 694                                 unsigned int pkt_len = qdisc_pkt_len(skb);
 695                                 struct sk_buff *to_free = NULL;
 696                                 int err;
 697
 698                                 err = qdisc_enqueue(skb, q->qdisc, &to_free);
 699                                 kfree_skb_list(to_free);
 700                                 if (err != NET_XMIT_SUCCESS &&
 701                                     net_xmit_drop_count(err)) {
 702                                         qdisc_qstats_drop(sch);
 703                                         qdisc_tree_reduce_backlog(sch, 1,
 704                                                                   pkt_len);
 705                                 }
 706                                 goto tfifo_dequeue;
 707                         }
 708                         goto deliver;
 709                 }
 710
 711                 if (q->qdisc) {
 712                         skb = q->qdisc->ops->dequeue(q->qdisc);
 713                         if (skb)
 714                                 goto deliver;
 715                 }
 716
 717                 qdisc_watchdog_schedule_ns(&q->watchdog,
 718                                            max(time_to_send,
 719                                                q->slot.slot_next));
 720         }
 721
 722         if (q->qdisc) {
 723                 skb = q->qdisc->ops->dequeue(q->qdisc);
 724                 if (skb)
 725                         goto deliver;
 726         }
 727         return NULL;
 728 }
 729
 730 static void netem_reset(struct Qdisc *sch)
 731 {
 732         struct netem_sched_data *q = qdisc_priv(sch);
 733
 734         qdisc_reset_queue(sch);
 735         tfifo_reset(sch);
 736         if (q->qdisc)
 737                 qdisc_reset(q->qdisc);
 738         qdisc_watchdog_cancel(&q->watchdog);
 739 }
 740
 741 static void dist_free(struct disttable *d)
 742 {
 743         kvfree(d);
 744 }
 745
 746 /*
 747  * Distribution data is a variable size payload containing
 748  * signed 16 bit values.
 749  */
 750
 751 static int get_dist_table(struct Qdisc *sch, struct disttable **tbl,
 752                           const struct nlattr *attr)
 753 {
 754         size_t n = nla_len(attr)/sizeof(__s16);
 755         const __s16 *data = nla_data(attr);
 756         spinlock_t *root_lock;
 757         struct disttable *d;
 758         int i;
 759
 760         if (!n || n > NETEM_DIST_MAX)
 761                 return -EINVAL;
 762
 763         d = kvmalloc(sizeof(struct disttable) + n * sizeof(s16), GFP_KERNEL);
 764         if (!d)
 765                 return -ENOMEM;
 766
 767         d->size = n;
 768         for (i = 0; i < n; i++)
 769                 d->table[i] = data[i];
 770
 771         root_lock = qdisc_root_sleeping_lock(sch);
 772
 773         spin_lock_bh(root_lock);
 774         swap(*tbl, d);
 775         spin_unlock_bh(root_lock);
 776
 777         dist_free(d);
 778         return 0;
 779 }
 780
 781 static void get_slot(struct netem_sched_data *q, const struct nlattr *attr)
 782 {
 783         const struct tc_netem_slot *c = nla_data(attr);
 784
 785         q->slot_config = *c;
 786         if (q->slot_config.max_packets == 0)
 787                 q->slot_config.max_packets = INT_MAX;
 788         if (q->slot_config.max_bytes == 0)
 789                 q->slot_config.max_bytes = INT_MAX;
 790
 791         /* capping dist_jitter to the range acceptable by tabledist() */
 792         q->slot_config.dist_jitter = min_t(__s64, INT_MAX, abs(q->slot_config.dist_jitter));
 793
 794         q->slot.packets_left = q->slot_config.max_packets;
 795         q->slot.bytes_left = q->slot_config.max_bytes;
 796         if (q->slot_config.min_delay | q->slot_config.max_delay |
 797             q->slot_config.dist_jitter)
 798                 q->slot.slot_next = ktime_get_ns();
 799         else
 800                 q->slot.slot_next = 0;
 801 }
 802
 803 static void get_correlation(struct netem_sched_data *q, const struct nlattr *attr)
 804 {
 805         const struct tc_netem_corr *c = nla_data(attr);
 806
 807         init_crandom(&q->delay_cor, c->delay_corr);
 808         init_crandom(&q->loss_cor, c->loss_corr);
 809         init_crandom(&q->dup_cor, c->dup_corr);
 810 }
 811
 812 static void get_reorder(struct netem_sched_data *q, const struct nlattr *attr)
 813 {
 814         const struct tc_netem_reorder *r = nla_data(attr);
 815
 816         q->reorder = r->probability;
 817         init_crandom(&q->reorder_cor, r->correlation);
 818 }
 819
 820 static void get_corrupt(struct netem_sched_data *q, const struct nlattr *attr)
 821 {
 822         const struct tc_netem_corrupt *r = nla_data(attr);
 823
 824         q->corrupt = r->probability;
 825         init_crandom(&q->corrupt_cor, r->correlation);
 826 }
 827
 828 static void get_rate(struct netem_sched_data *q, const struct nlattr *attr)
 829 {
 830         const struct tc_netem_rate *r = nla_data(attr);
 831
 832         q->rate = r->rate;
 833         q->packet_overhead = r->packet_overhead;
 834         q->cell_size = r->cell_size;
 835         q->cell_overhead = r->cell_overhead;
 836         if (q->cell_size)
 837                 q->cell_size_reciprocal = reciprocal_value(q->cell_size);
 838         else
 839                 q->cell_size_reciprocal = (struct reciprocal_value) { 0 };
 840 }
 841
 842 static int get_loss_clg(struct netem_sched_data *q, const struct nlattr *attr)
 843 {
 844         const struct nlattr *la;
 845         int rem;
 846
 847         nla_for_each_nested(la, attr, rem) {
 848                 u16 type = nla_type(la);
 849
 850                 switch (type) {
 851                 case NETEM_LOSS_GI: {
 852                         const struct tc_netem_gimodel *gi = nla_data(la);
 853
 854                         if (nla_len(la) < sizeof(struct tc_netem_gimodel)) {
 855                                 pr_info("netem: incorrect gi model size\n");
 856                                 return -EINVAL;
 857                         }
 858
 859                         q->loss_model = CLG_4_STATES;
 860
 861                         q->clg.state = TX_IN_GAP_PERIOD;
 862                         q->clg.a1 = gi->p13;
 863                         q->clg.a2 = gi->p31;
 864                         q->clg.a3 = gi->p32;
 865                         q->clg.a4 = gi->p14;
 866                         q->clg.a5 = gi->p23;
 867                         break;
 868                 }
 869
 870                 case NETEM_LOSS_GE: {
 871                         const struct tc_netem_gemodel *ge = nla_data(la);
 872
 873                         if (nla_len(la) < sizeof(struct tc_netem_gemodel)) {
 874                                 pr_info("netem: incorrect ge model size\n");
 875                                 return -EINVAL;
 876                         }
 877
 878                         q->loss_model = CLG_GILB_ELL;
 879                         q->clg.state = GOOD_STATE;
 880                         q->clg.a1 = ge->p;
 881                         q->clg.a2 = ge->r;
 882                         q->clg.a3 = ge->h;
 883                         q->clg.a4 = ge->k1;
 884                         break;
 885                 }
 886
 887                 default:
 888                         pr_info("netem: unknown loss type %u\n", type);
 889                         return -EINVAL;
 890                 }
 891         }
 892
 893         return 0;
 894 }
 895
 896 static const struct nla_policy netem_policy[TCA_NETEM_MAX + 1] = {
 897         [TCA_NETEM_CORR]        = { .len = sizeof(struct tc_netem_corr) },
 898         [TCA_NETEM_REORDER]     = { .len = sizeof(struct tc_netem_reorder) },
 899         [TCA_NETEM_CORRUPT]     = { .len = sizeof(struct tc_netem_corrupt) },
 900         [TCA_NETEM_RATE]        = { .len = sizeof(struct tc_netem_rate) },
 901         [TCA_NETEM_LOSS]        = { .type = NLA_NESTED },
 902         [TCA_NETEM_ECN]         = { .type = NLA_U32 },
 903         [TCA_NETEM_RATE64]      = { .type = NLA_U64 },
 904         [TCA_NETEM_LATENCY64]   = { .type = NLA_S64 },
 905         [TCA_NETEM_JITTER64]    = { .type = NLA_S64 },
 906         [TCA_NETEM_SLOT]        = { .len = sizeof(struct tc_netem_slot) },
 907 };
 908
 909 static int parse_attr(struct nlattr *tb[], int maxtype, struct nlattr *nla,
 910                       const struct nla_policy *policy, int len)
 911 {
 912         int nested_len = nla_len(nla) - NLA_ALIGN(len);
 913
 914         if (nested_len < 0) {
 915                 pr_info("netem: invalid attributes len %d\n", nested_len);
 916                 return -EINVAL;
 917         }
 918
 919         if (nested_len >= nla_attr_size(0))
 920                 return nla_parse(tb, maxtype, nla_data(nla) + NLA_ALIGN(len),
 921                                  nested_len, policy, NULL);
 922
 923         memset(tb, 0, sizeof(struct nlattr *) * (maxtype + 1));
 924         return 0;
 925 }
 926
 927 /* Parse netlink message to set options */
 928 static int netem_change(struct Qdisc *sch, struct nlattr *opt,
 929                         struct netlink_ext_ack *extack)
 930 {
 931         struct netem_sched_data *q = qdisc_priv(sch);
 932         struct nlattr *tb[TCA_NETEM_MAX + 1];
 933         struct tc_netem_qopt *qopt;
 934         struct clgstate old_clg;
 935         int old_loss_model = CLG_RANDOM;
 936         int ret;
 937
 938         if (opt == NULL)
 939                 return -EINVAL;
 940
 941         qopt = nla_data(opt);
 942         ret = parse_attr(tb, TCA_NETEM_MAX, opt, netem_policy, sizeof(*qopt));
 943         if (ret < 0)
 944                 return ret;
 945
 946         /* backup q->clg and q->loss_model */
 947         old_clg = q->clg;
 948         old_loss_model = q->loss_model;
 949
 950         if (tb[TCA_NETEM_LOSS]) {
 951                 ret = get_loss_clg(q, tb[TCA_NETEM_LOSS]);
 952                 if (ret) {
 953                         q->loss_model = old_loss_model;
 954                         return ret;
 955                 }
 956         } else {
 957                 q->loss_model = CLG_RANDOM;
 958         }
 959
 960         if (tb[TCA_NETEM_DELAY_DIST]) {
 961                 ret = get_dist_table(sch, &q->delay_dist,
 962                                      tb[TCA_NETEM_DELAY_DIST]);
 963                 if (ret)
 964                         goto get_table_failure;
 965         }
 966
 967         if (tb[TCA_NETEM_SLOT_DIST]) {
 968                 ret = get_dist_table(sch, &q->slot_dist,
 969                                      tb[TCA_NETEM_SLOT_DIST]);
 970                 if (ret)
 971                         goto get_table_failure;
 972         }
 973
 974         sch->limit = qopt->limit;
 975
 976         q->latency = PSCHED_TICKS2NS(qopt->latency);
 977         q->jitter = PSCHED_TICKS2NS(qopt->jitter);
 978         q->limit = qopt->limit;
 979         q->gap = qopt->gap;
 980         q->counter = 0;
 981         q->loss = qopt->loss;
 982         q->duplicate = qopt->duplicate;
 983
 984         /* for compatibility with earlier versions.
 985          * if gap is set, need to assume 100% probability
 986          */
 987         if (q->gap)
 988                 q->reorder = ~0;
 989
 990         if (tb[TCA_NETEM_CORR])
 991                 get_correlation(q, tb[TCA_NETEM_CORR]);
 992
 993         if (tb[TCA_NETEM_REORDER])
 994                 get_reorder(q, tb[TCA_NETEM_REORDER]);
 995
 996         if (tb[TCA_NETEM_CORRUPT])
 997                 get_corrupt(q, tb[TCA_NETEM_CORRUPT]);
 998
 999         if (tb[TCA_NETEM_RATE])
1000                 get_rate(q, tb[TCA_NETEM_RATE]);
1001
1002         if (tb[TCA_NETEM_RATE64])
1003                 q->rate = max_t(u64, q->rate,
1004                                 nla_get_u64(tb[TCA_NETEM_RATE64]));
1005
1006         if (tb[TCA_NETEM_LATENCY64])
1007                 q->latency = nla_get_s64(tb[TCA_NETEM_LATENCY64]);
1008
1009         if (tb[TCA_NETEM_JITTER64])
1010                 q->jitter = nla_get_s64(tb[TCA_NETEM_JITTER64]);
1011
1012         if (tb[TCA_NETEM_ECN])
1013                 q->ecn = nla_get_u32(tb[TCA_NETEM_ECN]);
1014
1015         if (tb[TCA_NETEM_SLOT])
1016                 get_slot(q, tb[TCA_NETEM_SLOT]);
1017
1018         /* capping jitter to the range acceptable by tabledist() */
1019         q->jitter = min_t(s64, abs(q->jitter), INT_MAX);
1020
1021         return ret;
1022
1023 get_table_failure:
1024         /* recover clg and loss_model, in case of
1025          * q->clg and q->loss_model were modified
1026          * in get_loss_clg()
1027          */
1028         q->clg = old_clg;
1029         q->loss_model = old_loss_model;
1030         return ret;
1031 }
1032
1033 static int netem_init(struct Qdisc *sch, struct nlattr *opt,
1034                       struct netlink_ext_ack *extack)
1035 {
1036         struct netem_sched_data *q = qdisc_priv(sch);
1037         int ret;
1038
1039         qdisc_watchdog_init(&q->watchdog, sch);
1040
1041         if (!opt)
1042                 return -EINVAL;
1043
1044         q->loss_model = CLG_RANDOM;
1045         ret = netem_change(sch, opt, extack);
1046         if (ret)
1047                 pr_info("netem: change failed\n");
1048         return ret;
1049 }
1050
1051 static void netem_destroy(struct Qdisc *sch)
1052 {
1053         struct netem_sched_data *q = qdisc_priv(sch);
1054
1055         qdisc_watchdog_cancel(&q->watchdog);
1056         if (q->qdisc)
1057                 qdisc_destroy(q->qdisc);
1058         dist_free(q->delay_dist);
1059         dist_free(q->slot_dist);
1060 }
1061
1062 static int dump_loss_model(const struct netem_sched_data *q,
1063                            struct sk_buff *skb)
1064 {
1065         struct nlattr *nest;
1066
1067         nest = nla_nest_start(skb, TCA_NETEM_LOSS);
1068         if (nest == NULL)
1069                 goto nla_put_failure;
1070
1071         switch (q->loss_model) {
1072         case CLG_RANDOM:
1073                 /* legacy loss model */
1074                 nla_nest_cancel(skb, nest);
1075                 return 0;       /* no data */
1076
1077         case CLG_4_STATES: {
1078                 struct tc_netem_gimodel gi = {
1079                         .p13 = q->clg.a1,
1080                         .p31 = q->clg.a2,
1081                         .p32 = q->clg.a3,
1082                         .p14 = q->clg.a4,
1083                         .p23 = q->clg.a5,
1084                 };
1085
1086                 if (nla_put(skb, NETEM_LOSS_GI, sizeof(gi), &gi))
1087                         goto nla_put_failure;
1088                 break;
1089         }
1090         case CLG_GILB_ELL: {
1091                 struct tc_netem_gemodel ge = {
1092                         .p = q->clg.a1,
1093                         .r = q->clg.a2,
1094                         .h = q->clg.a3,
1095                         .k1 = q->clg.a4,
1096                 };
1097
1098                 if (nla_put(skb, NETEM_LOSS_GE, sizeof(ge), &ge))
1099                         goto nla_put_failure;
1100                 break;
1101         }
1102         }
1103
1104         nla_nest_end(skb, nest);
1105         return 0;
1106
1107 nla_put_failure:
1108         nla_nest_cancel(skb, nest);
1109         return -1;
1110 }
1111
1112 static int netem_dump(struct Qdisc *sch, struct sk_buff *skb)
1113 {
1114         const struct netem_sched_data *q = qdisc_priv(sch);
1115         struct nlattr *nla = (struct nlattr *) skb_tail_pointer(skb);
1116         struct tc_netem_qopt qopt;
1117         struct tc_netem_corr cor;
1118         struct tc_netem_reorder reorder;
1119         struct tc_netem_corrupt corrupt;
1120         struct tc_netem_rate rate;
1121         struct tc_netem_slot slot;
1122
1123         qopt.latency = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->latency),
1124                              UINT_MAX);
1125         qopt.jitter = min_t(psched_tdiff_t, PSCHED_NS2TICKS(q->jitter),
1126                             UINT_MAX);
1127         qopt.limit = q->limit;
1128         qopt.loss = q->loss;
1129         qopt.gap = q->gap;
1130         qopt.duplicate = q->duplicate;
1131         if (nla_put(skb, TCA_OPTIONS, sizeof(qopt), &qopt))
1132                 goto nla_put_failure;
1133
1134         if (nla_put(skb, TCA_NETEM_LATENCY64, sizeof(q->latency), &q->latency))
1135                 goto nla_put_failure;
1136
1137         if (nla_put(skb, TCA_NETEM_JITTER64, sizeof(q->jitter), &q->jitter))
1138                 goto nla_put_failure;
1139
1140         cor.delay_corr = q->delay_cor.rho;
1141         cor.loss_corr = q->loss_cor.rho;
1142         cor.dup_corr = q->dup_cor.rho;
1143         if (nla_put(skb, TCA_NETEM_CORR, sizeof(cor), &cor))
1144                 goto nla_put_failure;
1145
1146         reorder.probability = q->reorder;
1147         reorder.correlation = q->reorder_cor.rho;
1148         if (nla_put(skb, TCA_NETEM_REORDER, sizeof(reorder), &reorder))
1149                 goto nla_put_failure;
1150
1151         corrupt.probability = q->corrupt;
1152         corrupt.correlation = q->corrupt_cor.rho;
1153         if (nla_put(skb, TCA_NETEM_CORRUPT, sizeof(corrupt), &corrupt))
1154                 goto nla_put_failure;
1155
1156         if (q->rate >= (1ULL << 32)) {
1157                 if (nla_put_u64_64bit(skb, TCA_NETEM_RATE64, q->rate,
1158                                       TCA_NETEM_PAD))
1159                         goto nla_put_failure;
1160                 rate.rate = ~0U;
1161         } else {
1162                 rate.rate = q->rate;
1163         }
1164         rate.packet_overhead = q->packet_overhead;
1165         rate.cell_size = q->cell_size;
1166         rate.cell_overhead = q->cell_overhead;
1167         if (nla_put(skb, TCA_NETEM_RATE, sizeof(rate), &rate))
1168                 goto nla_put_failure;
1169
1170         if (q->ecn && nla_put_u32(skb, TCA_NETEM_ECN, q->ecn))
1171                 goto nla_put_failure;
1172
1173         if (dump_loss_model(q, skb) != 0)
1174                 goto nla_put_failure;
1175
1176         if (q->slot_config.min_delay | q->slot_config.max_delay |
1177             q->slot_config.dist_jitter) {
1178                 slot = q->slot_config;
1179                 if (slot.max_packets == INT_MAX)
1180                         slot.max_packets = 0;
1181                 if (slot.max_bytes == INT_MAX)
1182                         slot.max_bytes = 0;
1183                 if (nla_put(skb, TCA_NETEM_SLOT, sizeof(slot), &slot))
1184                         goto nla_put_failure;
1185         }
1186
1187         return nla_nest_end(skb, nla);
1188
1189 nla_put_failure:
1190         nlmsg_trim(skb, nla);
1191         return -1;
1192 }
1193
1194 static int netem_dump_class(struct Qdisc *sch, unsigned long cl,
1195                           struct sk_buff *skb, struct tcmsg *tcm)
1196 {
1197         struct netem_sched_data *q = qdisc_priv(sch);
1198
1199         if (cl != 1 || !q->qdisc)       /* only one class */
1200                 return -ENOENT;
1201
1202         tcm->tcm_handle |= TC_H_MIN(1);
1203         tcm->tcm_info = q->qdisc->handle;
1204
1205         return 0;
1206 }
1207
1208 static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
1209                      struct Qdisc **old, struct netlink_ext_ack *extack)
1210 {
1211         struct netem_sched_data *q = qdisc_priv(sch);
1212
1213         *old = qdisc_replace(sch, new, &q->qdisc);
1214         return 0;
1215 }
1216
1217 static struct Qdisc *netem_leaf(struct Qdisc *sch, unsigned long arg)
1218 {
1219         struct netem_sched_data *q = qdisc_priv(sch);
1220         return q->qdisc;
1221 }
1222
1223 static unsigned long netem_find(struct Qdisc *sch, u32 classid)
1224 {
1225         return 1;
1226 }
1227
1228 static void netem_walk(struct Qdisc *sch, struct qdisc_walker *walker)
1229 {
1230         if (!walker->stop) {
1231                 if (walker->count >= walker->skip)
1232                         if (walker->fn(sch, 1, walker) < 0) {
1233                                 walker->stop = 1;
1234                                 return;
1235                         }
1236                 walker->count++;
1237         }
1238 }
1239
1240 static const struct Qdisc_class_ops netem_class_ops = {
1241         .graft          =       netem_graft,
1242         .leaf           =       netem_leaf,
1243         .find           =       netem_find,
1244         .walk           =       netem_walk,
1245         .dump           =       netem_dump_class,
1246 };
1247
1248 static struct Qdisc_ops netem_qdisc_ops __read_mostly = {
1249         .id             =       "netem",
1250         .cl_ops         =       &netem_class_ops,
1251         .priv_size      =       sizeof(struct netem_sched_data),
1252         .enqueue        =       netem_enqueue,
1253         .dequeue        =       netem_dequeue,
1254         .peek           =       qdisc_peek_dequeued,
1255         .init           =       netem_init,
1256         .reset          =       netem_reset,
1257         .destroy        =       netem_destroy,
1258         .change         =       netem_change,
1259         .dump           =       netem_dump,
1260         .owner          =       THIS_MODULE,
1261 };
1262
1263
1264 static int __init netem_module_init(void)
1265 {
1266         pr_info("netem: version " VERSION "\n");
1267         return register_qdisc(&netem_qdisc_ops);
1268 }
1269 static void __exit netem_module_exit(void)
1270 {
1271         unregister_qdisc(&netem_qdisc_ops);
1272 }
1273 module_init(netem_module_init)
1274 module_exit(netem_module_exit)
1275 MODULE_LICENSE("GPL");