block/blk-iocost.c

   1 /* SPDX-License-Identifier: GPL-2.0
   2  *
   3  * IO cost model based controller.
   4  *
   5  * Copyright (C) 2019 Tejun Heo <tj@kernel.org>
   6  * Copyright (C) 2019 Andy Newell <newella@fb.com>
   7  * Copyright (C) 2019 Facebook
   8  *
   9  * One challenge of controlling IO resources is the lack of trivially
  10  * observable cost metric.  This is distinguished from CPU and memory where
  11  * wallclock time and the number of bytes can serve as accurate enough
  12  * approximations.
  13  *
  14  * Bandwidth and iops are the most commonly used metrics for IO devices but
  15  * depending on the type and specifics of the device, different IO patterns
  16  * easily lead to multiple orders of magnitude variations rendering them
  17  * useless for the purpose of IO capacity distribution.  While on-device
  18  * time, with a lot of crutches, could serve as a useful approximation for
  19  * non-queued rotational devices, this is no longer viable with modern
  20  * devices, even the rotational ones.
  21  *
  22  * While there is no cost metric we can trivially observe, it isn't a
  23  * complete mystery.  For example, on a rotational device, seek cost
  24  * dominates while a contiguous transfer contributes a smaller amount
  25  * proportional to the size.  If we can characterize at least the relative
  26  * costs of these different types of IOs, it should be possible to
  27  * implement a reasonable work-conserving proportional IO resource
  28  * distribution.
  29  *
  30  * 1. IO Cost Model
  31  *
  32  * IO cost model estimates the cost of an IO given its basic parameters and
  33  * history (e.g. the end sector of the last IO).  The cost is measured in
  34  * device time.  If a given IO is estimated to cost 10ms, the device should
  35  * be able to process ~100 of those IOs in a second.
  36  *
  37  * Currently, there's only one builtin cost model - linear.  Each IO is
  38  * classified as sequential or random and given a base cost accordingly.
  39  * On top of that, a size cost proportional to the length of the IO is
  40  * added.  While simple, this model captures the operational
  41  * characteristics of a wide variety of devices well enough.  Default
  42  * parameters for several different classes of devices are provided and the
  43  * parameters can be configured from userspace via
  44  * /sys/fs/cgroup/io.cost.model.
  45  *
  46  * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
  47  * device-specific coefficients.
  48  *
  49  * 2. Control Strategy
  50  *
  51  * The device virtual time (vtime) is used as the primary control metric.
  52  * The control strategy is composed of the following three parts.
  53  *
  54  * 2-1. Vtime Distribution
  55  *
  56  * When a cgroup becomes active in terms of IOs, its hierarchical share is
  57  * calculated.  Please consider the following hierarchy where the numbers
  58  * inside parentheses denote the configured weights.
  59  *
  60  *           root
  61  *         /       \
  62  *      A (w:100)  B (w:300)
  63  *      /       \
  64  *  A0 (w:100)  A1 (w:100)
  65  *
  66  * If B is idle and only A0 and A1 are actively issuing IOs, as the two are
  67  * of equal weight, each gets 50% share.  If then B starts issuing IOs, B
  68  * gets 300/(100+300) or 75% share, and A0 and A1 equally split the rest,
  69  * 12.5% each.  The distribution mechanism only cares about these flattened
  70  * shares.  They're called hweights (hierarchical weights) and always add
  71  * up to 1 (WEIGHT_ONE).
  72  *
  73  * A given cgroup's vtime runs slower in inverse proportion to its hweight.
  74  * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
  75  * against the device vtime - an IO which takes 10ms on the underlying
  76  * device is considered to take 80ms on A0.
  77  *
  78  * This constitutes the basis of IO capacity distribution.  Each cgroup's
  79  * vtime is running at a rate determined by its hweight.  A cgroup tracks
  80  * the vtime consumed by past IOs and can issue a new IO if doing so
  81  * wouldn't outrun the current device vtime.  Otherwise, the IO is
  82  * suspended until the vtime has progressed enough to cover it.
  83  *
  84  * 2-2. Vrate Adjustment
  85  *
  86  * It's unrealistic to expect the cost model to be perfect.  There are too
  87  * many devices and even on the same device the overall performance
  88  * fluctuates depending on numerous factors such as IO mixture and device
  89  * internal garbage collection.  The controller needs to adapt dynamically.
  90  *
  91  * This is achieved by adjusting the overall IO rate according to how busy
  92  * the device is.  If the device becomes overloaded, we're sending down too
  93  * many IOs and should generally slow down.  If there are waiting issuers
  94  * but the device isn't saturated, we're issuing too few and should
  95  * generally speed up.
  96  *
  97  * To slow down, we lower the vrate - the rate at which the device vtime
  98  * passes compared to the wall clock.  For example, if the vtime is running
  99  * at the vrate of 75%, all cgroups added up would only be able to issue
 100  * 750ms worth of IOs per second, and vice-versa for speeding up.
 101  *
 102  * Device busyness is determined using two criteria - rq wait and
 103  * completion latencies.
 104  *
 105  * When a device gets saturated, the on-device and then the request queues
 106  * fill up and a bio which is ready to be issued has to wait for a request
 107  * to become available.  When this delay becomes noticeable, it's a clear
 108  * indication that the device is saturated and we lower the vrate.  This
 109  * saturation signal is fairly conservative as it only triggers when both
 110  * hardware and software queues are filled up, and is used as the default
 111  * busy signal.
 112  *
 113  * As devices can have deep queues and be unfair in how the queued commands
 114  * are executed, solely depending on rq wait may not result in satisfactory
 115  * control quality.  For a better control quality, completion latency QoS
 116  * parameters can be configured so that the device is considered saturated
 117  * if N'th percentile completion latency rises above the set point.
 118  *
 119  * The completion latency requirements are a function of both the
 120  * underlying device characteristics and the desired IO latency quality of
 121  * service.  There is an inherent trade-off - the tighter the latency QoS,
 122  * the higher the bandwidth lossage.  Latency QoS is disabled by default
 123  * and can be set through /sys/fs/cgroup/io.cost.qos.
 124  *
 125  * 2-3. Work Conservation
 126  *
 127  * Imagine two cgroups A and B with equal weights.  A is issuing a small IO
 128  * periodically while B is sending out enough parallel IOs to saturate the
 129  * device on its own.  Let's say A's usage amounts to 100ms worth of IO
 130  * cost per second, i.e., 10% of the device capacity.  The naive
 131  * distribution of half and half would lead to 60% utilization of the
 132  * device, a significant reduction in the total amount of work done
 133  * compared to free-for-all competition.  This is too high a cost to pay
 134  * for IO control.
 135  *
 136  * To conserve the total amount of work done, we keep track of how much
 137  * each active cgroup is actually using and yield part of its weight if
 138  * there are other cgroups which can make use of it.  In the above case,
 139  * A's weight will be lowered so that it hovers above the actual usage and
 140  * B would be able to use the rest.
 141  *
 142  * As we don't want to penalize a cgroup for donating its weight, the
 143  * surplus weight adjustment factors in a margin and has an immediate
 144  * snapback mechanism in case the cgroup needs more IO vtime for itself.
 145  *
 146  * Note that adjusting down surplus weights has the same effects as
 147  * accelerating vtime for other cgroups and work conservation can also be
 148  * implemented by adjusting vrate dynamically.  However, squaring who can
 149  * donate and should take back how much requires hweight propagations
 150  * anyway making it easier to implement and understand as a separate
 151  * mechanism.
 152  *
 153  * 3. Monitoring
 154  *
 155  * Instead of debugfs or other clumsy monitoring mechanisms, this
 156  * controller uses a drgn based monitoring script -
 157  * tools/cgroup/iocost_monitor.py.  For details on drgn, please see
 158  * https://github.com/osandov/drgn.  The output looks like the following.
 159  *
 160  *  sdb RUN   per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
 161  *                 active      weight      hweight% inflt% dbt  delay usages%
 162  *  test/a              *    50/   50  33.33/ 33.33  27.65   2  0*041 033:033:033
 163  *  test/b              *   100/  100  66.67/ 66.67  17.56   0  0*000 066:079:077
 164  *
 165  * - per        : Timer period
 166  * - cur_per    : Internal wall and device vtime clock
 167  * - vrate      : Device virtual time rate against wall clock
 168  * - weight     : Surplus-adjusted and configured weights
 169  * - hweight    : Surplus-adjusted and configured hierarchical weights
 170  * - inflt      : The percentage of in-flight IO cost at the end of last period
 171  * - delay      : Deferred issuer delay induction level and duration
 172  * - usages     : Usage history
 173  */
 174
 175 #include <linux/kernel.h>
 176 #include <linux/module.h>
 177 #include <linux/timer.h>
 178 #include <linux/time64.h>
 179 #include <linux/parser.h>
 180 #include <linux/sched/signal.h>
 181 #include <linux/blk-cgroup.h>
 182 #include <asm/local.h>
 183 #include <asm/local64.h>
 184 #include "blk-rq-qos.h"
 185 #include "blk-stat.h"
 186 #include "blk-wbt.h"
 187
 188 #ifdef CONFIG_TRACEPOINTS
 189
 190 /* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
 191 #define TRACE_IOCG_PATH_LEN 1024
 192 static DEFINE_SPINLOCK(trace_iocg_path_lock);
 193 static char trace_iocg_path[TRACE_IOCG_PATH_LEN];
 194
 195 #define TRACE_IOCG_PATH(type, iocg, ...)                                        \
 196         do {                                                                    \
 197                 unsigned long flags;                                            \
 198                 if (trace_iocost_##type##_enabled()) {                          \
 199                         spin_lock_irqsave(&trace_iocg_path_lock, flags);        \
 200                         cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup,      \
 201                                     trace_iocg_path, TRACE_IOCG_PATH_LEN);      \
 202                         trace_iocost_##type(iocg, trace_iocg_path,              \
 203                                               ##__VA_ARGS__);                   \
 204                         spin_unlock_irqrestore(&trace_iocg_path_lock, flags);   \
 205                 }                                                               \
 206         } while (0)
 207
 208 #else   /* CONFIG_TRACE_POINTS */
 209 #define TRACE_IOCG_PATH(type, iocg, ...)        do { } while (0)
 210 #endif  /* CONFIG_TRACE_POINTS */
 211
 212 enum {
 213         MILLION                 = 1000000,
 214
 215         /* timer period is calculated from latency requirements, bound it */
 216         MIN_PERIOD              = USEC_PER_MSEC,
 217         MAX_PERIOD              = USEC_PER_SEC,
 218
 219         /*
 220          * iocg->vtime is targeted at 50% behind the device vtime, which
 221          * serves as its IO credit buffer.  Surplus weight adjustment is
 222          * immediately canceled if the vtime margin runs below 10%.
 223          */
 224         MARGIN_MIN_PCT          = 10,
 225         MARGIN_LOW_PCT          = 20,
 226         MARGIN_TARGET_PCT       = 50,
 227
 228         INUSE_ADJ_STEP_PCT      = 25,
 229
 230         /* Have some play in timer operations */
 231         TIMER_SLACK_PCT         = 1,
 232
 233         /* 1/64k is granular enough and can easily be handled w/ u32 */
 234         WEIGHT_ONE              = 1 << 16,
 235 };
 236
 237 enum {
 238         /*
 239          * As vtime is used to calculate the cost of each IO, it needs to
 240          * be fairly high precision.  For example, it should be able to
 241          * represent the cost of a single page worth of discard with
 242          * suffificient accuracy.  At the same time, it should be able to
 243          * represent reasonably long enough durations to be useful and
 244          * convenient during operation.
 245          *
 246          * 1s worth of vtime is 2^37.  This gives us both sub-nanosecond
 247          * granularity and days of wrap-around time even at extreme vrates.
 248          */
 249         VTIME_PER_SEC_SHIFT     = 37,
 250         VTIME_PER_SEC           = 1LLU << VTIME_PER_SEC_SHIFT,
 251         VTIME_PER_USEC          = VTIME_PER_SEC / USEC_PER_SEC,
 252         VTIME_PER_NSEC          = VTIME_PER_SEC / NSEC_PER_SEC,
 253
 254         /* bound vrate adjustments within two orders of magnitude */
 255         VRATE_MIN_PPM           = 10000,        /* 1% */
 256         VRATE_MAX_PPM           = 100000000,    /* 10000% */
 257
 258         VRATE_MIN               = VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
 259         VRATE_CLAMP_ADJ_PCT     = 4,
 260
 261         /* switch iff the conditions are met for longer than this */
 262         AUTOP_CYCLE_NSEC        = 10LLU * NSEC_PER_SEC,
 263 };
 264
 265 enum {
 266         /* if IOs end up waiting for requests, issue less */
 267         RQ_WAIT_BUSY_PCT        = 5,
 268
 269         /* unbusy hysterisis */
 270         UNBUSY_THR_PCT          = 75,
 271
 272         /*
 273          * The effect of delay is indirect and non-linear and a huge amount of
 274          * future debt can accumulate abruptly while unthrottled. Linearly scale
 275          * up delay as debt is going up and then let it decay exponentially.
 276          * This gives us quick ramp ups while delay is accumulating and long
 277          * tails which can help reducing the frequency of debt explosions on
 278          * unthrottle. The parameters are experimentally determined.
 279          *
 280          * The delay mechanism provides adequate protection and behavior in many
 281          * cases. However, this is far from ideal and falls shorts on both
 282          * fronts. The debtors are often throttled too harshly costing a
 283          * significant level of fairness and possibly total work while the
 284          * protection against their impacts on the system can be choppy and
 285          * unreliable.
 286          *
 287          * The shortcoming primarily stems from the fact that, unlike for page
 288          * cache, the kernel doesn't have well-defined back-pressure propagation
 289          * mechanism and policies for anonymous memory. Fully addressing this
 290          * issue will likely require substantial improvements in the area.
 291          */
 292         MIN_DELAY_THR_PCT       = 500,
 293         MAX_DELAY_THR_PCT       = 25000,
 294         MIN_DELAY               = 250,
 295         MAX_DELAY               = 250 * USEC_PER_MSEC,
 296
 297         /* halve debts if avg usage over 100ms is under 50% */
 298         DFGV_USAGE_PCT          = 50,
 299         DFGV_PERIOD             = 100 * USEC_PER_MSEC,
 300
 301         /* don't let cmds which take a very long time pin lagging for too long */
 302         MAX_LAGGING_PERIODS     = 10,
 303
 304         /*
 305          * Count IO size in 4k pages.  The 12bit shift helps keeping
 306          * size-proportional components of cost calculation in closer
 307          * numbers of digits to per-IO cost components.
 308          */
 309         IOC_PAGE_SHIFT          = 12,
 310         IOC_PAGE_SIZE           = 1 << IOC_PAGE_SHIFT,
 311         IOC_SECT_TO_PAGE_SHIFT  = IOC_PAGE_SHIFT - SECTOR_SHIFT,
 312
 313         /* if apart further than 16M, consider randio for linear model */
 314         LCOEF_RANDIO_PAGES      = 4096,
 315 };
 316
 317 enum ioc_running {
 318         IOC_IDLE,
 319         IOC_RUNNING,
 320         IOC_STOP,
 321 };
 322
 323 /* io.cost.qos controls including per-dev enable of the whole controller */
 324 enum {
 325         QOS_ENABLE,
 326         QOS_CTRL,
 327         NR_QOS_CTRL_PARAMS,
 328 };
 329
 330 /* io.cost.qos params */
 331 enum {
 332         QOS_RPPM,
 333         QOS_RLAT,
 334         QOS_WPPM,
 335         QOS_WLAT,
 336         QOS_MIN,
 337         QOS_MAX,
 338         NR_QOS_PARAMS,
 339 };
 340
 341 /* io.cost.model controls */
 342 enum {
 343         COST_CTRL,
 344         COST_MODEL,
 345         NR_COST_CTRL_PARAMS,
 346 };
 347
 348 /* builtin linear cost model coefficients */
 349 enum {
 350         I_LCOEF_RBPS,
 351         I_LCOEF_RSEQIOPS,
 352         I_LCOEF_RRANDIOPS,
 353         I_LCOEF_WBPS,
 354         I_LCOEF_WSEQIOPS,
 355         I_LCOEF_WRANDIOPS,
 356         NR_I_LCOEFS,
 357 };
 358
 359 enum {
 360         LCOEF_RPAGE,
 361         LCOEF_RSEQIO,
 362         LCOEF_RRANDIO,
 363         LCOEF_WPAGE,
 364         LCOEF_WSEQIO,
 365         LCOEF_WRANDIO,
 366         NR_LCOEFS,
 367 };
 368
 369 enum {
 370         AUTOP_INVALID,
 371         AUTOP_HDD,
 372         AUTOP_SSD_QD1,
 373         AUTOP_SSD_DFL,
 374         AUTOP_SSD_FAST,
 375 };
 376
 377 struct ioc_params {
 378         u32                             qos[NR_QOS_PARAMS];
 379         u64                             i_lcoefs[NR_I_LCOEFS];
 380         u64                             lcoefs[NR_LCOEFS];
 381         u32                             too_fast_vrate_pct;
 382         u32                             too_slow_vrate_pct;
 383 };
 384
 385 struct ioc_margins {
 386         s64                             min;
 387         s64                             low;
 388         s64                             target;
 389 };
 390
 391 struct ioc_missed {
 392         local_t                         nr_met;
 393         local_t                         nr_missed;
 394         u32                             last_met;
 395         u32                             last_missed;
 396 };
 397
 398 struct ioc_pcpu_stat {
 399         struct ioc_missed               missed[2];
 400
 401         local64_t                       rq_wait_ns;
 402         u64                             last_rq_wait_ns;
 403 };
 404
 405 /* per device */
 406 struct ioc {
 407         struct rq_qos                   rqos;
 408
 409         bool                            enabled;
 410
 411         struct ioc_params               params;
 412         struct ioc_margins              margins;
 413         u32                             period_us;
 414         u32                             timer_slack_ns;
 415         u64                             vrate_min;
 416         u64                             vrate_max;
 417
 418         spinlock_t                      lock;
 419         struct timer_list               timer;
 420         struct list_head                active_iocgs;   /* active cgroups */
 421         struct ioc_pcpu_stat __percpu   *pcpu_stat;
 422
 423         enum ioc_running                running;
 424         atomic64_t                      vtime_rate;
 425         u64                             vtime_base_rate;
 426         s64                             vtime_err;
 427
 428         seqcount_spinlock_t             period_seqcount;
 429         u64                             period_at;      /* wallclock starttime */
 430         u64                             period_at_vtime; /* vtime starttime */
 431
 432         atomic64_t                      cur_period;     /* inc'd each period */
 433         int                             busy_level;     /* saturation history */
 434
 435         bool                            weights_updated;
 436         atomic_t                        hweight_gen;    /* for lazy hweights */
 437
 438         /* debt forgivness */
 439         u64                             dfgv_period_at;
 440         u64                             dfgv_period_rem;
 441         u64                             dfgv_usage_us_sum;
 442
 443         u64                             autop_too_fast_at;
 444         u64                             autop_too_slow_at;
 445         int                             autop_idx;
 446         bool                            user_qos_params:1;
 447         bool                            user_cost_model:1;
 448 };
 449
 450 struct iocg_pcpu_stat {
 451         local64_t                       abs_vusage;
 452 };
 453
 454 struct iocg_stat {
 455         u64                             usage_us;
 456         u64                             wait_us;
 457         u64                             indebt_us;
 458         u64                             indelay_us;
 459 };
 460
 461 /* per device-cgroup pair */
 462 struct ioc_gq {
 463         struct blkg_policy_data         pd;
 464         struct ioc                      *ioc;
 465
 466         /*
 467          * A iocg can get its weight from two sources - an explicit
 468          * per-device-cgroup configuration or the default weight of the
 469          * cgroup.  `cfg_weight` is the explicit per-device-cgroup
 470          * configuration.  `weight` is the effective considering both
 471          * sources.
 472          *
 473          * When an idle cgroup becomes active its `active` goes from 0 to
 474          * `weight`.  `inuse` is the surplus adjusted active weight.
 475          * `active` and `inuse` are used to calculate `hweight_active` and
 476          * `hweight_inuse`.
 477          *
 478          * `last_inuse` remembers `inuse` while an iocg is idle to persist
 479          * surplus adjustments.
 480          *
 481          * `inuse` may be adjusted dynamically during period. `saved_*` are used
 482          * to determine and track adjustments.
 483          */
 484         u32                             cfg_weight;
 485         u32                             weight;
 486         u32                             active;
 487         u32                             inuse;
 488
 489         u32                             last_inuse;
 490         s64                             saved_margin;
 491
 492         sector_t                        cursor;         /* to detect randio */
 493
 494         /*
 495          * `vtime` is this iocg's vtime cursor which progresses as IOs are
 496          * issued.  If lagging behind device vtime, the delta represents
 497          * the currently available IO budget.  If running ahead, the
 498          * overage.
 499          *
 500          * `vtime_done` is the same but progressed on completion rather
 501          * than issue.  The delta behind `vtime` represents the cost of
 502          * currently in-flight IOs.
 503          */
 504         atomic64_t                      vtime;
 505         atomic64_t                      done_vtime;
 506         u64                             abs_vdebt;
 507
 508         /* current delay in effect and when it started */
 509         u64                             delay;
 510         u64                             delay_at;
 511
 512         /*
 513          * The period this iocg was last active in.  Used for deactivation
 514          * and invalidating `vtime`.
 515          */
 516         atomic64_t                      active_period;
 517         struct list_head                active_list;
 518
 519         /* see __propagate_weights() and current_hweight() for details */
 520         u64                             child_active_sum;
 521         u64                             child_inuse_sum;
 522         u64                             child_adjusted_sum;
 523         int                             hweight_gen;
 524         u32                             hweight_active;
 525         u32                             hweight_inuse;
 526         u32                             hweight_donating;
 527         u32                             hweight_after_donation;
 528
 529         struct list_head                walk_list;
 530         struct list_head                surplus_list;
 531
 532         struct wait_queue_head          waitq;
 533         struct hrtimer                  waitq_timer;
 534
 535         /* timestamp at the latest activation */
 536         u64                             activated_at;
 537
 538         /* statistics */
 539         struct iocg_pcpu_stat __percpu  *pcpu_stat;
 540         struct iocg_stat                local_stat;
 541         struct iocg_stat                desc_stat;
 542         struct iocg_stat                last_stat;
 543         u64                             last_stat_abs_vusage;
 544         u64                             usage_delta_us;
 545         u64                             wait_since;
 546         u64                             indebt_since;
 547         u64                             indelay_since;
 548
 549         /* this iocg's depth in the hierarchy and ancestors including self */
 550         int                             level;
 551         struct ioc_gq                   *ancestors[];
 552 };
 553
 554 /* per cgroup */
 555 struct ioc_cgrp {
 556         struct blkcg_policy_data        cpd;
 557         unsigned int                    dfl_weight;
 558 };
 559
 560 struct ioc_now {
 561         u64                             now_ns;
 562         u64                             now;
 563         u64                             vnow;
 564         u64                             vrate;
 565 };
 566
 567 struct iocg_wait {
 568         struct wait_queue_entry         wait;
 569         struct bio                      *bio;
 570         u64                             abs_cost;
 571         bool                            committed;
 572 };
 573
 574 struct iocg_wake_ctx {
 575         struct ioc_gq                   *iocg;
 576         u32                             hw_inuse;
 577         s64                             vbudget;
 578 };
 579
 580 static const struct ioc_params autop[] = {
 581         [AUTOP_HDD] = {
 582                 .qos                            = {
 583                         [QOS_RLAT]              =        250000, /* 250ms */
 584                         [QOS_WLAT]              =        250000,
 585                         [QOS_MIN]               = VRATE_MIN_PPM,
 586                         [QOS_MAX]               = VRATE_MAX_PPM,
 587                 },
 588                 .i_lcoefs                       = {
 589                         [I_LCOEF_RBPS]          =     174019176,
 590                         [I_LCOEF_RSEQIOPS]      =         41708,
 591                         [I_LCOEF_RRANDIOPS]     =           370,
 592                         [I_LCOEF_WBPS]          =     178075866,
 593                         [I_LCOEF_WSEQIOPS]      =         42705,
 594                         [I_LCOEF_WRANDIOPS]     =           378,
 595                 },
 596         },
 597         [AUTOP_SSD_QD1] = {
 598                 .qos                            = {
 599                         [QOS_RLAT]              =         25000, /* 25ms */
 600                         [QOS_WLAT]              =         25000,
 601                         [QOS_MIN]               = VRATE_MIN_PPM,
 602                         [QOS_MAX]               = VRATE_MAX_PPM,
 603                 },
 604                 .i_lcoefs                       = {
 605                         [I_LCOEF_RBPS]          =     245855193,
 606                         [I_LCOEF_RSEQIOPS]      =         61575,
 607                         [I_LCOEF_RRANDIOPS]     =          6946,
 608                         [I_LCOEF_WBPS]          =     141365009,
 609                         [I_LCOEF_WSEQIOPS]      =         33716,
 610                         [I_LCOEF_WRANDIOPS]     =         26796,
 611                 },
 612         },
 613         [AUTOP_SSD_DFL] = {
 614                 .qos                            = {
 615                         [QOS_RLAT]              =         25000, /* 25ms */
 616                         [QOS_WLAT]              =         25000,
 617                         [QOS_MIN]               = VRATE_MIN_PPM,
 618                         [QOS_MAX]               = VRATE_MAX_PPM,
 619                 },
 620                 .i_lcoefs                       = {
 621                         [I_LCOEF_RBPS]          =     488636629,
 622                         [I_LCOEF_RSEQIOPS]      =          8932,
 623                         [I_LCOEF_RRANDIOPS]     =          8518,
 624                         [I_LCOEF_WBPS]          =     427891549,
 625                         [I_LCOEF_WSEQIOPS]      =         28755,
 626                         [I_LCOEF_WRANDIOPS]     =         21940,
 627                 },
 628                 .too_fast_vrate_pct             =           500,
 629         },
 630         [AUTOP_SSD_FAST] = {
 631                 .qos                            = {
 632                         [QOS_RLAT]              =          5000, /* 5ms */
 633                         [QOS_WLAT]              =          5000,
 634                         [QOS_MIN]               = VRATE_MIN_PPM,
 635                         [QOS_MAX]               = VRATE_MAX_PPM,
 636                 },
 637                 .i_lcoefs                       = {
 638                         [I_LCOEF_RBPS]          =    3102524156LLU,
 639                         [I_LCOEF_RSEQIOPS]      =        724816,
 640                         [I_LCOEF_RRANDIOPS]     =        778122,
 641                         [I_LCOEF_WBPS]          =    1742780862LLU,
 642                         [I_LCOEF_WSEQIOPS]      =        425702,
 643                         [I_LCOEF_WRANDIOPS]     =        443193,
 644                 },
 645                 .too_slow_vrate_pct             =            10,
 646         },
 647 };
 648
 649 /*
 650  * vrate adjust percentages indexed by ioc->busy_level.  We adjust up on
 651  * vtime credit shortage and down on device saturation.
 652  */
 653 static u32 vrate_adj_pct[] =
 654         { 0, 0, 0, 0,
 655           1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
 656           2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
 657           4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
 658
 659 static struct blkcg_policy blkcg_policy_iocost;
 660
 661 /* accessors and helpers */
 662 static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
 663 {
 664         return container_of(rqos, struct ioc, rqos);
 665 }
 666
 667 static struct ioc *q_to_ioc(struct request_queue *q)
 668 {
 669         return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
 670 }
 671
 672 static const char *q_name(struct request_queue *q)
 673 {
 674         if (blk_queue_registered(q))
 675                 return kobject_name(q->kobj.parent);
 676         else
 677                 return "<unknown>";
 678 }
 679
 680 static const char __maybe_unused *ioc_name(struct ioc *ioc)
 681 {
 682         return q_name(ioc->rqos.q);
 683 }
 684
 685 static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
 686 {
 687         return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
 688 }
 689
 690 static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
 691 {
 692         return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
 693 }
 694
 695 static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
 696 {
 697         return pd_to_blkg(&iocg->pd);
 698 }
 699
 700 static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
 701 {
 702         return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
 703                             struct ioc_cgrp, cpd);
 704 }
 705
 706 /*
 707  * Scale @abs_cost to the inverse of @hw_inuse.  The lower the hierarchical
 708  * weight, the more expensive each IO.  Must round up.
 709  */
 710 static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
 711 {
 712         return DIV64_U64_ROUND_UP(abs_cost * WEIGHT_ONE, hw_inuse);
 713 }
 714
 715 /*
 716  * The inverse of abs_cost_to_cost().  Must round up.
 717  */
 718 static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
 719 {
 720         return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE);
 721 }
 722
 723 static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio,
 724                             u64 abs_cost, u64 cost)
 725 {
 726         struct iocg_pcpu_stat *gcs;
 727
 728         bio->bi_iocost_cost = cost;
 729         atomic64_add(cost, &iocg->vtime);
 730
 731         gcs = get_cpu_ptr(iocg->pcpu_stat);
 732         local64_add(abs_cost, &gcs->abs_vusage);
 733         put_cpu_ptr(gcs);
 734 }
 735
 736 static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags)
 737 {
 738         if (lock_ioc) {
 739                 spin_lock_irqsave(&iocg->ioc->lock, *flags);
 740                 spin_lock(&iocg->waitq.lock);
 741         } else {
 742                 spin_lock_irqsave(&iocg->waitq.lock, *flags);
 743         }
 744 }
 745
 746 static void iocg_unlock(struct ioc_gq *iocg, bool unlock_ioc, unsigned long *flags)
 747 {
 748         if (unlock_ioc) {
 749                 spin_unlock(&iocg->waitq.lock);
 750                 spin_unlock_irqrestore(&iocg->ioc->lock, *flags);
 751         } else {
 752                 spin_unlock_irqrestore(&iocg->waitq.lock, *flags);
 753         }
 754 }
 755
 756 #define CREATE_TRACE_POINTS
 757 #include <trace/events/iocost.h>
 758
 759 static void ioc_refresh_margins(struct ioc *ioc)
 760 {
 761         struct ioc_margins *margins = &ioc->margins;
 762         u32 period_us = ioc->period_us;
 763         u64 vrate = ioc->vtime_base_rate;
 764
 765         margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate;
 766         margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate;
 767         margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate;
 768 }
 769
 770 /* latency Qos params changed, update period_us and all the dependent params */
 771 static void ioc_refresh_period_us(struct ioc *ioc)
 772 {
 773         u32 ppm, lat, multi, period_us;
 774
 775         lockdep_assert_held(&ioc->lock);
 776
 777         /* pick the higher latency target */
 778         if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
 779                 ppm = ioc->params.qos[QOS_RPPM];
 780                 lat = ioc->params.qos[QOS_RLAT];
 781         } else {
 782                 ppm = ioc->params.qos[QOS_WPPM];
 783                 lat = ioc->params.qos[QOS_WLAT];
 784         }
 785
 786         /*
 787          * We want the period to be long enough to contain a healthy number
 788          * of IOs while short enough for granular control.  Define it as a
 789          * multiple of the latency target.  Ideally, the multiplier should
 790          * be scaled according to the percentile so that it would nominally
 791          * contain a certain number of requests.  Let's be simpler and
 792          * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
 793          */
 794         if (ppm)
 795                 multi = max_t(u32, (MILLION - ppm) / 50000, 2);
 796         else
 797                 multi = 2;
 798         period_us = multi * lat;
 799         period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
 800
 801         /* calculate dependent params */
 802         ioc->period_us = period_us;
 803         ioc->timer_slack_ns = div64_u64(
 804                 (u64)period_us * NSEC_PER_USEC * TIMER_SLACK_PCT,
 805                 100);
 806         ioc_refresh_margins(ioc);
 807 }
 808
 809 static int ioc_autop_idx(struct ioc *ioc)
 810 {
 811         int idx = ioc->autop_idx;
 812         const struct ioc_params *p = &autop[idx];
 813         u32 vrate_pct;
 814         u64 now_ns;
 815
 816         /* rotational? */
 817         if (!blk_queue_nonrot(ioc->rqos.q))
 818                 return AUTOP_HDD;
 819
 820         /* handle SATA SSDs w/ broken NCQ */
 821         if (blk_queue_depth(ioc->rqos.q) == 1)
 822                 return AUTOP_SSD_QD1;
 823
 824         /* use one of the normal ssd sets */
 825         if (idx < AUTOP_SSD_DFL)
 826                 return AUTOP_SSD_DFL;
 827
 828         /* if user is overriding anything, maintain what was there */
 829         if (ioc->user_qos_params || ioc->user_cost_model)
 830                 return idx;
 831
 832         /* step up/down based on the vrate */
 833         vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC);
 834         now_ns = ktime_get_ns();
 835
 836         if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
 837                 if (!ioc->autop_too_fast_at)
 838                         ioc->autop_too_fast_at = now_ns;
 839                 if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
 840                         return idx + 1;
 841         } else {
 842                 ioc->autop_too_fast_at = 0;
 843         }
 844
 845         if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
 846                 if (!ioc->autop_too_slow_at)
 847                         ioc->autop_too_slow_at = now_ns;
 848                 if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
 849                         return idx - 1;
 850         } else {
 851                 ioc->autop_too_slow_at = 0;
 852         }
 853
 854         return idx;
 855 }
 856
 857 /*
 858  * Take the followings as input
 859  *
 860  *  @bps        maximum sequential throughput
 861  *  @seqiops    maximum sequential 4k iops
 862  *  @randiops   maximum random 4k iops
 863  *
 864  * and calculate the linear model cost coefficients.
 865  *
 866  *  *@page      per-page cost           1s / (@bps / 4096)
 867  *  *@seqio     base cost of a seq IO   max((1s / @seqiops) - *@page, 0)
 868  *  @randiops   base cost of a rand IO  max((1s / @randiops) - *@page, 0)
 869  */
 870 static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
 871                         u64 *page, u64 *seqio, u64 *randio)
 872 {
 873         u64 v;
 874
 875         *page = *seqio = *randio = 0;
 876
 877         if (bps) {
 878                 u64 bps_pages = DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE);
 879
 880                 if (bps_pages)
 881                         *page = DIV64_U64_ROUND_UP(VTIME_PER_SEC, bps_pages);
 882                 else
 883                         *page = 1;
 884         }
 885
 886         if (seqiops) {
 887                 v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
 888                 if (v > *page)
 889                         *seqio = v - *page;
 890         }
 891
 892         if (randiops) {
 893                 v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
 894                 if (v > *page)
 895                         *randio = v - *page;
 896         }
 897 }
 898
 899 static void ioc_refresh_lcoefs(struct ioc *ioc)
 900 {
 901         u64 *u = ioc->params.i_lcoefs;
 902         u64 *c = ioc->params.lcoefs;
 903
 904         calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
 905                     &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
 906         calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
 907                     &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
 908 }
 909
 910 static bool ioc_refresh_params(struct ioc *ioc, bool force)
 911 {
 912         const struct ioc_params *p;
 913         int idx;
 914
 915         lockdep_assert_held(&ioc->lock);
 916
 917         idx = ioc_autop_idx(ioc);
 918         p = &autop[idx];
 919
 920         if (idx == ioc->autop_idx && !force)
 921                 return false;
 922
 923         if (idx != ioc->autop_idx)
 924                 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
 925
 926         ioc->autop_idx = idx;
 927         ioc->autop_too_fast_at = 0;
 928         ioc->autop_too_slow_at = 0;
 929
 930         if (!ioc->user_qos_params)
 931                 memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
 932         if (!ioc->user_cost_model)
 933                 memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
 934
 935         ioc_refresh_period_us(ioc);
 936         ioc_refresh_lcoefs(ioc);
 937
 938         ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
 939                                             VTIME_PER_USEC, MILLION);
 940         ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
 941                                    VTIME_PER_USEC, MILLION);
 942
 943         return true;
 944 }
 945
 946 /*
 947  * When an iocg accumulates too much vtime or gets deactivated, we throw away
 948  * some vtime, which lowers the overall device utilization. As the exact amount
 949  * which is being thrown away is known, we can compensate by accelerating the
 950  * vrate accordingly so that the extra vtime generated in the current period
 951  * matches what got lost.
 952  */
 953 static void ioc_refresh_vrate(struct ioc *ioc, struct ioc_now *now)
 954 {
 955         s64 pleft = ioc->period_at + ioc->period_us - now->now;
 956         s64 vperiod = ioc->period_us * ioc->vtime_base_rate;
 957         s64 vcomp, vcomp_min, vcomp_max;
 958
 959         lockdep_assert_held(&ioc->lock);
 960
 961         /* we need some time left in this period */
 962         if (pleft <= 0)
 963                 goto done;
 964
 965         /*
 966          * Calculate how much vrate should be adjusted to offset the error.
 967          * Limit the amount of adjustment and deduct the adjusted amount from
 968          * the error.
 969          */
 970         vcomp = -div64_s64(ioc->vtime_err, pleft);
 971         vcomp_min = -(ioc->vtime_base_rate >> 1);
 972         vcomp_max = ioc->vtime_base_rate;
 973         vcomp = clamp(vcomp, vcomp_min, vcomp_max);
 974
 975         ioc->vtime_err += vcomp * pleft;
 976
 977         atomic64_set(&ioc->vtime_rate, ioc->vtime_base_rate + vcomp);
 978 done:
 979         /* bound how much error can accumulate */
 980         ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod);
 981 }
 982
 983 static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct,
 984                                   int nr_lagging, int nr_shortages,
 985                                   int prev_busy_level, u32 *missed_ppm)
 986 {
 987         u64 vrate = ioc->vtime_base_rate;
 988         u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
 989
 990         if (!ioc->busy_level || (ioc->busy_level < 0 && nr_lagging)) {
 991                 if (ioc->busy_level != prev_busy_level || nr_lagging)
 992                         trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
 993                                                    missed_ppm, rq_wait_pct,
 994                                                    nr_lagging, nr_shortages);
 995
 996                 return;
 997         }
 998
 999         /*
1000          * If vrate is out of bounds, apply clamp gradually as the
1001          * bounds can change abruptly.  Otherwise, apply busy_level
1002          * based adjustment.
1003          */
1004         if (vrate < vrate_min) {
1005                 vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), 100);
1006                 vrate = min(vrate, vrate_min);
1007         } else if (vrate > vrate_max) {
1008                 vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), 100);
1009                 vrate = max(vrate, vrate_max);
1010         } else {
1011                 int idx = min_t(int, abs(ioc->busy_level),
1012                                 ARRAY_SIZE(vrate_adj_pct) - 1);
1013                 u32 adj_pct = vrate_adj_pct[idx];
1014
1015                 if (ioc->busy_level > 0)
1016                         adj_pct = 100 - adj_pct;
1017                 else
1018                         adj_pct = 100 + adj_pct;
1019
1020                 vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
1021                               vrate_min, vrate_max);
1022         }
1023
1024         trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
1025                                    nr_lagging, nr_shortages);
1026
1027         ioc->vtime_base_rate = vrate;
1028         ioc_refresh_margins(ioc);
1029 }
1030
1031 /* take a snapshot of the current [v]time and vrate */
1032 static void ioc_now(struct ioc *ioc, struct ioc_now *now)
1033 {
1034         unsigned seq;
1035
1036         now->now_ns = ktime_get();
1037         now->now = ktime_to_us(now->now_ns);
1038         now->vrate = atomic64_read(&ioc->vtime_rate);
1039
1040         /*
1041          * The current vtime is
1042          *
1043          *   vtime at period start + (wallclock time since the start) * vrate
1044          *
1045          * As a consistent snapshot of `period_at_vtime` and `period_at` is
1046          * needed, they're seqcount protected.
1047          */
1048         do {
1049                 seq = read_seqcount_begin(&ioc->period_seqcount);
1050                 now->vnow = ioc->period_at_vtime +
1051                         (now->now - ioc->period_at) * now->vrate;
1052         } while (read_seqcount_retry(&ioc->period_seqcount, seq));
1053 }
1054
1055 static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
1056 {
1057         WARN_ON_ONCE(ioc->running != IOC_RUNNING);
1058
1059         write_seqcount_begin(&ioc->period_seqcount);
1060         ioc->period_at = now->now;
1061         ioc->period_at_vtime = now->vnow;
1062         write_seqcount_end(&ioc->period_seqcount);
1063
1064         ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
1065         add_timer(&ioc->timer);
1066 }
1067
1068 /*
1069  * Update @iocg's `active` and `inuse` to @active and @inuse, update level
1070  * weight sums and propagate upwards accordingly. If @save, the current margin
1071  * is saved to be used as reference for later inuse in-period adjustments.
1072  */
1073 static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
1074                                 bool save, struct ioc_now *now)
1075 {
1076         struct ioc *ioc = iocg->ioc;
1077         int lvl;
1078
1079         lockdep_assert_held(&ioc->lock);
1080
1081         /*
1082          * For an active leaf node, its inuse shouldn't be zero or exceed
1083          * @active. An active internal node's inuse is solely determined by the
1084          * inuse to active ratio of its children regardless of @inuse.
1085          */
1086         if (list_empty(&iocg->active_list) && iocg->child_active_sum) {
1087                 inuse = DIV64_U64_ROUND_UP(active * iocg->child_inuse_sum,
1088                                            iocg->child_active_sum);
1089         } else {
1090                 inuse = clamp_t(u32, inuse, 1, active);
1091         }
1092
1093         iocg->last_inuse = iocg->inuse;
1094         if (save)
1095                 iocg->saved_margin = now->vnow - atomic64_read(&iocg->vtime);
1096
1097         if (active == iocg->active && inuse == iocg->inuse)
1098                 return;
1099
1100         for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
1101                 struct ioc_gq *parent = iocg->ancestors[lvl];
1102                 struct ioc_gq *child = iocg->ancestors[lvl + 1];
1103                 u32 parent_active = 0, parent_inuse = 0;
1104
1105                 /* update the level sums */
1106                 parent->child_active_sum += (s32)(active - child->active);
1107                 parent->child_inuse_sum += (s32)(inuse - child->inuse);
1108                 /* apply the updates */
1109                 child->active = active;
1110                 child->inuse = inuse;
1111
1112                 /*
1113                  * The delta between inuse and active sums indicates that
1114                  * much of weight is being given away.  Parent's inuse
1115                  * and active should reflect the ratio.
1116                  */
1117                 if (parent->child_active_sum) {
1118                         parent_active = parent->weight;
1119                         parent_inuse = DIV64_U64_ROUND_UP(
1120                                 parent_active * parent->child_inuse_sum,
1121                                 parent->child_active_sum);
1122                 }
1123
1124                 /* do we need to keep walking up? */
1125                 if (parent_active == parent->active &&
1126                     parent_inuse == parent->inuse)
1127                         break;
1128
1129                 active = parent_active;
1130                 inuse = parent_inuse;
1131         }
1132
1133         ioc->weights_updated = true;
1134 }
1135
1136 static void commit_weights(struct ioc *ioc)
1137 {
1138         lockdep_assert_held(&ioc->lock);
1139
1140         if (ioc->weights_updated) {
1141                 /* paired with rmb in current_hweight(), see there */
1142                 smp_wmb();
1143                 atomic_inc(&ioc->hweight_gen);
1144                 ioc->weights_updated = false;
1145         }
1146 }
1147
1148 static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
1149                               bool save, struct ioc_now *now)
1150 {
1151         __propagate_weights(iocg, active, inuse, save, now);
1152         commit_weights(iocg->ioc);
1153 }
1154
1155 static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
1156 {
1157         struct ioc *ioc = iocg->ioc;
1158         int lvl;
1159         u32 hwa, hwi;
1160         int ioc_gen;
1161
1162         /* hot path - if uptodate, use cached */
1163         ioc_gen = atomic_read(&ioc->hweight_gen);
1164         if (ioc_gen == iocg->hweight_gen)
1165                 goto out;
1166
1167         /*
1168          * Paired with wmb in commit_weights(). If we saw the updated
1169          * hweight_gen, all the weight updates from __propagate_weights() are
1170          * visible too.
1171          *
1172          * We can race with weight updates during calculation and get it
1173          * wrong.  However, hweight_gen would have changed and a future
1174          * reader will recalculate and we're guaranteed to discard the
1175          * wrong result soon.
1176          */
1177         smp_rmb();
1178
1179         hwa = hwi = WEIGHT_ONE;
1180         for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
1181                 struct ioc_gq *parent = iocg->ancestors[lvl];
1182                 struct ioc_gq *child = iocg->ancestors[lvl + 1];
1183                 u64 active_sum = READ_ONCE(parent->child_active_sum);
1184                 u64 inuse_sum = READ_ONCE(parent->child_inuse_sum);
1185                 u32 active = READ_ONCE(child->active);
1186                 u32 inuse = READ_ONCE(child->inuse);
1187
1188                 /* we can race with deactivations and either may read as zero */
1189                 if (!active_sum || !inuse_sum)
1190                         continue;
1191
1192                 active_sum = max_t(u64, active, active_sum);
1193                 hwa = div64_u64((u64)hwa * active, active_sum);
1194
1195                 inuse_sum = max_t(u64, inuse, inuse_sum);
1196                 hwi = div64_u64((u64)hwi * inuse, inuse_sum);
1197         }
1198
1199         iocg->hweight_active = max_t(u32, hwa, 1);
1200         iocg->hweight_inuse = max_t(u32, hwi, 1);
1201         iocg->hweight_gen = ioc_gen;
1202 out:
1203         if (hw_activep)
1204                 *hw_activep = iocg->hweight_active;
1205         if (hw_inusep)
1206                 *hw_inusep = iocg->hweight_inuse;
1207 }
1208
1209 /*
1210  * Calculate the hweight_inuse @iocg would get with max @inuse assuming all the
1211  * other weights stay unchanged.
1212  */
1213 static u32 current_hweight_max(struct ioc_gq *iocg)
1214 {
1215         u32 hwm = WEIGHT_ONE;
1216         u32 inuse = iocg->active;
1217         u64 child_inuse_sum;
1218         int lvl;
1219
1220         lockdep_assert_held(&iocg->ioc->lock);
1221
1222         for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
1223                 struct ioc_gq *parent = iocg->ancestors[lvl];
1224                 struct ioc_gq *child = iocg->ancestors[lvl + 1];
1225
1226                 child_inuse_sum = parent->child_inuse_sum + inuse - child->inuse;
1227                 hwm = div64_u64((u64)hwm * inuse, child_inuse_sum);
1228                 inuse = DIV64_U64_ROUND_UP(parent->active * child_inuse_sum,
1229                                            parent->child_active_sum);
1230         }
1231
1232         return max_t(u32, hwm, 1);
1233 }
1234
1235 static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now)
1236 {
1237         struct ioc *ioc = iocg->ioc;
1238         struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1239         struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
1240         u32 weight;
1241
1242         lockdep_assert_held(&ioc->lock);
1243
1244         weight = iocg->cfg_weight ?: iocc->dfl_weight;
1245         if (weight != iocg->weight && iocg->active)
1246                 propagate_weights(iocg, weight, iocg->inuse, true, now);
1247         iocg->weight = weight;
1248 }
1249
1250 static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
1251 {
1252         struct ioc *ioc = iocg->ioc;
1253         u64 last_period, cur_period;
1254         u64 vtime, vtarget;
1255         int i;
1256
1257         /*
1258          * If seem to be already active, just update the stamp to tell the
1259          * timer that we're still active.  We don't mind occassional races.
1260          */
1261         if (!list_empty(&iocg->active_list)) {
1262                 ioc_now(ioc, now);
1263                 cur_period = atomic64_read(&ioc->cur_period);
1264                 if (atomic64_read(&iocg->active_period) != cur_period)
1265                         atomic64_set(&iocg->active_period, cur_period);
1266                 return true;
1267         }
1268
1269         /* racy check on internal node IOs, treat as root level IOs */
1270         if (iocg->child_active_sum)
1271                 return false;
1272
1273         spin_lock_irq(&ioc->lock);
1274
1275         ioc_now(ioc, now);
1276
1277         /* update period */
1278         cur_period = atomic64_read(&ioc->cur_period);
1279         last_period = atomic64_read(&iocg->active_period);
1280         atomic64_set(&iocg->active_period, cur_period);
1281
1282         /* already activated or breaking leaf-only constraint? */
1283         if (!list_empty(&iocg->active_list))
1284                 goto succeed_unlock;
1285         for (i = iocg->level - 1; i > 0; i--)
1286                 if (!list_empty(&iocg->ancestors[i]->active_list))
1287                         goto fail_unlock;
1288
1289         if (iocg->child_active_sum)
1290                 goto fail_unlock;
1291
1292         /*
1293          * Always start with the target budget. On deactivation, we throw away
1294          * anything above it.
1295          */
1296         vtarget = now->vnow - ioc->margins.target;
1297         vtime = atomic64_read(&iocg->vtime);
1298
1299         atomic64_add(vtarget - vtime, &iocg->vtime);
1300         atomic64_add(vtarget - vtime, &iocg->done_vtime);
1301         vtime = vtarget;
1302
1303         /*
1304          * Activate, propagate weight and start period timer if not
1305          * running.  Reset hweight_gen to avoid accidental match from
1306          * wrapping.
1307          */
1308         iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
1309         list_add(&iocg->active_list, &ioc->active_iocgs);
1310
1311         propagate_weights(iocg, iocg->weight,
1312                           iocg->last_inuse ?: iocg->weight, true, now);
1313
1314         TRACE_IOCG_PATH(iocg_activate, iocg, now,
1315                         last_period, cur_period, vtime);
1316
1317         iocg->activated_at = now->now;
1318
1319         if (ioc->running == IOC_IDLE) {
1320                 ioc->running = IOC_RUNNING;
1321                 ioc->dfgv_period_at = now->now;
1322                 ioc->dfgv_period_rem = 0;
1323                 ioc_start_period(ioc, now);
1324         }
1325
1326 succeed_unlock:
1327         spin_unlock_irq(&ioc->lock);
1328         return true;
1329
1330 fail_unlock:
1331         spin_unlock_irq(&ioc->lock);
1332         return false;
1333 }
1334
1335 static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
1336 {
1337         struct ioc *ioc = iocg->ioc;
1338         struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1339         u64 tdelta, delay, new_delay;
1340         s64 vover, vover_pct;
1341         u32 hwa;
1342
1343         lockdep_assert_held(&iocg->waitq.lock);
1344
1345         /* calculate the current delay in effect - 1/2 every second */
1346         tdelta = now->now - iocg->delay_at;
1347         if (iocg->delay)
1348                 delay = iocg->delay >> div64_u64(tdelta, USEC_PER_SEC);
1349         else
1350                 delay = 0;
1351
1352         /* calculate the new delay from the debt amount */
1353         current_hweight(iocg, &hwa, NULL);
1354         vover = atomic64_read(&iocg->vtime) +
1355                 abs_cost_to_cost(iocg->abs_vdebt, hwa) - now->vnow;
1356         vover_pct = div64_s64(100 * vover,
1357                               ioc->period_us * ioc->vtime_base_rate);
1358
1359         if (vover_pct <= MIN_DELAY_THR_PCT)
1360                 new_delay = 0;
1361         else if (vover_pct >= MAX_DELAY_THR_PCT)
1362                 new_delay = MAX_DELAY;
1363         else
1364                 new_delay = MIN_DELAY +
1365                         div_u64((MAX_DELAY - MIN_DELAY) *
1366                                 (vover_pct - MIN_DELAY_THR_PCT),
1367                                 MAX_DELAY_THR_PCT - MIN_DELAY_THR_PCT);
1368
1369         /* pick the higher one and apply */
1370         if (new_delay > delay) {
1371                 iocg->delay = new_delay;
1372                 iocg->delay_at = now->now;
1373                 delay = new_delay;
1374         }
1375
1376         if (delay >= MIN_DELAY) {
1377                 if (!iocg->indelay_since)
1378                         iocg->indelay_since = now->now;
1379                 blkcg_set_delay(blkg, delay * NSEC_PER_USEC);
1380                 return true;
1381         } else {
1382                 if (iocg->indelay_since) {
1383                         iocg->local_stat.indelay_us += now->now - iocg->indelay_since;
1384                         iocg->indelay_since = 0;
1385                 }
1386                 iocg->delay = 0;
1387                 blkcg_clear_delay(blkg);
1388                 return false;
1389         }
1390 }
1391
1392 static void iocg_incur_debt(struct ioc_gq *iocg, u64 abs_cost,
1393                             struct ioc_now *now)
1394 {
1395         struct iocg_pcpu_stat *gcs;
1396
1397         lockdep_assert_held(&iocg->ioc->lock);
1398         lockdep_assert_held(&iocg->waitq.lock);
1399         WARN_ON_ONCE(list_empty(&iocg->active_list));
1400
1401         /*
1402          * Once in debt, debt handling owns inuse. @iocg stays at the minimum
1403          * inuse donating all of it share to others until its debt is paid off.
1404          */
1405         if (!iocg->abs_vdebt && abs_cost) {
1406                 iocg->indebt_since = now->now;
1407                 propagate_weights(iocg, iocg->active, 0, false, now);
1408         }
1409
1410         iocg->abs_vdebt += abs_cost;
1411
1412         gcs = get_cpu_ptr(iocg->pcpu_stat);
1413         local64_add(abs_cost, &gcs->abs_vusage);
1414         put_cpu_ptr(gcs);
1415 }
1416
1417 static void iocg_pay_debt(struct ioc_gq *iocg, u64 abs_vpay,
1418                           struct ioc_now *now)
1419 {
1420         lockdep_assert_held(&iocg->ioc->lock);
1421         lockdep_assert_held(&iocg->waitq.lock);
1422
1423         /* make sure that nobody messed with @iocg */
1424         WARN_ON_ONCE(list_empty(&iocg->active_list));
1425         WARN_ON_ONCE(iocg->inuse > 1);
1426
1427         iocg->abs_vdebt -= min(abs_vpay, iocg->abs_vdebt);
1428
1429         /* if debt is paid in full, restore inuse */
1430         if (!iocg->abs_vdebt) {
1431                 iocg->local_stat.indebt_us += now->now - iocg->indebt_since;
1432                 iocg->indebt_since = 0;
1433
1434                 propagate_weights(iocg, iocg->active, iocg->last_inuse,
1435                                   false, now);
1436         }
1437 }
1438
1439 static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
1440                         int flags, void *key)
1441 {
1442         struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
1443         struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key;
1444         u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
1445
1446         ctx->vbudget -= cost;
1447
1448         if (ctx->vbudget < 0)
1449                 return -1;
1450
1451         iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost);
1452         wait->committed = true;
1453
1454         /*
1455          * autoremove_wake_function() removes the wait entry only when it
1456          * actually changed the task state. We want the wait always removed.
1457          * Remove explicitly and use default_wake_function(). Note that the
1458          * order of operations is important as finish_wait() tests whether
1459          * @wq_entry is removed without grabbing the lock.
1460          */
1461         default_wake_function(wq_entry, mode, flags, key);
1462         list_del_init_careful(&wq_entry->entry);
1463         return 0;
1464 }
1465
1466 /*
1467  * Calculate the accumulated budget, pay debt if @pay_debt and wake up waiters
1468  * accordingly. When @pay_debt is %true, the caller must be holding ioc->lock in
1469  * addition to iocg->waitq.lock.
1470  */
1471 static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt,
1472                             struct ioc_now *now)
1473 {
1474         struct ioc *ioc = iocg->ioc;
1475         struct iocg_wake_ctx ctx = { .iocg = iocg };
1476         u64 vshortage, expires, oexpires;
1477         s64 vbudget;
1478         u32 hwa;
1479
1480         lockdep_assert_held(&iocg->waitq.lock);
1481
1482         current_hweight(iocg, &hwa, NULL);
1483         vbudget = now->vnow - atomic64_read(&iocg->vtime);
1484
1485         /* pay off debt */
1486         if (pay_debt && iocg->abs_vdebt && vbudget > 0) {
1487                 u64 abs_vbudget = cost_to_abs_cost(vbudget, hwa);
1488                 u64 abs_vpay = min_t(u64, abs_vbudget, iocg->abs_vdebt);
1489                 u64 vpay = abs_cost_to_cost(abs_vpay, hwa);
1490
1491                 lockdep_assert_held(&ioc->lock);
1492
1493                 atomic64_add(vpay, &iocg->vtime);
1494                 atomic64_add(vpay, &iocg->done_vtime);
1495                 iocg_pay_debt(iocg, abs_vpay, now);
1496                 vbudget -= vpay;
1497         }
1498
1499         if (iocg->abs_vdebt || iocg->delay)
1500                 iocg_kick_delay(iocg, now);
1501
1502         /*
1503          * Debt can still be outstanding if we haven't paid all yet or the
1504          * caller raced and called without @pay_debt. Shouldn't wake up waiters
1505          * under debt. Make sure @vbudget reflects the outstanding amount and is
1506          * not positive.
1507          */
1508         if (iocg->abs_vdebt) {
1509                 s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hwa);
1510                 vbudget = min_t(s64, 0, vbudget - vdebt);
1511         }
1512
1513         /*
1514          * Wake up the ones which are due and see how much vtime we'll need for
1515          * the next one. As paying off debt restores hw_inuse, it must be read
1516          * after the above debt payment.
1517          */
1518         ctx.vbudget = vbudget;
1519         current_hweight(iocg, NULL, &ctx.hw_inuse);
1520
1521         __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
1522
1523         if (!waitqueue_active(&iocg->waitq)) {
1524                 if (iocg->wait_since) {
1525                         iocg->local_stat.wait_us += now->now - iocg->wait_since;
1526                         iocg->wait_since = 0;
1527                 }
1528                 return;
1529         }
1530
1531         if (!iocg->wait_since)
1532                 iocg->wait_since = now->now;
1533
1534         if (WARN_ON_ONCE(ctx.vbudget >= 0))
1535                 return;
1536
1537         /* determine next wakeup, add a timer margin to guarantee chunking */
1538         vshortage = -ctx.vbudget;
1539         expires = now->now_ns +
1540                 DIV64_U64_ROUND_UP(vshortage, ioc->vtime_base_rate) *
1541                 NSEC_PER_USEC;
1542         expires += ioc->timer_slack_ns;
1543
1544         /* if already active and close enough, don't bother */
1545         oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
1546         if (hrtimer_is_queued(&iocg->waitq_timer) &&
1547             abs(oexpires - expires) <= ioc->timer_slack_ns)
1548                 return;
1549
1550         hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
1551                                ioc->timer_slack_ns, HRTIMER_MODE_ABS);
1552 }
1553
1554 static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
1555 {
1556         struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
1557         bool pay_debt = READ_ONCE(iocg->abs_vdebt);
1558         struct ioc_now now;
1559         unsigned long flags;
1560
1561         ioc_now(iocg->ioc, &now);
1562
1563         iocg_lock(iocg, pay_debt, &flags);
1564         iocg_kick_waitq(iocg, pay_debt, &now);
1565         iocg_unlock(iocg, pay_debt, &flags);
1566
1567         return HRTIMER_NORESTART;
1568 }
1569
1570 static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
1571 {
1572         u32 nr_met[2] = { };
1573         u32 nr_missed[2] = { };
1574         u64 rq_wait_ns = 0;
1575         int cpu, rw;
1576
1577         for_each_online_cpu(cpu) {
1578                 struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
1579                 u64 this_rq_wait_ns;
1580
1581                 for (rw = READ; rw <= WRITE; rw++) {
1582                         u32 this_met = local_read(&stat->missed[rw].nr_met);
1583                         u32 this_missed = local_read(&stat->missed[rw].nr_missed);
1584
1585                         nr_met[rw] += this_met - stat->missed[rw].last_met;
1586                         nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
1587                         stat->missed[rw].last_met = this_met;
1588                         stat->missed[rw].last_missed = this_missed;
1589                 }
1590
1591                 this_rq_wait_ns = local64_read(&stat->rq_wait_ns);
1592                 rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
1593                 stat->last_rq_wait_ns = this_rq_wait_ns;
1594         }
1595
1596         for (rw = READ; rw <= WRITE; rw++) {
1597                 if (nr_met[rw] + nr_missed[rw])
1598                         missed_ppm_ar[rw] =
1599                                 DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
1600                                                    nr_met[rw] + nr_missed[rw]);
1601                 else
1602                         missed_ppm_ar[rw] = 0;
1603         }
1604
1605         *rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
1606                                    ioc->period_us * NSEC_PER_USEC);
1607 }
1608
1609 /* was iocg idle this period? */
1610 static bool iocg_is_idle(struct ioc_gq *iocg)
1611 {
1612         struct ioc *ioc = iocg->ioc;
1613
1614         /* did something get issued this period? */
1615         if (atomic64_read(&iocg->active_period) ==
1616             atomic64_read(&ioc->cur_period))
1617                 return false;
1618
1619         /* is something in flight? */
1620         if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime))
1621                 return false;
1622
1623         return true;
1624 }
1625
1626 /*
1627  * Call this function on the target leaf @iocg's to build pre-order traversal
1628  * list of all the ancestors in @inner_walk. The inner nodes are linked through
1629  * ->walk_list and the caller is responsible for dissolving the list after use.
1630  */
1631 static void iocg_build_inner_walk(struct ioc_gq *iocg,
1632                                   struct list_head *inner_walk)
1633 {
1634         int lvl;
1635
1636         WARN_ON_ONCE(!list_empty(&iocg->walk_list));
1637
1638         /* find the first ancestor which hasn't been visited yet */
1639         for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
1640                 if (!list_empty(&iocg->ancestors[lvl]->walk_list))
1641                         break;
1642         }
1643
1644         /* walk down and visit the inner nodes to get pre-order traversal */
1645         while (++lvl <= iocg->level - 1) {
1646                 struct ioc_gq *inner = iocg->ancestors[lvl];
1647
1648                 /* record traversal order */
1649                 list_add_tail(&inner->walk_list, inner_walk);
1650         }
1651 }
1652
1653 /* collect per-cpu counters and propagate the deltas to the parent */
1654 static void iocg_flush_stat_one(struct ioc_gq *iocg, struct ioc_now *now)
1655 {
1656         struct ioc *ioc = iocg->ioc;
1657         struct iocg_stat new_stat;
1658         u64 abs_vusage = 0;
1659         u64 vusage_delta;
1660         int cpu;
1661
1662         lockdep_assert_held(&iocg->ioc->lock);
1663
1664         /* collect per-cpu counters */
1665         for_each_possible_cpu(cpu) {
1666                 abs_vusage += local64_read(
1667                                 per_cpu_ptr(&iocg->pcpu_stat->abs_vusage, cpu));
1668         }
1669         vusage_delta = abs_vusage - iocg->last_stat_abs_vusage;
1670         iocg->last_stat_abs_vusage = abs_vusage;
1671
1672         iocg->usage_delta_us = div64_u64(vusage_delta, ioc->vtime_base_rate);
1673         iocg->local_stat.usage_us += iocg->usage_delta_us;
1674
1675         /* propagate upwards */
1676         new_stat.usage_us =
1677                 iocg->local_stat.usage_us + iocg->desc_stat.usage_us;
1678         new_stat.wait_us =
1679                 iocg->local_stat.wait_us + iocg->desc_stat.wait_us;
1680         new_stat.indebt_us =
1681                 iocg->local_stat.indebt_us + iocg->desc_stat.indebt_us;
1682         new_stat.indelay_us =
1683                 iocg->local_stat.indelay_us + iocg->desc_stat.indelay_us;
1684
1685         /* propagate the deltas to the parent */
1686         if (iocg->level > 0) {
1687                 struct iocg_stat *parent_stat =
1688                         &iocg->ancestors[iocg->level - 1]->desc_stat;
1689
1690                 parent_stat->usage_us +=
1691                         new_stat.usage_us - iocg->last_stat.usage_us;
1692                 parent_stat->wait_us +=
1693                         new_stat.wait_us - iocg->last_stat.wait_us;
1694                 parent_stat->indebt_us +=
1695                         new_stat.indebt_us - iocg->last_stat.indebt_us;
1696                 parent_stat->indelay_us +=
1697                         new_stat.indelay_us - iocg->last_stat.indelay_us;
1698         }
1699
1700         iocg->last_stat = new_stat;
1701 }
1702
1703 /* get stat counters ready for reading on all active iocgs */
1704 static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now)
1705 {
1706         LIST_HEAD(inner_walk);
1707         struct ioc_gq *iocg, *tiocg;
1708
1709         /* flush leaves and build inner node walk list */
1710         list_for_each_entry(iocg, target_iocgs, active_list) {
1711                 iocg_flush_stat_one(iocg, now);
1712                 iocg_build_inner_walk(iocg, &inner_walk);
1713         }
1714
1715         /* keep flushing upwards by walking the inner list backwards */
1716         list_for_each_entry_safe_reverse(iocg, tiocg, &inner_walk, walk_list) {
1717                 iocg_flush_stat_one(iocg, now);
1718                 list_del_init(&iocg->walk_list);
1719         }
1720 }
1721
1722 /*
1723  * Determine what @iocg's hweight_inuse should be after donating unused
1724  * capacity. @hwm is the upper bound and used to signal no donation. This
1725  * function also throws away @iocg's excess budget.
1726  */
1727 static u32 hweight_after_donation(struct ioc_gq *iocg, u32 old_hwi, u32 hwm,
1728                                   u32 usage, struct ioc_now *now)
1729 {
1730         struct ioc *ioc = iocg->ioc;
1731         u64 vtime = atomic64_read(&iocg->vtime);
1732         s64 excess, delta, target, new_hwi;
1733
1734         /* debt handling owns inuse for debtors */
1735         if (iocg->abs_vdebt)
1736                 return 1;
1737
1738         /* see whether minimum margin requirement is met */
1739         if (waitqueue_active(&iocg->waitq) ||
1740             time_after64(vtime, now->vnow - ioc->margins.min))
1741                 return hwm;
1742
1743         /* throw away excess above target */
1744         excess = now->vnow - vtime - ioc->margins.target;
1745         if (excess > 0) {
1746                 atomic64_add(excess, &iocg->vtime);
1747                 atomic64_add(excess, &iocg->done_vtime);
1748                 vtime += excess;
1749                 ioc->vtime_err -= div64_u64(excess * old_hwi, WEIGHT_ONE);
1750         }
1751
1752         /*
1753          * Let's say the distance between iocg's and device's vtimes as a
1754          * fraction of period duration is delta. Assuming that the iocg will
1755          * consume the usage determined above, we want to determine new_hwi so
1756          * that delta equals MARGIN_TARGET at the end of the next period.
1757          *
1758          * We need to execute usage worth of IOs while spending the sum of the
1759          * new budget (1 - MARGIN_TARGET) and the leftover from the last period
1760          * (delta):
1761          *
1762          *   usage = (1 - MARGIN_TARGET + delta) * new_hwi
1763          *
1764          * Therefore, the new_hwi is:
1765          *
1766          *   new_hwi = usage / (1 - MARGIN_TARGET + delta)
1767          */
1768         delta = div64_s64(WEIGHT_ONE * (now->vnow - vtime),
1769                           now->vnow - ioc->period_at_vtime);
1770         target = WEIGHT_ONE * MARGIN_TARGET_PCT / 100;
1771         new_hwi = div64_s64(WEIGHT_ONE * usage, WEIGHT_ONE - target + delta);
1772
1773         return clamp_t(s64, new_hwi, 1, hwm);
1774 }
1775
1776 /*
1777  * For work-conservation, an iocg which isn't using all of its share should
1778  * donate the leftover to other iocgs. There are two ways to achieve this - 1.
1779  * bumping up vrate accordingly 2. lowering the donating iocg's inuse weight.
1780  *
1781  * #1 is mathematically simpler but has the drawback of requiring synchronous
1782  * global hweight_inuse updates when idle iocg's get activated or inuse weights
1783  * change due to donation snapbacks as it has the possibility of grossly
1784  * overshooting what's allowed by the model and vrate.
1785  *
1786  * #2 is inherently safe with local operations. The donating iocg can easily
1787  * snap back to higher weights when needed without worrying about impacts on
1788  * other nodes as the impacts will be inherently correct. This also makes idle
1789  * iocg activations safe. The only effect activations have is decreasing
1790  * hweight_inuse of others, the right solution to which is for those iocgs to
1791  * snap back to higher weights.
1792  *
1793  * So, we go with #2. The challenge is calculating how each donating iocg's
1794  * inuse should be adjusted to achieve the target donation amounts. This is done
1795  * using Andy's method described in the following pdf.
1796  *
1797  *   https://drive.google.com/file/d/1PsJwxPFtjUnwOY1QJ5AeICCcsL7BM3bo
1798  *
1799  * Given the weights and target after-donation hweight_inuse values, Andy's
1800  * method determines how the proportional distribution should look like at each
1801  * sibling level to maintain the relative relationship between all non-donating
1802  * pairs. To roughly summarize, it divides the tree into donating and
1803  * non-donating parts, calculates global donation rate which is used to
1804  * determine the target hweight_inuse for each node, and then derives per-level
1805  * proportions.
1806  *
1807  * The following pdf shows that global distribution calculated this way can be
1808  * achieved by scaling inuse weights of donating leaves and propagating the
1809  * adjustments upwards proportionally.
1810  *
1811  *   https://drive.google.com/file/d/1vONz1-fzVO7oY5DXXsLjSxEtYYQbOvsE
1812  *
1813  * Combining the above two, we can determine how each leaf iocg's inuse should
1814  * be adjusted to achieve the target donation.
1815  *
1816  *   https://drive.google.com/file/d/1WcrltBOSPN0qXVdBgnKm4mdp9FhuEFQN
1817  *
1818  * The inline comments use symbols from the last pdf.
1819  *
1820  *   b is the sum of the absolute budgets in the subtree. 1 for the root node.
1821  *   f is the sum of the absolute budgets of non-donating nodes in the subtree.
1822  *   t is the sum of the absolute budgets of donating nodes in the subtree.
1823  *   w is the weight of the node. w = w_f + w_t
1824  *   w_f is the non-donating portion of w. w_f = w * f / b
1825  *   w_b is the donating portion of w. w_t = w * t / b
1826  *   s is the sum of all sibling weights. s = Sum(w) for siblings
1827  *   s_f and s_t are the non-donating and donating portions of s.
1828  *
1829  * Subscript p denotes the parent's counterpart and ' the adjusted value - e.g.
1830  * w_pt is the donating portion of the parent's weight and w'_pt the same value
1831  * after adjustments. Subscript r denotes the root node's values.
1832  */
1833 static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now)
1834 {
1835         LIST_HEAD(over_hwa);
1836         LIST_HEAD(inner_walk);
1837         struct ioc_gq *iocg, *tiocg, *root_iocg;
1838         u32 after_sum, over_sum, over_target, gamma;
1839
1840         /*
1841          * It's pretty unlikely but possible for the total sum of
1842          * hweight_after_donation's to be higher than WEIGHT_ONE, which will
1843          * confuse the following calculations. If such condition is detected,
1844          * scale down everyone over its full share equally to keep the sum below
1845          * WEIGHT_ONE.
1846          */
1847         after_sum = 0;
1848         over_sum = 0;
1849         list_for_each_entry(iocg, surpluses, surplus_list) {
1850                 u32 hwa;
1851
1852                 current_hweight(iocg, &hwa, NULL);
1853                 after_sum += iocg->hweight_after_donation;
1854
1855                 if (iocg->hweight_after_donation > hwa) {
1856                         over_sum += iocg->hweight_after_donation;
1857                         list_add(&iocg->walk_list, &over_hwa);
1858                 }
1859         }
1860
1861         if (after_sum >= WEIGHT_ONE) {
1862                 /*
1863                  * The delta should be deducted from the over_sum, calculate
1864                  * target over_sum value.
1865                  */
1866                 u32 over_delta = after_sum - (WEIGHT_ONE - 1);
1867                 WARN_ON_ONCE(over_sum <= over_delta);
1868                 over_target = over_sum - over_delta;
1869         } else {
1870                 over_target = 0;
1871         }
1872
1873         list_for_each_entry_safe(iocg, tiocg, &over_hwa, walk_list) {
1874                 if (over_target)
1875                         iocg->hweight_after_donation =
1876                                 div_u64((u64)iocg->hweight_after_donation *
1877                                         over_target, over_sum);
1878                 list_del_init(&iocg->walk_list);
1879         }
1880
1881         /*
1882          * Build pre-order inner node walk list and prepare for donation
1883          * adjustment calculations.
1884          */
1885         list_for_each_entry(iocg, surpluses, surplus_list) {
1886                 iocg_build_inner_walk(iocg, &inner_walk);
1887         }
1888
1889         root_iocg = list_first_entry(&inner_walk, struct ioc_gq, walk_list);
1890         WARN_ON_ONCE(root_iocg->level > 0);
1891
1892         list_for_each_entry(iocg, &inner_walk, walk_list) {
1893                 iocg->child_adjusted_sum = 0;
1894                 iocg->hweight_donating = 0;
1895                 iocg->hweight_after_donation = 0;
1896         }
1897
1898         /*
1899          * Propagate the donating budget (b_t) and after donation budget (b'_t)
1900          * up the hierarchy.
1901          */
1902         list_for_each_entry(iocg, surpluses, surplus_list) {
1903                 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1904
1905                 parent->hweight_donating += iocg->hweight_donating;
1906                 parent->hweight_after_donation += iocg->hweight_after_donation;
1907         }
1908
1909         list_for_each_entry_reverse(iocg, &inner_walk, walk_list) {
1910                 if (iocg->level > 0) {
1911                         struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1912
1913                         parent->hweight_donating += iocg->hweight_donating;
1914                         parent->hweight_after_donation += iocg->hweight_after_donation;
1915                 }
1916         }
1917
1918         /*
1919          * Calculate inner hwa's (b) and make sure the donation values are
1920          * within the accepted ranges as we're doing low res calculations with
1921          * roundups.
1922          */
1923         list_for_each_entry(iocg, &inner_walk, walk_list) {
1924                 if (iocg->level) {
1925                         struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1926
1927                         iocg->hweight_active = DIV64_U64_ROUND_UP(
1928                                 (u64)parent->hweight_active * iocg->active,
1929                                 parent->child_active_sum);
1930
1931                 }
1932
1933                 iocg->hweight_donating = min(iocg->hweight_donating,
1934                                              iocg->hweight_active);
1935                 iocg->hweight_after_donation = min(iocg->hweight_after_donation,
1936                                                    iocg->hweight_donating - 1);
1937                 if (WARN_ON_ONCE(iocg->hweight_active <= 1 ||
1938                                  iocg->hweight_donating <= 1 ||
1939                                  iocg->hweight_after_donation == 0)) {
1940                         pr_warn("iocg: invalid donation weights in ");
1941                         pr_cont_cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup);
1942                         pr_cont(": active=%u donating=%u after=%u\n",
1943                                 iocg->hweight_active, iocg->hweight_donating,
1944                                 iocg->hweight_after_donation);
1945                 }
1946         }
1947
1948         /*
1949          * Calculate the global donation rate (gamma) - the rate to adjust
1950          * non-donating budgets by.
1951          *
1952          * No need to use 64bit multiplication here as the first operand is
1953          * guaranteed to be smaller than WEIGHT_ONE (1<<16).
1954          *
1955          * We know that there are beneficiary nodes and the sum of the donating
1956          * hweights can't be whole; however, due to the round-ups during hweight
1957          * calculations, root_iocg->hweight_donating might still end up equal to
1958          * or greater than whole. Limit the range when calculating the divider.
1959          *
1960          * gamma = (1 - t_r') / (1 - t_r)
1961          */
1962         gamma = DIV_ROUND_UP(
1963                 (WEIGHT_ONE - root_iocg->hweight_after_donation) * WEIGHT_ONE,
1964                 WEIGHT_ONE - min_t(u32, root_iocg->hweight_donating, WEIGHT_ONE - 1));
1965
1966         /*
1967          * Calculate adjusted hwi, child_adjusted_sum and inuse for the inner
1968          * nodes.
1969          */
1970         list_for_each_entry(iocg, &inner_walk, walk_list) {
1971                 struct ioc_gq *parent;
1972                 u32 inuse, wpt, wptp;
1973                 u64 st, sf;
1974
1975                 if (iocg->level == 0) {
1976                         /* adjusted weight sum for 1st level: s' = s * b_pf / b'_pf */
1977                         iocg->child_adjusted_sum = DIV64_U64_ROUND_UP(
1978                                 iocg->child_active_sum * (WEIGHT_ONE - iocg->hweight_donating),
1979                                 WEIGHT_ONE - iocg->hweight_after_donation);
1980                         continue;
1981                 }
1982
1983                 parent = iocg->ancestors[iocg->level - 1];
1984
1985                 /* b' = gamma * b_f + b_t' */
1986                 iocg->hweight_inuse = DIV64_U64_ROUND_UP(
1987                         (u64)gamma * (iocg->hweight_active - iocg->hweight_donating),
1988                         WEIGHT_ONE) + iocg->hweight_after_donation;
1989
1990                 /* w' = s' * b' / b'_p */
1991                 inuse = DIV64_U64_ROUND_UP(
1992                         (u64)parent->child_adjusted_sum * iocg->hweight_inuse,
1993                         parent->hweight_inuse);
1994
1995                 /* adjusted weight sum for children: s' = s_f + s_t * w'_pt / w_pt */
1996                 st = DIV64_U64_ROUND_UP(
1997                         iocg->child_active_sum * iocg->hweight_donating,
1998                         iocg->hweight_active);
1999                 sf = iocg->child_active_sum - st;
2000                 wpt = DIV64_U64_ROUND_UP(
2001                         (u64)iocg->active * iocg->hweight_donating,
2002                         iocg->hweight_active);
2003                 wptp = DIV64_U64_ROUND_UP(
2004                         (u64)inuse * iocg->hweight_after_donation,
2005                         iocg->hweight_inuse);
2006
2007                 iocg->child_adjusted_sum = sf + DIV64_U64_ROUND_UP(st * wptp, wpt);
2008         }
2009
2010         /*
2011          * All inner nodes now have ->hweight_inuse and ->child_adjusted_sum and
2012          * we can finally determine leaf adjustments.
2013          */
2014         list_for_each_entry(iocg, surpluses, surplus_list) {
2015                 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
2016                 u32 inuse;
2017
2018                 /*
2019                  * In-debt iocgs participated in the donation calculation with
2020                  * the minimum target hweight_inuse. Configuring inuse
2021                  * accordingly would work fine but debt handling expects
2022                  * @iocg->inuse stay at the minimum and we don't wanna
2023                  * interfere.
2024                  */
2025                 if (iocg->abs_vdebt) {
2026                         WARN_ON_ONCE(iocg->inuse > 1);
2027                         continue;
2028                 }
2029
2030                 /* w' = s' * b' / b'_p, note that b' == b'_t for donating leaves */
2031                 inuse = DIV64_U64_ROUND_UP(
2032                         parent->child_adjusted_sum * iocg->hweight_after_donation,
2033                         parent->hweight_inuse);
2034
2035                 TRACE_IOCG_PATH(inuse_transfer, iocg, now,
2036                                 iocg->inuse, inuse,
2037                                 iocg->hweight_inuse,
2038                                 iocg->hweight_after_donation);
2039
2040                 __propagate_weights(iocg, iocg->active, inuse, true, now);
2041         }
2042
2043         /* walk list should be dissolved after use */
2044         list_for_each_entry_safe(iocg, tiocg, &inner_walk, walk_list)
2045                 list_del_init(&iocg->walk_list);
2046 }
2047
2048 /*
2049  * A low weight iocg can amass a large amount of debt, for example, when
2050  * anonymous memory gets reclaimed aggressively. If the system has a lot of
2051  * memory paired with a slow IO device, the debt can span multiple seconds or
2052  * more. If there are no other subsequent IO issuers, the in-debt iocg may end
2053  * up blocked paying its debt while the IO device is idle.
2054  *
2055  * The following protects against such cases. If the device has been
2056  * sufficiently idle for a while, the debts are halved and delays are
2057  * recalculated.
2058  */
2059 static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors,
2060                               struct ioc_now *now)
2061 {
2062         struct ioc_gq *iocg;
2063         u64 dur, usage_pct, nr_cycles;
2064
2065         /* if no debtor, reset the cycle */
2066         if (!nr_debtors) {
2067                 ioc->dfgv_period_at = now->now;
2068                 ioc->dfgv_period_rem = 0;
2069                 ioc->dfgv_usage_us_sum = 0;
2070                 return;
2071         }
2072
2073         /*
2074          * Debtors can pass through a lot of writes choking the device and we
2075          * don't want to be forgiving debts while the device is struggling from
2076          * write bursts. If we're missing latency targets, consider the device
2077          * fully utilized.
2078          */
2079         if (ioc->busy_level > 0)
2080                 usage_us_sum = max_t(u64, usage_us_sum, ioc->period_us);
2081
2082         ioc->dfgv_usage_us_sum += usage_us_sum;
2083         if (time_before64(now->now, ioc->dfgv_period_at + DFGV_PERIOD))
2084                 return;
2085
2086         /*
2087          * At least DFGV_PERIOD has passed since the last period. Calculate the
2088          * average usage and reset the period counters.
2089          */
2090         dur = now->now - ioc->dfgv_period_at;
2091         usage_pct = div64_u64(100 * ioc->dfgv_usage_us_sum, dur);
2092
2093         ioc->dfgv_period_at = now->now;
2094         ioc->dfgv_usage_us_sum = 0;
2095
2096         /* if was too busy, reset everything */
2097         if (usage_pct > DFGV_USAGE_PCT) {
2098                 ioc->dfgv_period_rem = 0;
2099                 return;
2100         }
2101
2102         /*
2103          * Usage is lower than threshold. Let's forgive some debts. Debt
2104          * forgiveness runs off of the usual ioc timer but its period usually
2105          * doesn't match ioc's. Compensate the difference by performing the
2106          * reduction as many times as would fit in the duration since the last
2107          * run and carrying over the left-over duration in @ioc->dfgv_period_rem
2108          * - if ioc period is 75% of DFGV_PERIOD, one out of three consecutive
2109          * reductions is doubled.
2110          */
2111         nr_cycles = dur + ioc->dfgv_period_rem;
2112         ioc->dfgv_period_rem = do_div(nr_cycles, DFGV_PERIOD);
2113
2114         list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
2115                 u64 __maybe_unused old_debt, __maybe_unused old_delay;
2116
2117                 if (!iocg->abs_vdebt && !iocg->delay)
2118                         continue;
2119
2120                 spin_lock(&iocg->waitq.lock);
2121
2122                 old_debt = iocg->abs_vdebt;
2123                 old_delay = iocg->delay;
2124
2125                 if (iocg->abs_vdebt)
2126                         iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles ?: 1;
2127                 if (iocg->delay)
2128                         iocg->delay = iocg->delay >> nr_cycles ?: 1;
2129
2130                 iocg_kick_waitq(iocg, true, now);
2131
2132                 TRACE_IOCG_PATH(iocg_forgive_debt, iocg, now, usage_pct,
2133                                 old_debt, iocg->abs_vdebt,
2134                                 old_delay, iocg->delay);
2135
2136                 spin_unlock(&iocg->waitq.lock);
2137         }
2138 }
2139
2140 /*
2141  * Check the active iocgs' state to avoid oversleeping and deactive
2142  * idle iocgs.
2143  *
2144  * Since waiters determine the sleep durations based on the vrate
2145  * they saw at the time of sleep, if vrate has increased, some
2146  * waiters could be sleeping for too long. Wake up tardy waiters
2147  * which should have woken up in the last period and expire idle
2148  * iocgs.
2149  */
2150 static int ioc_check_iocgs(struct ioc *ioc, struct ioc_now *now)
2151 {
2152         int nr_debtors = 0;
2153         struct ioc_gq *iocg, *tiocg;
2154
2155         list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
2156                 if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
2157                     !iocg->delay && !iocg_is_idle(iocg))
2158                         continue;
2159
2160                 spin_lock(&iocg->waitq.lock);
2161
2162                 /* flush wait and indebt stat deltas */
2163                 if (iocg->wait_since) {
2164                         iocg->local_stat.wait_us += now->now - iocg->wait_since;
2165                         iocg->wait_since = now->now;
2166                 }
2167                 if (iocg->indebt_since) {
2168                         iocg->local_stat.indebt_us +=
2169                                 now->now - iocg->indebt_since;
2170                         iocg->indebt_since = now->now;
2171                 }
2172                 if (iocg->indelay_since) {
2173                         iocg->local_stat.indelay_us +=
2174                                 now->now - iocg->indelay_since;
2175                         iocg->indelay_since = now->now;
2176                 }
2177
2178                 if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt ||
2179                     iocg->delay) {
2180                         /* might be oversleeping vtime / hweight changes, kick */
2181                         iocg_kick_waitq(iocg, true, now);
2182                         if (iocg->abs_vdebt || iocg->delay)
2183                                 nr_debtors++;
2184                 } else if (iocg_is_idle(iocg)) {
2185                         /* no waiter and idle, deactivate */
2186                         u64 vtime = atomic64_read(&iocg->vtime);
2187                         s64 excess;
2188
2189                         /*
2190                          * @iocg has been inactive for a full duration and will
2191                          * have a high budget. Account anything above target as
2192                          * error and throw away. On reactivation, it'll start
2193                          * with the target budget.
2194                          */
2195                         excess = now->vnow - vtime - ioc->margins.target;
2196                         if (excess > 0) {
2197                                 u32 old_hwi;
2198
2199                                 current_hweight(iocg, NULL, &old_hwi);
2200                                 ioc->vtime_err -= div64_u64(excess * old_hwi,
2201                                                             WEIGHT_ONE);
2202                         }
2203
2204                         TRACE_IOCG_PATH(iocg_idle, iocg, now,
2205                                         atomic64_read(&iocg->active_period),
2206                                         atomic64_read(&ioc->cur_period), vtime);
2207                         __propagate_weights(iocg, 0, 0, false, now);
2208                         list_del_init(&iocg->active_list);
2209                 }
2210
2211                 spin_unlock(&iocg->waitq.lock);
2212         }
2213
2214         commit_weights(ioc);
2215         return nr_debtors;
2216 }
2217
/*
 * Period timer callback of an ioc (installed with timer_setup() in
 * blk_iocost_init()).
 *
 * The latency stats of the period are sampled first; everything after that
 * runs under ioc->lock:
 *  - ioc_check_iocgs() and iocg_flush_stat() are run on the active iocgs,
 *  - active iocgs with surplus budget are queued on the local surpluses
 *    list, those short on budget are counted in nr_shortages,
 *  - transfer_surpluses() is called only when both kinds exist,
 *  - ioc->busy_level is moved according to how the QoS targets were met and
 *    handed to ioc_adjust_base_vrate() together with the previous level,
 *  - ioc_forgive_debts() is called and then either the next period is
 *    started or, with no active iocg left, the ioc is marked IOC_IDLE.
 */
2218 static void ioc_timer_fn(struct timer_list *timer)
2219 {
2220         struct ioc *ioc = container_of(timer, struct ioc, timer);
2221         struct ioc_gq *iocg, *tiocg;
2222         struct ioc_now now;
2223         LIST_HEAD(surpluses);
2224         int nr_debtors, nr_shortages = 0, nr_lagging = 0;
2225         u64 usage_us_sum = 0;
             /* missed-ppm thresholds derived from the QOS_RPPM/QOS_WPPM parameters */
2226         u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
2227         u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
2228         u32 missed_ppm[2], rq_wait_pct;
2229         u64 period_vtime;
2230         int prev_busy_level;
2231
2232         /* how were the latencies during the period? */
2233         ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
2234
2235         /* take care of active iocgs */
2236         spin_lock_irq(&ioc->lock);
2237
2238         ioc_now(ioc, &now);
2239
2240         period_vtime = now.vnow - ioc->period_at_vtime;
             /* a period of zero vtime is not expected; warn once and skip this run */
2241         if (WARN_ON_ONCE(!period_vtime)) {
2242                 spin_unlock_irq(&ioc->lock);
2243                 return;
2244         }
2245
2246         nr_debtors = ioc_check_iocgs(ioc, &now);
2247
2248         /*
2249          * Wait and indebt stat are flushed above and the donation calculation
2250          * below needs updated usage stat. Let's bring stat up-to-date.
2251          */
2252         iocg_flush_stat(&ioc->active_iocgs, &now);
2253
2254         /* calc usage and see whether some weights need to be moved around */
2255         list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
2256                 u64 vdone, vtime, usage_us;
2257                 u32 hw_active, hw_inuse;
2258
2259                 /*
2260                  * Collect unused and wind vtime closer to vnow to prevent
2261                  * iocgs from accumulating a large amount of budget.
2262                  */
2263                 vdone = atomic64_read(&iocg->done_vtime);
2264                 vtime = atomic64_read(&iocg->vtime);
2265                 current_hweight(iocg, &hw_active, &hw_inuse);
2266
2267                 /*
2268                  * Latency QoS detection doesn't account for IOs which are
2269                  * in-flight for longer than a period.  Detect them by
2270                  * comparing vdone against period start.  If lagging behind
2271                  * IOs from past periods, don't increase vrate.
2272                  */
2273                 if ((ppm_rthr != MILLION || ppm_wthr != MILLION) &&
2274                     !atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
2275                     time_after64(vtime, vdone) &&
2276                     time_after64(vtime, now.vnow -
2277                                  MAX_LAGGING_PERIODS * period_vtime) &&
2278                     time_before64(vdone, now.vnow - period_vtime))
2279                         nr_lagging++;
2280
2281                 /*
2282                  * Determine absolute usage factoring in in-flight IOs to avoid
2283                  * high-latency completions appearing as idle.
2284                  */
2285                 usage_us = iocg->usage_delta_us;
2286                 usage_us_sum += usage_us;
2287
2288                 /* see whether there's surplus vtime */
2289                 WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
2290                 if (hw_inuse < hw_active ||
2291                     (!waitqueue_active(&iocg->waitq) &&
2292                      time_before64(vtime, now.vnow - ioc->margins.low))) {
2293                         u32 hwa, old_hwi, hwm, new_hwi, usage;
2294                         u64 usage_dur;
2295
                             /* cost issued but not yet completed also counts as usage */
2296                         if (vdone != vtime) {
2297                                 u64 inflight_us = DIV64_U64_ROUND_UP(
2298                                         cost_to_abs_cost(vtime - vdone, hw_inuse),
2299                                         ioc->vtime_base_rate);
2300
2301                                 usage_us = max(usage_us, inflight_us);
2302                         }
2303
2304                         /* convert to hweight based usage ratio */
2305                         if (time_after64(iocg->activated_at, ioc->period_at))
2306                                 usage_dur = max_t(u64, now.now - iocg->activated_at, 1);
2307                         else
2308                                 usage_dur = max_t(u64, now.now - ioc->period_at, 1);
2309
2310                         usage = clamp_t(u32,
2311                                 DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE,
2312                                                    usage_dur),
2313                                 1, WEIGHT_ONE);
2314
2315                         /*
2316                          * Already donating or accumulated enough to start.
2317                          * Determine the donation amount.
2318                          */
2319                         current_hweight(iocg, &hwa, &old_hwi);
2320                         hwm = current_hweight_max(iocg);
2321                         new_hwi = hweight_after_donation(iocg, old_hwi, hwm,
2322                                                          usage, &now);
2323                         /*
2324                          * Donation calculation assumes hweight_after_donation
2325                          * to be positive, a condition that a donor w/ hwa < 2
2326                          * can't meet. Don't bother with donation if hwa is
2327                          * below 2. It's not gonna make a meaningful difference
2328                          * anyway.
2329                          */
2330                         if (new_hwi < hwm && hwa >= 2) {
2331                                 iocg->hweight_donating = hwa;
2332                                 iocg->hweight_after_donation = new_hwi;
2333                                 list_add(&iocg->surplus_list, &surpluses);
2334                         } else if (!iocg->abs_vdebt) {
2335                                 /*
2336                                  * @iocg doesn't have enough to donate. Reset
2337                                  * its inuse to active.
2338                                  *
2339                                  * Don't reset debtors as their inuse's are
2340                                  * owned by debt handling. This shouldn't affect
2341                                  * donation calculation in any meaningful way
2342                                  * as @iocg doesn't have a meaningful amount of
2343                                  * share anyway.
2344                                  */
2345                                 TRACE_IOCG_PATH(inuse_shortage, iocg, &now,
2346                                                 iocg->inuse, iocg->active,
2347                                                 iocg->hweight_inuse, new_hwi);
2348
2349                                 __propagate_weights(iocg, iocg->active,
2350                                                     iocg->active, true, &now);
2351                                 nr_shortages++;
2352                         }
2353                 } else {
2354                         /* genuinely short on vtime */
2355                         nr_shortages++;
2356                 }
2357         }
2358
2359         if (!list_empty(&surpluses) && nr_shortages)
2360                 transfer_surpluses(&surpluses, &now);
2361
2362         commit_weights(ioc);
2363
2364         /* surplus list should be dissolved after use */
2365         list_for_each_entry_safe(iocg, tiocg, &surpluses, surplus_list)
2366                 list_del_init(&iocg->surplus_list);
2367
2368         /*
2369          * If q is getting clogged or we're missing too much, we're issuing
2370          * too much IO and should lower vtime rate.  If we're not missing
2371          * and experiencing shortages but not surpluses, we're too stingy
2372          * and should increase vtime rate.
2373          */
2374         prev_busy_level = ioc->busy_level;
2375         if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
2376             missed_ppm[READ] > ppm_rthr ||
2377             missed_ppm[WRITE] > ppm_wthr) {
2378                 /* clearly missing QoS targets, slow down vrate */
2379                 ioc->busy_level = max(ioc->busy_level, 0);
2380                 ioc->busy_level++;
2381         } else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
2382                    missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
2383                    missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
2384                 /* QoS targets are being met with >25% margin */
2385                 if (nr_shortages) {
2386                         /*
2387                          * We're throttling while the device has spare
2388                          * capacity.  If vrate was being slowed down, stop.
2389                          */
2390                         ioc->busy_level = min(ioc->busy_level, 0);
2391
2392                         /*
2393                          * If there are IOs spanning multiple periods, wait
2394                          * them out before pushing the device harder.
2395                          */
2396                         if (!nr_lagging)
2397                                 ioc->busy_level--;
2398                 } else {
2399                         /*
2400                          * Nobody is being throttled and the users aren't
2401                          * issuing enough IOs to saturate the device.  We
2402                          * simply don't know how close the device is to
2403                          * saturation.  Coast.
2404                          */
2405                         ioc->busy_level = 0;
2406                 }
2407         } else {
2408                 /* inside the hysteresis margin, we're good */
2409                 ioc->busy_level = 0;
2410         }
2411
2412         ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
2413
2414         ioc_adjust_base_vrate(ioc, rq_wait_pct, nr_lagging, nr_shortages,
2415                               prev_busy_level, missed_ppm);
2416
2417         ioc_refresh_params(ioc, false);
2418
2419         ioc_forgive_debts(ioc, usage_us_sum, nr_debtors, &now);
2420
2421         /*
2422          * This period is done.  Move onto the next one.  If nothing's
2423          * going on with the device, stop the timer.
2424          */
2425         atomic64_inc(&ioc->cur_period);
2426
             /* ioc_rqos_exit() sets IOC_STOP; don't start another period after that */
2427         if (ioc->running != IOC_STOP) {
2428                 if (!list_empty(&ioc->active_iocgs)) {
2429                         ioc_start_period(ioc, &now);
2430                 } else {
2431                         ioc->busy_level = 0;
2432                         ioc->vtime_err = 0;
2433                         ioc->running = IOC_IDLE;
2434                 }
2435
2436                 ioc_refresh_vrate(ioc, &now);
2437         }
2438
2439         spin_unlock_irq(&ioc->lock);
2440 }
2441
/*
 * Convert @abs_cost into a vtime cost using @iocg's current hweight_inuse
 * and, when the budget margin has deteriorated, raise @iocg's inuse weight
 * in steps of INUSE_ADJ_STEP_PCT percent of its active weight until the cost
 * fits (@vtime + cost is not after @now->vnow) or iocg->inuse equals
 * iocg->active.
 *
 * @iocg: iocg being charged
 * @vtime: @iocg's vtime as sampled by the caller
 * @abs_cost: absolute cost of the IO
 * @now: current time snapshot
 *
 * Nothing is adjusted for debtors or for iocgs that are not on the active
 * list. ioc->lock is taken internally on the adjustment path.
 *
 * Returns the cost computed with the hweight_inuse in effect on return.
 */
2442 static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime,
2443                                       u64 abs_cost, struct ioc_now *now)
2444 {
2445         struct ioc *ioc = iocg->ioc;
2446         struct ioc_margins *margins = &ioc->margins;
             /* old_inuse and old_hwi are only consumed by TRACE_IOCG_PATH() below */
2447         u32 __maybe_unused old_inuse = iocg->inuse, __maybe_unused old_hwi;
2448         u32 hwi, adj_step;
2449         s64 margin;
2450         u64 cost, new_inuse;
2451         unsigned long flags;
2452
2453         current_hweight(iocg, NULL, &hwi);
2454         old_hwi = hwi;
2455         cost = abs_cost_to_cost(abs_cost, hwi);
             /* budget that would be left after charging this IO */
2456         margin = now->vnow - vtime - cost;
2457
2458         /* debt handling owns inuse for debtors */
2459         if (iocg->abs_vdebt)
2460                 return cost;
2461
2462         /*
2463          * We only increase inuse during period and do so if the margin has
2464          * deteriorated since the previous adjustment.
2465          */
2466         if (margin >= iocg->saved_margin || margin >= margins->low ||
2467             iocg->inuse == iocg->active)
2468                 return cost;
2469
2470         spin_lock_irqsave(&ioc->lock, flags);
2471
2472         /* we own inuse only when @iocg is in the normal active state */
2473         if (iocg->abs_vdebt || list_empty(&iocg->active_list)) {
2474                 spin_unlock_irqrestore(&ioc->lock, flags);
2475                 return cost;
2476         }
2477
2478         /*
2479          * Bump up inuse till @abs_cost fits in the existing budget.
2480          * adj_step must be determined after acquiring ioc->lock - we might
2481          * have raced and lost to another thread for activation and could
2482          * be reading 0 iocg->active before ioc->lock which will lead to
2483          * infinite loop.
2484          */
2485         new_inuse = iocg->inuse;
2486         adj_step = DIV_ROUND_UP(iocg->active * INUSE_ADJ_STEP_PCT, 100);
2487         do {
2488                 new_inuse = new_inuse + adj_step;
2489                 propagate_weights(iocg, iocg->active, new_inuse, true, now);
2490                 current_hweight(iocg, NULL, &hwi);
2491                 cost = abs_cost_to_cost(abs_cost, hwi);
2492         } while (time_after64(vtime + cost, now->vnow) &&
2493                  iocg->inuse != iocg->active);
2494
2495         spin_unlock_irqrestore(&ioc->lock, flags);
2496
2497         TRACE_IOCG_PATH(inuse_adjust, iocg, now,
2498                         old_inuse, iocg->inuse, old_hwi, hwi);
2499
2500         return cost;
2501 }
2502
/*
 * Builtin cost model: compute the absolute cost of @bio and store it in
 * *@costp.
 *
 * Reads and writes each use their own set of coefficients; any other op
 * costs 0. Unless @is_merge is set, a per-IO base cost is added: the random
 * coefficient when the distance between @bio's start sector and
 * @iocg->cursor exceeds LCOEF_RANDIO_PAGES pages, the sequential one
 * otherwise. The per-page coefficient times the bio size in pages (at least
 * one) is always added for reads and writes.
 */
2503 static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
2504                                     bool is_merge, u64 *costp)
2505 {
2506         struct ioc *ioc = iocg->ioc;
2507         u64 coef_seqio, coef_randio, coef_page;
             /* size in pages, rounded down but never below 1 */
2508         u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
2509         u64 seek_pages = 0;
2510         u64 cost = 0;
2511
2512         switch (bio_op(bio)) {
2513         case REQ_OP_READ:
2514                 coef_seqio      = ioc->params.lcoefs[LCOEF_RSEQIO];
2515                 coef_randio     = ioc->params.lcoefs[LCOEF_RRANDIO];
2516                 coef_page       = ioc->params.lcoefs[LCOEF_RPAGE];
2517                 break;
2518         case REQ_OP_WRITE:
2519                 coef_seqio      = ioc->params.lcoefs[LCOEF_WSEQIO];
2520                 coef_randio     = ioc->params.lcoefs[LCOEF_WRANDIO];
2521                 coef_page       = ioc->params.lcoefs[LCOEF_WPAGE];
2522                 break;
2523         default:
                     /* neither read nor write: report a cost of 0 */
2524                 goto out;
2525         }
2526
             /* a zero cursor leaves seek_pages at 0, so the IO is costed as sequential */
2527         if (iocg->cursor) {
2528                 seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
2529                 seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
2530         }
2531
             /* merges only pay the size component, not the per-IO base cost */
2532         if (!is_merge) {
2533                 if (seek_pages > LCOEF_RANDIO_PAGES) {
2534                         cost += coef_randio;
2535                 } else {
2536                         cost += coef_seqio;
2537                 }
2538         }
2539         cost += pages * coef_page;
2540 out:
2541         *costp = cost;
2542 }
2543
/*
 * Return the absolute cost of @bio for @iocg as computed by
 * calc_vtime_cost_builtin(). @is_merge is passed through unchanged.
 */
2544 static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
2545 {
2546         u64 cost;
2547
2548         calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
2549         return cost;
2550 }
2551
/*
 * Compute only the size-proportional part of @rq's cost and store it in
 * *@costp: the page count from blk_rq_stats_sectors() (rounded down, may be
 * 0) times the read or write per-page coefficient. Other ops cost 0.
 */
2552 static void calc_size_vtime_cost_builtin(struct request *rq, struct ioc *ioc,
2553                                          u64 *costp)
2554 {
2555         unsigned int pages = blk_rq_stats_sectors(rq) >> IOC_SECT_TO_PAGE_SHIFT;
2556
2557         switch (req_op(rq)) {
2558         case REQ_OP_READ:
2559                 *costp = pages * ioc->params.lcoefs[LCOEF_RPAGE];
2560                 break;
2561         case REQ_OP_WRITE:
2562                 *costp = pages * ioc->params.lcoefs[LCOEF_WPAGE];
2563                 break;
2564         default:
2565                 *costp = 0;
2566         }
2567 }
2568
/*
 * Return the size-proportional cost of @rq as computed by
 * calc_size_vtime_cost_builtin().
 */
2569 static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc)
2570 {
2571         u64 cost;
2572
2573         calc_size_vtime_cost_builtin(rq, ioc, &cost);
2574         return cost;
2575 }
2576
/*
 * rq_qos ->throttle hook: charge @bio to the iocg of its blkg.
 *
 * If the iocg has no waiters, no debt and enough budget, the bio is
 * committed right away. Otherwise a bio which has to be issued regardless
 * (bio_issue_as_root_blkg() or a fatal signal pending on current) is
 * recorded as debt and let through, and any other bio makes the caller
 * sleep on iocg->waitq until a waker sets wait.committed.
 *
 * Nothing is done when iocost is disabled, the bio has no iocg, the iocg is
 * at level 0, the bio has zero cost or iocg_activate() fails.
 */
2577 static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
2578 {
2579         struct blkcg_gq *blkg = bio->bi_blkg;
2580         struct ioc *ioc = rqos_to_ioc(rqos);
2581         struct ioc_gq *iocg = blkg_to_iocg(blkg);
2582         struct ioc_now now;
2583         struct iocg_wait wait;
2584         u64 abs_cost, cost, vtime;
2585         bool use_debt, ioc_locked;
2586         unsigned long flags;
2587
2588         /* bypass IOs if disabled, still initializing, or for root cgroup */
2589         if (!ioc->enabled || !iocg || !iocg->level)
2590                 return;
2591
2592         /* calculate the absolute vtime cost */
2593         abs_cost = calc_vtime_cost(bio, iocg, false);
2594         if (!abs_cost)
2595                 return;
2596
             /*
              * NOTE(review): @now is not initialized in this function;
              * iocg_activate() is expected to fill it in on success - confirm.
              */
2597         if (!iocg_activate(iocg, &now))
2598                 return;
2599
2600         iocg->cursor = bio_end_sector(bio);
2601         vtime = atomic64_read(&iocg->vtime);
2602         cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now);
2603
2604         /*
2605          * If no one's waiting and within budget, issue right away.  The
2606          * tests are racy but the races aren't systemic - we only miss once
2607          * in a while which is fine.
2608          */
2609         if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
2610             time_before_eq64(vtime + cost, now.vnow)) {
2611                 iocg_commit_bio(iocg, bio, abs_cost, cost);
2612                 return;
2613         }
2614
2615         /*
2616          * We're over budget. This can be handled in two ways. IOs which may
2617          * cause priority inversions are punted to @ioc->aux_iocg and charged as
2618          * debt. Otherwise, the issuer is blocked on @iocg->waitq. Debt handling
2619          * requires @ioc->lock, waitq handling @iocg->waitq.lock. Determine
2620          * whether debt handling is needed and acquire locks accordingly.
2621          */
2622         use_debt = bio_issue_as_root_blkg(bio) || fatal_signal_pending(current);
2623         ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt);
2624 retry_lock:
2625         iocg_lock(iocg, ioc_locked, &flags);
2626
2627         /*
2628          * @iocg must stay activated for debt and waitq handling. Deactivation
2629          * is synchronized against both ioc->lock and waitq.lock and we won't
2630          * get deactivated as long as we're waiting or have debt, so we're good
2631          * if we're activated here. In the unlikely cases that we aren't, just
2632          * issue the IO.
2633          */
2634         if (unlikely(list_empty(&iocg->active_list))) {
2635                 iocg_unlock(iocg, ioc_locked, &flags);
2636                 iocg_commit_bio(iocg, bio, abs_cost, cost);
2637                 return;
2638         }
2639
2640         /*
2641          * We're over budget. If @bio has to be issued regardless, remember
2642          * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
2643          * off the debt before waking more IOs.
2644          *
2645          * This way, the debt is continuously paid off each period with the
2646          * actual budget available to the cgroup. If we just wound vtime, we
2647          * would incorrectly use the current hw_inuse for the entire amount
2648          * which, for example, can lead to the cgroup staying blocked for a
2649          * long time even with substantially raised hw_inuse.
2650          *
2651          * An iocg with vdebt should stay online so that the timer can keep
2652          * deducting its vdebt and [de]activate use_delay mechanism
2653          * accordingly. We don't want to race against the timer trying to
2654          * clear them and leave @iocg inactive w/ dangling use_delay heavily
2655          * penalizing the cgroup and its descendants.
2656          */
2657         if (use_debt) {
2658                 iocg_incur_debt(iocg, abs_cost, &now);
2659                 if (iocg_kick_delay(iocg, &now))
2660                         blkcg_schedule_throttle(rqos->q,
2661                                         (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
2662                 iocg_unlock(iocg, ioc_locked, &flags);
2663                 return;
2664         }
2665
2666         /* guarantee that iocgs w/ waiters have maximum inuse */
2667         if (!iocg->abs_vdebt && iocg->inuse != iocg->active) {
                     /* need ioc->lock for the weight update: drop the lock and retake with it */
2668                 if (!ioc_locked) {
2669                         iocg_unlock(iocg, false, &flags);
2670                         ioc_locked = true;
2671                         goto retry_lock;
2672                 }
2673                 propagate_weights(iocg, iocg->active, iocg->active, true,
2674                                   &now);
2675         }
2676
2677         /*
2678          * Append self to the waitq and schedule the wakeup timer if we're
2679          * the first waiter.  The timer duration is calculated based on the
2680          * current vrate.  vtime and hweight changes can make it too short
2681          * or too long.  Each wait entry records the absolute cost it's
2682          * waiting for to allow re-evaluation using a custom wait entry.
2683          *
2684          * If too short, the timer simply reschedules itself.  If too long,
2685          * the period timer will notice and trigger wakeups.
2686          *
2687          * All waiters are on iocg->waitq and the wait states are
2688          * synchronized using waitq.lock.
2689          */
2690         init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
2691         wait.wait.private = current;
2692         wait.bio = bio;
2693         wait.abs_cost = abs_cost;
2694         wait.committed = false; /* will be set true by waker */
2695
2696         __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
2697         iocg_kick_waitq(iocg, ioc_locked, &now);
2698
2699         iocg_unlock(iocg, ioc_locked, &flags);
2700
             /* sleep uninterruptibly until the on-stack wait entry is marked committed */
2701         while (true) {
2702                 set_current_state(TASK_UNINTERRUPTIBLE);
2703                 if (wait.committed)
2704                         break;
2705                 io_schedule();
2706         }
2707
2708         /* waker already committed us, proceed */
2709         finish_wait(&iocg->waitq, &wait.wait);
2710 }
2711
/*
 * rq_qos ->merge hook: charge the cost of merging @bio into @rq to the
 * bio's iocg.
 *
 * The cost is computed with is_merge set, i.e. without the per-IO base
 * cost. It is committed directly when rq->bio already carries a cost and
 * the iocg has enough budget; otherwise it is recorded as debt if the iocg
 * is on the active list and committed anyway if it is not.
 *
 * Nothing is done when iocost is disabled, the bio has no iocg, the iocg is
 * at level 0 or the bio has zero cost.
 */
2712 static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
2713                            struct bio *bio)
2714 {
2715         struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
2716         struct ioc *ioc = rqos_to_ioc(rqos);
2717         sector_t bio_end = bio_end_sector(bio);
2718         struct ioc_now now;
2719         u64 vtime, abs_cost, cost;
2720         unsigned long flags;
2721
2722         /* bypass if disabled, still initializing, or for root cgroup */
2723         if (!ioc->enabled || !iocg || !iocg->level)
2724                 return;
2725
2726         abs_cost = calc_vtime_cost(bio, iocg, true);
2727         if (!abs_cost)
2728                 return;
2729
2730         ioc_now(ioc, &now);
2731
2732         vtime = atomic64_read(&iocg->vtime);
2733         cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now);
2734
2735         /* update cursor if backmerging into the request at the cursor */
2736         if (blk_rq_pos(rq) < bio_end &&
2737             blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
2738                 iocg->cursor = bio_end;
2739
2740         /*
2741          * Charge if there's enough vtime budget and the existing request has
2742          * cost assigned.
2743          */
2744         if (rq->bio && rq->bio->bi_iocost_cost &&
2745             time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
2746                 iocg_commit_bio(iocg, bio, abs_cost, cost);
2747                 return;
2748         }
2749
2750         /*
2751          * Otherwise, account it as debt if @iocg is online, which it should
2752          * be for the vast majority of cases. See debt handling in
2753          * ioc_rqos_throttle() for details.
2754          */
             /* lock order: ioc->lock first, then iocg->waitq.lock */
2755         spin_lock_irqsave(&ioc->lock, flags);
2756         spin_lock(&iocg->waitq.lock);
2757
2758         if (likely(!list_empty(&iocg->active_list))) {
2759                 iocg_incur_debt(iocg, abs_cost, &now);
2760                 if (iocg_kick_delay(iocg, &now))
2761                         blkcg_schedule_throttle(rqos->q,
2762                                         (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
2763         } else {
2764                 iocg_commit_bio(iocg, bio, abs_cost, cost);
2765         }
2766
2767         spin_unlock(&iocg->waitq.lock);
2768         spin_unlock_irqrestore(&ioc->lock, flags);
2769 }
2770
/*
 * rq_qos ->done_bio hook: add the cost recorded in @bio->bi_iocost_cost to
 * its iocg's done_vtime. Bios without an iocg or with a zero recorded cost
 * are ignored.
 */
2771 static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
2772 {
2773         struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
2774
2775         if (iocg && bio->bi_iocost_cost)
2776                 atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
2777 }
2778
/*
 * rq_qos ->done hook: per-request latency accounting for reads and writes.
 *
 * The time since the request was allocated, less its size-proportional
 * cost converted to nanoseconds, is compared against the QOS_RLAT or
 * QOS_WLAT parameter (scaled by NSEC_PER_USEC) and the matching per-cpu
 * nr_met or nr_missed counter is bumped. The time between allocation and
 * start is added to the per-cpu rq_wait_ns.
 */
2779 static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
2780 {
2781         struct ioc *ioc = rqos_to_ioc(rqos);
2782         struct ioc_pcpu_stat *ccs;
2783         u64 on_q_ns, rq_wait_ns, size_nsec;
2784         int pidx, rw;
2785
             /* both timestamps are needed for the calculations below */
2786         if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
2787                 return;
2788
2789         switch (req_op(rq) & REQ_OP_MASK) {
2790         case REQ_OP_READ:
2791                 pidx = QOS_RLAT;
2792                 rw = READ;
2793                 break;
2794         case REQ_OP_WRITE:
2795                 pidx = QOS_WLAT;
2796                 rw = WRITE;
2797                 break;
2798         default:
2799                 return;
2800         }
2801
2802         on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
2803         rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
2804         size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);
2805
2806         ccs = get_cpu_ptr(ioc->pcpu_stat);
2807
             /* the first test keeps the unsigned subtraction from wrapping */
2808         if (on_q_ns <= size_nsec ||
2809             on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC)
2810                 local_inc(&ccs->missed[rw].nr_met);
2811         else
2812                 local_inc(&ccs->missed[rw].nr_missed);
2813
2814         local64_add(rq_wait_ns, &ccs->rq_wait_ns);
2815
2816         put_cpu_ptr(ccs);
2817 }
2818
/*
 * rq_qos ->queue_depth_changed hook: re-run ioc_refresh_params() (without
 * forcing) under ioc->lock.
 */
2819 static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
2820 {
2821         struct ioc *ioc = rqos_to_ioc(rqos);
2822
2823         spin_lock_irq(&ioc->lock);
2824         ioc_refresh_params(ioc, false);
2825         spin_unlock_irq(&ioc->lock);
2826 }
2827
/*
 * rq_qos ->exit hook: deactivate the iocost blkcg policy on the queue, mark
 * the ioc as IOC_STOP, wait for the period timer to finish and free the
 * per-cpu stats and the ioc itself.
 */
2828 static void ioc_rqos_exit(struct rq_qos *rqos)
2829 {
2830         struct ioc *ioc = rqos_to_ioc(rqos);
2831
2832         blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
2833
             /* ioc_timer_fn() tests for IOC_STOP under ioc->lock before starting a new period */
2834         spin_lock_irq(&ioc->lock);
2835         ioc->running = IOC_STOP;
2836         spin_unlock_irq(&ioc->lock);
2837
2838         del_timer_sync(&ioc->timer);
2839         free_percpu(ioc->pcpu_stat);
2840         kfree(ioc);
2841 }
2842
/* rq_qos callbacks implemented by iocost; installed by blk_iocost_init(). */
2843 static struct rq_qos_ops ioc_rqos_ops = {
2844         .throttle = ioc_rqos_throttle,
2845         .merge = ioc_rqos_merge,
2846         .done_bio = ioc_rqos_done_bio,
2847         .done = ioc_rqos_done,
2848         .queue_depth_changed = ioc_rqos_queue_depth_changed,
2849         .exit = ioc_rqos_exit,
2850 };
2851
/*
 * Allocate and initialize an ioc for @q, add it to the queue as an rq_qos
 * and activate the iocost blkcg policy.
 *
 * Returns 0 on success, -ENOMEM if an allocation fails, or the error
 * returned by rq_qos_add() or blkcg_activate_policy(). On failure
 * everything set up here is undone.
 */
2852 static int blk_iocost_init(struct request_queue *q)
2853 {
2854         struct ioc *ioc;
2855         struct rq_qos *rqos;
2856         int i, cpu, ret;
2857
2858         ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
2859         if (!ioc)
2860                 return -ENOMEM;
2861
2862         ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
2863         if (!ioc->pcpu_stat) {
2864                 kfree(ioc);
2865                 return -ENOMEM;
2866         }
2867
             /* start all per-cpu latency counters from zero */
2868         for_each_possible_cpu(cpu) {
2869                 struct ioc_pcpu_stat *ccs = per_cpu_ptr(ioc->pcpu_stat, cpu);
2870
2871                 for (i = 0; i < ARRAY_SIZE(ccs->missed); i++) {
2872                         local_set(&ccs->missed[i].nr_met, 0);
2873                         local_set(&ccs->missed[i].nr_missed, 0);
2874                 }
2875                 local64_set(&ccs->rq_wait_ns, 0);
2876         }
2877
2878         rqos = &ioc->rqos;
2879         rqos->id = RQ_QOS_COST;
2880         rqos->ops = &ioc_rqos_ops;
2881         rqos->q = q;
2882
2883         spin_lock_init(&ioc->lock);
2884         timer_setup(&ioc->timer, ioc_timer_fn, 0);
2885         INIT_LIST_HEAD(&ioc->active_iocgs);
2886
2887         ioc->running = IOC_IDLE;
2888         ioc->vtime_base_rate = VTIME_PER_USEC;
2889         atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
2890         seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock);
2891         ioc->period_at = ktime_to_us(ktime_get());
2892         atomic64_set(&ioc->cur_period, 0);
2893         atomic_set(&ioc->hweight_gen, 0);
2894
2895         spin_lock_irq(&ioc->lock);
2896         ioc->autop_idx = AUTOP_INVALID;
2897         ioc_refresh_params(ioc, true);
2898         spin_unlock_irq(&ioc->lock);
2899
2900         /*
2901          * rqos must be added before activation to allow iocg_pd_init() to
2902          * lookup the ioc from q. This means that the rqos methods may get
2903          * called before policy activation completion, can't assume that the
2904          * target bio has an iocg associated and need to test for NULL iocg.
2905          */
2906         ret = rq_qos_add(q, rqos);
2907         if (ret)
2908                 goto err_free_ioc;
2909
2910         ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
2911         if (ret)
2912                 goto err_del_qos;
2913         return 0;
2914
2915 err_del_qos:
2916         rq_qos_del(q, rqos);
2917 err_free_ioc:
2918         free_percpu(ioc->pcpu_stat);
2919         kfree(ioc);
2920         return ret;
2921 }
2922
/*
 * Allocate zeroed per-blkcg policy data with @gfp and set its default
 * weight to CGROUP_WEIGHT_DFL * WEIGHT_ONE.
 *
 * Returns the embedded blkcg_policy_data, or NULL if the allocation fails.
 */
2923 static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
2924 {
2925         struct ioc_cgrp *iocc;
2926
2927         iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
2928         if (!iocc)
2929                 return NULL;
2930
2931         iocc->dfl_weight = CGROUP_WEIGHT_DFL * WEIGHT_ONE;
2932         return &iocc->cpd;
2933 }
2934
/* Free the ioc_cgrp that embeds @cpd (see ioc_cpd_alloc()). */
2935 static void ioc_cpd_free(struct blkcg_policy_data *cpd)
2936 {
2937         kfree(container_of(cpd, struct ioc_cgrp, cpd));
2938 }
2939
/*
 * Allocate a zeroed iocg on @q's node together with its per-cpu stats. The
 * trailing ancestors[] array gets one slot per cgroup level, from level 0
 * up to and including @blkcg's own level.
 *
 * Returns the embedded blkg_policy_data, or NULL if either allocation
 * fails.
 */
2940 static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q,
2941                                              struct blkcg *blkcg)
2942 {
2943         int levels = blkcg->css.cgroup->level + 1;
2944         struct ioc_gq *iocg;
2945
2946         iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, q->node);
2947         if (!iocg)
2948                 return NULL;
2949
2950         iocg->pcpu_stat = alloc_percpu_gfp(struct iocg_pcpu_stat, gfp);
2951         if (!iocg->pcpu_stat) {
2952                 kfree(iocg);
2953                 return NULL;
2954         }
2955
2956         return &iocg->pd;
2957 }
2958
/*
 * Initialize the iocg embedding @pd: bind it to the queue's ioc, start
 * vtime and done_vtime at the current vnow, record the current period,
 * initialize its lists, waitq and waitq hrtimer, fill ancestors[] by
 * walking up the blkg parent chain, and call weight_updated() under
 * ioc->lock.
 */
2959 static void ioc_pd_init(struct blkg_policy_data *pd)
2960 {
2961         struct ioc_gq *iocg = pd_to_iocg(pd);
2962         struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
2963         struct ioc *ioc = q_to_ioc(blkg->q);
2964         struct ioc_now now;
2965         struct blkcg_gq *tblkg;
2966         unsigned long flags;
2967
2968         ioc_now(ioc, &now);
2969
2970         iocg->ioc = ioc;
2971         atomic64_set(&iocg->vtime, now.vnow);
2972         atomic64_set(&iocg->done_vtime, now.vnow);
2973         atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
2974         INIT_LIST_HEAD(&iocg->active_list);
2975         INIT_LIST_HEAD(&iocg->walk_list);
2976         INIT_LIST_HEAD(&iocg->surplus_list);
2977         iocg->hweight_active = WEIGHT_ONE;
2978         iocg->hweight_inuse = WEIGHT_ONE;
2979
2980         init_waitqueue_head(&iocg->waitq);
2981         hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2982         iocg->waitq_timer.function = iocg_waitq_timer_fn;
2983
2984         iocg->level = blkg->blkcg->css.cgroup->level;
2985
             /* record @iocg itself and every ancestor, indexed by cgroup level */
2986         for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
2987                 struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
2988                 iocg->ancestors[tiocg->level] = tiocg;
2989         }
2990
2991         spin_lock_irqsave(&ioc->lock, flags);
2992         weight_updated(iocg, &now);
2993         spin_unlock_irqrestore(&ioc->lock, flags);
2994 }
2995
2996 static void ioc_pd_free(struct blkg_policy_data *pd)
2997 {
2998         struct ioc_gq *iocg = pd_to_iocg(pd);
2999         struct ioc *ioc = iocg->ioc;
3000         unsigned long flags;
3001
3002         if (ioc) {
3003                 spin_lock_irqsave(&ioc->lock, flags);
3004
3005                 if (!list_empty(&iocg->active_list)) {
3006                         struct ioc_now now;
3007
3008                         ioc_now(ioc, &now);
3009                         propagate_weights(iocg, 0, 0, false, &now);
3010                         list_del_init(&iocg->active_list);
3011                 }
3012
3013                 WARN_ON_ONCE(!list_empty(&iocg->walk_list));
3014                 WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
3015
3016                 spin_unlock_irqrestore(&ioc->lock, flags);
3017
3018                 hrtimer_cancel(&iocg->waitq_timer);
3019         }
3020         free_percpu(iocg->pcpu_stat);
3021         kfree(iocg);
3022 }
3023
3024 static bool ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s)
3025 {
3026         struct ioc_gq *iocg = pd_to_iocg(pd);
3027         struct ioc *ioc = iocg->ioc;
3028
3029         if (!ioc->enabled)
3030                 return false;
3031
3032         if (iocg->level == 0) {
3033                 unsigned vp10k = DIV64_U64_ROUND_CLOSEST(
3034                         ioc->vtime_base_rate * 10000,
3035                         VTIME_PER_USEC);
3036                 seq_printf(s, " cost.vrate=%u.%02u", vp10k / 100, vp10k % 100);
3037         }
3038
3039         seq_printf(s, " cost.usage=%llu", iocg->last_stat.usage_us);
3040
3041         if (blkcg_debug_stats)
3042                 seq_printf(s, " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu",
3043                         iocg->last_stat.wait_us,
3044                         iocg->last_stat.indebt_us,
3045                         iocg->last_stat.indelay_us);
3046         return true;
3047 }
3048
3049 static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
3050                              int off)
3051 {
3052         const char *dname = blkg_dev_name(pd->blkg);
3053         struct ioc_gq *iocg = pd_to_iocg(pd);
3054
3055         if (dname && iocg->cfg_weight)
3056                 seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight / WEIGHT_ONE);
3057         return 0;
3058 }
3059
3060
3061 static int ioc_weight_show(struct seq_file *sf, void *v)
3062 {
3063         struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
3064         struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
3065
3066         seq_printf(sf, "default %u\n", iocc->dfl_weight / WEIGHT_ONE);
3067         blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
3068                           &blkcg_policy_iocost, seq_cft(sf)->private, false);
3069         return 0;
3070 }
3071
3072 static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
3073                                 size_t nbytes, loff_t off)
3074 {
3075         struct blkcg *blkcg = css_to_blkcg(of_css(of));
3076         struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
3077         struct blkg_conf_ctx ctx;
3078         struct ioc_now now;
3079         struct ioc_gq *iocg;
3080         u32 v;
3081         int ret;
3082
3083         if (!strchr(buf, ':')) {
3084                 struct blkcg_gq *blkg;
3085
3086                 if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
3087                         return -EINVAL;
3088
3089                 if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
3090                         return -EINVAL;
3091
3092                 spin_lock_irq(&blkcg->lock);
3093                 iocc->dfl_weight = v * WEIGHT_ONE;
3094                 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
3095                         struct ioc_gq *iocg = blkg_to_iocg(blkg);
3096
3097                         if (iocg) {
3098                                 spin_lock(&iocg->ioc->lock);
3099                                 ioc_now(iocg->ioc, &now);
3100                                 weight_updated(iocg, &now);
3101                                 spin_unlock(&iocg->ioc->lock);
3102                         }
3103                 }
3104                 spin_unlock_irq(&blkcg->lock);
3105
3106                 return nbytes;
3107         }
3108
3109         ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx);
3110         if (ret)
3111                 return ret;
3112
3113         iocg = blkg_to_iocg(ctx.blkg);
3114
3115         if (!strncmp(ctx.body, "default", 7)) {
3116                 v = 0;
3117         } else {
3118                 if (!sscanf(ctx.body, "%u", &v))
3119                         goto einval;
3120                 if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
3121                         goto einval;
3122         }
3123
3124         spin_lock(&iocg->ioc->lock);
3125         iocg->cfg_weight = v * WEIGHT_ONE;
3126         ioc_now(iocg->ioc, &now);
3127         weight_updated(iocg, &now);
3128         spin_unlock(&iocg->ioc->lock);
3129
3130         blkg_conf_finish(&ctx);
3131         return nbytes;
3132
3133 einval:
3134         blkg_conf_finish(&ctx);
3135         return -EINVAL;
3136 }
3137
3138 static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
3139                           int off)
3140 {
3141         const char *dname = blkg_dev_name(pd->blkg);
3142         struct ioc *ioc = pd_to_iocg(pd)->ioc;
3143
3144         if (!dname)
3145                 return 0;
3146
3147         seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
3148                    dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
3149                    ioc->params.qos[QOS_RPPM] / 10000,
3150                    ioc->params.qos[QOS_RPPM] % 10000 / 100,
3151                    ioc->params.qos[QOS_RLAT],
3152                    ioc->params.qos[QOS_WPPM] / 10000,
3153                    ioc->params.qos[QOS_WPPM] % 10000 / 100,
3154                    ioc->params.qos[QOS_WLAT],
3155                    ioc->params.qos[QOS_MIN] / 10000,
3156                    ioc->params.qos[QOS_MIN] % 10000 / 100,
3157                    ioc->params.qos[QOS_MAX] / 10000,
3158                    ioc->params.qos[QOS_MAX] % 10000 / 100);
3159         return 0;
3160 }
3161
3162 static int ioc_qos_show(struct seq_file *sf, void *v)
3163 {
3164         struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
3165
3166         blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
3167                           &blkcg_policy_iocost, seq_cft(sf)->private, false);
3168         return 0;
3169 }
3170
3171 static const match_table_t qos_ctrl_tokens = {
3172         { QOS_ENABLE,           "enable=%u"     },
3173         { QOS_CTRL,             "ctrl=%s"       },
3174         { NR_QOS_CTRL_PARAMS,   NULL            },
3175 };
3176
3177 static const match_table_t qos_tokens = {
3178         { QOS_RPPM,             "rpct=%s"       },
3179         { QOS_RLAT,             "rlat=%u"       },
3180         { QOS_WPPM,             "wpct=%s"       },
3181         { QOS_WLAT,             "wlat=%u"       },
3182         { QOS_MIN,              "min=%s"        },
3183         { QOS_MAX,              "max=%s"        },
3184         { NR_QOS_PARAMS,        NULL            },
3185 };
3186
3187 static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
3188                              size_t nbytes, loff_t off)
3189 {
3190         struct block_device *bdev;
3191         struct ioc *ioc;
3192         u32 qos[NR_QOS_PARAMS];
3193         bool enable, user;
3194         char *p;
3195         int ret;
3196
3197         bdev = blkcg_conf_open_bdev(&input);
3198         if (IS_ERR(bdev))
3199                 return PTR_ERR(bdev);
3200
3201         ioc = q_to_ioc(bdev->bd_disk->queue);
3202         if (!ioc) {
3203                 ret = blk_iocost_init(bdev->bd_disk->queue);
3204                 if (ret)
3205                         goto err;
3206                 ioc = q_to_ioc(bdev->bd_disk->queue);
3207         }
3208
3209         spin_lock_irq(&ioc->lock);
3210         memcpy(qos, ioc->params.qos, sizeof(qos));
3211         enable = ioc->enabled;
3212         user = ioc->user_qos_params;
3213         spin_unlock_irq(&ioc->lock);
3214
3215         while ((p = strsep(&input, " \t\n"))) {
3216                 substring_t args[MAX_OPT_ARGS];
3217                 char buf[32];
3218                 int tok;
3219                 s64 v;
3220
3221                 if (!*p)
3222                         continue;
3223
3224                 switch (match_token(p, qos_ctrl_tokens, args)) {
3225                 case QOS_ENABLE:
3226                         match_u64(&args[0], &v);
3227                         enable = v;
3228                         continue;
3229                 case QOS_CTRL:
3230                         match_strlcpy(buf, &args[0], sizeof(buf));
3231                         if (!strcmp(buf, "auto"))
3232                                 user = false;
3233                         else if (!strcmp(buf, "user"))
3234                                 user = true;
3235                         else
3236                                 goto einval;
3237                         continue;
3238                 }
3239
3240                 tok = match_token(p, qos_tokens, args);
3241                 switch (tok) {
3242                 case QOS_RPPM:
3243                 case QOS_WPPM:
3244                         if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
3245                             sizeof(buf))
3246                                 goto einval;
3247                         if (cgroup_parse_float(buf, 2, &v))
3248                                 goto einval;
3249                         if (v < 0 || v > 10000)
3250                                 goto einval;
3251                         qos[tok] = v * 100;
3252                         break;
3253                 case QOS_RLAT:
3254                 case QOS_WLAT:
3255                         if (match_u64(&args[0], &v))
3256                                 goto einval;
3257                         qos[tok] = v;
3258                         break;
3259                 case QOS_MIN:
3260                 case QOS_MAX:
3261                         if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
3262                             sizeof(buf))
3263                                 goto einval;
3264                         if (cgroup_parse_float(buf, 2, &v))
3265                                 goto einval;
3266                         if (v < 0)
3267                                 goto einval;
3268                         qos[tok] = clamp_t(s64, v * 100,
3269                                            VRATE_MIN_PPM, VRATE_MAX_PPM);
3270                         break;
3271                 default:
3272                         goto einval;
3273                 }
3274                 user = true;
3275         }
3276
3277         if (qos[QOS_MIN] > qos[QOS_MAX])
3278                 goto einval;
3279
3280         spin_lock_irq(&ioc->lock);
3281
3282         if (enable) {
3283                 blk_stat_enable_accounting(ioc->rqos.q);
3284                 blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
3285                 ioc->enabled = true;
3286         } else {
3287                 blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
3288                 ioc->enabled = false;
3289         }
3290
3291         if (user) {
3292                 memcpy(ioc->params.qos, qos, sizeof(qos));
3293                 ioc->user_qos_params = true;
3294         } else {
3295                 ioc->user_qos_params = false;
3296         }
3297
3298         ioc_refresh_params(ioc, true);
3299         spin_unlock_irq(&ioc->lock);
3300
3301         blkdev_put_no_open(bdev);
3302         return nbytes;
3303 einval:
3304         ret = -EINVAL;
3305 err:
3306         blkdev_put_no_open(bdev);
3307         return ret;
3308 }
3309
3310 static u64 ioc_cost_model_prfill(struct seq_file *sf,
3311                                  struct blkg_policy_data *pd, int off)
3312 {
3313         const char *dname = blkg_dev_name(pd->blkg);
3314         struct ioc *ioc = pd_to_iocg(pd)->ioc;
3315         u64 *u = ioc->params.i_lcoefs;
3316
3317         if (!dname)
3318                 return 0;
3319
3320         seq_printf(sf, "%s ctrl=%s model=linear "
3321                    "rbps=%llu rseqiops=%llu rrandiops=%llu "
3322                    "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
3323                    dname, ioc->user_cost_model ? "user" : "auto",
3324                    u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
3325                    u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
3326         return 0;
3327 }
3328
3329 static int ioc_cost_model_show(struct seq_file *sf, void *v)
3330 {
3331         struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
3332
3333         blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
3334                           &blkcg_policy_iocost, seq_cft(sf)->private, false);
3335         return 0;
3336 }
3337
3338 static const match_table_t cost_ctrl_tokens = {
3339         { COST_CTRL,            "ctrl=%s"       },
3340         { COST_MODEL,           "model=%s"      },
3341         { NR_COST_CTRL_PARAMS,  NULL            },
3342 };
3343
3344 static const match_table_t i_lcoef_tokens = {
3345         { I_LCOEF_RBPS,         "rbps=%u"       },
3346         { I_LCOEF_RSEQIOPS,     "rseqiops=%u"   },
3347         { I_LCOEF_RRANDIOPS,    "rrandiops=%u"  },
3348         { I_LCOEF_WBPS,         "wbps=%u"       },
3349         { I_LCOEF_WSEQIOPS,     "wseqiops=%u"   },
3350         { I_LCOEF_WRANDIOPS,    "wrandiops=%u"  },
3351         { NR_I_LCOEFS,          NULL            },
3352 };
3353
3354 static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
3355                                     size_t nbytes, loff_t off)
3356 {
3357         struct block_device *bdev;
3358         struct ioc *ioc;
3359         u64 u[NR_I_LCOEFS];
3360         bool user;
3361         char *p;
3362         int ret;
3363
3364         bdev = blkcg_conf_open_bdev(&input);
3365         if (IS_ERR(bdev))
3366                 return PTR_ERR(bdev);
3367
3368         ioc = q_to_ioc(bdev->bd_disk->queue);
3369         if (!ioc) {
3370                 ret = blk_iocost_init(bdev->bd_disk->queue);
3371                 if (ret)
3372                         goto err;
3373                 ioc = q_to_ioc(bdev->bd_disk->queue);
3374         }
3375
3376         spin_lock_irq(&ioc->lock);
3377         memcpy(u, ioc->params.i_lcoefs, sizeof(u));
3378         user = ioc->user_cost_model;
3379         spin_unlock_irq(&ioc->lock);
3380
3381         while ((p = strsep(&input, " \t\n"))) {
3382                 substring_t args[MAX_OPT_ARGS];
3383                 char buf[32];
3384                 int tok;
3385                 u64 v;
3386
3387                 if (!*p)
3388                         continue;
3389
3390                 switch (match_token(p, cost_ctrl_tokens, args)) {
3391                 case COST_CTRL:
3392                         match_strlcpy(buf, &args[0], sizeof(buf));
3393                         if (!strcmp(buf, "auto"))
3394                                 user = false;
3395                         else if (!strcmp(buf, "user"))
3396                                 user = true;
3397                         else
3398                                 goto einval;
3399                         continue;
3400                 case COST_MODEL:
3401                         match_strlcpy(buf, &args[0], sizeof(buf));
3402                         if (strcmp(buf, "linear"))
3403                                 goto einval;
3404                         continue;
3405                 }
3406
3407                 tok = match_token(p, i_lcoef_tokens, args);
3408                 if (tok == NR_I_LCOEFS)
3409                         goto einval;
3410                 if (match_u64(&args[0], &v))
3411                         goto einval;
3412                 u[tok] = v;
3413                 user = true;
3414         }
3415
3416         spin_lock_irq(&ioc->lock);
3417         if (user) {
3418                 memcpy(ioc->params.i_lcoefs, u, sizeof(u));
3419                 ioc->user_cost_model = true;
3420         } else {
3421                 ioc->user_cost_model = false;
3422         }
3423         ioc_refresh_params(ioc, true);
3424         spin_unlock_irq(&ioc->lock);
3425
3426         blkdev_put_no_open(bdev);
3427         return nbytes;
3428
3429 einval:
3430         ret = -EINVAL;
3431 err:
3432         blkdev_put_no_open(bdev);
3433         return ret;
3434 }
3435
3436 static struct cftype ioc_files[] = {
3437         {
3438                 .name = "weight",
3439                 .flags = CFTYPE_NOT_ON_ROOT,
3440                 .seq_show = ioc_weight_show,
3441                 .write = ioc_weight_write,
3442         },
3443         {
3444                 .name = "cost.qos",
3445                 .flags = CFTYPE_ONLY_ON_ROOT,
3446                 .seq_show = ioc_qos_show,
3447                 .write = ioc_qos_write,
3448         },
3449         {
3450                 .name = "cost.model",
3451                 .flags = CFTYPE_ONLY_ON_ROOT,
3452                 .seq_show = ioc_cost_model_show,
3453                 .write = ioc_cost_model_write,
3454         },
3455         {}
3456 };
3457
3458 static struct blkcg_policy blkcg_policy_iocost = {
3459         .dfl_cftypes    = ioc_files,
3460         .cpd_alloc_fn   = ioc_cpd_alloc,
3461         .cpd_free_fn    = ioc_cpd_free,
3462         .pd_alloc_fn    = ioc_pd_alloc,
3463         .pd_init_fn     = ioc_pd_init,
3464         .pd_free_fn     = ioc_pd_free,
3465         .pd_stat_fn     = ioc_pd_stat,
3466 };
3467
/*
 * Module init: register the iocost blkcg policy.  Returns whatever
 * blkcg_policy_register() returns.
 */
static int __init ioc_init(void)
{
        return blkcg_policy_register(&blkcg_policy_iocost);
}
3472
/* Module exit: unregister the policy registered by ioc_init(). */
static void __exit ioc_exit(void)
{
        blkcg_policy_unregister(&blkcg_policy_iocost);
}
3477
/* hook the policy registration into module load and unload */
module_init(ioc_init);
module_exit(ioc_exit);