include/linux/blk-cgroup.h

   1 /* SPDX-License-Identifier: GPL-2.0 */
   2 #ifndef _BLK_CGROUP_H
   3 #define _BLK_CGROUP_H
   4 /*
   5  * Common Block IO controller cgroup interface
   6  *
   7  * Based on ideas and code from CFQ, CFS and BFQ:
   8  * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk>
   9  *
  10  * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it>
  11  *                    Paolo Valente <paolo.valente@unimore.it>
  12  *
  13  * Copyright (C) 2009 Vivek Goyal <vgoyal@redhat.com>
  14  *                    Nauman Rafique <nauman@google.com>
  15  */
  16
  17 #include <linux/cgroup.h>
  18 #include <linux/percpu_counter.h>
  19 #include <linux/seq_file.h>
  20 #include <linux/radix-tree.h>
  21 #include <linux/blkdev.h>
  22 #include <linux/atomic.h>
  23 #include <linux/kthread.h>
  24 #include <linux/fs.h>
  25 #include <linux/blkdev.h>
  26
/*
 * percpu_counter batch for blkg_[rw]stats, per-cpu drift doesn't matter.
 * This is the batch argument handed to percpu_counter_add_batch() by
 * blkg_rwstat_add().
 */
#define BLKG_STAT_CPU_BATCH     (INT_MAX / 2)

/* Max limits for throttle policy */
#define THROTL_IOPS_MAX         UINT_MAX
  32
  33 #ifdef CONFIG_BLK_CGROUP
  34
/*
 * Indices into the counter arrays of struct blkg_rwstat and
 * struct blkg_rwstat_sample.  blkg_rwstat_add() bumps one of
 * READ/WRITE/DISCARD and one of SYNC/ASYNC for every call.
 */
enum blkg_rwstat_type {
        BLKG_RWSTAT_READ,       /* op is neither a discard nor a write */
        BLKG_RWSTAT_WRITE,      /* op_is_write() and not a discard */
        BLKG_RWSTAT_SYNC,       /* op_is_sync() */
        BLKG_RWSTAT_ASYNC,      /* !op_is_sync() */
        BLKG_RWSTAT_DISCARD,    /* op_is_discard() */

        BLKG_RWSTAT_NR,         /* number of counters; sizes the arrays */
        /* NOTE(review): alias of NR, not used in this header - presumably a pseudo-index for printed totals */
        BLKG_RWSTAT_TOTAL = BLKG_RWSTAT_NR,
};
  45
  46 struct blkcg_gq;
  47
/*
 * Per-cgroup state of the block IO controller.  css_to_blkcg() recovers
 * this structure from the embedded @css with container_of().
 */
struct blkcg {
        struct cgroup_subsys_state      css;
        spinlock_t                      lock;

        /* NOTE(review): presumably the index searched by blkg_lookup_slowpath() - confirm in blk-cgroup.c */
        struct radix_tree_root          blkg_tree;
        /* lookup hint, checked first by __blkg_lookup() under RCU */
        struct blkcg_gq __rcu           *blkg_hint;
        struct hlist_head               blkg_list;

        /* per-policy data, indexed by blkcg_policy->plid (see blkcg_to_cpd()) */
        struct blkcg_policy_data        *cpd[BLKCG_MAX_POLS];

        struct list_head                all_blkcgs_node;
#ifdef CONFIG_CGROUP_WRITEBACK
        struct list_head                cgwb_list;
        /* taken by blkcg_cgwb_get(), dropped by blkcg_cgwb_put() */
        refcount_t                      cgwb_refcnt;
#endif
};
  64
  65 /*
  66  * blkg_[rw]stat->aux_cnt is excluded for local stats but included for
  67  * recursive.  Used to carry stats of dead children.
  68  */
struct blkg_rwstat {
        /* live counters, one per blkg_rwstat_type; bumped by blkg_rwstat_add() */
        struct percpu_counter           cpu_cnt[BLKG_RWSTAT_NR];
        /* counts folded in from another blkg_rwstat by blkg_rwstat_add_aux() */
        atomic64_t                      aux_cnt[BLKG_RWSTAT_NR];
};
  73
/*
 * Plain u64 snapshot of a blkg_rwstat, one slot per blkg_rwstat_type.
 * blkg_rwstat_read() fills it from the per-cpu counters.
 */
struct blkg_rwstat_sample {
        u64                             cnt[BLKG_RWSTAT_NR];
};
  77
  78 /*
  79  * A blkcg_gq (blkg) is association between a block cgroup (blkcg) and a
  80  * request_queue (q).  This is used by blkcg policies which need to track
  81  * information per blkcg - q pair.
  82  *
  83  * There can be multiple active blkcg policies and each blkg:policy pair is
  84  * represented by a blkg_policy_data which is allocated and freed by each
  85  * policy's pd_alloc/free_fn() methods.  A policy can allocate private data
  86  * area by allocating larger data structure which embeds blkg_policy_data
  87  * at the beginning.
  88  */
struct blkg_policy_data {
        /* the blkg and policy id this per-policy data belongs to */
        struct blkcg_gq                 *blkg;  /* returned by pd_to_blkg() */
        /* NOTE(review): presumably equals the owning blkcg_policy->plid, i.e. the slot in blkg->pd[] - confirm */
        int                             plid;
};
  94
  95 /*
  96  * Policies that need to keep per-blkcg data which is independent from any
  97  * request_queue associated to it should implement cpd_alloc/free_fn()
  98  * methods.  A policy can allocate private data area by allocating larger
  99  * data structure which embeds blkcg_policy_data at the beginning.
 100  * cpd_init() is invoked to let each policy handle per-blkcg data.
 101  */
struct blkcg_policy_data {
        /* the blkcg and policy id this per-policy data belongs to */
        struct blkcg                    *blkcg; /* returned by cpd_to_blkcg() */
        /* NOTE(review): presumably equals the owning blkcg_policy->plid, i.e. the slot in blkcg->cpd[] - confirm */
        int                             plid;
};
 107
/*
 * Association between a blk cgroup and a request queue: one blkcg_gq
 * (blkg) per blkcg - request_queue pair.
 */
struct blkcg_gq {
        /* Pointer to the associated request_queue */
        struct request_queue            *q;
        /* NOTE(review): presumably links this blkg on a per-queue list - confirm */
        struct list_head                q_node;
        /* NOTE(review): presumably the entry on blkcg->blkg_list - confirm */
        struct hlist_node               blkcg_node;
        struct blkcg                    *blkcg;

        /*
         * Each blkg gets congested separately and the congestion state is
         * propagated to the matching bdi_writeback_congested.
         */
        struct bdi_writeback_congested  *wb_congested;

        /*
         * all non-root blkcg_gq's are guaranteed to have access to parent;
         * blkg_tryget_closest() walks this chain upwards.
         */
        struct blkcg_gq                 *parent;

        /* reference count, see blkg_get(), blkg_tryget() and blkg_put() */
        struct percpu_ref               refcnt;

        /* is this blkg online? protected by both blkcg and q locks */
        bool                            online;

        /*
         * Byte and IO counters; blkcg_bio_issue_check() bumps them for
         * bios that were not throttled.
         */
        struct blkg_rwstat              stat_bytes;
        struct blkg_rwstat              stat_ios;

        /* per-policy data, indexed by blkcg_policy->plid (see blkg_to_pd()) */
        struct blkg_policy_data         *pd[BLKCG_MAX_POLS];

        /* NOTE(review): presumably the queue behind __blkcg_punt_bio_submit() - confirm in blk-cgroup.c */
        spinlock_t                      async_bio_lock;
        struct bio_list                 async_bios;
        struct work_struct              async_bio_work;

        /*
         * Delay user count.  Its 0 <-> non-zero transitions adjust the
         * owning cgroup's congestion_count, see blkcg_use_delay(),
         * blkcg_unuse_delay() and blkcg_clear_delay().
         */
        atomic_t                        use_delay;
        atomic64_t                      delay_nsec;
        atomic64_t                      delay_start;
        u64                             last_delay;
        int                             last_use;

        struct rcu_head                 rcu_head;
};
 148
/*
 * Function types of the callbacks collected in struct blkcg_policy.
 * The *_cpd_fn types work on per-blkcg data (struct blkcg_policy_data),
 * the *_pd_fn types on per-blkg data (struct blkg_policy_data).
 */
typedef struct blkcg_policy_data *(blkcg_pol_alloc_cpd_fn)(gfp_t gfp);
typedef void (blkcg_pol_init_cpd_fn)(struct blkcg_policy_data *cpd);
typedef void (blkcg_pol_free_cpd_fn)(struct blkcg_policy_data *cpd);
typedef void (blkcg_pol_bind_cpd_fn)(struct blkcg_policy_data *cpd);
typedef struct blkg_policy_data *(blkcg_pol_alloc_pd_fn)(gfp_t gfp,
                                struct request_queue *q, struct blkcg *blkcg);
typedef void (blkcg_pol_init_pd_fn)(struct blkg_policy_data *pd);
typedef void (blkcg_pol_online_pd_fn)(struct blkg_policy_data *pd);
typedef void (blkcg_pol_offline_pd_fn)(struct blkg_policy_data *pd);
typedef void (blkcg_pol_free_pd_fn)(struct blkg_policy_data *pd);
typedef void (blkcg_pol_reset_pd_stats_fn)(struct blkg_policy_data *pd);
/* NOTE(review): presumably formats stats into @buf (at most @size bytes) and returns the length - confirm */
typedef size_t (blkcg_pol_stat_pd_fn)(struct blkg_policy_data *pd, char *buf,
                                      size_t size);
 162
/*
 * Description of one blkcg policy: its id, its cgroup control files and
 * its callbacks.
 */
struct blkcg_policy {
        /* slot used for blkg->pd[] and blkcg->cpd[] (blkg_to_pd(), blkcg_to_cpd()) */
        int                             plid;
        /* cgroup files for the policy */
        struct cftype                   *dfl_cftypes;
        struct cftype                   *legacy_cftypes;

        /* operations */
        /* per-blkcg data (struct blkcg_policy_data) */
        blkcg_pol_alloc_cpd_fn          *cpd_alloc_fn;
        blkcg_pol_init_cpd_fn           *cpd_init_fn;
        blkcg_pol_free_cpd_fn           *cpd_free_fn;
        blkcg_pol_bind_cpd_fn           *cpd_bind_fn;

        /* per-blkg data (struct blkg_policy_data) */
        blkcg_pol_alloc_pd_fn           *pd_alloc_fn;
        blkcg_pol_init_pd_fn            *pd_init_fn;
        blkcg_pol_online_pd_fn          *pd_online_fn;
        blkcg_pol_offline_pd_fn         *pd_offline_fn;
        blkcg_pol_free_pd_fn            *pd_free_fn;
        blkcg_pol_reset_pd_stats_fn     *pd_reset_stats_fn;
        blkcg_pol_stat_pd_fn            *pd_stat_fn;
};
 183
/* The root blkcg; __blkg_lookup() maps it straight to q->root_blkg. */
extern struct blkcg blkcg_root;
extern struct cgroup_subsys_state * const blkcg_root_css;
extern bool blkcg_debug_stats;

/* Fallback of __blkg_lookup(), used when the blkcg's lookup hint misses. */
struct blkcg_gq *blkg_lookup_slowpath(struct blkcg *blkcg,
                                      struct request_queue *q, bool update_hint);
struct blkcg_gq *__blkg_lookup_create(struct blkcg *blkcg,
                                      struct request_queue *q);
struct blkcg_gq *blkg_lookup_create(struct blkcg *blkcg,
                                    struct request_queue *q);
/* Per-queue hooks; implemented outside this header. */
int blkcg_init_queue(struct request_queue *q);
void blkcg_drain_queue(struct request_queue *q);
void blkcg_exit_queue(struct request_queue *q);

/* Blkio controller policy registration */
int blkcg_policy_register(struct blkcg_policy *pol);
void blkcg_policy_unregister(struct blkcg_policy *pol);
int blkcg_activate_policy(struct request_queue *q,
                          const struct blkcg_policy *pol);
void blkcg_deactivate_policy(struct request_queue *q,
                             const struct blkcg_policy *pol);
 205
 206 static inline u64 blkg_rwstat_read_counter(struct blkg_rwstat *rwstat,
 207                 unsigned int idx)
 208 {
 209         return atomic64_read(&rwstat->aux_cnt[idx]) +
 210                 percpu_counter_sum_positive(&rwstat->cpu_cnt[idx]);
 211 }
 212
/*
 * Helpers for printing per-blkg values to a seq_file.  They are only
 * declared here; see their definitions for the exact output format.
 */
const char *blkg_dev_name(struct blkcg_gq *blkg);
void blkcg_print_blkgs(struct seq_file *sf, struct blkcg *blkcg,
                       u64 (*prfill)(struct seq_file *,
                                     struct blkg_policy_data *, int),
                       const struct blkcg_policy *pol, int data,
                       bool show_total);
u64 __blkg_prfill_u64(struct seq_file *sf, struct blkg_policy_data *pd, u64 v);
u64 __blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
                         const struct blkg_rwstat_sample *rwstat);
u64 blkg_prfill_rwstat(struct seq_file *sf, struct blkg_policy_data *pd,
                       int off);
int blkg_print_stat_bytes(struct seq_file *sf, void *v);
int blkg_print_stat_ios(struct seq_file *sf, void *v);
int blkg_print_stat_bytes_recursive(struct seq_file *sf, void *v);
int blkg_print_stat_ios_recursive(struct seq_file *sf, void *v);

/* NOTE(review): @off presumably locates a blkg_rwstat inside the blkg or its policy data - confirm */
void blkg_rwstat_recursive_sum(struct blkcg_gq *blkg, struct blkcg_policy *pol,
                int off, struct blkg_rwstat_sample *sum);
 231
/*
 * Context shared between blkg_conf_prep() and blkg_conf_finish().
 * NOTE(review): judging by the signatures, prep fills this in from a
 * config string and finish releases it - confirm in blk-cgroup.c.
 */
struct blkg_conf_ctx {
        struct gendisk                  *disk;
        struct blkcg_gq                 *blkg;
        char                            *body;
};

struct gendisk *blkcg_conf_get_disk(char **inputp);
int blkg_conf_prep(struct blkcg *blkcg, const struct blkcg_policy *pol,
                   char *input, struct blkg_conf_ctx *ctx);
void blkg_conf_finish(struct blkg_conf_ctx *ctx);
 242
 243 /**
 244  * blkcg_css - find the current css
 245  *
 246  * Find the css associated with either the kthread or the current task.
 247  * This may return a dying css, so it is up to the caller to use tryget logic
 248  * to confirm it is alive and well.
 249  */
 250 static inline struct cgroup_subsys_state *blkcg_css(void)
 251 {
 252         struct cgroup_subsys_state *css;
 253
 254         css = kthread_blkcg();
 255         if (css)
 256                 return css;
 257         return task_css(current, io_cgrp_id);
 258 }
 259
 260 static inline struct blkcg *css_to_blkcg(struct cgroup_subsys_state *css)
 261 {
 262         return css ? container_of(css, struct blkcg, css) : NULL;
 263 }
 264
 265 /**
 266  * __bio_blkcg - internal, inconsistent version to get blkcg
 267  *
 268  * DO NOT USE.
 269  * This function is inconsistent and consequently is dangerous to use.  The
 270  * first part of the function returns a blkcg where a reference is owned by the
 271  * bio.  This means it does not need to be rcu protected as it cannot go away
 272  * with the bio owning a reference to it.  However, the latter potentially gets
 273  * it from task_css().  This can race against task migration and the cgroup
 274  * dying.  It is also semantically different as it must be called rcu protected
 275  * and is susceptible to failure when trying to get a reference to it.
 276  * Therefore, it is not ok to assume that *_get() will always succeed on the
 277  * blkcg returned here.
 278  */
 279 static inline struct blkcg *__bio_blkcg(struct bio *bio)
 280 {
 281         if (bio && bio->bi_blkg)
 282                 return bio->bi_blkg->blkcg;
 283         return css_to_blkcg(blkcg_css());
 284 }
 285
 286 /**
 287  * bio_blkcg - grab the blkcg associated with a bio
 288  * @bio: target bio
 289  *
 290  * This returns the blkcg associated with a bio, %NULL if not associated.
 291  * Callers are expected to either handle %NULL or know association has been
 292  * done prior to calling this.
 293  */
 294 static inline struct blkcg *bio_blkcg(struct bio *bio)
 295 {
 296         if (bio && bio->bi_blkg)
 297                 return bio->bi_blkg->blkcg;
 298         return NULL;
 299 }
 300
 301 static inline bool blk_cgroup_congested(void)
 302 {
 303         struct cgroup_subsys_state *css;
 304         bool ret = false;
 305
 306         rcu_read_lock();
 307         css = kthread_blkcg();
 308         if (!css)
 309                 css = task_css(current, io_cgrp_id);
 310         while (css) {
 311                 if (atomic_read(&css->cgroup->congestion_count)) {
 312                         ret = true;
 313                         break;
 314                 }
 315                 css = css->parent;
 316         }
 317         rcu_read_unlock();
 318         return ret;
 319 }
 320
 321 /**
 322  * bio_issue_as_root_blkg - see if this bio needs to be issued as root blkg
 323  * @return: true if this bio needs to be submitted with the root blkg context.
 324  *
 325  * In order to avoid priority inversions we sometimes need to issue a bio as if
 326  * it were attached to the root blkg, and then backcharge to the actual owning
 327  * blkg.  The idea is we do bio_blkcg() to look up the actual context for the
 328  * bio and attach the appropriate blkg to the bio.  Then we call this helper and
 329  * if it is true run with the root blkg for that queue and then do any
 330  * backcharging to the originating cgroup once the io is complete.
 331  */
 332 static inline bool bio_issue_as_root_blkg(struct bio *bio)
 333 {
 334         return (bio->bi_opf & (REQ_META | REQ_SWAP)) != 0;
 335 }
 336
 337 /**
 338  * blkcg_parent - get the parent of a blkcg
 339  * @blkcg: blkcg of interest
 340  *
 341  * Return the parent blkcg of @blkcg.  Can be called anytime.
 342  */
 343 static inline struct blkcg *blkcg_parent(struct blkcg *blkcg)
 344 {
 345         return css_to_blkcg(blkcg->css.parent);
 346 }
 347
 348 /**
 349  * __blkg_lookup - internal version of blkg_lookup()
 350  * @blkcg: blkcg of interest
 351  * @q: request_queue of interest
 352  * @update_hint: whether to update lookup hint with the result or not
 353  *
 354  * This is internal version and shouldn't be used by policy
 355  * implementations.  Looks up blkgs for the @blkcg - @q pair regardless of
 356  * @q's bypass state.  If @update_hint is %true, the caller should be
 357  * holding @q->queue_lock and lookup hint is updated on success.
 358  */
 359 static inline struct blkcg_gq *__blkg_lookup(struct blkcg *blkcg,
 360                                              struct request_queue *q,
 361                                              bool update_hint)
 362 {
 363         struct blkcg_gq *blkg;
 364
 365         if (blkcg == &blkcg_root)
 366                 return q->root_blkg;
 367
 368         blkg = rcu_dereference(blkcg->blkg_hint);
 369         if (blkg && blkg->q == q)
 370                 return blkg;
 371
 372         return blkg_lookup_slowpath(blkcg, q, update_hint);
 373 }
 374
 375 /**
 376  * blkg_lookup - lookup blkg for the specified blkcg - q pair
 377  * @blkcg: blkcg of interest
 378  * @q: request_queue of interest
 379  *
 380  * Lookup blkg for the @blkcg - @q pair.  This function should be called
 381  * under RCU read lock.
 382  */
 383 static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg,
 384                                            struct request_queue *q)
 385 {
 386         WARN_ON_ONCE(!rcu_read_lock_held());
 387         return __blkg_lookup(blkcg, q, false);
 388 }
 389
 390 /**
 391  * blk_queue_root_blkg - return blkg for the (blkcg_root, @q) pair
 392  * @q: request_queue of interest
 393  *
 394  * Lookup blkg for @q at the root level. See also blkg_lookup().
 395  */
 396 static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q)
 397 {
 398         return q->root_blkg;
 399 }
 400
 401 /**
 402  * blkg_to_pdata - get policy private data
 403  * @blkg: blkg of interest
 404  * @pol: policy of interest
 405  *
 406  * Return pointer to private data associated with the @blkg-@pol pair.
 407  */
 408 static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
 409                                                   struct blkcg_policy *pol)
 410 {
 411         return blkg ? blkg->pd[pol->plid] : NULL;
 412 }
 413
 414 static inline struct blkcg_policy_data *blkcg_to_cpd(struct blkcg *blkcg,
 415                                                      struct blkcg_policy *pol)
 416 {
 417         return blkcg ? blkcg->cpd[pol->plid] : NULL;
 418 }
 419
 420 /**
 421  * pdata_to_blkg - get blkg associated with policy private data
 422  * @pd: policy private data of interest
 423  *
 424  * @pd is policy private data.  Determine the blkg it's associated with.
 425  */
 426 static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd)
 427 {
 428         return pd ? pd->blkg : NULL;
 429 }
 430
 431 static inline struct blkcg *cpd_to_blkcg(struct blkcg_policy_data *cpd)
 432 {
 433         return cpd ? cpd->blkcg : NULL;
 434 }
 435
/*
 * Called by blkcg_cgwb_put(): when the last cgwb reference is dropped, or
 * on every put if CONFIG_CGROUP_WRITEBACK is off.
 */
extern void blkcg_destroy_blkgs(struct blkcg *blkcg);
 437
 438 #ifdef CONFIG_CGROUP_WRITEBACK
 439
 440 /**
 441  * blkcg_cgwb_get - get a reference for blkcg->cgwb_list
 442  * @blkcg: blkcg of interest
 443  *
 444  * This is used to track the number of active wb's related to a blkcg.
 445  */
 446 static inline void blkcg_cgwb_get(struct blkcg *blkcg)
 447 {
 448         refcount_inc(&blkcg->cgwb_refcnt);
 449 }
 450
 451 /**
 452  * blkcg_cgwb_put - put a reference for @blkcg->cgwb_list
 453  * @blkcg: blkcg of interest
 454  *
 455  * This is used to track the number of active wb's related to a blkcg.
 456  * When this count goes to zero, all active wb has finished so the
 457  * blkcg can continue destruction by calling blkcg_destroy_blkgs().
 458  * This work may occur in cgwb_release_workfn() on the cgwb_release
 459  * workqueue.
 460  */
 461 static inline void blkcg_cgwb_put(struct blkcg *blkcg)
 462 {
 463         if (refcount_dec_and_test(&blkcg->cgwb_refcnt))
 464                 blkcg_destroy_blkgs(blkcg);
 465 }
 466
 467 #else
 468
 469 static inline void blkcg_cgwb_get(struct blkcg *blkcg) { }
 470
 471 static inline void blkcg_cgwb_put(struct blkcg *blkcg)
 472 {
 473         /* wb isn't being accounted, so trigger destruction right away */
 474         blkcg_destroy_blkgs(blkcg);
 475 }
 476
 477 #endif
 478
 479 /**
 480  * blkg_path - format cgroup path of blkg
 481  * @blkg: blkg of interest
 482  * @buf: target buffer
 483  * @buflen: target buffer length
 484  *
 485  * Format the path of the cgroup of @blkg into @buf.
 486  */
 487 static inline int blkg_path(struct blkcg_gq *blkg, char *buf, int buflen)
 488 {
 489         return cgroup_path(blkg->blkcg->css.cgroup, buf, buflen);
 490 }
 491
 492 /**
 493  * blkg_get - get a blkg reference
 494  * @blkg: blkg to get
 495  *
 496  * The caller should be holding an existing reference.
 497  */
 498 static inline void blkg_get(struct blkcg_gq *blkg)
 499 {
 500         percpu_ref_get(&blkg->refcnt);
 501 }
 502
 503 /**
 504  * blkg_tryget - try and get a blkg reference
 505  * @blkg: blkg to get
 506  *
 507  * This is for use when doing an RCU lookup of the blkg.  We may be in the midst
 508  * of freeing this blkg, so we can only use it if the refcnt is not zero.
 509  */
 510 static inline bool blkg_tryget(struct blkcg_gq *blkg)
 511 {
 512         return blkg && percpu_ref_tryget(&blkg->refcnt);
 513 }
 514
 515 /**
 516  * blkg_tryget_closest - try and get a blkg ref on the closet blkg
 517  * @blkg: blkg to get
 518  *
 519  * This needs to be called rcu protected.  As the failure mode here is to walk
 520  * up the blkg tree, this ensure that the blkg->parent pointers are always
 521  * valid.  This returns the blkg that it ended up taking a reference on or %NULL
 522  * if no reference was taken.
 523  */
 524 static inline struct blkcg_gq *blkg_tryget_closest(struct blkcg_gq *blkg)
 525 {
 526         struct blkcg_gq *ret_blkg = NULL;
 527
 528         WARN_ON_ONCE(!rcu_read_lock_held());
 529
 530         while (blkg) {
 531                 if (blkg_tryget(blkg)) {
 532                         ret_blkg = blkg;
 533                         break;
 534                 }
 535                 blkg = blkg->parent;
 536         }
 537
 538         return ret_blkg;
 539 }
 540
 541 /**
 542  * blkg_put - put a blkg reference
 543  * @blkg: blkg to put
 544  */
 545 static inline void blkg_put(struct blkcg_gq *blkg)
 546 {
 547         percpu_ref_put(&blkg->refcnt);
 548 }
 549
/**
 * blkg_for_each_descendant_pre - pre-order walk of a blkg's descendants
 * @d_blkg: loop cursor pointing to the current descendant
 * @pos_css: used for iteration
 * @p_blkg: target blkg to walk descendants of
 *
 * Walk @d_blkg through the descendants of @p_blkg.  Must be used with RCU
 * read locked.  If called under either blkcg or queue lock, the iteration
 * is guaranteed to include all and only online blkgs.  The caller may
 * update @pos_css by calling css_rightmost_descendant() to skip subtree.
 * @p_blkg is included in the iteration and the first node to be visited.
 *
 * Descendant csses for which __blkg_lookup() finds no blkg on @p_blkg's
 * queue are skipped.  The expansion ends in an unbraced "if", so the loop
 * body must not be followed directly by an "else".
 */
#define blkg_for_each_descendant_pre(d_blkg, pos_css, p_blkg)           \
        css_for_each_descendant_pre((pos_css), &(p_blkg)->blkcg->css)   \
                if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css),    \
                                              (p_blkg)->q, false)))
 566
/**
 * blkg_for_each_descendant_post - post-order walk of a blkg's descendants
 * @d_blkg: loop cursor pointing to the current descendant
 * @pos_css: used for iteration
 * @p_blkg: target blkg to walk descendants of
 *
 * Similar to blkg_for_each_descendant_pre() but performs post-order
 * traversal instead.  Synchronization rules are the same.  @p_blkg is
 * included in the iteration and the last node to be visited.
 *
 * Descendant csses for which __blkg_lookup() finds no blkg on @p_blkg's
 * queue are skipped.  The expansion ends in an unbraced "if", so the loop
 * body must not be followed directly by an "else".
 */
#define blkg_for_each_descendant_post(d_blkg, pos_css, p_blkg)          \
        css_for_each_descendant_post((pos_css), &(p_blkg)->blkcg->css)  \
                if (((d_blkg) = __blkg_lookup(css_to_blkcg(pos_css),    \
                                              (p_blkg)->q, false)))
 581
 582 static inline int blkg_rwstat_init(struct blkg_rwstat *rwstat, gfp_t gfp)
 583 {
 584         int i, ret;
 585
 586         for (i = 0; i < BLKG_RWSTAT_NR; i++) {
 587                 ret = percpu_counter_init(&rwstat->cpu_cnt[i], 0, gfp);
 588                 if (ret) {
 589                         while (--i >= 0)
 590                                 percpu_counter_destroy(&rwstat->cpu_cnt[i]);
 591                         return ret;
 592                 }
 593                 atomic64_set(&rwstat->aux_cnt[i], 0);
 594         }
 595         return 0;
 596 }
 597
 598 static inline void blkg_rwstat_exit(struct blkg_rwstat *rwstat)
 599 {
 600         int i;
 601
 602         for (i = 0; i < BLKG_RWSTAT_NR; i++)
 603                 percpu_counter_destroy(&rwstat->cpu_cnt[i]);
 604 }
 605
 606 /**
 607  * blkg_rwstat_add - add a value to a blkg_rwstat
 608  * @rwstat: target blkg_rwstat
 609  * @op: REQ_OP and flags
 610  * @val: value to add
 611  *
 612  * Add @val to @rwstat.  The counters are chosen according to @rw.  The
 613  * caller is responsible for synchronizing calls to this function.
 614  */
 615 static inline void blkg_rwstat_add(struct blkg_rwstat *rwstat,
 616                                    unsigned int op, uint64_t val)
 617 {
 618         struct percpu_counter *cnt;
 619
 620         if (op_is_discard(op))
 621                 cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_DISCARD];
 622         else if (op_is_write(op))
 623                 cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_WRITE];
 624         else
 625                 cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_READ];
 626
 627         percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH);
 628
 629         if (op_is_sync(op))
 630                 cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_SYNC];
 631         else
 632                 cnt = &rwstat->cpu_cnt[BLKG_RWSTAT_ASYNC];
 633
 634         percpu_counter_add_batch(cnt, val, BLKG_STAT_CPU_BATCH);
 635 }
 636
 637 /**
 638  * blkg_rwstat_read - read the current values of a blkg_rwstat
 639  * @rwstat: blkg_rwstat to read
 640  *
 641  * Read the current snapshot of @rwstat and return it in the aux counts.
 642  */
 643 static inline void blkg_rwstat_read(struct blkg_rwstat *rwstat,
 644                 struct blkg_rwstat_sample *result)
 645 {
 646         int i;
 647
 648         for (i = 0; i < BLKG_RWSTAT_NR; i++)
 649                 result->cnt[i] =
 650                         percpu_counter_sum_positive(&rwstat->cpu_cnt[i]);
 651 }
 652
 653 /**
 654  * blkg_rwstat_total - read the total count of a blkg_rwstat
 655  * @rwstat: blkg_rwstat to read
 656  *
 657  * Return the total count of @rwstat regardless of the IO direction.  This
 658  * function can be called without synchronization and takes care of u64
 659  * atomicity.
 660  */
 661 static inline uint64_t blkg_rwstat_total(struct blkg_rwstat *rwstat)
 662 {
 663         struct blkg_rwstat_sample tmp = { };
 664
 665         blkg_rwstat_read(rwstat, &tmp);
 666         return tmp.cnt[BLKG_RWSTAT_READ] + tmp.cnt[BLKG_RWSTAT_WRITE];
 667 }
 668
 669 /**
 670  * blkg_rwstat_reset - reset a blkg_rwstat
 671  * @rwstat: blkg_rwstat to reset
 672  */
 673 static inline void blkg_rwstat_reset(struct blkg_rwstat *rwstat)
 674 {
 675         int i;
 676
 677         for (i = 0; i < BLKG_RWSTAT_NR; i++) {
 678                 percpu_counter_set(&rwstat->cpu_cnt[i], 0);
 679                 atomic64_set(&rwstat->aux_cnt[i], 0);
 680         }
 681 }
 682
 683 /**
 684  * blkg_rwstat_add_aux - add a blkg_rwstat into another's aux count
 685  * @to: the destination blkg_rwstat
 686  * @from: the source
 687  *
 688  * Add @from's count including the aux one to @to's aux count.
 689  */
 690 static inline void blkg_rwstat_add_aux(struct blkg_rwstat *to,
 691                                        struct blkg_rwstat *from)
 692 {
 693         u64 sum[BLKG_RWSTAT_NR];
 694         int i;
 695
 696         for (i = 0; i < BLKG_RWSTAT_NR; i++)
 697                 sum[i] = percpu_counter_sum_positive(&from->cpu_cnt[i]);
 698
 699         for (i = 0; i < BLKG_RWSTAT_NR; i++)
 700                 atomic64_add(sum[i] + atomic64_read(&from->aux_cnt[i]),
 701                              &to->aux_cnt[i]);
 702 }
 703
/*
 * Throttling hook used by blkcg_bio_issue_check(), which treats a true
 * return as "throttled".  Without CONFIG_BLK_DEV_THROTTLING the stub
 * always returns false.
 */
#ifdef CONFIG_BLK_DEV_THROTTLING
extern bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
                           struct bio *bio);
#else
static inline bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
                                  struct bio *bio) { return false; }
#endif

/* Out-of-line part of blkcg_punt_bio_submit(), for REQ_CGROUP_PUNT bios. */
bool __blkcg_punt_bio_submit(struct bio *bio);
 713
 714 static inline bool blkcg_punt_bio_submit(struct bio *bio)
 715 {
 716         if (bio->bi_opf & REQ_CGROUP_PUNT)
 717                 return __blkcg_punt_bio_submit(bio);
 718         else
 719                 return false;
 720 }
 721
 722 static inline void blkcg_bio_issue_init(struct bio *bio)
 723 {
 724         bio_issue_init(&bio->bi_issue, bio_sectors(bio));
 725 }
 726
 727 static inline bool blkcg_bio_issue_check(struct request_queue *q,
 728                                          struct bio *bio)
 729 {
 730         struct blkcg_gq *blkg;
 731         bool throtl = false;
 732
 733         rcu_read_lock();
 734
 735         if (!bio->bi_blkg) {
 736                 char b[BDEVNAME_SIZE];
 737
 738                 WARN_ONCE(1,
 739                           "no blkg associated for bio on block-device: %s\n",
 740                           bio_devname(bio, b));
 741                 bio_associate_blkg(bio);
 742         }
 743
 744         blkg = bio->bi_blkg;
 745
 746         throtl = blk_throtl_bio(q, blkg, bio);
 747
 748         if (!throtl) {
 749                 /*
 750                  * If the bio is flagged with BIO_QUEUE_ENTERED it means this
 751                  * is a split bio and we would have already accounted for the
 752                  * size of the bio.
 753                  */
 754                 if (!bio_flagged(bio, BIO_QUEUE_ENTERED))
 755                         blkg_rwstat_add(&blkg->stat_bytes, bio->bi_opf,
 756                                         bio->bi_iter.bi_size);
 757                 blkg_rwstat_add(&blkg->stat_ios, bio->bi_opf, 1);
 758         }
 759
 760         blkcg_bio_issue_init(bio);
 761
 762         rcu_read_unlock();
 763         return !throtl;
 764 }
 765
 766 static inline void blkcg_use_delay(struct blkcg_gq *blkg)
 767 {
 768         if (atomic_add_return(1, &blkg->use_delay) == 1)
 769                 atomic_inc(&blkg->blkcg->css.cgroup->congestion_count);
 770 }
 771
 772 /**
 773  * blk_cgroup_mergeable - Determine whether to allow or disallow merges
 774  * @rq: request to merge into
 775  * @bio: bio to merge
 776  *
 777  * @bio and @rq should belong to the same cgroup and their issue_as_root should
 778  * match. The latter is necessary as we don't want to throttle e.g. a metadata
 779  * update because it happens to be next to a regular IO.
 780  */
 781 static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio)
 782 {
 783         return rq->bio->bi_blkg == bio->bi_blkg &&
 784                 bio_issue_as_root_blkg(rq->bio) == bio_issue_as_root_blkg(bio);
 785 }
 786
 787 static inline int blkcg_unuse_delay(struct blkcg_gq *blkg)
 788 {
 789         int old = atomic_read(&blkg->use_delay);
 790
 791         if (old == 0)
 792                 return 0;
 793
 794         /*
 795          * We do this song and dance because we can race with somebody else
 796          * adding or removing delay.  If we just did an atomic_dec we'd end up
 797          * negative and we'd already be in trouble.  We need to subtract 1 and
 798          * then check to see if we were the last delay so we can drop the
 799          * congestion count on the cgroup.
 800          */
 801         while (old) {
 802                 int cur = atomic_cmpxchg(&blkg->use_delay, old, old - 1);
 803                 if (cur == old)
 804                         break;
 805                 old = cur;
 806         }
 807
 808         if (old == 0)
 809                 return 0;
 810         if (old == 1)
 811                 atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
 812         return 1;
 813 }
 814
 815 static inline void blkcg_clear_delay(struct blkcg_gq *blkg)
 816 {
 817         int old = atomic_read(&blkg->use_delay);
 818         if (!old)
 819                 return;
 820         /* We only want 1 person clearing the congestion count for this blkg. */
 821         while (old) {
 822                 int cur = atomic_cmpxchg(&blkg->use_delay, old, 0);
 823                 if (cur == old) {
 824                         atomic_dec(&blkg->blkcg->css.cgroup->congestion_count);
 825                         break;
 826                 }
 827                 old = cur;
 828         }
 829 }
 830
/*
 * Delay and throttle entry points; implemented outside this header.
 * NOTE(review): @now and @delta are presumably in nanoseconds, going by
 * blkcg_gq->delay_nsec - confirm against the definition.
 */
void blkcg_add_delay(struct blkcg_gq *blkg, u64 now, u64 delta);
void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay);
void blkcg_maybe_throttle_current(void);
 834 #else   /* CONFIG_BLK_CGROUP */
 835
/*
 * Empty placeholders for builds without CONFIG_BLK_CGROUP, so that code
 * declaring or passing pointers to these types still compiles.
 */
struct blkcg {
};

struct blkg_policy_data {
};

struct blkcg_policy_data {
};

struct blkcg_gq {
};

struct blkcg_policy {
};
 850
/*
 * Without CONFIG_BLK_CGROUP there is no root css: the name expands to an
 * ERR_PTR(-EINVAL) cookie, which is neither NULL nor dereferenceable.
 */
#define blkcg_root_css  ((struct cgroup_subsys_state *)ERR_PTR(-EINVAL))

/* No-op and "never congested" variants of the CONFIG_BLK_CGROUP helpers. */
static inline void blkcg_maybe_throttle_current(void) { }
static inline bool blk_cgroup_congested(void) { return false; }
 855
 856 #ifdef CONFIG_BLOCK
 857
 858 static inline void blkcg_schedule_throttle(struct request_queue *q, bool use_memdelay) { }
 859
 860 static inline struct blkcg_gq *blkg_lookup(struct blkcg *blkcg, void *key) { return NULL; }
 861 static inline struct blkcg_gq *blk_queue_root_blkg(struct request_queue *q)
 862 { return NULL; }
 863 static inline int blkcg_init_queue(struct request_queue *q) { return 0; }
 864 static inline void blkcg_drain_queue(struct request_queue *q) { }
 865 static inline void blkcg_exit_queue(struct request_queue *q) { }
 866 static inline int blkcg_policy_register(struct blkcg_policy *pol) { return 0; }
 867 static inline void blkcg_policy_unregister(struct blkcg_policy *pol) { }
 868 static inline int blkcg_activate_policy(struct request_queue *q,
 869                                         const struct blkcg_policy *pol) { return 0; }
 870 static inline void blkcg_deactivate_policy(struct request_queue *q,
 871                                            const struct blkcg_policy *pol) { }
 872
 873 static inline struct blkcg *__bio_blkcg(struct bio *bio) { return NULL; }
 874 static inline struct blkcg *bio_blkcg(struct bio *bio) { return NULL; }
 875
 876 static inline struct blkg_policy_data *blkg_to_pd(struct blkcg_gq *blkg,
 877                                                   struct blkcg_policy *pol) { return NULL; }
 878 static inline struct blkcg_gq *pd_to_blkg(struct blkg_policy_data *pd) { return NULL; }
 879 static inline char *blkg_path(struct blkcg_gq *blkg) { return NULL; }
/* No reference counting without CONFIG_BLK_CGROUP. */
static inline void blkg_get(struct blkcg_gq *blkg) { }
static inline void blkg_put(struct blkcg_gq *blkg) { }

/*
 * Bio hooks without CONFIG_BLK_CGROUP: nothing is punted, the issue check
 * always passes (true, as the real one returns for an unthrottled bio)
 * and any request/bio pair is mergeable as far as cgroups are concerned.
 */
static inline bool blkcg_punt_bio_submit(struct bio *bio) { return false; }
static inline void blkcg_bio_issue_init(struct bio *bio) { }
static inline bool blkcg_bio_issue_check(struct request_queue *q,
                                         struct bio *bio) { return true; }
static inline bool blk_cgroup_mergeable(struct request *rq, struct bio *bio) { return true; }
 888
/*
 * Run the loop body exactly once, with @rl pointing at @q->root_rl.
 * NOTE(review): nothing else in this header refers to root_rl, and the
 * CONFIG_BLK_CGROUP branch defines no counterpart - confirm this macro
 * still has users and that request_queue still has a root_rl member.
 */
#define blk_queue_for_each_rl(rl, q)    \
        for ((rl) = &(q)->root_rl; (rl); (rl) = NULL)
 891
 892 #endif  /* CONFIG_BLOCK */
 893 #endif  /* CONFIG_BLK_CGROUP */
 894 #endif  /* _BLK_CGROUP_H */