fs/btrfs/discard.c

   1 // SPDX-License-Identifier: GPL-2.0
   2
   3 #include <linux/jiffies.h>
   4 #include <linux/kernel.h>
   5 #include <linux/ktime.h>
   6 #include <linux/list.h>
   7 #include <linux/math64.h>
   8 #include <linux/sizes.h>
   9 #include <linux/workqueue.h>
  10 #include "ctree.h"
  11 #include "block-group.h"
  12 #include "discard.h"
  13 #include "free-space-cache.h"
  14
  15 /*
  16  * This contains the logic to handle async discard.
  17  *
  18  * Async discard manages trimming of free space outside of transaction commit.
  19  * Discarding is done by managing the block_groups on a LRU list based on free
  20  * space recency.  Two passes are used to first prioritize discarding extents
  21  * and then allow for trimming in the bitmap the best opportunity to coalesce.
  22  * The block_groups are maintained on multiple lists to allow for multiple
  23  * passes with different discard filter requirements.  A delayed work item is
  24  * used to manage discarding with timeout determined by a max of the delay
  25  * incurred by the iops rate limit, the byte rate limit, and the max delay of
  26  * BTRFS_DISCARD_MAX_DELAY.
  27  *
  28  * Note, this only keeps track of block_groups that are explicitly for data.
  29  * Mixed block_groups are not supported.
  30  *
  31  * The first list is special to manage discarding of fully free block groups.
  32  * This is necessary because we issue a final trim for a full free block group
  33  * after forgetting it.  When a block group becomes unused, instead of directly
  34  * being added to the unused_bgs list, we add it to this first list.  Then
  35  * from there, if it becomes fully discarded, we place it onto the unused_bgs
  36  * list.
  37  *
  38  * The in-memory free space cache serves as the backing state for discard.
  39  * Consequently this means there is no persistence.  We opt to load all the
  40  * block groups in as not discarded, so the mount case degenerates to the
  41  * crashing case.
  42  *
  43  * As the free space cache uses bitmaps, there exists a tradeoff between
  44  * ease/efficiency for find_free_extent() and the accuracy of discard state.
  45  * Here we opt to let untrimmed regions merge with everything while only letting
  46  * trimmed regions merge with other trimmed regions.  This can cause
  47  * overtrimming, but the coalescing benefit seems to be worth it.  Additionally,
  48  * bitmap state is tracked as a whole.  If we're able to fully trim a bitmap,
  49  * the trimmed flag is set on the bitmap.  Otherwise, if an allocation comes in,
  50  * this resets the state and we will retry trimming the whole bitmap.  This is a
  51  * tradeoff between discard state accuracy and the cost of accounting.
  52  */
  53
  54 /* This is an initial delay to give some chance for block reuse */
  55 #define BTRFS_DISCARD_DELAY             (120ULL * NSEC_PER_SEC)
  56 #define BTRFS_DISCARD_UNUSED_DELAY      (10ULL * NSEC_PER_SEC)
  57
  58 /* Target completion latency of discarding all discardable extents */
  59 #define BTRFS_DISCARD_TARGET_MSEC       (6 * 60 * 60UL * MSEC_PER_SEC)
  60 #define BTRFS_DISCARD_MIN_DELAY_MSEC    (1UL)
  61 #define BTRFS_DISCARD_MAX_DELAY_MSEC    (1000UL)
  62 #define BTRFS_DISCARD_MAX_IOPS          (10U)
  63
  64 /* Montonically decreasing minimum length filters after index 0 */
  65 static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = {
  66         0,
  67         BTRFS_ASYNC_DISCARD_MAX_FILTER,
  68         BTRFS_ASYNC_DISCARD_MIN_FILTER
  69 };
  70
  71 static struct list_head *get_discard_list(struct btrfs_discard_ctl *discard_ctl,
  72                                           struct btrfs_block_group *block_group)
  73 {
  74         return &discard_ctl->discard_list[block_group->discard_index];
  75 }
  76
  77 static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
  78                                   struct btrfs_block_group *block_group)
  79 {
  80         lockdep_assert_held(&discard_ctl->lock);
  81         if (!btrfs_run_discard_work(discard_ctl))
  82                 return;
  83
  84         if (list_empty(&block_group->discard_list) ||
  85             block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) {
  86                 if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED)
  87                         block_group->discard_index = BTRFS_DISCARD_INDEX_START;
  88                 block_group->discard_eligible_time = (ktime_get_ns() +
  89                                                       BTRFS_DISCARD_DELAY);
  90                 block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
  91         }
  92         if (list_empty(&block_group->discard_list))
  93                 btrfs_get_block_group(block_group);
  94
  95         list_move_tail(&block_group->discard_list,
  96                        get_discard_list(discard_ctl, block_group));
  97 }
  98
  99 static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl,
 100                                 struct btrfs_block_group *block_group)
 101 {
 102         if (!btrfs_is_block_group_data_only(block_group))
 103                 return;
 104
 105         spin_lock(&discard_ctl->lock);
 106         __add_to_discard_list(discard_ctl, block_group);
 107         spin_unlock(&discard_ctl->lock);
 108 }
 109
 110 static void add_to_discard_unused_list(struct btrfs_discard_ctl *discard_ctl,
 111                                        struct btrfs_block_group *block_group)
 112 {
 113         bool queued;
 114
 115         spin_lock(&discard_ctl->lock);
 116
 117         queued = !list_empty(&block_group->discard_list);
 118
 119         if (!btrfs_run_discard_work(discard_ctl)) {
 120                 spin_unlock(&discard_ctl->lock);
 121                 return;
 122         }
 123
 124         list_del_init(&block_group->discard_list);
 125
 126         block_group->discard_index = BTRFS_DISCARD_INDEX_UNUSED;
 127         block_group->discard_eligible_time = (ktime_get_ns() +
 128                                               BTRFS_DISCARD_UNUSED_DELAY);
 129         block_group->discard_state = BTRFS_DISCARD_RESET_CURSOR;
 130         if (!queued)
 131                 btrfs_get_block_group(block_group);
 132         list_add_tail(&block_group->discard_list,
 133                       &discard_ctl->discard_list[BTRFS_DISCARD_INDEX_UNUSED]);
 134
 135         spin_unlock(&discard_ctl->lock);
 136 }
 137
 138 static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl,
 139                                      struct btrfs_block_group *block_group)
 140 {
 141         bool running = false;
 142         bool queued = false;
 143
 144         spin_lock(&discard_ctl->lock);
 145
 146         if (block_group == discard_ctl->block_group) {
 147                 running = true;
 148                 discard_ctl->block_group = NULL;
 149         }
 150
 151         block_group->discard_eligible_time = 0;
 152         queued = !list_empty(&block_group->discard_list);
 153         list_del_init(&block_group->discard_list);
 154         /*
 155          * If the block group is currently running in the discard workfn, we
 156          * don't want to deref it, since it's still being used by the workfn.
 157          * The workfn will notice this case and deref the block group when it is
 158          * finished.
 159          */
 160         if (queued && !running)
 161                 btrfs_put_block_group(block_group);
 162
 163         spin_unlock(&discard_ctl->lock);
 164
 165         return running;
 166 }
 167
 168 /**
 169  * find_next_block_group - find block_group that's up next for discarding
 170  * @discard_ctl: discard control
 171  * @now: current time
 172  *
 173  * Iterate over the discard lists to find the next block_group up for
 174  * discarding checking the discard_eligible_time of block_group.
 175  */
 176 static struct btrfs_block_group *find_next_block_group(
 177                                         struct btrfs_discard_ctl *discard_ctl,
 178                                         u64 now)
 179 {
 180         struct btrfs_block_group *ret_block_group = NULL, *block_group;
 181         int i;
 182
 183         for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
 184                 struct list_head *discard_list = &discard_ctl->discard_list[i];
 185
 186                 if (!list_empty(discard_list)) {
 187                         block_group = list_first_entry(discard_list,
 188                                                        struct btrfs_block_group,
 189                                                        discard_list);
 190
 191                         if (!ret_block_group)
 192                                 ret_block_group = block_group;
 193
 194                         if (ret_block_group->discard_eligible_time < now)
 195                                 break;
 196
 197                         if (ret_block_group->discard_eligible_time >
 198                             block_group->discard_eligible_time)
 199                                 ret_block_group = block_group;
 200                 }
 201         }
 202
 203         return ret_block_group;
 204 }
 205
 206 /**
 207  * Wrap find_next_block_group()
 208  *
 209  * @discard_ctl:   discard control
 210  * @discard_state: the discard_state of the block_group after state management
 211  * @discard_index: the discard_index of the block_group after state management
 212  * @now:           time when discard was invoked, in ns
 213  *
 214  * This wraps find_next_block_group() and sets the block_group to be in use.
 215  * discard_state's control flow is managed here.  Variables related to
 216  * discard_state are reset here as needed (eg discard_cursor).  @discard_state
 217  * and @discard_index are remembered as it may change while we're discarding,
 218  * but we want the discard to execute in the context determined here.
 219  */
 220 static struct btrfs_block_group *peek_discard_list(
 221                                         struct btrfs_discard_ctl *discard_ctl,
 222                                         enum btrfs_discard_state *discard_state,
 223                                         int *discard_index, u64 now)
 224 {
 225         struct btrfs_block_group *block_group;
 226
 227         spin_lock(&discard_ctl->lock);
 228 again:
 229         block_group = find_next_block_group(discard_ctl, now);
 230
 231         if (block_group && now >= block_group->discard_eligible_time) {
 232                 if (block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED &&
 233                     block_group->used != 0) {
 234                         if (btrfs_is_block_group_data_only(block_group)) {
 235                                 __add_to_discard_list(discard_ctl, block_group);
 236                         } else {
 237                                 list_del_init(&block_group->discard_list);
 238                                 btrfs_put_block_group(block_group);
 239                         }
 240                         goto again;
 241                 }
 242                 if (block_group->discard_state == BTRFS_DISCARD_RESET_CURSOR) {
 243                         block_group->discard_cursor = block_group->start;
 244                         block_group->discard_state = BTRFS_DISCARD_EXTENTS;
 245                 }
 246                 discard_ctl->block_group = block_group;
 247         }
 248         if (block_group) {
 249                 *discard_state = block_group->discard_state;
 250                 *discard_index = block_group->discard_index;
 251         }
 252         spin_unlock(&discard_ctl->lock);
 253
 254         return block_group;
 255 }
 256
 257 /**
 258  * btrfs_discard_check_filter - updates a block groups filters
 259  * @block_group: block group of interest
 260  * @bytes: recently freed region size after coalescing
 261  *
 262  * Async discard maintains multiple lists with progressively smaller filters
 263  * to prioritize discarding based on size.  Should a free space that matches
 264  * a larger filter be returned to the free_space_cache, prioritize that discard
 265  * by moving @block_group to the proper filter.
 266  */
 267 void btrfs_discard_check_filter(struct btrfs_block_group *block_group,
 268                                 u64 bytes)
 269 {
 270         struct btrfs_discard_ctl *discard_ctl;
 271
 272         if (!block_group ||
 273             !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
 274                 return;
 275
 276         discard_ctl = &block_group->fs_info->discard_ctl;
 277
 278         if (block_group->discard_index > BTRFS_DISCARD_INDEX_START &&
 279             bytes >= discard_minlen[block_group->discard_index - 1]) {
 280                 int i;
 281
 282                 remove_from_discard_list(discard_ctl, block_group);
 283
 284                 for (i = BTRFS_DISCARD_INDEX_START; i < BTRFS_NR_DISCARD_LISTS;
 285                      i++) {
 286                         if (bytes >= discard_minlen[i]) {
 287                                 block_group->discard_index = i;
 288                                 add_to_discard_list(discard_ctl, block_group);
 289                                 break;
 290                         }
 291                 }
 292         }
 293 }
 294
 295 /**
 296  * btrfs_update_discard_index - moves a block group along the discard lists
 297  * @discard_ctl: discard control
 298  * @block_group: block_group of interest
 299  *
 300  * Increment @block_group's discard_index.  If it falls of the list, let it be.
 301  * Otherwise add it back to the appropriate list.
 302  */
 303 static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl,
 304                                        struct btrfs_block_group *block_group)
 305 {
 306         block_group->discard_index++;
 307         if (block_group->discard_index == BTRFS_NR_DISCARD_LISTS) {
 308                 block_group->discard_index = 1;
 309                 return;
 310         }
 311
 312         add_to_discard_list(discard_ctl, block_group);
 313 }
 314
 315 /**
 316  * btrfs_discard_cancel_work - remove a block_group from the discard lists
 317  * @discard_ctl: discard control
 318  * @block_group: block_group of interest
 319  *
 320  * This removes @block_group from the discard lists.  If necessary, it waits on
 321  * the current work and then reschedules the delayed work.
 322  */
 323 void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl,
 324                                struct btrfs_block_group *block_group)
 325 {
 326         if (remove_from_discard_list(discard_ctl, block_group)) {
 327                 cancel_delayed_work_sync(&discard_ctl->work);
 328                 btrfs_discard_schedule_work(discard_ctl, true);
 329         }
 330 }
 331
 332 /**
 333  * btrfs_discard_queue_work - handles queuing the block_groups
 334  * @discard_ctl: discard control
 335  * @block_group: block_group of interest
 336  *
 337  * This maintains the LRU order of the discard lists.
 338  */
 339 void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
 340                               struct btrfs_block_group *block_group)
 341 {
 342         if (!block_group || !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC))
 343                 return;
 344
 345         if (block_group->used == 0)
 346                 add_to_discard_unused_list(discard_ctl, block_group);
 347         else
 348                 add_to_discard_list(discard_ctl, block_group);
 349
 350         if (!delayed_work_pending(&discard_ctl->work))
 351                 btrfs_discard_schedule_work(discard_ctl, false);
 352 }
 353
 354 static void __btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
 355                                           u64 now, bool override)
 356 {
 357         struct btrfs_block_group *block_group;
 358
 359         if (!btrfs_run_discard_work(discard_ctl))
 360                 return;
 361         if (!override && delayed_work_pending(&discard_ctl->work))
 362                 return;
 363
 364         block_group = find_next_block_group(discard_ctl, now);
 365         if (block_group) {
 366                 u64 delay = discard_ctl->delay_ms * NSEC_PER_MSEC;
 367                 u32 kbps_limit = READ_ONCE(discard_ctl->kbps_limit);
 368
 369                 /*
 370                  * A single delayed workqueue item is responsible for
 371                  * discarding, so we can manage the bytes rate limit by keeping
 372                  * track of the previous discard.
 373                  */
 374                 if (kbps_limit && discard_ctl->prev_discard) {
 375                         u64 bps_limit = ((u64)kbps_limit) * SZ_1K;
 376                         u64 bps_delay = div64_u64(discard_ctl->prev_discard *
 377                                                   NSEC_PER_SEC, bps_limit);
 378
 379                         delay = max(delay, bps_delay);
 380                 }
 381
 382                 /*
 383                  * This timeout is to hopefully prevent immediate discarding
 384                  * in a recently allocated block group.
 385                  */
 386                 if (now < block_group->discard_eligible_time) {
 387                         u64 bg_timeout = block_group->discard_eligible_time - now;
 388
 389                         delay = max(delay, bg_timeout);
 390                 }
 391
 392                 if (override && discard_ctl->prev_discard) {
 393                         u64 elapsed = now - discard_ctl->prev_discard_time;
 394
 395                         if (delay > elapsed)
 396                                 delay -= elapsed;
 397                         else
 398                                 delay = 0;
 399                 }
 400
 401                 mod_delayed_work(discard_ctl->discard_workers,
 402                                  &discard_ctl->work, nsecs_to_jiffies(delay));
 403         }
 404 }
 405
 406 /*
 407  * btrfs_discard_schedule_work - responsible for scheduling the discard work
 408  * @discard_ctl:  discard control
 409  * @override:     override the current timer
 410  *
 411  * Discards are issued by a delayed workqueue item.  @override is used to
 412  * update the current delay as the baseline delay interval is reevaluated on
 413  * transaction commit.  This is also maxed with any other rate limit.
 414  */
 415 void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
 416                                  bool override)
 417 {
 418         const u64 now = ktime_get_ns();
 419
 420         spin_lock(&discard_ctl->lock);
 421         __btrfs_discard_schedule_work(discard_ctl, now, override);
 422         spin_unlock(&discard_ctl->lock);
 423 }
 424
 425 /**
 426  * btrfs_finish_discard_pass - determine next step of a block_group
 427  * @discard_ctl: discard control
 428  * @block_group: block_group of interest
 429  *
 430  * This determines the next step for a block group after it's finished going
 431  * through a pass on a discard list.  If it is unused and fully trimmed, we can
 432  * mark it unused and send it to the unused_bgs path.  Otherwise, pass it onto
 433  * the appropriate filter list or let it fall off.
 434  */
 435 static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl,
 436                                       struct btrfs_block_group *block_group)
 437 {
 438         remove_from_discard_list(discard_ctl, block_group);
 439
 440         if (block_group->used == 0) {
 441                 if (btrfs_is_free_space_trimmed(block_group))
 442                         btrfs_mark_bg_unused(block_group);
 443                 else
 444                         add_to_discard_unused_list(discard_ctl, block_group);
 445         } else {
 446                 btrfs_update_discard_index(discard_ctl, block_group);
 447         }
 448 }
 449
 450 /**
 451  * btrfs_discard_workfn - discard work function
 452  * @work: work
 453  *
 454  * This finds the next block_group to start discarding and then discards a
 455  * single region.  It does this in a two-pass fashion: first extents and second
 456  * bitmaps.  Completely discarded block groups are sent to the unused_bgs path.
 457  */
 458 static void btrfs_discard_workfn(struct work_struct *work)
 459 {
 460         struct btrfs_discard_ctl *discard_ctl;
 461         struct btrfs_block_group *block_group;
 462         enum btrfs_discard_state discard_state;
 463         int discard_index = 0;
 464         u64 trimmed = 0;
 465         u64 minlen = 0;
 466         u64 now = ktime_get_ns();
 467
 468         discard_ctl = container_of(work, struct btrfs_discard_ctl, work.work);
 469
 470         block_group = peek_discard_list(discard_ctl, &discard_state,
 471                                         &discard_index, now);
 472         if (!block_group || !btrfs_run_discard_work(discard_ctl))
 473                 return;
 474         if (now < block_group->discard_eligible_time) {
 475                 btrfs_discard_schedule_work(discard_ctl, false);
 476                 return;
 477         }
 478
 479         /* Perform discarding */
 480         minlen = discard_minlen[discard_index];
 481
 482         if (discard_state == BTRFS_DISCARD_BITMAPS) {
 483                 u64 maxlen = 0;
 484
 485                 /*
 486                  * Use the previous levels minimum discard length as the max
 487                  * length filter.  In the case something is added to make a
 488                  * region go beyond the max filter, the entire bitmap is set
 489                  * back to BTRFS_TRIM_STATE_UNTRIMMED.
 490                  */
 491                 if (discard_index != BTRFS_DISCARD_INDEX_UNUSED)
 492                         maxlen = discard_minlen[discard_index - 1];
 493
 494                 btrfs_trim_block_group_bitmaps(block_group, &trimmed,
 495                                        block_group->discard_cursor,
 496                                        btrfs_block_group_end(block_group),
 497                                        minlen, maxlen, true);
 498                 discard_ctl->discard_bitmap_bytes += trimmed;
 499         } else {
 500                 btrfs_trim_block_group_extents(block_group, &trimmed,
 501                                        block_group->discard_cursor,
 502                                        btrfs_block_group_end(block_group),
 503                                        minlen, true);
 504                 discard_ctl->discard_extent_bytes += trimmed;
 505         }
 506
 507         /* Determine next steps for a block_group */
 508         if (block_group->discard_cursor >= btrfs_block_group_end(block_group)) {
 509                 if (discard_state == BTRFS_DISCARD_BITMAPS) {
 510                         btrfs_finish_discard_pass(discard_ctl, block_group);
 511                 } else {
 512                         block_group->discard_cursor = block_group->start;
 513                         spin_lock(&discard_ctl->lock);
 514                         if (block_group->discard_state !=
 515                             BTRFS_DISCARD_RESET_CURSOR)
 516                                 block_group->discard_state =
 517                                                         BTRFS_DISCARD_BITMAPS;
 518                         spin_unlock(&discard_ctl->lock);
 519                 }
 520         }
 521
 522         now = ktime_get_ns();
 523         spin_lock(&discard_ctl->lock);
 524         discard_ctl->prev_discard = trimmed;
 525         discard_ctl->prev_discard_time = now;
 526         /*
 527          * If the block group was removed from the discard list while it was
 528          * running in this workfn, then we didn't deref it, since this function
 529          * still owned that reference. But we set the discard_ctl->block_group
 530          * back to NULL, so we can use that condition to know that now we need
 531          * to deref the block_group.
 532          */
 533         if (discard_ctl->block_group == NULL)
 534                 btrfs_put_block_group(block_group);
 535         discard_ctl->block_group = NULL;
 536         __btrfs_discard_schedule_work(discard_ctl, now, false);
 537         spin_unlock(&discard_ctl->lock);
 538 }
 539
 540 /**
 541  * btrfs_run_discard_work - determines if async discard should be running
 542  * @discard_ctl: discard control
 543  *
 544  * Checks if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set.
 545  */
 546 bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl)
 547 {
 548         struct btrfs_fs_info *fs_info = container_of(discard_ctl,
 549                                                      struct btrfs_fs_info,
 550                                                      discard_ctl);
 551
 552         return (!(fs_info->sb->s_flags & SB_RDONLY) &&
 553                 test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags));
 554 }
 555
 556 /**
 557  * btrfs_discard_calc_delay - recalculate the base delay
 558  * @discard_ctl: discard control
 559  *
 560  * Recalculate the base delay which is based off the total number of
 561  * discardable_extents.  Clamp this between the lower_limit (iops_limit or 1ms)
 562  * and the upper_limit (BTRFS_DISCARD_MAX_DELAY_MSEC).
 563  */
 564 void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
 565 {
 566         s32 discardable_extents;
 567         s64 discardable_bytes;
 568         u32 iops_limit;
 569         unsigned long delay;
 570
 571         discardable_extents = atomic_read(&discard_ctl->discardable_extents);
 572         if (!discardable_extents)
 573                 return;
 574
 575         spin_lock(&discard_ctl->lock);
 576
 577         /*
 578          * The following is to fix a potential -1 discrepenancy that we're not
 579          * sure how to reproduce. But given that this is the only place that
 580          * utilizes these numbers and this is only called by from
 581          * btrfs_finish_extent_commit() which is synchronized, we can correct
 582          * here.
 583          */
 584         if (discardable_extents < 0)
 585                 atomic_add(-discardable_extents,
 586                            &discard_ctl->discardable_extents);
 587
 588         discardable_bytes = atomic64_read(&discard_ctl->discardable_bytes);
 589         if (discardable_bytes < 0)
 590                 atomic64_add(-discardable_bytes,
 591                              &discard_ctl->discardable_bytes);
 592
 593         if (discardable_extents <= 0) {
 594                 spin_unlock(&discard_ctl->lock);
 595                 return;
 596         }
 597
 598         iops_limit = READ_ONCE(discard_ctl->iops_limit);
 599         if (iops_limit)
 600                 delay = MSEC_PER_SEC / iops_limit;
 601         else
 602                 delay = BTRFS_DISCARD_TARGET_MSEC / discardable_extents;
 603
 604         delay = clamp(delay, BTRFS_DISCARD_MIN_DELAY_MSEC,
 605                       BTRFS_DISCARD_MAX_DELAY_MSEC);
 606         discard_ctl->delay_ms = delay;
 607
 608         spin_unlock(&discard_ctl->lock);
 609 }
 610
 611 /**
 612  * btrfs_discard_update_discardable - propagate discard counters
 613  * @block_group: block_group of interest
 614  *
 615  * This propagates deltas of counters up to the discard_ctl.  It maintains a
 616  * current counter and a previous counter passing the delta up to the global
 617  * stat.  Then the current counter value becomes the previous counter value.
 618  */
 619 void btrfs_discard_update_discardable(struct btrfs_block_group *block_group)
 620 {
 621         struct btrfs_free_space_ctl *ctl;
 622         struct btrfs_discard_ctl *discard_ctl;
 623         s32 extents_delta;
 624         s64 bytes_delta;
 625
 626         if (!block_group ||
 627             !btrfs_test_opt(block_group->fs_info, DISCARD_ASYNC) ||
 628             !btrfs_is_block_group_data_only(block_group))
 629                 return;
 630
 631         ctl = block_group->free_space_ctl;
 632         discard_ctl = &block_group->fs_info->discard_ctl;
 633
 634         lockdep_assert_held(&ctl->tree_lock);
 635         extents_delta = ctl->discardable_extents[BTRFS_STAT_CURR] -
 636                         ctl->discardable_extents[BTRFS_STAT_PREV];
 637         if (extents_delta) {
 638                 atomic_add(extents_delta, &discard_ctl->discardable_extents);
 639                 ctl->discardable_extents[BTRFS_STAT_PREV] =
 640                         ctl->discardable_extents[BTRFS_STAT_CURR];
 641         }
 642
 643         bytes_delta = ctl->discardable_bytes[BTRFS_STAT_CURR] -
 644                       ctl->discardable_bytes[BTRFS_STAT_PREV];
 645         if (bytes_delta) {
 646                 atomic64_add(bytes_delta, &discard_ctl->discardable_bytes);
 647                 ctl->discardable_bytes[BTRFS_STAT_PREV] =
 648                         ctl->discardable_bytes[BTRFS_STAT_CURR];
 649         }
 650 }
 651
 652 /**
 653  * btrfs_discard_punt_unused_bgs_list - punt unused_bgs list to discard lists
 654  * @fs_info: fs_info of interest
 655  *
 656  * The unused_bgs list needs to be punted to the discard lists because the
 657  * order of operations is changed.  In the normal synchronous discard path, the
 658  * block groups are trimmed via a single large trim in transaction commit.  This
 659  * is ultimately what we are trying to avoid with asynchronous discard.  Thus,
 660  * it must be done before going down the unused_bgs path.
 661  */
 662 void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info)
 663 {
 664         struct btrfs_block_group *block_group, *next;
 665
 666         spin_lock(&fs_info->unused_bgs_lock);
 667         /* We enabled async discard, so punt all to the queue */
 668         list_for_each_entry_safe(block_group, next, &fs_info->unused_bgs,
 669                                  bg_list) {
 670                 list_del_init(&block_group->bg_list);
 671                 btrfs_discard_queue_work(&fs_info->discard_ctl, block_group);
 672                 /*
 673                  * This put is for the get done by btrfs_mark_bg_unused.
 674                  * Queueing discard incremented it for discard's reference.
 675                  */
 676                 btrfs_put_block_group(block_group);
 677         }
 678         spin_unlock(&fs_info->unused_bgs_lock);
 679 }
 680
 681 /**
 682  * btrfs_discard_purge_list - purge discard lists
 683  * @discard_ctl: discard control
 684  *
 685  * If we are disabling async discard, we may have intercepted block groups that
 686  * are completely free and ready for the unused_bgs path.  As discarding will
 687  * now happen in transaction commit or not at all, we can safely mark the
 688  * corresponding block groups as unused and they will be sent on their merry
 689  * way to the unused_bgs list.
 690  */
 691 static void btrfs_discard_purge_list(struct btrfs_discard_ctl *discard_ctl)
 692 {
 693         struct btrfs_block_group *block_group, *next;
 694         int i;
 695
 696         spin_lock(&discard_ctl->lock);
 697         for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++) {
 698                 list_for_each_entry_safe(block_group, next,
 699                                          &discard_ctl->discard_list[i],
 700                                          discard_list) {
 701                         list_del_init(&block_group->discard_list);
 702                         spin_unlock(&discard_ctl->lock);
 703                         if (block_group->used == 0)
 704                                 btrfs_mark_bg_unused(block_group);
 705                         spin_lock(&discard_ctl->lock);
 706                         btrfs_put_block_group(block_group);
 707                 }
 708         }
 709         spin_unlock(&discard_ctl->lock);
 710 }
 711
 712 void btrfs_discard_resume(struct btrfs_fs_info *fs_info)
 713 {
 714         if (!btrfs_test_opt(fs_info, DISCARD_ASYNC)) {
 715                 btrfs_discard_cleanup(fs_info);
 716                 return;
 717         }
 718
 719         btrfs_discard_punt_unused_bgs_list(fs_info);
 720
 721         set_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
 722 }
 723
 724 void btrfs_discard_stop(struct btrfs_fs_info *fs_info)
 725 {
 726         clear_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags);
 727 }
 728
 729 void btrfs_discard_init(struct btrfs_fs_info *fs_info)
 730 {
 731         struct btrfs_discard_ctl *discard_ctl = &fs_info->discard_ctl;
 732         int i;
 733
 734         spin_lock_init(&discard_ctl->lock);
 735         INIT_DELAYED_WORK(&discard_ctl->work, btrfs_discard_workfn);
 736
 737         for (i = 0; i < BTRFS_NR_DISCARD_LISTS; i++)
 738                 INIT_LIST_HEAD(&discard_ctl->discard_list[i]);
 739
 740         discard_ctl->prev_discard = 0;
 741         discard_ctl->prev_discard_time = 0;
 742         atomic_set(&discard_ctl->discardable_extents, 0);
 743         atomic64_set(&discard_ctl->discardable_bytes, 0);
 744         discard_ctl->max_discard_size = BTRFS_ASYNC_DISCARD_DEFAULT_MAX_SIZE;
 745         discard_ctl->delay_ms = BTRFS_DISCARD_MAX_DELAY_MSEC;
 746         discard_ctl->iops_limit = BTRFS_DISCARD_MAX_IOPS;
 747         discard_ctl->kbps_limit = 0;
 748         discard_ctl->discard_extent_bytes = 0;
 749         discard_ctl->discard_bitmap_bytes = 0;
 750         atomic64_set(&discard_ctl->discard_bytes_saved, 0);
 751 }
 752
 753 void btrfs_discard_cleanup(struct btrfs_fs_info *fs_info)
 754 {
 755         btrfs_discard_stop(fs_info);
 756         cancel_delayed_work_sync(&fs_info->discard_ctl.work);
 757         btrfs_discard_purge_list(&fs_info->discard_ctl);
 758 }