fs/btrfs/volumes.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (C) 2007 Oracle.  All rights reserved.
   4  */
   5
   6 #include <linux/sched.h>
   7 #include <linux/sched/mm.h>
   8 #include <linux/bio.h>
   9 #include <linux/slab.h>
  10 #include <linux/blkdev.h>
  11 #include <linux/ratelimit.h>
  12 #include <linux/kthread.h>
  13 #include <linux/raid/pq.h>
  14 #include <linux/semaphore.h>
  15 #include <linux/uuid.h>
  16 #include <linux/list_sort.h>
  17 #include <linux/namei.h>
  18 #include "misc.h"
  19 #include "ctree.h"
  20 #include "extent_map.h"
  21 #include "disk-io.h"
  22 #include "transaction.h"
  23 #include "print-tree.h"
  24 #include "volumes.h"
  25 #include "raid56.h"
  26 #include "async-thread.h"
  27 #include "check-integrity.h"
  28 #include "rcu-string.h"
  29 #include "dev-replace.h"
  30 #include "sysfs.h"
  31 #include "tree-checker.h"
  32 #include "space-info.h"
  33 #include "block-group.h"
  34 #include "discard.h"
  35
  36 const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
  37         [BTRFS_RAID_RAID10] = {
  38                 .sub_stripes    = 2,
  39                 .dev_stripes    = 1,
  40                 .devs_max       = 0,    /* 0 == as many as possible */
  41                 .devs_min       = 4,
  42                 .tolerated_failures = 1,
  43                 .devs_increment = 2,
  44                 .ncopies        = 2,
  45                 .nparity        = 0,
  46                 .raid_name      = "raid10",
  47                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID10,
  48                 .mindev_error   = BTRFS_ERROR_DEV_RAID10_MIN_NOT_MET,
  49         },
  50         [BTRFS_RAID_RAID1] = {
  51                 .sub_stripes    = 1,
  52                 .dev_stripes    = 1,
  53                 .devs_max       = 2,
  54                 .devs_min       = 2,
  55                 .tolerated_failures = 1,
  56                 .devs_increment = 2,
  57                 .ncopies        = 2,
  58                 .nparity        = 0,
  59                 .raid_name      = "raid1",
  60                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID1,
  61                 .mindev_error   = BTRFS_ERROR_DEV_RAID1_MIN_NOT_MET,
  62         },
  63         [BTRFS_RAID_RAID1C3] = {
  64                 .sub_stripes    = 1,
  65                 .dev_stripes    = 1,
  66                 .devs_max       = 3,
  67                 .devs_min       = 3,
  68                 .tolerated_failures = 2,
  69                 .devs_increment = 3,
  70                 .ncopies        = 3,
  71                 .nparity        = 0,
  72                 .raid_name      = "raid1c3",
  73                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID1C3,
  74                 .mindev_error   = BTRFS_ERROR_DEV_RAID1C3_MIN_NOT_MET,
  75         },
  76         [BTRFS_RAID_RAID1C4] = {
  77                 .sub_stripes    = 1,
  78                 .dev_stripes    = 1,
  79                 .devs_max       = 4,
  80                 .devs_min       = 4,
  81                 .tolerated_failures = 3,
  82                 .devs_increment = 4,
  83                 .ncopies        = 4,
  84                 .nparity        = 0,
  85                 .raid_name      = "raid1c4",
  86                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID1C4,
  87                 .mindev_error   = BTRFS_ERROR_DEV_RAID1C4_MIN_NOT_MET,
  88         },
  89         [BTRFS_RAID_DUP] = {
  90                 .sub_stripes    = 1,
  91                 .dev_stripes    = 2,
  92                 .devs_max       = 1,
  93                 .devs_min       = 1,
  94                 .tolerated_failures = 0,
  95                 .devs_increment = 1,
  96                 .ncopies        = 2,
  97                 .nparity        = 0,
  98                 .raid_name      = "dup",
  99                 .bg_flag        = BTRFS_BLOCK_GROUP_DUP,
 100                 .mindev_error   = 0,
 101         },
 102         [BTRFS_RAID_RAID0] = {
 103                 .sub_stripes    = 1,
 104                 .dev_stripes    = 1,
 105                 .devs_max       = 0,
 106                 .devs_min       = 2,
 107                 .tolerated_failures = 0,
 108                 .devs_increment = 1,
 109                 .ncopies        = 1,
 110                 .nparity        = 0,
 111                 .raid_name      = "raid0",
 112                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID0,
 113                 .mindev_error   = 0,
 114         },
 115         [BTRFS_RAID_SINGLE] = {
 116                 .sub_stripes    = 1,
 117                 .dev_stripes    = 1,
 118                 .devs_max       = 1,
 119                 .devs_min       = 1,
 120                 .tolerated_failures = 0,
 121                 .devs_increment = 1,
 122                 .ncopies        = 1,
 123                 .nparity        = 0,
 124                 .raid_name      = "single",
 125                 .bg_flag        = 0,
 126                 .mindev_error   = 0,
 127         },
 128         [BTRFS_RAID_RAID5] = {
 129                 .sub_stripes    = 1,
 130                 .dev_stripes    = 1,
 131                 .devs_max       = 0,
 132                 .devs_min       = 2,
 133                 .tolerated_failures = 1,
 134                 .devs_increment = 1,
 135                 .ncopies        = 1,
 136                 .nparity        = 1,
 137                 .raid_name      = "raid5",
 138                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID5,
 139                 .mindev_error   = BTRFS_ERROR_DEV_RAID5_MIN_NOT_MET,
 140         },
 141         [BTRFS_RAID_RAID6] = {
 142                 .sub_stripes    = 1,
 143                 .dev_stripes    = 1,
 144                 .devs_max       = 0,
 145                 .devs_min       = 3,
 146                 .tolerated_failures = 2,
 147                 .devs_increment = 1,
 148                 .ncopies        = 1,
 149                 .nparity        = 2,
 150                 .raid_name      = "raid6",
 151                 .bg_flag        = BTRFS_BLOCK_GROUP_RAID6,
 152                 .mindev_error   = BTRFS_ERROR_DEV_RAID6_MIN_NOT_MET,
 153         },
 154 };
 155
 156 const char *btrfs_bg_type_to_raid_name(u64 flags)
 157 {
 158         const int index = btrfs_bg_flags_to_raid_index(flags);
 159
 160         if (index >= BTRFS_NR_RAID_TYPES)
 161                 return NULL;
 162
 163         return btrfs_raid_array[index].raid_name;
 164 }
 165
 166 /*
 167  * Fill @buf with textual description of @bg_flags, no more than @size_buf
 168  * bytes including terminating null byte.
 169  */
 170 void btrfs_describe_block_groups(u64 bg_flags, char *buf, u32 size_buf)
 171 {
 172         int i;
 173         int ret;
 174         char *bp = buf;
 175         u64 flags = bg_flags;
 176         u32 size_bp = size_buf;
 177
 178         if (!flags) {
 179                 strcpy(bp, "NONE");
 180                 return;
 181         }
 182
 183 #define DESCRIBE_FLAG(flag, desc)                                               \
 184         do {                                                            \
 185                 if (flags & (flag)) {                                   \
 186                         ret = snprintf(bp, size_bp, "%s|", (desc));     \
 187                         if (ret < 0 || ret >= size_bp)                  \
 188                                 goto out_overflow;                      \
 189                         size_bp -= ret;                                 \
 190                         bp += ret;                                      \
 191                         flags &= ~(flag);                               \
 192                 }                                                       \
 193         } while (0)
 194
 195         DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_DATA, "data");
 196         DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_SYSTEM, "system");
 197         DESCRIBE_FLAG(BTRFS_BLOCK_GROUP_METADATA, "metadata");
 198
 199         DESCRIBE_FLAG(BTRFS_AVAIL_ALLOC_BIT_SINGLE, "single");
 200         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++)
 201                 DESCRIBE_FLAG(btrfs_raid_array[i].bg_flag,
 202                               btrfs_raid_array[i].raid_name);
 203 #undef DESCRIBE_FLAG
 204
 205         if (flags) {
 206                 ret = snprintf(bp, size_bp, "0x%llx|", flags);
 207                 size_bp -= ret;
 208         }
 209
 210         if (size_bp < size_buf)
 211                 buf[size_buf - size_bp - 1] = '\0'; /* remove last | */
 212
 213         /*
 214          * The text is trimmed, it's up to the caller to provide sufficiently
 215          * large buffer
 216          */
 217 out_overflow:;
 218 }
 219
 220 static int init_first_rw_device(struct btrfs_trans_handle *trans);
 221 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info);
 222 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev);
 223 static void btrfs_dev_stat_print_on_load(struct btrfs_device *device);
 224 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
 225                              enum btrfs_map_op op,
 226                              u64 logical, u64 *length,
 227                              struct btrfs_bio **bbio_ret,
 228                              int mirror_num, int need_raid_map);
 229
 230 /*
 231  * Device locking
 232  * ==============
 233  *
 234  * There are several mutexes that protect manipulation of devices and low-level
 235  * structures like chunks but not block groups, extents or files
 236  *
 237  * uuid_mutex (global lock)
 238  * ------------------------
 239  * protects the fs_uuids list that tracks all per-fs fs_devices, resulting from
 240  * the SCAN_DEV ioctl registration or from mount either implicitly (the first
 241  * device) or requested by the device= mount option
 242  *
 243  * the mutex can be very coarse and can cover long-running operations
 244  *
 245  * protects: updates to fs_devices counters like missing devices, rw devices,
 246  * seeding, structure cloning, opening/closing devices at mount/umount time
 247  *
 248  * global::fs_devs - add, remove, updates to the global list
 249  *
 250  * does not protect: manipulation of the fs_devices::devices list in general
 251  * but in mount context it could be used to exclude list modifications by eg.
 252  * scan ioctl
 253  *
 254  * btrfs_device::name - renames (write side), read is RCU
 255  *
 256  * fs_devices::device_list_mutex (per-fs, with RCU)
 257  * ------------------------------------------------
 258  * protects updates to fs_devices::devices, ie. adding and deleting
 259  *
 260  * simple list traversal with read-only actions can be done with RCU protection
 261  *
 262  * may be used to exclude some operations from running concurrently without any
 263  * modifications to the list (see write_all_supers)
 264  *
 265  * Is not required at mount and close times, because our device list is
 266  * protected by the uuid_mutex at that point.
 267  *
 268  * balance_mutex
 269  * -------------
 270  * protects balance structures (status, state) and context accessed from
 271  * several places (internally, ioctl)
 272  *
 273  * chunk_mutex
 274  * -----------
 275  * protects chunks, adding or removing during allocation, trim or when a new
 276  * device is added/removed. Additionally it also protects post_commit_list of
 277  * individual devices, since they can be added to the transaction's
 278  * post_commit_list only with chunk_mutex held.
 279  *
 280  * cleaner_mutex
 281  * -------------
 282  * a big lock that is held by the cleaner thread and prevents running subvolume
 283  * cleaning together with relocation or delayed iputs
 284  *
 285  *
 286  * Lock nesting
 287  * ============
 288  *
 289  * uuid_mutex
 290  *   device_list_mutex
 291  *     chunk_mutex
 292  *   balance_mutex
 293  *
 294  *
 295  * Exclusive operations
 296  * ====================
 297  *
 298  * Maintains the exclusivity of the following operations that apply to the
 299  * whole filesystem and cannot run in parallel.
 300  *
 301  * - Balance (*)
 302  * - Device add
 303  * - Device remove
 304  * - Device replace (*)
 305  * - Resize
 306  *
 307  * The device operations (as above) can be in one of the following states:
 308  *
 309  * - Running state
 310  * - Paused state
 311  * - Completed state
 312  *
 313  * Only device operations marked with (*) can go into the Paused state for the
 314  * following reasons:
 315  *
 316  * - ioctl (only Balance can be Paused through ioctl)
 317  * - filesystem remounted as read-only
 318  * - filesystem unmounted and mounted as read-only
 319  * - system power-cycle and filesystem mounted as read-only
 320  * - filesystem or device errors leading to forced read-only
 321  *
 322  * The status of exclusive operation is set and cleared atomically.
 323  * During the course of Paused state, fs_info::exclusive_operation remains set.
 324  * A device operation in Paused or Running state can be canceled or resumed
 325  * either by ioctl (Balance only) or when remounted as read-write.
 326  * The exclusive status is cleared when the device operation is canceled or
 327  * completed.
 328  */
 329
 330 DEFINE_MUTEX(uuid_mutex);
 331 static LIST_HEAD(fs_uuids);
 332 struct list_head * __attribute_const__ btrfs_get_fs_uuids(void)
 333 {
 334         return &fs_uuids;
 335 }
 336
 337 /*
 338  * alloc_fs_devices - allocate struct btrfs_fs_devices
 339  * @fsid:               if not NULL, copy the UUID to fs_devices::fsid
 340  * @metadata_fsid:      if not NULL, copy the UUID to fs_devices::metadata_fsid
 341  *
 342  * Return a pointer to a new struct btrfs_fs_devices on success, or ERR_PTR().
 343  * The returned struct is not linked onto any lists and can be destroyed with
 344  * kfree() right away.
 345  */
 346 static struct btrfs_fs_devices *alloc_fs_devices(const u8 *fsid,
 347                                                  const u8 *metadata_fsid)
 348 {
 349         struct btrfs_fs_devices *fs_devs;
 350
 351         fs_devs = kzalloc(sizeof(*fs_devs), GFP_KERNEL);
 352         if (!fs_devs)
 353                 return ERR_PTR(-ENOMEM);
 354
 355         mutex_init(&fs_devs->device_list_mutex);
 356
 357         INIT_LIST_HEAD(&fs_devs->devices);
 358         INIT_LIST_HEAD(&fs_devs->alloc_list);
 359         INIT_LIST_HEAD(&fs_devs->fs_list);
 360         INIT_LIST_HEAD(&fs_devs->seed_list);
 361         if (fsid)
 362                 memcpy(fs_devs->fsid, fsid, BTRFS_FSID_SIZE);
 363
 364         if (metadata_fsid)
 365                 memcpy(fs_devs->metadata_uuid, metadata_fsid, BTRFS_FSID_SIZE);
 366         else if (fsid)
 367                 memcpy(fs_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE);
 368
 369         return fs_devs;
 370 }
 371
 372 void btrfs_free_device(struct btrfs_device *device)
 373 {
 374         WARN_ON(!list_empty(&device->post_commit_list));
 375         rcu_string_free(device->name);
 376         extent_io_tree_release(&device->alloc_state);
 377         bio_put(device->flush_bio);
 378         kfree(device);
 379 }
 380
 381 static void free_fs_devices(struct btrfs_fs_devices *fs_devices)
 382 {
 383         struct btrfs_device *device;
 384
 385         WARN_ON(fs_devices->opened);
 386         while (!list_empty(&fs_devices->devices)) {
 387                 device = list_entry(fs_devices->devices.next,
 388                                     struct btrfs_device, dev_list);
 389                 list_del(&device->dev_list);
 390                 btrfs_free_device(device);
 391         }
 392         kfree(fs_devices);
 393 }
 394
 395 void __exit btrfs_cleanup_fs_uuids(void)
 396 {
 397         struct btrfs_fs_devices *fs_devices;
 398
 399         while (!list_empty(&fs_uuids)) {
 400                 fs_devices = list_entry(fs_uuids.next,
 401                                         struct btrfs_fs_devices, fs_list);
 402                 list_del(&fs_devices->fs_list);
 403                 free_fs_devices(fs_devices);
 404         }
 405 }
 406
 407 /*
 408  * Returns a pointer to a new btrfs_device on success; ERR_PTR() on error.
 409  * Returned struct is not linked onto any lists and must be destroyed using
 410  * btrfs_free_device.
 411  */
 412 static struct btrfs_device *__alloc_device(struct btrfs_fs_info *fs_info)
 413 {
 414         struct btrfs_device *dev;
 415
 416         dev = kzalloc(sizeof(*dev), GFP_KERNEL);
 417         if (!dev)
 418                 return ERR_PTR(-ENOMEM);
 419
 420         /*
 421          * Preallocate a bio that's always going to be used for flushing device
 422          * barriers and matches the device lifespan
 423          */
 424         dev->flush_bio = bio_alloc_bioset(GFP_KERNEL, 0, NULL);
 425         if (!dev->flush_bio) {
 426                 kfree(dev);
 427                 return ERR_PTR(-ENOMEM);
 428         }
 429
 430         INIT_LIST_HEAD(&dev->dev_list);
 431         INIT_LIST_HEAD(&dev->dev_alloc_list);
 432         INIT_LIST_HEAD(&dev->post_commit_list);
 433
 434         atomic_set(&dev->reada_in_flight, 0);
 435         atomic_set(&dev->dev_stats_ccnt, 0);
 436         btrfs_device_data_ordered_init(dev);
 437         INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
 438         INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM);
 439         extent_io_tree_init(fs_info, &dev->alloc_state,
 440                             IO_TREE_DEVICE_ALLOC_STATE, NULL);
 441
 442         return dev;
 443 }
 444
 445 static noinline struct btrfs_fs_devices *find_fsid(
 446                 const u8 *fsid, const u8 *metadata_fsid)
 447 {
 448         struct btrfs_fs_devices *fs_devices;
 449
 450         ASSERT(fsid);
 451
 452         /* Handle non-split brain cases */
 453         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
 454                 if (metadata_fsid) {
 455                         if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0
 456                             && memcmp(metadata_fsid, fs_devices->metadata_uuid,
 457                                       BTRFS_FSID_SIZE) == 0)
 458                                 return fs_devices;
 459                 } else {
 460                         if (memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE) == 0)
 461                                 return fs_devices;
 462                 }
 463         }
 464         return NULL;
 465 }
 466
 467 static struct btrfs_fs_devices *find_fsid_with_metadata_uuid(
 468                                 struct btrfs_super_block *disk_super)
 469 {
 470
 471         struct btrfs_fs_devices *fs_devices;
 472
 473         /*
 474          * Handle scanned device having completed its fsid change but
 475          * belonging to a fs_devices that was created by first scanning
 476          * a device which didn't have its fsid/metadata_uuid changed
 477          * at all and the CHANGING_FSID_V2 flag set.
 478          */
 479         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
 480                 if (fs_devices->fsid_change &&
 481                     memcmp(disk_super->metadata_uuid, fs_devices->fsid,
 482                            BTRFS_FSID_SIZE) == 0 &&
 483                     memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
 484                            BTRFS_FSID_SIZE) == 0) {
 485                         return fs_devices;
 486                 }
 487         }
 488         /*
 489          * Handle scanned device having completed its fsid change but
 490          * belonging to a fs_devices that was created by a device that
 491          * has an outdated pair of fsid/metadata_uuid and
 492          * CHANGING_FSID_V2 flag set.
 493          */
 494         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
 495                 if (fs_devices->fsid_change &&
 496                     memcmp(fs_devices->metadata_uuid,
 497                            fs_devices->fsid, BTRFS_FSID_SIZE) != 0 &&
 498                     memcmp(disk_super->metadata_uuid, fs_devices->metadata_uuid,
 499                            BTRFS_FSID_SIZE) == 0) {
 500                         return fs_devices;
 501                 }
 502         }
 503
 504         return find_fsid(disk_super->fsid, disk_super->metadata_uuid);
 505 }
 506
 507
 508 static int
 509 btrfs_get_bdev_and_sb(const char *device_path, fmode_t flags, void *holder,
 510                       int flush, struct block_device **bdev,
 511                       struct btrfs_super_block **disk_super)
 512 {
 513         int ret;
 514
 515         *bdev = blkdev_get_by_path(device_path, flags, holder);
 516
 517         if (IS_ERR(*bdev)) {
 518                 ret = PTR_ERR(*bdev);
 519                 goto error;
 520         }
 521
 522         if (flush)
 523                 filemap_write_and_wait((*bdev)->bd_inode->i_mapping);
 524         ret = set_blocksize(*bdev, BTRFS_BDEV_BLOCKSIZE);
 525         if (ret) {
 526                 blkdev_put(*bdev, flags);
 527                 goto error;
 528         }
 529         invalidate_bdev(*bdev);
 530         *disk_super = btrfs_read_dev_super(*bdev);
 531         if (IS_ERR(*disk_super)) {
 532                 ret = PTR_ERR(*disk_super);
 533                 blkdev_put(*bdev, flags);
 534                 goto error;
 535         }
 536
 537         return 0;
 538
 539 error:
 540         *bdev = NULL;
 541         return ret;
 542 }
 543
 544 /*
 545  * Check if the device in the path matches the device in the given struct device.
 546  *
 547  * Returns:
 548  *   true  If it is the same device.
 549  *   false If it is not the same device or on error.
 550  */
 551 static bool device_matched(const struct btrfs_device *device, const char *path)
 552 {
 553         char *device_name;
 554         struct block_device *bdev_old;
 555         struct block_device *bdev_new;
 556
 557         /*
 558          * If we are looking for a device with the matching dev_t, then skip
 559          * device without a name (a missing device).
 560          */
 561         if (!device->name)
 562                 return false;
 563
 564         device_name = kzalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL);
 565         if (!device_name)
 566                 return false;
 567
 568         rcu_read_lock();
 569         scnprintf(device_name, BTRFS_PATH_NAME_MAX, "%s", rcu_str_deref(device->name));
 570         rcu_read_unlock();
 571
 572         bdev_old = lookup_bdev(device_name);
 573         kfree(device_name);
 574         if (IS_ERR(bdev_old))
 575                 return false;
 576
 577         bdev_new = lookup_bdev(path);
 578         if (IS_ERR(bdev_new))
 579                 return false;
 580
 581         if (bdev_old == bdev_new)
 582                 return true;
 583
 584         return false;
 585 }
 586
 587 /*
 588  *  Search and remove all stale (devices which are not mounted) devices.
 589  *  When both inputs are NULL, it will search and release all stale devices.
 590  *  path:       Optional. When provided will it release all unmounted devices
 591  *              matching this path only.
 592  *  skip_dev:   Optional. Will skip this device when searching for the stale
 593  *              devices.
 594  *  Return:     0 for success or if @path is NULL.
 595  *              -EBUSY if @path is a mounted device.
 596  *              -ENOENT if @path does not match any device in the list.
 597  */
 598 static int btrfs_free_stale_devices(const char *path,
 599                                      struct btrfs_device *skip_device)
 600 {
 601         struct btrfs_fs_devices *fs_devices, *tmp_fs_devices;
 602         struct btrfs_device *device, *tmp_device;
 603         int ret = 0;
 604
 605         lockdep_assert_held(&uuid_mutex);
 606
 607         if (path)
 608                 ret = -ENOENT;
 609
 610         list_for_each_entry_safe(fs_devices, tmp_fs_devices, &fs_uuids, fs_list) {
 611
 612                 mutex_lock(&fs_devices->device_list_mutex);
 613                 list_for_each_entry_safe(device, tmp_device,
 614                                          &fs_devices->devices, dev_list) {
 615                         if (skip_device && skip_device == device)
 616                                 continue;
 617                         if (path && !device_matched(device, path))
 618                                 continue;
 619                         if (fs_devices->opened) {
 620                                 /* for an already deleted device return 0 */
 621                                 if (path && ret != 0)
 622                                         ret = -EBUSY;
 623                                 break;
 624                         }
 625
 626                         /* delete the stale device */
 627                         fs_devices->num_devices--;
 628                         list_del(&device->dev_list);
 629                         btrfs_free_device(device);
 630
 631                         ret = 0;
 632                 }
 633                 mutex_unlock(&fs_devices->device_list_mutex);
 634
 635                 if (fs_devices->num_devices == 0) {
 636                         btrfs_sysfs_remove_fsid(fs_devices);
 637                         list_del(&fs_devices->fs_list);
 638                         free_fs_devices(fs_devices);
 639                 }
 640         }
 641
 642         return ret;
 643 }
 644
 645 /*
 646  * This is only used on mount, and we are protected from competing things
 647  * messing with our fs_devices by the uuid_mutex, thus we do not need the
 648  * fs_devices->device_list_mutex here.
 649  */
 650 static int btrfs_open_one_device(struct btrfs_fs_devices *fs_devices,
 651                         struct btrfs_device *device, fmode_t flags,
 652                         void *holder)
 653 {
 654         struct request_queue *q;
 655         struct block_device *bdev;
 656         struct btrfs_super_block *disk_super;
 657         u64 devid;
 658         int ret;
 659
 660         if (device->bdev)
 661                 return -EINVAL;
 662         if (!device->name)
 663                 return -EINVAL;
 664
 665         ret = btrfs_get_bdev_and_sb(device->name->str, flags, holder, 1,
 666                                     &bdev, &disk_super);
 667         if (ret)
 668                 return ret;
 669
 670         devid = btrfs_stack_device_id(&disk_super->dev_item);
 671         if (devid != device->devid)
 672                 goto error_free_page;
 673
 674         if (memcmp(device->uuid, disk_super->dev_item.uuid, BTRFS_UUID_SIZE))
 675                 goto error_free_page;
 676
 677         device->generation = btrfs_super_generation(disk_super);
 678
 679         if (btrfs_super_flags(disk_super) & BTRFS_SUPER_FLAG_SEEDING) {
 680                 if (btrfs_super_incompat_flags(disk_super) &
 681                     BTRFS_FEATURE_INCOMPAT_METADATA_UUID) {
 682                         pr_err(
 683                 "BTRFS: Invalid seeding and uuid-changed device detected\n");
 684                         goto error_free_page;
 685                 }
 686
 687                 clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
 688                 fs_devices->seeding = true;
 689         } else {
 690                 if (bdev_read_only(bdev))
 691                         clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
 692                 else
 693                         set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
 694         }
 695
 696         q = bdev_get_queue(bdev);
 697         if (!blk_queue_nonrot(q))
 698                 fs_devices->rotating = true;
 699
 700         device->bdev = bdev;
 701         clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
 702         device->mode = flags;
 703
 704         fs_devices->open_devices++;
 705         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
 706             device->devid != BTRFS_DEV_REPLACE_DEVID) {
 707                 fs_devices->rw_devices++;
 708                 list_add_tail(&device->dev_alloc_list, &fs_devices->alloc_list);
 709         }
 710         btrfs_release_disk_super(disk_super);
 711
 712         return 0;
 713
 714 error_free_page:
 715         btrfs_release_disk_super(disk_super);
 716         blkdev_put(bdev, flags);
 717
 718         return -EINVAL;
 719 }
 720
 721 u8 *btrfs_sb_fsid_ptr(struct btrfs_super_block *sb)
 722 {
 723         bool has_metadata_uuid = (btrfs_super_incompat_flags(sb) &
 724                                   BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
 725
 726         return has_metadata_uuid ? sb->metadata_uuid : sb->fsid;
 727 }
 728
 729 /*
 730  * Handle scanned device having its CHANGING_FSID_V2 flag set and the fs_devices
 731  * being created with a disk that has already completed its fsid change. Such
 732  * disk can belong to an fs which has its FSID changed or to one which doesn't.
 733  * Handle both cases here.
 734  */
 735 static struct btrfs_fs_devices *find_fsid_inprogress(
 736                                         struct btrfs_super_block *disk_super)
 737 {
 738         struct btrfs_fs_devices *fs_devices;
 739
 740         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
 741                 if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
 742                            BTRFS_FSID_SIZE) != 0 &&
 743                     memcmp(fs_devices->metadata_uuid, disk_super->fsid,
 744                            BTRFS_FSID_SIZE) == 0 && !fs_devices->fsid_change) {
 745                         return fs_devices;
 746                 }
 747         }
 748
 749         return find_fsid(disk_super->fsid, NULL);
 750 }
 751
 752
 753 static struct btrfs_fs_devices *find_fsid_changed(
 754                                         struct btrfs_super_block *disk_super)
 755 {
 756         struct btrfs_fs_devices *fs_devices;
 757
 758         /*
 759          * Handles the case where scanned device is part of an fs that had
 760          * multiple successful changes of FSID but curently device didn't
 761          * observe it. Meaning our fsid will be different than theirs. We need
 762          * to handle two subcases :
 763          *  1 - The fs still continues to have different METADATA/FSID uuids.
 764          *  2 - The fs is switched back to its original FSID (METADATA/FSID
 765          *  are equal).
 766          */
 767         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
 768                 /* Changed UUIDs */
 769                 if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
 770                            BTRFS_FSID_SIZE) != 0 &&
 771                     memcmp(fs_devices->metadata_uuid, disk_super->metadata_uuid,
 772                            BTRFS_FSID_SIZE) == 0 &&
 773                     memcmp(fs_devices->fsid, disk_super->fsid,
 774                            BTRFS_FSID_SIZE) != 0)
 775                         return fs_devices;
 776
 777                 /* Unchanged UUIDs */
 778                 if (memcmp(fs_devices->metadata_uuid, fs_devices->fsid,
 779                            BTRFS_FSID_SIZE) == 0 &&
 780                     memcmp(fs_devices->fsid, disk_super->metadata_uuid,
 781                            BTRFS_FSID_SIZE) == 0)
 782                         return fs_devices;
 783         }
 784
 785         return NULL;
 786 }
 787
 788 static struct btrfs_fs_devices *find_fsid_reverted_metadata(
 789                                 struct btrfs_super_block *disk_super)
 790 {
 791         struct btrfs_fs_devices *fs_devices;
 792
 793         /*
 794          * Handle the case where the scanned device is part of an fs whose last
 795          * metadata UUID change reverted it to the original FSID. At the same
 796          * time * fs_devices was first created by another constitutent device
 797          * which didn't fully observe the operation. This results in an
 798          * btrfs_fs_devices created with metadata/fsid different AND
 799          * btrfs_fs_devices::fsid_change set AND the metadata_uuid of the
 800          * fs_devices equal to the FSID of the disk.
 801          */
 802         list_for_each_entry(fs_devices, &fs_uuids, fs_list) {
 803                 if (memcmp(fs_devices->fsid, fs_devices->metadata_uuid,
 804                            BTRFS_FSID_SIZE) != 0 &&
 805                     memcmp(fs_devices->metadata_uuid, disk_super->fsid,
 806                            BTRFS_FSID_SIZE) == 0 &&
 807                     fs_devices->fsid_change)
 808                         return fs_devices;
 809         }
 810
 811         return NULL;
 812 }
 813 /*
 814  * Add new device to list of registered devices
 815  *
 816  * Returns:
 817  * device pointer which was just added or updated when successful
 818  * error pointer when failed
 819  */
 820 static noinline struct btrfs_device *device_list_add(const char *path,
 821                            struct btrfs_super_block *disk_super,
 822                            bool *new_device_added)
 823 {
 824         struct btrfs_device *device;
 825         struct btrfs_fs_devices *fs_devices = NULL;
 826         struct rcu_string *name;
 827         u64 found_transid = btrfs_super_generation(disk_super);
 828         u64 devid = btrfs_stack_device_id(&disk_super->dev_item);
 829         bool has_metadata_uuid = (btrfs_super_incompat_flags(disk_super) &
 830                 BTRFS_FEATURE_INCOMPAT_METADATA_UUID);
 831         bool fsid_change_in_progress = (btrfs_super_flags(disk_super) &
 832                                         BTRFS_SUPER_FLAG_CHANGING_FSID_V2);
 833
 834         if (fsid_change_in_progress) {
 835                 if (!has_metadata_uuid)
 836                         fs_devices = find_fsid_inprogress(disk_super);
 837                 else
 838                         fs_devices = find_fsid_changed(disk_super);
 839         } else if (has_metadata_uuid) {
 840                 fs_devices = find_fsid_with_metadata_uuid(disk_super);
 841         } else {
 842                 fs_devices = find_fsid_reverted_metadata(disk_super);
 843                 if (!fs_devices)
 844                         fs_devices = find_fsid(disk_super->fsid, NULL);
 845         }
 846
 847
 848         if (!fs_devices) {
 849                 if (has_metadata_uuid)
 850                         fs_devices = alloc_fs_devices(disk_super->fsid,
 851                                                       disk_super->metadata_uuid);
 852                 else
 853                         fs_devices = alloc_fs_devices(disk_super->fsid, NULL);
 854
 855                 if (IS_ERR(fs_devices))
 856                         return ERR_CAST(fs_devices);
 857
 858                 fs_devices->fsid_change = fsid_change_in_progress;
 859
 860                 mutex_lock(&fs_devices->device_list_mutex);
 861                 list_add(&fs_devices->fs_list, &fs_uuids);
 862
 863                 device = NULL;
 864         } else {
 865                 mutex_lock(&fs_devices->device_list_mutex);
 866                 device = btrfs_find_device(fs_devices, devid,
 867                                 disk_super->dev_item.uuid, NULL, false);
 868
 869                 /*
 870                  * If this disk has been pulled into an fs devices created by
 871                  * a device which had the CHANGING_FSID_V2 flag then replace the
 872                  * metadata_uuid/fsid values of the fs_devices.
 873                  */
 874                 if (fs_devices->fsid_change &&
 875                     found_transid > fs_devices->latest_generation) {
 876                         memcpy(fs_devices->fsid, disk_super->fsid,
 877                                         BTRFS_FSID_SIZE);
 878
 879                         if (has_metadata_uuid)
 880                                 memcpy(fs_devices->metadata_uuid,
 881                                        disk_super->metadata_uuid,
 882                                        BTRFS_FSID_SIZE);
 883                         else
 884                                 memcpy(fs_devices->metadata_uuid,
 885                                        disk_super->fsid, BTRFS_FSID_SIZE);
 886
 887                         fs_devices->fsid_change = false;
 888                 }
 889         }
 890
 891         if (!device) {
 892                 if (fs_devices->opened) {
 893                         mutex_unlock(&fs_devices->device_list_mutex);
 894                         return ERR_PTR(-EBUSY);
 895                 }
 896
 897                 device = btrfs_alloc_device(NULL, &devid,
 898                                             disk_super->dev_item.uuid);
 899                 if (IS_ERR(device)) {
 900                         mutex_unlock(&fs_devices->device_list_mutex);
 901                         /* we can safely leave the fs_devices entry around */
 902                         return device;
 903                 }
 904
 905                 name = rcu_string_strdup(path, GFP_NOFS);
 906                 if (!name) {
 907                         btrfs_free_device(device);
 908                         mutex_unlock(&fs_devices->device_list_mutex);
 909                         return ERR_PTR(-ENOMEM);
 910                 }
 911                 rcu_assign_pointer(device->name, name);
 912
 913                 list_add_rcu(&device->dev_list, &fs_devices->devices);
 914                 fs_devices->num_devices++;
 915
 916                 device->fs_devices = fs_devices;
 917                 *new_device_added = true;
 918
 919                 if (disk_super->label[0])
 920                         pr_info(
 921         "BTRFS: device label %s devid %llu transid %llu %s scanned by %s (%d)\n",
 922                                 disk_super->label, devid, found_transid, path,
 923                                 current->comm, task_pid_nr(current));
 924                 else
 925                         pr_info(
 926         "BTRFS: device fsid %pU devid %llu transid %llu %s scanned by %s (%d)\n",
 927                                 disk_super->fsid, devid, found_transid, path,
 928                                 current->comm, task_pid_nr(current));
 929
 930         } else if (!device->name || strcmp(device->name->str, path)) {
 931                 /*
 932                  * When FS is already mounted.
 933                  * 1. If you are here and if the device->name is NULL that
 934                  *    means this device was missing at time of FS mount.
 935                  * 2. If you are here and if the device->name is different
 936                  *    from 'path' that means either
 937                  *      a. The same device disappeared and reappeared with
 938                  *         different name. or
 939                  *      b. The missing-disk-which-was-replaced, has
 940                  *         reappeared now.
 941                  *
 942                  * We must allow 1 and 2a above. But 2b would be a spurious
 943                  * and unintentional.
 944                  *
 945                  * Further in case of 1 and 2a above, the disk at 'path'
 946                  * would have missed some transaction when it was away and
 947                  * in case of 2a the stale bdev has to be updated as well.
 948                  * 2b must not be allowed at all time.
 949                  */
 950
 951                 /*
 952                  * For now, we do allow update to btrfs_fs_device through the
 953                  * btrfs dev scan cli after FS has been mounted.  We're still
 954                  * tracking a problem where systems fail mount by subvolume id
 955                  * when we reject replacement on a mounted FS.
 956                  */
 957                 if (!fs_devices->opened && found_transid < device->generation) {
 958                         /*
 959                          * That is if the FS is _not_ mounted and if you
 960                          * are here, that means there is more than one
 961                          * disk with same uuid and devid.We keep the one
 962                          * with larger generation number or the last-in if
 963                          * generation are equal.
 964                          */
 965                         mutex_unlock(&fs_devices->device_list_mutex);
 966                         return ERR_PTR(-EEXIST);
 967                 }
 968
 969                 /*
 970                  * We are going to replace the device path for a given devid,
 971                  * make sure it's the same device if the device is mounted
 972                  */
 973                 if (device->bdev) {
 974                         struct block_device *path_bdev;
 975
 976                         path_bdev = lookup_bdev(path);
 977                         if (IS_ERR(path_bdev)) {
 978                                 mutex_unlock(&fs_devices->device_list_mutex);
 979                                 return ERR_CAST(path_bdev);
 980                         }
 981
 982                         if (device->bdev != path_bdev) {
 983                                 bdput(path_bdev);
 984                                 mutex_unlock(&fs_devices->device_list_mutex);
 985                                 /*
 986                                  * device->fs_info may not be reliable here, so
 987                                  * pass in a NULL instead. This avoids a
 988                                  * possible use-after-free when the fs_info and
 989                                  * fs_info->sb are already torn down.
 990                                  */
 991                                 btrfs_warn_in_rcu(NULL,
 992         "duplicate device %s devid %llu generation %llu scanned by %s (%d)",
 993                                                   path, devid, found_transid,
 994                                                   current->comm,
 995                                                   task_pid_nr(current));
 996                                 return ERR_PTR(-EEXIST);
 997                         }
 998                         bdput(path_bdev);
 999                         btrfs_info_in_rcu(device->fs_info,
1000         "devid %llu device path %s changed to %s scanned by %s (%d)",
1001                                           devid, rcu_str_deref(device->name),
1002                                           path, current->comm,
1003                                           task_pid_nr(current));
1004                 }
1005
1006                 name = rcu_string_strdup(path, GFP_NOFS);
1007                 if (!name) {
1008                         mutex_unlock(&fs_devices->device_list_mutex);
1009                         return ERR_PTR(-ENOMEM);
1010                 }
1011                 rcu_string_free(device->name);
1012                 rcu_assign_pointer(device->name, name);
1013                 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
1014                         fs_devices->missing_devices--;
1015                         clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
1016                 }
1017         }
1018
1019         /*
1020          * Unmount does not free the btrfs_device struct but would zero
1021          * generation along with most of the other members. So just update
1022          * it back. We need it to pick the disk with largest generation
1023          * (as above).
1024          */
1025         if (!fs_devices->opened) {
1026                 device->generation = found_transid;
1027                 fs_devices->latest_generation = max_t(u64, found_transid,
1028                                                 fs_devices->latest_generation);
1029         }
1030
1031         fs_devices->total_devices = btrfs_super_num_devices(disk_super);
1032
1033         mutex_unlock(&fs_devices->device_list_mutex);
1034         return device;
1035 }
1036
1037 static struct btrfs_fs_devices *clone_fs_devices(struct btrfs_fs_devices *orig)
1038 {
1039         struct btrfs_fs_devices *fs_devices;
1040         struct btrfs_device *device;
1041         struct btrfs_device *orig_dev;
1042         int ret = 0;
1043
1044         lockdep_assert_held(&uuid_mutex);
1045
1046         fs_devices = alloc_fs_devices(orig->fsid, NULL);
1047         if (IS_ERR(fs_devices))
1048                 return fs_devices;
1049
1050         fs_devices->total_devices = orig->total_devices;
1051
1052         list_for_each_entry(orig_dev, &orig->devices, dev_list) {
1053                 struct rcu_string *name;
1054
1055                 device = btrfs_alloc_device(NULL, &orig_dev->devid,
1056                                             orig_dev->uuid);
1057                 if (IS_ERR(device)) {
1058                         ret = PTR_ERR(device);
1059                         goto error;
1060                 }
1061
1062                 /*
1063                  * This is ok to do without rcu read locked because we hold the
1064                  * uuid mutex so nothing we touch in here is going to disappear.
1065                  */
1066                 if (orig_dev->name) {
1067                         name = rcu_string_strdup(orig_dev->name->str,
1068                                         GFP_KERNEL);
1069                         if (!name) {
1070                                 btrfs_free_device(device);
1071                                 ret = -ENOMEM;
1072                                 goto error;
1073                         }
1074                         rcu_assign_pointer(device->name, name);
1075                 }
1076
1077                 list_add(&device->dev_list, &fs_devices->devices);
1078                 device->fs_devices = fs_devices;
1079                 fs_devices->num_devices++;
1080         }
1081         return fs_devices;
1082 error:
1083         free_fs_devices(fs_devices);
1084         return ERR_PTR(ret);
1085 }
1086
1087 static void __btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices,
1088                                       int step, struct btrfs_device **latest_dev)
1089 {
1090         struct btrfs_device *device, *next;
1091
1092         /* This is the initialized path, it is safe to release the devices. */
1093         list_for_each_entry_safe(device, next, &fs_devices->devices, dev_list) {
1094                 if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state)) {
1095                         if (!test_bit(BTRFS_DEV_STATE_REPLACE_TGT,
1096                                       &device->dev_state) &&
1097                             !test_bit(BTRFS_DEV_STATE_MISSING,
1098                                       &device->dev_state) &&
1099                             (!*latest_dev ||
1100                              device->generation > (*latest_dev)->generation)) {
1101                                 *latest_dev = device;
1102                         }
1103                         continue;
1104                 }
1105
1106                 /*
1107                  * We have already validated the presence of BTRFS_DEV_REPLACE_DEVID,
1108                  * in btrfs_init_dev_replace() so just continue.
1109                  */
1110                 if (device->devid == BTRFS_DEV_REPLACE_DEVID)
1111                         continue;
1112
1113                 if (device->bdev) {
1114                         blkdev_put(device->bdev, device->mode);
1115                         device->bdev = NULL;
1116                         fs_devices->open_devices--;
1117                 }
1118                 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1119                         list_del_init(&device->dev_alloc_list);
1120                         clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
1121                         fs_devices->rw_devices--;
1122                 }
1123                 list_del_init(&device->dev_list);
1124                 fs_devices->num_devices--;
1125                 btrfs_free_device(device);
1126         }
1127
1128 }
1129
1130 /*
1131  * After we have read the system tree and know devids belonging to this
1132  * filesystem, remove the device which does not belong there.
1133  */
1134 void btrfs_free_extra_devids(struct btrfs_fs_devices *fs_devices, int step)
1135 {
1136         struct btrfs_device *latest_dev = NULL;
1137         struct btrfs_fs_devices *seed_dev;
1138
1139         mutex_lock(&uuid_mutex);
1140         __btrfs_free_extra_devids(fs_devices, step, &latest_dev);
1141
1142         list_for_each_entry(seed_dev, &fs_devices->seed_list, seed_list)
1143                 __btrfs_free_extra_devids(seed_dev, step, &latest_dev);
1144
1145         fs_devices->latest_bdev = latest_dev->bdev;
1146
1147         mutex_unlock(&uuid_mutex);
1148 }
1149
1150 static void btrfs_close_bdev(struct btrfs_device *device)
1151 {
1152         if (!device->bdev)
1153                 return;
1154
1155         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1156                 sync_blockdev(device->bdev);
1157                 invalidate_bdev(device->bdev);
1158         }
1159
1160         blkdev_put(device->bdev, device->mode);
1161 }
1162
1163 static void btrfs_close_one_device(struct btrfs_device *device)
1164 {
1165         struct btrfs_fs_devices *fs_devices = device->fs_devices;
1166
1167         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
1168             device->devid != BTRFS_DEV_REPLACE_DEVID) {
1169                 list_del_init(&device->dev_alloc_list);
1170                 fs_devices->rw_devices--;
1171         }
1172
1173         if (device->devid == BTRFS_DEV_REPLACE_DEVID)
1174                 clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
1175
1176         if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
1177                 clear_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
1178                 fs_devices->missing_devices--;
1179         }
1180
1181         btrfs_close_bdev(device);
1182         if (device->bdev) {
1183                 fs_devices->open_devices--;
1184                 device->bdev = NULL;
1185         }
1186         clear_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
1187
1188         device->fs_info = NULL;
1189         atomic_set(&device->dev_stats_ccnt, 0);
1190         extent_io_tree_release(&device->alloc_state);
1191
1192         /*
1193          * Reset the flush error record. We might have a transient flush error
1194          * in this mount, and if so we aborted the current transaction and set
1195          * the fs to an error state, guaranteeing no super blocks can be further
1196          * committed. However that error might be transient and if we unmount the
1197          * filesystem and mount it again, we should allow the mount to succeed
1198          * (btrfs_check_rw_degradable() should not fail) - if after mounting the
1199          * filesystem again we still get flush errors, then we will again abort
1200          * any transaction and set the error state, guaranteeing no commits of
1201          * unsafe super blocks.
1202          */
1203         device->last_flush_error = 0;
1204
1205         /* Verify the device is back in a pristine state  */
1206         ASSERT(!test_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state));
1207         ASSERT(!test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
1208         ASSERT(list_empty(&device->dev_alloc_list));
1209         ASSERT(list_empty(&device->post_commit_list));
1210         ASSERT(atomic_read(&device->reada_in_flight) == 0);
1211 }
1212
1213 static void close_fs_devices(struct btrfs_fs_devices *fs_devices)
1214 {
1215         struct btrfs_device *device, *tmp;
1216
1217         lockdep_assert_held(&uuid_mutex);
1218
1219         if (--fs_devices->opened > 0)
1220                 return;
1221
1222         list_for_each_entry_safe(device, tmp, &fs_devices->devices, dev_list)
1223                 btrfs_close_one_device(device);
1224
1225         WARN_ON(fs_devices->open_devices);
1226         WARN_ON(fs_devices->rw_devices);
1227         fs_devices->opened = 0;
1228         fs_devices->seeding = false;
1229         fs_devices->fs_info = NULL;
1230 }
1231
1232 void btrfs_close_devices(struct btrfs_fs_devices *fs_devices)
1233 {
1234         LIST_HEAD(list);
1235         struct btrfs_fs_devices *tmp;
1236
1237         mutex_lock(&uuid_mutex);
1238         close_fs_devices(fs_devices);
1239         if (!fs_devices->opened) {
1240                 list_splice_init(&fs_devices->seed_list, &list);
1241
1242                 /*
1243                  * If the struct btrfs_fs_devices is not assembled with any
1244                  * other device, it can be re-initialized during the next mount
1245                  * without the needing device-scan step. Therefore, it can be
1246                  * fully freed.
1247                  */
1248                 if (fs_devices->num_devices == 1) {
1249                         list_del(&fs_devices->fs_list);
1250                         free_fs_devices(fs_devices);
1251                 }
1252         }
1253
1254
1255         list_for_each_entry_safe(fs_devices, tmp, &list, seed_list) {
1256                 close_fs_devices(fs_devices);
1257                 list_del(&fs_devices->seed_list);
1258                 free_fs_devices(fs_devices);
1259         }
1260         mutex_unlock(&uuid_mutex);
1261 }
1262
1263 static int open_fs_devices(struct btrfs_fs_devices *fs_devices,
1264                                 fmode_t flags, void *holder)
1265 {
1266         struct btrfs_device *device;
1267         struct btrfs_device *latest_dev = NULL;
1268         struct btrfs_device *tmp_device;
1269
1270         flags |= FMODE_EXCL;
1271
1272         list_for_each_entry_safe(device, tmp_device, &fs_devices->devices,
1273                                  dev_list) {
1274                 int ret;
1275
1276                 ret = btrfs_open_one_device(fs_devices, device, flags, holder);
1277                 if (ret == 0 &&
1278                     (!latest_dev || device->generation > latest_dev->generation)) {
1279                         latest_dev = device;
1280                 } else if (ret == -ENODATA) {
1281                         fs_devices->num_devices--;
1282                         list_del(&device->dev_list);
1283                         btrfs_free_device(device);
1284                 }
1285         }
1286         if (fs_devices->open_devices == 0)
1287                 return -EINVAL;
1288
1289         fs_devices->opened = 1;
1290         fs_devices->latest_bdev = latest_dev->bdev;
1291         fs_devices->total_rw_bytes = 0;
1292         fs_devices->chunk_alloc_policy = BTRFS_CHUNK_ALLOC_REGULAR;
1293
1294         return 0;
1295 }
1296
1297 static int devid_cmp(void *priv, const struct list_head *a,
1298                      const struct list_head *b)
1299 {
1300         struct btrfs_device *dev1, *dev2;
1301
1302         dev1 = list_entry(a, struct btrfs_device, dev_list);
1303         dev2 = list_entry(b, struct btrfs_device, dev_list);
1304
1305         if (dev1->devid < dev2->devid)
1306                 return -1;
1307         else if (dev1->devid > dev2->devid)
1308                 return 1;
1309         return 0;
1310 }
1311
1312 int btrfs_open_devices(struct btrfs_fs_devices *fs_devices,
1313                        fmode_t flags, void *holder)
1314 {
1315         int ret;
1316
1317         lockdep_assert_held(&uuid_mutex);
1318         /*
1319          * The device_list_mutex cannot be taken here in case opening the
1320          * underlying device takes further locks like bd_mutex.
1321          *
1322          * We also don't need the lock here as this is called during mount and
1323          * exclusion is provided by uuid_mutex
1324          */
1325
1326         if (fs_devices->opened) {
1327                 fs_devices->opened++;
1328                 ret = 0;
1329         } else {
1330                 list_sort(NULL, &fs_devices->devices, devid_cmp);
1331                 ret = open_fs_devices(fs_devices, flags, holder);
1332         }
1333
1334         return ret;
1335 }
1336
1337 void btrfs_release_disk_super(struct btrfs_super_block *super)
1338 {
1339         struct page *page = virt_to_page(super);
1340
1341         put_page(page);
1342 }
1343
1344 static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev,
1345                                                        u64 bytenr)
1346 {
1347         struct btrfs_super_block *disk_super;
1348         struct page *page;
1349         void *p;
1350         pgoff_t index;
1351
1352         /* make sure our super fits in the device */
1353         if (bytenr + PAGE_SIZE >= i_size_read(bdev->bd_inode))
1354                 return ERR_PTR(-EINVAL);
1355
1356         /* make sure our super fits in the page */
1357         if (sizeof(*disk_super) > PAGE_SIZE)
1358                 return ERR_PTR(-EINVAL);
1359
1360         /* make sure our super doesn't straddle pages on disk */
1361         index = bytenr >> PAGE_SHIFT;
1362         if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index)
1363                 return ERR_PTR(-EINVAL);
1364
1365         /* pull in the page with our super */
1366         page = read_cache_page_gfp(bdev->bd_inode->i_mapping, index, GFP_KERNEL);
1367
1368         if (IS_ERR(page))
1369                 return ERR_CAST(page);
1370
1371         p = page_address(page);
1372
1373         /* align our pointer to the offset of the super block */
1374         disk_super = p + offset_in_page(bytenr);
1375
1376         if (btrfs_super_bytenr(disk_super) != bytenr ||
1377             btrfs_super_magic(disk_super) != BTRFS_MAGIC) {
1378                 btrfs_release_disk_super(p);
1379                 return ERR_PTR(-EINVAL);
1380         }
1381
1382         if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1])
1383                 disk_super->label[BTRFS_LABEL_SIZE - 1] = 0;
1384
1385         return disk_super;
1386 }
1387
1388 int btrfs_forget_devices(const char *path)
1389 {
1390         int ret;
1391
1392         mutex_lock(&uuid_mutex);
1393         ret = btrfs_free_stale_devices(strlen(path) ? path : NULL, NULL);
1394         mutex_unlock(&uuid_mutex);
1395
1396         return ret;
1397 }
1398
1399 /*
1400  * Look for a btrfs signature on a device. This may be called out of the mount path
1401  * and we are not allowed to call set_blocksize during the scan. The superblock
1402  * is read via pagecache
1403  */
1404 struct btrfs_device *btrfs_scan_one_device(const char *path, fmode_t flags,
1405                                            void *holder)
1406 {
1407         struct btrfs_super_block *disk_super;
1408         bool new_device_added = false;
1409         struct btrfs_device *device = NULL;
1410         struct block_device *bdev;
1411         u64 bytenr;
1412
1413         lockdep_assert_held(&uuid_mutex);
1414
1415         /*
1416          * we would like to check all the supers, but that would make
1417          * a btrfs mount succeed after a mkfs from a different FS.
1418          * So, we need to add a special mount option to scan for
1419          * later supers, using BTRFS_SUPER_MIRROR_MAX instead
1420          */
1421         bytenr = btrfs_sb_offset(0);
1422
1423         /*
1424          * Avoid using flag |= FMODE_EXCL here, as the systemd-udev may
1425          * initiate the device scan which may race with the user's mount
1426          * or mkfs command, resulting in failure.
1427          * Since the device scan is solely for reading purposes, there is
1428          * no need for FMODE_EXCL. Additionally, the devices are read again
1429          * during the mount process. It is ok to get some inconsistent
1430          * values temporarily, as the device paths of the fsid are the only
1431          * required information for assembling the volume.
1432          */
1433         bdev = blkdev_get_by_path(path, flags, holder);
1434         if (IS_ERR(bdev))
1435                 return ERR_CAST(bdev);
1436
1437         disk_super = btrfs_read_disk_super(bdev, bytenr);
1438         if (IS_ERR(disk_super)) {
1439                 device = ERR_CAST(disk_super);
1440                 goto error_bdev_put;
1441         }
1442
1443         device = device_list_add(path, disk_super, &new_device_added);
1444         if (!IS_ERR(device)) {
1445                 if (new_device_added)
1446                         btrfs_free_stale_devices(path, device);
1447         }
1448
1449         btrfs_release_disk_super(disk_super);
1450
1451 error_bdev_put:
1452         blkdev_put(bdev, flags);
1453
1454         return device;
1455 }
1456
1457 /*
1458  * Try to find a chunk that intersects [start, start + len] range and when one
1459  * such is found, record the end of it in *start
1460  */
1461 static bool contains_pending_extent(struct btrfs_device *device, u64 *start,
1462                                     u64 len)
1463 {
1464         u64 physical_start, physical_end;
1465
1466         lockdep_assert_held(&device->fs_info->chunk_mutex);
1467
1468         if (!find_first_extent_bit(&device->alloc_state, *start,
1469                                    &physical_start, &physical_end,
1470                                    CHUNK_ALLOCATED, NULL)) {
1471
1472                 if (in_range(physical_start, *start, len) ||
1473                     in_range(*start, physical_start,
1474                              physical_end + 1 - physical_start)) {
1475                         *start = physical_end + 1;
1476                         return true;
1477                 }
1478         }
1479         return false;
1480 }
1481
1482 static u64 dev_extent_search_start(struct btrfs_device *device, u64 start)
1483 {
1484         switch (device->fs_devices->chunk_alloc_policy) {
1485         case BTRFS_CHUNK_ALLOC_REGULAR:
1486                 /*
1487                  * We don't want to overwrite the superblock on the drive nor
1488                  * any area used by the boot loader (grub for example), so we
1489                  * make sure to start at an offset of at least 1MB.
1490                  */
1491                 return max_t(u64, start, SZ_1M);
1492         default:
1493                 BUG();
1494         }
1495 }
1496
1497 /**
1498  * dev_extent_hole_check - check if specified hole is suitable for allocation
1499  * @device:     the device which we have the hole
1500  * @hole_start: starting position of the hole
1501  * @hole_size:  the size of the hole
1502  * @num_bytes:  the size of the free space that we need
1503  *
1504  * This function may modify @hole_start and @hole_end to reflect the suitable
1505  * position for allocation. Returns 1 if hole position is updated, 0 otherwise.
1506  */
1507 static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start,
1508                                   u64 *hole_size, u64 num_bytes)
1509 {
1510         bool changed = false;
1511         u64 hole_end = *hole_start + *hole_size;
1512
1513         /*
1514          * Check before we set max_hole_start, otherwise we could end up
1515          * sending back this offset anyway.
1516          */
1517         if (contains_pending_extent(device, hole_start, *hole_size)) {
1518                 if (hole_end >= *hole_start)
1519                         *hole_size = hole_end - *hole_start;
1520                 else
1521                         *hole_size = 0;
1522                 changed = true;
1523         }
1524
1525         switch (device->fs_devices->chunk_alloc_policy) {
1526         case BTRFS_CHUNK_ALLOC_REGULAR:
1527                 /* No extra check */
1528                 break;
1529         default:
1530                 BUG();
1531         }
1532
1533         return changed;
1534 }
1535
1536 /*
1537  * find_free_dev_extent_start - find free space in the specified device
1538  * @device:       the device which we search the free space in
1539  * @num_bytes:    the size of the free space that we need
1540  * @search_start: the position from which to begin the search
1541  * @start:        store the start of the free space.
1542  * @len:          the size of the free space. that we find, or the size
1543  *                of the max free space if we don't find suitable free space
1544  *
1545  * this uses a pretty simple search, the expectation is that it is
1546  * called very infrequently and that a given device has a small number
1547  * of extents
1548  *
1549  * @start is used to store the start of the free space if we find. But if we
1550  * don't find suitable free space, it will be used to store the start position
1551  * of the max free space.
1552  *
1553  * @len is used to store the size of the free space that we find.
1554  * But if we don't find suitable free space, it is used to store the size of
1555  * the max free space.
1556  *
1557  * NOTE: This function will search *commit* root of device tree, and does extra
1558  * check to ensure dev extents are not double allocated.
1559  * This makes the function safe to allocate dev extents but may not report
1560  * correct usable device space, as device extent freed in current transaction
1561  * is not reported as avaiable.
1562  */
1563 static int find_free_dev_extent_start(struct btrfs_device *device,
1564                                 u64 num_bytes, u64 search_start, u64 *start,
1565                                 u64 *len)
1566 {
1567         struct btrfs_fs_info *fs_info = device->fs_info;
1568         struct btrfs_root *root = fs_info->dev_root;
1569         struct btrfs_key key;
1570         struct btrfs_dev_extent *dev_extent;
1571         struct btrfs_path *path;
1572         u64 hole_size;
1573         u64 max_hole_start;
1574         u64 max_hole_size;
1575         u64 extent_end;
1576         u64 search_end = device->total_bytes;
1577         int ret;
1578         int slot;
1579         struct extent_buffer *l;
1580
1581         search_start = dev_extent_search_start(device, search_start);
1582
1583         path = btrfs_alloc_path();
1584         if (!path)
1585                 return -ENOMEM;
1586
1587         max_hole_start = search_start;
1588         max_hole_size = 0;
1589
1590 again:
1591         if (search_start >= search_end ||
1592                 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
1593                 ret = -ENOSPC;
1594                 goto out;
1595         }
1596
1597         path->reada = READA_FORWARD;
1598         path->search_commit_root = 1;
1599         path->skip_locking = 1;
1600
1601         key.objectid = device->devid;
1602         key.offset = search_start;
1603         key.type = BTRFS_DEV_EXTENT_KEY;
1604
1605         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
1606         if (ret < 0)
1607                 goto out;
1608         if (ret > 0) {
1609                 ret = btrfs_previous_item(root, path, key.objectid, key.type);
1610                 if (ret < 0)
1611                         goto out;
1612         }
1613
1614         while (search_start < search_end) {
1615                 l = path->nodes[0];
1616                 slot = path->slots[0];
1617                 if (slot >= btrfs_header_nritems(l)) {
1618                         ret = btrfs_next_leaf(root, path);
1619                         if (ret == 0)
1620                                 continue;
1621                         if (ret < 0)
1622                                 goto out;
1623
1624                         break;
1625                 }
1626                 btrfs_item_key_to_cpu(l, &key, slot);
1627
1628                 if (key.objectid < device->devid)
1629                         goto next;
1630
1631                 if (key.objectid > device->devid)
1632                         break;
1633
1634                 if (key.type != BTRFS_DEV_EXTENT_KEY)
1635                         goto next;
1636
1637                 if (key.offset > search_end)
1638                         break;
1639
1640                 if (key.offset > search_start) {
1641                         hole_size = key.offset - search_start;
1642                         dev_extent_hole_check(device, &search_start, &hole_size,
1643                                               num_bytes);
1644
1645                         if (hole_size > max_hole_size) {
1646                                 max_hole_start = search_start;
1647                                 max_hole_size = hole_size;
1648                         }
1649
1650                         /*
1651                          * If this free space is greater than which we need,
1652                          * it must be the max free space that we have found
1653                          * until now, so max_hole_start must point to the start
1654                          * of this free space and the length of this free space
1655                          * is stored in max_hole_size. Thus, we return
1656                          * max_hole_start and max_hole_size and go back to the
1657                          * caller.
1658                          */
1659                         if (hole_size >= num_bytes) {
1660                                 ret = 0;
1661                                 goto out;
1662                         }
1663                 }
1664
1665                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
1666                 extent_end = key.offset + btrfs_dev_extent_length(l,
1667                                                                   dev_extent);
1668                 if (extent_end > search_start)
1669                         search_start = extent_end;
1670 next:
1671                 path->slots[0]++;
1672                 cond_resched();
1673         }
1674
1675         /*
1676          * At this point, search_start should be the end of
1677          * allocated dev extents, and when shrinking the device,
1678          * search_end may be smaller than search_start.
1679          */
1680         if (search_end > search_start) {
1681                 hole_size = search_end - search_start;
1682                 if (dev_extent_hole_check(device, &search_start, &hole_size,
1683                                           num_bytes)) {
1684                         btrfs_release_path(path);
1685                         goto again;
1686                 }
1687
1688                 if (hole_size > max_hole_size) {
1689                         max_hole_start = search_start;
1690                         max_hole_size = hole_size;
1691                 }
1692         }
1693
1694         /* See above. */
1695         if (max_hole_size < num_bytes)
1696                 ret = -ENOSPC;
1697         else
1698                 ret = 0;
1699
1700         ASSERT(max_hole_start + max_hole_size <= search_end);
1701 out:
1702         btrfs_free_path(path);
1703         *start = max_hole_start;
1704         if (len)
1705                 *len = max_hole_size;
1706         return ret;
1707 }
1708
1709 int find_free_dev_extent(struct btrfs_device *device, u64 num_bytes,
1710                          u64 *start, u64 *len)
1711 {
1712         /* FIXME use last free of some kind */
1713         return find_free_dev_extent_start(device, num_bytes, 0, start, len);
1714 }
1715
1716 static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans,
1717                           struct btrfs_device *device,
1718                           u64 start, u64 *dev_extent_len)
1719 {
1720         struct btrfs_fs_info *fs_info = device->fs_info;
1721         struct btrfs_root *root = fs_info->dev_root;
1722         int ret;
1723         struct btrfs_path *path;
1724         struct btrfs_key key;
1725         struct btrfs_key found_key;
1726         struct extent_buffer *leaf = NULL;
1727         struct btrfs_dev_extent *extent = NULL;
1728
1729         path = btrfs_alloc_path();
1730         if (!path)
1731                 return -ENOMEM;
1732
1733         key.objectid = device->devid;
1734         key.offset = start;
1735         key.type = BTRFS_DEV_EXTENT_KEY;
1736 again:
1737         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1738         if (ret > 0) {
1739                 ret = btrfs_previous_item(root, path, key.objectid,
1740                                           BTRFS_DEV_EXTENT_KEY);
1741                 if (ret)
1742                         goto out;
1743                 leaf = path->nodes[0];
1744                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
1745                 extent = btrfs_item_ptr(leaf, path->slots[0],
1746                                         struct btrfs_dev_extent);
1747                 BUG_ON(found_key.offset > start || found_key.offset +
1748                        btrfs_dev_extent_length(leaf, extent) < start);
1749                 key = found_key;
1750                 btrfs_release_path(path);
1751                 goto again;
1752         } else if (ret == 0) {
1753                 leaf = path->nodes[0];
1754                 extent = btrfs_item_ptr(leaf, path->slots[0],
1755                                         struct btrfs_dev_extent);
1756         } else {
1757                 btrfs_handle_fs_error(fs_info, ret, "Slot search failed");
1758                 goto out;
1759         }
1760
1761         *dev_extent_len = btrfs_dev_extent_length(leaf, extent);
1762
1763         ret = btrfs_del_item(trans, root, path);
1764         if (ret) {
1765                 btrfs_handle_fs_error(fs_info, ret,
1766                                       "Failed to remove dev extent item");
1767         } else {
1768                 set_bit(BTRFS_TRANS_HAVE_FREE_BGS, &trans->transaction->flags);
1769         }
1770 out:
1771         btrfs_free_path(path);
1772         return ret;
1773 }
1774
1775 static int btrfs_alloc_dev_extent(struct btrfs_trans_handle *trans,
1776                                   struct btrfs_device *device,
1777                                   u64 chunk_offset, u64 start, u64 num_bytes)
1778 {
1779         int ret;
1780         struct btrfs_path *path;
1781         struct btrfs_fs_info *fs_info = device->fs_info;
1782         struct btrfs_root *root = fs_info->dev_root;
1783         struct btrfs_dev_extent *extent;
1784         struct extent_buffer *leaf;
1785         struct btrfs_key key;
1786
1787         WARN_ON(!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state));
1788         WARN_ON(test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state));
1789         path = btrfs_alloc_path();
1790         if (!path)
1791                 return -ENOMEM;
1792
1793         key.objectid = device->devid;
1794         key.offset = start;
1795         key.type = BTRFS_DEV_EXTENT_KEY;
1796         ret = btrfs_insert_empty_item(trans, root, path, &key,
1797                                       sizeof(*extent));
1798         if (ret)
1799                 goto out;
1800
1801         leaf = path->nodes[0];
1802         extent = btrfs_item_ptr(leaf, path->slots[0],
1803                                 struct btrfs_dev_extent);
1804         btrfs_set_dev_extent_chunk_tree(leaf, extent,
1805                                         BTRFS_CHUNK_TREE_OBJECTID);
1806         btrfs_set_dev_extent_chunk_objectid(leaf, extent,
1807                                             BTRFS_FIRST_CHUNK_TREE_OBJECTID);
1808         btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset);
1809
1810         btrfs_set_dev_extent_length(leaf, extent, num_bytes);
1811         btrfs_mark_buffer_dirty(leaf);
1812 out:
1813         btrfs_free_path(path);
1814         return ret;
1815 }
1816
1817 static u64 find_next_chunk(struct btrfs_fs_info *fs_info)
1818 {
1819         struct extent_map_tree *em_tree;
1820         struct extent_map *em;
1821         struct rb_node *n;
1822         u64 ret = 0;
1823
1824         em_tree = &fs_info->mapping_tree;
1825         read_lock(&em_tree->lock);
1826         n = rb_last(&em_tree->map.rb_root);
1827         if (n) {
1828                 em = rb_entry(n, struct extent_map, rb_node);
1829                 ret = em->start + em->len;
1830         }
1831         read_unlock(&em_tree->lock);
1832
1833         return ret;
1834 }
1835
1836 static noinline int find_next_devid(struct btrfs_fs_info *fs_info,
1837                                     u64 *devid_ret)
1838 {
1839         int ret;
1840         struct btrfs_key key;
1841         struct btrfs_key found_key;
1842         struct btrfs_path *path;
1843
1844         path = btrfs_alloc_path();
1845         if (!path)
1846                 return -ENOMEM;
1847
1848         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1849         key.type = BTRFS_DEV_ITEM_KEY;
1850         key.offset = (u64)-1;
1851
1852         ret = btrfs_search_slot(NULL, fs_info->chunk_root, &key, path, 0, 0);
1853         if (ret < 0)
1854                 goto error;
1855
1856         if (ret == 0) {
1857                 /* Corruption */
1858                 btrfs_err(fs_info, "corrupted chunk tree devid -1 matched");
1859                 ret = -EUCLEAN;
1860                 goto error;
1861         }
1862
1863         ret = btrfs_previous_item(fs_info->chunk_root, path,
1864                                   BTRFS_DEV_ITEMS_OBJECTID,
1865                                   BTRFS_DEV_ITEM_KEY);
1866         if (ret) {
1867                 *devid_ret = 1;
1868         } else {
1869                 btrfs_item_key_to_cpu(path->nodes[0], &found_key,
1870                                       path->slots[0]);
1871                 *devid_ret = found_key.offset + 1;
1872         }
1873         ret = 0;
1874 error:
1875         btrfs_free_path(path);
1876         return ret;
1877 }
1878
1879 /*
1880  * the device information is stored in the chunk root
1881  * the btrfs_device struct should be fully filled in
1882  */
1883 static int btrfs_add_dev_item(struct btrfs_trans_handle *trans,
1884                             struct btrfs_device *device)
1885 {
1886         int ret;
1887         struct btrfs_path *path;
1888         struct btrfs_dev_item *dev_item;
1889         struct extent_buffer *leaf;
1890         struct btrfs_key key;
1891         unsigned long ptr;
1892
1893         path = btrfs_alloc_path();
1894         if (!path)
1895                 return -ENOMEM;
1896
1897         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1898         key.type = BTRFS_DEV_ITEM_KEY;
1899         key.offset = device->devid;
1900
1901         ret = btrfs_insert_empty_item(trans, trans->fs_info->chunk_root, path,
1902                                       &key, sizeof(*dev_item));
1903         if (ret)
1904                 goto out;
1905
1906         leaf = path->nodes[0];
1907         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
1908
1909         btrfs_set_device_id(leaf, dev_item, device->devid);
1910         btrfs_set_device_generation(leaf, dev_item, 0);
1911         btrfs_set_device_type(leaf, dev_item, device->type);
1912         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
1913         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
1914         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
1915         btrfs_set_device_total_bytes(leaf, dev_item,
1916                                      btrfs_device_get_disk_total_bytes(device));
1917         btrfs_set_device_bytes_used(leaf, dev_item,
1918                                     btrfs_device_get_bytes_used(device));
1919         btrfs_set_device_group(leaf, dev_item, 0);
1920         btrfs_set_device_seek_speed(leaf, dev_item, 0);
1921         btrfs_set_device_bandwidth(leaf, dev_item, 0);
1922         btrfs_set_device_start_offset(leaf, dev_item, 0);
1923
1924         ptr = btrfs_device_uuid(dev_item);
1925         write_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
1926         ptr = btrfs_device_fsid(dev_item);
1927         write_extent_buffer(leaf, trans->fs_info->fs_devices->metadata_uuid,
1928                             ptr, BTRFS_FSID_SIZE);
1929         btrfs_mark_buffer_dirty(leaf);
1930
1931         ret = 0;
1932 out:
1933         btrfs_free_path(path);
1934         return ret;
1935 }
1936
1937 /*
1938  * Function to update ctime/mtime for a given device path.
1939  * Mainly used for ctime/mtime based probe like libblkid.
1940  *
1941  * We don't care about errors here, this is just to be kind to userspace.
1942  */
1943 static void update_dev_time(const char *device_path)
1944 {
1945         struct path path;
1946         struct timespec64 now;
1947         int ret;
1948
1949         ret = kern_path(device_path, LOOKUP_FOLLOW, &path);
1950         if (ret)
1951                 return;
1952
1953         now = current_time(d_inode(path.dentry));
1954         inode_update_time(d_inode(path.dentry), &now, S_MTIME | S_CTIME);
1955         path_put(&path);
1956 }
1957
1958 static int btrfs_rm_dev_item(struct btrfs_device *device)
1959 {
1960         struct btrfs_root *root = device->fs_info->chunk_root;
1961         int ret;
1962         struct btrfs_path *path;
1963         struct btrfs_key key;
1964         struct btrfs_trans_handle *trans;
1965
1966         path = btrfs_alloc_path();
1967         if (!path)
1968                 return -ENOMEM;
1969
1970         trans = btrfs_start_transaction(root, 0);
1971         if (IS_ERR(trans)) {
1972                 btrfs_free_path(path);
1973                 return PTR_ERR(trans);
1974         }
1975         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
1976         key.type = BTRFS_DEV_ITEM_KEY;
1977         key.offset = device->devid;
1978
1979         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
1980         if (ret) {
1981                 if (ret > 0)
1982                         ret = -ENOENT;
1983                 btrfs_abort_transaction(trans, ret);
1984                 btrfs_end_transaction(trans);
1985                 goto out;
1986         }
1987
1988         ret = btrfs_del_item(trans, root, path);
1989         if (ret) {
1990                 btrfs_abort_transaction(trans, ret);
1991                 btrfs_end_transaction(trans);
1992         }
1993
1994 out:
1995         btrfs_free_path(path);
1996         if (!ret)
1997                 ret = btrfs_commit_transaction(trans);
1998         return ret;
1999 }
2000
2001 /*
2002  * Verify that @num_devices satisfies the RAID profile constraints in the whole
2003  * filesystem. It's up to the caller to adjust that number regarding eg. device
2004  * replace.
2005  */
2006 static int btrfs_check_raid_min_devices(struct btrfs_fs_info *fs_info,
2007                 u64 num_devices)
2008 {
2009         u64 all_avail;
2010         unsigned seq;
2011         int i;
2012
2013         do {
2014                 seq = read_seqbegin(&fs_info->profiles_lock);
2015
2016                 all_avail = fs_info->avail_data_alloc_bits |
2017                             fs_info->avail_system_alloc_bits |
2018                             fs_info->avail_metadata_alloc_bits;
2019         } while (read_seqretry(&fs_info->profiles_lock, seq));
2020
2021         for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) {
2022                 if (!(all_avail & btrfs_raid_array[i].bg_flag))
2023                         continue;
2024
2025                 if (num_devices < btrfs_raid_array[i].devs_min) {
2026                         int ret = btrfs_raid_array[i].mindev_error;
2027
2028                         if (ret)
2029                                 return ret;
2030                 }
2031         }
2032
2033         return 0;
2034 }
2035
2036 static struct btrfs_device * btrfs_find_next_active_device(
2037                 struct btrfs_fs_devices *fs_devs, struct btrfs_device *device)
2038 {
2039         struct btrfs_device *next_device;
2040
2041         list_for_each_entry(next_device, &fs_devs->devices, dev_list) {
2042                 if (next_device != device &&
2043                     !test_bit(BTRFS_DEV_STATE_MISSING, &next_device->dev_state)
2044                     && next_device->bdev)
2045                         return next_device;
2046         }
2047
2048         return NULL;
2049 }
2050
2051 /*
2052  * Helper function to check if the given device is part of s_bdev / latest_bdev
2053  * and replace it with the provided or the next active device, in the context
2054  * where this function called, there should be always be another device (or
2055  * this_dev) which is active.
2056  */
2057 void __cold btrfs_assign_next_active_device(struct btrfs_device *device,
2058                                             struct btrfs_device *next_device)
2059 {
2060         struct btrfs_fs_info *fs_info = device->fs_info;
2061
2062         if (!next_device)
2063                 next_device = btrfs_find_next_active_device(fs_info->fs_devices,
2064                                                             device);
2065         ASSERT(next_device);
2066
2067         if (fs_info->sb->s_bdev &&
2068                         (fs_info->sb->s_bdev == device->bdev))
2069                 fs_info->sb->s_bdev = next_device->bdev;
2070
2071         if (fs_info->fs_devices->latest_bdev == device->bdev)
2072                 fs_info->fs_devices->latest_bdev = next_device->bdev;
2073 }
2074
2075 /*
2076  * Return btrfs_fs_devices::num_devices excluding the device that's being
2077  * currently replaced.
2078  */
2079 static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info)
2080 {
2081         u64 num_devices = fs_info->fs_devices->num_devices;
2082
2083         down_read(&fs_info->dev_replace.rwsem);
2084         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) {
2085                 ASSERT(num_devices > 1);
2086                 num_devices--;
2087         }
2088         up_read(&fs_info->dev_replace.rwsem);
2089
2090         return num_devices;
2091 }
2092
2093 void btrfs_scratch_superblocks(struct btrfs_fs_info *fs_info,
2094                                struct block_device *bdev,
2095                                const char *device_path)
2096 {
2097         struct btrfs_super_block *disk_super;
2098         int copy_num;
2099
2100         if (!bdev)
2101                 return;
2102
2103         for (copy_num = 0; copy_num < BTRFS_SUPER_MIRROR_MAX; copy_num++) {
2104                 struct page *page;
2105                 int ret;
2106
2107                 disk_super = btrfs_read_dev_one_super(bdev, copy_num);
2108                 if (IS_ERR(disk_super))
2109                         continue;
2110
2111                 memset(&disk_super->magic, 0, sizeof(disk_super->magic));
2112
2113                 page = virt_to_page(disk_super);
2114                 set_page_dirty(page);
2115                 lock_page(page);
2116                 /* write_on_page() unlocks the page */
2117                 ret = write_one_page(page);
2118                 if (ret)
2119                         btrfs_warn(fs_info,
2120                                 "error clearing superblock number %d (%d)",
2121                                 copy_num, ret);
2122                 btrfs_release_disk_super(disk_super);
2123
2124         }
2125
2126         /* Notify udev that device has changed */
2127         btrfs_kobject_uevent(bdev, KOBJ_CHANGE);
2128
2129         /* Update ctime/mtime for device path for libblkid */
2130         update_dev_time(device_path);
2131 }
2132
2133 int btrfs_rm_device(struct btrfs_fs_info *fs_info, const char *device_path,
2134                     u64 devid)
2135 {
2136         struct btrfs_device *device;
2137         struct btrfs_fs_devices *cur_devices;
2138         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2139         u64 num_devices;
2140         int ret = 0;
2141
2142         /*
2143          * The device list in fs_devices is accessed without locks (neither
2144          * uuid_mutex nor device_list_mutex) as it won't change on a mounted
2145          * filesystem and another device rm cannot run.
2146          */
2147         num_devices = btrfs_num_devices(fs_info);
2148
2149         ret = btrfs_check_raid_min_devices(fs_info, num_devices - 1);
2150         if (ret)
2151                 goto out;
2152
2153         device = btrfs_find_device_by_devspec(fs_info, devid, device_path);
2154
2155         if (IS_ERR(device)) {
2156                 if (PTR_ERR(device) == -ENOENT &&
2157                     device_path && strcmp(device_path, "missing") == 0)
2158                         ret = BTRFS_ERROR_DEV_MISSING_NOT_FOUND;
2159                 else
2160                         ret = PTR_ERR(device);
2161                 goto out;
2162         }
2163
2164         if (btrfs_pinned_by_swapfile(fs_info, device)) {
2165                 btrfs_warn_in_rcu(fs_info,
2166                   "cannot remove device %s (devid %llu) due to active swapfile",
2167                                   rcu_str_deref(device->name), device->devid);
2168                 ret = -ETXTBSY;
2169                 goto out;
2170         }
2171
2172         if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2173                 ret = BTRFS_ERROR_DEV_TGT_REPLACE;
2174                 goto out;
2175         }
2176
2177         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
2178             fs_info->fs_devices->rw_devices == 1) {
2179                 ret = BTRFS_ERROR_DEV_ONLY_WRITABLE;
2180                 goto out;
2181         }
2182
2183         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2184                 mutex_lock(&fs_info->chunk_mutex);
2185                 list_del_init(&device->dev_alloc_list);
2186                 device->fs_devices->rw_devices--;
2187                 mutex_unlock(&fs_info->chunk_mutex);
2188         }
2189
2190         ret = btrfs_shrink_device(device, 0);
2191         if (!ret)
2192                 btrfs_reada_remove_dev(device);
2193         if (ret)
2194                 goto error_undo;
2195
2196         /*
2197          * TODO: the superblock still includes this device in its num_devices
2198          * counter although write_all_supers() is not locked out. This
2199          * could give a filesystem state which requires a degraded mount.
2200          */
2201         ret = btrfs_rm_dev_item(device);
2202         if (ret)
2203                 goto error_undo;
2204
2205         clear_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2206         btrfs_scrub_cancel_dev(device);
2207
2208         /*
2209          * the device list mutex makes sure that we don't change
2210          * the device list while someone else is writing out all
2211          * the device supers. Whoever is writing all supers, should
2212          * lock the device list mutex before getting the number of
2213          * devices in the super block (super_copy). Conversely,
2214          * whoever updates the number of devices in the super block
2215          * (super_copy) should hold the device list mutex.
2216          */
2217
2218         /*
2219          * In normal cases the cur_devices == fs_devices. But in case
2220          * of deleting a seed device, the cur_devices should point to
2221          * its own fs_devices listed under the fs_devices->seed.
2222          */
2223         cur_devices = device->fs_devices;
2224         mutex_lock(&fs_devices->device_list_mutex);
2225         list_del_rcu(&device->dev_list);
2226
2227         cur_devices->num_devices--;
2228         cur_devices->total_devices--;
2229         /* Update total_devices of the parent fs_devices if it's seed */
2230         if (cur_devices != fs_devices)
2231                 fs_devices->total_devices--;
2232
2233         if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
2234                 cur_devices->missing_devices--;
2235
2236         btrfs_assign_next_active_device(device, NULL);
2237
2238         if (device->bdev) {
2239                 cur_devices->open_devices--;
2240                 /* remove sysfs entry */
2241                 btrfs_sysfs_remove_device(device);
2242         }
2243
2244         num_devices = btrfs_super_num_devices(fs_info->super_copy) - 1;
2245         btrfs_set_super_num_devices(fs_info->super_copy, num_devices);
2246         mutex_unlock(&fs_devices->device_list_mutex);
2247
2248         /*
2249          * at this point, the device is zero sized and detached from
2250          * the devices list.  All that's left is to zero out the old
2251          * supers and free the device.
2252          */
2253         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2254                 btrfs_scratch_superblocks(fs_info, device->bdev,
2255                                           device->name->str);
2256
2257         btrfs_close_bdev(device);
2258         synchronize_rcu();
2259         btrfs_free_device(device);
2260
2261         if (cur_devices->open_devices == 0) {
2262                 list_del_init(&cur_devices->seed_list);
2263                 close_fs_devices(cur_devices);
2264                 free_fs_devices(cur_devices);
2265         }
2266
2267 out:
2268         return ret;
2269
2270 error_undo:
2271         btrfs_reada_undo_remove_dev(device);
2272         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
2273                 mutex_lock(&fs_info->chunk_mutex);
2274                 list_add(&device->dev_alloc_list,
2275                          &fs_devices->alloc_list);
2276                 device->fs_devices->rw_devices++;
2277                 mutex_unlock(&fs_info->chunk_mutex);
2278         }
2279         goto out;
2280 }
2281
2282 void btrfs_rm_dev_replace_remove_srcdev(struct btrfs_device *srcdev)
2283 {
2284         struct btrfs_fs_devices *fs_devices;
2285
2286         lockdep_assert_held(&srcdev->fs_info->fs_devices->device_list_mutex);
2287
2288         /*
2289          * in case of fs with no seed, srcdev->fs_devices will point
2290          * to fs_devices of fs_info. However when the dev being replaced is
2291          * a seed dev it will point to the seed's local fs_devices. In short
2292          * srcdev will have its correct fs_devices in both the cases.
2293          */
2294         fs_devices = srcdev->fs_devices;
2295
2296         list_del_rcu(&srcdev->dev_list);
2297         list_del(&srcdev->dev_alloc_list);
2298         fs_devices->num_devices--;
2299         if (test_bit(BTRFS_DEV_STATE_MISSING, &srcdev->dev_state))
2300                 fs_devices->missing_devices--;
2301
2302         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &srcdev->dev_state))
2303                 fs_devices->rw_devices--;
2304
2305         if (srcdev->bdev)
2306                 fs_devices->open_devices--;
2307 }
2308
2309 void btrfs_rm_dev_replace_free_srcdev(struct btrfs_device *srcdev)
2310 {
2311         struct btrfs_fs_devices *fs_devices = srcdev->fs_devices;
2312
2313         mutex_lock(&uuid_mutex);
2314
2315         btrfs_close_bdev(srcdev);
2316         synchronize_rcu();
2317         btrfs_free_device(srcdev);
2318
2319         /* if this is no devs we rather delete the fs_devices */
2320         if (!fs_devices->num_devices) {
2321                 /*
2322                  * On a mounted FS, num_devices can't be zero unless it's a
2323                  * seed. In case of a seed device being replaced, the replace
2324                  * target added to the sprout FS, so there will be no more
2325                  * device left under the seed FS.
2326                  */
2327                 ASSERT(fs_devices->seeding);
2328
2329                 list_del_init(&fs_devices->seed_list);
2330                 close_fs_devices(fs_devices);
2331                 free_fs_devices(fs_devices);
2332         }
2333         mutex_unlock(&uuid_mutex);
2334 }
2335
2336 void btrfs_destroy_dev_replace_tgtdev(struct btrfs_device *tgtdev)
2337 {
2338         struct btrfs_fs_devices *fs_devices = tgtdev->fs_info->fs_devices;
2339
2340         mutex_lock(&fs_devices->device_list_mutex);
2341
2342         btrfs_sysfs_remove_device(tgtdev);
2343
2344         if (tgtdev->bdev)
2345                 fs_devices->open_devices--;
2346
2347         fs_devices->num_devices--;
2348
2349         btrfs_assign_next_active_device(tgtdev, NULL);
2350
2351         list_del_rcu(&tgtdev->dev_list);
2352
2353         mutex_unlock(&fs_devices->device_list_mutex);
2354
2355         /*
2356          * The update_dev_time() with in btrfs_scratch_superblocks()
2357          * may lead to a call to btrfs_show_devname() which will try
2358          * to hold device_list_mutex. And here this device
2359          * is already out of device list, so we don't have to hold
2360          * the device_list_mutex lock.
2361          */
2362         btrfs_scratch_superblocks(tgtdev->fs_info, tgtdev->bdev,
2363                                   tgtdev->name->str);
2364
2365         btrfs_close_bdev(tgtdev);
2366         synchronize_rcu();
2367         btrfs_free_device(tgtdev);
2368 }
2369
2370 static struct btrfs_device *btrfs_find_device_by_path(
2371                 struct btrfs_fs_info *fs_info, const char *device_path)
2372 {
2373         int ret = 0;
2374         struct btrfs_super_block *disk_super;
2375         u64 devid;
2376         u8 *dev_uuid;
2377         struct block_device *bdev;
2378         struct btrfs_device *device;
2379
2380         ret = btrfs_get_bdev_and_sb(device_path, FMODE_READ,
2381                                     fs_info->bdev_holder, 0, &bdev, &disk_super);
2382         if (ret)
2383                 return ERR_PTR(ret);
2384
2385         devid = btrfs_stack_device_id(&disk_super->dev_item);
2386         dev_uuid = disk_super->dev_item.uuid;
2387         if (btrfs_fs_incompat(fs_info, METADATA_UUID))
2388                 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2389                                            disk_super->metadata_uuid, true);
2390         else
2391                 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2392                                            disk_super->fsid, true);
2393
2394         btrfs_release_disk_super(disk_super);
2395         if (!device)
2396                 device = ERR_PTR(-ENOENT);
2397         blkdev_put(bdev, FMODE_READ);
2398         return device;
2399 }
2400
2401 /*
2402  * Lookup a device given by device id, or the path if the id is 0.
2403  */
2404 struct btrfs_device *btrfs_find_device_by_devspec(
2405                 struct btrfs_fs_info *fs_info, u64 devid,
2406                 const char *device_path)
2407 {
2408         struct btrfs_device *device;
2409
2410         if (devid) {
2411                 device = btrfs_find_device(fs_info->fs_devices, devid, NULL,
2412                                            NULL, true);
2413                 if (!device)
2414                         return ERR_PTR(-ENOENT);
2415                 return device;
2416         }
2417
2418         if (!device_path || !device_path[0])
2419                 return ERR_PTR(-EINVAL);
2420
2421         if (strcmp(device_path, "missing") == 0) {
2422                 /* Find first missing device */
2423                 list_for_each_entry(device, &fs_info->fs_devices->devices,
2424                                     dev_list) {
2425                         if (test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
2426                                      &device->dev_state) && !device->bdev)
2427                                 return device;
2428                 }
2429                 return ERR_PTR(-ENOENT);
2430         }
2431
2432         return btrfs_find_device_by_path(fs_info, device_path);
2433 }
2434
2435 /*
2436  * does all the dirty work required for changing file system's UUID.
2437  */
2438 static int btrfs_prepare_sprout(struct btrfs_fs_info *fs_info)
2439 {
2440         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2441         struct btrfs_fs_devices *old_devices;
2442         struct btrfs_fs_devices *seed_devices;
2443         struct btrfs_super_block *disk_super = fs_info->super_copy;
2444         struct btrfs_device *device;
2445         u64 super_flags;
2446
2447         lockdep_assert_held(&uuid_mutex);
2448         if (!fs_devices->seeding)
2449                 return -EINVAL;
2450
2451         /*
2452          * Private copy of the seed devices, anchored at
2453          * fs_info->fs_devices->seed_list
2454          */
2455         seed_devices = alloc_fs_devices(NULL, NULL);
2456         if (IS_ERR(seed_devices))
2457                 return PTR_ERR(seed_devices);
2458
2459         /*
2460          * It's necessary to retain a copy of the original seed fs_devices in
2461          * fs_uuids so that filesystems which have been seeded can successfully
2462          * reference the seed device from open_seed_devices. This also supports
2463          * multiple fs seed.
2464          */
2465         old_devices = clone_fs_devices(fs_devices);
2466         if (IS_ERR(old_devices)) {
2467                 kfree(seed_devices);
2468                 return PTR_ERR(old_devices);
2469         }
2470
2471         list_add(&old_devices->fs_list, &fs_uuids);
2472
2473         memcpy(seed_devices, fs_devices, sizeof(*seed_devices));
2474         seed_devices->opened = 1;
2475         INIT_LIST_HEAD(&seed_devices->devices);
2476         INIT_LIST_HEAD(&seed_devices->alloc_list);
2477         mutex_init(&seed_devices->device_list_mutex);
2478
2479         mutex_lock(&fs_devices->device_list_mutex);
2480         list_splice_init_rcu(&fs_devices->devices, &seed_devices->devices,
2481                               synchronize_rcu);
2482         list_for_each_entry(device, &seed_devices->devices, dev_list)
2483                 device->fs_devices = seed_devices;
2484
2485         fs_devices->seeding = false;
2486         fs_devices->num_devices = 0;
2487         fs_devices->open_devices = 0;
2488         fs_devices->missing_devices = 0;
2489         fs_devices->rotating = false;
2490         list_add(&seed_devices->seed_list, &fs_devices->seed_list);
2491
2492         generate_random_uuid(fs_devices->fsid);
2493         memcpy(fs_devices->metadata_uuid, fs_devices->fsid, BTRFS_FSID_SIZE);
2494         memcpy(disk_super->fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
2495         mutex_unlock(&fs_devices->device_list_mutex);
2496
2497         super_flags = btrfs_super_flags(disk_super) &
2498                       ~BTRFS_SUPER_FLAG_SEEDING;
2499         btrfs_set_super_flags(disk_super, super_flags);
2500
2501         return 0;
2502 }
2503
2504 /*
2505  * Store the expected generation for seed devices in device items.
2506  */
2507 static int btrfs_finish_sprout(struct btrfs_trans_handle *trans)
2508 {
2509         struct btrfs_fs_info *fs_info = trans->fs_info;
2510         struct btrfs_root *root = fs_info->chunk_root;
2511         struct btrfs_path *path;
2512         struct extent_buffer *leaf;
2513         struct btrfs_dev_item *dev_item;
2514         struct btrfs_device *device;
2515         struct btrfs_key key;
2516         u8 fs_uuid[BTRFS_FSID_SIZE];
2517         u8 dev_uuid[BTRFS_UUID_SIZE];
2518         u64 devid;
2519         int ret;
2520
2521         path = btrfs_alloc_path();
2522         if (!path)
2523                 return -ENOMEM;
2524
2525         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2526         key.offset = 0;
2527         key.type = BTRFS_DEV_ITEM_KEY;
2528
2529         while (1) {
2530                 ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2531                 if (ret < 0)
2532                         goto error;
2533
2534                 leaf = path->nodes[0];
2535 next_slot:
2536                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
2537                         ret = btrfs_next_leaf(root, path);
2538                         if (ret > 0)
2539                                 break;
2540                         if (ret < 0)
2541                                 goto error;
2542                         leaf = path->nodes[0];
2543                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2544                         btrfs_release_path(path);
2545                         continue;
2546                 }
2547
2548                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2549                 if (key.objectid != BTRFS_DEV_ITEMS_OBJECTID ||
2550                     key.type != BTRFS_DEV_ITEM_KEY)
2551                         break;
2552
2553                 dev_item = btrfs_item_ptr(leaf, path->slots[0],
2554                                           struct btrfs_dev_item);
2555                 devid = btrfs_device_id(leaf, dev_item);
2556                 read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
2557                                    BTRFS_UUID_SIZE);
2558                 read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
2559                                    BTRFS_FSID_SIZE);
2560                 device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
2561                                            fs_uuid, true);
2562                 BUG_ON(!device); /* Logic error */
2563
2564                 if (device->fs_devices->seeding) {
2565                         btrfs_set_device_generation(leaf, dev_item,
2566                                                     device->generation);
2567                         btrfs_mark_buffer_dirty(leaf);
2568                 }
2569
2570                 path->slots[0]++;
2571                 goto next_slot;
2572         }
2573         ret = 0;
2574 error:
2575         btrfs_free_path(path);
2576         return ret;
2577 }
2578
2579 int btrfs_init_new_device(struct btrfs_fs_info *fs_info, const char *device_path)
2580 {
2581         struct btrfs_root *root = fs_info->dev_root;
2582         struct request_queue *q;
2583         struct btrfs_trans_handle *trans;
2584         struct btrfs_device *device;
2585         struct block_device *bdev;
2586         struct super_block *sb = fs_info->sb;
2587         struct rcu_string *name;
2588         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2589         u64 orig_super_total_bytes;
2590         u64 orig_super_num_devices;
2591         int seeding_dev = 0;
2592         int ret = 0;
2593         bool locked = false;
2594
2595         if (sb_rdonly(sb) && !fs_devices->seeding)
2596                 return -EROFS;
2597
2598         bdev = blkdev_get_by_path(device_path, FMODE_WRITE | FMODE_EXCL,
2599                                   fs_info->bdev_holder);
2600         if (IS_ERR(bdev))
2601                 return PTR_ERR(bdev);
2602
2603         if (fs_devices->seeding) {
2604                 seeding_dev = 1;
2605                 down_write(&sb->s_umount);
2606                 mutex_lock(&uuid_mutex);
2607                 locked = true;
2608         }
2609
2610         sync_blockdev(bdev);
2611
2612         rcu_read_lock();
2613         list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
2614                 if (device->bdev == bdev) {
2615                         ret = -EEXIST;
2616                         rcu_read_unlock();
2617                         goto error;
2618                 }
2619         }
2620         rcu_read_unlock();
2621
2622         device = btrfs_alloc_device(fs_info, NULL, NULL);
2623         if (IS_ERR(device)) {
2624                 /* we can safely leave the fs_devices entry around */
2625                 ret = PTR_ERR(device);
2626                 goto error;
2627         }
2628
2629         name = rcu_string_strdup(device_path, GFP_KERNEL);
2630         if (!name) {
2631                 ret = -ENOMEM;
2632                 goto error_free_device;
2633         }
2634         rcu_assign_pointer(device->name, name);
2635
2636         trans = btrfs_start_transaction(root, 0);
2637         if (IS_ERR(trans)) {
2638                 ret = PTR_ERR(trans);
2639                 goto error_free_device;
2640         }
2641
2642         q = bdev_get_queue(bdev);
2643         set_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state);
2644         device->generation = trans->transid;
2645         device->io_width = fs_info->sectorsize;
2646         device->io_align = fs_info->sectorsize;
2647         device->sector_size = fs_info->sectorsize;
2648         device->total_bytes = round_down(i_size_read(bdev->bd_inode),
2649                                          fs_info->sectorsize);
2650         device->disk_total_bytes = device->total_bytes;
2651         device->commit_total_bytes = device->total_bytes;
2652         device->fs_info = fs_info;
2653         device->bdev = bdev;
2654         set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
2655         clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
2656         device->mode = FMODE_EXCL;
2657         device->dev_stats_valid = 1;
2658         set_blocksize(device->bdev, BTRFS_BDEV_BLOCKSIZE);
2659
2660         if (seeding_dev) {
2661                 sb->s_flags &= ~SB_RDONLY;
2662                 ret = btrfs_prepare_sprout(fs_info);
2663                 if (ret) {
2664                         btrfs_abort_transaction(trans, ret);
2665                         goto error_trans;
2666                 }
2667         }
2668
2669         device->fs_devices = fs_devices;
2670
2671         mutex_lock(&fs_devices->device_list_mutex);
2672         mutex_lock(&fs_info->chunk_mutex);
2673         list_add_rcu(&device->dev_list, &fs_devices->devices);
2674         list_add(&device->dev_alloc_list, &fs_devices->alloc_list);
2675         fs_devices->num_devices++;
2676         fs_devices->open_devices++;
2677         fs_devices->rw_devices++;
2678         fs_devices->total_devices++;
2679         fs_devices->total_rw_bytes += device->total_bytes;
2680
2681         atomic64_add(device->total_bytes, &fs_info->free_chunk_space);
2682
2683         if (!blk_queue_nonrot(q))
2684                 fs_devices->rotating = true;
2685
2686         orig_super_total_bytes = btrfs_super_total_bytes(fs_info->super_copy);
2687         btrfs_set_super_total_bytes(fs_info->super_copy,
2688                 round_down(orig_super_total_bytes + device->total_bytes,
2689                            fs_info->sectorsize));
2690
2691         orig_super_num_devices = btrfs_super_num_devices(fs_info->super_copy);
2692         btrfs_set_super_num_devices(fs_info->super_copy,
2693                                     orig_super_num_devices + 1);
2694
2695         /*
2696          * we've got more storage, clear any full flags on the space
2697          * infos
2698          */
2699         btrfs_clear_space_info_full(fs_info);
2700
2701         mutex_unlock(&fs_info->chunk_mutex);
2702
2703         /* Add sysfs device entry */
2704         btrfs_sysfs_add_device(device);
2705
2706         mutex_unlock(&fs_devices->device_list_mutex);
2707
2708         if (seeding_dev) {
2709                 mutex_lock(&fs_info->chunk_mutex);
2710                 ret = init_first_rw_device(trans);
2711                 mutex_unlock(&fs_info->chunk_mutex);
2712                 if (ret) {
2713                         btrfs_abort_transaction(trans, ret);
2714                         goto error_sysfs;
2715                 }
2716         }
2717
2718         ret = btrfs_add_dev_item(trans, device);
2719         if (ret) {
2720                 btrfs_abort_transaction(trans, ret);
2721                 goto error_sysfs;
2722         }
2723
2724         if (seeding_dev) {
2725                 ret = btrfs_finish_sprout(trans);
2726                 if (ret) {
2727                         btrfs_abort_transaction(trans, ret);
2728                         goto error_sysfs;
2729                 }
2730
2731                 /*
2732                  * fs_devices now represents the newly sprouted filesystem and
2733                  * its fsid has been changed by btrfs_prepare_sprout
2734                  */
2735                 btrfs_sysfs_update_sprout_fsid(fs_devices);
2736         }
2737
2738         ret = btrfs_commit_transaction(trans);
2739
2740         if (seeding_dev) {
2741                 mutex_unlock(&uuid_mutex);
2742                 up_write(&sb->s_umount);
2743                 locked = false;
2744
2745                 if (ret) /* transaction commit */
2746                         return ret;
2747
2748                 ret = btrfs_relocate_sys_chunks(fs_info);
2749                 if (ret < 0)
2750                         btrfs_handle_fs_error(fs_info, ret,
2751                                     "Failed to relocate sys chunks after device initialization. This can be fixed using the \"btrfs balance\" command.");
2752                 trans = btrfs_attach_transaction(root);
2753                 if (IS_ERR(trans)) {
2754                         if (PTR_ERR(trans) == -ENOENT)
2755                                 return 0;
2756                         ret = PTR_ERR(trans);
2757                         trans = NULL;
2758                         goto error_sysfs;
2759                 }
2760                 ret = btrfs_commit_transaction(trans);
2761         }
2762
2763         /*
2764          * Now that we have written a new super block to this device, check all
2765          * other fs_devices list if device_path alienates any other scanned
2766          * device.
2767          * We can ignore the return value as it typically returns -EINVAL and
2768          * only succeeds if the device was an alien.
2769          */
2770         btrfs_forget_devices(device_path);
2771
2772         /* Update ctime/mtime for blkid or udev */
2773         update_dev_time(device_path);
2774
2775         return ret;
2776
2777 error_sysfs:
2778         btrfs_sysfs_remove_device(device);
2779         mutex_lock(&fs_info->fs_devices->device_list_mutex);
2780         mutex_lock(&fs_info->chunk_mutex);
2781         list_del_rcu(&device->dev_list);
2782         list_del(&device->dev_alloc_list);
2783         fs_info->fs_devices->num_devices--;
2784         fs_info->fs_devices->open_devices--;
2785         fs_info->fs_devices->rw_devices--;
2786         fs_info->fs_devices->total_devices--;
2787         fs_info->fs_devices->total_rw_bytes -= device->total_bytes;
2788         atomic64_sub(device->total_bytes, &fs_info->free_chunk_space);
2789         btrfs_set_super_total_bytes(fs_info->super_copy,
2790                                     orig_super_total_bytes);
2791         btrfs_set_super_num_devices(fs_info->super_copy,
2792                                     orig_super_num_devices);
2793         mutex_unlock(&fs_info->chunk_mutex);
2794         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
2795 error_trans:
2796         if (seeding_dev)
2797                 sb->s_flags |= SB_RDONLY;
2798         if (trans)
2799                 btrfs_end_transaction(trans);
2800 error_free_device:
2801         btrfs_free_device(device);
2802 error:
2803         blkdev_put(bdev, FMODE_EXCL);
2804         if (locked) {
2805                 mutex_unlock(&uuid_mutex);
2806                 up_write(&sb->s_umount);
2807         }
2808         return ret;
2809 }
2810
2811 static noinline int btrfs_update_device(struct btrfs_trans_handle *trans,
2812                                         struct btrfs_device *device)
2813 {
2814         int ret;
2815         struct btrfs_path *path;
2816         struct btrfs_root *root = device->fs_info->chunk_root;
2817         struct btrfs_dev_item *dev_item;
2818         struct extent_buffer *leaf;
2819         struct btrfs_key key;
2820
2821         path = btrfs_alloc_path();
2822         if (!path)
2823                 return -ENOMEM;
2824
2825         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
2826         key.type = BTRFS_DEV_ITEM_KEY;
2827         key.offset = device->devid;
2828
2829         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2830         if (ret < 0)
2831                 goto out;
2832
2833         if (ret > 0) {
2834                 ret = -ENOENT;
2835                 goto out;
2836         }
2837
2838         leaf = path->nodes[0];
2839         dev_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_item);
2840
2841         btrfs_set_device_id(leaf, dev_item, device->devid);
2842         btrfs_set_device_type(leaf, dev_item, device->type);
2843         btrfs_set_device_io_align(leaf, dev_item, device->io_align);
2844         btrfs_set_device_io_width(leaf, dev_item, device->io_width);
2845         btrfs_set_device_sector_size(leaf, dev_item, device->sector_size);
2846         btrfs_set_device_total_bytes(leaf, dev_item,
2847                                      btrfs_device_get_disk_total_bytes(device));
2848         btrfs_set_device_bytes_used(leaf, dev_item,
2849                                     btrfs_device_get_bytes_used(device));
2850         btrfs_mark_buffer_dirty(leaf);
2851
2852 out:
2853         btrfs_free_path(path);
2854         return ret;
2855 }
2856
2857 int btrfs_grow_device(struct btrfs_trans_handle *trans,
2858                       struct btrfs_device *device, u64 new_size)
2859 {
2860         struct btrfs_fs_info *fs_info = device->fs_info;
2861         struct btrfs_super_block *super_copy = fs_info->super_copy;
2862         u64 old_total;
2863         u64 diff;
2864
2865         if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
2866                 return -EACCES;
2867
2868         new_size = round_down(new_size, fs_info->sectorsize);
2869
2870         mutex_lock(&fs_info->chunk_mutex);
2871         old_total = btrfs_super_total_bytes(super_copy);
2872         diff = round_down(new_size - device->total_bytes, fs_info->sectorsize);
2873
2874         if (new_size <= device->total_bytes ||
2875             test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
2876                 mutex_unlock(&fs_info->chunk_mutex);
2877                 return -EINVAL;
2878         }
2879
2880         btrfs_set_super_total_bytes(super_copy,
2881                         round_down(old_total + diff, fs_info->sectorsize));
2882         device->fs_devices->total_rw_bytes += diff;
2883
2884         btrfs_device_set_total_bytes(device, new_size);
2885         btrfs_device_set_disk_total_bytes(device, new_size);
2886         btrfs_clear_space_info_full(device->fs_info);
2887         if (list_empty(&device->post_commit_list))
2888                 list_add_tail(&device->post_commit_list,
2889                               &trans->transaction->dev_update_list);
2890         mutex_unlock(&fs_info->chunk_mutex);
2891
2892         return btrfs_update_device(trans, device);
2893 }
2894
2895 static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
2896 {
2897         struct btrfs_fs_info *fs_info = trans->fs_info;
2898         struct btrfs_root *root = fs_info->chunk_root;
2899         int ret;
2900         struct btrfs_path *path;
2901         struct btrfs_key key;
2902
2903         path = btrfs_alloc_path();
2904         if (!path)
2905                 return -ENOMEM;
2906
2907         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
2908         key.offset = chunk_offset;
2909         key.type = BTRFS_CHUNK_ITEM_KEY;
2910
2911         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
2912         if (ret < 0)
2913                 goto out;
2914         else if (ret > 0) { /* Logic error or corruption */
2915                 btrfs_handle_fs_error(fs_info, -ENOENT,
2916                                       "Failed lookup while freeing chunk.");
2917                 ret = -ENOENT;
2918                 goto out;
2919         }
2920
2921         ret = btrfs_del_item(trans, root, path);
2922         if (ret < 0)
2923                 btrfs_handle_fs_error(fs_info, ret,
2924                                       "Failed to delete chunk item.");
2925 out:
2926         btrfs_free_path(path);
2927         return ret;
2928 }
2929
2930 static int btrfs_del_sys_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
2931 {
2932         struct btrfs_super_block *super_copy = fs_info->super_copy;
2933         struct btrfs_disk_key *disk_key;
2934         struct btrfs_chunk *chunk;
2935         u8 *ptr;
2936         int ret = 0;
2937         u32 num_stripes;
2938         u32 array_size;
2939         u32 len = 0;
2940         u32 cur;
2941         struct btrfs_key key;
2942
2943         mutex_lock(&fs_info->chunk_mutex);
2944         array_size = btrfs_super_sys_array_size(super_copy);
2945
2946         ptr = super_copy->sys_chunk_array;
2947         cur = 0;
2948
2949         while (cur < array_size) {
2950                 disk_key = (struct btrfs_disk_key *)ptr;
2951                 btrfs_disk_key_to_cpu(&key, disk_key);
2952
2953                 len = sizeof(*disk_key);
2954
2955                 if (key.type == BTRFS_CHUNK_ITEM_KEY) {
2956                         chunk = (struct btrfs_chunk *)(ptr + len);
2957                         num_stripes = btrfs_stack_chunk_num_stripes(chunk);
2958                         len += btrfs_chunk_item_size(num_stripes);
2959                 } else {
2960                         ret = -EIO;
2961                         break;
2962                 }
2963                 if (key.objectid == BTRFS_FIRST_CHUNK_TREE_OBJECTID &&
2964                     key.offset == chunk_offset) {
2965                         memmove(ptr, ptr + len, array_size - (cur + len));
2966                         array_size -= len;
2967                         btrfs_set_super_sys_array_size(super_copy, array_size);
2968                 } else {
2969                         ptr += len;
2970                         cur += len;
2971                 }
2972         }
2973         mutex_unlock(&fs_info->chunk_mutex);
2974         return ret;
2975 }
2976
2977 /*
2978  * btrfs_get_chunk_map() - Find the mapping containing the given logical extent.
2979  * @logical: Logical block offset in bytes.
2980  * @length: Length of extent in bytes.
2981  *
2982  * Return: Chunk mapping or ERR_PTR.
2983  */
2984 struct extent_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info,
2985                                        u64 logical, u64 length)
2986 {
2987         struct extent_map_tree *em_tree;
2988         struct extent_map *em;
2989
2990         em_tree = &fs_info->mapping_tree;
2991         read_lock(&em_tree->lock);
2992         em = lookup_extent_mapping(em_tree, logical, length);
2993         read_unlock(&em_tree->lock);
2994
2995         if (!em) {
2996                 btrfs_crit(fs_info,
2997                            "unable to find chunk map for logical %llu length %llu",
2998                            logical, length);
2999                 return ERR_PTR(-EINVAL);
3000         }
3001
3002         if (em->start > logical || em->start + em->len <= logical) {
3003                 btrfs_crit(fs_info,
3004                            "found a bad chunk map, wanted %llu-%llu, found %llu-%llu",
3005                            logical, logical + length, em->start, em->start + em->len);
3006                 free_extent_map(em);
3007                 return ERR_PTR(-EINVAL);
3008         }
3009
3010         /* callers are responsible for dropping em's ref. */
3011         return em;
3012 }
3013
3014 int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset)
3015 {
3016         struct btrfs_fs_info *fs_info = trans->fs_info;
3017         struct extent_map *em;
3018         struct map_lookup *map;
3019         u64 dev_extent_len = 0;
3020         int i, ret = 0;
3021         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
3022
3023         em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
3024         if (IS_ERR(em)) {
3025                 /*
3026                  * This is a logic error, but we don't want to just rely on the
3027                  * user having built with ASSERT enabled, so if ASSERT doesn't
3028                  * do anything we still error out.
3029                  */
3030                 ASSERT(0);
3031                 return PTR_ERR(em);
3032         }
3033         map = em->map_lookup;
3034         mutex_lock(&fs_info->chunk_mutex);
3035         check_system_chunk(trans, map->type);
3036         mutex_unlock(&fs_info->chunk_mutex);
3037
3038         /*
3039          * Take the device list mutex to prevent races with the final phase of
3040          * a device replace operation that replaces the device object associated
3041          * with map stripes (dev-replace.c:btrfs_dev_replace_finishing()).
3042          */
3043         mutex_lock(&fs_devices->device_list_mutex);
3044         for (i = 0; i < map->num_stripes; i++) {
3045                 struct btrfs_device *device = map->stripes[i].dev;
3046                 ret = btrfs_free_dev_extent(trans, device,
3047                                             map->stripes[i].physical,
3048                                             &dev_extent_len);
3049                 if (ret) {
3050                         mutex_unlock(&fs_devices->device_list_mutex);
3051                         btrfs_abort_transaction(trans, ret);
3052                         goto out;
3053                 }
3054
3055                 if (device->bytes_used > 0) {
3056                         mutex_lock(&fs_info->chunk_mutex);
3057                         btrfs_device_set_bytes_used(device,
3058                                         device->bytes_used - dev_extent_len);
3059                         atomic64_add(dev_extent_len, &fs_info->free_chunk_space);
3060                         btrfs_clear_space_info_full(fs_info);
3061                         mutex_unlock(&fs_info->chunk_mutex);
3062                 }
3063
3064                 ret = btrfs_update_device(trans, device);
3065                 if (ret) {
3066                         mutex_unlock(&fs_devices->device_list_mutex);
3067                         btrfs_abort_transaction(trans, ret);
3068                         goto out;
3069                 }
3070         }
3071         mutex_unlock(&fs_devices->device_list_mutex);
3072
3073         ret = btrfs_free_chunk(trans, chunk_offset);
3074         if (ret) {
3075                 btrfs_abort_transaction(trans, ret);
3076                 goto out;
3077         }
3078
3079         trace_btrfs_chunk_free(fs_info, map, chunk_offset, em->len);
3080
3081         if (map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
3082                 ret = btrfs_del_sys_chunk(fs_info, chunk_offset);
3083                 if (ret) {
3084                         btrfs_abort_transaction(trans, ret);
3085                         goto out;
3086                 }
3087         }
3088
3089         ret = btrfs_remove_block_group(trans, chunk_offset, em);
3090         if (ret) {
3091                 btrfs_abort_transaction(trans, ret);
3092                 goto out;
3093         }
3094
3095 out:
3096         /* once for us */
3097         free_extent_map(em);
3098         return ret;
3099 }
3100
3101 static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset)
3102 {
3103         struct btrfs_root *root = fs_info->chunk_root;
3104         struct btrfs_trans_handle *trans;
3105         struct btrfs_block_group *block_group;
3106         int ret;
3107
3108         /*
3109          * Prevent races with automatic removal of unused block groups.
3110          * After we relocate and before we remove the chunk with offset
3111          * chunk_offset, automatic removal of the block group can kick in,
3112          * resulting in a failure when calling btrfs_remove_chunk() below.
3113          *
3114          * Make sure to acquire this mutex before doing a tree search (dev
3115          * or chunk trees) to find chunks. Otherwise the cleaner kthread might
3116          * call btrfs_remove_chunk() (through btrfs_delete_unused_bgs()) after
3117          * we release the path used to search the chunk/dev tree and before
3118          * the current task acquires this mutex and calls us.
3119          */
3120         lockdep_assert_held(&fs_info->delete_unused_bgs_mutex);
3121
3122         /* step one, relocate all the extents inside this chunk */
3123         btrfs_scrub_pause(fs_info);
3124         ret = btrfs_relocate_block_group(fs_info, chunk_offset);
3125         btrfs_scrub_continue(fs_info);
3126         if (ret)
3127                 return ret;
3128
3129         block_group = btrfs_lookup_block_group(fs_info, chunk_offset);
3130         if (!block_group)
3131                 return -ENOENT;
3132         btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group);
3133         btrfs_put_block_group(block_group);
3134
3135         trans = btrfs_start_trans_remove_block_group(root->fs_info,
3136                                                      chunk_offset);
3137         if (IS_ERR(trans)) {
3138                 ret = PTR_ERR(trans);
3139                 btrfs_handle_fs_error(root->fs_info, ret, NULL);
3140                 return ret;
3141         }
3142
3143         /*
3144          * step two, delete the device extents and the
3145          * chunk tree entries
3146          */
3147         ret = btrfs_remove_chunk(trans, chunk_offset);
3148         btrfs_end_transaction(trans);
3149         return ret;
3150 }
3151
3152 static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info)
3153 {
3154         struct btrfs_root *chunk_root = fs_info->chunk_root;
3155         struct btrfs_path *path;
3156         struct extent_buffer *leaf;
3157         struct btrfs_chunk *chunk;
3158         struct btrfs_key key;
3159         struct btrfs_key found_key;
3160         u64 chunk_type;
3161         bool retried = false;
3162         int failed = 0;
3163         int ret;
3164
3165         path = btrfs_alloc_path();
3166         if (!path)
3167                 return -ENOMEM;
3168
3169 again:
3170         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3171         key.offset = (u64)-1;
3172         key.type = BTRFS_CHUNK_ITEM_KEY;
3173
3174         while (1) {
3175                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
3176                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3177                 if (ret < 0) {
3178                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3179                         goto error;
3180                 }
3181                 if (ret == 0) {
3182                         /*
3183                          * On the first search we would find chunk tree with
3184                          * offset -1, which is not possible. On subsequent
3185                          * loops this would find an existing item on an invalid
3186                          * offset (one less than the previous one, wrong
3187                          * alignment and size).
3188                          */
3189                         ret = -EUCLEAN;
3190                         goto error;
3191                 }
3192
3193                 ret = btrfs_previous_item(chunk_root, path, key.objectid,
3194                                           key.type);
3195                 if (ret)
3196                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3197                 if (ret < 0)
3198                         goto error;
3199                 if (ret > 0)
3200                         break;
3201
3202                 leaf = path->nodes[0];
3203                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
3204
3205                 chunk = btrfs_item_ptr(leaf, path->slots[0],
3206                                        struct btrfs_chunk);
3207                 chunk_type = btrfs_chunk_type(leaf, chunk);
3208                 btrfs_release_path(path);
3209
3210                 if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) {
3211                         ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3212                         if (ret == -ENOSPC)
3213                                 failed++;
3214                         else
3215                                 BUG_ON(ret);
3216                 }
3217                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3218
3219                 if (found_key.offset == 0)
3220                         break;
3221                 key.offset = found_key.offset - 1;
3222         }
3223         ret = 0;
3224         if (failed && !retried) {
3225                 failed = 0;
3226                 retried = true;
3227                 goto again;
3228         } else if (WARN_ON(failed && retried)) {
3229                 ret = -ENOSPC;
3230         }
3231 error:
3232         btrfs_free_path(path);
3233         return ret;
3234 }
3235
3236 /*
3237  * return 1 : allocate a data chunk successfully,
3238  * return <0: errors during allocating a data chunk,
3239  * return 0 : no need to allocate a data chunk.
3240  */
3241 static int btrfs_may_alloc_data_chunk(struct btrfs_fs_info *fs_info,
3242                                       u64 chunk_offset)
3243 {
3244         struct btrfs_block_group *cache;
3245         u64 bytes_used;
3246         u64 chunk_type;
3247
3248         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3249         ASSERT(cache);
3250         chunk_type = cache->flags;
3251         btrfs_put_block_group(cache);
3252
3253         if (!(chunk_type & BTRFS_BLOCK_GROUP_DATA))
3254                 return 0;
3255
3256         spin_lock(&fs_info->data_sinfo->lock);
3257         bytes_used = fs_info->data_sinfo->bytes_used;
3258         spin_unlock(&fs_info->data_sinfo->lock);
3259
3260         if (!bytes_used) {
3261                 struct btrfs_trans_handle *trans;
3262                 int ret;
3263
3264                 trans = btrfs_join_transaction(fs_info->tree_root);
3265                 if (IS_ERR(trans))
3266                         return PTR_ERR(trans);
3267
3268                 ret = btrfs_force_chunk_alloc(trans, BTRFS_BLOCK_GROUP_DATA);
3269                 btrfs_end_transaction(trans);
3270                 if (ret < 0)
3271                         return ret;
3272                 return 1;
3273         }
3274
3275         return 0;
3276 }
3277
3278 static int insert_balance_item(struct btrfs_fs_info *fs_info,
3279                                struct btrfs_balance_control *bctl)
3280 {
3281         struct btrfs_root *root = fs_info->tree_root;
3282         struct btrfs_trans_handle *trans;
3283         struct btrfs_balance_item *item;
3284         struct btrfs_disk_balance_args disk_bargs;
3285         struct btrfs_path *path;
3286         struct extent_buffer *leaf;
3287         struct btrfs_key key;
3288         int ret, err;
3289
3290         path = btrfs_alloc_path();
3291         if (!path)
3292                 return -ENOMEM;
3293
3294         trans = btrfs_start_transaction(root, 0);
3295         if (IS_ERR(trans)) {
3296                 btrfs_free_path(path);
3297                 return PTR_ERR(trans);
3298         }
3299
3300         key.objectid = BTRFS_BALANCE_OBJECTID;
3301         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3302         key.offset = 0;
3303
3304         ret = btrfs_insert_empty_item(trans, root, path, &key,
3305                                       sizeof(*item));
3306         if (ret)
3307                 goto out;
3308
3309         leaf = path->nodes[0];
3310         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
3311
3312         memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
3313
3314         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->data);
3315         btrfs_set_balance_data(leaf, item, &disk_bargs);
3316         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->meta);
3317         btrfs_set_balance_meta(leaf, item, &disk_bargs);
3318         btrfs_cpu_balance_args_to_disk(&disk_bargs, &bctl->sys);
3319         btrfs_set_balance_sys(leaf, item, &disk_bargs);
3320
3321         btrfs_set_balance_flags(leaf, item, bctl->flags);
3322
3323         btrfs_mark_buffer_dirty(leaf);
3324 out:
3325         btrfs_free_path(path);
3326         err = btrfs_commit_transaction(trans);
3327         if (err && !ret)
3328                 ret = err;
3329         return ret;
3330 }
3331
3332 static int del_balance_item(struct btrfs_fs_info *fs_info)
3333 {
3334         struct btrfs_root *root = fs_info->tree_root;
3335         struct btrfs_trans_handle *trans;
3336         struct btrfs_path *path;
3337         struct btrfs_key key;
3338         int ret, err;
3339
3340         path = btrfs_alloc_path();
3341         if (!path)
3342                 return -ENOMEM;
3343
3344         trans = btrfs_start_transaction_fallback_global_rsv(root, 0);
3345         if (IS_ERR(trans)) {
3346                 btrfs_free_path(path);
3347                 return PTR_ERR(trans);
3348         }
3349
3350         key.objectid = BTRFS_BALANCE_OBJECTID;
3351         key.type = BTRFS_TEMPORARY_ITEM_KEY;
3352         key.offset = 0;
3353
3354         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3355         if (ret < 0)
3356                 goto out;
3357         if (ret > 0) {
3358                 ret = -ENOENT;
3359                 goto out;
3360         }
3361
3362         ret = btrfs_del_item(trans, root, path);
3363 out:
3364         btrfs_free_path(path);
3365         err = btrfs_commit_transaction(trans);
3366         if (err && !ret)
3367                 ret = err;
3368         return ret;
3369 }
3370
3371 /*
3372  * This is a heuristic used to reduce the number of chunks balanced on
3373  * resume after balance was interrupted.
3374  */
3375 static void update_balance_args(struct btrfs_balance_control *bctl)
3376 {
3377         /*
3378          * Turn on soft mode for chunk types that were being converted.
3379          */
3380         if (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)
3381                 bctl->data.flags |= BTRFS_BALANCE_ARGS_SOFT;
3382         if (bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)
3383                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_SOFT;
3384         if (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)
3385                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_SOFT;
3386
3387         /*
3388          * Turn on usage filter if is not already used.  The idea is
3389          * that chunks that we have already balanced should be
3390          * reasonably full.  Don't do it for chunks that are being
3391          * converted - that will keep us from relocating unconverted
3392          * (albeit full) chunks.
3393          */
3394         if (!(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3395             !(bctl->data.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3396             !(bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3397                 bctl->data.flags |= BTRFS_BALANCE_ARGS_USAGE;
3398                 bctl->data.usage = 90;
3399         }
3400         if (!(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3401             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3402             !(bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3403                 bctl->sys.flags |= BTRFS_BALANCE_ARGS_USAGE;
3404                 bctl->sys.usage = 90;
3405         }
3406         if (!(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE) &&
3407             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3408             !(bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT)) {
3409                 bctl->meta.flags |= BTRFS_BALANCE_ARGS_USAGE;
3410                 bctl->meta.usage = 90;
3411         }
3412 }
3413
3414 /*
3415  * Clear the balance status in fs_info and delete the balance item from disk.
3416  */
3417 static void reset_balance_state(struct btrfs_fs_info *fs_info)
3418 {
3419         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3420         int ret;
3421
3422         BUG_ON(!fs_info->balance_ctl);
3423
3424         spin_lock(&fs_info->balance_lock);
3425         fs_info->balance_ctl = NULL;
3426         spin_unlock(&fs_info->balance_lock);
3427
3428         kfree(bctl);
3429         ret = del_balance_item(fs_info);
3430         if (ret)
3431                 btrfs_handle_fs_error(fs_info, ret, NULL);
3432 }
3433
3434 /*
3435  * Balance filters.  Return 1 if chunk should be filtered out
3436  * (should not be balanced).
3437  */
3438 static int chunk_profiles_filter(u64 chunk_type,
3439                                  struct btrfs_balance_args *bargs)
3440 {
3441         chunk_type = chunk_to_extended(chunk_type) &
3442                                 BTRFS_EXTENDED_PROFILE_MASK;
3443
3444         if (bargs->profiles & chunk_type)
3445                 return 0;
3446
3447         return 1;
3448 }
3449
3450 static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset,
3451                               struct btrfs_balance_args *bargs)
3452 {
3453         struct btrfs_block_group *cache;
3454         u64 chunk_used;
3455         u64 user_thresh_min;
3456         u64 user_thresh_max;
3457         int ret = 1;
3458
3459         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3460         chunk_used = cache->used;
3461
3462         if (bargs->usage_min == 0)
3463                 user_thresh_min = 0;
3464         else
3465                 user_thresh_min = div_factor_fine(cache->length,
3466                                                   bargs->usage_min);
3467
3468         if (bargs->usage_max == 0)
3469                 user_thresh_max = 1;
3470         else if (bargs->usage_max > 100)
3471                 user_thresh_max = cache->length;
3472         else
3473                 user_thresh_max = div_factor_fine(cache->length,
3474                                                   bargs->usage_max);
3475
3476         if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max)
3477                 ret = 0;
3478
3479         btrfs_put_block_group(cache);
3480         return ret;
3481 }
3482
3483 static int chunk_usage_filter(struct btrfs_fs_info *fs_info,
3484                 u64 chunk_offset, struct btrfs_balance_args *bargs)
3485 {
3486         struct btrfs_block_group *cache;
3487         u64 chunk_used, user_thresh;
3488         int ret = 1;
3489
3490         cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3491         chunk_used = cache->used;
3492
3493         if (bargs->usage_min == 0)
3494                 user_thresh = 1;
3495         else if (bargs->usage > 100)
3496                 user_thresh = cache->length;
3497         else
3498                 user_thresh = div_factor_fine(cache->length, bargs->usage);
3499
3500         if (chunk_used < user_thresh)
3501                 ret = 0;
3502
3503         btrfs_put_block_group(cache);
3504         return ret;
3505 }
3506
3507 static int chunk_devid_filter(struct extent_buffer *leaf,
3508                               struct btrfs_chunk *chunk,
3509                               struct btrfs_balance_args *bargs)
3510 {
3511         struct btrfs_stripe *stripe;
3512         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3513         int i;
3514
3515         for (i = 0; i < num_stripes; i++) {
3516                 stripe = btrfs_stripe_nr(chunk, i);
3517                 if (btrfs_stripe_devid(leaf, stripe) == bargs->devid)
3518                         return 0;
3519         }
3520
3521         return 1;
3522 }
3523
3524 static u64 calc_data_stripes(u64 type, int num_stripes)
3525 {
3526         const int index = btrfs_bg_flags_to_raid_index(type);
3527         const int ncopies = btrfs_raid_array[index].ncopies;
3528         const int nparity = btrfs_raid_array[index].nparity;
3529
3530         if (nparity)
3531                 return num_stripes - nparity;
3532         else
3533                 return num_stripes / ncopies;
3534 }
3535
3536 /* [pstart, pend) */
3537 static int chunk_drange_filter(struct extent_buffer *leaf,
3538                                struct btrfs_chunk *chunk,
3539                                struct btrfs_balance_args *bargs)
3540 {
3541         struct btrfs_stripe *stripe;
3542         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3543         u64 stripe_offset;
3544         u64 stripe_length;
3545         u64 type;
3546         int factor;
3547         int i;
3548
3549         if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID))
3550                 return 0;
3551
3552         type = btrfs_chunk_type(leaf, chunk);
3553         factor = calc_data_stripes(type, num_stripes);
3554
3555         for (i = 0; i < num_stripes; i++) {
3556                 stripe = btrfs_stripe_nr(chunk, i);
3557                 if (btrfs_stripe_devid(leaf, stripe) != bargs->devid)
3558                         continue;
3559
3560                 stripe_offset = btrfs_stripe_offset(leaf, stripe);
3561                 stripe_length = btrfs_chunk_length(leaf, chunk);
3562                 stripe_length = div_u64(stripe_length, factor);
3563
3564                 if (stripe_offset < bargs->pend &&
3565                     stripe_offset + stripe_length > bargs->pstart)
3566                         return 0;
3567         }
3568
3569         return 1;
3570 }
3571
3572 /* [vstart, vend) */
3573 static int chunk_vrange_filter(struct extent_buffer *leaf,
3574                                struct btrfs_chunk *chunk,
3575                                u64 chunk_offset,
3576                                struct btrfs_balance_args *bargs)
3577 {
3578         if (chunk_offset < bargs->vend &&
3579             chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart)
3580                 /* at least part of the chunk is inside this vrange */
3581                 return 0;
3582
3583         return 1;
3584 }
3585
3586 static int chunk_stripes_range_filter(struct extent_buffer *leaf,
3587                                struct btrfs_chunk *chunk,
3588                                struct btrfs_balance_args *bargs)
3589 {
3590         int num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
3591
3592         if (bargs->stripes_min <= num_stripes
3593                         && num_stripes <= bargs->stripes_max)
3594                 return 0;
3595
3596         return 1;
3597 }
3598
3599 static int chunk_soft_convert_filter(u64 chunk_type,
3600                                      struct btrfs_balance_args *bargs)
3601 {
3602         if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3603                 return 0;
3604
3605         chunk_type = chunk_to_extended(chunk_type) &
3606                                 BTRFS_EXTENDED_PROFILE_MASK;
3607
3608         if (bargs->target == chunk_type)
3609                 return 1;
3610
3611         return 0;
3612 }
3613
3614 static int should_balance_chunk(struct extent_buffer *leaf,
3615                                 struct btrfs_chunk *chunk, u64 chunk_offset)
3616 {
3617         struct btrfs_fs_info *fs_info = leaf->fs_info;
3618         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3619         struct btrfs_balance_args *bargs = NULL;
3620         u64 chunk_type = btrfs_chunk_type(leaf, chunk);
3621
3622         /* type filter */
3623         if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) &
3624               (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) {
3625                 return 0;
3626         }
3627
3628         if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3629                 bargs = &bctl->data;
3630         else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3631                 bargs = &bctl->sys;
3632         else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3633                 bargs = &bctl->meta;
3634
3635         /* profiles filter */
3636         if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) &&
3637             chunk_profiles_filter(chunk_type, bargs)) {
3638                 return 0;
3639         }
3640
3641         /* usage filter */
3642         if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) &&
3643             chunk_usage_filter(fs_info, chunk_offset, bargs)) {
3644                 return 0;
3645         } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) &&
3646             chunk_usage_range_filter(fs_info, chunk_offset, bargs)) {
3647                 return 0;
3648         }
3649
3650         /* devid filter */
3651         if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) &&
3652             chunk_devid_filter(leaf, chunk, bargs)) {
3653                 return 0;
3654         }
3655
3656         /* drange filter, makes sense only with devid filter */
3657         if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) &&
3658             chunk_drange_filter(leaf, chunk, bargs)) {
3659                 return 0;
3660         }
3661
3662         /* vrange filter */
3663         if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) &&
3664             chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) {
3665                 return 0;
3666         }
3667
3668         /* stripes filter */
3669         if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) &&
3670             chunk_stripes_range_filter(leaf, chunk, bargs)) {
3671                 return 0;
3672         }
3673
3674         /* soft profile changing mode */
3675         if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) &&
3676             chunk_soft_convert_filter(chunk_type, bargs)) {
3677                 return 0;
3678         }
3679
3680         /*
3681          * limited by count, must be the last filter
3682          */
3683         if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) {
3684                 if (bargs->limit == 0)
3685                         return 0;
3686                 else
3687                         bargs->limit--;
3688         } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) {
3689                 /*
3690                  * Same logic as the 'limit' filter; the minimum cannot be
3691                  * determined here because we do not have the global information
3692                  * about the count of all chunks that satisfy the filters.
3693                  */
3694                 if (bargs->limit_max == 0)
3695                         return 0;
3696                 else
3697                         bargs->limit_max--;
3698         }
3699
3700         return 1;
3701 }
3702
3703 static int __btrfs_balance(struct btrfs_fs_info *fs_info)
3704 {
3705         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3706         struct btrfs_root *chunk_root = fs_info->chunk_root;
3707         u64 chunk_type;
3708         struct btrfs_chunk *chunk;
3709         struct btrfs_path *path = NULL;
3710         struct btrfs_key key;
3711         struct btrfs_key found_key;
3712         struct extent_buffer *leaf;
3713         int slot;
3714         int ret;
3715         int enospc_errors = 0;
3716         bool counting = true;
3717         /* The single value limit and min/max limits use the same bytes in the */
3718         u64 limit_data = bctl->data.limit;
3719         u64 limit_meta = bctl->meta.limit;
3720         u64 limit_sys = bctl->sys.limit;
3721         u32 count_data = 0;
3722         u32 count_meta = 0;
3723         u32 count_sys = 0;
3724         int chunk_reserved = 0;
3725
3726         path = btrfs_alloc_path();
3727         if (!path) {
3728                 ret = -ENOMEM;
3729                 goto error;
3730         }
3731
3732         /* zero out stat counters */
3733         spin_lock(&fs_info->balance_lock);
3734         memset(&bctl->stat, 0, sizeof(bctl->stat));
3735         spin_unlock(&fs_info->balance_lock);
3736 again:
3737         if (!counting) {
3738                 /*
3739                  * The single value limit and min/max limits use the same bytes
3740                  * in the
3741                  */
3742                 bctl->data.limit = limit_data;
3743                 bctl->meta.limit = limit_meta;
3744                 bctl->sys.limit = limit_sys;
3745         }
3746         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
3747         key.offset = (u64)-1;
3748         key.type = BTRFS_CHUNK_ITEM_KEY;
3749
3750         while (1) {
3751                 if ((!counting && atomic_read(&fs_info->balance_pause_req)) ||
3752                     atomic_read(&fs_info->balance_cancel_req)) {
3753                         ret = -ECANCELED;
3754                         goto error;
3755                 }
3756
3757                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
3758                 ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0);
3759                 if (ret < 0) {
3760                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3761                         goto error;
3762                 }
3763
3764                 /*
3765                  * this shouldn't happen, it means the last relocate
3766                  * failed
3767                  */
3768                 if (ret == 0)
3769                         BUG(); /* FIXME break ? */
3770
3771                 ret = btrfs_previous_item(chunk_root, path, 0,
3772                                           BTRFS_CHUNK_ITEM_KEY);
3773                 if (ret) {
3774                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3775                         ret = 0;
3776                         break;
3777                 }
3778
3779                 leaf = path->nodes[0];
3780                 slot = path->slots[0];
3781                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
3782
3783                 if (found_key.objectid != key.objectid) {
3784                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3785                         break;
3786                 }
3787
3788                 chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
3789                 chunk_type = btrfs_chunk_type(leaf, chunk);
3790
3791                 if (!counting) {
3792                         spin_lock(&fs_info->balance_lock);
3793                         bctl->stat.considered++;
3794                         spin_unlock(&fs_info->balance_lock);
3795                 }
3796
3797                 ret = should_balance_chunk(leaf, chunk, found_key.offset);
3798
3799                 btrfs_release_path(path);
3800                 if (!ret) {
3801                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3802                         goto loop;
3803                 }
3804
3805                 if (counting) {
3806                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3807                         spin_lock(&fs_info->balance_lock);
3808                         bctl->stat.expected++;
3809                         spin_unlock(&fs_info->balance_lock);
3810
3811                         if (chunk_type & BTRFS_BLOCK_GROUP_DATA)
3812                                 count_data++;
3813                         else if (chunk_type & BTRFS_BLOCK_GROUP_SYSTEM)
3814                                 count_sys++;
3815                         else if (chunk_type & BTRFS_BLOCK_GROUP_METADATA)
3816                                 count_meta++;
3817
3818                         goto loop;
3819                 }
3820
3821                 /*
3822                  * Apply limit_min filter, no need to check if the LIMITS
3823                  * filter is used, limit_min is 0 by default
3824                  */
3825                 if (((chunk_type & BTRFS_BLOCK_GROUP_DATA) &&
3826                                         count_data < bctl->data.limit_min)
3827                                 || ((chunk_type & BTRFS_BLOCK_GROUP_METADATA) &&
3828                                         count_meta < bctl->meta.limit_min)
3829                                 || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) &&
3830                                         count_sys < bctl->sys.limit_min)) {
3831                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3832                         goto loop;
3833                 }
3834
3835                 if (!chunk_reserved) {
3836                         /*
3837                          * We may be relocating the only data chunk we have,
3838                          * which could potentially end up with losing data's
3839                          * raid profile, so lets allocate an empty one in
3840                          * advance.
3841                          */
3842                         ret = btrfs_may_alloc_data_chunk(fs_info,
3843                                                          found_key.offset);
3844                         if (ret < 0) {
3845                                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3846                                 goto error;
3847                         } else if (ret == 1) {
3848                                 chunk_reserved = 1;
3849                         }
3850                 }
3851
3852                 ret = btrfs_relocate_chunk(fs_info, found_key.offset);
3853                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
3854                 if (ret == -ENOSPC) {
3855                         enospc_errors++;
3856                 } else if (ret == -ETXTBSY) {
3857                         btrfs_info(fs_info,
3858            "skipping relocation of block group %llu due to active swapfile",
3859                                    found_key.offset);
3860                         ret = 0;
3861                 } else if (ret) {
3862                         goto error;
3863                 } else {
3864                         spin_lock(&fs_info->balance_lock);
3865                         bctl->stat.completed++;
3866                         spin_unlock(&fs_info->balance_lock);
3867                 }
3868 loop:
3869                 if (found_key.offset == 0)
3870                         break;
3871                 key.offset = found_key.offset - 1;
3872         }
3873
3874         if (counting) {
3875                 btrfs_release_path(path);
3876                 counting = false;
3877                 goto again;
3878         }
3879 error:
3880         btrfs_free_path(path);
3881         if (enospc_errors) {
3882                 btrfs_info(fs_info, "%d enospc errors during balance",
3883                            enospc_errors);
3884                 if (!ret)
3885                         ret = -ENOSPC;
3886         }
3887
3888         return ret;
3889 }
3890
3891 /**
3892  * alloc_profile_is_valid - see if a given profile is valid and reduced
3893  * @flags: profile to validate
3894  * @extended: if true @flags is treated as an extended profile
3895  */
3896 static int alloc_profile_is_valid(u64 flags, int extended)
3897 {
3898         u64 mask = (extended ? BTRFS_EXTENDED_PROFILE_MASK :
3899                                BTRFS_BLOCK_GROUP_PROFILE_MASK);
3900
3901         flags &= ~BTRFS_BLOCK_GROUP_TYPE_MASK;
3902
3903         /* 1) check that all other bits are zeroed */
3904         if (flags & ~mask)
3905                 return 0;
3906
3907         /* 2) see if profile is reduced */
3908         if (flags == 0)
3909                 return !extended; /* "0" is valid for usual profiles */
3910
3911         return has_single_bit_set(flags);
3912 }
3913
3914 static inline int balance_need_close(struct btrfs_fs_info *fs_info)
3915 {
3916         /* cancel requested || normal exit path */
3917         return atomic_read(&fs_info->balance_cancel_req) ||
3918                 (atomic_read(&fs_info->balance_pause_req) == 0 &&
3919                  atomic_read(&fs_info->balance_cancel_req) == 0);
3920 }
3921
3922 /*
3923  * Validate target profile against allowed profiles and return true if it's OK.
3924  * Otherwise print the error message and return false.
3925  */
3926 static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
3927                 const struct btrfs_balance_args *bargs,
3928                 u64 allowed, const char *type)
3929 {
3930         if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
3931                 return true;
3932
3933         /* Profile is valid and does not have bits outside of the allowed set */
3934         if (alloc_profile_is_valid(bargs->target, 1) &&
3935             (bargs->target & ~allowed) == 0)
3936                 return true;
3937
3938         btrfs_err(fs_info, "balance: invalid convert %s profile %s",
3939                         type, btrfs_bg_type_to_raid_name(bargs->target));
3940         return false;
3941 }
3942
3943 /*
3944  * Fill @buf with textual description of balance filter flags @bargs, up to
3945  * @size_buf including the terminating null. The output may be trimmed if it
3946  * does not fit into the provided buffer.
3947  */
3948 static void describe_balance_args(struct btrfs_balance_args *bargs, char *buf,
3949                                  u32 size_buf)
3950 {
3951         int ret;
3952         u32 size_bp = size_buf;
3953         char *bp = buf;
3954         u64 flags = bargs->flags;
3955         char tmp_buf[128] = {'\0'};
3956
3957         if (!flags)
3958                 return;
3959
3960 #define CHECK_APPEND_NOARG(a)                                           \
3961         do {                                                            \
3962                 ret = snprintf(bp, size_bp, (a));                       \
3963                 if (ret < 0 || ret >= size_bp)                          \
3964                         goto out_overflow;                              \
3965                 size_bp -= ret;                                         \
3966                 bp += ret;                                              \
3967         } while (0)
3968
3969 #define CHECK_APPEND_1ARG(a, v1)                                        \
3970         do {                                                            \
3971                 ret = snprintf(bp, size_bp, (a), (v1));                 \
3972                 if (ret < 0 || ret >= size_bp)                          \
3973                         goto out_overflow;                              \
3974                 size_bp -= ret;                                         \
3975                 bp += ret;                                              \
3976         } while (0)
3977
3978 #define CHECK_APPEND_2ARG(a, v1, v2)                                    \
3979         do {                                                            \
3980                 ret = snprintf(bp, size_bp, (a), (v1), (v2));           \
3981                 if (ret < 0 || ret >= size_bp)                          \
3982                         goto out_overflow;                              \
3983                 size_bp -= ret;                                         \
3984                 bp += ret;                                              \
3985         } while (0)
3986
3987         if (flags & BTRFS_BALANCE_ARGS_CONVERT)
3988                 CHECK_APPEND_1ARG("convert=%s,",
3989                                   btrfs_bg_type_to_raid_name(bargs->target));
3990
3991         if (flags & BTRFS_BALANCE_ARGS_SOFT)
3992                 CHECK_APPEND_NOARG("soft,");
3993
3994         if (flags & BTRFS_BALANCE_ARGS_PROFILES) {
3995                 btrfs_describe_block_groups(bargs->profiles, tmp_buf,
3996                                             sizeof(tmp_buf));
3997                 CHECK_APPEND_1ARG("profiles=%s,", tmp_buf);
3998         }
3999
4000         if (flags & BTRFS_BALANCE_ARGS_USAGE)
4001                 CHECK_APPEND_1ARG("usage=%llu,", bargs->usage);
4002
4003         if (flags & BTRFS_BALANCE_ARGS_USAGE_RANGE)
4004                 CHECK_APPEND_2ARG("usage=%u..%u,",
4005                                   bargs->usage_min, bargs->usage_max);
4006
4007         if (flags & BTRFS_BALANCE_ARGS_DEVID)
4008                 CHECK_APPEND_1ARG("devid=%llu,", bargs->devid);
4009
4010         if (flags & BTRFS_BALANCE_ARGS_DRANGE)
4011                 CHECK_APPEND_2ARG("drange=%llu..%llu,",
4012                                   bargs->pstart, bargs->pend);
4013
4014         if (flags & BTRFS_BALANCE_ARGS_VRANGE)
4015                 CHECK_APPEND_2ARG("vrange=%llu..%llu,",
4016                                   bargs->vstart, bargs->vend);
4017
4018         if (flags & BTRFS_BALANCE_ARGS_LIMIT)
4019                 CHECK_APPEND_1ARG("limit=%llu,", bargs->limit);
4020
4021         if (flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)
4022                 CHECK_APPEND_2ARG("limit=%u..%u,",
4023                                 bargs->limit_min, bargs->limit_max);
4024
4025         if (flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE)
4026                 CHECK_APPEND_2ARG("stripes=%u..%u,",
4027                                   bargs->stripes_min, bargs->stripes_max);
4028
4029 #undef CHECK_APPEND_2ARG
4030 #undef CHECK_APPEND_1ARG
4031 #undef CHECK_APPEND_NOARG
4032
4033 out_overflow:
4034
4035         if (size_bp < size_buf)
4036                 buf[size_buf - size_bp - 1] = '\0'; /* remove last , */
4037         else
4038                 buf[0] = '\0';
4039 }
4040
4041 static void describe_balance_start_or_resume(struct btrfs_fs_info *fs_info)
4042 {
4043         u32 size_buf = 1024;
4044         char tmp_buf[192] = {'\0'};
4045         char *buf;
4046         char *bp;
4047         u32 size_bp = size_buf;
4048         int ret;
4049         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
4050
4051         buf = kzalloc(size_buf, GFP_KERNEL);
4052         if (!buf)
4053                 return;
4054
4055         bp = buf;
4056
4057 #define CHECK_APPEND_1ARG(a, v1)                                        \
4058         do {                                                            \
4059                 ret = snprintf(bp, size_bp, (a), (v1));                 \
4060                 if (ret < 0 || ret >= size_bp)                          \
4061                         goto out_overflow;                              \
4062                 size_bp -= ret;                                         \
4063                 bp += ret;                                              \
4064         } while (0)
4065
4066         if (bctl->flags & BTRFS_BALANCE_FORCE)
4067                 CHECK_APPEND_1ARG("%s", "-f ");
4068
4069         if (bctl->flags & BTRFS_BALANCE_DATA) {
4070                 describe_balance_args(&bctl->data, tmp_buf, sizeof(tmp_buf));
4071                 CHECK_APPEND_1ARG("-d%s ", tmp_buf);
4072         }
4073
4074         if (bctl->flags & BTRFS_BALANCE_METADATA) {
4075                 describe_balance_args(&bctl->meta, tmp_buf, sizeof(tmp_buf));
4076                 CHECK_APPEND_1ARG("-m%s ", tmp_buf);
4077         }
4078
4079         if (bctl->flags & BTRFS_BALANCE_SYSTEM) {
4080                 describe_balance_args(&bctl->sys, tmp_buf, sizeof(tmp_buf));
4081                 CHECK_APPEND_1ARG("-s%s ", tmp_buf);
4082         }
4083
4084 #undef CHECK_APPEND_1ARG
4085
4086 out_overflow:
4087
4088         if (size_bp < size_buf)
4089                 buf[size_buf - size_bp - 1] = '\0'; /* remove last " " */
4090         btrfs_info(fs_info, "balance: %s %s",
4091                    (bctl->flags & BTRFS_BALANCE_RESUME) ?
4092                    "resume" : "start", buf);
4093
4094         kfree(buf);
4095 }
4096
4097 /*
4098  * Should be called with balance mutexe held
4099  */
4100 int btrfs_balance(struct btrfs_fs_info *fs_info,
4101                   struct btrfs_balance_control *bctl,
4102                   struct btrfs_ioctl_balance_args *bargs)
4103 {
4104         u64 meta_target, data_target;
4105         u64 allowed;
4106         int mixed = 0;
4107         int ret;
4108         u64 num_devices;
4109         unsigned seq;
4110         bool reducing_redundancy;
4111         int i;
4112
4113         if (btrfs_fs_closing(fs_info) ||
4114             atomic_read(&fs_info->balance_pause_req) ||
4115             btrfs_should_cancel_balance(fs_info)) {
4116                 ret = -EINVAL;
4117                 goto out;
4118         }
4119
4120         allowed = btrfs_super_incompat_flags(fs_info->super_copy);
4121         if (allowed & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS)
4122                 mixed = 1;
4123
4124         /*
4125          * In case of mixed groups both data and meta should be picked,
4126          * and identical options should be given for both of them.
4127          */
4128         allowed = BTRFS_BALANCE_DATA | BTRFS_BALANCE_METADATA;
4129         if (mixed && (bctl->flags & allowed)) {
4130                 if (!(bctl->flags & BTRFS_BALANCE_DATA) ||
4131                     !(bctl->flags & BTRFS_BALANCE_METADATA) ||
4132                     memcmp(&bctl->data, &bctl->meta, sizeof(bctl->data))) {
4133                         btrfs_err(fs_info,
4134           "balance: mixed groups data and metadata options must be the same");
4135                         ret = -EINVAL;
4136                         goto out;
4137                 }
4138         }
4139
4140         /*
4141          * rw_devices will not change at the moment, device add/delete/replace
4142          * are exclusive
4143          */
4144         num_devices = fs_info->fs_devices->rw_devices;
4145
4146         /*
4147          * SINGLE profile on-disk has no profile bit, but in-memory we have a
4148          * special bit for it, to make it easier to distinguish.  Thus we need
4149          * to set it manually, or balance would refuse the profile.
4150          */
4151         allowed = BTRFS_AVAIL_ALLOC_BIT_SINGLE;
4152         for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++)
4153                 if (num_devices >= btrfs_raid_array[i].devs_min)
4154                         allowed |= btrfs_raid_array[i].bg_flag;
4155
4156         if (!validate_convert_profile(fs_info, &bctl->data, allowed, "data") ||
4157             !validate_convert_profile(fs_info, &bctl->meta, allowed, "metadata") ||
4158             !validate_convert_profile(fs_info, &bctl->sys,  allowed, "system")) {
4159                 ret = -EINVAL;
4160                 goto out;
4161         }
4162
4163         /*
4164          * Allow to reduce metadata or system integrity only if force set for
4165          * profiles with redundancy (copies, parity)
4166          */
4167         allowed = 0;
4168         for (i = 0; i < ARRAY_SIZE(btrfs_raid_array); i++) {
4169                 if (btrfs_raid_array[i].ncopies >= 2 ||
4170                     btrfs_raid_array[i].tolerated_failures >= 1)
4171                         allowed |= btrfs_raid_array[i].bg_flag;
4172         }
4173         do {
4174                 seq = read_seqbegin(&fs_info->profiles_lock);
4175
4176                 if (((bctl->sys.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4177                      (fs_info->avail_system_alloc_bits & allowed) &&
4178                      !(bctl->sys.target & allowed)) ||
4179                     ((bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) &&
4180                      (fs_info->avail_metadata_alloc_bits & allowed) &&
4181                      !(bctl->meta.target & allowed)))
4182                         reducing_redundancy = true;
4183                 else
4184                         reducing_redundancy = false;
4185
4186                 /* if we're not converting, the target field is uninitialized */
4187                 meta_target = (bctl->meta.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
4188                         bctl->meta.target : fs_info->avail_metadata_alloc_bits;
4189                 data_target = (bctl->data.flags & BTRFS_BALANCE_ARGS_CONVERT) ?
4190                         bctl->data.target : fs_info->avail_data_alloc_bits;
4191         } while (read_seqretry(&fs_info->profiles_lock, seq));
4192
4193         if (reducing_redundancy) {
4194                 if (bctl->flags & BTRFS_BALANCE_FORCE) {
4195                         btrfs_info(fs_info,
4196                            "balance: force reducing metadata redundancy");
4197                 } else {
4198                         btrfs_err(fs_info,
4199         "balance: reduces metadata redundancy, use --force if you want this");
4200                         ret = -EINVAL;
4201                         goto out;
4202                 }
4203         }
4204
4205         if (btrfs_get_num_tolerated_disk_barrier_failures(meta_target) <
4206                 btrfs_get_num_tolerated_disk_barrier_failures(data_target)) {
4207                 btrfs_warn(fs_info,
4208         "balance: metadata profile %s has lower redundancy than data profile %s",
4209                                 btrfs_bg_type_to_raid_name(meta_target),
4210                                 btrfs_bg_type_to_raid_name(data_target));
4211         }
4212
4213         if (fs_info->send_in_progress) {
4214                 btrfs_warn_rl(fs_info,
4215 "cannot run balance while send operations are in progress (%d in progress)",
4216                               fs_info->send_in_progress);
4217                 ret = -EAGAIN;
4218                 goto out;
4219         }
4220
4221         ret = insert_balance_item(fs_info, bctl);
4222         if (ret && ret != -EEXIST)
4223                 goto out;
4224
4225         if (!(bctl->flags & BTRFS_BALANCE_RESUME)) {
4226                 BUG_ON(ret == -EEXIST);
4227                 BUG_ON(fs_info->balance_ctl);
4228                 spin_lock(&fs_info->balance_lock);
4229                 fs_info->balance_ctl = bctl;
4230                 spin_unlock(&fs_info->balance_lock);
4231         } else {
4232                 BUG_ON(ret != -EEXIST);
4233                 spin_lock(&fs_info->balance_lock);
4234                 update_balance_args(bctl);
4235                 spin_unlock(&fs_info->balance_lock);
4236         }
4237
4238         ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4239         set_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
4240         describe_balance_start_or_resume(fs_info);
4241         mutex_unlock(&fs_info->balance_mutex);
4242
4243         ret = __btrfs_balance(fs_info);
4244
4245         mutex_lock(&fs_info->balance_mutex);
4246         if (ret == -ECANCELED && atomic_read(&fs_info->balance_pause_req))
4247                 btrfs_info(fs_info, "balance: paused");
4248         /*
4249          * Balance can be canceled by:
4250          *
4251          * - Regular cancel request
4252          *   Then ret == -ECANCELED and balance_cancel_req > 0
4253          *
4254          * - Fatal signal to "btrfs" process
4255          *   Either the signal caught by wait_reserve_ticket() and callers
4256          *   got -EINTR, or caught by btrfs_should_cancel_balance() and
4257          *   got -ECANCELED.
4258          *   Either way, in this case balance_cancel_req = 0, and
4259          *   ret == -EINTR or ret == -ECANCELED.
4260          *
4261          * So here we only check the return value to catch canceled balance.
4262          */
4263         else if (ret == -ECANCELED || ret == -EINTR)
4264                 btrfs_info(fs_info, "balance: canceled");
4265         else
4266                 btrfs_info(fs_info, "balance: ended with status: %d", ret);
4267
4268         clear_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags);
4269
4270         if (bargs) {
4271                 memset(bargs, 0, sizeof(*bargs));
4272                 btrfs_update_ioctl_balance_args(fs_info, bargs);
4273         }
4274
4275         if ((ret && ret != -ECANCELED && ret != -ENOSPC) ||
4276             balance_need_close(fs_info)) {
4277                 reset_balance_state(fs_info);
4278                 btrfs_exclop_finish(fs_info);
4279         }
4280
4281         wake_up(&fs_info->balance_wait_q);
4282
4283         return ret;
4284 out:
4285         if (bctl->flags & BTRFS_BALANCE_RESUME)
4286                 reset_balance_state(fs_info);
4287         else
4288                 kfree(bctl);
4289         btrfs_exclop_finish(fs_info);
4290
4291         return ret;
4292 }
4293
4294 static int balance_kthread(void *data)
4295 {
4296         struct btrfs_fs_info *fs_info = data;
4297         int ret = 0;
4298
4299         sb_start_write(fs_info->sb);
4300         mutex_lock(&fs_info->balance_mutex);
4301         if (fs_info->balance_ctl)
4302                 ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
4303         mutex_unlock(&fs_info->balance_mutex);
4304         sb_end_write(fs_info->sb);
4305
4306         return ret;
4307 }
4308
4309 int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info)
4310 {
4311         struct task_struct *tsk;
4312
4313         mutex_lock(&fs_info->balance_mutex);
4314         if (!fs_info->balance_ctl) {
4315                 mutex_unlock(&fs_info->balance_mutex);
4316                 return 0;
4317         }
4318         mutex_unlock(&fs_info->balance_mutex);
4319
4320         if (btrfs_test_opt(fs_info, SKIP_BALANCE)) {
4321                 btrfs_info(fs_info, "balance: resume skipped");
4322                 return 0;
4323         }
4324
4325         /*
4326          * A ro->rw remount sequence should continue with the paused balance
4327          * regardless of who pauses it, system or the user as of now, so set
4328          * the resume flag.
4329          */
4330         spin_lock(&fs_info->balance_lock);
4331         fs_info->balance_ctl->flags |= BTRFS_BALANCE_RESUME;
4332         spin_unlock(&fs_info->balance_lock);
4333
4334         tsk = kthread_run(balance_kthread, fs_info, "btrfs-balance");
4335         return PTR_ERR_OR_ZERO(tsk);
4336 }
4337
4338 int btrfs_recover_balance(struct btrfs_fs_info *fs_info)
4339 {
4340         struct btrfs_balance_control *bctl;
4341         struct btrfs_balance_item *item;
4342         struct btrfs_disk_balance_args disk_bargs;
4343         struct btrfs_path *path;
4344         struct extent_buffer *leaf;
4345         struct btrfs_key key;
4346         int ret;
4347
4348         path = btrfs_alloc_path();
4349         if (!path)
4350                 return -ENOMEM;
4351
4352         key.objectid = BTRFS_BALANCE_OBJECTID;
4353         key.type = BTRFS_TEMPORARY_ITEM_KEY;
4354         key.offset = 0;
4355
4356         ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
4357         if (ret < 0)
4358                 goto out;
4359         if (ret > 0) { /* ret = -ENOENT; */
4360                 ret = 0;
4361                 goto out;
4362         }
4363
4364         bctl = kzalloc(sizeof(*bctl), GFP_NOFS);
4365         if (!bctl) {
4366                 ret = -ENOMEM;
4367                 goto out;
4368         }
4369
4370         leaf = path->nodes[0];
4371         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_balance_item);
4372
4373         bctl->flags = btrfs_balance_flags(leaf, item);
4374         bctl->flags |= BTRFS_BALANCE_RESUME;
4375
4376         btrfs_balance_data(leaf, item, &disk_bargs);
4377         btrfs_disk_balance_args_to_cpu(&bctl->data, &disk_bargs);
4378         btrfs_balance_meta(leaf, item, &disk_bargs);
4379         btrfs_disk_balance_args_to_cpu(&bctl->meta, &disk_bargs);
4380         btrfs_balance_sys(leaf, item, &disk_bargs);
4381         btrfs_disk_balance_args_to_cpu(&bctl->sys, &disk_bargs);
4382
4383         /*
4384          * This should never happen, as the paused balance state is recovered
4385          * during mount without any chance of other exclusive ops to collide.
4386          *
4387          * This gives the exclusive op status to balance and keeps in paused
4388          * state until user intervention (cancel or umount). If the ownership
4389          * cannot be assigned, show a message but do not fail. The balance
4390          * is in a paused state and must have fs_info::balance_ctl properly
4391          * set up.
4392          */
4393         if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE))
4394                 btrfs_warn(fs_info,
4395         "balance: cannot set exclusive op status, resume manually");
4396
4397         btrfs_release_path(path);
4398
4399         mutex_lock(&fs_info->balance_mutex);
4400         BUG_ON(fs_info->balance_ctl);
4401         spin_lock(&fs_info->balance_lock);
4402         fs_info->balance_ctl = bctl;
4403         spin_unlock(&fs_info->balance_lock);
4404         mutex_unlock(&fs_info->balance_mutex);
4405 out:
4406         btrfs_free_path(path);
4407         return ret;
4408 }
4409
4410 int btrfs_pause_balance(struct btrfs_fs_info *fs_info)
4411 {
4412         int ret = 0;
4413
4414         mutex_lock(&fs_info->balance_mutex);
4415         if (!fs_info->balance_ctl) {
4416                 mutex_unlock(&fs_info->balance_mutex);
4417                 return -ENOTCONN;
4418         }
4419
4420         if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4421                 atomic_inc(&fs_info->balance_pause_req);
4422                 mutex_unlock(&fs_info->balance_mutex);
4423
4424                 wait_event(fs_info->balance_wait_q,
4425                            !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4426
4427                 mutex_lock(&fs_info->balance_mutex);
4428                 /* we are good with balance_ctl ripped off from under us */
4429                 BUG_ON(test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4430                 atomic_dec(&fs_info->balance_pause_req);
4431         } else {
4432                 ret = -ENOTCONN;
4433         }
4434
4435         mutex_unlock(&fs_info->balance_mutex);
4436         return ret;
4437 }
4438
4439 int btrfs_cancel_balance(struct btrfs_fs_info *fs_info)
4440 {
4441         mutex_lock(&fs_info->balance_mutex);
4442         if (!fs_info->balance_ctl) {
4443                 mutex_unlock(&fs_info->balance_mutex);
4444                 return -ENOTCONN;
4445         }
4446
4447         /*
4448          * A paused balance with the item stored on disk can be resumed at
4449          * mount time if the mount is read-write. Otherwise it's still paused
4450          * and we must not allow cancelling as it deletes the item.
4451          */
4452         if (sb_rdonly(fs_info->sb)) {
4453                 mutex_unlock(&fs_info->balance_mutex);
4454                 return -EROFS;
4455         }
4456
4457         atomic_inc(&fs_info->balance_cancel_req);
4458         /*
4459          * if we are running just wait and return, balance item is
4460          * deleted in btrfs_balance in this case
4461          */
4462         if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4463                 mutex_unlock(&fs_info->balance_mutex);
4464                 wait_event(fs_info->balance_wait_q,
4465                            !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4466                 mutex_lock(&fs_info->balance_mutex);
4467         } else {
4468                 mutex_unlock(&fs_info->balance_mutex);
4469                 /*
4470                  * Lock released to allow other waiters to continue, we'll
4471                  * reexamine the status again.
4472                  */
4473                 mutex_lock(&fs_info->balance_mutex);
4474
4475                 if (fs_info->balance_ctl) {
4476                         reset_balance_state(fs_info);
4477                         btrfs_exclop_finish(fs_info);
4478                         btrfs_info(fs_info, "balance: canceled");
4479                 }
4480         }
4481
4482         ASSERT(!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags));
4483         atomic_dec(&fs_info->balance_cancel_req);
4484         mutex_unlock(&fs_info->balance_mutex);
4485         return 0;
4486 }
4487
4488 int btrfs_uuid_scan_kthread(void *data)
4489 {
4490         struct btrfs_fs_info *fs_info = data;
4491         struct btrfs_root *root = fs_info->tree_root;
4492         struct btrfs_key key;
4493         struct btrfs_path *path = NULL;
4494         int ret = 0;
4495         struct extent_buffer *eb;
4496         int slot;
4497         struct btrfs_root_item root_item;
4498         u32 item_size;
4499         struct btrfs_trans_handle *trans = NULL;
4500         bool closing = false;
4501
4502         path = btrfs_alloc_path();
4503         if (!path) {
4504                 ret = -ENOMEM;
4505                 goto out;
4506         }
4507
4508         key.objectid = 0;
4509         key.type = BTRFS_ROOT_ITEM_KEY;
4510         key.offset = 0;
4511
4512         while (1) {
4513                 if (btrfs_fs_closing(fs_info)) {
4514                         closing = true;
4515                         break;
4516                 }
4517                 ret = btrfs_search_forward(root, &key, path,
4518                                 BTRFS_OLDEST_GENERATION);
4519                 if (ret) {
4520                         if (ret > 0)
4521                                 ret = 0;
4522                         break;
4523                 }
4524
4525                 if (key.type != BTRFS_ROOT_ITEM_KEY ||
4526                     (key.objectid < BTRFS_FIRST_FREE_OBJECTID &&
4527                      key.objectid != BTRFS_FS_TREE_OBJECTID) ||
4528                     key.objectid > BTRFS_LAST_FREE_OBJECTID)
4529                         goto skip;
4530
4531                 eb = path->nodes[0];
4532                 slot = path->slots[0];
4533                 item_size = btrfs_item_size_nr(eb, slot);
4534                 if (item_size < sizeof(root_item))
4535                         goto skip;
4536
4537                 read_extent_buffer(eb, &root_item,
4538                                    btrfs_item_ptr_offset(eb, slot),
4539                                    (int)sizeof(root_item));
4540                 if (btrfs_root_refs(&root_item) == 0)
4541                         goto skip;
4542
4543                 if (!btrfs_is_empty_uuid(root_item.uuid) ||
4544                     !btrfs_is_empty_uuid(root_item.received_uuid)) {
4545                         if (trans)
4546                                 goto update_tree;
4547
4548                         btrfs_release_path(path);
4549                         /*
4550                          * 1 - subvol uuid item
4551                          * 1 - received_subvol uuid item
4552                          */
4553                         trans = btrfs_start_transaction(fs_info->uuid_root, 2);
4554                         if (IS_ERR(trans)) {
4555                                 ret = PTR_ERR(trans);
4556                                 break;
4557                         }
4558                         continue;
4559                 } else {
4560                         goto skip;
4561                 }
4562 update_tree:
4563                 btrfs_release_path(path);
4564                 if (!btrfs_is_empty_uuid(root_item.uuid)) {
4565                         ret = btrfs_uuid_tree_add(trans, root_item.uuid,
4566                                                   BTRFS_UUID_KEY_SUBVOL,
4567                                                   key.objectid);
4568                         if (ret < 0) {
4569                                 btrfs_warn(fs_info, "uuid_tree_add failed %d",
4570                                         ret);
4571                                 break;
4572                         }
4573                 }
4574
4575                 if (!btrfs_is_empty_uuid(root_item.received_uuid)) {
4576                         ret = btrfs_uuid_tree_add(trans,
4577                                                   root_item.received_uuid,
4578                                                  BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4579                                                   key.objectid);
4580                         if (ret < 0) {
4581                                 btrfs_warn(fs_info, "uuid_tree_add failed %d",
4582                                         ret);
4583                                 break;
4584                         }
4585                 }
4586
4587 skip:
4588                 btrfs_release_path(path);
4589                 if (trans) {
4590                         ret = btrfs_end_transaction(trans);
4591                         trans = NULL;
4592                         if (ret)
4593                                 break;
4594                 }
4595
4596                 if (key.offset < (u64)-1) {
4597                         key.offset++;
4598                 } else if (key.type < BTRFS_ROOT_ITEM_KEY) {
4599                         key.offset = 0;
4600                         key.type = BTRFS_ROOT_ITEM_KEY;
4601                 } else if (key.objectid < (u64)-1) {
4602                         key.offset = 0;
4603                         key.type = BTRFS_ROOT_ITEM_KEY;
4604                         key.objectid++;
4605                 } else {
4606                         break;
4607                 }
4608                 cond_resched();
4609         }
4610
4611 out:
4612         btrfs_free_path(path);
4613         if (trans && !IS_ERR(trans))
4614                 btrfs_end_transaction(trans);
4615         if (ret)
4616                 btrfs_warn(fs_info, "btrfs_uuid_scan_kthread failed %d", ret);
4617         else if (!closing)
4618                 set_bit(BTRFS_FS_UPDATE_UUID_TREE_GEN, &fs_info->flags);
4619         up(&fs_info->uuid_tree_rescan_sem);
4620         return 0;
4621 }
4622
4623 int btrfs_create_uuid_tree(struct btrfs_fs_info *fs_info)
4624 {
4625         struct btrfs_trans_handle *trans;
4626         struct btrfs_root *tree_root = fs_info->tree_root;
4627         struct btrfs_root *uuid_root;
4628         struct task_struct *task;
4629         int ret;
4630
4631         /*
4632          * 1 - root node
4633          * 1 - root item
4634          */
4635         trans = btrfs_start_transaction(tree_root, 2);
4636         if (IS_ERR(trans))
4637                 return PTR_ERR(trans);
4638
4639         uuid_root = btrfs_create_tree(trans, BTRFS_UUID_TREE_OBJECTID);
4640         if (IS_ERR(uuid_root)) {
4641                 ret = PTR_ERR(uuid_root);
4642                 btrfs_abort_transaction(trans, ret);
4643                 btrfs_end_transaction(trans);
4644                 return ret;
4645         }
4646
4647         fs_info->uuid_root = uuid_root;
4648
4649         ret = btrfs_commit_transaction(trans);
4650         if (ret)
4651                 return ret;
4652
4653         down(&fs_info->uuid_tree_rescan_sem);
4654         task = kthread_run(btrfs_uuid_scan_kthread, fs_info, "btrfs-uuid");
4655         if (IS_ERR(task)) {
4656                 /* fs_info->update_uuid_tree_gen remains 0 in all error case */
4657                 btrfs_warn(fs_info, "failed to start uuid_scan task");
4658                 up(&fs_info->uuid_tree_rescan_sem);
4659                 return PTR_ERR(task);
4660         }
4661
4662         return 0;
4663 }
4664
4665 /*
4666  * shrinking a device means finding all of the device extents past
4667  * the new size, and then following the back refs to the chunks.
4668  * The chunk relocation code actually frees the device extent
4669  */
4670 int btrfs_shrink_device(struct btrfs_device *device, u64 new_size)
4671 {
4672         struct btrfs_fs_info *fs_info = device->fs_info;
4673         struct btrfs_root *root = fs_info->dev_root;
4674         struct btrfs_trans_handle *trans;
4675         struct btrfs_dev_extent *dev_extent = NULL;
4676         struct btrfs_path *path;
4677         u64 length;
4678         u64 chunk_offset;
4679         int ret;
4680         int slot;
4681         int failed = 0;
4682         bool retried = false;
4683         struct extent_buffer *l;
4684         struct btrfs_key key;
4685         struct btrfs_super_block *super_copy = fs_info->super_copy;
4686         u64 old_total = btrfs_super_total_bytes(super_copy);
4687         u64 old_size = btrfs_device_get_total_bytes(device);
4688         u64 diff;
4689         u64 start;
4690
4691         new_size = round_down(new_size, fs_info->sectorsize);
4692         start = new_size;
4693         diff = round_down(old_size - new_size, fs_info->sectorsize);
4694
4695         if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
4696                 return -EINVAL;
4697
4698         path = btrfs_alloc_path();
4699         if (!path)
4700                 return -ENOMEM;
4701
4702         path->reada = READA_BACK;
4703
4704         trans = btrfs_start_transaction(root, 0);
4705         if (IS_ERR(trans)) {
4706                 btrfs_free_path(path);
4707                 return PTR_ERR(trans);
4708         }
4709
4710         mutex_lock(&fs_info->chunk_mutex);
4711
4712         btrfs_device_set_total_bytes(device, new_size);
4713         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
4714                 device->fs_devices->total_rw_bytes -= diff;
4715                 atomic64_sub(diff, &fs_info->free_chunk_space);
4716         }
4717
4718         /*
4719          * Once the device's size has been set to the new size, ensure all
4720          * in-memory chunks are synced to disk so that the loop below sees them
4721          * and relocates them accordingly.
4722          */
4723         if (contains_pending_extent(device, &start, diff)) {
4724                 mutex_unlock(&fs_info->chunk_mutex);
4725                 ret = btrfs_commit_transaction(trans);
4726                 if (ret)
4727                         goto done;
4728         } else {
4729                 mutex_unlock(&fs_info->chunk_mutex);
4730                 btrfs_end_transaction(trans);
4731         }
4732
4733 again:
4734         key.objectid = device->devid;
4735         key.offset = (u64)-1;
4736         key.type = BTRFS_DEV_EXTENT_KEY;
4737
4738         do {
4739                 mutex_lock(&fs_info->delete_unused_bgs_mutex);
4740                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4741                 if (ret < 0) {
4742                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4743                         goto done;
4744                 }
4745
4746                 ret = btrfs_previous_item(root, path, 0, key.type);
4747                 if (ret)
4748                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4749                 if (ret < 0)
4750                         goto done;
4751                 if (ret) {
4752                         ret = 0;
4753                         btrfs_release_path(path);
4754                         break;
4755                 }
4756
4757                 l = path->nodes[0];
4758                 slot = path->slots[0];
4759                 btrfs_item_key_to_cpu(l, &key, path->slots[0]);
4760
4761                 if (key.objectid != device->devid) {
4762                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4763                         btrfs_release_path(path);
4764                         break;
4765                 }
4766
4767                 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
4768                 length = btrfs_dev_extent_length(l, dev_extent);
4769
4770                 if (key.offset + length <= new_size) {
4771                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4772                         btrfs_release_path(path);
4773                         break;
4774                 }
4775
4776                 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
4777                 btrfs_release_path(path);
4778
4779                 /*
4780                  * We may be relocating the only data chunk we have,
4781                  * which could potentially end up with losing data's
4782                  * raid profile, so lets allocate an empty one in
4783                  * advance.
4784                  */
4785                 ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset);
4786                 if (ret < 0) {
4787                         mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4788                         goto done;
4789                 }
4790
4791                 ret = btrfs_relocate_chunk(fs_info, chunk_offset);
4792                 mutex_unlock(&fs_info->delete_unused_bgs_mutex);
4793                 if (ret == -ENOSPC) {
4794                         failed++;
4795                 } else if (ret) {
4796                         if (ret == -ETXTBSY) {
4797                                 btrfs_warn(fs_info,
4798                    "could not shrink block group %llu due to active swapfile",
4799                                            chunk_offset);
4800                         }
4801                         goto done;
4802                 }
4803         } while (key.offset-- > 0);
4804
4805         if (failed && !retried) {
4806                 failed = 0;
4807                 retried = true;
4808                 goto again;
4809         } else if (failed && retried) {
4810                 ret = -ENOSPC;
4811                 goto done;
4812         }
4813
4814         /* Shrinking succeeded, else we would be at "done". */
4815         trans = btrfs_start_transaction(root, 0);
4816         if (IS_ERR(trans)) {
4817                 ret = PTR_ERR(trans);
4818                 goto done;
4819         }
4820
4821         mutex_lock(&fs_info->chunk_mutex);
4822         /* Clear all state bits beyond the shrunk device size */
4823         clear_extent_bits(&device->alloc_state, new_size, (u64)-1,
4824                           CHUNK_STATE_MASK);
4825
4826         btrfs_device_set_disk_total_bytes(device, new_size);
4827         if (list_empty(&device->post_commit_list))
4828                 list_add_tail(&device->post_commit_list,
4829                               &trans->transaction->dev_update_list);
4830
4831         WARN_ON(diff > old_total);
4832         btrfs_set_super_total_bytes(super_copy,
4833                         round_down(old_total - diff, fs_info->sectorsize));
4834         mutex_unlock(&fs_info->chunk_mutex);
4835
4836         /* Now btrfs_update_device() will change the on-disk size. */
4837         ret = btrfs_update_device(trans, device);
4838         if (ret < 0) {
4839                 btrfs_abort_transaction(trans, ret);
4840                 btrfs_end_transaction(trans);
4841         } else {
4842                 ret = btrfs_commit_transaction(trans);
4843         }
4844 done:
4845         btrfs_free_path(path);
4846         if (ret) {
4847                 mutex_lock(&fs_info->chunk_mutex);
4848                 btrfs_device_set_total_bytes(device, old_size);
4849                 if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
4850                         device->fs_devices->total_rw_bytes += diff;
4851                 atomic64_add(diff, &fs_info->free_chunk_space);
4852                 mutex_unlock(&fs_info->chunk_mutex);
4853         }
4854         return ret;
4855 }
4856
4857 static int btrfs_add_system_chunk(struct btrfs_fs_info *fs_info,
4858                            struct btrfs_key *key,
4859                            struct btrfs_chunk *chunk, int item_size)
4860 {
4861         struct btrfs_super_block *super_copy = fs_info->super_copy;
4862         struct btrfs_disk_key disk_key;
4863         u32 array_size;
4864         u8 *ptr;
4865
4866         mutex_lock(&fs_info->chunk_mutex);
4867         array_size = btrfs_super_sys_array_size(super_copy);
4868         if (array_size + item_size + sizeof(disk_key)
4869                         > BTRFS_SYSTEM_CHUNK_ARRAY_SIZE) {
4870                 mutex_unlock(&fs_info->chunk_mutex);
4871                 return -EFBIG;
4872         }
4873
4874         ptr = super_copy->sys_chunk_array + array_size;
4875         btrfs_cpu_key_to_disk(&disk_key, key);
4876         memcpy(ptr, &disk_key, sizeof(disk_key));
4877         ptr += sizeof(disk_key);
4878         memcpy(ptr, chunk, item_size);
4879         item_size += sizeof(disk_key);
4880         btrfs_set_super_sys_array_size(super_copy, array_size + item_size);
4881         mutex_unlock(&fs_info->chunk_mutex);
4882
4883         return 0;
4884 }
4885
4886 /*
4887  * sort the devices in descending order by max_avail, total_avail
4888  */
4889 static int btrfs_cmp_device_info(const void *a, const void *b)
4890 {
4891         const struct btrfs_device_info *di_a = a;
4892         const struct btrfs_device_info *di_b = b;
4893
4894         if (di_a->max_avail > di_b->max_avail)
4895                 return -1;
4896         if (di_a->max_avail < di_b->max_avail)
4897                 return 1;
4898         if (di_a->total_avail > di_b->total_avail)
4899                 return -1;
4900         if (di_a->total_avail < di_b->total_avail)
4901                 return 1;
4902         return 0;
4903 }
4904
4905 static void check_raid56_incompat_flag(struct btrfs_fs_info *info, u64 type)
4906 {
4907         if (!(type & BTRFS_BLOCK_GROUP_RAID56_MASK))
4908                 return;
4909
4910         btrfs_set_fs_incompat(info, RAID56);
4911 }
4912
4913 static void check_raid1c34_incompat_flag(struct btrfs_fs_info *info, u64 type)
4914 {
4915         if (!(type & (BTRFS_BLOCK_GROUP_RAID1C3 | BTRFS_BLOCK_GROUP_RAID1C4)))
4916                 return;
4917
4918         btrfs_set_fs_incompat(info, RAID1C34);
4919 }
4920
4921 /*
4922  * Structure used internally for __btrfs_alloc_chunk() function.
4923  * Wraps needed parameters.
4924  */
4925 struct alloc_chunk_ctl {
4926         u64 start;
4927         u64 type;
4928         /* Total number of stripes to allocate */
4929         int num_stripes;
4930         /* sub_stripes info for map */
4931         int sub_stripes;
4932         /* Stripes per device */
4933         int dev_stripes;
4934         /* Maximum number of devices to use */
4935         int devs_max;
4936         /* Minimum number of devices to use */
4937         int devs_min;
4938         /* ndevs has to be a multiple of this */
4939         int devs_increment;
4940         /* Number of copies */
4941         int ncopies;
4942         /* Number of stripes worth of bytes to store parity information */
4943         int nparity;
4944         u64 max_stripe_size;
4945         u64 max_chunk_size;
4946         u64 dev_extent_min;
4947         u64 stripe_size;
4948         u64 chunk_size;
4949         int ndevs;
4950 };
4951
4952 static void init_alloc_chunk_ctl_policy_regular(
4953                                 struct btrfs_fs_devices *fs_devices,
4954                                 struct alloc_chunk_ctl *ctl)
4955 {
4956         u64 type = ctl->type;
4957
4958         if (type & BTRFS_BLOCK_GROUP_DATA) {
4959                 ctl->max_stripe_size = SZ_1G;
4960                 ctl->max_chunk_size = BTRFS_MAX_DATA_CHUNK_SIZE;
4961         } else if (type & BTRFS_BLOCK_GROUP_METADATA) {
4962                 /* For larger filesystems, use larger metadata chunks */
4963                 if (fs_devices->total_rw_bytes > 50ULL * SZ_1G)
4964                         ctl->max_stripe_size = SZ_1G;
4965                 else
4966                         ctl->max_stripe_size = SZ_256M;
4967                 ctl->max_chunk_size = ctl->max_stripe_size;
4968         } else if (type & BTRFS_BLOCK_GROUP_SYSTEM) {
4969                 ctl->max_stripe_size = SZ_32M;
4970                 ctl->max_chunk_size = 2 * ctl->max_stripe_size;
4971                 ctl->devs_max = min_t(int, ctl->devs_max,
4972                                       BTRFS_MAX_DEVS_SYS_CHUNK);
4973         } else {
4974                 BUG();
4975         }
4976
4977         /* We don't want a chunk larger than 10% of writable space */
4978         ctl->max_chunk_size = min(div_factor(fs_devices->total_rw_bytes, 1),
4979                                   ctl->max_chunk_size);
4980         ctl->dev_extent_min = BTRFS_STRIPE_LEN * ctl->dev_stripes;
4981 }
4982
4983 static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices,
4984                                  struct alloc_chunk_ctl *ctl)
4985 {
4986         int index = btrfs_bg_flags_to_raid_index(ctl->type);
4987
4988         ctl->sub_stripes = btrfs_raid_array[index].sub_stripes;
4989         ctl->dev_stripes = btrfs_raid_array[index].dev_stripes;
4990         ctl->devs_max = btrfs_raid_array[index].devs_max;
4991         if (!ctl->devs_max)
4992                 ctl->devs_max = BTRFS_MAX_DEVS(fs_devices->fs_info);
4993         ctl->devs_min = btrfs_raid_array[index].devs_min;
4994         ctl->devs_increment = btrfs_raid_array[index].devs_increment;
4995         ctl->ncopies = btrfs_raid_array[index].ncopies;
4996         ctl->nparity = btrfs_raid_array[index].nparity;
4997         ctl->ndevs = 0;
4998
4999         switch (fs_devices->chunk_alloc_policy) {
5000         case BTRFS_CHUNK_ALLOC_REGULAR:
5001                 init_alloc_chunk_ctl_policy_regular(fs_devices, ctl);
5002                 break;
5003         default:
5004                 BUG();
5005         }
5006 }
5007
5008 static int gather_device_info(struct btrfs_fs_devices *fs_devices,
5009                               struct alloc_chunk_ctl *ctl,
5010                               struct btrfs_device_info *devices_info)
5011 {
5012         struct btrfs_fs_info *info = fs_devices->fs_info;
5013         struct btrfs_device *device;
5014         u64 total_avail;
5015         u64 dev_extent_want = ctl->max_stripe_size * ctl->dev_stripes;
5016         int ret;
5017         int ndevs = 0;
5018         u64 max_avail;
5019         u64 dev_offset;
5020
5021         /*
5022          * in the first pass through the devices list, we gather information
5023          * about the available holes on each device.
5024          */
5025         list_for_each_entry(device, &fs_devices->alloc_list, dev_alloc_list) {
5026                 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
5027                         WARN(1, KERN_ERR
5028                                "BTRFS: read-only device in alloc_list\n");
5029                         continue;
5030                 }
5031
5032                 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
5033                                         &device->dev_state) ||
5034                     test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state))
5035                         continue;
5036
5037                 if (device->total_bytes > device->bytes_used)
5038                         total_avail = device->total_bytes - device->bytes_used;
5039                 else
5040                         total_avail = 0;
5041
5042                 /* If there is no space on this device, skip it. */
5043                 if (total_avail < ctl->dev_extent_min)
5044                         continue;
5045
5046                 ret = find_free_dev_extent(device, dev_extent_want, &dev_offset,
5047                                            &max_avail);
5048                 if (ret && ret != -ENOSPC)
5049                         return ret;
5050
5051                 if (ret == 0)
5052                         max_avail = dev_extent_want;
5053
5054                 if (max_avail < ctl->dev_extent_min) {
5055                         if (btrfs_test_opt(info, ENOSPC_DEBUG))
5056                                 btrfs_debug(info,
5057                         "%s: devid %llu has no free space, have=%llu want=%llu",
5058                                             __func__, device->devid, max_avail,
5059                                             ctl->dev_extent_min);
5060                         continue;
5061                 }
5062
5063                 if (ndevs == fs_devices->rw_devices) {
5064                         WARN(1, "%s: found more than %llu devices\n",
5065                              __func__, fs_devices->rw_devices);
5066                         break;
5067                 }
5068                 devices_info[ndevs].dev_offset = dev_offset;
5069                 devices_info[ndevs].max_avail = max_avail;
5070                 devices_info[ndevs].total_avail = total_avail;
5071                 devices_info[ndevs].dev = device;
5072                 ++ndevs;
5073         }
5074         ctl->ndevs = ndevs;
5075
5076         /*
5077          * now sort the devices by hole size / available space
5078          */
5079         sort(devices_info, ndevs, sizeof(struct btrfs_device_info),
5080              btrfs_cmp_device_info, NULL);
5081
5082         return 0;
5083 }
5084
5085 static int decide_stripe_size_regular(struct alloc_chunk_ctl *ctl,
5086                                       struct btrfs_device_info *devices_info)
5087 {
5088         /* Number of stripes that count for block group size */
5089         int data_stripes;
5090
5091         /*
5092          * The primary goal is to maximize the number of stripes, so use as
5093          * many devices as possible, even if the stripes are not maximum sized.
5094          *
5095          * The DUP profile stores more than one stripe per device, the
5096          * max_avail is the total size so we have to adjust.
5097          */
5098         ctl->stripe_size = div_u64(devices_info[ctl->ndevs - 1].max_avail,
5099                                    ctl->dev_stripes);
5100         ctl->num_stripes = ctl->ndevs * ctl->dev_stripes;
5101
5102         /* This will have to be fixed for RAID1 and RAID10 over more drives */
5103         data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies;
5104
5105         /*
5106          * Use the number of data stripes to figure out how big this chunk is
5107          * really going to be in terms of logical address space, and compare
5108          * that answer with the max chunk size. If it's higher, we try to
5109          * reduce stripe_size.
5110          */
5111         if (ctl->stripe_size * data_stripes > ctl->max_chunk_size) {
5112                 /*
5113                  * Reduce stripe_size, round it up to a 16MB boundary again and
5114                  * then use it, unless it ends up being even bigger than the
5115                  * previous value we had already.
5116                  */
5117                 ctl->stripe_size = min(round_up(div_u64(ctl->max_chunk_size,
5118                                                         data_stripes), SZ_16M),
5119                                        ctl->stripe_size);
5120         }
5121
5122         /* Align to BTRFS_STRIPE_LEN */
5123         ctl->stripe_size = round_down(ctl->stripe_size, BTRFS_STRIPE_LEN);
5124         ctl->chunk_size = ctl->stripe_size * data_stripes;
5125
5126         return 0;
5127 }
5128
5129 static int decide_stripe_size(struct btrfs_fs_devices *fs_devices,
5130                               struct alloc_chunk_ctl *ctl,
5131                               struct btrfs_device_info *devices_info)
5132 {
5133         struct btrfs_fs_info *info = fs_devices->fs_info;
5134
5135         /*
5136          * Round down to number of usable stripes, devs_increment can be any
5137          * number so we can't use round_down() that requires power of 2, while
5138          * rounddown is safe.
5139          */
5140         ctl->ndevs = rounddown(ctl->ndevs, ctl->devs_increment);
5141
5142         if (ctl->ndevs < ctl->devs_min) {
5143                 if (btrfs_test_opt(info, ENOSPC_DEBUG)) {
5144                         btrfs_debug(info,
5145         "%s: not enough devices with free space: have=%d minimum required=%d",
5146                                     __func__, ctl->ndevs, ctl->devs_min);
5147                 }
5148                 return -ENOSPC;
5149         }
5150
5151         ctl->ndevs = min(ctl->ndevs, ctl->devs_max);
5152
5153         switch (fs_devices->chunk_alloc_policy) {
5154         case BTRFS_CHUNK_ALLOC_REGULAR:
5155                 return decide_stripe_size_regular(ctl, devices_info);
5156         default:
5157                 BUG();
5158         }
5159 }
5160
5161 static int create_chunk(struct btrfs_trans_handle *trans,
5162                         struct alloc_chunk_ctl *ctl,
5163                         struct btrfs_device_info *devices_info)
5164 {
5165         struct btrfs_fs_info *info = trans->fs_info;
5166         struct map_lookup *map = NULL;
5167         struct extent_map_tree *em_tree;
5168         struct extent_map *em;
5169         u64 start = ctl->start;
5170         u64 type = ctl->type;
5171         int ret;
5172         int i;
5173         int j;
5174
5175         map = kmalloc(map_lookup_size(ctl->num_stripes), GFP_NOFS);
5176         if (!map)
5177                 return -ENOMEM;
5178         map->num_stripes = ctl->num_stripes;
5179
5180         for (i = 0; i < ctl->ndevs; ++i) {
5181                 for (j = 0; j < ctl->dev_stripes; ++j) {
5182                         int s = i * ctl->dev_stripes + j;
5183                         map->stripes[s].dev = devices_info[i].dev;
5184                         map->stripes[s].physical = devices_info[i].dev_offset +
5185                                                    j * ctl->stripe_size;
5186                 }
5187         }
5188         map->stripe_len = BTRFS_STRIPE_LEN;
5189         map->io_align = BTRFS_STRIPE_LEN;
5190         map->io_width = BTRFS_STRIPE_LEN;
5191         map->type = type;
5192         map->sub_stripes = ctl->sub_stripes;
5193
5194         trace_btrfs_chunk_alloc(info, map, start, ctl->chunk_size);
5195
5196         em = alloc_extent_map();
5197         if (!em) {
5198                 kfree(map);
5199                 return -ENOMEM;
5200         }
5201         set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
5202         em->map_lookup = map;
5203         em->start = start;
5204         em->len = ctl->chunk_size;
5205         em->block_start = 0;
5206         em->block_len = em->len;
5207         em->orig_block_len = ctl->stripe_size;
5208
5209         em_tree = &info->mapping_tree;
5210         write_lock(&em_tree->lock);
5211         ret = add_extent_mapping(em_tree, em, 0);
5212         if (ret) {
5213                 write_unlock(&em_tree->lock);
5214                 free_extent_map(em);
5215                 return ret;
5216         }
5217         write_unlock(&em_tree->lock);
5218
5219         ret = btrfs_make_block_group(trans, 0, type, start, ctl->chunk_size);
5220         if (ret)
5221                 goto error_del_extent;
5222
5223         for (i = 0; i < map->num_stripes; i++) {
5224                 struct btrfs_device *dev = map->stripes[i].dev;
5225
5226                 btrfs_device_set_bytes_used(dev,
5227                                             dev->bytes_used + ctl->stripe_size);
5228                 if (list_empty(&dev->post_commit_list))
5229                         list_add_tail(&dev->post_commit_list,
5230                                       &trans->transaction->dev_update_list);
5231         }
5232
5233         atomic64_sub(ctl->stripe_size * map->num_stripes,
5234                      &info->free_chunk_space);
5235
5236         free_extent_map(em);
5237         check_raid56_incompat_flag(info, type);
5238         check_raid1c34_incompat_flag(info, type);
5239
5240         return 0;
5241
5242 error_del_extent:
5243         write_lock(&em_tree->lock);
5244         remove_extent_mapping(em_tree, em);
5245         write_unlock(&em_tree->lock);
5246
5247         /* One for our allocation */
5248         free_extent_map(em);
5249         /* One for the tree reference */
5250         free_extent_map(em);
5251
5252         return ret;
5253 }
5254
5255 int btrfs_alloc_chunk(struct btrfs_trans_handle *trans, u64 type)
5256 {
5257         struct btrfs_fs_info *info = trans->fs_info;
5258         struct btrfs_fs_devices *fs_devices = info->fs_devices;
5259         struct btrfs_device_info *devices_info = NULL;
5260         struct alloc_chunk_ctl ctl;
5261         int ret;
5262
5263         lockdep_assert_held(&info->chunk_mutex);
5264
5265         if (!alloc_profile_is_valid(type, 0)) {
5266                 ASSERT(0);
5267                 return -EINVAL;
5268         }
5269
5270         if (list_empty(&fs_devices->alloc_list)) {
5271                 if (btrfs_test_opt(info, ENOSPC_DEBUG))
5272                         btrfs_debug(info, "%s: no writable device", __func__);
5273                 return -ENOSPC;
5274         }
5275
5276         if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) {
5277                 btrfs_err(info, "invalid chunk type 0x%llx requested", type);
5278                 ASSERT(0);
5279                 return -EINVAL;
5280         }
5281
5282         ctl.start = find_next_chunk(info);
5283         ctl.type = type;
5284         init_alloc_chunk_ctl(fs_devices, &ctl);
5285
5286         devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info),
5287                                GFP_NOFS);
5288         if (!devices_info)
5289                 return -ENOMEM;
5290
5291         ret = gather_device_info(fs_devices, &ctl, devices_info);
5292         if (ret < 0)
5293                 goto out;
5294
5295         ret = decide_stripe_size(fs_devices, &ctl, devices_info);
5296         if (ret < 0)
5297                 goto out;
5298
5299         ret = create_chunk(trans, &ctl, devices_info);
5300
5301 out:
5302         kfree(devices_info);
5303         return ret;
5304 }
5305
5306 /*
5307  * Chunk allocation falls into two parts. The first part does work
5308  * that makes the new allocated chunk usable, but does not do any operation
5309  * that modifies the chunk tree. The second part does the work that
5310  * requires modifying the chunk tree. This division is important for the
5311  * bootstrap process of adding storage to a seed btrfs.
5312  */
5313 int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans,
5314                              u64 chunk_offset, u64 chunk_size)
5315 {
5316         struct btrfs_fs_info *fs_info = trans->fs_info;
5317         struct btrfs_root *extent_root = fs_info->extent_root;
5318         struct btrfs_root *chunk_root = fs_info->chunk_root;
5319         struct btrfs_key key;
5320         struct btrfs_device *device;
5321         struct btrfs_chunk *chunk;
5322         struct btrfs_stripe *stripe;
5323         struct extent_map *em;
5324         struct map_lookup *map;
5325         size_t item_size;
5326         u64 dev_offset;
5327         u64 stripe_size;
5328         int i = 0;
5329         int ret = 0;
5330
5331         em = btrfs_get_chunk_map(fs_info, chunk_offset, chunk_size);
5332         if (IS_ERR(em))
5333                 return PTR_ERR(em);
5334
5335         map = em->map_lookup;
5336         item_size = btrfs_chunk_item_size(map->num_stripes);
5337         stripe_size = em->orig_block_len;
5338
5339         chunk = kzalloc(item_size, GFP_NOFS);
5340         if (!chunk) {
5341                 ret = -ENOMEM;
5342                 goto out;
5343         }
5344
5345         /*
5346          * Take the device list mutex to prevent races with the final phase of
5347          * a device replace operation that replaces the device object associated
5348          * with the map's stripes, because the device object's id can change
5349          * at any time during that final phase of the device replace operation
5350          * (dev-replace.c:btrfs_dev_replace_finishing()).
5351          */
5352         mutex_lock(&fs_info->fs_devices->device_list_mutex);
5353         for (i = 0; i < map->num_stripes; i++) {
5354                 device = map->stripes[i].dev;
5355                 dev_offset = map->stripes[i].physical;
5356
5357                 ret = btrfs_update_device(trans, device);
5358                 if (ret)
5359                         break;
5360                 ret = btrfs_alloc_dev_extent(trans, device, chunk_offset,
5361                                              dev_offset, stripe_size);
5362                 if (ret)
5363                         break;
5364         }
5365         if (ret) {
5366                 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
5367                 goto out;
5368         }
5369
5370         stripe = &chunk->stripe;
5371         for (i = 0; i < map->num_stripes; i++) {
5372                 device = map->stripes[i].dev;
5373                 dev_offset = map->stripes[i].physical;
5374
5375                 btrfs_set_stack_stripe_devid(stripe, device->devid);
5376                 btrfs_set_stack_stripe_offset(stripe, dev_offset);
5377                 memcpy(stripe->dev_uuid, device->uuid, BTRFS_UUID_SIZE);
5378                 stripe++;
5379         }
5380         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
5381
5382         btrfs_set_stack_chunk_length(chunk, chunk_size);
5383         btrfs_set_stack_chunk_owner(chunk, extent_root->root_key.objectid);
5384         btrfs_set_stack_chunk_stripe_len(chunk, map->stripe_len);
5385         btrfs_set_stack_chunk_type(chunk, map->type);
5386         btrfs_set_stack_chunk_num_stripes(chunk, map->num_stripes);
5387         btrfs_set_stack_chunk_io_align(chunk, map->stripe_len);
5388         btrfs_set_stack_chunk_io_width(chunk, map->stripe_len);
5389         btrfs_set_stack_chunk_sector_size(chunk, fs_info->sectorsize);
5390         btrfs_set_stack_chunk_sub_stripes(chunk, map->sub_stripes);
5391
5392         key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID;
5393         key.type = BTRFS_CHUNK_ITEM_KEY;
5394         key.offset = chunk_offset;
5395
5396         ret = btrfs_insert_item(trans, chunk_root, &key, chunk, item_size);
5397         if (ret == 0 && map->type & BTRFS_BLOCK_GROUP_SYSTEM) {
5398                 /*
5399                  * TODO: Cleanup of inserted chunk root in case of
5400                  * failure.
5401                  */
5402                 ret = btrfs_add_system_chunk(fs_info, &key, chunk, item_size);
5403         }
5404
5405 out:
5406         kfree(chunk);
5407         free_extent_map(em);
5408         return ret;
5409 }
5410
5411 static noinline int init_first_rw_device(struct btrfs_trans_handle *trans)
5412 {
5413         struct btrfs_fs_info *fs_info = trans->fs_info;
5414         u64 alloc_profile;
5415         int ret;
5416
5417         alloc_profile = btrfs_metadata_alloc_profile(fs_info);
5418         ret = btrfs_alloc_chunk(trans, alloc_profile);
5419         if (ret)
5420                 return ret;
5421
5422         alloc_profile = btrfs_system_alloc_profile(fs_info);
5423         ret = btrfs_alloc_chunk(trans, alloc_profile);
5424         return ret;
5425 }
5426
5427 static inline int btrfs_chunk_max_errors(struct map_lookup *map)
5428 {
5429         const int index = btrfs_bg_flags_to_raid_index(map->type);
5430
5431         return btrfs_raid_array[index].tolerated_failures;
5432 }
5433
5434 int btrfs_chunk_readonly(struct btrfs_fs_info *fs_info, u64 chunk_offset)
5435 {
5436         struct extent_map *em;
5437         struct map_lookup *map;
5438         int readonly = 0;
5439         int miss_ndevs = 0;
5440         int i;
5441
5442         em = btrfs_get_chunk_map(fs_info, chunk_offset, 1);
5443         if (IS_ERR(em))
5444                 return 1;
5445
5446         map = em->map_lookup;
5447         for (i = 0; i < map->num_stripes; i++) {
5448                 if (test_bit(BTRFS_DEV_STATE_MISSING,
5449                                         &map->stripes[i].dev->dev_state)) {
5450                         miss_ndevs++;
5451                         continue;
5452                 }
5453                 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE,
5454                                         &map->stripes[i].dev->dev_state)) {
5455                         readonly = 1;
5456                         goto end;
5457                 }
5458         }
5459
5460         /*
5461          * If the number of missing devices is larger than max errors,
5462          * we can not write the data into that chunk successfully, so
5463          * set it readonly.
5464          */
5465         if (miss_ndevs > btrfs_chunk_max_errors(map))
5466                 readonly = 1;
5467 end:
5468         free_extent_map(em);
5469         return readonly;
5470 }
5471
5472 void btrfs_mapping_tree_free(struct extent_map_tree *tree)
5473 {
5474         struct extent_map *em;
5475
5476         while (1) {
5477                 write_lock(&tree->lock);
5478                 em = lookup_extent_mapping(tree, 0, (u64)-1);
5479                 if (em)
5480                         remove_extent_mapping(tree, em);
5481                 write_unlock(&tree->lock);
5482                 if (!em)
5483                         break;
5484                 /* once for us */
5485                 free_extent_map(em);
5486                 /* once for the tree */
5487                 free_extent_map(em);
5488         }
5489 }
5490
5491 int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5492 {
5493         struct extent_map *em;
5494         struct map_lookup *map;
5495         int ret;
5496
5497         em = btrfs_get_chunk_map(fs_info, logical, len);
5498         if (IS_ERR(em))
5499                 /*
5500                  * We could return errors for these cases, but that could get
5501                  * ugly and we'd probably do the same thing which is just not do
5502                  * anything else and exit, so return 1 so the callers don't try
5503                  * to use other copies.
5504                  */
5505                 return 1;
5506
5507         map = em->map_lookup;
5508         if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1_MASK))
5509                 ret = map->num_stripes;
5510         else if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5511                 ret = map->sub_stripes;
5512         else if (map->type & BTRFS_BLOCK_GROUP_RAID5)
5513                 ret = 2;
5514         else if (map->type & BTRFS_BLOCK_GROUP_RAID6)
5515                 /*
5516                  * There could be two corrupted data stripes, we need
5517                  * to loop retry in order to rebuild the correct data.
5518                  *
5519                  * Fail a stripe at a time on every retry except the
5520                  * stripe under reconstruction.
5521                  */
5522                 ret = map->num_stripes;
5523         else
5524                 ret = 1;
5525         free_extent_map(em);
5526
5527         down_read(&fs_info->dev_replace.rwsem);
5528         if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace) &&
5529             fs_info->dev_replace.tgtdev)
5530                 ret++;
5531         up_read(&fs_info->dev_replace.rwsem);
5532
5533         return ret;
5534 }
5535
5536 unsigned long btrfs_full_stripe_len(struct btrfs_fs_info *fs_info,
5537                                     u64 logical)
5538 {
5539         struct extent_map *em;
5540         struct map_lookup *map;
5541         unsigned long len = fs_info->sectorsize;
5542
5543         em = btrfs_get_chunk_map(fs_info, logical, len);
5544
5545         if (!WARN_ON(IS_ERR(em))) {
5546                 map = em->map_lookup;
5547                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5548                         len = map->stripe_len * nr_data_stripes(map);
5549                 free_extent_map(em);
5550         }
5551         return len;
5552 }
5553
5554 int btrfs_is_parity_mirror(struct btrfs_fs_info *fs_info, u64 logical, u64 len)
5555 {
5556         struct extent_map *em;
5557         struct map_lookup *map;
5558         int ret = 0;
5559
5560         em = btrfs_get_chunk_map(fs_info, logical, len);
5561
5562         if(!WARN_ON(IS_ERR(em))) {
5563                 map = em->map_lookup;
5564                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
5565                         ret = 1;
5566                 free_extent_map(em);
5567         }
5568         return ret;
5569 }
5570
5571 static int find_live_mirror(struct btrfs_fs_info *fs_info,
5572                             struct map_lookup *map, int first,
5573                             int dev_replace_is_ongoing)
5574 {
5575         int i;
5576         int num_stripes;
5577         int preferred_mirror;
5578         int tolerance;
5579         struct btrfs_device *srcdev;
5580
5581         ASSERT((map->type &
5582                  (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)));
5583
5584         if (map->type & BTRFS_BLOCK_GROUP_RAID10)
5585                 num_stripes = map->sub_stripes;
5586         else
5587                 num_stripes = map->num_stripes;
5588
5589         preferred_mirror = first + current->pid % num_stripes;
5590
5591         if (dev_replace_is_ongoing &&
5592             fs_info->dev_replace.cont_reading_from_srcdev_mode ==
5593              BTRFS_DEV_REPLACE_ITEM_CONT_READING_FROM_SRCDEV_MODE_AVOID)
5594                 srcdev = fs_info->dev_replace.srcdev;
5595         else
5596                 srcdev = NULL;
5597
5598         /*
5599          * try to avoid the drive that is the source drive for a
5600          * dev-replace procedure, only choose it if no other non-missing
5601          * mirror is available
5602          */
5603         for (tolerance = 0; tolerance < 2; tolerance++) {
5604                 if (map->stripes[preferred_mirror].dev->bdev &&
5605                     (tolerance || map->stripes[preferred_mirror].dev != srcdev))
5606                         return preferred_mirror;
5607                 for (i = first; i < first + num_stripes; i++) {
5608                         if (map->stripes[i].dev->bdev &&
5609                             (tolerance || map->stripes[i].dev != srcdev))
5610                                 return i;
5611                 }
5612         }
5613
5614         /* we couldn't find one that doesn't fail.  Just return something
5615          * and the io error handling code will clean up eventually
5616          */
5617         return preferred_mirror;
5618 }
5619
5620 /* Bubble-sort the stripe set to put the parity/syndrome stripes last */
5621 static void sort_parity_stripes(struct btrfs_bio *bbio, int num_stripes)
5622 {
5623         int i;
5624         int again = 1;
5625
5626         while (again) {
5627                 again = 0;
5628                 for (i = 0; i < num_stripes - 1; i++) {
5629                         /* Swap if parity is on a smaller index */
5630                         if (bbio->raid_map[i] > bbio->raid_map[i + 1]) {
5631                                 swap(bbio->stripes[i], bbio->stripes[i + 1]);
5632                                 swap(bbio->raid_map[i], bbio->raid_map[i + 1]);
5633                                 again = 1;
5634                         }
5635                 }
5636         }
5637 }
5638
5639 static struct btrfs_bio *alloc_btrfs_bio(int total_stripes, int real_stripes)
5640 {
5641         struct btrfs_bio *bbio = kzalloc(
5642                  /* the size of the btrfs_bio */
5643                 sizeof(struct btrfs_bio) +
5644                 /* plus the variable array for the stripes */
5645                 sizeof(struct btrfs_bio_stripe) * (total_stripes) +
5646                 /* plus the variable array for the tgt dev */
5647                 sizeof(int) * (real_stripes) +
5648                 /*
5649                  * plus the raid_map, which includes both the tgt dev
5650                  * and the stripes
5651                  */
5652                 sizeof(u64) * (total_stripes),
5653                 GFP_NOFS|__GFP_NOFAIL);
5654
5655         atomic_set(&bbio->error, 0);
5656         refcount_set(&bbio->refs, 1);
5657
5658         bbio->tgtdev_map = (int *)(bbio->stripes + total_stripes);
5659         bbio->raid_map = (u64 *)(bbio->tgtdev_map + real_stripes);
5660
5661         return bbio;
5662 }
5663
5664 void btrfs_get_bbio(struct btrfs_bio *bbio)
5665 {
5666         WARN_ON(!refcount_read(&bbio->refs));
5667         refcount_inc(&bbio->refs);
5668 }
5669
5670 void btrfs_put_bbio(struct btrfs_bio *bbio)
5671 {
5672         if (!bbio)
5673                 return;
5674         if (refcount_dec_and_test(&bbio->refs))
5675                 kfree(bbio);
5676 }
5677
5678 /* can REQ_OP_DISCARD be sent with other REQ like REQ_OP_WRITE? */
5679 /*
5680  * Please note that, discard won't be sent to target device of device
5681  * replace.
5682  */
5683 static int __btrfs_map_block_for_discard(struct btrfs_fs_info *fs_info,
5684                                          u64 logical, u64 *length_ret,
5685                                          struct btrfs_bio **bbio_ret)
5686 {
5687         struct extent_map *em;
5688         struct map_lookup *map;
5689         struct btrfs_bio *bbio;
5690         u64 length = *length_ret;
5691         u64 offset;
5692         u64 stripe_nr;
5693         u64 stripe_nr_end;
5694         u64 stripe_end_offset;
5695         u64 stripe_cnt;
5696         u64 stripe_len;
5697         u64 stripe_offset;
5698         u64 num_stripes;
5699         u32 stripe_index;
5700         u32 factor = 0;
5701         u32 sub_stripes = 0;
5702         u64 stripes_per_dev = 0;
5703         u32 remaining_stripes = 0;
5704         u32 last_stripe = 0;
5705         int ret = 0;
5706         int i;
5707
5708         /* discard always return a bbio */
5709         ASSERT(bbio_ret);
5710
5711         em = btrfs_get_chunk_map(fs_info, logical, length);
5712         if (IS_ERR(em))
5713                 return PTR_ERR(em);
5714
5715         map = em->map_lookup;
5716         /* we don't discard raid56 yet */
5717         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
5718                 ret = -EOPNOTSUPP;
5719                 goto out;
5720         }
5721
5722         offset = logical - em->start;
5723         length = min_t(u64, em->start + em->len - logical, length);
5724         *length_ret = length;
5725
5726         stripe_len = map->stripe_len;
5727         /*
5728          * stripe_nr counts the total number of stripes we have to stride
5729          * to get to this block
5730          */
5731         stripe_nr = div64_u64(offset, stripe_len);
5732
5733         /* stripe_offset is the offset of this block in its stripe */
5734         stripe_offset = offset - stripe_nr * stripe_len;
5735
5736         stripe_nr_end = round_up(offset + length, map->stripe_len);
5737         stripe_nr_end = div64_u64(stripe_nr_end, map->stripe_len);
5738         stripe_cnt = stripe_nr_end - stripe_nr;
5739         stripe_end_offset = stripe_nr_end * map->stripe_len -
5740                             (offset + length);
5741         /*
5742          * after this, stripe_nr is the number of stripes on this
5743          * device we have to walk to find the data, and stripe_index is
5744          * the number of our device in the stripe array
5745          */
5746         num_stripes = 1;
5747         stripe_index = 0;
5748         if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
5749                          BTRFS_BLOCK_GROUP_RAID10)) {
5750                 if (map->type & BTRFS_BLOCK_GROUP_RAID0)
5751                         sub_stripes = 1;
5752                 else
5753                         sub_stripes = map->sub_stripes;
5754
5755                 factor = map->num_stripes / sub_stripes;
5756                 num_stripes = min_t(u64, map->num_stripes,
5757                                     sub_stripes * stripe_cnt);
5758                 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
5759                 stripe_index *= sub_stripes;
5760                 stripes_per_dev = div_u64_rem(stripe_cnt, factor,
5761                                               &remaining_stripes);
5762                 div_u64_rem(stripe_nr_end - 1, factor, &last_stripe);
5763                 last_stripe *= sub_stripes;
5764         } else if (map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK |
5765                                 BTRFS_BLOCK_GROUP_DUP)) {
5766                 num_stripes = map->num_stripes;
5767         } else {
5768                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
5769                                         &stripe_index);
5770         }
5771
5772         bbio = alloc_btrfs_bio(num_stripes, 0);
5773         if (!bbio) {
5774                 ret = -ENOMEM;
5775                 goto out;
5776         }
5777
5778         for (i = 0; i < num_stripes; i++) {
5779                 bbio->stripes[i].physical =
5780                         map->stripes[stripe_index].physical +
5781                         stripe_offset + stripe_nr * map->stripe_len;
5782                 bbio->stripes[i].dev = map->stripes[stripe_index].dev;
5783
5784                 if (map->type & (BTRFS_BLOCK_GROUP_RAID0 |
5785                                  BTRFS_BLOCK_GROUP_RAID10)) {
5786                         bbio->stripes[i].length = stripes_per_dev *
5787                                 map->stripe_len;
5788
5789                         if (i / sub_stripes < remaining_stripes)
5790                                 bbio->stripes[i].length +=
5791                                         map->stripe_len;
5792
5793                         /*
5794                          * Special for the first stripe and
5795                          * the last stripe:
5796                          *
5797                          * |-------|...|-------|
5798                          *     |----------|
5799                          *    off     end_off
5800                          */
5801                         if (i < sub_stripes)
5802                                 bbio->stripes[i].length -=
5803                                         stripe_offset;
5804
5805                         if (stripe_index >= last_stripe &&
5806                             stripe_index <= (last_stripe +
5807                                              sub_stripes - 1))
5808                                 bbio->stripes[i].length -=
5809                                         stripe_end_offset;
5810
5811                         if (i == sub_stripes - 1)
5812                                 stripe_offset = 0;
5813                 } else {
5814                         bbio->stripes[i].length = length;
5815                 }
5816
5817                 stripe_index++;
5818                 if (stripe_index == map->num_stripes) {
5819                         stripe_index = 0;
5820                         stripe_nr++;
5821                 }
5822         }
5823
5824         *bbio_ret = bbio;
5825         bbio->map_type = map->type;
5826         bbio->num_stripes = num_stripes;
5827 out:
5828         free_extent_map(em);
5829         return ret;
5830 }
5831
5832 /*
5833  * In dev-replace case, for repair case (that's the only case where the mirror
5834  * is selected explicitly when calling btrfs_map_block), blocks left of the
5835  * left cursor can also be read from the target drive.
5836  *
5837  * For REQ_GET_READ_MIRRORS, the target drive is added as the last one to the
5838  * array of stripes.
5839  * For READ, it also needs to be supported using the same mirror number.
5840  *
5841  * If the requested block is not left of the left cursor, EIO is returned. This
5842  * can happen because btrfs_num_copies() returns one more in the dev-replace
5843  * case.
5844  */
5845 static int get_extra_mirror_from_replace(struct btrfs_fs_info *fs_info,
5846                                          u64 logical, u64 length,
5847                                          u64 srcdev_devid, int *mirror_num,
5848                                          u64 *physical)
5849 {
5850         struct btrfs_bio *bbio = NULL;
5851         int num_stripes;
5852         int index_srcdev = 0;
5853         int found = 0;
5854         u64 physical_of_found = 0;
5855         int i;
5856         int ret = 0;
5857
5858         ret = __btrfs_map_block(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
5859                                 logical, &length, &bbio, 0, 0);
5860         if (ret) {
5861                 ASSERT(bbio == NULL);
5862                 return ret;
5863         }
5864
5865         num_stripes = bbio->num_stripes;
5866         if (*mirror_num > num_stripes) {
5867                 /*
5868                  * BTRFS_MAP_GET_READ_MIRRORS does not contain this mirror,
5869                  * that means that the requested area is not left of the left
5870                  * cursor
5871                  */
5872                 btrfs_put_bbio(bbio);
5873                 return -EIO;
5874         }
5875
5876         /*
5877          * process the rest of the function using the mirror_num of the source
5878          * drive. Therefore look it up first.  At the end, patch the device
5879          * pointer to the one of the target drive.
5880          */
5881         for (i = 0; i < num_stripes; i++) {
5882                 if (bbio->stripes[i].dev->devid != srcdev_devid)
5883                         continue;
5884
5885                 /*
5886                  * In case of DUP, in order to keep it simple, only add the
5887                  * mirror with the lowest physical address
5888                  */
5889                 if (found &&
5890                     physical_of_found <= bbio->stripes[i].physical)
5891                         continue;
5892
5893                 index_srcdev = i;
5894                 found = 1;
5895                 physical_of_found = bbio->stripes[i].physical;
5896         }
5897
5898         btrfs_put_bbio(bbio);
5899
5900         ASSERT(found);
5901         if (!found)
5902                 return -EIO;
5903
5904         *mirror_num = index_srcdev + 1;
5905         *physical = physical_of_found;
5906         return ret;
5907 }
5908
5909 static void handle_ops_on_dev_replace(enum btrfs_map_op op,
5910                                       struct btrfs_bio **bbio_ret,
5911                                       struct btrfs_dev_replace *dev_replace,
5912                                       int *num_stripes_ret, int *max_errors_ret)
5913 {
5914         struct btrfs_bio *bbio = *bbio_ret;
5915         u64 srcdev_devid = dev_replace->srcdev->devid;
5916         int tgtdev_indexes = 0;
5917         int num_stripes = *num_stripes_ret;
5918         int max_errors = *max_errors_ret;
5919         int i;
5920
5921         if (op == BTRFS_MAP_WRITE) {
5922                 int index_where_to_add;
5923
5924                 /*
5925                  * duplicate the write operations while the dev replace
5926                  * procedure is running. Since the copying of the old disk to
5927                  * the new disk takes place at run time while the filesystem is
5928                  * mounted writable, the regular write operations to the old
5929                  * disk have to be duplicated to go to the new disk as well.
5930                  *
5931                  * Note that device->missing is handled by the caller, and that
5932                  * the write to the old disk is already set up in the stripes
5933                  * array.
5934                  */
5935                 index_where_to_add = num_stripes;
5936                 for (i = 0; i < num_stripes; i++) {
5937                         if (bbio->stripes[i].dev->devid == srcdev_devid) {
5938                                 /* write to new disk, too */
5939                                 struct btrfs_bio_stripe *new =
5940                                         bbio->stripes + index_where_to_add;
5941                                 struct btrfs_bio_stripe *old =
5942                                         bbio->stripes + i;
5943
5944                                 new->physical = old->physical;
5945                                 new->length = old->length;
5946                                 new->dev = dev_replace->tgtdev;
5947                                 bbio->tgtdev_map[i] = index_where_to_add;
5948                                 index_where_to_add++;
5949                                 max_errors++;
5950                                 tgtdev_indexes++;
5951                         }
5952                 }
5953                 num_stripes = index_where_to_add;
5954         } else if (op == BTRFS_MAP_GET_READ_MIRRORS) {
5955                 int index_srcdev = 0;
5956                 int found = 0;
5957                 u64 physical_of_found = 0;
5958
5959                 /*
5960                  * During the dev-replace procedure, the target drive can also
5961                  * be used to read data in case it is needed to repair a corrupt
5962                  * block elsewhere. This is possible if the requested area is
5963                  * left of the left cursor. In this area, the target drive is a
5964                  * full copy of the source drive.
5965                  */
5966                 for (i = 0; i < num_stripes; i++) {
5967                         if (bbio->stripes[i].dev->devid == srcdev_devid) {
5968                                 /*
5969                                  * In case of DUP, in order to keep it simple,
5970                                  * only add the mirror with the lowest physical
5971                                  * address
5972                                  */
5973                                 if (found &&
5974                                     physical_of_found <=
5975                                      bbio->stripes[i].physical)
5976                                         continue;
5977                                 index_srcdev = i;
5978                                 found = 1;
5979                                 physical_of_found = bbio->stripes[i].physical;
5980                         }
5981                 }
5982                 if (found) {
5983                         struct btrfs_bio_stripe *tgtdev_stripe =
5984                                 bbio->stripes + num_stripes;
5985
5986                         tgtdev_stripe->physical = physical_of_found;
5987                         tgtdev_stripe->length =
5988                                 bbio->stripes[index_srcdev].length;
5989                         tgtdev_stripe->dev = dev_replace->tgtdev;
5990                         bbio->tgtdev_map[index_srcdev] = num_stripes;
5991
5992                         tgtdev_indexes++;
5993                         num_stripes++;
5994                 }
5995         }
5996
5997         *num_stripes_ret = num_stripes;
5998         *max_errors_ret = max_errors;
5999         bbio->num_tgtdevs = tgtdev_indexes;
6000         *bbio_ret = bbio;
6001 }
6002
6003 static bool need_full_stripe(enum btrfs_map_op op)
6004 {
6005         return (op == BTRFS_MAP_WRITE || op == BTRFS_MAP_GET_READ_MIRRORS);
6006 }
6007
6008 /*
6009  * btrfs_get_io_geometry - calculates the geomery of a particular (address, len)
6010  *                     tuple. This information is used to calculate how big a
6011  *                     particular bio can get before it straddles a stripe.
6012  *
6013  * @fs_info - the filesystem
6014  * @logical - address that we want to figure out the geometry of
6015  * @len     - the length of IO we are going to perform, starting at @logical
6016  * @op      - type of operation - write or read
6017  * @io_geom - pointer used to return values
6018  *
6019  * Returns < 0 in case a chunk for the given logical address cannot be found,
6020  * usually shouldn't happen unless @logical is corrupted, 0 otherwise.
6021  */
6022 int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6023                         u64 logical, u64 len, struct btrfs_io_geometry *io_geom)
6024 {
6025         struct extent_map *em;
6026         struct map_lookup *map;
6027         u64 offset;
6028         u64 stripe_offset;
6029         u64 stripe_nr;
6030         u64 stripe_len;
6031         u64 raid56_full_stripe_start = (u64)-1;
6032         int data_stripes;
6033         int ret = 0;
6034
6035         ASSERT(op != BTRFS_MAP_DISCARD);
6036
6037         em = btrfs_get_chunk_map(fs_info, logical, len);
6038         if (IS_ERR(em))
6039                 return PTR_ERR(em);
6040
6041         map = em->map_lookup;
6042         /* Offset of this logical address in the chunk */
6043         offset = logical - em->start;
6044         /* Len of a stripe in a chunk */
6045         stripe_len = map->stripe_len;
6046         /* Stripe wher this block falls in */
6047         stripe_nr = div64_u64(offset, stripe_len);
6048         /* Offset of stripe in the chunk */
6049         stripe_offset = stripe_nr * stripe_len;
6050         if (offset < stripe_offset) {
6051                 btrfs_crit(fs_info,
6052 "stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
6053                         stripe_offset, offset, em->start, logical, stripe_len);
6054                 ret = -EINVAL;
6055                 goto out;
6056         }
6057
6058         /* stripe_offset is the offset of this block in its stripe */
6059         stripe_offset = offset - stripe_offset;
6060         data_stripes = nr_data_stripes(map);
6061
6062         if (map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK) {
6063                 u64 max_len = stripe_len - stripe_offset;
6064
6065                 /*
6066                  * In case of raid56, we need to know the stripe aligned start
6067                  */
6068                 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
6069                         unsigned long full_stripe_len = stripe_len * data_stripes;
6070                         raid56_full_stripe_start = offset;
6071
6072                         /*
6073                          * Allow a write of a full stripe, but make sure we
6074                          * don't allow straddling of stripes
6075                          */
6076                         raid56_full_stripe_start = div64_u64(raid56_full_stripe_start,
6077                                         full_stripe_len);
6078                         raid56_full_stripe_start *= full_stripe_len;
6079
6080                         /*
6081                          * For writes to RAID[56], allow a full stripeset across
6082                          * all disks. For other RAID types and for RAID[56]
6083                          * reads, just allow a single stripe (on a single disk).
6084                          */
6085                         if (op == BTRFS_MAP_WRITE) {
6086                                 max_len = stripe_len * data_stripes -
6087                                           (offset - raid56_full_stripe_start);
6088                         }
6089                 }
6090                 len = min_t(u64, em->len - offset, max_len);
6091         } else {
6092                 len = em->len - offset;
6093         }
6094
6095         io_geom->len = len;
6096         io_geom->offset = offset;
6097         io_geom->stripe_len = stripe_len;
6098         io_geom->stripe_nr = stripe_nr;
6099         io_geom->stripe_offset = stripe_offset;
6100         io_geom->raid56_stripe_offset = raid56_full_stripe_start;
6101
6102 out:
6103         /* once for us */
6104         free_extent_map(em);
6105         return ret;
6106 }
6107
6108 static int __btrfs_map_block(struct btrfs_fs_info *fs_info,
6109                              enum btrfs_map_op op,
6110                              u64 logical, u64 *length,
6111                              struct btrfs_bio **bbio_ret,
6112                              int mirror_num, int need_raid_map)
6113 {
6114         struct extent_map *em;
6115         struct map_lookup *map;
6116         u64 stripe_offset;
6117         u64 stripe_nr;
6118         u64 stripe_len;
6119         u32 stripe_index;
6120         int data_stripes;
6121         int i;
6122         int ret = 0;
6123         int num_stripes;
6124         int max_errors = 0;
6125         int tgtdev_indexes = 0;
6126         struct btrfs_bio *bbio = NULL;
6127         struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
6128         int dev_replace_is_ongoing = 0;
6129         int num_alloc_stripes;
6130         int patch_the_first_stripe_for_dev_replace = 0;
6131         u64 physical_to_patch_in_first_stripe = 0;
6132         u64 raid56_full_stripe_start = (u64)-1;
6133         struct btrfs_io_geometry geom;
6134
6135         ASSERT(bbio_ret);
6136         ASSERT(op != BTRFS_MAP_DISCARD);
6137
6138         ret = btrfs_get_io_geometry(fs_info, op, logical, *length, &geom);
6139         if (ret < 0)
6140                 return ret;
6141
6142         em = btrfs_get_chunk_map(fs_info, logical, *length);
6143         ASSERT(!IS_ERR(em));
6144         map = em->map_lookup;
6145
6146         *length = geom.len;
6147         stripe_len = geom.stripe_len;
6148         stripe_nr = geom.stripe_nr;
6149         stripe_offset = geom.stripe_offset;
6150         raid56_full_stripe_start = geom.raid56_stripe_offset;
6151         data_stripes = nr_data_stripes(map);
6152
6153         down_read(&dev_replace->rwsem);
6154         dev_replace_is_ongoing = btrfs_dev_replace_is_ongoing(dev_replace);
6155         /*
6156          * Hold the semaphore for read during the whole operation, write is
6157          * requested at commit time but must wait.
6158          */
6159         if (!dev_replace_is_ongoing)
6160                 up_read(&dev_replace->rwsem);
6161
6162         if (dev_replace_is_ongoing && mirror_num == map->num_stripes + 1 &&
6163             !need_full_stripe(op) && dev_replace->tgtdev != NULL) {
6164                 ret = get_extra_mirror_from_replace(fs_info, logical, *length,
6165                                                     dev_replace->srcdev->devid,
6166                                                     &mirror_num,
6167                                             &physical_to_patch_in_first_stripe);
6168                 if (ret)
6169                         goto out;
6170                 else
6171                         patch_the_first_stripe_for_dev_replace = 1;
6172         } else if (mirror_num > map->num_stripes) {
6173                 mirror_num = 0;
6174         }
6175
6176         num_stripes = 1;
6177         stripe_index = 0;
6178         if (map->type & BTRFS_BLOCK_GROUP_RAID0) {
6179                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
6180                                 &stripe_index);
6181                 if (!need_full_stripe(op))
6182                         mirror_num = 1;
6183         } else if (map->type & BTRFS_BLOCK_GROUP_RAID1_MASK) {
6184                 if (need_full_stripe(op))
6185                         num_stripes = map->num_stripes;
6186                 else if (mirror_num)
6187                         stripe_index = mirror_num - 1;
6188                 else {
6189                         stripe_index = find_live_mirror(fs_info, map, 0,
6190                                             dev_replace_is_ongoing);
6191                         mirror_num = stripe_index + 1;
6192                 }
6193
6194         } else if (map->type & BTRFS_BLOCK_GROUP_DUP) {
6195                 if (need_full_stripe(op)) {
6196                         num_stripes = map->num_stripes;
6197                 } else if (mirror_num) {
6198                         stripe_index = mirror_num - 1;
6199                 } else {
6200                         mirror_num = 1;
6201                 }
6202
6203         } else if (map->type & BTRFS_BLOCK_GROUP_RAID10) {
6204                 u32 factor = map->num_stripes / map->sub_stripes;
6205
6206                 stripe_nr = div_u64_rem(stripe_nr, factor, &stripe_index);
6207                 stripe_index *= map->sub_stripes;
6208
6209                 if (need_full_stripe(op))
6210                         num_stripes = map->sub_stripes;
6211                 else if (mirror_num)
6212                         stripe_index += mirror_num - 1;
6213                 else {
6214                         int old_stripe_index = stripe_index;
6215                         stripe_index = find_live_mirror(fs_info, map,
6216                                               stripe_index,
6217                                               dev_replace_is_ongoing);
6218                         mirror_num = stripe_index - old_stripe_index + 1;
6219                 }
6220
6221         } else if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
6222                 if (need_raid_map && (need_full_stripe(op) || mirror_num > 1)) {
6223                         /* push stripe_nr back to the start of the full stripe */
6224                         stripe_nr = div64_u64(raid56_full_stripe_start,
6225                                         stripe_len * data_stripes);
6226
6227                         /* RAID[56] write or recovery. Return all stripes */
6228                         num_stripes = map->num_stripes;
6229                         max_errors = nr_parity_stripes(map);
6230
6231                         *length = map->stripe_len;
6232                         stripe_index = 0;
6233                         stripe_offset = 0;
6234                 } else {
6235                         /*
6236                          * Mirror #0 or #1 means the original data block.
6237                          * Mirror #2 is RAID5 parity block.
6238                          * Mirror #3 is RAID6 Q block.
6239                          */
6240                         stripe_nr = div_u64_rem(stripe_nr,
6241                                         data_stripes, &stripe_index);
6242                         if (mirror_num > 1)
6243                                 stripe_index = data_stripes + mirror_num - 2;
6244
6245                         /* We distribute the parity blocks across stripes */
6246                         div_u64_rem(stripe_nr + stripe_index, map->num_stripes,
6247                                         &stripe_index);
6248                         if (!need_full_stripe(op) && mirror_num <= 1)
6249                                 mirror_num = 1;
6250                 }
6251         } else {
6252                 /*
6253                  * after this, stripe_nr is the number of stripes on this
6254                  * device we have to walk to find the data, and stripe_index is
6255                  * the number of our device in the stripe array
6256                  */
6257                 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes,
6258                                 &stripe_index);
6259                 mirror_num = stripe_index + 1;
6260         }
6261         if (stripe_index >= map->num_stripes) {
6262                 btrfs_crit(fs_info,
6263                            "stripe index math went horribly wrong, got stripe_index=%u, num_stripes=%u",
6264                            stripe_index, map->num_stripes);
6265                 ret = -EINVAL;
6266                 goto out;
6267         }
6268
6269         num_alloc_stripes = num_stripes;
6270         if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL) {
6271                 if (op == BTRFS_MAP_WRITE)
6272                         num_alloc_stripes <<= 1;
6273                 if (op == BTRFS_MAP_GET_READ_MIRRORS)
6274                         num_alloc_stripes++;
6275                 tgtdev_indexes = num_stripes;
6276         }
6277
6278         bbio = alloc_btrfs_bio(num_alloc_stripes, tgtdev_indexes);
6279         if (!bbio) {
6280                 ret = -ENOMEM;
6281                 goto out;
6282         }
6283
6284         for (i = 0; i < num_stripes; i++) {
6285                 bbio->stripes[i].physical = map->stripes[stripe_index].physical +
6286                         stripe_offset + stripe_nr * map->stripe_len;
6287                 bbio->stripes[i].dev = map->stripes[stripe_index].dev;
6288                 stripe_index++;
6289         }
6290
6291         /* build raid_map */
6292         if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK && need_raid_map &&
6293             (need_full_stripe(op) || mirror_num > 1)) {
6294                 u64 tmp;
6295                 unsigned rot;
6296
6297                 /* Work out the disk rotation on this stripe-set */
6298                 div_u64_rem(stripe_nr, num_stripes, &rot);
6299
6300                 /* Fill in the logical address of each stripe */
6301                 tmp = stripe_nr * data_stripes;
6302                 for (i = 0; i < data_stripes; i++)
6303                         bbio->raid_map[(i+rot) % num_stripes] =
6304                                 em->start + (tmp + i) * map->stripe_len;
6305
6306                 bbio->raid_map[(i+rot) % map->num_stripes] = RAID5_P_STRIPE;
6307                 if (map->type & BTRFS_BLOCK_GROUP_RAID6)
6308                         bbio->raid_map[(i+rot+1) % num_stripes] =
6309                                 RAID6_Q_STRIPE;
6310
6311                 sort_parity_stripes(bbio, num_stripes);
6312         }
6313
6314         if (need_full_stripe(op))
6315                 max_errors = btrfs_chunk_max_errors(map);
6316
6317         if (dev_replace_is_ongoing && dev_replace->tgtdev != NULL &&
6318             need_full_stripe(op)) {
6319                 handle_ops_on_dev_replace(op, &bbio, dev_replace, &num_stripes,
6320                                           &max_errors);
6321         }
6322
6323         *bbio_ret = bbio;
6324         bbio->map_type = map->type;
6325         bbio->num_stripes = num_stripes;
6326         bbio->max_errors = max_errors;
6327         bbio->mirror_num = mirror_num;
6328
6329         /*
6330          * this is the case that REQ_READ && dev_replace_is_ongoing &&
6331          * mirror_num == num_stripes + 1 && dev_replace target drive is
6332          * available as a mirror
6333          */
6334         if (patch_the_first_stripe_for_dev_replace && num_stripes > 0) {
6335                 WARN_ON(num_stripes > 1);
6336                 bbio->stripes[0].dev = dev_replace->tgtdev;
6337                 bbio->stripes[0].physical = physical_to_patch_in_first_stripe;
6338                 bbio->mirror_num = map->num_stripes + 1;
6339         }
6340 out:
6341         if (dev_replace_is_ongoing) {
6342                 lockdep_assert_held(&dev_replace->rwsem);
6343                 /* Unlock and let waiting writers proceed */
6344                 up_read(&dev_replace->rwsem);
6345         }
6346         free_extent_map(em);
6347         return ret;
6348 }
6349
6350 int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6351                       u64 logical, u64 *length,
6352                       struct btrfs_bio **bbio_ret, int mirror_num)
6353 {
6354         if (op == BTRFS_MAP_DISCARD)
6355                 return __btrfs_map_block_for_discard(fs_info, logical,
6356                                                      length, bbio_ret);
6357
6358         return __btrfs_map_block(fs_info, op, logical, length, bbio_ret,
6359                                  mirror_num, 0);
6360 }
6361
6362 /* For Scrub/replace */
6363 int btrfs_map_sblock(struct btrfs_fs_info *fs_info, enum btrfs_map_op op,
6364                      u64 logical, u64 *length,
6365                      struct btrfs_bio **bbio_ret)
6366 {
6367         return __btrfs_map_block(fs_info, op, logical, length, bbio_ret, 0, 1);
6368 }
6369
6370 static inline void btrfs_end_bbio(struct btrfs_bio *bbio, struct bio *bio)
6371 {
6372         bio->bi_private = bbio->private;
6373         bio->bi_end_io = bbio->end_io;
6374         bio_endio(bio);
6375
6376         btrfs_put_bbio(bbio);
6377 }
6378
6379 static void btrfs_end_bio(struct bio *bio)
6380 {
6381         struct btrfs_bio *bbio = bio->bi_private;
6382         int is_orig_bio = 0;
6383
6384         if (bio->bi_status) {
6385                 atomic_inc(&bbio->error);
6386                 if (bio->bi_status == BLK_STS_IOERR ||
6387                     bio->bi_status == BLK_STS_TARGET) {
6388                         struct btrfs_device *dev = btrfs_io_bio(bio)->device;
6389
6390                         ASSERT(dev->bdev);
6391                         if (bio_op(bio) == REQ_OP_WRITE)
6392                                 btrfs_dev_stat_inc_and_print(dev,
6393                                                 BTRFS_DEV_STAT_WRITE_ERRS);
6394                         else if (!(bio->bi_opf & REQ_RAHEAD))
6395                                 btrfs_dev_stat_inc_and_print(dev,
6396                                                 BTRFS_DEV_STAT_READ_ERRS);
6397                         if (bio->bi_opf & REQ_PREFLUSH)
6398                                 btrfs_dev_stat_inc_and_print(dev,
6399                                                 BTRFS_DEV_STAT_FLUSH_ERRS);
6400                 }
6401         }
6402
6403         if (bio == bbio->orig_bio)
6404                 is_orig_bio = 1;
6405
6406         btrfs_bio_counter_dec(bbio->fs_info);
6407
6408         if (atomic_dec_and_test(&bbio->stripes_pending)) {
6409                 if (!is_orig_bio) {
6410                         bio_put(bio);
6411                         bio = bbio->orig_bio;
6412                 }
6413
6414                 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
6415                 /* only send an error to the higher layers if it is
6416                  * beyond the tolerance of the btrfs bio
6417                  */
6418                 if (atomic_read(&bbio->error) > bbio->max_errors) {
6419                         bio->bi_status = BLK_STS_IOERR;
6420                 } else {
6421                         /*
6422                          * this bio is actually up to date, we didn't
6423                          * go over the max number of errors
6424                          */
6425                         bio->bi_status = BLK_STS_OK;
6426                 }
6427
6428                 btrfs_end_bbio(bbio, bio);
6429         } else if (!is_orig_bio) {
6430                 bio_put(bio);
6431         }
6432 }
6433
6434 static void submit_stripe_bio(struct btrfs_bio *bbio, struct bio *bio,
6435                               u64 physical, struct btrfs_device *dev)
6436 {
6437         struct btrfs_fs_info *fs_info = bbio->fs_info;
6438
6439         bio->bi_private = bbio;
6440         btrfs_io_bio(bio)->device = dev;
6441         bio->bi_end_io = btrfs_end_bio;
6442         bio->bi_iter.bi_sector = physical >> 9;
6443         btrfs_debug_in_rcu(fs_info,
6444         "btrfs_map_bio: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
6445                 bio_op(bio), bio->bi_opf, (u64)bio->bi_iter.bi_sector,
6446                 (unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
6447                 dev->devid, bio->bi_iter.bi_size);
6448         bio_set_dev(bio, dev->bdev);
6449
6450         btrfs_bio_counter_inc_noblocked(fs_info);
6451
6452         btrfsic_submit_bio(bio);
6453 }
6454
6455 static void bbio_error(struct btrfs_bio *bbio, struct bio *bio, u64 logical)
6456 {
6457         atomic_inc(&bbio->error);
6458         if (atomic_dec_and_test(&bbio->stripes_pending)) {
6459                 /* Should be the original bio. */
6460                 WARN_ON(bio != bbio->orig_bio);
6461
6462                 btrfs_io_bio(bio)->mirror_num = bbio->mirror_num;
6463                 bio->bi_iter.bi_sector = logical >> 9;
6464                 if (atomic_read(&bbio->error) > bbio->max_errors)
6465                         bio->bi_status = BLK_STS_IOERR;
6466                 else
6467                         bio->bi_status = BLK_STS_OK;
6468                 btrfs_end_bbio(bbio, bio);
6469         }
6470 }
6471
6472 blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
6473                            int mirror_num)
6474 {
6475         struct btrfs_device *dev;
6476         struct bio *first_bio = bio;
6477         u64 logical = (u64)bio->bi_iter.bi_sector << 9;
6478         u64 length = 0;
6479         u64 map_length;
6480         int ret;
6481         int dev_nr;
6482         int total_devs;
6483         struct btrfs_bio *bbio = NULL;
6484
6485         length = bio->bi_iter.bi_size;
6486         map_length = length;
6487
6488         btrfs_bio_counter_inc_blocked(fs_info);
6489         ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical,
6490                                 &map_length, &bbio, mirror_num, 1);
6491         if (ret) {
6492                 btrfs_bio_counter_dec(fs_info);
6493                 return errno_to_blk_status(ret);
6494         }
6495
6496         total_devs = bbio->num_stripes;
6497         bbio->orig_bio = first_bio;
6498         bbio->private = first_bio->bi_private;
6499         bbio->end_io = first_bio->bi_end_io;
6500         bbio->fs_info = fs_info;
6501         atomic_set(&bbio->stripes_pending, bbio->num_stripes);
6502
6503         if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) &&
6504             ((bio_op(bio) == REQ_OP_WRITE) || (mirror_num > 1))) {
6505                 /* In this case, map_length has been set to the length of
6506                    a single stripe; not the whole write */
6507                 if (bio_op(bio) == REQ_OP_WRITE) {
6508                         ret = raid56_parity_write(fs_info, bio, bbio,
6509                                                   map_length);
6510                 } else {
6511                         ret = raid56_parity_recover(fs_info, bio, bbio,
6512                                                     map_length, mirror_num, 1);
6513                 }
6514
6515                 btrfs_bio_counter_dec(fs_info);
6516                 return errno_to_blk_status(ret);
6517         }
6518
6519         if (map_length < length) {
6520                 btrfs_crit(fs_info,
6521                            "mapping failed logical %llu bio len %llu len %llu",
6522                            logical, length, map_length);
6523                 BUG();
6524         }
6525
6526         for (dev_nr = 0; dev_nr < total_devs; dev_nr++) {
6527                 dev = bbio->stripes[dev_nr].dev;
6528                 if (!dev || !dev->bdev || test_bit(BTRFS_DEV_STATE_MISSING,
6529                                                    &dev->dev_state) ||
6530                     (bio_op(first_bio) == REQ_OP_WRITE &&
6531                     !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
6532                         bbio_error(bbio, first_bio, logical);
6533                         continue;
6534                 }
6535
6536                 if (dev_nr < total_devs - 1)
6537                         bio = btrfs_bio_clone(first_bio);
6538                 else
6539                         bio = first_bio;
6540
6541                 submit_stripe_bio(bbio, bio, bbio->stripes[dev_nr].physical, dev);
6542         }
6543         btrfs_bio_counter_dec(fs_info);
6544         return BLK_STS_OK;
6545 }
6546
6547 /*
6548  * Find a device specified by @devid or @uuid in the list of @fs_devices, or
6549  * return NULL.
6550  *
6551  * If devid and uuid are both specified, the match must be exact, otherwise
6552  * only devid is used.
6553  *
6554  * If @seed is true, traverse through the seed devices.
6555  */
6556 struct btrfs_device *btrfs_find_device(struct btrfs_fs_devices *fs_devices,
6557                                        u64 devid, u8 *uuid, u8 *fsid,
6558                                        bool seed)
6559 {
6560         struct btrfs_device *device;
6561         struct btrfs_fs_devices *seed_devs;
6562
6563         if (!fsid || !memcmp(fs_devices->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
6564                 list_for_each_entry(device, &fs_devices->devices, dev_list) {
6565                         if (device->devid == devid &&
6566                             (!uuid || memcmp(device->uuid, uuid,
6567                                              BTRFS_UUID_SIZE) == 0))
6568                                 return device;
6569                 }
6570         }
6571
6572         list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
6573                 if (!fsid ||
6574                     !memcmp(seed_devs->metadata_uuid, fsid, BTRFS_FSID_SIZE)) {
6575                         list_for_each_entry(device, &seed_devs->devices,
6576                                             dev_list) {
6577                                 if (device->devid == devid &&
6578                                     (!uuid || memcmp(device->uuid, uuid,
6579                                                      BTRFS_UUID_SIZE) == 0))
6580                                         return device;
6581                         }
6582                 }
6583         }
6584
6585         return NULL;
6586 }
6587
6588 static struct btrfs_device *add_missing_dev(struct btrfs_fs_devices *fs_devices,
6589                                             u64 devid, u8 *dev_uuid)
6590 {
6591         struct btrfs_device *device;
6592         unsigned int nofs_flag;
6593
6594         /*
6595          * We call this under the chunk_mutex, so we want to use NOFS for this
6596          * allocation, however we don't want to change btrfs_alloc_device() to
6597          * always do NOFS because we use it in a lot of other GFP_KERNEL safe
6598          * places.
6599          */
6600         nofs_flag = memalloc_nofs_save();
6601         device = btrfs_alloc_device(NULL, &devid, dev_uuid);
6602         memalloc_nofs_restore(nofs_flag);
6603         if (IS_ERR(device))
6604                 return device;
6605
6606         list_add(&device->dev_list, &fs_devices->devices);
6607         device->fs_devices = fs_devices;
6608         fs_devices->num_devices++;
6609
6610         set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
6611         fs_devices->missing_devices++;
6612
6613         return device;
6614 }
6615
6616 /**
6617  * btrfs_alloc_device - allocate struct btrfs_device
6618  * @fs_info:    used only for generating a new devid, can be NULL if
6619  *              devid is provided (i.e. @devid != NULL).
6620  * @devid:      a pointer to devid for this device.  If NULL a new devid
6621  *              is generated.
6622  * @uuid:       a pointer to UUID for this device.  If NULL a new UUID
6623  *              is generated.
6624  *
6625  * Return: a pointer to a new &struct btrfs_device on success; ERR_PTR()
6626  * on error.  Returned struct is not linked onto any lists and must be
6627  * destroyed with btrfs_free_device.
6628  */
6629 struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info,
6630                                         const u64 *devid,
6631                                         const u8 *uuid)
6632 {
6633         struct btrfs_device *dev;
6634         u64 tmp;
6635
6636         if (WARN_ON(!devid && !fs_info))
6637                 return ERR_PTR(-EINVAL);
6638
6639         dev = __alloc_device(fs_info);
6640         if (IS_ERR(dev))
6641                 return dev;
6642
6643         if (devid)
6644                 tmp = *devid;
6645         else {
6646                 int ret;
6647
6648                 ret = find_next_devid(fs_info, &tmp);
6649                 if (ret) {
6650                         btrfs_free_device(dev);
6651                         return ERR_PTR(ret);
6652                 }
6653         }
6654         dev->devid = tmp;
6655
6656         if (uuid)
6657                 memcpy(dev->uuid, uuid, BTRFS_UUID_SIZE);
6658         else
6659                 generate_random_uuid(dev->uuid);
6660
6661         return dev;
6662 }
6663
6664 static void btrfs_report_missing_device(struct btrfs_fs_info *fs_info,
6665                                         u64 devid, u8 *uuid, bool error)
6666 {
6667         if (error)
6668                 btrfs_err_rl(fs_info, "devid %llu uuid %pU is missing",
6669                               devid, uuid);
6670         else
6671                 btrfs_warn_rl(fs_info, "devid %llu uuid %pU is missing",
6672                               devid, uuid);
6673 }
6674
6675 static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes)
6676 {
6677         int index = btrfs_bg_flags_to_raid_index(type);
6678         int ncopies = btrfs_raid_array[index].ncopies;
6679         const int nparity = btrfs_raid_array[index].nparity;
6680         int data_stripes;
6681
6682         if (nparity)
6683                 data_stripes = num_stripes - nparity;
6684         else
6685                 data_stripes = num_stripes / ncopies;
6686
6687         return div_u64(chunk_len, data_stripes);
6688 }
6689
6690 static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf,
6691                           struct btrfs_chunk *chunk)
6692 {
6693         struct btrfs_fs_info *fs_info = leaf->fs_info;
6694         struct extent_map_tree *map_tree = &fs_info->mapping_tree;
6695         struct map_lookup *map;
6696         struct extent_map *em;
6697         u64 logical;
6698         u64 length;
6699         u64 devid;
6700         u8 uuid[BTRFS_UUID_SIZE];
6701         int num_stripes;
6702         int ret;
6703         int i;
6704
6705         logical = key->offset;
6706         length = btrfs_chunk_length(leaf, chunk);
6707         num_stripes = btrfs_chunk_num_stripes(leaf, chunk);
6708
6709         /*
6710          * Only need to verify chunk item if we're reading from sys chunk array,
6711          * as chunk item in tree block is already verified by tree-checker.
6712          */
6713         if (leaf->start == BTRFS_SUPER_INFO_OFFSET) {
6714                 ret = btrfs_check_chunk_valid(leaf, chunk, logical);
6715                 if (ret)
6716                         return ret;
6717         }
6718
6719         read_lock(&map_tree->lock);
6720         em = lookup_extent_mapping(map_tree, logical, 1);
6721         read_unlock(&map_tree->lock);
6722
6723         /* already mapped? */
6724         if (em && em->start <= logical && em->start + em->len > logical) {
6725                 free_extent_map(em);
6726                 return 0;
6727         } else if (em) {
6728                 free_extent_map(em);
6729         }
6730
6731         em = alloc_extent_map();
6732         if (!em)
6733                 return -ENOMEM;
6734         map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS);
6735         if (!map) {
6736                 free_extent_map(em);
6737                 return -ENOMEM;
6738         }
6739
6740         set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags);
6741         em->map_lookup = map;
6742         em->start = logical;
6743         em->len = length;
6744         em->orig_start = 0;
6745         em->block_start = 0;
6746         em->block_len = em->len;
6747
6748         map->num_stripes = num_stripes;
6749         map->io_width = btrfs_chunk_io_width(leaf, chunk);
6750         map->io_align = btrfs_chunk_io_align(leaf, chunk);
6751         map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk);
6752         map->type = btrfs_chunk_type(leaf, chunk);
6753         map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk);
6754         map->verified_stripes = 0;
6755         em->orig_block_len = calc_stripe_length(map->type, em->len,
6756                                                 map->num_stripes);
6757         for (i = 0; i < num_stripes; i++) {
6758                 map->stripes[i].physical =
6759                         btrfs_stripe_offset_nr(leaf, chunk, i);
6760                 devid = btrfs_stripe_devid_nr(leaf, chunk, i);
6761                 read_extent_buffer(leaf, uuid, (unsigned long)
6762                                    btrfs_stripe_dev_uuid_nr(chunk, i),
6763                                    BTRFS_UUID_SIZE);
6764                 map->stripes[i].dev = btrfs_find_device(fs_info->fs_devices,
6765                                                         devid, uuid, NULL, true);
6766                 if (!map->stripes[i].dev &&
6767                     !btrfs_test_opt(fs_info, DEGRADED)) {
6768                         free_extent_map(em);
6769                         btrfs_report_missing_device(fs_info, devid, uuid, true);
6770                         return -ENOENT;
6771                 }
6772                 if (!map->stripes[i].dev) {
6773                         map->stripes[i].dev =
6774                                 add_missing_dev(fs_info->fs_devices, devid,
6775                                                 uuid);
6776                         if (IS_ERR(map->stripes[i].dev)) {
6777                                 free_extent_map(em);
6778                                 btrfs_err(fs_info,
6779                                         "failed to init missing dev %llu: %ld",
6780                                         devid, PTR_ERR(map->stripes[i].dev));
6781                                 return PTR_ERR(map->stripes[i].dev);
6782                         }
6783                         btrfs_report_missing_device(fs_info, devid, uuid, false);
6784                 }
6785                 set_bit(BTRFS_DEV_STATE_IN_FS_METADATA,
6786                                 &(map->stripes[i].dev->dev_state));
6787
6788         }
6789
6790         write_lock(&map_tree->lock);
6791         ret = add_extent_mapping(map_tree, em, 0);
6792         write_unlock(&map_tree->lock);
6793         if (ret < 0) {
6794                 btrfs_err(fs_info,
6795                           "failed to add chunk map, start=%llu len=%llu: %d",
6796                           em->start, em->len, ret);
6797         }
6798         free_extent_map(em);
6799
6800         return ret;
6801 }
6802
6803 static void fill_device_from_item(struct extent_buffer *leaf,
6804                                  struct btrfs_dev_item *dev_item,
6805                                  struct btrfs_device *device)
6806 {
6807         unsigned long ptr;
6808
6809         device->devid = btrfs_device_id(leaf, dev_item);
6810         device->disk_total_bytes = btrfs_device_total_bytes(leaf, dev_item);
6811         device->total_bytes = device->disk_total_bytes;
6812         device->commit_total_bytes = device->disk_total_bytes;
6813         device->bytes_used = btrfs_device_bytes_used(leaf, dev_item);
6814         device->commit_bytes_used = device->bytes_used;
6815         device->type = btrfs_device_type(leaf, dev_item);
6816         device->io_align = btrfs_device_io_align(leaf, dev_item);
6817         device->io_width = btrfs_device_io_width(leaf, dev_item);
6818         device->sector_size = btrfs_device_sector_size(leaf, dev_item);
6819         WARN_ON(device->devid == BTRFS_DEV_REPLACE_DEVID);
6820         clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state);
6821
6822         ptr = btrfs_device_uuid(dev_item);
6823         read_extent_buffer(leaf, device->uuid, ptr, BTRFS_UUID_SIZE);
6824 }
6825
6826 static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info,
6827                                                   u8 *fsid)
6828 {
6829         struct btrfs_fs_devices *fs_devices;
6830         int ret;
6831
6832         lockdep_assert_held(&uuid_mutex);
6833         ASSERT(fsid);
6834
6835         /* This will match only for multi-device seed fs */
6836         list_for_each_entry(fs_devices, &fs_info->fs_devices->seed_list, seed_list)
6837                 if (!memcmp(fs_devices->fsid, fsid, BTRFS_FSID_SIZE))
6838                         return fs_devices;
6839
6840
6841         fs_devices = find_fsid(fsid, NULL);
6842         if (!fs_devices) {
6843                 if (!btrfs_test_opt(fs_info, DEGRADED))
6844                         return ERR_PTR(-ENOENT);
6845
6846                 fs_devices = alloc_fs_devices(fsid, NULL);
6847                 if (IS_ERR(fs_devices))
6848                         return fs_devices;
6849
6850                 fs_devices->seeding = true;
6851                 fs_devices->opened = 1;
6852                 return fs_devices;
6853         }
6854
6855         /*
6856          * Upon first call for a seed fs fsid, just create a private copy of the
6857          * respective fs_devices and anchor it at fs_info->fs_devices->seed_list
6858          */
6859         fs_devices = clone_fs_devices(fs_devices);
6860         if (IS_ERR(fs_devices))
6861                 return fs_devices;
6862
6863         ret = open_fs_devices(fs_devices, FMODE_READ, fs_info->bdev_holder);
6864         if (ret) {
6865                 free_fs_devices(fs_devices);
6866                 return ERR_PTR(ret);
6867         }
6868
6869         if (!fs_devices->seeding) {
6870                 close_fs_devices(fs_devices);
6871                 free_fs_devices(fs_devices);
6872                 return ERR_PTR(-EINVAL);
6873         }
6874
6875         list_add(&fs_devices->seed_list, &fs_info->fs_devices->seed_list);
6876
6877         return fs_devices;
6878 }
6879
6880 static int read_one_dev(struct extent_buffer *leaf,
6881                         struct btrfs_dev_item *dev_item)
6882 {
6883         struct btrfs_fs_info *fs_info = leaf->fs_info;
6884         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
6885         struct btrfs_device *device;
6886         u64 devid;
6887         int ret;
6888         u8 fs_uuid[BTRFS_FSID_SIZE];
6889         u8 dev_uuid[BTRFS_UUID_SIZE];
6890
6891         devid = btrfs_device_id(leaf, dev_item);
6892         read_extent_buffer(leaf, dev_uuid, btrfs_device_uuid(dev_item),
6893                            BTRFS_UUID_SIZE);
6894         read_extent_buffer(leaf, fs_uuid, btrfs_device_fsid(dev_item),
6895                            BTRFS_FSID_SIZE);
6896
6897         if (memcmp(fs_uuid, fs_devices->metadata_uuid, BTRFS_FSID_SIZE)) {
6898                 fs_devices = open_seed_devices(fs_info, fs_uuid);
6899                 if (IS_ERR(fs_devices))
6900                         return PTR_ERR(fs_devices);
6901         }
6902
6903         device = btrfs_find_device(fs_info->fs_devices, devid, dev_uuid,
6904                                    fs_uuid, true);
6905         if (!device) {
6906                 if (!btrfs_test_opt(fs_info, DEGRADED)) {
6907                         btrfs_report_missing_device(fs_info, devid,
6908                                                         dev_uuid, true);
6909                         return -ENOENT;
6910                 }
6911
6912                 device = add_missing_dev(fs_devices, devid, dev_uuid);
6913                 if (IS_ERR(device)) {
6914                         btrfs_err(fs_info,
6915                                 "failed to add missing dev %llu: %ld",
6916                                 devid, PTR_ERR(device));
6917                         return PTR_ERR(device);
6918                 }
6919                 btrfs_report_missing_device(fs_info, devid, dev_uuid, false);
6920         } else {
6921                 if (!device->bdev) {
6922                         if (!btrfs_test_opt(fs_info, DEGRADED)) {
6923                                 btrfs_report_missing_device(fs_info,
6924                                                 devid, dev_uuid, true);
6925                                 return -ENOENT;
6926                         }
6927                         btrfs_report_missing_device(fs_info, devid,
6928                                                         dev_uuid, false);
6929                 }
6930
6931                 if (!device->bdev &&
6932                     !test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state)) {
6933                         /*
6934                          * this happens when a device that was properly setup
6935                          * in the device info lists suddenly goes bad.
6936                          * device->bdev is NULL, and so we have to set
6937                          * device->missing to one here
6938                          */
6939                         device->fs_devices->missing_devices++;
6940                         set_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state);
6941                 }
6942
6943                 /* Move the device to its own fs_devices */
6944                 if (device->fs_devices != fs_devices) {
6945                         ASSERT(test_bit(BTRFS_DEV_STATE_MISSING,
6946                                                         &device->dev_state));
6947
6948                         list_move(&device->dev_list, &fs_devices->devices);
6949                         device->fs_devices->num_devices--;
6950                         fs_devices->num_devices++;
6951
6952                         device->fs_devices->missing_devices--;
6953                         fs_devices->missing_devices++;
6954
6955                         device->fs_devices = fs_devices;
6956                 }
6957         }
6958
6959         if (device->fs_devices != fs_info->fs_devices) {
6960                 BUG_ON(test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state));
6961                 if (device->generation !=
6962                     btrfs_device_generation(leaf, dev_item))
6963                         return -EINVAL;
6964         }
6965
6966         fill_device_from_item(leaf, dev_item, device);
6967         set_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &device->dev_state);
6968         if (test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state) &&
6969            !test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
6970                 device->fs_devices->total_rw_bytes += device->total_bytes;
6971                 atomic64_add(device->total_bytes - device->bytes_used,
6972                                 &fs_info->free_chunk_space);
6973         }
6974         ret = 0;
6975         return ret;
6976 }
6977
6978 int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
6979 {
6980         struct btrfs_root *root = fs_info->tree_root;
6981         struct btrfs_super_block *super_copy = fs_info->super_copy;
6982         struct extent_buffer *sb;
6983         struct btrfs_disk_key *disk_key;
6984         struct btrfs_chunk *chunk;
6985         u8 *array_ptr;
6986         unsigned long sb_array_offset;
6987         int ret = 0;
6988         u32 num_stripes;
6989         u32 array_size;
6990         u32 len = 0;
6991         u32 cur_offset;
6992         u64 type;
6993         struct btrfs_key key;
6994
6995         ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
6996         /*
6997          * This will create extent buffer of nodesize, superblock size is
6998          * fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
6999          * overallocate but we can keep it as-is, only the first page is used.
7000          */
7001         sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET);
7002         if (IS_ERR(sb))
7003                 return PTR_ERR(sb);
7004         set_extent_buffer_uptodate(sb);
7005         btrfs_set_buffer_lockdep_class(root->root_key.objectid, sb, 0);
7006         /*
7007          * The sb extent buffer is artificial and just used to read the system array.
7008          * set_extent_buffer_uptodate() call does not properly mark all it's
7009          * pages up-to-date when the page is larger: extent does not cover the
7010          * whole page and consequently check_page_uptodate does not find all
7011          * the page's extents up-to-date (the hole beyond sb),
7012          * write_extent_buffer then triggers a WARN_ON.
7013          *
7014          * Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
7015          * but sb spans only this function. Add an explicit SetPageUptodate call
7016          * to silence the warning eg. on PowerPC 64.
7017          */
7018         if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
7019                 SetPageUptodate(sb->pages[0]);
7020
7021         write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
7022         array_size = btrfs_super_sys_array_size(super_copy);
7023
7024         array_ptr = super_copy->sys_chunk_array;
7025         sb_array_offset = offsetof(struct btrfs_super_block, sys_chunk_array);
7026         cur_offset = 0;
7027
7028         while (cur_offset < array_size) {
7029                 disk_key = (struct btrfs_disk_key *)array_ptr;
7030                 len = sizeof(*disk_key);
7031                 if (cur_offset + len > array_size)
7032                         goto out_short_read;
7033
7034                 btrfs_disk_key_to_cpu(&key, disk_key);
7035
7036                 array_ptr += len;
7037                 sb_array_offset += len;
7038                 cur_offset += len;
7039
7040                 if (key.type != BTRFS_CHUNK_ITEM_KEY) {
7041                         btrfs_err(fs_info,
7042                             "unexpected item type %u in sys_array at offset %u",
7043                                   (u32)key.type, cur_offset);
7044                         ret = -EIO;
7045                         break;
7046                 }
7047
7048                 chunk = (struct btrfs_chunk *)sb_array_offset;
7049                 /*
7050                  * At least one btrfs_chunk with one stripe must be present,
7051                  * exact stripe count check comes afterwards
7052                  */
7053                 len = btrfs_chunk_item_size(1);
7054                 if (cur_offset + len > array_size)
7055                         goto out_short_read;
7056
7057                 num_stripes = btrfs_chunk_num_stripes(sb, chunk);
7058                 if (!num_stripes) {
7059                         btrfs_err(fs_info,
7060                         "invalid number of stripes %u in sys_array at offset %u",
7061                                   num_stripes, cur_offset);
7062                         ret = -EIO;
7063                         break;
7064                 }
7065
7066                 type = btrfs_chunk_type(sb, chunk);
7067                 if ((type & BTRFS_BLOCK_GROUP_SYSTEM) == 0) {
7068                         btrfs_err(fs_info,
7069                         "invalid chunk type %llu in sys_array at offset %u",
7070                                   type, cur_offset);
7071                         ret = -EIO;
7072                         break;
7073                 }
7074
7075                 len = btrfs_chunk_item_size(num_stripes);
7076                 if (cur_offset + len > array_size)
7077                         goto out_short_read;
7078
7079                 ret = read_one_chunk(&key, sb, chunk);
7080                 if (ret)
7081                         break;
7082
7083                 array_ptr += len;
7084                 sb_array_offset += len;
7085                 cur_offset += len;
7086         }
7087         clear_extent_buffer_uptodate(sb);
7088         free_extent_buffer_stale(sb);
7089         return ret;
7090
7091 out_short_read:
7092         btrfs_err(fs_info, "sys_array too short to read %u bytes at offset %u",
7093                         len, cur_offset);
7094         clear_extent_buffer_uptodate(sb);
7095         free_extent_buffer_stale(sb);
7096         return -EIO;
7097 }
7098
7099 /*
7100  * Check if all chunks in the fs are OK for read-write degraded mount
7101  *
7102  * If the @failing_dev is specified, it's accounted as missing.
7103  *
7104  * Return true if all chunks meet the minimal RW mount requirements.
7105  * Return false if any chunk doesn't meet the minimal RW mount requirements.
7106  */
7107 bool btrfs_check_rw_degradable(struct btrfs_fs_info *fs_info,
7108                                         struct btrfs_device *failing_dev)
7109 {
7110         struct extent_map_tree *map_tree = &fs_info->mapping_tree;
7111         struct extent_map *em;
7112         u64 next_start = 0;
7113         bool ret = true;
7114
7115         read_lock(&map_tree->lock);
7116         em = lookup_extent_mapping(map_tree, 0, (u64)-1);
7117         read_unlock(&map_tree->lock);
7118         /* No chunk at all? Return false anyway */
7119         if (!em) {
7120                 ret = false;
7121                 goto out;
7122         }
7123         while (em) {
7124                 struct map_lookup *map;
7125                 int missing = 0;
7126                 int max_tolerated;
7127                 int i;
7128
7129                 map = em->map_lookup;
7130                 max_tolerated =
7131                         btrfs_get_num_tolerated_disk_barrier_failures(
7132                                         map->type);
7133                 for (i = 0; i < map->num_stripes; i++) {
7134                         struct btrfs_device *dev = map->stripes[i].dev;
7135
7136                         if (!dev || !dev->bdev ||
7137                             test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
7138                             dev->last_flush_error)
7139                                 missing++;
7140                         else if (failing_dev && failing_dev == dev)
7141                                 missing++;
7142                 }
7143                 if (missing > max_tolerated) {
7144                         if (!failing_dev)
7145                                 btrfs_warn(fs_info,
7146         "chunk %llu missing %d devices, max tolerance is %d for writable mount",
7147                                    em->start, missing, max_tolerated);
7148                         free_extent_map(em);
7149                         ret = false;
7150                         goto out;
7151                 }
7152                 next_start = extent_map_end(em);
7153                 free_extent_map(em);
7154
7155                 read_lock(&map_tree->lock);
7156                 em = lookup_extent_mapping(map_tree, next_start,
7157                                            (u64)(-1) - next_start);
7158                 read_unlock(&map_tree->lock);
7159         }
7160 out:
7161         return ret;
7162 }
7163
7164 static void readahead_tree_node_children(struct extent_buffer *node)
7165 {
7166         int i;
7167         const int nr_items = btrfs_header_nritems(node);
7168
7169         for (i = 0; i < nr_items; i++) {
7170                 u64 start;
7171
7172                 start = btrfs_node_blockptr(node, i);
7173                 readahead_tree_block(node->fs_info, start);
7174         }
7175 }
7176
7177 int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
7178 {
7179         struct btrfs_root *root = fs_info->chunk_root;
7180         struct btrfs_path *path;
7181         struct extent_buffer *leaf;
7182         struct btrfs_key key;
7183         struct btrfs_key found_key;
7184         int ret;
7185         int slot;
7186         u64 total_dev = 0;
7187         u64 last_ra_node = 0;
7188
7189         path = btrfs_alloc_path();
7190         if (!path)
7191                 return -ENOMEM;
7192
7193         /*
7194          * uuid_mutex is needed only if we are mounting a sprout FS
7195          * otherwise we don't need it.
7196          */
7197         mutex_lock(&uuid_mutex);
7198
7199         /*
7200          * It is possible for mount and umount to race in such a way that
7201          * we execute this code path, but open_fs_devices failed to clear
7202          * total_rw_bytes. We certainly want it cleared before reading the
7203          * device items, so clear it here.
7204          */
7205         fs_info->fs_devices->total_rw_bytes = 0;
7206
7207         /*
7208          * Read all device items, and then all the chunk items. All
7209          * device items are found before any chunk item (their object id
7210          * is smaller than the lowest possible object id for a chunk
7211          * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID).
7212          */
7213         key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
7214         key.offset = 0;
7215         key.type = 0;
7216         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
7217         if (ret < 0)
7218                 goto error;
7219         while (1) {
7220                 struct extent_buffer *node;
7221
7222                 leaf = path->nodes[0];
7223                 slot = path->slots[0];
7224                 if (slot >= btrfs_header_nritems(leaf)) {
7225                         ret = btrfs_next_leaf(root, path);
7226                         if (ret == 0)
7227                                 continue;
7228                         if (ret < 0)
7229                                 goto error;
7230                         break;
7231                 }
7232                 /*
7233                  * The nodes on level 1 are not locked but we don't need to do
7234                  * that during mount time as nothing else can access the tree
7235                  */
7236                 node = path->nodes[1];
7237                 if (node) {
7238                         if (last_ra_node != node->start) {
7239                                 readahead_tree_node_children(node);
7240                                 last_ra_node = node->start;
7241                         }
7242                 }
7243                 btrfs_item_key_to_cpu(leaf, &found_key, slot);
7244                 if (found_key.type == BTRFS_DEV_ITEM_KEY) {
7245                         struct btrfs_dev_item *dev_item;
7246                         dev_item = btrfs_item_ptr(leaf, slot,
7247                                                   struct btrfs_dev_item);
7248                         ret = read_one_dev(leaf, dev_item);
7249                         if (ret)
7250                                 goto error;
7251                         total_dev++;
7252                 } else if (found_key.type == BTRFS_CHUNK_ITEM_KEY) {
7253                         struct btrfs_chunk *chunk;
7254                         chunk = btrfs_item_ptr(leaf, slot, struct btrfs_chunk);
7255                         mutex_lock(&fs_info->chunk_mutex);
7256                         ret = read_one_chunk(&found_key, leaf, chunk);
7257                         mutex_unlock(&fs_info->chunk_mutex);
7258                         if (ret)
7259                                 goto error;
7260                 }
7261                 path->slots[0]++;
7262         }
7263
7264         /*
7265          * After loading chunk tree, we've got all device information,
7266          * do another round of validation checks.
7267          */
7268         if (total_dev != fs_info->fs_devices->total_devices) {
7269                 btrfs_warn(fs_info,
7270 "super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit",
7271                           btrfs_super_num_devices(fs_info->super_copy),
7272                           total_dev);
7273                 fs_info->fs_devices->total_devices = total_dev;
7274                 btrfs_set_super_num_devices(fs_info->super_copy, total_dev);
7275         }
7276         if (btrfs_super_total_bytes(fs_info->super_copy) <
7277             fs_info->fs_devices->total_rw_bytes) {
7278                 btrfs_err(fs_info,
7279         "super_total_bytes %llu mismatch with fs_devices total_rw_bytes %llu",
7280                           btrfs_super_total_bytes(fs_info->super_copy),
7281                           fs_info->fs_devices->total_rw_bytes);
7282                 ret = -EINVAL;
7283                 goto error;
7284         }
7285         ret = 0;
7286 error:
7287         mutex_unlock(&uuid_mutex);
7288
7289         btrfs_free_path(path);
7290         return ret;
7291 }
7292
7293 void btrfs_init_devices_late(struct btrfs_fs_info *fs_info)
7294 {
7295         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
7296         struct btrfs_device *device;
7297
7298         fs_devices->fs_info = fs_info;
7299
7300         mutex_lock(&fs_devices->device_list_mutex);
7301         list_for_each_entry(device, &fs_devices->devices, dev_list)
7302                 device->fs_info = fs_info;
7303
7304         list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
7305                 list_for_each_entry(device, &seed_devs->devices, dev_list)
7306                         device->fs_info = fs_info;
7307
7308                 seed_devs->fs_info = fs_info;
7309         }
7310         mutex_unlock(&fs_devices->device_list_mutex);
7311 }
7312
7313 static u64 btrfs_dev_stats_value(const struct extent_buffer *eb,
7314                                  const struct btrfs_dev_stats_item *ptr,
7315                                  int index)
7316 {
7317         u64 val;
7318
7319         read_extent_buffer(eb, &val,
7320                            offsetof(struct btrfs_dev_stats_item, values) +
7321                             ((unsigned long)ptr) + (index * sizeof(u64)),
7322                            sizeof(val));
7323         return val;
7324 }
7325
7326 static void btrfs_set_dev_stats_value(struct extent_buffer *eb,
7327                                       struct btrfs_dev_stats_item *ptr,
7328                                       int index, u64 val)
7329 {
7330         write_extent_buffer(eb, &val,
7331                             offsetof(struct btrfs_dev_stats_item, values) +
7332                              ((unsigned long)ptr) + (index * sizeof(u64)),
7333                             sizeof(val));
7334 }
7335
7336 static int btrfs_device_init_dev_stats(struct btrfs_device *device,
7337                                        struct btrfs_path *path)
7338 {
7339         struct btrfs_dev_stats_item *ptr;
7340         struct extent_buffer *eb;
7341         struct btrfs_key key;
7342         int item_size;
7343         int i, ret, slot;
7344
7345         key.objectid = BTRFS_DEV_STATS_OBJECTID;
7346         key.type = BTRFS_PERSISTENT_ITEM_KEY;
7347         key.offset = device->devid;
7348         ret = btrfs_search_slot(NULL, device->fs_info->dev_root, &key, path, 0, 0);
7349         if (ret) {
7350                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7351                         btrfs_dev_stat_set(device, i, 0);
7352                 device->dev_stats_valid = 1;
7353                 btrfs_release_path(path);
7354                 return ret < 0 ? ret : 0;
7355         }
7356         slot = path->slots[0];
7357         eb = path->nodes[0];
7358         item_size = btrfs_item_size_nr(eb, slot);
7359
7360         ptr = btrfs_item_ptr(eb, slot, struct btrfs_dev_stats_item);
7361
7362         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7363                 if (item_size >= (1 + i) * sizeof(__le64))
7364                         btrfs_dev_stat_set(device, i,
7365                                            btrfs_dev_stats_value(eb, ptr, i));
7366                 else
7367                         btrfs_dev_stat_set(device, i, 0);
7368         }
7369
7370         device->dev_stats_valid = 1;
7371         btrfs_dev_stat_print_on_load(device);
7372         btrfs_release_path(path);
7373
7374         return 0;
7375 }
7376
7377 int btrfs_init_dev_stats(struct btrfs_fs_info *fs_info)
7378 {
7379         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices, *seed_devs;
7380         struct btrfs_device *device;
7381         struct btrfs_path *path = NULL;
7382         int ret = 0;
7383
7384         path = btrfs_alloc_path();
7385         if (!path)
7386                 return -ENOMEM;
7387
7388         mutex_lock(&fs_devices->device_list_mutex);
7389         list_for_each_entry(device, &fs_devices->devices, dev_list) {
7390                 ret = btrfs_device_init_dev_stats(device, path);
7391                 if (ret)
7392                         goto out;
7393         }
7394         list_for_each_entry(seed_devs, &fs_devices->seed_list, seed_list) {
7395                 list_for_each_entry(device, &seed_devs->devices, dev_list) {
7396                         ret = btrfs_device_init_dev_stats(device, path);
7397                         if (ret)
7398                                 goto out;
7399                 }
7400         }
7401 out:
7402         mutex_unlock(&fs_devices->device_list_mutex);
7403
7404         btrfs_free_path(path);
7405         return ret;
7406 }
7407
7408 static int update_dev_stat_item(struct btrfs_trans_handle *trans,
7409                                 struct btrfs_device *device)
7410 {
7411         struct btrfs_fs_info *fs_info = trans->fs_info;
7412         struct btrfs_root *dev_root = fs_info->dev_root;
7413         struct btrfs_path *path;
7414         struct btrfs_key key;
7415         struct extent_buffer *eb;
7416         struct btrfs_dev_stats_item *ptr;
7417         int ret;
7418         int i;
7419
7420         key.objectid = BTRFS_DEV_STATS_OBJECTID;
7421         key.type = BTRFS_PERSISTENT_ITEM_KEY;
7422         key.offset = device->devid;
7423
7424         path = btrfs_alloc_path();
7425         if (!path)
7426                 return -ENOMEM;
7427         ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1);
7428         if (ret < 0) {
7429                 btrfs_warn_in_rcu(fs_info,
7430                         "error %d while searching for dev_stats item for device %s",
7431                               ret, rcu_str_deref(device->name));
7432                 goto out;
7433         }
7434
7435         if (ret == 0 &&
7436             btrfs_item_size_nr(path->nodes[0], path->slots[0]) < sizeof(*ptr)) {
7437                 /* need to delete old one and insert a new one */
7438                 ret = btrfs_del_item(trans, dev_root, path);
7439                 if (ret != 0) {
7440                         btrfs_warn_in_rcu(fs_info,
7441                                 "delete too small dev_stats item for device %s failed %d",
7442                                       rcu_str_deref(device->name), ret);
7443                         goto out;
7444                 }
7445                 ret = 1;
7446         }
7447
7448         if (ret == 1) {
7449                 /* need to insert a new item */
7450                 btrfs_release_path(path);
7451                 ret = btrfs_insert_empty_item(trans, dev_root, path,
7452                                               &key, sizeof(*ptr));
7453                 if (ret < 0) {
7454                         btrfs_warn_in_rcu(fs_info,
7455                                 "insert dev_stats item for device %s failed %d",
7456                                 rcu_str_deref(device->name), ret);
7457                         goto out;
7458                 }
7459         }
7460
7461         eb = path->nodes[0];
7462         ptr = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dev_stats_item);
7463         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7464                 btrfs_set_dev_stats_value(eb, ptr, i,
7465                                           btrfs_dev_stat_read(device, i));
7466         btrfs_mark_buffer_dirty(eb);
7467
7468 out:
7469         btrfs_free_path(path);
7470         return ret;
7471 }
7472
7473 /*
7474  * called from commit_transaction. Writes all changed device stats to disk.
7475  */
7476 int btrfs_run_dev_stats(struct btrfs_trans_handle *trans)
7477 {
7478         struct btrfs_fs_info *fs_info = trans->fs_info;
7479         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7480         struct btrfs_device *device;
7481         int stats_cnt;
7482         int ret = 0;
7483
7484         mutex_lock(&fs_devices->device_list_mutex);
7485         list_for_each_entry(device, &fs_devices->devices, dev_list) {
7486                 stats_cnt = atomic_read(&device->dev_stats_ccnt);
7487                 if (!device->dev_stats_valid || stats_cnt == 0)
7488                         continue;
7489
7490
7491                 /*
7492                  * There is a LOAD-LOAD control dependency between the value of
7493                  * dev_stats_ccnt and updating the on-disk values which requires
7494                  * reading the in-memory counters. Such control dependencies
7495                  * require explicit read memory barriers.
7496                  *
7497                  * This memory barriers pairs with smp_mb__before_atomic in
7498                  * btrfs_dev_stat_inc/btrfs_dev_stat_set and with the full
7499                  * barrier implied by atomic_xchg in
7500                  * btrfs_dev_stats_read_and_reset
7501                  */
7502                 smp_rmb();
7503
7504                 ret = update_dev_stat_item(trans, device);
7505                 if (!ret)
7506                         atomic_sub(stats_cnt, &device->dev_stats_ccnt);
7507         }
7508         mutex_unlock(&fs_devices->device_list_mutex);
7509
7510         return ret;
7511 }
7512
7513 void btrfs_dev_stat_inc_and_print(struct btrfs_device *dev, int index)
7514 {
7515         btrfs_dev_stat_inc(dev, index);
7516         btrfs_dev_stat_print_on_error(dev);
7517 }
7518
7519 static void btrfs_dev_stat_print_on_error(struct btrfs_device *dev)
7520 {
7521         if (!dev->dev_stats_valid)
7522                 return;
7523         btrfs_err_rl_in_rcu(dev->fs_info,
7524                 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7525                            rcu_str_deref(dev->name),
7526                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
7527                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
7528                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7529                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
7530                            btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7531 }
7532
7533 static void btrfs_dev_stat_print_on_load(struct btrfs_device *dev)
7534 {
7535         int i;
7536
7537         for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7538                 if (btrfs_dev_stat_read(dev, i) != 0)
7539                         break;
7540         if (i == BTRFS_DEV_STAT_VALUES_MAX)
7541                 return; /* all values == 0, suppress message */
7542
7543         btrfs_info_in_rcu(dev->fs_info,
7544                 "bdev %s errs: wr %u, rd %u, flush %u, corrupt %u, gen %u",
7545                rcu_str_deref(dev->name),
7546                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_WRITE_ERRS),
7547                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_READ_ERRS),
7548                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_FLUSH_ERRS),
7549                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_CORRUPTION_ERRS),
7550                btrfs_dev_stat_read(dev, BTRFS_DEV_STAT_GENERATION_ERRS));
7551 }
7552
7553 int btrfs_get_dev_stats(struct btrfs_fs_info *fs_info,
7554                         struct btrfs_ioctl_get_dev_stats *stats)
7555 {
7556         struct btrfs_device *dev;
7557         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
7558         int i;
7559
7560         mutex_lock(&fs_devices->device_list_mutex);
7561         dev = btrfs_find_device(fs_info->fs_devices, stats->devid, NULL, NULL,
7562                                 true);
7563         mutex_unlock(&fs_devices->device_list_mutex);
7564
7565         if (!dev) {
7566                 btrfs_warn(fs_info, "get dev_stats failed, device not found");
7567                 return -ENODEV;
7568         } else if (!dev->dev_stats_valid) {
7569                 btrfs_warn(fs_info, "get dev_stats failed, not yet valid");
7570                 return -ENODEV;
7571         } else if (stats->flags & BTRFS_DEV_STATS_RESET) {
7572                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++) {
7573                         if (stats->nr_items > i)
7574                                 stats->values[i] =
7575                                         btrfs_dev_stat_read_and_reset(dev, i);
7576                         else
7577                                 btrfs_dev_stat_set(dev, i, 0);
7578                 }
7579                 btrfs_info(fs_info, "device stats zeroed by %s (%d)",
7580                            current->comm, task_pid_nr(current));
7581         } else {
7582                 for (i = 0; i < BTRFS_DEV_STAT_VALUES_MAX; i++)
7583                         if (stats->nr_items > i)
7584                                 stats->values[i] = btrfs_dev_stat_read(dev, i);
7585         }
7586         if (stats->nr_items > BTRFS_DEV_STAT_VALUES_MAX)
7587                 stats->nr_items = BTRFS_DEV_STAT_VALUES_MAX;
7588         return 0;
7589 }
7590
7591 /*
7592  * Update the size and bytes used for each device where it changed.  This is
7593  * delayed since we would otherwise get errors while writing out the
7594  * superblocks.
7595  *
7596  * Must be invoked during transaction commit.
7597  */
7598 void btrfs_commit_device_sizes(struct btrfs_transaction *trans)
7599 {
7600         struct btrfs_device *curr, *next;
7601
7602         ASSERT(trans->state == TRANS_STATE_COMMIT_DOING);
7603
7604         if (list_empty(&trans->dev_update_list))
7605                 return;
7606
7607         /*
7608          * We don't need the device_list_mutex here.  This list is owned by the
7609          * transaction and the transaction must complete before the device is
7610          * released.
7611          */
7612         mutex_lock(&trans->fs_info->chunk_mutex);
7613         list_for_each_entry_safe(curr, next, &trans->dev_update_list,
7614                                  post_commit_list) {
7615                 list_del_init(&curr->post_commit_list);
7616                 curr->commit_total_bytes = curr->disk_total_bytes;
7617                 curr->commit_bytes_used = curr->bytes_used;
7618         }
7619         mutex_unlock(&trans->fs_info->chunk_mutex);
7620 }
7621
7622 /*
7623  * Multiplicity factor for simple profiles: DUP, RAID1-like and RAID10.
7624  */
7625 int btrfs_bg_type_to_factor(u64 flags)
7626 {
7627         const int index = btrfs_bg_flags_to_raid_index(flags);
7628
7629         return btrfs_raid_array[index].ncopies;
7630 }
7631
7632
7633
7634 static int verify_one_dev_extent(struct btrfs_fs_info *fs_info,
7635                                  u64 chunk_offset, u64 devid,
7636                                  u64 physical_offset, u64 physical_len)
7637 {
7638         struct extent_map_tree *em_tree = &fs_info->mapping_tree;
7639         struct extent_map *em;
7640         struct map_lookup *map;
7641         struct btrfs_device *dev;
7642         u64 stripe_len;
7643         bool found = false;
7644         int ret = 0;
7645         int i;
7646
7647         read_lock(&em_tree->lock);
7648         em = lookup_extent_mapping(em_tree, chunk_offset, 1);
7649         read_unlock(&em_tree->lock);
7650
7651         if (!em) {
7652                 btrfs_err(fs_info,
7653 "dev extent physical offset %llu on devid %llu doesn't have corresponding chunk",
7654                           physical_offset, devid);
7655                 ret = -EUCLEAN;
7656                 goto out;
7657         }
7658
7659         map = em->map_lookup;
7660         stripe_len = calc_stripe_length(map->type, em->len, map->num_stripes);
7661         if (physical_len != stripe_len) {
7662                 btrfs_err(fs_info,
7663 "dev extent physical offset %llu on devid %llu length doesn't match chunk %llu, have %llu expect %llu",
7664                           physical_offset, devid, em->start, physical_len,
7665                           stripe_len);
7666                 ret = -EUCLEAN;
7667                 goto out;
7668         }
7669
7670         for (i = 0; i < map->num_stripes; i++) {
7671                 if (map->stripes[i].dev->devid == devid &&
7672                     map->stripes[i].physical == physical_offset) {
7673                         found = true;
7674                         if (map->verified_stripes >= map->num_stripes) {
7675                                 btrfs_err(fs_info,
7676                                 "too many dev extents for chunk %llu found",
7677                                           em->start);
7678                                 ret = -EUCLEAN;
7679                                 goto out;
7680                         }
7681                         map->verified_stripes++;
7682                         break;
7683                 }
7684         }
7685         if (!found) {
7686                 btrfs_err(fs_info,
7687         "dev extent physical offset %llu devid %llu has no corresponding chunk",
7688                         physical_offset, devid);
7689                 ret = -EUCLEAN;
7690         }
7691
7692         /* Make sure no dev extent is beyond device bondary */
7693         dev = btrfs_find_device(fs_info->fs_devices, devid, NULL, NULL, true);
7694         if (!dev) {
7695                 btrfs_err(fs_info, "failed to find devid %llu", devid);
7696                 ret = -EUCLEAN;
7697                 goto out;
7698         }
7699
7700         /* It's possible this device is a dummy for seed device */
7701         if (dev->disk_total_bytes == 0) {
7702                 struct btrfs_fs_devices *devs;
7703
7704                 devs = list_first_entry(&fs_info->fs_devices->seed_list,
7705                                         struct btrfs_fs_devices, seed_list);
7706                 dev = btrfs_find_device(devs, devid, NULL, NULL, false);
7707                 if (!dev) {
7708                         btrfs_err(fs_info, "failed to find seed devid %llu",
7709                                   devid);
7710                         ret = -EUCLEAN;
7711                         goto out;
7712                 }
7713         }
7714
7715         if (physical_offset + physical_len > dev->disk_total_bytes) {
7716                 btrfs_err(fs_info,
7717 "dev extent devid %llu physical offset %llu len %llu is beyond device boundary %llu",
7718                           devid, physical_offset, physical_len,
7719                           dev->disk_total_bytes);
7720                 ret = -EUCLEAN;
7721                 goto out;
7722         }
7723 out:
7724         free_extent_map(em);
7725         return ret;
7726 }
7727
7728 static int verify_chunk_dev_extent_mapping(struct btrfs_fs_info *fs_info)
7729 {
7730         struct extent_map_tree *em_tree = &fs_info->mapping_tree;
7731         struct extent_map *em;
7732         struct rb_node *node;
7733         int ret = 0;
7734
7735         read_lock(&em_tree->lock);
7736         for (node = rb_first_cached(&em_tree->map); node; node = rb_next(node)) {
7737                 em = rb_entry(node, struct extent_map, rb_node);
7738                 if (em->map_lookup->num_stripes !=
7739                     em->map_lookup->verified_stripes) {
7740                         btrfs_err(fs_info,
7741                         "chunk %llu has missing dev extent, have %d expect %d",
7742                                   em->start, em->map_lookup->verified_stripes,
7743                                   em->map_lookup->num_stripes);
7744                         ret = -EUCLEAN;
7745                         goto out;
7746                 }
7747         }
7748 out:
7749         read_unlock(&em_tree->lock);
7750         return ret;
7751 }
7752
7753 /*
7754  * Ensure that all dev extents are mapped to correct chunk, otherwise
7755  * later chunk allocation/free would cause unexpected behavior.
7756  *
7757  * NOTE: This will iterate through the whole device tree, which should be of
7758  * the same size level as the chunk tree.  This slightly increases mount time.
7759  */
7760 int btrfs_verify_dev_extents(struct btrfs_fs_info *fs_info)
7761 {
7762         struct btrfs_path *path;
7763         struct btrfs_root *root = fs_info->dev_root;
7764         struct btrfs_key key;
7765         u64 prev_devid = 0;
7766         u64 prev_dev_ext_end = 0;
7767         int ret = 0;
7768
7769         key.objectid = 1;
7770         key.type = BTRFS_DEV_EXTENT_KEY;
7771         key.offset = 0;
7772
7773         path = btrfs_alloc_path();
7774         if (!path)
7775                 return -ENOMEM;
7776
7777         path->reada = READA_FORWARD;
7778         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
7779         if (ret < 0)
7780                 goto out;
7781
7782         if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
7783                 ret = btrfs_next_item(root, path);
7784                 if (ret < 0)
7785                         goto out;
7786                 /* No dev extents at all? Not good */
7787                 if (ret > 0) {
7788                         ret = -EUCLEAN;
7789                         goto out;
7790                 }
7791         }
7792         while (1) {
7793                 struct extent_buffer *leaf = path->nodes[0];
7794                 struct btrfs_dev_extent *dext;
7795                 int slot = path->slots[0];
7796                 u64 chunk_offset;
7797                 u64 physical_offset;
7798                 u64 physical_len;
7799                 u64 devid;
7800
7801                 btrfs_item_key_to_cpu(leaf, &key, slot);
7802                 if (key.type != BTRFS_DEV_EXTENT_KEY)
7803                         break;
7804                 devid = key.objectid;
7805                 physical_offset = key.offset;
7806
7807                 dext = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
7808                 chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dext);
7809                 physical_len = btrfs_dev_extent_length(leaf, dext);
7810
7811                 /* Check if this dev extent overlaps with the previous one */
7812                 if (devid == prev_devid && physical_offset < prev_dev_ext_end) {
7813                         btrfs_err(fs_info,
7814 "dev extent devid %llu physical offset %llu overlap with previous dev extent end %llu",
7815                                   devid, physical_offset, prev_dev_ext_end);
7816                         ret = -EUCLEAN;
7817                         goto out;
7818                 }
7819
7820                 ret = verify_one_dev_extent(fs_info, chunk_offset, devid,
7821                                             physical_offset, physical_len);
7822                 if (ret < 0)
7823                         goto out;
7824                 prev_devid = devid;
7825                 prev_dev_ext_end = physical_offset + physical_len;
7826
7827                 ret = btrfs_next_item(root, path);
7828                 if (ret < 0)
7829                         goto out;
7830                 if (ret > 0) {
7831                         ret = 0;
7832                         break;
7833                 }
7834         }
7835
7836         /* Ensure all chunks have corresponding dev extents */
7837         ret = verify_chunk_dev_extent_mapping(fs_info);
7838 out:
7839         btrfs_free_path(path);
7840         return ret;
7841 }
7842
7843 /*
7844  * Check whether the given block group or device is pinned by any inode being
7845  * used as a swapfile.
7846  */
7847 bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
7848 {
7849         struct btrfs_swapfile_pin *sp;
7850         struct rb_node *node;
7851
7852         spin_lock(&fs_info->swapfile_pins_lock);
7853         node = fs_info->swapfile_pins.rb_node;
7854         while (node) {
7855                 sp = rb_entry(node, struct btrfs_swapfile_pin, node);
7856                 if (ptr < sp->ptr)
7857                         node = node->rb_left;
7858                 else if (ptr > sp->ptr)
7859                         node = node->rb_right;
7860                 else
7861                         break;
7862         }
7863         spin_unlock(&fs_info->swapfile_pins_lock);
7864         return node != NULL;
7865 }