fs/btrfs/raid56.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (C) 2012 Fusion-io  All rights reserved.
   4  * Copyright (C) 2012 Intel Corp. All rights reserved.
   5  */
   6
   7 #include <linux/sched.h>
   8 #include <linux/bio.h>
   9 #include <linux/slab.h>
  10 #include <linux/blkdev.h>
  11 #include <linux/raid/pq.h>
  12 #include <linux/hash.h>
  13 #include <linux/list_sort.h>
  14 #include <linux/raid/xor.h>
  15 #include <linux/mm.h>
  16 #include "misc.h"
  17 #include "ctree.h"
  18 #include "disk-io.h"
  19 #include "volumes.h"
  20 #include "raid56.h"
  21 #include "async-thread.h"
  22
  23 /* set when additional merges to this rbio are not allowed */
  24 #define RBIO_RMW_LOCKED_BIT     1
  25
  26 /*
  27  * set when this rbio is sitting in the hash, but it is just a cache
  28  * of past RMW
  29  */
  30 #define RBIO_CACHE_BIT          2
  31
  32 /*
  33  * set when it is safe to trust the stripe_pages for caching
  34  */
  35 #define RBIO_CACHE_READY_BIT    3
  36
  37 #define RBIO_CACHE_SIZE 1024
  38
  39 #define BTRFS_STRIPE_HASH_TABLE_BITS                            11
  40
  41 /* Used by the raid56 code to lock stripes for read/modify/write */
  42 struct btrfs_stripe_hash {
  43         struct list_head hash_list;
  44         spinlock_t lock;
  45 };
  46
  47 /* Used by the raid56 code to lock stripes for read/modify/write */
  48 struct btrfs_stripe_hash_table {
  49         struct list_head stripe_cache;
  50         spinlock_t cache_lock;
  51         int cache_size;
  52         struct btrfs_stripe_hash table[];
  53 };
  54
  55 /*
  56  * A bvec like structure to present a sector inside a page.
  57  *
  58  * Unlike bvec we don't need bvlen, as it's fixed to sectorsize.
  59  */
  60 struct sector_ptr {
  61         struct page *page;
  62         unsigned int pgoff:24;
  63         unsigned int uptodate:8;
  64 };
  65
  66 enum btrfs_rbio_ops {
  67         BTRFS_RBIO_WRITE,
  68         BTRFS_RBIO_READ_REBUILD,
  69         BTRFS_RBIO_PARITY_SCRUB,
  70         BTRFS_RBIO_REBUILD_MISSING,
  71 };
  72
  73 struct btrfs_raid_bio {
  74         struct btrfs_io_context *bioc;
  75
  76         /* while we're doing rmw on a stripe
  77          * we put it into a hash table so we can
  78          * lock the stripe and merge more rbios
  79          * into it.
  80          */
  81         struct list_head hash_list;
  82
  83         /*
  84          * LRU list for the stripe cache
  85          */
  86         struct list_head stripe_cache;
  87
  88         /*
  89          * for scheduling work in the helper threads
  90          */
  91         struct work_struct work;
  92
  93         /*
  94          * bio list and bio_list_lock are used
  95          * to add more bios into the stripe
  96          * in hopes of avoiding the full rmw
  97          */
  98         struct bio_list bio_list;
  99         spinlock_t bio_list_lock;
 100
 101         /* also protected by the bio_list_lock, the
 102          * plug list is used by the plugging code
 103          * to collect partial bios while plugged.  The
 104          * stripe locking code also uses it to hand off
 105          * the stripe lock to the next pending IO
 106          */
 107         struct list_head plug_list;
 108
 109         /*
 110          * flags that tell us if it is safe to
 111          * merge with this bio
 112          */
 113         unsigned long flags;
 114
 115         /*
 116          * set if we're doing a parity rebuild
 117          * for a read from higher up, which is handled
 118          * differently from a parity rebuild as part of
 119          * rmw
 120          */
 121         enum btrfs_rbio_ops operation;
 122
 123         /* Size of each individual stripe on disk */
 124         u32 stripe_len;
 125
 126         /* How many pages there are for the full stripe including P/Q */
 127         u16 nr_pages;
 128
 129         /* How many sectors there are for the full stripe including P/Q */
 130         u16 nr_sectors;
 131
 132         /* Number of data stripes (no p/q) */
 133         u8 nr_data;
 134
 135         /* Numer of all stripes (including P/Q) */
 136         u8 real_stripes;
 137
 138         /* How many pages there are for each stripe */
 139         u8 stripe_npages;
 140
 141         /* How many sectors there are for each stripe */
 142         u8 stripe_nsectors;
 143
 144         /* First bad stripe, -1 means no corruption */
 145         s8 faila;
 146
 147         /* Second bad stripe (for RAID6 use) */
 148         s8 failb;
 149
 150         /* Stripe number that we're scrubbing  */
 151         u8 scrubp;
 152
 153         /*
 154          * size of all the bios in the bio_list.  This
 155          * helps us decide if the rbio maps to a full
 156          * stripe or not
 157          */
 158         int bio_list_bytes;
 159
 160         int generic_bio_cnt;
 161
 162         refcount_t refs;
 163
 164         atomic_t stripes_pending;
 165
 166         atomic_t error;
 167         /*
 168          * these are two arrays of pointers.  We allocate the
 169          * rbio big enough to hold them both and setup their
 170          * locations when the rbio is allocated
 171          */
 172
 173         /* pointers to pages that we allocated for
 174          * reading/writing stripes directly from the disk (including P/Q)
 175          */
 176         struct page **stripe_pages;
 177
 178         /* Pointers to the sectors in the bio_list, for faster lookup */
 179         struct sector_ptr *bio_sectors;
 180
 181         /*
 182          * For subpage support, we need to map each sector to above
 183          * stripe_pages.
 184          */
 185         struct sector_ptr *stripe_sectors;
 186
 187         /* Bitmap to record which horizontal stripe has data */
 188         unsigned long *dbitmap;
 189
 190         /* allocated with real_stripes-many pointers for finish_*() calls */
 191         void **finish_pointers;
 192
 193         /* Allocated with stripe_nsectors-many bits for finish_*() calls */
 194         unsigned long *finish_pbitmap;
 195 };
 196
 197 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
 198 static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
 199 static void rmw_work(struct work_struct *work);
 200 static void read_rebuild_work(struct work_struct *work);
 201 static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
 202 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
 203 static void __free_raid_bio(struct btrfs_raid_bio *rbio);
 204 static void index_rbio_pages(struct btrfs_raid_bio *rbio);
 205 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
 206
 207 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
 208                                          int need_check);
 209 static void scrub_parity_work(struct work_struct *work);
 210
 211 static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
 212 {
 213         INIT_WORK(&rbio->work, work_func);
 214         queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
 215 }
 216
 217 /*
 218  * the stripe hash table is used for locking, and to collect
 219  * bios in hopes of making a full stripe
 220  */
 221 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
 222 {
 223         struct btrfs_stripe_hash_table *table;
 224         struct btrfs_stripe_hash_table *x;
 225         struct btrfs_stripe_hash *cur;
 226         struct btrfs_stripe_hash *h;
 227         int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
 228         int i;
 229
 230         if (info->stripe_hash_table)
 231                 return 0;
 232
 233         /*
 234          * The table is large, starting with order 4 and can go as high as
 235          * order 7 in case lock debugging is turned on.
 236          *
 237          * Try harder to allocate and fallback to vmalloc to lower the chance
 238          * of a failing mount.
 239          */
 240         table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
 241         if (!table)
 242                 return -ENOMEM;
 243
 244         spin_lock_init(&table->cache_lock);
 245         INIT_LIST_HEAD(&table->stripe_cache);
 246
 247         h = table->table;
 248
 249         for (i = 0; i < num_entries; i++) {
 250                 cur = h + i;
 251                 INIT_LIST_HEAD(&cur->hash_list);
 252                 spin_lock_init(&cur->lock);
 253         }
 254
 255         x = cmpxchg(&info->stripe_hash_table, NULL, table);
 256         kvfree(x);
 257         return 0;
 258 }
 259
 260 /*
 261  * caching an rbio means to copy anything from the
 262  * bio_sectors array into the stripe_pages array.  We
 263  * use the page uptodate bit in the stripe cache array
 264  * to indicate if it has valid data
 265  *
 266  * once the caching is done, we set the cache ready
 267  * bit.
 268  */
 269 static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
 270 {
 271         int i;
 272         int ret;
 273
 274         ret = alloc_rbio_pages(rbio);
 275         if (ret)
 276                 return;
 277
 278         for (i = 0; i < rbio->nr_sectors; i++) {
 279                 /* Some range not covered by bio (partial write), skip it */
 280                 if (!rbio->bio_sectors[i].page)
 281                         continue;
 282
 283                 ASSERT(rbio->stripe_sectors[i].page);
 284                 memcpy_page(rbio->stripe_sectors[i].page,
 285                             rbio->stripe_sectors[i].pgoff,
 286                             rbio->bio_sectors[i].page,
 287                             rbio->bio_sectors[i].pgoff,
 288                             rbio->bioc->fs_info->sectorsize);
 289                 rbio->stripe_sectors[i].uptodate = 1;
 290         }
 291         set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
 292 }
 293
 294 /*
 295  * we hash on the first logical address of the stripe
 296  */
 297 static int rbio_bucket(struct btrfs_raid_bio *rbio)
 298 {
 299         u64 num = rbio->bioc->raid_map[0];
 300
 301         /*
 302          * we shift down quite a bit.  We're using byte
 303          * addressing, and most of the lower bits are zeros.
 304          * This tends to upset hash_64, and it consistently
 305          * returns just one or two different values.
 306          *
 307          * shifting off the lower bits fixes things.
 308          */
 309         return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
 310 }
 311
 312 static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
 313                                        unsigned int page_nr)
 314 {
 315         const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
 316         const u32 sectors_per_page = PAGE_SIZE / sectorsize;
 317         int i;
 318
 319         ASSERT(page_nr < rbio->nr_pages);
 320
 321         for (i = sectors_per_page * page_nr;
 322              i < sectors_per_page * page_nr + sectors_per_page;
 323              i++) {
 324                 if (!rbio->stripe_sectors[i].uptodate)
 325                         return false;
 326         }
 327         return true;
 328 }
 329
 330 /*
 331  * Update the stripe_sectors[] array to use correct page and pgoff
 332  *
 333  * Should be called every time any page pointer in stripes_pages[] got modified.
 334  */
 335 static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
 336 {
 337         const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
 338         u32 offset;
 339         int i;
 340
 341         for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) {
 342                 int page_index = offset >> PAGE_SHIFT;
 343
 344                 ASSERT(page_index < rbio->nr_pages);
 345                 rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index];
 346                 rbio->stripe_sectors[i].pgoff = offset_in_page(offset);
 347         }
 348 }
 349
 350 /*
 351  * Stealing an rbio means taking all the uptodate pages from the stripe array
 352  * in the source rbio and putting them into the destination rbio.
 353  *
 354  * This will also update the involved stripe_sectors[] which are referring to
 355  * the old pages.
 356  */
 357 static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
 358 {
 359         int i;
 360         struct page *s;
 361         struct page *d;
 362
 363         if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
 364                 return;
 365
 366         for (i = 0; i < dest->nr_pages; i++) {
 367                 s = src->stripe_pages[i];
 368                 if (!s || !full_page_sectors_uptodate(src, i))
 369                         continue;
 370
 371                 d = dest->stripe_pages[i];
 372                 if (d)
 373                         __free_page(d);
 374
 375                 dest->stripe_pages[i] = s;
 376                 src->stripe_pages[i] = NULL;
 377         }
 378         index_stripe_sectors(dest);
 379         index_stripe_sectors(src);
 380 }
 381
 382 /*
 383  * merging means we take the bio_list from the victim and
 384  * splice it into the destination.  The victim should
 385  * be discarded afterwards.
 386  *
 387  * must be called with dest->rbio_list_lock held
 388  */
 389 static void merge_rbio(struct btrfs_raid_bio *dest,
 390                        struct btrfs_raid_bio *victim)
 391 {
 392         bio_list_merge(&dest->bio_list, &victim->bio_list);
 393         dest->bio_list_bytes += victim->bio_list_bytes;
 394         dest->generic_bio_cnt += victim->generic_bio_cnt;
 395         bio_list_init(&victim->bio_list);
 396 }
 397
 398 /*
 399  * used to prune items that are in the cache.  The caller
 400  * must hold the hash table lock.
 401  */
 402 static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
 403 {
 404         int bucket = rbio_bucket(rbio);
 405         struct btrfs_stripe_hash_table *table;
 406         struct btrfs_stripe_hash *h;
 407         int freeit = 0;
 408
 409         /*
 410          * check the bit again under the hash table lock.
 411          */
 412         if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
 413                 return;
 414
 415         table = rbio->bioc->fs_info->stripe_hash_table;
 416         h = table->table + bucket;
 417
 418         /* hold the lock for the bucket because we may be
 419          * removing it from the hash table
 420          */
 421         spin_lock(&h->lock);
 422
 423         /*
 424          * hold the lock for the bio list because we need
 425          * to make sure the bio list is empty
 426          */
 427         spin_lock(&rbio->bio_list_lock);
 428
 429         if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
 430                 list_del_init(&rbio->stripe_cache);
 431                 table->cache_size -= 1;
 432                 freeit = 1;
 433
 434                 /* if the bio list isn't empty, this rbio is
 435                  * still involved in an IO.  We take it out
 436                  * of the cache list, and drop the ref that
 437                  * was held for the list.
 438                  *
 439                  * If the bio_list was empty, we also remove
 440                  * the rbio from the hash_table, and drop
 441                  * the corresponding ref
 442                  */
 443                 if (bio_list_empty(&rbio->bio_list)) {
 444                         if (!list_empty(&rbio->hash_list)) {
 445                                 list_del_init(&rbio->hash_list);
 446                                 refcount_dec(&rbio->refs);
 447                                 BUG_ON(!list_empty(&rbio->plug_list));
 448                         }
 449                 }
 450         }
 451
 452         spin_unlock(&rbio->bio_list_lock);
 453         spin_unlock(&h->lock);
 454
 455         if (freeit)
 456                 __free_raid_bio(rbio);
 457 }
 458
 459 /*
 460  * prune a given rbio from the cache
 461  */
 462 static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
 463 {
 464         struct btrfs_stripe_hash_table *table;
 465         unsigned long flags;
 466
 467         if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
 468                 return;
 469
 470         table = rbio->bioc->fs_info->stripe_hash_table;
 471
 472         spin_lock_irqsave(&table->cache_lock, flags);
 473         __remove_rbio_from_cache(rbio);
 474         spin_unlock_irqrestore(&table->cache_lock, flags);
 475 }
 476
 477 /*
 478  * remove everything in the cache
 479  */
 480 static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
 481 {
 482         struct btrfs_stripe_hash_table *table;
 483         unsigned long flags;
 484         struct btrfs_raid_bio *rbio;
 485
 486         table = info->stripe_hash_table;
 487
 488         spin_lock_irqsave(&table->cache_lock, flags);
 489         while (!list_empty(&table->stripe_cache)) {
 490                 rbio = list_entry(table->stripe_cache.next,
 491                                   struct btrfs_raid_bio,
 492                                   stripe_cache);
 493                 __remove_rbio_from_cache(rbio);
 494         }
 495         spin_unlock_irqrestore(&table->cache_lock, flags);
 496 }
 497
 498 /*
 499  * remove all cached entries and free the hash table
 500  * used by unmount
 501  */
 502 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
 503 {
 504         if (!info->stripe_hash_table)
 505                 return;
 506         btrfs_clear_rbio_cache(info);
 507         kvfree(info->stripe_hash_table);
 508         info->stripe_hash_table = NULL;
 509 }
 510
 511 /*
 512  * insert an rbio into the stripe cache.  It
 513  * must have already been prepared by calling
 514  * cache_rbio_pages
 515  *
 516  * If this rbio was already cached, it gets
 517  * moved to the front of the lru.
 518  *
 519  * If the size of the rbio cache is too big, we
 520  * prune an item.
 521  */
 522 static void cache_rbio(struct btrfs_raid_bio *rbio)
 523 {
 524         struct btrfs_stripe_hash_table *table;
 525         unsigned long flags;
 526
 527         if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
 528                 return;
 529
 530         table = rbio->bioc->fs_info->stripe_hash_table;
 531
 532         spin_lock_irqsave(&table->cache_lock, flags);
 533         spin_lock(&rbio->bio_list_lock);
 534
 535         /* bump our ref if we were not in the list before */
 536         if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
 537                 refcount_inc(&rbio->refs);
 538
 539         if (!list_empty(&rbio->stripe_cache)){
 540                 list_move(&rbio->stripe_cache, &table->stripe_cache);
 541         } else {
 542                 list_add(&rbio->stripe_cache, &table->stripe_cache);
 543                 table->cache_size += 1;
 544         }
 545
 546         spin_unlock(&rbio->bio_list_lock);
 547
 548         if (table->cache_size > RBIO_CACHE_SIZE) {
 549                 struct btrfs_raid_bio *found;
 550
 551                 found = list_entry(table->stripe_cache.prev,
 552                                   struct btrfs_raid_bio,
 553                                   stripe_cache);
 554
 555                 if (found != rbio)
 556                         __remove_rbio_from_cache(found);
 557         }
 558
 559         spin_unlock_irqrestore(&table->cache_lock, flags);
 560 }
 561
 562 /*
 563  * helper function to run the xor_blocks api.  It is only
 564  * able to do MAX_XOR_BLOCKS at a time, so we need to
 565  * loop through.
 566  */
 567 static void run_xor(void **pages, int src_cnt, ssize_t len)
 568 {
 569         int src_off = 0;
 570         int xor_src_cnt = 0;
 571         void *dest = pages[src_cnt];
 572
 573         while(src_cnt > 0) {
 574                 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
 575                 xor_blocks(xor_src_cnt, len, dest, pages + src_off);
 576
 577                 src_cnt -= xor_src_cnt;
 578                 src_off += xor_src_cnt;
 579         }
 580 }
 581
 582 /*
 583  * Returns true if the bio list inside this rbio covers an entire stripe (no
 584  * rmw required).
 585  */
 586 static int rbio_is_full(struct btrfs_raid_bio *rbio)
 587 {
 588         unsigned long flags;
 589         unsigned long size = rbio->bio_list_bytes;
 590         int ret = 1;
 591
 592         spin_lock_irqsave(&rbio->bio_list_lock, flags);
 593         if (size != rbio->nr_data * rbio->stripe_len)
 594                 ret = 0;
 595         BUG_ON(size > rbio->nr_data * rbio->stripe_len);
 596         spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
 597
 598         return ret;
 599 }
 600
 601 /*
 602  * returns 1 if it is safe to merge two rbios together.
 603  * The merging is safe if the two rbios correspond to
 604  * the same stripe and if they are both going in the same
 605  * direction (read vs write), and if neither one is
 606  * locked for final IO
 607  *
 608  * The caller is responsible for locking such that
 609  * rmw_locked is safe to test
 610  */
 611 static int rbio_can_merge(struct btrfs_raid_bio *last,
 612                           struct btrfs_raid_bio *cur)
 613 {
 614         if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
 615             test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
 616                 return 0;
 617
 618         /*
 619          * we can't merge with cached rbios, since the
 620          * idea is that when we merge the destination
 621          * rbio is going to run our IO for us.  We can
 622          * steal from cached rbios though, other functions
 623          * handle that.
 624          */
 625         if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
 626             test_bit(RBIO_CACHE_BIT, &cur->flags))
 627                 return 0;
 628
 629         if (last->bioc->raid_map[0] != cur->bioc->raid_map[0])
 630                 return 0;
 631
 632         /* we can't merge with different operations */
 633         if (last->operation != cur->operation)
 634                 return 0;
 635         /*
 636          * We've need read the full stripe from the drive.
 637          * check and repair the parity and write the new results.
 638          *
 639          * We're not allowed to add any new bios to the
 640          * bio list here, anyone else that wants to
 641          * change this stripe needs to do their own rmw.
 642          */
 643         if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
 644                 return 0;
 645
 646         if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
 647                 return 0;
 648
 649         if (last->operation == BTRFS_RBIO_READ_REBUILD) {
 650                 int fa = last->faila;
 651                 int fb = last->failb;
 652                 int cur_fa = cur->faila;
 653                 int cur_fb = cur->failb;
 654
 655                 if (last->faila >= last->failb) {
 656                         fa = last->failb;
 657                         fb = last->faila;
 658                 }
 659
 660                 if (cur->faila >= cur->failb) {
 661                         cur_fa = cur->failb;
 662                         cur_fb = cur->faila;
 663                 }
 664
 665                 if (fa != cur_fa || fb != cur_fb)
 666                         return 0;
 667         }
 668         return 1;
 669 }
 670
 671 static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio,
 672                                              unsigned int stripe_nr,
 673                                              unsigned int sector_nr)
 674 {
 675         ASSERT(stripe_nr < rbio->real_stripes);
 676         ASSERT(sector_nr < rbio->stripe_nsectors);
 677
 678         return stripe_nr * rbio->stripe_nsectors + sector_nr;
 679 }
 680
 681 /* Return a sector from rbio->stripe_sectors, not from the bio list */
 682 static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio,
 683                                              unsigned int stripe_nr,
 684                                              unsigned int sector_nr)
 685 {
 686         return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr,
 687                                                               sector_nr)];
 688 }
 689
 690 /* Grab a sector inside P stripe */
 691 static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio,
 692                                               unsigned int sector_nr)
 693 {
 694         return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr);
 695 }
 696
 697 /* Grab a sector inside Q stripe, return NULL if not RAID6 */
 698 static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio,
 699                                               unsigned int sector_nr)
 700 {
 701         if (rbio->nr_data + 1 == rbio->real_stripes)
 702                 return NULL;
 703         return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr);
 704 }
 705
 706 /*
 707  * The first stripe in the table for a logical address
 708  * has the lock.  rbios are added in one of three ways:
 709  *
 710  * 1) Nobody has the stripe locked yet.  The rbio is given
 711  * the lock and 0 is returned.  The caller must start the IO
 712  * themselves.
 713  *
 714  * 2) Someone has the stripe locked, but we're able to merge
 715  * with the lock owner.  The rbio is freed and the IO will
 716  * start automatically along with the existing rbio.  1 is returned.
 717  *
 718  * 3) Someone has the stripe locked, but we're not able to merge.
 719  * The rbio is added to the lock owner's plug list, or merged into
 720  * an rbio already on the plug list.  When the lock owner unlocks,
 721  * the next rbio on the list is run and the IO is started automatically.
 722  * 1 is returned
 723  *
 724  * If we return 0, the caller still owns the rbio and must continue with
 725  * IO submission.  If we return 1, the caller must assume the rbio has
 726  * already been freed.
 727  */
 728 static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
 729 {
 730         struct btrfs_stripe_hash *h;
 731         struct btrfs_raid_bio *cur;
 732         struct btrfs_raid_bio *pending;
 733         unsigned long flags;
 734         struct btrfs_raid_bio *freeit = NULL;
 735         struct btrfs_raid_bio *cache_drop = NULL;
 736         int ret = 0;
 737
 738         h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
 739
 740         spin_lock_irqsave(&h->lock, flags);
 741         list_for_each_entry(cur, &h->hash_list, hash_list) {
 742                 if (cur->bioc->raid_map[0] != rbio->bioc->raid_map[0])
 743                         continue;
 744
 745                 spin_lock(&cur->bio_list_lock);
 746
 747                 /* Can we steal this cached rbio's pages? */
 748                 if (bio_list_empty(&cur->bio_list) &&
 749                     list_empty(&cur->plug_list) &&
 750                     test_bit(RBIO_CACHE_BIT, &cur->flags) &&
 751                     !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
 752                         list_del_init(&cur->hash_list);
 753                         refcount_dec(&cur->refs);
 754
 755                         steal_rbio(cur, rbio);
 756                         cache_drop = cur;
 757                         spin_unlock(&cur->bio_list_lock);
 758
 759                         goto lockit;
 760                 }
 761
 762                 /* Can we merge into the lock owner? */
 763                 if (rbio_can_merge(cur, rbio)) {
 764                         merge_rbio(cur, rbio);
 765                         spin_unlock(&cur->bio_list_lock);
 766                         freeit = rbio;
 767                         ret = 1;
 768                         goto out;
 769                 }
 770
 771
 772                 /*
 773                  * We couldn't merge with the running rbio, see if we can merge
 774                  * with the pending ones.  We don't have to check for rmw_locked
 775                  * because there is no way they are inside finish_rmw right now
 776                  */
 777                 list_for_each_entry(pending, &cur->plug_list, plug_list) {
 778                         if (rbio_can_merge(pending, rbio)) {
 779                                 merge_rbio(pending, rbio);
 780                                 spin_unlock(&cur->bio_list_lock);
 781                                 freeit = rbio;
 782                                 ret = 1;
 783                                 goto out;
 784                         }
 785                 }
 786
 787                 /*
 788                  * No merging, put us on the tail of the plug list, our rbio
 789                  * will be started with the currently running rbio unlocks
 790                  */
 791                 list_add_tail(&rbio->plug_list, &cur->plug_list);
 792                 spin_unlock(&cur->bio_list_lock);
 793                 ret = 1;
 794                 goto out;
 795         }
 796 lockit:
 797         refcount_inc(&rbio->refs);
 798         list_add(&rbio->hash_list, &h->hash_list);
 799 out:
 800         spin_unlock_irqrestore(&h->lock, flags);
 801         if (cache_drop)
 802                 remove_rbio_from_cache(cache_drop);
 803         if (freeit)
 804                 __free_raid_bio(freeit);
 805         return ret;
 806 }
 807
 808 /*
 809  * called as rmw or parity rebuild is completed.  If the plug list has more
 810  * rbios waiting for this stripe, the next one on the list will be started
 811  */
 812 static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
 813 {
 814         int bucket;
 815         struct btrfs_stripe_hash *h;
 816         unsigned long flags;
 817         int keep_cache = 0;
 818
 819         bucket = rbio_bucket(rbio);
 820         h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;
 821
 822         if (list_empty(&rbio->plug_list))
 823                 cache_rbio(rbio);
 824
 825         spin_lock_irqsave(&h->lock, flags);
 826         spin_lock(&rbio->bio_list_lock);
 827
 828         if (!list_empty(&rbio->hash_list)) {
 829                 /*
 830                  * if we're still cached and there is no other IO
 831                  * to perform, just leave this rbio here for others
 832                  * to steal from later
 833                  */
 834                 if (list_empty(&rbio->plug_list) &&
 835                     test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
 836                         keep_cache = 1;
 837                         clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
 838                         BUG_ON(!bio_list_empty(&rbio->bio_list));
 839                         goto done;
 840                 }
 841
 842                 list_del_init(&rbio->hash_list);
 843                 refcount_dec(&rbio->refs);
 844
 845                 /*
 846                  * we use the plug list to hold all the rbios
 847                  * waiting for the chance to lock this stripe.
 848                  * hand the lock over to one of them.
 849                  */
 850                 if (!list_empty(&rbio->plug_list)) {
 851                         struct btrfs_raid_bio *next;
 852                         struct list_head *head = rbio->plug_list.next;
 853
 854                         next = list_entry(head, struct btrfs_raid_bio,
 855                                           plug_list);
 856
 857                         list_del_init(&rbio->plug_list);
 858
 859                         list_add(&next->hash_list, &h->hash_list);
 860                         refcount_inc(&next->refs);
 861                         spin_unlock(&rbio->bio_list_lock);
 862                         spin_unlock_irqrestore(&h->lock, flags);
 863
 864                         if (next->operation == BTRFS_RBIO_READ_REBUILD)
 865                                 start_async_work(next, read_rebuild_work);
 866                         else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
 867                                 steal_rbio(rbio, next);
 868                                 start_async_work(next, read_rebuild_work);
 869                         } else if (next->operation == BTRFS_RBIO_WRITE) {
 870                                 steal_rbio(rbio, next);
 871                                 start_async_work(next, rmw_work);
 872                         } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
 873                                 steal_rbio(rbio, next);
 874                                 start_async_work(next, scrub_parity_work);
 875                         }
 876
 877                         goto done_nolock;
 878                 }
 879         }
 880 done:
 881         spin_unlock(&rbio->bio_list_lock);
 882         spin_unlock_irqrestore(&h->lock, flags);
 883
 884 done_nolock:
 885         if (!keep_cache)
 886                 remove_rbio_from_cache(rbio);
 887 }
 888
 889 static void __free_raid_bio(struct btrfs_raid_bio *rbio)
 890 {
 891         int i;
 892
 893         if (!refcount_dec_and_test(&rbio->refs))
 894                 return;
 895
 896         WARN_ON(!list_empty(&rbio->stripe_cache));
 897         WARN_ON(!list_empty(&rbio->hash_list));
 898         WARN_ON(!bio_list_empty(&rbio->bio_list));
 899
 900         for (i = 0; i < rbio->nr_pages; i++) {
 901                 if (rbio->stripe_pages[i]) {
 902                         __free_page(rbio->stripe_pages[i]);
 903                         rbio->stripe_pages[i] = NULL;
 904                 }
 905         }
 906
 907         btrfs_put_bioc(rbio->bioc);
 908         kfree(rbio);
 909 }
 910
 911 static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
 912 {
 913         struct bio *next;
 914
 915         while (cur) {
 916                 next = cur->bi_next;
 917                 cur->bi_next = NULL;
 918                 cur->bi_status = err;
 919                 bio_endio(cur);
 920                 cur = next;
 921         }
 922 }
 923
 924 /*
 925  * this frees the rbio and runs through all the bios in the
 926  * bio_list and calls end_io on them
 927  */
 928 static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
 929 {
 930         struct bio *cur = bio_list_get(&rbio->bio_list);
 931         struct bio *extra;
 932
 933         if (rbio->generic_bio_cnt)
 934                 btrfs_bio_counter_sub(rbio->bioc->fs_info, rbio->generic_bio_cnt);
 935
 936         /*
 937          * At this moment, rbio->bio_list is empty, however since rbio does not
 938          * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
 939          * hash list, rbio may be merged with others so that rbio->bio_list
 940          * becomes non-empty.
 941          * Once unlock_stripe() is done, rbio->bio_list will not be updated any
 942          * more and we can call bio_endio() on all queued bios.
 943          */
 944         unlock_stripe(rbio);
 945         extra = bio_list_get(&rbio->bio_list);
 946         __free_raid_bio(rbio);
 947
 948         rbio_endio_bio_list(cur, err);
 949         if (extra)
 950                 rbio_endio_bio_list(extra, err);
 951 }
 952
 953 /*
 954  * end io function used by finish_rmw.  When we finally
 955  * get here, we've written a full stripe
 956  */
 957 static void raid_write_end_io(struct bio *bio)
 958 {
 959         struct btrfs_raid_bio *rbio = bio->bi_private;
 960         blk_status_t err = bio->bi_status;
 961         int max_errors;
 962
 963         if (err)
 964                 fail_bio_stripe(rbio, bio);
 965
 966         bio_put(bio);
 967
 968         if (!atomic_dec_and_test(&rbio->stripes_pending))
 969                 return;
 970
 971         err = BLK_STS_OK;
 972
 973         /* OK, we have read all the stripes we need to. */
 974         max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
 975                      0 : rbio->bioc->max_errors;
 976         if (atomic_read(&rbio->error) > max_errors)
 977                 err = BLK_STS_IOERR;
 978
 979         rbio_orig_end_io(rbio, err);
 980 }
 981
 982 /**
 983  * Get a sector pointer specified by its @stripe_nr and @sector_nr
 984  *
 985  * @rbio:               The raid bio
 986  * @stripe_nr:          Stripe number, valid range [0, real_stripe)
 987  * @sector_nr:          Sector number inside the stripe,
 988  *                      valid range [0, stripe_nsectors)
 989  * @bio_list_only:      Whether to use sectors inside the bio list only.
 990  *
 991  * The read/modify/write code wants to reuse the original bio page as much
 992  * as possible, and only use stripe_sectors as fallback.
 993  */
 994 static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
 995                                          int stripe_nr, int sector_nr,
 996                                          bool bio_list_only)
 997 {
 998         struct sector_ptr *sector;
 999         int index;
1000
1001         ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes);
1002         ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
1003
1004         index = stripe_nr * rbio->stripe_nsectors + sector_nr;
1005         ASSERT(index >= 0 && index < rbio->nr_sectors);
1006
1007         spin_lock_irq(&rbio->bio_list_lock);
1008         sector = &rbio->bio_sectors[index];
1009         if (sector->page || bio_list_only) {
1010                 /* Don't return sector without a valid page pointer */
1011                 if (!sector->page)
1012                         sector = NULL;
1013                 spin_unlock_irq(&rbio->bio_list_lock);
1014                 return sector;
1015         }
1016         spin_unlock_irq(&rbio->bio_list_lock);
1017
1018         return &rbio->stripe_sectors[index];
1019 }
1020
1021 /*
1022  * allocation and initial setup for the btrfs_raid_bio.  Not
1023  * this does not allocate any pages for rbio->pages.
1024  */
1025 static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
1026                                          struct btrfs_io_context *bioc,
1027                                          u32 stripe_len)
1028 {
1029         const unsigned int real_stripes = bioc->num_stripes - bioc->num_tgtdevs;
1030         const unsigned int stripe_npages = stripe_len >> PAGE_SHIFT;
1031         const unsigned int num_pages = stripe_npages * real_stripes;
1032         const unsigned int stripe_nsectors = stripe_len >> fs_info->sectorsize_bits;
1033         const unsigned int num_sectors = stripe_nsectors * real_stripes;
1034         struct btrfs_raid_bio *rbio;
1035         int nr_data = 0;
1036         void *p;
1037
1038         ASSERT(IS_ALIGNED(stripe_len, PAGE_SIZE));
1039         /* PAGE_SIZE must also be aligned to sectorsize for subpage support */
1040         ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
1041
1042         rbio = kzalloc(sizeof(*rbio) +
1043                        sizeof(*rbio->stripe_pages) * num_pages +
1044                        sizeof(*rbio->bio_sectors) * num_sectors +
1045                        sizeof(*rbio->stripe_sectors) * num_sectors +
1046                        sizeof(*rbio->finish_pointers) * real_stripes +
1047                        sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_nsectors) +
1048                        sizeof(*rbio->finish_pbitmap) * BITS_TO_LONGS(stripe_nsectors),
1049                        GFP_NOFS);
1050         if (!rbio)
1051                 return ERR_PTR(-ENOMEM);
1052
1053         bio_list_init(&rbio->bio_list);
1054         INIT_LIST_HEAD(&rbio->plug_list);
1055         spin_lock_init(&rbio->bio_list_lock);
1056         INIT_LIST_HEAD(&rbio->stripe_cache);
1057         INIT_LIST_HEAD(&rbio->hash_list);
1058         rbio->bioc = bioc;
1059         rbio->stripe_len = stripe_len;
1060         rbio->nr_pages = num_pages;
1061         rbio->nr_sectors = num_sectors;
1062         rbio->real_stripes = real_stripes;
1063         rbio->stripe_npages = stripe_npages;
1064         rbio->stripe_nsectors = stripe_nsectors;
1065         rbio->faila = -1;
1066         rbio->failb = -1;
1067         refcount_set(&rbio->refs, 1);
1068         atomic_set(&rbio->error, 0);
1069         atomic_set(&rbio->stripes_pending, 0);
1070
1071         /*
1072          * The stripe_pages, bio_sectors, etc arrays point to the extra memory
1073          * we allocated past the end of the rbio.
1074          */
1075         p = rbio + 1;
1076 #define CONSUME_ALLOC(ptr, count)       do {                            \
1077                 ptr = p;                                                \
1078                 p = (unsigned char *)p + sizeof(*(ptr)) * (count);      \
1079         } while (0)
1080         CONSUME_ALLOC(rbio->stripe_pages, num_pages);
1081         CONSUME_ALLOC(rbio->bio_sectors, num_sectors);
1082         CONSUME_ALLOC(rbio->stripe_sectors, num_sectors);
1083         CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
1084         CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_nsectors));
1085         CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_nsectors));
1086 #undef  CONSUME_ALLOC
1087
1088         if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
1089                 nr_data = real_stripes - 1;
1090         else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
1091                 nr_data = real_stripes - 2;
1092         else
1093                 BUG();
1094
1095         rbio->nr_data = nr_data;
1096         return rbio;
1097 }
1098
1099 /* allocate pages for all the stripes in the bio, including parity */
1100 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
1101 {
1102         int ret;
1103
1104         ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages);
1105         if (ret < 0)
1106                 return ret;
1107         /* Mapping all sectors */
1108         index_stripe_sectors(rbio);
1109         return 0;
1110 }
1111
1112 /* only allocate pages for p/q stripes */
1113 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
1114 {
1115         const int data_pages = rbio->nr_data * rbio->stripe_npages;
1116         int ret;
1117
1118         ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
1119                                      rbio->stripe_pages + data_pages);
1120         if (ret < 0)
1121                 return ret;
1122
1123         index_stripe_sectors(rbio);
1124         return 0;
1125 }
1126
1127 /*
1128  * Add a single sector @sector into our list of bios for IO.
1129  *
1130  * Return 0 if everything went well.
1131  * Return <0 for error.
1132  */
1133 static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
1134                               struct bio_list *bio_list,
1135                               struct sector_ptr *sector,
1136                               unsigned int stripe_nr,
1137                               unsigned int sector_nr,
1138                               unsigned long bio_max_len,
1139                               unsigned int opf)
1140 {
1141         const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1142         struct bio *last = bio_list->tail;
1143         int ret;
1144         struct bio *bio;
1145         struct btrfs_io_stripe *stripe;
1146         u64 disk_start;
1147
1148         /*
1149          * Note: here stripe_nr has taken device replace into consideration,
1150          * thus it can be larger than rbio->real_stripe.
1151          * So here we check against bioc->num_stripes, not rbio->real_stripes.
1152          */
1153         ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes);
1154         ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
1155         ASSERT(sector->page);
1156
1157         stripe = &rbio->bioc->stripes[stripe_nr];
1158         disk_start = stripe->physical + sector_nr * sectorsize;
1159
1160         /* if the device is missing, just fail this stripe */
1161         if (!stripe->dev->bdev)
1162                 return fail_rbio_index(rbio, stripe_nr);
1163
1164         /* see if we can add this page onto our existing bio */
1165         if (last) {
1166                 u64 last_end = last->bi_iter.bi_sector << 9;
1167                 last_end += last->bi_iter.bi_size;
1168
1169                 /*
1170                  * we can't merge these if they are from different
1171                  * devices or if they are not contiguous
1172                  */
1173                 if (last_end == disk_start && !last->bi_status &&
1174                     last->bi_bdev == stripe->dev->bdev) {
1175                         ret = bio_add_page(last, sector->page, sectorsize,
1176                                            sector->pgoff);
1177                         if (ret == sectorsize)
1178                                 return 0;
1179                 }
1180         }
1181
1182         /* put a new bio on the list */
1183         bio = bio_alloc(stripe->dev->bdev, max(bio_max_len >> PAGE_SHIFT, 1UL),
1184                         opf, GFP_NOFS);
1185         bio->bi_iter.bi_sector = disk_start >> 9;
1186         bio->bi_private = rbio;
1187
1188         bio_add_page(bio, sector->page, sectorsize, sector->pgoff);
1189         bio_list_add(bio_list, bio);
1190         return 0;
1191 }
1192
1193 /*
1194  * while we're doing the read/modify/write cycle, we could
1195  * have errors in reading pages off the disk.  This checks
1196  * for errors and if we're not able to read the page it'll
1197  * trigger parity reconstruction.  The rmw will be finished
1198  * after we've reconstructed the failed stripes
1199  */
1200 static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
1201 {
1202         if (rbio->faila >= 0 || rbio->failb >= 0) {
1203                 BUG_ON(rbio->faila == rbio->real_stripes - 1);
1204                 __raid56_parity_recover(rbio);
1205         } else {
1206                 finish_rmw(rbio);
1207         }
1208 }
1209
1210 static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
1211 {
1212         const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1213         struct bio_vec bvec;
1214         struct bvec_iter iter;
1215         u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
1216                      rbio->bioc->raid_map[0];
1217
1218         if (bio_flagged(bio, BIO_CLONED))
1219                 bio->bi_iter = btrfs_bio(bio)->iter;
1220
1221         bio_for_each_segment(bvec, bio, iter) {
1222                 u32 bvec_offset;
1223
1224                 for (bvec_offset = 0; bvec_offset < bvec.bv_len;
1225                      bvec_offset += sectorsize, offset += sectorsize) {
1226                         int index = offset / sectorsize;
1227                         struct sector_ptr *sector = &rbio->bio_sectors[index];
1228
1229                         sector->page = bvec.bv_page;
1230                         sector->pgoff = bvec.bv_offset + bvec_offset;
1231                         ASSERT(sector->pgoff < PAGE_SIZE);
1232                 }
1233         }
1234 }
1235
1236 /*
1237  * helper function to walk our bio list and populate the bio_pages array with
1238  * the result.  This seems expensive, but it is faster than constantly
1239  * searching through the bio list as we setup the IO in finish_rmw or stripe
1240  * reconstruction.
1241  *
1242  * This must be called before you trust the answers from page_in_rbio
1243  */
1244 static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1245 {
1246         struct bio *bio;
1247
1248         spin_lock_irq(&rbio->bio_list_lock);
1249         bio_list_for_each(bio, &rbio->bio_list)
1250                 index_one_bio(rbio, bio);
1251
1252         spin_unlock_irq(&rbio->bio_list_lock);
1253 }
1254
1255 /*
1256  * this is called from one of two situations.  We either
1257  * have a full stripe from the higher layers, or we've read all
1258  * the missing bits off disk.
1259  *
1260  * This will calculate the parity and then send down any
1261  * changed blocks.
1262  */
1263 static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1264 {
1265         struct btrfs_io_context *bioc = rbio->bioc;
1266         const u32 sectorsize = bioc->fs_info->sectorsize;
1267         void **pointers = rbio->finish_pointers;
1268         int nr_data = rbio->nr_data;
1269         int stripe;
1270         int sectornr;
1271         bool has_qstripe;
1272         struct bio_list bio_list;
1273         struct bio *bio;
1274         int ret;
1275
1276         bio_list_init(&bio_list);
1277
1278         if (rbio->real_stripes - rbio->nr_data == 1)
1279                 has_qstripe = false;
1280         else if (rbio->real_stripes - rbio->nr_data == 2)
1281                 has_qstripe = true;
1282         else
1283                 BUG();
1284
1285         /* at this point we either have a full stripe,
1286          * or we've read the full stripe from the drive.
1287          * recalculate the parity and write the new results.
1288          *
1289          * We're not allowed to add any new bios to the
1290          * bio list here, anyone else that wants to
1291          * change this stripe needs to do their own rmw.
1292          */
1293         spin_lock_irq(&rbio->bio_list_lock);
1294         set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1295         spin_unlock_irq(&rbio->bio_list_lock);
1296
1297         atomic_set(&rbio->error, 0);
1298
1299         /*
1300          * now that we've set rmw_locked, run through the
1301          * bio list one last time and map the page pointers
1302          *
1303          * We don't cache full rbios because we're assuming
1304          * the higher layers are unlikely to use this area of
1305          * the disk again soon.  If they do use it again,
1306          * hopefully they will send another full bio.
1307          */
1308         index_rbio_pages(rbio);
1309         if (!rbio_is_full(rbio))
1310                 cache_rbio_pages(rbio);
1311         else
1312                 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1313
1314         for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
1315                 struct sector_ptr *sector;
1316
1317                 /* First collect one sector from each data stripe */
1318                 for (stripe = 0; stripe < nr_data; stripe++) {
1319                         sector = sector_in_rbio(rbio, stripe, sectornr, 0);
1320                         pointers[stripe] = kmap_local_page(sector->page) +
1321                                            sector->pgoff;
1322                 }
1323
1324                 /* Then add the parity stripe */
1325                 sector = rbio_pstripe_sector(rbio, sectornr);
1326                 sector->uptodate = 1;
1327                 pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff;
1328
1329                 if (has_qstripe) {
1330                         /*
1331                          * RAID6, add the qstripe and call the library function
1332                          * to fill in our p/q
1333                          */
1334                         sector = rbio_qstripe_sector(rbio, sectornr);
1335                         sector->uptodate = 1;
1336                         pointers[stripe++] = kmap_local_page(sector->page) +
1337                                              sector->pgoff;
1338
1339                         raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
1340                                                 pointers);
1341                 } else {
1342                         /* raid5 */
1343                         memcpy(pointers[nr_data], pointers[0], sectorsize);
1344                         run_xor(pointers + 1, nr_data - 1, sectorsize);
1345                 }
1346                 for (stripe = stripe - 1; stripe >= 0; stripe--)
1347                         kunmap_local(pointers[stripe]);
1348         }
1349
1350         /*
1351          * time to start writing.  Make bios for everything from the
1352          * higher layers (the bio_list in our rbio) and our p/q.  Ignore
1353          * everything else.
1354          */
1355         for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1356                 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
1357                         struct sector_ptr *sector;
1358
1359                         if (stripe < rbio->nr_data) {
1360                                 sector = sector_in_rbio(rbio, stripe, sectornr, 1);
1361                                 if (!sector)
1362                                         continue;
1363                         } else {
1364                                 sector = rbio_stripe_sector(rbio, stripe, sectornr);
1365                         }
1366
1367                         ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
1368                                                  sectornr, rbio->stripe_len,
1369                                                  REQ_OP_WRITE);
1370                         if (ret)
1371                                 goto cleanup;
1372                 }
1373         }
1374
1375         if (likely(!bioc->num_tgtdevs))
1376                 goto write_data;
1377
1378         for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1379                 if (!bioc->tgtdev_map[stripe])
1380                         continue;
1381
1382                 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
1383                         struct sector_ptr *sector;
1384
1385                         if (stripe < rbio->nr_data) {
1386                                 sector = sector_in_rbio(rbio, stripe, sectornr, 1);
1387                                 if (!sector)
1388                                         continue;
1389                         } else {
1390                                 sector = rbio_stripe_sector(rbio, stripe, sectornr);
1391                         }
1392
1393                         ret = rbio_add_io_sector(rbio, &bio_list, sector,
1394                                                rbio->bioc->tgtdev_map[stripe],
1395                                                sectornr, rbio->stripe_len,
1396                                                REQ_OP_WRITE);
1397                         if (ret)
1398                                 goto cleanup;
1399                 }
1400         }
1401
1402 write_data:
1403         atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
1404         BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
1405
1406         while ((bio = bio_list_pop(&bio_list))) {
1407                 bio->bi_end_io = raid_write_end_io;
1408
1409                 submit_bio(bio);
1410         }
1411         return;
1412
1413 cleanup:
1414         rbio_orig_end_io(rbio, BLK_STS_IOERR);
1415
1416         while ((bio = bio_list_pop(&bio_list)))
1417                 bio_put(bio);
1418 }
1419
1420 /*
1421  * helper to find the stripe number for a given bio.  Used to figure out which
1422  * stripe has failed.  This expects the bio to correspond to a physical disk,
1423  * so it looks up based on physical sector numbers.
1424  */
1425 static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1426                            struct bio *bio)
1427 {
1428         u64 physical = bio->bi_iter.bi_sector;
1429         int i;
1430         struct btrfs_io_stripe *stripe;
1431
1432         physical <<= 9;
1433
1434         for (i = 0; i < rbio->bioc->num_stripes; i++) {
1435                 stripe = &rbio->bioc->stripes[i];
1436                 if (in_range(physical, stripe->physical, rbio->stripe_len) &&
1437                     stripe->dev->bdev && bio->bi_bdev == stripe->dev->bdev) {
1438                         return i;
1439                 }
1440         }
1441         return -1;
1442 }
1443
1444 /*
1445  * helper to find the stripe number for a given
1446  * bio (before mapping).  Used to figure out which stripe has
1447  * failed.  This looks up based on logical block numbers.
1448  */
1449 static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
1450                                    struct bio *bio)
1451 {
1452         u64 logical = bio->bi_iter.bi_sector << 9;
1453         int i;
1454
1455         for (i = 0; i < rbio->nr_data; i++) {
1456                 u64 stripe_start = rbio->bioc->raid_map[i];
1457
1458                 if (in_range(logical, stripe_start, rbio->stripe_len))
1459                         return i;
1460         }
1461         return -1;
1462 }
1463
1464 /*
1465  * returns -EIO if we had too many failures
1466  */
1467 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1468 {
1469         unsigned long flags;
1470         int ret = 0;
1471
1472         spin_lock_irqsave(&rbio->bio_list_lock, flags);
1473
1474         /* we already know this stripe is bad, move on */
1475         if (rbio->faila == failed || rbio->failb == failed)
1476                 goto out;
1477
1478         if (rbio->faila == -1) {
1479                 /* first failure on this rbio */
1480                 rbio->faila = failed;
1481                 atomic_inc(&rbio->error);
1482         } else if (rbio->failb == -1) {
1483                 /* second failure on this rbio */
1484                 rbio->failb = failed;
1485                 atomic_inc(&rbio->error);
1486         } else {
1487                 ret = -EIO;
1488         }
1489 out:
1490         spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1491
1492         return ret;
1493 }
1494
1495 /*
1496  * helper to fail a stripe based on a physical disk
1497  * bio.
1498  */
1499 static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1500                            struct bio *bio)
1501 {
1502         int failed = find_bio_stripe(rbio, bio);
1503
1504         if (failed < 0)
1505                 return -EIO;
1506
1507         return fail_rbio_index(rbio, failed);
1508 }
1509
1510 /*
1511  * For subpage case, we can no longer set page Uptodate directly for
1512  * stripe_pages[], thus we need to locate the sector.
1513  */
1514 static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio,
1515                                              struct page *page,
1516                                              unsigned int pgoff)
1517 {
1518         int i;
1519
1520         for (i = 0; i < rbio->nr_sectors; i++) {
1521                 struct sector_ptr *sector = &rbio->stripe_sectors[i];
1522
1523                 if (sector->page == page && sector->pgoff == pgoff)
1524                         return sector;
1525         }
1526         return NULL;
1527 }
1528
1529 /*
1530  * this sets each page in the bio uptodate.  It should only be used on private
1531  * rbio pages, nothing that comes in from the higher layers
1532  */
1533 static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
1534 {
1535         const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1536         struct bio_vec *bvec;
1537         struct bvec_iter_all iter_all;
1538
1539         ASSERT(!bio_flagged(bio, BIO_CLONED));
1540
1541         bio_for_each_segment_all(bvec, bio, iter_all) {
1542                 struct sector_ptr *sector;
1543                 int pgoff;
1544
1545                 for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len;
1546                      pgoff += sectorsize) {
1547                         sector = find_stripe_sector(rbio, bvec->bv_page, pgoff);
1548                         ASSERT(sector);
1549                         if (sector)
1550                                 sector->uptodate = 1;
1551                 }
1552         }
1553 }
1554
1555 /*
1556  * end io for the read phase of the rmw cycle.  All the bios here are physical
1557  * stripe bios we've read from the disk so we can recalculate the parity of the
1558  * stripe.
1559  *
1560  * This will usually kick off finish_rmw once all the bios are read in, but it
1561  * may trigger parity reconstruction if we had any errors along the way
1562  */
1563 static void raid_rmw_end_io(struct bio *bio)
1564 {
1565         struct btrfs_raid_bio *rbio = bio->bi_private;
1566
1567         if (bio->bi_status)
1568                 fail_bio_stripe(rbio, bio);
1569         else
1570                 set_bio_pages_uptodate(rbio, bio);
1571
1572         bio_put(bio);
1573
1574         if (!atomic_dec_and_test(&rbio->stripes_pending))
1575                 return;
1576
1577         if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
1578                 goto cleanup;
1579
1580         /*
1581          * this will normally call finish_rmw to start our write
1582          * but if there are any failed stripes we'll reconstruct
1583          * from parity first
1584          */
1585         validate_rbio_for_rmw(rbio);
1586         return;
1587
1588 cleanup:
1589
1590         rbio_orig_end_io(rbio, BLK_STS_IOERR);
1591 }
1592
1593 /*
1594  * the stripe must be locked by the caller.  It will
1595  * unlock after all the writes are done
1596  */
1597 static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1598 {
1599         int bios_to_read = 0;
1600         struct bio_list bio_list;
1601         int ret;
1602         int sectornr;
1603         int stripe;
1604         struct bio *bio;
1605
1606         bio_list_init(&bio_list);
1607
1608         ret = alloc_rbio_pages(rbio);
1609         if (ret)
1610                 goto cleanup;
1611
1612         index_rbio_pages(rbio);
1613
1614         atomic_set(&rbio->error, 0);
1615         /*
1616          * build a list of bios to read all the missing parts of this
1617          * stripe
1618          */
1619         for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1620                 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
1621                         struct sector_ptr *sector;
1622
1623                         /*
1624                          * We want to find all the sectors missing from the
1625                          * rbio and read them from the disk.  If * sector_in_rbio()
1626                          * finds a page in the bio list we don't need to read
1627                          * it off the stripe.
1628                          */
1629                         sector = sector_in_rbio(rbio, stripe, sectornr, 1);
1630                         if (sector)
1631                                 continue;
1632
1633                         sector = rbio_stripe_sector(rbio, stripe, sectornr);
1634                         /*
1635                          * The bio cache may have handed us an uptodate page.
1636                          * If so, be happy and use it.
1637                          */
1638                         if (sector->uptodate)
1639                                 continue;
1640
1641                         ret = rbio_add_io_sector(rbio, &bio_list, sector,
1642                                        stripe, sectornr, rbio->stripe_len,
1643                                        REQ_OP_READ);
1644                         if (ret)
1645                                 goto cleanup;
1646                 }
1647         }
1648
1649         bios_to_read = bio_list_size(&bio_list);
1650         if (!bios_to_read) {
1651                 /*
1652                  * this can happen if others have merged with
1653                  * us, it means there is nothing left to read.
1654                  * But if there are missing devices it may not be
1655                  * safe to do the full stripe write yet.
1656                  */
1657                 goto finish;
1658         }
1659
1660         /*
1661          * The bioc may be freed once we submit the last bio. Make sure not to
1662          * touch it after that.
1663          */
1664         atomic_set(&rbio->stripes_pending, bios_to_read);
1665         while ((bio = bio_list_pop(&bio_list))) {
1666                 bio->bi_end_io = raid_rmw_end_io;
1667
1668                 btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
1669
1670                 submit_bio(bio);
1671         }
1672         /* the actual write will happen once the reads are done */
1673         return 0;
1674
1675 cleanup:
1676         rbio_orig_end_io(rbio, BLK_STS_IOERR);
1677
1678         while ((bio = bio_list_pop(&bio_list)))
1679                 bio_put(bio);
1680
1681         return -EIO;
1682
1683 finish:
1684         validate_rbio_for_rmw(rbio);
1685         return 0;
1686 }
1687
1688 /*
1689  * if the upper layers pass in a full stripe, we thank them by only allocating
1690  * enough pages to hold the parity, and sending it all down quickly.
1691  */
1692 static int full_stripe_write(struct btrfs_raid_bio *rbio)
1693 {
1694         int ret;
1695
1696         ret = alloc_rbio_parity_pages(rbio);
1697         if (ret) {
1698                 __free_raid_bio(rbio);
1699                 return ret;
1700         }
1701
1702         ret = lock_stripe_add(rbio);
1703         if (ret == 0)
1704                 finish_rmw(rbio);
1705         return 0;
1706 }
1707
1708 /*
1709  * partial stripe writes get handed over to async helpers.
1710  * We're really hoping to merge a few more writes into this
1711  * rbio before calculating new parity
1712  */
1713 static int partial_stripe_write(struct btrfs_raid_bio *rbio)
1714 {
1715         int ret;
1716
1717         ret = lock_stripe_add(rbio);
1718         if (ret == 0)
1719                 start_async_work(rbio, rmw_work);
1720         return 0;
1721 }
1722
1723 /*
1724  * sometimes while we were reading from the drive to
1725  * recalculate parity, enough new bios come into create
1726  * a full stripe.  So we do a check here to see if we can
1727  * go directly to finish_rmw
1728  */
1729 static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
1730 {
1731         /* head off into rmw land if we don't have a full stripe */
1732         if (!rbio_is_full(rbio))
1733                 return partial_stripe_write(rbio);
1734         return full_stripe_write(rbio);
1735 }
1736
1737 /*
1738  * We use plugging call backs to collect full stripes.
1739  * Any time we get a partial stripe write while plugged
1740  * we collect it into a list.  When the unplug comes down,
1741  * we sort the list by logical block number and merge
1742  * everything we can into the same rbios
1743  */
1744 struct btrfs_plug_cb {
1745         struct blk_plug_cb cb;
1746         struct btrfs_fs_info *info;
1747         struct list_head rbio_list;
1748         struct work_struct work;
1749 };
1750
1751 /*
1752  * rbios on the plug list are sorted for easier merging.
1753  */
1754 static int plug_cmp(void *priv, const struct list_head *a,
1755                     const struct list_head *b)
1756 {
1757         const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1758                                                        plug_list);
1759         const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1760                                                        plug_list);
1761         u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
1762         u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
1763
1764         if (a_sector < b_sector)
1765                 return -1;
1766         if (a_sector > b_sector)
1767                 return 1;
1768         return 0;
1769 }
1770
1771 static void run_plug(struct btrfs_plug_cb *plug)
1772 {
1773         struct btrfs_raid_bio *cur;
1774         struct btrfs_raid_bio *last = NULL;
1775
1776         /*
1777          * sort our plug list then try to merge
1778          * everything we can in hopes of creating full
1779          * stripes.
1780          */
1781         list_sort(NULL, &plug->rbio_list, plug_cmp);
1782         while (!list_empty(&plug->rbio_list)) {
1783                 cur = list_entry(plug->rbio_list.next,
1784                                  struct btrfs_raid_bio, plug_list);
1785                 list_del_init(&cur->plug_list);
1786
1787                 if (rbio_is_full(cur)) {
1788                         int ret;
1789
1790                         /* we have a full stripe, send it down */
1791                         ret = full_stripe_write(cur);
1792                         BUG_ON(ret);
1793                         continue;
1794                 }
1795                 if (last) {
1796                         if (rbio_can_merge(last, cur)) {
1797                                 merge_rbio(last, cur);
1798                                 __free_raid_bio(cur);
1799                                 continue;
1800
1801                         }
1802                         __raid56_parity_write(last);
1803                 }
1804                 last = cur;
1805         }
1806         if (last) {
1807                 __raid56_parity_write(last);
1808         }
1809         kfree(plug);
1810 }
1811
1812 /*
1813  * if the unplug comes from schedule, we have to push the
1814  * work off to a helper thread
1815  */
1816 static void unplug_work(struct work_struct *work)
1817 {
1818         struct btrfs_plug_cb *plug;
1819         plug = container_of(work, struct btrfs_plug_cb, work);
1820         run_plug(plug);
1821 }
1822
1823 static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1824 {
1825         struct btrfs_plug_cb *plug;
1826         plug = container_of(cb, struct btrfs_plug_cb, cb);
1827
1828         if (from_schedule) {
1829                 INIT_WORK(&plug->work, unplug_work);
1830                 queue_work(plug->info->rmw_workers, &plug->work);
1831                 return;
1832         }
1833         run_plug(plug);
1834 }
1835
1836 /*
1837  * our main entry point for writes from the rest of the FS.
1838  */
1839 int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stripe_len)
1840 {
1841         struct btrfs_fs_info *fs_info = bioc->fs_info;
1842         struct btrfs_raid_bio *rbio;
1843         struct btrfs_plug_cb *plug = NULL;
1844         struct blk_plug_cb *cb;
1845         int ret;
1846
1847         rbio = alloc_rbio(fs_info, bioc, stripe_len);
1848         if (IS_ERR(rbio)) {
1849                 btrfs_put_bioc(bioc);
1850                 return PTR_ERR(rbio);
1851         }
1852         bio_list_add(&rbio->bio_list, bio);
1853         rbio->bio_list_bytes = bio->bi_iter.bi_size;
1854         rbio->operation = BTRFS_RBIO_WRITE;
1855
1856         btrfs_bio_counter_inc_noblocked(fs_info);
1857         rbio->generic_bio_cnt = 1;
1858
1859         /*
1860          * don't plug on full rbios, just get them out the door
1861          * as quickly as we can
1862          */
1863         if (rbio_is_full(rbio)) {
1864                 ret = full_stripe_write(rbio);
1865                 if (ret)
1866                         btrfs_bio_counter_dec(fs_info);
1867                 return ret;
1868         }
1869
1870         cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
1871         if (cb) {
1872                 plug = container_of(cb, struct btrfs_plug_cb, cb);
1873                 if (!plug->info) {
1874                         plug->info = fs_info;
1875                         INIT_LIST_HEAD(&plug->rbio_list);
1876                 }
1877                 list_add_tail(&rbio->plug_list, &plug->rbio_list);
1878                 ret = 0;
1879         } else {
1880                 ret = __raid56_parity_write(rbio);
1881                 if (ret)
1882                         btrfs_bio_counter_dec(fs_info);
1883         }
1884         return ret;
1885 }
1886
1887 /*
1888  * all parity reconstruction happens here.  We've read in everything
1889  * we can find from the drives and this does the heavy lifting of
1890  * sorting the good from the bad.
1891  */
1892 static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1893 {
1894         const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1895         int sectornr, stripe;
1896         void **pointers;
1897         void **unmap_array;
1898         int faila = -1, failb = -1;
1899         blk_status_t err;
1900         int i;
1901
1902         /*
1903          * This array stores the pointer for each sector, thus it has the extra
1904          * pgoff value added from each sector
1905          */
1906         pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1907         if (!pointers) {
1908                 err = BLK_STS_RESOURCE;
1909                 goto cleanup_io;
1910         }
1911
1912         /*
1913          * Store copy of pointers that does not get reordered during
1914          * reconstruction so that kunmap_local works.
1915          */
1916         unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1917         if (!unmap_array) {
1918                 err = BLK_STS_RESOURCE;
1919                 goto cleanup_pointers;
1920         }
1921
1922         faila = rbio->faila;
1923         failb = rbio->failb;
1924
1925         if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1926             rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
1927                 spin_lock_irq(&rbio->bio_list_lock);
1928                 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1929                 spin_unlock_irq(&rbio->bio_list_lock);
1930         }
1931
1932         index_rbio_pages(rbio);
1933
1934         for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
1935                 struct sector_ptr *sector;
1936
1937                 /*
1938                  * Now we just use bitmap to mark the horizontal stripes in
1939                  * which we have data when doing parity scrub.
1940                  */
1941                 if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
1942                     !test_bit(sectornr, rbio->dbitmap))
1943                         continue;
1944
1945                 /*
1946                  * Setup our array of pointers with sectors from each stripe
1947                  *
1948                  * NOTE: store a duplicate array of pointers to preserve the
1949                  * pointer order
1950                  */
1951                 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1952                         /*
1953                          * If we're rebuilding a read, we have to use
1954                          * pages from the bio list
1955                          */
1956                         if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1957                              rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
1958                             (stripe == faila || stripe == failb)) {
1959                                 sector = sector_in_rbio(rbio, stripe, sectornr, 0);
1960                         } else {
1961                                 sector = rbio_stripe_sector(rbio, stripe, sectornr);
1962                         }
1963                         ASSERT(sector->page);
1964                         pointers[stripe] = kmap_local_page(sector->page) +
1965                                            sector->pgoff;
1966                         unmap_array[stripe] = pointers[stripe];
1967                 }
1968
1969                 /* All raid6 handling here */
1970                 if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
1971                         /* Single failure, rebuild from parity raid5 style */
1972                         if (failb < 0) {
1973                                 if (faila == rbio->nr_data) {
1974                                         /*
1975                                          * Just the P stripe has failed, without
1976                                          * a bad data or Q stripe.
1977                                          * TODO, we should redo the xor here.
1978                                          */
1979                                         err = BLK_STS_IOERR;
1980                                         goto cleanup;
1981                                 }
1982                                 /*
1983                                  * a single failure in raid6 is rebuilt
1984                                  * in the pstripe code below
1985                                  */
1986                                 goto pstripe;
1987                         }
1988
1989                         /* make sure our ps and qs are in order */
1990                         if (faila > failb)
1991                                 swap(faila, failb);
1992
1993                         /* if the q stripe is failed, do a pstripe reconstruction
1994                          * from the xors.
1995                          * If both the q stripe and the P stripe are failed, we're
1996                          * here due to a crc mismatch and we can't give them the
1997                          * data they want
1998                          */
1999                         if (rbio->bioc->raid_map[failb] == RAID6_Q_STRIPE) {
2000                                 if (rbio->bioc->raid_map[faila] ==
2001                                     RAID5_P_STRIPE) {
2002                                         err = BLK_STS_IOERR;
2003                                         goto cleanup;
2004                                 }
2005                                 /*
2006                                  * otherwise we have one bad data stripe and
2007                                  * a good P stripe.  raid5!
2008                                  */
2009                                 goto pstripe;
2010                         }
2011
2012                         if (rbio->bioc->raid_map[failb] == RAID5_P_STRIPE) {
2013                                 raid6_datap_recov(rbio->real_stripes,
2014                                                   sectorsize, faila, pointers);
2015                         } else {
2016                                 raid6_2data_recov(rbio->real_stripes,
2017                                                   sectorsize, faila, failb,
2018                                                   pointers);
2019                         }
2020                 } else {
2021                         void *p;
2022
2023                         /* rebuild from P stripe here (raid5 or raid6) */
2024                         BUG_ON(failb != -1);
2025 pstripe:
2026                         /* Copy parity block into failed block to start with */
2027                         memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);
2028
2029                         /* rearrange the pointer array */
2030                         p = pointers[faila];
2031                         for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
2032                                 pointers[stripe] = pointers[stripe + 1];
2033                         pointers[rbio->nr_data - 1] = p;
2034
2035                         /* xor in the rest */
2036                         run_xor(pointers, rbio->nr_data - 1, sectorsize);
2037                 }
2038                 /* if we're doing this rebuild as part of an rmw, go through
2039                  * and set all of our private rbio pages in the
2040                  * failed stripes as uptodate.  This way finish_rmw will
2041                  * know they can be trusted.  If this was a read reconstruction,
2042                  * other endio functions will fiddle the uptodate bits
2043                  */
2044                 if (rbio->operation == BTRFS_RBIO_WRITE) {
2045                         for (i = 0;  i < rbio->stripe_nsectors; i++) {
2046                                 if (faila != -1) {
2047                                         sector = rbio_stripe_sector(rbio, faila, i);
2048                                         sector->uptodate = 1;
2049                                 }
2050                                 if (failb != -1) {
2051                                         sector = rbio_stripe_sector(rbio, failb, i);
2052                                         sector->uptodate = 1;
2053                                 }
2054                         }
2055                 }
2056                 for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--)
2057                         kunmap_local(unmap_array[stripe]);
2058         }
2059
2060         err = BLK_STS_OK;
2061 cleanup:
2062         kfree(unmap_array);
2063 cleanup_pointers:
2064         kfree(pointers);
2065
2066 cleanup_io:
2067         /*
2068          * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
2069          * valid rbio which is consistent with ondisk content, thus such a
2070          * valid rbio can be cached to avoid further disk reads.
2071          */
2072         if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
2073             rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
2074                 /*
2075                  * - In case of two failures, where rbio->failb != -1:
2076                  *
2077                  *   Do not cache this rbio since the above read reconstruction
2078                  *   (raid6_datap_recov() or raid6_2data_recov()) may have
2079                  *   changed some content of stripes which are not identical to
2080                  *   on-disk content any more, otherwise, a later write/recover
2081                  *   may steal stripe_pages from this rbio and end up with
2082                  *   corruptions or rebuild failures.
2083                  *
2084                  * - In case of single failure, where rbio->failb == -1:
2085                  *
2086                  *   Cache this rbio iff the above read reconstruction is
2087                  *   executed without problems.
2088                  */
2089                 if (err == BLK_STS_OK && rbio->failb < 0)
2090                         cache_rbio_pages(rbio);
2091                 else
2092                         clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2093
2094                 rbio_orig_end_io(rbio, err);
2095         } else if (err == BLK_STS_OK) {
2096                 rbio->faila = -1;
2097                 rbio->failb = -1;
2098
2099                 if (rbio->operation == BTRFS_RBIO_WRITE)
2100                         finish_rmw(rbio);
2101                 else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
2102                         finish_parity_scrub(rbio, 0);
2103                 else
2104                         BUG();
2105         } else {
2106                 rbio_orig_end_io(rbio, err);
2107         }
2108 }
2109
2110 /*
2111  * This is called only for stripes we've read from disk to
2112  * reconstruct the parity.
2113  */
2114 static void raid_recover_end_io(struct bio *bio)
2115 {
2116         struct btrfs_raid_bio *rbio = bio->bi_private;
2117
2118         /*
2119          * we only read stripe pages off the disk, set them
2120          * up to date if there were no errors
2121          */
2122         if (bio->bi_status)
2123                 fail_bio_stripe(rbio, bio);
2124         else
2125                 set_bio_pages_uptodate(rbio, bio);
2126         bio_put(bio);
2127
2128         if (!atomic_dec_and_test(&rbio->stripes_pending))
2129                 return;
2130
2131         if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
2132                 rbio_orig_end_io(rbio, BLK_STS_IOERR);
2133         else
2134                 __raid_recover_end_io(rbio);
2135 }
2136
2137 /*
2138  * reads everything we need off the disk to reconstruct
2139  * the parity. endio handlers trigger final reconstruction
2140  * when the IO is done.
2141  *
2142  * This is used both for reads from the higher layers and for
2143  * parity construction required to finish a rmw cycle.
2144  */
2145 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
2146 {
2147         int bios_to_read = 0;
2148         struct bio_list bio_list;
2149         int ret;
2150         int sectornr;
2151         int stripe;
2152         struct bio *bio;
2153
2154         bio_list_init(&bio_list);
2155
2156         ret = alloc_rbio_pages(rbio);
2157         if (ret)
2158                 goto cleanup;
2159
2160         atomic_set(&rbio->error, 0);
2161
2162         /*
2163          * read everything that hasn't failed.  Thanks to the
2164          * stripe cache, it is possible that some or all of these
2165          * pages are going to be uptodate.
2166          */
2167         for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
2168                 if (rbio->faila == stripe || rbio->failb == stripe) {
2169                         atomic_inc(&rbio->error);
2170                         continue;
2171                 }
2172
2173                 for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
2174                         struct sector_ptr *sector;
2175
2176                         /*
2177                          * the rmw code may have already read this
2178                          * page in
2179                          */
2180                         sector = rbio_stripe_sector(rbio, stripe, sectornr);
2181                         if (sector->uptodate)
2182                                 continue;
2183
2184                         ret = rbio_add_io_sector(rbio, &bio_list, sector,
2185                                                  stripe, sectornr, rbio->stripe_len,
2186                                                  REQ_OP_READ);
2187                         if (ret < 0)
2188                                 goto cleanup;
2189                 }
2190         }
2191
2192         bios_to_read = bio_list_size(&bio_list);
2193         if (!bios_to_read) {
2194                 /*
2195                  * we might have no bios to read just because the pages
2196                  * were up to date, or we might have no bios to read because
2197                  * the devices were gone.
2198                  */
2199                 if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) {
2200                         __raid_recover_end_io(rbio);
2201                         return 0;
2202                 } else {
2203                         goto cleanup;
2204                 }
2205         }
2206
2207         /*
2208          * The bioc may be freed once we submit the last bio. Make sure not to
2209          * touch it after that.
2210          */
2211         atomic_set(&rbio->stripes_pending, bios_to_read);
2212         while ((bio = bio_list_pop(&bio_list))) {
2213                 bio->bi_end_io = raid_recover_end_io;
2214
2215                 btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
2216
2217                 submit_bio(bio);
2218         }
2219
2220         return 0;
2221
2222 cleanup:
2223         if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
2224             rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
2225                 rbio_orig_end_io(rbio, BLK_STS_IOERR);
2226
2227         while ((bio = bio_list_pop(&bio_list)))
2228                 bio_put(bio);
2229
2230         return -EIO;
2231 }
2232
2233 /*
2234  * the main entry point for reads from the higher layers.  This
2235  * is really only called when the normal read path had a failure,
2236  * so we assume the bio they send down corresponds to a failed part
2237  * of the drive.
2238  */
2239 int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
2240                           u32 stripe_len, int mirror_num, int generic_io)
2241 {
2242         struct btrfs_fs_info *fs_info = bioc->fs_info;
2243         struct btrfs_raid_bio *rbio;
2244         int ret;
2245
2246         if (generic_io) {
2247                 ASSERT(bioc->mirror_num == mirror_num);
2248                 btrfs_bio(bio)->mirror_num = mirror_num;
2249         }
2250
2251         rbio = alloc_rbio(fs_info, bioc, stripe_len);
2252         if (IS_ERR(rbio)) {
2253                 if (generic_io)
2254                         btrfs_put_bioc(bioc);
2255                 return PTR_ERR(rbio);
2256         }
2257
2258         rbio->operation = BTRFS_RBIO_READ_REBUILD;
2259         bio_list_add(&rbio->bio_list, bio);
2260         rbio->bio_list_bytes = bio->bi_iter.bi_size;
2261
2262         rbio->faila = find_logical_bio_stripe(rbio, bio);
2263         if (rbio->faila == -1) {
2264                 btrfs_warn(fs_info,
2265 "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bioc has map_type %llu)",
2266                            __func__, bio->bi_iter.bi_sector << 9,
2267                            (u64)bio->bi_iter.bi_size, bioc->map_type);
2268                 if (generic_io)
2269                         btrfs_put_bioc(bioc);
2270                 kfree(rbio);
2271                 return -EIO;
2272         }
2273
2274         if (generic_io) {
2275                 btrfs_bio_counter_inc_noblocked(fs_info);
2276                 rbio->generic_bio_cnt = 1;
2277         } else {
2278                 btrfs_get_bioc(bioc);
2279         }
2280
2281         /*
2282          * Loop retry:
2283          * for 'mirror == 2', reconstruct from all other stripes.
2284          * for 'mirror_num > 2', select a stripe to fail on every retry.
2285          */
2286         if (mirror_num > 2) {
2287                 /*
2288                  * 'mirror == 3' is to fail the p stripe and
2289                  * reconstruct from the q stripe.  'mirror > 3' is to
2290                  * fail a data stripe and reconstruct from p+q stripe.
2291                  */
2292                 rbio->failb = rbio->real_stripes - (mirror_num - 1);
2293                 ASSERT(rbio->failb > 0);
2294                 if (rbio->failb <= rbio->faila)
2295                         rbio->failb--;
2296         }
2297
2298         ret = lock_stripe_add(rbio);
2299
2300         /*
2301          * __raid56_parity_recover will end the bio with
2302          * any errors it hits.  We don't want to return
2303          * its error value up the stack because our caller
2304          * will end up calling bio_endio with any nonzero
2305          * return
2306          */
2307         if (ret == 0)
2308                 __raid56_parity_recover(rbio);
2309         /*
2310          * our rbio has been added to the list of
2311          * rbios that will be handled after the
2312          * currently lock owner is done
2313          */
2314         return 0;
2315
2316 }
2317
2318 static void rmw_work(struct work_struct *work)
2319 {
2320         struct btrfs_raid_bio *rbio;
2321
2322         rbio = container_of(work, struct btrfs_raid_bio, work);
2323         raid56_rmw_stripe(rbio);
2324 }
2325
2326 static void read_rebuild_work(struct work_struct *work)
2327 {
2328         struct btrfs_raid_bio *rbio;
2329
2330         rbio = container_of(work, struct btrfs_raid_bio, work);
2331         __raid56_parity_recover(rbio);
2332 }
2333
2334 /*
2335  * The following code is used to scrub/replace the parity stripe
2336  *
2337  * Caller must have already increased bio_counter for getting @bioc.
2338  *
2339  * Note: We need make sure all the pages that add into the scrub/replace
2340  * raid bio are correct and not be changed during the scrub/replace. That
2341  * is those pages just hold metadata or file data with checksum.
2342  */
2343
2344 struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
2345                                 struct btrfs_io_context *bioc,
2346                                 u32 stripe_len, struct btrfs_device *scrub_dev,
2347                                 unsigned long *dbitmap, int stripe_nsectors)
2348 {
2349         struct btrfs_fs_info *fs_info = bioc->fs_info;
2350         struct btrfs_raid_bio *rbio;
2351         int i;
2352
2353         rbio = alloc_rbio(fs_info, bioc, stripe_len);
2354         if (IS_ERR(rbio))
2355                 return NULL;
2356         bio_list_add(&rbio->bio_list, bio);
2357         /*
2358          * This is a special bio which is used to hold the completion handler
2359          * and make the scrub rbio is similar to the other types
2360          */
2361         ASSERT(!bio->bi_iter.bi_size);
2362         rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2363
2364         /*
2365          * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
2366          * to the end position, so this search can start from the first parity
2367          * stripe.
2368          */
2369         for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
2370                 if (bioc->stripes[i].dev == scrub_dev) {
2371                         rbio->scrubp = i;
2372                         break;
2373                 }
2374         }
2375         ASSERT(i < rbio->real_stripes);
2376
2377         bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
2378
2379         /*
2380          * We have already increased bio_counter when getting bioc, record it
2381          * so we can free it at rbio_orig_end_io().
2382          */
2383         rbio->generic_bio_cnt = 1;
2384
2385         return rbio;
2386 }
2387
2388 /* Used for both parity scrub and missing. */
2389 void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
2390                             unsigned int pgoff, u64 logical)
2391 {
2392         const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
2393         int stripe_offset;
2394         int index;
2395
2396         ASSERT(logical >= rbio->bioc->raid_map[0]);
2397         ASSERT(logical + sectorsize <= rbio->bioc->raid_map[0] +
2398                                 rbio->stripe_len * rbio->nr_data);
2399         stripe_offset = (int)(logical - rbio->bioc->raid_map[0]);
2400         index = stripe_offset / sectorsize;
2401         rbio->bio_sectors[index].page = page;
2402         rbio->bio_sectors[index].pgoff = pgoff;
2403 }
2404
2405 /*
2406  * We just scrub the parity that we have correct data on the same horizontal,
2407  * so we needn't allocate all pages for all the stripes.
2408  */
2409 static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2410 {
2411         const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
2412         int stripe;
2413         int sectornr;
2414
2415         for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) {
2416                 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
2417                         struct page *page;
2418                         int index = (stripe * rbio->stripe_nsectors + sectornr) *
2419                                     sectorsize >> PAGE_SHIFT;
2420
2421                         if (rbio->stripe_pages[index])
2422                                 continue;
2423
2424                         page = alloc_page(GFP_NOFS);
2425                         if (!page)
2426                                 return -ENOMEM;
2427                         rbio->stripe_pages[index] = page;
2428                 }
2429         }
2430         index_stripe_sectors(rbio);
2431         return 0;
2432 }
2433
2434 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
2435                                          int need_check)
2436 {
2437         struct btrfs_io_context *bioc = rbio->bioc;
2438         const u32 sectorsize = bioc->fs_info->sectorsize;
2439         void **pointers = rbio->finish_pointers;
2440         unsigned long *pbitmap = rbio->finish_pbitmap;
2441         int nr_data = rbio->nr_data;
2442         int stripe;
2443         int sectornr;
2444         bool has_qstripe;
2445         struct sector_ptr p_sector = { 0 };
2446         struct sector_ptr q_sector = { 0 };
2447         struct bio_list bio_list;
2448         struct bio *bio;
2449         int is_replace = 0;
2450         int ret;
2451
2452         bio_list_init(&bio_list);
2453
2454         if (rbio->real_stripes - rbio->nr_data == 1)
2455                 has_qstripe = false;
2456         else if (rbio->real_stripes - rbio->nr_data == 2)
2457                 has_qstripe = true;
2458         else
2459                 BUG();
2460
2461         if (bioc->num_tgtdevs && bioc->tgtdev_map[rbio->scrubp]) {
2462                 is_replace = 1;
2463                 bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_nsectors);
2464         }
2465
2466         /*
2467          * Because the higher layers(scrubber) are unlikely to
2468          * use this area of the disk again soon, so don't cache
2469          * it.
2470          */
2471         clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2472
2473         if (!need_check)
2474                 goto writeback;
2475
2476         p_sector.page = alloc_page(GFP_NOFS);
2477         if (!p_sector.page)
2478                 goto cleanup;
2479         p_sector.pgoff = 0;
2480         p_sector.uptodate = 1;
2481
2482         if (has_qstripe) {
2483                 /* RAID6, allocate and map temp space for the Q stripe */
2484                 q_sector.page = alloc_page(GFP_NOFS);
2485                 if (!q_sector.page) {
2486                         __free_page(p_sector.page);
2487                         p_sector.page = NULL;
2488                         goto cleanup;
2489                 }
2490                 q_sector.pgoff = 0;
2491                 q_sector.uptodate = 1;
2492                 pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page);
2493         }
2494
2495         atomic_set(&rbio->error, 0);
2496
2497         /* Map the parity stripe just once */
2498         pointers[nr_data] = kmap_local_page(p_sector.page);
2499
2500         for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) {
2501                 struct sector_ptr *sector;
2502                 void *parity;
2503
2504                 /* first collect one page from each data stripe */
2505                 for (stripe = 0; stripe < nr_data; stripe++) {
2506                         sector = sector_in_rbio(rbio, stripe, sectornr, 0);
2507                         pointers[stripe] = kmap_local_page(sector->page) +
2508                                            sector->pgoff;
2509                 }
2510
2511                 if (has_qstripe) {
2512                         /* RAID6, call the library function to fill in our P/Q */
2513                         raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
2514                                                 pointers);
2515                 } else {
2516                         /* raid5 */
2517                         memcpy(pointers[nr_data], pointers[0], sectorsize);
2518                         run_xor(pointers + 1, nr_data - 1, sectorsize);
2519                 }
2520
2521                 /* Check scrubbing parity and repair it */
2522                 sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2523                 parity = kmap_local_page(sector->page) + sector->pgoff;
2524                 if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0)
2525                         memcpy(parity, pointers[rbio->scrubp], sectorsize);
2526                 else
2527                         /* Parity is right, needn't writeback */
2528                         bitmap_clear(rbio->dbitmap, sectornr, 1);
2529                 kunmap_local(parity);
2530
2531                 for (stripe = nr_data - 1; stripe >= 0; stripe--)
2532                         kunmap_local(pointers[stripe]);
2533         }
2534
2535         kunmap_local(pointers[nr_data]);
2536         __free_page(p_sector.page);
2537         p_sector.page = NULL;
2538         if (q_sector.page) {
2539                 kunmap_local(pointers[rbio->real_stripes - 1]);
2540                 __free_page(q_sector.page);
2541                 q_sector.page = NULL;
2542         }
2543
2544 writeback:
2545         /*
2546          * time to start writing.  Make bios for everything from the
2547          * higher layers (the bio_list in our rbio) and our p/q.  Ignore
2548          * everything else.
2549          */
2550         for_each_set_bit(sectornr, rbio->dbitmap, rbio->stripe_nsectors) {
2551                 struct sector_ptr *sector;
2552
2553                 sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2554                 ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp,
2555                                          sectornr, rbio->stripe_len, REQ_OP_WRITE);
2556                 if (ret)
2557                         goto cleanup;
2558         }
2559
2560         if (!is_replace)
2561                 goto submit_write;
2562
2563         for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
2564                 struct sector_ptr *sector;
2565
2566                 sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2567                 ret = rbio_add_io_sector(rbio, &bio_list, sector,
2568                                        bioc->tgtdev_map[rbio->scrubp],
2569                                        sectornr, rbio->stripe_len, REQ_OP_WRITE);
2570                 if (ret)
2571                         goto cleanup;
2572         }
2573
2574 submit_write:
2575         nr_data = bio_list_size(&bio_list);
2576         if (!nr_data) {
2577                 /* Every parity is right */
2578                 rbio_orig_end_io(rbio, BLK_STS_OK);
2579                 return;
2580         }
2581
2582         atomic_set(&rbio->stripes_pending, nr_data);
2583
2584         while ((bio = bio_list_pop(&bio_list))) {
2585                 bio->bi_end_io = raid_write_end_io;
2586
2587                 submit_bio(bio);
2588         }
2589         return;
2590
2591 cleanup:
2592         rbio_orig_end_io(rbio, BLK_STS_IOERR);
2593
2594         while ((bio = bio_list_pop(&bio_list)))
2595                 bio_put(bio);
2596 }
2597
2598 static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2599 {
2600         if (stripe >= 0 && stripe < rbio->nr_data)
2601                 return 1;
2602         return 0;
2603 }
2604
2605 /*
2606  * While we're doing the parity check and repair, we could have errors
2607  * in reading pages off the disk.  This checks for errors and if we're
2608  * not able to read the page it'll trigger parity reconstruction.  The
2609  * parity scrub will be finished after we've reconstructed the failed
2610  * stripes
2611  */
2612 static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
2613 {
2614         if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
2615                 goto cleanup;
2616
2617         if (rbio->faila >= 0 || rbio->failb >= 0) {
2618                 int dfail = 0, failp = -1;
2619
2620                 if (is_data_stripe(rbio, rbio->faila))
2621                         dfail++;
2622                 else if (is_parity_stripe(rbio->faila))
2623                         failp = rbio->faila;
2624
2625                 if (is_data_stripe(rbio, rbio->failb))
2626                         dfail++;
2627                 else if (is_parity_stripe(rbio->failb))
2628                         failp = rbio->failb;
2629
2630                 /*
2631                  * Because we can not use a scrubbing parity to repair
2632                  * the data, so the capability of the repair is declined.
2633                  * (In the case of RAID5, we can not repair anything)
2634                  */
2635                 if (dfail > rbio->bioc->max_errors - 1)
2636                         goto cleanup;
2637
2638                 /*
2639                  * If all data is good, only parity is correctly, just
2640                  * repair the parity.
2641                  */
2642                 if (dfail == 0) {
2643                         finish_parity_scrub(rbio, 0);
2644                         return;
2645                 }
2646
2647                 /*
2648                  * Here means we got one corrupted data stripe and one
2649                  * corrupted parity on RAID6, if the corrupted parity
2650                  * is scrubbing parity, luckily, use the other one to repair
2651                  * the data, or we can not repair the data stripe.
2652                  */
2653                 if (failp != rbio->scrubp)
2654                         goto cleanup;
2655
2656                 __raid_recover_end_io(rbio);
2657         } else {
2658                 finish_parity_scrub(rbio, 1);
2659         }
2660         return;
2661
2662 cleanup:
2663         rbio_orig_end_io(rbio, BLK_STS_IOERR);
2664 }
2665
2666 /*
2667  * end io for the read phase of the rmw cycle.  All the bios here are physical
2668  * stripe bios we've read from the disk so we can recalculate the parity of the
2669  * stripe.
2670  *
2671  * This will usually kick off finish_rmw once all the bios are read in, but it
2672  * may trigger parity reconstruction if we had any errors along the way
2673  */
2674 static void raid56_parity_scrub_end_io(struct bio *bio)
2675 {
2676         struct btrfs_raid_bio *rbio = bio->bi_private;
2677
2678         if (bio->bi_status)
2679                 fail_bio_stripe(rbio, bio);
2680         else
2681                 set_bio_pages_uptodate(rbio, bio);
2682
2683         bio_put(bio);
2684
2685         if (!atomic_dec_and_test(&rbio->stripes_pending))
2686                 return;
2687
2688         /*
2689          * this will normally call finish_rmw to start our write
2690          * but if there are any failed stripes we'll reconstruct
2691          * from parity first
2692          */
2693         validate_rbio_for_parity_scrub(rbio);
2694 }
2695
2696 static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
2697 {
2698         int bios_to_read = 0;
2699         struct bio_list bio_list;
2700         int ret;
2701         int sectornr;
2702         int stripe;
2703         struct bio *bio;
2704
2705         bio_list_init(&bio_list);
2706
2707         ret = alloc_rbio_essential_pages(rbio);
2708         if (ret)
2709                 goto cleanup;
2710
2711         atomic_set(&rbio->error, 0);
2712         /*
2713          * build a list of bios to read all the missing parts of this
2714          * stripe
2715          */
2716         for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
2717                 for_each_set_bit(sectornr , rbio->dbitmap, rbio->stripe_nsectors) {
2718                         struct sector_ptr *sector;
2719                         /*
2720                          * We want to find all the sectors missing from the
2721                          * rbio and read them from the disk.  If * sector_in_rbio()
2722                          * finds a sector in the bio list we don't need to read
2723                          * it off the stripe.
2724                          */
2725                         sector = sector_in_rbio(rbio, stripe, sectornr, 1);
2726                         if (sector)
2727                                 continue;
2728
2729                         sector = rbio_stripe_sector(rbio, stripe, sectornr);
2730                         /*
2731                          * The bio cache may have handed us an uptodate sector.
2732                          * If so, be happy and use it.
2733                          */
2734                         if (sector->uptodate)
2735                                 continue;
2736
2737                         ret = rbio_add_io_sector(rbio, &bio_list, sector,
2738                                                  stripe, sectornr, rbio->stripe_len,
2739                                                  REQ_OP_READ);
2740                         if (ret)
2741                                 goto cleanup;
2742                 }
2743         }
2744
2745         bios_to_read = bio_list_size(&bio_list);
2746         if (!bios_to_read) {
2747                 /*
2748                  * this can happen if others have merged with
2749                  * us, it means there is nothing left to read.
2750                  * But if there are missing devices it may not be
2751                  * safe to do the full stripe write yet.
2752                  */
2753                 goto finish;
2754         }
2755
2756         /*
2757          * The bioc may be freed once we submit the last bio. Make sure not to
2758          * touch it after that.
2759          */
2760         atomic_set(&rbio->stripes_pending, bios_to_read);
2761         while ((bio = bio_list_pop(&bio_list))) {
2762                 bio->bi_end_io = raid56_parity_scrub_end_io;
2763
2764                 btrfs_bio_wq_end_io(rbio->bioc->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
2765
2766                 submit_bio(bio);
2767         }
2768         /* the actual write will happen once the reads are done */
2769         return;
2770
2771 cleanup:
2772         rbio_orig_end_io(rbio, BLK_STS_IOERR);
2773
2774         while ((bio = bio_list_pop(&bio_list)))
2775                 bio_put(bio);
2776
2777         return;
2778
2779 finish:
2780         validate_rbio_for_parity_scrub(rbio);
2781 }
2782
2783 static void scrub_parity_work(struct work_struct *work)
2784 {
2785         struct btrfs_raid_bio *rbio;
2786
2787         rbio = container_of(work, struct btrfs_raid_bio, work);
2788         raid56_parity_scrub_stripe(rbio);
2789 }
2790
2791 void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
2792 {
2793         if (!lock_stripe_add(rbio))
2794                 start_async_work(rbio, scrub_parity_work);
2795 }
2796
2797 /* The following code is used for dev replace of a missing RAID 5/6 device. */
2798
2799 struct btrfs_raid_bio *
2800 raid56_alloc_missing_rbio(struct bio *bio, struct btrfs_io_context *bioc,
2801                           u64 length)
2802 {
2803         struct btrfs_fs_info *fs_info = bioc->fs_info;
2804         struct btrfs_raid_bio *rbio;
2805
2806         rbio = alloc_rbio(fs_info, bioc, length);
2807         if (IS_ERR(rbio))
2808                 return NULL;
2809
2810         rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
2811         bio_list_add(&rbio->bio_list, bio);
2812         /*
2813          * This is a special bio which is used to hold the completion handler
2814          * and make the scrub rbio is similar to the other types
2815          */
2816         ASSERT(!bio->bi_iter.bi_size);
2817
2818         rbio->faila = find_logical_bio_stripe(rbio, bio);
2819         if (rbio->faila == -1) {
2820                 BUG();
2821                 kfree(rbio);
2822                 return NULL;
2823         }
2824
2825         /*
2826          * When we get bioc, we have already increased bio_counter, record it
2827          * so we can free it at rbio_orig_end_io()
2828          */
2829         rbio->generic_bio_cnt = 1;
2830
2831         return rbio;
2832 }
2833
2834 void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
2835 {
2836         if (!lock_stripe_add(rbio))
2837                 start_async_work(rbio, read_rebuild_work);
2838 }