fs/btrfs/raid56.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (C) 2012 Fusion-io  All rights reserved.
   4  * Copyright (C) 2012 Intel Corp. All rights reserved.
   5  */
   6
   7 #include <linux/sched.h>
   8 #include <linux/bio.h>
   9 #include <linux/slab.h>
  10 #include <linux/blkdev.h>
  11 #include <linux/raid/pq.h>
  12 #include <linux/hash.h>
  13 #include <linux/list_sort.h>
  14 #include <linux/raid/xor.h>
  15 #include <linux/mm.h>
  16 #include "messages.h"
  17 #include "ctree.h"
  18 #include "disk-io.h"
  19 #include "volumes.h"
  20 #include "raid56.h"
  21 #include "async-thread.h"
  22 #include "file-item.h"
  23 #include "btrfs_inode.h"
  24
  25 /* set when additional merges to this rbio are not allowed */
  26 #define RBIO_RMW_LOCKED_BIT     1
  27
  28 /*
  29  * set when this rbio is sitting in the hash, but it is just a cache
  30  * of past RMW
  31  */
  32 #define RBIO_CACHE_BIT          2
  33
  34 /*
  35  * set when it is safe to trust the stripe_pages for caching
  36  */
  37 #define RBIO_CACHE_READY_BIT    3
  38
  39 #define RBIO_CACHE_SIZE 1024
  40
  41 #define BTRFS_STRIPE_HASH_TABLE_BITS                            11
  42
  43 /* Used by the raid56 code to lock stripes for read/modify/write */
  44 struct btrfs_stripe_hash {
  45         struct list_head hash_list;
  46         spinlock_t lock;
  47 };
  48
  49 /* Used by the raid56 code to lock stripes for read/modify/write */
  50 struct btrfs_stripe_hash_table {
  51         struct list_head stripe_cache;
  52         spinlock_t cache_lock;
  53         int cache_size;
  54         struct btrfs_stripe_hash table[];
  55 };
  56
  57 /*
  58  * A bvec like structure to present a sector inside a page.
  59  *
  60  * Unlike bvec we don't need bvlen, as it's fixed to sectorsize.
  61  */
  62 struct sector_ptr {
  63         struct page *page;
  64         unsigned int pgoff:24;
  65         unsigned int uptodate:8;
  66 };
  67
  68 static void rmw_rbio_work(struct work_struct *work);
  69 static void rmw_rbio_work_locked(struct work_struct *work);
  70 static void index_rbio_pages(struct btrfs_raid_bio *rbio);
  71 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
  72
  73 static int finish_parity_scrub(struct btrfs_raid_bio *rbio);
  74 static void scrub_rbio_work_locked(struct work_struct *work);
  75
  76 static void free_raid_bio_pointers(struct btrfs_raid_bio *rbio)
  77 {
  78         bitmap_free(rbio->error_bitmap);
  79         kfree(rbio->stripe_pages);
  80         kfree(rbio->bio_sectors);
  81         kfree(rbio->stripe_sectors);
  82         kfree(rbio->finish_pointers);
  83 }
  84
  85 static void free_raid_bio(struct btrfs_raid_bio *rbio)
  86 {
  87         int i;
  88
  89         if (!refcount_dec_and_test(&rbio->refs))
  90                 return;
  91
  92         WARN_ON(!list_empty(&rbio->stripe_cache));
  93         WARN_ON(!list_empty(&rbio->hash_list));
  94         WARN_ON(!bio_list_empty(&rbio->bio_list));
  95
  96         for (i = 0; i < rbio->nr_pages; i++) {
  97                 if (rbio->stripe_pages[i]) {
  98                         __free_page(rbio->stripe_pages[i]);
  99                         rbio->stripe_pages[i] = NULL;
 100                 }
 101         }
 102
 103         btrfs_put_bioc(rbio->bioc);
 104         free_raid_bio_pointers(rbio);
 105         kfree(rbio);
 106 }
 107
 108 static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
 109 {
 110         INIT_WORK(&rbio->work, work_func);
 111         queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
 112 }
 113
 114 /*
 115  * the stripe hash table is used for locking, and to collect
 116  * bios in hopes of making a full stripe
 117  */
 118 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
 119 {
 120         struct btrfs_stripe_hash_table *table;
 121         struct btrfs_stripe_hash_table *x;
 122         struct btrfs_stripe_hash *cur;
 123         struct btrfs_stripe_hash *h;
 124         int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
 125         int i;
 126
 127         if (info->stripe_hash_table)
 128                 return 0;
 129
 130         /*
 131          * The table is large, starting with order 4 and can go as high as
 132          * order 7 in case lock debugging is turned on.
 133          *
 134          * Try harder to allocate and fallback to vmalloc to lower the chance
 135          * of a failing mount.
 136          */
 137         table = kvzalloc(struct_size(table, table, num_entries), GFP_KERNEL);
 138         if (!table)
 139                 return -ENOMEM;
 140
 141         spin_lock_init(&table->cache_lock);
 142         INIT_LIST_HEAD(&table->stripe_cache);
 143
 144         h = table->table;
 145
 146         for (i = 0; i < num_entries; i++) {
 147                 cur = h + i;
 148                 INIT_LIST_HEAD(&cur->hash_list);
 149                 spin_lock_init(&cur->lock);
 150         }
 151
 152         x = cmpxchg(&info->stripe_hash_table, NULL, table);
 153         kvfree(x);
 154         return 0;
 155 }
 156
 157 /*
 158  * caching an rbio means to copy anything from the
 159  * bio_sectors array into the stripe_pages array.  We
 160  * use the page uptodate bit in the stripe cache array
 161  * to indicate if it has valid data
 162  *
 163  * once the caching is done, we set the cache ready
 164  * bit.
 165  */
 166 static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
 167 {
 168         int i;
 169         int ret;
 170
 171         ret = alloc_rbio_pages(rbio);
 172         if (ret)
 173                 return;
 174
 175         for (i = 0; i < rbio->nr_sectors; i++) {
 176                 /* Some range not covered by bio (partial write), skip it */
 177                 if (!rbio->bio_sectors[i].page) {
 178                         /*
 179                          * Even if the sector is not covered by bio, if it is
 180                          * a data sector it should still be uptodate as it is
 181                          * read from disk.
 182                          */
 183                         if (i < rbio->nr_data * rbio->stripe_nsectors)
 184                                 ASSERT(rbio->stripe_sectors[i].uptodate);
 185                         continue;
 186                 }
 187
 188                 ASSERT(rbio->stripe_sectors[i].page);
 189                 memcpy_page(rbio->stripe_sectors[i].page,
 190                             rbio->stripe_sectors[i].pgoff,
 191                             rbio->bio_sectors[i].page,
 192                             rbio->bio_sectors[i].pgoff,
 193                             rbio->bioc->fs_info->sectorsize);
 194                 rbio->stripe_sectors[i].uptodate = 1;
 195         }
 196         set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
 197 }
 198
 199 /*
 200  * we hash on the first logical address of the stripe
 201  */
 202 static int rbio_bucket(struct btrfs_raid_bio *rbio)
 203 {
 204         u64 num = rbio->bioc->full_stripe_logical;
 205
 206         /*
 207          * we shift down quite a bit.  We're using byte
 208          * addressing, and most of the lower bits are zeros.
 209          * This tends to upset hash_64, and it consistently
 210          * returns just one or two different values.
 211          *
 212          * shifting off the lower bits fixes things.
 213          */
 214         return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
 215 }
 216
 217 static bool full_page_sectors_uptodate(struct btrfs_raid_bio *rbio,
 218                                        unsigned int page_nr)
 219 {
 220         const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
 221         const u32 sectors_per_page = PAGE_SIZE / sectorsize;
 222         int i;
 223
 224         ASSERT(page_nr < rbio->nr_pages);
 225
 226         for (i = sectors_per_page * page_nr;
 227              i < sectors_per_page * page_nr + sectors_per_page;
 228              i++) {
 229                 if (!rbio->stripe_sectors[i].uptodate)
 230                         return false;
 231         }
 232         return true;
 233 }
 234
 235 /*
 236  * Update the stripe_sectors[] array to use correct page and pgoff
 237  *
 238  * Should be called every time any page pointer in stripes_pages[] got modified.
 239  */
 240 static void index_stripe_sectors(struct btrfs_raid_bio *rbio)
 241 {
 242         const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
 243         u32 offset;
 244         int i;
 245
 246         for (i = 0, offset = 0; i < rbio->nr_sectors; i++, offset += sectorsize) {
 247                 int page_index = offset >> PAGE_SHIFT;
 248
 249                 ASSERT(page_index < rbio->nr_pages);
 250                 rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index];
 251                 rbio->stripe_sectors[i].pgoff = offset_in_page(offset);
 252         }
 253 }
 254
 255 static void steal_rbio_page(struct btrfs_raid_bio *src,
 256                             struct btrfs_raid_bio *dest, int page_nr)
 257 {
 258         const u32 sectorsize = src->bioc->fs_info->sectorsize;
 259         const u32 sectors_per_page = PAGE_SIZE / sectorsize;
 260         int i;
 261
 262         if (dest->stripe_pages[page_nr])
 263                 __free_page(dest->stripe_pages[page_nr]);
 264         dest->stripe_pages[page_nr] = src->stripe_pages[page_nr];
 265         src->stripe_pages[page_nr] = NULL;
 266
 267         /* Also update the sector->uptodate bits. */
 268         for (i = sectors_per_page * page_nr;
 269              i < sectors_per_page * page_nr + sectors_per_page; i++)
 270                 dest->stripe_sectors[i].uptodate = true;
 271 }
 272
 273 static bool is_data_stripe_page(struct btrfs_raid_bio *rbio, int page_nr)
 274 {
 275         const int sector_nr = (page_nr << PAGE_SHIFT) >>
 276                               rbio->bioc->fs_info->sectorsize_bits;
 277
 278         /*
 279          * We have ensured PAGE_SIZE is aligned with sectorsize, thus
 280          * we won't have a page which is half data half parity.
 281          *
 282          * Thus if the first sector of the page belongs to data stripes, then
 283          * the full page belongs to data stripes.
 284          */
 285         return (sector_nr < rbio->nr_data * rbio->stripe_nsectors);
 286 }
 287
 288 /*
 289  * Stealing an rbio means taking all the uptodate pages from the stripe array
 290  * in the source rbio and putting them into the destination rbio.
 291  *
 292  * This will also update the involved stripe_sectors[] which are referring to
 293  * the old pages.
 294  */
 295 static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
 296 {
 297         int i;
 298
 299         if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
 300                 return;
 301
 302         for (i = 0; i < dest->nr_pages; i++) {
 303                 struct page *p = src->stripe_pages[i];
 304
 305                 /*
 306                  * We don't need to steal P/Q pages as they will always be
 307                  * regenerated for RMW or full write anyway.
 308                  */
 309                 if (!is_data_stripe_page(src, i))
 310                         continue;
 311
 312                 /*
 313                  * If @src already has RBIO_CACHE_READY_BIT, it should have
 314                  * all data stripe pages present and uptodate.
 315                  */
 316                 ASSERT(p);
 317                 ASSERT(full_page_sectors_uptodate(src, i));
 318                 steal_rbio_page(src, dest, i);
 319         }
 320         index_stripe_sectors(dest);
 321         index_stripe_sectors(src);
 322 }
 323
 324 /*
 325  * merging means we take the bio_list from the victim and
 326  * splice it into the destination.  The victim should
 327  * be discarded afterwards.
 328  *
 329  * must be called with dest->rbio_list_lock held
 330  */
 331 static void merge_rbio(struct btrfs_raid_bio *dest,
 332                        struct btrfs_raid_bio *victim)
 333 {
 334         bio_list_merge(&dest->bio_list, &victim->bio_list);
 335         dest->bio_list_bytes += victim->bio_list_bytes;
 336         /* Also inherit the bitmaps from @victim. */
 337         bitmap_or(&dest->dbitmap, &victim->dbitmap, &dest->dbitmap,
 338                   dest->stripe_nsectors);
 339         bio_list_init(&victim->bio_list);
 340 }
 341
 342 /*
 343  * used to prune items that are in the cache.  The caller
 344  * must hold the hash table lock.
 345  */
 346 static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
 347 {
 348         int bucket = rbio_bucket(rbio);
 349         struct btrfs_stripe_hash_table *table;
 350         struct btrfs_stripe_hash *h;
 351         int freeit = 0;
 352
 353         /*
 354          * check the bit again under the hash table lock.
 355          */
 356         if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
 357                 return;
 358
 359         table = rbio->bioc->fs_info->stripe_hash_table;
 360         h = table->table + bucket;
 361
 362         /* hold the lock for the bucket because we may be
 363          * removing it from the hash table
 364          */
 365         spin_lock(&h->lock);
 366
 367         /*
 368          * hold the lock for the bio list because we need
 369          * to make sure the bio list is empty
 370          */
 371         spin_lock(&rbio->bio_list_lock);
 372
 373         if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
 374                 list_del_init(&rbio->stripe_cache);
 375                 table->cache_size -= 1;
 376                 freeit = 1;
 377
 378                 /* if the bio list isn't empty, this rbio is
 379                  * still involved in an IO.  We take it out
 380                  * of the cache list, and drop the ref that
 381                  * was held for the list.
 382                  *
 383                  * If the bio_list was empty, we also remove
 384                  * the rbio from the hash_table, and drop
 385                  * the corresponding ref
 386                  */
 387                 if (bio_list_empty(&rbio->bio_list)) {
 388                         if (!list_empty(&rbio->hash_list)) {
 389                                 list_del_init(&rbio->hash_list);
 390                                 refcount_dec(&rbio->refs);
 391                                 BUG_ON(!list_empty(&rbio->plug_list));
 392                         }
 393                 }
 394         }
 395
 396         spin_unlock(&rbio->bio_list_lock);
 397         spin_unlock(&h->lock);
 398
 399         if (freeit)
 400                 free_raid_bio(rbio);
 401 }
 402
 403 /*
 404  * prune a given rbio from the cache
 405  */
 406 static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
 407 {
 408         struct btrfs_stripe_hash_table *table;
 409
 410         if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
 411                 return;
 412
 413         table = rbio->bioc->fs_info->stripe_hash_table;
 414
 415         spin_lock(&table->cache_lock);
 416         __remove_rbio_from_cache(rbio);
 417         spin_unlock(&table->cache_lock);
 418 }
 419
 420 /*
 421  * remove everything in the cache
 422  */
 423 static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
 424 {
 425         struct btrfs_stripe_hash_table *table;
 426         struct btrfs_raid_bio *rbio;
 427
 428         table = info->stripe_hash_table;
 429
 430         spin_lock(&table->cache_lock);
 431         while (!list_empty(&table->stripe_cache)) {
 432                 rbio = list_entry(table->stripe_cache.next,
 433                                   struct btrfs_raid_bio,
 434                                   stripe_cache);
 435                 __remove_rbio_from_cache(rbio);
 436         }
 437         spin_unlock(&table->cache_lock);
 438 }
 439
 440 /*
 441  * remove all cached entries and free the hash table
 442  * used by unmount
 443  */
 444 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
 445 {
 446         if (!info->stripe_hash_table)
 447                 return;
 448         btrfs_clear_rbio_cache(info);
 449         kvfree(info->stripe_hash_table);
 450         info->stripe_hash_table = NULL;
 451 }
 452
 453 /*
 454  * insert an rbio into the stripe cache.  It
 455  * must have already been prepared by calling
 456  * cache_rbio_pages
 457  *
 458  * If this rbio was already cached, it gets
 459  * moved to the front of the lru.
 460  *
 461  * If the size of the rbio cache is too big, we
 462  * prune an item.
 463  */
 464 static void cache_rbio(struct btrfs_raid_bio *rbio)
 465 {
 466         struct btrfs_stripe_hash_table *table;
 467
 468         if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
 469                 return;
 470
 471         table = rbio->bioc->fs_info->stripe_hash_table;
 472
 473         spin_lock(&table->cache_lock);
 474         spin_lock(&rbio->bio_list_lock);
 475
 476         /* bump our ref if we were not in the list before */
 477         if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
 478                 refcount_inc(&rbio->refs);
 479
 480         if (!list_empty(&rbio->stripe_cache)){
 481                 list_move(&rbio->stripe_cache, &table->stripe_cache);
 482         } else {
 483                 list_add(&rbio->stripe_cache, &table->stripe_cache);
 484                 table->cache_size += 1;
 485         }
 486
 487         spin_unlock(&rbio->bio_list_lock);
 488
 489         if (table->cache_size > RBIO_CACHE_SIZE) {
 490                 struct btrfs_raid_bio *found;
 491
 492                 found = list_entry(table->stripe_cache.prev,
 493                                   struct btrfs_raid_bio,
 494                                   stripe_cache);
 495
 496                 if (found != rbio)
 497                         __remove_rbio_from_cache(found);
 498         }
 499
 500         spin_unlock(&table->cache_lock);
 501 }
 502
 503 /*
 504  * helper function to run the xor_blocks api.  It is only
 505  * able to do MAX_XOR_BLOCKS at a time, so we need to
 506  * loop through.
 507  */
 508 static void run_xor(void **pages, int src_cnt, ssize_t len)
 509 {
 510         int src_off = 0;
 511         int xor_src_cnt = 0;
 512         void *dest = pages[src_cnt];
 513
 514         while(src_cnt > 0) {
 515                 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
 516                 xor_blocks(xor_src_cnt, len, dest, pages + src_off);
 517
 518                 src_cnt -= xor_src_cnt;
 519                 src_off += xor_src_cnt;
 520         }
 521 }
 522
 523 /*
 524  * Returns true if the bio list inside this rbio covers an entire stripe (no
 525  * rmw required).
 526  */
 527 static int rbio_is_full(struct btrfs_raid_bio *rbio)
 528 {
 529         unsigned long size = rbio->bio_list_bytes;
 530         int ret = 1;
 531
 532         spin_lock(&rbio->bio_list_lock);
 533         if (size != rbio->nr_data * BTRFS_STRIPE_LEN)
 534                 ret = 0;
 535         BUG_ON(size > rbio->nr_data * BTRFS_STRIPE_LEN);
 536         spin_unlock(&rbio->bio_list_lock);
 537
 538         return ret;
 539 }
 540
 541 /*
 542  * returns 1 if it is safe to merge two rbios together.
 543  * The merging is safe if the two rbios correspond to
 544  * the same stripe and if they are both going in the same
 545  * direction (read vs write), and if neither one is
 546  * locked for final IO
 547  *
 548  * The caller is responsible for locking such that
 549  * rmw_locked is safe to test
 550  */
 551 static int rbio_can_merge(struct btrfs_raid_bio *last,
 552                           struct btrfs_raid_bio *cur)
 553 {
 554         if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
 555             test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
 556                 return 0;
 557
 558         /*
 559          * we can't merge with cached rbios, since the
 560          * idea is that when we merge the destination
 561          * rbio is going to run our IO for us.  We can
 562          * steal from cached rbios though, other functions
 563          * handle that.
 564          */
 565         if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
 566             test_bit(RBIO_CACHE_BIT, &cur->flags))
 567                 return 0;
 568
 569         if (last->bioc->full_stripe_logical != cur->bioc->full_stripe_logical)
 570                 return 0;
 571
 572         /* we can't merge with different operations */
 573         if (last->operation != cur->operation)
 574                 return 0;
 575         /*
 576          * We've need read the full stripe from the drive.
 577          * check and repair the parity and write the new results.
 578          *
 579          * We're not allowed to add any new bios to the
 580          * bio list here, anyone else that wants to
 581          * change this stripe needs to do their own rmw.
 582          */
 583         if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
 584                 return 0;
 585
 586         if (last->operation == BTRFS_RBIO_READ_REBUILD)
 587                 return 0;
 588
 589         return 1;
 590 }
 591
 592 static unsigned int rbio_stripe_sector_index(const struct btrfs_raid_bio *rbio,
 593                                              unsigned int stripe_nr,
 594                                              unsigned int sector_nr)
 595 {
 596         ASSERT(stripe_nr < rbio->real_stripes);
 597         ASSERT(sector_nr < rbio->stripe_nsectors);
 598
 599         return stripe_nr * rbio->stripe_nsectors + sector_nr;
 600 }
 601
 602 /* Return a sector from rbio->stripe_sectors, not from the bio list */
 603 static struct sector_ptr *rbio_stripe_sector(const struct btrfs_raid_bio *rbio,
 604                                              unsigned int stripe_nr,
 605                                              unsigned int sector_nr)
 606 {
 607         return &rbio->stripe_sectors[rbio_stripe_sector_index(rbio, stripe_nr,
 608                                                               sector_nr)];
 609 }
 610
 611 /* Grab a sector inside P stripe */
 612 static struct sector_ptr *rbio_pstripe_sector(const struct btrfs_raid_bio *rbio,
 613                                               unsigned int sector_nr)
 614 {
 615         return rbio_stripe_sector(rbio, rbio->nr_data, sector_nr);
 616 }
 617
 618 /* Grab a sector inside Q stripe, return NULL if not RAID6 */
 619 static struct sector_ptr *rbio_qstripe_sector(const struct btrfs_raid_bio *rbio,
 620                                               unsigned int sector_nr)
 621 {
 622         if (rbio->nr_data + 1 == rbio->real_stripes)
 623                 return NULL;
 624         return rbio_stripe_sector(rbio, rbio->nr_data + 1, sector_nr);
 625 }
 626
 627 /*
 628  * The first stripe in the table for a logical address
 629  * has the lock.  rbios are added in one of three ways:
 630  *
 631  * 1) Nobody has the stripe locked yet.  The rbio is given
 632  * the lock and 0 is returned.  The caller must start the IO
 633  * themselves.
 634  *
 635  * 2) Someone has the stripe locked, but we're able to merge
 636  * with the lock owner.  The rbio is freed and the IO will
 637  * start automatically along with the existing rbio.  1 is returned.
 638  *
 639  * 3) Someone has the stripe locked, but we're not able to merge.
 640  * The rbio is added to the lock owner's plug list, or merged into
 641  * an rbio already on the plug list.  When the lock owner unlocks,
 642  * the next rbio on the list is run and the IO is started automatically.
 643  * 1 is returned
 644  *
 645  * If we return 0, the caller still owns the rbio and must continue with
 646  * IO submission.  If we return 1, the caller must assume the rbio has
 647  * already been freed.
 648  */
 649 static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
 650 {
 651         struct btrfs_stripe_hash *h;
 652         struct btrfs_raid_bio *cur;
 653         struct btrfs_raid_bio *pending;
 654         struct btrfs_raid_bio *freeit = NULL;
 655         struct btrfs_raid_bio *cache_drop = NULL;
 656         int ret = 0;
 657
 658         h = rbio->bioc->fs_info->stripe_hash_table->table + rbio_bucket(rbio);
 659
 660         spin_lock(&h->lock);
 661         list_for_each_entry(cur, &h->hash_list, hash_list) {
 662                 if (cur->bioc->full_stripe_logical != rbio->bioc->full_stripe_logical)
 663                         continue;
 664
 665                 spin_lock(&cur->bio_list_lock);
 666
 667                 /* Can we steal this cached rbio's pages? */
 668                 if (bio_list_empty(&cur->bio_list) &&
 669                     list_empty(&cur->plug_list) &&
 670                     test_bit(RBIO_CACHE_BIT, &cur->flags) &&
 671                     !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
 672                         list_del_init(&cur->hash_list);
 673                         refcount_dec(&cur->refs);
 674
 675                         steal_rbio(cur, rbio);
 676                         cache_drop = cur;
 677                         spin_unlock(&cur->bio_list_lock);
 678
 679                         goto lockit;
 680                 }
 681
 682                 /* Can we merge into the lock owner? */
 683                 if (rbio_can_merge(cur, rbio)) {
 684                         merge_rbio(cur, rbio);
 685                         spin_unlock(&cur->bio_list_lock);
 686                         freeit = rbio;
 687                         ret = 1;
 688                         goto out;
 689                 }
 690
 691
 692                 /*
 693                  * We couldn't merge with the running rbio, see if we can merge
 694                  * with the pending ones.  We don't have to check for rmw_locked
 695                  * because there is no way they are inside finish_rmw right now
 696                  */
 697                 list_for_each_entry(pending, &cur->plug_list, plug_list) {
 698                         if (rbio_can_merge(pending, rbio)) {
 699                                 merge_rbio(pending, rbio);
 700                                 spin_unlock(&cur->bio_list_lock);
 701                                 freeit = rbio;
 702                                 ret = 1;
 703                                 goto out;
 704                         }
 705                 }
 706
 707                 /*
 708                  * No merging, put us on the tail of the plug list, our rbio
 709                  * will be started with the currently running rbio unlocks
 710                  */
 711                 list_add_tail(&rbio->plug_list, &cur->plug_list);
 712                 spin_unlock(&cur->bio_list_lock);
 713                 ret = 1;
 714                 goto out;
 715         }
 716 lockit:
 717         refcount_inc(&rbio->refs);
 718         list_add(&rbio->hash_list, &h->hash_list);
 719 out:
 720         spin_unlock(&h->lock);
 721         if (cache_drop)
 722                 remove_rbio_from_cache(cache_drop);
 723         if (freeit)
 724                 free_raid_bio(freeit);
 725         return ret;
 726 }
 727
 728 static void recover_rbio_work_locked(struct work_struct *work);
 729
 730 /*
 731  * called as rmw or parity rebuild is completed.  If the plug list has more
 732  * rbios waiting for this stripe, the next one on the list will be started
 733  */
 734 static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
 735 {
 736         int bucket;
 737         struct btrfs_stripe_hash *h;
 738         int keep_cache = 0;
 739
 740         bucket = rbio_bucket(rbio);
 741         h = rbio->bioc->fs_info->stripe_hash_table->table + bucket;
 742
 743         if (list_empty(&rbio->plug_list))
 744                 cache_rbio(rbio);
 745
 746         spin_lock(&h->lock);
 747         spin_lock(&rbio->bio_list_lock);
 748
 749         if (!list_empty(&rbio->hash_list)) {
 750                 /*
 751                  * if we're still cached and there is no other IO
 752                  * to perform, just leave this rbio here for others
 753                  * to steal from later
 754                  */
 755                 if (list_empty(&rbio->plug_list) &&
 756                     test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
 757                         keep_cache = 1;
 758                         clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
 759                         BUG_ON(!bio_list_empty(&rbio->bio_list));
 760                         goto done;
 761                 }
 762
 763                 list_del_init(&rbio->hash_list);
 764                 refcount_dec(&rbio->refs);
 765
 766                 /*
 767                  * we use the plug list to hold all the rbios
 768                  * waiting for the chance to lock this stripe.
 769                  * hand the lock over to one of them.
 770                  */
 771                 if (!list_empty(&rbio->plug_list)) {
 772                         struct btrfs_raid_bio *next;
 773                         struct list_head *head = rbio->plug_list.next;
 774
 775                         next = list_entry(head, struct btrfs_raid_bio,
 776                                           plug_list);
 777
 778                         list_del_init(&rbio->plug_list);
 779
 780                         list_add(&next->hash_list, &h->hash_list);
 781                         refcount_inc(&next->refs);
 782                         spin_unlock(&rbio->bio_list_lock);
 783                         spin_unlock(&h->lock);
 784
 785                         if (next->operation == BTRFS_RBIO_READ_REBUILD) {
 786                                 start_async_work(next, recover_rbio_work_locked);
 787                         } else if (next->operation == BTRFS_RBIO_WRITE) {
 788                                 steal_rbio(rbio, next);
 789                                 start_async_work(next, rmw_rbio_work_locked);
 790                         } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
 791                                 steal_rbio(rbio, next);
 792                                 start_async_work(next, scrub_rbio_work_locked);
 793                         }
 794
 795                         goto done_nolock;
 796                 }
 797         }
 798 done:
 799         spin_unlock(&rbio->bio_list_lock);
 800         spin_unlock(&h->lock);
 801
 802 done_nolock:
 803         if (!keep_cache)
 804                 remove_rbio_from_cache(rbio);
 805 }
 806
 807 static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
 808 {
 809         struct bio *next;
 810
 811         while (cur) {
 812                 next = cur->bi_next;
 813                 cur->bi_next = NULL;
 814                 cur->bi_status = err;
 815                 bio_endio(cur);
 816                 cur = next;
 817         }
 818 }
 819
 820 /*
 821  * this frees the rbio and runs through all the bios in the
 822  * bio_list and calls end_io on them
 823  */
 824 static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
 825 {
 826         struct bio *cur = bio_list_get(&rbio->bio_list);
 827         struct bio *extra;
 828
 829         kfree(rbio->csum_buf);
 830         bitmap_free(rbio->csum_bitmap);
 831         rbio->csum_buf = NULL;
 832         rbio->csum_bitmap = NULL;
 833
 834         /*
 835          * Clear the data bitmap, as the rbio may be cached for later usage.
 836          * do this before before unlock_stripe() so there will be no new bio
 837          * for this bio.
 838          */
 839         bitmap_clear(&rbio->dbitmap, 0, rbio->stripe_nsectors);
 840
 841         /*
 842          * At this moment, rbio->bio_list is empty, however since rbio does not
 843          * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
 844          * hash list, rbio may be merged with others so that rbio->bio_list
 845          * becomes non-empty.
 846          * Once unlock_stripe() is done, rbio->bio_list will not be updated any
 847          * more and we can call bio_endio() on all queued bios.
 848          */
 849         unlock_stripe(rbio);
 850         extra = bio_list_get(&rbio->bio_list);
 851         free_raid_bio(rbio);
 852
 853         rbio_endio_bio_list(cur, err);
 854         if (extra)
 855                 rbio_endio_bio_list(extra, err);
 856 }
 857
 858 /*
 859  * Get a sector pointer specified by its @stripe_nr and @sector_nr.
 860  *
 861  * @rbio:               The raid bio
 862  * @stripe_nr:          Stripe number, valid range [0, real_stripe)
 863  * @sector_nr:          Sector number inside the stripe,
 864  *                      valid range [0, stripe_nsectors)
 865  * @bio_list_only:      Whether to use sectors inside the bio list only.
 866  *
 867  * The read/modify/write code wants to reuse the original bio page as much
 868  * as possible, and only use stripe_sectors as fallback.
 869  */
 870 static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio,
 871                                          int stripe_nr, int sector_nr,
 872                                          bool bio_list_only)
 873 {
 874         struct sector_ptr *sector;
 875         int index;
 876
 877         ASSERT(stripe_nr >= 0 && stripe_nr < rbio->real_stripes);
 878         ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
 879
 880         index = stripe_nr * rbio->stripe_nsectors + sector_nr;
 881         ASSERT(index >= 0 && index < rbio->nr_sectors);
 882
 883         spin_lock(&rbio->bio_list_lock);
 884         sector = &rbio->bio_sectors[index];
 885         if (sector->page || bio_list_only) {
 886                 /* Don't return sector without a valid page pointer */
 887                 if (!sector->page)
 888                         sector = NULL;
 889                 spin_unlock(&rbio->bio_list_lock);
 890                 return sector;
 891         }
 892         spin_unlock(&rbio->bio_list_lock);
 893
 894         return &rbio->stripe_sectors[index];
 895 }
 896
 897 /*
 898  * allocation and initial setup for the btrfs_raid_bio.  Not
 899  * this does not allocate any pages for rbio->pages.
 900  */
 901 static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
 902                                          struct btrfs_io_context *bioc)
 903 {
 904         const unsigned int real_stripes = bioc->num_stripes - bioc->replace_nr_stripes;
 905         const unsigned int stripe_npages = BTRFS_STRIPE_LEN >> PAGE_SHIFT;
 906         const unsigned int num_pages = stripe_npages * real_stripes;
 907         const unsigned int stripe_nsectors =
 908                 BTRFS_STRIPE_LEN >> fs_info->sectorsize_bits;
 909         const unsigned int num_sectors = stripe_nsectors * real_stripes;
 910         struct btrfs_raid_bio *rbio;
 911
 912         /* PAGE_SIZE must also be aligned to sectorsize for subpage support */
 913         ASSERT(IS_ALIGNED(PAGE_SIZE, fs_info->sectorsize));
 914         /*
 915          * Our current stripe len should be fixed to 64k thus stripe_nsectors
 916          * (at most 16) should be no larger than BITS_PER_LONG.
 917          */
 918         ASSERT(stripe_nsectors <= BITS_PER_LONG);
 919
 920         /*
 921          * Real stripes must be between 2 (2 disks RAID5, aka RAID1) and 256
 922          * (limited by u8).
 923          */
 924         ASSERT(real_stripes >= 2);
 925         ASSERT(real_stripes <= U8_MAX);
 926
 927         rbio = kzalloc(sizeof(*rbio), GFP_NOFS);
 928         if (!rbio)
 929                 return ERR_PTR(-ENOMEM);
 930         rbio->stripe_pages = kcalloc(num_pages, sizeof(struct page *),
 931                                      GFP_NOFS);
 932         rbio->bio_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
 933                                     GFP_NOFS);
 934         rbio->stripe_sectors = kcalloc(num_sectors, sizeof(struct sector_ptr),
 935                                        GFP_NOFS);
 936         rbio->finish_pointers = kcalloc(real_stripes, sizeof(void *), GFP_NOFS);
 937         rbio->error_bitmap = bitmap_zalloc(num_sectors, GFP_NOFS);
 938
 939         if (!rbio->stripe_pages || !rbio->bio_sectors || !rbio->stripe_sectors ||
 940             !rbio->finish_pointers || !rbio->error_bitmap) {
 941                 free_raid_bio_pointers(rbio);
 942                 kfree(rbio);
 943                 return ERR_PTR(-ENOMEM);
 944         }
 945
 946         bio_list_init(&rbio->bio_list);
 947         init_waitqueue_head(&rbio->io_wait);
 948         INIT_LIST_HEAD(&rbio->plug_list);
 949         spin_lock_init(&rbio->bio_list_lock);
 950         INIT_LIST_HEAD(&rbio->stripe_cache);
 951         INIT_LIST_HEAD(&rbio->hash_list);
 952         btrfs_get_bioc(bioc);
 953         rbio->bioc = bioc;
 954         rbio->nr_pages = num_pages;
 955         rbio->nr_sectors = num_sectors;
 956         rbio->real_stripes = real_stripes;
 957         rbio->stripe_npages = stripe_npages;
 958         rbio->stripe_nsectors = stripe_nsectors;
 959         refcount_set(&rbio->refs, 1);
 960         atomic_set(&rbio->stripes_pending, 0);
 961
 962         ASSERT(btrfs_nr_parity_stripes(bioc->map_type));
 963         rbio->nr_data = real_stripes - btrfs_nr_parity_stripes(bioc->map_type);
 964         ASSERT(rbio->nr_data > 0);
 965
 966         return rbio;
 967 }
 968
 969 /* allocate pages for all the stripes in the bio, including parity */
 970 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
 971 {
 972         int ret;
 973
 974         ret = btrfs_alloc_page_array(rbio->nr_pages, rbio->stripe_pages, 0);
 975         if (ret < 0)
 976                 return ret;
 977         /* Mapping all sectors */
 978         index_stripe_sectors(rbio);
 979         return 0;
 980 }
 981
 982 /* only allocate pages for p/q stripes */
 983 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
 984 {
 985         const int data_pages = rbio->nr_data * rbio->stripe_npages;
 986         int ret;
 987
 988         ret = btrfs_alloc_page_array(rbio->nr_pages - data_pages,
 989                                      rbio->stripe_pages + data_pages, 0);
 990         if (ret < 0)
 991                 return ret;
 992
 993         index_stripe_sectors(rbio);
 994         return 0;
 995 }
 996
 997 /*
 998  * Return the total number of errors found in the vertical stripe of @sector_nr.
 999  *
1000  * @faila and @failb will also be updated to the first and second stripe
1001  * number of the errors.
1002  */
1003 static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr,
1004                                      int *faila, int *failb)
1005 {
1006         int stripe_nr;
1007         int found_errors = 0;
1008
1009         if (faila || failb) {
1010                 /*
1011                  * Both @faila and @failb should be valid pointers if any of
1012                  * them is specified.
1013                  */
1014                 ASSERT(faila && failb);
1015                 *faila = -1;
1016                 *failb = -1;
1017         }
1018
1019         for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
1020                 int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr;
1021
1022                 if (test_bit(total_sector_nr, rbio->error_bitmap)) {
1023                         found_errors++;
1024                         if (faila) {
1025                                 /* Update faila and failb. */
1026                                 if (*faila < 0)
1027                                         *faila = stripe_nr;
1028                                 else if (*failb < 0)
1029                                         *failb = stripe_nr;
1030                         }
1031                 }
1032         }
1033         return found_errors;
1034 }
1035
1036 /*
1037  * Add a single sector @sector into our list of bios for IO.
1038  *
1039  * Return 0 if everything went well.
1040  * Return <0 for error.
1041  */
1042 static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
1043                               struct bio_list *bio_list,
1044                               struct sector_ptr *sector,
1045                               unsigned int stripe_nr,
1046                               unsigned int sector_nr,
1047                               enum req_op op)
1048 {
1049         const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1050         struct bio *last = bio_list->tail;
1051         int ret;
1052         struct bio *bio;
1053         struct btrfs_io_stripe *stripe;
1054         u64 disk_start;
1055
1056         /*
1057          * Note: here stripe_nr has taken device replace into consideration,
1058          * thus it can be larger than rbio->real_stripe.
1059          * So here we check against bioc->num_stripes, not rbio->real_stripes.
1060          */
1061         ASSERT(stripe_nr >= 0 && stripe_nr < rbio->bioc->num_stripes);
1062         ASSERT(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors);
1063         ASSERT(sector->page);
1064
1065         stripe = &rbio->bioc->stripes[stripe_nr];
1066         disk_start = stripe->physical + sector_nr * sectorsize;
1067
1068         /* if the device is missing, just fail this stripe */
1069         if (!stripe->dev->bdev) {
1070                 int found_errors;
1071
1072                 set_bit(stripe_nr * rbio->stripe_nsectors + sector_nr,
1073                         rbio->error_bitmap);
1074
1075                 /* Check if we have reached tolerance early. */
1076                 found_errors = get_rbio_veritical_errors(rbio, sector_nr,
1077                                                          NULL, NULL);
1078                 if (found_errors > rbio->bioc->max_errors)
1079                         return -EIO;
1080                 return 0;
1081         }
1082
1083         /* see if we can add this page onto our existing bio */
1084         if (last) {
1085                 u64 last_end = last->bi_iter.bi_sector << SECTOR_SHIFT;
1086                 last_end += last->bi_iter.bi_size;
1087
1088                 /*
1089                  * we can't merge these if they are from different
1090                  * devices or if they are not contiguous
1091                  */
1092                 if (last_end == disk_start && !last->bi_status &&
1093                     last->bi_bdev == stripe->dev->bdev) {
1094                         ret = bio_add_page(last, sector->page, sectorsize,
1095                                            sector->pgoff);
1096                         if (ret == sectorsize)
1097                                 return 0;
1098                 }
1099         }
1100
1101         /* put a new bio on the list */
1102         bio = bio_alloc(stripe->dev->bdev,
1103                         max(BTRFS_STRIPE_LEN >> PAGE_SHIFT, 1),
1104                         op, GFP_NOFS);
1105         bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT;
1106         bio->bi_private = rbio;
1107
1108         __bio_add_page(bio, sector->page, sectorsize, sector->pgoff);
1109         bio_list_add(bio_list, bio);
1110         return 0;
1111 }
1112
1113 static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
1114 {
1115         const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1116         struct bio_vec bvec;
1117         struct bvec_iter iter;
1118         u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
1119                      rbio->bioc->full_stripe_logical;
1120
1121         bio_for_each_segment(bvec, bio, iter) {
1122                 u32 bvec_offset;
1123
1124                 for (bvec_offset = 0; bvec_offset < bvec.bv_len;
1125                      bvec_offset += sectorsize, offset += sectorsize) {
1126                         int index = offset / sectorsize;
1127                         struct sector_ptr *sector = &rbio->bio_sectors[index];
1128
1129                         sector->page = bvec.bv_page;
1130                         sector->pgoff = bvec.bv_offset + bvec_offset;
1131                         ASSERT(sector->pgoff < PAGE_SIZE);
1132                 }
1133         }
1134 }
1135
1136 /*
1137  * helper function to walk our bio list and populate the bio_pages array with
1138  * the result.  This seems expensive, but it is faster than constantly
1139  * searching through the bio list as we setup the IO in finish_rmw or stripe
1140  * reconstruction.
1141  *
1142  * This must be called before you trust the answers from page_in_rbio
1143  */
1144 static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1145 {
1146         struct bio *bio;
1147
1148         spin_lock(&rbio->bio_list_lock);
1149         bio_list_for_each(bio, &rbio->bio_list)
1150                 index_one_bio(rbio, bio);
1151
1152         spin_unlock(&rbio->bio_list_lock);
1153 }
1154
1155 static void bio_get_trace_info(struct btrfs_raid_bio *rbio, struct bio *bio,
1156                                struct raid56_bio_trace_info *trace_info)
1157 {
1158         const struct btrfs_io_context *bioc = rbio->bioc;
1159         int i;
1160
1161         ASSERT(bioc);
1162
1163         /* We rely on bio->bi_bdev to find the stripe number. */
1164         if (!bio->bi_bdev)
1165                 goto not_found;
1166
1167         for (i = 0; i < bioc->num_stripes; i++) {
1168                 if (bio->bi_bdev != bioc->stripes[i].dev->bdev)
1169                         continue;
1170                 trace_info->stripe_nr = i;
1171                 trace_info->devid = bioc->stripes[i].dev->devid;
1172                 trace_info->offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
1173                                      bioc->stripes[i].physical;
1174                 return;
1175         }
1176
1177 not_found:
1178         trace_info->devid = -1;
1179         trace_info->offset = -1;
1180         trace_info->stripe_nr = -1;
1181 }
1182
1183 static inline void bio_list_put(struct bio_list *bio_list)
1184 {
1185         struct bio *bio;
1186
1187         while ((bio = bio_list_pop(bio_list)))
1188                 bio_put(bio);
1189 }
1190
1191 static void assert_rbio(struct btrfs_raid_bio *rbio)
1192 {
1193         if (!IS_ENABLED(CONFIG_BTRFS_DEBUG) ||
1194             !IS_ENABLED(CONFIG_BTRFS_ASSERT))
1195                 return;
1196
1197         /*
1198          * At least two stripes (2 disks RAID5), and since real_stripes is U8,
1199          * we won't go beyond 256 disks anyway.
1200          */
1201         ASSERT(rbio->real_stripes >= 2);
1202         ASSERT(rbio->nr_data > 0);
1203
1204         /*
1205          * This is another check to make sure nr data stripes is smaller
1206          * than total stripes.
1207          */
1208         ASSERT(rbio->nr_data < rbio->real_stripes);
1209 }
1210
1211 /* Generate PQ for one vertical stripe. */
1212 static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr)
1213 {
1214         void **pointers = rbio->finish_pointers;
1215         const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1216         struct sector_ptr *sector;
1217         int stripe;
1218         const bool has_qstripe = rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6;
1219
1220         /* First collect one sector from each data stripe */
1221         for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1222                 sector = sector_in_rbio(rbio, stripe, sectornr, 0);
1223                 pointers[stripe] = kmap_local_page(sector->page) +
1224                                    sector->pgoff;
1225         }
1226
1227         /* Then add the parity stripe */
1228         sector = rbio_pstripe_sector(rbio, sectornr);
1229         sector->uptodate = 1;
1230         pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff;
1231
1232         if (has_qstripe) {
1233                 /*
1234                  * RAID6, add the qstripe and call the library function
1235                  * to fill in our p/q
1236                  */
1237                 sector = rbio_qstripe_sector(rbio, sectornr);
1238                 sector->uptodate = 1;
1239                 pointers[stripe++] = kmap_local_page(sector->page) +
1240                                      sector->pgoff;
1241
1242                 assert_rbio(rbio);
1243                 raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
1244                                         pointers);
1245         } else {
1246                 /* raid5 */
1247                 memcpy(pointers[rbio->nr_data], pointers[0], sectorsize);
1248                 run_xor(pointers + 1, rbio->nr_data - 1, sectorsize);
1249         }
1250         for (stripe = stripe - 1; stripe >= 0; stripe--)
1251                 kunmap_local(pointers[stripe]);
1252 }
1253
1254 static int rmw_assemble_write_bios(struct btrfs_raid_bio *rbio,
1255                                    struct bio_list *bio_list)
1256 {
1257         /* The total sector number inside the full stripe. */
1258         int total_sector_nr;
1259         int sectornr;
1260         int stripe;
1261         int ret;
1262
1263         ASSERT(bio_list_size(bio_list) == 0);
1264
1265         /* We should have at least one data sector. */
1266         ASSERT(bitmap_weight(&rbio->dbitmap, rbio->stripe_nsectors));
1267
1268         /*
1269          * Reset errors, as we may have errors inherited from from degraded
1270          * write.
1271          */
1272         bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
1273
1274         /*
1275          * Start assembly.  Make bios for everything from the higher layers (the
1276          * bio_list in our rbio) and our P/Q.  Ignore everything else.
1277          */
1278         for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
1279              total_sector_nr++) {
1280                 struct sector_ptr *sector;
1281
1282                 stripe = total_sector_nr / rbio->stripe_nsectors;
1283                 sectornr = total_sector_nr % rbio->stripe_nsectors;
1284
1285                 /* This vertical stripe has no data, skip it. */
1286                 if (!test_bit(sectornr, &rbio->dbitmap))
1287                         continue;
1288
1289                 if (stripe < rbio->nr_data) {
1290                         sector = sector_in_rbio(rbio, stripe, sectornr, 1);
1291                         if (!sector)
1292                                 continue;
1293                 } else {
1294                         sector = rbio_stripe_sector(rbio, stripe, sectornr);
1295                 }
1296
1297                 ret = rbio_add_io_sector(rbio, bio_list, sector, stripe,
1298                                          sectornr, REQ_OP_WRITE);
1299                 if (ret)
1300                         goto error;
1301         }
1302
1303         if (likely(!rbio->bioc->replace_nr_stripes))
1304                 return 0;
1305
1306         /*
1307          * Make a copy for the replace target device.
1308          *
1309          * Thus the source stripe number (in replace_stripe_src) should be valid.
1310          */
1311         ASSERT(rbio->bioc->replace_stripe_src >= 0);
1312
1313         for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
1314              total_sector_nr++) {
1315                 struct sector_ptr *sector;
1316
1317                 stripe = total_sector_nr / rbio->stripe_nsectors;
1318                 sectornr = total_sector_nr % rbio->stripe_nsectors;
1319
1320                 /*
1321                  * For RAID56, there is only one device that can be replaced,
1322                  * and replace_stripe_src[0] indicates the stripe number we
1323                  * need to copy from.
1324                  */
1325                 if (stripe != rbio->bioc->replace_stripe_src) {
1326                         /*
1327                          * We can skip the whole stripe completely, note
1328                          * total_sector_nr will be increased by one anyway.
1329                          */
1330                         ASSERT(sectornr == 0);
1331                         total_sector_nr += rbio->stripe_nsectors - 1;
1332                         continue;
1333                 }
1334
1335                 /* This vertical stripe has no data, skip it. */
1336                 if (!test_bit(sectornr, &rbio->dbitmap))
1337                         continue;
1338
1339                 if (stripe < rbio->nr_data) {
1340                         sector = sector_in_rbio(rbio, stripe, sectornr, 1);
1341                         if (!sector)
1342                                 continue;
1343                 } else {
1344                         sector = rbio_stripe_sector(rbio, stripe, sectornr);
1345                 }
1346
1347                 ret = rbio_add_io_sector(rbio, bio_list, sector,
1348                                          rbio->real_stripes,
1349                                          sectornr, REQ_OP_WRITE);
1350                 if (ret)
1351                         goto error;
1352         }
1353
1354         return 0;
1355 error:
1356         bio_list_put(bio_list);
1357         return -EIO;
1358 }
1359
1360 static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio)
1361 {
1362         struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1363         u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) -
1364                      rbio->bioc->full_stripe_logical;
1365         int total_nr_sector = offset >> fs_info->sectorsize_bits;
1366
1367         ASSERT(total_nr_sector < rbio->nr_data * rbio->stripe_nsectors);
1368
1369         bitmap_set(rbio->error_bitmap, total_nr_sector,
1370                    bio->bi_iter.bi_size >> fs_info->sectorsize_bits);
1371
1372         /*
1373          * Special handling for raid56_alloc_missing_rbio() used by
1374          * scrub/replace.  Unlike call path in raid56_parity_recover(), they
1375          * pass an empty bio here.  Thus we have to find out the missing device
1376          * and mark the stripe error instead.
1377          */
1378         if (bio->bi_iter.bi_size == 0) {
1379                 bool found_missing = false;
1380                 int stripe_nr;
1381
1382                 for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
1383                         if (!rbio->bioc->stripes[stripe_nr].dev->bdev) {
1384                                 found_missing = true;
1385                                 bitmap_set(rbio->error_bitmap,
1386                                            stripe_nr * rbio->stripe_nsectors,
1387                                            rbio->stripe_nsectors);
1388                         }
1389                 }
1390                 ASSERT(found_missing);
1391         }
1392 }
1393
1394 /*
1395  * For subpage case, we can no longer set page Up-to-date directly for
1396  * stripe_pages[], thus we need to locate the sector.
1397  */
1398 static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio,
1399                                              struct page *page,
1400                                              unsigned int pgoff)
1401 {
1402         int i;
1403
1404         for (i = 0; i < rbio->nr_sectors; i++) {
1405                 struct sector_ptr *sector = &rbio->stripe_sectors[i];
1406
1407                 if (sector->page == page && sector->pgoff == pgoff)
1408                         return sector;
1409         }
1410         return NULL;
1411 }
1412
1413 /*
1414  * this sets each page in the bio uptodate.  It should only be used on private
1415  * rbio pages, nothing that comes in from the higher layers
1416  */
1417 static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio)
1418 {
1419         const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
1420         struct bio_vec *bvec;
1421         struct bvec_iter_all iter_all;
1422
1423         ASSERT(!bio_flagged(bio, BIO_CLONED));
1424
1425         bio_for_each_segment_all(bvec, bio, iter_all) {
1426                 struct sector_ptr *sector;
1427                 int pgoff;
1428
1429                 for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len;
1430                      pgoff += sectorsize) {
1431                         sector = find_stripe_sector(rbio, bvec->bv_page, pgoff);
1432                         ASSERT(sector);
1433                         if (sector)
1434                                 sector->uptodate = 1;
1435                 }
1436         }
1437 }
1438
1439 static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio)
1440 {
1441         struct bio_vec *bv = bio_first_bvec_all(bio);
1442         int i;
1443
1444         for (i = 0; i < rbio->nr_sectors; i++) {
1445                 struct sector_ptr *sector;
1446
1447                 sector = &rbio->stripe_sectors[i];
1448                 if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset)
1449                         break;
1450                 sector = &rbio->bio_sectors[i];
1451                 if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset)
1452                         break;
1453         }
1454         ASSERT(i < rbio->nr_sectors);
1455         return i;
1456 }
1457
1458 static void rbio_update_error_bitmap(struct btrfs_raid_bio *rbio, struct bio *bio)
1459 {
1460         int total_sector_nr = get_bio_sector_nr(rbio, bio);
1461         u32 bio_size = 0;
1462         struct bio_vec *bvec;
1463         int i;
1464
1465         bio_for_each_bvec_all(bvec, bio, i)
1466                 bio_size += bvec->bv_len;
1467
1468         /*
1469          * Since we can have multiple bios touching the error_bitmap, we cannot
1470          * call bitmap_set() without protection.
1471          *
1472          * Instead use set_bit() for each bit, as set_bit() itself is atomic.
1473          */
1474         for (i = total_sector_nr; i < total_sector_nr +
1475              (bio_size >> rbio->bioc->fs_info->sectorsize_bits); i++)
1476                 set_bit(i, rbio->error_bitmap);
1477 }
1478
1479 /* Verify the data sectors at read time. */
1480 static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio,
1481                                     struct bio *bio)
1482 {
1483         struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1484         int total_sector_nr = get_bio_sector_nr(rbio, bio);
1485         struct bio_vec *bvec;
1486         struct bvec_iter_all iter_all;
1487
1488         /* No data csum for the whole stripe, no need to verify. */
1489         if (!rbio->csum_bitmap || !rbio->csum_buf)
1490                 return;
1491
1492         /* P/Q stripes, they have no data csum to verify against. */
1493         if (total_sector_nr >= rbio->nr_data * rbio->stripe_nsectors)
1494                 return;
1495
1496         bio_for_each_segment_all(bvec, bio, iter_all) {
1497                 int bv_offset;
1498
1499                 for (bv_offset = bvec->bv_offset;
1500                      bv_offset < bvec->bv_offset + bvec->bv_len;
1501                      bv_offset += fs_info->sectorsize, total_sector_nr++) {
1502                         u8 csum_buf[BTRFS_CSUM_SIZE];
1503                         u8 *expected_csum = rbio->csum_buf +
1504                                             total_sector_nr * fs_info->csum_size;
1505                         int ret;
1506
1507                         /* No csum for this sector, skip to the next sector. */
1508                         if (!test_bit(total_sector_nr, rbio->csum_bitmap))
1509                                 continue;
1510
1511                         ret = btrfs_check_sector_csum(fs_info, bvec->bv_page,
1512                                 bv_offset, csum_buf, expected_csum);
1513                         if (ret < 0)
1514                                 set_bit(total_sector_nr, rbio->error_bitmap);
1515                 }
1516         }
1517 }
1518
1519 static void raid_wait_read_end_io(struct bio *bio)
1520 {
1521         struct btrfs_raid_bio *rbio = bio->bi_private;
1522
1523         if (bio->bi_status) {
1524                 rbio_update_error_bitmap(rbio, bio);
1525         } else {
1526                 set_bio_pages_uptodate(rbio, bio);
1527                 verify_bio_data_sectors(rbio, bio);
1528         }
1529
1530         bio_put(bio);
1531         if (atomic_dec_and_test(&rbio->stripes_pending))
1532                 wake_up(&rbio->io_wait);
1533 }
1534
1535 static void submit_read_wait_bio_list(struct btrfs_raid_bio *rbio,
1536                              struct bio_list *bio_list)
1537 {
1538         struct bio *bio;
1539
1540         atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
1541         while ((bio = bio_list_pop(bio_list))) {
1542                 bio->bi_end_io = raid_wait_read_end_io;
1543
1544                 if (trace_raid56_read_enabled()) {
1545                         struct raid56_bio_trace_info trace_info = { 0 };
1546
1547                         bio_get_trace_info(rbio, bio, &trace_info);
1548                         trace_raid56_read(rbio, bio, &trace_info);
1549                 }
1550                 submit_bio(bio);
1551         }
1552
1553         wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
1554 }
1555
1556 static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio)
1557 {
1558         const int data_pages = rbio->nr_data * rbio->stripe_npages;
1559         int ret;
1560
1561         ret = btrfs_alloc_page_array(data_pages, rbio->stripe_pages, 0);
1562         if (ret < 0)
1563                 return ret;
1564
1565         index_stripe_sectors(rbio);
1566         return 0;
1567 }
1568
1569 /*
1570  * We use plugging call backs to collect full stripes.
1571  * Any time we get a partial stripe write while plugged
1572  * we collect it into a list.  When the unplug comes down,
1573  * we sort the list by logical block number and merge
1574  * everything we can into the same rbios
1575  */
1576 struct btrfs_plug_cb {
1577         struct blk_plug_cb cb;
1578         struct btrfs_fs_info *info;
1579         struct list_head rbio_list;
1580 };
1581
1582 /*
1583  * rbios on the plug list are sorted for easier merging.
1584  */
1585 static int plug_cmp(void *priv, const struct list_head *a,
1586                     const struct list_head *b)
1587 {
1588         const struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1589                                                        plug_list);
1590         const struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1591                                                        plug_list);
1592         u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
1593         u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
1594
1595         if (a_sector < b_sector)
1596                 return -1;
1597         if (a_sector > b_sector)
1598                 return 1;
1599         return 0;
1600 }
1601
1602 static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1603 {
1604         struct btrfs_plug_cb *plug = container_of(cb, struct btrfs_plug_cb, cb);
1605         struct btrfs_raid_bio *cur;
1606         struct btrfs_raid_bio *last = NULL;
1607
1608         list_sort(NULL, &plug->rbio_list, plug_cmp);
1609
1610         while (!list_empty(&plug->rbio_list)) {
1611                 cur = list_entry(plug->rbio_list.next,
1612                                  struct btrfs_raid_bio, plug_list);
1613                 list_del_init(&cur->plug_list);
1614
1615                 if (rbio_is_full(cur)) {
1616                         /* We have a full stripe, queue it down. */
1617                         start_async_work(cur, rmw_rbio_work);
1618                         continue;
1619                 }
1620                 if (last) {
1621                         if (rbio_can_merge(last, cur)) {
1622                                 merge_rbio(last, cur);
1623                                 free_raid_bio(cur);
1624                                 continue;
1625                         }
1626                         start_async_work(last, rmw_rbio_work);
1627                 }
1628                 last = cur;
1629         }
1630         if (last)
1631                 start_async_work(last, rmw_rbio_work);
1632         kfree(plug);
1633 }
1634
1635 /* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
1636 static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
1637 {
1638         const struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1639         const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
1640         const u64 full_stripe_start = rbio->bioc->full_stripe_logical;
1641         const u32 orig_len = orig_bio->bi_iter.bi_size;
1642         const u32 sectorsize = fs_info->sectorsize;
1643         u64 cur_logical;
1644
1645         ASSERT(orig_logical >= full_stripe_start &&
1646                orig_logical + orig_len <= full_stripe_start +
1647                rbio->nr_data * BTRFS_STRIPE_LEN);
1648
1649         bio_list_add(&rbio->bio_list, orig_bio);
1650         rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;
1651
1652         /* Update the dbitmap. */
1653         for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
1654              cur_logical += sectorsize) {
1655                 int bit = ((u32)(cur_logical - full_stripe_start) >>
1656                            fs_info->sectorsize_bits) % rbio->stripe_nsectors;
1657
1658                 set_bit(bit, &rbio->dbitmap);
1659         }
1660 }
1661
1662 /*
1663  * our main entry point for writes from the rest of the FS.
1664  */
1665 void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
1666 {
1667         struct btrfs_fs_info *fs_info = bioc->fs_info;
1668         struct btrfs_raid_bio *rbio;
1669         struct btrfs_plug_cb *plug = NULL;
1670         struct blk_plug_cb *cb;
1671
1672         rbio = alloc_rbio(fs_info, bioc);
1673         if (IS_ERR(rbio)) {
1674                 bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
1675                 bio_endio(bio);
1676                 return;
1677         }
1678         rbio->operation = BTRFS_RBIO_WRITE;
1679         rbio_add_bio(rbio, bio);
1680
1681         /*
1682          * Don't plug on full rbios, just get them out the door
1683          * as quickly as we can
1684          */
1685         if (!rbio_is_full(rbio)) {
1686                 cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug));
1687                 if (cb) {
1688                         plug = container_of(cb, struct btrfs_plug_cb, cb);
1689                         if (!plug->info) {
1690                                 plug->info = fs_info;
1691                                 INIT_LIST_HEAD(&plug->rbio_list);
1692                         }
1693                         list_add_tail(&rbio->plug_list, &plug->rbio_list);
1694                         return;
1695                 }
1696         }
1697
1698         /*
1699          * Either we don't have any existing plug, or we're doing a full stripe,
1700          * queue the rmw work now.
1701          */
1702         start_async_work(rbio, rmw_rbio_work);
1703 }
1704
1705 static int verify_one_sector(struct btrfs_raid_bio *rbio,
1706                              int stripe_nr, int sector_nr)
1707 {
1708         struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1709         struct sector_ptr *sector;
1710         u8 csum_buf[BTRFS_CSUM_SIZE];
1711         u8 *csum_expected;
1712         int ret;
1713
1714         if (!rbio->csum_bitmap || !rbio->csum_buf)
1715                 return 0;
1716
1717         /* No way to verify P/Q as they are not covered by data csum. */
1718         if (stripe_nr >= rbio->nr_data)
1719                 return 0;
1720         /*
1721          * If we're rebuilding a read, we have to use pages from the
1722          * bio list if possible.
1723          */
1724         if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
1725                 sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
1726         } else {
1727                 sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
1728         }
1729
1730         ASSERT(sector->page);
1731
1732         csum_expected = rbio->csum_buf +
1733                         (stripe_nr * rbio->stripe_nsectors + sector_nr) *
1734                         fs_info->csum_size;
1735         ret = btrfs_check_sector_csum(fs_info, sector->page, sector->pgoff,
1736                                       csum_buf, csum_expected);
1737         return ret;
1738 }
1739
1740 /*
1741  * Recover a vertical stripe specified by @sector_nr.
1742  * @*pointers are the pre-allocated pointers by the caller, so we don't
1743  * need to allocate/free the pointers again and again.
1744  */
1745 static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
1746                             void **pointers, void **unmap_array)
1747 {
1748         struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
1749         struct sector_ptr *sector;
1750         const u32 sectorsize = fs_info->sectorsize;
1751         int found_errors;
1752         int faila;
1753         int failb;
1754         int stripe_nr;
1755         int ret = 0;
1756
1757         /*
1758          * Now we just use bitmap to mark the horizontal stripes in
1759          * which we have data when doing parity scrub.
1760          */
1761         if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
1762             !test_bit(sector_nr, &rbio->dbitmap))
1763                 return 0;
1764
1765         found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila,
1766                                                  &failb);
1767         /*
1768          * No errors in the vertical stripe, skip it.  Can happen for recovery
1769          * which only part of a stripe failed csum check.
1770          */
1771         if (!found_errors)
1772                 return 0;
1773
1774         if (found_errors > rbio->bioc->max_errors)
1775                 return -EIO;
1776
1777         /*
1778          * Setup our array of pointers with sectors from each stripe
1779          *
1780          * NOTE: store a duplicate array of pointers to preserve the
1781          * pointer order.
1782          */
1783         for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
1784                 /*
1785                  * If we're rebuilding a read, we have to use pages from the
1786                  * bio list if possible.
1787                  */
1788                 if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
1789                         sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
1790                 } else {
1791                         sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
1792                 }
1793                 ASSERT(sector->page);
1794                 pointers[stripe_nr] = kmap_local_page(sector->page) +
1795                                    sector->pgoff;
1796                 unmap_array[stripe_nr] = pointers[stripe_nr];
1797         }
1798
1799         /* All raid6 handling here */
1800         if (rbio->bioc->map_type & BTRFS_BLOCK_GROUP_RAID6) {
1801                 /* Single failure, rebuild from parity raid5 style */
1802                 if (failb < 0) {
1803                         if (faila == rbio->nr_data)
1804                                 /*
1805                                  * Just the P stripe has failed, without
1806                                  * a bad data or Q stripe.
1807                                  * We have nothing to do, just skip the
1808                                  * recovery for this stripe.
1809                                  */
1810                                 goto cleanup;
1811                         /*
1812                          * a single failure in raid6 is rebuilt
1813                          * in the pstripe code below
1814                          */
1815                         goto pstripe;
1816                 }
1817
1818                 /*
1819                  * If the q stripe is failed, do a pstripe reconstruction from
1820                  * the xors.
1821                  * If both the q stripe and the P stripe are failed, we're
1822                  * here due to a crc mismatch and we can't give them the
1823                  * data they want.
1824                  */
1825                 if (failb == rbio->real_stripes - 1) {
1826                         if (faila == rbio->real_stripes - 2)
1827                                 /*
1828                                  * Only P and Q are corrupted.
1829                                  * We only care about data stripes recovery,
1830                                  * can skip this vertical stripe.
1831                                  */
1832                                 goto cleanup;
1833                         /*
1834                          * Otherwise we have one bad data stripe and
1835                          * a good P stripe.  raid5!
1836                          */
1837                         goto pstripe;
1838                 }
1839
1840                 if (failb == rbio->real_stripes - 2) {
1841                         raid6_datap_recov(rbio->real_stripes, sectorsize,
1842                                           faila, pointers);
1843                 } else {
1844                         raid6_2data_recov(rbio->real_stripes, sectorsize,
1845                                           faila, failb, pointers);
1846                 }
1847         } else {
1848                 void *p;
1849
1850                 /* Rebuild from P stripe here (raid5 or raid6). */
1851                 ASSERT(failb == -1);
1852 pstripe:
1853                 /* Copy parity block into failed block to start with */
1854                 memcpy(pointers[faila], pointers[rbio->nr_data], sectorsize);
1855
1856                 /* Rearrange the pointer array */
1857                 p = pointers[faila];
1858                 for (stripe_nr = faila; stripe_nr < rbio->nr_data - 1;
1859                      stripe_nr++)
1860                         pointers[stripe_nr] = pointers[stripe_nr + 1];
1861                 pointers[rbio->nr_data - 1] = p;
1862
1863                 /* Xor in the rest */
1864                 run_xor(pointers, rbio->nr_data - 1, sectorsize);
1865
1866         }
1867
1868         /*
1869          * No matter if this is a RMW or recovery, we should have all
1870          * failed sectors repaired in the vertical stripe, thus they are now
1871          * uptodate.
1872          * Especially if we determine to cache the rbio, we need to
1873          * have at least all data sectors uptodate.
1874          *
1875          * If possible, also check if the repaired sector matches its data
1876          * checksum.
1877          */
1878         if (faila >= 0) {
1879                 ret = verify_one_sector(rbio, faila, sector_nr);
1880                 if (ret < 0)
1881                         goto cleanup;
1882
1883                 sector = rbio_stripe_sector(rbio, faila, sector_nr);
1884                 sector->uptodate = 1;
1885         }
1886         if (failb >= 0) {
1887                 ret = verify_one_sector(rbio, failb, sector_nr);
1888                 if (ret < 0)
1889                         goto cleanup;
1890
1891                 sector = rbio_stripe_sector(rbio, failb, sector_nr);
1892                 sector->uptodate = 1;
1893         }
1894
1895 cleanup:
1896         for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--)
1897                 kunmap_local(unmap_array[stripe_nr]);
1898         return ret;
1899 }
1900
1901 static int recover_sectors(struct btrfs_raid_bio *rbio)
1902 {
1903         void **pointers = NULL;
1904         void **unmap_array = NULL;
1905         int sectornr;
1906         int ret = 0;
1907
1908         /*
1909          * @pointers array stores the pointer for each sector.
1910          *
1911          * @unmap_array stores copy of pointers that does not get reordered
1912          * during reconstruction so that kunmap_local works.
1913          */
1914         pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1915         unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1916         if (!pointers || !unmap_array) {
1917                 ret = -ENOMEM;
1918                 goto out;
1919         }
1920
1921         if (rbio->operation == BTRFS_RBIO_READ_REBUILD) {
1922                 spin_lock(&rbio->bio_list_lock);
1923                 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1924                 spin_unlock(&rbio->bio_list_lock);
1925         }
1926
1927         index_rbio_pages(rbio);
1928
1929         for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
1930                 ret = recover_vertical(rbio, sectornr, pointers, unmap_array);
1931                 if (ret < 0)
1932                         break;
1933         }
1934
1935 out:
1936         kfree(pointers);
1937         kfree(unmap_array);
1938         return ret;
1939 }
1940
1941 static void recover_rbio(struct btrfs_raid_bio *rbio)
1942 {
1943         struct bio_list bio_list = BIO_EMPTY_LIST;
1944         int total_sector_nr;
1945         int ret = 0;
1946
1947         /*
1948          * Either we're doing recover for a read failure or degraded write,
1949          * caller should have set error bitmap correctly.
1950          */
1951         ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors));
1952
1953         /* For recovery, we need to read all sectors including P/Q. */
1954         ret = alloc_rbio_pages(rbio);
1955         if (ret < 0)
1956                 goto out;
1957
1958         index_rbio_pages(rbio);
1959
1960         /*
1961          * Read everything that hasn't failed. However this time we will
1962          * not trust any cached sector.
1963          * As we may read out some stale data but higher layer is not reading
1964          * that stale part.
1965          *
1966          * So here we always re-read everything in recovery path.
1967          */
1968         for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
1969              total_sector_nr++) {
1970                 int stripe = total_sector_nr / rbio->stripe_nsectors;
1971                 int sectornr = total_sector_nr % rbio->stripe_nsectors;
1972                 struct sector_ptr *sector;
1973
1974                 /*
1975                  * Skip the range which has error.  It can be a range which is
1976                  * marked error (for csum mismatch), or it can be a missing
1977                  * device.
1978                  */
1979                 if (!rbio->bioc->stripes[stripe].dev->bdev ||
1980                     test_bit(total_sector_nr, rbio->error_bitmap)) {
1981                         /*
1982                          * Also set the error bit for missing device, which
1983                          * may not yet have its error bit set.
1984                          */
1985                         set_bit(total_sector_nr, rbio->error_bitmap);
1986                         continue;
1987                 }
1988
1989                 sector = rbio_stripe_sector(rbio, stripe, sectornr);
1990                 ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
1991                                          sectornr, REQ_OP_READ);
1992                 if (ret < 0) {
1993                         bio_list_put(&bio_list);
1994                         goto out;
1995                 }
1996         }
1997
1998         submit_read_wait_bio_list(rbio, &bio_list);
1999         ret = recover_sectors(rbio);
2000 out:
2001         rbio_orig_end_io(rbio, errno_to_blk_status(ret));
2002 }
2003
2004 static void recover_rbio_work(struct work_struct *work)
2005 {
2006         struct btrfs_raid_bio *rbio;
2007
2008         rbio = container_of(work, struct btrfs_raid_bio, work);
2009         if (!lock_stripe_add(rbio))
2010                 recover_rbio(rbio);
2011 }
2012
2013 static void recover_rbio_work_locked(struct work_struct *work)
2014 {
2015         recover_rbio(container_of(work, struct btrfs_raid_bio, work));
2016 }
2017
2018 static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num)
2019 {
2020         bool found = false;
2021         int sector_nr;
2022
2023         /*
2024          * This is for RAID6 extra recovery tries, thus mirror number should
2025          * be large than 2.
2026          * Mirror 1 means read from data stripes. Mirror 2 means rebuild using
2027          * RAID5 methods.
2028          */
2029         ASSERT(mirror_num > 2);
2030         for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
2031                 int found_errors;
2032                 int faila;
2033                 int failb;
2034
2035                 found_errors = get_rbio_veritical_errors(rbio, sector_nr,
2036                                                          &faila, &failb);
2037                 /* This vertical stripe doesn't have errors. */
2038                 if (!found_errors)
2039                         continue;
2040
2041                 /*
2042                  * If we found errors, there should be only one error marked
2043                  * by previous set_rbio_range_error().
2044                  */
2045                 ASSERT(found_errors == 1);
2046                 found = true;
2047
2048                 /* Now select another stripe to mark as error. */
2049                 failb = rbio->real_stripes - (mirror_num - 1);
2050                 if (failb <= faila)
2051                         failb--;
2052
2053                 /* Set the extra bit in error bitmap. */
2054                 if (failb >= 0)
2055                         set_bit(failb * rbio->stripe_nsectors + sector_nr,
2056                                 rbio->error_bitmap);
2057         }
2058
2059         /* We should found at least one vertical stripe with error.*/
2060         ASSERT(found);
2061 }
2062
2063 /*
2064  * the main entry point for reads from the higher layers.  This
2065  * is really only called when the normal read path had a failure,
2066  * so we assume the bio they send down corresponds to a failed part
2067  * of the drive.
2068  */
2069 void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
2070                            int mirror_num)
2071 {
2072         struct btrfs_fs_info *fs_info = bioc->fs_info;
2073         struct btrfs_raid_bio *rbio;
2074
2075         rbio = alloc_rbio(fs_info, bioc);
2076         if (IS_ERR(rbio)) {
2077                 bio->bi_status = errno_to_blk_status(PTR_ERR(rbio));
2078                 bio_endio(bio);
2079                 return;
2080         }
2081
2082         rbio->operation = BTRFS_RBIO_READ_REBUILD;
2083         rbio_add_bio(rbio, bio);
2084
2085         set_rbio_range_error(rbio, bio);
2086
2087         /*
2088          * Loop retry:
2089          * for 'mirror == 2', reconstruct from all other stripes.
2090          * for 'mirror_num > 2', select a stripe to fail on every retry.
2091          */
2092         if (mirror_num > 2)
2093                 set_rbio_raid6_extra_error(rbio, mirror_num);
2094
2095         start_async_work(rbio, recover_rbio_work);
2096 }
2097
2098 static void fill_data_csums(struct btrfs_raid_bio *rbio)
2099 {
2100         struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
2101         struct btrfs_root *csum_root = btrfs_csum_root(fs_info,
2102                                                        rbio->bioc->full_stripe_logical);
2103         const u64 start = rbio->bioc->full_stripe_logical;
2104         const u32 len = (rbio->nr_data * rbio->stripe_nsectors) <<
2105                         fs_info->sectorsize_bits;
2106         int ret;
2107
2108         /* The rbio should not have its csum buffer initialized. */
2109         ASSERT(!rbio->csum_buf && !rbio->csum_bitmap);
2110
2111         /*
2112          * Skip the csum search if:
2113          *
2114          * - The rbio doesn't belong to data block groups
2115          *   Then we are doing IO for tree blocks, no need to search csums.
2116          *
2117          * - The rbio belongs to mixed block groups
2118          *   This is to avoid deadlock, as we're already holding the full
2119          *   stripe lock, if we trigger a metadata read, and it needs to do
2120          *   raid56 recovery, we will deadlock.
2121          */
2122         if (!(rbio->bioc->map_type & BTRFS_BLOCK_GROUP_DATA) ||
2123             rbio->bioc->map_type & BTRFS_BLOCK_GROUP_METADATA)
2124                 return;
2125
2126         rbio->csum_buf = kzalloc(rbio->nr_data * rbio->stripe_nsectors *
2127                                  fs_info->csum_size, GFP_NOFS);
2128         rbio->csum_bitmap = bitmap_zalloc(rbio->nr_data * rbio->stripe_nsectors,
2129                                           GFP_NOFS);
2130         if (!rbio->csum_buf || !rbio->csum_bitmap) {
2131                 ret = -ENOMEM;
2132                 goto error;
2133         }
2134
2135         ret = btrfs_lookup_csums_bitmap(csum_root, NULL, start, start + len - 1,
2136                                         rbio->csum_buf, rbio->csum_bitmap);
2137         if (ret < 0)
2138                 goto error;
2139         if (bitmap_empty(rbio->csum_bitmap, len >> fs_info->sectorsize_bits))
2140                 goto no_csum;
2141         return;
2142
2143 error:
2144         /*
2145          * We failed to allocate memory or grab the csum, but it's not fatal,
2146          * we can still continue.  But better to warn users that RMW is no
2147          * longer safe for this particular sub-stripe write.
2148          */
2149         btrfs_warn_rl(fs_info,
2150 "sub-stripe write for full stripe %llu is not safe, failed to get csum: %d",
2151                         rbio->bioc->full_stripe_logical, ret);
2152 no_csum:
2153         kfree(rbio->csum_buf);
2154         bitmap_free(rbio->csum_bitmap);
2155         rbio->csum_buf = NULL;
2156         rbio->csum_bitmap = NULL;
2157 }
2158
2159 static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio)
2160 {
2161         struct bio_list bio_list = BIO_EMPTY_LIST;
2162         int total_sector_nr;
2163         int ret = 0;
2164
2165         /*
2166          * Fill the data csums we need for data verification.  We need to fill
2167          * the csum_bitmap/csum_buf first, as our endio function will try to
2168          * verify the data sectors.
2169          */
2170         fill_data_csums(rbio);
2171
2172         /*
2173          * Build a list of bios to read all sectors (including data and P/Q).
2174          *
2175          * This behavior is to compensate the later csum verification and recovery.
2176          */
2177         for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2178              total_sector_nr++) {
2179                 struct sector_ptr *sector;
2180                 int stripe = total_sector_nr / rbio->stripe_nsectors;
2181                 int sectornr = total_sector_nr % rbio->stripe_nsectors;
2182
2183                 sector = rbio_stripe_sector(rbio, stripe, sectornr);
2184                 ret = rbio_add_io_sector(rbio, &bio_list, sector,
2185                                stripe, sectornr, REQ_OP_READ);
2186                 if (ret) {
2187                         bio_list_put(&bio_list);
2188                         return ret;
2189                 }
2190         }
2191
2192         /*
2193          * We may or may not have any corrupted sectors (including missing dev
2194          * and csum mismatch), just let recover_sectors() to handle them all.
2195          */
2196         submit_read_wait_bio_list(rbio, &bio_list);
2197         return recover_sectors(rbio);
2198 }
2199
2200 static void raid_wait_write_end_io(struct bio *bio)
2201 {
2202         struct btrfs_raid_bio *rbio = bio->bi_private;
2203         blk_status_t err = bio->bi_status;
2204
2205         if (err)
2206                 rbio_update_error_bitmap(rbio, bio);
2207         bio_put(bio);
2208         if (atomic_dec_and_test(&rbio->stripes_pending))
2209                 wake_up(&rbio->io_wait);
2210 }
2211
2212 static void submit_write_bios(struct btrfs_raid_bio *rbio,
2213                               struct bio_list *bio_list)
2214 {
2215         struct bio *bio;
2216
2217         atomic_set(&rbio->stripes_pending, bio_list_size(bio_list));
2218         while ((bio = bio_list_pop(bio_list))) {
2219                 bio->bi_end_io = raid_wait_write_end_io;
2220
2221                 if (trace_raid56_write_enabled()) {
2222                         struct raid56_bio_trace_info trace_info = { 0 };
2223
2224                         bio_get_trace_info(rbio, bio, &trace_info);
2225                         trace_raid56_write(rbio, bio, &trace_info);
2226                 }
2227                 submit_bio(bio);
2228         }
2229 }
2230
2231 /*
2232  * To determine if we need to read any sector from the disk.
2233  * Should only be utilized in RMW path, to skip cached rbio.
2234  */
2235 static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio)
2236 {
2237         int i;
2238
2239         for (i = 0; i < rbio->nr_data * rbio->stripe_nsectors; i++) {
2240                 struct sector_ptr *sector = &rbio->stripe_sectors[i];
2241
2242                 /*
2243                  * We have a sector which doesn't have page nor uptodate,
2244                  * thus this rbio can not be cached one, as cached one must
2245                  * have all its data sectors present and uptodate.
2246                  */
2247                 if (!sector->page || !sector->uptodate)
2248                         return true;
2249         }
2250         return false;
2251 }
2252
2253 static void rmw_rbio(struct btrfs_raid_bio *rbio)
2254 {
2255         struct bio_list bio_list;
2256         int sectornr;
2257         int ret = 0;
2258
2259         /*
2260          * Allocate the pages for parity first, as P/Q pages will always be
2261          * needed for both full-stripe and sub-stripe writes.
2262          */
2263         ret = alloc_rbio_parity_pages(rbio);
2264         if (ret < 0)
2265                 goto out;
2266
2267         /*
2268          * Either full stripe write, or we have every data sector already
2269          * cached, can go to write path immediately.
2270          */
2271         if (!rbio_is_full(rbio) && need_read_stripe_sectors(rbio)) {
2272                 /*
2273                  * Now we're doing sub-stripe write, also need all data stripes
2274                  * to do the full RMW.
2275                  */
2276                 ret = alloc_rbio_data_pages(rbio);
2277                 if (ret < 0)
2278                         goto out;
2279
2280                 index_rbio_pages(rbio);
2281
2282                 ret = rmw_read_wait_recover(rbio);
2283                 if (ret < 0)
2284                         goto out;
2285         }
2286
2287         /*
2288          * At this stage we're not allowed to add any new bios to the
2289          * bio list any more, anyone else that wants to change this stripe
2290          * needs to do their own rmw.
2291          */
2292         spin_lock(&rbio->bio_list_lock);
2293         set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
2294         spin_unlock(&rbio->bio_list_lock);
2295
2296         bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
2297
2298         index_rbio_pages(rbio);
2299
2300         /*
2301          * We don't cache full rbios because we're assuming
2302          * the higher layers are unlikely to use this area of
2303          * the disk again soon.  If they do use it again,
2304          * hopefully they will send another full bio.
2305          */
2306         if (!rbio_is_full(rbio))
2307                 cache_rbio_pages(rbio);
2308         else
2309                 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2310
2311         for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++)
2312                 generate_pq_vertical(rbio, sectornr);
2313
2314         bio_list_init(&bio_list);
2315         ret = rmw_assemble_write_bios(rbio, &bio_list);
2316         if (ret < 0)
2317                 goto out;
2318
2319         /* We should have at least one bio assembled. */
2320         ASSERT(bio_list_size(&bio_list));
2321         submit_write_bios(rbio, &bio_list);
2322         wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
2323
2324         /* We may have more errors than our tolerance during the read. */
2325         for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
2326                 int found_errors;
2327
2328                 found_errors = get_rbio_veritical_errors(rbio, sectornr, NULL, NULL);
2329                 if (found_errors > rbio->bioc->max_errors) {
2330                         ret = -EIO;
2331                         break;
2332                 }
2333         }
2334 out:
2335         rbio_orig_end_io(rbio, errno_to_blk_status(ret));
2336 }
2337
2338 static void rmw_rbio_work(struct work_struct *work)
2339 {
2340         struct btrfs_raid_bio *rbio;
2341
2342         rbio = container_of(work, struct btrfs_raid_bio, work);
2343         if (lock_stripe_add(rbio) == 0)
2344                 rmw_rbio(rbio);
2345 }
2346
2347 static void rmw_rbio_work_locked(struct work_struct *work)
2348 {
2349         rmw_rbio(container_of(work, struct btrfs_raid_bio, work));
2350 }
2351
2352 /*
2353  * The following code is used to scrub/replace the parity stripe
2354  *
2355  * Caller must have already increased bio_counter for getting @bioc.
2356  *
2357  * Note: We need make sure all the pages that add into the scrub/replace
2358  * raid bio are correct and not be changed during the scrub/replace. That
2359  * is those pages just hold metadata or file data with checksum.
2360  */
2361
2362 struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
2363                                 struct btrfs_io_context *bioc,
2364                                 struct btrfs_device *scrub_dev,
2365                                 unsigned long *dbitmap, int stripe_nsectors)
2366 {
2367         struct btrfs_fs_info *fs_info = bioc->fs_info;
2368         struct btrfs_raid_bio *rbio;
2369         int i;
2370
2371         rbio = alloc_rbio(fs_info, bioc);
2372         if (IS_ERR(rbio))
2373                 return NULL;
2374         bio_list_add(&rbio->bio_list, bio);
2375         /*
2376          * This is a special bio which is used to hold the completion handler
2377          * and make the scrub rbio is similar to the other types
2378          */
2379         ASSERT(!bio->bi_iter.bi_size);
2380         rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2381
2382         /*
2383          * After mapping bioc with BTRFS_MAP_WRITE, parities have been sorted
2384          * to the end position, so this search can start from the first parity
2385          * stripe.
2386          */
2387         for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
2388                 if (bioc->stripes[i].dev == scrub_dev) {
2389                         rbio->scrubp = i;
2390                         break;
2391                 }
2392         }
2393         ASSERT(i < rbio->real_stripes);
2394
2395         bitmap_copy(&rbio->dbitmap, dbitmap, stripe_nsectors);
2396         return rbio;
2397 }
2398
2399 /*
2400  * We just scrub the parity that we have correct data on the same horizontal,
2401  * so we needn't allocate all pages for all the stripes.
2402  */
2403 static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2404 {
2405         const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
2406         int total_sector_nr;
2407
2408         for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2409              total_sector_nr++) {
2410                 struct page *page;
2411                 int sectornr = total_sector_nr % rbio->stripe_nsectors;
2412                 int index = (total_sector_nr * sectorsize) >> PAGE_SHIFT;
2413
2414                 if (!test_bit(sectornr, &rbio->dbitmap))
2415                         continue;
2416                 if (rbio->stripe_pages[index])
2417                         continue;
2418                 page = alloc_page(GFP_NOFS);
2419                 if (!page)
2420                         return -ENOMEM;
2421                 rbio->stripe_pages[index] = page;
2422         }
2423         index_stripe_sectors(rbio);
2424         return 0;
2425 }
2426
2427 static int finish_parity_scrub(struct btrfs_raid_bio *rbio)
2428 {
2429         struct btrfs_io_context *bioc = rbio->bioc;
2430         const u32 sectorsize = bioc->fs_info->sectorsize;
2431         void **pointers = rbio->finish_pointers;
2432         unsigned long *pbitmap = &rbio->finish_pbitmap;
2433         int nr_data = rbio->nr_data;
2434         int stripe;
2435         int sectornr;
2436         bool has_qstripe;
2437         struct sector_ptr p_sector = { 0 };
2438         struct sector_ptr q_sector = { 0 };
2439         struct bio_list bio_list;
2440         int is_replace = 0;
2441         int ret;
2442
2443         bio_list_init(&bio_list);
2444
2445         if (rbio->real_stripes - rbio->nr_data == 1)
2446                 has_qstripe = false;
2447         else if (rbio->real_stripes - rbio->nr_data == 2)
2448                 has_qstripe = true;
2449         else
2450                 BUG();
2451
2452         /*
2453          * Replace is running and our P/Q stripe is being replaced, then we
2454          * need to duplicate the final write to replace target.
2455          */
2456         if (bioc->replace_nr_stripes && bioc->replace_stripe_src == rbio->scrubp) {
2457                 is_replace = 1;
2458                 bitmap_copy(pbitmap, &rbio->dbitmap, rbio->stripe_nsectors);
2459         }
2460
2461         /*
2462          * Because the higher layers(scrubber) are unlikely to
2463          * use this area of the disk again soon, so don't cache
2464          * it.
2465          */
2466         clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2467
2468         p_sector.page = alloc_page(GFP_NOFS);
2469         if (!p_sector.page)
2470                 return -ENOMEM;
2471         p_sector.pgoff = 0;
2472         p_sector.uptodate = 1;
2473
2474         if (has_qstripe) {
2475                 /* RAID6, allocate and map temp space for the Q stripe */
2476                 q_sector.page = alloc_page(GFP_NOFS);
2477                 if (!q_sector.page) {
2478                         __free_page(p_sector.page);
2479                         p_sector.page = NULL;
2480                         return -ENOMEM;
2481                 }
2482                 q_sector.pgoff = 0;
2483                 q_sector.uptodate = 1;
2484                 pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page);
2485         }
2486
2487         bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
2488
2489         /* Map the parity stripe just once */
2490         pointers[nr_data] = kmap_local_page(p_sector.page);
2491
2492         for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
2493                 struct sector_ptr *sector;
2494                 void *parity;
2495
2496                 /* first collect one page from each data stripe */
2497                 for (stripe = 0; stripe < nr_data; stripe++) {
2498                         sector = sector_in_rbio(rbio, stripe, sectornr, 0);
2499                         pointers[stripe] = kmap_local_page(sector->page) +
2500                                            sector->pgoff;
2501                 }
2502
2503                 if (has_qstripe) {
2504                         assert_rbio(rbio);
2505                         /* RAID6, call the library function to fill in our P/Q */
2506                         raid6_call.gen_syndrome(rbio->real_stripes, sectorsize,
2507                                                 pointers);
2508                 } else {
2509                         /* raid5 */
2510                         memcpy(pointers[nr_data], pointers[0], sectorsize);
2511                         run_xor(pointers + 1, nr_data - 1, sectorsize);
2512                 }
2513
2514                 /* Check scrubbing parity and repair it */
2515                 sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2516                 parity = kmap_local_page(sector->page) + sector->pgoff;
2517                 if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0)
2518                         memcpy(parity, pointers[rbio->scrubp], sectorsize);
2519                 else
2520                         /* Parity is right, needn't writeback */
2521                         bitmap_clear(&rbio->dbitmap, sectornr, 1);
2522                 kunmap_local(parity);
2523
2524                 for (stripe = nr_data - 1; stripe >= 0; stripe--)
2525                         kunmap_local(pointers[stripe]);
2526         }
2527
2528         kunmap_local(pointers[nr_data]);
2529         __free_page(p_sector.page);
2530         p_sector.page = NULL;
2531         if (q_sector.page) {
2532                 kunmap_local(pointers[rbio->real_stripes - 1]);
2533                 __free_page(q_sector.page);
2534                 q_sector.page = NULL;
2535         }
2536
2537         /*
2538          * time to start writing.  Make bios for everything from the
2539          * higher layers (the bio_list in our rbio) and our p/q.  Ignore
2540          * everything else.
2541          */
2542         for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) {
2543                 struct sector_ptr *sector;
2544
2545                 sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2546                 ret = rbio_add_io_sector(rbio, &bio_list, sector, rbio->scrubp,
2547                                          sectornr, REQ_OP_WRITE);
2548                 if (ret)
2549                         goto cleanup;
2550         }
2551
2552         if (!is_replace)
2553                 goto submit_write;
2554
2555         /*
2556          * Replace is running and our parity stripe needs to be duplicated to
2557          * the target device.  Check we have a valid source stripe number.
2558          */
2559         ASSERT(rbio->bioc->replace_stripe_src >= 0);
2560         for_each_set_bit(sectornr, pbitmap, rbio->stripe_nsectors) {
2561                 struct sector_ptr *sector;
2562
2563                 sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr);
2564                 ret = rbio_add_io_sector(rbio, &bio_list, sector,
2565                                          rbio->real_stripes,
2566                                          sectornr, REQ_OP_WRITE);
2567                 if (ret)
2568                         goto cleanup;
2569         }
2570
2571 submit_write:
2572         submit_write_bios(rbio, &bio_list);
2573         return 0;
2574
2575 cleanup:
2576         bio_list_put(&bio_list);
2577         return ret;
2578 }
2579
2580 static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2581 {
2582         if (stripe >= 0 && stripe < rbio->nr_data)
2583                 return 1;
2584         return 0;
2585 }
2586
2587 static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
2588 {
2589         void **pointers = NULL;
2590         void **unmap_array = NULL;
2591         int sector_nr;
2592         int ret = 0;
2593
2594         /*
2595          * @pointers array stores the pointer for each sector.
2596          *
2597          * @unmap_array stores copy of pointers that does not get reordered
2598          * during reconstruction so that kunmap_local works.
2599          */
2600         pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
2601         unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
2602         if (!pointers || !unmap_array) {
2603                 ret = -ENOMEM;
2604                 goto out;
2605         }
2606
2607         for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
2608                 int dfail = 0, failp = -1;
2609                 int faila;
2610                 int failb;
2611                 int found_errors;
2612
2613                 found_errors = get_rbio_veritical_errors(rbio, sector_nr,
2614                                                          &faila, &failb);
2615                 if (found_errors > rbio->bioc->max_errors) {
2616                         ret = -EIO;
2617                         goto out;
2618                 }
2619                 if (found_errors == 0)
2620                         continue;
2621
2622                 /* We should have at least one error here. */
2623                 ASSERT(faila >= 0 || failb >= 0);
2624
2625                 if (is_data_stripe(rbio, faila))
2626                         dfail++;
2627                 else if (is_parity_stripe(faila))
2628                         failp = faila;
2629
2630                 if (is_data_stripe(rbio, failb))
2631                         dfail++;
2632                 else if (is_parity_stripe(failb))
2633                         failp = failb;
2634                 /*
2635                  * Because we can not use a scrubbing parity to repair the
2636                  * data, so the capability of the repair is declined.  (In the
2637                  * case of RAID5, we can not repair anything.)
2638                  */
2639                 if (dfail > rbio->bioc->max_errors - 1) {
2640                         ret = -EIO;
2641                         goto out;
2642                 }
2643                 /*
2644                  * If all data is good, only parity is correctly, just repair
2645                  * the parity, no need to recover data stripes.
2646                  */
2647                 if (dfail == 0)
2648                         continue;
2649
2650                 /*
2651                  * Here means we got one corrupted data stripe and one
2652                  * corrupted parity on RAID6, if the corrupted parity is
2653                  * scrubbing parity, luckily, use the other one to repair the
2654                  * data, or we can not repair the data stripe.
2655                  */
2656                 if (failp != rbio->scrubp) {
2657                         ret = -EIO;
2658                         goto out;
2659                 }
2660
2661                 ret = recover_vertical(rbio, sector_nr, pointers, unmap_array);
2662                 if (ret < 0)
2663                         goto out;
2664         }
2665 out:
2666         kfree(pointers);
2667         kfree(unmap_array);
2668         return ret;
2669 }
2670
2671 static int scrub_assemble_read_bios(struct btrfs_raid_bio *rbio)
2672 {
2673         struct bio_list bio_list = BIO_EMPTY_LIST;
2674         int total_sector_nr;
2675         int ret = 0;
2676
2677         /* Build a list of bios to read all the missing parts. */
2678         for (total_sector_nr = 0; total_sector_nr < rbio->nr_sectors;
2679              total_sector_nr++) {
2680                 int sectornr = total_sector_nr % rbio->stripe_nsectors;
2681                 int stripe = total_sector_nr / rbio->stripe_nsectors;
2682                 struct sector_ptr *sector;
2683
2684                 /* No data in the vertical stripe, no need to read. */
2685                 if (!test_bit(sectornr, &rbio->dbitmap))
2686                         continue;
2687
2688                 /*
2689                  * We want to find all the sectors missing from the rbio and
2690                  * read them from the disk. If sector_in_rbio() finds a sector
2691                  * in the bio list we don't need to read it off the stripe.
2692                  */
2693                 sector = sector_in_rbio(rbio, stripe, sectornr, 1);
2694                 if (sector)
2695                         continue;
2696
2697                 sector = rbio_stripe_sector(rbio, stripe, sectornr);
2698                 /*
2699                  * The bio cache may have handed us an uptodate sector.  If so,
2700                  * use it.
2701                  */
2702                 if (sector->uptodate)
2703                         continue;
2704
2705                 ret = rbio_add_io_sector(rbio, &bio_list, sector, stripe,
2706                                          sectornr, REQ_OP_READ);
2707                 if (ret) {
2708                         bio_list_put(&bio_list);
2709                         return ret;
2710                 }
2711         }
2712
2713         submit_read_wait_bio_list(rbio, &bio_list);
2714         return 0;
2715 }
2716
2717 static void scrub_rbio(struct btrfs_raid_bio *rbio)
2718 {
2719         int sector_nr;
2720         int ret;
2721
2722         ret = alloc_rbio_essential_pages(rbio);
2723         if (ret)
2724                 goto out;
2725
2726         bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors);
2727
2728         ret = scrub_assemble_read_bios(rbio);
2729         if (ret < 0)
2730                 goto out;
2731
2732         /* We may have some failures, recover the failed sectors first. */
2733         ret = recover_scrub_rbio(rbio);
2734         if (ret < 0)
2735                 goto out;
2736
2737         /*
2738          * We have every sector properly prepared. Can finish the scrub
2739          * and writeback the good content.
2740          */
2741         ret = finish_parity_scrub(rbio);
2742         wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);
2743         for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
2744                 int found_errors;
2745
2746                 found_errors = get_rbio_veritical_errors(rbio, sector_nr, NULL, NULL);
2747                 if (found_errors > rbio->bioc->max_errors) {
2748                         ret = -EIO;
2749                         break;
2750                 }
2751         }
2752 out:
2753         rbio_orig_end_io(rbio, errno_to_blk_status(ret));
2754 }
2755
2756 static void scrub_rbio_work_locked(struct work_struct *work)
2757 {
2758         scrub_rbio(container_of(work, struct btrfs_raid_bio, work));
2759 }
2760
2761 void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
2762 {
2763         if (!lock_stripe_add(rbio))
2764                 start_async_work(rbio, scrub_rbio_work_locked);
2765 }
2766
2767 /*
2768  * This is for scrub call sites where we already have correct data contents.
2769  * This allows us to avoid reading data stripes again.
2770  *
2771  * Unfortunately here we have to do page copy, other than reusing the pages.
2772  * This is due to the fact rbio has its own page management for its cache.
2773  */
2774 void raid56_parity_cache_data_pages(struct btrfs_raid_bio *rbio,
2775                                     struct page **data_pages, u64 data_logical)
2776 {
2777         const u64 offset_in_full_stripe = data_logical -
2778                                           rbio->bioc->full_stripe_logical;
2779         const int page_index = offset_in_full_stripe >> PAGE_SHIFT;
2780         const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
2781         const u32 sectors_per_page = PAGE_SIZE / sectorsize;
2782         int ret;
2783
2784         /*
2785          * If we hit ENOMEM temporarily, but later at
2786          * raid56_parity_submit_scrub_rbio() time it succeeded, we just do
2787          * the extra read, not a big deal.
2788          *
2789          * If we hit ENOMEM later at raid56_parity_submit_scrub_rbio() time,
2790          * the bio would got proper error number set.
2791          */
2792         ret = alloc_rbio_data_pages(rbio);
2793         if (ret < 0)
2794                 return;
2795
2796         /* data_logical must be at stripe boundary and inside the full stripe. */
2797         ASSERT(IS_ALIGNED(offset_in_full_stripe, BTRFS_STRIPE_LEN));
2798         ASSERT(offset_in_full_stripe < (rbio->nr_data << BTRFS_STRIPE_LEN_SHIFT));
2799
2800         for (int page_nr = 0; page_nr < (BTRFS_STRIPE_LEN >> PAGE_SHIFT); page_nr++) {
2801                 struct page *dst = rbio->stripe_pages[page_nr + page_index];
2802                 struct page *src = data_pages[page_nr];
2803
2804                 memcpy_page(dst, 0, src, 0, PAGE_SIZE);
2805                 for (int sector_nr = sectors_per_page * page_index;
2806                      sector_nr < sectors_per_page * (page_index + 1);
2807                      sector_nr++)
2808                         rbio->stripe_sectors[sector_nr].uptodate = true;
2809         }
2810 }