1 // SPDX-License-Identifier: GPL-2.0
3 * Copyright (C) 2012 Fusion-io All rights reserved.
4 * Copyright (C) 2012 Intel Corp. All rights reserved.
7 #include <linux/sched.h>
9 #include <linux/slab.h>
10 #include <linux/blkdev.h>
11 #include <linux/raid/pq.h>
12 #include <linux/hash.h>
13 #include <linux/list_sort.h>
14 #include <linux/raid/xor.h>
20 #include "async-thread.h"
22 /* set when additional merges to this rbio are not allowed */
23 #define RBIO_RMW_LOCKED_BIT 1
26 * set when this rbio is sitting in the hash, but it is just a cache
29 #define RBIO_CACHE_BIT 2
32 * set when it is safe to trust the stripe_pages for caching
34 #define RBIO_CACHE_READY_BIT 3
36 #define RBIO_CACHE_SIZE 1024
40 BTRFS_RBIO_READ_REBUILD,
41 BTRFS_RBIO_PARITY_SCRUB,
42 BTRFS_RBIO_REBUILD_MISSING,
45 struct btrfs_raid_bio {
46 struct btrfs_fs_info *fs_info;
47 struct btrfs_bio *bbio;
49 /* while we're doing rmw on a stripe
50 * we put it into a hash table so we can
51 * lock the stripe and merge more rbios
54 struct list_head hash_list;
57 * LRU list for the stripe cache
59 struct list_head stripe_cache;
62 * for scheduling work in the helper threads
64 struct btrfs_work work;
67 * bio list and bio_list_lock are used
68 * to add more bios into the stripe
69 * in hopes of avoiding the full rmw
71 struct bio_list bio_list;
72 spinlock_t bio_list_lock;
74 /* also protected by the bio_list_lock, the
75 * plug list is used by the plugging code
76 * to collect partial bios while plugged. The
77 * stripe locking code also uses it to hand off
78 * the stripe lock to the next pending IO
80 struct list_head plug_list;
83 * flags that tell us if it is safe to
88 /* size of each individual stripe on disk */
91 /* number of data stripes (no p/q) */
98 * set if we're doing a parity rebuild
99 * for a read from higher up, which is handled
100 * differently from a parity rebuild as part of
103 enum btrfs_rbio_ops operation;
105 /* first bad stripe */
108 /* second bad stripe (for raid6 use) */
113 * number of pages needed to represent the full
119 * size of all the bios in the bio_list. This
120 * helps us decide if the rbio maps to a full
129 atomic_t stripes_pending;
133 * these are two arrays of pointers. We allocate the
134 * rbio big enough to hold them both and setup their
135 * locations when the rbio is allocated
138 /* pointers to pages that we allocated for
139 * reading/writing stripes directly from the disk (including P/Q)
141 struct page **stripe_pages;
144 * pointers to the pages in the bio_list. Stored
145 * here for faster lookup
147 struct page **bio_pages;
150 * bitmap to record which horizontal stripe has data
152 unsigned long *dbitmap;
154 /* allocated with real_stripes-many pointers for finish_*() calls */
155 void **finish_pointers;
157 /* allocated with stripe_npages-many bits for finish_*() calls */
158 unsigned long *finish_pbitmap;
161 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
162 static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
163 static void rmw_work(struct btrfs_work *work);
164 static void read_rebuild_work(struct btrfs_work *work);
165 static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
166 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
167 static void __free_raid_bio(struct btrfs_raid_bio *rbio);
168 static void index_rbio_pages(struct btrfs_raid_bio *rbio);
169 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio);
171 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
173 static void scrub_parity_work(struct btrfs_work *work);
/*
 * Initialise rbio->work with work_func (using btrfs_rmw_helper) and queue it
 * on the rmw_workers queue of rbio->fs_info.
 */
175 static void start_async_work(struct btrfs_raid_bio *rbio, btrfs_func_t work_func)
177 btrfs_init_work(&rbio->work, btrfs_rmw_helper, work_func, NULL, NULL);
178 btrfs_queue_work(rbio->fs_info->rmw_workers, &rbio->work);
182 * the stripe hash table is used for locking, and to collect
183 * bios in hopes of making a full stripe
/*
 * Allocate and install info->stripe_hash_table.
 *
 * An already installed table is tested for first.  Otherwise a zeroed table
 * sized for 1 << BTRFS_STRIPE_HASH_TABLE_BITS entries is obtained with
 * kvzalloc(), the cache lock and the cache LRU list are initialised, each
 * loop iteration initialises cur->hash_list and cur->lock (cur presumably
 * walks the buckets), and the table is published with cmpxchg() so that it
 * is only installed if the pointer was still NULL.
 *
 * NOTE(review): the allocation-failure path, the handling of a lost
 * cmpxchg() race and the return values are not visible in this excerpt.
 */
185 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info)
187 struct btrfs_stripe_hash_table *table;
188 struct btrfs_stripe_hash_table *x;
189 struct btrfs_stripe_hash *cur;
190 struct btrfs_stripe_hash *h;
191 int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS;
195 if (info->stripe_hash_table)
199 * The table is large, starting with order 4 and can go as high as
200 * order 7 in case lock debugging is turned on.
202 * Try harder to allocate and fallback to vmalloc to lower the chance
203 * of a failing mount.
205 table_size = sizeof(*table) + sizeof(*h) * num_entries;
206 table = kvzalloc(table_size, GFP_KERNEL);
210 spin_lock_init(&table->cache_lock);
211 INIT_LIST_HEAD(&table->stripe_cache);
215 for (i = 0; i < num_entries; i++) {
217 INIT_LIST_HEAD(&cur->hash_list);
218 spin_lock_init(&cur->lock);
221 x = cmpxchg(&info->stripe_hash_table, NULL, table);
228 * caching an rbio means to copy anything from the
229 * bio_pages array into the stripe_pages array. We
230 * use the page uptodate bit in the stripe cache array
231 * to indicate if it has valid data
233 * once the caching is done, we set the cache ready
/*
 * Populate rbio->stripe_pages from rbio->bio_pages.
 *
 * alloc_rbio_pages() is called first.  Then, for each index below
 * rbio->nr_pages, a NULL bio_pages entry is tested for (presumably skipped);
 * the bio page and the stripe page at that index are kmap()ed and
 * kunmap()ed, and the stripe page is marked uptodate.  Finally
 * RBIO_CACHE_READY_BIT is set in rbio->flags.
 *
 * NOTE(review): the statement that copies between the two mappings and the
 * handling of an alloc_rbio_pages() failure are not visible in this excerpt.
 */
236 static void cache_rbio_pages(struct btrfs_raid_bio *rbio)
243 ret = alloc_rbio_pages(rbio);
247 for (i = 0; i < rbio->nr_pages; i++) {
248 if (!rbio->bio_pages[i])
251 s = kmap(rbio->bio_pages[i]);
252 d = kmap(rbio->stripe_pages[i]);
256 kunmap(rbio->bio_pages[i]);
257 kunmap(rbio->stripe_pages[i]);
258 SetPageUptodate(rbio->stripe_pages[i]);
260 set_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
264 * we hash on the first logical address of the stripe
/*
 * Return the hash bucket for rbio: hash_64() of the first logical address of
 * the stripe (rbio->bbio->raid_map[0]) shifted right by 16 bits, reduced to
 * BTRFS_STRIPE_HASH_TABLE_BITS bits.
 */
266 static int rbio_bucket(struct btrfs_raid_bio *rbio)
268 u64 num = rbio->bbio->raid_map[0];
271 * we shift down quite a bit. We're using byte
272 * addressing, and most of the lower bits are zeros.
273 * This tends to upset hash_64, and it consistently
274 * returns just one or two different values.
276 * shifting off the lower bits fixes things.
278 return hash_64(num >> 16, BTRFS_STRIPE_HASH_TABLE_BITS);
282 * stealing an rbio means taking all the uptodate pages from the stripe
283 * array in the source rbio and putting them into the destination rbio
/*
 * Move cached stripe pages from src to dest.
 *
 * RBIO_CACHE_READY_BIT of src is tested first (presumably returning early
 * when it is clear).  For each index below dest->nr_pages, a src stripe page
 * that is missing or not uptodate is tested for (presumably skipped);
 * otherwise the page is stored in dest->stripe_pages[i] and the src slot is
 * set to NULL.
 *
 * NOTE(review): what happens to the page dest previously held in that slot
 * (read into d) is not visible in this excerpt.
 */
285 static void steal_rbio(struct btrfs_raid_bio *src, struct btrfs_raid_bio *dest)
291 if (!test_bit(RBIO_CACHE_READY_BIT, &src->flags))
294 for (i = 0; i < dest->nr_pages; i++) {
295 s = src->stripe_pages[i];
296 if (!s || !PageUptodate(s)) {
300 d = dest->stripe_pages[i];
304 dest->stripe_pages[i] = s;
305 src->stripe_pages[i] = NULL;
310 * merging means we take the bio_list from the victim and
311 * splice it into the destination. The victim should
312 * be discarded afterwards.
314 * must be called with dest->bio_list_lock held
/*
 * Merge the bio_list of victim into the bio_list of dest, add the
 * bio_list_bytes and generic_bio_cnt counters of victim to those of dest,
 * and re-initialise the bio_list of victim so that it is empty.
 */
316 static void merge_rbio(struct btrfs_raid_bio *dest,
317 struct btrfs_raid_bio *victim)
319 bio_list_merge(&dest->bio_list, &victim->bio_list);
320 dest->bio_list_bytes += victim->bio_list_bytes;
321 dest->generic_bio_cnt += victim->generic_bio_cnt;
322 bio_list_init(&victim->bio_list);
326 * used to prune items that are in the cache. The caller
327 * must hold the hash table lock.
/*
 * Drop rbio from the stripe cache LRU.
 *
 * If RBIO_CACHE_BIT is still set it is cleared, the rbio is unlinked from
 * the stripe_cache list and table->cache_size is decremented.  When the
 * bio_list of the rbio is empty and the rbio is still on a hash list, it is
 * also unlinked from the hash list and the corresponding reference is
 * dropped with refcount_dec().  rbio->bio_list_lock is taken around these
 * updates, and both it and the bucket lock h->lock are released afterwards.
 *
 * NOTE(review): the acquisition of h->lock and the condition guarding the
 * final __free_raid_bio() call are not visible in this excerpt.
 */
329 static void __remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
331 int bucket = rbio_bucket(rbio);
332 struct btrfs_stripe_hash_table *table;
333 struct btrfs_stripe_hash *h;
337 * check the bit again under the hash table lock.
339 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
342 table = rbio->fs_info->stripe_hash_table;
343 h = table->table + bucket;
345 /* hold the lock for the bucket because we may be
346 * removing it from the hash table
351 * hold the lock for the bio list because we need
352 * to make sure the bio list is empty
354 spin_lock(&rbio->bio_list_lock);
356 if (test_and_clear_bit(RBIO_CACHE_BIT, &rbio->flags)) {
357 list_del_init(&rbio->stripe_cache);
358 table->cache_size -= 1;
361 /* if the bio list isn't empty, this rbio is
362 * still involved in an IO. We take it out
363 * of the cache list, and drop the ref that
364 * was held for the list.
366 * If the bio_list was empty, we also remove
367 * the rbio from the hash_table, and drop
368 * the corresponding ref
370 if (bio_list_empty(&rbio->bio_list)) {
371 if (!list_empty(&rbio->hash_list)) {
372 list_del_init(&rbio->hash_list);
373 refcount_dec(&rbio->refs);
374 BUG_ON(!list_empty(&rbio->plug_list));
379 spin_unlock(&rbio->bio_list_lock);
380 spin_unlock(&h->lock);
383 __free_raid_bio(rbio);
387 * prune a given rbio from the cache
/*
 * Locked wrapper around __remove_rbio_from_cache(): RBIO_CACHE_BIT is tested
 * first, then the helper is called with table->cache_lock held via
 * spin_lock_irqsave().
 */
389 static void remove_rbio_from_cache(struct btrfs_raid_bio *rbio)
391 struct btrfs_stripe_hash_table *table;
394 if (!test_bit(RBIO_CACHE_BIT, &rbio->flags))
397 table = rbio->fs_info->stripe_hash_table;
399 spin_lock_irqsave(&table->cache_lock, flags);
400 __remove_rbio_from_cache(rbio);
401 spin_unlock_irqrestore(&table->cache_lock, flags);
405 * remove everything in the cache
/*
 * Empty the stripe cache: with table->cache_lock held, repeatedly take the
 * first entry of table->stripe_cache and pass it to
 * __remove_rbio_from_cache() until the list is empty.
 */
407 static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info)
409 struct btrfs_stripe_hash_table *table;
411 struct btrfs_raid_bio *rbio;
413 table = info->stripe_hash_table;
415 spin_lock_irqsave(&table->cache_lock, flags);
416 while (!list_empty(&table->stripe_cache)) {
417 rbio = list_entry(table->stripe_cache.next,
418 struct btrfs_raid_bio,
420 __remove_rbio_from_cache(rbio);
422 spin_unlock_irqrestore(&table->cache_lock, flags);
426 * remove all cached entries and free the hash table
/*
 * Tear down info->stripe_hash_table: a missing table is tested for first,
 * then the rbio cache is cleared, the table is released with kvfree() and
 * the pointer is reset to NULL.
 */
429 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info)
431 if (!info->stripe_hash_table)
433 btrfs_clear_rbio_cache(info);
434 kvfree(info->stripe_hash_table);
435 info->stripe_hash_table = NULL;
439 * insert an rbio into the stripe cache. It
440 * must have already been prepared by calling
443 * If this rbio was already cached, it gets
444 * moved to the front of the lru.
446 * If the size of the rbio cache is too big, we
/*
 * Put rbio on the stripe cache LRU.
 *
 * RBIO_CACHE_READY_BIT is tested first.  Under table->cache_lock and
 * rbio->bio_list_lock, RBIO_CACHE_BIT is set and an extra reference is taken
 * if the bit was not already set.  An rbio that is already on a stripe_cache
 * list is moved to the head of table->stripe_cache; the list_add() line adds
 * it at the head and increments cache_size.  When cache_size exceeds
 * RBIO_CACHE_SIZE, the entry at the tail of the list is looked up and handed
 * to __remove_rbio_from_cache().
 *
 * NOTE(review): the else line between the two list operations and any
 * condition guarding the eviction call are not visible in this excerpt.
 */
449 static void cache_rbio(struct btrfs_raid_bio *rbio)
451 struct btrfs_stripe_hash_table *table;
454 if (!test_bit(RBIO_CACHE_READY_BIT, &rbio->flags))
457 table = rbio->fs_info->stripe_hash_table;
459 spin_lock_irqsave(&table->cache_lock, flags);
460 spin_lock(&rbio->bio_list_lock);
462 /* bump our ref if we were not in the list before */
463 if (!test_and_set_bit(RBIO_CACHE_BIT, &rbio->flags))
464 refcount_inc(&rbio->refs);
466 if (!list_empty(&rbio->stripe_cache)){
467 list_move(&rbio->stripe_cache, &table->stripe_cache);
469 list_add(&rbio->stripe_cache, &table->stripe_cache);
470 table->cache_size += 1;
473 spin_unlock(&rbio->bio_list_lock);
475 if (table->cache_size > RBIO_CACHE_SIZE) {
476 struct btrfs_raid_bio *found;
478 found = list_entry(table->stripe_cache.prev,
479 struct btrfs_raid_bio,
483 __remove_rbio_from_cache(found);
486 spin_unlock_irqrestore(&table->cache_lock, flags);
490 * helper function to run the xor_blocks api. It is only
491 * able to do MAX_XOR_BLOCKS at a time, so we need to
/*
 * XOR source buffers from pages[] into the destination pages[src_cnt] using
 * xor_blocks(), passing at most MAX_XOR_BLOCKS sources per call.  After each
 * call src_cnt is reduced and src_off advanced by the number of sources just
 * consumed.
 *
 * NOTE(review): the enclosing loop and the initial value of src_off are not
 * visible in this excerpt.
 */
494 static void run_xor(void **pages, int src_cnt, ssize_t len)
498 void *dest = pages[src_cnt];
501 xor_src_cnt = min(src_cnt, MAX_XOR_BLOCKS);
502 xor_blocks(xor_src_cnt, len, dest, pages + src_off);
504 src_cnt -= xor_src_cnt;
505 src_off += xor_src_cnt;
510 * Returns true if the bio list inside this rbio covers an entire stripe (no
/*
 * Compare rbio->bio_list_bytes with the combined size of all data stripes
 * (rbio->nr_data * rbio->stripe_len).  BUG_ON() fires if the byte count is
 * larger than that.
 *
 * NOTE(review): bio_list_bytes is sampled into size in the declaration,
 * before bio_list_lock is taken, so the lock does not cover the read it
 * appears meant to protect.  Confirm whether that is intentional.
 * NOTE(review): the result variable and return statement are not visible in
 * this excerpt.
 */
513 static int rbio_is_full(struct btrfs_raid_bio *rbio)
516 unsigned long size = rbio->bio_list_bytes;
519 spin_lock_irqsave(&rbio->bio_list_lock, flags);
520 if (size != rbio->nr_data * rbio->stripe_len)
522 BUG_ON(size > rbio->nr_data * rbio->stripe_len);
523 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
529 * returns 1 if it is safe to merge two rbios together.
530 * The merging is safe if the two rbios correspond to
531 * the same stripe and if they are both going in the same
532 * direction (read vs write), and if neither one is
533 * locked for final IO
535 * The caller is responsible for locking such that
536 * rmw_locked is safe to test
538 static int rbio_can_merge(struct btrfs_raid_bio *last,
539 struct btrfs_raid_bio *cur)
541 if (test_bit(RBIO_RMW_LOCKED_BIT, &last->flags) ||
542 test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags))
546 * we can't merge with cached rbios, since the
547 * idea is that when we merge the destination
548 * rbio is going to run our IO for us. We can
549 * steal from cached rbios though, other functions
552 if (test_bit(RBIO_CACHE_BIT, &last->flags) ||
553 test_bit(RBIO_CACHE_BIT, &cur->flags))
556 if (last->bbio->raid_map[0] !=
557 cur->bbio->raid_map[0])
560 /* we can't merge with different operations */
561 if (last->operation != cur->operation)
564 * We need to read the full stripe from the drive,
565 * check and repair the parity and write the new results.
567 * We're not allowed to add any new bios to the
568 * bio list here, anyone else that wants to
569 * change this stripe needs to do their own rmw.
571 if (last->operation == BTRFS_RBIO_PARITY_SCRUB)
574 if (last->operation == BTRFS_RBIO_REBUILD_MISSING)
577 if (last->operation == BTRFS_RBIO_READ_REBUILD) {
578 int fa = last->faila;
579 int fb = last->failb;
580 int cur_fa = cur->faila;
581 int cur_fb = cur->failb;
583 if (last->faila >= last->failb) {
588 if (cur->faila >= cur->failb) {
593 if (fa != cur_fa || fb != cur_fb)
/*
 * Flat index into rbio->stripe_pages for page number index of the given
 * stripe: stripe * rbio->stripe_npages + index.
 */
599 static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe,
602 return stripe * rbio->stripe_npages + index;
606 * these are just the pages from the rbio array, not from anything
607 * the FS sent down to us
/*
 * Return the rbio->stripe_pages entry for page number index of the given
 * stripe, located with rbio_stripe_page_index().
 */
609 static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe,
612 return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)];
616 * helper to index into the pstripe
/*
 * Return page number index of the stripe at position rbio->nr_data, i.e. the
 * first stripe after the data stripes.
 */
618 static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index)
620 return rbio_stripe_page(rbio, rbio->nr_data, index);
624 * helper to index into the qstripe, returns null
625 * if there is no qstripe
/*
 * Return page number index of the stripe at position rbio->nr_data + 1.
 * The case nr_data + 1 == real_stripes (no stripe at that position) is
 * tested first.
 *
 * NOTE(review): the statement executed for that case is not visible in this
 * excerpt; presumably it returns NULL.
 */
627 static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index)
629 if (rbio->nr_data + 1 == rbio->real_stripes)
631 return rbio_stripe_page(rbio, rbio->nr_data + 1, index);
635 * The first stripe in the table for a logical address
636 * has the lock. rbios are added in one of three ways:
638 * 1) Nobody has the stripe locked yet. The rbio is given
639 * the lock and 0 is returned. The caller must start the IO
642 * 2) Someone has the stripe locked, but we're able to merge
643 * with the lock owner. The rbio is freed and the IO will
644 * start automatically along with the existing rbio. 1 is returned.
646 * 3) Someone has the stripe locked, but we're not able to merge.
647 * The rbio is added to the lock owner's plug list, or merged into
648 * an rbio already on the plug list. When the lock owner unlocks,
649 * the next rbio on the list is run and the IO is started automatically.
652 * If we return 0, the caller still owns the rbio and must continue with
653 * IO submission. If we return 1, the caller must assume the rbio has
654 * already been freed.
656 static noinline int lock_stripe_add(struct btrfs_raid_bio *rbio)
658 int bucket = rbio_bucket(rbio);
659 struct btrfs_stripe_hash *h = rbio->fs_info->stripe_hash_table->table + bucket;
660 struct btrfs_raid_bio *cur;
661 struct btrfs_raid_bio *pending;
663 struct btrfs_raid_bio *freeit = NULL;
664 struct btrfs_raid_bio *cache_drop = NULL;
667 spin_lock_irqsave(&h->lock, flags);
668 list_for_each_entry(cur, &h->hash_list, hash_list) {
669 if (cur->bbio->raid_map[0] == rbio->bbio->raid_map[0]) {
670 spin_lock(&cur->bio_list_lock);
672 /* can we steal this cached rbio's pages? */
673 if (bio_list_empty(&cur->bio_list) &&
674 list_empty(&cur->plug_list) &&
675 test_bit(RBIO_CACHE_BIT, &cur->flags) &&
676 !test_bit(RBIO_RMW_LOCKED_BIT, &cur->flags)) {
677 list_del_init(&cur->hash_list);
678 refcount_dec(&cur->refs);
680 steal_rbio(cur, rbio);
682 spin_unlock(&cur->bio_list_lock);
687 /* can we merge into the lock owner? */
688 if (rbio_can_merge(cur, rbio)) {
689 merge_rbio(cur, rbio);
690 spin_unlock(&cur->bio_list_lock);
698 * we couldn't merge with the running
699 * rbio, see if we can merge with the
700 * pending ones. We don't have to
701 * check for rmw_locked because there
702 * is no way they are inside finish_rmw
705 list_for_each_entry(pending, &cur->plug_list,
707 if (rbio_can_merge(pending, rbio)) {
708 merge_rbio(pending, rbio);
709 spin_unlock(&cur->bio_list_lock);
716 /* no merging, put us on the tail of the plug list,
717 * our rbio will be started when the currently
718 * running rbio unlocks
720 list_add_tail(&rbio->plug_list, &cur->plug_list);
721 spin_unlock(&cur->bio_list_lock);
727 refcount_inc(&rbio->refs);
728 list_add(&rbio->hash_list, &h->hash_list);
730 spin_unlock_irqrestore(&h->lock, flags);
732 remove_rbio_from_cache(cache_drop);
734 __free_raid_bio(freeit);
739 * called as rmw or parity rebuild is completed. If the plug list has more
740 * rbios waiting for this stripe, the next one on the list will be started
742 static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
745 struct btrfs_stripe_hash *h;
749 bucket = rbio_bucket(rbio);
750 h = rbio->fs_info->stripe_hash_table->table + bucket;
752 if (list_empty(&rbio->plug_list))
755 spin_lock_irqsave(&h->lock, flags);
756 spin_lock(&rbio->bio_list_lock);
758 if (!list_empty(&rbio->hash_list)) {
760 * if we're still cached and there is no other IO
761 * to perform, just leave this rbio here for others
762 * to steal from later
764 if (list_empty(&rbio->plug_list) &&
765 test_bit(RBIO_CACHE_BIT, &rbio->flags)) {
767 clear_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
768 BUG_ON(!bio_list_empty(&rbio->bio_list));
772 list_del_init(&rbio->hash_list);
773 refcount_dec(&rbio->refs);
776 * we use the plug list to hold all the rbios
777 * waiting for the chance to lock this stripe.
778 * hand the lock over to one of them.
780 if (!list_empty(&rbio->plug_list)) {
781 struct btrfs_raid_bio *next;
782 struct list_head *head = rbio->plug_list.next;
784 next = list_entry(head, struct btrfs_raid_bio,
787 list_del_init(&rbio->plug_list);
789 list_add(&next->hash_list, &h->hash_list);
790 refcount_inc(&next->refs);
791 spin_unlock(&rbio->bio_list_lock);
792 spin_unlock_irqrestore(&h->lock, flags);
794 if (next->operation == BTRFS_RBIO_READ_REBUILD)
795 start_async_work(next, read_rebuild_work);
796 else if (next->operation == BTRFS_RBIO_REBUILD_MISSING) {
797 steal_rbio(rbio, next);
798 start_async_work(next, read_rebuild_work);
799 } else if (next->operation == BTRFS_RBIO_WRITE) {
800 steal_rbio(rbio, next);
801 start_async_work(next, rmw_work);
802 } else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
803 steal_rbio(rbio, next);
804 start_async_work(next, scrub_parity_work);
811 spin_unlock(&rbio->bio_list_lock);
812 spin_unlock_irqrestore(&h->lock, flags);
816 remove_rbio_from_cache(rbio);
/*
 * Drop one reference on rbio with refcount_dec_and_test(); the result is
 * tested before the teardown below (presumably returning early unless this
 * was the last reference).
 *
 * The teardown warns if the rbio is still on the stripe cache list or the
 * hash list or still has bios queued, frees every non-NULL stripe page and
 * clears its slot, and drops the bbio reference with btrfs_put_bbio().
 *
 * NOTE(review): freeing of the rbio structure itself is not visible in this
 * excerpt.
 */
819 static void __free_raid_bio(struct btrfs_raid_bio *rbio)
823 if (!refcount_dec_and_test(&rbio->refs))
826 WARN_ON(!list_empty(&rbio->stripe_cache));
827 WARN_ON(!list_empty(&rbio->hash_list));
828 WARN_ON(!bio_list_empty(&rbio->bio_list));
830 for (i = 0; i < rbio->nr_pages; i++) {
831 if (rbio->stripe_pages[i]) {
832 __free_page(rbio->stripe_pages[i]);
833 rbio->stripe_pages[i] = NULL;
837 btrfs_put_bbio(rbio->bbio);
841 static void rbio_endio_bio_list(struct bio *cur, blk_status_t err)
848 cur->bi_status = err;
855 * this frees the rbio and runs through all the bios in the
856 * bio_list and calls end_io on them
/*
 * Complete the bios queued on rbio with status err.
 *
 * The bio list is detached with bio_list_get(); a non-zero generic_bio_cnt
 * is subtracted from the bio counter of the filesystem; a second
 * bio_list_get() collects bios that were added in the meantime; one rbio
 * reference is dropped through __free_raid_bio(); and both detached lists
 * are passed to rbio_endio_bio_list().
 *
 * NOTE(review): the unlock_stripe() call that the inline comment refers to
 * is not visible in this excerpt.
 */
858 static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err)
860 struct bio *cur = bio_list_get(&rbio->bio_list);
863 if (rbio->generic_bio_cnt)
864 btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
867 * At this moment, rbio->bio_list is empty, however since rbio does not
868 * always have RBIO_RMW_LOCKED_BIT set and rbio is still linked on the
869 * hash list, rbio may be merged with others so that rbio->bio_list
871 * Once unlock_stripe() is done, rbio->bio_list will not be updated any
872 * more and we can call bio_endio() on all queued bios.
875 extra = bio_list_get(&rbio->bio_list);
876 __free_raid_bio(rbio);
878 rbio_endio_bio_list(cur, err);
880 rbio_endio_bio_list(extra, err);
884 * end io function used by finish_rmw. When we finally
885 * get here, we've written a full stripe
/*
 * Completion handler for the write bios of an rbio (the rbio is taken from
 * bio->bi_private).
 *
 * A failed stripe is recorded through fail_bio_stripe().  Once
 * stripes_pending drops to zero, the error count is compared against
 * max_errors, which is 0 for BTRFS_RBIO_PARITY_SCRUB and bbio->max_errors
 * otherwise, and the original bios are finished with rbio_orig_end_io().
 *
 * NOTE(review): the condition around fail_bio_stripe() and the statement
 * executed when the error limit is exceeded are not visible in this excerpt.
 */
887 static void raid_write_end_io(struct bio *bio)
889 struct btrfs_raid_bio *rbio = bio->bi_private;
890 blk_status_t err = bio->bi_status;
894 fail_bio_stripe(rbio, bio);
898 if (!atomic_dec_and_test(&rbio->stripes_pending))
903 /* OK, we have written all the stripes we need to. */
904 max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ?
905 0 : rbio->bbio->max_errors;
906 if (atomic_read(&rbio->error) > max_errors)
909 rbio_orig_end_io(rbio, err);
913 * the read/modify/write code wants to use the original bio for
914 * any pages it included, and then use the rbio for everything
915 * else. This function decides if a given index (stripe number)
916 * and page number in that stripe fall inside the original bio
919 * if you set bio_list_only, you'll get a NULL back for any ranges
920 * that are outside the bio_list
922 * This doesn't take any refs on anything, you get a bare page pointer
923 * and the caller must bump refs as required.
925 * You must call index_rbio_pages once before you can trust
926 * the answers from this function.
/*
 * Look up the page for page number pagenr of stripe number index.
 *
 * The flat slot is index * (stripe_len >> PAGE_SHIFT) + pagenr.  The
 * bio_pages entry for that slot is read under bio_list_lock.  If it is NULL
 * and bio_list_only is zero, the stripe_pages entry for the same slot is
 * returned instead.
 *
 * NOTE(review): the return statement of the branch taken when p is set or
 * bio_list_only is non-zero is not visible in this excerpt; presumably it
 * returns p.
 */
928 static struct page *page_in_rbio(struct btrfs_raid_bio *rbio,
929 int index, int pagenr, int bio_list_only)
932 struct page *p = NULL;
934 chunk_page = index * (rbio->stripe_len >> PAGE_SHIFT) + pagenr;
936 spin_lock_irq(&rbio->bio_list_lock);
937 p = rbio->bio_pages[chunk_page];
938 spin_unlock_irq(&rbio->bio_list_lock);
940 if (p || bio_list_only)
943 return rbio->stripe_pages[chunk_page];
947 * number of pages we need for the entire stripe across all the
/*
 * Number of pages needed for nr_stripes stripes of stripe_len bytes each:
 * stripe_len rounded up to whole pages, multiplied by nr_stripes.
 */
950 static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes)
952 return DIV_ROUND_UP(stripe_len, PAGE_SIZE) * nr_stripes;
956 * allocation and initial setup for the btrfs_raid_bio. Note that
957 * this does not allocate any pages for rbio->stripe_pages.
959 static struct btrfs_raid_bio *alloc_rbio(struct btrfs_fs_info *fs_info,
960 struct btrfs_bio *bbio,
963 struct btrfs_raid_bio *rbio;
965 int real_stripes = bbio->num_stripes - bbio->num_tgtdevs;
966 int num_pages = rbio_nr_pages(stripe_len, real_stripes);
967 int stripe_npages = DIV_ROUND_UP(stripe_len, PAGE_SIZE);
970 rbio = kzalloc(sizeof(*rbio) +
971 sizeof(*rbio->stripe_pages) * num_pages +
972 sizeof(*rbio->bio_pages) * num_pages +
973 sizeof(*rbio->finish_pointers) * real_stripes +
974 sizeof(*rbio->dbitmap) * BITS_TO_LONGS(stripe_npages) +
975 sizeof(*rbio->finish_pbitmap) *
976 BITS_TO_LONGS(stripe_npages),
979 return ERR_PTR(-ENOMEM);
981 bio_list_init(&rbio->bio_list);
982 INIT_LIST_HEAD(&rbio->plug_list);
983 spin_lock_init(&rbio->bio_list_lock);
984 INIT_LIST_HEAD(&rbio->stripe_cache);
985 INIT_LIST_HEAD(&rbio->hash_list);
987 rbio->fs_info = fs_info;
988 rbio->stripe_len = stripe_len;
989 rbio->nr_pages = num_pages;
990 rbio->real_stripes = real_stripes;
991 rbio->stripe_npages = stripe_npages;
994 refcount_set(&rbio->refs, 1);
995 atomic_set(&rbio->error, 0);
996 atomic_set(&rbio->stripes_pending, 0);
999 * the stripe_pages, bio_pages, etc arrays point to the extra
1000 * memory we allocated past the end of the rbio
1003 #define CONSUME_ALLOC(ptr, count) do { \
1005 p = (unsigned char *)p + sizeof(*(ptr)) * (count); \
1007 CONSUME_ALLOC(rbio->stripe_pages, num_pages);
1008 CONSUME_ALLOC(rbio->bio_pages, num_pages);
1009 CONSUME_ALLOC(rbio->finish_pointers, real_stripes);
1010 CONSUME_ALLOC(rbio->dbitmap, BITS_TO_LONGS(stripe_npages));
1011 CONSUME_ALLOC(rbio->finish_pbitmap, BITS_TO_LONGS(stripe_npages));
1012 #undef CONSUME_ALLOC
1014 if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID5)
1015 nr_data = real_stripes - 1;
1016 else if (bbio->map_type & BTRFS_BLOCK_GROUP_RAID6)
1017 nr_data = real_stripes - 2;
1021 rbio->nr_data = nr_data;
1025 /* allocate pages for all the stripes in the bio, including parity */
/*
 * Walk all rbio->nr_pages slots of rbio->stripe_pages and store a page from
 * alloc_page(GFP_NOFS | __GFP_HIGHMEM) in each one.  Slots that already hold
 * a page are tested for (presumably left untouched).
 *
 * NOTE(review): the allocation-failure path and the return values are not
 * visible in this excerpt.
 */
1026 static int alloc_rbio_pages(struct btrfs_raid_bio *rbio)
1031 for (i = 0; i < rbio->nr_pages; i++) {
1032 if (rbio->stripe_pages[i])
1034 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1037 rbio->stripe_pages[i] = page;
1042 /* only allocate pages for p/q stripes */
/*
 * Same walk as a full stripe page allocation, but starting at the slot of
 * page 0 of stripe number rbio->nr_data (the first stripe after the data
 * stripes) and running to rbio->nr_pages.  Each slot receives a page from
 * alloc_page(GFP_NOFS | __GFP_HIGHMEM); slots that already hold a page are
 * tested for (presumably left untouched).
 *
 * NOTE(review): the allocation-failure path and the return values are not
 * visible in this excerpt.
 */
1043 static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
1048 i = rbio_stripe_page_index(rbio, rbio->nr_data, 0);
1050 for (; i < rbio->nr_pages; i++) {
1051 if (rbio->stripe_pages[i])
1053 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
1056 rbio->stripe_pages[i] = page;
1062 * add a single page from a specific stripe into our list of bios for IO
1063 * this will try to merge into existing bios if possible, and returns
1064 * zero if all went well.
1066 static int rbio_add_io_page(struct btrfs_raid_bio *rbio,
1067 struct bio_list *bio_list,
1070 unsigned long page_index,
1071 unsigned long bio_max_len)
1073 struct bio *last = bio_list->tail;
1077 struct btrfs_bio_stripe *stripe;
1080 stripe = &rbio->bbio->stripes[stripe_nr];
1081 disk_start = stripe->physical + (page_index << PAGE_SHIFT);
1083 /* if the device is missing, just fail this stripe */
1084 if (!stripe->dev->bdev)
1085 return fail_rbio_index(rbio, stripe_nr);
1087 /* see if we can add this page onto our existing bio */
1089 last_end = (u64)last->bi_iter.bi_sector << 9;
1090 last_end += last->bi_iter.bi_size;
1093 * we can't merge these if they are from different
1094 * devices or if they are not contiguous
1096 if (last_end == disk_start && stripe->dev->bdev &&
1098 last->bi_disk == stripe->dev->bdev->bd_disk &&
1099 last->bi_partno == stripe->dev->bdev->bd_partno) {
1100 ret = bio_add_page(last, page, PAGE_SIZE, 0);
1101 if (ret == PAGE_SIZE)
1106 /* put a new bio on the list */
1107 bio = btrfs_io_bio_alloc(bio_max_len >> PAGE_SHIFT ?: 1);
1108 bio->bi_iter.bi_size = 0;
1109 bio_set_dev(bio, stripe->dev->bdev);
1110 bio->bi_iter.bi_sector = disk_start >> 9;
1112 bio_add_page(bio, page, PAGE_SIZE, 0);
1113 bio_list_add(bio_list, bio);
1118 * while we're doing the read/modify/write cycle, we could
1119 * have errors in reading pages off the disk. This checks
1120 * for errors and if we're not able to read the page it'll
1121 * trigger parity reconstruction. The rmw will be finished
1122 * after we've reconstructed the failed stripes
/*
 * If either faila or failb records a failed stripe (value >= 0), start
 * parity recovery with __raid56_parity_recover().  It is a BUG for faila to
 * equal real_stripes - 1 at that point.
 *
 * NOTE(review): the branch taken when no stripe has failed is not visible in
 * this excerpt.
 */
1124 static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
1126 if (rbio->faila >= 0 || rbio->failb >= 0) {
1127 BUG_ON(rbio->faila == rbio->real_stripes - 1);
1128 __raid56_parity_recover(rbio);
1135 * helper function to walk our bio list and populate the bio_pages array with
1136 * the result. This seems expensive, but it is faster than constantly
1137 * searching through the bio list as we setup the IO in finish_rmw or stripe
1140 * This must be called before you trust the answers from page_in_rbio
/*
 * Rebuild rbio->bio_pages from the bios on rbio->bio_list.
 *
 * Under bio_list_lock, the starting byte of each bio (bi_sector << 9) is
 * turned into an offset from raid_map[0] and then into a page index, and the
 * page of every segment is stored at bio_pages[page_index + i].  For bios
 * flagged BIO_CLONED, bi_iter is first reset from btrfs_io_bio(bio)->iter.
 *
 * NOTE(review): how i is initialised and advanced is not visible in this
 * excerpt.
 */
1142 static void index_rbio_pages(struct btrfs_raid_bio *rbio)
1146 unsigned long stripe_offset;
1147 unsigned long page_index;
1149 spin_lock_irq(&rbio->bio_list_lock);
1150 bio_list_for_each(bio, &rbio->bio_list) {
1151 struct bio_vec bvec;
1152 struct bvec_iter iter;
1155 start = (u64)bio->bi_iter.bi_sector << 9;
1156 stripe_offset = start - rbio->bbio->raid_map[0];
1157 page_index = stripe_offset >> PAGE_SHIFT;
1159 if (bio_flagged(bio, BIO_CLONED))
1160 bio->bi_iter = btrfs_io_bio(bio)->iter;
1162 bio_for_each_segment(bvec, bio, iter) {
1163 rbio->bio_pages[page_index + i] = bvec.bv_page;
1167 spin_unlock_irq(&rbio->bio_list_lock);
1171 * this is called from one of two situations. We either
1172 * have a full stripe from the higher layers, or we've read all
1173 * the missing bits off disk.
1175 * This will calculate the parity and then send down any
1178 static noinline void finish_rmw(struct btrfs_raid_bio *rbio)
1180 struct btrfs_bio *bbio = rbio->bbio;
1181 void **pointers = rbio->finish_pointers;
1182 int nr_data = rbio->nr_data;
1186 struct bio_list bio_list;
1190 bio_list_init(&bio_list);
1192 if (rbio->real_stripes - rbio->nr_data == 1)
1193 has_qstripe = false;
1194 else if (rbio->real_stripes - rbio->nr_data == 2)
1199 /* at this point we either have a full stripe,
1200 * or we've read the full stripe from the drive.
1201 * recalculate the parity and write the new results.
1203 * We're not allowed to add any new bios to the
1204 * bio list here, anyone else that wants to
1205 * change this stripe needs to do their own rmw.
1207 spin_lock_irq(&rbio->bio_list_lock);
1208 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1209 spin_unlock_irq(&rbio->bio_list_lock);
1211 atomic_set(&rbio->error, 0);
1214 * now that we've set rmw_locked, run through the
1215 * bio list one last time and map the page pointers
1217 * We don't cache full rbios because we're assuming
1218 * the higher layers are unlikely to use this area of
1219 * the disk again soon. If they do use it again,
1220 * hopefully they will send another full bio.
1222 index_rbio_pages(rbio);
1223 if (!rbio_is_full(rbio))
1224 cache_rbio_pages(rbio);
1226 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1228 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1230 /* first collect one page from each data stripe */
1231 for (stripe = 0; stripe < nr_data; stripe++) {
1232 p = page_in_rbio(rbio, stripe, pagenr, 0);
1233 pointers[stripe] = kmap(p);
1236 /* then add the parity stripe */
1237 p = rbio_pstripe_page(rbio, pagenr);
1239 pointers[stripe++] = kmap(p);
1244 * raid6, add the qstripe and call the
1245 * library function to fill in our p/q
1247 p = rbio_qstripe_page(rbio, pagenr);
1249 pointers[stripe++] = kmap(p);
1251 raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
1255 copy_page(pointers[nr_data], pointers[0]);
1256 run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
1260 for (stripe = 0; stripe < rbio->real_stripes; stripe++)
1261 kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
1265 * time to start writing. Make bios for everything from the
1266 * higher layers (the bio_list in our rbio) and our p/q. Ignore
1269 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1270 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1272 if (stripe < rbio->nr_data) {
1273 page = page_in_rbio(rbio, stripe, pagenr, 1);
1277 page = rbio_stripe_page(rbio, stripe, pagenr);
1280 ret = rbio_add_io_page(rbio, &bio_list,
1281 page, stripe, pagenr, rbio->stripe_len);
1287 if (likely(!bbio->num_tgtdevs))
1290 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1291 if (!bbio->tgtdev_map[stripe])
1294 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1296 if (stripe < rbio->nr_data) {
1297 page = page_in_rbio(rbio, stripe, pagenr, 1);
1301 page = rbio_stripe_page(rbio, stripe, pagenr);
1304 ret = rbio_add_io_page(rbio, &bio_list, page,
1305 rbio->bbio->tgtdev_map[stripe],
1306 pagenr, rbio->stripe_len);
1313 atomic_set(&rbio->stripes_pending, bio_list_size(&bio_list));
1314 BUG_ON(atomic_read(&rbio->stripes_pending) == 0);
1317 bio = bio_list_pop(&bio_list);
1321 bio->bi_private = rbio;
1322 bio->bi_end_io = raid_write_end_io;
1323 bio->bi_opf = REQ_OP_WRITE;
1330 rbio_orig_end_io(rbio, BLK_STS_IOERR);
1332 while ((bio = bio_list_pop(&bio_list)))
1337 * helper to find the stripe number for a given bio. Used to figure out which
1338 * stripe has failed. This expects the bio to correspond to a physical disk,
1339 * so it looks up based on physical sector numbers.
1341 static int find_bio_stripe(struct btrfs_raid_bio *rbio,
1344 u64 physical = bio->bi_iter.bi_sector;
1347 struct btrfs_bio_stripe *stripe;
1351 for (i = 0; i < rbio->bbio->num_stripes; i++) {
1352 stripe = &rbio->bbio->stripes[i];
1353 stripe_start = stripe->physical;
1354 if (physical >= stripe_start &&
1355 physical < stripe_start + rbio->stripe_len &&
1356 stripe->dev->bdev &&
1357 bio->bi_disk == stripe->dev->bdev->bd_disk &&
1358 bio->bi_partno == stripe->dev->bdev->bd_partno) {
1366 * helper to find the stripe number for a given
1367 * bio (before mapping). Used to figure out which stripe has
1368 * failed. This looks up based on logical block numbers.
1370 static int find_logical_bio_stripe(struct btrfs_raid_bio *rbio,
1373 u64 logical = bio->bi_iter.bi_sector;
1379 for (i = 0; i < rbio->nr_data; i++) {
1380 stripe_start = rbio->bbio->raid_map[i];
1381 if (logical >= stripe_start &&
1382 logical < stripe_start + rbio->stripe_len) {
1390 * returns -EIO if we had too many failures
/*
 * fail_rbio_index - record stripe index @failed as bad on @rbio
 * @rbio:   raid bio whose faila/failb slots are updated
 * @failed: index of the stripe that failed
 *
 * rbio->faila and rbio->failb hold at most two failed stripe indexes, with
 * -1 meaning "slot unused". The update runs under rbio->bio_list_lock with
 * interrupts saved. Each newly recorded failure also increments rbio->error;
 * reporting an index that is already recorded changes nothing.
 *
 * NOTE(review): the branch taken when both slots already hold other indexes
 * presumably yields the -EIO mentioned above this function -- confirm.
 */
1392 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed)
1394 unsigned long flags;
1397 spin_lock_irqsave(&rbio->bio_list_lock, flags);
1399 /* we already know this stripe is bad, move on */
1400 if (rbio->faila == failed || rbio->failb == failed)
1403 if (rbio->faila == -1) {
1404 /* first failure on this rbio */
1405 rbio->faila = failed;
1406 atomic_inc(&rbio->error);
1407 } else if (rbio->failb == -1) {
1408 /* second failure on this rbio */
1409 rbio->failb = failed;
1410 atomic_inc(&rbio->error);
1415 spin_unlock_irqrestore(&rbio->bio_list_lock, flags);
1421 * helper to fail a stripe based on a physical disk
/*
 * fail_bio_stripe - mark the stripe a physical bio belongs to as failed
 * @rbio: raid bio the bio was issued for
 *
 * Looks the stripe index up with find_bio_stripe() and hands it to
 * fail_rbio_index(), whose result is returned.
 *
 * NOTE(review): confirm how a failed lookup is handled before the index is
 * passed on.
 */
1424 static int fail_bio_stripe(struct btrfs_raid_bio *rbio,
1427 int failed = find_bio_stripe(rbio, bio);
1432 return fail_rbio_index(rbio, failed);
1436 * this sets each page in the bio uptodate. It should only be used on private
1437 * rbio pages, nothing that comes in from the higher layers
/*
 * set_bio_pages_uptodate - flag every page attached to @bio as uptodate
 * @bio: bio whose segments are walked; asserted not to be a cloned bio
 *
 * Walks all segments with bio_for_each_segment_all() and calls
 * SetPageUptodate() on each segment's page.
 */
1439 static void set_bio_pages_uptodate(struct bio *bio)
1441 struct bio_vec *bvec;
1444 ASSERT(!bio_flagged(bio, BIO_CLONED));
1446 bio_for_each_segment_all(bvec, bio, i)
1447 SetPageUptodate(bvec->bv_page);
1451 * end io for the read phase of the rmw cycle. All the bios here are physical
1452 * stripe bios we've read from the disk so we can recalculate the parity of the
1455 * This will usually kick off finish_rmw once all the bios are read in, but it
1456 * may trigger parity reconstruction if we had any errors along the way
/*
 * raid_rmw_end_io - completion handler for the reads issued by
 * raid56_rmw_stripe()
 * @bio: completed read bio; bio->bi_private is the owning rbio
 *
 * Each completion either records the bio's stripe as failed or marks its
 * pages uptodate (presumably depending on the bio's completion status --
 * confirm). Nothing further happens until rbio->stripes_pending drops to
 * zero. The final completion ends the original IO with BLK_STS_IOERR when
 * rbio->error exceeds bbio->max_errors, and otherwise continues with
 * validate_rbio_for_rmw().
 */
1458 static void raid_rmw_end_io(struct bio *bio)
1460 struct btrfs_raid_bio *rbio = bio->bi_private;
1463 fail_bio_stripe(rbio, bio);
1465 set_bio_pages_uptodate(bio);
1469 if (!atomic_dec_and_test(&rbio->stripes_pending))
1472 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
1476 * this will normally call finish_rmw to start our write
1477 * but if there are any failed stripes we'll reconstruct
1480 validate_rbio_for_rmw(rbio);
1485 rbio_orig_end_io(rbio, BLK_STS_IOERR);
1489 * the stripe must be locked by the caller. It will
1490 * unlock after all the writes are done
/*
 * raid56_rmw_stripe - read phase of a read/modify/write cycle
 * @rbio: raid bio to process
 *
 * Steps visible here:
 *  - allocate the rbio's stripe pages and index the pages of the queued bios
 *  - reset rbio->error
 *  - for every page of every data stripe (parity stripes are not read),
 *    queue a read unless the page is supplied by the bio list or the stripe
 *    page is already uptodate
 *  - with nothing to read, go straight to validate_rbio_for_rmw()
 *  - otherwise set rbio->stripes_pending to the number of read bios and
 *    submit them with raid_rmw_end_io() as completion handler
 *
 * On failure the original IO is ended with BLK_STS_IOERR and the bios still
 * on the local list are drained.
 */
1492 static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
1494 int bios_to_read = 0;
1495 struct bio_list bio_list;
1501 bio_list_init(&bio_list);
1503 ret = alloc_rbio_pages(rbio);
1507 index_rbio_pages(rbio);
1509 atomic_set(&rbio->error, 0);
1511 * build a list of bios to read all the missing parts of this
1514 for (stripe = 0; stripe < rbio->nr_data; stripe++) {
1515 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1518 * we want to find all the pages missing from
1519 * the rbio and read them from the disk. If
1520 * page_in_rbio finds a page in the bio list
1521 * we don't need to read it off the stripe.
1523 page = page_in_rbio(rbio, stripe, pagenr, 1);
1527 page = rbio_stripe_page(rbio, stripe, pagenr);
1529 * the bio cache may have handed us an uptodate
1530 * page. If so, be happy and use it
1532 if (PageUptodate(page))
1535 ret = rbio_add_io_page(rbio, &bio_list, page,
1536 stripe, pagenr, rbio->stripe_len);
1542 bios_to_read = bio_list_size(&bio_list);
1543 if (!bios_to_read) {
1545 * this can happen if others have merged with
1546 * us, it means there is nothing left to read.
1547 * But if there are missing devices it may not be
1548 * safe to do the full stripe write yet.
1554 * the bbio may be freed once we submit the last bio. Make sure
1555 * not to touch it after that
/* the completion handler counts this down once per read bio */
1557 atomic_set(&rbio->stripes_pending, bios_to_read);
1559 bio = bio_list_pop(&bio_list);
1563 bio->bi_private = rbio;
1564 bio->bi_end_io = raid_rmw_end_io;
1565 bio->bi_opf = REQ_OP_READ;
1567 btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
1571 /* the actual write will happen once the reads are done */
1575 rbio_orig_end_io(rbio, BLK_STS_IOERR);
1577 while ((bio = bio_list_pop(&bio_list)))
1583 validate_rbio_for_rmw(rbio);
1588 * if the upper layers pass in a full stripe, we thank them by only allocating
1589 * enough pages to hold the parity, and sending it all down quickly.
/*
 * full_stripe_write - start the write of an rbio that covers a full stripe
 * @rbio: raid bio to write
 *
 * Allocates only the parity pages, then tries to take the stripe lock with
 * lock_stripe_add(). The rbio is released with __free_raid_bio() on the
 * failure path that follows the parity-page allocation.
 *
 * NOTE(review): what happens once lock_stripe_add() returns is not shown
 * here -- confirm before relying on the return value.
 */
1591 static int full_stripe_write(struct btrfs_raid_bio *rbio)
1595 ret = alloc_rbio_parity_pages(rbio);
1597 __free_raid_bio(rbio);
1601 ret = lock_stripe_add(rbio);
1608 * partial stripe writes get handed over to async helpers.
1609 * We're really hoping to merge a few more writes into this
1610 * rbio before calculating new parity
/*
 * partial_stripe_write - queue an rbio that does not cover a full stripe
 * @rbio: raid bio to write
 *
 * Tries to take the stripe lock with lock_stripe_add() and schedules
 * rmw_work() on the rbio through start_async_work().
 *
 * NOTE(review): presumably the async work is only started when
 * lock_stripe_add() returned 0, as in the other callers in this file --
 * confirm.
 */
1612 static int partial_stripe_write(struct btrfs_raid_bio *rbio)
1616 ret = lock_stripe_add(rbio);
1618 start_async_work(rbio, rmw_work);
1623 * sometimes while we were reading from the drive to
1624 * recalculate parity, enough new bios come in to create
1625 * a full stripe. So we do a check here to see if we can
1626 * go directly to finish_rmw
/*
 * Dispatch a write rbio: a full stripe is sent down through
 * full_stripe_write(), anything less goes through partial_stripe_write().
 * Returns whatever the chosen helper returns.
 */
static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
{
	/* a full stripe can skip rmw land entirely */
	if (rbio_is_full(rbio))
		return full_stripe_write(rbio);

	return partial_stripe_write(rbio);
}
1637 * We use plugging call backs to collect full stripes.
1638 * Any time we get a partial stripe write while plugged
1639 * we collect it into a list. When the unplug comes down,
1640 * we sort the list by logical block number and merge
1641 * everything we can into the same rbios
/*
 * Per-plug state used to collect rbios while the block layer is plugged.
 */
1643 struct btrfs_plug_cb {
/* embedded block-layer callback; container_of() recovers this struct from it */
1644 struct blk_plug_cb cb;
/* filesystem the collected rbios belong to; supplies the rmw work queue */
1645 struct btrfs_fs_info *info;
/* rbios collected while plugged, linked through their plug_list member */
1646 struct list_head rbio_list;
/* work item used when the unplug has to be pushed to a helper thread */
1647 struct btrfs_work work;
1651 * rbios on the plug list are sorted for easier merging.
/*
 * plug_cmp - list_sort() comparison callback for the plug list
 * @priv: not used in the lines shown
 * @a:    list node embedded in the first rbio
 * @b:    list node embedded in the second rbio
 *
 * Orders rbios by the starting sector of the first bio on each rbio's
 * bio_list. Both rbios must have at least one bio queued: bio_list.head is
 * dereferenced without a NULL check.
 *
 * NOTE(review): presumably returns a negative, positive or zero value for
 * less-than, greater-than and equal -- confirm.
 */
1653 static int plug_cmp(void *priv, struct list_head *a, struct list_head *b)
1655 struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio,
1657 struct btrfs_raid_bio *rb = container_of(b, struct btrfs_raid_bio,
1659 u64 a_sector = ra->bio_list.head->bi_iter.bi_sector;
1660 u64 b_sector = rb->bio_list.head->bi_iter.bi_sector;
1662 if (a_sector < b_sector)
1664 if (a_sector > b_sector)
/*
 * run_plug - flush the rbios collected on a plug
 * @plug: plug state whose rbio_list is drained
 *
 * The list is sorted with plug_cmp() and then consumed from the front. A
 * full rbio is written immediately with full_stripe_write(). Otherwise the
 * rbio is merged into the previously seen one ("last") when
 * rbio_can_merge() allows it, in which case the merged-in rbio is freed.
 * "last" is submitted through __raid56_parity_write().
 *
 * NOTE(review): the conditions around the two __raid56_parity_write(last)
 * calls are not visible here; presumably one flushes "last" when a merge is
 * not possible and the other flushes the final pending rbio -- confirm.
 */
1669 static void run_plug(struct btrfs_plug_cb *plug)
1671 struct btrfs_raid_bio *cur;
1672 struct btrfs_raid_bio *last = NULL;
1675 * sort our plug list then try to merge
1676 * everything we can in hopes of creating full
1679 list_sort(NULL, &plug->rbio_list, plug_cmp);
1680 while (!list_empty(&plug->rbio_list)) {
1681 cur = list_entry(plug->rbio_list.next,
1682 struct btrfs_raid_bio, plug_list);
/* detach cur from the plug list before it is submitted or merged */
1683 list_del_init(&cur->plug_list);
1685 if (rbio_is_full(cur)) {
1688 /* we have a full stripe, send it down */
1689 ret = full_stripe_write(cur);
1694 if (rbio_can_merge(last, cur)) {
1695 merge_rbio(last, cur);
1696 __free_raid_bio(cur);
1700 __raid56_parity_write(last);
1705 __raid56_parity_write(last);
1711 * if the unplug comes from schedule, we have to push the
1712 * work off to a helper thread
/*
 * unplug_work - work callback queued by btrfs_raid_unplug()
 * @work: work item embedded in a struct btrfs_plug_cb
 *
 * Recovers the enclosing plug state from @work.
 *
 * NOTE(review): presumably the plug is then handed to run_plug() -- confirm.
 */
1714 static void unplug_work(struct btrfs_work *work)
1716 struct btrfs_plug_cb *plug;
1717 plug = container_of(work, struct btrfs_plug_cb, work);
/*
 * btrfs_raid_unplug - block-layer unplug callback registered through
 * blk_check_plugged()
 * @cb:            callback structure embedded in a struct btrfs_plug_cb
 * @from_schedule: true when the unplug is triggered from schedule
 *
 * When called from schedule the plug cannot be processed inline, so a work
 * item running unplug_work() is initialised and queued on the filesystem's
 * rmw_workers queue.
 *
 * NOTE(review): the path taken when @from_schedule is false is not shown;
 * presumably the plug is run directly -- confirm.
 */
1721 static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
1723 struct btrfs_plug_cb *plug;
1724 plug = container_of(cb, struct btrfs_plug_cb, cb);
1726 if (from_schedule) {
1727 btrfs_init_work(&plug->work, btrfs_rmw_helper,
1728 unplug_work, NULL, NULL);
1729 btrfs_queue_work(plug->info->rmw_workers,
1737 * our main entry point for writes from the rest of the FS.
/*
 * raid56_parity_write - submit a write to a RAID5/6 block group
 * @fs_info:    filesystem the write belongs to
 * @bio:        bio carrying the data to write
 * @bbio:       stripe mapping for the write; its reference is dropped here
 *              when the rbio cannot be allocated
 * @stripe_len: size of each individual stripe on disk
 *
 * Allocates an rbio for the mapping, queues @bio on it and records its size
 * in bio_list_bytes. The rbio is tagged BTRFS_RBIO_WRITE, the filesystem's
 * bio counter is incremented and generic_bio_cnt is set to 1.
 *
 * A full rbio is written at once with full_stripe_write(). Otherwise, when
 * blk_check_plugged() hands back a callback, the rbio is appended to that
 * plug's rbio_list; without a plug it goes through __raid56_parity_write().
 *
 * Returns PTR_ERR() of the failed allocation when alloc_rbio() fails.
 *
 * NOTE(review): the btrfs_bio_counter_dec() calls presumably run only when
 * the preceding submission returned an error -- confirm.
 */
1739 int raid56_parity_write(struct btrfs_fs_info *fs_info, struct bio *bio,
1740 struct btrfs_bio *bbio, u64 stripe_len)
1742 struct btrfs_raid_bio *rbio;
1743 struct btrfs_plug_cb *plug = NULL;
1744 struct blk_plug_cb *cb;
1747 rbio = alloc_rbio(fs_info, bbio, stripe_len);
1749 btrfs_put_bbio(bbio);
1750 return PTR_ERR(rbio);
1752 bio_list_add(&rbio->bio_list, bio);
1753 rbio->bio_list_bytes = bio->bi_iter.bi_size;
1754 rbio->operation = BTRFS_RBIO_WRITE;
1756 btrfs_bio_counter_inc_noblocked(fs_info);
1757 rbio->generic_bio_cnt = 1;
1760 * don't plug on full rbios, just get them out the door
1761 * as quickly as we can
1763 if (rbio_is_full(rbio)) {
1764 ret = full_stripe_write(rbio);
1766 btrfs_bio_counter_dec(fs_info);
1770 cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
1772 plug = container_of(cb, struct btrfs_plug_cb, cb);
1774 plug->info = fs_info;
1775 INIT_LIST_HEAD(&plug->rbio_list);
1777 list_add_tail(&rbio->plug_list, &plug->rbio_list);
1780 ret = __raid56_parity_write(rbio);
1782 btrfs_bio_counter_dec(fs_info);
1788 * all parity reconstruction happens here. We've read in everything
1789 * we can find from the drives and this does the heavy lifting of
1790 * sorting the good from the bad.
/*
 * __raid_recover_end_io - rebuild the failed stripe(s) of @rbio in memory
 * @rbio: raid bio whose faila/failb name the stripes to reconstruct
 *
 * Outline of the visible logic:
 *  - allocate one pointer per real stripe; if that fails the status is
 *    BLK_STS_RESOURCE
 *  - for READ_REBUILD and REBUILD_MISSING, set RBIO_RMW_LOCKED_BIT under
 *    bio_list_lock
 *  - for each page of the stripe (parity scrub skips pages not set in
 *    rbio->dbitmap): kmap one page per stripe, taking the failed stripes'
 *    pages from the bio list for the two rebuild operations and from the
 *    stripe pages otherwise
 *  - RAID6 with two failures uses raid6_datap_recov() or
 *    raid6_2data_recov(); the P-stripe path copies parity into the failed
 *    slot, rotates the pointer array and xors in the remaining data
 *  - for BTRFS_RBIO_WRITE, mark the stripe pages of the failed stripes
 *    uptodate
 *
 * Completion: the two rebuild operations cache the rbio pages only when the
 * result is BLK_STS_OK and there was a single failure (otherwise
 * RBIO_CACHE_READY_BIT is cleared) and then end the original IO. Other
 * operations continue on success (parity scrub via finish_parity_scrub())
 * and end the original IO with the error otherwise.
 *
 * NOTE(review): two RAID6 cases set BLK_STS_IOERR inside the per-page loop
 * after the stripe pages have been kmap()ed; verify that every exit from
 * the loop releases those mappings.
 */
1792 static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
1796 int faila = -1, failb = -1;
1801 pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
1803 err = BLK_STS_RESOURCE;
1807 faila = rbio->faila;
1808 failb = rbio->failb;
1810 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1811 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
1812 spin_lock_irq(&rbio->bio_list_lock);
1813 set_bit(RBIO_RMW_LOCKED_BIT, &rbio->flags);
1814 spin_unlock_irq(&rbio->bio_list_lock);
1817 index_rbio_pages(rbio);
1819 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
1821 * Now we just use bitmap to mark the horizontal stripes in
1822 * which we have data when doing parity scrub.
1824 if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
1825 !test_bit(pagenr, rbio->dbitmap))
1828 /* setup our array of pointers with pages
1831 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1833 * if we're rebuilding a read, we have to use
1834 * pages from the bio list
1836 if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1837 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
1838 (stripe == faila || stripe == failb)) {
1839 page = page_in_rbio(rbio, stripe, pagenr, 0);
1841 page = rbio_stripe_page(rbio, stripe, pagenr);
1843 pointers[stripe] = kmap(page);
1846 /* all raid6 handling here */
1847 if (rbio->bbio->map_type & BTRFS_BLOCK_GROUP_RAID6) {
1849 * single failure, rebuild from parity raid5
1853 if (faila == rbio->nr_data) {
1855 * Just the P stripe has failed, without
1856 * a bad data or Q stripe.
1857 * TODO, we should redo the xor here.
1859 err = BLK_STS_IOERR;
1863 * a single failure in raid6 is rebuilt
1864 * in the pstripe code below
1869 /* make sure our ps and qs are in order */
1870 if (faila > failb) {
1876 /* if the q stripe is failed, do a pstripe reconstruction
1878 * If both the q stripe and the P stripe are failed, we're
1879 * here due to a crc mismatch and we can't give them the
1882 if (rbio->bbio->raid_map[failb] == RAID6_Q_STRIPE) {
1883 if (rbio->bbio->raid_map[faila] ==
1885 err = BLK_STS_IOERR;
1889 * otherwise we have one bad data stripe and
1890 * a good P stripe. raid5!
1895 if (rbio->bbio->raid_map[failb] == RAID5_P_STRIPE) {
1896 raid6_datap_recov(rbio->real_stripes,
1897 PAGE_SIZE, faila, pointers);
1899 raid6_2data_recov(rbio->real_stripes,
1900 PAGE_SIZE, faila, failb,
1906 /* rebuild from P stripe here (raid5 or raid6) */
1907 BUG_ON(failb != -1);
1909 /* Copy parity block into failed block to start with */
1910 copy_page(pointers[faila], pointers[rbio->nr_data]);
1912 /* rearrange the pointer array */
1913 p = pointers[faila];
1914 for (stripe = faila; stripe < rbio->nr_data - 1; stripe++)
1915 pointers[stripe] = pointers[stripe + 1];
1916 pointers[rbio->nr_data - 1] = p;
1918 /* xor in the rest */
1919 run_xor(pointers, rbio->nr_data - 1, PAGE_SIZE);
1921 /* if we're doing this rebuild as part of an rmw, go through
1922 * and set all of our private rbio pages in the
1923 * failed stripes as uptodate. This way finish_rmw will
1924 * know they can be trusted. If this was a read reconstruction,
1925 * other endio functions will fiddle the uptodate bits
1927 if (rbio->operation == BTRFS_RBIO_WRITE) {
1928 for (i = 0; i < rbio->stripe_npages; i++) {
1930 page = rbio_stripe_page(rbio, faila, i);
1931 SetPageUptodate(page);
1934 page = rbio_stripe_page(rbio, failb, i);
1935 SetPageUptodate(page);
1939 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
1941 * if we're rebuilding a read, we have to use
1942 * pages from the bio list
1944 if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1945 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
1946 (stripe == faila || stripe == failb)) {
1947 page = page_in_rbio(rbio, stripe, pagenr, 0);
1949 page = rbio_stripe_page(rbio, stripe, pagenr);
1961 * Similar to READ_REBUILD, REBUILD_MISSING at this point also has a
1962 * valid rbio which is consistent with ondisk content, thus such a
1963 * valid rbio can be cached to avoid further disk reads.
1965 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
1966 rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
1968 * - In case of two failures, where rbio->failb != -1:
1970 * Do not cache this rbio since the above read reconstruction
1971 * (raid6_datap_recov() or raid6_2data_recov()) may have
1972 * changed some content of stripes which are not identical to
1973 * on-disk content any more, otherwise, a later write/recover
1974 * may steal stripe_pages from this rbio and end up with
1975 * corruptions or rebuild failures.
1977 * - In case of single failure, where rbio->failb == -1:
1979 * Cache this rbio iff the above read reconstruction is
1980 * excuted without problems.
1982 if (err == BLK_STS_OK && rbio->failb < 0)
1983 cache_rbio_pages(rbio);
1985 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
1987 rbio_orig_end_io(rbio, err);
1988 } else if (err == BLK_STS_OK) {
1992 if (rbio->operation == BTRFS_RBIO_WRITE)
1994 else if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB)
1995 finish_parity_scrub(rbio, 0);
1999 rbio_orig_end_io(rbio, err);
2004 * This is called only for stripes we've read from disk to
2005 * reconstruct the parity.
/*
 * raid_recover_end_io - completion handler for the reads issued by
 * __raid56_parity_recover()
 * @bio: completed read bio; bio->bi_private is the owning rbio
 *
 * Each completion either records the bio's stripe as failed or marks its
 * pages uptodate (presumably depending on the bio's completion status --
 * confirm). When the last pending read finishes, the original IO is ended
 * with BLK_STS_IOERR if rbio->error exceeds bbio->max_errors; otherwise
 * reconstruction proceeds in __raid_recover_end_io().
 */
2007 static void raid_recover_end_io(struct bio *bio)
2009 struct btrfs_raid_bio *rbio = bio->bi_private;
2012 * we only read stripe pages off the disk, set them
2013 * up to date if there were no errors
2016 fail_bio_stripe(rbio, bio);
2018 set_bio_pages_uptodate(bio);
2021 if (!atomic_dec_and_test(&rbio->stripes_pending))
2024 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
2025 rbio_orig_end_io(rbio, BLK_STS_IOERR);
2027 __raid_recover_end_io(rbio);
2031 * reads everything we need off the disk to reconstruct
2032 * the parity. endio handlers trigger final reconstruction
2033 * when the IO is done.
2035 * This is used both for reads from the higher layers and for
2036 * parity construction required to finish a rmw cycle.
/*
 * __raid56_parity_recover - read what is needed to rebuild failed stripes
 * @rbio: raid bio whose faila/failb identify the failed stripes
 *
 * Allocates the stripe pages and resets rbio->error. Every stripe already
 * recorded in faila/failb is counted as an error; for all other stripes
 * (data and parity) a read is queued for each page that is not already
 * uptodate.
 *
 * With nothing to read, reconstruction runs immediately through
 * __raid_recover_end_io() as long as the error count is within
 * bbio->max_errors. Otherwise rbio->stripes_pending is set to the number of
 * read bios and they are submitted with raid_recover_end_io() as completion
 * handler.
 *
 * On failure, the original IO is ended with BLK_STS_IOERR for the
 * READ_REBUILD and REBUILD_MISSING operations, and the bios left on the
 * local list are drained.
 */
2038 static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
2040 int bios_to_read = 0;
2041 struct bio_list bio_list;
2047 bio_list_init(&bio_list);
2049 ret = alloc_rbio_pages(rbio);
2053 atomic_set(&rbio->error, 0);
2056 * read everything that hasn't failed. Thanks to the
2057 * stripe cache, it is possible that some or all of these
2058 * pages are going to be uptodate.
2060 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
2061 if (rbio->faila == stripe || rbio->failb == stripe) {
2062 atomic_inc(&rbio->error);
2066 for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
2070 * the rmw code may have already read this
2073 p = rbio_stripe_page(rbio, stripe, pagenr);
2074 if (PageUptodate(p))
2077 ret = rbio_add_io_page(rbio, &bio_list,
2078 rbio_stripe_page(rbio, stripe, pagenr),
2079 stripe, pagenr, rbio->stripe_len);
2085 bios_to_read = bio_list_size(&bio_list);
2086 if (!bios_to_read) {
2088 * we might have no bios to read just because the pages
2089 * were up to date, or we might have no bios to read because
2090 * the devices were gone.
2092 if (atomic_read(&rbio->error) <= rbio->bbio->max_errors) {
2093 __raid_recover_end_io(rbio);
2101 * the bbio may be freed once we submit the last bio. Make sure
2102 * not to touch it after that
2104 atomic_set(&rbio->stripes_pending, bios_to_read);
2106 bio = bio_list_pop(&bio_list);
2110 bio->bi_private = rbio;
2111 bio->bi_end_io = raid_recover_end_io;
2112 bio->bi_opf = REQ_OP_READ;
2114 btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
2122 if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
2123 rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
2124 rbio_orig_end_io(rbio, BLK_STS_IOERR);
2126 while ((bio = bio_list_pop(&bio_list)))
2133 * the main entry point for reads from the higher layers. This
2134 * is really only called when the normal read path had a failure,
2135 * so we assume the bio they send down corresponds to a failed part
/*
 * raid56_parity_recover - rebuild the data for a read that failed
 * @fs_info:    filesystem the read belongs to
 * @bio:        the failed read; its logical start selects the failed stripe
 * @bbio:       stripe mapping covering the read
 * @stripe_len: size of each individual stripe on disk
 * @mirror_num: retry number; values above 2 additionally fail a second
 *              stripe, see the comments in the body
 * @generic_io: caller-supplied flag (NOTE(review): presumably decides who
 *              owns the bbio reference and the bio counter -- confirm)
 *
 * Allocates an rbio tagged BTRFS_RBIO_READ_REBUILD, queues @bio on it and
 * sets rbio->faila from find_logical_bio_stripe(). When no stripe contains
 * the bio, the warning below is logged and recovery is abandoned.
 *
 * For @mirror_num > 2, failb is set to
 * real_stripes - (mirror_num - 1) and adjusted when it is not above faila.
 *
 * The rbio is then offered to lock_stripe_add() and read-side recovery is
 * started with __raid56_parity_recover(). Errors from that call are
 * deliberately not propagated, because it ends the bio itself.
 *
 * Returns PTR_ERR() of the failed allocation when alloc_rbio() fails.
 */
2138 int raid56_parity_recover(struct btrfs_fs_info *fs_info, struct bio *bio,
2139 struct btrfs_bio *bbio, u64 stripe_len,
2140 int mirror_num, int generic_io)
2142 struct btrfs_raid_bio *rbio;
2146 ASSERT(bbio->mirror_num == mirror_num);
2147 btrfs_io_bio(bio)->mirror_num = mirror_num;
2150 rbio = alloc_rbio(fs_info, bbio, stripe_len);
2153 btrfs_put_bbio(bbio);
2154 return PTR_ERR(rbio);
2157 rbio->operation = BTRFS_RBIO_READ_REBUILD;
2158 bio_list_add(&rbio->bio_list, bio);
2159 rbio->bio_list_bytes = bio->bi_iter.bi_size;
2161 rbio->faila = find_logical_bio_stripe(rbio, bio);
2162 if (rbio->faila == -1) {
2164 "%s could not find the bad stripe in raid56 so that we cannot recover any more (bio has logical %llu len %llu, bbio has map_type %llu)",
2165 __func__, (u64)bio->bi_iter.bi_sector << 9,
2166 (u64)bio->bi_iter.bi_size, bbio->map_type);
2168 btrfs_put_bbio(bbio);
2174 btrfs_bio_counter_inc_noblocked(fs_info);
2175 rbio->generic_bio_cnt = 1;
2177 btrfs_get_bbio(bbio);
2182 * for 'mirror == 2', reconstruct from all other stripes.
2183 * for 'mirror_num > 2', select a stripe to fail on every retry.
2185 if (mirror_num > 2) {
2187 * 'mirror == 3' is to fail the p stripe and
2188 * reconstruct from the q stripe. 'mirror > 3' is to
2189 * fail a data stripe and reconstruct from p+q stripe.
2191 rbio->failb = rbio->real_stripes - (mirror_num - 1);
2192 ASSERT(rbio->failb > 0);
2193 if (rbio->failb <= rbio->faila)
2197 ret = lock_stripe_add(rbio);
2200 * __raid56_parity_recover will end the bio with
2201 * any errors it hits. We don't want to return
2202 * its error value up the stack because our caller
2203 * will end up calling bio_endio with any nonzero
2207 __raid56_parity_recover(rbio);
2209 * our rbio has been added to the list of
2210 * rbios that will be handled after the
2211 * currently lock owner is done
2217 static void rmw_work(struct btrfs_work *work)
2219 struct btrfs_raid_bio *rbio;
2221 rbio = container_of(work, struct btrfs_raid_bio, work);
2222 raid56_rmw_stripe(rbio);
2225 static void read_rebuild_work(struct btrfs_work *work)
2227 struct btrfs_raid_bio *rbio;
2229 rbio = container_of(work, struct btrfs_raid_bio, work);
2230 __raid56_parity_recover(rbio);
2234 * The following code is used to scrub/replace the parity stripe
2236 * Caller must have already increased bio_counter for getting @bbio.
2238 * Note: We need to make sure all the pages added to the scrub/replace
2239 * raid bio are correct and are not changed during the scrub/replace. That
2240 * is, those pages must hold only metadata or file data with checksum.
/*
 * raid56_parity_alloc_scrub_rbio - build an rbio for scrubbing one parity
 * stripe
 * @fs_info:         filesystem being scrubbed
 * @bio:             zero-sized bio that only carries the completion handler
 * @bbio:            stripe mapping of the full stripe
 * @stripe_len:      size of each individual stripe on disk
 * @scrub_dev:       device holding the parity stripe to scrub
 * @dbitmap:         bitmap of the horizontal stripes that have data
 * @stripe_nsectors: number of bits in @dbitmap; asserted to equal
 *                   rbio->stripe_npages
 *
 * The rbio is tagged BTRFS_RBIO_PARITY_SCRUB and @dbitmap is copied into
 * rbio->dbitmap. The parity stripes (indexes nr_data..real_stripes - 1) are
 * searched for the one on @scrub_dev; one must exist.
 *
 * NOTE(review): presumably the matching index is stored in rbio->scrubp --
 * confirm.
 *
 * generic_bio_cnt is set to 1 because the caller already took a bio_counter
 * reference when it obtained @bbio.
 */
2243 struct btrfs_raid_bio *
2244 raid56_parity_alloc_scrub_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
2245 struct btrfs_bio *bbio, u64 stripe_len,
2246 struct btrfs_device *scrub_dev,
2247 unsigned long *dbitmap, int stripe_nsectors)
2249 struct btrfs_raid_bio *rbio;
2252 rbio = alloc_rbio(fs_info, bbio, stripe_len);
2255 bio_list_add(&rbio->bio_list, bio);
2257 * This is a special bio which is used to hold the completion handler
2258 * and make the scrub rbio is similar to the other types
2260 ASSERT(!bio->bi_iter.bi_size);
2261 rbio->operation = BTRFS_RBIO_PARITY_SCRUB;
2264 * After mapping bbio with BTRFS_MAP_WRITE, parities have been sorted
2265 * to the end position, so this search can start from the first parity
2268 for (i = rbio->nr_data; i < rbio->real_stripes; i++) {
2269 if (bbio->stripes[i].dev == scrub_dev) {
2274 ASSERT(i < rbio->real_stripes);
2276 /* Now we just support the sectorsize equals to page size */
2277 ASSERT(fs_info->sectorsize == PAGE_SIZE);
2278 ASSERT(rbio->stripe_npages == stripe_nsectors);
2279 bitmap_copy(rbio->dbitmap, dbitmap, stripe_nsectors);
2282 * We have already increased bio_counter when getting bbio, record it
2283 * so we can free it at rbio_orig_end_io().
2285 rbio->generic_bio_cnt = 1;
2290 /* Used for both parity scrub and missing. */
/*
 * raid56_add_scrub_pages - attach a caller-provided data page to @rbio
 * @rbio: raid bio being prepared
 * @page: page to install in rbio->bio_pages[]
 *
 * A further argument supplies the page's logical address ("logical" below).
 * It is asserted to lie, together with the whole page, inside the data area
 * [raid_map[0], raid_map[0] + stripe_len * nr_data). The page is stored at
 * the page index of its offset from raid_map[0].
 */
2291 void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
2297 ASSERT(logical >= rbio->bbio->raid_map[0]);
2298 ASSERT(logical + PAGE_SIZE <= rbio->bbio->raid_map[0] +
2299 rbio->stripe_len * rbio->nr_data);
2300 stripe_offset = (int)(logical - rbio->bbio->raid_map[0]);
2301 index = stripe_offset >> PAGE_SHIFT;
2302 rbio->bio_pages[index] = page;
2306 * We only scrub the parity of horizontal stripes for which we have correct
2307 * data, so we needn't allocate all pages for all the stripes.
/*
 * alloc_rbio_essential_pages - allocate stripe pages only where needed
 * @rbio: raid bio being scrubbed
 *
 * For every bit set in rbio->dbitmap, makes sure each real stripe has a
 * page at that position in rbio->stripe_pages[]. Slots that already hold a
 * page are left alone; empty ones get a page allocated with
 * GFP_NOFS | __GFP_HIGHMEM.
 *
 * NOTE(review): presumably returns 0 on success and a negative errno when a
 * page allocation fails -- confirm.
 */
2309 static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio)
2316 for_each_set_bit(bit, rbio->dbitmap, rbio->stripe_npages) {
2317 for (i = 0; i < rbio->real_stripes; i++) {
/* stripe_pages[] is laid out stripe by stripe, stripe_npages entries each */
2318 index = i * rbio->stripe_npages + bit;
2319 if (rbio->stripe_pages[index])
2322 page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2325 rbio->stripe_pages[index] = page;
/*
 * finish_parity_scrub - check the scrubbed parity stripe and write back
 * whatever needs rewriting
 * @rbio: parity-scrub raid bio
 *
 * The second argument is 0 or 1 at the call sites in this file.
 * NOTE(review): it presumably selects whether the parity is recomputed and
 * compared before writeback -- confirm.
 *
 * Visible logic:
 *  - one parity stripe means no Q stripe, two mean a Q stripe is present
 *  - when a replace target exists for the scrubbed stripe, rbio->dbitmap is
 *    copied to pbitmap before the check clears any bits
 *  - RBIO_CACHE_READY_BIT is cleared so the rbio is not cached
 *  - temporary P (and, for RAID6, Q) pages are allocated and mapped once
 *  - for each page set in dbitmap the parity is recomputed from the data
 *    pages and compared with the on-disk parity: a mismatch copies the
 *    computed parity over it, a match clears the bit so the page is not
 *    written back
 *  - write bios are built for the pages still set in dbitmap and, for the
 *    replace target, for the pages set in pbitmap
 *  - with nothing to write the original IO ends with BLK_STS_OK; otherwise
 *    the bios are submitted as REQ_OP_WRITE with raid_write_end_io()
 *
 * On failure the original IO is ended with BLK_STS_IOERR and the bios left
 * on the local list are drained.
 */
2331 static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio,
2334 struct btrfs_bio *bbio = rbio->bbio;
2335 void **pointers = rbio->finish_pointers;
2336 unsigned long *pbitmap = rbio->finish_pbitmap;
2337 int nr_data = rbio->nr_data;
2341 struct page *p_page = NULL;
2342 struct page *q_page = NULL;
2343 struct bio_list bio_list;
2348 bio_list_init(&bio_list);
2350 if (rbio->real_stripes - rbio->nr_data == 1)
2351 has_qstripe = false;
2352 else if (rbio->real_stripes - rbio->nr_data == 2)
2357 if (bbio->num_tgtdevs && bbio->tgtdev_map[rbio->scrubp]) {
2359 bitmap_copy(pbitmap, rbio->dbitmap, rbio->stripe_npages);
2363 * Because the higher layers(scrubber) are unlikely to
2364 * use this area of the disk again soon, so don't cache
2367 clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags);
2372 p_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2375 SetPageUptodate(p_page);
2378 /* RAID6, allocate and map temp space for the Q stripe */
2379 q_page = alloc_page(GFP_NOFS | __GFP_HIGHMEM);
2381 __free_page(p_page);
2384 SetPageUptodate(q_page);
2385 pointers[rbio->real_stripes - 1] = kmap(q_page);
2388 atomic_set(&rbio->error, 0);
2390 /* Map the parity stripe just once */
2391 pointers[nr_data] = kmap(p_page);
2393 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2396 /* first collect one page from each data stripe */
2397 for (stripe = 0; stripe < nr_data; stripe++) {
2398 p = page_in_rbio(rbio, stripe, pagenr, 0);
2399 pointers[stripe] = kmap(p);
2403 /* RAID6, call the library function to fill in our P/Q */
2404 raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE,
2408 copy_page(pointers[nr_data], pointers[0]);
2409 run_xor(pointers + 1, nr_data - 1, PAGE_SIZE);
2412 /* Check scrubbing parity and repair it */
2413 p = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2415 if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE))
2416 copy_page(parity, pointers[rbio->scrubp]);
2418 /* Parity is right, needn't writeback */
2419 bitmap_clear(rbio->dbitmap, pagenr, 1);
2422 for (stripe = 0; stripe < nr_data; stripe++)
2423 kunmap(page_in_rbio(rbio, stripe, pagenr, 0));
2427 __free_page(p_page);
2430 __free_page(q_page);
2435 * time to start writing. Make bios for everything from the
2436 * higher layers (the bio_list in our rbio) and our p/q. Ignore
2439 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2442 page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2443 ret = rbio_add_io_page(rbio, &bio_list,
2444 page, rbio->scrubp, pagenr, rbio->stripe_len);
2452 for_each_set_bit(pagenr, pbitmap, rbio->stripe_npages) {
2455 page = rbio_stripe_page(rbio, rbio->scrubp, pagenr);
2456 ret = rbio_add_io_page(rbio, &bio_list, page,
2457 bbio->tgtdev_map[rbio->scrubp],
2458 pagenr, rbio->stripe_len);
/* nr_data is reused from here on as the number of write bios to wait for */
2464 nr_data = bio_list_size(&bio_list);
2466 /* Every parity is right */
2467 rbio_orig_end_io(rbio, BLK_STS_OK);
2471 atomic_set(&rbio->stripes_pending, nr_data);
2474 bio = bio_list_pop(&bio_list);
2478 bio->bi_private = rbio;
2479 bio->bi_end_io = raid_write_end_io;
2480 bio->bi_opf = REQ_OP_WRITE;
2487 rbio_orig_end_io(rbio, BLK_STS_IOERR);
2489 while ((bio = bio_list_pop(&bio_list)))
/*
 * is_data_stripe - test whether @stripe indexes a data stripe of @rbio
 * @rbio:   raid bio supplying the number of data stripes
 * @stripe: stripe index to test; negative values (such as the -1 used for
 *          "no failure") fall outside the range
 *
 * The range tested is 0 <= @stripe < rbio->nr_data.
 *
 * NOTE(review): presumably returns non-zero inside the range and 0
 * otherwise -- confirm.
 */
2493 static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)
2495 if (stripe >= 0 && stripe < rbio->nr_data)
2501 * While we're doing the parity check and repair, we could have errors
2502 * in reading pages off the disk. This checks for errors and if we're
2503 * not able to read the page it'll trigger parity reconstruction. The
2504 * parity scrub will be finished after we've reconstructed the failed
/*
 * validate_rbio_for_parity_scrub - decide how to continue a parity scrub
 * once the reads are done
 * @rbio: parity-scrub raid bio
 *
 * Too many read errors (rbio->error above bbio->max_errors) fail the scrub.
 * With at least one failed stripe recorded, faila and failb are classified
 * as data or parity failures:
 *  - more data failures than max_errors - 1 cannot be repaired, because the
 *    parity being scrubbed cannot be used for repair
 *  - with no data failure the parity is simply repaired through
 *    finish_parity_scrub(rbio, 0)
 *  - a data failure combined with a failed parity that is not the scrubbed
 *    one cannot be repaired; otherwise the data is rebuilt through
 *    __raid_recover_end_io()
 * Without any failed stripe the scrub continues with
 * finish_parity_scrub(rbio, 1).
 *
 * The failure exit ends the original IO with BLK_STS_IOERR.
 */
2507 static void validate_rbio_for_parity_scrub(struct btrfs_raid_bio *rbio)
2509 if (atomic_read(&rbio->error) > rbio->bbio->max_errors)
2512 if (rbio->faila >= 0 || rbio->failb >= 0) {
2513 int dfail = 0, failp = -1;
2515 if (is_data_stripe(rbio, rbio->faila))
2517 else if (is_parity_stripe(rbio->faila))
2518 failp = rbio->faila;
2520 if (is_data_stripe(rbio, rbio->failb))
2522 else if (is_parity_stripe(rbio->failb))
2523 failp = rbio->failb;
2526 * Because we can not use a scrubbing parity to repair
2527 * the data, so the capability of the repair is declined.
2528 * (In the case of RAID5, we can not repair anything)
2530 if (dfail > rbio->bbio->max_errors - 1)
2534 * If all data is good, only parity is correctly, just
2535 * repair the parity.
2538 finish_parity_scrub(rbio, 0);
2543 * Here means we got one corrupted data stripe and one
2544 * corrupted parity on RAID6, if the corrupted parity
2545 * is scrubbing parity, luckily, use the other one to repair
2546 * the data, or we can not repair the data stripe.
2548 if (failp != rbio->scrubp)
2551 __raid_recover_end_io(rbio);
2553 finish_parity_scrub(rbio, 1);
2558 rbio_orig_end_io(rbio, BLK_STS_IOERR);
2562 * end io for the read phase of the parity scrub. All the bios here are physical
2563 * stripe bios we've read from the disk so we can check the parity of the
2566 * This will usually kick off finish_parity_scrub once all the bios are read in, but it
2567 * may trigger parity reconstruction if we had any errors along the way
/*
 * raid56_parity_scrub_end_io - completion handler for the reads issued by
 * raid56_parity_scrub_stripe()
 * @bio: completed read bio; bio->bi_private is the owning rbio
 *
 * Each completion either records the bio's stripe as failed or marks its
 * pages uptodate (presumably depending on the bio's completion status --
 * confirm). Once rbio->stripes_pending reaches zero the scrub continues in
 * validate_rbio_for_parity_scrub(), which also handles the error cases.
 */
2569 static void raid56_parity_scrub_end_io(struct bio *bio)
2571 struct btrfs_raid_bio *rbio = bio->bi_private;
2574 fail_bio_stripe(rbio, bio);
2576 set_bio_pages_uptodate(bio);
2580 if (!atomic_dec_and_test(&rbio->stripes_pending))
2584 * this will normally call finish_rmw to start our write
2585 * but if there are any failed stripes we'll reconstruct
2588 validate_rbio_for_parity_scrub(rbio);
/*
 * raid56_parity_scrub_stripe - read phase of a parity scrub
 * @rbio: parity-scrub raid bio
 *
 * Allocates only the pages selected by rbio->dbitmap and resets
 * rbio->error. For every real stripe (data and parity) and every page set
 * in dbitmap, a read is queued unless the page is supplied by the bio list
 * or the stripe page is already uptodate.
 *
 * With nothing to read, the scrub continues directly in
 * validate_rbio_for_parity_scrub(). Otherwise rbio->stripes_pending is set
 * to the number of read bios and they are submitted with
 * raid56_parity_scrub_end_io() as completion handler.
 *
 * On failure the original IO is ended with BLK_STS_IOERR and the bios left
 * on the local list are drained.
 */
2591 static void raid56_parity_scrub_stripe(struct btrfs_raid_bio *rbio)
2593 int bios_to_read = 0;
2594 struct bio_list bio_list;
2600 bio_list_init(&bio_list);
2602 ret = alloc_rbio_essential_pages(rbio);
2606 atomic_set(&rbio->error, 0);
2608 * build a list of bios to read all the missing parts of this
2611 for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
2612 for_each_set_bit(pagenr, rbio->dbitmap, rbio->stripe_npages) {
2615 * we want to find all the pages missing from
2616 * the rbio and read them from the disk. If
2617 * page_in_rbio finds a page in the bio list
2618 * we don't need to read it off the stripe.
2620 page = page_in_rbio(rbio, stripe, pagenr, 1);
2624 page = rbio_stripe_page(rbio, stripe, pagenr);
2626 * the bio cache may have handed us an uptodate
2627 * page. If so, be happy and use it
2629 if (PageUptodate(page))
2632 ret = rbio_add_io_page(rbio, &bio_list, page,
2633 stripe, pagenr, rbio->stripe_len);
2639 bios_to_read = bio_list_size(&bio_list);
2640 if (!bios_to_read) {
2642 * this can happen if others have merged with
2643 * us, it means there is nothing left to read.
2644 * But if there are missing devices it may not be
2645 * safe to do the full stripe write yet.
2651 * the bbio may be freed once we submit the last bio. Make sure
2652 * not to touch it after that
2654 atomic_set(&rbio->stripes_pending, bios_to_read);
2656 bio = bio_list_pop(&bio_list);
2660 bio->bi_private = rbio;
2661 bio->bi_end_io = raid56_parity_scrub_end_io;
2662 bio->bi_opf = REQ_OP_READ;
2664 btrfs_bio_wq_end_io(rbio->fs_info, bio, BTRFS_WQ_ENDIO_RAID56);
2668 /* the actual write will happen once the reads are done */
2672 rbio_orig_end_io(rbio, BLK_STS_IOERR);
2674 while ((bio = bio_list_pop(&bio_list)))
2680 validate_rbio_for_parity_scrub(rbio);
2683 static void scrub_parity_work(struct btrfs_work *work)
2685 struct btrfs_raid_bio *rbio;
2687 rbio = container_of(work, struct btrfs_raid_bio, work);
2688 raid56_parity_scrub_stripe(rbio);
2691 void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio)
2693 if (!lock_stripe_add(rbio))
2694 start_async_work(rbio, scrub_parity_work);
2697 /* The following code is used for dev replace of a missing RAID 5/6 device. */
/*
 * raid56_alloc_missing_rbio - build an rbio that rebuilds a stripe of a
 * missing device
 * @fs_info: filesystem the rebuild belongs to
 * @bio:     zero-sized bio that only carries the completion handler
 * @bbio:    stripe mapping of the full stripe
 * @length:  passed to alloc_rbio() as the stripe length
 *
 * The rbio is tagged BTRFS_RBIO_REBUILD_MISSING and rbio->faila is set to
 * the data stripe that contains @bio's logical start. A result of -1 from
 * that lookup is treated as a failure.
 *
 * generic_bio_cnt is set to 1 because the caller already took a bio_counter
 * reference when it obtained @bbio.
 */
2699 struct btrfs_raid_bio *
2700 raid56_alloc_missing_rbio(struct btrfs_fs_info *fs_info, struct bio *bio,
2701 struct btrfs_bio *bbio, u64 length)
2703 struct btrfs_raid_bio *rbio;
2705 rbio = alloc_rbio(fs_info, bbio, length);
2709 rbio->operation = BTRFS_RBIO_REBUILD_MISSING;
2710 bio_list_add(&rbio->bio_list, bio);
2712 * This is a special bio which is used to hold the completion handler
2713 * and make the scrub rbio is similar to the other types
2715 ASSERT(!bio->bi_iter.bi_size);
2717 rbio->faila = find_logical_bio_stripe(rbio, bio);
2718 if (rbio->faila == -1) {
2725 * When we get bbio, we have already increased bio_counter, record it
2726 * so we can free it at rbio_orig_end_io()
2728 rbio->generic_bio_cnt = 1;
2733 void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio)
2735 if (!lock_stripe_add(rbio))
2736 start_async_work(rbio, read_rebuild_work);