1 // SPDX-License-Identifier: GPL-2.0
3 * Copyright (C) 2011, 2012 STRATO. All rights reserved.
6 #include <linux/blkdev.h>
7 #include <linux/ratelimit.h>
8 #include <linux/sched/mm.h>
9 #include <crypto/hash.h>
14 #include "ordered-data.h"
15 #include "transaction.h"
17 #include "extent_io.h"
18 #include "dev-replace.h"
19 #include "check-integrity.h"
20 #include "rcu-string.h"
22 #include "block-group.h"
26 * This is only the first step towards a full-featured scrub. It reads all
27 * extents and super blocks and verifies the checksums. In case a bad checksum
28 * is found or the extent cannot be read, good data will be written back if
31 * Future enhancements:
32 * - In case an unrepairable extent is encountered, track which files are
33 * affected and report them
34 * - track and record media errors, throw out bad devices
35 * - add a mode to also read unallocated space
42 * The following two values only influence the performance.
44 * The second one configures the number of parallel and outstanding I/O
45 * operations. The first one configures an upper limit for the number
46 * of (dynamically allocated) pages that are added to a bio.
48 #define SCRUB_SECTORS_PER_BIO 32 /* 128KiB per bio for 4KiB pages */
49 #define SCRUB_BIOS_PER_SCTX 64 /* 8MiB per device in flight for 4KiB pages */
52 * The following value times the minimum 4KiB sector size needs to be large
53 * enough to match the largest node/leaf/sector size that shall be supported.
55 #define SCRUB_MAX_SECTORS_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
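/*
 * Illustrative, standalone userspace sketch (compile separately, not part of
 * this file): how the two tuning constants above translate into per-bio and
 * per-device in-flight I/O sizes for a 4KiB sector size.  The constant
 * values are copied from the definitions above; everything else is plain
 * arithmetic.
 */
#include <stdio.h>

int main(void)
{
	const unsigned int sectorsize = 4096;        /* 4KiB sectors */
	const unsigned int sectors_per_bio = 32;     /* SCRUB_SECTORS_PER_BIO */
	const unsigned int bios_per_sctx = 64;       /* SCRUB_BIOS_PER_SCTX */

	unsigned long bytes_per_bio = (unsigned long)sectors_per_bio * sectorsize;
	unsigned long in_flight = (unsigned long)bios_per_sctx * bytes_per_bio;

	/* 32 * 4KiB = 128KiB per bio, 64 * 128KiB = 8MiB per device in flight */
	printf("%lu KiB per bio, %lu MiB in flight per device\n",
	       bytes_per_bio >> 10, in_flight >> 20);
	return 0;
}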
57 struct scrub_recover {
59 struct btrfs_io_context *bioc;
64 struct scrub_block *sblock;
66 struct btrfs_device *dev;
67 struct list_head list;
68 u64 flags; /* extent flags */
72 u64 physical_for_dev_replace;
75 unsigned int have_csum:1;
76 unsigned int io_error:1;
77 u8 csum[BTRFS_CSUM_SIZE];
79 struct scrub_recover *recover;
84 struct scrub_ctx *sctx;
85 struct btrfs_device *dev;
90 struct scrub_sector *sectors[SCRUB_SECTORS_PER_BIO];
93 struct work_struct work;
97 struct scrub_sector *sectors[SCRUB_MAX_SECTORS_PER_BLOCK];
99 atomic_t outstanding_sectors;
100 refcount_t refs; /* free mem on transition to zero */
101 struct scrub_ctx *sctx;
102 struct scrub_parity *sparity;
104 unsigned int header_error:1;
105 unsigned int checksum_error:1;
106 unsigned int no_io_error_seen:1;
107 unsigned int generation_error:1; /* also sets header_error */
109 /* The following is for the data used to check parity */
110 /* It is for the data with checksum */
111 unsigned int data_corrected:1;
113 struct work_struct work;
116 /* Used for the chunks with parity stripe such as RAID5/6 */
117 struct scrub_parity {
118 struct scrub_ctx *sctx;
120 struct btrfs_device *scrub_dev;
132 struct list_head sectors_list;
134 /* Work of parity check and repair */
135 struct work_struct work;
137 /* Mark the parity blocks which have data */
138 unsigned long *dbitmap;
141 * Mark the parity blocks which have data, but where errors happened when
142 * reading or checking that data
144 unsigned long *ebitmap;
146 unsigned long bitmap[];
150 struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX];
151 struct btrfs_fs_info *fs_info;
154 atomic_t bios_in_flight;
155 atomic_t workers_pending;
156 spinlock_t list_lock;
157 wait_queue_head_t list_wait;
158 struct list_head csum_list;
163 /* State of IO submission throttling affecting the associated device */
164 ktime_t throttle_deadline;
170 struct scrub_bio *wr_curr_bio;
171 struct mutex wr_lock;
172 struct btrfs_device *wr_tgtdev;
173 bool flush_all_writes;
178 struct btrfs_scrub_progress stat;
179 spinlock_t stat_lock;
182 * Use a ref counter to avoid use-after-free issues. Scrub workers
183 * decrement bios_in_flight and workers_pending and then do a wakeup
184 * on the list_wait wait queue. We must ensure the main scrub task
185 * doesn't free the scrub context before or while the workers are
186 * doing the wakeup() call.
191 struct scrub_warning {
192 struct btrfs_path *path;
193 u64 extent_item_size;
197 struct btrfs_device *dev;
200 struct full_stripe_lock {
207 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
208 struct scrub_block *sblocks_for_recheck);
209 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
210 struct scrub_block *sblock,
211 int retry_failed_mirror);
212 static void scrub_recheck_block_checksum(struct scrub_block *sblock);
213 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
214 struct scrub_block *sblock_good);
215 static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
216 struct scrub_block *sblock_good,
217 int sector_num, int force_write);
218 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
219 static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock,
221 static int scrub_checksum_data(struct scrub_block *sblock);
222 static int scrub_checksum_tree_block(struct scrub_block *sblock);
223 static int scrub_checksum_super(struct scrub_block *sblock);
224 static void scrub_block_put(struct scrub_block *sblock);
225 static void scrub_sector_get(struct scrub_sector *sector);
226 static void scrub_sector_put(struct scrub_sector *sector);
227 static void scrub_parity_get(struct scrub_parity *sparity);
228 static void scrub_parity_put(struct scrub_parity *sparity);
229 static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
230 u64 physical, struct btrfs_device *dev, u64 flags,
231 u64 gen, int mirror_num, u8 *csum,
232 u64 physical_for_dev_replace);
233 static void scrub_bio_end_io(struct bio *bio);
234 static void scrub_bio_end_io_worker(struct work_struct *work);
235 static void scrub_block_complete(struct scrub_block *sblock);
236 static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
237 u64 extent_logical, u32 extent_len,
238 u64 *extent_physical,
239 struct btrfs_device **extent_dev,
240 int *extent_mirror_num);
241 static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
242 struct scrub_sector *sector);
243 static void scrub_wr_submit(struct scrub_ctx *sctx);
244 static void scrub_wr_bio_end_io(struct bio *bio);
245 static void scrub_wr_bio_end_io_worker(struct work_struct *work);
246 static void scrub_put_ctx(struct scrub_ctx *sctx);
248 static inline int scrub_is_page_on_raid56(struct scrub_sector *sector)
250 return sector->recover &&
251 (sector->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
254 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
256 refcount_inc(&sctx->refs);
257 atomic_inc(&sctx->bios_in_flight);
260 static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
262 atomic_dec(&sctx->bios_in_flight);
263 wake_up(&sctx->list_wait);
267 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
269 while (atomic_read(&fs_info->scrub_pause_req)) {
270 mutex_unlock(&fs_info->scrub_lock);
271 wait_event(fs_info->scrub_pause_wait,
272 atomic_read(&fs_info->scrub_pause_req) == 0);
273 mutex_lock(&fs_info->scrub_lock);
277 static void scrub_pause_on(struct btrfs_fs_info *fs_info)
279 atomic_inc(&fs_info->scrubs_paused);
280 wake_up(&fs_info->scrub_pause_wait);
283 static void scrub_pause_off(struct btrfs_fs_info *fs_info)
285 mutex_lock(&fs_info->scrub_lock);
286 __scrub_blocked_if_needed(fs_info);
287 atomic_dec(&fs_info->scrubs_paused);
288 mutex_unlock(&fs_info->scrub_lock);
290 wake_up(&fs_info->scrub_pause_wait);
293 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
295 scrub_pause_on(fs_info);
296 scrub_pause_off(fs_info);
300 * Insert new full stripe lock into full stripe locks tree
302 * Return pointer to existing or newly inserted full_stripe_lock structure if
303 * everything works well.
304 * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
306 * NOTE: caller must hold full_stripe_locks_root->lock before calling this
309 static struct full_stripe_lock *insert_full_stripe_lock(
310 struct btrfs_full_stripe_locks_tree *locks_root,
314 struct rb_node *parent = NULL;
315 struct full_stripe_lock *entry;
316 struct full_stripe_lock *ret;
318 lockdep_assert_held(&locks_root->lock);
320 p = &locks_root->root.rb_node;
323 entry = rb_entry(parent, struct full_stripe_lock, node);
324 if (fstripe_logical < entry->logical) {
326 } else if (fstripe_logical > entry->logical) {
337 ret = kmalloc(sizeof(*ret), GFP_KERNEL);
339 return ERR_PTR(-ENOMEM);
340 ret->logical = fstripe_logical;
342 mutex_init(&ret->mutex);
344 rb_link_node(&ret->node, parent, p);
345 rb_insert_color(&ret->node, &locks_root->root);
350 * Search for a full stripe lock of a block group
352 * Return pointer to existing full stripe lock if found
353 * Return NULL if not found
355 static struct full_stripe_lock *search_full_stripe_lock(
356 struct btrfs_full_stripe_locks_tree *locks_root,
359 struct rb_node *node;
360 struct full_stripe_lock *entry;
362 lockdep_assert_held(&locks_root->lock);
364 node = locks_root->root.rb_node;
366 entry = rb_entry(node, struct full_stripe_lock, node);
367 if (fstripe_logical < entry->logical)
368 node = node->rb_left;
369 else if (fstripe_logical > entry->logical)
370 node = node->rb_right;
378 * Helper to get full stripe logical from a normal bytenr.
380 * Caller must ensure @cache is a RAID56 block group.
382 static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
387 * Due to chunk item size limit, full stripe length should not be
388 * larger than U32_MAX. Just a sanity check here.
390 WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
393 * round_down() can only handle power of 2, while RAID56 full
394 * stripe length can be 64KiB * n, so we need to manually round down.
396 ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
397 cache->full_stripe_len + cache->start;
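/*
 * Standalone userspace sketch (compile separately, not part of this file) of
 * the round-down done by get_full_stripe_logical() above.  The block group
 * start and full stripe length below are made-up example values (a 3-disk
 * RAID5 with 64KiB stripes gives a 128KiB full stripe, and the full stripe
 * length is not required to be a power of two in general).
 */
#include <stdio.h>
#include <inttypes.h>

static uint64_t full_stripe_logical(uint64_t bg_start, uint64_t full_stripe_len,
				    uint64_t bytenr)
{
	/* Manual round-down: round_down() only handles powers of two. */
	return (bytenr - bg_start) / full_stripe_len * full_stripe_len + bg_start;
}

int main(void)
{
	uint64_t bg_start = 1024 * 1024;        /* block group starts at 1MiB */
	uint64_t full_stripe_len = 128 * 1024;  /* 2 data stripes * 64KiB */
	uint64_t bytenr = bg_start + 200 * 1024;

	/* (200KiB / 128KiB) * 128KiB = 128KiB -> full stripe starts at 1MiB + 128KiB */
	printf("full stripe logical: %" PRIu64 "\n",
	       full_stripe_logical(bg_start, full_stripe_len, bytenr));
	return 0;
}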
402 * Lock a full stripe to avoid concurrency of recovery and read
404 * It's only used for profiles with parities (RAID5/6), for other profiles it
407 * Return 0 if we locked full stripe covering @bytenr, with a mutex held.
408 * So the caller must call unlock_full_stripe() from the same context.
410 * Return <0 if encounters error.
412 static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
415 struct btrfs_block_group *bg_cache;
416 struct btrfs_full_stripe_locks_tree *locks_root;
417 struct full_stripe_lock *existing;
422 bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
428 /* Profiles not based on parity don't need full stripe lock */
429 if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
431 locks_root = &bg_cache->full_stripe_locks_root;
433 fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
435 /* Now insert the full stripe lock */
436 mutex_lock(&locks_root->lock);
437 existing = insert_full_stripe_lock(locks_root, fstripe_start);
438 mutex_unlock(&locks_root->lock);
439 if (IS_ERR(existing)) {
440 ret = PTR_ERR(existing);
443 mutex_lock(&existing->mutex);
446 btrfs_put_block_group(bg_cache);
451 * Unlock a full stripe.
453 * NOTE: Caller must ensure it's the same context calling corresponding
454 * lock_full_stripe().
456 * Return 0 if we unlock full stripe without problem.
457 * Return <0 for error
459 static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
462 struct btrfs_block_group *bg_cache;
463 struct btrfs_full_stripe_locks_tree *locks_root;
464 struct full_stripe_lock *fstripe_lock;
469 /* If we didn't acquire full stripe lock, no need to continue */
473 bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
478 if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
481 locks_root = &bg_cache->full_stripe_locks_root;
482 fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
484 mutex_lock(&locks_root->lock);
485 fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
486 /* Unpaired unlock_full_stripe() detected */
490 mutex_unlock(&locks_root->lock);
494 if (fstripe_lock->refs == 0) {
496 btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
497 fstripe_lock->logical);
499 fstripe_lock->refs--;
502 if (fstripe_lock->refs == 0) {
503 rb_erase(&fstripe_lock->node, &locks_root->root);
506 mutex_unlock(&locks_root->lock);
508 mutex_unlock(&fstripe_lock->mutex);
512 btrfs_put_block_group(bg_cache);
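/*
 * Minimal usage sketch of the locking contract documented above, mirroring
 * how scrub_handle_errored_block() further below uses these helpers: the
 * bool filled in by lock_full_stripe() must be handed back to
 * unlock_full_stripe() from the same context.  This is only an illustrative
 * sketch; error handling and the actual repair work are trimmed.
 */
static int scrub_full_stripe_lock_usage_sketch(struct btrfs_fs_info *fs_info,
					       u64 logical)
{
	bool locked = false;
	int ret;

	ret = lock_full_stripe(fs_info, logical, &locked);
	if (ret < 0)
		return ret;

	/* ... recheck and repair sectors inside the locked full stripe ... */

	return unlock_full_stripe(fs_info, logical, locked);
}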
516 static void scrub_free_csums(struct scrub_ctx *sctx)
518 while (!list_empty(&sctx->csum_list)) {
519 struct btrfs_ordered_sum *sum;
520 sum = list_first_entry(&sctx->csum_list,
521 struct btrfs_ordered_sum, list);
522 list_del(&sum->list);
527 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
534 /* this can happen when scrub is cancelled */
535 if (sctx->curr != -1) {
536 struct scrub_bio *sbio = sctx->bios[sctx->curr];
538 for (i = 0; i < sbio->sector_count; i++) {
539 WARN_ON(!sbio->sectors[i]->page);
540 scrub_block_put(sbio->sectors[i]->sblock);
545 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
546 struct scrub_bio *sbio = sctx->bios[i];
553 kfree(sctx->wr_curr_bio);
554 scrub_free_csums(sctx);
558 static void scrub_put_ctx(struct scrub_ctx *sctx)
560 if (refcount_dec_and_test(&sctx->refs))
561 scrub_free_ctx(sctx);
564 static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
565 struct btrfs_fs_info *fs_info, int is_dev_replace)
567 struct scrub_ctx *sctx;
570 sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
573 refcount_set(&sctx->refs, 1);
574 sctx->is_dev_replace = is_dev_replace;
575 sctx->sectors_per_bio = SCRUB_SECTORS_PER_BIO;
577 sctx->fs_info = fs_info;
578 INIT_LIST_HEAD(&sctx->csum_list);
579 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
580 struct scrub_bio *sbio;
582 sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
585 sctx->bios[i] = sbio;
589 sbio->sector_count = 0;
590 INIT_WORK(&sbio->work, scrub_bio_end_io_worker);
592 if (i != SCRUB_BIOS_PER_SCTX - 1)
593 sctx->bios[i]->next_free = i + 1;
595 sctx->bios[i]->next_free = -1;
597 sctx->first_free = 0;
598 atomic_set(&sctx->bios_in_flight, 0);
599 atomic_set(&sctx->workers_pending, 0);
600 atomic_set(&sctx->cancel_req, 0);
602 spin_lock_init(&sctx->list_lock);
603 spin_lock_init(&sctx->stat_lock);
604 init_waitqueue_head(&sctx->list_wait);
605 sctx->throttle_deadline = 0;
607 WARN_ON(sctx->wr_curr_bio != NULL);
608 mutex_init(&sctx->wr_lock);
609 sctx->wr_curr_bio = NULL;
610 if (is_dev_replace) {
611 WARN_ON(!fs_info->dev_replace.tgtdev);
612 sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
613 sctx->flush_all_writes = false;
619 scrub_free_ctx(sctx);
620 return ERR_PTR(-ENOMEM);
623 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
630 struct extent_buffer *eb;
631 struct btrfs_inode_item *inode_item;
632 struct scrub_warning *swarn = warn_ctx;
633 struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
634 struct inode_fs_paths *ipath = NULL;
635 struct btrfs_root *local_root;
636 struct btrfs_key key;
638 local_root = btrfs_get_fs_root(fs_info, root, true);
639 if (IS_ERR(local_root)) {
640 ret = PTR_ERR(local_root);
645 * this makes the path point to (inum INODE_ITEM ioff)
648 key.type = BTRFS_INODE_ITEM_KEY;
651 ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
653 btrfs_put_root(local_root);
654 btrfs_release_path(swarn->path);
658 eb = swarn->path->nodes[0];
659 inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
660 struct btrfs_inode_item);
661 nlink = btrfs_inode_nlink(eb, inode_item);
662 btrfs_release_path(swarn->path);
665 * init_ipath() might indirectly call vmalloc, or use GFP_KERNEL. Scrub
666 * uses GFP_NOFS in this context, so we keep it consistent but it does
667 * not seem to be strictly necessary.
669 nofs_flag = memalloc_nofs_save();
670 ipath = init_ipath(4096, local_root, swarn->path);
671 memalloc_nofs_restore(nofs_flag);
673 btrfs_put_root(local_root);
674 ret = PTR_ERR(ipath);
678 ret = paths_from_inode(inum, ipath);
684 * We deliberately ignore the fact that ipath might have been too small to
685 * hold all of the paths here.
687 for (i = 0; i < ipath->fspath->elem_cnt; ++i)
688 btrfs_warn_in_rcu(fs_info,
689 "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
690 swarn->errstr, swarn->logical,
691 rcu_str_deref(swarn->dev->name),
694 fs_info->sectorsize, nlink,
695 (char *)(unsigned long)ipath->fspath->val[i]);
697 btrfs_put_root(local_root);
702 btrfs_warn_in_rcu(fs_info,
703 "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
704 swarn->errstr, swarn->logical,
705 rcu_str_deref(swarn->dev->name),
707 root, inum, offset, ret);
713 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
715 struct btrfs_device *dev;
716 struct btrfs_fs_info *fs_info;
717 struct btrfs_path *path;
718 struct btrfs_key found_key;
719 struct extent_buffer *eb;
720 struct btrfs_extent_item *ei;
721 struct scrub_warning swarn;
722 unsigned long ptr = 0;
730 WARN_ON(sblock->sector_count < 1);
731 dev = sblock->sectors[0]->dev;
732 fs_info = sblock->sctx->fs_info;
734 path = btrfs_alloc_path();
738 swarn.physical = sblock->sectors[0]->physical;
739 swarn.logical = sblock->sectors[0]->logical;
740 swarn.errstr = errstr;
743 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
748 extent_item_pos = swarn.logical - found_key.objectid;
749 swarn.extent_item_size = found_key.offset;
752 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
753 item_size = btrfs_item_size(eb, path->slots[0]);
755 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
757 ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
758 item_size, &ref_root,
760 btrfs_warn_in_rcu(fs_info,
761 "%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
762 errstr, swarn.logical,
763 rcu_str_deref(dev->name),
765 ref_level ? "node" : "leaf",
766 ret < 0 ? -1 : ref_level,
767 ret < 0 ? -1 : ref_root);
769 btrfs_release_path(path);
771 btrfs_release_path(path);
774 iterate_extent_inodes(fs_info, found_key.objectid,
776 scrub_print_warning_inode, &swarn, false);
780 btrfs_free_path(path);
783 static inline void scrub_get_recover(struct scrub_recover *recover)
785 refcount_inc(&recover->refs);
788 static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
789 struct scrub_recover *recover)
791 if (refcount_dec_and_test(&recover->refs)) {
792 btrfs_bio_counter_dec(fs_info);
793 btrfs_put_bioc(recover->bioc);
799 * scrub_handle_errored_block gets called when either verification of the
800 * sectors failed or the bio failed to read, e.g. with EIO. In the latter
801 * case, this function handles all sectors in the bio, even though only one
803 * The goal of this function is to repair the errored block by using the
804 * contents of one of the mirrors.
806 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
808 struct scrub_ctx *sctx = sblock_to_check->sctx;
809 struct btrfs_device *dev;
810 struct btrfs_fs_info *fs_info;
812 unsigned int failed_mirror_index;
813 unsigned int is_metadata;
814 unsigned int have_csum;
815 struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
816 struct scrub_block *sblock_bad;
821 bool full_stripe_locked;
822 unsigned int nofs_flag;
823 static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
824 DEFAULT_RATELIMIT_BURST);
826 BUG_ON(sblock_to_check->sector_count < 1);
827 fs_info = sctx->fs_info;
828 if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
830 * If we find an error in a super block, we just report it; super blocks
831 * will get rewritten with the next transaction commit
834 spin_lock(&sctx->stat_lock);
835 ++sctx->stat.super_errors;
836 spin_unlock(&sctx->stat_lock);
839 logical = sblock_to_check->sectors[0]->logical;
840 BUG_ON(sblock_to_check->sectors[0]->mirror_num < 1);
841 failed_mirror_index = sblock_to_check->sectors[0]->mirror_num - 1;
842 is_metadata = !(sblock_to_check->sectors[0]->flags &
843 BTRFS_EXTENT_FLAG_DATA);
844 have_csum = sblock_to_check->sectors[0]->have_csum;
845 dev = sblock_to_check->sectors[0]->dev;
847 if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical))
851 * We must use GFP_NOFS because the scrub task might be waiting for a
852 * worker task executing this function and in turn a transaction commit
853 * might be waiting for the scrub task to pause (which needs to wait for all
854 * the worker tasks to complete before pausing).
855 * We do allocations in the workers through insert_full_stripe_lock()
856 * and scrub_add_sector_to_wr_bio(), which happens down the call chain of
859 nofs_flag = memalloc_nofs_save();
861 * For RAID5/6, a race can happen between scrub threads of different devices.
862 * For data corruption, the parity and data threads will both try
863 * to recover the data.
864 * The race can lead to doubly counted csum errors, or even unrecoverable
867 ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
869 memalloc_nofs_restore(nofs_flag);
870 spin_lock(&sctx->stat_lock);
872 sctx->stat.malloc_errors++;
873 sctx->stat.read_errors++;
874 sctx->stat.uncorrectable_errors++;
875 spin_unlock(&sctx->stat_lock);
880 * read all mirrors one after the other. This includes re-reading
881 * the extent or metadata block that failed (the reason this
882 * fixup code was called), this time sector by sector, in order
883 * to know which sectors
884 * caused I/O errors and which ones are good (for all mirrors).
885 * It is the goal to handle the situation when more than one
886 * mirror contains I/O errors, but the errors do not
887 * overlap, i.e. the data can be repaired by selecting the
888 * sectors from those mirrors without I/O error on the
889 * particular sectors. One example (with blocks >= 2 * sectorsize)
890 * would be that mirror #1 has an I/O error on the first sector,
891 * the second sector is good, and mirror #2 has an I/O error on
892 * the second sector, but the first sector is good.
893 * Then the first sector of the first mirror can be repaired by
894 * taking the first sector of the second mirror, and the
895 * second sector of the second mirror can be repaired by
896 * copying the contents of the 2nd sector of the 1st mirror.
897 * One more note: if the sectors of one mirror contain I/O
898 * errors, the checksum cannot be verified. In order to get
899 * the best data for repairing, the first attempt is to find
900 * a mirror without I/O errors and with a validated checksum.
901 * Only if this is not possible, the sectors are picked from
902 * mirrors with I/O errors without considering the checksum.
903 * If the latter is the case, at the end, the checksum of the
904 * repaired area is verified in order to correctly maintain
908 sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
909 sizeof(*sblocks_for_recheck), GFP_KERNEL);
910 if (!sblocks_for_recheck) {
911 spin_lock(&sctx->stat_lock);
912 sctx->stat.malloc_errors++;
913 sctx->stat.read_errors++;
914 sctx->stat.uncorrectable_errors++;
915 spin_unlock(&sctx->stat_lock);
916 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
920 /* Setup the context, map the logical blocks and alloc the sectors */
921 ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
923 spin_lock(&sctx->stat_lock);
924 sctx->stat.read_errors++;
925 sctx->stat.uncorrectable_errors++;
926 spin_unlock(&sctx->stat_lock);
927 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
930 BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
931 sblock_bad = sblocks_for_recheck + failed_mirror_index;
933 /* build and submit the bios for the failed mirror, check checksums */
934 scrub_recheck_block(fs_info, sblock_bad, 1);
936 if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
937 sblock_bad->no_io_error_seen) {
939 * The error disappeared after reading sector by sector, or
940 * the area was part of a huge bio and other parts of the
941 * bio caused I/O errors, or the block layer merged several
942 * read requests into one and the error is caused by a
943 * different bio (usually one of the two latter cases is
946 spin_lock(&sctx->stat_lock);
947 sctx->stat.unverified_errors++;
948 sblock_to_check->data_corrected = 1;
949 spin_unlock(&sctx->stat_lock);
951 if (sctx->is_dev_replace)
952 scrub_write_block_to_dev_replace(sblock_bad);
956 if (!sblock_bad->no_io_error_seen) {
957 spin_lock(&sctx->stat_lock);
958 sctx->stat.read_errors++;
959 spin_unlock(&sctx->stat_lock);
960 if (__ratelimit(&rs))
961 scrub_print_warning("i/o error", sblock_to_check);
962 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
963 } else if (sblock_bad->checksum_error) {
964 spin_lock(&sctx->stat_lock);
965 sctx->stat.csum_errors++;
966 spin_unlock(&sctx->stat_lock);
967 if (__ratelimit(&rs))
968 scrub_print_warning("checksum error", sblock_to_check);
969 btrfs_dev_stat_inc_and_print(dev,
970 BTRFS_DEV_STAT_CORRUPTION_ERRS);
971 } else if (sblock_bad->header_error) {
972 spin_lock(&sctx->stat_lock);
973 sctx->stat.verify_errors++;
974 spin_unlock(&sctx->stat_lock);
975 if (__ratelimit(&rs))
976 scrub_print_warning("checksum/header error",
978 if (sblock_bad->generation_error)
979 btrfs_dev_stat_inc_and_print(dev,
980 BTRFS_DEV_STAT_GENERATION_ERRS);
982 btrfs_dev_stat_inc_and_print(dev,
983 BTRFS_DEV_STAT_CORRUPTION_ERRS);
986 if (sctx->readonly) {
987 ASSERT(!sctx->is_dev_replace);
992 * now build and submit the bios for the other mirrors, check
994 * First try to pick the mirror which is completely without I/O
995 * errors and also does not have a checksum error.
996 * If one is found, and if a checksum is present, the full block
997 * that is known to contain an error is rewritten. Afterwards
998 * the block is known to be corrected.
999 * If a mirror is found which is completely correct, and no
1000 * checksum is present, only those sectors are rewritten that had
1001 * an I/O error in the block to be repaired, since it cannot be
1002 * determined which copy of the other sectors is better (and it
1003 * could happen otherwise that a correct sector would be
1004 * overwritten by a bad one).
1006 for (mirror_index = 0; ;mirror_index++) {
1007 struct scrub_block *sblock_other;
1009 if (mirror_index == failed_mirror_index)
1012 /* raid56's mirror can be more than BTRFS_MAX_MIRRORS */
1013 if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
1014 if (mirror_index >= BTRFS_MAX_MIRRORS)
1016 if (!sblocks_for_recheck[mirror_index].sector_count)
1019 sblock_other = sblocks_for_recheck + mirror_index;
1021 struct scrub_recover *r = sblock_bad->sectors[0]->recover;
1022 int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs;
1024 if (mirror_index >= max_allowed)
1026 if (!sblocks_for_recheck[1].sector_count)
1029 ASSERT(failed_mirror_index == 0);
1030 sblock_other = sblocks_for_recheck + 1;
1031 sblock_other->sectors[0]->mirror_num = 1 + mirror_index;
1034 /* build and submit the bios, check checksums */
1035 scrub_recheck_block(fs_info, sblock_other, 0);
1037 if (!sblock_other->header_error &&
1038 !sblock_other->checksum_error &&
1039 sblock_other->no_io_error_seen) {
1040 if (sctx->is_dev_replace) {
1041 scrub_write_block_to_dev_replace(sblock_other);
1042 goto corrected_error;
1044 ret = scrub_repair_block_from_good_copy(
1045 sblock_bad, sblock_other);
1047 goto corrected_error;
1052 if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1053 goto did_not_correct_error;
1056 * In case of I/O errors in the area that is supposed to be
1057 * repaired, continue by picking good copies of those sectors:
1058 * select the good sectors from the other mirrors to rewrite the bad
1059 * sectors in the area to fix (a standalone sketch of this selection
1060 * follows this function). Afterwards verify the checksum of the
1061 * repaired block. This verification step is only done for the purpose
1062 * of statistics counting and for the final scrub report, i.e. whether errors remain.
1063 * A perfect algorithm could make use of the checksum and try
1064 * all possible combinations of sectors from the different mirrors
1065 * until the checksum verification succeeds. For example, when
1066 * the 2nd sector of mirror #1 faces I/O errors, and the 2nd sector
1067 * of mirror #2 is readable but the final checksum test fails,
1068 * then the 2nd sector of mirror #3 could be tried, to see whether
1069 * the final checksum then succeeds. But this would be a rare
1070 * exception and is therefore not implemented. At least it is
1071 * avoided that the good copy is overwritten.
1072 * A more useful improvement would be to pick the sectors
1073 * without I/O error based on sector sizes (512 bytes on legacy
1074 * disks) instead of on sectorsize. Then maybe 512 byte of one
1075 * mirror could be repaired by taking 512 byte of a different
1076 * mirror, even if other 512 byte sectors in the same sectorsize
1077 * area are unreadable.
1080 for (sector_num = 0; sector_num < sblock_bad->sector_count;
1082 struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
1083 struct scrub_block *sblock_other = NULL;
1085 /* Skip no-io-error sectors in scrub */
1086 if (!sector_bad->io_error && !sctx->is_dev_replace)
1089 if (scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
1091 * In case of dev replace, if the raid56 rebuild process
1092 * didn't produce correct data, then copy the content
1093 * of sblock_bad to make sure the target device is identical
1094 * to the source device, instead of writing the garbage data
1095 * held in the sblock_for_recheck array to the target device.
1097 sblock_other = NULL;
1098 } else if (sector_bad->io_error) {
1099 /* Try to find no-io-error sector in mirrors */
1100 for (mirror_index = 0;
1101 mirror_index < BTRFS_MAX_MIRRORS &&
1102 sblocks_for_recheck[mirror_index].sector_count > 0;
1104 if (!sblocks_for_recheck[mirror_index].
1105 sectors[sector_num]->io_error) {
1106 sblock_other = sblocks_for_recheck +
1115 if (sctx->is_dev_replace) {
1117 * Did not find a mirror to fetch the sector from.
1118 * scrub_write_sector_to_dev_replace() handles this
1119 * case (sector->io_error), by filling the block with
1120 * zeros before submitting the write request
1123 sblock_other = sblock_bad;
1125 if (scrub_write_sector_to_dev_replace(sblock_other,
1128 &fs_info->dev_replace.num_write_errors);
1131 } else if (sblock_other) {
1132 ret = scrub_repair_sector_from_good_copy(sblock_bad,
1136 sector_bad->io_error = 0;
1142 if (success && !sctx->is_dev_replace) {
1143 if (is_metadata || have_csum) {
1145 * need to verify the checksum now that all
1146 * sectors on disk are repaired (the write
1147 * request for data to be repaired is on its way).
1148 * Just be lazy and use scrub_recheck_block()
1149 * which re-reads the data before the checksum
1150 * is verified, but most likely the data comes out
1151 * of the page cache.
1153 scrub_recheck_block(fs_info, sblock_bad, 1);
1154 if (!sblock_bad->header_error &&
1155 !sblock_bad->checksum_error &&
1156 sblock_bad->no_io_error_seen)
1157 goto corrected_error;
1159 goto did_not_correct_error;
1162 spin_lock(&sctx->stat_lock);
1163 sctx->stat.corrected_errors++;
1164 sblock_to_check->data_corrected = 1;
1165 spin_unlock(&sctx->stat_lock);
1166 btrfs_err_rl_in_rcu(fs_info,
1167 "fixed up error at logical %llu on dev %s",
1168 logical, rcu_str_deref(dev->name));
1171 did_not_correct_error:
1172 spin_lock(&sctx->stat_lock);
1173 sctx->stat.uncorrectable_errors++;
1174 spin_unlock(&sctx->stat_lock);
1175 btrfs_err_rl_in_rcu(fs_info,
1176 "unable to fixup (regular) error at logical %llu on dev %s",
1177 logical, rcu_str_deref(dev->name));
1181 if (sblocks_for_recheck) {
1182 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1184 struct scrub_block *sblock = sblocks_for_recheck +
1186 struct scrub_recover *recover;
1189 for (i = 0; i < sblock->sector_count; i++) {
1190 sblock->sectors[i]->sblock = NULL;
1191 recover = sblock->sectors[i]->recover;
1193 scrub_put_recover(fs_info, recover);
1194 sblock->sectors[i]->recover = NULL;
1196 scrub_sector_put(sblock->sectors[i]);
1199 kfree(sblocks_for_recheck);
1202 ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
1203 memalloc_nofs_restore(nofs_flag);
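/*
 * Standalone userspace model (compile separately, not part of this file) of
 * the per-sector repair strategy described in the comments of
 * scrub_handle_errored_block() above: for every errored sector, pick the
 * same sector from the first mirror that read it without an I/O error.  The
 * mirror count, sector count and error pattern below are made-up example
 * data.
 */
#include <stdio.h>

#define NR_MIRRORS	3
#define NR_SECTORS	4

int main(void)
{
	/* io_error[mirror][sector]: 1 = read failed, 0 = read ok */
	int io_error[NR_MIRRORS][NR_SECTORS] = {
		{ 1, 0, 0, 0 },		/* mirror 1 (the bad one): first sector bad */
		{ 0, 1, 0, 0 },		/* mirror 2: second sector bad */
		{ 1, 1, 1, 1 },		/* mirror 3: completely unreadable */
	};

	for (int sector = 0; sector < NR_SECTORS; sector++) {
		int src = -1;

		if (!io_error[0][sector])
			continue;	/* sector in the bad mirror read fine */

		for (int mirror = 1; mirror < NR_MIRRORS; mirror++) {
			if (!io_error[mirror][sector]) {
				src = mirror;
				break;
			}
		}
		if (src < 0)
			printf("sector %d: unrepairable\n", sector);
		else
			printf("sector %d: repair from mirror %d\n", sector, src + 1);
	}
	return 0;
}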
1209 static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
1211 if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
1213 else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
1216 return (int)bioc->num_stripes;
1219 static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1222 int nstripes, int mirror,
1228 if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1230 for (i = 0; i < nstripes; i++) {
1231 if (raid_map[i] == RAID6_Q_STRIPE ||
1232 raid_map[i] == RAID5_P_STRIPE)
1235 if (logical >= raid_map[i] &&
1236 logical < raid_map[i] + mapped_length)
1241 *stripe_offset = logical - raid_map[i];
1243 /* The other RAID type */
1244 *stripe_index = mirror;
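/*
 * Standalone userspace model (compile separately, not part of this file) of
 * the RAID56 branch of scrub_stripe_index_and_offset() above: walk the
 * raid_map (the logical address each stripe starts at, with sentinel markers
 * for the P/Q stripes) and find which data stripe covers the logical address
 * and at which offset.  The sentinel defines below merely stand in for the
 * kernel's RAID5_P_STRIPE/RAID6_Q_STRIPE markers, and the concrete addresses
 * are made-up example data.
 */
#include <stdio.h>
#include <inttypes.h>

#define RAID6_Q_STRIPE	((uint64_t)-1)
#define RAID5_P_STRIPE	((uint64_t)-2)

int main(void)
{
	const uint64_t mapped_length = 64 * 1024;	/* one 64KiB stripe */
	/* 2 data stripes + parity, e.g. a 3-disk RAID5 full stripe */
	const uint64_t raid_map[] = { 1048576, 1048576 + 65536, RAID5_P_STRIPE };
	const int nstripes = 3;
	const uint64_t logical = 1048576 + 65536 + 4096;
	int stripe_index = -1;
	uint64_t stripe_offset = 0;

	for (int i = 0; i < nstripes; i++) {
		if (raid_map[i] == RAID6_Q_STRIPE || raid_map[i] == RAID5_P_STRIPE)
			continue;
		if (logical >= raid_map[i] && logical < raid_map[i] + mapped_length) {
			stripe_index = i;
			stripe_offset = logical - raid_map[i];
			break;
		}
	}
	/* Expect: stripe 1 at offset 4096 */
	printf("stripe %d, offset %" PRIu64 "\n", stripe_index, stripe_offset);
	return 0;
}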
1249 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
1250 struct scrub_block *sblocks_for_recheck)
1252 struct scrub_ctx *sctx = original_sblock->sctx;
1253 struct btrfs_fs_info *fs_info = sctx->fs_info;
1254 u64 length = original_sblock->sector_count << fs_info->sectorsize_bits;
1255 u64 logical = original_sblock->sectors[0]->logical;
1256 u64 generation = original_sblock->sectors[0]->generation;
1257 u64 flags = original_sblock->sectors[0]->flags;
1258 u64 have_csum = original_sblock->sectors[0]->have_csum;
1259 struct scrub_recover *recover;
1260 struct btrfs_io_context *bioc;
1265 int sector_index = 0;
1271 * Note: the two members refs and outstanding_sectors are not used (and
1272 * not set) in the blocks that are used for the recheck procedure.
1275 while (length > 0) {
1276 sublen = min_t(u64, length, fs_info->sectorsize);
1277 mapped_length = sublen;
1281 * With a length of sectorsize, each returned stripe represents
1284 btrfs_bio_counter_inc_blocked(fs_info);
1285 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
1286 logical, &mapped_length, &bioc);
1287 if (ret || !bioc || mapped_length < sublen) {
1288 btrfs_put_bioc(bioc);
1289 btrfs_bio_counter_dec(fs_info);
1293 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1295 btrfs_put_bioc(bioc);
1296 btrfs_bio_counter_dec(fs_info);
1300 refcount_set(&recover->refs, 1);
1301 recover->bioc = bioc;
1302 recover->map_length = mapped_length;
1304 ASSERT(sector_index < SCRUB_MAX_SECTORS_PER_BLOCK);
1306 nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS);
1308 for (mirror_index = 0; mirror_index < nmirrors;
1310 struct scrub_block *sblock;
1311 struct scrub_sector *sector;
1313 sblock = sblocks_for_recheck + mirror_index;
1314 sblock->sctx = sctx;
1316 sector = kzalloc(sizeof(*sector), GFP_NOFS);
1319 spin_lock(&sctx->stat_lock);
1320 sctx->stat.malloc_errors++;
1321 spin_unlock(&sctx->stat_lock);
1322 scrub_put_recover(fs_info, recover);
1325 scrub_sector_get(sector);
1326 sblock->sectors[sector_index] = sector;
1327 sector->sblock = sblock;
1328 sector->flags = flags;
1329 sector->generation = generation;
1330 sector->logical = logical;
1331 sector->have_csum = have_csum;
1333 memcpy(sector->csum,
1334 original_sblock->sectors[0]->csum,
1335 sctx->fs_info->csum_size);
1337 scrub_stripe_index_and_offset(logical,
1346 sector->physical = bioc->stripes[stripe_index].physical +
1348 sector->dev = bioc->stripes[stripe_index].dev;
1350 BUG_ON(sector_index >= original_sblock->sector_count);
1351 sector->physical_for_dev_replace =
1352 original_sblock->sectors[sector_index]->
1353 physical_for_dev_replace;
1354 /* For missing devices, dev->bdev is NULL */
1355 sector->mirror_num = mirror_index + 1;
1356 sblock->sector_count++;
1357 sector->page = alloc_page(GFP_NOFS);
1361 scrub_get_recover(recover);
1362 sector->recover = recover;
1364 scrub_put_recover(fs_info, recover);
1373 static void scrub_bio_wait_endio(struct bio *bio)
1375 complete(bio->bi_private);
1378 static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1380 struct scrub_sector *sector)
1382 DECLARE_COMPLETION_ONSTACK(done);
1386 bio->bi_iter.bi_sector = sector->logical >> 9;
1387 bio->bi_private = &done;
1388 bio->bi_end_io = scrub_bio_wait_endio;
1390 mirror_num = sector->sblock->sectors[0]->mirror_num;
1391 ret = raid56_parity_recover(bio, sector->recover->bioc,
1392 sector->recover->map_length,
1397 wait_for_completion_io(&done);
1398 return blk_status_to_errno(bio->bi_status);
1401 static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
1402 struct scrub_block *sblock)
1404 struct scrub_sector *first_sector = sblock->sectors[0];
1408 /* All sectors in sblock belong to the same stripe on the same device. */
1409 ASSERT(first_sector->dev);
1410 if (!first_sector->dev->bdev)
1413 bio = bio_alloc(first_sector->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
1415 for (i = 0; i < sblock->sector_count; i++) {
1416 struct scrub_sector *sector = sblock->sectors[i];
1418 WARN_ON(!sector->page);
1419 bio_add_page(bio, sector->page, PAGE_SIZE, 0);
1422 if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) {
1429 scrub_recheck_block_checksum(sblock);
1433 for (i = 0; i < sblock->sector_count; i++)
1434 sblock->sectors[i]->io_error = 1;
1436 sblock->no_io_error_seen = 0;
1440 * This function will check the on disk data for checksum errors, header errors
1441 * and read I/O errors. If any I/O errors happen, the exact sectors which are
1442 * errored are marked as being bad. The goal is to enable scrub to take those
1443 * sectors that are not errored from all the mirrors so that the sectors that
1444 * are errored in the just handled mirror can be repaired.
1446 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1447 struct scrub_block *sblock,
1448 int retry_failed_mirror)
1452 sblock->no_io_error_seen = 1;
1454 /* short cut for raid56 */
1455 if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->sectors[0]))
1456 return scrub_recheck_block_on_raid56(fs_info, sblock);
1458 for (i = 0; i < sblock->sector_count; i++) {
1459 struct scrub_sector *sector = sblock->sectors[i];
1461 struct bio_vec bvec;
1463 if (sector->dev->bdev == NULL) {
1464 sector->io_error = 1;
1465 sblock->no_io_error_seen = 0;
1469 WARN_ON(!sector->page);
1470 bio_init(&bio, sector->dev->bdev, &bvec, 1, REQ_OP_READ);
1471 bio_add_page(&bio, sector->page, fs_info->sectorsize, 0);
1472 bio.bi_iter.bi_sector = sector->physical >> 9;
1474 btrfsic_check_bio(&bio);
1475 if (submit_bio_wait(&bio)) {
1476 sector->io_error = 1;
1477 sblock->no_io_error_seen = 0;
1483 if (sblock->no_io_error_seen)
1484 scrub_recheck_block_checksum(sblock);
1487 static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector)
1489 struct btrfs_fs_devices *fs_devices = sector->dev->fs_devices;
1492 ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
1496 static void scrub_recheck_block_checksum(struct scrub_block *sblock)
1498 sblock->header_error = 0;
1499 sblock->checksum_error = 0;
1500 sblock->generation_error = 0;
1502 if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA)
1503 scrub_checksum_data(sblock);
1505 scrub_checksum_tree_block(sblock);
1508 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1509 struct scrub_block *sblock_good)
1514 for (i = 0; i < sblock_bad->sector_count; i++) {
1517 ret_sub = scrub_repair_sector_from_good_copy(sblock_bad,
1526 static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
1527 struct scrub_block *sblock_good,
1528 int sector_num, int force_write)
1530 struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
1531 struct scrub_sector *sector_good = sblock_good->sectors[sector_num];
1532 struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
1533 const u32 sectorsize = fs_info->sectorsize;
1535 BUG_ON(sector_bad->page == NULL);
1536 BUG_ON(sector_good->page == NULL);
1537 if (force_write || sblock_bad->header_error ||
1538 sblock_bad->checksum_error || sector_bad->io_error) {
1540 struct bio_vec bvec;
1543 if (!sector_bad->dev->bdev) {
1544 btrfs_warn_rl(fs_info,
1545 "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
1549 bio_init(&bio, sector_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE);
1550 bio.bi_iter.bi_sector = sector_bad->physical >> 9;
1551 __bio_add_page(&bio, sector_good->page, sectorsize, 0);
1553 btrfsic_check_bio(&bio);
1554 ret = submit_bio_wait(&bio);
1558 btrfs_dev_stat_inc_and_print(sector_bad->dev,
1559 BTRFS_DEV_STAT_WRITE_ERRS);
1560 atomic64_inc(&fs_info->dev_replace.num_write_errors);
1568 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1570 struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
1574 * This block is used for the check of the parity on the source device,
1575 * so the data needn't be written into the destination device.
1577 if (sblock->sparity)
1580 for (i = 0; i < sblock->sector_count; i++) {
1583 ret = scrub_write_sector_to_dev_replace(sblock, i);
1585 atomic64_inc(&fs_info->dev_replace.num_write_errors);
1589 static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num)
1591 struct scrub_sector *sector = sblock->sectors[sector_num];
1593 BUG_ON(sector->page == NULL);
1594 if (sector->io_error)
1595 clear_page(page_address(sector->page));
1597 return scrub_add_sector_to_wr_bio(sblock->sctx, sector);
1600 static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
1605 if (!btrfs_is_zoned(sctx->fs_info))
1608 if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
1611 if (sctx->write_pointer < physical) {
1612 length = physical - sctx->write_pointer;
1614 ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev,
1615 sctx->write_pointer, length);
1617 sctx->write_pointer = physical;
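/*
 * Standalone userspace sketch (compile separately, not part of this file) of
 * what fill_writer_pointer_gap() above does for zoned target devices: writes
 * into a sequential zone must happen exactly at the write pointer, so if the
 * next scrub write lands beyond it, the gap is zero-filled first and the
 * cached write pointer is advanced.  The addresses below are made-up
 * examples.
 */
#include <stdio.h>
#include <inttypes.h>

int main(void)
{
	uint64_t write_pointer = 8ULL * 1024 * 1024;	/* zone written up to 8MiB */
	uint64_t physical = 8ULL * 1024 * 1024 + 256 * 1024; /* next write at +256KiB */

	if (write_pointer < physical) {
		uint64_t gap = physical - write_pointer;

		/* kernel side: btrfs_zoned_issue_zeroout(tgtdev, write_pointer, gap) */
		printf("zero-fill %" PRIu64 " bytes at %" PRIu64 "\n",
		       gap, write_pointer);
		write_pointer = physical;
	}
	printf("write pointer now %" PRIu64 "\n", write_pointer);
	return 0;
}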
1622 static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
1623 struct scrub_sector *sector)
1625 struct scrub_bio *sbio;
1627 const u32 sectorsize = sctx->fs_info->sectorsize;
1629 mutex_lock(&sctx->wr_lock);
1631 if (!sctx->wr_curr_bio) {
1632 sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
1634 if (!sctx->wr_curr_bio) {
1635 mutex_unlock(&sctx->wr_lock);
1638 sctx->wr_curr_bio->sctx = sctx;
1639 sctx->wr_curr_bio->sector_count = 0;
1641 sbio = sctx->wr_curr_bio;
1642 if (sbio->sector_count == 0) {
1643 ret = fill_writer_pointer_gap(sctx, sector->physical_for_dev_replace);
1645 mutex_unlock(&sctx->wr_lock);
1649 sbio->physical = sector->physical_for_dev_replace;
1650 sbio->logical = sector->logical;
1651 sbio->dev = sctx->wr_tgtdev;
1653 sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
1654 REQ_OP_WRITE, GFP_NOFS);
1656 sbio->bio->bi_private = sbio;
1657 sbio->bio->bi_end_io = scrub_wr_bio_end_io;
1658 sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
1660 } else if (sbio->physical + sbio->sector_count * sectorsize !=
1661 sector->physical_for_dev_replace ||
1662 sbio->logical + sbio->sector_count * sectorsize !=
1664 scrub_wr_submit(sctx);
1668 ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0);
1669 if (ret != sectorsize) {
1670 if (sbio->sector_count < 1) {
1673 mutex_unlock(&sctx->wr_lock);
1676 scrub_wr_submit(sctx);
1680 sbio->sectors[sbio->sector_count] = sector;
1681 scrub_sector_get(sector);
1682 sbio->sector_count++;
1683 if (sbio->sector_count == sctx->sectors_per_bio)
1684 scrub_wr_submit(sctx);
1685 mutex_unlock(&sctx->wr_lock);
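/*
 * Standalone userspace model (compile separately, not part of this file) of
 * the batching rule used by scrub_add_sector_to_wr_bio() above (and by the
 * read-side variant further below): sectors are appended to the current bio
 * only while they stay physically and logically contiguous and the bio is
 * not full; otherwise the bio is submitted and a new one is started.  The
 * sector list below is made-up example data.
 */
#include <stdio.h>
#include <inttypes.h>

#define SECTORSIZE		4096u
#define SECTORS_PER_BIO		32u

struct sector { uint64_t physical, logical; };

int main(void)
{
	struct sector sectors[] = {
		{ 0 * SECTORSIZE, 100 * SECTORSIZE },
		{ 1 * SECTORSIZE, 101 * SECTORSIZE },
		{ 2 * SECTORSIZE, 102 * SECTORSIZE },
		{ 9 * SECTORSIZE, 109 * SECTORSIZE },	/* gap: forces a flush */
		{ 10 * SECTORSIZE, 110 * SECTORSIZE },
	};
	uint64_t bio_physical = 0, bio_logical = 0;
	unsigned int count = 0;
	int bios = 0;

	for (unsigned int i = 0; i < sizeof(sectors) / sizeof(sectors[0]); i++) {
		if (count &&
		    (bio_physical + count * SECTORSIZE != sectors[i].physical ||
		     bio_logical + count * SECTORSIZE != sectors[i].logical)) {
			printf("submit bio %d with %u sectors\n", ++bios, count);
			count = 0;
		}
		if (count == 0) {
			bio_physical = sectors[i].physical;
			bio_logical = sectors[i].logical;
		}
		count++;
		if (count == SECTORS_PER_BIO) {
			printf("submit bio %d with %u sectors\n", ++bios, count);
			count = 0;
		}
	}
	if (count)
		printf("submit bio %d with %u sectors\n", ++bios, count);
	return 0;
}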
1690 static void scrub_wr_submit(struct scrub_ctx *sctx)
1692 struct scrub_bio *sbio;
1694 if (!sctx->wr_curr_bio)
1697 sbio = sctx->wr_curr_bio;
1698 sctx->wr_curr_bio = NULL;
1699 scrub_pending_bio_inc(sctx);
1700 /* process all writes in a single worker thread. Then the block layer
1701 * orders the requests before sending them to the driver which
1702 * doubled the write performance on spinning disks when measured
1704 btrfsic_check_bio(sbio->bio);
1705 submit_bio(sbio->bio);
1707 if (btrfs_is_zoned(sctx->fs_info))
1708 sctx->write_pointer = sbio->physical + sbio->sector_count *
1709 sctx->fs_info->sectorsize;
1712 static void scrub_wr_bio_end_io(struct bio *bio)
1714 struct scrub_bio *sbio = bio->bi_private;
1715 struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
1717 sbio->status = bio->bi_status;
1720 INIT_WORK(&sbio->work, scrub_wr_bio_end_io_worker);
1721 queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
1724 static void scrub_wr_bio_end_io_worker(struct work_struct *work)
1726 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1727 struct scrub_ctx *sctx = sbio->sctx;
1730 ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
1732 struct btrfs_dev_replace *dev_replace =
1733 &sbio->sctx->fs_info->dev_replace;
1735 for (i = 0; i < sbio->sector_count; i++) {
1736 struct scrub_sector *sector = sbio->sectors[i];
1738 sector->io_error = 1;
1739 atomic64_inc(&dev_replace->num_write_errors);
1743 for (i = 0; i < sbio->sector_count; i++)
1744 scrub_sector_put(sbio->sectors[i]);
1748 scrub_pending_bio_dec(sctx);
1751 static int scrub_checksum(struct scrub_block *sblock)
1757 * No need to initialize these stats currently,
1758 * because this function only uses the return value
1759 * instead of these stat values.
1764 sblock->header_error = 0;
1765 sblock->generation_error = 0;
1766 sblock->checksum_error = 0;
1768 WARN_ON(sblock->sector_count < 1);
1769 flags = sblock->sectors[0]->flags;
1771 if (flags & BTRFS_EXTENT_FLAG_DATA)
1772 ret = scrub_checksum_data(sblock);
1773 else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1774 ret = scrub_checksum_tree_block(sblock);
1775 else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1776 (void)scrub_checksum_super(sblock);
1780 scrub_handle_errored_block(sblock);
1785 static int scrub_checksum_data(struct scrub_block *sblock)
1787 struct scrub_ctx *sctx = sblock->sctx;
1788 struct btrfs_fs_info *fs_info = sctx->fs_info;
1789 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1790 u8 csum[BTRFS_CSUM_SIZE];
1791 struct scrub_sector *sector;
1794 BUG_ON(sblock->sector_count < 1);
1795 sector = sblock->sectors[0];
1796 if (!sector->have_csum)
1799 kaddr = page_address(sector->page);
1801 shash->tfm = fs_info->csum_shash;
1802 crypto_shash_init(shash);
1805 * In scrub_sectors() and scrub_sectors_for_parity() we ensure each
1806 * scrub_sector only contains one filesystem sector of data.
1808 crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
1810 if (memcmp(csum, sector->csum, fs_info->csum_size))
1811 sblock->checksum_error = 1;
1812 return sblock->checksum_error;
1815 static int scrub_checksum_tree_block(struct scrub_block *sblock)
1817 struct scrub_ctx *sctx = sblock->sctx;
1818 struct btrfs_header *h;
1819 struct btrfs_fs_info *fs_info = sctx->fs_info;
1820 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1821 u8 calculated_csum[BTRFS_CSUM_SIZE];
1822 u8 on_disk_csum[BTRFS_CSUM_SIZE];
1824 * This is done in sectorsize steps even for metadata as there's a
1825 * constraint for nodesize to be aligned to sectorsize. This will need
1826 * to change so we don't misuse data and metadata units like that.
1828 const u32 sectorsize = sctx->fs_info->sectorsize;
1829 const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits;
1831 struct scrub_sector *sector;
1834 BUG_ON(sblock->sector_count < 1);
1836 /* Each member in sectors is just one sector */
1837 ASSERT(sblock->sector_count == num_sectors);
1839 sector = sblock->sectors[0];
1840 kaddr = page_address(sector->page);
1841 h = (struct btrfs_header *)kaddr;
1842 memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size);
1845 * we don't use the getter functions here, as we
1846 * a) don't have an extent buffer and
1847 * b) the page is already kmapped
1849 if (sector->logical != btrfs_stack_header_bytenr(h))
1850 sblock->header_error = 1;
1852 if (sector->generation != btrfs_stack_header_generation(h)) {
1853 sblock->header_error = 1;
1854 sblock->generation_error = 1;
1857 if (!scrub_check_fsid(h->fsid, sector))
1858 sblock->header_error = 1;
1860 if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1862 sblock->header_error = 1;
1864 shash->tfm = fs_info->csum_shash;
1865 crypto_shash_init(shash);
1866 crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
1867 sectorsize - BTRFS_CSUM_SIZE);
1869 for (i = 1; i < num_sectors; i++) {
1870 kaddr = page_address(sblock->sectors[i]->page);
1871 crypto_shash_update(shash, kaddr, sectorsize);
1874 crypto_shash_final(shash, calculated_csum);
1875 if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size))
1876 sblock->checksum_error = 1;
1878 return sblock->header_error || sblock->checksum_error;
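/*
 * Standalone userspace sketch (compile separately, not part of this file) of
 * the metadata checksum layout handled by scrub_checksum_tree_block() above:
 * the stored checksum occupies the first BTRFS_CSUM_SIZE bytes of the
 * header, and the checksum is computed over the rest of the first sector
 * plus all remaining sectors of the node.  The FNV-1a hash below is only a
 * stand-in for the real checksum algorithms (crc32c, xxhash, sha256,
 * blake2b); the sizes are the common 4KiB sectorsize / 16KiB nodesize case.
 */
#include <stdio.h>
#include <stdint.h>
#include <string.h>

#define CSUM_SIZE	32
#define SECTORSIZE	4096
#define NODESIZE	16384
#define NUM_SECTORS	(NODESIZE / SECTORSIZE)

static uint64_t fnv1a(uint64_t h, const uint8_t *buf, size_t len)
{
	for (size_t i = 0; i < len; i++)
		h = (h ^ buf[i]) * 0x100000001b3ULL;
	return h;
}

int main(void)
{
	static uint8_t node[NODESIZE];	/* stands in for the node's sectors */
	uint64_t calculated, on_disk;

	memset(node + CSUM_SIZE, 0xab, NODESIZE - CSUM_SIZE);

	/* Hash sector 0 after the csum area, then the remaining sectors. */
	calculated = fnv1a(0xcbf29ce484222325ULL,
			   node + CSUM_SIZE, SECTORSIZE - CSUM_SIZE);
	for (int i = 1; i < NUM_SECTORS; i++)
		calculated = fnv1a(calculated, node + i * SECTORSIZE, SECTORSIZE);

	/* Pretend the first 8 bytes of the csum area hold the stored value. */
	memcpy(node, &calculated, sizeof(calculated));
	memcpy(&on_disk, node, sizeof(on_disk));

	printf("checksum_error = %d\n", calculated != on_disk);
	return 0;
}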
1881 static int scrub_checksum_super(struct scrub_block *sblock)
1883 struct btrfs_super_block *s;
1884 struct scrub_ctx *sctx = sblock->sctx;
1885 struct btrfs_fs_info *fs_info = sctx->fs_info;
1886 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1887 u8 calculated_csum[BTRFS_CSUM_SIZE];
1888 struct scrub_sector *sector;
1893 BUG_ON(sblock->sector_count < 1);
1894 sector = sblock->sectors[0];
1895 kaddr = page_address(sector->page);
1896 s = (struct btrfs_super_block *)kaddr;
1898 if (sector->logical != btrfs_super_bytenr(s))
1901 if (sector->generation != btrfs_super_generation(s))
1904 if (!scrub_check_fsid(s->fsid, sector))
1907 shash->tfm = fs_info->csum_shash;
1908 crypto_shash_init(shash);
1909 crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
1910 BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
1912 if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size))
1915 if (fail_cor + fail_gen) {
1917 * If we find an error in a super block, we just report it; super blocks
1918 * will get rewritten with the next transaction commit
1921 spin_lock(&sctx->stat_lock);
1922 ++sctx->stat.super_errors;
1923 spin_unlock(&sctx->stat_lock);
1925 btrfs_dev_stat_inc_and_print(sector->dev,
1926 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1928 btrfs_dev_stat_inc_and_print(sector->dev,
1929 BTRFS_DEV_STAT_GENERATION_ERRS);
1932 return fail_cor + fail_gen;
1935 static void scrub_block_get(struct scrub_block *sblock)
1937 refcount_inc(&sblock->refs);
1940 static void scrub_block_put(struct scrub_block *sblock)
1942 if (refcount_dec_and_test(&sblock->refs)) {
1945 if (sblock->sparity)
1946 scrub_parity_put(sblock->sparity);
1948 for (i = 0; i < sblock->sector_count; i++)
1949 scrub_sector_put(sblock->sectors[i]);
1954 static void scrub_sector_get(struct scrub_sector *sector)
1956 atomic_inc(&sector->refs);
1959 static void scrub_sector_put(struct scrub_sector *sector)
1961 if (atomic_dec_and_test(&sector->refs)) {
1963 __free_page(sector->page);
1969 * Throttling of IO submission, bandwidth-limit based; the timeslice is 1 second.
1970 * The limit can be set via /sys/fs/btrfs/UUID/devinfo/DEVID/scrub_speed_max.
1972 static void scrub_throttle(struct scrub_ctx *sctx)
1974 const int time_slice = 1000;
1975 struct scrub_bio *sbio;
1976 struct btrfs_device *device;
1982 sbio = sctx->bios[sctx->curr];
1984 bwlimit = READ_ONCE(device->scrub_speed_max);
1989 * The slice is divided into intervals as the IO is submitted; the interval
1990 * count is derived from bwlimit, with a maximum of 64 intervals.
1992 div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
1993 div = min_t(u32, 64, div);
1995 /* Start new epoch, set deadline */
1997 if (sctx->throttle_deadline == 0) {
1998 sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
1999 sctx->throttle_sent = 0;
2002 /* Still in the time to send? */
2003 if (ktime_before(now, sctx->throttle_deadline)) {
2004 /* If current bio is within the limit, send it */
2005 sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
2006 if (sctx->throttle_sent <= div_u64(bwlimit, div))
2009 /* We're over the limit, sleep until the rest of the slice */
2010 delta = ktime_ms_delta(sctx->throttle_deadline, now);
2012 /* New request after deadline, start new epoch */
2019 timeout = div_u64(delta * HZ, 1000);
2020 schedule_timeout_interruptible(timeout);
2023 /* Next call will start the deadline period */
2024 sctx->throttle_deadline = 0;
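/*
 * Standalone userspace sketch (compile separately, not part of this file) of
 * the throttling math in scrub_throttle() above: the one-second timeslice is
 * split into at most 64 intervals sized from the bandwidth limit, and each
 * interval gets an equal share of the byte budget.  The 100MiB/s limit below
 * is just an example value for scrub_speed_max.
 */
#include <stdio.h>
#include <inttypes.h>

int main(void)
{
	const unsigned int time_slice_ms = 1000;
	uint64_t bwlimit = 100ULL * 1024 * 1024;	/* 100MiB/s */
	uint32_t div;

	/* One interval per 16MiB/s of allowed bandwidth, clamped to [1, 64]. */
	div = (uint32_t)(bwlimit / (16 * 1024 * 1024));
	if (div < 1)
		div = 1;
	if (div > 64)
		div = 64;

	printf("interval length: %u ms\n", time_slice_ms / div);
	printf("bytes allowed per interval: %" PRIu64 "\n", bwlimit / div);
	return 0;
}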
2027 static void scrub_submit(struct scrub_ctx *sctx)
2029 struct scrub_bio *sbio;
2031 if (sctx->curr == -1)
2034 scrub_throttle(sctx);
2036 sbio = sctx->bios[sctx->curr];
2038 scrub_pending_bio_inc(sctx);
2039 btrfsic_check_bio(sbio->bio);
2040 submit_bio(sbio->bio);
2043 static int scrub_add_sector_to_rd_bio(struct scrub_ctx *sctx,
2044 struct scrub_sector *sector)
2046 struct scrub_block *sblock = sector->sblock;
2047 struct scrub_bio *sbio;
2048 const u32 sectorsize = sctx->fs_info->sectorsize;
2053 * grab a fresh bio or wait for one to become available
2055 while (sctx->curr == -1) {
2056 spin_lock(&sctx->list_lock);
2057 sctx->curr = sctx->first_free;
2058 if (sctx->curr != -1) {
2059 sctx->first_free = sctx->bios[sctx->curr]->next_free;
2060 sctx->bios[sctx->curr]->next_free = -1;
2061 sctx->bios[sctx->curr]->sector_count = 0;
2062 spin_unlock(&sctx->list_lock);
2064 spin_unlock(&sctx->list_lock);
2065 wait_event(sctx->list_wait, sctx->first_free != -1);
2068 sbio = sctx->bios[sctx->curr];
2069 if (sbio->sector_count == 0) {
2070 sbio->physical = sector->physical;
2071 sbio->logical = sector->logical;
2072 sbio->dev = sector->dev;
2074 sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
2075 REQ_OP_READ, GFP_NOFS);
2077 sbio->bio->bi_private = sbio;
2078 sbio->bio->bi_end_io = scrub_bio_end_io;
2079 sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
2081 } else if (sbio->physical + sbio->sector_count * sectorsize !=
2083 sbio->logical + sbio->sector_count * sectorsize !=
2085 sbio->dev != sector->dev) {
2090 sbio->sectors[sbio->sector_count] = sector;
2091 ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0);
2092 if (ret != sectorsize) {
2093 if (sbio->sector_count < 1) {
2102 scrub_block_get(sblock); /* one for the page added to the bio */
2103 atomic_inc(&sblock->outstanding_sectors);
2104 sbio->sector_count++;
2105 if (sbio->sector_count == sctx->sectors_per_bio)
2111 static void scrub_missing_raid56_end_io(struct bio *bio)
2113 struct scrub_block *sblock = bio->bi_private;
2114 struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
2117 sblock->no_io_error_seen = 0;
2121 queue_work(fs_info->scrub_workers, &sblock->work);
2124 static void scrub_missing_raid56_worker(struct work_struct *work)
2126 struct scrub_block *sblock = container_of(work, struct scrub_block, work);
2127 struct scrub_ctx *sctx = sblock->sctx;
2128 struct btrfs_fs_info *fs_info = sctx->fs_info;
2130 struct btrfs_device *dev;
2132 logical = sblock->sectors[0]->logical;
2133 dev = sblock->sectors[0]->dev;
2135 if (sblock->no_io_error_seen)
2136 scrub_recheck_block_checksum(sblock);
2138 if (!sblock->no_io_error_seen) {
2139 spin_lock(&sctx->stat_lock);
2140 sctx->stat.read_errors++;
2141 spin_unlock(&sctx->stat_lock);
2142 btrfs_err_rl_in_rcu(fs_info,
2143 "IO error rebuilding logical %llu for dev %s",
2144 logical, rcu_str_deref(dev->name));
2145 } else if (sblock->header_error || sblock->checksum_error) {
2146 spin_lock(&sctx->stat_lock);
2147 sctx->stat.uncorrectable_errors++;
2148 spin_unlock(&sctx->stat_lock);
2149 btrfs_err_rl_in_rcu(fs_info,
2150 "failed to rebuild valid logical %llu for dev %s",
2151 logical, rcu_str_deref(dev->name));
2153 scrub_write_block_to_dev_replace(sblock);
2156 if (sctx->is_dev_replace && sctx->flush_all_writes) {
2157 mutex_lock(&sctx->wr_lock);
2158 scrub_wr_submit(sctx);
2159 mutex_unlock(&sctx->wr_lock);
2162 scrub_block_put(sblock);
2163 scrub_pending_bio_dec(sctx);
2166 static void scrub_missing_raid56_pages(struct scrub_block *sblock)
2168 struct scrub_ctx *sctx = sblock->sctx;
2169 struct btrfs_fs_info *fs_info = sctx->fs_info;
2170 u64 length = sblock->sector_count << fs_info->sectorsize_bits;
2171 u64 logical = sblock->sectors[0]->logical;
2172 struct btrfs_io_context *bioc = NULL;
2174 struct btrfs_raid_bio *rbio;
2178 btrfs_bio_counter_inc_blocked(fs_info);
2179 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
2181 if (ret || !bioc || !bioc->raid_map)
2184 if (WARN_ON(!sctx->is_dev_replace ||
2185 !(bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
2187 * We shouldn't be scrubbing a missing device. Even for dev
2188 * replace, we should only get here for RAID 5/6. We either
2189 * managed to mount something with no mirrors remaining or
2190 * there's a bug in scrub_find_good_copy()/btrfs_map_block().
2195 bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
2196 bio->bi_iter.bi_sector = logical >> 9;
2197 bio->bi_private = sblock;
2198 bio->bi_end_io = scrub_missing_raid56_end_io;
2200 rbio = raid56_alloc_missing_rbio(bio, bioc, length);
2204 for (i = 0; i < sblock->sector_count; i++) {
2205 struct scrub_sector *sector = sblock->sectors[i];
2208 * For now, our scrub is still one page per sector, so pgoff
2211 raid56_add_scrub_pages(rbio, sector->page, 0, sector->logical);
2214 INIT_WORK(&sblock->work, scrub_missing_raid56_worker);
2215 scrub_block_get(sblock);
2216 scrub_pending_bio_inc(sctx);
2217 raid56_submit_missing_rbio(rbio);
2223 btrfs_bio_counter_dec(fs_info);
2224 btrfs_put_bioc(bioc);
2225 spin_lock(&sctx->stat_lock);
2226 sctx->stat.malloc_errors++;
2227 spin_unlock(&sctx->stat_lock);
2230 static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
2231 u64 physical, struct btrfs_device *dev, u64 flags,
2232 u64 gen, int mirror_num, u8 *csum,
2233 u64 physical_for_dev_replace)
2235 struct scrub_block *sblock;
2236 const u32 sectorsize = sctx->fs_info->sectorsize;
2239 sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2241 spin_lock(&sctx->stat_lock);
2242 sctx->stat.malloc_errors++;
2243 spin_unlock(&sctx->stat_lock);
2247 /* one ref inside this function, plus one for each page added to a bio later on */
2249 refcount_set(&sblock->refs, 1);
2250 sblock->sctx = sctx;
2251 sblock->no_io_error_seen = 1;
2253 for (index = 0; len > 0; index++) {
2254 struct scrub_sector *sector;
2256 * Here we will allocate one page for one sector to scrub.
2257 * This is fine if PAGE_SIZE == sectorsize, but will cost
2258 * more memory in the PAGE_SIZE > sectorsize case.
2260 u32 l = min(sectorsize, len);
2262 sector = kzalloc(sizeof(*sector), GFP_KERNEL);
2265 spin_lock(&sctx->stat_lock);
2266 sctx->stat.malloc_errors++;
2267 spin_unlock(&sctx->stat_lock);
2268 scrub_block_put(sblock);
2271 ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK);
2272 scrub_sector_get(sector);
2273 sblock->sectors[index] = sector;
2274 sector->sblock = sblock;
2276 sector->flags = flags;
2277 sector->generation = gen;
2278 sector->logical = logical;
2279 sector->physical = physical;
2280 sector->physical_for_dev_replace = physical_for_dev_replace;
2281 sector->mirror_num = mirror_num;
2283 sector->have_csum = 1;
2284 memcpy(sector->csum, csum, sctx->fs_info->csum_size);
2286 sector->have_csum = 0;
2288 sblock->sector_count++;
2289 sector->page = alloc_page(GFP_KERNEL);
2295 physical_for_dev_replace += l;
2298 WARN_ON(sblock->sector_count == 0);
2299 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2301 * This case should only be hit for RAID 5/6 device replace. See
2302 * the comment in scrub_missing_raid56_pages() for details.
2304 scrub_missing_raid56_pages(sblock);
2306 for (index = 0; index < sblock->sector_count; index++) {
2307 struct scrub_sector *sector = sblock->sectors[index];
2310 ret = scrub_add_sector_to_rd_bio(sctx, sector);
2312 scrub_block_put(sblock);
2317 if (flags & BTRFS_EXTENT_FLAG_SUPER)
2321 /* last one frees, either here or in bio completion for last page */
2322 scrub_block_put(sblock);
2326 static void scrub_bio_end_io(struct bio *bio)
2328 struct scrub_bio *sbio = bio->bi_private;
2329 struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
2331 sbio->status = bio->bi_status;
2334 queue_work(fs_info->scrub_workers, &sbio->work);
2337 static void scrub_bio_end_io_worker(struct work_struct *work)
2339 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2340 struct scrub_ctx *sctx = sbio->sctx;
2343 ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
2345 for (i = 0; i < sbio->sector_count; i++) {
2346 struct scrub_sector *sector = sbio->sectors[i];
2348 sector->io_error = 1;
2349 sector->sblock->no_io_error_seen = 0;
2353 /* Now complete the scrub_block items that have all sectors completed */
2354 for (i = 0; i < sbio->sector_count; i++) {
2355 struct scrub_sector *sector = sbio->sectors[i];
2356 struct scrub_block *sblock = sector->sblock;
2358 if (atomic_dec_and_test(&sblock->outstanding_sectors))
2359 scrub_block_complete(sblock);
2360 scrub_block_put(sblock);
2365 spin_lock(&sctx->list_lock);
2366 sbio->next_free = sctx->first_free;
2367 sctx->first_free = sbio->index;
2368 spin_unlock(&sctx->list_lock);
2370 if (sctx->is_dev_replace && sctx->flush_all_writes) {
2371 mutex_lock(&sctx->wr_lock);
2372 scrub_wr_submit(sctx);
2373 mutex_unlock(&sctx->wr_lock);
2376 scrub_pending_bio_dec(sctx);
2379 static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2380 unsigned long *bitmap,
2385 u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits;
2387 if (len >= sparity->stripe_len) {
2388 bitmap_set(bitmap, 0, sparity->nsectors);
2392 start -= sparity->logic_start;
2393 start = div64_u64_rem(start, sparity->stripe_len, &offset);
2394 offset = offset >> sectorsize_bits;
2395 nsectors = len >> sectorsize_bits;
2397 if (offset + nsectors <= sparity->nsectors) {
2398 bitmap_set(bitmap, offset, nsectors);
2402 bitmap_set(bitmap, offset, sparity->nsectors - offset);
2403 bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
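/*
 * A hypothetical worked example of the wrap-around handling above,
 * assuming a 4KiB sectorsize (sectorsize_bits == 12), a 64KiB
 * stripe_len (so sparity->nsectors == 16) and logic_start == 0:
 *
 *   start = 56KiB, len = 16KiB
 *   offset   = (56KiB % 64KiB) >> 12 = 14
 *   nsectors = 16KiB >> 12          = 4
 *
 * offset + nsectors (18) exceeds sparity->nsectors (16), so the range
 * wraps: bits 14..15 are set first, then bits 0..1 for the remaining
 * two sectors.
 */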
2406 static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2409 __scrub_mark_bitmap(sparity, sparity->ebitmap, start, len);
2412 static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2415 __scrub_mark_bitmap(sparity, sparity->dbitmap, start, len);
2418 static void scrub_block_complete(struct scrub_block *sblock)
2422 if (!sblock->no_io_error_seen) {
2424 scrub_handle_errored_block(sblock);
2427 * If the block has a checksum error, write it via the repair mechanism
2428 * in the dev-replace case; otherwise write it here in the dev-replace case.
2431 corrupted = scrub_checksum(sblock);
2432 if (!corrupted && sblock->sctx->is_dev_replace)
2433 scrub_write_block_to_dev_replace(sblock);
2436 if (sblock->sparity && corrupted && !sblock->data_corrected) {
2437 u64 start = sblock->sectors[0]->logical;
2438 u64 end = sblock->sectors[sblock->sector_count - 1]->logical +
2439 sblock->sctx->fs_info->sectorsize;
2441 ASSERT(end - start <= U32_MAX);
2442 scrub_parity_mark_sectors_error(sblock->sparity,
2443 start, end - start);
2447 static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum)
2449 sctx->stat.csum_discards += sum->len >> sctx->fs_info->sectorsize_bits;
2450 list_del(&sum->list);
2455 * Find the desired csum for range [logical, logical + sectorsize), and store
2456 * the csum into @csum.
2458 * The search source is sctx->csum_list, which is a pre-populated list
2459 * storing bytenr-ordered csum ranges. We're responsible for cleaning up any
2460 * range that is before @logical.
2462 * Return 0 if there is no csum for the range.
2463 * Return 1 if there is a csum for the range, and it has been copied to @csum.
2465 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
2469 while (!list_empty(&sctx->csum_list)) {
2470 struct btrfs_ordered_sum *sum = NULL;
2471 unsigned long index;
2472 unsigned long num_sectors;
2474 sum = list_first_entry(&sctx->csum_list,
2475 struct btrfs_ordered_sum, list);
2476 /* The current csum range is beyond our range, no csum found */
2477 if (sum->bytenr > logical)
2481 * The current sum is before our bytenr, since scrub is always
2482 * done in bytenr order, the csum will never be used anymore,
2483 * clean it up so that later calls won't bother with the range,
2484 * and continue searching the next range.
2486 if (sum->bytenr + sum->len <= logical) {
2487 drop_csum_range(sctx, sum);
2491 /* Now the csum range covers our bytenr, copy the csum */
2493 index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits;
2494 num_sectors = sum->len >> sctx->fs_info->sectorsize_bits;
2496 memcpy(csum, sum->sums + index * sctx->fs_info->csum_size,
2497 sctx->fs_info->csum_size);
2499 /* Cleanup the range if we're at the end of the csum range */
2500 if (index == num_sectors - 1)
2501 drop_csum_range(sctx, sum);
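/*
 * A hypothetical worked example of the lookup above, assuming a 4KiB
 * sectorsize and the 4-byte crc32c csum_size:
 *
 *   sum->bytenr = 1MiB, sum->len = 64KiB  => num_sectors = 16
 *   logical     = 1MiB + 20KiB            => index = 5
 *
 * The copied csum is the 4 bytes at sum->sums + 5 * 4.  Since index
 * (5) is not num_sectors - 1 (15), the ordered sum stays on
 * csum_list for the remaining sectors of the same range.
 */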
2509 /* scrub extent tries to collect up to 64 kB for each bio */
2510 static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
2511 u64 logical, u32 len,
2512 u64 physical, struct btrfs_device *dev, u64 flags,
2513 u64 gen, int mirror_num)
2515 struct btrfs_device *src_dev = dev;
2516 u64 src_physical = physical;
2517 int src_mirror = mirror_num;
2519 u8 csum[BTRFS_CSUM_SIZE];
2522 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2523 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2524 blocksize = map->stripe_len;
2526 blocksize = sctx->fs_info->sectorsize;
2527 spin_lock(&sctx->stat_lock);
2528 sctx->stat.data_extents_scrubbed++;
2529 sctx->stat.data_bytes_scrubbed += len;
2530 spin_unlock(&sctx->stat_lock);
2531 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2532 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2533 blocksize = map->stripe_len;
2535 blocksize = sctx->fs_info->nodesize;
2536 spin_lock(&sctx->stat_lock);
2537 sctx->stat.tree_extents_scrubbed++;
2538 sctx->stat.tree_bytes_scrubbed += len;
2539 spin_unlock(&sctx->stat_lock);
2541 blocksize = sctx->fs_info->sectorsize;
2546 * For dev-replace case, we can have @dev being a missing device.
2547 * Regular scrub avoids running on a missing device at all,
2548 * as that would trigger tons of read errors.
2550 * Reading from a missing device would only cause the read error counts to
2551 * increase unnecessarily.
2552 * So here we change the read source to a good mirror.
2554 if (sctx->is_dev_replace && !dev->bdev)
2555 scrub_find_good_copy(sctx->fs_info, logical, len, &src_physical,
2556 &src_dev, &src_mirror);
2558 u32 l = min(len, blocksize);
2561 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2562 /* push csums to sbio */
2563 have_csum = scrub_find_csum(sctx, logical, csum);
2565 ++sctx->stat.no_csum;
2567 ret = scrub_sectors(sctx, logical, l, src_physical, src_dev,
2568 flags, gen, src_mirror,
2569 have_csum ? csum : NULL, physical);
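/*
 * A worked example of the splitting above, assuming a 64KiB data
 * extent on a non-RAID56 profile with a 4KiB sectorsize: blocksize
 * equals sectorsize, so the loop issues sixteen scrub_sectors()
 * calls of one sector each, pushing each sector's csum via
 * scrub_find_csum() first.  The sectors are still batched into
 * shared read bios of up to SCRUB_SECTORS_PER_BIO sectors.  On
 * RAID56 the blocksize is map->stripe_len instead, so a full stripe
 * is handed over per call.
 */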
2580 static int scrub_sectors_for_parity(struct scrub_parity *sparity,
2581 u64 logical, u32 len,
2582 u64 physical, struct btrfs_device *dev,
2583 u64 flags, u64 gen, int mirror_num, u8 *csum)
2585 struct scrub_ctx *sctx = sparity->sctx;
2586 struct scrub_block *sblock;
2587 const u32 sectorsize = sctx->fs_info->sectorsize;
2590 ASSERT(IS_ALIGNED(len, sectorsize));
2592 sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2594 spin_lock(&sctx->stat_lock);
2595 sctx->stat.malloc_errors++;
2596 spin_unlock(&sctx->stat_lock);
2600 /* one ref inside this function, plus one for each page added to a bio later on */
2602 refcount_set(&sblock->refs, 1);
2603 sblock->sctx = sctx;
2604 sblock->no_io_error_seen = 1;
2605 sblock->sparity = sparity;
2606 scrub_parity_get(sparity);
2608 for (index = 0; len > 0; index++) {
2609 struct scrub_sector *sector;
2611 sector = kzalloc(sizeof(*sector), GFP_KERNEL);
2614 spin_lock(&sctx->stat_lock);
2615 sctx->stat.malloc_errors++;
2616 spin_unlock(&sctx->stat_lock);
2617 scrub_block_put(sblock);
2620 ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK);
2621 /* For scrub block */
2622 scrub_sector_get(sector);
2623 sblock->sectors[index] = sector;
2624 /* For scrub parity */
2625 scrub_sector_get(sector);
2626 list_add_tail(&sector->list, &sparity->sectors_list);
2627 sector->sblock = sblock;
2629 sector->flags = flags;
2630 sector->generation = gen;
2631 sector->logical = logical;
2632 sector->physical = physical;
2633 sector->mirror_num = mirror_num;
2635 sector->have_csum = 1;
2636 memcpy(sector->csum, csum, sctx->fs_info->csum_size);
2638 sector->have_csum = 0;
2640 sblock->sector_count++;
2641 sector->page = alloc_page(GFP_KERNEL);
2646 /* Iterate over the stripe range in sectorsize steps */
2648 logical += sectorsize;
2649 physical += sectorsize;
2652 WARN_ON(sblock->sector_count == 0);
2653 for (index = 0; index < sblock->sector_count; index++) {
2654 struct scrub_sector *sector = sblock->sectors[index];
2657 ret = scrub_add_sector_to_rd_bio(sctx, sector);
2659 scrub_block_put(sblock);
2664 /* Last one frees, either here or in bio completion for last sector */
2665 scrub_block_put(sblock);
2669 static int scrub_extent_for_parity(struct scrub_parity *sparity,
2670 u64 logical, u32 len,
2671 u64 physical, struct btrfs_device *dev,
2672 u64 flags, u64 gen, int mirror_num)
2674 struct scrub_ctx *sctx = sparity->sctx;
2676 u8 csum[BTRFS_CSUM_SIZE];
2679 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2680 scrub_parity_mark_sectors_error(sparity, logical, len);
2684 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2685 blocksize = sparity->stripe_len;
2686 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2687 blocksize = sparity->stripe_len;
2689 blocksize = sctx->fs_info->sectorsize;
2694 u32 l = min(len, blocksize);
2697 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2698 /* push csums to sbio */
2699 have_csum = scrub_find_csum(sctx, logical, csum);
2703 ret = scrub_sectors_for_parity(sparity, logical, l, physical, dev,
2704 flags, gen, mirror_num,
2705 have_csum ? csum : NULL);
2717 * Given a physical address, this will calculate its
2718 * logical offset. If this is a parity stripe, it will return
2719 * the left-most data stripe's logical offset.
2721 * Return 0 if it is a data stripe, 1 if it is a parity stripe.
2723 static int get_raid56_logic_offset(u64 physical, int num,
2724 struct map_lookup *map, u64 *offset,
2733 const int data_stripes = nr_data_stripes(map);
2735 last_offset = (physical - map->stripes[num].physical) * data_stripes;
2737 *stripe_start = last_offset;
2739 *offset = last_offset;
2740 for (i = 0; i < data_stripes; i++) {
2741 *offset = last_offset + i * map->stripe_len;
2743 stripe_nr = div64_u64(*offset, map->stripe_len);
2744 stripe_nr = div_u64(stripe_nr, data_stripes);
2746 /* Work out the disk rotation on this stripe-set */
2747 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
2748 /* Calculate which stripe this data is located on */
2750 stripe_index = rot % map->num_stripes;
2751 if (stripe_index == num)
2753 if (stripe_index < num)
2756 *offset = last_offset + j * map->stripe_len;
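/*
 * A rough illustration of the search above, assuming RAID5 over
 * three devices (data_stripes == 2) and a 64KiB stripe_len: a
 * physical offset of 128KiB into this device's extent gives
 * last_offset = 128KiB * 2 = 256KiB, so the two candidate logical
 * offsets 256KiB and 320KiB (both in the same full stripe) are
 * tested.  Whichever candidate's rotation maps back onto this device
 * slot is returned with 0; if neither does, the stripe at this
 * physical offset holds parity and 1 is returned.
 */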
2760 static void scrub_free_parity(struct scrub_parity *sparity)
2762 struct scrub_ctx *sctx = sparity->sctx;
2763 struct scrub_sector *curr, *next;
2766 nbits = bitmap_weight(sparity->ebitmap, sparity->nsectors);
2768 spin_lock(&sctx->stat_lock);
2769 sctx->stat.read_errors += nbits;
2770 sctx->stat.uncorrectable_errors += nbits;
2771 spin_unlock(&sctx->stat_lock);
2774 list_for_each_entry_safe(curr, next, &sparity->sectors_list, list) {
2775 list_del_init(&curr->list);
2776 scrub_sector_put(curr);
2782 static void scrub_parity_bio_endio_worker(struct work_struct *work)
2784 struct scrub_parity *sparity = container_of(work, struct scrub_parity,
2786 struct scrub_ctx *sctx = sparity->sctx;
2788 scrub_free_parity(sparity);
2789 scrub_pending_bio_dec(sctx);
2792 static void scrub_parity_bio_endio(struct bio *bio)
2794 struct scrub_parity *sparity = bio->bi_private;
2795 struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
2798 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2803 INIT_WORK(&sparity->work, scrub_parity_bio_endio_worker);
2804 queue_work(fs_info->scrub_parity_workers, &sparity->work);
2807 static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2809 struct scrub_ctx *sctx = sparity->sctx;
2810 struct btrfs_fs_info *fs_info = sctx->fs_info;
2812 struct btrfs_raid_bio *rbio;
2813 struct btrfs_io_context *bioc = NULL;
2817 if (!bitmap_andnot(sparity->dbitmap, sparity->dbitmap, sparity->ebitmap,
2821 length = sparity->logic_end - sparity->logic_start;
2823 btrfs_bio_counter_inc_blocked(fs_info);
2824 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
2826 if (ret || !bioc || !bioc->raid_map)
2829 bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
2830 bio->bi_iter.bi_sector = sparity->logic_start >> 9;
2831 bio->bi_private = sparity;
2832 bio->bi_end_io = scrub_parity_bio_endio;
2834 rbio = raid56_parity_alloc_scrub_rbio(bio, bioc, length,
2841 scrub_pending_bio_inc(sctx);
2842 raid56_parity_submit_scrub_rbio(rbio);
2848 btrfs_bio_counter_dec(fs_info);
2849 btrfs_put_bioc(bioc);
2850 bitmap_or(sparity->ebitmap, sparity->ebitmap, sparity->dbitmap,
2852 spin_lock(&sctx->stat_lock);
2853 sctx->stat.malloc_errors++;
2854 spin_unlock(&sctx->stat_lock);
2856 scrub_free_parity(sparity);
2859 static inline int scrub_calc_parity_bitmap_len(int nsectors)
2861 return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long);
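/*
 * For example, with a 64KiB stripe_len and 4KiB sectors, nsectors is
 * 16, so on a 64-bit machine this returns DIV_ROUND_UP(16, 64) *
 * sizeof(long) = 8 bytes.  The caller allocates twice that amount as
 * the flexible bitmap[] array and carves it up into dbitmap and
 * ebitmap.
 */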
2864 static void scrub_parity_get(struct scrub_parity *sparity)
2866 refcount_inc(&sparity->refs);
2869 static void scrub_parity_put(struct scrub_parity *sparity)
2871 if (!refcount_dec_and_test(&sparity->refs))
2874 scrub_parity_check_and_repair(sparity);
2878 * Return 0 if the extent item range covers any byte of the range.
2879 * Return <0 if the extent item is before @search_start.
2880 * Return >0 if the extent item is after @search_start + @search_len.
2882 static int compare_extent_item_range(struct btrfs_path *path,
2883 u64 search_start, u64 search_len)
2885 struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info;
2887 struct btrfs_key key;
2889 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2890 ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY ||
2891 key.type == BTRFS_METADATA_ITEM_KEY);
2892 if (key.type == BTRFS_METADATA_ITEM_KEY)
2893 len = fs_info->nodesize;
2897 if (key.objectid + len <= search_start)
2899 if (key.objectid >= search_start + search_len)
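/*
 * For example, with a search range of [1MiB, 1MiB + 64KiB): a
 * metadata item at 1MiB - 16KiB with a 16KiB nodesize ends exactly
 * at the range start and compares as "before" (<0), while a data
 * extent item at 1MiB + 60KiB of 8KiB overlaps the range and
 * compares as covering it (0).
 */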
2905 * Locate one extent item which covers any byte in range
2906 * [@search_start, @search_start + @search_length)
2908 * If the path is not initialized, we will initialize the search by doing
2909 * a btrfs_search_slot().
2910 * If the path is already initialized, we will use the path as the initial
2911 * slot, to avoid duplicated btrfs_search_slot() calls.
2913 * NOTE: If an extent item starts before @search_start, we will still
2914 * return the extent item. This is for data extents crossing the stripe boundary.
2916 * Return 0 if we found such extent item, and @path will point to the extent item.
2917 * Return >0 if no such extent item can be found, and @path will be released.
2918 * Return <0 if hit fatal error, and @path will be released.
2920 static int find_first_extent_item(struct btrfs_root *extent_root,
2921 struct btrfs_path *path,
2922 u64 search_start, u64 search_len)
2924 struct btrfs_fs_info *fs_info = extent_root->fs_info;
2925 struct btrfs_key key;
2928 /* Continue using the existing path */
2930 goto search_forward;
2932 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2933 key.type = BTRFS_METADATA_ITEM_KEY;
2935 key.type = BTRFS_EXTENT_ITEM_KEY;
2936 key.objectid = search_start;
2937 key.offset = (u64)-1;
2939 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2945 * Here we intentionally pass 0 as @min_objectid, as there could be
2946 * an extent item starting before @search_start.
2948 ret = btrfs_previous_extent_item(extent_root, path, 0);
2952 * No matter whether we have found an extent item, the next loop will
2953 * properly do every check on the key.
2957 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2958 if (key.objectid >= search_start + search_len)
2960 if (key.type != BTRFS_METADATA_ITEM_KEY &&
2961 key.type != BTRFS_EXTENT_ITEM_KEY)
2964 ret = compare_extent_item_range(path, search_start, search_len);
2971 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
2972 ret = btrfs_next_leaf(extent_root, path);
2974 /* Either no more item or fatal error */
2975 btrfs_release_path(path);
2980 btrfs_release_path(path);
2984 static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret,
2985 u64 *size_ret, u64 *flags_ret, u64 *generation_ret)
2987 struct btrfs_key key;
2988 struct btrfs_extent_item *ei;
2990 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2991 ASSERT(key.type == BTRFS_METADATA_ITEM_KEY ||
2992 key.type == BTRFS_EXTENT_ITEM_KEY);
2993 *extent_start_ret = key.objectid;
2994 if (key.type == BTRFS_METADATA_ITEM_KEY)
2995 *size_ret = path->nodes[0]->fs_info->nodesize;
2997 *size_ret = key.offset;
2998 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item);
2999 *flags_ret = btrfs_extent_flags(path->nodes[0], ei);
3000 *generation_ret = btrfs_extent_generation(path->nodes[0], ei);
3003 static bool does_range_cross_boundary(u64 extent_start, u64 extent_len,
3004 u64 boundary_start, u64 boundary_len)
3006 return (extent_start < boundary_start &&
3007 extent_start + extent_len > boundary_start) ||
3008 (extent_start < boundary_start + boundary_len &&
3009 extent_start + extent_len > boundary_start + boundary_len);
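/*
 * For example, with a boundary of [64KiB, 128KiB): an extent
 * [60KiB, 72KiB) crosses the start boundary and an extent
 * [120KiB, 136KiB) crosses the end boundary, so both return true,
 * while [64KiB, 128KiB) itself or any extent fully inside it
 * returns false.
 */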
3012 static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx,
3013 struct scrub_parity *sparity,
3014 struct map_lookup *map,
3015 struct btrfs_device *sdev,
3016 struct btrfs_path *path,
3019 struct btrfs_fs_info *fs_info = sctx->fs_info;
3020 struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical);
3021 struct btrfs_root *csum_root = btrfs_csum_root(fs_info, logical);
3022 u64 cur_logical = logical;
3025 ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
3027 /* Path must not be populated */
3028 ASSERT(!path->nodes[0]);
3030 while (cur_logical < logical + map->stripe_len) {
3031 struct btrfs_io_context *bioc = NULL;
3032 struct btrfs_device *extent_dev;
3038 u64 extent_physical;
3039 u64 extent_mirror_num;
3041 ret = find_first_extent_item(extent_root, path, cur_logical,
3042 logical + map->stripe_len - cur_logical);
3043 /* No more extent item in this data stripe */
3050 get_extent_info(path, &extent_start, &extent_size, &extent_flags,
3053 /* Metadata should not cross stripe boundaries */
3054 if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3055 does_range_cross_boundary(extent_start, extent_size,
3056 logical, map->stripe_len)) {
3058 "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3059 extent_start, logical);
3060 spin_lock(&sctx->stat_lock);
3061 sctx->stat.uncorrectable_errors++;
3062 spin_unlock(&sctx->stat_lock);
3063 cur_logical += extent_size;
3067 /* Skip hole range which doesn't have any extent */
3068 cur_logical = max(extent_start, cur_logical);
3070 /* Truncate the range inside this data stripe */
3071 extent_size = min(extent_start + extent_size,
3072 logical + map->stripe_len) - cur_logical;
3073 extent_start = cur_logical;
3074 ASSERT(extent_size <= U32_MAX);
3076 scrub_parity_mark_sectors_data(sparity, extent_start, extent_size);
3078 mapped_length = extent_size;
3079 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_start,
3080 &mapped_length, &bioc, 0);
3081 if (!ret && (!bioc || mapped_length < extent_size))
3084 btrfs_put_bioc(bioc);
3085 scrub_parity_mark_sectors_error(sparity, extent_start,
3089 extent_physical = bioc->stripes[0].physical;
3090 extent_mirror_num = bioc->mirror_num;
3091 extent_dev = bioc->stripes[0].dev;
3092 btrfs_put_bioc(bioc);
3094 ret = btrfs_lookup_csums_range(csum_root, extent_start,
3095 extent_start + extent_size - 1,
3096 &sctx->csum_list, 1);
3098 scrub_parity_mark_sectors_error(sparity, extent_start,
3103 ret = scrub_extent_for_parity(sparity, extent_start,
3104 extent_size, extent_physical,
3105 extent_dev, extent_flags,
3106 extent_gen, extent_mirror_num);
3107 scrub_free_csums(sctx);
3110 scrub_parity_mark_sectors_error(sparity, extent_start,
3116 cur_logical += extent_size;
3118 btrfs_release_path(path);
3122 static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
3123 struct map_lookup *map,
3124 struct btrfs_device *sdev,
3128 struct btrfs_fs_info *fs_info = sctx->fs_info;
3129 struct btrfs_path *path;
3132 struct scrub_parity *sparity;
3136 path = btrfs_alloc_path();
3138 spin_lock(&sctx->stat_lock);
3139 sctx->stat.malloc_errors++;
3140 spin_unlock(&sctx->stat_lock);
3143 path->search_commit_root = 1;
3144 path->skip_locking = 1;
3146 ASSERT(map->stripe_len <= U32_MAX);
3147 nsectors = map->stripe_len >> fs_info->sectorsize_bits;
3148 bitmap_len = scrub_calc_parity_bitmap_len(nsectors);
3149 sparity = kzalloc(sizeof(struct scrub_parity) + 2 * bitmap_len,
3152 spin_lock(&sctx->stat_lock);
3153 sctx->stat.malloc_errors++;
3154 spin_unlock(&sctx->stat_lock);
3155 btrfs_free_path(path);
3159 ASSERT(map->stripe_len <= U32_MAX);
3160 sparity->stripe_len = map->stripe_len;
3161 sparity->nsectors = nsectors;
3162 sparity->sctx = sctx;
3163 sparity->scrub_dev = sdev;
3164 sparity->logic_start = logic_start;
3165 sparity->logic_end = logic_end;
3166 refcount_set(&sparity->refs, 1);
3167 INIT_LIST_HEAD(&sparity->sectors_list);
3168 sparity->dbitmap = sparity->bitmap;
3169 sparity->ebitmap = (void *)sparity->bitmap + bitmap_len;
3172 for (cur_logical = logic_start; cur_logical < logic_end;
3173 cur_logical += map->stripe_len) {
3174 ret = scrub_raid56_data_stripe_for_parity(sctx, sparity, map,
3175 sdev, path, cur_logical);
3180 scrub_parity_put(sparity);
3182 mutex_lock(&sctx->wr_lock);
3183 scrub_wr_submit(sctx);
3184 mutex_unlock(&sctx->wr_lock);
3186 btrfs_free_path(path);
3187 return ret < 0 ? ret : 0;
3190 static void sync_replace_for_zoned(struct scrub_ctx *sctx)
3192 if (!btrfs_is_zoned(sctx->fs_info))
3195 sctx->flush_all_writes = true;
3197 mutex_lock(&sctx->wr_lock);
3198 scrub_wr_submit(sctx);
3199 mutex_unlock(&sctx->wr_lock);
3201 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3204 static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
3205 u64 physical, u64 physical_end)
3207 struct btrfs_fs_info *fs_info = sctx->fs_info;
3210 if (!btrfs_is_zoned(fs_info))
3213 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3215 mutex_lock(&sctx->wr_lock);
3216 if (sctx->write_pointer < physical_end) {
3217 ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
3219 sctx->write_pointer);
3222 "zoned: failed to recover write pointer");
3224 mutex_unlock(&sctx->wr_lock);
3225 btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);
3231 * Scrub one range which can only have a simple mirror based profile.
3232 * (Including all ranges in SINGLE/DUP/RAID1/RAID1C*, and each stripe in
3235 * Since we may need to handle a subset of a block group, we need the @logical_start
3236 * and @logical_length parameters.
3238 static int scrub_simple_mirror(struct scrub_ctx *sctx,
3239 struct btrfs_root *extent_root,
3240 struct btrfs_root *csum_root,
3241 struct btrfs_block_group *bg,
3242 struct map_lookup *map,
3243 u64 logical_start, u64 logical_length,
3244 struct btrfs_device *device,
3245 u64 physical, int mirror_num)
3247 struct btrfs_fs_info *fs_info = sctx->fs_info;
3248 const u64 logical_end = logical_start + logical_length;
3249 /* An artificial limit, inherited from the old scrub behavior */
3250 const u32 max_length = SZ_64K;
3251 struct btrfs_path path = { 0 };
3252 u64 cur_logical = logical_start;
3255 /* The range must be inside the bg */
3256 ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
3258 path.search_commit_root = 1;
3259 path.skip_locking = 1;
3260 /* Go through each extent item inside the logical range */
3261 while (cur_logical < logical_end) {
3269 if (atomic_read(&fs_info->scrub_cancel_req) ||
3270 atomic_read(&sctx->cancel_req)) {
3275 if (atomic_read(&fs_info->scrub_pause_req)) {
3276 /* Push queued extents */
3277 sctx->flush_all_writes = true;
3279 mutex_lock(&sctx->wr_lock);
3280 scrub_wr_submit(sctx);
3281 mutex_unlock(&sctx->wr_lock);
3282 wait_event(sctx->list_wait,
3283 atomic_read(&sctx->bios_in_flight) == 0);
3284 sctx->flush_all_writes = false;
3285 scrub_blocked_if_needed(fs_info);
3287 /* Block group removed? */
3288 spin_lock(&bg->lock);
3290 spin_unlock(&bg->lock);
3294 spin_unlock(&bg->lock);
3296 ret = find_first_extent_item(extent_root, &path, cur_logical,
3297 logical_end - cur_logical);
3299 /* No more extent, just update the accounting */
3300 sctx->stat.last_physical = physical + logical_length;
3306 get_extent_info(&path, &extent_start, &extent_len,
3307 &extent_flags, &extent_gen);
3308 /* Skip hole range which doesn't have any extent */
3309 cur_logical = max(extent_start, cur_logical);
3312 * Scrub len has three limits:
3313 * - Extent size limit
3314 * - Scrub range limit
3315 * This is especially important for RAID0/RAID10 to reuse
3317 * - Max scrub size limit
3319 scrub_len = min(min(extent_start + extent_len,
3320 logical_end), cur_logical + max_length) -
3323 if (extent_flags & BTRFS_EXTENT_FLAG_DATA) {
3324 ret = btrfs_lookup_csums_range(csum_root, cur_logical,
3325 cur_logical + scrub_len - 1,
3326 &sctx->csum_list, 1);
3330 if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3331 does_range_cross_boundary(extent_start, extent_len,
3332 logical_start, logical_length)) {
3334 "scrub: tree block %llu spanning boundaries, ignored. boundary=[%llu, %llu)",
3335 extent_start, logical_start, logical_end);
3336 spin_lock(&sctx->stat_lock);
3337 sctx->stat.uncorrectable_errors++;
3338 spin_unlock(&sctx->stat_lock);
3339 cur_logical += scrub_len;
3342 ret = scrub_extent(sctx, map, cur_logical, scrub_len,
3343 cur_logical - logical_start + physical,
3344 device, extent_flags, extent_gen,
3346 scrub_free_csums(sctx);
3349 if (sctx->is_dev_replace)
3350 sync_replace_for_zoned(sctx);
3351 cur_logical += scrub_len;
3352 /* Don't hold the CPU for too long */
3355 btrfs_release_path(&path);
3359 /* Calculate the full stripe length for simple stripe based profiles */
3360 static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
3362 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3363 BTRFS_BLOCK_GROUP_RAID10));
3365 return map->num_stripes / map->sub_stripes * map->stripe_len;
3368 /* Get the logical bytenr for the stripe */
3369 static u64 simple_stripe_get_logical(struct map_lookup *map,
3370 struct btrfs_block_group *bg,
3373 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3374 BTRFS_BLOCK_GROUP_RAID10));
3375 ASSERT(stripe_index < map->num_stripes);
3378 * (stripe_index / sub_stripes) gives how many data stripes we need to
3381 return (stripe_index / map->sub_stripes) * map->stripe_len + bg->start;
3384 /* Get the mirror number for the stripe */
3385 static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
3387 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3388 BTRFS_BLOCK_GROUP_RAID10));
3389 ASSERT(stripe_index < map->num_stripes);
3391 /* For RAID0 it's always 1; for RAID10, stripe_index % sub_stripes is 0,1,0,1..., so mirror_num alternates 1,2,1,2... */
3392 return stripe_index % map->sub_stripes + 1;
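/*
 * A worked example for the three helpers above, assuming RAID10 with
 * num_stripes == 4, sub_stripes == 2 and a 64KiB stripe_len:
 *
 *   full stripe len           = 4 / 2 * 64KiB = 128KiB
 *   stripe_index 3 -> logical = (3 / 2) * 64KiB + bg->start
 *                             = bg->start + 64KiB
 *   stripe_index 3 -> mirror  = 3 % 2 + 1 = 2
 *
 * For RAID0, sub_stripes is 1, so mirror_num is always 1.
 */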
3395 static int scrub_simple_stripe(struct scrub_ctx *sctx,
3396 struct btrfs_root *extent_root,
3397 struct btrfs_root *csum_root,
3398 struct btrfs_block_group *bg,
3399 struct map_lookup *map,
3400 struct btrfs_device *device,
3403 const u64 logical_increment = simple_stripe_full_stripe_len(map);
3404 const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index);
3405 const u64 orig_physical = map->stripes[stripe_index].physical;
3406 const int mirror_num = simple_stripe_mirror_num(map, stripe_index);
3407 u64 cur_logical = orig_logical;
3408 u64 cur_physical = orig_physical;
3411 while (cur_logical < bg->start + bg->length) {
3413 * Inside each stripe, RAID0 is just SINGLE, and RAID10 is
3414 * just RAID1, so we can reuse scrub_simple_mirror() to scrub
3417 ret = scrub_simple_mirror(sctx, extent_root, csum_root, bg, map,
3418 cur_logical, map->stripe_len, device,
3419 cur_physical, mirror_num);
3422 /* Skip to next stripe which belongs to the target device */
3423 cur_logical += logical_increment;
3424 /* For physical offset, we just go to next stripe */
3425 cur_physical += map->stripe_len;
3430 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3431 struct btrfs_block_group *bg,
3432 struct map_lookup *map,
3433 struct btrfs_device *scrub_dev,
3434 int stripe_index, u64 dev_extent_len)
3436 struct btrfs_path *path;
3437 struct btrfs_fs_info *fs_info = sctx->fs_info;
3438 struct btrfs_root *root;
3439 struct btrfs_root *csum_root;
3440 struct blk_plug plug;
3441 const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
3442 const u64 chunk_logical = bg->start;
3444 u64 physical = map->stripes[stripe_index].physical;
3445 const u64 physical_end = physical + dev_extent_len;
3448 /* The logical increment after finishing one stripe */
3450 /* Offset inside the chunk */
3456 path = btrfs_alloc_path();
3461 * work on commit root. The related disk blocks are static as
3462 * long as COW is applied. This means it is safe to rewrite
3463 * them to repair disk errors without any race conditions
3465 path->search_commit_root = 1;
3466 path->skip_locking = 1;
3467 path->reada = READA_FORWARD;
3469 wait_event(sctx->list_wait,
3470 atomic_read(&sctx->bios_in_flight) == 0);
3471 scrub_blocked_if_needed(fs_info);
3473 root = btrfs_extent_root(fs_info, bg->start);
3474 csum_root = btrfs_csum_root(fs_info, bg->start);
3477 * collect all data csums for the stripe to avoid seeking during
3478 * the scrub. This might currently (crc32) end up being about 1MB
3480 blk_start_plug(&plug);
3482 if (sctx->is_dev_replace &&
3483 btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
3484 mutex_lock(&sctx->wr_lock);
3485 sctx->write_pointer = physical;
3486 mutex_unlock(&sctx->wr_lock);
3487 sctx->flush_all_writes = true;
3491 * There used to be a big double loop to handle all profiles using the
3492 * same routine, which grew larger and messier over time.
3494 * So here we handle each profile differently, so that simpler profiles
3495 * have simpler scrubbing functions.
3497 if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 |
3498 BTRFS_BLOCK_GROUP_RAID56_MASK))) {
3500 * The above check rules out all complex profiles; the remaining
3501 * profiles are SINGLE|DUP|RAID1|RAID1C*, which are simple
3502 * mirrored duplication without striping.
3504 * Only @physical and @mirror_num need to be calculated using
3507 ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
3508 bg->start, bg->length, scrub_dev,
3509 map->stripes[stripe_index].physical,
3514 if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
3515 ret = scrub_simple_stripe(sctx, root, csum_root, bg, map,
3516 scrub_dev, stripe_index);
3517 offset = map->stripe_len * (stripe_index / map->sub_stripes);
3521 /* Only RAID56 goes through the old code */
3522 ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
3525 /* Calculate the logical end of the stripe */
3526 get_raid56_logic_offset(physical_end, stripe_index,
3527 map, &logic_end, NULL);
3528 logic_end += chunk_logical;
3530 /* Initialize @offset in case we need to go to out: label */
3531 get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL);
3532 increment = map->stripe_len * nr_data_stripes(map);
3535 * Due to the rotation, for RAID56 it's better to iterate over each stripe
3536 * using its physical offset.
3538 while (physical < physical_end) {
3539 ret = get_raid56_logic_offset(physical, stripe_index, map,
3540 &logical, &stripe_logical);
3541 logical += chunk_logical;
3543 /* It is a parity stripe */
3544 stripe_logical += chunk_logical;
3545 stripe_end = stripe_logical + increment;
3546 ret = scrub_raid56_parity(sctx, map, scrub_dev,
3555 * Now we're at a data stripe, scrub each extent in the range.
3557 * At this stage, if we ignore the repair part, inside each data
3558 * stripe it is no different than SINGLE profile.
3559 * We can reuse scrub_simple_mirror() here, as the repair part
3560 * is still based on @mirror_num.
3562 ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
3563 logical, map->stripe_len,
3564 scrub_dev, physical, 1);
3568 logical += increment;
3569 physical += map->stripe_len;
3570 spin_lock(&sctx->stat_lock);
3572 sctx->stat.last_physical = map->stripes[stripe_index].physical +
3575 sctx->stat.last_physical = physical;
3576 spin_unlock(&sctx->stat_lock);
3581 /* push queued extents */
3583 mutex_lock(&sctx->wr_lock);
3584 scrub_wr_submit(sctx);
3585 mutex_unlock(&sctx->wr_lock);
3587 blk_finish_plug(&plug);
3588 btrfs_free_path(path);
3590 if (sctx->is_dev_replace && ret >= 0) {
3593 ret2 = sync_write_pointer_for_zoned(sctx,
3594 chunk_logical + offset,
3595 map->stripes[stripe_index].physical,
3601 return ret < 0 ? ret : 0;
3604 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
3605 struct btrfs_block_group *bg,
3606 struct btrfs_device *scrub_dev,
3610 struct btrfs_fs_info *fs_info = sctx->fs_info;
3611 struct extent_map_tree *map_tree = &fs_info->mapping_tree;
3612 struct map_lookup *map;
3613 struct extent_map *em;
3617 read_lock(&map_tree->lock);
3618 em = lookup_extent_mapping(map_tree, bg->start, bg->length);
3619 read_unlock(&map_tree->lock);
3623 * Might have been an unused block group deleted by the cleaner
3624 * kthread or relocation.
3626 spin_lock(&bg->lock);
3629 spin_unlock(&bg->lock);
3633 if (em->start != bg->start)
3635 if (em->len < dev_extent_len)
3638 map = em->map_lookup;
3639 for (i = 0; i < map->num_stripes; ++i) {
3640 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
3641 map->stripes[i].physical == dev_offset) {
3642 ret = scrub_stripe(sctx, bg, map, scrub_dev, i,
3649 free_extent_map(em);
3654 static int finish_extent_writes_for_zoned(struct btrfs_root *root,
3655 struct btrfs_block_group *cache)
3657 struct btrfs_fs_info *fs_info = cache->fs_info;
3658 struct btrfs_trans_handle *trans;
3660 if (!btrfs_is_zoned(fs_info))
3663 btrfs_wait_block_group_reservations(cache);
3664 btrfs_wait_nocow_writers(cache);
3665 btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length);
3667 trans = btrfs_join_transaction(root);
3669 return PTR_ERR(trans);
3670 return btrfs_commit_transaction(trans);
3673 static noinline_for_stack
3674 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3675 struct btrfs_device *scrub_dev, u64 start, u64 end)
3677 struct btrfs_dev_extent *dev_extent = NULL;
3678 struct btrfs_path *path;
3679 struct btrfs_fs_info *fs_info = sctx->fs_info;
3680 struct btrfs_root *root = fs_info->dev_root;
3685 struct extent_buffer *l;
3686 struct btrfs_key key;
3687 struct btrfs_key found_key;
3688 struct btrfs_block_group *cache;
3689 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
3691 path = btrfs_alloc_path();
3695 path->reada = READA_FORWARD;
3696 path->search_commit_root = 1;
3697 path->skip_locking = 1;
3699 key.objectid = scrub_dev->devid;
3701 key.type = BTRFS_DEV_EXTENT_KEY;
3706 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3710 if (path->slots[0] >=
3711 btrfs_header_nritems(path->nodes[0])) {
3712 ret = btrfs_next_leaf(root, path);
3725 slot = path->slots[0];
3727 btrfs_item_key_to_cpu(l, &found_key, slot);
3729 if (found_key.objectid != scrub_dev->devid)
3732 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
3735 if (found_key.offset >= end)
3738 if (found_key.offset < key.offset)
3741 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3742 dev_extent_len = btrfs_dev_extent_length(l, dev_extent);
3744 if (found_key.offset + dev_extent_len <= start)
3747 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3750 * get a reference on the corresponding block group to prevent
3751 * the chunk from going away while we scrub it
3753 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3755 /* some chunks are removed but not committed to disk yet,
3756 * continue scrubbing */
3760 ASSERT(cache->start <= chunk_offset);
3762 * We are using the commit root to search for device extents, so
3763 * that means we could have found a device extent item from a
3764 * block group that was deleted in the current transaction. The
3765 * logical start offset of the deleted block group, stored at
3766 * @chunk_offset, might be part of the logical address range of
3767 * a new block group (which uses different physical extents).
3768 * In this case btrfs_lookup_block_group() has returned the new
3769 * block group, and its start address is less than @chunk_offset.
3771 * We skip such new block groups, because it's pointless to
3772 * process them, as we won't find their extents because we search
3773 * for them using the commit root of the extent tree. For a device
3774 * replace it's also fine to skip it, we won't miss copying them
3775 * to the target device because we have the write duplication
3776 * setup through the regular write path (by btrfs_map_block()),
3777 * and we have committed a transaction when we started the device
3778 * replace, right after setting up the device replace state.
3780 if (cache->start < chunk_offset) {
3781 btrfs_put_block_group(cache);
3785 if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
3786 spin_lock(&cache->lock);
3787 if (!cache->to_copy) {
3788 spin_unlock(&cache->lock);
3789 btrfs_put_block_group(cache);
3792 spin_unlock(&cache->lock);
3796 * Make sure that while we are scrubbing the corresponding block
3797 * group doesn't get its logical address and its device extents
3798 * reused for another block group, which can possibly be of a
3799 * different type and different profile. We do this to prevent
3800 * false error detections and crashes due to bogus attempts to
3803 spin_lock(&cache->lock);
3804 if (cache->removed) {
3805 spin_unlock(&cache->lock);
3806 btrfs_put_block_group(cache);
3809 btrfs_freeze_block_group(cache);
3810 spin_unlock(&cache->lock);
3813 * we need to call btrfs_inc_block_group_ro() with scrubs_paused,
3814 * to avoid deadlock caused by:
3815 * btrfs_inc_block_group_ro()
3816 * -> btrfs_wait_for_commit()
3817 * -> btrfs_commit_transaction()
3818 * -> btrfs_scrub_pause()
3820 scrub_pause_on(fs_info);
3823 * Don't do chunk preallocation for scrub.
3825 * This is especially important for SYSTEM bgs, or we can hit
3826 * -EFBIG from btrfs_finish_chunk_alloc() like:
3827 * 1. The only SYSTEM bg is marked RO.
3828 * Since SYSTEM bg is small, that's pretty common.
3829 * 2. New SYSTEM bg will be allocated
3830 * Because the regular version will allocate a new chunk.
3831 * 3. New SYSTEM bg is empty and will get cleaned up
3832 * Before cleanup really happens, it's marked RO again.
3833 * 4. Empty SYSTEM bg gets scrubbed
3836 * This can easily boost the number of SYSTEM chunks if the cleaner
3837 * thread can't be triggered fast enough, using up all the space
3838 * of btrfs_super_block::sys_chunk_array.
3840 * While for dev replace, we need to try our best to mark block
3841 * group RO, to prevent race between:
3842 * - Write duplication
3843 * Contains latest data
3845 * Contains data from commit tree
3847 * If target block group is not marked RO, nocow writes can
3848 * be overwritten by scrub copy, causing data corruption.
3849 * So for dev-replace, it's not allowed to continue if a block
3852 ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
3853 if (!ret && sctx->is_dev_replace) {
3854 ret = finish_extent_writes_for_zoned(root, cache);
3856 btrfs_dec_block_group_ro(cache);
3857 scrub_pause_off(fs_info);
3858 btrfs_put_block_group(cache);
3865 } else if (ret == -ENOSPC && !sctx->is_dev_replace) {
3867 * btrfs_inc_block_group_ro() returns -ENOSPC when it
3868 * fails to create a new chunk for metadata.
3869 * It is not a problem for scrub, because
3870 * metadata is always COWed, and our scrub has paused
3871 * transaction commits.
3874 } else if (ret == -ETXTBSY) {
3876 "skipping scrub of block group %llu due to active swapfile",
3878 scrub_pause_off(fs_info);
3883 "failed setting block group ro: %d", ret);
3884 btrfs_unfreeze_block_group(cache);
3885 btrfs_put_block_group(cache);
3886 scrub_pause_off(fs_info);
3891 * Now the target block group is marked RO, wait for nocow writes to
3892 * finish before dev-replace.
3893 * COW is fine, as COW never overwrites extents in commit tree.
3895 if (sctx->is_dev_replace) {
3896 btrfs_wait_nocow_writers(cache);
3897 btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
3901 scrub_pause_off(fs_info);
3902 down_write(&dev_replace->rwsem);
3903 dev_replace->cursor_right = found_key.offset + dev_extent_len;
3904 dev_replace->cursor_left = found_key.offset;
3905 dev_replace->item_needs_writeback = 1;
3906 up_write(&dev_replace->rwsem);
3908 ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
3912 * flush, submit all pending read and write bios, afterwards
3914 * Note that in the dev replace case, a read request causes
3915 * write requests that are submitted in the read completion
3916 * worker. Therefore in the current situation, it is required
3917 * that all write requests are flushed, so that all read and
3918 * write requests are really completed when bios_in_flight
3921 sctx->flush_all_writes = true;
3923 mutex_lock(&sctx->wr_lock);
3924 scrub_wr_submit(sctx);
3925 mutex_unlock(&sctx->wr_lock);
3927 wait_event(sctx->list_wait,
3928 atomic_read(&sctx->bios_in_flight) == 0);
3930 scrub_pause_on(fs_info);
3933 * must be called before we decrease @scrub_paused.
3934 * Make sure we don't block transaction commit while
3935 * we are waiting for pending workers to finish.
3937 wait_event(sctx->list_wait,
3938 atomic_read(&sctx->workers_pending) == 0);
3939 sctx->flush_all_writes = false;
3941 scrub_pause_off(fs_info);
3943 if (sctx->is_dev_replace &&
3944 !btrfs_finish_block_group_to_copy(dev_replace->srcdev,
3945 cache, found_key.offset))
3948 down_write(&dev_replace->rwsem);
3949 dev_replace->cursor_left = dev_replace->cursor_right;
3950 dev_replace->item_needs_writeback = 1;
3951 up_write(&dev_replace->rwsem);
3954 btrfs_dec_block_group_ro(cache);
3957 * We might have prevented the cleaner kthread from deleting
3958 * this block group if it was already unused because we raced
3959 * and set it to RO mode first. So add it back to the unused
3960 * list, otherwise it might not ever be deleted unless a manual
3961 * balance is triggered or it becomes used and unused again.
3963 spin_lock(&cache->lock);
3964 if (!cache->removed && !cache->ro && cache->reserved == 0 &&
3966 spin_unlock(&cache->lock);
3967 if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
3968 btrfs_discard_queue_work(&fs_info->discard_ctl,
3971 btrfs_mark_bg_unused(cache);
3973 spin_unlock(&cache->lock);
3976 btrfs_unfreeze_block_group(cache);
3977 btrfs_put_block_group(cache);
3980 if (sctx->is_dev_replace &&
3981 atomic64_read(&dev_replace->num_write_errors) > 0) {
3985 if (sctx->stat.malloc_errors > 0) {
3990 key.offset = found_key.offset + dev_extent_len;
3991 btrfs_release_path(path);
3994 btrfs_free_path(path);
3999 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
4000 struct btrfs_device *scrub_dev)
4006 struct btrfs_fs_info *fs_info = sctx->fs_info;
4008 if (BTRFS_FS_ERROR(fs_info))
4011 /* Seed devices of a new filesystem have their own generation. */
4012 if (scrub_dev->fs_devices != fs_info->fs_devices)
4013 gen = scrub_dev->generation;
4015 gen = fs_info->last_trans_committed;
4017 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
4018 bytenr = btrfs_sb_offset(i);
4019 if (bytenr + BTRFS_SUPER_INFO_SIZE >
4020 scrub_dev->commit_total_bytes)
4022 if (!btrfs_check_super_location(scrub_dev, bytenr))
4025 ret = scrub_sectors(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
4026 scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
4031 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
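/*
 * For reference (not derived from this file): btrfs keeps up to
 * BTRFS_SUPER_MIRROR_MAX super block copies per device, normally at
 * offsets 64KiB, 64MiB and 256GiB.  Copies that do not fit within
 * commit_total_bytes are not scrubbed by the loop in scrub_supers()
 * above.
 */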
4036 static void scrub_workers_put(struct btrfs_fs_info *fs_info)
4038 if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
4039 &fs_info->scrub_lock)) {
4040 struct workqueue_struct *scrub_workers = fs_info->scrub_workers;
4041 struct workqueue_struct *scrub_wr_comp =
4042 fs_info->scrub_wr_completion_workers;
4043 struct workqueue_struct *scrub_parity =
4044 fs_info->scrub_parity_workers;
4046 fs_info->scrub_workers = NULL;
4047 fs_info->scrub_wr_completion_workers = NULL;
4048 fs_info->scrub_parity_workers = NULL;
4049 mutex_unlock(&fs_info->scrub_lock);
4052 destroy_workqueue(scrub_workers);
4054 destroy_workqueue(scrub_wr_comp);
4056 destroy_workqueue(scrub_parity);
4061 * Get a reference count on fs_info->scrub_workers. Start workers if necessary.
4063 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
4066 struct workqueue_struct *scrub_workers = NULL;
4067 struct workqueue_struct *scrub_wr_comp = NULL;
4068 struct workqueue_struct *scrub_parity = NULL;
4069 unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
4070 int max_active = fs_info->thread_pool_size;
4073 if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
4076 scrub_workers = alloc_workqueue("btrfs-scrub", flags,
4077 is_dev_replace ? 1 : max_active);
4079 goto fail_scrub_workers;
4081 scrub_wr_comp = alloc_workqueue("btrfs-scrubwrc", flags, max_active);
4083 goto fail_scrub_wr_completion_workers;
4085 scrub_parity = alloc_workqueue("btrfs-scrubparity", flags, max_active);
4087 goto fail_scrub_parity_workers;
4089 mutex_lock(&fs_info->scrub_lock);
4090 if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
4091 ASSERT(fs_info->scrub_workers == NULL &&
4092 fs_info->scrub_wr_completion_workers == NULL &&
4093 fs_info->scrub_parity_workers == NULL);
4094 fs_info->scrub_workers = scrub_workers;
4095 fs_info->scrub_wr_completion_workers = scrub_wr_comp;
4096 fs_info->scrub_parity_workers = scrub_parity;
4097 refcount_set(&fs_info->scrub_workers_refcnt, 1);
4098 mutex_unlock(&fs_info->scrub_lock);
4101 /* Another thread raced in and created the workers for us */
4102 refcount_inc(&fs_info->scrub_workers_refcnt);
4103 mutex_unlock(&fs_info->scrub_lock);
4106 destroy_workqueue(scrub_parity);
4107 fail_scrub_parity_workers:
4108 destroy_workqueue(scrub_wr_comp);
4109 fail_scrub_wr_completion_workers:
4110 destroy_workqueue(scrub_workers);
4115 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
4116 u64 end, struct btrfs_scrub_progress *progress,
4117 int readonly, int is_dev_replace)
4119 struct btrfs_dev_lookup_args args = { .devid = devid };
4120 struct scrub_ctx *sctx;
4122 struct btrfs_device *dev;
4123 unsigned int nofs_flag;
4125 if (btrfs_fs_closing(fs_info))
4128 if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
4130 * in this case scrub is unable to calculate the checksum
4131 * the way scrub is implemented. Do not handle this
4132 * situation at all because it won't ever happen.
4135 "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
4141 if (fs_info->nodesize >
4142 SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits ||
4143 fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_SECTORS_PER_BLOCK) {
4145 * Would exhaust the array bounds of the sectors[] member in
4146 * struct scrub_block
4149 "scrub: nodesize and sectorsize <= SCRUB_MAX_SECTORS_PER_BLOCK (%d <= %d && %d <= %d) fails",
4150 fs_info->nodesize, SCRUB_MAX_SECTORS_PER_BLOCK,
4151 fs_info->sectorsize, SCRUB_MAX_SECTORS_PER_BLOCK);
4155 /* Allocate outside of device_list_mutex */
4156 sctx = scrub_setup_ctx(fs_info, is_dev_replace);
4158 return PTR_ERR(sctx);
4160 ret = scrub_workers_get(fs_info, is_dev_replace);
4164 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4165 dev = btrfs_find_device(fs_info->fs_devices, &args);
4166 if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
4168 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4173 if (!is_dev_replace && !readonly &&
4174 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
4175 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4176 btrfs_err_in_rcu(fs_info,
4177 "scrub on devid %llu: filesystem on %s is not writable",
4178 devid, rcu_str_deref(dev->name));
4183 mutex_lock(&fs_info->scrub_lock);
4184 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4185 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
4186 mutex_unlock(&fs_info->scrub_lock);
4187 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4192 down_read(&fs_info->dev_replace.rwsem);
4193 if (dev->scrub_ctx ||
4195 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
4196 up_read(&fs_info->dev_replace.rwsem);
4197 mutex_unlock(&fs_info->scrub_lock);
4198 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4202 up_read(&fs_info->dev_replace.rwsem);
4204 sctx->readonly = readonly;
4205 dev->scrub_ctx = sctx;
4206 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4209 * By checking @scrub_pause_req here, we can avoid
4210 * a race between committing a transaction and scrubbing.
4212 __scrub_blocked_if_needed(fs_info);
4213 atomic_inc(&fs_info->scrubs_running);
4214 mutex_unlock(&fs_info->scrub_lock);
4217 * In order to avoid deadlock with reclaim when there is a transaction
4218 * trying to pause scrub, make sure we use GFP_NOFS for all the
4219 * allocations done at scrub_sectors() and scrub_sectors_for_parity()
4220 * invoked by our callees. The pausing request is done when the
4221 * transaction commit starts, and it blocks the transaction until scrub
4222 * is paused (done at specific points at scrub_stripe() or right above
4223 * before incrementing fs_info->scrubs_running).
4225 nofs_flag = memalloc_nofs_save();
4226 if (!is_dev_replace) {
4227 btrfs_info(fs_info, "scrub: started on devid %llu", devid);
4229 * by holding device list mutex, we can
4230 * kick off writing super in log tree sync.
4232 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4233 ret = scrub_supers(sctx, dev);
4234 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4238 ret = scrub_enumerate_chunks(sctx, dev, start, end);
4239 memalloc_nofs_restore(nofs_flag);
4241 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
4242 atomic_dec(&fs_info->scrubs_running);
4243 wake_up(&fs_info->scrub_pause_wait);
4245 wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
4248 memcpy(progress, &sctx->stat, sizeof(*progress));
4250 if (!is_dev_replace)
4251 btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
4252 ret ? "not finished" : "finished", devid, ret);
4254 mutex_lock(&fs_info->scrub_lock);
4255 dev->scrub_ctx = NULL;
4256 mutex_unlock(&fs_info->scrub_lock);
4258 scrub_workers_put(fs_info);
4259 scrub_put_ctx(sctx);
4263 scrub_workers_put(fs_info);
4265 scrub_free_ctx(sctx);
4270 void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
4272 mutex_lock(&fs_info->scrub_lock);
4273 atomic_inc(&fs_info->scrub_pause_req);
4274 while (atomic_read(&fs_info->scrubs_paused) !=
4275 atomic_read(&fs_info->scrubs_running)) {
4276 mutex_unlock(&fs_info->scrub_lock);
4277 wait_event(fs_info->scrub_pause_wait,
4278 atomic_read(&fs_info->scrubs_paused) ==
4279 atomic_read(&fs_info->scrubs_running));
4280 mutex_lock(&fs_info->scrub_lock);
4282 mutex_unlock(&fs_info->scrub_lock);
4285 void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
4287 atomic_dec(&fs_info->scrub_pause_req);
4288 wake_up(&fs_info->scrub_pause_wait);
4291 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
4293 mutex_lock(&fs_info->scrub_lock);
4294 if (!atomic_read(&fs_info->scrubs_running)) {
4295 mutex_unlock(&fs_info->scrub_lock);
4299 atomic_inc(&fs_info->scrub_cancel_req);
4300 while (atomic_read(&fs_info->scrubs_running)) {
4301 mutex_unlock(&fs_info->scrub_lock);
4302 wait_event(fs_info->scrub_pause_wait,
4303 atomic_read(&fs_info->scrubs_running) == 0);
4304 mutex_lock(&fs_info->scrub_lock);
4306 atomic_dec(&fs_info->scrub_cancel_req);
4307 mutex_unlock(&fs_info->scrub_lock);
4312 int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
4314 struct btrfs_fs_info *fs_info = dev->fs_info;
4315 struct scrub_ctx *sctx;
4317 mutex_lock(&fs_info->scrub_lock);
4318 sctx = dev->scrub_ctx;
4320 mutex_unlock(&fs_info->scrub_lock);
4323 atomic_inc(&sctx->cancel_req);
4324 while (dev->scrub_ctx) {
4325 mutex_unlock(&fs_info->scrub_lock);
4326 wait_event(fs_info->scrub_pause_wait,
4327 dev->scrub_ctx == NULL);
4328 mutex_lock(&fs_info->scrub_lock);
4330 mutex_unlock(&fs_info->scrub_lock);
4335 int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
4336 struct btrfs_scrub_progress *progress)
4338 struct btrfs_dev_lookup_args args = { .devid = devid };
4339 struct btrfs_device *dev;
4340 struct scrub_ctx *sctx = NULL;
4342 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4343 dev = btrfs_find_device(fs_info->fs_devices, &args);
4345 sctx = dev->scrub_ctx;
4347 memcpy(progress, &sctx->stat, sizeof(*progress));
4348 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4350 return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
4353 static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
4354 u64 extent_logical, u32 extent_len,
4355 u64 *extent_physical,
4356 struct btrfs_device **extent_dev,
4357 int *extent_mirror_num)
4360 struct btrfs_io_context *bioc = NULL;
4363 mapped_length = extent_len;
4364 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
4365 &mapped_length, &bioc, 0);
4366 if (ret || !bioc || mapped_length < extent_len ||
4367 !bioc->stripes[0].dev->bdev) {
4368 btrfs_put_bioc(bioc);
4372 *extent_physical = bioc->stripes[0].physical;
4373 *extent_mirror_num = bioc->mirror_num;
4374 *extent_dev = bioc->stripes[0].dev;
4375 btrfs_put_bioc(bioc);