1 // SPDX-License-Identifier: GPL-2.0
3 * Copyright (C) 2011, 2012 STRATO. All rights reserved.
6 #include <linux/blkdev.h>
7 #include <linux/ratelimit.h>
8 #include <linux/sched/mm.h>
9 #include <crypto/hash.h>
14 #include "ordered-data.h"
15 #include "transaction.h"
17 #include "extent_io.h"
18 #include "dev-replace.h"
19 #include "check-integrity.h"
20 #include "rcu-string.h"
22 #include "block-group.h"
26 * This is only the first step towards a full-featured scrub. It reads all
27 * extents and super blocks and verifies the checksums. In case a bad checksum
28 * is found or the extent cannot be read, good data will be written back if
31 * Future enhancements:
32 * - In case an unrepairable extent is encountered, track which files are
33 * affected and report them
34 * - track and record media errors, throw out bad devices
35 * - add a mode to also read unallocated space
42 * The following two values only influence the performance.
44 * The last one configures the number of parallel and outstanding I/O
45 * operations. The first one configures an upper limit for the number
46 * of (dynamically allocated) pages that are added to a bio.
48 #define SCRUB_SECTORS_PER_BIO 32 /* 128KiB per bio for 4KiB pages */
49 #define SCRUB_BIOS_PER_SCTX 64 /* 8MiB per device in flight for 4KiB pages */
52 * The following value times PAGE_SIZE needs to be large enough to match the
53 * largest node/leaf/sector size that shall be supported.
55 #define SCRUB_MAX_SECTORS_PER_BLOCK (BTRFS_MAX_METADATA_BLOCKSIZE / SZ_4K)
/*
 * Recovery context attached to a scrub sector (reached as
 * sector->recover). It is reference counted through scrub_get_recover()
 * and scrub_put_recover().
 */
57 struct scrub_recover {
/* I/O context; released with btrfs_put_bioc() when the last ref is put */
59 struct btrfs_io_context *bioc;
64 struct scrub_block *sblock;
66 struct btrfs_device *dev;
67 struct list_head list;
68 u64 flags; /* extent flags */
72 u64 physical_for_dev_replace;
75 unsigned int have_csum:1;
76 unsigned int io_error:1;
77 u8 csum[BTRFS_CSUM_SIZE];
79 struct scrub_recover *recover;
84 struct scrub_ctx *sctx;
85 struct btrfs_device *dev;
90 struct scrub_sector *sectors[SCRUB_SECTORS_PER_BIO];
93 struct work_struct work;
97 struct scrub_sector *sectors[SCRUB_MAX_SECTORS_PER_BLOCK];
99 atomic_t outstanding_sectors;
100 refcount_t refs; /* free mem on transition to zero */
101 struct scrub_ctx *sctx;
102 struct scrub_parity *sparity;
104 unsigned int header_error:1;
105 unsigned int checksum_error:1;
106 unsigned int no_io_error_seen:1;
107 unsigned int generation_error:1; /* also sets header_error */
109 /* The following is for the data used to check parity */
110 /* It is for the data with checksum */
111 unsigned int data_corrected:1;
113 struct work_struct work;
116 /*
 * Used for chunks with a parity stripe, such as RAID5/6.
 * NOTE(review): presumably reference counted via scrub_parity_get() and
 * scrub_parity_put(), whose prototypes take this type - confirm.
 */
117 struct scrub_parity {
118 struct scrub_ctx *sctx;
120 struct btrfs_device *scrub_dev;
132 struct list_head sectors_list;
134 /* Work of parity check and repair */
135 struct work_struct work;
137 /* Mark the parity blocks which have data */
138 unsigned long dbitmap;
141 * Mark the parity blocks which have data, but errors happen when
142 * read data or check data
144 unsigned long ebitmap;
148 struct scrub_bio *bios[SCRUB_BIOS_PER_SCTX];
149 struct btrfs_fs_info *fs_info;
152 atomic_t bios_in_flight;
153 atomic_t workers_pending;
154 spinlock_t list_lock;
155 wait_queue_head_t list_wait;
156 struct list_head csum_list;
161 /* State of IO submission throttling affecting the associated device */
162 ktime_t throttle_deadline;
168 struct scrub_bio *wr_curr_bio;
169 struct mutex wr_lock;
170 struct btrfs_device *wr_tgtdev;
171 bool flush_all_writes;
176 struct btrfs_scrub_progress stat;
177 spinlock_t stat_lock;
180 * Use a ref counter to avoid use-after-free issues. Scrub workers
181 * decrement bios_in_flight and workers_pending and then do a wakeup
182 * on the list_wait wait queue. We must ensure the main scrub task
183 * doesn't free the scrub context before or while the workers are
184 * doing the wakeup() call.
/*
 * Context handed to scrub_print_warning_inode() (as its warn_ctx
 * argument) by scrub_print_warning() while an error is being reported.
 */
189 struct scrub_warning {
/* Path reused for the inode item lookup and for init_ipath() */
190 struct btrfs_path *path;
/* Filled from the offset of the extent key found for the logical address */
191 u64 extent_item_size;
/* Device named in the warning; also the source of fs_info */
195 struct btrfs_device *dev;
198 struct full_stripe_lock {
/*
 * Forward declarations of file-local (static) helpers, so that they can
 * be referenced before their definitions.
 */
205 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
206 struct scrub_block *sblocks_for_recheck);
207 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
208 struct scrub_block *sblock,
209 int retry_failed_mirror);
210 static void scrub_recheck_block_checksum(struct scrub_block *sblock);
211 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
212 struct scrub_block *sblock_good);
213 static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
214 struct scrub_block *sblock_good,
215 int sector_num, int force_write);
216 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock);
217 static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock,
219 static int scrub_checksum_data(struct scrub_block *sblock);
220 static int scrub_checksum_tree_block(struct scrub_block *sblock);
221 static int scrub_checksum_super(struct scrub_block *sblock);
222 static void scrub_block_put(struct scrub_block *sblock);
223 static void scrub_sector_get(struct scrub_sector *sector);
224 static void scrub_sector_put(struct scrub_sector *sector);
225 static void scrub_parity_get(struct scrub_parity *sparity);
226 static void scrub_parity_put(struct scrub_parity *sparity);
227 static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
228 u64 physical, struct btrfs_device *dev, u64 flags,
229 u64 gen, int mirror_num, u8 *csum,
230 u64 physical_for_dev_replace);
231 static void scrub_bio_end_io(struct bio *bio);
232 static void scrub_bio_end_io_worker(struct work_struct *work);
233 static void scrub_block_complete(struct scrub_block *sblock);
234 static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
235 u64 extent_logical, u32 extent_len,
236 u64 *extent_physical,
237 struct btrfs_device **extent_dev,
238 int *extent_mirror_num);
239 static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
240 struct scrub_sector *sector);
241 static void scrub_wr_submit(struct scrub_ctx *sctx);
242 static void scrub_wr_bio_end_io(struct bio *bio);
243 static void scrub_wr_bio_end_io_worker(struct work_struct *work);
244 static void scrub_put_ctx(struct scrub_ctx *sctx);
/*
 * Return non-zero when @sector has a recover context whose mapped type
 * has any BTRFS_BLOCK_GROUP_RAID56_MASK bit set. A sector without a
 * recover context yields 0.
 */
246 static inline int scrub_is_page_on_raid56(struct scrub_sector *sector)
248 return sector->recover &&
249 (sector->recover->bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK);
/*
 * Account one more in-flight bio on @sctx. A reference on the context is
 * taken first so the context stays alive while the bio is pending.
 */
252 static void scrub_pending_bio_inc(struct scrub_ctx *sctx)
254 refcount_inc(&sctx->refs);
255 atomic_inc(&sctx->bios_in_flight);
/*
 * Account the completion of one in-flight bio on @sctx and wake up
 * anyone sleeping on sctx->list_wait.
 *
 * NOTE(review): scrub_pending_bio_inc() takes a reference on @sctx;
 * confirm that the matching scrub_put_ctx() happens on this path.
 */
258 static void scrub_pending_bio_dec(struct scrub_ctx *sctx)
260 atomic_dec(&sctx->bios_in_flight);
261 wake_up(&sctx->list_wait);
/*
 * Sleep until fs_info->scrub_pause_req reads zero.
 *
 * fs_info->scrub_lock is dropped while sleeping and re-taken before the
 * pause request is checked again; scrub_pause_off() calls this with
 * scrub_lock held.
 */
265 static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
267 while (atomic_read(&fs_info->scrub_pause_req)) {
268 mutex_unlock(&fs_info->scrub_lock);
269 wait_event(fs_info->scrub_pause_wait,
270 atomic_read(&fs_info->scrub_pause_req) == 0);
271 mutex_lock(&fs_info->scrub_lock);
/*
 * Mark this scrub as paused: bump fs_info->scrubs_paused and wake up
 * waiters on fs_info->scrub_pause_wait so they can observe the change.
 */
275 static void scrub_pause_on(struct btrfs_fs_info *fs_info)
277 atomic_inc(&fs_info->scrubs_paused);
278 wake_up(&fs_info->scrub_pause_wait);
/*
 * Leave the paused state: under fs_info->scrub_lock, wait until no pause
 * is requested any more, then drop fs_info->scrubs_paused. Waiters on
 * fs_info->scrub_pause_wait are woken after the lock is released.
 */
281 static void scrub_pause_off(struct btrfs_fs_info *fs_info)
283 mutex_lock(&fs_info->scrub_lock);
284 __scrub_blocked_if_needed(fs_info);
285 atomic_dec(&fs_info->scrubs_paused);
286 mutex_unlock(&fs_info->scrub_lock);
288 wake_up(&fs_info->scrub_pause_wait);
/*
 * Enter the paused state and immediately try to leave it again; the
 * leave step blocks for as long as a pause request is outstanding.
 */
291 static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info)
293 scrub_pause_on(fs_info);
294 scrub_pause_off(fs_info);
298 * Insert new full stripe lock into full stripe locks tree
300 * Return pointer to existing or newly inserted full_stripe_lock structure if
301 * everything works well.
302 * Return ERR_PTR(-ENOMEM) if we failed to allocate memory
304 * NOTE: caller must hold full_stripe_locks_root->lock before calling this
/*
 * Look up @fstripe_logical in the rb-tree of @locks_root, ordered by
 * full_stripe_lock::logical, and link a newly allocated entry when the
 * walk ends without a match.
 *
 * locks_root->lock must be held by the caller (asserted below).
 */
307 static struct full_stripe_lock *insert_full_stripe_lock(
308 struct btrfs_full_stripe_locks_tree *locks_root,
312 struct rb_node *parent = NULL;
313 struct full_stripe_lock *entry;
314 struct full_stripe_lock *ret;
316 lockdep_assert_held(&locks_root->lock);
318 p = &locks_root->root.rb_node;
321 entry = rb_entry(parent, struct full_stripe_lock, node);
322 if (fstripe_logical < entry->logical) {
324 } else if (fstripe_logical > entry->logical) {
/*
 * GFP_KERNEL here relies on the caller's allocation scope:
 * scrub_handle_errored_block() wraps its lock_full_stripe() call in
 * memalloc_nofs_save()/memalloc_nofs_restore().
 */
335 ret = kmalloc(sizeof(*ret), GFP_KERNEL);
337 return ERR_PTR(-ENOMEM);
338 ret->logical = fstripe_logical;
340 mutex_init(&ret->mutex);
/* Link the new entry at the position found by the walk above */
342 rb_link_node(&ret->node, parent, p);
343 rb_insert_color(&ret->node, &locks_root->root);
348 * Search for a full stripe lock of a block group
350 * Return pointer to existing full stripe lock if found
351 * Return NULL if not found
/*
 * Walk the rb-tree of @locks_root looking for the entry whose ->logical
 * equals @fstripe_logical, descending left for smaller keys and right
 * for larger ones.
 *
 * locks_root->lock must be held by the caller (asserted below).
 */
353 static struct full_stripe_lock *search_full_stripe_lock(
354 struct btrfs_full_stripe_locks_tree *locks_root,
357 struct rb_node *node;
358 struct full_stripe_lock *entry;
360 lockdep_assert_held(&locks_root->lock);
362 node = locks_root->root.rb_node;
364 entry = rb_entry(node, struct full_stripe_lock, node);
365 if (fstripe_logical < entry->logical)
366 node = node->rb_left;
367 else if (fstripe_logical > entry->logical)
368 node = node->rb_right;
376 * Helper to get full stripe logical from a normal bytenr.
378 * Caller must ensure @cache is a RAID56 block group.
/*
 * Compute the logical start of the full stripe of @cache that contains
 * @bytenr: the offset of @bytenr from cache->start is rounded down to a
 * multiple of cache->full_stripe_len and cache->start is added back.
 */
380 static u64 get_full_stripe_logical(struct btrfs_block_group *cache, u64 bytenr)
385 * Due to chunk item size limit, full stripe length should not be
386 * larger than U32_MAX. Just a sanity check here.
388 WARN_ON_ONCE(cache->full_stripe_len >= U32_MAX);
391 * round_down() can only handle power of 2, while RAID56 full
392 * stripe length can be 64KiB * n, so we need to manually round down.
394 ret = div64_u64(bytenr - cache->start, cache->full_stripe_len) *
395 cache->full_stripe_len + cache->start;
400 * Lock a full stripe to avoid concurrency of recovery and read
402 * It's only used for profiles with parities (RAID5/6), for other profiles it
405 * Return 0 if we locked full stripe covering @bytenr, with a mutex held.
406 * So caller must call unlock_full_stripe() at the same context.
408 * Return <0 if an error is encountered.
/*
 * Take the per-full-stripe mutex covering @bytenr.
 *
 * The block group containing @bytenr is looked up. For block groups
 * without a RAID56 profile bit the locking is skipped. Otherwise the
 * full stripe start is computed, a lock entry is obtained from
 * insert_full_stripe_lock() under locks_root->lock, and the entry's own
 * mutex is taken after locks_root->lock has been released. The block
 * group reference is dropped before returning.
 */
410 static int lock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
413 struct btrfs_block_group *bg_cache;
414 struct btrfs_full_stripe_locks_tree *locks_root;
415 struct full_stripe_lock *existing;
420 bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
426 /* Profiles not based on parity don't need full stripe lock */
427 if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
429 locks_root = &bg_cache->full_stripe_locks_root;
431 fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
433 /* Now insert the full stripe lock */
434 mutex_lock(&locks_root->lock);
435 existing = insert_full_stripe_lock(locks_root, fstripe_start);
436 mutex_unlock(&locks_root->lock);
437 if (IS_ERR(existing)) {
438 ret = PTR_ERR(existing);
/* May sleep; taken only after the tree lock has been dropped */
441 mutex_lock(&existing->mutex);
444 btrfs_put_block_group(bg_cache);
449 * Unlock a full stripe.
451 * NOTE: Caller must ensure it's the same context calling corresponding
452 * lock_full_stripe().
454 * Return 0 if we unlock full stripe without problem.
455 * Return <0 for error
/*
 * Release the per-full-stripe mutex covering @bytenr.
 *
 * Mirrors lock_full_stripe(): the block group is looked up, block
 * groups without a RAID56 profile bit are skipped, and the lock entry is
 * searched under locks_root->lock. The entry's reference count is
 * decremented (an underflow is reported with btrfs_warn()), and the
 * entry is erased from the tree once the count reaches zero. The
 * entry's mutex is released after locks_root->lock.
 */
457 static int unlock_full_stripe(struct btrfs_fs_info *fs_info, u64 bytenr,
460 struct btrfs_block_group *bg_cache;
461 struct btrfs_full_stripe_locks_tree *locks_root;
462 struct full_stripe_lock *fstripe_lock;
467 /* If we didn't acquire full stripe lock, no need to continue */
471 bg_cache = btrfs_lookup_block_group(fs_info, bytenr);
476 if (!(bg_cache->flags & BTRFS_BLOCK_GROUP_RAID56_MASK))
479 locks_root = &bg_cache->full_stripe_locks_root;
480 fstripe_start = get_full_stripe_logical(bg_cache, bytenr);
482 mutex_lock(&locks_root->lock);
483 fstripe_lock = search_full_stripe_lock(locks_root, fstripe_start);
484 /* Unpaired unlock_full_stripe() detected */
488 mutex_unlock(&locks_root->lock);
492 if (fstripe_lock->refs == 0) {
494 btrfs_warn(fs_info, "full stripe lock at %llu refcount underflow",
495 fstripe_lock->logical);
497 fstripe_lock->refs--;
/* Last user gone: remove the entry from the tree */
500 if (fstripe_lock->refs == 0) {
501 rb_erase(&fstripe_lock->node, &locks_root->root);
504 mutex_unlock(&locks_root->lock);
506 mutex_unlock(&fstripe_lock->mutex);
510 btrfs_put_block_group(bg_cache);
/*
 * Drain sctx->csum_list: repeatedly take the first btrfs_ordered_sum
 * and unlink it until the list is empty.
 */
514 static void scrub_free_csums(struct scrub_ctx *sctx)
516 while (!list_empty(&sctx->csum_list)) {
517 struct btrfs_ordered_sum *sum;
518 sum = list_first_entry(&sctx->csum_list,
519 struct btrfs_ordered_sum, list);
520 list_del(&sum->list);
/*
 * Tear down a scrub context.
 *
 * If a bio is still being assembled (sctx->curr != -1), the block
 * reference held for each of its sectors is dropped. Then every slot of
 * sctx->bios is visited, the pending write bio container is freed and
 * the checksum list is drained.
 */
525 static noinline_for_stack void scrub_free_ctx(struct scrub_ctx *sctx)
532 /* this can happen when scrub is cancelled */
533 if (sctx->curr != -1) {
534 struct scrub_bio *sbio = sctx->bios[sctx->curr];
536 for (i = 0; i < sbio->sector_count; i++) {
537 WARN_ON(!sbio->sectors[i]->page);
538 scrub_block_put(sbio->sectors[i]->sblock);
543 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
544 struct scrub_bio *sbio = sctx->bios[i];
551 kfree(sctx->wr_curr_bio);
552 scrub_free_csums(sctx);
/*
 * Drop one reference on @sctx; the context is torn down by
 * scrub_free_ctx() when the last reference goes away.
 */
556 static void scrub_put_ctx(struct scrub_ctx *sctx)
558 if (refcount_dec_and_test(&sctx->refs))
559 scrub_free_ctx(sctx);
/*
 * Allocate and initialise a scrub context for @fs_info.
 *
 * The context starts with a single reference. SCRUB_BIOS_PER_SCTX
 * scrub_bio containers are allocated and chained into a free list
 * through ->next_free (the last one is terminated with -1), with
 * sctx->first_free pointing at slot 0. Counters, locks and the wait
 * queue are initialised, and for device replace the write target is
 * taken from fs_info->dev_replace.tgtdev.
 *
 * On the visible failure path the partially built context is released
 * with scrub_free_ctx() and ERR_PTR(-ENOMEM) is returned.
 */
562 static noinline_for_stack struct scrub_ctx *scrub_setup_ctx(
563 struct btrfs_fs_info *fs_info, int is_dev_replace)
565 struct scrub_ctx *sctx;
568 sctx = kzalloc(sizeof(*sctx), GFP_KERNEL);
571 refcount_set(&sctx->refs, 1);
572 sctx->is_dev_replace = is_dev_replace;
573 sctx->sectors_per_bio = SCRUB_SECTORS_PER_BIO;
575 sctx->fs_info = fs_info;
576 INIT_LIST_HEAD(&sctx->csum_list);
577 for (i = 0; i < SCRUB_BIOS_PER_SCTX; ++i) {
578 struct scrub_bio *sbio;
580 sbio = kzalloc(sizeof(*sbio), GFP_KERNEL);
583 sctx->bios[i] = sbio;
587 sbio->sector_count = 0;
588 INIT_WORK(&sbio->work, scrub_bio_end_io_worker);
/* Chain the containers into a free list; -1 terminates it */
590 if (i != SCRUB_BIOS_PER_SCTX - 1)
591 sctx->bios[i]->next_free = i + 1;
593 sctx->bios[i]->next_free = -1;
595 sctx->first_free = 0;
596 atomic_set(&sctx->bios_in_flight, 0);
597 atomic_set(&sctx->workers_pending, 0);
598 atomic_set(&sctx->cancel_req, 0);
600 spin_lock_init(&sctx->list_lock);
601 spin_lock_init(&sctx->stat_lock);
602 init_waitqueue_head(&sctx->list_wait);
603 sctx->throttle_deadline = 0;
/* sctx came from kzalloc(), so this pointer is expected to be NULL */
605 WARN_ON(sctx->wr_curr_bio != NULL);
606 mutex_init(&sctx->wr_lock);
607 sctx->wr_curr_bio = NULL;
608 if (is_dev_replace) {
609 WARN_ON(!fs_info->dev_replace.tgtdev);
610 sctx->wr_tgtdev = fs_info->dev_replace.tgtdev;
611 sctx->flush_all_writes = false;
617 scrub_free_ctx(sctx);
618 return ERR_PTR(-ENOMEM);
/*
 * Print one warning per resolved file path for an inode affected by a
 * scrub error.
 *
 * @warn_ctx is the struct scrub_warning prepared by the caller. The
 * root @root is looked up, the inode item of @inum is searched to read
 * its link count, and the inode's paths are resolved via init_ipath()
 * and paths_from_inode(). Each resolved path is reported with
 * btrfs_warn_in_rcu(); the last visible message is the fallback printed
 * when path resolving failed.
 */
621 static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
628 struct extent_buffer *eb;
629 struct btrfs_inode_item *inode_item;
630 struct scrub_warning *swarn = warn_ctx;
631 struct btrfs_fs_info *fs_info = swarn->dev->fs_info;
632 struct inode_fs_paths *ipath = NULL;
633 struct btrfs_root *local_root;
634 struct btrfs_key key;
636 local_root = btrfs_get_fs_root(fs_info, root, true);
637 if (IS_ERR(local_root)) {
638 ret = PTR_ERR(local_root);
643 * this makes the path point to (inum INODE_ITEM ioff)
646 key.type = BTRFS_INODE_ITEM_KEY;
649 ret = btrfs_search_slot(NULL, local_root, &key, swarn->path, 0, 0);
651 btrfs_put_root(local_root);
652 btrfs_release_path(swarn->path);
656 eb = swarn->path->nodes[0];
657 inode_item = btrfs_item_ptr(eb, swarn->path->slots[0],
658 struct btrfs_inode_item);
659 nlink = btrfs_inode_nlink(eb, inode_item);
/* The link count has been read; the path can be reused from here on */
660 btrfs_release_path(swarn->path);
663 * init_path might indirectly call vmalloc, or use GFP_KERNEL. Scrub
664 * uses GFP_NOFS in this context, so we keep it consistent but it does
665 * not seem to be strictly necessary.
667 nofs_flag = memalloc_nofs_save();
668 ipath = init_ipath(4096, local_root, swarn->path);
669 memalloc_nofs_restore(nofs_flag);
671 btrfs_put_root(local_root);
672 ret = PTR_ERR(ipath);
676 ret = paths_from_inode(inum, ipath);
682 * we deliberately ignore the bit ipath might have been too small to
683 * hold all of the paths here
685 for (i = 0; i < ipath->fspath->elem_cnt; ++i)
686 btrfs_warn_in_rcu(fs_info,
687 "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
688 swarn->errstr, swarn->logical,
689 rcu_str_deref(swarn->dev->name),
692 fs_info->sectorsize, nlink,
693 (char *)(unsigned long)ipath->fspath->val[i]);
695 btrfs_put_root(local_root);
700 btrfs_warn_in_rcu(fs_info,
701 "%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
702 swarn->errstr, swarn->logical,
703 rcu_str_deref(swarn->dev->name),
705 root, inum, offset, ret);
/*
 * Report an error found in @sblock, prefixed with @errstr.
 *
 * The physical and logical addresses are taken from the first sector of
 * the block. The extent covering the logical address is looked up. For
 * a tree block, one message naming the owning tree and level is
 * printed. Otherwise iterate_extent_inodes() calls
 * scrub_print_warning_inode() for the inodes referencing the extent.
 */
711 static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
713 struct btrfs_device *dev;
714 struct btrfs_fs_info *fs_info;
715 struct btrfs_path *path;
716 struct btrfs_key found_key;
717 struct extent_buffer *eb;
718 struct btrfs_extent_item *ei;
719 struct scrub_warning swarn;
720 unsigned long ptr = 0;
728 WARN_ON(sblock->sector_count < 1);
729 dev = sblock->sectors[0]->dev;
730 fs_info = sblock->sctx->fs_info;
732 path = btrfs_alloc_path();
736 swarn.physical = sblock->sectors[0]->physical;
737 swarn.logical = sblock->sectors[0]->logical;
738 swarn.errstr = errstr;
741 ret = extent_from_logical(fs_info, swarn.logical, path, &found_key,
/* Offset of the failing logical address inside the extent found */
746 extent_item_pos = swarn.logical - found_key.objectid;
747 swarn.extent_item_size = found_key.offset;
750 ei = btrfs_item_ptr(eb, path->slots[0], struct btrfs_extent_item);
751 item_size = btrfs_item_size(eb, path->slots[0]);
753 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
755 ret = tree_backref_for_extent(&ptr, eb, &found_key, ei,
756 item_size, &ref_root,
/* Level and tree are printed as -1 when the backref lookup failed */
758 btrfs_warn_in_rcu(fs_info,
759 "%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
760 errstr, swarn.logical,
761 rcu_str_deref(dev->name),
763 ref_level ? "node" : "leaf",
764 ret < 0 ? -1 : ref_level,
765 ret < 0 ? -1 : ref_root);
767 btrfs_release_path(path);
769 btrfs_release_path(path);
772 iterate_extent_inodes(fs_info, found_key.objectid,
774 scrub_print_warning_inode, &swarn, false);
778 btrfs_free_path(path);
/* Take an additional reference on @recover. */
781 static inline void scrub_get_recover(struct scrub_recover *recover)
783 refcount_inc(&recover->refs);
/*
 * Drop one reference on @recover. When the last reference goes away the
 * bio counter of @fs_info is decremented and the I/O context held in
 * recover->bioc is released.
 */
786 static inline void scrub_put_recover(struct btrfs_fs_info *fs_info,
787 struct scrub_recover *recover)
789 if (refcount_dec_and_test(&recover->refs)) {
790 btrfs_bio_counter_dec(fs_info);
791 btrfs_put_bioc(recover->bioc);
797 * scrub_handle_errored_block gets called when either verification of the
798 * sectors failed or the bio failed to read, e.g. with EIO. In the latter
799 * case, this function handles all sectors in the bio, even though only one
801 * The goal of this function is to repair the errored block by using the
802 * contents of one of the mirrors.
/*
 * Handle a block that failed verification or could not be read.
 *
 * Flow, as far as it is visible here:
 *  - errors in a super block are only counted (stat.super_errors);
 *  - otherwise the full stripe covering the block is locked, one
 *    scrub_block per mirror is allocated and set up, and the failed
 *    mirror is re-read with scrub_recheck_block();
 *  - if the error does not reproduce, it is counted as unverified;
 *  - otherwise the error is classified (I/O, checksum, header) and
 *    counted, and the other mirrors are tried: first as a whole-block
 *    good copy, then sector by sector;
 *  - the result is recorded in stat.corrected_errors or
 *    stat.uncorrectable_errors, the recheck blocks are released and the
 *    full stripe is unlocked.
 *
 * Allocations made between memalloc_nofs_save() and
 * memalloc_nofs_restore() run in NOFS scope.
 */
804 static int scrub_handle_errored_block(struct scrub_block *sblock_to_check)
806 struct scrub_ctx *sctx = sblock_to_check->sctx;
807 struct btrfs_device *dev;
808 struct btrfs_fs_info *fs_info;
810 unsigned int failed_mirror_index;
811 unsigned int is_metadata;
812 unsigned int have_csum;
813 struct scrub_block *sblocks_for_recheck; /* holds one for each mirror */
814 struct scrub_block *sblock_bad;
819 bool full_stripe_locked;
820 unsigned int nofs_flag;
821 static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL,
822 DEFAULT_RATELIMIT_BURST);
824 BUG_ON(sblock_to_check->sector_count < 1);
825 fs_info = sctx->fs_info;
826 if (sblock_to_check->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
828 * if we find an error in a super block, we just report it.
829 * They will get written with the next transaction commit
832 spin_lock(&sctx->stat_lock);
833 ++sctx->stat.super_errors;
834 spin_unlock(&sctx->stat_lock);
/* Properties of the block are read from its first sector */
837 logical = sblock_to_check->sectors[0]->logical;
838 BUG_ON(sblock_to_check->sectors[0]->mirror_num < 1);
/* mirror_num is 1-based, the recheck array is 0-based */
839 failed_mirror_index = sblock_to_check->sectors[0]->mirror_num - 1;
840 is_metadata = !(sblock_to_check->sectors[0]->flags &
841 BTRFS_EXTENT_FLAG_DATA);
842 have_csum = sblock_to_check->sectors[0]->have_csum;
843 dev = sblock_to_check->sectors[0]->dev;
845 if (!sctx->is_dev_replace && btrfs_repair_one_zone(fs_info, logical))
849 * We must use GFP_NOFS because the scrub task might be waiting for a
850 * worker task executing this function and in turn a transaction commit
851 * might be waiting the scrub task to pause (which needs to wait for all
852 * the worker tasks to complete before pausing).
853 * We do allocations in the workers through insert_full_stripe_lock()
854 * and scrub_add_sector_to_wr_bio(), which happens down the call chain of
857 nofs_flag = memalloc_nofs_save();
859 * For RAID5/6, race can happen for a different device scrub thread.
860 * For data corruption, Parity and Data threads will both try
861 * to recovery the data.
862 * Race can lead to doubly added csum error, or even unrecoverable
865 ret = lock_full_stripe(fs_info, logical, &full_stripe_locked);
867 memalloc_nofs_restore(nofs_flag);
868 spin_lock(&sctx->stat_lock);
870 sctx->stat.malloc_errors++;
871 sctx->stat.read_errors++;
872 sctx->stat.uncorrectable_errors++;
873 spin_unlock(&sctx->stat_lock);
878 * read all mirrors one after the other. This includes to
879 * re-read the extent or metadata block that failed (that was
880 * the cause that this fixup code is called) another time,
881 * sector by sector this time in order to know which sectors
882 * caused I/O errors and which ones are good (for all mirrors).
883 * It is the goal to handle the situation when more than one
884 * mirror contains I/O errors, but the errors do not
885 * overlap, i.e. the data can be repaired by selecting the
886 * sectors from those mirrors without I/O error on the
887 * particular sectors. One example (with blocks >= 2 * sectorsize)
888 * would be that mirror #1 has an I/O error on the first sector,
889 * the second sector is good, and mirror #2 has an I/O error on
890 * the second sector, but the first sector is good.
891 * Then the first sector of the first mirror can be repaired by
892 * taking the first sector of the second mirror, and the
893 * second sector of the second mirror can be repaired by
894 * copying the contents of the 2nd sector of the 1st mirror.
895 * One more note: if the sectors of one mirror contain I/O
896 * errors, the checksum cannot be verified. In order to get
897 * the best data for repairing, the first attempt is to find
898 * a mirror without I/O errors and with a validated checksum.
899 * Only if this is not possible, the sectors are picked from
900 * mirrors with I/O errors without considering the checksum.
901 * If the latter is the case, at the end, the checksum of the
902 * repaired area is verified in order to correctly maintain
/* GFP_KERNEL is used inside the NOFS scope opened above */
906 sblocks_for_recheck = kcalloc(BTRFS_MAX_MIRRORS,
907 sizeof(*sblocks_for_recheck), GFP_KERNEL);
908 if (!sblocks_for_recheck) {
909 spin_lock(&sctx->stat_lock);
910 sctx->stat.malloc_errors++;
911 sctx->stat.read_errors++;
912 sctx->stat.uncorrectable_errors++;
913 spin_unlock(&sctx->stat_lock);
914 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
918 /* Setup the context, map the logical blocks and alloc the sectors */
919 ret = scrub_setup_recheck_block(sblock_to_check, sblocks_for_recheck);
921 spin_lock(&sctx->stat_lock);
922 sctx->stat.read_errors++;
923 sctx->stat.uncorrectable_errors++;
924 spin_unlock(&sctx->stat_lock);
925 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
928 BUG_ON(failed_mirror_index >= BTRFS_MAX_MIRRORS);
929 sblock_bad = sblocks_for_recheck + failed_mirror_index;
931 /* build and submit the bios for the failed mirror, check checksums */
932 scrub_recheck_block(fs_info, sblock_bad, 1);
934 if (!sblock_bad->header_error && !sblock_bad->checksum_error &&
935 sblock_bad->no_io_error_seen) {
937 * The error disappeared after reading sector by sector, or
938 * the area was part of a huge bio and other parts of the
939 * bio caused I/O errors, or the block layer merged several
940 * read requests into one and the error is caused by a
941 * different bio (usually one of the two latter cases is
944 spin_lock(&sctx->stat_lock);
945 sctx->stat.unverified_errors++;
946 sblock_to_check->data_corrected = 1;
947 spin_unlock(&sctx->stat_lock);
949 if (sctx->is_dev_replace)
950 scrub_write_block_to_dev_replace(sblock_bad);
/* Classify the error; I/O errors take precedence over checksum errors */
954 if (!sblock_bad->no_io_error_seen) {
955 spin_lock(&sctx->stat_lock);
956 sctx->stat.read_errors++;
957 spin_unlock(&sctx->stat_lock);
958 if (__ratelimit(&rs))
959 scrub_print_warning("i/o error", sblock_to_check);
960 btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
961 } else if (sblock_bad->checksum_error) {
962 spin_lock(&sctx->stat_lock);
963 sctx->stat.csum_errors++;
964 spin_unlock(&sctx->stat_lock);
965 if (__ratelimit(&rs))
966 scrub_print_warning("checksum error", sblock_to_check);
967 btrfs_dev_stat_inc_and_print(dev,
968 BTRFS_DEV_STAT_CORRUPTION_ERRS);
969 } else if (sblock_bad->header_error) {
970 spin_lock(&sctx->stat_lock);
971 sctx->stat.verify_errors++;
972 spin_unlock(&sctx->stat_lock);
973 if (__ratelimit(&rs))
974 scrub_print_warning("checksum/header error",
976 if (sblock_bad->generation_error)
977 btrfs_dev_stat_inc_and_print(dev,
978 BTRFS_DEV_STAT_GENERATION_ERRS);
980 btrfs_dev_stat_inc_and_print(dev,
981 BTRFS_DEV_STAT_CORRUPTION_ERRS);
984 if (sctx->readonly) {
985 ASSERT(!sctx->is_dev_replace);
990 * now build and submit the bios for the other mirrors, check
992 * First try to pick the mirror which is completely without I/O
993 * errors and also does not have a checksum error.
994 * If one is found, and if a checksum is present, the full block
995 * that is known to contain an error is rewritten. Afterwards
996 * the block is known to be corrected.
997 * If a mirror is found which is completely correct, and no
998 * checksum is present, only those sectors are rewritten that had
999 * an I/O error in the block to be repaired, since it cannot be
1000 * determined, which copy of the other sectors is better (and it
1001 * could happen otherwise that a correct sector would be
1002 * overwritten by a bad one).
1004 for (mirror_index = 0; ;mirror_index++) {
1005 struct scrub_block *sblock_other;
1007 if (mirror_index == failed_mirror_index)
1010 /* raid56's mirror can be more than BTRFS_MAX_MIRRORS */
1011 if (!scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
1012 if (mirror_index >= BTRFS_MAX_MIRRORS)
1014 if (!sblocks_for_recheck[mirror_index].sector_count)
1017 sblock_other = sblocks_for_recheck + mirror_index;
1019 struct scrub_recover *r = sblock_bad->sectors[0]->recover;
1020 int max_allowed = r->bioc->num_stripes - r->bioc->num_tgtdevs;
1022 if (mirror_index >= max_allowed)
1024 if (!sblocks_for_recheck[1].sector_count)
/* For RAID56, slot 1 is reused for every additional mirror number */
1027 ASSERT(failed_mirror_index == 0);
1028 sblock_other = sblocks_for_recheck + 1;
1029 sblock_other->sectors[0]->mirror_num = 1 + mirror_index;
1032 /* build and submit the bios, check checksums */
1033 scrub_recheck_block(fs_info, sblock_other, 0);
1035 if (!sblock_other->header_error &&
1036 !sblock_other->checksum_error &&
1037 sblock_other->no_io_error_seen) {
1038 if (sctx->is_dev_replace) {
1039 scrub_write_block_to_dev_replace(sblock_other);
1040 goto corrected_error;
1042 ret = scrub_repair_block_from_good_copy(
1043 sblock_bad, sblock_other);
1045 goto corrected_error;
1050 if (sblock_bad->no_io_error_seen && !sctx->is_dev_replace)
1051 goto did_not_correct_error;
1054 * In case of I/O errors in the area that is supposed to be
1055 * repaired, continue by picking good copies of those sectors.
1056 * Select the good sectors from mirrors to rewrite bad sectors from
1057 * the area to fix. Afterwards verify the checksum of the block
1058 * that is supposed to be repaired. This verification step is
1059 * only done for the purpose of statistic counting and for the
1060 * final scrub report, whether errors remain.
1061 * A perfect algorithm could make use of the checksum and try
1062 * all possible combinations of sectors from the different mirrors
1063 * until the checksum verification succeeds. For example, when
1064 * the 2nd sector of mirror #1 faces I/O errors, and the 2nd sector
1065 * of mirror #2 is readable but the final checksum test fails,
1066 * then the 2nd sector of mirror #3 could be tried, whether now
1067 * the final checksum succeeds. But this would be a rare
1068 * exception and is therefore not implemented. At least it is
1069 * avoided that the good copy is overwritten.
1070 * A more useful improvement would be to pick the sectors
1071 * without I/O error based on sector sizes (512 bytes on legacy
1072 * disks) instead of on sectorsize. Then maybe 512 byte of one
1073 * mirror could be repaired by taking 512 byte of a different
1074 * mirror, even if other 512 byte sectors in the same sectorsize
1075 * area are unreadable.
1078 for (sector_num = 0; sector_num < sblock_bad->sector_count;
1080 struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
1081 struct scrub_block *sblock_other = NULL;
1083 /* Skip no-io-error sectors in scrub */
1084 if (!sector_bad->io_error && !sctx->is_dev_replace)
1087 if (scrub_is_page_on_raid56(sblock_bad->sectors[0])) {
1089 * In case of dev replace, if raid56 rebuild process
1090 * didn't work out correct data, then copy the content
1091 * in sblock_bad to make sure target device is identical
1092 * to source device, instead of writing garbage data in
1093 * sblock_for_recheck array to target device.
1095 sblock_other = NULL;
1096 } else if (sector_bad->io_error) {
1097 /* Try to find no-io-error sector in mirrors */
1098 for (mirror_index = 0;
1099 mirror_index < BTRFS_MAX_MIRRORS &&
1100 sblocks_for_recheck[mirror_index].sector_count > 0;
1102 if (!sblocks_for_recheck[mirror_index].
1103 sectors[sector_num]->io_error) {
1104 sblock_other = sblocks_for_recheck +
1113 if (sctx->is_dev_replace) {
1115 * Did not find a mirror to fetch the sector from.
1116 * scrub_write_sector_to_dev_replace() handles this
1117 * case (sector->io_error), by filling the block with
1118 * zeros before submitting the write request
1121 sblock_other = sblock_bad;
1123 if (scrub_write_sector_to_dev_replace(sblock_other,
1126 &fs_info->dev_replace.num_write_errors);
1129 } else if (sblock_other) {
1130 ret = scrub_repair_sector_from_good_copy(sblock_bad,
1134 sector_bad->io_error = 0;
1140 if (success && !sctx->is_dev_replace) {
1141 if (is_metadata || have_csum) {
1143 * need to verify the checksum now that all
1144 * sectors on disk are repaired (the write
1145 * request for data to be repaired is on its way).
1146 * Just be lazy and use scrub_recheck_block()
1147 * which re-reads the data before the checksum
1148 * is verified, but most likely the data comes out
1149 * of the page cache.
1151 scrub_recheck_block(fs_info, sblock_bad, 1);
1152 if (!sblock_bad->header_error &&
1153 !sblock_bad->checksum_error &&
1154 sblock_bad->no_io_error_seen)
1155 goto corrected_error;
1157 goto did_not_correct_error;
/* Success accounting: count the fix and flag the original block */
1160 spin_lock(&sctx->stat_lock);
1161 sctx->stat.corrected_errors++;
1162 sblock_to_check->data_corrected = 1;
1163 spin_unlock(&sctx->stat_lock);
1164 btrfs_err_rl_in_rcu(fs_info,
1165 "fixed up error at logical %llu on dev %s",
1166 logical, rcu_str_deref(dev->name));
1169 did_not_correct_error:
1170 spin_lock(&sctx->stat_lock);
1171 sctx->stat.uncorrectable_errors++;
1172 spin_unlock(&sctx->stat_lock);
1173 btrfs_err_rl_in_rcu(fs_info,
1174 "unable to fixup (regular) error at logical %llu on dev %s",
1175 logical, rcu_str_deref(dev->name));
/* Release every recheck block: detach and put each of its sectors */
1179 if (sblocks_for_recheck) {
1180 for (mirror_index = 0; mirror_index < BTRFS_MAX_MIRRORS;
1182 struct scrub_block *sblock = sblocks_for_recheck +
1184 struct scrub_recover *recover;
1187 for (i = 0; i < sblock->sector_count; i++) {
1188 sblock->sectors[i]->sblock = NULL;
1189 recover = sblock->sectors[i]->recover;
1191 scrub_put_recover(fs_info, recover);
1192 sblock->sectors[i]->recover = NULL;
1194 scrub_sector_put(sblock->sectors[i]);
1197 kfree(sblocks_for_recheck);
1200 ret = unlock_full_stripe(fs_info, logical, full_stripe_locked);
1201 memalloc_nofs_restore(nofs_flag);
/*
 * Return the number of mirrors to consider for @bioc. RAID5 and RAID6
 * mappings are special-cased (their return values are not shown in this
 * listing); every other mapping type yields bioc->num_stripes.
 */
1207 static inline int scrub_nr_raid_mirrors(struct btrfs_io_context *bioc)
1209 if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID5)
1211 else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID6)
1214 return (int)bioc->num_stripes;
/*
 * Work out which stripe of a mapping holds @logical and at what offset.
 *
 * For RAID5/6 map types the raid_map array is scanned: P/Q entries are
 * tested for separately, and the entry whose range
 * [raid_map[i], raid_map[i] + BTRFS_STRIPE_LEN) contains @logical provides
 * the offset (logical - raid_map[i]).  For every other profile the stripe
 * index is simply @mirror.
 * NOTE(review): part of the parameter list and the statements taken on a
 * P/Q match are not visible in this listing.
 */
1217 static inline void scrub_stripe_index_and_offset(u64 logical, u64 map_type,
1219 int nstripes, int mirror,
1225 if (map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
1227 for (i = 0; i < nstripes; i++) {
1228 if (raid_map[i] == RAID6_Q_STRIPE ||
1229 raid_map[i] == RAID5_P_STRIPE)
1232 if (logical >= raid_map[i] &&
1233 logical < raid_map[i] + BTRFS_STRIPE_LEN)
1238 *stripe_offset = logical - raid_map[i];
1240 /* The other RAID type */
1241 *stripe_index = mirror;
/*
 * Populate @sblocks_for_recheck with one scrub_block per mirror, each
 * describing the same logical range as @original_sblock.
 *
 * The range is walked one sector (fs_info->sectorsize) at a time.  For each
 * step the logical address is mapped with BTRFS_MAP_GET_READ_MIRRORS, a
 * scrub_recover is allocated to hold the resulting bioc, and for each of at
 * most BTRFS_MAX_MIRRORS mirrors a scrub_sector and a page are allocated.
 * The new sector inherits flags, generation, logical address and checksum
 * state from the first sector of the original block; its physical address
 * and device come from the mapped stripe.  Every sector takes its own
 * reference on the scrub_recover, and the function's reference is dropped
 * with scrub_put_recover() afterwards.
 *
 * A failed sector allocation increments stat.malloc_errors.
 * NOTE(review): the return statements and error codes are not visible in
 * this listing.
 */
1246 static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
1247 struct scrub_block *sblocks_for_recheck)
1249 struct scrub_ctx *sctx = original_sblock->sctx;
1250 struct btrfs_fs_info *fs_info = sctx->fs_info;
1251 u64 length = original_sblock->sector_count << fs_info->sectorsize_bits;
1252 u64 logical = original_sblock->sectors[0]->logical;
1253 u64 generation = original_sblock->sectors[0]->generation;
1254 u64 flags = original_sblock->sectors[0]->flags;
1255 u64 have_csum = original_sblock->sectors[0]->have_csum;
1256 struct scrub_recover *recover;
1257 struct btrfs_io_context *bioc;
1262 int sector_index = 0;
1268 * Note: the two members refs and outstanding_sectors are not used (and
1269 * not set) in the blocks that are used for the recheck procedure.
1272 while (length > 0) {
1273 sublen = min_t(u64, length, fs_info->sectorsize);
1274 mapped_length = sublen;
1278 * With a length of sectorsize, each returned stripe represents
1281 btrfs_bio_counter_inc_blocked(fs_info);
1282 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS,
1283 logical, &mapped_length, &bioc);
1284 if (ret || !bioc || mapped_length < sublen) {
1285 btrfs_put_bioc(bioc);
1286 btrfs_bio_counter_dec(fs_info);
1290 recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
1292 btrfs_put_bioc(bioc);
1293 btrfs_bio_counter_dec(fs_info);
1297 refcount_set(&recover->refs, 1);
1298 recover->bioc = bioc;
1299 recover->map_length = mapped_length;
1301 ASSERT(sector_index < SCRUB_MAX_SECTORS_PER_BLOCK);
1303 nmirrors = min(scrub_nr_raid_mirrors(bioc), BTRFS_MAX_MIRRORS);
1305 for (mirror_index = 0; mirror_index < nmirrors;
1307 struct scrub_block *sblock;
1308 struct scrub_sector *sector;
1310 sblock = sblocks_for_recheck + mirror_index;
1311 sblock->sctx = sctx;
1313 sector = kzalloc(sizeof(*sector), GFP_NOFS);
1316 spin_lock(&sctx->stat_lock);
1317 sctx->stat.malloc_errors++;
1318 spin_unlock(&sctx->stat_lock);
1319 scrub_put_recover(fs_info, recover);
1322 scrub_sector_get(sector);
1323 sblock->sectors[sector_index] = sector;
1324 sector->sblock = sblock;
1325 sector->flags = flags;
1326 sector->generation = generation;
1327 sector->logical = logical;
1328 sector->have_csum = have_csum;
1330 memcpy(sector->csum,
1331 original_sblock->sectors[0]->csum,
1332 sctx->fs_info->csum_size);
1334 scrub_stripe_index_and_offset(logical,
1342 sector->physical = bioc->stripes[stripe_index].physical +
1344 sector->dev = bioc->stripes[stripe_index].dev;
1346 BUG_ON(sector_index >= original_sblock->sector_count);
1347 sector->physical_for_dev_replace =
1348 original_sblock->sectors[sector_index]->
1349 physical_for_dev_replace;
1350 /* For missing devices, dev->bdev is NULL */
1351 sector->mirror_num = mirror_index + 1;
1352 sblock->sector_count++;
1353 sector->page = alloc_page(GFP_NOFS);
1357 scrub_get_recover(recover);
1358 sector->recover = recover;
1360 scrub_put_recover(fs_info, recover);
/*
 * Bio completion callback: signal the completion object that the submitter
 * stored in bio->bi_private.
 */
1369 static void scrub_bio_wait_endio(struct bio *bio)
1371 complete(bio->bi_private);
/*
 * Submit @bio through the RAID56 recovery path and wait for it to finish.
 *
 * The bio is positioned at the logical address of @sector (in 512-byte
 * units) and completes into an on-stack completion via
 * scrub_bio_wait_endio().  raid56_parity_recover() is given the bioc kept
 * in sector->recover and the mirror number of the first sector of the
 * owning block.
 *
 * Returns the bio status converted to an errno by blk_status_to_errno().
 */
1374 static int scrub_submit_raid56_bio_wait(struct btrfs_fs_info *fs_info,
1376 struct scrub_sector *sector)
1378 DECLARE_COMPLETION_ONSTACK(done);
1380 bio->bi_iter.bi_sector = sector->logical >> 9;
1381 bio->bi_private = &done;
1382 bio->bi_end_io = scrub_bio_wait_endio;
1383 raid56_parity_recover(bio, sector->recover->bioc,
1384 sector->sblock->sectors[0]->mirror_num, false);
1386 wait_for_completion_io(&done);
1387 return blk_status_to_errno(bio->bi_status);
/*
 * Re-read a whole scrub block that lives on RAID56 using a single bio sent
 * through scrub_submit_raid56_bio_wait(), then re-verify its checksum.
 *
 * Every sector page of @sblock is added to the bio.  The trailing
 * statements flag every sector with io_error and clear
 * sblock->no_io_error_seen.
 * NOTE(review): the branch targets (missing bdev, failed submission) are
 * not visible in this listing; presumably both lead to that error marking.
 */
1390 static void scrub_recheck_block_on_raid56(struct btrfs_fs_info *fs_info,
1391 struct scrub_block *sblock)
1393 struct scrub_sector *first_sector = sblock->sectors[0];
1397 /* All sectors in sblock belong to the same stripe on the same device. */
1398 ASSERT(first_sector->dev);
1399 if (!first_sector->dev->bdev)
1402 bio = bio_alloc(first_sector->dev->bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
1404 for (i = 0; i < sblock->sector_count; i++) {
1405 struct scrub_sector *sector = sblock->sectors[i];
1407 WARN_ON(!sector->page);
1408 bio_add_page(bio, sector->page, PAGE_SIZE, 0);
1411 if (scrub_submit_raid56_bio_wait(fs_info, bio, first_sector)) {
1418 scrub_recheck_block_checksum(sblock);
1422 for (i = 0; i < sblock->sector_count; i++)
1423 sblock->sectors[i]->io_error = 1;
1425 sblock->no_io_error_seen = 0;
1429 * This function will check the on disk data for checksum errors, header errors
1430 * and read I/O errors. If any I/O errors happen, the exact sectors which are
1431 * errored are marked as being bad. The goal is to enable scrub to take those
1432 * sectors that are not errored from all the mirrors so that the sectors that
1433 * are errored in the just handled mirror can be repaired.
/*
 * Re-read every sector of @sblock and record which ones fail.
 *
 * @retry_failed_mirror: when zero and the block is on RAID56, the whole
 *                       block is handed to scrub_recheck_block_on_raid56()
 *                       instead of being read sector by sector.
 *
 * Otherwise each sector is read synchronously with its own on-stack bio at
 * sector->physical.  A sector whose device has no bdev, or whose read
 * fails, gets io_error set and clears sblock->no_io_error_seen.  The
 * checksum is only re-verified when no I/O error was seen.
 */
1435 static void scrub_recheck_block(struct btrfs_fs_info *fs_info,
1436 struct scrub_block *sblock,
1437 int retry_failed_mirror)
1441 sblock->no_io_error_seen = 1;
1443 /* short cut for raid56 */
1444 if (!retry_failed_mirror && scrub_is_page_on_raid56(sblock->sectors[0]))
1445 return scrub_recheck_block_on_raid56(fs_info, sblock);
1447 for (i = 0; i < sblock->sector_count; i++) {
1448 struct scrub_sector *sector = sblock->sectors[i];
1450 struct bio_vec bvec;
1452 if (sector->dev->bdev == NULL) {
1453 sector->io_error = 1;
1454 sblock->no_io_error_seen = 0;
1458 WARN_ON(!sector->page);
1459 bio_init(&bio, sector->dev->bdev, &bvec, 1, REQ_OP_READ);
1460 bio_add_page(&bio, sector->page, fs_info->sectorsize, 0);
1461 bio.bi_iter.bi_sector = sector->physical >> 9;
1463 btrfsic_check_bio(&bio);
1464 if (submit_bio_wait(&bio)) {
1465 sector->io_error = 1;
1466 sblock->no_io_error_seen = 0;
1472 if (sblock->no_io_error_seen)
1473 scrub_recheck_block_checksum(sblock);
/*
 * Compare @fsid against the fsid of the fs_devices that own the device of
 * @sector, over BTRFS_FSID_SIZE bytes.
 * NOTE(review): the return statement is not visible in this listing;
 * callers in this file treat a zero result as a mismatch - confirm.
 */
1476 static inline int scrub_check_fsid(u8 fsid[], struct scrub_sector *sector)
1478 struct btrfs_fs_devices *fs_devices = sector->dev->fs_devices;
1481 ret = memcmp(fsid, fs_devices->fsid, BTRFS_FSID_SIZE);
/*
 * Reset the header/checksum/generation error flags of @sblock and verify
 * it again: data blocks (BTRFS_EXTENT_FLAG_DATA on the first sector) go
 * through scrub_checksum_data(), the other visible path calls
 * scrub_checksum_tree_block().
 */
1485 static void scrub_recheck_block_checksum(struct scrub_block *sblock)
1487 sblock->header_error = 0;
1488 sblock->checksum_error = 0;
1489 sblock->generation_error = 0;
1491 if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_DATA)
1492 scrub_checksum_data(sblock);
1494 scrub_checksum_tree_block(sblock);
/*
 * Repair @sblock_bad from @sblock_good by calling
 * scrub_repair_sector_from_good_copy() for every sector of the bad block.
 * NOTE(review): the remaining call arguments and the way per-sector results
 * are folded into the return value are not visible in this listing.
 */
1497 static int scrub_repair_block_from_good_copy(struct scrub_block *sblock_bad,
1498 struct scrub_block *sblock_good)
1503 for (i = 0; i < sblock_bad->sector_count; i++) {
1506 ret_sub = scrub_repair_sector_from_good_copy(sblock_bad,
/*
 * Overwrite one sector of @sblock_bad with the page contents of the sector
 * at the same index in @sblock_good.
 *
 * @sector_num:  index of the sector inside both blocks
 * @force_write: write even if no error is flagged for the bad block/sector
 *
 * The write is only issued when @force_write is set or the bad block has a
 * header or checksum error or the bad sector has an I/O error.  It is a
 * synchronous single-page write at sector_bad->physical.  The visible tail
 * bumps BTRFS_DEV_STAT_WRITE_ERRS and dev_replace.num_write_errors
 * (presumably when submit_bio_wait() fails - the condition is not visible).
 * A device without a bdev only triggers a rate-limited warning.
 * NOTE(review): return values are not visible in this listing.
 */
1515 static int scrub_repair_sector_from_good_copy(struct scrub_block *sblock_bad,
1516 struct scrub_block *sblock_good,
1517 int sector_num, int force_write)
1519 struct scrub_sector *sector_bad = sblock_bad->sectors[sector_num];
1520 struct scrub_sector *sector_good = sblock_good->sectors[sector_num];
1521 struct btrfs_fs_info *fs_info = sblock_bad->sctx->fs_info;
1522 const u32 sectorsize = fs_info->sectorsize;
1524 BUG_ON(sector_bad->page == NULL);
1525 BUG_ON(sector_good->page == NULL);
1526 if (force_write || sblock_bad->header_error ||
1527 sblock_bad->checksum_error || sector_bad->io_error) {
1529 struct bio_vec bvec;
1532 if (!sector_bad->dev->bdev) {
1533 btrfs_warn_rl(fs_info,
1534 "scrub_repair_page_from_good_copy(bdev == NULL) is unexpected");
1538 bio_init(&bio, sector_bad->dev->bdev, &bvec, 1, REQ_OP_WRITE);
1539 bio.bi_iter.bi_sector = sector_bad->physical >> 9;
1540 __bio_add_page(&bio, sector_good->page, sectorsize, 0);
1542 btrfsic_check_bio(&bio);
1543 ret = submit_bio_wait(&bio);
1547 btrfs_dev_stat_inc_and_print(sector_bad->dev,
1548 BTRFS_DEV_STAT_WRITE_ERRS);
1549 atomic64_inc(&fs_info->dev_replace.num_write_errors);
/*
 * Queue every sector of @sblock for writing to the dev-replace target via
 * scrub_write_sector_to_dev_replace().  Blocks that carry a sparity pointer
 * are tested for first (see the existing comment below).  The visible loop
 * increments dev_replace.num_write_errors after the per-sector call
 * (presumably only on failure - the condition is not visible).
 */
1557 static void scrub_write_block_to_dev_replace(struct scrub_block *sblock)
1559 struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
1563 * This block is used for the check of the parity on the source device,
1564 * so the data needn't be written into the destination device.
1566 if (sblock->sparity)
1569 for (i = 0; i < sblock->sector_count; i++) {
1572 ret = scrub_write_sector_to_dev_replace(sblock, i);
1574 atomic64_inc(&fs_info->dev_replace.num_write_errors);
/*
 * Add sector @sector_num of @sblock to the current write bio.  A sector
 * flagged with io_error has its page zeroed first, so zeroes are written in
 * place of data that could not be read.
 *
 * Returns the result of scrub_add_sector_to_wr_bio().
 */
1578 static int scrub_write_sector_to_dev_replace(struct scrub_block *sblock, int sector_num)
1580 struct scrub_sector *sector = sblock->sectors[sector_num];
1582 BUG_ON(sector->page == NULL);
1583 if (sector->io_error)
1584 clear_page(page_address(sector->page));
1586 return scrub_add_sector_to_wr_bio(sblock->sctx, sector);
/*
 * On a zoned filesystem, zero-fill the range between the tracked write
 * pointer (sctx->write_pointer) and @physical on the write target, then
 * advance the tracked pointer to @physical.
 *
 * Non-zoned filesystems and non-sequential positions on the target device
 * are tested for up front.
 * NOTE(review): the early-return statements and the final return value are
 * not visible in this listing.
 */
1589 static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical)
1594 if (!btrfs_is_zoned(sctx->fs_info))
1597 if (!btrfs_dev_is_sequential(sctx->wr_tgtdev, physical))
1600 if (sctx->write_pointer < physical) {
1601 length = physical - sctx->write_pointer;
1603 ret = btrfs_zoned_issue_zeroout(sctx->wr_tgtdev,
1604 sctx->write_pointer, length);
1606 sctx->write_pointer = physical;
/*
 * Append @sector to the write bio being assembled in sctx->wr_curr_bio,
 * under sctx->wr_lock.
 *
 * - The scrub_bio is allocated on demand.
 * - An empty scrub_bio is set up from the sector: fill_writer_pointer_gap()
 *   is called for the target position, the physical/logical start and the
 *   target device are recorded, and a REQ_OP_WRITE bio is allocated.
 * - A non-empty scrub_bio is submitted with scrub_wr_submit() when the new
 *   sector is not contiguous with it (physically or logically).
 * - If bio_add_page() cannot take the whole sector, the bio is submitted as
 *   well; a bio holding no sectors at that point is handled separately.
 * - On success the sector is stored, an extra reference is taken on it, and
 *   a bio that has reached sctx->sectors_per_bio is submitted.
 *
 * NOTE(review): return values and the retry jumps after the intermediate
 * scrub_wr_submit() calls are not visible in this listing.
 */
1611 static int scrub_add_sector_to_wr_bio(struct scrub_ctx *sctx,
1612 struct scrub_sector *sector)
1614 struct scrub_bio *sbio;
1616 const u32 sectorsize = sctx->fs_info->sectorsize;
1618 mutex_lock(&sctx->wr_lock);
1620 if (!sctx->wr_curr_bio) {
1621 sctx->wr_curr_bio = kzalloc(sizeof(*sctx->wr_curr_bio),
1623 if (!sctx->wr_curr_bio) {
1624 mutex_unlock(&sctx->wr_lock);
1627 sctx->wr_curr_bio->sctx = sctx;
1628 sctx->wr_curr_bio->sector_count = 0;
1630 sbio = sctx->wr_curr_bio;
1631 if (sbio->sector_count == 0) {
1632 ret = fill_writer_pointer_gap(sctx, sector->physical_for_dev_replace);
1634 mutex_unlock(&sctx->wr_lock);
1638 sbio->physical = sector->physical_for_dev_replace;
1639 sbio->logical = sector->logical;
1640 sbio->dev = sctx->wr_tgtdev;
1642 sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
1643 REQ_OP_WRITE, GFP_NOFS);
1645 sbio->bio->bi_private = sbio;
1646 sbio->bio->bi_end_io = scrub_wr_bio_end_io;
1647 sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
1649 } else if (sbio->physical + sbio->sector_count * sectorsize !=
1650 sector->physical_for_dev_replace ||
1651 sbio->logical + sbio->sector_count * sectorsize !=
1653 scrub_wr_submit(sctx);
1657 ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0);
1658 if (ret != sectorsize) {
1659 if (sbio->sector_count < 1) {
1662 mutex_unlock(&sctx->wr_lock);
1665 scrub_wr_submit(sctx);
1669 sbio->sectors[sbio->sector_count] = sector;
1670 scrub_sector_get(sector);
1671 sbio->sector_count++;
1672 if (sbio->sector_count == sctx->sectors_per_bio)
1673 scrub_wr_submit(sctx);
1674 mutex_unlock(&sctx->wr_lock);
/*
 * Submit the write bio currently being assembled, if there is one.
 *
 * sctx->wr_curr_bio is detached (set to NULL) before submission so that the
 * next sector starts a fresh bio, and the pending-bio count is raised for
 * the in-flight write.  On zoned filesystems the tracked write pointer is
 * moved to the end of the range just submitted.
 * Callers visible in this file hold sctx->wr_lock around this call.
 */
1679 static void scrub_wr_submit(struct scrub_ctx *sctx)
1681 struct scrub_bio *sbio;
1683 if (!sctx->wr_curr_bio)
1686 sbio = sctx->wr_curr_bio;
1687 sctx->wr_curr_bio = NULL;
1688 scrub_pending_bio_inc(sctx);
1689 /* process all writes in a single worker thread. Then the block layer
1690 * orders the requests before sending them to the driver which
1691 * doubled the write performance on spinning disks when measured
1693 btrfsic_check_bio(sbio->bio);
1694 submit_bio(sbio->bio);
1696 if (btrfs_is_zoned(sctx->fs_info))
1697 sctx->write_pointer = sbio->physical + sbio->sector_count *
1698 sctx->fs_info->sectorsize;
/*
 * Completion callback for dev-replace write bios: record the bio status in
 * the scrub_bio and defer the rest of the handling to
 * scrub_wr_bio_end_io_worker() on the scrub_wr_completion_workers
 * workqueue.
 */
1701 static void scrub_wr_bio_end_io(struct bio *bio)
1703 struct scrub_bio *sbio = bio->bi_private;
1704 struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
1706 sbio->status = bio->bi_status;
1709 INIT_WORK(&sbio->work, scrub_wr_bio_end_io_worker);
1710 queue_work(fs_info->scrub_wr_completion_workers, &sbio->work);
/*
 * Work item run after a dev-replace write bio has completed.
 *
 * The first loop marks each sector of the bio with io_error and counts a
 * dev-replace write error per sector (presumably only when sbio->status
 * reports a failure - the condition is not visible in this listing).  The
 * second loop drops the sector references taken when the sectors were
 * added to the bio, and finally the pending-bio count is decremented.
 */
1713 static void scrub_wr_bio_end_io_worker(struct work_struct *work)
1715 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
1716 struct scrub_ctx *sctx = sbio->sctx;
1719 ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
1721 struct btrfs_dev_replace *dev_replace =
1722 &sbio->sctx->fs_info->dev_replace;
1724 for (i = 0; i < sbio->sector_count; i++) {
1725 struct scrub_sector *sector = sbio->sectors[i];
1727 sector->io_error = 1;
1728 atomic64_inc(&dev_replace->num_write_errors);
1732 for (i = 0; i < sbio->sector_count; i++)
1733 scrub_sector_put(sbio->sectors[i]);
1737 scrub_pending_bio_dec(sctx);
/*
 * Verify @sblock according to the extent flags of its first sector: data,
 * tree block or super block.  The error flags of the block are cleared
 * first.  The super block result is deliberately ignored ((void) cast).
 * scrub_handle_errored_block() is called from here as well (presumably
 * when the data or tree-block check reports an error - the condition is
 * not visible in this listing).
 * NOTE(review): the return statement is not visible; the caller visible in
 * this file treats a non-zero result as "corrupted".
 */
1740 static int scrub_checksum(struct scrub_block *sblock)
1746 * No need to initialize these stats currently,
1747 * because this function only use return value
1748 * instead of these stats value.
1753 sblock->header_error = 0;
1754 sblock->generation_error = 0;
1755 sblock->checksum_error = 0;
1757 WARN_ON(sblock->sector_count < 1);
1758 flags = sblock->sectors[0]->flags;
1760 if (flags & BTRFS_EXTENT_FLAG_DATA)
1761 ret = scrub_checksum_data(sblock);
1762 else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)
1763 ret = scrub_checksum_tree_block(sblock);
1764 else if (flags & BTRFS_EXTENT_FLAG_SUPER)
1765 (void)scrub_checksum_super(sblock);
1769 scrub_handle_errored_block(sblock);
/*
 * Verify the data checksum of the first sector of @sblock.
 *
 * The digest of one sectorsize worth of page data is computed with the
 * filesystem's csum_shash and compared against sector->csum over
 * fs_info->csum_size bytes; a mismatch sets sblock->checksum_error.
 * Sectors without a stored checksum (have_csum == 0) are tested for first.
 *
 * Returns sblock->checksum_error on the visible return path.
 * NOTE(review): the value returned for the no-checksum case is not visible
 * in this listing.
 */
1774 static int scrub_checksum_data(struct scrub_block *sblock)
1776 struct scrub_ctx *sctx = sblock->sctx;
1777 struct btrfs_fs_info *fs_info = sctx->fs_info;
1778 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1779 u8 csum[BTRFS_CSUM_SIZE];
1780 struct scrub_sector *sector;
1783 BUG_ON(sblock->sector_count < 1);
1784 sector = sblock->sectors[0];
1785 if (!sector->have_csum)
1788 kaddr = page_address(sector->page);
1790 shash->tfm = fs_info->csum_shash;
1791 crypto_shash_init(shash);
1794 * In scrub_sectors() and scrub_sectors_for_parity() we ensure each sector
1795 * only contains one sector of data.
1797 crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
1799 if (memcmp(csum, sector->csum, fs_info->csum_size))
1800 sblock->checksum_error = 1;
1801 return sblock->checksum_error;
/*
 * Verify a metadata block held in @sblock.
 *
 * Header checks against the btrfs_header at the start of the first page:
 *   - bytenr must equal the logical address of the first sector,
 *   - generation must equal the expected generation (a mismatch also sets
 *     generation_error),
 *   - fsid is checked through scrub_check_fsid(),
 *   - chunk_tree_uuid is compared with fs_info->chunk_tree_uuid.
 * Any failure sets sblock->header_error.
 *
 * The checksum covers the first sector minus its leading BTRFS_CSUM_SIZE
 * bytes followed by all remaining sectors of the node, and is compared with
 * the checksum copied out of the header; a mismatch sets checksum_error.
 *
 * Returns non-zero if a header or checksum error was found.
 */
1804 static int scrub_checksum_tree_block(struct scrub_block *sblock)
1806 struct scrub_ctx *sctx = sblock->sctx;
1807 struct btrfs_header *h;
1808 struct btrfs_fs_info *fs_info = sctx->fs_info;
1809 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1810 u8 calculated_csum[BTRFS_CSUM_SIZE];
1811 u8 on_disk_csum[BTRFS_CSUM_SIZE];
1813 * This is done in sectorsize steps even for metadata as there's a
1814 * constraint for nodesize to be aligned to sectorsize. This will need
1815 * to change so we don't misuse data and metadata units like that.
1817 const u32 sectorsize = sctx->fs_info->sectorsize;
1818 const int num_sectors = fs_info->nodesize >> fs_info->sectorsize_bits;
1820 struct scrub_sector *sector;
1823 BUG_ON(sblock->sector_count < 1);
1825 /* Each member in sectors is just one sector */
1826 ASSERT(sblock->sector_count == num_sectors);
1828 sector = sblock->sectors[0];
1829 kaddr = page_address(sector->page);
1830 h = (struct btrfs_header *)kaddr;
1831 memcpy(on_disk_csum, h->csum, sctx->fs_info->csum_size);
1834 * we don't use the getter functions here, as we
1835 * a) don't have an extent buffer and
1836 * b) the page is already kmapped
1838 if (sector->logical != btrfs_stack_header_bytenr(h))
1839 sblock->header_error = 1;
1841 if (sector->generation != btrfs_stack_header_generation(h)) {
1842 sblock->header_error = 1;
1843 sblock->generation_error = 1;
1846 if (!scrub_check_fsid(h->fsid, sector))
1847 sblock->header_error = 1;
1849 if (memcmp(h->chunk_tree_uuid, fs_info->chunk_tree_uuid,
1851 sblock->header_error = 1;
1853 shash->tfm = fs_info->csum_shash;
1854 crypto_shash_init(shash);
1855 crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
1856 sectorsize - BTRFS_CSUM_SIZE);
1858 for (i = 1; i < num_sectors; i++) {
1859 kaddr = page_address(sblock->sectors[i]->page);
1860 crypto_shash_update(shash, kaddr, sectorsize);
1863 crypto_shash_final(shash, calculated_csum);
1864 if (memcmp(calculated_csum, on_disk_csum, sctx->fs_info->csum_size))
1865 sblock->checksum_error = 1;
1867 return sblock->header_error || sblock->checksum_error;
/*
 * Verify a super block copy held in the first sector of @sblock.
 *
 * Checked: bytenr against the sector's logical address, generation against
 * the expected generation, the fsid via scrub_check_fsid(), and the
 * checksum over BTRFS_SUPER_INFO_SIZE minus the leading BTRFS_CSUM_SIZE
 * bytes.  When any failure was counted, stat.super_errors is incremented
 * and device statistics are bumped for corruption and/or generation errors.
 *
 * Returns fail_cor + fail_gen, i.e. non-zero when a problem was found.
 * NOTE(review): the statements that update fail_cor/fail_gen are not
 * visible in this listing.
 */
1870 static int scrub_checksum_super(struct scrub_block *sblock)
1872 struct btrfs_super_block *s;
1873 struct scrub_ctx *sctx = sblock->sctx;
1874 struct btrfs_fs_info *fs_info = sctx->fs_info;
1875 SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
1876 u8 calculated_csum[BTRFS_CSUM_SIZE];
1877 struct scrub_sector *sector;
1882 BUG_ON(sblock->sector_count < 1);
1883 sector = sblock->sectors[0];
1884 kaddr = page_address(sector->page);
1885 s = (struct btrfs_super_block *)kaddr;
1887 if (sector->logical != btrfs_super_bytenr(s))
1890 if (sector->generation != btrfs_super_generation(s))
1893 if (!scrub_check_fsid(s->fsid, sector))
1896 shash->tfm = fs_info->csum_shash;
1897 crypto_shash_init(shash);
1898 crypto_shash_digest(shash, kaddr + BTRFS_CSUM_SIZE,
1899 BTRFS_SUPER_INFO_SIZE - BTRFS_CSUM_SIZE, calculated_csum);
1901 if (memcmp(calculated_csum, s->csum, sctx->fs_info->csum_size))
1904 if (fail_cor + fail_gen) {
1906 * if we find an error in a super block, we just report it.
1907 * They will get written with the next transaction commit
1910 spin_lock(&sctx->stat_lock);
1911 ++sctx->stat.super_errors;
1912 spin_unlock(&sctx->stat_lock);
1914 btrfs_dev_stat_inc_and_print(sector->dev,
1915 BTRFS_DEV_STAT_CORRUPTION_ERRS);
1917 btrfs_dev_stat_inc_and_print(sector->dev,
1918 BTRFS_DEV_STAT_GENERATION_ERRS);
1921 return fail_cor + fail_gen;
/* Take an additional reference on @sblock. */
1924 static void scrub_block_get(struct scrub_block *sblock)
1926 refcount_inc(&sblock->refs);
/*
 * Drop a reference on @sblock.  When the last reference goes away the
 * attached parity context (if any) is released with scrub_parity_put() and
 * the reference held on every sector of the block is dropped.
 * NOTE(review): freeing of the scrub_block itself is not visible in this
 * listing.
 */
1929 static void scrub_block_put(struct scrub_block *sblock)
1931 if (refcount_dec_and_test(&sblock->refs)) {
1934 if (sblock->sparity)
1935 scrub_parity_put(sblock->sparity);
1937 for (i = 0; i < sblock->sector_count; i++)
1938 scrub_sector_put(sblock->sectors[i]);
/* Take an additional reference on @sector. */
1943 static void scrub_sector_get(struct scrub_sector *sector)
1945 atomic_inc(&sector->refs);
/*
 * Drop a reference on @sector.  When the count reaches zero the page backing
 * the sector is released with __free_page().
 * NOTE(review): any further cleanup of the sector structure is not visible
 * in this listing.
 */
1948 static void scrub_sector_put(struct scrub_sector *sector)
1950 if (atomic_dec_and_test(&sector->refs)) {
1952 __free_page(sector->page);
1958 * Throttling of IO submission, bandwidth-limit based, the timeslice is 1
1959 * second. Limit can be set via /sys/fs/UUID/devinfo/devid/scrub_speed_max.
/*
 * Rate-limit read submission for the current scrub bio against the
 * device's scrub_speed_max setting.
 *
 * The 1000 ms time slice is split into "div" intervals: one per 16 MiB of
 * bandwidth limit, clamped to the range 1..64.  Each interval may carry
 * bwlimit / div bytes.  While the deadline of the current interval has not
 * passed, the size of the bio is added to throttle_sent; once that budget
 * is exceeded the task sleeps (interruptibly) for the remainder of the
 * interval.  Clearing throttle_deadline makes the next call start a new
 * interval.
 * NOTE(review): the early-exit statements (for example when no limit is
 * configured) are not visible in this listing.
 */
1961 static void scrub_throttle(struct scrub_ctx *sctx)
1963 const int time_slice = 1000;
1964 struct scrub_bio *sbio;
1965 struct btrfs_device *device;
1971 sbio = sctx->bios[sctx->curr];
1973 bwlimit = READ_ONCE(device->scrub_speed_max);
1978 * Slice is divided into intervals when the IO is submitted, adjust by
1979 * bwlimit and maximum of 64 intervals.
1981 div = max_t(u32, 1, (u32)(bwlimit / (16 * 1024 * 1024)));
1982 div = min_t(u32, 64, div);
1984 /* Start new epoch, set deadline */
1986 if (sctx->throttle_deadline == 0) {
1987 sctx->throttle_deadline = ktime_add_ms(now, time_slice / div);
1988 sctx->throttle_sent = 0;
1991 /* Still in the time to send? */
1992 if (ktime_before(now, sctx->throttle_deadline)) {
1993 /* If current bio is within the limit, send it */
1994 sctx->throttle_sent += sbio->bio->bi_iter.bi_size;
1995 if (sctx->throttle_sent <= div_u64(bwlimit, div))
1998 /* We're over the limit, sleep until the rest of the slice */
1999 delta = ktime_ms_delta(sctx->throttle_deadline, now);
2001 /* New request after deadline, start new epoch */
2008 timeout = div_u64(delta * HZ, 1000);
2009 schedule_timeout_interruptible(timeout);
2012 /* Next call will start the deadline period */
2013 sctx->throttle_deadline = 0;
/*
 * Submit the read bio currently being assembled (sctx->bios[sctx->curr]).
 * sctx->curr == -1 means there is no current bio.  Submission is throttled
 * through scrub_throttle() and accounted with scrub_pending_bio_inc().
 * NOTE(review): the statement that resets sctx->curr is not visible in this
 * listing.
 */
2016 static void scrub_submit(struct scrub_ctx *sctx)
2018 struct scrub_bio *sbio;
2020 if (sctx->curr == -1)
2023 scrub_throttle(sctx);
2025 sbio = sctx->bios[sctx->curr];
2027 scrub_pending_bio_inc(sctx);
2028 btrfsic_check_bio(sbio->bio);
2029 submit_bio(sbio->bio);
/*
 * Append @sector to the read bio being assembled for @sctx.
 *
 * If there is no current scrub_bio, one is taken from the free list
 * (sctx->first_free, protected by list_lock); when the list is empty the
 * caller sleeps on list_wait until an entry is returned.  An empty
 * scrub_bio is initialised from the sector (physical, logical, device) and
 * gets a fresh REQ_OP_READ bio.  A sector that is not contiguous with the
 * bio, or that belongs to a different device, is tested for in the else-if
 * branch.  After the page has been added, the owning block gains a
 * reference and its outstanding_sectors count is incremented.
 *
 * NOTE(review): the submit/retry statements and return values are not
 * visible in this listing.
 */
2032 static int scrub_add_sector_to_rd_bio(struct scrub_ctx *sctx,
2033 struct scrub_sector *sector)
2035 struct scrub_block *sblock = sector->sblock;
2036 struct scrub_bio *sbio;
2037 const u32 sectorsize = sctx->fs_info->sectorsize;
2042 * grab a fresh bio or wait for one to become available
2044 while (sctx->curr == -1) {
2045 spin_lock(&sctx->list_lock);
2046 sctx->curr = sctx->first_free;
2047 if (sctx->curr != -1) {
2048 sctx->first_free = sctx->bios[sctx->curr]->next_free;
2049 sctx->bios[sctx->curr]->next_free = -1;
2050 sctx->bios[sctx->curr]->sector_count = 0;
2051 spin_unlock(&sctx->list_lock);
2053 spin_unlock(&sctx->list_lock);
2054 wait_event(sctx->list_wait, sctx->first_free != -1);
2057 sbio = sctx->bios[sctx->curr];
2058 if (sbio->sector_count == 0) {
2059 sbio->physical = sector->physical;
2060 sbio->logical = sector->logical;
2061 sbio->dev = sector->dev;
2063 sbio->bio = bio_alloc(sbio->dev->bdev, sctx->sectors_per_bio,
2064 REQ_OP_READ, GFP_NOFS);
2066 sbio->bio->bi_private = sbio;
2067 sbio->bio->bi_end_io = scrub_bio_end_io;
2068 sbio->bio->bi_iter.bi_sector = sbio->physical >> 9;
2070 } else if (sbio->physical + sbio->sector_count * sectorsize !=
2072 sbio->logical + sbio->sector_count * sectorsize !=
2074 sbio->dev != sector->dev) {
2079 sbio->sectors[sbio->sector_count] = sector;
2080 ret = bio_add_page(sbio->bio, sector->page, sectorsize, 0);
2081 if (ret != sectorsize) {
2082 if (sbio->sector_count < 1) {
2091 scrub_block_get(sblock); /* one for the page added to the bio */
2092 atomic_inc(&sblock->outstanding_sectors);
2093 sbio->sector_count++;
2094 if (sbio->sector_count == sctx->sectors_per_bio)
/*
 * Completion callback for the RAID56 rebuild bio of a missing device.
 * Clears sblock->no_io_error_seen (presumably only when the bio reports an
 * error - the condition is not visible in this listing) and queues the
 * block's work item on the scrub_workers workqueue.
 */
2100 static void scrub_missing_raid56_end_io(struct bio *bio)
2102 struct scrub_block *sblock = bio->bi_private;
2103 struct btrfs_fs_info *fs_info = sblock->sctx->fs_info;
2106 sblock->no_io_error_seen = 0;
2110 queue_work(fs_info->scrub_workers, &sblock->work);
/*
 * Work item that finishes a RAID56 rebuild for a missing device.
 *
 * If the rebuild saw no I/O error the block is checksummed again.  The
 * outcome is then classified:
 *   - I/O error: stat.read_errors is incremented and an error is logged,
 *   - header or checksum error: stat.uncorrectable_errors is incremented
 *     and an error is logged,
 *   - remaining visible path: the block is written to the dev-replace
 *     target.
 * With dev-replace and flush_all_writes set, the pending write bio is
 * submitted under wr_lock.  Finally the block reference and the pending-bio
 * count taken at submission time are released.
 */
2113 static void scrub_missing_raid56_worker(struct work_struct *work)
2115 struct scrub_block *sblock = container_of(work, struct scrub_block, work);
2116 struct scrub_ctx *sctx = sblock->sctx;
2117 struct btrfs_fs_info *fs_info = sctx->fs_info;
2119 struct btrfs_device *dev;
2121 logical = sblock->sectors[0]->logical;
2122 dev = sblock->sectors[0]->dev;
2124 if (sblock->no_io_error_seen)
2125 scrub_recheck_block_checksum(sblock);
2127 if (!sblock->no_io_error_seen) {
2128 spin_lock(&sctx->stat_lock);
2129 sctx->stat.read_errors++;
2130 spin_unlock(&sctx->stat_lock);
2131 btrfs_err_rl_in_rcu(fs_info,
2132 "IO error rebuilding logical %llu for dev %s",
2133 logical, rcu_str_deref(dev->name));
2134 } else if (sblock->header_error || sblock->checksum_error) {
2135 spin_lock(&sctx->stat_lock);
2136 sctx->stat.uncorrectable_errors++;
2137 spin_unlock(&sctx->stat_lock);
2138 btrfs_err_rl_in_rcu(fs_info,
2139 "failed to rebuild valid logical %llu for dev %s",
2140 logical, rcu_str_deref(dev->name));
2142 scrub_write_block_to_dev_replace(sblock);
2145 if (sctx->is_dev_replace && sctx->flush_all_writes) {
2146 mutex_lock(&sctx->wr_lock);
2147 scrub_wr_submit(sctx);
2148 mutex_unlock(&sctx->wr_lock);
2151 scrub_block_put(sblock);
2152 scrub_pending_bio_dec(sctx);
/*
 * Rebuild the contents of @sblock for a missing device from RAID56 parity.
 *
 * The logical range of the block is mapped with
 * BTRFS_MAP_GET_READ_MIRRORS, a bio and a "missing" raid bio are allocated,
 * every sector page is registered with the raid bio, and the request is
 * submitted.  Completion is handled asynchronously by
 * scrub_missing_raid56_end_io() and scrub_missing_raid56_worker(), for
 * which a block reference and a pending-bio count are taken here.
 *
 * The trailing statements release the bio counter and the bioc and count a
 * malloc error.
 * NOTE(review): the labels/gotos selecting that tail are not visible in
 * this listing; presumably it is the failure path.
 */
2155 static void scrub_missing_raid56_pages(struct scrub_block *sblock)
2157 struct scrub_ctx *sctx = sblock->sctx;
2158 struct btrfs_fs_info *fs_info = sctx->fs_info;
2159 u64 length = sblock->sector_count << fs_info->sectorsize_bits;
2160 u64 logical = sblock->sectors[0]->logical;
2161 struct btrfs_io_context *bioc = NULL;
2163 struct btrfs_raid_bio *rbio;
2167 btrfs_bio_counter_inc_blocked(fs_info);
2168 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_GET_READ_MIRRORS, logical,
2170 if (ret || !bioc || !bioc->raid_map)
2173 if (WARN_ON(!sctx->is_dev_replace ||
2174 !(bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK))) {
2176 * We shouldn't be scrubbing a missing device. Even for dev
2177 * replace, we should only get here for RAID 5/6. We either
2178 * managed to mount something with no mirrors remaining or
2179 * there's a bug in scrub_find_good_copy()/btrfs_map_block().
2184 bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
2185 bio->bi_iter.bi_sector = logical >> 9;
2186 bio->bi_private = sblock;
2187 bio->bi_end_io = scrub_missing_raid56_end_io;
2189 rbio = raid56_alloc_missing_rbio(bio, bioc);
2193 for (i = 0; i < sblock->sector_count; i++) {
2194 struct scrub_sector *sector = sblock->sectors[i];
2197 * For now, our scrub is still one page per sector, so pgoff
2200 raid56_add_scrub_pages(rbio, sector->page, 0, sector->logical);
2203 INIT_WORK(&sblock->work, scrub_missing_raid56_worker);
2204 scrub_block_get(sblock);
2205 scrub_pending_bio_inc(sctx);
2206 raid56_submit_missing_rbio(rbio);
2212 btrfs_bio_counter_dec(fs_info);
2213 btrfs_put_bioc(bioc);
2214 spin_lock(&sctx->stat_lock);
2215 sctx->stat.malloc_errors++;
2216 spin_unlock(&sctx->stat_lock);
/*
 * Create a scrub_block covering @len bytes starting at @logical and queue
 * its sectors for reading.
 *
 * @physical:                 position on @dev to read from
 * @flags, @gen:              extent flags and generation stored per sector
 * @mirror_num:               mirror number recorded in each sector
 * @csum:                     checksum to copy into the sectors; the
 *                            caller visible in this file passes NULL when
 *                            there is none
 * @physical_for_dev_replace: position on the dev-replace target
 *
 * One scrub_sector plus one page is allocated per sectorsize chunk.
 * Allocation failures increment stat.malloc_errors.  If @dev is flagged
 * BTRFS_DEV_STATE_MISSING the block is handed to
 * scrub_missing_raid56_pages(); the other visible path adds every sector
 * to the read bio with scrub_add_sector_to_rd_bio().  The function's own
 * reference on the block is dropped at the end.
 * NOTE(review): return values are not visible in this listing.
 */
2219 static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
2220 u64 physical, struct btrfs_device *dev, u64 flags,
2221 u64 gen, int mirror_num, u8 *csum,
2222 u64 physical_for_dev_replace)
2224 struct scrub_block *sblock;
2225 const u32 sectorsize = sctx->fs_info->sectorsize;
2228 sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2230 spin_lock(&sctx->stat_lock);
2231 sctx->stat.malloc_errors++;
2232 spin_unlock(&sctx->stat_lock);
2236 /* one ref inside this function, plus one for each page added to
2238 refcount_set(&sblock->refs, 1);
2239 sblock->sctx = sctx;
2240 sblock->no_io_error_seen = 1;
2242 for (index = 0; len > 0; index++) {
2243 struct scrub_sector *sector;
2245 * Here we will allocate one page for one sector to scrub.
2246 * This is fine if PAGE_SIZE == sectorsize, but will cost
2247 * more memory for PAGE_SIZE > sectorsize case.
2249 u32 l = min(sectorsize, len);
2251 sector = kzalloc(sizeof(*sector), GFP_KERNEL);
2254 spin_lock(&sctx->stat_lock);
2255 sctx->stat.malloc_errors++;
2256 spin_unlock(&sctx->stat_lock);
2257 scrub_block_put(sblock);
2260 ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK);
2261 scrub_sector_get(sector);
2262 sblock->sectors[index] = sector;
2263 sector->sblock = sblock;
2265 sector->flags = flags;
2266 sector->generation = gen;
2267 sector->logical = logical;
2268 sector->physical = physical;
2269 sector->physical_for_dev_replace = physical_for_dev_replace;
2270 sector->mirror_num = mirror_num;
2272 sector->have_csum = 1;
2273 memcpy(sector->csum, csum, sctx->fs_info->csum_size);
2275 sector->have_csum = 0;
2277 sblock->sector_count++;
2278 sector->page = alloc_page(GFP_KERNEL);
2284 physical_for_dev_replace += l;
2287 WARN_ON(sblock->sector_count == 0);
2288 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2290 * This case should only be hit for RAID 5/6 device replace. See
2291 * the comment in scrub_missing_raid56_pages() for details.
2293 scrub_missing_raid56_pages(sblock);
2295 for (index = 0; index < sblock->sector_count; index++) {
2296 struct scrub_sector *sector = sblock->sectors[index];
2299 ret = scrub_add_sector_to_rd_bio(sctx, sector);
2301 scrub_block_put(sblock);
2306 if (flags & BTRFS_EXTENT_FLAG_SUPER)
2310 /* last one frees, either here or in bio completion for last page */
2311 scrub_block_put(sblock);
/*
 * Completion callback for scrub read bios: record the bio status in the
 * scrub_bio and queue its work item on the scrub_workers workqueue.
 */
2315 static void scrub_bio_end_io(struct bio *bio)
2317 struct scrub_bio *sbio = bio->bi_private;
2318 struct btrfs_fs_info *fs_info = sbio->dev->fs_info;
2320 sbio->status = bio->bi_status;
2323 queue_work(fs_info->scrub_workers, &sbio->work);
/*
 * Work item run after a scrub read bio has completed.
 *
 * The first loop flags every sector of the bio with io_error and clears
 * no_io_error_seen on the owning block (presumably only when sbio->status
 * reports a failure - the condition is not visible in this listing).
 * Then, for each sector, the owning block's outstanding_sectors count is
 * decremented; the block is completed when it reaches zero, and the
 * reference taken when the sector was added to the bio is dropped.
 * The scrub_bio is pushed back onto the free list under list_lock, pending
 * dev-replace writes are flushed if requested, and the pending-bio count is
 * decremented.
 */
2326 static void scrub_bio_end_io_worker(struct work_struct *work)
2328 struct scrub_bio *sbio = container_of(work, struct scrub_bio, work);
2329 struct scrub_ctx *sctx = sbio->sctx;
2332 ASSERT(sbio->sector_count <= SCRUB_SECTORS_PER_BIO);
2334 for (i = 0; i < sbio->sector_count; i++) {
2335 struct scrub_sector *sector = sbio->sectors[i];
2337 sector->io_error = 1;
2338 sector->sblock->no_io_error_seen = 0;
2342 /* Now complete the scrub_block items that have all pages completed */
2343 for (i = 0; i < sbio->sector_count; i++) {
2344 struct scrub_sector *sector = sbio->sectors[i];
2345 struct scrub_block *sblock = sector->sblock;
2347 if (atomic_dec_and_test(&sblock->outstanding_sectors))
2348 scrub_block_complete(sblock);
2349 scrub_block_put(sblock);
2354 spin_lock(&sctx->list_lock);
2355 sbio->next_free = sctx->first_free;
2356 sctx->first_free = sbio->index;
2357 spin_unlock(&sctx->list_lock);
2359 if (sctx->is_dev_replace && sctx->flush_all_writes) {
2360 mutex_lock(&sctx->wr_lock);
2361 scrub_wr_submit(sctx);
2362 mutex_unlock(&sctx->wr_lock);
2365 scrub_pending_bio_dec(sctx);
/*
 * Set the bits in @bitmap that correspond to the byte range described by
 * start and len inside the stripe tracked by @sparity.
 *
 * A length of at least one full stripe sets all sparity->nsectors bits.
 * Otherwise the start is made relative to sparity->logic_start, reduced
 * modulo the stripe length and converted to a sector offset.  A range that
 * would run past the last sector wraps around: the tail of the bitmap is
 * set first, then the remainder from bit 0.
 * NOTE(review): the rest of the parameter list is not visible in this
 * listing.
 */
2368 static inline void __scrub_mark_bitmap(struct scrub_parity *sparity,
2369 unsigned long *bitmap,
2374 u32 sectorsize_bits = sparity->sctx->fs_info->sectorsize_bits;
2376 if (len >= sparity->stripe_len) {
2377 bitmap_set(bitmap, 0, sparity->nsectors);
2381 start -= sparity->logic_start;
2382 start = div64_u64_rem(start, sparity->stripe_len, &offset);
2383 offset = offset >> sectorsize_bits;
2384 nsectors = len >> sectorsize_bits;
2386 if (offset + nsectors <= sparity->nsectors) {
2387 bitmap_set(bitmap, offset, nsectors);
2391 bitmap_set(bitmap, offset, sparity->nsectors - offset);
2392 bitmap_set(bitmap, 0, nsectors - (sparity->nsectors - offset));
/* Mark the given range in the error bitmap (ebitmap) of @sparity. */
2395 static inline void scrub_parity_mark_sectors_error(struct scrub_parity *sparity,
2398 __scrub_mark_bitmap(sparity, &sparity->ebitmap, start, len);
/* Mark the given range in the data bitmap (dbitmap) of @sparity. */
2401 static inline void scrub_parity_mark_sectors_data(struct scrub_parity *sparity,
2404 __scrub_mark_bitmap(sparity, &sparity->dbitmap, start, len);
/*
 * Called once all sectors of @sblock have finished reading.
 *
 * A block that saw an I/O error goes to scrub_handle_errored_block().
 * The other visible path verifies it with scrub_checksum() and, if it is
 * clean and this is a dev-replace run, writes it to the replace target.
 * For parity-tracked blocks that are corrupted and were not corrected, the
 * logical range of the block is marked in the parity error bitmap.
 */
2407 static void scrub_block_complete(struct scrub_block *sblock)
2411 if (!sblock->no_io_error_seen) {
2413 scrub_handle_errored_block(sblock);
2416 * if has checksum error, write via repair mechanism in
2417 * dev replace case, otherwise write here in dev replace
2420 corrupted = scrub_checksum(sblock);
2421 if (!corrupted && sblock->sctx->is_dev_replace)
2422 scrub_write_block_to_dev_replace(sblock);
2425 if (sblock->sparity && corrupted && !sblock->data_corrected) {
2426 u64 start = sblock->sectors[0]->logical;
2427 u64 end = sblock->sectors[sblock->sector_count - 1]->logical +
2428 sblock->sctx->fs_info->sectorsize;
2430 ASSERT(end - start <= U32_MAX);
2431 scrub_parity_mark_sectors_error(sblock->sparity,
2432 start, end - start);
/*
 * Remove @sum from the list it is linked on and account its sectors
 * (sum->len in sector units) in stat.csum_discards.
 * NOTE(review): freeing of @sum is not visible in this listing.
 */
2436 static void drop_csum_range(struct scrub_ctx *sctx, struct btrfs_ordered_sum *sum)
2438 sctx->stat.csum_discards += sum->len >> sctx->fs_info->sectorsize_bits;
2439 list_del(&sum->list);
2444 * Find the desired csum for range [logical, logical + sectorsize), and store
2445 * the csum into @csum.
2447 * The search source is sctx->csum_list, which is a pre-populated list
2448 * storing bytenr ordered csum ranges. We're responsible to cleanup any range
2449 * that is before @logical.
2451 * Return 0 if there is no csum for the range.
2452 * Return 1 if there is csum for the range and copied to @csum.
/*
 * Look up the checksum of the sector at @logical in sctx->csum_list and
 * copy it into @csum.
 *
 * The list is consumed from the front: ranges that end at or before
 * @logical are dropped, and a range that starts after @logical ends the
 * search.  When a covering range is found, the checksum at the sector
 * index inside that range is copied, and the range is dropped if that was
 * its last sector.
 * NOTE(review): the return statements are not visible in this listing.
 */
2454 static int scrub_find_csum(struct scrub_ctx *sctx, u64 logical, u8 *csum)
2458 while (!list_empty(&sctx->csum_list)) {
2459 struct btrfs_ordered_sum *sum = NULL;
2460 unsigned long index;
2461 unsigned long num_sectors;
2463 sum = list_first_entry(&sctx->csum_list,
2464 struct btrfs_ordered_sum, list);
2465 /* The current csum range is beyond our range, no csum found */
2466 if (sum->bytenr > logical)
2470 * The current sum is before our bytenr, since scrub is always
2471 * done in bytenr order, the csum will never be used anymore,
2472 * clean it up so that later calls won't bother with the range,
2473 * and continue search the next range.
2475 if (sum->bytenr + sum->len <= logical) {
2476 drop_csum_range(sctx, sum);
2480 /* Now the csum range covers our bytenr, copy the csum */
2482 index = (logical - sum->bytenr) >> sctx->fs_info->sectorsize_bits;
2483 num_sectors = sum->len >> sctx->fs_info->sectorsize_bits;
2485 memcpy(csum, sum->sums + index * sctx->fs_info->csum_size,
2486 sctx->fs_info->csum_size);
2488 /* Cleanup the range if we're at the end of the csum range */
2489 if (index == num_sectors - 1)
2490 drop_csum_range(sctx, sum);
2498 /* scrub extent tries to collect up to 64 kB for each bio */
/*
 * Scrub one extent by splitting it into blocks and passing each block to
 * scrub_sectors().
 *
 * The block size depends on the extent type: for data and tree blocks on
 * RAID56 it is map->stripe_len; the other visible assignments use
 * sectorsize (data, and the final fallback) and nodesize (tree blocks).
 * Per-type extent and byte counters are updated under stat_lock.
 *
 * For dev-replace with a source device that has no bdev, the read source
 * (device, physical offset, mirror) is switched to a good copy found by
 * scrub_find_good_copy(); the original @physical is still passed on as the
 * dev-replace target position.  For data extents the checksum is looked up
 * with scrub_find_csum() and stat.no_csum is counted (presumably when none
 * is found - the condition is not visible in this listing).
 * NOTE(review): the loop header, the cursor updates and the return
 * statements are not visible.
 */
2499 static int scrub_extent(struct scrub_ctx *sctx, struct map_lookup *map,
2500 u64 logical, u32 len,
2501 u64 physical, struct btrfs_device *dev, u64 flags,
2502 u64 gen, int mirror_num)
2504 struct btrfs_device *src_dev = dev;
2505 u64 src_physical = physical;
2506 int src_mirror = mirror_num;
2508 u8 csum[BTRFS_CSUM_SIZE];
2511 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2512 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2513 blocksize = map->stripe_len;
2515 blocksize = sctx->fs_info->sectorsize;
2516 spin_lock(&sctx->stat_lock);
2517 sctx->stat.data_extents_scrubbed++;
2518 sctx->stat.data_bytes_scrubbed += len;
2519 spin_unlock(&sctx->stat_lock);
2520 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2521 if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK)
2522 blocksize = map->stripe_len;
2524 blocksize = sctx->fs_info->nodesize;
2525 spin_lock(&sctx->stat_lock);
2526 sctx->stat.tree_extents_scrubbed++;
2527 sctx->stat.tree_bytes_scrubbed += len;
2528 spin_unlock(&sctx->stat_lock);
2530 blocksize = sctx->fs_info->sectorsize;
2535 * For dev-replace case, we can have @dev being a missing device.
2536 * Regular scrub will avoid its execution on missing device at all,
2537 * as that would trigger tons of read error.
2539 * Reading from missing device will cause read error counts to
2540 * increase unnecessarily.
2541 * So here we change the read source to a good mirror.
2543 if (sctx->is_dev_replace && !dev->bdev)
2544 scrub_find_good_copy(sctx->fs_info, logical, len, &src_physical,
2545 &src_dev, &src_mirror);
2547 u32 l = min(len, blocksize);
2550 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2551 /* push csums to sbio */
2552 have_csum = scrub_find_csum(sctx, logical, csum);
2554 ++sctx->stat.no_csum;
2556 ret = scrub_sectors(sctx, logical, l, src_physical, src_dev,
2557 flags, gen, src_mirror,
2558 have_csum ? csum : NULL, physical);
/*
 * Build one scrub_block for a parity scrub and queue its sectors for reading.
 *
 * @sparity:    parity context; supplies sctx and collects every sector on
 *              its sectors_list
 * @logical:    logical start of the range
 * @len:        length in bytes; must be sectorsize aligned (asserted)
 * @physical:   physical offset matching @logical
 * @dev:        device the sectors belong to (NOTE(review): its use is not
 *              visible in this view - confirm)
 * @flags:      extent flags stored in each sector
 * @gen:        extent generation stored in each sector
 * @mirror_num: mirror number stored in each sector
 * @csum:       csum copied into each sector (fs_info->csum_size bytes).
 *              NOTE(review): callers pass NULL when no csum exists; confirm
 *              the NULL test that selects between have_csum = 1 and 0.
 *
 * The block starts with one reference owned by this function and takes a
 * reference on @sparity. Each sector gets its own page and two references:
 * one for sblock->sectors[] and one for sparity->sectors_list.
 * logical/physical advance by sectorsize per sector. Afterwards every sector
 * is handed to scrub_add_sector_to_rd_bio(), and the function's own block
 * reference is dropped at the end.
 *
 * Allocation failures bump sctx->stat.malloc_errors under stat_lock.
 *
 * Fix: the list_add_tail() argument had been corrupted to a section sign
 * followed by "or->list"; it is restored to &sector->list.
 */
2569 static int scrub_sectors_for_parity(struct scrub_parity *sparity,
2570 u64 logical, u32 len,
2571 u64 physical, struct btrfs_device *dev,
2572 u64 flags, u64 gen, int mirror_num, u8 *csum)
2574 struct scrub_ctx *sctx = sparity->sctx;
2575 struct scrub_block *sblock;
2576 const u32 sectorsize = sctx->fs_info->sectorsize;
2579 ASSERT(IS_ALIGNED(len, sectorsize));
2581 sblock = kzalloc(sizeof(*sblock), GFP_KERNEL);
2583 spin_lock(&sctx->stat_lock);
2584 sctx->stat.malloc_errors++;
2585 spin_unlock(&sctx->stat_lock);
2589 /* one ref inside this function, plus one for each page added to
2591 refcount_set(&sblock->refs, 1);
2592 sblock->sctx = sctx;
2593 sblock->no_io_error_seen = 1;
2594 sblock->sparity = sparity;
2595 scrub_parity_get(sparity);
2597 for (index = 0; len > 0; index++) {
2598 struct scrub_sector *sector;
2600 sector = kzalloc(sizeof(*sector), GFP_KERNEL);
2603 spin_lock(&sctx->stat_lock);
2604 sctx->stat.malloc_errors++;
2605 spin_unlock(&sctx->stat_lock);
2606 scrub_block_put(sblock);
2609 ASSERT(index < SCRUB_MAX_SECTORS_PER_BLOCK);
2610 /* For scrub block */
2611 scrub_sector_get(sector);
2612 sblock->sectors[index] = sector;
2613 /* For scrub parity */
2614 scrub_sector_get(sector);
2615 list_add_tail(&sector->list, &sparity->sectors_list);
2616 sector->sblock = sblock;
2618 sector->flags = flags;
2619 sector->generation = gen;
2620 sector->logical = logical;
2621 sector->physical = physical;
2622 sector->mirror_num = mirror_num;
2624 sector->have_csum = 1;
2625 memcpy(sector->csum, csum, sctx->fs_info->csum_size);
2627 sector->have_csum = 0;
2629 sblock->sector_count++;
2630 sector->page = alloc_page(GFP_KERNEL);
2635 /* Iterate over the stripe range in sectorsize steps */
2637 logical += sectorsize;
2638 physical += sectorsize;
2641 WARN_ON(sblock->sector_count == 0);
2642 for (index = 0; index < sblock->sector_count; index++) {
2643 struct scrub_sector *sector = sblock->sectors[index];
2646 ret = scrub_add_sector_to_rd_bio(sctx, sector);
2648 scrub_block_put(sblock);
2653 /* Last one frees, either here or in bio completion for last sector */
2654 scrub_block_put(sblock);
/*
 * Feed the extent range starting at @logical to scrub_sectors_for_parity()
 * in pieces of min(len, blocksize) bytes.
 *
 * @sparity:    parity context; supplies sctx and stripe_len
 * @logical:    logical start of the range
 * @len:        length of the range in bytes
 * @physical:   physical offset matching @logical
 * @dev:        device holding the range
 * @flags:      extent flags
 * @gen:        extent generation
 * @mirror_num: mirror number
 *
 * When @dev has BTRFS_DEV_STATE_MISSING set, the range is reported through
 * scrub_parity_mark_sectors_error() (NOTE(review): presumably followed by an
 * early return - confirm).
 *
 * blocksize is sparity->stripe_len for both data and tree block extents and
 * sectorsize for anything else. For data extents scrub_find_csum() is asked
 * for a csum per piece; NULL is passed on when none was found.
 */
2658 static int scrub_extent_for_parity(struct scrub_parity *sparity,
2659 u64 logical, u32 len,
2660 u64 physical, struct btrfs_device *dev,
2661 u64 flags, u64 gen, int mirror_num)
2663 struct scrub_ctx *sctx = sparity->sctx;
2665 u8 csum[BTRFS_CSUM_SIZE];
2668 if (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state)) {
2669 scrub_parity_mark_sectors_error(sparity, logical, len);
2673 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2674 blocksize = sparity->stripe_len;
2675 } else if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
2676 blocksize = sparity->stripe_len;
2678 blocksize = sctx->fs_info->sectorsize;
2683 u32 l = min(len, blocksize);
2686 if (flags & BTRFS_EXTENT_FLAG_DATA) {
2687 /* push csums to sbio */
2688 have_csum = scrub_find_csum(sctx, logical, csum);
2692 ret = scrub_sectors_for_parity(sparity, logical, l, physical, dev,
2693 flags, gen, mirror_num,
2694 have_csum ? csum : NULL);
2706 * Given a physical address, this will calculate its
2707 * logical offset. If this is a parity stripe, it will return
2708 * the leftmost data stripe's logical offset.
2710 * return 0 if it is a data stripe, 1 means parity stripe.
/*
 * @physical: physical offset on the stripe being examined
 * @num:      index into map->stripes[] of that stripe
 * @map:      chunk mapping (stripe_len, num_stripes, stripes[])
 * @offset:   out: resulting offset; scrub_stripe() adds the chunk's logical
 *            start to it afterwards
 *
 * last_offset is the distance of @physical from the start of stripe @num
 * multiplied by the number of data stripes. *stripe_start also receives
 * last_offset (NOTE(review): the guard around that store is not visible).
 *
 * For each data stripe i, the candidate offset last_offset + i * stripe_len
 * is turned into a stripe number, which is divided by data_stripes and then
 * by num_stripes to obtain the rotation; the resulting stripe index is
 * compared against @num. If the loop falls through, *offset is recomputed
 * from the counter j (NOTE(review): j presumably counts candidates whose
 * index was below @num - confirm).
 */
2712 static int get_raid56_logic_offset(u64 physical, int num,
2713 struct map_lookup *map, u64 *offset,
2722 const int data_stripes = nr_data_stripes(map);
2724 last_offset = (physical - map->stripes[num].physical) * data_stripes;
2726 *stripe_start = last_offset;
2728 *offset = last_offset;
2729 for (i = 0; i < data_stripes; i++) {
2730 *offset = last_offset + i * map->stripe_len;
2732 stripe_nr = div64_u64(*offset, map->stripe_len);
2733 stripe_nr = div_u64(stripe_nr, data_stripes);
2735 /* Work out the disk rotation on this stripe-set */
2736 stripe_nr = div_u64_rem(stripe_nr, map->num_stripes, &rot);
2737 /* calculate which stripe this data locates */
2739 stripe_index = rot % map->num_stripes;
2740 if (stripe_index == num)
2742 if (stripe_index < num)
2745 *offset = last_offset + j * map->stripe_len;
/*
 * Account the errors recorded in @sparity and release its sectors.
 *
 * The number of bits set in sparity->ebitmap (over nsectors bits) is added
 * to both read_errors and uncorrectable_errors under stat_lock. Every sector
 * on sparity->sectors_list is then unlinked and its reference dropped with
 * scrub_sector_put().
 */
2749 static void scrub_free_parity(struct scrub_parity *sparity)
2751 struct scrub_ctx *sctx = sparity->sctx;
2752 struct scrub_sector *curr, *next;
2755 nbits = bitmap_weight(&sparity->ebitmap, sparity->nsectors);
2757 spin_lock(&sctx->stat_lock);
2758 sctx->stat.read_errors += nbits;
2759 sctx->stat.uncorrectable_errors += nbits;
2760 spin_unlock(&sctx->stat_lock);
2763 list_for_each_entry_safe(curr, next, &sparity->sectors_list, list) {
2764 list_del_init(&curr->list);
2765 scrub_sector_put(curr);
/*
 * Work function installed by scrub_parity_bio_endio().
 *
 * Recovers the scrub_parity from @work with container_of(), runs
 * scrub_free_parity() on it and then calls scrub_pending_bio_dec(), which
 * pairs with the scrub_pending_bio_inc() in scrub_parity_check_and_repair().
 * sctx is read from the parity context before scrub_free_parity() is called
 * and is used after it.
 */
2771 static void scrub_parity_bio_endio_worker(struct work_struct *work)
2773 struct scrub_parity *sparity = container_of(work, struct scrub_parity,
2775 struct scrub_ctx *sctx = sparity->sctx;
2777 scrub_free_parity(sparity);
2778 scrub_pending_bio_dec(sctx);
/*
 * Completion callback installed as bi_end_io by
 * scrub_parity_check_and_repair(); bio->bi_private carries the scrub_parity.
 *
 * dbitmap is OR-ed into ebitmap (NOTE(review): presumably only when the bio
 * failed - the guarding condition is not visible, confirm). The rest of the
 * completion is deferred to scrub_parity_bio_endio_worker() on
 * fs_info->scrub_parity_workers.
 */
2781 static void scrub_parity_bio_endio(struct bio *bio)
2783 struct scrub_parity *sparity = bio->bi_private;
2784 struct btrfs_fs_info *fs_info = sparity->sctx->fs_info;
2787 bitmap_or(&sparity->ebitmap, &sparity->ebitmap,
2788 &sparity->dbitmap, sparity->nsectors);
2792 INIT_WORK(&sparity->work, scrub_parity_bio_endio_worker);
2793 queue_work(fs_info->scrub_parity_workers, &sparity->work);
/*
 * Called from scrub_parity_put() when refcount_dec_and_test() reports that
 * the last reference on @sparity was dropped.
 *
 * Sectors flagged in ebitmap are first removed from dbitmap; when
 * bitmap_andnot() reports that nothing is left in dbitmap, a short path is
 * taken. Otherwise the range starting at logic_start is mapped with
 * btrfs_map_sblock(BTRFS_MAP_WRITE), a read bio is prepared with
 * scrub_parity_bio_endio() as completion handler and @sparity as private
 * data, and the rbio returned by raid56_parity_alloc_scrub_rbio() is
 * submitted after scrub_pending_bio_inc().
 *
 * The cleanup code at the end drops the bio counter and the bioc, marks
 * every sector still in dbitmap as errored, counts a malloc error and calls
 * scrub_free_parity(). NOTE(review): which failures reach which part of
 * that cleanup cannot be told from the lines shown - confirm the labels.
 */
2796 static void scrub_parity_check_and_repair(struct scrub_parity *sparity)
2798 struct scrub_ctx *sctx = sparity->sctx;
2799 struct btrfs_fs_info *fs_info = sctx->fs_info;
2801 struct btrfs_raid_bio *rbio;
2802 struct btrfs_io_context *bioc = NULL;
2806 if (!bitmap_andnot(&sparity->dbitmap, &sparity->dbitmap,
2807 &sparity->ebitmap, sparity->nsectors))
2810 length = sparity->logic_end - sparity->logic_start;
2812 btrfs_bio_counter_inc_blocked(fs_info);
2813 ret = btrfs_map_sblock(fs_info, BTRFS_MAP_WRITE, sparity->logic_start,
2815 if (ret || !bioc || !bioc->raid_map)
2818 bio = bio_alloc(NULL, BIO_MAX_VECS, REQ_OP_READ, GFP_NOFS);
2819 bio->bi_iter.bi_sector = sparity->logic_start >> 9;
2820 bio->bi_private = sparity;
2821 bio->bi_end_io = scrub_parity_bio_endio;
2823 rbio = raid56_parity_alloc_scrub_rbio(bio, bioc,
2830 scrub_pending_bio_inc(sctx);
2831 raid56_parity_submit_scrub_rbio(rbio);
2837 btrfs_bio_counter_dec(fs_info);
2838 btrfs_put_bioc(bioc);
2839 bitmap_or(&sparity->ebitmap, &sparity->ebitmap, &sparity->dbitmap,
2841 spin_lock(&sctx->stat_lock);
2842 sctx->stat.malloc_errors++;
2843 spin_unlock(&sctx->stat_lock);
2845 scrub_free_parity(sparity);
/* Take one additional reference on @sparity (sparity->refs). */
2848 static void scrub_parity_get(struct scrub_parity *sparity)
2850 refcount_inc(&sparity->refs);
/*
 * Drop one reference on @sparity. The refcount_dec_and_test() result decides
 * whether scrub_parity_check_and_repair() is run; that call is meant for the
 * drop that brings sparity->refs to zero.
 */
2853 static void scrub_parity_put(struct scrub_parity *sparity)
2855 if (!refcount_dec_and_test(&sparity->refs))
2858 scrub_parity_check_and_repair(sparity);
2862 * Return 0 if the extent item range covers any byte of the range.
2863 * Return <0 if the extent item is before @search_start.
2864 * Return >0 if the extent item is after @search_start + @search_len.
/*
 * @path:         path whose leaf and slot point at an EXTENT_ITEM or
 *                METADATA_ITEM key (asserted)
 * @search_start: start of the search range
 * @search_len:   length of the search range
 *
 * For a METADATA_ITEM the item length is fs_info->nodesize.
 * NOTE(review): for an EXTENT_ITEM the length presumably comes from
 * key.offset, as in get_extent_info() - confirm.
 *
 * The item counts as "before" when objectid + len <= search_start and as
 * "after" when objectid >= search_start + search_len.
 */
2866 static int compare_extent_item_range(struct btrfs_path *path,
2867 u64 search_start, u64 search_len)
2869 struct btrfs_fs_info *fs_info = path->nodes[0]->fs_info;
2871 struct btrfs_key key;
2873 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2874 ASSERT(key.type == BTRFS_EXTENT_ITEM_KEY ||
2875 key.type == BTRFS_METADATA_ITEM_KEY);
2876 if (key.type == BTRFS_METADATA_ITEM_KEY)
2877 len = fs_info->nodesize;
2881 if (key.objectid + len <= search_start)
2883 if (key.objectid >= search_start + search_len)
2889 * Locate one extent item which covers any byte in range
2890 * [@search_start, @search_start + @search_len)
2892 * If the path is not initialized, we will initialize the search by doing
2893 * a btrfs_search_slot().
2894 * If the path is already initialized, we will use the path as the initial
2895 * slot, to avoid duplicated btrfs_search_slot() calls.
2897 * NOTE: If an extent item starts before @search_start, we will still
2898 * return the extent item. This is for data extent crossing stripe boundary.
2900 * Return 0 if we found such extent item, and @path will point to the extent item.
2901 * Return >0 if no such extent item can be found, and @path will be released.
2902 * Return <0 if hit fatal error, and @path will be released.
/*
 * @extent_root:  extent tree to search
 * @path:         search path; may already point into the tree
 * @search_start: start of the wanted range
 * @search_len:   length of the wanted range
 *
 * Fresh search: a key (search_start, METADATA_ITEM or EXTENT_ITEM, -1) is
 * built - the type depends on the SKINNY_METADATA incompat flag - and looked
 * up with btrfs_search_slot(); btrfs_previous_extent_item() is then called
 * with min_objectid 0 so that an extent starting before @search_start can be
 * picked up.
 *
 * Forward scan (also the entry point when the path is reused): each key is
 * read from the current slot; the scan gives up once key.objectid reaches
 * search_start + search_len; keys that are neither METADATA_ITEM nor
 * EXTENT_ITEM are not compared; the rest go through
 * compare_extent_item_range(). When the slot runs past the last item of the
 * leaf, btrfs_next_leaf() is used. The path is released on the exits shown
 * at the end of the function.
 */
2904 static int find_first_extent_item(struct btrfs_root *extent_root,
2905 struct btrfs_path *path,
2906 u64 search_start, u64 search_len)
2908 struct btrfs_fs_info *fs_info = extent_root->fs_info;
2909 struct btrfs_key key;
2912 /* Continue using the existing path */
2914 goto search_forward;
2916 if (btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2917 key.type = BTRFS_METADATA_ITEM_KEY;
2919 key.type = BTRFS_EXTENT_ITEM_KEY;
2920 key.objectid = search_start;
2921 key.offset = (u64)-1;
2923 ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2929 * Here we intentionally pass 0 as @min_objectid, as there could be
2930 * an extent item starting before @search_start.
2932 ret = btrfs_previous_extent_item(extent_root, path, 0);
2936 * No matter whether we have found an extent item, the next loop will
2937 * properly do every check on the key.
2941 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2942 if (key.objectid >= search_start + search_len)
2944 if (key.type != BTRFS_METADATA_ITEM_KEY &&
2945 key.type != BTRFS_EXTENT_ITEM_KEY)
2948 ret = compare_extent_item_range(path, search_start, search_len);
2955 if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
2956 ret = btrfs_next_leaf(extent_root, path);
2958 /* Either no more item or fatal error */
2959 btrfs_release_path(path);
2964 btrfs_release_path(path);
/*
 * Decode the extent item that @path currently points at. The key type must
 * be METADATA_ITEM or EXTENT_ITEM (asserted).
 *
 * @path:             leaf/slot of the extent item
 * @extent_start_ret: out: key.objectid
 * @size_ret:         out: nodesize for METADATA_ITEM keys; key.offset is
 *                    stored for the other case
 * @flags_ret:        out: btrfs_extent_flags() of the item
 * @generation_ret:   out: btrfs_extent_generation() of the item
 */
2968 static void get_extent_info(struct btrfs_path *path, u64 *extent_start_ret,
2969 u64 *size_ret, u64 *flags_ret, u64 *generation_ret)
2971 struct btrfs_key key;
2972 struct btrfs_extent_item *ei;
2974 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
2975 ASSERT(key.type == BTRFS_METADATA_ITEM_KEY ||
2976 key.type == BTRFS_EXTENT_ITEM_KEY);
2977 *extent_start_ret = key.objectid;
2978 if (key.type == BTRFS_METADATA_ITEM_KEY)
2979 *size_ret = path->nodes[0]->fs_info->nodesize;
2981 *size_ret = key.offset;
2982 ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item);
2983 *flags_ret = btrfs_extent_flags(path->nodes[0], ei);
2984 *generation_ret = btrfs_extent_generation(path->nodes[0], ei);
/*
 * Return true when the extent [extent_start, extent_start + extent_len)
 * straddles either edge of [boundary_start, boundary_start + boudary_len),
 * i.e. it begins before that edge and ends after it. An extent lying
 * entirely inside, or entirely outside, the boundary range yields false.
 */
2987 static bool does_range_cross_boundary(u64 extent_start, u64 extent_len,
2988 u64 boundary_start, u64 boudary_len)
2990 return (extent_start < boundary_start &&
2991 extent_start + extent_len > boundary_start) ||
2992 (extent_start < boundary_start + boudary_len &&
2993 extent_start + extent_len > boundary_start + boudary_len);
/*
 * Scrub the extents of one RAID56 data stripe on behalf of a parity scrub.
 * The stripe is [logical, logical + map->stripe_len); @path must be empty on
 * entry (asserted) and is released before returning.
 *
 * Per iteration:
 *  - find_first_extent_item() locates the next extent in the rest of the
 *    stripe and get_extent_info() decodes it;
 *  - a tree block crossing the stripe boundary is logged, counted in
 *    uncorrectable_errors and skipped;
 *  - cur_logical is moved past any hole, and the extent is clamped to the
 *    stripe end;
 *  - the range is recorded with scrub_parity_mark_sectors_data();
 *  - btrfs_map_block(BTRFS_MAP_READ) supplies the physical offset, mirror
 *    number and device from bioc->stripes[0]; a missing bioc or a short
 *    mapping is turned into an error;
 *  - csums for the range are loaded into sctx->csum_list, the extent goes to
 *    scrub_extent_for_parity(), and the csum list is freed again.
 *
 * On the mapping, csum lookup and scrub failures shown, the range is flagged
 * via scrub_parity_mark_sectors_error().
 */
2996 static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx,
2997 struct scrub_parity *sparity,
2998 struct map_lookup *map,
2999 struct btrfs_device *sdev,
3000 struct btrfs_path *path,
3003 struct btrfs_fs_info *fs_info = sctx->fs_info;
3004 struct btrfs_root *extent_root = btrfs_extent_root(fs_info, logical);
3005 struct btrfs_root *csum_root = btrfs_csum_root(fs_info, logical);
3006 u64 cur_logical = logical;
3009 ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
3011 /* Path must not be populated */
3012 ASSERT(!path->nodes[0]);
3014 while (cur_logical < logical + map->stripe_len) {
3015 struct btrfs_io_context *bioc = NULL;
3016 struct btrfs_device *extent_dev;
3022 u64 extent_physical;
3023 u64 extent_mirror_num;
3025 ret = find_first_extent_item(extent_root, path, cur_logical,
3026 logical + map->stripe_len - cur_logical);
3027 /* No more extent item in this data stripe */
3034 get_extent_info(path, &extent_start, &extent_size, &extent_flags,
3037 /* Metadata should not cross stripe boundaries */
3038 if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3039 does_range_cross_boundary(extent_start, extent_size,
3040 logical, map->stripe_len)) {
3042 "scrub: tree block %llu spanning stripes, ignored. logical=%llu",
3043 extent_start, logical);
3044 spin_lock(&sctx->stat_lock);
3045 sctx->stat.uncorrectable_errors++;
3046 spin_unlock(&sctx->stat_lock);
3047 cur_logical += extent_size;
3051 /* Skip hole range which doesn't have any extent */
3052 cur_logical = max(extent_start, cur_logical);
3054 /* Truncate the range inside this data stripe */
3055 extent_size = min(extent_start + extent_size,
3056 logical + map->stripe_len) - cur_logical;
3057 extent_start = cur_logical;
3058 ASSERT(extent_size <= U32_MAX);
3060 scrub_parity_mark_sectors_data(sparity, extent_start, extent_size);
3062 mapped_length = extent_size;
3063 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_start,
3064 &mapped_length, &bioc, 0);
3065 if (!ret && (!bioc || mapped_length < extent_size))
3068 btrfs_put_bioc(bioc);
3069 scrub_parity_mark_sectors_error(sparity, extent_start,
3073 extent_physical = bioc->stripes[0].physical;
3074 extent_mirror_num = bioc->mirror_num;
3075 extent_dev = bioc->stripes[0].dev;
3076 btrfs_put_bioc(bioc);
3078 ret = btrfs_lookup_csums_range(csum_root, extent_start,
3079 extent_start + extent_size - 1,
3080 &sctx->csum_list, 1);
3082 scrub_parity_mark_sectors_error(sparity, extent_start,
3087 ret = scrub_extent_for_parity(sparity, extent_start,
3088 extent_size, extent_physical,
3089 extent_dev, extent_flags,
3090 extent_gen, extent_mirror_num);
3091 scrub_free_csums(sctx);
3094 scrub_parity_mark_sectors_error(sparity, extent_start,
3100 cur_logical += extent_size;
3102 btrfs_release_path(path);
/*
 * Scrub one RAID56 parity stripe covering the logical range
 * [logic_start, logic_end).
 *
 * A path is allocated for commit-root searches without locking, and a
 * scrub_parity is allocated with GFP_NOFS and initialised with one
 * reference. nsectors is map->stripe_len >> sectorsize_bits and must fit in
 * BITS_PER_LONG (asserted). Either allocation failing is counted in
 * sctx->stat.malloc_errors.
 *
 * Every data stripe in the range (stepping by map->stripe_len) is passed to
 * scrub_raid56_data_stripe_for_parity(). The initial reference is then
 * dropped with scrub_parity_put(), pending writes are submitted under
 * wr_lock, and the path is freed.
 *
 * Returns the negative error, or 0 (positive values are folded to 0).
 */
3106 static noinline_for_stack int scrub_raid56_parity(struct scrub_ctx *sctx,
3107 struct map_lookup *map,
3108 struct btrfs_device *sdev,
3112 struct btrfs_fs_info *fs_info = sctx->fs_info;
3113 struct btrfs_path *path;
3116 struct scrub_parity *sparity;
3119 path = btrfs_alloc_path();
3121 spin_lock(&sctx->stat_lock);
3122 sctx->stat.malloc_errors++;
3123 spin_unlock(&sctx->stat_lock);
3126 path->search_commit_root = 1;
3127 path->skip_locking = 1;
3129 ASSERT(map->stripe_len <= U32_MAX);
3130 nsectors = map->stripe_len >> fs_info->sectorsize_bits;
3131 ASSERT(nsectors <= BITS_PER_LONG);
3132 sparity = kzalloc(sizeof(struct scrub_parity), GFP_NOFS);
3134 spin_lock(&sctx->stat_lock);
3135 sctx->stat.malloc_errors++;
3136 spin_unlock(&sctx->stat_lock);
3137 btrfs_free_path(path);
3141 ASSERT(map->stripe_len <= U32_MAX);
3142 sparity->stripe_len = map->stripe_len;
3143 sparity->nsectors = nsectors;
3144 sparity->sctx = sctx;
3145 sparity->scrub_dev = sdev;
3146 sparity->logic_start = logic_start;
3147 sparity->logic_end = logic_end;
3148 refcount_set(&sparity->refs, 1);
3149 INIT_LIST_HEAD(&sparity->sectors_list);
3152 for (cur_logical = logic_start; cur_logical < logic_end;
3153 cur_logical += map->stripe_len) {
3154 ret = scrub_raid56_data_stripe_for_parity(sctx, sparity, map,
3155 sdev, path, cur_logical);
3160 scrub_parity_put(sparity);
3162 mutex_lock(&sctx->wr_lock);
3163 scrub_wr_submit(sctx);
3164 mutex_unlock(&sctx->wr_lock);
3166 btrfs_free_path(path);
3167 return ret < 0 ? ret : 0;
/*
 * Flush dev-replace writes; the btrfs_is_zoned() test at the top gates the
 * rest of the function.
 *
 * Sets sctx->flush_all_writes, submits the pending write bio under wr_lock
 * and waits on list_wait until sctx->bios_in_flight reaches zero.
 */
3170 static void sync_replace_for_zoned(struct scrub_ctx *sctx)
3172 if (!btrfs_is_zoned(sctx->fs_info))
3175 sctx->flush_all_writes = true;
3177 mutex_lock(&sctx->wr_lock);
3178 scrub_wr_submit(sctx);
3179 mutex_unlock(&sctx->wr_lock);
3181 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
/*
 * Bring the target device's zone write pointer up to @physical_end; the
 * btrfs_is_zoned() test at the top gates the rest of the function.
 *
 * @sctx:         scrub context (wr_tgtdev, write_pointer, wr_lock)
 * @logical:      logical address passed to btrfs_sync_zone_write_pointer()
 * @physical:     physical offset passed to btrfs_dev_clear_zone_empty()
 * @physical_end: end that sctx->write_pointer is compared against
 *
 * Waits for all in-flight bios, then, under wr_lock, calls
 * btrfs_sync_zone_write_pointer() if sctx->write_pointer is still below
 * @physical_end; a failure there is reported with the "failed to recover
 * write pointer" message. Finally the zone at @physical on the target device
 * is marked as not empty via btrfs_dev_clear_zone_empty().
 */
3184 static int sync_write_pointer_for_zoned(struct scrub_ctx *sctx, u64 logical,
3185 u64 physical, u64 physical_end)
3187 struct btrfs_fs_info *fs_info = sctx->fs_info;
3190 if (!btrfs_is_zoned(fs_info))
3193 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
3195 mutex_lock(&sctx->wr_lock);
3196 if (sctx->write_pointer < physical_end) {
3197 ret = btrfs_sync_zone_write_pointer(sctx->wr_tgtdev, logical,
3199 sctx->write_pointer);
3202 "zoned: failed to recover write pointer");
3204 mutex_unlock(&sctx->wr_lock);
3205 btrfs_dev_clear_zone_empty(sctx->wr_tgtdev, physical);
3211 * Scrub one range which can only have a simple mirror based profile.
3212 * (Including all range in SINGLE/DUP/RAID1/RAID1C*, and each stripe in
3215 * Since we may need to handle a subset of block group, we need @logical_start
3216 * and @logical_length parameter.
/*
 * @sctx:           scrub context
 * @extent_root:    extent tree used to enumerate extents
 * @csum_root:      csum tree used for data extents
 * @bg:             block group containing the range (asserted)
 * @map:            chunk mapping, forwarded to scrub_extent()
 * @logical_start:  logical start of the range
 * @logical_length: length of the range
 * @device:         device being scrubbed
 * @physical:       physical offset on @device matching @logical_start
 * @mirror_num:     mirror number of this copy
 *
 * Uses an on-stack path set up for commit-root searches without locking and
 * walks the extent items in [logical_start, logical_start + logical_length).
 * Per iteration:
 *  - cancel requests (fs-wide or per-scrub) are checked;
 *  - on a pause request queued writes are flushed, in-flight bios are waited
 *    for and scrub_blocked_if_needed() is called;
 *  - the block group state is inspected under bg->lock;
 *  - find_first_extent_item() finds the next extent; when there is none,
 *    stat.last_physical is set to physical + logical_length;
 *  - holes are skipped and scrub_len is limited by the extent end, the range
 *    end and max_length (64K);
 *  - for data extents the csums of the piece are loaded into csum_list;
 *  - tree blocks crossing the range boundary are logged, counted as
 *    uncorrectable and skipped;
 *  - the piece is scrubbed with scrub_extent() at physical offset
 *    cur_logical - logical_start + physical, after which csums are freed and,
 *    for dev-replace, sync_replace_for_zoned() runs.
 * The path is released before returning.
 */
3218 static int scrub_simple_mirror(struct scrub_ctx *sctx,
3219 struct btrfs_root *extent_root,
3220 struct btrfs_root *csum_root,
3221 struct btrfs_block_group *bg,
3222 struct map_lookup *map,
3223 u64 logical_start, u64 logical_length,
3224 struct btrfs_device *device,
3225 u64 physical, int mirror_num)
3227 struct btrfs_fs_info *fs_info = sctx->fs_info;
3228 const u64 logical_end = logical_start + logical_length;
3229 /* An artificial limit, inherit from old scrub behavior */
3230 const u32 max_length = SZ_64K;
3231 struct btrfs_path path = { 0 };
3232 u64 cur_logical = logical_start;
3235 /* The range must be inside the bg */
3236 ASSERT(logical_start >= bg->start && logical_end <= bg->start + bg->length);
3238 path.search_commit_root = 1;
3239 path.skip_locking = 1;
3240 /* Go through each extent items inside the logical range */
3241 while (cur_logical < logical_end) {
3249 if (atomic_read(&fs_info->scrub_cancel_req) ||
3250 atomic_read(&sctx->cancel_req)) {
3255 if (atomic_read(&fs_info->scrub_pause_req)) {
3256 /* Push queued extents */
3257 sctx->flush_all_writes = true;
3259 mutex_lock(&sctx->wr_lock);
3260 scrub_wr_submit(sctx);
3261 mutex_unlock(&sctx->wr_lock);
3262 wait_event(sctx->list_wait,
3263 atomic_read(&sctx->bios_in_flight) == 0);
3264 sctx->flush_all_writes = false;
3265 scrub_blocked_if_needed(fs_info);
3267 /* Block group removed? */
3268 spin_lock(&bg->lock);
3270 spin_unlock(&bg->lock);
3274 spin_unlock(&bg->lock);
3276 ret = find_first_extent_item(extent_root, &path, cur_logical,
3277 logical_end - cur_logical);
3279 /* No more extent, just update the accounting */
3280 sctx->stat.last_physical = physical + logical_length;
3286 get_extent_info(&path, &extent_start, &extent_len,
3287 &extent_flags, &extent_gen);
3288 /* Skip hole range which doesn't have any extent */
3289 cur_logical = max(extent_start, cur_logical);
3292 * Scrub len has three limits:
3293 * - Extent size limit
3294 * - Scrub range limit
3295 * This is especially imporatant for RAID0/RAID10 to reuse
3297 * - Max scrub size limit
3299 scrub_len = min(min(extent_start + extent_len,
3300 logical_end), cur_logical + max_length) -
3303 if (extent_flags & BTRFS_EXTENT_FLAG_DATA) {
3304 ret = btrfs_lookup_csums_range(csum_root, cur_logical,
3305 cur_logical + scrub_len - 1,
3306 &sctx->csum_list, 1);
3310 if ((extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) &&
3311 does_range_cross_boundary(extent_start, extent_len,
3312 logical_start, logical_length)) {
3314 "scrub: tree block %llu spanning boundaries, ignored. boundary=[%llu, %llu)",
3315 extent_start, logical_start, logical_end);
3316 spin_lock(&sctx->stat_lock);
3317 sctx->stat.uncorrectable_errors++;
3318 spin_unlock(&sctx->stat_lock);
3319 cur_logical += scrub_len;
3322 ret = scrub_extent(sctx, map, cur_logical, scrub_len,
3323 cur_logical - logical_start + physical,
3324 device, extent_flags, extent_gen,
3326 scrub_free_csums(sctx);
3329 if (sctx->is_dev_replace)
3330 sync_replace_for_zoned(sctx);
3331 cur_logical += scrub_len;
3332 /* Don't hold CPU for too long time */
3335 btrfs_release_path(&path);
3339 /* Calculate the full stripe length for simple stripe based profiles */
/*
 * Only valid for RAID0/RAID10 maps (asserted). The result is
 * (num_stripes / sub_stripes) * stripe_len; scrub_simple_stripe() uses it as
 * the logical distance between two consecutive stripes of the same device.
 */
3340 static u64 simple_stripe_full_stripe_len(const struct map_lookup *map)
3342 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3343 BTRFS_BLOCK_GROUP_RAID10));
3345 return map->num_stripes / map->sub_stripes * map->stripe_len;
3348 /* Get the logical bytenr for the stripe */
/*
 * Only valid for RAID0/RAID10 maps, and stripe_index must be below
 * map->num_stripes (both asserted).
 *
 * Returns bg->start + (stripe_index / sub_stripes) * stripe_len, i.e. the
 * logical address of the first stripe that belongs to stripe_index.
 */
3349 static u64 simple_stripe_get_logical(struct map_lookup *map,
3350 struct btrfs_block_group *bg,
3353 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3354 BTRFS_BLOCK_GROUP_RAID10));
3355 ASSERT(stripe_index < map->num_stripes);
3358 * (stripe_index / sub_stripes) gives how many data stripes we need to
3361 return (stripe_index / map->sub_stripes) * map->stripe_len + bg->start;
3364 /* Get the mirror number for the stripe */
/*
 * Only valid for RAID0/RAID10 maps, and stripe_index must be below
 * map->num_stripes (both asserted).
 *
 * Mirror numbers are 1-based: the result is stripe_index % sub_stripes + 1.
 */
3365 static int simple_stripe_mirror_num(struct map_lookup *map, int stripe_index)
3367 ASSERT(map->type & (BTRFS_BLOCK_GROUP_RAID0 |
3368 BTRFS_BLOCK_GROUP_RAID10));
3369 ASSERT(stripe_index < map->num_stripes);
3371 /* For RAID0, it's fixed to 1, for RAID10 it's 1,2,1,2... */
3372 return stripe_index % map->sub_stripes + 1;
/*
 * Scrub every stripe of a RAID0/RAID10 block group that lives on the device
 * stripe selected by stripe_index.
 *
 * The starting logical address, physical offset and mirror number come from
 * simple_stripe_get_logical(), map->stripes[stripe_index].physical and
 * simple_stripe_mirror_num(). Until cur_logical reaches the end of @bg, one
 * stripe_len sized range is scrubbed with scrub_simple_mirror(); then the
 * logical cursor advances by the full stripe length while the physical
 * cursor advances by a single stripe_len.
 */
3375 static int scrub_simple_stripe(struct scrub_ctx *sctx,
3376 struct btrfs_root *extent_root,
3377 struct btrfs_root *csum_root,
3378 struct btrfs_block_group *bg,
3379 struct map_lookup *map,
3380 struct btrfs_device *device,
3383 const u64 logical_increment = simple_stripe_full_stripe_len(map);
3384 const u64 orig_logical = simple_stripe_get_logical(map, bg, stripe_index);
3385 const u64 orig_physical = map->stripes[stripe_index].physical;
3386 const int mirror_num = simple_stripe_mirror_num(map, stripe_index);
3387 u64 cur_logical = orig_logical;
3388 u64 cur_physical = orig_physical;
3391 while (cur_logical < bg->start + bg->length) {
3393 * Inside each stripe, RAID0 is just SINGLE, and RAID10 is
3394 * just RAID1, so we can reuse scrub_simple_mirror() to scrub
3397 ret = scrub_simple_mirror(sctx, extent_root, csum_root, bg, map,
3398 cur_logical, map->stripe_len, device,
3399 cur_physical, mirror_num);
3402 /* Skip to next stripe which belongs to the target device */
3403 cur_logical += logical_increment;
3404 /* For physical offset, we just go to next stripe */
3405 cur_physical += map->stripe_len;
/*
 * Scrub the part of block group @bg that is stored on device stripe
 * stripe_index of @scrub_dev.
 *
 * Setup: a path is allocated for commit-root, lock-free, read-ahead
 * searches; in-flight bios are drained and scrub_blocked_if_needed() is
 * called; the extent and csum roots for bg->start are looked up; a block
 * plug is started. For dev-replace onto a sequential zone, write_pointer is
 * set to the stripe's physical start and flush_all_writes is enabled.
 *
 * Dispatch by profile:
 *  - neither RAID0, RAID10 nor RAID56: the whole block group goes through
 *    scrub_simple_mirror();
 *  - RAID0/RAID10: scrub_simple_stripe(), with offset set to
 *    stripe_len * (stripe_index / sub_stripes);
 *  - RAID56: get_raid56_logic_offset() is evaluated for physical_end (to get
 *    logic_end) and for the start (to initialise offset); then the device
 *    stripe is walked in stripe_len steps of physical address. The return
 *    value of get_raid56_logic_offset() selects between
 *    scrub_raid56_parity() and scrub_simple_mirror() with mirror number 1.
 *    stat.last_physical is updated under stat_lock after each step.
 *
 * Teardown: queued writes are submitted under wr_lock, the plug is finished
 * and the path freed. For dev-replace with ret >= 0,
 * sync_write_pointer_for_zoned() is called for the stripe.
 *
 * Returns the negative error, or 0 (positive values are folded to 0).
 */
3410 static noinline_for_stack int scrub_stripe(struct scrub_ctx *sctx,
3411 struct btrfs_block_group *bg,
3412 struct extent_map *em,
3413 struct btrfs_device *scrub_dev,
3416 struct btrfs_path *path;
3417 struct btrfs_fs_info *fs_info = sctx->fs_info;
3418 struct btrfs_root *root;
3419 struct btrfs_root *csum_root;
3420 struct blk_plug plug;
3421 struct map_lookup *map = em->map_lookup;
3422 const u64 profile = map->type & BTRFS_BLOCK_GROUP_PROFILE_MASK;
3423 const u64 chunk_logical = bg->start;
3425 u64 physical = map->stripes[stripe_index].physical;
3426 const u64 dev_stripe_len = btrfs_calc_stripe_length(em);
3427 const u64 physical_end = physical + dev_stripe_len;
3430 /* The logical increment after finishing one stripe */
3432 /* Offset inside the chunk */
3438 path = btrfs_alloc_path();
3443 * work on commit root. The related disk blocks are static as
3444 * long as COW is applied. This means, it is save to rewrite
3445 * them to repair disk errors without any race conditions
3447 path->search_commit_root = 1;
3448 path->skip_locking = 1;
3449 path->reada = READA_FORWARD;
3451 wait_event(sctx->list_wait,
3452 atomic_read(&sctx->bios_in_flight) == 0);
3453 scrub_blocked_if_needed(fs_info);
3455 root = btrfs_extent_root(fs_info, bg->start);
3456 csum_root = btrfs_csum_root(fs_info, bg->start);
3459 * collect all data csums for the stripe to avoid seeking during
3460 * the scrub. This might currently (crc32) end up to be about 1MB
3462 blk_start_plug(&plug);
3464 if (sctx->is_dev_replace &&
3465 btrfs_dev_is_sequential(sctx->wr_tgtdev, physical)) {
3466 mutex_lock(&sctx->wr_lock);
3467 sctx->write_pointer = physical;
3468 mutex_unlock(&sctx->wr_lock);
3469 sctx->flush_all_writes = true;
3473 * There used to be a big double loop to handle all profiles using the
3474 * same routine, which grows larger and more gross over time.
3476 * So here we handle each profile differently, so simpler profiles
3477 * have simpler scrubbing function.
3479 if (!(profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10 |
3480 BTRFS_BLOCK_GROUP_RAID56_MASK))) {
3482 * Above check rules out all complex profile, the remaining
3483 * profiles are SINGLE|DUP|RAID1|RAID1C*, which is simple
3484 * mirrored duplication without stripe.
3486 * Only @physical and @mirror_num needs to calculated using
3489 ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
3490 bg->start, bg->length, scrub_dev,
3491 map->stripes[stripe_index].physical,
3496 if (profile & (BTRFS_BLOCK_GROUP_RAID0 | BTRFS_BLOCK_GROUP_RAID10)) {
3497 ret = scrub_simple_stripe(sctx, root, csum_root, bg, map,
3498 scrub_dev, stripe_index);
3499 offset = map->stripe_len * (stripe_index / map->sub_stripes);
3503 /* Only RAID56 goes through the old code */
3504 ASSERT(map->type & BTRFS_BLOCK_GROUP_RAID56_MASK);
3507 /* Calculate the logical end of the stripe */
3508 get_raid56_logic_offset(physical_end, stripe_index,
3509 map, &logic_end, NULL);
3510 logic_end += chunk_logical;
3512 /* Initialize @offset in case we need to go to out: label */
3513 get_raid56_logic_offset(physical, stripe_index, map, &offset, NULL);
3514 increment = map->stripe_len * nr_data_stripes(map);
3517 * Due to the rotation, for RAID56 it's better to iterate each stripe
3518 * using their physical offset.
3520 while (physical < physical_end) {
3521 ret = get_raid56_logic_offset(physical, stripe_index, map,
3522 &logical, &stripe_logical);
3523 logical += chunk_logical;
3525 /* it is parity strip */
3526 stripe_logical += chunk_logical;
3527 stripe_end = stripe_logical + increment;
3528 ret = scrub_raid56_parity(sctx, map, scrub_dev,
3537 * Now we're at a data stripe, scrub each extents in the range.
3539 * At this stage, if we ignore the repair part, inside each data
3540 * stripe it is no different than SINGLE profile.
3541 * We can reuse scrub_simple_mirror() here, as the repair part
3542 * is still based on @mirror_num.
3544 ret = scrub_simple_mirror(sctx, root, csum_root, bg, map,
3545 logical, map->stripe_len,
3546 scrub_dev, physical, 1);
3550 logical += increment;
3551 physical += map->stripe_len;
3552 spin_lock(&sctx->stat_lock);
3554 sctx->stat.last_physical =
3555 map->stripes[stripe_index].physical + dev_stripe_len;
3557 sctx->stat.last_physical = physical;
3558 spin_unlock(&sctx->stat_lock);
3563 /* push queued extents */
3565 mutex_lock(&sctx->wr_lock);
3566 scrub_wr_submit(sctx);
3567 mutex_unlock(&sctx->wr_lock);
3569 blk_finish_plug(&plug);
3570 btrfs_free_path(path);
3572 if (sctx->is_dev_replace && ret >= 0) {
3575 ret2 = sync_write_pointer_for_zoned(sctx,
3576 chunk_logical + offset,
3577 map->stripes[stripe_index].physical,
3583 return ret < 0 ? ret : 0;
/*
 * Find the chunk mapping of @bg and scrub the stripe of it that lives on
 * @scrub_dev.
 *
 * The extent map for [bg->start, bg->start + bg->length) is looked up under
 * the mapping tree read lock. When the lookup yields nothing, the block
 * group state is inspected under bg->lock (per the inline comment, the block
 * group may have been deleted meanwhile). The mapping is then sanity checked
 * against bg->start and dev_extent_len.
 *
 * Every stripe of the map whose device bdev equals scrub_dev->bdev and whose
 * physical start equals dev_offset is handed to scrub_stripe(). The extent
 * map reference is dropped with free_extent_map() before returning.
 *
 * NOTE(review): dev_offset and dev_extent_len are used here but their
 * declarations are not visible in this view; presumably they are parameters.
 */
3586 static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx,
3587 struct btrfs_block_group *bg,
3588 struct btrfs_device *scrub_dev,
3592 struct btrfs_fs_info *fs_info = sctx->fs_info;
3593 struct extent_map_tree *map_tree = &fs_info->mapping_tree;
3594 struct map_lookup *map;
3595 struct extent_map *em;
3599 read_lock(&map_tree->lock);
3600 em = lookup_extent_mapping(map_tree, bg->start, bg->length);
3601 read_unlock(&map_tree->lock);
3605 * Might have been an unused block group deleted by the cleaner
3606 * kthread or relocation.
3608 spin_lock(&bg->lock);
3611 spin_unlock(&bg->lock);
3615 if (em->start != bg->start)
3617 if (em->len < dev_extent_len)
3620 map = em->map_lookup;
3621 for (i = 0; i < map->num_stripes; ++i) {
3622 if (map->stripes[i].dev->bdev == scrub_dev->bdev &&
3623 map->stripes[i].physical == dev_offset) {
3624 ret = scrub_stripe(sctx, bg, em, scrub_dev, i);
3630 free_extent_map(em);
/*
 * Wait for outstanding writes into block group @cache and commit a
 * transaction; the btrfs_is_zoned() test at the top gates the rest of the
 * function.
 *
 * Waits, in order, for block group reservations, for nocow writers and for
 * ordered extents covering [cache->start, cache->start + cache->length).
 * Then joins a transaction on @root and commits it.
 *
 * Returns PTR_ERR() of the transaction handle when joining failed, otherwise
 * the result of btrfs_commit_transaction().
 */
3635 static int finish_extent_writes_for_zoned(struct btrfs_root *root,
3636 struct btrfs_block_group *cache)
3638 struct btrfs_fs_info *fs_info = cache->fs_info;
3639 struct btrfs_trans_handle *trans;
3641 if (!btrfs_is_zoned(fs_info))
3644 btrfs_wait_block_group_reservations(cache);
3645 btrfs_wait_nocow_writers(cache);
3646 btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start, cache->length);
3648 trans = btrfs_join_transaction(root);
3650 return PTR_ERR(trans);
3651 return btrfs_commit_transaction(trans);
3654 static noinline_for_stack
3655 int scrub_enumerate_chunks(struct scrub_ctx *sctx,
3656 struct btrfs_device *scrub_dev, u64 start, u64 end)
3658 struct btrfs_dev_extent *dev_extent = NULL;
3659 struct btrfs_path *path;
3660 struct btrfs_fs_info *fs_info = sctx->fs_info;
3661 struct btrfs_root *root = fs_info->dev_root;
3666 struct extent_buffer *l;
3667 struct btrfs_key key;
3668 struct btrfs_key found_key;
3669 struct btrfs_block_group *cache;
3670 struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
3672 path = btrfs_alloc_path();
3676 path->reada = READA_FORWARD;
3677 path->search_commit_root = 1;
3678 path->skip_locking = 1;
3680 key.objectid = scrub_dev->devid;
3682 key.type = BTRFS_DEV_EXTENT_KEY;
3687 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3691 if (path->slots[0] >=
3692 btrfs_header_nritems(path->nodes[0])) {
3693 ret = btrfs_next_leaf(root, path);
3706 slot = path->slots[0];
3708 btrfs_item_key_to_cpu(l, &found_key, slot);
3710 if (found_key.objectid != scrub_dev->devid)
3713 if (found_key.type != BTRFS_DEV_EXTENT_KEY)
3716 if (found_key.offset >= end)
3719 if (found_key.offset < key.offset)
3722 dev_extent = btrfs_item_ptr(l, slot, struct btrfs_dev_extent);
3723 dev_extent_len = btrfs_dev_extent_length(l, dev_extent);
3725 if (found_key.offset + dev_extent_len <= start)
3728 chunk_offset = btrfs_dev_extent_chunk_offset(l, dev_extent);
3731 * get a reference on the corresponding block group to prevent
3732 * the chunk from going away while we scrub it
3734 cache = btrfs_lookup_block_group(fs_info, chunk_offset);
3736 /* some chunks are removed but not committed to disk yet,
3737 * continue scrubbing */
3741 ASSERT(cache->start <= chunk_offset);
3743 * We are using the commit root to search for device extents, so
3744 * that means we could have found a device extent item from a
3745 * block group that was deleted in the current transaction. The
3746 * logical start offset of the deleted block group, stored at
3747 * @chunk_offset, might be part of the logical address range of
3748 * a new block group (which uses different physical extents).
3749 * In this case btrfs_lookup_block_group() has returned the new
3750 * block group, and its start address is less than @chunk_offset.
3752 * We skip such new block groups, because it's pointless to
3753 * process them, as we won't find their extents because we search
3754 * for them using the commit root of the extent tree. For a device
3755 * replace it's also fine to skip it, we won't miss copying them
3756 * to the target device because we have the write duplication
3757 * setup through the regular write path (by btrfs_map_block()),
3758 * and we have committed a transaction when we started the device
3759 * replace, right after setting up the device replace state.
3761 if (cache->start < chunk_offset) {
3762 btrfs_put_block_group(cache);
3766 if (sctx->is_dev_replace && btrfs_is_zoned(fs_info)) {
3767 spin_lock(&cache->lock);
3768 if (!cache->to_copy) {
3769 spin_unlock(&cache->lock);
3770 btrfs_put_block_group(cache);
3773 spin_unlock(&cache->lock);
3777 * Make sure that while we are scrubbing the corresponding block
3778 * group doesn't get its logical address and its device extents
3779 * reused for another block group, which can possibly be of a
3780 * different type and different profile. We do this to prevent
3781 * false error detections and crashes due to bogus attempts to
3784 spin_lock(&cache->lock);
3785 if (cache->removed) {
3786 spin_unlock(&cache->lock);
3787 btrfs_put_block_group(cache);
3790 btrfs_freeze_block_group(cache);
3791 spin_unlock(&cache->lock);
3794 * we need call btrfs_inc_block_group_ro() with scrubs_paused,
3795 * to avoid deadlock caused by:
3796 * btrfs_inc_block_group_ro()
3797 * -> btrfs_wait_for_commit()
3798 * -> btrfs_commit_transaction()
3799 * -> btrfs_scrub_pause()
3801 scrub_pause_on(fs_info);
3804 * Don't do chunk preallocation for scrub.
3806 * This is especially important for SYSTEM bgs, or we can hit
3807 * -EFBIG from btrfs_finish_chunk_alloc() like:
3808 * 1. The only SYSTEM bg is marked RO.
3809 * Since SYSTEM bg is small, that's pretty common.
3810 * 2. New SYSTEM bg will be allocated
3811 * Due to regular version will allocate new chunk.
3812 * 3. New SYSTEM bg is empty and will get cleaned up
3813 * Before cleanup really happens, it's marked RO again.
3814 * 4. Empty SYSTEM bg get scrubbed
3817 * This can easily boost the amount of SYSTEM chunks if cleaner
3818 * thread can't be triggered fast enough, and use up all space
3819 * of btrfs_super_block::sys_chunk_array
3821 * While for dev replace, we need to try our best to mark block
3822 * group RO, to prevent race between:
3823 * - Write duplication
3824 * Contains latest data
3826 * Contains data from commit tree
3828 * If target block group is not marked RO, nocow writes can
3829 * be overwritten by scrub copy, causing data corruption.
3830 * So for dev-replace, it's not allowed to continue if a block
3833 ret = btrfs_inc_block_group_ro(cache, sctx->is_dev_replace);
3834 if (!ret && sctx->is_dev_replace) {
3835 ret = finish_extent_writes_for_zoned(root, cache);
3837 btrfs_dec_block_group_ro(cache);
3838 scrub_pause_off(fs_info);
3839 btrfs_put_block_group(cache);
3846 } else if (ret == -ENOSPC && !sctx->is_dev_replace) {
3848 * btrfs_inc_block_group_ro return -ENOSPC when it
3849 * failed in creating new chunk for metadata.
3850 * It is not a problem for scrub, because
3851 * metadata are always cowed, and our scrub paused
3852 * commit_transactions.
3855 } else if (ret == -ETXTBSY) {
3857 "skipping scrub of block group %llu due to active swapfile",
3859 scrub_pause_off(fs_info);
3864 "failed setting block group ro: %d", ret);
3865 btrfs_unfreeze_block_group(cache);
3866 btrfs_put_block_group(cache);
3867 scrub_pause_off(fs_info);
3872 * Now the target block is marked RO, wait for nocow writes to
3873 * finish before dev-replace.
3874 * COW is fine, as COW never overwrites extents in commit tree.
3876 if (sctx->is_dev_replace) {
3877 btrfs_wait_nocow_writers(cache);
3878 btrfs_wait_ordered_roots(fs_info, U64_MAX, cache->start,
3882 scrub_pause_off(fs_info);
3883 down_write(&dev_replace->rwsem);
3884 dev_replace->cursor_right = found_key.offset + dev_extent_len;
3885 dev_replace->cursor_left = found_key.offset;
3886 dev_replace->item_needs_writeback = 1;
3887 up_write(&dev_replace->rwsem);
3889 ret = scrub_chunk(sctx, cache, scrub_dev, found_key.offset,
3893 * flush, submit all pending read and write bios, afterwards
3895 * Note that in the dev replace case, a read request causes
3896 * write requests that are submitted in the read completion
3897 * worker. Therefore in the current situation, it is required
3898 * that all write requests are flushed, so that all read and
3899 * write requests are really completed when bios_in_flight
3902 sctx->flush_all_writes = true;
3904 mutex_lock(&sctx->wr_lock);
3905 scrub_wr_submit(sctx);
3906 mutex_unlock(&sctx->wr_lock);
3908 wait_event(sctx->list_wait,
3909 atomic_read(&sctx->bios_in_flight) == 0);
3911 scrub_pause_on(fs_info);
3914 * must be called before we decrease @scrub_paused.
3915 * make sure we don't block transaction commit while
3916 * we are waiting pending workers finished.
3918 wait_event(sctx->list_wait,
3919 atomic_read(&sctx->workers_pending) == 0);
3920 sctx->flush_all_writes = false;
3922 scrub_pause_off(fs_info);
3924 if (sctx->is_dev_replace &&
3925 !btrfs_finish_block_group_to_copy(dev_replace->srcdev,
3926 cache, found_key.offset))
3929 down_write(&dev_replace->rwsem);
3930 dev_replace->cursor_left = dev_replace->cursor_right;
3931 dev_replace->item_needs_writeback = 1;
3932 up_write(&dev_replace->rwsem);
3935 btrfs_dec_block_group_ro(cache);
3938 * We might have prevented the cleaner kthread from deleting
3939 * this block group if it was already unused because we raced
3940 * and set it to RO mode first. So add it back to the unused
3941 * list, otherwise it might not ever be deleted unless a manual
3942 * balance is triggered or it becomes used and unused again.
3944 spin_lock(&cache->lock);
3945 if (!cache->removed && !cache->ro && cache->reserved == 0 &&
3947 spin_unlock(&cache->lock);
3948 if (btrfs_test_opt(fs_info, DISCARD_ASYNC))
3949 btrfs_discard_queue_work(&fs_info->discard_ctl,
3952 btrfs_mark_bg_unused(cache);
3954 spin_unlock(&cache->lock);
3957 btrfs_unfreeze_block_group(cache);
3958 btrfs_put_block_group(cache);
3961 if (sctx->is_dev_replace &&
3962 atomic64_read(&dev_replace->num_write_errors) > 0) {
3966 if (sctx->stat.malloc_errors > 0) {
3971 key.offset = found_key.offset + dev_extent_len;
3972 btrfs_release_path(path);
3975 btrfs_free_path(path);
/*
 * Scrub the super block copies of @scrub_dev.
 *
 * BTRFS_FS_ERROR(fs_info) is checked before any work is done.  The expected
 * generation is scrub_dev->generation for a device whose fs_devices differs
 * from fs_info->fs_devices (seed device) and fs_info->last_trans_committed
 * for the others.
 *
 * For each of the BTRFS_SUPER_MIRROR_MAX candidate locations returned by
 * btrfs_sb_offset(), the copy is compared against
 * scrub_dev->commit_total_bytes and btrfs_check_super_location() before it is
 * handed to scrub_sectors() with BTRFS_EXTENT_FLAG_SUPER, using the copy
 * index as mirror number.  The function then waits for sctx->bios_in_flight
 * to reach zero.
 *
 * NOTE(review): the statements guarded by the checks below and the return
 * values are not visible in this excerpt - confirm against the full file.
 */
3980 static noinline_for_stack int scrub_supers(struct scrub_ctx *sctx,
3981 struct btrfs_device *scrub_dev)
3987 struct btrfs_fs_info *fs_info = sctx->fs_info;
3989 if (BTRFS_FS_ERROR(fs_info))
3992 /* Seed devices of a new filesystem have their own generation. */
3993 if (scrub_dev->fs_devices != fs_info->fs_devices)
3994 gen = scrub_dev->generation;
3996 gen = fs_info->last_trans_committed;
/* Walk every possible super block copy location on this device. */
3998 for (i = 0; i < BTRFS_SUPER_MIRROR_MAX; i++) {
3999 bytenr = btrfs_sb_offset(i);
4000 if (bytenr + BTRFS_SUPER_INFO_SIZE >
4001 scrub_dev->commit_total_bytes)
4003 if (!btrfs_check_super_location(scrub_dev, bytenr))
4006 ret = scrub_sectors(sctx, bytenr, BTRFS_SUPER_INFO_SIZE, bytenr,
4007 scrub_dev, BTRFS_EXTENT_FLAG_SUPER, gen, i,
4012 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
/*
 * Drop one reference on the scrub workqueues.
 *
 * refcount_dec_and_mutex_lock() only returns true, with fs_info->scrub_lock
 * held, when fs_info->scrub_workers_refcnt drops to zero.  In that case the
 * three workqueue pointers are copied to locals and cleared in fs_info while
 * the lock is held; the workqueues themselves are destroyed after the lock
 * has been released.
 */
4017 static void scrub_workers_put(struct btrfs_fs_info *fs_info)
4019 if (refcount_dec_and_mutex_lock(&fs_info->scrub_workers_refcnt,
4020 &fs_info->scrub_lock)) {
4021 struct workqueue_struct *scrub_workers = fs_info->scrub_workers;
4022 struct workqueue_struct *scrub_wr_comp =
4023 fs_info->scrub_wr_completion_workers;
4024 struct workqueue_struct *scrub_parity =
4025 fs_info->scrub_parity_workers;
4027 fs_info->scrub_workers = NULL;
4028 fs_info->scrub_wr_completion_workers = NULL;
4029 fs_info->scrub_parity_workers = NULL;
4030 mutex_unlock(&fs_info->scrub_lock);
/* scrub_lock is no longer held while the workqueues are torn down. */
4033 destroy_workqueue(scrub_workers);
4035 destroy_workqueue(scrub_wr_comp);
4037 destroy_workqueue(scrub_parity);
4042 * Get a reference on fs_info->scrub_workers, starting the workers if necessary.
/*
 * Take a reference on the scrub workqueues, creating them when no reference
 * is currently held.
 *
 * If refcount_inc_not_zero() succeeds the workqueues already exist.
 * Otherwise three workqueues are allocated with WQ_FREEZABLE | WQ_UNBOUND:
 * "btrfs-scrub" (max_active is 1 when is_dev_replace is set, otherwise
 * fs_info->thread_pool_size), "btrfs-scrubwrc" and "btrfs-scrubparity".
 *
 * The allocation happens without fs_info->scrub_lock.  The new workqueues are
 * installed under the lock only if the refcount is still zero; if another
 * thread installed its own in the meantime, the refcount is incremented and
 * the locally allocated workqueues are destroyed through the same labels that
 * unwind a failed allocation.
 *
 * NOTE(review): the return statements are not visible in this excerpt.
 */
4044 static noinline_for_stack int scrub_workers_get(struct btrfs_fs_info *fs_info,
4047 struct workqueue_struct *scrub_workers = NULL;
4048 struct workqueue_struct *scrub_wr_comp = NULL;
4049 struct workqueue_struct *scrub_parity = NULL;
4050 unsigned int flags = WQ_FREEZABLE | WQ_UNBOUND;
4051 int max_active = fs_info->thread_pool_size;
4054 if (refcount_inc_not_zero(&fs_info->scrub_workers_refcnt))
4057 scrub_workers = alloc_workqueue("btrfs-scrub", flags,
4058 is_dev_replace ? 1 : max_active);
4060 goto fail_scrub_workers;
4062 scrub_wr_comp = alloc_workqueue("btrfs-scrubwrc", flags, max_active);
4064 goto fail_scrub_wr_completion_workers;
4066 scrub_parity = alloc_workqueue("btrfs-scrubparity", flags, max_active);
4068 goto fail_scrub_parity_workers;
/* Re-check the refcount under scrub_lock before installing the workqueues. */
4070 mutex_lock(&fs_info->scrub_lock);
4071 if (refcount_read(&fs_info->scrub_workers_refcnt) == 0) {
4072 ASSERT(fs_info->scrub_workers == NULL &&
4073 fs_info->scrub_wr_completion_workers == NULL &&
4074 fs_info->scrub_parity_workers == NULL);
4075 fs_info->scrub_workers = scrub_workers;
4076 fs_info->scrub_wr_completion_workers = scrub_wr_comp;
4077 fs_info->scrub_parity_workers = scrub_parity;
4078 refcount_set(&fs_info->scrub_workers_refcnt, 1);
4079 mutex_unlock(&fs_info->scrub_lock);
4082 /* Other thread raced in and created the workers for us */
4083 refcount_inc(&fs_info->scrub_workers_refcnt);
4084 mutex_unlock(&fs_info->scrub_lock);
4087 destroy_workqueue(scrub_parity);
4088 fail_scrub_parity_workers:
4089 destroy_workqueue(scrub_wr_comp);
4090 fail_scrub_wr_completion_workers:
4091 destroy_workqueue(scrub_workers);
/*
 * Run a scrub, or the copy phase of a device replace, on a single device.
 *
 * @fs_info:        filesystem the device belongs to
 * @devid:          id of the device, resolved with btrfs_find_device()
 * @start, @end:    range handed to scrub_enumerate_chunks()
 * @progress:       destination for a copy of sctx->stat
 * @readonly:       stored in sctx->readonly; when zero and this is not a
 *                  device replace, the device must have
 *                  BTRFS_DEV_STATE_WRITEABLE set
 * @is_dev_replace: non-zero when called on behalf of device replace
 *
 * Outline of the visible steps:
 *  1. Sanity checks: filesystem not closing, nodesize <= BTRFS_STRIPE_LEN,
 *     nodesize and sectorsize within what SCRUB_MAX_SECTORS_PER_BLOCK allows.
 *  2. Allocate the scrub context and take a reference on the scrub
 *     workqueues, both before device_list_mutex is taken.
 *  3. Under device_list_mutex look up the device and check its state; under
 *     scrub_lock and dev_replace.rwsem (read) reject a device that is not in
 *     the fs metadata, is a replace target, or already has a scrub_ctx.
 *  4. Attach the context to the device, honour a pending pause request via
 *     __scrub_blocked_if_needed() and increment fs_info->scrubs_running.
 *  5. Inside a memalloc_nofs_save() section, scrub the super blocks (not for
 *     device replace) and then scrub_enumerate_chunks().
 *  6. Wait for in-flight bios, decrement scrubs_running and wake
 *     scrub_pause_wait, wait for pending workers, copy the statistics,
 *     detach the context and drop the workqueue and context references.
 *
 * NOTE(review): the second size check compares against
 * SCRUB_MAX_SECTORS_PER_BLOCK << sectorsize_bits and
 * PAGE_SIZE * SCRUB_MAX_SECTORS_PER_BLOCK, but its error message prints the
 * bare SCRUB_MAX_SECTORS_PER_BLOCK for both limits.
 * NOTE(review): error return values and the unwind labels are not visible in
 * this excerpt.
 */
4096 int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
4097 u64 end, struct btrfs_scrub_progress *progress,
4098 int readonly, int is_dev_replace)
4100 struct btrfs_dev_lookup_args args = { .devid = devid };
4101 struct scrub_ctx *sctx;
4103 struct btrfs_device *dev;
4104 unsigned int nofs_flag;
4106 if (btrfs_fs_closing(fs_info))
4109 if (fs_info->nodesize > BTRFS_STRIPE_LEN) {
4111 * in this case scrub is unable to calculate the checksum
4112 * the way scrub is implemented. Do not handle this
4113 * situation at all because it won't ever happen.
4116 "scrub: size assumption nodesize <= BTRFS_STRIPE_LEN (%d <= %d) fails",
4122 if (fs_info->nodesize >
4123 SCRUB_MAX_SECTORS_PER_BLOCK << fs_info->sectorsize_bits ||
4124 fs_info->sectorsize > PAGE_SIZE * SCRUB_MAX_SECTORS_PER_BLOCK) {
4126 * Would exhaust the array bounds of sectorv member in
4127 * struct scrub_block
4130 "scrub: nodesize and sectorsize <= SCRUB_MAX_SECTORS_PER_BLOCK (%d <= %d && %d <= %d) fails",
4131 fs_info->nodesize, SCRUB_MAX_SECTORS_PER_BLOCK,
4132 fs_info->sectorsize, SCRUB_MAX_SECTORS_PER_BLOCK);
4136 /* Allocate outside of device_list_mutex */
4137 sctx = scrub_setup_ctx(fs_info, is_dev_replace);
4139 return PTR_ERR(sctx);
4141 ret = scrub_workers_get(fs_info, is_dev_replace);
/* Device lookup and state checks are done under device_list_mutex. */
4145 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4146 dev = btrfs_find_device(fs_info->fs_devices, &args);
4147 if (!dev || (test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) &&
4149 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4154 if (!is_dev_replace && !readonly &&
4155 !test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
4156 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4157 btrfs_err_in_rcu(fs_info,
4158 "scrub on devid %llu: filesystem on %s is not writable",
4159 devid, rcu_str_deref(dev->name));
/* scrub_lock nests inside device_list_mutex; failure paths drop both. */
4164 mutex_lock(&fs_info->scrub_lock);
4165 if (!test_bit(BTRFS_DEV_STATE_IN_FS_METADATA, &dev->dev_state) ||
4166 test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &dev->dev_state)) {
4167 mutex_unlock(&fs_info->scrub_lock);
4168 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
/* The dev-replace state is sampled with dev_replace.rwsem held for reading. */
4173 down_read(&fs_info->dev_replace.rwsem);
4174 if (dev->scrub_ctx ||
4176 btrfs_dev_replace_is_ongoing(&fs_info->dev_replace))) {
4177 up_read(&fs_info->dev_replace.rwsem);
4178 mutex_unlock(&fs_info->scrub_lock);
4179 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4183 up_read(&fs_info->dev_replace.rwsem);
/*
 * Attach the context to the device; btrfs_scrub_cancel_dev() and
 * btrfs_scrub_progress() find it through dev->scrub_ctx.
 */
4185 sctx->readonly = readonly;
4186 dev->scrub_ctx = sctx;
4187 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4190 * checking @scrub_pause_req here, we can avoid
4191 * race between committing transaction and scrubbing.
4193 __scrub_blocked_if_needed(fs_info);
4194 atomic_inc(&fs_info->scrubs_running);
4195 mutex_unlock(&fs_info->scrub_lock);
4198 * In order to avoid deadlock with reclaim when there is a transaction
4199 * trying to pause scrub, make sure we use GFP_NOFS for all the
4200 * allocations done at btrfs_scrub_sectors() and scrub_sectors_for_parity()
4201 * invoked by our callees. The pausing request is done when the
4202 * transaction commit starts, and it blocks the transaction until scrub
4203 * is paused (done at specific points at scrub_stripe() or right above
4204 * before incrementing fs_info->scrubs_running).
4206 nofs_flag = memalloc_nofs_save();
4207 if (!is_dev_replace) {
4208 btrfs_info(fs_info, "scrub: started on devid %llu", devid);
4210 * by holding device list mutex, we can
4211 * kick off writing super in log tree sync.
4213 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4214 ret = scrub_supers(sctx, dev);
4215 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4219 ret = scrub_enumerate_chunks(sctx, dev, start, end);
4220 memalloc_nofs_restore(nofs_flag);
/*
 * Wait for in-flight bios before leaving the set of running scrubs, then
 * wake scrub_pause_wait so that waiters re-evaluate their conditions.
 */
4222 wait_event(sctx->list_wait, atomic_read(&sctx->bios_in_flight) == 0);
4223 atomic_dec(&fs_info->scrubs_running);
4224 wake_up(&fs_info->scrub_pause_wait);
4226 wait_event(sctx->list_wait, atomic_read(&sctx->workers_pending) == 0);
4229 memcpy(progress, &sctx->stat, sizeof(*progress));
4231 if (!is_dev_replace)
4232 btrfs_info(fs_info, "scrub: %s on devid %llu with status: %d",
4233 ret ? "not finished" : "finished", devid, ret);
/* Detach the context from the device under scrub_lock. */
4235 mutex_lock(&fs_info->scrub_lock);
4236 dev->scrub_ctx = NULL;
4237 mutex_unlock(&fs_info->scrub_lock);
4239 scrub_workers_put(fs_info);
4240 scrub_put_ctx(sctx);
/* NOTE(review): the calls below appear to belong to the error unwind path. */
4244 scrub_workers_put(fs_info);
4246 scrub_free_ctx(sctx);
/*
 * Ask all running scrubs to pause and wait until they have done so.
 *
 * fs_info->scrub_pause_req is incremented under scrub_lock.  The function
 * then sleeps on fs_info->scrub_pause_wait, with scrub_lock dropped, until
 * scrubs_paused equals scrubs_running; the condition is re-checked with the
 * lock held before returning.  The pause request is still raised on return;
 * btrfs_scrub_continue() decrements it.
 */
4251 void btrfs_scrub_pause(struct btrfs_fs_info *fs_info)
4253 mutex_lock(&fs_info->scrub_lock);
4254 atomic_inc(&fs_info->scrub_pause_req);
4255 while (atomic_read(&fs_info->scrubs_paused) !=
4256 atomic_read(&fs_info->scrubs_running)) {
4257 mutex_unlock(&fs_info->scrub_lock);
4258 wait_event(fs_info->scrub_pause_wait,
4259 atomic_read(&fs_info->scrubs_paused) ==
4260 atomic_read(&fs_info->scrubs_running));
4261 mutex_lock(&fs_info->scrub_lock);
4263 mutex_unlock(&fs_info->scrub_lock);
/*
 * Withdraw one pause request: decrement fs_info->scrub_pause_req and wake
 * everybody sleeping on fs_info->scrub_pause_wait.  Counterpart of
 * btrfs_scrub_pause(), which increments the same counter.
 */
4266 void btrfs_scrub_continue(struct btrfs_fs_info *fs_info)
4268 atomic_dec(&fs_info->scrub_pause_req);
4269 wake_up(&fs_info->scrub_pause_wait);
/*
 * Cancel all scrubs running on @fs_info and wait for them to stop.
 *
 * When fs_info->scrubs_running is zero the lock is dropped without raising a
 * cancel request.  Otherwise fs_info->scrub_cancel_req is incremented and the
 * function sleeps on fs_info->scrub_pause_wait, with scrub_lock dropped,
 * until scrubs_running reaches zero; the request is decremented again before
 * returning.
 *
 * NOTE(review): the return values are not visible in this excerpt.
 */
4272 int btrfs_scrub_cancel(struct btrfs_fs_info *fs_info)
4274 mutex_lock(&fs_info->scrub_lock);
4275 if (!atomic_read(&fs_info->scrubs_running)) {
4276 mutex_unlock(&fs_info->scrub_lock);
/* Raise the cancel request and wait until no scrub is running any more. */
4280 atomic_inc(&fs_info->scrub_cancel_req);
4281 while (atomic_read(&fs_info->scrubs_running)) {
4282 mutex_unlock(&fs_info->scrub_lock);
4283 wait_event(fs_info->scrub_pause_wait,
4284 atomic_read(&fs_info->scrubs_running) == 0);
4285 mutex_lock(&fs_info->scrub_lock);
4287 atomic_dec(&fs_info->scrub_cancel_req);
4288 mutex_unlock(&fs_info->scrub_lock);
/*
 * Cancel the scrub running on @dev and wait for it to stop.
 *
 * The scrub context is read from dev->scrub_ctx under fs_info->scrub_lock.
 * sctx->cancel_req is incremented and the function sleeps on
 * fs_info->scrub_pause_wait, with scrub_lock dropped, until dev->scrub_ctx
 * has been cleared (btrfs_scrub_dev() clears it when it finishes).
 *
 * NOTE(review): the handling of a NULL dev->scrub_ctx and the return values
 * are not visible in this excerpt.
 */
4293 int btrfs_scrub_cancel_dev(struct btrfs_device *dev)
4295 struct btrfs_fs_info *fs_info = dev->fs_info;
4296 struct scrub_ctx *sctx;
4298 mutex_lock(&fs_info->scrub_lock);
4299 sctx = dev->scrub_ctx;
4301 mutex_unlock(&fs_info->scrub_lock);
4304 atomic_inc(&sctx->cancel_req);
4305 while (dev->scrub_ctx) {
4306 mutex_unlock(&fs_info->scrub_lock);
4307 wait_event(fs_info->scrub_pause_wait,
4308 dev->scrub_ctx == NULL);
4309 mutex_lock(&fs_info->scrub_lock);
4311 mutex_unlock(&fs_info->scrub_lock);
/*
 * Copy the statistics of the scrub running on device @devid to @progress.
 *
 * The device lookup, the read of dev->scrub_ctx and the copy of sctx->stat
 * all happen under fs_devices->device_list_mutex.
 *
 * Return: 0 when a scrub context was found, -ENOTCONN when the device exists
 * but has no scrub context, -ENODEV when the device was not found.
 *
 * NOTE(review): the NULL checks guarding the dereferences below are not
 * visible in this excerpt.
 */
4316 int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
4317 struct btrfs_scrub_progress *progress)
4319 struct btrfs_dev_lookup_args args = { .devid = devid };
4320 struct btrfs_device *dev;
4321 struct scrub_ctx *sctx = NULL;
4323 mutex_lock(&fs_info->fs_devices->device_list_mutex);
4324 dev = btrfs_find_device(fs_info->fs_devices, &args);
4326 sctx = dev->scrub_ctx;
4328 memcpy(progress, &sctx->stat, sizeof(*progress));
4329 mutex_unlock(&fs_info->fs_devices->device_list_mutex);
4331 return dev ? (sctx ? 0 : -ENOTCONN) : -ENODEV;
4334 static void scrub_find_good_copy(struct btrfs_fs_info *fs_info,
4335 u64 extent_logical, u32 extent_len,
4336 u64 *extent_physical,
4337 struct btrfs_device **extent_dev,
4338 int *extent_mirror_num)
4341 struct btrfs_io_context *bioc = NULL;
4344 mapped_length = extent_len;
4345 ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, extent_logical,
4346 &mapped_length, &bioc, 0);
4347 if (ret || !bioc || mapped_length < extent_len ||
4348 !bioc->stripes[0].dev->bdev) {
4349 btrfs_put_bioc(bioc);
4353 *extent_physical = bioc->stripes[0].physical;
4354 *extent_mirror_num = bioc->mirror_num;
4355 *extent_dev = bioc->stripes[0].dev;
4356 btrfs_put_bioc(bioc);