GNU Linux-libre 6.8.9-gnu
[releases.git] / fs / btrfs / relocation.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2009 Oracle.  All rights reserved.
4  */
5
6 #include <linux/sched.h>
7 #include <linux/pagemap.h>
8 #include <linux/writeback.h>
9 #include <linux/blkdev.h>
10 #include <linux/rbtree.h>
11 #include <linux/slab.h>
12 #include <linux/error-injection.h>
13 #include "ctree.h"
14 #include "disk-io.h"
15 #include "transaction.h"
16 #include "volumes.h"
17 #include "locking.h"
18 #include "btrfs_inode.h"
19 #include "async-thread.h"
20 #include "free-space-cache.h"
21 #include "qgroup.h"
22 #include "print-tree.h"
23 #include "delalloc-space.h"
24 #include "block-group.h"
25 #include "backref.h"
26 #include "misc.h"
27 #include "subpage.h"
28 #include "zoned.h"
29 #include "inode-item.h"
30 #include "space-info.h"
31 #include "fs.h"
32 #include "accessors.h"
33 #include "extent-tree.h"
34 #include "root-tree.h"
35 #include "file-item.h"
36 #include "relocation.h"
37 #include "super.h"
38 #include "tree-checker.h"
39
40 /*
41  * Relocation overview
42  *
43  * [What does relocation do]
44  *
45  * The objective of relocation is to relocate all extents of the target block
46  * group to other block groups.
47  * This is utilized by resize (shrink only), profile converting, compacting
48  * space, or balance routine to spread chunks over devices.
49  *
50  *              Before          |               After
51  * ------------------------------------------------------------------
52  *  BG A: 10 data extents       | BG A: deleted
53  *  BG B:  2 data extents       | BG B: 10 data extents (2 old + 8 relocated)
54  *  BG C:  1 extents            | BG C:  3 data extents (1 old + 2 relocated)
55  *
56  * [How does relocation work]
57  *
58  * 1.   Mark the target block group read-only
59  *      New extents won't be allocated from the target block group.
60  *
61  * 2.1  Record each extent in the target block group
62  *      To build a proper map of extents to be relocated.
63  *
64  * 2.2  Build data reloc tree and reloc trees
65  *      Data reloc tree will contain an inode, recording all newly relocated
66  *      data extents.
67  *      There will be only one data reloc tree for one data block group.
68  *
69  *      Reloc tree will be a special snapshot of its source tree, containing
70  *      relocated tree blocks.
71  *      Each tree referring to a tree block in target block group will get its
72  *      reloc tree built.
73  *
74  * 2.3  Swap source tree with its corresponding reloc tree
75  *      Each involved tree only refers to new extents after swap.
76  *
77  * 3.   Cleanup reloc trees and data reloc tree.
78  *      As old extents in the target block group are still referenced by reloc
79  *      trees, we need to clean them up before really freeing the target block
80  *      group.
81  *
82  * The main complexity is in steps 2.2 and 2.3.
83  *
84  * The entry point of relocation is relocate_block_group() function.
85  */
86
87 #define RELOCATION_RESERVED_NODES       256
88 /*
89  * map address of tree root to tree
90  */
91 struct mapping_node {
92         struct {
93                 struct rb_node rb_node;
94                 u64 bytenr;
95         }; /* Use rb_simle_node for search/insert */
96         void *data;
97 };
98
99 struct mapping_tree {
100         struct rb_root rb_root;
101         spinlock_t lock;
102 };
103
104 /*
105  * present a tree block to process
106  */
107 struct tree_block {
108         struct {
109                 struct rb_node rb_node;
110                 u64 bytenr;
111         }; /* Use rb_simple_node for search/insert */
112         u64 owner;
113         struct btrfs_key key;
114         u8 level;
115         bool key_ready;
116 };
117
118 #define MAX_EXTENTS 128
119
120 struct file_extent_cluster {
121         u64 start;
122         u64 end;
123         u64 boundary[MAX_EXTENTS];
124         unsigned int nr;
125         u64 owning_root;
126 };
127
128 /* Stages of data relocation. */
129 enum reloc_stage {
130         MOVE_DATA_EXTENTS,
131         UPDATE_DATA_PTRS
132 };
133
134 struct reloc_control {
135         /* block group to relocate */
136         struct btrfs_block_group *block_group;
137         /* extent tree */
138         struct btrfs_root *extent_root;
139         /* inode for moving data */
140         struct inode *data_inode;
141
142         struct btrfs_block_rsv *block_rsv;
143
144         struct btrfs_backref_cache backref_cache;
145
146         struct file_extent_cluster cluster;
147         /* tree blocks have been processed */
148         struct extent_io_tree processed_blocks;
149         /* map start of tree root to corresponding reloc tree */
150         struct mapping_tree reloc_root_tree;
151         /* list of reloc trees */
152         struct list_head reloc_roots;
153         /* list of subvolume trees that get relocated */
154         struct list_head dirty_subvol_roots;
155         /* size of metadata reservation for merging reloc trees */
156         u64 merging_rsv_size;
157         /* size of relocated tree nodes */
158         u64 nodes_relocated;
159         /* reserved size for block group relocation*/
160         u64 reserved_bytes;
161
162         u64 search_start;
163         u64 extents_found;
164
165         enum reloc_stage stage;
166         bool create_reloc_tree;
167         bool merge_reloc_tree;
168         bool found_file_extent;
169 };
170
171 static void mark_block_processed(struct reloc_control *rc,
172                                  struct btrfs_backref_node *node)
173 {
174         u32 blocksize;
175
176         if (node->level == 0 ||
177             in_range(node->bytenr, rc->block_group->start,
178                      rc->block_group->length)) {
179                 blocksize = rc->extent_root->fs_info->nodesize;
180                 set_extent_bit(&rc->processed_blocks, node->bytenr,
181                                node->bytenr + blocksize - 1, EXTENT_DIRTY, NULL);
182         }
183         node->processed = 1;
184 }
185
186 /*
187  * walk up backref nodes until reach node presents tree root
188  */
189 static struct btrfs_backref_node *walk_up_backref(
190                 struct btrfs_backref_node *node,
191                 struct btrfs_backref_edge *edges[], int *index)
192 {
193         struct btrfs_backref_edge *edge;
194         int idx = *index;
195
196         while (!list_empty(&node->upper)) {
197                 edge = list_entry(node->upper.next,
198                                   struct btrfs_backref_edge, list[LOWER]);
199                 edges[idx++] = edge;
200                 node = edge->node[UPPER];
201         }
202         BUG_ON(node->detached);
203         *index = idx;
204         return node;
205 }
206
207 /*
208  * walk down backref nodes to find start of next reference path
209  */
210 static struct btrfs_backref_node *walk_down_backref(
211                 struct btrfs_backref_edge *edges[], int *index)
212 {
213         struct btrfs_backref_edge *edge;
214         struct btrfs_backref_node *lower;
215         int idx = *index;
216
217         while (idx > 0) {
218                 edge = edges[idx - 1];
219                 lower = edge->node[LOWER];
220                 if (list_is_last(&edge->list[LOWER], &lower->upper)) {
221                         idx--;
222                         continue;
223                 }
224                 edge = list_entry(edge->list[LOWER].next,
225                                   struct btrfs_backref_edge, list[LOWER]);
226                 edges[idx - 1] = edge;
227                 *index = idx;
228                 return edge->node[UPPER];
229         }
230         *index = 0;
231         return NULL;
232 }
233
234 static void update_backref_node(struct btrfs_backref_cache *cache,
235                                 struct btrfs_backref_node *node, u64 bytenr)
236 {
237         struct rb_node *rb_node;
238         rb_erase(&node->rb_node, &cache->rb_root);
239         node->bytenr = bytenr;
240         rb_node = rb_simple_insert(&cache->rb_root, node->bytenr, &node->rb_node);
241         if (rb_node)
242                 btrfs_backref_panic(cache->fs_info, bytenr, -EEXIST);
243 }
244
245 /*
246  * update backref cache after a transaction commit
247  */
248 static int update_backref_cache(struct btrfs_trans_handle *trans,
249                                 struct btrfs_backref_cache *cache)
250 {
251         struct btrfs_backref_node *node;
252         int level = 0;
253
254         if (cache->last_trans == 0) {
255                 cache->last_trans = trans->transid;
256                 return 0;
257         }
258
259         if (cache->last_trans == trans->transid)
260                 return 0;
261
262         /*
263          * detached nodes are used to avoid unnecessary backref
264          * lookup. transaction commit changes the extent tree.
265          * so the detached nodes are no longer useful.
266          */
267         while (!list_empty(&cache->detached)) {
268                 node = list_entry(cache->detached.next,
269                                   struct btrfs_backref_node, list);
270                 btrfs_backref_cleanup_node(cache, node);
271         }
272
273         while (!list_empty(&cache->changed)) {
274                 node = list_entry(cache->changed.next,
275                                   struct btrfs_backref_node, list);
276                 list_del_init(&node->list);
277                 BUG_ON(node->pending);
278                 update_backref_node(cache, node, node->new_bytenr);
279         }
280
281         /*
282          * some nodes can be left in the pending list if there were
283          * errors during processing the pending nodes.
284          */
285         for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
286                 list_for_each_entry(node, &cache->pending[level], list) {
287                         BUG_ON(!node->pending);
288                         if (node->bytenr == node->new_bytenr)
289                                 continue;
290                         update_backref_node(cache, node, node->new_bytenr);
291                 }
292         }
293
294         cache->last_trans = 0;
295         return 1;
296 }
297
298 static bool reloc_root_is_dead(const struct btrfs_root *root)
299 {
300         /*
301          * Pair with set_bit/clear_bit in clean_dirty_subvols and
302          * btrfs_update_reloc_root. We need to see the updated bit before
303          * trying to access reloc_root
304          */
305         smp_rmb();
306         if (test_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state))
307                 return true;
308         return false;
309 }
310
311 /*
312  * Check if this subvolume tree has valid reloc tree.
313  *
314  * Reloc tree after swap is considered dead, thus not considered as valid.
315  * This is enough for most callers, as they don't distinguish dead reloc root
316  * from no reloc root.  But btrfs_should_ignore_reloc_root() below is a
317  * special case.
318  */
319 static bool have_reloc_root(const struct btrfs_root *root)
320 {
321         if (reloc_root_is_dead(root))
322                 return false;
323         if (!root->reloc_root)
324                 return false;
325         return true;
326 }
327
328 bool btrfs_should_ignore_reloc_root(const struct btrfs_root *root)
329 {
330         struct btrfs_root *reloc_root;
331
332         if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
333                 return false;
334
335         /* This root has been merged with its reloc tree, we can ignore it */
336         if (reloc_root_is_dead(root))
337                 return true;
338
339         reloc_root = root->reloc_root;
340         if (!reloc_root)
341                 return false;
342
343         if (btrfs_header_generation(reloc_root->commit_root) ==
344             root->fs_info->running_transaction->transid)
345                 return false;
346         /*
347          * If there is reloc tree and it was created in previous transaction
348          * backref lookup can find the reloc tree, so backref node for the fs
349          * tree root is useless for relocation.
350          */
351         return true;
352 }
353
354 /*
355  * find reloc tree by address of tree root
356  */
357 struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr)
358 {
359         struct reloc_control *rc = fs_info->reloc_ctl;
360         struct rb_node *rb_node;
361         struct mapping_node *node;
362         struct btrfs_root *root = NULL;
363
364         ASSERT(rc);
365         spin_lock(&rc->reloc_root_tree.lock);
366         rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr);
367         if (rb_node) {
368                 node = rb_entry(rb_node, struct mapping_node, rb_node);
369                 root = node->data;
370         }
371         spin_unlock(&rc->reloc_root_tree.lock);
372         return btrfs_grab_root(root);
373 }
374
375 /*
376  * For useless nodes, do two major clean ups:
377  *
378  * - Cleanup the children edges and nodes
379  *   If child node is also orphan (no parent) during cleanup, then the child
380  *   node will also be cleaned up.
381  *
382  * - Freeing up leaves (level 0), keeps nodes detached
383  *   For nodes, the node is still cached as "detached"
384  *
385  * Return false if @node is not in the @useless_nodes list.
386  * Return true if @node is in the @useless_nodes list.
387  */
388 static bool handle_useless_nodes(struct reloc_control *rc,
389                                  struct btrfs_backref_node *node)
390 {
391         struct btrfs_backref_cache *cache = &rc->backref_cache;
392         struct list_head *useless_node = &cache->useless_node;
393         bool ret = false;
394
395         while (!list_empty(useless_node)) {
396                 struct btrfs_backref_node *cur;
397
398                 cur = list_first_entry(useless_node, struct btrfs_backref_node,
399                                  list);
400                 list_del_init(&cur->list);
401
402                 /* Only tree root nodes can be added to @useless_nodes */
403                 ASSERT(list_empty(&cur->upper));
404
405                 if (cur == node)
406                         ret = true;
407
408                 /* The node is the lowest node */
409                 if (cur->lowest) {
410                         list_del_init(&cur->lower);
411                         cur->lowest = 0;
412                 }
413
414                 /* Cleanup the lower edges */
415                 while (!list_empty(&cur->lower)) {
416                         struct btrfs_backref_edge *edge;
417                         struct btrfs_backref_node *lower;
418
419                         edge = list_entry(cur->lower.next,
420                                         struct btrfs_backref_edge, list[UPPER]);
421                         list_del(&edge->list[UPPER]);
422                         list_del(&edge->list[LOWER]);
423                         lower = edge->node[LOWER];
424                         btrfs_backref_free_edge(cache, edge);
425
426                         /* Child node is also orphan, queue for cleanup */
427                         if (list_empty(&lower->upper))
428                                 list_add(&lower->list, useless_node);
429                 }
430                 /* Mark this block processed for relocation */
431                 mark_block_processed(rc, cur);
432
433                 /*
434                  * Backref nodes for tree leaves are deleted from the cache.
435                  * Backref nodes for upper level tree blocks are left in the
436                  * cache to avoid unnecessary backref lookup.
437                  */
438                 if (cur->level > 0) {
439                         list_add(&cur->list, &cache->detached);
440                         cur->detached = 1;
441                 } else {
442                         rb_erase(&cur->rb_node, &cache->rb_root);
443                         btrfs_backref_free_node(cache, cur);
444                 }
445         }
446         return ret;
447 }
448
449 /*
450  * Build backref tree for a given tree block. Root of the backref tree
451  * corresponds the tree block, leaves of the backref tree correspond roots of
452  * b-trees that reference the tree block.
453  *
454  * The basic idea of this function is check backrefs of a given block to find
455  * upper level blocks that reference the block, and then check backrefs of
456  * these upper level blocks recursively. The recursion stops when tree root is
457  * reached or backrefs for the block is cached.
458  *
459  * NOTE: if we find that backrefs for a block are cached, we know backrefs for
460  * all upper level blocks that directly/indirectly reference the block are also
461  * cached.
462  */
463 static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
464                         struct btrfs_trans_handle *trans,
465                         struct reloc_control *rc, struct btrfs_key *node_key,
466                         int level, u64 bytenr)
467 {
468         struct btrfs_backref_iter *iter;
469         struct btrfs_backref_cache *cache = &rc->backref_cache;
470         /* For searching parent of TREE_BLOCK_REF */
471         struct btrfs_path *path;
472         struct btrfs_backref_node *cur;
473         struct btrfs_backref_node *node = NULL;
474         struct btrfs_backref_edge *edge;
475         int ret;
476         int err = 0;
477
478         iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info);
479         if (!iter)
480                 return ERR_PTR(-ENOMEM);
481         path = btrfs_alloc_path();
482         if (!path) {
483                 err = -ENOMEM;
484                 goto out;
485         }
486
487         node = btrfs_backref_alloc_node(cache, bytenr, level);
488         if (!node) {
489                 err = -ENOMEM;
490                 goto out;
491         }
492
493         node->lowest = 1;
494         cur = node;
495
496         /* Breadth-first search to build backref cache */
497         do {
498                 ret = btrfs_backref_add_tree_node(trans, cache, path, iter,
499                                                   node_key, cur);
500                 if (ret < 0) {
501                         err = ret;
502                         goto out;
503                 }
504                 edge = list_first_entry_or_null(&cache->pending_edge,
505                                 struct btrfs_backref_edge, list[UPPER]);
506                 /*
507                  * The pending list isn't empty, take the first block to
508                  * process
509                  */
510                 if (edge) {
511                         list_del_init(&edge->list[UPPER]);
512                         cur = edge->node[UPPER];
513                 }
514         } while (edge);
515
516         /* Finish the upper linkage of newly added edges/nodes */
517         ret = btrfs_backref_finish_upper_links(cache, node);
518         if (ret < 0) {
519                 err = ret;
520                 goto out;
521         }
522
523         if (handle_useless_nodes(rc, node))
524                 node = NULL;
525 out:
526         btrfs_backref_iter_free(iter);
527         btrfs_free_path(path);
528         if (err) {
529                 btrfs_backref_error_cleanup(cache, node);
530                 return ERR_PTR(err);
531         }
532         ASSERT(!node || !node->detached);
533         ASSERT(list_empty(&cache->useless_node) &&
534                list_empty(&cache->pending_edge));
535         return node;
536 }
537
538 /*
539  * helper to add backref node for the newly created snapshot.
540  * the backref node is created by cloning backref node that
541  * corresponds to root of source tree
542  */
543 static int clone_backref_node(struct btrfs_trans_handle *trans,
544                               struct reloc_control *rc,
545                               const struct btrfs_root *src,
546                               struct btrfs_root *dest)
547 {
548         struct btrfs_root *reloc_root = src->reloc_root;
549         struct btrfs_backref_cache *cache = &rc->backref_cache;
550         struct btrfs_backref_node *node = NULL;
551         struct btrfs_backref_node *new_node;
552         struct btrfs_backref_edge *edge;
553         struct btrfs_backref_edge *new_edge;
554         struct rb_node *rb_node;
555
556         if (cache->last_trans > 0)
557                 update_backref_cache(trans, cache);
558
559         rb_node = rb_simple_search(&cache->rb_root, src->commit_root->start);
560         if (rb_node) {
561                 node = rb_entry(rb_node, struct btrfs_backref_node, rb_node);
562                 if (node->detached)
563                         node = NULL;
564                 else
565                         BUG_ON(node->new_bytenr != reloc_root->node->start);
566         }
567
568         if (!node) {
569                 rb_node = rb_simple_search(&cache->rb_root,
570                                            reloc_root->commit_root->start);
571                 if (rb_node) {
572                         node = rb_entry(rb_node, struct btrfs_backref_node,
573                                         rb_node);
574                         BUG_ON(node->detached);
575                 }
576         }
577
578         if (!node)
579                 return 0;
580
581         new_node = btrfs_backref_alloc_node(cache, dest->node->start,
582                                             node->level);
583         if (!new_node)
584                 return -ENOMEM;
585
586         new_node->lowest = node->lowest;
587         new_node->checked = 1;
588         new_node->root = btrfs_grab_root(dest);
589         ASSERT(new_node->root);
590
591         if (!node->lowest) {
592                 list_for_each_entry(edge, &node->lower, list[UPPER]) {
593                         new_edge = btrfs_backref_alloc_edge(cache);
594                         if (!new_edge)
595                                 goto fail;
596
597                         btrfs_backref_link_edge(new_edge, edge->node[LOWER],
598                                                 new_node, LINK_UPPER);
599                 }
600         } else {
601                 list_add_tail(&new_node->lower, &cache->leaves);
602         }
603
604         rb_node = rb_simple_insert(&cache->rb_root, new_node->bytenr,
605                                    &new_node->rb_node);
606         if (rb_node)
607                 btrfs_backref_panic(trans->fs_info, new_node->bytenr, -EEXIST);
608
609         if (!new_node->lowest) {
610                 list_for_each_entry(new_edge, &new_node->lower, list[UPPER]) {
611                         list_add_tail(&new_edge->list[LOWER],
612                                       &new_edge->node[LOWER]->upper);
613                 }
614         }
615         return 0;
616 fail:
617         while (!list_empty(&new_node->lower)) {
618                 new_edge = list_entry(new_node->lower.next,
619                                       struct btrfs_backref_edge, list[UPPER]);
620                 list_del(&new_edge->list[UPPER]);
621                 btrfs_backref_free_edge(cache, new_edge);
622         }
623         btrfs_backref_free_node(cache, new_node);
624         return -ENOMEM;
625 }
626
627 /*
628  * helper to add 'address of tree root -> reloc tree' mapping
629  */
630 static int __add_reloc_root(struct btrfs_root *root)
631 {
632         struct btrfs_fs_info *fs_info = root->fs_info;
633         struct rb_node *rb_node;
634         struct mapping_node *node;
635         struct reloc_control *rc = fs_info->reloc_ctl;
636
637         node = kmalloc(sizeof(*node), GFP_NOFS);
638         if (!node)
639                 return -ENOMEM;
640
641         node->bytenr = root->commit_root->start;
642         node->data = root;
643
644         spin_lock(&rc->reloc_root_tree.lock);
645         rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
646                                    node->bytenr, &node->rb_node);
647         spin_unlock(&rc->reloc_root_tree.lock);
648         if (rb_node) {
649                 btrfs_err(fs_info,
650                             "Duplicate root found for start=%llu while inserting into relocation tree",
651                             node->bytenr);
652                 return -EEXIST;
653         }
654
655         list_add_tail(&root->root_list, &rc->reloc_roots);
656         return 0;
657 }
658
659 /*
660  * helper to delete the 'address of tree root -> reloc tree'
661  * mapping
662  */
663 static void __del_reloc_root(struct btrfs_root *root)
664 {
665         struct btrfs_fs_info *fs_info = root->fs_info;
666         struct rb_node *rb_node;
667         struct mapping_node *node = NULL;
668         struct reloc_control *rc = fs_info->reloc_ctl;
669         bool put_ref = false;
670
671         if (rc && root->node) {
672                 spin_lock(&rc->reloc_root_tree.lock);
673                 rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
674                                            root->commit_root->start);
675                 if (rb_node) {
676                         node = rb_entry(rb_node, struct mapping_node, rb_node);
677                         rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
678                         RB_CLEAR_NODE(&node->rb_node);
679                 }
680                 spin_unlock(&rc->reloc_root_tree.lock);
681                 ASSERT(!node || (struct btrfs_root *)node->data == root);
682         }
683
684         /*
685          * We only put the reloc root here if it's on the list.  There's a lot
686          * of places where the pattern is to splice the rc->reloc_roots, process
687          * the reloc roots, and then add the reloc root back onto
688          * rc->reloc_roots.  If we call __del_reloc_root while it's off of the
689          * list we don't want the reference being dropped, because the guy
690          * messing with the list is in charge of the reference.
691          */
692         spin_lock(&fs_info->trans_lock);
693         if (!list_empty(&root->root_list)) {
694                 put_ref = true;
695                 list_del_init(&root->root_list);
696         }
697         spin_unlock(&fs_info->trans_lock);
698         if (put_ref)
699                 btrfs_put_root(root);
700         kfree(node);
701 }
702
703 /*
704  * helper to update the 'address of tree root -> reloc tree'
705  * mapping
706  */
707 static int __update_reloc_root(struct btrfs_root *root)
708 {
709         struct btrfs_fs_info *fs_info = root->fs_info;
710         struct rb_node *rb_node;
711         struct mapping_node *node = NULL;
712         struct reloc_control *rc = fs_info->reloc_ctl;
713
714         spin_lock(&rc->reloc_root_tree.lock);
715         rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root,
716                                    root->commit_root->start);
717         if (rb_node) {
718                 node = rb_entry(rb_node, struct mapping_node, rb_node);
719                 rb_erase(&node->rb_node, &rc->reloc_root_tree.rb_root);
720         }
721         spin_unlock(&rc->reloc_root_tree.lock);
722
723         if (!node)
724                 return 0;
725         BUG_ON((struct btrfs_root *)node->data != root);
726
727         spin_lock(&rc->reloc_root_tree.lock);
728         node->bytenr = root->node->start;
729         rb_node = rb_simple_insert(&rc->reloc_root_tree.rb_root,
730                                    node->bytenr, &node->rb_node);
731         spin_unlock(&rc->reloc_root_tree.lock);
732         if (rb_node)
733                 btrfs_backref_panic(fs_info, node->bytenr, -EEXIST);
734         return 0;
735 }
736
737 static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans,
738                                         struct btrfs_root *root, u64 objectid)
739 {
740         struct btrfs_fs_info *fs_info = root->fs_info;
741         struct btrfs_root *reloc_root;
742         struct extent_buffer *eb;
743         struct btrfs_root_item *root_item;
744         struct btrfs_key root_key;
745         int ret = 0;
746         bool must_abort = false;
747
748         root_item = kmalloc(sizeof(*root_item), GFP_NOFS);
749         if (!root_item)
750                 return ERR_PTR(-ENOMEM);
751
752         root_key.objectid = BTRFS_TREE_RELOC_OBJECTID;
753         root_key.type = BTRFS_ROOT_ITEM_KEY;
754         root_key.offset = objectid;
755
756         if (root->root_key.objectid == objectid) {
757                 u64 commit_root_gen;
758
759                 /* called by btrfs_init_reloc_root */
760                 ret = btrfs_copy_root(trans, root, root->commit_root, &eb,
761                                       BTRFS_TREE_RELOC_OBJECTID);
762                 if (ret)
763                         goto fail;
764
765                 /*
766                  * Set the last_snapshot field to the generation of the commit
767                  * root - like this ctree.c:btrfs_block_can_be_shared() behaves
768                  * correctly (returns true) when the relocation root is created
769                  * either inside the critical section of a transaction commit
770                  * (through transaction.c:qgroup_account_snapshot()) and when
771                  * it's created before the transaction commit is started.
772                  */
773                 commit_root_gen = btrfs_header_generation(root->commit_root);
774                 btrfs_set_root_last_snapshot(&root->root_item, commit_root_gen);
775         } else {
776                 /*
777                  * called by btrfs_reloc_post_snapshot_hook.
778                  * the source tree is a reloc tree, all tree blocks
779                  * modified after it was created have RELOC flag
780                  * set in their headers. so it's OK to not update
781                  * the 'last_snapshot'.
782                  */
783                 ret = btrfs_copy_root(trans, root, root->node, &eb,
784                                       BTRFS_TREE_RELOC_OBJECTID);
785                 if (ret)
786                         goto fail;
787         }
788
789         /*
790          * We have changed references at this point, we must abort the
791          * transaction if anything fails.
792          */
793         must_abort = true;
794
795         memcpy(root_item, &root->root_item, sizeof(*root_item));
796         btrfs_set_root_bytenr(root_item, eb->start);
797         btrfs_set_root_level(root_item, btrfs_header_level(eb));
798         btrfs_set_root_generation(root_item, trans->transid);
799
800         if (root->root_key.objectid == objectid) {
801                 btrfs_set_root_refs(root_item, 0);
802                 memset(&root_item->drop_progress, 0,
803                        sizeof(struct btrfs_disk_key));
804                 btrfs_set_root_drop_level(root_item, 0);
805         }
806
807         btrfs_tree_unlock(eb);
808         free_extent_buffer(eb);
809
810         ret = btrfs_insert_root(trans, fs_info->tree_root,
811                                 &root_key, root_item);
812         if (ret)
813                 goto fail;
814
815         kfree(root_item);
816
817         reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key);
818         if (IS_ERR(reloc_root)) {
819                 ret = PTR_ERR(reloc_root);
820                 goto abort;
821         }
822         set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
823         reloc_root->last_trans = trans->transid;
824         return reloc_root;
825 fail:
826         kfree(root_item);
827 abort:
828         if (must_abort)
829                 btrfs_abort_transaction(trans, ret);
830         return ERR_PTR(ret);
831 }
832
833 /*
834  * create reloc tree for a given fs tree. reloc tree is just a
835  * snapshot of the fs tree with special root objectid.
836  *
837  * The reloc_root comes out of here with two references, one for
838  * root->reloc_root, and another for being on the rc->reloc_roots list.
839  */
840 int btrfs_init_reloc_root(struct btrfs_trans_handle *trans,
841                           struct btrfs_root *root)
842 {
843         struct btrfs_fs_info *fs_info = root->fs_info;
844         struct btrfs_root *reloc_root;
845         struct reloc_control *rc = fs_info->reloc_ctl;
846         struct btrfs_block_rsv *rsv;
847         int clear_rsv = 0;
848         int ret;
849
850         if (!rc)
851                 return 0;
852
853         /*
854          * The subvolume has reloc tree but the swap is finished, no need to
855          * create/update the dead reloc tree
856          */
857         if (reloc_root_is_dead(root))
858                 return 0;
859
860         /*
861          * This is subtle but important.  We do not do
862          * record_root_in_transaction for reloc roots, instead we record their
863          * corresponding fs root, and then here we update the last trans for the
864          * reloc root.  This means that we have to do this for the entire life
865          * of the reloc root, regardless of which stage of the relocation we are
866          * in.
867          */
868         if (root->reloc_root) {
869                 reloc_root = root->reloc_root;
870                 reloc_root->last_trans = trans->transid;
871                 return 0;
872         }
873
874         /*
875          * We are merging reloc roots, we do not need new reloc trees.  Also
876          * reloc trees never need their own reloc tree.
877          */
878         if (!rc->create_reloc_tree ||
879             root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
880                 return 0;
881
882         if (!trans->reloc_reserved) {
883                 rsv = trans->block_rsv;
884                 trans->block_rsv = rc->block_rsv;
885                 clear_rsv = 1;
886         }
887         reloc_root = create_reloc_root(trans, root, root->root_key.objectid);
888         if (clear_rsv)
889                 trans->block_rsv = rsv;
890         if (IS_ERR(reloc_root))
891                 return PTR_ERR(reloc_root);
892
893         ret = __add_reloc_root(reloc_root);
894         ASSERT(ret != -EEXIST);
895         if (ret) {
896                 /* Pairs with create_reloc_root */
897                 btrfs_put_root(reloc_root);
898                 return ret;
899         }
900         root->reloc_root = btrfs_grab_root(reloc_root);
901         return 0;
902 }
903
904 /*
905  * update root item of reloc tree
906  */
907 int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
908                             struct btrfs_root *root)
909 {
910         struct btrfs_fs_info *fs_info = root->fs_info;
911         struct btrfs_root *reloc_root;
912         struct btrfs_root_item *root_item;
913         int ret;
914
915         if (!have_reloc_root(root))
916                 return 0;
917
918         reloc_root = root->reloc_root;
919         root_item = &reloc_root->root_item;
920
921         /*
922          * We are probably ok here, but __del_reloc_root() will drop its ref of
923          * the root.  We have the ref for root->reloc_root, but just in case
924          * hold it while we update the reloc root.
925          */
926         btrfs_grab_root(reloc_root);
927
928         /* root->reloc_root will stay until current relocation finished */
929         if (fs_info->reloc_ctl->merge_reloc_tree &&
930             btrfs_root_refs(root_item) == 0) {
931                 set_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
932                 /*
933                  * Mark the tree as dead before we change reloc_root so
934                  * have_reloc_root will not touch it from now on.
935                  */
936                 smp_wmb();
937                 __del_reloc_root(reloc_root);
938         }
939
940         if (reloc_root->commit_root != reloc_root->node) {
941                 __update_reloc_root(reloc_root);
942                 btrfs_set_root_node(root_item, reloc_root->node);
943                 free_extent_buffer(reloc_root->commit_root);
944                 reloc_root->commit_root = btrfs_root_node(reloc_root);
945         }
946
947         ret = btrfs_update_root(trans, fs_info->tree_root,
948                                 &reloc_root->root_key, root_item);
949         btrfs_put_root(reloc_root);
950         return ret;
951 }
952
953 /*
954  * helper to find first cached inode with inode number >= objectid
955  * in a subvolume
956  */
957 static struct inode *find_next_inode(struct btrfs_root *root, u64 objectid)
958 {
959         struct rb_node *node;
960         struct rb_node *prev;
961         struct btrfs_inode *entry;
962         struct inode *inode;
963
964         spin_lock(&root->inode_lock);
965 again:
966         node = root->inode_tree.rb_node;
967         prev = NULL;
968         while (node) {
969                 prev = node;
970                 entry = rb_entry(node, struct btrfs_inode, rb_node);
971
972                 if (objectid < btrfs_ino(entry))
973                         node = node->rb_left;
974                 else if (objectid > btrfs_ino(entry))
975                         node = node->rb_right;
976                 else
977                         break;
978         }
979         if (!node) {
980                 while (prev) {
981                         entry = rb_entry(prev, struct btrfs_inode, rb_node);
982                         if (objectid <= btrfs_ino(entry)) {
983                                 node = prev;
984                                 break;
985                         }
986                         prev = rb_next(prev);
987                 }
988         }
989         while (node) {
990                 entry = rb_entry(node, struct btrfs_inode, rb_node);
991                 inode = igrab(&entry->vfs_inode);
992                 if (inode) {
993                         spin_unlock(&root->inode_lock);
994                         return inode;
995                 }
996
997                 objectid = btrfs_ino(entry) + 1;
998                 if (cond_resched_lock(&root->inode_lock))
999                         goto again;
1000
1001                 node = rb_next(node);
1002         }
1003         spin_unlock(&root->inode_lock);
1004         return NULL;
1005 }
1006
1007 /*
1008  * get new location of data
1009  */
1010 static int get_new_location(struct inode *reloc_inode, u64 *new_bytenr,
1011                             u64 bytenr, u64 num_bytes)
1012 {
1013         struct btrfs_root *root = BTRFS_I(reloc_inode)->root;
1014         struct btrfs_path *path;
1015         struct btrfs_file_extent_item *fi;
1016         struct extent_buffer *leaf;
1017         int ret;
1018
1019         path = btrfs_alloc_path();
1020         if (!path)
1021                 return -ENOMEM;
1022
1023         bytenr -= BTRFS_I(reloc_inode)->index_cnt;
1024         ret = btrfs_lookup_file_extent(NULL, root, path,
1025                         btrfs_ino(BTRFS_I(reloc_inode)), bytenr, 0);
1026         if (ret < 0)
1027                 goto out;
1028         if (ret > 0) {
1029                 ret = -ENOENT;
1030                 goto out;
1031         }
1032
1033         leaf = path->nodes[0];
1034         fi = btrfs_item_ptr(leaf, path->slots[0],
1035                             struct btrfs_file_extent_item);
1036
1037         BUG_ON(btrfs_file_extent_offset(leaf, fi) ||
1038                btrfs_file_extent_compression(leaf, fi) ||
1039                btrfs_file_extent_encryption(leaf, fi) ||
1040                btrfs_file_extent_other_encoding(leaf, fi));
1041
1042         if (num_bytes != btrfs_file_extent_disk_num_bytes(leaf, fi)) {
1043                 ret = -EINVAL;
1044                 goto out;
1045         }
1046
1047         *new_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1048         ret = 0;
1049 out:
1050         btrfs_free_path(path);
1051         return ret;
1052 }
1053
1054 /*
1055  * update file extent items in the tree leaf to point to
1056  * the new locations.
1057  */
1058 static noinline_for_stack
1059 int replace_file_extents(struct btrfs_trans_handle *trans,
1060                          struct reloc_control *rc,
1061                          struct btrfs_root *root,
1062                          struct extent_buffer *leaf)
1063 {
1064         struct btrfs_fs_info *fs_info = root->fs_info;
1065         struct btrfs_key key;
1066         struct btrfs_file_extent_item *fi;
1067         struct inode *inode = NULL;
1068         u64 parent;
1069         u64 bytenr;
1070         u64 new_bytenr = 0;
1071         u64 num_bytes;
1072         u64 end;
1073         u32 nritems;
1074         u32 i;
1075         int ret = 0;
1076         int first = 1;
1077         int dirty = 0;
1078
1079         if (rc->stage != UPDATE_DATA_PTRS)
1080                 return 0;
1081
1082         /* reloc trees always use full backref */
1083         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
1084                 parent = leaf->start;
1085         else
1086                 parent = 0;
1087
1088         nritems = btrfs_header_nritems(leaf);
1089         for (i = 0; i < nritems; i++) {
1090                 struct btrfs_ref ref = { 0 };
1091
1092                 cond_resched();
1093                 btrfs_item_key_to_cpu(leaf, &key, i);
1094                 if (key.type != BTRFS_EXTENT_DATA_KEY)
1095                         continue;
1096                 fi = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
1097                 if (btrfs_file_extent_type(leaf, fi) ==
1098                     BTRFS_FILE_EXTENT_INLINE)
1099                         continue;
1100                 bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
1101                 num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
1102                 if (bytenr == 0)
1103                         continue;
1104                 if (!in_range(bytenr, rc->block_group->start,
1105                               rc->block_group->length))
1106                         continue;
1107
1108                 /*
1109                  * if we are modifying block in fs tree, wait for read_folio
1110                  * to complete and drop the extent cache
1111                  */
1112                 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1113                         if (first) {
1114                                 inode = find_next_inode(root, key.objectid);
1115                                 first = 0;
1116                         } else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) {
1117                                 btrfs_add_delayed_iput(BTRFS_I(inode));
1118                                 inode = find_next_inode(root, key.objectid);
1119                         }
1120                         if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) {
1121                                 struct extent_state *cached_state = NULL;
1122
1123                                 end = key.offset +
1124                                       btrfs_file_extent_num_bytes(leaf, fi);
1125                                 WARN_ON(!IS_ALIGNED(key.offset,
1126                                                     fs_info->sectorsize));
1127                                 WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
1128                                 end--;
1129                                 ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
1130                                                       key.offset, end,
1131                                                       &cached_state);
1132                                 if (!ret)
1133                                         continue;
1134
1135                                 btrfs_drop_extent_map_range(BTRFS_I(inode),
1136                                                             key.offset, end, true);
1137                                 unlock_extent(&BTRFS_I(inode)->io_tree,
1138                                               key.offset, end, &cached_state);
1139                         }
1140                 }
1141
1142                 ret = get_new_location(rc->data_inode, &new_bytenr,
1143                                        bytenr, num_bytes);
1144                 if (ret) {
1145                         /*
1146                          * Don't have to abort since we've not changed anything
1147                          * in the file extent yet.
1148                          */
1149                         break;
1150                 }
1151
1152                 btrfs_set_file_extent_disk_bytenr(leaf, fi, new_bytenr);
1153                 dirty = 1;
1154
1155                 key.offset -= btrfs_file_extent_offset(leaf, fi);
1156                 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
1157                                        num_bytes, parent, root->root_key.objectid);
1158                 btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
1159                                     key.objectid, key.offset,
1160                                     root->root_key.objectid, false);
1161                 ret = btrfs_inc_extent_ref(trans, &ref);
1162                 if (ret) {
1163                         btrfs_abort_transaction(trans, ret);
1164                         break;
1165                 }
1166
1167                 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
1168                                        num_bytes, parent, root->root_key.objectid);
1169                 btrfs_init_data_ref(&ref, btrfs_header_owner(leaf),
1170                                     key.objectid, key.offset,
1171                                     root->root_key.objectid, false);
1172                 ret = btrfs_free_extent(trans, &ref);
1173                 if (ret) {
1174                         btrfs_abort_transaction(trans, ret);
1175                         break;
1176                 }
1177         }
1178         if (dirty)
1179                 btrfs_mark_buffer_dirty(trans, leaf);
1180         if (inode)
1181                 btrfs_add_delayed_iput(BTRFS_I(inode));
1182         return ret;
1183 }
1184
1185 static noinline_for_stack int memcmp_node_keys(const struct extent_buffer *eb,
1186                                                int slot, const struct btrfs_path *path,
1187                                                int level)
1188 {
1189         struct btrfs_disk_key key1;
1190         struct btrfs_disk_key key2;
1191         btrfs_node_key(eb, &key1, slot);
1192         btrfs_node_key(path->nodes[level], &key2, path->slots[level]);
1193         return memcmp(&key1, &key2, sizeof(key1));
1194 }
1195
1196 /*
1197  * try to replace tree blocks in fs tree with the new blocks
1198  * in reloc tree. tree blocks haven't been modified since the
1199  * reloc tree was create can be replaced.
1200  *
1201  * if a block was replaced, level of the block + 1 is returned.
1202  * if no block got replaced, 0 is returned. if there are other
1203  * errors, a negative error number is returned.
1204  */
1205 static noinline_for_stack
1206 int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc,
1207                  struct btrfs_root *dest, struct btrfs_root *src,
1208                  struct btrfs_path *path, struct btrfs_key *next_key,
1209                  int lowest_level, int max_level)
1210 {
1211         struct btrfs_fs_info *fs_info = dest->fs_info;
1212         struct extent_buffer *eb;
1213         struct extent_buffer *parent;
1214         struct btrfs_ref ref = { 0 };
1215         struct btrfs_key key;
1216         u64 old_bytenr;
1217         u64 new_bytenr;
1218         u64 old_ptr_gen;
1219         u64 new_ptr_gen;
1220         u64 last_snapshot;
1221         u32 blocksize;
1222         int cow = 0;
1223         int level;
1224         int ret;
1225         int slot;
1226
1227         ASSERT(src->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID);
1228         ASSERT(dest->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1229
1230         last_snapshot = btrfs_root_last_snapshot(&src->root_item);
1231 again:
1232         slot = path->slots[lowest_level];
1233         btrfs_node_key_to_cpu(path->nodes[lowest_level], &key, slot);
1234
1235         eb = btrfs_lock_root_node(dest);
1236         level = btrfs_header_level(eb);
1237
1238         if (level < lowest_level) {
1239                 btrfs_tree_unlock(eb);
1240                 free_extent_buffer(eb);
1241                 return 0;
1242         }
1243
1244         if (cow) {
1245                 ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb,
1246                                       BTRFS_NESTING_COW);
1247                 if (ret) {
1248                         btrfs_tree_unlock(eb);
1249                         free_extent_buffer(eb);
1250                         return ret;
1251                 }
1252         }
1253
1254         if (next_key) {
1255                 next_key->objectid = (u64)-1;
1256                 next_key->type = (u8)-1;
1257                 next_key->offset = (u64)-1;
1258         }
1259
1260         parent = eb;
1261         while (1) {
1262                 level = btrfs_header_level(parent);
1263                 ASSERT(level >= lowest_level);
1264
1265                 ret = btrfs_bin_search(parent, 0, &key, &slot);
1266                 if (ret < 0)
1267                         break;
1268                 if (ret && slot > 0)
1269                         slot--;
1270
1271                 if (next_key && slot + 1 < btrfs_header_nritems(parent))
1272                         btrfs_node_key_to_cpu(parent, next_key, slot + 1);
1273
1274                 old_bytenr = btrfs_node_blockptr(parent, slot);
1275                 blocksize = fs_info->nodesize;
1276                 old_ptr_gen = btrfs_node_ptr_generation(parent, slot);
1277
1278                 if (level <= max_level) {
1279                         eb = path->nodes[level];
1280                         new_bytenr = btrfs_node_blockptr(eb,
1281                                                         path->slots[level]);
1282                         new_ptr_gen = btrfs_node_ptr_generation(eb,
1283                                                         path->slots[level]);
1284                 } else {
1285                         new_bytenr = 0;
1286                         new_ptr_gen = 0;
1287                 }
1288
1289                 if (WARN_ON(new_bytenr > 0 && new_bytenr == old_bytenr)) {
1290                         ret = level;
1291                         break;
1292                 }
1293
1294                 if (new_bytenr == 0 || old_ptr_gen > last_snapshot ||
1295                     memcmp_node_keys(parent, slot, path, level)) {
1296                         if (level <= lowest_level) {
1297                                 ret = 0;
1298                                 break;
1299                         }
1300
1301                         eb = btrfs_read_node_slot(parent, slot);
1302                         if (IS_ERR(eb)) {
1303                                 ret = PTR_ERR(eb);
1304                                 break;
1305                         }
1306                         btrfs_tree_lock(eb);
1307                         if (cow) {
1308                                 ret = btrfs_cow_block(trans, dest, eb, parent,
1309                                                       slot, &eb,
1310                                                       BTRFS_NESTING_COW);
1311                                 if (ret) {
1312                                         btrfs_tree_unlock(eb);
1313                                         free_extent_buffer(eb);
1314                                         break;
1315                                 }
1316                         }
1317
1318                         btrfs_tree_unlock(parent);
1319                         free_extent_buffer(parent);
1320
1321                         parent = eb;
1322                         continue;
1323                 }
1324
1325                 if (!cow) {
1326                         btrfs_tree_unlock(parent);
1327                         free_extent_buffer(parent);
1328                         cow = 1;
1329                         goto again;
1330                 }
1331
1332                 btrfs_node_key_to_cpu(path->nodes[level], &key,
1333                                       path->slots[level]);
1334                 btrfs_release_path(path);
1335
1336                 path->lowest_level = level;
1337                 set_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
1338                 ret = btrfs_search_slot(trans, src, &key, path, 0, 1);
1339                 clear_bit(BTRFS_ROOT_RESET_LOCKDEP_CLASS, &src->state);
1340                 path->lowest_level = 0;
1341                 if (ret) {
1342                         if (ret > 0)
1343                                 ret = -ENOENT;
1344                         break;
1345                 }
1346
1347                 /*
1348                  * Info qgroup to trace both subtrees.
1349                  *
1350                  * We must trace both trees.
1351                  * 1) Tree reloc subtree
1352                  *    If not traced, we will leak data numbers
1353                  * 2) Fs subtree
1354                  *    If not traced, we will double count old data
1355                  *
1356                  * We don't scan the subtree right now, but only record
1357                  * the swapped tree blocks.
1358                  * The real subtree rescan is delayed until we have new
1359                  * CoW on the subtree root node before transaction commit.
1360                  */
1361                 ret = btrfs_qgroup_add_swapped_blocks(trans, dest,
1362                                 rc->block_group, parent, slot,
1363                                 path->nodes[level], path->slots[level],
1364                                 last_snapshot);
1365                 if (ret < 0)
1366                         break;
1367                 /*
1368                  * swap blocks in fs tree and reloc tree.
1369                  */
1370                 btrfs_set_node_blockptr(parent, slot, new_bytenr);
1371                 btrfs_set_node_ptr_generation(parent, slot, new_ptr_gen);
1372                 btrfs_mark_buffer_dirty(trans, parent);
1373
1374                 btrfs_set_node_blockptr(path->nodes[level],
1375                                         path->slots[level], old_bytenr);
1376                 btrfs_set_node_ptr_generation(path->nodes[level],
1377                                               path->slots[level], old_ptr_gen);
1378                 btrfs_mark_buffer_dirty(trans, path->nodes[level]);
1379
1380                 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, old_bytenr,
1381                                        blocksize, path->nodes[level]->start,
1382                                        src->root_key.objectid);
1383                 btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
1384                                     0, true);
1385                 ret = btrfs_inc_extent_ref(trans, &ref);
1386                 if (ret) {
1387                         btrfs_abort_transaction(trans, ret);
1388                         break;
1389                 }
1390                 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr,
1391                                        blocksize, 0, dest->root_key.objectid);
1392                 btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid, 0,
1393                                     true);
1394                 ret = btrfs_inc_extent_ref(trans, &ref);
1395                 if (ret) {
1396                         btrfs_abort_transaction(trans, ret);
1397                         break;
1398                 }
1399
1400                 /* We don't know the real owning_root, use 0. */
1401                 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr,
1402                                        blocksize, path->nodes[level]->start, 0);
1403                 btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid,
1404                                     0, true);
1405                 ret = btrfs_free_extent(trans, &ref);
1406                 if (ret) {
1407                         btrfs_abort_transaction(trans, ret);
1408                         break;
1409                 }
1410
1411                 /* We don't know the real owning_root, use 0. */
1412                 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr,
1413                                        blocksize, 0, 0);
1414                 btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid,
1415                                     0, true);
1416                 ret = btrfs_free_extent(trans, &ref);
1417                 if (ret) {
1418                         btrfs_abort_transaction(trans, ret);
1419                         break;
1420                 }
1421
1422                 btrfs_unlock_up_safe(path, 0);
1423
1424                 ret = level;
1425                 break;
1426         }
1427         btrfs_tree_unlock(parent);
1428         free_extent_buffer(parent);
1429         return ret;
1430 }
1431
1432 /*
1433  * helper to find next relocated block in reloc tree
1434  */
1435 static noinline_for_stack
1436 int walk_up_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1437                        int *level)
1438 {
1439         struct extent_buffer *eb;
1440         int i;
1441         u64 last_snapshot;
1442         u32 nritems;
1443
1444         last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1445
1446         for (i = 0; i < *level; i++) {
1447                 free_extent_buffer(path->nodes[i]);
1448                 path->nodes[i] = NULL;
1449         }
1450
1451         for (i = *level; i < BTRFS_MAX_LEVEL && path->nodes[i]; i++) {
1452                 eb = path->nodes[i];
1453                 nritems = btrfs_header_nritems(eb);
1454                 while (path->slots[i] + 1 < nritems) {
1455                         path->slots[i]++;
1456                         if (btrfs_node_ptr_generation(eb, path->slots[i]) <=
1457                             last_snapshot)
1458                                 continue;
1459
1460                         *level = i;
1461                         return 0;
1462                 }
1463                 free_extent_buffer(path->nodes[i]);
1464                 path->nodes[i] = NULL;
1465         }
1466         return 1;
1467 }
1468
1469 /*
1470  * walk down reloc tree to find relocated block of lowest level
1471  */
1472 static noinline_for_stack
1473 int walk_down_reloc_tree(struct btrfs_root *root, struct btrfs_path *path,
1474                          int *level)
1475 {
1476         struct extent_buffer *eb = NULL;
1477         int i;
1478         u64 ptr_gen = 0;
1479         u64 last_snapshot;
1480         u32 nritems;
1481
1482         last_snapshot = btrfs_root_last_snapshot(&root->root_item);
1483
1484         for (i = *level; i > 0; i--) {
1485                 eb = path->nodes[i];
1486                 nritems = btrfs_header_nritems(eb);
1487                 while (path->slots[i] < nritems) {
1488                         ptr_gen = btrfs_node_ptr_generation(eb, path->slots[i]);
1489                         if (ptr_gen > last_snapshot)
1490                                 break;
1491                         path->slots[i]++;
1492                 }
1493                 if (path->slots[i] >= nritems) {
1494                         if (i == *level)
1495                                 break;
1496                         *level = i + 1;
1497                         return 0;
1498                 }
1499                 if (i == 1) {
1500                         *level = i;
1501                         return 0;
1502                 }
1503
1504                 eb = btrfs_read_node_slot(eb, path->slots[i]);
1505                 if (IS_ERR(eb))
1506                         return PTR_ERR(eb);
1507                 BUG_ON(btrfs_header_level(eb) != i - 1);
1508                 path->nodes[i - 1] = eb;
1509                 path->slots[i - 1] = 0;
1510         }
1511         return 1;
1512 }
1513
1514 /*
1515  * invalidate extent cache for file extents whose key in range of
1516  * [min_key, max_key)
1517  */
1518 static int invalidate_extent_cache(struct btrfs_root *root,
1519                                    const struct btrfs_key *min_key,
1520                                    const struct btrfs_key *max_key)
1521 {
1522         struct btrfs_fs_info *fs_info = root->fs_info;
1523         struct inode *inode = NULL;
1524         u64 objectid;
1525         u64 start, end;
1526         u64 ino;
1527
1528         objectid = min_key->objectid;
1529         while (1) {
1530                 struct extent_state *cached_state = NULL;
1531
1532                 cond_resched();
1533                 iput(inode);
1534
1535                 if (objectid > max_key->objectid)
1536                         break;
1537
1538                 inode = find_next_inode(root, objectid);
1539                 if (!inode)
1540                         break;
1541                 ino = btrfs_ino(BTRFS_I(inode));
1542
1543                 if (ino > max_key->objectid) {
1544                         iput(inode);
1545                         break;
1546                 }
1547
1548                 objectid = ino + 1;
1549                 if (!S_ISREG(inode->i_mode))
1550                         continue;
1551
1552                 if (unlikely(min_key->objectid == ino)) {
1553                         if (min_key->type > BTRFS_EXTENT_DATA_KEY)
1554                                 continue;
1555                         if (min_key->type < BTRFS_EXTENT_DATA_KEY)
1556                                 start = 0;
1557                         else {
1558                                 start = min_key->offset;
1559                                 WARN_ON(!IS_ALIGNED(start, fs_info->sectorsize));
1560                         }
1561                 } else {
1562                         start = 0;
1563                 }
1564
1565                 if (unlikely(max_key->objectid == ino)) {
1566                         if (max_key->type < BTRFS_EXTENT_DATA_KEY)
1567                                 continue;
1568                         if (max_key->type > BTRFS_EXTENT_DATA_KEY) {
1569                                 end = (u64)-1;
1570                         } else {
1571                                 if (max_key->offset == 0)
1572                                         continue;
1573                                 end = max_key->offset;
1574                                 WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
1575                                 end--;
1576                         }
1577                 } else {
1578                         end = (u64)-1;
1579                 }
1580
1581                 /* the lock_extent waits for read_folio to complete */
1582                 lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
1583                 btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, true);
1584                 unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
1585         }
1586         return 0;
1587 }
1588
1589 static int find_next_key(struct btrfs_path *path, int level,
1590                          struct btrfs_key *key)
1591
1592 {
1593         while (level < BTRFS_MAX_LEVEL) {
1594                 if (!path->nodes[level])
1595                         break;
1596                 if (path->slots[level] + 1 <
1597                     btrfs_header_nritems(path->nodes[level])) {
1598                         btrfs_node_key_to_cpu(path->nodes[level], key,
1599                                               path->slots[level] + 1);
1600                         return 0;
1601                 }
1602                 level++;
1603         }
1604         return 1;
1605 }
1606
1607 /*
1608  * Insert current subvolume into reloc_control::dirty_subvol_roots
1609  */
1610 static int insert_dirty_subvol(struct btrfs_trans_handle *trans,
1611                                struct reloc_control *rc,
1612                                struct btrfs_root *root)
1613 {
1614         struct btrfs_root *reloc_root = root->reloc_root;
1615         struct btrfs_root_item *reloc_root_item;
1616         int ret;
1617
1618         /* @root must be a subvolume tree root with a valid reloc tree */
1619         ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
1620         ASSERT(reloc_root);
1621
1622         reloc_root_item = &reloc_root->root_item;
1623         memset(&reloc_root_item->drop_progress, 0,
1624                 sizeof(reloc_root_item->drop_progress));
1625         btrfs_set_root_drop_level(reloc_root_item, 0);
1626         btrfs_set_root_refs(reloc_root_item, 0);
1627         ret = btrfs_update_reloc_root(trans, root);
1628         if (ret)
1629                 return ret;
1630
1631         if (list_empty(&root->reloc_dirty_list)) {
1632                 btrfs_grab_root(root);
1633                 list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots);
1634         }
1635
1636         return 0;
1637 }
1638
1639 static int clean_dirty_subvols(struct reloc_control *rc)
1640 {
1641         struct btrfs_root *root;
1642         struct btrfs_root *next;
1643         int ret = 0;
1644         int ret2;
1645
1646         list_for_each_entry_safe(root, next, &rc->dirty_subvol_roots,
1647                                  reloc_dirty_list) {
1648                 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
1649                         /* Merged subvolume, cleanup its reloc root */
1650                         struct btrfs_root *reloc_root = root->reloc_root;
1651
1652                         list_del_init(&root->reloc_dirty_list);
1653                         root->reloc_root = NULL;
1654                         /*
1655                          * Need barrier to ensure clear_bit() only happens after
1656                          * root->reloc_root = NULL. Pairs with have_reloc_root.
1657                          */
1658                         smp_wmb();
1659                         clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE, &root->state);
1660                         if (reloc_root) {
1661                                 /*
1662                                  * btrfs_drop_snapshot drops our ref we hold for
1663                                  * ->reloc_root.  If it fails however we must
1664                                  * drop the ref ourselves.
1665                                  */
1666                                 ret2 = btrfs_drop_snapshot(reloc_root, 0, 1);
1667                                 if (ret2 < 0) {
1668                                         btrfs_put_root(reloc_root);
1669                                         if (!ret)
1670                                                 ret = ret2;
1671                                 }
1672                         }
1673                         btrfs_put_root(root);
1674                 } else {
1675                         /* Orphan reloc tree, just clean it up */
1676                         ret2 = btrfs_drop_snapshot(root, 0, 1);
1677                         if (ret2 < 0) {
1678                                 btrfs_put_root(root);
1679                                 if (!ret)
1680                                         ret = ret2;
1681                         }
1682                 }
1683         }
1684         return ret;
1685 }
1686
1687 /*
1688  * merge the relocated tree blocks in reloc tree with corresponding
1689  * fs tree.
1690  */
1691 static noinline_for_stack int merge_reloc_root(struct reloc_control *rc,
1692                                                struct btrfs_root *root)
1693 {
1694         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
1695         struct btrfs_key key;
1696         struct btrfs_key next_key;
1697         struct btrfs_trans_handle *trans = NULL;
1698         struct btrfs_root *reloc_root;
1699         struct btrfs_root_item *root_item;
1700         struct btrfs_path *path;
1701         struct extent_buffer *leaf;
1702         int reserve_level;
1703         int level;
1704         int max_level;
1705         int replaced = 0;
1706         int ret = 0;
1707         u32 min_reserved;
1708
1709         path = btrfs_alloc_path();
1710         if (!path)
1711                 return -ENOMEM;
1712         path->reada = READA_FORWARD;
1713
1714         reloc_root = root->reloc_root;
1715         root_item = &reloc_root->root_item;
1716
1717         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
1718                 level = btrfs_root_level(root_item);
1719                 atomic_inc(&reloc_root->node->refs);
1720                 path->nodes[level] = reloc_root->node;
1721                 path->slots[level] = 0;
1722         } else {
1723                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
1724
1725                 level = btrfs_root_drop_level(root_item);
1726                 BUG_ON(level == 0);
1727                 path->lowest_level = level;
1728                 ret = btrfs_search_slot(NULL, reloc_root, &key, path, 0, 0);
1729                 path->lowest_level = 0;
1730                 if (ret < 0) {
1731                         btrfs_free_path(path);
1732                         return ret;
1733                 }
1734
1735                 btrfs_node_key_to_cpu(path->nodes[level], &next_key,
1736                                       path->slots[level]);
1737                 WARN_ON(memcmp(&key, &next_key, sizeof(key)));
1738
1739                 btrfs_unlock_up_safe(path, 0);
1740         }
1741
1742         /*
1743          * In merge_reloc_root(), we modify the upper level pointer to swap the
1744          * tree blocks between reloc tree and subvolume tree.  Thus for tree
1745          * block COW, we COW at most from level 1 to root level for each tree.
1746          *
1747          * Thus the needed metadata size is at most root_level * nodesize,
1748          * and * 2 since we have two trees to COW.
1749          */
1750         reserve_level = max_t(int, 1, btrfs_root_level(root_item));
1751         min_reserved = fs_info->nodesize * reserve_level * 2;
1752         memset(&next_key, 0, sizeof(next_key));
1753
1754         while (1) {
1755                 ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv,
1756                                              min_reserved,
1757                                              BTRFS_RESERVE_FLUSH_LIMIT);
1758                 if (ret)
1759                         goto out;
1760                 trans = btrfs_start_transaction(root, 0);
1761                 if (IS_ERR(trans)) {
1762                         ret = PTR_ERR(trans);
1763                         trans = NULL;
1764                         goto out;
1765                 }
1766
1767                 /*
1768                  * At this point we no longer have a reloc_control, so we can't
1769                  * depend on btrfs_init_reloc_root to update our last_trans.
1770                  *
1771                  * But that's ok, we started the trans handle on our
1772                  * corresponding fs_root, which means it's been added to the
1773                  * dirty list.  At commit time we'll still call
1774                  * btrfs_update_reloc_root() and update our root item
1775                  * appropriately.
1776                  */
1777                 reloc_root->last_trans = trans->transid;
1778                 trans->block_rsv = rc->block_rsv;
1779
1780                 replaced = 0;
1781                 max_level = level;
1782
1783                 ret = walk_down_reloc_tree(reloc_root, path, &level);
1784                 if (ret < 0)
1785                         goto out;
1786                 if (ret > 0)
1787                         break;
1788
1789                 if (!find_next_key(path, level, &key) &&
1790                     btrfs_comp_cpu_keys(&next_key, &key) >= 0) {
1791                         ret = 0;
1792                 } else {
1793                         ret = replace_path(trans, rc, root, reloc_root, path,
1794                                            &next_key, level, max_level);
1795                 }
1796                 if (ret < 0)
1797                         goto out;
1798                 if (ret > 0) {
1799                         level = ret;
1800                         btrfs_node_key_to_cpu(path->nodes[level], &key,
1801                                               path->slots[level]);
1802                         replaced = 1;
1803                 }
1804
1805                 ret = walk_up_reloc_tree(reloc_root, path, &level);
1806                 if (ret > 0)
1807                         break;
1808
1809                 BUG_ON(level == 0);
1810                 /*
1811                  * save the merging progress in the drop_progress.
1812                  * this is OK since root refs == 1 in this case.
1813                  */
1814                 btrfs_node_key(path->nodes[level], &root_item->drop_progress,
1815                                path->slots[level]);
1816                 btrfs_set_root_drop_level(root_item, level);
1817
1818                 btrfs_end_transaction_throttle(trans);
1819                 trans = NULL;
1820
1821                 btrfs_btree_balance_dirty(fs_info);
1822
1823                 if (replaced && rc->stage == UPDATE_DATA_PTRS)
1824                         invalidate_extent_cache(root, &key, &next_key);
1825         }
1826
1827         /*
1828          * handle the case only one block in the fs tree need to be
1829          * relocated and the block is tree root.
1830          */
1831         leaf = btrfs_lock_root_node(root);
1832         ret = btrfs_cow_block(trans, root, leaf, NULL, 0, &leaf,
1833                               BTRFS_NESTING_COW);
1834         btrfs_tree_unlock(leaf);
1835         free_extent_buffer(leaf);
1836 out:
1837         btrfs_free_path(path);
1838
1839         if (ret == 0) {
1840                 ret = insert_dirty_subvol(trans, rc, root);
1841                 if (ret)
1842                         btrfs_abort_transaction(trans, ret);
1843         }
1844
1845         if (trans)
1846                 btrfs_end_transaction_throttle(trans);
1847
1848         btrfs_btree_balance_dirty(fs_info);
1849
1850         if (replaced && rc->stage == UPDATE_DATA_PTRS)
1851                 invalidate_extent_cache(root, &key, &next_key);
1852
1853         return ret;
1854 }
1855
1856 static noinline_for_stack
1857 int prepare_to_merge(struct reloc_control *rc, int err)
1858 {
1859         struct btrfs_root *root = rc->extent_root;
1860         struct btrfs_fs_info *fs_info = root->fs_info;
1861         struct btrfs_root *reloc_root;
1862         struct btrfs_trans_handle *trans;
1863         LIST_HEAD(reloc_roots);
1864         u64 num_bytes = 0;
1865         int ret;
1866
1867         mutex_lock(&fs_info->reloc_mutex);
1868         rc->merging_rsv_size += fs_info->nodesize * (BTRFS_MAX_LEVEL - 1) * 2;
1869         rc->merging_rsv_size += rc->nodes_relocated * 2;
1870         mutex_unlock(&fs_info->reloc_mutex);
1871
1872 again:
1873         if (!err) {
1874                 num_bytes = rc->merging_rsv_size;
1875                 ret = btrfs_block_rsv_add(fs_info, rc->block_rsv, num_bytes,
1876                                           BTRFS_RESERVE_FLUSH_ALL);
1877                 if (ret)
1878                         err = ret;
1879         }
1880
1881         trans = btrfs_join_transaction(rc->extent_root);
1882         if (IS_ERR(trans)) {
1883                 if (!err)
1884                         btrfs_block_rsv_release(fs_info, rc->block_rsv,
1885                                                 num_bytes, NULL);
1886                 return PTR_ERR(trans);
1887         }
1888
1889         if (!err) {
1890                 if (num_bytes != rc->merging_rsv_size) {
1891                         btrfs_end_transaction(trans);
1892                         btrfs_block_rsv_release(fs_info, rc->block_rsv,
1893                                                 num_bytes, NULL);
1894                         goto again;
1895                 }
1896         }
1897
1898         rc->merge_reloc_tree = true;
1899
1900         while (!list_empty(&rc->reloc_roots)) {
1901                 reloc_root = list_entry(rc->reloc_roots.next,
1902                                         struct btrfs_root, root_list);
1903                 list_del_init(&reloc_root->root_list);
1904
1905                 root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
1906                                 false);
1907                 if (IS_ERR(root)) {
1908                         /*
1909                          * Even if we have an error we need this reloc root
1910                          * back on our list so we can clean up properly.
1911                          */
1912                         list_add(&reloc_root->root_list, &reloc_roots);
1913                         btrfs_abort_transaction(trans, (int)PTR_ERR(root));
1914                         if (!err)
1915                                 err = PTR_ERR(root);
1916                         break;
1917                 }
1918
1919                 if (unlikely(root->reloc_root != reloc_root)) {
1920                         if (root->reloc_root) {
1921                                 btrfs_err(fs_info,
1922 "reloc tree mismatch, root %lld has reloc root key (%lld %u %llu) gen %llu, expect reloc root key (%lld %u %llu) gen %llu",
1923                                           root->root_key.objectid,
1924                                           root->reloc_root->root_key.objectid,
1925                                           root->reloc_root->root_key.type,
1926                                           root->reloc_root->root_key.offset,
1927                                           btrfs_root_generation(
1928                                                   &root->reloc_root->root_item),
1929                                           reloc_root->root_key.objectid,
1930                                           reloc_root->root_key.type,
1931                                           reloc_root->root_key.offset,
1932                                           btrfs_root_generation(
1933                                                   &reloc_root->root_item));
1934                         } else {
1935                                 btrfs_err(fs_info,
1936 "reloc tree mismatch, root %lld has no reloc root, expect reloc root key (%lld %u %llu) gen %llu",
1937                                           root->root_key.objectid,
1938                                           reloc_root->root_key.objectid,
1939                                           reloc_root->root_key.type,
1940                                           reloc_root->root_key.offset,
1941                                           btrfs_root_generation(
1942                                                   &reloc_root->root_item));
1943                         }
1944                         list_add(&reloc_root->root_list, &reloc_roots);
1945                         btrfs_put_root(root);
1946                         btrfs_abort_transaction(trans, -EUCLEAN);
1947                         if (!err)
1948                                 err = -EUCLEAN;
1949                         break;
1950                 }
1951
1952                 /*
1953                  * set reference count to 1, so btrfs_recover_relocation
1954                  * knows it should resumes merging
1955                  */
1956                 if (!err)
1957                         btrfs_set_root_refs(&reloc_root->root_item, 1);
1958                 ret = btrfs_update_reloc_root(trans, root);
1959
1960                 /*
1961                  * Even if we have an error we need this reloc root back on our
1962                  * list so we can clean up properly.
1963                  */
1964                 list_add(&reloc_root->root_list, &reloc_roots);
1965                 btrfs_put_root(root);
1966
1967                 if (ret) {
1968                         btrfs_abort_transaction(trans, ret);
1969                         if (!err)
1970                                 err = ret;
1971                         break;
1972                 }
1973         }
1974
1975         list_splice(&reloc_roots, &rc->reloc_roots);
1976
1977         if (!err)
1978                 err = btrfs_commit_transaction(trans);
1979         else
1980                 btrfs_end_transaction(trans);
1981         return err;
1982 }
1983
1984 static noinline_for_stack
1985 void free_reloc_roots(struct list_head *list)
1986 {
1987         struct btrfs_root *reloc_root, *tmp;
1988
1989         list_for_each_entry_safe(reloc_root, tmp, list, root_list)
1990                 __del_reloc_root(reloc_root);
1991 }
1992
1993 static noinline_for_stack
1994 void merge_reloc_roots(struct reloc_control *rc)
1995 {
1996         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
1997         struct btrfs_root *root;
1998         struct btrfs_root *reloc_root;
1999         LIST_HEAD(reloc_roots);
2000         int found = 0;
2001         int ret = 0;
2002 again:
2003         root = rc->extent_root;
2004
2005         /*
2006          * this serializes us with btrfs_record_root_in_transaction,
2007          * we have to make sure nobody is in the middle of
2008          * adding their roots to the list while we are
2009          * doing this splice
2010          */
2011         mutex_lock(&fs_info->reloc_mutex);
2012         list_splice_init(&rc->reloc_roots, &reloc_roots);
2013         mutex_unlock(&fs_info->reloc_mutex);
2014
2015         while (!list_empty(&reloc_roots)) {
2016                 found = 1;
2017                 reloc_root = list_entry(reloc_roots.next,
2018                                         struct btrfs_root, root_list);
2019
2020                 root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
2021                                          false);
2022                 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
2023                         if (WARN_ON(IS_ERR(root))) {
2024                                 /*
2025                                  * For recovery we read the fs roots on mount,
2026                                  * and if we didn't find the root then we marked
2027                                  * the reloc root as a garbage root.  For normal
2028                                  * relocation obviously the root should exist in
2029                                  * memory.  However there's no reason we can't
2030                                  * handle the error properly here just in case.
2031                                  */
2032                                 ret = PTR_ERR(root);
2033                                 goto out;
2034                         }
2035                         if (WARN_ON(root->reloc_root != reloc_root)) {
2036                                 /*
2037                                  * This can happen if on-disk metadata has some
2038                                  * corruption, e.g. bad reloc tree key offset.
2039                                  */
2040                                 ret = -EINVAL;
2041                                 goto out;
2042                         }
2043                         ret = merge_reloc_root(rc, root);
2044                         btrfs_put_root(root);
2045                         if (ret) {
2046                                 if (list_empty(&reloc_root->root_list))
2047                                         list_add_tail(&reloc_root->root_list,
2048                                                       &reloc_roots);
2049                                 goto out;
2050                         }
2051                 } else {
2052                         if (!IS_ERR(root)) {
2053                                 if (root->reloc_root == reloc_root) {
2054                                         root->reloc_root = NULL;
2055                                         btrfs_put_root(reloc_root);
2056                                 }
2057                                 clear_bit(BTRFS_ROOT_DEAD_RELOC_TREE,
2058                                           &root->state);
2059                                 btrfs_put_root(root);
2060                         }
2061
2062                         list_del_init(&reloc_root->root_list);
2063                         /* Don't forget to queue this reloc root for cleanup */
2064                         list_add_tail(&reloc_root->reloc_dirty_list,
2065                                       &rc->dirty_subvol_roots);
2066                 }
2067         }
2068
2069         if (found) {
2070                 found = 0;
2071                 goto again;
2072         }
2073 out:
2074         if (ret) {
2075                 btrfs_handle_fs_error(fs_info, ret, NULL);
2076                 free_reloc_roots(&reloc_roots);
2077
2078                 /* new reloc root may be added */
2079                 mutex_lock(&fs_info->reloc_mutex);
2080                 list_splice_init(&rc->reloc_roots, &reloc_roots);
2081                 mutex_unlock(&fs_info->reloc_mutex);
2082                 free_reloc_roots(&reloc_roots);
2083         }
2084
2085         /*
2086          * We used to have
2087          *
2088          * BUG_ON(!RB_EMPTY_ROOT(&rc->reloc_root_tree.rb_root));
2089          *
2090          * here, but it's wrong.  If we fail to start the transaction in
2091          * prepare_to_merge() we will have only 0 ref reloc roots, none of which
2092          * have actually been removed from the reloc_root_tree rb tree.  This is
2093          * fine because we're bailing here, and we hold a reference on the root
2094          * for the list that holds it, so these roots will be cleaned up when we
2095          * do the reloc_dirty_list afterwards.  Meanwhile the root->reloc_root
2096          * will be cleaned up on unmount.
2097          *
2098          * The remaining nodes will be cleaned up by free_reloc_control.
2099          */
2100 }
2101
2102 static void free_block_list(struct rb_root *blocks)
2103 {
2104         struct tree_block *block;
2105         struct rb_node *rb_node;
2106         while ((rb_node = rb_first(blocks))) {
2107                 block = rb_entry(rb_node, struct tree_block, rb_node);
2108                 rb_erase(rb_node, blocks);
2109                 kfree(block);
2110         }
2111 }
2112
2113 static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans,
2114                                       struct btrfs_root *reloc_root)
2115 {
2116         struct btrfs_fs_info *fs_info = reloc_root->fs_info;
2117         struct btrfs_root *root;
2118         int ret;
2119
2120         if (reloc_root->last_trans == trans->transid)
2121                 return 0;
2122
2123         root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false);
2124
2125         /*
2126          * This should succeed, since we can't have a reloc root without having
2127          * already looked up the actual root and created the reloc root for this
2128          * root.
2129          *
2130          * However if there's some sort of corruption where we have a ref to a
2131          * reloc root without a corresponding root this could return ENOENT.
2132          */
2133         if (IS_ERR(root)) {
2134                 ASSERT(0);
2135                 return PTR_ERR(root);
2136         }
2137         if (root->reloc_root != reloc_root) {
2138                 ASSERT(0);
2139                 btrfs_err(fs_info,
2140                           "root %llu has two reloc roots associated with it",
2141                           reloc_root->root_key.offset);
2142                 btrfs_put_root(root);
2143                 return -EUCLEAN;
2144         }
2145         ret = btrfs_record_root_in_trans(trans, root);
2146         btrfs_put_root(root);
2147
2148         return ret;
2149 }
2150
2151 static noinline_for_stack
2152 struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans,
2153                                      struct reloc_control *rc,
2154                                      struct btrfs_backref_node *node,
2155                                      struct btrfs_backref_edge *edges[])
2156 {
2157         struct btrfs_backref_node *next;
2158         struct btrfs_root *root;
2159         int index = 0;
2160         int ret;
2161
2162         next = node;
2163         while (1) {
2164                 cond_resched();
2165                 next = walk_up_backref(next, edges, &index);
2166                 root = next->root;
2167
2168                 /*
2169                  * If there is no root, then our references for this block are
2170                  * incomplete, as we should be able to walk all the way up to a
2171                  * block that is owned by a root.
2172                  *
2173                  * This path is only for SHAREABLE roots, so if we come upon a
2174                  * non-SHAREABLE root then we have backrefs that resolve
2175                  * improperly.
2176                  *
2177                  * Both of these cases indicate file system corruption, or a bug
2178                  * in the backref walking code.
2179                  */
2180                 if (!root) {
2181                         ASSERT(0);
2182                         btrfs_err(trans->fs_info,
2183                 "bytenr %llu doesn't have a backref path ending in a root",
2184                                   node->bytenr);
2185                         return ERR_PTR(-EUCLEAN);
2186                 }
2187                 if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
2188                         ASSERT(0);
2189                         btrfs_err(trans->fs_info,
2190         "bytenr %llu has multiple refs with one ending in a non-shareable root",
2191                                   node->bytenr);
2192                         return ERR_PTR(-EUCLEAN);
2193                 }
2194
2195                 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
2196                         ret = record_reloc_root_in_trans(trans, root);
2197                         if (ret)
2198                                 return ERR_PTR(ret);
2199                         break;
2200                 }
2201
2202                 ret = btrfs_record_root_in_trans(trans, root);
2203                 if (ret)
2204                         return ERR_PTR(ret);
2205                 root = root->reloc_root;
2206
2207                 /*
2208                  * We could have raced with another thread which failed, so
2209                  * root->reloc_root may not be set, return ENOENT in this case.
2210                  */
2211                 if (!root)
2212                         return ERR_PTR(-ENOENT);
2213
2214                 if (next->new_bytenr != root->node->start) {
2215                         /*
2216                          * We just created the reloc root, so we shouldn't have
2217                          * ->new_bytenr set and this shouldn't be in the changed
2218                          *  list.  If it is then we have multiple roots pointing
2219                          *  at the same bytenr which indicates corruption, or
2220                          *  we've made a mistake in the backref walking code.
2221                          */
2222                         ASSERT(next->new_bytenr == 0);
2223                         ASSERT(list_empty(&next->list));
2224                         if (next->new_bytenr || !list_empty(&next->list)) {
2225                                 btrfs_err(trans->fs_info,
2226         "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu",
2227                                           node->bytenr, next->bytenr);
2228                                 return ERR_PTR(-EUCLEAN);
2229                         }
2230
2231                         next->new_bytenr = root->node->start;
2232                         btrfs_put_root(next->root);
2233                         next->root = btrfs_grab_root(root);
2234                         ASSERT(next->root);
2235                         list_add_tail(&next->list,
2236                                       &rc->backref_cache.changed);
2237                         mark_block_processed(rc, next);
2238                         break;
2239                 }
2240
2241                 WARN_ON(1);
2242                 root = NULL;
2243                 next = walk_down_backref(edges, &index);
2244                 if (!next || next->level <= node->level)
2245                         break;
2246         }
2247         if (!root) {
2248                 /*
2249                  * This can happen if there's fs corruption or if there's a bug
2250                  * in the backref lookup code.
2251                  */
2252                 ASSERT(0);
2253                 return ERR_PTR(-ENOENT);
2254         }
2255
2256         next = node;
2257         /* setup backref node path for btrfs_reloc_cow_block */
2258         while (1) {
2259                 rc->backref_cache.path[next->level] = next;
2260                 if (--index < 0)
2261                         break;
2262                 next = edges[index]->node[UPPER];
2263         }
2264         return root;
2265 }
2266
2267 /*
2268  * Select a tree root for relocation.
2269  *
2270  * Return NULL if the block is not shareable. We should use do_relocation() in
2271  * this case.
2272  *
2273  * Return a tree root pointer if the block is shareable.
2274  * Return -ENOENT if the block is root of reloc tree.
2275  */
2276 static noinline_for_stack
2277 struct btrfs_root *select_one_root(struct btrfs_backref_node *node)
2278 {
2279         struct btrfs_backref_node *next;
2280         struct btrfs_root *root;
2281         struct btrfs_root *fs_root = NULL;
2282         struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2283         int index = 0;
2284
2285         next = node;
2286         while (1) {
2287                 cond_resched();
2288                 next = walk_up_backref(next, edges, &index);
2289                 root = next->root;
2290
2291                 /*
2292                  * This can occur if we have incomplete extent refs leading all
2293                  * the way up a particular path, in this case return -EUCLEAN.
2294                  */
2295                 if (!root)
2296                         return ERR_PTR(-EUCLEAN);
2297
2298                 /* No other choice for non-shareable tree */
2299                 if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
2300                         return root;
2301
2302                 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID)
2303                         fs_root = root;
2304
2305                 if (next != node)
2306                         return NULL;
2307
2308                 next = walk_down_backref(edges, &index);
2309                 if (!next || next->level <= node->level)
2310                         break;
2311         }
2312
2313         if (!fs_root)
2314                 return ERR_PTR(-ENOENT);
2315         return fs_root;
2316 }
2317
2318 static noinline_for_stack
2319 u64 calcu_metadata_size(struct reloc_control *rc,
2320                         struct btrfs_backref_node *node, int reserve)
2321 {
2322         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2323         struct btrfs_backref_node *next = node;
2324         struct btrfs_backref_edge *edge;
2325         struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2326         u64 num_bytes = 0;
2327         int index = 0;
2328
2329         BUG_ON(reserve && node->processed);
2330
2331         while (next) {
2332                 cond_resched();
2333                 while (1) {
2334                         if (next->processed && (reserve || next != node))
2335                                 break;
2336
2337                         num_bytes += fs_info->nodesize;
2338
2339                         if (list_empty(&next->upper))
2340                                 break;
2341
2342                         edge = list_entry(next->upper.next,
2343                                         struct btrfs_backref_edge, list[LOWER]);
2344                         edges[index++] = edge;
2345                         next = edge->node[UPPER];
2346                 }
2347                 next = walk_down_backref(edges, &index);
2348         }
2349         return num_bytes;
2350 }
2351
2352 static int reserve_metadata_space(struct btrfs_trans_handle *trans,
2353                                   struct reloc_control *rc,
2354                                   struct btrfs_backref_node *node)
2355 {
2356         struct btrfs_root *root = rc->extent_root;
2357         struct btrfs_fs_info *fs_info = root->fs_info;
2358         u64 num_bytes;
2359         int ret;
2360         u64 tmp;
2361
2362         num_bytes = calcu_metadata_size(rc, node, 1) * 2;
2363
2364         trans->block_rsv = rc->block_rsv;
2365         rc->reserved_bytes += num_bytes;
2366
2367         /*
2368          * We are under a transaction here so we can only do limited flushing.
2369          * If we get an enospc just kick back -EAGAIN so we know to drop the
2370          * transaction and try to refill when we can flush all the things.
2371          */
2372         ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv, num_bytes,
2373                                      BTRFS_RESERVE_FLUSH_LIMIT);
2374         if (ret) {
2375                 tmp = fs_info->nodesize * RELOCATION_RESERVED_NODES;
2376                 while (tmp <= rc->reserved_bytes)
2377                         tmp <<= 1;
2378                 /*
2379                  * only one thread can access block_rsv at this point,
2380                  * so we don't need hold lock to protect block_rsv.
2381                  * we expand more reservation size here to allow enough
2382                  * space for relocation and we will return earlier in
2383                  * enospc case.
2384                  */
2385                 rc->block_rsv->size = tmp + fs_info->nodesize *
2386                                       RELOCATION_RESERVED_NODES;
2387                 return -EAGAIN;
2388         }
2389
2390         return 0;
2391 }
2392
2393 /*
2394  * relocate a block tree, and then update pointers in upper level
2395  * blocks that reference the block to point to the new location.
2396  *
2397  * if called by link_to_upper, the block has already been relocated.
2398  * in that case this function just updates pointers.
2399  */
2400 static int do_relocation(struct btrfs_trans_handle *trans,
2401                          struct reloc_control *rc,
2402                          struct btrfs_backref_node *node,
2403                          struct btrfs_key *key,
2404                          struct btrfs_path *path, int lowest)
2405 {
2406         struct btrfs_backref_node *upper;
2407         struct btrfs_backref_edge *edge;
2408         struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2409         struct btrfs_root *root;
2410         struct extent_buffer *eb;
2411         u32 blocksize;
2412         u64 bytenr;
2413         int slot;
2414         int ret = 0;
2415
2416         /*
2417          * If we are lowest then this is the first time we're processing this
2418          * block, and thus shouldn't have an eb associated with it yet.
2419          */
2420         ASSERT(!lowest || !node->eb);
2421
2422         path->lowest_level = node->level + 1;
2423         rc->backref_cache.path[node->level] = node;
2424         list_for_each_entry(edge, &node->upper, list[LOWER]) {
2425                 struct btrfs_ref ref = { 0 };
2426
2427                 cond_resched();
2428
2429                 upper = edge->node[UPPER];
2430                 root = select_reloc_root(trans, rc, upper, edges);
2431                 if (IS_ERR(root)) {
2432                         ret = PTR_ERR(root);
2433                         goto next;
2434                 }
2435
2436                 if (upper->eb && !upper->locked) {
2437                         if (!lowest) {
2438                                 ret = btrfs_bin_search(upper->eb, 0, key, &slot);
2439                                 if (ret < 0)
2440                                         goto next;
2441                                 BUG_ON(ret);
2442                                 bytenr = btrfs_node_blockptr(upper->eb, slot);
2443                                 if (node->eb->start == bytenr)
2444                                         goto next;
2445                         }
2446                         btrfs_backref_drop_node_buffer(upper);
2447                 }
2448
2449                 if (!upper->eb) {
2450                         ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2451                         if (ret) {
2452                                 if (ret > 0)
2453                                         ret = -ENOENT;
2454
2455                                 btrfs_release_path(path);
2456                                 break;
2457                         }
2458
2459                         if (!upper->eb) {
2460                                 upper->eb = path->nodes[upper->level];
2461                                 path->nodes[upper->level] = NULL;
2462                         } else {
2463                                 BUG_ON(upper->eb != path->nodes[upper->level]);
2464                         }
2465
2466                         upper->locked = 1;
2467                         path->locks[upper->level] = 0;
2468
2469                         slot = path->slots[upper->level];
2470                         btrfs_release_path(path);
2471                 } else {
2472                         ret = btrfs_bin_search(upper->eb, 0, key, &slot);
2473                         if (ret < 0)
2474                                 goto next;
2475                         BUG_ON(ret);
2476                 }
2477
2478                 bytenr = btrfs_node_blockptr(upper->eb, slot);
2479                 if (lowest) {
2480                         if (bytenr != node->bytenr) {
2481                                 btrfs_err(root->fs_info,
2482                 "lowest leaf/node mismatch: bytenr %llu node->bytenr %llu slot %d upper %llu",
2483                                           bytenr, node->bytenr, slot,
2484                                           upper->eb->start);
2485                                 ret = -EIO;
2486                                 goto next;
2487                         }
2488                 } else {
2489                         if (node->eb->start == bytenr)
2490                                 goto next;
2491                 }
2492
2493                 blocksize = root->fs_info->nodesize;
2494                 eb = btrfs_read_node_slot(upper->eb, slot);
2495                 if (IS_ERR(eb)) {
2496                         ret = PTR_ERR(eb);
2497                         goto next;
2498                 }
2499                 btrfs_tree_lock(eb);
2500
2501                 if (!node->eb) {
2502                         ret = btrfs_cow_block(trans, root, eb, upper->eb,
2503                                               slot, &eb, BTRFS_NESTING_COW);
2504                         btrfs_tree_unlock(eb);
2505                         free_extent_buffer(eb);
2506                         if (ret < 0)
2507                                 goto next;
2508                         /*
2509                          * We've just COWed this block, it should have updated
2510                          * the correct backref node entry.
2511                          */
2512                         ASSERT(node->eb == eb);
2513                 } else {
2514                         btrfs_set_node_blockptr(upper->eb, slot,
2515                                                 node->eb->start);
2516                         btrfs_set_node_ptr_generation(upper->eb, slot,
2517                                                       trans->transid);
2518                         btrfs_mark_buffer_dirty(trans, upper->eb);
2519
2520                         btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
2521                                                node->eb->start, blocksize,
2522                                                upper->eb->start,
2523                                                btrfs_header_owner(upper->eb));
2524                         btrfs_init_tree_ref(&ref, node->level,
2525                                             btrfs_header_owner(upper->eb),
2526                                             root->root_key.objectid, false);
2527                         ret = btrfs_inc_extent_ref(trans, &ref);
2528                         if (!ret)
2529                                 ret = btrfs_drop_subtree(trans, root, eb,
2530                                                          upper->eb);
2531                         if (ret)
2532                                 btrfs_abort_transaction(trans, ret);
2533                 }
2534 next:
2535                 if (!upper->pending)
2536                         btrfs_backref_drop_node_buffer(upper);
2537                 else
2538                         btrfs_backref_unlock_node_buffer(upper);
2539                 if (ret)
2540                         break;
2541         }
2542
2543         if (!ret && node->pending) {
2544                 btrfs_backref_drop_node_buffer(node);
2545                 list_move_tail(&node->list, &rc->backref_cache.changed);
2546                 node->pending = 0;
2547         }
2548
2549         path->lowest_level = 0;
2550
2551         /*
2552          * We should have allocated all of our space in the block rsv and thus
2553          * shouldn't ENOSPC.
2554          */
2555         ASSERT(ret != -ENOSPC);
2556         return ret;
2557 }
2558
2559 static int link_to_upper(struct btrfs_trans_handle *trans,
2560                          struct reloc_control *rc,
2561                          struct btrfs_backref_node *node,
2562                          struct btrfs_path *path)
2563 {
2564         struct btrfs_key key;
2565
2566         btrfs_node_key_to_cpu(node->eb, &key, 0);
2567         return do_relocation(trans, rc, node, &key, path, 0);
2568 }
2569
2570 static int finish_pending_nodes(struct btrfs_trans_handle *trans,
2571                                 struct reloc_control *rc,
2572                                 struct btrfs_path *path, int err)
2573 {
2574         LIST_HEAD(list);
2575         struct btrfs_backref_cache *cache = &rc->backref_cache;
2576         struct btrfs_backref_node *node;
2577         int level;
2578         int ret;
2579
2580         for (level = 0; level < BTRFS_MAX_LEVEL; level++) {
2581                 while (!list_empty(&cache->pending[level])) {
2582                         node = list_entry(cache->pending[level].next,
2583                                           struct btrfs_backref_node, list);
2584                         list_move_tail(&node->list, &list);
2585                         BUG_ON(!node->pending);
2586
2587                         if (!err) {
2588                                 ret = link_to_upper(trans, rc, node, path);
2589                                 if (ret < 0)
2590                                         err = ret;
2591                         }
2592                 }
2593                 list_splice_init(&list, &cache->pending[level]);
2594         }
2595         return err;
2596 }
2597
2598 /*
2599  * mark a block and all blocks directly/indirectly reference the block
2600  * as processed.
2601  */
2602 static void update_processed_blocks(struct reloc_control *rc,
2603                                     struct btrfs_backref_node *node)
2604 {
2605         struct btrfs_backref_node *next = node;
2606         struct btrfs_backref_edge *edge;
2607         struct btrfs_backref_edge *edges[BTRFS_MAX_LEVEL - 1];
2608         int index = 0;
2609
2610         while (next) {
2611                 cond_resched();
2612                 while (1) {
2613                         if (next->processed)
2614                                 break;
2615
2616                         mark_block_processed(rc, next);
2617
2618                         if (list_empty(&next->upper))
2619                                 break;
2620
2621                         edge = list_entry(next->upper.next,
2622                                         struct btrfs_backref_edge, list[LOWER]);
2623                         edges[index++] = edge;
2624                         next = edge->node[UPPER];
2625                 }
2626                 next = walk_down_backref(edges, &index);
2627         }
2628 }
2629
2630 static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
2631 {
2632         u32 blocksize = rc->extent_root->fs_info->nodesize;
2633
2634         if (test_range_bit(&rc->processed_blocks, bytenr,
2635                            bytenr + blocksize - 1, EXTENT_DIRTY, NULL))
2636                 return 1;
2637         return 0;
2638 }
2639
2640 static int get_tree_block_key(struct btrfs_fs_info *fs_info,
2641                               struct tree_block *block)
2642 {
2643         struct btrfs_tree_parent_check check = {
2644                 .level = block->level,
2645                 .owner_root = block->owner,
2646                 .transid = block->key.offset
2647         };
2648         struct extent_buffer *eb;
2649
2650         eb = read_tree_block(fs_info, block->bytenr, &check);
2651         if (IS_ERR(eb))
2652                 return PTR_ERR(eb);
2653         if (!extent_buffer_uptodate(eb)) {
2654                 free_extent_buffer(eb);
2655                 return -EIO;
2656         }
2657         if (block->level == 0)
2658                 btrfs_item_key_to_cpu(eb, &block->key, 0);
2659         else
2660                 btrfs_node_key_to_cpu(eb, &block->key, 0);
2661         free_extent_buffer(eb);
2662         block->key_ready = true;
2663         return 0;
2664 }
2665
2666 /*
2667  * helper function to relocate a tree block
2668  */
2669 static int relocate_tree_block(struct btrfs_trans_handle *trans,
2670                                 struct reloc_control *rc,
2671                                 struct btrfs_backref_node *node,
2672                                 struct btrfs_key *key,
2673                                 struct btrfs_path *path)
2674 {
2675         struct btrfs_root *root;
2676         int ret = 0;
2677
2678         if (!node)
2679                 return 0;
2680
2681         /*
2682          * If we fail here we want to drop our backref_node because we are going
2683          * to start over and regenerate the tree for it.
2684          */
2685         ret = reserve_metadata_space(trans, rc, node);
2686         if (ret)
2687                 goto out;
2688
2689         BUG_ON(node->processed);
2690         root = select_one_root(node);
2691         if (IS_ERR(root)) {
2692                 ret = PTR_ERR(root);
2693
2694                 /* See explanation in select_one_root for the -EUCLEAN case. */
2695                 ASSERT(ret == -ENOENT);
2696                 if (ret == -ENOENT) {
2697                         ret = 0;
2698                         update_processed_blocks(rc, node);
2699                 }
2700                 goto out;
2701         }
2702
2703         if (root) {
2704                 if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
2705                         /*
2706                          * This block was the root block of a root, and this is
2707                          * the first time we're processing the block and thus it
2708                          * should not have had the ->new_bytenr modified and
2709                          * should have not been included on the changed list.
2710                          *
2711                          * However in the case of corruption we could have
2712                          * multiple refs pointing to the same block improperly,
2713                          * and thus we would trip over these checks.  ASSERT()
2714                          * for the developer case, because it could indicate a
2715                          * bug in the backref code, however error out for a
2716                          * normal user in the case of corruption.
2717                          */
2718                         ASSERT(node->new_bytenr == 0);
2719                         ASSERT(list_empty(&node->list));
2720                         if (node->new_bytenr || !list_empty(&node->list)) {
2721                                 btrfs_err(root->fs_info,
2722                                   "bytenr %llu has improper references to it",
2723                                           node->bytenr);
2724                                 ret = -EUCLEAN;
2725                                 goto out;
2726                         }
2727                         ret = btrfs_record_root_in_trans(trans, root);
2728                         if (ret)
2729                                 goto out;
2730                         /*
2731                          * Another thread could have failed, need to check if we
2732                          * have reloc_root actually set.
2733                          */
2734                         if (!root->reloc_root) {
2735                                 ret = -ENOENT;
2736                                 goto out;
2737                         }
2738                         root = root->reloc_root;
2739                         node->new_bytenr = root->node->start;
2740                         btrfs_put_root(node->root);
2741                         node->root = btrfs_grab_root(root);
2742                         ASSERT(node->root);
2743                         list_add_tail(&node->list, &rc->backref_cache.changed);
2744                 } else {
2745                         path->lowest_level = node->level;
2746                         if (root == root->fs_info->chunk_root)
2747                                 btrfs_reserve_chunk_metadata(trans, false);
2748                         ret = btrfs_search_slot(trans, root, key, path, 0, 1);
2749                         btrfs_release_path(path);
2750                         if (root == root->fs_info->chunk_root)
2751                                 btrfs_trans_release_chunk_metadata(trans);
2752                         if (ret > 0)
2753                                 ret = 0;
2754                 }
2755                 if (!ret)
2756                         update_processed_blocks(rc, node);
2757         } else {
2758                 ret = do_relocation(trans, rc, node, key, path, 1);
2759         }
2760 out:
2761         if (ret || node->level == 0 || node->cowonly)
2762                 btrfs_backref_cleanup_node(&rc->backref_cache, node);
2763         return ret;
2764 }
2765
2766 /*
2767  * relocate a list of blocks
2768  */
2769 static noinline_for_stack
2770 int relocate_tree_blocks(struct btrfs_trans_handle *trans,
2771                          struct reloc_control *rc, struct rb_root *blocks)
2772 {
2773         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
2774         struct btrfs_backref_node *node;
2775         struct btrfs_path *path;
2776         struct tree_block *block;
2777         struct tree_block *next;
2778         int ret;
2779         int err = 0;
2780
2781         path = btrfs_alloc_path();
2782         if (!path) {
2783                 err = -ENOMEM;
2784                 goto out_free_blocks;
2785         }
2786
2787         /* Kick in readahead for tree blocks with missing keys */
2788         rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
2789                 if (!block->key_ready)
2790                         btrfs_readahead_tree_block(fs_info, block->bytenr,
2791                                                    block->owner, 0,
2792                                                    block->level);
2793         }
2794
2795         /* Get first keys */
2796         rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
2797                 if (!block->key_ready) {
2798                         err = get_tree_block_key(fs_info, block);
2799                         if (err)
2800                                 goto out_free_path;
2801                 }
2802         }
2803
2804         /* Do tree relocation */
2805         rbtree_postorder_for_each_entry_safe(block, next, blocks, rb_node) {
2806                 node = build_backref_tree(trans, rc, &block->key,
2807                                           block->level, block->bytenr);
2808                 if (IS_ERR(node)) {
2809                         err = PTR_ERR(node);
2810                         goto out;
2811                 }
2812
2813                 ret = relocate_tree_block(trans, rc, node, &block->key,
2814                                           path);
2815                 if (ret < 0) {
2816                         err = ret;
2817                         break;
2818                 }
2819         }
2820 out:
2821         err = finish_pending_nodes(trans, rc, path, err);
2822
2823 out_free_path:
2824         btrfs_free_path(path);
2825 out_free_blocks:
2826         free_block_list(blocks);
2827         return err;
2828 }
2829
2830 static noinline_for_stack int prealloc_file_extent_cluster(
2831                                 struct btrfs_inode *inode,
2832                                 const struct file_extent_cluster *cluster)
2833 {
2834         u64 alloc_hint = 0;
2835         u64 start;
2836         u64 end;
2837         u64 offset = inode->index_cnt;
2838         u64 num_bytes;
2839         int nr;
2840         int ret = 0;
2841         u64 i_size = i_size_read(&inode->vfs_inode);
2842         u64 prealloc_start = cluster->start - offset;
2843         u64 prealloc_end = cluster->end - offset;
2844         u64 cur_offset = prealloc_start;
2845
2846         /*
2847          * For subpage case, previous i_size may not be aligned to PAGE_SIZE.
2848          * This means the range [i_size, PAGE_END + 1) is filled with zeros by
2849          * btrfs_do_readpage() call of previously relocated file cluster.
2850          *
2851          * If the current cluster starts in the above range, btrfs_do_readpage()
2852          * will skip the read, and relocate_one_page() will later writeback
2853          * the padding zeros as new data, causing data corruption.
2854          *
2855          * Here we have to manually invalidate the range (i_size, PAGE_END + 1).
2856          */
2857         if (!PAGE_ALIGNED(i_size)) {
2858                 struct address_space *mapping = inode->vfs_inode.i_mapping;
2859                 struct btrfs_fs_info *fs_info = inode->root->fs_info;
2860                 const u32 sectorsize = fs_info->sectorsize;
2861                 struct page *page;
2862
2863                 ASSERT(sectorsize < PAGE_SIZE);
2864                 ASSERT(IS_ALIGNED(i_size, sectorsize));
2865
2866                 /*
2867                  * Subpage can't handle page with DIRTY but without UPTODATE
2868                  * bit as it can lead to the following deadlock:
2869                  *
2870                  * btrfs_read_folio()
2871                  * | Page already *locked*
2872                  * |- btrfs_lock_and_flush_ordered_range()
2873                  *    |- btrfs_start_ordered_extent()
2874                  *       |- extent_write_cache_pages()
2875                  *          |- lock_page()
2876                  *             We try to lock the page we already hold.
2877                  *
2878                  * Here we just writeback the whole data reloc inode, so that
2879                  * we will be ensured to have no dirty range in the page, and
2880                  * are safe to clear the uptodate bits.
2881                  *
2882                  * This shouldn't cause too much overhead, as we need to write
2883                  * the data back anyway.
2884                  */
2885                 ret = filemap_write_and_wait(mapping);
2886                 if (ret < 0)
2887                         return ret;
2888
2889                 clear_extent_bits(&inode->io_tree, i_size,
2890                                   round_up(i_size, PAGE_SIZE) - 1,
2891                                   EXTENT_UPTODATE);
2892                 page = find_lock_page(mapping, i_size >> PAGE_SHIFT);
2893                 /*
2894                  * If page is freed we don't need to do anything then, as we
2895                  * will re-read the whole page anyway.
2896                  */
2897                 if (page) {
2898                         btrfs_subpage_clear_uptodate(fs_info, page_folio(page), i_size,
2899                                         round_up(i_size, PAGE_SIZE) - i_size);
2900                         unlock_page(page);
2901                         put_page(page);
2902                 }
2903         }
2904
2905         BUG_ON(cluster->start != cluster->boundary[0]);
2906         ret = btrfs_alloc_data_chunk_ondemand(inode,
2907                                               prealloc_end + 1 - prealloc_start);
2908         if (ret)
2909                 return ret;
2910
2911         btrfs_inode_lock(inode, 0);
2912         for (nr = 0; nr < cluster->nr; nr++) {
2913                 struct extent_state *cached_state = NULL;
2914
2915                 start = cluster->boundary[nr] - offset;
2916                 if (nr + 1 < cluster->nr)
2917                         end = cluster->boundary[nr + 1] - 1 - offset;
2918                 else
2919                         end = cluster->end - offset;
2920
2921                 lock_extent(&inode->io_tree, start, end, &cached_state);
2922                 num_bytes = end + 1 - start;
2923                 ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start,
2924                                                 num_bytes, num_bytes,
2925                                                 end + 1, &alloc_hint);
2926                 cur_offset = end + 1;
2927                 unlock_extent(&inode->io_tree, start, end, &cached_state);
2928                 if (ret)
2929                         break;
2930         }
2931         btrfs_inode_unlock(inode, 0);
2932
2933         if (cur_offset < prealloc_end)
2934                 btrfs_free_reserved_data_space_noquota(inode->root->fs_info,
2935                                                prealloc_end + 1 - cur_offset);
2936         return ret;
2937 }
2938
2939 static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inode,
2940                                 u64 start, u64 end, u64 block_start)
2941 {
2942         struct extent_map *em;
2943         struct extent_state *cached_state = NULL;
2944         int ret = 0;
2945
2946         em = alloc_extent_map();
2947         if (!em)
2948                 return -ENOMEM;
2949
2950         em->start = start;
2951         em->len = end + 1 - start;
2952         em->block_len = em->len;
2953         em->block_start = block_start;
2954         em->flags |= EXTENT_FLAG_PINNED;
2955
2956         lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
2957         ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, false);
2958         unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
2959         free_extent_map(em);
2960
2961         return ret;
2962 }
2963
2964 /*
2965  * Allow error injection to test balance/relocation cancellation
2966  */
2967 noinline int btrfs_should_cancel_balance(const struct btrfs_fs_info *fs_info)
2968 {
2969         return atomic_read(&fs_info->balance_cancel_req) ||
2970                 atomic_read(&fs_info->reloc_cancel_req) ||
2971                 fatal_signal_pending(current);
2972 }
2973 ALLOW_ERROR_INJECTION(btrfs_should_cancel_balance, TRUE);
2974
2975 static u64 get_cluster_boundary_end(const struct file_extent_cluster *cluster,
2976                                     int cluster_nr)
2977 {
2978         /* Last extent, use cluster end directly */
2979         if (cluster_nr >= cluster->nr - 1)
2980                 return cluster->end;
2981
2982         /* Use next boundary start*/
2983         return cluster->boundary[cluster_nr + 1] - 1;
2984 }
2985
2986 static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
2987                              const struct file_extent_cluster *cluster,
2988                              int *cluster_nr, unsigned long page_index)
2989 {
2990         struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
2991         u64 offset = BTRFS_I(inode)->index_cnt;
2992         const unsigned long last_index = (cluster->end - offset) >> PAGE_SHIFT;
2993         gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping);
2994         struct page *page;
2995         u64 page_start;
2996         u64 page_end;
2997         u64 cur;
2998         int ret;
2999
3000         ASSERT(page_index <= last_index);
3001         page = find_lock_page(inode->i_mapping, page_index);
3002         if (!page) {
3003                 page_cache_sync_readahead(inode->i_mapping, ra, NULL,
3004                                 page_index, last_index + 1 - page_index);
3005                 page = find_or_create_page(inode->i_mapping, page_index, mask);
3006                 if (!page)
3007                         return -ENOMEM;
3008         }
3009
3010         if (PageReadahead(page))
3011                 page_cache_async_readahead(inode->i_mapping, ra, NULL,
3012                                 page_folio(page), page_index,
3013                                 last_index + 1 - page_index);
3014
3015         if (!PageUptodate(page)) {
3016                 btrfs_read_folio(NULL, page_folio(page));
3017                 lock_page(page);
3018                 if (!PageUptodate(page)) {
3019                         ret = -EIO;
3020                         goto release_page;
3021                 }
3022         }
3023
3024         /*
3025          * We could have lost page private when we dropped the lock to read the
3026          * page above, make sure we set_page_extent_mapped here so we have any
3027          * of the subpage blocksize stuff we need in place.
3028          */
3029         ret = set_page_extent_mapped(page);
3030         if (ret < 0)
3031                 goto release_page;
3032
3033         page_start = page_offset(page);
3034         page_end = page_start + PAGE_SIZE - 1;
3035
3036         /*
3037          * Start from the cluster, as for subpage case, the cluster can start
3038          * inside the page.
3039          */
3040         cur = max(page_start, cluster->boundary[*cluster_nr] - offset);
3041         while (cur <= page_end) {
3042                 struct extent_state *cached_state = NULL;
3043                 u64 extent_start = cluster->boundary[*cluster_nr] - offset;
3044                 u64 extent_end = get_cluster_boundary_end(cluster,
3045                                                 *cluster_nr) - offset;
3046                 u64 clamped_start = max(page_start, extent_start);
3047                 u64 clamped_end = min(page_end, extent_end);
3048                 u32 clamped_len = clamped_end + 1 - clamped_start;
3049
3050                 /* Reserve metadata for this range */
3051                 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
3052                                                       clamped_len, clamped_len,
3053                                                       false);
3054                 if (ret)
3055                         goto release_page;
3056
3057                 /* Mark the range delalloc and dirty for later writeback */
3058                 lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end,
3059                             &cached_state);
3060                 ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start,
3061                                                 clamped_end, 0, &cached_state);
3062                 if (ret) {
3063                         clear_extent_bit(&BTRFS_I(inode)->io_tree,
3064                                          clamped_start, clamped_end,
3065                                          EXTENT_LOCKED | EXTENT_BOUNDARY,
3066                                          &cached_state);
3067                         btrfs_delalloc_release_metadata(BTRFS_I(inode),
3068                                                         clamped_len, true);
3069                         btrfs_delalloc_release_extents(BTRFS_I(inode),
3070                                                        clamped_len);
3071                         goto release_page;
3072                 }
3073                 btrfs_folio_set_dirty(fs_info, page_folio(page),
3074                                       clamped_start, clamped_len);
3075
3076                 /*
3077                  * Set the boundary if it's inside the page.
3078                  * Data relocation requires the destination extents to have the
3079                  * same size as the source.
3080                  * EXTENT_BOUNDARY bit prevents current extent from being merged
3081                  * with previous extent.
3082                  */
3083                 if (in_range(cluster->boundary[*cluster_nr] - offset,
3084                              page_start, PAGE_SIZE)) {
3085                         u64 boundary_start = cluster->boundary[*cluster_nr] -
3086                                                 offset;
3087                         u64 boundary_end = boundary_start +
3088                                            fs_info->sectorsize - 1;
3089
3090                         set_extent_bit(&BTRFS_I(inode)->io_tree,
3091                                        boundary_start, boundary_end,
3092                                        EXTENT_BOUNDARY, NULL);
3093                 }
3094                 unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end,
3095                               &cached_state);
3096                 btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len);
3097                 cur += clamped_len;
3098
3099                 /* Crossed extent end, go to next extent */
3100                 if (cur >= extent_end) {
3101                         (*cluster_nr)++;
3102                         /* Just finished the last extent of the cluster, exit. */
3103                         if (*cluster_nr >= cluster->nr)
3104                                 break;
3105                 }
3106         }
3107         unlock_page(page);
3108         put_page(page);
3109
3110         balance_dirty_pages_ratelimited(inode->i_mapping);
3111         btrfs_throttle(fs_info);
3112         if (btrfs_should_cancel_balance(fs_info))
3113                 ret = -ECANCELED;
3114         return ret;
3115
3116 release_page:
3117         unlock_page(page);
3118         put_page(page);
3119         return ret;
3120 }
3121
3122 static int relocate_file_extent_cluster(struct inode *inode,
3123                                         const struct file_extent_cluster *cluster)
3124 {
3125         u64 offset = BTRFS_I(inode)->index_cnt;
3126         unsigned long index;
3127         unsigned long last_index;
3128         struct file_ra_state *ra;
3129         int cluster_nr = 0;
3130         int ret = 0;
3131
3132         if (!cluster->nr)
3133                 return 0;
3134
3135         ra = kzalloc(sizeof(*ra), GFP_NOFS);
3136         if (!ra)
3137                 return -ENOMEM;
3138
3139         ret = prealloc_file_extent_cluster(BTRFS_I(inode), cluster);
3140         if (ret)
3141                 goto out;
3142
3143         file_ra_state_init(ra, inode->i_mapping);
3144
3145         ret = setup_relocation_extent_mapping(inode, cluster->start - offset,
3146                                    cluster->end - offset, cluster->start);
3147         if (ret)
3148                 goto out;
3149
3150         last_index = (cluster->end - offset) >> PAGE_SHIFT;
3151         for (index = (cluster->start - offset) >> PAGE_SHIFT;
3152              index <= last_index && !ret; index++)
3153                 ret = relocate_one_page(inode, ra, cluster, &cluster_nr, index);
3154         if (ret == 0)
3155                 WARN_ON(cluster_nr != cluster->nr);
3156 out:
3157         kfree(ra);
3158         return ret;
3159 }
3160
3161 static noinline_for_stack int relocate_data_extent(struct inode *inode,
3162                                 const struct btrfs_key *extent_key,
3163                                 struct file_extent_cluster *cluster)
3164 {
3165         int ret;
3166         struct btrfs_root *root = BTRFS_I(inode)->root;
3167
3168         if (cluster->nr > 0 && extent_key->objectid != cluster->end + 1) {
3169                 ret = relocate_file_extent_cluster(inode, cluster);
3170                 if (ret)
3171                         return ret;
3172                 cluster->nr = 0;
3173         }
3174
3175         /*
3176          * Under simple quotas, we set root->relocation_src_root when we find
3177          * the extent. If adjacent extents have different owners, we can't merge
3178          * them while relocating. Handle this by storing the owning root that
3179          * started a cluster and if we see an extent from a different root break
3180          * cluster formation (just like the above case of non-adjacent extents).
3181          *
3182          * Without simple quotas, relocation_src_root is always 0, so we should
3183          * never see a mismatch, and it should have no effect on relocation
3184          * clusters.
3185          */
3186         if (cluster->nr > 0 && cluster->owning_root != root->relocation_src_root) {
3187                 u64 tmp = root->relocation_src_root;
3188
3189                 /*
3190                  * root->relocation_src_root is the state that actually affects
3191                  * the preallocation we do here, so set it to the root owning
3192                  * the cluster we need to relocate.
3193                  */
3194                 root->relocation_src_root = cluster->owning_root;
3195                 ret = relocate_file_extent_cluster(inode, cluster);
3196                 if (ret)
3197                         return ret;
3198                 cluster->nr = 0;
3199                 /* And reset it back for the current extent's owning root. */
3200                 root->relocation_src_root = tmp;
3201         }
3202
3203         if (!cluster->nr) {
3204                 cluster->start = extent_key->objectid;
3205                 cluster->owning_root = root->relocation_src_root;
3206         }
3207         else
3208                 BUG_ON(cluster->nr >= MAX_EXTENTS);
3209         cluster->end = extent_key->objectid + extent_key->offset - 1;
3210         cluster->boundary[cluster->nr] = extent_key->objectid;
3211         cluster->nr++;
3212
3213         if (cluster->nr >= MAX_EXTENTS) {
3214                 ret = relocate_file_extent_cluster(inode, cluster);
3215                 if (ret)
3216                         return ret;
3217                 cluster->nr = 0;
3218         }
3219         return 0;
3220 }
3221
3222 /*
3223  * helper to add a tree block to the list.
3224  * the major work is getting the generation and level of the block
3225  */
3226 static int add_tree_block(struct reloc_control *rc,
3227                           const struct btrfs_key *extent_key,
3228                           struct btrfs_path *path,
3229                           struct rb_root *blocks)
3230 {
3231         struct extent_buffer *eb;
3232         struct btrfs_extent_item *ei;
3233         struct btrfs_tree_block_info *bi;
3234         struct tree_block *block;
3235         struct rb_node *rb_node;
3236         u32 item_size;
3237         int level = -1;
3238         u64 generation;
3239         u64 owner = 0;
3240
3241         eb =  path->nodes[0];
3242         item_size = btrfs_item_size(eb, path->slots[0]);
3243
3244         if (extent_key->type == BTRFS_METADATA_ITEM_KEY ||
3245             item_size >= sizeof(*ei) + sizeof(*bi)) {
3246                 unsigned long ptr = 0, end;
3247
3248                 ei = btrfs_item_ptr(eb, path->slots[0],
3249                                 struct btrfs_extent_item);
3250                 end = (unsigned long)ei + item_size;
3251                 if (extent_key->type == BTRFS_EXTENT_ITEM_KEY) {
3252                         bi = (struct btrfs_tree_block_info *)(ei + 1);
3253                         level = btrfs_tree_block_level(eb, bi);
3254                         ptr = (unsigned long)(bi + 1);
3255                 } else {
3256                         level = (int)extent_key->offset;
3257                         ptr = (unsigned long)(ei + 1);
3258                 }
3259                 generation = btrfs_extent_generation(eb, ei);
3260
3261                 /*
3262                  * We're reading random blocks without knowing their owner ahead
3263                  * of time.  This is ok most of the time, as all reloc roots and
3264                  * fs roots have the same lock type.  However normal trees do
3265                  * not, and the only way to know ahead of time is to read the
3266                  * inline ref offset.  We know it's an fs root if
3267                  *
3268                  * 1. There's more than one ref.
3269                  * 2. There's a SHARED_DATA_REF_KEY set.
3270                  * 3. FULL_BACKREF is set on the flags.
3271                  *
3272                  * Otherwise it's safe to assume that the ref offset == the
3273                  * owner of this block, so we can use that when calling
3274                  * read_tree_block.
3275                  */
3276                 if (btrfs_extent_refs(eb, ei) == 1 &&
3277                     !(btrfs_extent_flags(eb, ei) &
3278                       BTRFS_BLOCK_FLAG_FULL_BACKREF) &&
3279                     ptr < end) {
3280                         struct btrfs_extent_inline_ref *iref;
3281                         int type;
3282
3283                         iref = (struct btrfs_extent_inline_ref *)ptr;
3284                         type = btrfs_get_extent_inline_ref_type(eb, iref,
3285                                                         BTRFS_REF_TYPE_BLOCK);
3286                         if (type == BTRFS_REF_TYPE_INVALID)
3287                                 return -EINVAL;
3288                         if (type == BTRFS_TREE_BLOCK_REF_KEY)
3289                                 owner = btrfs_extent_inline_ref_offset(eb, iref);
3290                 }
3291         } else {
3292                 btrfs_print_leaf(eb);
3293                 btrfs_err(rc->block_group->fs_info,
3294                           "unrecognized tree backref at tree block %llu slot %u",
3295                           eb->start, path->slots[0]);
3296                 btrfs_release_path(path);
3297                 return -EUCLEAN;
3298         }
3299
3300         btrfs_release_path(path);
3301
3302         BUG_ON(level == -1);
3303
3304         block = kmalloc(sizeof(*block), GFP_NOFS);
3305         if (!block)
3306                 return -ENOMEM;
3307
3308         block->bytenr = extent_key->objectid;
3309         block->key.objectid = rc->extent_root->fs_info->nodesize;
3310         block->key.offset = generation;
3311         block->level = level;
3312         block->key_ready = false;
3313         block->owner = owner;
3314
3315         rb_node = rb_simple_insert(blocks, block->bytenr, &block->rb_node);
3316         if (rb_node)
3317                 btrfs_backref_panic(rc->extent_root->fs_info, block->bytenr,
3318                                     -EEXIST);
3319
3320         return 0;
3321 }
3322
3323 /*
3324  * helper to add tree blocks for backref of type BTRFS_SHARED_DATA_REF_KEY
3325  */
3326 static int __add_tree_block(struct reloc_control *rc,
3327                             u64 bytenr, u32 blocksize,
3328                             struct rb_root *blocks)
3329 {
3330         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3331         struct btrfs_path *path;
3332         struct btrfs_key key;
3333         int ret;
3334         bool skinny = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
3335
3336         if (tree_block_processed(bytenr, rc))
3337                 return 0;
3338
3339         if (rb_simple_search(blocks, bytenr))
3340                 return 0;
3341
3342         path = btrfs_alloc_path();
3343         if (!path)
3344                 return -ENOMEM;
3345 again:
3346         key.objectid = bytenr;
3347         if (skinny) {
3348                 key.type = BTRFS_METADATA_ITEM_KEY;
3349                 key.offset = (u64)-1;
3350         } else {
3351                 key.type = BTRFS_EXTENT_ITEM_KEY;
3352                 key.offset = blocksize;
3353         }
3354
3355         path->search_commit_root = 1;
3356         path->skip_locking = 1;
3357         ret = btrfs_search_slot(NULL, rc->extent_root, &key, path, 0, 0);
3358         if (ret < 0)
3359                 goto out;
3360
3361         if (ret > 0 && skinny) {
3362                 if (path->slots[0]) {
3363                         path->slots[0]--;
3364                         btrfs_item_key_to_cpu(path->nodes[0], &key,
3365                                               path->slots[0]);
3366                         if (key.objectid == bytenr &&
3367                             (key.type == BTRFS_METADATA_ITEM_KEY ||
3368                              (key.type == BTRFS_EXTENT_ITEM_KEY &&
3369                               key.offset == blocksize)))
3370                                 ret = 0;
3371                 }
3372
3373                 if (ret) {
3374                         skinny = false;
3375                         btrfs_release_path(path);
3376                         goto again;
3377                 }
3378         }
3379         if (ret) {
3380                 ASSERT(ret == 1);
3381                 btrfs_print_leaf(path->nodes[0]);
3382                 btrfs_err(fs_info,
3383              "tree block extent item (%llu) is not found in extent tree",
3384                      bytenr);
3385                 WARN_ON(1);
3386                 ret = -EINVAL;
3387                 goto out;
3388         }
3389
3390         ret = add_tree_block(rc, &key, path, blocks);
3391 out:
3392         btrfs_free_path(path);
3393         return ret;
3394 }
3395
3396 static int delete_block_group_cache(struct btrfs_fs_info *fs_info,
3397                                     struct btrfs_block_group *block_group,
3398                                     struct inode *inode,
3399                                     u64 ino)
3400 {
3401         struct btrfs_root *root = fs_info->tree_root;
3402         struct btrfs_trans_handle *trans;
3403         int ret = 0;
3404
3405         if (inode)
3406                 goto truncate;
3407
3408         inode = btrfs_iget(fs_info->sb, ino, root);
3409         if (IS_ERR(inode))
3410                 return -ENOENT;
3411
3412 truncate:
3413         ret = btrfs_check_trunc_cache_free_space(fs_info,
3414                                                  &fs_info->global_block_rsv);
3415         if (ret)
3416                 goto out;
3417
3418         trans = btrfs_join_transaction(root);
3419         if (IS_ERR(trans)) {
3420                 ret = PTR_ERR(trans);
3421                 goto out;
3422         }
3423
3424         ret = btrfs_truncate_free_space_cache(trans, block_group, inode);
3425
3426         btrfs_end_transaction(trans);
3427         btrfs_btree_balance_dirty(fs_info);
3428 out:
3429         iput(inode);
3430         return ret;
3431 }
3432
3433 /*
3434  * Locate the free space cache EXTENT_DATA in root tree leaf and delete the
3435  * cache inode, to avoid free space cache data extent blocking data relocation.
3436  */
3437 static int delete_v1_space_cache(struct extent_buffer *leaf,
3438                                  struct btrfs_block_group *block_group,
3439                                  u64 data_bytenr)
3440 {
3441         u64 space_cache_ino;
3442         struct btrfs_file_extent_item *ei;
3443         struct btrfs_key key;
3444         bool found = false;
3445         int i;
3446         int ret;
3447
3448         if (btrfs_header_owner(leaf) != BTRFS_ROOT_TREE_OBJECTID)
3449                 return 0;
3450
3451         for (i = 0; i < btrfs_header_nritems(leaf); i++) {
3452                 u8 type;
3453
3454                 btrfs_item_key_to_cpu(leaf, &key, i);
3455                 if (key.type != BTRFS_EXTENT_DATA_KEY)
3456                         continue;
3457                 ei = btrfs_item_ptr(leaf, i, struct btrfs_file_extent_item);
3458                 type = btrfs_file_extent_type(leaf, ei);
3459
3460                 if ((type == BTRFS_FILE_EXTENT_REG ||
3461                      type == BTRFS_FILE_EXTENT_PREALLOC) &&
3462                     btrfs_file_extent_disk_bytenr(leaf, ei) == data_bytenr) {
3463                         found = true;
3464                         space_cache_ino = key.objectid;
3465                         break;
3466                 }
3467         }
3468         if (!found)
3469                 return -ENOENT;
3470         ret = delete_block_group_cache(leaf->fs_info, block_group, NULL,
3471                                         space_cache_ino);
3472         return ret;
3473 }
3474
3475 /*
3476  * helper to find all tree blocks that reference a given data extent
3477  */
3478 static noinline_for_stack int add_data_references(struct reloc_control *rc,
3479                                                   const struct btrfs_key *extent_key,
3480                                                   struct btrfs_path *path,
3481                                                   struct rb_root *blocks)
3482 {
3483         struct btrfs_backref_walk_ctx ctx = { 0 };
3484         struct ulist_iterator leaf_uiter;
3485         struct ulist_node *ref_node = NULL;
3486         const u32 blocksize = rc->extent_root->fs_info->nodesize;
3487         int ret = 0;
3488
3489         btrfs_release_path(path);
3490
3491         ctx.bytenr = extent_key->objectid;
3492         ctx.skip_inode_ref_list = true;
3493         ctx.fs_info = rc->extent_root->fs_info;
3494
3495         ret = btrfs_find_all_leafs(&ctx);
3496         if (ret < 0)
3497                 return ret;
3498
3499         ULIST_ITER_INIT(&leaf_uiter);
3500         while ((ref_node = ulist_next(ctx.refs, &leaf_uiter))) {
3501                 struct btrfs_tree_parent_check check = { 0 };
3502                 struct extent_buffer *eb;
3503
3504                 eb = read_tree_block(ctx.fs_info, ref_node->val, &check);
3505                 if (IS_ERR(eb)) {
3506                         ret = PTR_ERR(eb);
3507                         break;
3508                 }
3509                 ret = delete_v1_space_cache(eb, rc->block_group,
3510                                             extent_key->objectid);
3511                 free_extent_buffer(eb);
3512                 if (ret < 0)
3513                         break;
3514                 ret = __add_tree_block(rc, ref_node->val, blocksize, blocks);
3515                 if (ret < 0)
3516                         break;
3517         }
3518         if (ret < 0)
3519                 free_block_list(blocks);
3520         ulist_free(ctx.refs);
3521         return ret;
3522 }
3523
3524 /*
3525  * helper to find next unprocessed extent
3526  */
3527 static noinline_for_stack
3528 int find_next_extent(struct reloc_control *rc, struct btrfs_path *path,
3529                      struct btrfs_key *extent_key)
3530 {
3531         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3532         struct btrfs_key key;
3533         struct extent_buffer *leaf;
3534         u64 start, end, last;
3535         int ret;
3536
3537         last = rc->block_group->start + rc->block_group->length;
3538         while (1) {
3539                 bool block_found;
3540
3541                 cond_resched();
3542                 if (rc->search_start >= last) {
3543                         ret = 1;
3544                         break;
3545                 }
3546
3547                 key.objectid = rc->search_start;
3548                 key.type = BTRFS_EXTENT_ITEM_KEY;
3549                 key.offset = 0;
3550
3551                 path->search_commit_root = 1;
3552                 path->skip_locking = 1;
3553                 ret = btrfs_search_slot(NULL, rc->extent_root, &key, path,
3554                                         0, 0);
3555                 if (ret < 0)
3556                         break;
3557 next:
3558                 leaf = path->nodes[0];
3559                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3560                         ret = btrfs_next_leaf(rc->extent_root, path);
3561                         if (ret != 0)
3562                                 break;
3563                         leaf = path->nodes[0];
3564                 }
3565
3566                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3567                 if (key.objectid >= last) {
3568                         ret = 1;
3569                         break;
3570                 }
3571
3572                 if (key.type != BTRFS_EXTENT_ITEM_KEY &&
3573                     key.type != BTRFS_METADATA_ITEM_KEY) {
3574                         path->slots[0]++;
3575                         goto next;
3576                 }
3577
3578                 if (key.type == BTRFS_EXTENT_ITEM_KEY &&
3579                     key.objectid + key.offset <= rc->search_start) {
3580                         path->slots[0]++;
3581                         goto next;
3582                 }
3583
3584                 if (key.type == BTRFS_METADATA_ITEM_KEY &&
3585                     key.objectid + fs_info->nodesize <=
3586                     rc->search_start) {
3587                         path->slots[0]++;
3588                         goto next;
3589                 }
3590
3591                 block_found = find_first_extent_bit(&rc->processed_blocks,
3592                                                     key.objectid, &start, &end,
3593                                                     EXTENT_DIRTY, NULL);
3594
3595                 if (block_found && start <= key.objectid) {
3596                         btrfs_release_path(path);
3597                         rc->search_start = end + 1;
3598                 } else {
3599                         if (key.type == BTRFS_EXTENT_ITEM_KEY)
3600                                 rc->search_start = key.objectid + key.offset;
3601                         else
3602                                 rc->search_start = key.objectid +
3603                                         fs_info->nodesize;
3604                         memcpy(extent_key, &key, sizeof(key));
3605                         return 0;
3606                 }
3607         }
3608         btrfs_release_path(path);
3609         return ret;
3610 }
3611
3612 static void set_reloc_control(struct reloc_control *rc)
3613 {
3614         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3615
3616         mutex_lock(&fs_info->reloc_mutex);
3617         fs_info->reloc_ctl = rc;
3618         mutex_unlock(&fs_info->reloc_mutex);
3619 }
3620
3621 static void unset_reloc_control(struct reloc_control *rc)
3622 {
3623         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3624
3625         mutex_lock(&fs_info->reloc_mutex);
3626         fs_info->reloc_ctl = NULL;
3627         mutex_unlock(&fs_info->reloc_mutex);
3628 }
3629
3630 static noinline_for_stack
3631 int prepare_to_relocate(struct reloc_control *rc)
3632 {
3633         struct btrfs_trans_handle *trans;
3634         int ret;
3635
3636         rc->block_rsv = btrfs_alloc_block_rsv(rc->extent_root->fs_info,
3637                                               BTRFS_BLOCK_RSV_TEMP);
3638         if (!rc->block_rsv)
3639                 return -ENOMEM;
3640
3641         memset(&rc->cluster, 0, sizeof(rc->cluster));
3642         rc->search_start = rc->block_group->start;
3643         rc->extents_found = 0;
3644         rc->nodes_relocated = 0;
3645         rc->merging_rsv_size = 0;
3646         rc->reserved_bytes = 0;
3647         rc->block_rsv->size = rc->extent_root->fs_info->nodesize *
3648                               RELOCATION_RESERVED_NODES;
3649         ret = btrfs_block_rsv_refill(rc->extent_root->fs_info,
3650                                      rc->block_rsv, rc->block_rsv->size,
3651                                      BTRFS_RESERVE_FLUSH_ALL);
3652         if (ret)
3653                 return ret;
3654
3655         rc->create_reloc_tree = true;
3656         set_reloc_control(rc);
3657
3658         trans = btrfs_join_transaction(rc->extent_root);
3659         if (IS_ERR(trans)) {
3660                 unset_reloc_control(rc);
3661                 /*
3662                  * extent tree is not a ref_cow tree and has no reloc_root to
3663                  * cleanup.  And callers are responsible to free the above
3664                  * block rsv.
3665                  */
3666                 return PTR_ERR(trans);
3667         }
3668
3669         ret = btrfs_commit_transaction(trans);
3670         if (ret)
3671                 unset_reloc_control(rc);
3672
3673         return ret;
3674 }
3675
3676 static noinline_for_stack int relocate_block_group(struct reloc_control *rc)
3677 {
3678         struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
3679         struct rb_root blocks = RB_ROOT;
3680         struct btrfs_key key;
3681         struct btrfs_trans_handle *trans = NULL;
3682         struct btrfs_path *path;
3683         struct btrfs_extent_item *ei;
3684         u64 flags;
3685         int ret;
3686         int err = 0;
3687         int progress = 0;
3688
3689         path = btrfs_alloc_path();
3690         if (!path)
3691                 return -ENOMEM;
3692         path->reada = READA_FORWARD;
3693
3694         ret = prepare_to_relocate(rc);
3695         if (ret) {
3696                 err = ret;
3697                 goto out_free;
3698         }
3699
3700         while (1) {
3701                 rc->reserved_bytes = 0;
3702                 ret = btrfs_block_rsv_refill(fs_info, rc->block_rsv,
3703                                              rc->block_rsv->size,
3704                                              BTRFS_RESERVE_FLUSH_ALL);
3705                 if (ret) {
3706                         err = ret;
3707                         break;
3708                 }
3709                 progress++;
3710                 trans = btrfs_start_transaction(rc->extent_root, 0);
3711                 if (IS_ERR(trans)) {
3712                         err = PTR_ERR(trans);
3713                         trans = NULL;
3714                         break;
3715                 }
3716 restart:
3717                 if (update_backref_cache(trans, &rc->backref_cache)) {
3718                         btrfs_end_transaction(trans);
3719                         trans = NULL;
3720                         continue;
3721                 }
3722
3723                 ret = find_next_extent(rc, path, &key);
3724                 if (ret < 0)
3725                         err = ret;
3726                 if (ret != 0)
3727                         break;
3728
3729                 rc->extents_found++;
3730
3731                 ei = btrfs_item_ptr(path->nodes[0], path->slots[0],
3732                                     struct btrfs_extent_item);
3733                 flags = btrfs_extent_flags(path->nodes[0], ei);
3734
3735                 /*
3736                  * If we are relocating a simple quota owned extent item, we
3737                  * need to note the owner on the reloc data root so that when
3738                  * we allocate the replacement item, we can attribute it to the
3739                  * correct eventual owner (rather than the reloc data root).
3740                  */
3741                 if (btrfs_qgroup_mode(fs_info) == BTRFS_QGROUP_MODE_SIMPLE) {
3742                         struct btrfs_root *root = BTRFS_I(rc->data_inode)->root;
3743                         u64 owning_root_id = btrfs_get_extent_owner_root(fs_info,
3744                                                                  path->nodes[0],
3745                                                                  path->slots[0]);
3746
3747                         root->relocation_src_root = owning_root_id;
3748                 }
3749
3750                 if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
3751                         ret = add_tree_block(rc, &key, path, &blocks);
3752                 } else if (rc->stage == UPDATE_DATA_PTRS &&
3753                            (flags & BTRFS_EXTENT_FLAG_DATA)) {
3754                         ret = add_data_references(rc, &key, path, &blocks);
3755                 } else {
3756                         btrfs_release_path(path);
3757                         ret = 0;
3758                 }
3759                 if (ret < 0) {
3760                         err = ret;
3761                         break;
3762                 }
3763
3764                 if (!RB_EMPTY_ROOT(&blocks)) {
3765                         ret = relocate_tree_blocks(trans, rc, &blocks);
3766                         if (ret < 0) {
3767                                 if (ret != -EAGAIN) {
3768                                         err = ret;
3769                                         break;
3770                                 }
3771                                 rc->extents_found--;
3772                                 rc->search_start = key.objectid;
3773                         }
3774                 }
3775
3776                 btrfs_end_transaction_throttle(trans);
3777                 btrfs_btree_balance_dirty(fs_info);
3778                 trans = NULL;
3779
3780                 if (rc->stage == MOVE_DATA_EXTENTS &&
3781                     (flags & BTRFS_EXTENT_FLAG_DATA)) {
3782                         rc->found_file_extent = true;
3783                         ret = relocate_data_extent(rc->data_inode,
3784                                                    &key, &rc->cluster);
3785                         if (ret < 0) {
3786                                 err = ret;
3787                                 break;
3788                         }
3789                 }
3790                 if (btrfs_should_cancel_balance(fs_info)) {
3791                         err = -ECANCELED;
3792                         break;
3793                 }
3794         }
3795         if (trans && progress && err == -ENOSPC) {
3796                 ret = btrfs_force_chunk_alloc(trans, rc->block_group->flags);
3797                 if (ret == 1) {
3798                         err = 0;
3799                         progress = 0;
3800                         goto restart;
3801                 }
3802         }
3803
3804         btrfs_release_path(path);
3805         clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY);
3806
3807         if (trans) {
3808                 btrfs_end_transaction_throttle(trans);
3809                 btrfs_btree_balance_dirty(fs_info);
3810         }
3811
3812         if (!err) {
3813                 ret = relocate_file_extent_cluster(rc->data_inode,
3814                                                    &rc->cluster);
3815                 if (ret < 0)
3816                         err = ret;
3817         }
3818
3819         rc->create_reloc_tree = false;
3820         set_reloc_control(rc);
3821
3822         btrfs_backref_release_cache(&rc->backref_cache);
3823         btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);
3824
3825         /*
3826          * Even in the case when the relocation is cancelled, we should all go
3827          * through prepare_to_merge() and merge_reloc_roots().
3828          *
3829          * For error (including cancelled balance), prepare_to_merge() will
3830          * mark all reloc trees orphan, then queue them for cleanup in
3831          * merge_reloc_roots()
3832          */
3833         err = prepare_to_merge(rc, err);
3834
3835         merge_reloc_roots(rc);
3836
3837         rc->merge_reloc_tree = false;
3838         unset_reloc_control(rc);
3839         btrfs_block_rsv_release(fs_info, rc->block_rsv, (u64)-1, NULL);
3840
3841         /* get rid of pinned extents */
3842         trans = btrfs_join_transaction(rc->extent_root);
3843         if (IS_ERR(trans)) {
3844                 err = PTR_ERR(trans);
3845                 goto out_free;
3846         }
3847         ret = btrfs_commit_transaction(trans);
3848         if (ret && !err)
3849                 err = ret;
3850 out_free:
3851         ret = clean_dirty_subvols(rc);
3852         if (ret < 0 && !err)
3853                 err = ret;
3854         btrfs_free_block_rsv(fs_info, rc->block_rsv);
3855         btrfs_free_path(path);
3856         return err;
3857 }
3858
3859 static int __insert_orphan_inode(struct btrfs_trans_handle *trans,
3860                                  struct btrfs_root *root, u64 objectid)
3861 {
3862         struct btrfs_path *path;
3863         struct btrfs_inode_item *item;
3864         struct extent_buffer *leaf;
3865         int ret;
3866
3867         path = btrfs_alloc_path();
3868         if (!path)
3869                 return -ENOMEM;
3870
3871         ret = btrfs_insert_empty_inode(trans, root, path, objectid);
3872         if (ret)
3873                 goto out;
3874
3875         leaf = path->nodes[0];
3876         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item);
3877         memzero_extent_buffer(leaf, (unsigned long)item, sizeof(*item));
3878         btrfs_set_inode_generation(leaf, item, 1);
3879         btrfs_set_inode_size(leaf, item, 0);
3880         btrfs_set_inode_mode(leaf, item, S_IFREG | 0600);
3881         btrfs_set_inode_flags(leaf, item, BTRFS_INODE_NOCOMPRESS |
3882                                           BTRFS_INODE_PREALLOC);
3883         btrfs_mark_buffer_dirty(trans, leaf);
3884 out:
3885         btrfs_free_path(path);
3886         return ret;
3887 }
3888
3889 static void delete_orphan_inode(struct btrfs_trans_handle *trans,
3890                                 struct btrfs_root *root, u64 objectid)
3891 {
3892         struct btrfs_path *path;
3893         struct btrfs_key key;
3894         int ret = 0;
3895
3896         path = btrfs_alloc_path();
3897         if (!path) {
3898                 ret = -ENOMEM;
3899                 goto out;
3900         }
3901
3902         key.objectid = objectid;
3903         key.type = BTRFS_INODE_ITEM_KEY;
3904         key.offset = 0;
3905         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
3906         if (ret) {
3907                 if (ret > 0)
3908                         ret = -ENOENT;
3909                 goto out;
3910         }
3911         ret = btrfs_del_item(trans, root, path);
3912 out:
3913         if (ret)
3914                 btrfs_abort_transaction(trans, ret);
3915         btrfs_free_path(path);
3916 }
3917
3918 /*
3919  * helper to create inode for data relocation.
3920  * the inode is in data relocation tree and its link count is 0
3921  */
3922 static noinline_for_stack struct inode *create_reloc_inode(
3923                                         struct btrfs_fs_info *fs_info,
3924                                         const struct btrfs_block_group *group)
3925 {
3926         struct inode *inode = NULL;
3927         struct btrfs_trans_handle *trans;
3928         struct btrfs_root *root;
3929         u64 objectid;
3930         int err = 0;
3931
3932         root = btrfs_grab_root(fs_info->data_reloc_root);
3933         trans = btrfs_start_transaction(root, 6);
3934         if (IS_ERR(trans)) {
3935                 btrfs_put_root(root);
3936                 return ERR_CAST(trans);
3937         }
3938
3939         err = btrfs_get_free_objectid(root, &objectid);
3940         if (err)
3941                 goto out;
3942
3943         err = __insert_orphan_inode(trans, root, objectid);
3944         if (err)
3945                 goto out;
3946
3947         inode = btrfs_iget(fs_info->sb, objectid, root);
3948         if (IS_ERR(inode)) {
3949                 delete_orphan_inode(trans, root, objectid);
3950                 err = PTR_ERR(inode);
3951                 inode = NULL;
3952                 goto out;
3953         }
3954         BTRFS_I(inode)->index_cnt = group->start;
3955
3956         err = btrfs_orphan_add(trans, BTRFS_I(inode));
3957 out:
3958         btrfs_put_root(root);
3959         btrfs_end_transaction(trans);
3960         btrfs_btree_balance_dirty(fs_info);
3961         if (err) {
3962                 iput(inode);
3963                 inode = ERR_PTR(err);
3964         }
3965         return inode;
3966 }
3967
3968 /*
3969  * Mark start of chunk relocation that is cancellable. Check if the cancellation
3970  * has been requested meanwhile and don't start in that case.
3971  *
3972  * Return:
3973  *   0             success
3974  *   -EINPROGRESS  operation is already in progress, that's probably a bug
3975  *   -ECANCELED    cancellation request was set before the operation started
3976  */
3977 static int reloc_chunk_start(struct btrfs_fs_info *fs_info)
3978 {
3979         if (test_and_set_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) {
3980                 /* This should not happen */
3981                 btrfs_err(fs_info, "reloc already running, cannot start");
3982                 return -EINPROGRESS;
3983         }
3984
3985         if (atomic_read(&fs_info->reloc_cancel_req) > 0) {
3986                 btrfs_info(fs_info, "chunk relocation canceled on start");
3987                 /*
3988                  * On cancel, clear all requests but let the caller mark
3989                  * the end after cleanup operations.
3990                  */
3991                 atomic_set(&fs_info->reloc_cancel_req, 0);
3992                 return -ECANCELED;
3993         }
3994         return 0;
3995 }
3996
3997 /*
3998  * Mark end of chunk relocation that is cancellable and wake any waiters.
3999  */
4000 static void reloc_chunk_end(struct btrfs_fs_info *fs_info)
4001 {
4002         /* Requested after start, clear bit first so any waiters can continue */
4003         if (atomic_read(&fs_info->reloc_cancel_req) > 0)
4004                 btrfs_info(fs_info, "chunk relocation canceled during operation");
4005         clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags);
4006         atomic_set(&fs_info->reloc_cancel_req, 0);
4007 }
4008
4009 static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
4010 {
4011         struct reloc_control *rc;
4012
4013         rc = kzalloc(sizeof(*rc), GFP_NOFS);
4014         if (!rc)
4015                 return NULL;
4016
4017         INIT_LIST_HEAD(&rc->reloc_roots);
4018         INIT_LIST_HEAD(&rc->dirty_subvol_roots);
4019         btrfs_backref_init_cache(fs_info, &rc->backref_cache, true);
4020         rc->reloc_root_tree.rb_root = RB_ROOT;
4021         spin_lock_init(&rc->reloc_root_tree.lock);
4022         extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS);
4023         return rc;
4024 }
4025
4026 static void free_reloc_control(struct reloc_control *rc)
4027 {
4028         struct mapping_node *node, *tmp;
4029
4030         free_reloc_roots(&rc->reloc_roots);
4031         rbtree_postorder_for_each_entry_safe(node, tmp,
4032                         &rc->reloc_root_tree.rb_root, rb_node)
4033                 kfree(node);
4034
4035         kfree(rc);
4036 }
4037
4038 /*
4039  * Print the block group being relocated
4040  */
4041 static void describe_relocation(struct btrfs_fs_info *fs_info,
4042                                 struct btrfs_block_group *block_group)
4043 {
4044         char buf[128] = {'\0'};
4045
4046         btrfs_describe_block_groups(block_group->flags, buf, sizeof(buf));
4047
4048         btrfs_info(fs_info,
4049                    "relocating block group %llu flags %s",
4050                    block_group->start, buf);
4051 }
4052
4053 static const char *stage_to_string(enum reloc_stage stage)
4054 {
4055         if (stage == MOVE_DATA_EXTENTS)
4056                 return "move data extents";
4057         if (stage == UPDATE_DATA_PTRS)
4058                 return "update data pointers";
4059         return "unknown";
4060 }
4061
4062 /*
4063  * function to relocate all extents in a block group.
4064  */
4065 int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
4066 {
4067         struct btrfs_block_group *bg;
4068         struct btrfs_root *extent_root = btrfs_extent_root(fs_info, group_start);
4069         struct reloc_control *rc;
4070         struct inode *inode;
4071         struct btrfs_path *path;
4072         int ret;
4073         int rw = 0;
4074         int err = 0;
4075
4076         /*
4077          * This only gets set if we had a half-deleted snapshot on mount.  We
4078          * cannot allow relocation to start while we're still trying to clean up
4079          * these pending deletions.
4080          */
4081         ret = wait_on_bit(&fs_info->flags, BTRFS_FS_UNFINISHED_DROPS, TASK_INTERRUPTIBLE);
4082         if (ret)
4083                 return ret;
4084
4085         /* We may have been woken up by close_ctree, so bail if we're closing. */
4086         if (btrfs_fs_closing(fs_info))
4087                 return -EINTR;
4088
4089         bg = btrfs_lookup_block_group(fs_info, group_start);
4090         if (!bg)
4091                 return -ENOENT;
4092
4093         /*
4094          * Relocation of a data block group creates ordered extents.  Without
4095          * sb_start_write(), we can freeze the filesystem while unfinished
4096          * ordered extents are left. Such ordered extents can cause a deadlock
4097          * e.g. when syncfs() is waiting for their completion but they can't
4098          * finish because they block when joining a transaction, due to the
4099          * fact that the freeze locks are being held in write mode.
4100          */
4101         if (bg->flags & BTRFS_BLOCK_GROUP_DATA)
4102                 ASSERT(sb_write_started(fs_info->sb));
4103
4104         if (btrfs_pinned_by_swapfile(fs_info, bg)) {
4105                 btrfs_put_block_group(bg);
4106                 return -ETXTBSY;
4107         }
4108
4109         rc = alloc_reloc_control(fs_info);
4110         if (!rc) {
4111                 btrfs_put_block_group(bg);
4112                 return -ENOMEM;
4113         }
4114
4115         ret = reloc_chunk_start(fs_info);
4116         if (ret < 0) {
4117                 err = ret;
4118                 goto out_put_bg;
4119         }
4120
4121         rc->extent_root = extent_root;
4122         rc->block_group = bg;
4123
4124         ret = btrfs_inc_block_group_ro(rc->block_group, true);
4125         if (ret) {
4126                 err = ret;
4127                 goto out;
4128         }
4129         rw = 1;
4130
4131         path = btrfs_alloc_path();
4132         if (!path) {
4133                 err = -ENOMEM;
4134                 goto out;
4135         }
4136
4137         inode = lookup_free_space_inode(rc->block_group, path);
4138         btrfs_free_path(path);
4139
4140         if (!IS_ERR(inode))
4141                 ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0);
4142         else
4143                 ret = PTR_ERR(inode);
4144
4145         if (ret && ret != -ENOENT) {
4146                 err = ret;
4147                 goto out;
4148         }
4149
4150         rc->data_inode = create_reloc_inode(fs_info, rc->block_group);
4151         if (IS_ERR(rc->data_inode)) {
4152                 err = PTR_ERR(rc->data_inode);
4153                 rc->data_inode = NULL;
4154                 goto out;
4155         }
4156
4157         describe_relocation(fs_info, rc->block_group);
4158
4159         btrfs_wait_block_group_reservations(rc->block_group);
4160         btrfs_wait_nocow_writers(rc->block_group);
4161         btrfs_wait_ordered_roots(fs_info, U64_MAX,
4162                                  rc->block_group->start,
4163                                  rc->block_group->length);
4164
4165         ret = btrfs_zone_finish(rc->block_group);
4166         WARN_ON(ret && ret != -EAGAIN);
4167
4168         while (1) {
4169                 enum reloc_stage finishes_stage;
4170
4171                 mutex_lock(&fs_info->cleaner_mutex);
4172                 ret = relocate_block_group(rc);
4173                 mutex_unlock(&fs_info->cleaner_mutex);
4174                 if (ret < 0)
4175                         err = ret;
4176
4177                 finishes_stage = rc->stage;
4178                 /*
4179                  * We may have gotten ENOSPC after we already dirtied some
4180                  * extents.  If writeout happens while we're relocating a
4181                  * different block group we could end up hitting the
4182                  * BUG_ON(rc->stage == UPDATE_DATA_PTRS) in
4183                  * btrfs_reloc_cow_block.  Make sure we write everything out
4184                  * properly so we don't trip over this problem, and then break
4185                  * out of the loop if we hit an error.
4186                  */
4187                 if (rc->stage == MOVE_DATA_EXTENTS && rc->found_file_extent) {
4188                         ret = btrfs_wait_ordered_range(rc->data_inode, 0,
4189                                                        (u64)-1);
4190                         if (ret)
4191                                 err = ret;
4192                         invalidate_mapping_pages(rc->data_inode->i_mapping,
4193                                                  0, -1);
4194                         rc->stage = UPDATE_DATA_PTRS;
4195                 }
4196
4197                 if (err < 0)
4198                         goto out;
4199
4200                 if (rc->extents_found == 0)
4201                         break;
4202
4203                 btrfs_info(fs_info, "found %llu extents, stage: %s",
4204                            rc->extents_found, stage_to_string(finishes_stage));
4205         }
4206
4207         WARN_ON(rc->block_group->pinned > 0);
4208         WARN_ON(rc->block_group->reserved > 0);
4209         WARN_ON(rc->block_group->used > 0);
4210 out:
4211         if (err && rw)
4212                 btrfs_dec_block_group_ro(rc->block_group);
4213         iput(rc->data_inode);
4214 out_put_bg:
4215         btrfs_put_block_group(bg);
4216         reloc_chunk_end(fs_info);
4217         free_reloc_control(rc);
4218         return err;
4219 }
4220
4221 static noinline_for_stack int mark_garbage_root(struct btrfs_root *root)
4222 {
4223         struct btrfs_fs_info *fs_info = root->fs_info;
4224         struct btrfs_trans_handle *trans;
4225         int ret, err;
4226
4227         trans = btrfs_start_transaction(fs_info->tree_root, 0);
4228         if (IS_ERR(trans))
4229                 return PTR_ERR(trans);
4230
4231         memset(&root->root_item.drop_progress, 0,
4232                 sizeof(root->root_item.drop_progress));
4233         btrfs_set_root_drop_level(&root->root_item, 0);
4234         btrfs_set_root_refs(&root->root_item, 0);
4235         ret = btrfs_update_root(trans, fs_info->tree_root,
4236                                 &root->root_key, &root->root_item);
4237
4238         err = btrfs_end_transaction(trans);
4239         if (err)
4240                 return err;
4241         return ret;
4242 }
4243
4244 /*
4245  * recover relocation interrupted by system crash.
4246  *
4247  * this function resumes merging reloc trees with corresponding fs trees.
4248  * this is important for keeping the sharing of tree blocks
4249  */
4250 int btrfs_recover_relocation(struct btrfs_fs_info *fs_info)
4251 {
4252         LIST_HEAD(reloc_roots);
4253         struct btrfs_key key;
4254         struct btrfs_root *fs_root;
4255         struct btrfs_root *reloc_root;
4256         struct btrfs_path *path;
4257         struct extent_buffer *leaf;
4258         struct reloc_control *rc = NULL;
4259         struct btrfs_trans_handle *trans;
4260         int ret;
4261         int err = 0;
4262
4263         path = btrfs_alloc_path();
4264         if (!path)
4265                 return -ENOMEM;
4266         path->reada = READA_BACK;
4267
4268         key.objectid = BTRFS_TREE_RELOC_OBJECTID;
4269         key.type = BTRFS_ROOT_ITEM_KEY;
4270         key.offset = (u64)-1;
4271
4272         while (1) {
4273                 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key,
4274                                         path, 0, 0);
4275                 if (ret < 0) {
4276                         err = ret;
4277                         goto out;
4278                 }
4279                 if (ret > 0) {
4280                         if (path->slots[0] == 0)
4281                                 break;
4282                         path->slots[0]--;
4283                 }
4284                 leaf = path->nodes[0];
4285                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4286                 btrfs_release_path(path);
4287
4288                 if (key.objectid != BTRFS_TREE_RELOC_OBJECTID ||
4289                     key.type != BTRFS_ROOT_ITEM_KEY)
4290                         break;
4291
4292                 reloc_root = btrfs_read_tree_root(fs_info->tree_root, &key);
4293                 if (IS_ERR(reloc_root)) {
4294                         err = PTR_ERR(reloc_root);
4295                         goto out;
4296                 }
4297
4298                 set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state);
4299                 list_add(&reloc_root->root_list, &reloc_roots);
4300
4301                 if (btrfs_root_refs(&reloc_root->root_item) > 0) {
4302                         fs_root = btrfs_get_fs_root(fs_info,
4303                                         reloc_root->root_key.offset, false);
4304                         if (IS_ERR(fs_root)) {
4305                                 ret = PTR_ERR(fs_root);
4306                                 if (ret != -ENOENT) {
4307                                         err = ret;
4308                                         goto out;
4309                                 }
4310                                 ret = mark_garbage_root(reloc_root);
4311                                 if (ret < 0) {
4312                                         err = ret;
4313                                         goto out;
4314                                 }
4315                         } else {
4316                                 btrfs_put_root(fs_root);
4317                         }
4318                 }
4319
4320                 if (key.offset == 0)
4321                         break;
4322
4323                 key.offset--;
4324         }
4325         btrfs_release_path(path);
4326
4327         if (list_empty(&reloc_roots))
4328                 goto out;
4329
4330         rc = alloc_reloc_control(fs_info);
4331         if (!rc) {
4332                 err = -ENOMEM;
4333                 goto out;
4334         }
4335
4336         ret = reloc_chunk_start(fs_info);
4337         if (ret < 0) {
4338                 err = ret;
4339                 goto out_end;
4340         }
4341
4342         rc->extent_root = btrfs_extent_root(fs_info, 0);
4343
4344         set_reloc_control(rc);
4345
4346         trans = btrfs_join_transaction(rc->extent_root);
4347         if (IS_ERR(trans)) {
4348                 err = PTR_ERR(trans);
4349                 goto out_unset;
4350         }
4351
4352         rc->merge_reloc_tree = true;
4353
4354         while (!list_empty(&reloc_roots)) {
4355                 reloc_root = list_entry(reloc_roots.next,
4356                                         struct btrfs_root, root_list);
4357                 list_del(&reloc_root->root_list);
4358
4359                 if (btrfs_root_refs(&reloc_root->root_item) == 0) {
4360                         list_add_tail(&reloc_root->root_list,
4361                                       &rc->reloc_roots);
4362                         continue;
4363                 }
4364
4365                 fs_root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset,
4366                                             false);
4367                 if (IS_ERR(fs_root)) {
4368                         err = PTR_ERR(fs_root);
4369                         list_add_tail(&reloc_root->root_list, &reloc_roots);
4370                         btrfs_end_transaction(trans);
4371                         goto out_unset;
4372                 }
4373
4374                 err = __add_reloc_root(reloc_root);
4375                 ASSERT(err != -EEXIST);
4376                 if (err) {
4377                         list_add_tail(&reloc_root->root_list, &reloc_roots);
4378                         btrfs_put_root(fs_root);
4379                         btrfs_end_transaction(trans);
4380                         goto out_unset;
4381                 }
4382                 fs_root->reloc_root = btrfs_grab_root(reloc_root);
4383                 btrfs_put_root(fs_root);
4384         }
4385
4386         err = btrfs_commit_transaction(trans);
4387         if (err)
4388                 goto out_unset;
4389
4390         merge_reloc_roots(rc);
4391
4392         unset_reloc_control(rc);
4393
4394         trans = btrfs_join_transaction(rc->extent_root);
4395         if (IS_ERR(trans)) {
4396                 err = PTR_ERR(trans);
4397                 goto out_clean;
4398         }
4399         err = btrfs_commit_transaction(trans);
4400 out_clean:
4401         ret = clean_dirty_subvols(rc);
4402         if (ret < 0 && !err)
4403                 err = ret;
4404 out_unset:
4405         unset_reloc_control(rc);
4406 out_end:
4407         reloc_chunk_end(fs_info);
4408         free_reloc_control(rc);
4409 out:
4410         free_reloc_roots(&reloc_roots);
4411
4412         btrfs_free_path(path);
4413
4414         if (err == 0) {
4415                 /* cleanup orphan inode in data relocation tree */
4416                 fs_root = btrfs_grab_root(fs_info->data_reloc_root);
4417                 ASSERT(fs_root);
4418                 err = btrfs_orphan_cleanup(fs_root);
4419                 btrfs_put_root(fs_root);
4420         }
4421         return err;
4422 }
4423
4424 /*
4425  * helper to add ordered checksum for data relocation.
4426  *
4427  * cloning checksum properly handles the nodatasum extents.
4428  * it also saves CPU time to re-calculate the checksum.
4429  */
4430 int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered)
4431 {
4432         struct btrfs_inode *inode = BTRFS_I(ordered->inode);
4433         struct btrfs_fs_info *fs_info = inode->root->fs_info;
4434         u64 disk_bytenr = ordered->file_offset + inode->index_cnt;
4435         struct btrfs_root *csum_root = btrfs_csum_root(fs_info, disk_bytenr);
4436         LIST_HEAD(list);
4437         int ret;
4438
4439         ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
4440                                       disk_bytenr + ordered->num_bytes - 1,
4441                                       &list, 0, false);
4442         if (ret)
4443                 return ret;
4444
4445         while (!list_empty(&list)) {
4446                 struct btrfs_ordered_sum *sums =
4447                         list_entry(list.next, struct btrfs_ordered_sum, list);
4448
4449                 list_del_init(&sums->list);
4450
4451                 /*
4452                  * We need to offset the new_bytenr based on where the csum is.
4453                  * We need to do this because we will read in entire prealloc
4454                  * extents but we may have written to say the middle of the
4455                  * prealloc extent, so we need to make sure the csum goes with
4456                  * the right disk offset.
4457                  *
4458                  * We can do this because the data reloc inode refers strictly
4459                  * to the on disk bytes, so we don't have to worry about
4460                  * disk_len vs real len like with real inodes since it's all
4461                  * disk length.
4462                  */
4463                 sums->logical = ordered->disk_bytenr + sums->logical - disk_bytenr;
4464                 btrfs_add_ordered_sum(ordered, sums);
4465         }
4466
4467         return 0;
4468 }
4469
4470 int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
4471                           struct btrfs_root *root,
4472                           const struct extent_buffer *buf,
4473                           struct extent_buffer *cow)
4474 {
4475         struct btrfs_fs_info *fs_info = root->fs_info;
4476         struct reloc_control *rc;
4477         struct btrfs_backref_node *node;
4478         int first_cow = 0;
4479         int level;
4480         int ret = 0;
4481
4482         rc = fs_info->reloc_ctl;
4483         if (!rc)
4484                 return 0;
4485
4486         BUG_ON(rc->stage == UPDATE_DATA_PTRS && btrfs_is_data_reloc_root(root));
4487
4488         level = btrfs_header_level(buf);
4489         if (btrfs_header_generation(buf) <=
4490             btrfs_root_last_snapshot(&root->root_item))
4491                 first_cow = 1;
4492
4493         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID &&
4494             rc->create_reloc_tree) {
4495                 WARN_ON(!first_cow && level == 0);
4496
4497                 node = rc->backref_cache.path[level];
4498                 BUG_ON(node->bytenr != buf->start &&
4499                        node->new_bytenr != buf->start);
4500
4501                 btrfs_backref_drop_node_buffer(node);
4502                 atomic_inc(&cow->refs);
4503                 node->eb = cow;
4504                 node->new_bytenr = cow->start;
4505
4506                 if (!node->pending) {
4507                         list_move_tail(&node->list,
4508                                        &rc->backref_cache.pending[level]);
4509                         node->pending = 1;
4510                 }
4511
4512                 if (first_cow)
4513                         mark_block_processed(rc, node);
4514
4515                 if (first_cow && level > 0)
4516                         rc->nodes_relocated += buf->len;
4517         }
4518
4519         if (level == 0 && first_cow && rc->stage == UPDATE_DATA_PTRS)
4520                 ret = replace_file_extents(trans, rc, root, cow);
4521         return ret;
4522 }
4523
4524 /*
4525  * called before creating snapshot. it calculates metadata reservation
4526  * required for relocating tree blocks in the snapshot
4527  */
4528 void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
4529                               u64 *bytes_to_reserve)
4530 {
4531         struct btrfs_root *root = pending->root;
4532         struct reloc_control *rc = root->fs_info->reloc_ctl;
4533
4534         if (!rc || !have_reloc_root(root))
4535                 return;
4536
4537         if (!rc->merge_reloc_tree)
4538                 return;
4539
4540         root = root->reloc_root;
4541         BUG_ON(btrfs_root_refs(&root->root_item) == 0);
4542         /*
4543          * relocation is in the stage of merging trees. the space
4544          * used by merging a reloc tree is twice the size of
4545          * relocated tree nodes in the worst case. half for cowing
4546          * the reloc tree, half for cowing the fs tree. the space
4547          * used by cowing the reloc tree will be freed after the
4548          * tree is dropped. if we create snapshot, cowing the fs
4549          * tree may use more space than it frees. so we need
4550          * reserve extra space.
4551          */
4552         *bytes_to_reserve += rc->nodes_relocated;
4553 }
4554
4555 /*
4556  * called after snapshot is created. migrate block reservation
4557  * and create reloc root for the newly created snapshot
4558  *
4559  * This is similar to btrfs_init_reloc_root(), we come out of here with two
4560  * references held on the reloc_root, one for root->reloc_root and one for
4561  * rc->reloc_roots.
4562  */
4563 int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
4564                                struct btrfs_pending_snapshot *pending)
4565 {
4566         struct btrfs_root *root = pending->root;
4567         struct btrfs_root *reloc_root;
4568         struct btrfs_root *new_root;
4569         struct reloc_control *rc = root->fs_info->reloc_ctl;
4570         int ret;
4571
4572         if (!rc || !have_reloc_root(root))
4573                 return 0;
4574
4575         rc = root->fs_info->reloc_ctl;
4576         rc->merging_rsv_size += rc->nodes_relocated;
4577
4578         if (rc->merge_reloc_tree) {
4579                 ret = btrfs_block_rsv_migrate(&pending->block_rsv,
4580                                               rc->block_rsv,
4581                                               rc->nodes_relocated, true);
4582                 if (ret)
4583                         return ret;
4584         }
4585
4586         new_root = pending->snap;
4587         reloc_root = create_reloc_root(trans, root->reloc_root,
4588                                        new_root->root_key.objectid);
4589         if (IS_ERR(reloc_root))
4590                 return PTR_ERR(reloc_root);
4591
4592         ret = __add_reloc_root(reloc_root);
4593         ASSERT(ret != -EEXIST);
4594         if (ret) {
4595                 /* Pairs with create_reloc_root */
4596                 btrfs_put_root(reloc_root);
4597                 return ret;
4598         }
4599         new_root->reloc_root = btrfs_grab_root(reloc_root);
4600
4601         if (rc->create_reloc_tree)
4602                 ret = clone_backref_node(trans, rc, root, reloc_root);
4603         return ret;
4604 }
4605
4606 /*
4607  * Get the current bytenr for the block group which is being relocated.
4608  *
4609  * Return U64_MAX if no running relocation.
4610  */
4611 u64 btrfs_get_reloc_bg_bytenr(const struct btrfs_fs_info *fs_info)
4612 {
4613         u64 logical = U64_MAX;
4614
4615         lockdep_assert_held(&fs_info->reloc_mutex);
4616
4617         if (fs_info->reloc_ctl && fs_info->reloc_ctl->block_group)
4618                 logical = fs_info->reloc_ctl->block_group->start;
4619         return logical;
4620 }