GNU Linux-libre 5.4.257-gnu1
fs/btrfs/extent-tree.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007 Oracle.  All rights reserved.
4  */
5
6 #include <linux/sched.h>
7 #include <linux/sched/signal.h>
8 #include <linux/pagemap.h>
9 #include <linux/writeback.h>
10 #include <linux/blkdev.h>
11 #include <linux/sort.h>
12 #include <linux/rcupdate.h>
13 #include <linux/kthread.h>
14 #include <linux/slab.h>
15 #include <linux/ratelimit.h>
16 #include <linux/percpu_counter.h>
17 #include <linux/lockdep.h>
18 #include <linux/crc32c.h>
19 #include "misc.h"
20 #include "tree-log.h"
21 #include "disk-io.h"
22 #include "print-tree.h"
23 #include "volumes.h"
24 #include "raid56.h"
25 #include "locking.h"
26 #include "free-space-cache.h"
27 #include "free-space-tree.h"
28 #include "sysfs.h"
29 #include "qgroup.h"
30 #include "ref-verify.h"
31 #include "space-info.h"
32 #include "block-rsv.h"
33 #include "delalloc-space.h"
34 #include "block-group.h"
35 #include "rcu-string.h"
36
37 #undef SCRAMBLE_DELAYED_REFS
38
39
40 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
41                                struct btrfs_delayed_ref_node *node, u64 parent,
42                                u64 root_objectid, u64 owner_objectid,
43                                u64 owner_offset, int refs_to_drop,
44                                struct btrfs_delayed_extent_op *extra_op);
45 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
46                                     struct extent_buffer *leaf,
47                                     struct btrfs_extent_item *ei);
48 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
49                                       u64 parent, u64 root_objectid,
50                                       u64 flags, u64 owner, u64 offset,
51                                       struct btrfs_key *ins, int ref_mod);
52 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
53                                      struct btrfs_delayed_ref_node *node,
54                                      struct btrfs_delayed_extent_op *extent_op);
55 static int find_next_key(struct btrfs_path *path, int level,
56                          struct btrfs_key *key);
57
58 static int block_group_bits(struct btrfs_block_group_cache *cache, u64 bits)
59 {
60         return (cache->flags & bits) == bits;
61 }
62
63 int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
64                               u64 start, u64 num_bytes)
65 {
66         u64 end = start + num_bytes - 1;
67         set_extent_bits(&fs_info->freed_extents[0],
68                         start, end, EXTENT_UPTODATE);
69         set_extent_bits(&fs_info->freed_extents[1],
70                         start, end, EXTENT_UPTODATE);
71         return 0;
72 }
73
74 void btrfs_free_excluded_extents(struct btrfs_block_group_cache *cache)
75 {
76         struct btrfs_fs_info *fs_info = cache->fs_info;
77         u64 start, end;
78
79         start = cache->key.objectid;
80         end = start + cache->key.offset - 1;
81
82         clear_extent_bits(&fs_info->freed_extents[0],
83                           start, end, EXTENT_UPTODATE);
84         clear_extent_bits(&fs_info->freed_extents[1],
85                           start, end, EXTENT_UPTODATE);
86 }
87
88 static u64 generic_ref_to_space_flags(struct btrfs_ref *ref)
89 {
90         if (ref->type == BTRFS_REF_METADATA) {
91                 if (ref->tree_ref.root == BTRFS_CHUNK_TREE_OBJECTID)
92                         return BTRFS_BLOCK_GROUP_SYSTEM;
93                 else
94                         return BTRFS_BLOCK_GROUP_METADATA;
95         }
96         return BTRFS_BLOCK_GROUP_DATA;
97 }
98
99 static void add_pinned_bytes(struct btrfs_fs_info *fs_info,
100                              struct btrfs_ref *ref)
101 {
102         struct btrfs_space_info *space_info;
103         u64 flags = generic_ref_to_space_flags(ref);
104
105         space_info = btrfs_find_space_info(fs_info, flags);
106         ASSERT(space_info);
107         percpu_counter_add_batch(&space_info->total_bytes_pinned, ref->len,
108                     BTRFS_TOTAL_BYTES_PINNED_BATCH);
109 }
110
111 static void sub_pinned_bytes(struct btrfs_fs_info *fs_info,
112                              struct btrfs_ref *ref)
113 {
114         struct btrfs_space_info *space_info;
115         u64 flags = generic_ref_to_space_flags(ref);
116
117         space_info = btrfs_find_space_info(fs_info, flags);
118         ASSERT(space_info);
119         percpu_counter_add_batch(&space_info->total_bytes_pinned, -ref->len,
120                     BTRFS_TOTAL_BYTES_PINNED_BATCH);
121 }
122
123 /* simple helper to search for an existing data extent at a given offset */
124 int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len)
125 {
126         int ret;
127         struct btrfs_key key;
128         struct btrfs_path *path;
129
130         path = btrfs_alloc_path();
131         if (!path)
132                 return -ENOMEM;
133
134         key.objectid = start;
135         key.offset = len;
136         key.type = BTRFS_EXTENT_ITEM_KEY;
137         ret = btrfs_search_slot(NULL, fs_info->extent_root, &key, path, 0, 0);
138         btrfs_free_path(path);
139         return ret;
140 }
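
/*
 * A minimal caller sketch, for illustration only (the surrounding names are
 * hypothetical): the return value comes straight from btrfs_search_slot(),
 * so 0 means an EXTENT_ITEM for exactly (start, len) exists, a positive
 * value means it does not, and a negative value is an error:
 *
 *      ret = btrfs_lookup_data_extent(fs_info, bytenr, num_bytes);
 *      if (ret < 0)
 *              return ret;
 *      extent_exists = (ret == 0);
 */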
141
142 /*
143  * helper function to look up the reference count and flags of a tree block.
144  *
145  * The head node for a delayed ref is used to store the sum of all the
146  * reference count modifications queued up in the rbtree. The head
147  * node may also store the extent flags to set. This way you can check
148  * what the reference count and extent flags will be once all of the
149  * queued delayed refs have been applied.
150  */
151 int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
152                              struct btrfs_fs_info *fs_info, u64 bytenr,
153                              u64 offset, int metadata, u64 *refs, u64 *flags)
154 {
155         struct btrfs_delayed_ref_head *head;
156         struct btrfs_delayed_ref_root *delayed_refs;
157         struct btrfs_path *path;
158         struct btrfs_extent_item *ei;
159         struct extent_buffer *leaf;
160         struct btrfs_key key;
161         u32 item_size;
162         u64 num_refs;
163         u64 extent_flags;
164         int ret;
165
166         /*
167          * If we don't have skinny metadata, don't bother doing anything
168          * different
169          */
170         if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA)) {
171                 offset = fs_info->nodesize;
172                 metadata = 0;
173         }
174
175         path = btrfs_alloc_path();
176         if (!path)
177                 return -ENOMEM;
178
179         if (!trans) {
180                 path->skip_locking = 1;
181                 path->search_commit_root = 1;
182         }
183
184 search_again:
185         key.objectid = bytenr;
186         key.offset = offset;
187         if (metadata)
188                 key.type = BTRFS_METADATA_ITEM_KEY;
189         else
190                 key.type = BTRFS_EXTENT_ITEM_KEY;
191
192         ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 0);
193         if (ret < 0)
194                 goto out_free;
195
196         if (ret > 0 && metadata && key.type == BTRFS_METADATA_ITEM_KEY) {
197                 if (path->slots[0]) {
198                         path->slots[0]--;
199                         btrfs_item_key_to_cpu(path->nodes[0], &key,
200                                               path->slots[0]);
201                         if (key.objectid == bytenr &&
202                             key.type == BTRFS_EXTENT_ITEM_KEY &&
203                             key.offset == fs_info->nodesize)
204                                 ret = 0;
205                 }
206         }
207
208         if (ret == 0) {
209                 leaf = path->nodes[0];
210                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
211                 if (item_size >= sizeof(*ei)) {
212                         ei = btrfs_item_ptr(leaf, path->slots[0],
213                                             struct btrfs_extent_item);
214                         num_refs = btrfs_extent_refs(leaf, ei);
215                         extent_flags = btrfs_extent_flags(leaf, ei);
216                 } else {
217                         ret = -EINVAL;
218                         btrfs_print_v0_err(fs_info);
219                         if (trans)
220                                 btrfs_abort_transaction(trans, ret);
221                         else
222                                 btrfs_handle_fs_error(fs_info, ret, NULL);
223
224                         goto out_free;
225                 }
226
227                 BUG_ON(num_refs == 0);
228         } else {
229                 num_refs = 0;
230                 extent_flags = 0;
231                 ret = 0;
232         }
233
234         if (!trans)
235                 goto out;
236
237         delayed_refs = &trans->transaction->delayed_refs;
238         spin_lock(&delayed_refs->lock);
239         head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
240         if (head) {
241                 if (!mutex_trylock(&head->mutex)) {
242                         refcount_inc(&head->refs);
243                         spin_unlock(&delayed_refs->lock);
244
245                         btrfs_release_path(path);
246
247                         /*
248                          * Mutex was contended, block until it's released and try
249                          * again
250                          */
251                         mutex_lock(&head->mutex);
252                         mutex_unlock(&head->mutex);
253                         btrfs_put_delayed_ref_head(head);
254                         goto search_again;
255                 }
256                 spin_lock(&head->lock);
257                 if (head->extent_op && head->extent_op->update_flags)
258                         extent_flags |= head->extent_op->flags_to_set;
259                 else
260                         BUG_ON(num_refs == 0);
261
262                 num_refs += head->ref_mod;
263                 spin_unlock(&head->lock);
264                 mutex_unlock(&head->mutex);
265         }
266         spin_unlock(&delayed_refs->lock);
267 out:
268         WARN_ON(num_refs == 0);
269         if (refs)
270                 *refs = num_refs;
271         if (flags)
272                 *flags = extent_flags;
273 out_free:
274         btrfs_free_path(path);
275         return ret;
276 }
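
/*
 * A minimal sketch of how the combined on-disk plus delayed-ref view
 * returned above is typically consumed. The helper below is hypothetical
 * and only illustrative; real callers of btrfs_lookup_extent_info() live,
 * for example, in the COW path in ctree.c.
 */
static int __maybe_unused demo_tree_block_is_shared(struct btrfs_trans_handle *trans,
                                                    struct btrfs_fs_info *fs_info,
                                                    struct extent_buffer *buf)
{
        u64 refs = 0;
        u64 flags = 0;
        int ret;

        /* For metadata, the offset argument is the block's level. */
        ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
                                       btrfs_header_level(buf), 1,
                                       &refs, &flags);
        if (ret)
                return ret;

        /* Shared via refcount, or marked as carrying full back refs. */
        return refs > 1 || !!(flags & BTRFS_EXTENT_FLAG_FULL_BACKREF);
}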
277
278 /*
279  * Back reference rules.  Back refs have three main goals:
280  *
281  * 1) differentiate between all holders of references to an extent so that
282  *    when a reference is dropped we can make sure it was a valid reference
283  *    before freeing the extent.
284  *
285  * 2) Provide enough information to quickly find the holders of an extent
286  *    if we notice a given block is corrupted or bad.
287  *
288  * 3) Make it easy to migrate blocks for FS shrinking or storage pool
289  *    maintenance.  This is actually the same as #2, but with a slightly
290  *    different use case.
291  *
292  * There are two kinds of back refs. The implicit back ref is optimized
293  * for pointers in non-shared tree blocks. For a given pointer in a block,
294  * back refs of this kind provide information about the block's owner tree
295  * and the pointer's key. This information allows us to find the block by
296  * b-tree searching. The full back ref is for pointers in tree blocks not
297  * referenced by their owner trees. The location of the tree block is
298  * recorded in the back ref. The full back ref is actually generic and can
299  * be used in all cases where the implicit back ref is used. Its major
300  * shortcoming is overhead: every time a tree block gets COWed, we have
301  * to update the back ref entries for all pointers in it.
302  *
303  * For a newly allocated tree block, we use implicit back refs for the
304  * pointers in it. This means most tree related operations only involve
305  * implicit back refs. For a tree block created in an old transaction, the
306  * only way to drop a reference to it is to COW it. So we can detect the
307  * event that a tree block loses its owner tree's reference and do the
308  * back ref conversion.
309  *
310  * When a tree block is COWed through a tree, there are four cases:
311  *
312  * The reference count of the block is one and the tree is the block's
313  * owner tree. Nothing to do in this case.
314  *
315  * The reference count of the block is one and the tree is not the
316  * block's owner tree. In this case, full back refs are used for pointers
317  * in the block. Remove these full back refs, add implicit back refs for
318  * every pointer in the new block.
319  *
320  * The reference count of the block is greater than one and the tree is
321  * the block's owner tree. In this case, implicit back refs are used for
322  * pointers in the block. Add full back refs for every pointer in the
323  * block, increase lower level extents' reference counts. The original
324  * implicit back refs now apply to the new block.
325  *
326  * The reference count of the block is greater than one and the tree is
327  * not the block's owner tree. Add implicit back refs for every pointer in
328  * the new block, increase lower level extents' reference count.
329  *
330  * Back Reference Key composing:
331  *
332  * The key objectid corresponds to the first byte in the extent,
333  * The key type is used to differentiate between types of back refs.
334  * There are different meanings of the key offset for different types
335  * of back refs.
336  *
337  * File extents can be referenced by:
338  *
339  * - multiple snapshots, subvolumes, or different generations in one subvol
340  * - different files inside a single subvolume
341  * - different offsets inside a file (bookend extents in file.c)
342  *
343  * The extent ref structure for the implicit back refs has fields for:
344  *
345  * - Objectid of the subvolume root
346  * - objectid of the file holding the reference
347  * - original offset in the file
348  * - how many bookend extents
349  *
350  * The key offset for the implicit back refs is the hash of the first
351  * three fields.
352  *
353  * The extent ref structure for the full back refs has a field for:
354  *
355  * - number of pointers in the tree leaf
356  *
357  * The key offset for the full back refs is the first byte of
358  * the tree leaf.
359  *
360  * When a file extent is allocated, the implicit back ref is used and
361  * the fields are filled in:
362  *
363  *     (root_key.objectid, inode objectid, offset in file, 1)
364  *
365  * When a file extent is removed by file truncation, we find the
366  * corresponding implicit back refs and check the following fields:
367  *
368  *     (btrfs_header_owner(leaf), inode objectid, offset in file)
369  *
370  * Btree extents can be referenced by:
371  *
372  * - Different subvolumes
373  *
374  * Both the implicit back refs and the full back refs for tree blocks
375  * only consist of a key. The key offset for the implicit back refs is the
376  * objectid of the block's owner tree. The key offset for the full back
377  * refs is the first byte of the parent block.
378  *
379  * When implicit back refs are used, information about the lowest key and
380  * level of the tree block is required. This information is stored in the
381  * tree block info structure.
382  */
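
/*
 * Worked example of the key composition described above (the byte numbers
 * are hypothetical): a file extent at bytenr 13631488, written by inode 257
 * at file offset 0 in a subvolume whose root objectid is 5, gets an implicit
 * back ref item keyed as
 *
 *      (13631488, BTRFS_EXTENT_DATA_REF_KEY, hash_extent_data_ref(5, 257, 0))
 *
 * while a tree block that is only reachable through a parent block at
 * bytenr 30408704 gets a full back ref keyed as
 *
 *      (<block bytenr>, BTRFS_SHARED_BLOCK_REF_KEY, 30408704)
 */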
383
384 /*
385  * is_data == BTRFS_REF_TYPE_BLOCK, tree block type is required,
386  * is_data == BTRFS_REF_TYPE_DATA, data type is required,
387  * is_data == BTRFS_REF_TYPE_ANY, either type is OK.
388  */
389 int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
390                                      struct btrfs_extent_inline_ref *iref,
391                                      enum btrfs_inline_ref_type is_data)
392 {
393         int type = btrfs_extent_inline_ref_type(eb, iref);
394         u64 offset = btrfs_extent_inline_ref_offset(eb, iref);
395
396         if (type == BTRFS_TREE_BLOCK_REF_KEY ||
397             type == BTRFS_SHARED_BLOCK_REF_KEY ||
398             type == BTRFS_SHARED_DATA_REF_KEY ||
399             type == BTRFS_EXTENT_DATA_REF_KEY) {
400                 if (is_data == BTRFS_REF_TYPE_BLOCK) {
401                         if (type == BTRFS_TREE_BLOCK_REF_KEY)
402                                 return type;
403                         if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
404                                 ASSERT(eb->fs_info);
405                                 /*
406                                  * Every shared ref has a parent tree block,
407                                  * which must be aligned to the sector size.
408                                  */
409                                 if (offset &&
410                                     IS_ALIGNED(offset, eb->fs_info->sectorsize))
411                                         return type;
412                         }
413                 } else if (is_data == BTRFS_REF_TYPE_DATA) {
414                         if (type == BTRFS_EXTENT_DATA_REF_KEY)
415                                 return type;
416                         if (type == BTRFS_SHARED_DATA_REF_KEY) {
417                                 ASSERT(eb->fs_info);
418                                 /*
419                                  * Every shared ref has a parent tree block,
420                                  * which must be aligned to the sector size.
421                                  */
422                                 if (offset &&
423                                     IS_ALIGNED(offset, eb->fs_info->sectorsize))
424                                         return type;
425                         }
426                 } else {
427                         ASSERT(is_data == BTRFS_REF_TYPE_ANY);
428                         return type;
429                 }
430         }
431
432         btrfs_print_leaf((struct extent_buffer *)eb);
433         btrfs_err(eb->fs_info,
434                   "eb %llu iref 0x%lx invalid extent inline ref type %d",
435                   eb->start, (unsigned long)iref, type);
436         WARN_ON(1);
437
438         return BTRFS_REF_TYPE_INVALID;
439 }
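
/*
 * A minimal caller sketch (hypothetical, mirroring the checks done later in
 * this file): callers that cannot tolerate a corrupted ref type turn the
 * sentinel into an error instead of asserting:
 *
 *      type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
 *      if (type == BTRFS_REF_TYPE_INVALID)
 *              return -EUCLEAN;
 */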
440
441 u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset)
442 {
443         u32 high_crc = ~(u32)0;
444         u32 low_crc = ~(u32)0;
445         __le64 lenum;
446
447         lenum = cpu_to_le64(root_objectid);
448         high_crc = btrfs_crc32c(high_crc, &lenum, sizeof(lenum));
449         lenum = cpu_to_le64(owner);
450         low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
451         lenum = cpu_to_le64(offset);
452         low_crc = btrfs_crc32c(low_crc, &lenum, sizeof(lenum));
453
454         return ((u64)high_crc << 31) ^ (u64)low_crc;
455 }
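
/*
 * Note on hash collisions (descriptive only): two different (root, objectid,
 * offset) tuples can hash to the same value, so the hash is only a starting
 * key offset. lookup_extent_data_ref() below walks forward over items with
 * the same objectid and BTRFS_EXTENT_DATA_REF_KEY until match_extent_data_ref()
 * succeeds, and insert_extent_data_ref() bumps key.offset by one and retries
 * while btrfs_insert_empty_item() keeps returning -EEXIST for a non-matching
 * item.
 */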
456
457 static u64 hash_extent_data_ref_item(struct extent_buffer *leaf,
458                                      struct btrfs_extent_data_ref *ref)
459 {
460         return hash_extent_data_ref(btrfs_extent_data_ref_root(leaf, ref),
461                                     btrfs_extent_data_ref_objectid(leaf, ref),
462                                     btrfs_extent_data_ref_offset(leaf, ref));
463 }
464
465 static int match_extent_data_ref(struct extent_buffer *leaf,
466                                  struct btrfs_extent_data_ref *ref,
467                                  u64 root_objectid, u64 owner, u64 offset)
468 {
469         if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid ||
470             btrfs_extent_data_ref_objectid(leaf, ref) != owner ||
471             btrfs_extent_data_ref_offset(leaf, ref) != offset)
472                 return 0;
473         return 1;
474 }
475
476 static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans,
477                                            struct btrfs_path *path,
478                                            u64 bytenr, u64 parent,
479                                            u64 root_objectid,
480                                            u64 owner, u64 offset)
481 {
482         struct btrfs_root *root = trans->fs_info->extent_root;
483         struct btrfs_key key;
484         struct btrfs_extent_data_ref *ref;
485         struct extent_buffer *leaf;
486         u32 nritems;
487         int ret;
488         int recow;
489         int err = -ENOENT;
490
491         key.objectid = bytenr;
492         if (parent) {
493                 key.type = BTRFS_SHARED_DATA_REF_KEY;
494                 key.offset = parent;
495         } else {
496                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
497                 key.offset = hash_extent_data_ref(root_objectid,
498                                                   owner, offset);
499         }
500 again:
501         recow = 0;
502         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
503         if (ret < 0) {
504                 err = ret;
505                 goto fail;
506         }
507
508         if (parent) {
509                 if (!ret)
510                         return 0;
511                 goto fail;
512         }
513
514         leaf = path->nodes[0];
515         nritems = btrfs_header_nritems(leaf);
516         while (1) {
517                 if (path->slots[0] >= nritems) {
518                         ret = btrfs_next_leaf(root, path);
519                         if (ret < 0)
520                                 err = ret;
521                         if (ret)
522                                 goto fail;
523
524                         leaf = path->nodes[0];
525                         nritems = btrfs_header_nritems(leaf);
526                         recow = 1;
527                 }
528
529                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
530                 if (key.objectid != bytenr ||
531                     key.type != BTRFS_EXTENT_DATA_REF_KEY)
532                         goto fail;
533
534                 ref = btrfs_item_ptr(leaf, path->slots[0],
535                                      struct btrfs_extent_data_ref);
536
537                 if (match_extent_data_ref(leaf, ref, root_objectid,
538                                           owner, offset)) {
539                         if (recow) {
540                                 btrfs_release_path(path);
541                                 goto again;
542                         }
543                         err = 0;
544                         break;
545                 }
546                 path->slots[0]++;
547         }
548 fail:
549         return err;
550 }
551
552 static noinline int insert_extent_data_ref(struct btrfs_trans_handle *trans,
553                                            struct btrfs_path *path,
554                                            u64 bytenr, u64 parent,
555                                            u64 root_objectid, u64 owner,
556                                            u64 offset, int refs_to_add)
557 {
558         struct btrfs_root *root = trans->fs_info->extent_root;
559         struct btrfs_key key;
560         struct extent_buffer *leaf;
561         u32 size;
562         u32 num_refs;
563         int ret;
564
565         key.objectid = bytenr;
566         if (parent) {
567                 key.type = BTRFS_SHARED_DATA_REF_KEY;
568                 key.offset = parent;
569                 size = sizeof(struct btrfs_shared_data_ref);
570         } else {
571                 key.type = BTRFS_EXTENT_DATA_REF_KEY;
572                 key.offset = hash_extent_data_ref(root_objectid,
573                                                   owner, offset);
574                 size = sizeof(struct btrfs_extent_data_ref);
575         }
576
577         ret = btrfs_insert_empty_item(trans, root, path, &key, size);
578         if (ret && ret != -EEXIST)
579                 goto fail;
580
581         leaf = path->nodes[0];
582         if (parent) {
583                 struct btrfs_shared_data_ref *ref;
584                 ref = btrfs_item_ptr(leaf, path->slots[0],
585                                      struct btrfs_shared_data_ref);
586                 if (ret == 0) {
587                         btrfs_set_shared_data_ref_count(leaf, ref, refs_to_add);
588                 } else {
589                         num_refs = btrfs_shared_data_ref_count(leaf, ref);
590                         num_refs += refs_to_add;
591                         btrfs_set_shared_data_ref_count(leaf, ref, num_refs);
592                 }
593         } else {
594                 struct btrfs_extent_data_ref *ref;
595                 while (ret == -EEXIST) {
596                         ref = btrfs_item_ptr(leaf, path->slots[0],
597                                              struct btrfs_extent_data_ref);
598                         if (match_extent_data_ref(leaf, ref, root_objectid,
599                                                   owner, offset))
600                                 break;
601                         btrfs_release_path(path);
602                         key.offset++;
603                         ret = btrfs_insert_empty_item(trans, root, path, &key,
604                                                       size);
605                         if (ret && ret != -EEXIST)
606                                 goto fail;
607
608                         leaf = path->nodes[0];
609                 }
610                 ref = btrfs_item_ptr(leaf, path->slots[0],
611                                      struct btrfs_extent_data_ref);
612                 if (ret == 0) {
613                         btrfs_set_extent_data_ref_root(leaf, ref,
614                                                        root_objectid);
615                         btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
616                         btrfs_set_extent_data_ref_offset(leaf, ref, offset);
617                         btrfs_set_extent_data_ref_count(leaf, ref, refs_to_add);
618                 } else {
619                         num_refs = btrfs_extent_data_ref_count(leaf, ref);
620                         num_refs += refs_to_add;
621                         btrfs_set_extent_data_ref_count(leaf, ref, num_refs);
622                 }
623         }
624         btrfs_mark_buffer_dirty(leaf);
625         ret = 0;
626 fail:
627         btrfs_release_path(path);
628         return ret;
629 }
630
631 static noinline int remove_extent_data_ref(struct btrfs_trans_handle *trans,
632                                            struct btrfs_path *path,
633                                            int refs_to_drop, int *last_ref)
634 {
635         struct btrfs_key key;
636         struct btrfs_extent_data_ref *ref1 = NULL;
637         struct btrfs_shared_data_ref *ref2 = NULL;
638         struct extent_buffer *leaf;
639         u32 num_refs = 0;
640         int ret = 0;
641
642         leaf = path->nodes[0];
643         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
644
645         if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
646                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
647                                       struct btrfs_extent_data_ref);
648                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
649         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
650                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
651                                       struct btrfs_shared_data_ref);
652                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
653         } else if (unlikely(key.type == BTRFS_EXTENT_REF_V0_KEY)) {
654                 btrfs_print_v0_err(trans->fs_info);
655                 btrfs_abort_transaction(trans, -EINVAL);
656                 return -EINVAL;
657         } else {
658                 BUG();
659         }
660
661         BUG_ON(num_refs < refs_to_drop);
662         num_refs -= refs_to_drop;
663
664         if (num_refs == 0) {
665                 ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
666                 *last_ref = 1;
667         } else {
668                 if (key.type == BTRFS_EXTENT_DATA_REF_KEY)
669                         btrfs_set_extent_data_ref_count(leaf, ref1, num_refs);
670                 else if (key.type == BTRFS_SHARED_DATA_REF_KEY)
671                         btrfs_set_shared_data_ref_count(leaf, ref2, num_refs);
672                 btrfs_mark_buffer_dirty(leaf);
673         }
674         return ret;
675 }
676
677 static noinline u32 extent_data_ref_count(struct btrfs_path *path,
678                                           struct btrfs_extent_inline_ref *iref)
679 {
680         struct btrfs_key key;
681         struct extent_buffer *leaf;
682         struct btrfs_extent_data_ref *ref1;
683         struct btrfs_shared_data_ref *ref2;
684         u32 num_refs = 0;
685         int type;
686
687         leaf = path->nodes[0];
688         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
689
690         BUG_ON(key.type == BTRFS_EXTENT_REF_V0_KEY);
691         if (iref) {
692                 /*
693                  * If type is invalid, we should have bailed out earlier than
694                  * this call.
695                  */
696                 type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
697                 ASSERT(type != BTRFS_REF_TYPE_INVALID);
698                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
699                         ref1 = (struct btrfs_extent_data_ref *)(&iref->offset);
700                         num_refs = btrfs_extent_data_ref_count(leaf, ref1);
701                 } else {
702                         ref2 = (struct btrfs_shared_data_ref *)(iref + 1);
703                         num_refs = btrfs_shared_data_ref_count(leaf, ref2);
704                 }
705         } else if (key.type == BTRFS_EXTENT_DATA_REF_KEY) {
706                 ref1 = btrfs_item_ptr(leaf, path->slots[0],
707                                       struct btrfs_extent_data_ref);
708                 num_refs = btrfs_extent_data_ref_count(leaf, ref1);
709         } else if (key.type == BTRFS_SHARED_DATA_REF_KEY) {
710                 ref2 = btrfs_item_ptr(leaf, path->slots[0],
711                                       struct btrfs_shared_data_ref);
712                 num_refs = btrfs_shared_data_ref_count(leaf, ref2);
713         } else {
714                 WARN_ON(1);
715         }
716         return num_refs;
717 }
718
719 static noinline int lookup_tree_block_ref(struct btrfs_trans_handle *trans,
720                                           struct btrfs_path *path,
721                                           u64 bytenr, u64 parent,
722                                           u64 root_objectid)
723 {
724         struct btrfs_root *root = trans->fs_info->extent_root;
725         struct btrfs_key key;
726         int ret;
727
728         key.objectid = bytenr;
729         if (parent) {
730                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
731                 key.offset = parent;
732         } else {
733                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
734                 key.offset = root_objectid;
735         }
736
737         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
738         if (ret > 0)
739                 ret = -ENOENT;
740         return ret;
741 }
742
743 static noinline int insert_tree_block_ref(struct btrfs_trans_handle *trans,
744                                           struct btrfs_path *path,
745                                           u64 bytenr, u64 parent,
746                                           u64 root_objectid)
747 {
748         struct btrfs_key key;
749         int ret;
750
751         key.objectid = bytenr;
752         if (parent) {
753                 key.type = BTRFS_SHARED_BLOCK_REF_KEY;
754                 key.offset = parent;
755         } else {
756                 key.type = BTRFS_TREE_BLOCK_REF_KEY;
757                 key.offset = root_objectid;
758         }
759
760         ret = btrfs_insert_empty_item(trans, trans->fs_info->extent_root,
761                                       path, &key, 0);
762         btrfs_release_path(path);
763         return ret;
764 }
765
766 static inline int extent_ref_type(u64 parent, u64 owner)
767 {
768         int type;
769         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
770                 if (parent > 0)
771                         type = BTRFS_SHARED_BLOCK_REF_KEY;
772                 else
773                         type = BTRFS_TREE_BLOCK_REF_KEY;
774         } else {
775                 if (parent > 0)
776                         type = BTRFS_SHARED_DATA_REF_KEY;
777                 else
778                         type = BTRFS_EXTENT_DATA_REF_KEY;
779         }
780         return type;
781 }
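
/*
 * Quick reference for the mapping above (illustrative restatement only):
 *
 *      tree block (owner < BTRFS_FIRST_FREE_OBJECTID), parent set
 *                                              -> BTRFS_SHARED_BLOCK_REF_KEY
 *      tree block, no parent                   -> BTRFS_TREE_BLOCK_REF_KEY
 *      data extent, parent set                 -> BTRFS_SHARED_DATA_REF_KEY
 *      data extent, no parent                  -> BTRFS_EXTENT_DATA_REF_KEY
 */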
782
783 static int find_next_key(struct btrfs_path *path, int level,
784                          struct btrfs_key *key)
785
786 {
787         for (; level < BTRFS_MAX_LEVEL; level++) {
788                 if (!path->nodes[level])
789                         break;
790                 if (path->slots[level] + 1 >=
791                     btrfs_header_nritems(path->nodes[level]))
792                         continue;
793                 if (level == 0)
794                         btrfs_item_key_to_cpu(path->nodes[level], key,
795                                               path->slots[level] + 1);
796                 else
797                         btrfs_node_key_to_cpu(path->nodes[level], key,
798                                               path->slots[level] + 1);
799                 return 0;
800         }
801         return 1;
802 }
803
804 /*
805  * Look for an inline back ref. If the back ref is found, *ref_ret is set
806  * to the address of the inline back ref, and 0 is returned.
807  *
808  * If the back ref isn't found, *ref_ret is set to the address where it
809  * should be inserted, and -ENOENT is returned.
810  *
811  * If insert is true and there are too many inline back refs, the path
812  * points to the extent item, and -EAGAIN is returned.
813  *
814  * NOTE: inline back refs are ordered in the same way that back ref
815  *       items in the tree are ordered.
816  */
817 static noinline_for_stack
818 int lookup_inline_extent_backref(struct btrfs_trans_handle *trans,
819                                  struct btrfs_path *path,
820                                  struct btrfs_extent_inline_ref **ref_ret,
821                                  u64 bytenr, u64 num_bytes,
822                                  u64 parent, u64 root_objectid,
823                                  u64 owner, u64 offset, int insert)
824 {
825         struct btrfs_fs_info *fs_info = trans->fs_info;
826         struct btrfs_root *root = fs_info->extent_root;
827         struct btrfs_key key;
828         struct extent_buffer *leaf;
829         struct btrfs_extent_item *ei;
830         struct btrfs_extent_inline_ref *iref;
831         u64 flags;
832         u64 item_size;
833         unsigned long ptr;
834         unsigned long end;
835         int extra_size;
836         int type;
837         int want;
838         int ret;
839         int err = 0;
840         bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
841         int needed;
842
843         key.objectid = bytenr;
844         key.type = BTRFS_EXTENT_ITEM_KEY;
845         key.offset = num_bytes;
846
847         want = extent_ref_type(parent, owner);
848         if (insert) {
849                 extra_size = btrfs_extent_inline_ref_size(want);
850                 path->keep_locks = 1;
851         } else
852                 extra_size = -1;
853
854         /*
855          * Owner is our level, so we can just add one to get the level for the
856          * block we are interested in.
857          */
858         if (skinny_metadata && owner < BTRFS_FIRST_FREE_OBJECTID) {
859                 key.type = BTRFS_METADATA_ITEM_KEY;
860                 key.offset = owner;
861         }
862
863 again:
864         ret = btrfs_search_slot(trans, root, &key, path, extra_size, 1);
865         if (ret < 0) {
866                 err = ret;
867                 goto out;
868         }
869
870         /*
871          * We may be a newly converted file system which still has the old fat
872          * extent entries for metadata, so try and see if we have one of those.
873          */
874         if (ret > 0 && skinny_metadata) {
875                 skinny_metadata = false;
876                 if (path->slots[0]) {
877                         path->slots[0]--;
878                         btrfs_item_key_to_cpu(path->nodes[0], &key,
879                                               path->slots[0]);
880                         if (key.objectid == bytenr &&
881                             key.type == BTRFS_EXTENT_ITEM_KEY &&
882                             key.offset == num_bytes)
883                                 ret = 0;
884                 }
885                 if (ret) {
886                         key.objectid = bytenr;
887                         key.type = BTRFS_EXTENT_ITEM_KEY;
888                         key.offset = num_bytes;
889                         btrfs_release_path(path);
890                         goto again;
891                 }
892         }
893
894         if (ret && !insert) {
895                 err = -ENOENT;
896                 goto out;
897         } else if (WARN_ON(ret)) {
898                 btrfs_print_leaf(path->nodes[0]);
899                 btrfs_err(fs_info,
900 "extent item not found for insert, bytenr %llu num_bytes %llu parent %llu root_objectid %llu owner %llu offset %llu",
901                           bytenr, num_bytes, parent, root_objectid, owner,
902                           offset);
903                 err = -EIO;
904                 goto out;
905         }
906
907         leaf = path->nodes[0];
908         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
909         if (unlikely(item_size < sizeof(*ei))) {
910                 err = -EINVAL;
911                 btrfs_print_v0_err(fs_info);
912                 btrfs_abort_transaction(trans, err);
913                 goto out;
914         }
915
916         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
917         flags = btrfs_extent_flags(leaf, ei);
918
919         ptr = (unsigned long)(ei + 1);
920         end = (unsigned long)ei + item_size;
921
922         if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK && !skinny_metadata) {
923                 ptr += sizeof(struct btrfs_tree_block_info);
924                 BUG_ON(ptr > end);
925         }
926
927         if (owner >= BTRFS_FIRST_FREE_OBJECTID)
928                 needed = BTRFS_REF_TYPE_DATA;
929         else
930                 needed = BTRFS_REF_TYPE_BLOCK;
931
932         err = -ENOENT;
933         while (1) {
934                 if (ptr >= end) {
935                         WARN_ON(ptr > end);
936                         break;
937                 }
938                 iref = (struct btrfs_extent_inline_ref *)ptr;
939                 type = btrfs_get_extent_inline_ref_type(leaf, iref, needed);
940                 if (type == BTRFS_REF_TYPE_INVALID) {
941                         err = -EUCLEAN;
942                         goto out;
943                 }
944
945                 if (want < type)
946                         break;
947                 if (want > type) {
948                         ptr += btrfs_extent_inline_ref_size(type);
949                         continue;
950                 }
951
952                 if (type == BTRFS_EXTENT_DATA_REF_KEY) {
953                         struct btrfs_extent_data_ref *dref;
954                         dref = (struct btrfs_extent_data_ref *)(&iref->offset);
955                         if (match_extent_data_ref(leaf, dref, root_objectid,
956                                                   owner, offset)) {
957                                 err = 0;
958                                 break;
959                         }
960                         if (hash_extent_data_ref_item(leaf, dref) <
961                             hash_extent_data_ref(root_objectid, owner, offset))
962                                 break;
963                 } else {
964                         u64 ref_offset;
965                         ref_offset = btrfs_extent_inline_ref_offset(leaf, iref);
966                         if (parent > 0) {
967                                 if (parent == ref_offset) {
968                                         err = 0;
969                                         break;
970                                 }
971                                 if (ref_offset < parent)
972                                         break;
973                         } else {
974                                 if (root_objectid == ref_offset) {
975                                         err = 0;
976                                         break;
977                                 }
978                                 if (ref_offset < root_objectid)
979                                         break;
980                         }
981                 }
982                 ptr += btrfs_extent_inline_ref_size(type);
983         }
984         if (err == -ENOENT && insert) {
985                 if (item_size + extra_size >=
986                     BTRFS_MAX_EXTENT_ITEM_SIZE(root)) {
987                         err = -EAGAIN;
988                         goto out;
989                 }
990                 /*
991                  * To add a new inline back ref, we have to make sure
992                  * there is no corresponding back ref item.
993                  * For simplicity, we just do not add a new inline back
994                  * ref if there is any kind of item for this block.
995                  */
996                 if (find_next_key(path, 0, &key) == 0 &&
997                     key.objectid == bytenr &&
998                     key.type < BTRFS_BLOCK_GROUP_ITEM_KEY) {
999                         err = -EAGAIN;
1000                         goto out;
1001                 }
1002         }
1003         *ref_ret = (struct btrfs_extent_inline_ref *)ptr;
1004 out:
1005         if (insert) {
1006                 path->keep_locks = 0;
1007                 btrfs_unlock_up_safe(path, 1);
1008         }
1009         return err;
1010 }
1011
1012 /*
1013  * helper to add new inline back ref
1014  */
1015 static noinline_for_stack
1016 void setup_inline_extent_backref(struct btrfs_fs_info *fs_info,
1017                                  struct btrfs_path *path,
1018                                  struct btrfs_extent_inline_ref *iref,
1019                                  u64 parent, u64 root_objectid,
1020                                  u64 owner, u64 offset, int refs_to_add,
1021                                  struct btrfs_delayed_extent_op *extent_op)
1022 {
1023         struct extent_buffer *leaf;
1024         struct btrfs_extent_item *ei;
1025         unsigned long ptr;
1026         unsigned long end;
1027         unsigned long item_offset;
1028         u64 refs;
1029         int size;
1030         int type;
1031
1032         leaf = path->nodes[0];
1033         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1034         item_offset = (unsigned long)iref - (unsigned long)ei;
1035
1036         type = extent_ref_type(parent, owner);
1037         size = btrfs_extent_inline_ref_size(type);
1038
1039         btrfs_extend_item(path, size);
1040
1041         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1042         refs = btrfs_extent_refs(leaf, ei);
1043         refs += refs_to_add;
1044         btrfs_set_extent_refs(leaf, ei, refs);
1045         if (extent_op)
1046                 __run_delayed_extent_op(extent_op, leaf, ei);
1047
1048         ptr = (unsigned long)ei + item_offset;
1049         end = (unsigned long)ei + btrfs_item_size_nr(leaf, path->slots[0]);
1050         if (ptr < end - size)
1051                 memmove_extent_buffer(leaf, ptr + size, ptr,
1052                                       end - size - ptr);
1053
1054         iref = (struct btrfs_extent_inline_ref *)ptr;
1055         btrfs_set_extent_inline_ref_type(leaf, iref, type);
1056         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1057                 struct btrfs_extent_data_ref *dref;
1058                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1059                 btrfs_set_extent_data_ref_root(leaf, dref, root_objectid);
1060                 btrfs_set_extent_data_ref_objectid(leaf, dref, owner);
1061                 btrfs_set_extent_data_ref_offset(leaf, dref, offset);
1062                 btrfs_set_extent_data_ref_count(leaf, dref, refs_to_add);
1063         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1064                 struct btrfs_shared_data_ref *sref;
1065                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1066                 btrfs_set_shared_data_ref_count(leaf, sref, refs_to_add);
1067                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1068         } else if (type == BTRFS_SHARED_BLOCK_REF_KEY) {
1069                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
1070         } else {
1071                 btrfs_set_extent_inline_ref_offset(leaf, iref, root_objectid);
1072         }
1073         btrfs_mark_buffer_dirty(leaf);
1074 }
1075
1076 static int lookup_extent_backref(struct btrfs_trans_handle *trans,
1077                                  struct btrfs_path *path,
1078                                  struct btrfs_extent_inline_ref **ref_ret,
1079                                  u64 bytenr, u64 num_bytes, u64 parent,
1080                                  u64 root_objectid, u64 owner, u64 offset)
1081 {
1082         int ret;
1083
1084         ret = lookup_inline_extent_backref(trans, path, ref_ret, bytenr,
1085                                            num_bytes, parent, root_objectid,
1086                                            owner, offset, 0);
1087         if (ret != -ENOENT)
1088                 return ret;
1089
1090         btrfs_release_path(path);
1091         *ref_ret = NULL;
1092
1093         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1094                 ret = lookup_tree_block_ref(trans, path, bytenr, parent,
1095                                             root_objectid);
1096         } else {
1097                 ret = lookup_extent_data_ref(trans, path, bytenr, parent,
1098                                              root_objectid, owner, offset);
1099         }
1100         return ret;
1101 }
1102
1103 /*
1104  * helper to update/remove inline back ref
1105  */
1106 static noinline_for_stack
1107 void update_inline_extent_backref(struct btrfs_path *path,
1108                                   struct btrfs_extent_inline_ref *iref,
1109                                   int refs_to_mod,
1110                                   struct btrfs_delayed_extent_op *extent_op,
1111                                   int *last_ref)
1112 {
1113         struct extent_buffer *leaf = path->nodes[0];
1114         struct btrfs_extent_item *ei;
1115         struct btrfs_extent_data_ref *dref = NULL;
1116         struct btrfs_shared_data_ref *sref = NULL;
1117         unsigned long ptr;
1118         unsigned long end;
1119         u32 item_size;
1120         int size;
1121         int type;
1122         u64 refs;
1123
1124         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1125         refs = btrfs_extent_refs(leaf, ei);
1126         WARN_ON(refs_to_mod < 0 && refs + refs_to_mod <= 0);
1127         refs += refs_to_mod;
1128         btrfs_set_extent_refs(leaf, ei, refs);
1129         if (extent_op)
1130                 __run_delayed_extent_op(extent_op, leaf, ei);
1131
1132         /*
1133          * If type is invalid, we should have bailed out after
1134          * lookup_inline_extent_backref().
1135          */
1136         type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_ANY);
1137         ASSERT(type != BTRFS_REF_TYPE_INVALID);
1138
1139         if (type == BTRFS_EXTENT_DATA_REF_KEY) {
1140                 dref = (struct btrfs_extent_data_ref *)(&iref->offset);
1141                 refs = btrfs_extent_data_ref_count(leaf, dref);
1142         } else if (type == BTRFS_SHARED_DATA_REF_KEY) {
1143                 sref = (struct btrfs_shared_data_ref *)(iref + 1);
1144                 refs = btrfs_shared_data_ref_count(leaf, sref);
1145         } else {
1146                 refs = 1;
1147                 BUG_ON(refs_to_mod != -1);
1148         }
1149
1150         BUG_ON(refs_to_mod < 0 && refs < -refs_to_mod);
1151         refs += refs_to_mod;
1152
1153         if (refs > 0) {
1154                 if (type == BTRFS_EXTENT_DATA_REF_KEY)
1155                         btrfs_set_extent_data_ref_count(leaf, dref, refs);
1156                 else
1157                         btrfs_set_shared_data_ref_count(leaf, sref, refs);
1158         } else {
1159                 *last_ref = 1;
1160                 size = btrfs_extent_inline_ref_size(type);
1161                 item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1162                 ptr = (unsigned long)iref;
1163                 end = (unsigned long)ei + item_size;
1164                 if (ptr + size < end)
1165                         memmove_extent_buffer(leaf, ptr, ptr + size,
1166                                               end - ptr - size);
1167                 item_size -= size;
1168                 btrfs_truncate_item(path, item_size, 1);
1169         }
1170         btrfs_mark_buffer_dirty(leaf);
1171 }
1172
1173 static noinline_for_stack
1174 int insert_inline_extent_backref(struct btrfs_trans_handle *trans,
1175                                  struct btrfs_path *path,
1176                                  u64 bytenr, u64 num_bytes, u64 parent,
1177                                  u64 root_objectid, u64 owner,
1178                                  u64 offset, int refs_to_add,
1179                                  struct btrfs_delayed_extent_op *extent_op)
1180 {
1181         struct btrfs_extent_inline_ref *iref;
1182         int ret;
1183
1184         ret = lookup_inline_extent_backref(trans, path, &iref, bytenr,
1185                                            num_bytes, parent, root_objectid,
1186                                            owner, offset, 1);
1187         if (ret == 0) {
1188                 BUG_ON(owner < BTRFS_FIRST_FREE_OBJECTID);
1189                 update_inline_extent_backref(path, iref, refs_to_add,
1190                                              extent_op, NULL);
1191         } else if (ret == -ENOENT) {
1192                 setup_inline_extent_backref(trans->fs_info, path, iref, parent,
1193                                             root_objectid, owner, offset,
1194                                             refs_to_add, extent_op);
1195                 ret = 0;
1196         }
1197         return ret;
1198 }
1199
1200 static int insert_extent_backref(struct btrfs_trans_handle *trans,
1201                                  struct btrfs_path *path,
1202                                  u64 bytenr, u64 parent, u64 root_objectid,
1203                                  u64 owner, u64 offset, int refs_to_add)
1204 {
1205         int ret;
1206         if (owner < BTRFS_FIRST_FREE_OBJECTID) {
1207                 BUG_ON(refs_to_add != 1);
1208                 ret = insert_tree_block_ref(trans, path, bytenr, parent,
1209                                             root_objectid);
1210         } else {
1211                 ret = insert_extent_data_ref(trans, path, bytenr, parent,
1212                                              root_objectid, owner, offset,
1213                                              refs_to_add);
1214         }
1215         return ret;
1216 }
1217
1218 static int remove_extent_backref(struct btrfs_trans_handle *trans,
1219                                  struct btrfs_path *path,
1220                                  struct btrfs_extent_inline_ref *iref,
1221                                  int refs_to_drop, int is_data, int *last_ref)
1222 {
1223         int ret = 0;
1224
1225         BUG_ON(!is_data && refs_to_drop != 1);
1226         if (iref) {
1227                 update_inline_extent_backref(path, iref, -refs_to_drop, NULL,
1228                                              last_ref);
1229         } else if (is_data) {
1230                 ret = remove_extent_data_ref(trans, path, refs_to_drop,
1231                                              last_ref);
1232         } else {
1233                 *last_ref = 1;
1234                 ret = btrfs_del_item(trans, trans->fs_info->extent_root, path);
1235         }
1236         return ret;
1237 }
1238
1239 static int btrfs_issue_discard(struct block_device *bdev, u64 start, u64 len,
1240                                u64 *discarded_bytes)
1241 {
1242         int j, ret = 0;
1243         u64 bytes_left, end;
1244         u64 aligned_start = ALIGN(start, 1 << 9);
1245
1246         if (WARN_ON(start != aligned_start)) {
1247                 len -= aligned_start - start;
1248                 len = round_down(len, 1 << 9);
1249                 start = aligned_start;
1250         }
1251
1252         *discarded_bytes = 0;
1253
1254         if (!len)
1255                 return 0;
1256
1257         end = start + len;
1258         bytes_left = len;
1259
1260         /* Skip any superblocks on this device. */
1261         for (j = 0; j < BTRFS_SUPER_MIRROR_MAX; j++) {
1262                 u64 sb_start = btrfs_sb_offset(j);
1263                 u64 sb_end = sb_start + BTRFS_SUPER_INFO_SIZE;
1264                 u64 size = sb_start - start;
1265
1266                 if (!in_range(sb_start, start, bytes_left) &&
1267                     !in_range(sb_end, start, bytes_left) &&
1268                     !in_range(start, sb_start, BTRFS_SUPER_INFO_SIZE))
1269                         continue;
1270
1271                 /*
1272                  * Superblock spans beginning of range.  Adjust start and
1273                  * try again.
1274                  */
1275                 if (sb_start <= start) {
1276                         start += sb_end - start;
1277                         if (start > end) {
1278                                 bytes_left = 0;
1279                                 break;
1280                         }
1281                         bytes_left = end - start;
1282                         continue;
1283                 }
1284
1285                 if (size) {
1286                         ret = blkdev_issue_discard(bdev, start >> 9, size >> 9,
1287                                                    GFP_NOFS, 0);
1288                         if (!ret)
1289                                 *discarded_bytes += size;
1290                         else if (ret != -EOPNOTSUPP)
1291                                 return ret;
1292                 }
1293
1294                 start = sb_end;
1295                 if (start > end) {
1296                         bytes_left = 0;
1297                         break;
1298                 }
1299                 bytes_left = end - start;
1300         }
1301
1302         if (bytes_left) {
1303                 ret = blkdev_issue_discard(bdev, start >> 9, bytes_left >> 9,
1304                                            GFP_NOFS, 0);
1305                 if (!ret)
1306                         *discarded_bytes += bytes_left;
1307         }
1308         return ret;
1309 }
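
/*
 * Worked example (illustrative, based on the fixed superblock layout): a
 * discard of [0, 128KiB) on a device overlaps the primary superblock copy
 * at 64KiB (btrfs_sb_offset(0)), which occupies BTRFS_SUPER_INFO_SIZE (4KiB),
 * so the helper issues two discards, [0, 64KiB) and [68KiB, 128KiB), and
 * reports *discarded_bytes == 124KiB. The mirror copies at 64MiB and 256GiB
 * are skipped the same way when a range reaches them.
 */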
1310
1311 int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr,
1312                          u64 num_bytes, u64 *actual_bytes)
1313 {
1314         int ret = 0;
1315         u64 discarded_bytes = 0;
1316         u64 end = bytenr + num_bytes;
1317         u64 cur = bytenr;
1318         struct btrfs_bio *bbio = NULL;
1319
1320
1321         /*
1322          * Avoid races with device replace and make sure our bbio has devices
1323          * associated to its stripes that don't go away while we are discarding.
1324          */
1325         btrfs_bio_counter_inc_blocked(fs_info);
1326         while (cur < end) {
1327                 struct btrfs_bio_stripe *stripe;
1328                 int i;
1329
1330                 num_bytes = end - cur;
1331                 /* Tell the block device(s) that the sectors can be discarded */
1332                 ret = btrfs_map_block(fs_info, BTRFS_MAP_DISCARD, cur,
1333                                       &num_bytes, &bbio, 0);
1334                 /*
1335                  * Error can be -ENOMEM, -ENOENT (no such chunk mapping) or
1336                  * -EOPNOTSUPP. For any such error, @num_bytes is not updated,
1337                  * thus we can't continue anyway.
1338                  */
1339                 if (ret < 0)
1340                         goto out;
1341
1342                 stripe = bbio->stripes;
1343                 for (i = 0; i < bbio->num_stripes; i++, stripe++) {
1344                         u64 bytes;
1345                         struct request_queue *req_q;
1346                         struct btrfs_device *device = stripe->dev;
1347
1348                         if (!device->bdev) {
1349                                 ASSERT(btrfs_test_opt(fs_info, DEGRADED));
1350                                 continue;
1351                         }
1352                         req_q = bdev_get_queue(device->bdev);
1353                         if (!blk_queue_discard(req_q))
1354                                 continue;
1355
1356                         if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
1357                                 continue;
1358
1359                         ret = btrfs_issue_discard(device->bdev,
1360                                                   stripe->physical,
1361                                                   stripe->length,
1362                                                   &bytes);
1363                         if (!ret) {
1364                                 discarded_bytes += bytes;
1365                         } else if (ret != -EOPNOTSUPP) {
1366                                 /*
1367                                  * Logic errors, -ENOMEM or -EIO, all of
1368                                  * which are unlikely to happen.
1369                                  *
1370                                  * And since there are two loops, explicitly
1371                                  * go to out to avoid confusion.
1372                                  */
1373                                 btrfs_put_bbio(bbio);
1374                                 goto out;
1375                         }
1376
1377                         /*
1378                          * In case we get back -EOPNOTSUPP for some reason,
1379                          * clear the return value so we don't break callers
1380                          * of btrfs_discard_extent().
1381                          */
1382                         ret = 0;
1383                 }
1384                 btrfs_put_bbio(bbio);
1385                 cur += num_bytes;
1386         }
1387 out:
1388         btrfs_bio_counter_dec(fs_info);
1389
1390         if (actual_bytes)
1391                 *actual_bytes = discarded_bytes;
1392
1393
1394         if (ret == -EOPNOTSUPP)
1395                 ret = 0;
1396         return ret;
1397 }
1398
1399 /* Can return -ENOMEM */
1400 int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1401                          struct btrfs_ref *generic_ref)
1402 {
1403         struct btrfs_fs_info *fs_info = trans->fs_info;
1404         int old_ref_mod, new_ref_mod;
1405         int ret;
1406
1407         ASSERT(generic_ref->type != BTRFS_REF_NOT_SET &&
1408                generic_ref->action);
1409         BUG_ON(generic_ref->type == BTRFS_REF_METADATA &&
1410                generic_ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID);
1411
1412         if (generic_ref->type == BTRFS_REF_METADATA)
1413                 ret = btrfs_add_delayed_tree_ref(trans, generic_ref,
1414                                 NULL, &old_ref_mod, &new_ref_mod);
1415         else
1416                 ret = btrfs_add_delayed_data_ref(trans, generic_ref, 0,
1417                                                  &old_ref_mod, &new_ref_mod);
1418
1419         btrfs_ref_tree_mod(fs_info, generic_ref);
1420
1421         if (ret == 0 && old_ref_mod < 0 && new_ref_mod >= 0)
1422                 sub_pinned_bytes(fs_info, generic_ref);
1423
1424         return ret;
1425 }
1426
1427 /*
1428  * __btrfs_inc_extent_ref - insert backreference for a given extent
1429  *
1430  * @trans:          Handle of transaction
1431  *
1432  * @node:           The delayed ref node used to get the bytenr/length for
1433  *                  extent whose references are incremented.
1434  *
1435  * @parent:         If this is a shared extent (BTRFS_SHARED_DATA_REF_KEY/
1436  *                  BTRFS_SHARED_BLOCK_REF_KEY) then it holds the logical
1437  *                  bytenr of the parent block. Since new extents are always
1438  *                  created with indirect references, this will only be the case
1439  *                  when relocating a shared extent. In that case, root_objectid
1440  *                  will be BTRFS_TREE_RELOC_OBJECTID. Otherwise, parent must
1441  *                  be 0.
1442  *
1443  * @root_objectid:  The id of the root where this modification has originated,
1444  *                  this can be either one of the well-known metadata trees or
1445  *                  the subvolume id which references this extent.
1446  *
1447  * @owner:          For data extents it is the inode number of the owning file.
1448  *                  For metadata extents this parameter holds the level in the
1449  *                  tree of the extent.
1450  *
1451  * @offset:         For metadata extents the offset is ignored and is currently
1452  *                  always passed as 0. For data extents it is the file offset
1453  *                  this extent belongs to.
1454  *
1455  * @refs_to_add:    Number of references to add
1456  *
1457  * @extent_op:      Pointer to a structure holding information necessary when
1458  *                  updating a tree block's flags
1459  *
1460  */
1461 static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans,
1462                                   struct btrfs_delayed_ref_node *node,
1463                                   u64 parent, u64 root_objectid,
1464                                   u64 owner, u64 offset, int refs_to_add,
1465                                   struct btrfs_delayed_extent_op *extent_op)
1466 {
1467         struct btrfs_path *path;
1468         struct extent_buffer *leaf;
1469         struct btrfs_extent_item *item;
1470         struct btrfs_key key;
1471         u64 bytenr = node->bytenr;
1472         u64 num_bytes = node->num_bytes;
1473         u64 refs;
1474         int ret;
1475
1476         path = btrfs_alloc_path();
1477         if (!path)
1478                 return -ENOMEM;
1479
1480         path->reada = READA_FORWARD;
1481         path->leave_spinning = 1;
1482         /* This will set up the path even if it fails to insert the back ref. */
1483         ret = insert_inline_extent_backref(trans, path, bytenr, num_bytes,
1484                                            parent, root_objectid, owner,
1485                                            offset, refs_to_add, extent_op);
1486         if ((ret < 0 && ret != -EAGAIN) || !ret)
1487                 goto out;
1488
1489         /*
1490          * Ok we had -EAGAIN which means we didn't have space to insert an
1491          * inline extent ref, so just update the reference count and add a
1492          * normal backref.
1493          */
1494         leaf = path->nodes[0];
1495         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
1496         item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1497         refs = btrfs_extent_refs(leaf, item);
1498         btrfs_set_extent_refs(leaf, item, refs + refs_to_add);
1499         if (extent_op)
1500                 __run_delayed_extent_op(extent_op, leaf, item);
1501
1502         btrfs_mark_buffer_dirty(leaf);
1503         btrfs_release_path(path);
1504
1505         path->reada = READA_FORWARD;
1506         path->leave_spinning = 1;
1507         /* now insert the actual backref */
1508         ret = insert_extent_backref(trans, path, bytenr, parent, root_objectid,
1509                                     owner, offset, refs_to_add);
1510         if (ret)
1511                 btrfs_abort_transaction(trans, ret);
1512 out:
1513         btrfs_free_path(path);
1514         return ret;
1515 }
1516
1517 static int run_delayed_data_ref(struct btrfs_trans_handle *trans,
1518                                 struct btrfs_delayed_ref_node *node,
1519                                 struct btrfs_delayed_extent_op *extent_op,
1520                                 int insert_reserved)
1521 {
1522         int ret = 0;
1523         struct btrfs_delayed_data_ref *ref;
1524         struct btrfs_key ins;
1525         u64 parent = 0;
1526         u64 ref_root = 0;
1527         u64 flags = 0;
1528
1529         ins.objectid = node->bytenr;
1530         ins.offset = node->num_bytes;
1531         ins.type = BTRFS_EXTENT_ITEM_KEY;
1532
1533         ref = btrfs_delayed_node_to_data_ref(node);
1534         trace_run_delayed_data_ref(trans->fs_info, node, ref, node->action);
1535
1536         if (node->type == BTRFS_SHARED_DATA_REF_KEY)
1537                 parent = ref->parent;
1538         ref_root = ref->root;
1539
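        /*
         * insert_reserved means this is a freshly allocated extent whose
         * extent item still has to be inserted; otherwise we only update the
         * reference count of an already existing extent item.
         */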
1540         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1541                 if (extent_op)
1542                         flags |= extent_op->flags_to_set;
1543                 ret = alloc_reserved_file_extent(trans, parent, ref_root,
1544                                                  flags, ref->objectid,
1545                                                  ref->offset, &ins,
1546                                                  node->ref_mod);
1547         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1548                 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
1549                                              ref->objectid, ref->offset,
1550                                              node->ref_mod, extent_op);
1551         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1552                 ret = __btrfs_free_extent(trans, node, parent,
1553                                           ref_root, ref->objectid,
1554                                           ref->offset, node->ref_mod,
1555                                           extent_op);
1556         } else {
1557                 BUG();
1558         }
1559         return ret;
1560 }
1561
1562 static void __run_delayed_extent_op(struct btrfs_delayed_extent_op *extent_op,
1563                                     struct extent_buffer *leaf,
1564                                     struct btrfs_extent_item *ei)
1565 {
1566         u64 flags = btrfs_extent_flags(leaf, ei);
1567         if (extent_op->update_flags) {
1568                 flags |= extent_op->flags_to_set;
1569                 btrfs_set_extent_flags(leaf, ei, flags);
1570         }
1571
1572         if (extent_op->update_key) {
1573                 struct btrfs_tree_block_info *bi;
1574                 BUG_ON(!(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK));
1575                 bi = (struct btrfs_tree_block_info *)(ei + 1);
1576                 btrfs_set_tree_block_key(leaf, bi, &extent_op->key);
1577         }
1578 }
1579
1580 static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
1581                                  struct btrfs_delayed_ref_head *head,
1582                                  struct btrfs_delayed_extent_op *extent_op)
1583 {
1584         struct btrfs_fs_info *fs_info = trans->fs_info;
1585         struct btrfs_key key;
1586         struct btrfs_path *path;
1587         struct btrfs_extent_item *ei;
1588         struct extent_buffer *leaf;
1589         u32 item_size;
1590         int ret;
1591         int err = 0;
1592         int metadata = !extent_op->is_data;
1593
1594         if (TRANS_ABORTED(trans))
1595                 return 0;
1596
1597         if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
1598                 metadata = 0;
1599
1600         path = btrfs_alloc_path();
1601         if (!path)
1602                 return -ENOMEM;
1603
1604         key.objectid = head->bytenr;
1605
1606         if (metadata) {
1607                 key.type = BTRFS_METADATA_ITEM_KEY;
1608                 key.offset = extent_op->level;
1609         } else {
1610                 key.type = BTRFS_EXTENT_ITEM_KEY;
1611                 key.offset = head->num_bytes;
1612         }
1613
1614 again:
1615         path->reada = READA_FORWARD;
1616         path->leave_spinning = 1;
1617         ret = btrfs_search_slot(trans, fs_info->extent_root, &key, path, 0, 1);
1618         if (ret < 0) {
1619                 err = ret;
1620                 goto out;
1621         }
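        /*
         * With skinny metadata the extent may still be stored as a regular
         * EXTENT_ITEM (e.g. created before the incompat flag was set), so if
         * the METADATA_ITEM lookup fails, retry with an EXTENT_ITEM key.
         */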
1622         if (ret > 0) {
1623                 if (metadata) {
1624                         if (path->slots[0] > 0) {
1625                                 path->slots[0]--;
1626                                 btrfs_item_key_to_cpu(path->nodes[0], &key,
1627                                                       path->slots[0]);
1628                                 if (key.objectid == head->bytenr &&
1629                                     key.type == BTRFS_EXTENT_ITEM_KEY &&
1630                                     key.offset == head->num_bytes)
1631                                         ret = 0;
1632                         }
1633                         if (ret > 0) {
1634                                 btrfs_release_path(path);
1635                                 metadata = 0;
1636
1637                                 key.objectid = head->bytenr;
1638                                 key.offset = head->num_bytes;
1639                                 key.type = BTRFS_EXTENT_ITEM_KEY;
1640                                 goto again;
1641                         }
1642                 } else {
1643                         err = -EIO;
1644                         goto out;
1645                 }
1646         }
1647
1648         leaf = path->nodes[0];
1649         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
1650
1651         if (unlikely(item_size < sizeof(*ei))) {
1652                 err = -EINVAL;
1653                 btrfs_print_v0_err(fs_info);
1654                 btrfs_abort_transaction(trans, err);
1655                 goto out;
1656         }
1657
1658         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
1659         __run_delayed_extent_op(extent_op, leaf, ei);
1660
1661         btrfs_mark_buffer_dirty(leaf);
1662 out:
1663         btrfs_free_path(path);
1664         return err;
1665 }
1666
1667 static int run_delayed_tree_ref(struct btrfs_trans_handle *trans,
1668                                 struct btrfs_delayed_ref_node *node,
1669                                 struct btrfs_delayed_extent_op *extent_op,
1670                                 int insert_reserved)
1671 {
1672         int ret = 0;
1673         struct btrfs_delayed_tree_ref *ref;
1674         u64 parent = 0;
1675         u64 ref_root = 0;
1676
1677         ref = btrfs_delayed_node_to_tree_ref(node);
1678         trace_run_delayed_tree_ref(trans->fs_info, node, ref, node->action);
1679
1680         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1681                 parent = ref->parent;
1682         ref_root = ref->root;
1683
1684         if (node->ref_mod != 1) {
1685                 btrfs_err(trans->fs_info,
1686         "btree block(%llu) has %d references rather than 1: action %d ref_root %llu parent %llu",
1687                           node->bytenr, node->ref_mod, node->action, ref_root,
1688                           parent);
1689                 return -EIO;
1690         }
1691         if (node->action == BTRFS_ADD_DELAYED_REF && insert_reserved) {
1692                 BUG_ON(!extent_op || !extent_op->update_flags);
1693                 ret = alloc_reserved_tree_block(trans, node, extent_op);
1694         } else if (node->action == BTRFS_ADD_DELAYED_REF) {
1695                 ret = __btrfs_inc_extent_ref(trans, node, parent, ref_root,
1696                                              ref->level, 0, 1, extent_op);
1697         } else if (node->action == BTRFS_DROP_DELAYED_REF) {
1698                 ret = __btrfs_free_extent(trans, node, parent, ref_root,
1699                                           ref->level, 0, 1, extent_op);
1700         } else {
1701                 BUG();
1702         }
1703         return ret;
1704 }
1705
1706 /* helper function to actually process a single delayed ref entry */
1707 static int run_one_delayed_ref(struct btrfs_trans_handle *trans,
1708                                struct btrfs_delayed_ref_node *node,
1709                                struct btrfs_delayed_extent_op *extent_op,
1710                                int insert_reserved)
1711 {
1712         int ret = 0;
1713
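        /*
         * On an aborted transaction nothing gets written, but extents that
         * were reserved for insertion still have to be pinned so transaction
         * cleanup can return their space.
         */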
1714         if (TRANS_ABORTED(trans)) {
1715                 if (insert_reserved)
1716                         btrfs_pin_extent(trans->fs_info, node->bytenr,
1717                                          node->num_bytes, 1);
1718                 return 0;
1719         }
1720
1721         if (node->type == BTRFS_TREE_BLOCK_REF_KEY ||
1722             node->type == BTRFS_SHARED_BLOCK_REF_KEY)
1723                 ret = run_delayed_tree_ref(trans, node, extent_op,
1724                                            insert_reserved);
1725         else if (node->type == BTRFS_EXTENT_DATA_REF_KEY ||
1726                  node->type == BTRFS_SHARED_DATA_REF_KEY)
1727                 ret = run_delayed_data_ref(trans, node, extent_op,
1728                                            insert_reserved);
1729         else
1730                 BUG();
1731         if (ret && insert_reserved)
1732                 btrfs_pin_extent(trans->fs_info, node->bytenr,
1733                                  node->num_bytes, 1);
1734         return ret;
1735 }
1736
1737 static inline struct btrfs_delayed_ref_node *
1738 select_delayed_ref(struct btrfs_delayed_ref_head *head)
1739 {
1740         struct btrfs_delayed_ref_node *ref;
1741
1742         if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
1743                 return NULL;
1744
1745         /*
1746          * Select a delayed ref of type BTRFS_ADD_DELAYED_REF first.
1747          * This is to prevent a ref count from going down to zero, which deletes
1748          * the extent item from the extent tree, when there still are references
1749          * to add, which would fail because they would not find the extent item.
1750          */
1751         if (!list_empty(&head->ref_add_list))
1752                 return list_first_entry(&head->ref_add_list,
1753                                 struct btrfs_delayed_ref_node, add_list);
1754
1755         ref = rb_entry(rb_first_cached(&head->ref_tree),
1756                        struct btrfs_delayed_ref_node, ref_node);
1757         ASSERT(list_empty(&ref->add_list));
1758         return ref;
1759 }
1760
1761 static void unselect_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
1762                                       struct btrfs_delayed_ref_head *head)
1763 {
1764         spin_lock(&delayed_refs->lock);
1765         head->processing = 0;
1766         delayed_refs->num_heads_ready++;
1767         spin_unlock(&delayed_refs->lock);
1768         btrfs_delayed_ref_unlock(head);
1769 }
1770
1771 static struct btrfs_delayed_extent_op *cleanup_extent_op(
1772                                 struct btrfs_delayed_ref_head *head)
1773 {
1774         struct btrfs_delayed_extent_op *extent_op = head->extent_op;
1775
1776         if (!extent_op)
1777                 return NULL;
1778
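        /*
         * must_insert_reserved means the reserved extent insertion path will
         * create the extent item and apply the flags/key from this extent op
         * itself, so running the op separately would be redundant.
         */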
1779         if (head->must_insert_reserved) {
1780                 head->extent_op = NULL;
1781                 btrfs_free_delayed_extent_op(extent_op);
1782                 return NULL;
1783         }
1784         return extent_op;
1785 }
1786
1787 static int run_and_cleanup_extent_op(struct btrfs_trans_handle *trans,
1788                                      struct btrfs_delayed_ref_head *head)
1789 {
1790         struct btrfs_delayed_extent_op *extent_op;
1791         int ret;
1792
1793         extent_op = cleanup_extent_op(head);
1794         if (!extent_op)
1795                 return 0;
1796         head->extent_op = NULL;
1797         spin_unlock(&head->lock);
1798         ret = run_delayed_extent_op(trans, head, extent_op);
1799         btrfs_free_delayed_extent_op(extent_op);
1800         return ret ? ret : 1;
1801 }
1802
1803 void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
1804                                   struct btrfs_delayed_ref_root *delayed_refs,
1805                                   struct btrfs_delayed_ref_head *head)
1806 {
1807         int nr_items = 1;       /* Dropping this ref head update. */
1808
1809         if (head->total_ref_mod < 0) {
1810                 struct btrfs_space_info *space_info;
1811                 u64 flags;
1812
1813                 if (head->is_data)
1814                         flags = BTRFS_BLOCK_GROUP_DATA;
1815                 else if (head->is_system)
1816                         flags = BTRFS_BLOCK_GROUP_SYSTEM;
1817                 else
1818                         flags = BTRFS_BLOCK_GROUP_METADATA;
1819                 space_info = btrfs_find_space_info(fs_info, flags);
1820                 ASSERT(space_info);
1821                 percpu_counter_add_batch(&space_info->total_bytes_pinned,
1822                                    -head->num_bytes,
1823                                    BTRFS_TOTAL_BYTES_PINNED_BATCH);
1824
1825                 /*
1826                  * We had csum deletions accounted for in our delayed refs rsv,
1827                  * we need to drop the csum leaves for this update from our
1828                  * delayed_refs_rsv.
1829                  */
1830                 if (head->is_data) {
1831                         spin_lock(&delayed_refs->lock);
1832                         delayed_refs->pending_csums -= head->num_bytes;
1833                         spin_unlock(&delayed_refs->lock);
1834                         nr_items += btrfs_csum_bytes_to_leaves(fs_info,
1835                                 head->num_bytes);
1836                 }
1837         }
1838
1839         btrfs_delayed_refs_rsv_release(fs_info, nr_items);
1840 }
1841
1842 static int cleanup_ref_head(struct btrfs_trans_handle *trans,
1843                             struct btrfs_delayed_ref_head *head)
1844 {
1845
1846         struct btrfs_fs_info *fs_info = trans->fs_info;
1847         struct btrfs_delayed_ref_root *delayed_refs;
1848         int ret;
1849
1850         delayed_refs = &trans->transaction->delayed_refs;
1851
1852         ret = run_and_cleanup_extent_op(trans, head);
1853         if (ret < 0) {
1854                 unselect_delayed_ref_head(delayed_refs, head);
1855                 btrfs_debug(fs_info, "run_delayed_extent_op returned %d", ret);
1856                 return ret;
1857         } else if (ret) {
1858                 return ret;
1859         }
1860
1861         /*
1862          * Need to drop our head ref lock and re-acquire the delayed ref lock
1863          * and then re-check to make sure nobody got added.
1864          */
1865         spin_unlock(&head->lock);
1866         spin_lock(&delayed_refs->lock);
1867         spin_lock(&head->lock);
1868         if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root) || head->extent_op) {
1869                 spin_unlock(&head->lock);
1870                 spin_unlock(&delayed_refs->lock);
1871                 return 1;
1872         }
1873         btrfs_delete_ref_head(delayed_refs, head);
1874         spin_unlock(&head->lock);
1875         spin_unlock(&delayed_refs->lock);
1876
1877         if (head->must_insert_reserved) {
1878                 btrfs_pin_extent(fs_info, head->bytenr,
1879                                  head->num_bytes, 1);
1880                 if (head->is_data) {
1881                         ret = btrfs_del_csums(trans, fs_info->csum_root,
1882                                               head->bytenr, head->num_bytes);
1883                 }
1884         }
1885
1886         btrfs_cleanup_ref_head_accounting(fs_info, delayed_refs, head);
1887
1888         trace_run_delayed_ref_head(fs_info, head, 0);
1889         btrfs_delayed_ref_unlock(head);
1890         btrfs_put_delayed_ref_head(head);
1891         return ret;
1892 }
1893
1894 static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head(
1895                                         struct btrfs_trans_handle *trans)
1896 {
1897         struct btrfs_delayed_ref_root *delayed_refs =
1898                 &trans->transaction->delayed_refs;
1899         struct btrfs_delayed_ref_head *head = NULL;
1900         int ret;
1901
1902         spin_lock(&delayed_refs->lock);
1903         head = btrfs_select_ref_head(delayed_refs);
1904         if (!head) {
1905                 spin_unlock(&delayed_refs->lock);
1906                 return head;
1907         }
1908
1909         /*
1910          * Grab the lock that says we are going to process all the refs for
1911          * this head
1912          */
1913         ret = btrfs_delayed_ref_lock(delayed_refs, head);
1914         spin_unlock(&delayed_refs->lock);
1915
1916         /*
1917          * We may have dropped the spin lock to get the head mutex lock, and
1918          * that might have given someone else time to free the head.  If that's
1919          * true, it has been removed from our list and we can move on.
1920          */
1921         if (ret == -EAGAIN)
1922                 head = ERR_PTR(-EAGAIN);
1923
1924         return head;
1925 }
1926
1927 static int btrfs_run_delayed_refs_for_head(struct btrfs_trans_handle *trans,
1928                                     struct btrfs_delayed_ref_head *locked_ref,
1929                                     unsigned long *run_refs)
1930 {
1931         struct btrfs_fs_info *fs_info = trans->fs_info;
1932         struct btrfs_delayed_ref_root *delayed_refs;
1933         struct btrfs_delayed_extent_op *extent_op;
1934         struct btrfs_delayed_ref_node *ref;
1935         int must_insert_reserved = 0;
1936         int ret;
1937
1938         delayed_refs = &trans->transaction->delayed_refs;
1939
1940         lockdep_assert_held(&locked_ref->mutex);
1941         lockdep_assert_held(&locked_ref->lock);
1942
1943         while ((ref = select_delayed_ref(locked_ref))) {
1944                 if (ref->seq &&
1945                     btrfs_check_delayed_seq(fs_info, ref->seq)) {
1946                         spin_unlock(&locked_ref->lock);
1947                         unselect_delayed_ref_head(delayed_refs, locked_ref);
1948                         return -EAGAIN;
1949                 }
1950
1951                 (*run_refs)++;
1952                 ref->in_tree = 0;
1953                 rb_erase_cached(&ref->ref_node, &locked_ref->ref_tree);
1954                 RB_CLEAR_NODE(&ref->ref_node);
1955                 if (!list_empty(&ref->add_list))
1956                         list_del(&ref->add_list);
1957                 /*
1958                  * When we play the delayed ref, also correct the ref_mod on
1959                  * the head.
1960                  */
1961                 switch (ref->action) {
1962                 case BTRFS_ADD_DELAYED_REF:
1963                 case BTRFS_ADD_DELAYED_EXTENT:
1964                         locked_ref->ref_mod -= ref->ref_mod;
1965                         break;
1966                 case BTRFS_DROP_DELAYED_REF:
1967                         locked_ref->ref_mod += ref->ref_mod;
1968                         break;
1969                 default:
1970                         WARN_ON(1);
1971                 }
1972                 atomic_dec(&delayed_refs->num_entries);
1973
1974                 /*
1975                  * Record the must_insert_reserved flag before we drop the
1976                  * spin lock.
1977                  */
1978                 must_insert_reserved = locked_ref->must_insert_reserved;
1979                 locked_ref->must_insert_reserved = 0;
1980
1981                 extent_op = locked_ref->extent_op;
1982                 locked_ref->extent_op = NULL;
1983                 spin_unlock(&locked_ref->lock);
1984
1985                 ret = run_one_delayed_ref(trans, ref, extent_op,
1986                                           must_insert_reserved);
1987
1988                 btrfs_free_delayed_extent_op(extent_op);
1989                 if (ret) {
1990                         unselect_delayed_ref_head(delayed_refs, locked_ref);
1991                         btrfs_put_delayed_ref(ref);
1992                         btrfs_debug(fs_info, "run_one_delayed_ref returned %d",
1993                                     ret);
1994                         return ret;
1995                 }
1996
1997                 btrfs_put_delayed_ref(ref);
1998                 cond_resched();
1999
2000                 spin_lock(&locked_ref->lock);
2001                 btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
2002         }
2003
2004         return 0;
2005 }
2006
2007 /*
2008  * Returns 0 on success or if called with an already aborted transaction.
2009  * Returns -ENOMEM or -EIO on failure and will abort the transaction.
2010  */
2011 static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2012                                              unsigned long nr)
2013 {
2014         struct btrfs_fs_info *fs_info = trans->fs_info;
2015         struct btrfs_delayed_ref_root *delayed_refs;
2016         struct btrfs_delayed_ref_head *locked_ref = NULL;
2017         ktime_t start = ktime_get();
2018         int ret;
2019         unsigned long count = 0;
2020         unsigned long actual_count = 0;
2021
2022         delayed_refs = &trans->transaction->delayed_refs;
2023         do {
2024                 if (!locked_ref) {
2025                         locked_ref = btrfs_obtain_ref_head(trans);
2026                         if (IS_ERR_OR_NULL(locked_ref)) {
2027                                 if (PTR_ERR(locked_ref) == -EAGAIN) {
2028                                         continue;
2029                                 } else {
2030                                         break;
2031                                 }
2032                         }
2033                         count++;
2034                 }
2035                 /*
2036                  * We need to try and merge add/drops of the same ref since we
2037                  * can run into issues with relocate dropping the implicit ref
2038                  * and then it being added back again before the drop can
2039                  * finish.  If we merged anything we need to re-loop so we can
2040                  * get a good ref.
2041                  * Or we can get node references of the same type that weren't
2042                  * merged when created due to bumps in the tree mod seq, and
2043                  * we need to merge them to prevent adding an inline extent
2044                  * backref before dropping it (triggering a BUG_ON at
2045                  * insert_inline_extent_backref()).
2046                  */
2047                 spin_lock(&locked_ref->lock);
2048                 btrfs_merge_delayed_refs(trans, delayed_refs, locked_ref);
2049
2050                 ret = btrfs_run_delayed_refs_for_head(trans, locked_ref,
2051                                                       &actual_count);
2052                 if (ret < 0 && ret != -EAGAIN) {
2053                         /*
2054                          * Error, btrfs_run_delayed_refs_for_head already
2055                          * unlocked everything so just bail out
2056                          */
2057                         return ret;
2058                 } else if (!ret) {
2059                         /*
2060                          * Success, perform the usual cleanup of a processed
2061                          * head
2062                          */
2063                         ret = cleanup_ref_head(trans, locked_ref);
2064                         if (ret > 0) {
2065                                 /* We dropped our lock, we need to loop. */
2066                                 ret = 0;
2067                                 continue;
2068                         } else if (ret) {
2069                                 return ret;
2070                         }
2071                 }
2072
2073                 /*
2074                  * Either success case or btrfs_run_delayed_refs_for_head
2075                  * returned -EAGAIN, meaning we need to select another head
2076                  */
2077
2078                 locked_ref = NULL;
2079                 cond_resched();
2080         } while ((nr != -1 && count < nr) || locked_ref);
2081
2082         /*
2083          * We don't want to include ref heads since we can have empty ref heads
2084          * and those would drastically skew our runtime down because we just do
2085          * accounting, no actual extent tree updates.
2086          */
2087         if (actual_count > 0) {
2088                 u64 runtime = ktime_to_ns(ktime_sub(ktime_get(), start));
2089                 u64 avg;
2090
2091                 /*
2092                  * We weigh the current average higher than our current runtime
2093                  * to avoid large swings in the average.
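                 *
                 * For example, an old average of 4 ms and an 8 ms run for this
                 * batch give (3 * 4 + 8) / 4 = 5 ms as the new average.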
2094                  */
2095                 spin_lock(&delayed_refs->lock);
2096                 avg = fs_info->avg_delayed_ref_runtime * 3 + runtime;
2097                 fs_info->avg_delayed_ref_runtime = avg >> 2;    /* div by 4 */
2098                 spin_unlock(&delayed_refs->lock);
2099         }
2100         return 0;
2101 }
2102
2103 #ifdef SCRAMBLE_DELAYED_REFS
2104 /*
2105  * Normally delayed refs get processed in ascending bytenr order. This
2106  * correlates in most cases to the order added. To expose dependencies on this
2107  * order, we start to process the tree in the middle instead of the beginning.
2108  */
2109 static u64 find_middle(struct rb_root *root)
2110 {
2111         struct rb_node *n = root->rb_node;
2112         struct btrfs_delayed_ref_node *entry;
2113         int alt = 1;
2114         u64 middle;
2115         u64 first = 0, last = 0;
2116
2117         n = rb_first(root);
2118         if (n) {
2119                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2120                 first = entry->bytenr;
2121         }
2122         n = rb_last(root);
2123         if (n) {
2124                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2125                 last = entry->bytenr;
2126         }
2127         n = root->rb_node;
2128
2129         while (n) {
2130                 entry = rb_entry(n, struct btrfs_delayed_ref_node, rb_node);
2131                 WARN_ON(!entry->in_tree);
2132
2133                 middle = entry->bytenr;
2134
2135                 if (alt)
2136                         n = n->rb_left;
2137                 else
2138                         n = n->rb_right;
2139
2140                 alt = 1 - alt;
2141         }
2142         return middle;
2143 }
2144 #endif
2145
2146 static inline u64 heads_to_leaves(struct btrfs_fs_info *fs_info, u64 heads)
2147 {
2148         u64 num_bytes;
2149
2150         num_bytes = heads * (sizeof(struct btrfs_extent_item) +
2151                              sizeof(struct btrfs_extent_inline_ref));
2152         if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
2153                 num_bytes += heads * sizeof(struct btrfs_tree_block_info);
2154
2155         /*
2156          * We don't ever fill up leaves all the way, so this only gives a rough
2157          * estimate of the number of leaves these updates will need.
2158          */
2159         return div_u64(num_bytes, BTRFS_LEAF_DATA_SIZE(fs_info));
2160 }
2161
2162 /*
2163  * Takes the number of bytes to be checksummed and figures out how many
2164  * leaves it would require to store the csums for that many bytes.
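 *
 * As a rough example, assuming a 16 KiB nodesize, 4 KiB sectorsize and 4-byte
 * crc32c checksums: BTRFS_MAX_ITEM_SIZE() is a bit under 16 KiB, so one leaf
 * holds on the order of 4000 csums, covering roughly 16 MiB of data.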
2165  */
2166 u64 btrfs_csum_bytes_to_leaves(struct btrfs_fs_info *fs_info, u64 csum_bytes)
2167 {
2168         u64 csum_size;
2169         u64 num_csums_per_leaf;
2170         u64 num_csums;
2171
2172         csum_size = BTRFS_MAX_ITEM_SIZE(fs_info);
2173         num_csums_per_leaf = div64_u64(csum_size,
2174                         (u64)btrfs_super_csum_size(fs_info->super_copy));
2175         num_csums = div64_u64(csum_bytes, fs_info->sectorsize);
2176         num_csums += num_csums_per_leaf - 1;
2177         num_csums = div64_u64(num_csums, num_csums_per_leaf);
2178         return num_csums;
2179 }
2180
2181 /*
2182  * This starts processing the delayed reference count updates and
2183  * extent insertions we have queued up so far.  count can be
2184  * 0, which means to process everything in the tree at the start
2185  * of the run (but not newly added entries), or it can be some target
2186  * number you'd like to process.
2187  *
2188  * Returns 0 on success or if called with an aborted transaction.
2189  * Returns <0 on error and aborts the transaction.
2190  */
2191 int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans,
2192                            unsigned long count)
2193 {
2194         struct btrfs_fs_info *fs_info = trans->fs_info;
2195         struct rb_node *node;
2196         struct btrfs_delayed_ref_root *delayed_refs;
2197         struct btrfs_delayed_ref_head *head;
2198         int ret;
2199         int run_all = count == (unsigned long)-1;
2200
2201         /* We'll clean this up in btrfs_cleanup_transaction */
2202         if (TRANS_ABORTED(trans))
2203                 return 0;
2204
2205         if (test_bit(BTRFS_FS_CREATING_FREE_SPACE_TREE, &fs_info->flags))
2206                 return 0;
2207
2208         delayed_refs = &trans->transaction->delayed_refs;
2209         if (count == 0)
2210                 count = atomic_read(&delayed_refs->num_entries) * 2;
2211
2212 again:
2213 #ifdef SCRAMBLE_DELAYED_REFS
2214         delayed_refs->run_delayed_start = find_middle(&delayed_refs->root);
2215 #endif
2216         ret = __btrfs_run_delayed_refs(trans, count);
2217         if (ret < 0) {
2218                 btrfs_abort_transaction(trans, ret);
2219                 return ret;
2220         }
2221
2222         if (run_all) {
2223                 btrfs_create_pending_block_groups(trans);
2224
2225                 spin_lock(&delayed_refs->lock);
2226                 node = rb_first_cached(&delayed_refs->href_root);
2227                 if (!node) {
2228                         spin_unlock(&delayed_refs->lock);
2229                         goto out;
2230                 }
2231                 head = rb_entry(node, struct btrfs_delayed_ref_head,
2232                                 href_node);
2233                 refcount_inc(&head->refs);
2234                 spin_unlock(&delayed_refs->lock);
2235
2236                 /* Mutex was contended, block until it's released and retry. */
2237                 mutex_lock(&head->mutex);
2238                 mutex_unlock(&head->mutex);
2239
2240                 btrfs_put_delayed_ref_head(head);
2241                 cond_resched();
2242                 goto again;
2243         }
2244 out:
2245         return 0;
2246 }
2247
2248 int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
2249                                 u64 bytenr, u64 num_bytes, u64 flags,
2250                                 int level, int is_data)
2251 {
2252         struct btrfs_delayed_extent_op *extent_op;
2253         int ret;
2254
2255         extent_op = btrfs_alloc_delayed_extent_op();
2256         if (!extent_op)
2257                 return -ENOMEM;
2258
2259         extent_op->flags_to_set = flags;
2260         extent_op->update_flags = true;
2261         extent_op->update_key = false;
2262         extent_op->is_data = is_data ? true : false;
2263         extent_op->level = level;
2264
2265         ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, extent_op);
2266         if (ret)
2267                 btrfs_free_delayed_extent_op(extent_op);
2268         return ret;
2269 }
2270
2271 static noinline int check_delayed_ref(struct btrfs_root *root,
2272                                       struct btrfs_path *path,
2273                                       u64 objectid, u64 offset, u64 bytenr)
2274 {
2275         struct btrfs_delayed_ref_head *head;
2276         struct btrfs_delayed_ref_node *ref;
2277         struct btrfs_delayed_data_ref *data_ref;
2278         struct btrfs_delayed_ref_root *delayed_refs;
2279         struct btrfs_transaction *cur_trans;
2280         struct rb_node *node;
2281         int ret = 0;
2282
2283         spin_lock(&root->fs_info->trans_lock);
2284         cur_trans = root->fs_info->running_transaction;
2285         if (cur_trans)
2286                 refcount_inc(&cur_trans->use_count);
2287         spin_unlock(&root->fs_info->trans_lock);
2288         if (!cur_trans)
2289                 return 0;
2290
2291         delayed_refs = &cur_trans->delayed_refs;
2292         spin_lock(&delayed_refs->lock);
2293         head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
2294         if (!head) {
2295                 spin_unlock(&delayed_refs->lock);
2296                 btrfs_put_transaction(cur_trans);
2297                 return 0;
2298         }
2299
2300         if (!mutex_trylock(&head->mutex)) {
2301                 refcount_inc(&head->refs);
2302                 spin_unlock(&delayed_refs->lock);
2303
2304                 btrfs_release_path(path);
2305
2306                 /*
2307                  * Mutex was contended, block until it's released and let
2308                  * caller try again
2309                  */
2310                 mutex_lock(&head->mutex);
2311                 mutex_unlock(&head->mutex);
2312                 btrfs_put_delayed_ref_head(head);
2313                 btrfs_put_transaction(cur_trans);
2314                 return -EAGAIN;
2315         }
2316         spin_unlock(&delayed_refs->lock);
2317
2318         spin_lock(&head->lock);
2319         /*
2320          * XXX: We should replace this with a proper search function in the
2321          * future.
2322          */
2323         for (node = rb_first_cached(&head->ref_tree); node;
2324              node = rb_next(node)) {
2325                 ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
2326                 /* If it's a shared ref we know a cross reference exists */
2327                 if (ref->type != BTRFS_EXTENT_DATA_REF_KEY) {
2328                         ret = 1;
2329                         break;
2330                 }
2331
2332                 data_ref = btrfs_delayed_node_to_data_ref(ref);
2333
2334                 /*
2335                  * If our ref doesn't match the one we're currently looking at
2336                  * then we have a cross reference.
2337                  */
2338                 if (data_ref->root != root->root_key.objectid ||
2339                     data_ref->objectid != objectid ||
2340                     data_ref->offset != offset) {
2341                         ret = 1;
2342                         break;
2343                 }
2344         }
2345         spin_unlock(&head->lock);
2346         mutex_unlock(&head->mutex);
2347         btrfs_put_transaction(cur_trans);
2348         return ret;
2349 }
2350
2351 static noinline int check_committed_ref(struct btrfs_root *root,
2352                                         struct btrfs_path *path,
2353                                         u64 objectid, u64 offset, u64 bytenr,
2354                                         bool strict)
2355 {
2356         struct btrfs_fs_info *fs_info = root->fs_info;
2357         struct btrfs_root *extent_root = fs_info->extent_root;
2358         struct extent_buffer *leaf;
2359         struct btrfs_extent_data_ref *ref;
2360         struct btrfs_extent_inline_ref *iref;
2361         struct btrfs_extent_item *ei;
2362         struct btrfs_key key;
2363         u32 item_size;
2364         int type;
2365         int ret;
2366
2367         key.objectid = bytenr;
2368         key.offset = (u64)-1;
2369         key.type = BTRFS_EXTENT_ITEM_KEY;
2370
2371         ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0);
2372         if (ret < 0)
2373                 goto out;
2374         BUG_ON(ret == 0); /* Corruption */
2375
2376         ret = -ENOENT;
2377         if (path->slots[0] == 0)
2378                 goto out;
2379
2380         path->slots[0]--;
2381         leaf = path->nodes[0];
2382         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
2383
2384         if (key.objectid != bytenr || key.type != BTRFS_EXTENT_ITEM_KEY)
2385                 goto out;
2386
2387         ret = 1;
2388         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
2389         ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item);
2390
2391         /* If extent item has more than 1 inline ref then it's shared */
2392         if (item_size != sizeof(*ei) +
2393             btrfs_extent_inline_ref_size(BTRFS_EXTENT_DATA_REF_KEY))
2394                 goto out;
2395
2396         /*
2397          * If the extent was created before the last snapshot then it's shared
2398          * unless the snapshot was deleted. Use this heuristic only when strict is false.
2399          */
2400         if (!strict &&
2401             (btrfs_extent_generation(leaf, ei) <=
2402              btrfs_root_last_snapshot(&root->root_item)))
2403                 goto out;
2404
2405         iref = (struct btrfs_extent_inline_ref *)(ei + 1);
2406
2407         /* If this extent has SHARED_DATA_REF then it's shared */
2408         type = btrfs_get_extent_inline_ref_type(leaf, iref, BTRFS_REF_TYPE_DATA);
2409         if (type != BTRFS_EXTENT_DATA_REF_KEY)
2410                 goto out;
2411
2412         ref = (struct btrfs_extent_data_ref *)(&iref->offset);
2413         if (btrfs_extent_refs(leaf, ei) !=
2414             btrfs_extent_data_ref_count(leaf, ref) ||
2415             btrfs_extent_data_ref_root(leaf, ref) !=
2416             root->root_key.objectid ||
2417             btrfs_extent_data_ref_objectid(leaf, ref) != objectid ||
2418             btrfs_extent_data_ref_offset(leaf, ref) != offset)
2419                 goto out;
2420
2421         ret = 0;
2422 out:
2423         return ret;
2424 }
2425
2426 int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
2427                           u64 bytenr, bool strict)
2428 {
2429         struct btrfs_path *path;
2430         int ret;
2431
2432         path = btrfs_alloc_path();
2433         if (!path)
2434                 return -ENOMEM;
2435
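        /*
         * Check the committed extent tree first; 0 or -ENOENT from that check
         * is not conclusive, so the delayed refs are scanned as well. -EAGAIN
         * from check_delayed_ref() means the head mutex was contended and the
         * whole check has to be retried.
         */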
2436         do {
2437                 ret = check_committed_ref(root, path, objectid,
2438                                           offset, bytenr, strict);
2439                 if (ret && ret != -ENOENT)
2440                         goto out;
2441
2442                 ret = check_delayed_ref(root, path, objectid, offset, bytenr);
2443         } while (ret == -EAGAIN);
2444
2445 out:
2446         btrfs_free_path(path);
2447         if (root->root_key.objectid == BTRFS_DATA_RELOC_TREE_OBJECTID)
2448                 WARN_ON(ret > 0);
2449         return ret;
2450 }
2451
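/*
 * Walk all items in @buf and add (inc == 1) or drop (inc == 0) one reference
 * for each data extent and child tree block it points to. With @full_backref
 * set, the references are shared refs keyed on the bytenr of @buf itself.
 */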
2452 static int __btrfs_mod_ref(struct btrfs_trans_handle *trans,
2453                            struct btrfs_root *root,
2454                            struct extent_buffer *buf,
2455                            int full_backref, int inc)
2456 {
2457         struct btrfs_fs_info *fs_info = root->fs_info;
2458         u64 bytenr;
2459         u64 num_bytes;
2460         u64 parent;
2461         u64 ref_root;
2462         u32 nritems;
2463         struct btrfs_key key;
2464         struct btrfs_file_extent_item *fi;
2465         struct btrfs_ref generic_ref = { 0 };
2466         bool for_reloc = btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC);
2467         int i;
2468         int action;
2469         int level;
2470         int ret = 0;
2471
2472         if (btrfs_is_testing(fs_info))
2473                 return 0;
2474
2475         ref_root = btrfs_header_owner(buf);
2476         nritems = btrfs_header_nritems(buf);
2477         level = btrfs_header_level(buf);
2478
2479         if (!test_bit(BTRFS_ROOT_REF_COWS, &root->state) && level == 0)
2480                 return 0;
2481
2482         if (full_backref)
2483                 parent = buf->start;
2484         else
2485                 parent = 0;
2486         if (inc)
2487                 action = BTRFS_ADD_DELAYED_REF;
2488         else
2489                 action = BTRFS_DROP_DELAYED_REF;
2490
2491         for (i = 0; i < nritems; i++) {
2492                 if (level == 0) {
2493                         btrfs_item_key_to_cpu(buf, &key, i);
2494                         if (key.type != BTRFS_EXTENT_DATA_KEY)
2495                                 continue;
2496                         fi = btrfs_item_ptr(buf, i,
2497                                             struct btrfs_file_extent_item);
2498                         if (btrfs_file_extent_type(buf, fi) ==
2499                             BTRFS_FILE_EXTENT_INLINE)
2500                                 continue;
2501                         bytenr = btrfs_file_extent_disk_bytenr(buf, fi);
2502                         if (bytenr == 0)
2503                                 continue;
2504
2505                         num_bytes = btrfs_file_extent_disk_num_bytes(buf, fi);
2506                         key.offset -= btrfs_file_extent_offset(buf, fi);
2507                         btrfs_init_generic_ref(&generic_ref, action, bytenr,
2508                                                num_bytes, parent);
2509                         generic_ref.real_root = root->root_key.objectid;
2510                         btrfs_init_data_ref(&generic_ref, ref_root, key.objectid,
2511                                             key.offset);
2512                         generic_ref.skip_qgroup = for_reloc;
2513                         if (inc)
2514                                 ret = btrfs_inc_extent_ref(trans, &generic_ref);
2515                         else
2516                                 ret = btrfs_free_extent(trans, &generic_ref);
2517                         if (ret)
2518                                 goto fail;
2519                 } else {
2520                         bytenr = btrfs_node_blockptr(buf, i);
2521                         num_bytes = fs_info->nodesize;
2522                         btrfs_init_generic_ref(&generic_ref, action, bytenr,
2523                                                num_bytes, parent);
2524                         generic_ref.real_root = root->root_key.objectid;
2525                         btrfs_init_tree_ref(&generic_ref, level - 1, ref_root);
2526                         generic_ref.skip_qgroup = for_reloc;
2527                         if (inc)
2528                                 ret = btrfs_inc_extent_ref(trans, &generic_ref);
2529                         else
2530                                 ret = btrfs_free_extent(trans, &generic_ref);
2531                         if (ret)
2532                                 goto fail;
2533                 }
2534         }
2535         return 0;
2536 fail:
2537         return ret;
2538 }
2539
2540 int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2541                   struct extent_buffer *buf, int full_backref)
2542 {
2543         return __btrfs_mod_ref(trans, root, buf, full_backref, 1);
2544 }
2545
2546 int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2547                   struct extent_buffer *buf, int full_backref)
2548 {
2549         return __btrfs_mod_ref(trans, root, buf, full_backref, 0);
2550 }
2551
2552 int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr)
2553 {
2554         struct btrfs_block_group_cache *block_group;
2555         int readonly = 0;
2556
2557         block_group = btrfs_lookup_block_group(fs_info, bytenr);
2558         if (!block_group || block_group->ro)
2559                 readonly = 1;
2560         if (block_group)
2561                 btrfs_put_block_group(block_group);
2562         return readonly;
2563 }
2564
2565 static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
2566 {
2567         struct btrfs_fs_info *fs_info = root->fs_info;
2568         u64 flags;
2569         u64 ret;
2570
2571         if (data)
2572                 flags = BTRFS_BLOCK_GROUP_DATA;
2573         else if (root == fs_info->chunk_root)
2574                 flags = BTRFS_BLOCK_GROUP_SYSTEM;
2575         else
2576                 flags = BTRFS_BLOCK_GROUP_METADATA;
2577
2578         ret = btrfs_get_alloc_profile(fs_info, flags);
2579         return ret;
2580 }
2581
2582 static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
2583 {
2584         struct btrfs_block_group_cache *cache;
2585         u64 bytenr;
2586
2587         spin_lock(&fs_info->block_group_cache_lock);
2588         bytenr = fs_info->first_logical_byte;
2589         spin_unlock(&fs_info->block_group_cache_lock);
2590
2591         if (bytenr < (u64)-1)
2592                 return bytenr;
2593
2594         cache = btrfs_lookup_first_block_group(fs_info, search_start);
2595         if (!cache)
2596                 return 0;
2597
2598         bytenr = cache->key.objectid;
2599         btrfs_put_block_group(cache);
2600
2601         return bytenr;
2602 }
2603
2604 static int pin_down_extent(struct btrfs_block_group_cache *cache,
2605                            u64 bytenr, u64 num_bytes, int reserved)
2606 {
2607         struct btrfs_fs_info *fs_info = cache->fs_info;
2608
2609         spin_lock(&cache->space_info->lock);
2610         spin_lock(&cache->lock);
2611         cache->pinned += num_bytes;
2612         btrfs_space_info_update_bytes_pinned(fs_info, cache->space_info,
2613                                              num_bytes);
2614         if (reserved) {
2615                 cache->reserved -= num_bytes;
2616                 cache->space_info->bytes_reserved -= num_bytes;
2617         }
2618         spin_unlock(&cache->lock);
2619         spin_unlock(&cache->space_info->lock);
2620
2621         percpu_counter_add_batch(&cache->space_info->total_bytes_pinned,
2622                     num_bytes, BTRFS_TOTAL_BYTES_PINNED_BATCH);
2623         set_extent_dirty(fs_info->pinned_extents, bytenr,
2624                          bytenr + num_bytes - 1, GFP_NOFS | __GFP_NOFAIL);
2625         return 0;
2626 }
2627
2628 /*
2629  * This function must be called within a transaction.
2630  */
2631 int btrfs_pin_extent(struct btrfs_fs_info *fs_info,
2632                      u64 bytenr, u64 num_bytes, int reserved)
2633 {
2634         struct btrfs_block_group_cache *cache;
2635
2636         cache = btrfs_lookup_block_group(fs_info, bytenr);
2637         BUG_ON(!cache); /* Logic error */
2638
2639         pin_down_extent(cache, bytenr, num_bytes, reserved);
2640
2641         btrfs_put_block_group(cache);
2642         return 0;
2643 }
2644
2645 /*
2646  * This function must be called within a transaction.
2647  */
2648 int btrfs_pin_extent_for_log_replay(struct btrfs_fs_info *fs_info,
2649                                     u64 bytenr, u64 num_bytes)
2650 {
2651         struct btrfs_block_group_cache *cache;
2652         int ret;
2653
2654         cache = btrfs_lookup_block_group(fs_info, bytenr);
2655         if (!cache)
2656                 return -EINVAL;
2657
2658         /*
2659          * Pull in the free space cache (if any) so that our pin
2660          * removes the free space from the cache.  We have load_only set
2661          * to one because the slow code to read in the free extents does check
2662          * the pinned extents.
2663          */
2664         btrfs_cache_block_group(cache, 1);
2665
2666         pin_down_extent(cache, bytenr, num_bytes, 0);
2667
2668         /* remove us from the free space cache (if we're there at all) */
2669         ret = btrfs_remove_free_space(cache, bytenr, num_bytes);
2670         btrfs_put_block_group(cache);
2671         return ret;
2672 }
2673
2674 static int __exclude_logged_extent(struct btrfs_fs_info *fs_info,
2675                                    u64 start, u64 num_bytes)
2676 {
2677         int ret;
2678         struct btrfs_block_group_cache *block_group;
2679         struct btrfs_caching_control *caching_ctl;
2680
2681         block_group = btrfs_lookup_block_group(fs_info, start);
2682         if (!block_group)
2683                 return -EINVAL;
2684
2685         btrfs_cache_block_group(block_group, 0);
2686         caching_ctl = btrfs_get_caching_control(block_group);
2687
2688         if (!caching_ctl) {
2689                 /* Logic error */
2690                 BUG_ON(!btrfs_block_group_cache_done(block_group));
2691                 ret = btrfs_remove_free_space(block_group, start, num_bytes);
2692         } else {
2693                 mutex_lock(&caching_ctl->mutex);
2694
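                /*
                 * Depending on how far caching has progressed, the range is
                 * either marked excluded (not yet scanned), removed from the
                 * free space cache (already scanned), or split between the two.
                 */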
2695                 if (start >= caching_ctl->progress) {
2696                         ret = btrfs_add_excluded_extent(fs_info, start,
2697                                                         num_bytes);
2698                 } else if (start + num_bytes <= caching_ctl->progress) {
2699                         ret = btrfs_remove_free_space(block_group,
2700                                                       start, num_bytes);
2701                 } else {
2702                         num_bytes = caching_ctl->progress - start;
2703                         ret = btrfs_remove_free_space(block_group,
2704                                                       start, num_bytes);
2705                         if (ret)
2706                                 goto out_lock;
2707
2708                         num_bytes = (start + num_bytes) -
2709                                 caching_ctl->progress;
2710                         start = caching_ctl->progress;
2711                         ret = btrfs_add_excluded_extent(fs_info, start,
2712                                                         num_bytes);
2713                 }
2714 out_lock:
2715                 mutex_unlock(&caching_ctl->mutex);
2716                 btrfs_put_caching_control(caching_ctl);
2717         }
2718         btrfs_put_block_group(block_group);
2719         return ret;
2720 }
2721
2722 int btrfs_exclude_logged_extents(struct extent_buffer *eb)
2723 {
2724         struct btrfs_fs_info *fs_info = eb->fs_info;
2725         struct btrfs_file_extent_item *item;
2726         struct btrfs_key key;
2727         int found_type;
2728         int i;
2729         int ret = 0;
2730
2731         if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS))
2732                 return 0;
2733
2734         for (i = 0; i < btrfs_header_nritems(eb); i++) {
2735                 btrfs_item_key_to_cpu(eb, &key, i);
2736                 if (key.type != BTRFS_EXTENT_DATA_KEY)
2737                         continue;
2738                 item = btrfs_item_ptr(eb, i, struct btrfs_file_extent_item);
2739                 found_type = btrfs_file_extent_type(eb, item);
2740                 if (found_type == BTRFS_FILE_EXTENT_INLINE)
2741                         continue;
2742                 if (btrfs_file_extent_disk_bytenr(eb, item) == 0)
2743                         continue;
2744                 key.objectid = btrfs_file_extent_disk_bytenr(eb, item);
2745                 key.offset = btrfs_file_extent_disk_num_bytes(eb, item);
2746                 ret = __exclude_logged_extent(fs_info, key.objectid, key.offset);
2747                 if (ret)
2748                         break;
2749         }
2750
2751         return ret;
2752 }
2753
2754 static void
2755 btrfs_inc_block_group_reservations(struct btrfs_block_group_cache *bg)
2756 {
2757         atomic_inc(&bg->reservations);
2758 }
2759
2760 void btrfs_prepare_extent_commit(struct btrfs_fs_info *fs_info)
2761 {
2762         struct btrfs_caching_control *next;
2763         struct btrfs_caching_control *caching_ctl;
2764         struct btrfs_block_group_cache *cache;
2765
2766         down_write(&fs_info->commit_root_sem);
2767
2768         list_for_each_entry_safe(caching_ctl, next,
2769                                  &fs_info->caching_block_groups, list) {
2770                 cache = caching_ctl->block_group;
2771                 if (btrfs_block_group_cache_done(cache)) {
2772                         cache->last_byte_to_unpin = (u64)-1;
2773                         list_del_init(&caching_ctl->list);
2774                         btrfs_put_caching_control(caching_ctl);
2775                 } else {
2776                         cache->last_byte_to_unpin = caching_ctl->progress;
2777                 }
2778         }
2779
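             /*
              * Flip which freed_extents tree new pins go into.  The tree that
              * accumulated pins during the committing transaction is the one
              * btrfs_finish_extent_commit() will drain.
              */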
2780         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
2781                 fs_info->pinned_extents = &fs_info->freed_extents[1];
2782         else
2783                 fs_info->pinned_extents = &fs_info->freed_extents[0];
2784
2785         up_write(&fs_info->commit_root_sem);
2786
2787         btrfs_update_global_block_rsv(fs_info);
2788 }
2789
2790 /*
2791  * Returns the free cluster for the given space info and sets empty_cluster to
2792  * what it should be based on the mount options.
2793  */
2794 static struct btrfs_free_cluster *
2795 fetch_cluster_info(struct btrfs_fs_info *fs_info,
2796                    struct btrfs_space_info *space_info, u64 *empty_cluster)
2797 {
2798         struct btrfs_free_cluster *ret = NULL;
2799
2800         *empty_cluster = 0;
2801         if (btrfs_mixed_space_info(space_info))
2802                 return ret;
2803
2804         if (space_info->flags & BTRFS_BLOCK_GROUP_METADATA) {
2805                 ret = &fs_info->meta_alloc_cluster;
2806                 if (btrfs_test_opt(fs_info, SSD))
2807                         *empty_cluster = SZ_2M;
2808                 else
2809                         *empty_cluster = SZ_64K;
2810         } else if ((space_info->flags & BTRFS_BLOCK_GROUP_DATA) &&
2811                    btrfs_test_opt(fs_info, SSD_SPREAD)) {
2812                 *empty_cluster = SZ_2M;
2813                 ret = &fs_info->data_alloc_cluster;
2814         }
2815
2816         return ret;
2817 }
2818
2819 static int unpin_extent_range(struct btrfs_fs_info *fs_info,
2820                               u64 start, u64 end,
2821                               const bool return_free_space)
2822 {
2823         struct btrfs_block_group_cache *cache = NULL;
2824         struct btrfs_space_info *space_info;
2825         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
2826         struct btrfs_free_cluster *cluster = NULL;
2827         u64 len;
2828         u64 total_unpinned = 0;
2829         u64 empty_cluster = 0;
2830         bool readonly;
2831
2832         while (start <= end) {
2833                 readonly = false;
2834                 if (!cache ||
2835                     start >= cache->key.objectid + cache->key.offset) {
2836                         if (cache)
2837                                 btrfs_put_block_group(cache);
2838                         total_unpinned = 0;
2839                         cache = btrfs_lookup_block_group(fs_info, start);
2840                         BUG_ON(!cache); /* Logic error */
2841
2842                         cluster = fetch_cluster_info(fs_info,
2843                                                      cache->space_info,
2844                                                      &empty_cluster);
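                             /*
                              * Doubled so that the fragmented flag below is only
                              * cleared once more than twice the nominal cluster
                              * size has been unpinned from this block group.
                              */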
2845                         empty_cluster <<= 1;
2846                 }
2847
2848                 len = cache->key.objectid + cache->key.offset - start;
2849                 len = min(len, end + 1 - start);
2850
2851                 if (start < cache->last_byte_to_unpin && return_free_space) {
2852                         u64 add_len = min(len, cache->last_byte_to_unpin - start);
2853
2854                         btrfs_add_free_space(cache, start, add_len);
2855                 }
2856
2857                 start += len;
2858                 total_unpinned += len;
2859                 space_info = cache->space_info;
2860
2861                 /*
2862                  * If this space cluster has been marked as fragmented and we've
2863                  * unpinned enough in this block group to potentially allow a
2864                  * cluster to be created inside of it, go ahead and clear the
2865                  * fragmented flag.
2866                  */
2867                 if (cluster && cluster->fragmented &&
2868                     total_unpinned > empty_cluster) {
2869                         spin_lock(&cluster->lock);
2870                         cluster->fragmented = 0;
2871                         spin_unlock(&cluster->lock);
2872                 }
2873
2874                 spin_lock(&space_info->lock);
2875                 spin_lock(&cache->lock);
2876                 cache->pinned -= len;
2877                 btrfs_space_info_update_bytes_pinned(fs_info, space_info, -len);
2878                 space_info->max_extent_size = 0;
2879                 percpu_counter_add_batch(&space_info->total_bytes_pinned,
2880                             -len, BTRFS_TOTAL_BYTES_PINNED_BATCH);
2881                 if (cache->ro) {
2882                         space_info->bytes_readonly += len;
2883                         readonly = true;
2884                 }
2885                 spin_unlock(&cache->lock);
2886                 if (!readonly && return_free_space &&
2887                     global_rsv->space_info == space_info) {
2888                         u64 to_add = len;
2889
2890                         spin_lock(&global_rsv->lock);
2891                         if (!global_rsv->full) {
2892                                 to_add = min(len, global_rsv->size -
2893                                              global_rsv->reserved);
2894                                 global_rsv->reserved += to_add;
2895                                 btrfs_space_info_update_bytes_may_use(fs_info,
2896                                                 space_info, to_add);
2897                                 if (global_rsv->reserved >= global_rsv->size)
2898                                         global_rsv->full = 1;
2899                                 len -= to_add;
2900                         }
2901                         spin_unlock(&global_rsv->lock);
2902                         /* Add to any tickets we may have */
2903                         if (len)
2904                                 btrfs_try_granting_tickets(fs_info,
2905                                                            space_info);
2906                 }
2907                 spin_unlock(&space_info->lock);
2908         }
2909
2910         if (cache)
2911                 btrfs_put_block_group(cache);
2912         return 0;
2913 }
2914
2915 int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans)
2916 {
2917         struct btrfs_fs_info *fs_info = trans->fs_info;
2918         struct btrfs_block_group_cache *block_group, *tmp;
2919         struct list_head *deleted_bgs;
2920         struct extent_io_tree *unpin;
2921         u64 start;
2922         u64 end;
2923         int ret;
2924
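             /*
              * btrfs_prepare_extent_commit() swapped pinned_extents away from
              * the tree that collected this transaction's pins, so unpin from
              * whichever freed_extents tree is no longer being pinned into.
              */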
2925         if (fs_info->pinned_extents == &fs_info->freed_extents[0])
2926                 unpin = &fs_info->freed_extents[1];
2927         else
2928                 unpin = &fs_info->freed_extents[0];
2929
2930         while (!TRANS_ABORTED(trans)) {
2931                 struct extent_state *cached_state = NULL;
2932
2933                 mutex_lock(&fs_info->unused_bg_unpin_mutex);
2934                 ret = find_first_extent_bit(unpin, 0, &start, &end,
2935                                             EXTENT_DIRTY, &cached_state);
2936                 if (ret) {
2937                         mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2938                         break;
2939                 }
2940
2941                 if (btrfs_test_opt(fs_info, DISCARD))
2942                         ret = btrfs_discard_extent(fs_info, start,
2943                                                    end + 1 - start, NULL);
2944
2945                 clear_extent_dirty(unpin, start, end, &cached_state);
2946                 unpin_extent_range(fs_info, start, end, true);
2947                 mutex_unlock(&fs_info->unused_bg_unpin_mutex);
2948                 free_extent_state(cached_state);
2949                 cond_resched();
2950         }
2951
2952         /*
2953          * Transaction is finished.  We don't need the lock anymore.  We
2954          * do need to clean up the block groups in case of a transaction
2955          * abort.
2956          */
2957         deleted_bgs = &trans->transaction->deleted_bgs;
2958         list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) {
2959                 u64 trimmed = 0;
2960
2961                 ret = -EROFS;
2962                 if (!TRANS_ABORTED(trans))
2963                         ret = btrfs_discard_extent(fs_info,
2964                                                    block_group->key.objectid,
2965                                                    block_group->key.offset,
2966                                                    &trimmed);
2967
2968                 list_del_init(&block_group->bg_list);
2969                 btrfs_put_block_group_trimming(block_group);
2970                 btrfs_put_block_group(block_group);
2971
2972                 if (ret) {
2973                         const char *errstr = btrfs_decode_error(ret);
2974                         btrfs_warn(fs_info,
2975                            "discard failed while removing blockgroup: errno=%d %s",
2976                                    ret, errstr);
2977                 }
2978         }
2979
2980         return 0;
2981 }
2982
2983 static int __btrfs_free_extent(struct btrfs_trans_handle *trans,
2984                                struct btrfs_delayed_ref_node *node, u64 parent,
2985                                u64 root_objectid, u64 owner_objectid,
2986                                u64 owner_offset, int refs_to_drop,
2987                                struct btrfs_delayed_extent_op *extent_op)
2988 {
2989         struct btrfs_fs_info *info = trans->fs_info;
2990         struct btrfs_key key;
2991         struct btrfs_path *path;
2992         struct btrfs_root *extent_root = info->extent_root;
2993         struct extent_buffer *leaf;
2994         struct btrfs_extent_item *ei;
2995         struct btrfs_extent_inline_ref *iref;
2996         int ret;
2997         int is_data;
2998         int extent_slot = 0;
2999         int found_extent = 0;
3000         int num_to_del = 1;
3001         u32 item_size;
3002         u64 refs;
3003         u64 bytenr = node->bytenr;
3004         u64 num_bytes = node->num_bytes;
3005         int last_ref = 0;
3006         bool skinny_metadata = btrfs_fs_incompat(info, SKINNY_METADATA);
3007
3008         path = btrfs_alloc_path();
3009         if (!path)
3010                 return -ENOMEM;
3011
3012         path->reada = READA_FORWARD;
3013         path->leave_spinning = 1;
3014
3015         is_data = owner_objectid >= BTRFS_FIRST_FREE_OBJECTID;
3016         BUG_ON(!is_data && refs_to_drop != 1);
3017
3018         if (is_data)
3019                 skinny_metadata = false;
3020
3021         ret = lookup_extent_backref(trans, path, &iref, bytenr, num_bytes,
3022                                     parent, root_objectid, owner_objectid,
3023                                     owner_offset);
3024         if (ret == 0) {
3025                 extent_slot = path->slots[0];
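                     /*
                      * The path may be positioned at a keyed backref item rather
                      * than the extent item itself, so walk backwards (a handful
                      * of slots at most) looking for the EXTENT_ITEM or skinny
                      * METADATA_ITEM that shares this bytenr.
                      */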
3026                 while (extent_slot >= 0) {
3027                         btrfs_item_key_to_cpu(path->nodes[0], &key,
3028                                               extent_slot);
3029                         if (key.objectid != bytenr)
3030                                 break;
3031                         if (key.type == BTRFS_EXTENT_ITEM_KEY &&
3032                             key.offset == num_bytes) {
3033                                 found_extent = 1;
3034                                 break;
3035                         }
3036                         if (key.type == BTRFS_METADATA_ITEM_KEY &&
3037                             key.offset == owner_objectid) {
3038                                 found_extent = 1;
3039                                 break;
3040                         }
3041                         if (path->slots[0] - extent_slot > 5)
3042                                 break;
3043                         extent_slot--;
3044                 }
3045
3046                 if (!found_extent) {
3047                         BUG_ON(iref);
3048                         ret = remove_extent_backref(trans, path, NULL,
3049                                                     refs_to_drop,
3050                                                     is_data, &last_ref);
3051                         if (ret) {
3052                                 btrfs_abort_transaction(trans, ret);
3053                                 goto out;
3054                         }
3055                         btrfs_release_path(path);
3056                         path->leave_spinning = 1;
3057
3058                         key.objectid = bytenr;
3059                         key.type = BTRFS_EXTENT_ITEM_KEY;
3060                         key.offset = num_bytes;
3061
3062                         if (!is_data && skinny_metadata) {
3063                                 key.type = BTRFS_METADATA_ITEM_KEY;
3064                                 key.offset = owner_objectid;
3065                         }
3066
3067                         ret = btrfs_search_slot(trans, extent_root,
3068                                                 &key, path, -1, 1);
3069                         if (ret > 0 && skinny_metadata && path->slots[0]) {
3070                                 /*
3071                                  * Couldn't find our skinny metadata item,
3072                                  * see if we have ye olde extent item.
3073                                  */
3074                                 path->slots[0]--;
3075                                 btrfs_item_key_to_cpu(path->nodes[0], &key,
3076                                                       path->slots[0]);
3077                                 if (key.objectid == bytenr &&
3078                                     key.type == BTRFS_EXTENT_ITEM_KEY &&
3079                                     key.offset == num_bytes)
3080                                         ret = 0;
3081                         }
3082
3083                         if (ret > 0 && skinny_metadata) {
3084                                 skinny_metadata = false;
3085                                 key.objectid = bytenr;
3086                                 key.type = BTRFS_EXTENT_ITEM_KEY;
3087                                 key.offset = num_bytes;
3088                                 btrfs_release_path(path);
3089                                 ret = btrfs_search_slot(trans, extent_root,
3090                                                         &key, path, -1, 1);
3091                         }
3092
3093                         if (ret) {
3094                                 btrfs_err(info,
3095                                           "umm, got %d back from search, was looking for %llu",
3096                                           ret, bytenr);
3097                                 if (ret > 0)
3098                                         btrfs_print_leaf(path->nodes[0]);
3099                         }
3100                         if (ret < 0) {
3101                                 btrfs_abort_transaction(trans, ret);
3102                                 goto out;
3103                         }
3104                         extent_slot = path->slots[0];
3105                 }
3106         } else if (WARN_ON(ret == -ENOENT)) {
3107                 btrfs_print_leaf(path->nodes[0]);
3108                 btrfs_err(info,
3109                         "unable to find ref byte nr %llu parent %llu root %llu  owner %llu offset %llu",
3110                         bytenr, parent, root_objectid, owner_objectid,
3111                         owner_offset);
3112                 btrfs_abort_transaction(trans, ret);
3113                 goto out;
3114         } else {
3115                 btrfs_abort_transaction(trans, ret);
3116                 goto out;
3117         }
3118
3119         leaf = path->nodes[0];
3120         item_size = btrfs_item_size_nr(leaf, extent_slot);
3121         if (unlikely(item_size < sizeof(*ei))) {
3122                 ret = -EINVAL;
3123                 btrfs_print_v0_err(info);
3124                 btrfs_abort_transaction(trans, ret);
3125                 goto out;
3126         }
3127         ei = btrfs_item_ptr(leaf, extent_slot,
3128                             struct btrfs_extent_item);
3129         if (owner_objectid < BTRFS_FIRST_FREE_OBJECTID &&
3130             key.type == BTRFS_EXTENT_ITEM_KEY) {
3131                 struct btrfs_tree_block_info *bi;
3132                 BUG_ON(item_size < sizeof(*ei) + sizeof(*bi));
3133                 bi = (struct btrfs_tree_block_info *)(ei + 1);
3134                 WARN_ON(owner_objectid != btrfs_tree_block_level(leaf, bi));
3135         }
3136
3137         refs = btrfs_extent_refs(leaf, ei);
3138         if (refs < refs_to_drop) {
3139                 btrfs_err(info,
3140                           "trying to drop %d refs but we only have %Lu for bytenr %Lu",
3141                           refs_to_drop, refs, bytenr);
3142                 ret = -EINVAL;
3143                 btrfs_abort_transaction(trans, ret);
3144                 goto out;
3145         }
3146         refs -= refs_to_drop;
3147
3148         if (refs > 0) {
3149                 if (extent_op)
3150                         __run_delayed_extent_op(extent_op, leaf, ei);
3151                 /*
3152                  * In the case of an inline back ref, the reference count will
3153                  * be updated by remove_extent_backref().
3154                  */
3155                 if (iref) {
3156                         BUG_ON(!found_extent);
3157                 } else {
3158                         btrfs_set_extent_refs(leaf, ei, refs);
3159                         btrfs_mark_buffer_dirty(leaf);
3160                 }
3161                 if (found_extent) {
3162                         ret = remove_extent_backref(trans, path, iref,
3163                                                     refs_to_drop, is_data,
3164                                                     &last_ref);
3165                         if (ret) {
3166                                 btrfs_abort_transaction(trans, ret);
3167                                 goto out;
3168                         }
3169                 }
3170         } else {
3171                 if (found_extent) {
3172                         BUG_ON(is_data && refs_to_drop !=
3173                                extent_data_ref_count(path, iref));
3174                         if (iref) {
3175                                 BUG_ON(path->slots[0] != extent_slot);
3176                         } else {
3177                                 BUG_ON(path->slots[0] != extent_slot + 1);
3178                                 path->slots[0] = extent_slot;
3179                                 num_to_del = 2;
3180                         }
3181                 }
3182
3183                 last_ref = 1;
3184                 ret = btrfs_del_items(trans, extent_root, path, path->slots[0],
3185                                       num_to_del);
3186                 if (ret) {
3187                         btrfs_abort_transaction(trans, ret);
3188                         goto out;
3189                 }
3190                 btrfs_release_path(path);
3191
3192                 if (is_data) {
3193                         ret = btrfs_del_csums(trans, info->csum_root, bytenr,
3194                                               num_bytes);
3195                         if (ret) {
3196                                 btrfs_abort_transaction(trans, ret);
3197                                 goto out;
3198                         }
3199                 }
3200
3201                 ret = add_to_free_space_tree(trans, bytenr, num_bytes);
3202                 if (ret) {
3203                         btrfs_abort_transaction(trans, ret);
3204                         goto out;
3205                 }
3206
3207                 ret = btrfs_update_block_group(trans, bytenr, num_bytes, 0);
3208                 if (ret) {
3209                         btrfs_abort_transaction(trans, ret);
3210                         goto out;
3211                 }
3212         }
3213         btrfs_release_path(path);
3214
3215 out:
3216         btrfs_free_path(path);
3217         return ret;
3218 }
3219
3220 /*
3221  * When we free a block, it is possible (and likely) that we free the last
3222  * delayed ref for that extent as well.  This searches the delayed ref tree for
3223  * a given extent, and if there are no other delayed refs to be processed, it
3224  * removes it from the tree.
3225  */
3226 static noinline int check_ref_cleanup(struct btrfs_trans_handle *trans,
3227                                       u64 bytenr)
3228 {
3229         struct btrfs_delayed_ref_head *head;
3230         struct btrfs_delayed_ref_root *delayed_refs;
3231         int ret = 0;
3232
3233         delayed_refs = &trans->transaction->delayed_refs;
3234         spin_lock(&delayed_refs->lock);
3235         head = btrfs_find_delayed_ref_head(delayed_refs, bytenr);
3236         if (!head)
3237                 goto out_delayed_unlock;
3238
3239         spin_lock(&head->lock);
3240         if (!RB_EMPTY_ROOT(&head->ref_tree.rb_root))
3241                 goto out;
3242
3243         if (cleanup_extent_op(head) != NULL)
3244                 goto out;
3245
3246         /*
3247          * Waiting for the lock here would deadlock.  If someone else has it
3248          * locked, they are already in the process of dropping it anyway.
3249          */
3250         if (!mutex_trylock(&head->mutex))
3251                 goto out;
3252
3253         btrfs_delete_ref_head(delayed_refs, head);
3254         head->processing = 0;
3255
3256         spin_unlock(&head->lock);
3257         spin_unlock(&delayed_refs->lock);
3258
3259         BUG_ON(head->extent_op);
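             /*
              * Return 1 if the removed head still carried must_insert_reserved,
              * so the caller knows it has to release the reserved extent itself
              * rather than leaving it to delayed ref processing.
              */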
3260         if (head->must_insert_reserved)
3261                 ret = 1;
3262
3263         btrfs_cleanup_ref_head_accounting(trans->fs_info, delayed_refs, head);
3264         mutex_unlock(&head->mutex);
3265         btrfs_put_delayed_ref_head(head);
3266         return ret;
3267 out:
3268         spin_unlock(&head->lock);
3269
3270 out_delayed_unlock:
3271         spin_unlock(&delayed_refs->lock);
3272         return 0;
3273 }
3274
3275 void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
3276                            struct btrfs_root *root,
3277                            struct extent_buffer *buf,
3278                            u64 parent, int last_ref)
3279 {
3280         struct btrfs_fs_info *fs_info = root->fs_info;
3281         struct btrfs_ref generic_ref = { 0 };
3282         int pin = 1;
3283         int ret;
3284
3285         btrfs_init_generic_ref(&generic_ref, BTRFS_DROP_DELAYED_REF,
3286                                buf->start, buf->len, parent);
3287         btrfs_init_tree_ref(&generic_ref, btrfs_header_level(buf),
3288                             root->root_key.objectid);
3289
3290         if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
3291                 int old_ref_mod, new_ref_mod;
3292
3293                 btrfs_ref_tree_mod(fs_info, &generic_ref);
3294                 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref, NULL,
3295                                                  &old_ref_mod, &new_ref_mod);
3296                 BUG_ON(ret); /* -ENOMEM */
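                     /*
                      * Only account the bytes as pinned (via add_pinned_bytes()
                      * below) when this drop takes the accumulated ref mod
                      * negative, i.e. delayed ref processing is now expected to
                      * free the extent.
                      */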
3297                 pin = old_ref_mod >= 0 && new_ref_mod < 0;
3298         }
3299
3300         if (last_ref && btrfs_header_generation(buf) == trans->transid) {
3301                 struct btrfs_block_group_cache *cache;
3302
3303                 if (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID) {
3304                         ret = check_ref_cleanup(trans, buf->start);
3305                         if (!ret)
3306                                 goto out;
3307                 }
3308
3309                 pin = 0;
3310                 cache = btrfs_lookup_block_group(fs_info, buf->start);
3311
3312                 if (btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN)) {
3313                         pin_down_extent(cache, buf->start, buf->len, 1);
3314                         btrfs_put_block_group(cache);
3315                         goto out;
3316                 }
3317
3318                 WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags));
3319
3320                 btrfs_add_free_space(cache, buf->start, buf->len);
3321                 btrfs_free_reserved_bytes(cache, buf->len, 0);
3322                 btrfs_put_block_group(cache);
3323                 trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len);
3324         }
3325 out:
3326         if (pin)
3327                 add_pinned_bytes(fs_info, &generic_ref);
3328
3329         if (last_ref) {
3330                 /*
3331                  * Deleting the buffer, clear the corrupt flag since it doesn't
3332                  * matter anymore.
3333                  */
3334                 clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags);
3335         }
3336 }
3337
3338 /* Can return -ENOMEM */
3339 int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref)
3340 {
3341         struct btrfs_fs_info *fs_info = trans->fs_info;
3342         int old_ref_mod, new_ref_mod;
3343         int ret;
3344
3345         if (btrfs_is_testing(fs_info))
3346                 return 0;
3347
3348         /*
3349          * Tree log blocks never actually go into the extent allocation
3350          * tree; just update the pinning info and exit early.
3351          */
3352         if ((ref->type == BTRFS_REF_METADATA &&
3353              ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
3354             (ref->type == BTRFS_REF_DATA &&
3355              ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)) {
3356                 /* unlocks the pinned mutex */
3357                 btrfs_pin_extent(fs_info, ref->bytenr, ref->len, 1);
3358                 old_ref_mod = new_ref_mod = 0;
3359                 ret = 0;
3360         } else if (ref->type == BTRFS_REF_METADATA) {
3361                 ret = btrfs_add_delayed_tree_ref(trans, ref, NULL,
3362                                                  &old_ref_mod, &new_ref_mod);
3363         } else {
3364                 ret = btrfs_add_delayed_data_ref(trans, ref, 0,
3365                                                  &old_ref_mod, &new_ref_mod);
3366         }
3367
3368         if (!((ref->type == BTRFS_REF_METADATA &&
3369                ref->tree_ref.root == BTRFS_TREE_LOG_OBJECTID) ||
3370               (ref->type == BTRFS_REF_DATA &&
3371                ref->data_ref.ref_root == BTRFS_TREE_LOG_OBJECTID)))
3372                 btrfs_ref_tree_mod(fs_info, ref);
3373
3374         if (ret == 0 && old_ref_mod >= 0 && new_ref_mod < 0)
3375                 add_pinned_bytes(fs_info, ref);
3376
3377         return ret;
3378 }
3379
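     /*
      * Stages of the find_free_extent() search, in order of increasing effort.
      * find_free_extent_update_loop() advances through them whenever a full
      * pass over the block groups comes up empty.
      */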
3380 enum btrfs_loop_type {
3381         LOOP_CACHING_NOWAIT,
3382         LOOP_CACHING_WAIT,
3383         LOOP_ALLOC_CHUNK,
3384         LOOP_NO_EMPTY_SIZE,
3385 };
3386
3387 static inline void
3388 btrfs_lock_block_group(struct btrfs_block_group_cache *cache,
3389                        int delalloc)
3390 {
3391         if (delalloc)
3392                 down_read(&cache->data_rwsem);
3393 }
3394
3395 static inline void
3396 btrfs_grab_block_group(struct btrfs_block_group_cache *cache,
3397                        int delalloc)
3398 {
3399         btrfs_get_block_group(cache);
3400         if (delalloc)
3401                 down_read(&cache->data_rwsem);
3402 }
3403
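     /*
      * Grab the block group that currently backs @cluster so we can allocate
      * from it: take a reference (and, for delalloc, its data_rwsem) unless it
      * is the block group we already hold.  The cluster's refill_lock is
      * acquired here and is still held on return.
      */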
3404 static struct btrfs_block_group_cache *
3405 btrfs_lock_cluster(struct btrfs_block_group_cache *block_group,
3406                    struct btrfs_free_cluster *cluster,
3407                    int delalloc)
3408 {
3409         struct btrfs_block_group_cache *used_bg = NULL;
3410
3411         spin_lock(&cluster->refill_lock);
3412         while (1) {
3413                 used_bg = cluster->block_group;
3414                 if (!used_bg)
3415                         return NULL;
3416
3417                 if (used_bg == block_group)
3418                         return used_bg;
3419
3420                 btrfs_get_block_group(used_bg);
3421
3422                 if (!delalloc)
3423                         return used_bg;
3424
3425                 if (down_read_trylock(&used_bg->data_rwsem))
3426                         return used_bg;
3427
3428                 spin_unlock(&cluster->refill_lock);
3429
3430                 /* We should only have one level of nesting. */
3431                 down_read_nested(&used_bg->data_rwsem, SINGLE_DEPTH_NESTING);
3432
3433                 spin_lock(&cluster->refill_lock);
3434                 if (used_bg == cluster->block_group)
3435                         return used_bg;
3436
3437                 up_read(&used_bg->data_rwsem);
3438                 btrfs_put_block_group(used_bg);
3439         }
3440 }
3441
3442 static inline void
3443 btrfs_release_block_group(struct btrfs_block_group_cache *cache,
3444                          int delalloc)
3445 {
3446         if (delalloc)
3447                 up_read(&cache->data_rwsem);
3448         btrfs_put_block_group(cache);
3449 }
3450
3451 /*
3452  * Structure used internally by the find_free_extent() function.  Wraps the
3453  * needed parameters.
3454  */
3455 struct find_free_extent_ctl {
3456         /* Basic allocation info */
3457         u64 ram_bytes;
3458         u64 num_bytes;
3459         u64 empty_size;
3460         u64 flags;
3461         int delalloc;
3462
3463         /* Where to start the search inside the bg */
3464         u64 search_start;
3465
3466         /* For clustered allocation */
3467         u64 empty_cluster;
3468
3469         bool have_caching_bg;
3470         bool orig_have_caching_bg;
3471
3472         /* RAID index, converted from flags */
3473         int index;
3474
3475         /*
3476          * Current loop number; see find_free_extent_update_loop() for details.
3477          */
3478         int loop;
3479
3480         /*
3481          * Whether we're refilling a cluster; if true we need to re-search the
3482          * current block group but don't try to refill the cluster again.
3483          */
3484         bool retry_clustered;
3485
3486         /*
3487          * Whether we're updating the free space cache; if true we need to
3488          * re-search the current block group but don't try updating it again.
3489          */
3490         bool retry_unclustered;
3491
3492         /* If current block group is cached */
3493         int cached;
3494
3495         /* Max contiguous hole found */
3496         u64 max_extent_size;
3497
3498         /* Total free space from free space cache, not always contiguous */
3499         u64 total_free_space;
3500
3501         /* Found result */
3502         u64 found_offset;
3503 };
3504
3505
3506 /*
3507  * Helper function for find_free_extent().
3508  *
3509  * Return -ENOENT to inform the caller that we need to fall back to unclustered mode.
3510  * Return -EAGAIN to inform the caller that we need to re-search this block group.
3511  * Return >0 to inform the caller that we found nothing.
3512  * Return 0 when we have found a location and set ffe_ctl->found_offset.
3513  */
3514 static int find_free_extent_clustered(struct btrfs_block_group_cache *bg,
3515                 struct btrfs_free_cluster *last_ptr,
3516                 struct find_free_extent_ctl *ffe_ctl,
3517                 struct btrfs_block_group_cache **cluster_bg_ret)
3518 {
3519         struct btrfs_block_group_cache *cluster_bg;
3520         u64 aligned_cluster;
3521         u64 offset;
3522         int ret;
3523
3524         cluster_bg = btrfs_lock_cluster(bg, last_ptr, ffe_ctl->delalloc);
3525         if (!cluster_bg)
3526                 goto refill_cluster;
3527         if (cluster_bg != bg && (cluster_bg->ro ||
3528             !block_group_bits(cluster_bg, ffe_ctl->flags)))
3529                 goto release_cluster;
3530
3531         offset = btrfs_alloc_from_cluster(cluster_bg, last_ptr,
3532                         ffe_ctl->num_bytes, cluster_bg->key.objectid,
3533                         &ffe_ctl->max_extent_size);
3534         if (offset) {
3535                 /* We have a block, we're done */
3536                 spin_unlock(&last_ptr->refill_lock);
3537                 trace_btrfs_reserve_extent_cluster(cluster_bg,
3538                                 ffe_ctl->search_start, ffe_ctl->num_bytes);
3539                 *cluster_bg_ret = cluster_bg;
3540                 ffe_ctl->found_offset = offset;
3541                 return 0;
3542         }
3543         WARN_ON(last_ptr->block_group != cluster_bg);
3544
3545 release_cluster:
3546         /*
3547          * If we are on LOOP_NO_EMPTY_SIZE, we can't set up a new cluster, so
3548          * let's just skip it and let the allocator find whatever block it can
3549          * find. If we reach this point, we will have tried the cluster
3550          * allocator plenty of times and not have found anything, so we are
3551          * likely way too fragmented for the clustering stuff to find anything.
3552          *
3553          * However, if the cluster is taken from the current block group,
3554          * release the cluster first, so that we stand a better chance of
3555          * succeeding in the unclustered allocation.
3556          */
3557         if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE && cluster_bg != bg) {
3558                 spin_unlock(&last_ptr->refill_lock);
3559                 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
3560                 return -ENOENT;
3561         }
3562
3563         /* This cluster didn't work out, free it and start over */
3564         btrfs_return_cluster_to_free_space(NULL, last_ptr);
3565
3566         if (cluster_bg != bg)
3567                 btrfs_release_block_group(cluster_bg, ffe_ctl->delalloc);
3568
3569 refill_cluster:
3570         if (ffe_ctl->loop >= LOOP_NO_EMPTY_SIZE) {
3571                 spin_unlock(&last_ptr->refill_lock);
3572                 return -ENOENT;
3573         }
3574
3575         aligned_cluster = max_t(u64,
3576                         ffe_ctl->empty_cluster + ffe_ctl->empty_size,
3577                         bg->full_stripe_len);
3578         ret = btrfs_find_space_cluster(bg, last_ptr, ffe_ctl->search_start,
3579                         ffe_ctl->num_bytes, aligned_cluster);
3580         if (ret == 0) {
3581                 /* Now pull our allocation out of this cluster */
3582                 offset = btrfs_alloc_from_cluster(bg, last_ptr,
3583                                 ffe_ctl->num_bytes, ffe_ctl->search_start,
3584                                 &ffe_ctl->max_extent_size);
3585                 if (offset) {
3586                         /* We found one, proceed */
3587                         spin_unlock(&last_ptr->refill_lock);
3588                         trace_btrfs_reserve_extent_cluster(bg,
3589                                         ffe_ctl->search_start,
3590                                         ffe_ctl->num_bytes);
3591                         ffe_ctl->found_offset = offset;
3592                         return 0;
3593                 }
3594         } else if (!ffe_ctl->cached && ffe_ctl->loop > LOOP_CACHING_NOWAIT &&
3595                    !ffe_ctl->retry_clustered) {
3596                 spin_unlock(&last_ptr->refill_lock);
3597
3598                 ffe_ctl->retry_clustered = true;
3599                 btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
3600                                 ffe_ctl->empty_cluster + ffe_ctl->empty_size);
3601                 return -EAGAIN;
3602         }
3603         /*
3604          * At this point we either didn't find a cluster or we weren't able to
3605          * allocate a block from our cluster.  Free the cluster we've been
3606          * trying to use, and go to the next block group.
3607          */
3608         btrfs_return_cluster_to_free_space(NULL, last_ptr);
3609         spin_unlock(&last_ptr->refill_lock);
3610         return 1;
3611 }
3612
3613 /*
3614  * Return >0 to inform the caller that we found nothing.
3615  * Return 0 when we found a free extent and set ffe_ctl->found_offset.
3616  * Return -EAGAIN to inform the caller that we need to re-search this block group.
3617  */
3618 static int find_free_extent_unclustered(struct btrfs_block_group_cache *bg,
3619                 struct btrfs_free_cluster *last_ptr,
3620                 struct find_free_extent_ctl *ffe_ctl)
3621 {
3622         u64 offset;
3623
3624         /*
3625          * We are doing an unclustered allocation, so set the fragmented flag
3626          * so we don't bother trying to set up a cluster again until we get
3627          * more space.
3628          */
3629         if (unlikely(last_ptr)) {
3630                 spin_lock(&last_ptr->lock);
3631                 last_ptr->fragmented = 1;
3632                 spin_unlock(&last_ptr->lock);
3633         }
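             /*
              * Fast path for fully cached block groups: if the free space
              * accounting already shows less free space than this allocation
              * needs (including any cluster and empty_size padding), remember
              * how much it did have and move on without searching.
              */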
3634         if (ffe_ctl->cached) {
3635                 struct btrfs_free_space_ctl *free_space_ctl;
3636
3637                 free_space_ctl = bg->free_space_ctl;
3638                 spin_lock(&free_space_ctl->tree_lock);
3639                 if (free_space_ctl->free_space <
3640                     ffe_ctl->num_bytes + ffe_ctl->empty_cluster +
3641                     ffe_ctl->empty_size) {
3642                         ffe_ctl->total_free_space = max_t(u64,
3643                                         ffe_ctl->total_free_space,
3644                                         free_space_ctl->free_space);
3645                         spin_unlock(&free_space_ctl->tree_lock);
3646                         return 1;
3647                 }
3648                 spin_unlock(&free_space_ctl->tree_lock);
3649         }
3650
3651         offset = btrfs_find_space_for_alloc(bg, ffe_ctl->search_start,
3652                         ffe_ctl->num_bytes, ffe_ctl->empty_size,
3653                         &ffe_ctl->max_extent_size);
3654
3655         /*
3656          * If we didn't find a chunk, and we haven't failed on this block group
3657          * before, and this block group is in the middle of caching and we are
3658          * ok with waiting, then go ahead and wait for progress to be made, and
3659          * set @retry_unclustered to true.
3660          *
3661          * If @retry_unclustered is true then we've already waited on this
3662          * block group once and should move on to the next block group.
3663          */
3664         if (!offset && !ffe_ctl->retry_unclustered && !ffe_ctl->cached &&
3665             ffe_ctl->loop > LOOP_CACHING_NOWAIT) {
3666                 btrfs_wait_block_group_cache_progress(bg, ffe_ctl->num_bytes +
3667                                                       ffe_ctl->empty_size);
3668                 ffe_ctl->retry_unclustered = true;
3669                 return -EAGAIN;
3670         } else if (!offset) {
3671                 return 1;
3672         }
3673         ffe_ctl->found_offset = offset;
3674         return 0;
3675 }
3676
3677 /*
3678  * Return >0 means the caller needs to re-search for a free extent.
3679  * Return 0 means we have the needed free extent.
3680  * Return <0 means we failed to locate any free extent.
3681  */
3682 static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info,
3683                                         struct btrfs_free_cluster *last_ptr,
3684                                         struct btrfs_key *ins,
3685                                         struct find_free_extent_ctl *ffe_ctl,
3686                                         int full_search, bool use_cluster)
3687 {
3688         struct btrfs_root *root = fs_info->extent_root;
3689         int ret;
3690
3691         if ((ffe_ctl->loop == LOOP_CACHING_NOWAIT) &&
3692             ffe_ctl->have_caching_bg && !ffe_ctl->orig_have_caching_bg)
3693                 ffe_ctl->orig_have_caching_bg = true;
3694
3695         if (!ins->objectid && ffe_ctl->loop >= LOOP_CACHING_WAIT &&
3696             ffe_ctl->have_caching_bg)
3697                 return 1;
3698
3699         if (!ins->objectid && ++(ffe_ctl->index) < BTRFS_NR_RAID_TYPES)
3700                 return 1;
3701
3702         if (ins->objectid) {
3703                 if (!use_cluster && last_ptr) {
3704                         spin_lock(&last_ptr->lock);
3705                         last_ptr->window_start = ins->objectid;
3706                         spin_unlock(&last_ptr->lock);
3707                 }
3708                 return 0;
3709         }
3710
3711         /*
3712          * LOOP_CACHING_NOWAIT, search partially cached block groups, kicking
3713          *                      caching kthreads as we move along
3714          * LOOP_CACHING_WAIT, search everything, and wait if our bg is caching
3715          * LOOP_ALLOC_CHUNK, force a chunk allocation and try again
3716          * LOOP_NO_EMPTY_SIZE, set empty_size and empty_cluster to 0 and try
3717          *                     again
3718          */
3719         if (ffe_ctl->loop < LOOP_NO_EMPTY_SIZE) {
3720                 ffe_ctl->index = 0;
3721                 if (ffe_ctl->loop == LOOP_CACHING_NOWAIT) {
3722                         /*
3723                          * We want to skip the LOOP_CACHING_WAIT step if we
3724                          * don't have any uncached bgs and we've already done a
3725                          * full search through.
3726                          */
3727                         if (ffe_ctl->orig_have_caching_bg || !full_search)
3728                                 ffe_ctl->loop = LOOP_CACHING_WAIT;
3729                         else
3730                                 ffe_ctl->loop = LOOP_ALLOC_CHUNK;
3731                 } else {
3732                         ffe_ctl->loop++;
3733                 }
3734
3735                 if (ffe_ctl->loop == LOOP_ALLOC_CHUNK) {
3736                         struct btrfs_trans_handle *trans;
3737                         int exist = 0;
3738
3739                         trans = current->journal_info;
3740                         if (trans)
3741                                 exist = 1;
3742                         else
3743                                 trans = btrfs_join_transaction(root);
3744
3745                         if (IS_ERR(trans)) {
3746                                 ret = PTR_ERR(trans);
3747                                 return ret;
3748                         }
3749
3750                         ret = btrfs_chunk_alloc(trans, ffe_ctl->flags,
3751                                                 CHUNK_ALLOC_FORCE);
3752
3753                         /*
3754                          * If we can't allocate a new chunk, we've already
3755                          * looped through at least once, so move on to the
3756                          * NO_EMPTY_SIZE case.
3757                          */
3758                         if (ret == -ENOSPC)
3759                                 ffe_ctl->loop = LOOP_NO_EMPTY_SIZE;
3760
3761                         /* Do not bail out on ENOSPC since we can do more. */
3762                         if (ret < 0 && ret != -ENOSPC)
3763                                 btrfs_abort_transaction(trans, ret);
3764                         else
3765                                 ret = 0;
3766                         if (!exist)
3767                                 btrfs_end_transaction(trans);
3768                         if (ret)
3769                                 return ret;
3770                 }
3771
3772                 if (ffe_ctl->loop == LOOP_NO_EMPTY_SIZE) {
3773                         /*
3774                          * Don't loop again if we already have no empty_size and
3775                          * no empty_cluster.
3776                          */
3777                         if (ffe_ctl->empty_size == 0 &&
3778                             ffe_ctl->empty_cluster == 0)
3779                                 return -ENOSPC;
3780                         ffe_ctl->empty_size = 0;
3781                         ffe_ctl->empty_cluster = 0;
3782                 }
3783                 return 1;
3784         }
3785         return -ENOSPC;
3786 }
3787
3788 /*
3789  * Walks the btree of allocated extents and finds a hole of a given size.
3790  * The key ins is changed to record the hole:
3791  * ins->objectid == start position
3792  * ins->type == BTRFS_EXTENT_ITEM_KEY
3793  * ins->offset == the size of the hole.
3794  * Any available blocks before search_start are skipped.
3795  *
3796  * If there is no suitable free space, we will record the max size of
3797  * the largest free space extent currently available.
3798  *
3799  * The overall logic and call chain:
3800  *
3801  * find_free_extent()
3802  * |- Iterate through all block groups
3803  * |  |- Get a valid block group
3804  * |  |- Try to do clustered allocation in that block group
3805  * |  |- Try to do unclustered allocation in that block group
3806  * |  |- Check if the result is valid
3807  * |  |  |- If valid, then exit
3808  * |  |- Jump to next block group
3809  * |
3810  * |- Push harder to find free extents
3811  *    |- If not found, re-iterate all block groups
3812  */
3813 static noinline int find_free_extent(struct btrfs_root *root,
3814                                 u64 ram_bytes, u64 num_bytes, u64 empty_size,
3815                                 u64 hint_byte, struct btrfs_key *ins,
3816                                 u64 flags, int delalloc)
3817 {
3818         struct btrfs_fs_info *fs_info = root->fs_info;
3819         int ret = 0;
3820         int cache_block_group_error = 0;
3821         struct btrfs_free_cluster *last_ptr = NULL;
3822         struct btrfs_block_group_cache *block_group = NULL;
3823         struct find_free_extent_ctl ffe_ctl = {0};
3824         struct btrfs_space_info *space_info;
3825         bool use_cluster = true;
3826         bool full_search = false;
3827
3828         WARN_ON(num_bytes < fs_info->sectorsize);
3829
3830         ffe_ctl.ram_bytes = ram_bytes;
3831         ffe_ctl.num_bytes = num_bytes;
3832         ffe_ctl.empty_size = empty_size;
3833         ffe_ctl.flags = flags;
3834         ffe_ctl.search_start = 0;
3835         ffe_ctl.retry_clustered = false;
3836         ffe_ctl.retry_unclustered = false;
3837         ffe_ctl.delalloc = delalloc;
3838         ffe_ctl.index = btrfs_bg_flags_to_raid_index(flags);
3839         ffe_ctl.have_caching_bg = false;
3840         ffe_ctl.orig_have_caching_bg = false;
3841         ffe_ctl.found_offset = 0;
3842
3843         ins->type = BTRFS_EXTENT_ITEM_KEY;
3844         ins->objectid = 0;
3845         ins->offset = 0;
3846
3847         trace_find_free_extent(root, num_bytes, empty_size, flags);
3848
3849         space_info = btrfs_find_space_info(fs_info, flags);
3850         if (!space_info) {
3851                 btrfs_err(fs_info, "No space info for %llu", flags);
3852                 return -ENOSPC;
3853         }
3854
3855         /*
3856          * If our free space is heavily fragmented we may not be able to make
3857          * big contiguous allocations, so instead of doing the expensive search
3858          * for free space, simply return ENOSPC with our max_extent_size so we
3859          * can go ahead and search for a more manageable chunk.
3860          *
3861          * If our max_extent_size is large enough for our allocation, simply
3862          * disable clustering, since we will likely not be able to find enough
3863          * space to create a cluster and would only induce latency trying.
3864          */
3865         if (unlikely(space_info->max_extent_size)) {
3866                 spin_lock(&space_info->lock);
3867                 if (space_info->max_extent_size &&
3868                     num_bytes > space_info->max_extent_size) {
3869                         ins->offset = space_info->max_extent_size;
3870                         spin_unlock(&space_info->lock);
3871                         return -ENOSPC;
3872                 } else if (space_info->max_extent_size) {
3873                         use_cluster = false;
3874                 }
3875                 spin_unlock(&space_info->lock);
3876         }
3877
3878         last_ptr = fetch_cluster_info(fs_info, space_info,
3879                                       &ffe_ctl.empty_cluster);
3880         if (last_ptr) {
3881                 spin_lock(&last_ptr->lock);
3882                 if (last_ptr->block_group)
3883                         hint_byte = last_ptr->window_start;
3884                 if (last_ptr->fragmented) {
3885                         /*
3886                          * We still set window_start so we can keep track of the
3887                          * last place we found an allocation to try and save
3888                          * some time.
3889                          */
3890                         hint_byte = last_ptr->window_start;
3891                         use_cluster = false;
3892                 }
3893                 spin_unlock(&last_ptr->lock);
3894         }
3895
3896         ffe_ctl.search_start = max(ffe_ctl.search_start,
3897                                    first_logical_byte(fs_info, 0));
3898         ffe_ctl.search_start = max(ffe_ctl.search_start, hint_byte);
3899         if (ffe_ctl.search_start == hint_byte) {
3900                 block_group = btrfs_lookup_block_group(fs_info,
3901                                                        ffe_ctl.search_start);
3902                 /*
3903                  * We don't want to use the block group if it doesn't match our
3904                  * allocation bits, or if it's not cached.
3905                  *
3906                  * However, if we are re-searching with an ideal block group
3907                  * picked out then we don't care whether the block group is cached.
3908                  */
3909                 if (block_group && block_group_bits(block_group, flags) &&
3910                     block_group->cached != BTRFS_CACHE_NO) {
3911                         down_read(&space_info->groups_sem);
3912                         if (list_empty(&block_group->list) ||
3913                             block_group->ro) {
3914                                 /*
3915                                  * Someone is removing this block group;
3916                                  * we can't jump to the have_block_group
3917                                  * label because our list pointers are not
3918                                  * valid.
3919                                  */
3920                                 btrfs_put_block_group(block_group);
3921                                 up_read(&space_info->groups_sem);
3922                         } else {
3923                                 ffe_ctl.index = btrfs_bg_flags_to_raid_index(
3924                                                 block_group->flags);
3925                                 btrfs_lock_block_group(block_group, delalloc);
3926                                 goto have_block_group;
3927                         }
3928                 } else if (block_group) {
3929                         btrfs_put_block_group(block_group);
3930                 }
3931         }
3932 search:
3933         ffe_ctl.have_caching_bg = false;
3934         if (ffe_ctl.index == btrfs_bg_flags_to_raid_index(flags) ||
3935             ffe_ctl.index == 0)
3936                 full_search = true;
3937         down_read(&space_info->groups_sem);
3938         list_for_each_entry(block_group,
3939                             &space_info->block_groups[ffe_ctl.index], list) {
3940                 /* If the block group is read-only, we can skip it entirely. */
3941                 if (unlikely(block_group->ro))
3942                         continue;
3943
3944                 btrfs_grab_block_group(block_group, delalloc);
3945                 ffe_ctl.search_start = block_group->key.objectid;
3946
3947                 /*
3948                  * this can happen if we end up cycling through all the
3949                  * raid types, but we want to make sure we only allocate
3950                  * for the proper type.
3951                  */
3952                 if (!block_group_bits(block_group, flags)) {
3953                         u64 extra = BTRFS_BLOCK_GROUP_DUP |
3954                                 BTRFS_BLOCK_GROUP_RAID1_MASK |
3955                                 BTRFS_BLOCK_GROUP_RAID56_MASK |
3956                                 BTRFS_BLOCK_GROUP_RAID10;
3957
3958                         /*
3959                          * if they asked for extra copies and this block group
3960                          * doesn't provide them, bail.  This does allow us to
3961                          * fill raid0 from raid1.
3962                          */
3963                         if ((flags & extra) && !(block_group->flags & extra))
3964                                 goto loop;
3965
3966                         /*
3967                          * This block group has different flags than we want.
3968                          * It's possible that we have MIXED_GROUP flag but no
3969                          * block group is mixed.  Just skip such block group.
3970                          */
3971                         btrfs_release_block_group(block_group, delalloc);
3972                         continue;
3973                 }
3974
3975 have_block_group:
3976                 ffe_ctl.cached = btrfs_block_group_cache_done(block_group);
3977                 if (unlikely(!ffe_ctl.cached)) {
3978                         ffe_ctl.have_caching_bg = true;
3979                         ret = btrfs_cache_block_group(block_group, 0);
3980
3981                         /*
3982                          * If we get ENOMEM here or something else we want to
3983                          * try other block groups, because it may not be fatal.
3984                          * However if we can't find anything else we need to
3985                          * save our return here so that we return the actual
3986                          * error that caused problems, not ENOSPC.
3987                          */
3988                         if (ret < 0) {
3989                                 if (!cache_block_group_error)
3990                                         cache_block_group_error = ret;
3991                                 ret = 0;
3992                                 goto loop;
3993                         }
3994                         ret = 0;
3995                 }
3996
3997                 if (unlikely(block_group->cached == BTRFS_CACHE_ERROR)) {
3998                         if (!cache_block_group_error)
3999                                 cache_block_group_error = -EIO;
4000                         goto loop;
4001                 }
4002
4003                 /*
4004                  * OK, we want to try and use the cluster allocator, so
4005                  * let's look there.
4006                  */
4007                 if (last_ptr && use_cluster) {
4008                         struct btrfs_block_group_cache *cluster_bg = NULL;
4009
4010                         ret = find_free_extent_clustered(block_group, last_ptr,
4011                                                          &ffe_ctl, &cluster_bg);
4012
4013                         if (ret == 0) {
4014                                 if (cluster_bg && cluster_bg != block_group) {
4015                                         btrfs_release_block_group(block_group,
4016                                                                   delalloc);
4017                                         block_group = cluster_bg;
4018                                 }
4019                                 goto checks;
4020                         } else if (ret == -EAGAIN) {
4021                                 goto have_block_group;
4022                         } else if (ret > 0) {
4023                                 goto loop;
4024                         }
4025                         /* ret == -ENOENT case falls through */
4026                 }
4027
4028                 ret = find_free_extent_unclustered(block_group, last_ptr,
4029                                                    &ffe_ctl);
4030                 if (ret == -EAGAIN)
4031                         goto have_block_group;
4032                 else if (ret > 0)
4033                         goto loop;
4034                 /* ret == 0 case falls through */
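                /*
                 * ffe_ctl.found_offset now points at a free range in this
                 * block group that is big enough.  Align the start to the
                 * stripe size, make sure the aligned extent still fits in
                 * the block group and reserve the bytes; any alignment slack
                 * is returned to the free space cache.
                 */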
4035 checks:
4036                 ffe_ctl.search_start = round_up(ffe_ctl.found_offset,
4037                                              fs_info->stripesize);
4038
4039                 /* Move on to the next group if this extent would run past the end of it. */
4040                 if (ffe_ctl.search_start + num_bytes >
4041                     block_group->key.objectid + block_group->key.offset) {
4042                         btrfs_add_free_space(block_group, ffe_ctl.found_offset,
4043                                              num_bytes);
4044                         goto loop;
4045                 }
4046
4047                 if (ffe_ctl.found_offset < ffe_ctl.search_start)
4048                         btrfs_add_free_space(block_group, ffe_ctl.found_offset,
4049                                 ffe_ctl.search_start - ffe_ctl.found_offset);
4050
4051                 ret = btrfs_add_reserved_bytes(block_group, ram_bytes,
4052                                 num_bytes, delalloc);
4053                 if (ret == -EAGAIN) {
4054                         btrfs_add_free_space(block_group, ffe_ctl.found_offset,
4055                                              num_bytes);
4056                         goto loop;
4057                 }
4058                 btrfs_inc_block_group_reservations(block_group);
4059
4060                 /* We are all good, let's return. */
4061                 ins->objectid = ffe_ctl.search_start;
4062                 ins->offset = num_bytes;
4063
4064                 trace_btrfs_reserve_extent(block_group, ffe_ctl.search_start,
4065                                            num_bytes);
4066                 btrfs_release_block_group(block_group, delalloc);
4067                 break;
4068 loop:
4069                 ffe_ctl.retry_clustered = false;
4070                 ffe_ctl.retry_unclustered = false;
4071                 BUG_ON(btrfs_bg_flags_to_raid_index(block_group->flags) !=
4072                        ffe_ctl.index);
4073                 btrfs_release_block_group(block_group, delalloc);
4074                 cond_resched();
4075         }
4076         up_read(&space_info->groups_sem);
4077
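        /*
         * Nothing was reserved in this pass.  Let find_free_extent_update_loop()
         * decide whether to widen the search and try again (a return value > 0
         * means do another pass) or to give up with an error.
         */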
4078         ret = find_free_extent_update_loop(fs_info, last_ptr, ins, &ffe_ctl,
4079                                            full_search, use_cluster);
4080         if (ret > 0)
4081                 goto search;
4082
4083         if (ret == -ENOSPC && !cache_block_group_error) {
4084                 /*
4085                  * Use ffe_ctl->total_free_space as fallback if we can't find
4086                  * any contiguous hole.
4087                  */
4088                 if (!ffe_ctl.max_extent_size)
4089                         ffe_ctl.max_extent_size = ffe_ctl.total_free_space;
4090                 spin_lock(&space_info->lock);
4091                 space_info->max_extent_size = ffe_ctl.max_extent_size;
4092                 spin_unlock(&space_info->lock);
4093                 ins->offset = ffe_ctl.max_extent_size;
4094         } else if (ret == -ENOSPC) {
4095                 ret = cache_block_group_error;
4096         }
4097         return ret;
4098 }
4099
4100 /*
4101  * btrfs_reserve_extent - entry point to the extent allocator. Tries to find a
4102  *                        hole that is at least as big as @num_bytes.
4103  *
4104  * @root           -    The root that will contain this extent
4105  *
4106  * @ram_bytes      -    The amount of space in RAM that @num_bytes takes. This
4107  *                      is used for accounting purposes. This value differs
4108  *                      from @num_bytes only in the case of compressed extents.
4109  *
4110  * @num_bytes      -    Number of bytes to allocate on-disk.
4111  *
4112  * @min_alloc_size -    The minimum amount of space that the allocator
4113  *                      should try to satisfy. In some cases @num_bytes may
4114  *                      be larger than what is strictly required, and if the
4115  *                      filesystem is fragmented such an allocation fails.
4116  *                      @min_alloc_size gives the allocator a chance to fall
4117  *                      back and satisfy the smaller allocation instead.
4118  *
4119  * @empty_size     -    A hint that you plan on doing more COW. This is the
4120  *                      size in bytes the allocator should try to find free
4121  *                      next to the block it returns.  This is just a hint and
4122  *                      may be ignored by the allocator.
4123  *
4124  * @hint_byte      -    Hint to the allocator to start searching above the byte
4125  *                      address passed. It might be ignored.
4126  *
4127  * @ins            -    This key is modified to record the found hole. It will
4128  *                      have the following values:
4129  *                      ins->objectid == start position
4130  *                      ins->flags == BTRFS_EXTENT_ITEM_KEY
4131  *                      ins->offset == the size of the hole.
4132  *
4133  * @is_data        -    Boolean flag indicating whether an extent is
4134  *                      allocated for data (true) or metadata (false)
4135  *
4136  * @delalloc       -    Boolean flag indicating whether this allocation is for
4137  *                      delalloc or not. If true, the data_rwsem of the
4138  *                      block groups is going to be acquired.
4139  *
4140  *
4141  * Returns 0 when an allocation succeeded or < 0 when an error occurred. In
4142  * case -ENOSPC is returned then @ins->offset will contain the size of the
4143  * largest available hole the allocator managed to find.
4144  */
4145 int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes,
4146                          u64 num_bytes, u64 min_alloc_size,
4147                          u64 empty_size, u64 hint_byte,
4148                          struct btrfs_key *ins, int is_data, int delalloc)
4149 {
4150         struct btrfs_fs_info *fs_info = root->fs_info;
4151         bool final_tried = num_bytes == min_alloc_size;
4152         u64 flags;
4153         int ret;
4154
4155         flags = get_alloc_profile_by_root(root, is_data);
4156 again:
4157         WARN_ON(num_bytes < fs_info->sectorsize);
4158         ret = find_free_extent(root, ram_bytes, num_bytes, empty_size,
4159                                hint_byte, ins, flags, delalloc);
4160         if (!ret && !is_data) {
4161                 btrfs_dec_block_group_reservations(fs_info, ins->objectid);
4162         } else if (ret == -ENOSPC) {
4163                 if (!final_tried && ins->offset) {
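                        /*
                         * ins->offset holds the largest hole the allocator
                         * could find.  Halve the request, clamp it to that
                         * hole, round it down to the sector size, keep it at
                         * least min_alloc_size, and retry.
                         */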
4164                         num_bytes = min(num_bytes >> 1, ins->offset);
4165                         num_bytes = round_down(num_bytes,
4166                                                fs_info->sectorsize);
4167                         num_bytes = max(num_bytes, min_alloc_size);
4168                         ram_bytes = num_bytes;
4169                         if (num_bytes == min_alloc_size)
4170                                 final_tried = true;
4171                         goto again;
4172                 } else if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) {
4173                         struct btrfs_space_info *sinfo;
4174
4175                         sinfo = btrfs_find_space_info(fs_info, flags);
4176                         btrfs_err(fs_info,
4177                                   "allocation failed flags %llu, wanted %llu",
4178                                   flags, num_bytes);
4179                         if (sinfo)
4180                                 btrfs_dump_space_info(fs_info, sinfo,
4181                                                       num_bytes, 1);
4182                 }
4183         }
4184
4185         return ret;
4186 }
4187
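/*
 * Common helper behind btrfs_free_reserved_extent() and
 * btrfs_free_and_pin_reserved_extent().  If @pin is set the extent is pinned
 * down so it is freed for real when the transaction commits; otherwise it is
 * optionally discarded and handed straight back to the free space cache, and
 * the reservation taken at allocation time is released.
 */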
4188 static int __btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
4189                                         u64 start, u64 len,
4190                                         int pin, int delalloc)
4191 {
4192         struct btrfs_block_group_cache *cache;
4193         int ret = 0;
4194
4195         cache = btrfs_lookup_block_group(fs_info, start);
4196         if (!cache) {
4197                 btrfs_err(fs_info, "Unable to find block group for %llu",
4198                           start);
4199                 return -ENOSPC;
4200         }
4201
4202         if (pin)
4203                 pin_down_extent(cache, start, len, 1);
4204         else {
4205                 if (btrfs_test_opt(fs_info, DISCARD))
4206                         ret = btrfs_discard_extent(fs_info, start, len, NULL);
4207                 btrfs_add_free_space(cache, start, len);
4208                 btrfs_free_reserved_bytes(cache, len, delalloc);
4209                 trace_btrfs_reserved_extent_free(fs_info, start, len);
4210         }
4211
4212         btrfs_put_block_group(cache);
4213         return ret;
4214 }
4215
4216 int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
4217                                u64 start, u64 len, int delalloc)
4218 {
4219         return __btrfs_free_reserved_extent(fs_info, start, len, 0, delalloc);
4220 }
4221
4222 int btrfs_free_and_pin_reserved_extent(struct btrfs_fs_info *fs_info,
4223                                        u64 start, u64 len)
4224 {
4225         return __btrfs_free_reserved_extent(fs_info, start, len, 1, 0);
4226 }
4227
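/*
 * Insert the extent item for a newly allocated data extent into the extent
 * tree, together with an inline backref: a shared data ref when @parent is
 * set, a keyed extent data ref otherwise.  The range is then removed from
 * the free space tree and the block group usage counters are updated.
 */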
4228 static int alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4229                                       u64 parent, u64 root_objectid,
4230                                       u64 flags, u64 owner, u64 offset,
4231                                       struct btrfs_key *ins, int ref_mod)
4232 {
4233         struct btrfs_fs_info *fs_info = trans->fs_info;
4234         int ret;
4235         struct btrfs_extent_item *extent_item;
4236         struct btrfs_extent_inline_ref *iref;
4237         struct btrfs_path *path;
4238         struct extent_buffer *leaf;
4239         int type;
4240         u32 size;
4241
4242         if (parent > 0)
4243                 type = BTRFS_SHARED_DATA_REF_KEY;
4244         else
4245                 type = BTRFS_EXTENT_DATA_REF_KEY;
4246
4247         size = sizeof(*extent_item) + btrfs_extent_inline_ref_size(type);
4248
4249         path = btrfs_alloc_path();
4250         if (!path)
4251                 return -ENOMEM;
4252
4253         path->leave_spinning = 1;
4254         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
4255                                       ins, size);
4256         if (ret) {
4257                 btrfs_free_path(path);
4258                 return ret;
4259         }
4260
4261         leaf = path->nodes[0];
4262         extent_item = btrfs_item_ptr(leaf, path->slots[0],
4263                                      struct btrfs_extent_item);
4264         btrfs_set_extent_refs(leaf, extent_item, ref_mod);
4265         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4266         btrfs_set_extent_flags(leaf, extent_item,
4267                                flags | BTRFS_EXTENT_FLAG_DATA);
4268
4269         iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
4270         btrfs_set_extent_inline_ref_type(leaf, iref, type);
4271         if (parent > 0) {
4272                 struct btrfs_shared_data_ref *ref;
4273                 ref = (struct btrfs_shared_data_ref *)(iref + 1);
4274                 btrfs_set_extent_inline_ref_offset(leaf, iref, parent);
4275                 btrfs_set_shared_data_ref_count(leaf, ref, ref_mod);
4276         } else {
4277                 struct btrfs_extent_data_ref *ref;
4278                 ref = (struct btrfs_extent_data_ref *)(&iref->offset);
4279                 btrfs_set_extent_data_ref_root(leaf, ref, root_objectid);
4280                 btrfs_set_extent_data_ref_objectid(leaf, ref, owner);
4281                 btrfs_set_extent_data_ref_offset(leaf, ref, offset);
4282                 btrfs_set_extent_data_ref_count(leaf, ref, ref_mod);
4283         }
4284
4285         btrfs_mark_buffer_dirty(path->nodes[0]);
4286         btrfs_free_path(path);
4287
4288         ret = remove_from_free_space_tree(trans, ins->objectid, ins->offset);
4289         if (ret)
4290                 return ret;
4291
4292         ret = btrfs_update_block_group(trans, ins->objectid, ins->offset, 1);
4293         if (ret) { /* -ENOENT, logic error */
4294                 btrfs_err(fs_info, "update block group failed for %llu %llu",
4295                         ins->objectid, ins->offset);
4296                 BUG();
4297         }
4298         trace_btrfs_reserved_extent_alloc(fs_info, ins->objectid, ins->offset);
4299         return ret;
4300 }
4301
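/*
 * Metadata counterpart of alloc_reserved_file_extent().  With the
 * SKINNY_METADATA feature this inserts a METADATA_ITEM keyed by the block's
 * level, otherwise an EXTENT_ITEM followed by a tree_block_info.  Either way
 * an inline shared or keyed tree block backref is added, the range is removed
 * from the free space tree and the block group usage is updated.
 */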
4302 static int alloc_reserved_tree_block(struct btrfs_trans_handle *trans,
4303                                      struct btrfs_delayed_ref_node *node,
4304                                      struct btrfs_delayed_extent_op *extent_op)
4305 {
4306         struct btrfs_fs_info *fs_info = trans->fs_info;
4307         int ret;
4308         struct btrfs_extent_item *extent_item;
4309         struct btrfs_key extent_key;
4310         struct btrfs_tree_block_info *block_info;
4311         struct btrfs_extent_inline_ref *iref;
4312         struct btrfs_path *path;
4313         struct extent_buffer *leaf;
4314         struct btrfs_delayed_tree_ref *ref;
4315         u32 size = sizeof(*extent_item) + sizeof(*iref);
4316         u64 num_bytes;
4317         u64 flags = extent_op->flags_to_set;
4318         bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
4319
4320         ref = btrfs_delayed_node_to_tree_ref(node);
4321
4322         extent_key.objectid = node->bytenr;
4323         if (skinny_metadata) {
4324                 extent_key.offset = ref->level;
4325                 extent_key.type = BTRFS_METADATA_ITEM_KEY;
4326                 num_bytes = fs_info->nodesize;
4327         } else {
4328                 extent_key.offset = node->num_bytes;
4329                 extent_key.type = BTRFS_EXTENT_ITEM_KEY;
4330                 size += sizeof(*block_info);
4331                 num_bytes = node->num_bytes;
4332         }
4333
4334         path = btrfs_alloc_path();
4335         if (!path)
4336                 return -ENOMEM;
4337
4338         path->leave_spinning = 1;
4339         ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path,
4340                                       &extent_key, size);
4341         if (ret) {
4342                 btrfs_free_path(path);
4343                 return ret;
4344         }
4345
4346         leaf = path->nodes[0];
4347         extent_item = btrfs_item_ptr(leaf, path->slots[0],
4348                                      struct btrfs_extent_item);
4349         btrfs_set_extent_refs(leaf, extent_item, 1);
4350         btrfs_set_extent_generation(leaf, extent_item, trans->transid);
4351         btrfs_set_extent_flags(leaf, extent_item,
4352                                flags | BTRFS_EXTENT_FLAG_TREE_BLOCK);
4353
4354         if (skinny_metadata) {
4355                 iref = (struct btrfs_extent_inline_ref *)(extent_item + 1);
4356         } else {
4357                 block_info = (struct btrfs_tree_block_info *)(extent_item + 1);
4358                 btrfs_set_tree_block_key(leaf, block_info, &extent_op->key);
4359                 btrfs_set_tree_block_level(leaf, block_info, ref->level);
4360                 iref = (struct btrfs_extent_inline_ref *)(block_info + 1);
4361         }
4362
4363         if (node->type == BTRFS_SHARED_BLOCK_REF_KEY) {
4364                 BUG_ON(!(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
4365                 btrfs_set_extent_inline_ref_type(leaf, iref,
4366                                                  BTRFS_SHARED_BLOCK_REF_KEY);
4367                 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->parent);
4368         } else {
4369                 btrfs_set_extent_inline_ref_type(leaf, iref,
4370                                                  BTRFS_TREE_BLOCK_REF_KEY);
4371                 btrfs_set_extent_inline_ref_offset(leaf, iref, ref->root);
4372         }
4373
4374         btrfs_mark_buffer_dirty(leaf);
4375         btrfs_free_path(path);
4376
4377         ret = remove_from_free_space_tree(trans, extent_key.objectid,
4378                                           num_bytes);
4379         if (ret)
4380                 return ret;
4381
4382         ret = btrfs_update_block_group(trans, extent_key.objectid,
4383                                        fs_info->nodesize, 1);
4384         if (ret) { /* -ENOENT, logic error */
4385                 btrfs_err(fs_info, "update block group failed for %llu %llu",
4386                         extent_key.objectid, extent_key.offset);
4387                 BUG();
4388         }
4389
4390         trace_btrfs_reserved_extent_alloc(fs_info, extent_key.objectid,
4391                                           fs_info->nodesize);
4392         return ret;
4393 }
4394
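/*
 * Queue a delayed ref that will insert the extent item and data backref for
 * the extent described by @ins on behalf of @root.
 */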
4395 int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
4396                                      struct btrfs_root *root, u64 owner,
4397                                      u64 offset, u64 ram_bytes,
4398                                      struct btrfs_key *ins)
4399 {
4400         struct btrfs_ref generic_ref = { 0 };
4401         int ret;
4402
4403         BUG_ON(root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID);
4404
4405         btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
4406                                ins->objectid, ins->offset, 0);
4407         btrfs_init_data_ref(&generic_ref, root->root_key.objectid, owner, offset);
4408         btrfs_ref_tree_mod(root->fs_info, &generic_ref);
4409         ret = btrfs_add_delayed_data_ref(trans, &generic_ref,
4410                                          ram_bytes, NULL, NULL);
4411         return ret;
4412 }
4413
4414 /*
4415  * This is used by the tree logging recovery code.  It records that
4416  * an extent has been allocated and makes sure to clear the free
4417  * space cache bits as well.
4418  */
4419 int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
4420                                    u64 root_objectid, u64 owner, u64 offset,
4421                                    struct btrfs_key *ins)
4422 {
4423         struct btrfs_fs_info *fs_info = trans->fs_info;
4424         int ret;
4425         struct btrfs_block_group_cache *block_group;
4426         struct btrfs_space_info *space_info;
4427
4428         /*
4429          * Mixed block groups will have their extents excluded before the log is
4430          * processed, so we only need to do the exclude dance if this fs isn't mixed.
4431          */
4432         if (!btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
4433                 ret = __exclude_logged_extent(fs_info, ins->objectid,
4434                                               ins->offset);
4435                 if (ret)
4436                         return ret;
4437         }
4438
4439         block_group = btrfs_lookup_block_group(fs_info, ins->objectid);
4440         if (!block_group)
4441                 return -EINVAL;
4442
4443         space_info = block_group->space_info;
4444         spin_lock(&space_info->lock);
4445         spin_lock(&block_group->lock);
4446         space_info->bytes_reserved += ins->offset;
4447         block_group->reserved += ins->offset;
4448         spin_unlock(&block_group->lock);
4449         spin_unlock(&space_info->lock);
4450
4451         ret = alloc_reserved_file_extent(trans, 0, root_objectid, 0, owner,
4452                                          offset, ins, 1);
4453         if (ret)
4454                 btrfs_pin_extent(fs_info, ins->objectid, ins->offset, 1);
4455         btrfs_put_block_group(block_group);
4456         return ret;
4457 }
4458
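/*
 * Set up the extent buffer for a freshly allocated tree block: find or create
 * the buffer, lock it, wipe stale state, initialize its header for the
 * current transaction and mark it dirty in the appropriate io tree (log
 * trees use dirty_log_pages, everything else the transaction's dirty_pages).
 */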
4459 static struct extent_buffer *
4460 btrfs_init_new_buffer(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4461                       u64 bytenr, int level, u64 owner)
4462 {
4463         struct btrfs_fs_info *fs_info = root->fs_info;
4464         struct extent_buffer *buf;
4465
4466         buf = btrfs_find_create_tree_block(fs_info, bytenr);
4467         if (IS_ERR(buf))
4468                 return buf;
4469
4470         /*
4471          * Extra safety check in case the extent tree is corrupted and the
4472          * extent allocator chooses a tree block which is already in use and
4473          * locked.
4474          */
4475         if (buf->lock_owner == current->pid) {
4476                 btrfs_err_rl(fs_info,
4477 "tree block %llu owner %llu already locked by pid=%d, extent tree corruption detected",
4478                         buf->start, btrfs_header_owner(buf), current->pid);
4479                 free_extent_buffer(buf);
4480                 return ERR_PTR(-EUCLEAN);
4481         }
4482
4483         btrfs_set_buffer_lockdep_class(owner, buf, level);
4484         btrfs_tree_lock(buf);
4485         btrfs_clean_tree_block(buf);
4486         clear_bit(EXTENT_BUFFER_STALE, &buf->bflags);
4487
4488         btrfs_set_lock_blocking_write(buf);
4489         set_extent_buffer_uptodate(buf);
4490
4491         memzero_extent_buffer(buf, 0, sizeof(struct btrfs_header));
4492         btrfs_set_header_level(buf, level);
4493         btrfs_set_header_bytenr(buf, buf->start);
4494         btrfs_set_header_generation(buf, trans->transid);
4495         btrfs_set_header_backref_rev(buf, BTRFS_MIXED_BACKREF_REV);
4496         btrfs_set_header_owner(buf, owner);
4497         write_extent_buffer_fsid(buf, fs_info->fs_devices->metadata_uuid);
4498         write_extent_buffer_chunk_tree_uuid(buf, fs_info->chunk_tree_uuid);
4499         if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) {
4500                 buf->log_index = root->log_transid % 2;
4501                 /*
4502                  * We allow two log transactions at a time; use different
4503                  * extent bits to differentiate their dirty pages.
4504                  */
4505                 if (buf->log_index == 0)
4506                         set_extent_dirty(&root->dirty_log_pages, buf->start,
4507                                         buf->start + buf->len - 1, GFP_NOFS);
4508                 else
4509                         set_extent_new(&root->dirty_log_pages, buf->start,
4510                                         buf->start + buf->len - 1);
4511         } else {
4512                 buf->log_index = -1;
4513                 set_extent_dirty(&trans->transaction->dirty_pages, buf->start,
4514                          buf->start + buf->len - 1, GFP_NOFS);
4515         }
4516         trans->dirty = true;
4517         /* this returns a buffer locked for blocking */
4518         return buf;
4519 }
4520
4521 /*
4522  * Finds a free extent and does all the dirty work required for allocation;
4523  * returns the tree buffer or an ERR_PTR on error.
4524  */
4525 struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
4526                                              struct btrfs_root *root,
4527                                              u64 parent, u64 root_objectid,
4528                                              const struct btrfs_disk_key *key,
4529                                              int level, u64 hint,
4530                                              u64 empty_size)
4531 {
4532         struct btrfs_fs_info *fs_info = root->fs_info;
4533         struct btrfs_key ins;
4534         struct btrfs_block_rsv *block_rsv;
4535         struct extent_buffer *buf;
4536         struct btrfs_delayed_extent_op *extent_op;
4537         struct btrfs_ref generic_ref = { 0 };
4538         u64 flags = 0;
4539         int ret;
4540         u32 blocksize = fs_info->nodesize;
4541         bool skinny_metadata = btrfs_fs_incompat(fs_info, SKINNY_METADATA);
4542
4543 #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
4544         if (btrfs_is_testing(fs_info)) {
4545                 buf = btrfs_init_new_buffer(trans, root, root->alloc_bytenr,
4546                                             level, root_objectid);
4547                 if (!IS_ERR(buf))
4548                         root->alloc_bytenr += blocksize;
4549                 return buf;
4550         }
4551 #endif
4552
4553         block_rsv = btrfs_use_block_rsv(trans, root, blocksize);
4554         if (IS_ERR(block_rsv))
4555                 return ERR_CAST(block_rsv);
4556
4557         ret = btrfs_reserve_extent(root, blocksize, blocksize, blocksize,
4558                                    empty_size, hint, &ins, 0, 0);
4559         if (ret)
4560                 goto out_unuse;
4561
4562         buf = btrfs_init_new_buffer(trans, root, ins.objectid, level,
4563                                     root_objectid);
4564         if (IS_ERR(buf)) {
4565                 ret = PTR_ERR(buf);
4566                 goto out_free_reserved;
4567         }
4568
4569         if (root_objectid == BTRFS_TREE_RELOC_OBJECTID) {
4570                 if (parent == 0)
4571                         parent = ins.objectid;
4572                 flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
4573         } else
4574                 BUG_ON(parent > 0);
4575
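        /*
         * For everything except the tree log, queue a delayed ref that will
         * insert the extent item and backref for this block; the extent op
         * carries the key, flags and level that should be set on it.
         */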
4576         if (root_objectid != BTRFS_TREE_LOG_OBJECTID) {
4577                 extent_op = btrfs_alloc_delayed_extent_op();
4578                 if (!extent_op) {
4579                         ret = -ENOMEM;
4580                         goto out_free_buf;
4581                 }
4582                 if (key)
4583                         memcpy(&extent_op->key, key, sizeof(extent_op->key));
4584                 else
4585                         memset(&extent_op->key, 0, sizeof(extent_op->key));
4586                 extent_op->flags_to_set = flags;
4587                 extent_op->update_key = skinny_metadata ? false : true;
4588                 extent_op->update_flags = true;
4589                 extent_op->is_data = false;
4590                 extent_op->level = level;
4591
4592                 btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
4593                                        ins.objectid, ins.offset, parent);
4594                 generic_ref.real_root = root->root_key.objectid;
4595                 btrfs_init_tree_ref(&generic_ref, level, root_objectid);
4596                 btrfs_ref_tree_mod(fs_info, &generic_ref);
4597                 ret = btrfs_add_delayed_tree_ref(trans, &generic_ref,
4598                                                  extent_op, NULL, NULL);
4599                 if (ret)
4600                         goto out_free_delayed;
4601         }
4602         return buf;
4603
4604 out_free_delayed:
4605         btrfs_free_delayed_extent_op(extent_op);
4606 out_free_buf:
4607         btrfs_tree_unlock(buf);
4608         free_extent_buffer(buf);
4609 out_free_reserved:
4610         btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0);
4611 out_unuse:
4612         btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize);
4613         return ERR_PTR(ret);
4614 }
4615
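/*
 * State carried across the tree walk done by btrfs_drop_snapshot(): per-level
 * caches of reference counts and flags, the current stage (DROP_REFERENCE or
 * UPDATE_BACKREF), the progress keys used to resume an interrupted drop and
 * the readahead window bookkeeping.
 */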
4616 struct walk_control {
4617         u64 refs[BTRFS_MAX_LEVEL];
4618         u64 flags[BTRFS_MAX_LEVEL];
4619         struct btrfs_key update_progress;
4620         struct btrfs_key drop_progress;
4621         int drop_level;
4622         int stage;
4623         int level;
4624         int shared_level;
4625         int update_ref;
4626         int keep_locks;
4627         int reada_slot;
4628         int reada_count;
4629         int restarted;
4630 };
4631
4632 #define DROP_REFERENCE  1
4633 #define UPDATE_BACKREF  2
4634
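/*
 * Readahead helper for the walk: prefetch child blocks of the node at
 * wc->level that we still intend to visit, growing or shrinking the
 * readahead window depending on how much of the previous batch has been
 * consumed.
 */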
4635 static noinline void reada_walk_down(struct btrfs_trans_handle *trans,
4636                                      struct btrfs_root *root,
4637                                      struct walk_control *wc,
4638                                      struct btrfs_path *path)
4639 {
4640         struct btrfs_fs_info *fs_info = root->fs_info;
4641         u64 bytenr;
4642         u64 generation;
4643         u64 refs;
4644         u64 flags;
4645         u32 nritems;
4646         struct btrfs_key key;
4647         struct extent_buffer *eb;
4648         int ret;
4649         int slot;
4650         int nread = 0;
4651
4652         if (path->slots[wc->level] < wc->reada_slot) {
4653                 wc->reada_count = wc->reada_count * 2 / 3;
4654                 wc->reada_count = max(wc->reada_count, 2);
4655         } else {
4656                 wc->reada_count = wc->reada_count * 3 / 2;
4657                 wc->reada_count = min_t(int, wc->reada_count,
4658                                         BTRFS_NODEPTRS_PER_BLOCK(fs_info));
4659         }
4660
4661         eb = path->nodes[wc->level];
4662         nritems = btrfs_header_nritems(eb);
4663
4664         for (slot = path->slots[wc->level]; slot < nritems; slot++) {
4665                 if (nread >= wc->reada_count)
4666                         break;
4667
4668                 cond_resched();
4669                 bytenr = btrfs_node_blockptr(eb, slot);
4670                 generation = btrfs_node_ptr_generation(eb, slot);
4671
4672                 if (slot == path->slots[wc->level])
4673                         goto reada;
4674
4675                 if (wc->stage == UPDATE_BACKREF &&
4676                     generation <= root->root_key.offset)
4677                         continue;
4678
4679                 /* We don't lock the tree block, it's OK to be racy here */
4680                 ret = btrfs_lookup_extent_info(trans, fs_info, bytenr,
4681                                                wc->level - 1, 1, &refs,
4682                                                &flags);
4683                 /* We don't care about errors in readahead. */
4684                 if (ret < 0)
4685                         continue;
4686                 BUG_ON(refs == 0);
4687
4688                 if (wc->stage == DROP_REFERENCE) {
4689                         if (refs == 1)
4690                                 goto reada;
4691
4692                         if (wc->level == 1 &&
4693                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
4694                                 continue;
4695                         if (!wc->update_ref ||
4696                             generation <= root->root_key.offset)
4697                                 continue;
4698                         btrfs_node_key_to_cpu(eb, &key, slot);
4699                         ret = btrfs_comp_cpu_keys(&key,
4700                                                   &wc->update_progress);
4701                         if (ret < 0)
4702                                 continue;
4703                 } else {
4704                         if (wc->level == 1 &&
4705                             (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF))
4706                                 continue;
4707                 }
4708 reada:
4709                 readahead_tree_block(fs_info, bytenr);
4710                 nread++;
4711         }
4712         wc->reada_slot = slot;
4713 }
4714
4715 /*
4716  * helper to process tree block while walking down the tree.
4717  *
4718  * when wc->stage == UPDATE_BACKREF, this function updates
4719  * back refs for pointers in the block.
4720  *
4721  * NOTE: return value 1 means we should stop walking down.
4722  */
4723 static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
4724                                    struct btrfs_root *root,
4725                                    struct btrfs_path *path,
4726                                    struct walk_control *wc, int lookup_info)
4727 {
4728         struct btrfs_fs_info *fs_info = root->fs_info;
4729         int level = wc->level;
4730         struct extent_buffer *eb = path->nodes[level];
4731         u64 flag = BTRFS_BLOCK_FLAG_FULL_BACKREF;
4732         int ret;
4733
4734         if (wc->stage == UPDATE_BACKREF &&
4735             btrfs_header_owner(eb) != root->root_key.objectid)
4736                 return 1;
4737
4738         /*
4739          * When the reference count of a tree block is 1, it won't increase
4740          * again. Once the full backref flag is set, we never clear it.
4741          */
4742         if (lookup_info &&
4743             ((wc->stage == DROP_REFERENCE && wc->refs[level] != 1) ||
4744              (wc->stage == UPDATE_BACKREF && !(wc->flags[level] & flag)))) {
4745                 BUG_ON(!path->locks[level]);
4746                 ret = btrfs_lookup_extent_info(trans, fs_info,
4747                                                eb->start, level, 1,
4748                                                &wc->refs[level],
4749                                                &wc->flags[level]);
4750                 BUG_ON(ret == -ENOMEM);
4751                 if (ret)
4752                         return ret;
4753                 BUG_ON(wc->refs[level] == 0);
4754         }
4755
4756         if (wc->stage == DROP_REFERENCE) {
4757                 if (wc->refs[level] > 1)
4758                         return 1;
4759
4760                 if (path->locks[level] && !wc->keep_locks) {
4761                         btrfs_tree_unlock_rw(eb, path->locks[level]);
4762                         path->locks[level] = 0;
4763                 }
4764                 return 0;
4765         }
4766
4767         /* wc->stage == UPDATE_BACKREF */
4768         if (!(wc->flags[level] & flag)) {
4769                 BUG_ON(!path->locks[level]);
4770                 ret = btrfs_inc_ref(trans, root, eb, 1);
4771                 BUG_ON(ret); /* -ENOMEM */
4772                 ret = btrfs_dec_ref(trans, root, eb, 0);
4773                 BUG_ON(ret); /* -ENOMEM */
4774                 ret = btrfs_set_disk_extent_flags(trans, eb->start,
4775                                                   eb->len, flag,
4776                                                   btrfs_header_level(eb), 0);
4777                 BUG_ON(ret); /* -ENOMEM */
4778                 wc->flags[level] |= flag;
4779         }
4780
4781         /*
4782          * the block is shared by multiple trees, so it's not good to
4783          * keep the tree lock
4784          */
4785         if (path->locks[level] && level > 0) {
4786                 btrfs_tree_unlock_rw(eb, path->locks[level]);
4787                 path->locks[level] = 0;
4788         }
4789         return 0;
4790 }
4791
4792 /*
4793  * This is used to verify a ref exists for this root to deal with a bug where we
4794  * would have a drop_progress key that hadn't been updated properly.
4795  */
4796 static int check_ref_exists(struct btrfs_trans_handle *trans,
4797                             struct btrfs_root *root, u64 bytenr, u64 parent,
4798                             int level)
4799 {
4800         struct btrfs_path *path;
4801         struct btrfs_extent_inline_ref *iref;
4802         int ret;
4803
4804         path = btrfs_alloc_path();
4805         if (!path)
4806                 return -ENOMEM;
4807
4808         ret = lookup_extent_backref(trans, path, &iref, bytenr,
4809                                     root->fs_info->nodesize, parent,
4810                                     root->root_key.objectid, level, 0);
4811         btrfs_free_path(path);
4812         if (ret == -ENOENT)
4813                 return 0;
4814         if (ret < 0)
4815                 return ret;
4816         return 1;
4817 }
4818
4819 /*
4820  * Helper to process a tree block pointer.
4821  *
4822  * When wc->stage == DROP_REFERENCE, this function checks the
4823  * reference count of the block pointed to. If the block
4824  * is shared and we need to update back refs for the subtree
4825  * rooted at the block, this function changes wc->stage to
4826  * UPDATE_BACKREF. If the block is shared and there is no
4827  * need to update back refs, this function drops the reference
4828  * to the block.
4829  *
4830  * NOTE: return value 1 means we should stop walking down.
4831  */
4832 static noinline int do_walk_down(struct btrfs_trans_handle *trans,
4833                                  struct btrfs_root *root,
4834                                  struct btrfs_path *path,
4835                                  struct walk_control *wc, int *lookup_info)
4836 {
4837         struct btrfs_fs_info *fs_info = root->fs_info;
4838         u64 bytenr;
4839         u64 generation;
4840         u64 parent;
4841         struct btrfs_key key;
4842         struct btrfs_key first_key;
4843         struct btrfs_ref ref = { 0 };
4844         struct extent_buffer *next;
4845         int level = wc->level;
4846         int reada = 0;
4847         int ret = 0;
4848         bool need_account = false;
4849
4850         generation = btrfs_node_ptr_generation(path->nodes[level],
4851                                                path->slots[level]);
4852         /*
4853          * if the lower level block was created before the snapshot
4854          * was created, we know there is no need to update back refs
4855          * for the subtree
4856          */
4857         if (wc->stage == UPDATE_BACKREF &&
4858             generation <= root->root_key.offset) {
4859                 *lookup_info = 1;
4860                 return 1;
4861         }
4862
4863         bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
4864         btrfs_node_key_to_cpu(path->nodes[level], &first_key,
4865                               path->slots[level]);
4866
4867         next = find_extent_buffer(fs_info, bytenr);
4868         if (!next) {
4869                 next = btrfs_find_create_tree_block(fs_info, bytenr);
4870                 if (IS_ERR(next))
4871                         return PTR_ERR(next);
4872
4873                 btrfs_set_buffer_lockdep_class(root->root_key.objectid, next,
4874                                                level - 1);
4875                 reada = 1;
4876         }
4877         btrfs_tree_lock(next);
4878         btrfs_set_lock_blocking_write(next);
4879
4880         ret = btrfs_lookup_extent_info(trans, fs_info, bytenr, level - 1, 1,
4881                                        &wc->refs[level - 1],
4882                                        &wc->flags[level - 1]);
4883         if (ret < 0)
4884                 goto out_unlock;
4885
4886         if (unlikely(wc->refs[level - 1] == 0)) {
4887                 btrfs_err(fs_info, "Missing references.");
4888                 ret = -EIO;
4889                 goto out_unlock;
4890         }
4891         *lookup_info = 0;
4892
4893         if (wc->stage == DROP_REFERENCE) {
4894                 if (wc->refs[level - 1] > 1) {
4895                         need_account = true;
4896                         if (level == 1 &&
4897                             (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
4898                                 goto skip;
4899
4900                         if (!wc->update_ref ||
4901                             generation <= root->root_key.offset)
4902                                 goto skip;
4903
4904                         btrfs_node_key_to_cpu(path->nodes[level], &key,
4905                                               path->slots[level]);
4906                         ret = btrfs_comp_cpu_keys(&key, &wc->update_progress);
4907                         if (ret < 0)
4908                                 goto skip;
4909
4910                         wc->stage = UPDATE_BACKREF;
4911                         wc->shared_level = level - 1;
4912                 }
4913         } else {
4914                 if (level == 1 &&
4915                     (wc->flags[0] & BTRFS_BLOCK_FLAG_FULL_BACKREF))
4916                         goto skip;
4917         }
4918
4919         if (!btrfs_buffer_uptodate(next, generation, 0)) {
4920                 btrfs_tree_unlock(next);
4921                 free_extent_buffer(next);
4922                 next = NULL;
4923                 *lookup_info = 1;
4924         }
4925
4926         if (!next) {
4927                 if (reada && level == 1)
4928                         reada_walk_down(trans, root, wc, path);
4929                 next = read_tree_block(fs_info, bytenr, generation, level - 1,
4930                                        &first_key);
4931                 if (IS_ERR(next)) {
4932                         return PTR_ERR(next);
4933                 } else if (!extent_buffer_uptodate(next)) {
4934                         free_extent_buffer(next);
4935                         return -EIO;
4936                 }
4937                 btrfs_tree_lock(next);
4938                 btrfs_set_lock_blocking_write(next);
4939         }
4940
4941         level--;
4942         ASSERT(level == btrfs_header_level(next));
4943         if (level != btrfs_header_level(next)) {
4944                 btrfs_err(root->fs_info, "mismatched level");
4945                 ret = -EIO;
4946                 goto out_unlock;
4947         }
4948         path->nodes[level] = next;
4949         path->slots[level] = 0;
4950         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
4951         wc->level = level;
4952         if (wc->level == 1)
4953                 wc->reada_slot = 0;
4954         return 0;
4955 skip:
4956         wc->refs[level - 1] = 0;
4957         wc->flags[level - 1] = 0;
4958         if (wc->stage == DROP_REFERENCE) {
4959                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
4960                         parent = path->nodes[level]->start;
4961                 } else {
4962                         ASSERT(root->root_key.objectid ==
4963                                btrfs_header_owner(path->nodes[level]));
4964                         if (root->root_key.objectid !=
4965                             btrfs_header_owner(path->nodes[level])) {
4966                                 btrfs_err(root->fs_info,
4967                                                 "mismatched block owner");
4968                                 ret = -EIO;
4969                                 goto out_unlock;
4970                         }
4971                         parent = 0;
4972                 }
4973
4974                 /*
4975                  * If we had a drop_progress we need to verify the refs are set
4976                  * as expected.  If we find our ref then we know that from here
4977                  * on out everything should be correct, and we can clear the
4978                  * ->restarted flag.
4979                  */
4980                 if (wc->restarted) {
4981                         ret = check_ref_exists(trans, root, bytenr, parent,
4982                                                level - 1);
4983                         if (ret < 0)
4984                                 goto out_unlock;
4985                         if (ret == 0)
4986                                 goto no_delete;
4987                         ret = 0;
4988                         wc->restarted = 0;
4989                 }
4990
4991                 /*
4992                  * The reloc tree doesn't contribute to qgroup numbers, and we have
4993                  * already accounted for them at merge time (replace_path), so we
4994                  * can skip the expensive subtree trace here.
4995                  */
4996                 if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
4997                     need_account) {
4998                         ret = btrfs_qgroup_trace_subtree(trans, next,
4999                                                          generation, level - 1);
5000                         if (ret) {
5001                                 btrfs_err_rl(fs_info,
5002                                              "Error %d accounting shared subtree. Quota is out of sync, rescan required.",
5003                                              ret);
5004                         }
5005                 }
5006
5007                 /*
5008                  * We need to update the next key in our walk control so we can
5009                  * update the drop_progress key accordingly.  We don't care if
5010                  * find_next_key doesn't find a key because that means we're at
5011                  * the end and are going to clean up now.
5012                  */
5013                 wc->drop_level = level;
5014                 find_next_key(path, level, &wc->drop_progress);
5015
5016                 btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
5017                                        fs_info->nodesize, parent);
5018                 btrfs_init_tree_ref(&ref, level - 1, root->root_key.objectid);
5019                 ret = btrfs_free_extent(trans, &ref);
5020                 if (ret)
5021                         goto out_unlock;
5022         }
5023 no_delete:
5024         *lookup_info = 1;
5025         ret = 1;
5026
5027 out_unlock:
5028         btrfs_tree_unlock(next);
5029         free_extent_buffer(next);
5030
5031         return ret;
5032 }
5033
5034 /*
5035  * helper to process tree block while walking up the tree.
5036  *
5037  * when wc->stage == DROP_REFERENCE, this function drops
5038  * reference count on the block.
5039  *
5040  * when wc->stage == UPDATE_BACKREF, this function changes
5041  * wc->stage back to DROP_REFERENCE if we changed wc->stage
5042  * to UPDATE_BACKREF previously while processing the block.
5043  *
5044  * NOTE: return value 1 means we should stop walking up.
5045  */
5046 static noinline int walk_up_proc(struct btrfs_trans_handle *trans,
5047                                  struct btrfs_root *root,
5048                                  struct btrfs_path *path,
5049                                  struct walk_control *wc)
5050 {
5051         struct btrfs_fs_info *fs_info = root->fs_info;
5052         int ret;
5053         int level = wc->level;
5054         struct extent_buffer *eb = path->nodes[level];
5055         u64 parent = 0;
5056
5057         if (wc->stage == UPDATE_BACKREF) {
5058                 BUG_ON(wc->shared_level < level);
5059                 if (level < wc->shared_level)
5060                         goto out;
5061
5062                 ret = find_next_key(path, level + 1, &wc->update_progress);
5063                 if (ret > 0)
5064                         wc->update_ref = 0;
5065
5066                 wc->stage = DROP_REFERENCE;
5067                 wc->shared_level = -1;
5068                 path->slots[level] = 0;
5069
5070                 /*
5071                  * Check the reference count again if the block isn't locked.
5072                  * We should start walking down the tree again if the reference
5073                  * count is one.
5074                  */
5075                 if (!path->locks[level]) {
5076                         BUG_ON(level == 0);
5077                         btrfs_tree_lock(eb);
5078                         btrfs_set_lock_blocking_write(eb);
5079                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
5080
5081                         ret = btrfs_lookup_extent_info(trans, fs_info,
5082                                                        eb->start, level, 1,
5083                                                        &wc->refs[level],
5084                                                        &wc->flags[level]);
5085                         if (ret < 0) {
5086                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
5087                                 path->locks[level] = 0;
5088                                 return ret;
5089                         }
5090                         BUG_ON(wc->refs[level] == 0);
5091                         if (wc->refs[level] == 1) {
5092                                 btrfs_tree_unlock_rw(eb, path->locks[level]);
5093                                 path->locks[level] = 0;
5094                                 return 1;
5095                         }
5096                 }
5097         }
5098
5099         /* wc->stage == DROP_REFERENCE */
5100         BUG_ON(wc->refs[level] > 1 && !path->locks[level]);
5101
5102         if (wc->refs[level] == 1) {
5103                 if (level == 0) {
5104                         if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5105                                 ret = btrfs_dec_ref(trans, root, eb, 1);
5106                         else
5107                                 ret = btrfs_dec_ref(trans, root, eb, 0);
5108                         BUG_ON(ret); /* -ENOMEM */
5109                         if (is_fstree(root->root_key.objectid)) {
5110                                 ret = btrfs_qgroup_trace_leaf_items(trans, eb);
5111                                 if (ret) {
5112                                         btrfs_err_rl(fs_info,
5113         "error %d accounting leaf items, quota is out of sync, rescan required",
5114                                              ret);
5115                                 }
5116                         }
5117                 }
5118                 /* make block locked assertion in btrfs_clean_tree_block happy */
5119                 if (!path->locks[level] &&
5120                     btrfs_header_generation(eb) == trans->transid) {
5121                         btrfs_tree_lock(eb);
5122                         btrfs_set_lock_blocking_write(eb);
5123                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
5124                 }
5125                 btrfs_clean_tree_block(eb);
5126         }
5127
5128         if (eb == root->node) {
5129                 if (wc->flags[level] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5130                         parent = eb->start;
5131                 else if (root->root_key.objectid != btrfs_header_owner(eb))
5132                         goto owner_mismatch;
5133         } else {
5134                 if (wc->flags[level + 1] & BTRFS_BLOCK_FLAG_FULL_BACKREF)
5135                         parent = path->nodes[level + 1]->start;
5136                 else if (root->root_key.objectid !=
5137                          btrfs_header_owner(path->nodes[level + 1]))
5138                         goto owner_mismatch;
5139         }
5140
5141         btrfs_free_tree_block(trans, root, eb, parent, wc->refs[level] == 1);
5142 out:
5143         wc->refs[level] = 0;
5144         wc->flags[level] = 0;
5145         return 0;
5146
5147 owner_mismatch:
5148         btrfs_err_rl(fs_info, "unexpected tree owner, have %llu expect %llu",
5149                      btrfs_header_owner(eb), root->root_key.objectid);
5150         return -EUCLEAN;
5151 }
5152
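/*
 * Walk down from wc->level towards the leaves, processing each block with
 * walk_down_proc() and do_walk_down().  Returns when we hit a leaf, when a
 * block should not be descended into any further, or when the current node
 * has no more slots to process.
 */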
5153 static noinline int walk_down_tree(struct btrfs_trans_handle *trans,
5154                                    struct btrfs_root *root,
5155                                    struct btrfs_path *path,
5156                                    struct walk_control *wc)
5157 {
5158         int level = wc->level;
5159         int lookup_info = 1;
5160         int ret;
5161
5162         while (level >= 0) {
5163                 ret = walk_down_proc(trans, root, path, wc, lookup_info);
5164                 if (ret > 0)
5165                         break;
5166
5167                 if (level == 0)
5168                         break;
5169
5170                 if (path->slots[level] >=
5171                     btrfs_header_nritems(path->nodes[level]))
5172                         break;
5173
5174                 ret = do_walk_down(trans, root, path, wc, &lookup_info);
5175                 if (ret > 0) {
5176                         path->slots[level]++;
5177                         continue;
5178                 } else if (ret < 0)
5179                         return ret;
5180                 level = wc->level;
5181         }
5182         return 0;
5183 }
5184
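/*
 * Walk back up the tree, dropping and freeing fully processed blocks via
 * walk_up_proc().  Returns 0 when there is more of the tree to walk back
 * down into, 1 once everything up to @max_level has been processed, or a
 * negative error.
 */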
5185 static noinline int walk_up_tree(struct btrfs_trans_handle *trans,
5186                                  struct btrfs_root *root,
5187                                  struct btrfs_path *path,
5188                                  struct walk_control *wc, int max_level)
5189 {
5190         int level = wc->level;
5191         int ret;
5192
5193         path->slots[level] = btrfs_header_nritems(path->nodes[level]);
5194         while (level < max_level && path->nodes[level]) {
5195                 wc->level = level;
5196                 if (path->slots[level] + 1 <
5197                     btrfs_header_nritems(path->nodes[level])) {
5198                         path->slots[level]++;
5199                         return 0;
5200                 } else {
5201                         ret = walk_up_proc(trans, root, path, wc);
5202                         if (ret > 0)
5203                                 return 0;
5204                         if (ret < 0)
5205                                 return ret;
5206
5207                         if (path->locks[level]) {
5208                                 btrfs_tree_unlock_rw(path->nodes[level],
5209                                                      path->locks[level]);
5210                                 path->locks[level] = 0;
5211                         }
5212                         free_extent_buffer(path->nodes[level]);
5213                         path->nodes[level] = NULL;
5214                         level++;
5215                 }
5216         }
5217         return 1;
5218 }
5219
5220 /*
5221  * Drop a subvolume tree.
5222  *
5223  * This function traverses the tree freeing any blocks that are only
5224  * referenced by the tree.
5225  *
5226  * When a shared tree block is found, this function decreases its
5227  * reference count by one. If update_ref is true, this function
5228  * also makes sure backrefs for the shared block and all lower level
5229  * blocks are properly updated.
5230  *
5231  * If called with for_reloc == 0, may exit early with -EAGAIN.
5232  */
5233 int btrfs_drop_snapshot(struct btrfs_root *root,
5234                          struct btrfs_block_rsv *block_rsv, int update_ref,
5235                          int for_reloc)
5236 {
5237         struct btrfs_fs_info *fs_info = root->fs_info;
5238         struct btrfs_path *path;
5239         struct btrfs_trans_handle *trans;
5240         struct btrfs_root *tree_root = fs_info->tree_root;
5241         struct btrfs_root_item *root_item = &root->root_item;
5242         struct walk_control *wc;
5243         struct btrfs_key key;
5244         int err = 0;
5245         int ret;
5246         int level;
5247         bool root_dropped = false;
5248
5249         btrfs_debug(fs_info, "Drop subvolume %llu", root->root_key.objectid);
5250
5251         path = btrfs_alloc_path();
5252         if (!path) {
5253                 err = -ENOMEM;
5254                 goto out;
5255         }
5256
5257         wc = kzalloc(sizeof(*wc), GFP_NOFS);
5258         if (!wc) {
5259                 btrfs_free_path(path);
5260                 err = -ENOMEM;
5261                 goto out;
5262         }
5263
5264         /*
5265          * Use join to avoid potential EINTR from transaction start. See
5266          * wait_reserve_ticket and the whole reservation callchain.
5267          */
5268         if (for_reloc)
5269                 trans = btrfs_join_transaction(tree_root);
5270         else
5271                 trans = btrfs_start_transaction(tree_root, 0);
5272         if (IS_ERR(trans)) {
5273                 err = PTR_ERR(trans);
5274                 goto out_free;
5275         }
5276
5277         err = btrfs_run_delayed_items(trans);
5278         if (err)
5279                 goto out_end_trans;
5280
5281         if (block_rsv)
5282                 trans->block_rsv = block_rsv;
5283
5284         /*
5285          * This will help us catch people modifying the fs tree while we're
5286          * dropping it.  It is unsafe to mess with the fs tree while it's being
5287          * dropped as we unlock the root node and parent nodes as we walk down
5288          * the tree, assuming nothing will change.  If something does change
5289          * then we'll have stale information and drop references to blocks we've
5290          * already dropped.
5291          */
5292         set_bit(BTRFS_ROOT_DELETING, &root->state);
5293         if (btrfs_disk_key_objectid(&root_item->drop_progress) == 0) {
5294                 level = btrfs_header_level(root->node);
5295                 path->nodes[level] = btrfs_lock_root_node(root);
5296                 btrfs_set_lock_blocking_write(path->nodes[level]);
5297                 path->slots[level] = 0;
5298                 path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
5299                 memset(&wc->update_progress, 0,
5300                        sizeof(wc->update_progress));
5301         } else {
5302                 btrfs_disk_key_to_cpu(&key, &root_item->drop_progress);
5303                 memcpy(&wc->update_progress, &key,
5304                        sizeof(wc->update_progress));
5305
5306                 level = root_item->drop_level;
5307                 BUG_ON(level == 0);
5308                 path->lowest_level = level;
5309                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5310                 path->lowest_level = 0;
5311                 if (ret < 0) {
5312                         err = ret;
5313                         goto out_end_trans;
5314                 }
5315                 WARN_ON(ret > 0);
5316
5317                 /*
5318                  * unlock our path, this is safe because only this
5319                  * function is allowed to delete this snapshot
5320                  */
5321                 btrfs_unlock_up_safe(path, 0);
5322
5323                 level = btrfs_header_level(root->node);
5324                 while (1) {
5325                         btrfs_tree_lock(path->nodes[level]);
5326                         btrfs_set_lock_blocking_write(path->nodes[level]);
5327                         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
5328
5329                         ret = btrfs_lookup_extent_info(trans, fs_info,
5330                                                 path->nodes[level]->start,
5331                                                 level, 1, &wc->refs[level],
5332                                                 &wc->flags[level]);
5333                         if (ret < 0) {
5334                                 err = ret;
5335                                 goto out_end_trans;
5336                         }
5337                         BUG_ON(wc->refs[level] == 0);
5338
5339                         if (level == root_item->drop_level)
5340                                 break;
5341
5342                         btrfs_tree_unlock(path->nodes[level]);
5343                         path->locks[level] = 0;
5344                         WARN_ON(wc->refs[level] != 1);
5345                         level--;
5346                 }
5347         }
5348
5349         wc->restarted = test_bit(BTRFS_ROOT_DEAD_TREE, &root->state);
5350         wc->level = level;
5351         wc->shared_level = -1;
5352         wc->stage = DROP_REFERENCE;
5353         wc->update_ref = update_ref;
5354         wc->keep_locks = 0;
5355         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
5356
5357         while (1) {
5359                 ret = walk_down_tree(trans, root, path, wc);
5360                 if (ret < 0) {
5361                         err = ret;
5362                         break;
5363                 }
5364
5365                 ret = walk_up_tree(trans, root, path, wc, BTRFS_MAX_LEVEL);
5366                 if (ret < 0) {
5367                         err = ret;
5368                         break;
5369                 }
5370
5371                 if (ret > 0) {
5372                         BUG_ON(wc->stage != DROP_REFERENCE);
5373                         break;
5374                 }
5375
5376                 if (wc->stage == DROP_REFERENCE) {
5377                         wc->drop_level = wc->level;
5378                         btrfs_node_key_to_cpu(path->nodes[wc->drop_level],
5379                                               &wc->drop_progress,
5380                                               path->slots[wc->drop_level]);
5381                 }
5382                 btrfs_cpu_key_to_disk(&root_item->drop_progress,
5383                                       &wc->drop_progress);
5384                 root_item->drop_level = wc->drop_level;
5385
5386                 BUG_ON(wc->level == 0);
5387                 if (btrfs_should_end_transaction(trans) ||
5388                     (!for_reloc && btrfs_need_cleaner_sleep(fs_info))) {
5389                         ret = btrfs_update_root(trans, tree_root,
5390                                                 &root->root_key,
5391                                                 root_item);
5392                         if (ret) {
5393                                 btrfs_abort_transaction(trans, ret);
5394                                 err = ret;
5395                                 goto out_end_trans;
5396                         }
5397
5398                         btrfs_end_transaction_throttle(trans);
5399                         if (!for_reloc && btrfs_need_cleaner_sleep(fs_info)) {
5400                                 btrfs_debug(fs_info,
5401                                             "drop snapshot early exit");
5402                                 err = -EAGAIN;
5403                                 goto out_free;
5404                         }
5405
5406                         /*
5407                          * Use join to avoid potential EINTR from transaction
5408                          * start. See wait_reserve_ticket and the whole
5409                          * reservation callchain.
5410                          */
5411                         if (for_reloc)
5412                                 trans = btrfs_join_transaction(tree_root);
5413                         else
5414                                 trans = btrfs_start_transaction(tree_root, 0);
5415                         if (IS_ERR(trans)) {
5416                                 err = PTR_ERR(trans);
5417                                 goto out_free;
5418                         }
5419                         if (block_rsv)
5420                                 trans->block_rsv = block_rsv;
5421                 }
5422         }
5423         btrfs_release_path(path);
5424         if (err)
5425                 goto out_end_trans;
5426
5427         ret = btrfs_del_root(trans, &root->root_key);
5428         if (ret) {
5429                 btrfs_abort_transaction(trans, ret);
5430                 err = ret;
5431                 goto out_end_trans;
5432         }
5433
5434         if (root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID) {
5435                 ret = btrfs_find_root(tree_root, &root->root_key, path,
5436                                       NULL, NULL);
5437                 if (ret < 0) {
5438                         btrfs_abort_transaction(trans, ret);
5439                         err = ret;
5440                         goto out_end_trans;
5441                 } else if (ret > 0) {
5442                         /* if we fail to delete the orphan item this time
5443                          * around, it'll get picked up the next time.
5444                          *
5445                          * The most common failure here is just -ENOENT.
5446                          */
5447                         btrfs_del_orphan_item(trans, tree_root,
5448                                               root->root_key.objectid);
5449                 }
5450         }
5451
5452         if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state)) {
5453                 btrfs_add_dropped_root(trans, root);
5454         } else {
5455                 free_extent_buffer(root->node);
5456                 free_extent_buffer(root->commit_root);
5457                 btrfs_put_fs_root(root);
5458         }
5459         root_dropped = true;
5460 out_end_trans:
5461         btrfs_end_transaction_throttle(trans);
5462 out_free:
5463         kfree(wc);
5464         btrfs_free_path(path);
5465 out:
5466         /*
5467          * So if we need to stop dropping the snapshot for whatever reason we
5468          * need to make sure to add it back to the dead root list so that we
5469          * keep trying to do the work later.  This also cleans up roots if we
5470          * don't have them in the radix (like when we recover after a power fail
5471          * or unmount) so we don't leak memory.
5472          */
5473         if (!for_reloc && !root_dropped)
5474                 btrfs_add_dead_root(root);
5475         return err;
5476 }
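
/*
 * Illustrative usage sketch: how a caller such as the cleaner thread would
 * drop a dead subvolume with btrfs_drop_snapshot().  The helper name
 * example_drop_dead_root() is hypothetical; only the call itself reflects
 * the real interface.
 */
static int __maybe_unused example_drop_dead_root(struct btrfs_root *root)
{
        /*
         * block_rsv == NULL: use the default reservation.  update_ref == 0:
         * do not update backrefs of shared lower blocks.  for_reloc == 0:
         * the call may exit early with -EAGAIN, in which case the root is
         * put back on the dead roots list and retried later.
         */
        return btrfs_drop_snapshot(root, NULL, 0, 0);
}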
5477
5478 /*
5479  * drop subtree rooted at tree block 'node'.
5480  *
5481  * NOTE: this function will unlock and release tree block 'node'.
5482  * Only used by relocation code.
5483  */
5484 int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
5485                         struct btrfs_root *root,
5486                         struct extent_buffer *node,
5487                         struct extent_buffer *parent)
5488 {
5489         struct btrfs_fs_info *fs_info = root->fs_info;
5490         struct btrfs_path *path;
5491         struct walk_control *wc;
5492         int level;
5493         int parent_level;
5494         int ret = 0;
5495         int wret;
5496
5497         BUG_ON(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID);
5498
5499         path = btrfs_alloc_path();
5500         if (!path)
5501                 return -ENOMEM;
5502
5503         wc = kzalloc(sizeof(*wc), GFP_NOFS);
5504         if (!wc) {
5505                 btrfs_free_path(path);
5506                 return -ENOMEM;
5507         }
5508
5509         btrfs_assert_tree_locked(parent);
5510         parent_level = btrfs_header_level(parent);
5511         extent_buffer_get(parent);
5512         path->nodes[parent_level] = parent;
5513         path->slots[parent_level] = btrfs_header_nritems(parent);
5514
5515         btrfs_assert_tree_locked(node);
5516         level = btrfs_header_level(node);
5517         path->nodes[level] = node;
5518         path->slots[level] = 0;
5519         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
5520
5521         wc->refs[parent_level] = 1;
5522         wc->flags[parent_level] = BTRFS_BLOCK_FLAG_FULL_BACKREF;
5523         wc->level = level;
5524         wc->shared_level = -1;
5525         wc->stage = DROP_REFERENCE;
5526         wc->update_ref = 0;
5527         wc->keep_locks = 1;
5528         wc->reada_count = BTRFS_NODEPTRS_PER_BLOCK(fs_info);
5529
5530         while (1) {
5531                 wret = walk_down_tree(trans, root, path, wc);
5532                 if (wret < 0) {
5533                         ret = wret;
5534                         break;
5535                 }
5536
5537                 wret = walk_up_tree(trans, root, path, wc, parent_level);
5538                 if (wret < 0)
5539                         ret = wret;
5540                 if (wret != 0)
5541                         break;
5542         }
5543
5544         kfree(wc);
5545         btrfs_free_path(path);
5546         return ret;
5547 }
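
/*
 * Illustrative sketch: relocation-style use of btrfs_drop_subtree().  The
 * helper name is hypothetical; 'node' and 'parent' are assumed to be locked
 * extent buffers, as the relocation caller guarantees.  Note that 'node' is
 * unlocked and released by the callee, while 'parent' stays with the caller.
 */
static int __maybe_unused example_drop_reloc_subtree(struct btrfs_trans_handle *trans,
                                                     struct btrfs_root *reloc_root,
                                                     struct extent_buffer *node,
                                                     struct extent_buffer *parent)
{
        /* reloc_root must be the reloc tree, there is a BUG_ON() otherwise */
        return btrfs_drop_subtree(trans, reloc_root, node, parent);
}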
5548
5549 /*
5550  * helper to account the unused space of all the readonly block groups in the
5551  * space_info. Takes mirrors into account.
5552  */
5553 u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
5554 {
5555         struct btrfs_block_group_cache *block_group;
5556         u64 free_bytes = 0;
5557         int factor;
5558
5559         /* It's df, we don't care if it's racy */
5560         if (list_empty(&sinfo->ro_bgs))
5561                 return 0;
5562
5563         spin_lock(&sinfo->lock);
5564         list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
5565                 spin_lock(&block_group->lock);
5566
5567                 if (!block_group->ro) {
5568                         spin_unlock(&block_group->lock);
5569                         continue;
5570                 }
5571
5572                 factor = btrfs_bg_type_to_factor(block_group->flags);
5573                 free_bytes += (block_group->key.offset -
5574                                btrfs_block_group_used(&block_group->item)) *
5575                                factor;
5576
5577                 spin_unlock(&block_group->lock);
5578         }
5579         spin_unlock(&sinfo->lock);
5580
5581         return free_bytes;
5582 }
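
/*
 * Worked example for the helper above (the numbers are illustrative): a
 * read-only RAID1 block group of 1GiB with 256MiB used has a factor of 2
 * from btrfs_bg_type_to_factor(), so it contributes
 * (1GiB - 256MiB) * 2 = 1.5GiB of raw device space to the returned total.
 */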
5583
5584 int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
5585                                    u64 start, u64 end)
5586 {
5587         return unpin_extent_range(fs_info, start, end, false);
5588 }
5589
5590 /*
5591  * It used to be that old block groups would be left around forever.
5592  * Iterating over them would be enough to trim unused space.  Since we
5593  * now automatically remove them, we also need to iterate over unallocated
5594  * space.
5595  *
5596  * We don't want a transaction for this since the discard may take a
5597  * substantial amount of time.  We don't require that a transaction be
5598  * running, but we do need to take a running transaction into account
5599  * to ensure that we're not discarding chunks that were released or
5600  * allocated in the current transaction.
5601  *
5602  * Holding the chunks lock will prevent other threads from allocating
5603  * or releasing chunks, but it won't prevent a running transaction
5604  * from committing and releasing the memory that the pending chunks
5605  * list head uses.  For that, we need to take a reference to the
5606  * transaction and hold the commit root sem.  We only need to hold
5607  * it while performing the free space search since we have already
5608  * held back allocations.
5609  */
5610 static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
5611 {
5612         u64 start = SZ_1M, len = 0, end = 0;
5613         int ret;
5614
5615         *trimmed = 0;
5616
5617         /* Discard not supported = nothing to do. */
5618         if (!blk_queue_discard(bdev_get_queue(device->bdev)))
5619                 return 0;
5620
5621         /* Not writable = nothing to do. */
5622         if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state))
5623                 return 0;
5624
5625         /* No free space = nothing to do. */
5626         if (device->total_bytes <= device->bytes_used)
5627                 return 0;
5628
5629         ret = 0;
5630
5631         while (1) {
5632                 struct btrfs_fs_info *fs_info = device->fs_info;
5633                 u64 bytes;
5634
5635                 ret = mutex_lock_interruptible(&fs_info->chunk_mutex);
5636                 if (ret)
5637                         break;
5638
5639                 find_first_clear_extent_bit(&device->alloc_state, start,
5640                                             &start, &end,
5641                                             CHUNK_TRIMMED | CHUNK_ALLOCATED);
5642
5643                 /* Check if there are any CHUNK_* bits left */
5644                 if (start > device->total_bytes) {
5645                         WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG));
5646                         btrfs_warn_in_rcu(fs_info,
5647 "ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
5648                                           start, end - start + 1,
5649                                           rcu_str_deref(device->name),
5650                                           device->total_bytes);
5651                         mutex_unlock(&fs_info->chunk_mutex);
5652                         ret = 0;
5653                         break;
5654                 }
5655
5656                 /* Ensure we skip the reserved area in the first 1M */
5657                 start = max_t(u64, start, SZ_1M);
5658
5659                 /*
5660                  * If find_first_clear_extent_bit finds a range that spans the
5661                  * end of the device it will set end to -1; in this case it's up
5662                  * to the caller to trim the value to the size of the device.
5663                  */
5664                 end = min(end, device->total_bytes - 1);
5665
5666                 len = end - start + 1;
5667
5668                 /* We didn't find any extents */
5669                 if (!len) {
5670                         mutex_unlock(&fs_info->chunk_mutex);
5671                         ret = 0;
5672                         break;
5673                 }
5674
5675                 ret = btrfs_issue_discard(device->bdev, start, len,
5676                                           &bytes);
5677                 if (!ret)
5678                         set_extent_bits(&device->alloc_state, start,
5679                                         start + bytes - 1,
5680                                         CHUNK_TRIMMED);
5681                 mutex_unlock(&fs_info->chunk_mutex);
5682
5683                 if (ret)
5684                         break;
5685
5686                 start += len;
5687                 *trimmed += bytes;
5688
5689                 if (fatal_signal_pending(current)) {
5690                         ret = -ERESTARTSYS;
5691                         break;
5692                 }
5693
5694                 cond_resched();
5695         }
5696
5697         return ret;
5698 }
5699
5700 /*
5701  * Trim the whole filesystem by:
5702  * 1) trimming the free space in each block group
5703  * 2) trimming the unallocated space on each device
5704  *
5705  * This will also continue trimming even if a block group or device encounters
5706  * an error.  The return value will be the last error, or 0 if nothing bad
5707  * happens.
5708  */
5709 int btrfs_trim_fs(struct btrfs_fs_info *fs_info, struct fstrim_range *range)
5710 {
5711         struct btrfs_block_group_cache *cache = NULL;
5712         struct btrfs_device *device;
5713         struct list_head *devices;
5714         u64 group_trimmed;
5715         u64 range_end = U64_MAX;
5716         u64 start;
5717         u64 end;
5718         u64 trimmed = 0;
5719         u64 bg_failed = 0;
5720         u64 dev_failed = 0;
5721         int bg_ret = 0;
5722         int dev_ret = 0;
5723         int ret = 0;
5724
5725         /*
5726          * Check range overflow if range->len is set.
5727          * The default range->len is U64_MAX.
5728          */
5729         if (range->len != U64_MAX &&
5730             check_add_overflow(range->start, range->len, &range_end))
5731                 return -EINVAL;
5732
5733         cache = btrfs_lookup_first_block_group(fs_info, range->start);
5734         for (; cache; cache = btrfs_next_block_group(cache)) {
5735                 if (cache->key.objectid >= range_end) {
5736                         btrfs_put_block_group(cache);
5737                         break;
5738                 }
5739
5740                 start = max(range->start, cache->key.objectid);
5741                 end = min(range_end, cache->key.objectid + cache->key.offset);
5742
5743                 if (end - start >= range->minlen) {
5744                         if (!btrfs_block_group_cache_done(cache)) {
5745                                 ret = btrfs_cache_block_group(cache, 0);
5746                                 if (ret) {
5747                                         bg_failed++;
5748                                         bg_ret = ret;
5749                                         continue;
5750                                 }
5751                                 ret = btrfs_wait_block_group_cache_done(cache);
5752                                 if (ret) {
5753                                         bg_failed++;
5754                                         bg_ret = ret;
5755                                         continue;
5756                                 }
5757                         }
5758                         ret = btrfs_trim_block_group(cache,
5759                                                      &group_trimmed,
5760                                                      start,
5761                                                      end,
5762                                                      range->minlen);
5763
5764                         trimmed += group_trimmed;
5765                         if (ret) {
5766                                 bg_failed++;
5767                                 bg_ret = ret;
5768                                 continue;
5769                         }
5770                 }
5771         }
5772
5773         if (bg_failed)
5774                 btrfs_warn(fs_info,
5775                         "failed to trim %llu block group(s), last error %d",
5776                         bg_failed, bg_ret);
5777         mutex_lock(&fs_info->fs_devices->device_list_mutex);
5778         devices = &fs_info->fs_devices->devices;
5779         list_for_each_entry(device, devices, dev_list) {
5780                 if (test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
5781                         continue;
5782
5783                 ret = btrfs_trim_free_extents(device, &group_trimmed);
5784                 if (ret) {
5785                         dev_failed++;
5786                         dev_ret = ret;
5787                         break;
5788                 }
5789
5790                 trimmed += group_trimmed;
5791         }
5792         mutex_unlock(&fs_info->fs_devices->device_list_mutex);
5793
5794         if (dev_failed)
5795                 btrfs_warn(fs_info,
5796                         "failed to trim %llu device(s), last error %d",
5797                         dev_failed, dev_ret);
5798         range->len = trimmed;
5799         if (bg_ret)
5800                 return bg_ret;
5801         return dev_ret;
5802 }
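
/*
 * Illustrative sketch: how the FITRIM path conceptually invokes
 * btrfs_trim_fs().  The helper name and the 1MiB minimum length are
 * hypothetical; struct fstrim_range comes from the VFS headers.
 */
static int __maybe_unused example_trim_whole_fs(struct btrfs_fs_info *fs_info)
{
        struct fstrim_range range = {
                .start = 0,
                .len = U64_MAX,         /* trim the whole filesystem */
                .minlen = SZ_1M,        /* skip free extents smaller than 1MiB */
        };
        int ret;

        ret = btrfs_trim_fs(fs_info, &range);
        /* on return, range.len holds the number of bytes actually trimmed */
        return ret;
}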
5803
5804 /*
5805  * btrfs_{start,end}_write_no_snapshotting() are similar to
5806  * mnt_{want,drop}_write(); they are used to prevent some tasks from writing
5807  * data into the page cache through nocow before the subvolume is snapshotted,
5808  * but flush the data to disk after the snapshot creation, or to prevent
5809  * operations while snapshotting is ongoing that would cause the snapshot to be
5810  * inconsistent (writes followed by expanding truncates for example).
5811  */
5812 void btrfs_end_write_no_snapshotting(struct btrfs_root *root)
5813 {
5814         percpu_counter_dec(&root->subv_writers->counter);
5815         cond_wake_up(&root->subv_writers->wait);
5816 }
5817
5818 int btrfs_start_write_no_snapshotting(struct btrfs_root *root)
5819 {
5820         if (atomic_read(&root->will_be_snapshotted))
5821                 return 0;
5822
5823         percpu_counter_inc(&root->subv_writers->counter);
5824         /*
5825          * Make sure counter is updated before we check for snapshot creation.
5826          */
5827         smp_mb();
5828         if (atomic_read(&root->will_be_snapshotted)) {
5829                 btrfs_end_write_no_snapshotting(root);
5830                 return 0;
5831         }
5832         return 1;
5833 }
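
/*
 * Illustrative sketch: the pattern a nocow writer would follow with the pair
 * above.  The helper name and the elided write are hypothetical.
 */
static bool __maybe_unused example_try_nocow_write(struct btrfs_root *root)
{
        if (!btrfs_start_write_no_snapshotting(root))
                return false;   /* a snapshot is pending, fall back to COW */

        /* ... write data into the page cache through nocow here ... */

        btrfs_end_write_no_snapshotting(root);
        return true;
}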
5834
5835 void btrfs_wait_for_snapshot_creation(struct btrfs_root *root)
5836 {
5837         while (true) {
5838                 int ret;
5839
5840                 ret = btrfs_start_write_no_snapshotting(root);
5841                 if (ret)
5842                         break;
5843                 wait_var_event(&root->will_be_snapshotted,
5844                                !atomic_read(&root->will_be_snapshotted));
5845         }
5846 }