GNU Linux-libre 6.8.9-gnu
[releases.git] / fs / btrfs / ctree.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
4  */
5
6 #include <linux/sched.h>
7 #include <linux/slab.h>
8 #include <linux/rbtree.h>
9 #include <linux/mm.h>
10 #include <linux/error-injection.h>
11 #include "messages.h"
12 #include "ctree.h"
13 #include "disk-io.h"
14 #include "transaction.h"
15 #include "print-tree.h"
16 #include "locking.h"
17 #include "volumes.h"
18 #include "qgroup.h"
19 #include "tree-mod-log.h"
20 #include "tree-checker.h"
21 #include "fs.h"
22 #include "accessors.h"
23 #include "extent-tree.h"
24 #include "relocation.h"
25 #include "file-item.h"
26
27 static struct kmem_cache *btrfs_path_cachep;
28
29 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
30                       *root, struct btrfs_path *path, int level);
31 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
32                       const struct btrfs_key *ins_key, struct btrfs_path *path,
33                       int data_size, int extend);
34 static int push_node_left(struct btrfs_trans_handle *trans,
35                           struct extent_buffer *dst,
36                           struct extent_buffer *src, int empty);
37 static int balance_node_right(struct btrfs_trans_handle *trans,
38                               struct extent_buffer *dst_buf,
39                               struct extent_buffer *src_buf);
40
41 static const struct btrfs_csums {
42         u16             size;
43         const char      name[10];
44         const char      driver[12];
45 } btrfs_csums[] = {
46         [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
47         [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
48         [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
49         [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
50                                      .driver = "blake2b-256" },
51 };
52
53 /*
54  * The leaf data grows from end-to-front in the node.  this returns the address
55  * of the start of the last item, which is the stop of the leaf data stack.
56  */
57 static unsigned int leaf_data_end(const struct extent_buffer *leaf)
58 {
59         u32 nr = btrfs_header_nritems(leaf);
60
61         if (nr == 0)
62                 return BTRFS_LEAF_DATA_SIZE(leaf->fs_info);
63         return btrfs_item_offset(leaf, nr - 1);
64 }
65
66 /*
67  * Move data in a @leaf (using memmove, safe for overlapping ranges).
68  *
69  * @leaf:       leaf that we're doing a memmove on
70  * @dst_offset: item data offset we're moving to
71  * @src_offset: item data offset were' moving from
72  * @len:        length of the data we're moving
73  *
74  * Wrapper around memmove_extent_buffer() that takes into account the header on
75  * the leaf.  The btrfs_item offset's start directly after the header, so we
76  * have to adjust any offsets to account for the header in the leaf.  This
77  * handles that math to simplify the callers.
78  */
79 static inline void memmove_leaf_data(const struct extent_buffer *leaf,
80                                      unsigned long dst_offset,
81                                      unsigned long src_offset,
82                                      unsigned long len)
83 {
84         memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, 0) + dst_offset,
85                               btrfs_item_nr_offset(leaf, 0) + src_offset, len);
86 }
87
88 /*
89  * Copy item data from @src into @dst at the given @offset.
90  *
91  * @dst:        destination leaf that we're copying into
92  * @src:        source leaf that we're copying from
93  * @dst_offset: item data offset we're copying to
94  * @src_offset: item data offset were' copying from
95  * @len:        length of the data we're copying
96  *
97  * Wrapper around copy_extent_buffer() that takes into account the header on
98  * the leaf.  The btrfs_item offset's start directly after the header, so we
99  * have to adjust any offsets to account for the header in the leaf.  This
100  * handles that math to simplify the callers.
101  */
102 static inline void copy_leaf_data(const struct extent_buffer *dst,
103                                   const struct extent_buffer *src,
104                                   unsigned long dst_offset,
105                                   unsigned long src_offset, unsigned long len)
106 {
107         copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, 0) + dst_offset,
108                            btrfs_item_nr_offset(src, 0) + src_offset, len);
109 }
110
111 /*
112  * Move items in a @leaf (using memmove).
113  *
114  * @dst:        destination leaf for the items
115  * @dst_item:   the item nr we're copying into
116  * @src_item:   the item nr we're copying from
117  * @nr_items:   the number of items to copy
118  *
119  * Wrapper around memmove_extent_buffer() that does the math to get the
120  * appropriate offsets into the leaf from the item numbers.
121  */
122 static inline void memmove_leaf_items(const struct extent_buffer *leaf,
123                                       int dst_item, int src_item, int nr_items)
124 {
125         memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, dst_item),
126                               btrfs_item_nr_offset(leaf, src_item),
127                               nr_items * sizeof(struct btrfs_item));
128 }
129
130 /*
131  * Copy items from @src into @dst at the given @offset.
132  *
133  * @dst:        destination leaf for the items
134  * @src:        source leaf for the items
135  * @dst_item:   the item nr we're copying into
136  * @src_item:   the item nr we're copying from
137  * @nr_items:   the number of items to copy
138  *
139  * Wrapper around copy_extent_buffer() that does the math to get the
140  * appropriate offsets into the leaf from the item numbers.
141  */
142 static inline void copy_leaf_items(const struct extent_buffer *dst,
143                                    const struct extent_buffer *src,
144                                    int dst_item, int src_item, int nr_items)
145 {
146         copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, dst_item),
147                               btrfs_item_nr_offset(src, src_item),
148                               nr_items * sizeof(struct btrfs_item));
149 }
150
151 /* This exists for btrfs-progs usages. */
152 u16 btrfs_csum_type_size(u16 type)
153 {
154         return btrfs_csums[type].size;
155 }
156
157 int btrfs_super_csum_size(const struct btrfs_super_block *s)
158 {
159         u16 t = btrfs_super_csum_type(s);
160         /*
161          * csum type is validated at mount time
162          */
163         return btrfs_csum_type_size(t);
164 }
165
166 const char *btrfs_super_csum_name(u16 csum_type)
167 {
168         /* csum type is validated at mount time */
169         return btrfs_csums[csum_type].name;
170 }
171
172 /*
173  * Return driver name if defined, otherwise the name that's also a valid driver
174  * name
175  */
176 const char *btrfs_super_csum_driver(u16 csum_type)
177 {
178         /* csum type is validated at mount time */
179         return btrfs_csums[csum_type].driver[0] ?
180                 btrfs_csums[csum_type].driver :
181                 btrfs_csums[csum_type].name;
182 }
183
184 size_t __attribute_const__ btrfs_get_num_csums(void)
185 {
186         return ARRAY_SIZE(btrfs_csums);
187 }
188
189 struct btrfs_path *btrfs_alloc_path(void)
190 {
191         might_sleep();
192
193         return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
194 }
195
196 /* this also releases the path */
197 void btrfs_free_path(struct btrfs_path *p)
198 {
199         if (!p)
200                 return;
201         btrfs_release_path(p);
202         kmem_cache_free(btrfs_path_cachep, p);
203 }
204
205 /*
206  * path release drops references on the extent buffers in the path
207  * and it drops any locks held by this path
208  *
209  * It is safe to call this on paths that no locks or extent buffers held.
210  */
211 noinline void btrfs_release_path(struct btrfs_path *p)
212 {
213         int i;
214
215         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
216                 p->slots[i] = 0;
217                 if (!p->nodes[i])
218                         continue;
219                 if (p->locks[i]) {
220                         btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
221                         p->locks[i] = 0;
222                 }
223                 free_extent_buffer(p->nodes[i]);
224                 p->nodes[i] = NULL;
225         }
226 }
227
228 /*
229  * We want the transaction abort to print stack trace only for errors where the
230  * cause could be a bug, eg. due to ENOSPC, and not for common errors that are
231  * caused by external factors.
232  */
233 bool __cold abort_should_print_stack(int error)
234 {
235         switch (error) {
236         case -EIO:
237         case -EROFS:
238         case -ENOMEM:
239                 return false;
240         }
241         return true;
242 }
243
244 /*
245  * safely gets a reference on the root node of a tree.  A lock
246  * is not taken, so a concurrent writer may put a different node
247  * at the root of the tree.  See btrfs_lock_root_node for the
248  * looping required.
249  *
250  * The extent buffer returned by this has a reference taken, so
251  * it won't disappear.  It may stop being the root of the tree
252  * at any time because there are no locks held.
253  */
254 struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
255 {
256         struct extent_buffer *eb;
257
258         while (1) {
259                 rcu_read_lock();
260                 eb = rcu_dereference(root->node);
261
262                 /*
263                  * RCU really hurts here, we could free up the root node because
264                  * it was COWed but we may not get the new root node yet so do
265                  * the inc_not_zero dance and if it doesn't work then
266                  * synchronize_rcu and try again.
267                  */
268                 if (atomic_inc_not_zero(&eb->refs)) {
269                         rcu_read_unlock();
270                         break;
271                 }
272                 rcu_read_unlock();
273                 synchronize_rcu();
274         }
275         return eb;
276 }
277
278 /*
279  * Cowonly root (not-shareable trees, everything not subvolume or reloc roots),
280  * just get put onto a simple dirty list.  Transaction walks this list to make
281  * sure they get properly updated on disk.
282  */
283 static void add_root_to_dirty_list(struct btrfs_root *root)
284 {
285         struct btrfs_fs_info *fs_info = root->fs_info;
286
287         if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
288             !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
289                 return;
290
291         spin_lock(&fs_info->trans_lock);
292         if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
293                 /* Want the extent tree to be the last on the list */
294                 if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
295                         list_move_tail(&root->dirty_list,
296                                        &fs_info->dirty_cowonly_roots);
297                 else
298                         list_move(&root->dirty_list,
299                                   &fs_info->dirty_cowonly_roots);
300         }
301         spin_unlock(&fs_info->trans_lock);
302 }
303
304 /*
305  * used by snapshot creation to make a copy of a root for a tree with
306  * a given objectid.  The buffer with the new root node is returned in
307  * cow_ret, and this func returns zero on success or a negative error code.
308  */
309 int btrfs_copy_root(struct btrfs_trans_handle *trans,
310                       struct btrfs_root *root,
311                       struct extent_buffer *buf,
312                       struct extent_buffer **cow_ret, u64 new_root_objectid)
313 {
314         struct btrfs_fs_info *fs_info = root->fs_info;
315         struct extent_buffer *cow;
316         int ret = 0;
317         int level;
318         struct btrfs_disk_key disk_key;
319         u64 reloc_src_root = 0;
320
321         WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
322                 trans->transid != fs_info->running_transaction->transid);
323         WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
324                 trans->transid != root->last_trans);
325
326         level = btrfs_header_level(buf);
327         if (level == 0)
328                 btrfs_item_key(buf, &disk_key, 0);
329         else
330                 btrfs_node_key(buf, &disk_key, 0);
331
332         if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
333                 reloc_src_root = btrfs_header_owner(buf);
334         cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
335                                      &disk_key, level, buf->start, 0,
336                                      reloc_src_root, BTRFS_NESTING_NEW_ROOT);
337         if (IS_ERR(cow))
338                 return PTR_ERR(cow);
339
340         copy_extent_buffer_full(cow, buf);
341         btrfs_set_header_bytenr(cow, cow->start);
342         btrfs_set_header_generation(cow, trans->transid);
343         btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
344         btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
345                                      BTRFS_HEADER_FLAG_RELOC);
346         if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
347                 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
348         else
349                 btrfs_set_header_owner(cow, new_root_objectid);
350
351         write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
352
353         WARN_ON(btrfs_header_generation(buf) > trans->transid);
354         if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
355                 ret = btrfs_inc_ref(trans, root, cow, 1);
356         else
357                 ret = btrfs_inc_ref(trans, root, cow, 0);
358         if (ret) {
359                 btrfs_tree_unlock(cow);
360                 free_extent_buffer(cow);
361                 btrfs_abort_transaction(trans, ret);
362                 return ret;
363         }
364
365         btrfs_mark_buffer_dirty(trans, cow);
366         *cow_ret = cow;
367         return 0;
368 }
369
370 /*
371  * check if the tree block can be shared by multiple trees
372  */
373 bool btrfs_block_can_be_shared(struct btrfs_trans_handle *trans,
374                                struct btrfs_root *root,
375                                struct extent_buffer *buf)
376 {
377         const u64 buf_gen = btrfs_header_generation(buf);
378
379         /*
380          * Tree blocks not in shareable trees and tree roots are never shared.
381          * If a block was allocated after the last snapshot and the block was
382          * not allocated by tree relocation, we know the block is not shared.
383          */
384
385         if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
386                 return false;
387
388         if (buf == root->node)
389                 return false;
390
391         if (buf_gen > btrfs_root_last_snapshot(&root->root_item) &&
392             !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC))
393                 return false;
394
395         if (buf != root->commit_root)
396                 return true;
397
398         /*
399          * An extent buffer that used to be the commit root may still be shared
400          * because the tree height may have increased and it became a child of a
401          * higher level root. This can happen when snapshotting a subvolume
402          * created in the current transaction.
403          */
404         if (buf_gen == trans->transid)
405                 return true;
406
407         return false;
408 }
409
410 static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
411                                        struct btrfs_root *root,
412                                        struct extent_buffer *buf,
413                                        struct extent_buffer *cow,
414                                        int *last_ref)
415 {
416         struct btrfs_fs_info *fs_info = root->fs_info;
417         u64 refs;
418         u64 owner;
419         u64 flags;
420         u64 new_flags = 0;
421         int ret;
422
423         /*
424          * Backrefs update rules:
425          *
426          * Always use full backrefs for extent pointers in tree block
427          * allocated by tree relocation.
428          *
429          * If a shared tree block is no longer referenced by its owner
430          * tree (btrfs_header_owner(buf) == root->root_key.objectid),
431          * use full backrefs for extent pointers in tree block.
432          *
433          * If a tree block is been relocating
434          * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
435          * use full backrefs for extent pointers in tree block.
436          * The reason for this is some operations (such as drop tree)
437          * are only allowed for blocks use full backrefs.
438          */
439
440         if (btrfs_block_can_be_shared(trans, root, buf)) {
441                 ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
442                                                btrfs_header_level(buf), 1,
443                                                &refs, &flags, NULL);
444                 if (ret)
445                         return ret;
446                 if (unlikely(refs == 0)) {
447                         btrfs_crit(fs_info,
448                 "found 0 references for tree block at bytenr %llu level %d root %llu",
449                                    buf->start, btrfs_header_level(buf),
450                                    btrfs_root_id(root));
451                         ret = -EUCLEAN;
452                         btrfs_abort_transaction(trans, ret);
453                         return ret;
454                 }
455         } else {
456                 refs = 1;
457                 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
458                     btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
459                         flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
460                 else
461                         flags = 0;
462         }
463
464         owner = btrfs_header_owner(buf);
465         BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
466                !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));
467
468         if (refs > 1) {
469                 if ((owner == root->root_key.objectid ||
470                      root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
471                     !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
472                         ret = btrfs_inc_ref(trans, root, buf, 1);
473                         if (ret)
474                                 return ret;
475
476                         if (root->root_key.objectid ==
477                             BTRFS_TREE_RELOC_OBJECTID) {
478                                 ret = btrfs_dec_ref(trans, root, buf, 0);
479                                 if (ret)
480                                         return ret;
481                                 ret = btrfs_inc_ref(trans, root, cow, 1);
482                                 if (ret)
483                                         return ret;
484                         }
485                         new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
486                 } else {
487
488                         if (root->root_key.objectid ==
489                             BTRFS_TREE_RELOC_OBJECTID)
490                                 ret = btrfs_inc_ref(trans, root, cow, 1);
491                         else
492                                 ret = btrfs_inc_ref(trans, root, cow, 0);
493                         if (ret)
494                                 return ret;
495                 }
496                 if (new_flags != 0) {
497                         ret = btrfs_set_disk_extent_flags(trans, buf, new_flags);
498                         if (ret)
499                                 return ret;
500                 }
501         } else {
502                 if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
503                         if (root->root_key.objectid ==
504                             BTRFS_TREE_RELOC_OBJECTID)
505                                 ret = btrfs_inc_ref(trans, root, cow, 1);
506                         else
507                                 ret = btrfs_inc_ref(trans, root, cow, 0);
508                         if (ret)
509                                 return ret;
510                         ret = btrfs_dec_ref(trans, root, buf, 1);
511                         if (ret)
512                                 return ret;
513                 }
514                 btrfs_clear_buffer_dirty(trans, buf);
515                 *last_ref = 1;
516         }
517         return 0;
518 }
519
520 /*
521  * does the dirty work in cow of a single block.  The parent block (if
522  * supplied) is updated to point to the new cow copy.  The new buffer is marked
523  * dirty and returned locked.  If you modify the block it needs to be marked
524  * dirty again.
525  *
526  * search_start -- an allocation hint for the new block
527  *
528  * empty_size -- a hint that you plan on doing more cow.  This is the size in
529  * bytes the allocator should try to find free next to the block it returns.
530  * This is just a hint and may be ignored by the allocator.
531  */
532 int btrfs_force_cow_block(struct btrfs_trans_handle *trans,
533                           struct btrfs_root *root,
534                           struct extent_buffer *buf,
535                           struct extent_buffer *parent, int parent_slot,
536                           struct extent_buffer **cow_ret,
537                           u64 search_start, u64 empty_size,
538                           enum btrfs_lock_nesting nest)
539 {
540         struct btrfs_fs_info *fs_info = root->fs_info;
541         struct btrfs_disk_key disk_key;
542         struct extent_buffer *cow;
543         int level, ret;
544         int last_ref = 0;
545         int unlock_orig = 0;
546         u64 parent_start = 0;
547         u64 reloc_src_root = 0;
548
549         if (*cow_ret == buf)
550                 unlock_orig = 1;
551
552         btrfs_assert_tree_write_locked(buf);
553
554         WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
555                 trans->transid != fs_info->running_transaction->transid);
556         WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
557                 trans->transid != root->last_trans);
558
559         level = btrfs_header_level(buf);
560
561         if (level == 0)
562                 btrfs_item_key(buf, &disk_key, 0);
563         else
564                 btrfs_node_key(buf, &disk_key, 0);
565
566         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) {
567                 if (parent)
568                         parent_start = parent->start;
569                 reloc_src_root = btrfs_header_owner(buf);
570         }
571         cow = btrfs_alloc_tree_block(trans, root, parent_start,
572                                      root->root_key.objectid, &disk_key, level,
573                                      search_start, empty_size, reloc_src_root, nest);
574         if (IS_ERR(cow))
575                 return PTR_ERR(cow);
576
577         /* cow is set to blocking by btrfs_init_new_buffer */
578
579         copy_extent_buffer_full(cow, buf);
580         btrfs_set_header_bytenr(cow, cow->start);
581         btrfs_set_header_generation(cow, trans->transid);
582         btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
583         btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
584                                      BTRFS_HEADER_FLAG_RELOC);
585         if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
586                 btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
587         else
588                 btrfs_set_header_owner(cow, root->root_key.objectid);
589
590         write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);
591
592         ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
593         if (ret) {
594                 btrfs_tree_unlock(cow);
595                 free_extent_buffer(cow);
596                 btrfs_abort_transaction(trans, ret);
597                 return ret;
598         }
599
600         if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
601                 ret = btrfs_reloc_cow_block(trans, root, buf, cow);
602                 if (ret) {
603                         btrfs_tree_unlock(cow);
604                         free_extent_buffer(cow);
605                         btrfs_abort_transaction(trans, ret);
606                         return ret;
607                 }
608         }
609
610         if (buf == root->node) {
611                 WARN_ON(parent && parent != buf);
612                 if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
613                     btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
614                         parent_start = buf->start;
615
616                 ret = btrfs_tree_mod_log_insert_root(root->node, cow, true);
617                 if (ret < 0) {
618                         btrfs_tree_unlock(cow);
619                         free_extent_buffer(cow);
620                         btrfs_abort_transaction(trans, ret);
621                         return ret;
622                 }
623                 atomic_inc(&cow->refs);
624                 rcu_assign_pointer(root->node, cow);
625
626                 btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
627                                       parent_start, last_ref);
628                 free_extent_buffer(buf);
629                 add_root_to_dirty_list(root);
630         } else {
631                 WARN_ON(trans->transid != btrfs_header_generation(parent));
632                 ret = btrfs_tree_mod_log_insert_key(parent, parent_slot,
633                                                     BTRFS_MOD_LOG_KEY_REPLACE);
634                 if (ret) {
635                         btrfs_tree_unlock(cow);
636                         free_extent_buffer(cow);
637                         btrfs_abort_transaction(trans, ret);
638                         return ret;
639                 }
640                 btrfs_set_node_blockptr(parent, parent_slot,
641                                         cow->start);
642                 btrfs_set_node_ptr_generation(parent, parent_slot,
643                                               trans->transid);
644                 btrfs_mark_buffer_dirty(trans, parent);
645                 if (last_ref) {
646                         ret = btrfs_tree_mod_log_free_eb(buf);
647                         if (ret) {
648                                 btrfs_tree_unlock(cow);
649                                 free_extent_buffer(cow);
650                                 btrfs_abort_transaction(trans, ret);
651                                 return ret;
652                         }
653                 }
654                 btrfs_free_tree_block(trans, btrfs_root_id(root), buf,
655                                       parent_start, last_ref);
656         }
657         if (unlock_orig)
658                 btrfs_tree_unlock(buf);
659         free_extent_buffer_stale(buf);
660         btrfs_mark_buffer_dirty(trans, cow);
661         *cow_ret = cow;
662         return 0;
663 }
664
665 static inline int should_cow_block(struct btrfs_trans_handle *trans,
666                                    struct btrfs_root *root,
667                                    struct extent_buffer *buf)
668 {
669         if (btrfs_is_testing(root->fs_info))
670                 return 0;
671
672         /* Ensure we can see the FORCE_COW bit */
673         smp_mb__before_atomic();
674
675         /*
676          * We do not need to cow a block if
677          * 1) this block is not created or changed in this transaction;
678          * 2) this block does not belong to TREE_RELOC tree;
679          * 3) the root is not forced COW.
680          *
681          * What is forced COW:
682          *    when we create snapshot during committing the transaction,
683          *    after we've finished copying src root, we must COW the shared
684          *    block to ensure the metadata consistency.
685          */
686         if (btrfs_header_generation(buf) == trans->transid &&
687             !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
688             !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
689               btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
690             !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
691                 return 0;
692         return 1;
693 }
694
695 /*
696  * COWs a single block, see btrfs_force_cow_block() for the real work.
697  * This version of it has extra checks so that a block isn't COWed more than
698  * once per transaction, as long as it hasn't been written yet
699  */
700 int btrfs_cow_block(struct btrfs_trans_handle *trans,
701                     struct btrfs_root *root, struct extent_buffer *buf,
702                     struct extent_buffer *parent, int parent_slot,
703                     struct extent_buffer **cow_ret,
704                     enum btrfs_lock_nesting nest)
705 {
706         struct btrfs_fs_info *fs_info = root->fs_info;
707         u64 search_start;
708         int ret;
709
710         if (unlikely(test_bit(BTRFS_ROOT_DELETING, &root->state))) {
711                 btrfs_abort_transaction(trans, -EUCLEAN);
712                 btrfs_crit(fs_info,
713                    "attempt to COW block %llu on root %llu that is being deleted",
714                            buf->start, btrfs_root_id(root));
715                 return -EUCLEAN;
716         }
717
718         /*
719          * COWing must happen through a running transaction, which always
720          * matches the current fs generation (it's a transaction with a state
721          * less than TRANS_STATE_UNBLOCKED). If it doesn't, then turn the fs
722          * into error state to prevent the commit of any transaction.
723          */
724         if (unlikely(trans->transaction != fs_info->running_transaction ||
725                      trans->transid != fs_info->generation)) {
726                 btrfs_abort_transaction(trans, -EUCLEAN);
727                 btrfs_crit(fs_info,
728 "unexpected transaction when attempting to COW block %llu on root %llu, transaction %llu running transaction %llu fs generation %llu",
729                            buf->start, btrfs_root_id(root), trans->transid,
730                            fs_info->running_transaction->transid,
731                            fs_info->generation);
732                 return -EUCLEAN;
733         }
734
735         if (!should_cow_block(trans, root, buf)) {
736                 *cow_ret = buf;
737                 return 0;
738         }
739
740         search_start = round_down(buf->start, SZ_1G);
741
742         /*
743          * Before CoWing this block for later modification, check if it's
744          * the subtree root and do the delayed subtree trace if needed.
745          *
746          * Also We don't care about the error, as it's handled internally.
747          */
748         btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
749         ret = btrfs_force_cow_block(trans, root, buf, parent, parent_slot,
750                                     cow_ret, search_start, 0, nest);
751
752         trace_btrfs_cow_block(root, buf, *cow_ret);
753
754         return ret;
755 }
756 ALLOW_ERROR_INJECTION(btrfs_cow_block, ERRNO);
757
758 /*
759  * same as comp_keys only with two btrfs_key's
760  */
761 int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
762 {
763         if (k1->objectid > k2->objectid)
764                 return 1;
765         if (k1->objectid < k2->objectid)
766                 return -1;
767         if (k1->type > k2->type)
768                 return 1;
769         if (k1->type < k2->type)
770                 return -1;
771         if (k1->offset > k2->offset)
772                 return 1;
773         if (k1->offset < k2->offset)
774                 return -1;
775         return 0;
776 }
777
778 /*
779  * Search for a key in the given extent_buffer.
780  *
781  * The lower boundary for the search is specified by the slot number @first_slot.
782  * Use a value of 0 to search over the whole extent buffer. Works for both
783  * leaves and nodes.
784  *
785  * The slot in the extent buffer is returned via @slot. If the key exists in the
786  * extent buffer, then @slot will point to the slot where the key is, otherwise
787  * it points to the slot where you would insert the key.
788  *
789  * Slot may point to the total number of items (i.e. one position beyond the last
790  * key) if the key is bigger than the last key in the extent buffer.
791  */
792 int btrfs_bin_search(struct extent_buffer *eb, int first_slot,
793                      const struct btrfs_key *key, int *slot)
794 {
795         unsigned long p;
796         int item_size;
797         /*
798          * Use unsigned types for the low and high slots, so that we get a more
799          * efficient division in the search loop below.
800          */
801         u32 low = first_slot;
802         u32 high = btrfs_header_nritems(eb);
803         int ret;
804         const int key_size = sizeof(struct btrfs_disk_key);
805
806         if (unlikely(low > high)) {
807                 btrfs_err(eb->fs_info,
808                  "%s: low (%u) > high (%u) eb %llu owner %llu level %d",
809                           __func__, low, high, eb->start,
810                           btrfs_header_owner(eb), btrfs_header_level(eb));
811                 return -EINVAL;
812         }
813
814         if (btrfs_header_level(eb) == 0) {
815                 p = offsetof(struct btrfs_leaf, items);
816                 item_size = sizeof(struct btrfs_item);
817         } else {
818                 p = offsetof(struct btrfs_node, ptrs);
819                 item_size = sizeof(struct btrfs_key_ptr);
820         }
821
822         while (low < high) {
823                 const int unit_size = folio_size(eb->folios[0]);
824                 unsigned long oil;
825                 unsigned long offset;
826                 struct btrfs_disk_key *tmp;
827                 struct btrfs_disk_key unaligned;
828                 int mid;
829
830                 mid = (low + high) / 2;
831                 offset = p + mid * item_size;
832                 oil = get_eb_offset_in_folio(eb, offset);
833
834                 if (oil + key_size <= unit_size) {
835                         const unsigned long idx = get_eb_folio_index(eb, offset);
836                         char *kaddr = folio_address(eb->folios[idx]);
837
838                         oil = get_eb_offset_in_folio(eb, offset);
839                         tmp = (struct btrfs_disk_key *)(kaddr + oil);
840                 } else {
841                         read_extent_buffer(eb, &unaligned, offset, key_size);
842                         tmp = &unaligned;
843                 }
844
845                 ret = btrfs_comp_keys(tmp, key);
846
847                 if (ret < 0)
848                         low = mid + 1;
849                 else if (ret > 0)
850                         high = mid;
851                 else {
852                         *slot = mid;
853                         return 0;
854                 }
855         }
856         *slot = low;
857         return 1;
858 }
859
860 static void root_add_used_bytes(struct btrfs_root *root)
861 {
862         spin_lock(&root->accounting_lock);
863         btrfs_set_root_used(&root->root_item,
864                 btrfs_root_used(&root->root_item) + root->fs_info->nodesize);
865         spin_unlock(&root->accounting_lock);
866 }
867
868 static void root_sub_used_bytes(struct btrfs_root *root)
869 {
870         spin_lock(&root->accounting_lock);
871         btrfs_set_root_used(&root->root_item,
872                 btrfs_root_used(&root->root_item) - root->fs_info->nodesize);
873         spin_unlock(&root->accounting_lock);
874 }
875
876 /* given a node and slot number, this reads the blocks it points to.  The
877  * extent buffer is returned with a reference taken (but unlocked).
878  */
879 struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
880                                            int slot)
881 {
882         int level = btrfs_header_level(parent);
883         struct btrfs_tree_parent_check check = { 0 };
884         struct extent_buffer *eb;
885
886         if (slot < 0 || slot >= btrfs_header_nritems(parent))
887                 return ERR_PTR(-ENOENT);
888
889         ASSERT(level);
890
891         check.level = level - 1;
892         check.transid = btrfs_node_ptr_generation(parent, slot);
893         check.owner_root = btrfs_header_owner(parent);
894         check.has_first_key = true;
895         btrfs_node_key_to_cpu(parent, &check.first_key, slot);
896
897         eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
898                              &check);
899         if (IS_ERR(eb))
900                 return eb;
901         if (!extent_buffer_uptodate(eb)) {
902                 free_extent_buffer(eb);
903                 return ERR_PTR(-EIO);
904         }
905
906         return eb;
907 }
908
909 /*
910  * node level balancing, used to make sure nodes are in proper order for
911  * item deletion.  We balance from the top down, so we have to make sure
912  * that a deletion won't leave an node completely empty later on.
913  */
914 static noinline int balance_level(struct btrfs_trans_handle *trans,
915                          struct btrfs_root *root,
916                          struct btrfs_path *path, int level)
917 {
918         struct btrfs_fs_info *fs_info = root->fs_info;
919         struct extent_buffer *right = NULL;
920         struct extent_buffer *mid;
921         struct extent_buffer *left = NULL;
922         struct extent_buffer *parent = NULL;
923         int ret = 0;
924         int wret;
925         int pslot;
926         int orig_slot = path->slots[level];
927         u64 orig_ptr;
928
929         ASSERT(level > 0);
930
931         mid = path->nodes[level];
932
933         WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK);
934         WARN_ON(btrfs_header_generation(mid) != trans->transid);
935
936         orig_ptr = btrfs_node_blockptr(mid, orig_slot);
937
938         if (level < BTRFS_MAX_LEVEL - 1) {
939                 parent = path->nodes[level + 1];
940                 pslot = path->slots[level + 1];
941         }
942
943         /*
944          * deal with the case where there is only one pointer in the root
945          * by promoting the node below to a root
946          */
947         if (!parent) {
948                 struct extent_buffer *child;
949
950                 if (btrfs_header_nritems(mid) != 1)
951                         return 0;
952
953                 /* promote the child to a root */
954                 child = btrfs_read_node_slot(mid, 0);
955                 if (IS_ERR(child)) {
956                         ret = PTR_ERR(child);
957                         goto out;
958                 }
959
960                 btrfs_tree_lock(child);
961                 ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
962                                       BTRFS_NESTING_COW);
963                 if (ret) {
964                         btrfs_tree_unlock(child);
965                         free_extent_buffer(child);
966                         goto out;
967                 }
968
969                 ret = btrfs_tree_mod_log_insert_root(root->node, child, true);
970                 if (ret < 0) {
971                         btrfs_tree_unlock(child);
972                         free_extent_buffer(child);
973                         btrfs_abort_transaction(trans, ret);
974                         goto out;
975                 }
976                 rcu_assign_pointer(root->node, child);
977
978                 add_root_to_dirty_list(root);
979                 btrfs_tree_unlock(child);
980
981                 path->locks[level] = 0;
982                 path->nodes[level] = NULL;
983                 btrfs_clear_buffer_dirty(trans, mid);
984                 btrfs_tree_unlock(mid);
985                 /* once for the path */
986                 free_extent_buffer(mid);
987
988                 root_sub_used_bytes(root);
989                 btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
990                 /* once for the root ptr */
991                 free_extent_buffer_stale(mid);
992                 return 0;
993         }
994         if (btrfs_header_nritems(mid) >
995             BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
996                 return 0;
997
998         if (pslot) {
999                 left = btrfs_read_node_slot(parent, pslot - 1);
1000                 if (IS_ERR(left)) {
1001                         ret = PTR_ERR(left);
1002                         left = NULL;
1003                         goto out;
1004                 }
1005
1006                 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
1007                 wret = btrfs_cow_block(trans, root, left,
1008                                        parent, pslot - 1, &left,
1009                                        BTRFS_NESTING_LEFT_COW);
1010                 if (wret) {
1011                         ret = wret;
1012                         goto out;
1013                 }
1014         }
1015
1016         if (pslot + 1 < btrfs_header_nritems(parent)) {
1017                 right = btrfs_read_node_slot(parent, pslot + 1);
1018                 if (IS_ERR(right)) {
1019                         ret = PTR_ERR(right);
1020                         right = NULL;
1021                         goto out;
1022                 }
1023
1024                 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
1025                 wret = btrfs_cow_block(trans, root, right,
1026                                        parent, pslot + 1, &right,
1027                                        BTRFS_NESTING_RIGHT_COW);
1028                 if (wret) {
1029                         ret = wret;
1030                         goto out;
1031                 }
1032         }
1033
1034         /* first, try to make some room in the middle buffer */
1035         if (left) {
1036                 orig_slot += btrfs_header_nritems(left);
1037                 wret = push_node_left(trans, left, mid, 1);
1038                 if (wret < 0)
1039                         ret = wret;
1040         }
1041
1042         /*
1043          * then try to empty the right most buffer into the middle
1044          */
1045         if (right) {
1046                 wret = push_node_left(trans, mid, right, 1);
1047                 if (wret < 0 && wret != -ENOSPC)
1048                         ret = wret;
1049                 if (btrfs_header_nritems(right) == 0) {
1050                         btrfs_clear_buffer_dirty(trans, right);
1051                         btrfs_tree_unlock(right);
1052                         ret = btrfs_del_ptr(trans, root, path, level + 1, pslot + 1);
1053                         if (ret < 0) {
1054                                 free_extent_buffer_stale(right);
1055                                 right = NULL;
1056                                 goto out;
1057                         }
1058                         root_sub_used_bytes(root);
1059                         btrfs_free_tree_block(trans, btrfs_root_id(root), right,
1060                                               0, 1);
1061                         free_extent_buffer_stale(right);
1062                         right = NULL;
1063                 } else {
1064                         struct btrfs_disk_key right_key;
1065                         btrfs_node_key(right, &right_key, 0);
1066                         ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
1067                                         BTRFS_MOD_LOG_KEY_REPLACE);
1068                         if (ret < 0) {
1069                                 btrfs_abort_transaction(trans, ret);
1070                                 goto out;
1071                         }
1072                         btrfs_set_node_key(parent, &right_key, pslot + 1);
1073                         btrfs_mark_buffer_dirty(trans, parent);
1074                 }
1075         }
1076         if (btrfs_header_nritems(mid) == 1) {
1077                 /*
1078                  * we're not allowed to leave a node with one item in the
1079                  * tree during a delete.  A deletion from lower in the tree
1080                  * could try to delete the only pointer in this node.
1081                  * So, pull some keys from the left.
1082                  * There has to be a left pointer at this point because
1083                  * otherwise we would have pulled some pointers from the
1084                  * right
1085                  */
1086                 if (unlikely(!left)) {
1087                         btrfs_crit(fs_info,
1088 "missing left child when middle child only has 1 item, parent bytenr %llu level %d mid bytenr %llu root %llu",
1089                                    parent->start, btrfs_header_level(parent),
1090                                    mid->start, btrfs_root_id(root));
1091                         ret = -EUCLEAN;
1092                         btrfs_abort_transaction(trans, ret);
1093                         goto out;
1094                 }
1095                 wret = balance_node_right(trans, mid, left);
1096                 if (wret < 0) {
1097                         ret = wret;
1098                         goto out;
1099                 }
1100                 if (wret == 1) {
1101                         wret = push_node_left(trans, left, mid, 1);
1102                         if (wret < 0)
1103                                 ret = wret;
1104                 }
1105                 BUG_ON(wret == 1);
1106         }
1107         if (btrfs_header_nritems(mid) == 0) {
1108                 btrfs_clear_buffer_dirty(trans, mid);
1109                 btrfs_tree_unlock(mid);
1110                 ret = btrfs_del_ptr(trans, root, path, level + 1, pslot);
1111                 if (ret < 0) {
1112                         free_extent_buffer_stale(mid);
1113                         mid = NULL;
1114                         goto out;
1115                 }
1116                 root_sub_used_bytes(root);
1117                 btrfs_free_tree_block(trans, btrfs_root_id(root), mid, 0, 1);
1118                 free_extent_buffer_stale(mid);
1119                 mid = NULL;
1120         } else {
1121                 /* update the parent key to reflect our changes */
1122                 struct btrfs_disk_key mid_key;
1123                 btrfs_node_key(mid, &mid_key, 0);
1124                 ret = btrfs_tree_mod_log_insert_key(parent, pslot,
1125                                                     BTRFS_MOD_LOG_KEY_REPLACE);
1126                 if (ret < 0) {
1127                         btrfs_abort_transaction(trans, ret);
1128                         goto out;
1129                 }
1130                 btrfs_set_node_key(parent, &mid_key, pslot);
1131                 btrfs_mark_buffer_dirty(trans, parent);
1132         }
1133
1134         /* update the path */
1135         if (left) {
1136                 if (btrfs_header_nritems(left) > orig_slot) {
1137                         atomic_inc(&left->refs);
1138                         /* left was locked after cow */
1139                         path->nodes[level] = left;
1140                         path->slots[level + 1] -= 1;
1141                         path->slots[level] = orig_slot;
1142                         if (mid) {
1143                                 btrfs_tree_unlock(mid);
1144                                 free_extent_buffer(mid);
1145                         }
1146                 } else {
1147                         orig_slot -= btrfs_header_nritems(left);
1148                         path->slots[level] = orig_slot;
1149                 }
1150         }
1151         /* double check we haven't messed things up */
1152         if (orig_ptr !=
1153             btrfs_node_blockptr(path->nodes[level], path->slots[level]))
1154                 BUG();
1155 out:
1156         if (right) {
1157                 btrfs_tree_unlock(right);
1158                 free_extent_buffer(right);
1159         }
1160         if (left) {
1161                 if (path->nodes[level] != left)
1162                         btrfs_tree_unlock(left);
1163                 free_extent_buffer(left);
1164         }
1165         return ret;
1166 }
1167
1168 /* Node balancing for insertion.  Here we only split or push nodes around
1169  * when they are completely full.  This is also done top down, so we
1170  * have to be pessimistic.
1171  */
1172 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
1173                                           struct btrfs_root *root,
1174                                           struct btrfs_path *path, int level)
1175 {
1176         struct btrfs_fs_info *fs_info = root->fs_info;
1177         struct extent_buffer *right = NULL;
1178         struct extent_buffer *mid;
1179         struct extent_buffer *left = NULL;
1180         struct extent_buffer *parent = NULL;
1181         int ret = 0;
1182         int wret;
1183         int pslot;
1184         int orig_slot = path->slots[level];
1185
1186         if (level == 0)
1187                 return 1;
1188
1189         mid = path->nodes[level];
1190         WARN_ON(btrfs_header_generation(mid) != trans->transid);
1191
1192         if (level < BTRFS_MAX_LEVEL - 1) {
1193                 parent = path->nodes[level + 1];
1194                 pslot = path->slots[level + 1];
1195         }
1196
1197         if (!parent)
1198                 return 1;
1199
1200         /* first, try to make some room in the middle buffer */
1201         if (pslot) {
1202                 u32 left_nr;
1203
1204                 left = btrfs_read_node_slot(parent, pslot - 1);
1205                 if (IS_ERR(left))
1206                         return PTR_ERR(left);
1207
1208                 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
1209
1210                 left_nr = btrfs_header_nritems(left);
1211                 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
1212                         wret = 1;
1213                 } else {
1214                         ret = btrfs_cow_block(trans, root, left, parent,
1215                                               pslot - 1, &left,
1216                                               BTRFS_NESTING_LEFT_COW);
1217                         if (ret)
1218                                 wret = 1;
1219                         else {
1220                                 wret = push_node_left(trans, left, mid, 0);
1221                         }
1222                 }
1223                 if (wret < 0)
1224                         ret = wret;
1225                 if (wret == 0) {
1226                         struct btrfs_disk_key disk_key;
1227                         orig_slot += left_nr;
1228                         btrfs_node_key(mid, &disk_key, 0);
1229                         ret = btrfs_tree_mod_log_insert_key(parent, pslot,
1230                                         BTRFS_MOD_LOG_KEY_REPLACE);
1231                         if (ret < 0) {
1232                                 btrfs_tree_unlock(left);
1233                                 free_extent_buffer(left);
1234                                 btrfs_abort_transaction(trans, ret);
1235                                 return ret;
1236                         }
1237                         btrfs_set_node_key(parent, &disk_key, pslot);
1238                         btrfs_mark_buffer_dirty(trans, parent);
1239                         if (btrfs_header_nritems(left) > orig_slot) {
1240                                 path->nodes[level] = left;
1241                                 path->slots[level + 1] -= 1;
1242                                 path->slots[level] = orig_slot;
1243                                 btrfs_tree_unlock(mid);
1244                                 free_extent_buffer(mid);
1245                         } else {
1246                                 orig_slot -=
1247                                         btrfs_header_nritems(left);
1248                                 path->slots[level] = orig_slot;
1249                                 btrfs_tree_unlock(left);
1250                                 free_extent_buffer(left);
1251                         }
1252                         return 0;
1253                 }
1254                 btrfs_tree_unlock(left);
1255                 free_extent_buffer(left);
1256         }
1257
1258         /*
1259          * then try to empty the right most buffer into the middle
1260          */
1261         if (pslot + 1 < btrfs_header_nritems(parent)) {
1262                 u32 right_nr;
1263
1264                 right = btrfs_read_node_slot(parent, pslot + 1);
1265                 if (IS_ERR(right))
1266                         return PTR_ERR(right);
1267
1268                 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
1269
1270                 right_nr = btrfs_header_nritems(right);
1271                 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
1272                         wret = 1;
1273                 } else {
1274                         ret = btrfs_cow_block(trans, root, right,
1275                                               parent, pslot + 1,
1276                                               &right, BTRFS_NESTING_RIGHT_COW);
1277                         if (ret)
1278                                 wret = 1;
1279                         else {
1280                                 wret = balance_node_right(trans, right, mid);
1281                         }
1282                 }
1283                 if (wret < 0)
1284                         ret = wret;
1285                 if (wret == 0) {
1286                         struct btrfs_disk_key disk_key;
1287
1288                         btrfs_node_key(right, &disk_key, 0);
1289                         ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
1290                                         BTRFS_MOD_LOG_KEY_REPLACE);
1291                         if (ret < 0) {
1292                                 btrfs_tree_unlock(right);
1293                                 free_extent_buffer(right);
1294                                 btrfs_abort_transaction(trans, ret);
1295                                 return ret;
1296                         }
1297                         btrfs_set_node_key(parent, &disk_key, pslot + 1);
1298                         btrfs_mark_buffer_dirty(trans, parent);
1299
1300                         if (btrfs_header_nritems(mid) <= orig_slot) {
1301                                 path->nodes[level] = right;
1302                                 path->slots[level + 1] += 1;
1303                                 path->slots[level] = orig_slot -
1304                                         btrfs_header_nritems(mid);
1305                                 btrfs_tree_unlock(mid);
1306                                 free_extent_buffer(mid);
1307                         } else {
1308                                 btrfs_tree_unlock(right);
1309                                 free_extent_buffer(right);
1310                         }
1311                         return 0;
1312                 }
1313                 btrfs_tree_unlock(right);
1314                 free_extent_buffer(right);
1315         }
1316         return 1;
1317 }
1318
1319 /*
1320  * readahead one full node of leaves, finding things that are close
1321  * to the block in 'slot', and triggering ra on them.
1322  */
1323 static void reada_for_search(struct btrfs_fs_info *fs_info,
1324                              struct btrfs_path *path,
1325                              int level, int slot, u64 objectid)
1326 {
1327         struct extent_buffer *node;
1328         struct btrfs_disk_key disk_key;
1329         u32 nritems;
1330         u64 search;
1331         u64 target;
1332         u64 nread = 0;
1333         u64 nread_max;
1334         u32 nr;
1335         u32 blocksize;
1336         u32 nscan = 0;
1337
1338         if (level != 1 && path->reada != READA_FORWARD_ALWAYS)
1339                 return;
1340
1341         if (!path->nodes[level])
1342                 return;
1343
1344         node = path->nodes[level];
1345
1346         /*
1347          * Since the time between visiting leaves is much shorter than the time
1348          * between visiting nodes, limit read ahead of nodes to 1, to avoid too
1349          * much IO at once (possibly random).
1350          */
1351         if (path->reada == READA_FORWARD_ALWAYS) {
1352                 if (level > 1)
1353                         nread_max = node->fs_info->nodesize;
1354                 else
1355                         nread_max = SZ_128K;
1356         } else {
1357                 nread_max = SZ_64K;
1358         }
1359
1360         search = btrfs_node_blockptr(node, slot);
1361         blocksize = fs_info->nodesize;
1362         if (path->reada != READA_FORWARD_ALWAYS) {
1363                 struct extent_buffer *eb;
1364
1365                 eb = find_extent_buffer(fs_info, search);
1366                 if (eb) {
1367                         free_extent_buffer(eb);
1368                         return;
1369                 }
1370         }
1371
1372         target = search;
1373
1374         nritems = btrfs_header_nritems(node);
1375         nr = slot;
1376
1377         while (1) {
1378                 if (path->reada == READA_BACK) {
1379                         if (nr == 0)
1380                                 break;
1381                         nr--;
1382                 } else if (path->reada == READA_FORWARD ||
1383                            path->reada == READA_FORWARD_ALWAYS) {
1384                         nr++;
1385                         if (nr >= nritems)
1386                                 break;
1387                 }
1388                 if (path->reada == READA_BACK && objectid) {
1389                         btrfs_node_key(node, &disk_key, nr);
1390                         if (btrfs_disk_key_objectid(&disk_key) != objectid)
1391                                 break;
1392                 }
1393                 search = btrfs_node_blockptr(node, nr);
1394                 if (path->reada == READA_FORWARD_ALWAYS ||
1395                     (search <= target && target - search <= 65536) ||
1396                     (search > target && search - target <= 65536)) {
1397                         btrfs_readahead_node_child(node, nr);
1398                         nread += blocksize;
1399                 }
1400                 nscan++;
1401                 if (nread > nread_max || nscan > 32)
1402                         break;
1403         }
1404 }
1405
1406 static noinline void reada_for_balance(struct btrfs_path *path, int level)
1407 {
1408         struct extent_buffer *parent;
1409         int slot;
1410         int nritems;
1411
1412         parent = path->nodes[level + 1];
1413         if (!parent)
1414                 return;
1415
1416         nritems = btrfs_header_nritems(parent);
1417         slot = path->slots[level + 1];
1418
1419         if (slot > 0)
1420                 btrfs_readahead_node_child(parent, slot - 1);
1421         if (slot + 1 < nritems)
1422                 btrfs_readahead_node_child(parent, slot + 1);
1423 }
1424
1425
1426 /*
1427  * when we walk down the tree, it is usually safe to unlock the higher layers
1428  * in the tree.  The exceptions are when our path goes through slot 0, because
1429  * operations on the tree might require changing key pointers higher up in the
1430  * tree.
1431  *
1432  * callers might also have set path->keep_locks, which tells this code to keep
1433  * the lock if the path points to the last slot in the block.  This is part of
1434  * walking through the tree, and selecting the next slot in the higher block.
1435  *
1436  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
1437  * if lowest_unlock is 1, level 0 won't be unlocked
1438  */
1439 static noinline void unlock_up(struct btrfs_path *path, int level,
1440                                int lowest_unlock, int min_write_lock_level,
1441                                int *write_lock_level)
1442 {
1443         int i;
1444         int skip_level = level;
1445         bool check_skip = true;
1446
1447         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
1448                 if (!path->nodes[i])
1449                         break;
1450                 if (!path->locks[i])
1451                         break;
1452
1453                 if (check_skip) {
1454                         if (path->slots[i] == 0) {
1455                                 skip_level = i + 1;
1456                                 continue;
1457                         }
1458
1459                         if (path->keep_locks) {
1460                                 u32 nritems;
1461
1462                                 nritems = btrfs_header_nritems(path->nodes[i]);
1463                                 if (nritems < 1 || path->slots[i] >= nritems - 1) {
1464                                         skip_level = i + 1;
1465                                         continue;
1466                                 }
1467                         }
1468                 }
1469
1470                 if (i >= lowest_unlock && i > skip_level) {
1471                         check_skip = false;
1472                         btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
1473                         path->locks[i] = 0;
1474                         if (write_lock_level &&
1475                             i > min_write_lock_level &&
1476                             i <= *write_lock_level) {
1477                                 *write_lock_level = i - 1;
1478                         }
1479                 }
1480         }
1481 }
1482
1483 /*
1484  * Helper function for btrfs_search_slot() and other functions that do a search
1485  * on a btree. The goal is to find a tree block in the cache (the radix tree at
1486  * fs_info->buffer_radix), but if we can't find it, or it's not up to date, read
1487  * its pages from disk.
1488  *
1489  * Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the
1490  * whole btree search, starting again from the current root node.
1491  */
1492 static int
1493 read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
1494                       struct extent_buffer **eb_ret, int level, int slot,
1495                       const struct btrfs_key *key)
1496 {
1497         struct btrfs_fs_info *fs_info = root->fs_info;
1498         struct btrfs_tree_parent_check check = { 0 };
1499         u64 blocknr;
1500         u64 gen;
1501         struct extent_buffer *tmp;
1502         int ret;
1503         int parent_level;
1504         bool unlock_up;
1505
1506         unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
1507         blocknr = btrfs_node_blockptr(*eb_ret, slot);
1508         gen = btrfs_node_ptr_generation(*eb_ret, slot);
1509         parent_level = btrfs_header_level(*eb_ret);
1510         btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot);
1511         check.has_first_key = true;
1512         check.level = parent_level - 1;
1513         check.transid = gen;
1514         check.owner_root = root->root_key.objectid;
1515
1516         /*
1517          * If we need to read an extent buffer from disk and we are holding locks
1518          * on upper level nodes, we unlock all the upper nodes before reading the
1519          * extent buffer, and then return -EAGAIN to the caller as it needs to
1520          * restart the search. We don't release the lock on the current level
1521          * because we need to walk this node to figure out which blocks to read.
1522          */
1523         tmp = find_extent_buffer(fs_info, blocknr);
1524         if (tmp) {
1525                 if (p->reada == READA_FORWARD_ALWAYS)
1526                         reada_for_search(fs_info, p, level, slot, key->objectid);
1527
1528                 /* first we do an atomic uptodate check */
1529                 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
1530                         /*
1531                          * Do extra check for first_key, eb can be stale due to
1532                          * being cached, read from scrub, or have multiple
1533                          * parents (shared tree blocks).
1534                          */
1535                         if (btrfs_verify_level_key(tmp,
1536                                         parent_level - 1, &check.first_key, gen)) {
1537                                 free_extent_buffer(tmp);
1538                                 return -EUCLEAN;
1539                         }
1540                         *eb_ret = tmp;
1541                         return 0;
1542                 }
1543
1544                 if (p->nowait) {
1545                         free_extent_buffer(tmp);
1546                         return -EAGAIN;
1547                 }
1548
1549                 if (unlock_up)
1550                         btrfs_unlock_up_safe(p, level + 1);
1551
1552                 /* now we're allowed to do a blocking uptodate check */
1553                 ret = btrfs_read_extent_buffer(tmp, &check);
1554                 if (ret) {
1555                         free_extent_buffer(tmp);
1556                         btrfs_release_path(p);
1557                         return -EIO;
1558                 }
1559                 if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) {
1560                         free_extent_buffer(tmp);
1561                         btrfs_release_path(p);
1562                         return -EUCLEAN;
1563                 }
1564
1565                 if (unlock_up)
1566                         ret = -EAGAIN;
1567
1568                 goto out;
1569         } else if (p->nowait) {
1570                 return -EAGAIN;
1571         }
1572
1573         if (unlock_up) {
1574                 btrfs_unlock_up_safe(p, level + 1);
1575                 ret = -EAGAIN;
1576         } else {
1577                 ret = 0;
1578         }
1579
1580         if (p->reada != READA_NONE)
1581                 reada_for_search(fs_info, p, level, slot, key->objectid);
1582
1583         tmp = read_tree_block(fs_info, blocknr, &check);
1584         if (IS_ERR(tmp)) {
1585                 btrfs_release_path(p);
1586                 return PTR_ERR(tmp);
1587         }
1588         /*
1589          * If the read above didn't mark this buffer up to date,
1590          * it will never end up being up to date.  Set ret to EIO now
1591          * and give up so that our caller doesn't loop forever
1592          * on our EAGAINs.
1593          */
1594         if (!extent_buffer_uptodate(tmp))
1595                 ret = -EIO;
1596
1597 out:
1598         if (ret == 0) {
1599                 *eb_ret = tmp;
1600         } else {
1601                 free_extent_buffer(tmp);
1602                 btrfs_release_path(p);
1603         }
1604
1605         return ret;
1606 }
1607
1608 /*
1609  * helper function for btrfs_search_slot.  This does all of the checks
1610  * for node-level blocks and does any balancing required based on
1611  * the ins_len.
1612  *
1613  * If no extra work was required, zero is returned.  If we had to
1614  * drop the path, -EAGAIN is returned and btrfs_search_slot must
1615  * start over
1616  */
1617 static int
1618 setup_nodes_for_search(struct btrfs_trans_handle *trans,
1619                        struct btrfs_root *root, struct btrfs_path *p,
1620                        struct extent_buffer *b, int level, int ins_len,
1621                        int *write_lock_level)
1622 {
1623         struct btrfs_fs_info *fs_info = root->fs_info;
1624         int ret = 0;
1625
1626         if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
1627             BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
1628
1629                 if (*write_lock_level < level + 1) {
1630                         *write_lock_level = level + 1;
1631                         btrfs_release_path(p);
1632                         return -EAGAIN;
1633                 }
1634
1635                 reada_for_balance(p, level);
1636                 ret = split_node(trans, root, p, level);
1637
1638                 b = p->nodes[level];
1639         } else if (ins_len < 0 && btrfs_header_nritems(b) <
1640                    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
1641
1642                 if (*write_lock_level < level + 1) {
1643                         *write_lock_level = level + 1;
1644                         btrfs_release_path(p);
1645                         return -EAGAIN;
1646                 }
1647
1648                 reada_for_balance(p, level);
1649                 ret = balance_level(trans, root, p, level);
1650                 if (ret)
1651                         return ret;
1652
1653                 b = p->nodes[level];
1654                 if (!b) {
1655                         btrfs_release_path(p);
1656                         return -EAGAIN;
1657                 }
1658                 BUG_ON(btrfs_header_nritems(b) == 1);
1659         }
1660         return ret;
1661 }
1662
1663 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
1664                 u64 iobjectid, u64 ioff, u8 key_type,
1665                 struct btrfs_key *found_key)
1666 {
1667         int ret;
1668         struct btrfs_key key;
1669         struct extent_buffer *eb;
1670
1671         ASSERT(path);
1672         ASSERT(found_key);
1673
1674         key.type = key_type;
1675         key.objectid = iobjectid;
1676         key.offset = ioff;
1677
1678         ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
1679         if (ret < 0)
1680                 return ret;
1681
1682         eb = path->nodes[0];
1683         if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
1684                 ret = btrfs_next_leaf(fs_root, path);
1685                 if (ret)
1686                         return ret;
1687                 eb = path->nodes[0];
1688         }
1689
1690         btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
1691         if (found_key->type != key.type ||
1692                         found_key->objectid != key.objectid)
1693                 return 1;
1694
1695         return 0;
1696 }
1697
1698 static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
1699                                                         struct btrfs_path *p,
1700                                                         int write_lock_level)
1701 {
1702         struct extent_buffer *b;
1703         int root_lock = 0;
1704         int level = 0;
1705
1706         if (p->search_commit_root) {
1707                 b = root->commit_root;
1708                 atomic_inc(&b->refs);
1709                 level = btrfs_header_level(b);
1710                 /*
1711                  * Ensure that all callers have set skip_locking when
1712                  * p->search_commit_root = 1.
1713                  */
1714                 ASSERT(p->skip_locking == 1);
1715
1716                 goto out;
1717         }
1718
1719         if (p->skip_locking) {
1720                 b = btrfs_root_node(root);
1721                 level = btrfs_header_level(b);
1722                 goto out;
1723         }
1724
1725         /* We try very hard to do read locks on the root */
1726         root_lock = BTRFS_READ_LOCK;
1727
1728         /*
1729          * If the level is set to maximum, we can skip trying to get the read
1730          * lock.
1731          */
1732         if (write_lock_level < BTRFS_MAX_LEVEL) {
1733                 /*
1734                  * We don't know the level of the root node until we actually
1735                  * have it read locked
1736                  */
1737                 if (p->nowait) {
1738                         b = btrfs_try_read_lock_root_node(root);
1739                         if (IS_ERR(b))
1740                                 return b;
1741                 } else {
1742                         b = btrfs_read_lock_root_node(root);
1743                 }
1744                 level = btrfs_header_level(b);
1745                 if (level > write_lock_level)
1746                         goto out;
1747
1748                 /* Whoops, must trade for write lock */
1749                 btrfs_tree_read_unlock(b);
1750                 free_extent_buffer(b);
1751         }
1752
1753         b = btrfs_lock_root_node(root);
1754         root_lock = BTRFS_WRITE_LOCK;
1755
1756         /* The level might have changed, check again */
1757         level = btrfs_header_level(b);
1758
1759 out:
1760         /*
1761          * The root may have failed to write out at some point, and thus is no
1762          * longer valid, return an error in this case.
1763          */
1764         if (!extent_buffer_uptodate(b)) {
1765                 if (root_lock)
1766                         btrfs_tree_unlock_rw(b, root_lock);
1767                 free_extent_buffer(b);
1768                 return ERR_PTR(-EIO);
1769         }
1770
1771         p->nodes[level] = b;
1772         if (!p->skip_locking)
1773                 p->locks[level] = root_lock;
1774         /*
1775          * Callers are responsible for dropping b's references.
1776          */
1777         return b;
1778 }
1779
1780 /*
1781  * Replace the extent buffer at the lowest level of the path with a cloned
1782  * version. The purpose is to be able to use it safely, after releasing the
1783  * commit root semaphore, even if relocation is happening in parallel, the
1784  * transaction used for relocation is committed and the extent buffer is
1785  * reallocated in the next transaction.
1786  *
1787  * This is used in a context where the caller does not prevent transaction
1788  * commits from happening, either by holding a transaction handle or holding
1789  * some lock, while it's doing searches through a commit root.
1790  * At the moment it's only used for send operations.
1791  */
1792 static int finish_need_commit_sem_search(struct btrfs_path *path)
1793 {
1794         const int i = path->lowest_level;
1795         const int slot = path->slots[i];
1796         struct extent_buffer *lowest = path->nodes[i];
1797         struct extent_buffer *clone;
1798
1799         ASSERT(path->need_commit_sem);
1800
1801         if (!lowest)
1802                 return 0;
1803
1804         lockdep_assert_held_read(&lowest->fs_info->commit_root_sem);
1805
1806         clone = btrfs_clone_extent_buffer(lowest);
1807         if (!clone)
1808                 return -ENOMEM;
1809
1810         btrfs_release_path(path);
1811         path->nodes[i] = clone;
1812         path->slots[i] = slot;
1813
1814         return 0;
1815 }
1816
1817 static inline int search_for_key_slot(struct extent_buffer *eb,
1818                                       int search_low_slot,
1819                                       const struct btrfs_key *key,
1820                                       int prev_cmp,
1821                                       int *slot)
1822 {
1823         /*
1824          * If a previous call to btrfs_bin_search() on a parent node returned an
1825          * exact match (prev_cmp == 0), we can safely assume the target key will
1826          * always be at slot 0 on lower levels, since each key pointer
1827          * (struct btrfs_key_ptr) refers to the lowest key accessible from the
1828          * subtree it points to. Thus we can skip searching lower levels.
1829          */
1830         if (prev_cmp == 0) {
1831                 *slot = 0;
1832                 return 0;
1833         }
1834
1835         return btrfs_bin_search(eb, search_low_slot, key, slot);
1836 }
1837
1838 static int search_leaf(struct btrfs_trans_handle *trans,
1839                        struct btrfs_root *root,
1840                        const struct btrfs_key *key,
1841                        struct btrfs_path *path,
1842                        int ins_len,
1843                        int prev_cmp)
1844 {
1845         struct extent_buffer *leaf = path->nodes[0];
1846         int leaf_free_space = -1;
1847         int search_low_slot = 0;
1848         int ret;
1849         bool do_bin_search = true;
1850
1851         /*
1852          * If we are doing an insertion, the leaf has enough free space and the
1853          * destination slot for the key is not slot 0, then we can unlock our
1854          * write lock on the parent, and any other upper nodes, before doing the
1855          * binary search on the leaf (with search_for_key_slot()), allowing other
1856          * tasks to lock the parent and any other upper nodes.
1857          */
1858         if (ins_len > 0) {
1859                 /*
1860                  * Cache the leaf free space, since we will need it later and it
1861                  * will not change until then.
1862                  */
1863                 leaf_free_space = btrfs_leaf_free_space(leaf);
1864
1865                 /*
1866                  * !path->locks[1] means we have a single node tree, the leaf is
1867                  * the root of the tree.
1868                  */
1869                 if (path->locks[1] && leaf_free_space >= ins_len) {
1870                         struct btrfs_disk_key first_key;
1871
1872                         ASSERT(btrfs_header_nritems(leaf) > 0);
1873                         btrfs_item_key(leaf, &first_key, 0);
1874
1875                         /*
1876                          * Doing the extra comparison with the first key is cheap,
1877                          * taking into account that the first key is very likely
1878                          * already in a cache line because it immediately follows
1879                          * the extent buffer's header and we have recently accessed
1880                          * the header's level field.
1881                          */
1882                         ret = btrfs_comp_keys(&first_key, key);
1883                         if (ret < 0) {
1884                                 /*
1885                                  * The first key is smaller than the key we want
1886                                  * to insert, so we are safe to unlock all upper
1887                                  * nodes and we have to do the binary search.
1888                                  *
1889                                  * We do use btrfs_unlock_up_safe() and not
1890                                  * unlock_up() because the later does not unlock
1891                                  * nodes with a slot of 0 - we can safely unlock
1892                                  * any node even if its slot is 0 since in this
1893                                  * case the key does not end up at slot 0 of the
1894                                  * leaf and there's no need to split the leaf.
1895                                  */
1896                                 btrfs_unlock_up_safe(path, 1);
1897                                 search_low_slot = 1;
1898                         } else {
1899                                 /*
1900                                  * The first key is >= then the key we want to
1901                                  * insert, so we can skip the binary search as
1902                                  * the target key will be at slot 0.
1903                                  *
1904                                  * We can not unlock upper nodes when the key is
1905                                  * less than the first key, because we will need
1906                                  * to update the key at slot 0 of the parent node
1907                                  * and possibly of other upper nodes too.
1908                                  * If the key matches the first key, then we can
1909                                  * unlock all the upper nodes, using
1910                                  * btrfs_unlock_up_safe() instead of unlock_up()
1911                                  * as stated above.
1912                                  */
1913                                 if (ret == 0)
1914                                         btrfs_unlock_up_safe(path, 1);
1915                                 /*
1916                                  * ret is already 0 or 1, matching the result of
1917                                  * a btrfs_bin_search() call, so there is no need
1918                                  * to adjust it.
1919                                  */
1920                                 do_bin_search = false;
1921                                 path->slots[0] = 0;
1922                         }
1923                 }
1924         }
1925
1926         if (do_bin_search) {
1927                 ret = search_for_key_slot(leaf, search_low_slot, key,
1928                                           prev_cmp, &path->slots[0]);
1929                 if (ret < 0)
1930                         return ret;
1931         }
1932
1933         if (ins_len > 0) {
1934                 /*
1935                  * Item key already exists. In this case, if we are allowed to
1936                  * insert the item (for example, in dir_item case, item key
1937                  * collision is allowed), it will be merged with the original
1938                  * item. Only the item size grows, no new btrfs item will be
1939                  * added. If search_for_extension is not set, ins_len already
1940                  * accounts the size btrfs_item, deduct it here so leaf space
1941                  * check will be correct.
1942                  */
1943                 if (ret == 0 && !path->search_for_extension) {
1944                         ASSERT(ins_len >= sizeof(struct btrfs_item));
1945                         ins_len -= sizeof(struct btrfs_item);
1946                 }
1947
1948                 ASSERT(leaf_free_space >= 0);
1949
1950                 if (leaf_free_space < ins_len) {
1951                         int err;
1952
1953                         err = split_leaf(trans, root, key, path, ins_len,
1954                                          (ret == 0));
1955                         ASSERT(err <= 0);
1956                         if (WARN_ON(err > 0))
1957                                 err = -EUCLEAN;
1958                         if (err)
1959                                 ret = err;
1960                 }
1961         }
1962
1963         return ret;
1964 }
1965
1966 /*
1967  * Look for a key in a tree and perform necessary modifications to preserve
1968  * tree invariants.
1969  *
1970  * @trans:      Handle of transaction, used when modifying the tree
1971  * @p:          Holds all btree nodes along the search path
1972  * @root:       The root node of the tree
1973  * @key:        The key we are looking for
1974  * @ins_len:    Indicates purpose of search:
1975  *              >0  for inserts it's size of item inserted (*)
1976  *              <0  for deletions
1977  *               0  for plain searches, not modifying the tree
1978  *
1979  *              (*) If size of item inserted doesn't include
1980  *              sizeof(struct btrfs_item), then p->search_for_extension must
1981  *              be set.
1982  * @cow:        boolean should CoW operations be performed. Must always be 1
1983  *              when modifying the tree.
1984  *
1985  * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
1986  * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
1987  *
1988  * If @key is found, 0 is returned and you can find the item in the leaf level
1989  * of the path (level 0)
1990  *
1991  * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
1992  * points to the slot where it should be inserted
1993  *
1994  * If an error is encountered while searching the tree a negative error number
1995  * is returned
1996  */
1997 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
1998                       const struct btrfs_key *key, struct btrfs_path *p,
1999                       int ins_len, int cow)
2000 {
2001         struct btrfs_fs_info *fs_info = root->fs_info;
2002         struct extent_buffer *b;
2003         int slot;
2004         int ret;
2005         int err;
2006         int level;
2007         int lowest_unlock = 1;
2008         /* everything at write_lock_level or lower must be write locked */
2009         int write_lock_level = 0;
2010         u8 lowest_level = 0;
2011         int min_write_lock_level;
2012         int prev_cmp;
2013
2014         might_sleep();
2015
2016         lowest_level = p->lowest_level;
2017         WARN_ON(lowest_level && ins_len > 0);
2018         WARN_ON(p->nodes[0] != NULL);
2019         BUG_ON(!cow && ins_len);
2020
2021         /*
2022          * For now only allow nowait for read only operations.  There's no
2023          * strict reason why we can't, we just only need it for reads so it's
2024          * only implemented for reads.
2025          */
2026         ASSERT(!p->nowait || !cow);
2027
2028         if (ins_len < 0) {
2029                 lowest_unlock = 2;
2030
2031                 /* when we are removing items, we might have to go up to level
2032                  * two as we update tree pointers  Make sure we keep write
2033                  * for those levels as well
2034                  */
2035                 write_lock_level = 2;
2036         } else if (ins_len > 0) {
2037                 /*
2038                  * for inserting items, make sure we have a write lock on
2039                  * level 1 so we can update keys
2040                  */
2041                 write_lock_level = 1;
2042         }
2043
2044         if (!cow)
2045                 write_lock_level = -1;
2046
2047         if (cow && (p->keep_locks || p->lowest_level))
2048                 write_lock_level = BTRFS_MAX_LEVEL;
2049
2050         min_write_lock_level = write_lock_level;
2051
2052         if (p->need_commit_sem) {
2053                 ASSERT(p->search_commit_root);
2054                 if (p->nowait) {
2055                         if (!down_read_trylock(&fs_info->commit_root_sem))
2056                                 return -EAGAIN;
2057                 } else {
2058                         down_read(&fs_info->commit_root_sem);
2059                 }
2060         }
2061
2062 again:
2063         prev_cmp = -1;
2064         b = btrfs_search_slot_get_root(root, p, write_lock_level);
2065         if (IS_ERR(b)) {
2066                 ret = PTR_ERR(b);
2067                 goto done;
2068         }
2069
2070         while (b) {
2071                 int dec = 0;
2072
2073                 level = btrfs_header_level(b);
2074
2075                 if (cow) {
2076                         bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
2077
2078                         /*
2079                          * if we don't really need to cow this block
2080                          * then we don't want to set the path blocking,
2081                          * so we test it here
2082                          */
2083                         if (!should_cow_block(trans, root, b))
2084                                 goto cow_done;
2085
2086                         /*
2087                          * must have write locks on this node and the
2088                          * parent
2089                          */
2090                         if (level > write_lock_level ||
2091                             (level + 1 > write_lock_level &&
2092                             level + 1 < BTRFS_MAX_LEVEL &&
2093                             p->nodes[level + 1])) {
2094                                 write_lock_level = level + 1;
2095                                 btrfs_release_path(p);
2096                                 goto again;
2097                         }
2098
2099                         if (last_level)
2100                                 err = btrfs_cow_block(trans, root, b, NULL, 0,
2101                                                       &b,
2102                                                       BTRFS_NESTING_COW);
2103                         else
2104                                 err = btrfs_cow_block(trans, root, b,
2105                                                       p->nodes[level + 1],
2106                                                       p->slots[level + 1], &b,
2107                                                       BTRFS_NESTING_COW);
2108                         if (err) {
2109                                 ret = err;
2110                                 goto done;
2111                         }
2112                 }
2113 cow_done:
2114                 p->nodes[level] = b;
2115
2116                 /*
2117                  * we have a lock on b and as long as we aren't changing
2118                  * the tree, there is no way to for the items in b to change.
2119                  * It is safe to drop the lock on our parent before we
2120                  * go through the expensive btree search on b.
2121                  *
2122                  * If we're inserting or deleting (ins_len != 0), then we might
2123                  * be changing slot zero, which may require changing the parent.
2124                  * So, we can't drop the lock until after we know which slot
2125                  * we're operating on.
2126                  */
2127                 if (!ins_len && !p->keep_locks) {
2128                         int u = level + 1;
2129
2130                         if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2131                                 btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2132                                 p->locks[u] = 0;
2133                         }
2134                 }
2135
2136                 if (level == 0) {
2137                         if (ins_len > 0)
2138                                 ASSERT(write_lock_level >= 1);
2139
2140                         ret = search_leaf(trans, root, key, p, ins_len, prev_cmp);
2141                         if (!p->search_for_split)
2142                                 unlock_up(p, level, lowest_unlock,
2143                                           min_write_lock_level, NULL);
2144                         goto done;
2145                 }
2146
2147                 ret = search_for_key_slot(b, 0, key, prev_cmp, &slot);
2148                 if (ret < 0)
2149                         goto done;
2150                 prev_cmp = ret;
2151
2152                 if (ret && slot > 0) {
2153                         dec = 1;
2154                         slot--;
2155                 }
2156                 p->slots[level] = slot;
2157                 err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
2158                                              &write_lock_level);
2159                 if (err == -EAGAIN)
2160                         goto again;
2161                 if (err) {
2162                         ret = err;
2163                         goto done;
2164                 }
2165                 b = p->nodes[level];
2166                 slot = p->slots[level];
2167
2168                 /*
2169                  * Slot 0 is special, if we change the key we have to update
2170                  * the parent pointer which means we must have a write lock on
2171                  * the parent
2172                  */
2173                 if (slot == 0 && ins_len && write_lock_level < level + 1) {
2174                         write_lock_level = level + 1;
2175                         btrfs_release_path(p);
2176                         goto again;
2177                 }
2178
2179                 unlock_up(p, level, lowest_unlock, min_write_lock_level,
2180                           &write_lock_level);
2181
2182                 if (level == lowest_level) {
2183                         if (dec)
2184                                 p->slots[level]++;
2185                         goto done;
2186                 }
2187
2188                 err = read_block_for_search(root, p, &b, level, slot, key);
2189                 if (err == -EAGAIN)
2190                         goto again;
2191                 if (err) {
2192                         ret = err;
2193                         goto done;
2194                 }
2195
2196                 if (!p->skip_locking) {
2197                         level = btrfs_header_level(b);
2198
2199                         btrfs_maybe_reset_lockdep_class(root, b);
2200
2201                         if (level <= write_lock_level) {
2202                                 btrfs_tree_lock(b);
2203                                 p->locks[level] = BTRFS_WRITE_LOCK;
2204                         } else {
2205                                 if (p->nowait) {
2206                                         if (!btrfs_try_tree_read_lock(b)) {
2207                                                 free_extent_buffer(b);
2208                                                 ret = -EAGAIN;
2209                                                 goto done;
2210                                         }
2211                                 } else {
2212                                         btrfs_tree_read_lock(b);
2213                                 }
2214                                 p->locks[level] = BTRFS_READ_LOCK;
2215                         }
2216                         p->nodes[level] = b;
2217                 }
2218         }
2219         ret = 1;
2220 done:
2221         if (ret < 0 && !p->skip_release_on_error)
2222                 btrfs_release_path(p);
2223
2224         if (p->need_commit_sem) {
2225                 int ret2;
2226
2227                 ret2 = finish_need_commit_sem_search(p);
2228                 up_read(&fs_info->commit_root_sem);
2229                 if (ret2)
2230                         ret = ret2;
2231         }
2232
2233         return ret;
2234 }
2235 ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
2236
2237 /*
2238  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2239  * current state of the tree together with the operations recorded in the tree
2240  * modification log to search for the key in a previous version of this tree, as
2241  * denoted by the time_seq parameter.
2242  *
2243  * Naturally, there is no support for insert, delete or cow operations.
2244  *
2245  * The resulting path and return value will be set up as if we called
2246  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2247  */
2248 int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
2249                           struct btrfs_path *p, u64 time_seq)
2250 {
2251         struct btrfs_fs_info *fs_info = root->fs_info;
2252         struct extent_buffer *b;
2253         int slot;
2254         int ret;
2255         int err;
2256         int level;
2257         int lowest_unlock = 1;
2258         u8 lowest_level = 0;
2259
2260         lowest_level = p->lowest_level;
2261         WARN_ON(p->nodes[0] != NULL);
2262         ASSERT(!p->nowait);
2263
2264         if (p->search_commit_root) {
2265                 BUG_ON(time_seq);
2266                 return btrfs_search_slot(NULL, root, key, p, 0, 0);
2267         }
2268
2269 again:
2270         b = btrfs_get_old_root(root, time_seq);
2271         if (!b) {
2272                 ret = -EIO;
2273                 goto done;
2274         }
2275         level = btrfs_header_level(b);
2276         p->locks[level] = BTRFS_READ_LOCK;
2277
2278         while (b) {
2279                 int dec = 0;
2280
2281                 level = btrfs_header_level(b);
2282                 p->nodes[level] = b;
2283
2284                 /*
2285                  * we have a lock on b and as long as we aren't changing
2286                  * the tree, there is no way to for the items in b to change.
2287                  * It is safe to drop the lock on our parent before we
2288                  * go through the expensive btree search on b.
2289                  */
2290                 btrfs_unlock_up_safe(p, level + 1);
2291
2292                 ret = btrfs_bin_search(b, 0, key, &slot);
2293                 if (ret < 0)
2294                         goto done;
2295
2296                 if (level == 0) {
2297                         p->slots[level] = slot;
2298                         unlock_up(p, level, lowest_unlock, 0, NULL);
2299                         goto done;
2300                 }
2301
2302                 if (ret && slot > 0) {
2303                         dec = 1;
2304                         slot--;
2305                 }
2306                 p->slots[level] = slot;
2307                 unlock_up(p, level, lowest_unlock, 0, NULL);
2308
2309                 if (level == lowest_level) {
2310                         if (dec)
2311                                 p->slots[level]++;
2312                         goto done;
2313                 }
2314
2315                 err = read_block_for_search(root, p, &b, level, slot, key);
2316                 if (err == -EAGAIN)
2317                         goto again;
2318                 if (err) {
2319                         ret = err;
2320                         goto done;
2321                 }
2322
2323                 level = btrfs_header_level(b);
2324                 btrfs_tree_read_lock(b);
2325                 b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq);
2326                 if (!b) {
2327                         ret = -ENOMEM;
2328                         goto done;
2329                 }
2330                 p->locks[level] = BTRFS_READ_LOCK;
2331                 p->nodes[level] = b;
2332         }
2333         ret = 1;
2334 done:
2335         if (ret < 0)
2336                 btrfs_release_path(p);
2337
2338         return ret;
2339 }
2340
2341 /*
2342  * Search the tree again to find a leaf with smaller keys.
2343  * Returns 0 if it found something.
2344  * Returns 1 if there are no smaller keys.
2345  * Returns < 0 on error.
2346  *
2347  * This may release the path, and so you may lose any locks held at the
2348  * time you call it.
2349  */
2350 static int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
2351 {
2352         struct btrfs_key key;
2353         struct btrfs_key orig_key;
2354         struct btrfs_disk_key found_key;
2355         int ret;
2356
2357         btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
2358         orig_key = key;
2359
2360         if (key.offset > 0) {
2361                 key.offset--;
2362         } else if (key.type > 0) {
2363                 key.type--;
2364                 key.offset = (u64)-1;
2365         } else if (key.objectid > 0) {
2366                 key.objectid--;
2367                 key.type = (u8)-1;
2368                 key.offset = (u64)-1;
2369         } else {
2370                 return 1;
2371         }
2372
2373         btrfs_release_path(path);
2374         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2375         if (ret <= 0)
2376                 return ret;
2377
2378         /*
2379          * Previous key not found. Even if we were at slot 0 of the leaf we had
2380          * before releasing the path and calling btrfs_search_slot(), we now may
2381          * be in a slot pointing to the same original key - this can happen if
2382          * after we released the path, one of more items were moved from a
2383          * sibling leaf into the front of the leaf we had due to an insertion
2384          * (see push_leaf_right()).
2385          * If we hit this case and our slot is > 0 and just decrement the slot
2386          * so that the caller does not process the same key again, which may or
2387          * may not break the caller, depending on its logic.
2388          */
2389         if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
2390                 btrfs_item_key(path->nodes[0], &found_key, path->slots[0]);
2391                 ret = btrfs_comp_keys(&found_key, &orig_key);
2392                 if (ret == 0) {
2393                         if (path->slots[0] > 0) {
2394                                 path->slots[0]--;
2395                                 return 0;
2396                         }
2397                         /*
2398                          * At slot 0, same key as before, it means orig_key is
2399                          * the lowest, leftmost, key in the tree. We're done.
2400                          */
2401                         return 1;
2402                 }
2403         }
2404
2405         btrfs_item_key(path->nodes[0], &found_key, 0);
2406         ret = btrfs_comp_keys(&found_key, &key);
2407         /*
2408          * We might have had an item with the previous key in the tree right
2409          * before we released our path. And after we released our path, that
2410          * item might have been pushed to the first slot (0) of the leaf we
2411          * were holding due to a tree balance. Alternatively, an item with the
2412          * previous key can exist as the only element of a leaf (big fat item).
2413          * Therefore account for these 2 cases, so that our callers (like
2414          * btrfs_previous_item) don't miss an existing item with a key matching
2415          * the previous key we computed above.
2416          */
2417         if (ret <= 0)
2418                 return 0;
2419         return 1;
2420 }
2421
2422 /*
2423  * helper to use instead of search slot if no exact match is needed but
2424  * instead the next or previous item should be returned.
2425  * When find_higher is true, the next higher item is returned, the next lower
2426  * otherwise.
2427  * When return_any and find_higher are both true, and no higher item is found,
2428  * return the next lower instead.
2429  * When return_any is true and find_higher is false, and no lower item is found,
2430  * return the next higher instead.
2431  * It returns 0 if any item is found, 1 if none is found (tree empty), and
2432  * < 0 on error
2433  */
2434 int btrfs_search_slot_for_read(struct btrfs_root *root,
2435                                const struct btrfs_key *key,
2436                                struct btrfs_path *p, int find_higher,
2437                                int return_any)
2438 {
2439         int ret;
2440         struct extent_buffer *leaf;
2441
2442 again:
2443         ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
2444         if (ret <= 0)
2445                 return ret;
2446         /*
2447          * a return value of 1 means the path is at the position where the
2448          * item should be inserted. Normally this is the next bigger item,
2449          * but in case the previous item is the last in a leaf, path points
2450          * to the first free slot in the previous leaf, i.e. at an invalid
2451          * item.
2452          */
2453         leaf = p->nodes[0];
2454
2455         if (find_higher) {
2456                 if (p->slots[0] >= btrfs_header_nritems(leaf)) {
2457                         ret = btrfs_next_leaf(root, p);
2458                         if (ret <= 0)
2459                                 return ret;
2460                         if (!return_any)
2461                                 return 1;
2462                         /*
2463                          * no higher item found, return the next
2464                          * lower instead
2465                          */
2466                         return_any = 0;
2467                         find_higher = 0;
2468                         btrfs_release_path(p);
2469                         goto again;
2470                 }
2471         } else {
2472                 if (p->slots[0] == 0) {
2473                         ret = btrfs_prev_leaf(root, p);
2474                         if (ret < 0)
2475                                 return ret;
2476                         if (!ret) {
2477                                 leaf = p->nodes[0];
2478                                 if (p->slots[0] == btrfs_header_nritems(leaf))
2479                                         p->slots[0]--;
2480                                 return 0;
2481                         }
2482                         if (!return_any)
2483                                 return 1;
2484                         /*
2485                          * no lower item found, return the next
2486                          * higher instead
2487                          */
2488                         return_any = 0;
2489                         find_higher = 1;
2490                         btrfs_release_path(p);
2491                         goto again;
2492                 } else {
2493                         --p->slots[0];
2494                 }
2495         }
2496         return 0;
2497 }
2498
2499 /*
2500  * Execute search and call btrfs_previous_item to traverse backwards if the item
2501  * was not found.
2502  *
2503  * Return 0 if found, 1 if not found and < 0 if error.
2504  */
2505 int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
2506                            struct btrfs_path *path)
2507 {
2508         int ret;
2509
2510         ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
2511         if (ret > 0)
2512                 ret = btrfs_previous_item(root, path, key->objectid, key->type);
2513
2514         if (ret == 0)
2515                 btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
2516
2517         return ret;
2518 }
2519
2520 /*
2521  * Search for a valid slot for the given path.
2522  *
2523  * @root:       The root node of the tree.
2524  * @key:        Will contain a valid item if found.
2525  * @path:       The starting point to validate the slot.
2526  *
2527  * Return: 0  if the item is valid
2528  *         1  if not found
2529  *         <0 if error.
2530  */
2531 int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
2532                               struct btrfs_path *path)
2533 {
2534         if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
2535                 int ret;
2536
2537                 ret = btrfs_next_leaf(root, path);
2538                 if (ret)
2539                         return ret;
2540         }
2541
2542         btrfs_item_key_to_cpu(path->nodes[0], key, path->slots[0]);
2543         return 0;
2544 }
2545
2546 /*
2547  * adjust the pointers going up the tree, starting at level
2548  * making sure the right key of each node is points to 'key'.
2549  * This is used after shifting pointers to the left, so it stops
2550  * fixing up pointers when a given leaf/node is not in slot 0 of the
2551  * higher levels
2552  *
2553  */
2554 static void fixup_low_keys(struct btrfs_trans_handle *trans,
2555                            struct btrfs_path *path,
2556                            struct btrfs_disk_key *key, int level)
2557 {
2558         int i;
2559         struct extent_buffer *t;
2560         int ret;
2561
2562         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2563                 int tslot = path->slots[i];
2564
2565                 if (!path->nodes[i])
2566                         break;
2567                 t = path->nodes[i];
2568                 ret = btrfs_tree_mod_log_insert_key(t, tslot,
2569                                                     BTRFS_MOD_LOG_KEY_REPLACE);
2570                 BUG_ON(ret < 0);
2571                 btrfs_set_node_key(t, key, tslot);
2572                 btrfs_mark_buffer_dirty(trans, path->nodes[i]);
2573                 if (tslot != 0)
2574                         break;
2575         }
2576 }
2577
2578 /*
2579  * update item key.
2580  *
2581  * This function isn't completely safe. It's the caller's responsibility
2582  * that the new key won't break the order
2583  */
2584 void btrfs_set_item_key_safe(struct btrfs_trans_handle *trans,
2585                              struct btrfs_path *path,
2586                              const struct btrfs_key *new_key)
2587 {
2588         struct btrfs_fs_info *fs_info = trans->fs_info;
2589         struct btrfs_disk_key disk_key;
2590         struct extent_buffer *eb;
2591         int slot;
2592
2593         eb = path->nodes[0];
2594         slot = path->slots[0];
2595         if (slot > 0) {
2596                 btrfs_item_key(eb, &disk_key, slot - 1);
2597                 if (unlikely(btrfs_comp_keys(&disk_key, new_key) >= 0)) {
2598                         btrfs_print_leaf(eb);
2599                         btrfs_crit(fs_info,
2600                 "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2601                                    slot, btrfs_disk_key_objectid(&disk_key),
2602                                    btrfs_disk_key_type(&disk_key),
2603                                    btrfs_disk_key_offset(&disk_key),
2604                                    new_key->objectid, new_key->type,
2605                                    new_key->offset);
2606                         BUG();
2607                 }
2608         }
2609         if (slot < btrfs_header_nritems(eb) - 1) {
2610                 btrfs_item_key(eb, &disk_key, slot + 1);
2611                 if (unlikely(btrfs_comp_keys(&disk_key, new_key) <= 0)) {
2612                         btrfs_print_leaf(eb);
2613                         btrfs_crit(fs_info,
2614                 "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
2615                                    slot, btrfs_disk_key_objectid(&disk_key),
2616                                    btrfs_disk_key_type(&disk_key),
2617                                    btrfs_disk_key_offset(&disk_key),
2618                                    new_key->objectid, new_key->type,
2619                                    new_key->offset);
2620                         BUG();
2621                 }
2622         }
2623
2624         btrfs_cpu_key_to_disk(&disk_key, new_key);
2625         btrfs_set_item_key(eb, &disk_key, slot);
2626         btrfs_mark_buffer_dirty(trans, eb);
2627         if (slot == 0)
2628                 fixup_low_keys(trans, path, &disk_key, 1);
2629 }
2630
2631 /*
2632  * Check key order of two sibling extent buffers.
2633  *
2634  * Return true if something is wrong.
2635  * Return false if everything is fine.
2636  *
2637  * Tree-checker only works inside one tree block, thus the following
2638  * corruption can not be detected by tree-checker:
2639  *
2640  * Leaf @left                   | Leaf @right
2641  * --------------------------------------------------------------
2642  * | 1 | 2 | 3 | 4 | 5 | f6 |   | 7 | 8 |
2643  *
2644  * Key f6 in leaf @left itself is valid, but not valid when the next
2645  * key in leaf @right is 7.
2646  * This can only be checked at tree block merge time.
2647  * And since tree checker has ensured all key order in each tree block
2648  * is correct, we only need to bother the last key of @left and the first
2649  * key of @right.
2650  */
2651 static bool check_sibling_keys(struct extent_buffer *left,
2652                                struct extent_buffer *right)
2653 {
2654         struct btrfs_key left_last;
2655         struct btrfs_key right_first;
2656         int level = btrfs_header_level(left);
2657         int nr_left = btrfs_header_nritems(left);
2658         int nr_right = btrfs_header_nritems(right);
2659
2660         /* No key to check in one of the tree blocks */
2661         if (!nr_left || !nr_right)
2662                 return false;
2663
2664         if (level) {
2665                 btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
2666                 btrfs_node_key_to_cpu(right, &right_first, 0);
2667         } else {
2668                 btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
2669                 btrfs_item_key_to_cpu(right, &right_first, 0);
2670         }
2671
2672         if (unlikely(btrfs_comp_cpu_keys(&left_last, &right_first) >= 0)) {
2673                 btrfs_crit(left->fs_info, "left extent buffer:");
2674                 btrfs_print_tree(left, false);
2675                 btrfs_crit(left->fs_info, "right extent buffer:");
2676                 btrfs_print_tree(right, false);
2677                 btrfs_crit(left->fs_info,
2678 "bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
2679                            left_last.objectid, left_last.type,
2680                            left_last.offset, right_first.objectid,
2681                            right_first.type, right_first.offset);
2682                 return true;
2683         }
2684         return false;
2685 }
2686
2687 /*
2688  * try to push data from one node into the next node left in the
2689  * tree.
2690  *
2691  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
2692  * error, and > 0 if there was no room in the left hand block.
2693  */
2694 static int push_node_left(struct btrfs_trans_handle *trans,
2695                           struct extent_buffer *dst,
2696                           struct extent_buffer *src, int empty)
2697 {
2698         struct btrfs_fs_info *fs_info = trans->fs_info;
2699         int push_items = 0;
2700         int src_nritems;
2701         int dst_nritems;
2702         int ret = 0;
2703
2704         src_nritems = btrfs_header_nritems(src);
2705         dst_nritems = btrfs_header_nritems(dst);
2706         push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
2707         WARN_ON(btrfs_header_generation(src) != trans->transid);
2708         WARN_ON(btrfs_header_generation(dst) != trans->transid);
2709
2710         if (!empty && src_nritems <= 8)
2711                 return 1;
2712
2713         if (push_items <= 0)
2714                 return 1;
2715
2716         if (empty) {
2717                 push_items = min(src_nritems, push_items);
2718                 if (push_items < src_nritems) {
2719                         /* leave at least 8 pointers in the node if
2720                          * we aren't going to empty it
2721                          */
2722                         if (src_nritems - push_items < 8) {
2723                                 if (push_items <= 8)
2724                                         return 1;
2725                                 push_items -= 8;
2726                         }
2727                 }
2728         } else
2729                 push_items = min(src_nritems - 8, push_items);
2730
2731         /* dst is the left eb, src is the middle eb */
2732         if (check_sibling_keys(dst, src)) {
2733                 ret = -EUCLEAN;
2734                 btrfs_abort_transaction(trans, ret);
2735                 return ret;
2736         }
2737         ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
2738         if (ret) {
2739                 btrfs_abort_transaction(trans, ret);
2740                 return ret;
2741         }
2742         copy_extent_buffer(dst, src,
2743                            btrfs_node_key_ptr_offset(dst, dst_nritems),
2744                            btrfs_node_key_ptr_offset(src, 0),
2745                            push_items * sizeof(struct btrfs_key_ptr));
2746
2747         if (push_items < src_nritems) {
2748                 /*
2749                  * btrfs_tree_mod_log_eb_copy handles logging the move, so we
2750                  * don't need to do an explicit tree mod log operation for it.
2751                  */
2752                 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(src, 0),
2753                                       btrfs_node_key_ptr_offset(src, push_items),
2754                                       (src_nritems - push_items) *
2755                                       sizeof(struct btrfs_key_ptr));
2756         }
2757         btrfs_set_header_nritems(src, src_nritems - push_items);
2758         btrfs_set_header_nritems(dst, dst_nritems + push_items);
2759         btrfs_mark_buffer_dirty(trans, src);
2760         btrfs_mark_buffer_dirty(trans, dst);
2761
2762         return ret;
2763 }
2764
2765 /*
2766  * try to push data from one node into the next node right in the
2767  * tree.
2768  *
2769  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
2770  * error, and > 0 if there was no room in the right hand block.
2771  *
2772  * this will  only push up to 1/2 the contents of the left node over
2773  */
2774 static int balance_node_right(struct btrfs_trans_handle *trans,
2775                               struct extent_buffer *dst,
2776                               struct extent_buffer *src)
2777 {
2778         struct btrfs_fs_info *fs_info = trans->fs_info;
2779         int push_items = 0;
2780         int max_push;
2781         int src_nritems;
2782         int dst_nritems;
2783         int ret = 0;
2784
2785         WARN_ON(btrfs_header_generation(src) != trans->transid);
2786         WARN_ON(btrfs_header_generation(dst) != trans->transid);
2787
2788         src_nritems = btrfs_header_nritems(src);
2789         dst_nritems = btrfs_header_nritems(dst);
2790         push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
2791         if (push_items <= 0)
2792                 return 1;
2793
2794         if (src_nritems < 4)
2795                 return 1;
2796
2797         max_push = src_nritems / 2 + 1;
2798         /* don't try to empty the node */
2799         if (max_push >= src_nritems)
2800                 return 1;
2801
2802         if (max_push < push_items)
2803                 push_items = max_push;
2804
2805         /* dst is the right eb, src is the middle eb */
2806         if (check_sibling_keys(src, dst)) {
2807                 ret = -EUCLEAN;
2808                 btrfs_abort_transaction(trans, ret);
2809                 return ret;
2810         }
2811
2812         /*
2813          * btrfs_tree_mod_log_eb_copy handles logging the move, so we don't
2814          * need to do an explicit tree mod log operation for it.
2815          */
2816         memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst, push_items),
2817                                       btrfs_node_key_ptr_offset(dst, 0),
2818                                       (dst_nritems) *
2819                                       sizeof(struct btrfs_key_ptr));
2820
2821         ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
2822                                          push_items);
2823         if (ret) {
2824                 btrfs_abort_transaction(trans, ret);
2825                 return ret;
2826         }
2827         copy_extent_buffer(dst, src,
2828                            btrfs_node_key_ptr_offset(dst, 0),
2829                            btrfs_node_key_ptr_offset(src, src_nritems - push_items),
2830                            push_items * sizeof(struct btrfs_key_ptr));
2831
2832         btrfs_set_header_nritems(src, src_nritems - push_items);
2833         btrfs_set_header_nritems(dst, dst_nritems + push_items);
2834
2835         btrfs_mark_buffer_dirty(trans, src);
2836         btrfs_mark_buffer_dirty(trans, dst);
2837
2838         return ret;
2839 }
2840
2841 /*
2842  * helper function to insert a new root level in the tree.
2843  * A new node is allocated, and a single item is inserted to
2844  * point to the existing root
2845  *
2846  * returns zero on success or < 0 on failure.
2847  */
2848 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
2849                            struct btrfs_root *root,
2850                            struct btrfs_path *path, int level)
2851 {
2852         u64 lower_gen;
2853         struct extent_buffer *lower;
2854         struct extent_buffer *c;
2855         struct extent_buffer *old;
2856         struct btrfs_disk_key lower_key;
2857         int ret;
2858
2859         BUG_ON(path->nodes[level]);
2860         BUG_ON(path->nodes[level-1] != root->node);
2861
2862         lower = path->nodes[level-1];
2863         if (level == 1)
2864                 btrfs_item_key(lower, &lower_key, 0);
2865         else
2866                 btrfs_node_key(lower, &lower_key, 0);
2867
2868         c = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
2869                                    &lower_key, level, root->node->start, 0,
2870                                    0, BTRFS_NESTING_NEW_ROOT);
2871         if (IS_ERR(c))
2872                 return PTR_ERR(c);
2873
2874         root_add_used_bytes(root);
2875
2876         btrfs_set_header_nritems(c, 1);
2877         btrfs_set_node_key(c, &lower_key, 0);
2878         btrfs_set_node_blockptr(c, 0, lower->start);
2879         lower_gen = btrfs_header_generation(lower);
2880         WARN_ON(lower_gen != trans->transid);
2881
2882         btrfs_set_node_ptr_generation(c, 0, lower_gen);
2883
2884         btrfs_mark_buffer_dirty(trans, c);
2885
2886         old = root->node;
2887         ret = btrfs_tree_mod_log_insert_root(root->node, c, false);
2888         if (ret < 0) {
2889                 btrfs_free_tree_block(trans, btrfs_root_id(root), c, 0, 1);
2890                 btrfs_tree_unlock(c);
2891                 free_extent_buffer(c);
2892                 return ret;
2893         }
2894         rcu_assign_pointer(root->node, c);
2895
2896         /* the super has an extra ref to root->node */
2897         free_extent_buffer(old);
2898
2899         add_root_to_dirty_list(root);
2900         atomic_inc(&c->refs);
2901         path->nodes[level] = c;
2902         path->locks[level] = BTRFS_WRITE_LOCK;
2903         path->slots[level] = 0;
2904         return 0;
2905 }
2906
2907 /*
2908  * worker function to insert a single pointer in a node.
2909  * the node should have enough room for the pointer already
2910  *
2911  * slot and level indicate where you want the key to go, and
2912  * blocknr is the block the key points to.
2913  */
2914 static int insert_ptr(struct btrfs_trans_handle *trans,
2915                       struct btrfs_path *path,
2916                       struct btrfs_disk_key *key, u64 bytenr,
2917                       int slot, int level)
2918 {
2919         struct extent_buffer *lower;
2920         int nritems;
2921         int ret;
2922
2923         BUG_ON(!path->nodes[level]);
2924         btrfs_assert_tree_write_locked(path->nodes[level]);
2925         lower = path->nodes[level];
2926         nritems = btrfs_header_nritems(lower);
2927         BUG_ON(slot > nritems);
2928         BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
2929         if (slot != nritems) {
2930                 if (level) {
2931                         ret = btrfs_tree_mod_log_insert_move(lower, slot + 1,
2932                                         slot, nritems - slot);
2933                         if (ret < 0) {
2934                                 btrfs_abort_transaction(trans, ret);
2935                                 return ret;
2936                         }
2937                 }
2938                 memmove_extent_buffer(lower,
2939                               btrfs_node_key_ptr_offset(lower, slot + 1),
2940                               btrfs_node_key_ptr_offset(lower, slot),
2941                               (nritems - slot) * sizeof(struct btrfs_key_ptr));
2942         }
2943         if (level) {
2944                 ret = btrfs_tree_mod_log_insert_key(lower, slot,
2945                                                     BTRFS_MOD_LOG_KEY_ADD);
2946                 if (ret < 0) {
2947                         btrfs_abort_transaction(trans, ret);
2948                         return ret;
2949                 }
2950         }
2951         btrfs_set_node_key(lower, key, slot);
2952         btrfs_set_node_blockptr(lower, slot, bytenr);
2953         WARN_ON(trans->transid == 0);
2954         btrfs_set_node_ptr_generation(lower, slot, trans->transid);
2955         btrfs_set_header_nritems(lower, nritems + 1);
2956         btrfs_mark_buffer_dirty(trans, lower);
2957
2958         return 0;
2959 }
2960
2961 /*
2962  * split the node at the specified level in path in two.
2963  * The path is corrected to point to the appropriate node after the split
2964  *
2965  * Before splitting this tries to make some room in the node by pushing
2966  * left and right, if either one works, it returns right away.
2967  *
2968  * returns 0 on success and < 0 on failure
2969  */
2970 static noinline int split_node(struct btrfs_trans_handle *trans,
2971                                struct btrfs_root *root,
2972                                struct btrfs_path *path, int level)
2973 {
2974         struct btrfs_fs_info *fs_info = root->fs_info;
2975         struct extent_buffer *c;
2976         struct extent_buffer *split;
2977         struct btrfs_disk_key disk_key;
2978         int mid;
2979         int ret;
2980         u32 c_nritems;
2981
2982         c = path->nodes[level];
2983         WARN_ON(btrfs_header_generation(c) != trans->transid);
2984         if (c == root->node) {
2985                 /*
2986                  * trying to split the root, lets make a new one
2987                  *
2988                  * tree mod log: We don't log_removal old root in
2989                  * insert_new_root, because that root buffer will be kept as a
2990                  * normal node. We are going to log removal of half of the
2991                  * elements below with btrfs_tree_mod_log_eb_copy(). We're
2992                  * holding a tree lock on the buffer, which is why we cannot
2993                  * race with other tree_mod_log users.
2994                  */
2995                 ret = insert_new_root(trans, root, path, level + 1);
2996                 if (ret)
2997                         return ret;
2998         } else {
2999                 ret = push_nodes_for_insert(trans, root, path, level);
3000                 c = path->nodes[level];
3001                 if (!ret && btrfs_header_nritems(c) <
3002                     BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
3003                         return 0;
3004                 if (ret < 0)
3005                         return ret;
3006         }
3007
3008         c_nritems = btrfs_header_nritems(c);
3009         mid = (c_nritems + 1) / 2;
3010         btrfs_node_key(c, &disk_key, mid);
3011
3012         split = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3013                                        &disk_key, level, c->start, 0,
3014                                        0, BTRFS_NESTING_SPLIT);
3015         if (IS_ERR(split))
3016                 return PTR_ERR(split);
3017
3018         root_add_used_bytes(root);
3019         ASSERT(btrfs_header_level(c) == level);
3020
3021         ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
3022         if (ret) {
3023                 btrfs_tree_unlock(split);
3024                 free_extent_buffer(split);
3025                 btrfs_abort_transaction(trans, ret);
3026                 return ret;
3027         }
3028         copy_extent_buffer(split, c,
3029                            btrfs_node_key_ptr_offset(split, 0),
3030                            btrfs_node_key_ptr_offset(c, mid),
3031                            (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3032         btrfs_set_header_nritems(split, c_nritems - mid);
3033         btrfs_set_header_nritems(c, mid);
3034
3035         btrfs_mark_buffer_dirty(trans, c);
3036         btrfs_mark_buffer_dirty(trans, split);
3037
3038         ret = insert_ptr(trans, path, &disk_key, split->start,
3039                          path->slots[level + 1] + 1, level + 1);
3040         if (ret < 0) {
3041                 btrfs_tree_unlock(split);
3042                 free_extent_buffer(split);
3043                 return ret;
3044         }
3045
3046         if (path->slots[level] >= mid) {
3047                 path->slots[level] -= mid;
3048                 btrfs_tree_unlock(c);
3049                 free_extent_buffer(c);
3050                 path->nodes[level] = split;
3051                 path->slots[level + 1] += 1;
3052         } else {
3053                 btrfs_tree_unlock(split);
3054                 free_extent_buffer(split);
3055         }
3056         return 0;
3057 }
3058
3059 /*
3060  * how many bytes are required to store the items in a leaf.  start
3061  * and nr indicate which items in the leaf to check.  This totals up the
3062  * space used both by the item structs and the item data
3063  */
3064 static int leaf_space_used(const struct extent_buffer *l, int start, int nr)
3065 {
3066         int data_len;
3067         int nritems = btrfs_header_nritems(l);
3068         int end = min(nritems, start + nr) - 1;
3069
3070         if (!nr)
3071                 return 0;
3072         data_len = btrfs_item_offset(l, start) + btrfs_item_size(l, start);
3073         data_len = data_len - btrfs_item_offset(l, end);
3074         data_len += sizeof(struct btrfs_item) * nr;
3075         WARN_ON(data_len < 0);
3076         return data_len;
3077 }
3078
3079 /*
3080  * The space between the end of the leaf items and
3081  * the start of the leaf data.  IOW, how much room
3082  * the leaf has left for both items and data
3083  */
3084 int btrfs_leaf_free_space(const struct extent_buffer *leaf)
3085 {
3086         struct btrfs_fs_info *fs_info = leaf->fs_info;
3087         int nritems = btrfs_header_nritems(leaf);
3088         int ret;
3089
3090         ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
3091         if (ret < 0) {
3092                 btrfs_crit(fs_info,
3093                            "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3094                            ret,
3095                            (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
3096                            leaf_space_used(leaf, 0, nritems), nritems);
3097         }
3098         return ret;
3099 }
3100
3101 /*
3102  * min slot controls the lowest index we're willing to push to the
3103  * right.  We'll push up to and including min_slot, but no lower
3104  */
3105 static noinline int __push_leaf_right(struct btrfs_trans_handle *trans,
3106                                       struct btrfs_path *path,
3107                                       int data_size, int empty,
3108                                       struct extent_buffer *right,
3109                                       int free_space, u32 left_nritems,
3110                                       u32 min_slot)
3111 {
3112         struct btrfs_fs_info *fs_info = right->fs_info;
3113         struct extent_buffer *left = path->nodes[0];
3114         struct extent_buffer *upper = path->nodes[1];
3115         struct btrfs_map_token token;
3116         struct btrfs_disk_key disk_key;
3117         int slot;
3118         u32 i;
3119         int push_space = 0;
3120         int push_items = 0;
3121         u32 nr;
3122         u32 right_nritems;
3123         u32 data_end;
3124         u32 this_item_size;
3125
3126         if (empty)
3127                 nr = 0;
3128         else
3129                 nr = max_t(u32, 1, min_slot);
3130
3131         if (path->slots[0] >= left_nritems)
3132                 push_space += data_size;
3133
3134         slot = path->slots[1];
3135         i = left_nritems - 1;
3136         while (i >= nr) {
3137                 if (!empty && push_items > 0) {
3138                         if (path->slots[0] > i)
3139                                 break;
3140                         if (path->slots[0] == i) {
3141                                 int space = btrfs_leaf_free_space(left);
3142
3143                                 if (space + push_space * 2 > free_space)
3144                                         break;
3145                         }
3146                 }
3147
3148                 if (path->slots[0] == i)
3149                         push_space += data_size;
3150
3151                 this_item_size = btrfs_item_size(left, i);
3152                 if (this_item_size + sizeof(struct btrfs_item) +
3153                     push_space > free_space)
3154                         break;
3155
3156                 push_items++;
3157                 push_space += this_item_size + sizeof(struct btrfs_item);
3158                 if (i == 0)
3159                         break;
3160                 i--;
3161         }
3162
3163         if (push_items == 0)
3164                 goto out_unlock;
3165
3166         WARN_ON(!empty && push_items == left_nritems);
3167
3168         /* push left to right */
3169         right_nritems = btrfs_header_nritems(right);
3170
3171         push_space = btrfs_item_data_end(left, left_nritems - push_items);
3172         push_space -= leaf_data_end(left);
3173
3174         /* make room in the right data area */
3175         data_end = leaf_data_end(right);
3176         memmove_leaf_data(right, data_end - push_space, data_end,
3177                           BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
3178
3179         /* copy from the left data area */
3180         copy_leaf_data(right, left, BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3181                        leaf_data_end(left), push_space);
3182
3183         memmove_leaf_items(right, push_items, 0, right_nritems);
3184
3185         /* copy the items from left to right */
3186         copy_leaf_items(right, left, 0, left_nritems - push_items, push_items);
3187
3188         /* update the item pointers */
3189         btrfs_init_map_token(&token, right);
3190         right_nritems += push_items;
3191         btrfs_set_header_nritems(right, right_nritems);
3192         push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3193         for (i = 0; i < right_nritems; i++) {
3194                 push_space -= btrfs_token_item_size(&token, i);
3195                 btrfs_set_token_item_offset(&token, i, push_space);
3196         }
3197
3198         left_nritems -= push_items;
3199         btrfs_set_header_nritems(left, left_nritems);
3200
3201         if (left_nritems)
3202                 btrfs_mark_buffer_dirty(trans, left);
3203         else
3204                 btrfs_clear_buffer_dirty(trans, left);
3205
3206         btrfs_mark_buffer_dirty(trans, right);
3207
3208         btrfs_item_key(right, &disk_key, 0);
3209         btrfs_set_node_key(upper, &disk_key, slot + 1);
3210         btrfs_mark_buffer_dirty(trans, upper);
3211
3212         /* then fixup the leaf pointer in the path */
3213         if (path->slots[0] >= left_nritems) {
3214                 path->slots[0] -= left_nritems;
3215                 if (btrfs_header_nritems(path->nodes[0]) == 0)
3216                         btrfs_clear_buffer_dirty(trans, path->nodes[0]);
3217                 btrfs_tree_unlock(path->nodes[0]);
3218                 free_extent_buffer(path->nodes[0]);
3219                 path->nodes[0] = right;
3220                 path->slots[1] += 1;
3221         } else {
3222                 btrfs_tree_unlock(right);
3223                 free_extent_buffer(right);
3224         }
3225         return 0;
3226
3227 out_unlock:
3228         btrfs_tree_unlock(right);
3229         free_extent_buffer(right);
3230         return 1;
3231 }
3232
3233 /*
3234  * push some data in the path leaf to the right, trying to free up at
3235  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3236  *
3237  * returns 1 if the push failed because the other node didn't have enough
3238  * room, 0 if everything worked out and < 0 if there were major errors.
3239  *
3240  * this will push starting from min_slot to the end of the leaf.  It won't
3241  * push any slot lower than min_slot
3242  */
3243 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3244                            *root, struct btrfs_path *path,
3245                            int min_data_size, int data_size,
3246                            int empty, u32 min_slot)
3247 {
3248         struct extent_buffer *left = path->nodes[0];
3249         struct extent_buffer *right;
3250         struct extent_buffer *upper;
3251         int slot;
3252         int free_space;
3253         u32 left_nritems;
3254         int ret;
3255
3256         if (!path->nodes[1])
3257                 return 1;
3258
3259         slot = path->slots[1];
3260         upper = path->nodes[1];
3261         if (slot >= btrfs_header_nritems(upper) - 1)
3262                 return 1;
3263
3264         btrfs_assert_tree_write_locked(path->nodes[1]);
3265
3266         right = btrfs_read_node_slot(upper, slot + 1);
3267         if (IS_ERR(right))
3268                 return PTR_ERR(right);
3269
3270         __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
3271
3272         free_space = btrfs_leaf_free_space(right);
3273         if (free_space < data_size)
3274                 goto out_unlock;
3275
3276         ret = btrfs_cow_block(trans, root, right, upper,
3277                               slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
3278         if (ret)
3279                 goto out_unlock;
3280
3281         left_nritems = btrfs_header_nritems(left);
3282         if (left_nritems == 0)
3283                 goto out_unlock;
3284
3285         if (check_sibling_keys(left, right)) {
3286                 ret = -EUCLEAN;
3287                 btrfs_abort_transaction(trans, ret);
3288                 btrfs_tree_unlock(right);
3289                 free_extent_buffer(right);
3290                 return ret;
3291         }
3292         if (path->slots[0] == left_nritems && !empty) {
3293                 /* Key greater than all keys in the leaf, right neighbor has
3294                  * enough room for it and we're not emptying our leaf to delete
3295                  * it, therefore use right neighbor to insert the new item and
3296                  * no need to touch/dirty our left leaf. */
3297                 btrfs_tree_unlock(left);
3298                 free_extent_buffer(left);
3299                 path->nodes[0] = right;
3300                 path->slots[0] = 0;
3301                 path->slots[1]++;
3302                 return 0;
3303         }
3304
3305         return __push_leaf_right(trans, path, min_data_size, empty, right,
3306                                  free_space, left_nritems, min_slot);
3307 out_unlock:
3308         btrfs_tree_unlock(right);
3309         free_extent_buffer(right);
3310         return 1;
3311 }
3312
3313 /*
3314  * push some data in the path leaf to the left, trying to free up at
3315  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3316  *
3317  * max_slot can put a limit on how far into the leaf we'll push items.  The
3318  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us do all the
3319  * items
3320  */
3321 static noinline int __push_leaf_left(struct btrfs_trans_handle *trans,
3322                                      struct btrfs_path *path, int data_size,
3323                                      int empty, struct extent_buffer *left,
3324                                      int free_space, u32 right_nritems,
3325                                      u32 max_slot)
3326 {
3327         struct btrfs_fs_info *fs_info = left->fs_info;
3328         struct btrfs_disk_key disk_key;
3329         struct extent_buffer *right = path->nodes[0];
3330         int i;
3331         int push_space = 0;
3332         int push_items = 0;
3333         u32 old_left_nritems;
3334         u32 nr;
3335         int ret = 0;
3336         u32 this_item_size;
3337         u32 old_left_item_size;
3338         struct btrfs_map_token token;
3339
3340         if (empty)
3341                 nr = min(right_nritems, max_slot);
3342         else
3343                 nr = min(right_nritems - 1, max_slot);
3344
3345         for (i = 0; i < nr; i++) {
3346                 if (!empty && push_items > 0) {
3347                         if (path->slots[0] < i)
3348                                 break;
3349                         if (path->slots[0] == i) {
3350                                 int space = btrfs_leaf_free_space(right);
3351
3352                                 if (space + push_space * 2 > free_space)
3353                                         break;
3354                         }
3355                 }
3356
3357                 if (path->slots[0] == i)
3358                         push_space += data_size;
3359
3360                 this_item_size = btrfs_item_size(right, i);
3361                 if (this_item_size + sizeof(struct btrfs_item) + push_space >
3362                     free_space)
3363                         break;
3364
3365                 push_items++;
3366                 push_space += this_item_size + sizeof(struct btrfs_item);
3367         }
3368
3369         if (push_items == 0) {
3370                 ret = 1;
3371                 goto out;
3372         }
3373         WARN_ON(!empty && push_items == btrfs_header_nritems(right));
3374
3375         /* push data from right to left */
3376         copy_leaf_items(left, right, btrfs_header_nritems(left), 0, push_items);
3377
3378         push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
3379                      btrfs_item_offset(right, push_items - 1);
3380
3381         copy_leaf_data(left, right, leaf_data_end(left) - push_space,
3382                        btrfs_item_offset(right, push_items - 1), push_space);
3383         old_left_nritems = btrfs_header_nritems(left);
3384         BUG_ON(old_left_nritems <= 0);
3385
3386         btrfs_init_map_token(&token, left);
3387         old_left_item_size = btrfs_item_offset(left, old_left_nritems - 1);
3388         for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3389                 u32 ioff;
3390
3391                 ioff = btrfs_token_item_offset(&token, i);
3392                 btrfs_set_token_item_offset(&token, i,
3393                       ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
3394         }
3395         btrfs_set_header_nritems(left, old_left_nritems + push_items);
3396
3397         /* fixup right node */
3398         if (push_items > right_nritems)
3399                 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3400                        right_nritems);
3401
3402         if (push_items < right_nritems) {
3403                 push_space = btrfs_item_offset(right, push_items - 1) -
3404                                                   leaf_data_end(right);
3405                 memmove_leaf_data(right,
3406                                   BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3407                                   leaf_data_end(right), push_space);
3408
3409                 memmove_leaf_items(right, 0, push_items,
3410                                    btrfs_header_nritems(right) - push_items);
3411         }
3412
3413         btrfs_init_map_token(&token, right);
3414         right_nritems -= push_items;
3415         btrfs_set_header_nritems(right, right_nritems);
3416         push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3417         for (i = 0; i < right_nritems; i++) {
3418                 push_space = push_space - btrfs_token_item_size(&token, i);
3419                 btrfs_set_token_item_offset(&token, i, push_space);
3420         }
3421
3422         btrfs_mark_buffer_dirty(trans, left);
3423         if (right_nritems)
3424                 btrfs_mark_buffer_dirty(trans, right);
3425         else
3426                 btrfs_clear_buffer_dirty(trans, right);
3427
3428         btrfs_item_key(right, &disk_key, 0);
3429         fixup_low_keys(trans, path, &disk_key, 1);
3430
3431         /* then fixup the leaf pointer in the path */
3432         if (path->slots[0] < push_items) {
3433                 path->slots[0] += old_left_nritems;
3434                 btrfs_tree_unlock(path->nodes[0]);
3435                 free_extent_buffer(path->nodes[0]);
3436                 path->nodes[0] = left;
3437                 path->slots[1] -= 1;
3438         } else {
3439                 btrfs_tree_unlock(left);
3440                 free_extent_buffer(left);
3441                 path->slots[0] -= push_items;
3442         }
3443         BUG_ON(path->slots[0] < 0);
3444         return ret;
3445 out:
3446         btrfs_tree_unlock(left);
3447         free_extent_buffer(left);
3448         return ret;
3449 }
3450
3451 /*
3452  * push some data in the path leaf to the left, trying to free up at
3453  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3454  *
3455  * max_slot can put a limit on how far into the leaf we'll push items.  The
3456  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3457  * items
3458  */
3459 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
3460                           *root, struct btrfs_path *path, int min_data_size,
3461                           int data_size, int empty, u32 max_slot)
3462 {
3463         struct extent_buffer *right = path->nodes[0];
3464         struct extent_buffer *left;
3465         int slot;
3466         int free_space;
3467         u32 right_nritems;
3468         int ret = 0;
3469
3470         slot = path->slots[1];
3471         if (slot == 0)
3472                 return 1;
3473         if (!path->nodes[1])
3474                 return 1;
3475
3476         right_nritems = btrfs_header_nritems(right);
3477         if (right_nritems == 0)
3478                 return 1;
3479
3480         btrfs_assert_tree_write_locked(path->nodes[1]);
3481
3482         left = btrfs_read_node_slot(path->nodes[1], slot - 1);
3483         if (IS_ERR(left))
3484                 return PTR_ERR(left);
3485
3486         __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
3487
3488         free_space = btrfs_leaf_free_space(left);
3489         if (free_space < data_size) {
3490                 ret = 1;
3491                 goto out;
3492         }
3493
3494         ret = btrfs_cow_block(trans, root, left,
3495                               path->nodes[1], slot - 1, &left,
3496                               BTRFS_NESTING_LEFT_COW);
3497         if (ret) {
3498                 /* we hit -ENOSPC, but it isn't fatal here */
3499                 if (ret == -ENOSPC)
3500                         ret = 1;
3501                 goto out;
3502         }
3503
3504         if (check_sibling_keys(left, right)) {
3505                 ret = -EUCLEAN;
3506                 btrfs_abort_transaction(trans, ret);
3507                 goto out;
3508         }
3509         return __push_leaf_left(trans, path, min_data_size, empty, left,
3510                                 free_space, right_nritems, max_slot);
3511 out:
3512         btrfs_tree_unlock(left);
3513         free_extent_buffer(left);
3514         return ret;
3515 }
3516
3517 /*
3518  * split the path's leaf in two, making sure there is at least data_size
3519  * available for the resulting leaf level of the path.
3520  */
3521 static noinline int copy_for_split(struct btrfs_trans_handle *trans,
3522                                    struct btrfs_path *path,
3523                                    struct extent_buffer *l,
3524                                    struct extent_buffer *right,
3525                                    int slot, int mid, int nritems)
3526 {
3527         struct btrfs_fs_info *fs_info = trans->fs_info;
3528         int data_copy_size;
3529         int rt_data_off;
3530         int i;
3531         int ret;
3532         struct btrfs_disk_key disk_key;
3533         struct btrfs_map_token token;
3534
3535         nritems = nritems - mid;
3536         btrfs_set_header_nritems(right, nritems);
3537         data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l);
3538
3539         copy_leaf_items(right, l, 0, mid, nritems);
3540
3541         copy_leaf_data(right, l, BTRFS_LEAF_DATA_SIZE(fs_info) - data_copy_size,
3542                        leaf_data_end(l), data_copy_size);
3543
3544         rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid);
3545
3546         btrfs_init_map_token(&token, right);
3547         for (i = 0; i < nritems; i++) {
3548                 u32 ioff;
3549
3550                 ioff = btrfs_token_item_offset(&token, i);
3551                 btrfs_set_token_item_offset(&token, i, ioff + rt_data_off);
3552         }
3553
3554         btrfs_set_header_nritems(l, mid);
3555         btrfs_item_key(right, &disk_key, 0);
3556         ret = insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);
3557         if (ret < 0)
3558                 return ret;
3559
3560         btrfs_mark_buffer_dirty(trans, right);
3561         btrfs_mark_buffer_dirty(trans, l);
3562         BUG_ON(path->slots[0] != slot);
3563
3564         if (mid <= slot) {
3565                 btrfs_tree_unlock(path->nodes[0]);
3566                 free_extent_buffer(path->nodes[0]);
3567                 path->nodes[0] = right;
3568                 path->slots[0] -= mid;
3569                 path->slots[1] += 1;
3570         } else {
3571                 btrfs_tree_unlock(right);
3572                 free_extent_buffer(right);
3573         }
3574
3575         BUG_ON(path->slots[0] < 0);
3576
3577         return 0;
3578 }
3579
3580 /*
3581  * double splits happen when we need to insert a big item in the middle
3582  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
3583  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
3584  *          A                 B                 C
3585  *
3586  * We avoid this by trying to push the items on either side of our target
3587  * into the adjacent leaves.  If all goes well we can avoid the double split
3588  * completely.
3589  */
3590 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
3591                                           struct btrfs_root *root,
3592                                           struct btrfs_path *path,
3593                                           int data_size)
3594 {
3595         int ret;
3596         int progress = 0;
3597         int slot;
3598         u32 nritems;
3599         int space_needed = data_size;
3600
3601         slot = path->slots[0];
3602         if (slot < btrfs_header_nritems(path->nodes[0]))
3603                 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
3604
3605         /*
3606          * try to push all the items after our slot into the
3607          * right leaf
3608          */
3609         ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
3610         if (ret < 0)
3611                 return ret;
3612
3613         if (ret == 0)
3614                 progress++;
3615
3616         nritems = btrfs_header_nritems(path->nodes[0]);
3617         /*
3618          * our goal is to get our slot at the start or end of a leaf.  If
3619          * we've done so we're done
3620          */
3621         if (path->slots[0] == 0 || path->slots[0] == nritems)
3622                 return 0;
3623
3624         if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
3625                 return 0;
3626
3627         /* try to push all the items before our slot into the next leaf */
3628         slot = path->slots[0];
3629         space_needed = data_size;
3630         if (slot > 0)
3631                 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
3632         ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
3633         if (ret < 0)
3634                 return ret;
3635
3636         if (ret == 0)
3637                 progress++;
3638
3639         if (progress)
3640                 return 0;
3641         return 1;
3642 }
3643
3644 /*
3645  * split the path's leaf in two, making sure there is at least data_size
3646  * available for the resulting leaf level of the path.
3647  *
3648  * returns 0 if all went well and < 0 on failure.
3649  */
3650 static noinline int split_leaf(struct btrfs_trans_handle *trans,
3651                                struct btrfs_root *root,
3652                                const struct btrfs_key *ins_key,
3653                                struct btrfs_path *path, int data_size,
3654                                int extend)
3655 {
3656         struct btrfs_disk_key disk_key;
3657         struct extent_buffer *l;
3658         u32 nritems;
3659         int mid;
3660         int slot;
3661         struct extent_buffer *right;
3662         struct btrfs_fs_info *fs_info = root->fs_info;
3663         int ret = 0;
3664         int wret;
3665         int split;
3666         int num_doubles = 0;
3667         int tried_avoid_double = 0;
3668
3669         l = path->nodes[0];
3670         slot = path->slots[0];
3671         if (extend && data_size + btrfs_item_size(l, slot) +
3672             sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
3673                 return -EOVERFLOW;
3674
3675         /* first try to make some room by pushing left and right */
3676         if (data_size && path->nodes[1]) {
3677                 int space_needed = data_size;
3678
3679                 if (slot < btrfs_header_nritems(l))
3680                         space_needed -= btrfs_leaf_free_space(l);
3681
3682                 wret = push_leaf_right(trans, root, path, space_needed,
3683                                        space_needed, 0, 0);
3684                 if (wret < 0)
3685                         return wret;
3686                 if (wret) {
3687                         space_needed = data_size;
3688                         if (slot > 0)
3689                                 space_needed -= btrfs_leaf_free_space(l);
3690                         wret = push_leaf_left(trans, root, path, space_needed,
3691                                               space_needed, 0, (u32)-1);
3692                         if (wret < 0)
3693                                 return wret;
3694                 }
3695                 l = path->nodes[0];
3696
3697                 /* did the pushes work? */
3698                 if (btrfs_leaf_free_space(l) >= data_size)
3699                         return 0;
3700         }
3701
3702         if (!path->nodes[1]) {
3703                 ret = insert_new_root(trans, root, path, 1);
3704                 if (ret)
3705                         return ret;
3706         }
3707 again:
3708         split = 1;
3709         l = path->nodes[0];
3710         slot = path->slots[0];
3711         nritems = btrfs_header_nritems(l);
3712         mid = (nritems + 1) / 2;
3713
3714         if (mid <= slot) {
3715                 if (nritems == 1 ||
3716                     leaf_space_used(l, mid, nritems - mid) + data_size >
3717                         BTRFS_LEAF_DATA_SIZE(fs_info)) {
3718                         if (slot >= nritems) {
3719                                 split = 0;
3720                         } else {
3721                                 mid = slot;
3722                                 if (mid != nritems &&
3723                                     leaf_space_used(l, mid, nritems - mid) +
3724                                     data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
3725                                         if (data_size && !tried_avoid_double)
3726                                                 goto push_for_double;
3727                                         split = 2;
3728                                 }
3729                         }
3730                 }
3731         } else {
3732                 if (leaf_space_used(l, 0, mid) + data_size >
3733                         BTRFS_LEAF_DATA_SIZE(fs_info)) {
3734                         if (!extend && data_size && slot == 0) {
3735                                 split = 0;
3736                         } else if ((extend || !data_size) && slot == 0) {
3737                                 mid = 1;
3738                         } else {
3739                                 mid = slot;
3740                                 if (mid != nritems &&
3741                                     leaf_space_used(l, mid, nritems - mid) +
3742                                     data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
3743                                         if (data_size && !tried_avoid_double)
3744                                                 goto push_for_double;
3745                                         split = 2;
3746                                 }
3747                         }
3748                 }
3749         }
3750
3751         if (split == 0)
3752                 btrfs_cpu_key_to_disk(&disk_key, ins_key);
3753         else
3754                 btrfs_item_key(l, &disk_key, mid);
3755
3756         /*
3757          * We have to about BTRFS_NESTING_NEW_ROOT here if we've done a double
3758          * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
3759          * subclasses, which is 8 at the time of this patch, and we've maxed it
3760          * out.  In the future we could add a
3761          * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
3762          * use BTRFS_NESTING_NEW_ROOT.
3763          */
3764         right = btrfs_alloc_tree_block(trans, root, 0, root->root_key.objectid,
3765                                        &disk_key, 0, l->start, 0, 0,
3766                                        num_doubles ? BTRFS_NESTING_NEW_ROOT :
3767                                        BTRFS_NESTING_SPLIT);
3768         if (IS_ERR(right))
3769                 return PTR_ERR(right);
3770
3771         root_add_used_bytes(root);
3772
3773         if (split == 0) {
3774                 if (mid <= slot) {
3775                         btrfs_set_header_nritems(right, 0);
3776                         ret = insert_ptr(trans, path, &disk_key,
3777                                          right->start, path->slots[1] + 1, 1);
3778                         if (ret < 0) {
3779                                 btrfs_tree_unlock(right);
3780                                 free_extent_buffer(right);
3781                                 return ret;
3782                         }
3783                         btrfs_tree_unlock(path->nodes[0]);
3784                         free_extent_buffer(path->nodes[0]);
3785                         path->nodes[0] = right;
3786                         path->slots[0] = 0;
3787                         path->slots[1] += 1;
3788                 } else {
3789                         btrfs_set_header_nritems(right, 0);
3790                         ret = insert_ptr(trans, path, &disk_key,
3791                                          right->start, path->slots[1], 1);
3792                         if (ret < 0) {
3793                                 btrfs_tree_unlock(right);
3794                                 free_extent_buffer(right);
3795                                 return ret;
3796                         }
3797                         btrfs_tree_unlock(path->nodes[0]);
3798                         free_extent_buffer(path->nodes[0]);
3799                         path->nodes[0] = right;
3800                         path->slots[0] = 0;
3801                         if (path->slots[1] == 0)
3802                                 fixup_low_keys(trans, path, &disk_key, 1);
3803                 }
3804                 /*
3805                  * We create a new leaf 'right' for the required ins_len and
3806                  * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
3807                  * the content of ins_len to 'right'.
3808                  */
3809                 return ret;
3810         }
3811
3812         ret = copy_for_split(trans, path, l, right, slot, mid, nritems);
3813         if (ret < 0) {
3814                 btrfs_tree_unlock(right);
3815                 free_extent_buffer(right);
3816                 return ret;
3817         }
3818
3819         if (split == 2) {
3820                 BUG_ON(num_doubles != 0);
3821                 num_doubles++;
3822                 goto again;
3823         }
3824
3825         return 0;
3826
3827 push_for_double:
3828         push_for_double_split(trans, root, path, data_size);
3829         tried_avoid_double = 1;
3830         if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
3831                 return 0;
3832         goto again;
3833 }
3834
3835 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
3836                                          struct btrfs_root *root,
3837                                          struct btrfs_path *path, int ins_len)
3838 {
3839         struct btrfs_key key;
3840         struct extent_buffer *leaf;
3841         struct btrfs_file_extent_item *fi;
3842         u64 extent_len = 0;
3843         u32 item_size;
3844         int ret;
3845
3846         leaf = path->nodes[0];
3847         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3848
3849         BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
3850                key.type != BTRFS_EXTENT_CSUM_KEY);
3851
3852         if (btrfs_leaf_free_space(leaf) >= ins_len)
3853                 return 0;
3854
3855         item_size = btrfs_item_size(leaf, path->slots[0]);
3856         if (key.type == BTRFS_EXTENT_DATA_KEY) {
3857                 fi = btrfs_item_ptr(leaf, path->slots[0],
3858                                     struct btrfs_file_extent_item);
3859                 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
3860         }
3861         btrfs_release_path(path);
3862
3863         path->keep_locks = 1;
3864         path->search_for_split = 1;
3865         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
3866         path->search_for_split = 0;
3867         if (ret > 0)
3868                 ret = -EAGAIN;
3869         if (ret < 0)
3870                 goto err;
3871
3872         ret = -EAGAIN;
3873         leaf = path->nodes[0];
3874         /* if our item isn't there, return now */
3875         if (item_size != btrfs_item_size(leaf, path->slots[0]))
3876                 goto err;
3877
3878         /* the leaf has  changed, it now has room.  return now */
3879         if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
3880                 goto err;
3881
3882         if (key.type == BTRFS_EXTENT_DATA_KEY) {
3883                 fi = btrfs_item_ptr(leaf, path->slots[0],
3884                                     struct btrfs_file_extent_item);
3885                 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
3886                         goto err;
3887         }
3888
3889         ret = split_leaf(trans, root, &key, path, ins_len, 1);
3890         if (ret)
3891                 goto err;
3892
3893         path->keep_locks = 0;
3894         btrfs_unlock_up_safe(path, 1);
3895         return 0;
3896 err:
3897         path->keep_locks = 0;
3898         return ret;
3899 }
3900
3901 static noinline int split_item(struct btrfs_trans_handle *trans,
3902                                struct btrfs_path *path,
3903                                const struct btrfs_key *new_key,
3904                                unsigned long split_offset)
3905 {
3906         struct extent_buffer *leaf;
3907         int orig_slot, slot;
3908         char *buf;
3909         u32 nritems;
3910         u32 item_size;
3911         u32 orig_offset;
3912         struct btrfs_disk_key disk_key;
3913
3914         leaf = path->nodes[0];
3915         /*
3916          * Shouldn't happen because the caller must have previously called
3917          * setup_leaf_for_split() to make room for the new item in the leaf.
3918          */
3919         if (WARN_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item)))
3920                 return -ENOSPC;
3921
3922         orig_slot = path->slots[0];
3923         orig_offset = btrfs_item_offset(leaf, path->slots[0]);
3924         item_size = btrfs_item_size(leaf, path->slots[0]);
3925
3926         buf = kmalloc(item_size, GFP_NOFS);
3927         if (!buf)
3928                 return -ENOMEM;
3929
3930         read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
3931                             path->slots[0]), item_size);
3932
3933         slot = path->slots[0] + 1;
3934         nritems = btrfs_header_nritems(leaf);
3935         if (slot != nritems) {
3936                 /* shift the items */
3937                 memmove_leaf_items(leaf, slot + 1, slot, nritems - slot);
3938         }
3939
3940         btrfs_cpu_key_to_disk(&disk_key, new_key);
3941         btrfs_set_item_key(leaf, &disk_key, slot);
3942
3943         btrfs_set_item_offset(leaf, slot, orig_offset);
3944         btrfs_set_item_size(leaf, slot, item_size - split_offset);
3945
3946         btrfs_set_item_offset(leaf, orig_slot,
3947                                  orig_offset + item_size - split_offset);
3948         btrfs_set_item_size(leaf, orig_slot, split_offset);
3949
3950         btrfs_set_header_nritems(leaf, nritems + 1);
3951
3952         /* write the data for the start of the original item */
3953         write_extent_buffer(leaf, buf,
3954                             btrfs_item_ptr_offset(leaf, path->slots[0]),
3955                             split_offset);
3956
3957         /* write the data for the new item */
3958         write_extent_buffer(leaf, buf + split_offset,
3959                             btrfs_item_ptr_offset(leaf, slot),
3960                             item_size - split_offset);
3961         btrfs_mark_buffer_dirty(trans, leaf);
3962
3963         BUG_ON(btrfs_leaf_free_space(leaf) < 0);
3964         kfree(buf);
3965         return 0;
3966 }
3967
3968 /*
3969  * This function splits a single item into two items,
3970  * giving 'new_key' to the new item and splitting the
3971  * old one at split_offset (from the start of the item).
3972  *
3973  * The path may be released by this operation.  After
3974  * the split, the path is pointing to the old item.  The
3975  * new item is going to be in the same node as the old one.
3976  *
3977  * Note, the item being split must be smaller enough to live alone on
3978  * a tree block with room for one extra struct btrfs_item
3979  *
3980  * This allows us to split the item in place, keeping a lock on the
3981  * leaf the entire time.
3982  */
3983 int btrfs_split_item(struct btrfs_trans_handle *trans,
3984                      struct btrfs_root *root,
3985                      struct btrfs_path *path,
3986                      const struct btrfs_key *new_key,
3987                      unsigned long split_offset)
3988 {
3989         int ret;
3990         ret = setup_leaf_for_split(trans, root, path,
3991                                    sizeof(struct btrfs_item));
3992         if (ret)
3993                 return ret;
3994
3995         ret = split_item(trans, path, new_key, split_offset);
3996         return ret;
3997 }
3998
3999 /*
4000  * make the item pointed to by the path smaller.  new_size indicates
4001  * how small to make it, and from_end tells us if we just chop bytes
4002  * off the end of the item or if we shift the item to chop bytes off
4003  * the front.
4004  */
4005 void btrfs_truncate_item(struct btrfs_trans_handle *trans,
4006                          struct btrfs_path *path, u32 new_size, int from_end)
4007 {
4008         int slot;
4009         struct extent_buffer *leaf;
4010         u32 nritems;
4011         unsigned int data_end;
4012         unsigned int old_data_start;
4013         unsigned int old_size;
4014         unsigned int size_diff;
4015         int i;
4016         struct btrfs_map_token token;
4017
4018         leaf = path->nodes[0];
4019         slot = path->slots[0];
4020
4021         old_size = btrfs_item_size(leaf, slot);
4022         if (old_size == new_size)
4023                 return;
4024
4025         nritems = btrfs_header_nritems(leaf);
4026         data_end = leaf_data_end(leaf);
4027
4028         old_data_start = btrfs_item_offset(leaf, slot);
4029
4030         size_diff = old_size - new_size;
4031
4032         BUG_ON(slot < 0);
4033         BUG_ON(slot >= nritems);
4034
4035         /*
4036          * item0..itemN ... dataN.offset..dataN.size .. data0.size
4037          */
4038         /* first correct the data pointers */
4039         btrfs_init_map_token(&token, leaf);
4040         for (i = slot; i < nritems; i++) {
4041                 u32 ioff;
4042
4043                 ioff = btrfs_token_item_offset(&token, i);
4044                 btrfs_set_token_item_offset(&token, i, ioff + size_diff);
4045         }
4046
4047         /* shift the data */
4048         if (from_end) {
4049                 memmove_leaf_data(leaf, data_end + size_diff, data_end,
4050                                   old_data_start + new_size - data_end);
4051         } else {
4052                 struct btrfs_disk_key disk_key;
4053                 u64 offset;
4054
4055                 btrfs_item_key(leaf, &disk_key, slot);
4056
4057                 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
4058                         unsigned long ptr;
4059                         struct btrfs_file_extent_item *fi;
4060
4061                         fi = btrfs_item_ptr(leaf, slot,
4062                                             struct btrfs_file_extent_item);
4063                         fi = (struct btrfs_file_extent_item *)(
4064                              (unsigned long)fi - size_diff);
4065
4066                         if (btrfs_file_extent_type(leaf, fi) ==
4067                             BTRFS_FILE_EXTENT_INLINE) {
4068                                 ptr = btrfs_item_ptr_offset(leaf, slot);
4069                                 memmove_extent_buffer(leaf, ptr,
4070                                       (unsigned long)fi,
4071                                       BTRFS_FILE_EXTENT_INLINE_DATA_START);
4072                         }
4073                 }
4074
4075                 memmove_leaf_data(leaf, data_end + size_diff, data_end,
4076                                   old_data_start - data_end);
4077
4078                 offset = btrfs_disk_key_offset(&disk_key);
4079                 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
4080                 btrfs_set_item_key(leaf, &disk_key, slot);
4081                 if (slot == 0)
4082                         fixup_low_keys(trans, path, &disk_key, 1);
4083         }
4084
4085         btrfs_set_item_size(leaf, slot, new_size);
4086         btrfs_mark_buffer_dirty(trans, leaf);
4087
4088         if (btrfs_leaf_free_space(leaf) < 0) {
4089                 btrfs_print_leaf(leaf);
4090                 BUG();
4091         }
4092 }
4093
4094 /*
4095  * make the item pointed to by the path bigger, data_size is the added size.
4096  */
4097 void btrfs_extend_item(struct btrfs_trans_handle *trans,
4098                        struct btrfs_path *path, u32 data_size)
4099 {
4100         int slot;
4101         struct extent_buffer *leaf;
4102         u32 nritems;
4103         unsigned int data_end;
4104         unsigned int old_data;
4105         unsigned int old_size;
4106         int i;
4107         struct btrfs_map_token token;
4108
4109         leaf = path->nodes[0];
4110
4111         nritems = btrfs_header_nritems(leaf);
4112         data_end = leaf_data_end(leaf);
4113
4114         if (btrfs_leaf_free_space(leaf) < data_size) {
4115                 btrfs_print_leaf(leaf);
4116                 BUG();
4117         }
4118         slot = path->slots[0];
4119         old_data = btrfs_item_data_end(leaf, slot);
4120
4121         BUG_ON(slot < 0);
4122         if (slot >= nritems) {
4123                 btrfs_print_leaf(leaf);
4124                 btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
4125                            slot, nritems);
4126                 BUG();
4127         }
4128
4129         /*
4130          * item0..itemN ... dataN.offset..dataN.size .. data0.size
4131          */
4132         /* first correct the data pointers */
4133         btrfs_init_map_token(&token, leaf);
4134         for (i = slot; i < nritems; i++) {
4135                 u32 ioff;
4136
4137                 ioff = btrfs_token_item_offset(&token, i);
4138                 btrfs_set_token_item_offset(&token, i, ioff - data_size);
4139         }
4140
4141         /* shift the data */
4142         memmove_leaf_data(leaf, data_end - data_size, data_end,
4143                           old_data - data_end);
4144
4145         data_end = old_data;
4146         old_size = btrfs_item_size(leaf, slot);
4147         btrfs_set_item_size(leaf, slot, old_size + data_size);
4148         btrfs_mark_buffer_dirty(trans, leaf);
4149
4150         if (btrfs_leaf_free_space(leaf) < 0) {
4151                 btrfs_print_leaf(leaf);
4152                 BUG();
4153         }
4154 }
4155
4156 /*
4157  * Make space in the node before inserting one or more items.
4158  *
4159  * @trans:      transaction handle
4160  * @root:       root we are inserting items to
4161  * @path:       points to the leaf/slot where we are going to insert new items
4162  * @batch:      information about the batch of items to insert
4163  *
4164  * Main purpose is to save stack depth by doing the bulk of the work in a
4165  * function that doesn't call btrfs_search_slot
4166  */
4167 static void setup_items_for_insert(struct btrfs_trans_handle *trans,
4168                                    struct btrfs_root *root, struct btrfs_path *path,
4169                                    const struct btrfs_item_batch *batch)
4170 {
4171         struct btrfs_fs_info *fs_info = root->fs_info;
4172         int i;
4173         u32 nritems;
4174         unsigned int data_end;
4175         struct btrfs_disk_key disk_key;
4176         struct extent_buffer *leaf;
4177         int slot;
4178         struct btrfs_map_token token;
4179         u32 total_size;
4180
4181         /*
4182          * Before anything else, update keys in the parent and other ancestors
4183          * if needed, then release the write locks on them, so that other tasks
4184          * can use them while we modify the leaf.
4185          */
4186         if (path->slots[0] == 0) {
4187                 btrfs_cpu_key_to_disk(&disk_key, &batch->keys[0]);
4188                 fixup_low_keys(trans, path, &disk_key, 1);
4189         }
4190         btrfs_unlock_up_safe(path, 1);
4191
4192         leaf = path->nodes[0];
4193         slot = path->slots[0];
4194
4195         nritems = btrfs_header_nritems(leaf);
4196         data_end = leaf_data_end(leaf);
4197         total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
4198
4199         if (btrfs_leaf_free_space(leaf) < total_size) {
4200                 btrfs_print_leaf(leaf);
4201                 btrfs_crit(fs_info, "not enough freespace need %u have %d",
4202                            total_size, btrfs_leaf_free_space(leaf));
4203                 BUG();
4204         }
4205
4206         btrfs_init_map_token(&token, leaf);
4207         if (slot != nritems) {
4208                 unsigned int old_data = btrfs_item_data_end(leaf, slot);
4209
4210                 if (old_data < data_end) {
4211                         btrfs_print_leaf(leaf);
4212                         btrfs_crit(fs_info,
4213                 "item at slot %d with data offset %u beyond data end of leaf %u",
4214                                    slot, old_data, data_end);
4215                         BUG();
4216                 }
4217                 /*
4218                  * item0..itemN ... dataN.offset..dataN.size .. data0.size
4219                  */
4220                 /* first correct the data pointers */
4221                 for (i = slot; i < nritems; i++) {
4222                         u32 ioff;
4223
4224                         ioff = btrfs_token_item_offset(&token, i);
4225                         btrfs_set_token_item_offset(&token, i,
4226                                                        ioff - batch->total_data_size);
4227                 }
4228                 /* shift the items */
4229                 memmove_leaf_items(leaf, slot + batch->nr, slot, nritems - slot);
4230
4231                 /* shift the data */
4232                 memmove_leaf_data(leaf, data_end - batch->total_data_size,
4233                                   data_end, old_data - data_end);
4234                 data_end = old_data;
4235         }
4236
4237         /* setup the item for the new data */
4238         for (i = 0; i < batch->nr; i++) {
4239                 btrfs_cpu_key_to_disk(&disk_key, &batch->keys[i]);
4240                 btrfs_set_item_key(leaf, &disk_key, slot + i);
4241                 data_end -= batch->data_sizes[i];
4242                 btrfs_set_token_item_offset(&token, slot + i, data_end);
4243                 btrfs_set_token_item_size(&token, slot + i, batch->data_sizes[i]);
4244         }
4245
4246         btrfs_set_header_nritems(leaf, nritems + batch->nr);
4247         btrfs_mark_buffer_dirty(trans, leaf);
4248
4249         if (btrfs_leaf_free_space(leaf) < 0) {
4250                 btrfs_print_leaf(leaf);
4251                 BUG();
4252         }
4253 }
4254
4255 /*
4256  * Insert a new item into a leaf.
4257  *
4258  * @trans:     Transaction handle.
4259  * @root:      The root of the btree.
4260  * @path:      A path pointing to the target leaf and slot.
4261  * @key:       The key of the new item.
4262  * @data_size: The size of the data associated with the new key.
4263  */
4264 void btrfs_setup_item_for_insert(struct btrfs_trans_handle *trans,
4265                                  struct btrfs_root *root,
4266                                  struct btrfs_path *path,
4267                                  const struct btrfs_key *key,
4268                                  u32 data_size)
4269 {
4270         struct btrfs_item_batch batch;
4271
4272         batch.keys = key;
4273         batch.data_sizes = &data_size;
4274         batch.total_data_size = data_size;
4275         batch.nr = 1;
4276
4277         setup_items_for_insert(trans, root, path, &batch);
4278 }
4279
4280 /*
4281  * Given a key and some data, insert items into the tree.
4282  * This does all the path init required, making room in the tree if needed.
4283  */
4284 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4285                             struct btrfs_root *root,
4286                             struct btrfs_path *path,
4287                             const struct btrfs_item_batch *batch)
4288 {
4289         int ret = 0;
4290         int slot;
4291         u32 total_size;
4292
4293         total_size = batch->total_data_size + (batch->nr * sizeof(struct btrfs_item));
4294         ret = btrfs_search_slot(trans, root, &batch->keys[0], path, total_size, 1);
4295         if (ret == 0)
4296                 return -EEXIST;
4297         if (ret < 0)
4298                 return ret;
4299
4300         slot = path->slots[0];
4301         BUG_ON(slot < 0);
4302
4303         setup_items_for_insert(trans, root, path, batch);
4304         return 0;
4305 }
4306
4307 /*
4308  * Given a key and some data, insert an item into the tree.
4309  * This does all the path init required, making room in the tree if needed.
4310  */
4311 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4312                       const struct btrfs_key *cpu_key, void *data,
4313                       u32 data_size)
4314 {
4315         int ret = 0;
4316         struct btrfs_path *path;
4317         struct extent_buffer *leaf;
4318         unsigned long ptr;
4319
4320         path = btrfs_alloc_path();
4321         if (!path)
4322                 return -ENOMEM;
4323         ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4324         if (!ret) {
4325                 leaf = path->nodes[0];
4326                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4327                 write_extent_buffer(leaf, data, ptr, data_size);
4328                 btrfs_mark_buffer_dirty(trans, leaf);
4329         }
4330         btrfs_free_path(path);
4331         return ret;
4332 }
4333
4334 /*
4335  * This function duplicates an item, giving 'new_key' to the new item.
4336  * It guarantees both items live in the same tree leaf and the new item is
4337  * contiguous with the original item.
4338  *
4339  * This allows us to split a file extent in place, keeping a lock on the leaf
4340  * the entire time.
4341  */
4342 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4343                          struct btrfs_root *root,
4344                          struct btrfs_path *path,
4345                          const struct btrfs_key *new_key)
4346 {
4347         struct extent_buffer *leaf;
4348         int ret;
4349         u32 item_size;
4350
4351         leaf = path->nodes[0];
4352         item_size = btrfs_item_size(leaf, path->slots[0]);
4353         ret = setup_leaf_for_split(trans, root, path,
4354                                    item_size + sizeof(struct btrfs_item));
4355         if (ret)
4356                 return ret;
4357
4358         path->slots[0]++;
4359         btrfs_setup_item_for_insert(trans, root, path, new_key, item_size);
4360         leaf = path->nodes[0];
4361         memcpy_extent_buffer(leaf,
4362                              btrfs_item_ptr_offset(leaf, path->slots[0]),
4363                              btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4364                              item_size);
4365         return 0;
4366 }
4367
4368 /*
4369  * delete the pointer from a given node.
4370  *
4371  * the tree should have been previously balanced so the deletion does not
4372  * empty a node.
4373  *
4374  * This is exported for use inside btrfs-progs, don't un-export it.
4375  */
4376 int btrfs_del_ptr(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4377                   struct btrfs_path *path, int level, int slot)
4378 {
4379         struct extent_buffer *parent = path->nodes[level];
4380         u32 nritems;
4381         int ret;
4382
4383         nritems = btrfs_header_nritems(parent);
4384         if (slot != nritems - 1) {
4385                 if (level) {
4386                         ret = btrfs_tree_mod_log_insert_move(parent, slot,
4387                                         slot + 1, nritems - slot - 1);
4388                         if (ret < 0) {
4389                                 btrfs_abort_transaction(trans, ret);
4390                                 return ret;
4391                         }
4392                 }
4393                 memmove_extent_buffer(parent,
4394                               btrfs_node_key_ptr_offset(parent, slot),
4395                               btrfs_node_key_ptr_offset(parent, slot + 1),
4396                               sizeof(struct btrfs_key_ptr) *
4397                               (nritems - slot - 1));
4398         } else if (level) {
4399                 ret = btrfs_tree_mod_log_insert_key(parent, slot,
4400                                                     BTRFS_MOD_LOG_KEY_REMOVE);
4401                 if (ret < 0) {
4402                         btrfs_abort_transaction(trans, ret);
4403                         return ret;
4404                 }
4405         }
4406
4407         nritems--;
4408         btrfs_set_header_nritems(parent, nritems);
4409         if (nritems == 0 && parent == root->node) {
4410                 BUG_ON(btrfs_header_level(root->node) != 1);
4411                 /* just turn the root into a leaf and break */
4412                 btrfs_set_header_level(root->node, 0);
4413         } else if (slot == 0) {
4414                 struct btrfs_disk_key disk_key;
4415
4416                 btrfs_node_key(parent, &disk_key, 0);
4417                 fixup_low_keys(trans, path, &disk_key, level + 1);
4418         }
4419         btrfs_mark_buffer_dirty(trans, parent);
4420         return 0;
4421 }
4422
4423 /*
4424  * a helper function to delete the leaf pointed to by path->slots[1] and
4425  * path->nodes[1].
4426  *
4427  * This deletes the pointer in path->nodes[1] and frees the leaf
4428  * block extent.  zero is returned if it all worked out, < 0 otherwise.
4429  *
4430  * The path must have already been setup for deleting the leaf, including
4431  * all the proper balancing.  path->nodes[1] must be locked.
4432  */
4433 static noinline int btrfs_del_leaf(struct btrfs_trans_handle *trans,
4434                                    struct btrfs_root *root,
4435                                    struct btrfs_path *path,
4436                                    struct extent_buffer *leaf)
4437 {
4438         int ret;
4439
4440         WARN_ON(btrfs_header_generation(leaf) != trans->transid);
4441         ret = btrfs_del_ptr(trans, root, path, 1, path->slots[1]);
4442         if (ret < 0)
4443                 return ret;
4444
4445         /*
4446          * btrfs_free_extent is expensive, we want to make sure we
4447          * aren't holding any locks when we call it
4448          */
4449         btrfs_unlock_up_safe(path, 0);
4450
4451         root_sub_used_bytes(root);
4452
4453         atomic_inc(&leaf->refs);
4454         btrfs_free_tree_block(trans, btrfs_root_id(root), leaf, 0, 1);
4455         free_extent_buffer_stale(leaf);
4456         return 0;
4457 }
4458 /*
4459  * delete the item at the leaf level in path.  If that empties
4460  * the leaf, remove it from the tree
4461  */
4462 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4463                     struct btrfs_path *path, int slot, int nr)
4464 {
4465         struct btrfs_fs_info *fs_info = root->fs_info;
4466         struct extent_buffer *leaf;
4467         int ret = 0;
4468         int wret;
4469         u32 nritems;
4470
4471         leaf = path->nodes[0];
4472         nritems = btrfs_header_nritems(leaf);
4473
4474         if (slot + nr != nritems) {
4475                 const u32 last_off = btrfs_item_offset(leaf, slot + nr - 1);
4476                 const int data_end = leaf_data_end(leaf);
4477                 struct btrfs_map_token token;
4478                 u32 dsize = 0;
4479                 int i;
4480
4481                 for (i = 0; i < nr; i++)
4482                         dsize += btrfs_item_size(leaf, slot + i);
4483
4484                 memmove_leaf_data(leaf, data_end + dsize, data_end,
4485                                   last_off - data_end);
4486
4487                 btrfs_init_map_token(&token, leaf);
4488                 for (i = slot + nr; i < nritems; i++) {
4489                         u32 ioff;
4490
4491                         ioff = btrfs_token_item_offset(&token, i);
4492                         btrfs_set_token_item_offset(&token, i, ioff + dsize);
4493                 }
4494
4495                 memmove_leaf_items(leaf, slot, slot + nr, nritems - slot - nr);
4496         }
4497         btrfs_set_header_nritems(leaf, nritems - nr);
4498         nritems -= nr;
4499
4500         /* delete the leaf if we've emptied it */
4501         if (nritems == 0) {
4502                 if (leaf == root->node) {
4503                         btrfs_set_header_level(leaf, 0);
4504                 } else {
4505                         btrfs_clear_buffer_dirty(trans, leaf);
4506                         ret = btrfs_del_leaf(trans, root, path, leaf);
4507                         if (ret < 0)
4508                                 return ret;
4509                 }
4510         } else {
4511                 int used = leaf_space_used(leaf, 0, nritems);
4512                 if (slot == 0) {
4513                         struct btrfs_disk_key disk_key;
4514
4515                         btrfs_item_key(leaf, &disk_key, 0);
4516                         fixup_low_keys(trans, path, &disk_key, 1);
4517                 }
4518
4519                 /*
4520                  * Try to delete the leaf if it is mostly empty. We do this by
4521                  * trying to move all its items into its left and right neighbours.
4522                  * If we can't move all the items, then we don't delete it - it's
4523                  * not ideal, but future insertions might fill the leaf with more
4524                  * items, or items from other leaves might be moved later into our
4525                  * leaf due to deletions on those leaves.
4526                  */
4527                 if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
4528                         u32 min_push_space;
4529
4530                         /* push_leaf_left fixes the path.
4531                          * make sure the path still points to our leaf
4532                          * for possible call to btrfs_del_ptr below
4533                          */
4534                         slot = path->slots[1];
4535                         atomic_inc(&leaf->refs);
4536                         /*
4537                          * We want to be able to at least push one item to the
4538                          * left neighbour leaf, and that's the first item.
4539                          */
4540                         min_push_space = sizeof(struct btrfs_item) +
4541                                 btrfs_item_size(leaf, 0);
4542                         wret = push_leaf_left(trans, root, path, 0,
4543                                               min_push_space, 1, (u32)-1);
4544                         if (wret < 0 && wret != -ENOSPC)
4545                                 ret = wret;
4546
4547                         if (path->nodes[0] == leaf &&
4548                             btrfs_header_nritems(leaf)) {
4549                                 /*
4550                                  * If we were not able to push all items from our
4551                                  * leaf to its left neighbour, then attempt to
4552                                  * either push all the remaining items to the
4553                                  * right neighbour or none. There's no advantage
4554                                  * in pushing only some items, instead of all, as
4555                                  * it's pointless to end up with a leaf having
4556                                  * too few items while the neighbours can be full
4557                                  * or nearly full.
4558                                  */
4559                                 nritems = btrfs_header_nritems(leaf);
4560                                 min_push_space = leaf_space_used(leaf, 0, nritems);
4561                                 wret = push_leaf_right(trans, root, path, 0,
4562                                                        min_push_space, 1, 0);
4563                                 if (wret < 0 && wret != -ENOSPC)
4564                                         ret = wret;
4565                         }
4566
4567                         if (btrfs_header_nritems(leaf) == 0) {
4568                                 path->slots[1] = slot;
4569                                 ret = btrfs_del_leaf(trans, root, path, leaf);
4570                                 if (ret < 0)
4571                                         return ret;
4572                                 free_extent_buffer(leaf);
4573                                 ret = 0;
4574                         } else {
4575                                 /* if we're still in the path, make sure
4576                                  * we're dirty.  Otherwise, one of the
4577                                  * push_leaf functions must have already
4578                                  * dirtied this buffer
4579                                  */
4580                                 if (path->nodes[0] == leaf)
4581                                         btrfs_mark_buffer_dirty(trans, leaf);
4582                                 free_extent_buffer(leaf);
4583                         }
4584                 } else {
4585                         btrfs_mark_buffer_dirty(trans, leaf);
4586                 }
4587         }
4588         return ret;
4589 }
4590
4591 /*
4592  * A helper function to walk down the tree starting at min_key, and looking
4593  * for nodes or leaves that are have a minimum transaction id.
4594  * This is used by the btree defrag code, and tree logging
4595  *
4596  * This does not cow, but it does stuff the starting key it finds back
4597  * into min_key, so you can call btrfs_search_slot with cow=1 on the
4598  * key and get a writable path.
4599  *
4600  * This honors path->lowest_level to prevent descent past a given level
4601  * of the tree.
4602  *
4603  * min_trans indicates the oldest transaction that you are interested
4604  * in walking through.  Any nodes or leaves older than min_trans are
4605  * skipped over (without reading them).
4606  *
4607  * returns zero if something useful was found, < 0 on error and 1 if there
4608  * was nothing in the tree that matched the search criteria.
4609  */
4610 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
4611                          struct btrfs_path *path,
4612                          u64 min_trans)
4613 {
4614         struct extent_buffer *cur;
4615         struct btrfs_key found_key;
4616         int slot;
4617         int sret;
4618         u32 nritems;
4619         int level;
4620         int ret = 1;
4621         int keep_locks = path->keep_locks;
4622
4623         ASSERT(!path->nowait);
4624         path->keep_locks = 1;
4625 again:
4626         cur = btrfs_read_lock_root_node(root);
4627         level = btrfs_header_level(cur);
4628         WARN_ON(path->nodes[level]);
4629         path->nodes[level] = cur;
4630         path->locks[level] = BTRFS_READ_LOCK;
4631
4632         if (btrfs_header_generation(cur) < min_trans) {
4633                 ret = 1;
4634                 goto out;
4635         }
4636         while (1) {
4637                 nritems = btrfs_header_nritems(cur);
4638                 level = btrfs_header_level(cur);
4639                 sret = btrfs_bin_search(cur, 0, min_key, &slot);
4640                 if (sret < 0) {
4641                         ret = sret;
4642                         goto out;
4643                 }
4644
4645                 /* at the lowest level, we're done, setup the path and exit */
4646                 if (level == path->lowest_level) {
4647                         if (slot >= nritems)
4648                                 goto find_next_key;
4649                         ret = 0;
4650                         path->slots[level] = slot;
4651                         btrfs_item_key_to_cpu(cur, &found_key, slot);
4652                         goto out;
4653                 }
4654                 if (sret && slot > 0)
4655                         slot--;
4656                 /*
4657                  * check this node pointer against the min_trans parameters.
4658                  * If it is too old, skip to the next one.
4659                  */
4660                 while (slot < nritems) {
4661                         u64 gen;
4662
4663                         gen = btrfs_node_ptr_generation(cur, slot);
4664                         if (gen < min_trans) {
4665                                 slot++;
4666                                 continue;
4667                         }
4668                         break;
4669                 }
4670 find_next_key:
4671                 /*
4672                  * we didn't find a candidate key in this node, walk forward
4673                  * and find another one
4674                  */
4675                 if (slot >= nritems) {
4676                         path->slots[level] = slot;
4677                         sret = btrfs_find_next_key(root, path, min_key, level,
4678                                                   min_trans);
4679                         if (sret == 0) {
4680                                 btrfs_release_path(path);
4681                                 goto again;
4682                         } else {
4683                                 goto out;
4684                         }
4685                 }
4686                 /* save our key for returning back */
4687                 btrfs_node_key_to_cpu(cur, &found_key, slot);
4688                 path->slots[level] = slot;
4689                 if (level == path->lowest_level) {
4690                         ret = 0;
4691                         goto out;
4692                 }
4693                 cur = btrfs_read_node_slot(cur, slot);
4694                 if (IS_ERR(cur)) {
4695                         ret = PTR_ERR(cur);
4696                         goto out;
4697                 }
4698
4699                 btrfs_tree_read_lock(cur);
4700
4701                 path->locks[level - 1] = BTRFS_READ_LOCK;
4702                 path->nodes[level - 1] = cur;
4703                 unlock_up(path, level, 1, 0, NULL);
4704         }
4705 out:
4706         path->keep_locks = keep_locks;
4707         if (ret == 0) {
4708                 btrfs_unlock_up_safe(path, path->lowest_level + 1);
4709                 memcpy(min_key, &found_key, sizeof(found_key));
4710         }
4711         return ret;
4712 }
4713
4714 /*
4715  * this is similar to btrfs_next_leaf, but does not try to preserve
4716  * and fixup the path.  It looks for and returns the next key in the
4717  * tree based on the current path and the min_trans parameters.
4718  *
4719  * 0 is returned if another key is found, < 0 if there are any errors
4720  * and 1 is returned if there are no higher keys in the tree
4721  *
4722  * path->keep_locks should be set to 1 on the search made before
4723  * calling this function.
4724  */
4725 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
4726                         struct btrfs_key *key, int level, u64 min_trans)
4727 {
4728         int slot;
4729         struct extent_buffer *c;
4730
4731         WARN_ON(!path->keep_locks && !path->skip_locking);
4732         while (level < BTRFS_MAX_LEVEL) {
4733                 if (!path->nodes[level])
4734                         return 1;
4735
4736                 slot = path->slots[level] + 1;
4737                 c = path->nodes[level];
4738 next:
4739                 if (slot >= btrfs_header_nritems(c)) {
4740                         int ret;
4741                         int orig_lowest;
4742                         struct btrfs_key cur_key;
4743                         if (level + 1 >= BTRFS_MAX_LEVEL ||
4744                             !path->nodes[level + 1])
4745                                 return 1;
4746
4747                         if (path->locks[level + 1] || path->skip_locking) {
4748                                 level++;
4749                                 continue;
4750                         }
4751
4752                         slot = btrfs_header_nritems(c) - 1;
4753                         if (level == 0)
4754                                 btrfs_item_key_to_cpu(c, &cur_key, slot);
4755                         else
4756                                 btrfs_node_key_to_cpu(c, &cur_key, slot);
4757
4758                         orig_lowest = path->lowest_level;
4759                         btrfs_release_path(path);
4760                         path->lowest_level = level;
4761                         ret = btrfs_search_slot(NULL, root, &cur_key, path,
4762                                                 0, 0);
4763                         path->lowest_level = orig_lowest;
4764                         if (ret < 0)
4765                                 return ret;
4766
4767                         c = path->nodes[level];
4768                         slot = path->slots[level];
4769                         if (ret == 0)
4770                                 slot++;
4771                         goto next;
4772                 }
4773
4774                 if (level == 0)
4775                         btrfs_item_key_to_cpu(c, key, slot);
4776                 else {
4777                         u64 gen = btrfs_node_ptr_generation(c, slot);
4778
4779                         if (gen < min_trans) {
4780                                 slot++;
4781                                 goto next;
4782                         }
4783                         btrfs_node_key_to_cpu(c, key, slot);
4784                 }
4785                 return 0;
4786         }
4787         return 1;
4788 }
4789
4790 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
4791                         u64 time_seq)
4792 {
4793         int slot;
4794         int level;
4795         struct extent_buffer *c;
4796         struct extent_buffer *next;
4797         struct btrfs_fs_info *fs_info = root->fs_info;
4798         struct btrfs_key key;
4799         bool need_commit_sem = false;
4800         u32 nritems;
4801         int ret;
4802         int i;
4803
4804         /*
4805          * The nowait semantics are used only for write paths, where we don't
4806          * use the tree mod log and sequence numbers.
4807          */
4808         if (time_seq)
4809                 ASSERT(!path->nowait);
4810
4811         nritems = btrfs_header_nritems(path->nodes[0]);
4812         if (nritems == 0)
4813                 return 1;
4814
4815         btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
4816 again:
4817         level = 1;
4818         next = NULL;
4819         btrfs_release_path(path);
4820
4821         path->keep_locks = 1;
4822
4823         if (time_seq) {
4824                 ret = btrfs_search_old_slot(root, &key, path, time_seq);
4825         } else {
4826                 if (path->need_commit_sem) {
4827                         path->need_commit_sem = 0;
4828                         need_commit_sem = true;
4829                         if (path->nowait) {
4830                                 if (!down_read_trylock(&fs_info->commit_root_sem)) {
4831                                         ret = -EAGAIN;
4832                                         goto done;
4833                                 }
4834                         } else {
4835                                 down_read(&fs_info->commit_root_sem);
4836                         }
4837                 }
4838                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
4839         }
4840         path->keep_locks = 0;
4841
4842         if (ret < 0)
4843                 goto done;
4844
4845         nritems = btrfs_header_nritems(path->nodes[0]);
4846         /*
4847          * by releasing the path above we dropped all our locks.  A balance
4848          * could have added more items next to the key that used to be
4849          * at the very end of the block.  So, check again here and
4850          * advance the path if there are now more items available.
4851          */
4852         if (nritems > 0 && path->slots[0] < nritems - 1) {
4853                 if (ret == 0)
4854                         path->slots[0]++;
4855                 ret = 0;
4856                 goto done;
4857         }
4858         /*
4859          * So the above check misses one case:
4860          * - after releasing the path above, someone has removed the item that
4861          *   used to be at the very end of the block, and balance between leafs
4862          *   gets another one with bigger key.offset to replace it.
4863          *
4864          * This one should be returned as well, or we can get leaf corruption
4865          * later(esp. in __btrfs_drop_extents()).
4866          *
4867          * And a bit more explanation about this check,
4868          * with ret > 0, the key isn't found, the path points to the slot
4869          * where it should be inserted, so the path->slots[0] item must be the
4870          * bigger one.
4871          */
4872         if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
4873                 ret = 0;
4874                 goto done;
4875         }
4876
4877         while (level < BTRFS_MAX_LEVEL) {
4878                 if (!path->nodes[level]) {
4879                         ret = 1;
4880                         goto done;
4881                 }
4882
4883                 slot = path->slots[level] + 1;
4884                 c = path->nodes[level];
4885                 if (slot >= btrfs_header_nritems(c)) {
4886                         level++;
4887                         if (level == BTRFS_MAX_LEVEL) {
4888                                 ret = 1;
4889                                 goto done;
4890                         }
4891                         continue;
4892                 }
4893
4894
4895                 /*
4896                  * Our current level is where we're going to start from, and to
4897                  * make sure lockdep doesn't complain we need to drop our locks
4898                  * and nodes from 0 to our current level.
4899                  */
4900                 for (i = 0; i < level; i++) {
4901                         if (path->locks[level]) {
4902                                 btrfs_tree_read_unlock(path->nodes[i]);
4903                                 path->locks[i] = 0;
4904                         }
4905                         free_extent_buffer(path->nodes[i]);
4906                         path->nodes[i] = NULL;
4907                 }
4908
4909                 next = c;
4910                 ret = read_block_for_search(root, path, &next, level,
4911                                             slot, &key);
4912                 if (ret == -EAGAIN && !path->nowait)
4913                         goto again;
4914
4915                 if (ret < 0) {
4916                         btrfs_release_path(path);
4917                         goto done;
4918                 }
4919
4920                 if (!path->skip_locking) {
4921                         ret = btrfs_try_tree_read_lock(next);
4922                         if (!ret && path->nowait) {
4923                                 ret = -EAGAIN;
4924                                 goto done;
4925                         }
4926                         if (!ret && time_seq) {
4927                                 /*
4928                                  * If we don't get the lock, we may be racing
4929                                  * with push_leaf_left, holding that lock while
4930                                  * itself waiting for the leaf we've currently
4931                                  * locked. To solve this situation, we give up
4932                                  * on our lock and cycle.
4933                                  */
4934                                 free_extent_buffer(next);
4935                                 btrfs_release_path(path);
4936                                 cond_resched();
4937                                 goto again;
4938                         }
4939                         if (!ret)
4940                                 btrfs_tree_read_lock(next);
4941                 }
4942                 break;
4943         }
4944         path->slots[level] = slot;
4945         while (1) {
4946                 level--;
4947                 path->nodes[level] = next;
4948                 path->slots[level] = 0;
4949                 if (!path->skip_locking)
4950                         path->locks[level] = BTRFS_READ_LOCK;
4951                 if (!level)
4952                         break;
4953
4954                 ret = read_block_for_search(root, path, &next, level,
4955                                             0, &key);
4956                 if (ret == -EAGAIN && !path->nowait)
4957                         goto again;
4958
4959                 if (ret < 0) {
4960                         btrfs_release_path(path);
4961                         goto done;
4962                 }
4963
4964                 if (!path->skip_locking) {
4965                         if (path->nowait) {
4966                                 if (!btrfs_try_tree_read_lock(next)) {
4967                                         ret = -EAGAIN;
4968                                         goto done;
4969                                 }
4970                         } else {
4971                                 btrfs_tree_read_lock(next);
4972                         }
4973                 }
4974         }
4975         ret = 0;
4976 done:
4977         unlock_up(path, 0, 1, 0, NULL);
4978         if (need_commit_sem) {
4979                 int ret2;
4980
4981                 path->need_commit_sem = 1;
4982                 ret2 = finish_need_commit_sem_search(path);
4983                 up_read(&fs_info->commit_root_sem);
4984                 if (ret2)
4985                         ret = ret2;
4986         }
4987
4988         return ret;
4989 }
4990
4991 int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq)
4992 {
4993         path->slots[0]++;
4994         if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
4995                 return btrfs_next_old_leaf(root, path, time_seq);
4996         return 0;
4997 }
4998
4999 /*
5000  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5001  * searching until it gets past min_objectid or finds an item of 'type'
5002  *
5003  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5004  */
5005 int btrfs_previous_item(struct btrfs_root *root,
5006                         struct btrfs_path *path, u64 min_objectid,
5007                         int type)
5008 {
5009         struct btrfs_key found_key;
5010         struct extent_buffer *leaf;
5011         u32 nritems;
5012         int ret;
5013
5014         while (1) {
5015                 if (path->slots[0] == 0) {
5016                         ret = btrfs_prev_leaf(root, path);
5017                         if (ret != 0)
5018                                 return ret;
5019                 } else {
5020                         path->slots[0]--;
5021                 }
5022                 leaf = path->nodes[0];
5023                 nritems = btrfs_header_nritems(leaf);
5024                 if (nritems == 0)
5025                         return 1;
5026                 if (path->slots[0] == nritems)
5027                         path->slots[0]--;
5028
5029                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5030                 if (found_key.objectid < min_objectid)
5031                         break;
5032                 if (found_key.type == type)
5033                         return 0;
5034                 if (found_key.objectid == min_objectid &&
5035                     found_key.type < type)
5036                         break;
5037         }
5038         return 1;
5039 }
5040
5041 /*
5042  * search in extent tree to find a previous Metadata/Data extent item with
5043  * min objecitd.
5044  *
5045  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5046  */
5047 int btrfs_previous_extent_item(struct btrfs_root *root,
5048                         struct btrfs_path *path, u64 min_objectid)
5049 {
5050         struct btrfs_key found_key;
5051         struct extent_buffer *leaf;
5052         u32 nritems;
5053         int ret;
5054
5055         while (1) {
5056                 if (path->slots[0] == 0) {
5057                         ret = btrfs_prev_leaf(root, path);
5058                         if (ret != 0)
5059                                 return ret;
5060                 } else {
5061                         path->slots[0]--;
5062                 }
5063                 leaf = path->nodes[0];
5064                 nritems = btrfs_header_nritems(leaf);
5065                 if (nritems == 0)
5066                         return 1;
5067                 if (path->slots[0] == nritems)
5068                         path->slots[0]--;
5069
5070                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5071                 if (found_key.objectid < min_objectid)
5072                         break;
5073                 if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5074                     found_key.type == BTRFS_METADATA_ITEM_KEY)
5075                         return 0;
5076                 if (found_key.objectid == min_objectid &&
5077                     found_key.type < BTRFS_EXTENT_ITEM_KEY)
5078                         break;
5079         }
5080         return 1;
5081 }
5082
5083 int __init btrfs_ctree_init(void)
5084 {
5085         btrfs_path_cachep = kmem_cache_create("btrfs_path",
5086                         sizeof(struct btrfs_path), 0,
5087                         SLAB_MEM_SPREAD, NULL);
5088         if (!btrfs_path_cachep)
5089                 return -ENOMEM;
5090         return 0;
5091 }
5092
5093 void __cold btrfs_ctree_exit(void)
5094 {
5095         kmem_cache_destroy(btrfs_path_cachep);
5096 }