// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (C) 2007,2008 Oracle.  All rights reserved.
 */

#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/rbtree.h>
#include <linux/mm.h>
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "locking.h"
#include "volumes.h"
#include "qgroup.h"

static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
                      *root, struct btrfs_path *path, int level);
static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
                      const struct btrfs_key *ins_key, struct btrfs_path *path,
                      int data_size, int extend);
static int push_node_left(struct btrfs_trans_handle *trans,
                          struct extent_buffer *dst,
                          struct extent_buffer *src, int empty);
static int balance_node_right(struct btrfs_trans_handle *trans,
                              struct extent_buffer *dst_buf,
                              struct extent_buffer *src_buf);
static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
                    int level, int slot);

static const struct btrfs_csums {
        u16             size;
        const char      name[10];
        const char      driver[12];
} btrfs_csums[] = {
        [BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
        [BTRFS_CSUM_TYPE_XXHASH] = { .size = 8, .name = "xxhash64" },
        [BTRFS_CSUM_TYPE_SHA256] = { .size = 32, .name = "sha256" },
        [BTRFS_CSUM_TYPE_BLAKE2] = { .size = 32, .name = "blake2b",
                                     .driver = "blake2b-256" },
};

int btrfs_super_csum_size(const struct btrfs_super_block *s)
{
        u16 t = btrfs_super_csum_type(s);
        /*
         * csum type is validated at mount time
         */
        return btrfs_csums[t].size;
}

const char *btrfs_super_csum_name(u16 csum_type)
{
        /* csum type is validated at mount time */
        return btrfs_csums[csum_type].name;
}

/*
 * Return driver name if defined, otherwise the name that's also a valid driver
 * name
 */
const char *btrfs_super_csum_driver(u16 csum_type)
{
        /* csum type is validated at mount time */
        return btrfs_csums[csum_type].driver[0] ?
                btrfs_csums[csum_type].driver :
                btrfs_csums[csum_type].name;
}

size_t __attribute_const__ btrfs_get_num_csums(void)
{
        return ARRAY_SIZE(btrfs_csums);
}
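
/*
 * Illustrative sketch (not part of the original file): how a caller might map
 * a superblock's csum type to the values exposed by the helpers above.  'sb'
 * stands in for a struct btrfs_super_block pointer whose csum type has
 * already been validated at mount time, as these helpers require.
 *
 *      u16 csum_type = btrfs_super_csum_type(sb);
 *
 *      pr_info("checksum %s (%d bytes), crypto driver \"%s\"\n",
 *              btrfs_super_csum_name(csum_type),
 *              btrfs_super_csum_size(sb),
 *              btrfs_super_csum_driver(csum_type));
 */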

struct btrfs_path *btrfs_alloc_path(void)
{
        return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}

/* this also releases the path */
void btrfs_free_path(struct btrfs_path *p)
{
        if (!p)
                return;
        btrfs_release_path(p);
        kmem_cache_free(btrfs_path_cachep, p);
}

/*
 * path release drops references on the extent buffers in the path
 * and it drops any locks held by this path
 *
 * It is safe to call this on paths that have no locks or extent buffers held.
 */
noinline void btrfs_release_path(struct btrfs_path *p)
{
        int i;

        for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
                p->slots[i] = 0;
                if (!p->nodes[i])
                        continue;
                if (p->locks[i]) {
                        btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
                        p->locks[i] = 0;
                }
                free_extent_buffer(p->nodes[i]);
                p->nodes[i] = NULL;
        }
}
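
/*
 * Illustrative sketch (not part of the original file): the usual lifecycle of
 * a path.  The btrfs_search_slot() call stands in for whatever lookup the
 * caller performs; error handling is elided.
 *
 *      struct btrfs_path *path;
 *
 *      path = btrfs_alloc_path();
 *      if (!path)
 *              return -ENOMEM;
 *      ret = btrfs_search_slot(trans, root, &key, path, 0, 0);
 *      ... use path->nodes[0] / path->slots[0] ...
 *      btrfs_release_path(path);       (drops locks and refs; path is reusable)
 *      btrfs_free_path(path);          (releases and then frees the path)
 */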

/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
        struct extent_buffer *eb;

        while (1) {
                rcu_read_lock();
                eb = rcu_dereference(root->node);

                /*
                 * RCU really hurts here, we could free up the root node because
                 * it was COWed but we may not get the new root node yet so do
                 * the inc_not_zero dance and if it doesn't work then
                 * synchronize_rcu and try again.
                 */
                if (atomic_inc_not_zero(&eb->refs)) {
                        rcu_read_unlock();
                        break;
                }
                rcu_read_unlock();
                synchronize_rcu();
        }
        return eb;
}
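
/*
 * Illustrative sketch (not part of the original file): a lockless peek at the
 * current root node.  The returned buffer is pinned by the reference but may
 * stop being the root at any moment; callers that need a stable root use
 * btrfs_lock_root_node() instead.
 *
 *      struct extent_buffer *eb;
 *
 *      eb = btrfs_root_node(root);
 *      level = btrfs_header_level(eb); (safe: we hold a reference)
 *      ...
 *      free_extent_buffer(eb);         (drop the reference when done)
 */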

/*
 * Cowonly roots (not-shareable trees, everything that is not a subvolume or
 * reloc root) just get put onto a simple dirty list.  The transaction code
 * walks this list to make sure they get properly updated on disk.
 */
static void add_root_to_dirty_list(struct btrfs_root *root)
{
        struct btrfs_fs_info *fs_info = root->fs_info;

        if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
            !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
                return;

        spin_lock(&fs_info->trans_lock);
        if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
                /* Want the extent tree to be the last on the list */
                if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
                        list_move_tail(&root->dirty_list,
                                       &fs_info->dirty_cowonly_roots);
                else
                        list_move(&root->dirty_list,
                                  &fs_info->dirty_cowonly_roots);
        }
        spin_unlock(&fs_info->trans_lock);
}

/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
                      struct btrfs_root *root,
                      struct extent_buffer *buf,
                      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *cow;
        int ret = 0;
        int level;
        struct btrfs_disk_key disk_key;

        WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
                trans->transid != fs_info->running_transaction->transid);
        WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
                trans->transid != root->last_trans);

        level = btrfs_header_level(buf);
        if (level == 0)
                btrfs_item_key(buf, &disk_key, 0);
        else
                btrfs_node_key(buf, &disk_key, 0);

        cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
                                     &disk_key, level, buf->start, 0,
                                     BTRFS_NESTING_NEW_ROOT);
        if (IS_ERR(cow))
                return PTR_ERR(cow);

        copy_extent_buffer_full(cow, buf);
        btrfs_set_header_bytenr(cow, cow->start);
        btrfs_set_header_generation(cow, trans->transid);
        btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
        btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
                                     BTRFS_HEADER_FLAG_RELOC);
        if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
                btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
        else
                btrfs_set_header_owner(cow, new_root_objectid);

        write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

        WARN_ON(btrfs_header_generation(buf) > trans->transid);
        if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
                ret = btrfs_inc_ref(trans, root, cow, 1);
        else
                ret = btrfs_inc_ref(trans, root, cow, 0);
        if (ret) {
                btrfs_tree_unlock(cow);
                free_extent_buffer(cow);
                btrfs_abort_transaction(trans, ret);
                return ret;
        }

        btrfs_mark_buffer_dirty(cow);
        *cow_ret = cow;
        return 0;
}

enum mod_log_op {
        MOD_LOG_KEY_REPLACE,
        MOD_LOG_KEY_ADD,
        MOD_LOG_KEY_REMOVE,
        MOD_LOG_KEY_REMOVE_WHILE_FREEING,
        MOD_LOG_KEY_REMOVE_WHILE_MOVING,
        MOD_LOG_MOVE_KEYS,
        MOD_LOG_ROOT_REPLACE,
};

struct tree_mod_root {
        u64 logical;
        u8 level;
};

struct tree_mod_elem {
        struct rb_node node;
        u64 logical;
        u64 seq;
        enum mod_log_op op;

        /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
        int slot;

        /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
        u64 generation;

        /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
        struct btrfs_disk_key key;
        u64 blockptr;

        /* this is used for op == MOD_LOG_MOVE_KEYS */
        struct {
                int dst_slot;
                int nr_items;
        } move;

        /* this is used for op == MOD_LOG_ROOT_REPLACE */
        struct tree_mod_root old_root;
};

/*
 * Pull a new tree mod seq number for our operation.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
        return atomic64_inc_return(&fs_info->tree_mod_seq);
}

/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set.  So when a caller
 * expects to record tree modifications, it must make sure elem->seq is zero
 * before calling btrfs_get_tree_mod_seq.
 * Returns a fresh, unused tree log modification sequence number, even if no new
 * blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
                           struct seq_list *elem)
{
        write_lock(&fs_info->tree_mod_log_lock);
        if (!elem->seq) {
                elem->seq = btrfs_inc_tree_mod_seq(fs_info);
                list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
        }
        write_unlock(&fs_info->tree_mod_log_lock);

        return elem->seq;
}

void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
                            struct seq_list *elem)
{
        struct rb_root *tm_root;
        struct rb_node *node;
        struct rb_node *next;
        struct tree_mod_elem *tm;
        u64 min_seq = (u64)-1;
        u64 seq_putting = elem->seq;

        if (!seq_putting)
                return;

        write_lock(&fs_info->tree_mod_log_lock);
        list_del(&elem->list);
        elem->seq = 0;

        if (!list_empty(&fs_info->tree_mod_seq_list)) {
                struct seq_list *first;

                first = list_first_entry(&fs_info->tree_mod_seq_list,
                                         struct seq_list, list);
                if (seq_putting > first->seq) {
                        /*
                         * Blocker with lower sequence number exists, we
                         * cannot remove anything from the log.
                         */
                        write_unlock(&fs_info->tree_mod_log_lock);
                        return;
                }
                min_seq = first->seq;
        }

        /*
         * anything that's lower than the lowest existing (read: blocked)
         * sequence number can be removed from the tree.
         */
        tm_root = &fs_info->tree_mod_log;
        for (node = rb_first(tm_root); node; node = next) {
                next = rb_next(node);
                tm = rb_entry(node, struct tree_mod_elem, node);
                if (tm->seq >= min_seq)
                        continue;
                rb_erase(node, tm_root);
                kfree(tm);
        }
        write_unlock(&fs_info->tree_mod_log_lock);
}
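
/*
 * Illustrative sketch (not part of the original file): how a reader pins the
 * tree mod log around a sequence of time-travel lookups.  SEQ_LIST_INIT() is
 * assumed to be the initializer declared next to struct seq_list in ctree.h.
 *
 *      struct seq_list elem = SEQ_LIST_INIT(elem);
 *
 *      btrfs_get_tree_mod_seq(fs_info, &elem);
 *      ... replay old tree state using elem.seq as the time_seq value ...
 *      btrfs_put_tree_mod_seq(fs_info, &elem);
 *
 * While the blocker is registered, no log element with a sequence number
 * greater than or equal to elem.seq can be removed from the log.
 */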

/*
 * key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node
 * for root replace operations, or the logical address of the affected
 * block for all other operations.
 */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
        struct rb_root *tm_root;
        struct rb_node **new;
        struct rb_node *parent = NULL;
        struct tree_mod_elem *cur;

        lockdep_assert_held_write(&fs_info->tree_mod_log_lock);

        tm->seq = btrfs_inc_tree_mod_seq(fs_info);

        tm_root = &fs_info->tree_mod_log;
        new = &tm_root->rb_node;
        while (*new) {
                cur = rb_entry(*new, struct tree_mod_elem, node);
                parent = *new;
                if (cur->logical < tm->logical)
                        new = &((*new)->rb_left);
                else if (cur->logical > tm->logical)
                        new = &((*new)->rb_right);
                else if (cur->seq < tm->seq)
                        new = &((*new)->rb_left);
                else if (cur->seq > tm->seq)
                        new = &((*new)->rb_right);
                else
                        return -EEXIST;
        }

        rb_link_node(&tm->node, parent, new);
        rb_insert_color(&tm->node, tm_root);
        return 0;
}

/*
 * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
 * returns zero with the tree_mod_log_lock acquired. The caller must hold
 * this until all tree mod log insertions are recorded in the rb tree and then
 * write unlock fs_info::tree_mod_log_lock.
 */
static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
                                    struct extent_buffer *eb)
{
        smp_mb();
        if (list_empty(&(fs_info)->tree_mod_seq_list))
                return 1;
        if (eb && btrfs_header_level(eb) == 0)
                return 1;

        write_lock(&fs_info->tree_mod_log_lock);
        if (list_empty(&(fs_info)->tree_mod_seq_list)) {
                write_unlock(&fs_info->tree_mod_log_lock);
                return 1;
        }

        return 0;
}

/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
                                    struct extent_buffer *eb)
{
        smp_mb();
        if (list_empty(&(fs_info)->tree_mod_seq_list))
                return 0;
        if (eb && btrfs_header_level(eb) == 0)
                return 0;

        return 1;
}

static struct tree_mod_elem *
alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
                    enum mod_log_op op, gfp_t flags)
{
        struct tree_mod_elem *tm;

        tm = kzalloc(sizeof(*tm), flags);
        if (!tm)
                return NULL;

        tm->logical = eb->start;
        if (op != MOD_LOG_KEY_ADD) {
                btrfs_node_key(eb, &tm->key, slot);
                tm->blockptr = btrfs_node_blockptr(eb, slot);
        }
        tm->op = op;
        tm->slot = slot;
        tm->generation = btrfs_node_ptr_generation(eb, slot);
        RB_CLEAR_NODE(&tm->node);

        return tm;
}

static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
                enum mod_log_op op, gfp_t flags)
{
        struct tree_mod_elem *tm;
        int ret;

        if (!tree_mod_need_log(eb->fs_info, eb))
                return 0;

        tm = alloc_tree_mod_elem(eb, slot, op, flags);
        if (!tm)
                return -ENOMEM;

        if (tree_mod_dont_log(eb->fs_info, eb)) {
                kfree(tm);
                return 0;
        }

        ret = __tree_mod_log_insert(eb->fs_info, tm);
        write_unlock(&eb->fs_info->tree_mod_log_lock);
        if (ret)
                kfree(tm);

        return ret;
}

static noinline int tree_mod_log_insert_move(struct extent_buffer *eb,
                int dst_slot, int src_slot, int nr_items)
{
        struct tree_mod_elem *tm = NULL;
        struct tree_mod_elem **tm_list = NULL;
        int ret = 0;
        int i;
        int locked = 0;

        if (!tree_mod_need_log(eb->fs_info, eb))
                return 0;

        tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
        if (!tm_list)
                return -ENOMEM;

        tm = kzalloc(sizeof(*tm), GFP_NOFS);
        if (!tm) {
                ret = -ENOMEM;
                goto free_tms;
        }

        tm->logical = eb->start;
        tm->slot = src_slot;
        tm->move.dst_slot = dst_slot;
        tm->move.nr_items = nr_items;
        tm->op = MOD_LOG_MOVE_KEYS;

        for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
                tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
                    MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
                if (!tm_list[i]) {
                        ret = -ENOMEM;
                        goto free_tms;
                }
        }

        if (tree_mod_dont_log(eb->fs_info, eb))
                goto free_tms;
        locked = 1;

        /*
         * When we overwrite something during the move, we log these removals.
         * This can only happen when we move towards the beginning of the
         * buffer, i.e. dst_slot < src_slot.
         */
        for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
                ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]);
                if (ret)
                        goto free_tms;
        }

        ret = __tree_mod_log_insert(eb->fs_info, tm);
        if (ret)
                goto free_tms;
        write_unlock(&eb->fs_info->tree_mod_log_lock);
        kfree(tm_list);

        return 0;
free_tms:
        for (i = 0; i < nr_items; i++) {
                if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
                        rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
                kfree(tm_list[i]);
        }
        if (locked)
                write_unlock(&eb->fs_info->tree_mod_log_lock);
        kfree(tm_list);
        kfree(tm);

        return ret;
}

static inline int
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
                       struct tree_mod_elem **tm_list,
                       int nritems)
{
        int i, j;
        int ret;

        for (i = nritems - 1; i >= 0; i--) {
                ret = __tree_mod_log_insert(fs_info, tm_list[i]);
                if (ret) {
                        for (j = nritems - 1; j > i; j--)
                                rb_erase(&tm_list[j]->node,
                                         &fs_info->tree_mod_log);
                        return ret;
                }
        }

        return 0;
}

static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root,
                         struct extent_buffer *new_root, int log_removal)
{
        struct btrfs_fs_info *fs_info = old_root->fs_info;
        struct tree_mod_elem *tm = NULL;
        struct tree_mod_elem **tm_list = NULL;
        int nritems = 0;
        int ret = 0;
        int i;

        if (!tree_mod_need_log(fs_info, NULL))
                return 0;

        if (log_removal && btrfs_header_level(old_root) > 0) {
                nritems = btrfs_header_nritems(old_root);
                tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
                                  GFP_NOFS);
                if (!tm_list) {
                        ret = -ENOMEM;
                        goto free_tms;
                }
                for (i = 0; i < nritems; i++) {
                        tm_list[i] = alloc_tree_mod_elem(old_root, i,
                            MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
                        if (!tm_list[i]) {
                                ret = -ENOMEM;
                                goto free_tms;
                        }
                }
        }

        tm = kzalloc(sizeof(*tm), GFP_NOFS);
        if (!tm) {
                ret = -ENOMEM;
                goto free_tms;
        }

        tm->logical = new_root->start;
        tm->old_root.logical = old_root->start;
        tm->old_root.level = btrfs_header_level(old_root);
        tm->generation = btrfs_header_generation(old_root);
        tm->op = MOD_LOG_ROOT_REPLACE;

        if (tree_mod_dont_log(fs_info, NULL))
                goto free_tms;

        if (tm_list)
                ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
        if (!ret)
                ret = __tree_mod_log_insert(fs_info, tm);

        write_unlock(&fs_info->tree_mod_log_lock);
        if (ret)
                goto free_tms;
        kfree(tm_list);

        return ret;

free_tms:
        if (tm_list) {
                for (i = 0; i < nritems; i++)
                        kfree(tm_list[i]);
                kfree(tm_list);
        }
        kfree(tm);

        return ret;
}

static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
                      int smallest)
{
        struct rb_root *tm_root;
        struct rb_node *node;
        struct tree_mod_elem *cur = NULL;
        struct tree_mod_elem *found = NULL;

        read_lock(&fs_info->tree_mod_log_lock);
        tm_root = &fs_info->tree_mod_log;
        node = tm_root->rb_node;
        while (node) {
                cur = rb_entry(node, struct tree_mod_elem, node);
                if (cur->logical < start) {
                        node = node->rb_left;
                } else if (cur->logical > start) {
                        node = node->rb_right;
                } else if (cur->seq < min_seq) {
                        node = node->rb_left;
                } else if (!smallest) {
                        /* we want the node with the highest seq */
                        if (found)
                                BUG_ON(found->seq > cur->seq);
                        found = cur;
                        node = node->rb_left;
                } else if (cur->seq > min_seq) {
                        /* we want the node with the smallest seq */
                        if (found)
                                BUG_ON(found->seq < cur->seq);
                        found = cur;
                        node = node->rb_right;
                } else {
                        found = cur;
                        break;
                }
        }
        read_unlock(&fs_info->tree_mod_log_lock);

        return found;
}

/*
 * this returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
                           u64 min_seq)
{
        return __tree_mod_log_search(fs_info, start, min_seq, 1);
}

/*
 * this returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
        return __tree_mod_log_search(fs_info, start, min_seq, 0);
}
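
/*
 * Illustrative sketch (not part of the original file): the two search
 * directions.  For a block at logical address 'start' and a blocker sequence
 * number 'seq' from btrfs_get_tree_mod_seq():
 *
 *      tm = tree_mod_log_search(fs_info, start, seq);
 *              (newest logged operation, the starting point for a rewind)
 *      tm = tree_mod_log_search_oldest(fs_info, start, seq);
 *              (oldest logged operation, used when chasing old roots)
 *
 * Both return NULL when nothing with a sequence number >= seq is logged for
 * that block.
 */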

static noinline int tree_mod_log_eb_copy(struct extent_buffer *dst,
                     struct extent_buffer *src, unsigned long dst_offset,
                     unsigned long src_offset, int nr_items)
{
        struct btrfs_fs_info *fs_info = dst->fs_info;
        int ret = 0;
        struct tree_mod_elem **tm_list = NULL;
        struct tree_mod_elem **tm_list_add, **tm_list_rem;
        int i;
        int locked = 0;

        if (!tree_mod_need_log(fs_info, NULL))
                return 0;

        if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
                return 0;

        tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
                          GFP_NOFS);
        if (!tm_list)
                return -ENOMEM;

        tm_list_add = tm_list;
        tm_list_rem = tm_list + nr_items;
        for (i = 0; i < nr_items; i++) {
                tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
                    MOD_LOG_KEY_REMOVE, GFP_NOFS);
                if (!tm_list_rem[i]) {
                        ret = -ENOMEM;
                        goto free_tms;
                }

                tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
                    MOD_LOG_KEY_ADD, GFP_NOFS);
                if (!tm_list_add[i]) {
                        ret = -ENOMEM;
                        goto free_tms;
                }
        }

        if (tree_mod_dont_log(fs_info, NULL))
                goto free_tms;
        locked = 1;

        for (i = 0; i < nr_items; i++) {
                ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
                if (ret)
                        goto free_tms;
                ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
                if (ret)
                        goto free_tms;
        }

        write_unlock(&fs_info->tree_mod_log_lock);
        kfree(tm_list);

        return 0;

free_tms:
        for (i = 0; i < nr_items * 2; i++) {
                if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
                        rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
                kfree(tm_list[i]);
        }
        if (locked)
                write_unlock(&fs_info->tree_mod_log_lock);
        kfree(tm_list);

        return ret;
}

static noinline int tree_mod_log_free_eb(struct extent_buffer *eb)
{
        struct tree_mod_elem **tm_list = NULL;
        int nritems = 0;
        int i;
        int ret = 0;

        if (btrfs_header_level(eb) == 0)
                return 0;

        if (!tree_mod_need_log(eb->fs_info, NULL))
                return 0;

        nritems = btrfs_header_nritems(eb);
        tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
        if (!tm_list)
                return -ENOMEM;

        for (i = 0; i < nritems; i++) {
                tm_list[i] = alloc_tree_mod_elem(eb, i,
                    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
                if (!tm_list[i]) {
                        ret = -ENOMEM;
                        goto free_tms;
                }
        }

        if (tree_mod_dont_log(eb->fs_info, eb))
                goto free_tms;

        ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
        write_unlock(&eb->fs_info->tree_mod_log_lock);
        if (ret)
                goto free_tms;
        kfree(tm_list);

        return 0;

free_tms:
        for (i = 0; i < nritems; i++)
                kfree(tm_list[i]);
        kfree(tm_list);

        return ret;
}

/*
 * check if the tree block can be shared by multiple trees
 */
int btrfs_block_can_be_shared(struct btrfs_root *root,
                              struct extent_buffer *buf)
{
        /*
         * Tree blocks not in shareable trees and tree roots are never shared.
         * If a block was allocated after the last snapshot and the block was
         * not allocated by tree relocation, we know the block is not shared.
         */
        if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
            buf != root->node && buf != root->commit_root &&
            (btrfs_header_generation(buf) <=
             btrfs_root_last_snapshot(&root->root_item) ||
             btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
                return 1;

        return 0;
}

static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
                                       struct btrfs_root *root,
                                       struct extent_buffer *buf,
                                       struct extent_buffer *cow,
                                       int *last_ref)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        u64 refs;
        u64 owner;
        u64 flags;
        u64 new_flags = 0;
        int ret;

        /*
         * Backrefs update rules:
         *
         * Always use full backrefs for extent pointers in tree block
         * allocated by tree relocation.
         *
         * If a shared tree block is no longer referenced by its owner
         * tree (btrfs_header_owner(buf) == root->root_key.objectid),
         * use full backrefs for extent pointers in tree block.
         *
         * If a tree block is being relocated
         * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
         * use full backrefs for extent pointers in tree block.
         * The reason for this is that some operations (such as drop tree)
         * are only allowed for blocks that use full backrefs.
         */

        if (btrfs_block_can_be_shared(root, buf)) {
                ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
                                               btrfs_header_level(buf), 1,
                                               &refs, &flags);
                if (ret)
                        return ret;
                if (refs == 0) {
                        ret = -EROFS;
                        btrfs_handle_fs_error(fs_info, ret, NULL);
                        return ret;
                }
        } else {
                refs = 1;
                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
                    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
                        flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
                else
                        flags = 0;
        }

        owner = btrfs_header_owner(buf);
        BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
               !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

        if (refs > 1) {
                if ((owner == root->root_key.objectid ||
                     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
                    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
                        ret = btrfs_inc_ref(trans, root, buf, 1);
                        if (ret)
                                return ret;

                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID) {
                                ret = btrfs_dec_ref(trans, root, buf, 0);
                                if (ret)
                                        return ret;
                                ret = btrfs_inc_ref(trans, root, cow, 1);
                                if (ret)
                                        return ret;
                        }
                        new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
                } else {

                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID)
                                ret = btrfs_inc_ref(trans, root, cow, 1);
                        else
                                ret = btrfs_inc_ref(trans, root, cow, 0);
                        if (ret)
                                return ret;
                }
                if (new_flags != 0) {
                        int level = btrfs_header_level(buf);

                        ret = btrfs_set_disk_extent_flags(trans, buf,
                                                          new_flags, level, 0);
                        if (ret)
                                return ret;
                }
        } else {
                if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID)
                                ret = btrfs_inc_ref(trans, root, cow, 1);
                        else
                                ret = btrfs_inc_ref(trans, root, cow, 0);
                        if (ret)
                                return ret;
                        ret = btrfs_dec_ref(trans, root, buf, 1);
                        if (ret)
                                return ret;
                }
                btrfs_clean_tree_block(buf);
                *last_ref = 1;
        }
        return 0;
}

static struct extent_buffer *alloc_tree_block_no_bg_flush(
                                          struct btrfs_trans_handle *trans,
                                          struct btrfs_root *root,
                                          u64 parent_start,
                                          const struct btrfs_disk_key *disk_key,
                                          int level,
                                          u64 hint,
                                          u64 empty_size,
                                          enum btrfs_lock_nesting nest)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct extent_buffer *ret;

        /*
         * If we are COWing a node/leaf from the extent, chunk, device or free
         * space trees, make sure that we do not finish block group creation of
         * pending block groups. We do this to avoid a deadlock.
         * COWing can result in allocation of a new chunk, and flushing pending
         * block groups (btrfs_create_pending_block_groups()) can be triggered
         * when finishing allocation of a new chunk. Creation of a pending block
         * group modifies the extent, chunk, device and free space trees,
         * therefore we could deadlock with ourselves since we are holding a
         * lock on an extent buffer that btrfs_create_pending_block_groups() may
         * try to COW later.
         * For similar reasons, we also need to delay flushing pending block
         * groups when splitting a leaf or node, from one of those trees, since
         * we are holding a write lock on it and its parent or when inserting a
         * new root node for one of those trees.
         */
        if (root == fs_info->extent_root ||
            root == fs_info->chunk_root ||
            root == fs_info->dev_root ||
            root == fs_info->free_space_root)
                trans->can_flush_pending_bgs = false;

        ret = btrfs_alloc_tree_block(trans, root, parent_start,
                                     root->root_key.objectid, disk_key, level,
                                     hint, empty_size, nest);
        trans->can_flush_pending_bgs = true;

        return ret;
}

/*
 * does the dirty work in cow of a single block.  The parent block (if
 * supplied) is updated to point to the new cow copy.  The new buffer is marked
 * dirty and returned locked.  If you modify the block it needs to be marked
 * dirty again.
 *
 * search_start -- an allocation hint for the new block
 *
 * empty_size -- a hint that you plan on doing more cow.  This is the size in
 * bytes the allocator should try to find free next to the block it returns.
 * This is just a hint and may be ignored by the allocator.
 */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             struct extent_buffer *buf,
                             struct extent_buffer *parent, int parent_slot,
                             struct extent_buffer **cow_ret,
                             u64 search_start, u64 empty_size,
                             enum btrfs_lock_nesting nest)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_disk_key disk_key;
        struct extent_buffer *cow;
        int level, ret;
        int last_ref = 0;
        int unlock_orig = 0;
        u64 parent_start = 0;

        if (*cow_ret == buf)
                unlock_orig = 1;

        btrfs_assert_tree_locked(buf);

        WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
                trans->transid != fs_info->running_transaction->transid);
        WARN_ON(test_bit(BTRFS_ROOT_SHAREABLE, &root->state) &&
                trans->transid != root->last_trans);

        level = btrfs_header_level(buf);

        if (level == 0)
                btrfs_item_key(buf, &disk_key, 0);
        else
                btrfs_node_key(buf, &disk_key, 0);

        if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
                parent_start = parent->start;

        cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
                                           level, search_start, empty_size, nest);
        if (IS_ERR(cow))
                return PTR_ERR(cow);

        /* cow is set to blocking by btrfs_init_new_buffer */

        copy_extent_buffer_full(cow, buf);
        btrfs_set_header_bytenr(cow, cow->start);
        btrfs_set_header_generation(cow, trans->transid);
        btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
        btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
                                     BTRFS_HEADER_FLAG_RELOC);
        if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
                btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
        else
                btrfs_set_header_owner(cow, root->root_key.objectid);

        write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

        ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
        if (ret) {
                btrfs_tree_unlock(cow);
                free_extent_buffer(cow);
                btrfs_abort_transaction(trans, ret);
                return ret;
        }

        if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) {
                ret = btrfs_reloc_cow_block(trans, root, buf, cow);
                if (ret) {
                        btrfs_tree_unlock(cow);
                        free_extent_buffer(cow);
                        btrfs_abort_transaction(trans, ret);
                        return ret;
                }
        }

        if (buf == root->node) {
                WARN_ON(parent && parent != buf);
                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
                    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
                        parent_start = buf->start;

                atomic_inc(&cow->refs);
                ret = tree_mod_log_insert_root(root->node, cow, 1);
                BUG_ON(ret < 0);
                rcu_assign_pointer(root->node, cow);

                btrfs_free_tree_block(trans, root, buf, parent_start,
                                      last_ref);
                free_extent_buffer(buf);
                add_root_to_dirty_list(root);
        } else {
                WARN_ON(trans->transid != btrfs_header_generation(parent));
                tree_mod_log_insert_key(parent, parent_slot,
                                        MOD_LOG_KEY_REPLACE, GFP_NOFS);
                btrfs_set_node_blockptr(parent, parent_slot,
                                        cow->start);
                btrfs_set_node_ptr_generation(parent, parent_slot,
                                              trans->transid);
                btrfs_mark_buffer_dirty(parent);
                if (last_ref) {
                        ret = tree_mod_log_free_eb(buf);
                        if (ret) {
                                btrfs_tree_unlock(cow);
                                free_extent_buffer(cow);
                                btrfs_abort_transaction(trans, ret);
                                return ret;
                        }
                }
                btrfs_free_tree_block(trans, root, buf, parent_start,
                                      last_ref);
        }
        if (unlock_orig)
                btrfs_tree_unlock(buf);
        free_extent_buffer_stale(buf);
        btrfs_mark_buffer_dirty(cow);
        *cow_ret = cow;
        return 0;
}

/*
 * returns the tree mod log element recording the oldest predecessor of the
 * given root.  entries older than time_seq are ignored.
 */
static struct tree_mod_elem *__tree_mod_log_oldest_root(
                struct extent_buffer *eb_root, u64 time_seq)
{
        struct tree_mod_elem *tm;
        struct tree_mod_elem *found = NULL;
        u64 root_logical = eb_root->start;
        int looped = 0;

        if (!time_seq)
                return NULL;

        /*
         * the very last operation that's logged for a root is the
         * replacement operation (if it is replaced at all). this has
         * the logical address of the *new* root, making it the very
         * first operation that's logged for this root.
         */
        while (1) {
                tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
                                                time_seq);
                if (!looped && !tm)
                        return NULL;
                /*
                 * if there are no tree operation for the oldest root, we simply
                 * return it. this should only happen if that (old) root is at
                 * level 0.
                 */
                if (!tm)
                        break;

                /*
                 * if there's an operation that's not a root replacement, we
                 * found the oldest version of our root. normally, we'll find a
                 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
                 */
                if (tm->op != MOD_LOG_ROOT_REPLACE)
                        break;

                found = tm;
                root_logical = tm->old_root.logical;
                looped = 1;
        }

        /* if there's no old root to return, return what we found instead */
        if (!found)
                found = tm;

        return found;
}

/*
 * tm is a pointer to the first operation to rewind within eb. then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 */
static void
__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
                      u64 time_seq, struct tree_mod_elem *first_tm)
{
        u32 n;
        struct rb_node *next;
        struct tree_mod_elem *tm = first_tm;
        unsigned long o_dst;
        unsigned long o_src;
        unsigned long p_size = sizeof(struct btrfs_key_ptr);

        n = btrfs_header_nritems(eb);
        read_lock(&fs_info->tree_mod_log_lock);
        while (tm && tm->seq >= time_seq) {
                /*
                 * all the operations are recorded with the operator used for
                 * the modification. as we're going backwards, we do the
                 * opposite of each operation here.
                 */
                switch (tm->op) {
                case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
                        BUG_ON(tm->slot < n);
                        fallthrough;
                case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
                case MOD_LOG_KEY_REMOVE:
                        btrfs_set_node_key(eb, &tm->key, tm->slot);
                        btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
                        btrfs_set_node_ptr_generation(eb, tm->slot,
                                                      tm->generation);
                        n++;
                        break;
                case MOD_LOG_KEY_REPLACE:
                        BUG_ON(tm->slot >= n);
                        btrfs_set_node_key(eb, &tm->key, tm->slot);
                        btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
                        btrfs_set_node_ptr_generation(eb, tm->slot,
                                                      tm->generation);
                        break;
                case MOD_LOG_KEY_ADD:
                        /* if a move operation is needed it's in the log */
                        n--;
                        break;
                case MOD_LOG_MOVE_KEYS:
                        o_dst = btrfs_node_key_ptr_offset(tm->slot);
                        o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
                        memmove_extent_buffer(eb, o_dst, o_src,
                                              tm->move.nr_items * p_size);
                        break;
                case MOD_LOG_ROOT_REPLACE:
                        /*
                         * this operation is special. for roots, this must be
                         * handled explicitly before rewinding.
                         * for non-roots, this operation may exist if the node
                         * was a root: root A -> child B; then A gets empty and
                         * B is promoted to the new root. in the mod log, we'll
                         * have a root-replace operation for B, a tree block
                         * that is not a root. we simply ignore that operation.
                         */
                        break;
                }
                next = rb_next(&tm->node);
                if (!next)
                        break;
                tm = rb_entry(next, struct tree_mod_elem, node);
                if (tm->logical != first_tm->logical)
                        break;
        }
        read_unlock(&fs_info->tree_mod_log_lock);
        btrfs_set_header_nritems(eb, n);
}

/*
 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 */
static struct extent_buffer *
tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
                    struct extent_buffer *eb, u64 time_seq)
{
        struct extent_buffer *eb_rewin;
        struct tree_mod_elem *tm;

        if (!time_seq)
                return eb;

        if (btrfs_header_level(eb) == 0)
                return eb;

        tm = tree_mod_log_search(fs_info, eb->start, time_seq);
        if (!tm)
                return eb;

        btrfs_set_path_blocking(path);
        btrfs_set_lock_blocking_read(eb);

        if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
                BUG_ON(tm->slot != 0);
                eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
                if (!eb_rewin) {
                        btrfs_tree_read_unlock_blocking(eb);
                        free_extent_buffer(eb);
                        return NULL;
                }
                btrfs_set_header_bytenr(eb_rewin, eb->start);
                btrfs_set_header_backref_rev(eb_rewin,
                                             btrfs_header_backref_rev(eb));
                btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
                btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
        } else {
                eb_rewin = btrfs_clone_extent_buffer(eb);
                if (!eb_rewin) {
                        btrfs_tree_read_unlock_blocking(eb);
                        free_extent_buffer(eb);
                        return NULL;
                }
        }

        btrfs_tree_read_unlock_blocking(eb);
        free_extent_buffer(eb);

        btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
                                       eb_rewin, btrfs_header_level(eb_rewin));
        btrfs_tree_read_lock(eb_rewin);
        __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
        WARN_ON(btrfs_header_nritems(eb_rewin) >
                BTRFS_NODEPTRS_PER_BLOCK(fs_info));

        return eb_rewin;
}

/*
 * get_old_root() rewinds the state of @root's root node to the given @time_seq
 * value. If there are no changes, the current root->node is returned. If
1325  * anything changed in between, there's a fresh buffer allocated on which the
1326  * rewind operations are done. In any case, the returned buffer is read locked.
1327  * Returns NULL on error (with no locks held).
1328  */
1329 static inline struct extent_buffer *
1330 get_old_root(struct btrfs_root *root, u64 time_seq)
1331 {
1332         struct btrfs_fs_info *fs_info = root->fs_info;
1333         struct tree_mod_elem *tm;
1334         struct extent_buffer *eb = NULL;
1335         struct extent_buffer *eb_root;
1336         u64 eb_root_owner = 0;
1337         struct extent_buffer *old;
1338         struct tree_mod_root *old_root = NULL;
1339         u64 old_generation = 0;
1340         u64 logical;
1341         int level;
1342
1343         eb_root = btrfs_read_lock_root_node(root);
1344         tm = __tree_mod_log_oldest_root(eb_root, time_seq);
1345         if (!tm)
1346                 return eb_root;
1347
1348         if (tm->op == MOD_LOG_ROOT_REPLACE) {
1349                 old_root = &tm->old_root;
1350                 old_generation = tm->generation;
1351                 logical = old_root->logical;
1352                 level = old_root->level;
1353         } else {
1354                 logical = eb_root->start;
1355                 level = btrfs_header_level(eb_root);
1356         }
1357
1358         tm = tree_mod_log_search(fs_info, logical, time_seq);
1359         if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
1360                 btrfs_tree_read_unlock(eb_root);
1361                 free_extent_buffer(eb_root);
1362                 old = read_tree_block(fs_info, logical, 0, level, NULL);
1363                 if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
1364                         if (!IS_ERR(old))
1365                                 free_extent_buffer(old);
1366                         btrfs_warn(fs_info,
1367                                    "failed to read tree block %llu from get_old_root",
1368                                    logical);
1369                 } else {
1370                         struct tree_mod_elem *tm2;
1371
1372                         btrfs_tree_read_lock(old);
1373                         eb = btrfs_clone_extent_buffer(old);
1374                         /*
1375                          * After the lookup for the most recent tree mod operation
1376                          * above and before we locked and cloned the extent buffer
1377                          * 'old', a new tree mod log operation may have been added.
1378                          * So lookup for a more recent one to make sure the number
1379                          * of mod log operations we replay is consistent with the
1380                          * number of items we have in the cloned extent buffer,
1381                          * otherwise we can hit a BUG_ON when rewinding the extent
1382                          * buffer.
1383                          */
1384                         tm2 = tree_mod_log_search(fs_info, logical, time_seq);
1385                         btrfs_tree_read_unlock(old);
1386                         free_extent_buffer(old);
1387                         ASSERT(tm2);
1388                         ASSERT(tm2 == tm || tm2->seq > tm->seq);
1389                         if (!tm2 || tm2->seq < tm->seq) {
1390                                 free_extent_buffer(eb);
1391                                 return NULL;
1392                         }
1393                         tm = tm2;
1394                 }
1395         } else if (old_root) {
1396                 eb_root_owner = btrfs_header_owner(eb_root);
1397                 btrfs_tree_read_unlock(eb_root);
1398                 free_extent_buffer(eb_root);
1399                 eb = alloc_dummy_extent_buffer(fs_info, logical);
1400         } else {
1401                 btrfs_set_lock_blocking_read(eb_root);
1402                 eb = btrfs_clone_extent_buffer(eb_root);
1403                 btrfs_tree_read_unlock_blocking(eb_root);
1404                 free_extent_buffer(eb_root);
1405         }
1406
1407         if (!eb)
1408                 return NULL;
1409         if (old_root) {
1410                 btrfs_set_header_bytenr(eb, eb->start);
1411                 btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
1412                 btrfs_set_header_owner(eb, eb_root_owner);
1413                 btrfs_set_header_level(eb, old_root->level);
1414                 btrfs_set_header_generation(eb, old_generation);
1415         }
1416         btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
1417                                        btrfs_header_level(eb));
1418         btrfs_tree_read_lock(eb);
1419         if (tm)
1420                 __tree_mod_log_rewind(fs_info, eb, time_seq, tm);
1421         else
1422                 WARN_ON(btrfs_header_level(eb) != 0);
1423         WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));
1424
1425         return eb;
1426 }
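
/*
 * To summarize the three cases above: get_old_root() hands back a private,
 * rewound copy of the root as it looked at 'time_seq', either a clone of an
 * older block still on disk, a dummy buffer rebuilt purely from the tree mod
 * log when the old root was replaced, or a clone of the current root, with
 * logged operations then replayed through __tree_mod_log_rewind().
 */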
1427
1428 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
1429 {
1430         struct tree_mod_elem *tm;
1431         int level;
1432         struct extent_buffer *eb_root = btrfs_root_node(root);
1433
1434         tm = __tree_mod_log_oldest_root(eb_root, time_seq);
1435         if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
1436                 level = tm->old_root.level;
1437         } else {
1438                 level = btrfs_header_level(eb_root);
1439         }
1440         free_extent_buffer(eb_root);
1441
1442         return level;
1443 }
1444
1445 static inline int should_cow_block(struct btrfs_trans_handle *trans,
1446                                    struct btrfs_root *root,
1447                                    struct extent_buffer *buf)
1448 {
1449         if (btrfs_is_testing(root->fs_info))
1450                 return 0;
1451
1452         /* Ensure we can see the FORCE_COW bit */
1453         smp_mb__before_atomic();
1454
1455         /*
1456          * We do not need to COW a block if:
1457          * 1) the block was created or changed in this transaction and has
1458          *    not been written back yet;
1459          * 2) the block is not flagged RELOC, or this is the relocation tree;
1460          * 3) the root is not forced to COW.
1461          *
1462          * What forced COW means: when we create a snapshot while committing
1463          * the transaction, after we've finished copying the source root we
1464          * must COW the shared blocks to keep the metadata consistent.
1465          */
1466         if (btrfs_header_generation(buf) == trans->transid &&
1467             !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
1468             !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1469               btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
1470             !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
1471                 return 0;
1472         return 1;
1473 }
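
/*
 * Example of the rule above (illustrative): a node allocated in transaction
 * N and modified again later within transaction N, before writeback, needs
 * no second COW; once transaction N+1 touches it, its generation is older
 * than the running transid and it must be COWed again.
 */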
1474
1475 /*
1476  * COW a single block; see __btrfs_cow_block() for the real work.
1477  * This version has extra checks so that a block isn't COWed more than
1478  * once per transaction, as long as it hasn't been written yet.
1479  */
1480 noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
1481                     struct btrfs_root *root, struct extent_buffer *buf,
1482                     struct extent_buffer *parent, int parent_slot,
1483                     struct extent_buffer **cow_ret,
1484                     enum btrfs_lock_nesting nest)
1485 {
1486         struct btrfs_fs_info *fs_info = root->fs_info;
1487         u64 search_start;
1488         int ret;
1489
1490         if (test_bit(BTRFS_ROOT_DELETING, &root->state))
1491                 btrfs_err(fs_info,
1492                         "COW'ing blocks on a fs root that's being dropped");
1493
1494         if (trans->transaction != fs_info->running_transaction)
1495                 WARN(1, KERN_CRIT "trans %llu running %llu\n",
1496                        trans->transid,
1497                        fs_info->running_transaction->transid);
1498
1499         if (trans->transid != fs_info->generation)
1500                 WARN(1, KERN_CRIT "trans %llu running %llu\n",
1501                        trans->transid, fs_info->generation);
1502
1503         if (!should_cow_block(trans, root, buf)) {
1504                 trans->dirty = true;
1505                 *cow_ret = buf;
1506                 return 0;
1507         }
1508
1509         search_start = buf->start & ~((u64)SZ_1G - 1);
1510
1511         if (parent)
1512                 btrfs_set_lock_blocking_write(parent);
1513         btrfs_set_lock_blocking_write(buf);
1514
1515         /*
1516          * Before CoWing this block for later modification, check if it's
1517          * the subtree root and do the delayed subtree trace if needed.
1518          *
1519          * Also, we don't care about the error, as it's handled internally.
1520          */
1521         btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
1522         ret = __btrfs_cow_block(trans, root, buf, parent,
1523                                  parent_slot, cow_ret, search_start, 0, nest);
1524
1525         trace_btrfs_cow_block(root, buf, *cow_ret);
1526
1527         return ret;
1528 }
1529
1530 /*
1531  * helper function for defrag to decide if two blocks pointed to by a
1532  * node are actually close by
1533  */
1534 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
1535 {
1536         if (blocknr < other && other - (blocknr + blocksize) < 32768)
1537                 return 1;
1538         if (blocknr > other && blocknr - (other + blocksize) < 32768)
1539                 return 1;
1540         return 0;
1541 }
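
/*
 * Worked example (illustrative numbers): with a 16KiB nodesize, blocks at
 * byte offsets 0 and 40960 are close, since 40960 - (0 + 16384) = 24576 is
 * below the 32KiB threshold, while a neighbour at 65536 is not, since
 * 65536 - 16384 = 49152.
 */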
1542
1543 #ifdef __LITTLE_ENDIAN
1544
1545 /*
1546  * Compare two keys.  On little-endian machines the disk byte order matches
1547  * the CPU byte order, so we can avoid the conversion.
1548  */
1549 static int comp_keys(const struct btrfs_disk_key *disk_key,
1550                      const struct btrfs_key *k2)
1551 {
1552         const struct btrfs_key *k1 = (const struct btrfs_key *)disk_key;
1553
1554         return btrfs_comp_cpu_keys(k1, k2);
1555 }
1556
1557 #else
1558
1559 /*
1560  * compare two keys in a memcmp fashion
1561  */
1562 static int comp_keys(const struct btrfs_disk_key *disk,
1563                      const struct btrfs_key *k2)
1564 {
1565         struct btrfs_key k1;
1566
1567         btrfs_disk_key_to_cpu(&k1, disk);
1568
1569         return btrfs_comp_cpu_keys(&k1, k2);
1570 }
1571 #endif
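
/*
 * The cast in the little-endian comp_keys() above is safe because struct
 * btrfs_key and struct btrfs_disk_key are both packed to the same 17-byte
 * field layout (objectid, type, offset); only the integer byte order
 * differs, and on little-endian CPUs that too matches the on-disk format.
 */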
1572
1573 /*
1574  * Same as comp_keys(), only with two struct btrfs_key arguments.
1575  */
1576 int __pure btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
1577 {
1578         if (k1->objectid > k2->objectid)
1579                 return 1;
1580         if (k1->objectid < k2->objectid)
1581                 return -1;
1582         if (k1->type > k2->type)
1583                 return 1;
1584         if (k1->type < k2->type)
1585                 return -1;
1586         if (k1->offset > k2->offset)
1587                 return 1;
1588         if (k1->offset < k2->offset)
1589                 return -1;
1590         return 0;
1591 }
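
/*
 * Keys therefore sort as (objectid, type, offset) tuples.  For example, with
 * illustrative values, {257, BTRFS_EXTENT_DATA_KEY, 0} sorts before
 * {257, BTRFS_EXTENT_DATA_KEY, 4096}: objectid and type are equal, and
 * 0 < 4096 decides.
 */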
1592
1593 /*
1594  * This is used by the defrag code to go through all the
1595  * leaves pointed to by a node and reallocate them so that
1596  * disk order is close to key order.
1597  */
1598 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1599                        struct btrfs_root *root, struct extent_buffer *parent,
1600                        int start_slot, u64 *last_ret,
1601                        struct btrfs_key *progress)
1602 {
1603         struct btrfs_fs_info *fs_info = root->fs_info;
1604         struct extent_buffer *cur;
1605         u64 blocknr;
1606         u64 gen;
1607         u64 search_start = *last_ret;
1608         u64 last_block = 0;
1609         u64 other;
1610         u32 parent_nritems;
1611         int end_slot;
1612         int i;
1613         int err = 0;
1614         int parent_level;
1615         int uptodate;
1616         u32 blocksize;
1617         int progress_passed = 0;
1618         struct btrfs_disk_key disk_key;
1619
1620         parent_level = btrfs_header_level(parent);
1621
1622         WARN_ON(trans->transaction != fs_info->running_transaction);
1623         WARN_ON(trans->transid != fs_info->generation);
1624
1625         parent_nritems = btrfs_header_nritems(parent);
1626         blocksize = fs_info->nodesize;
1627         end_slot = parent_nritems - 1;
1628
1629         if (parent_nritems <= 1)
1630                 return 0;
1631
1632         btrfs_set_lock_blocking_write(parent);
1633
1634         for (i = start_slot; i <= end_slot; i++) {
1635                 struct btrfs_key first_key;
1636                 int close = 1;
1637
1638                 btrfs_node_key(parent, &disk_key, i);
1639                 if (!progress_passed && comp_keys(&disk_key, progress) < 0)
1640                         continue;
1641
1642                 progress_passed = 1;
1643                 blocknr = btrfs_node_blockptr(parent, i);
1644                 gen = btrfs_node_ptr_generation(parent, i);
1645                 btrfs_node_key_to_cpu(parent, &first_key, i);
1646                 if (last_block == 0)
1647                         last_block = blocknr;
1648
1649                 if (i > 0) {
1650                         other = btrfs_node_blockptr(parent, i - 1);
1651                         close = close_blocks(blocknr, other, blocksize);
1652                 }
1653                 if (!close && i < end_slot) {
1654                         other = btrfs_node_blockptr(parent, i + 1);
1655                         close = close_blocks(blocknr, other, blocksize);
1656                 }
1657                 if (close) {
1658                         last_block = blocknr;
1659                         continue;
1660                 }
1661
1662                 cur = find_extent_buffer(fs_info, blocknr);
1663                 if (cur)
1664                         uptodate = btrfs_buffer_uptodate(cur, gen, 0);
1665                 else
1666                         uptodate = 0;
1667                 if (!cur || !uptodate) {
1668                         if (!cur) {
1669                                 cur = read_tree_block(fs_info, blocknr, gen,
1670                                                       parent_level - 1,
1671                                                       &first_key);
1672                                 if (IS_ERR(cur)) {
1673                                         return PTR_ERR(cur);
1674                                 } else if (!extent_buffer_uptodate(cur)) {
1675                                         free_extent_buffer(cur);
1676                                         return -EIO;
1677                                 }
1678                         } else if (!uptodate) {
1679                                 err = btrfs_read_buffer(cur, gen,
1680                                                 parent_level - 1, &first_key);
1681                                 if (err) {
1682                                         free_extent_buffer(cur);
1683                                         return err;
1684                                 }
1685                         }
1686                 }
1687                 if (search_start == 0)
1688                         search_start = last_block;
1689
1690                 btrfs_tree_lock(cur);
1691                 btrfs_set_lock_blocking_write(cur);
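                /*
                 * COW the block near search_start; the size argument below is
                 * an allocator hint for how much contiguous free space to look
                 * for, capped at 16 node sizes.
                 */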
1692                 err = __btrfs_cow_block(trans, root, cur, parent, i,
1693                                         &cur, search_start,
1694                                         min(16 * blocksize,
1695                                             (end_slot - i) * blocksize),
1696                                         BTRFS_NESTING_COW);
1697                 if (err) {
1698                         btrfs_tree_unlock(cur);
1699                         free_extent_buffer(cur);
1700                         break;
1701                 }
1702                 search_start = cur->start;
1703                 last_block = cur->start;
1704                 *last_ret = search_start;
1705                 btrfs_tree_unlock(cur);
1706                 free_extent_buffer(cur);
1707         }
1708         return err;
1709 }
1710
1711 /*
1712  * Search for key in the extent_buffer.  The items start at offset p,
1713  * and they are item_size apart.  There are 'max' items in p.
1714  *
1715  * The slot in the array is returned via slot; it points to
1716  * the place where you would insert the key if it is not found in
1717  * the array.
1718  *
1719  * slot may point to max if the key is bigger than all of the keys.
1720  */
1721 static noinline int generic_bin_search(struct extent_buffer *eb,
1722                                        unsigned long p, int item_size,
1723                                        const struct btrfs_key *key,
1724                                        int max, int *slot)
1725 {
1726         int low = 0;
1727         int high = max;
1728         int ret;
1729         const int key_size = sizeof(struct btrfs_disk_key);
1730
1731         if (low > high) {
1732                 btrfs_err(eb->fs_info,
1733                  "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
1734                           __func__, low, high, eb->start,
1735                           btrfs_header_owner(eb), btrfs_header_level(eb));
1736                 return -EINVAL;
1737         }
1738
1739         while (low < high) {
1740                 unsigned long oip;
1741                 unsigned long offset;
1742                 struct btrfs_disk_key *tmp;
1743                 struct btrfs_disk_key unaligned;
1744                 int mid;
1745
1746                 mid = (low + high) / 2;
1747                 offset = p + mid * item_size;
1748                 oip = offset_in_page(offset);
1749
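                /*
                 * Compare the key in place when it sits entirely within one
                 * page of the extent buffer; otherwise copy it out, since it
                 * straddles a page boundary.
                 */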
1750                 if (oip + key_size <= PAGE_SIZE) {
1751                         const unsigned long idx = offset >> PAGE_SHIFT;
1752                         char *kaddr = page_address(eb->pages[idx]);
1753
1754                         tmp = (struct btrfs_disk_key *)(kaddr + oip);
1755                 } else {
1756                         read_extent_buffer(eb, &unaligned, offset, key_size);
1757                         tmp = &unaligned;
1758                 }
1759
1760                 ret = comp_keys(tmp, key);
1761
1762                 if (ret < 0)
1763                         low = mid + 1;
1764                 else if (ret > 0)
1765                         high = mid;
1766                 else {
1767                         *slot = mid;
1768                         return 0;
1769                 }
1770         }
1771         *slot = low;
1772         return 1;
1773 }
1774
1775 /*
1776  * simple bin_search frontend that does the right thing for
1777  * leaves vs nodes
1778  */
1779 int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
1780                      int *slot)
1781 {
1782         if (btrfs_header_level(eb) == 0)
1783                 return generic_bin_search(eb,
1784                                           offsetof(struct btrfs_leaf, items),
1785                                           sizeof(struct btrfs_item),
1786                                           key, btrfs_header_nritems(eb),
1787                                           slot);
1788         else
1789                 return generic_bin_search(eb,
1790                                           offsetof(struct btrfs_node, ptrs),
1791                                           sizeof(struct btrfs_key_ptr),
1792                                           key, btrfs_header_nritems(eb),
1793                                           slot);
1794 }
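
/*
 * For reference, the two layouts searched above: in a leaf, each key sits at
 * the start of a struct btrfs_item (a 17-byte disk key followed by the item's
 * data offset and size), while in an interior node the keys live in struct
 * btrfs_key_ptr entries (key, block pointer, generation).
 */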
1795
1796 static void root_add_used(struct btrfs_root *root, u32 size)
1797 {
1798         spin_lock(&root->accounting_lock);
1799         btrfs_set_root_used(&root->root_item,
1800                             btrfs_root_used(&root->root_item) + size);
1801         spin_unlock(&root->accounting_lock);
1802 }
1803
1804 static void root_sub_used(struct btrfs_root *root, u32 size)
1805 {
1806         spin_lock(&root->accounting_lock);
1807         btrfs_set_root_used(&root->root_item,
1808                             btrfs_root_used(&root->root_item) - size);
1809         spin_unlock(&root->accounting_lock);
1810 }
1811
1812 /* Given a node and a slot number, read the block the slot points to.  The
1813  * extent buffer is returned with a reference taken (but unlocked).
1814  */
1815 struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
1816                                            int slot)
1817 {
1818         int level = btrfs_header_level(parent);
1819         struct extent_buffer *eb;
1820         struct btrfs_key first_key;
1821
1822         if (slot < 0 || slot >= btrfs_header_nritems(parent))
1823                 return ERR_PTR(-ENOENT);
1824
1825         BUG_ON(level == 0);
1826
1827         btrfs_node_key_to_cpu(parent, &first_key, slot);
1828         eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
1829                              btrfs_node_ptr_generation(parent, slot),
1830                              level - 1, &first_key);
1831         if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
1832                 free_extent_buffer(eb);
1833                 eb = ERR_PTR(-EIO);
1834         }
1835
1836         return eb;
1837 }
1838
1839 /*
1840  * Node level balancing, used to make sure nodes are in proper order for
1841  * item deletion.  We balance from the top down, so we have to make sure
1842  * that a deletion won't leave a node completely empty later on.
1843  */
1844 static noinline int balance_level(struct btrfs_trans_handle *trans,
1845                          struct btrfs_root *root,
1846                          struct btrfs_path *path, int level)
1847 {
1848         struct btrfs_fs_info *fs_info = root->fs_info;
1849         struct extent_buffer *right = NULL;
1850         struct extent_buffer *mid;
1851         struct extent_buffer *left = NULL;
1852         struct extent_buffer *parent = NULL;
1853         int ret = 0;
1854         int wret;
1855         int pslot;
1856         int orig_slot = path->slots[level];
1857         u64 orig_ptr;
1858
1859         ASSERT(level > 0);
1860
1861         mid = path->nodes[level];
1862
1863         WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
1864                 path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
1865         WARN_ON(btrfs_header_generation(mid) != trans->transid);
1866
1867         orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1868
1869         if (level < BTRFS_MAX_LEVEL - 1) {
1870                 parent = path->nodes[level + 1];
1871                 pslot = path->slots[level + 1];
1872         }
1873
1874         /*
1875          * deal with the case where there is only one pointer in the root
1876          * by promoting the node below to a root
1877          */
1878         if (!parent) {
1879                 struct extent_buffer *child;
1880
1881                 if (btrfs_header_nritems(mid) != 1)
1882                         return 0;
1883
1884                 /* promote the child to a root */
1885                 child = btrfs_read_node_slot(mid, 0);
1886                 if (IS_ERR(child)) {
1887                         ret = PTR_ERR(child);
1888                         btrfs_handle_fs_error(fs_info, ret, NULL);
1889                         goto enospc;
1890                 }
1891
1892                 btrfs_tree_lock(child);
1893                 btrfs_set_lock_blocking_write(child);
1894                 ret = btrfs_cow_block(trans, root, child, mid, 0, &child,
1895                                       BTRFS_NESTING_COW);
1896                 if (ret) {
1897                         btrfs_tree_unlock(child);
1898                         free_extent_buffer(child);
1899                         goto enospc;
1900                 }
1901
1902                 ret = tree_mod_log_insert_root(root->node, child, 1);
1903                 BUG_ON(ret < 0);
1904                 rcu_assign_pointer(root->node, child);
1905
1906                 add_root_to_dirty_list(root);
1907                 btrfs_tree_unlock(child);
1908
1909                 path->locks[level] = 0;
1910                 path->nodes[level] = NULL;
1911                 btrfs_clean_tree_block(mid);
1912                 btrfs_tree_unlock(mid);
1913                 /* once for the path */
1914                 free_extent_buffer(mid);
1915
1916                 root_sub_used(root, mid->len);
1917                 btrfs_free_tree_block(trans, root, mid, 0, 1);
1918                 /* once for the root ptr */
1919                 free_extent_buffer_stale(mid);
1920                 return 0;
1921         }
1922         if (btrfs_header_nritems(mid) >
1923             BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
1924                 return 0;
1925
1926         left = btrfs_read_node_slot(parent, pslot - 1);
1927         if (IS_ERR(left))
1928                 left = NULL;
1929
1930         if (left) {
1931                 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
1932                 btrfs_set_lock_blocking_write(left);
1933                 wret = btrfs_cow_block(trans, root, left,
1934                                        parent, pslot - 1, &left,
1935                                        BTRFS_NESTING_LEFT_COW);
1936                 if (wret) {
1937                         ret = wret;
1938                         goto enospc;
1939                 }
1940         }
1941
1942         right = btrfs_read_node_slot(parent, pslot + 1);
1943         if (IS_ERR(right))
1944                 right = NULL;
1945
1946         if (right) {
1947                 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
1948                 btrfs_set_lock_blocking_write(right);
1949                 wret = btrfs_cow_block(trans, root, right,
1950                                        parent, pslot + 1, &right,
1951                                        BTRFS_NESTING_RIGHT_COW);
1952                 if (wret) {
1953                         ret = wret;
1954                         goto enospc;
1955                 }
1956         }
1957
1958         /* first, try to make some room in the middle buffer */
1959         if (left) {
1960                 orig_slot += btrfs_header_nritems(left);
1961                 wret = push_node_left(trans, left, mid, 1);
1962                 if (wret < 0)
1963                         ret = wret;
1964         }
1965
1966         /*
1967          * then try to empty the rightmost buffer into the middle
1968          */
1969         if (right) {
1970                 wret = push_node_left(trans, mid, right, 1);
1971                 if (wret < 0 && wret != -ENOSPC)
1972                         ret = wret;
1973                 if (btrfs_header_nritems(right) == 0) {
1974                         btrfs_clean_tree_block(right);
1975                         btrfs_tree_unlock(right);
1976                         del_ptr(root, path, level + 1, pslot + 1);
1977                         root_sub_used(root, right->len);
1978                         btrfs_free_tree_block(trans, root, right, 0, 1);
1979                         free_extent_buffer_stale(right);
1980                         right = NULL;
1981                 } else {
1982                         struct btrfs_disk_key right_key;
1983                         btrfs_node_key(right, &right_key, 0);
1984                         ret = tree_mod_log_insert_key(parent, pslot + 1,
1985                                         MOD_LOG_KEY_REPLACE, GFP_NOFS);
1986                         BUG_ON(ret < 0);
1987                         btrfs_set_node_key(parent, &right_key, pslot + 1);
1988                         btrfs_mark_buffer_dirty(parent);
1989                 }
1990         }
1991         if (btrfs_header_nritems(mid) == 1) {
1992                 /*
1993                  * we're not allowed to leave a node with one item in the
1994                  * tree during a delete.  A deletion from lower in the tree
1995                  * could try to delete the only pointer in this node.
1996                  * So, pull some keys from the left.
1997                  * There has to be a left pointer at this point because
1998                  * otherwise we would have pulled some pointers from the
1999                  * right.
2000                  */
2001                 if (!left) {
2002                         ret = -EROFS;
2003                         btrfs_handle_fs_error(fs_info, ret, NULL);
2004                         goto enospc;
2005                 }
2006                 wret = balance_node_right(trans, mid, left);
2007                 if (wret < 0) {
2008                         ret = wret;
2009                         goto enospc;
2010                 }
2011                 if (wret == 1) {
2012                         wret = push_node_left(trans, left, mid, 1);
2013                         if (wret < 0)
2014                                 ret = wret;
2015                 }
2016                 BUG_ON(wret == 1);
2017         }
2018         if (btrfs_header_nritems(mid) == 0) {
2019                 btrfs_clean_tree_block(mid);
2020                 btrfs_tree_unlock(mid);
2021                 del_ptr(root, path, level + 1, pslot);
2022                 root_sub_used(root, mid->len);
2023                 btrfs_free_tree_block(trans, root, mid, 0, 1);
2024                 free_extent_buffer_stale(mid);
2025                 mid = NULL;
2026         } else {
2027                 /* update the parent key to reflect our changes */
2028                 struct btrfs_disk_key mid_key;
2029                 btrfs_node_key(mid, &mid_key, 0);
2030                 ret = tree_mod_log_insert_key(parent, pslot,
2031                                 MOD_LOG_KEY_REPLACE, GFP_NOFS);
2032                 BUG_ON(ret < 0);
2033                 btrfs_set_node_key(parent, &mid_key, pslot);
2034                 btrfs_mark_buffer_dirty(parent);
2035         }
2036
2037         /* update the path */
2038         if (left) {
2039                 if (btrfs_header_nritems(left) > orig_slot) {
2040                         atomic_inc(&left->refs);
2041                         /* left was locked after cow */
2042                         path->nodes[level] = left;
2043                         path->slots[level + 1] -= 1;
2044                         path->slots[level] = orig_slot;
2045                         if (mid) {
2046                                 btrfs_tree_unlock(mid);
2047                                 free_extent_buffer(mid);
2048                         }
2049                 } else {
2050                         orig_slot -= btrfs_header_nritems(left);
2051                         path->slots[level] = orig_slot;
2052                 }
2053         }
2054         /* double check we haven't messed things up */
2055         if (orig_ptr !=
2056             btrfs_node_blockptr(path->nodes[level], path->slots[level]))
2057                 BUG();
2058 enospc:
2059         if (right) {
2060                 btrfs_tree_unlock(right);
2061                 free_extent_buffer(right);
2062         }
2063         if (left) {
2064                 if (path->nodes[level] != left)
2065                         btrfs_tree_unlock(left);
2066                 free_extent_buffer(left);
2067         }
2068         return ret;
2069 }
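
/*
 * Informal summary of the flow above: balance_level() first pushes items from
 * the middle node into its left sibling, then drains the right sibling into
 * the middle; nodes that end up empty are removed from the parent, and a
 * middle node left with a single item borrows from the left, so no node on
 * the path holds just one pointer during a delete.
 */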
2070
2071 /* Node balancing for insertion.  Here we only split or push nodes around
2072  * when they are completely full.  This is also done top down, so we
2073  * have to be pessimistic.
2074  */
2075 static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
2076                                           struct btrfs_root *root,
2077                                           struct btrfs_path *path, int level)
2078 {
2079         struct btrfs_fs_info *fs_info = root->fs_info;
2080         struct extent_buffer *right = NULL;
2081         struct extent_buffer *mid;
2082         struct extent_buffer *left = NULL;
2083         struct extent_buffer *parent = NULL;
2084         int ret = 0;
2085         int wret;
2086         int pslot;
2087         int orig_slot = path->slots[level];
2088
2089         if (level == 0)
2090                 return 1;
2091
2092         mid = path->nodes[level];
2093         WARN_ON(btrfs_header_generation(mid) != trans->transid);
2094
2095         if (level < BTRFS_MAX_LEVEL - 1) {
2096                 parent = path->nodes[level + 1];
2097                 pslot = path->slots[level + 1];
2098         }
2099
2100         if (!parent)
2101                 return 1;
2102
2103         left = btrfs_read_node_slot(parent, pslot - 1);
2104         if (IS_ERR(left))
2105                 left = NULL;
2106
2107         /* first, try to make some room in the middle buffer */
2108         if (left) {
2109                 u32 left_nr;
2110
2111                 __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
2112                 btrfs_set_lock_blocking_write(left);
2113
2114                 left_nr = btrfs_header_nritems(left);
2115                 if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
2116                         wret = 1;
2117                 } else {
2118                         ret = btrfs_cow_block(trans, root, left, parent,
2119                                               pslot - 1, &left,
2120                                               BTRFS_NESTING_LEFT_COW);
2121                         if (ret)
2122                                 wret = 1;
2123                         else {
2124                                 wret = push_node_left(trans, left, mid, 0);
2125                         }
2126                 }
2127                 if (wret < 0)
2128                         ret = wret;
2129                 if (wret == 0) {
2130                         struct btrfs_disk_key disk_key;
2131                         orig_slot += left_nr;
2132                         btrfs_node_key(mid, &disk_key, 0);
2133                         ret = tree_mod_log_insert_key(parent, pslot,
2134                                         MOD_LOG_KEY_REPLACE, GFP_NOFS);
2135                         BUG_ON(ret < 0);
2136                         btrfs_set_node_key(parent, &disk_key, pslot);
2137                         btrfs_mark_buffer_dirty(parent);
2138                         if (btrfs_header_nritems(left) > orig_slot) {
2139                                 path->nodes[level] = left;
2140                                 path->slots[level + 1] -= 1;
2141                                 path->slots[level] = orig_slot;
2142                                 btrfs_tree_unlock(mid);
2143                                 free_extent_buffer(mid);
2144                         } else {
2145                                 orig_slot -=
2146                                         btrfs_header_nritems(left);
2147                                 path->slots[level] = orig_slot;
2148                                 btrfs_tree_unlock(left);
2149                                 free_extent_buffer(left);
2150                         }
2151                         return 0;
2152                 }
2153                 btrfs_tree_unlock(left);
2154                 free_extent_buffer(left);
2155         }
2156         right = btrfs_read_node_slot(parent, pslot + 1);
2157         if (IS_ERR(right))
2158                 right = NULL;
2159
2160         /*
2161          * then try to empty the rightmost buffer into the middle
2162          */
2163         if (right) {
2164                 u32 right_nr;
2165
2166                 __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
2167                 btrfs_set_lock_blocking_write(right);
2168
2169                 right_nr = btrfs_header_nritems(right);
2170                 if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
2171                         wret = 1;
2172                 } else {
2173                         ret = btrfs_cow_block(trans, root, right,
2174                                               parent, pslot + 1,
2175                                               &right, BTRFS_NESTING_RIGHT_COW);
2176                         if (ret)
2177                                 wret = 1;
2178                         else {
2179                                 wret = balance_node_right(trans, right, mid);
2180                         }
2181                 }
2182                 if (wret < 0)
2183                         ret = wret;
2184                 if (wret == 0) {
2185                         struct btrfs_disk_key disk_key;
2186
2187                         btrfs_node_key(right, &disk_key, 0);
2188                         ret = tree_mod_log_insert_key(parent, pslot + 1,
2189                                         MOD_LOG_KEY_REPLACE, GFP_NOFS);
2190                         BUG_ON(ret < 0);
2191                         btrfs_set_node_key(parent, &disk_key, pslot + 1);
2192                         btrfs_mark_buffer_dirty(parent);
2193
2194                         if (btrfs_header_nritems(mid) <= orig_slot) {
2195                                 path->nodes[level] = right;
2196                                 path->slots[level + 1] += 1;
2197                                 path->slots[level] = orig_slot -
2198                                         btrfs_header_nritems(mid);
2199                                 btrfs_tree_unlock(mid);
2200                                 free_extent_buffer(mid);
2201                         } else {
2202                                 btrfs_tree_unlock(right);
2203                                 free_extent_buffer(right);
2204                         }
2205                         return 0;
2206                 }
2207                 btrfs_tree_unlock(right);
2208                 free_extent_buffer(right);
2209         }
2210         return 1;
2211 }
2212
2213 /*
2214  * readahead one full node of leaves, finding things that are close
2215  * to the block in 'slot', and triggering readahead on them.
2216  */
2217 static void reada_for_search(struct btrfs_fs_info *fs_info,
2218                              struct btrfs_path *path,
2219                              int level, int slot, u64 objectid)
2220 {
2221         struct extent_buffer *node;
2222         struct btrfs_disk_key disk_key;
2223         u32 nritems;
2224         u64 search;
2225         u64 target;
2226         u64 nread = 0;
2227         struct extent_buffer *eb;
2228         u32 nr;
2229         u32 blocksize;
2230         u32 nscan = 0;
2231
2232         if (level != 1)
2233                 return;
2234
2235         if (!path->nodes[level])
2236                 return;
2237
2238         node = path->nodes[level];
2239
2240         search = btrfs_node_blockptr(node, slot);
2241         blocksize = fs_info->nodesize;
2242         eb = find_extent_buffer(fs_info, search);
2243         if (eb) {
2244                 free_extent_buffer(eb);
2245                 return;
2246         }
2247
2248         target = search;
2249
2250         nritems = btrfs_header_nritems(node);
2251         nr = slot;
2252
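        /*
         * Walk outward from 'slot', queueing reads only for blocks within
         * 64KiB of the target, and stop after 64KiB of readahead or 32
         * scanned slots.
         */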
2253         while (1) {
2254                 if (path->reada == READA_BACK) {
2255                         if (nr == 0)
2256                                 break;
2257                         nr--;
2258                 } else if (path->reada == READA_FORWARD) {
2259                         nr++;
2260                         if (nr >= nritems)
2261                                 break;
2262                 }
2263                 if (path->reada == READA_BACK && objectid) {
2264                         btrfs_node_key(node, &disk_key, nr);
2265                         if (btrfs_disk_key_objectid(&disk_key) != objectid)
2266                                 break;
2267                 }
2268                 search = btrfs_node_blockptr(node, nr);
2269                 if ((search <= target && target - search <= 65536) ||
2270                     (search > target && search - target <= 65536)) {
2271                         readahead_tree_block(fs_info, search);
2272                         nread += blocksize;
2273                 }
2274                 nscan++;
2275                 if (nread > 65536 || nscan > 32)
2276                         break;
2277         }
2278 }
2279
2280 static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
2281                                        struct btrfs_path *path, int level)
2282 {
2283         int slot;
2284         int nritems;
2285         struct extent_buffer *parent;
2286         struct extent_buffer *eb;
2287         u64 gen;
2288         u64 block1 = 0;
2289         u64 block2 = 0;
2290
2291         parent = path->nodes[level + 1];
2292         if (!parent)
2293                 return;
2294
2295         nritems = btrfs_header_nritems(parent);
2296         slot = path->slots[level + 1];
2297
2298         if (slot > 0) {
2299                 block1 = btrfs_node_blockptr(parent, slot - 1);
2300                 gen = btrfs_node_ptr_generation(parent, slot - 1);
2301                 eb = find_extent_buffer(fs_info, block1);
2302                 /*
2303                  * If we get -EAGAIN from btrfs_buffer_uptodate, we
2304                  * don't want to return -EAGAIN here.  That would loop
2305                  * forever.
2306                  */
2307                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2308                         block1 = 0;
2309                 free_extent_buffer(eb);
2310         }
2311         if (slot + 1 < nritems) {
2312                 block2 = btrfs_node_blockptr(parent, slot + 1);
2313                 gen = btrfs_node_ptr_generation(parent, slot + 1);
2314                 eb = find_extent_buffer(fs_info, block2);
2315                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2316                         block2 = 0;
2317                 free_extent_buffer(eb);
2318         }
2319
2320         if (block1)
2321                 readahead_tree_block(fs_info, block1);
2322         if (block2)
2323                 readahead_tree_block(fs_info, block2);
2324 }
2325
2326
2327 /*
2328  * When we walk down the tree, it is usually safe to unlock the higher layers
2329  * in the tree.  The exceptions are when our path goes through slot 0, because
2330  * operations on the tree might require changing key pointers higher up in the
2331  * tree.
2332  *
2333  * Callers might also have set path->keep_locks, which tells this code to keep
2334  * the lock if the path points to the last slot in the block.  This is part of
2335  * walking through the tree, and selecting the next slot in the higher block.
2336  *
2337  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  So
2338  * if lowest_unlock is 1, level 0 won't be unlocked.
2339  */
2340 static noinline void unlock_up(struct btrfs_path *path, int level,
2341                                int lowest_unlock, int min_write_lock_level,
2342                                int *write_lock_level)
2343 {
2344         int i;
2345         int skip_level = level;
2346         int no_skips = 0;
2347         struct extent_buffer *t;
2348
2349         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2350                 if (!path->nodes[i])
2351                         break;
2352                 if (!path->locks[i])
2353                         break;
2354                 if (!no_skips && path->slots[i] == 0) {
2355                         skip_level = i + 1;
2356                         continue;
2357                 }
2358                 if (!no_skips && path->keep_locks) {
2359                         u32 nritems;
2360                         t = path->nodes[i];
2361                         nritems = btrfs_header_nritems(t);
2362                         if (nritems < 1 || path->slots[i] >= nritems - 1) {
2363                                 skip_level = i + 1;
2364                                 continue;
2365                         }
2366                 }
2367                 if (skip_level < i && i >= lowest_unlock)
2368                         no_skips = 1;
2369
2370                 t = path->nodes[i];
2371                 if (i >= lowest_unlock && i > skip_level) {
2372                         btrfs_tree_unlock_rw(t, path->locks[i]);
2373                         path->locks[i] = 0;
2374                         if (write_lock_level &&
2375                             i > min_write_lock_level &&
2376                             i <= *write_lock_level) {
2377                                 *write_lock_level = i - 1;
2378                         }
2379                 }
2380         }
2381 }
2382
2383 /*
2384  * Helper function for btrfs_search_slot.  The goal is to find a block
2385  * in cache without setting the path to blocking.  If we find the block
2386  * we return zero and the path is unchanged.
2387  *
2388  * If we can't find the block, we set the path blocking and do some
2389  * readahead.  -EAGAIN is returned and the search must be repeated.
2390  */
2391 static int
2392 read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
2393                       struct extent_buffer **eb_ret, int level, int slot,
2394                       const struct btrfs_key *key)
2395 {
2396         struct btrfs_fs_info *fs_info = root->fs_info;
2397         u64 blocknr;
2398         u64 gen;
2399         struct extent_buffer *tmp;
2400         struct btrfs_key first_key;
2401         int ret;
2402         int parent_level;
2403
2404         blocknr = btrfs_node_blockptr(*eb_ret, slot);
2405         gen = btrfs_node_ptr_generation(*eb_ret, slot);
2406         parent_level = btrfs_header_level(*eb_ret);
2407         btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);
2408
2409         tmp = find_extent_buffer(fs_info, blocknr);
2410         if (tmp) {
2411                 /* first we do an atomic uptodate check */
2412                 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2413                         /*
2414                          * Do extra check for first_key, eb can be stale due to
2415                          * being cached, read from scrub, or have multiple
2416                          * parents (shared tree blocks).
2417                          */
2418                         if (btrfs_verify_level_key(tmp,
2419                                         parent_level - 1, &first_key, gen)) {
2420                                 free_extent_buffer(tmp);
2421                                 return -EUCLEAN;
2422                         }
2423                         *eb_ret = tmp;
2424                         return 0;
2425                 }
2426
2427                 /* The pages were up to date, but we failed
2428                  * the generation number check.  Do a full
2429                  * read for the correct generation number.
2430                  * We must do this without dropping locks so
2431                  * we can trust our generation number.
2432                  */
2433                 btrfs_set_path_blocking(p);
2434
2435                 /* now we're allowed to do a blocking uptodate check */
2436                 ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
2437                 if (!ret) {
2438                         *eb_ret = tmp;
2439                         return 0;
2440                 }
2441                 free_extent_buffer(tmp);
2442                 btrfs_release_path(p);
2443                 return -EIO;
2444         }
2445
2446         /*
2447          * Reduce lock contention at high levels
2448          * of the btree by dropping locks before
2449          * we read.  Don't release the lock on the current
2450          * level because we need to walk this node to figure
2451          * out which blocks to read.
2452          */
2453         btrfs_unlock_up_safe(p, level + 1);
2454         btrfs_set_path_blocking(p);
2455
2456         if (p->reada != READA_NONE)
2457                 reada_for_search(fs_info, p, level, slot, key->objectid);
2458
2459         ret = -EAGAIN;
2460         tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1,
2461                               &first_key);
2462         if (!IS_ERR(tmp)) {
2463                 /*
2464                  * If the read above didn't mark this buffer up to date,
2465                  * it will never end up being up to date.  Set ret to EIO now
2466                  * and give up so that our caller doesn't loop forever
2467                  * on our EAGAINs.
2468                  */
2469                 if (!extent_buffer_uptodate(tmp))
2470                         ret = -EIO;
2471                 free_extent_buffer(tmp);
2472         } else {
2473                 ret = PTR_ERR(tmp);
2474         }
2475
2476         btrfs_release_path(p);
2477         return ret;
2478 }
2479
2480 /*
2481  * Helper function for btrfs_search_slot.  This does all of the checks
2482  * for node-level blocks and does any balancing required based on
2483  * ins_len.
2484  *
2485  * If no extra work was required, zero is returned.  If we had to
2486  * drop the path, -EAGAIN is returned and btrfs_search_slot must
2487  * start over.
2488  */
2489 static int
2490 setup_nodes_for_search(struct btrfs_trans_handle *trans,
2491                        struct btrfs_root *root, struct btrfs_path *p,
2492                        struct extent_buffer *b, int level, int ins_len,
2493                        int *write_lock_level)
2494 {
2495         struct btrfs_fs_info *fs_info = root->fs_info;
2496         int ret;
2497
2498         if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
2499             BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
2500                 int sret;
2501
2502                 if (*write_lock_level < level + 1) {
2503                         *write_lock_level = level + 1;
2504                         btrfs_release_path(p);
2505                         goto again;
2506                 }
2507
2508                 btrfs_set_path_blocking(p);
2509                 reada_for_balance(fs_info, p, level);
2510                 sret = split_node(trans, root, p, level);
2511
2512                 BUG_ON(sret > 0);
2513                 if (sret) {
2514                         ret = sret;
2515                         goto done;
2516                 }
2517                 b = p->nodes[level];
2518         } else if (ins_len < 0 && btrfs_header_nritems(b) <
2519                    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
2520                 int sret;
2521
2522                 if (*write_lock_level < level + 1) {
2523                         *write_lock_level = level + 1;
2524                         btrfs_release_path(p);
2525                         goto again;
2526                 }
2527
2528                 btrfs_set_path_blocking(p);
2529                 reada_for_balance(fs_info, p, level);
2530                 sret = balance_level(trans, root, p, level);
2531
2532                 if (sret) {
2533                         ret = sret;
2534                         goto done;
2535                 }
2536                 b = p->nodes[level];
2537                 if (!b) {
2538                         btrfs_release_path(p);
2539                         goto again;
2540                 }
2541                 BUG_ON(btrfs_header_nritems(b) == 1);
2542         }
2543         return 0;
2544
2545 again:
2546         ret = -EAGAIN;
2547 done:
2548         return ret;
2549 }
2550
2551 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
2552                 u64 iobjectid, u64 ioff, u8 key_type,
2553                 struct btrfs_key *found_key)
2554 {
2555         int ret;
2556         struct btrfs_key key;
2557         struct extent_buffer *eb;
2558
2559         ASSERT(path);
2560         ASSERT(found_key);
2561
2562         key.type = key_type;
2563         key.objectid = iobjectid;
2564         key.offset = ioff;
2565
2566         ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2567         if (ret < 0)
2568                 return ret;
2569
2570         eb = path->nodes[0];
2571         if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2572                 ret = btrfs_next_leaf(fs_root, path);
2573                 if (ret)
2574                         return ret;
2575                 eb = path->nodes[0];
2576         }
2577
2578         btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
2579         if (found_key->type != key.type ||
2580                         found_key->objectid != key.objectid)
2581                 return 1;
2582
2583         return 0;
2584 }
2585
2586 static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
2587                                                         struct btrfs_path *p,
2588                                                         int write_lock_level)
2589 {
2590         struct btrfs_fs_info *fs_info = root->fs_info;
2591         struct extent_buffer *b;
2592         int root_lock = 0;
2593         int level = 0;
2594
2595         if (p->search_commit_root) {
2596                 /*
2597          * The commit roots are read only, so we always do read locks,
2598          * and we must always hold the commit_root_sem when doing
2599          * searches on them.  The only exception is send, where we don't
2600          * want to block transaction commits for a long time, so
2601          * we need to clone the commit root in order to avoid races
2602          * with transaction commits that create a snapshot of one of
2603          * the roots used by a send operation.
2604                  */
2605                 if (p->need_commit_sem) {
2606                         down_read(&fs_info->commit_root_sem);
2607                         b = btrfs_clone_extent_buffer(root->commit_root);
2608                         up_read(&fs_info->commit_root_sem);
2609                         if (!b)
2610                                 return ERR_PTR(-ENOMEM);
2611
2612                 } else {
2613                         b = root->commit_root;
2614                         atomic_inc(&b->refs);
2615                 }
2616                 level = btrfs_header_level(b);
2617                 /*
2618                  * Ensure that all callers have set skip_locking when
2619                  * p->search_commit_root = 1.
2620                  */
2621                 ASSERT(p->skip_locking == 1);
2622
2623                 goto out;
2624         }
2625
2626         if (p->skip_locking) {
2627                 b = btrfs_root_node(root);
2628                 level = btrfs_header_level(b);
2629                 goto out;
2630         }
2631
2632         /* We try very hard to do read locks on the root */
2633         root_lock = BTRFS_READ_LOCK;
2634
2635         /*
2636          * If the level is set to maximum, we can skip trying to get the read
2637          * lock.
2638          */
2639         if (write_lock_level < BTRFS_MAX_LEVEL) {
2640                 /*
2641                  * We don't know the level of the root node until we actually
2642                  * have it read locked
2643                  */
2644                 b = __btrfs_read_lock_root_node(root, p->recurse);
2645                 level = btrfs_header_level(b);
2646                 if (level > write_lock_level)
2647                         goto out;
2648
2649                 /* Whoops, must trade for write lock */
2650                 btrfs_tree_read_unlock(b);
2651                 free_extent_buffer(b);
2652         }
2653
2654         b = btrfs_lock_root_node(root);
2655         root_lock = BTRFS_WRITE_LOCK;
2656
2657         /* The level might have changed, check again */
2658         level = btrfs_header_level(b);
2659
2660 out:
2661         /*
2662          * The root may have failed to write out at some point, and thus is no
2663          * longer valid; return an error in this case.
2664          */
2665         if (!extent_buffer_uptodate(b)) {
2666                 if (root_lock)
2667                         btrfs_tree_unlock_rw(b, root_lock);
2668                 free_extent_buffer(b);
2669                 return ERR_PTR(-EIO);
2670         }
2671
2672         p->nodes[level] = b;
2673         if (!p->skip_locking)
2674                 p->locks[level] = root_lock;
2675         /*
2676          * Callers are responsible for dropping b's references.
2677          */
2678         return b;
2679 }
2680
2681
2682 /*
2683  * btrfs_search_slot - look for a key in a tree and perform necessary
2684  * modifications to preserve tree invariants.
2685  *
2686  * @trans:      Handle of transaction, used when modifying the tree
2687  * @p:          Holds all btree nodes along the search path
2688  * @root:       The root node of the tree
2689  * @key:        The key we are looking for
2690  * @ins_len:    Indicates purpose of search: > 0 for inserts (the size
2691  *              needed for the new items), < 0 for deletions, 0 for searches
2692  * @cow:        whether CoW operations should be performed. Must always be 1
2693  *              when modifying the tree.
2694  *
2695  * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
2696  * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
2697  *
2698  * If @key is found, 0 is returned and you can find the item in the leaf level
2699  * of the path (level 0)
2700  *
2701  * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
2702  * points to the slot where it should be inserted
2703  *
2704  * If an error is encountered while searching the tree, a negative error
2705  * number is returned.
2706  */
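
/*
 * Minimal usage sketch for a read-only lookup (illustrative only: 'ino' is
 * a placeholder and error handling is trimmed):
 *
 *        struct btrfs_path *path = btrfs_alloc_path();
 *        struct btrfs_key key = { .objectid = ino,
 *                                 .type = BTRFS_INODE_ITEM_KEY,
 *                                 .offset = 0 };
 *        int ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
 *
 *        if (ret == 0)
 *                ... the item is at path->nodes[0], slot path->slots[0] ...
 *        btrfs_free_path(path);
 */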
2707 int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
2708                       const struct btrfs_key *key, struct btrfs_path *p,
2709                       int ins_len, int cow)
2710 {
2711         struct extent_buffer *b;
2712         int slot;
2713         int ret;
2714         int err;
2715         int level;
2716         int lowest_unlock = 1;
2717         /* everything at write_lock_level or lower must be write locked */
2718         int write_lock_level = 0;
2719         u8 lowest_level = 0;
2720         int min_write_lock_level;
2721         int prev_cmp;
2722
2723         lowest_level = p->lowest_level;
2724         WARN_ON(lowest_level && ins_len > 0);
2725         WARN_ON(p->nodes[0] != NULL);
2726         BUG_ON(!cow && ins_len);
2727
2728         if (ins_len < 0) {
2729                 lowest_unlock = 2;
2730
2731                 /* When we are removing items, we might have to go up to level
2732                  * two as we update tree pointers.  Make sure we keep write
2733                  * locks for those levels as well.
2734                  */
2735                 write_lock_level = 2;
2736         } else if (ins_len > 0) {
2737                 /*
2738                  * For inserting items, make sure we have a write lock on
2739                  * level 1 so we can update keys.
2740                  */
2741                 write_lock_level = 1;
2742         }
2743
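        /* For read-only searches (cow == 0), never trade up to write locks. */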
2744         if (!cow)
2745                 write_lock_level = -1;
2746
2747         if (cow && (p->keep_locks || p->lowest_level))
2748                 write_lock_level = BTRFS_MAX_LEVEL;
2749
2750         min_write_lock_level = write_lock_level;
2751
2752 again:
2753         prev_cmp = -1;
2754         b = btrfs_search_slot_get_root(root, p, write_lock_level);
2755         if (IS_ERR(b)) {
2756                 ret = PTR_ERR(b);
2757                 goto done;
2758         }
2759
2760         while (b) {
2761                 int dec = 0;
2762
2763                 level = btrfs_header_level(b);
2764
2765                 if (cow) {
2766                         bool last_level = (level == (BTRFS_MAX_LEVEL - 1));
2767
2768                         /*
2769                          * If we don't really need to cow this block
2770                          * then we don't want to set the path blocking,
2771                          * so we test it here.
2772                          */
2773                         if (!should_cow_block(trans, root, b)) {
2774                                 trans->dirty = true;
2775                                 goto cow_done;
2776                         }
2777
2778                         /*
2779                          * We must have write locks on this node and the
2780                          * parent.
2781                          */
2782                         if (level > write_lock_level ||
2783                             (level + 1 > write_lock_level &&
2784                             level + 1 < BTRFS_MAX_LEVEL &&
2785                             p->nodes[level + 1])) {
2786                                 write_lock_level = level + 1;
2787                                 btrfs_release_path(p);
2788                                 goto again;
2789                         }
2790
2791                         btrfs_set_path_blocking(p);
2792                         if (last_level)
2793                                 err = btrfs_cow_block(trans, root, b, NULL, 0,
2794                                                       &b,
2795                                                       BTRFS_NESTING_COW);
2796                         else
2797                                 err = btrfs_cow_block(trans, root, b,
2798                                                       p->nodes[level + 1],
2799                                                       p->slots[level + 1], &b,
2800                                                       BTRFS_NESTING_COW);
2801                         if (err) {
2802                                 ret = err;
2803                                 goto done;
2804                         }
2805                 }
2806 cow_done:
2807                 p->nodes[level] = b;
2808                 /*
2809                  * Leave the path with blocking locks to avoid massive
2810                  * lock context switching; this is done on purpose.
2811                  */
2812
2813                 /*
2814                  * we have a lock on b and as long as we aren't changing
2815                  * the tree, there is no way for the items in b to change.
2816                  * It is safe to drop the lock on our parent before we
2817                  * go through the expensive btree search on b.
2818                  *
2819                  * If we're inserting or deleting (ins_len != 0), then we might
2820                  * be changing slot zero, which may require changing the parent.
2821                  * So, we can't drop the lock until after we know which slot
2822                  * we're operating on.
2823                  */
2824                 if (!ins_len && !p->keep_locks) {
2825                         int u = level + 1;
2826
2827                         if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
2828                                 btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
2829                                 p->locks[u] = 0;
2830                         }
2831                 }
2832
2833                 /*
2834                  * If btrfs_bin_search returns an exact match (prev_cmp == 0)
2835                  * we can safely assume the target key will always be in slot 0
2836                  * on lower levels due to the invariants BTRFS' btree provides,
2837                  * namely that a btrfs_key_ptr entry always points to the
2838                  * lowest key in the child node, thus we can skip searching
2839                  * lower levels
2840                  */
2841                 if (prev_cmp == 0) {
2842                         slot = 0;
2843                         ret = 0;
2844                 } else {
2845                         ret = btrfs_bin_search(b, key, &slot);
2846                         prev_cmp = ret;
2847                         if (ret < 0)
2848                                 goto done;
2849                 }
2850
2851                 if (level == 0) {
2852                         p->slots[level] = slot;
2853                         if (ins_len > 0 &&
2854                             btrfs_leaf_free_space(b) < ins_len) {
2855                                 if (write_lock_level < 1) {
2856                                         write_lock_level = 1;
2857                                         btrfs_release_path(p);
2858                                         goto again;
2859                                 }
2860
2861                                 btrfs_set_path_blocking(p);
2862                                 err = split_leaf(trans, root, key,
2863                                                  p, ins_len, ret == 0);
2864
2865                                 BUG_ON(err > 0);
2866                                 if (err) {
2867                                         ret = err;
2868                                         goto done;
2869                                 }
2870                         }
2871                         if (!p->search_for_split)
2872                                 unlock_up(p, level, lowest_unlock,
2873                                           min_write_lock_level, NULL);
2874                         goto done;
2875                 }
2876                 if (ret && slot > 0) {
2877                         dec = 1;
2878                         slot--;
2879                 }
2880                 p->slots[level] = slot;
2881                 err = setup_nodes_for_search(trans, root, p, b, level, ins_len,
2882                                              &write_lock_level);
2883                 if (err == -EAGAIN)
2884                         goto again;
2885                 if (err) {
2886                         ret = err;
2887                         goto done;
2888                 }
2889                 b = p->nodes[level];
2890                 slot = p->slots[level];
2891
2892                 /*
2893                  * Slot 0 is special: if we change the key we have to update
2894                  * the parent pointer, which means we must have a write lock
2895                  * on the parent.
2896                  */
2897                 if (slot == 0 && ins_len && write_lock_level < level + 1) {
2898                         write_lock_level = level + 1;
2899                         btrfs_release_path(p);
2900                         goto again;
2901                 }
2902
2903                 unlock_up(p, level, lowest_unlock, min_write_lock_level,
2904                           &write_lock_level);
2905
2906                 if (level == lowest_level) {
2907                         if (dec)
2908                                 p->slots[level]++;
2909                         goto done;
2910                 }
2911
2912                 err = read_block_for_search(root, p, &b, level, slot, key);
2913                 if (err == -EAGAIN)
2914                         goto again;
2915                 if (err) {
2916                         ret = err;
2917                         goto done;
2918                 }
2919
2920                 if (!p->skip_locking) {
2921                         level = btrfs_header_level(b);
2922                         if (level <= write_lock_level) {
2923                                 if (!btrfs_try_tree_write_lock(b)) {
2924                                         btrfs_set_path_blocking(p);
2925                                         btrfs_tree_lock(b);
2926                                 }
2927                                 p->locks[level] = BTRFS_WRITE_LOCK;
2928                         } else {
2929                                 if (!btrfs_tree_read_lock_atomic(b)) {
2930                                         btrfs_set_path_blocking(p);
2931                                         __btrfs_tree_read_lock(b, BTRFS_NESTING_NORMAL,
2932                                                                p->recurse);
2933                                 }
2934                                 p->locks[level] = BTRFS_READ_LOCK;
2935                         }
2936                         p->nodes[level] = b;
2937                 }
2938         }
2939         ret = 1;
2940 done:
2941         /*
2942          * we don't really know what they plan on doing with the path
2943          * from here on, so for now just mark it as blocking
2944          */
2945         if (!p->leave_spinning)
2946                 btrfs_set_path_blocking(p);
2947         if (ret < 0 && !p->skip_release_on_error)
2948                 btrfs_release_path(p);
2949         return ret;
2950 }
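
/*
 * Illustrative usage sketch, not part of the original source: a minimal
 * read-only lookup with btrfs_search_slot().  With ins_len == 0 and
 * cow == 0 no transaction is needed, so @trans is NULL.  The function
 * name and the choice of key are assumptions made up for this example.
 */
static int __maybe_unused btrfs_lookup_key_example(struct btrfs_root *root,
						   u64 objectid)
{
	struct btrfs_path *path;
	struct btrfs_key key;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	key.objectid = objectid;
	key.type = BTRFS_INODE_ITEM_KEY;
	key.offset = 0;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret == 0) {
		/* exact match at path->nodes[0], slot path->slots[0] */
	} else if (ret > 0) {
		/* not found: the slot is where the key would be inserted */
	}
	/* on ret < 0, btrfs_search_slot() already released the path */

	btrfs_free_path(path);
	return ret < 0 ? ret : 0;
}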
2951
2952 /*
2953  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
2954  * current state of the tree together with the operations recorded in the tree
2955  * modification log to search for the key in a previous version of this tree, as
2956  * denoted by the time_seq parameter.
2957  *
2958  * Naturally, there is no support for insert, delete or cow operations.
2959  *
2960  * The resulting path and return value will be set up as if we called
2961  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
2962  */
2963 int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
2964                           struct btrfs_path *p, u64 time_seq)
2965 {
2966         struct btrfs_fs_info *fs_info = root->fs_info;
2967         struct extent_buffer *b;
2968         int slot;
2969         int ret;
2970         int err;
2971         int level;
2972         int lowest_unlock = 1;
2973         u8 lowest_level = 0;
2974
2975         lowest_level = p->lowest_level;
2976         WARN_ON(p->nodes[0] != NULL);
2977
2978         if (p->search_commit_root) {
2979                 BUG_ON(time_seq);
2980                 return btrfs_search_slot(NULL, root, key, p, 0, 0);
2981         }
2982
2983 again:
2984         b = get_old_root(root, time_seq);
2985         if (!b) {
2986                 ret = -EIO;
2987                 goto done;
2988         }
2989         level = btrfs_header_level(b);
2990         p->locks[level] = BTRFS_READ_LOCK;
2991
2992         while (b) {
2993                 int dec = 0;
2994
2995                 level = btrfs_header_level(b);
2996                 p->nodes[level] = b;
2997
2998                 /*
2999                  * we have a lock on b and as long as we aren't changing
3000                  * the tree, there is no way for the items in b to change.
3001                  * It is safe to drop the lock on our parent before we
3002                  * go through the expensive btree search on b.
3003                  */
3004                 btrfs_unlock_up_safe(p, level + 1);
3005
3006                 ret = btrfs_bin_search(b, key, &slot);
3007                 if (ret < 0)
3008                         goto done;
3009
3010                 if (level == 0) {
3011                         p->slots[level] = slot;
3012                         unlock_up(p, level, lowest_unlock, 0, NULL);
3013                         goto done;
3014                 }
3015
3016                 if (ret && slot > 0) {
3017                         dec = 1;
3018                         slot--;
3019                 }
3020                 p->slots[level] = slot;
3021                 unlock_up(p, level, lowest_unlock, 0, NULL);
3022
3023                 if (level == lowest_level) {
3024                         if (dec)
3025                                 p->slots[level]++;
3026                         goto done;
3027                 }
3028
3029                 err = read_block_for_search(root, p, &b, level, slot, key);
3030                 if (err == -EAGAIN)
3031                         goto again;
3032                 if (err) {
3033                         ret = err;
3034                         goto done;
3035                 }
3036
3037                 level = btrfs_header_level(b);
3038                 if (!btrfs_tree_read_lock_atomic(b)) {
3039                         btrfs_set_path_blocking(p);
3040                         btrfs_tree_read_lock(b);
3041                 }
3042                 b = tree_mod_log_rewind(fs_info, p, b, time_seq);
3043                 if (!b) {
3044                         ret = -ENOMEM;
3045                         goto done;
3046                 }
3047                 p->locks[level] = BTRFS_READ_LOCK;
3048                 p->nodes[level] = b;
3049         }
3050         ret = 1;
3051 done:
3052         if (!p->leave_spinning)
3053                 btrfs_set_path_blocking(p);
3054         if (ret < 0)
3055                 btrfs_release_path(p);
3056
3057         return ret;
3058 }
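
/*
 * Illustrative usage sketch, not part of the original source: a lookup
 * in a past version of the tree.  A tree mod log sequence number is
 * pinned first so btrfs_search_old_slot() has a stable point to rewind
 * to.  The function name is an assumption made up for this example.
 */
static int __maybe_unused btrfs_old_lookup_example(struct btrfs_root *root,
						   const struct btrfs_key *key,
						   struct btrfs_path *path)
{
	struct seq_list elem = SEQ_LIST_INIT(elem);
	u64 time_seq;
	int ret;

	/* keep tree mod log entries >= time_seq from being freed */
	time_seq = btrfs_get_tree_mod_seq(root->fs_info, &elem);
	ret = btrfs_search_old_slot(root, key, path, time_seq);
	/* ... read the (read-only) result from path->nodes[0] ... */
	btrfs_release_path(path);
	btrfs_put_tree_mod_seq(root->fs_info, &elem);
	return ret;
}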
3059
3060 /*
3061  * helper to use instead of search slot if no exact match is needed but
3062  * instead the next or previous item should be returned.
3063  * When find_higher is true, the next higher item is returned, the next lower
3064  * otherwise.
3065  * When return_any and find_higher are both true, and no higher item is found,
3066  * return the next lower instead.
3067  * When return_any is true and find_higher is false, and no lower item is found,
3068  * return the next higher instead.
3069  * It returns 0 if any item is found, 1 if none is found (tree empty), and
3070  * < 0 on error
3071  */
3072 int btrfs_search_slot_for_read(struct btrfs_root *root,
3073                                const struct btrfs_key *key,
3074                                struct btrfs_path *p, int find_higher,
3075                                int return_any)
3076 {
3077         int ret;
3078         struct extent_buffer *leaf;
3079
3080 again:
3081         ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
3082         if (ret <= 0)
3083                 return ret;
3084         /*
3085          * a return value of 1 means the path is at the position where the
3086          * item should be inserted. Normally this is the next bigger item,
3087          * but in case the previous item is the last in a leaf, path points
3088          * to the first free slot in the previous leaf, i.e. at an invalid
3089          * item.
3090          */
3091         leaf = p->nodes[0];
3092
3093         if (find_higher) {
3094                 if (p->slots[0] >= btrfs_header_nritems(leaf)) {
3095                         ret = btrfs_next_leaf(root, p);
3096                         if (ret <= 0)
3097                                 return ret;
3098                         if (!return_any)
3099                                 return 1;
3100                         /*
3101                          * no higher item found, return the next
3102                          * lower instead
3103                          */
3104                         return_any = 0;
3105                         find_higher = 0;
3106                         btrfs_release_path(p);
3107                         goto again;
3108                 }
3109         } else {
3110                 if (p->slots[0] == 0) {
3111                         ret = btrfs_prev_leaf(root, p);
3112                         if (ret < 0)
3113                                 return ret;
3114                         if (!ret) {
3115                                 leaf = p->nodes[0];
3116                                 if (p->slots[0] == btrfs_header_nritems(leaf))
3117                                         p->slots[0]--;
3118                                 return 0;
3119                         }
3120                         if (!return_any)
3121                                 return 1;
3122                         /*
3123                          * no lower item found, return the next
3124                          * higher instead
3125                          */
3126                         return_any = 0;
3127                         find_higher = 1;
3128                         btrfs_release_path(p);
3129                         goto again;
3130                 } else {
3131                         --p->slots[0];
3132                 }
3133         }
3134         return 0;
3135 }
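
/*
 * Illustrative usage sketch, not part of the original source: finding
 * the first item at or after a key with btrfs_search_slot_for_read().
 * The function name is an assumption made up for this example.
 */
static int __maybe_unused btrfs_find_item_ge_example(struct btrfs_root *root,
						     struct btrfs_path *path,
						     u64 objectid, u8 type)
{
	struct btrfs_key key;
	struct btrfs_key found;
	int ret;

	key.objectid = objectid;
	key.type = type;
	key.offset = 0;

	/* find_higher = 1: on a miss, move forward to the next item */
	ret = btrfs_search_slot_for_read(root, &key, path, 1, 0);
	if (ret)
		return ret;	/* 1: no such item, < 0: error */

	btrfs_item_key_to_cpu(path->nodes[0], &found, path->slots[0]);
	/* found now holds the smallest key >= @key in the tree */
	return 0;
}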
3136
3137 /*
3138  * adjust the pointers going up the tree, starting at the given level,
3139  * making sure the right key of each node points to 'key'.
3140  * This is used after shifting pointers to the left, so it stops
3141  * fixing up pointers when a given leaf/node is not in slot 0 of the
3142  * higher levels
3143  *
3144  */
3145 static void fixup_low_keys(struct btrfs_path *path,
3146                            struct btrfs_disk_key *key, int level)
3147 {
3148         int i;
3149         struct extent_buffer *t;
3150         int ret;
3151
3152         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3153                 int tslot = path->slots[i];
3154
3155                 if (!path->nodes[i])
3156                         break;
3157                 t = path->nodes[i];
3158                 ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE,
3159                                 GFP_ATOMIC);
3160                 BUG_ON(ret < 0);
3161                 btrfs_set_node_key(t, key, tslot);
3162                 btrfs_mark_buffer_dirty(path->nodes[i]);
3163                 if (tslot != 0)
3164                         break;
3165         }
3166 }
3167
3168 /*
3169  * update item key.
3170  *
3171  * This function isn't completely safe. It's the caller's responsibility
3172  * to ensure that the new key won't break the key ordering.
3173  */
3174 void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
3175                              struct btrfs_path *path,
3176                              const struct btrfs_key *new_key)
3177 {
3178         struct btrfs_disk_key disk_key;
3179         struct extent_buffer *eb;
3180         int slot;
3181
3182         eb = path->nodes[0];
3183         slot = path->slots[0];
3184         if (slot > 0) {
3185                 btrfs_item_key(eb, &disk_key, slot - 1);
3186                 if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
3187                         btrfs_crit(fs_info,
3188                 "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
3189                                    slot, btrfs_disk_key_objectid(&disk_key),
3190                                    btrfs_disk_key_type(&disk_key),
3191                                    btrfs_disk_key_offset(&disk_key),
3192                                    new_key->objectid, new_key->type,
3193                                    new_key->offset);
3194                         btrfs_print_leaf(eb);
3195                         BUG();
3196                 }
3197         }
3198         if (slot < btrfs_header_nritems(eb) - 1) {
3199                 btrfs_item_key(eb, &disk_key, slot + 1);
3200                 if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
3201                         btrfs_crit(fs_info,
3202                 "slot %u key (%llu %u %llu) new key (%llu %u %llu)",
3203                                    slot, btrfs_disk_key_objectid(&disk_key),
3204                                    btrfs_disk_key_type(&disk_key),
3205                                    btrfs_disk_key_offset(&disk_key),
3206                                    new_key->objectid, new_key->type,
3207                                    new_key->offset);
3208                         btrfs_print_leaf(eb);
3209                         BUG();
3210                 }
3211         }
3212
3213         btrfs_cpu_key_to_disk(&disk_key, new_key);
3214         btrfs_set_item_key(eb, &disk_key, slot);
3215         btrfs_mark_buffer_dirty(eb);
3216         if (slot == 0)
3217                 fixup_low_keys(path, &disk_key, 1);
3218 }
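
/*
 * Illustrative usage sketch, not part of the original source: rewriting
 * the offset of the item the path points at, e.g. when trimming an
 * extent from the front.  Only the offset changes here, so the ordering
 * checks above cannot trip as long as new_offset stays between the
 * neighbouring keys.  The function name is made up for this example.
 */
static void __maybe_unused btrfs_bump_key_offset_example(
				struct btrfs_fs_info *fs_info,
				struct btrfs_path *path, u64 new_offset)
{
	struct btrfs_key key;

	btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
	key.offset = new_offset;
	btrfs_set_item_key_safe(fs_info, path, &key);
}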
3219
3220 /*
3221  * Check key order of two sibling extent buffers.
3222  *
3223  * Return true if something is wrong.
3224  * Return false if everything is fine.
3225  *
3226  * The tree checker only works inside one tree block, thus the following
3227  * corruption cannot be detected by the tree checker:
3228  *
3229  * Leaf @left                   | Leaf @right
3230  * --------------------------------------------------------------
3231  * | 1 | 2 | 3 | 4 | 5 | f6 |   | 7 | 8 |
3232  *
3233  * Key f6 in leaf @left itself is valid, but not valid when the next
3234  * key in leaf @right is 7.
3235  * This can only be checked at tree block merge time.
3236  * And since the tree checker has ensured the key order within each tree
3237  * block is correct, we only need to check the last key of @left and the
3238  * first key of @right.
3239  */
3240 static bool check_sibling_keys(struct extent_buffer *left,
3241                                struct extent_buffer *right)
3242 {
3243         struct btrfs_key left_last;
3244         struct btrfs_key right_first;
3245         int level = btrfs_header_level(left);
3246         int nr_left = btrfs_header_nritems(left);
3247         int nr_right = btrfs_header_nritems(right);
3248
3249         /* No key to check in one of the tree blocks */
3250         if (!nr_left || !nr_right)
3251                 return false;
3252
3253         if (level) {
3254                 btrfs_node_key_to_cpu(left, &left_last, nr_left - 1);
3255                 btrfs_node_key_to_cpu(right, &right_first, 0);
3256         } else {
3257                 btrfs_item_key_to_cpu(left, &left_last, nr_left - 1);
3258                 btrfs_item_key_to_cpu(right, &right_first, 0);
3259         }
3260
3261         if (btrfs_comp_cpu_keys(&left_last, &right_first) >= 0) {
3262                 btrfs_crit(left->fs_info,
3263 "bad key order, sibling blocks, left last (%llu %u %llu) right first (%llu %u %llu)",
3264                            left_last.objectid, left_last.type,
3265                            left_last.offset, right_first.objectid,
3266                            right_first.type, right_first.offset);
3267                 return true;
3268         }
3269         return false;
3270 }
3271
3272 /*
3273  * try to push data from one node into the next node left in the
3274  * tree.
3275  *
3276  * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
3277  * error, and > 0 if there was no room in the left hand block.
3278  */
3279 static int push_node_left(struct btrfs_trans_handle *trans,
3280                           struct extent_buffer *dst,
3281                           struct extent_buffer *src, int empty)
3282 {
3283         struct btrfs_fs_info *fs_info = trans->fs_info;
3284         int push_items = 0;
3285         int src_nritems;
3286         int dst_nritems;
3287         int ret = 0;
3288
3289         src_nritems = btrfs_header_nritems(src);
3290         dst_nritems = btrfs_header_nritems(dst);
3291         push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
3292         WARN_ON(btrfs_header_generation(src) != trans->transid);
3293         WARN_ON(btrfs_header_generation(dst) != trans->transid);
3294
3295         if (!empty && src_nritems <= 8)
3296                 return 1;
3297
3298         if (push_items <= 0)
3299                 return 1;
3300
3301         if (empty) {
3302                 push_items = min(src_nritems, push_items);
3303                 if (push_items < src_nritems) {
3304                         /* leave at least 8 pointers in the node if
3305                          * we aren't going to empty it
3306                          */
3307                         if (src_nritems - push_items < 8) {
3308                                 if (push_items <= 8)
3309                                         return 1;
3310                                 push_items -= 8;
3311                         }
3312                 }
3313         } else
3314                 push_items = min(src_nritems - 8, push_items);
3315
3316         /* dst is the left eb, src is the middle eb */
3317         if (check_sibling_keys(dst, src)) {
3318                 ret = -EUCLEAN;
3319                 btrfs_abort_transaction(trans, ret);
3320                 return ret;
3321         }
3322         ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
3323         if (ret) {
3324                 btrfs_abort_transaction(trans, ret);
3325                 return ret;
3326         }
3327         copy_extent_buffer(dst, src,
3328                            btrfs_node_key_ptr_offset(dst_nritems),
3329                            btrfs_node_key_ptr_offset(0),
3330                            push_items * sizeof(struct btrfs_key_ptr));
3331
3332         if (push_items < src_nritems) {
3333                 /*
3334                  * Don't call tree_mod_log_insert_move here, key removal was
3335                  * already fully logged by tree_mod_log_eb_copy above.
3336                  */
3337                 memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
3338                                       btrfs_node_key_ptr_offset(push_items),
3339                                       (src_nritems - push_items) *
3340                                       sizeof(struct btrfs_key_ptr));
3341         }
3342         btrfs_set_header_nritems(src, src_nritems - push_items);
3343         btrfs_set_header_nritems(dst, dst_nritems + push_items);
3344         btrfs_mark_buffer_dirty(src);
3345         btrfs_mark_buffer_dirty(dst);
3346
3347         return ret;
3348 }
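
/*
 * Worked example for the clamping in push_node_left(), added for
 * illustration: with 16KiB nodes BTRFS_NODEPTRS_PER_BLOCK is 493, so
 * for dst_nritems = 490 and src_nritems = 100 we start with
 * push_items = 493 - 490 = 3.  In the non-empty case this is clamped
 * to min(100 - 8, 3) = 3: three pointers move left, src keeps 97, and
 * src is never drained below 8 pointers unless @empty asks for it.
 */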
3349
3350 /*
3351  * try to push data from one node into the next node right in the
3352  * tree.
3353  *
3354  * returns 0 if some ptrs were pushed, < 0 if there was some horrible
3355  * error, and > 0 if there was no room in the right hand block.
3356  *
3357  * this will only push up to 1/2 the contents of the left node over
3358  */
3359 static int balance_node_right(struct btrfs_trans_handle *trans,
3360                               struct extent_buffer *dst,
3361                               struct extent_buffer *src)
3362 {
3363         struct btrfs_fs_info *fs_info = trans->fs_info;
3364         int push_items = 0;
3365         int max_push;
3366         int src_nritems;
3367         int dst_nritems;
3368         int ret = 0;
3369
3370         WARN_ON(btrfs_header_generation(src) != trans->transid);
3371         WARN_ON(btrfs_header_generation(dst) != trans->transid);
3372
3373         src_nritems = btrfs_header_nritems(src);
3374         dst_nritems = btrfs_header_nritems(dst);
3375         push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
3376         if (push_items <= 0)
3377                 return 1;
3378
3379         if (src_nritems < 4)
3380                 return 1;
3381
3382         max_push = src_nritems / 2 + 1;
3383         /* don't try to empty the node */
3384         if (max_push >= src_nritems)
3385                 return 1;
3386
3387         if (max_push < push_items)
3388                 push_items = max_push;
3389
3390         /* dst is the right eb, src is the middle eb */
3391         if (check_sibling_keys(src, dst)) {
3392                 ret = -EUCLEAN;
3393                 btrfs_abort_transaction(trans, ret);
3394                 return ret;
3395         }
3396         ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
3397         BUG_ON(ret < 0);
3398         memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
3399                                       btrfs_node_key_ptr_offset(0),
3400                                       (dst_nritems) *
3401                                       sizeof(struct btrfs_key_ptr));
3402
3403         ret = tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
3404                                    push_items);
3405         if (ret) {
3406                 btrfs_abort_transaction(trans, ret);
3407                 return ret;
3408         }
3409         copy_extent_buffer(dst, src,
3410                            btrfs_node_key_ptr_offset(0),
3411                            btrfs_node_key_ptr_offset(src_nritems - push_items),
3412                            push_items * sizeof(struct btrfs_key_ptr));
3413
3414         btrfs_set_header_nritems(src, src_nritems - push_items);
3415         btrfs_set_header_nritems(dst, dst_nritems + push_items);
3416
3417         btrfs_mark_buffer_dirty(src);
3418         btrfs_mark_buffer_dirty(dst);
3419
3420         return ret;
3421 }
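
/*
 * Worked example for the cap in balance_node_right(), added for
 * illustration: with src_nritems = 100, max_push = 100 / 2 + 1 = 51,
 * so at most 51 pointers move right and src keeps at least 49; the
 * node is balanced rather than emptied.  If the right node has room
 * for fewer (push_items < max_push), the smaller value wins.
 */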
3422
3423 /*
3424  * helper function to insert a new root level in the tree.
3425  * A new node is allocated, and a single item is inserted to
3426  * point to the existing root
3427  *
3428  * returns zero on success or < 0 on failure.
3429  */
3430 static noinline int insert_new_root(struct btrfs_trans_handle *trans,
3431                            struct btrfs_root *root,
3432                            struct btrfs_path *path, int level)
3433 {
3434         struct btrfs_fs_info *fs_info = root->fs_info;
3435         u64 lower_gen;
3436         struct extent_buffer *lower;
3437         struct extent_buffer *c;
3438         struct extent_buffer *old;
3439         struct btrfs_disk_key lower_key;
3440         int ret;
3441
3442         BUG_ON(path->nodes[level]);
3443         BUG_ON(path->nodes[level-1] != root->node);
3444
3445         lower = path->nodes[level-1];
3446         if (level == 1)
3447                 btrfs_item_key(lower, &lower_key, 0);
3448         else
3449                 btrfs_node_key(lower, &lower_key, 0);
3450
3451         c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
3452                                          root->node->start, 0,
3453                                          BTRFS_NESTING_NEW_ROOT);
3454         if (IS_ERR(c))
3455                 return PTR_ERR(c);
3456
3457         root_add_used(root, fs_info->nodesize);
3458
3459         btrfs_set_header_nritems(c, 1);
3460         btrfs_set_node_key(c, &lower_key, 0);
3461         btrfs_set_node_blockptr(c, 0, lower->start);
3462         lower_gen = btrfs_header_generation(lower);
3463         WARN_ON(lower_gen != trans->transid);
3464
3465         btrfs_set_node_ptr_generation(c, 0, lower_gen);
3466
3467         btrfs_mark_buffer_dirty(c);
3468
3469         old = root->node;
3470         ret = tree_mod_log_insert_root(root->node, c, 0);
3471         BUG_ON(ret < 0);
3472         rcu_assign_pointer(root->node, c);
3473
3474         /* the super has an extra ref to root->node */
3475         free_extent_buffer(old);
3476
3477         add_root_to_dirty_list(root);
3478         atomic_inc(&c->refs);
3479         path->nodes[level] = c;
3480         path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
3481         path->slots[level] = 0;
3482         return 0;
3483 }
3484
3485 /*
3486  * worker function to insert a single pointer in a node.
3487  * The node should already have enough room for the pointer.
3488  *
3489  * slot and level indicate where you want the key to go, and
3490  * blocknr is the block the key points to.
3491  */
3492 static void insert_ptr(struct btrfs_trans_handle *trans,
3493                        struct btrfs_path *path,
3494                        struct btrfs_disk_key *key, u64 bytenr,
3495                        int slot, int level)
3496 {
3497         struct extent_buffer *lower;
3498         int nritems;
3499         int ret;
3500
3501         BUG_ON(!path->nodes[level]);
3502         btrfs_assert_tree_locked(path->nodes[level]);
3503         lower = path->nodes[level];
3504         nritems = btrfs_header_nritems(lower);
3505         BUG_ON(slot > nritems);
3506         BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
3507         if (slot != nritems) {
3508                 if (level) {
3509                         ret = tree_mod_log_insert_move(lower, slot + 1, slot,
3510                                         nritems - slot);
3511                         BUG_ON(ret < 0);
3512                 }
3513                 memmove_extent_buffer(lower,
3514                               btrfs_node_key_ptr_offset(slot + 1),
3515                               btrfs_node_key_ptr_offset(slot),
3516                               (nritems - slot) * sizeof(struct btrfs_key_ptr));
3517         }
3518         if (level) {
3519                 ret = tree_mod_log_insert_key(lower, slot, MOD_LOG_KEY_ADD,
3520                                 GFP_NOFS);
3521                 BUG_ON(ret < 0);
3522         }
3523         btrfs_set_node_key(lower, key, slot);
3524         btrfs_set_node_blockptr(lower, slot, bytenr);
3525         WARN_ON(trans->transid == 0);
3526         btrfs_set_node_ptr_generation(lower, slot, trans->transid);
3527         btrfs_set_header_nritems(lower, nritems + 1);
3528         btrfs_mark_buffer_dirty(lower);
3529 }
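
/*
 * Worked example for the shift in insert_ptr(), added for illustration:
 * inserting at slot 2 of a node with nritems = 5 moves the pointers in
 * slots 2..4 up to slots 3..5 (a memmove of 3 entries of
 * sizeof(struct btrfs_key_ptr) == 33 bytes each), then writes the new
 * key/blockptr pair into slot 2 and bumps nritems to 6.
 */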
3530
3531 /*
3532  * split the node at the specified level in path in two.
3533  * The path is corrected to point to the appropriate node after the split
3534  *
3535  * Before splitting this tries to make some room in the node by pushing
3536  * left and right, if either one works, it returns right away.
3537  *
3538  * returns 0 on success and < 0 on failure
3539  */
3540 static noinline int split_node(struct btrfs_trans_handle *trans,
3541                                struct btrfs_root *root,
3542                                struct btrfs_path *path, int level)
3543 {
3544         struct btrfs_fs_info *fs_info = root->fs_info;
3545         struct extent_buffer *c;
3546         struct extent_buffer *split;
3547         struct btrfs_disk_key disk_key;
3548         int mid;
3549         int ret;
3550         u32 c_nritems;
3551
3552         c = path->nodes[level];
3553         WARN_ON(btrfs_header_generation(c) != trans->transid);
3554         if (c == root->node) {
3555                 /*
3556                  * trying to split the root, let's make a new one
3557                  *
3558                  * tree mod log: We don't log the removal of the old root in
3559                  * insert_new_root, because that root buffer will be kept as a
3560                  * normal node. We are going to log removal of half of the
3561                  * elements below with tree_mod_log_eb_copy. We're holding a
3562                  * tree lock on the buffer, which is why we cannot race with
3563                  * other tree_mod_log users.
3564                  */
3565                 ret = insert_new_root(trans, root, path, level + 1);
3566                 if (ret)
3567                         return ret;
3568         } else {
3569                 ret = push_nodes_for_insert(trans, root, path, level);
3570                 c = path->nodes[level];
3571                 if (!ret && btrfs_header_nritems(c) <
3572                     BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
3573                         return 0;
3574                 if (ret < 0)
3575                         return ret;
3576         }
3577
3578         c_nritems = btrfs_header_nritems(c);
3579         mid = (c_nritems + 1) / 2;
3580         btrfs_node_key(c, &disk_key, mid);
3581
3582         split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
3583                                              c->start, 0, BTRFS_NESTING_SPLIT);
3584         if (IS_ERR(split))
3585                 return PTR_ERR(split);
3586
3587         root_add_used(root, fs_info->nodesize);
3588         ASSERT(btrfs_header_level(c) == level);
3589
3590         ret = tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
3591         if (ret) {
3592                 btrfs_tree_unlock(split);
3593                 free_extent_buffer(split);
3594                 btrfs_abort_transaction(trans, ret);
3595                 return ret;
3596         }
3597         copy_extent_buffer(split, c,
3598                            btrfs_node_key_ptr_offset(0),
3599                            btrfs_node_key_ptr_offset(mid),
3600                            (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
3601         btrfs_set_header_nritems(split, c_nritems - mid);
3602         btrfs_set_header_nritems(c, mid);
3603         ret = 0;
3604
3605         btrfs_mark_buffer_dirty(c);
3606         btrfs_mark_buffer_dirty(split);
3607
3608         insert_ptr(trans, path, &disk_key, split->start,
3609                    path->slots[level + 1] + 1, level + 1);
3610
3611         if (path->slots[level] >= mid) {
3612                 path->slots[level] -= mid;
3613                 btrfs_tree_unlock(c);
3614                 free_extent_buffer(c);
3615                 path->nodes[level] = split;
3616                 path->slots[level + 1] += 1;
3617         } else {
3618                 btrfs_tree_unlock(split);
3619                 free_extent_buffer(split);
3620         }
3621         return ret;
3622 }
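
/*
 * Worked example for the split point in split_node(), added for
 * illustration: with c_nritems = 9, mid = (9 + 1) / 2 = 5, so c keeps
 * slots 0-4 and split receives the 4 pointers from slots 5-8.  If the
 * path pointed at slot 7 it now points at slot 7 - 5 = 2 of split, and
 * the parent slot is advanced by one to follow the new right node.
 */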
3623
3624 /*
3625  * how many bytes are required to store the items in a leaf.  start
3626  * and nr indicate which items in the leaf to check.  This totals up the
3627  * space used both by the item structs and the item data
3628  */
3629 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3630 {
3631         struct btrfs_item *start_item;
3632         struct btrfs_item *end_item;
3633         int data_len;
3634         int nritems = btrfs_header_nritems(l);
3635         int end = min(nritems, start + nr) - 1;
3636
3637         if (!nr)
3638                 return 0;
3639         start_item = btrfs_item_nr(start);
3640         end_item = btrfs_item_nr(end);
3641         data_len = btrfs_item_offset(l, start_item) +
3642                    btrfs_item_size(l, start_item);
3643         data_len = data_len - btrfs_item_offset(l, end_item);
3644         data_len += sizeof(struct btrfs_item) * nr;
3645         WARN_ON(data_len < 0);
3646         return data_len;
3647 }
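
/*
 * Worked example for leaf_space_used(), added for illustration: item
 * data grows down from the end of the leaf, so the start item has the
 * highest data offset.  With the start item at data offset 3900 and
 * size 100 (its data ends at 4000) and the last checked item at data
 * offset 3600, the data span is 4000 - 3600 = 400 bytes; for nr = 3
 * add 3 * sizeof(struct btrfs_item) (25 bytes each), 475 bytes total.
 */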
3648
3649 /*
3650  * The space between the end of the leaf items and
3651  * the start of the leaf data.  IOW, how much room
3652  * the leaf has left for both items and data
3653  */
3654 noinline int btrfs_leaf_free_space(struct extent_buffer *leaf)
3655 {
3656         struct btrfs_fs_info *fs_info = leaf->fs_info;
3657         int nritems = btrfs_header_nritems(leaf);
3658         int ret;
3659
3660         ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
3661         if (ret < 0) {
3662                 btrfs_crit(fs_info,
3663                            "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3664                            ret,
3665                            (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
3666                            leaf_space_used(leaf, 0, nritems), nritems);
3667         }
3668         return ret;
3669 }
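
/*
 * Illustrative sketch, not part of the original source: a caller-side
 * check of whether one more item of @data_size bytes fits in a leaf.
 * The helper name is made up; the sizeof(struct btrfs_item) term (25
 * bytes) accounts for the new item's header.
 */
static bool __maybe_unused leaf_has_room_example(struct extent_buffer *leaf,
						 u32 data_size)
{
	return btrfs_leaf_free_space(leaf) >=
	       (int)(data_size + sizeof(struct btrfs_item));
}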
3670
3671 /*
3672  * min slot controls the lowest index we're willing to push to the
3673  * right.  We'll push up to and including min_slot, but no lower
3674  */
3675 static noinline int __push_leaf_right(struct btrfs_path *path,
3676                                       int data_size, int empty,
3677                                       struct extent_buffer *right,
3678                                       int free_space, u32 left_nritems,
3679                                       u32 min_slot)
3680 {
3681         struct btrfs_fs_info *fs_info = right->fs_info;
3682         struct extent_buffer *left = path->nodes[0];
3683         struct extent_buffer *upper = path->nodes[1];
3684         struct btrfs_map_token token;
3685         struct btrfs_disk_key disk_key;
3686         int slot;
3687         u32 i;
3688         int push_space = 0;
3689         int push_items = 0;
3690         struct btrfs_item *item;
3691         u32 nr;
3692         u32 right_nritems;
3693         u32 data_end;
3694         u32 this_item_size;
3695
3696         if (empty)
3697                 nr = 0;
3698         else
3699                 nr = max_t(u32, 1, min_slot);
3700
3701         if (path->slots[0] >= left_nritems)
3702                 push_space += data_size;
3703
3704         slot = path->slots[1];
3705         i = left_nritems - 1;
3706         while (i >= nr) {
3707                 item = btrfs_item_nr(i);
3708
3709                 if (!empty && push_items > 0) {
3710                         if (path->slots[0] > i)
3711                                 break;
3712                         if (path->slots[0] == i) {
3713                                 int space = btrfs_leaf_free_space(left);
3714
3715                                 if (space + push_space * 2 > free_space)
3716                                         break;
3717                         }
3718                 }
3719
3720                 if (path->slots[0] == i)
3721                         push_space += data_size;
3722
3723                 this_item_size = btrfs_item_size(left, item);
3724                 if (this_item_size + sizeof(*item) + push_space > free_space)
3725                         break;
3726
3727                 push_items++;
3728                 push_space += this_item_size + sizeof(*item);
3729                 if (i == 0)
3730                         break;
3731                 i--;
3732         }
3733
3734         if (push_items == 0)
3735                 goto out_unlock;
3736
3737         WARN_ON(!empty && push_items == left_nritems);
3738
3739         /* push left to right */
3740         right_nritems = btrfs_header_nritems(right);
3741
3742         push_space = btrfs_item_end_nr(left, left_nritems - push_items);
3743         push_space -= leaf_data_end(left);
3744
3745         /* make room in the right data area */
3746         data_end = leaf_data_end(right);
3747         memmove_extent_buffer(right,
3748                               BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
3749                               BTRFS_LEAF_DATA_OFFSET + data_end,
3750                               BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
3751
3752         /* copy from the left data area */
3753         copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
3754                      BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
3755                      BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left),
3756                      push_space);
3757
3758         memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
3759                               btrfs_item_nr_offset(0),
3760                               right_nritems * sizeof(struct btrfs_item));
3761
3762         /* copy the items from left to right */
3763         copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
3764                    btrfs_item_nr_offset(left_nritems - push_items),
3765                    push_items * sizeof(struct btrfs_item));
3766
3767         /* update the item pointers */
3768         btrfs_init_map_token(&token, right);
3769         right_nritems += push_items;
3770         btrfs_set_header_nritems(right, right_nritems);
3771         push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
3772         for (i = 0; i < right_nritems; i++) {
3773                 item = btrfs_item_nr(i);
3774                 push_space -= btrfs_token_item_size(&token, item);
3775                 btrfs_set_token_item_offset(&token, item, push_space);
3776         }
3777
3778         left_nritems -= push_items;
3779         btrfs_set_header_nritems(left, left_nritems);
3780
3781         if (left_nritems)
3782                 btrfs_mark_buffer_dirty(left);
3783         else
3784                 btrfs_clean_tree_block(left);
3785
3786         btrfs_mark_buffer_dirty(right);
3787
3788         btrfs_item_key(right, &disk_key, 0);
3789         btrfs_set_node_key(upper, &disk_key, slot + 1);
3790         btrfs_mark_buffer_dirty(upper);
3791
3792         /* then fixup the leaf pointer in the path */
3793         if (path->slots[0] >= left_nritems) {
3794                 path->slots[0] -= left_nritems;
3795                 if (btrfs_header_nritems(path->nodes[0]) == 0)
3796                         btrfs_clean_tree_block(path->nodes[0]);
3797                 btrfs_tree_unlock(path->nodes[0]);
3798                 free_extent_buffer(path->nodes[0]);
3799                 path->nodes[0] = right;
3800                 path->slots[1] += 1;
3801         } else {
3802                 btrfs_tree_unlock(right);
3803                 free_extent_buffer(right);
3804         }
3805         return 0;
3806
3807 out_unlock:
3808         btrfs_tree_unlock(right);
3809         free_extent_buffer(right);
3810         return 1;
3811 }
3812
3813 /*
3814  * push some data in the path leaf to the right, trying to free up at
3815  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3816  *
3817  * returns 1 if the push failed because the other node didn't have enough
3818  * room, 0 if everything worked out and < 0 if there were major errors.
3819  *
3820  * this will push starting from min_slot to the end of the leaf.  It won't
3821  * push any slot lower than min_slot
3822  */
3823 static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
3824                            *root, struct btrfs_path *path,
3825                            int min_data_size, int data_size,
3826                            int empty, u32 min_slot)
3827 {
3828         struct extent_buffer *left = path->nodes[0];
3829         struct extent_buffer *right;
3830         struct extent_buffer *upper;
3831         int slot;
3832         int free_space;
3833         u32 left_nritems;
3834         int ret;
3835
3836         if (!path->nodes[1])
3837                 return 1;
3838
3839         slot = path->slots[1];
3840         upper = path->nodes[1];
3841         if (slot >= btrfs_header_nritems(upper) - 1)
3842                 return 1;
3843
3844         btrfs_assert_tree_locked(path->nodes[1]);
3845
3846         right = btrfs_read_node_slot(upper, slot + 1);
3847         /*
3848          * If slot + 1 is not valid or we failed to read the right node,
3849          * it's no big deal; just return.
3850          */
3851         if (IS_ERR(right))
3852                 return 1;
3853
3854         __btrfs_tree_lock(right, BTRFS_NESTING_RIGHT);
3855         btrfs_set_lock_blocking_write(right);
3856
3857         free_space = btrfs_leaf_free_space(right);
3858         if (free_space < data_size)
3859                 goto out_unlock;
3860
3861         /* cow and double check */
3862         ret = btrfs_cow_block(trans, root, right, upper,
3863                               slot + 1, &right, BTRFS_NESTING_RIGHT_COW);
3864         if (ret)
3865                 goto out_unlock;
3866
3867         free_space = btrfs_leaf_free_space(right);
3868         if (free_space < data_size)
3869                 goto out_unlock;
3870
3871         left_nritems = btrfs_header_nritems(left);
3872         if (left_nritems == 0)
3873                 goto out_unlock;
3874
3875         if (check_sibling_keys(left, right)) {
3876                 ret = -EUCLEAN;
3877                 btrfs_abort_transaction(trans, ret);
3878                 btrfs_tree_unlock(right);
3879                 free_extent_buffer(right);
3880                 return ret;
3881         }
3882         if (path->slots[0] == left_nritems && !empty) {
3883                 /* Key greater than all keys in the leaf; the right neighbor
3884                  * has enough room for it and we're not emptying our leaf to
3885                  * delete it, therefore use the right neighbor to insert the
3886                  * new item without touching/dirtying our left leaf. */
3887                 btrfs_tree_unlock(left);
3888                 free_extent_buffer(left);
3889                 path->nodes[0] = right;
3890                 path->slots[0] = 0;
3891                 path->slots[1]++;
3892                 return 0;
3893         }
3894
3895         return __push_leaf_right(path, min_data_size, empty,
3896                                 right, free_space, left_nritems, min_slot);
3897 out_unlock:
3898         btrfs_tree_unlock(right);
3899         free_extent_buffer(right);
3900         return 1;
3901 }
3902
3903 /*
3904  * push some data in the path leaf to the left, trying to free up at
3905  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3906  *
3907  * max_slot can put a limit on how far into the leaf we'll push items.  The
3908  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3909  * items
3910  */
3911 static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
3912                                      int empty, struct extent_buffer *left,
3913                                      int free_space, u32 right_nritems,
3914                                      u32 max_slot)
3915 {
3916         struct btrfs_fs_info *fs_info = left->fs_info;
3917         struct btrfs_disk_key disk_key;
3918         struct extent_buffer *right = path->nodes[0];
3919         int i;
3920         int push_space = 0;
3921         int push_items = 0;
3922         struct btrfs_item *item;
3923         u32 old_left_nritems;
3924         u32 nr;
3925         int ret = 0;
3926         u32 this_item_size;
3927         u32 old_left_item_size;
3928         struct btrfs_map_token token;
3929
3930         if (empty)
3931                 nr = min(right_nritems, max_slot);
3932         else
3933                 nr = min(right_nritems - 1, max_slot);
3934
3935         for (i = 0; i < nr; i++) {
3936                 item = btrfs_item_nr(i);
3937
3938                 if (!empty && push_items > 0) {
3939                         if (path->slots[0] < i)
3940                                 break;
3941                         if (path->slots[0] == i) {
3942                                 int space = btrfs_leaf_free_space(right);
3943
3944                                 if (space + push_space * 2 > free_space)
3945                                         break;
3946                         }
3947                 }
3948
3949                 if (path->slots[0] == i)
3950                         push_space += data_size;
3951
3952                 this_item_size = btrfs_item_size(right, item);
3953                 if (this_item_size + sizeof(*item) + push_space > free_space)
3954                         break;
3955
3956                 push_items++;
3957                 push_space += this_item_size + sizeof(*item);
3958         }
3959
3960         if (push_items == 0) {
3961                 ret = 1;
3962                 goto out;
3963         }
3964         WARN_ON(!empty && push_items == btrfs_header_nritems(right));
3965
3966         /* push data from right to left */
3967         copy_extent_buffer(left, right,
3968                            btrfs_item_nr_offset(btrfs_header_nritems(left)),
3969                            btrfs_item_nr_offset(0),
3970                            push_items * sizeof(struct btrfs_item));
3971
3972         push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
3973                      btrfs_item_offset_nr(right, push_items - 1);
3974
3975         copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
3976                      leaf_data_end(left) - push_space,
3977                      BTRFS_LEAF_DATA_OFFSET +
3978                      btrfs_item_offset_nr(right, push_items - 1),
3979                      push_space);
3980         old_left_nritems = btrfs_header_nritems(left);
3981         BUG_ON(old_left_nritems <= 0);
3982
3983         btrfs_init_map_token(&token, left);
3984         old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
3985         for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
3986                 u32 ioff;
3987
3988                 item = btrfs_item_nr(i);
3989
3990                 ioff = btrfs_token_item_offset(&token, item);
3991                 btrfs_set_token_item_offset(&token, item,
3992                       ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size));
3993         }
3994         btrfs_set_header_nritems(left, old_left_nritems + push_items);
3995
3996         /* fixup right node */
3997         if (push_items > right_nritems)
3998                 WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
3999                        right_nritems);
4000
4001         if (push_items < right_nritems) {
4002                 push_space = btrfs_item_offset_nr(right, push_items - 1) -
4003                                                   leaf_data_end(right);
4004                 memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
4005                                       BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
4006                                       BTRFS_LEAF_DATA_OFFSET +
4007                                       leaf_data_end(right), push_space);
4008
4009                 memmove_extent_buffer(right, btrfs_item_nr_offset(0),
4010                               btrfs_item_nr_offset(push_items),
4011                              (btrfs_header_nritems(right) - push_items) *
4012                              sizeof(struct btrfs_item));
4013         }
4014
4015         btrfs_init_map_token(&token, right);
4016         right_nritems -= push_items;
4017         btrfs_set_header_nritems(right, right_nritems);
4018         push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
4019         for (i = 0; i < right_nritems; i++) {
4020                 item = btrfs_item_nr(i);
4021
4022                 push_space = push_space - btrfs_token_item_size(&token, item);
4023                 btrfs_set_token_item_offset(&token, item, push_space);
4024         }
4025
4026         btrfs_mark_buffer_dirty(left);
4027         if (right_nritems)
4028                 btrfs_mark_buffer_dirty(right);
4029         else
4030                 btrfs_clean_tree_block(right);
4031
4032         btrfs_item_key(right, &disk_key, 0);
4033         fixup_low_keys(path, &disk_key, 1);
4034
4035         /* then fixup the leaf pointer in the path */
4036         if (path->slots[0] < push_items) {
4037                 path->slots[0] += old_left_nritems;
4038                 btrfs_tree_unlock(path->nodes[0]);
4039                 free_extent_buffer(path->nodes[0]);
4040                 path->nodes[0] = left;
4041                 path->slots[1] -= 1;
4042         } else {
4043                 btrfs_tree_unlock(left);
4044                 free_extent_buffer(left);
4045                 path->slots[0] -= push_items;
4046         }
4047         BUG_ON(path->slots[0] < 0);
4048         return ret;
4049 out:
4050         btrfs_tree_unlock(left);
4051         free_extent_buffer(left);
4052         return ret;
4053 }
4054
4055 /*
4056  * push some data in the path leaf to the left, trying to free up at
4057  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
4058  *
4059  * max_slot can put a limit on how far into the leaf we'll push items.  The
4060  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
4061  * items
4062  */
4063 static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
4064                           *root, struct btrfs_path *path, int min_data_size,
4065                           int data_size, int empty, u32 max_slot)
4066 {
4067         struct extent_buffer *right = path->nodes[0];
4068         struct extent_buffer *left;
4069         int slot;
4070         int free_space;
4071         u32 right_nritems;
4072         int ret = 0;
4073
4074         slot = path->slots[1];
4075         if (slot == 0)
4076                 return 1;
4077         if (!path->nodes[1])
4078                 return 1;
4079
4080         right_nritems = btrfs_header_nritems(right);
4081         if (right_nritems == 0)
4082                 return 1;
4083
4084         btrfs_assert_tree_locked(path->nodes[1]);
4085
4086         left = btrfs_read_node_slot(path->nodes[1], slot - 1);
4087         /*
4088          * If slot - 1 is not valid or we failed to read the left node,
4089          * it's no big deal; just return.
4090          */
4091         if (IS_ERR(left))
4092                 return 1;
4093
4094         __btrfs_tree_lock(left, BTRFS_NESTING_LEFT);
4095         btrfs_set_lock_blocking_write(left);
4096
4097         free_space = btrfs_leaf_free_space(left);
4098         if (free_space < data_size) {
4099                 ret = 1;
4100                 goto out;
4101         }
4102
4103         /* cow and double check */
4104         ret = btrfs_cow_block(trans, root, left,
4105                               path->nodes[1], slot - 1, &left,
4106                               BTRFS_NESTING_LEFT_COW);
4107         if (ret) {
4108                 /* we hit -ENOSPC, but it isn't fatal here */
4109                 if (ret == -ENOSPC)
4110                         ret = 1;
4111                 goto out;
4112         }
4113
4114         free_space = btrfs_leaf_free_space(left);
4115         if (free_space < data_size) {
4116                 ret = 1;
4117                 goto out;
4118         }
4119
4120         if (check_sibling_keys(left, right)) {
4121                 ret = -EUCLEAN;
4122                 btrfs_abort_transaction(trans, ret);
4123                 goto out;
4124         }
4125         return __push_leaf_left(path, min_data_size,
4126                                empty, left, free_space, right_nritems,
4127                                max_slot);
4128 out:
4129         btrfs_tree_unlock(left);
4130         free_extent_buffer(left);
4131         return ret;
4132 }
4133
4134 /*
4135  * split the path's leaf in two, making sure there is at least data_size
4136  * available for the resulting leaf level of the path.
4137  */
4138 static noinline void copy_for_split(struct btrfs_trans_handle *trans,
4139                                     struct btrfs_path *path,
4140                                     struct extent_buffer *l,
4141                                     struct extent_buffer *right,
4142                                     int slot, int mid, int nritems)
4143 {
4144         struct btrfs_fs_info *fs_info = trans->fs_info;
4145         int data_copy_size;
4146         int rt_data_off;
4147         int i;
4148         struct btrfs_disk_key disk_key;
4149         struct btrfs_map_token token;
4150
4151         nritems = nritems - mid;
4152         btrfs_set_header_nritems(right, nritems);
4153         data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l);
4154
4155         copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
4156                            btrfs_item_nr_offset(mid),
4157                            nritems * sizeof(struct btrfs_item));
4158
4159         copy_extent_buffer(right, l,
4160                      BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
4161                      data_copy_size, BTRFS_LEAF_DATA_OFFSET +
4162                      leaf_data_end(l), data_copy_size);
4163
4164         rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);
4165
4166         btrfs_init_map_token(&token, right);
4167         for (i = 0; i < nritems; i++) {
4168                 struct btrfs_item *item = btrfs_item_nr(i);
4169                 u32 ioff;
4170
4171                 ioff = btrfs_token_item_offset(&token, item);
4172                 btrfs_set_token_item_offset(&token, item, ioff + rt_data_off);
4173         }
4174
4175         btrfs_set_header_nritems(l, mid);
4176         btrfs_item_key(right, &disk_key, 0);
4177         insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);
4178
4179         btrfs_mark_buffer_dirty(right);
4180         btrfs_mark_buffer_dirty(l);
4181         BUG_ON(path->slots[0] != slot);
4182
4183         if (mid <= slot) {
4184                 btrfs_tree_unlock(path->nodes[0]);
4185                 free_extent_buffer(path->nodes[0]);
4186                 path->nodes[0] = right;
4187                 path->slots[0] -= mid;
4188                 path->slots[1] += 1;
4189         } else {
4190                 btrfs_tree_unlock(right);
4191                 free_extent_buffer(right);
4192         }
4193
4194         BUG_ON(path->slots[0] < 0);
4195 }
4196
4197 /*
4198  * double splits happen when we need to insert a big item in the middle
4199  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
4200  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
4201  *          A                 B                 C
4202  *
4203  * We avoid this by trying to push the items on either side of our target
4204  * into the adjacent leaves.  If all goes well we can avoid the double split
4205  * completely.
4206  */
4207 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
4208                                           struct btrfs_root *root,
4209                                           struct btrfs_path *path,
4210                                           int data_size)
4211 {
4212         int ret;
4213         int progress = 0;
4214         int slot;
4215         u32 nritems;
4216         int space_needed = data_size;
4217
4218         slot = path->slots[0];
4219         if (slot < btrfs_header_nritems(path->nodes[0]))
4220                 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
4221
4222         /*
4223          * try to push all the items after our slot into the
4224          * right leaf
4225          */
4226         ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
4227         if (ret < 0)
4228                 return ret;
4229
4230         if (ret == 0)
4231                 progress++;
4232
4233         nritems = btrfs_header_nritems(path->nodes[0]);
4234         /*
4235          * our goal is to get our slot at the start or end of a leaf.  If
4236          * we've done so we're done
4237          */
4238         if (path->slots[0] == 0 || path->slots[0] == nritems)
4239                 return 0;
4240
4241         if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
4242                 return 0;
4243
4244         /* try to push all the items before our slot into the left leaf */
4245         slot = path->slots[0];
4246         space_needed = data_size;
4247         if (slot > 0)
4248                 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
4249         ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
4250         if (ret < 0)
4251                 return ret;
4252
4253         if (ret == 0)
4254                 progress++;
4255
4256         if (progress)
4257                 return 0;
4258         return 1;
4259 }
4260
4261 /*
4262  * split the path's leaf in two, making sure there is at least data_size
4263  * available for the resulting leaf level of the path.
4264  *
4265  * returns 0 if all went well and < 0 on failure.
4266  */
4267 static noinline int split_leaf(struct btrfs_trans_handle *trans,
4268                                struct btrfs_root *root,
4269                                const struct btrfs_key *ins_key,
4270                                struct btrfs_path *path, int data_size,
4271                                int extend)
4272 {
4273         struct btrfs_disk_key disk_key;
4274         struct extent_buffer *l;
4275         u32 nritems;
4276         int mid;
4277         int slot;
4278         struct extent_buffer *right;
4279         struct btrfs_fs_info *fs_info = root->fs_info;
4280         int ret = 0;
4281         int wret;
4282         int split;
4283         int num_doubles = 0;
4284         int tried_avoid_double = 0;
4285
4286         l = path->nodes[0];
4287         slot = path->slots[0];
4288         if (extend && data_size + btrfs_item_size_nr(l, slot) +
4289             sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
4290                 return -EOVERFLOW;
4291
4292         /* first try to make some room by pushing left and right */
4293         if (data_size && path->nodes[1]) {
4294                 int space_needed = data_size;
4295
4296                 if (slot < btrfs_header_nritems(l))
4297                         space_needed -= btrfs_leaf_free_space(l);
4298
4299                 wret = push_leaf_right(trans, root, path, space_needed,
4300                                        space_needed, 0, 0);
4301                 if (wret < 0)
4302                         return wret;
4303                 if (wret) {
4304                         space_needed = data_size;
4305                         if (slot > 0)
4306                                 space_needed -= btrfs_leaf_free_space(l);
4307                         wret = push_leaf_left(trans, root, path, space_needed,
4308                                               space_needed, 0, (u32)-1);
4309                         if (wret < 0)
4310                                 return wret;
4311                 }
4312                 l = path->nodes[0];
4313
4314                 /* did the pushes work? */
4315                 if (btrfs_leaf_free_space(l) >= data_size)
4316                         return 0;
4317         }
4318
4319         if (!path->nodes[1]) {
4320                 ret = insert_new_root(trans, root, path, 1);
4321                 if (ret)
4322                         return ret;
4323         }
4324 again:
4325         split = 1;
4326         l = path->nodes[0];
4327         slot = path->slots[0];
4328         nritems = btrfs_header_nritems(l);
4329         mid = (nritems + 1) / 2;
4330
4331         if (mid <= slot) {
4332                 if (nritems == 1 ||
4333                     leaf_space_used(l, mid, nritems - mid) + data_size >
4334                         BTRFS_LEAF_DATA_SIZE(fs_info)) {
4335                         if (slot >= nritems) {
4336                                 split = 0;
4337                         } else {
4338                                 mid = slot;
4339                                 if (mid != nritems &&
4340                                     leaf_space_used(l, mid, nritems - mid) +
4341                                     data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
4342                                         if (data_size && !tried_avoid_double)
4343                                                 goto push_for_double;
4344                                         split = 2;
4345                                 }
4346                         }
4347                 }
4348         } else {
4349                 if (leaf_space_used(l, 0, mid) + data_size >
4350                         BTRFS_LEAF_DATA_SIZE(fs_info)) {
4351                         if (!extend && data_size && slot == 0) {
4352                                 split = 0;
4353                         } else if ((extend || !data_size) && slot == 0) {
4354                                 mid = 1;
4355                         } else {
4356                                 mid = slot;
4357                                 if (mid != nritems &&
4358                                     leaf_space_used(l, mid, nritems - mid) +
4359                                     data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
4360                                         if (data_size && !tried_avoid_double)
4361                                                 goto push_for_double;
4362                                         split = 2;
4363                                 }
4364                         }
4365                 }
4366         }
4367
4368         if (split == 0)
4369                 btrfs_cpu_key_to_disk(&disk_key, ins_key);
4370         else
4371                 btrfs_item_key(l, &disk_key, mid);
4372
4373         /*
4374          * We have to use BTRFS_NESTING_NEW_ROOT here if we've done a double
4375          * split, because we're only allowed to have MAX_LOCKDEP_SUBCLASSES
4376          * subclasses, which is 8 at the time of this patch, and we've maxed it
4377          * out.  In the future we could add a
4378          * BTRFS_NESTING_SPLIT_THE_SPLITTENING if we need to, but for now just
4379          * use BTRFS_NESTING_NEW_ROOT.
4380          */
4381         right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
4382                                              l->start, 0, num_doubles ?
4383                                              BTRFS_NESTING_NEW_ROOT :
4384                                              BTRFS_NESTING_SPLIT);
4385         if (IS_ERR(right))
4386                 return PTR_ERR(right);
4387
4388         root_add_used(root, fs_info->nodesize);
4389
4390         if (split == 0) {
4391                 if (mid <= slot) {
4392                         btrfs_set_header_nritems(right, 0);
4393                         insert_ptr(trans, path, &disk_key,
4394                                    right->start, path->slots[1] + 1, 1);
4395                         btrfs_tree_unlock(path->nodes[0]);
4396                         free_extent_buffer(path->nodes[0]);
4397                         path->nodes[0] = right;
4398                         path->slots[0] = 0;
4399                         path->slots[1] += 1;
4400                 } else {
4401                         btrfs_set_header_nritems(right, 0);
4402                         insert_ptr(trans, path, &disk_key,
4403                                    right->start, path->slots[1], 1);
4404                         btrfs_tree_unlock(path->nodes[0]);
4405                         free_extent_buffer(path->nodes[0]);
4406                         path->nodes[0] = right;
4407                         path->slots[0] = 0;
4408                         if (path->slots[1] == 0)
4409                                 fixup_low_keys(path, &disk_key, 1);
4410                 }
4411                 /*
4412                  * We create a new leaf 'right' sized for the required ins_len;
4413                  * the caller does btrfs_mark_buffer_dirty() on this leaf after
4414                  * copying the new item's content into 'right'.
4415                  */
4416                 return ret;
4417         }
4418
4419         copy_for_split(trans, path, l, right, slot, mid, nritems);
4420
4421         if (split == 2) {
4422                 BUG_ON(num_doubles != 0);
4423                 num_doubles++;
4424                 goto again;
4425         }
4426
4427         return 0;
4428
4429 push_for_double:
4430         push_for_double_split(trans, root, path, data_size);
4431         tried_avoid_double = 1;
4432         if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
4433                 return 0;
4434         goto again;
4435 }
4436
4437 static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
4438                                          struct btrfs_root *root,
4439                                          struct btrfs_path *path, int ins_len)
4440 {
4441         struct btrfs_key key;
4442         struct extent_buffer *leaf;
4443         struct btrfs_file_extent_item *fi;
4444         u64 extent_len = 0;
4445         u32 item_size;
4446         int ret;
4447
4448         leaf = path->nodes[0];
4449         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
4450
4451         BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
4452                key.type != BTRFS_EXTENT_CSUM_KEY);
4453
4454         if (btrfs_leaf_free_space(leaf) >= ins_len)
4455                 return 0;
4456
4457         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4458         if (key.type == BTRFS_EXTENT_DATA_KEY) {
4459                 fi = btrfs_item_ptr(leaf, path->slots[0],
4460                                     struct btrfs_file_extent_item);
4461                 extent_len = btrfs_file_extent_num_bytes(leaf, fi);
4462         }
4463         btrfs_release_path(path);
4464
4465         path->keep_locks = 1;
4466         path->search_for_split = 1;
4467         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
4468         path->search_for_split = 0;
4469         if (ret > 0)
4470                 ret = -EAGAIN;
4471         if (ret < 0)
4472                 goto err;
4473
4474         ret = -EAGAIN;
4475         leaf = path->nodes[0];
4476         /* if our item isn't there, return now */
4477         if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
4478                 goto err;
4479
4480         /* the leaf has changed, it now has room.  return now */
4481         if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
4482                 goto err;
4483
4484         if (key.type == BTRFS_EXTENT_DATA_KEY) {
4485                 fi = btrfs_item_ptr(leaf, path->slots[0],
4486                                     struct btrfs_file_extent_item);
4487                 if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
4488                         goto err;
4489         }
4490
4491         btrfs_set_path_blocking(path);
4492         ret = split_leaf(trans, root, &key, path, ins_len, 1);
4493         if (ret)
4494                 goto err;
4495
4496         path->keep_locks = 0;
4497         btrfs_unlock_up_safe(path, 1);
4498         return 0;
4499 err:
4500         path->keep_locks = 0;
4501         return ret;
4502 }
4503
4504 static noinline int split_item(struct btrfs_path *path,
4505                                const struct btrfs_key *new_key,
4506                                unsigned long split_offset)
4507 {
4508         struct extent_buffer *leaf;
4509         struct btrfs_item *item;
4510         struct btrfs_item *new_item;
4511         int slot;
4512         char *buf;
4513         u32 nritems;
4514         u32 item_size;
4515         u32 orig_offset;
4516         struct btrfs_disk_key disk_key;
4517
4518         leaf = path->nodes[0];
4519         BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));
4520
4521         btrfs_set_path_blocking(path);
4522
4523         item = btrfs_item_nr(path->slots[0]);
4524         orig_offset = btrfs_item_offset(leaf, item);
4525         item_size = btrfs_item_size(leaf, item);
4526
4527         buf = kmalloc(item_size, GFP_NOFS);
4528         if (!buf)
4529                 return -ENOMEM;
4530
4531         read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
4532                             path->slots[0]), item_size);
4533
4534         slot = path->slots[0] + 1;
4535         nritems = btrfs_header_nritems(leaf);
4536         if (slot != nritems) {
4537                 /* shift the items */
4538                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
4539                                 btrfs_item_nr_offset(slot),
4540                                 (nritems - slot) * sizeof(struct btrfs_item));
4541         }
4542
4543         btrfs_cpu_key_to_disk(&disk_key, new_key);
4544         btrfs_set_item_key(leaf, &disk_key, slot);
4545
4546         new_item = btrfs_item_nr(slot);
4547
4548         btrfs_set_item_offset(leaf, new_item, orig_offset);
4549         btrfs_set_item_size(leaf, new_item, item_size - split_offset);
4550
4551         btrfs_set_item_offset(leaf, item,
4552                               orig_offset + item_size - split_offset);
4553         btrfs_set_item_size(leaf, item, split_offset);
4554
4555         btrfs_set_header_nritems(leaf, nritems + 1);
4556
4557         /* write the data for the start of the original item */
4558         write_extent_buffer(leaf, buf,
4559                             btrfs_item_ptr_offset(leaf, path->slots[0]),
4560                             split_offset);
4561
4562         /* write the data for the new item */
4563         write_extent_buffer(leaf, buf + split_offset,
4564                             btrfs_item_ptr_offset(leaf, slot),
4565                             item_size - split_offset);
4566         btrfs_mark_buffer_dirty(leaf);
4567
4568         BUG_ON(btrfs_leaf_free_space(leaf) < 0);
4569         kfree(buf);
4570         return 0;
4571 }
4572
4573 /*
4574  * This function splits a single item into two items,
4575  * giving 'new_key' to the new item and splitting the
4576  * old one at split_offset (from the start of the item).
4577  *
4578  * The path may be released by this operation.  After
4579  * the split, the path is pointing to the old item.  The
4580  * new item is going to be in the same node as the old one.
4581  *
4582  * Note, the item being split must be small enough to live alone on
4583  * a tree block with room for one extra struct btrfs_item.
4584  *
4585  * This allows us to split the item in place, keeping a lock on the
4586  * leaf the entire time.
4587  */
4588 int btrfs_split_item(struct btrfs_trans_handle *trans,
4589                      struct btrfs_root *root,
4590                      struct btrfs_path *path,
4591                      const struct btrfs_key *new_key,
4592                      unsigned long split_offset)
4593 {
4594         int ret;
4595         ret = setup_leaf_for_split(trans, root, path,
4596                                    sizeof(struct btrfs_item));
4597         if (ret)
4598                 return ret;
4599
4600         ret = split_item(path, new_key, split_offset);
4601         return ret;
4602 }
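
/*
 * Illustrative sketch, not part of the original file: position a path on an
 * item and split it in place with btrfs_split_item().  Note that
 * setup_leaf_for_split() only supports EXTENT_DATA and EXTENT_CSUM keys.
 * The function name and the caller-chosen keys are hypothetical.
 */
static int __maybe_unused example_split_item(struct btrfs_trans_handle *trans,
                                             struct btrfs_root *root,
                                             const struct btrfs_key *key,
                                             const struct btrfs_key *new_key,
                                             unsigned long split_offset)
{
        struct btrfs_path *path;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /* position the path on the item we want to split */
        ret = btrfs_search_slot(trans, root, key, path, 0, 1);
        if (ret > 0)
                ret = -ENOENT;
        if (ret) {
                btrfs_free_path(path);
                return ret;
        }

        /* bytes before split_offset keep 'key', the rest move to 'new_key' */
        ret = btrfs_split_item(trans, root, path, new_key, split_offset);
        btrfs_free_path(path);
        return ret;
}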
4603
4604 /*
4605  * This function duplicates an item, giving 'new_key' to the new item.
4606  * It guarantees both items live in the same tree leaf and the new item
4607  * is contiguous with the original item.
4608  *
4609  * This allows us to split a file extent in place, keeping a lock on the
4610  * leaf the entire time.
4611  */
4612 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4613                          struct btrfs_root *root,
4614                          struct btrfs_path *path,
4615                          const struct btrfs_key *new_key)
4616 {
4617         struct extent_buffer *leaf;
4618         int ret;
4619         u32 item_size;
4620
4621         leaf = path->nodes[0];
4622         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4623         ret = setup_leaf_for_split(trans, root, path,
4624                                    item_size + sizeof(struct btrfs_item));
4625         if (ret)
4626                 return ret;
4627
4628         path->slots[0]++;
4629         setup_items_for_insert(root, path, new_key, &item_size, 1);
4630         leaf = path->nodes[0];
4631         memcpy_extent_buffer(leaf,
4632                              btrfs_item_ptr_offset(leaf, path->slots[0]),
4633                              btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4634                              item_size);
4635         return 0;
4636 }
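
/*
 * Illustrative sketch, not part of the original file: duplicate the file
 * extent item a path points at under a key with a new offset, the way file
 * extent splitting uses this helper.  The function name is hypothetical.
 */
static int __maybe_unused example_dup_item(struct btrfs_trans_handle *trans,
                                           struct btrfs_root *root,
                                           struct btrfs_path *path,
                                           u64 new_offset)
{
        struct btrfs_key new_key;
        int ret;

        /* same objectid/type as the original, only the offset differs */
        btrfs_item_key_to_cpu(path->nodes[0], &new_key, path->slots[0]);
        new_key.offset = new_offset;

        ret = btrfs_duplicate_item(trans, root, path, &new_key);
        if (ret)
                return ret;

        /* on success path->slots[0] points at the new copy */
        return 0;
}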
4637
4638 /*
4639  * make the item pointed to by the path smaller.  new_size indicates
4640  * how small to make it, and from_end tells us if we just chop bytes
4641  * off the end of the item or if we shift the item to chop bytes off
4642  * the front.
4643  */
4644 void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
4645 {
4646         int slot;
4647         struct extent_buffer *leaf;
4648         struct btrfs_item *item;
4649         u32 nritems;
4650         unsigned int data_end;
4651         unsigned int old_data_start;
4652         unsigned int old_size;
4653         unsigned int size_diff;
4654         int i;
4655         struct btrfs_map_token token;
4656
4657         leaf = path->nodes[0];
4658         slot = path->slots[0];
4659
4660         old_size = btrfs_item_size_nr(leaf, slot);
4661         if (old_size == new_size)
4662                 return;
4663
4664         nritems = btrfs_header_nritems(leaf);
4665         data_end = leaf_data_end(leaf);
4666
4667         old_data_start = btrfs_item_offset_nr(leaf, slot);
4668
4669         size_diff = old_size - new_size;
4670
4671         BUG_ON(slot < 0);
4672         BUG_ON(slot >= nritems);
4673
4674         /*
4675          * item0..itemN ... dataN.offset..dataN.size .. data0.size
4676          */
4677         /* first correct the data pointers */
4678         btrfs_init_map_token(&token, leaf);
4679         for (i = slot; i < nritems; i++) {
4680                 u32 ioff;
4681                 item = btrfs_item_nr(i);
4682
4683                 ioff = btrfs_token_item_offset(&token, item);
4684                 btrfs_set_token_item_offset(&token, item, ioff + size_diff);
4685         }
4686
4687         /* shift the data */
4688         if (from_end) {
4689                 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4690                               data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
4691                               data_end, old_data_start + new_size - data_end);
4692         } else {
4693                 struct btrfs_disk_key disk_key;
4694                 u64 offset;
4695
4696                 btrfs_item_key(leaf, &disk_key, slot);
4697
4698                 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
4699                         unsigned long ptr;
4700                         struct btrfs_file_extent_item *fi;
4701
4702                         fi = btrfs_item_ptr(leaf, slot,
4703                                             struct btrfs_file_extent_item);
4704                         fi = (struct btrfs_file_extent_item *)(
4705                              (unsigned long)fi - size_diff);
4706
4707                         if (btrfs_file_extent_type(leaf, fi) ==
4708                             BTRFS_FILE_EXTENT_INLINE) {
4709                                 ptr = btrfs_item_ptr_offset(leaf, slot);
4710                                 memmove_extent_buffer(leaf, ptr,
4711                                       (unsigned long)fi,
4712                                       BTRFS_FILE_EXTENT_INLINE_DATA_START);
4713                         }
4714                 }
4715
4716                 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4717                               data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
4718                               data_end, old_data_start - data_end);
4719
4720                 offset = btrfs_disk_key_offset(&disk_key);
4721                 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
4722                 btrfs_set_item_key(leaf, &disk_key, slot);
4723                 if (slot == 0)
4724                         fixup_low_keys(path, &disk_key, 1);
4725         }
4726
4727         item = btrfs_item_nr(slot);
4728         btrfs_set_item_size(leaf, item, new_size);
4729         btrfs_mark_buffer_dirty(leaf);
4730
4731         if (btrfs_leaf_free_space(leaf) < 0) {
4732                 btrfs_print_leaf(leaf);
4733                 BUG();
4734         }
4735 }
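
/*
 * Illustrative sketch, not part of the original file: chop 'bytes' bytes off
 * the end of the item the path points at.  Assumes the path was set up by a
 * write search and the item is more than 'bytes' long; the function name is
 * hypothetical.
 */
static void __maybe_unused example_chop_item_tail(struct btrfs_path *path,
                                                  u32 bytes)
{
        u32 old_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]);

        /* from_end == 1: keep the front of the item, drop its tail */
        btrfs_truncate_item(path, old_size - bytes, 1);
}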
4736
4737 /*
4738  * make the item pointed to by the path bigger; data_size is the added size.
4739  */
4740 void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
4741 {
4742         int slot;
4743         struct extent_buffer *leaf;
4744         struct btrfs_item *item;
4745         u32 nritems;
4746         unsigned int data_end;
4747         unsigned int old_data;
4748         unsigned int old_size;
4749         int i;
4750         struct btrfs_map_token token;
4751
4752         leaf = path->nodes[0];
4753
4754         nritems = btrfs_header_nritems(leaf);
4755         data_end = leaf_data_end(leaf);
4756
4757         if (btrfs_leaf_free_space(leaf) < data_size) {
4758                 btrfs_print_leaf(leaf);
4759                 BUG();
4760         }
4761         slot = path->slots[0];
4762         old_data = btrfs_item_end_nr(leaf, slot);
4763
4764         BUG_ON(slot < 0);
4765         if (slot >= nritems) {
4766                 btrfs_print_leaf(leaf);
4767                 btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
4768                            slot, nritems);
4769                 BUG();
4770         }
4771
4772         /*
4773          * item0..itemN ... dataN.offset..dataN.size .. data0.size
4774          */
4775         /* first correct the data pointers */
4776         btrfs_init_map_token(&token, leaf);
4777         for (i = slot; i < nritems; i++) {
4778                 u32 ioff;
4779                 item = btrfs_item_nr(i);
4780
4781                 ioff = btrfs_token_item_offset(&token, item);
4782                 btrfs_set_token_item_offset(&token, item, ioff - data_size);
4783         }
4784
4785         /* shift the data */
4786         memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4787                       data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
4788                       data_end, old_data - data_end);
4789
4790         data_end = old_data;
4791         old_size = btrfs_item_size_nr(leaf, slot);
4792         item = btrfs_item_nr(slot);
4793         btrfs_set_item_size(leaf, item, old_size + data_size);
4794         btrfs_mark_buffer_dirty(leaf);
4795
4796         if (btrfs_leaf_free_space(leaf) < 0) {
4797                 btrfs_print_leaf(leaf);
4798                 BUG();
4799         }
4800 }
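
/*
 * Illustrative sketch, not part of the original file: grow the item a path
 * points at and fill in the newly added tail.  The caller must have ensured
 * the leaf has at least 'len' bytes free (e.g. by searching with a big
 * enough ins_len); the function name is hypothetical.
 */
static void __maybe_unused example_append_to_item(struct btrfs_path *path,
                                                  const void *data, u32 len)
{
        struct extent_buffer *leaf = path->nodes[0];
        u32 old_size = btrfs_item_size_nr(leaf, path->slots[0]);
        unsigned long ptr;

        btrfs_extend_item(path, len);

        /* the new bytes land at the end of the item, after the old data */
        ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
        write_extent_buffer(leaf, data, ptr + old_size, len);
        btrfs_mark_buffer_dirty(leaf);
}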
4801
4802 /**
4803  * setup_items_for_insert - Helper called before inserting one or more items
4804  * to a leaf. Main purpose is to save stack depth by doing the bulk of the work
4805  * into a leaf. Main purpose is to save stack depth by doing the bulk of the work
4806  *
4807  * @root:       root we are inserting items to
4808  * @path:       points to the leaf/slot where we are going to insert new items
4809  * @cpu_key:    array of keys for items to be inserted
4810  * @data_size:  size of the body of each item we are going to insert
4811  * @nr:         size of @cpu_key/@data_size arrays
4812  */
4813 void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4814                             const struct btrfs_key *cpu_key, u32 *data_size,
4815                             int nr)
4816 {
4817         struct btrfs_fs_info *fs_info = root->fs_info;
4818         struct btrfs_item *item;
4819         int i;
4820         u32 nritems;
4821         unsigned int data_end;
4822         struct btrfs_disk_key disk_key;
4823         struct extent_buffer *leaf;
4824         int slot;
4825         struct btrfs_map_token token;
4826         u32 total_size;
4827         u32 total_data = 0;
4828
4829         for (i = 0; i < nr; i++)
4830                 total_data += data_size[i];
4831         total_size = total_data + (nr * sizeof(struct btrfs_item));
4832
4833         if (path->slots[0] == 0) {
4834                 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4835                 fixup_low_keys(path, &disk_key, 1);
4836         }
4837         btrfs_unlock_up_safe(path, 1);
4838
4839         leaf = path->nodes[0];
4840         slot = path->slots[0];
4841
4842         nritems = btrfs_header_nritems(leaf);
4843         data_end = leaf_data_end(leaf);
4844
4845         if (btrfs_leaf_free_space(leaf) < total_size) {
4846                 btrfs_print_leaf(leaf);
4847                 btrfs_crit(fs_info, "not enough freespace need %u have %d",
4848                            total_size, btrfs_leaf_free_space(leaf));
4849                 BUG();
4850         }
4851
4852         btrfs_init_map_token(&token, leaf);
4853         if (slot != nritems) {
4854                 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4855
4856                 if (old_data < data_end) {
4857                         btrfs_print_leaf(leaf);
4858                         btrfs_crit(fs_info,
4859                 "item at slot %d with data offset %u beyond data end of leaf %u",
4860                                    slot, old_data, data_end);
4861                         BUG();
4862                 }
4863                 /*
4864                  * item0..itemN ... dataN.offset..dataN.size .. data0.size
4865                  */
4866                 /* first correct the data pointers */
4867                 for (i = slot; i < nritems; i++) {
4868                         u32 ioff;
4869
4870                         item = btrfs_item_nr(i);
4871                         ioff = btrfs_token_item_offset(&token, item);
4872                         btrfs_set_token_item_offset(&token, item,
4873                                                     ioff - total_data);
4874                 }
4875                 /* shift the items */
4876                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4877                               btrfs_item_nr_offset(slot),
4878                               (nritems - slot) * sizeof(struct btrfs_item));
4879
4880                 /* shift the data */
4881                 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4882                               data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
4883                               data_end, old_data - data_end);
4884                 data_end = old_data;
4885         }
4886
4887         /* setup the item for the new data */
4888         for (i = 0; i < nr; i++) {
4889                 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4890                 btrfs_set_item_key(leaf, &disk_key, slot + i);
4891                 item = btrfs_item_nr(slot + i);
4892                 data_end -= data_size[i];
4893                 btrfs_set_token_item_offset(&token, item, data_end);
4894                 btrfs_set_token_item_size(&token, item, data_size[i]);
4895         }
4896
4897         btrfs_set_header_nritems(leaf, nritems + nr);
4898         btrfs_mark_buffer_dirty(leaf);
4899
4900         if (btrfs_leaf_free_space(leaf) < 0) {
4901                 btrfs_print_leaf(leaf);
4902                 BUG();
4903         }
4904 }
4905
4906 /*
4907  * Given keys and data sizes, insert empty items into the tree.
4908  * This does all the path init required, making room in the tree if needed.
4909  */
4910 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4911                             struct btrfs_root *root,
4912                             struct btrfs_path *path,
4913                             const struct btrfs_key *cpu_key, u32 *data_size,
4914                             int nr)
4915 {
4916         int ret = 0;
4917         int slot;
4918         int i;
4919         u32 total_size = 0;
4920         u32 total_data = 0;
4921
4922         for (i = 0; i < nr; i++)
4923                 total_data += data_size[i];
4924
4925         total_size = total_data + (nr * sizeof(struct btrfs_item));
4926         ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4927         if (ret == 0)
4928                 return -EEXIST;
4929         if (ret < 0)
4930                 return ret;
4931
4932         slot = path->slots[0];
4933         BUG_ON(slot < 0);
4934
4935         setup_items_for_insert(root, path, cpu_key, data_size, nr);
4936         return 0;
4937 }
4938
4939 /*
4940  * Given a key and some data, insert an item into the tree.
4941  * This does all the path init required, making room in the tree if needed.
4942  */
4943 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4944                       const struct btrfs_key *cpu_key, void *data,
4945                       u32 data_size)
4946 {
4947         int ret = 0;
4948         struct btrfs_path *path;
4949         struct extent_buffer *leaf;
4950         unsigned long ptr;
4951
4952         path = btrfs_alloc_path();
4953         if (!path)
4954                 return -ENOMEM;
4955         ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4956         if (!ret) {
4957                 leaf = path->nodes[0];
4958                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4959                 write_extent_buffer(leaf, data, ptr, data_size);
4960                 btrfs_mark_buffer_dirty(leaf);
4961         }
4962         btrfs_free_path(path);
4963         return ret;
4964 }
4965
4966 /*
4967  * delete the pointer from a given node.
4968  *
4969  * the tree should have been previously balanced so the deletion does not
4970  * empty a node.
4971  */
4972 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
4973                     int level, int slot)
4974 {
4975         struct extent_buffer *parent = path->nodes[level];
4976         u32 nritems;
4977         int ret;
4978
4979         nritems = btrfs_header_nritems(parent);
4980         if (slot != nritems - 1) {
4981                 if (level) {
4982                         ret = tree_mod_log_insert_move(parent, slot, slot + 1,
4983                                         nritems - slot - 1);
4984                         BUG_ON(ret < 0);
4985                 }
4986                 memmove_extent_buffer(parent,
4987                               btrfs_node_key_ptr_offset(slot),
4988                               btrfs_node_key_ptr_offset(slot + 1),
4989                               sizeof(struct btrfs_key_ptr) *
4990                               (nritems - slot - 1));
4991         } else if (level) {
4992                 ret = tree_mod_log_insert_key(parent, slot, MOD_LOG_KEY_REMOVE,
4993                                 GFP_NOFS);
4994                 BUG_ON(ret < 0);
4995         }
4996
4997         nritems--;
4998         btrfs_set_header_nritems(parent, nritems);
4999         if (nritems == 0 && parent == root->node) {
5000                 BUG_ON(btrfs_header_level(root->node) != 1);
5001                 /* just turn the root into a leaf and break */
5002                 btrfs_set_header_level(root->node, 0);
5003         } else if (slot == 0) {
5004                 struct btrfs_disk_key disk_key;
5005
5006                 btrfs_node_key(parent, &disk_key, 0);
5007                 fixup_low_keys(path, &disk_key, level + 1);
5008         }
5009         btrfs_mark_buffer_dirty(parent);
5010 }
5011
5012 /*
5013  * a helper function to delete the leaf pointed to by path->slots[1] and
5014  * path->nodes[1].
5015  *
5016  * This deletes the pointer in path->nodes[1] and frees the leaf
5017  * block extent.
5018  *
5019  * The path must have already been setup for deleting the leaf, including
5020  * all the proper balancing.  path->nodes[1] must be locked.
5021  */
5022 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
5023                                     struct btrfs_root *root,
5024                                     struct btrfs_path *path,
5025                                     struct extent_buffer *leaf)
5026 {
5027         WARN_ON(btrfs_header_generation(leaf) != trans->transid);
5028         del_ptr(root, path, 1, path->slots[1]);
5029
5030         /*
5031          * btrfs_free_extent is expensive, we want to make sure we
5032          * aren't holding any locks when we call it
5033          */
5034         btrfs_unlock_up_safe(path, 0);
5035
5036         root_sub_used(root, leaf->len);
5037
5038         atomic_inc(&leaf->refs);
5039         btrfs_free_tree_block(trans, root, leaf, 0, 1);
5040         free_extent_buffer_stale(leaf);
5041 }
5042 /*
5043  * delete the item at the leaf level in path.  If that empties
5044  * the leaf, remove it from the tree
5045  */
5046 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5047                     struct btrfs_path *path, int slot, int nr)
5048 {
5049         struct btrfs_fs_info *fs_info = root->fs_info;
5050         struct extent_buffer *leaf;
5051         struct btrfs_item *item;
5052         u32 last_off;
5053         u32 dsize = 0;
5054         int ret = 0;
5055         int wret;
5056         int i;
5057         u32 nritems;
5058
5059         leaf = path->nodes[0];
5060         last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
5061
5062         for (i = 0; i < nr; i++)
5063                 dsize += btrfs_item_size_nr(leaf, slot + i);
5064
5065         nritems = btrfs_header_nritems(leaf);
5066
5067         if (slot + nr != nritems) {
5068                 int data_end = leaf_data_end(leaf);
5069                 struct btrfs_map_token token;
5070
5071                 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
5072                               data_end + dsize,
5073                               BTRFS_LEAF_DATA_OFFSET + data_end,
5074                               last_off - data_end);
5075
5076                 btrfs_init_map_token(&token, leaf);
5077                 for (i = slot + nr; i < nritems; i++) {
5078                         u32 ioff;
5079
5080                         item = btrfs_item_nr(i);
5081                         ioff = btrfs_token_item_offset(&token, item);
5082                         btrfs_set_token_item_offset(&token, item, ioff + dsize);
5083                 }
5084
5085                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
5086                               btrfs_item_nr_offset(slot + nr),
5087                               sizeof(struct btrfs_item) *
5088                               (nritems - slot - nr));
5089         }
5090         btrfs_set_header_nritems(leaf, nritems - nr);
5091         nritems -= nr;
5092
5093         /* delete the leaf if we've emptied it */
5094         if (nritems == 0) {
5095                 if (leaf == root->node) {
5096                         btrfs_set_header_level(leaf, 0);
5097                 } else {
5098                         btrfs_set_path_blocking(path);
5099                         btrfs_clean_tree_block(leaf);
5100                         btrfs_del_leaf(trans, root, path, leaf);
5101                 }
5102         } else {
5103                 int used = leaf_space_used(leaf, 0, nritems);
5104                 if (slot == 0) {
5105                         struct btrfs_disk_key disk_key;
5106
5107                         btrfs_item_key(leaf, &disk_key, 0);
5108                         fixup_low_keys(path, &disk_key, 1);
5109                 }
5110
5111                 /* delete the leaf if it is mostly empty */
5112                 if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
5113                         /* push_leaf_left fixes the path.  Make sure the
5114                          * path still points to our leaf for a possible
5115                          * call to del_ptr below.
5116                          */
5117                         slot = path->slots[1];
5118                         atomic_inc(&leaf->refs);
5119
5120                         btrfs_set_path_blocking(path);
5121                         wret = push_leaf_left(trans, root, path, 1, 1,
5122                                               1, (u32)-1);
5123                         if (wret < 0 && wret != -ENOSPC)
5124                                 ret = wret;
5125
5126                         if (path->nodes[0] == leaf &&
5127                             btrfs_header_nritems(leaf)) {
5128                                 wret = push_leaf_right(trans, root, path, 1,
5129                                                        1, 1, 0);
5130                                 if (wret < 0 && wret != -ENOSPC)
5131                                         ret = wret;
5132                         }
5133
5134                         if (btrfs_header_nritems(leaf) == 0) {
5135                                 path->slots[1] = slot;
5136                                 btrfs_del_leaf(trans, root, path, leaf);
5137                                 free_extent_buffer(leaf);
5138                                 ret = 0;
5139                         } else {
5140                                 /* if we're still in the path, make sure
5141                                  * we're dirty.  Otherwise, one of the
5142                                  * push_leaf functions must have already
5143                                  * dirtied this buffer
5144                                  */
5145                                 if (path->nodes[0] == leaf)
5146                                         btrfs_mark_buffer_dirty(leaf);
5147                                 free_extent_buffer(leaf);
5148                         }
5149                 } else {
5150                         btrfs_mark_buffer_dirty(leaf);
5151                 }
5152         }
5153         return ret;
5154 }
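
/*
 * Illustrative sketch, not part of the original file: look up a key and
 * delete that single item; btrfs_del_items() rebalances or deletes the leaf
 * as needed.  The function name is hypothetical.
 */
static int __maybe_unused example_delete_one(struct btrfs_trans_handle *trans,
                                             struct btrfs_root *root,
                                             const struct btrfs_key *key)
{
        struct btrfs_path *path;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        /* ins_len == -1 tells the search we intend to delete from the leaf */
        ret = btrfs_search_slot(trans, root, key, path, -1, 1);
        if (ret > 0)
                ret = -ENOENT;
        if (ret == 0)
                ret = btrfs_del_items(trans, root, path, path->slots[0], 1);

        btrfs_free_path(path);
        return ret;
}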
5155
5156 /*
5157  * search the tree again to find a leaf with lesser keys
5158  * returns 0 if it found something or 1 if there are no lesser leaves.
5159  * returns < 0 on io errors.
5160  *
5161  * This may release the path, and so you may lose any locks held at the
5162  * time you call it.
5163  */
5164 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5165 {
5166         struct btrfs_key key;
5167         struct btrfs_key orig_key;
5168         struct btrfs_disk_key found_key;
5169         int ret;
5170
5171         btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5172         orig_key = key;
5173
5174         if (key.offset > 0) {
5175                 key.offset--;
5176         } else if (key.type > 0) {
5177                 key.type--;
5178                 key.offset = (u64)-1;
5179         } else if (key.objectid > 0) {
5180                 key.objectid--;
5181                 key.type = (u8)-1;
5182                 key.offset = (u64)-1;
5183         } else {
5184                 return 1;
5185         }
5186
5187         btrfs_release_path(path);
5188         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5189         if (ret <= 0)
5190                 return ret;
5191
5192         /*
5193          * Previous key not found. Even if we were at slot 0 of the leaf we had
5194          * before releasing the path and calling btrfs_search_slot(), we now may
5195          * be in a slot pointing to the same original key - this can happen if
5196          * after we released the path, one or more items were moved from a
5197          * sibling leaf into the front of the leaf we had due to an insertion
5198          * (see push_leaf_right()).
5199          * If we hit this case and our slot is > 0, just decrement the slot
5200          * so that the caller does not process the same key again; this may or
5201          * may not break the caller, depending on its logic.
5202          */
5203         if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
5204                 btrfs_item_key(path->nodes[0], &found_key, path->slots[0]);
5205                 ret = comp_keys(&found_key, &orig_key);
5206                 if (ret == 0) {
5207                         if (path->slots[0] > 0) {
5208                                 path->slots[0]--;
5209                                 return 0;
5210                         }
5211                         /*
5212                          * At slot 0, same key as before, it means orig_key is
5213                          * the lowest, leftmost, key in the tree. We're done.
5214                          */
5215                         return 1;
5216                 }
5217         }
5218
5219         btrfs_item_key(path->nodes[0], &found_key, 0);
5220         ret = comp_keys(&found_key, &key);
5221         /*
5222          * We might have had an item with the previous key in the tree right
5223          * before we released our path. And after we released our path, that
5224          * item might have been pushed to the first slot (0) of the leaf we
5225          * were holding due to a tree balance. Alternatively, an item with the
5226          * previous key can exist as the only element of a leaf (big fat item).
5227          * Therefore account for these 2 cases, so that our callers (like
5228          * btrfs_previous_item) don't miss an existing item with a key matching
5229          * the previous key we computed above.
5230          */
5231         if (ret <= 0)
5232                 return 0;
5233         return 1;
5234 }
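
/*
 * Illustrative sketch, not part of the original file: step a path one item
 * backwards, crossing a leaf boundary with btrfs_prev_leaf() when needed.
 * Returns 0 when positioned, 1 at the start of the tree; the function name
 * is hypothetical.
 */
static int __maybe_unused example_step_back(struct btrfs_root *root,
                                            struct btrfs_path *path)
{
        if (path->slots[0] > 0) {
                path->slots[0]--;
                return 0;
        }
        /* this may release and re-search the path, dropping held locks */
        return btrfs_prev_leaf(root, path);
}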
5235
5236 /*
5237  * A helper function to walk down the tree starting at min_key, looking
5238  * for nodes or leaves that have a minimum transaction id.
5239  * This is used by the btree defrag code and tree logging.
5240  *
5241  * This does not cow, but it does stuff the starting key it finds back
5242  * into min_key, so you can call btrfs_search_slot with cow=1 on the
5243  * key and get a writable path.
5244  *
5245  * This honors path->lowest_level to prevent descent past a given level
5246  * of the tree.
5247  *
5248  * min_trans indicates the oldest transaction that you are interested
5249  * in walking through.  Any nodes or leaves older than min_trans are
5250  * skipped over (without reading them).
5251  *
5252  * returns zero if something useful was found, < 0 on error and 1 if there
5253  * was nothing in the tree that matched the search criteria.
5254  */
5255 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
5256                          struct btrfs_path *path,
5257                          u64 min_trans)
5258 {
5259         struct extent_buffer *cur;
5260         struct btrfs_key found_key;
5261         int slot;
5262         int sret;
5263         u32 nritems;
5264         int level;
5265         int ret = 1;
5266         int keep_locks = path->keep_locks;
5267
5268         path->keep_locks = 1;
5269 again:
5270         cur = btrfs_read_lock_root_node(root);
5271         level = btrfs_header_level(cur);
5272         WARN_ON(path->nodes[level]);
5273         path->nodes[level] = cur;
5274         path->locks[level] = BTRFS_READ_LOCK;
5275
5276         if (btrfs_header_generation(cur) < min_trans) {
5277                 ret = 1;
5278                 goto out;
5279         }
5280         while (1) {
5281                 nritems = btrfs_header_nritems(cur);
5282                 level = btrfs_header_level(cur);
5283                 sret = btrfs_bin_search(cur, min_key, &slot);
5284                 if (sret < 0) {
5285                         ret = sret;
5286                         goto out;
5287                 }
5288
5289                 /* at the lowest level, we're done, setup the path and exit */
5290                 if (level == path->lowest_level) {
5291                         if (slot >= nritems)
5292                                 goto find_next_key;
5293                         ret = 0;
5294                         path->slots[level] = slot;
5295                         btrfs_item_key_to_cpu(cur, &found_key, slot);
5296                         goto out;
5297                 }
5298                 if (sret && slot > 0)
5299                         slot--;
5300                 /*
5301                  * check this node pointer against the min_trans parameter.
5302                  * If it is too old, skip to the next one.
5303                  */
5304                 while (slot < nritems) {
5305                         u64 gen;
5306
5307                         gen = btrfs_node_ptr_generation(cur, slot);
5308                         if (gen < min_trans) {
5309                                 slot++;
5310                                 continue;
5311                         }
5312                         break;
5313                 }
5314 find_next_key:
5315                 /*
5316                  * we didn't find a candidate key in this node, walk forward
5317                  * and find another one
5318                  */
5319                 if (slot >= nritems) {
5320                         path->slots[level] = slot;
5321                         btrfs_set_path_blocking(path);
5322                         sret = btrfs_find_next_key(root, path, min_key, level,
5323                                                   min_trans);
5324                         if (sret == 0) {
5325                                 btrfs_release_path(path);
5326                                 goto again;
5327                         } else {
5328                                 goto out;
5329                         }
5330                 }
5331                 /* save our key so we can return it to the caller */
5332                 btrfs_node_key_to_cpu(cur, &found_key, slot);
5333                 path->slots[level] = slot;
5334                 if (level == path->lowest_level) {
5335                         ret = 0;
5336                         goto out;
5337                 }
5338                 btrfs_set_path_blocking(path);
5339                 cur = btrfs_read_node_slot(cur, slot);
5340                 if (IS_ERR(cur)) {
5341                         ret = PTR_ERR(cur);
5342                         goto out;
5343                 }
5344
5345                 btrfs_tree_read_lock(cur);
5346
5347                 path->locks[level - 1] = BTRFS_READ_LOCK;
5348                 path->nodes[level - 1] = cur;
5349                 unlock_up(path, level, 1, 0, NULL);
5350         }
5351 out:
5352         path->keep_locks = keep_locks;
5353         if (ret == 0) {
5354                 btrfs_unlock_up_safe(path, path->lowest_level + 1);
5355                 btrfs_set_path_blocking(path);
5356                 memcpy(min_key, &found_key, sizeof(found_key));
5357         }
5358         return ret;
5359 }
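
/*
 * Illustrative sketch, not part of the original file: visit every item that
 * lives in a block with generation >= min_trans, the way the defrag and
 * tree-log code drive btrfs_search_forward().  The function name is
 * hypothetical.
 */
static int __maybe_unused example_walk_newer_than(struct btrfs_root *root,
                                                  u64 min_trans)
{
        struct btrfs_path *path;
        struct btrfs_key min_key = { 0, 0, 0 };
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        while (1) {
                /* on success, min_key holds the key that was found */
                ret = btrfs_search_forward(root, &min_key, path, min_trans);
                if (ret)        /* < 0 on error, 1 when nothing is left */
                        break;

                /* process path->nodes[0], slot path->slots[0] here */

                btrfs_release_path(path);
                /* bump the key so we don't find the same item again */
                if (min_key.offset < (u64)-1) {
                        min_key.offset++;
                } else if (min_key.type < (u8)-1) {
                        min_key.type++;
                        min_key.offset = 0;
                } else if (min_key.objectid < (u64)-1) {
                        min_key.objectid++;
                        min_key.type = 0;
                        min_key.offset = 0;
                } else {
                        break;
                }
        }
        btrfs_free_path(path);
        return ret < 0 ? ret : 0;
}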
5360
5361 /*
5362  * this is similar to btrfs_next_leaf, but does not try to preserve
5363  * and fix up the path.  It looks for and returns the next key in the
5364  * tree based on the current path and the min_trans parameters.
5365  *
5366  * 0 is returned if another key is found, < 0 if there are any errors
5367  * and 1 is returned if there are no higher keys in the tree
5368  *
5369  * path->keep_locks should be set to 1 on the search made before
5370  * calling this function.
5371  */
5372 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
5373                         struct btrfs_key *key, int level, u64 min_trans)
5374 {
5375         int slot;
5376         struct extent_buffer *c;
5377
5378         WARN_ON(!path->keep_locks && !path->skip_locking);
5379         while (level < BTRFS_MAX_LEVEL) {
5380                 if (!path->nodes[level])
5381                         return 1;
5382
5383                 slot = path->slots[level] + 1;
5384                 c = path->nodes[level];
5385 next:
5386                 if (slot >= btrfs_header_nritems(c)) {
5387                         int ret;
5388                         int orig_lowest;
5389                         struct btrfs_key cur_key;
5390                         if (level + 1 >= BTRFS_MAX_LEVEL ||
5391                             !path->nodes[level + 1])
5392                                 return 1;
5393
5394                         if (path->locks[level + 1] || path->skip_locking) {
5395                                 level++;
5396                                 continue;
5397                         }
5398
5399                         slot = btrfs_header_nritems(c) - 1;
5400                         if (level == 0)
5401                                 btrfs_item_key_to_cpu(c, &cur_key, slot);
5402                         else
5403                                 btrfs_node_key_to_cpu(c, &cur_key, slot);
5404
5405                         orig_lowest = path->lowest_level;
5406                         btrfs_release_path(path);
5407                         path->lowest_level = level;
5408                         ret = btrfs_search_slot(NULL, root, &cur_key, path,
5409                                                 0, 0);
5410                         path->lowest_level = orig_lowest;
5411                         if (ret < 0)
5412                                 return ret;
5413
5414                         c = path->nodes[level];
5415                         slot = path->slots[level];
5416                         if (ret == 0)
5417                                 slot++;
5418                         goto next;
5419                 }
5420
5421                 if (level == 0)
5422                         btrfs_item_key_to_cpu(c, key, slot);
5423                 else {
5424                         u64 gen = btrfs_node_ptr_generation(c, slot);
5425
5426                         if (gen < min_trans) {
5427                                 slot++;
5428                                 goto next;
5429                         }
5430                         btrfs_node_key_to_cpu(c, key, slot);
5431                 }
5432                 return 0;
5433         }
5434         return 1;
5435 }
5436
5437 /*
5438  * search the tree again to find a leaf with greater keys
5439  * returns 0 if it found something or 1 if there are no greater leaves.
5440  * returns < 0 on io errors.
5441  */
5442 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
5443 {
5444         return btrfs_next_old_leaf(root, path, 0);
5445 }
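
/*
 * Illustrative sketch, not part of the original file: the common read-only
 * scan pattern, calling btrfs_next_leaf() whenever the current leaf is
 * exhausted.  The function name and the starting key are hypothetical.
 */
static int __maybe_unused example_scan_from(struct btrfs_root *root,
                                            const struct btrfs_key *first)
{
        struct btrfs_path *path;
        struct btrfs_key key;
        int ret;

        path = btrfs_alloc_path();
        if (!path)
                return -ENOMEM;

        ret = btrfs_search_slot(NULL, root, first, path, 0, 0);
        if (ret < 0)
                goto out;

        while (1) {
                struct extent_buffer *leaf = path->nodes[0];
                int slot = path->slots[0];

                if (slot >= btrfs_header_nritems(leaf)) {
                        ret = btrfs_next_leaf(root, path);
                        if (ret)        /* < 0 on error, 1 at end of tree */
                                break;
                        continue;
                }
                btrfs_item_key_to_cpu(leaf, &key, slot);

                /* process the item for 'key' here */

                path->slots[0]++;
        }
out:
        btrfs_free_path(path);
        return ret < 0 ? ret : 0;
}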
5446
5447 int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
5448                         u64 time_seq)
5449 {
5450         int slot;
5451         int level;
5452         struct extent_buffer *c;
5453         struct extent_buffer *next;
5454         struct btrfs_key key;
5455         u32 nritems;
5456         int ret;
5457         int old_spinning = path->leave_spinning;
5458         int next_rw_lock = 0;
5459
5460         nritems = btrfs_header_nritems(path->nodes[0]);
5461         if (nritems == 0)
5462                 return 1;
5463
5464         btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
5465 again:
5466         level = 1;
5467         next = NULL;
5468         next_rw_lock = 0;
5469         btrfs_release_path(path);
5470
5471         path->keep_locks = 1;
5472         path->leave_spinning = 1;
5473
5474         if (time_seq)
5475                 ret = btrfs_search_old_slot(root, &key, path, time_seq);
5476         else
5477                 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5478         path->keep_locks = 0;
5479
5480         if (ret < 0)
5481                 return ret;
5482
5483         nritems = btrfs_header_nritems(path->nodes[0]);
5484         /*
5485          * by releasing the path above we dropped all our locks.  A balance
5486          * could have added more items next to the key that used to be
5487          * at the very end of the block.  So, check again here and
5488          * advance the path if there are now more items available.
5489          */
5490         if (nritems > 0 && path->slots[0] < nritems - 1) {
5491                 if (ret == 0)
5492                         path->slots[0]++;
5493                 ret = 0;
5494                 goto done;
5495         }
5496         /*
5497          * So the above check misses one case:
5498          * - after releasing the path above, someone has removed the item that
5499          *   used to be at the very end of the block, and balance between leaves
5500          *   gets another one with bigger key.offset to replace it.
5501          *
5502          * This one should be returned as well, or we can get leaf corruption
5503          * later (esp. in __btrfs_drop_extents()).
5504          *
5505          * And a bit more explanation about this check,
5506          * with ret > 0, the key isn't found, the path points to the slot
5507          * where it should be inserted, so the path->slots[0] item must be the
5508          * bigger one.
5509          */
5510         if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
5511                 ret = 0;
5512                 goto done;
5513         }
5514
5515         while (level < BTRFS_MAX_LEVEL) {
5516                 if (!path->nodes[level]) {
5517                         ret = 1;
5518                         goto done;
5519                 }
5520
5521                 slot = path->slots[level] + 1;
5522                 c = path->nodes[level];
5523                 if (slot >= btrfs_header_nritems(c)) {
5524                         level++;
5525                         if (level == BTRFS_MAX_LEVEL) {
5526                                 ret = 1;
5527                                 goto done;
5528                         }
5529                         continue;
5530                 }
5531
5532                 if (next) {
5533                         btrfs_tree_unlock_rw(next, next_rw_lock);
5534                         free_extent_buffer(next);
5535                 }
5536
5537                 next = c;
5538                 next_rw_lock = path->locks[level];
5539                 ret = read_block_for_search(root, path, &next, level,
5540                                             slot, &key);
5541                 if (ret == -EAGAIN)
5542                         goto again;
5543
5544                 if (ret < 0) {
5545                         btrfs_release_path(path);
5546                         goto done;
5547                 }
5548
5549                 if (!path->skip_locking) {
5550                         ret = btrfs_try_tree_read_lock(next);
5551                         if (!ret && time_seq) {
5552                                 /*
5553                                  * If we don't get the lock, we may be racing
5554                                  * with push_leaf_left, which holds that lock
5555                                  * while itself waiting for the leaf we have
5556                                  * currently locked. To resolve this, we give
5557                                  * up our lock and cycle back to the start.
5558                                  */
5559                                 free_extent_buffer(next);
5560                                 btrfs_release_path(path);
5561                                 cond_resched();
5562                                 goto again;
5563                         }
5564                         if (!ret) {
5565                                 btrfs_set_path_blocking(path);
5566                                 __btrfs_tree_read_lock(next,
5567                                                        BTRFS_NESTING_RIGHT,
5568                                                        path->recurse);
5569                         }
5570                         next_rw_lock = BTRFS_READ_LOCK;
5571                 }
5572                 break;
5573         }
5574         path->slots[level] = slot;
5575         while (1) {
5576                 level--;
5577                 c = path->nodes[level];
5578                 if (path->locks[level])
5579                         btrfs_tree_unlock_rw(c, path->locks[level]);
5580
5581                 free_extent_buffer(c);
5582                 path->nodes[level] = next;
5583                 path->slots[level] = 0;
5584                 if (!path->skip_locking)
5585                         path->locks[level] = next_rw_lock;
5586                 if (!level)
5587                         break;
5588
5589                 ret = read_block_for_search(root, path, &next, level,
5590                                             0, &key);
5591                 if (ret == -EAGAIN)
5592                         goto again;
5593
5594                 if (ret < 0) {
5595                         btrfs_release_path(path);
5596                         goto done;
5597                 }
5598
5599                 if (!path->skip_locking) {
5600                         ret = btrfs_try_tree_read_lock(next);
5601                         if (!ret) {
5602                                 btrfs_set_path_blocking(path);
5603                                 __btrfs_tree_read_lock(next,
5604                                                        BTRFS_NESTING_RIGHT,
5605                                                        path->recurse);
5606                         }
5607                         next_rw_lock = BTRFS_READ_LOCK;
5608                 }
5609         }
5610         ret = 0;
5611 done:
5612         unlock_up(path, 0, 1, 0, NULL);
5613         path->leave_spinning = old_spinning;
5614         if (!old_spinning)
5615                 btrfs_set_path_blocking(path);
5616
5617         return ret;
5618 }
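/*
 * Illustrative sketch only: walking a historical version of a tree with
 * btrfs_search_old_slot() and btrfs_next_old_leaf().  Unlike the live
 * variant, no transaction is involved; @time_seq is assumed to come from
 * the tree mod log machinery used by the backref/qgroup code, and the
 * function name is hypothetical.
 */
static int example_old_scan(struct btrfs_root *root, u64 time_seq)
{
	struct btrfs_path *path;
	struct btrfs_key key = { 0 };
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	ret = btrfs_search_old_slot(root, &key, path, time_seq);
	if (ret < 0)
		goto out;

	while (1) {
		if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
			/* same return contract as btrfs_next_leaf() */
			ret = btrfs_next_old_leaf(root, path, time_seq);
			if (ret)
				break;
		}
		/* examine path->nodes[0] / path->slots[0] here */
		path->slots[0]++;
	}
	if (ret > 0)	/* ran past the last leaf: not an error */
		ret = 0;
out:
	btrfs_free_path(path);
	return ret;
}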
5619
5620 /*
5621  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5622  * searching until it gets past min_objectid or finds an item of 'type'
5623  *
5624  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5625  */
5626 int btrfs_previous_item(struct btrfs_root *root,
5627                         struct btrfs_path *path, u64 min_objectid,
5628                         int type)
5629 {
5630         struct btrfs_key found_key;
5631         struct extent_buffer *leaf;
5632         u32 nritems;
5633         int ret;
5634
5635         while (1) {
5636                 if (path->slots[0] == 0) {
5637                         btrfs_set_path_blocking(path);
5638                         ret = btrfs_prev_leaf(root, path);
5639                         if (ret != 0)
5640                                 return ret;
5641                 } else {
5642                         path->slots[0]--;
5643                 }
5644                 leaf = path->nodes[0];
5645                 nritems = btrfs_header_nritems(leaf);
5646                 if (nritems == 0)
5647                         return 1;
5648                 if (path->slots[0] == nritems)
5649                         path->slots[0]--;
5650
5651                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5652                 if (found_key.objectid < min_objectid)
5653                         break;
5654                 if (found_key.type == type)
5655                         return 0;
5656                 if (found_key.objectid == min_objectid &&
5657                     found_key.type < type)
5658                         break;
5659         }
5660         return 1;
5661 }
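/*
 * Illustrative sketch only: a common caller pattern for
 * btrfs_previous_item().  Search for a key just past everything belonging
 * to @objectid, then step back to the closest preceding
 * BTRFS_INODE_ITEM_KEY item.  The function name is hypothetical.
 */
static int example_find_prev_inode_item(struct btrfs_root *root,
					struct btrfs_path *path, u64 objectid)
{
	struct btrfs_key key;
	int ret;

	key.objectid = objectid;
	key.type = (u8)-1;
	key.offset = (u64)-1;

	ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	if (ret < 0)
		return ret;

	/* 0: positioned on an inode item, 1: nothing found, < 0: error */
	return btrfs_previous_item(root, path, objectid, BTRFS_INODE_ITEM_KEY);
}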
5662
5663 /*
5664  * search in the extent tree to find a previous Metadata/Data extent item
5665  * with min objectid
5666  *
5667  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5668  */
5669 int btrfs_previous_extent_item(struct btrfs_root *root,
5670                         struct btrfs_path *path, u64 min_objectid)
5671 {
5672         struct btrfs_key found_key;
5673         struct extent_buffer *leaf;
5674         u32 nritems;
5675         int ret;
5676
5677         while (1) {
5678                 if (path->slots[0] == 0) {
5679                         btrfs_set_path_blocking(path);
5680                         ret = btrfs_prev_leaf(root, path);
5681                         if (ret != 0)
5682                                 return ret;
5683                 } else {
5684                         path->slots[0]--;
5685                 }
5686                 leaf = path->nodes[0];
5687                 nritems = btrfs_header_nritems(leaf);
5688                 if (nritems == 0)
5689                         return 1;
5690                 if (path->slots[0] == nritems)
5691                         path->slots[0]--;
5692
5693                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5694                 if (found_key.objectid < min_objectid)
5695                         break;
5696                 if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5697                     found_key.type == BTRFS_METADATA_ITEM_KEY)
5698                         return 0;
5699                 if (found_key.objectid == min_objectid &&
5700                     found_key.type < BTRFS_EXTENT_ITEM_KEY)
5701                         break;
5702         }
5703         return 1;
5704 }
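/*
 * Illustrative sketch only: step a path that is already positioned in the
 * extent tree back to the closest preceding extent item whose bytenr (its
 * key objectid) is still >= @min_bytenr, and read out its key.  Variable
 * and function names are hypothetical.
 */
static int example_prev_extent(struct btrfs_root *extent_root,
			       struct btrfs_path *path, u64 min_bytenr,
			       struct btrfs_key *found)
{
	int ret;

	ret = btrfs_previous_extent_item(extent_root, path, min_bytenr);
	if (ret)	/* 1: nothing found, < 0: error */
		return ret;

	/* on success the path points at an EXTENT_ITEM or METADATA_ITEM */
	btrfs_item_key_to_cpu(path->nodes[0], found, path->slots[0]);
	return 0;
}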