608e41b61689cd75b23a359ef6034cb95d1017a0
[releases.git] / ctree.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2007,2008 Oracle.  All rights reserved.
4  */
5
6 #include <linux/sched.h>
7 #include <linux/slab.h>
8 #include <linux/rbtree.h>
9 #include <linux/mm.h>
10 #include "ctree.h"
11 #include "disk-io.h"
12 #include "transaction.h"
13 #include "print-tree.h"
14 #include "locking.h"
15 #include "volumes.h"
16 #include "qgroup.h"
17
18 static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
19                       *root, struct btrfs_path *path, int level);
20 static int split_leaf(struct btrfs_trans_handle *trans, struct btrfs_root *root,
21                       const struct btrfs_key *ins_key, struct btrfs_path *path,
22                       int data_size, int extend);
23 static int push_node_left(struct btrfs_trans_handle *trans,
24                           struct extent_buffer *dst,
25                           struct extent_buffer *src, int empty);
26 static int balance_node_right(struct btrfs_trans_handle *trans,
27                               struct extent_buffer *dst_buf,
28                               struct extent_buffer *src_buf);
29 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
30                     int level, int slot);
31
/*
 * Table of supported checksum algorithms, indexed by the on-disk csum
 * type.  @size is the number of bytes the checksum occupies.  Only
 * crc32c is present in this table.
 */
static const struct btrfs_csums {
	u16		size;
	const char	*name;
} btrfs_csums[] = {
	[BTRFS_CSUM_TYPE_CRC32] = { .size = 4, .name = "crc32c" },
};
38
39 int btrfs_super_csum_size(const struct btrfs_super_block *s)
40 {
41         u16 t = btrfs_super_csum_type(s);
42         /*
43          * csum type is validated at mount time
44          */
45         return btrfs_csums[t].size;
46 }
47
48 const char *btrfs_super_csum_name(u16 csum_type)
49 {
50         /* csum type is validated at mount time */
51         return btrfs_csums[csum_type].name;
52 }
53
/*
 * Allocate a zeroed btrfs_path from the dedicated slab cache.  Returns
 * NULL on allocation failure.  Release with btrfs_free_path().
 */
struct btrfs_path *btrfs_alloc_path(void)
{
	return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}
58
59 /*
60  * set all locked nodes in the path to blocking locks.  This should
61  * be done before scheduling
62  */
63 noinline void btrfs_set_path_blocking(struct btrfs_path *p)
64 {
65         int i;
66         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
67                 if (!p->nodes[i] || !p->locks[i])
68                         continue;
69                 /*
70                  * If we currently have a spinning reader or writer lock this
71                  * will bump the count of blocking holders and drop the
72                  * spinlock.
73                  */
74                 if (p->locks[i] == BTRFS_READ_LOCK) {
75                         btrfs_set_lock_blocking_read(p->nodes[i]);
76                         p->locks[i] = BTRFS_READ_LOCK_BLOCKING;
77                 } else if (p->locks[i] == BTRFS_WRITE_LOCK) {
78                         btrfs_set_lock_blocking_write(p->nodes[i]);
79                         p->locks[i] = BTRFS_WRITE_LOCK_BLOCKING;
80                 }
81         }
82 }
83
84 /* this also releases the path */
85 void btrfs_free_path(struct btrfs_path *p)
86 {
87         if (!p)
88                 return;
89         btrfs_release_path(p);
90         kmem_cache_free(btrfs_path_cachep, p);
91 }
92
93 /*
94  * path release drops references on the extent buffers in the path
95  * and it drops any locks held by this path
96  *
97  * It is safe to call this on paths that no locks or extent buffers held.
98  */
99 noinline void btrfs_release_path(struct btrfs_path *p)
100 {
101         int i;
102
103         for (i = 0; i < BTRFS_MAX_LEVEL; i++) {
104                 p->slots[i] = 0;
105                 if (!p->nodes[i])
106                         continue;
107                 if (p->locks[i]) {
108                         btrfs_tree_unlock_rw(p->nodes[i], p->locks[i]);
109                         p->locks[i] = 0;
110                 }
111                 free_extent_buffer(p->nodes[i]);
112                 p->nodes[i] = NULL;
113         }
114 }
115
/*
 * safely gets a reference on the root node of a tree.  A lock
 * is not taken, so a concurrent writer may put a different node
 * at the root of the tree.  See btrfs_lock_root_node for the
 * looping required.
 *
 * The extent buffer returned by this has a reference taken, so
 * it won't disappear.  It may stop being the root of the tree
 * at any time because there are no locks held.  Callers drop the
 * reference with free_extent_buffer().
 */
struct extent_buffer *btrfs_root_node(struct btrfs_root *root)
{
	struct extent_buffer *eb;

	while (1) {
		rcu_read_lock();
		eb = rcu_dereference(root->node);

		/*
		 * RCU really hurts here, we could free up the root node because
		 * it was COWed but we may not get the new root node yet so do
		 * the inc_not_zero dance and if it doesn't work then
		 * synchronize_rcu and try again.
		 */
		if (atomic_inc_not_zero(&eb->refs)) {
			rcu_read_unlock();
			break;
		}
		rcu_read_unlock();
		synchronize_rcu();
	}
	return eb;
}
149
150 /* loop around taking references on and locking the root node of the
151  * tree until you end up with a lock on the root.  A locked buffer
152  * is returned, with a reference held.
153  */
154 struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root)
155 {
156         struct extent_buffer *eb;
157
158         while (1) {
159                 eb = btrfs_root_node(root);
160                 btrfs_tree_lock(eb);
161                 if (eb == root->node)
162                         break;
163                 btrfs_tree_unlock(eb);
164                 free_extent_buffer(eb);
165         }
166         return eb;
167 }
168
169 /* loop around taking references on and locking the root node of the
170  * tree until you end up with a lock on the root.  A locked buffer
171  * is returned, with a reference held.
172  */
173 struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root)
174 {
175         struct extent_buffer *eb;
176
177         while (1) {
178                 eb = btrfs_root_node(root);
179                 btrfs_tree_read_lock(eb);
180                 if (eb == root->node)
181                         break;
182                 btrfs_tree_read_unlock(eb);
183                 free_extent_buffer(eb);
184         }
185         return eb;
186 }
187
188 /* cowonly root (everything not a reference counted cow subvolume), just get
189  * put onto a simple dirty list.  transaction.c walks this to make sure they
190  * get properly updated on disk.
191  */
192 static void add_root_to_dirty_list(struct btrfs_root *root)
193 {
194         struct btrfs_fs_info *fs_info = root->fs_info;
195
196         if (test_bit(BTRFS_ROOT_DIRTY, &root->state) ||
197             !test_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state))
198                 return;
199
200         spin_lock(&fs_info->trans_lock);
201         if (!test_and_set_bit(BTRFS_ROOT_DIRTY, &root->state)) {
202                 /* Want the extent tree to be the last on the list */
203                 if (root->root_key.objectid == BTRFS_EXTENT_TREE_OBJECTID)
204                         list_move_tail(&root->dirty_list,
205                                        &fs_info->dirty_cowonly_roots);
206                 else
207                         list_move(&root->dirty_list,
208                                   &fs_info->dirty_cowonly_roots);
209         }
210         spin_unlock(&fs_info->trans_lock);
211 }
212
/*
 * used by snapshot creation to make a copy of a root for a tree with
 * a given objectid.  The buffer with the new root node is returned in
 * cow_ret, and this func returns zero on success or a negative error code.
 */
int btrfs_copy_root(struct btrfs_trans_handle *trans,
		      struct btrfs_root *root,
		      struct extent_buffer *buf,
		      struct extent_buffer **cow_ret, u64 new_root_objectid)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *cow;
	int ret = 0;
	int level;
	struct btrfs_disk_key disk_key;

	/*
	 * Reference counted roots must only be copied within the running
	 * transaction they were last modified in.
	 */
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != fs_info->running_transaction->transid);
	WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
		trans->transid != root->last_trans);

	/* First key of the root, passed as a hint to the block allocator. */
	level = btrfs_header_level(buf);
	if (level == 0)
		btrfs_item_key(buf, &disk_key, 0);
	else
		btrfs_node_key(buf, &disk_key, 0);

	cow = btrfs_alloc_tree_block(trans, root, 0, new_root_objectid,
			&disk_key, level, buf->start, 0);
	if (IS_ERR(cow))
		return PTR_ERR(cow);

	/* Copy the whole node, then fix up the header for the new block. */
	copy_extent_buffer_full(cow, buf);
	btrfs_set_header_bytenr(cow, cow->start);
	btrfs_set_header_generation(cow, trans->transid);
	btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
	btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
				     BTRFS_HEADER_FLAG_RELOC);
	/* Reloc roots keep the original owner but are flagged as relocated. */
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
	else
		btrfs_set_header_owner(cow, new_root_objectid);

	write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

	WARN_ON(btrfs_header_generation(buf) > trans->transid);
	/* Bump reference counts on everything the copied root points to. */
	if (new_root_objectid == BTRFS_TREE_RELOC_OBJECTID)
		ret = btrfs_inc_ref(trans, root, cow, 1);
	else
		ret = btrfs_inc_ref(trans, root, cow, 0);
	if (ret) {
		btrfs_tree_unlock(cow);
		free_extent_buffer(cow);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}

	btrfs_mark_buffer_dirty(cow);
	*cow_ret = cow;
	return 0;
}
274
/*
 * Types of operation that can be recorded in the tree mod log.  The
 * *_WHILE_FREEING and *_WHILE_MOVING variants mark key removals that
 * happen as part of freeing a whole node or moving pointers within one.
 */
enum mod_log_op {
	MOD_LOG_KEY_REPLACE,
	MOD_LOG_KEY_ADD,
	MOD_LOG_KEY_REMOVE,
	MOD_LOG_KEY_REMOVE_WHILE_FREEING,
	MOD_LOG_KEY_REMOVE_WHILE_MOVING,
	MOD_LOG_MOVE_KEYS,
	MOD_LOG_ROOT_REPLACE,
};
284
/* Location and level of the previous root, for MOD_LOG_ROOT_REPLACE. */
struct tree_mod_root {
	u64 logical;
	u8 level;
};
289
/*
 * One recorded tree block modification, kept in fs_info->tree_mod_log
 * (an rb tree keyed by logical address, then sequence number).
 */
struct tree_mod_elem {
	struct rb_node node;
	/* block the operation applies to (the *new* root for ROOT_REPLACE) */
	u64 logical;
	/* sequence number, assigned when the element is inserted */
	u64 seq;
	enum mod_log_op op;

	/* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */
	int slot;

	/* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */
	u64 generation;

	/* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* this is used for op == MOD_LOG_MOVE_KEYS */
	struct {
		int dst_slot;
		int nr_items;
	} move;

	/* this is used for op == MOD_LOG_ROOT_REPLACE */
	struct tree_mod_root old_root;
};
315
/*
 * Pull a new tree mod seq number for our operation.  The counter is an
 * atomic64, so no additional locking is needed for the increment itself.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}
323
324 /*
325  * This adds a new blocker to the tree mod log's blocker list if the @elem
326  * passed does not already have a sequence number set. So when a caller expects
327  * to record tree modifications, it should ensure to set elem->seq to zero
328  * before calling btrfs_get_tree_mod_seq.
329  * Returns a fresh, unused tree log modification sequence number, even if no new
330  * blocker was added.
331  */
332 u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
333                            struct seq_list *elem)
334 {
335         write_lock(&fs_info->tree_mod_log_lock);
336         if (!elem->seq) {
337                 elem->seq = btrfs_inc_tree_mod_seq(fs_info);
338                 list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
339         }
340         write_unlock(&fs_info->tree_mod_log_lock);
341
342         return elem->seq;
343 }
344
/*
 * Drop the blocker @elem from the tree mod log.  If no blocker with a
 * lower sequence number remains, every log element older than the new
 * minimum sequence is pruned, as nobody can rewind past it anymore.
 */
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct seq_list *cur_elem;
	struct tree_mod_elem *tm;
	u64 min_seq = (u64)-1;
	u64 seq_putting = elem->seq;

	/* Element was never registered via btrfs_get_tree_mod_seq(). */
	if (!seq_putting)
		return;

	write_lock(&fs_info->tree_mod_log_lock);
	list_del(&elem->list);
	elem->seq = 0;

	/* Find the lowest sequence number still blocking the log. */
	list_for_each_entry(cur_elem, &fs_info->tree_mod_seq_list, list) {
		if (cur_elem->seq < min_seq) {
			if (seq_putting > cur_elem->seq) {
				/*
				 * blocker with lower sequence number exists, we
				 * cannot remove anything from the log
				 */
				write_unlock(&fs_info->tree_mod_log_lock);
				return;
			}
			min_seq = cur_elem->seq;
		}
	}

	/*
	 * anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		next = rb_next(node);
		tm = rb_entry(node, struct tree_mod_elem, node);
		if (tm->seq >= min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	write_unlock(&fs_info->tree_mod_log_lock);
}
392
/*
 * key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node
 * for root replace operations, or the logical address of the affected
 * block for all other operations.
 *
 * Returns 0 on success, -EEXIST if an element with the same key is
 * already present.  Caller must hold tree_mod_log_lock for writing.
 */
static noinline int
__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	lockdep_assert_held_write(&fs_info->tree_mod_log_lock);

	/* Assign the sequence number here, under the lock. */
	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
	while (*new) {
		cur = rb_entry(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->logical < tm->logical)
			new = &((*new)->rb_left);
		else if (cur->logical > tm->logical)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}
434
435 /*
436  * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it
437  * returns zero with the tree_mod_log_lock acquired. The caller must hold
438  * this until all tree mod log insertions are recorded in the rb tree and then
439  * write unlock fs_info::tree_mod_log_lock.
440  */
441 static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info,
442                                     struct extent_buffer *eb) {
443         smp_mb();
444         if (list_empty(&(fs_info)->tree_mod_seq_list))
445                 return 1;
446         if (eb && btrfs_header_level(eb) == 0)
447                 return 1;
448
449         write_lock(&fs_info->tree_mod_log_lock);
450         if (list_empty(&(fs_info)->tree_mod_seq_list)) {
451                 write_unlock(&fs_info->tree_mod_log_lock);
452                 return 1;
453         }
454
455         return 0;
456 }
457
458 /* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
459 static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info,
460                                     struct extent_buffer *eb)
461 {
462         smp_mb();
463         if (list_empty(&(fs_info)->tree_mod_seq_list))
464                 return 0;
465         if (eb && btrfs_header_level(eb) == 0)
466                 return 0;
467
468         return 1;
469 }
470
471 static struct tree_mod_elem *
472 alloc_tree_mod_elem(struct extent_buffer *eb, int slot,
473                     enum mod_log_op op, gfp_t flags)
474 {
475         struct tree_mod_elem *tm;
476
477         tm = kzalloc(sizeof(*tm), flags);
478         if (!tm)
479                 return NULL;
480
481         tm->logical = eb->start;
482         if (op != MOD_LOG_KEY_ADD) {
483                 btrfs_node_key(eb, &tm->key, slot);
484                 tm->blockptr = btrfs_node_blockptr(eb, slot);
485         }
486         tm->op = op;
487         tm->slot = slot;
488         tm->generation = btrfs_node_ptr_generation(eb, slot);
489         RB_CLEAR_NODE(&tm->node);
490
491         return tm;
492 }
493
/*
 * Record a single key operation (@op on @slot of @eb) in the tree mod
 * log.  Returns 0 on success (also when no logging is needed), -ENOMEM
 * or the insertion error otherwise.
 */
static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
		enum mod_log_op op, gfp_t flags)
{
	struct tree_mod_elem *tm;
	int ret;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	/* Allocate before taking the log lock, with the caller's @flags. */
	tm = alloc_tree_mod_elem(eb, slot, op, flags);
	if (!tm)
		return -ENOMEM;

	/* Recheck: all log users may have gone away in the meantime. */
	if (tree_mod_dont_log(eb->fs_info, eb)) {
		kfree(tm);
		return 0;
	}

	/* tree_mod_dont_log() returned 0 with the lock write-held. */
	ret = __tree_mod_log_insert(eb->fs_info, tm);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		kfree(tm);

	return ret;
}
519
/*
 * Record in the tree mod log the move of @nr_items node pointers from
 * @src_slot to @dst_slot within @eb.  Slots overwritten by the move
 * (only possible when moving towards the front, dst_slot < src_slot)
 * are logged as removals first.  Returns 0 on success or a negative
 * errno.
 */
static noinline int tree_mod_log_insert_move(struct extent_buffer *eb,
		int dst_slot, int src_slot, int nr_items)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	tm->logical = eb->start;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = MOD_LOG_MOVE_KEYS;

	/* Pre-allocate removal records for the slots the move overwrites. */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
		    MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	/* On success this leaves tree_mod_log_lock write-held. */
	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;
	locked = 1;

	/*
	 * When we override something during the move, we log these removals.
	 * This can only happen when we move towards the beginning of the
	 * buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = __tree_mod_log_insert(eb->fs_info, tm);
	if (ret)
		goto free_tms;
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;
free_tms:
	/* Unwind: erase anything already inserted, then free everything. */
	for (i = 0; i < nr_items; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);
	kfree(tm);

	return ret;
}
592
/*
 * Insert the pre-allocated removal records in @tm_list into the tree mod
 * log, from the highest slot downwards.  On failure, every insertion
 * that already succeeded is erased again and the error is returned.
 * Caller must hold tree_mod_log_lock for writing.
 */
static inline int
__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
		       struct tree_mod_elem **tm_list,
		       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = __tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			/* Roll back the insertions that already succeeded. */
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}
613
/*
 * Record the replacement of tree root @old_root by @new_root in the tree
 * mod log.  When @log_removal is set and the old root is a node (not a
 * leaf), the removal of all its pointers is logged as well so its
 * contents can still be rewound.  Returns 0 on success or a negative
 * errno.
 */
static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root,
			 struct extent_buffer *new_root, int log_removal)
{
	struct btrfs_fs_info *fs_info = old_root->fs_info;
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  GFP_NOFS);
		if (!tm_list) {
			ret = -ENOMEM;
			goto free_tms;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto free_tms;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto free_tms;
	}

	/* Root replace elements are keyed by the *new* root's address. */
	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = MOD_LOG_ROOT_REPLACE;

	/* On success this leaves tree_mod_log_lock write-held. */
	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;

	if (tm_list)
		ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = __tree_mod_log_insert(fs_info, tm);

	write_unlock(&fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}
682
/*
 * Search the tree mod log for operations on the block at @start with a
 * sequence number of at least @min_seq.  With @smallest set the oldest
 * matching element is returned, otherwise the most recent one.  Returns
 * NULL when nothing matches.
 */
static struct tree_mod_elem *
__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq,
		      int smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	read_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	/* Descend using the same comparisons as __tree_mod_log_insert(). */
	while (node) {
		cur = rb_entry(node, struct tree_mod_elem, node);
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* we want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* we want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			/* exact match on min_seq, cannot do better */
			found = cur;
			break;
		}
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return found;
}
724
/*
 * this returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). any element with a time
 * sequence lower than min_seq will be ignored.
 *
 * @start is the logical address of the block the operations applied to.
 */
static struct tree_mod_elem *
tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start,
			   u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 1);
}
736
/*
 * this returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). any element with
 * a time sequence lower than min_seq will be ignored.
 *
 * @start is the logical address of the block the operations applied to.
 */
static struct tree_mod_elem *
tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, 0);
}
747
/*
 * Record in the tree mod log the copy of @nr_items node pointers from
 * @src (starting at @src_offset) into @dst (starting at @dst_offset):
 * a removal for every source slot and an addition for every destination
 * slot.  Returns 0 on success or a negative errno.
 */
static noinline int tree_mod_log_eb_copy(struct extent_buffer *dst,
		     struct extent_buffer *src, unsigned long dst_offset,
		     unsigned long src_offset, int nr_items)
{
	struct btrfs_fs_info *fs_info = dst->fs_info;
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add, **tm_list_rem;
	int i;
	int locked = 0;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	/* Leaf contents are never logged. */
	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	/* One allocation: first half additions, second half removals. */
	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
		    MOD_LOG_KEY_REMOVE, GFP_NOFS);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
		    MOD_LOG_KEY_ADD, GFP_NOFS);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	/* On success this leaves tree_mod_log_lock write-held. */
	if (tree_mod_dont_log(fs_info, NULL))
		goto free_tms;
	locked = 1;

	for (i = 0; i < nr_items; i++) {
		ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = __tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}

	write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	/* Unwind: erase anything already inserted, then free everything. */
	for (i = 0; i < nr_items * 2; i++) {
		if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
			rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
		kfree(tm_list[i]);
	}
	if (locked)
		write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return ret;
}
818
/*
 * Record the freeing of node @eb in the tree mod log by logging the
 * removal of every pointer it holds.  Leaves are never logged.  Returns
 * 0 on success or a negative errno.
 */
static noinline int tree_mod_log_free_eb(struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (btrfs_header_level(eb) == 0)
		return 0;

	if (!tree_mod_need_log(eb->fs_info, NULL))
		return 0;

	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list)
		return -ENOMEM;

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
		    MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto free_tms;
		}
	}

	/* On success this leaves tree_mod_log_lock write-held. */
	if (tree_mod_dont_log(eb->fs_info, eb))
		goto free_tms;

	ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	/* __tree_mod_log_free_eb() already erased inserted nodes on error. */
	for (i = 0; i < nritems; i++)
		kfree(tm_list[i]);
	kfree(tm_list);

	return ret;
}
864
865 /*
866  * check if the tree block can be shared by multiple trees
867  */
868 int btrfs_block_can_be_shared(struct btrfs_root *root,
869                               struct extent_buffer *buf)
870 {
871         /*
872          * Tree blocks not in reference counted trees and tree roots
873          * are never shared. If a block was allocated after the last
874          * snapshot and the block was not allocated by tree relocation,
875          * we know the block is not shared.
876          */
877         if (test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
878             buf != root->node && buf != root->commit_root &&
879             (btrfs_header_generation(buf) <=
880              btrfs_root_last_snapshot(&root->root_item) ||
881              btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)))
882                 return 1;
883
884         return 0;
885 }
886
/*
 * Adjust extent back references when @buf is being replaced by its COW
 * copy @cow.
 *
 * Depending on whether the block is shared (extent refcount > 1) and on
 * which backref scheme applies, references held through @buf's children
 * are added or dropped and the FULL_BACKREF flag may be set on the
 * extent item.
 *
 * @last_ref is set to 1 when @buf held the last reference to its extent,
 * i.e. the old block can be freed after the COW completes.
 *
 * Returns 0 on success or a negative errno on failure.
 */
static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
                                       struct btrfs_root *root,
                                       struct extent_buffer *buf,
                                       struct extent_buffer *cow,
                                       int *last_ref)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        u64 refs;
        u64 owner;
        u64 flags;
        u64 new_flags = 0;
        int ret;

        /*
         * Backrefs update rules:
         *
         * Always use full backrefs for extent pointers in tree block
         * allocated by tree relocation.
         *
         * If a shared tree block is no longer referenced by its owner
         * tree (btrfs_header_owner(buf) == root->root_key.objectid),
         * use full backrefs for extent pointers in tree block.
         *
         * If a tree block is been relocating
         * (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID),
         * use full backrefs for extent pointers in tree block.
         * The reason for this is some operations (such as drop tree)
         * are only allowed for blocks use full backrefs.
         */

        if (btrfs_block_can_be_shared(root, buf)) {
                /* possibly shared: look up the real refcount and flags */
                ret = btrfs_lookup_extent_info(trans, fs_info, buf->start,
                                               btrfs_header_level(buf), 1,
                                               &refs, &flags);
                if (ret)
                        return ret;
                if (refs == 0) {
                        /*
                         * A block being COWed must be referenced; treat a
                         * zero refcount as a filesystem error.
                         */
                        ret = -EROFS;
                        btrfs_handle_fs_error(fs_info, ret, NULL);
                        return ret;
                }
        } else {
                /* not shared: exactly one reference, derive the flags */
                refs = 1;
                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
                    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
                        flags = BTRFS_BLOCK_FLAG_FULL_BACKREF;
                else
                        flags = 0;
        }

        owner = btrfs_header_owner(buf);
        /* blocks owned by the reloc tree must always use full backrefs */
        BUG_ON(owner == BTRFS_TREE_RELOC_OBJECTID &&
               !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF));

        if (refs > 1) {
                /* block is shared with another tree */
                if ((owner == root->root_key.objectid ||
                     root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) &&
                    !(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) {
                        /* convert from owner-based refs to full backrefs */
                        ret = btrfs_inc_ref(trans, root, buf, 1);
                        if (ret)
                                return ret;

                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID) {
                                ret = btrfs_dec_ref(trans, root, buf, 0);
                                if (ret)
                                        return ret;
                                ret = btrfs_inc_ref(trans, root, cow, 1);
                                if (ret)
                                        return ret;
                        }
                        new_flags |= BTRFS_BLOCK_FLAG_FULL_BACKREF;
                } else {

                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID)
                                ret = btrfs_inc_ref(trans, root, cow, 1);
                        else
                                ret = btrfs_inc_ref(trans, root, cow, 0);
                        if (ret)
                                return ret;
                }
                if (new_flags != 0) {
                        /* persist the FULL_BACKREF flag on the extent item */
                        int level = btrfs_header_level(buf);

                        ret = btrfs_set_disk_extent_flags(trans,
                                                          buf->start,
                                                          buf->len,
                                                          new_flags, level, 0);
                        if (ret)
                                return ret;
                }
        } else {
                /* last reference: move the refs over to the new copy */
                if (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF) {
                        if (root->root_key.objectid ==
                            BTRFS_TREE_RELOC_OBJECTID)
                                ret = btrfs_inc_ref(trans, root, cow, 1);
                        else
                                ret = btrfs_inc_ref(trans, root, cow, 0);
                        if (ret)
                                return ret;
                        ret = btrfs_dec_ref(trans, root, buf, 1);
                        if (ret)
                                return ret;
                }
                btrfs_clean_tree_block(buf);
                *last_ref = 1;
        }
        return 0;
}
997
998 static struct extent_buffer *alloc_tree_block_no_bg_flush(
999                                           struct btrfs_trans_handle *trans,
1000                                           struct btrfs_root *root,
1001                                           u64 parent_start,
1002                                           const struct btrfs_disk_key *disk_key,
1003                                           int level,
1004                                           u64 hint,
1005                                           u64 empty_size)
1006 {
1007         struct btrfs_fs_info *fs_info = root->fs_info;
1008         struct extent_buffer *ret;
1009
1010         /*
1011          * If we are COWing a node/leaf from the extent, chunk, device or free
1012          * space trees, make sure that we do not finish block group creation of
1013          * pending block groups. We do this to avoid a deadlock.
1014          * COWing can result in allocation of a new chunk, and flushing pending
1015          * block groups (btrfs_create_pending_block_groups()) can be triggered
1016          * when finishing allocation of a new chunk. Creation of a pending block
1017          * group modifies the extent, chunk, device and free space trees,
1018          * therefore we could deadlock with ourselves since we are holding a
1019          * lock on an extent buffer that btrfs_create_pending_block_groups() may
1020          * try to COW later.
1021          * For similar reasons, we also need to delay flushing pending block
1022          * groups when splitting a leaf or node, from one of those trees, since
1023          * we are holding a write lock on it and its parent or when inserting a
1024          * new root node for one of those trees.
1025          */
1026         if (root == fs_info->extent_root ||
1027             root == fs_info->chunk_root ||
1028             root == fs_info->dev_root ||
1029             root == fs_info->free_space_root)
1030                 trans->can_flush_pending_bgs = false;
1031
1032         ret = btrfs_alloc_tree_block(trans, root, parent_start,
1033                                      root->root_key.objectid, disk_key, level,
1034                                      hint, empty_size);
1035         trans->can_flush_pending_bgs = true;
1036
1037         return ret;
1038 }
1039
1040 /*
1041  * does the dirty work in cow of a single block.  The parent block (if
1042  * supplied) is updated to point to the new cow copy.  The new buffer is marked
1043  * dirty and returned locked.  If you modify the block it needs to be marked
1044  * dirty again.
1045  *
1046  * search_start -- an allocation hint for the new block
1047  *
1048  * empty_size -- a hint that you plan on doing more cow.  This is the size in
1049  * bytes the allocator should try to find free next to the block it returns.
1050  * This is just a hint and may be ignored by the allocator.
1051  */
static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
                             struct btrfs_root *root,
                             struct extent_buffer *buf,
                             struct extent_buffer *parent, int parent_slot,
                             struct extent_buffer **cow_ret,
                             u64 search_start, u64 empty_size)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct btrfs_disk_key disk_key;
        struct extent_buffer *cow;
        int level, ret;
        int last_ref = 0;
        int unlock_orig = 0;
        u64 parent_start = 0;

        /*
         * If the caller passed the buffer itself as *cow_ret, it expects the
         * lock on the original buffer to be dropped before we return.
         */
        if (*cow_ret == buf)
                unlock_orig = 1;

        btrfs_assert_tree_locked(buf);

        /* reference counted trees may only be COWed in their own transaction */
        WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
                trans->transid != fs_info->running_transaction->transid);
        WARN_ON(test_bit(BTRFS_ROOT_REF_COWS, &root->state) &&
                trans->transid != root->last_trans);

        level = btrfs_header_level(buf);

        /* use the block's first key as a hint key for the new allocation */
        if (level == 0)
                btrfs_item_key(buf, &disk_key, 0);
        else
                btrfs_node_key(buf, &disk_key, 0);

        if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent)
                parent_start = parent->start;

        cow = alloc_tree_block_no_bg_flush(trans, root, parent_start, &disk_key,
                                           level, search_start, empty_size);
        if (IS_ERR(cow))
                return PTR_ERR(cow);

        /* cow is set to blocking by btrfs_init_new_buffer */

        /* initialize the copy's header for its new location and owner */
        copy_extent_buffer_full(cow, buf);
        btrfs_set_header_bytenr(cow, cow->start);
        btrfs_set_header_generation(cow, trans->transid);
        btrfs_set_header_backref_rev(cow, BTRFS_MIXED_BACKREF_REV);
        btrfs_clear_header_flag(cow, BTRFS_HEADER_FLAG_WRITTEN |
                                     BTRFS_HEADER_FLAG_RELOC);
        if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID)
                btrfs_set_header_flag(cow, BTRFS_HEADER_FLAG_RELOC);
        else
                btrfs_set_header_owner(cow, root->root_key.objectid);

        write_extent_buffer_fsid(cow, fs_info->fs_devices->metadata_uuid);

        /* fix up back references; last_ref tells us if buf can be freed */
        ret = update_ref_for_cow(trans, root, buf, cow, &last_ref);
        if (ret) {
                btrfs_tree_unlock(cow);
                free_extent_buffer(cow);
                btrfs_abort_transaction(trans, ret);
                return ret;
        }

        if (test_bit(BTRFS_ROOT_REF_COWS, &root->state)) {
                ret = btrfs_reloc_cow_block(trans, root, buf, cow);
                if (ret) {
                        btrfs_tree_unlock(cow);
                        free_extent_buffer(cow);
                        btrfs_abort_transaction(trans, ret);
                        return ret;
                }
        }

        if (buf == root->node) {
                /* COWing the root node: swap the root pointer under RCU */
                WARN_ON(parent && parent != buf);
                if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID ||
                    btrfs_header_backref_rev(buf) < BTRFS_MIXED_BACKREF_REV)
                        parent_start = buf->start;

                extent_buffer_get(cow);
                ret = tree_mod_log_insert_root(root->node, cow, 1);
                BUG_ON(ret < 0);
                rcu_assign_pointer(root->node, cow);

                btrfs_free_tree_block(trans, root, buf, parent_start,
                                      last_ref);
                free_extent_buffer(buf);
                add_root_to_dirty_list(root);
        } else {
                /* interior block: point the parent slot at the new copy */
                WARN_ON(trans->transid != btrfs_header_generation(parent));
                tree_mod_log_insert_key(parent, parent_slot,
                                        MOD_LOG_KEY_REPLACE, GFP_NOFS);
                btrfs_set_node_blockptr(parent, parent_slot,
                                        cow->start);
                btrfs_set_node_ptr_generation(parent, parent_slot,
                                              trans->transid);
                btrfs_mark_buffer_dirty(parent);
                if (last_ref) {
                        /* record the removal of all of buf's slots */
                        ret = tree_mod_log_free_eb(buf);
                        if (ret) {
                                btrfs_tree_unlock(cow);
                                free_extent_buffer(cow);
                                btrfs_abort_transaction(trans, ret);
                                return ret;
                        }
                }
                btrfs_free_tree_block(trans, root, buf, parent_start,
                                      last_ref);
        }
        if (unlock_orig)
                btrfs_tree_unlock(buf);
        free_extent_buffer_stale(buf);
        btrfs_mark_buffer_dirty(cow);
        /* return the new buffer, locked and dirty */
        *cow_ret = cow;
        return 0;
}
1168
1169 /*
1170  * returns the logical address of the oldest predecessor of the given root.
1171  * entries older than time_seq are ignored.
1172  */
static struct tree_mod_elem *__tree_mod_log_oldest_root(
                struct extent_buffer *eb_root, u64 time_seq)
{
        struct tree_mod_elem *tm;
        struct tree_mod_elem *found = NULL;
        u64 root_logical = eb_root->start;
        int looped = 0;

        /* nothing to rewind without a time_seq */
        if (!time_seq)
                return NULL;

        /*
         * the very last operation that's logged for a root is the
         * replacement operation (if it is replaced at all). this has
         * the logical address of the *new* root, making it the very
         * first operation that's logged for this root.
         */
        while (1) {
                tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
                                                time_seq);
                /* no log entry at all for the current root: nothing to do */
                if (!looped && !tm)
                        return NULL;
                /*
                 * if there are no tree operation for the oldest root, we simply
                 * return it. this should only happen if that (old) root is at
                 * level 0.
                 */
                if (!tm)
                        break;

                /*
                 * if there's an operation that's not a root replacement, we
                 * found the oldest version of our root. normally, we'll find a
                 * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
                 */
                if (tm->op != MOD_LOG_ROOT_REPLACE)
                        break;

                /* follow the replacement chain one root further back */
                found = tm;
                root_logical = tm->old_root.logical;
                looped = 1;
        }

        /* if there's no old root to return, return what we found instead */
        if (!found)
                found = tm;

        return found;
}
1222
1223 /*
1224  * tm is a pointer to the first operation to rewind within eb. then, all
1225  * previous operations will be rewound (until we reach something older than
1226  * time_seq).
1227  */
static void
__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb,
                      u64 time_seq, struct tree_mod_elem *first_tm)
{
        /* running item count for eb, adjusted as operations are undone */
        u32 n;
        struct rb_node *next;
        struct tree_mod_elem *tm = first_tm;
        unsigned long o_dst;
        unsigned long o_src;
        unsigned long p_size = sizeof(struct btrfs_key_ptr);

        n = btrfs_header_nritems(eb);
        /* walk the mod log rbtree under its lock */
        read_lock(&fs_info->tree_mod_log_lock);
        while (tm && tm->seq >= time_seq) {
                /*
                 * all the operations are recorded with the operator used for
                 * the modification. as we're going backwards, we do the
                 * opposite of each operation here.
                 */
                switch (tm->op) {
                case MOD_LOG_KEY_REMOVE_WHILE_FREEING:
                        BUG_ON(tm->slot < n);
                        /* Fallthrough */
                case MOD_LOG_KEY_REMOVE_WHILE_MOVING:
                case MOD_LOG_KEY_REMOVE:
                        /* undo a removal: restore the saved key pointer */
                        btrfs_set_node_key(eb, &tm->key, tm->slot);
                        btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
                        btrfs_set_node_ptr_generation(eb, tm->slot,
                                                      tm->generation);
                        n++;
                        break;
                case MOD_LOG_KEY_REPLACE:
                        /* undo a replace: restore the previous key pointer */
                        BUG_ON(tm->slot >= n);
                        btrfs_set_node_key(eb, &tm->key, tm->slot);
                        btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
                        btrfs_set_node_ptr_generation(eb, tm->slot,
                                                      tm->generation);
                        break;
                case MOD_LOG_KEY_ADD:
                        /* if a move operation is needed it's in the log */
                        n--;
                        break;
                case MOD_LOG_MOVE_KEYS:
                        /* undo a move: shift the key pointers back */
                        o_dst = btrfs_node_key_ptr_offset(tm->slot);
                        o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot);
                        memmove_extent_buffer(eb, o_dst, o_src,
                                              tm->move.nr_items * p_size);
                        break;
                case MOD_LOG_ROOT_REPLACE:
                        /*
                         * this operation is special. for roots, this must be
                         * handled explicitly before rewinding.
                         * for non-roots, this operation may exist if the node
                         * was a root: root A -> child B; then A gets empty and
                         * B is promoted to the new root. in the mod log, we'll
                         * have a root-replace operation for B, a tree block
                         * that is no root. we simply ignore that operation.
                         */
                        break;
                }
                next = rb_next(&tm->node);
                if (!next)
                        break;
                tm = rb_entry(next, struct tree_mod_elem, node);
                /* stop once the log entries refer to a different block */
                if (tm->logical != first_tm->logical)
                        break;
        }
        read_unlock(&fs_info->tree_mod_log_lock);
        btrfs_set_header_nritems(eb, n);
}
1298
1299 /*
1300  * Called with eb read locked. If the buffer cannot be rewound, the same buffer
1301  * is returned. If rewind operations happen, a fresh buffer is returned. The
1302  * returned buffer is always read-locked. If the returned buffer is not the
1303  * input buffer, the lock on the input buffer is released and the input buffer
1304  * is freed (its refcount is decremented).
1305  */
static struct extent_buffer *
tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path,
                    struct extent_buffer *eb, u64 time_seq)
{
        struct extent_buffer *eb_rewin;
        struct tree_mod_elem *tm;

        /* no time_seq: nothing to rewind */
        if (!time_seq)
                return eb;

        /* leaves are not tracked in the tree mod log */
        if (btrfs_header_level(eb) == 0)
                return eb;

        /* no logged operation newer than time_seq: the buffer is current */
        tm = tree_mod_log_search(fs_info, eb->start, time_seq);
        if (!tm)
                return eb;

        btrfs_set_path_blocking(path);
        btrfs_set_lock_blocking_read(eb);

        if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
                /*
                 * The block was freed: start from an empty dummy buffer and
                 * let the rewind below reconstruct its items from the log.
                 */
                BUG_ON(tm->slot != 0);
                eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
                if (!eb_rewin) {
                        btrfs_tree_read_unlock_blocking(eb);
                        free_extent_buffer(eb);
                        return NULL;
                }
                btrfs_set_header_bytenr(eb_rewin, eb->start);
                btrfs_set_header_backref_rev(eb_rewin,
                                             btrfs_header_backref_rev(eb));
                btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
                btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
        } else {
                /* block still exists: rewind on a private clone */
                eb_rewin = btrfs_clone_extent_buffer(eb);
                if (!eb_rewin) {
                        btrfs_tree_read_unlock_blocking(eb);
                        free_extent_buffer(eb);
                        return NULL;
                }
        }

        /* hand the lock and reference over from the input buffer */
        btrfs_tree_read_unlock_blocking(eb);
        free_extent_buffer(eb);

        btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
                                       eb_rewin, btrfs_header_level(eb_rewin));
        btrfs_tree_read_lock(eb_rewin);
        __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
        WARN_ON(btrfs_header_nritems(eb_rewin) >
                BTRFS_NODEPTRS_PER_BLOCK(fs_info));

        return eb_rewin;
}
1360
1361 /*
1362  * get_old_root() rewinds the state of @root's root node to the given @time_seq
1363  * value. If there are no changes, the current root->root_node is returned. If
1364  * anything changed in between, there's a fresh buffer allocated on which the
1365  * rewind operations are done. In any case, the returned buffer is read locked.
1366  * Returns NULL on error (with no locks held).
1367  */
static inline struct extent_buffer *
get_old_root(struct btrfs_root *root, u64 time_seq)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct tree_mod_elem *tm;
        struct extent_buffer *eb = NULL;
        struct extent_buffer *eb_root;
        u64 eb_root_owner = 0;
        struct extent_buffer *old;
        struct tree_mod_root *old_root = NULL;
        u64 old_generation = 0;
        u64 logical;
        int level;

        eb_root = btrfs_read_lock_root_node(root);
        tm = __tree_mod_log_oldest_root(eb_root, time_seq);
        /* no logged changes: the current root node is the old root */
        if (!tm)
                return eb_root;

        if (tm->op == MOD_LOG_ROOT_REPLACE) {
                /* the root was replaced; rewind from the recorded old root */
                old_root = &tm->old_root;
                old_generation = tm->generation;
                logical = old_root->logical;
                level = old_root->level;
        } else {
                logical = eb_root->start;
                level = btrfs_header_level(eb_root);
        }

        tm = tree_mod_log_search(fs_info, logical, time_seq);
        if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
                /* old root still exists on disk: read and clone it */
                btrfs_tree_read_unlock(eb_root);
                free_extent_buffer(eb_root);
                old = read_tree_block(fs_info, logical, 0, level, NULL);
                if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
                        if (!IS_ERR(old))
                                free_extent_buffer(old);
                        btrfs_warn(fs_info,
                                   "failed to read tree block %llu from get_old_root",
                                   logical);
                } else {
                        struct tree_mod_elem *tm2;

                        btrfs_tree_read_lock(old);
                        eb = btrfs_clone_extent_buffer(old);
                        /*
                         * After the lookup for the most recent tree mod operation
                         * above and before we locked and cloned the extent buffer
                         * 'old', a new tree mod log operation may have been added.
                         * So lookup for a more recent one to make sure the number
                         * of mod log operations we replay is consistent with the
                         * number of items we have in the cloned extent buffer,
                         * otherwise we can hit a BUG_ON when rewinding the extent
                         * buffer.
                         */
                        tm2 = tree_mod_log_search(fs_info, logical, time_seq);
                        btrfs_tree_read_unlock(old);
                        free_extent_buffer(old);
                        ASSERT(tm2);
                        ASSERT(tm2 == tm || tm2->seq > tm->seq);
                        if (!tm2 || tm2->seq < tm->seq) {
                                free_extent_buffer(eb);
                                return NULL;
                        }
                        tm = tm2;
                }
        } else if (old_root) {
                /* old root was freed: rebuild it from scratch in a dummy eb */
                eb_root_owner = btrfs_header_owner(eb_root);
                btrfs_tree_read_unlock(eb_root);
                free_extent_buffer(eb_root);
                eb = alloc_dummy_extent_buffer(fs_info, logical);
        } else {
                /* same root node, only its contents changed: rewind a clone */
                btrfs_set_lock_blocking_read(eb_root);
                eb = btrfs_clone_extent_buffer(eb_root);
                btrfs_tree_read_unlock_blocking(eb_root);
                free_extent_buffer(eb_root);
        }

        if (!eb)
                return NULL;
        if (old_root) {
                /* restore the old root's header fields on the fresh buffer */
                btrfs_set_header_bytenr(eb, eb->start);
                btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
                btrfs_set_header_owner(eb, eb_root_owner);
                btrfs_set_header_level(eb, old_root->level);
                btrfs_set_header_generation(eb, old_generation);
        }
        btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
                                       btrfs_header_level(eb));
        btrfs_tree_read_lock(eb);
        if (tm)
                __tree_mod_log_rewind(fs_info, eb, time_seq, tm);
        else
                WARN_ON(btrfs_header_level(eb) != 0);
        WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

        return eb;
}
1466
1467 int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
1468 {
1469         struct tree_mod_elem *tm;
1470         int level;
1471         struct extent_buffer *eb_root = btrfs_root_node(root);
1472
1473         tm = __tree_mod_log_oldest_root(eb_root, time_seq);
1474         if (tm && tm->op == MOD_LOG_ROOT_REPLACE) {
1475                 level = tm->old_root.level;
1476         } else {
1477                 level = btrfs_header_level(eb_root);
1478         }
1479         free_extent_buffer(eb_root);
1480
1481         return level;
1482 }
1483
1484 static inline int should_cow_block(struct btrfs_trans_handle *trans,
1485                                    struct btrfs_root *root,
1486                                    struct extent_buffer *buf)
1487 {
1488         if (btrfs_is_testing(root->fs_info))
1489                 return 0;
1490
1491         /* Ensure we can see the FORCE_COW bit */
1492         smp_mb__before_atomic();
1493
1494         /*
1495          * We do not need to cow a block if
1496          * 1) this block is not created or changed in this transaction;
1497          * 2) this block does not belong to TREE_RELOC tree;
1498          * 3) the root is not forced COW.
1499          *
1500          * What is forced COW:
1501          *    when we create snapshot during committing the transaction,
1502          *    after we've finished copying src root, we must COW the shared
1503          *    block to ensure the metadata consistency.
1504          */
1505         if (btrfs_header_generation(buf) == trans->transid &&
1506             !btrfs_header_flag(buf, BTRFS_HEADER_FLAG_WRITTEN) &&
1507             !(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
1508               btrfs_header_flag(buf, BTRFS_HEADER_FLAG_RELOC)) &&
1509             !test_bit(BTRFS_ROOT_FORCE_COW, &root->state))
1510                 return 0;
1511         return 1;
1512 }
1513
1514 /*
1515  * cows a single block, see __btrfs_cow_block for the real work.
1516  * This version of it has extra checks so that a block isn't COWed more than
1517  * once per transaction, as long as it hasn't been written yet
1518  */
noinline int btrfs_cow_block(struct btrfs_trans_handle *trans,
                    struct btrfs_root *root, struct extent_buffer *buf,
                    struct extent_buffer *parent, int parent_slot,
                    struct extent_buffer **cow_ret)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        u64 search_start;
        int ret;

        if (test_bit(BTRFS_ROOT_DELETING, &root->state))
                btrfs_err(fs_info,
                        "COW'ing blocks on a fs root that's being dropped");

        /* COW must happen within the currently running transaction */
        if (trans->transaction != fs_info->running_transaction)
                WARN(1, KERN_CRIT "trans %llu running %llu\n",
                       trans->transid,
                       fs_info->running_transaction->transid);

        if (trans->transid != fs_info->generation)
                WARN(1, KERN_CRIT "trans %llu running %llu\n",
                       trans->transid, fs_info->generation);

        if (!should_cow_block(trans, root, buf)) {
                /* no COW needed: hand the original buffer back */
                trans->dirty = true;
                *cow_ret = buf;
                return 0;
        }

        /* allocation hint: round down to the containing 1GiB boundary */
        search_start = buf->start & ~((u64)SZ_1G - 1);

        if (parent)
                btrfs_set_lock_blocking_write(parent);
        btrfs_set_lock_blocking_write(buf);

        /*
         * Before CoWing this block for later modification, check if it's
         * the subtree root and do the delayed subtree trace if needed.
         *
         * Also We don't care about the error, as it's handled internally.
         */
        btrfs_qgroup_trace_subtree_after_cow(trans, root, buf);
        ret = __btrfs_cow_block(trans, root, buf, parent,
                                 parent_slot, cow_ret, search_start, 0);

        trace_btrfs_cow_block(root, buf, *cow_ret);

        return ret;
}
1567
1568 /*
1569  * helper function for defrag to decide if two blocks pointed to by a
1570  * node are actually close by
1571  */
1572 static int close_blocks(u64 blocknr, u64 other, u32 blocksize)
1573 {
1574         if (blocknr < other && other - (blocknr + blocksize) < 32768)
1575                 return 1;
1576         if (blocknr > other && blocknr - (other + blocksize) < 32768)
1577                 return 1;
1578         return 0;
1579 }
1580
1581 /*
1582  * compare two keys in a memcmp fashion
1583  */
1584 static int comp_keys(const struct btrfs_disk_key *disk,
1585                      const struct btrfs_key *k2)
1586 {
1587         struct btrfs_key k1;
1588
1589         btrfs_disk_key_to_cpu(&k1, disk);
1590
1591         return btrfs_comp_cpu_keys(&k1, k2);
1592 }
1593
1594 /*
1595  * same as comp_keys only with two btrfs_key's
1596  */
1597 int btrfs_comp_cpu_keys(const struct btrfs_key *k1, const struct btrfs_key *k2)
1598 {
1599         if (k1->objectid > k2->objectid)
1600                 return 1;
1601         if (k1->objectid < k2->objectid)
1602                 return -1;
1603         if (k1->type > k2->type)
1604                 return 1;
1605         if (k1->type < k2->type)
1606                 return -1;
1607         if (k1->offset > k2->offset)
1608                 return 1;
1609         if (k1->offset < k2->offset)
1610                 return -1;
1611         return 0;
1612 }
1613
1614 /*
1615  * this is used by the defrag code to go through all the
1616  * leaves pointed to by a node and reallocate them so that
1617  * disk order is close to key order
1618  */
1619 int btrfs_realloc_node(struct btrfs_trans_handle *trans,
1620                        struct btrfs_root *root, struct extent_buffer *parent,
1621                        int start_slot, u64 *last_ret,
1622                        struct btrfs_key *progress)
1623 {
1624         struct btrfs_fs_info *fs_info = root->fs_info;
1625         struct extent_buffer *cur;
1626         u64 blocknr;
1627         u64 gen;
1628         u64 search_start = *last_ret;
1629         u64 last_block = 0;
1630         u64 other;
1631         u32 parent_nritems;
1632         int end_slot;
1633         int i;
1634         int err = 0;
1635         int parent_level;
1636         int uptodate;
1637         u32 blocksize;
1638         int progress_passed = 0;
1639         struct btrfs_disk_key disk_key;
1640
1641         parent_level = btrfs_header_level(parent);
1642
1643         WARN_ON(trans->transaction != fs_info->running_transaction);
1644         WARN_ON(trans->transid != fs_info->generation);
1645
1646         parent_nritems = btrfs_header_nritems(parent);
1647         blocksize = fs_info->nodesize;
1648         end_slot = parent_nritems - 1;
1649
1650         if (parent_nritems <= 1)
1651                 return 0;
1652
1653         btrfs_set_lock_blocking_write(parent);
1654
1655         for (i = start_slot; i <= end_slot; i++) {
1656                 struct btrfs_key first_key;
1657                 int close = 1;
1658
1659                 btrfs_node_key(parent, &disk_key, i);
1660                 if (!progress_passed && comp_keys(&disk_key, progress) < 0)
1661                         continue;
1662
1663                 progress_passed = 1;
1664                 blocknr = btrfs_node_blockptr(parent, i);
1665                 gen = btrfs_node_ptr_generation(parent, i);
1666                 btrfs_node_key_to_cpu(parent, &first_key, i);
1667                 if (last_block == 0)
1668                         last_block = blocknr;
1669
1670                 if (i > 0) {
1671                         other = btrfs_node_blockptr(parent, i - 1);
1672                         close = close_blocks(blocknr, other, blocksize);
1673                 }
1674                 if (!close && i < end_slot) {
1675                         other = btrfs_node_blockptr(parent, i + 1);
1676                         close = close_blocks(blocknr, other, blocksize);
1677                 }
1678                 if (close) {
1679                         last_block = blocknr;
1680                         continue;
1681                 }
1682
1683                 cur = find_extent_buffer(fs_info, blocknr);
1684                 if (cur)
1685                         uptodate = btrfs_buffer_uptodate(cur, gen, 0);
1686                 else
1687                         uptodate = 0;
1688                 if (!cur || !uptodate) {
1689                         if (!cur) {
1690                                 cur = read_tree_block(fs_info, blocknr, gen,
1691                                                       parent_level - 1,
1692                                                       &first_key);
1693                                 if (IS_ERR(cur)) {
1694                                         return PTR_ERR(cur);
1695                                 } else if (!extent_buffer_uptodate(cur)) {
1696                                         free_extent_buffer(cur);
1697                                         return -EIO;
1698                                 }
1699                         } else if (!uptodate) {
1700                                 err = btrfs_read_buffer(cur, gen,
1701                                                 parent_level - 1,&first_key);
1702                                 if (err) {
1703                                         free_extent_buffer(cur);
1704                                         return err;
1705                                 }
1706                         }
1707                 }
1708                 if (search_start == 0)
1709                         search_start = last_block;
1710
1711                 btrfs_tree_lock(cur);
1712                 btrfs_set_lock_blocking_write(cur);
1713                 err = __btrfs_cow_block(trans, root, cur, parent, i,
1714                                         &cur, search_start,
1715                                         min(16 * blocksize,
1716                                             (end_slot - i) * blocksize));
1717                 if (err) {
1718                         btrfs_tree_unlock(cur);
1719                         free_extent_buffer(cur);
1720                         break;
1721                 }
1722                 search_start = cur->start;
1723                 last_block = cur->start;
1724                 *last_ret = search_start;
1725                 btrfs_tree_unlock(cur);
1726                 free_extent_buffer(cur);
1727         }
1728         return err;
1729 }
1730
1731 /*
1732  * search for key in the extent_buffer.  The items start at offset p,
1733  * and they are item_size apart.  There are 'max' items in p.
1734  *
1735  * the slot in the array is returned via slot, and it points to
1736  * the place where you would insert key if it is not found in
1737  * the array.
1738  *
1739  * slot may point to max if the key is bigger than all of the keys
1740  */
1741 static noinline int generic_bin_search(struct extent_buffer *eb,
1742                                        unsigned long p, int item_size,
1743                                        const struct btrfs_key *key,
1744                                        int max, int *slot)
1745 {
1746         int low = 0;
1747         int high = max;
1748         int mid;
1749         int ret;
1750         struct btrfs_disk_key *tmp = NULL;
1751         struct btrfs_disk_key unaligned;
1752         unsigned long offset;
1753         char *kaddr = NULL;
1754         unsigned long map_start = 0;
1755         unsigned long map_len = 0;
1756         int err;
1757
1758         if (low > high) {
1759                 btrfs_err(eb->fs_info,
1760                  "%s: low (%d) > high (%d) eb %llu owner %llu level %d",
1761                           __func__, low, high, eb->start,
1762                           btrfs_header_owner(eb), btrfs_header_level(eb));
1763                 return -EINVAL;
1764         }
1765
1766         while (low < high) {
1767                 mid = (low + high) / 2;
1768                 offset = p + mid * item_size;
1769
1770                 if (!kaddr || offset < map_start ||
1771                     (offset + sizeof(struct btrfs_disk_key)) >
1772                     map_start + map_len) {
1773
1774                         err = map_private_extent_buffer(eb, offset,
1775                                                 sizeof(struct btrfs_disk_key),
1776                                                 &kaddr, &map_start, &map_len);
1777
1778                         if (!err) {
1779                                 tmp = (struct btrfs_disk_key *)(kaddr + offset -
1780                                                         map_start);
1781                         } else if (err == 1) {
1782                                 read_extent_buffer(eb, &unaligned,
1783                                                    offset, sizeof(unaligned));
1784                                 tmp = &unaligned;
1785                         } else {
1786                                 return err;
1787                         }
1788
1789                 } else {
1790                         tmp = (struct btrfs_disk_key *)(kaddr + offset -
1791                                                         map_start);
1792                 }
1793                 ret = comp_keys(tmp, key);
1794
1795                 if (ret < 0)
1796                         low = mid + 1;
1797                 else if (ret > 0)
1798                         high = mid;
1799                 else {
1800                         *slot = mid;
1801                         return 0;
1802                 }
1803         }
1804         *slot = low;
1805         return 1;
1806 }
1807
1808 /*
1809  * simple bin_search frontend that does the right thing for
1810  * leaves vs nodes
1811  */
1812 int btrfs_bin_search(struct extent_buffer *eb, const struct btrfs_key *key,
1813                      int level, int *slot)
1814 {
1815         if (level == 0)
1816                 return generic_bin_search(eb,
1817                                           offsetof(struct btrfs_leaf, items),
1818                                           sizeof(struct btrfs_item),
1819                                           key, btrfs_header_nritems(eb),
1820                                           slot);
1821         else
1822                 return generic_bin_search(eb,
1823                                           offsetof(struct btrfs_node, ptrs),
1824                                           sizeof(struct btrfs_key_ptr),
1825                                           key, btrfs_header_nritems(eb),
1826                                           slot);
1827 }
1828
1829 static void root_add_used(struct btrfs_root *root, u32 size)
1830 {
1831         spin_lock(&root->accounting_lock);
1832         btrfs_set_root_used(&root->root_item,
1833                             btrfs_root_used(&root->root_item) + size);
1834         spin_unlock(&root->accounting_lock);
1835 }
1836
1837 static void root_sub_used(struct btrfs_root *root, u32 size)
1838 {
1839         spin_lock(&root->accounting_lock);
1840         btrfs_set_root_used(&root->root_item,
1841                             btrfs_root_used(&root->root_item) - size);
1842         spin_unlock(&root->accounting_lock);
1843 }
1844
1845 /* given a node and slot number, this reads the blocks it points to.  The
1846  * extent buffer is returned with a reference taken (but unlocked).
1847  */
1848 struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
1849                                            int slot)
1850 {
1851         int level = btrfs_header_level(parent);
1852         struct extent_buffer *eb;
1853         struct btrfs_key first_key;
1854
1855         if (slot < 0 || slot >= btrfs_header_nritems(parent))
1856                 return ERR_PTR(-ENOENT);
1857
1858         BUG_ON(level == 0);
1859
1860         btrfs_node_key_to_cpu(parent, &first_key, slot);
1861         eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
1862                              btrfs_node_ptr_generation(parent, slot),
1863                              level - 1, &first_key);
1864         if (!IS_ERR(eb) && !extent_buffer_uptodate(eb)) {
1865                 free_extent_buffer(eb);
1866                 eb = ERR_PTR(-EIO);
1867         }
1868
1869         return eb;
1870 }
1871
1872 /*
1873  * node level balancing, used to make sure nodes are in proper order for
1874  * item deletion.  We balance from the top down, so we have to make sure
1875  * that a deletion won't leave an node completely empty later on.
1876  */
1877 static noinline int balance_level(struct btrfs_trans_handle *trans,
1878                          struct btrfs_root *root,
1879                          struct btrfs_path *path, int level)
1880 {
1881         struct btrfs_fs_info *fs_info = root->fs_info;
1882         struct extent_buffer *right = NULL;
1883         struct extent_buffer *mid;
1884         struct extent_buffer *left = NULL;
1885         struct extent_buffer *parent = NULL;
1886         int ret = 0;
1887         int wret;
1888         int pslot;
1889         int orig_slot = path->slots[level];
1890         u64 orig_ptr;
1891
1892         ASSERT(level > 0);
1893
1894         mid = path->nodes[level];
1895
1896         WARN_ON(path->locks[level] != BTRFS_WRITE_LOCK &&
1897                 path->locks[level] != BTRFS_WRITE_LOCK_BLOCKING);
1898         WARN_ON(btrfs_header_generation(mid) != trans->transid);
1899
1900         orig_ptr = btrfs_node_blockptr(mid, orig_slot);
1901
1902         if (level < BTRFS_MAX_LEVEL - 1) {
1903                 parent = path->nodes[level + 1];
1904                 pslot = path->slots[level + 1];
1905         }
1906
1907         /*
1908          * deal with the case where there is only one pointer in the root
1909          * by promoting the node below to a root
1910          */
1911         if (!parent) {
1912                 struct extent_buffer *child;
1913
1914                 if (btrfs_header_nritems(mid) != 1)
1915                         return 0;
1916
1917                 /* promote the child to a root */
1918                 child = btrfs_read_node_slot(mid, 0);
1919                 if (IS_ERR(child)) {
1920                         ret = PTR_ERR(child);
1921                         btrfs_handle_fs_error(fs_info, ret, NULL);
1922                         goto enospc;
1923                 }
1924
1925                 btrfs_tree_lock(child);
1926                 btrfs_set_lock_blocking_write(child);
1927                 ret = btrfs_cow_block(trans, root, child, mid, 0, &child);
1928                 if (ret) {
1929                         btrfs_tree_unlock(child);
1930                         free_extent_buffer(child);
1931                         goto enospc;
1932                 }
1933
1934                 ret = tree_mod_log_insert_root(root->node, child, 1);
1935                 BUG_ON(ret < 0);
1936                 rcu_assign_pointer(root->node, child);
1937
1938                 add_root_to_dirty_list(root);
1939                 btrfs_tree_unlock(child);
1940
1941                 path->locks[level] = 0;
1942                 path->nodes[level] = NULL;
1943                 btrfs_clean_tree_block(mid);
1944                 btrfs_tree_unlock(mid);
1945                 /* once for the path */
1946                 free_extent_buffer(mid);
1947
1948                 root_sub_used(root, mid->len);
1949                 btrfs_free_tree_block(trans, root, mid, 0, 1);
1950                 /* once for the root ptr */
1951                 free_extent_buffer_stale(mid);
1952                 return 0;
1953         }
1954         if (btrfs_header_nritems(mid) >
1955             BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 4)
1956                 return 0;
1957
1958         left = btrfs_read_node_slot(parent, pslot - 1);
1959         if (IS_ERR(left))
1960                 left = NULL;
1961
1962         if (left) {
1963                 btrfs_tree_lock(left);
1964                 btrfs_set_lock_blocking_write(left);
1965                 wret = btrfs_cow_block(trans, root, left,
1966                                        parent, pslot - 1, &left);
1967                 if (wret) {
1968                         ret = wret;
1969                         goto enospc;
1970                 }
1971         }
1972
1973         right = btrfs_read_node_slot(parent, pslot + 1);
1974         if (IS_ERR(right))
1975                 right = NULL;
1976
1977         if (right) {
1978                 btrfs_tree_lock(right);
1979                 btrfs_set_lock_blocking_write(right);
1980                 wret = btrfs_cow_block(trans, root, right,
1981                                        parent, pslot + 1, &right);
1982                 if (wret) {
1983                         ret = wret;
1984                         goto enospc;
1985                 }
1986         }
1987
1988         /* first, try to make some room in the middle buffer */
1989         if (left) {
1990                 orig_slot += btrfs_header_nritems(left);
1991                 wret = push_node_left(trans, left, mid, 1);
1992                 if (wret < 0)
1993                         ret = wret;
1994         }
1995
1996         /*
1997          * then try to empty the right most buffer into the middle
1998          */
1999         if (right) {
2000                 wret = push_node_left(trans, mid, right, 1);
2001                 if (wret < 0 && wret != -ENOSPC)
2002                         ret = wret;
2003                 if (btrfs_header_nritems(right) == 0) {
2004                         btrfs_clean_tree_block(right);
2005                         btrfs_tree_unlock(right);
2006                         del_ptr(root, path, level + 1, pslot + 1);
2007                         root_sub_used(root, right->len);
2008                         btrfs_free_tree_block(trans, root, right, 0, 1);
2009                         free_extent_buffer_stale(right);
2010                         right = NULL;
2011                 } else {
2012                         struct btrfs_disk_key right_key;
2013                         btrfs_node_key(right, &right_key, 0);
2014                         ret = tree_mod_log_insert_key(parent, pslot + 1,
2015                                         MOD_LOG_KEY_REPLACE, GFP_NOFS);
2016                         BUG_ON(ret < 0);
2017                         btrfs_set_node_key(parent, &right_key, pslot + 1);
2018                         btrfs_mark_buffer_dirty(parent);
2019                 }
2020         }
2021         if (btrfs_header_nritems(mid) == 1) {
2022                 /*
2023                  * we're not allowed to leave a node with one item in the
2024                  * tree during a delete.  A deletion from lower in the tree
2025                  * could try to delete the only pointer in this node.
2026                  * So, pull some keys from the left.
2027                  * There has to be a left pointer at this point because
2028                  * otherwise we would have pulled some pointers from the
2029                  * right
2030                  */
2031                 if (!left) {
2032                         ret = -EROFS;
2033                         btrfs_handle_fs_error(fs_info, ret, NULL);
2034                         goto enospc;
2035                 }
2036                 wret = balance_node_right(trans, mid, left);
2037                 if (wret < 0) {
2038                         ret = wret;
2039                         goto enospc;
2040                 }
2041                 if (wret == 1) {
2042                         wret = push_node_left(trans, left, mid, 1);
2043                         if (wret < 0)
2044                                 ret = wret;
2045                 }
2046                 BUG_ON(wret == 1);
2047         }
2048         if (btrfs_header_nritems(mid) == 0) {
2049                 btrfs_clean_tree_block(mid);
2050                 btrfs_tree_unlock(mid);
2051                 del_ptr(root, path, level + 1, pslot);
2052                 root_sub_used(root, mid->len);
2053                 btrfs_free_tree_block(trans, root, mid, 0, 1);
2054                 free_extent_buffer_stale(mid);
2055                 mid = NULL;
2056         } else {
2057                 /* update the parent key to reflect our changes */
2058                 struct btrfs_disk_key mid_key;
2059                 btrfs_node_key(mid, &mid_key, 0);
2060                 ret = tree_mod_log_insert_key(parent, pslot,
2061                                 MOD_LOG_KEY_REPLACE, GFP_NOFS);
2062                 BUG_ON(ret < 0);
2063                 btrfs_set_node_key(parent, &mid_key, pslot);
2064                 btrfs_mark_buffer_dirty(parent);
2065         }
2066
2067         /* update the path */
2068         if (left) {
2069                 if (btrfs_header_nritems(left) > orig_slot) {
2070                         extent_buffer_get(left);
2071                         /* left was locked after cow */
2072                         path->nodes[level] = left;
2073                         path->slots[level + 1] -= 1;
2074                         path->slots[level] = orig_slot;
2075                         if (mid) {
2076                                 btrfs_tree_unlock(mid);
2077                                 free_extent_buffer(mid);
2078                         }
2079                 } else {
2080                         orig_slot -= btrfs_header_nritems(left);
2081                         path->slots[level] = orig_slot;
2082                 }
2083         }
2084         /* double check we haven't messed things up */
2085         if (orig_ptr !=
2086             btrfs_node_blockptr(path->nodes[level], path->slots[level]))
2087                 BUG();
2088 enospc:
2089         if (right) {
2090                 btrfs_tree_unlock(right);
2091                 free_extent_buffer(right);
2092         }
2093         if (left) {
2094                 if (path->nodes[level] != left)
2095                         btrfs_tree_unlock(left);
2096                 free_extent_buffer(left);
2097         }
2098         return ret;
2099 }
2100
/* Node balancing for insertion.  Here we only split or push nodes around
 * when they are completely full.  This is also done top down, so we
 * have to be pessimistic.
 *
 * Returns 0 if items were pushed into a sibling (the path is updated to
 * follow the original child), or 1 if no room could be made and the
 * caller must split the node instead.
 */
static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
					  struct btrfs_root *root,
					  struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *right = NULL;
	struct extent_buffer *mid;
	struct extent_buffer *left = NULL;
	struct extent_buffer *parent = NULL;
	int ret = 0;
	int wret;
	int pslot;
	int orig_slot = path->slots[level];

	if (level == 0)
		return 1;

	mid = path->nodes[level];
	WARN_ON(btrfs_header_generation(mid) != trans->transid);

	if (level < BTRFS_MAX_LEVEL - 1) {
		parent = path->nodes[level + 1];
		pslot = path->slots[level + 1];
	}

	/* At the root there are no siblings to push into. */
	if (!parent)
		return 1;

	left = btrfs_read_node_slot(parent, pslot - 1);
	if (IS_ERR(left))
		left = NULL;

	/* first, try to make some room in the middle buffer */
	if (left) {
		u32 left_nr;

		btrfs_tree_lock(left);
		btrfs_set_lock_blocking_write(left);

		left_nr = btrfs_header_nritems(left);
		if (left_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			/* Left sibling is itself (almost) full. */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, left, parent,
					      pslot - 1, &left);
			if (ret)
				wret = 1;
			else {
				wret = push_node_left(trans, left, mid, 0);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/*
			 * Items moved into left; mid's first key changed, so
			 * fix the parent's pointer to mid.
			 */
			struct btrfs_disk_key disk_key;
			orig_slot += left_nr;
			btrfs_node_key(mid, &disk_key, 0);
			ret = tree_mod_log_insert_key(parent, pslot,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot);
			btrfs_mark_buffer_dirty(parent);
			if (btrfs_header_nritems(left) > orig_slot) {
				/* Our target slot migrated into left. */
				path->nodes[level] = left;
				path->slots[level + 1] -= 1;
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				/* Target stayed in mid; re-base the slot. */
				orig_slot -=
					btrfs_header_nritems(left);
				path->slots[level] = orig_slot;
				btrfs_tree_unlock(left);
				free_extent_buffer(left);
			}
			return 0;
		}
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
	}
	right = btrfs_read_node_slot(parent, pslot + 1);
	if (IS_ERR(right))
		right = NULL;

	/*
	 * then try to empty the right most buffer into the middle
	 */
	if (right) {
		u32 right_nr;

		btrfs_tree_lock(right);
		btrfs_set_lock_blocking_write(right);

		right_nr = btrfs_header_nritems(right);
		if (right_nr >= BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 1) {
			/* Right sibling is itself (almost) full. */
			wret = 1;
		} else {
			ret = btrfs_cow_block(trans, root, right,
					      parent, pslot + 1,
					      &right);
			if (ret)
				wret = 1;
			else {
				wret = balance_node_right(trans, right, mid);
			}
		}
		if (wret < 0)
			ret = wret;
		if (wret == 0) {
			/*
			 * Items moved into right; its first key changed, so
			 * fix the parent's pointer to right.
			 */
			struct btrfs_disk_key disk_key;

			btrfs_node_key(right, &disk_key, 0);
			ret = tree_mod_log_insert_key(parent, pslot + 1,
					MOD_LOG_KEY_REPLACE, GFP_NOFS);
			BUG_ON(ret < 0);
			btrfs_set_node_key(parent, &disk_key, pslot + 1);
			btrfs_mark_buffer_dirty(parent);

			if (btrfs_header_nritems(mid) <= orig_slot) {
				/* Our target slot migrated into right. */
				path->nodes[level] = right;
				path->slots[level + 1] += 1;
				path->slots[level] = orig_slot -
					btrfs_header_nritems(mid);
				btrfs_tree_unlock(mid);
				free_extent_buffer(mid);
			} else {
				btrfs_tree_unlock(right);
				free_extent_buffer(right);
			}
			return 0;
		}
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	/* No room anywhere: caller has to split. */
	return 1;
}
2241
2242 /*
2243  * readahead one full node of leaves, finding things that are close
2244  * to the block in 'slot', and triggering ra on them.
2245  */
2246 static void reada_for_search(struct btrfs_fs_info *fs_info,
2247                              struct btrfs_path *path,
2248                              int level, int slot, u64 objectid)
2249 {
2250         struct extent_buffer *node;
2251         struct btrfs_disk_key disk_key;
2252         u32 nritems;
2253         u64 search;
2254         u64 target;
2255         u64 nread = 0;
2256         struct extent_buffer *eb;
2257         u32 nr;
2258         u32 blocksize;
2259         u32 nscan = 0;
2260
2261         if (level != 1)
2262                 return;
2263
2264         if (!path->nodes[level])
2265                 return;
2266
2267         node = path->nodes[level];
2268
2269         search = btrfs_node_blockptr(node, slot);
2270         blocksize = fs_info->nodesize;
2271         eb = find_extent_buffer(fs_info, search);
2272         if (eb) {
2273                 free_extent_buffer(eb);
2274                 return;
2275         }
2276
2277         target = search;
2278
2279         nritems = btrfs_header_nritems(node);
2280         nr = slot;
2281
2282         while (1) {
2283                 if (path->reada == READA_BACK) {
2284                         if (nr == 0)
2285                                 break;
2286                         nr--;
2287                 } else if (path->reada == READA_FORWARD) {
2288                         nr++;
2289                         if (nr >= nritems)
2290                                 break;
2291                 }
2292                 if (path->reada == READA_BACK && objectid) {
2293                         btrfs_node_key(node, &disk_key, nr);
2294                         if (btrfs_disk_key_objectid(&disk_key) != objectid)
2295                                 break;
2296                 }
2297                 search = btrfs_node_blockptr(node, nr);
2298                 if ((search <= target && target - search <= 65536) ||
2299                     (search > target && search - target <= 65536)) {
2300                         readahead_tree_block(fs_info, search);
2301                         nread += blocksize;
2302                 }
2303                 nscan++;
2304                 if ((nread > 65536 || nscan > 32))
2305                         break;
2306         }
2307 }
2308
2309 static noinline void reada_for_balance(struct btrfs_fs_info *fs_info,
2310                                        struct btrfs_path *path, int level)
2311 {
2312         int slot;
2313         int nritems;
2314         struct extent_buffer *parent;
2315         struct extent_buffer *eb;
2316         u64 gen;
2317         u64 block1 = 0;
2318         u64 block2 = 0;
2319
2320         parent = path->nodes[level + 1];
2321         if (!parent)
2322                 return;
2323
2324         nritems = btrfs_header_nritems(parent);
2325         slot = path->slots[level + 1];
2326
2327         if (slot > 0) {
2328                 block1 = btrfs_node_blockptr(parent, slot - 1);
2329                 gen = btrfs_node_ptr_generation(parent, slot - 1);
2330                 eb = find_extent_buffer(fs_info, block1);
2331                 /*
2332                  * if we get -eagain from btrfs_buffer_uptodate, we
2333                  * don't want to return eagain here.  That will loop
2334                  * forever
2335                  */
2336                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2337                         block1 = 0;
2338                 free_extent_buffer(eb);
2339         }
2340         if (slot + 1 < nritems) {
2341                 block2 = btrfs_node_blockptr(parent, slot + 1);
2342                 gen = btrfs_node_ptr_generation(parent, slot + 1);
2343                 eb = find_extent_buffer(fs_info, block2);
2344                 if (eb && btrfs_buffer_uptodate(eb, gen, 1) != 0)
2345                         block2 = 0;
2346                 free_extent_buffer(eb);
2347         }
2348
2349         if (block1)
2350                 readahead_tree_block(fs_info, block1);
2351         if (block2)
2352                 readahead_tree_block(fs_info, block2);
2353 }
2354
2355
2356 /*
2357  * when we walk down the tree, it is usually safe to unlock the higher layers
2358  * in the tree.  The exceptions are when our path goes through slot 0, because
2359  * operations on the tree might require changing key pointers higher up in the
2360  * tree.
2361  *
2362  * callers might also have set path->keep_locks, which tells this code to keep
2363  * the lock if the path points to the last slot in the block.  This is part of
2364  * walking through the tree, and selecting the next slot in the higher block.
2365  *
2366  * lowest_unlock sets the lowest level in the tree we're allowed to unlock.  so
2367  * if lowest_unlock is 1, level 0 won't be unlocked
2368  */
2369 static noinline void unlock_up(struct btrfs_path *path, int level,
2370                                int lowest_unlock, int min_write_lock_level,
2371                                int *write_lock_level)
2372 {
2373         int i;
2374         int skip_level = level;
2375         int no_skips = 0;
2376         struct extent_buffer *t;
2377
2378         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2379                 if (!path->nodes[i])
2380                         break;
2381                 if (!path->locks[i])
2382                         break;
2383                 if (!no_skips && path->slots[i] == 0) {
2384                         skip_level = i + 1;
2385                         continue;
2386                 }
2387                 if (!no_skips && path->keep_locks) {
2388                         u32 nritems;
2389                         t = path->nodes[i];
2390                         nritems = btrfs_header_nritems(t);
2391                         if (nritems < 1 || path->slots[i] >= nritems - 1) {
2392                                 skip_level = i + 1;
2393                                 continue;
2394                         }
2395                 }
2396                 if (skip_level < i && i >= lowest_unlock)
2397                         no_skips = 1;
2398
2399                 t = path->nodes[i];
2400                 if (i >= lowest_unlock && i > skip_level) {
2401                         btrfs_tree_unlock_rw(t, path->locks[i]);
2402                         path->locks[i] = 0;
2403                         if (write_lock_level &&
2404                             i > min_write_lock_level &&
2405                             i <= *write_lock_level) {
2406                                 *write_lock_level = i - 1;
2407                         }
2408                 }
2409         }
2410 }
2411
2412 /*
2413  * This releases any locks held in the path starting at level and
2414  * going all the way up to the root.
2415  *
2416  * btrfs_search_slot will keep the lock held on higher nodes in a few
2417  * corner cases, such as COW of the block at slot zero in the node.  This
2418  * ignores those rules, and it should only be called when there are no
2419  * more updates to be done higher up in the tree.
2420  */
2421 noinline void btrfs_unlock_up_safe(struct btrfs_path *path, int level)
2422 {
2423         int i;
2424
2425         if (path->keep_locks)
2426                 return;
2427
2428         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
2429                 if (!path->nodes[i])
2430                         continue;
2431                 if (!path->locks[i])
2432                         continue;
2433                 btrfs_tree_unlock_rw(path->nodes[i], path->locks[i]);
2434                 path->locks[i] = 0;
2435         }
2436 }
2437
2438 /*
2439  * helper function for btrfs_search_slot.  The goal is to find a block
2440  * in cache without setting the path to blocking.  If we find the block
2441  * we return zero and the path is unchanged.
2442  *
2443  * If we can't find the block, we set the path blocking and do some
2444  * reada.  -EAGAIN is returned and the search must be repeated.
2445  */
2446 static int
2447 read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
2448                       struct extent_buffer **eb_ret, int level, int slot,
2449                       const struct btrfs_key *key)
2450 {
2451         struct btrfs_fs_info *fs_info = root->fs_info;
2452         u64 blocknr;
2453         u64 gen;
2454         struct extent_buffer *b = *eb_ret;
2455         struct extent_buffer *tmp;
2456         struct btrfs_key first_key;
2457         int ret;
2458         int parent_level;
2459
2460         blocknr = btrfs_node_blockptr(b, slot);
2461         gen = btrfs_node_ptr_generation(b, slot);
2462         parent_level = btrfs_header_level(b);
2463         btrfs_node_key_to_cpu(b, &first_key, slot);
2464
2465         tmp = find_extent_buffer(fs_info, blocknr);
2466         if (tmp) {
2467                 /* first we do an atomic uptodate check */
2468                 if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) {
2469                         /*
2470                          * Do extra check for first_key, eb can be stale due to
2471                          * being cached, read from scrub, or have multiple
2472                          * parents (shared tree blocks).
2473                          */
2474                         if (btrfs_verify_level_key(tmp,
2475                                         parent_level - 1, &first_key, gen)) {
2476                                 free_extent_buffer(tmp);
2477                                 return -EUCLEAN;
2478                         }
2479                         *eb_ret = tmp;
2480                         return 0;
2481                 }
2482
2483                 /* the pages were up to date, but we failed
2484                  * the generation number check.  Do a full
2485                  * read for the generation number that is correct.
2486                  * We must do this without dropping locks so
2487                  * we can trust our generation number
2488                  */
2489                 btrfs_set_path_blocking(p);
2490
2491                 /* now we're allowed to do a blocking uptodate check */
2492                 ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
2493                 if (!ret) {
2494                         *eb_ret = tmp;
2495                         return 0;
2496                 }
2497                 free_extent_buffer(tmp);
2498                 btrfs_release_path(p);
2499                 return -EIO;
2500         }
2501
2502         /*
2503          * reduce lock contention at high levels
2504          * of the btree by dropping locks before
2505          * we read.  Don't release the lock on the current
2506          * level because we need to walk this node to figure
2507          * out which blocks to read.
2508          */
2509         btrfs_unlock_up_safe(p, level + 1);
2510         btrfs_set_path_blocking(p);
2511
2512         if (p->reada != READA_NONE)
2513                 reada_for_search(fs_info, p, level, slot, key->objectid);
2514
2515         ret = -EAGAIN;
2516         tmp = read_tree_block(fs_info, blocknr, gen, parent_level - 1,
2517                               &first_key);
2518         if (!IS_ERR(tmp)) {
2519                 /*
2520                  * If the read above didn't mark this buffer up to date,
2521                  * it will never end up being up to date.  Set ret to EIO now
2522                  * and give up so that our caller doesn't loop forever
2523                  * on our EAGAINs.
2524                  */
2525                 if (!extent_buffer_uptodate(tmp))
2526                         ret = -EIO;
2527                 free_extent_buffer(tmp);
2528         } else {
2529                 ret = PTR_ERR(tmp);
2530         }
2531
2532         btrfs_release_path(p);
2533         return ret;
2534 }
2535
2536 /*
2537  * helper function for btrfs_search_slot.  This does all of the checks
2538  * for node-level blocks and does any balancing required based on
2539  * the ins_len.
2540  *
2541  * If no extra work was required, zero is returned.  If we had to
2542  * drop the path, -EAGAIN is returned and btrfs_search_slot must
2543  * start over
2544  */
2545 static int
2546 setup_nodes_for_search(struct btrfs_trans_handle *trans,
2547                        struct btrfs_root *root, struct btrfs_path *p,
2548                        struct extent_buffer *b, int level, int ins_len,
2549                        int *write_lock_level)
2550 {
2551         struct btrfs_fs_info *fs_info = root->fs_info;
2552         int ret;
2553
2554         if ((p->search_for_split || ins_len > 0) && btrfs_header_nritems(b) >=
2555             BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3) {
2556                 int sret;
2557
2558                 if (*write_lock_level < level + 1) {
2559                         *write_lock_level = level + 1;
2560                         btrfs_release_path(p);
2561                         goto again;
2562                 }
2563
2564                 btrfs_set_path_blocking(p);
2565                 reada_for_balance(fs_info, p, level);
2566                 sret = split_node(trans, root, p, level);
2567
2568                 BUG_ON(sret > 0);
2569                 if (sret) {
2570                         ret = sret;
2571                         goto done;
2572                 }
2573                 b = p->nodes[level];
2574         } else if (ins_len < 0 && btrfs_header_nritems(b) <
2575                    BTRFS_NODEPTRS_PER_BLOCK(fs_info) / 2) {
2576                 int sret;
2577
2578                 if (*write_lock_level < level + 1) {
2579                         *write_lock_level = level + 1;
2580                         btrfs_release_path(p);
2581                         goto again;
2582                 }
2583
2584                 btrfs_set_path_blocking(p);
2585                 reada_for_balance(fs_info, p, level);
2586                 sret = balance_level(trans, root, p, level);
2587
2588                 if (sret) {
2589                         ret = sret;
2590                         goto done;
2591                 }
2592                 b = p->nodes[level];
2593                 if (!b) {
2594                         btrfs_release_path(p);
2595                         goto again;
2596                 }
2597                 BUG_ON(btrfs_header_nritems(b) == 1);
2598         }
2599         return 0;
2600
2601 again:
2602         ret = -EAGAIN;
2603 done:
2604         return ret;
2605 }
2606
/*
 * Binary-search @key in node @b at @level, caching the comparison result
 * in *@prev_cmp.  If the previous level already matched exactly
 * (*prev_cmp == 0), the key must be at slot 0 of this node and the
 * search is skipped entirely.
 */
static int key_search(struct extent_buffer *b, const struct btrfs_key *key,
		      int level, int *prev_cmp, int *slot)
{
	if (*prev_cmp == 0) {
		*slot = 0;
		return 0;
	}

	*prev_cmp = btrfs_bin_search(b, key, level, slot);
	return *prev_cmp;
}
2619
2620 int btrfs_find_item(struct btrfs_root *fs_root, struct btrfs_path *path,
2621                 u64 iobjectid, u64 ioff, u8 key_type,
2622                 struct btrfs_key *found_key)
2623 {
2624         int ret;
2625         struct btrfs_key key;
2626         struct extent_buffer *eb;
2627
2628         ASSERT(path);
2629         ASSERT(found_key);
2630
2631         key.type = key_type;
2632         key.objectid = iobjectid;
2633         key.offset = ioff;
2634
2635         ret = btrfs_search_slot(NULL, fs_root, &key, path, 0, 0);
2636         if (ret < 0)
2637                 return ret;
2638
2639         eb = path->nodes[0];
2640         if (ret && path->slots[0] >= btrfs_header_nritems(eb)) {
2641                 ret = btrfs_next_leaf(fs_root, path);
2642                 if (ret)
2643                         return ret;
2644                 eb = path->nodes[0];
2645         }
2646
2647         btrfs_item_key_to_cpu(eb, found_key, path->slots[0]);
2648         if (found_key->type != key.type ||
2649                         found_key->objectid != key.objectid)
2650                 return 1;
2651
2652         return 0;
2653 }
2654
/*
 * Get (and lock, unless skip_locking is set) the root node of @root to
 * start a search described by path @p.
 *
 * Returns the root extent buffer with p->nodes[] / p->locks[] set up at
 * its level, or ERR_PTR(-ENOMEM / -EIO) on failure.  Callers are
 * responsible for dropping the returned buffer's reference.
 */
static struct extent_buffer *btrfs_search_slot_get_root(struct btrfs_root *root,
							struct btrfs_path *p,
							int write_lock_level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int root_lock = 0;
	int level = 0;

	if (p->search_commit_root) {
		/*
		 * The commit roots are read only so we always do read locks,
		 * and we always must hold the commit_root_sem when doing
		 * searches on them, the only exception is send where we don't
		 * want to block transaction commits for a long time, so
		 * we need to clone the commit root in order to avoid races
		 * with transaction commits that create a snapshot of one of
		 * the roots used by a send operation.
		 */
		if (p->need_commit_sem) {
			down_read(&fs_info->commit_root_sem);
			b = btrfs_clone_extent_buffer(root->commit_root);
			up_read(&fs_info->commit_root_sem);
			if (!b)
				return ERR_PTR(-ENOMEM);

		} else {
			b = root->commit_root;
			extent_buffer_get(b);
		}
		level = btrfs_header_level(b);
		/*
		 * Ensure that all callers have set skip_locking when
		 * p->search_commit_root = 1.
		 */
		ASSERT(p->skip_locking == 1);

		goto out;
	}

	if (p->skip_locking) {
		/* Caller handles locking itself; just grab a reference. */
		b = btrfs_root_node(root);
		level = btrfs_header_level(b);
		goto out;
	}

	/* We try very hard to do read locks on the root */
	root_lock = BTRFS_READ_LOCK;

	/*
	 * If the level is set to maximum, we can skip trying to get the read
	 * lock.
	 */
	if (write_lock_level < BTRFS_MAX_LEVEL) {
		/*
		 * We don't know the level of the root node until we actually
		 * have it read locked
		 */
		b = btrfs_read_lock_root_node(root);
		level = btrfs_header_level(b);
		if (level > write_lock_level)
			goto out;

		/* Whoops, must trade for write lock */
		btrfs_tree_read_unlock(b);
		free_extent_buffer(b);
	}

	b = btrfs_lock_root_node(root);
	root_lock = BTRFS_WRITE_LOCK;

	/* The level might have changed, check again */
	level = btrfs_header_level(b);

out:
	/*
	 * The root may have failed to write out at some point, and thus is no
	 * longer valid, return an error in this case.
	 */
	if (!extent_buffer_uptodate(b)) {
		if (root_lock)
			btrfs_tree_unlock_rw(b, root_lock);
		free_extent_buffer(b);
		return ERR_PTR(-EIO);
	}

	p->nodes[level] = b;
	if (!p->skip_locking)
		p->locks[level] = root_lock;
	/*
	 * Callers are responsible for dropping b's references.
	 */
	return b;
}
2749
2750
2751 /*
2752  * btrfs_search_slot - look for a key in a tree and perform necessary
2753  * modifications to preserve tree invariants.
2754  *
2755  * @trans:      Handle of transaction, used when modifying the tree
2756  * @p:          Holds all btree nodes along the search path
2757  * @root:       The root node of the tree
2758  * @key:        The key we are looking for
2759  * @ins_len:    Indicates purpose of search, for inserts it is 1, for
2760  *              deletions it's -1. 0 for plain searches
2761  * @cow:        boolean should CoW operations be performed. Must always be 1
2762  *              when modifying the tree.
2763  *
2764  * If @ins_len > 0, nodes and leaves will be split as we walk down the tree.
2765  * If @ins_len < 0, nodes will be merged as we walk down the tree (if possible)
2766  *
2767  * If @key is found, 0 is returned and you can find the item in the leaf level
2768  * of the path (level 0)
2769  *
2770  * If @key isn't found, 1 is returned and the leaf level of the path (level 0)
2771  * points to the slot where it should be inserted
2772  *
2773  * If an error is encountered while searching the tree a negative error number
2774  * is returned
2775  */
int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
		      const struct btrfs_key *key, struct btrfs_path *p,
		      int ins_len, int cow)
{
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	/* everything at write_lock_level or lower must be write locked */
	int write_lock_level = 0;
	u8 lowest_level = 0;
	int min_write_lock_level;
	int prev_cmp;

	lowest_level = p->lowest_level;
	WARN_ON(lowest_level && ins_len > 0);
	WARN_ON(p->nodes[0] != NULL);
	/* Any tree modification (ins_len != 0) requires COW. */
	BUG_ON(!cow && ins_len);

	if (ins_len < 0) {
		lowest_unlock = 2;

		/* when we are removing items, we might have to go up to level
		 * two as we update tree pointers  Make sure we keep write
		 * for those levels as well
		 */
		write_lock_level = 2;
	} else if (ins_len > 0) {
		/*
		 * for inserting items, make sure we have a write lock on
		 * level 1 so we can update keys
		 */
		write_lock_level = 1;
	}

	/* Read-only search: never take write locks. */
	if (!cow)
		write_lock_level = -1;

	/* keep_locks/lowest_level searches write lock the whole path. */
	if (cow && (p->keep_locks || p->lowest_level))
		write_lock_level = BTRFS_MAX_LEVEL;

	min_write_lock_level = write_lock_level;

again:
	prev_cmp = -1;
	b = btrfs_search_slot_get_root(root, p, write_lock_level);
	if (IS_ERR(b)) {
		ret = PTR_ERR(b);
		goto done;
	}

	/* Walk down the tree one level per iteration. */
	while (b) {
		level = btrfs_header_level(b);

		/*
		 * setup the path here so we can release it under lock
		 * contention with the cow code
		 */
		if (cow) {
			bool last_level = (level == (BTRFS_MAX_LEVEL - 1));

			/*
			 * if we don't really need to cow this block
			 * then we don't want to set the path blocking,
			 * so we test it here
			 */
			if (!should_cow_block(trans, root, b)) {
				trans->dirty = true;
				goto cow_done;
			}

			/*
			 * must have write locks on this node and the
			 * parent
			 */
			if (level > write_lock_level ||
			    (level + 1 > write_lock_level &&
			    level + 1 < BTRFS_MAX_LEVEL &&
			    p->nodes[level + 1])) {
				/* Not write locked high enough; restart. */
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			btrfs_set_path_blocking(p);
			if (last_level)
				err = btrfs_cow_block(trans, root, b, NULL, 0,
						      &b);
			else
				err = btrfs_cow_block(trans, root, b,
						      p->nodes[level + 1],
						      p->slots[level + 1], &b);
			if (err) {
				ret = err;
				goto done;
			}
		}
cow_done:
		p->nodes[level] = b;
		/*
		 * Leave path with blocking locks to avoid massive
		 * lock context switch, this is made on purpose.
		 */

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way to for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 *
		 * If we're inserting or deleting (ins_len != 0), then we might
		 * be changing slot zero, which may require changing the parent.
		 * So, we can't drop the lock until after we know which slot
		 * we're operating on.
		 */
		if (!ins_len && !p->keep_locks) {
			int u = level + 1;

			if (u < BTRFS_MAX_LEVEL && p->locks[u]) {
				btrfs_tree_unlock_rw(p->nodes[u], p->locks[u]);
				p->locks[u] = 0;
			}
		}

		ret = key_search(b, key, level, &prev_cmp, &slot);
		if (ret < 0)
			goto done;

		if (level != 0) {
			int dec = 0;
			/*
			 * No exact match: step back to the slot whose key
			 * range covers @key.  dec remembers this so the
			 * slot can be bumped back if we stop at this level.
			 */
			if (ret && slot > 0) {
				dec = 1;
				slot -= 1;
			}
			p->slots[level] = slot;
			err = setup_nodes_for_search(trans, root, p, b, level,
					     ins_len, &write_lock_level);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}
			/* Node/slot may have moved during split/balance. */
			b = p->nodes[level];
			slot = p->slots[level];

			/*
			 * slot 0 is special, if we change the key
			 * we have to update the parent pointer
			 * which means we must have a write lock
			 * on the parent
			 */
			if (slot == 0 && ins_len &&
			    write_lock_level < level + 1) {
				write_lock_level = level + 1;
				btrfs_release_path(p);
				goto again;
			}

			unlock_up(p, level, lowest_unlock,
				  min_write_lock_level, &write_lock_level);

			if (level == lowest_level) {
				if (dec)
					p->slots[level]++;
				goto done;
			}

			err = read_block_for_search(root, p, &b, level,
						    slot, key);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}

			if (!p->skip_locking) {
				/* Lock the child we are descending into. */
				level = btrfs_header_level(b);
				if (level <= write_lock_level) {
					/* Try-lock first to stay spinning. */
					if (!btrfs_try_tree_write_lock(b)) {
						btrfs_set_path_blocking(p);
						btrfs_tree_lock(b);
					}
					p->locks[level] = BTRFS_WRITE_LOCK;
				} else {
					if (!btrfs_tree_read_lock_atomic(b)) {
						btrfs_set_path_blocking(p);
						btrfs_tree_read_lock(b);
					}
					p->locks[level] = BTRFS_READ_LOCK;
				}
				p->nodes[level] = b;
			}
		} else {
			/* Leaf level: we are done searching. */
			p->slots[level] = slot;
			if (ins_len > 0 &&
			    btrfs_leaf_free_space(b) < ins_len) {
				/* Not enough room for the insert: split. */
				if (write_lock_level < 1) {
					write_lock_level = 1;
					btrfs_release_path(p);
					goto again;
				}

				btrfs_set_path_blocking(p);
				err = split_leaf(trans, root, key,
						 p, ins_len, ret == 0);

				BUG_ON(err > 0);
				if (err) {
					ret = err;
					goto done;
				}
			}
			if (!p->search_for_split)
				unlock_up(p, level, lowest_unlock,
					  min_write_lock_level, NULL);
			goto done;
		}
	}
	ret = 1;
done:
	/*
	 * we don't really know what they plan on doing with the path
	 * from here on, so for now just mark it as blocking
	 */
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
	if (ret < 0 && !p->skip_release_on_error)
		btrfs_release_path(p);
	return ret;
}
3010
3011 /*
3012  * Like btrfs_search_slot, this looks for a key in the given tree. It uses the
3013  * current state of the tree together with the operations recorded in the tree
3014  * modification log to search for the key in a previous version of this tree, as
3015  * denoted by the time_seq parameter.
3016  *
3017  * Naturally, there is no support for insert, delete or cow operations.
3018  *
3019  * The resulting path and return value will be set up as if we called
3020  * btrfs_search_slot at that point in time with ins_len and cow both set to 0.
3021  */
int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key,
			  struct btrfs_path *p, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *b;
	int slot;
	int ret;
	int err;
	int level;
	int lowest_unlock = 1;
	u8 lowest_level = 0;
	int prev_cmp = -1;

	lowest_level = p->lowest_level;
	WARN_ON(p->nodes[0] != NULL);

	/* Commit-root searches don't need the mod log; use a plain search. */
	if (p->search_commit_root) {
		BUG_ON(time_seq);
		return btrfs_search_slot(NULL, root, key, p, 0, 0);
	}

again:
	b = get_old_root(root, time_seq);
	if (!b) {
		ret = -EIO;
		goto done;
	}
	level = btrfs_header_level(b);
	p->locks[level] = BTRFS_READ_LOCK;

	/* Walk down the rewound tree one level per iteration. */
	while (b) {
		level = btrfs_header_level(b);
		p->nodes[level] = b;

		/*
		 * we have a lock on b and as long as we aren't changing
		 * the tree, there is no way to for the items in b to change.
		 * It is safe to drop the lock on our parent before we
		 * go through the expensive btree search on b.
		 */
		btrfs_unlock_up_safe(p, level + 1);

		/*
		 * Since we can unwind ebs we want to do a real search every
		 * time.
		 */
		prev_cmp = -1;
		ret = key_search(b, key, level, &prev_cmp, &slot);
		if (ret < 0)
			goto done;

		if (level != 0) {
			int dec = 0;
			/*
			 * No exact match: step back to the slot whose key
			 * range covers @key; dec lets us undo this if we
			 * stop at lowest_level.
			 */
			if (ret && slot > 0) {
				dec = 1;
				slot -= 1;
			}
			p->slots[level] = slot;
			unlock_up(p, level, lowest_unlock, 0, NULL);

			if (level == lowest_level) {
				if (dec)
					p->slots[level]++;
				goto done;
			}

			err = read_block_for_search(root, p, &b, level,
						    slot, key);
			if (err == -EAGAIN)
				goto again;
			if (err) {
				ret = err;
				goto done;
			}

			level = btrfs_header_level(b);
			if (!btrfs_tree_read_lock_atomic(b)) {
				btrfs_set_path_blocking(p);
				btrfs_tree_read_lock(b);
			}
			/* Rewind the child to its state at time_seq. */
			b = tree_mod_log_rewind(fs_info, p, b, time_seq);
			if (!b) {
				/*
				 * NOTE(review): the read lock taken just
				 * above is presumably released inside
				 * tree_mod_log_rewind() on failure — confirm,
				 * otherwise it would be leaked here.
				 */
				ret = -ENOMEM;
				goto done;
			}
			p->locks[level] = BTRFS_READ_LOCK;
			p->nodes[level] = b;
		} else {
			/* Leaf level: search is complete. */
			p->slots[level] = slot;
			unlock_up(p, level, lowest_unlock, 0, NULL);
			goto done;
		}
	}
	ret = 1;
done:
	if (!p->leave_spinning)
		btrfs_set_path_blocking(p);
	if (ret < 0)
		btrfs_release_path(p);

	return ret;
}
3124
3125 /*
3126  * helper to use instead of search slot if no exact match is needed but
3127  * instead the next or previous item should be returned.
3128  * When find_higher is true, the next higher item is returned, the next lower
3129  * otherwise.
3130  * When return_any and find_higher are both true, and no higher item is found,
3131  * return the next lower instead.
3132  * When return_any is true and find_higher is false, and no lower item is found,
3133  * return the next higher instead.
3134  * It returns 0 if any item is found, 1 if none is found (tree empty), and
3135  * < 0 on error
3136  */
3137 int btrfs_search_slot_for_read(struct btrfs_root *root,
3138                                const struct btrfs_key *key,
3139                                struct btrfs_path *p, int find_higher,
3140                                int return_any)
3141 {
3142         int ret;
3143         struct extent_buffer *leaf;
3144
3145 again:
3146         ret = btrfs_search_slot(NULL, root, key, p, 0, 0);
3147         if (ret <= 0)
3148                 return ret;
3149         /*
3150          * a return value of 1 means the path is at the position where the
3151          * item should be inserted. Normally this is the next bigger item,
3152          * but in case the previous item is the last in a leaf, path points
3153          * to the first free slot in the previous leaf, i.e. at an invalid
3154          * item.
3155          */
3156         leaf = p->nodes[0];
3157
3158         if (find_higher) {
3159                 if (p->slots[0] >= btrfs_header_nritems(leaf)) {
3160                         ret = btrfs_next_leaf(root, p);
3161                         if (ret <= 0)
3162                                 return ret;
3163                         if (!return_any)
3164                                 return 1;
3165                         /*
3166                          * no higher item found, return the next
3167                          * lower instead
3168                          */
3169                         return_any = 0;
3170                         find_higher = 0;
3171                         btrfs_release_path(p);
3172                         goto again;
3173                 }
3174         } else {
3175                 if (p->slots[0] == 0) {
3176                         ret = btrfs_prev_leaf(root, p);
3177                         if (ret < 0)
3178                                 return ret;
3179                         if (!ret) {
3180                                 leaf = p->nodes[0];
3181                                 if (p->slots[0] == btrfs_header_nritems(leaf))
3182                                         p->slots[0]--;
3183                                 return 0;
3184                         }
3185                         if (!return_any)
3186                                 return 1;
3187                         /*
3188                          * no lower item found, return the next
3189                          * higher instead
3190                          */
3191                         return_any = 0;
3192                         find_higher = 1;
3193                         btrfs_release_path(p);
3194                         goto again;
3195                 } else {
3196                         --p->slots[0];
3197                 }
3198         }
3199         return 0;
3200 }
3201
3202 /*
3203  * adjust the pointers going up the tree, starting at level
3204  * making sure the right key of each node is points to 'key'.
3205  * This is used after shifting pointers to the left, so it stops
3206  * fixing up pointers when a given leaf/node is not in slot 0 of the
3207  * higher levels
3208  *
3209  */
3210 static void fixup_low_keys(struct btrfs_path *path,
3211                            struct btrfs_disk_key *key, int level)
3212 {
3213         int i;
3214         struct extent_buffer *t;
3215         int ret;
3216
3217         for (i = level; i < BTRFS_MAX_LEVEL; i++) {
3218                 int tslot = path->slots[i];
3219
3220                 if (!path->nodes[i])
3221                         break;
3222                 t = path->nodes[i];
3223                 ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE,
3224                                 GFP_ATOMIC);
3225                 BUG_ON(ret < 0);
3226                 btrfs_set_node_key(t, key, tslot);
3227                 btrfs_mark_buffer_dirty(path->nodes[i]);
3228                 if (tslot != 0)
3229                         break;
3230         }
3231 }
3232
/*
 * update item key.
 *
 * This function isn't completely safe. It's the caller's responsibility
 * that the new key won't break the order
 */
void btrfs_set_item_key_safe(struct btrfs_fs_info *fs_info,
			     struct btrfs_path *path,
			     const struct btrfs_key *new_key)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *eb;
	int slot;

	eb = path->nodes[0];
	slot = path->slots[0];
	/* The new key must still sort strictly after the previous item. */
	if (slot > 0) {
		btrfs_item_key(eb, &disk_key, slot - 1);
		if (unlikely(comp_keys(&disk_key, new_key) >= 0)) {
			btrfs_crit(fs_info,
		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
				   slot, btrfs_disk_key_objectid(&disk_key),
				   btrfs_disk_key_type(&disk_key),
				   btrfs_disk_key_offset(&disk_key),
				   new_key->objectid, new_key->type,
				   new_key->offset);
			btrfs_print_leaf(eb);
			BUG();
		}
	}
	/* ... and strictly before the next item, if one exists. */
	if (slot < btrfs_header_nritems(eb) - 1) {
		btrfs_item_key(eb, &disk_key, slot + 1);
		if (unlikely(comp_keys(&disk_key, new_key) <= 0)) {
			btrfs_crit(fs_info,
		"slot %u key (%llu %u %llu) new key (%llu %u %llu)",
				   slot, btrfs_disk_key_objectid(&disk_key),
				   btrfs_disk_key_type(&disk_key),
				   btrfs_disk_key_offset(&disk_key),
				   new_key->objectid, new_key->type,
				   new_key->offset);
			btrfs_print_leaf(eb);
			BUG();
		}
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(eb, &disk_key, slot);
	btrfs_mark_buffer_dirty(eb);
	/* Changing slot 0 changes the key the parent node points at. */
	if (slot == 0)
		fixup_low_keys(path, &disk_key, 1);
}
3284
/*
 * try to push data from one node into the next node left in the
 * tree.
 *
 * returns 0 if some ptrs were pushed left, < 0 if there was some horrible
 * error, and > 0 if there was no room in the left hand block.
 */
static int push_node_left(struct btrfs_trans_handle *trans,
			  struct extent_buffer *dst,
			  struct extent_buffer *src, int empty)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int push_items = 0;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* Number of free pointer slots in the destination (left) node. */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
	/* Both nodes must already be COWed in the current transaction. */
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	/* Unless we're emptying src, don't bother with nearly-empty nodes. */
	if (!empty && src_nritems <= 8)
		return 1;

	if (push_items <= 0)
		return 1;

	if (empty) {
		push_items = min(src_nritems, push_items);
		if (push_items < src_nritems) {
			/* leave at least 8 pointers in the node if
			 * we aren't going to empty it
			 */
			if (src_nritems - push_items < 8) {
				if (push_items <= 8)
					return 1;
				push_items -= 8;
			}
		}
	} else
		/* Keep at least 8 pointers in src when not emptying it. */
		push_items = min(src_nritems - 8, push_items);

	/* Record the pointer moves in the tree mod log before copying. */
	ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	/* Append the first push_items key pointers of src to the end of dst. */
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(dst_nritems),
			   btrfs_node_key_ptr_offset(0),
			   push_items * sizeof(struct btrfs_key_ptr));

	if (push_items < src_nritems) {
		/*
		 * Don't call tree_mod_log_insert_move here, key removal was
		 * already fully logged by tree_mod_log_eb_copy above.
		 */
		memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
				      btrfs_node_key_ptr_offset(push_items),
				      (src_nritems - push_items) *
				      sizeof(struct btrfs_key_ptr));
	}
	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);
	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
3356
/*
 * try to push data from one node into the next node right in the
 * tree.
 *
 * returns 0 if some ptrs were pushed, < 0 if there was some horrible
 * error, and > 0 if there was no room in the right hand block.
 *
 * this will  only push up to 1/2 the contents of the left node over
 */
static int balance_node_right(struct btrfs_trans_handle *trans,
			      struct extent_buffer *dst,
			      struct extent_buffer *src)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int push_items = 0;
	int max_push;
	int src_nritems;
	int dst_nritems;
	int ret = 0;

	/* Both nodes must already be COWed in the current transaction. */
	WARN_ON(btrfs_header_generation(src) != trans->transid);
	WARN_ON(btrfs_header_generation(dst) != trans->transid);

	src_nritems = btrfs_header_nritems(src);
	dst_nritems = btrfs_header_nritems(dst);
	/* Number of free pointer slots in the destination (right) node. */
	push_items = BTRFS_NODEPTRS_PER_BLOCK(fs_info) - dst_nritems;
	if (push_items <= 0)
		return 1;

	/* Too few items in src to be worth rebalancing. */
	if (src_nritems < 4)
		return 1;

	max_push = src_nritems / 2 + 1;
	/* don't try to empty the node */
	if (max_push >= src_nritems)
		return 1;

	if (max_push < push_items)
		push_items = max_push;

	/* Log the shift of dst's existing pointers before moving them. */
	ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
	BUG_ON(ret < 0);
	/* Make room at the front of dst for the incoming pointers. */
	memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
				      btrfs_node_key_ptr_offset(0),
				      (dst_nritems) *
				      sizeof(struct btrfs_key_ptr));

	/* Log the copy of src's tail into dst's head. */
	ret = tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items,
				   push_items);
	if (ret) {
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	/* Copy the last push_items key pointers of src to the start of dst. */
	copy_extent_buffer(dst, src,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(src_nritems - push_items),
			   push_items * sizeof(struct btrfs_key_ptr));

	btrfs_set_header_nritems(src, src_nritems - push_items);
	btrfs_set_header_nritems(dst, dst_nritems + push_items);

	btrfs_mark_buffer_dirty(src);
	btrfs_mark_buffer_dirty(dst);

	return ret;
}
3423
/*
 * helper function to insert a new root level in the tree.
 * A new node is allocated, and a single item is inserted to
 * point to the existing root
 *
 * returns zero on success or < 0 on failure.
 */
static noinline int insert_new_root(struct btrfs_trans_handle *trans,
			   struct btrfs_root *root,
			   struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	u64 lower_gen;
	struct extent_buffer *lower;
	struct extent_buffer *c;
	struct extent_buffer *old;
	struct btrfs_disk_key lower_key;
	int ret;

	/* The caller must be at the current root, with no node above it. */
	BUG_ON(path->nodes[level]);
	BUG_ON(path->nodes[level-1] != root->node);

	lower = path->nodes[level-1];
	/* First key of the old root becomes the single key of the new root. */
	if (level == 1)
		btrfs_item_key(lower, &lower_key, 0);
	else
		btrfs_node_key(lower, &lower_key, 0);

	c = alloc_tree_block_no_bg_flush(trans, root, 0, &lower_key, level,
					 root->node->start, 0);
	if (IS_ERR(c))
		return PTR_ERR(c);

	root_add_used(root, fs_info->nodesize);

	/* New root has exactly one pointer: to the old root node. */
	btrfs_set_header_nritems(c, 1);
	btrfs_set_node_key(c, &lower_key, 0);
	btrfs_set_node_blockptr(c, 0, lower->start);
	lower_gen = btrfs_header_generation(lower);
	WARN_ON(lower_gen != trans->transid);

	btrfs_set_node_ptr_generation(c, 0, lower_gen);

	btrfs_mark_buffer_dirty(c);

	old = root->node;
	/* Log the root replacement before publishing the new root. */
	ret = tree_mod_log_insert_root(root->node, c, 0);
	BUG_ON(ret < 0);
	rcu_assign_pointer(root->node, c);

	/* the super has an extra ref to root->node */
	free_extent_buffer(old);

	add_root_to_dirty_list(root);
	/* Extra ref for the path's reference to the new root. */
	extent_buffer_get(c);
	path->nodes[level] = c;
	path->locks[level] = BTRFS_WRITE_LOCK_BLOCKING;
	path->slots[level] = 0;
	return 0;
}
3484
/*
 * worker function to insert a single pointer in a node.
 * the node should have enough room for the pointer already
 *
 * slot and level indicate where you want the key to go, and
 * blocknr is the block the key points to.
 */
static void insert_ptr(struct btrfs_trans_handle *trans,
		       struct btrfs_path *path,
		       struct btrfs_disk_key *key, u64 bytenr,
		       int slot, int level)
{
	struct extent_buffer *lower;
	int nritems;
	int ret;

	BUG_ON(!path->nodes[level]);
	btrfs_assert_tree_locked(path->nodes[level]);
	lower = path->nodes[level];
	nritems = btrfs_header_nritems(lower);
	BUG_ON(slot > nritems);
	/* Caller guarantees room for one more pointer. */
	BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info));
	if (slot != nritems) {
		/* Leaves (level 0) are not tracked in the tree mod log. */
		if (level) {
			ret = tree_mod_log_insert_move(lower, slot + 1, slot,
					nritems - slot);
			BUG_ON(ret < 0);
		}
		/* Shift existing pointers right to open up 'slot'. */
		memmove_extent_buffer(lower,
			      btrfs_node_key_ptr_offset(slot + 1),
			      btrfs_node_key_ptr_offset(slot),
			      (nritems - slot) * sizeof(struct btrfs_key_ptr));
	}
	if (level) {
		ret = tree_mod_log_insert_key(lower, slot, MOD_LOG_KEY_ADD,
				GFP_NOFS);
		BUG_ON(ret < 0);
	}
	btrfs_set_node_key(lower, key, slot);
	btrfs_set_node_blockptr(lower, slot, bytenr);
	WARN_ON(trans->transid == 0);
	btrfs_set_node_ptr_generation(lower, slot, trans->transid);
	btrfs_set_header_nritems(lower, nritems + 1);
	btrfs_mark_buffer_dirty(lower);
}
3530
/*
 * split the node at the specified level in path in two.
 * The path is corrected to point to the appropriate node after the split
 *
 * Before splitting this tries to make some room in the node by pushing
 * left and right, if either one works, it returns right away.
 *
 * returns 0 on success and < 0 on failure
 */
static noinline int split_node(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       struct btrfs_path *path, int level)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct extent_buffer *c;
	struct extent_buffer *split;
	struct btrfs_disk_key disk_key;
	int mid;
	int ret;
	u32 c_nritems;

	c = path->nodes[level];
	WARN_ON(btrfs_header_generation(c) != trans->transid);
	if (c == root->node) {
		/*
		 * trying to split the root, lets make a new one
		 *
		 * tree mod log: We don't log_removal old root in
		 * insert_new_root, because that root buffer will be kept as a
		 * normal node. We are going to log removal of half of the
		 * elements below with tree_mod_log_eb_copy. We're holding a
		 * tree lock on the buffer, which is why we cannot race with
		 * other tree_mod_log users.
		 */
		ret = insert_new_root(trans, root, path, level + 1);
		if (ret)
			return ret;
	} else {
		/* Try to avoid splitting by rebalancing with the neighbors. */
		ret = push_nodes_for_insert(trans, root, path, level);
		c = path->nodes[level];
		if (!ret && btrfs_header_nritems(c) <
		    BTRFS_NODEPTRS_PER_BLOCK(fs_info) - 3)
			return 0;
		if (ret < 0)
			return ret;
	}

	c_nritems = btrfs_header_nritems(c);
	mid = (c_nritems + 1) / 2;
	btrfs_node_key(c, &disk_key, mid);

	split = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, level,
					     c->start, 0);
	if (IS_ERR(split))
		return PTR_ERR(split);

	root_add_used(root, fs_info->nodesize);
	ASSERT(btrfs_header_level(c) == level);

	/* Log the move of the upper half before copying it out. */
	ret = tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid);
	if (ret) {
		btrfs_tree_unlock(split);
		free_extent_buffer(split);
		btrfs_abort_transaction(trans, ret);
		return ret;
	}
	/* Move pointers [mid, c_nritems) from c into the new node. */
	copy_extent_buffer(split, c,
			   btrfs_node_key_ptr_offset(0),
			   btrfs_node_key_ptr_offset(mid),
			   (c_nritems - mid) * sizeof(struct btrfs_key_ptr));
	btrfs_set_header_nritems(split, c_nritems - mid);
	btrfs_set_header_nritems(c, mid);
	ret = 0;

	btrfs_mark_buffer_dirty(c);
	btrfs_mark_buffer_dirty(split);

	/* Link the new node into the parent, right after c. */
	insert_ptr(trans, path, &disk_key, split->start,
		   path->slots[level + 1] + 1, level + 1);

	if (path->slots[level] >= mid) {
		/* Path's slot moved to the new node; drop c, keep split. */
		path->slots[level] -= mid;
		btrfs_tree_unlock(c);
		free_extent_buffer(c);
		path->nodes[level] = split;
		path->slots[level + 1] += 1;
	} else {
		/* Path stays in c; drop the new node. */
		btrfs_tree_unlock(split);
		free_extent_buffer(split);
	}
	return ret;
}
3623
3624 /*
3625  * how many bytes are required to store the items in a leaf.  start
3626  * and nr indicate which items in the leaf to check.  This totals up the
3627  * space used both by the item structs and the item data
3628  */
3629 static int leaf_space_used(struct extent_buffer *l, int start, int nr)
3630 {
3631         struct btrfs_item *start_item;
3632         struct btrfs_item *end_item;
3633         struct btrfs_map_token token;
3634         int data_len;
3635         int nritems = btrfs_header_nritems(l);
3636         int end = min(nritems, start + nr) - 1;
3637
3638         if (!nr)
3639                 return 0;
3640         btrfs_init_map_token(&token, l);
3641         start_item = btrfs_item_nr(start);
3642         end_item = btrfs_item_nr(end);
3643         data_len = btrfs_token_item_offset(l, start_item, &token) +
3644                 btrfs_token_item_size(l, start_item, &token);
3645         data_len = data_len - btrfs_token_item_offset(l, end_item, &token);
3646         data_len += sizeof(struct btrfs_item) * nr;
3647         WARN_ON(data_len < 0);
3648         return data_len;
3649 }
3650
3651 /*
3652  * The space between the end of the leaf items and
3653  * the start of the leaf data.  IOW, how much room
3654  * the leaf has left for both items and data
3655  */
3656 noinline int btrfs_leaf_free_space(struct extent_buffer *leaf)
3657 {
3658         struct btrfs_fs_info *fs_info = leaf->fs_info;
3659         int nritems = btrfs_header_nritems(leaf);
3660         int ret;
3661
3662         ret = BTRFS_LEAF_DATA_SIZE(fs_info) - leaf_space_used(leaf, 0, nritems);
3663         if (ret < 0) {
3664                 btrfs_crit(fs_info,
3665                            "leaf free space ret %d, leaf data size %lu, used %d nritems %d",
3666                            ret,
3667                            (unsigned long) BTRFS_LEAF_DATA_SIZE(fs_info),
3668                            leaf_space_used(leaf, 0, nritems), nritems);
3669         }
3670         return ret;
3671 }
3672
/*
 * min slot controls the lowest index we're willing to push to the
 * right.  We'll push up to and including min_slot, but no lower
 */
static noinline int __push_leaf_right(struct btrfs_path *path,
				      int data_size, int empty,
				      struct extent_buffer *right,
				      int free_space, u32 left_nritems,
				      u32 min_slot)
{
	struct btrfs_fs_info *fs_info = right->fs_info;
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *upper = path->nodes[1];
	struct btrfs_map_token token;
	struct btrfs_disk_key disk_key;
	int slot;
	u32 i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 nr;
	u32 right_nritems;
	u32 data_end;
	u32 this_item_size;

	if (empty)
		nr = 0;
	else
		nr = max_t(u32, 1, min_slot);

	/* If the path sits past the last item, reserve room for the insert. */
	if (path->slots[0] >= left_nritems)
		push_space += data_size;

	slot = path->slots[1];
	i = left_nritems - 1;
	/* Walk items from the end of the left leaf, counting what fits. */
	while (i >= nr) {
		item = btrfs_item_nr(i);

		if (!empty && push_items > 0) {
			/* Don't push the slot the path points at, or below. */
			if (path->slots[0] > i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(left);

				/* Stop if pushing more wouldn't help the insert. */
				if (space + push_space * 2 > free_space)
					break;
			}
		}

		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(left, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
		if (i == 0)
			break;
		i--;
	}

	if (push_items == 0)
		goto out_unlock;

	WARN_ON(!empty && push_items == left_nritems);

	/* push left to right */
	right_nritems = btrfs_header_nritems(right);

	/* Total data bytes being moved from the left leaf. */
	push_space = btrfs_item_end_nr(left, left_nritems - push_items);
	push_space -= leaf_data_end(left);

	/* make room in the right data area */
	data_end = leaf_data_end(right);
	memmove_extent_buffer(right,
			      BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
			      BTRFS_LEAF_DATA_OFFSET + data_end,
			      BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);

	/* copy from the left data area */
	copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
		     BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
		     BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left),
		     push_space);

	/* Shift right's item headers to make room at the front. */
	memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
			      btrfs_item_nr_offset(0),
			      right_nritems * sizeof(struct btrfs_item));

	/* copy the items from left to right */
	copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
		   btrfs_item_nr_offset(left_nritems - push_items),
		   push_items * sizeof(struct btrfs_item));

	/* update the item pointers */
	btrfs_init_map_token(&token, right);
	right_nritems += push_items;
	btrfs_set_header_nritems(right, right_nritems);
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
	/* Recompute every data offset in the right leaf from scratch. */
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(i);
		push_space -= btrfs_token_item_size(right, item, &token);
		btrfs_set_token_item_offset(right, item, push_space, &token);
	}

	left_nritems -= push_items;
	btrfs_set_header_nritems(left, left_nritems);

	if (left_nritems)
		btrfs_mark_buffer_dirty(left);
	else
		btrfs_clean_tree_block(left);

	btrfs_mark_buffer_dirty(right);

	/* Right leaf's first key changed; fix the parent's pointer key. */
	btrfs_item_key(right, &disk_key, 0);
	btrfs_set_node_key(upper, &disk_key, slot + 1);
	btrfs_mark_buffer_dirty(upper);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] >= left_nritems) {
		path->slots[0] -= left_nritems;
		if (btrfs_header_nritems(path->nodes[0]) == 0)
			btrfs_clean_tree_block(path->nodes[0]);
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}
	return 0;

out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}
3814
/*
 * push some data in the path leaf to the right, trying to free up at
 * least data_size bytes.  returns zero if the push worked, nonzero otherwise
 *
 * returns 1 if the push failed because the other node didn't have enough
 * room, 0 if everything worked out and < 0 if there were major errors.
 *
 * this will push starting from min_slot to the end of the leaf.  It won't
 * push any slot lower than min_slot
 */
static int push_leaf_right(struct btrfs_trans_handle *trans, struct btrfs_root
			   *root, struct btrfs_path *path,
			   int min_data_size, int data_size,
			   int empty, u32 min_slot)
{
	struct extent_buffer *left = path->nodes[0];
	struct extent_buffer *right;
	struct extent_buffer *upper;
	int slot;
	int free_space;
	u32 left_nritems;
	int ret;

	/* No parent means this leaf is the root: nowhere to push. */
	if (!path->nodes[1])
		return 1;

	slot = path->slots[1];
	upper = path->nodes[1];
	/* No right sibling under the same parent. */
	if (slot >= btrfs_header_nritems(upper) - 1)
		return 1;

	btrfs_assert_tree_locked(path->nodes[1]);

	right = btrfs_read_node_slot(upper, slot + 1);
	/*
	 * slot + 1 is not valid or we fail to read the right node,
	 * no big deal, just return.
	 */
	if (IS_ERR(right))
		return 1;

	btrfs_tree_lock(right);
	btrfs_set_lock_blocking_write(right);

	free_space = btrfs_leaf_free_space(right);
	if (free_space < data_size)
		goto out_unlock;

	/* cow and double check */
	ret = btrfs_cow_block(trans, root, right, upper,
			      slot + 1, &right);
	if (ret)
		goto out_unlock;

	/* COW may have changed the leaf; re-check the free space. */
	free_space = btrfs_leaf_free_space(right);
	if (free_space < data_size)
		goto out_unlock;

	left_nritems = btrfs_header_nritems(left);
	if (left_nritems == 0)
		goto out_unlock;

	if (path->slots[0] == left_nritems && !empty) {
		/* Key greater than all keys in the leaf, right neighbor has
		 * enough room for it and we're not emptying our leaf to delete
		 * it, therefore use right neighbor to insert the new item and
		 * no need to touch/dirty our left leaf. */
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->nodes[0] = right;
		path->slots[0] = 0;
		path->slots[1]++;
		return 0;
	}

	return __push_leaf_right(path, min_data_size, empty,
				right, free_space, left_nritems, min_slot);
out_unlock:
	btrfs_tree_unlock(right);
	free_extent_buffer(right);
	return 1;
}
3897
3898 /*
3899  * push some data in the path leaf to the left, trying to free up at
3900  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
3901  *
3902  * max_slot can put a limit on how far into the leaf we'll push items.  The
 * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
3904  * items
3905  */
static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
				     int empty, struct extent_buffer *left,
				     int free_space, u32 right_nritems,
				     u32 max_slot)
{
	struct btrfs_fs_info *fs_info = left->fs_info;
	struct btrfs_disk_key disk_key;
	struct extent_buffer *right = path->nodes[0];
	int i;
	int push_space = 0;
	int push_items = 0;
	struct btrfs_item *item;
	u32 old_left_nritems;
	u32 nr;
	int ret = 0;
	u32 this_item_size;
	u32 old_left_item_size;
	struct btrfs_map_token token;

	/*
	 * Unless we are emptying the right leaf, leave at least one item
	 * behind so it remains a valid leaf.
	 */
	if (empty)
		nr = min(right_nritems, max_slot);
	else
		nr = min(right_nritems - 1, max_slot);

	/* Count how many items (and how many data bytes) fit into 'left'. */
	for (i = 0; i < nr; i++) {
		item = btrfs_item_nr(i);

		if (!empty && push_items > 0) {
			/* Never push the caller's own slot out of this leaf. */
			if (path->slots[0] < i)
				break;
			if (path->slots[0] == i) {
				int space = btrfs_leaf_free_space(right);

				/*
				 * Heuristic cut-off once pushing the slot
				 * itself stops paying off — NOTE(review):
				 * presumably keeps the two leaves balanced
				 * around the insertion point; confirm.
				 */
				if (space + push_space * 2 > free_space)
					break;
			}
		}

		/* Reserve room for the item about to be inserted here. */
		if (path->slots[0] == i)
			push_space += data_size;

		this_item_size = btrfs_item_size(right, item);
		if (this_item_size + sizeof(*item) + push_space > free_space)
			break;

		push_items++;
		push_space += this_item_size + sizeof(*item);
	}

	if (push_items == 0) {
		ret = 1;
		goto out;
	}
	WARN_ON(!empty && push_items == btrfs_header_nritems(right));

	/* push data from right to left */
	copy_extent_buffer(left, right,
			   btrfs_item_nr_offset(btrfs_header_nritems(left)),
			   btrfs_item_nr_offset(0),
			   push_items * sizeof(struct btrfs_item));

	/* Total data bytes occupied by the pushed items. */
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
		     btrfs_item_offset_nr(right, push_items - 1);

	/* Copy their data into the free area at the end of left's data. */
	copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
		     leaf_data_end(left) - push_space,
		     BTRFS_LEAF_DATA_OFFSET +
		     btrfs_item_offset_nr(right, push_items - 1),
		     push_space);
	old_left_nritems = btrfs_header_nritems(left);
	BUG_ON(old_left_nritems <= 0);

	/* Rewrite the data offsets of the items that just landed in left. */
	btrfs_init_map_token(&token, left);
	old_left_item_size = btrfs_item_offset_nr(left, old_left_nritems - 1);
	for (i = old_left_nritems; i < old_left_nritems + push_items; i++) {
		u32 ioff;

		item = btrfs_item_nr(i);

		ioff = btrfs_token_item_offset(left, item, &token);
		btrfs_set_token_item_offset(left, item,
		      ioff - (BTRFS_LEAF_DATA_SIZE(fs_info) - old_left_item_size),
		      &token);
	}
	btrfs_set_header_nritems(left, old_left_nritems + push_items);

	/* fixup right node */
	if (push_items > right_nritems)
		WARN(1, KERN_CRIT "push items %d nr %u\n", push_items,
		       right_nritems);

	if (push_items < right_nritems) {
		/* Slide the surviving data back against the end of the leaf. */
		push_space = btrfs_item_offset_nr(right, push_items - 1) -
						  leaf_data_end(right);
		memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
				      BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
				      BTRFS_LEAF_DATA_OFFSET +
				      leaf_data_end(right), push_space);

		/* Slide the surviving item headers down to slot 0. */
		memmove_extent_buffer(right, btrfs_item_nr_offset(0),
			      btrfs_item_nr_offset(push_items),
			     (btrfs_header_nritems(right) - push_items) *
			     sizeof(struct btrfs_item));
	}

	/* Recompute the data offsets of the items left in 'right'. */
	btrfs_init_map_token(&token, right);
	right_nritems -= push_items;
	btrfs_set_header_nritems(right, right_nritems);
	push_space = BTRFS_LEAF_DATA_SIZE(fs_info);
	for (i = 0; i < right_nritems; i++) {
		item = btrfs_item_nr(i);

		push_space = push_space - btrfs_token_item_size(right,
								item, &token);
		btrfs_set_token_item_offset(right, item, push_space, &token);
	}

	btrfs_mark_buffer_dirty(left);
	if (right_nritems)
		btrfs_mark_buffer_dirty(right);
	else
		btrfs_clean_tree_block(right);

	/* The right leaf's first key changed; propagate it up the path. */
	btrfs_item_key(right, &disk_key, 0);
	fixup_low_keys(path, &disk_key, 1);

	/* then fixup the leaf pointer in the path */
	if (path->slots[0] < push_items) {
		/* Our slot moved into 'left'; repoint the path at it. */
		path->slots[0] += old_left_nritems;
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = left;
		path->slots[1] -= 1;
	} else {
		/* Our slot stayed in 'right'; drop lock and ref on 'left'. */
		btrfs_tree_unlock(left);
		free_extent_buffer(left);
		path->slots[0] -= push_items;
	}
	BUG_ON(path->slots[0] < 0);
	return ret;
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}
4051
4052 /*
4053  * push some data in the path leaf to the left, trying to free up at
4054  * least data_size bytes.  returns zero if the push worked, nonzero otherwise
4055  *
4056  * max_slot can put a limit on how far into the leaf we'll push items.  The
4057  * item at 'max_slot' won't be touched.  Use (u32)-1 to make us push all the
4058  * items
4059  */
static int push_leaf_left(struct btrfs_trans_handle *trans, struct btrfs_root
			  *root, struct btrfs_path *path, int min_data_size,
			  int data_size, int empty, u32 max_slot)
{
	struct extent_buffer *right = path->nodes[0];
	struct extent_buffer *left;
	int slot;
	int free_space;
	u32 right_nritems;
	int ret = 0;

	/* No left sibling if we sit in the parent's first slot. */
	slot = path->slots[1];
	if (slot == 0)
		return 1;
	/* No parent at all: this leaf is the root, nothing to push to. */
	if (!path->nodes[1])
		return 1;

	right_nritems = btrfs_header_nritems(right);
	if (right_nritems == 0)
		return 1;

	btrfs_assert_tree_locked(path->nodes[1]);

	left = btrfs_read_node_slot(path->nodes[1], slot - 1);
	/*
	 * slot - 1 is not valid or we fail to read the left node,
	 * no big deal, just return.
	 */
	if (IS_ERR(left))
		return 1;

	btrfs_tree_lock(left);
	btrfs_set_lock_blocking_write(left);

	/* Quick check before paying for the COW below. */
	free_space = btrfs_leaf_free_space(left);
	if (free_space < data_size) {
		ret = 1;
		goto out;
	}

	/* cow and double check */
	ret = btrfs_cow_block(trans, root, left,
			      path->nodes[1], slot - 1, &left);
	if (ret) {
		/* we hit -ENOSPC, but it isn't fatal here */
		if (ret == -ENOSPC)
			ret = 1;
		goto out;
	}

	/* COW may have changed the leaf, so re-check the free space. */
	free_space = btrfs_leaf_free_space(left);
	if (free_space < data_size) {
		ret = 1;
		goto out;
	}

	/* __push_leaf_left() takes ownership of the lock/ref on 'left'. */
	return __push_leaf_left(path, min_data_size,
			       empty, left, free_space, right_nritems,
			       max_slot);
out:
	btrfs_tree_unlock(left);
	free_extent_buffer(left);
	return ret;
}
4124
/*
 * Copy items [mid, nritems) and their data from leaf 'l' into the freshly
 * allocated leaf 'right', link 'right' into the parent, and repoint the
 * path at whichever leaf now holds the original slot.  Helper for
 * split_leaf().
 */
static noinline void copy_for_split(struct btrfs_trans_handle *trans,
				    struct btrfs_path *path,
				    struct extent_buffer *l,
				    struct extent_buffer *right,
				    int slot, int mid, int nritems)
{
	struct btrfs_fs_info *fs_info = trans->fs_info;
	int data_copy_size;
	int rt_data_off;
	int i;
	struct btrfs_disk_key disk_key;
	struct btrfs_map_token token;

	/* Items [mid, nritems) of 'l' move into the new right leaf. */
	nritems = nritems - mid;
	btrfs_set_header_nritems(right, nritems);
	data_copy_size = btrfs_item_end_nr(l, mid) - leaf_data_end(l);

	/* Copy the item headers to the front of the right leaf ... */
	copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
			   btrfs_item_nr_offset(mid),
			   nritems * sizeof(struct btrfs_item));

	/* ... and their data, packed against the end of the right leaf. */
	copy_extent_buffer(right, l,
		     BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
		     data_copy_size, BTRFS_LEAF_DATA_OFFSET +
		     leaf_data_end(l), data_copy_size);

	/* How far the copied data moved toward the end of the leaf. */
	rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_end_nr(l, mid);

	/* Fix up the data offsets of the items that were copied. */
	btrfs_init_map_token(&token, right);
	for (i = 0; i < nritems; i++) {
		struct btrfs_item *item = btrfs_item_nr(i);
		u32 ioff;

		ioff = btrfs_token_item_offset(right, item, &token);
		btrfs_set_token_item_offset(right, item,
					    ioff + rt_data_off, &token);
	}

	/* Truncate the left leaf and link 'right' into the parent node. */
	btrfs_set_header_nritems(l, mid);
	btrfs_item_key(right, &disk_key, 0);
	insert_ptr(trans, path, &disk_key, right->start, path->slots[1] + 1, 1);

	btrfs_mark_buffer_dirty(right);
	btrfs_mark_buffer_dirty(l);
	BUG_ON(path->slots[0] != slot);

	/* Repoint the path at whichever leaf now holds our slot. */
	if (mid <= slot) {
		btrfs_tree_unlock(path->nodes[0]);
		free_extent_buffer(path->nodes[0]);
		path->nodes[0] = right;
		path->slots[0] -= mid;
		path->slots[1] += 1;
	} else {
		btrfs_tree_unlock(right);
		free_extent_buffer(right);
	}

	BUG_ON(path->slots[0] < 0);
}
4188
4189 /*
4190  * double splits happen when we need to insert a big item in the middle
4191  * of a leaf.  A double split can leave us with 3 mostly empty leaves:
4192  * leaf: [ slots 0 - N] [ our target ] [ N + 1 - total in leaf ]
4193  *          A                 B                 C
4194  *
4195  * We avoid this by trying to push the items on either side of our target
4196  * into the adjacent leaves.  If all goes well we can avoid the double split
4197  * completely.
4198  */
4199 static noinline int push_for_double_split(struct btrfs_trans_handle *trans,
4200                                           struct btrfs_root *root,
4201                                           struct btrfs_path *path,
4202                                           int data_size)
4203 {
4204         int ret;
4205         int progress = 0;
4206         int slot;
4207         u32 nritems;
4208         int space_needed = data_size;
4209
4210         slot = path->slots[0];
4211         if (slot < btrfs_header_nritems(path->nodes[0]))
4212                 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
4213
4214         /*
4215          * try to push all the items after our slot into the
4216          * right leaf
4217          */
4218         ret = push_leaf_right(trans, root, path, 1, space_needed, 0, slot);
4219         if (ret < 0)
4220                 return ret;
4221
4222         if (ret == 0)
4223                 progress++;
4224
4225         nritems = btrfs_header_nritems(path->nodes[0]);
4226         /*
4227          * our goal is to get our slot at the start or end of a leaf.  If
4228          * we've done so we're done
4229          */
4230         if (path->slots[0] == 0 || path->slots[0] == nritems)
4231                 return 0;
4232
4233         if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
4234                 return 0;
4235
4236         /* try to push all the items before our slot into the next leaf */
4237         slot = path->slots[0];
4238         space_needed = data_size;
4239         if (slot > 0)
4240                 space_needed -= btrfs_leaf_free_space(path->nodes[0]);
4241         ret = push_leaf_left(trans, root, path, 1, space_needed, 0, slot);
4242         if (ret < 0)
4243                 return ret;
4244
4245         if (ret == 0)
4246                 progress++;
4247
4248         if (progress)
4249                 return 0;
4250         return 1;
4251 }
4252
4253 /*
4254  * split the path's leaf in two, making sure there is at least data_size
4255  * available for the resulting leaf level of the path.
4256  *
4257  * returns 0 if all went well and < 0 on failure.
4258  */
static noinline int split_leaf(struct btrfs_trans_handle *trans,
			       struct btrfs_root *root,
			       const struct btrfs_key *ins_key,
			       struct btrfs_path *path, int data_size,
			       int extend)
{
	struct btrfs_disk_key disk_key;
	struct extent_buffer *l;
	u32 nritems;
	int mid;
	int slot;
	struct extent_buffer *right;
	struct btrfs_fs_info *fs_info = root->fs_info;
	int ret = 0;
	int wret;
	int split;
	int num_doubles = 0;
	int tried_avoid_double = 0;

	l = path->nodes[0];
	slot = path->slots[0];
	/* An item extended by data_size must still fit in an empty leaf. */
	if (extend && data_size + btrfs_item_size_nr(l, slot) +
	    sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(fs_info))
		return -EOVERFLOW;

	/* first try to make some room by pushing left and right */
	if (data_size && path->nodes[1]) {
		int space_needed = data_size;

		/*
		 * When inserting inside the leaf its own free space counts
		 * toward what we need; when appending past the last item it
		 * does not, since the new item can go entirely to a neighbor.
		 */
		if (slot < btrfs_header_nritems(l))
			space_needed -= btrfs_leaf_free_space(l);

		wret = push_leaf_right(trans, root, path, space_needed,
				       space_needed, 0, 0);
		if (wret < 0)
			return wret;
		if (wret) {
			/* The right push didn't free enough; try the left. */
			space_needed = data_size;
			if (slot > 0)
				space_needed -= btrfs_leaf_free_space(l);
			wret = push_leaf_left(trans, root, path, space_needed,
					      space_needed, 0, (u32)-1);
			if (wret < 0)
				return wret;
		}
		l = path->nodes[0];

		/* did the pushes work? */
		if (btrfs_leaf_free_space(l) >= data_size)
			return 0;
	}

	/* Splitting the root leaf: grow the tree by one level first. */
	if (!path->nodes[1]) {
		ret = insert_new_root(trans, root, path, 1);
		if (ret)
			return ret;
	}
again:
	/*
	 * split == 1: normal split at 'mid', both halves keep items.
	 * split == 0: no items move; a new empty leaf takes the insertion.
	 * split == 2: one split is not enough; split again after this one.
	 */
	split = 1;
	l = path->nodes[0];
	slot = path->slots[0];
	nritems = btrfs_header_nritems(l);
	mid = (nritems + 1) / 2;

	if (mid <= slot) {
		if (nritems == 1 ||
		    leaf_space_used(l, mid, nritems - mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(fs_info)) {
			if (slot >= nritems) {
				/* Appending: the new leaf can start empty. */
				split = 0;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	} else {
		if (leaf_space_used(l, 0, mid) + data_size >
			BTRFS_LEAF_DATA_SIZE(fs_info)) {
			if (!extend && data_size && slot == 0) {
				/* Prepending: the new leaf can start empty. */
				split = 0;
			} else if ((extend || !data_size) && slot == 0) {
				mid = 1;
			} else {
				mid = slot;
				if (mid != nritems &&
				    leaf_space_used(l, mid, nritems - mid) +
				    data_size > BTRFS_LEAF_DATA_SIZE(fs_info)) {
					if (data_size && !tried_avoid_double)
						goto push_for_double;
					split = 2;
				}
			}
		}
	}

	/* The new leaf's first key: the insertion key for an empty leaf. */
	if (split == 0)
		btrfs_cpu_key_to_disk(&disk_key, ins_key);
	else
		btrfs_item_key(l, &disk_key, mid);

	right = alloc_tree_block_no_bg_flush(trans, root, 0, &disk_key, 0,
					     l->start, 0);
	if (IS_ERR(right))
		return PTR_ERR(right);

	root_add_used(root, fs_info->nodesize);

	if (split == 0) {
		if (mid <= slot) {
			/* Empty new leaf goes after the current one. */
			btrfs_set_header_nritems(right, 0);
			insert_ptr(trans, path, &disk_key,
				   right->start, path->slots[1] + 1, 1);
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			path->slots[1] += 1;
		} else {
			/* Empty new leaf goes before the current one. */
			btrfs_set_header_nritems(right, 0);
			insert_ptr(trans, path, &disk_key,
				   right->start, path->slots[1], 1);
			btrfs_tree_unlock(path->nodes[0]);
			free_extent_buffer(path->nodes[0]);
			path->nodes[0] = right;
			path->slots[0] = 0;
			if (path->slots[1] == 0)
				fixup_low_keys(path, &disk_key, 1);
		}
		/*
		 * We create a new leaf 'right' for the required ins_len and
		 * we'll do btrfs_mark_buffer_dirty() on this leaf after copying
		 * the content of ins_len to 'right'.
		 */
		return ret;
	}

	copy_for_split(trans, path, l, right, slot, mid, nritems);

	if (split == 2) {
		/* A double split may only recurse once. */
		BUG_ON(num_doubles != 0);
		num_doubles++;
		goto again;
	}

	return 0;

push_for_double:
	/*
	 * Best effort: the return value of push_for_double_split() is
	 * ignored and we fall back to the double split.  NOTE(review): a
	 * negative return (e.g. a COW failure inside the pushes) is
	 * silently dropped here — confirm this is intended.
	 */
	push_for_double_split(trans, root, path, data_size);
	tried_avoid_double = 1;
	if (btrfs_leaf_free_space(path->nodes[0]) >= data_size)
		return 0;
	goto again;
}
4418
/*
 * Make sure the leaf at path->nodes[0] has at least 'ins_len' bytes of free
 * space, splitting it if necessary.  The path may be released and
 * re-searched in the process.
 *
 * Returns 0 on success, -EAGAIN if the target item or leaf changed while
 * the path was released (the caller must restart), or another negative
 * errno on error.
 */
static noinline int setup_leaf_for_split(struct btrfs_trans_handle *trans,
					 struct btrfs_root *root,
					 struct btrfs_path *path, int ins_len)
{
	struct btrfs_key key;
	struct extent_buffer *leaf;
	struct btrfs_file_extent_item *fi;
	u64 extent_len = 0;
	u32 item_size;
	int ret;

	leaf = path->nodes[0];
	btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);

	/* Only extent data and csum items are handled here. */
	BUG_ON(key.type != BTRFS_EXTENT_DATA_KEY &&
	       key.type != BTRFS_EXTENT_CSUM_KEY);

	if (btrfs_leaf_free_space(leaf) >= ins_len)
		return 0;

	/*
	 * Remember enough about the item to detect whether it changed
	 * while the path was dropped.
	 */
	item_size = btrfs_item_size_nr(leaf, path->slots[0]);
	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		extent_len = btrfs_file_extent_num_bytes(leaf, fi);
	}
	btrfs_release_path(path);

	path->keep_locks = 1;
	path->search_for_split = 1;
	ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
	path->search_for_split = 0;
	/* ret > 0 means the key was not found: the item went away. */
	if (ret > 0)
		ret = -EAGAIN;
	if (ret < 0)
		goto err;

	ret = -EAGAIN;
	leaf = path->nodes[0];
	/* if our item isn't there, return now */
	if (item_size != btrfs_item_size_nr(leaf, path->slots[0]))
		goto err;

	/* the leaf has changed, it now has room.  return now */
	if (btrfs_leaf_free_space(path->nodes[0]) >= ins_len)
		goto err;

	if (key.type == BTRFS_EXTENT_DATA_KEY) {
		fi = btrfs_item_ptr(leaf, path->slots[0],
				    struct btrfs_file_extent_item);
		if (extent_len != btrfs_file_extent_num_bytes(leaf, fi))
			goto err;
	}

	btrfs_set_path_blocking(path);
	ret = split_leaf(trans, root, &key, path, ins_len, 1);
	if (ret)
		goto err;

	path->keep_locks = 0;
	btrfs_unlock_up_safe(path, 1);
	return 0;
err:
	path->keep_locks = 0;
	return ret;
}
4485
/*
 * Split the item at path->slots[0] into two at 'split_offset' bytes from
 * its start.  The original item is truncated to 'split_offset' bytes and
 * keeps its key; the remaining bytes become a new item inserted in the
 * following slot under 'new_key'.  The leaf must already have room for one
 * extra struct btrfs_item header.
 *
 * Returns 0 on success or -ENOMEM if the temporary item copy cannot be
 * allocated.
 */
static noinline int split_item(struct btrfs_path *path,
			       const struct btrfs_key *new_key,
			       unsigned long split_offset)
{
	struct extent_buffer *leaf;
	struct btrfs_item *item;
	struct btrfs_item *new_item;
	int slot;
	char *buf;
	u32 nritems;
	u32 item_size;
	u32 orig_offset;
	struct btrfs_disk_key disk_key;

	leaf = path->nodes[0];
	/* The caller must have guaranteed this room (setup_leaf_for_split). */
	BUG_ON(btrfs_leaf_free_space(leaf) < sizeof(struct btrfs_item));

	btrfs_set_path_blocking(path);

	item = btrfs_item_nr(path->slots[0]);
	orig_offset = btrfs_item_offset(leaf, item);
	item_size = btrfs_item_size(leaf, item);

	/* Stash a copy of the item data; it is rewritten in place below. */
	buf = kmalloc(item_size, GFP_NOFS);
	if (!buf)
		return -ENOMEM;

	read_extent_buffer(leaf, buf, btrfs_item_ptr_offset(leaf,
			    path->slots[0]), item_size);

	slot = path->slots[0] + 1;
	nritems = btrfs_header_nritems(leaf);
	if (slot != nritems) {
		/* shift the items */
		memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
				btrfs_item_nr_offset(slot),
				(nritems - slot) * sizeof(struct btrfs_item));
	}

	btrfs_cpu_key_to_disk(&disk_key, new_key);
	btrfs_set_item_key(leaf, &disk_key, slot);

	new_item = btrfs_item_nr(slot);

	/* The new item covers the tail of the original data region. */
	btrfs_set_item_offset(leaf, new_item, orig_offset);
	btrfs_set_item_size(leaf, new_item, item_size - split_offset);

	/* The original item shrinks to the head of the region. */
	btrfs_set_item_offset(leaf, item,
			      orig_offset + item_size - split_offset);
	btrfs_set_item_size(leaf, item, split_offset);

	btrfs_set_header_nritems(leaf, nritems + 1);

	/* write the data for the start of the original item */
	write_extent_buffer(leaf, buf,
			    btrfs_item_ptr_offset(leaf, path->slots[0]),
			    split_offset);

	/* write the data for the new item */
	write_extent_buffer(leaf, buf + split_offset,
			    btrfs_item_ptr_offset(leaf, slot),
			    item_size - split_offset);
	btrfs_mark_buffer_dirty(leaf);

	BUG_ON(btrfs_leaf_free_space(leaf) < 0);
	kfree(buf);
	return 0;
}
4554
4555 /*
4556  * This function splits a single item into two items,
4557  * giving 'new_key' to the new item and splitting the
4558  * old one at split_offset (from the start of the item).
4559  *
4560  * The path may be released by this operation.  After
4561  * the split, the path is pointing to the old item.  The
4562  * new item is going to be in the same node as the old one.
4563  *
 * Note, the item being split must be small enough to live alone on
4565  * a tree block with room for one extra struct btrfs_item
4566  *
4567  * This allows us to split the item in place, keeping a lock on the
4568  * leaf the entire time.
4569  */
4570 int btrfs_split_item(struct btrfs_trans_handle *trans,
4571                      struct btrfs_root *root,
4572                      struct btrfs_path *path,
4573                      const struct btrfs_key *new_key,
4574                      unsigned long split_offset)
4575 {
4576         int ret;
4577         ret = setup_leaf_for_split(trans, root, path,
4578                                    sizeof(struct btrfs_item));
4579         if (ret)
4580                 return ret;
4581
4582         ret = split_item(path, new_key, split_offset);
4583         return ret;
4584 }
4585
4586 /*
 * This function duplicates an item, giving 'new_key' to the new item.
4588  * It guarantees both items live in the same tree leaf and the new item
4589  * is contiguous with the original item.
4590  *
4591  * This allows us to split file extent in place, keeping a lock on the
4592  * leaf the entire time.
4593  */
4594 int btrfs_duplicate_item(struct btrfs_trans_handle *trans,
4595                          struct btrfs_root *root,
4596                          struct btrfs_path *path,
4597                          const struct btrfs_key *new_key)
4598 {
4599         struct extent_buffer *leaf;
4600         int ret;
4601         u32 item_size;
4602
4603         leaf = path->nodes[0];
4604         item_size = btrfs_item_size_nr(leaf, path->slots[0]);
4605         ret = setup_leaf_for_split(trans, root, path,
4606                                    item_size + sizeof(struct btrfs_item));
4607         if (ret)
4608                 return ret;
4609
4610         path->slots[0]++;
4611         setup_items_for_insert(root, path, new_key, &item_size,
4612                                item_size, item_size +
4613                                sizeof(struct btrfs_item), 1);
4614         leaf = path->nodes[0];
4615         memcpy_extent_buffer(leaf,
4616                              btrfs_item_ptr_offset(leaf, path->slots[0]),
4617                              btrfs_item_ptr_offset(leaf, path->slots[0] - 1),
4618                              item_size);
4619         return 0;
4620 }
4621
4622 /*
4623  * make the item pointed to by the path smaller.  new_size indicates
4624  * how small to make it, and from_end tells us if we just chop bytes
4625  * off the end of the item or if we shift the item to chop bytes off
4626  * the front.
4627  */
4628 void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
4629 {
4630         int slot;
4631         struct extent_buffer *leaf;
4632         struct btrfs_item *item;
4633         u32 nritems;
4634         unsigned int data_end;
4635         unsigned int old_data_start;
4636         unsigned int old_size;
4637         unsigned int size_diff;
4638         int i;
4639         struct btrfs_map_token token;
4640
4641         leaf = path->nodes[0];
4642         slot = path->slots[0];
4643
4644         old_size = btrfs_item_size_nr(leaf, slot);
4645         if (old_size == new_size)
4646                 return;
4647
4648         nritems = btrfs_header_nritems(leaf);
4649         data_end = leaf_data_end(leaf);
4650
4651         old_data_start = btrfs_item_offset_nr(leaf, slot);
4652
4653         size_diff = old_size - new_size;
4654
4655         BUG_ON(slot < 0);
4656         BUG_ON(slot >= nritems);
4657
4658         /*
4659          * item0..itemN ... dataN.offset..dataN.size .. data0.size
4660          */
4661         /* first correct the data pointers */
4662         btrfs_init_map_token(&token, leaf);
4663         for (i = slot; i < nritems; i++) {
4664                 u32 ioff;
4665                 item = btrfs_item_nr(i);
4666
4667                 ioff = btrfs_token_item_offset(leaf, item, &token);
4668                 btrfs_set_token_item_offset(leaf, item,
4669                                             ioff + size_diff, &token);
4670         }
4671
4672         /* shift the data */
4673         if (from_end) {
4674                 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4675                               data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
4676                               data_end, old_data_start + new_size - data_end);
4677         } else {
4678                 struct btrfs_disk_key disk_key;
4679                 u64 offset;
4680
4681                 btrfs_item_key(leaf, &disk_key, slot);
4682
4683                 if (btrfs_disk_key_type(&disk_key) == BTRFS_EXTENT_DATA_KEY) {
4684                         unsigned long ptr;
4685                         struct btrfs_file_extent_item *fi;
4686
4687                         fi = btrfs_item_ptr(leaf, slot,
4688                                             struct btrfs_file_extent_item);
4689                         fi = (struct btrfs_file_extent_item *)(
4690                              (unsigned long)fi - size_diff);
4691
4692                         if (btrfs_file_extent_type(leaf, fi) ==
4693                             BTRFS_FILE_EXTENT_INLINE) {
4694                                 ptr = btrfs_item_ptr_offset(leaf, slot);
4695                                 memmove_extent_buffer(leaf, ptr,
4696                                       (unsigned long)fi,
4697                                       BTRFS_FILE_EXTENT_INLINE_DATA_START);
4698                         }
4699                 }
4700
4701                 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4702                               data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
4703                               data_end, old_data_start - data_end);
4704
4705                 offset = btrfs_disk_key_offset(&disk_key);
4706                 btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
4707                 btrfs_set_item_key(leaf, &disk_key, slot);
4708                 if (slot == 0)
4709                         fixup_low_keys(path, &disk_key, 1);
4710         }
4711
4712         item = btrfs_item_nr(slot);
4713         btrfs_set_item_size(leaf, item, new_size);
4714         btrfs_mark_buffer_dirty(leaf);
4715
4716         if (btrfs_leaf_free_space(leaf) < 0) {
4717                 btrfs_print_leaf(leaf);
4718                 BUG();
4719         }
4720 }
4721
4722 /*
4723  * make the item pointed to by the path bigger, data_size is the added size.
4724  */
4725 void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
4726 {
4727         int slot;
4728         struct extent_buffer *leaf;
4729         struct btrfs_item *item;
4730         u32 nritems;
4731         unsigned int data_end;
4732         unsigned int old_data;
4733         unsigned int old_size;
4734         int i;
4735         struct btrfs_map_token token;
4736
4737         leaf = path->nodes[0];
4738
4739         nritems = btrfs_header_nritems(leaf);
4740         data_end = leaf_data_end(leaf);
4741
4742         if (btrfs_leaf_free_space(leaf) < data_size) {
4743                 btrfs_print_leaf(leaf);
4744                 BUG();
4745         }
4746         slot = path->slots[0];
4747         old_data = btrfs_item_end_nr(leaf, slot);
4748
4749         BUG_ON(slot < 0);
4750         if (slot >= nritems) {
4751                 btrfs_print_leaf(leaf);
4752                 btrfs_crit(leaf->fs_info, "slot %d too large, nritems %d",
4753                            slot, nritems);
4754                 BUG();
4755         }
4756
4757         /*
4758          * item0..itemN ... dataN.offset..dataN.size .. data0.size
4759          */
4760         /* first correct the data pointers */
4761         btrfs_init_map_token(&token, leaf);
4762         for (i = slot; i < nritems; i++) {
4763                 u32 ioff;
4764                 item = btrfs_item_nr(i);
4765
4766                 ioff = btrfs_token_item_offset(leaf, item, &token);
4767                 btrfs_set_token_item_offset(leaf, item,
4768                                             ioff - data_size, &token);
4769         }
4770
4771         /* shift the data */
4772         memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4773                       data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
4774                       data_end, old_data - data_end);
4775
4776         data_end = old_data;
4777         old_size = btrfs_item_size_nr(leaf, slot);
4778         item = btrfs_item_nr(slot);
4779         btrfs_set_item_size(leaf, item, old_size + data_size);
4780         btrfs_mark_buffer_dirty(leaf);
4781
4782         if (btrfs_leaf_free_space(leaf) < 0) {
4783                 btrfs_print_leaf(leaf);
4784                 BUG();
4785         }
4786 }
4787
4788 /*
4789  * this is a helper for btrfs_insert_empty_items, the main goal here is
4790  * to save stack depth by doing the bulk of the work in a function
4791  * that doesn't call btrfs_search_slot
4792  */
4793 void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
4794                             const struct btrfs_key *cpu_key, u32 *data_size,
4795                             u32 total_data, u32 total_size, int nr)
4796 {
4797         struct btrfs_fs_info *fs_info = root->fs_info;
4798         struct btrfs_item *item;
4799         int i;
4800         u32 nritems;
4801         unsigned int data_end;
4802         struct btrfs_disk_key disk_key;
4803         struct extent_buffer *leaf;
4804         int slot;
4805         struct btrfs_map_token token;
4806
4807         if (path->slots[0] == 0) {
4808                 btrfs_cpu_key_to_disk(&disk_key, cpu_key);
4809                 fixup_low_keys(path, &disk_key, 1);
4810         }
4811         btrfs_unlock_up_safe(path, 1);
4812
4813         leaf = path->nodes[0];
4814         slot = path->slots[0];
4815
4816         nritems = btrfs_header_nritems(leaf);
4817         data_end = leaf_data_end(leaf);
4818
4819         if (btrfs_leaf_free_space(leaf) < total_size) {
4820                 btrfs_print_leaf(leaf);
4821                 btrfs_crit(fs_info, "not enough freespace need %u have %d",
4822                            total_size, btrfs_leaf_free_space(leaf));
4823                 BUG();
4824         }
4825
4826         btrfs_init_map_token(&token, leaf);
4827         if (slot != nritems) {
4828                 unsigned int old_data = btrfs_item_end_nr(leaf, slot);
4829
4830                 if (old_data < data_end) {
4831                         btrfs_print_leaf(leaf);
4832                         btrfs_crit(fs_info, "slot %d old_data %d data_end %d",
4833                                    slot, old_data, data_end);
4834                         BUG();
4835                 }
4836                 /*
4837                  * item0..itemN ... dataN.offset..dataN.size .. data0.size
4838                  */
4839                 /* first correct the data pointers */
4840                 for (i = slot; i < nritems; i++) {
4841                         u32 ioff;
4842
4843                         item = btrfs_item_nr(i);
4844                         ioff = btrfs_token_item_offset(leaf, item, &token);
4845                         btrfs_set_token_item_offset(leaf, item,
4846                                                     ioff - total_data, &token);
4847                 }
4848                 /* shift the items */
4849                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + nr),
4850                               btrfs_item_nr_offset(slot),
4851                               (nritems - slot) * sizeof(struct btrfs_item));
4852
4853                 /* shift the data */
4854                 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
4855                               data_end - total_data, BTRFS_LEAF_DATA_OFFSET +
4856                               data_end, old_data - data_end);
4857                 data_end = old_data;
4858         }
4859
4860         /* setup the item for the new data */
4861         for (i = 0; i < nr; i++) {
4862                 btrfs_cpu_key_to_disk(&disk_key, cpu_key + i);
4863                 btrfs_set_item_key(leaf, &disk_key, slot + i);
4864                 item = btrfs_item_nr(slot + i);
4865                 btrfs_set_token_item_offset(leaf, item,
4866                                             data_end - data_size[i], &token);
4867                 data_end -= data_size[i];
4868                 btrfs_set_token_item_size(leaf, item, data_size[i], &token);
4869         }
4870
4871         btrfs_set_header_nritems(leaf, nritems + nr);
4872         btrfs_mark_buffer_dirty(leaf);
4873
4874         if (btrfs_leaf_free_space(leaf) < 0) {
4875                 btrfs_print_leaf(leaf);
4876                 BUG();
4877         }
4878 }
4879
4880 /*
4881  * Given a key and some data, insert items into the tree.
4882  * This does all the path init required, making room in the tree if needed.
4883  */
4884 int btrfs_insert_empty_items(struct btrfs_trans_handle *trans,
4885                             struct btrfs_root *root,
4886                             struct btrfs_path *path,
4887                             const struct btrfs_key *cpu_key, u32 *data_size,
4888                             int nr)
4889 {
4890         int ret = 0;
4891         int slot;
4892         int i;
4893         u32 total_size = 0;
4894         u32 total_data = 0;
4895
4896         for (i = 0; i < nr; i++)
4897                 total_data += data_size[i];
4898
4899         total_size = total_data + (nr * sizeof(struct btrfs_item));
4900         ret = btrfs_search_slot(trans, root, cpu_key, path, total_size, 1);
4901         if (ret == 0)
4902                 return -EEXIST;
4903         if (ret < 0)
4904                 return ret;
4905
4906         slot = path->slots[0];
4907         BUG_ON(slot < 0);
4908
4909         setup_items_for_insert(root, path, cpu_key, data_size,
4910                                total_data, total_size, nr);
4911         return 0;
4912 }
4913
4914 /*
4915  * Given a key and some data, insert an item into the tree.
4916  * This does all the path init required, making room in the tree if needed.
4917  */
4918 int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root,
4919                       const struct btrfs_key *cpu_key, void *data,
4920                       u32 data_size)
4921 {
4922         int ret = 0;
4923         struct btrfs_path *path;
4924         struct extent_buffer *leaf;
4925         unsigned long ptr;
4926
4927         path = btrfs_alloc_path();
4928         if (!path)
4929                 return -ENOMEM;
4930         ret = btrfs_insert_empty_item(trans, root, path, cpu_key, data_size);
4931         if (!ret) {
4932                 leaf = path->nodes[0];
4933                 ptr = btrfs_item_ptr_offset(leaf, path->slots[0]);
4934                 write_extent_buffer(leaf, data, ptr, data_size);
4935                 btrfs_mark_buffer_dirty(leaf);
4936         }
4937         btrfs_free_path(path);
4938         return ret;
4939 }
4940
4941 /*
4942  * delete the pointer from a given node.
4943  *
4944  * the tree should have been previously balanced so the deletion does not
4945  * empty a node.
4946  */
4947 static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
4948                     int level, int slot)
4949 {
4950         struct extent_buffer *parent = path->nodes[level];
4951         u32 nritems;
4952         int ret;
4953
4954         nritems = btrfs_header_nritems(parent);
4955         if (slot != nritems - 1) {
4956                 if (level) {
4957                         ret = tree_mod_log_insert_move(parent, slot, slot + 1,
4958                                         nritems - slot - 1);
4959                         BUG_ON(ret < 0);
4960                 }
4961                 memmove_extent_buffer(parent,
4962                               btrfs_node_key_ptr_offset(slot),
4963                               btrfs_node_key_ptr_offset(slot + 1),
4964                               sizeof(struct btrfs_key_ptr) *
4965                               (nritems - slot - 1));
4966         } else if (level) {
4967                 ret = tree_mod_log_insert_key(parent, slot, MOD_LOG_KEY_REMOVE,
4968                                 GFP_NOFS);
4969                 BUG_ON(ret < 0);
4970         }
4971
4972         nritems--;
4973         btrfs_set_header_nritems(parent, nritems);
4974         if (nritems == 0 && parent == root->node) {
4975                 BUG_ON(btrfs_header_level(root->node) != 1);
4976                 /* just turn the root into a leaf and break */
4977                 btrfs_set_header_level(root->node, 0);
4978         } else if (slot == 0) {
4979                 struct btrfs_disk_key disk_key;
4980
4981                 btrfs_node_key(parent, &disk_key, 0);
4982                 fixup_low_keys(path, &disk_key, level + 1);
4983         }
4984         btrfs_mark_buffer_dirty(parent);
4985 }
4986
4987 /*
4988  * a helper function to delete the leaf pointed to by path->slots[1] and
4989  * path->nodes[1].
4990  *
4991  * This deletes the pointer in path->nodes[1] and frees the leaf
4992  * block extent.  zero is returned if it all worked out, < 0 otherwise.
4993  *
4994  * The path must have already been setup for deleting the leaf, including
4995  * all the proper balancing.  path->nodes[1] must be locked.
4996  */
4997 static noinline void btrfs_del_leaf(struct btrfs_trans_handle *trans,
4998                                     struct btrfs_root *root,
4999                                     struct btrfs_path *path,
5000                                     struct extent_buffer *leaf)
5001 {
5002         WARN_ON(btrfs_header_generation(leaf) != trans->transid);
5003         del_ptr(root, path, 1, path->slots[1]);
5004
5005         /*
5006          * btrfs_free_extent is expensive, we want to make sure we
5007          * aren't holding any locks when we call it
5008          */
5009         btrfs_unlock_up_safe(path, 0);
5010
5011         root_sub_used(root, leaf->len);
5012
5013         extent_buffer_get(leaf);
5014         btrfs_free_tree_block(trans, root, leaf, 0, 1);
5015         free_extent_buffer_stale(leaf);
5016 }
5017 /*
5018  * delete the item at the leaf level in path.  If that empties
5019  * the leaf, remove it from the tree
5020  */
5021 int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
5022                     struct btrfs_path *path, int slot, int nr)
5023 {
5024         struct btrfs_fs_info *fs_info = root->fs_info;
5025         struct extent_buffer *leaf;
5026         struct btrfs_item *item;
5027         u32 last_off;
5028         u32 dsize = 0;
5029         int ret = 0;
5030         int wret;
5031         int i;
5032         u32 nritems;
5033
5034         leaf = path->nodes[0];
5035         last_off = btrfs_item_offset_nr(leaf, slot + nr - 1);
5036
5037         for (i = 0; i < nr; i++)
5038                 dsize += btrfs_item_size_nr(leaf, slot + i);
5039
5040         nritems = btrfs_header_nritems(leaf);
5041
5042         if (slot + nr != nritems) {
5043                 int data_end = leaf_data_end(leaf);
5044                 struct btrfs_map_token token;
5045
5046                 memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
5047                               data_end + dsize,
5048                               BTRFS_LEAF_DATA_OFFSET + data_end,
5049                               last_off - data_end);
5050
5051                 btrfs_init_map_token(&token, leaf);
5052                 for (i = slot + nr; i < nritems; i++) {
5053                         u32 ioff;
5054
5055                         item = btrfs_item_nr(i);
5056                         ioff = btrfs_token_item_offset(leaf, item, &token);
5057                         btrfs_set_token_item_offset(leaf, item,
5058                                                     ioff + dsize, &token);
5059                 }
5060
5061                 memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
5062                               btrfs_item_nr_offset(slot + nr),
5063                               sizeof(struct btrfs_item) *
5064                               (nritems - slot - nr));
5065         }
5066         btrfs_set_header_nritems(leaf, nritems - nr);
5067         nritems -= nr;
5068
5069         /* delete the leaf if we've emptied it */
5070         if (nritems == 0) {
5071                 if (leaf == root->node) {
5072                         btrfs_set_header_level(leaf, 0);
5073                 } else {
5074                         btrfs_set_path_blocking(path);
5075                         btrfs_clean_tree_block(leaf);
5076                         btrfs_del_leaf(trans, root, path, leaf);
5077                 }
5078         } else {
5079                 int used = leaf_space_used(leaf, 0, nritems);
5080                 if (slot == 0) {
5081                         struct btrfs_disk_key disk_key;
5082
5083                         btrfs_item_key(leaf, &disk_key, 0);
5084                         fixup_low_keys(path, &disk_key, 1);
5085                 }
5086
5087                 /* delete the leaf if it is mostly empty */
5088                 if (used < BTRFS_LEAF_DATA_SIZE(fs_info) / 3) {
5089                         /* push_leaf_left fixes the path.
5090                          * make sure the path still points to our leaf
5091                          * for possible call to del_ptr below
5092                          */
5093                         slot = path->slots[1];
5094                         extent_buffer_get(leaf);
5095
5096                         btrfs_set_path_blocking(path);
5097                         wret = push_leaf_left(trans, root, path, 1, 1,
5098                                               1, (u32)-1);
5099                         if (wret < 0 && wret != -ENOSPC)
5100                                 ret = wret;
5101
5102                         if (path->nodes[0] == leaf &&
5103                             btrfs_header_nritems(leaf)) {
5104                                 wret = push_leaf_right(trans, root, path, 1,
5105                                                        1, 1, 0);
5106                                 if (wret < 0 && wret != -ENOSPC)
5107                                         ret = wret;
5108                         }
5109
5110                         if (btrfs_header_nritems(leaf) == 0) {
5111                                 path->slots[1] = slot;
5112                                 btrfs_del_leaf(trans, root, path, leaf);
5113                                 free_extent_buffer(leaf);
5114                                 ret = 0;
5115                         } else {
5116                                 /* if we're still in the path, make sure
5117                                  * we're dirty.  Otherwise, one of the
5118                                  * push_leaf functions must have already
5119                                  * dirtied this buffer
5120                                  */
5121                                 if (path->nodes[0] == leaf)
5122                                         btrfs_mark_buffer_dirty(leaf);
5123                                 free_extent_buffer(leaf);
5124                         }
5125                 } else {
5126                         btrfs_mark_buffer_dirty(leaf);
5127                 }
5128         }
5129         return ret;
5130 }
5131
5132 /*
5133  * search the tree again to find a leaf with lesser keys
5134  * returns 0 if it found something or 1 if there are no lesser leaves.
5135  * returns < 0 on io errors.
5136  *
5137  * This may release the path, and so you may lose any locks held at the
5138  * time you call it.
5139  */
5140 int btrfs_prev_leaf(struct btrfs_root *root, struct btrfs_path *path)
5141 {
5142         struct btrfs_key key;
5143         struct btrfs_key orig_key;
5144         struct btrfs_disk_key found_key;
5145         int ret;
5146
5147         btrfs_item_key_to_cpu(path->nodes[0], &key, 0);
5148         orig_key = key;
5149
5150         if (key.offset > 0) {
5151                 key.offset--;
5152         } else if (key.type > 0) {
5153                 key.type--;
5154                 key.offset = (u64)-1;
5155         } else if (key.objectid > 0) {
5156                 key.objectid--;
5157                 key.type = (u8)-1;
5158                 key.offset = (u64)-1;
5159         } else {
5160                 return 1;
5161         }
5162
5163         btrfs_release_path(path);
5164         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
5165         if (ret <= 0)
5166                 return ret;
5167
5168         /*
5169          * Previous key not found. Even if we were at slot 0 of the leaf we had
5170          * before releasing the path and calling btrfs_search_slot(), we now may
5171          * be in a slot pointing to the same original key - this can happen if
5172          * after we released the path, one of more items were moved from a
5173          * sibling leaf into the front of the leaf we had due to an insertion
5174          * (see push_leaf_right()).
5175          * If we hit this case and our slot is > 0 and just decrement the slot
5176          * so that the caller does not process the same key again, which may or
5177          * may not break the caller, depending on its logic.
5178          */
5179         if (path->slots[0] < btrfs_header_nritems(path->nodes[0])) {
5180                 btrfs_item_key(path->nodes[0], &found_key, path->slots[0]);
5181                 ret = comp_keys(&found_key, &orig_key);
5182                 if (ret == 0) {
5183                         if (path->slots[0] > 0) {
5184                                 path->slots[0]--;
5185                                 return 0;
5186                         }
5187                         /*
5188                          * At slot 0, same key as before, it means orig_key is
5189                          * the lowest, leftmost, key in the tree. We're done.
5190                          */
5191                         return 1;
5192                 }
5193         }
5194
5195         btrfs_item_key(path->nodes[0], &found_key, 0);
5196         ret = comp_keys(&found_key, &key);
5197         /*
5198          * We might have had an item with the previous key in the tree right
5199          * before we released our path. And after we released our path, that
5200          * item might have been pushed to the first slot (0) of the leaf we
5201          * were holding due to a tree balance. Alternatively, an item with the
5202          * previous key can exist as the only element of a leaf (big fat item).
5203          * Therefore account for these 2 cases, so that our callers (like
5204          * btrfs_previous_item) don't miss an existing item with a key matching
5205          * the previous key we computed above.
5206          */
5207         if (ret <= 0)
5208                 return 0;
5209         return 1;
5210 }
5211
5212 /*
5213  * A helper function to walk down the tree starting at min_key, and looking
5214  * for nodes or leaves that are have a minimum transaction id.
5215  * This is used by the btree defrag code, and tree logging
5216  *
5217  * This does not cow, but it does stuff the starting key it finds back
5218  * into min_key, so you can call btrfs_search_slot with cow=1 on the
5219  * key and get a writable path.
5220  *
5221  * This honors path->lowest_level to prevent descent past a given level
5222  * of the tree.
5223  *
5224  * min_trans indicates the oldest transaction that you are interested
5225  * in walking through.  Any nodes or leaves older than min_trans are
5226  * skipped over (without reading them).
5227  *
5228  * returns zero if something useful was found, < 0 on error and 1 if there
5229  * was nothing in the tree that matched the search criteria.
5230  */
5231 int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key,
5232                          struct btrfs_path *path,
5233                          u64 min_trans)
5234 {
5235         struct extent_buffer *cur;
5236         struct btrfs_key found_key;
5237         int slot;
5238         int sret;
5239         u32 nritems;
5240         int level;
5241         int ret = 1;
5242         int keep_locks = path->keep_locks;
5243
5244         path->keep_locks = 1;
5245 again:
5246         cur = btrfs_read_lock_root_node(root);
5247         level = btrfs_header_level(cur);
5248         WARN_ON(path->nodes[level]);
5249         path->nodes[level] = cur;
5250         path->locks[level] = BTRFS_READ_LOCK;
5251
5252         if (btrfs_header_generation(cur) < min_trans) {
5253                 ret = 1;
5254                 goto out;
5255         }
5256         while (1) {
5257                 nritems = btrfs_header_nritems(cur);
5258                 level = btrfs_header_level(cur);
5259                 sret = btrfs_bin_search(cur, min_key, level, &slot);
5260                 if (sret < 0) {
5261                         ret = sret;
5262                         goto out;
5263                 }
5264
5265                 /* at the lowest level, we're done, setup the path and exit */
5266                 if (level == path->lowest_level) {
5267                         if (slot >= nritems)
5268                                 goto find_next_key;
5269                         ret = 0;
5270                         path->slots[level] = slot;
5271                         btrfs_item_key_to_cpu(cur, &found_key, slot);
5272                         goto out;
5273                 }
5274                 if (sret && slot > 0)
5275                         slot--;
5276                 /*
5277                  * check this node pointer against the min_trans parameters.
5278                  * If it is too old, skip to the next one.
5279                  */
5280                 while (slot < nritems) {
5281                         u64 gen;
5282
5283                         gen = btrfs_node_ptr_generation(cur, slot);
5284                         if (gen < min_trans) {
5285                                 slot++;
5286                                 continue;
5287                         }
5288                         break;
5289                 }
5290 find_next_key:
5291                 /*
5292                  * we didn't find a candidate key in this node, walk forward
5293                  * and find another one
5294                  */
5295                 if (slot >= nritems) {
5296                         path->slots[level] = slot;
5297                         btrfs_set_path_blocking(path);
5298                         sret = btrfs_find_next_key(root, path, min_key, level,
5299                                                   min_trans);
5300                         if (sret == 0) {
5301                                 btrfs_release_path(path);
5302                                 goto again;
5303                         } else {
5304                                 goto out;
5305                         }
5306                 }
5307                 /* save our key for returning back */
5308                 btrfs_node_key_to_cpu(cur, &found_key, slot);
5309                 path->slots[level] = slot;
5310                 if (level == path->lowest_level) {
5311                         ret = 0;
5312                         goto out;
5313                 }
5314                 btrfs_set_path_blocking(path);
5315                 cur = btrfs_read_node_slot(cur, slot);
5316                 if (IS_ERR(cur)) {
5317                         ret = PTR_ERR(cur);
5318                         goto out;
5319                 }
5320
5321                 btrfs_tree_read_lock(cur);
5322
5323                 path->locks[level - 1] = BTRFS_READ_LOCK;
5324                 path->nodes[level - 1] = cur;
5325                 unlock_up(path, level, 1, 0, NULL);
5326         }
5327 out:
5328         path->keep_locks = keep_locks;
5329         if (ret == 0) {
5330                 btrfs_unlock_up_safe(path, path->lowest_level + 1);
5331                 btrfs_set_path_blocking(path);
5332                 memcpy(min_key, &found_key, sizeof(found_key));
5333         }
5334         return ret;
5335 }
5336
5337 /*
5338  * this is similar to btrfs_next_leaf, but does not try to preserve
5339  * and fixup the path.  It looks for and returns the next key in the
5340  * tree based on the current path and the min_trans parameters.
5341  *
5342  * 0 is returned if another key is found, < 0 if there are any errors
5343  * and 1 is returned if there are no higher keys in the tree
5344  *
5345  * path->keep_locks should be set to 1 on the search made before
5346  * calling this function.
5347  */
5348 int btrfs_find_next_key(struct btrfs_root *root, struct btrfs_path *path,
5349                         struct btrfs_key *key, int level, u64 min_trans)
5350 {
5351         int slot;
5352         struct extent_buffer *c;
5353
5354         WARN_ON(!path->keep_locks && !path->skip_locking);
5355         while (level < BTRFS_MAX_LEVEL) {
5356                 if (!path->nodes[level])
5357                         return 1;
5358
5359                 slot = path->slots[level] + 1;
5360                 c = path->nodes[level];
5361 next:
5362                 if (slot >= btrfs_header_nritems(c)) {
5363                         int ret;
5364                         int orig_lowest;
5365                         struct btrfs_key cur_key;
5366                         if (level + 1 >= BTRFS_MAX_LEVEL ||
5367                             !path->nodes[level + 1])
5368                                 return 1;
5369
5370                         if (path->locks[level + 1] || path->skip_locking) {
5371                                 level++;
5372                                 continue;
5373                         }
5374
5375                         slot = btrfs_header_nritems(c) - 1;
5376                         if (level == 0)
5377                                 btrfs_item_key_to_cpu(c, &cur_key, slot);
5378                         else
5379                                 btrfs_node_key_to_cpu(c, &cur_key, slot);
5380
5381                         orig_lowest = path->lowest_level;
5382                         btrfs_release_path(path);
5383                         path->lowest_level = level;
5384                         ret = btrfs_search_slot(NULL, root, &cur_key, path,
5385                                                 0, 0);
5386                         path->lowest_level = orig_lowest;
5387                         if (ret < 0)
5388                                 return ret;
5389
5390                         c = path->nodes[level];
5391                         slot = path->slots[level];
5392                         if (ret == 0)
5393                                 slot++;
5394                         goto next;
5395                 }
5396
5397                 if (level == 0)
5398                         btrfs_item_key_to_cpu(c, key, slot);
5399                 else {
5400                         u64 gen = btrfs_node_ptr_generation(c, slot);
5401
5402                         if (gen < min_trans) {
5403                                 slot++;
5404                                 goto next;
5405                         }
5406                         btrfs_node_key_to_cpu(c, key, slot);
5407                 }
5408                 return 0;
5409         }
5410         return 1;
5411 }
5412
5413 /*
5414  * search the tree again to find a leaf with greater keys
5415  * returns 0 if it found something or 1 if there are no greater leaves.
5416  * returns < 0 on io errors.
5417  */
5418 int btrfs_next_leaf(struct btrfs_root *root, struct btrfs_path *path)
5419 {
5420         return btrfs_next_old_leaf(root, path, 0);
5421 }
5422
/*
 * Find the leaf following the one currently in @path, optionally as the
 * tree looked at the time given by @time_seq (0 means the current tree).
 *
 * Finding the next leaf requires re-searching from the root (all locks are
 * dropped when the path is released), so the tree may change under us; the
 * races that can result are handled explicitly below.
 *
 * Returns 0 if a next leaf was found (path then points into it), 1 if
 * there are no greater leaves, < 0 on io errors.
 */
int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
			u64 time_seq)
{
	int slot;
	int level;
	struct extent_buffer *c;
	struct extent_buffer *next;
	struct btrfs_key key;
	u32 nritems;
	int ret;
	int old_spinning = path->leave_spinning;
	int next_rw_lock = 0;

	nritems = btrfs_header_nritems(path->nodes[0]);
	if (nritems == 0)
		return 1;

	/* The re-search below starts from the last key of the current leaf. */
	btrfs_item_key_to_cpu(path->nodes[0], &key, nritems - 1);
again:
	level = 1;
	next = NULL;
	next_rw_lock = 0;
	btrfs_release_path(path);

	/*
	 * Keep the locks on the upper levels so we can step to the right
	 * sibling without the tree changing in between.
	 */
	path->keep_locks = 1;
	path->leave_spinning = 1;

	if (time_seq)
		ret = btrfs_search_old_slot(root, &key, path, time_seq);
	else
		ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
	path->keep_locks = 0;

	if (ret < 0)
		return ret;

	nritems = btrfs_header_nritems(path->nodes[0]);
	/*
	 * by releasing the path above we dropped all our locks.  A balance
	 * could have added more items next to the key that used to be
	 * at the very end of the block.  So, check again here and
	 * advance the path if there are now more items available.
	 */
	if (nritems > 0 && path->slots[0] < nritems - 1) {
		if (ret == 0)
			path->slots[0]++;
		ret = 0;
		goto done;
	}
	/*
	 * So the above check misses one case:
	 * - after releasing the path above, someone has removed the item that
	 *   used to be at the very end of the block, and balance between leaves
	 *   gets another one with bigger key.offset to replace it.
	 *
	 * This one should be returned as well, or we can get leaf corruption
	 * later(esp. in __btrfs_drop_extents()).
	 *
	 * And a bit more explanation about this check,
	 * with ret > 0, the key isn't found, the path points to the slot
	 * where it should be inserted, so the path->slots[0] item must be the
	 * bigger one.
	 */
	if (nritems > 0 && ret > 0 && path->slots[0] == nritems - 1) {
		ret = 0;
		goto done;
	}

	/*
	 * Walk up the tree until we find a node that has a slot to the right
	 * of the one the search left us in; the subtree under that slot
	 * holds the next leaf.
	 */
	while (level < BTRFS_MAX_LEVEL) {
		if (!path->nodes[level]) {
			ret = 1;
			goto done;
		}

		slot = path->slots[level] + 1;
		c = path->nodes[level];
		if (slot >= btrfs_header_nritems(c)) {
			/* No right sibling at this level, go up one more. */
			level++;
			if (level == BTRFS_MAX_LEVEL) {
				ret = 1;
				goto done;
			}
			continue;
		}

		/* Drop the buffer taken by a previous loop iteration. */
		if (next) {
			btrfs_tree_unlock_rw(next, next_rw_lock);
			free_extent_buffer(next);
		}

		next = c;
		next_rw_lock = path->locks[level];
		ret = read_block_for_search(root, path, &next, level,
					    slot, &key);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret && time_seq) {
				/*
				 * If we don't get the lock, we may be racing
				 * with push_leaf_left, holding that lock while
				 * itself waiting for the leaf we've currently
				 * locked. To solve this situation, we give up
				 * on our lock and cycle.
				 */
				free_extent_buffer(next);
				btrfs_release_path(path);
				cond_resched();
				goto again;
			}
			if (!ret) {
				/* Try-lock failed: block and wait for it. */
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
		break;
	}
	path->slots[level] = slot;
	/*
	 * Now walk back down the leftmost edge of the subtree rooted at
	 * 'next', swapping the new blocks into the path as we go, until we
	 * reach level 0 (the next leaf).
	 */
	while (1) {
		level--;
		c = path->nodes[level];
		if (path->locks[level])
			btrfs_tree_unlock_rw(c, path->locks[level]);

		free_extent_buffer(c);
		path->nodes[level] = next;
		path->slots[level] = 0;
		if (!path->skip_locking)
			path->locks[level] = next_rw_lock;
		if (!level)
			break;

		ret = read_block_for_search(root, path, &next, level,
					    0, &key);
		if (ret == -EAGAIN)
			goto again;

		if (ret < 0) {
			btrfs_release_path(path);
			goto done;
		}

		if (!path->skip_locking) {
			ret = btrfs_try_tree_read_lock(next);
			if (!ret) {
				btrfs_set_path_blocking(path);
				btrfs_tree_read_lock(next);
			}
			next_rw_lock = BTRFS_READ_LOCK;
		}
	}
	ret = 0;
done:
	/* Drop locks above the leaf and restore the caller's spinning state. */
	unlock_up(path, 0, 1, 0, NULL);
	path->leave_spinning = old_spinning;
	if (!old_spinning)
		btrfs_set_path_blocking(path);

	return ret;
}
5591
5592 /*
5593  * this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
5594  * searching until it gets past min_objectid or finds an item of 'type'
5595  *
5596  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5597  */
5598 int btrfs_previous_item(struct btrfs_root *root,
5599                         struct btrfs_path *path, u64 min_objectid,
5600                         int type)
5601 {
5602         struct btrfs_key found_key;
5603         struct extent_buffer *leaf;
5604         u32 nritems;
5605         int ret;
5606
5607         while (1) {
5608                 if (path->slots[0] == 0) {
5609                         btrfs_set_path_blocking(path);
5610                         ret = btrfs_prev_leaf(root, path);
5611                         if (ret != 0)
5612                                 return ret;
5613                 } else {
5614                         path->slots[0]--;
5615                 }
5616                 leaf = path->nodes[0];
5617                 nritems = btrfs_header_nritems(leaf);
5618                 if (nritems == 0)
5619                         return 1;
5620                 if (path->slots[0] == nritems)
5621                         path->slots[0]--;
5622
5623                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5624                 if (found_key.objectid < min_objectid)
5625                         break;
5626                 if (found_key.type == type)
5627                         return 0;
5628                 if (found_key.objectid == min_objectid &&
5629                     found_key.type < type)
5630                         break;
5631         }
5632         return 1;
5633 }
5634
5635 /*
5636  * search in extent tree to find a previous Metadata/Data extent item with
5637  * min objecitd.
5638  *
5639  * returns 0 if something is found, 1 if nothing was found and < 0 on error
5640  */
5641 int btrfs_previous_extent_item(struct btrfs_root *root,
5642                         struct btrfs_path *path, u64 min_objectid)
5643 {
5644         struct btrfs_key found_key;
5645         struct extent_buffer *leaf;
5646         u32 nritems;
5647         int ret;
5648
5649         while (1) {
5650                 if (path->slots[0] == 0) {
5651                         btrfs_set_path_blocking(path);
5652                         ret = btrfs_prev_leaf(root, path);
5653                         if (ret != 0)
5654                                 return ret;
5655                 } else {
5656                         path->slots[0]--;
5657                 }
5658                 leaf = path->nodes[0];
5659                 nritems = btrfs_header_nritems(leaf);
5660                 if (nritems == 0)
5661                         return 1;
5662                 if (path->slots[0] == nritems)
5663                         path->slots[0]--;
5664
5665                 btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]);
5666                 if (found_key.objectid < min_objectid)
5667                         break;
5668                 if (found_key.type == BTRFS_EXTENT_ITEM_KEY ||
5669                     found_key.type == BTRFS_METADATA_ITEM_KEY)
5670                         return 0;
5671                 if (found_key.objectid == min_objectid &&
5672                     found_key.type < BTRFS_EXTENT_ITEM_KEY)
5673                         break;
5674         }
5675         return 1;
5676 }