GNU Linux-libre 5.19-rc6-gnu: fs/btrfs/delayed-ref.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (C) 2009 Oracle.  All rights reserved.
4  */
5
6 #include <linux/sched.h>
7 #include <linux/slab.h>
8 #include <linux/sort.h>
9 #include "ctree.h"
10 #include "delayed-ref.h"
11 #include "transaction.h"
12 #include "qgroup.h"
13 #include "space-info.h"
14 #include "tree-mod-log.h"
15
16 struct kmem_cache *btrfs_delayed_ref_head_cachep;
17 struct kmem_cache *btrfs_delayed_tree_ref_cachep;
18 struct kmem_cache *btrfs_delayed_data_ref_cachep;
19 struct kmem_cache *btrfs_delayed_extent_op_cachep;
20 /*
21  * delayed back reference update tracking.  For subvolume trees
22  * we queue up extent allocations and backref maintenance for
23  * delayed processing.   This avoids deep call chains where we
24  * add extents in the middle of btrfs_search_slot, and it allows
25  * us to buffer up frequently modified backrefs in an rb tree instead
26  * of hammering updates on the extent allocation tree.
27  */
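/*
 * Rough life cycle of a delayed ref (a sketch; the ref runner itself lives
 * outside this file): a caller queues a modification with
 * btrfs_add_delayed_tree_ref() or btrfs_add_delayed_data_ref(), which attaches
 * a ref node to the per-bytenr head in href_root.  When the transaction later
 * runs its delayed refs, a head is picked with btrfs_select_ref_head(), its
 * queued refs are collapsed by btrfs_merge_delayed_refs(), and the surviving
 * modifications are applied to the extent allocation tree.
 */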
28
29 bool btrfs_check_space_for_delayed_refs(struct btrfs_fs_info *fs_info)
30 {
31         struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
32         struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
33         bool ret = false;
34         u64 reserved;
35
36         spin_lock(&global_rsv->lock);
37         reserved = global_rsv->reserved;
38         spin_unlock(&global_rsv->lock);
39
40         /*
41          * The global reserve is just kind of magic, so we don't really want
42          * to rely on it to save our bacon.  If the delayed_refs_rsv's size is
43          * more than what is actually reserved in it plus the global rsv, then
44          * it's time to think about bailing.
45          */
46         spin_lock(&delayed_refs_rsv->lock);
47         reserved += delayed_refs_rsv->reserved;
48         if (delayed_refs_rsv->size >= reserved)
49                 ret = true;
50         spin_unlock(&delayed_refs_rsv->lock);
51         return ret;
52 }
53
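/*
 * Estimate whether the caller should throttle and help run delayed refs.
 *
 * The estimated time to drain the queue is num_entries multiplied by the
 * running average cost of one ref in nanoseconds.  Purely as an illustration
 * (the average is measured at run time, not fixed): with an average cost of
 * ~100us, around 10000 queued entries would cross the NSEC_PER_SEC threshold
 * and return 1, and about half that many would return 2.  Otherwise fall back
 * to checking whether the delayed refs rsv is running low on space.
 */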
54 int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
55 {
56         u64 num_entries =
57                 atomic_read(&trans->transaction->delayed_refs.num_entries);
58         u64 avg_runtime;
59         u64 val;
60
61         smp_mb();
62         avg_runtime = trans->fs_info->avg_delayed_ref_runtime;
63         val = num_entries * avg_runtime;
64         if (val >= NSEC_PER_SEC)
65                 return 1;
66         if (val >= NSEC_PER_SEC / 2)
67                 return 2;
68
69         return btrfs_check_space_for_delayed_refs(trans->fs_info);
70 }
71
72 /**
73  * Release a ref head's reservation
74  *
75  * @fs_info:  the filesystem
76  * @nr:       number of items to drop
77  *
78  * This drops the delayed ref head's count from the delayed refs rsv and frees
79  * any excess reservation we had.
80  */
81 void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
82 {
83         struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
84         u64 num_bytes = btrfs_calc_insert_metadata_size(fs_info, nr);
85         u64 released = 0;
86
87         /*
88          * We have to check the mount option here because we could be enabling
89          * the free space tree for the first time and don't have the compat_ro
90          * option set yet.
91          *
92          * We need extra reservations if we have the free space tree because
93          * we'll have to modify that tree as well.
94          */
95         if (btrfs_test_opt(fs_info, FREE_SPACE_TREE))
96                 num_bytes *= 2;
97
98         released = btrfs_block_rsv_release(fs_info, block_rsv, num_bytes, NULL);
99         if (released)
100                 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
101                                               0, released, 0);
102 }
103
104 /*
105  * btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv
106  * @trans: the trans handle that may have generated delayed refs
107  *
108  * This is to be called anytime we may have adjusted trans->delayed_ref_updates;
109  * it calculates the additional size and adds it to the delayed_refs_rsv.
110  */
111 void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
112 {
113         struct btrfs_fs_info *fs_info = trans->fs_info;
114         struct btrfs_block_rsv *delayed_rsv = &fs_info->delayed_refs_rsv;
115         u64 num_bytes;
116
117         if (!trans->delayed_ref_updates)
118                 return;
119
120         num_bytes = btrfs_calc_insert_metadata_size(fs_info,
121                                                     trans->delayed_ref_updates);
122         /*
123          * We have to check the mount option here because we could be enabling
124          * the free space tree for the first time and don't have the compat_ro
125          * option set yet.
126          *
127          * We need extra reservations if we have the free space tree because
128          * we'll have to modify that tree as well.
129          */
130         if (btrfs_test_opt(fs_info, FREE_SPACE_TREE))
131                 num_bytes *= 2;
132
133         spin_lock(&delayed_rsv->lock);
134         delayed_rsv->size += num_bytes;
135         delayed_rsv->full = 0;
136         spin_unlock(&delayed_rsv->lock);
137         trans->delayed_ref_updates = 0;
138 }
139
140 /**
141  * Transfer bytes to our delayed refs rsv
142  *
143  * @fs_info:   the filesystem
144  * @src:       source block rsv to transfer from
145  * @num_bytes: number of bytes to transfer
146  *
147  * This transfers up to the num_bytes amount from the src rsv to the
148  * delayed_refs_rsv.  Any extra bytes are returned to the space info.
149  */
150 void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
151                                        struct btrfs_block_rsv *src,
152                                        u64 num_bytes)
153 {
154         struct btrfs_block_rsv *delayed_refs_rsv = &fs_info->delayed_refs_rsv;
155         u64 to_free = 0;
156
157         spin_lock(&src->lock);
158         src->reserved -= num_bytes;
159         src->size -= num_bytes;
160         spin_unlock(&src->lock);
161
162         spin_lock(&delayed_refs_rsv->lock);
163         if (delayed_refs_rsv->size > delayed_refs_rsv->reserved) {
164                 u64 delta = delayed_refs_rsv->size -
165                         delayed_refs_rsv->reserved;
166                 if (num_bytes > delta) {
167                         to_free = num_bytes - delta;
168                         num_bytes = delta;
169                 }
170         } else {
171                 to_free = num_bytes;
172                 num_bytes = 0;
173         }
174
175         if (num_bytes)
176                 delayed_refs_rsv->reserved += num_bytes;
177         if (delayed_refs_rsv->reserved >= delayed_refs_rsv->size)
178                 delayed_refs_rsv->full = 1;
179         spin_unlock(&delayed_refs_rsv->lock);
180
181         if (num_bytes)
182                 trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
183                                               0, num_bytes, 1);
184         if (to_free)
185                 btrfs_space_info_free_bytes_may_use(fs_info,
186                                 delayed_refs_rsv->space_info, to_free);
187 }
188
189 /**
190  * Refill based on our delayed refs usage
191  *
192  * @fs_info: the filesystem
193  * @flush:   control how we can flush for this reservation.
194  *
195  * This will refill the delayed block_rsv with up to one item's worth of
196  * space, and will return -ENOSPC if we can't make the reservation.
197  */
198 int btrfs_delayed_refs_rsv_refill(struct btrfs_fs_info *fs_info,
199                                   enum btrfs_reserve_flush_enum flush)
200 {
201         struct btrfs_block_rsv *block_rsv = &fs_info->delayed_refs_rsv;
202         u64 limit = btrfs_calc_insert_metadata_size(fs_info, 1);
203         u64 num_bytes = 0;
204         int ret = -ENOSPC;
205
206         spin_lock(&block_rsv->lock);
207         if (block_rsv->reserved < block_rsv->size) {
208                 num_bytes = block_rsv->size - block_rsv->reserved;
209                 num_bytes = min(num_bytes, limit);
210         }
211         spin_unlock(&block_rsv->lock);
212
213         if (!num_bytes)
214                 return 0;
215
216         ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, num_bytes, flush);
217         if (ret)
218                 return ret;
219         btrfs_block_rsv_add_bytes(block_rsv, num_bytes, 0);
220         trace_btrfs_space_reservation(fs_info, "delayed_refs_rsv",
221                                       0, num_bytes, 1);
222         return 0;
223 }
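/*
 * A sketch of how the refill/release pair above is meant to be used by the
 * delayed ref runner (illustrative only, the real callers live outside this
 * file): top up the rsv before processing and give back one head's worth of
 * reservation once a head has been run:
 *
 *	ret = btrfs_delayed_refs_rsv_refill(fs_info, BTRFS_RESERVE_FLUSH_ALL);
 *	...
 *	btrfs_delayed_refs_rsv_release(fs_info, 1);
 */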
224
225 /*
226  * compare two delayed tree backrefs with the same bytenr and type
227  */
228 static int comp_tree_refs(struct btrfs_delayed_tree_ref *ref1,
229                           struct btrfs_delayed_tree_ref *ref2)
230 {
231         if (ref1->node.type == BTRFS_TREE_BLOCK_REF_KEY) {
232                 if (ref1->root < ref2->root)
233                         return -1;
234                 if (ref1->root > ref2->root)
235                         return 1;
236         } else {
237                 if (ref1->parent < ref2->parent)
238                         return -1;
239                 if (ref1->parent > ref2->parent)
240                         return 1;
241         }
242         return 0;
243 }
244
245 /*
246  * compare two delayed data backrefs with the same bytenr and type
247  */
248 static int comp_data_refs(struct btrfs_delayed_data_ref *ref1,
249                           struct btrfs_delayed_data_ref *ref2)
250 {
251         if (ref1->node.type == BTRFS_EXTENT_DATA_REF_KEY) {
252                 if (ref1->root < ref2->root)
253                         return -1;
254                 if (ref1->root > ref2->root)
255                         return 1;
256                 if (ref1->objectid < ref2->objectid)
257                         return -1;
258                 if (ref1->objectid > ref2->objectid)
259                         return 1;
260                 if (ref1->offset < ref2->offset)
261                         return -1;
262                 if (ref1->offset > ref2->offset)
263                         return 1;
264         } else {
265                 if (ref1->parent < ref2->parent)
266                         return -1;
267                 if (ref1->parent > ref2->parent)
268                         return 1;
269         }
270         return 0;
271 }
272
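/*
 * Full comparator for the ref nodes hanging off one head.  The sort key is,
 * in order: the ref type, then the type specific fields compared above (root
 * for keyed refs, parent for shared refs, plus objectid/offset for keyed data
 * refs) and, when check_seq is set, finally the seq number.  For example, two
 * BTRFS_TREE_BLOCK_REF_KEY refs for the same root but different seq numbers
 * sort by seq.  This is the ordering used by the per-head ref_tree.
 */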
273 static int comp_refs(struct btrfs_delayed_ref_node *ref1,
274                      struct btrfs_delayed_ref_node *ref2,
275                      bool check_seq)
276 {
277         int ret = 0;
278
279         if (ref1->type < ref2->type)
280                 return -1;
281         if (ref1->type > ref2->type)
282                 return 1;
283         if (ref1->type == BTRFS_TREE_BLOCK_REF_KEY ||
284             ref1->type == BTRFS_SHARED_BLOCK_REF_KEY)
285                 ret = comp_tree_refs(btrfs_delayed_node_to_tree_ref(ref1),
286                                      btrfs_delayed_node_to_tree_ref(ref2));
287         else
288                 ret = comp_data_refs(btrfs_delayed_node_to_data_ref(ref1),
289                                      btrfs_delayed_node_to_data_ref(ref2));
290         if (ret)
291                 return ret;
292         if (check_seq) {
293                 if (ref1->seq < ref2->seq)
294                         return -1;
295                 if (ref1->seq > ref2->seq)
296                         return 1;
297         }
298         return 0;
299 }
300
301 /* insert a new ref head into the head ref rbtree */
302 static struct btrfs_delayed_ref_head *htree_insert(struct rb_root_cached *root,
303                                                    struct rb_node *node)
304 {
305         struct rb_node **p = &root->rb_root.rb_node;
306         struct rb_node *parent_node = NULL;
307         struct btrfs_delayed_ref_head *entry;
308         struct btrfs_delayed_ref_head *ins;
309         u64 bytenr;
310         bool leftmost = true;
311
312         ins = rb_entry(node, struct btrfs_delayed_ref_head, href_node);
313         bytenr = ins->bytenr;
314         while (*p) {
315                 parent_node = *p;
316                 entry = rb_entry(parent_node, struct btrfs_delayed_ref_head,
317                                  href_node);
318
319                 if (bytenr < entry->bytenr) {
320                         p = &(*p)->rb_left;
321                 } else if (bytenr > entry->bytenr) {
322                         p = &(*p)->rb_right;
323                         leftmost = false;
324                 } else {
325                         return entry;
326                 }
327         }
328
329         rb_link_node(node, parent_node, p);
330         rb_insert_color_cached(node, root, leftmost);
331         return NULL;
332 }
333
334 static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root,
335                 struct btrfs_delayed_ref_node *ins)
336 {
337         struct rb_node **p = &root->rb_root.rb_node;
338         struct rb_node *node = &ins->ref_node;
339         struct rb_node *parent_node = NULL;
340         struct btrfs_delayed_ref_node *entry;
341         bool leftmost = true;
342
343         while (*p) {
344                 int comp;
345
346                 parent_node = *p;
347                 entry = rb_entry(parent_node, struct btrfs_delayed_ref_node,
348                                  ref_node);
349                 comp = comp_refs(ins, entry, true);
350                 if (comp < 0) {
351                         p = &(*p)->rb_left;
352                 } else if (comp > 0) {
353                         p = &(*p)->rb_right;
354                         leftmost = false;
355                 } else {
356                         return entry;
357                 }
358         }
359
360         rb_link_node(node, parent_node, p);
361         rb_insert_color_cached(node, root, leftmost);
362         return NULL;
363 }
364
365 static struct btrfs_delayed_ref_head *find_first_ref_head(
366                 struct btrfs_delayed_ref_root *dr)
367 {
368         struct rb_node *n;
369         struct btrfs_delayed_ref_head *entry;
370
371         n = rb_first_cached(&dr->href_root);
372         if (!n)
373                 return NULL;
374
375         entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
376
377         return entry;
378 }
379
380 /*
381  * Find a head entry based on bytenr. This returns the delayed ref head if it
382  * was able to find one, or NULL if nothing was in that spot.  If return_bigger
383  * is given, the next bigger entry is returned if no exact match is found.
384  */
385 static struct btrfs_delayed_ref_head *find_ref_head(
386                 struct btrfs_delayed_ref_root *dr, u64 bytenr,
387                 bool return_bigger)
388 {
389         struct rb_root *root = &dr->href_root.rb_root;
390         struct rb_node *n;
391         struct btrfs_delayed_ref_head *entry;
392
393         n = root->rb_node;
394         entry = NULL;
395         while (n) {
396                 entry = rb_entry(n, struct btrfs_delayed_ref_head, href_node);
397
398                 if (bytenr < entry->bytenr)
399                         n = n->rb_left;
400                 else if (bytenr > entry->bytenr)
401                         n = n->rb_right;
402                 else
403                         return entry;
404         }
405         if (entry && return_bigger) {
406                 if (bytenr > entry->bytenr) {
407                         n = rb_next(&entry->href_node);
408                         if (!n)
409                                 return NULL;
410                         entry = rb_entry(n, struct btrfs_delayed_ref_head,
411                                          href_node);
412                 }
413                 return entry;
414         }
415         return NULL;
416 }
417
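/*
 * Try to take a ref head's mutex while holding delayed_refs->lock.
 *
 * If the mutex is contended, the spinlock is dropped while we sleep on the
 * mutex and re-taken afterwards.  Returns 0 with the mutex held, or -EAGAIN
 * if the head was run and removed from href_root in the meantime, in which
 * case the caller needs to look the head up again.
 */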
418 int btrfs_delayed_ref_lock(struct btrfs_delayed_ref_root *delayed_refs,
419                            struct btrfs_delayed_ref_head *head)
420 {
421         lockdep_assert_held(&delayed_refs->lock);
422         if (mutex_trylock(&head->mutex))
423                 return 0;
424
425         refcount_inc(&head->refs);
426         spin_unlock(&delayed_refs->lock);
427
428         mutex_lock(&head->mutex);
429         spin_lock(&delayed_refs->lock);
430         if (RB_EMPTY_NODE(&head->href_node)) {
431                 mutex_unlock(&head->mutex);
432                 btrfs_put_delayed_ref_head(head);
433                 return -EAGAIN;
434         }
435         btrfs_put_delayed_ref_head(head);
436         return 0;
437 }
438
439 static inline void drop_delayed_ref(struct btrfs_trans_handle *trans,
440                                     struct btrfs_delayed_ref_root *delayed_refs,
441                                     struct btrfs_delayed_ref_head *head,
442                                     struct btrfs_delayed_ref_node *ref)
443 {
444         lockdep_assert_held(&head->lock);
445         rb_erase_cached(&ref->ref_node, &head->ref_tree);
446         RB_CLEAR_NODE(&ref->ref_node);
447         if (!list_empty(&ref->add_list))
448                 list_del(&ref->add_list);
449         ref->in_tree = 0;
450         btrfs_put_delayed_ref(ref);
451         atomic_dec(&delayed_refs->num_entries);
452 }
453
454 static bool merge_ref(struct btrfs_trans_handle *trans,
455                       struct btrfs_delayed_ref_root *delayed_refs,
456                       struct btrfs_delayed_ref_head *head,
457                       struct btrfs_delayed_ref_node *ref,
458                       u64 seq)
459 {
460         struct btrfs_delayed_ref_node *next;
461         struct rb_node *node = rb_next(&ref->ref_node);
462         bool done = false;
463
464         while (!done && node) {
465                 int mod;
466
467                 next = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
468                 node = rb_next(node);
469                 if (seq && next->seq >= seq)
470                         break;
471                 if (comp_refs(ref, next, false))
472                         break;
473
474                 if (ref->action == next->action) {
475                         mod = next->ref_mod;
476                 } else {
477                         if (ref->ref_mod < next->ref_mod) {
478                                 swap(ref, next);
479                                 done = true;
480                         }
481                         mod = -next->ref_mod;
482                 }
483
484                 drop_delayed_ref(trans, delayed_refs, head, next);
485                 ref->ref_mod += mod;
486                 if (ref->ref_mod == 0) {
487                         drop_delayed_ref(trans, delayed_refs, head, ref);
488                         done = true;
489                 } else {
490                         /*
491                          * Can't have multiples of the same ref on a tree block.
492                          */
493                         WARN_ON(ref->type == BTRFS_TREE_BLOCK_REF_KEY ||
494                                 ref->type == BTRFS_SHARED_BLOCK_REF_KEY);
495                 }
496         }
497
498         return done;
499 }
500
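/*
 * Merge the refs queued on one head so that as little work as possible is
 * left for when the head is actually run.  For example, an ADD and a DROP for
 * the same tree block and root cancel out and both nodes are dropped, while
 * two ADDs for the same data ref collapse into a single node with ref_mod 2.
 * Refs with a seq at or above the lowest active tree mod log seq are left
 * alone so that backref walking keeps seeing a consistent history.
 */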
501 void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans,
502                               struct btrfs_delayed_ref_root *delayed_refs,
503                               struct btrfs_delayed_ref_head *head)
504 {
505         struct btrfs_fs_info *fs_info = trans->fs_info;
506         struct btrfs_delayed_ref_node *ref;
507         struct rb_node *node;
508         u64 seq = 0;
509
510         lockdep_assert_held(&head->lock);
511
512         if (RB_EMPTY_ROOT(&head->ref_tree.rb_root))
513                 return;
514
515         /* We don't have too many refs to merge for data. */
516         if (head->is_data)
517                 return;
518
519         seq = btrfs_tree_mod_log_lowest_seq(fs_info);
520 again:
521         for (node = rb_first_cached(&head->ref_tree); node;
522              node = rb_next(node)) {
523                 ref = rb_entry(node, struct btrfs_delayed_ref_node, ref_node);
524                 if (seq && ref->seq >= seq)
525                         continue;
526                 if (merge_ref(trans, delayed_refs, head, ref, seq))
527                         goto again;
528         }
529 }
530
531 int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq)
532 {
533         int ret = 0;
534         u64 min_seq = btrfs_tree_mod_log_lowest_seq(fs_info);
535
536         if (min_seq != 0 && seq >= min_seq) {
537                 btrfs_debug(fs_info,
538                             "holding back delayed_ref %llu, lowest is %llu",
539                             seq, min_seq);
540                 ret = 1;
541         }
542
543         return ret;
544 }
545
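/*
 * Pick the next ref head to run.  Selection starts at run_delayed_start and
 * wraps back to the beginning of href_root once the end is reached, so heads
 * get processed roughly round robin by bytenr.  Heads already marked as
 * processing are skipped.  The returned head has ->processing set and
 * run_delayed_start advanced past it; NULL is returned when nothing is left.
 */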
546 struct btrfs_delayed_ref_head *btrfs_select_ref_head(
547                 struct btrfs_delayed_ref_root *delayed_refs)
548 {
549         struct btrfs_delayed_ref_head *head;
550
551 again:
552         head = find_ref_head(delayed_refs, delayed_refs->run_delayed_start,
553                              true);
554         if (!head && delayed_refs->run_delayed_start != 0) {
555                 delayed_refs->run_delayed_start = 0;
556                 head = find_first_ref_head(delayed_refs);
557         }
558         if (!head)
559                 return NULL;
560
561         while (head->processing) {
562                 struct rb_node *node;
563
564                 node = rb_next(&head->href_node);
565                 if (!node) {
566                         if (delayed_refs->run_delayed_start == 0)
567                                 return NULL;
568                         delayed_refs->run_delayed_start = 0;
569                         goto again;
570                 }
571                 head = rb_entry(node, struct btrfs_delayed_ref_head,
572                                 href_node);
573         }
574
575         head->processing = 1;
576         WARN_ON(delayed_refs->num_heads_ready == 0);
577         delayed_refs->num_heads_ready--;
578         delayed_refs->run_delayed_start = head->bytenr +
579                 head->num_bytes;
580         return head;
581 }
582
583 void btrfs_delete_ref_head(struct btrfs_delayed_ref_root *delayed_refs,
584                            struct btrfs_delayed_ref_head *head)
585 {
586         lockdep_assert_held(&delayed_refs->lock);
587         lockdep_assert_held(&head->lock);
588
589         rb_erase_cached(&head->href_node, &delayed_refs->href_root);
590         RB_CLEAR_NODE(&head->href_node);
591         atomic_dec(&delayed_refs->num_entries);
592         delayed_refs->num_heads--;
593         if (head->processing == 0)
594                 delayed_refs->num_heads_ready--;
595 }
596
597 /*
598  * Helper to insert a ref_node into the head ref's rbtree, or to merge it
599  * with an existing node for the same ref.
600  * Return 0 for insert.
601  * Return >0 for merge.
602  */
603 static int insert_delayed_ref(struct btrfs_trans_handle *trans,
604                               struct btrfs_delayed_ref_root *root,
605                               struct btrfs_delayed_ref_head *href,
606                               struct btrfs_delayed_ref_node *ref)
607 {
608         struct btrfs_delayed_ref_node *exist;
609         int mod;
610         int ret = 0;
611
612         spin_lock(&href->lock);
613         exist = tree_insert(&href->ref_tree, ref);
614         if (!exist)
615                 goto inserted;
616
617         /* Now we are sure we can merge */
618         ret = 1;
619         if (exist->action == ref->action) {
620                 mod = ref->ref_mod;
621         } else {
622                 /* Need to change action */
623                 if (exist->ref_mod < ref->ref_mod) {
624                         exist->action = ref->action;
625                         mod = -exist->ref_mod;
626                         exist->ref_mod = ref->ref_mod;
627                         if (ref->action == BTRFS_ADD_DELAYED_REF)
628                                 list_add_tail(&exist->add_list,
629                                               &href->ref_add_list);
630                         else if (ref->action == BTRFS_DROP_DELAYED_REF) {
631                                 ASSERT(!list_empty(&exist->add_list));
632                                 list_del(&exist->add_list);
633                         } else {
634                                 ASSERT(0);
635                         }
636                 } else
637                         mod = -ref->ref_mod;
638         }
639         exist->ref_mod += mod;
640
641         /* remove the existing ref if its ref_mod is zero */
642         if (exist->ref_mod == 0)
643                 drop_delayed_ref(trans, root, href, exist);
644         spin_unlock(&href->lock);
645         return ret;
646 inserted:
647         if (ref->action == BTRFS_ADD_DELAYED_REF)
648                 list_add_tail(&ref->add_list, &href->ref_add_list);
649         atomic_inc(&root->num_entries);
650         spin_unlock(&href->lock);
651         return ret;
652 }
653
654 /*
655  * Helper function to update the accounting in the head ref.
656  * The existing and update heads must have the same bytenr.
657  */
658 static noinline void update_existing_head_ref(struct btrfs_trans_handle *trans,
659                          struct btrfs_delayed_ref_head *existing,
660                          struct btrfs_delayed_ref_head *update)
661 {
662         struct btrfs_delayed_ref_root *delayed_refs =
663                 &trans->transaction->delayed_refs;
664         struct btrfs_fs_info *fs_info = trans->fs_info;
665         int old_ref_mod;
666
667         BUG_ON(existing->is_data != update->is_data);
668
669         spin_lock(&existing->lock);
670         if (update->must_insert_reserved) {
671                 /* if the extent was freed and then
672                  * reallocated before the delayed ref
673                  * entries were processed, we can end up
674                  * with an existing head ref without
675                  * the must_insert_reserved flag set.
676                  * Set it again here
677                  */
678                 existing->must_insert_reserved = update->must_insert_reserved;
679
680                 /*
681                  * update the num_bytes so we make sure the accounting
682                  * is done correctly
683                  */
684                 existing->num_bytes = update->num_bytes;
685
686         }
687
688         if (update->extent_op) {
689                 if (!existing->extent_op) {
690                         existing->extent_op = update->extent_op;
691                 } else {
692                         if (update->extent_op->update_key) {
693                                 memcpy(&existing->extent_op->key,
694                                        &update->extent_op->key,
695                                        sizeof(update->extent_op->key));
696                                 existing->extent_op->update_key = true;
697                         }
698                         if (update->extent_op->update_flags) {
699                                 existing->extent_op->flags_to_set |=
700                                         update->extent_op->flags_to_set;
701                                 existing->extent_op->update_flags = true;
702                         }
703                         btrfs_free_delayed_extent_op(update->extent_op);
704                 }
705         }
706         /*
707          * Update the reference mod on the head to reflect this new operation.
708          * We only need the lock here because the existing head could be under
709          * processing right now; refs we have only just added can't race with us.
710          */
711         old_ref_mod = existing->total_ref_mod;
712         existing->ref_mod += update->ref_mod;
713         existing->total_ref_mod += update->ref_mod;
714
715         /*
716          * If we are going from a positive ref mod to a negative one, or vice
717          * versa, we need to make sure to adjust pending_csums accordingly.
718          */
719         if (existing->is_data) {
720                 u64 csum_leaves =
721                         btrfs_csum_bytes_to_leaves(fs_info,
722                                                    existing->num_bytes);
723
724                 if (existing->total_ref_mod >= 0 && old_ref_mod < 0) {
725                         delayed_refs->pending_csums -= existing->num_bytes;
726                         btrfs_delayed_refs_rsv_release(fs_info, csum_leaves);
727                 }
728                 if (existing->total_ref_mod < 0 && old_ref_mod >= 0) {
729                         delayed_refs->pending_csums += existing->num_bytes;
730                         trans->delayed_ref_updates += csum_leaves;
731                 }
732         }
733
734         spin_unlock(&existing->lock);
735 }
736
737 static void init_delayed_ref_head(struct btrfs_delayed_ref_head *head_ref,
738                                   struct btrfs_qgroup_extent_record *qrecord,
739                                   u64 bytenr, u64 num_bytes, u64 ref_root,
740                                   u64 reserved, int action, bool is_data,
741                                   bool is_system)
742 {
743         int count_mod = 1;
744         int must_insert_reserved = 0;
745
746         /* If reserved is provided, it must be a data extent. */
747         BUG_ON(!is_data && reserved);
748
749         /*
750          * The head node stores the sum of all the mods, so dropping a ref
751          * should drop the sum in the head node by one.
752          */
753         if (action == BTRFS_UPDATE_DELAYED_HEAD)
754                 count_mod = 0;
755         else if (action == BTRFS_DROP_DELAYED_REF)
756                 count_mod = -1;
757
758         /*
759          * BTRFS_ADD_DELAYED_EXTENT means that we need to update the reserved
760          * accounting when the extent is finally added, or if a later
761          * modification deletes the delayed ref without ever inserting the
762          * extent into the extent allocation tree.  ref->must_insert_reserved
763          * is the flag used to record that accounting mods are required.
764          *
765          * Once we record must_insert_reserved, switch the action to
766          * BTRFS_ADD_DELAYED_REF because other special casing is not required.
767          */
768         if (action == BTRFS_ADD_DELAYED_EXTENT)
769                 must_insert_reserved = 1;
770         else
771                 must_insert_reserved = 0;
772
773         refcount_set(&head_ref->refs, 1);
774         head_ref->bytenr = bytenr;
775         head_ref->num_bytes = num_bytes;
776         head_ref->ref_mod = count_mod;
777         head_ref->must_insert_reserved = must_insert_reserved;
778         head_ref->is_data = is_data;
779         head_ref->is_system = is_system;
780         head_ref->ref_tree = RB_ROOT_CACHED;
781         INIT_LIST_HEAD(&head_ref->ref_add_list);
782         RB_CLEAR_NODE(&head_ref->href_node);
783         head_ref->processing = 0;
784         head_ref->total_ref_mod = count_mod;
785         spin_lock_init(&head_ref->lock);
786         mutex_init(&head_ref->mutex);
787
788         if (qrecord) {
789                 if (ref_root && reserved) {
790                         qrecord->data_rsv = reserved;
791                         qrecord->data_rsv_refroot = ref_root;
792                 }
793                 qrecord->bytenr = bytenr;
794                 qrecord->num_bytes = num_bytes;
795                 qrecord->old_roots = NULL;
796         }
797 }
798
799 /*
800  * helper function to actually insert a head node into the rbtree.
801  * this does all the dirty work in terms of maintaining the correct
802  * overall modification count.
803  */
804 static noinline struct btrfs_delayed_ref_head *
805 add_delayed_ref_head(struct btrfs_trans_handle *trans,
806                      struct btrfs_delayed_ref_head *head_ref,
807                      struct btrfs_qgroup_extent_record *qrecord,
808                      int action, int *qrecord_inserted_ret)
809 {
810         struct btrfs_delayed_ref_head *existing;
811         struct btrfs_delayed_ref_root *delayed_refs;
812         int qrecord_inserted = 0;
813
814         delayed_refs = &trans->transaction->delayed_refs;
815
816         /* Record qgroup extent info if provided */
817         if (qrecord) {
818                 if (btrfs_qgroup_trace_extent_nolock(trans->fs_info,
819                                         delayed_refs, qrecord))
820                         kfree(qrecord);
821                 else
822                         qrecord_inserted = 1;
823         }
824
825         trace_add_delayed_ref_head(trans->fs_info, head_ref, action);
826
827         existing = htree_insert(&delayed_refs->href_root,
828                                 &head_ref->href_node);
829         if (existing) {
830                 update_existing_head_ref(trans, existing, head_ref);
831                 /*
832                  * we've updated the existing ref, free the newly
833                  * allocated ref
834                  */
835                 kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
836                 head_ref = existing;
837         } else {
838                 if (head_ref->is_data && head_ref->ref_mod < 0) {
839                         delayed_refs->pending_csums += head_ref->num_bytes;
840                         trans->delayed_ref_updates +=
841                                 btrfs_csum_bytes_to_leaves(trans->fs_info,
842                                                            head_ref->num_bytes);
843                 }
844                 delayed_refs->num_heads++;
845                 delayed_refs->num_heads_ready++;
846                 atomic_inc(&delayed_refs->num_entries);
847                 trans->delayed_ref_updates++;
848         }
849         if (qrecord_inserted_ret)
850                 *qrecord_inserted_ret = qrecord_inserted;
851
852         return head_ref;
853 }
854
855 /*
856  * init_delayed_ref_common - Initialize the structure which represents a
857  *                           modification to an extent.
858  *
859  * @fs_info:    The fs_info structure of the mounted filesystem.
860  *
861  * @ref:        The structure which is going to be initialized.
862  *
863  * @bytenr:     The logical address of the extent for which a modification is
864  *              going to be recorded.
865  *
866  * @num_bytes:  Size of the extent whose modification is being recorded.
867  *
868  * @ref_root:   The id of the root where this modification has originated; this
869  *              can be either one of the well-known metadata trees or the
870  *              subvolume id which references this extent.
871  *
872  * @action:     Can be one of BTRFS_ADD_DELAYED_REF/BTRFS_DROP_DELAYED_REF or
873  *              BTRFS_ADD_DELAYED_EXTENT
874  *
875  * @ref_type:   Holds the type of the extent which is being recorded, can be
876  *              one of BTRFS_SHARED_BLOCK_REF_KEY/BTRFS_TREE_BLOCK_REF_KEY
877  *              when recording a metadata extent or BTRFS_SHARED_DATA_REF_KEY/
878  *              BTRFS_EXTENT_DATA_REF_KEY when recording a data extent
879  */
880 static void init_delayed_ref_common(struct btrfs_fs_info *fs_info,
881                                     struct btrfs_delayed_ref_node *ref,
882                                     u64 bytenr, u64 num_bytes, u64 ref_root,
883                                     int action, u8 ref_type)
884 {
885         u64 seq = 0;
886
887         if (action == BTRFS_ADD_DELAYED_EXTENT)
888                 action = BTRFS_ADD_DELAYED_REF;
889
890         if (is_fstree(ref_root))
891                 seq = atomic64_read(&fs_info->tree_mod_seq);
892
893         refcount_set(&ref->refs, 1);
894         ref->bytenr = bytenr;
895         ref->num_bytes = num_bytes;
896         ref->ref_mod = 1;
897         ref->action = action;
898         ref->is_head = 0;
899         ref->in_tree = 1;
900         ref->seq = seq;
901         ref->type = ref_type;
902         RB_CLEAR_NODE(&ref->ref_node);
903         INIT_LIST_HEAD(&ref->add_list);
904 }
905
906 /*
907  * add a delayed tree ref.  This does all of the accounting required
908  * to make sure the delayed ref is eventually processed before this
909  * transaction commits.
910  */
911 int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
912                                struct btrfs_ref *generic_ref,
913                                struct btrfs_delayed_extent_op *extent_op)
914 {
915         struct btrfs_fs_info *fs_info = trans->fs_info;
916         struct btrfs_delayed_tree_ref *ref;
917         struct btrfs_delayed_ref_head *head_ref;
918         struct btrfs_delayed_ref_root *delayed_refs;
919         struct btrfs_qgroup_extent_record *record = NULL;
920         int qrecord_inserted;
921         bool is_system;
922         int action = generic_ref->action;
923         int level = generic_ref->tree_ref.level;
924         int ret;
925         u64 bytenr = generic_ref->bytenr;
926         u64 num_bytes = generic_ref->len;
927         u64 parent = generic_ref->parent;
928         u8 ref_type;
929
930         is_system = (generic_ref->tree_ref.owning_root == BTRFS_CHUNK_TREE_OBJECTID);
931
932         ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action);
933         ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
934         if (!ref)
935                 return -ENOMEM;
936
937         head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
938         if (!head_ref) {
939                 kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
940                 return -ENOMEM;
941         }
942
943         if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
944             !generic_ref->skip_qgroup) {
945                 record = kzalloc(sizeof(*record), GFP_NOFS);
946                 if (!record) {
947                         kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
948                         kmem_cache_free(btrfs_delayed_ref_head_cachep, head_ref);
949                         return -ENOMEM;
950                 }
951         }
952
953         if (parent)
954                 ref_type = BTRFS_SHARED_BLOCK_REF_KEY;
955         else
956                 ref_type = BTRFS_TREE_BLOCK_REF_KEY;
957
958         init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
959                                 generic_ref->tree_ref.owning_root, action,
960                                 ref_type);
961         ref->root = generic_ref->tree_ref.owning_root;
962         ref->parent = parent;
963         ref->level = level;
964
965         init_delayed_ref_head(head_ref, record, bytenr, num_bytes,
966                               generic_ref->tree_ref.owning_root, 0, action,
967                               false, is_system);
968         head_ref->extent_op = extent_op;
969
970         delayed_refs = &trans->transaction->delayed_refs;
971         spin_lock(&delayed_refs->lock);
972
973         /*
974          * insert both the head node and the new ref without dropping
975          * the spin lock
976          */
977         head_ref = add_delayed_ref_head(trans, head_ref, record,
978                                         action, &qrecord_inserted);
979
980         ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
981         spin_unlock(&delayed_refs->lock);
982
983         /*
984          * Need to update the delayed_refs_rsv with any changes we may have
985          * made.
986          */
987         btrfs_update_delayed_refs_rsv(trans);
988
989         trace_add_delayed_tree_ref(fs_info, &ref->node, ref,
990                                    action == BTRFS_ADD_DELAYED_EXTENT ?
991                                    BTRFS_ADD_DELAYED_REF : action);
992         if (ret > 0)
993                 kmem_cache_free(btrfs_delayed_tree_ref_cachep, ref);
994
995         if (qrecord_inserted)
996                 btrfs_qgroup_trace_extent_post(trans, record);
997
998         return 0;
999 }
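/*
 * A minimal sketch of a caller (illustrative only, the extent buffer and root
 * below are hypothetical and real callers use the btrfs_init_generic_ref() /
 * btrfs_init_tree_ref() helpers rather than open coding the structure):
 *
 *	struct btrfs_ref ref = {
 *		.type = BTRFS_REF_METADATA,
 *		.action = BTRFS_ADD_DELAYED_REF,
 *		.bytenr = eb->start,
 *		.len = eb->len,
 *		.tree_ref.level = btrfs_header_level(eb),
 *		.tree_ref.owning_root = root->root_key.objectid,
 *	};
 *
 *	ret = btrfs_add_delayed_tree_ref(trans, &ref, NULL);
 */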
1000
1001 /*
1002  * add a delayed data ref. it's similar to btrfs_add_delayed_tree_ref.
1003  */
1004 int btrfs_add_delayed_data_ref(struct btrfs_trans_handle *trans,
1005                                struct btrfs_ref *generic_ref,
1006                                u64 reserved)
1007 {
1008         struct btrfs_fs_info *fs_info = trans->fs_info;
1009         struct btrfs_delayed_data_ref *ref;
1010         struct btrfs_delayed_ref_head *head_ref;
1011         struct btrfs_delayed_ref_root *delayed_refs;
1012         struct btrfs_qgroup_extent_record *record = NULL;
1013         int qrecord_inserted;
1014         int action = generic_ref->action;
1015         int ret;
1016         u64 bytenr = generic_ref->bytenr;
1017         u64 num_bytes = generic_ref->len;
1018         u64 parent = generic_ref->parent;
1019         u64 ref_root = generic_ref->data_ref.owning_root;
1020         u64 owner = generic_ref->data_ref.ino;
1021         u64 offset = generic_ref->data_ref.offset;
1022         u8 ref_type;
1023
1024         ASSERT(generic_ref->type == BTRFS_REF_DATA && action);
1025         ref = kmem_cache_alloc(btrfs_delayed_data_ref_cachep, GFP_NOFS);
1026         if (!ref)
1027                 return -ENOMEM;
1028
1029         if (parent)
1030                 ref_type = BTRFS_SHARED_DATA_REF_KEY;
1031         else
1032                 ref_type = BTRFS_EXTENT_DATA_REF_KEY;
1033         init_delayed_ref_common(fs_info, &ref->node, bytenr, num_bytes,
1034                                 ref_root, action, ref_type);
1035         ref->root = ref_root;
1036         ref->parent = parent;
1037         ref->objectid = owner;
1038         ref->offset = offset;
1039
1040
1041         head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
1042         if (!head_ref) {
1043                 kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
1044                 return -ENOMEM;
1045         }
1046
1047         if (test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags) &&
1048             !generic_ref->skip_qgroup) {
1049                 record = kzalloc(sizeof(*record), GFP_NOFS);
1050                 if (!record) {
1051                         kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
1052                         kmem_cache_free(btrfs_delayed_ref_head_cachep,
1053                                         head_ref);
1054                         return -ENOMEM;
1055                 }
1056         }
1057
1058         init_delayed_ref_head(head_ref, record, bytenr, num_bytes, ref_root,
1059                               reserved, action, true, false);
1060         head_ref->extent_op = NULL;
1061
1062         delayed_refs = &trans->transaction->delayed_refs;
1063         spin_lock(&delayed_refs->lock);
1064
1065         /*
1066          * insert both the head node and the new ref without dropping
1067          * the spin lock
1068          */
1069         head_ref = add_delayed_ref_head(trans, head_ref, record,
1070                                         action, &qrecord_inserted);
1071
1072         ret = insert_delayed_ref(trans, delayed_refs, head_ref, &ref->node);
1073         spin_unlock(&delayed_refs->lock);
1074
1075         /*
1076          * Need to update the delayed_refs_rsv with any changes we may have
1077          * made.
1078          */
1079         btrfs_update_delayed_refs_rsv(trans);
1080
1081         trace_add_delayed_data_ref(trans->fs_info, &ref->node, ref,
1082                                    action == BTRFS_ADD_DELAYED_EXTENT ?
1083                                    BTRFS_ADD_DELAYED_REF : action);
1084         if (ret > 0)
1085                 kmem_cache_free(btrfs_delayed_data_ref_cachep, ref);
1086
1087
1088         if (qrecord_inserted)
1089                 return btrfs_qgroup_trace_extent_post(trans, record);
1090         return 0;
1091 }
1092
1093 int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
1094                                 u64 bytenr, u64 num_bytes,
1095                                 struct btrfs_delayed_extent_op *extent_op)
1096 {
1097         struct btrfs_delayed_ref_head *head_ref;
1098         struct btrfs_delayed_ref_root *delayed_refs;
1099
1100         head_ref = kmem_cache_alloc(btrfs_delayed_ref_head_cachep, GFP_NOFS);
1101         if (!head_ref)
1102                 return -ENOMEM;
1103
1104         init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0,
1105                               BTRFS_UPDATE_DELAYED_HEAD, false, false);
1106         head_ref->extent_op = extent_op;
1107
1108         delayed_refs = &trans->transaction->delayed_refs;
1109         spin_lock(&delayed_refs->lock);
1110
1111         add_delayed_ref_head(trans, head_ref, NULL, BTRFS_UPDATE_DELAYED_HEAD,
1112                              NULL);
1113
1114         spin_unlock(&delayed_refs->lock);
1115
1116         /*
1117          * Need to update the delayed_refs_rsv with any changes we may have
1118          * made.
1119          */
1120         btrfs_update_delayed_refs_rsv(trans);
1121         return 0;
1122 }
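/*
 * Illustrative use (a sketch, error handling elided, not lifted from a real
 * caller): to delay a flag update on an extent item, allocate an op, fill it
 * in and queue it against the extent's head:
 *
 *	struct btrfs_delayed_extent_op *op;
 *
 *	op = btrfs_alloc_delayed_extent_op();
 *	if (op) {
 *		op->flags_to_set = BTRFS_BLOCK_FLAG_FULL_BACKREF;
 *		op->update_flags = true;
 *		op->update_key = false;
 *		ret = btrfs_add_delayed_extent_op(trans, bytenr, num_bytes, op);
 *	}
 */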
1123
1124 /*
1125  * This does a simple search for the head node for a given extent.  Returns the
1126  * head node if found, or NULL if not.
1127  */
1128 struct btrfs_delayed_ref_head *
1129 btrfs_find_delayed_ref_head(struct btrfs_delayed_ref_root *delayed_refs, u64 bytenr)
1130 {
1131         lockdep_assert_held(&delayed_refs->lock);
1132
1133         return find_ref_head(delayed_refs, bytenr, false);
1134 }
1135
1136 void __cold btrfs_delayed_ref_exit(void)
1137 {
1138         kmem_cache_destroy(btrfs_delayed_ref_head_cachep);
1139         kmem_cache_destroy(btrfs_delayed_tree_ref_cachep);
1140         kmem_cache_destroy(btrfs_delayed_data_ref_cachep);
1141         kmem_cache_destroy(btrfs_delayed_extent_op_cachep);
1142 }
1143
1144 int __init btrfs_delayed_ref_init(void)
1145 {
1146         btrfs_delayed_ref_head_cachep = kmem_cache_create(
1147                                 "btrfs_delayed_ref_head",
1148                                 sizeof(struct btrfs_delayed_ref_head), 0,
1149                                 SLAB_MEM_SPREAD, NULL);
1150         if (!btrfs_delayed_ref_head_cachep)
1151                 goto fail;
1152
1153         btrfs_delayed_tree_ref_cachep = kmem_cache_create(
1154                                 "btrfs_delayed_tree_ref",
1155                                 sizeof(struct btrfs_delayed_tree_ref), 0,
1156                                 SLAB_MEM_SPREAD, NULL);
1157         if (!btrfs_delayed_tree_ref_cachep)
1158                 goto fail;
1159
1160         btrfs_delayed_data_ref_cachep = kmem_cache_create(
1161                                 "btrfs_delayed_data_ref",
1162                                 sizeof(struct btrfs_delayed_data_ref), 0,
1163                                 SLAB_MEM_SPREAD, NULL);
1164         if (!btrfs_delayed_data_ref_cachep)
1165                 goto fail;
1166
1167         btrfs_delayed_extent_op_cachep = kmem_cache_create(
1168                                 "btrfs_delayed_extent_op",
1169                                 sizeof(struct btrfs_delayed_extent_op), 0,
1170                                 SLAB_MEM_SPREAD, NULL);
1171         if (!btrfs_delayed_extent_op_cachep)
1172                 goto fail;
1173
1174         return 0;
1175 fail:
1176         btrfs_delayed_ref_exit();
1177         return -ENOMEM;
1178 }