// SPDX-License-Identifier: GPL-2.0

#include "messages.h"
#include "tree-mod-log.h"
#include "disk-io.h"
#include "fs.h"
#include "accessors.h"
#include "tree-checker.h"

struct tree_mod_root {
        u64 logical;
        u8 level;
};

struct tree_mod_elem {
        struct rb_node node;
        u64 logical;
        u64 seq;
        enum btrfs_mod_log_op op;

        /*
         * This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_MOVE_KEYS
         * operations.
         */
        int slot;

        /* This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_ROOT_REPLACE. */
        u64 generation;

        /* Those are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. */
        struct btrfs_disk_key key;
        u64 blockptr;

        /* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
        struct {
                int dst_slot;
                int nr_items;
        } move;

        /* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
        struct tree_mod_root old_root;
};

/*
 * Pull a new tree mod seq number for our operation.
 */
static u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
        return atomic64_inc_return(&fs_info->tree_mod_seq);
}

/*
 * This adds a new blocker to the tree mod log's blocker list if the @elem
 * passed does not already have a sequence number set. So when a caller expects
 * to record tree modifications, it should ensure that elem->seq is zero
 * before calling btrfs_get_tree_mod_seq().
 * Returns a fresh, unused tree mod log sequence number, even if no new
 * blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
                           struct btrfs_seq_list *elem)
{
        write_lock(&fs_info->tree_mod_log_lock);
        if (!elem->seq) {
                elem->seq = btrfs_inc_tree_mod_seq(fs_info);
                list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
                set_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
        }
        write_unlock(&fs_info->tree_mod_log_lock);

        return elem->seq;
}
73
74 void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
75                             struct btrfs_seq_list *elem)
76 {
77         struct rb_root *tm_root;
78         struct rb_node *node;
79         struct rb_node *next;
80         struct tree_mod_elem *tm;
81         u64 min_seq = BTRFS_SEQ_LAST;
82         u64 seq_putting = elem->seq;
83
84         if (!seq_putting)
85                 return;
86
87         write_lock(&fs_info->tree_mod_log_lock);
88         list_del(&elem->list);
89         elem->seq = 0;
90
91         if (list_empty(&fs_info->tree_mod_seq_list)) {
92                 clear_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
93         } else {
94                 struct btrfs_seq_list *first;
95
96                 first = list_first_entry(&fs_info->tree_mod_seq_list,
97                                          struct btrfs_seq_list, list);
98                 if (seq_putting > first->seq) {
99                         /*
100                          * Blocker with lower sequence number exists, we cannot
101                          * remove anything from the log.
102                          */
103                         write_unlock(&fs_info->tree_mod_log_lock);
104                         return;
105                 }
106                 min_seq = first->seq;
107         }
108
109         /*
110          * Anything that's lower than the lowest existing (read: blocked)
111          * sequence number can be removed from the tree.
112          */
113         tm_root = &fs_info->tree_mod_log;
114         for (node = rb_first(tm_root); node; node = next) {
115                 next = rb_next(node);
116                 tm = rb_entry(node, struct tree_mod_elem, node);
117                 if (tm->seq >= min_seq)
118                         continue;
119                 rb_erase(node, tm_root);
120                 kfree(tm);
121         }
122         write_unlock(&fs_info->tree_mod_log_lock);
123 }
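
/*
 * A minimal usage sketch for the get/put pair above (illustrative only, not
 * code from this file): a tree mod log user pins a sequence number for the
 * duration of its operation and drops it when done, which lets
 * btrfs_put_tree_mod_seq() prune log entries that no remaining user can
 * still need:
 *
 *	struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem);
 *	u64 time_seq;
 *
 *	time_seq = btrfs_get_tree_mod_seq(fs_info, &elem);
 *	// ... walk old tree versions using time_seq ...
 *	btrfs_put_tree_mod_seq(fs_info, &elem);
 */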

/*
 * Key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node for root
 * replace operations, or the logical address of the affected block for all
 * other operations.
 */
static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info,
                                        struct tree_mod_elem *tm)
{
        struct rb_root *tm_root;
        struct rb_node **new;
        struct rb_node *parent = NULL;
        struct tree_mod_elem *cur;

        lockdep_assert_held_write(&fs_info->tree_mod_log_lock);

        tm->seq = btrfs_inc_tree_mod_seq(fs_info);

        tm_root = &fs_info->tree_mod_log;
        new = &tm_root->rb_node;
        while (*new) {
                cur = rb_entry(*new, struct tree_mod_elem, node);
                parent = *new;
                if (cur->logical < tm->logical)
                        new = &((*new)->rb_left);
                else if (cur->logical > tm->logical)
                        new = &((*new)->rb_right);
                else if (cur->seq < tm->seq)
                        new = &((*new)->rb_left);
                else if (cur->seq > tm->seq)
                        new = &((*new)->rb_right);
                else
                        return -EEXIST;
        }

        rb_link_node(&tm->node, parent, new);
        rb_insert_color(&tm->node, tm_root);
        return 0;
}
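
/*
 * Ordering example for the insertion above (illustrative, derived from the
 * comparisons in tree_mod_log_insert()): entries are keyed by (logical, seq),
 * so all modifications of one block sit next to each other and are ordered by
 * their sequence number within that group, e.g. for a block at logical 4096
 * the entries (4096, 10), (4096, 12) and (4096, 17) are adjacent. Note the
 * comparisons send smaller keys to the right, so an in-order walk with
 * rb_next() (as done in tree_mod_log_rewind()) visits entries from newest to
 * oldest.
 */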

/*
 * Determines if logging can be omitted. Returns true if it can. Otherwise, it
 * returns false with the tree_mod_log_lock acquired. The caller must hold the
 * lock until all tree mod log insertions are recorded in the rb tree and then
 * write unlock fs_info::tree_mod_log_lock.
 */
static bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, struct extent_buffer *eb)
{
        if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
                return true;
        if (eb && btrfs_header_level(eb) == 0)
                return true;

        write_lock(&fs_info->tree_mod_log_lock);
        if (list_empty(&fs_info->tree_mod_seq_list)) {
                write_unlock(&fs_info->tree_mod_log_lock);
                return true;
        }

        return false;
}

/* Similar to tree_mod_dont_log(), but doesn't acquire any locks. */
static bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
                              struct extent_buffer *eb)
{
        if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
                return false;
        if (eb && btrfs_header_level(eb) == 0)
                return false;

        return true;
}

static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb,
                                                 int slot,
                                                 enum btrfs_mod_log_op op)
{
        struct tree_mod_elem *tm;

        tm = kzalloc(sizeof(*tm), GFP_NOFS);
        if (!tm)
                return NULL;

        tm->logical = eb->start;
        if (op != BTRFS_MOD_LOG_KEY_ADD) {
                btrfs_node_key(eb, &tm->key, slot);
                tm->blockptr = btrfs_node_blockptr(eb, slot);
        }
        tm->op = op;
        tm->slot = slot;
        tm->generation = btrfs_node_ptr_generation(eb, slot);
        RB_CLEAR_NODE(&tm->node);

        return tm;
}
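
/*
 * Note: for BTRFS_MOD_LOG_KEY_ADD the helper above deliberately skips
 * recording the key and block pointer, since rewinding an addition only
 * requires dropping the item again (see the KEY_ADD case in
 * tree_mod_log_rewind()), while the remove/replace operations must be able
 * to restore the previous contents of the slot.
 */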

int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
                                  enum btrfs_mod_log_op op)
{
        struct tree_mod_elem *tm;
        int ret = 0;

        if (!tree_mod_need_log(eb->fs_info, eb))
                return 0;

        tm = alloc_tree_mod_elem(eb, slot, op);
        if (!tm)
                ret = -ENOMEM;

        if (tree_mod_dont_log(eb->fs_info, eb)) {
                kfree(tm);
                /*
                 * Don't error if we failed to allocate memory because we don't
                 * need to log.
                 */
                return 0;
        } else if (ret != 0) {
                /*
                 * We previously failed to allocate memory and we need to log,
                 * so we have to fail.
                 */
                goto out_unlock;
        }

        ret = tree_mod_log_insert(eb->fs_info, tm);
out_unlock:
        write_unlock(&eb->fs_info->tree_mod_log_lock);
        if (ret)
                kfree(tm);

        return ret;
}
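
/*
 * Hypothetical call site sketch, mirroring how ctree.c typically uses this
 * helper (names like 'parent', 'slot' and 'new_key' are placeholders): the
 * old key pointer is logged before it is overwritten, so that
 * tree_mod_log_rewind() can later restore it:
 *
 *	ret = btrfs_tree_mod_log_insert_key(parent, slot,
 *					    BTRFS_MOD_LOG_KEY_REPLACE);
 *	if (ret < 0) {
 *		btrfs_abort_transaction(trans, ret);
 *		return ret;
 *	}
 *	btrfs_set_node_key(parent, &new_key, slot);
 */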

static struct tree_mod_elem *tree_mod_log_alloc_move(struct extent_buffer *eb,
                                                     int dst_slot, int src_slot,
                                                     int nr_items)
{
        struct tree_mod_elem *tm;

        tm = kzalloc(sizeof(*tm), GFP_NOFS);
        if (!tm)
                return ERR_PTR(-ENOMEM);

        tm->logical = eb->start;
        tm->slot = src_slot;
        tm->move.dst_slot = dst_slot;
        tm->move.nr_items = nr_items;
        tm->op = BTRFS_MOD_LOG_MOVE_KEYS;
        RB_CLEAR_NODE(&tm->node);

        return tm;
}
280
281 int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
282                                    int dst_slot, int src_slot,
283                                    int nr_items)
284 {
285         struct tree_mod_elem *tm = NULL;
286         struct tree_mod_elem **tm_list = NULL;
287         int ret = 0;
288         int i;
289         bool locked = false;
290
291         if (!tree_mod_need_log(eb->fs_info, eb))
292                 return 0;
293
294         tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
295         if (!tm_list) {
296                 ret = -ENOMEM;
297                 goto lock;
298         }
299
300         tm = tree_mod_log_alloc_move(eb, dst_slot, src_slot, nr_items);
301         if (IS_ERR(tm)) {
302                 ret = PTR_ERR(tm);
303                 tm = NULL;
304                 goto lock;
305         }
306
307         for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
308                 tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
309                                 BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING);
310                 if (!tm_list[i]) {
311                         ret = -ENOMEM;
312                         goto lock;
313                 }
314         }
315
316 lock:
317         if (tree_mod_dont_log(eb->fs_info, eb)) {
318                 /*
319                  * Don't error if we failed to allocate memory because we don't
320                  * need to log.
321                  */
322                 ret = 0;
323                 goto free_tms;
324         }
325         locked = true;
326
327         /*
328          * We previously failed to allocate memory and we need to log, so we
329          * have to fail.
330          */
331         if (ret != 0)
332                 goto free_tms;
333
334         /*
335          * When we override something during the move, we log these removals.
336          * This can only happen when we move towards the beginning of the
337          * buffer, i.e. dst_slot < src_slot.
338          */
339         for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
340                 ret = tree_mod_log_insert(eb->fs_info, tm_list[i]);
341                 if (ret)
342                         goto free_tms;
343         }
344
345         ret = tree_mod_log_insert(eb->fs_info, tm);
346         if (ret)
347                 goto free_tms;
348         write_unlock(&eb->fs_info->tree_mod_log_lock);
349         kfree(tm_list);
350
351         return 0;
352
353 free_tms:
354         if (tm_list) {
355                 for (i = 0; i < nr_items; i++) {
356                         if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
357                                 rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
358                         kfree(tm_list[i]);
359                 }
360         }
361         if (locked)
362                 write_unlock(&eb->fs_info->tree_mod_log_lock);
363         kfree(tm_list);
364         kfree(tm);
365
366         return ret;
367 }

static int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
                                struct tree_mod_elem **tm_list,
                                int nritems)
{
        int i, j;
        int ret;

        for (i = nritems - 1; i >= 0; i--) {
                ret = tree_mod_log_insert(fs_info, tm_list[i]);
                if (ret) {
                        for (j = nritems - 1; j > i; j--)
                                rb_erase(&tm_list[j]->node,
                                         &fs_info->tree_mod_log);
                        return ret;
                }
        }

        return 0;
}

int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
                                   struct extent_buffer *new_root,
                                   bool log_removal)
{
        struct btrfs_fs_info *fs_info = old_root->fs_info;
        struct tree_mod_elem *tm = NULL;
        struct tree_mod_elem **tm_list = NULL;
        int nritems = 0;
        int ret = 0;
        int i;

        if (!tree_mod_need_log(fs_info, NULL))
                return 0;

        if (log_removal && btrfs_header_level(old_root) > 0) {
                nritems = btrfs_header_nritems(old_root);
                tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
                                  GFP_NOFS);
                if (!tm_list) {
                        ret = -ENOMEM;
                        goto lock;
                }
                for (i = 0; i < nritems; i++) {
                        tm_list[i] = alloc_tree_mod_elem(old_root, i,
                            BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
                        if (!tm_list[i]) {
                                ret = -ENOMEM;
                                goto lock;
                        }
                }
        }

        tm = kzalloc(sizeof(*tm), GFP_NOFS);
        if (!tm) {
                ret = -ENOMEM;
                goto lock;
        }

        tm->logical = new_root->start;
        tm->old_root.logical = old_root->start;
        tm->old_root.level = btrfs_header_level(old_root);
        tm->generation = btrfs_header_generation(old_root);
        tm->op = BTRFS_MOD_LOG_ROOT_REPLACE;

lock:
        if (tree_mod_dont_log(fs_info, NULL)) {
                /*
                 * Don't error if we failed to allocate memory because we don't
                 * need to log.
                 */
                ret = 0;
                goto free_tms;
        } else if (ret != 0) {
                /*
                 * We previously failed to allocate memory and we need to log,
                 * so we have to fail.
                 */
                goto out_unlock;
        }

        if (tm_list)
                ret = tree_mod_log_free_eb(fs_info, tm_list, nritems);
        if (!ret)
                ret = tree_mod_log_insert(fs_info, tm);

out_unlock:
        write_unlock(&fs_info->tree_mod_log_lock);
        if (ret)
                goto free_tms;
        kfree(tm_list);

        return ret;

free_tms:
        if (tm_list) {
                for (i = 0; i < nritems; i++)
                        kfree(tm_list[i]);
                kfree(tm_list);
        }
        kfree(tm);

        return ret;
}

static struct tree_mod_elem *__tree_mod_log_search(struct btrfs_fs_info *fs_info,
                                                   u64 start, u64 min_seq,
                                                   bool smallest)
{
        struct rb_root *tm_root;
        struct rb_node *node;
        struct tree_mod_elem *cur = NULL;
        struct tree_mod_elem *found = NULL;

        read_lock(&fs_info->tree_mod_log_lock);
        tm_root = &fs_info->tree_mod_log;
        node = tm_root->rb_node;
        while (node) {
                cur = rb_entry(node, struct tree_mod_elem, node);
                if (cur->logical < start) {
                        node = node->rb_left;
                } else if (cur->logical > start) {
                        node = node->rb_right;
                } else if (cur->seq < min_seq) {
                        node = node->rb_left;
                } else if (!smallest) {
                        /* We want the node with the highest seq */
                        if (found)
                                BUG_ON(found->seq > cur->seq);
                        found = cur;
                        node = node->rb_left;
                } else if (cur->seq > min_seq) {
                        /* We want the node with the smallest seq */
                        if (found)
                                BUG_ON(found->seq < cur->seq);
                        found = cur;
                        node = node->rb_right;
                } else {
                        found = cur;
                        break;
                }
        }
        read_unlock(&fs_info->tree_mod_log_lock);

        return found;
}

/*
 * This returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). Any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info,
                                                        u64 start, u64 min_seq)
{
        return __tree_mod_log_search(fs_info, start, min_seq, true);
}

/*
 * This returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). Any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info,
                                                 u64 start, u64 min_seq)
{
        return __tree_mod_log_search(fs_info, start, min_seq, false);
}
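
/*
 * Worked example for the two search helpers above: if the log holds entries
 * with sequence numbers 5, 8 and 12 for the same block, then with
 * min_seq == 6 tree_mod_log_search_oldest() returns the entry with seq 8
 * (the smallest seq not below min_seq), while tree_mod_log_search() returns
 * the one with seq 12 (the largest seq not below min_seq). The entry with
 * seq 5 is too old and is ignored by both.
 */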

int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
                               struct extent_buffer *src,
                               unsigned long dst_offset,
                               unsigned long src_offset,
                               int nr_items)
{
        struct btrfs_fs_info *fs_info = dst->fs_info;
        int ret = 0;
        struct tree_mod_elem **tm_list = NULL;
        struct tree_mod_elem **tm_list_add = NULL;
        struct tree_mod_elem **tm_list_rem = NULL;
        int i;
        bool locked = false;
        struct tree_mod_elem *dst_move_tm = NULL;
        struct tree_mod_elem *src_move_tm = NULL;
        u32 dst_move_nr_items = btrfs_header_nritems(dst) - dst_offset;
        u32 src_move_nr_items = btrfs_header_nritems(src) - (src_offset + nr_items);

        if (!tree_mod_need_log(fs_info, NULL))
                return 0;

        if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
                return 0;

        tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
                          GFP_NOFS);
        if (!tm_list) {
                ret = -ENOMEM;
                goto lock;
        }

        if (dst_move_nr_items) {
                dst_move_tm = tree_mod_log_alloc_move(dst, dst_offset + nr_items,
                                                      dst_offset, dst_move_nr_items);
                if (IS_ERR(dst_move_tm)) {
                        ret = PTR_ERR(dst_move_tm);
                        dst_move_tm = NULL;
                        goto lock;
                }
        }
        if (src_move_nr_items) {
                src_move_tm = tree_mod_log_alloc_move(src, src_offset,
                                                      src_offset + nr_items,
                                                      src_move_nr_items);
                if (IS_ERR(src_move_tm)) {
                        ret = PTR_ERR(src_move_tm);
                        src_move_tm = NULL;
                        goto lock;
                }
        }

        tm_list_add = tm_list;
        tm_list_rem = tm_list + nr_items;
        for (i = 0; i < nr_items; i++) {
                tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
                                                     BTRFS_MOD_LOG_KEY_REMOVE);
                if (!tm_list_rem[i]) {
                        ret = -ENOMEM;
                        goto lock;
                }

                tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
                                                     BTRFS_MOD_LOG_KEY_ADD);
                if (!tm_list_add[i]) {
                        ret = -ENOMEM;
                        goto lock;
                }
        }

lock:
        if (tree_mod_dont_log(fs_info, NULL)) {
                /*
                 * Don't error if we failed to allocate memory because we don't
                 * need to log.
                 */
                ret = 0;
                goto free_tms;
        }
        locked = true;

        /*
         * We previously failed to allocate memory and we need to log, so we
         * have to fail.
         */
        if (ret != 0)
                goto free_tms;

        if (dst_move_tm) {
                ret = tree_mod_log_insert(fs_info, dst_move_tm);
                if (ret)
                        goto free_tms;
        }
        for (i = 0; i < nr_items; i++) {
                ret = tree_mod_log_insert(fs_info, tm_list_rem[i]);
                if (ret)
                        goto free_tms;
                ret = tree_mod_log_insert(fs_info, tm_list_add[i]);
                if (ret)
                        goto free_tms;
        }
        if (src_move_tm) {
                ret = tree_mod_log_insert(fs_info, src_move_tm);
                if (ret)
                        goto free_tms;
        }

        write_unlock(&fs_info->tree_mod_log_lock);
        kfree(tm_list);

        return 0;

free_tms:
        if (dst_move_tm && !RB_EMPTY_NODE(&dst_move_tm->node))
                rb_erase(&dst_move_tm->node, &fs_info->tree_mod_log);
        kfree(dst_move_tm);
        if (src_move_tm && !RB_EMPTY_NODE(&src_move_tm->node))
                rb_erase(&src_move_tm->node, &fs_info->tree_mod_log);
        kfree(src_move_tm);
        if (tm_list) {
                for (i = 0; i < nr_items * 2; i++) {
                        if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
                                rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
                        kfree(tm_list[i]);
                }
        }
        if (locked)
                write_unlock(&fs_info->tree_mod_log_lock);
        kfree(tm_list);

        return ret;
}
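
/*
 * Worked example for the copy logging above (numbers are illustrative):
 * copying nr_items == 2 from src slots [3, 4] to dst slots [5, 6], with src
 * holding 8 items and dst holding 5 at the time of the call, logs:
 *
 *   - no MOVE on dst, because dst_move_nr_items == 5 - 5 == 0 (nothing after
 *     the destination slots needs shifting),
 *   - a KEY_REMOVE on src for slots 3 and 4, paired with a KEY_ADD on dst
 *     for slots 5 and 6,
 *   - a MOVE on src for the 8 - (3 + 2) == 3 trailing items, recording that
 *     slots [5, 7] move down to [3, 5] to close the gap the caller creates.
 */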

int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb)
{
        struct tree_mod_elem **tm_list = NULL;
        int nritems = 0;
        int i;
        int ret = 0;

        if (!tree_mod_need_log(eb->fs_info, eb))
                return 0;

        nritems = btrfs_header_nritems(eb);
        tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
        if (!tm_list) {
                ret = -ENOMEM;
                goto lock;
        }

        for (i = 0; i < nritems; i++) {
                tm_list[i] = alloc_tree_mod_elem(eb, i,
                                    BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
                if (!tm_list[i]) {
                        ret = -ENOMEM;
                        goto lock;
                }
        }

lock:
        if (tree_mod_dont_log(eb->fs_info, eb)) {
                /*
                 * Don't error if we failed to allocate memory because we don't
                 * need to log.
                 */
                ret = 0;
                goto free_tms;
        } else if (ret != 0) {
                /*
                 * We previously failed to allocate memory and we need to log,
                 * so we have to fail.
                 */
                goto out_unlock;
        }

        ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
out_unlock:
        write_unlock(&eb->fs_info->tree_mod_log_lock);
        if (ret)
                goto free_tms;
        kfree(tm_list);

        return 0;

free_tms:
        if (tm_list) {
                for (i = 0; i < nritems; i++)
                        kfree(tm_list[i]);
                kfree(tm_list);
        }

        return ret;
}

/*
 * Returns the logical address of the oldest predecessor of the given root.
 * Entries older than time_seq are ignored.
 */
static struct tree_mod_elem *tree_mod_log_oldest_root(struct extent_buffer *eb_root,
                                                      u64 time_seq)
{
        struct tree_mod_elem *tm;
        struct tree_mod_elem *found = NULL;
        u64 root_logical = eb_root->start;
        bool looped = false;

        if (!time_seq)
                return NULL;

        /*
         * The very last operation that's logged for a root is the replacement
         * operation (if it is replaced at all). This has the logical address
         * of the *new* root, making it the very first operation that's logged
         * for this root.
         */
        while (1) {
                tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
                                                time_seq);
                if (!looped && !tm)
                        return NULL;
                /*
                 * If there are no tree operations for the oldest root, we
                 * simply return it. This should only happen if that (old)
                 * root is at level 0.
                 */
                if (!tm)
                        break;

                /*
                 * If there's an operation that's not a root replacement, we
                 * found the oldest version of our root. Normally, we'll find a
                 * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
                 */
                if (tm->op != BTRFS_MOD_LOG_ROOT_REPLACE)
                        break;

                found = tm;
                root_logical = tm->old_root.logical;
                looped = true;
        }

        /* If there's no old root to return, return what we found instead. */
        if (!found)
                found = tm;

        return found;
}
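
/*
 * Illustrative chain walk for the helper above: if root A was replaced by B
 * and later B by C, then the log holds a ROOT_REPLACE entry keyed at C's
 * logical address (with old_root pointing to B) and one keyed at B (with
 * old_root pointing to A). Starting from the current root C, the loop hops
 * C -> B -> A and returns the last ROOT_REPLACE entry it saw, whose old_root
 * describes the oldest predecessor covered by time_seq.
 */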

/*
 * tm is a pointer to the first operation to rewind within eb. Then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 */
static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
                                struct extent_buffer *eb,
                                u64 time_seq,
                                struct tree_mod_elem *first_tm)
{
        u32 n;
        struct rb_node *next;
        struct tree_mod_elem *tm = first_tm;
        unsigned long o_dst;
        unsigned long o_src;
        unsigned long p_size = sizeof(struct btrfs_key_ptr);
        /*
         * max_slot tracks the maximum valid slot of the rewind eb at every
         * step of the rewind. This is in contrast with 'n' which eventually
         * matches the number of items, but can be wrong during moves or if
         * removes overlap on already valid slots (which is probably separately
         * a bug). We do this to validate the offsets of memmoves for rewinding
         * moves and detect invalid memmoves.
         *
         * Since a rewind eb can start empty, max_slot is a signed integer with
         * a special meaning for -1, which is that no slot is valid to move out
         * of. Any other negative value is invalid.
         */
        int max_slot;
        int move_src_end_slot;
        int move_dst_end_slot;

        n = btrfs_header_nritems(eb);
        max_slot = n - 1;
        read_lock(&fs_info->tree_mod_log_lock);
        while (tm && tm->seq >= time_seq) {
                ASSERT(max_slot >= -1);
                /*
                 * All the operations are recorded with the operator used for
                 * the modification. As we're going backwards, we do the
                 * opposite of each operation here.
                 */
                switch (tm->op) {
                case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING:
                        BUG_ON(tm->slot < n);
                        fallthrough;
                case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING:
                case BTRFS_MOD_LOG_KEY_REMOVE:
                        btrfs_set_node_key(eb, &tm->key, tm->slot);
                        btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
                        btrfs_set_node_ptr_generation(eb, tm->slot,
                                                      tm->generation);
                        n++;
                        if (tm->slot > max_slot)
                                max_slot = tm->slot;
                        break;
                case BTRFS_MOD_LOG_KEY_REPLACE:
                        BUG_ON(tm->slot >= n);
                        btrfs_set_node_key(eb, &tm->key, tm->slot);
                        btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
                        btrfs_set_node_ptr_generation(eb, tm->slot,
                                                      tm->generation);
                        break;
                case BTRFS_MOD_LOG_KEY_ADD:
                        /*
                         * It is possible we could have already removed keys
                         * behind the known max slot, so this will be an
                         * overestimate. In practice, the copy operation
                         * inserts them in increasing order, and overestimating
                         * just means we miss some warnings, so it's OK. It
                         * isn't worth carefully tracking the full array of
                         * valid slots to check against when moving.
                         */
                        if (tm->slot == max_slot)
                                max_slot--;
                        /* If a move operation is needed it's in the log. */
                        n--;
                        break;
                case BTRFS_MOD_LOG_MOVE_KEYS:
                        ASSERT(tm->move.nr_items > 0);
                        move_src_end_slot = tm->move.dst_slot + tm->move.nr_items - 1;
                        move_dst_end_slot = tm->slot + tm->move.nr_items - 1;
                        o_dst = btrfs_node_key_ptr_offset(eb, tm->slot);
                        o_src = btrfs_node_key_ptr_offset(eb, tm->move.dst_slot);
                        if (WARN_ON(move_src_end_slot > max_slot ||
                                    tm->move.nr_items <= 0)) {
                                btrfs_warn(fs_info,
"move from invalid tree mod log slot eb %llu slot %d dst_slot %d nr_items %d seq %llu n %u max_slot %d",
                                           eb->start, tm->slot,
                                           tm->move.dst_slot, tm->move.nr_items,
                                           tm->seq, n, max_slot);
                        }
                        memmove_extent_buffer(eb, o_dst, o_src,
                                              tm->move.nr_items * p_size);
                        max_slot = move_dst_end_slot;
                        break;
                case BTRFS_MOD_LOG_ROOT_REPLACE:
                        /*
                         * This operation is special. For roots, this must be
                         * handled explicitly before rewinding.
                         * For non-roots, this operation may exist if the node
                         * was a root: root A -> child B; then A gets empty and
                         * B is promoted to the new root. In the mod log, we'll
                         * have a root-replace operation for B, a tree block
                         * that is not a root. We simply ignore that operation.
                         */
                        break;
                }
                next = rb_next(&tm->node);
                if (!next)
                        break;
                tm = rb_entry(next, struct tree_mod_elem, node);
                if (tm->logical != first_tm->logical)
                        break;
        }
        read_unlock(&fs_info->tree_mod_log_lock);
        btrfs_set_header_nritems(eb, n);
}
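
/*
 * Worked example for rewinding a MOVE (values are illustrative): a logged
 * entry with slot == 2, move.dst_slot == 0 and move.nr_items == 3 recorded
 * that three items were moved from slots [2, 4] down to slots [0, 2]. The
 * rewind above performs the opposite memmove, copying the three key pointers
 * from slot 0 (the logged destination) back up to slot 2 (the logged
 * source), after which slots [2, 4] hold the old items again and max_slot
 * becomes move_dst_end_slot == 4.
 */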

/*
 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 */
struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
                                                struct btrfs_path *path,
                                                struct extent_buffer *eb,
                                                u64 time_seq)
{
        struct extent_buffer *eb_rewin;
        struct tree_mod_elem *tm;

        if (!time_seq)
                return eb;

        if (btrfs_header_level(eb) == 0)
                return eb;

        tm = tree_mod_log_search(fs_info, eb->start, time_seq);
        if (!tm)
                return eb;

        if (tm->op == BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
                BUG_ON(tm->slot != 0);
                eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
                if (!eb_rewin) {
                        btrfs_tree_read_unlock(eb);
                        free_extent_buffer(eb);
                        return NULL;
                }
                btrfs_set_header_bytenr(eb_rewin, eb->start);
                btrfs_set_header_backref_rev(eb_rewin,
                                             btrfs_header_backref_rev(eb));
                btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
                btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
        } else {
                eb_rewin = btrfs_clone_extent_buffer(eb);
                if (!eb_rewin) {
                        btrfs_tree_read_unlock(eb);
                        free_extent_buffer(eb);
                        return NULL;
                }
        }

        btrfs_tree_read_unlock(eb);
        free_extent_buffer(eb);

        btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
                                       eb_rewin, btrfs_header_level(eb_rewin));
        btrfs_tree_read_lock(eb_rewin);
        tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
        WARN_ON(btrfs_header_nritems(eb_rewin) >
                BTRFS_NODEPTRS_PER_BLOCK(fs_info));

        return eb_rewin;
}

/*
 * Rewind the state of @root's root node to the given @time_seq value.
 * If there are no changes, the current root node is returned. If anything
 * changed in between, there's a fresh buffer allocated on which the rewind
 * operations are done. In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq)
{
        struct btrfs_fs_info *fs_info = root->fs_info;
        struct tree_mod_elem *tm;
        struct extent_buffer *eb = NULL;
        struct extent_buffer *eb_root;
        u64 eb_root_owner = 0;
        struct extent_buffer *old;
        struct tree_mod_root *old_root = NULL;
        u64 old_generation = 0;
        u64 logical;
        int level;

        eb_root = btrfs_read_lock_root_node(root);
        tm = tree_mod_log_oldest_root(eb_root, time_seq);
        if (!tm)
                return eb_root;

        if (tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) {
                old_root = &tm->old_root;
                old_generation = tm->generation;
                logical = old_root->logical;
                level = old_root->level;
        } else {
                logical = eb_root->start;
                level = btrfs_header_level(eb_root);
        }

        tm = tree_mod_log_search(fs_info, logical, time_seq);
        if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
                struct btrfs_tree_parent_check check = { 0 };

                btrfs_tree_read_unlock(eb_root);
                free_extent_buffer(eb_root);

                check.level = level;
                check.owner_root = root->root_key.objectid;

                old = read_tree_block(fs_info, logical, &check);
                if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
                        if (!IS_ERR(old))
                                free_extent_buffer(old);
                        btrfs_warn(fs_info,
                                   "failed to read tree block %llu from get_old_root",
                                   logical);
                } else {
                        struct tree_mod_elem *tm2;

                        btrfs_tree_read_lock(old);
                        eb = btrfs_clone_extent_buffer(old);
                        /*
                         * After the lookup for the most recent tree mod operation
                         * above and before we locked and cloned the extent buffer
                         * 'old', a new tree mod log operation may have been added.
                         * So lookup for a more recent one to make sure the number
                         * of mod log operations we replay is consistent with the
                         * number of items we have in the cloned extent buffer,
                         * otherwise we can hit a BUG_ON when rewinding the extent
                         * buffer.
                         */
                        tm2 = tree_mod_log_search(fs_info, logical, time_seq);
                        btrfs_tree_read_unlock(old);
                        free_extent_buffer(old);
                        ASSERT(tm2);
                        ASSERT(tm2 == tm || tm2->seq > tm->seq);
                        if (!tm2 || tm2->seq < tm->seq) {
                                free_extent_buffer(eb);
                                return NULL;
                        }
                        tm = tm2;
                }
        } else if (old_root) {
                eb_root_owner = btrfs_header_owner(eb_root);
                btrfs_tree_read_unlock(eb_root);
                free_extent_buffer(eb_root);
                eb = alloc_dummy_extent_buffer(fs_info, logical);
        } else {
                eb = btrfs_clone_extent_buffer(eb_root);
                btrfs_tree_read_unlock(eb_root);
                free_extent_buffer(eb_root);
        }

        if (!eb)
                return NULL;
        if (old_root) {
                btrfs_set_header_bytenr(eb, eb->start);
                btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
                btrfs_set_header_owner(eb, eb_root_owner);
                btrfs_set_header_level(eb, old_root->level);
                btrfs_set_header_generation(eb, old_generation);
        }
        btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
                                       btrfs_header_level(eb));
        btrfs_tree_read_lock(eb);
        if (tm)
                tree_mod_log_rewind(fs_info, eb, time_seq, tm);
        else
                WARN_ON(btrfs_header_level(eb) != 0);
        WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

        return eb;
}
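
/*
 * A minimal usage sketch for time travel through an old root (illustrative
 * only; the real consumer of this interface is the backref walking code):
 *
 *	struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem);
 *	struct extent_buffer *eb;
 *	u64 time_seq;
 *
 *	time_seq = btrfs_get_tree_mod_seq(fs_info, &elem);
 *	// ... tree modifications made from now on are logged ...
 *	eb = btrfs_get_old_root(root, time_seq);
 *	if (eb) {
 *		// eb is read locked; inspect the old root node here.
 *		btrfs_tree_read_unlock(eb);
 *		free_extent_buffer(eb);
 *	}
 *	btrfs_put_tree_mod_seq(fs_info, &elem);
 */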

int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
        struct tree_mod_elem *tm;
        int level;
        struct extent_buffer *eb_root = btrfs_root_node(root);

        tm = tree_mod_log_oldest_root(eb_root, time_seq);
        if (tm && tm->op == BTRFS_MOD_LOG_ROOT_REPLACE)
                level = tm->old_root.level;
        else
                level = btrfs_header_level(eb_root);

        free_extent_buffer(eb_root);

        return level;
}

/*
 * Return the lowest sequence number in the tree modification log.
 *
 * This corresponds to the sequence number of the oldest tree modification log
 * user, which is the lowest sequence number among all existing users. If there
 * are no users it returns 0.
 */
u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info)
{
        u64 ret = 0;

        read_lock(&fs_info->tree_mod_log_lock);
        if (!list_empty(&fs_info->tree_mod_seq_list)) {
                struct btrfs_seq_list *elem;

                elem = list_first_entry(&fs_info->tree_mod_seq_list,
                                        struct btrfs_seq_list, list);
                ret = elem->seq;
        }
        read_unlock(&fs_info->tree_mod_log_lock);

        return ret;
}
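
/*
 * Hypothetical sketch of the kind of check a caller can build on the helper
 * above ('my_seq' is a placeholder for a sequence number the caller obtained
 * earlier): information tagged with a sequence number may only be discarded
 * once no tree mod log user with an older or equal sequence number remains:
 *
 *	u64 oldest = btrfs_tree_mod_log_lowest_seq(fs_info);
 *
 *	if (oldest != 0 && my_seq >= oldest) {
 *		// A user pinned a sequence number not newer than ours, so
 *		// the information must be kept around for now.
 *		return -EBUSY;
 *	}
 */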