// SPDX-License-Identifier: GPL-2.0

#include "messages.h"
#include "tree-mod-log.h"
#include "disk-io.h"
#include "fs.h"
#include "accessors.h"
#include "tree-checker.h"

struct tree_mod_root {
	u64 logical;
	u8 level;
};

struct tree_mod_elem {
	struct rb_node node;
	u64 logical;
	u64 seq;
	enum btrfs_mod_log_op op;

	/*
	 * This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_MOVE_KEYS
	 * operations.
	 */
	int slot;

	/* This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_ROOT_REPLACE. */
	u64 generation;

	/* Those are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. */
	struct btrfs_disk_key key;
	u64 blockptr;

	/* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */
	struct {
		int dst_slot;
		int nr_items;
	} move;

	/* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */
	struct tree_mod_root old_root;
};

/*
 * Pull a new tree mod seq number for our operation.
 */
static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info)
{
	return atomic64_inc_return(&fs_info->tree_mod_seq);
}

/*
 * Add a new blocker to the tree mod log's blocker list if @elem does not
 * already have a sequence number set. Callers that want to record tree
 * modifications must therefore make sure that elem->seq is zero before
 * calling btrfs_get_tree_mod_seq().
 *
 * Returns a fresh, unused tree mod log sequence number, even if no new
 * blocker was added.
 */
u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info,
			   struct btrfs_seq_list *elem)
{
	write_lock(&fs_info->tree_mod_log_lock);
	if (!elem->seq) {
		elem->seq = btrfs_inc_tree_mod_seq(fs_info);
		list_add_tail(&elem->list, &fs_info->tree_mod_seq_list);
		set_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
	}
	write_unlock(&fs_info->tree_mod_log_lock);

	return elem->seq;
}

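/*
 * A typical user brackets the work that needs a stable view of the tree with
 * a get/put pair. A minimal sketch (not taken verbatim from any caller):
 *
 *	struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem);
 *	u64 seq;
 *
 *	seq = btrfs_get_tree_mod_seq(fs_info, &elem);
 *	... resolve old tree state, using 'seq' as the time_seq value ...
 *	btrfs_put_tree_mod_seq(fs_info, &elem);
 */

/*
 * Drop the blocker installed by a previous btrfs_get_tree_mod_seq() call
 * and, if no older blocker remains, prune all log elements whose sequence
 * number is below the lowest sequence number still in use.
 */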
void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info,
			    struct btrfs_seq_list *elem)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct rb_node *next;
	struct tree_mod_elem *tm;
	u64 min_seq = BTRFS_SEQ_LAST;
	u64 seq_putting = elem->seq;

	if (!seq_putting)
		return;

	write_lock(&fs_info->tree_mod_log_lock);
	list_del(&elem->list);
	elem->seq = 0;

	if (list_empty(&fs_info->tree_mod_seq_list)) {
		clear_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags);
	} else {
		struct btrfs_seq_list *first;

		first = list_first_entry(&fs_info->tree_mod_seq_list,
					 struct btrfs_seq_list, list);
		if (seq_putting > first->seq) {
			/*
			 * Blocker with lower sequence number exists, we cannot
			 * remove anything from the log.
			 */
			write_unlock(&fs_info->tree_mod_log_lock);
			return;
		}
		min_seq = first->seq;
	}

	/*
	 * Anything that's lower than the lowest existing (read: blocked)
	 * sequence number can be removed from the tree.
	 */
	tm_root = &fs_info->tree_mod_log;
	for (node = rb_first(tm_root); node; node = next) {
		next = rb_next(node);
		tm = rb_entry(node, struct tree_mod_elem, node);
		if (tm->seq >= min_seq)
			continue;
		rb_erase(node, tm_root);
		kfree(tm);
	}
	write_unlock(&fs_info->tree_mod_log_lock);
}

/*
 * Key order of the log:
 *       node/leaf start address -> sequence
 *
 * The 'start address' is the logical address of the *new* root node for root
 * replace operations, or the logical address of the affected block for all
 * other operations.
 */
static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info,
					struct tree_mod_elem *tm)
{
	struct rb_root *tm_root;
	struct rb_node **new;
	struct rb_node *parent = NULL;
	struct tree_mod_elem *cur;

	lockdep_assert_held_write(&fs_info->tree_mod_log_lock);

	tm->seq = btrfs_inc_tree_mod_seq(fs_info);

	tm_root = &fs_info->tree_mod_log;
	new = &tm_root->rb_node;
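	/*
	 * Note: the comparisons below (and the matching ones in
	 * __tree_mod_log_search()) descend to the left for larger keys, so
	 * the rb tree is effectively kept in reverse (logical, seq) order.
	 * All walks over the log either use the same convention or visit
	 * every node, so the direction does not matter for correctness.
	 */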
	while (*new) {
		cur = rb_entry(*new, struct tree_mod_elem, node);
		parent = *new;
		if (cur->logical < tm->logical)
			new = &((*new)->rb_left);
		else if (cur->logical > tm->logical)
			new = &((*new)->rb_right);
		else if (cur->seq < tm->seq)
			new = &((*new)->rb_left);
		else if (cur->seq > tm->seq)
			new = &((*new)->rb_right);
		else
			return -EEXIST;
	}

	rb_link_node(&tm->node, parent, new);
	rb_insert_color(&tm->node, tm_root);
	return 0;
}

/*
 * Determine whether logging can be omitted. Returns true if it can.
 * Otherwise it returns false with fs_info::tree_mod_log_lock held for
 * writing; the caller must keep the lock held until all tree mod log
 * insertions are recorded in the rb tree, and then release it with
 * write_unlock().
 */
static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info,
				     struct extent_buffer *eb)
{
	if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
		return true;
	if (eb && btrfs_header_level(eb) == 0)
		return true;

	write_lock(&fs_info->tree_mod_log_lock);
	if (list_empty(&fs_info->tree_mod_seq_list)) {
		write_unlock(&fs_info->tree_mod_log_lock);
		return true;
	}

	return false;
}

/* Similar to tree_mod_dont_log, but doesn't acquire any locks. */
static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info,
				     struct extent_buffer *eb)
{
	if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags))
		return false;
	if (eb && btrfs_header_level(eb) == 0)
		return false;

	return true;
}

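/*
 * Allocate and fill a log element for a single-key operation on @slot of
 * @eb. For anything but a key addition, the key and block pointer currently
 * stored at @slot are captured so that a rewind can restore them.
 */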
static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb,
						 int slot,
						 enum btrfs_mod_log_op op)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm)
		return NULL;

	tm->logical = eb->start;
	if (op != BTRFS_MOD_LOG_KEY_ADD) {
		btrfs_node_key(eb, &tm->key, slot);
		tm->blockptr = btrfs_node_blockptr(eb, slot);
	}
	tm->op = op;
	tm->slot = slot;
	tm->generation = btrfs_node_ptr_generation(eb, slot);
	RB_CLEAR_NODE(&tm->node);

	return tm;
}

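/*
 * Log a single-key operation for @slot of @eb. The element is allocated
 * before tree_mod_dont_log() takes tree_mod_log_lock, so the allocation does
 * not happen under the lock; if it turns out that logging is not needed
 * after all, an allocation failure is simply ignored.
 */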
int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot,
				  enum btrfs_mod_log_op op)
{
	struct tree_mod_elem *tm;
	int ret = 0;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm = alloc_tree_mod_elem(eb, slot, op);
	if (!tm)
		ret = -ENOMEM;

	if (tree_mod_dont_log(eb->fs_info, eb)) {
		kfree(tm);
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		return 0;
	} else if (ret != 0) {
		/*
		 * We previously failed to allocate memory and we need to log,
		 * so we have to fail.
		 */
		goto out_unlock;
	}

	ret = tree_mod_log_insert(eb->fs_info, tm);
out_unlock:
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		kfree(tm);

	return ret;
}

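/*
 * Allocate a log element describing a move of @nr_items key pointers from
 * @src_slot to @dst_slot within @eb. The source slot is stored in @slot, the
 * destination slot and item count in the embedded move struct.
 */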
static struct tree_mod_elem *tree_mod_log_alloc_move(struct extent_buffer *eb,
						     int dst_slot, int src_slot,
						     int nr_items)
{
	struct tree_mod_elem *tm;

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm)
		return ERR_PTR(-ENOMEM);

	tm->logical = eb->start;
	tm->slot = src_slot;
	tm->move.dst_slot = dst_slot;
	tm->move.nr_items = nr_items;
	tm->op = BTRFS_MOD_LOG_MOVE_KEYS;
	RB_CLEAR_NODE(&tm->node);

	return tm;
}

int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb,
				   int dst_slot, int src_slot,
				   int nr_items)
{
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int ret = 0;
	int i;
	bool locked = false;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list) {
		ret = -ENOMEM;
		goto lock;
	}

	tm = tree_mod_log_alloc_move(eb, dst_slot, src_slot, nr_items);
	if (IS_ERR(tm)) {
		ret = PTR_ERR(tm);
		tm = NULL;
		goto lock;
	}

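	/*
	 * For a move towards the beginning of the buffer, the key pointers in
	 * the slots [dst_slot, src_slot) are overwritten. Pre-allocate one
	 * removal element for each of them here; they are inserted below,
	 * under the lock.
	 */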
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot,
				BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto lock;
		}
	}

lock:
	if (tree_mod_dont_log(eb->fs_info, eb)) {
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		ret = 0;
		goto free_tms;
	}
	locked = true;

	/*
	 * We previously failed to allocate memory and we need to log, so we
	 * have to fail.
	 */
	if (ret != 0)
		goto free_tms;

	/*
	 * When we overwrite key pointers during the move, we log these
	 * removals. This can only happen when we move towards the beginning
	 * of the buffer, i.e. dst_slot < src_slot.
	 */
	for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) {
		ret = tree_mod_log_insert(eb->fs_info, tm_list[i]);
		if (ret)
			goto free_tms;
	}

	ret = tree_mod_log_insert(eb->fs_info, tm);
	if (ret)
		goto free_tms;
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	if (tm_list) {
		for (i = 0; i < nr_items; i++) {
			if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
				rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log);
			kfree(tm_list[i]);
		}
	}
	if (locked)
		write_unlock(&eb->fs_info->tree_mod_log_lock);
	kfree(tm_list);
	kfree(tm);

	return ret;
}

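/*
 * Insert a BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING element for every slot of
 * a buffer, walking from the highest slot down. On failure, the elements
 * already inserted are unlinked from the rb tree again, so the caller can
 * free the whole @tm_list unconditionally.
 */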
static inline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info,
				       struct tree_mod_elem **tm_list,
				       int nritems)
{
	int i, j;
	int ret;

	for (i = nritems - 1; i >= 0; i--) {
		ret = tree_mod_log_insert(fs_info, tm_list[i]);
		if (ret) {
			for (j = nritems - 1; j > i; j--)
				rb_erase(&tm_list[j]->node,
					 &fs_info->tree_mod_log);
			return ret;
		}
	}

	return 0;
}

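/*
 * Log the replacement of the root node @old_root by @new_root. If
 * @log_removal is set and the old root is not a leaf, also log a removal
 * element for every key pointer in it, as if the old root were freed.
 */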
int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root,
				   struct extent_buffer *new_root,
				   bool log_removal)
{
	struct btrfs_fs_info *fs_info = old_root->fs_info;
	struct tree_mod_elem *tm = NULL;
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int ret = 0;
	int i;

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (log_removal && btrfs_header_level(old_root) > 0) {
		nritems = btrfs_header_nritems(old_root);
		tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *),
				  GFP_NOFS);
		if (!tm_list) {
			ret = -ENOMEM;
			goto lock;
		}
		for (i = 0; i < nritems; i++) {
			tm_list[i] = alloc_tree_mod_elem(old_root, i,
			    BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
			if (!tm_list[i]) {
				ret = -ENOMEM;
				goto lock;
			}
		}
	}

	tm = kzalloc(sizeof(*tm), GFP_NOFS);
	if (!tm) {
		ret = -ENOMEM;
		goto lock;
	}

	tm->logical = new_root->start;
	tm->old_root.logical = old_root->start;
	tm->old_root.level = btrfs_header_level(old_root);
	tm->generation = btrfs_header_generation(old_root);
	tm->op = BTRFS_MOD_LOG_ROOT_REPLACE;

lock:
	if (tree_mod_dont_log(fs_info, NULL)) {
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		ret = 0;
		goto free_tms;
	} else if (ret != 0) {
		/*
		 * We previously failed to allocate memory and we need to log,
		 * so we have to fail.
		 */
		goto out_unlock;
	}

	if (tm_list)
		ret = tree_mod_log_free_eb(fs_info, tm_list, nritems);
	if (!ret)
		ret = tree_mod_log_insert(fs_info, tm);

out_unlock:
	write_unlock(&fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return ret;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}
	kfree(tm);

	return ret;
}

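/*
 * Search the log for the element logged for the block at logical address
 * @start: the one with the smallest sequence number that is still >=
 * @min_seq if @smallest is true, or the one with the largest sequence number
 * otherwise. Elements with a sequence number below @min_seq are ignored in
 * both cases.
 */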
static struct tree_mod_elem *__tree_mod_log_search(struct btrfs_fs_info *fs_info,
						   u64 start, u64 min_seq,
						   bool smallest)
{
	struct rb_root *tm_root;
	struct rb_node *node;
	struct tree_mod_elem *cur = NULL;
	struct tree_mod_elem *found = NULL;

	read_lock(&fs_info->tree_mod_log_lock);
	tm_root = &fs_info->tree_mod_log;
	node = tm_root->rb_node;
	while (node) {
		cur = rb_entry(node, struct tree_mod_elem, node);
		if (cur->logical < start) {
			node = node->rb_left;
		} else if (cur->logical > start) {
			node = node->rb_right;
		} else if (cur->seq < min_seq) {
			node = node->rb_left;
		} else if (!smallest) {
			/* We want the node with the highest seq */
			if (found)
				BUG_ON(found->seq > cur->seq);
			found = cur;
			node = node->rb_left;
		} else if (cur->seq > min_seq) {
			/* We want the node with the smallest seq */
			if (found)
				BUG_ON(found->seq < cur->seq);
			found = cur;
			node = node->rb_right;
		} else {
			found = cur;
			break;
		}
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return found;
}

/*
 * This returns the element from the log with the smallest time sequence
 * value that's in the log (the oldest log item). Any element with a time
 * sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info,
							u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, true);
}

/*
 * This returns the element from the log with the largest time sequence
 * value that's in the log (the most recent log item). Any element with
 * a time sequence lower than min_seq will be ignored.
 */
static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info,
						 u64 start, u64 min_seq)
{
	return __tree_mod_log_search(fs_info, start, min_seq, false);
}

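/*
 * Log the copy of @nr_items key pointers from @src (starting at @src_offset)
 * to @dst (starting at @dst_offset). The copy is recorded as a removal of
 * each copied source slot and an addition of each destination slot, plus
 * move elements for the key pointers that are shifted within @dst and @src
 * as part of the operation.
 */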
int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst,
			       struct extent_buffer *src,
			       unsigned long dst_offset,
			       unsigned long src_offset,
			       int nr_items)
{
	struct btrfs_fs_info *fs_info = dst->fs_info;
	int ret = 0;
	struct tree_mod_elem **tm_list = NULL;
	struct tree_mod_elem **tm_list_add = NULL;
	struct tree_mod_elem **tm_list_rem = NULL;
	int i;
	bool locked = false;
	struct tree_mod_elem *dst_move_tm = NULL;
	struct tree_mod_elem *src_move_tm = NULL;
	u32 dst_move_nr_items = btrfs_header_nritems(dst) - dst_offset;
	u32 src_move_nr_items = btrfs_header_nritems(src) - (src_offset + nr_items);

	if (!tree_mod_need_log(fs_info, NULL))
		return 0;

	if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0)
		return 0;

	tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *),
			  GFP_NOFS);
	if (!tm_list) {
		ret = -ENOMEM;
		goto lock;
	}

	if (dst_move_nr_items) {
		dst_move_tm = tree_mod_log_alloc_move(dst, dst_offset + nr_items,
						      dst_offset, dst_move_nr_items);
		if (IS_ERR(dst_move_tm)) {
			ret = PTR_ERR(dst_move_tm);
			dst_move_tm = NULL;
			goto lock;
		}
	}
	if (src_move_nr_items) {
		src_move_tm = tree_mod_log_alloc_move(src, src_offset,
						      src_offset + nr_items,
						      src_move_nr_items);
		if (IS_ERR(src_move_tm)) {
			ret = PTR_ERR(src_move_tm);
			src_move_tm = NULL;
			goto lock;
		}
	}

	tm_list_add = tm_list;
	tm_list_rem = tm_list + nr_items;
	for (i = 0; i < nr_items; i++) {
		tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset,
						     BTRFS_MOD_LOG_KEY_REMOVE);
		if (!tm_list_rem[i]) {
			ret = -ENOMEM;
			goto lock;
		}

		tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset,
						     BTRFS_MOD_LOG_KEY_ADD);
		if (!tm_list_add[i]) {
			ret = -ENOMEM;
			goto lock;
		}
	}

lock:
	if (tree_mod_dont_log(fs_info, NULL)) {
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		ret = 0;
		goto free_tms;
	}
	locked = true;

	/*
	 * We previously failed to allocate memory and we need to log, so we
	 * have to fail.
	 */
	if (ret != 0)
		goto free_tms;

	if (dst_move_tm) {
		ret = tree_mod_log_insert(fs_info, dst_move_tm);
		if (ret)
			goto free_tms;
	}
	for (i = 0; i < nr_items; i++) {
		ret = tree_mod_log_insert(fs_info, tm_list_rem[i]);
		if (ret)
			goto free_tms;
		ret = tree_mod_log_insert(fs_info, tm_list_add[i]);
		if (ret)
			goto free_tms;
	}
	if (src_move_tm) {
		ret = tree_mod_log_insert(fs_info, src_move_tm);
		if (ret)
			goto free_tms;
	}

	write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return 0;

free_tms:
	if (dst_move_tm && !RB_EMPTY_NODE(&dst_move_tm->node))
		rb_erase(&dst_move_tm->node, &fs_info->tree_mod_log);
	kfree(dst_move_tm);
	if (src_move_tm && !RB_EMPTY_NODE(&src_move_tm->node))
		rb_erase(&src_move_tm->node, &fs_info->tree_mod_log);
	kfree(src_move_tm);
	if (tm_list) {
		for (i = 0; i < nr_items * 2; i++) {
			if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node))
				rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log);
			kfree(tm_list[i]);
		}
	}
	if (locked)
		write_unlock(&fs_info->tree_mod_log_lock);
	kfree(tm_list);

	return ret;
}

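/*
 * Log the freeing of @eb by recording a removal element for every key
 * pointer in it.
 */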
int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb)
{
	struct tree_mod_elem **tm_list = NULL;
	int nritems = 0;
	int i;
	int ret = 0;

	if (!tree_mod_need_log(eb->fs_info, eb))
		return 0;

	nritems = btrfs_header_nritems(eb);
	tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS);
	if (!tm_list) {
		ret = -ENOMEM;
		goto lock;
	}

	for (i = 0; i < nritems; i++) {
		tm_list[i] = alloc_tree_mod_elem(eb, i,
				    BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING);
		if (!tm_list[i]) {
			ret = -ENOMEM;
			goto lock;
		}
	}

lock:
	if (tree_mod_dont_log(eb->fs_info, eb)) {
		/*
		 * Don't error if we failed to allocate memory because we don't
		 * need to log.
		 */
		ret = 0;
		goto free_tms;
	} else if (ret != 0) {
		/*
		 * We previously failed to allocate memory and we need to log,
		 * so we have to fail.
		 */
		goto out_unlock;
	}

	ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems);
out_unlock:
	write_unlock(&eb->fs_info->tree_mod_log_lock);
	if (ret)
		goto free_tms;
	kfree(tm_list);

	return 0;

free_tms:
	if (tm_list) {
		for (i = 0; i < nritems; i++)
			kfree(tm_list[i]);
		kfree(tm_list);
	}

	return ret;
}

/*
 * Returns the logical address of the oldest predecessor of the given root.
 * Entries older than time_seq are ignored.
 */
static struct tree_mod_elem *tree_mod_log_oldest_root(struct extent_buffer *eb_root,
						      u64 time_seq)
{
	struct tree_mod_elem *tm;
	struct tree_mod_elem *found = NULL;
	u64 root_logical = eb_root->start;
	bool looped = false;

	if (!time_seq)
		return NULL;

	/*
	 * The very last operation that's logged for a root is the replacement
	 * operation (if it is replaced at all). This has the logical address
	 * of the *new* root, making it the very first operation that's logged
	 * for this root.
	 */
	while (1) {
		tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical,
						time_seq);
		if (!looped && !tm)
			return NULL;
		/*
		 * If there are no tree operations for the oldest root, we
		 * simply return it. This should only happen if that (old)
		 * root is at level 0.
		 */
		if (!tm)
			break;

		/*
		 * If there's an operation that's not a root replacement, we
		 * found the oldest version of our root. Normally, we'll find a
		 * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here.
		 */
		if (tm->op != BTRFS_MOD_LOG_ROOT_REPLACE)
			break;

		found = tm;
		root_logical = tm->old_root.logical;
		looped = true;
	}

	/* If there's no old root to return, return what we found instead */
	if (!found)
		found = tm;

	return found;
}

/*
 * tm is a pointer to the first operation to rewind within eb. Then, all
 * previous operations will be rewound (until we reach something older than
 * time_seq).
 */
static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
				struct extent_buffer *eb,
				u64 time_seq,
				struct tree_mod_elem *first_tm)
{
	u32 n;
	struct rb_node *next;
	struct tree_mod_elem *tm = first_tm;
	unsigned long o_dst;
	unsigned long o_src;
	unsigned long p_size = sizeof(struct btrfs_key_ptr);
	/*
	 * max_slot tracks the maximum valid slot of the rewind eb at every
	 * step of the rewind. This is in contrast with 'n' which eventually
	 * matches the number of items, but can be wrong during moves or if
	 * removes overlap on already valid slots (which is probably separately
	 * a bug). We do this to validate the offsets of memmoves for rewinding
	 * moves and detect invalid memmoves.
	 *
	 * Since a rewind eb can start empty, max_slot is a signed integer with
	 * a special meaning for -1, which is that no slot is valid to move out
	 * of. Any other negative value is invalid.
	 */
	int max_slot;
	int move_src_end_slot;
	int move_dst_end_slot;

	n = btrfs_header_nritems(eb);
	max_slot = n - 1;
	read_lock(&fs_info->tree_mod_log_lock);
	while (tm && tm->seq >= time_seq) {
		ASSERT(max_slot >= -1);
		/*
		 * All the operations are recorded with the operator used for
		 * the modification. As we're going backwards, we do the
		 * opposite of each operation here.
		 */
		switch (tm->op) {
		case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING:
			BUG_ON(tm->slot < n);
			fallthrough;
		case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING:
		case BTRFS_MOD_LOG_KEY_REMOVE:
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			n++;
			if (tm->slot > max_slot)
				max_slot = tm->slot;
			break;
		case BTRFS_MOD_LOG_KEY_REPLACE:
			BUG_ON(tm->slot >= n);
			btrfs_set_node_key(eb, &tm->key, tm->slot);
			btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr);
			btrfs_set_node_ptr_generation(eb, tm->slot,
						      tm->generation);
			break;
		case BTRFS_MOD_LOG_KEY_ADD:
			/*
			 * It is possible we could have already removed keys
			 * behind the known max slot, so this will be an
			 * overestimate. In practice, the copy operation
			 * inserts them in increasing order, and overestimating
			 * just means we miss some warnings, so it's OK. It
			 * isn't worth carefully tracking the full array of
			 * valid slots to check against when moving.
			 */
			if (tm->slot == max_slot)
				max_slot--;
			/* If a move operation is needed it's in the log. */
			n--;
			break;
		case BTRFS_MOD_LOG_MOVE_KEYS:
			ASSERT(tm->move.nr_items > 0);
			move_src_end_slot = tm->move.dst_slot + tm->move.nr_items - 1;
			move_dst_end_slot = tm->slot + tm->move.nr_items - 1;
			o_dst = btrfs_node_key_ptr_offset(eb, tm->slot);
			o_src = btrfs_node_key_ptr_offset(eb, tm->move.dst_slot);
			if (WARN_ON(move_src_end_slot > max_slot ||
				    tm->move.nr_items <= 0)) {
				btrfs_warn(fs_info,
"move from invalid tree mod log slot eb %llu slot %d dst_slot %d nr_items %d seq %llu n %u max_slot %d",
					   eb->start, tm->slot,
					   tm->move.dst_slot, tm->move.nr_items,
					   tm->seq, n, max_slot);
			}
			memmove_extent_buffer(eb, o_dst, o_src,
					      tm->move.nr_items * p_size);
			max_slot = move_dst_end_slot;
			break;
		case BTRFS_MOD_LOG_ROOT_REPLACE:
			/*
			 * This operation is special. For roots, this must be
			 * handled explicitly before rewinding.
			 * For non-roots, this operation may exist if the node
			 * was a root: root A -> child B; then A becomes empty
			 * and B is promoted to the new root. In the mod log,
			 * we'll have a root-replace operation for B, a tree
			 * block that is no longer a root. We simply ignore
			 * that operation.
			 */
			break;
		}
		next = rb_next(&tm->node);
		if (!next)
			break;
		tm = rb_entry(next, struct tree_mod_elem, node);
		if (tm->logical != first_tm->logical)
			break;
	}
	read_unlock(&fs_info->tree_mod_log_lock);
	btrfs_set_header_nritems(eb, n);
}

/*
 * Called with eb read locked. If the buffer cannot be rewound, the same buffer
 * is returned. If rewind operations happen, a fresh buffer is returned. The
 * returned buffer is always read-locked. If the returned buffer is not the
 * input buffer, the lock on the input buffer is released and the input buffer
 * is freed (its refcount is decremented).
 */
struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info,
						struct btrfs_path *path,
						struct extent_buffer *eb,
						u64 time_seq)
{
	struct extent_buffer *eb_rewin;
	struct tree_mod_elem *tm;

	if (!time_seq)
		return eb;

	if (btrfs_header_level(eb) == 0)
		return eb;

	tm = tree_mod_log_search(fs_info, eb->start, time_seq);
	if (!tm)
		return eb;

	if (tm->op == BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		BUG_ON(tm->slot != 0);
		eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start);
		if (!eb_rewin) {
			btrfs_tree_read_unlock(eb);
			free_extent_buffer(eb);
			return NULL;
		}
		btrfs_set_header_bytenr(eb_rewin, eb->start);
		btrfs_set_header_backref_rev(eb_rewin,
					     btrfs_header_backref_rev(eb));
		btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb));
		btrfs_set_header_level(eb_rewin, btrfs_header_level(eb));
	} else {
		eb_rewin = btrfs_clone_extent_buffer(eb);
		if (!eb_rewin) {
			btrfs_tree_read_unlock(eb);
			free_extent_buffer(eb);
			return NULL;
		}
	}

	btrfs_tree_read_unlock(eb);
	free_extent_buffer(eb);

	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin),
				       eb_rewin, btrfs_header_level(eb_rewin));
	btrfs_tree_read_lock(eb_rewin);
	tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm);
	WARN_ON(btrfs_header_nritems(eb_rewin) >
		BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb_rewin;
}

/*
 * Rewind the state of @root's root node to the given @time_seq value.
 * If there are no changes, the current root->root_node is returned. If anything
 * changed in between, there's a fresh buffer allocated on which the rewind
 * operations are done. In any case, the returned buffer is read locked.
 * Returns NULL on error (with no locks held).
 */
struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq)
{
	struct btrfs_fs_info *fs_info = root->fs_info;
	struct tree_mod_elem *tm;
	struct extent_buffer *eb = NULL;
	struct extent_buffer *eb_root;
	u64 eb_root_owner = 0;
	struct extent_buffer *old;
	struct tree_mod_root *old_root = NULL;
	u64 old_generation = 0;
	u64 logical;
	int level;

	eb_root = btrfs_read_lock_root_node(root);
	tm = tree_mod_log_oldest_root(eb_root, time_seq);
	if (!tm)
		return eb_root;

	if (tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) {
		old_root = &tm->old_root;
		old_generation = tm->generation;
		logical = old_root->logical;
		level = old_root->level;
	} else {
		logical = eb_root->start;
		level = btrfs_header_level(eb_root);
	}

	tm = tree_mod_log_search(fs_info, logical, time_seq);
	if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) {
		struct btrfs_tree_parent_check check = { 0 };

		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);

		check.level = level;
		check.owner_root = root->root_key.objectid;

		old = read_tree_block(fs_info, logical, &check);
		if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) {
			if (!IS_ERR(old))
				free_extent_buffer(old);
			btrfs_warn(fs_info,
				   "failed to read tree block %llu from get_old_root",
				   logical);
		} else {
			struct tree_mod_elem *tm2;

			btrfs_tree_read_lock(old);
			eb = btrfs_clone_extent_buffer(old);
			/*
			 * After the lookup for the most recent tree mod operation
			 * above, and before we locked and cloned the extent buffer
			 * 'old', a new tree mod log operation may have been added.
			 * So look up a more recent one, to make sure the number of
			 * mod log operations we replay is consistent with the
			 * number of items we have in the cloned extent buffer,
			 * otherwise we can hit a BUG_ON when rewinding the extent
			 * buffer.
			 */
			tm2 = tree_mod_log_search(fs_info, logical, time_seq);
			btrfs_tree_read_unlock(old);
			free_extent_buffer(old);
			ASSERT(tm2);
			ASSERT(tm2 == tm || tm2->seq > tm->seq);
			if (!tm2 || tm2->seq < tm->seq) {
				free_extent_buffer(eb);
				return NULL;
			}
			tm = tm2;
		}
	} else if (old_root) {
		eb_root_owner = btrfs_header_owner(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
		eb = alloc_dummy_extent_buffer(fs_info, logical);
	} else {
		eb = btrfs_clone_extent_buffer(eb_root);
		btrfs_tree_read_unlock(eb_root);
		free_extent_buffer(eb_root);
	}

	if (!eb)
		return NULL;
	if (old_root) {
		btrfs_set_header_bytenr(eb, eb->start);
		btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV);
		btrfs_set_header_owner(eb, eb_root_owner);
		btrfs_set_header_level(eb, old_root->level);
		btrfs_set_header_generation(eb, old_generation);
	}
	btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb,
				       btrfs_header_level(eb));
	btrfs_tree_read_lock(eb);
	if (tm)
		tree_mod_log_rewind(fs_info, eb, time_seq, tm);
	else
		WARN_ON(btrfs_header_level(eb) != 0);
	WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info));

	return eb;
}

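/*
 * Return the level that the root node of @root had at the time indicated by
 * @time_seq. This may differ from the current level if the root was replaced
 * since then.
 */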
int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq)
{
	struct tree_mod_elem *tm;
	int level;
	struct extent_buffer *eb_root = btrfs_root_node(root);

	tm = tree_mod_log_oldest_root(eb_root, time_seq);
	if (tm && tm->op == BTRFS_MOD_LOG_ROOT_REPLACE)
		level = tm->old_root.level;
	else
		level = btrfs_header_level(eb_root);

	free_extent_buffer(eb_root);

	return level;
}

/*
 * Return the lowest sequence number in the tree modification log.
 *
 * Return the sequence number of the oldest tree modification log user, which
 * corresponds to the lowest sequence number of all existing users. If there are
 * no users it returns 0.
 */
u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info)
{
	u64 ret = 0;

	read_lock(&fs_info->tree_mod_log_lock);
	if (!list_empty(&fs_info->tree_mod_seq_list)) {
		struct btrfs_seq_list *elem;

		elem = list_first_entry(&fs_info->tree_mod_seq_list,
					struct btrfs_seq_list, list);
		ret = elem->seq;
	}
	read_unlock(&fs_info->tree_mod_log_lock);

	return ret;
}