GNU Linux-libre 6.9.1-gnu
[releases.git] / fs / bcachefs / snapshot.c
1 // SPDX-License-Identifier: GPL-2.0
2
3 #include "bcachefs.h"
4 #include "bkey_buf.h"
5 #include "btree_key_cache.h"
6 #include "btree_update.h"
7 #include "buckets.h"
8 #include "errcode.h"
9 #include "error.h"
10 #include "fs.h"
11 #include "recovery_passes.h"
12 #include "snapshot.h"
13
14 #include <linux/random.h>
15
16 /*
17  * Snapshot trees:
18  *
19  * Keys in BTREE_ID_snapshot_trees identify a whole tree of snapshot nodes; they
20  * exist to provide a stable identifier for the whole lifetime of a snapshot
21  * tree.
22  */
23
24 void bch2_snapshot_tree_to_text(struct printbuf *out, struct bch_fs *c,
25                                 struct bkey_s_c k)
26 {
27         struct bkey_s_c_snapshot_tree t = bkey_s_c_to_snapshot_tree(k);
28
29         prt_printf(out, "subvol %u root snapshot %u",
30                    le32_to_cpu(t.v->master_subvol),
31                    le32_to_cpu(t.v->root_snapshot));
32 }
33
34 int bch2_snapshot_tree_invalid(struct bch_fs *c, struct bkey_s_c k,
35                                enum bkey_invalid_flags flags,
36                                struct printbuf *err)
37 {
38         int ret = 0;
39
40         bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
41                          bkey_lt(k.k->p, POS(0, 1)), c, err,
42                          snapshot_tree_pos_bad,
43                          "bad pos");
44 fsck_err:
45         return ret;
46 }
47
48 int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id,
49                               struct bch_snapshot_tree *s)
50 {
51         int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_snapshot_trees, POS(0, id),
52                                           BTREE_ITER_WITH_UPDATES, snapshot_tree, s);
53
54         if (bch2_err_matches(ret, ENOENT))
55                 ret = -BCH_ERR_ENOENT_snapshot_tree;
56         return ret;
57 }
58
59 struct bkey_i_snapshot_tree *
60 __bch2_snapshot_tree_create(struct btree_trans *trans)
61 {
62         struct btree_iter iter;
63         int ret = bch2_bkey_get_empty_slot(trans, &iter,
64                         BTREE_ID_snapshot_trees, POS(0, U32_MAX));
65         struct bkey_i_snapshot_tree *s_t;
66
67         if (ret == -BCH_ERR_ENOSPC_btree_slot)
68                 ret = -BCH_ERR_ENOSPC_snapshot_tree;
69         if (ret)
70                 return ERR_PTR(ret);
71
72         s_t = bch2_bkey_alloc(trans, &iter, 0, snapshot_tree);
73         ret = PTR_ERR_OR_ZERO(s_t);
74         bch2_trans_iter_exit(trans, &iter);
75         return ret ? ERR_PTR(ret) : s_t;
76 }
77
78 static int bch2_snapshot_tree_create(struct btree_trans *trans,
79                                 u32 root_id, u32 subvol_id, u32 *tree_id)
80 {
81         struct bkey_i_snapshot_tree *n_tree =
82                 __bch2_snapshot_tree_create(trans);
83
84         if (IS_ERR(n_tree))
85                 return PTR_ERR(n_tree);
86
87         n_tree->v.master_subvol = cpu_to_le32(subvol_id);
88         n_tree->v.root_snapshot = cpu_to_le32(root_id);
89         *tree_id = n_tree->k.p.offset;
90         return 0;
91 }
92
93 /* Snapshot nodes: */
94
95 static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, u32 ancestor)
96 {
97         while (id && id < ancestor) {
98                 const struct snapshot_t *s = __snapshot_t(t, id);
99                 id = s ? s->parent : 0;
100         }
101         return id == ancestor;
102 }
103
104 static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor)
105 {
106         rcu_read_lock();
107         bool ret = __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor);
108         rcu_read_unlock();
109
110         return ret;
111 }
112
113 static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor)
114 {
115         const struct snapshot_t *s = __snapshot_t(t, id);
116         if (!s)
117                 return 0;
118
119         if (s->skip[2] <= ancestor)
120                 return s->skip[2];
121         if (s->skip[1] <= ancestor)
122                 return s->skip[1];
123         if (s->skip[0] <= ancestor)
124                 return s->skip[0];
125         return s->parent;
126 }
127
128 static bool test_ancestor_bitmap(struct snapshot_table *t, u32 id, u32 ancestor)
129 {
130         const struct snapshot_t *s = __snapshot_t(t, id);
131         if (!s)
132                 return false;
133
134         return test_bit(ancestor - id - 1, s->is_ancestor);
135 }
136
137 bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor)
138 {
139         bool ret;
140
141         rcu_read_lock();
142         struct snapshot_table *t = rcu_dereference(c->snapshots);
143
144         if (unlikely(c->recovery_pass_done < BCH_RECOVERY_PASS_check_snapshots)) {
145                 ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor);
146                 goto out;
147         }
148
149         while (id && id < ancestor - IS_ANCESTOR_BITMAP)
150                 id = get_ancestor_below(t, id, ancestor);
151
152         ret = id && id < ancestor
153                 ? test_ancestor_bitmap(t, id, ancestor)
154                 : id == ancestor;
155
156         EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor));
157 out:
158         rcu_read_unlock();
159
160         return ret;
161 }
162
163 static noinline struct snapshot_t *__snapshot_t_mut(struct bch_fs *c, u32 id)
164 {
165         size_t idx = U32_MAX - id;
166         struct snapshot_table *new, *old;
167
168         size_t new_bytes = kmalloc_size_roundup(struct_size(new, s, idx + 1));
169         size_t new_size = (new_bytes - sizeof(*new)) / sizeof(new->s[0]);
170
171         new = kvzalloc(new_bytes, GFP_KERNEL);
172         if (!new)
173                 return NULL;
174
175         new->nr = new_size;
176
177         old = rcu_dereference_protected(c->snapshots, true);
178         if (old)
179                 memcpy(new->s, old->s, sizeof(old->s[0]) * old->nr);
180
181         rcu_assign_pointer(c->snapshots, new);
182         kvfree_rcu(old, rcu);
183
184         return &rcu_dereference_protected(c->snapshots,
185                                 lockdep_is_held(&c->snapshot_table_lock))->s[idx];
186 }
187
188 static inline struct snapshot_t *snapshot_t_mut(struct bch_fs *c, u32 id)
189 {
190         size_t idx = U32_MAX - id;
191         struct snapshot_table *table =
192                 rcu_dereference_protected(c->snapshots,
193                                 lockdep_is_held(&c->snapshot_table_lock));
194
195         lockdep_assert_held(&c->snapshot_table_lock);
196
197         if (likely(table && idx < table->nr))
198                 return &table->s[idx];
199
200         return __snapshot_t_mut(c, id);
201 }
202
203 void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c,
204                            struct bkey_s_c k)
205 {
206         struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k);
207
208         prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u",
209                BCH_SNAPSHOT_SUBVOL(s.v),
210                BCH_SNAPSHOT_DELETED(s.v),
211                le32_to_cpu(s.v->parent),
212                le32_to_cpu(s.v->children[0]),
213                le32_to_cpu(s.v->children[1]),
214                le32_to_cpu(s.v->subvol),
215                le32_to_cpu(s.v->tree));
216
217         if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, depth))
218                 prt_printf(out, " depth %u skiplist %u %u %u",
219                            le32_to_cpu(s.v->depth),
220                            le32_to_cpu(s.v->skip[0]),
221                            le32_to_cpu(s.v->skip[1]),
222                            le32_to_cpu(s.v->skip[2]));
223 }
224
225 int bch2_snapshot_invalid(struct bch_fs *c, struct bkey_s_c k,
226                           enum bkey_invalid_flags flags,
227                           struct printbuf *err)
228 {
229         struct bkey_s_c_snapshot s;
230         u32 i, id;
231         int ret = 0;
232
233         bkey_fsck_err_on(bkey_gt(k.k->p, POS(0, U32_MAX)) ||
234                          bkey_lt(k.k->p, POS(0, 1)), c, err,
235                          snapshot_pos_bad,
236                          "bad pos");
237
238         s = bkey_s_c_to_snapshot(k);
239
240         id = le32_to_cpu(s.v->parent);
241         bkey_fsck_err_on(id && id <= k.k->p.offset, c, err,
242                          snapshot_parent_bad,
243                          "bad parent node (%u <= %llu)",
244                          id, k.k->p.offset);
245
246         bkey_fsck_err_on(le32_to_cpu(s.v->children[0]) < le32_to_cpu(s.v->children[1]), c, err,
247                          snapshot_children_not_normalized,
248                          "children not normalized");
249
250         bkey_fsck_err_on(s.v->children[0] && s.v->children[0] == s.v->children[1], c, err,
251                          snapshot_child_duplicate,
252                          "duplicate child nodes");
253
254         for (i = 0; i < 2; i++) {
255                 id = le32_to_cpu(s.v->children[i]);
256
257                 bkey_fsck_err_on(id >= k.k->p.offset, c, err,
258                                  snapshot_child_bad,
259                                  "bad child node (%u >= %llu)",
260                                  id, k.k->p.offset);
261         }
262
263         if (bkey_val_bytes(k.k) > offsetof(struct bch_snapshot, skip)) {
264                 bkey_fsck_err_on(le32_to_cpu(s.v->skip[0]) > le32_to_cpu(s.v->skip[1]) ||
265                                  le32_to_cpu(s.v->skip[1]) > le32_to_cpu(s.v->skip[2]), c, err,
266                                  snapshot_skiplist_not_normalized,
267                                  "skiplist not normalized");
268
269                 for (i = 0; i < ARRAY_SIZE(s.v->skip); i++) {
270                         id = le32_to_cpu(s.v->skip[i]);
271
272                         bkey_fsck_err_on(id && id < le32_to_cpu(s.v->parent), c, err,
273                                          snapshot_skiplist_bad,
274                                          "bad skiplist node %u", id);
275                 }
276         }
277 fsck_err:
278         return ret;
279 }
280
281 static void __set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
282 {
283         struct snapshot_t *t = snapshot_t_mut(c, id);
284         u32 parent = id;
285
286         while ((parent = bch2_snapshot_parent_early(c, parent)) &&
287                parent - id - 1 < IS_ANCESTOR_BITMAP)
288                 __set_bit(parent - id - 1, t->is_ancestor);
289 }
290
291 static void set_is_ancestor_bitmap(struct bch_fs *c, u32 id)
292 {
293         mutex_lock(&c->snapshot_table_lock);
294         __set_is_ancestor_bitmap(c, id);
295         mutex_unlock(&c->snapshot_table_lock);
296 }
297
298 static int __bch2_mark_snapshot(struct btree_trans *trans,
299                        enum btree_id btree, unsigned level,
300                        struct bkey_s_c old, struct bkey_s_c new,
301                        unsigned flags)
302 {
303         struct bch_fs *c = trans->c;
304         struct snapshot_t *t;
305         u32 id = new.k->p.offset;
306         int ret = 0;
307
308         mutex_lock(&c->snapshot_table_lock);
309
310         t = snapshot_t_mut(c, id);
311         if (!t) {
312                 ret = -BCH_ERR_ENOMEM_mark_snapshot;
313                 goto err;
314         }
315
316         if (new.k->type == KEY_TYPE_snapshot) {
317                 struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new);
318
319                 t->parent       = le32_to_cpu(s.v->parent);
320                 t->children[0]  = le32_to_cpu(s.v->children[0]);
321                 t->children[1]  = le32_to_cpu(s.v->children[1]);
322                 t->subvol       = BCH_SNAPSHOT_SUBVOL(s.v) ? le32_to_cpu(s.v->subvol) : 0;
323                 t->tree         = le32_to_cpu(s.v->tree);
324
325                 if (bkey_val_bytes(s.k) > offsetof(struct bch_snapshot, depth)) {
326                         t->depth        = le32_to_cpu(s.v->depth);
327                         t->skip[0]      = le32_to_cpu(s.v->skip[0]);
328                         t->skip[1]      = le32_to_cpu(s.v->skip[1]);
329                         t->skip[2]      = le32_to_cpu(s.v->skip[2]);
330                 } else {
331                         t->depth        = 0;
332                         t->skip[0]      = 0;
333                         t->skip[1]      = 0;
334                         t->skip[2]      = 0;
335                 }
336
337                 __set_is_ancestor_bitmap(c, id);
338
339                 if (BCH_SNAPSHOT_DELETED(s.v)) {
340                         set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
341                         if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots)
342                                 bch2_delete_dead_snapshots_async(c);
343                 }
344         } else {
345                 memset(t, 0, sizeof(*t));
346         }
347 err:
348         mutex_unlock(&c->snapshot_table_lock);
349         return ret;
350 }
351
352 int bch2_mark_snapshot(struct btree_trans *trans,
353                        enum btree_id btree, unsigned level,
354                        struct bkey_s_c old, struct bkey_s new,
355                        unsigned flags)
356 {
357         return __bch2_mark_snapshot(trans, btree, level, old, new.s_c, flags);
358 }
359
360 int bch2_snapshot_lookup(struct btree_trans *trans, u32 id,
361                          struct bch_snapshot *s)
362 {
363         return bch2_bkey_get_val_typed(trans, BTREE_ID_snapshots, POS(0, id),
364                                        BTREE_ITER_WITH_UPDATES, snapshot, s);
365 }
366
367 static int bch2_snapshot_live(struct btree_trans *trans, u32 id)
368 {
369         struct bch_snapshot v;
370         int ret;
371
372         if (!id)
373                 return 0;
374
375         ret = bch2_snapshot_lookup(trans, id, &v);
376         if (bch2_err_matches(ret, ENOENT))
377                 bch_err(trans->c, "snapshot node %u not found", id);
378         if (ret)
379                 return ret;
380
381         return !BCH_SNAPSHOT_DELETED(&v);
382 }
383
384 /*
385  * If @k is a snapshot with just one live child, it's part of a linear chain,
386  * which we consider to be an equivalence class: and then after snapshot
387  * deletion cleanup, there should only be a single key at a given position in
388  * this equivalence class.
389  *
390  * This sets the equivalence class of @k to be the child's equivalence class, if
391  * it's part of such a linear chain: this correctly sets equivalence classes on
392  * startup if we run leaf to root (i.e. in natural key order).
393  */
394 static int bch2_snapshot_set_equiv(struct btree_trans *trans, struct bkey_s_c k)
395 {
396         struct bch_fs *c = trans->c;
397         unsigned i, nr_live = 0, live_idx = 0;
398         struct bkey_s_c_snapshot snap;
399         u32 id = k.k->p.offset, child[2];
400
401         if (k.k->type != KEY_TYPE_snapshot)
402                 return 0;
403
404         snap = bkey_s_c_to_snapshot(k);
405
406         child[0] = le32_to_cpu(snap.v->children[0]);
407         child[1] = le32_to_cpu(snap.v->children[1]);
408
409         for (i = 0; i < 2; i++) {
410                 int ret = bch2_snapshot_live(trans, child[i]);
411
412                 if (ret < 0)
413                         return ret;
414
415                 if (ret)
416                         live_idx = i;
417                 nr_live += ret;
418         }
419
420         mutex_lock(&c->snapshot_table_lock);
421
422         snapshot_t_mut(c, id)->equiv = nr_live == 1
423                 ? snapshot_t_mut(c, child[live_idx])->equiv
424                 : id;
425
426         mutex_unlock(&c->snapshot_table_lock);
427
428         return 0;
429 }
430
431 /* fsck: */
432
433 static u32 bch2_snapshot_child(struct bch_fs *c, u32 id, unsigned child)
434 {
435         return snapshot_t(c, id)->children[child];
436 }
437
438 static u32 bch2_snapshot_left_child(struct bch_fs *c, u32 id)
439 {
440         return bch2_snapshot_child(c, id, 0);
441 }
442
443 static u32 bch2_snapshot_right_child(struct bch_fs *c, u32 id)
444 {
445         return bch2_snapshot_child(c, id, 1);
446 }
447
448 static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id)
449 {
450         u32 n, parent;
451
452         n = bch2_snapshot_left_child(c, id);
453         if (n)
454                 return n;
455
456         while ((parent = bch2_snapshot_parent(c, id))) {
457                 n = bch2_snapshot_right_child(c, parent);
458                 if (n && n != id)
459                         return n;
460                 id = parent;
461         }
462
463         return 0;
464 }
465
466 static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root)
467 {
468         u32 id = snapshot_root;
469         u32 subvol = 0, s;
470
471         while (id) {
472                 s = snapshot_t(c, id)->subvol;
473
474                 if (s && (!subvol || s < subvol))
475                         subvol = s;
476
477                 id = bch2_snapshot_tree_next(c, id);
478         }
479
480         return subvol;
481 }
482
483 static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans,
484                                             u32 snapshot_root, u32 *subvol_id)
485 {
486         struct bch_fs *c = trans->c;
487         struct btree_iter iter;
488         struct bkey_s_c k;
489         bool found = false;
490         int ret;
491
492         for_each_btree_key_norestart(trans, iter, BTREE_ID_subvolumes, POS_MIN,
493                                      0, k, ret) {
494                 if (k.k->type != KEY_TYPE_subvolume)
495                         continue;
496
497                 struct bkey_s_c_subvolume s = bkey_s_c_to_subvolume(k);
498                 if (!bch2_snapshot_is_ancestor(c, le32_to_cpu(s.v->snapshot), snapshot_root))
499                         continue;
500                 if (!BCH_SUBVOLUME_SNAP(s.v)) {
501                         *subvol_id = s.k->p.offset;
502                         found = true;
503                         break;
504                 }
505         }
506
507         bch2_trans_iter_exit(trans, &iter);
508
509         if (!ret && !found) {
510                 struct bkey_i_subvolume *u;
511
512                 *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root);
513
514                 u = bch2_bkey_get_mut_typed(trans, &iter,
515                                             BTREE_ID_subvolumes, POS(0, *subvol_id),
516                                             0, subvolume);
517                 ret = PTR_ERR_OR_ZERO(u);
518                 if (ret)
519                         return ret;
520
521                 SET_BCH_SUBVOLUME_SNAP(&u->v, false);
522         }
523
524         return ret;
525 }
526
527 static int check_snapshot_tree(struct btree_trans *trans,
528                                struct btree_iter *iter,
529                                struct bkey_s_c k)
530 {
531         struct bch_fs *c = trans->c;
532         struct bkey_s_c_snapshot_tree st;
533         struct bch_snapshot s;
534         struct bch_subvolume subvol;
535         struct printbuf buf = PRINTBUF;
536         u32 root_id;
537         int ret;
538
539         if (k.k->type != KEY_TYPE_snapshot_tree)
540                 return 0;
541
542         st = bkey_s_c_to_snapshot_tree(k);
543         root_id = le32_to_cpu(st.v->root_snapshot);
544
545         ret = bch2_snapshot_lookup(trans, root_id, &s);
546         if (ret && !bch2_err_matches(ret, ENOENT))
547                 goto err;
548
549         if (fsck_err_on(ret ||
550                         root_id != bch2_snapshot_root(c, root_id) ||
551                         st.k->p.offset != le32_to_cpu(s.tree),
552                         c, snapshot_tree_to_missing_snapshot,
553                         "snapshot tree points to missing/incorrect snapshot:\n  %s",
554                         (bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
555                 ret = bch2_btree_delete_at(trans, iter, 0);
556                 goto err;
557         }
558
559         ret = bch2_subvolume_get(trans, le32_to_cpu(st.v->master_subvol),
560                                  false, 0, &subvol);
561         if (ret && !bch2_err_matches(ret, ENOENT))
562                 goto err;
563
564         if (fsck_err_on(ret,
565                         c, snapshot_tree_to_missing_subvol,
566                         "snapshot tree points to missing subvolume:\n  %s",
567                         (printbuf_reset(&buf),
568                          bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
569             fsck_err_on(!bch2_snapshot_is_ancestor(c,
570                                                 le32_to_cpu(subvol.snapshot),
571                                                 root_id),
572                         c, snapshot_tree_to_wrong_subvol,
573                         "snapshot tree points to subvolume that does not point to snapshot in this tree:\n  %s",
574                         (printbuf_reset(&buf),
575                          bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) ||
576             fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol),
577                         c, snapshot_tree_to_snapshot_subvol,
578                         "snapshot tree points to snapshot subvolume:\n  %s",
579                         (printbuf_reset(&buf),
580                          bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) {
581                 struct bkey_i_snapshot_tree *u;
582                 u32 subvol_id;
583
584                 ret = bch2_snapshot_tree_master_subvol(trans, root_id, &subvol_id);
585                 bch_err_fn(c, ret);
586
587                 if (bch2_err_matches(ret, ENOENT)) { /* nothing to be done here */
588                         ret = 0;
589                         goto err;
590                 }
591
592                 if (ret)
593                         goto err;
594
595                 u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot_tree);
596                 ret = PTR_ERR_OR_ZERO(u);
597                 if (ret)
598                         goto err;
599
600                 u->v.master_subvol = cpu_to_le32(subvol_id);
601                 st = snapshot_tree_i_to_s_c(u);
602         }
603 err:
604 fsck_err:
605         printbuf_exit(&buf);
606         return ret;
607 }
608
609 /*
610  * For each snapshot_tree, make sure it points to the root of a snapshot tree
611  * and that snapshot entry points back to it, or delete it.
612  *
613  * And, make sure it points to a subvolume within that snapshot tree, or correct
614  * it to point to the oldest subvolume within that snapshot tree.
615  */
616 int bch2_check_snapshot_trees(struct bch_fs *c)
617 {
618         int ret = bch2_trans_run(c,
619                 for_each_btree_key_commit(trans, iter,
620                         BTREE_ID_snapshot_trees, POS_MIN,
621                         BTREE_ITER_PREFETCH, k,
622                         NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
623                 check_snapshot_tree(trans, &iter, k)));
624         bch_err_fn(c, ret);
625         return ret;
626 }
627
628 /*
629  * Look up snapshot tree for @tree_id and find root,
630  * make sure @snap_id is a descendent:
631  */
632 static int snapshot_tree_ptr_good(struct btree_trans *trans,
633                                   u32 snap_id, u32 tree_id)
634 {
635         struct bch_snapshot_tree s_t;
636         int ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
637
638         if (bch2_err_matches(ret, ENOENT))
639                 return 0;
640         if (ret)
641                 return ret;
642
643         return bch2_snapshot_is_ancestor_early(trans->c, snap_id, le32_to_cpu(s_t.root_snapshot));
644 }
645
646 u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id)
647 {
648         const struct snapshot_t *s;
649
650         if (!id)
651                 return 0;
652
653         rcu_read_lock();
654         s = snapshot_t(c, id);
655         if (s->parent)
656                 id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth));
657         rcu_read_unlock();
658
659         return id;
660 }
661
662 static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s)
663 {
664         unsigned i;
665
666         for (i = 0; i < 3; i++)
667                 if (!s.parent) {
668                         if (s.skip[i])
669                                 return false;
670                 } else {
671                         if (!bch2_snapshot_is_ancestor_early(trans->c, id, le32_to_cpu(s.skip[i])))
672                                 return false;
673                 }
674
675         return true;
676 }
677
678 /*
679  * snapshot_tree pointer was incorrect: look up root snapshot node, make sure
680  * its snapshot_tree pointer is correct (allocate new one if necessary), then
681  * update this node's pointer to root node's pointer:
682  */
683 static int snapshot_tree_ptr_repair(struct btree_trans *trans,
684                                     struct btree_iter *iter,
685                                     struct bkey_s_c k,
686                                     struct bch_snapshot *s)
687 {
688         struct bch_fs *c = trans->c;
689         struct btree_iter root_iter;
690         struct bch_snapshot_tree s_t;
691         struct bkey_s_c_snapshot root;
692         struct bkey_i_snapshot *u;
693         u32 root_id = bch2_snapshot_root(c, k.k->p.offset), tree_id;
694         int ret;
695
696         root = bch2_bkey_get_iter_typed(trans, &root_iter,
697                                BTREE_ID_snapshots, POS(0, root_id),
698                                BTREE_ITER_WITH_UPDATES, snapshot);
699         ret = bkey_err(root);
700         if (ret)
701                 goto err;
702
703         tree_id = le32_to_cpu(root.v->tree);
704
705         ret = bch2_snapshot_tree_lookup(trans, tree_id, &s_t);
706         if (ret && !bch2_err_matches(ret, ENOENT))
707                 return ret;
708
709         if (ret || le32_to_cpu(s_t.root_snapshot) != root_id) {
710                 u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot);
711                 ret =   PTR_ERR_OR_ZERO(u) ?:
712                         bch2_snapshot_tree_create(trans, root_id,
713                                 bch2_snapshot_tree_oldest_subvol(c, root_id),
714                                 &tree_id);
715                 if (ret)
716                         goto err;
717
718                 u->v.tree = cpu_to_le32(tree_id);
719                 if (k.k->p.offset == root_id)
720                         *s = u->v;
721         }
722
723         if (k.k->p.offset != root_id) {
724                 u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
725                 ret = PTR_ERR_OR_ZERO(u);
726                 if (ret)
727                         goto err;
728
729                 u->v.tree = cpu_to_le32(tree_id);
730                 *s = u->v;
731         }
732 err:
733         bch2_trans_iter_exit(trans, &root_iter);
734         return ret;
735 }
736
737 static int check_snapshot(struct btree_trans *trans,
738                           struct btree_iter *iter,
739                           struct bkey_s_c k)
740 {
741         struct bch_fs *c = trans->c;
742         struct bch_snapshot s;
743         struct bch_subvolume subvol;
744         struct bch_snapshot v;
745         struct bkey_i_snapshot *u;
746         u32 parent_id = bch2_snapshot_parent_early(c, k.k->p.offset);
747         u32 real_depth;
748         struct printbuf buf = PRINTBUF;
749         u32 i, id;
750         int ret = 0;
751
752         if (k.k->type != KEY_TYPE_snapshot)
753                 return 0;
754
755         memset(&s, 0, sizeof(s));
756         memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k)));
757
758         id = le32_to_cpu(s.parent);
759         if (id) {
760                 ret = bch2_snapshot_lookup(trans, id, &v);
761                 if (bch2_err_matches(ret, ENOENT))
762                         bch_err(c, "snapshot with nonexistent parent:\n  %s",
763                                 (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
764                 if (ret)
765                         goto err;
766
767                 if (le32_to_cpu(v.children[0]) != k.k->p.offset &&
768                     le32_to_cpu(v.children[1]) != k.k->p.offset) {
769                         bch_err(c, "snapshot parent %u missing pointer to child %llu",
770                                 id, k.k->p.offset);
771                         ret = -EINVAL;
772                         goto err;
773                 }
774         }
775
776         for (i = 0; i < 2 && s.children[i]; i++) {
777                 id = le32_to_cpu(s.children[i]);
778
779                 ret = bch2_snapshot_lookup(trans, id, &v);
780                 if (bch2_err_matches(ret, ENOENT))
781                         bch_err(c, "snapshot node %llu has nonexistent child %u",
782                                 k.k->p.offset, id);
783                 if (ret)
784                         goto err;
785
786                 if (le32_to_cpu(v.parent) != k.k->p.offset) {
787                         bch_err(c, "snapshot child %u has wrong parent (got %u should be %llu)",
788                                 id, le32_to_cpu(v.parent), k.k->p.offset);
789                         ret = -EINVAL;
790                         goto err;
791                 }
792         }
793
794         bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) &&
795                 !BCH_SNAPSHOT_DELETED(&s);
796
797         if (should_have_subvol) {
798                 id = le32_to_cpu(s.subvol);
799                 ret = bch2_subvolume_get(trans, id, 0, false, &subvol);
800                 if (bch2_err_matches(ret, ENOENT))
801                         bch_err(c, "snapshot points to nonexistent subvolume:\n  %s",
802                                 (bch2_bkey_val_to_text(&buf, c, k), buf.buf));
803                 if (ret)
804                         goto err;
805
806                 if (BCH_SNAPSHOT_SUBVOL(&s) != (le32_to_cpu(subvol.snapshot) == k.k->p.offset)) {
807                         bch_err(c, "snapshot node %llu has wrong BCH_SNAPSHOT_SUBVOL",
808                                 k.k->p.offset);
809                         ret = -EINVAL;
810                         goto err;
811                 }
812         } else {
813                 if (fsck_err_on(s.subvol,
814                                 c, snapshot_should_not_have_subvol,
815                                 "snapshot should not point to subvol:\n  %s",
816                                 (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
817                         u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
818                         ret = PTR_ERR_OR_ZERO(u);
819                         if (ret)
820                                 goto err;
821
822                         u->v.subvol = 0;
823                         s = u->v;
824                 }
825         }
826
827         ret = snapshot_tree_ptr_good(trans, k.k->p.offset, le32_to_cpu(s.tree));
828         if (ret < 0)
829                 goto err;
830
831         if (fsck_err_on(!ret, c, snapshot_to_bad_snapshot_tree,
832                         "snapshot points to missing/incorrect tree:\n  %s",
833                         (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
834                 ret = snapshot_tree_ptr_repair(trans, iter, k, &s);
835                 if (ret)
836                         goto err;
837         }
838         ret = 0;
839
840         real_depth = bch2_snapshot_depth(c, parent_id);
841
842         if (fsck_err_on(le32_to_cpu(s.depth) != real_depth,
843                         c, snapshot_bad_depth,
844                         "snapshot with incorrect depth field, should be %u:\n  %s",
845                         real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
846                 u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
847                 ret = PTR_ERR_OR_ZERO(u);
848                 if (ret)
849                         goto err;
850
851                 u->v.depth = cpu_to_le32(real_depth);
852                 s = u->v;
853         }
854
855         ret = snapshot_skiplist_good(trans, k.k->p.offset, s);
856         if (ret < 0)
857                 goto err;
858
859         if (fsck_err_on(!ret, c, snapshot_bad_skiplist,
860                         "snapshot with bad skiplist field:\n  %s",
861                         (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) {
862                 u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot);
863                 ret = PTR_ERR_OR_ZERO(u);
864                 if (ret)
865                         goto err;
866
867                 for (i = 0; i < ARRAY_SIZE(u->v.skip); i++)
868                         u->v.skip[i] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent_id));
869
870                 bubble_sort(u->v.skip, ARRAY_SIZE(u->v.skip), cmp_le32);
871                 s = u->v;
872         }
873         ret = 0;
874 err:
875 fsck_err:
876         printbuf_exit(&buf);
877         return ret;
878 }
879
880 int bch2_check_snapshots(struct bch_fs *c)
881 {
882         /*
883          * We iterate backwards as checking/fixing the depth field requires that
884          * the parent's depth already be correct:
885          */
886         int ret = bch2_trans_run(c,
887                 for_each_btree_key_reverse_commit(trans, iter,
888                                 BTREE_ID_snapshots, POS_MAX,
889                                 BTREE_ITER_PREFETCH, k,
890                                 NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
891                         check_snapshot(trans, &iter, k)));
892         bch_err_fn(c, ret);
893         return ret;
894 }
895
896 static int check_snapshot_exists(struct btree_trans *trans, u32 id)
897 {
898         struct bch_fs *c = trans->c;
899
900         if (bch2_snapshot_equiv(c, id))
901                 return 0;
902
903         u32 tree_id;
904         int ret = bch2_snapshot_tree_create(trans, id, 0, &tree_id);
905         if (ret)
906                 return ret;
907
908         struct bkey_i_snapshot *snapshot = bch2_trans_kmalloc(trans, sizeof(*snapshot));
909         ret = PTR_ERR_OR_ZERO(snapshot);
910         if (ret)
911                 return ret;
912
913         bkey_snapshot_init(&snapshot->k_i);
914         snapshot->k.p           = POS(0, id);
915         snapshot->v.tree        = cpu_to_le32(tree_id);
916         snapshot->v.btime.lo    = cpu_to_le64(bch2_current_time(c));
917
918         return  bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?:
919                 bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
920                                    bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0) ?:
921                 bch2_snapshot_set_equiv(trans, bkey_i_to_s_c(&snapshot->k_i));
922 }
923
924 /* Figure out which snapshot nodes belong in the same tree: */
925 struct snapshot_tree_reconstruct {
926         enum btree_id                   btree;
927         struct bpos                     cur_pos;
928         snapshot_id_list                cur_ids;
929         DARRAY(snapshot_id_list)        trees;
930 };
931
932 static void snapshot_tree_reconstruct_exit(struct snapshot_tree_reconstruct *r)
933 {
934         darray_for_each(r->trees, i)
935                 darray_exit(i);
936         darray_exit(&r->trees);
937         darray_exit(&r->cur_ids);
938 }
939
940 static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpos pos)
941 {
942         return r->btree == BTREE_ID_inodes
943                 ? r->cur_pos.offset == pos.offset
944                 : r->cur_pos.inode == pos.inode;
945 }
946
947 static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r)
948 {
949         darray_for_each(*l, i)
950                 if (snapshot_list_has_id(r, *i))
951                         return true;
952         return false;
953 }
954
955 static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s)
956 {
957         bool first = true;
958         darray_for_each(*s, i) {
959                 if (!first)
960                         prt_char(out, ' ');
961                 first = false;
962                 prt_printf(out, "%u", *i);
963         }
964 }
965
966 static int snapshot_tree_reconstruct_next(struct bch_fs *c, struct snapshot_tree_reconstruct *r)
967 {
968         if (r->cur_ids.nr) {
969                 darray_for_each(r->trees, i)
970                         if (snapshot_id_lists_have_common(i, &r->cur_ids)) {
971                                 int ret = snapshot_list_merge(c, i, &r->cur_ids);
972                                 if (ret)
973                                         return ret;
974                                 goto out;
975                         }
976                 darray_push(&r->trees, r->cur_ids);
977                 darray_init(&r->cur_ids);
978         }
979 out:
980         r->cur_ids.nr = 0;
981         return 0;
982 }
983
984 static int get_snapshot_trees(struct bch_fs *c, struct snapshot_tree_reconstruct *r, struct bpos pos)
985 {
986         if (!same_snapshot(r, pos))
987                 snapshot_tree_reconstruct_next(c, r);
988         r->cur_pos = pos;
989         return snapshot_list_add_nodup(c, &r->cur_ids, pos.snapshot);
990 }
991
992 int bch2_reconstruct_snapshots(struct bch_fs *c)
993 {
994         struct btree_trans *trans = bch2_trans_get(c);
995         struct printbuf buf = PRINTBUF;
996         struct snapshot_tree_reconstruct r = {};
997         int ret = 0;
998
999         for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) {
1000                 if (btree_type_has_snapshots(btree)) {
1001                         r.btree = btree;
1002
1003                         ret = for_each_btree_key(trans, iter, btree, POS_MIN,
1004                                         BTREE_ITER_ALL_SNAPSHOTS|BTREE_ITER_PREFETCH, k, ({
1005                                 get_snapshot_trees(c, &r, k.k->p);
1006                         }));
1007                         if (ret)
1008                                 goto err;
1009
1010                         snapshot_tree_reconstruct_next(c, &r);
1011                 }
1012         }
1013
1014         darray_for_each(r.trees, t) {
1015                 printbuf_reset(&buf);
1016                 snapshot_id_list_to_text(&buf, t);
1017
1018                 darray_for_each(*t, id) {
1019                         if (fsck_err_on(!bch2_snapshot_equiv(c, *id),
1020                                         c, snapshot_node_missing,
1021                                         "snapshot node %u from tree %s missing", *id, buf.buf)) {
1022                                 if (t->nr > 1) {
1023                                         bch_err(c, "cannot reconstruct snapshot trees with multiple nodes");
1024                                         ret = -BCH_ERR_fsck_repair_unimplemented;
1025                                         goto err;
1026                                 }
1027
1028                                 ret = commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1029                                                 check_snapshot_exists(trans, *id));
1030                                 if (ret)
1031                                         goto err;
1032                         }
1033                 }
1034         }
1035 fsck_err:
1036 err:
1037         bch2_trans_put(trans);
1038         snapshot_tree_reconstruct_exit(&r);
1039         printbuf_exit(&buf);
1040         bch_err_fn(c, ret);
1041         return ret;
1042 }
1043
1044 /*
1045  * Mark a snapshot as deleted, for future cleanup:
1046  */
1047 int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id)
1048 {
1049         struct btree_iter iter;
1050         struct bkey_i_snapshot *s;
1051         int ret = 0;
1052
1053         s = bch2_bkey_get_mut_typed(trans, &iter,
1054                                     BTREE_ID_snapshots, POS(0, id),
1055                                     0, snapshot);
1056         ret = PTR_ERR_OR_ZERO(s);
1057         if (unlikely(ret)) {
1058                 bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT),
1059                                         trans->c, "missing snapshot %u", id);
1060                 return ret;
1061         }
1062
1063         /* already deleted? */
1064         if (BCH_SNAPSHOT_DELETED(&s->v))
1065                 goto err;
1066
1067         SET_BCH_SNAPSHOT_DELETED(&s->v, true);
1068         SET_BCH_SNAPSHOT_SUBVOL(&s->v, false);
1069         s->v.subvol = 0;
1070 err:
1071         bch2_trans_iter_exit(trans, &iter);
1072         return ret;
1073 }
1074
1075 static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s)
1076 {
1077         if (le32_to_cpu(s->children[0]) < le32_to_cpu(s->children[1]))
1078                 swap(s->children[0], s->children[1]);
1079 }
1080
1081 static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id)
1082 {
1083         struct bch_fs *c = trans->c;
1084         struct btree_iter iter, p_iter = (struct btree_iter) { NULL };
1085         struct btree_iter c_iter = (struct btree_iter) { NULL };
1086         struct btree_iter tree_iter = (struct btree_iter) { NULL };
1087         struct bkey_s_c_snapshot s;
1088         u32 parent_id, child_id;
1089         unsigned i;
1090         int ret = 0;
1091
1092         s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id),
1093                                      BTREE_ITER_INTENT, snapshot);
1094         ret = bkey_err(s);
1095         bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
1096                                 "missing snapshot %u", id);
1097
1098         if (ret)
1099                 goto err;
1100
1101         BUG_ON(s.v->children[1]);
1102
1103         parent_id = le32_to_cpu(s.v->parent);
1104         child_id = le32_to_cpu(s.v->children[0]);
1105
1106         if (parent_id) {
1107                 struct bkey_i_snapshot *parent;
1108
1109                 parent = bch2_bkey_get_mut_typed(trans, &p_iter,
1110                                      BTREE_ID_snapshots, POS(0, parent_id),
1111                                      0, snapshot);
1112                 ret = PTR_ERR_OR_ZERO(parent);
1113                 bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
1114                                         "missing snapshot %u", parent_id);
1115                 if (unlikely(ret))
1116                         goto err;
1117
1118                 /* find entry in parent->children for node being deleted */
1119                 for (i = 0; i < 2; i++)
1120                         if (le32_to_cpu(parent->v.children[i]) == id)
1121                                 break;
1122
1123                 if (bch2_fs_inconsistent_on(i == 2, c,
1124                                         "snapshot %u missing child pointer to %u",
1125                                         parent_id, id))
1126                         goto err;
1127
1128                 parent->v.children[i] = cpu_to_le32(child_id);
1129
1130                 normalize_snapshot_child_pointers(&parent->v);
1131         }
1132
1133         if (child_id) {
1134                 struct bkey_i_snapshot *child;
1135
1136                 child = bch2_bkey_get_mut_typed(trans, &c_iter,
1137                                      BTREE_ID_snapshots, POS(0, child_id),
1138                                      0, snapshot);
1139                 ret = PTR_ERR_OR_ZERO(child);
1140                 bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c,
1141                                         "missing snapshot %u", child_id);
1142                 if (unlikely(ret))
1143                         goto err;
1144
1145                 child->v.parent = cpu_to_le32(parent_id);
1146
1147                 if (!child->v.parent) {
1148                         child->v.skip[0] = 0;
1149                         child->v.skip[1] = 0;
1150                         child->v.skip[2] = 0;
1151                 }
1152         }
1153
1154         if (!parent_id) {
1155                 /*
1156                  * We're deleting the root of a snapshot tree: update the
1157                  * snapshot_tree entry to point to the new root, or delete it if
1158                  * this is the last snapshot ID in this tree:
1159                  */
1160                 struct bkey_i_snapshot_tree *s_t;
1161
1162                 BUG_ON(s.v->children[1]);
1163
1164                 s_t = bch2_bkey_get_mut_typed(trans, &tree_iter,
1165                                 BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)),
1166                                 0, snapshot_tree);
1167                 ret = PTR_ERR_OR_ZERO(s_t);
1168                 if (ret)
1169                         goto err;
1170
1171                 if (s.v->children[0]) {
1172                         s_t->v.root_snapshot = s.v->children[0];
1173                 } else {
1174                         s_t->k.type = KEY_TYPE_deleted;
1175                         set_bkey_val_u64s(&s_t->k, 0);
1176                 }
1177         }
1178
1179         ret = bch2_btree_delete_at(trans, &iter, 0);
1180 err:
1181         bch2_trans_iter_exit(trans, &tree_iter);
1182         bch2_trans_iter_exit(trans, &p_iter);
1183         bch2_trans_iter_exit(trans, &c_iter);
1184         bch2_trans_iter_exit(trans, &iter);
1185         return ret;
1186 }
1187
1188 static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree,
1189                           u32 *new_snapids,
1190                           u32 *snapshot_subvols,
1191                           unsigned nr_snapids)
1192 {
1193         struct bch_fs *c = trans->c;
1194         struct btree_iter iter;
1195         struct bkey_i_snapshot *n;
1196         struct bkey_s_c k;
1197         unsigned i, j;
1198         u32 depth = bch2_snapshot_depth(c, parent);
1199         int ret;
1200
1201         bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots,
1202                              POS_MIN, BTREE_ITER_INTENT);
1203         k = bch2_btree_iter_peek(&iter);
1204         ret = bkey_err(k);
1205         if (ret)
1206                 goto err;
1207
1208         for (i = 0; i < nr_snapids; i++) {
1209                 k = bch2_btree_iter_prev_slot(&iter);
1210                 ret = bkey_err(k);
1211                 if (ret)
1212                         goto err;
1213
1214                 if (!k.k || !k.k->p.offset) {
1215                         ret = -BCH_ERR_ENOSPC_snapshot_create;
1216                         goto err;
1217                 }
1218
1219                 n = bch2_bkey_alloc(trans, &iter, 0, snapshot);
1220                 ret = PTR_ERR_OR_ZERO(n);
1221                 if (ret)
1222                         goto err;
1223
1224                 n->v.flags      = 0;
1225                 n->v.parent     = cpu_to_le32(parent);
1226                 n->v.subvol     = cpu_to_le32(snapshot_subvols[i]);
1227                 n->v.tree       = cpu_to_le32(tree);
1228                 n->v.depth      = cpu_to_le32(depth);
1229                 n->v.btime.lo   = cpu_to_le64(bch2_current_time(c));
1230                 n->v.btime.hi   = 0;
1231
1232                 for (j = 0; j < ARRAY_SIZE(n->v.skip); j++)
1233                         n->v.skip[j] = cpu_to_le32(bch2_snapshot_skiplist_get(c, parent));
1234
1235                 bubble_sort(n->v.skip, ARRAY_SIZE(n->v.skip), cmp_le32);
1236                 SET_BCH_SNAPSHOT_SUBVOL(&n->v, true);
1237
1238                 ret = __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0,
1239                                          bkey_s_c_null, bkey_i_to_s_c(&n->k_i), 0);
1240                 if (ret)
1241                         goto err;
1242
1243                 new_snapids[i]  = iter.pos.offset;
1244
1245                 mutex_lock(&c->snapshot_table_lock);
1246                 snapshot_t_mut(c, new_snapids[i])->equiv = new_snapids[i];
1247                 mutex_unlock(&c->snapshot_table_lock);
1248         }
1249 err:
1250         bch2_trans_iter_exit(trans, &iter);
1251         return ret;
1252 }
1253
1254 /*
1255  * Create new snapshot IDs as children of an existing snapshot ID:
1256  */
1257 static int bch2_snapshot_node_create_children(struct btree_trans *trans, u32 parent,
1258                               u32 *new_snapids,
1259                               u32 *snapshot_subvols,
1260                               unsigned nr_snapids)
1261 {
1262         struct btree_iter iter;
1263         struct bkey_i_snapshot *n_parent;
1264         int ret = 0;
1265
1266         n_parent = bch2_bkey_get_mut_typed(trans, &iter,
1267                         BTREE_ID_snapshots, POS(0, parent),
1268                         0, snapshot);
1269         ret = PTR_ERR_OR_ZERO(n_parent);
1270         if (unlikely(ret)) {
1271                 if (bch2_err_matches(ret, ENOENT))
1272                         bch_err(trans->c, "snapshot %u not found", parent);
1273                 return ret;
1274         }
1275
1276         if (n_parent->v.children[0] || n_parent->v.children[1]) {
1277                 bch_err(trans->c, "Trying to add child snapshot nodes to parent that already has children");
1278                 ret = -EINVAL;
1279                 goto err;
1280         }
1281
1282         ret = create_snapids(trans, parent, le32_to_cpu(n_parent->v.tree),
1283                              new_snapids, snapshot_subvols, nr_snapids);
1284         if (ret)
1285                 goto err;
1286
1287         n_parent->v.children[0] = cpu_to_le32(new_snapids[0]);
1288         n_parent->v.children[1] = cpu_to_le32(new_snapids[1]);
1289         n_parent->v.subvol = 0;
1290         SET_BCH_SNAPSHOT_SUBVOL(&n_parent->v, false);
1291 err:
1292         bch2_trans_iter_exit(trans, &iter);
1293         return ret;
1294 }
1295
1296 /*
1297  * Create a snapshot node that is the root of a new tree:
1298  */
1299 static int bch2_snapshot_node_create_tree(struct btree_trans *trans,
1300                               u32 *new_snapids,
1301                               u32 *snapshot_subvols,
1302                               unsigned nr_snapids)
1303 {
1304         struct bkey_i_snapshot_tree *n_tree;
1305         int ret;
1306
1307         n_tree = __bch2_snapshot_tree_create(trans);
1308         ret =   PTR_ERR_OR_ZERO(n_tree) ?:
1309                 create_snapids(trans, 0, n_tree->k.p.offset,
1310                              new_snapids, snapshot_subvols, nr_snapids);
1311         if (ret)
1312                 return ret;
1313
1314         n_tree->v.master_subvol = cpu_to_le32(snapshot_subvols[0]);
1315         n_tree->v.root_snapshot = cpu_to_le32(new_snapids[0]);
1316         return 0;
1317 }
1318
1319 int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent,
1320                               u32 *new_snapids,
1321                               u32 *snapshot_subvols,
1322                               unsigned nr_snapids)
1323 {
1324         BUG_ON((parent == 0) != (nr_snapids == 1));
1325         BUG_ON((parent != 0) != (nr_snapids == 2));
1326
1327         return parent
1328                 ? bch2_snapshot_node_create_children(trans, parent,
1329                                 new_snapids, snapshot_subvols, nr_snapids)
1330                 : bch2_snapshot_node_create_tree(trans,
1331                                 new_snapids, snapshot_subvols, nr_snapids);
1332
1333 }
1334
1335 /*
1336  * If we have an unlinked inode in an internal snapshot node, and the inode
1337  * really has been deleted in all child snapshots, how does this get cleaned up?
1338  *
1339  * first there is the problem of how keys that have been overwritten in all
1340  * child snapshots get deleted (unimplemented?), but inodes may perhaps be
1341  * special?
1342  *
1343  * also: unlinked inode in internal snapshot appears to not be getting deleted
1344  * correctly if inode doesn't exist in leaf snapshots
1345  *
1346  * solution:
1347  *
1348  * for a key in an interior snapshot node that needs work to be done that
1349  * requires it to be mutated: iterate over all descendent leaf nodes and copy
1350  * that key to snapshot leaf nodes, where we can mutate it
1351  */
1352
1353 static int snapshot_delete_key(struct btree_trans *trans,
1354                                struct btree_iter *iter,
1355                                struct bkey_s_c k,
1356                                snapshot_id_list *deleted,
1357                                snapshot_id_list *equiv_seen,
1358                                struct bpos *last_pos)
1359 {
1360         struct bch_fs *c = trans->c;
1361         u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
1362
1363         if (!bkey_eq(k.k->p, *last_pos))
1364                 equiv_seen->nr = 0;
1365         *last_pos = k.k->p;
1366
1367         if (snapshot_list_has_id(deleted, k.k->p.snapshot) ||
1368             snapshot_list_has_id(equiv_seen, equiv)) {
1369                 return bch2_btree_delete_at(trans, iter,
1370                                             BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1371         } else {
1372                 return snapshot_list_add(c, equiv_seen, equiv);
1373         }
1374 }
1375
1376 static int move_key_to_correct_snapshot(struct btree_trans *trans,
1377                                struct btree_iter *iter,
1378                                struct bkey_s_c k)
1379 {
1380         struct bch_fs *c = trans->c;
1381         u32 equiv = bch2_snapshot_equiv(c, k.k->p.snapshot);
1382
1383         /*
1384          * When we have a linear chain of snapshot nodes, we consider
1385          * those to form an equivalence class: we're going to collapse
1386          * them all down to a single node, and keep the leaf-most node -
1387          * which has the same id as the equivalence class id.
1388          *
1389          * If there are multiple keys in different snapshots at the same
1390          * position, we're only going to keep the one in the newest
1391          * snapshot - the rest have been overwritten and are redundant,
1392          * and for the key we're going to keep we need to move it to the
1393          * equivalance class ID if it's not there already.
1394          */
1395         if (equiv != k.k->p.snapshot) {
1396                 struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k);
1397                 struct btree_iter new_iter;
1398                 int ret;
1399
1400                 ret = PTR_ERR_OR_ZERO(new);
1401                 if (ret)
1402                         return ret;
1403
1404                 new->k.p.snapshot = equiv;
1405
1406                 bch2_trans_iter_init(trans, &new_iter, iter->btree_id, new->k.p,
1407                                      BTREE_ITER_ALL_SNAPSHOTS|
1408                                      BTREE_ITER_CACHED|
1409                                      BTREE_ITER_INTENT);
1410
1411                 ret =   bch2_btree_iter_traverse(&new_iter) ?:
1412                         bch2_trans_update(trans, &new_iter, new,
1413                                         BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE) ?:
1414                         bch2_btree_delete_at(trans, iter,
1415                                         BTREE_UPDATE_INTERNAL_SNAPSHOT_NODE);
1416                 bch2_trans_iter_exit(trans, &new_iter);
1417                 if (ret)
1418                         return ret;
1419         }
1420
1421         return 0;
1422 }
1423
1424 static int bch2_snapshot_needs_delete(struct btree_trans *trans, struct bkey_s_c k)
1425 {
1426         struct bkey_s_c_snapshot snap;
1427         u32 children[2];
1428         int ret;
1429
1430         if (k.k->type != KEY_TYPE_snapshot)
1431                 return 0;
1432
1433         snap = bkey_s_c_to_snapshot(k);
1434         if (BCH_SNAPSHOT_DELETED(snap.v) ||
1435             BCH_SNAPSHOT_SUBVOL(snap.v))
1436                 return 0;
1437
1438         children[0] = le32_to_cpu(snap.v->children[0]);
1439         children[1] = le32_to_cpu(snap.v->children[1]);
1440
1441         ret   = bch2_snapshot_live(trans, children[0]) ?:
1442                 bch2_snapshot_live(trans, children[1]);
1443         if (ret < 0)
1444                 return ret;
1445         return !ret;
1446 }
1447
1448 /*
1449  * For a given snapshot, if it doesn't have a subvolume that points to it, and
1450  * it doesn't have child snapshot nodes - it's now redundant and we can mark it
1451  * as deleted.
1452  */
1453 static int bch2_delete_redundant_snapshot(struct btree_trans *trans, struct bkey_s_c k)
1454 {
1455         int ret = bch2_snapshot_needs_delete(trans, k);
1456
1457         return ret <= 0
1458                 ? ret
1459                 : bch2_snapshot_node_set_deleted(trans, k.k->p.offset);
1460 }
1461
1462 static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n,
1463                                                 snapshot_id_list *skip)
1464 {
1465         rcu_read_lock();
1466         while (snapshot_list_has_id(skip, id))
1467                 id = __bch2_snapshot_parent(c, id);
1468
1469         while (n--) {
1470                 do {
1471                         id = __bch2_snapshot_parent(c, id);
1472                 } while (snapshot_list_has_id(skip, id));
1473         }
1474         rcu_read_unlock();
1475
1476         return id;
1477 }
1478
1479 static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans,
1480                                               struct btree_iter *iter, struct bkey_s_c k,
1481                                               snapshot_id_list *deleted)
1482 {
1483         struct bch_fs *c = trans->c;
1484         u32 nr_deleted_ancestors = 0;
1485         struct bkey_i_snapshot *s;
1486         int ret;
1487
1488         if (k.k->type != KEY_TYPE_snapshot)
1489                 return 0;
1490
1491         if (snapshot_list_has_id(deleted, k.k->p.offset))
1492                 return 0;
1493
1494         s = bch2_bkey_make_mut_noupdate_typed(trans, k, snapshot);
1495         ret = PTR_ERR_OR_ZERO(s);
1496         if (ret)
1497                 return ret;
1498
1499         darray_for_each(*deleted, i)
1500                 nr_deleted_ancestors += bch2_snapshot_is_ancestor(c, s->k.p.offset, *i);
1501
1502         if (!nr_deleted_ancestors)
1503                 return 0;
1504
1505         le32_add_cpu(&s->v.depth, -nr_deleted_ancestors);
1506
1507         if (!s->v.depth) {
1508                 s->v.skip[0] = 0;
1509                 s->v.skip[1] = 0;
1510                 s->v.skip[2] = 0;
1511         } else {
1512                 u32 depth = le32_to_cpu(s->v.depth);
1513                 u32 parent = bch2_snapshot_parent(c, s->k.p.offset);
1514
1515                 for (unsigned j = 0; j < ARRAY_SIZE(s->v.skip); j++) {
1516                         u32 id = le32_to_cpu(s->v.skip[j]);
1517
1518                         if (snapshot_list_has_id(deleted, id)) {
1519                                 id = bch2_snapshot_nth_parent_skip(c,
1520                                                         parent,
1521                                                         depth > 1
1522                                                         ? get_random_u32_below(depth - 1)
1523                                                         : 0,
1524                                                         deleted);
1525                                 s->v.skip[j] = cpu_to_le32(id);
1526                         }
1527                 }
1528
1529                 bubble_sort(s->v.skip, ARRAY_SIZE(s->v.skip), cmp_le32);
1530         }
1531
1532         return bch2_trans_update(trans, iter, &s->k_i, 0);
1533 }
1534
1535 int bch2_delete_dead_snapshots(struct bch_fs *c)
1536 {
1537         struct btree_trans *trans;
1538         snapshot_id_list deleted = { 0 };
1539         snapshot_id_list deleted_interior = { 0 };
1540         u32 id;
1541         int ret = 0;
1542
1543         if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags))
1544                 return 0;
1545
1546         if (!test_bit(BCH_FS_started, &c->flags)) {
1547                 ret = bch2_fs_read_write_early(c);
1548                 bch_err_msg(c, ret, "deleting dead snapshots: error going rw");
1549                 if (ret)
1550                         return ret;
1551         }
1552
1553         trans = bch2_trans_get(c);
1554
1555         /*
1556          * For every snapshot node: If we have no live children and it's not
1557          * pointed to by a subvolume, delete it:
1558          */
1559         ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots,
1560                         POS_MIN, 0, k,
1561                         NULL, NULL, 0,
1562                 bch2_delete_redundant_snapshot(trans, k));
1563         bch_err_msg(c, ret, "deleting redundant snapshots");
1564         if (ret)
1565                 goto err;
1566
1567         ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots,
1568                                  POS_MIN, 0, k,
1569                 bch2_snapshot_set_equiv(trans, k));
1570         bch_err_msg(c, ret, "in bch2_snapshots_set_equiv");
1571         if (ret)
1572                 goto err;
1573
1574         ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots,
1575                                  POS_MIN, 0, k, ({
1576                 if (k.k->type != KEY_TYPE_snapshot)
1577                         continue;
1578
1579                 BCH_SNAPSHOT_DELETED(bkey_s_c_to_snapshot(k).v)
1580                         ? snapshot_list_add(c, &deleted, k.k->p.offset)
1581                         : 0;
1582         }));
1583         bch_err_msg(c, ret, "walking snapshots");
1584         if (ret)
1585                 goto err;
1586
1587         for (id = 0; id < BTREE_ID_NR; id++) {
1588                 struct bpos last_pos = POS_MIN;
1589                 snapshot_id_list equiv_seen = { 0 };
1590                 struct disk_reservation res = { 0 };
1591
1592                 if (!btree_type_has_snapshots(id))
1593                         continue;
1594
1595                 /*
1596                  * deleted inodes btree is maintained by a trigger on the inodes
1597                  * btree - no work for us to do here, and it's not safe to scan
1598                  * it because we'll see out of date keys due to the btree write
1599                  * buffer:
1600                  */
1601                 if (id == BTREE_ID_deleted_inodes)
1602                         continue;
1603
1604                 ret = for_each_btree_key_commit(trans, iter,
1605                                 id, POS_MIN,
1606                                 BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
1607                                 &res, NULL, BCH_TRANS_COMMIT_no_enospc,
1608                         snapshot_delete_key(trans, &iter, k, &deleted, &equiv_seen, &last_pos)) ?:
1609                       for_each_btree_key_commit(trans, iter,
1610                                 id, POS_MIN,
1611                                 BTREE_ITER_PREFETCH|BTREE_ITER_ALL_SNAPSHOTS, k,
1612                                 &res, NULL, BCH_TRANS_COMMIT_no_enospc,
1613                         move_key_to_correct_snapshot(trans, &iter, k));
1614
1615                 bch2_disk_reservation_put(c, &res);
1616                 darray_exit(&equiv_seen);
1617
1618                 bch_err_msg(c, ret, "deleting keys from dying snapshots");
1619                 if (ret)
1620                         goto err;
1621         }
1622
1623         bch2_trans_unlock(trans);
1624         down_write(&c->snapshot_create_lock);
1625
1626         ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots,
1627                                  POS_MIN, 0, k, ({
1628                 u32 snapshot = k.k->p.offset;
1629                 u32 equiv = bch2_snapshot_equiv(c, snapshot);
1630
1631                 equiv != snapshot
1632                         ? snapshot_list_add(c, &deleted_interior, snapshot)
1633                         : 0;
1634         }));
1635
1636         bch_err_msg(c, ret, "walking snapshots");
1637         if (ret)
1638                 goto err_create_lock;
1639
1640         /*
1641          * Fixing children of deleted snapshots can't be done completely
1642          * atomically, if we crash between here and when we delete the interior
1643          * nodes some depth fields will be off:
1644          */
1645         ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN,
1646                                   BTREE_ITER_INTENT, k,
1647                                   NULL, NULL, BCH_TRANS_COMMIT_no_enospc,
1648                 bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &deleted_interior));
1649         if (ret)
1650                 goto err_create_lock;
1651
1652         darray_for_each(deleted, i) {
1653                 ret = commit_do(trans, NULL, NULL, 0,
1654                         bch2_snapshot_node_delete(trans, *i));
1655                 bch_err_msg(c, ret, "deleting snapshot %u", *i);
1656                 if (ret)
1657                         goto err_create_lock;
1658         }
1659
1660         darray_for_each(deleted_interior, i) {
1661                 ret = commit_do(trans, NULL, NULL, 0,
1662                         bch2_snapshot_node_delete(trans, *i));
1663                 bch_err_msg(c, ret, "deleting snapshot %u", *i);
1664                 if (ret)
1665                         goto err_create_lock;
1666         }
1667 err_create_lock:
1668         up_write(&c->snapshot_create_lock);
1669 err:
1670         darray_exit(&deleted_interior);
1671         darray_exit(&deleted);
1672         bch2_trans_put(trans);
1673         bch_err_fn(c, ret);
1674         return ret;
1675 }
1676
1677 void bch2_delete_dead_snapshots_work(struct work_struct *work)
1678 {
1679         struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work);
1680
1681         bch2_delete_dead_snapshots(c);
1682         bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
1683 }
1684
1685 void bch2_delete_dead_snapshots_async(struct bch_fs *c)
1686 {
1687         if (bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots) &&
1688             !queue_work(c->write_ref_wq, &c->snapshot_delete_work))
1689                 bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots);
1690 }
1691
1692 int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans,
1693                                        enum btree_id id,
1694                                        struct bpos pos)
1695 {
1696         struct bch_fs *c = trans->c;
1697         struct btree_iter iter;
1698         struct bkey_s_c k;
1699         int ret;
1700
1701         bch2_trans_iter_init(trans, &iter, id, pos,
1702                              BTREE_ITER_NOT_EXTENTS|
1703                              BTREE_ITER_ALL_SNAPSHOTS);
1704         while (1) {
1705                 k = bch2_btree_iter_prev(&iter);
1706                 ret = bkey_err(k);
1707                 if (ret)
1708                         break;
1709
1710                 if (!k.k)
1711                         break;
1712
1713                 if (!bkey_eq(pos, k.k->p))
1714                         break;
1715
1716                 if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot)) {
1717                         ret = 1;
1718                         break;
1719                 }
1720         }
1721         bch2_trans_iter_exit(trans, &iter);
1722
1723         return ret;
1724 }
1725
1726 static u32 bch2_snapshot_smallest_child(struct bch_fs *c, u32 id)
1727 {
1728         const struct snapshot_t *s = snapshot_t(c, id);
1729
1730         return s->children[1] ?: s->children[0];
1731 }
1732
1733 static u32 bch2_snapshot_smallest_descendent(struct bch_fs *c, u32 id)
1734 {
1735         u32 child;
1736
1737         while ((child = bch2_snapshot_smallest_child(c, id)))
1738                 id = child;
1739         return id;
1740 }
1741
1742 static int bch2_propagate_key_to_snapshot_leaf(struct btree_trans *trans,
1743                                                enum btree_id btree,
1744                                                struct bkey_s_c interior_k,
1745                                                u32 leaf_id, struct bpos *new_min_pos)
1746 {
1747         struct btree_iter iter;
1748         struct bpos pos = interior_k.k->p;
1749         struct bkey_s_c k;
1750         struct bkey_i *new;
1751         int ret;
1752
1753         pos.snapshot = leaf_id;
1754
1755         bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_INTENT);
1756         k = bch2_btree_iter_peek_slot(&iter);
1757         ret = bkey_err(k);
1758         if (ret)
1759                 goto out;
1760
1761         /* key already overwritten in this snapshot? */
1762         if (k.k->p.snapshot != interior_k.k->p.snapshot)
1763                 goto out;
1764
1765         if (bpos_eq(*new_min_pos, POS_MIN)) {
1766                 *new_min_pos = k.k->p;
1767                 new_min_pos->snapshot = leaf_id;
1768         }
1769
1770         new = bch2_bkey_make_mut_noupdate(trans, interior_k);
1771         ret = PTR_ERR_OR_ZERO(new);
1772         if (ret)
1773                 goto out;
1774
1775         new->k.p.snapshot = leaf_id;
1776         ret = bch2_trans_update(trans, &iter, new, 0);
1777 out:
1778         bch2_trans_iter_exit(trans, &iter);
1779         return ret;
1780 }
1781
1782 int bch2_propagate_key_to_snapshot_leaves(struct btree_trans *trans,
1783                                           enum btree_id btree,
1784                                           struct bkey_s_c k,
1785                                           struct bpos *new_min_pos)
1786 {
1787         struct bch_fs *c = trans->c;
1788         struct bkey_buf sk;
1789         u32 restart_count = trans->restart_count;
1790         int ret = 0;
1791
1792         bch2_bkey_buf_init(&sk);
1793         bch2_bkey_buf_reassemble(&sk, c, k);
1794         k = bkey_i_to_s_c(sk.k);
1795
1796         *new_min_pos = POS_MIN;
1797
1798         for (u32 id = bch2_snapshot_smallest_descendent(c, k.k->p.snapshot);
1799              id < k.k->p.snapshot;
1800              id++) {
1801                 if (!bch2_snapshot_is_ancestor(c, id, k.k->p.snapshot) ||
1802                     !bch2_snapshot_is_leaf(c, id))
1803                         continue;
1804 again:
1805                 ret =   btree_trans_too_many_iters(trans) ?:
1806                         bch2_propagate_key_to_snapshot_leaf(trans, btree, k, id, new_min_pos) ?:
1807                         bch2_trans_commit(trans, NULL, NULL, 0);
1808                 if (ret && bch2_err_matches(ret, BCH_ERR_transaction_restart)) {
1809                         bch2_trans_begin(trans);
1810                         goto again;
1811                 }
1812
1813                 if (ret)
1814                         break;
1815         }
1816
1817         bch2_bkey_buf_exit(&sk, c);
1818
1819         return ret ?: trans_was_restarted(trans, restart_count);
1820 }
1821
1822 static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct bkey_s_c k)
1823 {
1824         struct bch_fs *c = trans->c;
1825         struct bkey_s_c_snapshot snap;
1826         int ret = 0;
1827
1828         if (k.k->type != KEY_TYPE_snapshot)
1829                 return 0;
1830
1831         snap = bkey_s_c_to_snapshot(k);
1832         if (BCH_SNAPSHOT_DELETED(snap.v) ||
1833             bch2_snapshot_equiv(c, k.k->p.offset) != k.k->p.offset ||
1834             (ret = bch2_snapshot_needs_delete(trans, k)) > 0) {
1835                 set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags);
1836                 return 0;
1837         }
1838
1839         return ret;
1840 }
1841
1842 int bch2_snapshots_read(struct bch_fs *c)
1843 {
1844         int ret = bch2_trans_run(c,
1845                 for_each_btree_key(trans, iter, BTREE_ID_snapshots,
1846                                    POS_MIN, 0, k,
1847                         __bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, bkey_s_c_null, k, 0) ?:
1848                         bch2_snapshot_set_equiv(trans, k) ?:
1849                         bch2_check_snapshot_needs_deletion(trans, k)) ?:
1850                 for_each_btree_key(trans, iter, BTREE_ID_snapshots,
1851                                    POS_MIN, 0, k,
1852                            (set_is_ancestor_bitmap(c, k.k->p.offset), 0)));
1853         bch_err_fn(c, ret);
1854
1855         /*
1856          * It's important that we check if we need to reconstruct snapshots
1857          * before going RW, so we mark that pass as required in the superblock -
1858          * otherwise, we could end up deleting keys with missing snapshot nodes
1859          * instead
1860          */
1861         BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) &&
1862                test_bit(BCH_FS_may_go_rw, &c->flags));
1863
1864         if (bch2_err_matches(ret, EIO) ||
1865             (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots)))
1866                 ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_reconstruct_snapshots);
1867
1868         return ret;
1869 }
1870
1871 void bch2_fs_snapshots_exit(struct bch_fs *c)
1872 {
1873         kvfree(rcu_dereference_protected(c->snapshots, true));
1874 }