/*
 * Copyright (C) 2011 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-btree.h"
#include "dm-btree-internal.h"
#include "dm-transaction-manager.h"

#include <linux/export.h>
#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "btree"

/*
 * Removing an entry from a btree
 * ==============================
 *
 * A very important constraint for our btree is that no node, except the
 * root, may have fewer than a certain number of entries.
 * (MIN_ENTRIES <= nr_entries <= MAX_ENTRIES).
 *
 * Ensuring this is complicated by the way we want to only ever hold the
 * locks on 2 nodes concurrently, and only change nodes in a top to bottom
 * fashion.
 *
 * Each node may have a left or right sibling.  When descending the spine,
 * if a node contains only MIN_ENTRIES then we try to increase this to at
 * least MIN_ENTRIES + 1.  We do this in the following ways:
 *
 * [A] No siblings => this can only happen if the node is the root, in which
 *     case we copy the child's contents over the root.
 *
 * [B] No left sibling
 *     ==> rebalance(node, right sibling)
 *
 * [C] No right sibling
 *     ==> rebalance(left sibling, node)
 *
 * [D] Both siblings, total_entries(left, node, right) <= DEL_THRESHOLD
 *     ==> delete node, adding its contents to left and right
 *
 * [E] Both siblings, total_entries(left, node, right) > DEL_THRESHOLD
 *     ==> rebalance(left, node, right)
 *
 * After these operations it's possible that our original node no longer
 * contains the desired subtree.  For this reason the rebalancing is
 * performed on the children of the current node.  This also avoids
 * having a special case for the root.
 *
 * Once this rebalancing has occurred we can then step into the child
 * node (for internal nodes), or delete the entry (for leaf nodes).
 */
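
/*
 * Worked example (illustrative numbers only): suppose max_entries = 126,
 * so the merge threshold is 126 / 3 = 42 (see merge_threshold() below)
 * and the delete threshold used by __rebalance3() is 4 * 42 + 1 = 169.
 * Descending through a node that has both siblings:
 *
 * - left = 40, node = 42, right = 44: the total of 126 is below the
 *   delete threshold, so case [D] applies and node's entries are
 *   divided between left and right.
 *
 * - left = 80, node = 42, right = 80: the total of 202 is above the
 *   delete threshold, so case [E] applies and the entries are
 *   redistributed 68 / 67 / 67.
 */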

/*
 * Some little utilities for moving node data around.
 */
static void node_shift(struct btree_node *n, int shift)
{
        uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
        uint32_t value_size = le32_to_cpu(n->header.value_size);

        if (shift < 0) {
                shift = -shift;
                BUG_ON(shift > nr_entries);
                BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift));
                memmove(key_ptr(n, 0),
                        key_ptr(n, shift),
                        (nr_entries - shift) * sizeof(__le64));
                memmove(value_ptr(n, 0),
                        value_ptr(n, shift),
                        (nr_entries - shift) * value_size);
        } else {
                BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries));
                memmove(key_ptr(n, shift),
                        key_ptr(n, 0),
                        nr_entries * sizeof(__le64));
                memmove(value_ptr(n, shift),
                        value_ptr(n, 0),
                        nr_entries * value_size);
        }
}
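
/*
 * E.g. node_shift(n, 2) moves every entry up two slots, opening a
 * two-entry gap at the front for node_copy() to fill; node_shift(n, -2)
 * moves everything down over the first two entries, discarding them.
 * Neither direction updates header.nr_entries; callers fix that up.
 */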

static int node_copy(struct btree_node *left, struct btree_node *right, int shift)
{
        uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
        uint32_t value_size = le32_to_cpu(left->header.value_size);

        if (value_size != le32_to_cpu(right->header.value_size)) {
                DMERR("mismatched value size");
                return -EILSEQ;
        }

        if (shift < 0) {
                shift = -shift;

                if (nr_left + shift > le32_to_cpu(left->header.max_entries)) {
                        DMERR("bad shift");
                        return -EINVAL;
                }

                memcpy(key_ptr(left, nr_left),
                       key_ptr(right, 0),
                       shift * sizeof(__le64));
                memcpy(value_ptr(left, nr_left),
                       value_ptr(right, 0),
                       shift * value_size);
        } else {
                if (shift > le32_to_cpu(right->header.max_entries)) {
                        DMERR("bad shift");
                        return -EINVAL;
                }

                memcpy(key_ptr(right, 0),
                       key_ptr(left, nr_left - shift),
                       shift * sizeof(__le64));
                memcpy(value_ptr(right, 0),
                       value_ptr(left, nr_left - shift),
                       shift * value_size);
        }

        return 0;
}
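
/*
 * E.g. node_copy(left, right, -2) appends the first two entries of right
 * after left's existing entries, while node_copy(left, right, 2) copies
 * the last two entries of left into the first two slots of right, which
 * node_shift() must already have opened.  Again, nr_entries is left to
 * the callers.
 */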

/*
 * Delete a specific entry from a leaf node.
 */
static void delete_at(struct btree_node *n, unsigned index)
{
        unsigned nr_entries = le32_to_cpu(n->header.nr_entries);
        unsigned nr_to_copy = nr_entries - (index + 1);
        uint32_t value_size = le32_to_cpu(n->header.value_size);

        BUG_ON(index >= nr_entries);

        if (nr_to_copy) {
                memmove(key_ptr(n, index),
                        key_ptr(n, index + 1),
                        nr_to_copy * sizeof(__le64));

                memmove(value_ptr(n, index),
                        value_ptr(n, index + 1),
                        nr_to_copy * value_size);
        }

        n->header.nr_entries = cpu_to_le32(nr_entries - 1);
}

static unsigned merge_threshold(struct btree_node *n)
{
        return le32_to_cpu(n->header.max_entries) / 3;
}
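
/*
 * E.g. (illustrative numbers) with max_entries = 126 the merge threshold
 * is 42; the rebalancing below stops non-root nodes from dropping
 * beneath it.
 */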

struct child {
        unsigned index;
        struct dm_block *block;
        struct btree_node *n;
};

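/*
 * Shadows the child at @index (copying the block if it is shared),
 * increments the reference counts of the child's own children when a
 * copy was made, and patches the parent's value to point at the new
 * location.
 */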
static int init_child(struct dm_btree_info *info, struct dm_btree_value_type *vt,
                      struct btree_node *parent,
                      unsigned index, struct child *result)
{
        int r, inc;
        dm_block_t root;

        result->index = index;
        root = value64(parent, index);

        r = dm_tm_shadow_block(info->tm, root, &btree_node_validator,
                               &result->block, &inc);
        if (r)
                return r;

        result->n = dm_block_data(result->block);

        if (inc)
                inc_children(info->tm, result->n, vt);

        *((__le64 *) value_ptr(parent, index)) =
                cpu_to_le64(dm_block_location(result->block));

        return 0;
}

static void exit_child(struct dm_btree_info *info, struct child *c)
{
        dm_tm_unlock(info->tm, c->block);
}

static int shift(struct btree_node *left, struct btree_node *right, int count)
{
        int r;
        uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
        uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
        uint32_t max_entries = le32_to_cpu(left->header.max_entries);
        uint32_t r_max_entries = le32_to_cpu(right->header.max_entries);

        if (max_entries != r_max_entries) {
                DMERR("node max_entries mismatch");
                return -EILSEQ;
        }

        if (nr_left - count > max_entries) {
                DMERR("node shift out of bounds");
                return -EINVAL;
        }

        if (nr_right + count > max_entries) {
                DMERR("node shift out of bounds");
                return -EINVAL;
        }

        if (!count)
                return 0;

        if (count > 0) {
                node_shift(right, count);
                r = node_copy(left, right, count);
                if (r)
                        return r;
        } else {
                r = node_copy(left, right, count);
                if (r)
                        return r;
                node_shift(right, count);
        }

        left->header.nr_entries = cpu_to_le32(nr_left - count);
        right->header.nr_entries = cpu_to_le32(nr_right + count);

        return 0;
}
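
/*
 * E.g. shift(left, right, 2) moves the two highest entries of left to
 * the front of right, while shift(left, right, -2) moves the two lowest
 * entries of right onto the end of left.  Unlike the helpers above, this
 * updates both nodes' entry counts.
 */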

static int __rebalance2(struct dm_btree_info *info, struct btree_node *parent,
                        struct child *l, struct child *r)
{
        int ret;
        struct btree_node *left = l->n;
        struct btree_node *right = r->n;
        uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
        uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
        /*
         * Ensure each child ends up with at least (max_entries / 3 + 1)
         * entries, so that whichever child the removal descends into
         * still has at least (max_entries / 3) entries afterwards.
         */
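        /* e.g. (illustrative) max_entries = 126: threshold = 2 * (42 + 1) = 86 */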
        unsigned int threshold = 2 * (merge_threshold(left) + 1);

        if (nr_left + nr_right < threshold) {
                /*
                 * Merge
                 */
                node_copy(left, right, -nr_right);
                left->header.nr_entries = cpu_to_le32(nr_left + nr_right);
                delete_at(parent, r->index);

                /*
                 * We need to decrement the right block, but not its
                 * children, since they're still referenced by left.
                 */
                dm_tm_dec(info->tm, dm_block_location(r->block));
        } else {
                /*
                 * Rebalance.
                 */
                unsigned target_left = (nr_left + nr_right) / 2;

                ret = shift(left, right, nr_left - target_left);
                if (ret)
                        return ret;
                *key_ptr(parent, r->index) = right->keys[0];
        }

        return 0;
}

static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
                      struct dm_btree_value_type *vt, unsigned left_index)
{
        int r;
        struct btree_node *parent;
        struct child left, right;

        parent = dm_block_data(shadow_current(s));

        r = init_child(info, vt, parent, left_index, &left);
        if (r)
                return r;

        r = init_child(info, vt, parent, left_index + 1, &right);
        if (r) {
                exit_child(info, &left);
                return r;
        }

        r = __rebalance2(info, parent, &left, &right);

        exit_child(info, &left);
        exit_child(info, &right);

        return r;
}

/*
 * We dump as many entries from center as possible into left, then the rest
 * in right, then rebalance2.  This wastes some cpu, but I want something
 * simple atm.
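 *
 * E.g. (illustrative numbers) left = 50, center = 20, right = 30 with
 * max_entries = 126: all 20 center entries fit into left (making 70),
 * center is deleted, and __rebalance2() then levels left/right out to
 * 50/50.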
 */
static int delete_center_node(struct dm_btree_info *info, struct btree_node *parent,
                              struct child *l, struct child *c, struct child *r,
                              struct btree_node *left, struct btree_node *center, struct btree_node *right,
                              uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
{
        uint32_t max_entries = le32_to_cpu(left->header.max_entries);
        unsigned shift = min(max_entries - nr_left, nr_center);

        if (nr_left + shift > max_entries) {
                DMERR("node shift out of bounds");
                return -EINVAL;
        }

        node_copy(left, center, -shift);
        left->header.nr_entries = cpu_to_le32(nr_left + shift);

        if (shift != nr_center) {
                shift = nr_center - shift;

                if ((nr_right + shift) > max_entries) {
                        DMERR("node shift out of bounds");
                        return -EINVAL;
                }

                node_shift(right, shift);
                node_copy(center, right, shift);
                right->header.nr_entries = cpu_to_le32(nr_right + shift);
        }

        *key_ptr(parent, r->index) = right->keys[0];

        delete_at(parent, c->index);
        r->index--;

        dm_tm_dec(info->tm, dm_block_location(c->block));

        return __rebalance2(info, parent, l, r);
}

/*
 * Redistributes entries among 3 sibling nodes.
 */
static int redistribute3(struct dm_btree_info *info, struct btree_node *parent,
                         struct child *l, struct child *c, struct child *r,
                         struct btree_node *left, struct btree_node *center, struct btree_node *right,
                         uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
{
        int s, ret;
        uint32_t max_entries = le32_to_cpu(left->header.max_entries);
        unsigned total = nr_left + nr_center + nr_right;
        unsigned target_right = total / 3;
        unsigned remainder = (target_right * 3) != total;
        unsigned target_left = target_right + remainder;

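        /*
         * E.g. total = 100: target_right = 33, remainder = 1, so left is
         * topped up to 34, right is trimmed to 33 and center keeps the
         * remaining 33.
         */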
        BUG_ON(target_left > max_entries);
        BUG_ON(target_right > max_entries);

        if (nr_left < nr_right) {
                s = nr_left - target_left;

                if (s < 0 && nr_center < -s) {
                        /* not enough in central node */
                        ret = shift(left, center, -nr_center);
                        if (ret)
                                return ret;

                        s += nr_center;
                        ret = shift(left, right, s);
                        if (ret)
                                return ret;

                        nr_right += s;
                } else {
                        ret = shift(left, center, s);
                        if (ret)
                                return ret;
                }

                ret = shift(center, right, target_right - nr_right);
                if (ret)
                        return ret;
        } else {
                s = target_right - nr_right;
                if (s > 0 && nr_center < s) {
                        /* not enough in central node */
                        ret = shift(center, right, nr_center);
                        if (ret)
                                return ret;
                        s -= nr_center;
                        ret = shift(left, right, s);
                        if (ret)
                                return ret;
                        nr_left -= s;
                } else {
                        ret = shift(center, right, s);
                        if (ret)
                                return ret;
                }

                ret = shift(left, center, nr_left - target_left);
                if (ret)
                        return ret;
        }

        *key_ptr(parent, c->index) = center->keys[0];
        *key_ptr(parent, r->index) = right->keys[0];

        return 0;
}

static int __rebalance3(struct dm_btree_info *info, struct btree_node *parent,
                        struct child *l, struct child *c, struct child *r)
{
        struct btree_node *left = l->n;
        struct btree_node *center = c->n;
        struct btree_node *right = r->n;

        uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
        uint32_t nr_center = le32_to_cpu(center->header.nr_entries);
        uint32_t nr_right = le32_to_cpu(right->header.nr_entries);

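        /*
         * Below 4 * merge_threshold + 1 entries in total, the three
         * nodes can be collapsed into two; e.g. (illustrative)
         * max_entries = 126 gives a threshold of 4 * 42 + 1 = 169.
         */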
        unsigned threshold = merge_threshold(left) * 4 + 1;

        if ((left->header.max_entries != center->header.max_entries) ||
            (center->header.max_entries != right->header.max_entries)) {
                DMERR("bad btree metadata, max_entries differ");
                return -EILSEQ;
        }

        if ((nr_left + nr_center + nr_right) < threshold) {
                return delete_center_node(info, parent, l, c, r, left, center, right,
                                          nr_left, nr_center, nr_right);
        }

        return redistribute3(info, parent, l, c, r, left, center, right,
                             nr_left, nr_center, nr_right);
}

static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
                      struct dm_btree_value_type *vt, unsigned left_index)
{
        int r;
        struct btree_node *parent = dm_block_data(shadow_current(s));
        struct child left, center, right;

        /*
         * FIXME: fill out an array?
         */
        r = init_child(info, vt, parent, left_index, &left);
        if (r)
                return r;

        r = init_child(info, vt, parent, left_index + 1, &center);
        if (r) {
                exit_child(info, &left);
                return r;
        }

        r = init_child(info, vt, parent, left_index + 2, &right);
        if (r) {
                exit_child(info, &left);
                exit_child(info, &center);
                return r;
        }

        r = __rebalance3(info, parent, &left, &center, &right);

        exit_child(info, &left);
        exit_child(info, &center);
        exit_child(info, &right);

        return r;
}

static int rebalance_children(struct shadow_spine *s,
                              struct dm_btree_info *info,
                              struct dm_btree_value_type *vt, uint64_t key)
{
        int i, r, has_left_sibling, has_right_sibling;
        struct btree_node *n;

        n = dm_block_data(shadow_current(s));

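        /*
         * A node with a single child can only be the root (case [A] in
         * the comment at the top of this file): copy the child's
         * contents over the node and drop the child.
         */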
        if (le32_to_cpu(n->header.nr_entries) == 1) {
                struct dm_block *child;
                dm_block_t b = value64(n, 0);

                r = dm_tm_read_lock(info->tm, b, &btree_node_validator, &child);
                if (r)
                        return r;

                memcpy(n, dm_block_data(child),
                       dm_bm_block_size(dm_tm_get_bm(info->tm)));

                dm_tm_dec(info->tm, dm_block_location(child));
                dm_tm_unlock(info->tm, child);

                return 0;
        }

        i = lower_bound(n, key);
        if (i < 0)
                return -ENODATA;

        has_left_sibling = i > 0;
        has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);

        if (!has_left_sibling)
                r = rebalance2(s, info, vt, i);

        else if (!has_right_sibling)
                r = rebalance2(s, info, vt, i - 1);

        else
                r = rebalance3(s, info, vt, i - 1);

        return r;
}

static int do_leaf(struct btree_node *n, uint64_t key, unsigned *index)
{
        int i = lower_bound(n, key);

        if ((i < 0) ||
            (i >= le32_to_cpu(n->header.nr_entries)) ||
            (le64_to_cpu(n->keys[i]) != key))
                return -ENODATA;

        *index = i;

        return 0;
}

/*
 * Prepares for removal from one level of the hierarchy.  The caller must
 * call delete_at() to remove the entry at index.
 */
static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
                      struct dm_btree_value_type *vt, dm_block_t root,
                      uint64_t key, unsigned *index)
{
        int i = *index, r;
        struct btree_node *n;

        for (;;) {
                r = shadow_step(s, root, vt);
                if (r < 0)
                        break;

                /*
                 * We have to patch up the parent node, ugly, but I don't
                 * see a way to do this automatically as part of the spine
                 * op.
                 */
                if (shadow_has_parent(s)) {
                        __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

                        memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
                               &location, sizeof(__le64));
                }

                n = dm_block_data(shadow_current(s));

                if (le32_to_cpu(n->header.flags) & LEAF_NODE)
                        return do_leaf(n, key, index);

                r = rebalance_children(s, info, vt, key);
                if (r)
                        break;

                n = dm_block_data(shadow_current(s));
                if (le32_to_cpu(n->header.flags) & LEAF_NODE)
                        return do_leaf(n, key, index);

                i = lower_bound(n, key);

                /*
                 * lower_bound() cannot have failed here:
                 * rebalance_children() would have returned -ENODATA if
                 * the key were below this node's first entry.
                 */
                root = value64(n, i);
        }

        return r;
}

int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
                    uint64_t *keys, dm_block_t *new_root)
{
        unsigned level, last_level = info->levels - 1;
        int index = 0, r = 0;
        struct shadow_spine spine;
        struct btree_node *n;
        struct dm_btree_value_type le64_vt;

        init_le64_type(info->tm, &le64_vt);
        init_shadow_spine(&spine, info);
        for (level = 0; level < info->levels; level++) {
                r = remove_raw(&spine, info,
                               (level == last_level ?
                                &info->value_type : &le64_vt),
                               root, keys[level], (unsigned *)&index);
                if (r < 0)
                        break;

                n = dm_block_data(shadow_current(&spine));
                if (level != last_level) {
                        root = value64(n, index);
                        continue;
                }

                BUG_ON(index < 0 || index >= le32_to_cpu(n->header.nr_entries));

                if (info->value_type.dec)
                        info->value_type.dec(info->value_type.context,
                                             value_ptr(n, index), 1);

                delete_at(n, index);
        }

        if (!r)
                *new_root = shadow_root(&spine);
        exit_shadow_spine(&spine);

        return r;
}
EXPORT_SYMBOL_GPL(dm_btree_remove);
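
/*
 * Example usage (an illustrative sketch, not code from this file):
 * removing a single mapping from a one-level btree, keeping the old
 * root until the remove has succeeded.
 *
 *        uint64_t key = 1234;
 *        dm_block_t new_root;
 *        int r;
 *
 *        r = dm_btree_remove(info, root, &key, &new_root);
 *        if (!r)
 *                root = new_root;
 *
 * r == -ENODATA means the key was not present in the tree.
 */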

/*----------------------------------------------------------------*/

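/*
 * Like remove_raw(), but the key need not exist: descends to the leaf
 * that would contain @key and reports the index found by lower_bound(),
 * which may be -1 or refer to a neighbouring key.
 */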
static int remove_nearest(struct shadow_spine *s, struct dm_btree_info *info,
                          struct dm_btree_value_type *vt, dm_block_t root,
                          uint64_t key, int *index)
{
        int i = *index, r;
        struct btree_node *n;

        for (;;) {
                r = shadow_step(s, root, vt);
                if (r < 0)
                        break;

                /*
                 * We have to patch up the parent node, ugly, but I don't
                 * see a way to do this automatically as part of the spine
                 * op.
                 */
                if (shadow_has_parent(s)) {
                        __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

                        memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
                               &location, sizeof(__le64));
                }

                n = dm_block_data(shadow_current(s));

                if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
                        *index = lower_bound(n, key);
                        return 0;
                }

                r = rebalance_children(s, info, vt, key);
                if (r)
                        break;

                n = dm_block_data(shadow_current(s));
                if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
                        *index = lower_bound(n, key);
                        return 0;
                }

                i = lower_bound(n, key);

                /*
                 * lower_bound() cannot have failed here:
                 * rebalance_children() would have returned -ENODATA if
                 * the key were below this node's first entry.
                 */
                root = value64(n, i);
        }

        return r;
}

static int remove_one(struct dm_btree_info *info, dm_block_t root,
                      uint64_t *keys, uint64_t end_key,
                      dm_block_t *new_root, unsigned *nr_removed)
{
        unsigned level, last_level = info->levels - 1;
        int index = 0, r = 0;
        struct shadow_spine spine;
        struct btree_node *n;
        struct dm_btree_value_type le64_vt;
        uint64_t k;

        init_le64_type(info->tm, &le64_vt);
        init_shadow_spine(&spine, info);
        for (level = 0; level < last_level; level++) {
                r = remove_raw(&spine, info, &le64_vt,
                               root, keys[level], (unsigned *)&index);
                if (r < 0)
                        goto out;

                n = dm_block_data(shadow_current(&spine));
                root = value64(n, index);
        }

        r = remove_nearest(&spine, info, &info->value_type,
                           root, keys[last_level], &index);
        if (r < 0)
                goto out;

        n = dm_block_data(shadow_current(&spine));

        if (index < 0)
                index = 0;

        if (index >= le32_to_cpu(n->header.nr_entries)) {
                r = -ENODATA;
                goto out;
        }

        k = le64_to_cpu(n->keys[index]);
        if (k >= keys[last_level] && k < end_key) {
                if (info->value_type.dec)
                        info->value_type.dec(info->value_type.context,
                                             value_ptr(n, index), 1);

                delete_at(n, index);
                keys[last_level] = k + 1ull;
        } else
                r = -ENODATA;

out:
        *new_root = shadow_root(&spine);
        exit_shadow_spine(&spine);

        return r;
}

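/*
 * Removes leaf entries whose keys (at the last btree level) lie in
 * [*first_key, end_key), one entry per pass of remove_one().
 * *nr_removed returns the number of entries deleted and *first_key is
 * advanced past the last key removed.
 */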
int dm_btree_remove_leaves(struct dm_btree_info *info, dm_block_t root,
                           uint64_t *first_key, uint64_t end_key,
                           dm_block_t *new_root, unsigned *nr_removed)
{
        int r;

        *nr_removed = 0;
        do {
                r = remove_one(info, root, first_key, end_key, &root, nr_removed);
                if (!r)
                        (*nr_removed)++;
        } while (!r);

        *new_root = root;

        return r == -ENODATA ? 0 : r;
}
EXPORT_SYMBOL_GPL(dm_btree_remove_leaves);
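
/*
 * Example usage (an illustrative sketch): discard every mapping with a
 * key in [0, 128) from a one-level btree.
 *
 *        uint64_t first = 0;
 *        unsigned nr_removed;
 *        int r;
 *
 *        r = dm_btree_remove_leaves(info, root, &first, 128, &root,
 *                                   &nr_removed);
 *
 * On success r is 0 even if fewer than 128 mappings existed; nr_removed
 * reports how many entries were actually deleted.
 */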