GNU Linux-libre 4.19.281-gnu1
fs/nfs/blocklayout/extent_tree.c
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2014-2016 Christoph Hellwig.
 */

#include <linux/vmalloc.h>

#include "blocklayout.h"

#define NFSDBG_FACILITY         NFSDBG_PNFS_LD

static inline struct pnfs_block_extent *
ext_node(struct rb_node *node)
{
        return rb_entry(node, struct pnfs_block_extent, be_node);
}

static struct pnfs_block_extent *
ext_tree_first(struct rb_root *root)
{
        struct rb_node *node = rb_first(root);
        return node ? ext_node(node) : NULL;
}

static struct pnfs_block_extent *
ext_tree_prev(struct pnfs_block_extent *be)
{
        struct rb_node *node = rb_prev(&be->be_node);
        return node ? ext_node(node) : NULL;
}

static struct pnfs_block_extent *
ext_tree_next(struct pnfs_block_extent *be)
{
        struct rb_node *node = rb_next(&be->be_node);
        return node ? ext_node(node) : NULL;
}

static inline sector_t
ext_f_end(struct pnfs_block_extent *be)
{
        return be->be_f_offset + be->be_length;
}

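/*
 * Find the extent containing @start, or failing that, the first extent
 * beginning after @start.  Returns NULL if no extent at or beyond @start
 * exists in the tree.
 */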
static struct pnfs_block_extent *
__ext_tree_search(struct rb_root *root, sector_t start)
{
        struct rb_node *node = root->rb_node;
        struct pnfs_block_extent *be = NULL;

        while (node) {
                be = ext_node(node);
                if (start < be->be_f_offset)
                        node = node->rb_left;
                else if (start >= ext_f_end(be))
                        node = node->rb_right;
                else
                        return be;
        }

        if (be) {
                if (start < be->be_f_offset)
                        return be;

                if (start >= ext_f_end(be))
                        return ext_tree_next(be);
        }

        return NULL;
}

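/*
 * Two extents can be merged if they have the same state and device, are
 * contiguous in the file, are contiguous on the volume (unless they
 * describe holes), and, for invalid extents, carry the same tag.
 */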
static bool
ext_can_merge(struct pnfs_block_extent *be1, struct pnfs_block_extent *be2)
{
        if (be1->be_state != be2->be_state)
                return false;
        if (be1->be_device != be2->be_device)
                return false;

        if (be1->be_f_offset + be1->be_length != be2->be_f_offset)
                return false;

        if (be1->be_state != PNFS_BLOCK_NONE_DATA &&
            (be1->be_v_offset + be1->be_length != be2->be_v_offset))
                return false;

        if (be1->be_state == PNFS_BLOCK_INVALID_DATA &&
            be1->be_tag != be2->be_tag)
                return false;

        return true;
}

static struct pnfs_block_extent *
ext_try_to_merge_left(struct rb_root *root, struct pnfs_block_extent *be)
{
        struct pnfs_block_extent *left = ext_tree_prev(be);

        if (left && ext_can_merge(left, be)) {
                left->be_length += be->be_length;
                rb_erase(&be->be_node, root);
                nfs4_put_deviceid_node(be->be_device);
                kfree(be);
                return left;
        }

        return be;
}

static struct pnfs_block_extent *
ext_try_to_merge_right(struct rb_root *root, struct pnfs_block_extent *be)
{
        struct pnfs_block_extent *right = ext_tree_next(be);

        if (right && ext_can_merge(be, right)) {
                be->be_length += right->be_length;
                rb_erase(&right->be_node, root);
                nfs4_put_deviceid_node(right->be_device);
                kfree(right);
        }

        return be;
}

static void __ext_put_deviceids(struct list_head *head)
{
        struct pnfs_block_extent *be, *tmp;

        list_for_each_entry_safe(be, tmp, head, be_list) {
                nfs4_put_deviceid_node(be->be_device);
                kfree(be);
        }
}

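/*
 * Link @new into @root.  @new must not overlap any extent already in the
 * tree.  If @merge_ok is set and @new is adjacent to a compatible extent,
 * the existing extent is extended instead and @new is freed (dropping its
 * device reference).
 */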
static void
__ext_tree_insert(struct rb_root *root,
                struct pnfs_block_extent *new, bool merge_ok)
{
        struct rb_node **p = &root->rb_node, *parent = NULL;
        struct pnfs_block_extent *be;

        while (*p) {
                parent = *p;
                be = ext_node(parent);

                if (new->be_f_offset < be->be_f_offset) {
                        if (merge_ok && ext_can_merge(new, be)) {
                                be->be_f_offset = new->be_f_offset;
                                if (be->be_state != PNFS_BLOCK_NONE_DATA)
                                        be->be_v_offset = new->be_v_offset;
                                be->be_length += new->be_length;
                                be = ext_try_to_merge_left(root, be);
                                goto free_new;
                        }
                        p = &(*p)->rb_left;
                } else if (new->be_f_offset >= ext_f_end(be)) {
                        if (merge_ok && ext_can_merge(be, new)) {
                                be->be_length += new->be_length;
                                be = ext_try_to_merge_right(root, be);
                                goto free_new;
                        }
                        p = &(*p)->rb_right;
                } else {
                        BUG();
                }
        }

        rb_link_node(&new->be_node, parent, p);
        rb_insert_color(&new->be_node, root);
        return;
free_new:
        nfs4_put_deviceid_node(new->be_device);
        kfree(new);
}

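/*
 * Remove the range [@start, @end) from @root.  Extents straddling either
 * boundary are trimmed or split; extents entirely inside the range are
 * unlinked and collected on @tmp so the caller can drop their device
 * references and free them after releasing the extent lock.
 */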
static int
__ext_tree_remove(struct rb_root *root,
                sector_t start, sector_t end, struct list_head *tmp)
{
        struct pnfs_block_extent *be;
        sector_t len1 = 0, len2 = 0;
        sector_t orig_v_offset;
        sector_t orig_len;

        be = __ext_tree_search(root, start);
        if (!be)
                return 0;
        if (be->be_f_offset >= end)
                return 0;

        orig_v_offset = be->be_v_offset;
        orig_len = be->be_length;

        if (start > be->be_f_offset)
                len1 = start - be->be_f_offset;
        if (ext_f_end(be) > end)
                len2 = ext_f_end(be) - end;

        if (len2 > 0) {
                if (len1 > 0) {
                        struct pnfs_block_extent *new;

                        new = kzalloc(sizeof(*new), GFP_ATOMIC);
                        if (!new)
                                return -ENOMEM;

                        be->be_length = len1;

                        new->be_f_offset = end;
                        if (be->be_state != PNFS_BLOCK_NONE_DATA) {
                                new->be_v_offset =
                                        orig_v_offset + orig_len - len2;
                        }
                        new->be_length = len2;
                        new->be_state = be->be_state;
                        new->be_tag = be->be_tag;
                        new->be_device = nfs4_get_deviceid(be->be_device);

                        __ext_tree_insert(root, new, true);
                } else {
                        be->be_f_offset = end;
                        if (be->be_state != PNFS_BLOCK_NONE_DATA) {
                                be->be_v_offset =
                                        orig_v_offset + orig_len - len2;
                        }
                        be->be_length = len2;
                }
        } else {
                if (len1 > 0) {
                        be->be_length = len1;
                        be = ext_tree_next(be);
                }

                while (be && ext_f_end(be) <= end) {
                        struct pnfs_block_extent *next = ext_tree_next(be);

                        rb_erase(&be->be_node, root);
                        list_add_tail(&be->be_list, tmp);
                        be = next;
                }

                if (be && be->be_f_offset < end) {
                        len1 = ext_f_end(be) - end;
                        be->be_f_offset = end;
                        if (be->be_state != PNFS_BLOCK_NONE_DATA)
                                be->be_v_offset += be->be_length - len1;
                        be->be_length = len1;
                }
        }

        return 0;
}

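/*
 * Insert a new extent into the layout.  Where @new overlaps an extent
 * already in the tree, the existing extent is kept and only the
 * uncovered parts of @new are inserted, splitting @new as needed.
 */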
int
ext_tree_insert(struct pnfs_block_layout *bl, struct pnfs_block_extent *new)
{
        struct pnfs_block_extent *be;
        struct rb_root *root;
        int err = 0;

        switch (new->be_state) {
        case PNFS_BLOCK_READWRITE_DATA:
        case PNFS_BLOCK_INVALID_DATA:
                root = &bl->bl_ext_rw;
                break;
        case PNFS_BLOCK_READ_DATA:
        case PNFS_BLOCK_NONE_DATA:
                root = &bl->bl_ext_ro;
                break;
        default:
                dprintk("invalid extent type\n");
                return -EINVAL;
        }

        spin_lock(&bl->bl_ext_lock);
retry:
        be = __ext_tree_search(root, new->be_f_offset);
        if (!be || be->be_f_offset >= ext_f_end(new)) {
                __ext_tree_insert(root, new, true);
        } else if (new->be_f_offset >= be->be_f_offset) {
                if (ext_f_end(new) <= ext_f_end(be)) {
                        nfs4_put_deviceid_node(new->be_device);
                        kfree(new);
                } else {
                        sector_t new_len = ext_f_end(new) - ext_f_end(be);
                        sector_t diff = new->be_length - new_len;

                        new->be_f_offset += diff;
                        new->be_v_offset += diff;
                        new->be_length = new_len;
                        goto retry;
                }
        } else if (ext_f_end(new) <= ext_f_end(be)) {
                new->be_length = be->be_f_offset - new->be_f_offset;
                __ext_tree_insert(root, new, true);
        } else {
                struct pnfs_block_extent *split;
                sector_t new_len = ext_f_end(new) - ext_f_end(be);
                sector_t diff = new->be_length - new_len;

                split = kmemdup(new, sizeof(*new), GFP_ATOMIC);
                if (!split) {
                        err = -EINVAL;
                        goto out;
                }

                split->be_length = be->be_f_offset - split->be_f_offset;
                split->be_device = nfs4_get_deviceid(new->be_device);
                __ext_tree_insert(root, split, true);

                new->be_f_offset += diff;
                new->be_v_offset += diff;
                new->be_length = new_len;
                goto retry;
        }
out:
        spin_unlock(&bl->bl_ext_lock);
        return err;
}

static bool
__ext_tree_lookup(struct rb_root *root, sector_t isect,
                struct pnfs_block_extent *ret)
{
        struct rb_node *node;
        struct pnfs_block_extent *be;

        node = root->rb_node;
        while (node) {
                be = ext_node(node);
                if (isect < be->be_f_offset)
                        node = node->rb_left;
                else if (isect >= ext_f_end(be))
                        node = node->rb_right;
                else {
                        *ret = *be;
                        return true;
                }
        }

        return false;
}

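/*
 * Look up the extent covering sector @isect and copy it to @ret.  For
 * reads (!@rw) the read-only tree is searched before the read-write
 * tree; for writes only the read-write tree is consulted.
 */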
bool
ext_tree_lookup(struct pnfs_block_layout *bl, sector_t isect,
            struct pnfs_block_extent *ret, bool rw)
{
        bool found = false;

        spin_lock(&bl->bl_ext_lock);
        if (!rw)
                found = __ext_tree_lookup(&bl->bl_ext_ro, isect, ret);
        if (!found)
                found = __ext_tree_lookup(&bl->bl_ext_rw, isect, ret);
        spin_unlock(&bl->bl_ext_lock);

        return found;
}

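/*
 * Remove [@start, @end) from the read-only tree and, if @rw is set, from
 * the read-write tree as well.  Removed extents are freed once the
 * extent lock has been dropped.
 */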
int ext_tree_remove(struct pnfs_block_layout *bl, bool rw,
                sector_t start, sector_t end)
{
        int err, err2;
        LIST_HEAD(tmp);

        spin_lock(&bl->bl_ext_lock);
        err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp);
        if (rw) {
                err2 = __ext_tree_remove(&bl->bl_ext_rw, start, end, &tmp);
                if (!err)
                        err = err2;
        }
        spin_unlock(&bl->bl_ext_lock);

        __ext_put_deviceids(&tmp);
        return err;
}

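/*
 * Split @be at sector @split, which must fall inside the extent, by
 * shortening @be and inserting a new extent for the tail.
 */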
static int
ext_tree_split(struct rb_root *root, struct pnfs_block_extent *be,
                sector_t split)
{
        struct pnfs_block_extent *new;
        sector_t orig_len = be->be_length;

        new = kzalloc(sizeof(*new), GFP_ATOMIC);
        if (!new)
                return -ENOMEM;

        be->be_length = split - be->be_f_offset;

        new->be_f_offset = split;
        if (be->be_state != PNFS_BLOCK_NONE_DATA)
                new->be_v_offset = be->be_v_offset + be->be_length;
        new->be_length = orig_len - be->be_length;
        new->be_state = be->be_state;
        new->be_tag = be->be_tag;
        new->be_device = nfs4_get_deviceid(be->be_device);

        __ext_tree_insert(root, new, false);
        return 0;
}

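/*
 * Mark @len sectors starting at @start as written.  Read-only extents
 * (COW ranges and holes) covering the range are removed first, then any
 * untagged invalid extents inside the range are tagged EXTENT_WRITTEN,
 * splitting extents at the range boundaries where necessary.  @lwb
 * updates the last-written-byte offset used by the next layoutcommit.
 */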
int
ext_tree_mark_written(struct pnfs_block_layout *bl, sector_t start,
                sector_t len, u64 lwb)
{
        struct rb_root *root = &bl->bl_ext_rw;
        sector_t end = start + len;
        struct pnfs_block_extent *be;
        int err = 0;
        LIST_HEAD(tmp);

        spin_lock(&bl->bl_ext_lock);
        /*
         * First remove all COW extents or holes from written to range.
         */
        err = __ext_tree_remove(&bl->bl_ext_ro, start, end, &tmp);
        if (err)
                goto out;

        /*
         * Then mark all invalid extents in the range as written to.
         */
        for (be = __ext_tree_search(root, start); be; be = ext_tree_next(be)) {
                if (be->be_f_offset >= end)
                        break;

                if (be->be_state != PNFS_BLOCK_INVALID_DATA || be->be_tag)
                        continue;

                if (be->be_f_offset < start) {
                        struct pnfs_block_extent *left = ext_tree_prev(be);

                        if (left && ext_can_merge(left, be)) {
                                sector_t diff = start - be->be_f_offset;

                                left->be_length += diff;

                                be->be_f_offset += diff;
                                be->be_v_offset += diff;
                                be->be_length -= diff;
                        } else {
                                err = ext_tree_split(root, be, start);
                                if (err)
                                        goto out;
                        }
                }

                if (ext_f_end(be) > end) {
                        struct pnfs_block_extent *right = ext_tree_next(be);

                        if (right && ext_can_merge(be, right)) {
                                sector_t diff = end - be->be_f_offset;

                                be->be_length -= diff;

                                right->be_f_offset -= diff;
                                right->be_v_offset -= diff;
                                right->be_length += diff;
                        } else {
                                err = ext_tree_split(root, be, end);
                                if (err)
                                        goto out;
                        }
                }

                if (be->be_f_offset >= start && ext_f_end(be) <= end) {
                        be->be_tag = EXTENT_WRITTEN;
                        be = ext_try_to_merge_left(root, be);
                        be = ext_try_to_merge_right(root, be);
                }
        }
out:
        if (bl->bl_lwb < lwb)
                bl->bl_lwb = lwb;
        spin_unlock(&bl->bl_ext_lock);

        __ext_put_deviceids(&tmp);
        return err;
}

static size_t ext_tree_layoutupdate_size(struct pnfs_block_layout *bl, size_t count)
{
        if (bl->bl_scsi_layout)
                return sizeof(__be32) + PNFS_SCSI_RANGE_SIZE * count;
        else
                return sizeof(__be32) + PNFS_BLOCK_EXTENT_SIZE * count;
}

static void ext_tree_free_commitdata(struct nfs4_layoutcommit_args *arg,
                size_t buffer_size)
{
        if (arg->layoutupdate_pages != &arg->layoutupdate_page) {
                int nr_pages = DIV_ROUND_UP(buffer_size, PAGE_SIZE), i;

                for (i = 0; i < nr_pages; i++)
                        put_page(arg->layoutupdate_pages[i]);
                vfree(arg->start_p);
                kfree(arg->layoutupdate_pages);
        } else {
                put_page(arg->layoutupdate_page);
        }
}

static __be32 *encode_block_extent(struct pnfs_block_extent *be, __be32 *p)
{
        p = xdr_encode_opaque_fixed(p, be->be_device->deviceid.data,
                        NFS4_DEVICEID4_SIZE);
        p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
        p = xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
        p = xdr_encode_hyper(p, 0LL);
        *p++ = cpu_to_be32(PNFS_BLOCK_READWRITE_DATA);
        return p;
}

static __be32 *encode_scsi_range(struct pnfs_block_extent *be, __be32 *p)
{
        p = xdr_encode_hyper(p, be->be_f_offset << SECTOR_SHIFT);
        return xdr_encode_hyper(p, be->be_length << SECTOR_SHIFT);
}

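/*
 * Encode all written but not yet committed extents into the
 * layoutupdate buffer and tag them EXTENT_COMMITTING.  If the buffer is
 * too small, keep counting and return -ENOSPC so the caller can retry
 * with a larger allocation.
 */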
static int ext_tree_encode_commit(struct pnfs_block_layout *bl, __be32 *p,
                size_t buffer_size, size_t *count, __u64 *lastbyte)
{
        struct pnfs_block_extent *be;
        int ret = 0;

        spin_lock(&bl->bl_ext_lock);
        for (be = ext_tree_first(&bl->bl_ext_rw); be; be = ext_tree_next(be)) {
                if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
                    be->be_tag != EXTENT_WRITTEN)
                        continue;

                (*count)++;
                if (ext_tree_layoutupdate_size(bl, *count) > buffer_size) {
                        /* keep counting.. */
                        ret = -ENOSPC;
                        continue;
                }

                if (bl->bl_scsi_layout)
                        p = encode_scsi_range(be, p);
                else
                        p = encode_block_extent(be, p);
                be->be_tag = EXTENT_COMMITTING;
        }
        *lastbyte = bl->bl_lwb - 1;
        bl->bl_lwb = 0;
        spin_unlock(&bl->bl_ext_lock);

        return ret;
}

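/*
 * Build the layoutupdate payload for a layoutcommit.  A single page is
 * tried first; if the encoded extents do not fit, the buffer is
 * reallocated as a vmalloc'ed multi-page buffer and the encoding is
 * retried.
 */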
int
ext_tree_prepare_commit(struct nfs4_layoutcommit_args *arg)
{
        struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
        size_t count = 0, buffer_size = PAGE_SIZE;
        __be32 *start_p;
        int ret;

        dprintk("%s enter\n", __func__);

        arg->layoutupdate_page = alloc_page(GFP_NOFS);
        if (!arg->layoutupdate_page)
                return -ENOMEM;
        start_p = page_address(arg->layoutupdate_page);
        arg->layoutupdate_pages = &arg->layoutupdate_page;

retry:
        ret = ext_tree_encode_commit(bl, start_p + 1, buffer_size, &count, &arg->lastbytewritten);
        if (unlikely(ret)) {
                ext_tree_free_commitdata(arg, buffer_size);

                buffer_size = ext_tree_layoutupdate_size(bl, count);
                count = 0;

                arg->layoutupdate_pages =
                        kcalloc(DIV_ROUND_UP(buffer_size, PAGE_SIZE),
                                sizeof(struct page *), GFP_NOFS);
                if (!arg->layoutupdate_pages)
                        return -ENOMEM;

                start_p = __vmalloc(buffer_size, GFP_NOFS, PAGE_KERNEL);
                if (!start_p) {
                        kfree(arg->layoutupdate_pages);
                        return -ENOMEM;
                }

                goto retry;
        }

        *start_p = cpu_to_be32(count);
        arg->layoutupdate_len = ext_tree_layoutupdate_size(bl, count);

        if (unlikely(arg->layoutupdate_pages != &arg->layoutupdate_page)) {
                void *p = start_p, *end = p + arg->layoutupdate_len;
                struct page *page = NULL;
                int i = 0;

                arg->start_p = start_p;
                for ( ; p < end; p += PAGE_SIZE) {
                        page = vmalloc_to_page(p);
                        arg->layoutupdate_pages[i++] = page;
                        get_page(page);
                }
        }

        dprintk("%s found %zu ranges\n", __func__, count);
        return 0;
}

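/*
 * Process the result of a layoutcommit.  On success, committing extents
 * become valid read-write data; on failure they are reset to
 * EXTENT_WRITTEN so a later layoutcommit will retry them.
 */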
void
ext_tree_mark_committed(struct nfs4_layoutcommit_args *arg, int status)
{
        struct pnfs_block_layout *bl = BLK_LO2EXT(NFS_I(arg->inode)->layout);
        struct rb_root *root = &bl->bl_ext_rw;
        struct pnfs_block_extent *be;

        dprintk("%s status %d\n", __func__, status);

        ext_tree_free_commitdata(arg, arg->layoutupdate_len);

        spin_lock(&bl->bl_ext_lock);
        for (be = ext_tree_first(root); be; be = ext_tree_next(be)) {
                if (be->be_state != PNFS_BLOCK_INVALID_DATA ||
                    be->be_tag != EXTENT_COMMITTING)
                        continue;

                if (status) {
                        /*
                         * Mark as written and try again.
                         *
                         * XXX: some real error handling here wouldn't hurt..
                         */
                        be->be_tag = EXTENT_WRITTEN;
                } else {
                        be->be_state = PNFS_BLOCK_READWRITE_DATA;
                        be->be_tag = 0;
                }

                be = ext_try_to_merge_left(root, be);
                be = ext_try_to_merge_right(root, be);
        }
        spin_unlock(&bl->bl_ext_lock);
}