GNU Linux-libre 6.9-gnu
fs/xfs/xfs_extent_busy.c (releases.git)
// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
 * Copyright (c) 2010 David Chinner.
 * Copyright (c) 2011 Christoph Hellwig.
 * All Rights Reserved.
 */
#include "xfs.h"
#include "xfs_fs.h"
#include "xfs_format.h"
#include "xfs_log_format.h"
#include "xfs_shared.h"
#include "xfs_trans_resv.h"
#include "xfs_mount.h"
#include "xfs_alloc.h"
#include "xfs_extent_busy.h"
#include "xfs_trace.h"
#include "xfs_trans.h"
#include "xfs_log.h"
#include "xfs_ag.h"

static void
xfs_extent_busy_insert_list(
        struct xfs_perag        *pag,
        xfs_agblock_t           bno,
        xfs_extlen_t            len,
        unsigned int            flags,
        struct list_head        *busy_list)
{
        struct xfs_extent_busy  *new;
        struct xfs_extent_busy  *busyp;
        struct rb_node          **rbp;
        struct rb_node          *parent = NULL;

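        /*
         * The __GFP_NOFAIL allocation cannot fail, so no error handling or
         * NULL check is needed before using "new" below.
         */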
        new = kzalloc(sizeof(struct xfs_extent_busy),
                        GFP_KERNEL | __GFP_NOFAIL);
        new->agno = pag->pag_agno;
        new->bno = bno;
        new->length = len;
        INIT_LIST_HEAD(&new->list);
        new->flags = flags;

        /* trace before insert to be able to see failed inserts */
        trace_xfs_extent_busy(pag->pag_mount, pag->pag_agno, bno, len);

        spin_lock(&pag->pagb_lock);
        rbp = &pag->pagb_tree.rb_node;
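        /*
         * Descend the per-AG rbtree, which is indexed by starting block
         * number, to find the insertion point.  Busy extents must never
         * overlap, which the ASSERTs below check as we walk down.
         */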
        while (*rbp) {
                parent = *rbp;
                busyp = rb_entry(parent, struct xfs_extent_busy, rb_node);

                if (new->bno < busyp->bno) {
                        rbp = &(*rbp)->rb_left;
                        ASSERT(new->bno + new->length <= busyp->bno);
                } else if (new->bno > busyp->bno) {
                        rbp = &(*rbp)->rb_right;
                        ASSERT(bno >= busyp->bno + busyp->length);
                } else {
                        ASSERT(0);
                }
        }

        rb_link_node(&new->rb_node, parent, rbp);
        rb_insert_color(&new->rb_node, &pag->pagb_tree);

        /* always process discard lists in fifo order */
        list_add_tail(&new->list, busy_list);
        spin_unlock(&pag->pagb_lock);
}

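/*
 * Insert a busy extent on behalf of a transaction: the extent is added to
 * the per-AG busy tree and to the transaction's t_busy list, which is later
 * passed to xfs_extent_busy_clear().
 */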
void
xfs_extent_busy_insert(
        struct xfs_trans        *tp,
        struct xfs_perag        *pag,
        xfs_agblock_t           bno,
        xfs_extlen_t            len,
        unsigned int            flags)
{
        xfs_extent_busy_insert_list(pag, bno, len, flags, &tp->t_busy);
}

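/*
 * As above, but for extents that are about to be discarded: the extent is
 * added to the caller supplied list and immediately flagged as
 * XFS_EXTENT_BUSY_DISCARDED.
 */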
void
xfs_extent_busy_insert_discard(
        struct xfs_perag        *pag,
        xfs_agblock_t           bno,
        xfs_extlen_t            len,
        struct list_head        *busy_list)
{
        xfs_extent_busy_insert_list(pag, bno, len, XFS_EXTENT_BUSY_DISCARDED,
                        busy_list);
}

/*
 * Search for a busy extent within the range of the extent we are about to
 * allocate.  The busy extent tree lock is taken internally, so the caller
 * must not already hold it.  This function returns 0 for no overlapping busy
 * extent, -1 for an overlapping but not exact busy extent, and 1 for an exact
 * match. This is done so that a non-zero return indicates an overlap that
 * will require a synchronous transaction, but the return value can still be
 * used to distinguish between a partial and an exact match.
 */
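/*
 * For example, with a single busy extent covering blocks [100, 150):
 * searching for bno 100, len 50 returns 1 (exact match), bno 120, len 10
 * returns -1 (partial overlap), and bno 200, len 10 returns 0 (no overlap).
 */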
int
xfs_extent_busy_search(
        struct xfs_mount        *mp,
        struct xfs_perag        *pag,
        xfs_agblock_t           bno,
        xfs_extlen_t            len)
{
        struct rb_node          *rbp;
        struct xfs_extent_busy  *busyp;
        int                     match = 0;

        /* find closest start bno overlap */
        spin_lock(&pag->pagb_lock);
        rbp = pag->pagb_tree.rb_node;
        while (rbp) {
                busyp = rb_entry(rbp, struct xfs_extent_busy, rb_node);
                if (bno < busyp->bno) {
                        /* may overlap, but exact start block is lower */
                        if (bno + len > busyp->bno)
                                match = -1;
                        rbp = rbp->rb_left;
                } else if (bno > busyp->bno) {
                        /* may overlap, but exact start block is higher */
                        if (bno < busyp->bno + busyp->length)
                                match = -1;
                        rbp = rbp->rb_right;
                } else {
                        /* bno matches busyp, length determines exact match */
                        match = (busyp->length == len) ? 1 : -1;
                        break;
                }
        }
        spin_unlock(&pag->pagb_lock);
        return match;
}

/*
 * The found free extent [fbno, fend] overlaps part or all of the given busy
 * extent.  If the overlap covers the beginning, the end, or all of the busy
 * extent, the overlapping portion can be made unbusy and used for the
 * allocation.  We can't split a busy extent because we can't modify a
 * transaction/CIL context busy list, but we can update an entry's block
 * number or length.
 *
 * Returns true if the extent can safely be reused, or false if the search
 * needs to be restarted.
 */
STATIC bool
xfs_extent_busy_update_extent(
        struct xfs_mount        *mp,
        struct xfs_perag        *pag,
        struct xfs_extent_busy  *busyp,
        xfs_agblock_t           fbno,
        xfs_extlen_t            flen,
        bool                    userdata) __releases(&pag->pagb_lock)
                                          __acquires(&pag->pagb_lock)
{
        xfs_agblock_t           fend = fbno + flen;
        xfs_agblock_t           bbno = busyp->bno;
        xfs_agblock_t           bend = bbno + busyp->length;

        /*
         * This extent is currently being discarded.  Give the thread
         * performing the discard a chance to mark the extent unbusy
         * and retry.
         */
        if (busyp->flags & XFS_EXTENT_BUSY_DISCARDED) {
                spin_unlock(&pag->pagb_lock);
                delay(1);
                spin_lock(&pag->pagb_lock);
                return false;
        }

        /*
         * If there is a busy extent overlapping a user allocation, we have
         * no choice but to force the log and retry the search.
         *
         * Fortunately this does not happen during normal operation, but
         * only if the filesystem is very low on space and has to dip into
         * the AGFL for normal allocations.
         */
        if (userdata)
                goto out_force_log;

        if (bbno < fbno && bend > fend) {
                /*
                 * Case 1:
                 *    bbno           bend
                 *    +BBBBBBBBBBBBBBBBB+
                 *        +---------+
                 *        fbno   fend
                 */

                /*
                 * We would have to split the busy extent to be able to track
                 * it correctly, which we cannot do because we would have to
                 * modify the list of busy extents attached to the transaction
                 * or CIL context, which is immutable.
                 *
                 * Force out the log to clear the busy extent and retry the
                 * search.
                 */
                goto out_force_log;
        } else if (bbno >= fbno && bend <= fend) {
                /*
                 * Case 2:
                 *    bbno           bend
                 *    +BBBBBBBBBBBBBBBBB+
                 *    +-----------------+
                 *    fbno           fend
                 *
                 * Case 3:
                 *    bbno           bend
                 *    +BBBBBBBBBBBBBBBBB+
                 *    +--------------------------+
                 *    fbno                    fend
                 *
                 * Case 4:
                 *             bbno           bend
                 *             +BBBBBBBBBBBBBBBBB+
                 *    +--------------------------+
                 *    fbno                    fend
                 *
                 * Case 5:
                 *             bbno           bend
                 *             +BBBBBBBBBBBBBBBBB+
                 *    +-----------------------------------+
                 *    fbno                             fend
                 *
                 */

                /*
                 * The busy extent is fully covered by the extent we are
                 * allocating, and can simply be removed from the rbtree.
                 * However we cannot remove it from the immutable list
                 * tracking busy extents in the transaction or CIL context,
                 * so set the length to zero to mark it invalid.
                 *
                 * We also need to restart the busy extent search from the
                 * tree root, because erasing the node can rearrange the
                 * tree topology.
                 */
                rb_erase(&busyp->rb_node, &pag->pagb_tree);
                busyp->length = 0;
                return false;
        } else if (fend < bend) {
                /*
                 * Case 6:
                 *             bbno           bend
                 *             +BBBBBBBBBBBBBBBBB+
                 *             +---------+
                 *             fbno   fend
                 *
                 * Case 7:
                 *             bbno           bend
                 *             +BBBBBBBBBBBBBBBBB+
                 *    +------------------+
                 *    fbno            fend
                 *
                 */
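                /*
                 * Trim the front of the busy extent in place.  Shrinking it
                 * within its original range cannot change its position in
                 * the rbtree, because no other busy extent overlaps that
                 * range.
                 */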
                busyp->bno = fend;
                busyp->length = bend - fend;
        } else if (bbno < fbno) {
                /*
                 * Case 8:
                 *    bbno           bend
                 *    +BBBBBBBBBBBBBBBBB+
                 *        +-------------+
                 *        fbno       fend
                 *
                 * Case 9:
                 *    bbno           bend
                 *    +BBBBBBBBBBBBBBBBB+
                 *        +----------------------+
                 *        fbno                fend
                 */
                busyp->length = fbno - busyp->bno;
        } else {
                ASSERT(0);
        }

        trace_xfs_extent_busy_reuse(mp, pag->pag_agno, fbno, flen);
        return true;

out_force_log:
        spin_unlock(&pag->pagb_lock);
        xfs_log_force(mp, XFS_LOG_SYNC);
        trace_xfs_extent_busy_force(mp, pag->pag_agno, fbno, flen);
        spin_lock(&pag->pagb_lock);
        return false;
}

/*
 * For a given extent [fbno, flen], make sure we can reuse it safely.
 */
void
xfs_extent_busy_reuse(
        struct xfs_mount        *mp,
        struct xfs_perag        *pag,
        xfs_agblock_t           fbno,
        xfs_extlen_t            flen,
        bool                    userdata)
{
        struct rb_node          *rbp;

        ASSERT(flen > 0);
        spin_lock(&pag->pagb_lock);
restart:
        rbp = pag->pagb_tree.rb_node;
        while (rbp) {
                struct xfs_extent_busy *busyp =
                        rb_entry(rbp, struct xfs_extent_busy, rb_node);
                xfs_agblock_t   bbno = busyp->bno;
                xfs_agblock_t   bend = bbno + busyp->length;

                if (fbno + flen <= bbno) {
                        rbp = rbp->rb_left;
                        continue;
                } else if (fbno >= bend) {
                        rbp = rbp->rb_right;
                        continue;
                }

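                /*
                 * This busy extent overlaps [fbno, fbno + flen).  If the
                 * update drops pagb_lock or erases a node, the tree may
                 * have changed underneath us, so restart the walk from the
                 * root.
                 */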
                if (!xfs_extent_busy_update_extent(mp, pag, busyp, fbno, flen,
                                                  userdata))
                        goto restart;
        }
        spin_unlock(&pag->pagb_lock);
}

/*
 * For a given extent [fbno, flen], search the busy extent list to find a
 * subset of the extent that is not busy.  If the trimmed *len is smaller
 * than args->minlen, no suitable extent could be found, and the higher
 * level code needs to force out the log and retry the allocation.
 *
 * Return the current busy generation for the AG if the extent is busy. This
 * value can be used to wait for at least one of the currently busy extents
 * to be cleared. Note that the busy list is not guaranteed to be empty when
 * waiters on that generation are woken; the state of a specific extent must
 * always be confirmed with another call to xfs_extent_busy_trim() before it
 * can be used.
 */
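/*
 * For example, trimming the free extent [100, 150) against a busy extent
 * covering [90, 110) is a start overlap: the result is *bno = 110 and
 * *len = 40, i.e. the unbusy tail [110, 150).
 */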
bool
xfs_extent_busy_trim(
        struct xfs_alloc_arg    *args,
        xfs_agblock_t           *bno,
        xfs_extlen_t            *len,
        unsigned                *busy_gen)
{
        xfs_agblock_t           fbno;
        xfs_extlen_t            flen;
        struct rb_node          *rbp;
        bool                    ret = false;

        ASSERT(*len > 0);

        spin_lock(&args->pag->pagb_lock);
        fbno = *bno;
        flen = *len;
        rbp = args->pag->pagb_tree.rb_node;
        while (rbp && flen >= args->minlen) {
                struct xfs_extent_busy *busyp =
                        rb_entry(rbp, struct xfs_extent_busy, rb_node);
                xfs_agblock_t   fend = fbno + flen;
                xfs_agblock_t   bbno = busyp->bno;
                xfs_agblock_t   bend = bbno + busyp->length;

                if (fend <= bbno) {
                        rbp = rbp->rb_left;
                        continue;
                } else if (fbno >= bend) {
                        rbp = rbp->rb_right;
                        continue;
                }

                if (bbno <= fbno) {
                        /* start overlap */

                        /*
                         * Case 1:
                         *    bbno           bend
                         *    +BBBBBBBBBBBBBBBBB+
                         *        +---------+
                         *        fbno   fend
                         *
                         * Case 2:
                         *    bbno           bend
                         *    +BBBBBBBBBBBBBBBBB+
                         *    +-------------+
                         *    fbno       fend
                         *
                         * Case 3:
                         *    bbno           bend
                         *    +BBBBBBBBBBBBBBBBB+
                         *        +-------------+
                         *        fbno       fend
                         *
                         * Case 4:
                         *    bbno           bend
                         *    +BBBBBBBBBBBBBBBBB+
                         *    +-----------------+
                         *    fbno           fend
                         *
                         * No unbusy region in extent, return failure.
                         */
                        if (fend <= bend)
                                goto fail;

                        /*
                         * Case 5:
                         *    bbno           bend
                         *    +BBBBBBBBBBBBBBBBB+
                         *        +----------------------+
                         *        fbno                fend
                         *
                         * Case 6:
                         *    bbno           bend
                         *    +BBBBBBBBBBBBBBBBB+
                         *    +--------------------------+
                         *    fbno                    fend
                         *
                         * Needs to be trimmed to:
                         *                       +-------+
                         *                       fbno fend
                         */
                        fbno = bend;
                } else if (bend >= fend) {
                        /* end overlap */

                        /*
                         * Case 7:
                         *             bbno           bend
                         *             +BBBBBBBBBBBBBBBBB+
                         *    +------------------+
                         *    fbno            fend
                         *
                         * Case 8:
                         *             bbno           bend
                         *             +BBBBBBBBBBBBBBBBB+
                         *    +--------------------------+
                         *    fbno                    fend
                         *
                         * Needs to be trimmed to:
                         *    +-------+
                         *    fbno fend
                         */
                        fend = bbno;
                } else {
                        /* middle overlap */

                        /*
                         * Case 9:
                         *             bbno           bend
                         *             +BBBBBBBBBBBBBBBBB+
                         *    +-----------------------------------+
                         *    fbno                             fend
                         *
                         * Can be trimmed to:
                         *    +-------+        OR         +-------+
                         *    fbno fend                   fbno fend
                         *
                         * Backward allocation leads to significant
                         * fragmentation of directories, which degrades
                         * directory performance, therefore we always want to
                         * choose the option that produces forward allocation
                         * patterns.
                         *
                         * Preferring the lower bno extent will make the next
                         * request use "fend" as the start of the next
                         * allocation; if the segment is no longer busy at
                         * that point, we'll get a contiguous allocation, but
                         * even if it is still busy, we will get a forward
                         * allocation.
                         *
                         * We try to avoid choosing the segment at "bend",
                         * because that can lead to the next allocation
                         * taking the segment at "fbno", which would be a
                         * backward allocation.  We only use the segment at
                         * "fbno" if it is much larger than the current
                         * requested size, because in that case there's a
                         * good chance subsequent allocations will be
                         * contiguous.
                         */
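                        /*
                         * For example, with free [100, 200), busy [140, 160)
                         * and args->maxlen = 40, the lower candidate
                         * [100, 140) is 40 blocks and satisfies maxlen, so
                         * the extent is trimmed to [100, 140).
                         */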
                        if (bbno - fbno >= args->maxlen) {
                                /* left candidate fits perfectly */
                                fend = bbno;
                        } else if (fend - bend >= args->maxlen * 4) {
                                /* right candidate has enough free space */
                                fbno = bend;
                        } else if (bbno - fbno >= args->minlen) {
                                /* left candidate fits minimum requirement */
                                fend = bbno;
                        } else {
                                goto fail;
                        }
                }

                flen = fend - fbno;
        }
out:

        if (fbno != *bno || flen != *len) {
                trace_xfs_extent_busy_trim(args->mp, args->agno, *bno, *len,
                                          fbno, flen);
                *bno = fbno;
                *len = flen;
                *busy_gen = args->pag->pagb_gen;
                ret = true;
        }
        spin_unlock(&args->pag->pagb_lock);
        return ret;
fail:
        /*
         * Return a zero extent length as a failure indication.  All callers
         * re-check if the trimmed extent satisfies the minlen requirement.
         */
        flen = 0;
        goto out;
}

STATIC void
xfs_extent_busy_clear_one(
        struct xfs_mount        *mp,
        struct xfs_perag        *pag,
        struct xfs_extent_busy  *busyp)
{
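        /*
         * A zero length means xfs_extent_busy_update_extent() has already
         * removed this extent from the rbtree; only the list entry and the
         * structure itself are left to tear down.
         */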
        if (busyp->length) {
                trace_xfs_extent_busy_clear(mp, busyp->agno, busyp->bno,
                                                busyp->length);
                rb_erase(&busyp->rb_node, &pag->pagb_tree);
        }

        list_del_init(&busyp->list);
        kfree(busyp);
}

static void
xfs_extent_busy_put_pag(
        struct xfs_perag        *pag,
        bool                    wakeup)
                __releases(pag->pagb_lock)
{
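        /*
         * Bumping pagb_gen tells waiters in xfs_extent_busy_flush() that at
         * least one busy extent in this AG has been cleared since they
         * sampled the generation number.
         */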
        if (wakeup) {
                pag->pagb_gen++;
                wake_up_all(&pag->pagb_wait);
        }

        spin_unlock(&pag->pagb_lock);
        xfs_perag_put(pag);
}

/*
 * Remove all extents on the passed in list from the busy extents tree.
 * If do_discard is set, skip extents that need to be discarded, and mark
 * these as undergoing a discard operation instead.
 */
void
xfs_extent_busy_clear(
        struct xfs_mount        *mp,
        struct list_head        *list,
        bool                    do_discard)
{
        struct xfs_extent_busy  *busyp, *n;
        struct xfs_perag        *pag = NULL;
        xfs_agnumber_t          agno = NULLAGNUMBER;
        bool                    wakeup = false;

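        /*
         * The list is walked in order; the per-AG busy lock is dropped and
         * re-taken whenever the AG number changes from the previous entry,
         * so a list sorted by AG keeps the lock switching to a minimum.
         */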
        list_for_each_entry_safe(busyp, n, list, list) {
                if (busyp->agno != agno) {
                        if (pag)
                                xfs_extent_busy_put_pag(pag, wakeup);
                        agno = busyp->agno;
                        pag = xfs_perag_get(mp, agno);
                        spin_lock(&pag->pagb_lock);
                        wakeup = false;
                }

                if (do_discard && busyp->length &&
                    !(busyp->flags & XFS_EXTENT_BUSY_SKIP_DISCARD)) {
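                        /*
                         * Leave the extent in the rbtree and mark it as
                         * being discarded; it is cleared by a later call to
                         * xfs_extent_busy_clear() once the discard has been
                         * issued.
                         */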
                        busyp->flags = XFS_EXTENT_BUSY_DISCARDED;
                } else {
                        xfs_extent_busy_clear_one(mp, pag, busyp);
                        wakeup = true;
                }
        }

        if (pag)
                xfs_extent_busy_put_pag(pag, wakeup);
}

/*
 * Flush out all busy extents for this AG.
 *
 * If the current transaction is holding busy extents, the caller may not want
 * to wait for committed busy extents to resolve. If we are being told just to
 * try a flush or progress has been made since we last skipped a busy extent,
 * return immediately to allow the caller to try again.
 *
 * If we are freeing extents, we might actually be holding the only free extents
 * in the transaction busy list and the log force won't resolve that situation.
 * In this case, we must return -EAGAIN to avoid a deadlock by informing the
 * caller it needs to commit the busy extents it holds before retrying the
 * extent free operation.
 */
int
xfs_extent_busy_flush(
        struct xfs_trans        *tp,
        struct xfs_perag        *pag,
        unsigned                busy_gen,
        uint32_t                alloc_flags)
{
        DEFINE_WAIT             (wait);
        int                     error;

        error = xfs_log_force(tp->t_mountp, XFS_LOG_SYNC);
        if (error)
                return error;

        /* Avoid deadlocks on uncommitted busy extents. */
        if (!list_empty(&tp->t_busy)) {
                if (alloc_flags & XFS_ALLOC_FLAG_TRYFLUSH)
                        return 0;

                if (busy_gen != READ_ONCE(pag->pagb_gen))
                        return 0;

                if (alloc_flags & XFS_ALLOC_FLAG_FREEING)
                        return -EAGAIN;
        }

        /* Wait for committed busy extents to resolve. */
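        /*
         * prepare_to_wait() is called before re-checking pagb_gen so that a
         * generation bump and wakeup from xfs_extent_busy_put_pag() cannot
         * be missed between the check and schedule().
         */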
        do {
                prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
                if (busy_gen != READ_ONCE(pag->pagb_gen))
                        break;
                schedule();
        } while (1);

        finish_wait(&pag->pagb_wait, &wait);
        return 0;
}

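/*
 * Wait until no AG of this mount has any busy extents left in its tree.
 */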
void
xfs_extent_busy_wait_all(
        struct xfs_mount        *mp)
{
        struct xfs_perag        *pag;
        DEFINE_WAIT             (wait);
        xfs_agnumber_t          agno;

        for_each_perag(mp, agno, pag) {
                do {
                        prepare_to_wait(&pag->pagb_wait, &wait, TASK_KILLABLE);
                        if (RB_EMPTY_ROOT(&pag->pagb_tree))
                                break;
                        schedule();
                } while (1);
                finish_wait(&pag->pagb_wait, &wait);
        }
}

/*
 * Callback for list_sort to sort busy extents by the AG they reside in.
 */
int
xfs_extent_busy_ag_cmp(
        void                    *priv,
        const struct list_head  *l1,
        const struct list_head  *l2)
{
        struct xfs_extent_busy  *b1 =
                container_of(l1, struct xfs_extent_busy, list);
        struct xfs_extent_busy  *b2 =
                container_of(l2, struct xfs_extent_busy, list);
        s32 diff;

        diff = b1->agno - b2->agno;
        if (!diff)
                diff = b1->bno - b2->bno;
        return diff;
}

/* Are there any busy extents in this AG? */
bool
xfs_extent_busy_list_empty(
        struct xfs_perag        *pag)
{
        bool                    res;

        spin_lock(&pag->pagb_lock);
        res = RB_EMPTY_ROOT(&pag->pagb_tree);
        spin_unlock(&pag->pagb_lock);
        return res;
}