1 // SPDX-License-Identifier: GPL-2.0
3 * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
8 #include "xfs_format.h"
9 #include "xfs_log_format.h"
10 #include "xfs_shared.h"
11 #include "xfs_trans_resv.h"
13 #include "xfs_mount.h"
14 #include "xfs_defer.h"
15 #include "xfs_btree.h"
17 #include "xfs_alloc_btree.h"
18 #include "xfs_alloc.h"
19 #include "xfs_extent_busy.h"
20 #include "xfs_errortag.h"
21 #include "xfs_error.h"
22 #include "xfs_trace.h"
23 #include "xfs_trans.h"
24 #include "xfs_buf_item.h"
27 #include "xfs_ag_resv.h"
30 struct kmem_cache *xfs_extfree_item_cache;
32 struct workqueue_struct *xfs_alloc_wq;
34 #define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b)))
36 #define XFSA_FIXUP_BNO_OK 1
37 #define XFSA_FIXUP_CNT_OK 2
39 STATIC int xfs_alloc_ag_vextent_exact(xfs_alloc_arg_t *);
40 STATIC int xfs_alloc_ag_vextent_near(xfs_alloc_arg_t *);
41 STATIC int xfs_alloc_ag_vextent_size(xfs_alloc_arg_t *);
44 * Size of the AGFL. For CRC-enabled filesystes we steal a couple of slots in
45 * the beginning of the block for a proper header with the location information
52 unsigned int size = mp->m_sb.sb_sectsize;
55 size -= sizeof(struct xfs_agfl);
57 return size / sizeof(xfs_agblock_t);
/*
 * Return the first AG block past the last statically-placed btree root; on a
 * reflink filesystem this is where the refcount btree root block lives.
 */
unsigned int
xfs_refc_block(
	struct xfs_mount	*mp)
{
	if (xfs_has_rmapbt(mp))
		return XFS_RMAP_BLOCK(mp) + 1;
	if (xfs_has_finobt(mp))
		return XFS_FIBT_BLOCK(mp) + 1;
	return XFS_IBT_BLOCK(mp) + 1;
}
75 if (xfs_has_reflink(mp))
76 return xfs_refc_block(mp) + 1;
77 if (xfs_has_rmapbt(mp))
78 return XFS_RMAP_BLOCK(mp) + 1;
79 if (xfs_has_finobt(mp))
80 return XFS_FIBT_BLOCK(mp) + 1;
81 return XFS_IBT_BLOCK(mp) + 1;
/*
 * The number of blocks per AG that we withhold from xfs_mod_fdblocks to
 * guarantee that we can refill the AGFL prior to allocating space in a nearly
 * full AG.  Although the space described by the free space btrees, the
 * blocks used by the freesp btrees themselves, and the blocks owned by the
 * AGFL are counted in the ondisk fdblocks, it's a mistake to let the ondisk
 * free space in the AG drop so low that the free space btrees cannot refill an
 * empty AGFL up to the minimum level.  Rather than grind through empty AGs
 * until the fs goes down, we subtract this many AG blocks from the incore
 * fdblocks to ensure user allocation does not overcommit the space the
 * filesystem needs for the AGFLs.  The rmap btree uses a per-AG reservation to
 * withhold space from xfs_mod_fdblocks, so we do not account for that here.
 */
#define XFS_ALLOCBT_AGFL_RESERVE	4
100 * Compute the number of blocks that we set aside to guarantee the ability to
101 * refill the AGFL and handle a full bmap btree split.
103 * In order to avoid ENOSPC-related deadlock caused by out-of-order locking of
104 * AGF buffer (PV 947395), we place constraints on the relationship among
105 * actual allocations for data blocks, freelist blocks, and potential file data
106 * bmap btree blocks. However, these restrictions may result in no actual space
107 * allocated for a delayed extent, for example, a data block in a certain AG is
108 * allocated but there is no additional block for the additional bmap btree
109 * block due to a split of the bmap btree of the file. The result of this may
110 * lead to an infinite loop when the file gets flushed to disk and all delayed
111 * extents need to be actually allocated. To get around this, we explicitly set
112 * aside a few blocks which will not be reserved in delayed allocation.
114 * For each AG, we need to reserve enough blocks to replenish a totally empty
115 * AGFL and 4 more to handle a potential split of the file's bmap btree.
119 struct xfs_mount *mp)
121 return mp->m_sb.sb_agcount * (XFS_ALLOCBT_AGFL_RESERVE + 4);
125 * When deciding how much space to allocate out of an AG, we limit the
126 * allocation maximum size to the size the AG. However, we cannot use all the
127 * blocks in the AG - some are permanently used by metadata. These
128 * blocks are generally:
129 * - the AG superblock, AGF, AGI and AGFL
130 * - the AGF (bno and cnt) and AGI btree root blocks, and optionally
131 * the AGI free inode and rmap btree root blocks.
132 * - blocks on the AGFL according to xfs_alloc_set_aside() limits
133 * - the rmapbt root block
135 * The AG headers are sector sized, so the amount of space they take up is
136 * dependent on filesystem geometry. The others are all single blocks.
139 xfs_alloc_ag_max_usable(
140 struct xfs_mount *mp)
144 blocks = XFS_BB_TO_FSB(mp, XFS_FSS_TO_BB(mp, 4)); /* ag headers */
145 blocks += XFS_ALLOCBT_AGFL_RESERVE;
146 blocks += 3; /* AGF, AGI btree root blocks */
147 if (xfs_has_finobt(mp))
148 blocks++; /* finobt root block */
149 if (xfs_has_rmapbt(mp))
150 blocks++; /* rmap root block */
151 if (xfs_has_reflink(mp))
152 blocks++; /* refcount root block */
154 return mp->m_sb.sb_agblocks - blocks;
158 * Lookup the record equal to [bno, len] in the btree given by cur.
160 STATIC int /* error */
162 struct xfs_btree_cur *cur, /* btree cursor */
163 xfs_agblock_t bno, /* starting block of extent */
164 xfs_extlen_t len, /* length of extent */
165 int *stat) /* success/failure */
169 cur->bc_rec.a.ar_startblock = bno;
170 cur->bc_rec.a.ar_blockcount = len;
171 error = xfs_btree_lookup(cur, XFS_LOOKUP_EQ, stat);
172 cur->bc_ag.abt.active = (*stat == 1);
177 * Lookup the first record greater than or equal to [bno, len]
178 * in the btree given by cur.
182 struct xfs_btree_cur *cur, /* btree cursor */
183 xfs_agblock_t bno, /* starting block of extent */
184 xfs_extlen_t len, /* length of extent */
185 int *stat) /* success/failure */
189 cur->bc_rec.a.ar_startblock = bno;
190 cur->bc_rec.a.ar_blockcount = len;
191 error = xfs_btree_lookup(cur, XFS_LOOKUP_GE, stat);
192 cur->bc_ag.abt.active = (*stat == 1);
197 * Lookup the first record less than or equal to [bno, len]
198 * in the btree given by cur.
202 struct xfs_btree_cur *cur, /* btree cursor */
203 xfs_agblock_t bno, /* starting block of extent */
204 xfs_extlen_t len, /* length of extent */
205 int *stat) /* success/failure */
208 cur->bc_rec.a.ar_startblock = bno;
209 cur->bc_rec.a.ar_blockcount = len;
210 error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, stat);
211 cur->bc_ag.abt.active = (*stat == 1);
216 xfs_alloc_cur_active(
217 struct xfs_btree_cur *cur)
219 return cur && cur->bc_ag.abt.active;
223 * Update the record referred to by cur to the value given
225 * This either works (return 0) or gets an EFSCORRUPTED error.
227 STATIC int /* error */
229 struct xfs_btree_cur *cur, /* btree cursor */
230 xfs_agblock_t bno, /* starting block of extent */
231 xfs_extlen_t len) /* length of extent */
233 union xfs_btree_rec rec;
235 rec.alloc.ar_startblock = cpu_to_be32(bno);
236 rec.alloc.ar_blockcount = cpu_to_be32(len);
237 return xfs_btree_update(cur, &rec);
/*
 * xfs_alloc_get_rec: decode the cursor's current freespace btree record into
 * (*bno, *len), then range-check it (valid AG block, no length overflow) and
 * return -EFSCORRUPTED with a loud log message if it is malformed.
 * NOTE(review): this listing is fragmentary — interior lines (error gotos,
 * braces, the xfs_alert call site) appear to be elided; compare against a
 * pristine copy before relying on it.
 */
241 * Get the data from the pointed-to record.
245 struct xfs_btree_cur *cur, /* btree cursor */
246 xfs_agblock_t *bno, /* output: starting block of extent */
247 xfs_extlen_t *len, /* output: length of extent */
248 int *stat) /* output: success/failure */
250 struct xfs_mount *mp = cur->bc_mp;
251 struct xfs_perag *pag = cur->bc_ag.pag;
252 union xfs_btree_rec *rec;
255 error = xfs_btree_get_rec(cur, &rec, stat);
256 if (error || !(*stat))
259 *bno = be32_to_cpu(rec->alloc.ar_startblock);
260 *len = be32_to_cpu(rec->alloc.ar_blockcount);
265 /* check for valid extent range, including overflow */
266 if (!xfs_verify_agbno(pag, *bno))
268 if (*bno > *bno + *len)
270 if (!xfs_verify_agbno(pag, *bno + *len - 1))
277 "%s Freespace BTree record corruption in AG %d detected!",
278 cur->bc_btnum == XFS_BTNUM_BNO ? "Block" : "Size",
281 "start block 0x%x block count 0x%x", *bno, *len);
282 return -EFSCORRUPTED;
/*
 * xfs_alloc_compute_aligned: trim busy ranges out of a found free extent,
 * shift it up past args->min_agbno when possible, and round the start up to
 * args->alignment; results go to *resbno/*reslen (zero length if the aligned
 * portion vanishes).
 * NOTE(review): fragmentary listing — the busy/diff declarations and the
 * unaligned else-path appear to be elided.
 */
286 * Compute aligned version of the found extent.
287 * Takes alignment and min length into account.
290 xfs_alloc_compute_aligned(
291 xfs_alloc_arg_t *args, /* allocation argument structure */
292 xfs_agblock_t foundbno, /* starting block in found extent */
293 xfs_extlen_t foundlen, /* length in found extent */
294 xfs_agblock_t *resbno, /* result block number */
295 xfs_extlen_t *reslen, /* result length */
298 xfs_agblock_t bno = foundbno;
299 xfs_extlen_t len = foundlen;
303 /* Trim busy sections out of found extent */
304 busy = xfs_extent_busy_trim(args, &bno, &len, busy_gen);
307 * If we have a largish extent that happens to start before min_agbno,
308 * see if we can shift it into range...
310 if (bno < args->min_agbno && bno + len > args->min_agbno) {
311 diff = args->min_agbno - bno;
318 if (args->alignment > 1 && len >= args->minlen) {
319 xfs_agblock_t aligned_bno = roundup(bno, args->alignment);
321 diff = aligned_bno - bno;
323 *resbno = aligned_bno;
324 *reslen = diff >= len ? 0 : len - diff;
/*
 * xfs_alloc_compute_diff: pick the best start block inside a free extent for
 * a "near bno" allocation and return the absolute distance from the wanted
 * block (0 also means "no usable aligned block" when *newbnop comes back as
 * NULLAGBLOCK).
 * NOTE(review): fragmentary listing — several assignment/else lines between
 * the visible branches appear to be elided.
 */
334 * Compute best start block and diff for "near" allocations.
335 * freelen >= wantlen already checked by caller.
337 STATIC xfs_extlen_t /* difference value (absolute) */
338 xfs_alloc_compute_diff(
339 xfs_agblock_t wantbno, /* target starting block */
340 xfs_extlen_t wantlen, /* target length */
341 xfs_extlen_t alignment, /* target alignment */
342 int datatype, /* are we allocating data? */
343 xfs_agblock_t freebno, /* freespace's starting block */
344 xfs_extlen_t freelen, /* freespace's length */
345 xfs_agblock_t *newbnop) /* result: best start block from free */
347 xfs_agblock_t freeend; /* end of freespace extent */
348 xfs_agblock_t newbno1; /* return block number */
349 xfs_agblock_t newbno2; /* other new block number */
350 xfs_extlen_t newlen1=0; /* length with newbno1 */
351 xfs_extlen_t newlen2=0; /* length with newbno2 */
352 xfs_agblock_t wantend; /* end of target extent */
353 bool userdata = datatype & XFS_ALLOC_USERDATA;
355 ASSERT(freelen >= wantlen);
356 freeend = freebno + freelen;
357 wantend = wantbno + wantlen;
359 * We want to allocate from the start of a free extent if it is past
360 * the desired block or if we are allocating user data and the free
361 * extent is before desired block. The second case is there to allow
362 * for contiguous allocation from the remaining free space if the file
363 * grows in the short term.
365 if (freebno >= wantbno || (userdata && freeend < wantend)) {
366 if ((newbno1 = roundup(freebno, alignment)) >= freeend)
367 newbno1 = NULLAGBLOCK;
368 } else if (freeend >= wantend && alignment > 1) {
369 newbno1 = roundup(wantbno, alignment);
370 newbno2 = newbno1 - alignment;
371 if (newbno1 >= freeend)
372 newbno1 = NULLAGBLOCK;
374 newlen1 = XFS_EXTLEN_MIN(wantlen, freeend - newbno1);
375 if (newbno2 < freebno)
376 newbno2 = NULLAGBLOCK;
378 newlen2 = XFS_EXTLEN_MIN(wantlen, freeend - newbno2);
379 if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) {
380 if (newlen1 < newlen2 ||
381 (newlen1 == newlen2 &&
382 XFS_ABSDIFF(newbno1, wantbno) >
383 XFS_ABSDIFF(newbno2, wantbno)))
385 } else if (newbno2 != NULLAGBLOCK)
387 } else if (freeend >= wantend) {
389 } else if (alignment > 1) {
390 newbno1 = roundup(freeend - wantlen, alignment);
391 if (newbno1 > freeend - wantlen &&
392 newbno1 - alignment >= freebno)
393 newbno1 -= alignment;
394 else if (newbno1 >= freeend)
395 newbno1 = NULLAGBLOCK;
397 newbno1 = freeend - wantlen;
399 return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno);
/*
 * xfs_alloc_fix_len: round args->len down so that it is congruent to
 * args->mod modulo args->prod, never dropping below args->minlen; leaves the
 * length alone when prod/mod constraints do not apply.
 * NOTE(review): fragmentary listing — the rlen declaration/assignment and
 * early-return lines appear to be elided.
 */
403 * Fix up the length, based on mod and prod.
404 * len should be k * prod + mod for some k.
405 * If len is too small it is returned unchanged.
406 * If len hits maxlen it is left alone.
410 xfs_alloc_arg_t *args) /* allocation argument structure */
415 ASSERT(args->mod < args->prod);
417 ASSERT(rlen >= args->minlen);
418 ASSERT(rlen <= args->maxlen);
419 if (args->prod <= 1 || rlen < args->mod || rlen == args->maxlen ||
420 (args->mod == 0 && rlen < args->prod))
422 k = rlen % args->prod;
426 rlen = rlen - (k - args->mod);
428 rlen = rlen - args->prod + (args->mod - k);
429 /* casts to (int) catch length underflows */
430 if ((int)rlen < (int)args->minlen)
432 ASSERT(rlen >= args->minlen && rlen <= args->maxlen);
433 ASSERT(rlen % args->prod == args->mod);
434 ASSERT(args->pag->pagf_freeblks + args->pag->pagf_flcount >=
435 rlen + args->minleft);
/*
 * xfs_alloc_fixup_trees: carve the allocated range [rbno, rlen) out of the
 * containing free extent [fbno, flen) in BOTH freespace btrees.  Depending on
 * where the allocation sits within the free extent, this leaves zero, one or
 * two residual free extents (nfbno1/nflen1 and nfbno2/nflen2) that must be
 * deleted/re-inserted/updated in the by-size (cnt) and by-block (bno) trees.
 * Any disagreement between the two trees is reported as -EFSCORRUPTED.
 * NOTE(review): fragmentary listing — error gotos, closing braces and some
 * assignments appear to be elided; verify against a pristine copy.
 */
440 * Update the two btrees, logically removing from freespace the extent
441 * starting at rbno, rlen blocks. The extent is contained within the
442 * actual (current) free extent fbno for flen blocks.
443 * Flags are passed in indicating whether the cursors are set to the
446 STATIC int /* error code */
447 xfs_alloc_fixup_trees(
448 struct xfs_btree_cur *cnt_cur, /* cursor for by-size btree */
449 struct xfs_btree_cur *bno_cur, /* cursor for by-block btree */
450 xfs_agblock_t fbno, /* starting block of free extent */
451 xfs_extlen_t flen, /* length of free extent */
452 xfs_agblock_t rbno, /* starting block of returned extent */
453 xfs_extlen_t rlen, /* length of returned extent */
454 int flags) /* flags, XFSA_FIXUP_... */
456 int error; /* error code */
457 int i; /* operation results */
458 xfs_agblock_t nfbno1; /* first new free startblock */
459 xfs_agblock_t nfbno2; /* second new free startblock */
460 xfs_extlen_t nflen1=0; /* first new free length */
461 xfs_extlen_t nflen2=0; /* second new free length */
462 struct xfs_mount *mp;
467 * Look up the record in the by-size tree if necessary.
469 if (flags & XFSA_FIXUP_CNT_OK) {
471 if ((error = xfs_alloc_get_rec(cnt_cur, &nfbno1, &nflen1, &i)))
473 if (XFS_IS_CORRUPT(mp,
477 return -EFSCORRUPTED;
480 if ((error = xfs_alloc_lookup_eq(cnt_cur, fbno, flen, &i)))
482 if (XFS_IS_CORRUPT(mp, i != 1))
483 return -EFSCORRUPTED;
486 * Look up the record in the by-block tree if necessary.
488 if (flags & XFSA_FIXUP_BNO_OK) {
490 if ((error = xfs_alloc_get_rec(bno_cur, &nfbno1, &nflen1, &i)))
492 if (XFS_IS_CORRUPT(mp,
496 return -EFSCORRUPTED;
499 if ((error = xfs_alloc_lookup_eq(bno_cur, fbno, flen, &i)))
501 if (XFS_IS_CORRUPT(mp, i != 1))
502 return -EFSCORRUPTED;
506 if (bno_cur->bc_nlevels == 1 && cnt_cur->bc_nlevels == 1) {
507 struct xfs_btree_block *bnoblock;
508 struct xfs_btree_block *cntblock;
510 bnoblock = XFS_BUF_TO_BLOCK(bno_cur->bc_levels[0].bp);
511 cntblock = XFS_BUF_TO_BLOCK(cnt_cur->bc_levels[0].bp);
513 if (XFS_IS_CORRUPT(mp,
514 bnoblock->bb_numrecs !=
515 cntblock->bb_numrecs))
516 return -EFSCORRUPTED;
521 * Deal with all four cases: the allocated record is contained
522 * within the freespace record, so we can have new freespace
523 * at either (or both) end, or no freespace remaining.
525 if (rbno == fbno && rlen == flen)
526 nfbno1 = nfbno2 = NULLAGBLOCK;
527 else if (rbno == fbno) {
528 nfbno1 = rbno + rlen;
529 nflen1 = flen - rlen;
530 nfbno2 = NULLAGBLOCK;
531 } else if (rbno + rlen == fbno + flen) {
533 nflen1 = flen - rlen;
534 nfbno2 = NULLAGBLOCK;
537 nflen1 = rbno - fbno;
538 nfbno2 = rbno + rlen;
539 nflen2 = (fbno + flen) - nfbno2;
542 * Delete the entry from the by-size btree.
544 if ((error = xfs_btree_delete(cnt_cur, &i)))
546 if (XFS_IS_CORRUPT(mp, i != 1))
547 return -EFSCORRUPTED;
549 * Add new by-size btree entry(s).
551 if (nfbno1 != NULLAGBLOCK) {
552 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno1, nflen1, &i)))
554 if (XFS_IS_CORRUPT(mp, i != 0))
555 return -EFSCORRUPTED;
556 if ((error = xfs_btree_insert(cnt_cur, &i)))
558 if (XFS_IS_CORRUPT(mp, i != 1))
559 return -EFSCORRUPTED;
561 if (nfbno2 != NULLAGBLOCK) {
562 if ((error = xfs_alloc_lookup_eq(cnt_cur, nfbno2, nflen2, &i)))
564 if (XFS_IS_CORRUPT(mp, i != 0))
565 return -EFSCORRUPTED;
566 if ((error = xfs_btree_insert(cnt_cur, &i)))
568 if (XFS_IS_CORRUPT(mp, i != 1))
569 return -EFSCORRUPTED;
572 * Fix up the by-block btree entry(s).
574 if (nfbno1 == NULLAGBLOCK) {
576 * No remaining freespace, just delete the by-block tree entry.
578 if ((error = xfs_btree_delete(bno_cur, &i)))
580 if (XFS_IS_CORRUPT(mp, i != 1))
581 return -EFSCORRUPTED;
584 * Update the by-block entry to start later|be shorter.
586 if ((error = xfs_alloc_update(bno_cur, nfbno1, nflen1)))
589 if (nfbno2 != NULLAGBLOCK) {
591 * 2 resulting free entries, need to add one.
593 if ((error = xfs_alloc_lookup_eq(bno_cur, nfbno2, nflen2, &i)))
595 if (XFS_IS_CORRUPT(mp, i != 0))
596 return -EFSCORRUPTED;
597 if ((error = xfs_btree_insert(bno_cur, &i)))
599 if (XFS_IS_CORRUPT(mp, i != 1))
600 return -EFSCORRUPTED;
605 static xfs_failaddr_t
609 struct xfs_mount *mp = bp->b_mount;
610 struct xfs_agfl *agfl = XFS_BUF_TO_AGFL(bp);
611 __be32 *agfl_bno = xfs_buf_to_agfl_bno(bp);
615 * There is no verification of non-crc AGFLs because mkfs does not
616 * initialise the AGFL to zero or NULL. Hence the only valid part of the
617 * AGFL is what the AGF says is active. We can't get to the AGF, so we
618 * can't verify just those entries are valid.
620 if (!xfs_has_crc(mp))
623 if (!xfs_verify_magic(bp, agfl->agfl_magicnum))
624 return __this_address;
625 if (!uuid_equal(&agfl->agfl_uuid, &mp->m_sb.sb_meta_uuid))
626 return __this_address;
628 * during growfs operations, the perag is not fully initialised,
629 * so we can't use it for any useful checking. growfs ensures we can't
630 * use it by using uncached buffers that don't have the perag attached
631 * so we can detect and avoid this problem.
633 if (bp->b_pag && be32_to_cpu(agfl->agfl_seqno) != bp->b_pag->pag_agno)
634 return __this_address;
636 for (i = 0; i < xfs_agfl_size(mp); i++) {
637 if (be32_to_cpu(agfl_bno[i]) != NULLAGBLOCK &&
638 be32_to_cpu(agfl_bno[i]) >= mp->m_sb.sb_agblocks)
639 return __this_address;
642 if (!xfs_log_check_lsn(mp, be64_to_cpu(XFS_BUF_TO_AGFL(bp)->agfl_lsn)))
643 return __this_address;
648 xfs_agfl_read_verify(
651 struct xfs_mount *mp = bp->b_mount;
655 * There is no verification of non-crc AGFLs because mkfs does not
656 * initialise the AGFL to zero or NULL. Hence the only valid part of the
657 * AGFL is what the AGF says is active. We can't get to the AGF, so we
658 * can't verify just those entries are valid.
660 if (!xfs_has_crc(mp))
663 if (!xfs_buf_verify_cksum(bp, XFS_AGFL_CRC_OFF))
664 xfs_verifier_error(bp, -EFSBADCRC, __this_address);
666 fa = xfs_agfl_verify(bp);
668 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
673 xfs_agfl_write_verify(
676 struct xfs_mount *mp = bp->b_mount;
677 struct xfs_buf_log_item *bip = bp->b_log_item;
680 /* no verification of non-crc AGFLs */
681 if (!xfs_has_crc(mp))
684 fa = xfs_agfl_verify(bp);
686 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
691 XFS_BUF_TO_AGFL(bp)->agfl_lsn = cpu_to_be64(bip->bli_item.li_lsn);
693 xfs_buf_update_cksum(bp, XFS_AGFL_CRC_OFF);
696 const struct xfs_buf_ops xfs_agfl_buf_ops = {
698 .magic = { cpu_to_be32(XFS_AGFL_MAGIC), cpu_to_be32(XFS_AGFL_MAGIC) },
699 .verify_read = xfs_agfl_read_verify,
700 .verify_write = xfs_agfl_write_verify,
701 .verify_struct = xfs_agfl_verify,
705 * Read in the allocation group free block array.
709 struct xfs_perag *pag,
710 struct xfs_trans *tp,
711 struct xfs_buf **bpp)
713 struct xfs_mount *mp = pag->pag_mount;
717 error = xfs_trans_read_buf(
718 mp, tp, mp->m_ddev_targp,
719 XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGFL_DADDR(mp)),
720 XFS_FSS_TO_BB(mp, 1), 0, &bp, &xfs_agfl_buf_ops);
723 xfs_buf_set_ref(bp, XFS_AGFL_REF);
729 xfs_alloc_update_counters(
730 struct xfs_trans *tp,
731 struct xfs_buf *agbp,
734 struct xfs_agf *agf = agbp->b_addr;
736 agbp->b_pag->pagf_freeblks += len;
737 be32_add_cpu(&agf->agf_freeblks, len);
739 if (unlikely(be32_to_cpu(agf->agf_freeblks) >
740 be32_to_cpu(agf->agf_length))) {
741 xfs_buf_mark_corrupt(agbp);
742 return -EFSCORRUPTED;
745 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FREEBLKS);
750 * Block allocation algorithm and data structures.
752 struct xfs_alloc_cur {
753 struct xfs_btree_cur *cnt; /* btree cursors */
754 struct xfs_btree_cur *bnolt;
755 struct xfs_btree_cur *bnogt;
756 xfs_extlen_t cur_len;/* current search length */
757 xfs_agblock_t rec_bno;/* extent startblock */
758 xfs_extlen_t rec_len;/* extent length */
759 xfs_agblock_t bno; /* alloc bno */
760 xfs_extlen_t len; /* alloc len */
761 xfs_extlen_t diff; /* diff from search bno */
762 unsigned int busy_gen;/* busy state */
/*
 * xfs_alloc_cur_setup: (re)initialise an extent allocation cursor — reset the
 * tracking fields, create the cntbt cursor and probe it for a maxlen extent,
 * then create the two bnobt (left/right) search cursors.  Returns -ENOSPC if
 * no maxlen extent exists so the caller can fall back to a small allocation.
 * NOTE(review): fragmentary listing — the field resets and error checks
 * between the visible lines appear to be elided.
 */
767 * Set up cursors, etc. in the extent allocation cursor. This function can be
768 * called multiple times to reset an initialized structure without having to
769 * reallocate cursors.
773 struct xfs_alloc_arg *args,
774 struct xfs_alloc_cur *acur)
779 ASSERT(args->alignment == 1 || args->type != XFS_ALLOCTYPE_THIS_BNO);
781 acur->cur_len = args->maxlen;
791 * Perform an initial cntbt lookup to check for availability of maxlen
792 * extents. If this fails, we'll return -ENOSPC to signal the caller to
793 * attempt a small allocation.
796 acur->cnt = xfs_allocbt_init_cursor(args->mp, args->tp,
797 args->agbp, args->pag, XFS_BTNUM_CNT);
798 error = xfs_alloc_lookup_ge(acur->cnt, 0, args->maxlen, &i);
803 * Allocate the bnobt left and right search cursors.
806 acur->bnolt = xfs_allocbt_init_cursor(args->mp, args->tp,
807 args->agbp, args->pag, XFS_BTNUM_BNO);
809 acur->bnogt = xfs_allocbt_init_cursor(args->mp, args->tp,
810 args->agbp, args->pag, XFS_BTNUM_BNO);
811 return i == 1 ? 0 : -ENOSPC;
816 struct xfs_alloc_cur *acur,
819 int cur_error = XFS_BTREE_NOERROR;
822 cur_error = XFS_BTREE_ERROR;
825 xfs_btree_del_cursor(acur->cnt, cur_error);
827 xfs_btree_del_cursor(acur->bnolt, cur_error);
829 xfs_btree_del_cursor(acur->bnogt, cur_error);
830 acur->cnt = acur->bnolt = acur->bnogt = NULL;
/*
 * xfs_alloc_cur_check: evaluate the extent under the given btree cursor as an
 * allocation candidate — apply minlen/locality/alignment filters, compare its
 * length and locality diff against the best candidate recorded in acur, and
 * deactivate the cursor when it has walked out of the useful range.
 * NOTE(review): fragmentary listing — declarations (error, i, busy, new),
 * the candidate-recording assignments and the gotos between the visible lines
 * appear to be elided.
 */
834 * Check an extent for allocation and track the best available candidate in the
835 * allocation structure. The cursor is deactivated if it has entered an out of
836 * range state based on allocation arguments. Optionally return the extent
837 * extent geometry and allocation status if requested by the caller.
841 struct xfs_alloc_arg *args,
842 struct xfs_alloc_cur *acur,
843 struct xfs_btree_cur *cur,
847 xfs_agblock_t bno, bnoa, bnew;
848 xfs_extlen_t len, lena, diff = -1;
850 unsigned busy_gen = 0;
851 bool deactivate = false;
852 bool isbnobt = cur->bc_btnum == XFS_BTNUM_BNO;
856 error = xfs_alloc_get_rec(cur, &bno, &len, &i);
859 if (XFS_IS_CORRUPT(args->mp, i != 1))
860 return -EFSCORRUPTED;
863 * Check minlen and deactivate a cntbt cursor if out of acceptable size
864 * range (i.e., walking backwards looking for a minlen extent).
866 if (len < args->minlen) {
867 deactivate = !isbnobt;
871 busy = xfs_alloc_compute_aligned(args, bno, len, &bnoa, &lena,
875 acur->busy_gen = busy_gen;
876 /* deactivate a bnobt cursor outside of locality range */
877 if (bnoa < args->min_agbno || bnoa > args->max_agbno) {
878 deactivate = isbnobt;
881 if (lena < args->minlen)
884 args->len = XFS_EXTLEN_MIN(lena, args->maxlen);
885 xfs_alloc_fix_len(args);
886 ASSERT(args->len >= args->minlen);
887 if (args->len < acur->len)
891 * We have an aligned record that satisfies minlen and beats or matches
892 * the candidate extent size. Compare locality for near allocation mode.
894 ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO);
895 diff = xfs_alloc_compute_diff(args->agbno, args->len,
896 args->alignment, args->datatype,
898 if (bnew == NULLAGBLOCK)
902 * Deactivate a bnobt cursor with worse locality than the current best.
904 if (diff > acur->diff) {
905 deactivate = isbnobt;
909 ASSERT(args->len > acur->len ||
910 (args->len == acur->len && diff <= acur->diff));
914 acur->len = args->len;
919 * We're done if we found a perfect allocation. This only deactivates
920 * the current cursor, but this is just an optimization to terminate a
921 * cntbt search that otherwise runs to the edge of the tree.
923 if (acur->diff == 0 && acur->len == args->maxlen)
927 cur->bc_ag.abt.active = false;
928 trace_xfs_alloc_cur_check(args->mp, cur->bc_btnum, bno, len, diff,
934 * Complete an allocation of a candidate extent. Remove the extent from both
935 * trees and update the args structure.
938 xfs_alloc_cur_finish(
939 struct xfs_alloc_arg *args,
940 struct xfs_alloc_cur *acur)
942 struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
945 ASSERT(acur->cnt && acur->bnolt);
946 ASSERT(acur->bno >= acur->rec_bno);
947 ASSERT(acur->bno + acur->len <= acur->rec_bno + acur->rec_len);
948 ASSERT(acur->rec_bno + acur->rec_len <= be32_to_cpu(agf->agf_length));
950 error = xfs_alloc_fixup_trees(acur->cnt, acur->bnolt, acur->rec_bno,
951 acur->rec_len, acur->bno, acur->len, 0);
955 args->agbno = acur->bno;
956 args->len = acur->len;
959 trace_xfs_alloc_cur(args);
/*
 * xfs_alloc_cntbt_iter: one step of the locality-optimised cntbt search — do
 * a >= [agbno, cur_len] lookup, check the record (and possibly the previous
 * one when the lookup landed past agbno), then grow the search key for the
 * next iteration.
 * NOTE(review): fragmentary listing — declarations (error, i, bno), error
 * checks and the cur_len increment/doubling lines appear to be elided.
 */
964 * Locality allocation lookup algorithm. This expects a cntbt cursor and uses
965 * bno optimized lookup to search for extents with ideal size and locality.
968 xfs_alloc_cntbt_iter(
969 struct xfs_alloc_arg *args,
970 struct xfs_alloc_cur *acur)
972 struct xfs_btree_cur *cur = acur->cnt;
974 xfs_extlen_t len, cur_len;
978 if (!xfs_alloc_cur_active(cur))
981 /* locality optimized lookup */
982 cur_len = acur->cur_len;
983 error = xfs_alloc_lookup_ge(cur, args->agbno, cur_len, &i);
988 error = xfs_alloc_get_rec(cur, &bno, &len, &i);
992 /* check the current record and update search length from it */
993 error = xfs_alloc_cur_check(args, acur, cur, &i);
996 ASSERT(len >= acur->cur_len);
1000 * We looked up the first record >= [agbno, len] above. The agbno is a
1001 * secondary key and so the current record may lie just before or after
1002 * agbno. If it is past agbno, check the previous record too so long as
1003 * the length matches as it may be closer. Don't check a smaller record
1004 * because that could deactivate our cursor.
1006 if (bno > args->agbno) {
1007 error = xfs_btree_decrement(cur, 0, &i);
1009 error = xfs_alloc_get_rec(cur, &bno, &len, &i);
1010 if (!error && i && len == acur->cur_len)
1011 error = xfs_alloc_cur_check(args, acur, cur,
1019 * Increment the search key until we find at least one allocation
1020 * candidate or if the extent we found was larger. Otherwise, double the
1021 * search key to optimize the search. Efficiency is more important here
1022 * than absolute best locality.
1025 if (!acur->len || acur->cur_len >= cur_len)
1028 acur->cur_len = cur_len;
/*
 * xfs_alloc_ag_vextent_small: last-resort allocation path — either hand back
 * the largest remaining cntbt record, or pull a single block off the AGFL
 * (invalidating any stale buffer for user data and clearing the OWN_AG rmap),
 * respecting minleft and the AGFL reservation.  *stat distinguishes a
 * freelist grab (0) from a normal/none result (1).
 * NOTE(review): fragmentary listing — declarations, gotos, labels and several
 * closing braces appear to be elided; verify against a pristine copy.
 */
1034 * Deal with the case where only small freespaces remain. Either return the
1035 * contents of the last freespace record, or allocate space from the freelist if
1036 * there is nothing in the tree.
1038 STATIC int /* error */
1039 xfs_alloc_ag_vextent_small(
1040 struct xfs_alloc_arg *args, /* allocation argument structure */
1041 struct xfs_btree_cur *ccur, /* optional by-size cursor */
1042 xfs_agblock_t *fbnop, /* result block number */
1043 xfs_extlen_t *flenp, /* result length */
1044 int *stat) /* status: 0-freelist, 1-normal/none */
1046 struct xfs_agf *agf = args->agbp->b_addr;
1048 xfs_agblock_t fbno = NULLAGBLOCK;
1049 xfs_extlen_t flen = 0;
1053 * If a cntbt cursor is provided, try to allocate the largest record in
1054 * the tree. Try the AGFL if the cntbt is empty, otherwise fail the
1055 * allocation. Make sure to respect minleft even when pulling from the
1059 error = xfs_btree_decrement(ccur, 0, &i);
1063 error = xfs_alloc_get_rec(ccur, &fbno, &flen, &i);
1066 if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1067 error = -EFSCORRUPTED;
1073 if (args->minlen != 1 || args->alignment != 1 ||
1074 args->resv == XFS_AG_RESV_AGFL ||
1075 be32_to_cpu(agf->agf_flcount) <= args->minleft)
1078 error = xfs_alloc_get_freelist(args->pag, args->tp, args->agbp,
1082 if (fbno == NULLAGBLOCK)
1085 xfs_extent_busy_reuse(args->mp, args->pag, fbno, 1,
1086 (args->datatype & XFS_ALLOC_NOBUSY));
1088 if (args->datatype & XFS_ALLOC_USERDATA) {
1091 error = xfs_trans_get_buf(args->tp, args->mp->m_ddev_targp,
1092 XFS_AGB_TO_DADDR(args->mp, args->agno, fbno),
1093 args->mp->m_bsize, 0, &bp);
1096 xfs_trans_binval(args->tp, bp);
1098 *fbnop = args->agbno = fbno;
1099 *flenp = args->len = 1;
1100 if (XFS_IS_CORRUPT(args->mp, fbno >= be32_to_cpu(agf->agf_length))) {
1101 error = -EFSCORRUPTED;
1104 args->wasfromfl = 1;
1105 trace_xfs_alloc_small_freelist(args);
1108 * If we're feeding an AGFL block to something that doesn't live in the
1109 * free space, we need to clear out the OWN_AG rmap.
1111 error = xfs_rmap_free(args->tp, args->agbp, args->pag, fbno, 1,
1112 &XFS_RMAP_OINFO_AG);
1121 * Can't do the allocation, give up.
1123 if (flen < args->minlen) {
1124 args->agbno = NULLAGBLOCK;
1125 trace_xfs_alloc_small_notenough(args);
1131 trace_xfs_alloc_small_done(args);
1135 trace_xfs_alloc_small_error(args);
/*
 * xfs_alloc_ag_vextent: top-level per-AG allocator — dispatch to the
 * size/near/exact strategy based on args->type, then perform the common
 * post-allocation work: rmap insertion, AGF free-count update, busy-extent
 * sanity assert, per-AG reservation accounting and statistics.
 * NOTE(review): fragmentary listing — break statements, default case, error
 * returns and closing braces appear to be elided.
 */
1140 * Allocate a variable extent in the allocation group agno.
1141 * Type and bno are used to determine where in the allocation group the
1142 * extent will start.
1143 * Extent's length (returned in *len) will be between minlen and maxlen,
1144 * and of the form k * prod + mod unless there's nothing that large.
1145 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1147 STATIC int /* error */
1148 xfs_alloc_ag_vextent(
1149 xfs_alloc_arg_t *args) /* argument structure for allocation */
1153 ASSERT(args->minlen > 0);
1154 ASSERT(args->maxlen > 0);
1155 ASSERT(args->minlen <= args->maxlen);
1156 ASSERT(args->mod < args->prod);
1157 ASSERT(args->alignment > 0);
1160 * Branch to correct routine based on the type.
1162 args->wasfromfl = 0;
1163 switch (args->type) {
1164 case XFS_ALLOCTYPE_THIS_AG:
1165 error = xfs_alloc_ag_vextent_size(args);
1167 case XFS_ALLOCTYPE_NEAR_BNO:
1168 error = xfs_alloc_ag_vextent_near(args);
1170 case XFS_ALLOCTYPE_THIS_BNO:
1171 error = xfs_alloc_ag_vextent_exact(args);
1178 if (error || args->agbno == NULLAGBLOCK)
1181 ASSERT(args->len >= args->minlen);
1182 ASSERT(args->len <= args->maxlen);
1183 ASSERT(!args->wasfromfl || args->resv != XFS_AG_RESV_AGFL);
1184 ASSERT(args->agbno % args->alignment == 0);
1186 /* if not file data, insert new block into the reverse map btree */
1187 if (!xfs_rmap_should_skip_owner_update(&args->oinfo)) {
1188 error = xfs_rmap_alloc(args->tp, args->agbp, args->pag,
1189 args->agbno, args->len, &args->oinfo);
1194 if (!args->wasfromfl) {
1195 error = xfs_alloc_update_counters(args->tp, args->agbp,
1196 -((long)(args->len)));
1200 ASSERT(!xfs_extent_busy_search(args->mp, args->pag,
1201 args->agbno, args->len));
1204 xfs_ag_resv_alloc_extent(args->pag, args->resv, args);
1206 XFS_STATS_INC(args->mp, xs_allocx);
1207 XFS_STATS_ADD(args->mp, xs_allocb, args->len);
/*
 * xfs_alloc_ag_vextent_exact: allocate precisely at args->agbno — find the
 * bnobt record covering that block, trim against busy extents, fix up the
 * length for mod/prod, and update both freespace btrees; fail with agbno set
 * to NULLAGBLOCK when the exact block is unavailable.
 * NOTE(review): fragmentary listing — gotos, labels (not_found/error), error
 * checks and several assignments appear to be elided.
 */
1212 * Allocate a variable extent at exactly agno/bno.
1213 * Extent's length (returned in *len) will be between minlen and maxlen,
1214 * and of the form k * prod + mod unless there's nothing that large.
1215 * Return the starting a.g. block (bno), or NULLAGBLOCK if we can't do it.
1217 STATIC int /* error */
1218 xfs_alloc_ag_vextent_exact(
1219 xfs_alloc_arg_t *args) /* allocation argument structure */
1221 struct xfs_agf __maybe_unused *agf = args->agbp->b_addr;
1222 struct xfs_btree_cur *bno_cur;/* by block-number btree cursor */
1223 struct xfs_btree_cur *cnt_cur;/* by count btree cursor */
1225 xfs_agblock_t fbno; /* start block of found extent */
1226 xfs_extlen_t flen; /* length of found extent */
1227 xfs_agblock_t tbno; /* start block of busy extent */
1228 xfs_extlen_t tlen; /* length of busy extent */
1229 xfs_agblock_t tend; /* end block of busy extent */
1230 int i; /* success/failure of operation */
1233 ASSERT(args->alignment == 1);
1236 * Allocate/initialize a cursor for the by-number freespace btree.
1238 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1239 args->pag, XFS_BTNUM_BNO);
1242 * Lookup bno and minlen in the btree (minlen is irrelevant, really).
1243 * Look for the closest free block <= bno, it must contain bno
1244 * if any free block does.
1246 error = xfs_alloc_lookup_le(bno_cur, args->agbno, args->minlen, &i);
1253 * Grab the freespace record.
1255 error = xfs_alloc_get_rec(bno_cur, &fbno, &flen, &i);
1258 if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1259 error = -EFSCORRUPTED;
1262 ASSERT(fbno <= args->agbno);
1265 * Check for overlapping busy extents.
1269 xfs_extent_busy_trim(args, &tbno, &tlen, &busy_gen);
1272 * Give up if the start of the extent is busy, or the freespace isn't
1273 * long enough for the minimum request.
1275 if (tbno > args->agbno)
1277 if (tlen < args->minlen)
1280 if (tend < args->agbno + args->minlen)
1284 * End of extent will be smaller of the freespace end and the
1285 * maximal requested end.
1287 * Fix the length according to mod and prod if given.
1289 args->len = XFS_AGBLOCK_MIN(tend, args->agbno + args->maxlen)
1291 xfs_alloc_fix_len(args);
1292 ASSERT(args->agbno + args->len <= tend);
1295 * We are allocating agbno for args->len
1296 * Allocate/initialize a cursor for the by-size btree.
1298 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1299 args->pag, XFS_BTNUM_CNT);
1300 ASSERT(args->agbno + args->len <= be32_to_cpu(agf->agf_length));
1301 error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen, args->agbno,
1302 args->len, XFSA_FIXUP_BNO_OK);
1304 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1308 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1309 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1311 args->wasfromfl = 0;
1312 trace_xfs_alloc_exact_done(args);
1316 /* Didn't find it, return null. */
1317 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1318 args->agbno = NULLAGBLOCK;
1319 trace_xfs_alloc_exact_notfound(args);
1323 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1324 trace_xfs_alloc_exact_error(args);
/*
 * xfs_alloc_walk_iter() -- step a btree cursor through up to @count records
 * (count == -1 means unbounded), checking each record against the current
 * best candidate via xfs_alloc_cur_check().  @find_one stops the walk at the
 * first acceptable candidate.  The cursor is marked inactive once it walks
 * off the edge of the tree or out of the useful range.
 * NOTE(review): extract is elided (gaps in the embedded original line
 * numbers); the increment/decrement direction flag and the tail of the body
 * are not visible here.
 */
1329 * Search a given number of btree records in a given direction. Check each
1330 * record against the good extent we've already found.
1333 xfs_alloc_walk_iter(
1334 struct xfs_alloc_arg *args,
1335 struct xfs_alloc_cur *acur,
1336 struct xfs_btree_cur *cur,
1338 bool find_one, /* quit on first candidate */
1339 int count, /* rec count (-1 for infinite) */
1348 * Search so long as the cursor is active or we find a better extent.
1349 * The cursor is deactivated if it extends beyond the range of the
1350 * current allocation candidate.
1352 while (xfs_alloc_cur_active(cur) && count) {
1353 error = xfs_alloc_cur_check(args, acur, cur, &i);
1361 if (!xfs_alloc_cur_active(cur))
/* Walk forwards or backwards depending on the requested direction. */
1365 error = xfs_btree_increment(cur, 0, &i);
1367 error = xfs_btree_decrement(cur, 0, &i);
/* Ran off the end of the tree: deactivate this cursor. */
1371 cur->bc_ag.abt.active = false;
/*
 * xfs_alloc_ag_vextent_locality() -- NEAR-mode search combining three
 * cursors: bnobt-left (records <= agbno), bnobt-right (records > agbno) and
 * a cntbt cursor keyed by extent size.  A bnobt hit is best-case locality
 * and only needs a finishing sweep in the opposite direction; otherwise the
 * cntbt bounds the search.  If nothing is found and no busy extents were
 * seen, it falls back to the last (largest) cntbt record.
 */
1381 * Search the by-bno and by-size btrees in parallel in search of an extent with
1382 * ideal locality based on the NEAR mode ->agbno locality hint.
1385 xfs_alloc_ag_vextent_locality(
1386 struct xfs_alloc_arg *args,
1387 struct xfs_alloc_cur *acur,
1390 struct xfs_btree_cur *fbcur = NULL;
1395 ASSERT(acur->len == 0);
1396 ASSERT(args->type == XFS_ALLOCTYPE_NEAR_BNO);
/* Position all three cursors around the locality hint before walking. */
1400 error = xfs_alloc_lookup_ge(acur->cnt, args->agbno, acur->cur_len, &i);
1403 error = xfs_alloc_lookup_le(acur->bnolt, args->agbno, 0, &i);
1406 error = xfs_alloc_lookup_ge(acur->bnogt, args->agbno, 0, &i);
1411 * Search the bnobt and cntbt in parallel. Search the bnobt left and
1412 * right and lookup the closest extent to the locality hint for each
1413 * extent size key in the cntbt. The entire search terminates
1414 * immediately on a bnobt hit because that means we've found best case
1415 * locality. Otherwise the search continues until the cntbt cursor runs
1416 * off the end of the tree. If no allocation candidate is found at this
1417 * point, give up on locality, walk backwards from the end of the cntbt
1418 * and take the first available extent.
1420 * The parallel tree searches balance each other out to provide fairly
1421 * consistent performance for various situations. The bnobt search can
1422 * have pathological behavior in the worst case scenario of larger
1423 * allocation requests and fragmented free space. On the other hand, the
1424 * bnobt is able to satisfy most smaller allocation requests much more
1425 * quickly than the cntbt. The cntbt search can sift through fragmented
1426 * free space and sets of free extents for larger allocation requests
1427 * more quickly than the bnobt. Since the locality hint is just a hint
1428 * and we don't want to scan the entire bnobt for perfect locality, the
1429 * cntbt search essentially bounds the bnobt search such that we can
1430 * find good enough locality at reasonable performance in most cases.
1432 while (xfs_alloc_cur_active(acur->bnolt) ||
1433 xfs_alloc_cur_active(acur->bnogt) ||
1434 xfs_alloc_cur_active(acur->cnt)) {
1436 trace_xfs_alloc_cur_lookup(args);
1439 * Search the bnobt left and right. In the case of a hit, finish
1440 * the search in the opposite direction and we're done.
1442 error = xfs_alloc_walk_iter(args, acur, acur->bnolt, false,
1447 trace_xfs_alloc_cur_left(args);
/* Left-side hit: finish by sweeping rightwards from the hint. */
1448 fbcur = acur->bnogt;
1452 error = xfs_alloc_walk_iter(args, acur, acur->bnogt, true, true,
1457 trace_xfs_alloc_cur_right(args);
/* Right-side hit: finish by sweeping leftwards from the hint. */
1458 fbcur = acur->bnolt;
1464 * Check the extent with best locality based on the current
1465 * extent size search key and keep track of the best candidate.
1467 error = xfs_alloc_cntbt_iter(args, acur);
1470 if (!xfs_alloc_cur_active(acur->cnt)) {
1471 trace_xfs_alloc_cur_lookup_done(args);
1477 * If we failed to find anything due to busy extents, return empty
1478 * handed so the caller can flush and retry. If no busy extents were
1479 * found, walk backwards from the end of the cntbt as a last resort.
1481 if (!xfs_alloc_cur_active(acur->cnt) && !acur->len && !acur->busy) {
1482 error = xfs_btree_decrement(acur->cnt, 0, &i);
/* Re-arm the cntbt cursor for the backwards last-resort walk. */
1486 acur->cnt->bc_ag.abt.active = true;
1493 * Search in the opposite direction for a better entry in the case of
1494 * a bnobt hit or walk backwards from the end of the cntbt.
1497 error = xfs_alloc_walk_iter(args, acur, fbcur, fbinc, true, -1,
/*
 * xfs_alloc_ag_vextent_lastblock() -- fast path for large NEAR allocations:
 * when the cntbt cursor already sits in the last leaf block, scan just that
 * block for the best candidate instead of running the full parallel search.
 * NOTE(review): the prandom_u32() branch is presumably under an elided
 * #ifdef DEBUG (randomly exercising the fallback algorithm) -- confirm
 * against the full file before relying on it in production flow.
 */
1509 /* Check the last block of the cnt btree for allocations. */
1511 xfs_alloc_ag_vextent_lastblock(
1512 struct xfs_alloc_arg *args,
1513 struct xfs_alloc_cur *acur,
1522 /* Randomly don't execute the first algorithm. */
1523 if (prandom_u32() & 1)
1528 * Start from the entry that lookup found, sequence through all larger
1529 * free blocks. If we're actually pointing at a record smaller than
1530 * maxlen, go to the start of this block, and skip all those smaller
1533 if (*len || args->alignment > 1) {
/* Rewind to the first record of the current (last) leaf block. */
1534 acur->cnt->bc_levels[0].ptr = 1;
1536 error = xfs_alloc_get_rec(acur->cnt, bno, len, &i);
1539 if (XFS_IS_CORRUPT(args->mp, i != 1))
1540 return -EFSCORRUPTED;
1541 if (*len >= args->minlen)
1543 error = xfs_btree_increment(acur->cnt, 0, &i);
1547 ASSERT(*len >= args->minlen);
/* Walk the remaining records in this block, keeping the best candidate. */
1552 error = xfs_alloc_walk_iter(args, acur, acur->cnt, true, false, -1, &i);
1557 * It didn't work. We COULD be in a case where there's a good record
1558 * somewhere, so try again.
1563 trace_xfs_alloc_near_first(args);
/*
 * xfs_alloc_ag_vextent_near() -- top-level NEAR-mode allocator.  Clamps the
 * agbno hint into [min_agbno, max_agbno], sets up the cursor bundle, tries
 * the last-cntbt-block fast path, then the combined locality search, and
 * finally either fixes up the btrees for the winner or flushes busy extents
 * and retries.  Returns with args->agbno = NULLAGBLOCK when nothing fits.
 */
1569 * Allocate a variable extent near bno in the allocation group agno.
1570 * Extent's length (returned in len) will be between minlen and maxlen,
1571 * and of the form k * prod + mod unless there's nothing that large.
1572 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1575 xfs_alloc_ag_vextent_near(
1576 struct xfs_alloc_arg *args)
1578 struct xfs_alloc_cur acur = {};
1579 int error; /* error code */
1580 int i; /* result code, temporary */
1584 /* handle uninitialized agbno range so caller doesn't have to */
1585 if (!args->min_agbno && !args->max_agbno)
1586 args->max_agbno = args->mp->m_sb.sb_agblocks - 1;
1587 ASSERT(args->min_agbno <= args->max_agbno);
1589 /* clamp agbno to the range if it's outside */
1590 if (args->agbno < args->min_agbno)
1591 args->agbno = args->min_agbno;
1592 if (args->agbno > args->max_agbno)
1593 args->agbno = args->max_agbno;
1599 * Set up cursors and see if there are any free extents as big as
1600 * maxlen. If not, pick the last entry in the tree unless the tree is
1603 error = xfs_alloc_cur_setup(args, &acur);
1604 if (error == -ENOSPC) {
/* Nothing maxlen-sized: fall back to the small-extent/AGFL path. */
1605 error = xfs_alloc_ag_vextent_small(args, acur.cnt, &bno,
1609 if (i == 0 || len == 0) {
1610 trace_xfs_alloc_near_noentry(args);
1620 * If the requested extent is large wrt the freespaces available
1621 * in this a.g., then the cursor will be pointing to a btree entry
1622 * near the right edge of the tree. If it's in the last btree leaf
1623 * block, then we just examine all the entries in that block
1624 * that are big enough, and pick the best one.
1626 if (xfs_btree_islastblock(acur.cnt, 0)) {
1627 bool allocated = false;
1629 error = xfs_alloc_ag_vextent_lastblock(args, &acur, &bno, &len,
1638 * Second algorithm. Combined cntbt and bnobt search to find ideal
1641 error = xfs_alloc_ag_vextent_locality(args, &acur, &i);
1646 * If we couldn't get anything, give up.
/* Busy extents blocked us: flush the log and let the caller retry. */
1650 trace_xfs_alloc_near_busy(args);
1651 xfs_extent_busy_flush(args->mp, args->pag,
1655 trace_xfs_alloc_size_neither(args);
1656 args->agbno = NULLAGBLOCK;
1661 /* fix up btrees on a successful allocation */
1662 error = xfs_alloc_cur_finish(args, &acur);
1665 xfs_alloc_cur_close(&acur, error);
/*
 * xfs_alloc_ag_vextent_size() -- size-mode allocator: find the smallest
 * free extent that satisfies maxlen (after alignment), preferring a cntbt
 * record >= maxlen+alignment-1.  Falls back to smaller/busy-trimmed extents,
 * searching left in the cntbt for the best aligned fit when no record hits
 * maxlen.  Flushes busy extents and bails when only busy space remains.
 * NOTE(review): extract is elided -- the embedded original line numbers have
 * gaps where error checks, gotos and braces were dropped.
 */
1670 * Allocate a variable extent anywhere in the allocation group agno.
1671 * Extent's length (returned in len) will be between minlen and maxlen,
1672 * and of the form k * prod + mod unless there's nothing that large.
1673 * Return the starting a.g. block, or NULLAGBLOCK if we can't do it.
1675 STATIC int /* error */
1676 xfs_alloc_ag_vextent_size(
1677 xfs_alloc_arg_t *args) /* allocation argument structure */
1679 struct xfs_agf *agf = args->agbp->b_addr;
1680 struct xfs_btree_cur *bno_cur; /* cursor for bno btree */
1681 struct xfs_btree_cur *cnt_cur; /* cursor for cnt btree */
1682 int error; /* error result */
1683 xfs_agblock_t fbno; /* start of found freespace */
1684 xfs_extlen_t flen; /* length of found freespace */
1685 int i; /* temp status variable */
1686 xfs_agblock_t rbno; /* returned block number */
1687 xfs_extlen_t rlen; /* length of returned extent */
1693 * Allocate and initialize a cursor for the by-size btree.
1695 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1696 args->pag, XFS_BTNUM_CNT);
1700 * Look for an entry >= maxlen+alignment-1 blocks.
1702 if ((error = xfs_alloc_lookup_ge(cnt_cur, 0,
1703 args->maxlen + args->alignment - 1, &i)))
1707 * If none then we have to settle for a smaller extent. In the case that
1708 * there are no large extents, this will return the last entry in the
1709 * tree unless the tree is empty. In the case that there are only busy
1710 * large extents, this will return the largest small extent unless there
1711 * are no smaller extents available.
1714 error = xfs_alloc_ag_vextent_small(args, cnt_cur,
1718 if (i == 0 || flen == 0) {
1719 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1720 trace_xfs_alloc_size_noentry(args);
/* Trim for alignment and busy extents; rbno/rlen is the usable part. */
1724 busy = xfs_alloc_compute_aligned(args, fbno, flen, &rbno,
1728 * Search for a non-busy extent that is large enough.
1731 error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, &i);
1734 if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1735 error = -EFSCORRUPTED;
1739 busy = xfs_alloc_compute_aligned(args, fbno, flen,
1740 &rbno, &rlen, &busy_gen);
1742 if (rlen >= args->maxlen)
1745 error = xfs_btree_increment(cnt_cur, 0, &i);
1750 * Our only valid extents must have been busy.
1751 * Make it unbusy by forcing the log out and
1754 xfs_btree_del_cursor(cnt_cur,
1756 trace_xfs_alloc_size_busy(args);
1757 xfs_extent_busy_flush(args->mp,
1758 args->pag, busy_gen);
1765 * In the first case above, we got the last entry in the
1766 * by-size btree. Now we check to see if the space hits maxlen
1767 * once aligned; if not, we search left for something better.
1768 * This can't happen in the second case above.
1770 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1771 if (XFS_IS_CORRUPT(args->mp,
1774 rbno + rlen > fbno + flen))) {
1775 error = -EFSCORRUPTED;
1778 if (rlen < args->maxlen) {
1779 xfs_agblock_t bestfbno;
1780 xfs_extlen_t bestflen;
1781 xfs_agblock_t bestrbno;
1782 xfs_extlen_t bestrlen;
/* Walk left through smaller records, tracking the best aligned fit. */
1789 if ((error = xfs_btree_decrement(cnt_cur, 0, &i)))
1793 if ((error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen,
1796 if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1797 error = -EFSCORRUPTED;
/* Records only get shorter leftwards; stop once they can't beat best. */
1800 if (flen < bestrlen)
1802 busy = xfs_alloc_compute_aligned(args, fbno, flen,
1803 &rbno, &rlen, &busy_gen);
1804 rlen = XFS_EXTLEN_MIN(args->maxlen, rlen);
1805 if (XFS_IS_CORRUPT(args->mp,
1808 rbno + rlen > fbno + flen))) {
1809 error = -EFSCORRUPTED;
1812 if (rlen > bestrlen) {
1817 if (rlen == args->maxlen)
/* Reposition the cursor on the winning record before fixup. */
1821 if ((error = xfs_alloc_lookup_eq(cnt_cur, bestfbno, bestflen,
1824 if (XFS_IS_CORRUPT(args->mp, i != 1)) {
1825 error = -EFSCORRUPTED;
1833 args->wasfromfl = 0;
1835 * Fix up the length.
1838 if (rlen < args->minlen) {
1840 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1841 trace_xfs_alloc_size_busy(args);
1842 xfs_extent_busy_flush(args->mp, args->pag, busy_gen);
1847 xfs_alloc_fix_len(args);
1850 if (XFS_IS_CORRUPT(args->mp, rlen > flen)) {
1851 error = -EFSCORRUPTED;
1855 * Allocate and initialize a cursor for the by-block tree.
1857 bno_cur = xfs_allocbt_init_cursor(args->mp, args->tp, args->agbp,
1858 args->pag, XFS_BTNUM_BNO);
1859 if ((error = xfs_alloc_fixup_trees(cnt_cur, bno_cur, fbno, flen,
1860 rbno, rlen, XFSA_FIXUP_CNT_OK)))
1862 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1863 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
1864 cnt_cur = bno_cur = NULL;
1867 if (XFS_IS_CORRUPT(args->mp,
1868 args->agbno + args->len >
1869 be32_to_cpu(agf->agf_length))) {
1870 error = -EFSCORRUPTED;
1873 trace_xfs_alloc_size_done(args);
/* Error path: release any live cursors without committing. */
1877 trace_xfs_alloc_size_error(args);
1879 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
1881 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
1885 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
1886 trace_xfs_alloc_size_nominleft(args);
1887 args->agbno = NULLAGBLOCK;
/*
 * xfs_free_ag_extent() -- return the extent [bno, bno+len) in AG @agno to
 * the free-space btrees.  Updates the rmapbt owner record (unless @oinfo
 * says to skip it), coalesces with contiguous left/right free neighbors in
 * the by-bno tree, re-inserts the merged record into the by-size tree, and
 * finally updates the AG/superblock free-block counters and per-AG
 * reservation accounting for @type.  Overlap with an existing free record
 * is reported as -EFSCORRUPTED (the caller tried to free space that is
 * already partly free).
 *
 * FIX(review): two lines of this extract were mangled by HTML-entity
 * stripping -- "&ltbno, &ltlen" had its "&lt" eaten leaving "<bno, <len",
 * and "&gtbno, &gtlen" lost "&gt" leaving ">bno, >len".  Restored to the
 * declared locals ltbno/ltlen and gtbno/gtlen.
 * NOTE(review): the integer starting each line is the original file's line
 * number; gaps mark elided lines missing from this extract.
 */
1892 * Free the extent starting at agno/bno for length.
1896 struct xfs_trans *tp,
1897 struct xfs_buf *agbp,
1898 xfs_agnumber_t agno,
1901 const struct xfs_owner_info *oinfo,
1902 enum xfs_ag_resv_type type)
1904 struct xfs_mount *mp;
1905 struct xfs_btree_cur *bno_cur;
1906 struct xfs_btree_cur *cnt_cur;
1907 xfs_agblock_t gtbno; /* start of right neighbor */
1908 xfs_extlen_t gtlen; /* length of right neighbor */
1909 xfs_agblock_t ltbno; /* start of left neighbor */
1910 xfs_extlen_t ltlen; /* length of left neighbor */
1911 xfs_agblock_t nbno; /* new starting block of freesp */
1912 xfs_extlen_t nlen; /* new length of freespace */
1913 int haveleft; /* have a left neighbor */
1914 int haveright; /* have a right neighbor */
1917 struct xfs_perag *pag = agbp->b_pag;
1919 bno_cur = cnt_cur = NULL;
/* Drop the rmap record first unless the owner info says to skip it. */
1922 if (!xfs_rmap_should_skip_owner_update(oinfo)) {
1923 error = xfs_rmap_free(tp, agbp, pag, bno, len, oinfo);
1929 * Allocate and initialize a cursor for the by-block btree.
1931 bno_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_BNO);
1933 * Look for a neighboring block on the left (lower block numbers)
1934 * that is contiguous with this space.
1936 if ((error = xfs_alloc_lookup_le(bno_cur, bno, len, &haveleft)))
1940 * There is a block to our left.
1942 if ((error = xfs_alloc_get_rec(bno_cur, &ltbno, &ltlen, &i)))
1944 if (XFS_IS_CORRUPT(mp, i != 1)) {
1945 error = -EFSCORRUPTED;
1949 * It's not contiguous, though.
1951 if (ltbno + ltlen < bno)
1955 * If this failure happens the request to free this
1956 * space was invalid, it's (partly) already free.
1959 if (XFS_IS_CORRUPT(mp, ltbno + ltlen > bno)) {
1960 error = -EFSCORRUPTED;
1966 * Look for a neighboring block on the right (higher block numbers)
1967 * that is contiguous with this space.
1969 if ((error = xfs_btree_increment(bno_cur, 0, &haveright)))
1973 * There is a block to our right.
1975 if ((error = xfs_alloc_get_rec(bno_cur, &gtbno, &gtlen, &i)))
1977 if (XFS_IS_CORRUPT(mp, i != 1)) {
1978 error = -EFSCORRUPTED;
1982 * It's not contiguous, though.
1984 if (bno + len < gtbno)
1988 * If this failure happens the request to free this
1989 * space was invalid, it's (partly) already free.
1992 if (XFS_IS_CORRUPT(mp, bno + len > gtbno)) {
1993 error = -EFSCORRUPTED;
1999 * Now allocate and initialize a cursor for the by-size tree.
2001 cnt_cur = xfs_allocbt_init_cursor(mp, tp, agbp, pag, XFS_BTNUM_CNT);
2003 * Have both left and right contiguous neighbors.
2004 * Merge all three into a single free block.
2006 if (haveleft && haveright) {
2008 * Delete the old by-size entry on the left.
2010 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
2012 if (XFS_IS_CORRUPT(mp, i != 1)) {
2013 error = -EFSCORRUPTED;
2016 if ((error = xfs_btree_delete(cnt_cur, &i)))
2018 if (XFS_IS_CORRUPT(mp, i != 1)) {
2019 error = -EFSCORRUPTED;
2023 * Delete the old by-size entry on the right.
2025 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
2027 if (XFS_IS_CORRUPT(mp, i != 1)) {
2028 error = -EFSCORRUPTED;
2031 if ((error = xfs_btree_delete(cnt_cur, &i)))
2033 if (XFS_IS_CORRUPT(mp, i != 1)) {
2034 error = -EFSCORRUPTED;
2038 * Delete the old by-block entry for the right block.
2040 if ((error = xfs_btree_delete(bno_cur, &i)))
2042 if (XFS_IS_CORRUPT(mp, i != 1)) {
2043 error = -EFSCORRUPTED;
2047 * Move the by-block cursor back to the left neighbor.
2049 if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
2051 if (XFS_IS_CORRUPT(mp, i != 1)) {
2052 error = -EFSCORRUPTED;
2057 * Check that this is the right record: delete didn't
2058 * mangle the cursor.
2061 xfs_agblock_t xxbno;
2064 if ((error = xfs_alloc_get_rec(bno_cur, &xxbno, &xxlen,
2067 if (XFS_IS_CORRUPT(mp,
2071 error = -EFSCORRUPTED;
2077 * Update remaining by-block entry to the new, joined block.
2080 nlen = len + ltlen + gtlen;
2081 if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2085 * Have only a left contiguous neighbor.
2086 * Merge it together with the new freespace.
2088 else if (haveleft) {
2090 * Delete the old by-size entry on the left.
2092 if ((error = xfs_alloc_lookup_eq(cnt_cur, ltbno, ltlen, &i)))
2094 if (XFS_IS_CORRUPT(mp, i != 1)) {
2095 error = -EFSCORRUPTED;
2098 if ((error = xfs_btree_delete(cnt_cur, &i)))
2100 if (XFS_IS_CORRUPT(mp, i != 1)) {
2101 error = -EFSCORRUPTED;
2105 * Back up the by-block cursor to the left neighbor, and
2106 * update its length.
2108 if ((error = xfs_btree_decrement(bno_cur, 0, &i)))
2110 if (XFS_IS_CORRUPT(mp, i != 1)) {
2111 error = -EFSCORRUPTED;
2116 if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2120 * Have only a right contiguous neighbor.
2121 * Merge it together with the new freespace.
2123 else if (haveright) {
2125 * Delete the old by-size entry on the right.
2127 if ((error = xfs_alloc_lookup_eq(cnt_cur, gtbno, gtlen, &i)))
2129 if (XFS_IS_CORRUPT(mp, i != 1)) {
2130 error = -EFSCORRUPTED;
2133 if ((error = xfs_btree_delete(cnt_cur, &i)))
2135 if (XFS_IS_CORRUPT(mp, i != 1)) {
2136 error = -EFSCORRUPTED;
2140 * Update the starting block and length of the right
2141 * neighbor in the by-block tree.
2145 if ((error = xfs_alloc_update(bno_cur, nbno, nlen)))
2149 * No contiguous neighbors.
2150 * Insert the new freespace into the by-block tree.
2155 if ((error = xfs_btree_insert(bno_cur, &i)))
2157 if (XFS_IS_CORRUPT(mp, i != 1)) {
2158 error = -EFSCORRUPTED;
2162 xfs_btree_del_cursor(bno_cur, XFS_BTREE_NOERROR);
2165 * In all cases we need to insert the new freespace in the by-size tree.
2167 if ((error = xfs_alloc_lookup_eq(cnt_cur, nbno, nlen, &i)))
2169 if (XFS_IS_CORRUPT(mp, i != 0)) {
2170 error = -EFSCORRUPTED;
2173 if ((error = xfs_btree_insert(cnt_cur, &i)))
2175 if (XFS_IS_CORRUPT(mp, i != 1)) {
2176 error = -EFSCORRUPTED;
2179 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_NOERROR);
2183 * Update the freespace totals in the ag and superblock.
2185 error = xfs_alloc_update_counters(tp, agbp, len);
2186 xfs_ag_resv_free_extent(agbp->b_pag, type, tp, len);
2190 XFS_STATS_INC(mp, xs_freex);
2191 XFS_STATS_ADD(mp, xs_freeb, len);
2193 trace_xfs_free_extent(mp, agno, bno, len, type, haveleft, haveright);
/* Error path: trace the failure and tear down any live cursors. */
2198 trace_xfs_free_extent(mp, agno, bno, len, type, -1, -1);
2200 xfs_btree_del_cursor(bno_cur, XFS_BTREE_ERROR);
2202 xfs_btree_del_cursor(cnt_cur, XFS_BTREE_ERROR);
2207 * Visible (exported) allocation/free functions.
2208 * Some of these are used just by xfs_alloc_btree.c and this file.
/*
 * xfs_alloc_compute_maxlevels() -- derive the maximum possible height of the
 * free-space btrees for this filesystem geometry and cache it in
 * mp->m_alloc_maxlevels.  The record count bound is (sb_agblocks + 1) / 2
 * because each free-space record needs at least one free block plus one
 * allocated block separating it from its neighbor.
 */
2212 * Compute and fill in value of m_alloc_maxlevels.
2215 xfs_alloc_compute_maxlevels(
2216 xfs_mount_t *mp) /* file system mount structure */
2218 mp->m_alloc_maxlevels = xfs_btree_compute_maxlevels(mp->m_alloc_mnr,
2219 (mp->m_sb.sb_agblocks + 1) / 2);
/* Must never exceed what the on-disk format can represent. */
2220 ASSERT(mp->m_alloc_maxlevels <= xfs_allocbt_maxlevels_ondisk());
/*
 * xfs_alloc_longest_free_extent() -- usable length of the AG's longest free
 * extent after deducting @need blocks for AGFL refill (beyond what the AGFL
 * already holds) and any shortfall in covering @reserved blocks from the
 * non-longest free space.  Returns 0/1 as a boolean "a single block might
 * still be available" when the longest extent is fully consumed by deltas.
 */
2224 * Find the length of the longest extent in an AG. The 'need' parameter
2225 * specifies how much space we're going to need for the AGFL and the
2226 * 'reserved' parameter tells us how many blocks in this AG are reserved for
2230 xfs_alloc_longest_free_extent(
2231 struct xfs_perag *pag,
2233 xfs_extlen_t reserved)
2235 xfs_extlen_t delta = 0;
2238 * If the AGFL needs a recharge, we'll have to subtract that from the
2241 if (need > pag->pagf_flcount)
2242 delta = need - pag->pagf_flcount;
2245 * If we cannot maintain others' reservations with space from the
2246 * not-longest freesp extents, we'll have to subtract /that/ from
2247 * the longest extent too.
2249 if (pag->pagf_freeblks - pag->pagf_longest < reserved)
2250 delta += reserved - (pag->pagf_freeblks - pag->pagf_longest);
2253 * If the longest extent is long enough to satisfy all the
2254 * reservations and AGFL rules in place, we can return this extent.
2256 if (pag->pagf_longest > delta)
/* Cap at m_ag_max_usable so callers never see more than an AG can give. */
2257 return min_t(xfs_extlen_t, pag->pag_mount->m_ag_max_usable,
2258 pag->pagf_longest - delta);
2260 /* Otherwise, let the caller try for 1 block if there's space. */
2261 return pag->pagf_flcount > 0 || pag->pagf_longest > 0;
/*
 * xfs_alloc_min_freelist() -- minimum number of AGFL blocks needed so that
 * each per-AG btree (bnobt, cntbt, and rmapbt if enabled) could split all
 * the way to the root: current height + 1, capped at the tree's maximum.
 * With @pag == NULL, per-tree heights default to 1, which combined with the
 * maxlevels caps yields the largest possible minimum.
 */
2265 * Compute the minimum length of the AGFL in the given AG. If @pag is NULL,
2266 * return the largest possible minimum length.
2269 xfs_alloc_min_freelist(
2270 struct xfs_mount *mp,
2271 struct xfs_perag *pag)
2273 /* AG btrees have at least 1 level. */
2274 static const uint8_t fake_levels[XFS_BTNUM_AGF] = {1, 1, 1};
2275 const uint8_t *levels = pag ? pag->pagf_levels : fake_levels;
2276 unsigned int min_free;
2278 ASSERT(mp->m_alloc_maxlevels > 0);
2280 /* space needed by-bno freespace btree */
2281 min_free = min_t(unsigned int, levels[XFS_BTNUM_BNOi] + 1,
2282 mp->m_alloc_maxlevels);
2283 /* space needed by-size freespace btree */
2284 min_free += min_t(unsigned int, levels[XFS_BTNUM_CNTi] + 1,
2285 mp->m_alloc_maxlevels);
2286 /* space needed reverse mapping used space btree */
2287 if (xfs_has_rmapbt(mp))
2288 min_free += min_t(unsigned int, levels[XFS_BTNUM_RMAPi] + 1,
2289 mp->m_rmap_maxlevels);
/*
 * xfs_alloc_space_available() -- decide whether this AG has both a long
 * enough contiguous extent and enough total free space for the request.
 * Frees always proceed.  On success (unless XFS_ALLOC_FLAG_CHECK, a
 * read-only probe), args->maxlen may be clamped down to what is actually
 * available.  Returns boolean; presumably true means "go ahead" -- the
 * return statements are elided from this extract, confirm in the full file.
 */
2295 * Check if the operation we are fixing up the freelist for should go ahead or
2296 * not. If we are freeing blocks, we always allow it, otherwise the allocation
2297 * is dependent on whether the size and shape of free space available will
2298 * permit the requested allocation to take place.
2301 xfs_alloc_space_available(
2302 struct xfs_alloc_arg *args,
2303 xfs_extlen_t min_free,
2306 struct xfs_perag *pag = args->pag;
2307 xfs_extlen_t alloc_len, longest;
2308 xfs_extlen_t reservation; /* blocks that are still reserved */
2310 xfs_extlen_t agflcount;
2312 if (flags & XFS_ALLOC_FLAG_FREEING)
2315 reservation = xfs_ag_resv_needed(pag, args->resv);
2317 /* do we have enough contiguous free space for the allocation? */
2318 alloc_len = args->minlen + (args->alignment - 1) + args->minalignslop;
2319 longest = xfs_alloc_longest_free_extent(pag, min_free, reservation);
2320 if (longest < alloc_len)
2324 * Do we have enough free space remaining for the allocation? Don't
2325 * account extra agfl blocks because we are about to defer free them,
2326 * making them unavailable until the current transaction commits.
2328 agflcount = min_t(xfs_extlen_t, pag->pagf_flcount, min_free);
2329 available = (int)(pag->pagf_freeblks + agflcount -
2330 reservation - min_free - args->minleft);
2331 if (available < (int)max(args->total, alloc_len))
2335 * Clamp maxlen to the amount of free space available for the actual
2336 * extent allocation.
2338 if (available < (int)args->maxlen && !(flags & XFS_ALLOC_FLAG_CHECK)) {
2339 args->maxlen = available;
2340 ASSERT(args->maxlen > 0);
2341 ASSERT(args->maxlen >= args->minlen);
/*
 * xfs_free_agfl_block() -- free a single block that came off the AGFL:
 * return it to the free-space btrees with the AGFL reservation type, then
 * grab and invalidate its buffer so stale contents can never be replayed.
 */
2348 xfs_free_agfl_block(
2349 struct xfs_trans *tp,
2350 xfs_agnumber_t agno,
2351 xfs_agblock_t agbno,
2352 struct xfs_buf *agbp,
2353 struct xfs_owner_info *oinfo)
/* AGFL blocks are accounted against the XFS_AG_RESV_AGFL reservation. */
2358 error = xfs_free_ag_extent(tp, agbp, agno, agbno, 1, oinfo,
2363 error = xfs_trans_get_buf(tp, tp->t_mountp->m_ddev_targp,
2364 XFS_AGB_TO_DADDR(tp->t_mountp, agno, agbno),
2365 tp->t_mountp->m_bsize, 0, &bp);
/* Mark the buffer stale in this transaction -- block is free now. */
2368 xfs_trans_binval(tp, bp);
/*
 * xfs_agfl_needs_reset() -- detect an inconsistent AGFL header, primarily
 * the v5 packed-vs-unpacked padding bug that leaves the on-disk flcount off
 * by one relative to the active [flfirst, fllast] range.  Returns true when
 * the AGFL must be reset before use; always false on v4 (no AGFL header).
 */
2374 * Check the agfl fields of the agf for inconsistency or corruption. The purpose
2375 * is to detect an agfl header padding mismatch between current and early v5
2376 * kernels. This problem manifests as a 1-slot size difference between the
2377 * on-disk flcount and the active [first, last] range of a wrapped agfl. This
2378 * may also catch variants of agfl count corruption unrelated to padding. Either
2379 * way, we'll reset the agfl and warn the user.
2381 * Return true if a reset is required before the agfl can be used, false
2385 xfs_agfl_needs_reset(
2386 struct xfs_mount *mp,
2387 struct xfs_agf *agf)
2389 uint32_t f = be32_to_cpu(agf->agf_flfirst);
2390 uint32_t l = be32_to_cpu(agf->agf_fllast);
2391 uint32_t c = be32_to_cpu(agf->agf_flcount);
2392 int agfl_size = xfs_agfl_size(mp);
2395 /* no agfl header on v4 supers */
2396 if (!xfs_has_crc(mp))
2400 * The agf read verifier catches severe corruption of these fields.
2401 * Repeat some sanity checks to cover a packed -> unpacked mismatch if
2402 * the verifier allows it.
2404 if (f >= agfl_size || l >= agfl_size)
2410 * Check consistency between the on-disk count and the active range. An
2411 * agfl padding mismatch manifests as an inconsistent flcount.
/* Wrapped case: active slots run from f to the end, then 0 to l. */
2416 active = agfl_size - f + l + 1;
/*
 * xfs_agfl_reset() -- empty out an untrustworthy AGFL: zero flfirst/flcount,
 * wrap fllast to the final slot, log the AGF fields, and clear the in-core
 * pagf_flcount/pagf_agflreset state.  Any blocks that were on the AGFL are
 * deliberately leaked (xfs_repair recovers them); the user is warned.
 */
2424 * Reset the agfl to an empty state. Ignore/drop any existing blocks since the
2425 * agfl content cannot be trusted. Warn the user that a repair is required to
2426 * recover leaked blocks.
2428 * The purpose of this mechanism is to handle filesystems affected by the agfl
2429 * header padding mismatch problem. A reset keeps the filesystem online with a
2430 * relatively minor free space accounting inconsistency rather than suffer the
2431 * inevitable crash from use of an invalid agfl block.
2435 struct xfs_trans *tp,
2436 struct xfs_buf *agbp,
2437 struct xfs_perag *pag)
2439 struct xfs_mount *mp = tp->t_mountp;
2440 struct xfs_agf *agf = agbp->b_addr;
/* Only reachable when xfs_agfl_needs_reset() flagged this AG. */
2442 ASSERT(pag->pagf_agflreset);
2443 trace_xfs_agfl_reset(mp, agf, 0, _RET_IP_);
2446 "WARNING: Reset corrupted AGFL on AG %u. %d blocks leaked. "
2447 "Please unmount and run xfs_repair.",
2448 pag->pag_agno, pag->pagf_flcount);
2450 agf->agf_flfirst = 0;
/* Empty list convention: fllast sits one slot "behind" flfirst. */
2451 agf->agf_fllast = cpu_to_be32(xfs_agfl_size(mp) - 1);
2452 agf->agf_flcount = 0;
2453 xfs_alloc_log_agf(tp, agbp, XFS_AGF_FLFIRST | XFS_AGF_FLLAST |
2456 pag->pagf_flcount = 0;
2457 pag->pagf_agflreset = false;
/*
 * xfs_defer_agfl_block() -- queue a single AGFL block for freeing at
 * deferred-ops processing time rather than freeing it immediately, so the
 * btree join an immediate free could trigger cannot overrun the current
 * transaction's log reservation.
 */
2461 * Defer an AGFL block free. This is effectively equivalent to
2462 * xfs_free_extent_later() with some special handling particular to AGFL blocks.
2464 * Deferring AGFL frees helps prevent log reservation overruns due to too many
2465 * allocation operations in a transaction. AGFL frees are prone to this problem
2466 * because for one they are always freed one at a time. Further, an immediate
2467 * AGFL block free can cause a btree join and require another block free before
2468 * the real allocation can proceed. Deferring the free disconnects freeing up
2469 * the AGFL slot from freeing the block.
2472 xfs_defer_agfl_block(
2473 struct xfs_trans *tp,
2474 xfs_agnumber_t agno,
2475 xfs_fsblock_t agbno,
2476 struct xfs_owner_info *oinfo)
2478 struct xfs_mount *mp = tp->t_mountp;
2479 struct xfs_extent_free_item *new; /* new element */
2481 ASSERT(xfs_extfree_item_cache != NULL);
2482 ASSERT(oinfo != NULL);
/* __GFP_NOFAIL: a deferred free must not fail mid-transaction. */
2484 new = kmem_cache_zalloc(xfs_extfree_item_cache,
2485 GFP_KERNEL | __GFP_NOFAIL);
2486 new->xefi_startblock = XFS_AGB_TO_FSB(mp, agno, agbno);
2487 new->xefi_blockcount = 1;
2488 new->xefi_owner = oinfo->oi_owner;
2490 trace_xfs_agfl_free_defer(mp, agno, 0, agbno, 1);
2492 xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_AGFL_FREE, &new->xefi_list);
/*
 * __xfs_free_extent_later() -- queue an arbitrary extent for freeing when
 * the transaction's deferred ops run.  Validates the fsblock range against
 * the superblock geometry, captures owner info (attr-fork/bmbt flags, or
 * XFS_RMAP_OWN_NULL when no owner is given) and an optional skip-discard
 * flag, then adds the item to the XFS_DEFER_OPS_TYPE_FREE list.
 */
2496 * Add the extent to the list of extents to be free at transaction end.
2497 * The list is maintained sorted (by block number).
2500 __xfs_free_extent_later(
2501 struct xfs_trans *tp,
2504 const struct xfs_owner_info *oinfo,
2507 struct xfs_extent_free_item *new; /* new element */
2509 struct xfs_mount *mp = tp->t_mountp;
2510 xfs_agnumber_t agno;
2511 xfs_agblock_t agbno;
2513 ASSERT(bno != NULLFSBLOCK);
2515 ASSERT(len <= XFS_MAX_BMBT_EXTLEN);
2516 ASSERT(!isnullstartblock(bno));
2517 agno = XFS_FSB_TO_AGNO(mp, bno);
2518 agbno = XFS_FSB_TO_AGBNO(mp, bno);
/* Sanity-check that the whole extent lies inside one valid AG. */
2519 ASSERT(agno < mp->m_sb.sb_agcount);
2520 ASSERT(agbno < mp->m_sb.sb_agblocks);
2521 ASSERT(len < mp->m_sb.sb_agblocks);
2522 ASSERT(agbno + len <= mp->m_sb.sb_agblocks);
2524 ASSERT(xfs_extfree_item_cache != NULL);
/* __GFP_NOFAIL: a deferred free must not fail mid-transaction. */
2526 new = kmem_cache_zalloc(xfs_extfree_item_cache,
2527 GFP_KERNEL | __GFP_NOFAIL);
2528 new->xefi_startblock = bno;
2529 new->xefi_blockcount = (xfs_extlen_t)len;
2531 new->xefi_flags |= XFS_EFI_SKIP_DISCARD;
2533 ASSERT(oinfo->oi_offset == 0);
2535 if (oinfo->oi_flags & XFS_OWNER_INFO_ATTR_FORK)
2536 new->xefi_flags |= XFS_EFI_ATTR_FORK;
2537 if (oinfo->oi_flags & XFS_OWNER_INFO_BMBT_BLOCK)
2538 new->xefi_flags |= XFS_EFI_BMBT_BLOCK;
2539 new->xefi_owner = oinfo->oi_owner;
2541 new->xefi_owner = XFS_RMAP_OWN_NULL;
2543 trace_xfs_bmap_free_defer(tp->t_mountp,
2544 XFS_FSB_TO_AGNO(tp->t_mountp, bno), 0,
2545 XFS_FSB_TO_AGBNO(tp->t_mountp, bno), len);
2546 xfs_defer_add(tp, XFS_DEFER_OPS_TYPE_FREE, &new->xefi_list);
/*
 * xfs_exact_minlen_extent_available() -- probe the cntbt for a free extent
 * whose length is exactly args->minlen.  Sets *stat via the btree lookup and
 * clears it again if the record found is >= minlen but not an exact match.
 * Used by the alloc_minlen_only debug/error-injection path.
 */
2551 * Check if an AGF has a free extent record whose length is equal to
2555 xfs_exact_minlen_extent_available(
2556 struct xfs_alloc_arg *args,
2557 struct xfs_buf *agbp,
2560 struct xfs_btree_cur *cnt_cur;
2565 cnt_cur = xfs_allocbt_init_cursor(args->mp, args->tp, agbp,
2566 args->pag, XFS_BTNUM_CNT);
2567 error = xfs_alloc_lookup_ge(cnt_cur, 0, args->minlen, stat);
2572 error = -EFSCORRUPTED;
2576 error = xfs_alloc_get_rec(cnt_cur, &fbno, &flen, stat);
/* A record >= minlen exists but isn't exactly minlen: report absent. */
2580 if (*stat == 1 && flen != args->minlen)
/* Cursor teardown mode follows the error state. */
2584 xfs_btree_del_cursor(cnt_cur, error);
2591 * Decide whether to use this allocation group for this allocation.
2592 * If so, fix up the btree freelist's size.
2595 xfs_alloc_fix_freelist(
2596 struct xfs_alloc_arg *args, /* allocation argument structure */
2597 int flags) /* XFS_ALLOC_FLAG_... */
2599 struct xfs_mount *mp = args->mp;
2600 struct xfs_perag *pag = args->pag;
2601 struct xfs_trans *tp = args->tp;
2602 struct xfs_buf *agbp = NULL;
2603 struct xfs_buf *agflbp = NULL;
2604 struct xfs_alloc_arg targs; /* local allocation arguments */
2605 xfs_agblock_t bno; /* freelist block */
2606 xfs_extlen_t need; /* total blocks needed in freelist */
2609 /* deferred ops (AGFL block frees) require permanent transactions */
2610 ASSERT(tp->t_flags & XFS_TRANS_PERM_LOG_RES);
2612 if (!pag->pagf_init) {
2613 error = xfs_alloc_read_agf(pag, tp, flags, &agbp);
2615 /* Couldn't lock the AGF so skip this AG. */
2616 if (error == -EAGAIN)
2623 * If this is a metadata preferred pag and we are user data then try
2624 * somewhere else if we are not being asked to try harder at this
2627 if (pag->pagf_metadata && (args->datatype & XFS_ALLOC_USERDATA) &&
2628 (flags & XFS_ALLOC_FLAG_TRYLOCK)) {
2629 ASSERT(!(flags & XFS_ALLOC_FLAG_FREEING));
2630 goto out_agbp_relse;
2633 need = xfs_alloc_min_freelist(mp, pag);
2634 if (!xfs_alloc_space_available(args, need, flags |
2635 XFS_ALLOC_FLAG_CHECK))
2636 goto out_agbp_relse;
2639 * Get the a.g. freespace buffer.
2640 * Can fail if we're not blocking on locks, and it's held.
2643 error = xfs_alloc_read_agf(pag, tp, flags, &agbp);
2645 /* Couldn't lock the AGF so skip this AG. */
2646 if (error == -EAGAIN)
2652 /* reset a padding mismatched agfl before final free space check */
2653 if (pag->pagf_agflreset)
2654 xfs_agfl_reset(tp, agbp, pag);
2656 /* If there isn't enough total space or single-extent, reject it. */
2657 need = xfs_alloc_min_freelist(mp, pag);
2658 if (!xfs_alloc_space_available(args, need, flags))
2659 goto out_agbp_relse;
2662 if (args->alloc_minlen_only) {
2665 error = xfs_exact_minlen_extent_available(args, agbp, &stat);
2667 goto out_agbp_relse;
2671 * Make the freelist shorter if it's too long.
2673 * Note that from this point onwards, we will always release the agf and
2674 * agfl buffers on error. This handles the case where we error out and
2675 * the buffers are clean or may not have been joined to the transaction
2676 * and hence need to be released manually. If they have been joined to
2677 * the transaction, then xfs_trans_brelse() will handle them
2678 * appropriately based on the recursion count and dirty state of the
2681 * XXX (dgc): When we have lots of free space, does this buy us
2682 * anything other than extra overhead when we need to put more blocks
2683 * back on the free list? Maybe we should only do this when space is
2684 * getting low or the AGFL is more than half full?
2686 * The NOSHRINK flag prevents the AGFL from being shrunk if it's too
2687 * big; the NORMAP flag prevents AGFL expand/shrink operations from
2688 * updating the rmapbt. Both flags are used in xfs_repair while we're
2689 * rebuilding the rmapbt, and neither are used by the kernel. They're
2690 * both required to ensure that rmaps are correctly recorded for the
2691 * regenerated AGFL, bnobt, and cntbt. See repair/phase5.c and
2692 * repair/rmap.c in xfsprogs for details.
2694 memset(&targs, 0, sizeof(targs));
2695 /* struct copy below */
2696 if (flags & XFS_ALLOC_FLAG_NORMAP)
2697 targs.oinfo = XFS_RMAP_OINFO_SKIP_UPDATE;
2699 targs.oinfo = XFS_RMAP_OINFO_AG;
2700 while (!(flags & XFS_ALLOC_FLAG_NOSHRINK) && pag->pagf_flcount > need) {
2701 error = xfs_alloc_get_freelist(pag, tp, agbp, &bno, 0);
2703 goto out_agbp_relse;
2705 /* defer agfl frees */
2706 xfs_defer_agfl_block(tp, args->agno, bno, &targs.oinfo);
2712 targs.agno = args->agno;
2713 targs.alignment = targs.minlen = targs.prod = 1;
2714 targs.type = XFS_ALLOCTYPE_THIS_AG;
2716 error = xfs_alloc_read_agfl(pag, tp, &agflbp);
2718 goto out_agbp_relse;
2720 /* Make the freelist longer if it's too short. */
2721 while (pag->pagf_flcount < need) {
2723 targs.maxlen = need - pag->pagf_flcount;
2724 targs.resv = XFS_AG_RESV_AGFL;
2726 /* Allocate as many blocks as possible at once. */
2727 error = xfs_alloc_ag_vextent(&targs);
2729 goto out_agflbp_relse;
2732 * Stop if we run out. Won't happen if callers are obeying
2733 * the restrictions correctly. Can happen for free calls
2734 * on a completely full ag.
2736 if (targs.agbno == NULLAGBLOCK) {
2737 if (flags & XFS_ALLOC_FLAG_FREEING)
2739 goto out_agflbp_relse;
2742 * Put each allocated block on the list.
2744 for (bno = targs.agbno; bno < targs.agbno + targs.len; bno++) {
2745 error = xfs_alloc_put_freelist(pag, tp, agbp,
2748 goto out_agflbp_relse;
2751 xfs_trans_brelse(tp, agflbp);
2756 xfs_trans_brelse(tp, agflbp);
2759 xfs_trans_brelse(tp, agbp);
2766 * Get a block from the freelist.
2767 * Returns with the buffer for the block gotten.
/*
 * Pops the oldest entry (agf_flfirst) from the AGFL, updates the ondisk
 * AGF and the in-core perag counters, and logs the changed AGF fields.
 * *bnop receives the block number, or NULLAGBLOCK if the list is empty.
 */
2770 xfs_alloc_get_freelist(
2771 struct xfs_perag *pag,
2772 struct xfs_trans *tp,
2773 struct xfs_buf *agbp,
2774 xfs_agblock_t *bnop,
2777 struct xfs_agf *agf = agbp->b_addr;
2778 struct xfs_buf *agflbp;
2783 struct xfs_mount *mp = tp->t_mountp;
2786 * Freelist is empty, give up.
2788 if (!agf->agf_flcount) {
2789 *bnop = NULLAGBLOCK;
2793 * Read the array of free blocks.
2795 error = xfs_alloc_read_agfl(pag, tp, &agflbp);
2801 * Get the block number and update the data structures.
2803 agfl_bno = xfs_buf_to_agfl_bno(agflbp);
2804 bno = be32_to_cpu(agfl_bno[be32_to_cpu(agf->agf_flfirst)]);
2805 be32_add_cpu(&agf->agf_flfirst, 1);
2806 xfs_trans_brelse(tp, agflbp);
/* The AGFL is a circular buffer: wrap flfirst back to slot 0. */
2807 if (be32_to_cpu(agf->agf_flfirst) == xfs_agfl_size(mp))
2808 agf->agf_flfirst = 0;
/* Must not consume entries from an AGFL still awaiting a padding reset. */
2810 ASSERT(!pag->pagf_agflreset);
2811 be32_add_cpu(&agf->agf_flcount, -1);
2812 pag->pagf_flcount--;
2814 logflags = XFS_AGF_FLFIRST | XFS_AGF_FLCOUNT;
/*
 * NOTE(review): in the full source this btreeblks bump is conditional on
 * the caller's btreeblk argument (the guard line is elided here).
 */
2816 be32_add_cpu(&agf->agf_btreeblks, 1);
2817 pag->pagf_btreeblks++;
2818 logflags |= XFS_AGF_BTREEBLKS;
2821 xfs_alloc_log_agf(tp, agbp, logflags);
2828 * Log the given fields from the agf structure.
/*
 * The offsets[] table maps each XFS_AGF_* flag bit, in bit order, to the
 * byte offset of the corresponding field in struct xfs_agf;
 * xfs_btree_offsets() converts the flag mask into a [first, last] byte
 * range which is then logged via xfs_trans_log_buf().
 */
2832 struct xfs_trans *tp,
2836 int first; /* first byte offset */
2837 int last; /* last byte offset */
2838 static const short offsets[] = {
2839 offsetof(xfs_agf_t, agf_magicnum),
2840 offsetof(xfs_agf_t, agf_versionnum),
2841 offsetof(xfs_agf_t, agf_seqno),
2842 offsetof(xfs_agf_t, agf_length),
2843 offsetof(xfs_agf_t, agf_roots[0]),
2844 offsetof(xfs_agf_t, agf_levels[0]),
2845 offsetof(xfs_agf_t, agf_flfirst),
2846 offsetof(xfs_agf_t, agf_fllast),
2847 offsetof(xfs_agf_t, agf_flcount),
2848 offsetof(xfs_agf_t, agf_freeblks),
2849 offsetof(xfs_agf_t, agf_longest),
2850 offsetof(xfs_agf_t, agf_btreeblks),
2851 offsetof(xfs_agf_t, agf_uuid),
2852 offsetof(xfs_agf_t, agf_rmap_blocks),
2853 offsetof(xfs_agf_t, agf_refcount_blocks),
2854 offsetof(xfs_agf_t, agf_refcount_root),
2855 offsetof(xfs_agf_t, agf_refcount_level),
2856 /* needed so that we don't log the whole rest of the structure: */
2857 offsetof(xfs_agf_t, agf_spare64),
2861 trace_xfs_agf(tp->t_mountp, bp->b_addr, fields, _RET_IP_);
/* Tag the buffer so log recovery knows to treat it as an AGF. */
2863 xfs_trans_buf_set_type(tp, bp, XFS_BLFT_AGF_BUF);
2865 xfs_btree_offsets(fields, offsets, XFS_AGF_NUM_BITS, &first, &last);
2866 xfs_trans_log_buf(tp, bp, (uint)first, (uint)last);
2870 * Put the block on the freelist for the allocation group.
/*
 * Appends @bno at agf_fllast (with wraparound), bumps the ondisk and
 * in-core freelist counts, then logs both the AGF fields and the single
 * modified AGFL array entry.
 */
2873 xfs_alloc_put_freelist(
2874 struct xfs_perag *pag,
2875 struct xfs_trans *tp,
2876 struct xfs_buf *agbp,
2877 struct xfs_buf *agflbp,
2881 struct xfs_mount *mp = tp->t_mountp;
2882 struct xfs_agf *agf = agbp->b_addr;
/*
 * NOTE(review): in the full source this read is guarded by "if (!agflbp)"
 * so a caller-supplied AGFL buffer is reused (guard line elided here).
 */
2890 error = xfs_alloc_read_agfl(pag, tp, &agflbp);
/* Advance the tail index, wrapping the circular AGFL buffer. */
2895 be32_add_cpu(&agf->agf_fllast, 1);
2896 if (be32_to_cpu(agf->agf_fllast) == xfs_agfl_size(mp))
2897 agf->agf_fllast = 0;
2899 ASSERT(!pag->pagf_agflreset);
2900 be32_add_cpu(&agf->agf_flcount, 1);
2901 pag->pagf_flcount++;
2903 logflags = XFS_AGF_FLLAST | XFS_AGF_FLCOUNT;
/*
 * NOTE(review): this btreeblks decrement is conditional on the caller's
 * btreeblk argument in the full source (guard elided here).
 */
2905 be32_add_cpu(&agf->agf_btreeblks, -1);
2906 pag->pagf_btreeblks--;
2907 logflags |= XFS_AGF_BTREEBLKS;
2910 xfs_alloc_log_agf(tp, agbp, logflags);
2912 ASSERT(be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp));
/* Store the new entry and log just that one slot of the AGFL. */
2914 agfl_bno = xfs_buf_to_agfl_bno(agflbp);
2915 blockp = &agfl_bno[be32_to_cpu(agf->agf_fllast)];
2916 *blockp = cpu_to_be32(bno);
2917 startoff = (char *)blockp - (char *)agflbp->b_addr;
2919 xfs_alloc_log_agf(tp, agbp, logflags);
2921 xfs_trans_buf_set_type(tp, agflbp, XFS_BLFT_AGFL_BUF);
2922 xfs_trans_log_buf(tp, agflbp, startoff,
2923 startoff + sizeof(xfs_agblock_t) - 1);
/*
 * Structural verifier for an AGF buffer: returns the address of the
 * failing check (__this_address) on corruption, used by the read/write
 * verifiers below to report where validation failed.
 */
static xfs_failaddr_t
2931 struct xfs_mount *mp = bp->b_mount;
2932 struct xfs_agf *agf = bp->b_addr;
/* CRC filesystems additionally carry a UUID and an LSN to validate. */
2934 if (xfs_has_crc(mp)) {
2935 if (!uuid_equal(&agf->agf_uuid, &mp->m_sb.sb_meta_uuid))
2936 return __this_address;
2937 if (!xfs_log_check_lsn(mp, be64_to_cpu(agf->agf_lsn)))
2938 return __this_address;
2941 if (!xfs_verify_magic(bp, agf->agf_magicnum))
2942 return __this_address;
/* Basic sanity: version, freelist indices/count within AGFL bounds. */
2944 if (!(XFS_AGF_GOOD_VERSION(be32_to_cpu(agf->agf_versionnum)) &&
2945 be32_to_cpu(agf->agf_freeblks) <= be32_to_cpu(agf->agf_length) &&
2946 be32_to_cpu(agf->agf_flfirst) < xfs_agfl_size(mp) &&
2947 be32_to_cpu(agf->agf_fllast) < xfs_agfl_size(mp) &&
2948 be32_to_cpu(agf->agf_flcount) <= xfs_agfl_size(mp)))
2949 return __this_address;
2951 if (be32_to_cpu(agf->agf_length) > mp->m_sb.sb_dblocks)
2952 return __this_address;
/* The longest free extent cannot exceed the total free block count. */
2954 if (be32_to_cpu(agf->agf_freeblks) < be32_to_cpu(agf->agf_longest) ||
2955 be32_to_cpu(agf->agf_freeblks) > be32_to_cpu(agf->agf_length))
2956 return __this_address;
/* bnobt/cntbt levels must be within [1, m_alloc_maxlevels]. */
2958 if (be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) < 1 ||
2959 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) < 1 ||
2960 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNO]) >
2961 mp->m_alloc_maxlevels ||
2962 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNT]) >
2963 mp->m_alloc_maxlevels)
2964 return __this_address;
2966 if (xfs_has_rmapbt(mp) &&
2967 (be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) < 1 ||
2968 be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAP]) >
2969 mp->m_rmap_maxlevels))
2970 return __this_address;
2972 if (xfs_has_rmapbt(mp) &&
2973 be32_to_cpu(agf->agf_rmap_blocks) > be32_to_cpu(agf->agf_length))
2974 return __this_address;
2977 * during growfs operations, the perag is not fully initialised,
2978 * so we can't use it for any useful checking. growfs ensures we can't
2979 * use it by using uncached buffers that don't have the perag attached
2980 * so we can detect and avoid this problem.
2982 if (bp->b_pag && be32_to_cpu(agf->agf_seqno) != bp->b_pag->pag_agno)
2983 return __this_address;
2985 if (xfs_has_lazysbcount(mp) &&
2986 be32_to_cpu(agf->agf_btreeblks) > be32_to_cpu(agf->agf_length))
2987 return __this_address;
2989 if (xfs_has_reflink(mp) &&
2990 be32_to_cpu(agf->agf_refcount_blocks) >
2991 be32_to_cpu(agf->agf_length))
2992 return __this_address;
2994 if (xfs_has_reflink(mp) &&
2995 (be32_to_cpu(agf->agf_refcount_level) < 1 ||
2996 be32_to_cpu(agf->agf_refcount_level) > mp->m_refc_maxlevels))
2997 return __this_address;
/*
 * Buffer read verifier: check the CRC first (CRC filesystems only), then
 * run the structural checks.  XFS_TEST_ERROR allows error injection via
 * the ALLOC_READ_AGF errortag for testing corruption handling.
 */
3004 xfs_agf_read_verify(
3007 struct xfs_mount *mp = bp->b_mount;
3010 if (xfs_has_crc(mp) &&
3011 !xfs_buf_verify_cksum(bp, XFS_AGF_CRC_OFF))
3012 xfs_verifier_error(bp, -EFSBADCRC, __this_address);
3014 fa = xfs_agf_verify(bp);
3015 if (XFS_TEST_ERROR(fa, mp, XFS_ERRTAG_ALLOC_READ_AGF))
3016 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
/*
 * Buffer write verifier: validate the structure, then (CRC filesystems)
 * stamp the last-modification LSN from the buffer log item and recompute
 * the ondisk checksum before the buffer goes to disk.
 */
3021 xfs_agf_write_verify(
3024 struct xfs_mount *mp = bp->b_mount;
3025 struct xfs_buf_log_item *bip = bp->b_log_item;
3026 struct xfs_agf *agf = bp->b_addr;
3029 fa = xfs_agf_verify(bp);
3031 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
/* Non-CRC filesystems have no LSN/checksum fields to update. */
3035 if (!xfs_has_crc(mp))
/*
 * NOTE(review): in the full source this LSN stamp is guarded by
 * "if (bip)" — bip may be NULL for unlogged writes (guard elided here).
 */
3039 agf->agf_lsn = cpu_to_be64(bip->bli_item.li_lsn);
3041 xfs_buf_update_cksum(bp, XFS_AGF_CRC_OFF);
/* Buffer ops vector wiring the AGF verifiers into the buffer cache. */
const struct xfs_buf_ops xfs_agf_buf_ops = {
3046 .magic = { cpu_to_be32(XFS_AGF_MAGIC), cpu_to_be32(XFS_AGF_MAGIC) },
3047 .verify_read = xfs_agf_read_verify,
3048 .verify_write = xfs_agf_write_verify,
3049 .verify_struct = xfs_agf_verify,
3053 * Read in the allocation group header (free/alloc section).
/*
 * Thin wrapper around xfs_trans_read_buf() that computes the AGF's disk
 * address for this AG, attaches the AGF verifier ops, and raises the
 * buffer's LRU reference count so the header stays cached.
 */
3057 struct xfs_perag *pag,
3058 struct xfs_trans *tp,
3060 struct xfs_buf **agfbpp)
3062 struct xfs_mount *mp = pag->pag_mount;
3065 trace_xfs_read_agf(pag->pag_mount, pag->pag_agno);
3067 error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp,
3068 XFS_AG_DADDR(mp, pag->pag_agno, XFS_AGF_DADDR(mp)),
3069 XFS_FSS_TO_BB(mp, 1), flags, agfbpp, &xfs_agf_buf_ops);
3073 xfs_buf_set_ref(*agfbpp, XFS_AGF_REF);
3078 * Read in the allocation group header (free/alloc section) and initialise the
3079 * perag structure if necessary. If the caller provides @agfbpp, then return the
3080 * locked buffer to the caller, otherwise free it.
3084 struct xfs_perag *pag,
3085 struct xfs_trans *tp,
3087 struct xfs_buf **agfbpp)
3089 struct xfs_buf *agfbp;
3090 struct xfs_agf *agf;
3094 trace_xfs_alloc_read_agf(pag->pag_mount, pag->pag_agno);
3096 /* We don't support trylock when freeing. */
3097 ASSERT((flags & (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK)) !=
3098 (XFS_ALLOC_FLAG_FREEING | XFS_ALLOC_FLAG_TRYLOCK));
3099 error = xfs_read_agf(pag, tp,
3100 (flags & XFS_ALLOC_FLAG_TRYLOCK) ? XBF_TRYLOCK : 0,
3105 agf = agfbp->b_addr;
/* First read of this AG: seed the in-core perag from the ondisk AGF. */
3106 if (!pag->pagf_init) {
3107 pag->pagf_freeblks = be32_to_cpu(agf->agf_freeblks);
3108 pag->pagf_btreeblks = be32_to_cpu(agf->agf_btreeblks);
3109 pag->pagf_flcount = be32_to_cpu(agf->agf_flcount);
3110 pag->pagf_longest = be32_to_cpu(agf->agf_longest);
3111 pag->pagf_levels[XFS_BTNUM_BNOi] =
3112 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]);
3113 pag->pagf_levels[XFS_BTNUM_CNTi] =
3114 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]);
3115 pag->pagf_levels[XFS_BTNUM_RMAPi] =
3116 be32_to_cpu(agf->agf_levels[XFS_BTNUM_RMAPi]);
3117 pag->pagf_refcount_level = be32_to_cpu(agf->agf_refcount_level);
3119 pag->pagf_agflreset = xfs_agfl_needs_reset(pag->pag_mount, agf);
3122 * Update the in-core allocbt counter. Filter out the rmapbt
3123 * subset of the btreeblks counter because the rmapbt is managed
3124 * by perag reservation. Subtract one for the rmapbt root block
3125 * because the rmap counter includes it while the btreeblks
3126 * counter only tracks non-root blocks.
3128 allocbt_blks = pag->pagf_btreeblks;
3129 if (xfs_has_rmapbt(pag->pag_mount))
3130 allocbt_blks -= be32_to_cpu(agf->agf_rmap_blocks) - 1;
3131 if (allocbt_blks > 0)
3132 atomic64_add(allocbt_blks,
3133 &pag->pag_mount->m_allocbt_blks);
/* Already initialised: debug-check in-core state against disk. */
3136 else if (!xfs_is_shutdown(pag->pag_mount)) {
3137 ASSERT(pag->pagf_freeblks == be32_to_cpu(agf->agf_freeblks));
3138 ASSERT(pag->pagf_btreeblks == be32_to_cpu(agf->agf_btreeblks));
3139 ASSERT(pag->pagf_flcount == be32_to_cpu(agf->agf_flcount));
3140 ASSERT(pag->pagf_longest == be32_to_cpu(agf->agf_longest));
3141 ASSERT(pag->pagf_levels[XFS_BTNUM_BNOi] ==
3142 be32_to_cpu(agf->agf_levels[XFS_BTNUM_BNOi]));
3143 ASSERT(pag->pagf_levels[XFS_BTNUM_CNTi] ==
3144 be32_to_cpu(agf->agf_levels[XFS_BTNUM_CNTi]));
/* Caller didn't want the buffer back: release it here. */
3150 xfs_trans_brelse(tp, agfbp);
3155 * Allocate an extent (variable-size).
3156 * Depending on the allocation type, we either look in a single allocation
3157 * group or loop over the allocation groups to find the result.
/*
 * Top-level allocator dispatch.  On success args->fsbno/agbno/len describe
 * the allocated extent; args->fsbno == NULLFSBLOCK signals failure.
 * NOTE(review): many lines (error labels, braces, fallthroughs) are elided
 * in this view; comments describe only the visible flow.
 */
3161 struct xfs_alloc_arg *args) /* allocation argument structure */
3163 xfs_agblock_t agsize; /* allocation group size */
3165 int flags; /* XFS_ALLOC_FLAG_... locking flags */
3166 struct xfs_mount *mp; /* mount structure pointer */
3167 xfs_agnumber_t sagno; /* starting allocation group number */
3168 xfs_alloctype_t type; /* input allocation type */
3170 xfs_agnumber_t rotorstep = xfs_rotorstep; /* inode32 agf stepper */
3173 type = args->otype = args->type;
3174 args->agbno = NULLAGBLOCK;
3176 * Just fix this up, for the case where the last a.g. is shorter
3177 * (or there's only one a.g.) and the caller couldn't easily figure
3178 * that out (xfs_bmap_alloc).
3180 agsize = mp->m_sb.sb_agblocks;
3181 if (args->maxlen > agsize)
3182 args->maxlen = agsize;
3183 if (args->alignment == 0)
3184 args->alignment = 1;
/* Debug asserts plus runtime rejection of out-of-range arguments. */
3185 ASSERT(XFS_FSB_TO_AGNO(mp, args->fsbno) < mp->m_sb.sb_agcount);
3186 ASSERT(XFS_FSB_TO_AGBNO(mp, args->fsbno) < agsize);
3187 ASSERT(args->minlen <= args->maxlen);
3188 ASSERT(args->minlen <= agsize);
3189 ASSERT(args->mod < args->prod);
3190 if (XFS_FSB_TO_AGNO(mp, args->fsbno) >= mp->m_sb.sb_agcount ||
3191 XFS_FSB_TO_AGBNO(mp, args->fsbno) >= agsize ||
3192 args->minlen > args->maxlen || args->minlen > agsize ||
3193 args->mod >= args->prod) {
3194 args->fsbno = NULLFSBLOCK;
3195 trace_xfs_alloc_vextent_badargs(args);
3200 case XFS_ALLOCTYPE_THIS_AG:
3201 case XFS_ALLOCTYPE_NEAR_BNO:
3202 case XFS_ALLOCTYPE_THIS_BNO:
3204 * These three force us into a single a.g.
3206 args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
3207 args->pag = xfs_perag_get(mp, args->agno);
3208 error = xfs_alloc_fix_freelist(args, 0);
3210 trace_xfs_alloc_vextent_nofix(args);
3214 trace_xfs_alloc_vextent_noagbp(args);
3217 args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
3218 if ((error = xfs_alloc_ag_vextent(args)))
3221 case XFS_ALLOCTYPE_START_BNO:
3223 * Try near allocation first, then anywhere-in-ag after
3224 * the first a.g. fails.
3226 if ((args->datatype & XFS_ALLOC_INITIAL_USER_DATA) &&
3227 xfs_is_inode32(mp)) {
/* inode32: pick a start AG from the rotor instead of the hint. */
3228 args->fsbno = XFS_AGB_TO_FSB(mp,
3229 ((mp->m_agfrotor / rotorstep) %
3230 mp->m_sb.sb_agcount), 0);
3233 args->agbno = XFS_FSB_TO_AGBNO(mp, args->fsbno);
3234 args->type = XFS_ALLOCTYPE_NEAR_BNO;
3236 case XFS_ALLOCTYPE_FIRST_AG:
3238 * Rotate through the allocation groups looking for a winner.
3240 if (type == XFS_ALLOCTYPE_FIRST_AG) {
3242 * Start with allocation group given by bno.
3244 args->agno = XFS_FSB_TO_AGNO(mp, args->fsbno);
3245 args->type = XFS_ALLOCTYPE_THIS_AG;
3250 * Start with the given allocation group.
3252 args->agno = sagno = XFS_FSB_TO_AGNO(mp, args->fsbno);
3253 flags = XFS_ALLOC_FLAG_TRYLOCK;
3256 * Loop over allocation groups twice; first time with
3257 * trylock set, second time without.
3260 args->pag = xfs_perag_get(mp, args->agno);
3261 error = xfs_alloc_fix_freelist(args, flags);
3263 trace_xfs_alloc_vextent_nofix(args);
3267 * If we get a buffer back then the allocation will fly.
3270 if ((error = xfs_alloc_ag_vextent(args)))
3275 trace_xfs_alloc_vextent_loopfailed(args);
3278 * Didn't work, figure out the next iteration.
3280 if (args->agno == sagno &&
3281 type == XFS_ALLOCTYPE_START_BNO)
3282 args->type = XFS_ALLOCTYPE_THIS_AG;
3284 * For the first allocation, we can try any AG to get
3285 * space. However, if we already have allocated a
3286 * block, we don't want to try AGs whose number is below
3287 * sagno. Otherwise, we may end up with out-of-order
3288 * locking of AGF, which might cause deadlock.
3290 if (++(args->agno) == mp->m_sb.sb_agcount) {
3291 if (args->tp->t_firstblock != NULLFSBLOCK)
3297 * Reached the starting a.g., must either be done
3298 * or switch to non-trylock mode.
3300 if (args->agno == sagno) {
3302 args->agbno = NULLAGBLOCK;
3303 trace_xfs_alloc_vextent_allfailed(args);
3308 if (type == XFS_ALLOCTYPE_START_BNO) {
3309 args->agbno = XFS_FSB_TO_AGBNO(mp,
3311 args->type = XFS_ALLOCTYPE_NEAR_BNO;
3314 xfs_perag_put(args->pag);
/* Advance the rotor so the next inode32 allocation starts elsewhere. */
3317 if (args->agno == sagno)
3318 mp->m_agfrotor = (mp->m_agfrotor + 1) %
3319 (mp->m_sb.sb_agcount * rotorstep);
3321 mp->m_agfrotor = (args->agno * rotorstep + 1) %
3322 (mp->m_sb.sb_agcount * rotorstep);
/* Translate the AG-relative result back into a filesystem block. */
3329 if (args->agbno == NULLAGBLOCK)
3330 args->fsbno = NULLFSBLOCK;
3332 args->fsbno = XFS_AGB_TO_FSB(mp, args->agno, args->agbno);
3334 ASSERT(args->len >= args->minlen);
3335 ASSERT(args->len <= args->maxlen);
3336 ASSERT(args->agbno % args->alignment == 0);
3337 XFS_AG_CHECK_DADDR(mp, XFS_FSB_TO_DADDR(mp, args->fsbno),
3342 xfs_perag_put(args->pag);
3345 xfs_perag_put(args->pag);
3349 /* Ensure that the freelist is at full capacity. */
/*
 * Wrapper for extent-free callers: builds a minimal xfs_alloc_arg and
 * calls xfs_alloc_fix_freelist() with FREEING set, returning the locked
 * AGF buffer via *agbp.
 */
3351 xfs_free_extent_fix_freelist(
3352 struct xfs_trans *tp,
3353 struct xfs_perag *pag,
3354 struct xfs_buf **agbp)
3356 struct xfs_alloc_arg args;
3359 memset(&args, 0, sizeof(struct xfs_alloc_arg));
3361 args.mp = tp->t_mountp;
3362 args.agno = pag->pag_agno;
3366 * validate that the block number is legal - the enables us to detect
3367 * and handle a silent filesystem corruption rather than crashing.
3369 if (args.agno >= args.mp->m_sb.sb_agcount)
3370 return -EFSCORRUPTED;
3372 error = xfs_alloc_fix_freelist(&args, XFS_ALLOC_FLAG_FREEING);
3382 * Just break up the extent address and hand off to xfs_free_ag_extent
3383 * after fixing up the freelist.
/*
 * @skip_discard (visible below via busy_flags) suppresses online discard
 * for the freed range.  The extent is also inserted into the busy list so
 * it cannot be reallocated until the transaction commits.
 */
3387 struct xfs_trans *tp,
3390 const struct xfs_owner_info *oinfo,
3391 enum xfs_ag_resv_type type,
3394 struct xfs_mount *mp = tp->t_mountp;
3395 struct xfs_buf *agbp;
3396 xfs_agnumber_t agno = XFS_FSB_TO_AGNO(mp, bno);
3397 xfs_agblock_t agbno = XFS_FSB_TO_AGBNO(mp, bno);
3398 struct xfs_agf *agf;
3400 unsigned int busy_flags = 0;
3401 struct xfs_perag *pag;
3404 ASSERT(type != XFS_AG_RESV_AGFL);
/* Error-injection hook for exercising extent-free failure paths. */
3406 if (XFS_TEST_ERROR(false, mp,
3407 XFS_ERRTAG_FREE_EXTENT))
3410 pag = xfs_perag_get(mp, agno);
3411 error = xfs_free_extent_fix_freelist(tp, pag, &agbp);
/*
 * NOTE(review): the "agf = agbp->b_addr;" assignment is elided in this
 * view; agf is used below against the locked AGF buffer.
 */
3416 if (XFS_IS_CORRUPT(mp, agbno >= mp->m_sb.sb_agblocks)) {
3417 error = -EFSCORRUPTED;
3421 /* validate the extent size is legal now we have the agf locked */
3422 if (XFS_IS_CORRUPT(mp, agbno + len > be32_to_cpu(agf->agf_length))) {
3423 error = -EFSCORRUPTED;
3427 error = xfs_free_ag_extent(tp, agbp, agno, agbno, len, oinfo, type);
3432 busy_flags |= XFS_EXTENT_BUSY_SKIP_DISCARD;
3433 xfs_extent_busy_insert(tp, pag, agbno, len, busy_flags);
3438 xfs_trans_brelse(tp, agbp);
/* Carries the caller's callback and opaque context through btree query. */
struct xfs_alloc_query_range_info {
3445 xfs_alloc_query_range_fn fn; /* per-record callback */
3449 /* Format btree record and pass to our callback. */
/*
 * Adapter between the generic btree query machinery and the alloc-specific
 * callback: converts the ondisk big-endian record into an incore
 * xfs_alloc_rec_incore before invoking the user's fn.
 */
3451 xfs_alloc_query_range_helper(
3452 struct xfs_btree_cur *cur,
3453 const union xfs_btree_rec *rec,
3456 struct xfs_alloc_query_range_info *query = priv;
3457 struct xfs_alloc_rec_incore irec;
3459 irec.ar_startblock = be32_to_cpu(rec->alloc.ar_startblock);
3460 irec.ar_blockcount = be32_to_cpu(rec->alloc.ar_blockcount);
3461 return query->fn(cur, &irec, query->priv);
3464 /* Find all free space within a given range of blocks. */
/*
 * Runs @fn for every bnobt record overlapping [low_rec, high_rec].
 * Only valid on a by-block (XFS_BTNUM_BNO) cursor.
 */
3466 xfs_alloc_query_range(
3467 struct xfs_btree_cur *cur,
3468 const struct xfs_alloc_rec_incore *low_rec,
3469 const struct xfs_alloc_rec_incore *high_rec,
3470 xfs_alloc_query_range_fn fn,
3473 union xfs_btree_irec low_brec;
3474 union xfs_btree_irec high_brec;
3475 struct xfs_alloc_query_range_info query;
3477 ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
3478 low_brec.a = *low_rec;
3479 high_brec.a = *high_rec;
/* (query.priv/query.fn assignments elided in this view.) */
3482 return xfs_btree_query_range(cur, &low_brec, &high_brec,
3483 xfs_alloc_query_range_helper, &query);
3486 /* Find all free space records. */
/* Like xfs_alloc_query_range() but walks every record in the bnobt. */
3488 xfs_alloc_query_all(
3489 struct xfs_btree_cur *cur,
3490 xfs_alloc_query_range_fn fn,
3493 struct xfs_alloc_query_range_info query;
3495 ASSERT(cur->bc_btnum == XFS_BTNUM_BNO);
3498 return xfs_btree_query_all(cur, xfs_alloc_query_range_helper, &query);
3501 /* Is there a record covering a given extent? */
/*
 * Sets *exists if any free-space record overlaps [bno, bno + len).
 * The high key is memset to 0xFF so unset fields act as +infinity.
 */
3503 xfs_alloc_has_record(
3504 struct xfs_btree_cur *cur,
3509 union xfs_btree_irec low;
3510 union xfs_btree_irec high;
3512 memset(&low, 0, sizeof(low));
3513 low.a.ar_startblock = bno;
3514 memset(&high, 0xFF, sizeof(high));
3515 high.a.ar_startblock = bno + len - 1;
3517 return xfs_btree_has_record(cur, &low, &high, exists);
3521 * Walk all the blocks in the AGFL. The @walk_fn can return any negative
3522 * error code or XFS_ITER_*.
/*
 * Visits AGFL entries in order from agf_flfirst to agf_fllast, wrapping
 * at xfs_agfl_size() since the AGFL is a circular buffer.
 */
3526 struct xfs_mount *mp,
3527 struct xfs_agf *agf,
3528 struct xfs_buf *agflbp,
3529 xfs_agfl_walk_fn walk_fn,
3536 agfl_bno = xfs_buf_to_agfl_bno(agflbp);
3537 i = be32_to_cpu(agf->agf_flfirst);
3539 /* Nothing to walk in an empty AGFL. */
3540 if (agf->agf_flcount == cpu_to_be32(0))
3543 /* Otherwise, walk from first to last, wrapping as needed. */
3545 error = walk_fn(mp, be32_to_cpu(agfl_bno[i]), priv);
3548 if (i == be32_to_cpu(agf->agf_fllast))
3550 if (++i == xfs_agfl_size(mp))
/*
 * Create the slab cache for extent-free intent items; returns -ENOMEM on
 * allocation failure so module init can bail out.
 */
3558 xfs_extfree_intent_init_cache(void)
3560 xfs_extfree_item_cache = kmem_cache_create("xfs_extfree_intent",
3561 sizeof(struct xfs_extent_free_item),
3564 return xfs_extfree_item_cache != NULL ? 0 : -ENOMEM;
/* Tear down the extent-free intent cache; NULL the pointer to avoid reuse. */
3568 xfs_extfree_intent_destroy_cache(void)
3570 kmem_cache_destroy(xfs_extfree_item_cache);
3571 xfs_extfree_item_cache = NULL;