fs/xfs/libxfs/xfs_rmap_btree.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (c) 2014 Red Hat, Inc.
   4  * All Rights Reserved.
   5  */
   6 #include "xfs.h"
   7 #include "xfs_fs.h"
   8 #include "xfs_shared.h"
   9 #include "xfs_format.h"
  10 #include "xfs_log_format.h"
  11 #include "xfs_trans_resv.h"
  12 #include "xfs_mount.h"
  13 #include "xfs_trans.h"
  14 #include "xfs_alloc.h"
  15 #include "xfs_btree.h"
  16 #include "xfs_btree_staging.h"
  17 #include "xfs_rmap.h"
  18 #include "xfs_rmap_btree.h"
  19 #include "xfs_health.h"
  20 #include "xfs_trace.h"
  21 #include "xfs_error.h"
  22 #include "xfs_extent_busy.h"
  23 #include "xfs_ag.h"
  24 #include "xfs_ag_resv.h"
  25 #include "xfs_buf_mem.h"
  26 #include "xfs_btree_mem.h"
  27
  28 static struct kmem_cache        *xfs_rmapbt_cur_cache;
  29
  30 /*
  31  * Reverse map btree.
  32  *
  33  * This is a per-ag tree used to track the owner(s) of a given extent. With
  34  * reflink it is possible for there to be multiple owners, which is a departure
  35  * from classic XFS. Owner records for data extents are inserted when the
  36  * extent is mapped and removed when an extent is unmapped.  Owner records for
  37  * all other block types (i.e. metadata) are inserted when an extent is
  38  * allocated and removed when an extent is freed. There can only be one owner
  39  * of a metadata extent, usually an inode or some other metadata structure like
  40  * an AG btree.
  41  *
  42  * The rmap btree is part of the free space management, so blocks for the tree
  43  * are sourced from the agfl. Hence we need transaction reservation support for
  44  * this tree so that the freelist is always large enough. This also impacts on
  45  * the minimum space we need to leave free in the AG.
  46  *
  47  * The tree is ordered by [ag block, owner, offset]. This is a large key size,
  48  * but it is the only way to enforce unique keys when a block can be owned by
  49  * multiple files at any offset. There's no need to order/search by extent
  50  * size for online updating/management of the tree. It is intended that most
  51  * reverse lookups will be to find the owner(s) of a particular block, or to
  52  * try to recover tree and file data from corrupt primary metadata.
  53  */
  54
  55 static struct xfs_btree_cur *
  56 xfs_rmapbt_dup_cursor(
  57         struct xfs_btree_cur    *cur)
  58 {
  59         return xfs_rmapbt_init_cursor(cur->bc_mp, cur->bc_tp,
  60                                 cur->bc_ag.agbp, cur->bc_ag.pag);
  61 }
  62
  63 STATIC void
  64 xfs_rmapbt_set_root(
  65         struct xfs_btree_cur            *cur,
  66         const union xfs_btree_ptr       *ptr,
  67         int                             inc)
  68 {
  69         struct xfs_buf          *agbp = cur->bc_ag.agbp;
  70         struct xfs_agf          *agf = agbp->b_addr;
  71
  72         ASSERT(ptr->s != 0);
  73
  74         agf->agf_rmap_root = ptr->s;
  75         be32_add_cpu(&agf->agf_rmap_level, inc);
  76         cur->bc_ag.pag->pagf_rmap_level += inc;
  77
  78         xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS);
  79 }
  80
  81 STATIC int
  82 xfs_rmapbt_alloc_block(
  83         struct xfs_btree_cur            *cur,
  84         const union xfs_btree_ptr       *start,
  85         union xfs_btree_ptr             *new,
  86         int                             *stat)
  87 {
  88         struct xfs_buf          *agbp = cur->bc_ag.agbp;
  89         struct xfs_agf          *agf = agbp->b_addr;
  90         struct xfs_perag        *pag = cur->bc_ag.pag;
  91         int                     error;
  92         xfs_agblock_t           bno;
  93
  94         /* Allocate the new block from the freelist. If we can't, give up.  */
  95         error = xfs_alloc_get_freelist(pag, cur->bc_tp, cur->bc_ag.agbp,
  96                                        &bno, 1);
  97         if (error)
  98                 return error;
  99         if (bno == NULLAGBLOCK) {
 100                 *stat = 0;
 101                 return 0;
 102         }
 103
 104         xfs_extent_busy_reuse(cur->bc_mp, pag, bno, 1, false);
 105
 106         new->s = cpu_to_be32(bno);
 107         be32_add_cpu(&agf->agf_rmap_blocks, 1);
 108         xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
 109
 110         xfs_ag_resv_rmapbt_alloc(cur->bc_mp, pag->pag_agno);
 111
 112         *stat = 1;
 113         return 0;
 114 }
 115
 116 STATIC int
 117 xfs_rmapbt_free_block(
 118         struct xfs_btree_cur    *cur,
 119         struct xfs_buf          *bp)
 120 {
 121         struct xfs_buf          *agbp = cur->bc_ag.agbp;
 122         struct xfs_agf          *agf = agbp->b_addr;
 123         struct xfs_perag        *pag = cur->bc_ag.pag;
 124         xfs_agblock_t           bno;
 125         int                     error;
 126
 127         bno = xfs_daddr_to_agbno(cur->bc_mp, xfs_buf_daddr(bp));
 128         be32_add_cpu(&agf->agf_rmap_blocks, -1);
 129         xfs_alloc_log_agf(cur->bc_tp, agbp, XFS_AGF_RMAP_BLOCKS);
 130         error = xfs_alloc_put_freelist(pag, cur->bc_tp, agbp, NULL, bno, 1);
 131         if (error)
 132                 return error;
 133
 134         xfs_extent_busy_insert(cur->bc_tp, pag, bno, 1,
 135                               XFS_EXTENT_BUSY_SKIP_DISCARD);
 136
 137         xfs_ag_resv_free_extent(pag, XFS_AG_RESV_RMAPBT, NULL, 1);
 138         return 0;
 139 }
 140
 141 STATIC int
 142 xfs_rmapbt_get_minrecs(
 143         struct xfs_btree_cur    *cur,
 144         int                     level)
 145 {
 146         return cur->bc_mp->m_rmap_mnr[level != 0];
 147 }
 148
 149 STATIC int
 150 xfs_rmapbt_get_maxrecs(
 151         struct xfs_btree_cur    *cur,
 152         int                     level)
 153 {
 154         return cur->bc_mp->m_rmap_mxr[level != 0];
 155 }
 156
 157 /*
 158  * Convert the ondisk record's offset field into the ondisk key's offset field.
 159  * Fork and bmbt are significant parts of the rmap record key, but written
 160  * status is merely a record attribute.
 161  */
 162 static inline __be64 ondisk_rec_offset_to_key(const union xfs_btree_rec *rec)
 163 {
 164         return rec->rmap.rm_offset & ~cpu_to_be64(XFS_RMAP_OFF_UNWRITTEN);
 165 }
 166
 167 STATIC void
 168 xfs_rmapbt_init_key_from_rec(
 169         union xfs_btree_key             *key,
 170         const union xfs_btree_rec       *rec)
 171 {
 172         key->rmap.rm_startblock = rec->rmap.rm_startblock;
 173         key->rmap.rm_owner = rec->rmap.rm_owner;
 174         key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
 175 }
 176
 177 /*
 178  * The high key for a reverse mapping record can be computed by shifting
 179  * the startblock and offset to the highest value that would still map
 180  * to that record.  In practice this means that we add blockcount-1 to
 181  * the startblock for all records, and if the record is for a data/attr
 182  * fork mapping, we add blockcount-1 to the offset too.
 183  */
 184 STATIC void
 185 xfs_rmapbt_init_high_key_from_rec(
 186         union xfs_btree_key             *key,
 187         const union xfs_btree_rec       *rec)
 188 {
 189         uint64_t                        off;
 190         int                             adj;
 191
 192         adj = be32_to_cpu(rec->rmap.rm_blockcount) - 1;
 193
 194         key->rmap.rm_startblock = rec->rmap.rm_startblock;
 195         be32_add_cpu(&key->rmap.rm_startblock, adj);
 196         key->rmap.rm_owner = rec->rmap.rm_owner;
 197         key->rmap.rm_offset = ondisk_rec_offset_to_key(rec);
 198         if (XFS_RMAP_NON_INODE_OWNER(be64_to_cpu(rec->rmap.rm_owner)) ||
 199             XFS_RMAP_IS_BMBT_BLOCK(be64_to_cpu(rec->rmap.rm_offset)))
 200                 return;
 201         off = be64_to_cpu(key->rmap.rm_offset);
 202         off = (XFS_RMAP_OFF(off) + adj) | (off & ~XFS_RMAP_OFF_MASK);
 203         key->rmap.rm_offset = cpu_to_be64(off);
 204 }
 205
 206 STATIC void
 207 xfs_rmapbt_init_rec_from_cur(
 208         struct xfs_btree_cur    *cur,
 209         union xfs_btree_rec     *rec)
 210 {
 211         rec->rmap.rm_startblock = cpu_to_be32(cur->bc_rec.r.rm_startblock);
 212         rec->rmap.rm_blockcount = cpu_to_be32(cur->bc_rec.r.rm_blockcount);
 213         rec->rmap.rm_owner = cpu_to_be64(cur->bc_rec.r.rm_owner);
 214         rec->rmap.rm_offset = cpu_to_be64(
 215                         xfs_rmap_irec_offset_pack(&cur->bc_rec.r));
 216 }
 217
 218 STATIC void
 219 xfs_rmapbt_init_ptr_from_cur(
 220         struct xfs_btree_cur    *cur,
 221         union xfs_btree_ptr     *ptr)
 222 {
 223         struct xfs_agf          *agf = cur->bc_ag.agbp->b_addr;
 224
 225         ASSERT(cur->bc_ag.pag->pag_agno == be32_to_cpu(agf->agf_seqno));
 226
 227         ptr->s = agf->agf_rmap_root;
 228 }
 229
 230 /*
 231  * Mask the appropriate parts of the ondisk key field for a key comparison.
 232  * Fork and bmbt are significant parts of the rmap record key, but written
 233  * status is merely a record attribute.
 234  */
 235 static inline uint64_t offset_keymask(uint64_t offset)
 236 {
 237         return offset & ~XFS_RMAP_OFF_UNWRITTEN;
 238 }
 239
 240 STATIC int64_t
 241 xfs_rmapbt_key_diff(
 242         struct xfs_btree_cur            *cur,
 243         const union xfs_btree_key       *key)
 244 {
 245         struct xfs_rmap_irec            *rec = &cur->bc_rec.r;
 246         const struct xfs_rmap_key       *kp = &key->rmap;
 247         __u64                           x, y;
 248         int64_t                         d;
 249
 250         d = (int64_t)be32_to_cpu(kp->rm_startblock) - rec->rm_startblock;
 251         if (d)
 252                 return d;
 253
 254         x = be64_to_cpu(kp->rm_owner);
 255         y = rec->rm_owner;
 256         if (x > y)
 257                 return 1;
 258         else if (y > x)
 259                 return -1;
 260
 261         x = offset_keymask(be64_to_cpu(kp->rm_offset));
 262         y = offset_keymask(xfs_rmap_irec_offset_pack(rec));
 263         if (x > y)
 264                 return 1;
 265         else if (y > x)
 266                 return -1;
 267         return 0;
 268 }
 269
 270 STATIC int64_t
 271 xfs_rmapbt_diff_two_keys(
 272         struct xfs_btree_cur            *cur,
 273         const union xfs_btree_key       *k1,
 274         const union xfs_btree_key       *k2,
 275         const union xfs_btree_key       *mask)
 276 {
 277         const struct xfs_rmap_key       *kp1 = &k1->rmap;
 278         const struct xfs_rmap_key       *kp2 = &k2->rmap;
 279         int64_t                         d;
 280         __u64                           x, y;
 281
 282         /* Doesn't make sense to mask off the physical space part */
 283         ASSERT(!mask || mask->rmap.rm_startblock);
 284
 285         d = (int64_t)be32_to_cpu(kp1->rm_startblock) -
 286                      be32_to_cpu(kp2->rm_startblock);
 287         if (d)
 288                 return d;
 289
 290         if (!mask || mask->rmap.rm_owner) {
 291                 x = be64_to_cpu(kp1->rm_owner);
 292                 y = be64_to_cpu(kp2->rm_owner);
 293                 if (x > y)
 294                         return 1;
 295                 else if (y > x)
 296                         return -1;
 297         }
 298
 299         if (!mask || mask->rmap.rm_offset) {
 300                 /* Doesn't make sense to allow offset but not owner */
 301                 ASSERT(!mask || mask->rmap.rm_owner);
 302
 303                 x = offset_keymask(be64_to_cpu(kp1->rm_offset));
 304                 y = offset_keymask(be64_to_cpu(kp2->rm_offset));
 305                 if (x > y)
 306                         return 1;
 307                 else if (y > x)
 308                         return -1;
 309         }
 310
 311         return 0;
 312 }
 313
 314 static xfs_failaddr_t
 315 xfs_rmapbt_verify(
 316         struct xfs_buf          *bp)
 317 {
 318         struct xfs_mount        *mp = bp->b_mount;
 319         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
 320         struct xfs_perag        *pag = bp->b_pag;
 321         xfs_failaddr_t          fa;
 322         unsigned int            level;
 323
 324         /*
 325          * magic number and level verification
 326          *
 327          * During growfs operations, we can't verify the exact level or owner as
 328          * the perag is not fully initialised and hence not attached to the
 329          * buffer.  In this case, check against the maximum tree depth.
 330          *
 331          * Similarly, during log recovery we will have a perag structure
 332          * attached, but the agf information will not yet have been initialised
 333          * from the on disk AGF. Again, we can only check against maximum limits
 334          * in this case.
 335          */
 336         if (!xfs_verify_magic(bp, block->bb_magic))
 337                 return __this_address;
 338
 339         if (!xfs_has_rmapbt(mp))
 340                 return __this_address;
 341         fa = xfs_btree_agblock_v5hdr_verify(bp);
 342         if (fa)
 343                 return fa;
 344
 345         level = be16_to_cpu(block->bb_level);
 346         if (pag && xfs_perag_initialised_agf(pag)) {
 347                 unsigned int    maxlevel = pag->pagf_rmap_level;
 348
 349 #ifdef CONFIG_XFS_ONLINE_REPAIR
 350                 /*
 351                  * Online repair could be rewriting the free space btrees, so
 352                  * we'll validate against the larger of either tree while this
 353                  * is going on.
 354                  */
 355                 maxlevel = max_t(unsigned int, maxlevel,
 356                                 pag->pagf_repair_rmap_level);
 357 #endif
 358                 if (level >= maxlevel)
 359                         return __this_address;
 360         } else if (level >= mp->m_rmap_maxlevels)
 361                 return __this_address;
 362
 363         return xfs_btree_agblock_verify(bp, mp->m_rmap_mxr[level != 0]);
 364 }
 365
 366 static void
 367 xfs_rmapbt_read_verify(
 368         struct xfs_buf  *bp)
 369 {
 370         xfs_failaddr_t  fa;
 371
 372         if (!xfs_btree_agblock_verify_crc(bp))
 373                 xfs_verifier_error(bp, -EFSBADCRC, __this_address);
 374         else {
 375                 fa = xfs_rmapbt_verify(bp);
 376                 if (fa)
 377                         xfs_verifier_error(bp, -EFSCORRUPTED, fa);
 378         }
 379
 380         if (bp->b_error)
 381                 trace_xfs_btree_corrupt(bp, _RET_IP_);
 382 }
 383
 384 static void
 385 xfs_rmapbt_write_verify(
 386         struct xfs_buf  *bp)
 387 {
 388         xfs_failaddr_t  fa;
 389
 390         fa = xfs_rmapbt_verify(bp);
 391         if (fa) {
 392                 trace_xfs_btree_corrupt(bp, _RET_IP_);
 393                 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
 394                 return;
 395         }
 396         xfs_btree_agblock_calc_crc(bp);
 397
 398 }
 399
 400 const struct xfs_buf_ops xfs_rmapbt_buf_ops = {
 401         .name                   = "xfs_rmapbt",
 402         .magic                  = { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) },
 403         .verify_read            = xfs_rmapbt_read_verify,
 404         .verify_write           = xfs_rmapbt_write_verify,
 405         .verify_struct          = xfs_rmapbt_verify,
 406 };
 407
 408 STATIC int
 409 xfs_rmapbt_keys_inorder(
 410         struct xfs_btree_cur            *cur,
 411         const union xfs_btree_key       *k1,
 412         const union xfs_btree_key       *k2)
 413 {
 414         uint32_t                x;
 415         uint32_t                y;
 416         uint64_t                a;
 417         uint64_t                b;
 418
 419         x = be32_to_cpu(k1->rmap.rm_startblock);
 420         y = be32_to_cpu(k2->rmap.rm_startblock);
 421         if (x < y)
 422                 return 1;
 423         else if (x > y)
 424                 return 0;
 425         a = be64_to_cpu(k1->rmap.rm_owner);
 426         b = be64_to_cpu(k2->rmap.rm_owner);
 427         if (a < b)
 428                 return 1;
 429         else if (a > b)
 430                 return 0;
 431         a = offset_keymask(be64_to_cpu(k1->rmap.rm_offset));
 432         b = offset_keymask(be64_to_cpu(k2->rmap.rm_offset));
 433         if (a <= b)
 434                 return 1;
 435         return 0;
 436 }
 437
 438 STATIC int
 439 xfs_rmapbt_recs_inorder(
 440         struct xfs_btree_cur            *cur,
 441         const union xfs_btree_rec       *r1,
 442         const union xfs_btree_rec       *r2)
 443 {
 444         uint32_t                x;
 445         uint32_t                y;
 446         uint64_t                a;
 447         uint64_t                b;
 448
 449         x = be32_to_cpu(r1->rmap.rm_startblock);
 450         y = be32_to_cpu(r2->rmap.rm_startblock);
 451         if (x < y)
 452                 return 1;
 453         else if (x > y)
 454                 return 0;
 455         a = be64_to_cpu(r1->rmap.rm_owner);
 456         b = be64_to_cpu(r2->rmap.rm_owner);
 457         if (a < b)
 458                 return 1;
 459         else if (a > b)
 460                 return 0;
 461         a = offset_keymask(be64_to_cpu(r1->rmap.rm_offset));
 462         b = offset_keymask(be64_to_cpu(r2->rmap.rm_offset));
 463         if (a <= b)
 464                 return 1;
 465         return 0;
 466 }
 467
 468 STATIC enum xbtree_key_contig
 469 xfs_rmapbt_keys_contiguous(
 470         struct xfs_btree_cur            *cur,
 471         const union xfs_btree_key       *key1,
 472         const union xfs_btree_key       *key2,
 473         const union xfs_btree_key       *mask)
 474 {
 475         ASSERT(!mask || mask->rmap.rm_startblock);
 476
 477         /*
 478          * We only support checking contiguity of the physical space component.
 479          * If any callers ever need more specificity than that, they'll have to
 480          * implement it here.
 481          */
 482         ASSERT(!mask || (!mask->rmap.rm_owner && !mask->rmap.rm_offset));
 483
 484         return xbtree_key_contig(be32_to_cpu(key1->rmap.rm_startblock),
 485                                  be32_to_cpu(key2->rmap.rm_startblock));
 486 }
 487
 488 const struct xfs_btree_ops xfs_rmapbt_ops = {
 489         .name                   = "rmap",
 490         .type                   = XFS_BTREE_TYPE_AG,
 491         .geom_flags             = XFS_BTGEO_OVERLAPPING,
 492
 493         .rec_len                = sizeof(struct xfs_rmap_rec),
 494         /* Overlapping btree; 2 keys per pointer. */
 495         .key_len                = 2 * sizeof(struct xfs_rmap_key),
 496         .ptr_len                = XFS_BTREE_SHORT_PTR_LEN,
 497
 498         .lru_refs               = XFS_RMAP_BTREE_REF,
 499         .statoff                = XFS_STATS_CALC_INDEX(xs_rmap_2),
 500         .sick_mask              = XFS_SICK_AG_RMAPBT,
 501
 502         .dup_cursor             = xfs_rmapbt_dup_cursor,
 503         .set_root               = xfs_rmapbt_set_root,
 504         .alloc_block            = xfs_rmapbt_alloc_block,
 505         .free_block             = xfs_rmapbt_free_block,
 506         .get_minrecs            = xfs_rmapbt_get_minrecs,
 507         .get_maxrecs            = xfs_rmapbt_get_maxrecs,
 508         .init_key_from_rec      = xfs_rmapbt_init_key_from_rec,
 509         .init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec,
 510         .init_rec_from_cur      = xfs_rmapbt_init_rec_from_cur,
 511         .init_ptr_from_cur      = xfs_rmapbt_init_ptr_from_cur,
 512         .key_diff               = xfs_rmapbt_key_diff,
 513         .buf_ops                = &xfs_rmapbt_buf_ops,
 514         .diff_two_keys          = xfs_rmapbt_diff_two_keys,
 515         .keys_inorder           = xfs_rmapbt_keys_inorder,
 516         .recs_inorder           = xfs_rmapbt_recs_inorder,
 517         .keys_contiguous        = xfs_rmapbt_keys_contiguous,
 518 };
 519
 520 /*
 521  * Create a new reverse mapping btree cursor.
 522  *
 523  * For staging cursors tp and agbp are NULL.
 524  */
 525 struct xfs_btree_cur *
 526 xfs_rmapbt_init_cursor(
 527         struct xfs_mount        *mp,
 528         struct xfs_trans        *tp,
 529         struct xfs_buf          *agbp,
 530         struct xfs_perag        *pag)
 531 {
 532         struct xfs_btree_cur    *cur;
 533
 534         cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rmapbt_ops,
 535                         mp->m_rmap_maxlevels, xfs_rmapbt_cur_cache);
 536         cur->bc_ag.pag = xfs_perag_hold(pag);
 537         cur->bc_ag.agbp = agbp;
 538         if (agbp) {
 539                 struct xfs_agf          *agf = agbp->b_addr;
 540
 541                 cur->bc_nlevels = be32_to_cpu(agf->agf_rmap_level);
 542         }
 543         return cur;
 544 }
 545
 546 #ifdef CONFIG_XFS_BTREE_IN_MEM
 547 static inline unsigned int
 548 xfs_rmapbt_mem_block_maxrecs(
 549         unsigned int            blocklen,
 550         bool                    leaf)
 551 {
 552         if (leaf)
 553                 return blocklen / sizeof(struct xfs_rmap_rec);
 554         return blocklen /
 555                 (2 * sizeof(struct xfs_rmap_key) + sizeof(__be64));
 556 }
 557
 558 /*
 559  * Validate an in-memory rmap btree block.  Callers are allowed to generate an
 560  * in-memory btree even if the ondisk feature is not enabled.
 561  */
 562 static xfs_failaddr_t
 563 xfs_rmapbt_mem_verify(
 564         struct xfs_buf          *bp)
 565 {
 566         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
 567         xfs_failaddr_t          fa;
 568         unsigned int            level;
 569         unsigned int            maxrecs;
 570
 571         if (!xfs_verify_magic(bp, block->bb_magic))
 572                 return __this_address;
 573
 574         fa = xfs_btree_fsblock_v5hdr_verify(bp, XFS_RMAP_OWN_UNKNOWN);
 575         if (fa)
 576                 return fa;
 577
 578         level = be16_to_cpu(block->bb_level);
 579         if (level >= xfs_rmapbt_maxlevels_ondisk())
 580                 return __this_address;
 581
 582         maxrecs = xfs_rmapbt_mem_block_maxrecs(
 583                         XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN, level == 0);
 584         return xfs_btree_memblock_verify(bp, maxrecs);
 585 }
 586
 587 static void
 588 xfs_rmapbt_mem_rw_verify(
 589         struct xfs_buf  *bp)
 590 {
 591         xfs_failaddr_t  fa = xfs_rmapbt_mem_verify(bp);
 592
 593         if (fa)
 594                 xfs_verifier_error(bp, -EFSCORRUPTED, fa);
 595 }
 596
 597 /* skip crc checks on in-memory btrees to save time */
 598 static const struct xfs_buf_ops xfs_rmapbt_mem_buf_ops = {
 599         .name                   = "xfs_rmapbt_mem",
 600         .magic                  = { 0, cpu_to_be32(XFS_RMAP_CRC_MAGIC) },
 601         .verify_read            = xfs_rmapbt_mem_rw_verify,
 602         .verify_write           = xfs_rmapbt_mem_rw_verify,
 603         .verify_struct          = xfs_rmapbt_mem_verify,
 604 };
 605
 606 const struct xfs_btree_ops xfs_rmapbt_mem_ops = {
 607         .name                   = "mem_rmap",
 608         .type                   = XFS_BTREE_TYPE_MEM,
 609         .geom_flags             = XFS_BTGEO_OVERLAPPING,
 610
 611         .rec_len                = sizeof(struct xfs_rmap_rec),
 612         /* Overlapping btree; 2 keys per pointer. */
 613         .key_len                = 2 * sizeof(struct xfs_rmap_key),
 614         .ptr_len                = XFS_BTREE_LONG_PTR_LEN,
 615
 616         .lru_refs               = XFS_RMAP_BTREE_REF,
 617         .statoff                = XFS_STATS_CALC_INDEX(xs_rmap_mem_2),
 618
 619         .dup_cursor             = xfbtree_dup_cursor,
 620         .set_root               = xfbtree_set_root,
 621         .alloc_block            = xfbtree_alloc_block,
 622         .free_block             = xfbtree_free_block,
 623         .get_minrecs            = xfbtree_get_minrecs,
 624         .get_maxrecs            = xfbtree_get_maxrecs,
 625         .init_key_from_rec      = xfs_rmapbt_init_key_from_rec,
 626         .init_high_key_from_rec = xfs_rmapbt_init_high_key_from_rec,
 627         .init_rec_from_cur      = xfs_rmapbt_init_rec_from_cur,
 628         .init_ptr_from_cur      = xfbtree_init_ptr_from_cur,
 629         .key_diff               = xfs_rmapbt_key_diff,
 630         .buf_ops                = &xfs_rmapbt_mem_buf_ops,
 631         .diff_two_keys          = xfs_rmapbt_diff_two_keys,
 632         .keys_inorder           = xfs_rmapbt_keys_inorder,
 633         .recs_inorder           = xfs_rmapbt_recs_inorder,
 634         .keys_contiguous        = xfs_rmapbt_keys_contiguous,
 635 };
 636
 637 /* Create a cursor for an in-memory btree. */
 638 struct xfs_btree_cur *
 639 xfs_rmapbt_mem_cursor(
 640         struct xfs_perag        *pag,
 641         struct xfs_trans        *tp,
 642         struct xfbtree          *xfbt)
 643 {
 644         struct xfs_btree_cur    *cur;
 645         struct xfs_mount        *mp = pag->pag_mount;
 646
 647         cur = xfs_btree_alloc_cursor(mp, tp, &xfs_rmapbt_mem_ops,
 648                         xfs_rmapbt_maxlevels_ondisk(), xfs_rmapbt_cur_cache);
 649         cur->bc_mem.xfbtree = xfbt;
 650         cur->bc_nlevels = xfbt->nlevels;
 651
 652         cur->bc_mem.pag = xfs_perag_hold(pag);
 653         return cur;
 654 }
 655
 656 /* Create an in-memory rmap btree. */
 657 int
 658 xfs_rmapbt_mem_init(
 659         struct xfs_mount        *mp,
 660         struct xfbtree          *xfbt,
 661         struct xfs_buftarg      *btp,
 662         xfs_agnumber_t          agno)
 663 {
 664         xfbt->owner = agno;
 665         return xfbtree_init(mp, xfbt, btp, &xfs_rmapbt_mem_ops);
 666 }
 667
 668 /* Compute the max possible height for reverse mapping btrees in memory. */
 669 static unsigned int
 670 xfs_rmapbt_mem_maxlevels(void)
 671 {
 672         unsigned int            minrecs[2];
 673         unsigned int            blocklen;
 674
 675         blocklen = XFBNO_BLOCKSIZE - XFS_BTREE_LBLOCK_CRC_LEN;
 676
 677         minrecs[0] = xfs_rmapbt_mem_block_maxrecs(blocklen, true) / 2;
 678         minrecs[1] = xfs_rmapbt_mem_block_maxrecs(blocklen, false) / 2;
 679
 680         /*
 681          * How tall can an in-memory rmap btree become if we filled the entire
 682          * AG with rmap records?
 683          */
 684         return xfs_btree_compute_maxlevels(minrecs,
 685                         XFS_MAX_AG_BYTES / sizeof(struct xfs_rmap_rec));
 686 }
 687 #else
 688 # define xfs_rmapbt_mem_maxlevels()     (0)
 689 #endif /* CONFIG_XFS_BTREE_IN_MEM */
 690
 691 /*
 692  * Install a new reverse mapping btree root.  Caller is responsible for
 693  * invalidating and freeing the old btree blocks.
 694  */
 695 void
 696 xfs_rmapbt_commit_staged_btree(
 697         struct xfs_btree_cur    *cur,
 698         struct xfs_trans        *tp,
 699         struct xfs_buf          *agbp)
 700 {
 701         struct xfs_agf          *agf = agbp->b_addr;
 702         struct xbtree_afakeroot *afake = cur->bc_ag.afake;
 703
 704         ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
 705
 706         agf->agf_rmap_root = cpu_to_be32(afake->af_root);
 707         agf->agf_rmap_level = cpu_to_be32(afake->af_levels);
 708         agf->agf_rmap_blocks = cpu_to_be32(afake->af_blocks);
 709         xfs_alloc_log_agf(tp, agbp, XFS_AGF_ROOTS | XFS_AGF_LEVELS |
 710                                     XFS_AGF_RMAP_BLOCKS);
 711         xfs_btree_commit_afakeroot(cur, tp, agbp);
 712 }
 713
 714 /* Calculate number of records in a reverse mapping btree block. */
 715 static inline unsigned int
 716 xfs_rmapbt_block_maxrecs(
 717         unsigned int            blocklen,
 718         bool                    leaf)
 719 {
 720         if (leaf)
 721                 return blocklen / sizeof(struct xfs_rmap_rec);
 722         return blocklen /
 723                 (2 * sizeof(struct xfs_rmap_key) + sizeof(xfs_rmap_ptr_t));
 724 }
 725
 726 /*
 727  * Calculate number of records in an rmap btree block.
 728  */
 729 int
 730 xfs_rmapbt_maxrecs(
 731         int                     blocklen,
 732         int                     leaf)
 733 {
 734         blocklen -= XFS_RMAP_BLOCK_LEN;
 735         return xfs_rmapbt_block_maxrecs(blocklen, leaf);
 736 }
 737
 738 /* Compute the max possible height for reverse mapping btrees. */
 739 unsigned int
 740 xfs_rmapbt_maxlevels_ondisk(void)
 741 {
 742         unsigned int            minrecs[2];
 743         unsigned int            blocklen;
 744
 745         blocklen = XFS_MIN_CRC_BLOCKSIZE - XFS_BTREE_SBLOCK_CRC_LEN;
 746
 747         minrecs[0] = xfs_rmapbt_block_maxrecs(blocklen, true) / 2;
 748         minrecs[1] = xfs_rmapbt_block_maxrecs(blocklen, false) / 2;
 749
 750         /*
 751          * Compute the asymptotic maxlevels for an rmapbt on any reflink fs.
 752          *
 753          * On a reflink filesystem, each AG block can have up to 2^32 (per the
 754          * refcount record format) owners, which means that theoretically we
 755          * could face up to 2^64 rmap records.  However, we're likely to run
 756          * out of blocks in the AG long before that happens, which means that
 757          * we must compute the max height based on what the btree will look
 758          * like if it consumes almost all the blocks in the AG due to maximal
 759          * sharing factor.
 760          */
 761         return max(xfs_btree_space_to_height(minrecs, XFS_MAX_CRC_AG_BLOCKS),
 762                    xfs_rmapbt_mem_maxlevels());
 763 }
 764
 765 /* Compute the maximum height of an rmap btree. */
 766 void
 767 xfs_rmapbt_compute_maxlevels(
 768         struct xfs_mount                *mp)
 769 {
 770         if (!xfs_has_rmapbt(mp)) {
 771                 mp->m_rmap_maxlevels = 0;
 772                 return;
 773         }
 774
 775         if (xfs_has_reflink(mp)) {
 776                 /*
 777                  * Compute the asymptotic maxlevels for an rmap btree on a
 778                  * filesystem that supports reflink.
 779                  *
 780                  * On a reflink filesystem, each AG block can have up to 2^32
 781                  * (per the refcount record format) owners, which means that
 782                  * theoretically we could face up to 2^64 rmap records.
 783                  * However, we're likely to run out of blocks in the AG long
 784                  * before that happens, which means that we must compute the
 785                  * max height based on what the btree will look like if it
 786                  * consumes almost all the blocks in the AG due to maximal
 787                  * sharing factor.
 788                  */
 789                 mp->m_rmap_maxlevels = xfs_btree_space_to_height(mp->m_rmap_mnr,
 790                                 mp->m_sb.sb_agblocks);
 791         } else {
 792                 /*
 793                  * If there's no block sharing, compute the maximum rmapbt
 794                  * height assuming one rmap record per AG block.
 795                  */
 796                 mp->m_rmap_maxlevels = xfs_btree_compute_maxlevels(
 797                                 mp->m_rmap_mnr, mp->m_sb.sb_agblocks);
 798         }
 799         ASSERT(mp->m_rmap_maxlevels <= xfs_rmapbt_maxlevels_ondisk());
 800 }
 801
 802 /* Calculate the refcount btree size for some records. */
 803 xfs_extlen_t
 804 xfs_rmapbt_calc_size(
 805         struct xfs_mount        *mp,
 806         unsigned long long      len)
 807 {
 808         return xfs_btree_calc_size(mp->m_rmap_mnr, len);
 809 }
 810
 811 /*
 812  * Calculate the maximum refcount btree size.
 813  */
 814 xfs_extlen_t
 815 xfs_rmapbt_max_size(
 816         struct xfs_mount        *mp,
 817         xfs_agblock_t           agblocks)
 818 {
 819         /* Bail out if we're uninitialized, which can happen in mkfs. */
 820         if (mp->m_rmap_mxr[0] == 0)
 821                 return 0;
 822
 823         return xfs_rmapbt_calc_size(mp, agblocks);
 824 }
 825
 826 /*
 827  * Figure out how many blocks to reserve and how many are used by this btree.
 828  */
 829 int
 830 xfs_rmapbt_calc_reserves(
 831         struct xfs_mount        *mp,
 832         struct xfs_trans        *tp,
 833         struct xfs_perag        *pag,
 834         xfs_extlen_t            *ask,
 835         xfs_extlen_t            *used)
 836 {
 837         struct xfs_buf          *agbp;
 838         struct xfs_agf          *agf;
 839         xfs_agblock_t           agblocks;
 840         xfs_extlen_t            tree_len;
 841         int                     error;
 842
 843         if (!xfs_has_rmapbt(mp))
 844                 return 0;
 845
 846         error = xfs_alloc_read_agf(pag, tp, 0, &agbp);
 847         if (error)
 848                 return error;
 849
 850         agf = agbp->b_addr;
 851         agblocks = be32_to_cpu(agf->agf_length);
 852         tree_len = be32_to_cpu(agf->agf_rmap_blocks);
 853         xfs_trans_brelse(tp, agbp);
 854
 855         /*
 856          * The log is permanently allocated, so the space it occupies will
 857          * never be available for the kinds of things that would require btree
 858          * expansion.  We therefore can pretend the space isn't there.
 859          */
 860         if (xfs_ag_contains_log(mp, pag->pag_agno))
 861                 agblocks -= mp->m_sb.sb_logblocks;
 862
 863         /* Reserve 1% of the AG or enough for 1 block per record. */
 864         *ask += max(agblocks / 100, xfs_rmapbt_max_size(mp, agblocks));
 865         *used += tree_len;
 866
 867         return error;
 868 }
 869
 870 int __init
 871 xfs_rmapbt_init_cur_cache(void)
 872 {
 873         xfs_rmapbt_cur_cache = kmem_cache_create("xfs_rmapbt_cur",
 874                         xfs_btree_cur_sizeof(xfs_rmapbt_maxlevels_ondisk()),
 875                         0, 0, NULL);
 876
 877         if (!xfs_rmapbt_cur_cache)
 878                 return -ENOMEM;
 879         return 0;
 880 }
 881
 882 void
 883 xfs_rmapbt_destroy_cur_cache(void)
 884 {
 885         kmem_cache_destroy(xfs_rmapbt_cur_cache);
 886         xfs_rmapbt_cur_cache = NULL;
 887 }