fs/xfs/libxfs/xfs_btree_staging.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * Copyright (C) 2020 Oracle.  All Rights Reserved.
   4  * Author: Darrick J. Wong <darrick.wong@oracle.com>
   5  */
   6 #include "xfs.h"
   7 #include "xfs_fs.h"
   8 #include "xfs_shared.h"
   9 #include "xfs_format.h"
  10 #include "xfs_log_format.h"
  11 #include "xfs_trans_resv.h"
  12 #include "xfs_bit.h"
  13 #include "xfs_mount.h"
  14 #include "xfs_inode.h"
  15 #include "xfs_trans.h"
  16 #include "xfs_btree.h"
  17 #include "xfs_trace.h"
  18 #include "xfs_btree_staging.h"
  19
  20 /*
  21  * Staging Cursors and Fake Roots for Btrees
  22  * =========================================
  23  *
  24  * A staging btree cursor is a special type of btree cursor that callers must
  25  * use to construct a new btree index using the btree bulk loader code.  The
  26  * bulk loading code uses the staging btree cursor to abstract the details of
  27  * initializing new btree blocks and filling them with records or key/ptr
  28  * pairs.  Regular btree operations (e.g. queries and modifications) are not
  29  * supported with staging cursors, and callers must not invoke them.
  30  *
  31  * Fake root structures contain all the information about a btree that is under
  32  * construction by the bulk loading code.  Staging btree cursors point to fake
  33  * root structures instead of the usual AG header or inode structure.
  34  *
  35  * Callers are expected to initialize a fake root structure and pass it into
  36  * the _stage_cursor function for a specific btree type.  When bulk loading is
  37  * complete, callers should call the _commit_staged_btree function for that
  38  * specific btree type to commit the new btree into the filesystem.
  39  */
  40
  41 /*
  42  * Bulk Loading for AG Btrees
  43  * ==========================
  44  *
  45  * For a btree rooted in an AG header, pass a xbtree_afakeroot structure to the
  46  * staging cursor.  Callers should initialize this to zero.
  47  *
  48  * The _stage_cursor() function for a specific btree type should call
  49  * xfs_btree_stage_afakeroot to set up the in-memory cursor as a staging
  50  * cursor.  The corresponding _commit_staged_btree() function should log the
  51  * new root and call xfs_btree_commit_afakeroot() to transform the staging
  52  * cursor into a regular btree cursor.
  53  */
  54
  55 /*
  56  * Initialize a AG-rooted btree cursor with the given AG btree fake root.
  57  */
  58 void
  59 xfs_btree_stage_afakeroot(
  60         struct xfs_btree_cur            *cur,
  61         struct xbtree_afakeroot         *afake)
  62 {
  63         ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
  64         ASSERT(cur->bc_ops->type != XFS_BTREE_TYPE_INODE);
  65         ASSERT(cur->bc_tp == NULL);
  66
  67         cur->bc_ag.afake = afake;
  68         cur->bc_nlevels = afake->af_levels;
  69         cur->bc_flags |= XFS_BTREE_STAGING;
  70 }
  71
  72 /*
  73  * Transform an AG-rooted staging btree cursor back into a regular cursor by
  74  * substituting a real btree root for the fake one and restoring normal btree
  75  * cursor ops.  The caller must log the btree root change prior to calling
  76  * this.
  77  */
  78 void
  79 xfs_btree_commit_afakeroot(
  80         struct xfs_btree_cur            *cur,
  81         struct xfs_trans                *tp,
  82         struct xfs_buf                  *agbp)
  83 {
  84         ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
  85         ASSERT(cur->bc_tp == NULL);
  86
  87         trace_xfs_btree_commit_afakeroot(cur);
  88
  89         cur->bc_ag.afake = NULL;
  90         cur->bc_ag.agbp = agbp;
  91         cur->bc_flags &= ~XFS_BTREE_STAGING;
  92         cur->bc_tp = tp;
  93 }
  94
  95 /*
  96  * Bulk Loading for Inode-Rooted Btrees
  97  * ====================================
  98  *
  99  * For a btree rooted in an inode fork, pass a xbtree_ifakeroot structure to
 100  * the staging cursor.  This structure should be initialized as follows:
 101  *
 102  * - if_fork_size field should be set to the number of bytes available to the
 103  *   fork in the inode.
 104  *
 105  * - if_fork should point to a freshly allocated struct xfs_ifork.
 106  *
 107  * - if_format should be set to the appropriate fork type (e.g.
 108  *   XFS_DINODE_FMT_BTREE).
 109  *
 110  * All other fields must be zero.
 111  *
 112  * The _stage_cursor() function for a specific btree type should call
 113  * xfs_btree_stage_ifakeroot to set up the in-memory cursor as a staging
 114  * cursor.  The corresponding _commit_staged_btree() function should log the
 115  * new root and call xfs_btree_commit_ifakeroot() to transform the staging
 116  * cursor into a regular btree cursor.
 117  */
 118
 119 /*
 120  * Initialize an inode-rooted btree cursor with the given inode btree fake
 121  * root.  The btree cursor's bc_ops will be overridden as needed to make the
 122  * staging functionality work.  If new_ops is not NULL, these new ops will be
 123  * passed out to the caller for further overriding.
 124  */
 125 void
 126 xfs_btree_stage_ifakeroot(
 127         struct xfs_btree_cur            *cur,
 128         struct xbtree_ifakeroot         *ifake)
 129 {
 130         ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
 131         ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
 132         ASSERT(cur->bc_tp == NULL);
 133
 134         cur->bc_ino.ifake = ifake;
 135         cur->bc_nlevels = ifake->if_levels;
 136         cur->bc_ino.forksize = ifake->if_fork_size;
 137         cur->bc_flags |= XFS_BTREE_STAGING;
 138 }
 139
 140 /*
 141  * Transform an inode-rooted staging btree cursor back into a regular cursor by
 142  * substituting a real btree root for the fake one and restoring normal btree
 143  * cursor ops.  The caller must log the btree root change prior to calling
 144  * this.
 145  */
 146 void
 147 xfs_btree_commit_ifakeroot(
 148         struct xfs_btree_cur            *cur,
 149         struct xfs_trans                *tp,
 150         int                             whichfork)
 151 {
 152         ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
 153         ASSERT(cur->bc_tp == NULL);
 154
 155         trace_xfs_btree_commit_ifakeroot(cur);
 156
 157         cur->bc_ino.ifake = NULL;
 158         cur->bc_ino.whichfork = whichfork;
 159         cur->bc_flags &= ~XFS_BTREE_STAGING;
 160         cur->bc_tp = tp;
 161 }
 162
 163 /*
 164  * Bulk Loading of Staged Btrees
 165  * =============================
 166  *
 167  * This interface is used with a staged btree cursor to create a totally new
 168  * btree with a large number of records (i.e. more than what would fit in a
 169  * single root block).  When the creation is complete, the new root can be
 170  * linked atomically into the filesystem by committing the staged cursor.
 171  *
 172  * Creation of a new btree proceeds roughly as follows:
 173  *
 174  * The first step is to initialize an appropriate fake btree root structure and
 175  * then construct a staged btree cursor.  Refer to the block comments about
 176  * "Bulk Loading for AG Btrees" and "Bulk Loading for Inode-Rooted Btrees" for
 177  * more information about how to do this.
 178  *
 179  * The second step is to initialize a struct xfs_btree_bload context as
 180  * documented in the structure definition.
 181  *
 182  * The third step is to call xfs_btree_bload_compute_geometry to compute the
 183  * height of and the number of blocks needed to construct the btree.  See the
 184  * section "Computing the Geometry of the New Btree" for details about this
 185  * computation.
 186  *
 187  * In step four, the caller must allocate xfs_btree_bload.nr_blocks blocks and
 188  * save them for later use by ->claim_block().  Bulk loading requires all
 189  * blocks to be allocated beforehand to avoid ENOSPC failures midway through a
 190  * rebuild, and to minimize seek distances of the new btree.
 191  *
 192  * Step five is to call xfs_btree_bload() to start constructing the btree.
 193  *
 194  * The final step is to commit the staging btree cursor, which logs the new
 195  * btree root and turns the staging cursor into a regular cursor.  The caller
 196  * is responsible for cleaning up the previous btree blocks, if any.
 197  *
 198  * Computing the Geometry of the New Btree
 199  * =======================================
 200  *
 201  * The number of items placed in each btree block is computed via the following
 202  * algorithm: For leaf levels, the number of items for the level is nr_records
 203  * in the bload structure.  For node levels, the number of items for the level
 204  * is the number of blocks in the next lower level of the tree.  For each
 205  * level, the desired number of items per block is defined as:
 206  *
 207  * desired = max(minrecs, maxrecs - slack factor)
 208  *
 209  * The number of blocks for the level is defined to be:
 210  *
 211  * blocks = floor(nr_items / desired)
 212  *
 213  * Note this is rounded down so that the npb calculation below will never fall
 214  * below minrecs.  The number of items that will actually be loaded into each
 215  * btree block is defined as:
 216  *
 217  * npb =  nr_items / blocks
 218  *
 219  * Some of the leftmost blocks in the level will contain one extra record as
 220  * needed to handle uneven division.  If the number of records in any block
 221  * would exceed maxrecs for that level, blocks is incremented and npb is
 222  * recalculated.
 223  *
 224  * In other words, we compute the number of blocks needed to satisfy a given
 225  * loading level, then spread the items as evenly as possible.
 226  *
 227  * The height and number of fs blocks required to create the btree are computed
 228  * and returned via btree_height and nr_blocks.
 229  */
 230
 231 /*
 232  * Put a btree block that we're loading onto the ordered list and release it.
 233  * The btree blocks will be written to disk when bulk loading is finished.
 234  * If we reach the dirty buffer threshold, flush them to disk before
 235  * continuing.
 236  */
 237 static int
 238 xfs_btree_bload_drop_buf(
 239         struct xfs_btree_bload          *bbl,
 240         struct list_head                *buffers_list,
 241         struct xfs_buf                  **bpp)
 242 {
 243         struct xfs_buf                  *bp = *bpp;
 244         int                             error;
 245
 246         if (!bp)
 247                 return 0;
 248
 249         /*
 250          * Mark this buffer XBF_DONE (i.e. uptodate) so that a subsequent
 251          * xfs_buf_read will not pointlessly reread the contents from the disk.
 252          */
 253         bp->b_flags |= XBF_DONE;
 254
 255         xfs_buf_delwri_queue_here(bp, buffers_list);
 256         xfs_buf_relse(bp);
 257         *bpp = NULL;
 258         bbl->nr_dirty++;
 259
 260         if (!bbl->max_dirty || bbl->nr_dirty < bbl->max_dirty)
 261                 return 0;
 262
 263         error = xfs_buf_delwri_submit(buffers_list);
 264         if (error)
 265                 return error;
 266
 267         bbl->nr_dirty = 0;
 268         return 0;
 269 }
 270
 271 /*
 272  * Allocate and initialize one btree block for bulk loading.
 273  *
 274  * The new btree block will have its level and numrecs fields set to the values
 275  * of the level and nr_this_block parameters, respectively.
 276  *
 277  * The caller should ensure that ptrp, bpp, and blockp refer to the left
 278  * sibling of the new block, if there is any.  On exit, ptrp, bpp, and blockp
 279  * will all point to the new block.
 280  */
 281 STATIC int
 282 xfs_btree_bload_prep_block(
 283         struct xfs_btree_cur            *cur,
 284         struct xfs_btree_bload          *bbl,
 285         struct list_head                *buffers_list,
 286         unsigned int                    level,
 287         unsigned int                    nr_this_block,
 288         union xfs_btree_ptr             *ptrp, /* in/out */
 289         struct xfs_buf                  **bpp, /* in/out */
 290         struct xfs_btree_block          **blockp, /* in/out */
 291         void                            *priv)
 292 {
 293         union xfs_btree_ptr             new_ptr;
 294         struct xfs_buf                  *new_bp;
 295         struct xfs_btree_block          *new_block;
 296         int                             ret;
 297
 298         if (xfs_btree_at_iroot(cur, level)) {
 299                 struct xfs_ifork        *ifp = xfs_btree_ifork_ptr(cur);
 300                 size_t                  new_size;
 301
 302                 ASSERT(*bpp == NULL);
 303
 304                 /* Allocate a new incore btree root block. */
 305                 new_size = bbl->iroot_size(cur, level, nr_this_block, priv);
 306                 ifp->if_broot = kzalloc(new_size, GFP_KERNEL | __GFP_NOFAIL);
 307                 ifp->if_broot_bytes = (int)new_size;
 308
 309                 /* Initialize it and send it out. */
 310                 xfs_btree_init_block(cur->bc_mp, ifp->if_broot, cur->bc_ops,
 311                                 level, nr_this_block, cur->bc_ino.ip->i_ino);
 312
 313                 *bpp = NULL;
 314                 *blockp = ifp->if_broot;
 315                 xfs_btree_set_ptr_null(cur, ptrp);
 316                 return 0;
 317         }
 318
 319         /* Claim one of the caller's preallocated blocks. */
 320         xfs_btree_set_ptr_null(cur, &new_ptr);
 321         ret = bbl->claim_block(cur, &new_ptr, priv);
 322         if (ret)
 323                 return ret;
 324
 325         ASSERT(!xfs_btree_ptr_is_null(cur, &new_ptr));
 326
 327         ret = xfs_btree_get_buf_block(cur, &new_ptr, &new_block, &new_bp);
 328         if (ret)
 329                 return ret;
 330
 331         /*
 332          * The previous block (if any) is the left sibling of the new block,
 333          * so set its right sibling pointer to the new block and drop it.
 334          */
 335         if (*blockp)
 336                 xfs_btree_set_sibling(cur, *blockp, &new_ptr, XFS_BB_RIGHTSIB);
 337
 338         ret = xfs_btree_bload_drop_buf(bbl, buffers_list, bpp);
 339         if (ret)
 340                 return ret;
 341
 342         /* Initialize the new btree block. */
 343         xfs_btree_init_block_cur(cur, new_bp, level, nr_this_block);
 344         xfs_btree_set_sibling(cur, new_block, ptrp, XFS_BB_LEFTSIB);
 345
 346         /* Set the out parameters. */
 347         *bpp = new_bp;
 348         *blockp = new_block;
 349         xfs_btree_copy_ptrs(cur, ptrp, &new_ptr, 1);
 350         return 0;
 351 }
 352
 353 /* Load one leaf block. */
 354 STATIC int
 355 xfs_btree_bload_leaf(
 356         struct xfs_btree_cur            *cur,
 357         unsigned int                    recs_this_block,
 358         xfs_btree_bload_get_records_fn  get_records,
 359         struct xfs_btree_block          *block,
 360         void                            *priv)
 361 {
 362         unsigned int                    j = 1;
 363         int                             ret;
 364
 365         /* Fill the leaf block with records. */
 366         while (j <= recs_this_block) {
 367                 ret = get_records(cur, j, block, recs_this_block - j + 1, priv);
 368                 if (ret < 0)
 369                         return ret;
 370                 j += ret;
 371         }
 372
 373         return 0;
 374 }
 375
 376 /*
 377  * Load one node block with key/ptr pairs.
 378  *
 379  * child_ptr must point to a block within the next level down in the tree.  A
 380  * key/ptr entry will be created in the new node block to the block pointed to
 381  * by child_ptr.  On exit, child_ptr points to the next block on the child
 382  * level that needs processing.
 383  */
 384 STATIC int
 385 xfs_btree_bload_node(
 386         struct xfs_btree_cur    *cur,
 387         unsigned int            recs_this_block,
 388         union xfs_btree_ptr     *child_ptr,
 389         struct xfs_btree_block  *block)
 390 {
 391         unsigned int            j;
 392         int                     ret;
 393
 394         /* Fill the node block with keys and pointers. */
 395         for (j = 1; j <= recs_this_block; j++) {
 396                 union xfs_btree_key     child_key;
 397                 union xfs_btree_ptr     *block_ptr;
 398                 union xfs_btree_key     *block_key;
 399                 struct xfs_btree_block  *child_block;
 400                 struct xfs_buf          *child_bp;
 401
 402                 ASSERT(!xfs_btree_ptr_is_null(cur, child_ptr));
 403
 404                 /*
 405                  * Read the lower-level block in case the buffer for it has
 406                  * been reclaimed.  LRU refs will be set on the block, which is
 407                  * desirable if the new btree commits.
 408                  */
 409                 ret = xfs_btree_read_buf_block(cur, child_ptr, 0, &child_block,
 410                                 &child_bp);
 411                 if (ret)
 412                         return ret;
 413
 414                 block_ptr = xfs_btree_ptr_addr(cur, j, block);
 415                 xfs_btree_copy_ptrs(cur, block_ptr, child_ptr, 1);
 416
 417                 block_key = xfs_btree_key_addr(cur, j, block);
 418                 xfs_btree_get_keys(cur, child_block, &child_key);
 419                 xfs_btree_copy_keys(cur, block_key, &child_key, 1);
 420
 421                 xfs_btree_get_sibling(cur, child_block, child_ptr,
 422                                 XFS_BB_RIGHTSIB);
 423                 xfs_buf_relse(child_bp);
 424         }
 425
 426         return 0;
 427 }
 428
 429 /*
 430  * Compute the maximum number of records (or keyptrs) per block that we want to
 431  * install at this level in the btree.  Caller is responsible for having set
 432  * @cur->bc_ino.forksize to the desired fork size, if appropriate.
 433  */
 434 STATIC unsigned int
 435 xfs_btree_bload_max_npb(
 436         struct xfs_btree_cur    *cur,
 437         struct xfs_btree_bload  *bbl,
 438         unsigned int            level)
 439 {
 440         unsigned int            ret;
 441
 442         if (level == cur->bc_nlevels - 1 && cur->bc_ops->get_dmaxrecs)
 443                 return cur->bc_ops->get_dmaxrecs(cur, level);
 444
 445         ret = cur->bc_ops->get_maxrecs(cur, level);
 446         if (level == 0)
 447                 ret -= bbl->leaf_slack;
 448         else
 449                 ret -= bbl->node_slack;
 450         return ret;
 451 }
 452
 453 /*
 454  * Compute the desired number of records (or keyptrs) per block that we want to
 455  * install at this level in the btree, which must be somewhere between minrecs
 456  * and max_npb.  The caller is free to install fewer records per block.
 457  */
 458 STATIC unsigned int
 459 xfs_btree_bload_desired_npb(
 460         struct xfs_btree_cur    *cur,
 461         struct xfs_btree_bload  *bbl,
 462         unsigned int            level)
 463 {
 464         unsigned int            npb = xfs_btree_bload_max_npb(cur, bbl, level);
 465
 466         /* Root blocks are not subject to minrecs rules. */
 467         if (level == cur->bc_nlevels - 1)
 468                 return max(1U, npb);
 469
 470         return max_t(unsigned int, cur->bc_ops->get_minrecs(cur, level), npb);
 471 }
 472
 473 /*
 474  * Compute the number of records to be stored in each block at this level and
 475  * the number of blocks for this level.  For leaf levels, we must populate an
 476  * empty root block even if there are no records, so we have to have at least
 477  * one block.
 478  */
 479 STATIC void
 480 xfs_btree_bload_level_geometry(
 481         struct xfs_btree_cur    *cur,
 482         struct xfs_btree_bload  *bbl,
 483         unsigned int            level,
 484         uint64_t                nr_this_level,
 485         unsigned int            *avg_per_block,
 486         uint64_t                *blocks,
 487         uint64_t                *blocks_with_extra)
 488 {
 489         uint64_t                npb;
 490         uint64_t                dontcare;
 491         unsigned int            desired_npb;
 492         unsigned int            maxnr;
 493
 494         /*
 495          * Compute the absolute maximum number of records that we can store in
 496          * the ondisk block or inode root.
 497          */
 498         if (cur->bc_ops->get_dmaxrecs)
 499                 maxnr = cur->bc_ops->get_dmaxrecs(cur, level);
 500         else
 501                 maxnr = cur->bc_ops->get_maxrecs(cur, level);
 502
 503         /*
 504          * Compute the number of blocks we need to fill each block with the
 505          * desired number of records/keyptrs per block.  Because desired_npb
 506          * could be minrecs, we use regular integer division (which rounds
 507          * the block count down) so that in the next step the effective # of
 508          * items per block will never be less than desired_npb.
 509          */
 510         desired_npb = xfs_btree_bload_desired_npb(cur, bbl, level);
 511         *blocks = div64_u64_rem(nr_this_level, desired_npb, &dontcare);
 512         *blocks = max(1ULL, *blocks);
 513
 514         /*
 515          * Compute the number of records that we will actually put in each
 516          * block, assuming that we want to spread the records evenly between
 517          * the blocks.  Take care that the effective # of items per block (npb)
 518          * won't exceed maxrecs even for the blocks that get an extra record,
 519          * since desired_npb could be maxrecs, and in the previous step we
 520          * rounded the block count down.
 521          */
 522         npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra);
 523         if (npb > maxnr || (npb == maxnr && *blocks_with_extra > 0)) {
 524                 (*blocks)++;
 525                 npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra);
 526         }
 527
 528         *avg_per_block = min_t(uint64_t, npb, nr_this_level);
 529
 530         trace_xfs_btree_bload_level_geometry(cur, level, nr_this_level,
 531                         *avg_per_block, desired_npb, *blocks,
 532                         *blocks_with_extra);
 533 }
 534
 535 /*
 536  * Ensure a slack value is appropriate for the btree.
 537  *
 538  * If the slack value is negative, set slack so that we fill the block to
 539  * halfway between minrecs and maxrecs.  Make sure the slack is never so large
 540  * that we can underflow minrecs.
 541  */
 542 static void
 543 xfs_btree_bload_ensure_slack(
 544         struct xfs_btree_cur    *cur,
 545         int                     *slack,
 546         int                     level)
 547 {
 548         int                     maxr;
 549         int                     minr;
 550
 551         maxr = cur->bc_ops->get_maxrecs(cur, level);
 552         minr = cur->bc_ops->get_minrecs(cur, level);
 553
 554         /*
 555          * If slack is negative, automatically set slack so that we load the
 556          * btree block approximately halfway between minrecs and maxrecs.
 557          * Generally, this will net us 75% loading.
 558          */
 559         if (*slack < 0)
 560                 *slack = maxr - ((maxr + minr) >> 1);
 561
 562         *slack = min(*slack, maxr - minr);
 563 }
 564
 565 /*
 566  * Prepare a btree cursor for a bulk load operation by computing the geometry
 567  * fields in bbl.  Caller must ensure that the btree cursor is a staging
 568  * cursor.  This function can be called multiple times.
 569  */
 570 int
 571 xfs_btree_bload_compute_geometry(
 572         struct xfs_btree_cur    *cur,
 573         struct xfs_btree_bload  *bbl,
 574         uint64_t                nr_records)
 575 {
 576         uint64_t                nr_blocks = 0;
 577         uint64_t                nr_this_level;
 578
 579         ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
 580
 581         /*
 582          * Make sure that the slack values make sense for traditional leaf and
 583          * node blocks.  Inode-rooted btrees will return different minrecs and
 584          * maxrecs values for the root block (bc_nlevels == level - 1).  We're
 585          * checking levels 0 and 1 here, so set bc_nlevels such that the btree
 586          * code doesn't interpret either as the root level.
 587          */
 588         cur->bc_nlevels = cur->bc_maxlevels - 1;
 589         xfs_btree_bload_ensure_slack(cur, &bbl->leaf_slack, 0);
 590         xfs_btree_bload_ensure_slack(cur, &bbl->node_slack, 1);
 591
 592         bbl->nr_records = nr_this_level = nr_records;
 593         for (cur->bc_nlevels = 1; cur->bc_nlevels <= cur->bc_maxlevels;) {
 594                 uint64_t        level_blocks;
 595                 uint64_t        dontcare64;
 596                 unsigned int    level = cur->bc_nlevels - 1;
 597                 unsigned int    avg_per_block;
 598
 599                 xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
 600                                 &avg_per_block, &level_blocks, &dontcare64);
 601
 602                 if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
 603                         /*
 604                          * If all the items we want to store at this level
 605                          * would fit in the inode root block, then we have our
 606                          * btree root and are done.
 607                          *
 608                          * Note that bmap btrees forbid records in the root.
 609                          */
 610                         if (level != 0 && nr_this_level <= avg_per_block) {
 611                                 nr_blocks++;
 612                                 break;
 613                         }
 614
 615                         /*
 616                          * Otherwise, we have to store all the items for this
 617                          * level in traditional btree blocks and therefore need
 618                          * another level of btree to point to those blocks.
 619                          *
 620                          * We have to re-compute the geometry for each level of
 621                          * an inode-rooted btree because the geometry differs
 622                          * between a btree root in an inode fork and a
 623                          * traditional btree block.
 624                          *
 625                          * This distinction is made in the btree code based on
 626                          * whether level == bc_nlevels - 1.  Based on the
 627                          * previous root block size check against the root
 628                          * block geometry, we know that we aren't yet ready to
 629                          * populate the root.  Increment bc_nevels and
 630                          * recalculate the geometry for a traditional
 631                          * block-based btree level.
 632                          */
 633                         cur->bc_nlevels++;
 634                         ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
 635                         xfs_btree_bload_level_geometry(cur, bbl, level,
 636                                         nr_this_level, &avg_per_block,
 637                                         &level_blocks, &dontcare64);
 638                 } else {
 639                         /*
 640                          * If all the items we want to store at this level
 641                          * would fit in a single root block, we're done.
 642                          */
 643                         if (nr_this_level <= avg_per_block) {
 644                                 nr_blocks++;
 645                                 break;
 646                         }
 647
 648                         /* Otherwise, we need another level of btree. */
 649                         cur->bc_nlevels++;
 650                         ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
 651                 }
 652
 653                 nr_blocks += level_blocks;
 654                 nr_this_level = level_blocks;
 655         }
 656
 657         if (cur->bc_nlevels > cur->bc_maxlevels)
 658                 return -EOVERFLOW;
 659
 660         bbl->btree_height = cur->bc_nlevels;
 661         if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
 662                 bbl->nr_blocks = nr_blocks - 1;
 663         else
 664                 bbl->nr_blocks = nr_blocks;
 665         return 0;
 666 }
 667
 668 /* Bulk load a btree given the parameters and geometry established in bbl. */
 669 int
 670 xfs_btree_bload(
 671         struct xfs_btree_cur            *cur,
 672         struct xfs_btree_bload          *bbl,
 673         void                            *priv)
 674 {
 675         struct list_head                buffers_list;
 676         union xfs_btree_ptr             child_ptr;
 677         union xfs_btree_ptr             ptr;
 678         struct xfs_buf                  *bp = NULL;
 679         struct xfs_btree_block          *block = NULL;
 680         uint64_t                        nr_this_level = bbl->nr_records;
 681         uint64_t                        blocks;
 682         uint64_t                        i;
 683         uint64_t                        blocks_with_extra;
 684         uint64_t                        total_blocks = 0;
 685         unsigned int                    avg_per_block;
 686         unsigned int                    level = 0;
 687         int                             ret;
 688
 689         ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
 690
 691         INIT_LIST_HEAD(&buffers_list);
 692         cur->bc_nlevels = bbl->btree_height;
 693         xfs_btree_set_ptr_null(cur, &child_ptr);
 694         xfs_btree_set_ptr_null(cur, &ptr);
 695         bbl->nr_dirty = 0;
 696
 697         xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
 698                         &avg_per_block, &blocks, &blocks_with_extra);
 699
 700         /* Load each leaf block. */
 701         for (i = 0; i < blocks; i++) {
 702                 unsigned int            nr_this_block = avg_per_block;
 703
 704                 /*
 705                  * Due to rounding, btree blocks will not be evenly populated
 706                  * in most cases.  blocks_with_extra tells us how many blocks
 707                  * will receive an extra record to distribute the excess across
 708                  * the current level as evenly as possible.
 709                  */
 710                 if (i < blocks_with_extra)
 711                         nr_this_block++;
 712
 713                 ret = xfs_btree_bload_prep_block(cur, bbl, &buffers_list, level,
 714                                 nr_this_block, &ptr, &bp, &block, priv);
 715                 if (ret)
 716                         goto out;
 717
 718                 trace_xfs_btree_bload_block(cur, level, i, blocks, &ptr,
 719                                 nr_this_block);
 720
 721                 ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_records,
 722                                 block, priv);
 723                 if (ret)
 724                         goto out;
 725
 726                 /*
 727                  * Record the leftmost leaf pointer so we know where to start
 728                  * with the first node level.
 729                  */
 730                 if (i == 0)
 731                         xfs_btree_copy_ptrs(cur, &child_ptr, &ptr, 1);
 732         }
 733         total_blocks += blocks;
 734
 735         ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp);
 736         if (ret)
 737                 goto out;
 738
 739         /* Populate the internal btree nodes. */
 740         for (level = 1; level < cur->bc_nlevels; level++) {
 741                 union xfs_btree_ptr     first_ptr;
 742
 743                 nr_this_level = blocks;
 744                 block = NULL;
 745                 xfs_btree_set_ptr_null(cur, &ptr);
 746
 747                 xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
 748                                 &avg_per_block, &blocks, &blocks_with_extra);
 749
 750                 /* Load each node block. */
 751                 for (i = 0; i < blocks; i++) {
 752                         unsigned int    nr_this_block = avg_per_block;
 753
 754                         if (i < blocks_with_extra)
 755                                 nr_this_block++;
 756
 757                         ret = xfs_btree_bload_prep_block(cur, bbl,
 758                                         &buffers_list, level, nr_this_block,
 759                                         &ptr, &bp, &block, priv);
 760                         if (ret)
 761                                 goto out;
 762
 763                         trace_xfs_btree_bload_block(cur, level, i, blocks,
 764                                         &ptr, nr_this_block);
 765
 766                         ret = xfs_btree_bload_node(cur, nr_this_block,
 767                                         &child_ptr, block);
 768                         if (ret)
 769                                 goto out;
 770
 771                         /*
 772                          * Record the leftmost node pointer so that we know
 773                          * where to start the next node level above this one.
 774                          */
 775                         if (i == 0)
 776                                 xfs_btree_copy_ptrs(cur, &first_ptr, &ptr, 1);
 777                 }
 778                 total_blocks += blocks;
 779
 780                 ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp);
 781                 if (ret)
 782                         goto out;
 783
 784                 xfs_btree_copy_ptrs(cur, &child_ptr, &first_ptr, 1);
 785         }
 786
 787         /* Initialize the new root. */
 788         if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
 789                 ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
 790                 cur->bc_ino.ifake->if_levels = cur->bc_nlevels;
 791                 cur->bc_ino.ifake->if_blocks = total_blocks - 1;
 792         } else {
 793                 cur->bc_ag.afake->af_root = be32_to_cpu(ptr.s);
 794                 cur->bc_ag.afake->af_levels = cur->bc_nlevels;
 795                 cur->bc_ag.afake->af_blocks = total_blocks;
 796         }
 797
 798         /*
 799          * Write the new blocks to disk.  If the ordered list isn't empty after
 800          * that, then something went wrong and we have to fail.  This should
 801          * never happen, but we'll check anyway.
 802          */
 803         ret = xfs_buf_delwri_submit(&buffers_list);
 804         if (ret)
 805                 goto out;
 806         if (!list_empty(&buffers_list)) {
 807                 ASSERT(list_empty(&buffers_list));
 808                 ret = -EIO;
 809         }
 810
 811 out:
 812         xfs_buf_delwri_cancel(&buffers_list);
 813         if (bp)
 814                 xfs_buf_relse(bp);
 815         return ret;
 816 }