libxfs/xfs_btree_staging.c

   1 // SPDX-License-Identifier: GPL-2.0-or-later
   2 /*
   3  * Copyright (C) 2020 Oracle.  All Rights Reserved.
   4  * Author: Darrick J. Wong <darrick.wong@oracle.com>
   5  */
   6 #include "xfs.h"
   7 #include "xfs_fs.h"
   8 #include "xfs_shared.h"
   9 #include "xfs_format.h"
  10 #include "xfs_log_format.h"
  11 #include "xfs_trans_resv.h"
  12 #include "xfs_bit.h"
  13 #include "xfs_mount.h"
  14 #include "xfs_inode.h"
  15 #include "xfs_trans.h"
  16 #include "xfs_btree.h"
  17 #include "xfs_trace.h"
  18 #include "xfs_btree_staging.h"
  19
  20 /*
  21  * Staging Cursors and Fake Roots for Btrees
  22  * =========================================
  23  *
  24  * A staging btree cursor is a special type of btree cursor that callers must
  25  * use to construct a new btree index using the btree bulk loader code.  The
  26  * bulk loading code uses the staging btree cursor to abstract the details of
  27  * initializing new btree blocks and filling them with records or key/ptr
  28  * pairs.  Regular btree operations (e.g. queries and modifications) are not
  29  * supported with staging cursors, and callers must not invoke them.
  30  *
  31  * Fake root structures contain all the information about a btree that is under
  32  * construction by the bulk loading code.  Staging btree cursors point to fake
  33  * root structures instead of the usual AG header or inode structure.
  34  *
  35  * Callers are expected to initialize a fake root structure and pass it into
  36  * the _stage_cursor function for a specific btree type.  When bulk loading is
  37  * complete, callers should call the _commit_staged_btree function for that
  38  * specific btree type to commit the new btree into the filesystem.
  39  */
  40
  41 /*
  42  * Don't allow staging cursors to be duplicated because they're supposed to be
  43  * kept private to a single thread.
  44  */
  45 STATIC struct xfs_btree_cur *
  46 xfs_btree_fakeroot_dup_cursor(
  47         struct xfs_btree_cur    *cur)
  48 {
  49         ASSERT(0);
  50         return NULL;
  51 }
  52
  53 /*
  54  * Don't allow block allocation for a staging cursor, because staging cursors
  55  * do not support regular btree modifications.
  56  *
  57  * Bulk loading uses a separate callback to obtain new blocks from a
  58  * preallocated list, which prevents ENOSPC failures during loading.
  59  */
  60 STATIC int
  61 xfs_btree_fakeroot_alloc_block(
  62         struct xfs_btree_cur            *cur,
  63         const union xfs_btree_ptr       *start_bno,
  64         union xfs_btree_ptr             *new_bno,
  65         int                             *stat)
  66 {
  67         ASSERT(0);
  68         return -EFSCORRUPTED;
  69 }
  70
  71 /*
  72  * Don't allow block freeing for a staging cursor, because staging cursors
  73  * do not support regular btree modifications.
  74  */
  75 STATIC int
  76 xfs_btree_fakeroot_free_block(
  77         struct xfs_btree_cur    *cur,
  78         struct xfs_buf          *bp)
  79 {
  80         ASSERT(0);
  81         return -EFSCORRUPTED;
  82 }
  83
  84 /* Initialize a pointer to the root block from the fakeroot. */
  85 STATIC void
  86 xfs_btree_fakeroot_init_ptr_from_cur(
  87         struct xfs_btree_cur    *cur,
  88         union xfs_btree_ptr     *ptr)
  89 {
  90         struct xbtree_afakeroot *afake;
  91
  92         ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
  93
  94         afake = cur->bc_ag.afake;
  95         ptr->s = cpu_to_be32(afake->af_root);
  96 }
  97
  98 /*
  99  * Bulk Loading for AG Btrees
 100  * ==========================
 101  *
 102  * For a btree rooted in an AG header, pass a xbtree_afakeroot structure to the
 103  * staging cursor.  Callers should initialize this to zero.
 104  *
 105  * The _stage_cursor() function for a specific btree type should call
 106  * xfs_btree_stage_afakeroot to set up the in-memory cursor as a staging
 107  * cursor.  The corresponding _commit_staged_btree() function should log the
 108  * new root and call xfs_btree_commit_afakeroot() to transform the staging
 109  * cursor into a regular btree cursor.
 110  */
 111
 112 /* Update the btree root information for a per-AG fake root. */
 113 STATIC void
 114 xfs_btree_afakeroot_set_root(
 115         struct xfs_btree_cur            *cur,
 116         const union xfs_btree_ptr       *ptr,
 117         int                             inc)
 118 {
 119         struct xbtree_afakeroot *afake = cur->bc_ag.afake;
 120
 121         ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
 122         afake->af_root = be32_to_cpu(ptr->s);
 123         afake->af_levels += inc;
 124 }
 125
 126 /*
 127  * Initialize a AG-rooted btree cursor with the given AG btree fake root.
 128  * The btree cursor's bc_ops will be overridden as needed to make the staging
 129  * functionality work.
 130  */
 131 void
 132 xfs_btree_stage_afakeroot(
 133         struct xfs_btree_cur            *cur,
 134         struct xbtree_afakeroot         *afake)
 135 {
 136         struct xfs_btree_ops            *nops;
 137
 138         ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
 139         ASSERT(!(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE));
 140         ASSERT(cur->bc_tp == NULL);
 141
 142         nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
 143         memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops));
 144         nops->alloc_block = xfs_btree_fakeroot_alloc_block;
 145         nops->free_block = xfs_btree_fakeroot_free_block;
 146         nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
 147         nops->set_root = xfs_btree_afakeroot_set_root;
 148         nops->dup_cursor = xfs_btree_fakeroot_dup_cursor;
 149
 150         cur->bc_ag.afake = afake;
 151         cur->bc_nlevels = afake->af_levels;
 152         cur->bc_ops = nops;
 153         cur->bc_flags |= XFS_BTREE_STAGING;
 154 }
 155
 156 /*
 157  * Transform an AG-rooted staging btree cursor back into a regular cursor by
 158  * substituting a real btree root for the fake one and restoring normal btree
 159  * cursor ops.  The caller must log the btree root change prior to calling
 160  * this.
 161  */
 162 void
 163 xfs_btree_commit_afakeroot(
 164         struct xfs_btree_cur            *cur,
 165         struct xfs_trans                *tp,
 166         struct xfs_buf                  *agbp,
 167         const struct xfs_btree_ops      *ops)
 168 {
 169         ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
 170         ASSERT(cur->bc_tp == NULL);
 171
 172         trace_xfs_btree_commit_afakeroot(cur);
 173
 174         kmem_free((void *)cur->bc_ops);
 175         cur->bc_ag.agbp = agbp;
 176         cur->bc_ops = ops;
 177         cur->bc_flags &= ~XFS_BTREE_STAGING;
 178         cur->bc_tp = tp;
 179 }
 180
 181 /*
 182  * Bulk Loading for Inode-Rooted Btrees
 183  * ====================================
 184  *
 185  * For a btree rooted in an inode fork, pass a xbtree_ifakeroot structure to
 186  * the staging cursor.  This structure should be initialized as follows:
 187  *
 188  * - if_fork_size field should be set to the number of bytes available to the
 189  *   fork in the inode.
 190  *
 191  * - if_fork should point to a freshly allocated struct xfs_ifork.
 192  *
 193  * - if_format should be set to the appropriate fork type (e.g.
 194  *   XFS_DINODE_FMT_BTREE).
 195  *
 196  * All other fields must be zero.
 197  *
 198  * The _stage_cursor() function for a specific btree type should call
 199  * xfs_btree_stage_ifakeroot to set up the in-memory cursor as a staging
 200  * cursor.  The corresponding _commit_staged_btree() function should log the
 201  * new root and call xfs_btree_commit_ifakeroot() to transform the staging
 202  * cursor into a regular btree cursor.
 203  */
 204
 205 /*
 206  * Initialize an inode-rooted btree cursor with the given inode btree fake
 207  * root.  The btree cursor's bc_ops will be overridden as needed to make the
 208  * staging functionality work.  If new_ops is not NULL, these new ops will be
 209  * passed out to the caller for further overriding.
 210  */
 211 void
 212 xfs_btree_stage_ifakeroot(
 213         struct xfs_btree_cur            *cur,
 214         struct xbtree_ifakeroot         *ifake,
 215         struct xfs_btree_ops            **new_ops)
 216 {
 217         struct xfs_btree_ops            *nops;
 218
 219         ASSERT(!(cur->bc_flags & XFS_BTREE_STAGING));
 220         ASSERT(cur->bc_flags & XFS_BTREE_ROOT_IN_INODE);
 221         ASSERT(cur->bc_tp == NULL);
 222
 223         nops = kmem_alloc(sizeof(struct xfs_btree_ops), KM_NOFS);
 224         memcpy(nops, cur->bc_ops, sizeof(struct xfs_btree_ops));
 225         nops->alloc_block = xfs_btree_fakeroot_alloc_block;
 226         nops->free_block = xfs_btree_fakeroot_free_block;
 227         nops->init_ptr_from_cur = xfs_btree_fakeroot_init_ptr_from_cur;
 228         nops->dup_cursor = xfs_btree_fakeroot_dup_cursor;
 229
 230         cur->bc_ino.ifake = ifake;
 231         cur->bc_nlevels = ifake->if_levels;
 232         cur->bc_ops = nops;
 233         cur->bc_flags |= XFS_BTREE_STAGING;
 234
 235         if (new_ops)
 236                 *new_ops = nops;
 237 }
 238
 239 /*
 240  * Transform an inode-rooted staging btree cursor back into a regular cursor by
 241  * substituting a real btree root for the fake one and restoring normal btree
 242  * cursor ops.  The caller must log the btree root change prior to calling
 243  * this.
 244  */
 245 void
 246 xfs_btree_commit_ifakeroot(
 247         struct xfs_btree_cur            *cur,
 248         struct xfs_trans                *tp,
 249         int                             whichfork,
 250         const struct xfs_btree_ops      *ops)
 251 {
 252         ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
 253         ASSERT(cur->bc_tp == NULL);
 254
 255         trace_xfs_btree_commit_ifakeroot(cur);
 256
 257         kmem_free((void *)cur->bc_ops);
 258         cur->bc_ino.ifake = NULL;
 259         cur->bc_ino.whichfork = whichfork;
 260         cur->bc_ops = ops;
 261         cur->bc_flags &= ~XFS_BTREE_STAGING;
 262         cur->bc_tp = tp;
 263 }
 264
 265 /*
 266  * Bulk Loading of Staged Btrees
 267  * =============================
 268  *
 269  * This interface is used with a staged btree cursor to create a totally new
 270  * btree with a large number of records (i.e. more than what would fit in a
 271  * single root block).  When the creation is complete, the new root can be
 272  * linked atomically into the filesystem by committing the staged cursor.
 273  *
 274  * Creation of a new btree proceeds roughly as follows:
 275  *
 276  * The first step is to initialize an appropriate fake btree root structure and
 277  * then construct a staged btree cursor.  Refer to the block comments about
 278  * "Bulk Loading for AG Btrees" and "Bulk Loading for Inode-Rooted Btrees" for
 279  * more information about how to do this.
 280  *
 281  * The second step is to initialize a struct xfs_btree_bload context as
 282  * documented in the structure definition.
 283  *
 284  * The third step is to call xfs_btree_bload_compute_geometry to compute the
 285  * height of and the number of blocks needed to construct the btree.  See the
 286  * section "Computing the Geometry of the New Btree" for details about this
 287  * computation.
 288  *
 289  * In step four, the caller must allocate xfs_btree_bload.nr_blocks blocks and
 290  * save them for later use by ->claim_block().  Bulk loading requires all
 291  * blocks to be allocated beforehand to avoid ENOSPC failures midway through a
 292  * rebuild, and to minimize seek distances of the new btree.
 293  *
 294  * Step five is to call xfs_btree_bload() to start constructing the btree.
 295  *
 296  * The final step is to commit the staging btree cursor, which logs the new
 297  * btree root and turns the staging cursor into a regular cursor.  The caller
 298  * is responsible for cleaning up the previous btree blocks, if any.
 299  *
 300  * Computing the Geometry of the New Btree
 301  * =======================================
 302  *
 303  * The number of items placed in each btree block is computed via the following
 304  * algorithm: For leaf levels, the number of items for the level is nr_records
 305  * in the bload structure.  For node levels, the number of items for the level
 306  * is the number of blocks in the next lower level of the tree.  For each
 307  * level, the desired number of items per block is defined as:
 308  *
 309  * desired = max(minrecs, maxrecs - slack factor)
 310  *
 311  * The number of blocks for the level is defined to be:
 312  *
 313  * blocks = floor(nr_items / desired)
 314  *
 315  * Note this is rounded down so that the npb calculation below will never fall
 316  * below minrecs.  The number of items that will actually be loaded into each
 317  * btree block is defined as:
 318  *
 319  * npb =  nr_items / blocks
 320  *
 321  * Some of the leftmost blocks in the level will contain one extra record as
 322  * needed to handle uneven division.  If the number of records in any block
 323  * would exceed maxrecs for that level, blocks is incremented and npb is
 324  * recalculated.
 325  *
 326  * In other words, we compute the number of blocks needed to satisfy a given
 327  * loading level, then spread the items as evenly as possible.
 328  *
 329  * The height and number of fs blocks required to create the btree are computed
 330  * and returned via btree_height and nr_blocks.
 331  */
 332
 333 /*
 334  * Put a btree block that we're loading onto the ordered list and release it.
 335  * The btree blocks will be written to disk when bulk loading is finished.
 336  * If we reach the dirty buffer threshold, flush them to disk before
 337  * continuing.
 338  */
 339 static int
 340 xfs_btree_bload_drop_buf(
 341         struct xfs_btree_bload          *bbl,
 342         struct list_head                *buffers_list,
 343         struct xfs_buf                  **bpp)
 344 {
 345         struct xfs_buf                  *bp = *bpp;
 346         int                             error;
 347
 348         if (!bp)
 349                 return 0;
 350
 351         /*
 352          * Mark this buffer XBF_DONE (i.e. uptodate) so that a subsequent
 353          * xfs_buf_read will not pointlessly reread the contents from the disk.
 354          */
 355         bp->b_flags |= XBF_DONE;
 356
 357         xfs_buf_delwri_queue_here(bp, buffers_list);
 358         xfs_buf_relse(bp);
 359         *bpp = NULL;
 360         bbl->nr_dirty++;
 361
 362         if (!bbl->max_dirty || bbl->nr_dirty < bbl->max_dirty)
 363                 return 0;
 364
 365         error = xfs_buf_delwri_submit(buffers_list);
 366         if (error)
 367                 return error;
 368
 369         bbl->nr_dirty = 0;
 370         return 0;
 371 }
 372
 373 /*
 374  * Allocate and initialize one btree block for bulk loading.
 375  *
 376  * The new btree block will have its level and numrecs fields set to the values
 377  * of the level and nr_this_block parameters, respectively.
 378  *
 379  * The caller should ensure that ptrp, bpp, and blockp refer to the left
 380  * sibling of the new block, if there is any.  On exit, ptrp, bpp, and blockp
 381  * will all point to the new block.
 382  */
 383 STATIC int
 384 xfs_btree_bload_prep_block(
 385         struct xfs_btree_cur            *cur,
 386         struct xfs_btree_bload          *bbl,
 387         struct list_head                *buffers_list,
 388         unsigned int                    level,
 389         unsigned int                    nr_this_block,
 390         union xfs_btree_ptr             *ptrp, /* in/out */
 391         struct xfs_buf                  **bpp, /* in/out */
 392         struct xfs_btree_block          **blockp, /* in/out */
 393         void                            *priv)
 394 {
 395         union xfs_btree_ptr             new_ptr;
 396         struct xfs_buf                  *new_bp;
 397         struct xfs_btree_block          *new_block;
 398         int                             ret;
 399
 400         if ((cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) &&
 401             level == cur->bc_nlevels - 1) {
 402                 struct xfs_ifork        *ifp = xfs_btree_ifork_ptr(cur);
 403                 size_t                  new_size;
 404
 405                 ASSERT(*bpp == NULL);
 406
 407                 /* Allocate a new incore btree root block. */
 408                 new_size = bbl->iroot_size(cur, level, nr_this_block, priv);
 409                 ifp->if_broot = kmem_zalloc(new_size, 0);
 410                 ifp->if_broot_bytes = (int)new_size;
 411
 412                 /* Initialize it and send it out. */
 413                 xfs_btree_init_block_int(cur->bc_mp, ifp->if_broot,
 414                                 XFS_BUF_DADDR_NULL, cur->bc_btnum, level,
 415                                 nr_this_block, cur->bc_ino.ip->i_ino,
 416                                 cur->bc_flags);
 417
 418                 *bpp = NULL;
 419                 *blockp = ifp->if_broot;
 420                 xfs_btree_set_ptr_null(cur, ptrp);
 421                 return 0;
 422         }
 423
 424         /* Claim one of the caller's preallocated blocks. */
 425         xfs_btree_set_ptr_null(cur, &new_ptr);
 426         ret = bbl->claim_block(cur, &new_ptr, priv);
 427         if (ret)
 428                 return ret;
 429
 430         ASSERT(!xfs_btree_ptr_is_null(cur, &new_ptr));
 431
 432         ret = xfs_btree_get_buf_block(cur, &new_ptr, &new_block, &new_bp);
 433         if (ret)
 434                 return ret;
 435
 436         /*
 437          * The previous block (if any) is the left sibling of the new block,
 438          * so set its right sibling pointer to the new block and drop it.
 439          */
 440         if (*blockp)
 441                 xfs_btree_set_sibling(cur, *blockp, &new_ptr, XFS_BB_RIGHTSIB);
 442
 443         ret = xfs_btree_bload_drop_buf(bbl, buffers_list, bpp);
 444         if (ret)
 445                 return ret;
 446
 447         /* Initialize the new btree block. */
 448         xfs_btree_init_block_cur(cur, new_bp, level, nr_this_block);
 449         xfs_btree_set_sibling(cur, new_block, ptrp, XFS_BB_LEFTSIB);
 450
 451         /* Set the out parameters. */
 452         *bpp = new_bp;
 453         *blockp = new_block;
 454         xfs_btree_copy_ptrs(cur, ptrp, &new_ptr, 1);
 455         return 0;
 456 }
 457
 458 /* Load one leaf block. */
 459 STATIC int
 460 xfs_btree_bload_leaf(
 461         struct xfs_btree_cur            *cur,
 462         unsigned int                    recs_this_block,
 463         xfs_btree_bload_get_records_fn  get_records,
 464         struct xfs_btree_block          *block,
 465         void                            *priv)
 466 {
 467         unsigned int                    j = 1;
 468         int                             ret;
 469
 470         /* Fill the leaf block with records. */
 471         while (j <= recs_this_block) {
 472                 ret = get_records(cur, j, block, recs_this_block - j + 1, priv);
 473                 if (ret < 0)
 474                         return ret;
 475                 j += ret;
 476         }
 477
 478         return 0;
 479 }
 480
 481 /*
 482  * Load one node block with key/ptr pairs.
 483  *
 484  * child_ptr must point to a block within the next level down in the tree.  A
 485  * key/ptr entry will be created in the new node block to the block pointed to
 486  * by child_ptr.  On exit, child_ptr points to the next block on the child
 487  * level that needs processing.
 488  */
 489 STATIC int
 490 xfs_btree_bload_node(
 491         struct xfs_btree_cur    *cur,
 492         unsigned int            recs_this_block,
 493         union xfs_btree_ptr     *child_ptr,
 494         struct xfs_btree_block  *block)
 495 {
 496         unsigned int            j;
 497         int                     ret;
 498
 499         /* Fill the node block with keys and pointers. */
 500         for (j = 1; j <= recs_this_block; j++) {
 501                 union xfs_btree_key     child_key;
 502                 union xfs_btree_ptr     *block_ptr;
 503                 union xfs_btree_key     *block_key;
 504                 struct xfs_btree_block  *child_block;
 505                 struct xfs_buf          *child_bp;
 506
 507                 ASSERT(!xfs_btree_ptr_is_null(cur, child_ptr));
 508
 509                 /*
 510                  * Read the lower-level block in case the buffer for it has
 511                  * been reclaimed.  LRU refs will be set on the block, which is
 512                  * desirable if the new btree commits.
 513                  */
 514                 ret = xfs_btree_read_buf_block(cur, child_ptr, 0, &child_block,
 515                                 &child_bp);
 516                 if (ret)
 517                         return ret;
 518
 519                 block_ptr = xfs_btree_ptr_addr(cur, j, block);
 520                 xfs_btree_copy_ptrs(cur, block_ptr, child_ptr, 1);
 521
 522                 block_key = xfs_btree_key_addr(cur, j, block);
 523                 xfs_btree_get_keys(cur, child_block, &child_key);
 524                 xfs_btree_copy_keys(cur, block_key, &child_key, 1);
 525
 526                 xfs_btree_get_sibling(cur, child_block, child_ptr,
 527                                 XFS_BB_RIGHTSIB);
 528                 xfs_buf_relse(child_bp);
 529         }
 530
 531         return 0;
 532 }
 533
 534 /*
 535  * Compute the maximum number of records (or keyptrs) per block that we want to
 536  * install at this level in the btree.  Caller is responsible for having set
 537  * @cur->bc_ino.forksize to the desired fork size, if appropriate.
 538  */
 539 STATIC unsigned int
 540 xfs_btree_bload_max_npb(
 541         struct xfs_btree_cur    *cur,
 542         struct xfs_btree_bload  *bbl,
 543         unsigned int            level)
 544 {
 545         unsigned int            ret;
 546
 547         if (level == cur->bc_nlevels - 1 && cur->bc_ops->get_dmaxrecs)
 548                 return cur->bc_ops->get_dmaxrecs(cur, level);
 549
 550         ret = cur->bc_ops->get_maxrecs(cur, level);
 551         if (level == 0)
 552                 ret -= bbl->leaf_slack;
 553         else
 554                 ret -= bbl->node_slack;
 555         return ret;
 556 }
 557
 558 /*
 559  * Compute the desired number of records (or keyptrs) per block that we want to
 560  * install at this level in the btree, which must be somewhere between minrecs
 561  * and max_npb.  The caller is free to install fewer records per block.
 562  */
 563 STATIC unsigned int
 564 xfs_btree_bload_desired_npb(
 565         struct xfs_btree_cur    *cur,
 566         struct xfs_btree_bload  *bbl,
 567         unsigned int            level)
 568 {
 569         unsigned int            npb = xfs_btree_bload_max_npb(cur, bbl, level);
 570
 571         /* Root blocks are not subject to minrecs rules. */
 572         if (level == cur->bc_nlevels - 1)
 573                 return max(1U, npb);
 574
 575         return max_t(unsigned int, cur->bc_ops->get_minrecs(cur, level), npb);
 576 }
 577
 578 /*
 579  * Compute the number of records to be stored in each block at this level and
 580  * the number of blocks for this level.  For leaf levels, we must populate an
 581  * empty root block even if there are no records, so we have to have at least
 582  * one block.
 583  */
 584 STATIC void
 585 xfs_btree_bload_level_geometry(
 586         struct xfs_btree_cur    *cur,
 587         struct xfs_btree_bload  *bbl,
 588         unsigned int            level,
 589         uint64_t                nr_this_level,
 590         unsigned int            *avg_per_block,
 591         uint64_t                *blocks,
 592         uint64_t                *blocks_with_extra)
 593 {
 594         uint64_t                npb;
 595         uint64_t                dontcare;
 596         unsigned int            desired_npb;
 597         unsigned int            maxnr;
 598
 599         /*
 600          * Compute the absolute maximum number of records that we can store in
 601          * the ondisk block or inode root.
 602          */
 603         if (cur->bc_ops->get_dmaxrecs)
 604                 maxnr = cur->bc_ops->get_dmaxrecs(cur, level);
 605         else
 606                 maxnr = cur->bc_ops->get_maxrecs(cur, level);
 607
 608         /*
 609          * Compute the number of blocks we need to fill each block with the
 610          * desired number of records/keyptrs per block.  Because desired_npb
 611          * could be minrecs, we use regular integer division (which rounds
 612          * the block count down) so that in the next step the effective # of
 613          * items per block will never be less than desired_npb.
 614          */
 615         desired_npb = xfs_btree_bload_desired_npb(cur, bbl, level);
 616         *blocks = div64_u64_rem(nr_this_level, desired_npb, &dontcare);
 617         *blocks = max(1ULL, *blocks);
 618
 619         /*
 620          * Compute the number of records that we will actually put in each
 621          * block, assuming that we want to spread the records evenly between
 622          * the blocks.  Take care that the effective # of items per block (npb)
 623          * won't exceed maxrecs even for the blocks that get an extra record,
 624          * since desired_npb could be maxrecs, and in the previous step we
 625          * rounded the block count down.
 626          */
 627         npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra);
 628         if (npb > maxnr || (npb == maxnr && *blocks_with_extra > 0)) {
 629                 (*blocks)++;
 630                 npb = div64_u64_rem(nr_this_level, *blocks, blocks_with_extra);
 631         }
 632
 633         *avg_per_block = min_t(uint64_t, npb, nr_this_level);
 634
 635         trace_xfs_btree_bload_level_geometry(cur, level, nr_this_level,
 636                         *avg_per_block, desired_npb, *blocks,
 637                         *blocks_with_extra);
 638 }
 639
 640 /*
 641  * Ensure a slack value is appropriate for the btree.
 642  *
 643  * If the slack value is negative, set slack so that we fill the block to
 644  * halfway between minrecs and maxrecs.  Make sure the slack is never so large
 645  * that we can underflow minrecs.
 646  */
 647 static void
 648 xfs_btree_bload_ensure_slack(
 649         struct xfs_btree_cur    *cur,
 650         int                     *slack,
 651         int                     level)
 652 {
 653         int                     maxr;
 654         int                     minr;
 655
 656         maxr = cur->bc_ops->get_maxrecs(cur, level);
 657         minr = cur->bc_ops->get_minrecs(cur, level);
 658
 659         /*
 660          * If slack is negative, automatically set slack so that we load the
 661          * btree block approximately halfway between minrecs and maxrecs.
 662          * Generally, this will net us 75% loading.
 663          */
 664         if (*slack < 0)
 665                 *slack = maxr - ((maxr + minr) >> 1);
 666
 667         *slack = min(*slack, maxr - minr);
 668 }
 669
 670 /*
 671  * Prepare a btree cursor for a bulk load operation by computing the geometry
 672  * fields in bbl.  Caller must ensure that the btree cursor is a staging
 673  * cursor.  This function can be called multiple times.
 674  */
 675 int
 676 xfs_btree_bload_compute_geometry(
 677         struct xfs_btree_cur    *cur,
 678         struct xfs_btree_bload  *bbl,
 679         uint64_t                nr_records)
 680 {
 681         uint64_t                nr_blocks = 0;
 682         uint64_t                nr_this_level;
 683
 684         ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
 685
 686         /*
 687          * Make sure that the slack values make sense for traditional leaf and
 688          * node blocks.  Inode-rooted btrees will return different minrecs and
 689          * maxrecs values for the root block (bc_nlevels == level - 1).  We're
 690          * checking levels 0 and 1 here, so set bc_nlevels such that the btree
 691          * code doesn't interpret either as the root level.
 692          */
 693         cur->bc_nlevels = cur->bc_maxlevels - 1;
 694         xfs_btree_bload_ensure_slack(cur, &bbl->leaf_slack, 0);
 695         xfs_btree_bload_ensure_slack(cur, &bbl->node_slack, 1);
 696
 697         bbl->nr_records = nr_this_level = nr_records;
 698         for (cur->bc_nlevels = 1; cur->bc_nlevels <= cur->bc_maxlevels;) {
 699                 uint64_t        level_blocks;
 700                 uint64_t        dontcare64;
 701                 unsigned int    level = cur->bc_nlevels - 1;
 702                 unsigned int    avg_per_block;
 703
 704                 xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
 705                                 &avg_per_block, &level_blocks, &dontcare64);
 706
 707                 if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
 708                         /*
 709                          * If all the items we want to store at this level
 710                          * would fit in the inode root block, then we have our
 711                          * btree root and are done.
 712                          *
 713                          * Note that bmap btrees forbid records in the root.
 714                          */
 715                         if (level != 0 && nr_this_level <= avg_per_block) {
 716                                 nr_blocks++;
 717                                 break;
 718                         }
 719
 720                         /*
 721                          * Otherwise, we have to store all the items for this
 722                          * level in traditional btree blocks and therefore need
 723                          * another level of btree to point to those blocks.
 724                          *
 725                          * We have to re-compute the geometry for each level of
 726                          * an inode-rooted btree because the geometry differs
 727                          * between a btree root in an inode fork and a
 728                          * traditional btree block.
 729                          *
 730                          * This distinction is made in the btree code based on
 731                          * whether level == bc_nlevels - 1.  Based on the
 732                          * previous root block size check against the root
 733                          * block geometry, we know that we aren't yet ready to
 734                          * populate the root.  Increment bc_nevels and
 735                          * recalculate the geometry for a traditional
 736                          * block-based btree level.
 737                          */
 738                         cur->bc_nlevels++;
 739                         ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
 740                         xfs_btree_bload_level_geometry(cur, bbl, level,
 741                                         nr_this_level, &avg_per_block,
 742                                         &level_blocks, &dontcare64);
 743                 } else {
 744                         /*
 745                          * If all the items we want to store at this level
 746                          * would fit in a single root block, we're done.
 747                          */
 748                         if (nr_this_level <= avg_per_block) {
 749                                 nr_blocks++;
 750                                 break;
 751                         }
 752
 753                         /* Otherwise, we need another level of btree. */
 754                         cur->bc_nlevels++;
 755                         ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
 756                 }
 757
 758                 nr_blocks += level_blocks;
 759                 nr_this_level = level_blocks;
 760         }
 761
 762         if (cur->bc_nlevels > cur->bc_maxlevels)
 763                 return -EOVERFLOW;
 764
 765         bbl->btree_height = cur->bc_nlevels;
 766         if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE)
 767                 bbl->nr_blocks = nr_blocks - 1;
 768         else
 769                 bbl->nr_blocks = nr_blocks;
 770         return 0;
 771 }
 772
 773 /* Bulk load a btree given the parameters and geometry established in bbl. */
 774 int
 775 xfs_btree_bload(
 776         struct xfs_btree_cur            *cur,
 777         struct xfs_btree_bload          *bbl,
 778         void                            *priv)
 779 {
 780         struct list_head                buffers_list;
 781         union xfs_btree_ptr             child_ptr;
 782         union xfs_btree_ptr             ptr;
 783         struct xfs_buf                  *bp = NULL;
 784         struct xfs_btree_block          *block = NULL;
 785         uint64_t                        nr_this_level = bbl->nr_records;
 786         uint64_t                        blocks;
 787         uint64_t                        i;
 788         uint64_t                        blocks_with_extra;
 789         uint64_t                        total_blocks = 0;
 790         unsigned int                    avg_per_block;
 791         unsigned int                    level = 0;
 792         int                             ret;
 793
 794         ASSERT(cur->bc_flags & XFS_BTREE_STAGING);
 795
 796         INIT_LIST_HEAD(&buffers_list);
 797         cur->bc_nlevels = bbl->btree_height;
 798         xfs_btree_set_ptr_null(cur, &child_ptr);
 799         xfs_btree_set_ptr_null(cur, &ptr);
 800         bbl->nr_dirty = 0;
 801
 802         xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
 803                         &avg_per_block, &blocks, &blocks_with_extra);
 804
 805         /* Load each leaf block. */
 806         for (i = 0; i < blocks; i++) {
 807                 unsigned int            nr_this_block = avg_per_block;
 808
 809                 /*
 810                  * Due to rounding, btree blocks will not be evenly populated
 811                  * in most cases.  blocks_with_extra tells us how many blocks
 812                  * will receive an extra record to distribute the excess across
 813                  * the current level as evenly as possible.
 814                  */
 815                 if (i < blocks_with_extra)
 816                         nr_this_block++;
 817
 818                 ret = xfs_btree_bload_prep_block(cur, bbl, &buffers_list, level,
 819                                 nr_this_block, &ptr, &bp, &block, priv);
 820                 if (ret)
 821                         goto out;
 822
 823                 trace_xfs_btree_bload_block(cur, level, i, blocks, &ptr,
 824                                 nr_this_block);
 825
 826                 ret = xfs_btree_bload_leaf(cur, nr_this_block, bbl->get_records,
 827                                 block, priv);
 828                 if (ret)
 829                         goto out;
 830
 831                 /*
 832                  * Record the leftmost leaf pointer so we know where to start
 833                  * with the first node level.
 834                  */
 835                 if (i == 0)
 836                         xfs_btree_copy_ptrs(cur, &child_ptr, &ptr, 1);
 837         }
 838         total_blocks += blocks;
 839
 840         ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp);
 841         if (ret)
 842                 goto out;
 843
 844         /* Populate the internal btree nodes. */
 845         for (level = 1; level < cur->bc_nlevels; level++) {
 846                 union xfs_btree_ptr     first_ptr;
 847
 848                 nr_this_level = blocks;
 849                 block = NULL;
 850                 xfs_btree_set_ptr_null(cur, &ptr);
 851
 852                 xfs_btree_bload_level_geometry(cur, bbl, level, nr_this_level,
 853                                 &avg_per_block, &blocks, &blocks_with_extra);
 854
 855                 /* Load each node block. */
 856                 for (i = 0; i < blocks; i++) {
 857                         unsigned int    nr_this_block = avg_per_block;
 858
 859                         if (i < blocks_with_extra)
 860                                 nr_this_block++;
 861
 862                         ret = xfs_btree_bload_prep_block(cur, bbl,
 863                                         &buffers_list, level, nr_this_block,
 864                                         &ptr, &bp, &block, priv);
 865                         if (ret)
 866                                 goto out;
 867
 868                         trace_xfs_btree_bload_block(cur, level, i, blocks,
 869                                         &ptr, nr_this_block);
 870
 871                         ret = xfs_btree_bload_node(cur, nr_this_block,
 872                                         &child_ptr, block);
 873                         if (ret)
 874                                 goto out;
 875
 876                         /*
 877                          * Record the leftmost node pointer so that we know
 878                          * where to start the next node level above this one.
 879                          */
 880                         if (i == 0)
 881                                 xfs_btree_copy_ptrs(cur, &first_ptr, &ptr, 1);
 882                 }
 883                 total_blocks += blocks;
 884
 885                 ret = xfs_btree_bload_drop_buf(bbl, &buffers_list, &bp);
 886                 if (ret)
 887                         goto out;
 888
 889                 xfs_btree_copy_ptrs(cur, &child_ptr, &first_ptr, 1);
 890         }
 891
 892         /* Initialize the new root. */
 893         if (cur->bc_flags & XFS_BTREE_ROOT_IN_INODE) {
 894                 ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
 895                 cur->bc_ino.ifake->if_levels = cur->bc_nlevels;
 896                 cur->bc_ino.ifake->if_blocks = total_blocks - 1;
 897         } else {
 898                 cur->bc_ag.afake->af_root = be32_to_cpu(ptr.s);
 899                 cur->bc_ag.afake->af_levels = cur->bc_nlevels;
 900                 cur->bc_ag.afake->af_blocks = total_blocks;
 901         }
 902
 903         /*
 904          * Write the new blocks to disk.  If the ordered list isn't empty after
 905          * that, then something went wrong and we have to fail.  This should
 906          * never happen, but we'll check anyway.
 907          */
 908         ret = xfs_buf_delwri_submit(&buffers_list);
 909         if (ret)
 910                 goto out;
 911         if (!list_empty(&buffers_list)) {
 912                 ASSERT(list_empty(&buffers_list));
 913                 ret = -EIO;
 914         }
 915
 916 out:
 917         xfs_buf_delwri_cancel(&buffers_list);
 918         if (bp)
 919                 xfs_buf_relse(bp);
 920         return ret;
 921 }