GNU Linux-libre 6.9-gnu
[releases.git] / fs / xfs / libxfs / xfs_btree.c
1 // SPDX-License-Identifier: GPL-2.0
2 /*
3  * Copyright (c) 2000-2002,2005 Silicon Graphics, Inc.
4  * All Rights Reserved.
5  */
6 #include "xfs.h"
7 #include "xfs_fs.h"
8 #include "xfs_shared.h"
9 #include "xfs_format.h"
10 #include "xfs_log_format.h"
11 #include "xfs_trans_resv.h"
12 #include "xfs_bit.h"
13 #include "xfs_mount.h"
14 #include "xfs_inode.h"
15 #include "xfs_trans.h"
16 #include "xfs_buf_item.h"
17 #include "xfs_btree.h"
18 #include "xfs_errortag.h"
19 #include "xfs_error.h"
20 #include "xfs_trace.h"
21 #include "xfs_alloc.h"
22 #include "xfs_log.h"
23 #include "xfs_btree_staging.h"
24 #include "xfs_ag.h"
25 #include "xfs_alloc_btree.h"
26 #include "xfs_ialloc_btree.h"
27 #include "xfs_bmap_btree.h"
28 #include "xfs_rmap_btree.h"
29 #include "xfs_refcount_btree.h"
30 #include "xfs_health.h"
31 #include "xfs_buf_mem.h"
32 #include "xfs_btree_mem.h"
33
34 /*
35  * Btree magic numbers.
36  */
37 uint32_t
38 xfs_btree_magic(
39         struct xfs_mount                *mp,
40         const struct xfs_btree_ops      *ops)
41 {
42         int                             idx = xfs_has_crc(mp) ? 1 : 0;
43         __be32                          magic = ops->buf_ops->magic[idx];
44
45         /* Ensure we asked for crc for crc-only magics. */
46         ASSERT(magic != 0);
47         return be32_to_cpu(magic);
48 }
49
50 /*
51  * These sibling pointer checks are optimised for null sibling pointers. This
52  * happens a lot, and we don't need to byte swap at runtime if the sibling
53  * pointer is NULL.
54  *
 * These are explicitly marked as inline because the cost of calling them as
56  * functions instead of inlining them is about 36 bytes extra code per call site
57  * on x86-64. Yes, gcc-11 fails to inline them, and explicit inlining of these
58  * two sibling check functions reduces the compiled code size by over 300
59  * bytes.
60  */
61 static inline xfs_failaddr_t
62 xfs_btree_check_fsblock_siblings(
63         struct xfs_mount        *mp,
64         xfs_fsblock_t           fsb,
65         __be64                  dsibling)
66 {
67         xfs_fsblock_t           sibling;
68
69         if (dsibling == cpu_to_be64(NULLFSBLOCK))
70                 return NULL;
71
72         sibling = be64_to_cpu(dsibling);
73         if (sibling == fsb)
74                 return __this_address;
75         if (!xfs_verify_fsbno(mp, sibling))
76                 return __this_address;
77         return NULL;
78 }
79
80 static inline xfs_failaddr_t
81 xfs_btree_check_memblock_siblings(
82         struct xfs_buftarg      *btp,
83         xfbno_t                 bno,
84         __be64                  dsibling)
85 {
86         xfbno_t                 sibling;
87
88         if (dsibling == cpu_to_be64(NULLFSBLOCK))
89                 return NULL;
90
91         sibling = be64_to_cpu(dsibling);
92         if (sibling == bno)
93                 return __this_address;
94         if (!xmbuf_verify_daddr(btp, xfbno_to_daddr(sibling)))
95                 return __this_address;
96         return NULL;
97 }
98
99 static inline xfs_failaddr_t
100 xfs_btree_check_agblock_siblings(
101         struct xfs_perag        *pag,
102         xfs_agblock_t           agbno,
103         __be32                  dsibling)
104 {
105         xfs_agblock_t           sibling;
106
107         if (dsibling == cpu_to_be32(NULLAGBLOCK))
108                 return NULL;
109
110         sibling = be32_to_cpu(dsibling);
111         if (sibling == agbno)
112                 return __this_address;
113         if (!xfs_verify_agbno(pag, sibling))
114                 return __this_address;
115         return NULL;
116 }
117
/*
 * Verify the common header fields of a long-format (64-bit pointer) btree
 * block: owner uuid, self-referencing daddr and header padding (CRC
 * filesystems only), magic number, level, and record count.  Sibling
 * pointers are left to the callers, which know where the block lives.
 * Returns the address of the failing check, or NULL if the header is ok.
 */
static xfs_failaddr_t
__xfs_btree_check_lblock_hdr(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block,
	int			level,
	struct xfs_buf		*bp)	/* NULL for an inode-fork root block */
{
	struct xfs_mount	*mp = cur->bc_mp;

	if (xfs_has_crc(mp)) {
		if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
			return __this_address;
		/* The block must record the daddr it was read from. */
		if (block->bb_u.l.bb_blkno !=
		    cpu_to_be64(bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL))
			return __this_address;
		if (block->bb_u.l.bb_pad != cpu_to_be32(0))
			return __this_address;
	}

	if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(mp, cur->bc_ops))
		return __this_address;
	if (be16_to_cpu(block->bb_level) != level)
		return __this_address;
	/* More records than this level can hold implies corruption. */
	if (be16_to_cpu(block->bb_numrecs) >
	    cur->bc_ops->get_maxrecs(cur, level))
		return __this_address;

	return NULL;
}
147
148 /*
149  * Check a long btree block header.  Return the address of the failing check,
150  * or NULL if everything is ok.
151  */
static xfs_failaddr_t
__xfs_btree_check_fsblock(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block,
	int			level,
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = cur->bc_mp;
	xfs_failaddr_t		fa;
	xfs_fsblock_t		fsb;

	/* Validate the common long-format header fields first. */
	fa = __xfs_btree_check_lblock_hdr(cur, block, level, bp);
	if (fa)
		return fa;

	/*
	 * For inode-rooted btrees, the root block sits in the inode fork.  In
	 * that case bp is NULL, and the block must not have any siblings.
	 */
	if (!bp) {
		if (block->bb_u.l.bb_leftsib != cpu_to_be64(NULLFSBLOCK))
			return __this_address;
		if (block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK))
			return __this_address;
		return NULL;
	}

	/* Both siblings must be NULL or valid fsblocks, and not self-links. */
	fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
	fa = xfs_btree_check_fsblock_siblings(mp, fsb,
			block->bb_u.l.bb_leftsib);
	if (!fa)
		fa = xfs_btree_check_fsblock_siblings(mp, fsb,
				block->bb_u.l.bb_rightsib);
	return fa;
}
187
188 /*
189  * Check an in-memory btree block header.  Return the address of the failing
190  * check, or NULL if everything is ok.
191  */
static xfs_failaddr_t
__xfs_btree_check_memblock(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block,
	int			level,
	struct xfs_buf		*bp)
{
	struct xfs_buftarg	*btp = cur->bc_mem.xfbtree->target;
	xfs_failaddr_t		fa;
	xfbno_t			bno;

	/* In-memory btrees use the long (64-bit pointer) header format. */
	fa = __xfs_btree_check_lblock_hdr(cur, block, level, bp);
	if (fa)
		return fa;

	/* Both siblings must be NULL or valid in-memory blocks, not self-links. */
	bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp));
	fa = xfs_btree_check_memblock_siblings(btp, bno,
			block->bb_u.l.bb_leftsib);
	if (!fa)
		fa = xfs_btree_check_memblock_siblings(btp, bno,
				block->bb_u.l.bb_rightsib);
	return fa;
}
215
216 /*
217  * Check a short btree block header.  Return the address of the failing check,
218  * or NULL if everything is ok.
219  */
static xfs_failaddr_t
__xfs_btree_check_agblock(
	struct xfs_btree_cur	*cur,
	struct xfs_btree_block	*block,
	int			level,
	struct xfs_buf		*bp)
{
	struct xfs_mount	*mp = cur->bc_mp;
	struct xfs_perag	*pag = cur->bc_ag.pag;
	xfs_failaddr_t		fa;
	xfs_agblock_t		agbno;

	if (xfs_has_crc(mp)) {
		if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
			return __this_address;
		/* The block must record the daddr it was read from. */
		if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
			return __this_address;
	}

	if (be32_to_cpu(block->bb_magic) != xfs_btree_magic(mp, cur->bc_ops))
		return __this_address;
	if (be16_to_cpu(block->bb_level) != level)
		return __this_address;
	/* More records than this level can hold implies corruption. */
	if (be16_to_cpu(block->bb_numrecs) >
	    cur->bc_ops->get_maxrecs(cur, level))
		return __this_address;

	/* Both siblings must be NULL or valid AG blocks, and not self-links. */
	agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
	fa = xfs_btree_check_agblock_siblings(pag, agbno,
			block->bb_u.s.bb_leftsib);
	if (!fa)
		fa = xfs_btree_check_agblock_siblings(pag, agbno,
				block->bb_u.s.bb_rightsib);
	return fa;
}
255
256 /*
257  * Internal btree block check.
258  *
259  * Return NULL if the block is ok or the address of the failed check otherwise.
260  */
261 xfs_failaddr_t
262 __xfs_btree_check_block(
263         struct xfs_btree_cur    *cur,
264         struct xfs_btree_block  *block,
265         int                     level,
266         struct xfs_buf          *bp)
267 {
268         switch (cur->bc_ops->type) {
269         case XFS_BTREE_TYPE_MEM:
270                 return __xfs_btree_check_memblock(cur, block, level, bp);
271         case XFS_BTREE_TYPE_AG:
272                 return __xfs_btree_check_agblock(cur, block, level, bp);
273         case XFS_BTREE_TYPE_INODE:
274                 return __xfs_btree_check_fsblock(cur, block, level, bp);
275         default:
276                 ASSERT(0);
277                 return __this_address;
278         }
279 }
280
281 static inline unsigned int xfs_btree_block_errtag(struct xfs_btree_cur *cur)
282 {
283         if (cur->bc_ops->ptr_len == XFS_BTREE_SHORT_PTR_LEN)
284                 return XFS_ERRTAG_BTREE_CHECK_SBLOCK;
285         return XFS_ERRTAG_BTREE_CHECK_LBLOCK;
286 }
287
288 /*
289  * Debug routine: check that block header is ok.
290  */
int
xfs_btree_check_block(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	struct xfs_btree_block	*block,	/* generic btree block pointer */
	int			level,	/* level of the btree block */
	struct xfs_buf		*bp)	/* buffer containing block, if any */
{
	struct xfs_mount	*mp = cur->bc_mp;
	xfs_failaddr_t		fa;

	fa = __xfs_btree_check_block(cur, block, level, bp);
	/*
	 * Treat a failed check (or an injected error from the btree-check
	 * errortag) as corruption: trace it, mark the btree sick for health
	 * reporting, and fail with -EFSCORRUPTED.
	 */
	if (XFS_IS_CORRUPT(mp, fa != NULL) ||
	    XFS_TEST_ERROR(false, mp, xfs_btree_block_errtag(cur))) {
		if (bp)
			trace_xfs_btree_corrupt(bp, _RET_IP_);
		xfs_btree_mark_sick(cur);
		return -EFSCORRUPTED;
	}
	return 0;
}
311
/*
 * Check that the index-th btree pointer starting at ptr is a plausible
 * block number for this btree type.  level is the level of the block
 * holding the pointer, so pointers only exist above level 0.
 * Returns 0 if the pointer is ok, -EFSCORRUPTED otherwise.
 */
int
__xfs_btree_check_ptr(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr,
	int				index,
	int				level)
{
	/* Leaf blocks (level 0) hold records, never pointers. */
	if (level <= 0)
		return -EFSCORRUPTED;

	switch (cur->bc_ops->type) {
	case XFS_BTREE_TYPE_MEM:
		if (!xfbtree_verify_bno(cur->bc_mem.xfbtree,
				be64_to_cpu((&ptr->l)[index])))
			return -EFSCORRUPTED;
		break;
	case XFS_BTREE_TYPE_INODE:
		/* Long pointers are filesystem-wide block numbers. */
		if (!xfs_verify_fsbno(cur->bc_mp,
				be64_to_cpu((&ptr->l)[index])))
			return -EFSCORRUPTED;
		break;
	case XFS_BTREE_TYPE_AG:
		/* Short pointers are AG-relative block numbers. */
		if (!xfs_verify_agbno(cur->bc_ag.pag,
				be32_to_cpu((&ptr->s)[index])))
			return -EFSCORRUPTED;
		break;
	}

	return 0;
}
342
343 /*
344  * Check that a given (indexed) btree pointer at a certain level of a
345  * btree is valid and doesn't point past where it should.
346  */
static int
xfs_btree_check_ptr(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr,
	int				index,
	int				level)
{
	int				error;

	error = __xfs_btree_check_ptr(cur, ptr, index, level);
	if (error) {
		/* Log a cursor-type-specific corruption report. */
		switch (cur->bc_ops->type) {
		case XFS_BTREE_TYPE_MEM:
			xfs_err(cur->bc_mp,
"In-memory: Corrupt %sbt flags 0x%x pointer at level %d index %d fa %pS.",
				cur->bc_ops->name, cur->bc_flags, level, index,
				__this_address);
			break;
		case XFS_BTREE_TYPE_INODE:
			xfs_err(cur->bc_mp,
"Inode %llu fork %d: Corrupt %sbt pointer at level %d index %d.",
				cur->bc_ino.ip->i_ino,
				cur->bc_ino.whichfork, cur->bc_ops->name,
				level, index);
			break;
		case XFS_BTREE_TYPE_AG:
			xfs_err(cur->bc_mp,
"AG %u: Corrupt %sbt pointer at level %d index %d.",
				cur->bc_ag.pag->pag_agno, cur->bc_ops->name,
				level, index);
			break;
		}
		/* Record the corruption for health reporting. */
		xfs_btree_mark_sick(cur);
	}

	return error;
}
384
#ifdef DEBUG
/* Debug builds validate every btree pointer; non-debug builds skip it. */
# define xfs_btree_debug_check_ptr	xfs_btree_check_ptr
#else
# define xfs_btree_debug_check_ptr(...)	(0)
#endif
390
391 /*
392  * Calculate CRC on the whole btree block and stuff it into the
393  * long-form btree header.
394  *
 * Prior to calculating the CRC, pull the LSN out of the buffer log item and put
396  * it into the buffer so recovery knows what the last modification was that made
397  * it to disk.
398  */
void
xfs_btree_fsblock_calc_crc(
	struct xfs_buf		*bp)
{
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	struct xfs_buf_log_item	*bip = bp->b_log_item;

	/* Checksums only exist on CRC-enabled filesystems. */
	if (!xfs_has_crc(bp->b_mount))
		return;
	/* Stash the last-modification LSN for recovery before checksumming. */
	if (bip)
		block->bb_u.l.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
	xfs_buf_update_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
}
412
413 bool
414 xfs_btree_fsblock_verify_crc(
415         struct xfs_buf          *bp)
416 {
417         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
418         struct xfs_mount        *mp = bp->b_mount;
419
420         if (xfs_has_crc(mp)) {
421                 if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.l.bb_lsn)))
422                         return false;
423                 return xfs_buf_verify_cksum(bp, XFS_BTREE_LBLOCK_CRC_OFF);
424         }
425
426         return true;
427 }
428
429 /*
430  * Calculate CRC on the whole btree block and stuff it into the
431  * short-form btree header.
432  *
 * Prior to calculating the CRC, pull the LSN out of the buffer log item and put
434  * it into the buffer so recovery knows what the last modification was that made
435  * it to disk.
436  */
void
xfs_btree_agblock_calc_crc(
	struct xfs_buf		*bp)
{
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);
	struct xfs_buf_log_item	*bip = bp->b_log_item;

	/* Checksums only exist on CRC-enabled filesystems. */
	if (!xfs_has_crc(bp->b_mount))
		return;
	/* Stash the last-modification LSN for recovery before checksumming. */
	if (bip)
		block->bb_u.s.bb_lsn = cpu_to_be64(bip->bli_item.li_lsn);
	xfs_buf_update_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
}
450
451 bool
452 xfs_btree_agblock_verify_crc(
453         struct xfs_buf          *bp)
454 {
455         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
456         struct xfs_mount        *mp = bp->b_mount;
457
458         if (xfs_has_crc(mp)) {
459                 if (!xfs_log_check_lsn(mp, be64_to_cpu(block->bb_u.s.bb_lsn)))
460                         return false;
461                 return xfs_buf_verify_cksum(bp, XFS_BTREE_SBLOCK_CRC_OFF);
462         }
463
464         return true;
465 }
466
/*
 * Free a btree block via the cursor's free_block method.  On success the
 * buffer is invalidated in the transaction and the free stat is bumped.
 * Returns 0 or a negative errno.
 */
static int
xfs_btree_free_block(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp)
{
	int			error;

	trace_xfs_btree_free_block(cur, bp);

	/*
	 * Don't allow block freeing for a staging cursor, because staging
	 * cursors do not support regular btree modifications.
	 */
	if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) {
		ASSERT(0);
		return -EFSCORRUPTED;
	}

	error = cur->bc_ops->free_block(cur, bp);
	if (!error) {
		xfs_trans_binval(cur->bc_tp, bp);
		XFS_BTREE_STATS_INC(cur, free);
	}
	return error;
}
492
493 /*
494  * Delete the btree cursor.
495  */
void
xfs_btree_del_cursor(
	struct xfs_btree_cur	*cur,		/* btree cursor */
	int			error)		/* del because of error */
{
	int			i;		/* btree level */

	/*
	 * Clear the buffer pointers and release the buffers. If we're doing
	 * this because of an error, inspect all of the entries in the bc_bufs
	 * array for buffers to be unlocked. This is because some of the btree
	 * code works from level n down to 0, and if we get an error along the
	 * way we won't have initialized all the entries down to 0.
	 */
	for (i = 0; i < cur->bc_nlevels; i++) {
		if (cur->bc_levels[i].bp)
			xfs_trans_brelse(cur->bc_tp, cur->bc_levels[i].bp);
		else if (!error)
			break;
	}

	/*
	 * If we are doing a BMBT update, the number of unaccounted blocks
	 * allocated during this cursor life time should be zero. If it's not
	 * zero, then we should be shut down or on our way to shutdown due to
	 * cancelling a dirty transaction on error.
	 */
	ASSERT(!xfs_btree_is_bmap(cur->bc_ops) || cur->bc_bmap.allocated == 0 ||
	       xfs_is_shutdown(cur->bc_mp) || error != 0);

	/* Drop any perag reference held by this cursor type. */
	switch (cur->bc_ops->type) {
	case XFS_BTREE_TYPE_AG:
		if (cur->bc_ag.pag)
			xfs_perag_put(cur->bc_ag.pag);
		break;
	case XFS_BTREE_TYPE_INODE:
		/* nothing to do */
		break;
	case XFS_BTREE_TYPE_MEM:
		if (cur->bc_mem.pag)
			xfs_perag_put(cur->bc_mem.pag);
		break;
	}

	/* Finally return the cursor itself to its slab cache. */
	kmem_cache_free(cur->bc_cache, cur);
}
542
543 /* Return the buffer target for this btree's buffer. */
544 static inline struct xfs_buftarg *
545 xfs_btree_buftarg(
546         struct xfs_btree_cur    *cur)
547 {
548         if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM)
549                 return cur->bc_mem.xfbtree->target;
550         return cur->bc_mp->m_ddev_targp;
551 }
552
553 /* Return the block size (in units of 512b sectors) for this btree. */
554 static inline unsigned int
555 xfs_btree_bbsize(
556         struct xfs_btree_cur    *cur)
557 {
558         if (cur->bc_ops->type == XFS_BTREE_TYPE_MEM)
559                 return XFBNO_BBSIZE;
560         return cur->bc_mp->m_bsize;
561 }
562
563 /*
564  * Duplicate the btree cursor.
565  * Allocate a new one, copy the record, re-get the buffers.
566  */
int						/* error */
xfs_btree_dup_cursor(
	struct xfs_btree_cur	*cur,		/* input cursor */
	struct xfs_btree_cur	**ncur)		/* output cursor */
{
	struct xfs_mount	*mp = cur->bc_mp;
	struct xfs_trans	*tp = cur->bc_tp;
	struct xfs_buf		*bp;
	struct xfs_btree_cur	*new;
	int			error;
	int			i;

	/*
	 * Don't allow staging cursors to be duplicated because they're supposed
	 * to be kept private to a single thread.
	 */
	if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) {
		ASSERT(0);
		return -EFSCORRUPTED;
	}

	/*
	 * Allocate a new cursor like the old one.
	 */
	new = cur->bc_ops->dup_cursor(cur);

	/*
	 * Copy the record currently in the cursor.
	 */
	new->bc_rec = cur->bc_rec;

	/*
	 * For each level current, re-get the buffer and copy the ptr value.
	 */
	for (i = 0; i < new->bc_nlevels; i++) {
		new->bc_levels[i].ptr = cur->bc_levels[i].ptr;
		new->bc_levels[i].ra = cur->bc_levels[i].ra;
		bp = cur->bc_levels[i].bp;
		if (bp) {
			/* Re-read the block to take a fresh buffer reference. */
			error = xfs_trans_read_buf(mp, tp,
					xfs_btree_buftarg(cur),
					xfs_buf_daddr(bp),
					xfs_btree_bbsize(cur), 0, &bp,
					cur->bc_ops->buf_ops);
			if (xfs_metadata_is_sick(error))
				xfs_btree_mark_sick(new);
			if (error) {
				/* Tear down the partially built duplicate. */
				xfs_btree_del_cursor(new, error);
				*ncur = NULL;
				return error;
			}
		}
		new->bc_levels[i].bp = bp;
	}
	*ncur = new;
	return 0;
}
624
625 /*
626  * XFS btree block layout and addressing:
627  *
628  * There are two types of blocks in the btree: leaf and non-leaf blocks.
629  *
 * A leaf block starts with a header, followed by records containing
 * the values.  A non-leaf block also starts with the same header, and
632  * then first contains lookup keys followed by an equal number of pointers
633  * to the btree blocks at the previous level.
634  *
635  *              +--------+-------+-------+-------+-------+-------+-------+
636  * Leaf:        | header | rec 1 | rec 2 | rec 3 | rec 4 | rec 5 | rec N |
637  *              +--------+-------+-------+-------+-------+-------+-------+
638  *
639  *              +--------+-------+-------+-------+-------+-------+-------+
640  * Non-Leaf:    | header | key 1 | key 2 | key N | ptr 1 | ptr 2 | ptr N |
641  *              +--------+-------+-------+-------+-------+-------+-------+
642  *
643  * The header is called struct xfs_btree_block for reasons better left unknown
644  * and comes in different versions for short (32bit) and long (64bit) block
645  * pointers.  The record and key structures are defined by the btree instances
646  * and opaque to the btree core.  The block pointers are simple disk endian
647  * integers, available in a short (32bit) and long (64bit) variant.
648  *
649  * The helpers below calculate the offset of a given record, key or pointer
650  * into a btree block (xfs_btree_*_offset) or return a pointer to the given
651  * record, key or pointer (xfs_btree_*_addr).  Note that all addressing
652  * inside the btree block is done using indices starting at one, not zero!
653  *
654  * If XFS_BTGEO_OVERLAPPING is set, then this btree supports keys containing
655  * overlapping intervals.  In such a tree, records are still sorted lowest to
656  * highest and indexed by the smallest key value that refers to the record.
657  * However, nodes are different: each pointer has two associated keys -- one
658  * indexing the lowest key available in the block(s) below (the same behavior
659  * as the key in a regular btree) and another indexing the highest key
660  * available in the block(s) below.  Because records are /not/ sorted by the
661  * highest key, all leaf block updates require us to compute the highest key
662  * that matches any record in the leaf and to recursively update the high keys
663  * in the nodes going further up in the tree, if necessary.  Nodes look like
664  * this:
665  *
666  *              +--------+-----+-----+-----+-----+-----+-------+-------+-----+
667  * Non-Leaf:    | header | lo1 | hi1 | lo2 | hi2 | ... | ptr 1 | ptr 2 | ... |
668  *              +--------+-----+-----+-----+-----+-----+-------+-------+-----+
669  *
670  * To perform an interval query on an overlapped tree, perform the usual
671  * depth-first search and use the low and high keys to decide if we can skip
672  * that particular node.  If a leaf node is reached, return the records that
673  * intersect the interval.  Note that an interval query may return numerous
674  * entries.  For a non-overlapped tree, simply search for the record associated
675  * with the lowest key and iterate forward until a non-matching record is
676  * found.  Section 14.3 ("Interval Trees") of _Introduction to Algorithms_ by
677  * Cormen, Leiserson, Rivest, and Stein (2nd or 3rd ed. only) discuss this in
678  * more detail.
679  *
680  * Why do we care about overlapping intervals?  Let's say you have a bunch of
681  * reverse mapping records on a reflink filesystem:
682  *
683  * 1: +- file A startblock B offset C length D -----------+
684  * 2:      +- file E startblock F offset G length H --------------+
685  * 3:      +- file I startblock F offset J length K --+
686  * 4:                                                        +- file L... --+
687  *
688  * Now say we want to map block (B+D) into file A at offset (C+D).  Ideally,
689  * we'd simply increment the length of record 1.  But how do we find the record
690  * that ends at (B+D-1) (i.e. record 1)?  A LE lookup of (B+D-1) would return
691  * record 3 because the keys are ordered first by startblock.  An interval
692  * query would return records 1 and 2 because they both overlap (B+D-1), and
693  * from that we can pick out record 1 as the appropriate left neighbor.
694  *
695  * In the non-overlapped case you can do a LE lookup and decrement the cursor
696  * because a record's interval must end before the next record.
697  */
698
699 /*
700  * Return size of the btree block header for this btree instance.
701  */
702 static inline size_t xfs_btree_block_len(struct xfs_btree_cur *cur)
703 {
704         if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
705                 if (xfs_has_crc(cur->bc_mp))
706                         return XFS_BTREE_LBLOCK_CRC_LEN;
707                 return XFS_BTREE_LBLOCK_LEN;
708         }
709         if (xfs_has_crc(cur->bc_mp))
710                 return XFS_BTREE_SBLOCK_CRC_LEN;
711         return XFS_BTREE_SBLOCK_LEN;
712 }
713
714 /*
715  * Calculate offset of the n-th record in a btree block.
716  */
717 STATIC size_t
718 xfs_btree_rec_offset(
719         struct xfs_btree_cur    *cur,
720         int                     n)
721 {
722         return xfs_btree_block_len(cur) +
723                 (n - 1) * cur->bc_ops->rec_len;
724 }
725
726 /*
727  * Calculate offset of the n-th key in a btree block.
728  */
729 STATIC size_t
730 xfs_btree_key_offset(
731         struct xfs_btree_cur    *cur,
732         int                     n)
733 {
734         return xfs_btree_block_len(cur) +
735                 (n - 1) * cur->bc_ops->key_len;
736 }
737
738 /*
739  * Calculate offset of the n-th high key in a btree block.
740  */
741 STATIC size_t
742 xfs_btree_high_key_offset(
743         struct xfs_btree_cur    *cur,
744         int                     n)
745 {
746         return xfs_btree_block_len(cur) +
747                 (n - 1) * cur->bc_ops->key_len + (cur->bc_ops->key_len / 2);
748 }
749
750 /*
751  * Calculate offset of the n-th block pointer in a btree block.
752  */
753 STATIC size_t
754 xfs_btree_ptr_offset(
755         struct xfs_btree_cur    *cur,
756         int                     n,
757         int                     level)
758 {
759         return xfs_btree_block_len(cur) +
760                 cur->bc_ops->get_maxrecs(cur, level) * cur->bc_ops->key_len +
761                 (n - 1) * cur->bc_ops->ptr_len;
762 }
763
764 /*
765  * Return a pointer to the n-th record in the btree block.
766  */
union xfs_btree_rec *
xfs_btree_rec_addr(
	struct xfs_btree_cur	*cur,
	int			n,
	struct xfs_btree_block	*block)
{
	char	*base = (char *)block;

	return (union xfs_btree_rec *)(base + xfs_btree_rec_offset(cur, n));
}
776
777 /*
778  * Return a pointer to the n-th key in the btree block.
779  */
union xfs_btree_key *
xfs_btree_key_addr(
	struct xfs_btree_cur	*cur,
	int			n,
	struct xfs_btree_block	*block)
{
	char	*base = (char *)block;

	return (union xfs_btree_key *)(base + xfs_btree_key_offset(cur, n));
}
789
790 /*
791  * Return a pointer to the n-th high key in the btree block.
792  */
union xfs_btree_key *
xfs_btree_high_key_addr(
	struct xfs_btree_cur	*cur,
	int			n,
	struct xfs_btree_block	*block)
{
	char	*base = (char *)block;

	return (union xfs_btree_key *)(base + xfs_btree_high_key_offset(cur, n));
}
802
803 /*
804  * Return a pointer to the n-th block pointer in the btree block.
805  */
806 union xfs_btree_ptr *
807 xfs_btree_ptr_addr(
808         struct xfs_btree_cur    *cur,
809         int                     n,
810         struct xfs_btree_block  *block)
811 {
812         int                     level = xfs_btree_get_level(block);
813
814         ASSERT(block->bb_level != 0);
815
816         return (union xfs_btree_ptr *)
817                 ((char *)block + xfs_btree_ptr_offset(cur, n, level));
818 }
819
820 struct xfs_ifork *
821 xfs_btree_ifork_ptr(
822         struct xfs_btree_cur    *cur)
823 {
824         ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
825
826         if (cur->bc_flags & XFS_BTREE_STAGING)
827                 return cur->bc_ino.ifake->if_fork;
828         return xfs_ifork_ptr(cur->bc_ino.ip, cur->bc_ino.whichfork);
829 }
830
831 /*
832  * Get the root block which is stored in the inode.
833  *
834  * For now this btree implementation assumes the btree root is always
835  * stored in the if_broot field of an inode fork.
836  */
837 STATIC struct xfs_btree_block *
838 xfs_btree_get_iroot(
839         struct xfs_btree_cur    *cur)
840 {
841         struct xfs_ifork        *ifp = xfs_btree_ifork_ptr(cur);
842
843         return (struct xfs_btree_block *)ifp->if_broot;
844 }
845
846 /*
847  * Retrieve the block pointer from the cursor at the given level.
848  * This may be an inode btree root or from a buffer.
849  */
850 struct xfs_btree_block *                /* generic btree block pointer */
851 xfs_btree_get_block(
852         struct xfs_btree_cur    *cur,   /* btree cursor */
853         int                     level,  /* level in btree */
854         struct xfs_buf          **bpp)  /* buffer containing the block */
855 {
856         if (xfs_btree_at_iroot(cur, level)) {
857                 *bpp = NULL;
858                 return xfs_btree_get_iroot(cur);
859         }
860
861         *bpp = cur->bc_levels[level].bp;
862         return XFS_BUF_TO_BLOCK(*bpp);
863 }
864
865 /*
866  * Change the cursor to point to the first record at the given level.
867  * Other levels are unaffected.
868  */
869 STATIC int                              /* success=1, failure=0 */
870 xfs_btree_firstrec(
871         struct xfs_btree_cur    *cur,   /* btree cursor */
872         int                     level)  /* level to change */
873 {
874         struct xfs_btree_block  *block; /* generic btree block pointer */
875         struct xfs_buf          *bp;    /* buffer containing block */
876
877         /*
878          * Get the block pointer for this level.
879          */
880         block = xfs_btree_get_block(cur, level, &bp);
881         if (xfs_btree_check_block(cur, block, level, bp))
882                 return 0;
883         /*
884          * It's empty, there is no such record.
885          */
886         if (!block->bb_numrecs)
887                 return 0;
888         /*
889          * Set the ptr value to 1, that's the first record/key.
890          */
891         cur->bc_levels[level].ptr = 1;
892         return 1;
893 }
894
895 /*
896  * Change the cursor to point to the last record in the current block
897  * at the given level.  Other levels are unaffected.
898  */
899 STATIC int                              /* success=1, failure=0 */
900 xfs_btree_lastrec(
901         struct xfs_btree_cur    *cur,   /* btree cursor */
902         int                     level)  /* level to change */
903 {
904         struct xfs_btree_block  *block; /* generic btree block pointer */
905         struct xfs_buf          *bp;    /* buffer containing block */
906
907         /*
908          * Get the block pointer for this level.
909          */
910         block = xfs_btree_get_block(cur, level, &bp);
911         if (xfs_btree_check_block(cur, block, level, bp))
912                 return 0;
913         /*
914          * It's empty, there is no such record.
915          */
916         if (!block->bb_numrecs)
917                 return 0;
918         /*
919          * Set the ptr value to numrecs, that's the last record/key.
920          */
921         cur->bc_levels[level].ptr = be16_to_cpu(block->bb_numrecs);
922         return 1;
923 }
924
925 /*
926  * Compute first and last byte offsets for the fields given.
927  * Interprets the offsets table, which contains struct field offsets.
928  */
929 void
930 xfs_btree_offsets(
931         uint32_t        fields,         /* bitmask of fields */
932         const short     *offsets,       /* table of field offsets */
933         int             nbits,          /* number of bits to inspect */
934         int             *first,         /* output: first byte offset */
935         int             *last)          /* output: last byte offset */
936 {
937         int             i;              /* current bit number */
938         uint32_t        imask;          /* mask for current bit number */
939
940         ASSERT(fields != 0);
941         /*
942          * Find the lowest bit, so the first byte offset.
943          */
944         for (i = 0, imask = 1u; ; i++, imask <<= 1) {
945                 if (imask & fields) {
946                         *first = offsets[i];
947                         break;
948                 }
949         }
950         /*
951          * Find the highest bit, so the last byte offset.
952          */
953         for (i = nbits - 1, imask = 1u << i; ; i--, imask >>= 1) {
954                 if (imask & fields) {
955                         *last = offsets[i + 1] - 1;
956                         break;
957                 }
958         }
959 }
960
961 STATIC int
962 xfs_btree_readahead_fsblock(
963         struct xfs_btree_cur    *cur,
964         int                     lr,
965         struct xfs_btree_block  *block)
966 {
967         struct xfs_mount        *mp = cur->bc_mp;
968         xfs_fsblock_t           left = be64_to_cpu(block->bb_u.l.bb_leftsib);
969         xfs_fsblock_t           right = be64_to_cpu(block->bb_u.l.bb_rightsib);
970         int                     rval = 0;
971
972         if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
973                 xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, left),
974                                 mp->m_bsize, cur->bc_ops->buf_ops);
975                 rval++;
976         }
977
978         if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
979                 xfs_buf_readahead(mp->m_ddev_targp, XFS_FSB_TO_DADDR(mp, right),
980                                 mp->m_bsize, cur->bc_ops->buf_ops);
981                 rval++;
982         }
983
984         return rval;
985 }
986
987 STATIC int
988 xfs_btree_readahead_memblock(
989         struct xfs_btree_cur    *cur,
990         int                     lr,
991         struct xfs_btree_block  *block)
992 {
993         struct xfs_buftarg      *btp = cur->bc_mem.xfbtree->target;
994         xfbno_t                 left = be64_to_cpu(block->bb_u.l.bb_leftsib);
995         xfbno_t                 right = be64_to_cpu(block->bb_u.l.bb_rightsib);
996         int                     rval = 0;
997
998         if ((lr & XFS_BTCUR_LEFTRA) && left != NULLFSBLOCK) {
999                 xfs_buf_readahead(btp, xfbno_to_daddr(left), XFBNO_BBSIZE,
1000                                 cur->bc_ops->buf_ops);
1001                 rval++;
1002         }
1003
1004         if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLFSBLOCK) {
1005                 xfs_buf_readahead(btp, xfbno_to_daddr(right), XFBNO_BBSIZE,
1006                                 cur->bc_ops->buf_ops);
1007                 rval++;
1008         }
1009
1010         return rval;
1011 }
1012
1013 STATIC int
1014 xfs_btree_readahead_agblock(
1015         struct xfs_btree_cur    *cur,
1016         int                     lr,
1017         struct xfs_btree_block  *block)
1018 {
1019         struct xfs_mount        *mp = cur->bc_mp;
1020         xfs_agnumber_t          agno = cur->bc_ag.pag->pag_agno;
1021         xfs_agblock_t           left = be32_to_cpu(block->bb_u.s.bb_leftsib);
1022         xfs_agblock_t           right = be32_to_cpu(block->bb_u.s.bb_rightsib);
1023         int                     rval = 0;
1024
1025         if ((lr & XFS_BTCUR_LEFTRA) && left != NULLAGBLOCK) {
1026                 xfs_buf_readahead(mp->m_ddev_targp,
1027                                 XFS_AGB_TO_DADDR(mp, agno, left),
1028                                 mp->m_bsize, cur->bc_ops->buf_ops);
1029                 rval++;
1030         }
1031
1032         if ((lr & XFS_BTCUR_RIGHTRA) && right != NULLAGBLOCK) {
1033                 xfs_buf_readahead(mp->m_ddev_targp,
1034                                 XFS_AGB_TO_DADDR(mp, agno, right),
1035                                 mp->m_bsize, cur->bc_ops->buf_ops);
1036                 rval++;
1037         }
1038
1039         return rval;
1040 }
1041
1042 /*
1043  * Read-ahead btree blocks, at the given level.
1044  * Bits in lr are set from XFS_BTCUR_{LEFT,RIGHT}RA.
1045  */
1046 STATIC int
1047 xfs_btree_readahead(
1048         struct xfs_btree_cur    *cur,           /* btree cursor */
1049         int                     lev,            /* level in btree */
1050         int                     lr)             /* left/right bits */
1051 {
1052         struct xfs_btree_block  *block;
1053
1054         /*
1055          * No readahead needed if we are at the root level and the
1056          * btree root is stored in the inode.
1057          */
1058         if (xfs_btree_at_iroot(cur, lev))
1059                 return 0;
1060
1061         if ((cur->bc_levels[lev].ra | lr) == cur->bc_levels[lev].ra)
1062                 return 0;
1063
1064         cur->bc_levels[lev].ra |= lr;
1065         block = XFS_BUF_TO_BLOCK(cur->bc_levels[lev].bp);
1066
1067         switch (cur->bc_ops->type) {
1068         case XFS_BTREE_TYPE_AG:
1069                 return xfs_btree_readahead_agblock(cur, lr, block);
1070         case XFS_BTREE_TYPE_INODE:
1071                 return xfs_btree_readahead_fsblock(cur, lr, block);
1072         case XFS_BTREE_TYPE_MEM:
1073                 return xfs_btree_readahead_memblock(cur, lr, block);
1074         default:
1075                 ASSERT(0);
1076                 return 0;
1077         }
1078 }
1079
/*
 * Convert a btree block pointer to an absolute disk address.
 *
 * Returns 0 and sets *daddr on success, or a negative error if the
 * pointer fails validation.
 */
STATIC int
xfs_btree_ptr_to_daddr(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr,
	xfs_daddr_t			*daddr)
{
	int			error;

	/* Validate before use; args are index 0, level 1 per check_ptr. */
	error = xfs_btree_check_ptr(cur, ptr, 0, 1);
	if (error)
		return error;

	switch (cur->bc_ops->type) {
	case XFS_BTREE_TYPE_AG:
		/* Short pointer: block number within this cursor's AG. */
		*daddr = XFS_AGB_TO_DADDR(cur->bc_mp, cur->bc_ag.pag->pag_agno,
				be32_to_cpu(ptr->s));
		break;
	case XFS_BTREE_TYPE_INODE:
		/* Long pointer: absolute filesystem block number. */
		*daddr = XFS_FSB_TO_DADDR(cur->bc_mp, be64_to_cpu(ptr->l));
		break;
	case XFS_BTREE_TYPE_MEM:
		/* In-memory btree: block number within the xfile buftarg. */
		*daddr = xfbno_to_daddr(be64_to_cpu(ptr->l));
		break;
	}
	return 0;
}
1106
1107 /*
1108  * Readahead @count btree blocks at the given @ptr location.
1109  *
1110  * We don't need to care about long or short form btrees here as we have a
1111  * method of converting the ptr directly to a daddr available to us.
1112  */
1113 STATIC void
1114 xfs_btree_readahead_ptr(
1115         struct xfs_btree_cur    *cur,
1116         union xfs_btree_ptr     *ptr,
1117         xfs_extlen_t            count)
1118 {
1119         xfs_daddr_t             daddr;
1120
1121         if (xfs_btree_ptr_to_daddr(cur, ptr, &daddr))
1122                 return;
1123         xfs_buf_readahead(xfs_btree_buftarg(cur), daddr,
1124                         xfs_btree_bbsize(cur) * count,
1125                         cur->bc_ops->buf_ops);
1126 }
1127
1128 /*
1129  * Set the buffer for level "lev" in the cursor to bp, releasing
1130  * any previous buffer.
1131  */
1132 STATIC void
1133 xfs_btree_setbuf(
1134         struct xfs_btree_cur    *cur,   /* btree cursor */
1135         int                     lev,    /* level in btree */
1136         struct xfs_buf          *bp)    /* new buffer to set */
1137 {
1138         struct xfs_btree_block  *b;     /* btree block */
1139
1140         if (cur->bc_levels[lev].bp)
1141                 xfs_trans_brelse(cur->bc_tp, cur->bc_levels[lev].bp);
1142         cur->bc_levels[lev].bp = bp;
1143         cur->bc_levels[lev].ra = 0;
1144
1145         b = XFS_BUF_TO_BLOCK(bp);
1146         if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
1147                 if (b->bb_u.l.bb_leftsib == cpu_to_be64(NULLFSBLOCK))
1148                         cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA;
1149                 if (b->bb_u.l.bb_rightsib == cpu_to_be64(NULLFSBLOCK))
1150                         cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA;
1151         } else {
1152                 if (b->bb_u.s.bb_leftsib == cpu_to_be32(NULLAGBLOCK))
1153                         cur->bc_levels[lev].ra |= XFS_BTCUR_LEFTRA;
1154                 if (b->bb_u.s.bb_rightsib == cpu_to_be32(NULLAGBLOCK))
1155                         cur->bc_levels[lev].ra |= XFS_BTCUR_RIGHTRA;
1156         }
1157 }
1158
1159 bool
1160 xfs_btree_ptr_is_null(
1161         struct xfs_btree_cur            *cur,
1162         const union xfs_btree_ptr       *ptr)
1163 {
1164         if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
1165                 return ptr->l == cpu_to_be64(NULLFSBLOCK);
1166         else
1167                 return ptr->s == cpu_to_be32(NULLAGBLOCK);
1168 }
1169
1170 void
1171 xfs_btree_set_ptr_null(
1172         struct xfs_btree_cur    *cur,
1173         union xfs_btree_ptr     *ptr)
1174 {
1175         if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
1176                 ptr->l = cpu_to_be64(NULLFSBLOCK);
1177         else
1178                 ptr->s = cpu_to_be32(NULLAGBLOCK);
1179 }
1180
1181 static inline bool
1182 xfs_btree_ptrs_equal(
1183         struct xfs_btree_cur            *cur,
1184         union xfs_btree_ptr             *ptr1,
1185         union xfs_btree_ptr             *ptr2)
1186 {
1187         if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
1188                 return ptr1->l == ptr2->l;
1189         return ptr1->s == ptr2->s;
1190 }
1191
1192 /*
1193  * Get/set/init sibling pointers
1194  */
1195 void
1196 xfs_btree_get_sibling(
1197         struct xfs_btree_cur    *cur,
1198         struct xfs_btree_block  *block,
1199         union xfs_btree_ptr     *ptr,
1200         int                     lr)
1201 {
1202         ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
1203
1204         if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
1205                 if (lr == XFS_BB_RIGHTSIB)
1206                         ptr->l = block->bb_u.l.bb_rightsib;
1207                 else
1208                         ptr->l = block->bb_u.l.bb_leftsib;
1209         } else {
1210                 if (lr == XFS_BB_RIGHTSIB)
1211                         ptr->s = block->bb_u.s.bb_rightsib;
1212                 else
1213                         ptr->s = block->bb_u.s.bb_leftsib;
1214         }
1215 }
1216
1217 void
1218 xfs_btree_set_sibling(
1219         struct xfs_btree_cur            *cur,
1220         struct xfs_btree_block          *block,
1221         const union xfs_btree_ptr       *ptr,
1222         int                             lr)
1223 {
1224         ASSERT(lr == XFS_BB_LEFTSIB || lr == XFS_BB_RIGHTSIB);
1225
1226         if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
1227                 if (lr == XFS_BB_RIGHTSIB)
1228                         block->bb_u.l.bb_rightsib = ptr->l;
1229                 else
1230                         block->bb_u.l.bb_leftsib = ptr->l;
1231         } else {
1232                 if (lr == XFS_BB_RIGHTSIB)
1233                         block->bb_u.s.bb_rightsib = ptr->s;
1234                 else
1235                         block->bb_u.s.bb_leftsib = ptr->s;
1236         }
1237 }
1238
/*
 * Initialize the header of a new btree block.
 *
 * Fills in the magic, level and record count, sets both sibling pointers
 * to the NULL sentinel, and on CRC-enabled filesystems also initializes
 * the extended (v5) header fields.  @blkno is the disk address recorded
 * in the CRC header (XFS_BUF_DADDR_NULL for incore-only blocks).
 */
static void
__xfs_btree_init_block(
	struct xfs_mount	*mp,
	struct xfs_btree_block	*buf,
	const struct xfs_btree_ops *ops,
	xfs_daddr_t		blkno,
	__u16			level,
	__u16			numrecs,
	__u64			owner)
{
	bool			crc = xfs_has_crc(mp);
	__u32			magic = xfs_btree_magic(mp, ops);

	buf->bb_magic = cpu_to_be32(magic);
	buf->bb_level = cpu_to_be16(level);
	buf->bb_numrecs = cpu_to_be16(numrecs);

	if (ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
		/* Long-pointer format: 64-bit sibling pointers. */
		buf->bb_u.l.bb_leftsib = cpu_to_be64(NULLFSBLOCK);
		buf->bb_u.l.bb_rightsib = cpu_to_be64(NULLFSBLOCK);
		if (crc) {
			buf->bb_u.l.bb_blkno = cpu_to_be64(blkno);
			buf->bb_u.l.bb_owner = cpu_to_be64(owner);
			uuid_copy(&buf->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid);
			buf->bb_u.l.bb_pad = 0;
			buf->bb_u.l.bb_lsn = 0;
		}
	} else {
		/* Short-pointer format: 32-bit sibling pointers. */
		buf->bb_u.s.bb_leftsib = cpu_to_be32(NULLAGBLOCK);
		buf->bb_u.s.bb_rightsib = cpu_to_be32(NULLAGBLOCK);
		if (crc) {
			buf->bb_u.s.bb_blkno = cpu_to_be64(blkno);
			/* owner is a 32 bit value on short blocks */
			buf->bb_u.s.bb_owner = cpu_to_be32((__u32)owner);
			uuid_copy(&buf->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid);
			buf->bb_u.s.bb_lsn = 0;
		}
	}
}
1278
/*
 * Initialize a btree block that is not backed by a disk buffer, so no
 * meaningful disk address is recorded in the CRC header.
 */
void
xfs_btree_init_block(
	struct xfs_mount	*mp,
	struct xfs_btree_block	*block,
	const struct xfs_btree_ops *ops,
	__u16			level,
	__u16			numrecs,
	__u64			owner)
{
	__xfs_btree_init_block(mp, block, ops, XFS_BUF_DADDR_NULL, level,
			numrecs, owner);
}
1291
/*
 * Initialize the btree block held in @bp, recording the buffer's disk
 * address in the CRC header, and attach the btree's buffer verifier ops.
 */
void
xfs_btree_init_buf(
	struct xfs_mount		*mp,
	struct xfs_buf			*bp,
	const struct xfs_btree_ops	*ops,
	__u16				level,
	__u16				numrecs,
	__u64				owner)
{
	__xfs_btree_init_block(mp, XFS_BUF_TO_BLOCK(bp), ops,
			xfs_buf_daddr(bp), level, numrecs, owner);
	bp->b_ops = ops->buf_ops;
}
1305
1306 static inline __u64
1307 xfs_btree_owner(
1308         struct xfs_btree_cur    *cur)
1309 {
1310         switch (cur->bc_ops->type) {
1311         case XFS_BTREE_TYPE_MEM:
1312                 return cur->bc_mem.xfbtree->owner;
1313         case XFS_BTREE_TYPE_INODE:
1314                 return cur->bc_ino.ip->i_ino;
1315         case XFS_BTREE_TYPE_AG:
1316                 return cur->bc_ag.pag->pag_agno;
1317         default:
1318                 ASSERT(0);
1319                 return 0;
1320         }
1321 }
1322
/*
 * Initialize a btree block using the geometry and owner derived from
 * the cursor rather than passed in explicitly.
 */
void
xfs_btree_init_block_cur(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp,
	int			level,
	int			numrecs)
{
	xfs_btree_init_buf(cur->bc_mp, bp, cur->bc_ops, level, numrecs,
			xfs_btree_owner(cur));
}
1333
1334 /*
1335  * Return true if ptr is the last record in the btree and
1336  * we need to track updates to this record.  The decision
1337  * will be further refined in the update_lastrec method.
1338  */
1339 STATIC int
1340 xfs_btree_is_lastrec(
1341         struct xfs_btree_cur    *cur,
1342         struct xfs_btree_block  *block,
1343         int                     level)
1344 {
1345         union xfs_btree_ptr     ptr;
1346
1347         if (level > 0)
1348                 return 0;
1349         if (!(cur->bc_ops->geom_flags & XFS_BTGEO_LASTREC_UPDATE))
1350                 return 0;
1351
1352         xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
1353         if (!xfs_btree_ptr_is_null(cur, &ptr))
1354                 return 0;
1355         return 1;
1356 }
1357
/*
 * Convert a buffer's disk address into a btree block pointer of the
 * form this btree uses (inverse of xfs_btree_ptr_to_daddr).
 */
STATIC void
xfs_btree_buf_to_ptr(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp,
	union xfs_btree_ptr	*ptr)
{
	switch (cur->bc_ops->type) {
	case XFS_BTREE_TYPE_AG:
		/* Short pointer: AG-relative block number. */
		ptr->s = cpu_to_be32(xfs_daddr_to_agbno(cur->bc_mp,
					xfs_buf_daddr(bp)));
		break;
	case XFS_BTREE_TYPE_INODE:
		/* Long pointer: absolute filesystem block number. */
		ptr->l = cpu_to_be64(XFS_DADDR_TO_FSB(cur->bc_mp,
					xfs_buf_daddr(bp)));
		break;
	case XFS_BTREE_TYPE_MEM:
		/* In-memory btree: block number within the xfile. */
		ptr->l = cpu_to_be64(xfs_daddr_to_xfbno(xfs_buf_daddr(bp)));
		break;
	}
}
1378
/* Set the buffer's LRU reference count from the btree's tuning value. */
static inline void
xfs_btree_set_refs(
	struct xfs_btree_cur	*cur,
	struct xfs_buf		*bp)
{
	xfs_buf_set_ref(bp, cur->bc_ops->lru_refs);
}
1386
1387 int
1388 xfs_btree_get_buf_block(
1389         struct xfs_btree_cur            *cur,
1390         const union xfs_btree_ptr       *ptr,
1391         struct xfs_btree_block          **block,
1392         struct xfs_buf                  **bpp)
1393 {
1394         xfs_daddr_t                     d;
1395         int                             error;
1396
1397         error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
1398         if (error)
1399                 return error;
1400         error = xfs_trans_get_buf(cur->bc_tp, xfs_btree_buftarg(cur), d,
1401                         xfs_btree_bbsize(cur), 0, bpp);
1402         if (error)
1403                 return error;
1404
1405         (*bpp)->b_ops = cur->bc_ops->buf_ops;
1406         *block = XFS_BUF_TO_BLOCK(*bpp);
1407         return 0;
1408 }
1409
/*
 * Read in the buffer at the given ptr and return the buffer and
 * the block pointer within the buffer.
 *
 * Marks the btree sick if the read fails with a corruption-class error.
 */
int
xfs_btree_read_buf_block(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*ptr,
	int				flags,
	struct xfs_btree_block		**block,
	struct xfs_buf			**bpp)
{
	struct xfs_mount	*mp = cur->bc_mp;
	xfs_daddr_t		d;
	int			error;

	/* need to sort out how callers deal with failures first */
	ASSERT(!(flags & XBF_TRYLOCK));

	error = xfs_btree_ptr_to_daddr(cur, ptr, &d);
	if (error)
		return error;
	error = xfs_trans_read_buf(mp, cur->bc_tp, xfs_btree_buftarg(cur), d,
			xfs_btree_bbsize(cur), flags, bpp,
			cur->bc_ops->buf_ops);
	/* Record corruption in the health tracking before bailing out. */
	if (xfs_metadata_is_sick(error))
		xfs_btree_mark_sick(cur);
	if (error)
		return error;

	xfs_btree_set_refs(cur, *bpp);
	*block = XFS_BUF_TO_BLOCK(*bpp);
	return 0;
}
1444
1445 /*
1446  * Copy keys from one btree block to another.
1447  */
1448 void
1449 xfs_btree_copy_keys(
1450         struct xfs_btree_cur            *cur,
1451         union xfs_btree_key             *dst_key,
1452         const union xfs_btree_key       *src_key,
1453         int                             numkeys)
1454 {
1455         ASSERT(numkeys >= 0);
1456         memcpy(dst_key, src_key, numkeys * cur->bc_ops->key_len);
1457 }
1458
1459 /*
1460  * Copy records from one btree block to another.
1461  */
1462 STATIC void
1463 xfs_btree_copy_recs(
1464         struct xfs_btree_cur    *cur,
1465         union xfs_btree_rec     *dst_rec,
1466         union xfs_btree_rec     *src_rec,
1467         int                     numrecs)
1468 {
1469         ASSERT(numrecs >= 0);
1470         memcpy(dst_rec, src_rec, numrecs * cur->bc_ops->rec_len);
1471 }
1472
1473 /*
1474  * Copy block pointers from one btree block to another.
1475  */
1476 void
1477 xfs_btree_copy_ptrs(
1478         struct xfs_btree_cur    *cur,
1479         union xfs_btree_ptr     *dst_ptr,
1480         const union xfs_btree_ptr *src_ptr,
1481         int                     numptrs)
1482 {
1483         ASSERT(numptrs >= 0);
1484         memcpy(dst_ptr, src_ptr, numptrs * cur->bc_ops->ptr_len);
1485 }
1486
1487 /*
1488  * Shift keys one index left/right inside a single btree block.
1489  */
1490 STATIC void
1491 xfs_btree_shift_keys(
1492         struct xfs_btree_cur    *cur,
1493         union xfs_btree_key     *key,
1494         int                     dir,
1495         int                     numkeys)
1496 {
1497         char                    *dst_key;
1498
1499         ASSERT(numkeys >= 0);
1500         ASSERT(dir == 1 || dir == -1);
1501
1502         dst_key = (char *)key + (dir * cur->bc_ops->key_len);
1503         memmove(dst_key, key, numkeys * cur->bc_ops->key_len);
1504 }
1505
1506 /*
1507  * Shift records one index left/right inside a single btree block.
1508  */
1509 STATIC void
1510 xfs_btree_shift_recs(
1511         struct xfs_btree_cur    *cur,
1512         union xfs_btree_rec     *rec,
1513         int                     dir,
1514         int                     numrecs)
1515 {
1516         char                    *dst_rec;
1517
1518         ASSERT(numrecs >= 0);
1519         ASSERT(dir == 1 || dir == -1);
1520
1521         dst_rec = (char *)rec + (dir * cur->bc_ops->rec_len);
1522         memmove(dst_rec, rec, numrecs * cur->bc_ops->rec_len);
1523 }
1524
1525 /*
1526  * Shift block pointers one index left/right inside a single btree block.
1527  */
1528 STATIC void
1529 xfs_btree_shift_ptrs(
1530         struct xfs_btree_cur    *cur,
1531         union xfs_btree_ptr     *ptr,
1532         int                     dir,
1533         int                     numptrs)
1534 {
1535         char                    *dst_ptr;
1536
1537         ASSERT(numptrs >= 0);
1538         ASSERT(dir == 1 || dir == -1);
1539
1540         dst_ptr = (char *)ptr + (dir * cur->bc_ops->ptr_len);
1541         memmove(dst_ptr, ptr, numptrs * cur->bc_ops->ptr_len);
1542 }
1543
1544 /*
1545  * Log key values from the btree block.
1546  */
1547 STATIC void
1548 xfs_btree_log_keys(
1549         struct xfs_btree_cur    *cur,
1550         struct xfs_buf          *bp,
1551         int                     first,
1552         int                     last)
1553 {
1554
1555         if (bp) {
1556                 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1557                 xfs_trans_log_buf(cur->bc_tp, bp,
1558                                   xfs_btree_key_offset(cur, first),
1559                                   xfs_btree_key_offset(cur, last + 1) - 1);
1560         } else {
1561                 xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
1562                                 xfs_ilog_fbroot(cur->bc_ino.whichfork));
1563         }
1564 }
1565
1566 /*
1567  * Log record values from the btree block.
1568  */
1569 void
1570 xfs_btree_log_recs(
1571         struct xfs_btree_cur    *cur,
1572         struct xfs_buf          *bp,
1573         int                     first,
1574         int                     last)
1575 {
1576
1577         xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1578         xfs_trans_log_buf(cur->bc_tp, bp,
1579                           xfs_btree_rec_offset(cur, first),
1580                           xfs_btree_rec_offset(cur, last + 1) - 1);
1581
1582 }
1583
1584 /*
1585  * Log block pointer fields from a btree block (nonleaf).
1586  */
1587 STATIC void
1588 xfs_btree_log_ptrs(
1589         struct xfs_btree_cur    *cur,   /* btree cursor */
1590         struct xfs_buf          *bp,    /* buffer containing btree block */
1591         int                     first,  /* index of first pointer to log */
1592         int                     last)   /* index of last pointer to log */
1593 {
1594
1595         if (bp) {
1596                 struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
1597                 int                     level = xfs_btree_get_level(block);
1598
1599                 xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
1600                 xfs_trans_log_buf(cur->bc_tp, bp,
1601                                 xfs_btree_ptr_offset(cur, first, level),
1602                                 xfs_btree_ptr_offset(cur, last + 1, level) - 1);
1603         } else {
1604                 xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
1605                         xfs_ilog_fbroot(cur->bc_ino.whichfork));
1606         }
1607
1608 }
1609
/*
 * Log fields from a btree block header.
 *
 * @fields is a mask of XFS_BB_... bits; the offset tables below map each
 * bit to a byte offset in the short/long block header, with one trailing
 * sentinel entry marking the end of the final field.
 */
void
xfs_btree_log_block(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	struct xfs_buf		*bp,	/* buffer containing btree block */
	uint32_t		fields)	/* mask of fields: XFS_BB_... */
{
	int			first;	/* first byte offset logged */
	int			last;	/* last byte offset logged */
	static const short	soffsets[] = {	/* table of offsets (short) */
		offsetof(struct xfs_btree_block, bb_magic),
		offsetof(struct xfs_btree_block, bb_level),
		offsetof(struct xfs_btree_block, bb_numrecs),
		offsetof(struct xfs_btree_block, bb_u.s.bb_leftsib),
		offsetof(struct xfs_btree_block, bb_u.s.bb_rightsib),
		offsetof(struct xfs_btree_block, bb_u.s.bb_blkno),
		offsetof(struct xfs_btree_block, bb_u.s.bb_lsn),
		offsetof(struct xfs_btree_block, bb_u.s.bb_uuid),
		offsetof(struct xfs_btree_block, bb_u.s.bb_owner),
		offsetof(struct xfs_btree_block, bb_u.s.bb_crc),
		XFS_BTREE_SBLOCK_CRC_LEN
	};
	static const short	loffsets[] = {	/* table of offsets (long) */
		offsetof(struct xfs_btree_block, bb_magic),
		offsetof(struct xfs_btree_block, bb_level),
		offsetof(struct xfs_btree_block, bb_numrecs),
		offsetof(struct xfs_btree_block, bb_u.l.bb_leftsib),
		offsetof(struct xfs_btree_block, bb_u.l.bb_rightsib),
		offsetof(struct xfs_btree_block, bb_u.l.bb_blkno),
		offsetof(struct xfs_btree_block, bb_u.l.bb_lsn),
		offsetof(struct xfs_btree_block, bb_u.l.bb_uuid),
		offsetof(struct xfs_btree_block, bb_u.l.bb_owner),
		offsetof(struct xfs_btree_block, bb_u.l.bb_crc),
		offsetof(struct xfs_btree_block, bb_u.l.bb_pad),
		XFS_BTREE_LBLOCK_CRC_LEN
	};

	if (bp) {
		int nbits;

		if (xfs_has_crc(cur->bc_mp)) {
			/*
			 * We don't log the CRC when updating a btree
			 * block but instead recreate it during log
			 * recovery.  As the log buffers have checksums
			 * of their own this is safe and avoids logging a crc
			 * update in a lot of places.
			 */
			if (fields == XFS_BB_ALL_BITS)
				fields = XFS_BB_ALL_BITS_CRC;
			nbits = XFS_BB_NUM_BITS_CRC;
		} else {
			nbits = XFS_BB_NUM_BITS;
		}
		xfs_btree_offsets(fields,
				  (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) ?
					loffsets : soffsets,
				  nbits, &first, &last);
		xfs_trans_buf_set_type(cur->bc_tp, bp, XFS_BLFT_BTREE_BUF);
		xfs_trans_log_buf(cur->bc_tp, bp, first, last);
	} else {
		/* Root in the inode fork: log the whole broot instead. */
		xfs_trans_log_inode(cur->bc_tp, cur->bc_ino.ip,
			xfs_ilog_fbroot(cur->bc_ino.whichfork));
	}
}
1677
/*
 * Increment cursor by one record at the level.
 * For nonzero levels the leaf-ward information is untouched.
 *
 * Returns 0 with *stat == 1 when the cursor now points at the next record,
 * 0 with *stat == 0 when the increment ran off the right edge of the tree,
 * or a negative errno if a block could not be read or was corrupt.
 */
int                                             /* error */
xfs_btree_increment(
        struct xfs_btree_cur    *cur,
        int                     level,
        int                     *stat)          /* success/failure */
{
        struct xfs_btree_block  *block;
        union xfs_btree_ptr     ptr;
        struct xfs_buf          *bp;
        int                     error;          /* error return value */
        int                     lev;

        ASSERT(level < cur->bc_nlevels);

        /* Read-ahead to the right at this level. */
        xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);

        /* Get a pointer to the btree block. */
        block = xfs_btree_get_block(cur, level, &bp);

#ifdef DEBUG
        error = xfs_btree_check_block(cur, block, level, bp);
        if (error)
                goto error0;
#endif

        /* We're done if we remain in the block after the increment. */
        if (++cur->bc_levels[level].ptr <= xfs_btree_get_numrecs(block))
                goto out1;

        /* Fail if we just went off the right edge of the tree. */
        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
        if (xfs_btree_ptr_is_null(cur, &ptr))
                goto out0;

        XFS_BTREE_STATS_INC(cur, increment);

        /*
         * March up the tree incrementing pointers.
         * Stop when we don't go off the right edge of a block.
         */
        for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
                block = xfs_btree_get_block(cur, lev, &bp);

#ifdef DEBUG
                error = xfs_btree_check_block(cur, block, lev, bp);
                if (error)
                        goto error0;
#endif

                if (++cur->bc_levels[lev].ptr <= xfs_btree_get_numrecs(block))
                        break;

                /* Read-ahead the right block for the next loop. */
                xfs_btree_readahead(cur, lev, XFS_BTCUR_RIGHTRA);
        }

        /*
         * If we went off the root then we are either seriously
         * confused or have the tree root in an inode.
         */
        if (lev == cur->bc_nlevels) {
                if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
                        goto out0;
                ASSERT(0);
                xfs_btree_mark_sick(cur);
                error = -EFSCORRUPTED;
                goto error0;
        }
        ASSERT(lev < cur->bc_nlevels);

        /*
         * Now walk back down the tree, fixing up the cursor's buffer
         * pointers and key numbers.  Each child is read via the pointer
         * slot the parent level's cursor now points at, and the cursor is
         * reset to the first entry of every level we descend into.
         */
        for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
                union xfs_btree_ptr     *ptrp;

                /* Fetch the child pointer before lev moves down a level. */
                ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block);
                --lev;
                error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
                if (error)
                        goto error0;

                xfs_btree_setbuf(cur, lev, bp);
                cur->bc_levels[lev].ptr = 1;
        }
out1:
        *stat = 1;
        return 0;

out0:
        *stat = 0;
        return 0;

error0:
        return error;
}
1780
/*
 * Decrement cursor by one record at the level.
 * For nonzero levels the leaf-ward information is untouched.
 *
 * Returns 0 with *stat == 1 when the cursor now points at the previous
 * record, 0 with *stat == 0 when the decrement ran off the left edge of
 * the tree, or a negative errno on read error or corruption.
 */
int                                             /* error */
xfs_btree_decrement(
        struct xfs_btree_cur    *cur,
        int                     level,
        int                     *stat)          /* success/failure */
{
        struct xfs_btree_block  *block;
        struct xfs_buf          *bp;
        int                     error;          /* error return value */
        int                     lev;
        union xfs_btree_ptr     ptr;

        ASSERT(level < cur->bc_nlevels);

        /* Read-ahead to the left at this level. */
        xfs_btree_readahead(cur, level, XFS_BTCUR_LEFTRA);

        /* We're done if we remain in the block after the decrement. */
        if (--cur->bc_levels[level].ptr > 0)
                goto out1;

        /* Get a pointer to the btree block. */
        block = xfs_btree_get_block(cur, level, &bp);

#ifdef DEBUG
        error = xfs_btree_check_block(cur, block, level, bp);
        if (error)
                goto error0;
#endif

        /* Fail if we just went off the left edge of the tree. */
        xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
        if (xfs_btree_ptr_is_null(cur, &ptr))
                goto out0;

        XFS_BTREE_STATS_INC(cur, decrement);

        /*
         * March up the tree decrementing pointers.
         * Stop when we don't go off the left edge of a block.
         * Unlike the increment case we don't need the block contents here:
         * going off the left edge is simply the entry number hitting zero.
         */
        for (lev = level + 1; lev < cur->bc_nlevels; lev++) {
                if (--cur->bc_levels[lev].ptr > 0)
                        break;
                /* Read-ahead the left block for the next loop. */
                xfs_btree_readahead(cur, lev, XFS_BTCUR_LEFTRA);
        }

        /*
         * If we went off the root then we are either seriously confused
         * or have the root of the tree in an inode.
         */
        if (lev == cur->bc_nlevels) {
                if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE)
                        goto out0;
                ASSERT(0);
                xfs_btree_mark_sick(cur);
                error = -EFSCORRUPTED;
                goto error0;
        }
        ASSERT(lev < cur->bc_nlevels);

        /*
         * Now walk back down the tree, fixing up the cursor's buffer
         * pointers and key numbers.  Entering each child from the right,
         * the cursor is set to the child's last record.
         */
        for (block = xfs_btree_get_block(cur, lev, &bp); lev > level; ) {
                union xfs_btree_ptr     *ptrp;

                /* Fetch the child pointer before lev moves down a level. */
                ptrp = xfs_btree_ptr_addr(cur, cur->bc_levels[lev].ptr, block);
                --lev;
                error = xfs_btree_read_buf_block(cur, ptrp, 0, &block, &bp);
                if (error)
                        goto error0;
                xfs_btree_setbuf(cur, lev, bp);
                cur->bc_levels[lev].ptr = xfs_btree_get_numrecs(block);
        }
out1:
        *stat = 1;
        return 0;

out0:
        *stat = 0;
        return 0;

error0:
        return error;
}
1873
1874 /*
1875  * Check the btree block owner now that we have the context to know who the
1876  * real owner is.
1877  */
1878 static inline xfs_failaddr_t
1879 xfs_btree_check_block_owner(
1880         struct xfs_btree_cur    *cur,
1881         struct xfs_btree_block  *block)
1882 {
1883         __u64                   owner;
1884
1885         if (!xfs_has_crc(cur->bc_mp) ||
1886             (cur->bc_flags & XFS_BTREE_BMBT_INVALID_OWNER))
1887                 return NULL;
1888
1889         owner = xfs_btree_owner(cur);
1890         if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
1891                 if (be64_to_cpu(block->bb_u.l.bb_owner) != owner)
1892                         return __this_address;
1893         } else {
1894                 if (be32_to_cpu(block->bb_u.s.bb_owner) != owner)
1895                         return __this_address;
1896         }
1897
1898         return NULL;
1899 }
1900
/*
 * Fetch the btree block referenced by pp for the given level of the cursor
 * and return it via *blkp.  The buffer already cached at this level of the
 * cursor is reused if it maps the same disk address; otherwise the block is
 * read, sanity checked, and attached to the cursor.
 *
 * Returns -EFSCORRUPTED (marking the buffer corrupt and the btree sick)
 * if the block fails the owner, level, or record-count checks.
 */
int
xfs_btree_lookup_get_block(
        struct xfs_btree_cur            *cur,   /* btree cursor */
        int                             level,  /* level in the btree */
        const union xfs_btree_ptr       *pp,    /* ptr to btree block */
        struct xfs_btree_block          **blkp) /* return btree block */
{
        struct xfs_buf          *bp;    /* buffer pointer for btree block */
        xfs_daddr_t             daddr;
        int                     error = 0;

        /* special case the root block if in an inode */
        if (xfs_btree_at_iroot(cur, level)) {
                *blkp = xfs_btree_get_iroot(cur);
                return 0;
        }

        /*
         * If the old buffer at this level is for the disk address we are
         * looking for, re-use it.
         *
         * Otherwise throw it away and get a new one.
         */
        bp = cur->bc_levels[level].bp;
        error = xfs_btree_ptr_to_daddr(cur, pp, &daddr);
        if (error)
                return error;
        if (bp && xfs_buf_daddr(bp) == daddr) {
                *blkp = XFS_BUF_TO_BLOCK(bp);
                return 0;
        }

        error = xfs_btree_read_buf_block(cur, pp, 0, blkp, &bp);
        if (error)
                return error;

        /* Check the inode owner since the verifiers don't. */
        if (xfs_btree_check_block_owner(cur, *blkp) != NULL)
                goto out_bad;

        /* Did we get the level we were looking for? */
        if (be16_to_cpu((*blkp)->bb_level) != level)
                goto out_bad;

        /* Check that internal nodes have at least one record. */
        if (level != 0 && be16_to_cpu((*blkp)->bb_numrecs) == 0)
                goto out_bad;

        xfs_btree_setbuf(cur, level, bp);
        return 0;

out_bad:
        /* Corrupt block: release it and report the tree as sick. */
        *blkp = NULL;
        xfs_buf_mark_corrupt(bp);
        xfs_trans_brelse(cur->bc_tp, bp);
        xfs_btree_mark_sick(cur);
        return -EFSCORRUPTED;
}
1959
1960 /*
1961  * Get current search key.  For level 0 we don't actually have a key
1962  * structure so we make one up from the record.  For all other levels
1963  * we just return the right key.
1964  */
1965 STATIC union xfs_btree_key *
1966 xfs_lookup_get_search_key(
1967         struct xfs_btree_cur    *cur,
1968         int                     level,
1969         int                     keyno,
1970         struct xfs_btree_block  *block,
1971         union xfs_btree_key     *kp)
1972 {
1973         if (level == 0) {
1974                 cur->bc_ops->init_key_from_rec(kp,
1975                                 xfs_btree_rec_addr(cur, keyno, block));
1976                 return kp;
1977         }
1978
1979         return xfs_btree_key_addr(cur, keyno, block);
1980 }
1981
1982 /*
1983  * Initialize a pointer to the root block.
1984  */
1985 void
1986 xfs_btree_init_ptr_from_cur(
1987         struct xfs_btree_cur    *cur,
1988         union xfs_btree_ptr     *ptr)
1989 {
1990         if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
1991                 /*
1992                  * Inode-rooted btrees call xfs_btree_get_iroot to find the root
1993                  * in xfs_btree_lookup_get_block and don't need a pointer here.
1994                  */
1995                 ptr->l = 0;
1996         } else if (cur->bc_flags & XFS_BTREE_STAGING) {
1997                 ptr->s = cpu_to_be32(cur->bc_ag.afake->af_root);
1998         } else {
1999                 cur->bc_ops->init_ptr_from_cur(cur, ptr);
2000         }
2001 }
2002
/*
 * Lookup the record.  The cursor is made to point to it, based on dir.
 * stat is set to 0 if can't find any such record, 1 for success.
 *
 * dir selects the match mode: XFS_LOOKUP_LE (last record <= key),
 * XFS_LOOKUP_EQ (exact match), or XFS_LOOKUP_GE (first record >= key).
 * The search key itself lives in the cursor; each btree type's
 * bc_ops->key_diff compares a candidate against it.
 */
int                                     /* error */
xfs_btree_lookup(
        struct xfs_btree_cur    *cur,   /* btree cursor */
        xfs_lookup_t            dir,    /* <=, ==, or >= */
        int                     *stat)  /* success/failure */
{
        struct xfs_btree_block  *block; /* current btree block */
        int64_t                 diff;   /* difference for the current key */
        int                     error;  /* error return value */
        int                     keyno;  /* current key number */
        int                     level;  /* level in the btree */
        union xfs_btree_ptr     *pp;    /* ptr to btree block */
        union xfs_btree_ptr     ptr;    /* ptr to btree block */

        XFS_BTREE_STATS_INC(cur, lookup);

        /* No such thing as a zero-level tree. */
        if (XFS_IS_CORRUPT(cur->bc_mp, cur->bc_nlevels == 0)) {
                xfs_btree_mark_sick(cur);
                return -EFSCORRUPTED;
        }

        block = NULL;
        keyno = 0;

        /* initialise start pointer from cursor */
        xfs_btree_init_ptr_from_cur(cur, &ptr);
        pp = &ptr;

        /*
         * Iterate over each level in the btree, starting at the root.
         * For each level above the leaves, find the key we need, based
         * on the lookup record, then follow the corresponding block
         * pointer down to the next level.
         */
        for (level = cur->bc_nlevels - 1, diff = 1; level >= 0; level--) {
                /* Get the block we need to do the lookup on. */
                error = xfs_btree_lookup_get_block(cur, level, pp, &block);
                if (error)
                        goto error0;

                if (diff == 0) {
                        /*
                         * If we already had a key match at a higher level, we
                         * know we need to use the first entry in this block.
                         */
                        keyno = 1;
                } else {
                        /* Otherwise search this block. Do a binary search. */

                        int     high;   /* high entry number */
                        int     low;    /* low entry number */

                        /* Set low and high entry numbers, 1-based. */
                        low = 1;
                        high = xfs_btree_get_numrecs(block);
                        if (!high) {
                                /* Block is empty, must be an empty leaf. */
                                if (level != 0 || cur->bc_nlevels != 1) {
                                        XFS_CORRUPTION_ERROR(__func__,
                                                        XFS_ERRLEVEL_LOW,
                                                        cur->bc_mp, block,
                                                        sizeof(*block));
                                        xfs_btree_mark_sick(cur);
                                        return -EFSCORRUPTED;
                                }

                                /*
                                 * Empty tree: ptr 0 for LE (no record at or
                                 * before the key), ptr 1 otherwise —
                                 * presumably so a following insert lands in
                                 * the first slot; confirm against callers.
                                 */
                                cur->bc_levels[0].ptr = dir != XFS_LOOKUP_LE;
                                *stat = 0;
                                return 0;
                        }

                        /* Binary search the block. */
                        while (low <= high) {
                                union xfs_btree_key     key;
                                union xfs_btree_key     *kp;

                                XFS_BTREE_STATS_INC(cur, compare);

                                /* keyno is average of low and high. */
                                keyno = (low + high) >> 1;

                                /* Get current search key */
                                kp = xfs_lookup_get_search_key(cur, level,
                                                keyno, block, &key);

                                /*
                                 * Compute difference to get next direction:
                                 *  - less than, move right
                                 *  - greater than, move left
                                 *  - equal, we're done
                                 */
                                diff = cur->bc_ops->key_diff(cur, kp);
                                if (diff < 0)
                                        low = keyno + 1;
                                else if (diff > 0)
                                        high = keyno - 1;
                                else
                                        break;
                        }
                }

                /*
                 * If there are more levels, set up for the next level
                 * by getting the block number and filling in the cursor.
                 */
                if (level > 0) {
                        /*
                         * If we moved left, need the previous key number,
                         * unless there isn't one.
                         */
                        if (diff > 0 && --keyno < 1)
                                keyno = 1;
                        pp = xfs_btree_ptr_addr(cur, keyno, block);

                        error = xfs_btree_debug_check_ptr(cur, pp, 0, level);
                        if (error)
                                goto error0;

                        cur->bc_levels[level].ptr = keyno;
                }
        }

        /* Done with the search. See if we need to adjust the results. */
        if (dir != XFS_LOOKUP_LE && diff < 0) {
                /* Last comparison was "key > record": step to the right. */
                keyno++;
                /*
                 * If ge search and we went off the end of the block, but it's
                 * not the last block, we're in the wrong block.
                 */
                xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
                if (dir == XFS_LOOKUP_GE &&
                    keyno > xfs_btree_get_numrecs(block) &&
                    !xfs_btree_ptr_is_null(cur, &ptr)) {
                        int     i;

                        /* Move to the first record of the right sibling. */
                        cur->bc_levels[0].ptr = keyno;
                        error = xfs_btree_increment(cur, 0, &i);
                        if (error)
                                goto error0;
                        if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
                                xfs_btree_mark_sick(cur);
                                return -EFSCORRUPTED;
                        }
                        *stat = 1;
                        return 0;
                }
        } else if (dir == XFS_LOOKUP_LE && diff > 0)
                /* Last comparison was "key < record": step back one. */
                keyno--;
        cur->bc_levels[0].ptr = keyno;

        /* Return if we succeeded or not. */
        if (keyno == 0 || keyno > xfs_btree_get_numrecs(block))
                *stat = 0;
        else if (dir != XFS_LOOKUP_EQ || diff == 0)
                *stat = 1;
        else
                *stat = 0;
        return 0;

error0:
        return error;
}
2170
2171 /* Find the high key storage area from a regular key. */
2172 union xfs_btree_key *
2173 xfs_btree_high_key_from_key(
2174         struct xfs_btree_cur    *cur,
2175         union xfs_btree_key     *key)
2176 {
2177         ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING);
2178         return (union xfs_btree_key *)((char *)key +
2179                         (cur->bc_ops->key_len / 2));
2180 }
2181
2182 /* Determine the low (and high if overlapped) keys of a leaf block */
2183 STATIC void
2184 xfs_btree_get_leaf_keys(
2185         struct xfs_btree_cur    *cur,
2186         struct xfs_btree_block  *block,
2187         union xfs_btree_key     *key)
2188 {
2189         union xfs_btree_key     max_hkey;
2190         union xfs_btree_key     hkey;
2191         union xfs_btree_rec     *rec;
2192         union xfs_btree_key     *high;
2193         int                     n;
2194
2195         rec = xfs_btree_rec_addr(cur, 1, block);
2196         cur->bc_ops->init_key_from_rec(key, rec);
2197
2198         if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {
2199
2200                 cur->bc_ops->init_high_key_from_rec(&max_hkey, rec);
2201                 for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
2202                         rec = xfs_btree_rec_addr(cur, n, block);
2203                         cur->bc_ops->init_high_key_from_rec(&hkey, rec);
2204                         if (xfs_btree_keycmp_gt(cur, &hkey, &max_hkey))
2205                                 max_hkey = hkey;
2206                 }
2207
2208                 high = xfs_btree_high_key_from_key(cur, key);
2209                 memcpy(high, &max_hkey, cur->bc_ops->key_len / 2);
2210         }
2211 }
2212
2213 /* Determine the low (and high if overlapped) keys of a node block */
2214 STATIC void
2215 xfs_btree_get_node_keys(
2216         struct xfs_btree_cur    *cur,
2217         struct xfs_btree_block  *block,
2218         union xfs_btree_key     *key)
2219 {
2220         union xfs_btree_key     *hkey;
2221         union xfs_btree_key     *max_hkey;
2222         union xfs_btree_key     *high;
2223         int                     n;
2224
2225         if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {
2226                 memcpy(key, xfs_btree_key_addr(cur, 1, block),
2227                                 cur->bc_ops->key_len / 2);
2228
2229                 max_hkey = xfs_btree_high_key_addr(cur, 1, block);
2230                 for (n = 2; n <= xfs_btree_get_numrecs(block); n++) {
2231                         hkey = xfs_btree_high_key_addr(cur, n, block);
2232                         if (xfs_btree_keycmp_gt(cur, hkey, max_hkey))
2233                                 max_hkey = hkey;
2234                 }
2235
2236                 high = xfs_btree_high_key_from_key(cur, key);
2237                 memcpy(high, max_hkey, cur->bc_ops->key_len / 2);
2238         } else {
2239                 memcpy(key, xfs_btree_key_addr(cur, 1, block),
2240                                 cur->bc_ops->key_len);
2241         }
2242 }
2243
2244 /* Derive the keys for any btree block. */
2245 void
2246 xfs_btree_get_keys(
2247         struct xfs_btree_cur    *cur,
2248         struct xfs_btree_block  *block,
2249         union xfs_btree_key     *key)
2250 {
2251         if (be16_to_cpu(block->bb_level) == 0)
2252                 xfs_btree_get_leaf_keys(cur, block, key);
2253         else
2254                 xfs_btree_get_node_keys(cur, block, key);
2255 }
2256
2257 /*
2258  * Decide if we need to update the parent keys of a btree block.  For
2259  * a standard btree this is only necessary if we're updating the first
2260  * record/key.  For an overlapping btree, we must always update the
2261  * keys because the highest key can be in any of the records or keys
2262  * in the block.
2263  */
2264 static inline bool
2265 xfs_btree_needs_key_update(
2266         struct xfs_btree_cur    *cur,
2267         int                     ptr)
2268 {
2269         return (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) || ptr == 1;
2270 }
2271
/*
 * Update the low and high parent keys of the given level, progressing
 * towards the root.  If force_all is false, stop if the keys for a given
 * level do not need updating.
 */
STATIC int
__xfs_btree_updkeys(
        struct xfs_btree_cur    *cur,
        int                     level,
        struct xfs_btree_block  *block,
        struct xfs_buf          *bp0,
        bool                    force_all)
{
        union xfs_btree_key     key;    /* low/high keys computed for block */
        union xfs_btree_key     *lkey;  /* low key of the current block */
        union xfs_btree_key     *hkey;  /* high key of the current block */
        union xfs_btree_key     *nlkey; /* low key slot in the parent */
        union xfs_btree_key     *nhkey; /* high key slot in the parent */
        struct xfs_buf          *bp;
        int                     ptr;

        /* Only overlapping btrees carry high keys that need maintaining. */
        ASSERT(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING);

        /* Exit if there aren't any parent levels to update. */
        if (level + 1 >= cur->bc_nlevels)
                return 0;

        trace_xfs_btree_updkeys(cur, level, bp0);

        /* key holds both halves; hkey aliases its high-key half. */
        lkey = &key;
        hkey = xfs_btree_high_key_from_key(cur, lkey);
        xfs_btree_get_keys(cur, block, lkey);
        for (level++; level < cur->bc_nlevels; level++) {
#ifdef DEBUG
                int             error;
#endif
                block = xfs_btree_get_block(cur, level, &bp);
                trace_xfs_btree_updkeys(cur, level, bp);
#ifdef DEBUG
                error = xfs_btree_check_block(cur, block, level, bp);
                if (error)
                        return error;
#endif
                ptr = cur->bc_levels[level].ptr;
                nlkey = xfs_btree_key_addr(cur, ptr, block);
                nhkey = xfs_btree_high_key_addr(cur, ptr, block);
                /* Parent already matches and we're not forcing: all done. */
                if (!force_all &&
                    xfs_btree_keycmp_eq(cur, nlkey, lkey) &&
                    xfs_btree_keycmp_eq(cur, nhkey, hkey))
                        break;
                /*
                 * Copy into the parent slot — presumably the full key_len
                 * is copied, covering both halves; verify copy_keys.
                 */
                xfs_btree_copy_keys(cur, nlkey, lkey, 1);
                xfs_btree_log_keys(cur, bp, ptr, ptr);
                if (level + 1 >= cur->bc_nlevels)
                        break;
                /* Recompute this level's keys to propagate further up. */
                xfs_btree_get_node_keys(cur, block, lkey);
        }

        return 0;
}
2331
2332 /* Update all the keys from some level in cursor back to the root. */
2333 STATIC int
2334 xfs_btree_updkeys_force(
2335         struct xfs_btree_cur    *cur,
2336         int                     level)
2337 {
2338         struct xfs_buf          *bp;
2339         struct xfs_btree_block  *block;
2340
2341         block = xfs_btree_get_block(cur, level, &bp);
2342         return __xfs_btree_updkeys(cur, level, block, bp, true);
2343 }
2344
/*
 * Update the parent keys of the given level, progressing towards the root.
 */
STATIC int
xfs_btree_update_keys(
        struct xfs_btree_cur    *cur,
        int                     level)
{
        struct xfs_btree_block  *block;
        struct xfs_buf          *bp;
        union xfs_btree_key     *kp;
        union xfs_btree_key     key;
        int                     ptr;

        ASSERT(level >= 0);

        block = xfs_btree_get_block(cur, level, &bp);
        /* Overlapping btrees must maintain both low and high keys. */
        if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)
                return __xfs_btree_updkeys(cur, level, block, bp, false);

        /*
         * Go up the tree from this level toward the root.
         * At each level, update the key value to the value input.
         * Stop when we reach a level where the cursor isn't pointing
         * at the first entry in the block.
         */
        xfs_btree_get_keys(cur, block, &key);
        for (level++, ptr = 1; ptr == 1 && level < cur->bc_nlevels; level++) {
#ifdef DEBUG
                int             error;
#endif
                block = xfs_btree_get_block(cur, level, &bp);
#ifdef DEBUG
                error = xfs_btree_check_block(cur, block, level, bp);
                if (error)
                        return error;
#endif
                /*
                 * ptr is re-read here but only tested by the loop condition
                 * on the next pass, so the first non-leftmost level still
                 * has its key slot updated before the walk stops.
                 */
                ptr = cur->bc_levels[level].ptr;
                kp = xfs_btree_key_addr(cur, ptr, block);
                xfs_btree_copy_keys(cur, kp, &key, 1);
                xfs_btree_log_keys(cur, bp, ptr, ptr);
        }

        return 0;
}
2390
2391 /*
2392  * Update the record referred to by cur to the value in the
2393  * given record. This either works (return 0) or gets an
2394  * EFSCORRUPTED error.
2395  */
2396 int
2397 xfs_btree_update(
2398         struct xfs_btree_cur    *cur,
2399         union xfs_btree_rec     *rec)
2400 {
2401         struct xfs_btree_block  *block;
2402         struct xfs_buf          *bp;
2403         int                     error;
2404         int                     ptr;
2405         union xfs_btree_rec     *rp;
2406
2407         /* Pick up the current block. */
2408         block = xfs_btree_get_block(cur, 0, &bp);
2409
2410 #ifdef DEBUG
2411         error = xfs_btree_check_block(cur, block, 0, bp);
2412         if (error)
2413                 goto error0;
2414 #endif
2415         /* Get the address of the rec to be updated. */
2416         ptr = cur->bc_levels[0].ptr;
2417         rp = xfs_btree_rec_addr(cur, ptr, block);
2418
2419         /* Fill in the new contents and log them. */
2420         xfs_btree_copy_recs(cur, rp, rec, 1);
2421         xfs_btree_log_recs(cur, bp, ptr, ptr);
2422
2423         /*
2424          * If we are tracking the last record in the tree and
2425          * we are at the far right edge of the tree, update it.
2426          */
2427         if (xfs_btree_is_lastrec(cur, block, 0)) {
2428                 cur->bc_ops->update_lastrec(cur, block, rec,
2429                                             ptr, LASTREC_UPDATE);
2430         }
2431
2432         /* Pass new key value up to our parent. */
2433         if (xfs_btree_needs_key_update(cur, ptr)) {
2434                 error = xfs_btree_update_keys(cur, 0);
2435                 if (error)
2436                         goto error0;
2437         }
2438
2439         return 0;
2440
2441 error0:
2442         return error;
2443 }
2444
2445 /*
2446  * Move 1 record left from cur/level if possible.
2447  * Update cur to reflect the new path.
2448  */
2449 STATIC int                                      /* error */
2450 xfs_btree_lshift(
2451         struct xfs_btree_cur    *cur,
2452         int                     level,
2453         int                     *stat)          /* success/failure */
2454 {
2455         struct xfs_buf          *lbp;           /* left buffer pointer */
2456         struct xfs_btree_block  *left;          /* left btree block */
2457         int                     lrecs;          /* left record count */
2458         struct xfs_buf          *rbp;           /* right buffer pointer */
2459         struct xfs_btree_block  *right;         /* right btree block */
2460         struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
2461         int                     rrecs;          /* right record count */
2462         union xfs_btree_ptr     lptr;           /* left btree pointer */
2463         union xfs_btree_key     *rkp = NULL;    /* right btree key */
2464         union xfs_btree_ptr     *rpp = NULL;    /* right address pointer */
2465         union xfs_btree_rec     *rrp = NULL;    /* right record pointer */
2466         int                     error;          /* error return value */
2467         int                     i;
2468
2469         if (xfs_btree_at_iroot(cur, level))
2470                 goto out0;
2471
2472         /* Set up variables for this block as "right". */
2473         right = xfs_btree_get_block(cur, level, &rbp);
2474
2475 #ifdef DEBUG
2476         error = xfs_btree_check_block(cur, right, level, rbp);
2477         if (error)
2478                 goto error0;
2479 #endif
2480
2481         /* If we've got no left sibling then we can't shift an entry left. */
2482         xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
2483         if (xfs_btree_ptr_is_null(cur, &lptr))
2484                 goto out0;
2485
2486         /*
2487          * If the cursor entry is the one that would be moved, don't
2488          * do it... it's too complicated.
2489          */
2490         if (cur->bc_levels[level].ptr <= 1)
2491                 goto out0;
2492
2493         /* Set up the left neighbor as "left". */
2494         error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
2495         if (error)
2496                 goto error0;
2497
2498         /* If it's full, it can't take another entry. */
2499         lrecs = xfs_btree_get_numrecs(left);
2500         if (lrecs == cur->bc_ops->get_maxrecs(cur, level))
2501                 goto out0;
2502
2503         rrecs = xfs_btree_get_numrecs(right);
2504
2505         /*
2506          * We add one entry to the left side and remove one for the right side.
2507          * Account for it here, the changes will be updated on disk and logged
2508          * later.
2509          */
2510         lrecs++;
2511         rrecs--;
2512
2513         XFS_BTREE_STATS_INC(cur, lshift);
2514         XFS_BTREE_STATS_ADD(cur, moves, 1);
2515
2516         /*
2517          * If non-leaf, copy a key and a ptr to the left block.
2518          * Log the changes to the left block.
2519          */
2520         if (level > 0) {
2521                 /* It's a non-leaf.  Move keys and pointers. */
2522                 union xfs_btree_key     *lkp;   /* left btree key */
2523                 union xfs_btree_ptr     *lpp;   /* left address pointer */
2524
2525                 lkp = xfs_btree_key_addr(cur, lrecs, left);
2526                 rkp = xfs_btree_key_addr(cur, 1, right);
2527
2528                 lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2529                 rpp = xfs_btree_ptr_addr(cur, 1, right);
2530
2531                 error = xfs_btree_debug_check_ptr(cur, rpp, 0, level);
2532                 if (error)
2533                         goto error0;
2534
2535                 xfs_btree_copy_keys(cur, lkp, rkp, 1);
2536                 xfs_btree_copy_ptrs(cur, lpp, rpp, 1);
2537
2538                 xfs_btree_log_keys(cur, lbp, lrecs, lrecs);
2539                 xfs_btree_log_ptrs(cur, lbp, lrecs, lrecs);
2540
2541                 ASSERT(cur->bc_ops->keys_inorder(cur,
2542                         xfs_btree_key_addr(cur, lrecs - 1, left), lkp));
2543         } else {
2544                 /* It's a leaf.  Move records.  */
2545                 union xfs_btree_rec     *lrp;   /* left record pointer */
2546
2547                 lrp = xfs_btree_rec_addr(cur, lrecs, left);
2548                 rrp = xfs_btree_rec_addr(cur, 1, right);
2549
2550                 xfs_btree_copy_recs(cur, lrp, rrp, 1);
2551                 xfs_btree_log_recs(cur, lbp, lrecs, lrecs);
2552
2553                 ASSERT(cur->bc_ops->recs_inorder(cur,
2554                         xfs_btree_rec_addr(cur, lrecs - 1, left), lrp));
2555         }
2556
2557         xfs_btree_set_numrecs(left, lrecs);
2558         xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2559
2560         xfs_btree_set_numrecs(right, rrecs);
2561         xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2562
2563         /*
2564          * Slide the contents of right down one entry.
2565          */
2566         XFS_BTREE_STATS_ADD(cur, moves, rrecs - 1);
2567         if (level > 0) {
2568                 /* It's a nonleaf. operate on keys and ptrs */
2569                 for (i = 0; i < rrecs; i++) {
2570                         error = xfs_btree_debug_check_ptr(cur, rpp, i + 1, level);
2571                         if (error)
2572                                 goto error0;
2573                 }
2574
2575                 xfs_btree_shift_keys(cur,
2576                                 xfs_btree_key_addr(cur, 2, right),
2577                                 -1, rrecs);
2578                 xfs_btree_shift_ptrs(cur,
2579                                 xfs_btree_ptr_addr(cur, 2, right),
2580                                 -1, rrecs);
2581
2582                 xfs_btree_log_keys(cur, rbp, 1, rrecs);
2583                 xfs_btree_log_ptrs(cur, rbp, 1, rrecs);
2584         } else {
2585                 /* It's a leaf. operate on records */
2586                 xfs_btree_shift_recs(cur,
2587                         xfs_btree_rec_addr(cur, 2, right),
2588                         -1, rrecs);
2589                 xfs_btree_log_recs(cur, rbp, 1, rrecs);
2590         }
2591
2592         /*
2593          * Using a temporary cursor, update the parent key values of the
2594          * block on the left.
2595          */
2596         if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {
2597                 error = xfs_btree_dup_cursor(cur, &tcur);
2598                 if (error)
2599                         goto error0;
2600                 i = xfs_btree_firstrec(tcur, level);
2601                 if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) {
2602                         xfs_btree_mark_sick(cur);
2603                         error = -EFSCORRUPTED;
2604                         goto error0;
2605                 }
2606
2607                 error = xfs_btree_decrement(tcur, level, &i);
2608                 if (error)
2609                         goto error1;
2610
2611                 /* Update the parent high keys of the left block, if needed. */
2612                 error = xfs_btree_update_keys(tcur, level);
2613                 if (error)
2614                         goto error1;
2615
2616                 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2617         }
2618
2619         /* Update the parent keys of the right block. */
2620         error = xfs_btree_update_keys(cur, level);
2621         if (error)
2622                 goto error0;
2623
2624         /* Slide the cursor value left one. */
2625         cur->bc_levels[level].ptr--;
2626
2627         *stat = 1;
2628         return 0;
2629
2630 out0:
2631         *stat = 0;
2632         return 0;
2633
2634 error0:
2635         return error;
2636
2637 error1:
2638         xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2639         return error;
2640 }
2641
2642 /*
2643  * Move 1 record right from cur/level if possible.
2644  * Update cur to reflect the new path.
2645  */
2646 STATIC int                                      /* error */
2647 xfs_btree_rshift(
2648         struct xfs_btree_cur    *cur,
2649         int                     level,
2650         int                     *stat)          /* success/failure */
2651 {
2652         struct xfs_buf          *lbp;           /* left buffer pointer */
2653         struct xfs_btree_block  *left;          /* left btree block */
2654         struct xfs_buf          *rbp;           /* right buffer pointer */
2655         struct xfs_btree_block  *right;         /* right btree block */
2656         struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
2657         union xfs_btree_ptr     rptr;           /* right block pointer */
2658         union xfs_btree_key     *rkp;           /* right btree key */
2659         int                     rrecs;          /* right record count */
2660         int                     lrecs;          /* left record count */
2661         int                     error;          /* error return value */
2662         int                     i;              /* loop counter */
2663
2664         if (xfs_btree_at_iroot(cur, level))
2665                 goto out0;
2666
2667         /* Set up variables for this block as "left". */
2668         left = xfs_btree_get_block(cur, level, &lbp);
2669
2670 #ifdef DEBUG
2671         error = xfs_btree_check_block(cur, left, level, lbp);
2672         if (error)
2673                 goto error0;
2674 #endif
2675
2676         /* If we've got no right sibling then we can't shift an entry right. */
2677         xfs_btree_get_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);
2678         if (xfs_btree_ptr_is_null(cur, &rptr))
2679                 goto out0;
2680
2681         /*
2682          * If the cursor entry is the one that would be moved, don't
2683          * do it... it's too complicated.
2684          */
2685         lrecs = xfs_btree_get_numrecs(left);
2686         if (cur->bc_levels[level].ptr >= lrecs)
2687                 goto out0;
2688
2689         /* Set up the right neighbor as "right". */
2690         error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
2691         if (error)
2692                 goto error0;
2693
2694         /* If it's full, it can't take another entry. */
2695         rrecs = xfs_btree_get_numrecs(right);
2696         if (rrecs == cur->bc_ops->get_maxrecs(cur, level))
2697                 goto out0;
2698
2699         XFS_BTREE_STATS_INC(cur, rshift);
2700         XFS_BTREE_STATS_ADD(cur, moves, rrecs);
2701
2702         /*
2703          * Make a hole at the start of the right neighbor block, then
2704          * copy the last left block entry to the hole.
2705          */
2706         if (level > 0) {
2707                 /* It's a nonleaf. make a hole in the keys and ptrs */
2708                 union xfs_btree_key     *lkp;
2709                 union xfs_btree_ptr     *lpp;
2710                 union xfs_btree_ptr     *rpp;
2711
2712                 lkp = xfs_btree_key_addr(cur, lrecs, left);
2713                 lpp = xfs_btree_ptr_addr(cur, lrecs, left);
2714                 rkp = xfs_btree_key_addr(cur, 1, right);
2715                 rpp = xfs_btree_ptr_addr(cur, 1, right);
2716
2717                 for (i = rrecs - 1; i >= 0; i--) {
2718                         error = xfs_btree_debug_check_ptr(cur, rpp, i, level);
2719                         if (error)
2720                                 goto error0;
2721                 }
2722
2723                 xfs_btree_shift_keys(cur, rkp, 1, rrecs);
2724                 xfs_btree_shift_ptrs(cur, rpp, 1, rrecs);
2725
2726                 error = xfs_btree_debug_check_ptr(cur, lpp, 0, level);
2727                 if (error)
2728                         goto error0;
2729
2730                 /* Now put the new data in, and log it. */
2731                 xfs_btree_copy_keys(cur, rkp, lkp, 1);
2732                 xfs_btree_copy_ptrs(cur, rpp, lpp, 1);
2733
2734                 xfs_btree_log_keys(cur, rbp, 1, rrecs + 1);
2735                 xfs_btree_log_ptrs(cur, rbp, 1, rrecs + 1);
2736
2737                 ASSERT(cur->bc_ops->keys_inorder(cur, rkp,
2738                         xfs_btree_key_addr(cur, 2, right)));
2739         } else {
2740                 /* It's a leaf. make a hole in the records */
2741                 union xfs_btree_rec     *lrp;
2742                 union xfs_btree_rec     *rrp;
2743
2744                 lrp = xfs_btree_rec_addr(cur, lrecs, left);
2745                 rrp = xfs_btree_rec_addr(cur, 1, right);
2746
2747                 xfs_btree_shift_recs(cur, rrp, 1, rrecs);
2748
2749                 /* Now put the new data in, and log it. */
2750                 xfs_btree_copy_recs(cur, rrp, lrp, 1);
2751                 xfs_btree_log_recs(cur, rbp, 1, rrecs + 1);
2752         }
2753
2754         /*
2755          * Decrement and log left's numrecs, bump and log right's numrecs.
2756          */
2757         xfs_btree_set_numrecs(left, --lrecs);
2758         xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS);
2759
2760         xfs_btree_set_numrecs(right, ++rrecs);
2761         xfs_btree_log_block(cur, rbp, XFS_BB_NUMRECS);
2762
2763         /*
2764          * Using a temporary cursor, update the parent key values of the
2765          * block on the right.
2766          */
2767         error = xfs_btree_dup_cursor(cur, &tcur);
2768         if (error)
2769                 goto error0;
2770         i = xfs_btree_lastrec(tcur, level);
2771         if (XFS_IS_CORRUPT(tcur->bc_mp, i != 1)) {
2772                 xfs_btree_mark_sick(cur);
2773                 error = -EFSCORRUPTED;
2774                 goto error0;
2775         }
2776
2777         error = xfs_btree_increment(tcur, level, &i);
2778         if (error)
2779                 goto error1;
2780
2781         /* Update the parent high keys of the left block, if needed. */
2782         if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {
2783                 error = xfs_btree_update_keys(cur, level);
2784                 if (error)
2785                         goto error1;
2786         }
2787
2788         /* Update the parent keys of the right block. */
2789         error = xfs_btree_update_keys(tcur, level);
2790         if (error)
2791                 goto error1;
2792
2793         xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
2794
2795         *stat = 1;
2796         return 0;
2797
2798 out0:
2799         *stat = 0;
2800         return 0;
2801
2802 error0:
2803         return error;
2804
2805 error1:
2806         xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
2807         return error;
2808 }
2809
2810 static inline int
2811 xfs_btree_alloc_block(
2812         struct xfs_btree_cur            *cur,
2813         const union xfs_btree_ptr       *hint_block,
2814         union xfs_btree_ptr             *new_block,
2815         int                             *stat)
2816 {
2817         int                             error;
2818
2819         /*
2820          * Don't allow block allocation for a staging cursor, because staging
2821          * cursors do not support regular btree modifications.
2822          *
2823          * Bulk loading uses a separate callback to obtain new blocks from a
2824          * preallocated list, which prevents ENOSPC failures during loading.
2825          */
2826         if (unlikely(cur->bc_flags & XFS_BTREE_STAGING)) {
2827                 ASSERT(0);
2828                 return -EFSCORRUPTED;
2829         }
2830
2831         error = cur->bc_ops->alloc_block(cur, hint_block, new_block, stat);
2832         trace_xfs_btree_alloc_block(cur, new_block, *stat, error);
2833         return error;
2834 }
2835
2836 /*
2837  * Split cur/level block in half.
2838  * Return new block number and the key to its first
2839  * record (to be inserted into parent).
2840  */
STATIC int					/* error */
__xfs_btree_split(
	struct xfs_btree_cur	*cur,
	int			level,
	union xfs_btree_ptr	*ptrp,
	union xfs_btree_key	*key,
	struct xfs_btree_cur	**curp,
	int			*stat)		/* success/failure */
{
	union xfs_btree_ptr	lptr;		/* left sibling block ptr */
	struct xfs_buf		*lbp;		/* left buffer pointer */
	struct xfs_btree_block	*left;		/* left btree block */
	union xfs_btree_ptr	rptr;		/* right sibling block ptr */
	struct xfs_buf		*rbp;		/* right buffer pointer */
	struct xfs_btree_block	*right;		/* right btree block */
	union xfs_btree_ptr	rrptr;		/* right-right sibling ptr */
	struct xfs_buf		*rrbp;		/* right-right buffer pointer */
	struct xfs_btree_block	*rrblock;	/* right-right btree block */
	int			lrecs;		/* records kept in the left block */
	int			rrecs;		/* records moved to the right block */
	int			src_index;	/* first left entry to move right */
	int			error;		/* error return value */
	int			i;

	XFS_BTREE_STATS_INC(cur, split);

	/* Set up left block (current one). */
	left = xfs_btree_get_block(cur, level, &lbp);

#ifdef DEBUG
	error = xfs_btree_check_block(cur, left, level, lbp);
	if (error)
		goto error0;
#endif

	xfs_btree_buf_to_ptr(cur, lbp, &lptr);

	/* Allocate the new block. If we can't do it, we're toast. Give up. */
	error = xfs_btree_alloc_block(cur, &lptr, &rptr, stat);
	if (error)
		goto error0;
	if (*stat == 0)
		goto out0;
	XFS_BTREE_STATS_INC(cur, alloc);

	/* Set up the new block as "right". */
	error = xfs_btree_get_buf_block(cur, &rptr, &right, &rbp);
	if (error)
		goto error0;

	/* Fill in the btree header for the new right block. */
	xfs_btree_init_block_cur(cur, rbp, xfs_btree_get_level(left), 0);

	/*
	 * Split the entries between the old and the new block evenly.
	 * Make sure that if there's an odd number of entries now, that
	 * each new block will have the same number of entries.
	 */
	lrecs = xfs_btree_get_numrecs(left);
	rrecs = lrecs / 2;
	if ((lrecs & 1) && cur->bc_levels[level].ptr <= rrecs + 1)
		rrecs++;
	src_index = (lrecs - rrecs + 1);

	XFS_BTREE_STATS_ADD(cur, moves, rrecs);

	/* Adjust numrecs for the later get_*_keys() calls. */
	lrecs -= rrecs;
	xfs_btree_set_numrecs(left, lrecs);
	xfs_btree_set_numrecs(right, xfs_btree_get_numrecs(right) + rrecs);

	/*
	 * Copy btree block entries from the left block over to the
	 * new block, the right. Update the right block and log the
	 * changes.
	 */
	if (level > 0) {
		/* It's a non-leaf.  Move keys and pointers. */
		union xfs_btree_key	*lkp;	/* left btree key */
		union xfs_btree_ptr	*lpp;	/* left address pointer */
		union xfs_btree_key	*rkp;	/* right btree key */
		union xfs_btree_ptr	*rpp;	/* right address pointer */

		lkp = xfs_btree_key_addr(cur, src_index, left);
		lpp = xfs_btree_ptr_addr(cur, src_index, left);
		rkp = xfs_btree_key_addr(cur, 1, right);
		rpp = xfs_btree_ptr_addr(cur, 1, right);

		for (i = src_index; i < rrecs; i++) {
			error = xfs_btree_debug_check_ptr(cur, lpp, i, level);
			if (error)
				goto error0;
		}

		/* Copy the keys & pointers to the new block. */
		xfs_btree_copy_keys(cur, rkp, lkp, rrecs);
		xfs_btree_copy_ptrs(cur, rpp, lpp, rrecs);

		xfs_btree_log_keys(cur, rbp, 1, rrecs);
		xfs_btree_log_ptrs(cur, rbp, 1, rrecs);

		/* Stash the keys of the new block for later insertion. */
		xfs_btree_get_node_keys(cur, right, key);
	} else {
		/* It's a leaf.  Move records.  */
		union xfs_btree_rec	*lrp;	/* left record pointer */
		union xfs_btree_rec	*rrp;	/* right record pointer */

		lrp = xfs_btree_rec_addr(cur, src_index, left);
		rrp = xfs_btree_rec_addr(cur, 1, right);

		/* Copy records to the new block. */
		xfs_btree_copy_recs(cur, rrp, lrp, rrecs);
		xfs_btree_log_recs(cur, rbp, 1, rrecs);

		/* Stash the keys of the new block for later insertion. */
		xfs_btree_get_leaf_keys(cur, right, key);
	}

	/*
	 * Find the left block number by looking in the buffer.
	 * Adjust sibling pointers.
	 */
	xfs_btree_get_sibling(cur, left, &rrptr, XFS_BB_RIGHTSIB);
	xfs_btree_set_sibling(cur, right, &rrptr, XFS_BB_RIGHTSIB);
	xfs_btree_set_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
	xfs_btree_set_sibling(cur, left, &rptr, XFS_BB_RIGHTSIB);

	xfs_btree_log_block(cur, rbp, XFS_BB_ALL_BITS);
	xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);

	/*
	 * If there's a block to the new block's right, make that block
	 * point back to right instead of to left.
	 */
	if (!xfs_btree_ptr_is_null(cur, &rrptr)) {
		error = xfs_btree_read_buf_block(cur, &rrptr,
							0, &rrblock, &rrbp);
		if (error)
			goto error0;
		xfs_btree_set_sibling(cur, rrblock, &rptr, XFS_BB_LEFTSIB);
		xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
	}

	/* Update the parent high keys of the left block, if needed. */
	if (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING) {
		error = xfs_btree_update_keys(cur, level);
		if (error)
			goto error0;
	}

	/*
	 * If the cursor is really in the right block, move it there.
	 * If it's just pointing past the last entry in left, then we'll
	 * insert there, so don't change anything in that case.
	 */
	if (cur->bc_levels[level].ptr > lrecs + 1) {
		xfs_btree_setbuf(cur, level, rbp);
		cur->bc_levels[level].ptr -= lrecs;
	}
	/*
	 * If there are more levels, we'll need another cursor which refers
	 * the right block, no matter where this cursor was.
	 */
	if (level + 1 < cur->bc_nlevels) {
		error = xfs_btree_dup_cursor(cur, curp);
		if (error)
			goto error0;
		(*curp)->bc_levels[level + 1].ptr++;
	}
	/* Hand the new right block's ptr back for insertion into the parent. */
	*ptrp = rptr;
	*stat = 1;
	return 0;
out0:
	*stat = 0;
	return 0;

error0:
	return error;
}
3021
3022 #ifdef __KERNEL__
/*
 * Argument package passed from xfs_btree_split() to the worker thread that
 * performs the actual split on its behalf.
 */
struct xfs_btree_split_args {
	struct xfs_btree_cur	*cur;		/* cursor of block being split */
	int			level;		/* btree level to split at */
	union xfs_btree_ptr	*ptrp;		/* out: ptr to new right block */
	union xfs_btree_key	*key;		/* out: key of new right block */
	struct xfs_btree_cur	**curp;		/* out: cursor for new block */
	int			*stat;		/* success/failure */
	int			result;		/* error code from the split */
	bool			kswapd;	/* allocation in kswapd context */
	struct completion	*done;		/* signalled when split finishes */
	struct work_struct	work;		/* work item queued to xfs_alloc_wq */
};
3035
3036 /*
3037  * Stack switching interfaces for allocation
3038  */
/* Workqueue function that runs a deferred btree split for xfs_btree_split(). */
static void
xfs_btree_split_worker(
	struct work_struct	*work)
{
	struct xfs_btree_split_args	*args = container_of(work,
						struct xfs_btree_split_args, work);
	unsigned long		pflags;
	unsigned long		new_pflags = 0;

	/*
	 * we are in a transaction context here, but may also be doing work
	 * in kswapd context, and hence we may need to inherit that state
	 * temporarily to ensure that we don't block waiting for memory reclaim
	 * in any way.
	 */
	if (args->kswapd)
		new_pflags |= PF_MEMALLOC | PF_KSWAPD;

	/*
	 * Take on the submitter's task flags and transaction context for the
	 * duration of the split, restoring both once it is done.
	 */
	current_set_flags_nested(&pflags, new_pflags);
	xfs_trans_set_context(args->cur->bc_tp);

	args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
					 args->key, args->curp, args->stat);

	xfs_trans_clear_context(args->cur->bc_tp);
	current_restore_flags_nested(&pflags, new_pflags);

	/*
	 * Do not access args after complete() has run here. We don't own args
	 * and the owner may run and free args before we return here.
	 */
	complete(args->done);

}
3073
3074 /*
3075  * BMBT split requests often come in with little stack to work on so we push
3076  * them off to a worker thread so there is lots of stack to use. For the other
3077  * btree types, just call directly to avoid the context switch overhead here.
3078  *
3079  * Care must be taken here - the work queue rescuer thread introduces potential
3080  * AGF <> worker queue deadlocks if the BMBT block allocation has to lock new
3081  * AGFs to allocate blocks. A task being run by the rescuer could attempt to
3082  * lock an AGF that is already locked by a task queued to run by the rescuer,
3083  * resulting in an ABBA deadlock as the rescuer cannot run the lock holder to
3084  * release it until the current thread it is running gains the lock.
3085  *
3086  * To avoid this issue, we only ever queue BMBT splits that don't have an AGF
3087  * already locked to allocate from. The only place that doesn't hold an AGF
3088  * locked is unwritten extent conversion at IO completion, but that has already
3089  * been offloaded to a worker thread and hence has no stack consumption issues
3090  * we have to worry about.
3091  */
3092 STATIC int                                      /* error */
3093 xfs_btree_split(
3094         struct xfs_btree_cur    *cur,
3095         int                     level,
3096         union xfs_btree_ptr     *ptrp,
3097         union xfs_btree_key     *key,
3098         struct xfs_btree_cur    **curp,
3099         int                     *stat)          /* success/failure */
3100 {
3101         struct xfs_btree_split_args     args;
3102         DECLARE_COMPLETION_ONSTACK(done);
3103
3104         if (!xfs_btree_is_bmap(cur->bc_ops) ||
3105             cur->bc_tp->t_highest_agno == NULLAGNUMBER)
3106                 return __xfs_btree_split(cur, level, ptrp, key, curp, stat);
3107
3108         args.cur = cur;
3109         args.level = level;
3110         args.ptrp = ptrp;
3111         args.key = key;
3112         args.curp = curp;
3113         args.stat = stat;
3114         args.done = &done;
3115         args.kswapd = current_is_kswapd();
3116         INIT_WORK_ONSTACK(&args.work, xfs_btree_split_worker);
3117         queue_work(xfs_alloc_wq, &args.work);
3118         wait_for_completion(&done);
3119         destroy_work_on_stack(&args.work);
3120         return args.result;
3121 }
3122 #else
3123 #define xfs_btree_split __xfs_btree_split
3124 #endif /* __KERNEL__ */
3125
3126 /*
3127  * Copy the old inode root contents into a real block and make the
3128  * broot point to it.
3129  */
int						/* error */
xfs_btree_new_iroot(
	struct xfs_btree_cur	*cur,		/* btree cursor */
	int			*logflags,	/* logging flags for inode */
	int			*stat)		/* return status - 0 fail */
{
	struct xfs_buf		*cbp;		/* buffer for cblock */
	struct xfs_btree_block	*block;		/* btree block */
	struct xfs_btree_block	*cblock;	/* child btree block */
	union xfs_btree_key	*ckp;		/* child key pointer */
	union xfs_btree_ptr	*cpp;		/* child ptr pointer */
	union xfs_btree_key	*kp;		/* pointer to btree key */
	union xfs_btree_ptr	*pp;		/* pointer to block addr */
	union xfs_btree_ptr	nptr;		/* new block addr */
	int			level;		/* btree level */
	int			error;		/* error return code */
	int			i;		/* loop counter */

	XFS_BTREE_STATS_INC(cur, newroot);

	/* Only inode-rooted btrees keep their root in the inode fork. */
	ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);

	level = cur->bc_nlevels - 1;

	block = xfs_btree_get_iroot(cur);
	pp = xfs_btree_ptr_addr(cur, 1, block);

	/* Allocate the new block. If we can't do it, we're toast. Give up. */
	error = xfs_btree_alloc_block(cur, pp, &nptr, stat);
	if (error)
		goto error0;
	if (*stat == 0)
		return 0;

	XFS_BTREE_STATS_INC(cur, alloc);

	/* Copy the root into a real block. */
	error = xfs_btree_get_buf_block(cur, &nptr, &cblock, &cbp);
	if (error)
		goto error0;

	/*
	 * we can't just memcpy() the root in for CRC enabled btree blocks.
	 * In that case have to also ensure the blkno remains correct
	 */
	memcpy(cblock, block, xfs_btree_block_len(cur));
	if (xfs_has_crc(cur->bc_mp)) {
		__be64 bno = cpu_to_be64(xfs_buf_daddr(cbp));
		if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
			cblock->bb_u.l.bb_blkno = bno;
		else
			cblock->bb_u.s.bb_blkno = bno;
	}

	/*
	 * The old root now lives in the child block, so the root grows one
	 * level and shrinks to a single record pointing at that child.
	 */
	be16_add_cpu(&block->bb_level, 1);
	xfs_btree_set_numrecs(block, 1);
	cur->bc_nlevels++;
	ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
	cur->bc_levels[level + 1].ptr = 1;

	/* Copy the root's keys and ptrs down into the new child block. */
	kp = xfs_btree_key_addr(cur, 1, block);
	ckp = xfs_btree_key_addr(cur, 1, cblock);
	xfs_btree_copy_keys(cur, ckp, kp, xfs_btree_get_numrecs(cblock));

	cpp = xfs_btree_ptr_addr(cur, 1, cblock);
	for (i = 0; i < be16_to_cpu(cblock->bb_numrecs); i++) {
		error = xfs_btree_debug_check_ptr(cur, pp, i, level);
		if (error)
			goto error0;
	}

	xfs_btree_copy_ptrs(cur, cpp, pp, xfs_btree_get_numrecs(cblock));

	error = xfs_btree_debug_check_ptr(cur, &nptr, 0, level);
	if (error)
		goto error0;

	/* Point the root's single ptr at the new child block. */
	xfs_btree_copy_ptrs(cur, pp, &nptr, 1);

	/* Resize the inode root fork down to its single remaining record. */
	xfs_iroot_realloc(cur->bc_ino.ip,
			  1 - xfs_btree_get_numrecs(cblock),
			  cur->bc_ino.whichfork);

	xfs_btree_setbuf(cur, level, cbp);

	/*
	 * Do all this logging at the end so that
	 * the root is at the right level.
	 */
	xfs_btree_log_block(cur, cbp, XFS_BB_ALL_BITS);
	xfs_btree_log_keys(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));
	xfs_btree_log_ptrs(cur, cbp, 1, be16_to_cpu(cblock->bb_numrecs));

	*logflags |=
		XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork);
	*stat = 1;
	return 0;
error0:
	return error;
}
3230
3231 static void
3232 xfs_btree_set_root(
3233         struct xfs_btree_cur            *cur,
3234         const union xfs_btree_ptr       *ptr,
3235         int                             inc)
3236 {
3237         if (cur->bc_flags & XFS_BTREE_STAGING) {
3238                 /* Update the btree root information for a per-AG fake root. */
3239                 cur->bc_ag.afake->af_root = be32_to_cpu(ptr->s);
3240                 cur->bc_ag.afake->af_levels += inc;
3241         } else {
3242                 cur->bc_ops->set_root(cur, ptr, inc);
3243         }
3244 }
3245
/*
 * Allocate a new root block, fill it in.
 *
 * Called when an insert has split the old root block, leaving two blocks
 * at the old root level.  A new root block is allocated and initialized
 * with one key/pointer pair for each half of the split, the root pointer
 * in the holding structure is updated, and the cursor grows one level.
 * *stat is 1 on success, 0 if the block allocation failed benignly.
 */
STATIC int				/* error */
xfs_btree_new_root(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			*stat)	/* success/failure */
{
	struct xfs_btree_block	*block;	/* one half of the old root block */
	struct xfs_buf		*bp;	/* buffer containing block */
	int			error;	/* error return value */
	struct xfs_buf		*lbp;	/* left buffer pointer */
	struct xfs_btree_block	*left;	/* left btree block */
	struct xfs_buf		*nbp;	/* new (root) buffer */
	struct xfs_btree_block	*new;	/* new (root) btree block */
	int			nptr;	/* new value for key index, 1 or 2 */
	struct xfs_buf		*rbp;	/* right buffer pointer */
	struct xfs_btree_block	*right;	/* right btree block */
	union xfs_btree_ptr	rptr;
	union xfs_btree_ptr	lptr;

	XFS_BTREE_STATS_INC(cur, newroot);

	/* initialise our start point from the cursor */
	xfs_btree_init_ptr_from_cur(cur, &rptr);

	/* Allocate the new block. If we can't do it, we're toast. Give up. */
	error = xfs_btree_alloc_block(cur, &rptr, &lptr, stat);
	if (error)
		goto error0;
	if (*stat == 0)
		goto out0;
	XFS_BTREE_STATS_INC(cur, alloc);

	/* Set up the new block. */
	error = xfs_btree_get_buf_block(cur, &lptr, &new, &nbp);
	if (error)
		goto error0;

	/* Set the root in the holding structure  increasing the level by 1. */
	xfs_btree_set_root(cur, &lptr, 1);

	/*
	 * At the previous root level there are now two blocks: the old root,
	 * and the new block generated when it was split.  We don't know which
	 * one the cursor is pointing at, so we set up variables "left" and
	 * "right" for each case.
	 */
	block = xfs_btree_get_block(cur, cur->bc_nlevels - 1, &bp);

#ifdef DEBUG
	error = xfs_btree_check_block(cur, block, cur->bc_nlevels - 1, bp);
	if (error)
		goto error0;
#endif

	/* A non-null right sibling means the cursor's block is the left half. */
	xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
	if (!xfs_btree_ptr_is_null(cur, &rptr)) {
		/* Our block is left, pick up the right block. */
		lbp = bp;
		xfs_btree_buf_to_ptr(cur, lbp, &lptr);
		left = block;
		error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
		if (error)
			goto error0;
		bp = rbp;
		/* Cursor stays over the left half: entry 1 of the new root. */
		nptr = 1;
	} else {
		/* Our block is right, pick up the left block. */
		rbp = bp;
		xfs_btree_buf_to_ptr(cur, rbp, &rptr);
		right = block;
		xfs_btree_get_sibling(cur, right, &lptr, XFS_BB_LEFTSIB);
		error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
		if (error)
			goto error0;
		bp = lbp;
		/* Cursor stays over the right half: entry 2 of the new root. */
		nptr = 2;
	}

	/* Fill in the new block's btree header and log it. */
	xfs_btree_init_block_cur(cur, nbp, cur->bc_nlevels, 2);
	xfs_btree_log_block(cur, nbp, XFS_BB_ALL_BITS);
	ASSERT(!xfs_btree_ptr_is_null(cur, &lptr) &&
			!xfs_btree_ptr_is_null(cur, &rptr));

	/* Fill in the key data in the new root. */
	if (xfs_btree_get_level(left) > 0) {
		/*
		 * Get the keys for the left block's keys and put them directly
		 * in the parent block.  Do the same for the right block.
		 */
		xfs_btree_get_node_keys(cur, left,
				xfs_btree_key_addr(cur, 1, new));
		xfs_btree_get_node_keys(cur, right,
				xfs_btree_key_addr(cur, 2, new));
	} else {
		/*
		 * Get the keys for the left block's records and put them
		 * directly in the parent block.  Do the same for the right
		 * block.
		 */
		xfs_btree_get_leaf_keys(cur, left,
			xfs_btree_key_addr(cur, 1, new));
		xfs_btree_get_leaf_keys(cur, right,
			xfs_btree_key_addr(cur, 2, new));
	}
	xfs_btree_log_keys(cur, nbp, 1, 2);

	/* Fill in the pointer data in the new root. */
	xfs_btree_copy_ptrs(cur,
		xfs_btree_ptr_addr(cur, 1, new), &lptr, 1);
	xfs_btree_copy_ptrs(cur,
		xfs_btree_ptr_addr(cur, 2, new), &rptr, 1);
	xfs_btree_log_ptrs(cur, nbp, 1, 2);

	/* Fix up the cursor. */
	xfs_btree_setbuf(cur, cur->bc_nlevels, nbp);
	cur->bc_levels[cur->bc_nlevels].ptr = nptr;
	cur->bc_nlevels++;
	ASSERT(cur->bc_nlevels <= cur->bc_maxlevels);
	*stat = 1;
	return 0;
error0:
	return error;
out0:
	*stat = 0;
	return 0;
}
3375
/*
 * Make room for a new entry in the block at @level.
 *
 * If the cursor sits on the root of an inode-rooted btree, either grow the
 * root inside the inode fork or migrate the root out into a real block.
 * Otherwise try, in order: shifting an entry to the right sibling, shifting
 * an entry to the left sibling (which moves the insertion point, so
 * *oindex/*index are refreshed from the cursor), and finally splitting the
 * block.  A split returns the new sibling's location, key and cursor via
 * @nptr, @key and @ncur.  *stat is 1 if space was made, 0 otherwise.
 */
STATIC int
xfs_btree_make_block_unfull(
	struct xfs_btree_cur	*cur,	/* btree cursor */
	int			level,	/* btree level */
	int			numrecs,/* # of recs in block */
	int			*oindex,/* old tree index */
	int			*index,	/* new tree index */
	union xfs_btree_ptr	*nptr,	/* new btree ptr */
	struct xfs_btree_cur	**ncur,	/* new btree cursor */
	union xfs_btree_key	*key,	/* key of new block */
	int			*stat)
{
	int			error = 0;

	if (xfs_btree_at_iroot(cur, level)) {
		struct xfs_inode *ip = cur->bc_ino.ip;

		if (numrecs < cur->bc_ops->get_dmaxrecs(cur, level)) {
			/* A root block that can be made bigger. */
			xfs_iroot_realloc(ip, 1, cur->bc_ino.whichfork);
			*stat = 1;
		} else {
			/* A root block that needs replacing */
			int	logflags = 0;

			error = xfs_btree_new_iroot(cur, &logflags, stat);
			if (error || *stat == 0)
				return error;

			xfs_trans_log_inode(cur->bc_tp, ip, logflags);
		}

		return 0;
	}

	/* First, try shifting an entry to the right neighbor. */
	error = xfs_btree_rshift(cur, level, stat);
	if (error || *stat)
		return error;

	/* Next, try shifting an entry to the left neighbor. */
	error = xfs_btree_lshift(cur, level, stat);
	if (error)
		return error;

	if (*stat) {
		/* Left shift moved us over; recompute the insertion index. */
		*oindex = *index = cur->bc_levels[level].ptr;
		return 0;
	}

	/*
	 * Next, try splitting the current block in half.
	 *
	 * If this works we have to re-set our variables because we
	 * could be in a different block now.
	 */
	error = xfs_btree_split(cur, level, nptr, key, ncur, stat);
	if (error || *stat == 0)
		return error;


	*index = cur->bc_levels[level].ptr;
	return 0;
}
3440
3441 /*
3442  * Insert one record/level.  Return information to the caller
3443  * allowing the next level up to proceed if necessary.
3444  */
3445 STATIC int
3446 xfs_btree_insrec(
3447         struct xfs_btree_cur    *cur,   /* btree cursor */
3448         int                     level,  /* level to insert record at */
3449         union xfs_btree_ptr     *ptrp,  /* i/o: block number inserted */
3450         union xfs_btree_rec     *rec,   /* record to insert */
3451         union xfs_btree_key     *key,   /* i/o: block key for ptrp */
3452         struct xfs_btree_cur    **curp, /* output: new cursor replacing cur */
3453         int                     *stat)  /* success/failure */
3454 {
3455         struct xfs_btree_block  *block; /* btree block */
3456         struct xfs_buf          *bp;    /* buffer for block */
3457         union xfs_btree_ptr     nptr;   /* new block ptr */
3458         struct xfs_btree_cur    *ncur = NULL;   /* new btree cursor */
3459         union xfs_btree_key     nkey;   /* new block key */
3460         union xfs_btree_key     *lkey;
3461         int                     optr;   /* old key/record index */
3462         int                     ptr;    /* key/record index */
3463         int                     numrecs;/* number of records */
3464         int                     error;  /* error return value */
3465         int                     i;
3466         xfs_daddr_t             old_bn;
3467
3468         ncur = NULL;
3469         lkey = &nkey;
3470
3471         /*
3472          * If we have an external root pointer, and we've made it to the
3473          * root level, allocate a new root block and we're done.
3474          */
3475         if (cur->bc_ops->type != XFS_BTREE_TYPE_INODE &&
3476             level >= cur->bc_nlevels) {
3477                 error = xfs_btree_new_root(cur, stat);
3478                 xfs_btree_set_ptr_null(cur, ptrp);
3479
3480                 return error;
3481         }
3482
3483         /* If we're off the left edge, return failure. */
3484         ptr = cur->bc_levels[level].ptr;
3485         if (ptr == 0) {
3486                 *stat = 0;
3487                 return 0;
3488         }
3489
3490         optr = ptr;
3491
3492         XFS_BTREE_STATS_INC(cur, insrec);
3493
3494         /* Get pointers to the btree buffer and block. */
3495         block = xfs_btree_get_block(cur, level, &bp);
3496         old_bn = bp ? xfs_buf_daddr(bp) : XFS_BUF_DADDR_NULL;
3497         numrecs = xfs_btree_get_numrecs(block);
3498
3499 #ifdef DEBUG
3500         error = xfs_btree_check_block(cur, block, level, bp);
3501         if (error)
3502                 goto error0;
3503
3504         /* Check that the new entry is being inserted in the right place. */
3505         if (ptr <= numrecs) {
3506                 if (level == 0) {
3507                         ASSERT(cur->bc_ops->recs_inorder(cur, rec,
3508                                 xfs_btree_rec_addr(cur, ptr, block)));
3509                 } else {
3510                         ASSERT(cur->bc_ops->keys_inorder(cur, key,
3511                                 xfs_btree_key_addr(cur, ptr, block)));
3512                 }
3513         }
3514 #endif
3515
3516         /*
3517          * If the block is full, we can't insert the new entry until we
3518          * make the block un-full.
3519          */
3520         xfs_btree_set_ptr_null(cur, &nptr);
3521         if (numrecs == cur->bc_ops->get_maxrecs(cur, level)) {
3522                 error = xfs_btree_make_block_unfull(cur, level, numrecs,
3523                                         &optr, &ptr, &nptr, &ncur, lkey, stat);
3524                 if (error || *stat == 0)
3525                         goto error0;
3526         }
3527
3528         /*
3529          * The current block may have changed if the block was
3530          * previously full and we have just made space in it.
3531          */
3532         block = xfs_btree_get_block(cur, level, &bp);
3533         numrecs = xfs_btree_get_numrecs(block);
3534
3535 #ifdef DEBUG
3536         error = xfs_btree_check_block(cur, block, level, bp);
3537         if (error)
3538                 goto error0;
3539 #endif
3540
3541         /*
3542          * At this point we know there's room for our new entry in the block
3543          * we're pointing at.
3544          */
3545         XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr + 1);
3546
3547         if (level > 0) {
3548                 /* It's a nonleaf. make a hole in the keys and ptrs */
3549                 union xfs_btree_key     *kp;
3550                 union xfs_btree_ptr     *pp;
3551
3552                 kp = xfs_btree_key_addr(cur, ptr, block);
3553                 pp = xfs_btree_ptr_addr(cur, ptr, block);
3554
3555                 for (i = numrecs - ptr; i >= 0; i--) {
3556                         error = xfs_btree_debug_check_ptr(cur, pp, i, level);
3557                         if (error)
3558                                 goto error0;
3559                 }
3560
3561                 xfs_btree_shift_keys(cur, kp, 1, numrecs - ptr + 1);
3562                 xfs_btree_shift_ptrs(cur, pp, 1, numrecs - ptr + 1);
3563
3564                 error = xfs_btree_debug_check_ptr(cur, ptrp, 0, level);
3565                 if (error)
3566                         goto error0;
3567
3568                 /* Now put the new data in, bump numrecs and log it. */
3569                 xfs_btree_copy_keys(cur, kp, key, 1);
3570                 xfs_btree_copy_ptrs(cur, pp, ptrp, 1);
3571                 numrecs++;
3572                 xfs_btree_set_numrecs(block, numrecs);
3573                 xfs_btree_log_ptrs(cur, bp, ptr, numrecs);
3574                 xfs_btree_log_keys(cur, bp, ptr, numrecs);
3575 #ifdef DEBUG
3576                 if (ptr < numrecs) {
3577                         ASSERT(cur->bc_ops->keys_inorder(cur, kp,
3578                                 xfs_btree_key_addr(cur, ptr + 1, block)));
3579                 }
3580 #endif
3581         } else {
3582                 /* It's a leaf. make a hole in the records */
3583                 union xfs_btree_rec             *rp;
3584
3585                 rp = xfs_btree_rec_addr(cur, ptr, block);
3586
3587                 xfs_btree_shift_recs(cur, rp, 1, numrecs - ptr + 1);
3588
3589                 /* Now put the new data in, bump numrecs and log it. */
3590                 xfs_btree_copy_recs(cur, rp, rec, 1);
3591                 xfs_btree_set_numrecs(block, ++numrecs);
3592                 xfs_btree_log_recs(cur, bp, ptr, numrecs);
3593 #ifdef DEBUG
3594                 if (ptr < numrecs) {
3595                         ASSERT(cur->bc_ops->recs_inorder(cur, rp,
3596                                 xfs_btree_rec_addr(cur, ptr + 1, block)));
3597                 }
3598 #endif
3599         }
3600
3601         /* Log the new number of records in the btree header. */
3602         xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3603
3604         /*
3605          * If we just inserted into a new tree block, we have to
3606          * recalculate nkey here because nkey is out of date.
3607          *
3608          * Otherwise we're just updating an existing block (having shoved
3609          * some records into the new tree block), so use the regular key
3610          * update mechanism.
3611          */
3612         if (bp && xfs_buf_daddr(bp) != old_bn) {
3613                 xfs_btree_get_keys(cur, block, lkey);
3614         } else if (xfs_btree_needs_key_update(cur, optr)) {
3615                 error = xfs_btree_update_keys(cur, level);
3616                 if (error)
3617                         goto error0;
3618         }
3619
3620         /*
3621          * If we are tracking the last record in the tree and
3622          * we are at the far right edge of the tree, update it.
3623          */
3624         if (xfs_btree_is_lastrec(cur, block, level)) {
3625                 cur->bc_ops->update_lastrec(cur, block, rec,
3626                                             ptr, LASTREC_INSREC);
3627         }
3628
3629         /*
3630          * Return the new block number, if any.
3631          * If there is one, give back a record value and a cursor too.
3632          */
3633         *ptrp = nptr;
3634         if (!xfs_btree_ptr_is_null(cur, &nptr)) {
3635                 xfs_btree_copy_keys(cur, key, lkey, 1);
3636                 *curp = ncur;
3637         }
3638
3639         *stat = 1;
3640         return 0;
3641
3642 error0:
3643         if (ncur)
3644                 xfs_btree_del_cursor(ncur, error);
3645         return error;
3646 }
3647
/*
 * Insert the record at the point referenced by cur.
 *
 * A multi-level split of the tree on insert will invalidate the original
 * cursor.  All callers of this function should assume that the cursor is
 * no longer valid and revalidate it.
 *
 * *stat is 1 if the record was inserted, 0 if not (e.g. cursor off the
 * left edge of the tree).
 */
int
xfs_btree_insert(
	struct xfs_btree_cur	*cur,
	int			*stat)
{
	int			error;	/* error return value */
	int			i;	/* result value, 0 for failure */
	int			level;	/* current level number in btree */
	union xfs_btree_ptr	nptr;	/* new block number (split result) */
	struct xfs_btree_cur	*ncur;	/* new cursor (split result) */
	struct xfs_btree_cur	*pcur;	/* previous level's cursor */
	union xfs_btree_key	bkey;	/* key of block to insert */
	union xfs_btree_key	*key;
	union xfs_btree_rec	rec;	/* record to insert */

	level = 0;
	ncur = NULL;
	pcur = cur;
	key = &bkey;

	xfs_btree_set_ptr_null(cur, &nptr);

	/* Make a key out of the record data to be inserted, and save it. */
	cur->bc_ops->init_rec_from_cur(cur, &rec);
	cur->bc_ops->init_key_from_rec(key, &rec);

	/*
	 * Loop going up the tree, starting at the leaf level.
	 * Stop when we don't get a split block, that must mean that
	 * the insert is finished with this level.
	 */
	do {
		/*
		 * Insert nrec/nptr into this level of the tree.
		 * Note if we fail, nptr will be null.
		 */
		error = xfs_btree_insrec(pcur, level, &nptr, &rec, key,
				&ncur, &i);
		if (error) {
			if (pcur != cur)
				xfs_btree_del_cursor(pcur, XFS_BTREE_ERROR);
			goto error0;
		}

		/* Anything but "inserted" here means on-disk corruption. */
		if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
			xfs_btree_mark_sick(cur);
			error = -EFSCORRUPTED;
			goto error0;
		}
		level++;

		/*
		 * See if the cursor we just used is trash.
		 * Can't trash the caller's cursor, but otherwise we should
		 * if ncur is a new cursor or we're about to be done.
		 */
		if (pcur != cur &&
		    (ncur || xfs_btree_ptr_is_null(cur, &nptr))) {
			/* Save the state from the cursor before we trash it */
			if (cur->bc_ops->update_cursor &&
			    !(cur->bc_flags & XFS_BTREE_STAGING))
				cur->bc_ops->update_cursor(pcur, cur);
			cur->bc_nlevels = pcur->bc_nlevels;
			xfs_btree_del_cursor(pcur, XFS_BTREE_NOERROR);
		}
		/* If we got a new cursor, switch to it. */
		if (ncur) {
			pcur = ncur;
			ncur = NULL;
		}
	} while (!xfs_btree_ptr_is_null(cur, &nptr));

	*stat = i;
	return 0;
error0:
	return error;
}
3732
/*
 * Try to merge a non-leaf block back into the inode root.
 *
 * Note: the killroot names comes from the fact that we're effectively
 * killing the old root block.  But because we can't just delete the
 * inode we have to copy the single block it was pointing to into the
 * inode.
 *
 * This only happens when the root has exactly one child; the child's
 * keys and pointers are copied into the inode root, the child block is
 * freed, and the tree loses a level.  Returns 0 (silently doing nothing)
 * when the merge isn't possible.
 */
STATIC int
xfs_btree_kill_iroot(
	struct xfs_btree_cur	*cur)
{
	int			whichfork = cur->bc_ino.whichfork;
	struct xfs_inode	*ip = cur->bc_ino.ip;
	struct xfs_ifork	*ifp = xfs_ifork_ptr(ip, whichfork);
	struct xfs_btree_block	*block;
	struct xfs_btree_block	*cblock;
	union xfs_btree_key	*kp;
	union xfs_btree_key	*ckp;
	union xfs_btree_ptr	*pp;
	union xfs_btree_ptr	*cpp;
	struct xfs_buf		*cbp;
	int			level;
	int			index;
	int			numrecs;
	int			error;
#ifdef DEBUG
	union xfs_btree_ptr	ptr;
#endif
	int			i;

	ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
	ASSERT(cur->bc_nlevels > 1);

	/*
	 * Don't deal with the root block needs to be a leaf case.
	 * We're just going to turn the thing back into extents anyway.
	 */
	level = cur->bc_nlevels - 1;
	if (level == 1)
		goto out0;

	/*
	 * Give up if the root has multiple children.
	 */
	block = xfs_btree_get_iroot(cur);
	if (xfs_btree_get_numrecs(block) != 1)
		goto out0;

	cblock = xfs_btree_get_block(cur, level - 1, &cbp);
	numrecs = xfs_btree_get_numrecs(cblock);

	/*
	 * Only do this if the next level will fit.
	 * Then the data must be copied up to the inode,
	 * instead of freeing the root you free the next level.
	 */
	if (numrecs > cur->bc_ops->get_dmaxrecs(cur, level))
		goto out0;

	XFS_BTREE_STATS_INC(cur, killroot);

#ifdef DEBUG
	/* An inode btree root can never have siblings. */
	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_LEFTSIB);
	ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
	xfs_btree_get_sibling(cur, block, &ptr, XFS_BB_RIGHTSIB);
	ASSERT(xfs_btree_ptr_is_null(cur, &ptr));
#endif

	/*
	 * Resize the inode root to hold all of the child's records.  The
	 * realloc can move the broot buffer, so reload @block afterwards.
	 */
	index = numrecs - cur->bc_ops->get_maxrecs(cur, level);
	if (index) {
		xfs_iroot_realloc(cur->bc_ino.ip, index,
				  cur->bc_ino.whichfork);
		block = ifp->if_broot;
	}

	/* Both counts are big-endian, so direct comparison is fine. */
	be16_add_cpu(&block->bb_numrecs, index);
	ASSERT(block->bb_numrecs == cblock->bb_numrecs);

	/* Copy the child's keys up into the inode root. */
	kp = xfs_btree_key_addr(cur, 1, block);
	ckp = xfs_btree_key_addr(cur, 1, cblock);
	xfs_btree_copy_keys(cur, kp, ckp, numrecs);

	pp = xfs_btree_ptr_addr(cur, 1, block);
	cpp = xfs_btree_ptr_addr(cur, 1, cblock);

	/* Sanity-check every child pointer before committing to the copy. */
	for (i = 0; i < numrecs; i++) {
		error = xfs_btree_debug_check_ptr(cur, cpp, i, level - 1);
		if (error)
			return error;
	}

	xfs_btree_copy_ptrs(cur, pp, cpp, numrecs);

	/* The child block is now redundant; free it. */
	error = xfs_btree_free_block(cur, cbp);
	if (error)
		return error;

	cur->bc_levels[level - 1].bp = NULL;
	be16_add_cpu(&block->bb_level, -1);
	xfs_trans_log_inode(cur->bc_tp, ip,
		XFS_ILOG_CORE | xfs_ilog_fbroot(cur->bc_ino.whichfork));
	cur->bc_nlevels--;
out0:
	return 0;
}
3839
3840 /*
3841  * Kill the current root node, and replace it with it's only child node.
3842  */
3843 STATIC int
3844 xfs_btree_kill_root(
3845         struct xfs_btree_cur    *cur,
3846         struct xfs_buf          *bp,
3847         int                     level,
3848         union xfs_btree_ptr     *newroot)
3849 {
3850         int                     error;
3851
3852         XFS_BTREE_STATS_INC(cur, killroot);
3853
3854         /*
3855          * Update the root pointer, decreasing the level by 1 and then
3856          * free the old root.
3857          */
3858         xfs_btree_set_root(cur, newroot, -1);
3859
3860         error = xfs_btree_free_block(cur, bp);
3861         if (error)
3862                 return error;
3863
3864         cur->bc_levels[level].bp = NULL;
3865         cur->bc_levels[level].ra = 0;
3866         cur->bc_nlevels--;
3867
3868         return 0;
3869 }
3870
3871 STATIC int
3872 xfs_btree_dec_cursor(
3873         struct xfs_btree_cur    *cur,
3874         int                     level,
3875         int                     *stat)
3876 {
3877         int                     error;
3878         int                     i;
3879
3880         if (level > 0) {
3881                 error = xfs_btree_decrement(cur, level, &i);
3882                 if (error)
3883                         return error;
3884         }
3885
3886         *stat = 1;
3887         return 0;
3888 }
3889
3890 /*
3891  * Single level of the btree record deletion routine.
3892  * Delete record pointed to by cur/level.
3893  * Remove the record from its block then rebalance the tree.
3894  * Return 0 for error, 1 for done, 2 to go on to the next level.
3895  */
3896 STATIC int                                      /* error */
3897 xfs_btree_delrec(
3898         struct xfs_btree_cur    *cur,           /* btree cursor */
3899         int                     level,          /* level removing record from */
3900         int                     *stat)          /* fail/done/go-on */
3901 {
3902         struct xfs_btree_block  *block;         /* btree block */
3903         union xfs_btree_ptr     cptr;           /* current block ptr */
3904         struct xfs_buf          *bp;            /* buffer for block */
3905         int                     error;          /* error return value */
3906         int                     i;              /* loop counter */
3907         union xfs_btree_ptr     lptr;           /* left sibling block ptr */
3908         struct xfs_buf          *lbp;           /* left buffer pointer */
3909         struct xfs_btree_block  *left;          /* left btree block */
3910         int                     lrecs = 0;      /* left record count */
3911         int                     ptr;            /* key/record index */
3912         union xfs_btree_ptr     rptr;           /* right sibling block ptr */
3913         struct xfs_buf          *rbp;           /* right buffer pointer */
3914         struct xfs_btree_block  *right;         /* right btree block */
3915         struct xfs_btree_block  *rrblock;       /* right-right btree block */
3916         struct xfs_buf          *rrbp;          /* right-right buffer pointer */
3917         int                     rrecs = 0;      /* right record count */
3918         struct xfs_btree_cur    *tcur;          /* temporary btree cursor */
3919         int                     numrecs;        /* temporary numrec count */
3920
3921         tcur = NULL;
3922
3923         /* Get the index of the entry being deleted, check for nothing there. */
3924         ptr = cur->bc_levels[level].ptr;
3925         if (ptr == 0) {
3926                 *stat = 0;
3927                 return 0;
3928         }
3929
3930         /* Get the buffer & block containing the record or key/ptr. */
3931         block = xfs_btree_get_block(cur, level, &bp);
3932         numrecs = xfs_btree_get_numrecs(block);
3933
3934 #ifdef DEBUG
3935         error = xfs_btree_check_block(cur, block, level, bp);
3936         if (error)
3937                 goto error0;
3938 #endif
3939
3940         /* Fail if we're off the end of the block. */
3941         if (ptr > numrecs) {
3942                 *stat = 0;
3943                 return 0;
3944         }
3945
3946         XFS_BTREE_STATS_INC(cur, delrec);
3947         XFS_BTREE_STATS_ADD(cur, moves, numrecs - ptr);
3948
3949         /* Excise the entries being deleted. */
3950         if (level > 0) {
3951                 /* It's a nonleaf. operate on keys and ptrs */
3952                 union xfs_btree_key     *lkp;
3953                 union xfs_btree_ptr     *lpp;
3954
3955                 lkp = xfs_btree_key_addr(cur, ptr + 1, block);
3956                 lpp = xfs_btree_ptr_addr(cur, ptr + 1, block);
3957
3958                 for (i = 0; i < numrecs - ptr; i++) {
3959                         error = xfs_btree_debug_check_ptr(cur, lpp, i, level);
3960                         if (error)
3961                                 goto error0;
3962                 }
3963
3964                 if (ptr < numrecs) {
3965                         xfs_btree_shift_keys(cur, lkp, -1, numrecs - ptr);
3966                         xfs_btree_shift_ptrs(cur, lpp, -1, numrecs - ptr);
3967                         xfs_btree_log_keys(cur, bp, ptr, numrecs - 1);
3968                         xfs_btree_log_ptrs(cur, bp, ptr, numrecs - 1);
3969                 }
3970         } else {
3971                 /* It's a leaf. operate on records */
3972                 if (ptr < numrecs) {
3973                         xfs_btree_shift_recs(cur,
3974                                 xfs_btree_rec_addr(cur, ptr + 1, block),
3975                                 -1, numrecs - ptr);
3976                         xfs_btree_log_recs(cur, bp, ptr, numrecs - 1);
3977                 }
3978         }
3979
3980         /*
3981          * Decrement and log the number of entries in the block.
3982          */
3983         xfs_btree_set_numrecs(block, --numrecs);
3984         xfs_btree_log_block(cur, bp, XFS_BB_NUMRECS);
3985
3986         /*
3987          * If we are tracking the last record in the tree and
3988          * we are at the far right edge of the tree, update it.
3989          */
3990         if (xfs_btree_is_lastrec(cur, block, level)) {
3991                 cur->bc_ops->update_lastrec(cur, block, NULL,
3992                                             ptr, LASTREC_DELREC);
3993         }
3994
3995         /*
3996          * We're at the root level.  First, shrink the root block in-memory.
3997          * Try to get rid of the next level down.  If we can't then there's
3998          * nothing left to do.
3999          */
4000         if (xfs_btree_at_iroot(cur, level)) {
4001                 xfs_iroot_realloc(cur->bc_ino.ip, -1, cur->bc_ino.whichfork);
4002
4003                 error = xfs_btree_kill_iroot(cur);
4004                 if (error)
4005                         goto error0;
4006
4007                 error = xfs_btree_dec_cursor(cur, level, stat);
4008                 if (error)
4009                         goto error0;
4010                 *stat = 1;
4011                 return 0;
4012         }
4013
4014         /*
4015          * If this is the root level, and there's only one entry left, and it's
4016          * NOT the leaf level, then we can get rid of this level.
4017          */
4018         if (level == cur->bc_nlevels - 1) {
4019                 if (numrecs == 1 && level > 0) {
4020                         union xfs_btree_ptr     *pp;
4021                         /*
4022                          * pp is still set to the first pointer in the block.
4023                          * Make it the new root of the btree.
4024                          */
4025                         pp = xfs_btree_ptr_addr(cur, 1, block);
4026                         error = xfs_btree_kill_root(cur, bp, level, pp);
4027                         if (error)
4028                                 goto error0;
4029                 } else if (level > 0) {
4030                         error = xfs_btree_dec_cursor(cur, level, stat);
4031                         if (error)
4032                                 goto error0;
4033                 }
4034                 *stat = 1;
4035                 return 0;
4036         }
4037
4038         /*
4039          * If we deleted the leftmost entry in the block, update the
4040          * key values above us in the tree.
4041          */
4042         if (xfs_btree_needs_key_update(cur, ptr)) {
4043                 error = xfs_btree_update_keys(cur, level);
4044                 if (error)
4045                         goto error0;
4046         }
4047
4048         /*
4049          * If the number of records remaining in the block is at least
4050          * the minimum, we're done.
4051          */
4052         if (numrecs >= cur->bc_ops->get_minrecs(cur, level)) {
4053                 error = xfs_btree_dec_cursor(cur, level, stat);
4054                 if (error)
4055                         goto error0;
4056                 return 0;
4057         }
4058
4059         /*
4060          * Otherwise, we have to move some records around to keep the
4061          * tree balanced.  Look at the left and right sibling blocks to
4062          * see if we can re-balance by moving only one record.
4063          */
4064         xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
4065         xfs_btree_get_sibling(cur, block, &lptr, XFS_BB_LEFTSIB);
4066
4067         if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE) {
4068                 /*
4069                  * One child of root, need to get a chance to copy its contents
4070                  * into the root and delete it. Can't go up to next level,
4071                  * there's nothing to delete there.
4072                  */
4073                 if (xfs_btree_ptr_is_null(cur, &rptr) &&
4074                     xfs_btree_ptr_is_null(cur, &lptr) &&
4075                     level == cur->bc_nlevels - 2) {
4076                         error = xfs_btree_kill_iroot(cur);
4077                         if (!error)
4078                                 error = xfs_btree_dec_cursor(cur, level, stat);
4079                         if (error)
4080                                 goto error0;
4081                         return 0;
4082                 }
4083         }
4084
4085         ASSERT(!xfs_btree_ptr_is_null(cur, &rptr) ||
4086                !xfs_btree_ptr_is_null(cur, &lptr));
4087
4088         /*
4089          * Duplicate the cursor so our btree manipulations here won't
4090          * disrupt the next level up.
4091          */
4092         error = xfs_btree_dup_cursor(cur, &tcur);
4093         if (error)
4094                 goto error0;
4095
4096         /*
4097          * If there's a right sibling, see if it's ok to shift an entry
4098          * out of it.
4099          */
4100         if (!xfs_btree_ptr_is_null(cur, &rptr)) {
4101                 /*
4102                  * Move the temp cursor to the last entry in the next block.
4103                  * Actually any entry but the first would suffice.
4104                  */
4105                 i = xfs_btree_lastrec(tcur, level);
4106                 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
4107                         xfs_btree_mark_sick(cur);
4108                         error = -EFSCORRUPTED;
4109                         goto error0;
4110                 }
4111
4112                 error = xfs_btree_increment(tcur, level, &i);
4113                 if (error)
4114                         goto error0;
4115                 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
4116                         xfs_btree_mark_sick(cur);
4117                         error = -EFSCORRUPTED;
4118                         goto error0;
4119                 }
4120
4121                 i = xfs_btree_lastrec(tcur, level);
4122                 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
4123                         xfs_btree_mark_sick(cur);
4124                         error = -EFSCORRUPTED;
4125                         goto error0;
4126                 }
4127
4128                 /* Grab a pointer to the block. */
4129                 right = xfs_btree_get_block(tcur, level, &rbp);
4130 #ifdef DEBUG
4131                 error = xfs_btree_check_block(tcur, right, level, rbp);
4132                 if (error)
4133                         goto error0;
4134 #endif
4135                 /* Grab the current block number, for future use. */
4136                 xfs_btree_get_sibling(tcur, right, &cptr, XFS_BB_LEFTSIB);
4137
4138                 /*
4139                  * If right block is full enough so that removing one entry
4140                  * won't make it too empty, and left-shifting an entry out
4141                  * of right to us works, we're done.
4142                  */
4143                 if (xfs_btree_get_numrecs(right) - 1 >=
4144                     cur->bc_ops->get_minrecs(tcur, level)) {
4145                         error = xfs_btree_lshift(tcur, level, &i);
4146                         if (error)
4147                                 goto error0;
4148                         if (i) {
4149                                 ASSERT(xfs_btree_get_numrecs(block) >=
4150                                        cur->bc_ops->get_minrecs(tcur, level));
4151
4152                                 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
4153                                 tcur = NULL;
4154
4155                                 error = xfs_btree_dec_cursor(cur, level, stat);
4156                                 if (error)
4157                                         goto error0;
4158                                 return 0;
4159                         }
4160                 }
4161
4162                 /*
4163                  * Otherwise, grab the number of records in right for
4164                  * future reference, and fix up the temp cursor to point
4165                  * to our block again (last record).
4166                  */
4167                 rrecs = xfs_btree_get_numrecs(right);
4168                 if (!xfs_btree_ptr_is_null(cur, &lptr)) {
4169                         i = xfs_btree_firstrec(tcur, level);
4170                         if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
4171                                 xfs_btree_mark_sick(cur);
4172                                 error = -EFSCORRUPTED;
4173                                 goto error0;
4174                         }
4175
4176                         error = xfs_btree_decrement(tcur, level, &i);
4177                         if (error)
4178                                 goto error0;
4179                         if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
4180                                 xfs_btree_mark_sick(cur);
4181                                 error = -EFSCORRUPTED;
4182                                 goto error0;
4183                         }
4184                 }
4185         }
4186
4187         /*
4188          * If there's a left sibling, see if it's ok to shift an entry
4189          * out of it.
4190          */
4191         if (!xfs_btree_ptr_is_null(cur, &lptr)) {
4192                 /*
4193                  * Move the temp cursor to the first entry in the
4194                  * previous block.
4195                  */
4196                 i = xfs_btree_firstrec(tcur, level);
4197                 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
4198                         xfs_btree_mark_sick(cur);
4199                         error = -EFSCORRUPTED;
4200                         goto error0;
4201                 }
4202
4203                 error = xfs_btree_decrement(tcur, level, &i);
4204                 if (error)
4205                         goto error0;
4206                 i = xfs_btree_firstrec(tcur, level);
4207                 if (XFS_IS_CORRUPT(cur->bc_mp, i != 1)) {
4208                         xfs_btree_mark_sick(cur);
4209                         error = -EFSCORRUPTED;
4210                         goto error0;
4211                 }
4212
4213                 /* Grab a pointer to the block. */
4214                 left = xfs_btree_get_block(tcur, level, &lbp);
4215 #ifdef DEBUG
4216                 error = xfs_btree_check_block(cur, left, level, lbp);
4217                 if (error)
4218                         goto error0;
4219 #endif
4220                 /* Grab the current block number, for future use. */
4221                 xfs_btree_get_sibling(tcur, left, &cptr, XFS_BB_RIGHTSIB);
4222
4223                 /*
4224                  * If left block is full enough so that removing one entry
4225                  * won't make it too empty, and right-shifting an entry out
4226                  * of left to us works, we're done.
4227                  */
4228                 if (xfs_btree_get_numrecs(left) - 1 >=
4229                     cur->bc_ops->get_minrecs(tcur, level)) {
4230                         error = xfs_btree_rshift(tcur, level, &i);
4231                         if (error)
4232                                 goto error0;
4233                         if (i) {
4234                                 ASSERT(xfs_btree_get_numrecs(block) >=
4235                                        cur->bc_ops->get_minrecs(tcur, level));
4236                                 xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
4237                                 tcur = NULL;
4238                                 if (level == 0)
4239                                         cur->bc_levels[0].ptr++;
4240
4241                                 *stat = 1;
4242                                 return 0;
4243                         }
4244                 }
4245
4246                 /*
4247                  * Otherwise, grab the number of records in right for
4248                  * future reference.
4249                  */
4250                 lrecs = xfs_btree_get_numrecs(left);
4251         }
4252
4253         /* Delete the temp cursor, we're done with it. */
4254         xfs_btree_del_cursor(tcur, XFS_BTREE_NOERROR);
4255         tcur = NULL;
4256
4257         /* If here, we need to do a join to keep the tree balanced. */
4258         ASSERT(!xfs_btree_ptr_is_null(cur, &cptr));
4259
4260         if (!xfs_btree_ptr_is_null(cur, &lptr) &&
4261             lrecs + xfs_btree_get_numrecs(block) <=
4262                         cur->bc_ops->get_maxrecs(cur, level)) {
4263                 /*
4264                  * Set "right" to be the starting block,
4265                  * "left" to be the left neighbor.
4266                  */
4267                 rptr = cptr;
4268                 right = block;
4269                 rbp = bp;
4270                 error = xfs_btree_read_buf_block(cur, &lptr, 0, &left, &lbp);
4271                 if (error)
4272                         goto error0;
4273
4274         /*
4275          * If that won't work, see if we can join with the right neighbor block.
4276          */
4277         } else if (!xfs_btree_ptr_is_null(cur, &rptr) &&
4278                    rrecs + xfs_btree_get_numrecs(block) <=
4279                         cur->bc_ops->get_maxrecs(cur, level)) {
4280                 /*
4281                  * Set "left" to be the starting block,
4282                  * "right" to be the right neighbor.
4283                  */
4284                 lptr = cptr;
4285                 left = block;
4286                 lbp = bp;
4287                 error = xfs_btree_read_buf_block(cur, &rptr, 0, &right, &rbp);
4288                 if (error)
4289                         goto error0;
4290
4291         /*
4292          * Otherwise, we can't fix the imbalance.
4293          * Just return.  This is probably a logic error, but it's not fatal.
4294          */
4295         } else {
4296                 error = xfs_btree_dec_cursor(cur, level, stat);
4297                 if (error)
4298                         goto error0;
4299                 return 0;
4300         }
4301
4302         rrecs = xfs_btree_get_numrecs(right);
4303         lrecs = xfs_btree_get_numrecs(left);
4304
4305         /*
4306          * We're now going to join "left" and "right" by moving all the stuff
4307          * in "right" to "left" and deleting "right".
4308          */
4309         XFS_BTREE_STATS_ADD(cur, moves, rrecs);
4310         if (level > 0) {
4311                 /* It's a non-leaf.  Move keys and pointers. */
4312                 union xfs_btree_key     *lkp;   /* left btree key */
4313                 union xfs_btree_ptr     *lpp;   /* left address pointer */
4314                 union xfs_btree_key     *rkp;   /* right btree key */
4315                 union xfs_btree_ptr     *rpp;   /* right address pointer */
4316
4317                 lkp = xfs_btree_key_addr(cur, lrecs + 1, left);
4318                 lpp = xfs_btree_ptr_addr(cur, lrecs + 1, left);
4319                 rkp = xfs_btree_key_addr(cur, 1, right);
4320                 rpp = xfs_btree_ptr_addr(cur, 1, right);
4321
4322                 for (i = 1; i < rrecs; i++) {
4323                         error = xfs_btree_debug_check_ptr(cur, rpp, i, level);
4324                         if (error)
4325                                 goto error0;
4326                 }
4327
4328                 xfs_btree_copy_keys(cur, lkp, rkp, rrecs);
4329                 xfs_btree_copy_ptrs(cur, lpp, rpp, rrecs);
4330
4331                 xfs_btree_log_keys(cur, lbp, lrecs + 1, lrecs + rrecs);
4332                 xfs_btree_log_ptrs(cur, lbp, lrecs + 1, lrecs + rrecs);
4333         } else {
4334                 /* It's a leaf.  Move records.  */
4335                 union xfs_btree_rec     *lrp;   /* left record pointer */
4336                 union xfs_btree_rec     *rrp;   /* right record pointer */
4337
4338                 lrp = xfs_btree_rec_addr(cur, lrecs + 1, left);
4339                 rrp = xfs_btree_rec_addr(cur, 1, right);
4340
4341                 xfs_btree_copy_recs(cur, lrp, rrp, rrecs);
4342                 xfs_btree_log_recs(cur, lbp, lrecs + 1, lrecs + rrecs);
4343         }
4344
4345         XFS_BTREE_STATS_INC(cur, join);
4346
4347         /*
4348          * Fix up the number of records and right block pointer in the
4349          * surviving block, and log it.
4350          */
4351         xfs_btree_set_numrecs(left, lrecs + rrecs);
4352         xfs_btree_get_sibling(cur, right, &cptr, XFS_BB_RIGHTSIB);
4353         xfs_btree_set_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
4354         xfs_btree_log_block(cur, lbp, XFS_BB_NUMRECS | XFS_BB_RIGHTSIB);
4355
4356         /* If there is a right sibling, point it to the remaining block. */
4357         xfs_btree_get_sibling(cur, left, &cptr, XFS_BB_RIGHTSIB);
4358         if (!xfs_btree_ptr_is_null(cur, &cptr)) {
4359                 error = xfs_btree_read_buf_block(cur, &cptr, 0, &rrblock, &rrbp);
4360                 if (error)
4361                         goto error0;
4362                 xfs_btree_set_sibling(cur, rrblock, &lptr, XFS_BB_LEFTSIB);
4363                 xfs_btree_log_block(cur, rrbp, XFS_BB_LEFTSIB);
4364         }
4365
4366         /* Free the deleted block. */
4367         error = xfs_btree_free_block(cur, rbp);
4368         if (error)
4369                 goto error0;
4370
4371         /*
4372          * If we joined with the left neighbor, set the buffer in the
4373          * cursor to the left block, and fix up the index.
4374          */
4375         if (bp != lbp) {
4376                 cur->bc_levels[level].bp = lbp;
4377                 cur->bc_levels[level].ptr += lrecs;
4378                 cur->bc_levels[level].ra = 0;
4379         }
4380         /*
4381          * If we joined with the right neighbor and there's a level above
4382          * us, increment the cursor at that level.
4383          */
4384         else if (cur->bc_ops->type == XFS_BTREE_TYPE_INODE ||
4385                  level + 1 < cur->bc_nlevels) {
4386                 error = xfs_btree_increment(cur, level + 1, &i);
4387                 if (error)
4388                         goto error0;
4389         }
4390
4391         /*
4392          * Readjust the ptr at this level if it's not a leaf, since it's
4393          * still pointing at the deletion point, which makes the cursor
4394          * inconsistent.  If this makes the ptr 0, the caller fixes it up.
4395          * We can't use decrement because it would change the next level up.
4396          */
4397         if (level > 0)
4398                 cur->bc_levels[level].ptr--;
4399
4400         /*
4401          * We combined blocks, so we have to update the parent keys if the
4402          * btree supports overlapped intervals.  However,
4403          * bc_levels[level + 1].ptr points to the old block so that the caller
4404          * knows which record to delete.  Therefore, the caller must be savvy
4405          * enough to call updkeys for us if we return stat == 2.  The other
4406          * exit points from this function don't require deletions further up
4407          * the tree, so they can call updkeys directly.
4408          */
4409
4410         /* Return value means the next level up has something to do. */
4411         *stat = 2;
4412         return 0;
4413
4414 error0:
4415         if (tcur)
4416                 xfs_btree_del_cursor(tcur, XFS_BTREE_ERROR);
4417         return error;
4418 }
4419
4420 /*
4421  * Delete the record pointed to by cur.
4422  * The cursor refers to the place where the record was (could be inserted)
4423  * when the operation returns.
4424  */
4425 int                                     /* error */
4426 xfs_btree_delete(
4427         struct xfs_btree_cur    *cur,
4428         int                     *stat)  /* success/failure */
4429 {
4430         int                     error;  /* error return value */
4431         int                     level;
4432         int                     i;
4433         bool                    joined = false;
4434
4435         /*
4436          * Go up the tree, starting at leaf level.
4437          *
4438          * If 2 is returned then a join was done; go to the next level.
4439          * Otherwise we are done.
4440          */
4441         for (level = 0, i = 2; i == 2; level++) {
4442                 error = xfs_btree_delrec(cur, level, &i);
4443                 if (error)
4444                         goto error0;
4445                 if (i == 2)
4446                         joined = true;
4447         }
4448
4449         /*
4450          * If we combined blocks as part of deleting the record, delrec won't
4451          * have updated the parent high keys so we have to do that here.
4452          */
4453         if (joined && (cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING)) {
4454                 error = xfs_btree_updkeys_force(cur, 0);
4455                 if (error)
4456                         goto error0;
4457         }
4458
4459         if (i == 0) {
4460                 for (level = 1; level < cur->bc_nlevels; level++) {
4461                         if (cur->bc_levels[level].ptr == 0) {
4462                                 error = xfs_btree_decrement(cur, level, &i);
4463                                 if (error)
4464                                         goto error0;
4465                                 break;
4466                         }
4467                 }
4468         }
4469
4470         *stat = i;
4471         return 0;
4472 error0:
4473         return error;
4474 }
4475
4476 /*
4477  * Get the data from the pointed-to record.
4478  */
4479 int                                     /* error */
4480 xfs_btree_get_rec(
4481         struct xfs_btree_cur    *cur,   /* btree cursor */
4482         union xfs_btree_rec     **recp, /* output: btree record */
4483         int                     *stat)  /* output: success/failure */
4484 {
4485         struct xfs_btree_block  *block; /* btree block */
4486         struct xfs_buf          *bp;    /* buffer pointer */
4487         int                     ptr;    /* record number */
4488 #ifdef DEBUG
4489         int                     error;  /* error return value */
4490 #endif
4491
4492         ptr = cur->bc_levels[0].ptr;
4493         block = xfs_btree_get_block(cur, 0, &bp);
4494
4495 #ifdef DEBUG
4496         error = xfs_btree_check_block(cur, block, 0, bp);
4497         if (error)
4498                 return error;
4499 #endif
4500
4501         /*
4502          * Off the right end or left end, return failure.
4503          */
4504         if (ptr > xfs_btree_get_numrecs(block) || ptr <= 0) {
4505                 *stat = 0;
4506                 return 0;
4507         }
4508
4509         /*
4510          * Point to the record and extract its data.
4511          */
4512         *recp = xfs_btree_rec_addr(cur, ptr, block);
4513         *stat = 1;
4514         return 0;
4515 }
4516
4517 /* Visit a block in a btree. */
4518 STATIC int
4519 xfs_btree_visit_block(
4520         struct xfs_btree_cur            *cur,
4521         int                             level,
4522         xfs_btree_visit_blocks_fn       fn,
4523         void                            *data)
4524 {
4525         struct xfs_btree_block          *block;
4526         struct xfs_buf                  *bp;
4527         union xfs_btree_ptr             rptr, bufptr;
4528         int                             error;
4529
4530         /* do right sibling readahead */
4531         xfs_btree_readahead(cur, level, XFS_BTCUR_RIGHTRA);
4532         block = xfs_btree_get_block(cur, level, &bp);
4533
4534         /* process the block */
4535         error = fn(cur, level, data);
4536         if (error)
4537                 return error;
4538
4539         /* now read rh sibling block for next iteration */
4540         xfs_btree_get_sibling(cur, block, &rptr, XFS_BB_RIGHTSIB);
4541         if (xfs_btree_ptr_is_null(cur, &rptr))
4542                 return -ENOENT;
4543
4544         /*
4545          * We only visit blocks once in this walk, so we have to avoid the
4546          * internal xfs_btree_lookup_get_block() optimisation where it will
4547          * return the same block without checking if the right sibling points
4548          * back to us and creates a cyclic reference in the btree.
4549          */
4550         xfs_btree_buf_to_ptr(cur, bp, &bufptr);
4551         if (xfs_btree_ptrs_equal(cur, &rptr, &bufptr)) {
4552                 xfs_btree_mark_sick(cur);
4553                 return -EFSCORRUPTED;
4554         }
4555
4556         return xfs_btree_lookup_get_block(cur, level, &rptr, &block);
4557 }
4558
4559
4560 /* Visit every block in a btree. */
4561 int
4562 xfs_btree_visit_blocks(
4563         struct xfs_btree_cur            *cur,
4564         xfs_btree_visit_blocks_fn       fn,
4565         unsigned int                    flags,
4566         void                            *data)
4567 {
4568         union xfs_btree_ptr             lptr;
4569         int                             level;
4570         struct xfs_btree_block          *block = NULL;
4571         int                             error = 0;
4572
4573         xfs_btree_init_ptr_from_cur(cur, &lptr);
4574
4575         /* for each level */
4576         for (level = cur->bc_nlevels - 1; level >= 0; level--) {
4577                 /* grab the left hand block */
4578                 error = xfs_btree_lookup_get_block(cur, level, &lptr, &block);
4579                 if (error)
4580                         return error;
4581
4582                 /* readahead the left most block for the next level down */
4583                 if (level > 0) {
4584                         union xfs_btree_ptr     *ptr;
4585
4586                         ptr = xfs_btree_ptr_addr(cur, 1, block);
4587                         xfs_btree_readahead_ptr(cur, ptr, 1);
4588
4589                         /* save for the next iteration of the loop */
4590                         xfs_btree_copy_ptrs(cur, &lptr, ptr, 1);
4591
4592                         if (!(flags & XFS_BTREE_VISIT_LEAVES))
4593                                 continue;
4594                 } else if (!(flags & XFS_BTREE_VISIT_RECORDS)) {
4595                         continue;
4596                 }
4597
4598                 /* for each buffer in the level */
4599                 do {
4600                         error = xfs_btree_visit_block(cur, level, fn, data);
4601                 } while (!error);
4602
4603                 if (error != -ENOENT)
4604                         return error;
4605         }
4606
4607         return 0;
4608 }
4609
4610 /*
4611  * Change the owner of a btree.
4612  *
4613  * The mechanism we use here is ordered buffer logging. Because we don't know
 * how many buffers we are going to need to modify, we don't really want to
 * have to make transaction reservations for the worst case of every buffer in a
 * full size btree as that may be more space than we can fit in the log....
4617  *
4618  * We do the btree walk in the most optimal manner possible - we have sibling
4619  * pointers so we can just walk all the blocks on each level from left to right
4620  * in a single pass, and then move to the next level and do the same. We can
4621  * also do readahead on the sibling pointers to get IO moving more quickly,
4622  * though for slow disks this is unlikely to make much difference to performance
4623  * as the amount of CPU work we have to do before moving to the next block is
4624  * relatively small.
4625  *
4626  * For each btree block that we load, modify the owner appropriately, set the
4627  * buffer as an ordered buffer and log it appropriately. We need to ensure that
4628  * we mark the region we change dirty so that if the buffer is relogged in
4629  * a subsequent transaction the changes we make here as an ordered buffer are
4630  * correctly relogged in that transaction.  If we are in recovery context, then
4631  * just queue the modified buffer as delayed write buffer so the transaction
4632  * recovery completion writes the changes to disk.
4633  */
struct xfs_btree_block_change_owner_info {
	uint64_t		new_owner;	/* owner value to stamp into each block */
	struct list_head	*buffer_list;	/* delwri queue used when there's no transaction */
};
4638
4639 static int
4640 xfs_btree_block_change_owner(
4641         struct xfs_btree_cur    *cur,
4642         int                     level,
4643         void                    *data)
4644 {
4645         struct xfs_btree_block_change_owner_info        *bbcoi = data;
4646         struct xfs_btree_block  *block;
4647         struct xfs_buf          *bp;
4648
4649         /* modify the owner */
4650         block = xfs_btree_get_block(cur, level, &bp);
4651         if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN) {
4652                 if (block->bb_u.l.bb_owner == cpu_to_be64(bbcoi->new_owner))
4653                         return 0;
4654                 block->bb_u.l.bb_owner = cpu_to_be64(bbcoi->new_owner);
4655         } else {
4656                 if (block->bb_u.s.bb_owner == cpu_to_be32(bbcoi->new_owner))
4657                         return 0;
4658                 block->bb_u.s.bb_owner = cpu_to_be32(bbcoi->new_owner);
4659         }
4660
4661         /*
4662          * If the block is a root block hosted in an inode, we might not have a
4663          * buffer pointer here and we shouldn't attempt to log the change as the
4664          * information is already held in the inode and discarded when the root
4665          * block is formatted into the on-disk inode fork. We still change it,
4666          * though, so everything is consistent in memory.
4667          */
4668         if (!bp) {
4669                 ASSERT(cur->bc_ops->type == XFS_BTREE_TYPE_INODE);
4670                 ASSERT(level == cur->bc_nlevels - 1);
4671                 return 0;
4672         }
4673
4674         if (cur->bc_tp) {
4675                 if (!xfs_trans_ordered_buf(cur->bc_tp, bp)) {
4676                         xfs_btree_log_block(cur, bp, XFS_BB_OWNER);
4677                         return -EAGAIN;
4678                 }
4679         } else {
4680                 xfs_buf_delwri_queue(bp, bbcoi->buffer_list);
4681         }
4682
4683         return 0;
4684 }
4685
4686 int
4687 xfs_btree_change_owner(
4688         struct xfs_btree_cur    *cur,
4689         uint64_t                new_owner,
4690         struct list_head        *buffer_list)
4691 {
4692         struct xfs_btree_block_change_owner_info        bbcoi;
4693
4694         bbcoi.new_owner = new_owner;
4695         bbcoi.buffer_list = buffer_list;
4696
4697         return xfs_btree_visit_blocks(cur, xfs_btree_block_change_owner,
4698                         XFS_BTREE_VISIT_ALL, &bbcoi);
4699 }
4700
/*
 * Verify the v5 header fields of a long-format btree block.  Returns the
 * address of the first failing check, or NULL if the header is good.
 */
xfs_failaddr_t
xfs_btree_fsblock_v5hdr_verify(
	struct xfs_buf		*bp,
	uint64_t		owner)
{
	struct xfs_mount	*mp = bp->b_mount;
	struct xfs_btree_block	*block = XFS_BUF_TO_BLOCK(bp);

	/* v5 headers only exist on CRC-enabled filesystems. */
	if (!xfs_has_crc(mp))
		return __this_address;
	/* The metadata UUID must match this filesystem. */
	if (!uuid_equal(&block->bb_u.l.bb_uuid, &mp->m_sb.sb_meta_uuid))
		return __this_address;
	/* The block must point back at its own disk address. */
	if (block->bb_u.l.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
		return __this_address;
	/* The owner is only checked when the caller knows what it should be. */
	if (owner != XFS_RMAP_OWN_UNKNOWN &&
	    be64_to_cpu(block->bb_u.l.bb_owner) != owner)
		return __this_address;
	return NULL;
}
4721
4722 /* Verify a long-format btree block. */
4723 xfs_failaddr_t
4724 xfs_btree_fsblock_verify(
4725         struct xfs_buf          *bp,
4726         unsigned int            max_recs)
4727 {
4728         struct xfs_mount        *mp = bp->b_mount;
4729         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4730         xfs_fsblock_t           fsb;
4731         xfs_failaddr_t          fa;
4732
4733         ASSERT(!xfs_buftarg_is_mem(bp->b_target));
4734
4735         /* numrecs verification */
4736         if (be16_to_cpu(block->bb_numrecs) > max_recs)
4737                 return __this_address;
4738
4739         /* sibling pointer verification */
4740         fsb = XFS_DADDR_TO_FSB(mp, xfs_buf_daddr(bp));
4741         fa = xfs_btree_check_fsblock_siblings(mp, fsb,
4742                         block->bb_u.l.bb_leftsib);
4743         if (!fa)
4744                 fa = xfs_btree_check_fsblock_siblings(mp, fsb,
4745                                 block->bb_u.l.bb_rightsib);
4746         return fa;
4747 }
4748
4749 /* Verify an in-memory btree block. */
4750 xfs_failaddr_t
4751 xfs_btree_memblock_verify(
4752         struct xfs_buf          *bp,
4753         unsigned int            max_recs)
4754 {
4755         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4756         struct xfs_buftarg      *btp = bp->b_target;
4757         xfs_failaddr_t          fa;
4758         xfbno_t                 bno;
4759
4760         ASSERT(xfs_buftarg_is_mem(bp->b_target));
4761
4762         /* numrecs verification */
4763         if (be16_to_cpu(block->bb_numrecs) > max_recs)
4764                 return __this_address;
4765
4766         /* sibling pointer verification */
4767         bno = xfs_daddr_to_xfbno(xfs_buf_daddr(bp));
4768         fa = xfs_btree_check_memblock_siblings(btp, bno,
4769                         block->bb_u.l.bb_leftsib);
4770         if (fa)
4771                 return fa;
4772         fa = xfs_btree_check_memblock_siblings(btp, bno,
4773                         block->bb_u.l.bb_rightsib);
4774         if (fa)
4775                 return fa;
4776
4777         return NULL;
4778 }
4779 /**
4780  * xfs_btree_agblock_v5hdr_verify() -- verify the v5 fields of a short-format
4781  *                                    btree block
4782  *
4783  * @bp: buffer containing the btree block
4784  */
4785 xfs_failaddr_t
4786 xfs_btree_agblock_v5hdr_verify(
4787         struct xfs_buf          *bp)
4788 {
4789         struct xfs_mount        *mp = bp->b_mount;
4790         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4791         struct xfs_perag        *pag = bp->b_pag;
4792
4793         if (!xfs_has_crc(mp))
4794                 return __this_address;
4795         if (!uuid_equal(&block->bb_u.s.bb_uuid, &mp->m_sb.sb_meta_uuid))
4796                 return __this_address;
4797         if (block->bb_u.s.bb_blkno != cpu_to_be64(xfs_buf_daddr(bp)))
4798                 return __this_address;
4799         if (pag && be32_to_cpu(block->bb_u.s.bb_owner) != pag->pag_agno)
4800                 return __this_address;
4801         return NULL;
4802 }
4803
4804 /**
4805  * xfs_btree_agblock_verify() -- verify a short-format btree block
4806  *
4807  * @bp: buffer containing the btree block
4808  * @max_recs: maximum records allowed in this btree node
4809  */
4810 xfs_failaddr_t
4811 xfs_btree_agblock_verify(
4812         struct xfs_buf          *bp,
4813         unsigned int            max_recs)
4814 {
4815         struct xfs_mount        *mp = bp->b_mount;
4816         struct xfs_btree_block  *block = XFS_BUF_TO_BLOCK(bp);
4817         xfs_agblock_t           agbno;
4818         xfs_failaddr_t          fa;
4819
4820         ASSERT(!xfs_buftarg_is_mem(bp->b_target));
4821
4822         /* numrecs verification */
4823         if (be16_to_cpu(block->bb_numrecs) > max_recs)
4824                 return __this_address;
4825
4826         /* sibling pointer verification */
4827         agbno = xfs_daddr_to_agbno(mp, xfs_buf_daddr(bp));
4828         fa = xfs_btree_check_agblock_siblings(bp->b_pag, agbno,
4829                         block->bb_u.s.bb_leftsib);
4830         if (!fa)
4831                 fa = xfs_btree_check_agblock_siblings(bp->b_pag, agbno,
4832                                 block->bb_u.s.bb_rightsib);
4833         return fa;
4834 }
4835
4836 /*
4837  * For the given limits on leaf and keyptr records per block, calculate the
4838  * height of the tree needed to index the number of leaf records.
4839  */
4840 unsigned int
4841 xfs_btree_compute_maxlevels(
4842         const unsigned int      *limits,
4843         unsigned long long      records)
4844 {
4845         unsigned long long      level_blocks = howmany_64(records, limits[0]);
4846         unsigned int            height = 1;
4847
4848         while (level_blocks > 1) {
4849                 level_blocks = howmany_64(level_blocks, limits[1]);
4850                 height++;
4851         }
4852
4853         return height;
4854 }
4855
4856 /*
4857  * For the given limits on leaf and keyptr records per block, calculate the
4858  * number of blocks needed to index the given number of leaf records.
4859  */
4860 unsigned long long
4861 xfs_btree_calc_size(
4862         const unsigned int      *limits,
4863         unsigned long long      records)
4864 {
4865         unsigned long long      level_blocks = howmany_64(records, limits[0]);
4866         unsigned long long      blocks = level_blocks;
4867
4868         while (level_blocks > 1) {
4869                 level_blocks = howmany_64(level_blocks, limits[1]);
4870                 blocks += level_blocks;
4871         }
4872
4873         return blocks;
4874 }
4875
4876 /*
4877  * Given a number of available blocks for the btree to consume with records and
4878  * pointers, calculate the height of the tree needed to index all the records
4879  * that space can hold based on the number of pointers each interior node
4880  * holds.
4881  *
4882  * We start by assuming a single level tree consumes a single block, then track
4883  * the number of blocks each node level consumes until we no longer have space
4884  * to store the next node level. At this point, we are indexing all the leaf
4885  * blocks in the space, and there's no more free space to split the tree any
4886  * further. That's our maximum btree height.
4887  */
4888 unsigned int
4889 xfs_btree_space_to_height(
4890         const unsigned int      *limits,
4891         unsigned long long      leaf_blocks)
4892 {
4893         /*
4894          * The root btree block can have fewer than minrecs pointers in it
4895          * because the tree might not be big enough to require that amount of
4896          * fanout. Hence it has a minimum size of 2 pointers, not limits[1].
4897          */
4898         unsigned long long      node_blocks = 2;
4899         unsigned long long      blocks_left = leaf_blocks - 1;
4900         unsigned int            height = 1;
4901
4902         if (leaf_blocks < 1)
4903                 return 0;
4904
4905         while (node_blocks < blocks_left) {
4906                 blocks_left -= node_blocks;
4907                 node_blocks *= limits[1];
4908                 height++;
4909         }
4910
4911         return height;
4912 }
4913
4914 /*
4915  * Query a regular btree for all records overlapping a given interval.
4916  * Start with a LE lookup of the key of low_rec and return all records
4917  * until we find a record with a key greater than the key of high_rec.
4918  */
4919 STATIC int
4920 xfs_btree_simple_query_range(
4921         struct xfs_btree_cur            *cur,
4922         const union xfs_btree_key       *low_key,
4923         const union xfs_btree_key       *high_key,
4924         xfs_btree_query_range_fn        fn,
4925         void                            *priv)
4926 {
4927         union xfs_btree_rec             *recp;
4928         union xfs_btree_key             rec_key;
4929         int                             stat;
4930         bool                            firstrec = true;
4931         int                             error;
4932
4933         ASSERT(cur->bc_ops->init_high_key_from_rec);
4934         ASSERT(cur->bc_ops->diff_two_keys);
4935
4936         /*
4937          * Find the leftmost record.  The btree cursor must be set
4938          * to the low record used to generate low_key.
4939          */
4940         stat = 0;
4941         error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat);
4942         if (error)
4943                 goto out;
4944
4945         /* Nothing?  See if there's anything to the right. */
4946         if (!stat) {
4947                 error = xfs_btree_increment(cur, 0, &stat);
4948                 if (error)
4949                         goto out;
4950         }
4951
4952         while (stat) {
4953                 /* Find the record. */
4954                 error = xfs_btree_get_rec(cur, &recp, &stat);
4955                 if (error || !stat)
4956                         break;
4957
4958                 /* Skip if low_key > high_key(rec). */
4959                 if (firstrec) {
4960                         cur->bc_ops->init_high_key_from_rec(&rec_key, recp);
4961                         firstrec = false;
4962                         if (xfs_btree_keycmp_gt(cur, low_key, &rec_key))
4963                                 goto advloop;
4964                 }
4965
4966                 /* Stop if low_key(rec) > high_key. */
4967                 cur->bc_ops->init_key_from_rec(&rec_key, recp);
4968                 if (xfs_btree_keycmp_gt(cur, &rec_key, high_key))
4969                         break;
4970
4971                 /* Callback */
4972                 error = fn(cur, recp, priv);
4973                 if (error)
4974                         break;
4975
4976 advloop:
4977                 /* Move on to the next record. */
4978                 error = xfs_btree_increment(cur, 0, &stat);
4979                 if (error)
4980                         break;
4981         }
4982
4983 out:
4984         return error;
4985 }
4986
4987 /*
4988  * Query an overlapped interval btree for all records overlapping a given
4989  * interval.  This function roughly follows the algorithm given in
4990  * "Interval Trees" of _Introduction to Algorithms_, which is section
4991  * 14.3 in the 2nd and 3rd editions.
4992  *
4993  * First, generate keys for the low and high records passed in.
4994  *
4995  * For any leaf node, generate the high and low keys for the record.
4996  * If the record keys overlap with the query low/high keys, pass the
4997  * record to the function iterator.
4998  *
4999  * For any internal node, compare the low and high keys of each
5000  * pointer against the query low/high keys.  If there's an overlap,
5001  * follow the pointer.
5002  *
5003  * As an optimization, we stop scanning a block when we find a low key
5004  * that is greater than the query's high key.
5005  */
5006 STATIC int
5007 xfs_btree_overlapped_query_range(
5008         struct xfs_btree_cur            *cur,
5009         const union xfs_btree_key       *low_key,
5010         const union xfs_btree_key       *high_key,
5011         xfs_btree_query_range_fn        fn,
5012         void                            *priv)
5013 {
5014         union xfs_btree_ptr             ptr;
5015         union xfs_btree_ptr             *pp;
5016         union xfs_btree_key             rec_key;
5017         union xfs_btree_key             rec_hkey;
5018         union xfs_btree_key             *lkp;
5019         union xfs_btree_key             *hkp;
5020         union xfs_btree_rec             *recp;
5021         struct xfs_btree_block          *block;
5022         int                             level;
5023         struct xfs_buf                  *bp;
5024         int                             i;
5025         int                             error;
5026
5027         /* Load the root of the btree. */
5028         level = cur->bc_nlevels - 1;
5029         xfs_btree_init_ptr_from_cur(cur, &ptr);
5030         error = xfs_btree_lookup_get_block(cur, level, &ptr, &block);
5031         if (error)
5032                 return error;
5033         xfs_btree_get_block(cur, level, &bp);
5034         trace_xfs_btree_overlapped_query_range(cur, level, bp);
5035 #ifdef DEBUG
5036         error = xfs_btree_check_block(cur, block, level, bp);
5037         if (error)
5038                 goto out;
5039 #endif
5040         cur->bc_levels[level].ptr = 1;
5041
5042         while (level < cur->bc_nlevels) {
5043                 block = xfs_btree_get_block(cur, level, &bp);
5044
5045                 /* End of node, pop back towards the root. */
5046                 if (cur->bc_levels[level].ptr >
5047                                         be16_to_cpu(block->bb_numrecs)) {
5048 pop_up:
5049                         if (level < cur->bc_nlevels - 1)
5050                                 cur->bc_levels[level + 1].ptr++;
5051                         level++;
5052                         continue;
5053                 }
5054
5055                 if (level == 0) {
5056                         /* Handle a leaf node. */
5057                         recp = xfs_btree_rec_addr(cur, cur->bc_levels[0].ptr,
5058                                         block);
5059
5060                         cur->bc_ops->init_high_key_from_rec(&rec_hkey, recp);
5061                         cur->bc_ops->init_key_from_rec(&rec_key, recp);
5062
5063                         /*
5064                          * If (query's high key < record's low key), then there
5065                          * are no more interesting records in this block.  Pop
5066                          * up to the leaf level to find more record blocks.
5067                          *
5068                          * If (record's high key >= query's low key) and
5069                          *    (query's high key >= record's low key), then
5070                          * this record overlaps the query range; callback.
5071                          */
5072                         if (xfs_btree_keycmp_lt(cur, high_key, &rec_key))
5073                                 goto pop_up;
5074                         if (xfs_btree_keycmp_ge(cur, &rec_hkey, low_key)) {
5075                                 error = fn(cur, recp, priv);
5076                                 if (error)
5077                                         break;
5078                         }
5079                         cur->bc_levels[level].ptr++;
5080                         continue;
5081                 }
5082
5083                 /* Handle an internal node. */
5084                 lkp = xfs_btree_key_addr(cur, cur->bc_levels[level].ptr, block);
5085                 hkp = xfs_btree_high_key_addr(cur, cur->bc_levels[level].ptr,
5086                                 block);
5087                 pp = xfs_btree_ptr_addr(cur, cur->bc_levels[level].ptr, block);
5088
5089                 /*
5090                  * If (query's high key < pointer's low key), then there are no
5091                  * more interesting keys in this block.  Pop up one leaf level
5092                  * to continue looking for records.
5093                  *
5094                  * If (pointer's high key >= query's low key) and
5095                  *    (query's high key >= pointer's low key), then
5096                  * this record overlaps the query range; follow pointer.
5097                  */
5098                 if (xfs_btree_keycmp_lt(cur, high_key, lkp))
5099                         goto pop_up;
5100                 if (xfs_btree_keycmp_ge(cur, hkp, low_key)) {
5101                         level--;
5102                         error = xfs_btree_lookup_get_block(cur, level, pp,
5103                                         &block);
5104                         if (error)
5105                                 goto out;
5106                         xfs_btree_get_block(cur, level, &bp);
5107                         trace_xfs_btree_overlapped_query_range(cur, level, bp);
5108 #ifdef DEBUG
5109                         error = xfs_btree_check_block(cur, block, level, bp);
5110                         if (error)
5111                                 goto out;
5112 #endif
5113                         cur->bc_levels[level].ptr = 1;
5114                         continue;
5115                 }
5116                 cur->bc_levels[level].ptr++;
5117         }
5118
5119 out:
5120         /*
5121          * If we don't end this function with the cursor pointing at a record
5122          * block, a subsequent non-error cursor deletion will not release
5123          * node-level buffers, causing a buffer leak.  This is quite possible
5124          * with a zero-results range query, so release the buffers if we
5125          * failed to return any results.
5126          */
5127         if (cur->bc_levels[0].bp == NULL) {
5128                 for (i = 0; i < cur->bc_nlevels; i++) {
5129                         if (cur->bc_levels[i].bp) {
5130                                 xfs_trans_brelse(cur->bc_tp,
5131                                                 cur->bc_levels[i].bp);
5132                                 cur->bc_levels[i].bp = NULL;
5133                                 cur->bc_levels[i].ptr = 0;
5134                                 cur->bc_levels[i].ra = 0;
5135                         }
5136                 }
5137         }
5138
5139         return error;
5140 }
5141
5142 static inline void
5143 xfs_btree_key_from_irec(
5144         struct xfs_btree_cur            *cur,
5145         union xfs_btree_key             *key,
5146         const union xfs_btree_irec      *irec)
5147 {
5148         union xfs_btree_rec             rec;
5149
5150         cur->bc_rec = *irec;
5151         cur->bc_ops->init_rec_from_cur(cur, &rec);
5152         cur->bc_ops->init_key_from_rec(key, &rec);
5153 }
5154
5155 /*
5156  * Query a btree for all records overlapping a given interval of keys.  The
5157  * supplied function will be called with each record found; return one of the
5158  * XFS_BTREE_QUERY_RANGE_{CONTINUE,ABORT} values or the usual negative error
5159  * code.  This function returns -ECANCELED, zero, or a negative error code.
5160  */
5161 int
5162 xfs_btree_query_range(
5163         struct xfs_btree_cur            *cur,
5164         const union xfs_btree_irec      *low_rec,
5165         const union xfs_btree_irec      *high_rec,
5166         xfs_btree_query_range_fn        fn,
5167         void                            *priv)
5168 {
5169         union xfs_btree_key             low_key;
5170         union xfs_btree_key             high_key;
5171
5172         /* Find the keys of both ends of the interval. */
5173         xfs_btree_key_from_irec(cur, &high_key, high_rec);
5174         xfs_btree_key_from_irec(cur, &low_key, low_rec);
5175
5176         /* Enforce low key <= high key. */
5177         if (!xfs_btree_keycmp_le(cur, &low_key, &high_key))
5178                 return -EINVAL;
5179
5180         if (!(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
5181                 return xfs_btree_simple_query_range(cur, &low_key,
5182                                 &high_key, fn, priv);
5183         return xfs_btree_overlapped_query_range(cur, &low_key, &high_key,
5184                         fn, priv);
5185 }
5186
/* Query a btree for all records. */
int
xfs_btree_query_all(
	struct xfs_btree_cur		*cur,
	xfs_btree_query_range_fn	fn,
	void				*priv)
{
	union xfs_btree_key		low_key;
	union xfs_btree_key		high_key;

	/*
	 * An all-zeroes key is the lowest possible key and an all-ones key
	 * is the highest, so the range [low_key, high_key] covers the whole
	 * keyspace.  Zero bc_rec so the initial lookup starts from a known
	 * state.
	 */
	memset(&cur->bc_rec, 0, sizeof(cur->bc_rec));
	memset(&low_key, 0, sizeof(low_key));
	memset(&high_key, 0xFF, sizeof(high_key));

	/* Use the simple walk directly, regardless of btree geometry. */
	return xfs_btree_simple_query_range(cur, &low_key, &high_key, fn, priv);
}
5203
5204 static int
5205 xfs_btree_count_blocks_helper(
5206         struct xfs_btree_cur    *cur,
5207         int                     level,
5208         void                    *data)
5209 {
5210         xfs_extlen_t            *blocks = data;
5211         (*blocks)++;
5212
5213         return 0;
5214 }
5215
/* Count the blocks in a btree and return the result in *blocks. */
int
xfs_btree_count_blocks(
	struct xfs_btree_cur	*cur,
	xfs_extlen_t		*blocks)
{
	*blocks = 0;
	/* Visit every block in the tree, incrementing *blocks once each. */
	return xfs_btree_visit_blocks(cur, xfs_btree_count_blocks_helper,
			XFS_BTREE_VISIT_ALL, blocks);
}
5226
/* Compare two btree pointers. */
int64_t
xfs_btree_diff_two_ptrs(
	struct xfs_btree_cur		*cur,
	const union xfs_btree_ptr	*a,
	const union xfs_btree_ptr	*b)
{
	/*
	 * Returns a - b as a signed value: negative if a < b, zero if equal,
	 * positive if a > b.  Long-pointer btrees compare the 64-bit form,
	 * short-pointer btrees the 32-bit form.
	 */
	if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
		return (int64_t)be64_to_cpu(a->l) - be64_to_cpu(b->l);
	return (int64_t)be32_to_cpu(a->s) - be32_to_cpu(b->s);
}
5238
/* State for a keyspace packedness scan; see xfs_btree_has_records(). */
struct xfs_btree_has_records {
	/* Keys for the start and end of the range we want to know about. */
	union xfs_btree_key		start_key;
	union xfs_btree_key		end_key;

	/* Mask for key comparisons, if desired. */
	const union xfs_btree_key	*key_mask;

	/* Highest record key we've seen so far. */
	union xfs_btree_key		high_key;

	/* Running classification of the scanned keyspace. */
	enum xbtree_recpacking		outcome;
};
5252
5253 STATIC int
5254 xfs_btree_has_records_helper(
5255         struct xfs_btree_cur            *cur,
5256         const union xfs_btree_rec       *rec,
5257         void                            *priv)
5258 {
5259         union xfs_btree_key             rec_key;
5260         union xfs_btree_key             rec_high_key;
5261         struct xfs_btree_has_records    *info = priv;
5262         enum xbtree_key_contig          key_contig;
5263
5264         cur->bc_ops->init_key_from_rec(&rec_key, rec);
5265
5266         if (info->outcome == XBTREE_RECPACKING_EMPTY) {
5267                 info->outcome = XBTREE_RECPACKING_SPARSE;
5268
5269                 /*
5270                  * If the first record we find does not overlap the start key,
5271                  * then there is a hole at the start of the search range.
5272                  * Classify this as sparse and stop immediately.
5273                  */
5274                 if (xfs_btree_masked_keycmp_lt(cur, &info->start_key, &rec_key,
5275                                         info->key_mask))
5276                         return -ECANCELED;
5277         } else {
5278                 /*
5279                  * If a subsequent record does not overlap with the any record
5280                  * we've seen so far, there is a hole in the middle of the
5281                  * search range.  Classify this as sparse and stop.
5282                  * If the keys overlap and this btree does not allow overlap,
5283                  * signal corruption.
5284                  */
5285                 key_contig = cur->bc_ops->keys_contiguous(cur, &info->high_key,
5286                                         &rec_key, info->key_mask);
5287                 if (key_contig == XBTREE_KEY_OVERLAP &&
5288                                 !(cur->bc_ops->geom_flags & XFS_BTGEO_OVERLAPPING))
5289                         return -EFSCORRUPTED;
5290                 if (key_contig == XBTREE_KEY_GAP)
5291                         return -ECANCELED;
5292         }
5293
5294         /*
5295          * If high_key(rec) is larger than any other high key we've seen,
5296          * remember it for later.
5297          */
5298         cur->bc_ops->init_high_key_from_rec(&rec_high_key, rec);
5299         if (xfs_btree_masked_keycmp_gt(cur, &rec_high_key, &info->high_key,
5300                                 info->key_mask))
5301                 info->high_key = rec_high_key; /* struct copy */
5302
5303         return 0;
5304 }
5305
5306 /*
5307  * Scan part of the keyspace of a btree and tell us if that keyspace does not
5308  * map to any records; is fully mapped to records; or is partially mapped to
5309  * records.  This is the btree record equivalent to determining if a file is
5310  * sparse.
5311  *
5312  * For most btree types, the record scan should use all available btree key
5313  * fields to compare the keys encountered.  These callers should pass NULL for
5314  * @mask.  However, some callers (e.g.  scanning physical space in the rmapbt)
5315  * want to ignore some part of the btree record keyspace when performing the
5316  * comparison.  These callers should pass in a union xfs_btree_key object with
5317  * the fields that *should* be a part of the comparison set to any nonzero
5318  * value, and the rest zeroed.
5319  */
5320 int
5321 xfs_btree_has_records(
5322         struct xfs_btree_cur            *cur,
5323         const union xfs_btree_irec      *low,
5324         const union xfs_btree_irec      *high,
5325         const union xfs_btree_key       *mask,
5326         enum xbtree_recpacking          *outcome)
5327 {
5328         struct xfs_btree_has_records    info = {
5329                 .outcome                = XBTREE_RECPACKING_EMPTY,
5330                 .key_mask               = mask,
5331         };
5332         int                             error;
5333
5334         /* Not all btrees support this operation. */
5335         if (!cur->bc_ops->keys_contiguous) {
5336                 ASSERT(0);
5337                 return -EOPNOTSUPP;
5338         }
5339
5340         xfs_btree_key_from_irec(cur, &info.start_key, low);
5341         xfs_btree_key_from_irec(cur, &info.end_key, high);
5342
5343         error = xfs_btree_query_range(cur, low, high,
5344                         xfs_btree_has_records_helper, &info);
5345         if (error == -ECANCELED)
5346                 goto out;
5347         if (error)
5348                 return error;
5349
5350         if (info.outcome == XBTREE_RECPACKING_EMPTY)
5351                 goto out;
5352
5353         /*
5354          * If the largest high_key(rec) we saw during the walk is greater than
5355          * the end of the search range, classify this as full.  Otherwise,
5356          * there is a hole at the end of the search range.
5357          */
5358         if (xfs_btree_masked_keycmp_ge(cur, &info.high_key, &info.end_key,
5359                                 mask))
5360                 info.outcome = XBTREE_RECPACKING_FULL;
5361
5362 out:
5363         *outcome = info.outcome;
5364         return 0;
5365 }
5366
5367 /* Are there more records in this btree? */
5368 bool
5369 xfs_btree_has_more_records(
5370         struct xfs_btree_cur    *cur)
5371 {
5372         struct xfs_btree_block  *block;
5373         struct xfs_buf          *bp;
5374
5375         block = xfs_btree_get_block(cur, 0, &bp);
5376
5377         /* There are still records in this block. */
5378         if (cur->bc_levels[0].ptr < xfs_btree_get_numrecs(block))
5379                 return true;
5380
5381         /* There are more record blocks. */
5382         if (cur->bc_ops->ptr_len == XFS_BTREE_LONG_PTR_LEN)
5383                 return block->bb_u.l.bb_rightsib != cpu_to_be64(NULLFSBLOCK);
5384         else
5385                 return block->bb_u.s.bb_rightsib != cpu_to_be32(NULLAGBLOCK);
5386 }
5387
5388 /* Set up all the btree cursor caches. */
5389 int __init
5390 xfs_btree_init_cur_caches(void)
5391 {
5392         int             error;
5393
5394         error = xfs_allocbt_init_cur_cache();
5395         if (error)
5396                 return error;
5397         error = xfs_inobt_init_cur_cache();
5398         if (error)
5399                 goto err;
5400         error = xfs_bmbt_init_cur_cache();
5401         if (error)
5402                 goto err;
5403         error = xfs_rmapbt_init_cur_cache();
5404         if (error)
5405                 goto err;
5406         error = xfs_refcountbt_init_cur_cache();
5407         if (error)
5408                 goto err;
5409
5410         return 0;
5411 err:
5412         xfs_btree_destroy_cur_caches();
5413         return error;
5414 }
5415
/* Destroy all the btree cursor caches, if they've been allocated. */
void
xfs_btree_destroy_cur_caches(void)
{
	/*
	 * NOTE(review): the error path of xfs_btree_init_cur_caches() calls
	 * this with some caches never having been set up, so each per-type
	 * destructor is presumably safe to call on an uninitialized cache --
	 * confirm in the per-btree implementations.
	 */
	xfs_allocbt_destroy_cur_cache();
	xfs_inobt_destroy_cur_cache();
	xfs_bmbt_destroy_cur_cache();
	xfs_rmapbt_destroy_cur_cache();
	xfs_refcountbt_destroy_cur_cache();
}
5426
5427 /* Move the btree cursor before the first record. */
5428 int
5429 xfs_btree_goto_left_edge(
5430         struct xfs_btree_cur    *cur)
5431 {
5432         int                     stat = 0;
5433         int                     error;
5434
5435         memset(&cur->bc_rec, 0, sizeof(cur->bc_rec));
5436         error = xfs_btree_lookup(cur, XFS_LOOKUP_LE, &stat);
5437         if (error)
5438                 return error;
5439         if (!stat)
5440                 return 0;
5441
5442         error = xfs_btree_decrement(cur, 0, &stat);
5443         if (error)
5444                 return error;
5445         if (stat != 0) {
5446                 ASSERT(0);
5447                 xfs_btree_mark_sick(cur);
5448                 return -EFSCORRUPTED;
5449         }
5450
5451         return 0;
5452 }