bmap.c

   1 // SPDX-License-Identifier: GPL-2.0-only
   2 /*
   3  * Copyright (C) Sistina Software, Inc.  1997-2003 All rights reserved.
   4  * Copyright (C) 2004-2006 Red Hat, Inc.  All rights reserved.
   5  */
   6
   7 #include <linux/spinlock.h>
   8 #include <linux/completion.h>
   9 #include <linux/buffer_head.h>
  10 #include <linux/blkdev.h>
  11 #include <linux/gfs2_ondisk.h>
  12 #include <linux/crc32.h>
  13 #include <linux/iomap.h>
  14 #include <linux/ktime.h>
  15
  16 #include "gfs2.h"
  17 #include "incore.h"
  18 #include "bmap.h"
  19 #include "glock.h"
  20 #include "inode.h"
  21 #include "meta_io.h"
  22 #include "quota.h"
  23 #include "rgrp.h"
  24 #include "log.h"
  25 #include "super.h"
  26 #include "trans.h"
  27 #include "dir.h"
  28 #include "util.h"
  29 #include "aops.h"
  30 #include "trace_gfs2.h"
  31
/* This doesn't need to be that large as max 64 bit pointers in a 4k
 * block is 512, so __u16 is fine for that. It saves stack space to
 * keep it small.
 *
 * A metapath describes one path through an inode's metadata tree: for
 * every height it records the buffer at that height and the index of
 * the pointer to follow within that buffer.
 */
struct metapath {
        /* Buffer at each height; slot 0 is the dinode buffer. */
        struct buffer_head *mp_bh[GFS2_MAX_META_HEIGHT];
        /* Index of the pointer to follow at each height. */
        __u16 mp_list[GFS2_MAX_META_HEIGHT];
        int mp_fheight; /* find_metapath height */
        int mp_aheight; /* actual height (lookup height) */
};
  42
  43 static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length);
  44
  45 /**
  46  * gfs2_unstuffer_page - unstuff a stuffed inode into a block cached by a page
  47  * @ip: the inode
  48  * @dibh: the dinode buffer
  49  * @block: the block number that was allocated
  50  * @page: The (optional) page. This is looked up if @page is NULL
  51  *
  52  * Returns: errno
  53  */
  54
  55 static int gfs2_unstuffer_page(struct gfs2_inode *ip, struct buffer_head *dibh,
  56                                u64 block, struct page *page)
  57 {
  58         struct inode *inode = &ip->i_inode;
  59
  60         if (!PageUptodate(page)) {
  61                 void *kaddr = kmap(page);
  62                 u64 dsize = i_size_read(inode);
  63
  64                 memcpy(kaddr, dibh->b_data + sizeof(struct gfs2_dinode), dsize);
  65                 memset(kaddr + dsize, 0, PAGE_SIZE - dsize);
  66                 kunmap(page);
  67
  68                 SetPageUptodate(page);
  69         }
  70
  71         if (gfs2_is_jdata(ip)) {
  72                 struct buffer_head *bh;
  73
  74                 if (!page_has_buffers(page))
  75                         create_empty_buffers(page, BIT(inode->i_blkbits),
  76                                              BIT(BH_Uptodate));
  77
  78                 bh = page_buffers(page);
  79                 if (!buffer_mapped(bh))
  80                         map_bh(bh, inode->i_sb, block);
  81
  82                 set_buffer_uptodate(bh);
  83                 gfs2_trans_add_data(ip->i_gl, bh);
  84         } else {
  85                 set_page_dirty(page);
  86                 gfs2_ordered_add_inode(ip);
  87         }
  88
  89         return 0;
  90 }
  91
  92 static int __gfs2_unstuff_inode(struct gfs2_inode *ip, struct page *page)
  93 {
  94         struct buffer_head *bh, *dibh;
  95         struct gfs2_dinode *di;
  96         u64 block = 0;
  97         int isdir = gfs2_is_dir(ip);
  98         int error;
  99
 100         error = gfs2_meta_inode_buffer(ip, &dibh);
 101         if (error)
 102                 return error;
 103
 104         if (i_size_read(&ip->i_inode)) {
 105                 /* Get a free block, fill it with the stuffed data,
 106                    and write it out to disk */
 107
 108                 unsigned int n = 1;
 109                 error = gfs2_alloc_blocks(ip, &block, &n, 0, NULL);
 110                 if (error)
 111                         goto out_brelse;
 112                 if (isdir) {
 113                         gfs2_trans_remove_revoke(GFS2_SB(&ip->i_inode), block, 1);
 114                         error = gfs2_dir_get_new_buffer(ip, block, &bh);
 115                         if (error)
 116                                 goto out_brelse;
 117                         gfs2_buffer_copy_tail(bh, sizeof(struct gfs2_meta_header),
 118                                               dibh, sizeof(struct gfs2_dinode));
 119                         brelse(bh);
 120                 } else {
 121                         error = gfs2_unstuffer_page(ip, dibh, block, page);
 122                         if (error)
 123                                 goto out_brelse;
 124                 }
 125         }
 126
 127         /*  Set up the pointer to the new block  */
 128
 129         gfs2_trans_add_meta(ip->i_gl, dibh);
 130         di = (struct gfs2_dinode *)dibh->b_data;
 131         gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
 132
 133         if (i_size_read(&ip->i_inode)) {
 134                 *(__be64 *)(di + 1) = cpu_to_be64(block);
 135                 gfs2_add_inode_blocks(&ip->i_inode, 1);
 136                 di->di_blocks = cpu_to_be64(gfs2_get_inode_blocks(&ip->i_inode));
 137         }
 138
 139         ip->i_height = 1;
 140         di->di_height = cpu_to_be16(1);
 141
 142 out_brelse:
 143         brelse(dibh);
 144         return error;
 145 }
 146
 147 /**
 148  * gfs2_unstuff_dinode - Unstuff a dinode when the data has grown too big
 149  * @ip: The GFS2 inode to unstuff
 150  *
 151  * This routine unstuffs a dinode and returns it to a "normal" state such
 152  * that the height can be grown in the traditional way.
 153  *
 154  * Returns: errno
 155  */
 156
 157 int gfs2_unstuff_dinode(struct gfs2_inode *ip)
 158 {
 159         struct inode *inode = &ip->i_inode;
 160         struct page *page;
 161         int error;
 162
 163         down_write(&ip->i_rw_mutex);
 164         page = find_or_create_page(inode->i_mapping, 0, GFP_NOFS);
 165         error = -ENOMEM;
 166         if (!page)
 167                 goto out;
 168         error = __gfs2_unstuff_inode(ip, page);
 169         unlock_page(page);
 170         put_page(page);
 171 out:
 172         up_write(&ip->i_rw_mutex);
 173         return error;
 174 }
 175
 176 /**
 177  * find_metapath - Find path through the metadata tree
 178  * @sdp: The superblock
 179  * @block: The disk block to look up
 180  * @mp: The metapath to return the result in
 181  * @height: The pre-calculated height of the metadata tree
 182  *
 183  *   This routine returns a struct metapath structure that defines a path
 184  *   through the metadata of inode "ip" to get to block "block".
 185  *
 186  *   Example:
 *   Given:  "ip" is a height 3 file, "offset" is 101342453 (so @block is
 *   101342453 / 4096 = 24741), and this is a filesystem with a blocksize
 *   of 4096.  For simplicity, the example assumes 512 pointers per block.
 189  *
 190  *   find_metapath() would return a struct metapath structure set to:
 191  *   mp_fheight = 3, mp_list[0] = 0, mp_list[1] = 48, and mp_list[2] = 165.
 192  *
 193  *   That means that in order to get to the block containing the byte at
 194  *   offset 101342453, we would load the indirect block pointed to by pointer
 195  *   0 in the dinode.  We would then load the indirect block pointed to by
 196  *   pointer 48 in that indirect block.  We would then load the data block
 197  *   pointed to by pointer 165 in that indirect block.
 198  *
 199  *             ----------------------------------------
 200  *             | Dinode |                             |
 201  *             |        |                            4|
 202  *             |        |0 1 2 3 4 5                 9|
 203  *             |        |                            6|
 204  *             ----------------------------------------
 205  *                       |
 206  *                       |
 207  *                       V
 208  *             ----------------------------------------
 209  *             | Indirect Block                       |
 210  *             |                                     5|
 211  *             |            4 4 4 4 4 5 5            1|
 212  *             |0           5 6 7 8 9 0 1            2|
 213  *             ----------------------------------------
 214  *                                |
 215  *                                |
 216  *                                V
 217  *             ----------------------------------------
 218  *             | Indirect Block                       |
 219  *             |                         1 1 1 1 1   5|
 220  *             |                         6 6 6 6 6   1|
 221  *             |0                        3 4 5 6 7   2|
 222  *             ----------------------------------------
 223  *                                           |
 224  *                                           |
 225  *                                           V
 226  *             ----------------------------------------
 227  *             | Data block containing offset         |
 228  *             |            101342453                 |
 229  *             |                                      |
 230  *             |                                      |
 231  *             ----------------------------------------
 232  *
 233  */
 234
 235 static void find_metapath(const struct gfs2_sbd *sdp, u64 block,
 236                           struct metapath *mp, unsigned int height)
 237 {
 238         unsigned int i;
 239
 240         mp->mp_fheight = height;
 241         for (i = height; i--;)
 242                 mp->mp_list[i] = do_div(block, sdp->sd_inptrs);
 243 }
 244
 245 static inline unsigned int metapath_branch_start(const struct metapath *mp)
 246 {
 247         if (mp->mp_list[0] == 0)
 248                 return 2;
 249         return 1;
 250 }
 251
 252 /**
 253  * metaptr1 - Return the first possible metadata pointer in a metapath buffer
 254  * @height: The metadata height (0 = dinode)
 255  * @mp: The metapath
 256  */
 257 static inline __be64 *metaptr1(unsigned int height, const struct metapath *mp)
 258 {
 259         struct buffer_head *bh = mp->mp_bh[height];
 260         if (height == 0)
 261                 return ((__be64 *)(bh->b_data + sizeof(struct gfs2_dinode)));
 262         return ((__be64 *)(bh->b_data + sizeof(struct gfs2_meta_header)));
 263 }
 264
 265 /**
 266  * metapointer - Return pointer to start of metadata in a buffer
 267  * @height: The metadata height (0 = dinode)
 268  * @mp: The metapath
 269  *
 270  * Return a pointer to the block number of the next height of the metadata
 271  * tree given a buffer containing the pointer to the current height of the
 272  * metadata tree.
 273  */
 274
 275 static inline __be64 *metapointer(unsigned int height, const struct metapath *mp)
 276 {
 277         __be64 *p = metaptr1(height, mp);
 278         return p + mp->mp_list[height];
 279 }
 280
 281 static inline const __be64 *metaend(unsigned int height, const struct metapath *mp)
 282 {
 283         const struct buffer_head *bh = mp->mp_bh[height];
 284         return (const __be64 *)(bh->b_data + bh->b_size);
 285 }
 286
 287 static void clone_metapath(struct metapath *clone, struct metapath *mp)
 288 {
 289         unsigned int hgt;
 290
 291         *clone = *mp;
 292         for (hgt = 0; hgt < mp->mp_aheight; hgt++)
 293                 get_bh(clone->mp_bh[hgt]);
 294 }
 295
 296 static void gfs2_metapath_ra(struct gfs2_glock *gl, __be64 *start, __be64 *end)
 297 {
 298         const __be64 *t;
 299
 300         for (t = start; t < end; t++) {
 301                 struct buffer_head *rabh;
 302
 303                 if (!*t)
 304                         continue;
 305
 306                 rabh = gfs2_getbuf(gl, be64_to_cpu(*t), CREATE);
 307                 if (trylock_buffer(rabh)) {
 308                         if (!buffer_uptodate(rabh)) {
 309                                 rabh->b_end_io = end_buffer_read_sync;
 310                                 submit_bh(REQ_OP_READ | REQ_RAHEAD | REQ_META |
 311                                           REQ_PRIO, rabh);
 312                                 continue;
 313                         }
 314                         unlock_buffer(rabh);
 315                 }
 316                 brelse(rabh);
 317         }
 318 }
 319
 320 static int __fillup_metapath(struct gfs2_inode *ip, struct metapath *mp,
 321                              unsigned int x, unsigned int h)
 322 {
 323         for (; x < h; x++) {
 324                 __be64 *ptr = metapointer(x, mp);
 325                 u64 dblock = be64_to_cpu(*ptr);
 326                 int ret;
 327
 328                 if (!dblock)
 329                         break;
 330                 ret = gfs2_meta_buffer(ip, GFS2_METATYPE_IN, dblock, &mp->mp_bh[x + 1]);
 331                 if (ret)
 332                         return ret;
 333         }
 334         mp->mp_aheight = x + 1;
 335         return 0;
 336 }
 337
 338 /**
 339  * lookup_metapath - Walk the metadata tree to a specific point
 340  * @ip: The inode
 341  * @mp: The metapath
 342  *
 343  * Assumes that the inode's buffer has already been looked up and
 344  * hooked onto mp->mp_bh[0] and that the metapath has been initialised
 345  * by find_metapath().
 346  *
 347  * If this function encounters part of the tree which has not been
 348  * allocated, it returns the current height of the tree at the point
 349  * at which it found the unallocated block. Blocks which are found are
 350  * added to the mp->mp_bh[] list.
 351  *
 352  * Returns: error
 353  */
 354
 355 static int lookup_metapath(struct gfs2_inode *ip, struct metapath *mp)
 356 {
 357         return __fillup_metapath(ip, mp, 0, ip->i_height - 1);
 358 }
 359
 360 /**
 361  * fillup_metapath - fill up buffers for the metadata path to a specific height
 362  * @ip: The inode
 363  * @mp: The metapath
 364  * @h: The height to which it should be mapped
 365  *
 366  * Similar to lookup_metapath, but does lookups for a range of heights
 367  *
 368  * Returns: error or the number of buffers filled
 369  */
 370
 371 static int fillup_metapath(struct gfs2_inode *ip, struct metapath *mp, int h)
 372 {
 373         unsigned int x = 0;
 374         int ret;
 375
 376         if (h) {
 377                 /* find the first buffer we need to look up. */
 378                 for (x = h - 1; x > 0; x--) {
 379                         if (mp->mp_bh[x])
 380                                 break;
 381                 }
 382         }
 383         ret = __fillup_metapath(ip, mp, x, h);
 384         if (ret)
 385                 return ret;
 386         return mp->mp_aheight - x - 1;
 387 }
 388
 389 static sector_t metapath_to_block(struct gfs2_sbd *sdp, struct metapath *mp)
 390 {
 391         sector_t factor = 1, block = 0;
 392         int hgt;
 393
 394         for (hgt = mp->mp_fheight - 1; hgt >= 0; hgt--) {
 395                 if (hgt < mp->mp_aheight)
 396                         block += mp->mp_list[hgt] * factor;
 397                 factor *= sdp->sd_inptrs;
 398         }
 399         return block;
 400 }
 401
 402 static void release_metapath(struct metapath *mp)
 403 {
 404         int i;
 405
 406         for (i = 0; i < GFS2_MAX_META_HEIGHT; i++) {
 407                 if (mp->mp_bh[i] == NULL)
 408                         break;
 409                 brelse(mp->mp_bh[i]);
 410                 mp->mp_bh[i] = NULL;
 411         }
 412 }
 413
 414 /**
 415  * gfs2_extent_length - Returns length of an extent of blocks
 416  * @bh: The metadata block
 417  * @ptr: Current position in @bh
 418  * @limit: Max extent length to return
 419  * @eob: Set to 1 if we hit "end of block"
 420  *
 421  * Returns: The length of the extent (minimum of one block)
 422  */
 423
 424 static inline unsigned int gfs2_extent_length(struct buffer_head *bh, __be64 *ptr, size_t limit, int *eob)
 425 {
 426         const __be64 *end = (__be64 *)(bh->b_data + bh->b_size);
 427         const __be64 *first = ptr;
 428         u64 d = be64_to_cpu(*ptr);
 429
 430         *eob = 0;
 431         do {
 432                 ptr++;
 433                 if (ptr >= end)
 434                         break;
 435                 d++;
 436         } while(be64_to_cpu(*ptr) == d);
 437         if (ptr >= end)
 438                 *eob = 1;
 439         return ptr - first;
 440 }
 441
 442 enum walker_status { WALK_STOP, WALK_FOLLOW, WALK_CONTINUE };
 443
/*
 * gfs2_metadata_walker - walk an indirect block
 * @mp: Metapath to indirect block
 * @ptrs: Number of pointers to look at
 *
 * Callback type used by gfs2_walk_metadata() for every block it visits.
 *
 * When returning WALK_FOLLOW, the walker must update @mp to point at the right
 * indirect block to follow.
 *
 * Returns one of WALK_STOP, WALK_FOLLOW or WALK_CONTINUE.
 */
typedef enum walker_status (*gfs2_metadata_walker)(struct metapath *mp,
                                                   unsigned int ptrs);
 454
/*
 * gfs2_walk_metadata - walk a tree of indirect blocks
 * @inode: The inode
 * @mp: Starting point of walk
 * @max_len: Maximum number of blocks to walk
 * @walker: Called during the walk
 *
 * The walk modifies @mp: buffers are released and looked up, and mp_list[]
 * and mp_aheight are updated as the walk moves through the tree.
 *
 * Returns 1 if the walk was stopped by @walker, 0 if we went past @max_len or
 * past the end of metadata, and a negative error code otherwise.
 */

static int gfs2_walk_metadata(struct inode *inode, struct metapath *mp,
                u64 max_len, gfs2_metadata_walker walker)
{
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_sbd *sdp = GFS2_SB(inode);
        u64 factor = 1; /* blocks covered by one pointer at height hgt */
        unsigned int hgt;
        int ret;

        /*
         * The walk starts in the lowest allocated indirect block, which may be
         * before the position indicated by @mp.  Adjust @max_len accordingly
         * to avoid a short walk.
         */
        for (hgt = mp->mp_fheight - 1; hgt >= mp->mp_aheight; hgt--) {
                max_len += mp->mp_list[hgt] * factor;
                mp->mp_list[hgt] = 0;
                factor *= sdp->sd_inptrs;
        }
        /* hgt is now mp->mp_aheight - 1, the lowest height with a buffer. */

        for (;;) {
                u16 start = mp->mp_list[hgt];
                enum walker_status status;
                unsigned int ptrs;
                u64 len;

                /* Walk indirect block. */
                /* Pointers left in this block: sd_diptrs in the dinode, sd_inptrs elsewhere. */
                ptrs = (hgt >= 1 ? sdp->sd_inptrs : sdp->sd_diptrs) - start;
                len = ptrs * factor;
                /* Do not let the walker look beyond what @max_len still allows. */
                if (len > max_len)
                        ptrs = DIV_ROUND_UP_ULL(max_len, factor);
                status = walker(mp, ptrs);
                switch (status) {
                case WALK_STOP:
                        return 1;
                case WALK_FOLLOW:
                        BUG_ON(mp->mp_aheight == mp->mp_fheight);
                        /* Only the pointers the walker skipped count as walked. */
                        ptrs = mp->mp_list[hgt] - start;
                        len = ptrs * factor;
                        break;
                case WALK_CONTINUE:
                        break;
                }
                /* Budget used up: the walk went past @max_len. */
                if (len >= max_len)
                        break;
                max_len -= len;
                if (status == WALK_FOLLOW)
                        goto fill_up_metapath;

lower_metapath:
                /* Decrease height of metapath. */
                brelse(mp->mp_bh[hgt]);
                mp->mp_bh[hgt] = NULL;
                mp->mp_list[hgt] = 0;
                /* The dinode itself was exhausted: end of metadata. */
                if (!hgt)
                        break;
                hgt--;
                factor *= sdp->sd_inptrs;

                /* Advance in metadata tree. */
                (mp->mp_list[hgt])++;
                if (hgt) {
                        /* Ran off the end of this block too: go up again. */
                        if (mp->mp_list[hgt] >= sdp->sd_inptrs)
                                goto lower_metapath;
                } else {
                        if (mp->mp_list[hgt] >= sdp->sd_diptrs)
                                break;
                }

fill_up_metapath:
                /* Increase height of metapath. */
                ret = fillup_metapath(ip, mp, ip->i_height - 1);
                if (ret < 0)
                        return ret;
                /* ret buffers were looked up: descend that many heights. */
                hgt += ret;
                for (; ret; ret--)
                        do_div(factor, sdp->sd_inptrs);
                mp->mp_aheight = hgt + 1;
        }
        return 0;
}
 547
 548 static enum walker_status gfs2_hole_walker(struct metapath *mp,
 549                                            unsigned int ptrs)
 550 {
 551         const __be64 *start, *ptr, *end;
 552         unsigned int hgt;
 553
 554         hgt = mp->mp_aheight - 1;
 555         start = metapointer(hgt, mp);
 556         end = start + ptrs;
 557
 558         for (ptr = start; ptr < end; ptr++) {
 559                 if (*ptr) {
 560                         mp->mp_list[hgt] += ptr - start;
 561                         if (mp->mp_aheight == mp->mp_fheight)
 562                                 return WALK_STOP;
 563                         return WALK_FOLLOW;
 564                 }
 565         }
 566         return WALK_CONTINUE;
 567 }
 568
 569 /**
 570  * gfs2_hole_size - figure out the size of a hole
 571  * @inode: The inode
 572  * @lblock: The logical starting block number
 573  * @len: How far to look (in blocks)
 574  * @mp: The metapath at lblock
 575  * @iomap: The iomap to store the hole size in
 576  *
 577  * This function modifies @mp.
 578  *
 579  * Returns: errno on error
 580  */
 581 static int gfs2_hole_size(struct inode *inode, sector_t lblock, u64 len,
 582                           struct metapath *mp, struct iomap *iomap)
 583 {
 584         struct metapath clone;
 585         u64 hole_size;
 586         int ret;
 587
 588         clone_metapath(&clone, mp);
 589         ret = gfs2_walk_metadata(inode, &clone, len, gfs2_hole_walker);
 590         if (ret < 0)
 591                 goto out;
 592
 593         if (ret == 1)
 594                 hole_size = metapath_to_block(GFS2_SB(inode), &clone) - lblock;
 595         else
 596                 hole_size = len;
 597         iomap->length = hole_size << inode->i_blkbits;
 598         ret = 0;
 599
 600 out:
 601         release_metapath(&clone);
 602         return ret;
 603 }
 604
 605 static inline void gfs2_indirect_init(struct metapath *mp,
 606                                       struct gfs2_glock *gl, unsigned int i,
 607                                       unsigned offset, u64 bn)
 608 {
 609         __be64 *ptr = (__be64 *)(mp->mp_bh[i - 1]->b_data +
 610                        ((i > 1) ? sizeof(struct gfs2_meta_header) :
 611                                  sizeof(struct gfs2_dinode)));
 612         BUG_ON(i < 1);
 613         BUG_ON(mp->mp_bh[i] != NULL);
 614         mp->mp_bh[i] = gfs2_meta_new(gl, bn);
 615         gfs2_trans_add_meta(gl, mp->mp_bh[i]);
 616         gfs2_metatype_set(mp->mp_bh[i], GFS2_METATYPE_IN, GFS2_FORMAT_IN);
 617         gfs2_buffer_clear_tail(mp->mp_bh[i], sizeof(struct gfs2_meta_header));
 618         ptr += offset;
 619         *ptr = cpu_to_be64(bn);
 620 }
 621
 622 enum alloc_state {
 623         ALLOC_DATA = 0,
 624         ALLOC_GROW_DEPTH = 1,
 625         ALLOC_GROW_HEIGHT = 2,
 626         /* ALLOC_UNSTUFF = 3,   TBD and rather complicated */
 627 };
 628
/**
 * __gfs2_iomap_alloc - Build a metadata tree of the requested height
 * @inode: The GFS2 inode
 * @iomap: The iomap structure
 * @mp: The metapath, with proper height information calculated
 *
 * In this routine we may have to alloc:
 *   i) Indirect blocks to grow the metadata tree height
 *  ii) Indirect blocks to fill in lower part of the metadata tree
 * iii) Data blocks
 *
 * This function is called after __gfs2_iomap_get, which works out the
 * total number of blocks which we need via gfs2_alloc_size.
 *
 * We then do the actual allocation asking for an extent at a time (if
 * enough contiguous free blocks are available, there will only be one
 * allocation request per call) and uses the state machine to initialise
 * the blocks in order.
 *
 * Right now, this function will allocate at most one indirect block
 * worth of data -- with a default block size of 4K, that's slightly
 * less than 2M.  If this limitation is ever removed to allow huge
 * allocations, we would probably still want to limit the iomap size we
 * return to avoid stalling other tasks during huge writes; the next
 * iomap iteration would then find the blocks already allocated.
 *
 * ip->i_rw_mutex is held for writing while the tree is modified.  On
 * success @iomap describes the newly allocated data extent, ip->i_height
 * is set to mp->mp_fheight and the dinode buffer is updated.
 *
 * Returns: errno on error
 */

static int __gfs2_iomap_alloc(struct inode *inode, struct iomap *iomap,
                              struct metapath *mp)
{
        struct gfs2_inode *ip = GFS2_I(inode);
        struct gfs2_sbd *sdp = GFS2_SB(inode);
        struct buffer_head *dibh = mp->mp_bh[0];
        u64 bn;
        /*
         * n: blocks obtained by the current allocation that are still unused
         * i: height of the next indirect block to initialise
         * blks / alloced: blocks needed in total / allocated so far
         * iblks: indirect blocks needed
         */
        unsigned n, i, blks, alloced = 0, iblks = 0, branch_start = 0;
        size_t dblks = iomap->length >> inode->i_blkbits;
        const unsigned end_of_metadata = mp->mp_fheight - 1;
        int ret;
        enum alloc_state state;
        __be64 *ptr;
        __be64 zero_bn = 0; /* first dinode pointer, saved while growing height */

        BUG_ON(mp->mp_aheight < 1);
        BUG_ON(dibh == NULL);
        BUG_ON(dblks < 1);

        gfs2_trans_add_meta(ip->i_gl, dibh);

        down_write(&ip->i_rw_mutex);

        if (mp->mp_fheight == mp->mp_aheight) {
                /* Bottom indirect block exists */
                state = ALLOC_DATA;
        } else {
                /* Need to allocate indirect blocks */
                if (mp->mp_fheight == ip->i_height) {
                        /* Writing into existing tree, extend tree down */
                        iblks = mp->mp_fheight - mp->mp_aheight;
                        state = ALLOC_GROW_DEPTH;
                } else {
                        /* Building up tree height */
                        state = ALLOC_GROW_HEIGHT;
                        iblks = mp->mp_fheight - ip->i_height;
                        branch_start = metapath_branch_start(mp);
                        iblks += (mp->mp_fheight - branch_start);
                }
        }

        /* start of the second part of the function (state machine) */

        blks = dblks + iblks;
        i = mp->mp_aheight;
        /* Keep allocating extents until the data extent has been placed. */
        do {
                n = blks - alloced;
                /* On return n holds the number of blocks actually obtained. */
                ret = gfs2_alloc_blocks(ip, &bn, &n, 0, NULL);
                if (ret)
                        goto out;
                alloced += n;
                if (state != ALLOC_DATA || gfs2_is_jdata(ip))
                        gfs2_trans_remove_revoke(sdp, bn, n);
                switch (state) {
                /* Growing height of tree */
                case ALLOC_GROW_HEIGHT:
                        if (i == 1) {
                                /*
                                 * gfs2_indirect_init() below overwrites the
                                 * first dinode pointer; remember it.
                                 */
                                ptr = (__be64 *)(dibh->b_data +
                                                 sizeof(struct gfs2_dinode));
                                zero_bn = *ptr;
                        }
                        /* New indirect blocks are chained through pointer 0. */
                        for (; i - 1 < mp->mp_fheight - ip->i_height && n > 0;
                             i++, n--)
                                gfs2_indirect_init(mp, ip->i_gl, i, 0, bn++);
                        if (i - 1 == mp->mp_fheight - ip->i_height) {
                                i--;
                                /*
                                 * Move the old dinode pointers into the last
                                 * new indirect block, leave only the first
                                 * pointer in the dinode, and restore the saved
                                 * first pointer in the new block.
                                 */
                                gfs2_buffer_copy_tail(mp->mp_bh[i],
                                                sizeof(struct gfs2_meta_header),
                                                dibh, sizeof(struct gfs2_dinode));
                                gfs2_buffer_clear_tail(dibh,
                                                sizeof(struct gfs2_dinode) +
                                                sizeof(__be64));
                                ptr = (__be64 *)(mp->mp_bh[i]->b_data +
                                        sizeof(struct gfs2_meta_header));
                                *ptr = zero_bn;
                                state = ALLOC_GROW_DEPTH;
                                /* Drop the buffers from branch_start on. */
                                for(i = branch_start; i < mp->mp_fheight; i++) {
                                        if (mp->mp_bh[i] == NULL)
                                                break;
                                        brelse(mp->mp_bh[i]);
                                        mp->mp_bh[i] = NULL;
                                }
                                i = branch_start;
                        }
                        /* Extent used up: allocate another one. */
                        if (n == 0)
                                break;
                        fallthrough;    /* To branching from existing tree */
                case ALLOC_GROW_DEPTH:
                        if (i > 1 && i < mp->mp_fheight)
                                gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[i-1]);
                        /* Hook each new indirect block into its parent at mp_list[i-1]. */
                        for (; i < mp->mp_fheight && n > 0; i++, n--)
                                gfs2_indirect_init(mp, ip->i_gl, i,
                                                   mp->mp_list[i-1], bn++);
                        if (i == mp->mp_fheight)
                                state = ALLOC_DATA;
                        if (n == 0)
                                break;
                        fallthrough;    /* To tree complete, adding data blocks */
                case ALLOC_DATA:
                        BUG_ON(n > dblks);
                        BUG_ON(mp->mp_bh[end_of_metadata] == NULL);
                        gfs2_trans_add_meta(ip->i_gl, mp->mp_bh[end_of_metadata]);
                        /* The data extent is whatever is left of this allocation. */
                        dblks = n;
                        ptr = metapointer(end_of_metadata, mp);
                        /* Setting iomap->addr also ends the do/while loop. */
                        iomap->addr = bn << inode->i_blkbits;
                        iomap->flags |= IOMAP_F_MERGED | IOMAP_F_NEW;
                        while (n-- > 0)
                                *ptr++ = cpu_to_be64(bn++);
                        break;
                }
        } while (iomap->addr == IOMAP_NULL_ADDR);

        iomap->type = IOMAP_MAPPED;
        iomap->length = (u64)dblks << inode->i_blkbits;
        ip->i_height = mp->mp_fheight;
        gfs2_add_inode_blocks(&ip->i_inode, alloced);
        gfs2_dinode_out(ip, dibh->b_data);
out:
        up_write(&ip->i_rw_mutex);
        return ret;
}
 779
 780 #define IOMAP_F_GFS2_BOUNDARY IOMAP_F_PRIVATE
 781
 782 /**
 783  * gfs2_alloc_size - Compute the maximum allocation size
 784  * @inode: The inode
 785  * @mp: The metapath
 786  * @size: Requested size in blocks
 787  *
 788  * Compute the maximum size of the next allocation at @mp.
 789  *
 790  * Returns: size in blocks
 791  */
 792 static u64 gfs2_alloc_size(struct inode *inode, struct metapath *mp, u64 size)
 793 {
 794         struct gfs2_inode *ip = GFS2_I(inode);
 795         struct gfs2_sbd *sdp = GFS2_SB(inode);
 796         const __be64 *first, *ptr, *end;
 797
 798         /*
 799          * For writes to stuffed files, this function is called twice via
 800          * __gfs2_iomap_get, before and after unstuffing. The size we return the
 801          * first time needs to be large enough to get the reservation and
 802          * allocation sizes right.  The size we return the second time must
 803          * be exact or else __gfs2_iomap_alloc won't do the right thing.
 804          */
 805
 806         if (gfs2_is_stuffed(ip) || mp->mp_fheight != mp->mp_aheight) {
 807                 unsigned int maxsize = mp->mp_fheight > 1 ?
 808                         sdp->sd_inptrs : sdp->sd_diptrs;
 809                 maxsize -= mp->mp_list[mp->mp_fheight - 1];
 810                 if (size > maxsize)
 811                         size = maxsize;
 812                 return size;
 813         }
 814
 815         first = metapointer(ip->i_height - 1, mp);
 816         end = metaend(ip->i_height - 1, mp);
 817         if (end - first > size)
 818                 end = first + size;
 819         for (ptr = first; ptr < end; ptr++) {
 820                 if (*ptr)
 821                         break;
 822         }
 823         return ptr - first;
 824 }
 825
 826 /**
 827  * __gfs2_iomap_get - Map blocks from an inode to disk blocks
 828  * @inode: The inode
 829  * @pos: Starting position in bytes
 830  * @length: Length to map, in bytes
 831  * @flags: iomap flags (IOMAP_WRITE, IOMAP_REPORT and IOMAP_DIRECT are examined)
 832  * @iomap: The iomap structure
 833  * @mp: The metapath
 834  *
      * Holds ip->i_rw_mutex for reading while the lookup runs.  The dinode buffer
      * is stored in @mp->mp_bh[0] and is not released here; the callers in this
      * file drop it with release_metapath().
      *
      * A stuffed inode is reported as IOMAP_INLINE, except for a write reaching
      * past gfs2_max_stuffed_size(), which is handled like a block-mapped lookup
      * and ends up on the do_alloc path.  Otherwise the metapath for @pos is
      * looked up and the result is either IOMAP_MAPPED (with
      * IOMAP_F_GFS2_BOUNDARY set when gfs2_extent_length() reports eob) or
      * IOMAP_HOLE.  For IOMAP_WRITE the hole length is trimmed to what
      * gfs2_alloc_size() returns; the allocation itself is left to the callers
      * (__gfs2_iomap_alloc()).
      *
 835  * Returns: 0 on success, -EINVAL if @length is zero, -ENOENT when an
      *          IOMAP_REPORT request at or beyond i_size finds nothing mapped, or
      *          an error from gfs2_meta_inode_buffer(), lookup_metapath() or
      *          gfs2_hole_size()
 836  */
 837 static int __gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
 838                             unsigned flags, struct iomap *iomap,
 839                             struct metapath *mp)
 840 {
 841         struct gfs2_inode *ip = GFS2_I(inode);
 842         struct gfs2_sbd *sdp = GFS2_SB(inode);
 843         loff_t size = i_size_read(inode);
 844         __be64 *ptr;
 845         sector_t lblock;
 846         sector_t lblock_stop;
 847         int ret;
 848         int eob;
 849         u64 len;
 850         struct buffer_head *dibh = NULL, *bh;
 851         u8 height;
 852
 853         if (!length)
 854                 return -EINVAL;
 855
 856         down_read(&ip->i_rw_mutex);
 857
 858         ret = gfs2_meta_inode_buffer(ip, &dibh);
 859         if (ret)
 860                 goto unlock;
             /* From here on the dinode buffer is referenced through the metapath. */
 861         mp->mp_bh[0] = dibh;
 862
 863         if (gfs2_is_stuffed(ip)) {
 864                 if (flags & IOMAP_WRITE) {
 865                         loff_t max_size = gfs2_max_stuffed_size(ip);
 866
                             /* The write does not fit in the dinode: take the block-mapped path. */
 867                         if (pos + length > max_size)
 868                                 goto unstuff;
 869                         iomap->length = max_size;
 870                 } else {
                             /* Reads at or past EOF: nothing to report, or a plain hole. */
 871                         if (pos >= size) {
 872                                 if (flags & IOMAP_REPORT) {
 873                                         ret = -ENOENT;
 874                                         goto unlock;
 875                                 } else {
 876                                         iomap->offset = pos;
 877                                         iomap->length = length;
 878                                         goto hole_found;
 879                                 }
 880                         }
 881                         iomap->length = size;
 882                 }
                     /* Inline data sits right behind the on-disk dinode header. */
 883                 iomap->addr = (ip->i_no_addr << inode->i_blkbits) +
 884                               sizeof(struct gfs2_dinode);
 885                 iomap->type = IOMAP_INLINE;
 886                 iomap->inline_data = dibh->b_data + sizeof(struct gfs2_dinode);
 887                 goto out;
 888         }
 889
 890 unstuff:
             /* Widen the byte range to whole blocks. */
 891         lblock = pos >> inode->i_blkbits;
 892         iomap->offset = lblock << inode->i_blkbits;
 893         lblock_stop = (pos + length - 1) >> inode->i_blkbits;
 894         len = lblock_stop - lblock + 1;
 895         iomap->length = len << inode->i_blkbits;
 896
             /* Raise height until sd_heightsize[height] covers lblock. */
 897         height = ip->i_height;
 898         while ((lblock + 1) * sdp->sd_sb.sb_bsize > sdp->sd_heightsize[height])
 899                 height++;
 900         find_metapath(sdp, lblock, mp, height);
             /* Tree too short for lblock, or still stuffed: nothing is mapped there. */
 901         if (height > ip->i_height || gfs2_is_stuffed(ip))
 902                 goto do_alloc;
 903
 904         ret = lookup_metapath(ip, mp);
 905         if (ret)
 906                 goto unlock;
 907
             /* lookup_metapath() did not reach the full height of the tree. */
 908         if (mp->mp_aheight != ip->i_height)
 909                 goto do_alloc;
 910
 911         ptr = metapointer(ip->i_height - 1, mp);
 912         if (*ptr == 0)
 913                 goto do_alloc;
 914
             /* Mapped: report the extent starting at *ptr, at most len blocks long. */
 915         bh = mp->mp_bh[ip->i_height - 1];
 916         len = gfs2_extent_length(bh, ptr, len, &eob);
 917
 918         iomap->addr = be64_to_cpu(*ptr) << inode->i_blkbits;
 919         iomap->length = len << inode->i_blkbits;
 920         iomap->type = IOMAP_MAPPED;
 921         iomap->flags |= IOMAP_F_MERGED;
 922         if (eob)
 923                 iomap->flags |= IOMAP_F_GFS2_BOUNDARY;
 924
 925 out:
 926         iomap->bdev = inode->i_sb->s_bdev;
 927 unlock:
 928         up_read(&ip->i_rw_mutex);
 929         return ret;
 930
     /* Not mapped.  Reached only by goto; every branch leaves again via "out". */
 931 do_alloc:
 932         if (flags & IOMAP_REPORT) {
 933                 if (pos >= size)
 934                         ret = -ENOENT;
 935                 else if (height == ip->i_height)
 936                         ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
 937                 else
 938                         iomap->length = size - iomap->offset;
 939         } else if (flags & IOMAP_WRITE) {
 940                 u64 alloc_size;
 941
                     /*
                      * This jump bypasses hole_found, so iomap->addr and
                      * iomap->type keep whatever values the caller passed in.
                      */
 942                 if (flags & IOMAP_DIRECT)
 943                         goto out;  /* (see gfs2_file_direct_write) */
 944
                     /* Never report a hole longer than one allocation may cover. */
 945                 len = gfs2_alloc_size(inode, mp, len);
 946                 alloc_size = len << inode->i_blkbits;
 947                 if (alloc_size < iomap->length)
 948                         iomap->length = alloc_size;
 949         } else {
 950                 if (pos < size && height == ip->i_height)
 951                         ret = gfs2_hole_size(inode, lblock, len, mp, iomap);
 952         }
 953 hole_found:
 954         iomap->addr = IOMAP_NULL_ADDR;
 955         iomap->type = IOMAP_HOLE;
 956         goto out;
 957 }
 958
 959 static int gfs2_iomap_page_prepare(struct inode *inode, loff_t pos,
 960                                    unsigned len)
 961 {
 962         unsigned int blockmask = i_blocksize(inode) - 1;
 963         struct gfs2_sbd *sdp = GFS2_SB(inode);
 964         unsigned int blocks;
 965
 966         blocks = ((pos & blockmask) + len + blockmask) >> inode->i_blkbits;
 967         return gfs2_trans_begin(sdp, RES_DINODE + blocks, 0);
 968 }
 969
 970 static void gfs2_iomap_page_done(struct inode *inode, loff_t pos,
 971                                  unsigned copied, struct page *page)
 972 {
 973         struct gfs2_trans *tr = current->journal_info;
 974         struct gfs2_inode *ip = GFS2_I(inode);
 975         struct gfs2_sbd *sdp = GFS2_SB(inode);
 976
 977         if (page && !gfs2_is_stuffed(ip))
 978                 gfs2_page_add_databufs(ip, page, offset_in_page(pos), copied);
 979
 980         if (tr->tr_num_buf_new)
 981                 __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
 982
 983         gfs2_trans_end(sdp);
 984 }
 985
 986 static const struct iomap_page_ops gfs2_iomap_page_ops = {
 987         .page_prepare = gfs2_iomap_page_prepare,
 988         .page_done = gfs2_iomap_page_done,
 989 };
 990
 991 static int gfs2_iomap_begin_write(struct inode *inode, loff_t pos,
 992                                   loff_t length, unsigned flags,
 993                                   struct iomap *iomap,
 994                                   struct metapath *mp)
 995 {
             /*
              * Prepare a write to (or zeroing of) the range described by @iomap,
              * which the caller has already filled in via __gfs2_iomap_get().
              *
              * If the inode has to be unstuffed or the range is a hole, this
              * takes the quota lock, makes a block reservation and then, inside a
              * transaction, unstuffs the dinode and/or allocates the blocks.  On
              * success the reservation and the quota lock are not dropped here;
              * gfs2_iomap_end() is where gfs2_inplace_release() and
              * gfs2_quota_unlock() are called.  On failure everything taken here
              * is undone before returning.
              *
              * Returns 0 or a negative errno.
              */
 996         struct gfs2_inode *ip = GFS2_I(inode);
 997         struct gfs2_sbd *sdp = GFS2_SB(inode);
 998         bool unstuff;
 999         int ret;
1000
             /* Stuffed inode whose data would no longer fit in the dinode. */
1001         unstuff = gfs2_is_stuffed(ip) &&
1002                   pos + length > gfs2_max_stuffed_size(ip);
1003
1004         if (unstuff || iomap->type == IOMAP_HOLE) {
1005                 unsigned int data_blocks, ind_blocks;
1006                 struct gfs2_alloc_parms ap = {};
1007                 unsigned int rblocks;
1008                 struct gfs2_trans *tr;
1009
1010                 gfs2_write_calc_reserv(ip, iomap->length, &data_blocks,
1011                                        &ind_blocks);
1012                 ap.target = data_blocks + ind_blocks;
1013                 ret = gfs2_quota_lock_check(ip, &ap);
1014                 if (ret)
1015                         return ret;
1016
1017                 ret = gfs2_inplace_reserve(ip, &ap);
1018                 if (ret)
1019                         goto out_qunlock;
1020
                     /*
                      * Size the transaction: the dinode and the indirect blocks,
                      * the data blocks as well for jdata, statfs and quota if
                      * anything gets allocated, extra statfs room for the rindex
                      * inode, plus whatever gfs2_rg_blocks() adds.
                      */
1021                 rblocks = RES_DINODE + ind_blocks;
1022                 if (gfs2_is_jdata(ip))
1023                         rblocks += data_blocks;
1024                 if (ind_blocks || data_blocks)
1025                         rblocks += RES_STATFS + RES_QUOTA;
1026                 if (inode == sdp->sd_rindex)
1027                         rblocks += 2 * RES_STATFS;
1028                 rblocks += gfs2_rg_blocks(ip, data_blocks + ind_blocks);
1029
1030                 ret = gfs2_trans_begin(sdp, rblocks,
1031                                        iomap->length >> inode->i_blkbits);
1032                 if (ret)
1033                         goto out_trans_fail;
1034
1035                 if (unstuff) {
1036                         ret = gfs2_unstuff_dinode(ip);
1037                         if (ret)
1038                                 goto out_trans_end;
                             /*
                              * The metapath was looked up while the inode was
                              * still stuffed; drop it and map the range again.
                              */
1039                         release_metapath(mp);
1040                         ret = __gfs2_iomap_get(inode, iomap->offset,
1041                                                iomap->length, flags, iomap, mp);
1042                         if (ret)
1043                                 goto out_trans_end;
1044                 }
1045
1046                 if (iomap->type == IOMAP_HOLE) {
1047                         ret = __gfs2_iomap_alloc(inode, iomap, mp);
1048                         if (ret) {
                                     /*
                                      * Unwound by hand rather than through the labels
                                      * below: punch_hole() over the range is called
                                      * only after the transaction has been ended and
                                      * the reservation released.
                                      */
1049                                 gfs2_trans_end(sdp);
1050                                 gfs2_inplace_release(ip);
1051                                 punch_hole(ip, iomap->offset, iomap->length);
1052                                 goto out_qunlock;
1053                         }
1054                 }
1055
1056                 tr = current->journal_info;
1057                 if (tr->tr_num_buf_new)
1058                         __mark_inode_dirty(inode, I_DIRTY_DATASYNC);
1059
1060                 gfs2_trans_end(sdp);
1061         }
1062
             /* Stuffed and jdata inodes use the page_prepare/page_done hooks. */
1063         if (gfs2_is_stuffed(ip) || gfs2_is_jdata(ip))
1064                 iomap->page_ops = &gfs2_iomap_page_ops;
1065         return 0;
1066
     /* Error unwinding, in reverse order of acquisition. */
1067 out_trans_end:
1068         gfs2_trans_end(sdp);
1069 out_trans_fail:
1070         gfs2_inplace_release(ip);
1071 out_qunlock:
1072         gfs2_quota_unlock(ip);
1073         return ret;
1074 }
1075
1076 static int gfs2_iomap_begin(struct inode *inode, loff_t pos, loff_t length,
1077                             unsigned flags, struct iomap *iomap,
1078                             struct iomap *srcmap)
1079 {
1080         struct gfs2_inode *ip = GFS2_I(inode);
1081         struct metapath mp = { .mp_aheight = 1, };
1082         int ret;
1083
1084         if (gfs2_is_jdata(ip))
1085                 iomap->flags |= IOMAP_F_BUFFER_HEAD;
1086
1087         trace_gfs2_iomap_start(ip, pos, length, flags);
1088         ret = __gfs2_iomap_get(inode, pos, length, flags, iomap, &mp);
1089         if (ret)
1090                 goto out_unlock;
1091
1092         switch(flags & (IOMAP_WRITE | IOMAP_ZERO)) {
1093         case IOMAP_WRITE:
1094                 if (flags & IOMAP_DIRECT) {
1095                         /*
1096                          * Silently fall back to buffered I/O for stuffed files
1097                          * or if we've got a hole (see gfs2_file_direct_write).
1098                          */
1099                         if (iomap->type != IOMAP_MAPPED)
1100                                 ret = -ENOTBLK;
1101                         goto out_unlock;
1102                 }
1103                 break;
1104         case IOMAP_ZERO:
1105                 if (iomap->type == IOMAP_HOLE)
1106                         goto out_unlock;
1107                 break;
1108         default:
1109                 goto out_unlock;
1110         }
1111
1112         ret = gfs2_iomap_begin_write(inode, pos, length, flags, iomap, &mp);
1113
1114 out_unlock:
1115         release_metapath(&mp);
1116         trace_gfs2_iomap_end(ip, iomap, ret);
1117         return ret;
1118 }
1119
1120 static int gfs2_iomap_end(struct inode *inode, loff_t pos, loff_t length,
1121                           ssize_t written, unsigned flags, struct iomap *iomap)
1122 {
1123         struct gfs2_inode *ip = GFS2_I(inode);
1124         struct gfs2_sbd *sdp = GFS2_SB(inode);
1125
1126         switch (flags & (IOMAP_WRITE | IOMAP_ZERO)) {
1127         case IOMAP_WRITE:
1128                 if (flags & IOMAP_DIRECT)
1129                         return 0;
1130                 break;
1131         case IOMAP_ZERO:
1132                  if (iomap->type == IOMAP_HOLE)
1133                          return 0;
1134                  break;
1135         default:
1136                  return 0;
1137         }
1138
1139         if (!gfs2_is_stuffed(ip))
1140                 gfs2_ordered_add_inode(ip);
1141
1142         if (inode == sdp->sd_rindex)
1143                 adjust_fs_space(inode);
1144
1145         gfs2_inplace_release(ip);
1146
1147         if (ip->i_qadata && ip->i_qadata->qa_qd_num)
1148                 gfs2_quota_unlock(ip);
1149
1150         if (length != written && (iomap->flags & IOMAP_F_NEW)) {
1151                 /* Deallocate blocks that were just allocated. */
1152                 loff_t hstart = round_up(pos + written, i_blocksize(inode));
1153                 loff_t hend = iomap->offset + iomap->length;
1154
1155                 if (hstart < hend) {
1156                         truncate_pagecache_range(inode, hstart, hend - 1);
1157                         punch_hole(ip, hstart, hend - hstart);
1158                 }
1159         }
1160
1161         if (unlikely(!written))
1162                 return 0;
1163
1164         if (iomap->flags & IOMAP_F_SIZE_CHANGED)
1165                 mark_inode_dirty(inode);
1166         set_bit(GLF_DIRTY, &ip->i_gl->gl_flags);
1167         return 0;
1168 }
1169
1170 const struct iomap_ops gfs2_iomap_ops = {
1171         .iomap_begin = gfs2_iomap_begin,
1172         .iomap_end = gfs2_iomap_end,
1173 };
1174
1175 /**
1176  * gfs2_block_map - Map one or more blocks of an inode to a disk block
1177  * @inode: The inode
1178  * @lblock: The logical block number
1179  * @bh_map: The bh to be mapped
1180  * @create: True if its ok to alloc blocks to satify the request
1181  *
1182  * The size of the requested mapping is defined in bh_map->b_size.
1183  *
1184  * Clears buffer_mapped(bh_map) and leaves bh_map->b_size unchanged
1185  * when @lblock is not mapped.  Sets buffer_mapped(bh_map) and
1186  * bh_map->b_size to indicate the size of the mapping when @lblock and
1187  * successive blocks are mapped, up to the requested size.
1188  *
1189  * Sets buffer_boundary() if a read of metadata will be required
1190  * before the next block can be mapped. Sets buffer_new() if new
1191  * blocks were allocated.
1192  *
1193  * Returns: errno
1194  */
1195
1196 int gfs2_block_map(struct inode *inode, sector_t lblock,
1197                    struct buffer_head *bh_map, int create)
1198 {
1199         struct gfs2_inode *ip = GFS2_I(inode);
1200         loff_t pos = (loff_t)lblock << inode->i_blkbits;
1201         loff_t length = bh_map->b_size;
1202         struct iomap iomap = { };
1203         int ret;
1204
1205         clear_buffer_mapped(bh_map);
1206         clear_buffer_new(bh_map);
1207         clear_buffer_boundary(bh_map);
1208         trace_gfs2_bmap(ip, bh_map, lblock, create, 1);
1209
1210         if (!create)
1211                 ret = gfs2_iomap_get(inode, pos, length, &iomap);
1212         else
1213                 ret = gfs2_iomap_alloc(inode, pos, length, &iomap);
1214         if (ret)
1215                 goto out;
1216
1217         if (iomap.length > bh_map->b_size) {
1218                 iomap.length = bh_map->b_size;
1219                 iomap.flags &= ~IOMAP_F_GFS2_BOUNDARY;
1220         }
1221         if (iomap.addr != IOMAP_NULL_ADDR)
1222                 map_bh(bh_map, inode->i_sb, iomap.addr >> inode->i_blkbits);
1223         bh_map->b_size = iomap.length;
1224         if (iomap.flags & IOMAP_F_GFS2_BOUNDARY)
1225                 set_buffer_boundary(bh_map);
1226         if (iomap.flags & IOMAP_F_NEW)
1227                 set_buffer_new(bh_map);
1228
1229 out:
1230         trace_gfs2_bmap(ip, bh_map, lblock, create, ret);
1231         return ret;
1232 }
1233
1234 int gfs2_get_extent(struct inode *inode, u64 lblock, u64 *dblock,
1235                     unsigned int *extlen)
1236 {
1237         unsigned int blkbits = inode->i_blkbits;
1238         struct iomap iomap = { };
1239         unsigned int len;
1240         int ret;
1241
1242         ret = gfs2_iomap_get(inode, lblock << blkbits, *extlen << blkbits,
1243                              &iomap);
1244         if (ret)
1245                 return ret;
1246         if (iomap.type != IOMAP_MAPPED)
1247                 return -EIO;
1248         *dblock = iomap.addr >> blkbits;
1249         len = iomap.length >> blkbits;
1250         if (len < *extlen)
1251                 *extlen = len;
1252         return 0;
1253 }
1254
1255 int gfs2_alloc_extent(struct inode *inode, u64 lblock, u64 *dblock,
1256                       unsigned int *extlen, bool *new)
1257 {
1258         unsigned int blkbits = inode->i_blkbits;
1259         struct iomap iomap = { };
1260         unsigned int len;
1261         int ret;
1262
1263         ret = gfs2_iomap_alloc(inode, lblock << blkbits, *extlen << blkbits,
1264                                &iomap);
1265         if (ret)
1266                 return ret;
1267         if (iomap.type != IOMAP_MAPPED)
1268                 return -EIO;
1269         *dblock = iomap.addr >> blkbits;
1270         len = iomap.length >> blkbits;
1271         if (len < *extlen)
1272                 *extlen = len;
1273         *new = iomap.flags & IOMAP_F_NEW;
1274         return 0;
1275 }
1276
1277 /*
1278  * NOTE: Never call gfs2_block_zero_range with an open transaction because it
1279  * uses iomap write to perform its actions, which begin their own transactions
1280  * (iomap_begin, page_prepare, etc.)
1281  */
1282 static int gfs2_block_zero_range(struct inode *inode, loff_t from,
1283                                  unsigned int length)
1284 {
1285         BUG_ON(current->journal_info);
1286         return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops);
1287 }
1288
1289 #define GFS2_JTRUNC_REVOKES 8192
1290
1291 /**
1292  * gfs2_journaled_truncate - Wrapper for truncate_pagecache for jdata files
1293  * @inode: The inode being truncated
1294  * @oldsize: The original (larger) size
1295  * @newsize: The new smaller size
1296  *
1297  * With jdata files, we have to journal a revoke for each block which is
1298  * truncated. As a result, we need to split this into separate transactions
1299  * if the number of pages being truncated gets too large.
1300  */
1301
1302 static int gfs2_journaled_truncate(struct inode *inode, u64 oldsize, u64 newsize)
1303 {
1304         struct gfs2_sbd *sdp = GFS2_SB(inode);
1305         u64 max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
1306         u64 chunk;
1307         int error;
1308
1309         while (oldsize != newsize) {
1310                 struct gfs2_trans *tr;
1311                 unsigned int offs;
1312
1313                 chunk = oldsize - newsize;
1314                 if (chunk > max_chunk)
1315                         chunk = max_chunk;
1316
1317                 offs = oldsize & ~PAGE_MASK;
1318                 if (offs && chunk > PAGE_SIZE)
1319                         chunk = offs + ((chunk - offs) & PAGE_MASK);
1320
1321                 truncate_pagecache(inode, oldsize - chunk);
1322                 oldsize -= chunk;
1323
1324                 tr = current->journal_info;
1325                 if (!test_bit(TR_TOUCHED, &tr->tr_flags))
1326                         continue;
1327
1328                 gfs2_trans_end(sdp);
1329                 error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
1330                 if (error)
1331                         return error;
1332         }
1333
1334         return 0;
1335 }
1336
1337 static int trunc_start(struct inode *inode, u64 newsize)
1338 {
1339         struct gfs2_inode *ip = GFS2_I(inode);
1340         struct gfs2_sbd *sdp = GFS2_SB(inode);
1341         struct buffer_head *dibh = NULL;
1342         int journaled = gfs2_is_jdata(ip);
1343         u64 oldsize = inode->i_size;
1344         int error;
1345
1346         if (!gfs2_is_stuffed(ip)) {
1347                 unsigned int blocksize = i_blocksize(inode);
1348                 unsigned int offs = newsize & (blocksize - 1);
1349                 if (offs) {
1350                         error = gfs2_block_zero_range(inode, newsize,
1351                                                       blocksize - offs);
1352                         if (error)
1353                                 return error;
1354                 }
1355         }
1356         if (journaled)
1357                 error = gfs2_trans_begin(sdp, RES_DINODE + RES_JDATA, GFS2_JTRUNC_REVOKES);
1358         else
1359                 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1360         if (error)
1361                 return error;
1362
1363         error = gfs2_meta_inode_buffer(ip, &dibh);
1364         if (error)
1365                 goto out;
1366
1367         gfs2_trans_add_meta(ip->i_gl, dibh);
1368
1369         if (gfs2_is_stuffed(ip))
1370                 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode) + newsize);
1371         else
1372                 ip->i_diskflags |= GFS2_DIF_TRUNC_IN_PROG;
1373
1374         i_size_write(inode, newsize);
1375         ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1376         gfs2_dinode_out(ip, dibh->b_data);
1377
1378         if (journaled)
1379                 error = gfs2_journaled_truncate(inode, oldsize, newsize);
1380         else
1381                 truncate_pagecache(inode, newsize);
1382
1383 out:
1384         brelse(dibh);
1385         if (current->journal_info)
1386                 gfs2_trans_end(sdp);
1387         return error;
1388 }
1389
1390 int gfs2_iomap_get(struct inode *inode, loff_t pos, loff_t length,
1391                    struct iomap *iomap)
1392 {
1393         struct metapath mp = { .mp_aheight = 1, };
1394         int ret;
1395
1396         ret = __gfs2_iomap_get(inode, pos, length, 0, iomap, &mp);
1397         release_metapath(&mp);
1398         return ret;
1399 }
1400
1401 int gfs2_iomap_alloc(struct inode *inode, loff_t pos, loff_t length,
1402                      struct iomap *iomap)
1403 {
1404         struct metapath mp = { .mp_aheight = 1, };
1405         int ret;
1406
1407         ret = __gfs2_iomap_get(inode, pos, length, IOMAP_WRITE, iomap, &mp);
1408         if (!ret && iomap->type == IOMAP_HOLE)
1409                 ret = __gfs2_iomap_alloc(inode, iomap, &mp);
1410         release_metapath(&mp);
1411         return ret;
1412 }
1413
1414 /**
1415  * sweep_bh_for_rgrps - find an rgrp in a meta buffer and free blocks therein
1416  * @ip: inode
1417  * @rd_gh: holder of resource group glock
1418  * @bh: buffer head to sweep
1419  * @start: starting point in bh
1420  * @end: end point in bh
1421  * @meta: true if bh points to metadata (rather than data)
1422  * @btotal: place to keep count of total blocks freed
1423  *
1424  * We sweep a metadata buffer (provided by the metapath) for blocks we need to
1425  * free, and free them all. However, we do it one rgrp at a time. If this
1426  * block has references to multiple rgrps, we break it into individual
1427  * transactions. This allows other processes to use the rgrps while we're
1428  * focused on a single one, for better concurrency / performance.
1429  * At every transaction boundary, we rewrite the inode into the journal.
1430  * That way the bitmaps are kept consistent with the inode and we can recover
1431  * if we're interrupted by power-outages.
1432  *
      * If @rd_gh is already initialized on entry, the rgrp it holds is the one
      * processed first; otherwise the rgrp of the first non-zero pointer is
      * locked here.  Each freed pointer in [@start, @end) is set to zero.
      *
      * NOTE(review): when this returns, a transaction may still be open with
      * ip->i_rw_mutex held for writing, and @rd_gh may still hold an rgrp glock;
      * they are only dropped here when another pass over the buffer is needed.
      * Presumably the caller cleans up - confirm against the call sites.
      *
1433  * Returns: 0, or return code if an error occurred.
1434  *          *btotal has the total number of blocks freed
1435  */
1436 static int sweep_bh_for_rgrps(struct gfs2_inode *ip, struct gfs2_holder *rd_gh,
1437                               struct buffer_head *bh, __be64 *start, __be64 *end,
1438                               bool meta, u32 *btotal)
1439 {
1440         struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1441         struct gfs2_rgrpd *rgd;
1442         struct gfs2_trans *tr;
1443         __be64 *p;
1444         int blks_outside_rgrp;
1445         u64 bn, bstart, isize_blks;
1446         s64 blen; /* needs to be s64 or gfs2_add_inode_blocks breaks */
1447         int ret = 0;
1448         bool buf_in_tr = false; /* buffer was added to transaction */
1449
     /* Start of one pass over the buffer; each pass handles a single rgrp. */
1450 more_rgrps:
1451         rgd = NULL;
1452         if (gfs2_holder_initialized(rd_gh)) {
1453                 rgd = gfs2_glock2rgrp(rd_gh->gh_gl);
1454                 gfs2_assert_withdraw(sdp,
1455                              gfs2_glock_is_locked_by_me(rd_gh->gh_gl));
1456         }
1457         blks_outside_rgrp = 0;
             /* bstart/blen describe the current run of contiguous blocks to free. */
1458         bstart = 0;
1459         blen = 0;
1460
1461         for (p = start; p < end; p++) {
1462                 if (!*p)
1463                         continue;
1464                 bn = be64_to_cpu(*p);
1465
1466                 if (rgd) {
                             /* Pointers into other rgrps are left for a later pass. */
1467                         if (!rgrp_contains_block(rgd, bn)) {
1468                                 blks_outside_rgrp++;
1469                                 continue;
1470                         }
1471                 } else {
                             /* No rgrp chosen yet: lock the one that contains bn. */
1472                         rgd = gfs2_blk2rgrpd(sdp, bn, true);
1473                         if (unlikely(!rgd)) {
1474                                 ret = -EIO;
1475                                 goto out;
1476                         }
1477                         ret = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE,
1478                                                  LM_FLAG_NODE_SCOPE, rd_gh);
1479                         if (ret)
1480                                 goto out;
1481
1482                         /* Must be done with the rgrp glock held: */
1483                         if (gfs2_rs_active(&ip->i_res) &&
1484                             rgd == ip->i_res.rs_rgd)
1485                                 gfs2_rs_deltree(&ip->i_res);
1486                 }
1487
1488                 /* The size of our transactions will be unknown until we
1489                    actually process all the metadata blocks that relate to
1490                    the rgrp. So we estimate. We know it can't be more than
1491                    the dinode's i_blocks and we don't want to exceed the
1492                    journal flush threshold, sd_log_thresh2. */
1493                 if (current->journal_info == NULL) {
1494                         unsigned int jblocks_rqsted, revokes;
1495
1496                         jblocks_rqsted = rgd->rd_length + RES_DINODE +
1497                                 RES_INDIRECT;
1498                         isize_blks = gfs2_get_inode_blocks(&ip->i_inode);
1499                         if (isize_blks > atomic_read(&sdp->sd_log_thresh2))
1500                                 jblocks_rqsted +=
1501                                         atomic_read(&sdp->sd_log_thresh2);
1502                         else
1503                                 jblocks_rqsted += isize_blks;
1504                         revokes = jblocks_rqsted;
1505                         if (meta)
1506                                 revokes += end - start;
1507                         else if (ip->i_depth)
1508                                 revokes += sdp->sd_inptrs;
1509                         ret = gfs2_trans_begin(sdp, jblocks_rqsted, revokes);
1510                         if (ret)
1511                                 goto out_unlock;
                             /* Held for writing for as long as the transaction is open. */
1512                         down_write(&ip->i_rw_mutex);
1513                 }
1514                 /* check if we will exceed the transaction blocks requested */
1515                 tr = current->journal_info;
1516                 if (tr->tr_num_buf_new + RES_STATFS +
1517                     RES_QUOTA >= atomic_read(&sdp->sd_log_thresh2)) {
1518                         /* We set blks_outside_rgrp to ensure the loop will
1519                            be repeated for the same rgrp, but with a new
1520                            transaction. */
1521                         blks_outside_rgrp++;
1522                         /* This next part is tricky. If the buffer was added
1523                            to the transaction, we've already set some block
1524                            pointers to 0, so we better follow through and free
1525                            them, or we will introduce corruption (so break).
1526                            This may be impossible, or at least rare, but I
1527                            decided to cover the case regardless.
1528
1529                            If the buffer was not added to the transaction
1530                            (this call), doing so would exceed our transaction
1531                            size, so we need to end the transaction and start a
1532                            new one (so goto). */
1533
1534                         if (buf_in_tr)
1535                                 break;
1536                         goto out_unlock;
1537                 }
1538
1539                 gfs2_trans_add_meta(ip->i_gl, bh);
1540                 buf_in_tr = true;
1541                 *p = 0;
                     /* bn directly follows the current run: just extend it. */
1542                 if (bstart + blen == bn) {
1543                         blen++;
1544                         continue;
1545                 }
                     /* Run broken: free the previous run, then start a new one at bn. */
1546                 if (bstart) {
1547                         __gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta);
1548                         (*btotal) += blen;
1549                         gfs2_add_inode_blocks(&ip->i_inode, -blen);
1550                 }
1551                 bstart = bn;
1552                 blen = 1;
1553         }
             /* Free the final run, if there is one. */
1554         if (bstart) {
1555                 __gfs2_free_blocks(ip, rgd, bstart, (u32)blen, meta);
1556                 (*btotal) += blen;
1557                 gfs2_add_inode_blocks(&ip->i_inode, -blen);
1558         }
1559 out_unlock:
1560         if (!ret && blks_outside_rgrp) { /* If buffer still has non-zero blocks
1561                                             outside the rgrp we just processed,
1562                                             do it all over again. */
1563                 if (current->journal_info) {
1564                         struct buffer_head *dibh;
1565
1566                         ret = gfs2_meta_inode_buffer(ip, &dibh);
1567                         if (ret)
1568                                 goto out;
1569
1570                         /* Every transaction boundary, we rewrite the dinode
1571                            to keep its di_blocks current in case of failure. */
1572                         ip->i_inode.i_mtime = ip->i_inode.i_ctime =
1573                                 current_time(&ip->i_inode);
1574                         gfs2_trans_add_meta(ip->i_gl, dibh);
1575                         gfs2_dinode_out(ip, dibh->b_data);
1576                         brelse(dibh);
1577                         up_write(&ip->i_rw_mutex);
1578                         gfs2_trans_end(sdp);
1579                         buf_in_tr = false;
1580                 }
                     /* Drop the rgrp glock so the next pass chooses its own rgrp. */
1581                 gfs2_glock_dq_uninit(rd_gh);
1582                 cond_resched();
1583                 goto more_rgrps;
1584         }
1585 out:
1586         return ret;
1587 }
1588
1589 static bool mp_eq_to_hgt(struct metapath *mp, __u16 *list, unsigned int h)
1590 {
1591         if (memcmp(mp->mp_list, list, h * sizeof(mp->mp_list[0])))
1592                 return false;
1593         return true;
1594 }
1595
1596 /**
1597  * find_nonnull_ptr - find a non-null pointer given a metapath and height
1598  * @sdp: The superblock
1599  * @mp: starting metapath
1600  * @h: desired height to search
1601  * @end_list: See punch_hole().
1602  * @end_aligned: See punch_hole().
1603  *
1604  * Assumes the metapath is valid (with buffers) out to height h.
1605  * Returns: true if a non-null pointer was found in the metapath buffer
1606  *          false if all remaining pointers are NULL in the buffer
1607  */
1608 static bool find_nonnull_ptr(struct gfs2_sbd *sdp, struct metapath *mp,
1609                              unsigned int h,
1610                              __u16 *end_list, unsigned int end_aligned)
1611 {
1612         struct buffer_head *bh = mp->mp_bh[h];
1613         __be64 *first, *ptr, *end;
1614
1615         first = metaptr1(h, mp);
1616         ptr = first + mp->mp_list[h];
1617         end = (__be64 *)(bh->b_data + bh->b_size);
1618         if (end_list && mp_eq_to_hgt(mp, end_list, h)) {
1619                 bool keep_end = h < end_aligned;
1620                 end = first + end_list[h] + keep_end;
1621         }
1622
1623         while (ptr < end) {
1624                 if (*ptr) { /* if we have a non-null pointer */
1625                         mp->mp_list[h] = ptr - first;
1626                         h++;
1627                         if (h < GFS2_MAX_META_HEIGHT)
1628                                 mp->mp_list[h] = 0;
1629                         return true;
1630                 }
1631                 ptr++;
1632         }
1633         return false;
1634 }
1635
/* States of the deallocation state machine driven by punch_hole(). */
enum dealloc_states {
        /* Strip a metapath with all buffers read in */
        DEALLOC_MP_FULL = 0,
        /* Lower the metapath strip height */
        DEALLOC_MP_LOWER = 1,
        /* Fill in the metapath to the given height */
        DEALLOC_FILL_MP = 2,
        /* Process complete */
        DEALLOC_DONE = 3,
};
1642
1643 static inline void
1644 metapointer_range(struct metapath *mp, int height,
1645                   __u16 *start_list, unsigned int start_aligned,
1646                   __u16 *end_list, unsigned int end_aligned,
1647                   __be64 **start, __be64 **end)
1648 {
1649         struct buffer_head *bh = mp->mp_bh[height];
1650         __be64 *first;
1651
1652         first = metaptr1(height, mp);
1653         *start = first;
1654         if (mp_eq_to_hgt(mp, start_list, height)) {
1655                 bool keep_start = height < start_aligned;
1656                 *start = first + start_list[height] + keep_start;
1657         }
1658         *end = (__be64 *)(bh->b_data + bh->b_size);
1659         if (end_list && mp_eq_to_hgt(mp, end_list, height)) {
1660                 bool keep_end = height < end_aligned;
1661                 *end = first + end_list[height] + keep_end;
1662         }
1663 }
1664
1665 static inline bool walk_done(struct gfs2_sbd *sdp,
1666                              struct metapath *mp, int height,
1667                              __u16 *end_list, unsigned int end_aligned)
1668 {
1669         __u16 end;
1670
1671         if (end_list) {
1672                 bool keep_end = height < end_aligned;
1673                 if (!mp_eq_to_hgt(mp, end_list, height))
1674                         return false;
1675                 end = end_list[height] + keep_end;
1676         } else
1677                 end = (height > 0) ? sdp->sd_inptrs : sdp->sd_diptrs;
1678         return mp->mp_list[height] >= end;
1679 }
1680
1681 /**
1682  * punch_hole - deallocate blocks in a file
1683  * @ip: inode to truncate
1684  * @offset: the start of the hole
1685  * @length: the size of the hole (or 0 for truncate)
1686  *
1687  * Punch a hole into a file or truncate a file at a given position.  This
1688  * function operates in whole blocks (@offset and @length are rounded
1689  * accordingly); partially filled blocks must be cleared otherwise.
1690  *
1691  * This function works from the bottom up, and from the right to the left. In
1692  * other words, it strips off the highest layer (data) before stripping any of
1693  * the metadata. Doing it this way is best in case the operation is interrupted
1694  * by power failure, etc.  The dinode is rewritten in every transaction to
1695  * guarantee integrity.
1696  */
1697 static int punch_hole(struct gfs2_inode *ip, u64 offset, u64 length)
1698 {
1699         struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1700         u64 maxsize = sdp->sd_heightsize[ip->i_height];
1701         struct metapath mp = {};
1702         struct buffer_head *dibh, *bh;
1703         struct gfs2_holder rd_gh;
1704         unsigned int bsize_shift = sdp->sd_sb.sb_bsize_shift;
1705         u64 lblock = (offset + (1 << bsize_shift) - 1) >> bsize_shift;
1706         __u16 start_list[GFS2_MAX_META_HEIGHT];
1707         __u16 __end_list[GFS2_MAX_META_HEIGHT], *end_list = NULL;
1708         unsigned int start_aligned, end_aligned;
1709         unsigned int strip_h = ip->i_height - 1;
1710         u32 btotal = 0;
1711         int ret, state;
1712         int mp_h; /* metapath buffers are read in to this height */
1713         u64 prev_bnr = 0;
1714         __be64 *start, *end;
1715
1716         if (offset >= maxsize) {
1717                 /*
1718                  * The starting point lies beyond the allocated meta-data;
1719                  * there are no blocks do deallocate.
1720                  */
1721                 return 0;
1722         }
1723
1724         /*
1725          * The start position of the hole is defined by lblock, start_list, and
1726          * start_aligned.  The end position of the hole is defined by lend,
1727          * end_list, and end_aligned.
1728          *
1729          * start_aligned and end_aligned define down to which height the start
1730          * and end positions are aligned to the metadata tree (i.e., the
1731          * position is a multiple of the metadata granularity at the height
1732          * above).  This determines at which heights additional meta pointers
1733          * needs to be preserved for the remaining data.
1734          */
1735
1736         if (length) {
1737                 u64 end_offset = offset + length;
1738                 u64 lend;
1739
1740                 /*
1741                  * Clip the end at the maximum file size for the given height:
1742                  * that's how far the metadata goes; files bigger than that
1743                  * will have additional layers of indirection.
1744                  */
1745                 if (end_offset > maxsize)
1746                         end_offset = maxsize;
1747                 lend = end_offset >> bsize_shift;
1748
1749                 if (lblock >= lend)
1750                         return 0;
1751
1752                 find_metapath(sdp, lend, &mp, ip->i_height);
1753                 end_list = __end_list;
1754                 memcpy(end_list, mp.mp_list, sizeof(mp.mp_list));
1755
1756                 for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) {
1757                         if (end_list[mp_h])
1758                                 break;
1759                 }
1760                 end_aligned = mp_h;
1761         }
1762
1763         find_metapath(sdp, lblock, &mp, ip->i_height);
1764         memcpy(start_list, mp.mp_list, sizeof(start_list));
1765
1766         for (mp_h = ip->i_height - 1; mp_h > 0; mp_h--) {
1767                 if (start_list[mp_h])
1768                         break;
1769         }
1770         start_aligned = mp_h;
1771
1772         ret = gfs2_meta_inode_buffer(ip, &dibh);
1773         if (ret)
1774                 return ret;
1775
1776         mp.mp_bh[0] = dibh;
1777         ret = lookup_metapath(ip, &mp);
1778         if (ret)
1779                 goto out_metapath;
1780
1781         /* issue read-ahead on metadata */
1782         for (mp_h = 0; mp_h < mp.mp_aheight - 1; mp_h++) {
1783                 metapointer_range(&mp, mp_h, start_list, start_aligned,
1784                                   end_list, end_aligned, &start, &end);
1785                 gfs2_metapath_ra(ip->i_gl, start, end);
1786         }
1787
1788         if (mp.mp_aheight == ip->i_height)
1789                 state = DEALLOC_MP_FULL; /* We have a complete metapath */
1790         else
1791                 state = DEALLOC_FILL_MP; /* deal with partial metapath */
1792
1793         ret = gfs2_rindex_update(sdp);
1794         if (ret)
1795                 goto out_metapath;
1796
1797         ret = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE);
1798         if (ret)
1799                 goto out_metapath;
1800         gfs2_holder_mark_uninitialized(&rd_gh);
1801
1802         mp_h = strip_h;
1803
1804         while (state != DEALLOC_DONE) {
1805                 switch (state) {
1806                 /* Truncate a full metapath at the given strip height.
1807                  * Note that strip_h == mp_h in order to be in this state. */
1808                 case DEALLOC_MP_FULL:
1809                         bh = mp.mp_bh[mp_h];
1810                         gfs2_assert_withdraw(sdp, bh);
1811                         if (gfs2_assert_withdraw(sdp,
1812                                                  prev_bnr != bh->b_blocknr)) {
1813                                 fs_emerg(sdp, "inode %llu, block:%llu, i_h:%u,"
1814                                          "s_h:%u, mp_h:%u\n",
1815                                        (unsigned long long)ip->i_no_addr,
1816                                        prev_bnr, ip->i_height, strip_h, mp_h);
1817                         }
1818                         prev_bnr = bh->b_blocknr;
1819
1820                         if (gfs2_metatype_check(sdp, bh,
1821                                                 (mp_h ? GFS2_METATYPE_IN :
1822                                                         GFS2_METATYPE_DI))) {
1823                                 ret = -EIO;
1824                                 goto out;
1825                         }
1826
1827                         /*
1828                          * Below, passing end_aligned as 0 gives us the
1829                          * metapointer range excluding the end point: the end
1830                          * point is the first metapath we must not deallocate!
1831                          */
1832
1833                         metapointer_range(&mp, mp_h, start_list, start_aligned,
1834                                           end_list, 0 /* end_aligned */,
1835                                           &start, &end);
1836                         ret = sweep_bh_for_rgrps(ip, &rd_gh, mp.mp_bh[mp_h],
1837                                                  start, end,
1838                                                  mp_h != ip->i_height - 1,
1839                                                  &btotal);
1840
1841                         /* If we hit an error or just swept dinode buffer,
1842                            just exit. */
1843                         if (ret || !mp_h) {
1844                                 state = DEALLOC_DONE;
1845                                 break;
1846                         }
1847                         state = DEALLOC_MP_LOWER;
1848                         break;
1849
1850                 /* lower the metapath strip height */
1851                 case DEALLOC_MP_LOWER:
1852                         /* We're done with the current buffer, so release it,
1853                            unless it's the dinode buffer. Then back up to the
1854                            previous pointer. */
1855                         if (mp_h) {
1856                                 brelse(mp.mp_bh[mp_h]);
1857                                 mp.mp_bh[mp_h] = NULL;
1858                         }
1859                         /* If we can't get any lower in height, we've stripped
1860                            off all we can. Next step is to back up and start
1861                            stripping the previous level of metadata. */
1862                         if (mp_h == 0) {
1863                                 strip_h--;
1864                                 memcpy(mp.mp_list, start_list, sizeof(start_list));
1865                                 mp_h = strip_h;
1866                                 state = DEALLOC_FILL_MP;
1867                                 break;
1868                         }
1869                         mp.mp_list[mp_h] = 0;
1870                         mp_h--; /* search one metadata height down */
1871                         mp.mp_list[mp_h]++;
1872                         if (walk_done(sdp, &mp, mp_h, end_list, end_aligned))
1873                                 break;
1874                         /* Here we've found a part of the metapath that is not
1875                          * allocated. We need to search at that height for the
1876                          * next non-null pointer. */
1877                         if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned)) {
1878                                 state = DEALLOC_FILL_MP;
1879                                 mp_h++;
1880                         }
1881                         /* No more non-null pointers at this height. Back up
1882                            to the previous height and try again. */
1883                         break; /* loop around in the same state */
1884
1885                 /* Fill the metapath with buffers to the given height. */
1886                 case DEALLOC_FILL_MP:
1887                         /* Fill the buffers out to the current height. */
1888                         ret = fillup_metapath(ip, &mp, mp_h);
1889                         if (ret < 0)
1890                                 goto out;
1891
1892                         /* On the first pass, issue read-ahead on metadata. */
1893                         if (mp.mp_aheight > 1 && strip_h == ip->i_height - 1) {
1894                                 unsigned int height = mp.mp_aheight - 1;
1895
1896                                 /* No read-ahead for data blocks. */
1897                                 if (mp.mp_aheight - 1 == strip_h)
1898                                         height--;
1899
1900                                 for (; height >= mp.mp_aheight - ret; height--) {
1901                                         metapointer_range(&mp, height,
1902                                                           start_list, start_aligned,
1903                                                           end_list, end_aligned,
1904                                                           &start, &end);
1905                                         gfs2_metapath_ra(ip->i_gl, start, end);
1906                                 }
1907                         }
1908
1909                         /* If buffers found for the entire strip height */
1910                         if (mp.mp_aheight - 1 == strip_h) {
1911                                 state = DEALLOC_MP_FULL;
1912                                 break;
1913                         }
1914                         if (mp.mp_aheight < ip->i_height) /* We have a partial height */
1915                                 mp_h = mp.mp_aheight - 1;
1916
1917                         /* If we find a non-null block pointer, crawl a bit
1918                            higher up in the metapath and try again, otherwise
1919                            we need to look lower for a new starting point. */
1920                         if (find_nonnull_ptr(sdp, &mp, mp_h, end_list, end_aligned))
1921                                 mp_h++;
1922                         else
1923                                 state = DEALLOC_MP_LOWER;
1924                         break;
1925                 }
1926         }
1927
1928         if (btotal) {
1929                 if (current->journal_info == NULL) {
1930                         ret = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS +
1931                                                RES_QUOTA, 0);
1932                         if (ret)
1933                                 goto out;
1934                         down_write(&ip->i_rw_mutex);
1935                 }
1936                 gfs2_statfs_change(sdp, 0, +btotal, 0);
1937                 gfs2_quota_change(ip, -(s64)btotal, ip->i_inode.i_uid,
1938                                   ip->i_inode.i_gid);
1939                 ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1940                 gfs2_trans_add_meta(ip->i_gl, dibh);
1941                 gfs2_dinode_out(ip, dibh->b_data);
1942                 up_write(&ip->i_rw_mutex);
1943                 gfs2_trans_end(sdp);
1944         }
1945
1946 out:
1947         if (gfs2_holder_initialized(&rd_gh))
1948                 gfs2_glock_dq_uninit(&rd_gh);
1949         if (current->journal_info) {
1950                 up_write(&ip->i_rw_mutex);
1951                 gfs2_trans_end(sdp);
1952                 cond_resched();
1953         }
1954         gfs2_quota_unhold(ip);
1955 out_metapath:
1956         release_metapath(&mp);
1957         return ret;
1958 }
1959
1960 static int trunc_end(struct gfs2_inode *ip)
1961 {
1962         struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
1963         struct buffer_head *dibh;
1964         int error;
1965
1966         error = gfs2_trans_begin(sdp, RES_DINODE, 0);
1967         if (error)
1968                 return error;
1969
1970         down_write(&ip->i_rw_mutex);
1971
1972         error = gfs2_meta_inode_buffer(ip, &dibh);
1973         if (error)
1974                 goto out;
1975
1976         if (!i_size_read(&ip->i_inode)) {
1977                 ip->i_height = 0;
1978                 ip->i_goal = ip->i_no_addr;
1979                 gfs2_buffer_clear_tail(dibh, sizeof(struct gfs2_dinode));
1980                 gfs2_ordered_del_inode(ip);
1981         }
1982         ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
1983         ip->i_diskflags &= ~GFS2_DIF_TRUNC_IN_PROG;
1984
1985         gfs2_trans_add_meta(ip->i_gl, dibh);
1986         gfs2_dinode_out(ip, dibh->b_data);
1987         brelse(dibh);
1988
1989 out:
1990         up_write(&ip->i_rw_mutex);
1991         gfs2_trans_end(sdp);
1992         return error;
1993 }
1994
1995 /**
1996  * do_shrink - make a file smaller
1997  * @inode: the inode
1998  * @newsize: the size to make the file
1999  *
2000  * Called with an exclusive lock on @inode. The @size must
2001  * be equal to or smaller than the current inode size.
2002  *
2003  * Returns: errno
2004  */
2005
2006 static int do_shrink(struct inode *inode, u64 newsize)
2007 {
2008         struct gfs2_inode *ip = GFS2_I(inode);
2009         int error;
2010
2011         error = trunc_start(inode, newsize);
2012         if (error < 0)
2013                 return error;
2014         if (gfs2_is_stuffed(ip))
2015                 return 0;
2016
2017         error = punch_hole(ip, newsize, 0);
2018         if (error == 0)
2019                 error = trunc_end(ip);
2020
2021         return error;
2022 }
2023
2024 void gfs2_trim_blocks(struct inode *inode)
2025 {
2026         int ret;
2027
2028         ret = do_shrink(inode, inode->i_size);
2029         WARN_ON(ret != 0);
2030 }
2031
2032 /**
2033  * do_grow - Touch and update inode size
2034  * @inode: The inode
2035  * @size: The new size
2036  *
2037  * This function updates the timestamps on the inode and
2038  * may also increase the size of the inode. This function
2039  * must not be called with @size any smaller than the current
2040  * inode size.
2041  *
2042  * Although it is not strictly required to unstuff files here,
2043  * earlier versions of GFS2 have a bug in the stuffed file reading
2044  * code which will result in a buffer overrun if the size is larger
2045  * than the max stuffed file size. In order to prevent this from
2046  * occurring, such files are unstuffed, but in other cases we can
2047  * just update the inode size directly.
2048  *
2049  * Returns: 0 on success, or -ve on error
2050  */
2051
2052 static int do_grow(struct inode *inode, u64 size)
2053 {
2054         struct gfs2_inode *ip = GFS2_I(inode);
2055         struct gfs2_sbd *sdp = GFS2_SB(inode);
2056         struct gfs2_alloc_parms ap = { .target = 1, };
2057         struct buffer_head *dibh;
2058         int error;
2059         int unstuff = 0;
2060
2061         if (gfs2_is_stuffed(ip) && size > gfs2_max_stuffed_size(ip)) {
2062                 error = gfs2_quota_lock_check(ip, &ap);
2063                 if (error)
2064                         return error;
2065
2066                 error = gfs2_inplace_reserve(ip, &ap);
2067                 if (error)
2068                         goto do_grow_qunlock;
2069                 unstuff = 1;
2070         }
2071
2072         error = gfs2_trans_begin(sdp, RES_DINODE + RES_STATFS + RES_RG_BIT +
2073                                  (unstuff &&
2074                                   gfs2_is_jdata(ip) ? RES_JDATA : 0) +
2075                                  (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF ?
2076                                   0 : RES_QUOTA), 0);
2077         if (error)
2078                 goto do_grow_release;
2079
2080         if (unstuff) {
2081                 error = gfs2_unstuff_dinode(ip);
2082                 if (error)
2083                         goto do_end_trans;
2084         }
2085
2086         error = gfs2_meta_inode_buffer(ip, &dibh);
2087         if (error)
2088                 goto do_end_trans;
2089
2090         truncate_setsize(inode, size);
2091         ip->i_inode.i_mtime = ip->i_inode.i_ctime = current_time(&ip->i_inode);
2092         gfs2_trans_add_meta(ip->i_gl, dibh);
2093         gfs2_dinode_out(ip, dibh->b_data);
2094         brelse(dibh);
2095
2096 do_end_trans:
2097         gfs2_trans_end(sdp);
2098 do_grow_release:
2099         if (unstuff) {
2100                 gfs2_inplace_release(ip);
2101 do_grow_qunlock:
2102                 gfs2_quota_unlock(ip);
2103         }
2104         return error;
2105 }
2106
2107 /**
2108  * gfs2_setattr_size - make a file a given size
2109  * @inode: the inode
2110  * @newsize: the size to make the file
2111  *
2112  * The file size can grow, shrink, or stay the same size. This
2113  * is called holding i_rwsem and an exclusive glock on the inode
2114  * in question.
2115  *
2116  * Returns: errno
2117  */
2118
2119 int gfs2_setattr_size(struct inode *inode, u64 newsize)
2120 {
2121         struct gfs2_inode *ip = GFS2_I(inode);
2122         int ret;
2123
2124         BUG_ON(!S_ISREG(inode->i_mode));
2125
2126         ret = inode_newsize_ok(inode, newsize);
2127         if (ret)
2128                 return ret;
2129
2130         inode_dio_wait(inode);
2131
2132         ret = gfs2_qa_get(ip);
2133         if (ret)
2134                 goto out;
2135
2136         if (newsize >= inode->i_size) {
2137                 ret = do_grow(inode, newsize);
2138                 goto out;
2139         }
2140
2141         ret = do_shrink(inode, newsize);
2142 out:
2143         gfs2_rs_delete(ip);
2144         gfs2_qa_put(ip);
2145         return ret;
2146 }
2147
2148 int gfs2_truncatei_resume(struct gfs2_inode *ip)
2149 {
2150         int error;
2151         error = punch_hole(ip, i_size_read(&ip->i_inode), 0);
2152         if (!error)
2153                 error = trunc_end(ip);
2154         return error;
2155 }
2156
2157 int gfs2_file_dealloc(struct gfs2_inode *ip)
2158 {
2159         return punch_hole(ip, 0, 0);
2160 }
2161
2162 /**
2163  * gfs2_free_journal_extents - Free cached journal bmap info
2164  * @jd: The journal
2165  *
2166  */
2167
2168 void gfs2_free_journal_extents(struct gfs2_jdesc *jd)
2169 {
2170         struct gfs2_journal_extent *jext;
2171
2172         while(!list_empty(&jd->extent_list)) {
2173                 jext = list_first_entry(&jd->extent_list, struct gfs2_journal_extent, list);
2174                 list_del(&jext->list);
2175                 kfree(jext);
2176         }
2177 }
2178
2179 /**
2180  * gfs2_add_jextent - Add or merge a new extent to extent cache
2181  * @jd: The journal descriptor
2182  * @lblock: The logical block at start of new extent
2183  * @dblock: The physical block at start of new extent
2184  * @blocks: Size of extent in fs blocks
2185  *
2186  * Returns: 0 on success or -ENOMEM
2187  */
2188
2189 static int gfs2_add_jextent(struct gfs2_jdesc *jd, u64 lblock, u64 dblock, u64 blocks)
2190 {
2191         struct gfs2_journal_extent *jext;
2192
2193         if (!list_empty(&jd->extent_list)) {
2194                 jext = list_last_entry(&jd->extent_list, struct gfs2_journal_extent, list);
2195                 if ((jext->dblock + jext->blocks) == dblock) {
2196                         jext->blocks += blocks;
2197                         return 0;
2198                 }
2199         }
2200
2201         jext = kzalloc(sizeof(struct gfs2_journal_extent), GFP_NOFS);
2202         if (jext == NULL)
2203                 return -ENOMEM;
2204         jext->dblock = dblock;
2205         jext->lblock = lblock;
2206         jext->blocks = blocks;
2207         list_add_tail(&jext->list, &jd->extent_list);
2208         jd->nr_extents++;
2209         return 0;
2210 }
2211
2212 /**
2213  * gfs2_map_journal_extents - Cache journal bmap info
2214  * @sdp: The super block
2215  * @jd: The journal to map
2216  *
2217  * Create a reusable "extent" mapping from all logical
2218  * blocks to all physical blocks for the given journal.  This will save
2219  * us time when writing journal blocks.  Most journals will have only one
2220  * extent that maps all their logical blocks.  That's because gfs2.mkfs
2221  * arranges the journal blocks sequentially to maximize performance.
2222  * So the extent would map the first block for the entire file length.
2223  * However, gfs2_jadd can happen while file activity is happening, so
2224  * those journals may not be sequential.  Less likely is the case where
2225  * the users created their own journals by mounting the metafs and
2226  * laying it out.  But it's still possible.  These journals might have
2227  * several extents.
2228  *
2229  * Returns: 0 on success, or error on failure
2230  */
2231
2232 int gfs2_map_journal_extents(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd)
2233 {
2234         u64 lblock = 0;
2235         u64 lblock_stop;
2236         struct gfs2_inode *ip = GFS2_I(jd->jd_inode);
2237         struct buffer_head bh;
2238         unsigned int shift = sdp->sd_sb.sb_bsize_shift;
2239         u64 size;
2240         int rc;
2241         ktime_t start, end;
2242
2243         start = ktime_get();
2244         lblock_stop = i_size_read(jd->jd_inode) >> shift;
2245         size = (lblock_stop - lblock) << shift;
2246         jd->nr_extents = 0;
2247         WARN_ON(!list_empty(&jd->extent_list));
2248
2249         do {
2250                 bh.b_state = 0;
2251                 bh.b_blocknr = 0;
2252                 bh.b_size = size;
2253                 rc = gfs2_block_map(jd->jd_inode, lblock, &bh, 0);
2254                 if (rc || !buffer_mapped(&bh))
2255                         goto fail;
2256                 rc = gfs2_add_jextent(jd, lblock, bh.b_blocknr, bh.b_size >> shift);
2257                 if (rc)
2258                         goto fail;
2259                 size -= bh.b_size;
2260                 lblock += (bh.b_size >> ip->i_inode.i_blkbits);
2261         } while(size > 0);
2262
2263         end = ktime_get();
2264         fs_info(sdp, "journal %d mapped with %u extents in %lldms\n", jd->jd_jid,
2265                 jd->nr_extents, ktime_ms_delta(end, start));
2266         return 0;
2267
2268 fail:
2269         fs_warn(sdp, "error %d mapping journal %u at offset %llu (extent %u)\n",
2270                 rc, jd->jd_jid,
2271                 (unsigned long long)(i_size_read(jd->jd_inode) - size),
2272                 jd->nr_extents);
2273         fs_warn(sdp, "bmap=%d lblock=%llu block=%llu, state=0x%08lx, size=%llu\n",
2274                 rc, (unsigned long long)lblock, (unsigned long long)bh.b_blocknr,
2275                 bh.b_state, (unsigned long long)bh.b_size);
2276         gfs2_free_journal_extents(jd);
2277         return rc;
2278 }
2279
2280 /**
2281  * gfs2_write_alloc_required - figure out if a write will require an allocation
2282  * @ip: the file being written to
2283  * @offset: the offset to write to
2284  * @len: the number of bytes being written
2285  *
2286  * Returns: 1 if an alloc is required, 0 otherwise
2287  */
2288
2289 int gfs2_write_alloc_required(struct gfs2_inode *ip, u64 offset,
2290                               unsigned int len)
2291 {
2292         struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
2293         struct buffer_head bh;
2294         unsigned int shift;
2295         u64 lblock, lblock_stop, size;
2296         u64 end_of_file;
2297
2298         if (!len)
2299                 return 0;
2300
2301         if (gfs2_is_stuffed(ip)) {
2302                 if (offset + len > gfs2_max_stuffed_size(ip))
2303                         return 1;
2304                 return 0;
2305         }
2306
2307         shift = sdp->sd_sb.sb_bsize_shift;
2308         BUG_ON(gfs2_is_dir(ip));
2309         end_of_file = (i_size_read(&ip->i_inode) + sdp->sd_sb.sb_bsize - 1) >> shift;
2310         lblock = offset >> shift;
2311         lblock_stop = (offset + len + sdp->sd_sb.sb_bsize - 1) >> shift;
2312         if (lblock_stop > end_of_file && ip != GFS2_I(sdp->sd_rindex))
2313                 return 1;
2314
2315         size = (lblock_stop - lblock) << shift;
2316         do {
2317                 bh.b_state = 0;
2318                 bh.b_size = size;
2319                 gfs2_block_map(&ip->i_inode, lblock, &bh, 0);
2320                 if (!buffer_mapped(&bh))
2321                         return 1;
2322                 size -= bh.b_size;
2323                 lblock += (bh.b_size >> ip->i_inode.i_blkbits);
2324         } while(size > 0);
2325
2326         return 0;
2327 }
2328
2329 static int stuffed_zero_range(struct inode *inode, loff_t offset, loff_t length)
2330 {
2331         struct gfs2_inode *ip = GFS2_I(inode);
2332         struct buffer_head *dibh;
2333         int error;
2334
2335         if (offset >= inode->i_size)
2336                 return 0;
2337         if (offset + length > inode->i_size)
2338                 length = inode->i_size - offset;
2339
2340         error = gfs2_meta_inode_buffer(ip, &dibh);
2341         if (error)
2342                 return error;
2343         gfs2_trans_add_meta(ip->i_gl, dibh);
2344         memset(dibh->b_data + sizeof(struct gfs2_dinode) + offset, 0,
2345                length);
2346         brelse(dibh);
2347         return 0;
2348 }
2349
2350 static int gfs2_journaled_truncate_range(struct inode *inode, loff_t offset,
2351                                          loff_t length)
2352 {
2353         struct gfs2_sbd *sdp = GFS2_SB(inode);
2354         loff_t max_chunk = GFS2_JTRUNC_REVOKES * sdp->sd_vfs->s_blocksize;
2355         int error;
2356
2357         while (length) {
2358                 struct gfs2_trans *tr;
2359                 loff_t chunk;
2360                 unsigned int offs;
2361
2362                 chunk = length;
2363                 if (chunk > max_chunk)
2364                         chunk = max_chunk;
2365
2366                 offs = offset & ~PAGE_MASK;
2367                 if (offs && chunk > PAGE_SIZE)
2368                         chunk = offs + ((chunk - offs) & PAGE_MASK);
2369
2370                 truncate_pagecache_range(inode, offset, chunk);
2371                 offset += chunk;
2372                 length -= chunk;
2373
2374                 tr = current->journal_info;
2375                 if (!test_bit(TR_TOUCHED, &tr->tr_flags))
2376                         continue;
2377
2378                 gfs2_trans_end(sdp);
2379                 error = gfs2_trans_begin(sdp, RES_DINODE, GFS2_JTRUNC_REVOKES);
2380                 if (error)
2381                         return error;
2382         }
2383         return 0;
2384 }
2385
2386 int __gfs2_punch_hole(struct file *file, loff_t offset, loff_t length)
2387 {
2388         struct inode *inode = file_inode(file);
2389         struct gfs2_inode *ip = GFS2_I(inode);
2390         struct gfs2_sbd *sdp = GFS2_SB(inode);
2391         unsigned int blocksize = i_blocksize(inode);
2392         loff_t start, end;
2393         int error;
2394
2395         if (!gfs2_is_stuffed(ip)) {
2396                 unsigned int start_off, end_len;
2397
2398                 start_off = offset & (blocksize - 1);
2399                 end_len = (offset + length) & (blocksize - 1);
2400                 if (start_off) {
2401                         unsigned int len = length;
2402                         if (length > blocksize - start_off)
2403                                 len = blocksize - start_off;
2404                         error = gfs2_block_zero_range(inode, offset, len);
2405                         if (error)
2406                                 goto out;
2407                         if (start_off + length < blocksize)
2408                                 end_len = 0;
2409                 }
2410                 if (end_len) {
2411                         error = gfs2_block_zero_range(inode,
2412                                 offset + length - end_len, end_len);
2413                         if (error)
2414                                 goto out;
2415                 }
2416         }
2417
2418         start = round_down(offset, blocksize);
2419         end = round_up(offset + length, blocksize) - 1;
2420         error = filemap_write_and_wait_range(inode->i_mapping, start, end);
2421         if (error)
2422                 return error;
2423
2424         if (gfs2_is_jdata(ip))
2425                 error = gfs2_trans_begin(sdp, RES_DINODE + 2 * RES_JDATA,
2426                                          GFS2_JTRUNC_REVOKES);
2427         else
2428                 error = gfs2_trans_begin(sdp, RES_DINODE, 0);
2429         if (error)
2430                 return error;
2431
2432         if (gfs2_is_stuffed(ip)) {
2433                 error = stuffed_zero_range(inode, offset, length);
2434                 if (error)
2435                         goto out;
2436         }
2437
2438         if (gfs2_is_jdata(ip)) {
2439                 BUG_ON(!current->journal_info);
2440                 gfs2_journaled_truncate_range(inode, offset, length);
2441         } else
2442                 truncate_pagecache_range(inode, offset, offset + length - 1);
2443
2444         file_update_time(file);
2445         mark_inode_dirty(inode);
2446
2447         if (current->journal_info)
2448                 gfs2_trans_end(sdp);
2449
2450         if (!gfs2_is_stuffed(ip))
2451                 error = punch_hole(ip, offset, length);
2452
2453 out:
2454         if (current->journal_info)
2455                 gfs2_trans_end(sdp);
2456         return error;
2457 }
2458
2459 static int gfs2_map_blocks(struct iomap_writepage_ctx *wpc, struct inode *inode,
2460                 loff_t offset)
2461 {
2462         int ret;
2463
2464         if (WARN_ON_ONCE(gfs2_is_stuffed(GFS2_I(inode))))
2465                 return -EIO;
2466
2467         if (offset >= wpc->iomap.offset &&
2468             offset < wpc->iomap.offset + wpc->iomap.length)
2469                 return 0;
2470
2471         memset(&wpc->iomap, 0, sizeof(wpc->iomap));
2472         ret = gfs2_iomap_get(inode, offset, INT_MAX, &wpc->iomap);
2473         return ret;
2474 }
2475
/*
 * iomap writeback operations for GFS2: block mapping lookups are handled
 * by gfs2_map_blocks().
 */
const struct iomap_writeback_ops gfs2_writeback_ops = {
        .map_blocks             = gfs2_map_blocks,
};