fs/xfs/xfs_aops.c

   1 /*
   2  * Copyright (c) 2000-2005 Silicon Graphics, Inc.
   3  * All Rights Reserved.
   4  *
   5  * This program is free software; you can redistribute it and/or
   6  * modify it under the terms of the GNU General Public License as
   7  * published by the Free Software Foundation.
   8  *
   9  * This program is distributed in the hope that it would be useful,
  10  * but WITHOUT ANY WARRANTY; without even the implied warranty of
  11  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  12  * GNU General Public License for more details.
  13  *
  14  * You should have received a copy of the GNU General Public License
  15  * along with this program; if not, write the Free Software Foundation,
  16  * Inc.,  51 Franklin St, Fifth Floor, Boston, MA  02110-1301  USA
  17  */
  18 #include "xfs.h"
  19 #include "xfs_shared.h"
  20 #include "xfs_format.h"
  21 #include "xfs_log_format.h"
  22 #include "xfs_trans_resv.h"
  23 #include "xfs_mount.h"
  24 #include "xfs_inode.h"
  25 #include "xfs_trans.h"
  26 #include "xfs_inode_item.h"
  27 #include "xfs_alloc.h"
  28 #include "xfs_error.h"
  29 #include "xfs_iomap.h"
  30 #include "xfs_trace.h"
  31 #include "xfs_bmap.h"
  32 #include "xfs_bmap_util.h"
  33 #include "xfs_bmap_btree.h"
  34 #include "xfs_reflink.h"
  35 #include <linux/gfp.h>
  36 #include <linux/mpage.h>
  37 #include <linux/pagevec.h>
  38 #include <linux/writeback.h>
  39
   40 /*
       * Flags for direct write completions.  Each flag is a distinct bit, so
       * several of them can be combined in a single flags word.
       */
   41 #define XFS_DIO_FLAG_UNWRITTEN  (1 << 0)
   42 #define XFS_DIO_FLAG_APPEND     (1 << 1)
   43 #define XFS_DIO_FLAG_COW        (1 << 2)
  44
   45 /*
   46  * Writeback context: a structure owned by writepages and passed to the
       * individual writepage calls, so that mapping and ioend state can carry over
       * from one buffer/page to the next.
   47  */
   48 struct xfs_writepage_ctx {
   49         struct xfs_bmbt_irec    imap;           /* cached extent mapping */
   50         bool                    imap_valid;     /* imap covers the current offset */
   51         unsigned int            io_type;        /* XFS_IO_* type of the current mapping */
   52         struct xfs_ioend        *ioend;         /* ioend currently being built */
   53         sector_t                last_block;     /* b_blocknr of the last buffer added */
   54 };
  55
  56 void
  57 xfs_count_page_state(
  58         struct page             *page,
  59         int                     *delalloc,
  60         int                     *unwritten)
  61 {
  62         struct buffer_head      *bh, *head;
  63
  64         *delalloc = *unwritten = 0;
  65
  66         bh = head = page_buffers(page);
  67         do {
  68                 if (buffer_unwritten(bh))
  69                         (*unwritten) = 1;
  70                 else if (buffer_delay(bh))
  71                         (*delalloc) = 1;
  72         } while ((bh = bh->b_this_page) != head);
  73 }
  74
  75 struct block_device *
  76 xfs_find_bdev_for_inode(
  77         struct inode            *inode)
  78 {
  79         struct xfs_inode        *ip = XFS_I(inode);
  80         struct xfs_mount        *mp = ip->i_mount;
  81
  82         if (XFS_IS_REALTIME_INODE(ip))
  83                 return mp->m_rtdev_targp->bt_bdev;
  84         else
  85                 return mp->m_ddev_targp->bt_bdev;
  86 }
  87
   88 /*
   89  * We're now finished for good with this page.  Update the page state via the
   90  * associated buffer_heads, paying attention to the start and end offsets that
   91  * we need to process on the page.
   92  *
   93  * Note that we open code the action in end_buffer_async_write here so that we
   94  * only have to iterate over the buffers attached to the page once.  This is not
   95  * only more efficient, but also ensures that we only call end_page_writeback
   96  * at the end of the iteration, and thus avoids the pitfall of having the page
   97  * and buffers potentially freed after every call to end_buffer_async_write.
       *
       * @bvec describes the byte range of the page covered by this completion and
       * @error is non-zero if the I/O failed.  Writeback on the page is only ended
       * when no buffer outside that range is still marked async-write.
   98  */
   99 static void
  100 xfs_finish_page_writeback(
  101         struct inode            *inode,
  102         struct bio_vec          *bvec,
  103         int                     error)
  104 {
  105         struct buffer_head      *head = page_buffers(bvec->bv_page), *bh = head;
  106         bool                    busy = false;
  107         unsigned int            off = 0;
  108         unsigned long           flags;
  109
              /* The segment must be block aligned and lie within a single page. */
  110         ASSERT(bvec->bv_offset < PAGE_SIZE);
  111         ASSERT((bvec->bv_offset & (i_blocksize(inode) - 1)) == 0);
  112         ASSERT(bvec->bv_offset + bvec->bv_len <= PAGE_SIZE);
  113         ASSERT((bvec->bv_len & (i_blocksize(inode) - 1)) == 0);
  114
              /*
               * The whole walk runs with interrupts disabled and the
               * BH_Uptodate_Lock bit of the first buffer's state held.
               */
  115         local_irq_save(flags);
  116         bit_spin_lock(BH_Uptodate_Lock, &head->b_state);
  117         do {
                      /* @off is the byte offset of @bh within the page. */
  118                 if (off >= bvec->bv_offset &&
  119                     off < bvec->bv_offset + bvec->bv_len) {
  120                         ASSERT(buffer_async_write(bh));
  121                         ASSERT(bh->b_end_io == NULL);
  122
  123                         if (error) {
  124                                 mapping_set_error(bvec->bv_page->mapping, -EIO);
  125                                 set_buffer_write_io_error(bh);
  126                                 clear_buffer_uptodate(bh);
  127                                 SetPageError(bvec->bv_page);
  128                         } else {
  129                                 set_buffer_uptodate(bh);
  130                         }
  131                         clear_buffer_async_write(bh);
  132                         unlock_buffer(bh);
  133                 } else if (buffer_async_write(bh)) {
                              /*
                               * A buffer outside this segment is still under
                               * write (presumably covered by a different
                               * completion), so the page is not finished yet.
                               */
  134                         ASSERT(buffer_locked(bh));
  135                         busy = true;
  136                 }
  137                 off += bh->b_size;
  138         } while ((bh = bh->b_this_page) != head);
  139         bit_spin_unlock(BH_Uptodate_Lock, &head->b_state);
  140         local_irq_restore(flags);
  141
  142         if (!busy)
  143                 end_page_writeback(bvec->bv_page);
  144 }
 145
  146 /*
  147  * We're now finished for good with this ioend structure.  Update the page
  148  * state, release holds on bios, and finally free up memory.  Do not use the
  149  * ioend after this.
       *
       * @error is the completion status applied to every page of every bio that
       * belongs to the ioend; a non-zero value is also logged (rate limited)
       * unless the first bio was flagged BIO_QUIET.
  150  */
  151 STATIC void
  152 xfs_destroy_ioend(
  153         struct xfs_ioend        *ioend,
  154         int                     error)
  155 {
              /*
               * Everything needed after the loop is copied out of the ioend up
               * front.  The first bio is embedded in the ioend (io_inline_bio), so
               * presumably the ioend's memory goes away once that bio is put -
               * nothing below the loop dereferences @ioend.
               */
  156         struct inode            *inode = ioend->io_inode;
  157         struct bio              *bio = &ioend->io_inline_bio;
  158         struct bio              *last = ioend->io_bio, *next;
  159         u64                     start = bio->bi_iter.bi_sector;
  160         bool                    quiet = bio_flagged(bio, BIO_QUIET);
  161
  162         for (bio = &ioend->io_inline_bio; bio; bio = next) {
  163                 struct bio_vec  *bvec;
  164                 int             i;
  165
  166                 /*
  167                  * For the last bio, bi_private points to the ioend, so we
  168                  * need to explicitly end the iteration here.
  169                  */
  170                 if (bio == last)
  171                         next = NULL;
  172                 else
  173                         next = bio->bi_private;
  174
  175                 /* walk each page on bio, ending page IO on them */
  176                 bio_for_each_segment_all(bvec, bio, i)
  177                         xfs_finish_page_writeback(inode, bvec, error);
  178
  179                 bio_put(bio);
  180         }
  181
  182         if (unlikely(error && !quiet)) {
  183                 xfs_err_ratelimited(XFS_I(inode)->i_mount,
  184                         "writeback error on sector %llu", start);
  185         }
  186 }
 187
 188 /*
 189  * Fast and loose check if this write could update the on-disk inode size.
 190  */
 191 static inline bool xfs_ioend_is_append(struct xfs_ioend *ioend)
 192 {
 193         return ioend->io_offset + ioend->io_size >
 194                 XFS_I(ioend->io_inode)->i_d.di_size;
 195 }
 196
 197 STATIC int
 198 xfs_setfilesize_trans_alloc(
 199         struct xfs_ioend        *ioend)
 200 {
 201         struct xfs_mount        *mp = XFS_I(ioend->io_inode)->i_mount;
 202         struct xfs_trans        *tp;
 203         int                     error;
 204
 205         error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
 206         if (error)
 207                 return error;
 208
 209         ioend->io_append_trans = tp;
 210
 211         /*
 212          * We may pass freeze protection with a transaction.  So tell lockdep
 213          * we released it.
 214          */
 215         __sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS);
 216         /*
 217          * We hand off the transaction to the completion thread now, so
 218          * clear the flag here.
 219          */
 220         current_restore_flags_nested(&tp->t_pflags, PF_FSTRANS);
 221         return 0;
 222 }
 223
 224 /*
 225  * Update on-disk file size now that data has been written to disk.
 226  */
 227 STATIC int
 228 __xfs_setfilesize(
 229         struct xfs_inode        *ip,
 230         struct xfs_trans        *tp,
 231         xfs_off_t               offset,
 232         size_t                  size)
 233 {
 234         xfs_fsize_t             isize;
 235
 236         xfs_ilock(ip, XFS_ILOCK_EXCL);
 237         isize = xfs_new_eof(ip, offset + size);
 238         if (!isize) {
 239                 xfs_iunlock(ip, XFS_ILOCK_EXCL);
 240                 xfs_trans_cancel(tp);
 241                 return 0;
 242         }
 243
 244         trace_xfs_setfilesize(ip, offset, size);
 245
 246         ip->i_d.di_size = isize;
 247         xfs_trans_ijoin(tp, ip, XFS_ILOCK_EXCL);
 248         xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE);
 249
 250         return xfs_trans_commit(tp);
 251 }
 252
 253 int
 254 xfs_setfilesize(
 255         struct xfs_inode        *ip,
 256         xfs_off_t               offset,
 257         size_t                  size)
 258 {
 259         struct xfs_mount        *mp = ip->i_mount;
 260         struct xfs_trans        *tp;
 261         int                     error;
 262
 263         error = xfs_trans_alloc(mp, &M_RES(mp)->tr_fsyncts, 0, 0, 0, &tp);
 264         if (error)
 265                 return error;
 266
 267         return __xfs_setfilesize(ip, tp, offset, size);
 268 }
 269
 270 STATIC int
 271 xfs_setfilesize_ioend(
 272         struct xfs_ioend        *ioend,
 273         int                     error)
 274 {
 275         struct xfs_inode        *ip = XFS_I(ioend->io_inode);
 276         struct xfs_trans        *tp = ioend->io_append_trans;
 277
 278         /*
 279          * The transaction may have been allocated in the I/O submission thread,
 280          * thus we need to mark ourselves as being in a transaction manually.
 281          * Similarly for freeze protection.
 282          */
 283         current_set_flags_nested(&tp->t_pflags, PF_FSTRANS);
 284         __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
 285
 286         /* we abort the update if there was an IO error */
 287         if (error) {
 288                 xfs_trans_cancel(tp);
 289                 return error;
 290         }
 291
 292         return __xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size);
 293 }
 294
 295 /*
 296  * IO write completion.
 297  */
 298 STATIC void
 299 xfs_end_io(
 300         struct work_struct *work)
 301 {
 302         struct xfs_ioend        *ioend =
 303                 container_of(work, struct xfs_ioend, io_work);
 304         struct xfs_inode        *ip = XFS_I(ioend->io_inode);
 305         xfs_off_t               offset = ioend->io_offset;
 306         size_t                  size = ioend->io_size;
 307         int                     error = ioend->io_bio->bi_error;
 308
 309         /*
 310          * Just clean up the in-memory strutures if the fs has been shut down.
 311          */
 312         if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
 313                 error = -EIO;
 314                 goto done;
 315         }
 316
 317         /*
 318          * Clean up any COW blocks on an I/O error.
 319          */
 320         if (unlikely(error)) {
 321                 switch (ioend->io_type) {
 322                 case XFS_IO_COW:
 323                         xfs_reflink_cancel_cow_range(ip, offset, size, true);
 324                         break;
 325                 }
 326
 327                 goto done;
 328         }
 329
 330         /*
 331          * Success:  commit the COW or unwritten blocks if needed.
 332          */
 333         switch (ioend->io_type) {
 334         case XFS_IO_COW:
 335                 error = xfs_reflink_end_cow(ip, offset, size);
 336                 break;
 337         case XFS_IO_UNWRITTEN:
 338                 /* writeback should never update isize */
 339                 error = xfs_iomap_write_unwritten(ip, offset, size, false);
 340                 break;
 341         default:
 342                 ASSERT(!xfs_ioend_is_append(ioend) || ioend->io_append_trans);
 343                 break;
 344         }
 345
 346 done:
 347         if (ioend->io_append_trans)
 348                 error = xfs_setfilesize_ioend(ioend, error);
 349         xfs_destroy_ioend(ioend, error);
 350 }
 351
 352 STATIC void
 353 xfs_end_bio(
 354         struct bio              *bio)
 355 {
 356         struct xfs_ioend        *ioend = bio->bi_private;
 357         struct xfs_mount        *mp = XFS_I(ioend->io_inode)->i_mount;
 358
 359         if (ioend->io_type == XFS_IO_UNWRITTEN || ioend->io_type == XFS_IO_COW)
 360                 queue_work(mp->m_unwritten_workqueue, &ioend->io_work);
 361         else if (ioend->io_append_trans)
 362                 queue_work(mp->m_data_workqueue, &ioend->io_work);
 363         else
 364                 xfs_destroy_ioend(ioend, bio->bi_error);
 365 }
 366
 367 STATIC int
 368 xfs_map_blocks(
 369         struct inode            *inode,
 370         loff_t                  offset,
 371         struct xfs_bmbt_irec    *imap,
 372         int                     type)
 373 {
 374         struct xfs_inode        *ip = XFS_I(inode);
 375         struct xfs_mount        *mp = ip->i_mount;
 376         ssize_t                 count = i_blocksize(inode);
 377         xfs_fileoff_t           offset_fsb, end_fsb;
 378         int                     error = 0;
 379         int                     bmapi_flags = XFS_BMAPI_ENTIRE;
 380         int                     nimaps = 1;
 381
 382         if (XFS_FORCED_SHUTDOWN(mp))
 383                 return -EIO;
 384
 385         ASSERT(type != XFS_IO_COW);
 386         if (type == XFS_IO_UNWRITTEN)
 387                 bmapi_flags |= XFS_BMAPI_IGSTATE;
 388
 389         xfs_ilock(ip, XFS_ILOCK_SHARED);
 390         ASSERT(ip->i_d.di_format != XFS_DINODE_FMT_BTREE ||
 391                (ip->i_df.if_flags & XFS_IFEXTENTS));
 392         ASSERT(offset <= mp->m_super->s_maxbytes);
 393
 394         if ((xfs_ufsize_t)offset + count > mp->m_super->s_maxbytes)
 395                 count = mp->m_super->s_maxbytes - offset;
 396         end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + count);
 397         offset_fsb = XFS_B_TO_FSBT(mp, offset);
 398         error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
 399                                 imap, &nimaps, bmapi_flags);
 400         /*
 401          * Truncate an overwrite extent if there's a pending CoW
 402          * reservation before the end of this extent.  This forces us
 403          * to come back to writepage to take care of the CoW.
 404          */
 405         if (nimaps && type == XFS_IO_OVERWRITE)
 406                 xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb, imap);
 407         xfs_iunlock(ip, XFS_ILOCK_SHARED);
 408
 409         if (error)
 410                 return error;
 411
 412         if (type == XFS_IO_DELALLOC &&
 413             (!nimaps || isnullstartblock(imap->br_startblock))) {
 414                 error = xfs_iomap_write_allocate(ip, XFS_DATA_FORK, offset,
 415                                 imap);
 416                 if (!error)
 417                         trace_xfs_map_blocks_alloc(ip, offset, count, type, imap);
 418                 return error;
 419         }
 420
 421 #ifdef DEBUG
 422         if (type == XFS_IO_UNWRITTEN) {
 423                 ASSERT(nimaps);
 424                 ASSERT(imap->br_startblock != HOLESTARTBLOCK);
 425                 ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
 426         }
 427 #endif
 428         if (nimaps)
 429                 trace_xfs_map_blocks_found(ip, offset, count, type, imap);
 430         return 0;
 431 }
 432
 433 STATIC bool
 434 xfs_imap_valid(
 435         struct inode            *inode,
 436         struct xfs_bmbt_irec    *imap,
 437         xfs_off_t               offset)
 438 {
 439         offset >>= inode->i_blkbits;
 440
 441         /*
 442          * We have to make sure the cached mapping is within EOF to protect
 443          * against eofblocks trimming on file release leaving us with a stale
 444          * mapping. Otherwise, a page for a subsequent file extending buffered
 445          * write could get picked up by this writeback cycle and written to the
 446          * wrong blocks.
 447          *
 448          * Note that what we really want here is a generic mapping invalidation
 449          * mechanism to protect us from arbitrary extent modifying contexts, not
 450          * just eofblocks.
 451          */
 452         xfs_trim_extent_eof(imap, XFS_I(inode));
 453
 454         return offset >= imap->br_startoff &&
 455                 offset < imap->br_startoff + imap->br_blockcount;
 456 }
 457
  458 STATIC void
  459 xfs_start_buffer_writeback(
  460         struct buffer_head      *bh)
  461 {
              /*
               * Flag @bh as being under asynchronous write.  The caller must pass
               * a buffer that is mapped and locked and is no longer marked
               * delalloc or unwritten.
               */
  462         ASSERT(buffer_mapped(bh));
  463         ASSERT(buffer_locked(bh));
  464         ASSERT(!buffer_delay(bh));
  465         ASSERT(!buffer_unwritten(bh));
  466
              /*
               * No per-buffer completion handler is installed: completion is open
               * coded in xfs_finish_page_writeback(), which asserts that b_end_io
               * is NULL.
               */
  467         bh->b_end_io = NULL;
  468         set_buffer_async_write(bh);
  469         set_buffer_uptodate(bh);
  470         clear_buffer_dirty(bh);
  471 }
 472
 473 STATIC void
 474 xfs_start_page_writeback(
 475         struct page             *page,
 476         int                     clear_dirty)
 477 {
 478         ASSERT(PageLocked(page));
 479         ASSERT(!PageWriteback(page));
 480
 481         /*
 482          * if the page was not fully cleaned, we need to ensure that the higher
 483          * layers come back to it correctly. That means we need to keep the page
 484          * dirty, and for WB_SYNC_ALL writeback we need to ensure the
 485          * PAGECACHE_TAG_TOWRITE index mark is not removed so another attempt to
 486          * write this page in this writeback sweep will be made.
 487          */
 488         if (clear_dirty) {
 489                 clear_page_dirty_for_io(page);
 490                 set_page_writeback(page);
 491         } else
 492                 set_page_writeback_keepwrite(page);
 493
 494         unlock_page(page);
 495 }
 496
 497 static inline int xfs_bio_add_buffer(struct bio *bio, struct buffer_head *bh)
 498 {
 499         return bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh));
 500 }
 501
  502 /*
  503  * Submit the bio for an ioend. We are passed an ioend with a bio attached to
  504  * it, and we submit that bio. The ioend may be used for multiple bio
  505  * submissions, so we only want to allocate an append transaction for the ioend
  506  * once. In the case of multiple bio submission, each bio will take an IO
  507  * reference to the ioend to ensure that the ioend completion is only done once
  508  * all bios have been submitted and the ioend is really done.
       * NOTE(review): no per-bio ioend reference is visible in this function; the
       * additional bios are chained in xfs_chain_bio() - confirm this wording.
  509  *
  510  * If @status is non-zero, it means that we have a situation where some part of
  511  * the submission process has failed after we have marked pages for writeback
  512  * and unlocked them. In this situation, we need to fail the bio and ioend
  513  * rather than submit it to IO. This typically only happens on a filesystem
  514  * shutdown.
       *
       * Returns 0 if the bio was submitted, otherwise the (possibly updated)
       * non-zero status with which the bio was failed.
  515  */
  516 STATIC int
  517 xfs_submit_ioend(
  518         struct writeback_control *wbc,
  519         struct xfs_ioend        *ioend,
  520         int                     status)
  521 {
  522         /* Convert CoW extents to regular */
  523         if (!status && ioend->io_type == XFS_IO_COW) {
  524                 status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode),
  525                                 ioend->io_offset, ioend->io_size);
  526         }
  527
  528         /* Reserve log space if we might write beyond the on-disk inode size. */
  529         if (!status &&
  530             ioend->io_type != XFS_IO_UNWRITTEN &&
  531             xfs_ioend_is_append(ioend) &&
  532             !ioend->io_append_trans)
  533                 status = xfs_setfilesize_trans_alloc(ioend);
  534
              /*
               * The completion hooks are set up even when we are about to fail the
               * bio, because bio_endio() below runs the same completion path.
               */
  535         ioend->io_bio->bi_private = ioend;
  536         ioend->io_bio->bi_end_io = xfs_end_bio;
  537         bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE,
  538                          (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0);
  539         /*
  540          * If we are failing the IO now, just mark the ioend with an
  541          * error and finish it. This will run IO completion immediately
  542          * as there is only one reference to the ioend at this point in
  543          * time.
  544          */
  545         if (status) {
  546                 ioend->io_bio->bi_error = status;
  547                 bio_endio(ioend->io_bio);
  548                 return status;
  549         }
  550
  551         submit_bio(ioend->io_bio);
  552         return 0;
  553 }
 554
 555 static void
 556 xfs_init_bio_from_bh(
 557         struct bio              *bio,
 558         struct buffer_head      *bh)
 559 {
 560         bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9);
 561         bio->bi_bdev = bh->b_bdev;
 562 }
 563
 564 static struct xfs_ioend *
 565 xfs_alloc_ioend(
 566         struct inode            *inode,
 567         unsigned int            type,
 568         xfs_off_t               offset,
 569         struct buffer_head      *bh)
 570 {
 571         struct xfs_ioend        *ioend;
 572         struct bio              *bio;
 573
 574         bio = bio_alloc_bioset(GFP_NOFS, BIO_MAX_PAGES, xfs_ioend_bioset);
 575         xfs_init_bio_from_bh(bio, bh);
 576
 577         ioend = container_of(bio, struct xfs_ioend, io_inline_bio);
 578         INIT_LIST_HEAD(&ioend->io_list);
 579         ioend->io_type = type;
 580         ioend->io_inode = inode;
 581         ioend->io_size = 0;
 582         ioend->io_offset = offset;
 583         INIT_WORK(&ioend->io_work, xfs_end_io);
 584         ioend->io_append_trans = NULL;
 585         ioend->io_bio = bio;
 586         return ioend;
 587 }
 588
  589 /*
  590  * Allocate a new bio, and chain the old bio to the new one.
  591  *
  592  * Note that we have to perform the chaining in this unintuitive order
  593  * so that the bi_private linkage is set up in the right direction for the
  594  * traversal in xfs_destroy_ioend().
       *
       * The old bio is submitted from here; the new bio, positioned at @bh,
       * becomes ioend->io_bio and receives the buffers that follow.
  595  */
  596 static void
  597 xfs_chain_bio(
  598         struct xfs_ioend        *ioend,
  599         struct writeback_control *wbc,
  600         struct buffer_head      *bh)
  601 {
  602         struct bio *new;
  603
  604         new = bio_alloc(GFP_NOFS, BIO_MAX_PAGES);
  605         xfs_init_bio_from_bh(new, bh);
  606
  607         bio_chain(ioend->io_bio, new);
  608         bio_get(ioend->io_bio);         /* for xfs_destroy_ioend */
  609         bio_set_op_attrs(ioend->io_bio, REQ_OP_WRITE,
  610                           (wbc->sync_mode == WB_SYNC_ALL) ? WRITE_SYNC : 0);
  611         submit_bio(ioend->io_bio);
  612         ioend->io_bio = new;
  613 }
 614
  615 /*
  616  * Test to see if we've been building up a completion structure for
  617  * earlier buffers -- if so, we try to append to this ioend if we
  618  * can, otherwise we finish off any current ioend and start another.
  619  * The ioend we finished off is queued on @iolist so that the caller can
  620  * submit it once it has finished processing the dirty page.
       *
       * The buffer can only be appended when the cached ioend has the same I/O
       * type, the buffer's disk block directly follows the last one added, and
       * @offset directly follows the file range the ioend already covers.
  621  */
  622 STATIC void
  623 xfs_add_to_ioend(
  624         struct inode            *inode,
  625         struct buffer_head      *bh,
  626         xfs_off_t               offset,
  627         struct xfs_writepage_ctx *wpc,
  628         struct writeback_control *wbc,
  629         struct list_head        *iolist)
  630 {
  631         if (!wpc->ioend || wpc->io_type != wpc->ioend->io_type ||
  632             bh->b_blocknr != wpc->last_block + 1 ||
  633             offset != wpc->ioend->io_offset + wpc->ioend->io_size) {
  634                 if (wpc->ioend)
  635                         list_add(&wpc->ioend->io_list, iolist);
  636                 wpc->ioend = xfs_alloc_ioend(inode, wpc->io_type, offset, bh);
  637         }
  638
  639         /*
  640          * If the buffer doesn't fit into the bio we need to allocate a new
  641          * one.  This shouldn't happen more than once for a given buffer.
  642          */
  643         while (xfs_bio_add_buffer(wpc->ioend->io_bio, bh) != bh->b_size)
  644                 xfs_chain_bio(wpc->ioend, wbc, bh);
  645
              /* Account for the buffer and remember where the next one must start. */
  646         wpc->ioend->io_size += bh->b_size;
  647         wpc->last_block = bh->b_blocknr;
  648         xfs_start_buffer_writeback(bh);
  649 }
 650
 651 STATIC void
 652 xfs_map_buffer(
 653         struct inode            *inode,
 654         struct buffer_head      *bh,
 655         struct xfs_bmbt_irec    *imap,
 656         xfs_off_t               offset)
 657 {
 658         sector_t                bn;
 659         struct xfs_mount        *m = XFS_I(inode)->i_mount;
 660         xfs_off_t               iomap_offset = XFS_FSB_TO_B(m, imap->br_startoff);
 661         xfs_daddr_t             iomap_bn = xfs_fsb_to_db(XFS_I(inode), imap->br_startblock);
 662
 663         ASSERT(imap->br_startblock != HOLESTARTBLOCK);
 664         ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
 665
 666         bn = (iomap_bn >> (inode->i_blkbits - BBSHIFT)) +
 667               ((offset - iomap_offset) >> inode->i_blkbits);
 668
 669         ASSERT(bn || XFS_IS_REALTIME_INODE(XFS_I(inode)));
 670
 671         bh->b_blocknr = bn;
 672         set_buffer_mapped(bh);
 673 }
 674
  675 STATIC void
  676 xfs_map_at_offset(
  677         struct inode            *inode,
  678         struct buffer_head      *bh,
  679         struct xfs_bmbt_irec    *imap,
  680         xfs_off_t               offset)
  681 {
              /*
               * Map @bh to the disk block that @offset resolves to within @imap and
               * clear its delalloc and unwritten state.  @imap must describe a real
               * extent, not a hole or a delayed allocation.
               */
  682         ASSERT(imap->br_startblock != HOLESTARTBLOCK);
  683         ASSERT(imap->br_startblock != DELAYSTARTBLOCK);
  684
  685         xfs_map_buffer(inode, bh, imap, offset);
              /* xfs_map_buffer() has already set the mapped bit; this repeat is harmless. */
  686         set_buffer_mapped(bh);
  687         clear_buffer_delay(bh);
  688         clear_buffer_unwritten(bh);
  689 }
 690
 691 /*
 692  * Test if a given page contains at least one buffer of a given @type.
 693  * If @check_all_buffers is true, then we walk all the buffers in the page to
 694  * try to find one of the type passed in. If it is not set, then the caller only
 695  * needs to check the first buffer on the page for a match.
 696  */
 697 STATIC bool
 698 xfs_check_page_type(
 699         struct page             *page,
 700         unsigned int            type,
 701         bool                    check_all_buffers)
 702 {
 703         struct buffer_head      *bh;
 704         struct buffer_head      *head;
 705
 706         if (PageWriteback(page))
 707                 return false;
 708         if (!page->mapping)
 709                 return false;
 710         if (!page_has_buffers(page))
 711                 return false;
 712
 713         bh = head = page_buffers(page);
 714         do {
 715                 if (buffer_unwritten(bh)) {
 716                         if (type == XFS_IO_UNWRITTEN)
 717                                 return true;
 718                 } else if (buffer_delay(bh)) {
 719                         if (type == XFS_IO_DELALLOC)
 720                                 return true;
 721                 } else if (buffer_dirty(bh) && buffer_mapped(bh)) {
 722                         if (type == XFS_IO_OVERWRITE)
 723                                 return true;
 724                 }
 725
 726                 /* If we are only checking the first buffer, we are done now. */
 727                 if (!check_all_buffers)
 728                         break;
 729         } while ((bh = bh->b_this_page) != head);
 730
 731         return false;
 732 }
 733
 734 STATIC void
 735 xfs_vm_invalidatepage(
 736         struct page             *page,
 737         unsigned int            offset,
 738         unsigned int            length)
 739 {
 740         trace_xfs_invalidatepage(page->mapping->host, page, offset,
 741                                  length);
 742
 743         /*
 744          * If we are invalidating the entire page, clear the dirty state from it
 745          * so that we can check for attempts to release dirty cached pages in
 746          * xfs_vm_releasepage().
 747          */
 748         if (offset == 0 && length >= PAGE_SIZE)
 749                 cancel_dirty_page(page);
 750         block_invalidatepage(page, offset, length);
 751 }
 752
 753 /*
 754  * If the page has delalloc buffers on it, we need to punch them out before we
 755  * invalidate the page. If we don't, we leave a stale delalloc mapping on the
 756  * inode that can trip a BUG() in xfs_get_blocks() later on if a direct IO read
 757  * is done on that same region - the delalloc extent is returned when none is
 758  * supposed to be there.
 759  *
 760  * We prevent this by truncating away the delalloc regions on the page before
 761  * invalidating it. Because they are delalloc, we can do this without needing a
 762  * transaction. Indeed - if we get ENOSPC errors, we have to be able to do this
 763  * truncation without a transaction as there is no space left for block
 764  * reservation (typically why we see a ENOSPC in writeback).
 765  *
 766  * This is not a performance critical path, so for now just do the punching a
 767  * buffer head at a time.
 768  */
 769 STATIC void
 770 xfs_aops_discard_page(
 771         struct page             *page)
 772 {
 773         struct inode            *inode = page->mapping->host;
 774         struct xfs_inode        *ip = XFS_I(inode);
 775         struct buffer_head      *bh, *head;
 776         loff_t                  offset = page_offset(page);
 777
 778         if (!xfs_check_page_type(page, XFS_IO_DELALLOC, true))
 779                 goto out_invalidate;
 780
 781         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
 782                 goto out_invalidate;
 783
 784         xfs_alert(ip->i_mount,
 785                 "page discard on page %p, inode 0x%llx, offset %llu.",
 786                         page, ip->i_ino, offset);
 787
 788         xfs_ilock(ip, XFS_ILOCK_EXCL);
 789         bh = head = page_buffers(page);
 790         do {
 791                 int             error;
 792                 xfs_fileoff_t   start_fsb;
 793
 794                 if (!buffer_delay(bh))
 795                         goto next_buffer;
 796
 797                 start_fsb = XFS_B_TO_FSBT(ip->i_mount, offset);
 798                 error = xfs_bmap_punch_delalloc_range(ip, start_fsb, 1);
 799                 if (error) {
 800                         /* something screwed, just bail */
 801                         if (!XFS_FORCED_SHUTDOWN(ip->i_mount)) {
 802                                 xfs_alert(ip->i_mount,
 803                         "page discard unable to remove delalloc mapping.");
 804                         }
 805                         break;
 806                 }
 807 next_buffer:
 808                 offset += i_blocksize(inode);
 809
 810         } while ((bh = bh->b_this_page) != head);
 811
 812         xfs_iunlock(ip, XFS_ILOCK_EXCL);
 813 out_invalidate:
 814         xfs_vm_invalidatepage(page, 0, PAGE_SIZE);
 815         return;
 816 }
 817
 818 static int
 819 xfs_map_cow(
 820         struct xfs_writepage_ctx *wpc,
 821         struct inode            *inode,
 822         loff_t                  offset,
 823         unsigned int            *new_type)
 824 {
 825         struct xfs_inode        *ip = XFS_I(inode);
 826         struct xfs_bmbt_irec    imap;
 827         bool                    is_cow = false, need_alloc = false;
 828         int                     error;
 829
 830         /*
 831          * If we already have a valid COW mapping keep using it.
 832          */
 833         if (wpc->io_type == XFS_IO_COW) {
 834                 wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap, offset);
 835                 if (wpc->imap_valid) {
 836                         *new_type = XFS_IO_COW;
 837                         return 0;
 838                 }
 839         }
 840
 841         /*
 842          * Else we need to check if there is a COW mapping at this offset.
 843          */
 844         xfs_ilock(ip, XFS_ILOCK_SHARED);
 845         is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap, &need_alloc);
 846         xfs_iunlock(ip, XFS_ILOCK_SHARED);
 847
 848         if (!is_cow)
 849                 return 0;
 850
 851         /*
 852          * And if the COW mapping has a delayed extent here we need to
 853          * allocate real space for it now.
 854          */
 855         if (need_alloc) {
 856                 error = xfs_iomap_write_allocate(ip, XFS_COW_FORK, offset,
 857                                 &imap);
 858                 if (error)
 859                         return error;
 860         }
 861
 862         wpc->io_type = *new_type = XFS_IO_COW;
 863         wpc->imap_valid = true;
 864         wpc->imap = imap;
 865         return 0;
 866 }
 867
 868 /*
 869  * We implement an immediate ioend submission policy here to avoid needing to
 870  * chain multiple ioends and hence nest mempool allocations which can violate
 871  * forward progress guarantees we need to provide. The current ioend we are
 872  * adding buffers to is cached on the writepage context, and if the new buffer
 873  * does not append to the cached ioend it will create a new ioend and cache that
 874  * instead.
 875  *
 876  * If a new ioend is created and cached, the old ioend is returned and queued
 877  * locally for submission once the entire page is processed or an error has been
 878  * detected.  While ioends are submitted immediately after they are completed,
 879  * batching optimisations are provided by higher level block plugging.
 880  *
 881  * At the end of a writeback pass, there will be a cached ioend remaining on the
 882  * writepage context that the caller will need to submit.
 883  */
 884 static int
 885 xfs_writepage_map(
 886         struct xfs_writepage_ctx *wpc,
 887         struct writeback_control *wbc,
 888         struct inode            *inode,
 889         struct page             *page,
 890         loff_t                  offset,
 891         __uint64_t              end_offset)
 892 {
 893         LIST_HEAD(submit_list);
 894         struct xfs_ioend        *ioend, *next;
 895         struct buffer_head      *bh, *head;
 896         ssize_t                 len = i_blocksize(inode);
 897         int                     error = 0;
 898         int                     count = 0;
 899         int                     uptodate = 1;
 900         unsigned int            new_type;
 901
 902         bh = head = page_buffers(page);
 903         offset = page_offset(page);
 904         do {
 905                 if (offset >= end_offset)
 906                         break;
 907                 if (!buffer_uptodate(bh))
 908                         uptodate = 0;
 909
 910                 /*
 911                  * set_page_dirty dirties all buffers in a page, independent
 912                  * of their state.  The dirty state however is entirely
 913                  * meaningless for holes (!mapped && uptodate), so skip
 914                  * buffers covering holes here.
 915                  */
 916                 if (!buffer_mapped(bh) && buffer_uptodate(bh)) {
 917                         wpc->imap_valid = false;
 918                         continue;
 919                 }
 920
 921                 if (buffer_unwritten(bh))
 922                         new_type = XFS_IO_UNWRITTEN;
 923                 else if (buffer_delay(bh))
 924                         new_type = XFS_IO_DELALLOC;
 925                 else if (buffer_uptodate(bh))
 926                         new_type = XFS_IO_OVERWRITE;
 927                 else {
 928                         if (PageUptodate(page))
 929                                 ASSERT(buffer_mapped(bh));
 930                         /*
 931                          * This buffer is not uptodate and will not be
 932                          * written to disk.  Ensure that we will put any
 933                          * subsequent writeable buffers into a new
 934                          * ioend.
 935                          */
 936                         wpc->imap_valid = false;
 937                         continue;
 938                 }
 939
 940                 if (xfs_is_reflink_inode(XFS_I(inode))) {
 941                         error = xfs_map_cow(wpc, inode, offset, &new_type);
 942                         if (error)
 943                                 goto out;
 944                 }
 945
 946                 if (wpc->io_type != new_type) {
 947                         wpc->io_type = new_type;
 948                         wpc->imap_valid = false;
 949                 }
 950
 951                 if (wpc->imap_valid)
 952                         wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
 953                                                          offset);
 954                 if (!wpc->imap_valid) {
 955                         error = xfs_map_blocks(inode, offset, &wpc->imap,
 956                                              wpc->io_type);
 957                         if (error)
 958                                 goto out;
 959                         wpc->imap_valid = xfs_imap_valid(inode, &wpc->imap,
 960                                                          offset);
 961                 }
 962                 if (wpc->imap_valid) {
 963                         lock_buffer(bh);
 964                         if (wpc->io_type != XFS_IO_OVERWRITE)
 965                                 xfs_map_at_offset(inode, bh, &wpc->imap, offset);
 966                         xfs_add_to_ioend(inode, bh, offset, wpc, wbc, &submit_list);
 967                         count++;
 968                 }
 969
 970         } while (offset += len, ((bh = bh->b_this_page) != head));
 971
 972         if (uptodate && bh == head)
 973                 SetPageUptodate(page);
 974
 975         ASSERT(wpc->ioend || list_empty(&submit_list));
 976
 977 out:
 978         /*
 979          * On error, we have to fail the ioend here because we have locked
 980          * buffers in the ioend. If we don't do this, we'll deadlock
 981          * invalidating the page as that tries to lock the buffers on the page.
 982          * Also, because we may have set pages under writeback, we have to make
 983          * sure we run IO completion to mark the error state of the IO
 984          * appropriately, so we can't cancel the ioend directly here. That means
 985          * we have to mark this page as under writeback if we included any
 986          * buffers from it in the ioend chain so that completion treats it
 987          * correctly.
 988          *
 989          * If we didn't include the page in the ioend, the on error we can
 990          * simply discard and unlock it as there are no other users of the page
 991          * or it's buffers right now. The caller will still need to trigger
 992          * submission of outstanding ioends on the writepage context so they are
 993          * treated correctly on error.
 994          */
 995         if (count) {
 996                 xfs_start_page_writeback(page, !error);
 997
 998                 /*
 999                  * Preserve the original error if there was one, otherwise catch
1000                  * submission errors here and propagate into subsequent ioend
1001                  * submissions.
1002                  */
1003                 list_for_each_entry_safe(ioend, next, &submit_list, io_list) {
1004                         int error2;
1005
1006                         list_del_init(&ioend->io_list);
1007                         error2 = xfs_submit_ioend(wbc, ioend, error);
1008                         if (error2 && !error)
1009                                 error = error2;
1010                 }
1011         } else if (error) {
1012                 xfs_aops_discard_page(page);
1013                 ClearPageUptodate(page);
1014                 unlock_page(page);
1015         } else {
1016                 /*
1017                  * We can end up here with no error and nothing to write if we
1018                  * race with a partial page truncate on a sub-page block sized
1019                  * filesystem. In that case we need to mark the page clean.
1020                  */
1021                 xfs_start_page_writeback(page, 1);
1022                 end_page_writeback(page);
1023         }
1024
1025         mapping_set_error(page->mapping, error);
1026         return error;
1027 }
1028
1029 /*
1030  * Write out a dirty page.
1031  *
1032  * For delalloc space on the page we need to allocate space and flush it.
1033  * For unwritten space on the page we need to start the conversion to
1034  * regular allocated space.
1035  * For any other dirty buffer heads on the page we should flush them.
1036  */
1037 STATIC int
1038 xfs_do_writepage(
1039         struct page             *page,
1040         struct writeback_control *wbc,
1041         void                    *data)
1042 {
1043         struct xfs_writepage_ctx *wpc = data;
1044         struct inode            *inode = page->mapping->host;
1045         loff_t                  offset;
1046         __uint64_t              end_offset;
1047         pgoff_t                 end_index;
1048
1049         trace_xfs_writepage(inode, page, 0, 0);
1050
1051         ASSERT(page_has_buffers(page));
1052
1053         /*
1054          * Refuse to write the page out if we are called from reclaim context.
1055          *
1056          * This avoids stack overflows when called from deeply used stacks in
1057          * random callers for direct reclaim or memcg reclaim.  We explicitly
1058          * allow reclaim from kswapd as the stack usage there is relatively low.
1059          *
1060          * This should never happen except in the case of a VM regression so
1061          * warn about it.
1062          */
1063         if (WARN_ON_ONCE((current->flags & (PF_MEMALLOC|PF_KSWAPD)) ==
1064                         PF_MEMALLOC))
1065                 goto redirty;
1066
1067         /*
1068          * Given that we do not allow direct reclaim to call us, we should
1069          * never be called while in a filesystem transaction.
1070          */
1071         if (WARN_ON_ONCE(current->flags & PF_FSTRANS))
1072                 goto redirty;
1073
1074         /*
1075          * Is this page beyond the end of the file?
1076          *
1077          * The page index is less than the end_index, adjust the end_offset
1078          * to the highest offset that this page should represent.
1079          * -----------------------------------------------------
1080          * |                    file mapping           | <EOF> |
1081          * -----------------------------------------------------
1082          * | Page ... | Page N-2 | Page N-1 |  Page N  |       |
1083          * ^--------------------------------^----------|--------
1084          * |     desired writeback range    |      see else    |
1085          * ---------------------------------^------------------|
1086          */
1087         offset = i_size_read(inode);
1088         end_index = offset >> PAGE_SHIFT;
1089         if (page->index < end_index)
1090                 end_offset = (xfs_off_t)(page->index + 1) << PAGE_SHIFT;
1091         else {
1092                 /*
1093                  * Check whether the page to write out is beyond or straddles
1094                  * i_size or not.
1095                  * -------------------------------------------------------
1096                  * |            file mapping                    | <EOF>  |
1097                  * -------------------------------------------------------
1098                  * | Page ... | Page N-2 | Page N-1 |  Page N   | Beyond |
1099                  * ^--------------------------------^-----------|---------
1100                  * |                                |      Straddles     |
1101                  * ---------------------------------^-----------|--------|
1102                  */
1103                 unsigned offset_into_page = offset & (PAGE_SIZE - 1);
1104
1105                 /*
1106                  * Skip the page if it is fully outside i_size, e.g. due to a
1107                  * truncate operation that is in progress. We must redirty the
1108                  * page so that reclaim stops reclaiming it. Otherwise
1109                  * xfs_vm_releasepage() is called on it and gets confused.
1110                  *
1111                  * Note that the end_index is unsigned long, it would overflow
1112                  * if the given offset is greater than 16TB on 32-bit system
1113                  * and if we do check the page is fully outside i_size or not
1114                  * via "if (page->index >= end_index + 1)" as "end_index + 1"
1115                  * will be evaluated to 0.  Hence this page will be redirtied
1116                  * and be written out repeatedly which would result in an
1117                  * infinite loop, the user program that perform this operation
1118                  * will hang.  Instead, we can verify this situation by checking
1119                  * if the page to write is totally beyond the i_size or if it's
1120                  * offset is just equal to the EOF.
1121                  */
1122                 if (page->index > end_index ||
1123                     (page->index == end_index && offset_into_page == 0))
1124                         goto redirty;
1125
1126                 /*
1127                  * The page straddles i_size.  It must be zeroed out on each
1128                  * and every writepage invocation because it may be mmapped.
1129                  * "A file is mapped in multiples of the page size.  For a file
1130                  * that is not a multiple of the page size, the remaining
1131                  * memory is zeroed when mapped, and writes to that region are
1132                  * not written out to the file."
1133                  */
1134                 zero_user_segment(page, offset_into_page, PAGE_SIZE);
1135
1136                 /* Adjust the end_offset to the end of file */
1137                 end_offset = offset;
1138         }
1139
1140         return xfs_writepage_map(wpc, wbc, inode, page, offset, end_offset);
1141
1142 redirty:
1143         redirty_page_for_writepage(wbc, page);
1144         unlock_page(page);
1145         return 0;
1146 }
1147
1148 STATIC int
1149 xfs_vm_writepage(
1150         struct page             *page,
1151         struct writeback_control *wbc)
1152 {
1153         struct xfs_writepage_ctx wpc = {
1154                 .io_type = XFS_IO_INVALID,
1155         };
1156         int                     ret;
1157
1158         ret = xfs_do_writepage(page, wbc, &wpc);
1159         if (wpc.ioend)
1160                 ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
1161         return ret;
1162 }
1163
1164 STATIC int
1165 xfs_vm_writepages(
1166         struct address_space    *mapping,
1167         struct writeback_control *wbc)
1168 {
1169         struct xfs_writepage_ctx wpc = {
1170                 .io_type = XFS_IO_INVALID,
1171         };
1172         int                     ret;
1173
1174         xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
1175         if (dax_mapping(mapping))
1176                 return dax_writeback_mapping_range(mapping,
1177                                 xfs_find_bdev_for_inode(mapping->host), wbc);
1178
1179         ret = write_cache_pages(mapping, wbc, xfs_do_writepage, &wpc);
1180         if (wpc.ioend)
1181                 ret = xfs_submit_ioend(wbc, wpc.ioend, ret);
1182         return ret;
1183 }
1184
1185 /*
1186  * Called to move a page into cleanable state - and from there
1187  * to be released. The page should already be clean. We always
1188  * have buffer heads in this call.
1189  *
1190  * Returns 1 if the page is ok to release, 0 otherwise.
1191  */
1192 STATIC int
1193 xfs_vm_releasepage(
1194         struct page             *page,
1195         gfp_t                   gfp_mask)
1196 {
1197         int                     delalloc, unwritten;
1198
1199         trace_xfs_releasepage(page->mapping->host, page, 0, 0);
1200
1201         /*
1202          * mm accommodates an old ext3 case where clean pages might not have had
1203          * the dirty bit cleared. Thus, it can send actual dirty pages to
1204          * ->releasepage() via shrink_active_list(). Conversely,
1205          * block_invalidatepage() can send pages that are still marked dirty but
1206          * otherwise have invalidated buffers.
1207          *
1208          * We want to release the latter to avoid unnecessary buildup of the
1209          * LRU, so xfs_vm_invalidatepage() clears the page dirty flag on pages
1210          * that are entirely invalidated and need to be released.  Hence the
1211          * only time we should get dirty pages here is through
1212          * shrink_active_list() and so we can simply skip those now.
1213          *
1214          * warn if we've left any lingering delalloc/unwritten buffers on clean
1215          * or invalidated pages we are about to release.
1216          */
1217         if (PageDirty(page))
1218                 return 0;
1219
1220         xfs_count_page_state(page, &delalloc, &unwritten);
1221
1222         if (WARN_ON_ONCE(delalloc))
1223                 return 0;
1224         if (WARN_ON_ONCE(unwritten))
1225                 return 0;
1226
1227         return try_to_free_buffers(page);
1228 }
1229
1230 /*
1231  * When we map a DIO buffer, we may need to pass flags to
1232  * xfs_end_io_direct_write to tell it what kind of write IO we are doing.
1233  *
1234  * Note that for DIO, an IO to the highest supported file block offset (i.e.
1235  * 2^63 - 1FSB bytes) will result in the offset + count overflowing a signed 64
1236  * bit variable. Hence if we see this overflow, we have to assume that the IO is
1237  * extending the file size. We won't know for sure until IO completion is run
1238  * and the actual max write offset is communicated to the IO completion
1239  * routine.
1240  */
1241 static void
1242 xfs_map_direct(
1243         struct inode            *inode,
1244         struct buffer_head      *bh_result,
1245         struct xfs_bmbt_irec    *imap,
1246         xfs_off_t               offset,
1247         bool                    is_cow)
1248 {
1249         uintptr_t               *flags = (uintptr_t *)&bh_result->b_private;
1250         xfs_off_t               size = bh_result->b_size;
1251
1252         trace_xfs_get_blocks_map_direct(XFS_I(inode), offset, size,
1253                 ISUNWRITTEN(imap) ? XFS_IO_UNWRITTEN : is_cow ? XFS_IO_COW :
1254                 XFS_IO_OVERWRITE, imap);
1255
1256         if (ISUNWRITTEN(imap)) {
1257                 *flags |= XFS_DIO_FLAG_UNWRITTEN;
1258                 set_buffer_defer_completion(bh_result);
1259         } else if (is_cow) {
1260                 *flags |= XFS_DIO_FLAG_COW;
1261                 set_buffer_defer_completion(bh_result);
1262         }
1263         if (offset + size > i_size_read(inode) || offset + size < 0) {
1264                 *flags |= XFS_DIO_FLAG_APPEND;
1265                 set_buffer_defer_completion(bh_result);
1266         }
1267 }
1268
1269 /*
1270  * If this is O_DIRECT or the mpage code calling tell them how large the mapping
1271  * is, so that we can avoid repeated get_blocks calls.
1272  *
1273  * If the mapping spans EOF, then we have to break the mapping up as the mapping
1274  * for blocks beyond EOF must be marked new so that sub block regions can be
1275  * correctly zeroed. We can't do this for mappings within EOF unless the mapping
1276  * was just allocated or is unwritten, otherwise the callers would overwrite
1277  * existing data with zeros. Hence we have to split the mapping into a range up
1278  * to and including EOF, and a second mapping for beyond EOF.
1279  */
1280 static void
1281 xfs_map_trim_size(
1282         struct inode            *inode,
1283         sector_t                iblock,
1284         struct buffer_head      *bh_result,
1285         struct xfs_bmbt_irec    *imap,
1286         xfs_off_t               offset,
1287         ssize_t                 size)
1288 {
1289         xfs_off_t               mapping_size;
1290
1291         mapping_size = imap->br_startoff + imap->br_blockcount - iblock;
1292         mapping_size <<= inode->i_blkbits;
1293
1294         ASSERT(mapping_size > 0);
1295         if (mapping_size > size)
1296                 mapping_size = size;
1297         if (offset < i_size_read(inode) &&
1298             (xfs_ufsize_t)offset + mapping_size >= i_size_read(inode)) {
1299                 /* limit mapping to block that spans EOF */
1300                 mapping_size = roundup_64(i_size_read(inode) - offset,
1301                                           i_blocksize(inode));
1302         }
1303         if (mapping_size > LONG_MAX)
1304                 mapping_size = LONG_MAX;
1305
1306         bh_result->b_size = mapping_size;
1307 }
1308
1309 STATIC int
1310 __xfs_get_blocks(
1311         struct inode            *inode,
1312         sector_t                iblock,
1313         struct buffer_head      *bh_result,
1314         int                     create,
1315         bool                    direct,
1316         bool                    dax_fault)
1317 {
1318         struct xfs_inode        *ip = XFS_I(inode);
1319         struct xfs_mount        *mp = ip->i_mount;
1320         xfs_fileoff_t           offset_fsb, end_fsb;
1321         int                     error = 0;
1322         int                     lockmode = 0;
1323         struct xfs_bmbt_irec    imap;
1324         int                     nimaps = 1;
1325         xfs_off_t               offset;
1326         ssize_t                 size;
1327         int                     new = 0;
1328         bool                    is_cow = false;
1329         bool                    need_alloc = false;
1330
1331         BUG_ON(create && !direct);
1332
1333         if (XFS_FORCED_SHUTDOWN(mp))
1334                 return -EIO;
1335
1336         offset = (xfs_off_t)iblock << inode->i_blkbits;
1337         ASSERT(bh_result->b_size >= i_blocksize(inode));
1338         size = bh_result->b_size;
1339
1340         if (!create && offset >= i_size_read(inode))
1341                 return 0;
1342
1343         /*
1344          * Direct I/O is usually done on preallocated files, so try getting
1345          * a block mapping without an exclusive lock first.
1346          */
1347         lockmode = xfs_ilock_data_map_shared(ip);
1348
1349         ASSERT(offset <= mp->m_super->s_maxbytes);
1350         if ((xfs_ufsize_t)offset + size > mp->m_super->s_maxbytes)
1351                 size = mp->m_super->s_maxbytes - offset;
1352         end_fsb = XFS_B_TO_FSB(mp, (xfs_ufsize_t)offset + size);
1353         offset_fsb = XFS_B_TO_FSBT(mp, offset);
1354
1355         if (create && direct && xfs_is_reflink_inode(ip))
1356                 is_cow = xfs_reflink_find_cow_mapping(ip, offset, &imap,
1357                                         &need_alloc);
1358         if (!is_cow) {
1359                 error = xfs_bmapi_read(ip, offset_fsb, end_fsb - offset_fsb,
1360                                         &imap, &nimaps, XFS_BMAPI_ENTIRE);
1361                 /*
1362                  * Truncate an overwrite extent if there's a pending CoW
1363                  * reservation before the end of this extent.  This
1364                  * forces us to come back to get_blocks to take care of
1365                  * the CoW.
1366                  */
1367                 if (create && direct && nimaps &&
1368                     imap.br_startblock != HOLESTARTBLOCK &&
1369                     imap.br_startblock != DELAYSTARTBLOCK &&
1370                     !ISUNWRITTEN(&imap))
1371                         xfs_reflink_trim_irec_to_next_cow(ip, offset_fsb,
1372                                         &imap);
1373         }
1374         ASSERT(!need_alloc);
1375         if (error)
1376                 goto out_unlock;
1377
1378         /*
1379          * The only time we can ever safely find delalloc blocks on direct I/O
1380          * is a dio write to post-eof speculative preallocation. All other
1381          * scenarios are indicative of a problem or misuse (such as mixing
1382          * direct and mapped I/O).
1383          *
1384          * The file may be unmapped by the time we get here so we cannot
1385          * reliably fail the I/O based on mapping. Instead, fail the I/O if this
1386          * is a read or a write within eof. Otherwise, carry on but warn as a
1387          * precuation if the file happens to be mapped.
1388          */
1389         if (direct && imap.br_startblock == DELAYSTARTBLOCK) {
1390                 if (!create || offset < i_size_read(VFS_I(ip))) {
1391                         WARN_ON_ONCE(1);
1392                         error = -EIO;
1393                         goto out_unlock;
1394                 }
1395                 WARN_ON_ONCE(mapping_mapped(VFS_I(ip)->i_mapping));
1396         }
1397
1398         /* for DAX, we convert unwritten extents directly */
1399         if (create &&
1400             (!nimaps ||
1401              (imap.br_startblock == HOLESTARTBLOCK ||
1402               imap.br_startblock == DELAYSTARTBLOCK) ||
1403              (IS_DAX(inode) && ISUNWRITTEN(&imap)))) {
1404                 /*
1405                  * xfs_iomap_write_direct() expects the shared lock. It
1406                  * is unlocked on return.
1407                  */
1408                 if (lockmode == XFS_ILOCK_EXCL)
1409                         xfs_ilock_demote(ip, lockmode);
1410
1411                 error = xfs_iomap_write_direct(ip, offset, size,
1412                                                &imap, nimaps);
1413                 if (error)
1414                         return error;
1415                 new = 1;
1416
1417                 trace_xfs_get_blocks_alloc(ip, offset, size,
1418                                 ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
1419                                                    : XFS_IO_DELALLOC, &imap);
1420         } else if (nimaps) {
1421                 trace_xfs_get_blocks_found(ip, offset, size,
1422                                 ISUNWRITTEN(&imap) ? XFS_IO_UNWRITTEN
1423                                                    : XFS_IO_OVERWRITE, &imap);
1424                 xfs_iunlock(ip, lockmode);
1425         } else {
1426                 trace_xfs_get_blocks_notfound(ip, offset, size);
1427                 goto out_unlock;
1428         }
1429
1430         if (IS_DAX(inode) && create) {
1431                 ASSERT(!ISUNWRITTEN(&imap));
1432                 /* zeroing is not needed at a higher layer */
1433                 new = 0;
1434         }
1435
1436         /* trim mapping down to size requested */
1437         xfs_map_trim_size(inode, iblock, bh_result, &imap, offset, size);
1438
1439         /*
1440          * For unwritten extents do not report a disk address in the buffered
1441          * read case (treat as if we're reading into a hole).
1442          */
1443         if (imap.br_startblock != HOLESTARTBLOCK &&
1444             imap.br_startblock != DELAYSTARTBLOCK &&
1445             (create || !ISUNWRITTEN(&imap))) {
1446                 xfs_map_buffer(inode, bh_result, &imap, offset);
1447                 if (ISUNWRITTEN(&imap))
1448                         set_buffer_unwritten(bh_result);
1449                 /* direct IO needs special help */
1450                 if (create) {
1451                         if (dax_fault)
1452                                 ASSERT(!ISUNWRITTEN(&imap));
1453                         else
1454                                 xfs_map_direct(inode, bh_result, &imap, offset,
1455                                                 is_cow);
1456                 }
1457         }
1458
1459         /*
1460          * If this is a realtime file, data may be on a different device.
1461          * to that pointed to from the buffer_head b_bdev currently.
1462          */
1463         bh_result->b_bdev = xfs_find_bdev_for_inode(inode);
1464
1465         /*
1466          * If we previously allocated a block out beyond eof and we are now
1467          * coming back to use it then we will need to flag it as new even if it
1468          * has a disk address.
1469          *
1470          * With sub-block writes into unwritten extents we also need to mark
1471          * the buffer as new so that the unwritten parts of the buffer gets
1472          * correctly zeroed.
1473          */
1474         if (create &&
1475             ((!buffer_mapped(bh_result) && !buffer_uptodate(bh_result)) ||
1476              (offset >= i_size_read(inode)) ||
1477              (new || ISUNWRITTEN(&imap))))
1478                 set_buffer_new(bh_result);
1479
1480         return 0;
1481
1482 out_unlock:
1483         xfs_iunlock(ip, lockmode);
1484         return error;
1485 }
1486
1487 int
1488 xfs_get_blocks(
1489         struct inode            *inode,
1490         sector_t                iblock,
1491         struct buffer_head      *bh_result,
1492         int                     create)
1493 {
1494         return __xfs_get_blocks(inode, iblock, bh_result, create, false, false);
1495 }
1496
1497 int
1498 xfs_get_blocks_direct(
1499         struct inode            *inode,
1500         sector_t                iblock,
1501         struct buffer_head      *bh_result,
1502         int                     create)
1503 {
1504         return __xfs_get_blocks(inode, iblock, bh_result, create, true, false);
1505 }
1506
1507 int
1508 xfs_get_blocks_dax_fault(
1509         struct inode            *inode,
1510         sector_t                iblock,
1511         struct buffer_head      *bh_result,
1512         int                     create)
1513 {
1514         return __xfs_get_blocks(inode, iblock, bh_result, create, true, true);
1515 }
1516
1517 /*
1518  * Complete a direct I/O write request.
1519  *
1520  * xfs_map_direct passes us some flags in the private data to tell us what to
1521  * do.  If no flags are set, then the write IO is an overwrite wholly within
1522  * the existing allocated file size and so there is nothing for us to do.
1523  *
1524  * Note that in this case the completion can be called in interrupt context,
1525  * whereas if we have flags set we will always be called in task context
1526  * (i.e. from a workqueue).
1527  */
1528 int
1529 xfs_end_io_direct_write(
1530         struct kiocb            *iocb,
1531         loff_t                  offset,
1532         ssize_t                 size,
1533         void                    *private)
1534 {
1535         struct inode            *inode = file_inode(iocb->ki_filp);
1536         struct xfs_inode        *ip = XFS_I(inode);
1537         uintptr_t               flags = (uintptr_t)private;
1538         int                     error = 0;
1539
1540         trace_xfs_end_io_direct_write(ip, offset, size);
1541
1542         if (XFS_FORCED_SHUTDOWN(ip->i_mount))
1543                 return -EIO;
1544
1545         if (size <= 0)
1546                 return size;
1547
1548         /*
1549          * The flags tell us whether we are doing unwritten extent conversions
1550          * or an append transaction that updates the on-disk file size. These
1551          * cases are the only cases where we should *potentially* be needing
1552          * to update the VFS inode size.
1553          */
1554         if (flags == 0) {
1555                 ASSERT(offset + size <= i_size_read(inode));
1556                 return 0;
1557         }
1558
1559         if (flags & XFS_DIO_FLAG_COW)
1560                 error = xfs_reflink_end_cow(ip, offset, size);
1561
1562         /*
1563          * Unwritten conversion updates the in-core isize after extent
1564          * conversion but before updating the on-disk size. Updating isize any
1565          * earlier allows a racing dio read to find unwritten extents before
1566          * they are converted.
1567          */
1568         if (flags & XFS_DIO_FLAG_UNWRITTEN) {
1569                 trace_xfs_end_io_direct_write_unwritten(ip, offset, size);
1570
1571                 return xfs_iomap_write_unwritten(ip, offset, size, true);
1572         }
1573
1574         /*
1575          * We need to update the in-core inode size here so that we don't end up
1576          * with the on-disk inode size being outside the in-core inode size. We
1577          * have no other method of updating EOF for AIO, so always do it here
1578          * if necessary.
1579          *
1580          * We need to lock the test/set EOF update as we can be racing with
1581          * other IO completions here to update the EOF. Failing to serialise
1582          * here can result in EOF moving backwards and Bad Things Happen when
1583          * that occurs.
1584          */
1585         spin_lock(&ip->i_flags_lock);
1586         if (offset + size > i_size_read(inode))
1587                 i_size_write(inode, offset + size);
1588         spin_unlock(&ip->i_flags_lock);
1589
1590         if (flags & XFS_DIO_FLAG_APPEND) {
1591                 trace_xfs_end_io_direct_write_append(ip, offset, size);
1592
1593                 error = xfs_setfilesize(ip, offset, size);
1594         }
1595
1596         return error;
1597 }
1598
1599 STATIC ssize_t
1600 xfs_vm_direct_IO(
1601         struct kiocb            *iocb,
1602         struct iov_iter         *iter)
1603 {
1604         /*
1605          * We just need the method present so that open/fcntl allow direct I/O.
1606          */
1607         return -EINVAL;
1608 }
1609
1610 STATIC sector_t
1611 xfs_vm_bmap(
1612         struct address_space    *mapping,
1613         sector_t                block)
1614 {
1615         struct inode            *inode = (struct inode *)mapping->host;
1616         struct xfs_inode        *ip = XFS_I(inode);
1617
1618         trace_xfs_vm_bmap(XFS_I(inode));
1619         xfs_ilock(ip, XFS_IOLOCK_SHARED);
1620
1621         /*
1622          * The swap code (ab-)uses ->bmap to get a block mapping and then
1623          * bypasseѕ the file system for actual I/O.  We really can't allow
1624          * that on reflinks inodes, so we have to skip out here.  And yes,
1625          * 0 is the magic code for a bmap error.
1626          *
1627          * Since we don't pass back blockdev info, we can't return bmap
1628          * information for rt files either.
1629          */
1630         if (xfs_is_reflink_inode(ip) || XFS_IS_REALTIME_INODE(ip)) {
1631                 xfs_iunlock(ip, XFS_IOLOCK_SHARED);
1632                 return 0;
1633         }
1634         filemap_write_and_wait(mapping);
1635         xfs_iunlock(ip, XFS_IOLOCK_SHARED);
1636         return generic_block_bmap(mapping, block, xfs_get_blocks);
1637 }
1638
1639 STATIC int
1640 xfs_vm_readpage(
1641         struct file             *unused,
1642         struct page             *page)
1643 {
1644         trace_xfs_vm_readpage(page->mapping->host, 1);
1645         return mpage_readpage(page, xfs_get_blocks);
1646 }
1647
1648 STATIC int
1649 xfs_vm_readpages(
1650         struct file             *unused,
1651         struct address_space    *mapping,
1652         struct list_head        *pages,
1653         unsigned                nr_pages)
1654 {
1655         trace_xfs_vm_readpages(mapping->host, nr_pages);
1656         return mpage_readpages(mapping, pages, nr_pages, xfs_get_blocks);
1657 }
1658
1659 /*
1660  * This is basically a copy of __set_page_dirty_buffers() with one
1661  * small tweak: buffers beyond EOF do not get marked dirty. If we mark them
1662  * dirty, we'll never be able to clean them because we don't write buffers
1663  * beyond EOF, and that means we can't invalidate pages that span EOF
1664  * that have been marked dirty. Further, the dirty state can leak into
1665  * the file interior if the file is extended, resulting in all sorts of
1666  * bad things happening as the state does not match the underlying data.
1667  *
1668  * XXX: this really indicates that bufferheads in XFS need to die. Warts like
1669  * this only exist because of bufferheads and how the generic code manages them.
1670  */
1671 STATIC int
1672 xfs_vm_set_page_dirty(
1673         struct page             *page)
1674 {
1675         struct address_space    *mapping = page->mapping;
1676         struct inode            *inode = mapping->host;
1677         loff_t                  end_offset;
1678         loff_t                  offset;
1679         int                     newly_dirty;
1680
1681         if (unlikely(!mapping))
1682                 return !TestSetPageDirty(page);
1683
1684         end_offset = i_size_read(inode);
1685         offset = page_offset(page);
1686
1687         spin_lock(&mapping->private_lock);
1688         if (page_has_buffers(page)) {
1689                 struct buffer_head *head = page_buffers(page);
1690                 struct buffer_head *bh = head;
1691
1692                 do {
1693                         if (offset < end_offset)
1694                                 set_buffer_dirty(bh);
1695                         bh = bh->b_this_page;
1696                         offset += i_blocksize(inode);
1697                 } while (bh != head);
1698         }
1699         /*
1700          * Lock out page->mem_cgroup migration to keep PageDirty
1701          * synchronized with per-memcg dirty page counters.
1702          */
1703         lock_page_memcg(page);
1704         newly_dirty = !TestSetPageDirty(page);
1705         spin_unlock(&mapping->private_lock);
1706
1707         if (newly_dirty) {
1708                 /* sigh - __set_page_dirty() is static, so copy it here, too */
1709                 unsigned long flags;
1710
1711                 spin_lock_irqsave(&mapping->tree_lock, flags);
1712                 if (page->mapping) {    /* Race with truncate? */
1713                         WARN_ON_ONCE(!PageUptodate(page));
1714                         account_page_dirtied(page, mapping);
1715                         radix_tree_tag_set(&mapping->page_tree,
1716                                         page_index(page), PAGECACHE_TAG_DIRTY);
1717                 }
1718                 spin_unlock_irqrestore(&mapping->tree_lock, flags);
1719         }
1720         unlock_page_memcg(page);
1721         if (newly_dirty)
1722                 __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
1723         return newly_dirty;
1724 }
1725
1726 const struct address_space_operations xfs_address_space_operations = {
1727         .readpage               = xfs_vm_readpage,
1728         .readpages              = xfs_vm_readpages,
1729         .writepage              = xfs_vm_writepage,
1730         .writepages             = xfs_vm_writepages,
1731         .set_page_dirty         = xfs_vm_set_page_dirty,
1732         .releasepage            = xfs_vm_releasepage,
1733         .invalidatepage         = xfs_vm_invalidatepage,
1734         .bmap                   = xfs_vm_bmap,
1735         .direct_IO              = xfs_vm_direct_IO,
1736         .migratepage            = buffer_migrate_page,
1737         .is_partially_uptodate  = block_is_partially_uptodate,
1738         .error_remove_page      = generic_error_remove_page,
1739 };